def generate_summaries(path):
    from os import listdir
    from os.path import join as joinpath
    # ----------------------------------------
    # Load the parsing model
    pm = ParsingModel()
    pm.loadmodel("parsing-model.pickle.gz")

    # ----------------------------------------
    # Read all files from the given path
    doclist = [joinpath(path, fname) for fname in listdir(path) if fname.endswith('.edus')]
    for fedus in doclist:
        pred_rst = parse(pm, fedus=fedus)

        # You can pass either summary_factor or summary_p:
        # summary_p hard-codes the summary to p sentences, while
        # summary_factor selects a percentage of the EDU count
        top_scoring = calc_marcu(pred_rst, summary_p=2)
        summary_fname = fedus.replace('.edus', '.summary')
        s = []

        # Get top scoring and format it appropriately
        for edu in top_scoring:
            edu.text = edu.text.strip()
            str_array = word_tokenize(edu.text)

            # Remove participial phrases: collect the indices of VBG
            # tokens plus each VBG's head word in the dependency graph
            # (head indices are 1-based, hence the -1)
            pp_indices = [i for i, x in enumerate(edu.tags) if x == 'VBG']
            pp_phrase_indices = set(pp_indices)
            for idx in pp_indices:
                pp_phrase_indices.add(edu.head_words_indices[idx] - 1)

            # Drop the collected phrase tokens from the sentence
            new_str_array = [v for i, v in enumerate(str_array) if i not in pp_phrase_indices]
            edu.text = ' '.join(new_str_array)

            # Remove initial adverbials
            for stop_phrase in adverbial_phr_list:
                # If the phrase starts the EDU, slice it off (slicing,
                # unlike str.replace, leaves later occurrences intact)
                if edu.text.startswith(stop_phrase + ', '):
                    edu.text = edu.text[len(stop_phrase) + 2:].strip()
                elif edu.text.startswith(stop_phrase):
                    edu.text = edu.text[len(stop_phrase):].strip()

            # Capitalize the first character of the rebuilt sentence
            if edu.text:
                edu.text = edu.text[0].upper() + edu.text[1:]
            s.append(edu.text)

        # Join the chosen EDUs into the raw summary text
        s = ' '.join(s).replace('\t', '').strip()

        # Now do simplification step

        with open(summary_fname, 'w') as f:
            f.write(s)
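
A minimal driver for the function above, assuming the model pickle sits in the working directory; the input directory name is hypothetical:

if __name__ == '__main__':
    # Hypothetical input directory: generate_summaries() writes one
    # *.summary file next to each *.edus file found there
    generate_summaries('./examples')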
Example #2
def evalparser(path='./examples',
               report=False,
               bcvocab=None,
               draw=True,
               withdp=False,
               fdpvocab=None,
               fprojmat=None):
    """ Test the parsing performance

    :type path: string
    :param path: path to the evaluation data

    :type report: boolean
    :param report: whether to report (calculate) the f1 score
    """
    # ----------------------------------------
    # Load the parsing model
    print('Load parsing model ...')
    pm = ParsingModel(withdp=withdp, fdpvocab=fdpvocab, fprojmat=fprojmat)
    pm.loadmodel("model/parsing-model.pickle.gz")
    # ----------------------------------------
    # Evaluation
    met = Metrics(levels=['span', 'nuclearity', 'relation'])
    # ----------------------------------------
    # Read all files from the given path
    existing_files = [
        ".".join(fname.split(".")[:-1]) for fname in listdir(path)
        if fname.endswith('.brackets')
    ]
    all_files = [
        ".".join(fname.split(".")[:-1]) for fname in listdir(path)
        if fname.endswith('.merge')
    ]
    todo_files = list(set(all_files) - set(existing_files))
    doclist = [joinpath(path, fname + '.merge') for fname in todo_files]
    print("TODO files len:")
    print(len(doclist))
    print(doclist[0])
    global_pm = pm
    global global_pm
    global_bv = bcvocab
    global global_bv
    eval_parser_unit(doclist[0])
    # Fan the remaining files out across all available cores
    cnt = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(processes=cnt)
    pool.map(eval_parser_unit, doclist)
    pool.close()
    pool.join()
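
eval_parser_unit is referenced above but not shown. A plausible per-file body, inferred from the per-document loop in Example #3 and the global_pm / global_bv globals set above (a sketch, not the original implementation):

def eval_parser_unit(fmerge):
    # Assumed worker: parse one *.merge document with the shared model
    # and write the predicted brackets next to it
    dr = DocReader()
    doc = dr.read(fmerge)
    pred_rst = global_pm.sr_parse(doc, global_bv)
    pred_brackets = pred_rst.bracketing()
    writebrackets(fmerge.replace('.merge', '.brackets'), pred_brackets)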
    """
Ejemplo n.º 3
0
def evalparser(path='./examples', report=False, 
               bcvocab=None, draw=True,
               withdp=False, fdpvocab=None, fprojmat=None):
    """ Test the parsing performance

    :type path: string
    :param path: path to the evaluation data

    :type report: boolean
    :param report: whether to report (calculate) the f1 score
    """
    # ----------------------------------------
    # Load the parsing model
    print('Load parsing model ...')
    pm = ParsingModel(withdp=withdp,
                      fdpvocab=fdpvocab, fprojmat=fprojmat)
    pm.loadmodel("model/parsing-model.pickle.gz")
    # ----------------------------------------
    # Evaluation
    met = Metrics(levels=['span', 'nuclearity', 'relation'])
    # ----------------------------------------
    # Read all files from the given path
    doclist = [joinpath(path, fname) for fname in listdir(path) if fname.endswith('.merge')]
    for fmerge in doclist:
        # ----------------------------------------
        # Read *.merge file
        dr = DocReader()
        doc = dr.read(fmerge)
        # ----------------------------------------
        # Parsing
        pred_rst = pm.sr_parse(doc, bcvocab)
        if draw:
            strtree = pred_rst.parse()
            drawrst(strtree, fmerge.replace(".merge", ".ps"))
        # Get brackets from parsing results
        pred_brackets = pred_rst.bracketing()
        fbrackets = fmerge.replace('.merge', '.brackets')
        # Write brackets into file
        writebrackets(fbrackets, pred_brackets)
        # ----------------------------------------
        # Evaluate with gold RST tree
        if report:
            fdis = fmerge.replace('.merge', '.dis')
            gold_rst = RSTTree(fdis, fmerge)
            gold_rst.build()
            gold_brackets = gold_rst.bracketing()
            met.eval(gold_rst, pred_rst)
    if report:
        met.report()
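
writebrackets is imported from the project's utilities and not shown in these snippets. A minimal sketch, under the assumption that it writes one bracket tuple per line:

def writebrackets(fname, brackets):
    # Assumed format: one (span, nuclearity, relation) tuple per line
    with open(fname, 'w') as fout:
        for bracket in brackets:
            fout.write('%s\n' % (bracket,))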
Example #4
def evalparser(path='./examples', report=False):
    """ Test the parsing performance

    :type path: string
    :param path: path to the evaluation data

    :type report: boolean
    :param report: whether to report (calculate) the f1 score
    """
    from os import listdir
    from os.path import join as joinpath
    # ----------------------------------------
    # Load the parsing model
    pm = ParsingModel()
    pm.loadmodel("parsing-model.pickle.gz")
    # ----------------------------------------
    # Evaluation
    met = Metrics(levels=['span', 'nuclearity', 'relation'])
    # ----------------------------------------
    # Read all files from the given path
    doclist = [
        joinpath(path, fname) for fname in listdir(path)
        if fname.endswith('.edus')
    ]
    for fedus in doclist:
        # ----------------------------------------
        # Parsing
        pred_rst = parse(pm, fedus=fedus)
        # Get brackets from parsing results
        # print(fedus)
        # NOTE: every document overwrites the same output file here
        with open("test.dis", "w") as fout:
            fout.write(str(pred_rst))
        # pred_brackets = pred_rst.bracketing()
        # fbrackets = fedus.replace('edus', 'brackets')
        # writebrackets(fbrackets, pred_brackets)
        # ----------------------------------------
        # Evaluate with gold RST tree
        if report:
            fdis = fedus.replace('edus', 'dis')
            gold_rst = RSTTree(fname=fdis)
            gold_rst.build()
            gold_brackets = gold_rst.bracketing()
            met.eval(gold_rst, pred_rst)
    if report:
        met.report()
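
The parse(pm, fedus=...) helper used in this example and in Example #1 is defined elsewhere in the project. A sketch of the assumed behavior (the sr_parse signature varies between these snippets, so this is illustrative only):

def parse(pm, fedus):
    # Assumed behavior: read one segmented EDU per line, then run the
    # shift-reduce parser over the EDU list
    with open(fedus) as fin:
        edus = [line.strip() for line in fin if line.strip()]
    return pm.sr_parse(edus)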
Example #5
def evalparser(path='./examples', report=False):
    """ Test the parsing performance

    :type path: string
    :param path: path to the evaluation data

    :type report: boolean
    :param report: whether to report (calculate) the f1 score
    """
    from os import listdir
    from os.path import join as joinpath
    # ----------------------------------------
    # Load the parsing model
    pm = ParsingModel()
    pm.loadmodel("parsing-model.pickle.gz")
    # ----------------------------------------
    # Evaluation
    met = Metrics(levels=['span', 'nuclearity', 'relation'])
    # ----------------------------------------
    # Read all files from the given path
    doclist = [joinpath(path, fname) for fname in listdir(path) if fname.endswith('.edus')]
    for fedus in doclist:
        # ----------------------------------------
        # Parsing
        # Load the side files with per-EDU POS tags and dependency heads
        fpos = fedus + ".pos"
        d_pos = get_d_pos(fpos)
        fdep = fedus + ".dep"
        d_dep = get_d_dep(fdep)
        pred_rst = parse(pm, fedus=fedus, d_pos=d_pos, d_dep=d_dep)
        # Get brackets from parsing results
        pred_brackets = pred_rst.bracketing()
        fbrackets = fedus.replace('edus', 'brackets')
        writebrackets(fbrackets, pred_brackets)
        # ----------------------------------------
        # Evaluate with gold RST tree
        if report:
            fdis = fedus.replace('edus', 'dis')
            gold_rst = RSTTree(fname=fdis)
            gold_rst.build()
            gold_brackets = gold_rst.bracketing()
            met.eval(gold_rst, pred_rst)
    if report:
        met.report()
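
get_d_pos and get_d_dep are not shown; they presumably load the per-EDU POS tags and dependency heads from the *.pos and *.dep side files. A sketch assuming a one-line-per-EDU, whitespace-separated format (the format is a guess):

def get_d_pos(fpos):
    # Assumed: line i holds the whitespace-separated POS tags of EDU i
    with open(fpos) as fin:
        return {i: line.split() for i, line in enumerate(fin)}

def get_d_dep(fdep):
    # Assumed: line i holds the whitespace-separated head indices of EDU i
    with open(fdep) as fin:
        return {i: line.split() for i, line in enumerate(fin)}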
Example #6
            """ Parameters for composition

            :type L: 2-d numpy.array
            :param L: composition matrix for left node

            :type R: 2-d numpy.array
            :param R: composition matrix for right node

            :type bias: 1-d numpy.array
            :param bias: composition bias
            """
            self.S = S
            self.N = N
            self.bias = bias
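
# The composition these parameters support is presumably the usual
# recursive combination of the satellite and nucleus node vectors;
# the sketch below is an assumption, not the project's actual code.
import numpy as np

def compose(param, vec_nucleus, vec_satellite):
    # Linear maps of the two child vectors plus a bias, squashed
    # through tanh to give the parent node representation
    return np.tanh(param.N.dot(vec_nucleus)
                   + param.S.dot(vec_satellite)
                   + param.bias)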
if __name__ == '__main__':

    D = loadmodel("weights.pickle.gz")

    weights = D["words"]
    vocab = D["vocab"]
    vocab_no = D["vocabno"]
    pm = ParsingModel()
    pm.loadmodel("../parsing-model.pickle.gz")
    # The second assignment below overrides this path
    # path = "../../../Movies/edu-input-final/"
    path = "../../../Movies/Bigger-set/"
    files = [os.path.join(path, fname) for fname in os.listdir(path) if fname.endswith('.edus')]
    # param = miniKJointSGD(files, 400, sa_dict, iterations=40)
    param = miniHingeJointTopSGD(pm, files, 1500, weights, iterations=100)
    print(param.N)
    print(param.S)
    
from model import ParsingModel
from tree import RSTTree
from os import listdir
from os.path import join as joinpath
import buildtree

pm = ParsingModel()
pm.loadmodel("parsing-model.pickle.gz")


def parse(pm, textedus, pos_tags, headwords):
    """ Parse one document using the given parsing model

    :type pm: ParsingModel
    :param pm: a well-trained parsing model

    :type textedus: string
    :param textedus: document text with one segmented EDU per line
    """

    edus = textedus.split('\n')
    pos_tags = pos_tags.split('\n')
    heads = headwords.split('\n')
    if len(edus[-1]) == 0:
        edus.pop()
    pred_rst = pm.sr_parse(edus, pos_tags, heads)
    return pred_rst


def getParseTree(document, pos_tags, headwords):
    pred_rst = parse(pm, document, pos_tags, headwords)
    return pred_rst
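
A usage sketch for this module's entry point; the file names are hypothetical and mirror the *.pos / *.dep convention from Example #5:

if __name__ == '__main__':
    # Each hypothetical file holds one line per EDU
    with open('doc.edus') as f:
        document = f.read()
    with open('doc.edus.pos') as f:
        pos_tags = f.read()
    with open('doc.edus.dep') as f:
        headwords = f.read()
    print(getParseTree(document, pos_tags, headwords))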