def generate_summaries(path):
    """
    Write a short extractive summary next to every *.edus file in a directory

    Each document is parsed with the parsing model, calc_marcu selects the
    EDUs to keep, every selected EDU is lightly simplified (VBG tokens and
    leading adverbials removed, first letter upper-cased) and the result is
    written to a sibling file with the extension .summary.

    :type path: string
    :param path: directory that contains the *.edus files
    """
    from os import listdir
    from os.path import join as joinpath
    # ----------------------------------------
    # Load the parsing model
    pm = ParsingModel()
    pm.loadmodel("parsing-model.pickle.gz")
    # ----------------------------------------
    # Read all files from the given path
    doclist = [joinpath(path, fname) for fname in listdir(path)
               if fname.endswith('.edus')]
    for fedus in doclist:
        pred_rst = parse(pm, fedus=fedus)
        # You can pass in either summary_factor or summary_p
        # summary_p hardcodes it to p sentences
        # summary_factor is a percentage of the edu length
        top_scoring = calc_marcu(pred_rst, summary_p=2)
        # Swap only the trailing extension; str.replace would also rewrite
        # a '.edus' that happens to appear earlier in the path
        summary_fname = fedus[:-len('.edus')] + '.summary'
        sentences = []
        # Get top scoring and format it appropriately
        for edu in top_scoring:
            edu.text = edu.text.strip()
            str_array = word_tokenize(edu.text)
            # Drop every token tagged VBG together with the token at its
            # head index
            # NOTE(review): the "- 1" suggests head_words_indices is
            # 1-based -- confirm
            pp_indices = [i for i, tag in enumerate(edu.tags) if tag == 'VBG']
            pp_phrase_indices = set(pp_indices)
            for idx in pp_indices:
                pp_phrase_indices.add(edu.head_words_indices[idx] - 1)
            edu.text = ' '.join(tok for i, tok in enumerate(str_array)
                                if i not in pp_phrase_indices)
            # Remove initial adverbials; only the leading occurrence is cut
            # so the same phrase later in the sentence is left alone
            for stop_phrase in adverbial_phr_list:
                if edu.text.startswith(stop_phrase + ', '):
                    edu.text = edu.text[len(stop_phrase) + 2:].strip()
                elif edu.text.startswith(stop_phrase):
                    edu.text = edu.text[len(stop_phrase):].strip()
            # Simplification may leave nothing behind; skip instead of
            # failing on edu.text[0]
            if not edu.text:
                continue
            # Format so capitalization is correct for our new sentence
            edu.text = edu.text[0].upper() + edu.text[1:]
            sentences.append(str(edu.text))
        # Form raw sentences for summary from chosen edu's
        summary = ' '.join(sentences).replace('\t', '').strip()
        # The context manager closes the file even when write() fails
        with open(summary_fname, 'w') as f:
            f.write(summary)
def evalparser(path='./examples', report=False, bcvocab=None, draw=True,
               withdp=False, fdpvocab=None, fprojmat=None):
    """
    Parse every pending *.merge file under path in parallel

    A document is pending when path holds <name>.merge but no
    <name>.brackets. The loaded model and bcvocab are published through
    the module globals global_pm / global_bv before the workers start.

    :type path: string
    :param path: path to the evaluation data

    :type report: boolean
    :param report: whether to report (calculate) the f1 score;
                   not used by this function

    :param bcvocab: stored in the module global global_bv

    :type draw: boolean
    :param draw: not used by this function

    :param withdp: forwarded to ParsingModel
    :param fdpvocab: forwarded to ParsingModel
    :param fprojmat: forwarded to ParsingModel
    """
    # A global statement must come before any assignment to the names;
    # declaring it after the assignment is a SyntaxError on Python 3
    global global_pm, global_bv
    # ----------------------------------------
    # Load the parsing model
    print('Load parsing model ...')
    pm = ParsingModel(withdp=withdp, fdpvocab=fdpvocab, fprojmat=fprojmat)
    pm.loadmodel("model/parsing-model.pickle.gz")
    # ----------------------------------------
    # Read all files from the given path (one directory scan)
    fnames = listdir(path)
    existing_files = set(".".join(fname.split(".")[:-1])
                         for fname in fnames if fname.endswith('.brackets'))
    all_files = set(".".join(fname.split(".")[:-1])
                    for fname in fnames if fname.endswith('.merge'))
    # Sorted so the work list is stable from run to run
    todo_files = sorted(all_files - existing_files)
    doclist = [joinpath(path, fname + '.merge') for fname in todo_files]
    print("TODO files len:")
    print(len(doclist))
    if not doclist:
        # Nothing left to parse; doclist[0] below would raise IndexError
        return
    print(doclist[0])
    # NOTE(review): eval_parser_unit presumably reads these globals; worker
    # processes only inherit them with the fork start method -- confirm
    global_pm = pm
    global_bv = bcvocab
    # The first document is run in this process before the pool starts and
    # is then processed again by pool.map, exactly as before
    eval_parser_unit(doclist[0])
    cnt = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(processes=cnt)
    try:
        pool.map(eval_parser_unit, doclist)
    finally:
        # Always release the worker processes, even when a task raises
        pool.close()
        pool.join()
# NOTE(review): the collapsed source ended this snippet with a bare
# triple-quote delimiter (apparently opening a disabled legacy version).
# It is kept here as an inert comment: """
def evalparser(path='./examples', report=False, bcvocab=None, draw=True,
               withdp=False, fdpvocab=None, fprojmat=None):
    """
    Test the parsing performance

    Every *.merge file under path is parsed; the predicted brackets are
    written to <name>.brackets and, when draw is set, the tree is drawn
    to <name>.ps.

    :type path: string
    :param path: path to the evaluation data

    :type report: boolean
    :param report: whether to report (calculate) the f1 score

    :param bcvocab: passed to ParsingModel.sr_parse

    :type draw: boolean
    :param draw: whether to draw the predicted tree with drawrst

    :param withdp: forwarded to ParsingModel
    :param fdpvocab: forwarded to ParsingModel
    :param fprojmat: forwarded to ParsingModel
    """
    suffix = '.merge'
    # ----------------------------------------
    # Load the parsing model
    # Parenthesised form prints the same text on Python 2 and Python 3
    print('Load parsing model ...')
    pm = ParsingModel(withdp=withdp, fdpvocab=fdpvocab, fprojmat=fprojmat)
    pm.loadmodel("model/parsing-model.pickle.gz")
    # ----------------------------------------
    # Evaluation
    met = Metrics(levels=['span', 'nuclearity', 'relation'])
    # ----------------------------------------
    # Read all files from the given path
    doclist = [joinpath(path, fname) for fname in listdir(path)
               if fname.endswith(suffix)]
    for fmerge in doclist:
        # Sibling files are derived from the path without its extension;
        # str.replace would also rewrite a '.merge' earlier in the path
        base = fmerge[:-len(suffix)]
        # ----------------------------------------
        # Read *.merge file
        dr = DocReader()
        doc = dr.read(fmerge)
        # ----------------------------------------
        # Parsing
        pred_rst = pm.sr_parse(doc, bcvocab)
        if draw:
            strtree = pred_rst.parse()
            drawrst(strtree, base + '.ps')
        # Get brackets from parsing results
        pred_brackets = pred_rst.bracketing()
        # Write brackets into file
        writebrackets(base + '.brackets', pred_brackets)
        # ----------------------------------------
        # Evaluate with gold RST tree
        if report:
            gold_rst = RSTTree(base + '.dis', fmerge)
            gold_rst.build()
            # NOTE(review): the result was never used; the call is kept in
            # case bracketing() has side effects -- confirm and remove
            gold_rst.bracketing()
            met.eval(gold_rst, pred_rst)
    if report:
        met.report()
def evalparser(path='./examples', report=False):
    """
    Test the parsing performance

    Every *.edus file under path is parsed and the predicted tree is
    dumped as text to "test.dis" in the working directory. The file is
    rewritten for each document, so it ends up holding the last one.
    Bracket output is disabled in this variant.

    :type path: string
    :param path: path to the evaluation data

    :type report: boolean
    :param report: whether to report (calculate) the f1 score
    """
    from os import listdir
    from os.path import join as joinpath
    # ----------------------------------------
    # Load the parsing model
    pm = ParsingModel()
    pm.loadmodel("parsing-model.pickle.gz")
    # ----------------------------------------
    # Evaluation
    met = Metrics(levels=['span', 'nuclearity', 'relation'])
    # ----------------------------------------
    # Read all files from the given path
    doclist = [joinpath(path, fname) for fname in listdir(path)
               if fname.endswith('.edus')]
    for fedus in doclist:
        # ----------------------------------------
        # Parsing
        pred_rst = parse(pm, fedus=fedus)
        # Dump the predicted tree; the context manager closes the handle,
        # which the previous code left open on every iteration
        with open("test.dis", "w") as fout:
            fout.write(str(pred_rst))
        # ----------------------------------------
        # Evaluate with gold RST tree
        if report:
            # Swap only the trailing extension: replace('edus', 'dis')
            # would also rewrite "edus" inside a directory or file name
            fdis = fedus[:-len('edus')] + 'dis'
            gold_rst = RSTTree(fname=fdis)
            gold_rst.build()
            # NOTE(review): the result was never used; the call is kept in
            # case bracketing() has side effects -- confirm and remove
            gold_rst.bracketing()
            met.eval(gold_rst, pred_rst)
    if report:
        met.report()
def evalparser(path='./examples', report=False):
    """
    Test the parsing performance

    Every *.edus file under path is parsed together with its companion
    <name>.edus.pos and <name>.edus.dep files, and the predicted brackets
    are written to <name>.brackets.

    :type path: string
    :param path: path to the evaluation data

    :type report: boolean
    :param report: whether to report (calculate) the f1 score
    """
    from os import listdir
    from os.path import join as joinpath
    # ----------------------------------------
    # Load the parsing model
    pm = ParsingModel()
    pm.loadmodel("parsing-model.pickle.gz")
    # ----------------------------------------
    # Evaluation
    met = Metrics(levels=['span', 'nuclearity', 'relation'])
    # ----------------------------------------
    # Read all files from the given path
    doclist = [joinpath(path, fname) for fname in listdir(path)
               if fname.endswith('.edus')]
    for fedus in doclist:
        # Path without the trailing "edus" (the dot is kept). Slicing
        # touches only the extension, whereas replace('edus', ...) would
        # also rewrite "edus" inside a directory or file name
        stem = fedus[:-len('edus')]
        # ----------------------------------------
        # Parsing
        d_pos = get_d_pos(fedus + ".pos")
        d_dep = get_d_dep(fedus + ".dep")
        pred_rst = parse(pm, fedus=fedus, d_pos=d_pos, d_dep=d_dep)
        # Get brackets from parsing results
        pred_brackets = pred_rst.bracketing()
        writebrackets(stem + 'brackets', pred_brackets)
        # ----------------------------------------
        # Evaluate with gold RST tree
        if report:
            gold_rst = RSTTree(fname=stem + 'dis')
            gold_rst.build()
            # NOTE(review): the result was never used; the call is kept in
            # case bracketing() has side effects -- confirm and remove
            gold_rst.bracketing()
            met.eval(gold_rst, pred_rst)
    if report:
        met.report()
"""
Parameters for composition

:type L: 2-d numpy.array
:param L: composition matrix for left node

:type R: 2-d numpy.array
:param R: composition matrix for right node

:type bias: 1-d numpy.array
:param bias: composition bias
"""
# NOTE(review): the docstring above and the three assignments below are the
# tail of a method whose header lies outside this chunk; they are reproduced
# token-for-token and must be re-indented under that method when merged.
# NOTE(review): the docstring names L and R while the code stores S and N --
# confirm which names are current.
self.S = S
self.N = N
self.bias = bias


if __name__ == '__main__':
    # Script entry point: load pre-trained weights and the parsing model,
    # train on every *.edus file under `path`, then print the learned
    # N and S parameters
    D = loadmodel("weights.pickle.gz")
    weights = D["words"]
    vocab = D["vocab"]
    vocab_no = D["vocabno"]
    pm = ParsingModel()
    pm.loadmodel("../parsing-model.pickle.gz")
    # Alternative data set (this assignment used to be overwritten on the
    # very next line, so it never took effect):
    # path = "../../../Movies/edu-input-final/"
    path = "../../../Movies/Bigger-set/"
    files = [os.path.join(path, fname) for fname in os.listdir(path)
             if fname.endswith('.edus')]
    # param = miniKJointSGD(files,400,sa_dict,iterations=40)
    param = miniHingeJointTopSGD(pm, files, 1500, weights, iterations=100)
    # Parenthesised form prints the same value on Python 2 and Python 3
    print(param.N)
    print(param.S)
from model import ParsingModel
from tree import RSTTree
from os import listdir
from os.path import join as joinpath
import buildtree

# Module-level parsing model, loaded once at import time and used by
# getParseTree
pm = ParsingModel()
pm.loadmodel("parsing-model.pickle.gz")


def parse(pm, textedus, pos_tags, headwords):
    """
    Parse one document using the given parsing model

    The three text arguments are newline-separated and are split into
    lists before being handed to pm.sr_parse.

    :type pm: ParsingModel
    :param pm: an well-trained parsing model

    :type textedus: string
    :param textedus: segmented EDUs of one document, one per line
                     (the text itself, not a file name)

    :type pos_tags: string
    :param pos_tags: newline-separated POS tag lines

    :type headwords: string
    :param headwords: newline-separated head word lines

    :return: whatever pm.sr_parse returns for the document
    """
    edus = textedus.split('\n')
    pos_tags = pos_tags.split('\n')
    heads = headwords.split('\n')
    # A trailing newline leaves one empty EDU at the end; drop it
    # NOTE(review): pos_tags and heads are not trimmed the same way --
    # confirm that sr_parse tolerates a trailing empty entry there
    if not edus[-1]:
        edus.pop()
    # Pass the split head words; the unsplit string used to be passed
    # here, leaving `heads` computed but unused
    pred_rst = pm.sr_parse(edus, pos_tags, heads)
    return pred_rst


def getParseTree(document, pos_tags, headwords):
    """
    Parse one document with the module-level parsing model

    :type document: string
    :param document: segmented EDUs of one document, one per line

    :type pos_tags: string
    :param pos_tags: newline-separated POS tag lines

    :type headwords: string
    :param headwords: newline-separated head word lines

    :return: the tree produced by parse
    """
    pred_rst = parse(pm, document, pos_tags, headwords)
    # NOTE(review): the tree used to be computed and then discarded (the
    # function returned None); returning it is assumed to be the intent --
    # confirm against callers
    return pred_rst
def _strip_leading_adverbial(text, phrases):
    """Cut each phrase in phrases off the front of text (with or without ', ')"""
    for phrase in phrases:
        if text.startswith(phrase + ', '):
            text = text[len(phrase) + 2:].strip()
        elif text.startswith(phrase):
            text = text[len(phrase):].strip()
    return text


def _simplify_edu(edu, phrases):
    """Simplify edu.text in place and return it ('' when nothing is left)"""
    edu.text = edu.text.strip()
    tokens = word_tokenize(edu.text)
    # Drop every token tagged VBG together with the token at its head index
    # NOTE(review): the "- 1" suggests head_words_indices is 1-based -- confirm
    gerunds = [i for i, tag in enumerate(edu.tags) if tag == 'VBG']
    dropped = set(gerunds)
    dropped.update(edu.head_words_indices[i] - 1 for i in gerunds)
    text = ' '.join(tok for i, tok in enumerate(tokens) if i not in dropped)
    text = _strip_leading_adverbial(text, phrases)
    # Upper-case the first character only when there is one
    if text:
        text = text[0].upper() + text[1:]
    edu.text = text
    return text


def generate_summaries(path):
    """
    Write a short extractive summary next to every *.edus file in a directory

    Each document is parsed with the parsing model, calc_marcu selects the
    EDUs to keep, every selected EDU is simplified and the joined result is
    written to a sibling file with the extension .summary.

    :type path: string
    :param path: directory that contains the *.edus files
    """
    from os import listdir
    from os.path import join as joinpath
    # ----------------------------------------
    # Load the parsing model
    pm = ParsingModel()
    pm.loadmodel("parsing-model.pickle.gz")
    # ----------------------------------------
    # Read all files from the given path
    doclist = [joinpath(path, fname) for fname in listdir(path)
               if fname.endswith('.edus')]
    for fedus in doclist:
        pred_rst = parse(pm, fedus=fedus)
        # You can pass in either summary_factor or summary_p
        # summary_p hardcodes it to p sentences
        # summary_factor is a percentage of the edu length
        top_scoring = calc_marcu(pred_rst, summary_p=2)
        # Swap only the trailing extension, not every '.edus' in the path
        summary_fname = fedus[:-len('.edus')] + '.summary'
        sentences = []
        for edu in top_scoring:
            text = _simplify_edu(edu, adverbial_phr_list)
            # An EDU can be emptied by simplification; leave it out instead
            # of failing on its first character
            if text:
                sentences.append(str(text))
        # Form raw sentences for summary from chosen edu's
        summary = ' '.join(sentences).replace('\t', '').strip()
        # The context manager closes the file even when write() fails
        with open(summary_fname, 'w') as f:
            f.write(summary)