def evaluate_all_duc(fp, papers_path, pk_path, modeltype='mymodel', inc_abscon=True, useabstr=1, maxtxt=-1, topk=10): print "model type =", modeltype # -- if modeltype is not in our model list, not execute the following process -- if modeltype not in idname.keys(): return # -- get algorithms generated summaries directory -- if not inc_abscon: system_dir = os.path.join(fp, 'systemFliter_html1') else: system_dir = os.path.join(fp, 'systemFliter_html2') if not os.path.exists(system_dir): os.makedirs(system_dir) # -- get the original filename list in the path variable -- file_list = sxpGetDirFileList(papers_path)[0] # -- for each file, get its sxpText and run modeltype ranking model on it -- for i, file_name in enumerate(file_list): # ---- get file name ---- fset = file_name.split('.') if fset[-1] != 'xhtml': # if current file is not a txt file, it is not one of the original papers continue print "processing", i, "th paper named as", file_name # -- get single pickle file directory -- if not inc_abscon: pickle_path = os.path.join(pk_path, file_name +'_1.pickle') else: pickle_path = os.path.join(pk_path, file_name +'_2.pickle') # -- run ranking model on sxpText object at pickle_path -- if modeltype == 'mymodel': model = MyModel(pickle_path) if modeltype == 'tfidf': model = TfIdf(pickle_path) if modeltype == 'graphb': model = GraphBased(pickle_path) if modeltype == 'graphw': model = WordGraph(pickle_path) if modeltype == 'context1': model = conTextModel(pickle_path) if modeltype == 'mysecmodel': model = MySecModel(pickle_path) if modeltype == 'myseccontextmodel': model = SecConTextModel(pickle_path) if modeltype == 'hybrid': # sxpHybridGraph.py model = HybridGraph(pickle_path) if modeltype == 'sectitle': # MySecTitleNetwork.py model = MySecTitleModel(pickle_path) # -- get the .html file's suffix NO. -- idstr = idname[modeltype] # -- save original model ranked topk sentences to text -- topksent_path = os.path.join(system_dir, file_name+'.html.'+idstr) tops = model.OutPutTopKSent(topk, useabstr, maxtxt) st = ProduceSystem(tops, file_name, 1) WriteStrFile(topksent_path, st, 'utf-8') # -- save pattern matched cause-effect links ranked topk sentences to text -- sysce_topk = topk if topk <= len(model.text.sysce_sent_id_dict) else len(model.text.sysce_sent_id_dict) if sysce_topk > 0: topk_sysce_sent_path = os.path.join(system_dir, file_name+'.html.'+str(30 + int(idstr))) topk_sysce_sent = OutPutTopKsysCESent(model, sysce_topk, useabstr, maxtxt) sysce_st = ProduceSystem(topk_sysce_sent, file_name, 1) WriteStrFile(topk_sysce_sent_path, sysce_st, 'utf-8') print "ranking complete"
def evaluate_one_model(pickle_path, pk_sys_set, system_path, system_id, modeltype='mymodel', topksent=10, on_mance=False, on_sysce=False): print "model type =", modeltype, "top k =", topksent for i, (file_name, system_name) in enumerate(pk_sys_set): print i, file_name pickle_file = os.path.join(pickle_path, file_name) if modeltype == 'mymodel': # MyModel.py model = MyModel(pickle_file) if modeltype == 'tfidf': # tf_idf.py model = TfIdf(pickle_file) if modeltype == 'graphb': # graph_base.py model = GraphBased(pickle_file) if modeltype == 'graphw': # word_graph.py model = WordGraph(pickle_file) if modeltype == 'context1': # sxpContextModel.py SubPara Model model = conTextModel(pickle_file) if modeltype == 'mysecmodel': # MySecModel.py model = MySecModel(pickle_file) if modeltype == 'myseccontextmodel': # MySecContextModel.py SecSub Model model = SecConTextModel(pickle_file) if modeltype == 'hybrid': # sxpHybridGraph.py model = HybridGraph(pickle_file) if modeltype == 'sectitle': # MySecTitleNetwork.py model = MySecTitleModel(pickle_file) # -- save top k sentences to text -- topksent_path = os.path.join(system_path, system_name + "." + system_id + ".txt") tops = model.OutPutTopKSent(topksent, 1, -1) st = ProduceSystem(tops, file_name, 1) WriteStrFile(topksent_path, st, 'utf-8') if on_mance: topk_mance_sent_path = os.path.join(system_path, system_name + "." + system_id + ".mance.txt") topk_mance_sent = OutPutTopKmanCESent(model, topksent, 1, -1) mance_st = ProduceSystem(topk_mance_sent, file_name, 1) WriteStrFile(topk_mance_sent_path, mance_st, 'utf-8') if on_sysce: topk_sysce_sent_path = os.path.join(system_path, system_name+"."+system_id+".sysce.txt") topk_sysce_sent = OutPutTopKsysCESent(model, topksent, 1, -1) sysce_st = ProduceSystem(topk_sysce_sent, file_name, 1) WriteStrFile(topk_sysce_sent_path, sysce_st, 'utf-8') # -- save all ranked sentences to text -- allsent = model.OutputAllRankSentence() pkfname = topksent_path + 'allsent.pk' StoreSxptext(allsent, pkfname) i += 1 print "ranking complete!"
def evaluate_all_duc(fp, papers_path, pk_path, modeltype='mymodel', inc_abscon=True, useabstr=1, maxtxt=-1, topk=10): print "model type =", modeltype # -- if modeltype is not in our model list, not execute the following process -- if modeltype not in idname.keys(): return # -- get algorithms generated summaries directory -- if not inc_abscon: system_dir = os.path.join(fp, 'systemPure_html1') rankedsentdir = os.path.join(fp, 'RankedSentPure_1') else: system_dir = os.path.join(fp, 'systemPure_html2') rankedsentdir = os.path.join(fp, 'RankedSentPure_2') if not os.path.exists(system_dir): os.makedirs(system_dir) if not os.path.exists(rankedsentdir): os.makedirs(rankedsentdir) # -- get the original filename list in the path variable -- file_list = sxpGetDirFileList(papers_path)[0] # -- for each file, get its sxpText and run modeltype ranking model on it -- for i, file_name in enumerate(file_list): # ---- get file name ---- if not re.match(ur'AP\d{6}-\d{4}|FBIS\d-\d+', file_name ): # if current file is not one of the original papers continue print "processing", i, "th paper named as", file_name # -- get single pickle file directory -- if not inc_abscon: pickle_path = os.path.join(pk_path, file_name) else: pickle_path = os.path.join(pk_path, file_name) # -- run ranking model on sxpText object at pickle_path -- if modeltype == 'mymodel': model = MyModel(pickle_path) if modeltype == 'tfidf': model = TfIdf(pickle_path) if modeltype == 'graphb': model = GraphBased(pickle_path) if modeltype == 'graphw': model = WordGraph(pickle_path) if modeltype == 'context1': model = conTextModel(pickle_path) if modeltype == 'mysecmodel': model = MySecModel(pickle_path) if modeltype == 'myseccontextmodel': model = SecConTextModel(pickle_path) if modeltype == 'hybrid': # sxpHybridGraph.py model = HybridGraph(pickle_path) if modeltype == 'sectitle': # MySecTitleNetwork.py model = MySecTitleModel(pickle_path) # -- get the .html file's suffix NO. -- idstr = idname[modeltype] # -- save ranked sentence to a pickle file -- ranked_sent_fp = os.path.join( rankedsentdir, file_name + ".html." + idstr + ".allsent.pk") sentidlst = sorted(model.text.ce_sys.iteritems(), key=lambda asd: asd[1], reverse=True) ranked_sentences = [ model.text.sentenceset[sentid[0]].sentence_text for sentid in sentidlst ] StoreSxptext(ranked_sentences, ranked_sent_fp) # -- save original model ranked topk sentences to text -- topksent_path = os.path.join(system_dir, file_name + '.html.' + idstr) tops = model.OutPutTopKSent(topk, useabstr, maxtxt) st = ProduceSystem(tops, file_name, 1) WriteStrFile(topksent_path, st, 'utf-8')
def evaluate_all_kg_sec(system_path, pk_path, modeltype='mymodel', ce_opt='mance', bias=0.65, fname_topk_dict={}, fname_maxstr_dict={}, useabstr=0, maxstr=2500, topk=25): print "model type =", modeltype # -- get the original filename list in the path variable -- file_list = sxpGetDirFileList(pk_path)[0] # -- for each file, get its sxpText and run modeltype ranking model on it -- for i, file_name in enumerate(file_list): # ---- get file name ---- fset = file_name.split('.') fname = '.'.join(fset[0:2]) secname = fset[2] if fset[-1] != 'pk': # if current file is not a pickle file, it is not one of the original papers continue # ---- Get system summaries directory ---- system_dir = os.path.join(system_path, fname, secname) if not os.path.exists(system_dir): os.makedirs(system_dir) # ---- get the upper bound of a system summary's str number ---- if fname_maxstr_dict.has_key(fname + '.' + secname): maxstr = fname_maxstr_dict[fname + '.' + secname] if fname_topk_dict.has_key(fname + '.' + secname): topk = fname_topk_dict[fname + '.' + secname] print "processing", i, "th paper named as", file_name # -- get single pickle file directory -- pickle_path = os.path.join(pk_path, file_name) # ----------- Original Sentence Ranking Models -------------- if modeltype == 'mymodel': model = MyModel(pickle_path) if modeltype == 'tfidf': model = TfIdf(pickle_path) if modeltype == 'graphb': model = GraphBased(pickle_path) if modeltype == 'graphw': model = WordGraph(pickle_path) if modeltype == 'context1': model = conTextModel(pickle_path) if modeltype == 'mysecmodel': model = MySecModel(pickle_path) if modeltype == 'myseccontextmodel': model = SecConTextModel(pickle_path) if modeltype == 'hybrid': # sxpHybridGraph.py model = HybridGraph(pickle_path) if modeltype == 'sectitle': # MySecTitleNetwork.py model = MySecTitleModel(pickle_path) # --------------- CEBias Ranking Models ------------------- if modeltype == 'mymodel2': model = MyModel2(pickle_path, opt=ce_opt, bias=bias) if modeltype == 'tfidf2': model = TfIdf2(pickle_path, opt=ce_opt, bias=bias) if modeltype == 'graphb2': model = GraphBased2(pickle_path, opt=ce_opt, bias=bias) if modeltype == 'graphw2': model = WordGraph2(pickle_path, opt=ce_opt, bias=bias) if modeltype == 'context2': model = conTextModel2(pickle_path, opt=ce_opt, bias=bias) if modeltype == 'mysecmodel2': model = MySecModel2(pickle_path, opt=ce_opt, bias=bias) if modeltype == 'myseccontextmodel2': model = SecConTextModel2(pickle_path, opt=ce_opt, bias=bias) if modeltype == 'hybrid2': # sxpHybridGraph.py model = HybridGraph2(pickle_path, opt=ce_opt, bias=bias) if modeltype == 'sectitle2': # MySecTitleNetwork.py model = MySecTitleModel2(pickle_path, opt=ce_opt, bias=bias) # --------------- CEIter Ranking Models ------------------- if modeltype == 'mymodel3': model = MyModel3(pickle_path, opt=ce_opt, bias=bias) if modeltype == 'context3': model = conTextModel3(pickle_path, opt=ce_opt, bias=bias) if modeltype == 'mysecmodel3': model = MySecModel3(pickle_path, opt=ce_opt, bias=bias) if modeltype == 'myseccontextmodel3': model = SecConTextModel3(pickle_path, opt=ce_opt, bias=bias) if modeltype == 'hybrid3': # sxpHybridGraph.py model = HybridGraph3(pickle_path, opt=ce_opt, bias=bias) if modeltype == 'sectitle3': # MySecTitleNetwork.py model = MySecTitleModel3(pickle_path, opt=ce_opt, bias=bias) # -- get the .html file's suffix NO. -- if modeltype in idname.keys(): idstr = idname[modeltype] # -- save original model ranked topk sentences to text -- topksent_path = os.path.join(system_dir, fname + '.' + secname + '.html.' + idstr) topksent_path_2 = os.path.join( system_dir, fname + '.' + secname + '.html.sent.' + idstr) tops = model.OutPutTopKSent(topk, useabstr, maxstr) st = ProduceSystem(tops, fname + '.' + secname, 1) if useabstr < 0: WriteStrFile(topksent_path_2, st, 'utf-8') else: WriteStrFile(topksent_path, st, 'utf-8') print "ranking complete"
def evaluate_all_kg(fp, papers_path, pk_path, modeltype='mymodel', inc_abscon=True, useabstr=1, maxtxt=-1, fname_topk_dict={}): print "model type =", modeltype # systemdir should be managed accoding to the inc_abscon parameter if not inc_abscon: system_dir = os.path.join(fp, 'systemPure_1') system_mandir = os.path.join(fp, 'systemPure_man1') else: system_dir = os.path.join(fp, 'systemPure_2') system_mandir = os.path.join(fp, 'systemPure_man2') if not os.path.exists(system_dir): os.makedirs(system_dir) if not os.path.exists(system_mandir): os.makedirs(system_mandir) # -- get the original filename list in the path variable -- file_list = sxpGetDirFileList(papers_path)[0] # -- for each file, get its sxpText and run modeltype ranking model on it -- for i, file_name in enumerate(file_list): # ---- get topk sentence ---- if not fname_topk_dict.has_key(file_name): continue else: topk = fname_topk_dict[file_name] # ---- get file name ---- fset = file_name.split('.') if fset[-1] != 'txt': # if current file is not a txt file, it is not one of the original papers continue print "processing", i, "th paper named as", file_name # -- get single pickle file directory -- if not inc_abscon: pickle_path = os.path.join(pk_path, file_name + '_1.pk') else: pickle_path = os.path.join(pk_path, file_name + '_2.pk') # -- run ranking model on sxpText object at pickle_path -- if modeltype == 'mymodel': model = MyModel(pickle_path) if modeltype == 'tfidf': model = TfIdf(pickle_path) if modeltype == 'graphb': model = GraphBased(pickle_path) if modeltype == 'graphw': model = WordGraph(pickle_path) if modeltype == 'context1': model = conTextModel(pickle_path) if modeltype == 'mysecmodel': model = MySecModel(pickle_path) if modeltype == 'myseccontextmodel': model = SecConTextModel(pickle_path) if modeltype == 'hybrid': # sxpHybridGraph.py model = HybridGraph(pickle_path) if modeltype == 'sectitle': # MySecTitleNetwork.py model = MySecTitleModel(pickle_path) # -- get the .html file's suffix NO. -- if modeltype in idname.keys(): idstr = idname[modeltype] # -- save original model ranked topk sentences to text -- tops = model.OutPutTopKSent(topk, useabstr, maxtxt) st = ProduceSystem(tops, file_name, 1) topksent_path = os.path.join(system_dir, file_name + '.html.' + idstr) WriteStrFile(topksent_path, st, 'utf-8') topksent_path = os.path.join(system_mandir, file_name + '.html.' + idstr) WriteStrFile(topksent_path, st, 'utf-8') print "ranking complete"