# NOTE: these functions assume module-level imports along these lines
# (fio, annotation, global_params, CRF, OSLOM, Similarity, etc. are
# project-local modules):
#   import os, json, copy, pickle, subprocess
#   import numpy
#   from collections import defaultdict
#   from tempfile import mkdtemp
#   import xml.etree.ElementTree as ET
#   from sklearn import svm
#   from sklearn.metrics import (mean_squared_error, accuracy_score,
#                                precision_recall_fscore_support)
#   import matplotlib.pyplot as plt
#   from matplotlib.backends.backend_pdf import PdfPages

def train_IE256_svm(traincourse, model_dir, name='simlearn_cv'):
    sim_extractor = Similarity()
    allfeatures = sorted(sim_extractor.features.keys())
    features = allfeatures
    name = '_'.join(features)

    if traincourse == 'IE256':
        train = [x for x in range(14, 26) if x != 22]
    else:
        train = [x for x in range(3, 27)]

    model_file = os.path.join(model_dir, '%s_%s.model' % (traincourse, name))

    if fio.IsExist(model_file):
        # reuse the cached model
        with open(model_file, 'rb') as handle:
            clf = pickle.load(handle)
    else:
        train_X, train_Y = combine_files_course(traincourse, train, features)
        clf = svm.SVC()
        clf.fit(train_X, train_Y)
        with open(model_file, 'wb') as handle:
            pickle.dump(clf, handle)
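# A minimal usage sketch (not from the original source): train, or reload
# from cache, the course-level SVM. The output directory is an assumption.
def _demo_train_IE256_svm():
    model_dir = '../data/IE256/model/'  # hypothetical output directory
    fio.NewPath(model_dir)
    # builds the model file '<traincourse>_<features>.model' under model_dir
    train_IE256_svm('IE256', model_dir)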
def extractPhrase(excelfile, folder, sennadatadir, method):
    sheets = range(0, maxWeek)

    for i, sheet in enumerate(sheets):
        week = i + 1

        for type in ['q1', 'q2', 'q3', 'q4']:
        #for type in ['POI', 'MP']:
            print excelfile, sheet, type

            student_summaryList = CourseMirror_Survey.getStudentResponseList(
                excelfile, course, week, type, withSource=False)
            if len(student_summaryList) == 0:
                continue

            path = folder + str(week) + '/'
            fio.NewPath(path)
            filename = path + type + '.' + method + '.key'

            sennafile = sennadatadir + "senna." + str(week) + "." + type + '.output'
            if not fio.IsExist(sennafile):
                continue

            phrases = getKeyPhrases(student_summaryList, sennafile,
                                    method=method, MalformedFlilter=True)

            fio.SaveList(phrases, filename)
def PrintClusterRankSummary(datadir):
    sheets = range(0, maxWeek)

    lectures = fio.LoadDictJson('../data/CourseMIRROR/lectures.json')

    # q3/q4 columns, when present, follow the two named prompt columns
    head = ['week', 'date', 'Point of Interest', "Muddiest Point"]
    body = []

    for i, sheet in enumerate(sheets):
        row = []
        week = i + 1

        row.append(week)
        row.append(getDate(lectures, course, week))

        for type in ['q1', 'q2', 'q3', 'q4']:
            path = datadir + str(week) + '/'
            summaryfile = path + type + '.summary'
            if not fio.IsExist(summaryfile):
                continue

            summaries = [line.strip() for line in fio.ReadFile(summaryfile)]

            sourcefile = path + type + '.summary.source'
            sources = [line.split(',') for line in fio.ReadFile(sourcefile)]

            combinedSummary = []
            for j, (summary, source) in enumerate(zip(summaries, sources)):
                summary = summary.replace('"', '\'')
                combinedSummary.append(str(j + 1) + ") " + summary +
                                       " [" + str(len(source)) + "]")

            row.append('"' + chr(10).join(combinedSummary) + '"')

        body.append(row)

    fio.WriteMatrix(datadir + "summary.txt", body, head)
def train_leave_one_lecture_out(model_dir, name='simlearn_cv'):
    sim_extractor = Similarity()
    allfeatures = sorted(sim_extractor.features.keys())

    # use all features; set k to an index instead to train on a single feature
    k = len(allfeatures)
    if k == len(allfeatures):
        features = allfeatures
    else:
        features = [allfeatures[k]]

    name = '_'.join(features)

    lectures = annotation.Lectures

    MSE = []
    for i, lec in enumerate(lectures):
        train = [x for x in lectures if x != lec]
        test = [lec]

        print train
        print test

        model_file = os.path.join(model_dir, '%d_%s.model' % (lec, name))

        if fio.IsExist(model_file):
            with open(model_file, 'rb') as handle:
                clf = pickle.load(handle)
        else:
            train_X, train_Y = combine_files(train, features)
            clf = svm.SVR()
            clf.fit(train_X, train_Y)
            with open(model_file, 'wb') as handle:
                pickle.dump(clf, handle)

        for q in ['q1', 'q2']:
            test_X, test_Y = combine_files(test, features, prompts=[q])
            predict_Y = clf.predict(test_X)

            mse = mean_squared_error(test_Y, predict_Y)
            MSE.append([lec, q, mse])

    output = '../data/%s/simlearning.cv.%s.txt' % (course, name)
    fio.WriteMatrix(output, MSE, header=['lec', 'prompt', 'MSE'])
def generate_all_files(datadir, extension, anotators=anotators, lectures=AllLectures):
    for annotator in anotators:
        for lec in lectures:
            filename = datadir + annotator + doc_prefix + str(lec) + '_Completed' + extension
            assert fio.IsExist(filename)
            yield filename, lec, annotator
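# Illustrative only: iterating the generator above. The file layout
# ('<datadir><annotator><doc_prefix><lec>_Completed<extension>') is taken
# from generate_all_files itself; the concrete arguments are assumptions.
def _demo_generate_all_files():
    for filename, lec, annotator in generate_all_files('../data/annotation/', '.json'):
        print filename, lec, annotator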
def WriteDocsent(excelfile, folder, phrasedir, np=None):
    sheets = range(0, maxWeek)

    for i, sheet in enumerate(sheets):
        week = i + 1

        for type in ['q1', 'q2', 'q3', 'q4']:
            phrasefile = os.path.join(phrasedir, str(week), type + '.' + method + '.key')
            if not fio.IsExist(phrasefile):
                continue

            print phrasefile

            DID = str(week) + '_' + type

            path = folder + str(week) + '/'
            fio.NewPath(path)
            path = path + type + '/'
            fio.NewPath(path)
            path = path + 'docsent/'
            fio.NewPath(path)
            filename = path + DID + '.docsent'

            # create an XML file
            root = ET.Element(tag='DOCSENT', attrib={'DID': DID, 'LANG': "ENG"})
            root.tail = '\n'
            tree = ET.ElementTree(root)

            phrases = fio.ReadFileUTF8(phrasefile)

            sno_id = 1
            for par, phrase in enumerate(phrases):
                phrase = phrase.rstrip()
                s = [phrase]

                for RSNT, value in enumerate(s):
                    node = ET.Element(tag='S',
                                      attrib={'PAR': str(par + 1),
                                              'RSNT': str(RSNT + 1),
                                              'SNO': str(sno_id)})
                    node.text = value
                    node.tail = '\n'
                    root.append(node)
                    sno_id = sno_id + 1

            tree.write(filename)
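# For reference (reconstructed from the code above, not from original docs):
# WriteDocsent emits one MEAD-style docsent file per week/prompt, shaped
# roughly like:
#
#   <DOCSENT DID="3_q1" LANG="ENG"><S PAR="1" RSNT="1" SNO="1">first phrase</S>
#   <S PAR="2" RSNT="1" SNO="2">second phrase</S>
#   </DOCSENT>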
def gather_rouge(output):
    datadir = '../data/%s/' % course

    models = [
        'QPS_NP',
        #'QPS_A1_N', 'QPS_A2_N',
        'QPS_union', 'QPS_intersect',
        'QPS_combine',
    ]

    methods = [
        'rouge_crf_optimumComparerLSATasa',
        'rouge_crf_ct.svm.default',
        #'rouge_crf_svm',
        #'rouge_crf_svr',
        #'rouge_crf_ct.svr.default',
    ]

    Header = ['method', 'model',
              'R1-R', 'R1-P', 'R1-F',
              'R2-R', 'R2-P', 'R2-F',
              'RSU4-R', 'RSU4-P', 'RSU4-F']

    xbody = []
    for method in methods:
        for model in models:
            filename = os.path.join(datadir, model, "%s.txt" % method)
            if not fio.IsExist(filename):
                continue

            head, body = fio.ReadMatrix(filename, hasHead=True)

            # the last row holds the averages; drop its id column
            row = [method, model]
            row += body[-1][1:]
            xbody.append(row)

    fio.WriteMatrix(output, xbody, Header)
def test_cross_course(train, name='all'):
    wapiti_home = global_params.wapiti_dir

    pattern_file = '../data/%s.pattern.txt' % name

    model_dir = '../data/%s/%s/model/%s/' % (course, system, name)
    fio.NewPath(model_dir)

    feature_dir = '../data/%s/%s/extraction/' % (course, system)
    feature_cv_dir = '../data/%s/%s/extraction/%s/' % (course, system, name)
    fio.NewPath(feature_cv_dir)

    outputdir = '../data/%s/%s/extraction/%s_output/' % (course, system, name)
    fio.NewPath(outputdir)

    lectures = annotation.Lectures

    test_dict = defaultdict(int)

    for i, lec in enumerate(lectures):
        test = [lec]

        model_file = os.path.join(model_dir, '%s.model' % train)
        print model_file

        crf = CRF(wapiti_home)
        if not fio.IsExist(model_file):
            print "Model is not available"
            continue  # skip prediction when the model is missing

        for q in ['q1', 'q2']:
            test_filename = os.path.join(feature_cv_dir, 'test_%d_%s.feature.crf' % (i, q))
            output_file = os.path.join(outputdir, 'test_%d_%s.out' % (i, q))

            test_dict['test_%d_%s' % (i, q)] = 1

            if method == 'combine':
                test_filename_old = test_filename.replace('_combine', '_A1')
                cmd = 'cp %s %s' % (test_filename_old, test_filename)
                os.system(cmd)
            else:
                combine_files(feature_dir, test, test_filename, prompts=[q])

            crf.predict(test_filename, model_file, output_file)

        if debug:
            break

    file_util.save_dict2json(test_dict, class_index_dict_file)
def getRougeTmp(ref, model):
    # return the ROUGE scores given the reference summary and the models

    # create a temp folder
    temp_path = mkdtemp()
    print temp_path

    # write the files
    fio.SaveList(ref, os.path.join(temp_path, 'ref.txt'), '\n')
    fio.SaveList(model, os.path.join(temp_path, 'model.txt'), '\n')

    retcode = subprocess.call(['./get_rouge_tmp %s' % temp_path], shell=True)
    if retcode != 0:
        print "Failed!"
        exit(-1)
    else:
        print "Passed!"

    row = []
    for scorename in RougeNames:
        filename = os.path.join(temp_path, "OUT_" + scorename + ".csv")
        if not fio.IsExist(filename):
            print filename, " not exist"
            row = row + [0, 0, 0]
            continue

        lines = fio.ReadFile(filename)
        try:
            # columns 1-3 of the second line hold recall, precision, F-measure
            scorevalues = lines[1].split(',')
            row.append(scorevalues[1].strip())
            row.append(scorevalues[2].strip())
            row.append(scorevalues[3].strip())
        except Exception:
            print filename, scorename, lines

    # remove the temp folder only after all score files have been read
    # (the original deleted it inside the loop, losing every file after the first)
    fio.DeleteFolder(temp_path)
    return row
def __init__(self, prefix=""):
    self.features = {
        'optimumComparerLSATasa': self.LSA,
        'LexicalOverlap': self.LexicalOverlap,
        'optimumComparerWNLin': self.LIN,
        'BLEU': self.BLEU,
        'ROUGE': self.ROUGE,
        'Cosine': self.Cosine,
        'WordEmbedding': self.WordEmbedding,
        #'WMD': self.WMD,
    }

    self.prefix = prefix

    self.Cache = {}
    self.cachefile = self.prefix + 'cache.json'
    print self.cachefile

    if fio.IsExist(self.cachefile):
        with open(self.cachefile, 'r') as fin:
            self.Cache = json.load(fin)

    if self.prefix != '':
        # load the precomputed pairwise similarity matrices
        self.matrixdict = {}

        for sim in ['optimumComparerLSATasa', 'LexicalOverlap',
                    'optimumComparerWNLin', 'BLEU']:
            self.matrixdict[sim] = {}

            filename = self.prefix + sim
            phrases, matrix = fio.ReadMatrix(filename, hasHead=True)

            index = {}
            for i, p in enumerate(phrases):
                index[p] = i

            self.matrixdict[sim]['index'] = index
            self.matrixdict[sim]['matrix'] = matrix

    self.word2vec = fio.LoadDictJson(global_params.word2vec_model)
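# A sketch of how the feature table above can be used (not from the original
# source): each entry maps a feature name to a bound scoring method, so a
# similarity can be looked up by name. The two-string call signature of the
# scorers is an assumption.
def _demo_similarity_features():
    sim = Similarity()
    for fname in sorted(sim.features.keys()):
        scorer = sim.features[fname]  # e.g. sim.Cosine
        print fname, scorer('the first phrase', 'the second phrase')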
def readgraph_leave_one_lecture_out(phrasedir, modelname='svr'):
    lectures = annotation.Lectures

    oslom = OSLOM()

    if modelname == 'svr':
        weighted = True
        undirect = True
    else:
        weighted = False
        undirect = True

    for i, lec in enumerate(lectures):
        path = os.path.join(phrasedir, str(lec))

        for q in ['q1', 'q2']:
            phrasefile = os.path.join(path, "%s.%s.key" % (q, method))
            phrases = fio.LoadList(phrasefile)

            netgraphfile = os.path.join(
                path,
                "%s.%s.%s%s_oslo_files" % (q, method, modelname, net_exe),
                'tp')

            if not fio.IsExist(netgraphfile):
                # no communities found: put every phrase in its own cluster
                print netgraphfile
                communites = [[x] for x in range(len(phrases))]
            else:
                communites = oslom.readgraph_partitions(netgraphfile)
            #if len(communites) == 1:  # break it
            #    communites = [[x] for x in range(len(phrases))]

            name = 'ct.%s.%s' % (modelname, 'default')

            # write the output
            output = os.path.join(path, "%s.cluster.kmedoids.sqrt.%s.%s" % (q, name, method))
            write_communite_to_clusters(communites, phrases, output)

            print "%d\t%s\t%d" % (lec, q, len(communites))
def getRouge(ref, model):
    # return the ROUGE scores given the reference summary and the models

    # write the files
    fio.SaveList(ref, tmpdir + 'ref.txt', '\n')
    fio.SaveList(model, tmpdir + 'model.txt', '\n')

    retcode = subprocess.call(['./get_rouge'], shell=True)
    if retcode != 0:
        print "Failed!"
        exit(-1)
    else:
        print "Passed!"

    row = []
    for scorename in RougeNames:
        filename = tmpdir + "OUT_" + scorename + ".csv"
        if not fio.IsExist(filename):
            print filename, " not exist"
            row = row + [0, 0, 0]
            continue

        lines = fio.ReadFile(filename)
        try:
            # columns 1-3 of the second line hold recall, precision, F-measure
            scorevalues = lines[1].split(',')
            row.append(scorevalues[1].strip())
            row.append(scorevalues[2].strip())
            row.append(scorevalues[3].strip())
        except Exception:
            print filename, scorename, lines

    return row
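# Hedged usage sketch: getRouge expects two lists of sentence strings and
# returns one [R, P, F] triple per entry of RougeNames, as strings read from
# the OUT_*.csv files. The sentences below are made up.
def _demo_getRouge():
    ref = ['the students are confused about recursion']
    model = ['students confused by recursion']
    print getRouge(ref, model)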
def train_on_course(traincourse, name='all'):
    wapiti_home = global_params.wapiti_dir

    pattern_file = '../data/%s.pattern.txt' % name

    model_dir = '../data/%s/%s/model/%s/' % (course, system, name)
    fio.NewPath(model_dir)

    feature_dir = '../data/%s/%s/extraction/' % (traincourse, system)
    feature_cv_dir = '../data/%s/%s/extraction/%s/' % (traincourse, system, name)
    fio.NewPath(feature_cv_dir)

    outputdir = '../data/%s/%s/extraction/%s_output/' % (course, system, name)
    fio.NewPath(outputdir)

    if traincourse == 'IE256':
        lectures = [x for x in range(14, 26) if x != 22]
    else:
        lectures = [x for x in range(3, 27)]

    train = [x for x in lectures]

    train_filename = os.path.join(feature_cv_dir, 'train.feature.crf')
    model_file = os.path.join(model_dir, '%s.model' % traincourse)

    print train_filename
    print model_file

    crf = CRF(wapiti_home)
    if not fio.IsExist(model_file):
        combine_files(feature_dir, train, train_filename)
        crf.train(train_filename, pattern_file, model_file)
def getOracleRougeSplit(oracledir, np, L, metric, outputdir):
    sheets = range(0, 12)

    body = []

    for i, sheet in enumerate(sheets):
        week = i + 1

        # add a cache to make it faster
        Cache = {}
        cachefile = oracledir + str(week) + '/' + 'cache.json'
        print cachefile
        if fio.IsExist(cachefile):
            with open(cachefile, 'r') as fin:
                Cache = json.load(fin)

        # one 'week' column, then one block of ROUGE columns per type
        # (the original appended 'week' once per type, which misaligned the header)
        row = [week]
        for type in ['POI', 'MP', 'LP']:
            # read TA's summary
            reffile = oracledir + str(week) + '/' + type + '.ref.summary'
            lines = fio.ReadFile(reffile)
            ref = [line.strip() for line in lines]

            # find the last greedy round that produced a summary
            Round = 1
            while True:
                sumfile = oracledir + str(week) + '/' + type + '.' + str(np) + '.L' + str(L) + "." + str(metric) + '.R' + str(Round) + '.summary'
                if not fio.IsExist(sumfile):
                    break
                Round = Round + 1
            Round = Round - 1

            sumfile = oracledir + str(week) + '/' + type + '.' + str(np) + '.L' + str(L) + "." + str(metric) + '.R' + str(Round) + '.summary'

            if fio.IsExist(sumfile):
                ssfile = oracledir + str(week) + '/' + type + '.' + str(np) + '.L' + str(L) + ".summary"
                cmd = 'cp ' + sumfile + ' ' + ssfile
                print cmd
                os.system(cmd)

                lines = fio.ReadFile(sumfile)
                TmpSum = [line.strip() for line in lines]

                cacheKey = getKey(ref, TmpSum)
                if cacheKey in Cache:
                    scores = Cache[cacheKey]
                    print "Hit"
                else:
                    print "Miss", cacheKey
                    print sumfile
                    scores = getRouge(ref, TmpSum)
                    Cache[cacheKey] = scores

                row = row + scores
            else:
                row = row + [0] * len(RougeHeader)

        body.append(row)

    print body
    print "RougeHeader", len(RougeHeader)

    header = ['week'] + RougeHeader * 3

    row = ["average"]
    for i in range(1, len(header)):
        scores = [float(xx[i]) for xx in body]
        row.append(numpy.mean(scores))
    body.append(row)

    fio.WriteMatrix(outputdir + "rouge." + str(np) + '.L' + str(L) + "." + str(metric) + ".txt", body, header)
def gather_rouge():
    Allbody = []
    for cid in ['IE256', 'IE256_2016', 'CS0445']:
        ilpdir = "../data/%s/" % cid

        baseline_rougefile = os.path.join(ilpdir, 'rouge_np.txt')
        if not fio.IsExist(baseline_rougefile):
            continue

        basehead, basebody = fio.ReadMatrix(baseline_rougefile, hasHead=True)

        row = [cid, '', 'PhraseSum'] + ['%.3f' % float(x) for x in basebody[-1][1:-3]]
        Allbody.append(row)

        for A in ['1', '2']:
            for model in ['optimumComparerLSATasa', 'oracle', 'oracle_selection']:
                modeldir = os.path.join(ilpdir, 'oracle_annotator_%s' % A)
                model_rouge_file = os.path.join(modeldir, 'rouge_annotator%s_%s.txt' % (A, model))

                head, body = fio.ReadMatrix(model_rouge_file, hasHead=True)

                # keep each baseline's scores for the significance tests below
                if model == 'optimumComparerLSATasa':
                    basehead1, basebody1 = fio.ReadMatrix(model_rouge_file, hasHead=True)
                elif model == 'oracle':
                    basehead2, basebody2 = fio.ReadMatrix(model_rouge_file, hasHead=True)

                row = [cid, 'A%s' % A, model] + ['%.3f' % float(x) for x in body[-1][1:-3]]

                print cid, model
                print model_rouge_file
                print baseline_rougefile

                # get p-values against the PhraseSum baseline
                from stats_util import get_ttest_pvalues
                pvalues = get_ttest_pvalues(basebody[1:-1], body[1:-1],
                                            range(1, len(head) - 3))

                # '*' marks p < 0.05 vs. PhraseSum, '\dag' vs. the LSA model,
                # '\circ' vs. the oracle
                if model == 'optimumComparerLSATasa':
                    k = 3
                    for p in pvalues:
                        if p < 0.05:
                            row[k] = row[k] + '$^*$'
                        k += 1
                elif model == 'oracle':
                    pvalues1 = get_ttest_pvalues(basebody1[1:-1], body[1:-1],
                                                 range(1, len(head) - 3))
                    k = 3
                    for p1, p2 in zip(pvalues, pvalues1):
                        if p1 < 0.05 and p2 < 0.05:
                            row[k] = row[k] + '$^{*\dag}$'
                        elif p1 < 0.05:
                            row[k] = row[k] + '$^*$'
                        elif p2 < 0.05:
                            row[k] = row[k] + '$^\dag$'
                        k += 1
                elif model == 'oracle_selection':
                    pvalues1 = get_ttest_pvalues(basebody1[1:-1], body[1:-1],
                                                 range(1, len(head) - 3))
                    pvalues2 = get_ttest_pvalues(basebody2[1:-1], body[1:-1],
                                                 range(1, len(head) - 3))
                    k = 3
                    for p1, p2, p3 in zip(pvalues, pvalues1, pvalues2):
                        if p1 >= 0.05 and p2 >= 0.05 and p3 >= 0.05:
                            k += 1
                            continue
                        row[k] = row[k] + '$^{'
                        if p1 < 0.05:
                            row[k] = row[k] + '*'
                        if p2 < 0.05:
                            row[k] = row[k] + '\dag'
                        if p3 < 0.05:
                            row[k] = row[k] + '\circ'
                        row[k] = row[k] + '}$'
                        k += 1

                Allbody.append(row)

    output = '../data/rouge_oracle_all_gather.txt'
    fio.Write2Latex(output, Allbody, [''] + head)
def Greedy(oracledir, np, L, metric='R1-F'):
    sheets = range(0, 12)

    RIndex = RougeHeader.index(metric)
    assert RIndex != -1

    for i, sheet in enumerate(sheets):
        week = i + 1

        # add a cache to make it faster
        Cache = {}
        cachefile = oracledir + str(week) + '/' + 'cache.json'
        if fio.IsExist(cachefile):
            with open(cachefile, 'r') as fin:
                Cache = json.load(fin)

        for type in ['POI', 'MP', 'LP']:
            # read TA's summary
            reffile = oracledir + str(week) + '/' + type + '.ref.summary'
            lines = fio.ReadFile(reffile)
            ref = [line.strip() for line in lines]

            # read the candidate phrases
            phrasefile = oracledir + str(week) + '/' + type + '.' + str(np) + '.key'
            lines = fio.ReadFile(phrasefile)
            candidates = [line.strip() for line in lines]

            summary = []
            Length = 0

            maxSum = []
            maxScore = 0
            Round = 1

            Changed = True
            while Changed:
                Changed = False

                for phrase in candidates:
                    WC = len(phrase.split())
                    if Length + WC > L:
                        continue

                    TmpSum = copy.deepcopy(summary)
                    TmpSum.append(phrase)

                    # get the ROUGE score
                    cacheKey = getKey(ref, TmpSum)
                    if cacheKey in Cache:
                        scores = Cache[cacheKey]
                        print "Hit"
                    else:
                        scores = getRouge(ref, TmpSum)
                        Cache[cacheKey] = scores

                    s = float(scores[RIndex])
                    if s > maxScore:
                        maxSum = TmpSum
                        maxScore = s  # keep the score itself, not the score row
                        Changed = True

                if Changed:
                    # write the results of this round
                    sumfile = oracledir + str(week) + '/' + type + '.' + str(np) + '.L' + str(L) + "." + str(metric) + '.R' + str(Round) + '.summary'
                    fio.SaveList(maxSum, sumfile, '\r\n')

                    summary = maxSum
                    Length = 0
                    for s in maxSum:
                        Length = Length + len(s.split())

                    Round = Round + 1

                    # remove the phrases already selected into the summary
                    candidates = [phrase for phrase in candidates if phrase not in maxSum]

        with open(cachefile, 'w') as outfile:
            json.dump(Cache, outfile, indent=2)
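# The loop above is a standard greedy oracle: repeatedly add whichever
# candidate phrase most improves the target ROUGE metric until the length
# budget L is exhausted. A self-contained sketch of the same idea, with
# score() standing in for the file-based getRouge call:
def _greedy_sketch(candidates, L, score):
    summary, length, best = [], 0, 0.0
    changed = True
    while changed:
        changed = False
        for phrase in [c for c in candidates if c not in summary]:
            wc = len(phrase.split())
            if length + wc > L:
                continue
            s = score(summary + [phrase])
            if s > best:
                best, pick = s, phrase
                changed = True
        if changed:
            summary.append(pick)
            length += len(pick.split())
    return summary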
def GetLexRankScore(datadir, np, outputdir):
    sheets = range(0, maxWeek)

    for type in ['q1', 'q2', 'q3', 'q4']:
        for sheet in sheets:
            week = sheet + 1
            DID = str(week) + '_' + type

            phrases = []
            scores = []

            # read the docsent file
            path = datadir + str(week) + '/' + type + '/docsent/'
            filename = path + DID + '.docsent'
            if not fio.IsExist(filename):
                continue

            tree = ET.parse(filename)
            root = tree.getroot()
            for child in root:
                phrases.append(child.text)

            # read the LexRank feature
            path = datadir + str(week) + '/' + type + '/feature/'
            filename = path + type + '.LexRank.sentfeature'

            if fio.IsExist(filename):
                tree = ET.parse(filename)
                root = tree.getroot()
                for child in root:
                    feature = child[0]
                    scores.append(feature.attrib['V'])
            else:
                for phrase in phrases:
                    scores.append("0")

            # write the dictionaries
            assert len(phrases) == len(scores)

            lexdict = {}
            for phrase, score in zip(phrases, scores):
                lexdict[phrase.lower()] = score

            output = outputdir + str(week) + '/' + str(type) + "." + np + ".lexrank.dict"
            fio.NewPath(outputdir + str(week) + '/')
            fio.SaveDict(lexdict, output, SortbyValueflag=True)

            # keep the maximum score for duplicated phrases; compare
            # numerically, since the scores are stored as strings
            lexdict = {}
            for phrase, score in zip(phrases, scores):
                key = phrase.lower()
                if key in lexdict:
                    lexdict[key] = max(score, lexdict[key], key=float)
                else:
                    lexdict[key] = score

            output = outputdir + str(week) + '/' + str(type) + "." + np + ".lexrankmax.dict"
            fio.SaveDict(lexdict, output, SortbyValueflag=True)
def getRouge(datadir, maxWeek, output):
    print datadir

    sheets = range(0, maxWeek)
    body = []

    for sheet in sheets:
        week = sheet + 1
        dir = datadir + str(week) + '/'

        for type in ['q1', 'q2']:
            summary_file = dir + type + "." + 'summary'
            if not fio.IsExist(summary_file):
                print summary_file
                continue

            Cache = {}
            cachefile = os.path.join(datadir, str(week), 'cache.json')
            print cachefile
            if fio.IsExist(cachefile):
                with open(cachefile, 'r') as fin:
                    Cache = json.load(fin)

            # read TA's summaries
            refs = []
            for i in range(2):
                reffile = os.path.join(datadir, str(week), type + '.ref.%d' % i)
                if not fio.IsExist(reffile):
                    print reffile
                    continue
                lines = fio.ReadFile(reffile)
                ref = [line.strip() for line in lines]
                refs.append(ref)

            if len(refs) == 0:
                continue

            # flatten the available references (there may be fewer than two;
            # the original indexed refs[0] + refs[1] unconditionally)
            lstref = [line for ref in refs for line in ref]

            lines = fio.ReadFile(summary_file)
            TmpSum = [line.strip() for line in lines]

            cacheKey = OracleExperiment.getKey(lstref, TmpSum)
            if cacheKey in Cache:
                scores = Cache[cacheKey]
                print "Hit"
            else:
                print "Miss"
                print summary_file
                scores = OracleExperiment.getRouge_IE256(refs, TmpSum)
                Cache[cacheKey] = scores

            row = [week] + scores
            body.append(row)

            try:
                fio.SaveDict2Json(Cache, cachefile)
            except Exception as e:
                print e

    header = ['id'] + RougeHeader
    row = ['ave']
    for i in range(1, len(header)):
        scores = [float(xx[i]) for xx in body]
        row.append(numpy.mean(scores))
    body.append(row)

    fio.WriteMatrix(output, body, header)
def train_leave_one_lecture_out_svm(model_dir, name='simlearn_cv'):
    sim_extractor = Similarity()
    allfeatures = sorted(sim_extractor.features.keys())

    # use all features; set k to an index instead to train on a single feature
    k = len(allfeatures)
    if k == len(allfeatures):
        features = allfeatures
    else:
        features = [allfeatures[k]]

    name = '_'.join(features)

    lectures = annotation.Lectures

    results = []
    for i, lec in enumerate(lectures):
        train = [x for x in lectures if x != lec]
        test = [lec]

        print train
        print test

        model_file = os.path.join(model_dir, '%d_%s.model' % (lec, name))

        if fio.IsExist(model_file):
            with open(model_file, 'rb') as handle:
                clf = pickle.load(handle)
        else:
            train_X, train_Y = combine_files(train, features)
            clf = svm.SVC()
            clf.fit(train_X, train_Y)
            with open(model_file, 'wb') as handle:
                pickle.dump(clf, handle)

        for q in ['q1', 'q2']:
            test_X, test_Y = combine_files(test, features, prompts=[q])
            predict_Y = clf.predict(test_X)

            prf = precision_recall_fscore_support(test_Y, predict_Y, average='weighted')
            accuracy = accuracy_score(test_Y, predict_Y)

            results.append([lec, q, accuracy] + [prf[0], prf[1], prf[2]])

    output = '../data/%s/simlearning.cv.svm.%s.txt' % (course, name)
    fio.WriteMatrix(output, results,
                    header=['lec', 'prompt', 'accuracy', 'precision', 'recall', 'f-score'])
def extractPhraseFromAnnotation(phrasedir, annotator, summarydir=None):
    for doc, lec, annotator in annotation.generate_all_files(
            annotation.datadir + 'json/', '.json',
            anotators=annotator, lectures=annotation.Lectures):
        print doc

        # load the task
        task = annotation.Task()
        task.loadjson(doc)

        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        # add a cache to make it faster
        Cache = {}
        cachefile = phrasedir + str(lec) + '/' + 'cache.json'
        if fio.IsExist(cachefile):
            with open(cachefile, 'r') as fin:
                Cache = json.load(fin)

        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.' + method + '.key'
            cluster_output = path + prompt + '.cluster.kmedoids.sqrt.oracle.%s' % method

            if summarydir:
                fio.NewPath(os.path.join(summarydir, str(lec)))
                summary_file = os.path.join(summarydir, str(lec), '%s.summary' % prompt)

            body = []

            if summarydir:
                summaries = []

            phrase_summary_dict = task.get_phrase_summary_textdict(prompt)

            extracted_phrases = []
            phrase_annotation = task.get_phrase_annotation(prompt)
            for rank in sorted(phrase_annotation):
                rank_phrases = []
                phrases = phrase_annotation[rank]
                for phrasedict in phrases:
                    phrase = phrasedict['phrase'].lower()
                    extracted_phrases.append(phrase)
                    rank_phrases.append(phrase)
                    row = [phrase, rank]
                    body.append(row)

                if summarydir:
                    rank_summary = phrase_summary_dict[rank]
                    max_summary = get_max_phrase_by_ROUGE(rank_summary, rank_phrases, Cache)
                    print max_summary
                    summaries.append(max_summary)

            fio.SaveList(extracted_phrases, filename)
            fio.WriteMatrix(cluster_output, body, header=None)

            if summarydir:
                fio.SaveList(summaries, summary_file)

        with open(cachefile, 'w') as outfile:
            json.dump(Cache, outfile, indent=2)
def gather_rouge(output):
    courses = ['IE256', 'IE256_2016', 'CS0445']

    rouges = [
        ('LexRank', 'QPS_NP', 'rouge_LexRank'),
        ('PhraseSum', 'QPS_NP', 'rouge_crf_optimumComparerLSATasa'),
        ('SequenceSum', 'QPS_combine_coling', 'rouge_crf_optimumComparerLSATasa'),
        ('SimSum', 'QPS_combine_coling', 'rouge_crf_svm'),
        ('CDSum', 'QPS_combine_coling', 'rouge_crf_ct.svm.default'),
    ]

    baseline1 = ('PhraseSum', 'QPS_NP', 'rouge_crf_optimumComparerLSATasa')
    baseline2 = ('SequenceSum', 'QPS_combine_coling', 'rouge_crf_optimumComparerLSATasa')

    Header = ['course', 'name',
              'R1-R', 'R1-P', 'R1-F',
              'R2-R', 'R2-P', 'R2-F',
              'RSU4-R', 'RSU4-P', 'RSU4-F']

    ROUGE_Head = ['id',
                  'R1-R', 'R1-P', 'R1-F',
                  'R2-R', 'R2-P', 'R2-F',
                  'RSU4-R', 'RSU4-P', 'RSU4-F']

    ROUGE_index = [ROUGE_Head.index(name) for name in ROUGE_Head if name != 'id']

    xbody = []
    for course in courses:
        for name, model, method in rouges:
            datadir = '../data/%s/' % course

            filename = os.path.join(datadir, model, "%s.txt" % method)
            if not fio.IsExist(filename):
                continue

            baseline1_name = os.path.join(datadir, baseline1[1], "%s.txt" % baseline1[2])
            baseline2_name = os.path.join(datadir, baseline2[1], "%s.txt" % baseline2[2])

            # '*' marks significance against baseline1, '+' against baseline2
            if name in ['LexRank', 'SequenceSum', 'SimSum', 'CDSum']:
                pvalues1 = get_pvalues(filename, baseline1_name, ROUGE_index)
            else:
                pvalues1 = [1] * len(ROUGE_index)

            if name in ['SimSum', 'CDSum']:
                pvalues2 = get_pvalues(filename, baseline2_name, ROUGE_index)
            else:
                pvalues2 = [1] * len(ROUGE_index)

            head, body = fio.ReadMatrix(filename, hasHead=True)

            row = [course, name]
            row += ['%.3f%s%s' % (float(x),
                                  '*' if pvalues1[i] < 0.05 else '',
                                  '+' if pvalues2[i] < 0.05 else '')
                    for i, x in enumerate(body[-1][1:])]
            xbody.append(row)

    fio.WriteMatrix(output, xbody, Header)
def getRouge(datadir, maxWeek, output):
    sheets = range(0, maxWeek)
    body = []
    allbody = []

    # search only the configured K; set Krange = range(1, 25) to sweep
    Krange = [gK]

    for sheet in sheets:
        week = sheet + 1
        dir = datadir + str(week) + '/'

        for type in ['q1', 'q2']:
            maxS = 0
            maxK = -1
            maxScore = []

            Cache = {}
            cachefile = os.path.join(datadir, str(week), 'cache.json')
            print cachefile
            if fio.IsExist(cachefile):
                with open(cachefile, 'r') as fin:
                    Cache = json.load(fin)

            allrow = [week]

            for K in Krange:
                summary_file = dir + type + '.%d.summary' % K
                if not fio.IsExist(summary_file):
                    print summary_file
                    continue

                # read TA's summaries
                refs = []
                for i in range(2):
                    reffile = os.path.join(datadir, str(week), type + '.ref.%d' % i)
                    if not fio.IsExist(reffile):
                        print reffile
                        continue
                    lines = fio.ReadFile(reffile)
                    ref = [line.strip() for line in lines]
                    refs.append(ref)

                if len(refs) == 0:
                    continue

                # flatten the available references (there may be fewer than two)
                lstref = [line for ref in refs for line in ref]

                lines = fio.ReadFile(summary_file)
                TmpSum = [line.strip() for line in lines]

                cacheKey = OracleExperiment.getKey(lstref, TmpSum)
                if cacheKey in Cache:
                    scores = Cache[cacheKey]
                    print "Hit"
                else:
                    print "Miss"
                    print summary_file
                    scores = OracleExperiment.getRouge_IE256(refs, TmpSum)
                    Cache[cacheKey] = scores

                s = float(scores[RIndex])
                allrow.append(s)

                if s >= maxS:
                    maxS = s
                    maxScore = scores
                    maxK = K

            if maxK == -1:
                continue

            row = [week] + maxScore + [maxK]
            body.append(row)

            allrow.append(maxK)
            allbody.append(allrow)

            try:
                fio.SaveDict2Json(Cache, cachefile)
            except Exception:
                pass

    # the trailing column holds the best K for each row
    header = ['id'] + RougeHeader + ['K']
    row = ['ave']
    for i in range(1, len(header)):
        scores = [float(xx[i]) for xx in body]
        row.append(numpy.mean(scores))
    body.append(row)

    fio.WriteMatrix(output, body, header)
    fio.WriteMatrix(output + '.all', allbody, ['week'] + Krange)
def train_leave_one_lecture_out(name='cv'):
    wapiti_home = global_params.wapiti_dir

    pattern_file = '../data/%s.pattern.txt' % name

    model_dir = '../data/%s/%s/model/%s/' % (course, system, name)
    fio.NewPath(model_dir)

    feature_dir = '../data/%s/%s/extraction/' % (course, system)
    feature_cv_dir = '../data/%s/%s/extraction/%s/' % (course, system, name)
    fio.NewPath(feature_cv_dir)

    outputdir = '../data/%s/%s/extraction/%s_output/' % (course, system, name)
    fio.NewPath(outputdir)

    lectures = annotation.Lectures

    test_dict = defaultdict(int)

    for i, lec in enumerate(lectures):
        train = [x for x in lectures if x != lec]
        test = [lec]

        train_filename = os.path.join(feature_cv_dir, 'train_%d.feature.crf' % i)
        model_file = os.path.join(model_dir, '%d.model' % i)

        print train_filename
        print model_file

        crf = CRF(wapiti_home)
        if not fio.IsExist(model_file):
            combine_files(feature_dir, train, train_filename)
            crf.train(train_filename, pattern_file, model_file)

        for q in ['q1', 'q2']:
            test_filename = os.path.join(feature_cv_dir, 'test_%d_%s.feature.crf' % (i, q))
            output_file = os.path.join(outputdir, 'test_%d_%s.out' % (i, q))

            test_dict['test_%d_%s' % (i, q)] = 1

            if empty == 'Y':
                test_filename_old = test_filename.replace('_Y', '_N')
                cmd = 'cp %s %s' % (test_filename_old, test_filename)
                os.system(cmd)
            else:
                if method == 'combine':
                    test_filename_old = test_filename.replace('_combine', '_A1')
                    cmd = 'cp %s %s' % (test_filename_old, test_filename)
                    os.system(cmd)
                else:
                    combine_files(feature_dir, test, test_filename, prompts=[q])

            crf.predict(test_filename, model_file, output_file)

        if debug:
            break

    file_util.save_dict2json(test_dict, class_index_dict_file)
def getShallowSummary(excelfile, folder, clusterdir, K=30, method=None, similarity=None, ratio=None, lex='lexrank'):
    # K is the maximum number of phrases kept per prompt
    # (the word-budget variant is commented out below)
    sheets = range(0, maxWeek)

    for i, sheet in enumerate(sheets):
        week = i + 1

        for type in ['q1', 'q2', 'q3', 'q4']:
            path = folder + str(week) + '/'
            fio.NewPath(path)
            filename = path + type + '.%d.summary' % ratio

            # produce the cluster file on the fly
            phrasefile = os.path.join(clusterdir, str(week), type + '.' + method + '.key')
            if not fio.IsExist(phrasefile):
                continue

            print excelfile, sheet, type

            cluster_output = clusterdir + str(week) + '/' + type + ".cluster.kmedoids." + str(ratio) + "." + similarity + '.' + method
            print cluster_output

            weightfile = clusterdir + str(week) + '/' + type + '.' + method + '.' + similarity
            print weightfile

            if not fio.IsExist(cluster_output):
                print "clustering"
                phraseClusteringKmedoid.getPhraseClusterPhrase(phrasefile, weightfile, cluster_output, ratio, method=method)
            if not fio.IsExist(cluster_output):
                continue

            body = fio.ReadMatrix(cluster_output, False)

            NPCandidates = fio.ReadFile(phrasefile)

            lexfile = clusterdir + str(week) + '/' + str(type) + "." + method + "." + lex + ".dict"
            lexdict = fio.LoadDict(lexfile, 'float')

            NPs = [row[0] for row in body]
            clusterids = [row[1] for row in body]

            if NPCandidates != NPs:
                print NPCandidates
                print NPs

            cluster = {}
            for row in body:
                cluster[row[0]] = int(row[1])

            Summary = []

            # sort the clusters according to the number of responses
            keys = postProcess.RankClusterNoSource(NPs, lexdict, clusterids)

            total_word = 0
            word_count = 0
            for key in keys:
                # take the top-ranked phrase in each cluster
                phrase = postProcess.getTopRankPhraseNoSource(NPs, clusterids, int(key), lexdict)
                if phrase in Summary:
                    continue

                word_count = len(phrase.split())
                total_word = total_word + word_count

                #if total_word <= K:
                if len(Summary) + 1 <= K:
                    Summary.append(phrase)

            fio.SaveList(Summary, filename)
def plot_rouge_by_time():
    for metric in ['R1-R', 'R1-P', 'R1-F',
                   'R2-R', 'R2-P', 'R2-F',
                   'RSU4-R', 'RSU4-P', 'RSU4-F']:
        for prompt in ['q1', 'q2']:
            courses = ['IE256', 'IE256_2016', 'CS0445']

            rouges = [
                #('LexRank', 'QPS_NP', 'rouge_LexRank'),
                ('PhraseSum', 'QPS_NP', 'rouge_crf_optimumComparerLSATasa'),
                ('SequenceSum', 'QPS_combine_coling', 'rouge_crf_optimumComparerLSATasa'),
                #('SimSum', 'QPS_combine_coling', 'rouge_crf_svm'),
                #('CDSum', 'QPS_combine_coling', 'rouge_crf_ct.svm.default'),
            ]

            ROUGE_Head = ['id',
                          'R1-R', 'R1-P', 'R1-F',
                          'R2-R', 'R2-P', 'R2-F',
                          'RSU4-R', 'RSU4-P', 'RSU4-F']
            metric_index = ROUGE_Head.index(metric)

            fig = plt.figure()
            ax = fig.add_subplot(111)
            ax.set_xlabel('week', fontsize=12)
            ax.set_ylabel('Rouge', fontsize=12)

            plt.title('%s %s' % (metric, annotation.prompt_name[prompt]))
            plt.grid(True)

            for cid, course in enumerate(courses):
                for name, model, method in rouges:
                    datadir = '../data/%s/' % course
                    filename = os.path.join(datadir, model, "%s_%s.txt" % (method, prompt))
                    if not fio.IsExist(filename):
                        continue

                    X, Y = get_X_Y(filename, metric_index)
                    plt.plot(X, Y, label='%s_%s' % (course, name),
                             alpha=0.6, linewidth=2)

            legend = plt.legend(loc='upper right', shadow=True, fontsize='small')

            pp = PdfPages('../data/%s_%s.pdf' % (metric, prompt))
            plt.savefig(pp, format='pdf')
            pp.close()
            plt.close(fig)  # avoid accumulating open figures across iterations