def getRouge_Tac(refs, model):
    #return the Rouge scores given the reference summaries and the model

    #write the files
    fio.SaveList(model, tmpdir + 'model.txt', '\n')
    for i, ref in enumerate(refs):
        fio.SaveList(ref, tmpdir + 'ref%d.txt' % (i+1), '\n')

    retcode = subprocess.call(['./get_rouge_tac'], shell=True)
    if retcode != 0:
        print "Failed!"
        exit(-1)
    else:
        print "Passed!"

    row = []
    for scorename in RougeNames:
        filename = tmpdir + "OUT_" + scorename + ".csv"
        lines = fio.ReadFile(filename)
        try:
            #grab the three values reported for this metric
            scorevalues = lines[1].split(',')
            row.append(scorevalues[1].strip())
            row.append(scorevalues[2].strip())
            row.append(scorevalues[3].strip())
        except Exception:
            print filename, scorename, lines

    return row
def extractPhrase(excelfile, folder, sennadatadir, method):
    sheets = range(0, maxWeek)

    for i, sheet in enumerate(sheets):
        week = i + 1

        for type in ['q1', 'q2', 'q3', 'q4']:
        #for type in ['POI', 'MP']:
            print excelfile, sheet, type
            student_summaryList = CourseMirror_Survey.getStudentResponseList(excelfile, course, week, type, withSource=False)
            if len(student_summaryList) == 0: continue

            path = folder + str(week) + '/'
            fio.NewPath(path)
            filename = path + type + '.' + method + '.key'

            sennafile = sennadatadir + "senna." + str(week) + "." + type + '.output'
            if not fio.IsExist(sennafile): continue

            phrases = getKeyPhrases(student_summaryList, sennafile, method=method, MalformedFlilter=True)

            fio.SaveList(phrases, filename)
def extractPhraseFromCRFWithColor(phrasedir, systemdir):
    crf_reader = CRF()
    aligner = AlignPhraseAnnotation()

    lectures = annotation.Lectures

    for i, lec in enumerate(lectures):
        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.' + method + '.key'
            extracted_phrases = []
            extracted_colors = []

            crf_file = os.path.join(systemdir, 'extraction', 'all_output', 'test_%i_%s.out' % (i, prompt))
            for tokens, tags, color0, color1 in crf_reader.read_file_generator_index(crf_file, [0, -1, -4, -3]):
                phrases, phrase_colors = aligner.get_phrase_with_colors(tokens, tags, [color0, color1])

                for phrase, phrase_color in zip(phrases, phrase_colors):
                    extracted_phrases.append(phrase.lower())
                    extracted_colors.append(phrase_color)

            fio.SaveList(extracted_phrases, filename)

            filename = path + prompt + '.' + method + '.key.color'
            fio.SaveDict2Json(extracted_colors, filename)
def get_phrase_reference_summary_phrase(outputs=None):
    for output in outputs:
        fio.NewPath(output)

        counts = []
        for doc, lec, annotator in annotation.generate_all_files(annotation.datadir + 'json/', '.json', anotators=annotation.anotators, lectures=annotation.Lectures):
            print doc

            #load task
            task = annotation.Task()
            task.loadjson(doc)

            sub_tasks = task.get_tasks()
            for sub_task in sub_tasks:
                if sub_task["task_name"] == "Phrase":
                    if sub_task['prompt'] == 0:  #POI
                        type = 'q1'
                    else:
                        type = 'q2'

                    summary_filename = os.path.join(output, str(lec), type + '.ref.' + str(annotation.anotator_dict[annotator]))
                    #summary_filename = os.path.join(output, str(lec), type+'.ref.summary')
                    print summary_filename

                    summaries = [row[1] for row in sub_task["summary"][1:]]
                    colors = [row[0].strip()[1] for row in sub_task["summary"][1:]]
                    student_numbers = [row[2].strip() for row in sub_task["summary"][1:]]

                    count = 0
                    for summary in summaries:
                        count += len(NLTKWrapper.wordtokenizer(summary))
                    counts.append(count)

                    fio.SaveList(summaries, summary_filename)

                    color_filename = os.path.join(output, str(lec), '%s.ref.%s.color' % (type, str(annotation.anotator_dict[annotator])))
                    fio.SaveList(colors, color_filename)

                    no_filename = os.path.join(output, str(lec), '%s.ref.%s.no' % (type, str(annotation.anotator_dict[annotator])))
                    fio.SaveList(student_numbers, no_filename)

        print counts
        print numpy.mean(counts)
        print numpy.median(counts)
def getRougeTmp(ref, model):
    #return the Rouge scores given the reference summary and the models

    #create a temp folder for this evaluation
    temp_path = mkdtemp()
    print temp_path

    #write the files
    fio.SaveList(ref, os.path.join(temp_path, 'ref.txt'), '\n')
    fio.SaveList(model, os.path.join(temp_path, 'model.txt'), '\n')

    retcode = subprocess.call(['./get_rouge_tmp %s' % temp_path], shell=True)
    if retcode != 0:
        print "Failed!"
        exit(-1)
    else:
        print "Passed!"

    row = []
    for scorename in RougeNames:
        filename = os.path.join(temp_path, "OUT_" + scorename + ".csv")
        if not fio.IsExist(filename):
            print filename, "does not exist"
            row = row + [0, 0, 0]
            continue

        lines = fio.ReadFile(filename)
        try:
            scorevalues = lines[1].split(',')
            row.append(scorevalues[1].strip())
            row.append(scorevalues[2].strip())
            row.append(scorevalues[3].strip())
        except Exception:
            print filename, scorename, lines

    #clean up only after all score files have been read; deleting inside the
    #loop (as before) removed the folder after the first metric
    fio.DeleteFolder(temp_path)

    return row
def getStudentResponses4Senna(excelfile, cid, maxWeek, datadir):
    sheets = range(1, maxWeek + 1)

    for sheet in sheets:
        week = sheet

        for type in ['q1', 'q2', 'q3', 'q4']:
            student_summaryList = getStudentResponseList(excelfile, cid, week, type)
            if len(student_summaryList) == 0: continue

            filename = datadir + "senna." + str(week) + "." + type + ".input"
            fio.SaveList(student_summaryList, filename)
def getRouge(ref, model):
    #return the Rouge scores given the reference summary and the models

    #write the files
    fio.SaveList(ref, tmpdir + 'ref.txt', '\n')
    fio.SaveList(model, tmpdir + 'model.txt', '\n')

    retcode = subprocess.call(['./get_rouge'], shell=True)
    if retcode != 0:
        print "Failed!"
        exit(-1)
    else:
        print "Passed!"

    row = []
    for scorename in RougeNames:
        filename = tmpdir + "OUT_" + scorename + ".csv"
        if not fio.IsExist(filename):
            print filename, "does not exist"
            row = row + [0, 0, 0]
            continue

        lines = fio.ReadFile(filename)
        try:
            scorevalues = lines[1].split(',')
            row.append(scorevalues[1].strip())
            row.append(scorevalues[2].strip())
            row.append(scorevalues[3].strip())
        except Exception:
            print filename, scorename, lines

    return row
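#A minimal usage sketch for getRouge. The input lists here are made-up demo
#data, and the sketch assumes tmpdir, RougeNames, RougeHeader, and the
#./get_rouge script are configured as above. Both arguments are lists of
#sentence/phrase strings; the return value is a flat list with three values
#per entry in RougeNames, so it lines up with RougeHeader.
def demo_getRouge():
    ref = ['the slope formula', 'clustering of phrases']       #reference summary
    model = ['slope formula', 'phrase clustering']              #system summary
    scores = getRouge(ref, model)
    print zip(RougeHeader, scores)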
def PrepareIE256():
    cid = "IE256"
    maxWeek = 25
    excelfile = "../data/CourseMirror/Reflection.json"
    sennadir = "../../AbstractPhraseSummarization/data/IE256/senna/"
    #fio.NewPath(sennadir)
    #getStudentResponses4Senna(excelfile, cid, maxWeek, sennadir)

    outdirs = [
        #'../../AbstractPhraseSummarization/data/IE256/ILP_Baseline_Sentence/',
        #'../../AbstractPhraseSummarization/data/IE256/MC/',
        #'../../AbstractPhraseSummarization/data/IE256/ILP_Sentence_MC/',
        '../../AbstractPhraseSummarization/data/IE256/ILP_Sentence_Supervised_FeatureWeightingAveragePerceptron/',
    ]

    sheets = range(1, maxWeek + 1)

    for outdir in outdirs:
        for sheet in sheets:
            week = sheet

            for type in ['q1', 'q2', 'q3', 'q4']:
                student_summaryList = getStudentResponseList(excelfile, cid, week, type, True)
                if len(student_summaryList) == 0: continue

                path = os.path.join(outdir, str(week))
                fio.NewPath(path)

                source = {}
                responses = []
                count = defaultdict(int)
                for response, student in student_summaryList:
                    responses.append(response)
                    count[response] += 1

                    if response not in source:
                        source[response] = []
                    source[response].append(student)

                output = os.path.join(path, type + ".sentence.key")
                fio.SaveList(set(responses), output)

                output = os.path.join(path, type + '.sentence.keys.source')
                fio.SaveDict2Json(source, output)

                output = os.path.join(path, type + '.sentence.dict')
                fio.SaveDict(count, output)
def extractPhraseFromCRF(phrasedir, systemdir):
    crf_reader = CRF()
    aligner = AlignPhraseAnnotation()

    lectures = annotation.Lectures

    for i, lec in enumerate(lectures):
        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.' + method + '.key'
            phrases = []

            crf_file = os.path.join(systemdir, 'extraction', 'all_output', 'test_%i_%s.out' % (i, prompt))
            for tokens, tags in crf_reader.read_file_generator(crf_file):
                for phrase in aligner.get_phrase(tokens, tags):
                    phrases.append(phrase.lower())

            fio.SaveList(phrases, filename)
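#For illustration only: aligner.get_phrase above presumably groups tokens by
#their CRF tags. A generic decoder for a B/I/O tagging scheme looks like the
#sketch below; the tag names 'B', 'I', 'O' are assumptions, not necessarily
#the project's actual label set.
def decode_bio(tokens, tags):
    phrases, current = [], []
    for token, tag in zip(tokens, tags):
        if tag == 'B':                  #a new phrase starts here
            if current: phrases.append(' '.join(current))
            current = [token]
        elif tag == 'I' and current:    #continue the open phrase
            current.append(token)
        else:                           #'O' (or a stray 'I') closes any open phrase
            if current: phrases.append(' '.join(current))
            current = []
    if current: phrases.append(' '.join(current))
    return phrases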
def extractPhraseFromAnnotationIntersect(phrasedir, annotators):
    for docs in annotation.generate_all_files_by_annotators(annotation.datadir + 'json/', '.json', anotators=annotators, lectures=annotation.Lectures):
        doc0, lec0, annotator0 = docs[0]
        doc1, lec1, annotator1 = docs[1]

        assert(lec0 == lec1)
        lec = lec0

        #load tasks
        task0 = annotation.Task()
        task0.loadjson(doc0)

        task1 = annotation.Task()
        task1.loadjson(doc1)

        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.' + method + '.key'
            print filename

            extracted_phrases = []
            phrase_annotation0 = task0.get_phrase_annotation(prompt)
            phrase_annotation1 = task1.get_phrase_annotation(prompt)

            aligner = AlignPhraseAnnotation(task0, task1, prompt)
            aligner.align()
            extracted_phrases = aligner.get_intersect()

            fio.SaveList(extracted_phrases, filename)
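#Keeping only phrases both annotators marked trades recall for precision in
#the gold phrase set. The real aligner above matches annotated spans; as a
#rough string-level approximation of the same idea (a hypothetical helper,
#not part of the project):
def phrase_intersect(phrases0, phrases1):
    norm1 = set(p.lower().strip() for p in phrases1)
    return [p for p in phrases0 if p.lower().strip() in norm1]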
def getOracleRouge(oracledir, np, L, metric, outputdir):
    #sheets = range(0,1)
    sheets = range(0, 12)

    body = []

    for i, sheet in enumerate(sheets):
        week = i + 1

        #Add a cache to make it faster
        Cache = {}
        cachefile = oracledir + str(week) + '/' + 'cache.json'
        print cachefile
        if fio.IsExist(cachefile):
            with open(cachefile, 'r') as fin:
                Cache = json.load(fin)

        for type in ['POI', 'MP', 'LP']:
            row = []
            row.append(week)

            #read TA's summary
            reffile = oracledir + str(week) + '/' + type + '.ref.summary'
            lines = fio.ReadFile(reffile)
            ref = [line.strip() for line in lines]

            #find the last greedy round that produced a summary
            Round = 1
            while True:
                sumfile = oracledir + str(week) + '/' + type + '.' + str(np) + '.L' + str(L) + "." + str(metric) + '.R' + str(Round) + '.summary'
                if not fio.IsExist(sumfile): break
                Round = Round + 1
            Round = Round - 1

            sumfile = oracledir + str(week) + '/' + type + '.' + str(np) + '.L' + str(L) + "." + str(metric) + '.R' + str(Round) + '.summary'
            if not fio.IsExist(sumfile):
                lines = []
            else:
                lines = fio.ReadFile(sumfile)
            TmpSum = [line.strip() for line in lines]

            newsumfile = oracledir + str(week) + '/' + type + '.' + str(np) + '.L' + str(L) + '.summary'
            fio.SaveList(TmpSum, newsumfile)

            cacheKey = getKey(ref, TmpSum)
            if cacheKey in Cache:
                scores = Cache[cacheKey]
                print "Hit"
            else:
                print "Miss", cacheKey
                print sumfile
                scores = getRouge(ref, TmpSum)
                Cache[cacheKey] = scores
                #exit()

            row = row + scores
            body.append(row)

    #append the per-column average
    header = ['week'] + RougeHeader
    row = []
    row.append("average")
    for i in range(1, len(header)):
        scores = [float(xx[i]) for xx in body]
        row.append(numpy.mean(scores))
    body.append(row)

    fio.WriteMatrix(outputdir + "rouge." + str(np) + '.L' + str(L) + "." + str(metric) + ".txt", body, header)
def Greedy(oracledir, np, L, metric='R1-F'):
    #sheets = range(0,1)
    sheets = range(0, 12)

    RIndex = RougeHeader.index(metric)
    assert(RIndex != -1)

    for i, sheet in enumerate(sheets):
        week = i + 1

        #Add a cache to make it faster
        Cache = {}
        cachefile = oracledir + str(week) + '/' + 'cache.json'
        if fio.IsExist(cachefile):
            with open(cachefile, 'r') as fin:
                Cache = json.load(fin)

        #for type in ['POI']:
        for type in ['POI', 'MP', 'LP']:
            #read TA's summary
            reffile = oracledir + str(week) + '/' + type + '.ref.summary'
            lines = fio.ReadFile(reffile)
            ref = [line.strip() for line in lines]

            #read phrases
            phrasefile = oracledir + str(week) + '/' + type + '.' + str(np) + '.key'
            lines = fio.ReadFile(phrasefile)
            candidates = [line.strip() for line in lines]

            summary = []
            Length = 0

            maxSum = []
            maxScore = 0
            Round = 1

            Changed = True
            while Changed:
                Changed = False
                for phrase in candidates:
                    WC = len(phrase.split())
                    if Length + WC > L: continue

                    TmpSum = copy.deepcopy(summary)
                    TmpSum.append(phrase)

                    #get the Rouge score
                    cacheKey = getKey(ref, TmpSum)
                    if cacheKey in Cache:
                        scores = Cache[cacheKey]
                        print "Hit"
                    else:
                        scores = getRouge(ref, TmpSum)
                        Cache[cacheKey] = scores

                    s = float(scores[RIndex])
                    if s > maxScore:
                        maxSum = TmpSum
                        maxScore = s  #keep the score, not the whole score list
                        Changed = True

                if Changed:
                    #write the results of this round
                    sumfile = oracledir + str(week) + '/' + type + '.' + str(np) + '.L' + str(L) + "." + str(metric) + '.R' + str(Round) + '.summary'
                    fio.SaveList(maxSum, sumfile, '\r\n')

                    summary = maxSum
                    Length = 0
                    for s in maxSum:
                        Length = Length + len(s.split())

                    Round = Round + 1

                    #remove the selected phrases from the candidate pool
                    newCandidates = []
                    for phrase in candidates:
                        if phrase not in maxSum:
                            newCandidates.append(phrase)
                    candidates = newCandidates

        with open(cachefile, 'w') as outfile:
            json.dump(Cache, outfile, indent=2)
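#Greedy above builds an oracle summary round by round: each round tries every
#remaining candidate phrase, keeps the single addition that most improves the
#chosen ROUGE metric, and stops when no addition helps or the word budget L
#is exhausted. A self-contained toy version of the same selection loop, with
#a caller-supplied scoring function standing in for ROUGE (greedy_select and
#score_fn are hypothetical names, not part of the project):
def greedy_select(candidates, score_fn, L):
    summary = []
    changed = True
    while changed:
        changed = False
        best, best_score = None, score_fn(summary)
        used = sum(len(p.split()) for p in summary)
        for phrase in candidates:
            if used + len(phrase.split()) > L: continue
            s = score_fn(summary + [phrase])
            if s > best_score:
                best, best_score = phrase, s
        if best is not None:
            summary.append(best)
            candidates = [p for p in candidates if p != best]
            changed = True
    return summary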
def extractPhraseFromAnnotation(phrasedir, annotator, summarydir=None):
    for doc, lec, annotator in annotation.generate_all_files(annotation.datadir + 'json/', '.json', anotators=annotator, lectures=annotation.Lectures):
        print doc

        #load task
        task = annotation.Task()
        task.loadjson(doc)

        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        #Add a cache to make it faster
        Cache = {}
        cachefile = phrasedir + str(lec) + '/' + 'cache.json'
        if fio.IsExist(cachefile):
            with open(cachefile, 'r') as fin:
                Cache = json.load(fin)

        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.' + method + '.key'
            cluster_output = path + prompt + '.cluster.kmedoids.sqrt.oracle.%s' % method

            if summarydir:
                fio.NewPath(os.path.join(summarydir, str(lec)))
                summary_file = os.path.join(summarydir, str(lec), '%s.summary' % prompt)

            body = []

            if summarydir:
                summaries = []

            phrase_summary_dict = task.get_phrase_summary_textdict(prompt)

            extracted_phrases = []
            phrase_annotation = task.get_phrase_annotation(prompt)
            for rank in sorted(phrase_annotation):
                rank_phrases = []
                phrases = phrase_annotation[rank]
                for phrasedict in phrases:
                    phrase = phrasedict['phrase'].lower()
                    extracted_phrases.append(phrase)
                    rank_phrases.append(phrase)
                    row = [phrase, rank]
                    body.append(row)

                if summarydir:
                    rank_summary = phrase_summary_dict[rank]
                    max_summary = get_max_phrase_by_ROUGE(rank_summary, rank_phrases, Cache)
                    print max_summary
                    summaries.append(max_summary)

            fio.SaveList(extracted_phrases, filename)
            fio.WriteMatrix(cluster_output, body, header=None)

            if summarydir:
                fio.SaveList(summaries, summary_file)

        with open(cachefile, 'w') as outfile:
            json.dump(Cache, outfile, indent=2)
def getShallowSummary(excelfile, folder, clusterdir, K=30, method=None, similarity=None, ratio=None, lex='lexrank'):
    #K caps the number of phrases per summary (a word-count cap is left commented out below)
    sheets = range(0, maxWeek)

    for i, sheet in enumerate(sheets):
        week = i + 1

        for type in ['q1', 'q2', 'q3', 'q4']:
            path = folder + str(week) + '/'
            fio.NewPath(path)
            filename = path + type + '.%d.summary' % ratio

            #produce the cluster file on the fly
            phrasefile = os.path.join(clusterdir, str(week), type + '.' + method + '.key')
            if not fio.IsExist(phrasefile): continue

            print excelfile, sheet, type

            cluster_output = clusterdir + str(week) + '/' + type + ".cluster.kmedoids." + str(ratio) + "." + similarity + '.' + method
            print cluster_output

            weightfile = clusterdir + str(week) + '/' + type + '.' + method + '.' + similarity
            print weightfile

            if not fio.IsExist(cluster_output):
            #if True:
                print "clustering"
                phraseClusteringKmedoid.getPhraseClusterPhrase(phrasefile, weightfile, cluster_output, ratio, method=method)
            if not fio.IsExist(cluster_output): continue

            body = fio.ReadMatrix(cluster_output, False)

            NPCandidates = fio.ReadFile(phrasefile)

            lexfile = clusterdir + str(week) + '/' + str(type) + "." + method + "." + lex + ".dict"
            lexdict = fio.LoadDict(lexfile, 'float')

            NPs = [row[0] for row in body]
            clusterids = [row[1] for row in body]

            #assert(NPCandidates == NPs)
            if NPCandidates != NPs:
                print NPCandidates
                print NPs

            cluster = {}
            for row in body:
                cluster[row[0]] = int(row[1])

            Summary = []

            #sort the clusters according to the number of responses
            keys = postProcess.RankClusterNoSource(NPs, lexdict, clusterids)

            total_word = 0
            word_count = 0
            for key in keys:
                #phrase = NPs[key]
                phrase = postProcess.getTopRankPhraseNoSource(NPs, clusterids, int(key), lexdict)
                if phrase in Summary: continue

                word_count = len(phrase.split())
                total_word = total_word + word_count
                #if total_word <= K:
                if len(Summary) + 1 <= K:
                    Summary.append(phrase)

            fio.SaveList(Summary, filename)
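#Hypothetical invocation of getShallowSummary; every argument value below is
#a placeholder for illustration, not a setting taken from the project:
#getShallowSummary(excelfile, '../data/shallow/', '../data/cluster/',
#                  K=30, method='crf', similarity='lexrank', ratio='sqrt')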