def PrintClusterRankSummary(datadir):
    """Write a per-week summary matrix to <datadir>/summary.txt, combining
    the q1-q4 cluster summaries found under <datadir>/<week>/."""
    # `maxWeek`, `course`, `getDate` and `fio` are module-level names defined
    # elsewhere in this file.
    sheets = range(0,maxWeek)
    lectures = fio.LoadDictJson('../data/CourseMIRROR/lectures.json')
    # NOTE(review): the header has 4 columns but a row can hold up to 6 values
    # (week, date, plus one cell per existing q1-q4 summary) — confirm the
    # intended column set; 'data' is presumably a typo for 'date'.
    head = ['week', 'data', 'Point of Interest', "Muddiest Point"]
    body = []
    for i, sheet in enumerate(sheets):
        row = []
        week = i + 1
        row.append(week)
        row.append(getDate(lectures, course, week))
        for type in ['q1', 'q2', 'q3', 'q4']:
            path = datadir + str(i+1)+ '/'
            summaryfile = path + type + '.summary'
            if not fio.IsExist(summaryfile):
                continue  # no summary for this question this week
            summaries = [line.strip() for line in fio.ReadFile(summaryfile)]
            sourcefile = path + type + '.summary.source'
            # each source line lists the responses backing one summary sentence
            sources = [line.split(',') for line in fio.ReadFile(sourcefile)]
            combinedSummary = []
            for j, (summary, source) in enumerate(zip(summaries, sources)):
                # escape double quotes so the cell can be wrapped in "..."
                summary = summary.replace('"', '\'')
                combinedSummary.append(str(j+1) + ") " + summary + " [" + str(len(source)) + "]")
            # chr(10) is '\n': one quoted multi-line cell per question
            row.append('"' + chr(10).join(combinedSummary)+ '"')
        body.append(row)
    fio.WriteMatrix(datadir + "summary.txt", body, head)
def load(self, filename):
    """Read *filename* and populate this object: remember the path, load the
    raw lines, then run the annotation-extraction and info-combination steps.
    """
    self.filename = filename
    self.lines = fio.ReadFile(filename)
    # (sic) the method name is spelled 'anntation' elsewhere in this class
    self.extract_task_anntation()
    self.combine_info()
def getRouge_Tac(refs, model): #return the Rouge scores given the reference summary and the models #write the files fio.SaveList(model, tmpdir+'model.txt', '\n') for i, ref in enumerate(refs): fio.SaveList(ref, tmpdir+'ref%d.txt'%(i+1), '\n') retcode = subprocess.call(['./get_rouge_tac'], shell=True) if retcode != 0: print("Failed!") exit(-1) else: print "Passed!" row = [] for scorename in RougeNames: filename = tmpdir + "OUT_"+scorename+".csv" lines = fio.ReadFile(filename) try: scorevalues = lines[1].split(',') score = scorevalues[1].strip() row.append(score) score = scorevalues[2].strip() row.append(score) score = scorevalues[3].strip() row.append(score) except Exception: print filename, scorename, lines return row
def SennaParseWithCountDict(filename): """ @function: Parse the file and return a list of sentence with index. @param filename: string, the filename of the sennafile, the sennafile is an output file given by SENNA @return: <list, dict>, the dict stores for the start line for each sentence """ lines = fio.ReadFile(filename) print "nLine=", len(lines) sys.stdout.flush() CountDict = {} nCount = 0 nLast = -1 for i in range(len(lines)): line = lines[i] row = [] line = line.strip() if len(line) == 0: #the last sentence is finished CountDict[nCount] = nLast + 1 nLast = i nCount = nCount + 1 print "nCount=", nCount sys.stdout.flush() #for s in sentences: # print s return lines, CountDict
def getPhraseClusterPhrase(phrasefile, weightfile, output, ratio=None, method=None): NPCandidates = fio.ReadFile(phrasefile) if len(NPCandidates) == 0: return NPs, matrix = fio.ReadMatrix(weightfile, hasHead=True) #change the similarity to distance matrix = Similarity2Distance(matrix) index = {} for i, NP in enumerate(NPs): index[NP] = i newMatrix = [] for NP1 in NPCandidates: if NP1 not in index: continue i = index[NP1] row = [] for NP2 in NPCandidates: if NP2 not in index: print NP2, weightfile, method continue j = index[NP2] row.append(matrix[i][j]) newMatrix.append(row) V = len(NPCandidates) if ratio == "sqrt": K = int(math.sqrt(V)) elif float(ratio) >= 1: K = int(ratio) else: K = int(ratio * V) if K < 1: K = 1 K = min(K, V) clusterid = ClusterWrapper.KMedoidCluster(newMatrix, K) body = [] for NP, id in zip(NPCandidates, clusterid): row = [] row.append(NP) row.append(id) body.append(row) fio.WriteMatrix(output, body, header=None)
def readgraph_partitions(self, input):
    """Read a partition file: one community per line, given as
    whitespace-separated integer node ids; lines starting with '#' are
    comments.  Returns a list of integer lists."""
    return [[int(tok) for tok in row.strip().split()]
            for row in fio.ReadFile(input)
            if not row.startswith('#')]
def getRougeTmp(ref, model): #return the Rouge scores given the reference summary and the models #create a temp file temp_path = mkdtemp() print(temp_path) #write the files fio.SaveList(ref, os.path.join(temp_path, 'ref.txt'), '\n') fio.SaveList(model, os.path.join(temp_path, 'model.txt'), '\n') retcode = subprocess.call(['./get_rouge_tmp %s'%temp_path], shell=True) if retcode != 0: print("Failed!") exit(-1) else: print "Passed!" row = [] for scorename in RougeNames: filename = os.path.join(temp_path, "OUT_"+scorename+".csv") if not fio.IsExist(filename): print filename, " not exist" row = row + [0, 0, 0] continue lines = fio.ReadFile(filename) try: scorevalues = lines[1].split(',') score = scorevalues[1].strip() row.append(score) score = scorevalues[2].strip() row.append(score) score = scorevalues[3].strip() row.append(score) fio.DeleteFolder(temp_path) except Exception: print filename, scorename, lines return row
def SennaParse(filename):
    """
    @function: Parse the file and return a list of sentence. Each sentence is a SennaSentence
    @param filename: string, the filename of the sennafile, the sennafile is an output file given by SENNA
    @return: list, Each item is a SennaSentence
    """
    lines = fio.ReadFile(filename)
    sys.stdout.flush()

    # SENNA separates sentences with blank lines: count the separators so
    # the result list can be pre-allocated.
    nCount = sum(1 for line in lines if len(line.strip()) == 0)
    sys.stdout.flush()

    sentences = [None] * nCount
    nCount = 0
    tm = []  # token rows accumulated for the current sentence
    for line in lines:
        line = line.strip()
        if len(line) == 0:  # the current sentence is finished
            sentences[nCount] = SennaSentence(tm)
            nCount = nCount + 1
            tm = []
            continue
        # one token per line, tab-separated SENNA columns
        tm.append([num.strip() for num in line.split("\t")])

    return sentences
def getRouge(ref, model): #return the Rouge scores given the reference summary and the models #write the files fio.SaveList(ref, tmpdir+'ref.txt', '\n') fio.SaveList(model, tmpdir+'model.txt', '\n') retcode = subprocess.call(['./get_rouge'], shell=True) if retcode != 0: print("Failed!") exit(-1) else: print "Passed!" row = [] for scorename in RougeNames: filename = tmpdir + "OUT_"+scorename+".csv" if not fio.IsExist(filename): print filename, " not exist" row = row + [0, 0, 0] continue lines = fio.ReadFile(filename) try: scorevalues = lines[1].split(',') score = scorevalues[1].strip() row.append(score) score = scorevalues[2].strip() row.append(score) score = scorevalues[3].strip() row.append(score) except Exception: print filename, scorename, lines return row
def getOracleRougeSplit(oracledir, np, L, metric, outputdir):
    """For every week and prompt type, locate the last (highest-numbered)
    round of the greedy oracle summary written by Greedy(), score it against
    the TA reference with ROUGE, and write the per-week score matrix plus an
    'average' row to <outputdir>/rouge.<np>.L<L>.<metric>.txt."""
    #sheets = range(0,1)
    sheets = range(0,12)  # NOTE(review): hard-coded 12 weeks; other functions use maxWeek

    body = []
    for i, sheet in enumerate(sheets):
        week = i + 1

        #Add a cache to make it faster
        Cache = {}
        cachefile = oracledir + str(week) + '/' + 'cache.json'
        print cachefile
        if fio.IsExist(cachefile):
            with open(cachefile, 'r') as fin:
                Cache = json.load(fin)
        # NOTE(review): the cache is read and updated in memory but never
        # written back here (Greedy() does persist it) — confirm intent.

        row = []
        for type in ['POI', 'MP', 'LP']:
            row.append(week)

            #read TA's summmary
            reffile = oracledir + str(week) + '/' + type + '.ref.summary'
            lines = fio.ReadFile(reffile)
            ref = [line.strip() for line in lines]

            # Rounds are numbered from 1: walk forward until the first
            # missing file, then step back to the last round that exists.
            Round = 1
            while True:
                sumfile = oracledir + str(week) + '/' + type + '.' + str(np) + '.L' + str(L) + "." + str(metric) + '.R' + str(Round) +'.summary'
                if not fio.IsExist(sumfile):
                    break
                Round = Round + 1

            Round = Round - 1
            sumfile = oracledir + str(week) + '/' + type + '.' + str(np) + '.L' + str(L) + "." + str(metric) + '.R' + str(Round) +'.summary'

            if fio.IsExist(sumfile):
                import os

                # copy the final round to a stable name without the round suffix
                ssfile = oracledir + str(week) + '/' + type + '.' + str(np) + '.L' + str(L) + ".summary"
                cmd = 'cp ' + sumfile + ' ' + ssfile
                print cmd
                os.system(cmd)

                lines = fio.ReadFile(sumfile)
                TmpSum = [line.strip() for line in lines]

                cacheKey = getKey(ref, TmpSum)
                if cacheKey in Cache:
                    scores = Cache[cacheKey]
                    print "Hit"
                else:
                    print "Miss", cacheKey
                    print sumfile
                    scores = getRouge(ref, TmpSum)
                    Cache[cacheKey] = scores
                    #exit()

                row = row + scores
            else:
                # no oracle summary produced for this week/type: pad with zeros
                row = row + [0]*len(RougeHeader)
        body.append(row)

    print body
    print "RougeHeader", len(RougeHeader)
    # NOTE(review): each row holds (week, scores) three times, so row length
    # is 3*(1+len(RougeHeader)) while this header has 1+3*len(RougeHeader)
    # columns — the 'average' loop below indexes by header length; confirm.
    header = ['week'] + RougeHeader*3

    row = []
    row.append("average")
    print len(header)
    for i in range(1, len(header)):
        scores = [float(xx[i]) for xx in body]
        row.append(numpy.mean(scores))
    body.append(row)

    fio.WriteMatrix(outputdir + "rouge." + str(np) + '.L' + str(L) + "." + str(metric) + ".txt", body, header)
def Greedy(oracledir, np, L, metric='R1-F'): #sheets = range(0,1) sheets = range(0,12) RIndex = RougeHeader.index(metric) assert(RIndex != -1) for i, sheet in enumerate(sheets): week = i + 1 #Add a cache to make it faster Cache = {} cachefile = oracledir + str(week) + '/' + 'cache.json' if fio.IsExist(cachefile): with open(cachefile, 'r') as fin: Cache = json.load(fin) #for type in ['POI']: for type in ['POI', 'MP', 'LP']: #read TA's summmary reffile = oracledir + str(week) + '/' + type + '.ref.summary' lines = fio.ReadFile(reffile) ref = [line.strip() for line in lines] #read Phrases phrasefile = oracledir + str(week) + '/' + type + '.' + str(np) + '.key' lines = fio.ReadFile(phrasefile) candidates = [line.strip() for line in lines] summary = [] Length = 0 maxSum = [] maxScore = 0 Round = 1 Changed = True while Changed: Changed = False for phrase in candidates: WC = len(phrase.split()) if Length + WC > L: continue TmpSum = copy.deepcopy(summary) TmpSum.append(phrase) #get Rouge Score cacheKey = getKey(ref, TmpSum) if cacheKey in Cache: scores = Cache[cacheKey] print "Hit" else: scores = getRouge(ref, TmpSum) Cache[cacheKey] = scores s = float(scores[RIndex]) #s = scores[RIndex] if s > maxScore: maxSum = TmpSum maxScore = scores Changed = True if Changed: #write the results sumfile = oracledir + str(week) + '/' + type + '.' + str(np) + '.L' + str(L) + "." + str(metric) + '.R' + str(Round) +'.summary' fio.SaveList(maxSum, sumfile, '\r\n') summary = maxSum Length = 0 for s in maxSum: Length = Length + len(s.split()) Round = Round + 1 newCandidates = [] #remove the candidate from the existing summary for phrase in candidates: if phrase not in maxSum: newCandidates.append(phrase) candidates = newCandidates with open(cachefile, 'w') as outfile: json.dump(Cache, outfile, indent=2)
def getShallowSummary(excelfile, folder, clusterdir, K=30, method=None, similarity=None, ratio=None, lex='lexrank'):
    """For every week and question type: cluster the candidate phrases
    (building the cluster file on the fly if missing), rank the clusters,
    and save the top-ranked phrase of each cluster as the summary.

    NOTE(review): K is described below as a word budget, but the selection
    condition actually caps the number of phrases (len(Summary) + 1 <= K);
    the word-count test is commented out — confirm which limit is intended.
    """
    #K is the number of words per points
    sheets = range(0,maxWeek)

    for i, sheet in enumerate(sheets):
        week = i + 1

        for type in ['q1', 'q2', 'q3', 'q4']:
            path = folder + str(week)+ '/'
            fio.NewPath(path)
            filename = path + type + '.%d.summary'%ratio

            #produce the cluster file on the fly
            phrasefile = os.path.join(clusterdir, str(week), type + '.' + method + '.key')
            if not fio.IsExist(phrasefile):
                continue

            print excelfile, sheet, type

            cluster_output = clusterdir + str(week) +'/' + type + ".cluster.kmedoids." + str(ratio) + "." +similarity + '.' + method
            print cluster_output
            weightfile = clusterdir + str(week)+ '/' + type + '.' + method + '.' + similarity
            print weightfile

            # cluster only if the result is not already on disk
            if not fio.IsExist(cluster_output):
            #if True:
                print "clustering"
                phraseClusteringKmedoid.getPhraseClusterPhrase(phrasefile, weightfile, cluster_output, ratio, method=method)
            if not fio.IsExist(cluster_output):
                continue  # clustering failed to produce output

            body = fio.ReadMatrix(cluster_output, False)

            NPCandidates = fio.ReadFile(phrasefile)

            # per-phrase importance scores (e.g. LexRank)
            lexfile = clusterdir + str(week)+ '/' + str(type) + "." + method + "."+lex+".dict"
            lexdict = fio.LoadDict(lexfile, 'float')

            NPs = [row[0] for row in body]
            clusterids = [row[1] for row in body]

            #assert(NPCandidates == NPs)
            if NPCandidates != NPs:
                # diagnostic only: clustering dropped/reordered phrases
                print NPCandidates
                print NPs

            cluster = {}
            for row in body:
                cluster[row[0]] = int(row[1])

            Summary = []

            #sort the clusters according to the number of response
            keys = postProcess.RankClusterNoSource(NPs, lexdict, clusterids)

            total_word = 0
            word_count = 0
            for key in keys:
                #phrase = NPs[key]
                phrase = postProcess.getTopRankPhraseNoSource(NPs, clusterids, int(key), lexdict)
                if phrase in Summary:
                    continue  # skip duplicates

                word_count = len(phrase.split())
                total_word = total_word + word_count

                #if total_word <= K:
                if len(Summary) + 1 <= K:
                    Summary.append(phrase)

            fio.SaveList(Summary, filename)
import sys
import re
import fio
import xml.etree.ElementTree as ET
from collections import defaultdict
import postProcess
import random
import CourseMirror_Survey
import phraseClusteringKmedoid
import os

# Module-level stopword list: SMART common words, lowercased, minus a few
# words ('nothing', 'none') that are meaningful survey answers here, plus
# punctuation tokens.
stopwords = [line.lower().strip() for line in fio.ReadFile("../data/smart_common_words.txt")]
#print "stopwords:", len(stopwords)

noremove = ['nothing', 'none']
for w in noremove:
    if w in stopwords:
        index = stopwords.index(w)
        stopwords.pop(index)

stopwords = stopwords + ['.', '?', '-', ',', '[', ']', '-', ';', '\'', '"', '+', '&', '!', '/', '>', '<', ')', '(', '#', '=']

def getTopRankPhrase(NPs, clusterids, cid, lexdict, sources):
    # NOTE(review): this definition continues beyond the visible chunk;
    # only its opening is shown here.
    #get cluster NP, and scores
    dict = {}
    s = []
    for NP, id, source in zip(NPs, clusterids, sources):
        if int(id) == cid:
# (continuation of a list literal opened before this chunk)
] #'none', "no", "nothing"

import datetime

# Map the textual completeness ratings used in the survey to integers 1-5.
RatingKey = {
    "slightly": 1,
    "somewhat": 2,
    "moderately": 3,
    "mostly": 4,
    "completely": 5
}

# Separator between a response and its appended rating text.
RateSplitTag = "||Rating: "

# Stopword list: SMART common words, lowercased, plus punctuation tokens.
stopwordfilename = "../data/smart_common_words.txt"
stopwords = [line.lower().strip() for line in fio.ReadFile(stopwordfilename)]

punctuations = [
    '.', '?', '-', ',', '[', ']', '-', ';', '\'', '"', '+', '&', '!', '/',
    '>', '<', ')', '(', '#', '='
]

stopwords = stopwords + punctuations

def getRatingkey(rate):
    """Return the 1-5 integer for a textual rating, or -1 if unrecognized."""
    key = rate.strip().lower()
    if key in RatingKey:
        return RatingKey[key]
    return -1
def getRouge(datadir, maxWeek, output): print datadir sheets = range(0, maxWeek) body = [] for sheet in sheets: week = sheet + 1 dir = datadir + str(week) + '/' for type in ['q1', 'q2']: summary_file = dir + type + "." + 'summary' print summary_file if not fio.IsExist(summary_file): print summary_file continue Cache = {} cachefile = os.path.join(datadir, str(week), 'cache.json') print cachefile if fio.IsExist(cachefile): with open(cachefile, 'r') as fin: Cache = json.load(fin) #read TA's summmary refs = [] for i in range(2): reffile = os.path.join(datadir, str(week), type + '.ref.%d' % i) if not fio.IsExist(reffile): print reffile continue lines = fio.ReadFile(reffile) ref = [line.strip() for line in lines] refs.append(ref) if len(refs) == 0: continue lstref = refs[0] + refs[1] lines = fio.ReadFile(summary_file) TmpSum = [line.strip() for line in lines] cacheKey = OracleExperiment.getKey(lstref, TmpSum) if cacheKey in Cache: scores = Cache[cacheKey] print "Hit" else: print "Miss" print summary_file scores = OracleExperiment.getRouge_IE256(refs, TmpSum) Cache[cacheKey] = scores row = [week] row = row + scores body.append(row) try: fio.SaveDict2Json(Cache, cachefile) except Exception as e: #fio.SaveDict(Cache, cachefile + '.dict') print e header = ['id'] + RougeHeader row = ['ave'] for i in range(1, len(header)): scores = [float(xx[i]) for xx in body] row.append(numpy.mean(scores)) body.append(row) fio.WriteMatrix(output, body, header)
def getRouge(datadir, maxWeek, output): sheets = range(0, maxWeek) body = [] allbody = [] #Krange = range(1, 25) #Krange = range(1, 25) Krange = [gK] for sheet in sheets: week = sheet + 1 dir = datadir + str(week) + '/' for type in ['q1', 'q2']: maxS = 0 maxK = -1 maxScore = [] Cache = {} cachefile = os.path.join(datadir, str(week), 'cache.json') print cachefile if fio.IsExist(cachefile): with open(cachefile, 'r') as fin: Cache = json.load(fin) allrow = [week] #Krange = [np.random.randint(1, 25)] for K in Krange: summary_file = dir + type + '.%d.summary' % K print summary_file if not fio.IsExist(summary_file): print summary_file continue #read TA's summmary refs = [] for i in range(2): reffile = os.path.join(datadir, str(week), type + '.ref.%d' % i) if not fio.IsExist(reffile): print reffile continue lines = fio.ReadFile(reffile) ref = [line.strip() for line in lines] refs.append(ref) if len(refs) == 0: continue lstref = refs[0] + refs[1] lines = fio.ReadFile(summary_file) TmpSum = [line.strip() for line in lines] cacheKey = OracleExperiment.getKey(lstref, TmpSum) if cacheKey in Cache: scores = Cache[cacheKey] print "Hit" else: print "Miss" print summary_file scores = OracleExperiment.getRouge_IE256(refs, TmpSum) Cache[cacheKey] = scores s = float(scores[RIndex]) allrow.append(s) if s >= maxS: maxS = s maxScore = scores maxK = K if maxK == -1: continue row = [week] row = row + maxScore + [maxK] body.append(row) allrow.append(maxK) allbody.append(allrow) try: fio.SaveDict2Json(Cache, cachefile) except: #fio.SaveDict(Cache, cachefile + '.dict') pass header = ['id'] + RougeHeader row = ['ave'] for i in range(1, len(header)): scores = [float(xx[i]) for xx in body] row.append(numpy.mean(scores)) body.append(row) fio.WriteMatrix(output, body, header) fio.WriteMatrix(output + '.all', allbody, ['week'] + Krange)