def plot_reference_summary_no_distribution():
    import collections

    courses = ['Engineer', 'IE256', 'IE256_2016', 'CS0445']

    # count supporters per reference summary for each course
    C = {}
    M = 0
    for cid in courses:
        support_file = '../data/%s_supporters.txt' % cid
        supports = fio.LoadDictJson(support_file)
        M = max(M, max(supports))
        C[cid] = collections.Counter(supports)

    # drop the zero-supporter bucket
    for cid in courses:
        del C[cid][0]

    # A[cid][i] is the cumulative fraction of summaries with support <= i
    A = {}
    for i in range(1, M + 1):
        for cid in courses:
            if cid not in A:
                A[cid] = collections.defaultdict(float)
            r = C[cid][i] * 1.0 / sum(C[cid].values()) if i in C[cid] else 0
            A[cid][i] += r + A[cid][i - 1]

    # print one tab-separated row per support level, one column per course
    for i in range(1, M + 1):
        for cid in courses:
            print A[cid][i], '\t',
        print
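# A minimal, self-contained sketch of the cumulative-distribution recurrence
# used above (A[i] = P(support == i) + A[i-1]) on made-up toy counts, so the
# printed rows are easy to verify by hand. The numbers are illustrative only.
def _example_cumulative_distribution():
    import collections
    supports = [1, 1, 2, 3, 3, 3]      # hypothetical supporter counts
    C = collections.Counter(supports)  # {1: 2, 2: 1, 3: 3}
    total = sum(C.values())
    A = collections.defaultdict(float)
    for i in range(1, max(supports) + 1):
        r = C[i] * 1.0 / total if i in C else 0
        A[i] = r + A[i - 1]
    print A  # A[3] should reach 1.0 once every support level is covered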
def combine_files(lectures, features=None, prompts=['q1', 'q2']):
    phrasedir1 = '../data/%s/oracle_annotator_1/phrase/' % course
    phrasedir2 = '../data/%s/oracle_annotator_2/phrase/' % course

    X = []
    Y = []

    if features is None:
        sim_extractor = Similarity()
        features = sorted(sim_extractor.features.keys())

    for i, lec in enumerate(lectures):
        for q in prompts:
            for phrasedir in [phrasedir1, phrasedir2]:
                path = phrasedir + str(lec) + '/'
                filename = os.path.join(path, q + sim_exe)
                data = fio.LoadDictJson(filename)

                for fdict, score, _ in data:
                    row = []
                    for name in features:
                        x = fdict[name]
                        if str(x) == 'nan':  # replace NaN feature values with 0
                            x = 0.0
                        row.append(x)
                    X.append(row)
                    Y.append(score)

    return X, Y
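# Hedged usage sketch for combine_files: pool both annotators' phrase pairs
# into one training set and fit a single regressor on the similarity
# features. The SVR choice is an assumption for illustration; the rest of
# this module trains and pickles one model per lecture instead.
def _example_train_similarity_regressor():
    from sklearn.svm import SVR
    X, Y = combine_files(annotation.Lectures)
    clf = SVR(kernel='linear')
    clf.fit(X, Y)
    return clf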
def PrintClusterRankSummary(datadir):
    sheets = range(0, maxWeek)
    lectures = fio.LoadDictJson('../data/CourseMIRROR/lectures.json')

    head = ['week', 'date', 'Point of Interest', 'Muddiest Point']
    body = []

    for i, sheet in enumerate(sheets):
        row = []
        week = i + 1

        row.append(week)
        row.append(getDate(lectures, course, week))

        for qtype in ['q1', 'q2', 'q3', 'q4']:
            path = datadir + str(i + 1) + '/'
            summaryfile = path + qtype + '.summary'
            if not fio.IsExist(summaryfile):
                continue

            summaries = [line.strip() for line in fio.ReadFile(summaryfile)]

            # each source line lists the phrases supporting one summary sentence
            sourcefile = path + qtype + '.summary.source'
            sources = [line.split(',') for line in fio.ReadFile(sourcefile)]

            combinedSummary = []
            for j, (summary, source) in enumerate(zip(summaries, sources)):
                summary = summary.replace('"', '\'')
                combinedSummary.append(str(j + 1) + ") " + summary + " [" + str(len(source)) + "]")

            row.append('"' + chr(10).join(combinedSummary) + '"')  # chr(10) == '\n'

        body.append(row)

    fio.WriteMatrix(datadir + "summary.txt", body, head)
def correlation_analysis(course):
    phrasedir1 = '../data/%s/oracle_annotator_1/phrase/' % course
    phrasedir2 = '../data/%s/oracle_annotator_2/phrase/' % course

    outdir = '../data/%s/simlearning/' % course
    fio.NewPath(outdir)

    sim_extractor = Similarity()

    features = sorted(sim_extractor.features.keys())
    head = features + ['score', 'predict']
    body = []

    lectures = annotation.Lectures
    name = '_'.join(features)

    for i, lec in enumerate(lectures):
        # load the similarity model trained for this lecture
        model_file = os.path.join(model_dir, '%d_%s.model' % (lec, name))
        with open(model_file, 'rb') as handle:
            clf = pickle.load(handle)

        for q in ['q1', 'q2']:
            outfile = os.path.join(outdir, str(lec), '%s%s' % (q, sim_exe))

            for phrasedir in [phrasedir1, phrasedir2]:
                path = phrasedir + str(lec) + '/'
                filename = os.path.join(path, q + sim_exe)
                data = fio.LoadDictJson(filename)

                for fdict, score, _ in data:
                    row = []
                    for fname in features:
                        x = fdict[fname]
                        if str(x) == 'nan':  # replace NaN feature values with 0
                            x = 0.0
                        row.append(x)

                    predict_score = clf.predict([row])

                    row.append(score)
                    row.append(predict_score[0])
                    body.append(row)

    out_correlation = os.path.join(outdir, 'data.txt')
    print out_correlation
    fio.WriteMatrix(out_correlation, body, head)
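# For reference, a minimal sketch of how the per-lecture models loaded above
# would have been pickled by the (separate) training step; the exact training
# code lives elsewhere, so treat this as an assumption about its output:
#
#   model_file = os.path.join(model_dir, '%d_%s.model' % (lec, name))
#   with open(model_file, 'wb') as handle:
#       pickle.dump(clf, handle, protocol=pickle.HIGHEST_PROTOCOL)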
def check_stopword():
    from CourseMirror_Survey import stopwords

    vocab = fio.LoadDictJson(global_params.vocab)
    for word, count in vocab.items():
        if count < 5:
            continue  # skip rare words

        if word in stopwords:
            print word, '\t', count
def correlation_analysis_noduplicate():
    phrasedir1 = '../data/%s/oracle_annotator_1/phrase/' % course
    phrasedir2 = '../data/%s/oracle_annotator_2/phrase/' % course

    outdir = '../data/%s/simlearning/' % course
    fio.NewPath(outdir)

    sim_extractor = Similarity()

    features = sorted(sim_extractor.features.keys())
    head = features + ['score']
    body = []

    lectures = annotation.Lectures

    for i, lec in enumerate(lectures):
        for q in ['q1', 'q2']:
            outfile = os.path.join(outdir, str(lec), '%s%s' % (q, sim_exe))

            for phrasedir in [phrasedir1, phrasedir2]:
                path = phrasedir + str(lec) + '/'
                filename = os.path.join(path, q + sim_exe)
                data = fio.LoadDictJson(filename)

                for fdict, score, pd in data:
                    if pd['p1'] == pd['p2']:  # skip identical phrase pairs
                        print pd['p1']
                        continue

                    row = []
                    for name in features:
                        x = fdict[name]
                        if str(x) == 'nan':  # replace NaN feature values with 0
                            x = 0.0
                        row.append(x)
                    row.append(score)
                    body.append(row)

    out_correlation = os.path.join(outdir, 'data.txt')
    fio.WriteMatrix(out_correlation, body, head)
def __init__(self, key_prefix, sum_prefix, N):
    '''
    N is the number of annotators
    '''
    self.key_prefix = key_prefix
    self.sum_prefix = sum_prefix
    self.N = N

    # load phrase color map
    phrasefile = key_prefix + phrase_exe
    phrases = fio.LoadList(phrasefile)

    colorfile = key_prefix + color_exe
    color_map = fio.LoadDictJson(colorfile)

    phrase_color_map = self.combine_phrase_color(phrases, color_map)

    # get phrase summary color map
    sumfile = sum_prefix + sum_exe
    summaries = fio.LoadList(sumfile)
    self.summary_color = self.get_summary_color(summaries, phrase_color_map)

    # get summary count
    sumcountfile = sum_prefix + sum_count_exe
    self.summary_no = [int(x) for x in fio.LoadList(sumcountfile)]

    assert(len(self.summary_color) == len(self.summary_no))

    # load human summary color map, one color->count dict per annotator
    self.ref_color = []
    for i in range(N):
        d = {}
        ref_sumcolor_file = '%s%s.%d.color' % (sum_prefix, ref_exe, i)
        ref_sumno_file = '%s%s.%d.no' % (sum_prefix, ref_exe, i)

        for color, no in zip(fio.LoadList(ref_sumcolor_file), fio.LoadList(ref_sumno_file)):
            d[int(color)] = int(no)

        self.ref_color.append(d)
def __init__(self, prefix=""): self.features = { 'optimumComparerLSATasa': self.LSA, 'LexicalOverlap': self.LexicalOverlap, 'optimumComparerWNLin': self.LIN, 'BLEU': self.BLEU, 'ROUGE': self.ROUGE, 'Cosine': self.Cosine, 'WordEmbedding': self.WordEmbedding, #'WMD': self.WMD, } self.prefix = prefix self.Cache = {} self.cachefile = os.path.join(prefix + 'cache.json') print self.cachefile if fio.IsExist(self.cachefile): with open(self.cachefile, 'r') as fin: self.Cache = json.load(fin) if self.prefix != '': self.matrixdict = {} for sim in [ 'optimumComparerLSATasa', 'LexicalOverlap', 'optimumComparerWNLin', 'BLEU' ]: self.matrixdict[sim] = {} filename = self.prefix + sim phrases, matrix = fio.ReadMatrix(filename, hasHead=True) index = {} for i, p in enumerate(phrases): index[p] = i self.matrixdict[sim]['index'] = index self.matrixdict[sim]['matrix'] = matrix self.word2vec = fio.LoadDictJson(global_params.word2vec_model)
def loadmodel(modelbin, vocabjson, output):
    # restrict the binary word2vec model to the task vocabulary
    # and cache the surviving vectors as JSON
    vocab = fio.LoadDictJson(vocabjson)
    word_vecs = load_bin_vec(modelbin, vocab)
    fio.SaveDict2Json(word_vecs, output)
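# Hedged usage sketch for loadmodel: shrink a binary word2vec model (e.g. the
# GoogleNews vectors) to the task vocabulary and cache it as JSON for
# fio.LoadDictJson. The .bin path is a placeholder, not a file this repo
# guarantees to ship:
#
#   loadmodel('../data/GoogleNews-vectors-negative300.bin',
#             global_params.vocab, global_params.word2vec_model)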
def run(self, cid, summarylastlecture=False):
    max_lecture = 30
    # max_lecture = self.get_max_lecture_num(cid)
    # print "max_lecture", max_lecture

    # get reflections (the fetch is disabled; reload the cached JSON instead)
    # reflections = self.get_reflections(cid)
    jsonfile = '../data/CourseMIRROR/reflections.json'
    # with open(jsonfile, 'w') as outfile:
    #     json.dump(reflections, outfile, encoding='utf-8', indent=2)
    reflections = fio.LoadDictJson(jsonfile)

    # get lectures
    lectures = self.get_lectures(cid)
    jsonfile = '../data/CourseMIRROR/lectures.json'
    with open(jsonfile, 'w') as outfile:
        json.dump(lectures, outfile, encoding='utf-8', indent=2)

    self.N = len(reflections['results'])
    print "total number of reflections:", self.N

    if self.N == self.old_N:  # no new reflections, so no need to summarize
        return
    self.old_N = self.N

    # The commented-out steps below are earlier pipeline stages (SENNA tagging,
    # phrase extraction, MEAD input/output, LSA similarity); they are kept for
    # reference but no longer run here.
    #
    # run senna
    # os.system('python CourseMirror_Survey.py ' + str(cid) + ' ' + str(max_lecture))
    # cmd = 'cmd /C "runSennaCourseMirror.bat ' + str(cid) + ' ' + str(max_lecture) + '"'
    # os.system(cmd)
    #
    # cmd = 'python QPS_extraction.py %s %d %s %s %s' % (cid, max_lecture, self.system, str(self.method), 'N')
    # os.system(cmd)
    # cmd = 'python QPS_prepare.py ' + str(cid) + ' ' + str(max_lecture) + ' ' + str(self.system) + ' ' + str(self.method)
    # os.system(cmd)
    #
    # get PhraseMead input (CourseMirror_MeadPhrase.py)
    # cmd = 'python CourseMirror_MeadPhrase.py ' + str(cid) + ' ' + str(max_lecture) + ' ' + str(self.system) + ' ' + str(self.method)
    # print cmd
    # os.system(cmd)
    #
    # olddir = os.path.dirname(os.path.realpath(__file__))
    #
    # get PhraseMead output
    # meaddir = global_params.meaddir
    # cmd = './get_mead_summary_phrase_qps.sh ' + str(cid) + ' ' + str(max_lecture) + ' ' + str(self.system)
    # os.chdir(meaddir)
    # retcode = subprocess.call([cmd], shell=True)
    # print retcode
    # subprocess.call("exit 1", shell=True)
    # os.chdir(olddir)
    #
    # get LSA results (CourseMirrorphrase2phraseSimilarity.java)
    # cmd = 'cmd /C "runLSA.bat ' + str(cid) + ' ' + str(max_lecture) + ' ' + str(self.system) + ' ' + str(self.method) + '"'
    # cmd = 'cmd /C "runLSA_All.bat ' + str(cid) + ' ' + str(max_lecture) + ' ' + str(self.system) + ' ' + str(self.method) + '"'
    # os.system(cmd)

    # get community detection results:
    # ClusterARank (CourseMirror_phraseClusteringbasedShallowSummaryKmedoid-New-Malformed-LexRank.py)
    cmd = "python CourseMirror_ClusterARank.py %s %d %s %s %s" % (
        cid, max_lecture, self.system, self.method, self.similarity)
    print cmd
    os.system(cmd)

    cmd = "python get_summary.py %s %s" % (cid, self.system)
    print cmd
    os.system(cmd)

    cmd = "python get_Rouge.py %s %d %s %s" % (
        cid, max_lecture, self.system, self.method + '_' + self.similarity)
    print cmd
    os.system(cmd)

    cmd = "python eval_student_number.py %s %d %s %s %s" % (
        cid, max_lecture, self.system, self.method, self.similarity)
    print cmd
    os.system(cmd)