def extractVocab(annotators, output):
    vocab = defaultdict(int)

    for docs in annotation.generate_all_files_by_annotators(
            annotation.datadir + 'json/', '.json',
            anotators=annotators, lectures=annotation.Lectures):
        doc0, lec0, annotator0 = docs[0]
        doc1, lec1, annotator1 = docs[1]

        assert lec0 == lec1

        # load both annotators' tasks
        task0 = annotation.Task()
        task0.loadjson(doc0)

        task1 = annotation.Task()
        task1.loadjson(doc1)

        for prompt in ['q1', 'q2']:
            # align the two annotations, then count tokens over the aligned
            # responses for global feature extraction
            aligner = AlignPhraseAnnotation(task0, task1, prompt)
            aligner.align()

            for d in aligner.responses:
                tokens = [token.lower() for token in d['response']]
                for token in tokens:
                    vocab[token] += 1

    fio.SaveDict2Json(vocab, output)
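# A minimal usage sketch (the output path is hypothetical; assumes the
# module-level `annotation.anotators` list used elsewhere in this file):
#
#   extractVocab(annotation.anotators[:2], '../data/vocab.json')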
def extractPhraseFromCRFWithColor(phrasedir, systemdir):
    crf_reader = CRF()
    aligner = AlignPhraseAnnotation()

    lectures = annotation.Lectures

    for i, lec in enumerate(lectures):
        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.' + method + '.key'

            extracted_phrases = []
            extracted_colors = []

            crf_file = os.path.join(systemdir, 'extraction', 'all_output',
                                    'test_%i_%s.out' % (i, prompt))

            # columns: token (0), CRF tag (-1), and two color annotations (-4, -3)
            for tokens, tags, color0, color1 in crf_reader.read_file_generator_index(
                    crf_file, [0, -1, -4, -3]):
                phrases, phrase_colors = aligner.get_phrase_with_colors(
                    tokens, tags, [color0, color1])

                for phrase, phrase_color in zip(phrases, phrase_colors):
                    extracted_phrases.append(phrase.lower())
                    extracted_colors.append(phrase_color)

            fio.SaveList(extracted_phrases, filename)

            filename = path + prompt + '.' + method + '.key.color'
            fio.SaveDict2Json(extracted_colors, filename)
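# Hypothetical invocation, assuming `phrasedir` and `systemdir` follow the
# extraction/all_output layout read above (both paths are illustrative only):
#
#   extractPhraseFromCRFWithColor('../data/phrase/', '../data/IE256/system/')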
def extractPhrasePaireFeature(phrasedir):
    for lec in annotation.Lectures:
        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            prefix = os.path.join(path, '%s.%s.' % (prompt, method))
            filename = path + prompt + sim_exe
            print filename

            featureset = []

            feature_extractor = Similarity(prefix)

            phrasefile = os.path.join(path, "%s.%s.key" % (prompt, method))
            phrases = fio.LoadList(phrasefile)

            # pair every phrase with every other phrase; pairs are unlabeled
            # here, so the score is fixed at 0.0
            for p1 in phrases:
                for p2 in phrases:
                    featureset.append(
                        (feature_extractor.get_features(p1, p2), 0.0,
                         {'p1': p1, 'p2': p2}))

            fio.SaveDict2Json(featureset, filename)

            feature_extractor.save()
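# Illustrative call: builds pairwise similarity features for every phrase pair
# under each lecture directory (the path is hypothetical):
#
#   extractPhrasePaireFeature('../data/phrase/')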
def extractPhrasePaireFromAnnotation(phrasedir, annotators, id):
    for doc, lec, annotator in annotation.generate_all_files(
            annotation.datadir + 'json/', '.json',
            anotators=annotators, lectures=annotation.Lectures):
        print doc

        # load the annotation task
        task = annotation.Task()
        task.loadjson(doc)

        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            prefix = os.path.join(path, '%s.%s.' % (prompt, method))
            filename = path + prompt + sim_exe
            print filename

            featureset = []

            feature_extractor = Similarity(prefix)

            phrase_annotation = task.get_phrase_annotation(prompt)

            # phrases sharing a rank form positive pairs (1.0); all other
            # pairs are negative (0.0)
            for rank1 in sorted(phrase_annotation):
                for rank2 in sorted(phrase_annotation):
                    score = 1.0 if rank1 == rank2 else 0.0

                    phrases1 = phrase_annotation[rank1]
                    phrases2 = phrase_annotation[rank2]

                    for phrasedict1 in phrases1:
                        p1 = phrasedict1['phrase'].lower().strip()
                        for phrasedict2 in phrases2:
                            p2 = phrasedict2['phrase'].lower().strip()

                            featureset.append(
                                (feature_extractor.get_features(p1, p2), score,
                                 {'p1': p1, 'p2': p2}))

            fio.SaveDict2Json(featureset, filename)

            feature_extractor.save()
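# Sketch of a call over the first two annotators (path is illustrative). Note
# that the `id` argument is unused in the body above, so any placeholder works:
#
#   extractPhrasePaireFromAnnotation('../data/phrase/', annotation.anotators[:2], 0)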
def PrepareIE256():
    cid = "IE256"
    maxWeek = 25

    excelfile = "../data/CourseMirror/Reflection.json"
    sennadir = "../../AbstractPhraseSummarization/data/IE256/senna/"
    #fio.NewPath(sennadir)
    #getStudentResponses4Senna(excelfile, cid, maxWeek, sennadir)

    outdirs = [
        #'../../AbstractPhraseSummarization/data/IE256/ILP_Baseline_Sentence/',
        #'../../AbstractPhraseSummarization/data/IE256/MC/',
        #'../../AbstractPhraseSummarization/data/IE256/ILP_Sentence_MC/',
        '../../AbstractPhraseSummarization/data/IE256/ILP_Sentence_Supervised_FeatureWeightingAveragePerceptron/',
    ]

    sheets = range(1, maxWeek + 1)

    for outdir in outdirs:
        for sheet in sheets:
            week = sheet

            for type in ['q1', 'q2', 'q3', 'q4']:
                student_summaryList = getStudentResponseList(
                    excelfile, cid, week, type, True)
                if len(student_summaryList) == 0:
                    continue

                path = os.path.join(outdir, str(week))
                fio.NewPath(path)

                # collect unique responses, their counts, and which students
                # gave each response
                source = {}
                responses = []
                count = defaultdict(int)

                for response, student in student_summaryList:
                    responses.append(response)
                    count[response] += 1

                    if response not in source:
                        source[response] = []
                    source[response].append(student)

                output = os.path.join(path, type + ".sentence.key")
                fio.SaveList(set(responses), output)

                output = os.path.join(path, type + '.sentence.keys.source')
                fio.SaveDict2Json(source, output)

                output = os.path.join(path, type + '.sentence.dict')
                fio.SaveDict(count, output)
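# PrepareIE256 takes no arguments; the course id, week range, and all input
# and output paths are hard-coded above:
#
#   PrepareIE256()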
def get_phrase_reference_summary_phrase_no(outputs=None):
    Numbers = []

    for doc, lec, annotator in annotation.generate_all_files(
            annotation.datadir + 'json/', '.json',
            anotators=annotation.anotators[:1], lectures=annotation.Lectures):
        print doc

        task = annotation.Task()
        task.loadjson(doc)

        sub_tasks = task.get_tasks()
        for sub_task in sub_tasks:
            if sub_task["task_name"] == "Phrase":
                # prompt 0 is the point-of-interest question (q1); otherwise q2
                if sub_task['prompt'] == 0:
                    type = 'q1'
                else:
                    type = 'q2'

                # column 2 of each summary row (skipping the header) holds the
                # supporter count
                student_numbers = [row[2].strip() for row in sub_task["summary"][1:]]
                Numbers += [int(x) for x in student_numbers]

    fio.SaveDict2Json(Numbers, '../data/%s_supporters.txt' % global_params.g_cid)
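# Illustrative call; the `outputs` argument defaults to None and is not used
# by the body above, and the output path is derived from global_params.g_cid:
#
#   get_phrase_reference_summary_phrase_no()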
def loadmodel(modelbin, vocabjson, output):
    vocab = fio.LoadDictJson(vocabjson)

    word_vecs = load_bin_vec(modelbin, vocab)

    fio.SaveDict2Json(word_vecs, output)
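# Hypothetical invocation, assuming `load_bin_vec` reads a word2vec-style
# binary (as its name suggests) and keeps only words in the vocabulary built
# by extractVocab. All paths are illustrative:
#
#   loadmodel('../data/GoogleNews-vectors-negative300.bin',
#             '../data/vocab.json', '../data/word_vecs.json')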
def getRouge(datadir, maxWeek, output):
    print datadir

    sheets = range(0, maxWeek)
    body = []

    for sheet in sheets:
        week = sheet + 1
        dir = datadir + str(week) + '/'

        for type in ['q1', 'q2']:
            summary_file = dir + type + "." + 'summary'
            print summary_file

            if not fio.IsExist(summary_file):
                print summary_file
                continue

            Cache = {}
            cachefile = os.path.join(datadir, str(week), 'cache.json')
            print cachefile
            if fio.IsExist(cachefile):
                with open(cachefile, 'r') as fin:
                    Cache = json.load(fin)

            # read the TA's reference summaries
            refs = []
            for i in range(2):
                reffile = os.path.join(datadir, str(week), type + '.ref.%d' % i)
                if not fio.IsExist(reffile):
                    print reffile
                    continue

                lines = fio.ReadFile(reffile)
                ref = [line.strip() for line in lines]
                refs.append(ref)

            if len(refs) == 0:
                continue

            # concatenate all available references (avoids an IndexError when
            # only one reference file exists)
            lstref = sum(refs, [])

            lines = fio.ReadFile(summary_file)
            TmpSum = [line.strip() for line in lines]

            # look up the ROUGE scores in the cache before recomputing
            cacheKey = OracleExperiment.getKey(lstref, TmpSum)
            if cacheKey in Cache:
                scores = Cache[cacheKey]
                print "Hit"
            else:
                print "Miss"
                print summary_file

                scores = OracleExperiment.getRouge_IE256(refs, TmpSum)
                Cache[cacheKey] = scores

            row = [week]
            row = row + scores
            body.append(row)

            try:
                fio.SaveDict2Json(Cache, cachefile)
            except Exception as e:
                #fio.SaveDict(Cache, cachefile + '.dict')
                print e

    # append a row of per-column averages
    header = ['id'] + RougeHeader
    row = ['ave']
    for i in range(1, len(header)):
        scores = [float(xx[i]) for xx in body]
        row.append(numpy.mean(scores))
    body.append(row)

    fio.WriteMatrix(output, body, header)
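# Example run: score the plain `<type>.summary` file for each of 25 weeks and
# write the ROUGE score matrix (paths are illustrative):
#
#   getRouge('../data/IE256/ILP_Baseline_Sentence/', 25, '../data/rouge.txt')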
def getRouge(datadir, maxWeek, output):
    sheets = range(0, maxWeek)
    body = []
    allbody = []

    #Krange = range(1, 25)
    #Krange = [np.random.randint(1, 25)]
    Krange = [gK]

    for sheet in sheets:
        week = sheet + 1
        dir = datadir + str(week) + '/'

        for type in ['q1', 'q2']:
            maxS = 0
            maxK = -1
            maxScore = []

            Cache = {}
            cachefile = os.path.join(datadir, str(week), 'cache.json')
            print cachefile
            if fio.IsExist(cachefile):
                with open(cachefile, 'r') as fin:
                    Cache = json.load(fin)

            allrow = [week]

            for K in Krange:
                summary_file = dir + type + '.%d.summary' % K
                print summary_file

                if not fio.IsExist(summary_file):
                    print summary_file
                    continue

                # read the TA's reference summaries
                refs = []
                for i in range(2):
                    reffile = os.path.join(datadir, str(week), type + '.ref.%d' % i)
                    if not fio.IsExist(reffile):
                        print reffile
                        continue

                    lines = fio.ReadFile(reffile)
                    ref = [line.strip() for line in lines]
                    refs.append(ref)

                if len(refs) == 0:
                    continue

                # concatenate all available references (avoids an IndexError
                # when only one reference file exists)
                lstref = sum(refs, [])

                lines = fio.ReadFile(summary_file)
                TmpSum = [line.strip() for line in lines]

                cacheKey = OracleExperiment.getKey(lstref, TmpSum)
                if cacheKey in Cache:
                    scores = Cache[cacheKey]
                    print "Hit"
                else:
                    print "Miss"
                    print summary_file

                    scores = OracleExperiment.getRouge_IE256(refs, TmpSum)
                    Cache[cacheKey] = scores

                s = float(scores[RIndex])
                allrow.append(s)

                # track the best-scoring K for this week/prompt
                if s >= maxS:
                    maxS = s
                    maxScore = scores
                    maxK = K

            if maxK == -1:
                continue

            row = [week]
            row = row + maxScore + [maxK]
            body.append(row)

            allrow.append(maxK)
            allbody.append(allrow)

            try:
                fio.SaveDict2Json(Cache, cachefile)
            except Exception:
                #fio.SaveDict(Cache, cachefile + '.dict')
                pass

    # append a row of per-column averages
    header = ['id'] + RougeHeader
    row = ['ave']
    for i in range(1, len(header)):
        scores = [float(xx[i]) for xx in body]
        row.append(numpy.mean(scores))
    body.append(row)

    fio.WriteMatrix(output, body, header)
    fio.WriteMatrix(output + '.all', allbody, ['week'] + Krange)
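# This variant of getRouge expects `<type>.<K>.summary` files and sweeps the
# candidate K values in Krange (here fixed to the module-level gK), keeping
# the best-scoring K per week/prompt. Illustrative call:
#
#   getRouge('../data/IE256/ILP_MC/', 25, '../data/rouge_K.txt')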
def save(self):
    fio.SaveDict2Json(self.Cache, self.cachefile)