def extractVocab(annotators, output):
    vocab = defaultdict(int)

    for docs in annotation.generate_all_files_by_annotators(
            annotation.datadir + 'json/', '.json',
            anotators=annotators, lectures=annotation.Lectures):
        doc0, lec0, annotator0 = docs[0]
        doc1, lec1, annotator1 = docs[1]
        assert (lec0 == lec1)
        lec = lec0

        #load tasks
        task0 = annotation.Task()
        task0.loadjson(doc0)

        task1 = annotation.Task()
        task1.loadjson(doc1)

        for prompt in ['q1', 'q2']:
            phrase_annotation0 = task0.get_phrase_annotation(prompt)
            phrase_annotation1 = task1.get_phrase_annotation(prompt)

            aligner = AlignPhraseAnnotation(task0, task1, prompt)
            aligner.align()

            #count token frequencies over the aligned responses
            for d in aligner.responses:
                tokens = [token.lower() for token in d['response']]
                for token in tokens:
                    vocab[token] += 1

    fio.SaveDict2Json(vocab, output)
def extractPhrasePaireFromAnnotation(phrasedir, annotators, id):
    for doc, lec, annotator in annotation.generate_all_files(
            annotation.datadir + 'json/', '.json',
            anotators=annotators, lectures=annotation.Lectures):
        print doc

        #load task
        task = annotation.Task()
        task.loadjson(doc)

        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            prefix = os.path.join(path, '%s.%s.' % (prompt, method))
            filename = path + prompt + sim_exe
            print filename

            featureset = []

            feature_extractor = Similarity(prefix)

            phrase_annotation = task.get_phrase_annotation(prompt)

            #pair up phrases: same rank is a positive example (1.0),
            #different ranks a negative example (0.0)
            for rank1 in sorted(phrase_annotation):
                for rank2 in sorted(phrase_annotation):
                    if rank1 == rank2:
                        score = 1.0
                    else:
                        score = 0.0

                    phrases1 = phrase_annotation[rank1]
                    phrases2 = phrase_annotation[rank2]

                    for phrasedict1 in phrases1:
                        p1 = phrasedict1['phrase'].lower().strip()

                        for phrasedict2 in phrases2:
                            p2 = phrasedict2['phrase'].lower().strip()

                            featureset.append(
                                (feature_extractor.get_features(p1, p2), score, {
                                    'p1': p1,
                                    'p2': p2
                                }))

            fio.SaveDict2Json(featureset, filename)

            feature_extractor.save()
def extractPhraseFromAnnotationIntersect(phrasedir, annotators):
    for docs in annotation.generate_all_files_by_annotators(
            annotation.datadir + 'json/', '.json',
            anotators=annotators, lectures=annotation.Lectures):
        doc0, lec0, annotator0 = docs[0]
        doc1, lec1, annotator1 = docs[1]
        assert (lec0 == lec1)
        lec = lec0

        #load tasks
        task0 = annotation.Task()
        task0.loadjson(doc0)

        task1 = annotation.Task()
        task1.loadjson(doc1)

        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.' + method + '.key'
            print filename

            phrase_annotation0 = task0.get_phrase_annotation(prompt)
            phrase_annotation1 = task1.get_phrase_annotation(prompt)

            aligner = AlignPhraseAnnotation(task0, task1, prompt)
            aligner.align()

            #keep only the phrases both annotators agree on
            extracted_phrases = aligner.get_intersect()

            fio.SaveList(extracted_phrases, filename)
def get_phrase_reference_summary_phrase(outputs=None):
    for output in outputs:
        fio.NewPath(output)

    counts = []
    for doc, lec, annotator in annotation.generate_all_files(
            annotation.datadir + 'json/', '.json',
            anotators=annotation.anotators, lectures=annotation.Lectures):
        print doc

        task = annotation.Task()
        task.loadjson(doc)

        sub_tasks = task.get_tasks()
        for sub_task in sub_tasks:
            if sub_task["task_name"] == "Phrase":
                if sub_task['prompt'] == 0:  #POI
                    type = 'q1'
                else:
                    type = 'q2'

                summary_filename = os.path.join(
                    output, str(lec),
                    type + '.ref.' + str(annotation.anotator_dict[annotator]))
                print summary_filename

                summaries = [row[1] for row in sub_task["summary"][1:]]
                colors = [row[0].strip()[1] for row in sub_task["summary"][1:]]
                student_numbers = [row[2].strip() for row in sub_task["summary"][1:]]

                count = 0
                for summary in summaries:
                    count += len(NLTKWrapper.wordtokenizer(summary))
                counts.append(count)

                fio.SaveList(summaries, summary_filename)

                color_filename = os.path.join(
                    output, str(lec),
                    '%s.ref.%s.color' % (type, str(annotation.anotator_dict[annotator])))
                fio.SaveList(colors, color_filename)

                no_filename = os.path.join(
                    output, str(lec),
                    '%s.ref.%s.no' % (type, str(annotation.anotator_dict[annotator])))
                fio.SaveList(student_numbers, no_filename)

    print counts
    print numpy.mean(counts)
    print numpy.median(counts)
def compare_length(annotator, output):
    lengths = defaultdict(list)

    for doc, lec, annotator in annotation.generate_all_files(
            annotation.datadir + 'json/', '.json',
            anotators=annotator, lectures=annotation.Lectures):
        print doc

        #load task
        task = annotation.Task()
        task.loadjson(doc)

        for prompt in ['q1', 'q2']:  #for each lecture, prompt
            #group the responses by student
            dict = {}
            raw_responses = task.get_raw_response(prompt)

            for response_row in raw_responses[1:]:
                student_id, response = response_row['student_id'], response_row['response']
                student_id = student_id.lower()

                if student_id not in dict:
                    dict[student_id] = []
                dict[student_id].append(response)

            #word count of each student's combined response
            for stu in dict:
                lengths[prompt].append(len(' '.join(dict[stu]).split()))

    import stats_util
    print '%s\t%f\t%f\t%f' % (
        course, np.mean(lengths['q1']), np.mean(lengths['q2']),
        stats_util.ttest(lengths['q1'], lengths['q2'], 2, 2)[-1])
def get_phrase_reference_summary_phrase_no(outputs=None):
    Numbers = []

    for doc, lec, annotator in annotation.generate_all_files(
            annotation.datadir + 'json/', '.json',
            anotators=annotation.anotators[:1], lectures=annotation.Lectures):
        print doc

        task = annotation.Task()
        task.loadjson(doc)

        sub_tasks = task.get_tasks()
        for sub_task in sub_tasks:
            if sub_task["task_name"] == "Phrase":
                #collect the supporter counts (third column of the summary table)
                student_numbers = [row[2].strip() for row in sub_task["summary"][1:]]
                Numbers += [int(x) for x in student_numbers]

    fio.SaveDict2Json(Numbers, '../data/%s_supporters.txt' % global_params.g_cid)
def extractPhraseFromAnnotation(phrasedir, annotator, summarydir=None):
    for doc, lec, annotator in annotation.generate_all_files(
            annotation.datadir + 'json/', '.json',
            anotators=annotator, lectures=annotation.Lectures):
        print doc

        #load task
        task = annotation.Task()
        task.loadjson(doc)

        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        #add a cache to make the ROUGE lookups faster
        Cache = {}
        cachefile = phrasedir + str(lec) + '/' + 'cache.json'
        if fio.IsExist(cachefile):
            with open(cachefile, 'r') as fin:
                Cache = json.load(fin)

        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.' + method + '.key'
            cluster_output = path + prompt + '.cluster.kmedoids.sqrt.oracle.%s' % method

            if summarydir:
                fio.NewPath(os.path.join(summarydir, str(lec)))
                summary_file = os.path.join(summarydir, str(lec), '%s.summary' % prompt)
                summaries = []

            body = []

            phrase_summary_dict = task.get_phrase_summary_textdict(prompt)

            extracted_phrases = []
            phrase_annotation = task.get_phrase_annotation(prompt)
            for rank in sorted(phrase_annotation):
                rank_phrases = []
                phrases = phrase_annotation[rank]
                for phrasedict in phrases:
                    phrase = phrasedict['phrase'].lower()
                    extracted_phrases.append(phrase)
                    rank_phrases.append(phrase)
                    row = [phrase, rank]
                    body.append(row)

                if summarydir:
                    rank_summary = phrase_summary_dict[rank]
                    max_summary = get_max_phrase_by_ROUGE(
                        rank_summary, rank_phrases, Cache)
                    print max_summary

                    summaries.append(max_summary)

            fio.SaveList(extracted_phrases, filename)

            fio.WriteMatrix(cluster_output, body, header=None)

            if summarydir:
                fio.SaveList(summaries, summary_file)

            with open(cachefile, 'w') as outfile:
                json.dump(Cache, outfile, indent=2)
def extractStatistics(annotator, output):
    students = set()

    body = []
    for doc, lec, annotator in annotation.generate_all_files(
            annotation.datadir + 'json/', '.json',
            anotators=annotator, lectures=annotation.Lectures):
        print doc

        #load task
        task = annotation.Task()
        task.loadjson(doc)

        for prompt in ['q1', 'q2']:  #for each lecture, prompt
            row = [lec, prompt]

            stu = set()  #students who responded
            wc = 0.0
            dict = {}
            raw_responses = task.get_raw_response(prompt)

            for response_row in raw_responses[1:]:
                student_id, response = response_row['student_id'], response_row['response']
                student_id = student_id.lower()

                if student_id not in dict:
                    dict[student_id] = []
                dict[student_id].append(response)

                students.add(student_id)
                stu.add(student_id)
                wc += len(response.split())

            response_number = len(dict)
            row.append(response_number)  #number of responses
            row.append(wc)  #word count
            row.append(wc / response_number)  #average number of words per response

            extracted_phrases = []
            phrase_annotation = task.get_phrase_annotation(prompt)

            stu_h = set()  #students covered by a highlight
            ph_c = 0
            for rank in sorted(phrase_annotation):
                phrases = phrase_annotation[rank]
                ph_c += len(phrases)

                for phrasedict in phrases:
                    phrase = phrasedict['phrase'].lower()
                    extracted_phrases.append(phrase)

                    student_id = phrasedict['student_id'].lower().strip()
                    stu_h.add(student_id)

            row.append(ph_c)  #phrase count

            coverage = stu.intersection(stu_h)
            coverage_ratio = len(coverage) * 1.0 / len(stu)
            row.append(coverage_ratio)

            body.append(row)

    head = [
        'lec', 'prompt', 'Response', 'Word', 'Word/Response', 'Highlights',
        'Coverage'
    ]

    #add average and std, both computed over the data rows only
    data_rows = list(body)

    row = ['', 'ave']
    for i in range(2, len(head)):
        scores = [float(xx[i]) for xx in data_rows]
        row.append(np.mean(scores))
    body.append(row)

    row = ['', 'std']
    for i in range(2, len(head)):
        scores = [float(xx[i]) for xx in data_rows]
        row.append(np.std(scores))
    body.append(row)

    fio.WriteMatrix(output, body, head)

    print(len(students))
def extractPhraseFromSyntax(extractiondir, annotators):
    for docs in annotation.generate_all_files_by_annotators(
            annotation.datadir + 'json/', '.json',
            anotators=annotators, lectures=annotation.Lectures):
        doc0, lec0, annotator0 = docs[0]
        doc1, lec1, annotator1 = docs[1]
        assert (lec0 == lec1)
        lec = lec0

        #load tasks
        task0 = annotation.Task()
        task0.loadjson(doc0)

        task1 = annotation.Task()
        task1.loadjson(doc1)

        path = extractiondir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.feature.crf'
            print filename

            fout = codecs.open(filename, "w", "utf-8")

            phrase_annotation0 = task0.get_phrase_annotation(prompt)
            phrase_annotation1 = task1.get_phrase_annotation(prompt)

            aligner = AlignPhraseAnnotation(task0, task1, prompt)
            aligner.align()

            for d in aligner.responses:
                tokens = [token.lower() for token in d['response']]
                tags = d['tags'][0]
                colors = d['colors']

                #drop empty tokens
                n_tokens = []
                n_tags = []
                for token, tag in zip(tokens, tags):
                    if len(token) == 0:
                        continue
                    n_tokens.append(token)
                    n_tags.append(tag)

                if len(n_tokens) == 0:
                    continue

                tokens = n_tokens
                tags = n_tags

                body = []

                #first column: the word token
                for word in tokens:
                    body.append([word])

                #add the colors
                for color in colors:
                    for i, tag in enumerate(tags):
                        body[i].append(str(color[i]))

                #add the annotation tags
                for i, tag in enumerate(tags):
                    body[i].append(tag)

                #add the NP tags from the SENNA parse
                psg_tags = getSennaPSGtags(tokens)
                for i, tag in enumerate(psg_tags):
                    body[i].append(tag)

                for row in body:
                    fout.write(' '.join(row))
                    fout.write('\n')
                fout.write('\n')

            fout.close()

        if debug: break
def extractPhraseFeatureFromCombine(extractiondir, annotators, empty='N'):
    for docs in annotation.generate_all_files_by_annotators(
            annotation.datadir + 'json/', '.json',
            anotators=annotators, lectures=annotation.Lectures):
        doc0, lec0, annotator0 = docs[0]
        doc1, lec1, annotator1 = docs[1]
        assert (lec0 == lec1)
        lec = lec0

        #load tasks
        task0 = annotation.Task()
        task0.loadjson(doc0)

        task1 = annotation.Task()
        task1.loadjson(doc1)

        path = extractiondir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.feature.crf'
            print filename

            fout = codecs.open(filename, "w", "utf-8")

            phrase_annotation0 = task0.get_phrase_annotation(prompt)
            phrase_annotation1 = task1.get_phrase_annotation(prompt)

            aligner = AlignPhraseAnnotation(task0, task1, prompt)
            aligner.align()

            crf_feature_extractor = CRF_Extractor()

            #add sentences to the extractor for global feature extraction
            for d in aligner.responses:
                tokens = [token.lower() for token in d['response']]
                colors = d['colors']

                #if the two annotators agree, keep one tag sequence; otherwise keep both
                if d['tags'][0] == d['tags'][1]:
                    combinetags = [d['tags'][0]]
                else:
                    combinetags = [d['tags'][0], d['tags'][1]]

                for tags in combinetags:
                    #drop empty tokens without clobbering `tokens`, so the
                    #second tag sequence still aligns with the raw tokens
                    n_tokens = []
                    n_tags = []
                    for token, tag in zip(tokens, tags):
                        if len(token) == 0:
                            continue
                        n_tokens.append(token)
                        n_tags.append(tag)

                    if len(n_tokens) == 0:
                        continue

                    crf_feature_extractor.add_sentence((n_tokens, n_tags, colors))

            for tokens, tags, colors in crf_feature_extractor.sentences:
                if empty == 'Y':
                    #skip sentences whose tags are all 'O' (no annotated phrase)
                    flag = True
                    for tag in tags:
                        if tag != 'O':
                            flag = False
                    if flag:
                        continue

                body = crf_feature_extractor.extract_crf_features(
                    tokens, tags, prompt, colors)

                for row in body:
                    fout.write(' '.join(row))
                    fout.write('\n')
                fout.write('\n')

            fout.close()
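# Hypothetical driver sketch, not part of the original module: it shows one way
# the extraction steps above might be chained together. The output paths
# ('../data/phrase/', '../data/extraction/', '../data/vocab.json') are assumed
# examples, and `annotation.anotators` is assumed to hold the annotator list
# that the functions above iterate over.
if __name__ == '__main__':
    phrasedir = '../data/phrase/'
    extractiondir = '../data/extraction/'

    #global vocabulary over the aligned responses
    extractVocab(annotation.anotators, '../data/vocab.json')

    #phrases both annotators agree on, one .key file per lecture/prompt
    extractPhraseFromAnnotationIntersect(phrasedir, annotation.anotators)

    #CRF feature files combining both annotators' tag sequences
    extractPhraseFeatureFromCombine(extractiondir, annotation.anotators, empty='N')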