import io
from collections import defaultdict
from itertools import permutations

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import KNeighborsClassifier

# reranker, getDirectoryPath, getFilePath, readModel, loadStanfordNERTagger,
# loadStanfordDependencyParser, generateMatrixForSummary, generateRandomOrders,
# and createFeatureVector are project-local helpers assumed to be in scope.


def chron_order(dataset='training'):
    """Order each topic's top sentences chronologically: by article time
    stamp, then by the sentence's original position within its article."""
    top_sentences = reranker.select_top(dataset)
    output_directoryPath = getDirectoryPath("outputs/reorder/devtest/")
    chron_sents = {}
    for topic_id in top_sentences.keys():
        sentences = top_sentences[topic_id]
        id_part1 = topic_id[:-1]
        id_part2 = topic_id[-1:]
        output_file_name = id_part1 + "-A.M.100." + id_part2 + ".1"
        output_file_path = output_directoryPath + "/" + output_file_name

        # Bucket sentences by date, then sort each bucket by original order.
        date_index = defaultdict(list)
        for sentence in sentences:
            date_index[sentence.time_stamp].append(sentence)

        chron_list = []
        for date in sorted(date_index):
            date_sents = date_index[date]
            date_sents.sort(key=lambda x: x.order)
            chron_list.extend(date_sents)

        # 'with' closes the file on exit; no explicit flush/close needed.
        with io.open(output_file_path, 'w', encoding='utf8') as outputFile:
            for sentence in chron_list:
                outputFile.write(sentence.clean_sent + '\n')

        chron_sents[topic_id] = chron_list
    return chron_sents
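# A minimal, self-contained illustration of the two-level ordering performed
# by chron_order. Because Python's sort is stable, bucketing by date and then
# sorting each bucket by position is equivalent to one sort on a composite
# key. The Sent namedtuple is a hypothetical stand-in for the sentence
# objects produced by reranker.select_top, not part of the pipeline.
from collections import namedtuple

Sent = namedtuple('Sent', ['time_stamp', 'order', 'clean_sent'])


def _chron_sort_demo():
    sents = [Sent('2004-11-02', 3, 'third'),
             Sent('2004-11-01', 1, 'first'),
             Sent('2004-11-02', 0, 'second')]
    # One composite-key sort, equivalent to the date_index grouping above.
    return [s.clean_sent for s in
            sorted(sents, key=lambda s: (s.time_stamp, s.order))]
    # -> ['first', 'second', 'third']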
def cohesion_order(dataset='training'):
    """Order each topic's top sentences to maximize the sum of TF-IDF cosine
    similarities between adjacent sentences, via exhaustive permutation
    search."""
    top_sentences = reranker.select_top(dataset)
    output_directoryPath = getDirectoryPath("outputs/reorder/devtest/")
    cohesion_sents = {}
    for topic_id in top_sentences.keys():
        sentences = top_sentences[topic_id]
        id_part1 = topic_id[:-1]
        id_part2 = topic_id[-1:]
        output_file_name = id_part1 + "-A.M.100." + id_part2 + ".1"
        output_file_path = output_directoryPath + "/" + output_file_name

        num_sents = len(sentences)
        clean_sents = [sentence.clean_sent for sentence in sentences]
        vectorizer = TfidfVectorizer()
        cosine_matrix = cosine_similarity(vectorizer.fit_transform(clean_sents))

        # Exhaustive search over all n! orderings; feasible only because a
        # 100-word summary contains a handful of sentences.
        ids = list(range(num_sents))
        max_score = float('-inf')
        winning_perm = tuple(ids)  # fall back to the input order
        for perm in permutations(ids):
            perm_score = sum(cosine_matrix[perm[i]][perm[i + 1]]
                             for i in range(num_sents - 1))
            if perm_score > max_score:
                max_score = perm_score
                winning_perm = perm

        cohesion_list = [sentences[i] for i in winning_perm]
        with io.open(output_file_path, 'w', encoding='utf8') as outputFile:
            for sentence in cohesion_list:
                outputFile.write(sentence.clean_sent)
                outputFile.write('\n')

        cohesion_sents[topic_id] = cohesion_list
    return cohesion_sents
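# A self-contained sketch of the same adjacent-similarity scoring on toy
# strings (illustrative data, not project data), reusing the imports above.
# It returns the texts in the ordering with the highest cohesion score.
def _best_cohesion_demo(texts):
    sim = cosine_similarity(TfidfVectorizer().fit_transform(texts))
    best, best_score = tuple(range(len(texts))), float('-inf')
    for perm in permutations(range(len(texts))):
        # Sum cosine similarity over each adjacent pair in this ordering.
        score = sum(sim[a, b] for a, b in zip(perm, perm[1:]))
        if score > best_score:
            best, best_score = perm, score
    return [texts[i] for i in best]

# e.g. _best_cohesion_demo(["The cat sat.", "A dog barked.", "The cat purred."])
# places the two cat sentences adjacent, since their shared terms give the
# only nonzero pairwise similarity.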
def entitygrid_reorder(dataset='training'):
    """Reorder each topic's top sentences with an entity-grid coherence
    model: a KNN classifier scores candidate orderings, and the ordering
    with the highest coherence probability is written to the output file."""
    output_directoryPath = getDirectoryPath("outputs/D3/")
    model_file_path = getFilePath("model")
    KNN = 11
    number_of_random_orders = 20

    # Read the trained model and build the classifier.
    vectors, labels = readModel(model_file_path)
    neigh = KNeighborsClassifier(n_neighbors=KNN)
    neigh.fit(vectors, labels)

    # NER tagger + dependency parser for building the entity grid.
    ner_tagger = loadStanfordNERTagger()
    stanford_dependency_parser = loadStanfordDependencyParser()

    # Top sentences from the PageRank + cosine reranker.
    top_sentences = reranker.select_top(dataset)
    for topic_id in top_sentences.keys():
        sentences = top_sentences[topic_id]
        id_part1 = topic_id[:-1]
        id_part2 = topic_id[-1:]
        output_file_name = id_part1 + "-A.M.100." + id_part2 + ".1"
        output_file_path = output_directoryPath + "/" + output_file_name

        print("summary ....")
        sentences = [sentence.clean_sent.strip() for sentence in sentences]
        for s in sentences:
            print(s)

        sent_ent_matrix = generateMatrixForSummary(sentences, ner_tagger,
                                                   stanford_dependency_parser)
        if sent_ent_matrix is None:
            continue

        # The keys of the sentence-entity matrix give the original order.
        original_order = list(sent_ent_matrix)

        # Score randomly generated orderings ...
        print("\nrandom ordering ..")
        random_orders = generateRandomOrders(original_order,
                                             number_of_random_orders)
        max_prob = -1
        best_order = []
        for random_order in random_orders:
            print(random_order)
            feature_vector = createFeatureVector(sent_ent_matrix, random_order)
            print(feature_vector)
            scores = neigh.predict_proba(feature_vector)
            print("scores: " + str(scores))
            if scores[0][1] > max_prob:
                max_prob = scores[0][1]
                best_order = random_order
        print("\n")

        # ... then score the original order against the best random one.
        print(original_order)
        feature_vector = createFeatureVector(sent_ent_matrix, original_order)
        print(feature_vector)
        scores = neigh.predict_proba(feature_vector)
        print("scores: " + str(scores))
        if scores[0][1] > max_prob:
            max_prob = scores[0][1]
            best_order = original_order

        print("Best score: " + str(max_prob))
        print(best_order)

        # Write the best ordering to the output file.
        with io.open(output_file_path, 'w', encoding='utf8') as outputFile:
            for order in best_order:
                outputFile.write(sentences[order] + "\n")
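# For context: entity-grid coherence models in the style of Barzilay & Lapata
# (2008) score an ordering by the distribution of entity transitions between
# adjacent sentences. The sketch below uses a simplified present/absent
# ('X'/'-') grid; the project's generateMatrixForSummary and
# createFeatureVector may encode richer syntactic roles from the dependency
# parser. All names here are illustrative, not part of the pipeline above.
from itertools import product


def _transition_features_demo(grid, order):
    # grid: {sentence_id: set of entity strings}; order: sequence of ids.
    entities = set().union(*grid.values())
    counts = {t: 0 for t in product('X-', repeat=2)}
    for a, b in zip(order, order[1:]):
        for e in entities:
            counts[('X' if e in grid[a] else '-',
                    'X' if e in grid[b] else '-')] += 1
    total = sum(counts.values()) or 1
    return [counts[t] / total for t in sorted(counts)]

# e.g. _transition_features_demo({0: {'Bush'}, 1: {'Bush', 'Iraq'},
#                                 2: {'Iraq'}}, [0, 1, 2])
# -> proportions over the (--, -X, X-, XX) transitions: [0.0, 0.25, 0.25, 0.5]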