def process_dataset(self, dataset, pickle_name):
    tfidf = TfidfPreprocess()
    pack = tfidf.preprocess_tfidf_dataset(dataset)
    sim_vector = pack[0]
    labels = pack[1]
    sim_vector = sim_vector.flatten()
    print(sim_vector)

    qwords = QwordPreprocess()
    question_vector = qwords.get_question_word_data(dataset)
    print(question_vector)

    we = WordEmbeddings()
    det_val_vector = we.get_det_val_dataset(dataset)
    print(det_val_vector)
    sum_val_vector = we.get_sum_vals_dataset(dataset)
    print(sum_val_vector)
    spacy_sim_vector = we.get_spacy_sim_dataset(dataset)
    print(spacy_sim_vector)

    ui = UnionIntersect()
    ui_vector = ui.get_percentage_dataset(dataset)
    print(ui_vector)

    # Note: det_val_vector is computed above but not included in the stacked feature matrix.
    matrix = np.vstack((sim_vector, question_vector, sum_val_vector, spacy_sim_vector, ui_vector))
    matrix = matrix.transpose()
    print(matrix)

    processed_data = dict(x=matrix, y=labels)
    with open(str(pickle_name) + ".p", "wb") as p:
        pickle.dump(processed_data, p)
class YesNoBot:
    def __init__(self, text_utils):
        self.text_utils = text_utils
        self.logger = logging.getLogger('YesNoBot')
        self.relevancy_detector = LGB_RelevancyDetector()
        #self.yes_no_model = XGB_YesNoModel()
        self.yes_no_model = NN_YesNoModel()
        self.word_embeddings = WordEmbeddings()
        self.show_relevancy = True

    def load_models(self, models_folder, w2v_folder):
        self.logger.info(u'Loading models from {}'.format(models_folder))
        self.models_folder = models_folder
        self.relevancy_detector.load(models_folder)
        self.yes_no_model.load(models_folder)
        self.wordchar2vector_path = os.path.join(models_folder, 'wordchar2vector.dat')
        self.word_embeddings.load_wc2v_model(self.wordchar2vector_path)
        p = self.yes_no_model.w2v_path
        if p is not None:
            p = os.path.join(w2v_folder, os.path.basename(p))
            self.word_embeddings.load_w2v_model(p)

    def get_yes_answer(self):
        return self.text_utils.language_resources[u'да']

    def get_no_answer(self):
        return self.text_utils.language_resources[u'нет']

    def get_unknown_answer(self):
        return self.text_utils.language_resources[u'неопределено']

    def infer_answer(self, premises0, question0):
        premises = [self.text_utils.canonize_text(f) for f in premises0]
        question = self.text_utils.canonize_text(question0)

        rel = 1.0
        if len(premises) == 1:
            # Check that the premise entered by the user is relevant to the question asked.
            premise = premises[0]
            rel = self.relevancy_detector.calc_relevancy1(premise, question, self.text_utils, self.word_embeddings)
            self.logger.debug('relevancy={}'.format(rel))

        y = self.yes_no_model.calc_yes_no(premises, question, self.text_utils, self.word_embeddings)
        self.logger.debug('y={}'.format(y))

        answer = None
        if y < 0.5:
            answer = self.get_no_answer()
        else:
            answer = self.get_yes_answer()

        return u'{} ({:4.2f})'.format(answer, rel)
def process_run(self, query_candidates):
    tfidf = TfidfPreprocess()
    sim_vector = tfidf.preprocess_tfidf_runtime(query_candidates)
    sim_vector = sim_vector.flatten()

    qwords = QwordPreprocess()
    question_vector = qwords.get_question_word_run(query_candidates)

    we = WordEmbeddings()
    det_val_vector = we.get_det_vals_run(query_candidates)
    sum_val_vector = we.get_sum_vals_run(query_candidates)
    spacy_sim_vector = we.get_spacy_sim_run(query_candidates)

    ui = UnionIntersect()
    ui_vector = ui.get_percentage_run(query_candidates)

    feature_matrix = np.vstack((sim_vector, question_vector, sum_val_vector, spacy_sim_vector, ui_vector))
    feature_matrix = feature_matrix.transpose()
    return feature_matrix
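# Hypothetical usage sketch for the two feature-extraction methods above (not part of the
# original sources): the class name FeaturePipeline, the dataset object, and the
# query/candidate input are placeholder assumptions for illustration only.
pipeline = FeaturePipeline()
pipeline.process_dataset(dataset, "train_features")      # writes train_features.p to disk
feature_matrix = pipeline.process_run(query_candidates)  # one row per candidate
print(feature_matrix.shape)                              # (n_candidates, 5)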
def main(args: argparse.Namespace) -> None:
    """Main entrypoint for the script."""
    temporal_embeddings = []
    paths = Path(args.checkpoint_root).glob(args.glob)
    for path in paths:
        path_name = path.stem
        path_name = path_name[path_name.find('-') + 1:path_name.find('_p')].replace('_', '-')
        path_date = parse(path_name, fuzzy=True).replace(day=1)

        # Load the word embeddings
        embeddings = WordEmbeddings(
            checkpoint_filepath=path,
            suffix_tree=False,
            nearest_neighbours=False
        )
        temporal_embeddings.append((path_date.date(), embeddings))

    figsize = (args.figure_width / args.figure_dpi, args.figure_height / args.figure_dpi)
    plt.figure(figsize=figsize, dpi=args.figure_dpi)

    # Draw the graph.
    plot_similarity_over_time(temporal_embeddings, args.word_a, args.word_b)

    if not args.output_path:
        plt.show()
    else:
        output_format = (args.output_path.suffix or 'png').replace('.', '')
        args.output_path.parent.mkdir(parents=True, exist_ok=True)
        if output_format == 'tex' or output_format == 'latex':
            tikzplotlib.save(args.output_path)
        else:
            plt.savefig(args.output_path, dpi=args.export_dpi)
        logger.info('Exported figure to {}'.format(args.output_path))
def main(args: argparse.Namespace) -> None:
    """Main entrypoint for the script."""
    # Ensure that at least one data argument was provided
    if args.checkpoint_directory is None \
            and args.weights_filepath is None \
            and args.vocab_filepath is None:
        logger.error('One of --checkpoints / (--weights-filepath '
                     'and --vocab-filepath) is required!')
        exit(1)

    if args.checkpoint_directory is not None:
        weights_filepath = args.checkpoint_directory / 'proj_weights.npy'
        vocab_filepath = args.checkpoint_directory / 'vocab.txt'
    else:
        weights_filepath = args.weights_filepath
        vocab_filepath = args.vocab_filepath

    embeddings = WordEmbeddings(
        weights_filepath, vocab_filepath,
        name_metadata=weights_filepath.parent.stem
    )
    # Start the embedding projector
    embedding_projector([embeddings], debug=args.debug, port=args.port)
def event_extraction(events_table, node, parent_verb):
    ancora = Ancora("ancora.vbs")
    if verb_not_aux(node):
        events_table[node.id] = Event(node, None, None, parent_verb)
    for child in node.children:
        node_child = child[1]
        relation_child = child[0]
        if is_verb(node):
            current_event = events_table[node.id]
            if relation_child == 'suj':
                current_event.subj = node_child
            elif relation_child == 'cd':
                current_event.obj = node_child
                if parent_verb is not None and verb_not_gerunde_nor_participle(parent_verb):
                    parent_event = events_table[parent_verb.id]
                    if parent_event.obj is None:
                        unanimity_valence_obj = ancora.unanimity_argument(parent_verb.form, parent_verb.lemma, SRL.OBJ)
                        if unanimity_valence_obj:
                            parent_event.obj = node_child
                        elif unanimity_valence_obj is None or (
                                unanimity_valence_obj is False and
                                ancora.one_category_argument(parent_verb.form, parent_verb.lemma, SRL.OBJ)):
                            word_embeddings = WordEmbeddings()
                            if word_embeddings.similar_words(parent_verb.form, node_child.form):
                                parent_event.obj = node_child
            elif relation_child in ('cpred', 'ci', 'cc', 'creg'):
                check_majority = ancora.check_majority_rule_category(node.form, node.lemma, SRL.THIRD, node_child.tag[0])
                if is_location_or_time(node_child):
                    current_event.add_circumstance_complements(node_child)
                elif check_majority or check_majority is None:
                    current_event.complement(node_child)
                    if parent_verb is not None:
                        parent_event = events_table[parent_verb.id]
                        if parent_event.complement is None:
                            check_unanimity_complement = ancora.check_unanimity_categories_argument_rule(
                                parent_verb.form, parent_verb.lemma, SRL.THIRD, node_child.tag[0])
                            if check_unanimity_complement:
                                parent_event.complement = node_child
        event_extraction(events_table, node_child, node if node.tag[0] == 'V' else parent_verb)
def run_embedding_projector(root_directory: Union[str, Path]) -> None:
    """Run the embedding projector on ALL the trained embeddings in the output/word2vec folder.

    Args:
        root_directory: The directory containing all the embedding checkpoints.
    """
    root_directory = Path(root_directory)
    embeddings_list = [
        WordEmbeddings(checkpoint_filepath=path, name_metadata=path.stem)
        for path in root_directory.glob('*/')
    ]

    # We can't have no embeddings!
    assert len(embeddings_list) > 0
    embedding_projector(embeddings_list)
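# Minimal usage sketch (an assumption, not from the original script): point the projector at
# a directory whose subdirectories each contain a saved WordEmbeddings checkpoint.
if __name__ == '__main__':
    run_embedding_projector('output/word2vec')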
def build_k_hop_graph(embeddings: WordEmbeddings, target_word: str, k: int,
                      alpha: Optional[float] = 0.50) -> nx.Graph:
    """Builds the k-hop graph for a word embeddings space.

    Args:
        embeddings: The word embeddings to generate the graph for.
        target_word: The word of interest.
        k: The number of 'hops' between the word of interest and every node in the graph.
            The resultant graph has the property that the word of interest is reachable
            from any node in at most k edges.
        alpha: The similarity threshold. Words that have a cosine similarity of at least
            this threshold are kept, and the rest are discarded.
    """
    # Verify the alpha threshold is <= max(similarity between interest word).
    max_alpha = embeddings.most_similar(target_word, k=1)[0][1]
    if alpha > max_alpha:
        raise ValueError(
            'Alpha threshold too high! The word of interest was not included '
            'in the graph. For the given target word, '
            '\'{}\', alpha can be AT MOST {}!'.format(target_word, max_alpha))

    graph = build_infinity_hop_graph(embeddings, alpha)

    # Get the word index of the word of interest.
    T = embeddings._vocabulary[target_word]

    # Compute the shortest paths from the word of interest to all reachable nodes.
    logger.info('Computing shortest paths')
    paths = nx.single_source_shortest_path_length(graph, T)

    logger.info('Building k-hop graph')
    nodes_to_delete = set()
    for node in tqdm.tqdm(graph.nodes):
        # Remove the node if the word of interest is not reachable in at most k edges.
        if node not in paths or paths[node] > k:
            nodes_to_delete.add(node)

    for node in nodes_to_delete:
        graph.remove_node(node)

    logger.info('Generated k-hop graph (nodes: {}, edges: {})'.format(
        len(graph.nodes), len(graph.edges)))
    return graph
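# A small usage sketch for build_k_hop_graph; the checkpoint path and target word below are
# placeholders and assume a trained WordEmbeddings checkpoint exists at that location.
embeddings = WordEmbeddings(checkpoint_filepath=Path('output/word2vec/example-checkpoint'))
graph = build_k_hop_graph(embeddings, 'apple', k=2, alpha=0.6)
print('k-hop graph: {} nodes, {} edges'.format(len(graph.nodes), len(graph.edges)))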
def main():
    print('Starting test model...')
    with tf.Graph().as_default() as g:
        #session = tf.Session()
        with tf.Session() as session:
            # saver = tf.train.Saver(tf.all_variables())
            model = Classifier()
            # model.session.run(tf.global_variables_initializer())
            saver = tf.train.import_meta_graph(model_path + '.meta')
            saver.restore(model.session, save_path='/tmp/' + model_path)
            # model.session.run(tf.global_variables_initializer())
            print('Model restored.')
            # print([v.op.name for v in tf.all_variables()])
            # print(model.session.run())

            # TODO complete this portion (feed in our own data, command line interface)
            embeddings = WordEmbeddings()
            while True:
                headline = input('Headline? ')
                article = input('Article? ')
                true_label = input('True label? ')

                h, a, t, l_h, l_a = get_articles_word_vectors(headline, article, true_label, embeddings)
                g_f = generate_features(headline, article)

                pred_stances = model.session.run(
                    [model.pred_stance, model.train_fn], {
                        model.inputs_articles: a,    # INSERT EMBEDDING
                        model.inputs_headlines: h,   # INSERT EMBEDDING
                        model.outputs: t,            # INSERT EMBEDDING
                        model.h_lengths: l_h,
                        model.a_lengths: l_a,
                        model.global_feats: g_f
                    })[0]

                print('predicted label = ' + str(LABELS[pred_stances[0]]) + '\n')
def __init__(self):
    Helpers.__init__(self)
    WordEmbeddings.__init__(self)
    FeatureEngineering.__init__(self)
# -*- coding: utf-8 -*-
from random import shuffle
from chinese import dist, silouhette_coefficent
from node import Node
from evento import Event
from etiquetadoeventos import Etiquetado_eventos
from word_embeddings import WordEmbeddings
from functools import reduce
from collections import Counter
import math

we = WordEmbeddings()

# Chinese Whispers Algorithm
# def CWA(event_list):
#     for i in range(len(event_list)):
#         event_list[i].type = i
#     not_converged = True
#     iteracion = 0
#     while not_converged:
#         # for e in event_list:
#         #     print(e)
#         iteracion += 1
#         print("==ITERACION==>" + str(iteracion))
#         shuffle(event_list)
#         clases_antes = [event.type for event in event_list]
#         for i in range(len(event_list)):
#             max_similarity = -1
#             for j in range(len(event_list)):
#                 if i != j:
from node import Node
from word_embeddings import WordEmbeddings
from ancora import Ancora
from ancora_enum import SRL
from evento import Event

ancora = Ancora("ancora.vbs")
word_embeddings = WordEmbeddings()


def main(node, sn_root_list):
    events_table = {}
    event_extraction(events_table, node, None)
    sentenceList = []

    # Resolve coreference: replace pronoun subjects with their referents.
    for event in events_table.values():
        if event.subj is not None and is_pronoun(event.subj):
            event.subj = resolve_coref(event, sn_root_list, node)

    # Augmentation: inherit the subject from the parent verb when it is missing.
    for verb_id, event in events_table.items():
        if event.subj is None and event.parent_verb is not None:
            augment_subject(events_table, verb_id, event.parent_verb.id, node)

    list_bow = [(event.bag_of_words(), event) for event in events_table.values()]
    erase_list = []
    for i, duple in enumerate(list_bow):
        for j in range(0, len(list_bow)):
            if i != j:
from flask import request

from sentence_embeddings import get_sentence_embedding, operations
from word_embeddings import WordEmbeddings

ukp_embeddings_share_url = 'https://public.ukp.informatik.tu-darmstadt.de/arxiv2018-xling-sentence-embeddings'
ukp_xling_embeddings_share_url = '{}/xling-wordembeddings'.format(ukp_embeddings_share_url)
ukp_monoling_embeddings_share_url = '{}/monolingual-wordembeddings'.format(ukp_embeddings_share_url)

embeddings = {
    'en-de': [
        WordEmbeddings('mapped_bivcd_en_de',
                       ukp_xling_embeddings_share_url,
                       'mapped_bivcd_en_de.txt.gz',
                       approximate_filesize=101035275,
                       file_n_lines=86761,
                       lowercased=True),
        WordEmbeddings('mapped_attract_repel_en_de',
                       ukp_xling_embeddings_share_url,
                       'mapped_attract_repel_en_de.txt.gz',
                       approximate_filesize=270155631,
                       file_n_lines=234036,
                       lowercased=True),
        WordEmbeddings(  # this is the small FT version. See full version in the comments below
            'mapped_fasttext_en_de',
            ukp_xling_embeddings_share_url,
            'mapped_fasttext_300k_en_de.txt.gz',
            approximate_filesize=680901729,
            file_n_lines=599959,
def es_similar_we(arg_1, arg_2):
    # Word-embedding similarity between the two arguments' surface forms;
    # the score is down-weighted when both arguments are verbs.
    res = WordEmbeddings().similar_words(arg_1.form(), arg_2.form())
    if arg_1.tag[0] == arg_2.tag[0] == 'v':
        res *= 0.2
    return res
def load_embeddings(wordchar2vector_path, word2vector_path, computed_params):
    embeddings = WordEmbeddings.load_word_vectors(wordchar2vector_path, word2vector_path)
    computed_params['word_dims'] = embeddings.get_vector_size()
    computed_params['word2vec'] = embeddings
    return embeddings
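# Hypothetical usage of load_embeddings; the two vector file paths are placeholders and are
# assumed to be in whatever format WordEmbeddings.load_word_vectors expects.
computed_params = {}
embeddings = load_embeddings('../tmp/wordchar2vector.dat', '../tmp/w2v.kv', computed_params)
print('word_dims =', computed_params['word_dims'])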
def main():
    # set up data
    d = DataSet()
    folds, hold_out = kfold_split(d, n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)
    embeddings = WordEmbeddings()
    print('Created data set and word embeddings')

    # create classifier
    model = Classifier()
    print('Set up model')

    # get word vector data
    x_articles = {}
    x_headlines = {}
    y_vals = {}
    for fold in fold_stances:
        x_headlines[fold], x_articles[fold], y_vals[fold] = get_articles_word_vectors(fold_stances[fold], d, embeddings)
    test_x_headlines, test_x_articles, test_y = get_articles_word_vectors(hold_out_stances, d, embeddings)
    print('Finished separating folds')

    # TODO get global feature data

    # train LSTM (fold -> epoch -> batch)
    model.session.run(tf.global_variables_initializer())
    for fold in fold_stances:
        ids = list(range(len(folds)))
        del ids[fold]

        x_train_articles = np.vstack(tuple([x_articles[i] for i in ids]))
        x_train_headlines = np.vstack(tuple([x_headlines[i] for i in ids]))
        y_train = np.vstack(tuple([y_vals[i] for i in ids]))
        print('train articles shape = ' + str(x_train_articles.shape))
        print('train headlines shape = ' + str(x_train_headlines.shape))
        print('y train shape = ' + str(y_train.shape))

        x_valid_articles = x_articles[fold]
        x_valid_headlines = x_headlines[fold]
        y_valid = y_vals[fold]

        fold_error = 0
        print('Training fold ' + str(fold))
        for epoch in range(10):
            batch_size = 512
            article_batches = []
            headline_batches = []
            output_batches = []
            start = 0
            while start < len(x_train_articles):
                article_chunk = x_train_articles[start:start + batch_size]
                headline_chunk = x_train_headlines[start:start + batch_size]
                output_chunk = y_train[start:start + batch_size]
                article_batches.append(article_chunk)
                headline_batches.append(headline_chunk)
                output_batches.append(output_chunk)
                start += batch_size

            for i in range(len(article_batches)):
                # Training error
                print("inputs_a: " + str(type(article_batches[i])))
                print("inputs_h: " + str(headline_batches[i]))
                print("inputs_a: " + str(article_batches[i].shape))
                print("inputs_h: " + str(headline_batches[i].shape))
                print("outputs: " + str(output_batches[i][:, -1, :].shape))
                epoch_error = model.session.run(
                    [model.error, model.train_fn], {
                        model.inputs_articles: article_batches[i],
                        model.inputs_headlines: headline_batches[i],
                        model.outputs: output_batches[i][:, -1, :]
                    })[0]
                if i % 10 == 0:
                    print('\tEpoch error for batch ' + str(i) + ' = ' + str(epoch_error))
                fold_error += epoch_error

        print('Training error (fold) = ' + str(fold_error / 10.0) + '\n')

        # cross-validation error
        valid_accuracy, pred_y_stances = model.session.run(
            [model.accuracy, model.pred_stance], {
                model.inputs_articles: x_valid_articles,
                model.inputs_headlines: x_valid_headlines,
                model.outputs: y_valid[:, -1, :]
            })

    # assess performance on validation set
    print('\n#### RUNNING ON HOLDOUT SET ####')
    test_accuracy, pred_y_stances = model.session.run(
        [model.accuracy, model.pred_stance], {
            model.inputs_articles: test_x_articles,
            model.inputs_headlines: test_x_headlines,
            model.outputs: test_y[:, -1, :]
        })

    simple_y = np.array([array[0].tolist().index(1) for array in test_y])
    f1_score = metrics.f1_score(simple_y, pred_y_stances, average='macro')
    print("F1 MEAN score: " + str(f1_score))
    f1_score_labels = metrics.f1_score(simple_y, pred_y_stances, labels=[0, 1, 2, 3], average=None)
    print("F1 LABEL scores: " + str(f1_score_labels))

    # Convert to string labels for FNC scoring metric
    label_map = {0: "agree", 1: "disagree", 2: "discuss", 3: "unrelated"}
    simple_y_str = [label_map[label] for label in simple_y]
    pred_y_stances_str = [label_map[label] for label in pred_y_stances]
    report_score(simple_y_str, pred_y_stances_str)
def main():
    d = DataSet()
    folds, hold_out = kfold_split(d, n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)
    embeddings = WordEmbeddings()
    print('Created data set and word embeddings')

    # with tf.Graph().as_default() as g:
    # create classifier
    model = Classifier()
    print('Set up model')

    with model.graph.as_default():
        # get word vector data
        x_articles = {}
        x_headlines = {}
        y_vals = {}
        lengths_a = {}
        lengths_h = {}
        x_global = {}
        y_global = {}

        for fold in fold_stances:
            x_headlines[fold], x_articles[fold], y_vals[fold], lengths_h[fold], lengths_a[fold] = get_articles_word_vectors(
                fold_stances[fold], d, embeddings)
            x_global[fold], y_global[fold] = generate_features(fold_stances[fold], d, str(fold))

        test_x_headlines, test_x_articles, test_y, test_h_lengths, test_a_lengths = get_articles_word_vectors(
            hold_out_stances, d, embeddings)
        test_x_global, test_y_global = generate_features(hold_out_stances, d, 'holdout')
        print('Finished separating folds')

        # TODO get global feature data

        # train LSTM (fold -> epoch -> batch)
        # model.session.run(tf.initialize_all_variables())
        saver = tf.train.Saver(tf.all_variables())
        '''
        saver = tf.train.Saver({'cell_articles_fw': model.cell_articles_fw,
                                'cell_articles_bw': model.cell_articles_bw,
                                'cell_headlines_fw': model.cell_headlines_fw,
                                'cell_headlines_bw': model.cell_headlines_bw})
        '''

        for fold in fold_stances:
            ids = list(range(len(folds)))
            del ids[fold]

            x_train_articles = np.vstack(tuple([x_articles[i] for i in ids]))
            x_train_headlines = np.vstack(tuple([x_headlines[i] for i in ids]))
            y_train = np.vstack(tuple([y_vals[i] for i in ids]))
            lengths_h_train = np.vstack(tuple([lengths_h[i] for i in ids]))
            lengths_a_train = np.vstack(tuple([lengths_a[i] for i in ids]))
            global_train = np.vstack(tuple([x_global[i] for i in ids]))
            # print('train articles shape = ' + str(x_train_articles.shape))
            # print('train headlines shape = ' + str(x_train_headlines.shape))
            # print('y train shape = ' + str(y_train.shape))

            x_valid_articles = x_articles[fold]
            x_valid_headlines = x_headlines[fold]
            y_valid = y_vals[fold]
            length_h_valid = lengths_h[fold]
            length_a_valid = lengths_a[fold]
            global_valid = x_global[fold]

            # Training batches
            article_batches_train, headline_batches_train, output_batches_train, length_h_batches_train, length_a_batches_train, global_batches_train = create_batches(
                x_train_articles, x_train_headlines, y_train, lengths_h_train, lengths_a_train, global_train)

            fold_error = 0
            print('Training fold ' + str(fold))
            j = 0
            for epoch in range(5):
                '''
                # Training batches
                article_batches_train, headline_batches_train, output_batches_train, length_h_batches_train, length_a_batches_train, global_batches_train = create_batches(
                    x_train_articles, x_train_headlines, y_train, lengths_h_train, lengths_a_train, global_train)
                '''
                print(len(article_batches_train))
                for i in range(len(article_batches_train)):
                    # Training error
                    epoch_error = model.session.run(
                        [model.error, model.train_fn], {
                            model.inputs_articles: article_batches_train[i],
                            model.inputs_headlines: headline_batches_train[i],
                            model.outputs: output_batches_train[i],
                            model.h_lengths: length_h_batches_train[i],
                            model.a_lengths: length_a_batches_train[i],
                            model.global_feats: global_batches_train[i]
                        })[0]
                    print('\tEpoch ' + str(j) + ' error = ' + str(epoch_error))
                    fold_error += epoch_error
                    j += 1

            print('Training error (fold) = ' + str(fold_error / j) + '\n')
            print('LSTM Cell Weights')
            print('\tFW articles ' + str(model.rnn_states_articles[0]))
            print('\tBW articles ' + str(model.rnn_states_articles[1]))
            print('\tFW headlines ' + str(model.rnn_states_headlines[0]))
            print('\tBW headlines ' + str(model.rnn_states_headlines[1]))

            # Validation batches
            article_batches_valid, headline_batches_valid, output_batches_valid, length_h_batches_valid, length_a_batches_valid, global_batches_valid = create_batches(
                x_valid_articles, x_valid_headlines, y_valid, length_h_valid, length_a_valid, global_valid)

            all_pred_y_stances = []
            for i in range(len(article_batches_valid)):
                # cross-validation error
                pred_y_stances = model.session.run(
                    [model.pred_stance], {
                        model.inputs_articles: article_batches_valid[i],
                        model.inputs_headlines: headline_batches_valid[i],
                        model.outputs: output_batches_valid[i],
                        model.h_lengths: length_h_batches_valid[i],
                        model.a_lengths: length_a_batches_valid[i],
                        model.global_feats: global_batches_valid[i]
                    })
                all_pred_y_stances = np.append(all_pred_y_stances, pred_y_stances)

            simple_y = np.array([array.tolist().index(1) for array in y_valid])
            '''
            f1_score = metrics.f1_score(simple_y, pred_y_stances, average='macro')
            print("F1 MEAN score: " + str(f1_score))
            f1_score_labels = metrics.f1_score(simple_y, pred_y_stances, labels=[0, 1, 2, 3], average=None)
            print("F1 LABEL scores: " + str(f1_score_labels))
            '''

            # Convert to string labels for FNC scoring metric
            label_map = {0: "agree", 1: "disagree", 2: "discuss", 3: "unrelated"}
            simple_y_str = [label_map[label] for label in simple_y]
            pred_y_stances_str = [label_map[label] for label in all_pred_y_stances]
            report_score(simple_y_str, pred_y_stances_str)

        # assess performance on test set
        print('\n#### RUNNING ON HOLDOUT SET ####')

        # Test batches
        article_batches_test, headline_batches_test, output_batches_test, length_h_batches_test, length_a_batches_test, global_batches_test = create_batches(
            test_x_articles, test_x_headlines, test_y, test_h_lengths, test_a_lengths, test_x_global)

        all_pred_y_test = []
        for i in range(len(article_batches_test)):
            pred_y_stances = model.session.run(
                [model.pred_stance], {
                    model.inputs_articles: article_batches_test[i],
                    model.inputs_headlines: headline_batches_test[i],
                    model.outputs: output_batches_test[i],
                    model.h_lengths: length_h_batches_test[i],
                    model.a_lengths: length_a_batches_test[i],
                    model.global_feats: global_batches_test[i]
                })
            all_pred_y_test = np.append(all_pred_y_test, pred_y_stances)

        simple_y = np.array([array.tolist().index(1) for array in test_y])
        f1_score = metrics.f1_score(simple_y, all_pred_y_test, average='macro')
        print("F1 MEAN score: " + str(f1_score))
        f1_score_labels = metrics.f1_score(simple_y, all_pred_y_test, labels=[0, 1, 2, 3], average=None)
        print("F1 LABEL scores: " + str(f1_score_labels))

        # Convert to string labels for FNC scoring metric
        label_map = {0: "agree", 1: "disagree", 2: "discuss", 3: "unrelated"}
        simple_y_str = [label_map[label] for label in simple_y]
        pred_y_stances_str = [label_map[label] for label in all_pred_y_test]
        report_score(simple_y_str, pred_y_stances_str)

        h, b = [], []
        for stance in hold_out_stances:
            h.append(stance['Headline'])
            b.append(d.articles[stance['Body ID']])
        b = [" ".join(body) for body in b]

        print('### CLASSIFICATIONS ###')
        for i in range(len(b)):
            print('Pair ' + str(i))
            print('\tHeadline: ' + str(h[i]))
            print('\tBody: ' + str(b[i]))
            print('\tTrue label: ' + str(simple_y_str[i]))
            print('\tAssigned label: ' + str(pred_y_stances_str[i]))

        saver.save(model.session, '/tmp/' + model_path)
class SimpleAnsweringMachine(BaseAnsweringMachine):
    """
    A chatbot based on a set of neural network and other models
    (https://github.com/Koziev/chatbot).
    """

    def __init__(self, facts_storage, text_utils):
        super(SimpleAnsweringMachine, self).__init__()
        self.facts_storage = facts_storage
        self.trace_enabled = False
        self.session_factory = SimpleDialogSessionFactory(self.facts_storage)
        self.text_utils = text_utils
        self.logger = logging.getLogger('SimpleAnsweringMachine')
        self.scripting = None
        self.enable_smalltalk = False
        self.enable_scripting = False

        # If the relevancy of a fact in the knowledge base to the question is below this
        # threshold, the fact is not suitable for generating a fact-based answer.
        self.min_premise_relevancy = 0.3

    def get_model_filepath(self, models_folder, old_filepath):
        """
        For internal use: rewrites the absolute path to a model data file so that it
        points to the given folder.
        """
        _, tail = os.path.split(old_filepath)
        return os.path.join(models_folder, tail)

    def load_models(self, models_folder, w2v_folder):
        self.logger.info(u'Loading models from {}'.format(models_folder))
        self.models_folder = models_folder

        # Load the shared parameters of the neural network models
        with open(os.path.join(models_folder, 'qa_model_selector.config'), 'r') as f:
            model_config = json.load(f)
            self.max_inputseq_len = model_config['max_inputseq_len']
            self.wordchar2vector_path = self.get_model_filepath(models_folder, model_config['wordchar2vector_path'])
            self.PAD_WORD = model_config['PAD_WORD']
            self.word_dims = model_config['word_dims']

        self.qa_model_config = model_config

        # TODO: move the choice of the concrete implementation for each model type into the
        # base class, driven by the 'engine' field of the model configuration ('nn' for the
        # neural network models, 'xgb' for gradient boosting). That would remove the
        # unnecessary coupling between this class and the concrete model implementations.

        # Premise/question relevancy model based on XGB
        #self.relevancy_detector = XGB_RelevancyDetector()
        self.relevancy_detector = LGB_RelevancyDetector()
        self.relevancy_detector.load(models_folder)

        # Model that detects whether two phrases are synonymous
        #self.synonymy_detector = NN_SynonymyDetector()
        #self.synonymy_detector.load(models_folder)
        self.synonymy_detector = Jaccard_SynonymyDetector()

        self.interpreter = NN_Interpreter()
        self.interpreter.load(models_folder)

        # Model that decides whether the set of premises is sufficient to answer the question
        self.enough_premises = NN_EnoughPremisesModel()
        self.enough_premises.load(models_folder)

        # Composite model (a group of models) for generating the answer text
        self.answer_builder = AnswerBuilder()
        self.answer_builder.load_models(models_folder)

        # XGB-based grammatical person classifier
        self.person_classifier = XGB_PersonClassifierModel()
        self.person_classifier.load(models_folder)

        # Neural network model for changing the grammatical person
        self.person_changer = NN_PersonChange()
        self.person_changer.load(models_folder)

        # Load the word vector models
        self.word_embeddings = WordEmbeddings()
        self.word_embeddings.load_models(models_folder)
        self.word_embeddings.load_wc2v_model(self.wordchar2vector_path)
        for p in self.answer_builder.get_w2v_paths():
            p = os.path.join(w2v_folder, os.path.basename(p))
            self.word_embeddings.load_w2v_model(p)

        self.word_embeddings.load_w2v_model(
            os.path.join(w2v_folder, os.path.basename(self.enough_premises.get_w2v_path())))

        self.logger.debug('All models loaded')

    def set_scripting(self, scripting):
        self.scripting = scripting

    def start_conversation(self, interlocutor):
        """
        Start of the conversation between the bot and interlocutor; no replies have been
        exchanged yet. The bot may greet the interlocutor or remind them of something if
        there is a pending reminder in their session, etc. The phrase to show to the
        interlocutor is placed into the output buffer with the say method; the outer
        processing loop extracts it from there and prints it to the console, etc.
        :param interlocutor: string identifier of the interlocutor.
        :return: the reply string that the bot will say.
        """
        session = self.get_session(interlocutor)
        if self.scripting is not None and self.enable_scripting:
            phrase = self.scripting.start_conversation(self, session)
            if phrase is not None:
                self.say(session, phrase)

    def change_person(self, phrase, target_person):
        return self.person_changer.change_person(phrase, target_person, self.text_utils, self.word_embeddings)

    def get_session_factory(self):
        return self.session_factory

    def is_question(self, phrase):
        return phrase[-1] == u'?'

    def interpret_phrase(self, session, raw_phrase):
        interpreted = InterpretedPhrase(raw_phrase)
        phrase = raw_phrase
        phrase_is_question = self.is_question(raw_phrase)

        # The history of phrases is available in session as conversation_history
        if len(session.conversation_history) > 0\
           and session.conversation_history[-1].is_bot_phrase\
           and session.conversation_history[-1].is_question\
           and not phrase_is_question\
           and self.interpreter is not None:
            # This branch handles the situation where the bot asked a question and the
            # interlocutor gave a short answer. Using a dedicated model we try to
            # reconstruct the full text of the interlocutor's answer.
            context_phrases = []
            context_phrases.append(session.conversation_history[-1].interpretation)
            context_phrases.append(raw_phrase)
            phrase = self.interpreter.interpret(context_phrases, self.text_utils, self.word_embeddings)

            # Determine the grammatical person of the resulting interpretation
            person = self.person_classifier.detect_person(phrase, self.text_utils, self.word_embeddings)
            if person == '2s':  # the interpretation "Тебя зовут Илья" was derived from "Меня зовут Илья"
                person = '1s'
            elif person == '1s':
                person = '2s'

            if self.trace_enabled:
                self.logger.debug('detected person={}'.format(person))
        else:
            # Determine the grammatical person of the input sentence.
            person = self.person_classifier.detect_person(raw_phrase, self.text_utils, self.word_embeddings)
            if self.trace_enabled:
                self.logger.debug('detected person={}'.format(person))

            # The grammatical person may need to be changed.
            if person == '1s':
                phrase = self.change_person(raw_phrase, '2s')
            elif person == '2s':
                phrase = self.change_person(raw_phrase, '1s')

        interpreted.interpretation = phrase
        interpreted.is_question = phrase_is_question
        interpreted.phrase_person = person
        return interpreted

    def say(self, session, answer):
        answer_interpretation = InterpretedPhrase(answer)
        answer_interpretation.is_bot_phrase = True
        answer_interpretation.is_question = self.is_question(answer)
        session.add_to_buffer(answer)
        session.add_phrase_to_history(answer_interpretation)

    def push_phrase(self, interlocutor, phrase):
        question = self.text_utils.canonize_text(phrase)
        if question == u'#traceon':
            self.trace_enabled = True
            return
        elif question == u'#traceoff':
            self.trace_enabled = False
            return
        elif question == u'#facts':
            for fact, person, fact_id in self.facts_storage.enumerate_facts(interlocutor):
                print(u'{}'.format(fact))
            return

        session = self.get_session(interlocutor)

        # Interpret the phrase taking into account previously received phrases, so that we can
        # resolve anaphora, restore omitted constituents explicitly, etc., and determine whether
        # the phrase is a question, a fact, or an imperative statement.
        interpreted_phrase = self.interpret_phrase(session, question)

        # The interpretation of phrases, and in general the reaction to them, also depends on
        # the conversation history, so the interpretation result is added to the history right away.
        session.add_phrase_to_history(interpreted_phrase)

        answer_generated = False

        if not interpreted_phrase.is_question:
            # A statement is added as a fact to the knowledge base, into the section for the
            # current interlocutor.
            # TODO: facts about third parties should go into the shared section of the base,
            # not into the current interlocutor's section.
            fact_person = '3'
            if interpreted_phrase.phrase_person == '1s':
                fact_person = '2s'
            elif interpreted_phrase.phrase_person == '2s':
                fact_person = '1s'
            fact = interpreted_phrase.interpretation
            if self.trace_enabled:
                print(u'Adding [{}] to knowledge base'.format(fact))
            self.facts_storage.store_new_fact(interlocutor, (fact, fact_person, '--from dialogue--'))

            if self.scripting is not None and self.enable_scripting:
                answer = self.scripting.generate_response4nonquestion(self, interlocutor, interpreted_phrase)
                if answer is not None:
                    answer_generated = True

            if not answer_generated:
                if self.enable_smalltalk:
                    # Pick a suitable reply to the interlocutor's non-question (usually this is
                    # an answer to a question we asked earlier).
                    smalltalk_phrases = self.facts_storage.enumerate_smalltalk_replicas()
                    best_premise, best_rel = self.synonymy_detector.get_most_similar(
                        interpreted_phrase.interpretation,
                        [(item.query, -1, -1) for item in smalltalk_phrases],
                        self.text_utils,
                        self.word_embeddings)

                    # If the relevancy of the found reply is too low, a different algorithm is needed...
                    for item in smalltalk_phrases:
                        if item.query == best_premise:
                            # Pick a random answer variant.
                            # TODO: refine the choice by picking the most relevant variant, so that
                            # the reply takes into account either the current discourse or ???...
                            # Note that the answer replies in SmalltalkReplicas may be unnormalized,
                            # so they should be normalized first.
                            answer = np.random.choice(item.answers)
                            answer_generated = True
                            break

            if answer_generated:
                self.say(session, answer)
        else:
            # Handle the question
            answers = self.build_answers(interlocutor, interpreted_phrase)
            for answer in answers:
                self.say(session, answer)

            # Besides answering the question, another reply may be needed,
            # for example to change the topic of the conversation.
            if len(answers) > 0:
                if self.scripting is not None and self.enable_scripting:
                    additional_speech = self.scripting.generate_after_answer(
                        self, interlocutor, interpreted_phrase, answers[-1])
                    if additional_speech is not None:
                        self.say(session, additional_speech)

    def build_answers0(self, interlocutor, interpreted_phrase):
        if self.trace_enabled:
            self.logger.debug(u'Question to process={}'.format(interpreted_phrase.interpretation))

        # Is a premise needed to answer the question?
        # Use a model that returns the probability that an empty list of premises is sufficient.
        p_enough = self.enough_premises.is_enough(premise_str_list=[],
                                                  question_str=interpreted_phrase.interpretation,
                                                  text_utils=self.text_utils,
                                                  word_embeddings=self.word_embeddings)
        if p_enough > 0.5:
            # The single answer can be built without a premise, for example for the
            # question "Сколько будет 2 плюс 2?"
            answer_rel = p_enough
            answers, answer_rels = self.answer_builder.build_answer_text(
                [u''], [1.0], interpreted_phrase.interpretation, self.text_utils, self.word_embeddings)
            if len(answers) != 1:
                self.logger.debug(u'Exactly 1 answer was expected for question={}, got {}'.format(
                    interpreted_phrase.interpretation, len(answers)))
            return answers, answer_rels
        else:
            # Find the most relevant premises
            memory_phrases = list(self.facts_storage.enumerate_facts(interlocutor))
            best_premises, best_rels = self.relevancy_detector.get_most_relevant(
                interpreted_phrase.interpretation,
                memory_phrases,
                self.text_utils,
                self.word_embeddings,
                nb_results=3)
            if self.trace_enabled:
                self.logger.debug(u'Best premise is "{}" with relevancy={}'.format(best_premises[0], best_rels[0]))

            premises2 = []
            premise_rels2 = []
            max_rel = max(best_rels)
            for premise, rel in itertools.izip(best_premises, best_rels):
                if rel >= self.min_premise_relevancy and rel >= 0.5 * max_rel:
                    premises2.append([premise])
                    premise_rels2.append(rel)

            # Generate the answer based on the selected premises.
            answers, answer_rels = self.answer_builder.build_answer_text(
                premises2, premise_rels2, interpreted_phrase.interpretation,
                self.text_utils, self.word_embeddings)

            return answers, answer_rels

    def build_answers(self, interlocutor, interpreted_phrase):
        answers, answer_confidenses = self.build_answers0(interlocutor, interpreted_phrase)
        if len(answer_confidenses) == 0 or max(answer_confidenses) < self.min_premise_relevancy:
            # Here we need an algorithm for generating an answer when the bot does not have
            # the required facts. It can be either an "I don't know" reply or a "no" reply
            # for certain categories of questions.
            if self.scripting is not None:
                answer = self.scripting.buid_answer(self, interlocutor, interpreted_phrase)
                answers = [answer]

        return answers

    def pop_phrase(self, interlocutor):
        session = self.get_session(interlocutor)
        return session.extract_from_buffer()

    def get_session(self, interlocutor):
        return self.session_factory[interlocutor]
import operator
import pickle
from collections import Counter

from nltk import word_tokenize

from word_embeddings import WordEmbeddings


def findKMostFrequentWords(data_x, k):
    data = []
    for ele in data_x:
        sentence = ele[1]
        sentence = word_tokenize(sentence)
        data.append(sentence)
    ctr = Counter(tuple([word for sublist in data for word in sublist]))
    sorted_ctr = sorted(ctr.items(), key=operator.itemgetter(1), reverse=True)
    return [item[0] for item in sorted_ctr[0:k]]


full_embeddings = WordEmbeddings()
full_embeddings.create_embeddings_from_file(args.embedding_path)
words = findKMostFrequentWords(train_dataset.final_data, args.vocab_size)
reduced_embeddings = WordEmbeddings()
reduced_embeddings.create_reduced_embeddings(full_embeddings, words)
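# Toy illustration of findKMostFrequentWords (not part of the original script); it assumes
# the NLTK 'punkt' tokenizer data has been downloaded.
toy_data = [(0, 'the cat sat on the mat'), (1, 'the cat ran'), (2, 'a dog ran home')]
print(findKMostFrequentWords(toy_data, 3))  # e.g. ['the', 'cat', 'ran']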
def visualise_k_hop_graph(target_word: str,
                          checkpoint: Optional[Union[str, Path]] = None,
                          weights_filepath: Optional[Union[str, Path]] = None,
                          vocab_filepath: Optional[Union[str, Path]] = None,
                          k: Optional[int] = 2,
                          alpha: Optional[float] = None,
                          min_node_size: Optional[float] = 20,
                          max_node_size: Optional[float] = 120,
                          min_font_size: Optional[float] = 6,
                          max_font_size: Optional[float] = 24,
                          node_alpha: Optional[float] = 1,
                          edge_alpha: Optional[float] = 0.15,
                          target_word_label_colour: Optional[str] = 'black',
                          colour_map: Optional[str] = 'tab20c',
                          output_path: Optional[Union[str, Path]] = None,
                          figure_width: Optional[int] = 800,
                          figure_height: Optional[int] = 600,
                          figure_dpi: Optional[int] = 96,
                          export_dpi: Optional[int] = 96,
                          verbose: Optional[bool] = False) -> None:
    """Visualise the k-hop graph for the given word embeddings and interest word.

    Requires one of checkpoint / (weights_filepath and vocab_filepath).
    If output_path is specified, then no preview window is drawn.
    """
    # Ensure that at least one data argument was provided
    if checkpoint is None and weights_filepath is None and vocab_filepath is None:
        logger.error('One of checkpoint / (weights-filepath and vocab-filepath) is required!')
        exit(1)

    if checkpoint is not None:
        checkpoint = Path(checkpoint)
        weights_filepath = checkpoint / 'proj_weights.npy'
        vocab_filepath = checkpoint / 'vocab.txt'
    else:
        weights_filepath = Path(weights_filepath)
        vocab_filepath = Path(vocab_filepath)

    if not verbose:
        logger.setLevel(logging.ERROR)

    embeddings = WordEmbeddings(weights_filepath, vocab_filepath,
                                name_metadata=weights_filepath.parent.stem)

    figsize = (figure_width / figure_dpi, figure_height / figure_dpi)
    plt.figure(figsize=figsize, dpi=figure_dpi)

    draw_k_hop_graph(embeddings,
                     target_word,
                     k,
                     alpha=alpha,
                     min_node_size=min_node_size,
                     max_node_size=max_node_size,
                     min_font_size=min_font_size,
                     max_font_size=max_font_size,
                     node_alpha=node_alpha,
                     edge_alpha=edge_alpha,
                     target_word_label_colour=target_word_label_colour,
                     community_colour_map=colour_map)

    # Show the plot, or output it, depending on the mode.
    plt.axis('off')
    if not output_path:
        plt.show()
    else:
        output_path = Path(output_path)
        output_format = (output_path.suffix or 'png').replace('.', '')
        output_path.parent.mkdir(parents=True, exist_ok=True)
        if output_format == 'tex' or output_format == 'latex':
            tikzplotlib.save(output_path)
        else:
            plt.savefig(output_path, dpi=export_dpi)
        logger.info('Exported figure to {}'.format(output_path))
def draw_k_hop_graph(embeddings: WordEmbeddings,
                     target_word: str,
                     k: int,
                     alpha: Optional[float] = 0.50,
                     min_node_size: Optional[float] = 20,
                     max_node_size: Optional[float] = 120,
                     min_font_size: Optional[float] = 6,
                     max_font_size: Optional[float] = 24,
                     node_alpha: Optional[float] = 1,
                     edge_alpha: Optional[float] = 0.05,
                     target_word_label_colour: Optional[str] = 'black',
                     community_colour_map: Optional[str] = 'plasma') -> None:
    """Draw the k-hop graph for the given word embeddings and interest word.
    This function DOES NOT show the matplotlib plot.

    Args:
        embeddings: The word embeddings to generate the graph for.
        target_word: The word of interest.
        k: The number of 'hops' between the word of interest and every node in the graph.
            The resultant graph has the property that the word of interest is reachable
            from any node in at most k edges.
        alpha: The similarity threshold. Words that have a cosine similarity of at least
            this threshold are kept, and the rest are discarded.
        min_node_size: The minimum size of a node, in pixels.
        max_node_size: The maximum size of a node, in pixels.
        min_font_size: The minimum size of a label, in pixels.
        max_font_size: The maximum size of a label, in pixels.
        node_alpha: The alpha/transparency to draw nodes with.
        edge_alpha: The alpha/transparency to draw edges with.
        target_word_label_colour: The colour of the target word label. Makes the target
            word stand out. Useless when there are many words.
        community_colour_map: The colour map to use when assigning colours to communities.
    """
    if alpha is None:
        _, similarity = embeddings.most_similar(target_word, k=1)[0]
        alpha = similarity - 0.05
        logger.info('No alpha threshold provided. Using alpha = {}'.format(alpha))

    graph = build_k_hop_graph(embeddings, target_word, k, alpha=alpha)

    logger.info('Computing best partition (Louvain community detection)')
    # compute the best partition
    partition = community_louvain.best_partition(graph)

    logger.info('Computing layout (ForceAtlas2)')
    forceatlas2 = ForceAtlas2(outboundAttractionDistribution=True,
                              edgeWeightInfluence=1.0,
                              jitterTolerance=1.0,
                              barnesHutOptimize=True,
                              barnesHutTheta=1.2,
                              scalingRatio=2.0,
                              strongGravityMode=False,
                              gravity=1.0,
                              verbose=False)
    positions = forceatlas2.forceatlas2_networkx_layout(graph)

    logger.info('Rendering graph with matplotlib')
    cmap = cm.get_cmap(community_colour_map, max(partition.values()) + 1)

    degrees = dict(graph.degree)
    max_degree = max(degrees.values())
    size_multipliers = {i: degrees[i] / max_degree for i in positions}

    # Generate node sizes
    node_size = [max(max_node_size * size_multipliers[i], min_node_size) for i in positions]

    # Draw the nodes
    nx.draw_networkx_nodes(graph, positions, partition.keys(),
                           node_size=node_size,
                           cmap=cmap,
                           node_color=list(partition.values()),
                           alpha=node_alpha)

    # Draw the edges with a bezier curve
    curves = curved_edges(graph, positions)
    # Remove nan values
    curves = np.nan_to_num(curves)

    # Assign a colour to each edge, based on the community of the source node.
    edge_color = [cmap(partition[a]) for a, _ in graph.edges]
    edge_lines = LineCollection(curves, color=edge_color, cmap=cmap, alpha=edge_alpha, linewidths=1)
    plt.gca().add_collection(edge_lines)

    # Draw node labels (words)
    for i, (x, y) in positions.items():
        # The size of the label is proportional to the degree of the node.
        fontsize = max(max_font_size * size_multipliers[i]**4, min_font_size)
        word = embeddings.words[i]
        colour = target_word_label_colour if word == target_word else 'black'
        plt.text(x, y, word, fontsize=fontsize, ha='center', va='center', color=colour)
                    help='Weights file path to save the pretrained model.',
                    type=str,
                    required=True)

#print(parser.format_help())
args = parser.parse_args()
word_embeddings_file_path = args.word2vec
pretrained_weights_file_path = args.save
epochs = args.epochs
df = read_SEMEVAL_data(args.data)

# initialize objects
print('Initializing objects ...')
print('Initializing word embeddings ...')
t1 = time.time()
word_embeddings = WordEmbeddings(word_embeddings_file_path)
t2 = time.time()
print('\tTook %f seconds' % (t2 - t1))
print('Initializing tokenizer ...')
tokenizer = Tokenizer()
print('Initializing vectorizer ...')
vectorizer = Vectorizer(word_embeddings, tokenizer)

#### training dataset ####
# vectorizing
ids, train_a_vectors, train_b_vectors, train_gold = vectorizer.vectorize_df(df)
train_max_a_length = len(max(train_a_vectors, key=len))
train_max_b_length = len(max(train_b_vectors, key=len))
print('maximum number of tokens per sentence A in training set is %d' % train_max_a_length)
print('maximum number of tokens per sentence B in training set is %d' %
def main():
    d = DataSet()
    folds, hold_out = kfold_split(d, n_folds=10)
    fold_stances, hold_out_stances, hold_out_stances_small = get_stances_for_folds2(d, folds, hold_out)
    embeddings = WordEmbeddings()
    print('Created data set and word embeddings')

    # create classifier
    model = Classifier()
    print('Set up model')

    # get word vector data
    x_articles = {}
    x_headlines = {}
    y_vals = {}
    lengths_a = {}
    lengths_h = {}
    x_global = {}
    y_global = {}

    for fold in fold_stances:
        x_headlines[fold], x_articles[fold], y_vals[fold], lengths_h[fold], lengths_a[fold] = get_articles_word_vectors(
            fold_stances[fold], d, embeddings)
        x_global[fold], y_global[fold] = generate_features(fold_stances[fold], d, str(fold))

    test_x_headlines, test_x_articles, test_y, test_h_lengths, test_a_lengths = get_articles_word_vectors(
        hold_out_stances, d, embeddings)
    test_x_global, test_y_global = generate_features(hold_out_stances, d, 'holdout')
    print("hold_out_stances: " + str(len(hold_out_stances)))
    print("test_x_global: " + str(len(test_x_global)))
    print("test_y_global: " + str(len(test_y_global)))

    test_x_headlines_small, test_x_articles_small, test_y_small, test_h_lengths_small, test_a_lengths_small = get_articles_word_vectors(
        hold_out_stances_small, d, embeddings)
    test_x_global_small, test_y_global_small = generate_features(hold_out_stances_small, d, 'holdout_small')
    print("test_x_global_small: " + str(len(test_x_global_small)))
    print("test_y_global_small: " + str(len(test_y_global_small)))
    print('Finished separating folds')

    # train LSTM (fold -> epoch -> batch)
    model.session.run(tf.global_variables_initializer())
    for fold in fold_stances:
        ids = list(range(len(folds)))
        del ids[fold]

        x_train_articles = np.vstack(tuple([x_articles[i] for i in ids]))
        x_train_headlines = np.vstack(tuple([x_headlines[i] for i in ids]))
        y_train = np.vstack(tuple([y_vals[i] for i in ids]))
        lengths_h_train = np.vstack(tuple([lengths_h[i] for i in ids]))
        lengths_a_train = np.vstack(tuple([lengths_a[i] for i in ids]))
        global_train = np.vstack(tuple([x_global[i] for i in ids]))
        # print('train articles shape = ' + str(x_train_articles.shape))
        # print('train headlines shape = ' + str(x_train_headlines.shape))
        # print('y train shape = ' + str(y_train.shape))

        x_valid_articles = x_articles[fold]
        x_valid_headlines = x_headlines[fold]
        y_valid = y_vals[fold]
        length_h_valid = lengths_h[fold]
        length_a_valid = lengths_a[fold]
        global_valid = x_global[fold]

        # SVM 1: distinguishing unrelated from related
        X_train = global_train
        y_train_round1 = []
        for item in y_train:
            # if unrelated
            if item[3] == 1:
                y_train_round1.append(0)
            else:
                y_train_round1.append(1)
        y_train_round1 = np.array(y_train_round1)

        X_valid = global_valid
        y_valid_round1 = []
        for item in y_valid:
            if item[3] == 1:
                y_valid_round1.append(0)
            else:
                y_valid_round1.append(1)
        y_valid_round1 = np.array(y_valid_round1)

        clf1 = svm.SVC()
        # NOTE: Train on valid because it's a smaller set
        # Want to use train set in LSTM
        clf1.fit(X_valid, y_valid_round1)
        print("X_valid, y_valid_round1 shape: " + str(X_valid.shape) + ", " + str(y_valid_round1.shape))
        round1_pred = clf1.predict(X_train)
        round1_score = 0
        for i in range(len(round1_pred)):
            if round1_pred[i] == y_train_round1[i]:
                round1_score += 1
        round1_score = 1.0 * round1_score / len(round1_pred)
        print('round 1 score: ' + str(round1_score))

        # REFORMAT FOR BILSTM
        # Reformat y_train so it only has related labels (now only 3 labels)
        y_train_round2 = []
        for index, label in enumerate(round1_pred):
            if label == 1:
                y_train_round2.append(y_train[index][:-1])
            # If unrelated, append as all 0's
            else:
                y_train_round2.append([0, 0, 0])

        # Reformat y_valid so it only has related labels
        y_valid_round2 = []
        for label in y_valid:
            if label[3] != 1:
                y_valid_round2.append(label[:-1])
            # If unrelated, append as all 0's
            else:
                y_valid_round2.append([0, 0, 0])

        # TRAINING
        fold_error = 0
        print('Training fold ' + str(fold))
        j = 0
        # NOTE: Change epoch back to 5!!
        for epoch in range(5):
            # Training batches
            article_batches_train, headline_batches_train, output_batches_train, length_h_batches_train, length_a_batches_train, global_batches_train = create_batches(
                x_train_articles, x_train_headlines, y_train_round2, lengths_h_train, lengths_a_train, global_train)

            for i in range(len(article_batches_train)):
                # Training error
                epoch_error = model.session.run(
                    [model.error, model.train_fn], {
                        model.inputs_articles: article_batches_train[i],
                        model.inputs_headlines: headline_batches_train[i],
                        model.outputs: output_batches_train[i],
                        model.h_lengths: length_h_batches_train[i],
                        model.a_lengths: length_a_batches_train[i],
                        model.global_feats: global_batches_train[i]
                    })[0]
                print('\tEpoch ' + str(j) + ' error = ' + str(epoch_error))
                fold_error += epoch_error
                j += 1

        print('Training error (fold) = ' + str(fold_error / j) + '\n')

        # Validation batches
        article_batches_valid, headline_batches_valid, output_batches_valid, length_h_batches_valid, length_a_batches_valid, global_batches_valid = create_batches(
            x_valid_articles, x_valid_headlines, y_valid_round2, length_h_valid, length_a_valid, global_valid)

        all_pred_y_stances = []
        for i in range(len(article_batches_valid)):
            # cross-validation error
            pred_y_stances = model.session.run(
                [model.pred_stance], {
                    model.inputs_articles: article_batches_valid[i],
                    model.inputs_headlines: headline_batches_valid[i],
                    model.outputs: output_batches_valid[i],
                    model.h_lengths: length_h_batches_valid[i],
                    model.a_lengths: length_a_batches_valid[i],
                    model.global_feats: global_batches_valid[i]
                })
            all_pred_y_stances = np.append(all_pred_y_stances, pred_y_stances)

        # Merge related and unrelated labels together for final prediction
        final_pred = []
        all_pred_count = 0
        for label in y_valid_round1:
            # If unrelated
            if label == 0:
                final_pred.append(3)
            else:
                final_pred.append(all_pred_y_stances[all_pred_count])
                all_pred_count += 1

        simple_y = np.array([array.tolist().index(1) for array in y_valid])
        '''
        f1_score = metrics.f1_score(simple_y, pred_y_stances, average='macro')
        print("F1 MEAN score: " + str(f1_score))
        f1_score_labels = metrics.f1_score(simple_y, pred_y_stances, labels=[0, 1, 2, 3], average=None)
        print("F1 LABEL scores: " + str(f1_score_labels))
        '''

        # Convert to string labels for FNC scoring metric
        label_map = {0: "agree", 1: "disagree", 2: "discuss", 3: "unrelated"}
        simple_y_str = [label_map[label] for label in simple_y]
        pred_y_stances_str = [label_map[label] for label in final_pred]
        report_score(simple_y_str, pred_y_stances_str)

    # assess performance on test set
    print('\n#### RUNNING ON HOLDOUT SET ####')

    # Reformat yvals for round1
    y_test_round1 = []
    for item in test_y:
        if item[3] == 1:
            y_test_round1.append(0)
        else:
            y_test_round1.append(1)
    y_test_round1 = np.array(y_test_round1)

    y_test_small_round1 = []
    for item in test_y_small:
        if item[3] == 1:
            y_test_small_round1.append(0)
        else:
            y_test_small_round1.append(1)
    y_test_small_round1 = np.array(y_test_small_round1)

    # ROUND 1 TESTING
    clf2 = svm.SVC()
    # NOTE: Train on valid because it's a smaller set
    # Want to use train set in LSTM
    clf2.fit(test_x_global_small, test_y_global_small)
    round1_pred = clf2.predict(test_x_global)
    round1_score = 0
    for i in range(len(round1_pred)):
        if round1_pred[i] == y_test_round1[i]:
            round1_score += 1
    round1_score = 1.0 * round1_score / len(round1_pred)
    print('round 1 score: ' + str(round1_score))

    # REFORMAT FOR BILSTM
    y_test_round2 = []
    for index, label in enumerate(round1_pred):
        if label == 1:
            y_test_round2.append(test_y_global[index][:-1])
        # If unrelated, append as all 0's
        else:
            y_test_round2.append([0, 0, 0])

    # Test batches
    article_batches_test, headline_batches_test, output_batches_test, length_h_batches_test, length_a_batches_test, global_batches_test = create_batches(
        test_x_articles, test_x_headlines, y_test_round2, test_h_lengths, test_a_lengths, test_x_global)

    all_pred_y_test = []
    for i in range(len(article_batches_test)):
        pred_y_stances = model.session.run(
            [model.pred_stance], {
                model.inputs_articles: article_batches_test[i],
                model.inputs_headlines: headline_batches_test[i],
                model.outputs: output_batches_test[i],
                model.h_lengths: length_h_batches_test[i],
                model.a_lengths: length_a_batches_test[i],
                model.global_feats: global_batches_test[i]
            })
        all_pred_y_test = np.append(all_pred_y_test, pred_y_stances)

    # Merge related and unrelated labels together for final prediction
    final_pred = []
    all_pred_count = 0
    for label in y_test_round1:
        # If unrelated
        if label == 0:
            final_pred.append(3)
        else:
            final_pred.append(all_pred_y_test[all_pred_count])
            all_pred_count += 1

    simple_y = np.array([array.tolist().index(1) for array in test_y])
    f1_score = metrics.f1_score(simple_y, final_pred, average='macro')
    print("F1 MEAN score: " + str(f1_score))
    f1_score_labels = metrics.f1_score(simple_y, final_pred, labels=[0, 1, 2, 3], average=None)
    print("F1 LABEL scores: " + str(f1_score_labels))

    # Convert to string labels for FNC scoring metric
    label_map = {0: "agree", 1: "disagree", 2: "discuss", 3: "unrelated"}
    simple_y_str = [label_map[label] for label in simple_y]
    pred_y_stances_str = [label_map[label] for label in final_pred]
    report_score(simple_y_str, pred_y_stances_str)