def out_of_core_x_normalisation(data_dir=HEP_TRAIN_PATH, batch_size=1024,
                                persist=False):
    """
    Get all the word2vec vectors in a 2D matrix and fit the scaler on it.
    This scaler can be used afterwards for normalizing feature matrices.
    """
    doc_generator = get_documents(data_dir=data_dir)
    word2vec_model = Word2Vec.load(WORD2VEC_MODELPATH)
    scaler = StandardScaler(copy=False)

    no_more_samples = False
    while not no_more_samples:
        batch = []
        for i in range(batch_size):
            try:
                batch.append(six.next(doc_generator))
            except StopIteration:
                no_more_samples = True
                break

        vectors = []
        for doc in batch:
            for word in doc.get_all_words():
                if word in word2vec_model:
                    vectors.append(word2vec_model[word])

        matrix = np.array(vectors)
        print("Matrix shape: {}".format(matrix.shape))

        scaler.partial_fit(matrix)

    if persist:
        save_to_disk(SCALER_PATH, scaler)

    return scaler


def fit_scaler(data_dir, word2vec_model=WORD2VEC_MODELPATH, batch_size=1024,
               persist_to_path=None):
    """
    Get all the word2vec vectors in a 2D matrix and fit the scaler on it.
    This scaler can be used afterwards for normalizing feature matrices.
    """
    if isinstance(word2vec_model, six.string_types):
        word2vec_model = Word2Vec.load(word2vec_model)

    doc_generator = get_documents(data_dir)
    scaler = StandardScaler(copy=False)

    no_more_samples = False
    while not no_more_samples:
        batch = []
        for i in range(batch_size):
            try:
                batch.append(six.next(doc_generator))
            except StopIteration:
                no_more_samples = True
                break

        vectors = []
        for doc in batch:
            for word in doc.get_all_words():
                if word in word2vec_model:
                    vectors.append(word2vec_model[word])

        matrix = np.array(vectors)
        print("Fitted to {} vectors".format(matrix.shape[0]))

        scaler.partial_fit(matrix)

    if persist_to_path:
        save_to_disk(persist_to_path, scaler)

    return scaler


def fit_scaler(data_dir, word2vec_model, batch_size=1024,
               persist_to_path=None):
    """
    Get all the word2vec vectors in a 2D matrix and fit the scaler on it.
    This scaler can be used afterwards for normalizing feature matrices.
    """
    if isinstance(word2vec_model, six.string_types):
        word2vec_model = Word2Vec.load(word2vec_model)

    doc_generator = get_documents(data_dir)
    scaler = StandardScaler(copy=False)

    no_more_samples = False
    while not no_more_samples:
        batch = []
        for i in range(batch_size):
            try:
                batch.append(six.next(doc_generator))
            except StopIteration:
                no_more_samples = True
                break

        vectors = []
        for doc in batch:
            for word in doc.get_all_words():
                if word in word2vec_model:
                    vectors.append(word2vec_model[word])

        matrix = np.array(vectors)
        print("Fitted to {} vectors".format(matrix.shape[0]))

        scaler.partial_fit(matrix)

    if persist_to_path:
        save_to_disk(persist_to_path, scaler)

    return scaler


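# Usage sketch (not part of the original module; the scaler path below is a
# placeholder): fit the scaler once on the training corpus, persist it, and
# reuse it later to normalise any word2vec feature matrix.
def example_fit_scaler(train_dir=HEP_TRAIN_PATH,
                       scaler_path='models/scaler.pickle'):
    # Load the embeddings and stream the documents batch by batch
    word2vec = Word2Vec.load(WORD2VEC_MODELPATH)
    scaler = fit_scaler(train_dir, word2vec, batch_size=1024,
                        persist_to_path=scaler_path)
    # scaler.transform(matrix) can now be applied to new feature matrices
    return scaler

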
def calculate_recall_for_kw_candidates(data_dir, recreate_ontology=False,
                                       verbose=False):
    """
    Generate keyword candidates for files in a given directory and compute
    their recall with respect to the ground-truth answers
    :param data_dir: directory with .txt and .key files
    :param recreate_ontology: boolean flag for recreating the ontology
    :param verbose: whether to print computation times

    :return average_recall: float
    """
    average_recall = 0
    total_kw_number = 0

    ontology = get_ontology(recreate=recreate_ontology)
    docs = get_documents(data_dir)
    considered_keywords = set(get_keywords())
    total_docs = 0

    start_time = time.clock()
    for doc in docs:
        kw_candidates = {kw.get_canonical_form()
                         for kw in generate_keyword_candidates(doc, ontology)}

        answers = get_answers_for_doc(doc.filename, data_dir,
                                      filtered_by=considered_keywords)

        recall = 1 if not answers \
            else len(kw_candidates & answers) / float(len(answers))

        if verbose:
            print()
            print("Paper: " + doc.filename)
            print("Candidates: " + str(len(kw_candidates)))
            print("Recall: " + str(recall * 100) + "%")

        average_recall += recall
        total_kw_number += len(kw_candidates)
        total_docs += 1

    average_recall /= total_docs

    if verbose:
        print()
        print("Total # of keywords: " + str(total_kw_number))
        print("Time elapsed: " + str(time.clock() - start_time))

    return average_recall


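# Usage sketch (directory path is a placeholder): check how many of the
# ground-truth keywords survive candidate generation, i.e. the recall ceiling
# that any downstream ranking model can reach on this corpus.
def example_candidate_recall(data_dir=HEP_TRAIN_PATH):
    recall = calculate_recall_for_kw_candidates(data_dir, verbose=False)
    print("Average candidate recall: {0:.2f}%".format(recall * 100))
    return recall

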
def test(
    testset_path,
    ontology=ONTOLOGY_PATH,
    model=MODEL_PATH,
    recreate_ontology=False,
    verbose=True,
):
    """
    Test the trained model on a set under a given path.
    :param testset_path: path to the directory with the test set
    :param ontology: path to the ontology
    :param model: path where the model is pickled
    :param recreate_ontology: boolean flag whether to recreate the ontology
    :param verbose: whether to print computation times

    :return: dictionary with the mean value of each metric over the test set
    """
    if isinstance(model, six.string_types):
        model = load_from_disk(model)

    if isinstance(ontology, six.string_types):
        ontology = get_ontology(path=ontology, recreate=recreate_ontology)

    keywords = get_keywords()
    keyword_indices = {kw: i for i, kw in enumerate(keywords)}

    all_metrics = calculate_basic_metrics([range(5)]).keys()
    metrics_agg = {m: [] for m in all_metrics}

    for doc in get_documents(testset_path, as_generator=True):
        x, answers, kw_vector = build_test_matrices(
            [doc],
            model,
            testset_path,
            ontology,
        )

        y_true = build_y_true(answers, keyword_indices, doc.doc_id)

        # Predict
        ranking = model.scale_and_predict(x.as_matrix())

        y_pred = y_true[0][ranking[::-1]]

        metrics = calculate_basic_metrics([y_pred])

        for k, v in metrics.items():
            metrics_agg[k].append(v)

    return {k: np.mean(v) for k, v in metrics_agg.items()}


def train(
    trainset_dir,
    word2vec_path=WORD2VEC_MODELPATH,
    ontology_path=ONTOLOGY_PATH,
    model_path=MODEL_PATH,
    recreate_ontology=False,
    verbose=True,
):
    """
    Train the model on a given dataset and save it to disk
    :param trainset_dir: path to the directory with the training set
    :param word2vec_path: path to the gensim word2vec model
    :param ontology_path: path to the ontology file
    :param model_path: path where the model should be pickled
    :param recreate_ontology: boolean flag whether to recreate the ontology
    :param verbose: whether to print computation times

    :return: None
    """
    ontology = get_ontology(path=ontology_path, recreate=recreate_ontology)
    docs = get_documents(trainset_dir)
    global_index = build_global_frequency_index(trainset_dir, verbose=verbose)
    word2vec_model = Word2Vec.load(word2vec_path)
    model = LearningModel(global_index, word2vec_model)

    tick = time.clock()

    x, y = build_train_matrices(docs, model, trainset_dir, ontology)

    if verbose:
        print("Matrices built in: {0:.2f}s".format(time.clock() - tick))
    t1 = time.clock()

    if verbose:
        print("X size: {}".format(x.shape))

    # Normalize features
    x = model.maybe_fit_and_scale(x)

    # Train the model
    model.fit_classifier(x, y)

    if verbose:
        print("Fitting the model: {0:.2f}s".format(time.clock() - t1))

    # Pickle the model
    save_to_disk(model_path, model, overwrite=True)


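# Usage sketch (directory path is a placeholder): run a single full-batch
# training pass and pickle the resulting LearningModel under MODEL_PATH so
# that the evaluation functions below can load it.
def example_train(trainset_dir=HEP_TRAIN_PATH):
    train(
        trainset_dir,
        word2vec_path=WORD2VEC_MODELPATH,
        ontology_path=ONTOLOGY_PATH,
        model_path=MODEL_PATH,
        verbose=True,
    )

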
def build_global_frequency_index(trainset_dir, verbose=True):
    """
    Build the GlobalFrequencyIndex object from the files in a given directory
    :param trainset_dir: path to the directory with files for training
    :param verbose: whether to print computation times

    :return: GlobalFrequencyIndex object
    """
    tick = time.clock()

    global_index = GlobalFrequencyIndex()
    for doc in get_documents(trainset_dir):
        global_index.add_document(doc)

    if verbose:
        print("Global index built in: {0:.2f}s".format(time.clock() - tick))

    return global_index


def test(
    testset_path=HEP_TEST_PATH,
    ontology=HEP_ONTOLOGY,
    model=MODEL_PATH,
    recreate_ontology=False,
    verbose=True,
):
    """
    Test the trained model on a set under a given path.
    :param testset_path: path to the directory with the test set
    :param ontology: path to the ontology
    :param model: path where the model is pickled
    :param recreate_ontology: boolean flag whether to recreate the ontology
    :param verbose: whether to print computation times

    :return: the metrics computed by evaluate_results
    """
    if isinstance(model, six.string_types):
        model = load_from_disk(model)

    if isinstance(ontology, six.string_types):
        ontology = get_ontology(path=ontology, recreate=recreate_ontology)

    tick = time.clock()

    x, answers, kw_vector = build_test_matrices(
        get_documents(testset_path),
        model,
        testset_path,
        ontology,
    )

    if verbose:
        print("Matrices built in: {0:.2f}s".format(time.clock() - tick))

    # Predict
    y_pred = model.scale_and_predict_confidence(x)

    # Evaluate the results
    return evaluate_results(
        y_pred,
        kw_vector,
        answers,
    )


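# Usage sketch: evaluate the pickled model on the held-out set. The exact
# structure of the returned metrics depends on evaluate_results(), so this
# example just prints whatever comes back.
def example_test(testset_dir=HEP_TEST_PATH):
    metrics = test(testset_path=testset_dir, model=MODEL_PATH, verbose=True)
    print(metrics)
    return metrics

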
def batch_train(
    trainset_dir,
    testset_dir,
    nb_epochs=NB_EPOCHS,
    batch_size=BATCH_SIZE,
    ontology_path=ONTOLOGY_PATH,
    model_path=MODEL_PATH,
    recreate_ontology=False,
    word2vec_path=WORD2VEC_MODELPATH,
    verbose=True,
):
    """
    Train the model in mini-batches on a given dataset, saving the best
    snapshot to disk
    :param trainset_dir: path to the directory with the training set
    :param testset_dir: path to the directory with the test set
    :param nb_epochs: number of passes over the training set
    :param batch_size: the size of a single batch
    :param ontology_path: path to the ontology file
    :param model_path: path to the pickled LearningModel object
    :param word2vec_path: path to the gensim word2vec model
    :param recreate_ontology: boolean flag whether to recreate the ontology
    :param verbose: whether to print computation times

    :return: None
    """
    ontology = get_ontology(path=ontology_path, recreate=recreate_ontology,
                            verbose=False)
    global_index = build_global_frequency_index(trainset_dir, verbose=False)
    word2vec_model = Word2Vec.load(word2vec_path)
    model = LearningModel(global_index, word2vec_model)
    previous_best = -1

    for epoch in range(nb_epochs):
        doc_generator = get_documents(
            trainset_dir,
            as_generator=True,
            shuffle=True,
        )
        epoch_start = time.clock()
        if verbose:
            print("Epoch {}".format(epoch + 1), end=' ')

        no_more_samples = False
        batch_number = 0
        while not no_more_samples:
            batch_number += 1

            batch = []
            for i in range(batch_size):
                try:
                    batch.append(six.next(doc_generator))
                except StopIteration:
                    no_more_samples = True
                    break

            if not batch:
                break

            x, y = build_train_matrices(batch, model, trainset_dir, ontology)

            # Normalize features
            x = model.maybe_fit_and_scale(x)

            # Train the model
            model.partial_fit_classifier(x, y)

            if verbose:
                sys.stdout.write('.')
                sys.stdout.flush()

        if verbose:
            print(" {0:.2f}s".format(time.clock() - epoch_start))

        metrics = test(
            testset_dir,
            model=model,
            ontology=ontology,
            verbose=False,
        )

        for k, v in metrics.items():
            print("{0}: {1}".format(k, v))

        if metrics['map'] > previous_best:
            previous_best = metrics['map']
            save_to_disk(model_path, model, overwrite=True)


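# Usage sketch (directory paths are placeholders): train incrementally in
# mini-batches, which keeps memory bounded on large corpora; the best
# checkpoint (by MAP on the test set) ends up under MODEL_PATH.
def example_batch_train(trainset_dir=HEP_TRAIN_PATH,
                        testset_dir=HEP_TEST_PATH):
    batch_train(
        trainset_dir,
        testset_dir,
        nb_epochs=NB_EPOCHS,
        batch_size=BATCH_SIZE,
        model_path=MODEL_PATH,
        verbose=True,
    )

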
def batch_test(
    testset_path=HEP_TEST_PATH,
    batch_size=BATCH_SIZE,
    ontology=HEP_ONTOLOGY,
    model=MODEL_PATH,
    recreate_ontology=False,
    verbose=True,
):
    """
    Test the trained model on a set under a given path, processing it in batches.
    :param testset_path: path to the directory with the test set
    :param batch_size: size of the testing batch
    :param ontology: path to the ontology
    :param model: path where the model is pickled
    :param recreate_ontology: boolean flag whether to recreate the ontology
    :param verbose: whether to print computation times

    :return: dictionary with the mean value of each metric over all batches
    """
    if isinstance(model, six.string_types):
        model = load_from_disk(model)

    if isinstance(ontology, six.string_types):
        ontology = get_ontology(path=ontology, recreate=recreate_ontology)

    doc_generator = get_documents(testset_path, as_generator=True)
    start_time = time.clock()

    all_metrics = ['map', 'mrr', 'ndcg', 'r_prec', 'p_at_3', 'p_at_5']
    metrics_agg = {m: [] for m in all_metrics}

    if verbose:
        print("Batches:", end=' ')

    no_more_samples = False
    batch_number = 0
    while not no_more_samples:
        batch_number += 1

        batch = []
        for i in range(batch_size):
            try:
                batch.append(six.next(doc_generator))
            except StopIteration:
                no_more_samples = True
                break

        if not batch:
            break

        X, answers, kw_vector = build_test_matrices(
            batch,
            model,
            testset_path,
            ontology,
        )

        # Predict
        y_pred = model.scale_and_predict_confidence(X)

        # Evaluate the results
        metrics = evaluate_results(
            y_pred,
            kw_vector,
            answers,
        )

        for k, v in metrics.items():
            metrics_agg[k].append(v)

        if verbose:
            sys.stdout.write('.')
            sys.stdout.flush()

    if verbose:
        print()
        print("Testing finished in: {0:.2f}s".format(time.clock() - start_time))

    return {k: np.mean(v) for k, v in metrics_agg.items()}


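# Usage sketch: batch evaluation of the pickled model; the returned dictionary
# holds the mean of each ranking metric over all test batches.
def example_batch_test(testset_dir=HEP_TEST_PATH):
    averaged = batch_test(testset_path=testset_dir, batch_size=BATCH_SIZE)
    for name, value in averaged.items():
        print("{0}: {1}".format(name, value))
    return averaged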