def fit(self, X, y):
    """Fit topic definitions and topic labels to the MSC model.

    Parameters
    ----------
    X : list of str or wiki pages
        The training input samples (topic definitions).
    y : array of int
        The target values.

    Returns
    -------
    self : object
        The fitted estimator.
    """
    # Prepare the raw documents once, so the same corpus can be
    # reused for inference later on.
    corpus = list(
        doc_utils.prepare_corpus(
            X,
            train_data=True,
            preprocess=self.preprocess,
            dataset_type=self.dataset_type,
        )
    )

    self.model.build_vocab(corpus)
    self.model.train(
        corpus,
        total_examples=self.model.corpus_count,
        epochs=self.model.epochs,
    )

    self.classes_ = unique_labels(y)
    # Store the *prepared* corpus (not the raw input), matching the
    # original behavior.
    self.X_ = corpus
    self.y_ = y
    return self
def predict(self, X):
    """Predict topic labels with the max-similarity classifier.

    Parameters
    ----------
    X : list
        The input dataset: Wiki or Arxiv dataset to classify.

    Returns
    -------
    y : ndarray, shape (n_samples,)
        Predicted labels, one per input document.
    """
    # Refuse to predict before fit() has been called.
    check_is_fitted(self, ['X_', 'y_'])

    docs = list(
        doc_utils.prepare_corpus(
            X,
            train_data=False,
            preprocess=self.preprocess,
            dataset_type=self.dataset_type,
        )
    )

    predictions = []
    for doc in docs:
        vector = self.model.infer_vector(doc)
        # Rank every trained document vector by cosine similarity.
        ranked = self.model.docvecs.most_similar(
            [vector], topn=len(self.model.docvecs))
        # Entry 0 is the closest match — its tag is the predicted label.
        predictions.append(ranked[0][0])

    return np.array(predictions)
def score(self, X, y, eval="weighted"):
    """Score the predictions for articles ``X`` against true labels ``y``.

    Depending on the ``eval`` parameter the score is:
    - top1: top-1 accuracy
    - top2: top-2 accuracy
    - weighted: weighted accuracy (a correct second guess counts 0.5)
    """
    check_is_fitted(self, ['X_', 'y_'])

    docs = list(
        doc_utils.prepare_corpus(
            X,
            train_data=False,
            preprocess=self.preprocess,
            dataset_type=self.dataset_type,
        )
    )

    outputs = []
    per_doc_scores = []
    for idx, doc in enumerate(docs):
        vector = self.model.infer_vector(doc)
        ranked = self.model.docvecs.most_similar(
            [vector], topn=len(self.model.docvecs))
        best = ranked[0][0]          # index 0 === most similar
        runner_up = ranked[1][0]
        outputs.append(best)

        if best == y[idx]:
            per_doc_scores.append(1)
        elif runner_up == y[idx] and "weighted" in eval:
            # Second guess correct: half credit under weighted scoring.
            per_doc_scores.append(0.5)
        elif runner_up == y[idx] and "top2" in eval:
            # Second guess correct: full credit under top-2 scoring.
            per_doc_scores.append(1)
        else:
            per_doc_scores.append(0)

    return np.mean(np.array(per_doc_scores))
def wiki_pseudo_label(self, x_train, dataset, result="extended", top_n=2,
                      debug=False):
    """Pseudo-label wiki articles and fold the best match per topic
    back into the training samples.

    Parameters
    ----------
    x_train : list of str
        One training sample (topic definition) per topic.
    dataset : wiki dataset
        Iterable of topics; ``topic[0]`` holds that topic's articles.
    result : {"extended", "bestpapers"}
        "extended": prepend the best-matching article to each training
        sample. "bestpapers": use the best article alone.
    top_n : int
        Number of most-similar topics each article votes for.
    debug : bool
        Print matching details.

    Returns
    -------
    list of str
        The new training samples, or -1 if ``result`` is invalid.
    """
    # Validate the mode up front (exact match — the old substring test
    # `result in "extended"` wrongly accepted values like "" or "ext")
    # so we do not run the expensive inference loop only to fail at the
    # very end.
    if result not in ("extended", "bestpapers"):
        print(
            "[ERROR] Result argument can be only 'extended' or 'bestpapers'."
        )
        return -1

    input_articles = list(
        doc_utils.prepare_corpus(dataset,
                                 train_data=False,
                                 preprocess=self.preprocess,
                                 dataset_type=self.dataset_type))

    # Derive the topic count from x_train instead of hard-coding 9.
    n_topics = len(x_train)
    doc_topics_sims = [[] for _ in range(n_topics)]

    # Flatten the dataset so article ids map back to raw article text.
    article_list = [
        article for topic in dataset for article in topic[0]
    ]

    for doc_id, doc in enumerate(input_articles):
        inferred_vector = self.model.infer_vector(doc)
        sims = self.model.docvecs.most_similar(
            [inferred_vector], topn=len(self.model.docvecs))
        # Each article votes for its top_n most-similar topics.
        for topic_id, topic_sim in sims[:top_n]:
            doc_topics_sims[topic_id].append((topic_sim, doc_id))

    best_articles_per_topic = [-1] * n_topics
    for topic_id, candidates in enumerate(doc_topics_sims):
        if not candidates:
            # Previously this crashed with a bare "max() arg is an
            # empty sequence"; fail with a diagnosable message instead.
            raise ValueError(
                "No candidate article matched topic {}".format(topic_id))
        # Keep the article with the highest similarity score.
        article_id = max(candidates, key=lambda cand: cand[0])[1]
        if debug:
            print("For topic {}, best article id: {}".format(
                topic_id, article_id))
        # TODO: add topN papers and not just best one
        best_articles_per_topic[topic_id] = article_id

    new_x_train = [""] * n_topics
    for topic_id, train_sample in enumerate(x_train):
        best_article_id = best_articles_per_topic[topic_id]
        if debug:
            print(
                "For topic id {}, Adding article id {} to dataset".format(
                    topic_id, best_article_id))
            print(article_list[best_article_id])
        if result == "extended":
            # Extended train data: best article followed by the
            # original topic definition.
            new_x_train[topic_id] = " . ".join(
                [article_list[best_article_id], train_sample])
        else:  # "bestpapers": the best article replaces the sample
            new_x_train[topic_id] = article_list[best_article_id]

    return new_x_train
def arxiv_pseudo_label(self, x_train, dataset, paperslist,
                       result="extended", top_n=1, debug=False):
    """Pseudo-label arXiv papers and fold the best match per topic
    back into the training samples.

    Parameters
    ----------
    x_train : list of str
        One training sample (topic definition) per topic.
    dataset : arxiv dataset
        Documents to be matched against the trained topic vectors.
    paperslist : list of dict
        Parsed papers from Arxivparser; each entry has "title" and
        "abstract" keys. Required.
    result : {"extended", "bestpapers"}
        "extended": append the best paper's title and abstract to each
        training sample. "bestpapers": use the best paper alone.
    top_n : int
        Number of most-similar topics each paper votes for.
    debug : bool
        Print matching details.

    Returns
    -------
    list of str
        The new training samples, or -1 on invalid arguments.
    """
    if paperslist is None:
        print("ERROR: paperlist from Arxivparser needed")
        return -1

    # Validate the mode up front (exact match — the old substring test
    # `result in "extended"` wrongly accepted values like "" or "ext")
    # so we do not run the expensive inference loop only to fail late.
    if result not in ("extended", "bestpapers"):
        print(
            "[ERROR] Result argument can be only 'extended' or 'bestpapers'."
        )
        return -1

    input_articles = list(
        doc_utils.prepare_corpus(dataset,
                                 train_data=False,
                                 preprocess=self.preprocess,
                                 dataset_type=self.dataset_type))

    # Derive the topic count from x_train instead of hard-coding 8.
    n_topics = len(x_train)
    doc_topics_sims = [[] for _ in range(n_topics)]

    for doc_id, doc in enumerate(input_articles):
        inferred_vector = self.model.infer_vector(doc)
        sims = self.model.docvecs.most_similar(
            [inferred_vector], topn=len(self.model.docvecs))
        # Each paper votes for its top_n most-similar topics.
        for topic_id, topic_sim in sims[:top_n]:
            doc_topics_sims[topic_id].append((topic_sim, doc_id))

    best_papers_per_topic = [-1] * n_topics
    # Papers appear grouped by topic in paperslist, so integer division
    # recovers a paper's true topic label for the debug report
    # (assumption inherited from the original code — verify against the
    # Arxivparser output ordering).
    n_papers_per_topic = len(paperslist) // len(
        doc_utils.ARXIV_WIKI_TOPICS)

    for topic_id, candidates in enumerate(doc_topics_sims):
        if not candidates:
            # Previously this crashed with a bare "max() arg is an
            # empty sequence"; fail with a diagnosable message instead.
            raise ValueError(
                "No candidate paper matched topic {}".format(topic_id))
        # Keep the paper with the highest similarity score.
        paper_id = max(candidates, key=lambda cand: cand[0])[1]
        # TODO: add topN papers and not just best one
        best_papers_per_topic[topic_id] = paper_id
        if debug:
            true_label = paper_id // n_papers_per_topic
            print("Topic {} ({}) best matching paper: id #{}".format(
                topic_id, doc_utils.ARXIV_WIKI_TOPICS[topic_id], paper_id))
            print("\t--->True label:[", str(true_label), "](",
                  doc_utils.ARXIV_WIKI_TOPICS[true_label],
                  ") \t\tPaper title:", paperslist[paper_id]['title'])

    new_x_train = [""] * n_topics
    for topic_id, train_sample in enumerate(x_train):
        best_paper_id = best_papers_per_topic[topic_id]
        if result == "extended":
            # Extended train data: original definition followed by the
            # best paper's title and abstract.
            new_x_train[topic_id] = " . ".join([
                train_sample,
                paperslist[best_paper_id]["title"],
                paperslist[best_paper_id]["abstract"],
            ])
        else:  # "bestpapers": the best paper replaces the sample
            new_x_train[topic_id] = (
                paperslist[best_paper_id]["title"] + " : " +
                paperslist[best_paper_id]["abstract"])

    return new_x_train