def most_similar(self, search_text, vectorizer, top_n=5):
    """Return the top n most similar professors for a given search text."""
    x = vectorizer.transform(clean_text(search_text, stopwords_))
    similarities = cosine_similarity(x, self.X)
    pairs = enumerate(similarities[0])
    # Sort by similarity in descending order so the closest documents come first
    most_similar = sorted(pairs, key=lambda item: item[1], reverse=True)[:top_n]
    return np.array(most_similar)
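# A minimal usage sketch for the cluster-model most_similar() above. `model`,
# `vectorizer`, and `final_df` are assumed to be the fitted objects used
# elsewhere in this module; the query string is made up for illustration.
def _demo_cluster_most_similar():
    pairs = model.most_similar(
        'deep learning for medical image segmentation', vectorizer, top_n=5)
    # Each row is (document_id, cosine_similarity), ordered most to least similar;
    # document_id indexes rows of final_df.
    for doc_id, score in pairs:
        print(int(doc_id), final_df.loc[int(doc_id), 'faculty_name'], score)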
def search(search_text, type='text_input', choose_model='topic_model'):
    '''
    Parameters
    ----------
    search_text : str
        The text to be used for searching.
    type : str
        'text_input' for an abstract/whole body of a past paper,
        'research_area_input' for a research area.
    choose_model : str
        'topic_model' for topic modelling, 'cluster_model' for KMeans
        clustering.
    '''
    # TODO: searching by research_area ('research_area_input') is not
    # implemented yet.
    # If searching by text_input
    if type == 'text_input':
        if choose_model == 'cluster_model':
            # Predicted cluster label for the given text
            y_test = model.predict(vectorizer.transform(clean_text(search_text)))
            # results_df = final_df[final_df['predicted_cluster_num'] == y_test[0]]
            # (document_id, similarity) pairs
            similarities = model.most_similar(search_text, vectorizer, top_n=5)
            # Sort by document_id so the rows line up with the index-sorted DataFrame
            similarities = similarities[similarities[:, 0].argsort()]
            document_ids = list(map(int, similarities[:, 0]))
            results_df = final_df[final_df.index.isin(document_ids)].sort_index()
            results_df['similarity'] = similarities[:, 1]
            return results_df.sort_values(by='similarity', ascending=False)[[
                'faculty_name', 'research_areas', 'predicted_research_areas'
            ]]
        else:
            # (document_id, similarity) pairs; most_similar cleans the text
            # itself, so the raw search_text is passed in
            similarities = topic_model.most_similar(
                search_text, topic_vectorizer, top_n=5)
            # Sort by document_id so the rows line up with the index-sorted DataFrame
            similarities = similarities[similarities[:, 0].argsort()]
            document_ids = list(map(int, similarities[:, 0]))
            results_df = final_topic_df[final_topic_df.index.isin(
                document_ids)].sort_index()
            results_df['similarity'] = similarities[:, 1]
            cols = [
                'faculty_name', 'faculty_title', 'research_areas',
                'predicted_research_areas', 'office', 'email', 'phone', 'page',
                'google_scholar_link'
            ]
            search_df = results_df.sort_values(
                by='similarity', ascending=False)[cols][:5]
            return search_df
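# Hypothetical usage sketch for search(). The query strings are illustrative
# and the module-level models/DataFrames used above are assumed to be fitted.
def _demo_search():
    # Topic-model search returns contact details alongside the match
    results = search('reinforcement learning for robotics',
                     type='text_input', choose_model='topic_model')
    print(results[['faculty_name', 'research_areas']])

    # KMeans-based search returns a smaller set of columns
    cluster_results = search('reinforcement learning for robotics',
                             type='text_input', choose_model='cluster_model')
    print(cluster_results)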
def most_similar(self, search_text, vectorizer, top_n=5):
    """Return the most similar professors for a given search text (cleaned and tokenized)."""
    x = self._model.transform(
        vectorizer.transform(clean_text(search_text, stopwords_)))[0]
    similarities = cosine_similarity(x.reshape(1, -1), self.transformed_X)
    pairs = enumerate(similarities[0])
    # Sort by similarity in descending order and keep the top n documents
    most_similar = sorted(pairs, key=lambda item: item[1], reverse=True)[:top_n]
    return np.array(most_similar)
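# Design note on the most_similar() variant above: unlike the tf-idf/cluster
# version, it projects the query through self._model and compares against
# self.transformed_X, so documents are ranked in the reduced topic space
# (shared topics) rather than by shared raw vocabulary.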
def transform(self, data):
    """Transform the training data."""
    # For gensim we need to tokenize the data and filter out stopwords
    self.tokens = [clean_text(doc, stopwords_) for doc in data]

    # Bigrams
    if self.bigrams:
        # A higher threshold yields fewer phrases
        bigram = models.Phrases(self.tokens, min_count=5, threshold=100)
        bigram_mod = models.phrases.Phraser(bigram)
        self.tokens = make_bigrams(self.tokens, bigram_mod)

    # Trigrams (a bigram model is needed first to build the trigram model)
    if self.trigrams:
        bigram = models.Phrases(self.tokens, min_count=5, threshold=100)
        bigram_mod = models.phrases.Phraser(bigram)
        trigram = models.Phrases(bigram[self.tokens], threshold=100)
        trigram_mod = models.phrases.Phraser(trigram)
        self.tokens = make_trigrams(self.tokens, bigram_mod, trigram_mod)

    # Lemmatization
    if self.lemmatization:
        # Initialize the spacy 'en_core_web_sm' model, keeping only the
        # tagger component (for efficiency)
        spacy_nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
        # Lemmatize, keeping only nouns, adjectives, verbs and adverbs
        self.tokens = do_lemmatization(
            spacy_nlp=spacy_nlp,
            texts=self.tokens,
            allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
        # Remove stopwords again after lemmatization
        self.tokens = [[token for token in doc if token not in stopwords_]
                       for doc in self.tokens]

    # Build a Dictionary - associates each word with a numeric id
    self.dictionary = corpora.Dictionary(self.tokens)
    # Transform the collection of texts to a numerical form [(word_id, count), ...]
    self.corpus = [self.dictionary.doc2bow(text) for text in self.tokens]

    # Optionally re-weight the corpus with tf-idf
    if self.tf_idf:
        self._tfidf_model = models.TfidfModel(self.corpus,
                                              id2word=self.dictionary)
        self.corpus = self._tfidf_model[self.corpus]
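# Hypothetical sketch of how transform() is driven. `GensimTopicModel` is a
# stand-in name for the class these methods belong to, and the constructor
# flags and documents are illustrative.
def _demo_transform():
    docs = ['Graph neural networks for molecular property prediction.',
            'Bayesian optimization of neural network hyperparameters.']
    tm = GensimTopicModel(bigrams=True, trigrams=False,
                          lemmatization=True, tf_idf=True)
    tm.transform(docs)
    # After transform(), tm.dictionary maps words to ids and tm.corpus holds
    # the (optionally tf-idf weighted) bag-of-words vectors, ready to feed a
    # gensim model such as models.LdaModel.
    print(tm.dictionary)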
def transform_new(self, search_text):
    """Return the transformed representation of new, unseen text."""
    bow = self.dictionary.doc2bow(clean_text(search_text, stopwords_))
    if self.tf_idf:
        # Apply the same tf-idf weighting that was used on the training corpus
        return self._model[self._tfidf_model[bow]]
    return self._model[bow]
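# Minimal usage sketch for transform_new(); assumes `topic_model` is a fitted
# instance of the class above, and the query string is illustrative.
def _demo_transform_new():
    topic_distribution = topic_model.transform_new(
        'natural language processing for legal documents')
    # For a gensim LDA-style self._model this is a sparse
    # [(topic_id, weight), ...] list for the new document
    print(topic_distribution)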