Example #1
    def most_similar(self, search_text, vectorizer, top_n=5):
        """Return the top n most similar professors for a given search text."""
        x = vectorizer.transform(clean_text(search_text, stopwords_))
        similarities = cosine_similarity(x, self.X)
        pairs = enumerate(similarities[0])
        # Sort by descending similarity so the best matches come first.
        most_similar = sorted(pairs, key=lambda item: item[1],
                              reverse=True)[:top_n]
        return np.array(most_similar)
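The methods on this page rely on a few module-level imports and a clean_text helper that are not shown here. Below is a minimal sketch of the assumed imports and a hypothetical clean_text; the exact tokenization rules are a guess, but returning a token list is consistent with the doc2bow call further down.

import re
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

stopwords_ = set()  # placeholder; the original presumably loads a real stopword list

def clean_text(text, stopwords=()):
    """Hypothetical stand-in: lowercase, keep alphabetic tokens, drop stopwords."""
    tokens = re.findall(r'[a-z]+', text.lower())
    return [token for token in tokens if token not in stopwords]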
def search(search_text, type='text_input', choose_model='topic_model'):
    '''
    Parameters
    ----------
    search_text: the text to search with.
    type: 'text_input' to search with an abstract or the whole body of a past
        paper, 'research_area_input' to search by research area.
    choose_model: 'topic_model' for topic modeling, 'cluster_model' for KMeans
        clustering.
    '''
    # Searching by text_input (the research_area branch is not shown in this snippet)
    if type == 'text_input':
        if choose_model == 'cluster_model':
            y_test = model.predict(
                vectorizer.transform(clean_text(
                    search_text)))  # predicted cluster label for given text
            # results_df = final_df[final_df['predicted_cluster_num'] == y_test[0]]
            similarities = model.most_similar(
                search_text, vectorizer, top_n=5)  # document_id, similarity
            similarities = similarities[
                similarities[:, 0].argsort()]  # sorting by document_id
            document_ids = list(map(int, similarities[:, 0]))
            results_df = final_df[final_df.index.isin(
                document_ids)].sort_index()
            results_df['similarity'] = similarities[:, 1]
            # List the most similar professors first.
            return results_df.sort_values(by='similarity', ascending=False)[[
                'faculty_name', 'research_areas', 'predicted_research_areas'
            ]]
        else:
            similarities = topic_model.most_similar(
                clean_text(search_text), topic_vectorizer,
                top_n=5)  # document_id, similarity
            similarities = similarities[
                similarities[:, 0].argsort()]  # sorting by document_id
            document_ids = list(map(int, similarities[:, 0]))
            results_df = final_topic_df[final_topic_df.index.isin(
                document_ids)].sort_index()
            results_df['similarity'] = similarities[:, 1]
            cols = [
                'faculty_name', 'faculty_title', 'research_areas',
                'predicted_research_areas', 'office', 'email', 'phone', 'page',
                'google_scholar_link'
            ]
            # List the most similar professors first and keep only the top 5 rows.
            search_df = results_df.sort_values(
                by='similarity', ascending=False)[cols][:5]
            return search_df
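A hypothetical call, assuming the module-level objects used by search (final_df, final_topic_df, the fitted models and vectorizers) have already been built elsewhere:

results = search(
    'deep reinforcement learning for robotic manipulation',
    type='text_input',
    choose_model='topic_model')
print(results[['faculty_name', 'research_areas']])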
    def most_similar(self, search_text, vectorizer, top_n=5):
        """Return the most similar professors for a given search text (cleaned and tokenized)."""
        x = self._model.transform(
            vectorizer.transform(clean_text(search_text, stopwords_)))[0]
        similarities = cosine_similarity(x.reshape(1, -1), self.transformed_X)
        pairs = enumerate(similarities[0])
        most_similar = sorted(pairs, key=lambda item: item[1],
                              reverse=True)[:top_n]
        return np.array(most_similar)
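Used on its own, this method returns (document_id, similarity) pairs as a NumPy array. A hypothetical direct call, where topic_model and topic_vectorizer are the fitted objects referenced in search above:

pairs = topic_model.most_similar('graph neural networks', topic_vectorizer, top_n=3)
for document_id, score in pairs:
    print(int(document_id), round(float(score), 3))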
    def transform(self, data):
        """Transform the training data into a gensim corpus, stored on self.corpus."""
        # For gensim we need to tokenize the data and filter out stopwords
        self.tokens = [clean_text(doc, stopwords_) for doc in data]

        # bigrams
        if self.bigrams:
            bigram = models.Phrases(
                self.tokens, min_count=5,
                threshold=100)  # a higher threshold yields fewer phrases
            bigram_mod = models.phrases.Phraser(bigram)
            self.tokens = make_bigrams(self.tokens, bigram_mod)

        # trigrams
        if self.trigrams:
            bigram = models.Phrases(self.tokens, min_count=5, threshold=100)
            bigram_mod = models.phrases.Phraser(bigram)
            trigram = models.Phrases(bigram[self.tokens], threshold=100)
            trigram_mod = models.phrases.Phraser(trigram)
            self.tokens = make_trigrams(self.tokens, bigram_mod, trigram_mod)

        # lemmatization
        if self.lemmatization:
            # Initialize spacy 'en_core_web_sm' model, keeping only tagger component (for efficiency)
            spacy_nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
            # Do lemmatization keeping only noun, adj, vb, adv
            self.tokens = do_lemmatization(
                spacy_nlp=spacy_nlp,
                texts=self.tokens,
                allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

        # Again remove stopwords after doing lemmatization
        self.tokens = [[token for token in doc if token not in stopwords_]
                       for doc in self.tokens]

        # Build a Dictionary - association word to numeric id
        self.dictionary = corpora.Dictionary(self.tokens)

        # Transform the collection of texts to a numerical form [(word_id, count), ...]
        self.corpus = [self.dictionary.doc2bow(text) for text in self.tokens]

        # tf-idf vectorizer
        if self.tf_idf:
            self._tfidf_model = models.TfidfModel(self.corpus,
                                                  id2word=self.dictionary)
            self.corpus = self._tfidf_model[self.corpus]
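transform relies on make_bigrams, make_trigrams and do_lemmatization, which are not shown. Minimal sketches of what they typically look like in this gensim + spaCy pattern follow (treat the exact signatures as assumptions), together with the imports the method needs.

from gensim import corpora, models
import spacy

def make_bigrams(texts, bigram_mod):
    # Merge frequent word pairs into single bigram tokens.
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts, bigram_mod, trigram_mod):
    # Apply the bigram model first, then merge frequent triples.
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def do_lemmatization(spacy_nlp, texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    # Keep only lemmas whose part-of-speech tag is in allowed_postags.
    lemmatized = []
    for doc_tokens in texts:
        doc = spacy_nlp(' '.join(doc_tokens))
        lemmatized.append(
            [token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return lemmatized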
    def transform_new(self, search_text):
        """Return the transformed representation of a new search text."""
        bow = self.dictionary.doc2bow(clean_text(search_text, stopwords_))
        if self.tf_idf:
            return self._model[self._tfidf_model[bow]]
        return self._model[bow]
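A hypothetical end-to-end use of transform_new, assuming the wrapped self._model is a fitted gensim topic model (e.g. LdaModel) and gensim_model is an instance of this class that has already run transform on the training documents (both names are assumptions, not part of the original):

from gensim import similarities

# Build a similarity index over the training corpus in topic space.
index = similarities.MatrixSimilarity(
    gensim_model._model[gensim_model.corpus],
    num_features=gensim_model._model.num_topics)

# Project a new query into the same space and rank the training documents.
query_vec = gensim_model.transform_new('bayesian optimization for materials discovery')
scores = index[query_vec]
top_document_ids = scores.argsort()[::-1][:5]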