Example #1
def rp(dataframe, num_topics=300):
    """Returns an RP model for documents stored in a DataFrame.

    Precomputed models are read from file if previously cached, or generated then cached otherwise.

    Parameters
    ----------
    dataframe : Pandas DataFrame
        The DataFrame containing the documents to process.
    num_topics : int (default is 300)
        The number of topics to train the RP model with.

    Returns
    -------
    model : Gensim RpModel
        RP model for documents stored in the DataFrame.
    """
    filename = 'caches/models/rp.model'

    if not os.path.isfile(filename):
        dictionary = dictionary_corpus(dataframe)
        bow = bow_corpus(dataframe)
        tfidf_model = tfidf(dataframe)
        tfidf_corpus = tfidf_model[bow]
        rp_model = RpModel(tfidf_corpus,
                           id2word=dictionary,
                           num_topics=num_topics)
        rp_model.save(filename)
    else:
        rp_model = RpModel.load(filename)

    return rp_model
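For reference, a minimal self-contained sketch of the same cache-or-train pattern, with a hypothetical toy corpus standing in for the DataFrame helpers (dictionary_corpus, bow_corpus and tfidf above):

# Minimal sketch of the caching pattern; the toy documents are
# illustrative, not from the original project.
import os
from gensim.corpora import Dictionary
from gensim.models import RpModel, TfidfModel

docs = [['random', 'projections', 'approximate', 'distances'],
        ['topic', 'models', 'reduce', 'dimensionality']]
dictionary = Dictionary(docs)
bow = [dictionary.doc2bow(doc) for doc in docs]
tfidf_corpus = TfidfModel(bow)[bow]

filename = 'caches/models/rp.model'
if os.path.isfile(filename):
    rp_model = RpModel.load(filename)
else:
    rp_model = RpModel(tfidf_corpus, id2word=dictionary, num_topics=2)
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    rp_model.save(filename)

print(rp_model[bow[0]])  # RP representation of the first document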
Example #2
    def rpmodel(self, corpus_t=None, topic=200, save=False, savename=None):
        """Train a Random Projections model on the given corpus.

        :param corpus_t: bag-of-words corpus to train on
        :param topic: number of topics (projected dimensions) for the model
        :param save: whether to persist the trained model to disk
        :param savename: path of the file to save the model to
        :return: the trained RpModel
        """
        print('using Random Projections model...')
        rpmodel = RpModel(corpus=corpus_t, id2word=self.word_dict, num_topics=topic)
        if save:
            print('Writing RP model to file: {}'.format(savename))
            rpmodel.save(savename)
        return rpmodel
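A hedged usage sketch for the method above; the builder object and file name are assumptions, and self.word_dict must already hold a gensim Dictionary:

# Hypothetical call: train on a bag-of-words corpus, persist, then reload.
model = builder.rpmodel(corpus_t=bow_corpus, topic=200,
                        save=True, savename='rp.model')
same_model = RpModel.load('rp.model')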
Example #3
def create_doc_term_matrix(docs,
                           id2word,
                           tfidf=False,
                           logentropy=False,
                           random_projections=False):
    """Build a bag-of-words document-term matrix and optionally chain
    RP, TF-IDF and log-entropy transformations, caching each stage."""
    doc_term_matrix = [id2word.doc2bow(doc) for doc in docs]
    _save_model2(doc_term_matrix, 'doc_term_matrix')

    if random_projections:
        rp_model = RpModel(corpus=doc_term_matrix,
                           id2word=id2word,
                           num_topics=params['num_topics'])
        doc_term_matrix = rp_model[doc_term_matrix]
        _save_model2(doc_term_matrix, 'doc_term_matrix_random_projections')

    if tfidf:
        tfidf_model = TfidfModel(id2word=id2word,
                                 corpus=doc_term_matrix,
                                 normalize=True)
        doc_term_matrix = tfidf_model[doc_term_matrix]
        _save_model2(doc_term_matrix, 'doc_term_matrix_tfidf')

    if logentropy:
        log_model = LogEntropyModel(corpus=doc_term_matrix, normalize=True)
        doc_term_matrix = log_model[doc_term_matrix]
        _save_model2(doc_term_matrix, 'doc_term_matrix_logentropy')

    return doc_term_matrix
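The flags compose: each enabled transformation is applied to the output of the previous one, in the fixed order RP, then TF-IDF, then log-entropy. A hypothetical invocation (docs is a list of token lists, id2word a gensim Dictionary, and params['num_topics'] must be set at module scope):

# Hypothetical call: project the corpus with RP, then reweight the
# projected matrix with TF-IDF.
matrix = create_doc_term_matrix(docs, id2word,
                                tfidf=True,
                                random_projections=True)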
Example #4
	def compute(self):
		vec_texts = [text.split() for text in self.texts]
		if self.debug:
			write("\n    " + "-> Computing the dictionary".ljust(50, '.'))
		dictionary = Dictionary(vec_texts)
		if self.debug:
			write("[OK]")
		if self.debug:
			write("\n    " + "-> Creating the bag-of-words space".ljust(50, '.'))
		corpus = [dictionary.doc2bow(vec) for vec in vec_texts]
		if self.debug:
			write("[OK]")
		if self.debug:
			write("\n    " + ("-> Creating the %s space" % self.method).ljust(50, '.'))
		tfidf_space = TfidfModel(corpus)
		tfidf_corpus = tfidf_space[corpus]
		if self.method == 'TFIDF':
			self.space = tfidf_space
			self.index = MatrixSimilarity(tfidf_corpus)
		elif self.method == 'LSI':
			self.space = LsiModel(tfidf_corpus, id2word=dictionary, num_topics=self.num_t)
			self.index = MatrixSimilarity(self.space[tfidf_corpus])
		elif self.method == 'RP':
			self.space = RpModel(tfidf_corpus, id2word=dictionary, num_topics=self.num_t)
			self.index = MatrixSimilarity(self.space[tfidf_corpus])
		elif self.method == 'LDA':
			self.space = ldamodel.LdaModel(tfidf_corpus, id2word=dictionary,
			                               num_topics=self.num_t)
			self.index = MatrixSimilarity(self.space[tfidf_corpus])
		self.dictionary = dictionary
		if self.debug:
			write("[OK]\n")
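Querying one of these indexes later means pushing the new document through the same pipeline. A hedged sketch for method='RP' (engine is a hypothetical instance; note that compute() keeps only space, index and dictionary, so the TF-IDF model is assumed to have been retained separately):

# Hedged sketch: similarity lookup for a new text against the RP index.
query_bow = engine.dictionary.doc2bow("some new text".split())
query_tfidf = tfidf_space[query_bow]  # tfidf_space assumed retained
sims = engine.index[engine.space[query_tfidf]]
top5 = sorted(enumerate(sims), key=lambda pair: -pair[1])[:5]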
Example #5
 def fit_model(self, corpus: List):
     """
     This method creates the model using gensim's Random Projections.
     The model is not returned; it is stored in the 'model' class attribute.
     """
     dictionary = Dictionary(corpus)
     # RpModel expects a bag-of-words corpus, not raw token lists.
     bow_corpus = [dictionary.doc2bow(doc) for doc in corpus]
     self.model = RpModel(bow_corpus,
                          id2word=dictionary,
                          **self.additional_parameters)
Example #6
 def fit(self):
     """
     This method creates the model using gensim's Random Projections.
     The model is not returned; it is stored in the 'model' class attribute.
     """
     corpus = self.extract_corpus()
     dictionary = Dictionary(corpus)
     # RpModel expects a bag-of-words corpus, not raw token lists.
     bow_corpus = [dictionary.doc2bow(doc) for doc in corpus]
     self.model = RpModel(bow_corpus, id2word=dictionary)
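Neither snippet passes num_topics (unless it arrives via additional_parameters), so gensim's RpModel falls back to its default of 300 topics; the call below makes that default explicit (bow_corpus and dictionary are assumed names):

# Equivalent to RpModel(bow_corpus, id2word=dictionary): 300 is the
# gensim default for num_topics.
model = RpModel(bow_corpus, id2word=dictionary, num_topics=300)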
Example #7
 def __init__(self, file_path: str, embedding_type: str):
     super().__init__()
     self.__file_path: str = file_path
     embedding_type = embedding_type.lower()
     if embedding_type == "word2vec":
         self.model = KeyedVectors.load_word2vec_format(self.__file_path,
                                                        binary=True)
     elif embedding_type == "doc2vec":
         self.model = Doc2Vec.load(self.__file_path)
     elif embedding_type == "fasttext":
         self.model = fasttext.load_facebook_vectors(self.__file_path)
     elif embedding_type == "ri":
         self.model = RpModel.load(self.__file_path)
     else:
         raise ValueError(
             "Must specify a valid embedding model type for loading from binary file"
         )
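Hypothetical construction of this loader for the 'ri' (random indexing, i.e. Random Projections) branch; the class and file names are assumptions:

# Hypothetical usage: load a previously saved RpModel from disk.
loader = EmbeddingLoader("models/steam_rp.model", embedding_type="ri")
rp_model = loader.model  # a gensim RpModel instance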
Example #8
 def load_model(self):
     return RpModel.load(self.reference)
Example #9
def main(chosen_model_no=0, num_items_displayed=10, use_spacy=False, use_soft_cosine_similarity=False,
         num_topics=None, no_below=5, no_above=0.5, normalize_vectors=False):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    if num_topics is None:
        num_topics = 100

    possible_model_names = [
        'tf_idf',  # 0
        'lsi_bow', 'lsi_tf_idf',  # 1, 2
        'rp_bow', 'rp_tf_idf',  # 3, 4
        'lda_bow', 'lda_tf_idf',  # 5, 6
        'hdp_bow', 'hdp_tf_idf',  # 7, 8
        'word2vec',  # 9
    ]
    chosen_model_name = possible_model_names[chosen_model_no]
    print(chosen_model_name)

    game_names, _ = load_game_names(include_genres=False, include_categories=False)

    steam_tokens = load_tokens()

    nlp = spacy.load('en_core_web_lg')

    documents = list(steam_tokens.values())

    dct = Dictionary(documents)
    print(len(dct))
    dct.filter_extremes(no_below=no_below, no_above=no_above)
    print(len(dct))

    corpus = [dct.doc2bow(doc) for doc in documents]

    # Pre-processing

    pre_process_corpus_with_tf_idf = chosen_model_name.endswith('_tf_idf')

    tfidf_model = TfidfModel(corpus, id2word=dct, normalize=normalize_vectors)

    if pre_process_corpus_with_tf_idf:
        # Caveat: the leading underscore is important. Do not use this pre-processing if the chosen model is Tf-Idf!
        print('Corpus as Tf-Idf')
        pre_processed_corpus = tfidf_model[corpus]
    else:
        print('Corpus as Bag-of-Words')
        pre_processed_corpus = corpus

    # Model

    model = None
    wv = None
    index2word_set = None

    if chosen_model_name == 'tf_idf':
        print('Term Frequency * Inverse Document Frequency (Tf-Idf)')
        model = tfidf_model

    elif chosen_model_name.startswith('lsi'):
        print('Latent Semantic Indexing (LSI/LSA)')
        model = LsiModel(pre_processed_corpus, id2word=dct, num_topics=num_topics)

    elif chosen_model_name.startswith('rp'):
        print('Random Projections (RP)')
        model = RpModel(pre_processed_corpus, id2word=dct, num_topics=num_topics)

    elif chosen_model_name.startswith('lda'):
        print('Latent Dirichlet Allocation (LDA)')
        model = LdaModel(pre_processed_corpus, id2word=dct, num_topics=num_topics)

    elif chosen_model_name.startswith('hdp'):
        print('Hierarchical Dirichlet Process (HDP)')
        model = HdpModel(pre_processed_corpus, id2word=dct)

    elif chosen_model_name == 'word2vec':
        use_a_lot_of_ram = False

        if use_a_lot_of_ram:
            model = None

            print('Loading Word2Vec based on Google News')
            # Warning: this takes a lot of time and uses a ton of RAM!
            wv = KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)
        else:
            if use_spacy:
                print('Using Word2Vec with spaCy')
            else:
                print('Training Word2Vec')

                model = Word2Vec(documents)

                wv = model.wv

        if not use_spacy:
            wv.init_sims(replace=normalize_vectors)

            index2word_set = set(wv.index2word)

    else:
        print('No model specified.')
        model = None

    if chosen_model_name != 'word2vec':
        if not use_soft_cosine_similarity:
            index = MatrixSimilarity(model[pre_processed_corpus], num_best=10, num_features=len(dct))
        else:
            w2v_model = Word2Vec(documents)
            similarity_index = WordEmbeddingSimilarityIndex(w2v_model.wv)
            similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dct, tfidf_model, nonzero_limit=100)
            index = SoftCosineSimilarity(model[pre_processed_corpus], similarity_matrix)
    else:
        index = None

    query_app_ids = load_benchmarked_app_ids(append_hard_coded_app_ids=True)

    app_ids = list(int(app_id) for app_id in steam_tokens.keys())

    matches_as_app_ids = []

    for query_count, query_app_id in enumerate(query_app_ids):
        print('[{}/{}] Query appID: {} ({})'.format(query_count + 1, len(query_app_ids),
                                                    query_app_id, get_app_name(query_app_id, game_names)))

        query = steam_tokens[str(query_app_id)]

        if use_spacy:
            spacy_query = Doc(nlp.vocab, query)
        else:
            spacy_query = None

        if chosen_model_name != 'word2vec':
            vec_bow = dct.doc2bow(query)
            if pre_process_corpus_with_tf_idf:
                pre_processed_vec = tfidf_model[vec_bow]
            else:
                pre_processed_vec = vec_bow
            vec_lsi = model[pre_processed_vec]
            sims = index[vec_lsi]

            if use_soft_cosine_similarity:
                sims = enumerate(sims)

            similarity_scores_as_tuples = [(str(app_ids[i]), sim) for (i, sim) in sims]
            similarity_scores = reformat_similarity_scores_for_doc2vec(similarity_scores_as_tuples)
        else:
            if use_spacy:
                similarity_scores = {}
                for app_id in steam_tokens:
                    reference_sentence = steam_tokens[app_id]
                    spacy_reference = Doc(nlp.vocab, reference_sentence)
                    similarity_scores[app_id] = spacy_query.similarity(spacy_reference)
            else:
                query_sentence = filter_out_words_not_in_vocabulary(query, index2word_set)

                similarity_scores = {}

                counter = 0
                num_games = len(steam_tokens)

                for app_id in steam_tokens:
                    counter += 1

                    if (counter % 1000) == 0:
                        print('[{}/{}] appID = {} ({})'.format(counter, num_games, app_id, game_names[app_id]))

                    reference_sentence = steam_tokens[app_id]
                    reference_sentence = filter_out_words_not_in_vocabulary(reference_sentence, index2word_set)

                    try:
                        similarity_scores[app_id] = wv.n_similarity(query_sentence, reference_sentence)
                    except ZeroDivisionError:
                        similarity_scores[app_id] = 0

        similar_app_ids = print_most_similar_sentences(similarity_scores, num_items_displayed=num_items_displayed,
                                                       verbose=False)
        matches_as_app_ids.append(similar_app_ids)

    print_ranking(query_app_ids,
                  matches_as_app_ids,
                  only_print_banners=True)

    return
Example #10
 def lsa_fact(tf, tokenized_docs):
     # Despite its name, this builds a Random Projections (RP) model,
     # not LSA; num_topics is taken from module scope.
     corpus = RpModel(corpus=tf.distance_matrix,
                      id2word=tf.id2Word,
                      num_topics=num_topics)
     return corpus[tf.distance_matrix]
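A hypothetical call for the snippet above; the tf object is assumed to expose distance_matrix (a bag-of-words corpus, despite the name) and id2Word:

# Hypothetical usage: returns the RP projection of tf.distance_matrix
# (note that tokenized_docs is accepted but unused by lsa_fact).
projected_corpus = lsa_fact(tf, tokenized_docs)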