Example #1
def cal_tfidf(documents, topk=10) -> List:
    """
    tfidf模型训练
    :param documents: 要进行训练的文档
    :param topk: 提取tfidf score 的前多少个单词, 如果topk大于提取到的单词个数,返回所有单词
    :return:
    """
    # 单个文档分成列表
    docs = [[word for word in document.split(' ')] for document in documents]
    # build the dictionary
    dictionary = corpora.Dictionary(docs)
    # build the bag-of-words corpus
    docs_bow = [dictionary.doc2bow(doc) for doc in docs]
    if os.path.isfile(tfidfmodel):
        model = TfidfModel.load(tfidfmodel)
    else:
        model = TfidfModel(docs_bow)
        model.save(tfidfmodel)
    # compute tf-idf vectors for all documents
    docs_vector = list(model[docs_bow])
    # sort each document's terms by tf-idf score and keep the top-k
    docs_sort_vector = [
        sorted(doc, key=lambda x: x[1], reverse=True)[:topk]
        for doc in docs_vector
    ]
    # map token ids back to words; docs_sort_chinese is a list of (word, tf-idf score) pairs per document
    docs_sort_chinese = [[(dictionary[vec[0]], vec[1]) for vec in doc]
                         for doc in docs_sort_vector]
    return docs_sort_chinese
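A minimal usage sketch for cal_tfidf. The imports and the module-level tfidfmodel path sit outside the excerpt, so the values below are assumptions:

import os
from typing import List
from gensim import corpora
from gensim.models import TfidfModel

tfidfmodel = 'tfidf.model'  # hypothetical cache path for the trained model

docs = ['the cat sat on the mat', 'the dog chased the cat']
top_terms = cal_tfidf(docs, topk=3)
print(top_terms)  # one list of (word, tf-idf score) pairs per document, highest score first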
Example #2
def get_lda_feature():
    doc_train = pd.read_csv(id_content_path)
    documents = doc_train['content'].apply(lambda x: x.split(' '))
    # build the id -> word mapping dictionary
    dictionary = corpora.Dictionary(documents)
    # convert each document into a bag-of-words list of (id, count) tuples
    ds_df = [dictionary.doc2bow(document) for document in documents]
    # build the tf-idf model; it learns document frequencies from the corpus and only needs a bag-of-words at prediction time
    tfidf_model = TfidfModel(ds_df)
    # tf-idf representation of each document
    ds_tfidf = tfidf_model[ds_df]
    # number of topics
    n = 60
    # build the LDA model on the tf-idf corpus with the chosen number of topics
    lda_model = LdaModel(ds_tfidf, num_topics=n, passes=10, random_state=12)
    vec_size = (len(documents), n)
    lda_feature = np.zeros(vec_size)
    i = 0

    for doc in ds_tfidf:
        topics = lda_model.get_document_topics(doc, minimum_probability=0.01)
        for topic in topics:
            num_topic = topic[0]
            prob = round(topic[1], 5)
            lda_feature[i, num_topic] = prob
        i += 1

    f_names = get_lda_feacture_name(n)
    pd.DataFrame(lda_feature, columns=f_names).to_csv(id_content_lda_path,
                                                      index=0)
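get_lda_feacture_name (like id_content_path and id_content_lda_path) is defined outside the excerpt. A purely hypothetical sketch of the helper, consistent with how it is called:

def get_lda_feacture_name(n):
    # hypothetical: one column label per LDA topic
    return ['lda_topic_%d' % i for i in range(n)]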
Example #3
    def __init__(self,
                 docs,
                 strip_diac=True,
                 num_option=OPTION_GROUP,
                 usr_option=OPTION_GROUP,
                 url_option=OPTION_GROUP,
                 emo_option=OPTION_GROUP,
                 lc=True,
                 del_dup1=True,
                 token_list=[-1],
                 lang=None,
                 **kwargs):
        self.strip_diac = strip_diac
        self.num_option = num_option
        self.usr_option = usr_option
        self.url_option = url_option
        self.emo_option = emo_option
        self.emoclassifier = EmoticonClassifier()
        self.lc = lc
        self.del_dup1 = del_dup1
        self.token_list = token_list

        if lang:
            self.lang = LangDependency(lang)
        else:
            self.lang = None

        self.kwargs = {k: v for k, v in kwargs.items() if k[0] != '_'}

        docs = [self.tokenize(d) for d in docs]
        self.dictionary = corpora.Dictionary(docs)
        corpus = [self.dictionary.doc2bow(d) for d in docs]
        self.model = TfidfModel(corpus)
Example #4
def gensim_similarity(data_c):
    """
    使用Gensim包计算相似度:
        词频
            COUNT
            LDA
            LSI
        Tfidf:
            TFIDF
            LDA
            LSI
    """
    # 合并获取词袋
    data_c['s1'] = data_c['s1'].apply(lambda text: list(text))
    data_c['s2'] = data_c['s2'].apply(lambda text: list(text))
    data_c_all = pd.concat([data_c['s1'], data_c['s2']], ignore_index=True).to_frame(name='s')  # Series.append was removed in pandas 2.0

    # build the dictionary
    print("starting create dic....")
    dic = corpora.Dictionary(data_c['s1'].values)
    dic.add_documents(data_c['s2'].values)

    print("文档数:", dic.num_docs)
    print("starting create count bow...")
    data_c['s1'] = data_c['s1'].apply(lambda text: dic.doc2bow(text))
    data_c['s2'] = data_c['s2'].apply(lambda text: dic.doc2bow(text))
    data_c_all['s'] = data_c_all['s'].apply(lambda text: dic.doc2bow(text))

    # cps1 = [dic.doc2bow(text) for text in list(data_c['s1'].values)]
    # cps2 = [dic.doc2bow(text) for text in list(data_c['s2'].values)]

    cps1 = list(data_c['s1'])
    cps2 = list(data_c['s2'])
    cps = list(data_c_all['s'])

    # count-based similarity between s1 and s2
    print("starting count similarity....")
    sm = similarities.SparseMatrixSimilarity(corpus=cps1, num_features=10000)
    count_sm = np.diag(sm[cps2])

    # count-based LDA (and LSI) similarity between s1 and s2
    count_lda_sm = lda_similarity(cps, cps1, cps2, dic)
    # count_lsi_sm= lsi_similarity(cps,cps1,cps2,dic)

    # tf-idf similarity between s1 and s2
    print("starting tfidf similarity....")
    tfidf = TfidfModel(corpus=cps, id2word=dic)
    cps1_tfidf = tfidf[cps1]
    cps2_tfidf = tfidf[cps2]
    cps_tfidf = tfidf[cps]

    # tf-idf cosine similarity between s1 and s2
    sm = similarities.SparseMatrixSimilarity(corpus=cps1_tfidf, num_features=10000)
    tfidf_sm = np.diag(sm[cps2_tfidf])

    # LDA and LSI similarities on the tf-idf corpus
    tfidf_lda_sm = lda_similarity(cps_tfidf, cps1_tfidf, cps2_tfidf, dic)
    tfidf_lsi_sm = lsi_similarity(cps_tfidf, cps1_tfidf, cps2_tfidf, dic)  # was lda_similarity, which just duplicated the line above

    return count_sm, count_lda_sm, tfidf_sm, tfidf_lda_sm, tfidf_lsi_sm
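lda_similarity and lsi_similarity are not shown. A hedged sketch of what lda_similarity could look like, reusing names visible in this example (similarities, np) plus gensim's LdaModel and a hypothetical num_topics default:

def lda_similarity(cps, cps1, cps2, dic, num_topics=100):
    # hypothetical sketch: train LDA on the full corpus, index s1, query with s2,
    # and keep only the pairwise similarities (row i of s1 vs row i of s2)
    lda = LdaModel(cps, id2word=dic, num_topics=num_topics)
    index = similarities.MatrixSimilarity(lda[cps1], num_features=num_topics)
    return np.diag(index[lda[cps2]])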
def calculate_embedding(corpus: Corpus,
                        *,
                        rank=2,
                        svd_dims=50,
                        perplexity=30,
                        seed=0):
    """ Calculate a document embedding that assigns each document in the
    corpus a N-d position based on the word usage.

    :returns: A list of N-d tuples for the documents in the corpus.
    """
    from gensim.models.tfidfmodel import TfidfModel
    from sklearn.decomposition import TruncatedSVD
    from sklearn.manifold import TSNE

    dic = corpus.dictionary
    freqs = corpus.frequencies
    tfidf = corpus2dense(TfidfModel(dictionary=dic)[freqs], len(dic)).T

    if svd_dims is not None:
        svd = TruncatedSVD(n_components=svd_dims, random_state=seed)
        components = svd.fit_transform(tfidf)
    else:
        components = tfidf

    model = TSNE(rank,
                 metric='cosine',
                 square_distances=True,
                 perplexity=perplexity,
                 random_state=seed)
    return model.fit_transform(components)
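Corpus is a project-specific type here; calculate_embedding only reads corpus.dictionary (a gensim Dictionary) and corpus.frequencies (a bag-of-words corpus), and corpus2dense is assumed to be imported from gensim.matutils at module level. A minimal usage sketch with a hypothetical stand-in (square_distances was removed from scikit-learn's TSNE in recent releases, so this needs an older scikit-learn):

from collections import namedtuple
from gensim import corpora
from gensim.matutils import corpus2dense  # assumed module-level import in the original

FakeCorpus = namedtuple('FakeCorpus', ['dictionary', 'frequencies'])
docs = [['cat', 'sat'], ['dog', 'ran'], ['cat', 'ran'], ['dog', 'sat'], ['mat', 'sat']]
dic = corpora.Dictionary(docs)
corpus = FakeCorpus(dictionary=dic, frequencies=[dic.doc2bow(d) for d in docs])
points = calculate_embedding(corpus, svd_dims=None, perplexity=2)  # one 2-d point per document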
Example #6
 def fit(self, documents, labels=None):
     self.lexicon = Dictionary(documents)
     self.tfidf = TfidfModel(
         [self.lexicon.doc2bow(doc) for doc in documents],
         id2word=self.lexicon)
     self.save()
     return self
def lsi(documents, topicNum):
	texts = [[word for word in document.split(' ')] for document in documents]
	print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))+str(len(texts)))
	dictionary = corpora.Dictionary(texts)
	print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))+' get corpus..')
	corpusD = [dictionary.doc2bow(text) for text in texts]
	print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))+' tfidf Model...')
	tfidf = TfidfModel(corpusD)
	corpus_tfidf = tfidf[corpusD]

	# note: the LSI model is trained on the raw bag-of-words counts; corpus_tfidf above is left unused
	model = LsiModel(corpusD, num_topics=topicNum, chunksize=8000, extra_samples = 100)#, distributed=True)#, sample = 1e-5, iter = 10,seed = 1)

	lsiFeature = np.zeros((len(texts), topicNum))
	print('translate...')
	i = 0

	for doc in corpusD:
		topic = model[doc]
		
		for t in topic:
			 lsiFeature[i, t[0]] = round(t[1],5)
		i = i + 1
		if i%1000 == 1:
			print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))+str(i))

	return lsiFeature
def getLsiFeature(documents, topicNum):
    '''
     Function:
         generate LSI features by training an LSI model
     Input:
         documents: list of preprocessed sentences
         topicNum: output vector dimension
     Output:
         LSI features (DataFrame format)
    '''
    # get corpus
#     LogInfo(' Get corpus...')
    texts = [[word for word in document.split(' ')] for document in documents]
    dictionary = corpora.Dictionary(texts)
    corpusD = [dictionary.doc2bow(text) for text in texts]
    
    # train lsi model
#     LogInfo(' Train LSI model...')
    tfidf = TfidfModel(corpusD)
    corpus_tfidf = tfidf[corpusD]
    # note: the LSI model is trained on the raw bag-of-words counts; corpus_tfidf above is left unused
    model = LsiModel(corpusD, num_topics=topicNum, chunksize=8000, extra_samples = 100)#, distributed=True)#, sample = 1e-5, iter = 10,seed = 1)

    # generate lsi features
    LogInfo(' Generate LSI features...')
    lsiFeature = np.zeros((len(texts), topicNum))
    i = 0
    for doc in corpusD:
        topic = model[doc]
        for t in topic:
             lsiFeature[i, t[0]] = round(t[1],5)
        i = i + 1
    colName = getColName(topicNum, "qlsi")
    lsiFeature = pd.DataFrame(lsiFeature, columns = colName)
    return lsiFeature
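getColName (also called by getLdaFeature further down) is not included in the excerpt; a hypothetical sketch consistent with its call sites:

def getColName(topicNum, prefix):
    # hypothetical: column labels such as 'qlsi_0', 'qlsi_1', ...
    return ['%s_%d' % (prefix, i) for i in range(topicNum)]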
Example #9
    def load(self):

        if os.path.exists(self._lexicon_path):
            self.lexicon = Dictionary.load(self._lexicon_path)

        if os.path.exists(self._tfidf_path):
            self.tfidf = TfidfModel.load(self._tfidf_path)
    def tf_idf_transform(self, doc):
        """
        Perform tf-idf transformation on doc.
        """
        self.dictionary = corpora.Dictionary(doc)
        corpus = [self.dictionary.doc2bow(text) for text in doc]
        self.tfIdfModel = TfidfModel(corpus)

        conf.mk_dir(self.tfIdfPath)

        self.dictionary.save(self.dictPath)
        logger.info('Dictionary has been saved in %s.' % self.dictPath)

        self.tfIdfModel.save(self.tfIdfPath)
        logger.info('TF-IDF model has been saved in %s.' % self.tfIdfPath)

        tfidf_corpus = self.tfIdfModel[corpus]
        tfidf_corpus_path = conf.get_filename_via_tpl('tfidf',
                                                      n_users=self.nUsers,
                                                      postfix='mm',
                                                      n_samples=self.nSamples)
        corpora.MmCorpus.serialize(tfidf_corpus_path, tfidf_corpus)
        logger.info('TF-IDF corpus with a shape of %s has been saved in %s.' %
                    (np.array(tfidf_corpus).shape, tfidf_corpus_path))

        return tfidf_corpus
Example #11
def get_tfidf_model(path="data/swiki.json",
                    save_path="data/swiki_dict.txt",
                    stem=False):
    """
    :param path:
    :param save_path:
    :return:
    """
    texts = map(lambda x: _preprocess_text(x, stem=stem),
                _load_json_list("data/swiki.json"))

    def _get_swiki_dictionary():
        dict_file = os.path.join(BASE_DIR, save_path)
        if os.path.exists(dict_file):
            dictionary = corpora.Dictionary.load_from_text(dict_file)
        else:
            dictionary = corpora.Dictionary(texts)
            dictionary.save_as_text(dict_file)
        return dictionary

    dct = _get_swiki_dictionary()

    bow_texts = map(dct.doc2bow, texts)
    tfidf = TfidfModel(bow_texts)
    return dct, tfidf
Example #12
    def __init__(self):
        self.inner_model = None

        # load dictionary and corpus
        vocabulary = "raw"
        corpora_folder = os.path.join(*[
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
            'data', 'corpora'
        ])
        self.dictionary = corpora.Dictionary.load(
            os.path.join(corpora_folder, "%s.dict" % (vocabulary, )))
        self.corpus = corpora.MmCorpus(
            os.path.join(corpora_folder, "%s.mm" % (vocabulary, )))

        # parameters
        self.dataset = "CASEREPORT"

        # data file path
        models_folder = os.path.join(*[
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
            'data', 'models'
        ])
        filename = "TFIDF_%s" % (self.dataset, )
        self.filepath = os.path.join(models_folder, filename)
        model_exists = os.path.isfile(self.filepath)

        if model_exists:
            logging.info("found data file %s" % (self.filepath, ))
            self.inner_model = TfidfModel.load(self.filepath)
        else:
            self.inner_model = TfidfModel(corpus=self.corpus)
            self.inner_model.save(self.filepath)
Example #13
 def get_tfidf(self):
     docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
     model_tfidf = TfidfModel(docs_corpus, id2word=self.docs_dict)
     docs_tfidf = model_tfidf[docs_corpus]
     docs_vecs = np.vstack(
         [sparse2full(c, len(self.docs_dict)) for c in docs_tfidf])
     return docs_vecs
Example #14
def tf_idf_weight(spacy_contexts):
    """
    @param spacy_contexts Spacy-fied contexts

    Returns list of Dicts, each dictionary corresponds to one document and
    contains words and their tf-idf weights
    """
    docs_dict = Dictionary(spacy_contexts)
    docs_dict.compactify()

    docs_corpus = [docs_dict.doc2bow(doc) for doc in spacy_contexts]

    model_tfidf = TfidfModel(docs_corpus, id2word=docs_dict)
    docs_tfidf = model_tfidf[docs_corpus]

    # Now generate a list of dicts with k,v = "word": tfidf_frequency
    # each dict contains words from one document (sentence)
    doc_tfidf_dicts = []

    for doc in docs_tfidf:
        d = dict()
        for term, freq in doc:
            d[docs_dict[term]] = freq

        doc_tfidf_dicts.append(d)

    return doc_tfidf_dicts
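A minimal usage sketch for tf_idf_weight, assuming the contexts are already lists of token strings and that Dictionary and TfidfModel come from gensim as in the other examples:

from gensim.corpora import Dictionary
from gensim.models import TfidfModel

contexts = [['cat', 'sat', 'mat'], ['dog', 'chased', 'cat'], ['the', 'mat', 'is', 'flat']]
weights = tf_idf_weight(contexts)
print(weights[0])  # {'cat': ..., 'sat': ..., 'mat': ...}
# terms that occur in every document get a tf-idf weight of 0 and are dropped by gensim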
def getLdaFeature(documents, topicNum):
    '''
     Function:
         generate LDA features by training an LDA model
     Input:
         documents: list of preprocessed sentences
         topicNum: output vector dimension
     Output:
         LDA features (DataFrame format)
    '''
    # get corpus
#     LogInfo(' Get corpus...')
    texts = [[word for word in document.split(' ')] for document in documents]
    dictionary = corpora.Dictionary(texts)    
    corpusD = [dictionary.doc2bow(text) for text in texts]

    # train lda model
#     LogInfo(' Train LDA model...')
    tfidf = TfidfModel(corpusD)
    corpus_tfidf = tfidf[corpusD]
#     ldaModel = gensim.models.ldamulticore.LdaMulticore(corpus_tfidf, workers = 8, num_topics=topicNum, chunksize=8000, passes=10, random_state = 12)
    ldaModel = LdaModel(corpus_tfidf, num_topics=topicNum, chunksize=8000, passes=10, random_state = 12)
    # generate lda features
    LogInfo(' Generate LDA features...')
    ldaFeature = np.zeros((len(texts), topicNum))
    i = 0
    for doc in corpus_tfidf:
        topic = ldaModel.get_document_topics(doc, minimum_probability = 0.01)
        for t in topic:
             ldaFeature[i, t[0]] = round(t[1],5)
        i = i + 1
    colName = getColName(topicNum, "qlda")
    ldaFeature = pd.DataFrame(ldaFeature, columns = colName)
    return ldaFeature
def lda(documents, topicNum):
	texts = [[word for word in document.split(' ')] for document in documents]
	print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))+str(len(texts)))
	dictionary = corpora.Dictionary(texts)
	print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))+' get corpus..')
	corpusD = [dictionary.doc2bow(text) for text in texts]

	#id2word = dictionary.id2word
	print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))+' tfidf Model...')
	tfidf = TfidfModel(corpusD)
	corpus_tfidf = tfidf[corpusD]
	print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))+' train lda Model...')
	ldaModel = gensim.models.ldamulticore.LdaMulticore(corpus_tfidf, workers = 8, num_topics=topicNum, chunksize=8000, passes=10, random_state = 12)
	#ldaModel = gensim.models.ldamodel.LdaModel(corpus=corpusD, num_topics=topicNum, update_every=1, chunksize=8000, passes=10)
	print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))+' get lda feature...')
	ldaFeature = np.zeros((len(texts), topicNum))
	i = 0

	for doc in corpus_tfidf:
		topic = ldaModel.get_document_topics(doc, minimum_probability = 0.01)
		
		for t in topic:
			 ldaFeature[i, t[0]] = round(t[1],5)
		i = i + 1
		if i%1000 == 1:
			print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))+str(i))

	return ldaFeature
def compute_tfidf():
    from gensim.models.tfidfmodel import TfidfModel

    keys, unstem_map, paragraph_lengths, int2word, word2int = compute_all_words(
    )
    with time_code('compute_tfidf'):
        # `corpus` is assumed to be a module-level bag-of-words corpus defined elsewhere
        tfidf = TfidfModel(corpus, smartirs='ltc', id2word=int2word)
    return tfidf
Example #18
 def loadCorpus(self, mmfile, dictfile, doctuplesfile=None):
     self.corpus = corpora.MmCorpus(mmfile)
     self.dictionary = corpora.Dictionary.load(dictfile)
     if doctuplesfile is not None:
         with open(doctuplesfile, 'rb') as docpicklef:
             self.doctuples = pickle.load(docpicklef)
     if self.toweight:
         self.tfidf = TfidfModel(self.corpus)
Example #19
def predict_on_group(model,
                     docs_data,
                     word2vec_model300,
                     length=5) -> 'pd.DataFrame of type :  pair_id  || target':
    """ 
    Parameters:

        model -- model object with methods train and predict

        docs_data -- pandas Data Frame with fields pair_id, content, target
        word2vec_model300 -- w2v model (object)


    Returns:

       pd.DataFrame of type : { pair_id  || target }   with predicted target for each pair_id

    """

    dictionary = corpora.Dictionary()
    for i in docs_data.content:
        try:
            dictionary.add_documents([i])
        except:
            # fall back to a dummy token for rows whose content is not a list of tokens (e.g. NaN)
            dictionary.add_documents([['a']])

    docs_data['vector'] = docs_data.content.apply(doc_opti,
                                                  args=(dictionary, ))
    # except:
    #     docs_data['vector'] = docs_data.content.apply(dictionary.doc2bow)

    corpus = []
    for line in docs_data.content:
        try:
            if math.isnan(line): line = ["мимо"]
        except:
            pass
        corpus = corpus + [dictionary.doc2bow(line)]

    similarity_matrix = word2vec_model300.similarity_matrix(
        dictionary,
        tfidf=TfidfModel(corpus, dictionary=dictionary),
        threshold=0.0,
        exponent=2.0,
        nonzero_limit=100)

    docs_data['dist_vec'] = docs_data.vector.apply(make_dist_vec,
                                                   args=(docs_data.vector,
                                                         similarity_matrix))

    features = [str(i) for i in range(length)]

    for i in range(length):
        docs_data[str(i)] = docs_data.dist_vec.apply(take_S, args=(i, ))

    docs_data['target'] = model.predict(np.array(docs_data[features]))

    return docs_data[['pair_id', 'target']]
Example #20
def train_model_on_group(model, docs_data, word2vec_model300, length=5):
    """ 
    Parameters:

        model -- model object with methods train and predict

        docs_data -- pandas Data Frame with fields pair_id, content, target
        word2vec_model300 -- w2v model (object)


    Returns:

         model trained on data


    """

    dictionary = corpora.Dictionary()
    for i in docs_data.content:
        try:
            dictionary.add_documents([i])
        except:
            # fall back to a dummy token for rows whose content is not a list of tokens (e.g. NaN)
            dictionary.add_documents([['a']])

    docs_data['vector'] = docs_data.content.apply(doc_opti,
                                                  args=(dictionary, ))

    corpus = []
    for line in docs_data.content:
        try:
            if math.isnan(line): line = ["мимо"]
        except:
            pass
        corpus = corpus + [dictionary.doc2bow(line)]

    similarity_matrix = word2vec_model300.similarity_matrix(
        dictionary,
        tfidf=TfidfModel(corpus, dictionary=dictionary),
        threshold=0.0,
        exponent=2.0,
        nonzero_limit=100)

    docs_data['dist_vec'] = docs_data.vector.apply(make_dist_vec,
                                                   args=(docs_data.vector,
                                                         similarity_matrix))

    features = [str(i) for i in range(length)]

    for i in range(length):
        docs_data[str(i)] = docs_data.dist_vec.apply(take_S, args=(i, ))

    print(docs_data.head())

    model = model.fit(docs_data[features], docs_data['target'])

    print(model.score(docs_data[features], docs_data['target']))

    return model
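Both this function and predict_on_group above call KeyedVectors.similarity_matrix, which was removed in gensim 4. A sketch of the replacement call, assuming word2vec_model300 is a gensim KeyedVectors object and gensim >= 4:

from gensim.similarities import SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex

term_index = WordEmbeddingSimilarityIndex(word2vec_model300, threshold=0.0, exponent=2.0)
similarity_matrix = SparseTermSimilarityMatrix(
    term_index, dictionary,
    tfidf=TfidfModel(corpus, dictionary=dictionary),
    nonzero_limit=100)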
Example #21
    def __init__(self,
                 docs,
                 num_option=OPTION_GROUP,
                 usr_option=OPTION_GROUP,
                 url_option=OPTION_GROUP,
                 emo_option=OPTION_GROUP,
                 lc=True,
                 del_dup=True,
                 del_punc=False,
                 del_diac=True,
                 token_list=[-1],
                 token_min_filter=-1,
                 token_max_filter=1.0,
                 tfidf=True,
                 **kwargs):
        self.del_diac = del_diac
        self.num_option = num_option
        self.usr_option = usr_option
        self.url_option = url_option
        self.emo_option = emo_option
        self.lc = lc
        self.del_dup = del_dup
        self.del_punc = del_punc
        self.token_list = token_list
        self.token_min_filter = token_min_filter
        self.token_max_filter = token_max_filter
        self.tfidf = tfidf
        self.kwargs = {k: v for k, v in kwargs.items() if k[0] != '_'}

        if emo_option == OPTION_NONE:
            self.emo_map = None
        else:
            # self.emo_map = get_compiled_map(os.path.join(os.path.dirname(__file__), 'resources', 'emoticons.json'))
            self.emo_map = EmoticonClassifier()

        docs = [self.tokenize(d) for d in docs]
        self.dictionary = corpora.Dictionary(docs)
        corpus = [self.dictionary.doc2bow(d) for d in docs]
        if self.token_min_filter != 1 or self.token_max_filter != 1.0:
            if self.token_min_filter < 0:
                self.token_min_filter = abs(self.token_min_filter)
            else:
                self.token_min_filter = int(
                    len(corpus) * self.token_min_filter)

            if self.token_max_filter < 0:
                self.token_max_filter = abs(
                    self.token_max_filter) / len(corpus)

            self.dictionary.filter_extremes(no_below=self.token_min_filter,
                                            no_above=self.token_max_filter,
                                            keep_n=None)

        if self.tfidf:
            self.model = TfidfModel(corpus)
        else:
            self.model = None
Example #22
def corpus_vec(docs, model, corpus, size = DEFAULT_SAMPLE_SIZE):
    """ Creates a NxD array of document vectors for each document in a list"""

    tfidf = TfidfModel(corpus)
    N, D = len(docs), model.wv.vectors.shape[1]  # .vectors replaces the old .syn0 attribute
    arr = np.empty((N, D))
    for i in range(N):
        arr[i,:] = doc_vec(docs[i], model, corpus, size, tfidf)
    return arr
Example #23
 def __init__(self, documents):
     self.documents = documents
     self.texts = [[word for word in document.lower().split()]
                   for document in documents]
     self.dictionary = corpora.Dictionary(self.texts)
     self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
     self.tfidf = TfidfModel(self.corpus)
     self._make_random_indexing()
     print "initialized!"
Example #24
def buildTfidfModel(corpus):
    print('get tfidf model...')
    if not os.path.exists(modelpath + 'tfidf.model'):
        # train the tf-idf model
        tfidf = TfidfModel(corpus)
        tfidf.save(modelpath + 'tfidf.model')
    else:
        tfidf = TfidfModel.load(modelpath + 'tfidf.model')
    print('done')
    return tfidf
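A minimal usage sketch for buildTfidfModel; modelpath is assumed to be a module-level directory prefix, and the gensim imports are as in the earlier examples:

texts = [['cat', 'sat', 'mat'], ['dog', 'chased', 'cat']]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]
tfidf = buildTfidfModel(corpus)  # trains and saves on the first call, loads the saved model afterwards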
Example #25
def tf_idf_keywords(text, bow, dictionary):
    tfidf = TfidfModel(bow)  # generates the model
    text = dictionary.doc2bow(text)
    tfidf_weights = tfidf[text]
    sorted_tfidf_weights = sorted(tfidf_weights,
                                  key=lambda w: w[1],
                                  reverse=True)  # sort by value
    keywords = []
    for term_id, weight in sorted_tfidf_weights[:5]:
        keywords.append(str(dictionary.get(term_id)))
    return keywords
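A minimal usage sketch for tf_idf_keywords, assuming the usual gensim imports (corpora, TfidfModel):

texts = [['cat', 'sat', 'mat'], ['dog', 'chased', 'cat'], ['the', 'mat', 'is', 'flat']]
dictionary = corpora.Dictionary(texts)
bow = [dictionary.doc2bow(t) for t in texts]
print(tf_idf_keywords(['flat', 'mat', 'cat'], bow, dictionary))
# up to five keywords, highest tf-idf weight first; tokens missing from the dictionary are ignored by doc2bow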
 def fit(self, documents, labels=None):
     if self.lexicon is None or self.tfidf is None:
         inputDocuments = list(documents)
         self.lexicon = Dictionary(inputDocuments)
         self.tfidf = TfidfModel(
             [self.lexicon.doc2bow(doc) for doc in inputDocuments],
             id2word=self.lexicon)
         self.save()
         return self
     else:
         return self
Example #27
 def tf_idf(dataSeg_save):
     corpus = pd.read_csv(dataSeg_save,header=None)[0]
     texts = [sentence.split(' ') for sentence in corpus]
     dictionary = corpora.Dictionary(texts)
     corpus = [dictionary.doc2bow(text) for text in texts]
     tf_idf_model = TfidfModel(corpus, normalize=False)
     word_tf_tdf = list(tf_idf_model[corpus])
     # print('dictionary:', dictionary.token2id)
     # print('term counts:', corpus)
     # print('tf-idf weights:', word_tf_tdf)
     return word_tf_tdf,dictionary.token2id
    def _get_tfidf(self):
        # Convert document (a list of words) into the bag-of-words format
        docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
        model_tf_idf = TfidfModel(docs_corpus, id2word=self.docs_dict)
        docs_tf_idf = model_tf_idf[docs_corpus]

        docs_tuples = []
        for c in docs_tf_idf:
            docs_tuples.append(sparse2full(c, len(self.docs_dict)))
            # print('ccc',len(c),c)
        tf_idf_vec = np.vstack(docs_tuples)
        return tf_idf_vec
Example #29
def news_recommend_keywords(keywords, num=10):
    keywords = [word for word in keywords.split()]
    path_df = "Pickles/News_central_rec2.pickle"

    with open(path_df, 'rb') as data:
        df = pickle.load(data)

    df['bag_of_words'] = ''
    columns = df.columns
    for index, row in df.iterrows():
        words = []
        for col in columns:
            if col == 'Content':
                words += row[col].split()
        words = list(set(words))
        # write back with .at: assigning to the row returned by iterrows() would not modify df
        df.at[index, 'bag_of_words'] = words

    processed_keywords = df.bag_of_words.to_list()
    dictionary = Dictionary(
        processed_keywords)  # create a dictionary of words from our keywords
    corpus = [dictionary.doc2bow(doc) for doc in processed_keywords]
    #create corpus where the corpus is a bag of words for each document

    tfidf = TfidfModel(corpus)  #create tfidf model of the corpus

    # Create the similarity data structure. This is the most important part where we get the similarities between the news.
    sims = MatrixSimilarity(tfidf[corpus], num_features=len(dictionary))
    query_doc_bow = dictionary.doc2bow(
        keywords)  # get a bag of words from the query_doc
    query_doc_tfidf = tfidf[
        query_doc_bow]  #convert the regular bag of words model to a tf-idf model where we have tuples
    # of the news ID and it's tf-idf value for the news

    similarity_array = sims[
        query_doc_tfidf]  # get the array of similarity values between our news and every other news.
    #So the length is the number of news we have. To do this, we pass our list of tf-idf tuples to sims.

    similarity_series = pd.Series(similarity_array.tolist(),
                                  index=df.Title.values)  #Convert to a Series
    top_hits = similarity_series.sort_values(
        ascending=False)[:num]  #get the top matching results,
    # i.e. most similar news

    titles = []
    scores = []
    for idx, (title, score) in enumerate(zip(top_hits.index, top_hits)):
        #print("%d '%s' with a similarity score of %.3f" %(idx+1, title, score))
        titles.append(title)
        scores.append(score)

    return titles, scores
Example #30
def transform_to_sparse(infiles,
                        N,
                        feature_size,
                        vectorizer=None,
                        feature_weight='logent'):
    """
	Param
	------
	infiles: dict, in the form {word_file: "path", lemma_file: "path"}
	N: the number of instances in the file
	feature_size: int
	vectorizer: sklearn vectorizer
	feature_weight: {'logent', 'tfidf', 'binary'}, weighting scheme
	
	Return
	------
	X: sparse matrix, feature representation of infiles with specific weighting scheme
	y: 1d array,  indicators of labels in infiles
	
	"""
    infile = infiles['word_file'] if infiles[
        'word_file'] is not None else infiles['lemma_file']
    if vectorizer is not None:
        if feature_weight == 'binary':
            X = vectorizer.fit_transform(get_line_as_str(**infiles))
            y = get_y(infile)
        else:
            X = Scipy2Corpus(
                vectorizer.fit_transform(get_line_as_str(**infiles)))
            if feature_weight == 'tfidf':
                weighting_scheme = TfidfModel(X)
            elif feature_weight == 'logent':
                weighting_scheme = LogEntropyModel(X)
            x = weighting_scheme[X]
            y = get_y(infile)
            data = []
            rows = []
            cols = []
            line_count = 0
            for line in x:
                for elem in line:
                    rows.append(line_count)
                    cols.append(elem[0])
                    data.append(elem[1])
                line_count += 1
            print(len(data))
            print(len(rows))
            print(len(cols))
            print(N)
            print(feature_size)
            X = csr_matrix((data, (rows, cols)), shape=(N, feature_size))
    return X, y
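The manual data/rows/cols loop converts a gensim streamed corpus into a SciPy CSR matrix by hand; gensim ships a helper that does the same conversion. A shorter equivalent sketch, under the same assumptions about x and feature_size:

from gensim.matutils import corpus2csc

# corpus2csc builds a (num_terms x num_docs) sparse matrix, so transpose to get documents as rows
X = corpus2csc(x, num_terms=feature_size).T.tocsr()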