Example #1
def main():
    news_df = pd.read_pickle("news_df.pkl")

    # Bag Of Words - Vocab 1
    dictionary: Dictionary = Dictionary.load('vocabulary1.gensim')
    bow_voc1_corpus = [
        dictionary.doc2bow(doc_tokens) for doc_tokens in news_df['DocTokens']
    ]
    with open('bow_voc1_corpus.pkl', 'wb') as f:
        pickle.dump(bow_voc1_corpus, f)

    # Bag Of Words - Vocab 2
    dictionary2: Dictionary = Dictionary.load('vocabulary2.gensim')
    bow_voc2_corpus = [
        dictionary2.doc2bow(doc_tokens) for doc_tokens in news_df['DocTokens']
    ]
    with open('bow_voc2_corpus.pkl', 'wb') as f:
        pickle.dump(bow_voc2_corpus, f)

    # TF-IDF - Vocab 1
    tfidf1 = TfidfModel(bow_voc1_corpus, smartirs='ntc')
    tfidf_voc1_corpus = tfidf1[bow_voc1_corpus]
    with open('tfidf_voc1_corpus.pkl', 'wb') as f:
        pickle.dump(tfidf_voc1_corpus, f)

    # TF-IDF - Vocab 2
    tfidf2 = TfidfModel(bow_voc2_corpus, smartirs='ntc')
    tfidf_voc2_corpus = tfidf2[bow_voc2_corpus]
    with open('tfidf_voc2_corpus.pkl', 'wb') as f:
        pickle.dump(tfidf_voc2_corpus, f)
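A hedged follow-up sketch (not part of the original snippet) showing how a later step could reload the saved artifacts; it assumes the same file names used above.

import pickle

from gensim.corpora import Dictionary

dictionary = Dictionary.load('vocabulary1.gensim')
with open('bow_voc1_corpus.pkl', 'rb') as f:
    bow_voc1_corpus = pickle.load(f)

# Map the first document's token ids back to words, with their counts.
print([(dictionary[token_id], count) for token_id, count in bow_voc1_corpus[0]])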
Example #2
 def __init__(self, name, dirs="D:\\Stack Flow\\data\\", n=200, flag=False):
     '''
     Load a single keyword document.
         name -- file name of the keyword document (its numeric id plus ".kdoc")
         dirs -- directory containing the data files
         n    -- number of lowest-IDF words to extract (200 by default)
         flag -- if True, compute the tf/idf statistics immediately
     '''
     self.n = n
     self.dir = dirs
     self.filename = name
     self.id = name.replace(".kdoc", "")
     d = Dictionary.load(dirs + "keyword.dict")
     self.tag = d[int(self.id)]
     with open(dirs + "kdoc\\" + name, 'r') as f:
         context = f.read()
     with open(dirs + "stopword.txt", 'r') as f:
         self.stopword = f.read().split('\n')
     doc = context.split("\n")
     self.raw = context
     self.context = [[
         w for w in word.split()
         if w not in self.stopword and w[0].isalpha()
     ] for word in doc if len(word) > 10]
     self.dict = Dictionary.load(dirs + "facebook.dict")
     self.cod_context = [self.dict.doc2bow(doc) for doc in self.context]
     self.model = TfidfModel(self.cod_context)
     self.tf = TfidfModel(dictionary=self.dict)
     self.mixture = {}
     self.flag = flag
     if flag:
         self.idfs()
         self.tfs()
Example #3
    def build_model(self):
        start = time.time()
        tickers = [
            i for i in os.listdir(dir_cleaned_news) if i.endswith(".csv")
        ]
        corpus = []
        for ticker in tickers:
            df = pd.read_csv(dir_cleaned_news + ticker, index_col=0)
            for tokenized_doc in tokenizer(df['content'], self.phraser):
                corpus += [self.dic.doc2bow(tokenized_doc)]

        if self.wlocal and self.wglobal:
            self.get_model = TfidfModel(corpus,
                                        dictionary=self.dic,
                                        wlocal=self.wlocal,
                                        wglobal=self.wglobal,
                                        smartirs=self.smartirs)
        else:
            self.get_model = TfidfModel(corpus,
                                        dictionary=self.dic,
                                        smartirs=self.smartirs)

        end = time.time()

        print(f"{self.name} finished in {end - start:.2f} seconds")
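For reference, wlocal and wglobal in the call above are callables that override gensim's default term-frequency and document-frequency weighting. A hedged sketch of values that could be passed in (df2idf is gensim's own idf helper; the exact choices here are only illustrative):

import math
from functools import partial

from gensim.models.tfidfmodel import df2idf

wlocal = math.sqrt                  # dampen raw term frequency
wglobal = partial(df2idf, add=1.0)  # smoothed idf, called as wglobal(docfreq, totaldocs)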
Example #4
def create_model(numTopics, docs, vmType, alpha):
    dct = Dictionary(docs)
    corpus = [dct.doc2bow(line) for line in docs]

    # Build Vector Models ######
    # Binary Model
    if vmType == 'B':
        model = TfidfModel(corpus, smartirs='bnn') # Binary term frequency weighting

    # TFIDF Model
    elif vmType == 'T':
        model = TfidfModel(corpus, smartirs='tfn') # TF-IDF weighting (natural tf, idf, no normalization)

    # Term Frequency model
    elif vmType == 't':
        model = TfidfModel(corpus, smartirs='tnn') # Term frequency only

    else:
        raise ValueError("Invalid Vector Model parameter: {}".format(vmType))

    # Build LDA Model ##########
    corpus = model[corpus]
    lda = LdaModel(corpus=corpus, id2word=dct, num_topics=numTopics, alpha=alpha)

    return lda, corpus, dct
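A hedged usage sketch for create_model with a tiny made-up token list; vmType='T' selects the TF-IDF weighting:

docs = [["cheese", "pizza", "tasty"],
        ["pizza", "delivery", "late"],
        ["cheese", "wine", "pairing"]]
lda, corpus, dct = create_model(numTopics=2, docs=docs, vmType='T', alpha='auto')
print(lda.print_topics(num_topics=2, num_words=3))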
Example #5
def generate_model(dictionary, bow_corpus, corpus_path):
    try:
        tfidf = TfidfModel.load(corpus_path + 'wiki-tfidf.model')
        print('tfidf model loaded')
    except FileNotFoundError:
        tfidf = TfidfModel(bow_corpus, id2word=dictionary)
        tfidf.save(corpus_path + 'wiki-tfidf.model')
        print('tfidf model generated')
    return tfidf
Example #6
 def create_tfidf(self):
     tfidf_model = TfidfModel(self.bag_of_words)
     self.bag_of_words = [
         tfidf_model[vector]
         for vector in tqdm(self.bag_of_words,
                            desc="Creating tf-idf matrix")
     ]
Example #7
def topic_extraction(reviews, title):
    nooftopics = 10
    # Joining bigrams for a review to capture the phrases like 'tasty pizza'
    for id, review in enumerate(reviews):
        reviews[id] = review + ["_".join(w) for w in ngrams(review, 2)]
    # Create a dictionary of words from the overall reviews.
    dictionary = Dictionary(reviews)
    # Convert reviews into a bag-of-words (unigram) model; the feature here is term frequency.
    corpus = [dictionary.doc2bow(review) for review in reviews]
    # Transform the term-frequency corpus into a TF-IDF-weighted corpus.
    tfidf = TfidfModel(corpus)
    tfidf_corpus = tfidf[corpus]
    # Making a lda model.
    lda_tf = ldamodel.LdaModel(corpus,
                               id2word=dictionary,
                               alpha='auto',
                               num_topics=nooftopics,
                               passes=5)
    lda_tfidf = ldamodel.LdaModel(tfidf_corpus,
                                  id2word=dictionary,
                                  alpha='auto',
                                  num_topics=nooftopics,
                                  passes=5)
    # with open("lda.pkl", "wb") as f:
    #     pickle.dump(lda, f)
    topic_list = lda_tf.print_topics(num_topics=10, num_words=10)
    topic_list_tfidf = lda_tfidf.print_topics(num_topics=10, num_words=10)

    with open("topic_list.pkl", "wb") as f:
        pickle.dump(topic_list, f)
    draw_graph_for_topics(topic_list, title, nooftopics)
    return True
Example #8
    def __init__(self, documents):
        print("Initializing GloVe")
        if isinstance(documents[0], list):
            print("It is a list")
            documents = [" ".join(document) for document in documents
                         if isinstance(document, list)]

        documents = [str(document) for document in documents]

        self.corpus = [
            preprocess(document) for document in documents
            if isinstance(document, str)
        ]
        self.documents = documents
        '''
        Then we create a similarity matrix that contains the similarity between each pair of words,
        weighted using the term frequency:
        '''
        # Load the model: this is a big file, can take a while to download and open
        glove = api.load("glove-wiki-gigaword-50")
        print("Document loaded")
        self.similarity_index = WordEmbeddingSimilarityIndex(glove)
        self.dictionary = Dictionary(self.corpus)
        self.tfidf = TfidfModel(dictionary=self.dictionary)
        print("Model is running")

        # Create the term similarity matrix.
        self.similarity_matrix = SparseTermSimilarityMatrix(
            self.similarity_index, self.dictionary, self.tfidf)
        print("Everything has been initialized")
Example #9
def train_tfidf_model():
    corpus, dictionary, titles = retrieve_data()
    # first, construct tfidf
    print("tfidf")
    tfidf = TfidfModel(corpus)  # initialize model
    corpus_tfidf = tfidf[corpus]
    return corpus_tfidf, dictionary, titles
Example #10
    def train(self):
        if not os.path.exists(os.path.join(DATA_ANSWER_PATH, 'tfidf.model')):
            traindata = p.load(open(CORPUS_PATH, 'rb'))
            for qid in self.trainset:
                duplicates = self.trainset[qid]['duplicates']
                for duplicate in duplicates:
                    question = duplicate['rel_question']['tokens']
                    traindata.append(question)

                    rel_comments = duplicate['rel_comments']
                    for rel_comment in rel_comments:
                        q2 = rel_comment['tokens']
                        traindata.append(q2)

            self.dict = Dictionary(traindata)  # fit dictionary
            corpus = [self.dict.doc2bow(line)
                      for line in traindata]  # convert corpus to BoW format
            self.tfidf = TfidfModel(corpus)  # fit model
            self.dict.save(os.path.join(DATA_ANSWER_PATH, 'dict.model'))
            self.tfidf.save(os.path.join(DATA_ANSWER_PATH, 'tfidf.model'))
        else:
            self.dict = Dictionary.load(
                os.path.join(DATA_ANSWER_PATH, 'dict.model'))
            self.tfidf = TfidfModel.load(
                os.path.join(DATA_ANSWER_PATH, 'tfidf.model'))
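A hedged companion sketch (method name hypothetical) showing how the dictionary and TF-IDF model trained above might score two tokenized questions with cosine similarity, using gensim's matutils helper:

from gensim import matutils

def score(self, q1_tokens, q2_tokens):
    # Cosine similarity between the TF-IDF vectors of two token lists.
    v1 = self.tfidf[self.dict.doc2bow(q1_tokens)]
    v2 = self.tfidf[self.dict.doc2bow(q2_tokens)]
    return matutils.cossim(v1, v2)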
Example #11
    def walid_similarity_query(self, answer: str, key: str):
        if len(answer) == 0 or len(key) == 0:
            return False

        if self.model_ready:
            documents = [answer, key]

            if self.verbose:
                print(
                    f'{len(documents)} documents loaded and ready to preprocess'
                )

            corpus = [self.preprocess(document) for document in documents]

            if self.verbose:
                print(f'{len(corpus)} documents loaded into corpus')

            dictionary = Dictionary(corpus)
            tfidf = TfidfModel(dictionary=dictionary)
            similarity_matrix = SparseTermSimilarityMatrix(
                self.similarity_index, dictionary, tfidf)

            answer_bow = dictionary.doc2bow(self.preprocess(answer))
            key_bow = dictionary.doc2bow(self.preprocess(key))

            # Measure soft cosine similarity
            scores = similarity_matrix.inner_product(answer_bow,
                                                     key_bow,
                                                     normalized=True)

            return scores

        else:
            raise NotReadyError('Word embedding model is not ready.')
Example #12
    def _build_vocab(self, max_vocab_cnt):
        all_words = []
        for data in self.valid + self.non_valid:
            all_words.append(data["title"] + data["content"])
        vocab = Dictionary(all_words)
        raw_vocab_size = len(vocab)

        vocab.filter_extremes(no_below=5)
        vocab.filter_extremes(keep_n=max_vocab_cnt)
        len_1_words = list(
            filter(
                lambda w: len(w) == 1 and re.match(r"[\x00-\x7f]", w) and w
                not in ["a", "i"], vocab.values()))
        vocab.filter_tokens(list(map(vocab.token2id.get, len_1_words)))
        if self.config.use_dict == "seq" and self.config.enable_pad:
            vocab.token2id[PAD] = len(vocab)
            vocab.compactify()
            self.pad_wid = vocab.token2id.get(PAD)
        self.vocab_seq = vocab  # seq dictionary
        # build bow dictionary
        self.vocab_bow = copy.deepcopy(vocab)
        self.vocab_bow.filter_tokens(
            map(self.vocab_bow.token2id.get, STOPWORDS))  # filter stop words
        self.vocab_bow.compactify()
        if self.config.tfidf:
            tfidf_corpus = [self.vocab_bow.doc2bow(line) for line in all_words]
            self.tfidf_model = TfidfModel(tfidf_corpus)
        print("Load corpus with non_valid size %d, valid size %d, "
              "raw vocab size %d seq vocab size %d, bow vocab size %d" %
              (len(self.non_valid), len(self.valid), raw_vocab_size,
               len(self.vocab_seq), len(self.vocab_bow)))
Example #13
    def train(self, classdict, nb_topics, *args, **kwargs):
        """ Train the topic modeler.

        :param classdict: training data
        :param nb_topics: number of latent topics
        :param args: additional arguments to pass to the underlying gensim topic model
        :param kwargs: additional keyword arguments to pass to the underlying gensim topic model
        :return: None
        :type classdict: dict
        :type nb_topics: int
        """
        self.nb_topics = nb_topics
        self.generate_corpus(classdict)
        if self.toweigh:
            self.tfidf = TfidfModel(self.corpus)
            normcorpus = self.tfidf[self.corpus]
        else:
            self.tfidf = None
            normcorpus = self.corpus

        self.topicmodel = gensim_topic_model_dict[self.algorithm](
            normcorpus, num_topics=self.nb_topics, *args, **kwargs)
        self.matsim = MatrixSimilarity(self.topicmodel[normcorpus])

        # change the flag
        self.trained = True
Example #14
    def __init__(self, sentences_file, stopwords):
        self.dictionary = None
        self.corpus = None
        f_sentences = codecs.open(sentences_file, encoding='utf-8')
        documents = list()
        count = 0
        print "Gathering sentences and removing stopwords"
        for line in f_sentences:
            line = re.sub('<[A-Z]+>[^<]+</[A-Z]+>', '', line)

            # remove stop words and tokenize
            document = [
                word for word in TreebankWordTokenizer().tokenize(line.lower())
                if word not in stopwords
            ]
            documents.append(document)
            count += 1
            if count % 10000 == 0:
                sys.stdout.write(".")

        f_sentences.close()

        self.dictionary = corpora.Dictionary(documents)
        self.corpus = [self.dictionary.doc2bow(text) for text in documents]
        self.tf_idf_model = TfidfModel(self.corpus)

        # print(documents)
        print(len(documents), "documents read")
        print(len(self.dictionary), "unique tokens:", self.dictionary)
Example #15
def create_tfidf_from_papers(
    path_to_jsonl_index: Path = BIOPAPERS_JSON_PATH,
    path_to_bow: Path = BOW_PATH,
    outfile: Path = TFIDF_VECTORIZER,
) -> TfidfModel:
    """
    Creates TFIDF model from BOW corpora.

    Parameters
    ----------
    path_to_jsonl_index: Path
        Path to json lines index
    path_to_bow: Path
        Path to Bag of Words Dictionary
    outfile: Path
        Path to TFIDF vectorizer

    Returns
    -------
    tfidf_model: TfidfModel
        Gensim TFIDF Model
    """
    # Load dictionary
    dictionary = Dictionary.load(str(path_to_bow))
    # Load corpus generator
    corpus = BiopapersCorpus(dictionary, path_to_jsonl_index)
    # Train TFIDF
    tfidf_model = TfidfModel(corpus)
    # Save TFIDF model to file:
    tfidf_model.save(str(outfile))

    return tfidf_model
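A hedged usage sketch, assuming the module-level default paths and a made-up query:

from gensim.corpora import Dictionary

tfidf_model = create_tfidf_from_papers()
dictionary = Dictionary.load(str(BOW_PATH))
bow = dictionary.doc2bow("protein folding structure prediction".split())
print(tfidf_model[bow])  # [(term_id, weight), ...]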
Example #16
def tfidf_filter(dataset, threshold):
    tokens = []
    #print('tokenizing documents...')
    for doc in dataset:
        #doc = clean_text(doc)
        tokenize = regTokenize(doc)
        tokens.append(tokenize)
    #print('creating dictionary...')
    dct = Dictionary(tokens)
    corpus = [dct.doc2bow(line) for line in tokens]
    #print(len(corpus))
    #print('creating tf-idf model...')
    model = TfidfModel(corpus, id2word=dct)
    low_value_words = []
    for bow in corpus:
        low_value_words += [
            id for id, value in model[bow] if (value < threshold)
        ]  #and dct[id] != "reforma_tributaria")]
    #print("low_value_words:",len(low_value_words))
    dct.filter_tokens(bad_ids=low_value_words)
    new_corpus = [dct.doc2bow(doc) for doc in tokens]
    #print(len(new_corpus))
    corp = []
    for doc in new_corpus:
        corp.append([dct[id] for id, value in doc])
    return corp
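A small hedged example of tfidf_filter on toy data; the 0.1 threshold is arbitrary and regTokenize is assumed to be the project's own tokenizer:

dataset = [
    "the economy grew faster than expected this quarter",
    "the new tax reform divides the economy experts",
    "football season starts this weekend",
]
# Tokens whose TF-IDF weight falls below the threshold in some document are dropped from the vocabulary.
filtered_docs = tfidf_filter(dataset, threshold=0.1)
print(filtered_docs)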
Example #17
 def summarize(self, text):
     self.sentences = self.factory.text2sentences(text)
     self.num_sentences = len(self.sentences)
     self.corpus = SentenceCorpus(self.sentences, self.max_dictionary_size)
     self.model = TfidfModel(self.corpus.bows, id2word=self.corpus.dictionary, normalize=True)
     self._inject_tfidfs()
     self._build_matrix()
Example #18
def build_tfid_model(dictionary, corpus, should_rebuild):
    tfid = list()

    # DEBUG
    should_rebuild = True

    if not should_rebuild:
        try:
            print('Loading TFID Model backup...')
            tfid_file = utils.get_file_path(cfg.TFID_BACKUP)
            print('TFID file = {}'.format(tfid_file))

            tfid = TfidfModel.load(tfid_file)

        except Exception as exc:
            utils.print_exception_details('Building TFID Model', exc)

    else:
        print('Building TFID Model...')
        tfid = TfidfModel(corpus)
        print('Done!')
        # Save Model Structures
        TFID_FILE = utils.get_file_path(cfg.TFID_BACKUP)
        tfid.save(TFID_FILE)

    return tfid
Example #19
def tfidf_similarity(corpus, dictionary, categories, seed_article_title):
    mm, metadata, index = corpus
    
    # Create tfidf model
    tfidf = TfidfModel(dictionary=dictionary)
    
    # Get offset of seed article
    seed_article_offset = None
    for article_index, offset in enumerate(index):
        article_id, article_title = metadata[article_index]
        if article_title == seed_article_title:
            seed_article_offset = offset
            
    # Load seed article
    if seed_article_offset is None:
        logging.error('Seed article "%s" not found', seed_article_title)
    else:
        logging.info('Loading seed article "%s"', seed_article_title)
        seed_article = dict(mm.docbyoffset(seed_article_offset))
        
        def tfidf_similarity_query(title, content):
            tokens = wikicorpus.tokenize(wikicorpus.filter_wiki(content))
            vector = dict(tfidf[dictionary.doc2bow(tokens)])
            return cosine_similarity(seed_article, vector)
        return SearchQuery(tfidf_similarity_query)
Example #20
    def tfidf_model(self):
        print('Logging Info - Get Tf-idf model...')
        tfidf_model_path = os.path.join(FEATURE_DIR,
                                        '{}_tfidf.model').format(self.genre)
        dict_path = os.path.join(FEATURE_DIR,
                                 '{}_tfidf.dict').format(self.genre)
        if os.path.exists(tfidf_model_path):
            dictionary = pickle_load(dict_path)
            tfidf_model = TfidfModel.load(tfidf_model_path)
        else:
            corpus = [
                text.split() for text in self.train_data['premise'] +
                self.train_data['hypothesis'] + self.dev_data['premise'] +
                self.dev_data['hypothesis'] + self.test_data['premise'] +
                self.test_data['hypothesis']
            ]
            dictionary = corpora.Dictionary(corpus)
            corpus = [dictionary.doc2bow(text) for text in corpus]
            tfidf_model = TfidfModel(corpus)

            del corpus
            tfidf_model.save(tfidf_model_path)
            pickle_dump(dict_path, dictionary)

        return dictionary, tfidf_model
Example #21
def train(file=DATA_FILE, type=JSON):
    delete_previous_models()

    faq_df = get_dataframe(os.path.join(DATA_DIR, file), type=type)
    faq_df = clean_data(faq_df)
    faq_df[PROCESSED_QUESTION] = faq_df[CLEAN_QUESTION].apply(preprocess)
    faq_df[PROCESSED_ANSWER] = faq_df[CLEAN_ANSWER].apply(preprocess)
    print('Preprocessing Done')
    if DEBUG:
        print(faq_df.head())

    for mode in modes:
        model = modes[mode]
        dictionary = corpora.Dictionary(faq_df[model.column])
        dictionary.save(os.path.join(MODEL_DIR, model.dictionary))
        corpus = faq_df[model.column].map(dictionary.doc2bow)
        if DEBUG:
            print(f'{model.corpus} generated')
            print(corpus.head())
        corpora.MmCorpus.serialize(os.path.join(MODEL_DIR, model.corpus),
                                   corpus)
        tfidf_model = TfidfModel(corpus)
        if DEBUG:
            print(f'{model.tfidf} generated')
        tfidf_model.save(os.path.join(MODEL_DIR, model.tfidf))
        tfidf = tfidf_model[corpus]
        lda_model = LdaMulticore(corpus=tfidf,
                                 id2word=dictionary,
                                 num_topics=30)
        lda_model.save(os.path.join(MODEL_DIR, model.model))
        if DEBUG:
            print(f'{model.model} generated')
            print(lda_model.print_topics(5))
    print('Training completed')
Example #22
def get_corpus(docs):
    print("Building corpus ...")
    tfidf_model = None

    # load corpus from disk 
    if ARGS.load_corpus: 
        corpus = MmCorpus(ARGS.path_corpus)

    else:
        corpus = [dictionary.doc2bow(doc) for doc in docs]

        # serialize corpus to disk to prevent memory problems if corpus gets too large
        MmCorpus.serialize(ARGS.save_dir + '/corpora/corpus_bow.mm', corpus)  
        corpus = MmCorpus(ARGS.save_dir + '/corpora/corpus_bow.mm')

        if ARGS.corpus_type == "TFIDF": 
            tfidf_model = TfidfModel(corpus)

            tfidf_model.save(ARGS.save_dir + "/models/tfidf_model.mm")
            corpus = tfidf_model[corpus]

            # serialize corpus to disk to prevent memory problems if corpus gets too large
            MmCorpus.serialize(ARGS.save_dir + '/corpora/corpus_tfidf.mm', corpus)  
            corpus = MmCorpus(ARGS.save_dir + '/corpora/corpus_tfidf.mm')
    return corpus, tfidf_model
Example #23
    def recommend(self, article):
        article.tokenize().remove_stop_words().lemmatize()
        n_grams = article.get_n_grams(2)

        vocab = Dictionary([n_grams])
        corpus = [vocab.doc2bow(n_grams)]  # convert corpus to BoW format
        model = TfidfModel(corpus)

        vector = 0
        for n in model[corpus[0]]:
            vector += n[1]

        distances = []

        for c in self.new_centroids:
            distance = math.fabs(np.linalg.norm(vector - c))
            distances.append(distance)

        min = np.array(distances).argmin()

        all_recommended = []
        i = 0
        for c in self.clusters:
            if c == min:
                all_recommended.append(i)
            i += 1

        print(all_recommended)

        recommended_article_ids = []
        for i in range(0, 3):
            random_article = random.choice(all_recommended)
            recommended_article_ids.append(random_article)

        return recommended_article_ids
Example #24
def getSparseMatrixSimilarity(keyword, texts):

    # 1. Tokenize every text in the collection into a word list
    texts = [jieba.lcut(text) for text in texts]

    # 2. Build a dictionary from the text collection and get the number of features
    dictionary = Dictionary(texts)
    num_features = len(dictionary.token2id)

    # 3.1 Using the dictionary, convert the tokenized texts into sparse vectors (the corpus)
    corpus = [dictionary.doc2bow(text) for text in texts]
    # 3.2 Likewise, convert the search keyword into a sparse vector with the same dictionary
    kw_vector = dictionary.doc2bow(jieba.lcut(keyword))

    # 4. Create a TF-IDF model, trained on the corpus
    tfidf = TfidfModel(corpus)
    # 5. Apply the trained TF-IDF model to the indexed texts and the keyword
    tf_texts = tfidf[corpus]  # the corpus doubles as the collection being searched
    tf_kw = tfidf[kw_vector]
    # 6. Compute the similarities
    sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
    similarities = sparse_matrix.get_similarities(tf_kw)
    for e, s in enumerate(similarities, 1):
        print('similarity between kw and text%d: %.2f' % (e, s))

    print(sparse_matrix)
    print(similarities)
Example #25
def samilarRate(texts, keyword):
    # Inputs: texts (the text collection) and keyword (the search term)

    # 1. Tokenize every text in the collection into a word list
    texts = [lcut(text) for text in texts]

    # 2. Build a dictionary from the text collection and get the number of features
    dictionary = Dictionary(texts)
    num_features = len(dictionary.token2id)

    # 3.1 Using the dictionary, convert the tokenized texts into sparse vectors (the corpus)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # 3.2 Likewise, convert the search keyword into a sparse vector with the same dictionary
    kw_vector = dictionary.doc2bow(lcut(keyword))

    # 4. Create a TF-IDF model, trained on the corpus
    tfidf = TfidfModel(corpus)

    # 5. Apply the trained TF-IDF model to the indexed texts and the keyword
    tf_texts = tfidf[corpus]  # the corpus doubles as the collection being searched
    tf_kw = tfidf[kw_vector]

    # 6. Compute the similarities
    sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
    similarities = sparse_matrix.get_similarities(tf_kw)
    result = []
    sorft = []
    for e, s in enumerate(similarities, 1):
        result.append('similarity between kw and text%d: %.2f' % (e, s))
        sorft.append(s)
    return result, sorft
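A hedged usage sketch of samilarRate; lcut is assumed to be jieba's tokenizer, hence the Chinese toy texts:

texts = ['北京是中国的首都', '上海是中国的金融中心', '巴黎是法国的首都']
result, scores = samilarRate(texts, '中国的首都')
for line in result:
    print(line)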
Example #26
def tfidf_w2v_top5w(all_docs_prepro):
    # TFIDF MODEL
    exists = os.path.isfile('embedding/models/tfidf_all.model')
    if exists:
        print('Tfidf embedding model already existing')
    else:
        dct = Dictionary(all_docs_prepro)  # fit dictionary
        corpus = [dct.doc2bow(line)
                  for line in all_docs_prepro]  # convert corpus to BoW format
        model_tfidf = TfidfModel(corpus)
        word_path = 'embedding/models/tfidf_all.model'
        model_tfidf.save(word_path)

    # WORD2VEC MODEL
    exists = os.path.isfile('embedding/models/word2vec_all.model')
    if exists:
        print('Word2vec embedding model already existing')
    else:
        print('Training word2vec on all answers')
        word_path = "embedding/models/word2vec_all.model"
        word_tempfile = get_tmpfile(word_path)
        word_model = Word2Vec(all_docs_prepro,
                              size=128,
                              window=5,
                              min_count=1,
                              workers=4)
        word_model.save(word_path)
Example #27
 def train(self):
     tfidf = TfidfModel(corpus)
     corpus_tfidf = tfidf[corpus]
     lda = LdaMulticore(corpus=corpus_tfidf,
                        id2word=dictionary,
                        num_topics=100)
     lda.save('lda.model')
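The method above assumes corpus and dictionary already exist in scope; a hedged sketch of how they would typically be built beforehand (toy tokens, hypothetical):

from gensim.corpora import Dictionary

tokenized_docs = [["solar", "power", "grid"], ["wind", "turbine", "grid"]]
dictionary = Dictionary(tokenized_docs)
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]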
Example #28
def cluster_data(state):
    # Bigram model
    data_words_bigrams = make_bigrams(state)
    INPUT = data_words_bigrams
    # Create Dictionary
    id2word = corpora.Dictionary(INPUT)
    # Create Corpus
    texts = INPUT
    # Filter out words that occur less than and greater than
    id2word.filter_extremes(no_below=state.no_below, no_above=state.no_above)
    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]
    tfidf = TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    TOPICS_LIST = range(1, state.lda_topics + 1)
    lda_models = []
    coherence_scores = []
    for TOPICS in TOPICS_LIST:
        lda_model = run_LDA_model(corpus, id2word, TOPICS)
        lda_models.append(lda_model)

        coherence_model_lda = CoherenceModel(model=lda_model,
                                             texts=data_words_bigrams,
                                             dictionary=id2word,
                                             coherence='c_v')
        score = coherence_model_lda.get_coherence()
        coherence_scores.append(score)

    return coherence_scores, lda_models, corpus
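A hedged follow-up showing how the returned coherence scores might be used to pick a topic count, given a configured state object (numpy assumed available):

import numpy as np

coherence_scores, lda_models, corpus = cluster_data(state)
best_idx = int(np.argmax(coherence_scores))
print("best num_topics = %d, c_v = %.3f" % (best_idx + 1, coherence_scores[best_idx]))
best_model = lda_models[best_idx]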
Example #29
def make_cor_dict(tf_idf=True, is_reply=IS_REPLY):
    print("Generating Corpus | Dictionary...")

    videos = get_data()

    corpus = []
    dictionary = corpora.Dictionary()

    for video in videos:
        for comment in video['comments']:
            if is_reply:
                for reply in comment['replies']:
                    dictionary.add_documents([reply['tokens']])
                    corpus += [reply['tokens']]
            dictionary.add_documents([comment['tokens']])
            corpus += [comment['tokens']]
        dictionary.add_documents([video['tokens']])
        corpus += [video['tokens']]

    # Final dictionary filtering by frequency and length => worth reconsidering, since there are very few tokens per comment.
    dictionary.filter_extremes(no_below=MIN_COUNT)

    print("Corpus: ", len(corpus))

    # Encode every token as an integer id based on the dictionary
    corpus = [dictionary.doc2bow(tokens) for tokens in corpus]

    # Apply the TF-IDF weighting to the corpus
    if tf_idf:
        print(":::: Applying TF-IDF...")
        tfidf = TfidfModel(corpus)
        corpus = tfidf[corpus]

    return corpus, dictionary
Example #30
def create_corpus_and_dict(documents):
    """
    Retrieve all the necessary data to train the LSI model.
    """
    if not os.path.exists('./tmp/dictionary.dict'):
        print("Starting construction dictionary now")
        dictionary = corpora.Dictionary(documents)
        dictionary.save('./tmp/dictionary.dict')
    else:
        print("Dictionary already constructed, loading now...")
        dictionary = corpora.Dictionary.load('./tmp/dictionary.dict')

    #construct BOW corpus
    if not os.path.exists('./tmp/bow_corpus.mm'):
        print("Starting construction bow corpus now")
        bow_corpus = [dictionary.doc2bow(text) for text in documents]
        corpora.MmCorpus.serialize('./tmp/bow_corpus.mm', bow_corpus)
    else:
        print('BOW corpus already created, loading now...')
        bow_corpus = corpora.MmCorpus('./tmp/bow_corpus.mm')

    #construct TFIDF corpus
    if not os.path.exists('./tmp/tfidf_corpus.mm'):
        print("Starting construction TFIDF corpus now")
        corpus = [dictionary.doc2bow(text) for text in documents]
        model_tfidf = TfidfModel(corpus)
        tfidf_corpus = model_tfidf[corpus]
        corpora.MmCorpus.serialize('./tmp/tfidf_corpus.mm', tfidf_corpus)
    else:
        print('TFIDF corpus already created, loading now...')
        tfidf_corpus = corpora.MmCorpus('./tmp/tfidf_corpus.mm')

    return dictionary, bow_corpus, tfidf_corpus
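A hedged follow-up sketch showing how the returned artifacts could feed the LSI model that the docstring mentions; num_topics=200 is an arbitrary choice and documents are the same tokenized texts passed in above:

from gensim.models import LsiModel

dictionary, bow_corpus, tfidf_corpus = create_corpus_and_dict(documents)
lsi = LsiModel(tfidf_corpus, id2word=dictionary, num_topics=200)
print(lsi.print_topics(5))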