Example #1
def lda_and_SVC(train_group, train_group_label, test_data, test_label,
                num_topics):

    dct, bow = create_bow(train_group)
    lda_model = LdaModel(corpus=bow, num_topics=num_topics, id2word=dct)
    document_topics = []
    for doc_bow in bow:
        ls = []
        for top, prob in lda_model.get_document_topics(
                bow=doc_bow, minimum_probability=0.0):
            ls.append(prob)
        document_topics.append(ls)

    t_document_topics = []
    for doc in test_data:
        doc_bow = dct.doc2bow(doc)
        ls = []
        for top, prob in lda_model.get_document_topics(
                bow=doc_bow, minimum_probability=0.0):
            ls.append(prob)
        t_document_topics.append(ls)

    clf = svm.LinearSVC()
    clf.fit(document_topics, train_group_label)

    y_pred = clf.predict(t_document_topics)
    return metrics.accuracy_score(test_label, y_pred)
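create_bow is not defined in this snippet; a minimal sketch of what it presumably does (build a gensim Dictionary and bag-of-words corpus from tokenized documents), assuming train_group is a list of token lists:

from gensim.corpora import Dictionary

def create_bow(tokenized_docs):
    # hypothetical helper matching the call above
    dct = Dictionary(tokenized_docs)
    bow = [dct.doc2bow(doc) for doc in tokenized_docs]
    return dct, bow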
Example #2
def topic_analysis(df, nTopics=5, cleanTextCol='cleaned_text'):
  df[cleanTextCol]=df[cleanTextCol].fillna('')
  cleandata = df[cleanTextCol].fillna('').apply(lambda x: x.split(' '))
  dictionary = corpora.Dictionary(cleandata)
  tokens = [dictionary.doc2bow(d) for d in cleandata]
  model = LdaModel(tokens, num_topics=nTopics, id2word=dictionary, 
                update_every=1, chunksize=50, passes=10,
                per_word_topics=True, alpha='auto')
  docweights = [model.get_document_topics(t, minimum_probability=0) for t in tokens]
  doctopics = pd.DataFrame(docweights).apply(lambda x: x.apply(lambda y: y[-1] if y else 0))
  doctopics.columns = [f'topic{n+1}' for n in doctopics.columns]
  doctopics['KeyTopic']=doctopics.apply(lambda y:doctopics.columns[y==y.max()][0], axis=1)

  # create topicdescribe
  topics = model.show_topics(num_words=6)
  keywords = [re.findall(r'\*"(.*?)"',d[1]) for d in topics]
  weights = [re.findall(r'([\d\.]+)\*', d[1]) for d in topics]
  kwdf= pd.DataFrame(keywords, columns=[f'keyword_{n}' for n in range(len(keywords[0]))])
  wtdf= pd.DataFrame(weights, columns=[f'weight_{n}' for n in range(len(weights[0]))])
  topicDescribe =  kwdf.merge(wtdf,left_index=True, right_index=True)
  topicDescribe = topicDescribe[sorted(topicDescribe.columns, key=lambda x: x.split('_')[-1])]
  topicDescribe['KeyTopic'] = [f'topic{n+1}' for n in range(len(topics))]
  topicDescribe['TopicKeywords'] = [' '.join(k) for k in keywords]
  topicDescribe['DocCount'] = doctopics['KeyTopic'].value_counts().sort_index().values
  topicDescribe = topicDescribe[['KeyTopic']+[col for col in topicDescribe.columns if \
    col != 'KeyTopic']]
  
  doctopics= doctopics.merge(topicDescribe[['KeyTopic','TopicKeywords']], on='KeyTopic', how='left')
  return doctopics, topicDescribe, model, tokens, dictionary
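A toy invocation sketch for topic_analysis, assuming a DataFrame with a 'cleaned_text' column of whitespace-separated tokens (illustrative data only; note the function assumes every topic ends up as KeyTopic for at least one document):

import pandas as pd

df = pd.DataFrame({'cleaned_text': [
    'cat dog pet vet',
    'dog walk park pet',
    'stock market price trade',
    'market price invest trade',
]})
doctopics, topicDescribe, model, tokens, dictionary = topic_analysis(df, nTopics=2)
print(topicDescribe[['KeyTopic', 'TopicKeywords', 'DocCount']])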
Example #3
def do_cluster(obj, query):
    texts = [article['title'] for article in obj]

    processor = Processor(query)

    tokens = [processor.get_tokens(text) for text in texts]

    dictionary = corpora.Dictionary(tokens)

    corpus = [dictionary.doc2bow(token) for token in tokens]

    num_clusters = len(texts) // 5  # integer division: LdaModel expects an int number of topics
    model = LdaModel(corpus,
                     num_topics=num_clusters,
                     id2word=dictionary,
                     update_every=5,
                     chunksize=10000,
                     passes=50)

    # size 10
    topic_matrix = model.show_topics(formatted=False, num_topics=num_clusters)

    clusters = [{
        "keywords": [str(word) for word, _ in topic[1]],
        "articles": []
    } for topic in topic_matrix]

    for i, document in enumerate(corpus):

        topic = np.array(model.get_document_topics(document))
        cluster = int(topic[np.argmax(topic[:, 1])][0])

        clusters[cluster]['articles'].append(obj[i])

    return clusters
Example #4
def get_topics(data, filepath='./data/spam_topics.pkl'):
    if not os.path.exists(filepath):
        import pyLDAvis.gensim
        from gensim.corpora import Dictionary
        from gensim.models import LdaModel, CoherenceModel

        texts = [sample['lemmas'] for sample in data]

        dictionary = Dictionary(texts)
        dictionary.filter_extremes(no_below=20, no_above=0.4)
        corpus = [dictionary.doc2bow(text) for text in texts]

        chunksize = 500
        passes = 5
        iterations = 400
        eval_every = None

        temp = dictionary[0]  # This is only to "load" the dictionary.
        id2word = dictionary.id2token

        best_coherence = 0
        best_model_filepath = ''
        for num_topics in list(range(2, 20)):
            for alpha in ['asymmetric', 'symmetric']:
                for eta in ['symmetric', 'auto']:
                    # use a separate variable so the function's `filepath` argument is not overwritten
                    model_filepath = 'out/topics/{}_{}_{}'.format(num_topics, alpha, eta)
                    model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, alpha=alpha, eta=eta, iterations=iterations, num_topics=num_topics, passes=passes, eval_every=eval_every)
                    coherence = float(CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v').get_coherence())
                    model_filepath += '_{:.4f}'.format(coherence)
                    model.save(model_filepath + '_model.pkl')

                    prepared = pyLDAvis.gensim.prepare(model, corpus, dictionary)
                    pyLDAvis.save_html(prepared, model_filepath + '_plot.html')

                    if coherence > best_coherence:
                        best_coherence = coherence
                        best_model_filepath = model_filepath + '_model.pkl'

        model = LdaModel.load(best_model_filepath)
        print('Best model: {}'.format(best_model_filepath))

        topics = [x[0] for x in model.top_topics(corpus=corpus, texts=texts, dictionary=dictionary, topn=100)]

        data_topics = []
        for i, text in enumerate(texts):
            data_topics.append({k: v for k, v in model.get_document_topics(dictionary.doc2bow(text), minimum_probability=0.0)})

        pickle.dump([topics, data_topics], open(filepath, 'wb'))
    else:
        [topics, data_topics] = pickle.load(open(filepath, 'rb'))

    for i in range(len(data_topics)):
        data[i]['topics'] = data_topics[i]

    return topics, data
Example #5
def get_document_topics_from_model(bow, lda: LdaModel) -> Dict[int, float]:
    """
    A method used concurrently in create_document_topics
    :param lda: the lda model
    :param text: a document string
    :param dictionary: the dictionary over the whole document
    :return: a dict with the topics in the given document based on the lda model
    """
    query = lda.get_document_topics(bow, minimum_probability=0.0)
    # 1/K is alternative threshold
    return dict(query)
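A minimal usage sketch, assuming a small tokenized corpus and an LdaModel trained on it:

from gensim.corpora import Dictionary
from gensim.models import LdaModel

docs = [["topic", "model", "lda"], ["word", "distribution", "topic"], ["corpus", "document", "word"]]
dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(d) for d in docs]
lda = LdaModel(corpus, id2word=dictionary, num_topics=2)

bow = dictionary.doc2bow(["topic", "word"])
print(get_document_topics_from_model(bow, lda))  # e.g. {0: 0.47..., 1: 0.52...} (values vary per run)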
Example #6
def Lda_topic_model(docs, dictionary, nb_topics, true_labels):
    lda = LdaModel(docs, num_topics=nb_topics, id2word=dictionary, passes=10)

    top_words = [[word[::-1] for word, _ in lda.show_topic(topic_id, topn=50)]  # words reversed, presumably for right-to-left display
                 for topic_id in range(lda.num_topics)]
    top_betas = [[beta for _, beta in lda.show_topic(topic_id, topn=50)]
                 for topic_id in range(lda.num_topics)]
    nb_words = 12
    f, ax = plt.subplots(3, 2, figsize=(20, 15))
    for i in range(nb_topics):
        # ax = plt.subplot(gs[i])
        m, n = np.unravel_index(i, shape=(3, 2))
        ax[m, n].barh(range(nb_words),
                      top_betas[i][:nb_words],
                      align='center',
                      color='green',
                      ecolor='black')
        ax[m, n].invert_yaxis()
        ax[m, n].set_yticks(range(nb_words))
        ax[m, n].set_yticklabels(top_words[i][:nb_words])
        ax[m, n].set_title("Topic " + str(i))
    plt.show()
    # get distribution of docs on topics.
    dist_on_topics = lda.get_document_topics(docs)
    topic_predict = []
    for d in dist_on_topics:
        p = 0
        win_topic = 0
        print(d)
        for i, t in enumerate(d):
            if t[1] > p:
                p = t[1]
                win_topic = t[0]
        print(win_topic)
        topic_predict.append(win_topic)
    mat = confusion_matrix(true_labels, topic_predict)
    print(mat)
    cluster_to_class = {}
    for i in range(nb_topics):
        cluster_to_class[i] = np.argmax(mat[:, i])
    custom_labels = [cluster_to_class[c] for c in topic_predict]
    print("accuracy:", accuracy_score(true_labels, custom_labels))
    print("f1_score micro: ",
          f1_score(true_labels, custom_labels, average='micro'))
    print("f1_score: macro",
          f1_score(true_labels, custom_labels, average='macro'))
    print("NMI", NMI(true_labels, custom_labels))
Example #7
    def calculate_fitness(self, gene):
        # Make LDA model
        self.fitness_budget -= 1
        lda = LdaModel(corpus=self.corpus,
                       id2word=self.dictionary,
                       num_topics=gene.n,
                       alpha=gene.a)

        if self.objective == 'coherence':
            cm = CoherenceModel(model=lda,
                                corpus=self.corpus,
                                coherence='u_mass')
            result = cm.get_coherence()

        elif self.objective == 'silhouette':
            labels = []
            word_cntLst = []
            if (len(self.corpus) < 2):
                gene.set_fitness(-99)
                return -99
            for text in self.corpus:
                # Make label list
                topic_probLst = lda.get_document_topics(text)
                if (len(topic_probLst) == 0):
                    print("LDA is f****d")
                    print("GA.py gene.a = ", gene.a)
                    if (0 in gene.a):
                        print("calculate fitness: Zero in a")
                    if (0 in gene.b):
                        print("calculate fitness: Zero in b")
                    gene.set_fitness(-99)
                    return -99
                labels.append(max(topic_probLst, key=lambda tup: tup[1])[0])
                # Make word count list
                words = [0] * self.vocab_size
                for tup in text:
                    words[tup[0]] = tup[1]
                word_cntLst.append(words[:])
            # Calculate silhouette score
            if (len(np.unique(labels)) < 2):
                gene.set_fitness(-99)
                return -99
            result = metrics.silhouette_score(word_cntLst,
                                              labels,
                                              metric='cosine')

        gene.set_fitness(result)
        return result
Example #8
class LDA():
    def __init__(self, K, data, AMask, params, name, dataName):
        self.K = K  # [int] nb of topics
        self.AMask = AMask  # [n_a,n_d float] matrix of author participation to each paper (1 if author participated to paper)
        self.n_a, self.n_d = self.AMask.shape  # [int] nb of authors, nb of documents
        self.D = data
        self.n_dic, self.n_d = self.D.shape
        self.name = name
        self.train_C_ = []
        self.train_param = params['train_param']
        for d in range(self.n_d):
            self.train_C_.append([(k, self.D[k, d])
                                  for k in range(self.n_dic)])

        self.dataName = dataName

    def train(self):
        self.LDA = LdaModel(self.train_C_,
                            num_topics=self.K,
                            decay=0.5,
                            offset=1024,
                            passes=80)
        self.phi = self.LDA.get_topics().transpose()
        self.theta = np.zeros((self.K, self.n_d))
        for d in range(self.n_d):
            tmp = self.LDA.get_document_topics(self.train_C_[d])
            ind = [c for (c, b) in tmp]
            self.theta[ind, d] = [b for (c, b) in tmp]
        self.D_reb = self.phi.dot(self.theta)
        self.A = normalize(self.AMask, 'l1', 0)
        return ()

    def save(self, path):
        '''
        path example
        '''
        toSave = {}
        toSave['theta'] = self.theta
        toSave['phi'] = self.phi
        toSave['A'] = self.A
        toSave['K'] = self.K
        toSave['train_param'] = self.train_param
        with open(path + self.name + '_' + self.dataName + '.pkl',
                  'wb') as output:
            pickle.dump(toSave, output, pickle.HIGHEST_PROTOCOL)
Example #9
class TopicModel(object):

    def __init__(self, documents, cut=True, num_topics=10, min_length=1):
        from cla.util.util import CutDocument
        from gensim.corpora import Dictionary
        from gensim.models import LdaModel

        self.document = CutDocument(documents, cut, cleanup=True, min_length=min_length)
        self.dictionary = Dictionary(self.document)
        # BowCorpus is assumed to be a project-local streaming BoW corpus wrapper (not imported in this snippet)
        self.model = LdaModel(BowCorpus(self.document, self.dictionary),
                              id2word=self.dictionary,
                              num_topics=num_topics)

    def topic_words(self, topic_id, limit=10):
        return self.model.show_topic(topicid=topic_id, topn=limit)

    def identify_topic(self, words):
        return self.model.get_document_topics(self.dictionary.doc2bow(words))
Example #10
def BasicLDA(doclist, num_topics):
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    texts = clean(doclist)
    print(texts[1])
    # frequency = {}
    # for text in texts:
    #     for token in text:
    #         if token not in frequency:
    #             frequency[token] = 0
    #         else:
    #             frequency[token] += 1
    dictionary = corpora.Dictionary(texts)
    size_dictionary = len(dictionary)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, chunksize=500, passes=10, iterations=100)
    topics = []
    for i in lda.show_topics(num_topics=-1, num_words=20):
        print(i)
        topics.append(i)

    for i in lda.get_document_topics(corpus):  # i is each document's topic distribution, in corpus (bag-of-words) order
        s = str(i)
        pattern1 = r'\((\d+),'
        a = re.findall(pattern1, s)
        print(a)  # the topic ids contained in this document

        word_list = []  # word distributions of all topics present in the current document
        for idx in a:  # for each topic id
            w = topics[int(idx)]  # look up that topic's word distribution
            word_list.append(w)  # store it in topic-id order

        l = [list(k)[1] for k in i]  # list(k)[1] is the probability of each topic
        doc2top = {}
        for num in range(len(l)):
            doc2top[l[num]] = word_list[num]

        print(doc2top)
        break
        # print(list(chain.from_iterable(zip(l, word_list))))

    elapsed = time.perf_counter() - start
    return lda, corpus, dictionary, size_dictionary, elapsed
Example #11
class MyLda:
    def __init__(self, myDictionary, num_topics=100, topic_threshold=0.15):
        self.num_topics = num_topics
        self.topic_threshold = topic_threshold
        self.myDictionary = myDictionary
        self.model = LdaModel(self.myDictionary.doc2bows, \
         id2word=self.myDictionary.dictionary, \
         num_topics=num_topics)
        self.topic2ids, self.id2topics = self.get_mappings()
        self.coherenceModel = None
        print("- Created MyLda with {} topics".format(self.num_topics))

    def get_mappings(self):
        topic2ids, id2topics = defaultdict(list), defaultdict(list)
        for i, doc2bow in enumerate(self.myDictionary.doc2bows):
            topic_pairs = self.model.get_document_topics(doc2bow)
            for j, (topic, prob) in enumerate(topic_pairs):
                if prob >= self.topic_threshold or j == 0:
                    topic2ids[topic].append(i)
                    id2topics[i].append(topic)
        return topic2ids, id2topics

    def get_topic_terms(self, topic):
        terms = self.model.get_topic_terms(topic)
        return terms

    def get_top_topic(self):
        top_topics = self.model.top_topics(corpus=self.myDictionary.doc2bows)
        average = sum([t[1] for t in top_topics]) / self.num_topics
        return top_topics, average

    def get_perplexity(self):
        return self.model.log_perplexity(self.myDictionary.doc2bows)

    def get_coherence(self):
        if not self.coherenceModel:
            self.coherenceModel = CoherenceModel(model=self.model, \
             corpus=self.myDictionary.doc2bows, \
             dictionary=self.myDictionary.dictionary, \
             coherence='u_mass')
        return self.coherenceModel.get_coherence()
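MyLda expects a myDictionary object exposing doc2bows (a list of BoW vectors) and dictionary (a gensim Dictionary); neither is defined in this example. A minimal hypothetical stand-in for experimenting with the class:

from gensim.corpora import Dictionary

class SimpleDictionary:
    # hypothetical stand-in for the undefined myDictionary dependency
    def __init__(self, tokenized_docs):
        self.dictionary = Dictionary(tokenized_docs)
        self.doc2bows = [self.dictionary.doc2bow(doc) for doc in tokenized_docs]

docs = [["lda", "topic", "model"], ["topic", "word", "distribution"], ["model", "training", "corpus"]]
my_lda = MyLda(SimpleDictionary(docs), num_topics=2)
print(my_lda.id2topics)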
Example #12
def lda_topics(processed_data: list, n_topics: int = 10, learning_decay: float = 0.5,
               learning_offset: float = 1.0, max_iter: int = 50, n_words: int = 10) -> Tuple[list, list]:
    """
    lda_topics performs LDA topic modeling on the input data

    :param processed_data: list of preprocessed segments
    :param n_topics: number of topics to extract form corpus
    :param learning_decay: learning decay parameter for LDA
    :param learning_offset: learning offset parameter for LDA
    :param max_iter: max. number of iterations
    :param n_words: number of topic representatives

    :return:
        - topics - list of topics (and their representatives)
        - doc_topics - list of predicted topics, one for each segment
    """

    dictionary = corpora.Dictionary(processed_data, )
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in processed_data]

    lda_model = LdaModel(doc_term_matrix, id2word=dictionary, num_topics=n_topics, offset=learning_offset,
                         random_state=42, update_every=1, iterations=max_iter,
                         passes=10, alpha='auto', eta="auto", decay=learning_decay, per_word_topics=True)

    topics = []
    for i_t, topic_word_dist in enumerate(lda_model.get_topics()):
        topic = [lda_model.id2word[w_id] for w_id, _ in lda_model.get_topic_terms(i_t, topn=n_words)]
        topics.append(topic)

    # getting documents topic labels
    doc_topics = []
    for doc in doc_term_matrix:

        doc_t_dist = sorted(lda_model.get_document_topics(doc), key=lambda item: item[1], reverse=True)
        t, _ = doc_t_dist[0]
        doc_topics.append(t)

    assert len(doc_topics) == len(processed_data)
    return topics, doc_topics
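A toy invocation sketch for lda_topics (illustrative data only; actual topic assignments depend on the model fit):

segments = [
    ["cat", "dog", "pet", "vet"],
    ["dog", "walk", "park", "pet"],
    ["stock", "market", "price", "trade"],
    ["market", "price", "invest", "trade"],
]
topics, doc_topics = lda_topics(segments, n_topics=2, n_words=3)
print(topics)      # two lists of three topic keywords each
print(doc_topics)  # one predicted topic id per segment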
Example #13
class LDAVecGen():
    def __init__(self, path):
        with open(path, 'rb') as fp:
            data = pickle.load(fp)
        self.id_list = data['id']
        self.doc_list = list(map(lambda doc: doc.split(' '), data['doc']))

    def fit_model(self, topic_num):
        self.dictionary = Dictionary(self.doc_list)
        self.corpus = [self.dictionary.doc2bow(doc) for doc in self.doc_list]
        self.model = LdaModel(corpus=self.corpus,
                              id2word=self.dictionary,
                              num_topics=topic_num)

    def out(self, model_path, col_name):
        buffer = []
        vecs = self.model.get_document_topics(self.corpus)

        # `db` is assumed to be a pre-configured pymongo database handle (not defined in this snippet)
        col = db[col_name]
        for id, vec in zip(self.id_list, vecs):
            buffer.append({
                'fulltextid': id,
                'vec': [[item[0], float(item[1])] for item in vec]
            })

            if len(buffer) >= 1000:
                col.insert_many(buffer)
                buffer.clear()

        if len(buffer) > 0:
            col.insert_many(buffer)
            buffer.clear()

        if not os.path.exists(model_path):
            os.makedirs(model_path)

        self.dictionary.save(os.path.join(model_path, 'lda.dic'))
        self.model.save(os.path.join(model_path, 'lda.model'))
Example #14
def build_lda_model(tokens_tags,
                    pos_tags,
                    use_nouns=True,
                    use_verbs=True,
                    use_all=False,
                    num_of_topics=8,
                    passes=25,
                    verbose=True):
    path = os.getcwd()[:os.getcwd().rfind('/')]
    topics_filename = str(num_of_topics) + "topics"
    if use_nouns:
        topics_filename += "_nouns"
    if use_verbs:
        topics_filename += "_verbs"
    if use_all:
        topics_filename += "_all"

    # Set the LDA, Dictionary and Corpus filenames
    lda_filename = path + "/models/topic_models/lda_" + topics_filename + ".model"
    dict_filename = path + "/res/topic_data/dict/dict_" + topics_filename + ".dict"
    corpus_filename = path + "/res/topic_data/corpus/corpus_" + topics_filename + ".mm"

    # Build a topic model if it wasn't created yet
    if not os.path.exists(lda_filename):
        # Extract the lemmatized documents
        docs = []
        for index in range(len(tokens_tags)):
            tokens = tokens_tags[index].split()
            pos = pos_tags[index].split()
            docs.append(
                data_proc.extract_lemmatized_tweet(tokens, pos, use_verbs,
                                                   use_nouns, use_all))

        # Compute the dictionary and save it
        dictionary = Dictionary(docs)
        dictionary.filter_extremes(keep_n=40000)
        dictionary.compactify()
        Dictionary.save(dictionary, dict_filename)

        # Compute the bow corpus and save it
        corpus = [dictionary.doc2bow(d) for d in docs]
        MmCorpus.serialize(corpus_filename, corpus)

        if verbose:
            print("\nCleaned documents:", docs)
            print("\nDictionary:", dictionary)
            print("\nCorpus in BoW form:", corpus)

        # Start training an LDA Model
        start = time.time()
        print("\nBuilding the LDA topic model...")
        lda_model = LdaModel(corpus=corpus,
                             num_topics=num_of_topics,
                             passes=passes,
                             id2word=dictionary)
        lda_model.save(lda_filename)
        end = time.time()
        print("Completion time for building LDA model: %.3f s = %.3f min" %
              ((end - start), (end - start) / 60.0))

        if verbose:
            print("\nList of words associated with each topic:")
            lda_topics = lda_model.show_topics(formatted=False)
            lda_topics_list = [[word for word, prob in topic]
                               for topic_id, topic in lda_topics]
            print([t for t in lda_topics_list])

    # Load the previously saved dictionary
    dictionary = Dictionary.load(dict_filename)

    # Load the previously saved corpus
    mm_corpus = MmCorpus(corpus_filename)

    # Load the previously saved LDA model
    lda_model = LdaModel.load(lda_filename)

    # Print the top 10 words for each topic
    if verbose:
        for topic_id in range(num_of_topics):
            print("\nTop 10 words for topic ", topic_id)
            print([
                dictionary[word_id]
                for (word_id,
                     prob) in lda_model.get_topic_terms(topic_id, topn=10)
            ])

    index = 0
    if verbose:
        for doc_topics, word_topics, word_phis in lda_model.get_document_topics(
                mm_corpus, per_word_topics=True):
            print('Index ', index)
            print('Document topics:', doc_topics)
            print('Word topics:', word_topics)
            print('Phi values:', word_phis)
            print('-------------- \n')
            index += 1
    return dictionary, mm_corpus, lda_model
Example #15
class LDAModel(object):
    """

    """

    def __init__(self,path,model_file,dictionary_file,corpus_file,num_topics=21):
            """
            进行数据预处理,获取训练集和测试集
            class biological分子与细胞_cleaned.csv : 12
            class biological现代生物技术专题_cleaned.csv : 14
            class biological生物技术实践_cleaned.csv : 16
            class biological生物科学与社会_cleaned.csv : 18
            class biological稳态与环境_cleaned.csv : 110
            class biological遗传与进化_cleaned.csv : 112
            class geography人口与城市_cleaned.csv : 42
            class geography区域可持续发展_cleaned.csv : 44
            class geography地球与地图_cleaned.csv : 46
            class geography宇宙中的地球_cleaned.csv : 48
            class geography生产活动与地域联系_cleaned.csv : 410
            class history古代史_cleaned.csv : 52
            class history现代史_cleaned.csv : 54
            AttributeError: 'PyDB' object has no attribute 'has_plugin_line_breaks'
            Exception ignored in: '_pydevd_frame_eval.pydevd_frame_evaluator_darwin_36_64.get_bytecode_while_frame_eval'
            AttributeError: 'PyDB' object has no attribute 'has_plugin_line_breaks'
            class history近代史_cleaned.csv : 56
            class political公民道德与伦理常识_cleaned.csv : 102
            class political时事政治_cleaned.csv : 104
            class political生活中的法律常识_cleaned.csv : 106
            class political科学思维常识_cleaned.csv : 108
            class political科学社会主义常识_cleaned.csv : 1010
            class political经济学常识_cleaned.csv : 1012
            :param file:语料文件
            :param ratio:测试训练的比列
            :return lda:返回lda模型
            """



            dirs = os.listdir(path)
            x_list = []
            item_x = []
            labels = []
            multiLabels = []
            label11 = 0

            for file in dirs:
                #print(os.path.join(path, file))
                path2 = os.path.join(path, file)
                if os.path.isdir(path2):
                    category = file
                    dirs2 = os.listdir(path2)
                    label12 = 0
                    for file2 in dirs2:
                        file3 = os.path.join(path2, file2)
                        if os.path.isfile(file3) and file2.endswith('_cleaned.csv'):
                            print('class {}{} : {}{}'.format(file, file2, label11, label12))
                            src_df = pd.read_csv(file3)
                            src_df = parallelize(src_df, data_fram_proc)  # upsampling

                            #merged_df = pd.concat([src_df['items'], src_df['knowledge']], axis=1)
                            src_df['item'] = src_df['items'] + src_df['knowledge']
                            x = np.array(src_df['item']).tolist()
                            item_x += x
                            x = [[word for word in doc.split(' ') if word != "" ] for  doc in x]
                            x_list+= x # list
                            #labels += ['__label__'+str(label11)+''+str(label12) for i in range(len(x))]
                            fn = str(file2).replace('_cleaned.csv','').replace('\t','').replace('\n','')
                            labels += ['__label__' + str(file) + '_' + fn  for i in range(len(x))]
                            bug = 0
                            mls = np.array(src_df['label']).tolist()
                            multiLabels += [ str(file).replace('_',' ') +' '+fn+' '+  str(ml).replace('\t','').replace('\n','') for ml in mls ]
                            bug = 1
                        label12 += 1
                label11 += 1

            c = {'label': labels, 'item': item_x, 'multiLabels': multiLabels}  # combine into a new dict c
            df = pd.DataFrame(c)  # build a DataFrame from c
            df.to_csv(corpus_file, index=None,  header=True)


            # build the dictionary (token2id) from the tokenized documents
            self.dictionary = Dictionary(x_list)
            # convert each document to bag-of-words form (id: frequency)
            self.corpus = [self.dictionary.doc2bow(text) for text in x_list]

            # train the LDA model with the requested number of topics
            self.lda = LdaModel(self.corpus, id2word=self.dictionary, num_topics=num_topics)
            # inspect the results
            results = self.lda.print_topics(num_topics, num_words=50)
            for result in results:
                print(result)

            # Save model to disk.
            self.lda.save(model_file)

            self.dictionary.save_as_text(dictionary_file)


    def __retrain(self, model_file,other_texts):
           """
           lda = LdaModel.load(model_file)
           other_corpus = [self.dictionary.doc2bow(text) for text in other_texts]
           lda.update(other_corpus)
           """

    def getDocSVector(self):
        self.docSVector = []
        for d in self.corpus:
            self.docSVector.append(self.lda.get_document_topics(d,minimum_probability = 0))
        return self.docSVector
Example #16
num_topics = 30
lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     alpha='auto',
                     eta='auto',
                     num_topics=num_topics)

pickle.dump(lda_model, open("gensim_lda_model.p", "wb"))
pickle.dump(dictionary, open("gensim_lda_dictionary.p", "wb"))
diccc = pickle.load(open("gensim_lda_dictionary.p", "rb"))

s = ''

doc_topic = []
for i in range(len(corpus)):
    doc_topic.append(lda_model.get_document_topics(corpus[i]))

# zero-initialized dense (document x topic) matrix
doc_topic_temp = []
for i in range(len(corpus)):
    doc_topic_temp.append([0] * num_topics)

# the original snippet is truncated here; filling the matrix with each
# document's topic probabilities is the likely intent of the final loop
longg = len(doc_topic)
for i in range(0, longg):
    kk = len(doc_topic[i])
    for a in range(kk):
        topic_id, prob = doc_topic[i][a]
        doc_topic_temp[i][topic_id] = prob
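The same dense document-topic matrix can also be built directly with gensim's matutils helper; a minimal sketch, assuming lda_model, corpus and num_topics from above:

from gensim import matutils

# corpus2dense returns a (topics x documents) array, so transpose to documents x topics
doc_topic_dense = matutils.corpus2dense(
    lda_model.get_document_topics(corpus, minimum_probability=0.0),
    num_terms=num_topics).T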
Example #17
class CustomLDA:
    '''
    customized LDA model
    '''
    def __init__(self, documents, titles, dictionary):
        self.documents = documents
        self.titles = titles
        self.dictionary = dictionary
        self.model = None
        return

    def train(
        self,
        train_data,
        val_data,
        output_path,
        num_topics=1000,
        iterations=100,
        chunksize=2000,
        passes=1,
        eval_every=1,
    ):
        '''
        train/val a model and save the trained model.

        Args:
            train_data (DataFrame): training data
            val_data (DataFrame): validation data
            output_path: where to save models
            num_topics: the number of topics
            iterations: train iterations
            eval_every: eval model every `eval_every` iterations

        Returns:
            model object
        '''
        val_data = pd.concat([train_data, val_data], ignore_index=True)
        self.model = LdaModel(
            corpus=self.documents.bow.tolist(),
            id2word=self.dictionary.id2token,
            alpha='auto',
            eta='auto',
            iterations=iterations,
            num_topics=num_topics,
            passes=passes,
            chunksize=chunksize,
            eval_every=eval_every,
            callbacks=[
                # utils.EpochSaver(output_path),
                # utils.EpochLogger(log_start=True),
                # utils.SupervisedEvalute(val_data, documents, titles)
                # CoherenceMetric(corpus=documents.bow, logger='shell'),
                # ConvergenceMetric(logger='shell'),
            ],
        )
        return self

    def validate(self, data):
        '''validate this model'''
        prediction = self.predict(data)
        mrr = utils.calculate_MRR(prediction)
        return mrr

    def predict(self, data, progress=True):
        '''
        make a prediction

        Args:
            data: input data
            progress: whether progress bar should be displayed

        Returns:
            prediction results
        '''
        if progress:
            tqdm.pandas(desc='prediction')
            data = data.progress_apply(
                self.sort_candidates,
                axis=1,
            )
        else:
            data = data.apply(
                self.sort_candidates,
                axis=1,
            )
        return data

    def sort_candidates(self, series, log_before=False, log_after=False):
        '''
        sort candidate titles contained in series.

        Args:
            series: pd.Series with index[title_id, candidates]
            log_before: whether this func should log candidates before sorting
            log_after: whether this func should log candidates after sorting

        Returns:
            series
        '''
        title_info = self.titles.loc[series.title_id]

        if log_before:
            print(list(map(
                lambda doc_id: utils.get_coherence(
                    self.model.get_document_topics(self.documents.loc[doc_id].bow),
                    title_info.bow
                ),
                series.candidates,
            )))

        series.candidates = sorted(
            series.candidates,
            key=lambda doc_id: - utils.get_coherence(
                self.model.get_document_topics(self.documents.loc[doc_id].bow, 0),
                title_info.bow
            ),
        )

        if log_after:
            print(list(map(
                lambda doc_id: utils.get_coherence(
                    self.model.get_document_topics(self.documents.loc[doc_id].bow, 0),
                    title_info.bow
                ),
                series.candidates,
            )))
            print()
        return series

    def save(self, path):
        '''save'''
        self.model.save(path)
        return self

    def load(self, path):
        '''load'''
        self.model = LdaModel.load(path)
        return self
Example #18
def LDA(texts, num_topics, token2tag, token2tag_dic, dic):
    '''Map every word in the input texts to its similarity-based tag number.'''
    '''Replace words in the texts with ids; words not in the dictionary are added to it.'''
    # compute tf-idf for the original (un-replaced) texts
    dictionary = corpora.Dictionary(texts)
    token2id = dictionary.token2id
    id2token = {v: k for k, v in token2id.items()}
    # dictionary.save('patent_dictionary.dict')
    corpus = [dictionary.doc2bow(text) for text in texts]
    # corpora.MmCorpus.serialize('patent_corpuse.mm', corpus)
    tfidf = models.TfidfModel(corpus)
    # tfidf.save('patent_tfidf.model')
    corpus_tfidf = tfidf[corpus]
    sort_keywords = []
    # sort each document's tf-idf values in descending order and keep the top three words as keywords (fewer if the document is short)
    for doc in corpus_tfidf:
        sorttfidf = sorted(doc, key=lambda x: x[1], reverse=True)
        n = 0
        if len(sorttfidf) >= 3:
            sort_keywords.append([id2token[sorttfidf[0][0]], id2token[sorttfidf[1][0]], id2token[sorttfidf[2][0]]])
        elif len(sorttfidf) == 2:
            sort_keywords.append([id2token[sorttfidf[0][0]], id2token[sorttfidf[1][0]]])
        elif len(sorttfidf) == 1:
            sort_keywords.append([id2token[sorttfidf[0][0]]])

    for text in texts:
        for word in text:
            text[text.index(word)] = str(token2tag[word])
    topic = {}
    for key, item in token2tag_dic.items():
        topicword = ' '.join(list(item))
        # print(topicword)
        topic[str(key)] = topicword
        insert = "insert into topicword_info(主题词ID,主题词) value (%s,%s)"
        # print(insert,[key,topicword])
        cur.execute(insert, [key, topicword])
        con.commit()
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    dictionary = corpora.Dictionary(texts)
    dictionary.save('patent_dictionary.dict')
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('patent_corpuse.mm', corpus)
    tfidf = models.TfidfModel(corpus)
    tfidf.save('patent_tfidf.model')
    corpus_tfidf = tfidf[corpus]
    # sort_keywords = []
    # #将tfidf进行降序排序,获取前三的tfidf的词作为关键词,否则就取一个。
    # for doc in corpus_tfidf:
    #     sorttfidf = sorted(doc, key=lambda x: x[1], reverse=True)
    #     n = 0
    #     if len(sorttfidf)>=3:
    #         sort_keywords.append([sorttfidf[0][0],sorttfidf[1][0],sorttfidf[2][0]])
    #     else:
    #         sort_keywords.append([sorttfidf[0][0]])
    # print('关键词',sort_keywords)

    size_dictionary = len(dictionary)  # vocabulary size
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, chunksize=500, passes=100, iterations=100)
    lda.save('patent_lda.model')
    topic1 = {}  # word distribution of each topic, ordered by topic id
    for i in lda.show_topics(num_topics=-1, num_words=20):
        s = str(i)
        pattern1 = r'"(\d+)"'
        a = re.findall(pattern1, s)  # extract the word-cluster tags
        topic1[i[0]] = [topic[a[0]], topic[a[1]], topic[a[2]], topic[a[3]], topic[a[4]]]  # top word clusters for this topic, in order
        insert = 'insert into topic_info(主题ID,主题词ID1,主题词ID2,主题词ID3,主题词ID4) values(%s,%s,%s,%s,%s)'
        # print(insert,[i[0],a[0],a[1],a[2],a[3]])
        cur.execute(insert, [i[0], a[0], a[1], a[2], a[3]])
        con.commit()
        # word_list = []
        # for i in a:
        #     w = token2tag_dic[int(i)]
        #     word_list.append(w)
        # pattern2 = r'"\d+"'
        # st = re.split(pattern2, s)
        # topics.append(list(chain.from_iterable(zip(st, word_list))))
        # print(list(chain.from_iterable(zip(st, word_list))))
    c = 0
    topics = []
    ppp = []
    for i in lda.get_document_topics(corpus):  # i is each document's topic distribution, in corpus (bag-of-words) order
        s = str(i)
        ppp.append([])
        vec = [0.0] * num_topics
        for j in i:
            a = int(j[0])
            vec[a] = j[1]
            ppp[-1].extend([j[1], topic1[j[0]]])
        topics.append(vec)
        # print(vec)
        pattern1 = r'\((\d+),'
        a = re.findall(pattern1, s)
        # print(a)  # the topic ids contained in this document
        keyword = sort_keywords[c]
        # print(keyword)
        # guanjianci = ' '.join([str(token2tag_dic[int(j)]) for j in keyword])  # [1,2,3] map keywords by index
        guanjianci = ' '.join(keyword)  # join this document's keywords
        detail = dic[c]
        name, abstract, id = detail['专利名称'], detail['专利摘要'], detail['专利号']
        if len(a) >= 4:
            insert = 'insert into patent_info(专利名,专利摘要,专利号,关键词,主题ID1,主题ID2,主题ID3,主题ID4,主题向量) values(%s,%s,%s,%s,%s,%s,%s,%s,%s)'
            cur.execute(insert, [name, abstract, id, guanjianci, a[0], a[1], a[2], a[3], str(vec)])
            con.commit()
            c += 1
        else:
            if len(a) == 1:
                insert = 'insert into patent_info(专利名,专利摘要,专利号,关键词,主题ID1,主题向量) values(%s,%s,%s,%s,%s,%s)'
                cur.execute(insert, [name, abstract, id, guanjianci, a[0], str(vec)])
                con.commit()
                c += 1
            elif len(a) == 2:
                insert = 'insert into patent_info(专利名,专利摘要,专利号,关键词,主题ID1,主题ID2,主题向量) values(%s,%s,%s,%s,%s,%s,%s)'
                cur.execute(insert, [name, abstract, id, guanjianci, a[0], a[1], str(vec)])
                con.commit()
                c += 1
            elif len(a) == 3:
                insert = 'insert into patent_info(专利名,专利摘要,专利号,关键词,主题ID1,主题ID2,主题ID3,主题向量) values(%s,%s,%s,%s,%s,%s,%s,%s)'
                cur.execute(insert, [name, abstract, id, guanjianci, a[0], a[1], a[2], str(vec)])
                con.commit()
                c += 1
    for i in ppp:
        print(i)
    with open('lda文本的主题词', 'w', encoding='utf8') as f:
        f.write(str(ppp))
    # word_list = [] #存放当前文档包含的所有的主题
    # for idx in a: #取主题号
    #     w = topics[int(idx)] #取主题词分布
    #     word_list.append(w) #按照主题标签, 把对应主题的词分布 ,按照顺序存起来
    #
    # l = [list(k)[1] for k in i] #list(k)[1] 每个主题的取概率
    # doc2top = {}
    # for num in range(len(l)):
    #     doc2top[l[num]]  = word_list[num]
    #
    # print(doc2top)
    # print(list(chain.from_iterable(zip(l, word_list))))
    # doctopic = []
    # for i in lda.get_document_topics(corpus)[:]:
    #     listj = []
    #     for j in i:
    #         listj.append(j[1])
    #     bz = listj.index(max(listj))
    #
    #     k = i[bz][0]
    #     doctopic.append(k)
    with open('lda_topic', 'w', encoding='utf8') as f:
        f.write(str(topics))
    elapsed = time.perf_counter() - start
    return lda, corpus, dictionary, size_dictionary, elapsed
Example #19

    # the opening of this snippet is truncated; `vectorizer`, `dtm`, `token_path`, `key` and
    # `model_path` are assumed to come from the surrounding (not shown) code
    tokens = vectorizer.get_feature_names()
    vocab_size = len(tokens)
    pd.Series(tokens).to_csv(token_path, index=False)

    id2word = pd.Series(tokens).to_dict()
    corpus = Sparse2Corpus(dtm, documents_columns=False)

    # dictionary = Dictionary.from_corpus(corpus=train_corpus, id2word=id2word)

    # for n_topics in [3, 5, 7, 10, 15, 20, 25, 30, 35, 40, 45, 50, 60, 75, 100]:
    for n_topics in [5, 10, 15, 20, 30]:
        print(n_topics, end=' ', flush=True)
        lda = LdaModel(corpus=corpus, num_topics=n_topics, id2word=id2word)

        doc_topics = pd.DataFrame()
        for i, topics in enumerate(lda.get_document_topics(corpus)):
            doc_topics = pd.concat([
                doc_topics,
                pd.DataFrame(topics, columns=['topic', 'value']).assign(doc=i)
            ])
        doc_topics.to_csv(model_path / f'doc_topics_{key}_{n_topics}.csv',
                          index=False)

        model_file = datapath((model_path / f'{key}_{n_topics}').resolve())
        lda.save(model_file)
        train_lda = LdaModel(corpus=train_corpus,
                             num_topics=n_topics,
                             id2word=pd.Series(train_tokens).to_dict())

        # see https://radimrehurek.com/gensim/models/ldamodel.html#gensim.models.ldamodel.LdaModel.log_perplexity
        test_perplexity = 2**(-train_lda.log_perplexity(test_corpus))
Example #20
lda = LdaModel(
    corpus=corpus,
    num_topics=num_topics,
    id2word=dct,
    alpha='auto',
    random_state=100,
    # eta=None,
    update_every=1,
    chunksize=chunk_size,
    minimum_probability=0.0,
    # iterations=100,
    # gamma_threshold=0.001,
    passes=10,
    per_word_topics=True)

lda.get_document_topics(bow=corpus, per_word_topics=True)
tpl = lda.print_topics(num_topics=6, num_words=5)
topic, contrib = zip(*tpl)

t2 = time.time()
print("Time to train LDA model on", len(df), "articles:", (t2 - t1) / 60,
      "min")

top_k_topics = lda.top_topics(corpus,
                              topn=5,
                              dictionary=dct,
                              texts=train_df['tokenized'])
indx = [i + 1 for i in range(6)]
contrib = np.transpose(contrib)
#%%
tpl = lda.print_topics(num_topics=6, num_words=5)
Example #21
class TopicsService():

    data_service = None
    model = gensim.models.basemodel.BaseTopicModel()

    def model_save(self, _model, _model_name, _active_dataset):
        """
        Save trained model to disk
        @params:
            _model          - Required  : model (gensim.models.LdaModel)
            _model_name     - Required  : name of the active topic model (Str)
            _active_dataset - Required  : name of active dataset (Str)
        """

        result = self.data_service.save_model(_model, _model_name, _active_dataset)

        return result

    def model_load(self, _model_name, _active_dataset):
        """
        Load pre-trained model from disk
        @params:
            _model_name        - Required  : name of the topic mining model(Str)
            _active_dataset    - Required  : name of active dataset (Str)
        """

        # TODO: check here that all the required files have been created
        # TODO: move loading into the data-access layer
        try:
            self.model = models.LdaModel.load(
                config.path2data + _active_dataset + "." + _model_name + ".model")
            # self.model = models.LdaModel.load(config.path2data + "lda.model")
        except Exception as e:
            self.model = self.process_topics_mining(_active_dataset)
            self.model = models.LdaModel.load(
                config.path2data + _active_dataset + "." + _model_name + ".model")

        return True

    def __init__(self, _model_name, _active_dataset):
        """
        Constructor
        @params:
            _model_name     - Required  : name of the active topic model (Str)
            _active_dataset - Required  : name of active dataset (Str)
        """

        self.data_service = DataService(_active_dataset)
        self.model_load(_model_name, _active_dataset)

    def topic_mining(self, _active_dataset):
        """
        Internal function to run topic mining and save the trained models
        @params:
            _active_dataset - Required  : name of active dataset (Str)
        """

        # TODO: move saving to files into the models layer

        print("topic minning start..")
        vectorizer = TfidfVectorizer(max_df=0.5, max_features=500,
                                     min_df=2, stop_words='english',
                                     use_idf=True)

        _cousines, _reviews = self.data_service.get_reviews_for_cousines()

        print("text uploaded")
        text = _reviews

        X = vectorizer.fit_transform(text)
        print("text transformed")

        # mapping from feature id to actual word
        id2words = {}
        for i, word in enumerate(vectorizer.get_feature_names()):
            id2words[i] = word

        corpus = matutils.Sparse2Corpus(X, documents_columns=False)

        print("train LDA models")
        #####################################################################
        _model_name = "LDA10"
        self.modelLDA_10 = LdaModel(corpus, num_topics=10, id2word=id2words)
        self.model_save(self.modelLDA_10, _model_name, _active_dataset)

        _cousines2topics = self.modelLDA_10.get_document_topics(
            corpus, minimum_probability=0)
        _topics2cousines = []
        for i, _topics_weight in enumerate(_cousines2topics):
            _topics2cousines.append([_cousines[i], _topics_weight])
        with open(config.path2data + _active_dataset + "." + _model_name + "_" + config.path2topics2cousines, 'wb') as f:
            pickle.dump(_topics2cousines, f)

        _rests_topics = []
        _rests = self.data_service.get_rests()
        for _rest in _rests:

            _cousines = self.data_service.get_cousines_for_rest(_rest[0])

            _rest_vector = []
            for _rc in _cousines:
                for _c in _topics2cousines:
                    if _c[0] == _rc:
                        if not _rest_vector:
                            # _rest_vector = _c[1]
                            for _t, _w in _c[1]:
                                _rest_vector.append([_t, float(_w)])
                        else:
                            for _t, _w in _c[1]:
                                _rest_vector[_t][1] = (_rest_vector[_t][1] + float(_w)) / 2

            _rests_topics.append([_rest, _rest_vector, _cousines])

        with open(config.path2data + _active_dataset + "." + _model_name + "_" + config.path2rests2topics, 'wb') as f:
            pickle.dump(_rests_topics, f)

        #####################################################################
        _model_name = "LDA15"
        self.modelLDA_15 = LdaModel(corpus, num_topics=15, id2word=id2words)
        self.model_save(self.modelLDA_15, _model_name, _active_dataset)

        _cousines, _reviews = self.data_service.get_reviews_for_cousines()

        _cousines2topics = self.modelLDA_15.get_document_topics(
            corpus, minimum_probability=0)
        _topics2cousines = []
        for i, _topics_weight in enumerate(_cousines2topics):
            _topics2cousines.append([_cousines[i], _topics_weight])
        with open(config.path2data + _active_dataset + "." + _model_name + "_" + config.path2topics2cousines, 'wb') as f:
            pickle.dump(_topics2cousines, f)

        _rests_topics = []
        _rests = self.data_service.get_rests()
        for _rest in _rests:

            _cousines = self.data_service.get_cousines_for_rest(_rest[0])

            _rest_vector = []
            for _rc in _cousines:
                for _c in _topics2cousines:
                    if _c[0] == _rc:
                        if not _rest_vector:
                            # _rest_vector = _c[1]
                            for _t, _w in _c[1]:
                                _rest_vector.append([_t, float(_w)])
                        else:
                            for _t, _w in _c[1]:
                                _rest_vector[_t][1] = (
                                    _rest_vector[_t][1] + float(_w)) / 2

            _rests_topics.append([_rest, _rest_vector, _cousines])

        with open(config.path2data + _active_dataset + "." + _model_name + "_" + config.path2rests2topics, 'wb') as f:
            pickle.dump(_rests_topics, f)

        #####################################################################
        _model_name = "LDA20"
        self.modelLDA_20 = LdaModel(corpus, num_topics=20, id2word=id2words)
        self.model_save(self.modelLDA_20, _model_name, _active_dataset)

        _cousines, _reviews = self.data_service.get_reviews_for_cousines()

        _cousines2topics = self.modelLDA_20.get_document_topics(
            corpus, minimum_probability=0)
        _topics2cousines = []
        for i, _topics_weight in enumerate(_cousines2topics):
            _topics2cousines.append([_cousines[i], _topics_weight])
        with open(config.path2data + _active_dataset + "." + _model_name + "_" + config.path2topics2cousines, 'wb') as f:
            pickle.dump(_topics2cousines, f)

        _rests_topics = []
        _rests = self.data_service.get_rests()
        for _rest in _rests:

            _cousines = self.data_service.get_cousines_for_rest(_rest[0])

            _rest_vector = []
            for _rc in _cousines:
                for _c in _topics2cousines:
                    if _c[0] == _rc:
                        if not _rest_vector:
                            # _rest_vector = _c[1]
                            for _t, _w in _c[1]:
                                _rest_vector.append([_t, float(_w)])
                        else:
                            for _t, _w in _c[1]:
                                _rest_vector[_t][1] = (
                                    _rest_vector[_t][1] + float(_w)) / 2

            _rests_topics.append([_rest, _rest_vector, _cousines])

        with open(config.path2data + _active_dataset + "." + _model_name + "_" + config.path2rests2topics, 'wb') as f:
            pickle.dump(_rests_topics, f)

        ###################################################################

        print("TRAIN MODELS DONE")
        return self.modelLDA_10

    def get_topics(self, _num_words=10):
        """
        Return a list of topics
        @params:
            _num_words     - Option  : number of words for each topic to return (Int)
        """

        _topics = []
        for i, item in enumerate(self.model.show_topics(num_topics=20, num_words=_num_words, formatted=False)):
            _topic_words = []
            for term, weight in item[1]:
                _topic_words.append([term, weight])

            _topics.append([i, _topic_words])

        return _topics

    def process_topics_mining(self, _active_dataset):
        """
        Process topic mining and save trained models
        @params:
            _active_dataset - Required  : name of active dataset (Str)
        """

        _result = self.topic_mining(_active_dataset)
        return _result

    def get_review_number(self):
        """
        Return a number of reviews in active dataset
        @params:
        """

        return self.data_service.get_review_number()
Example #22
from gensim.models import LdaModel
from gensim import corpora
from pprint import pprint
import pandas as pd

DIR = "LDAs/"
data_set = "smartplugs1130-merged-lemmatized"
model_type = "nt100na0.1-1"
model = data_set + model_type
lda = LdaModel.load(DIR + model)

mm_file = data_set + '.mm'
mm = corpora.MmCorpus(mm_file)

DIR = "data/"
filename = data_set + ".csv"
file = DIR + filename

df = pd.read_csv(file)
reviews = df.Review

r = 3
review = mm[r]

topic_dist = lda.get_document_topics(review)
topics = [x[0] for x in topic_dist]
print(topics)

pprint(lda.print_topics(-1, 10))
Example #23
class W2V_cpp2(W2V_base):
    def __init__(self,n_topic, path, folder):
        self.n_topic = n_topic
        W2V_base.__init__(self, path, folder)

        #process dict
        for prod_id in list(self.idx2prod.keys()):  # copy the keys: the dict is modified inside the loop
            prod = self.idx2prod[prod_id]
            n_prod_id = prod_id - len(self.word_count) - 1
            del self.idx2prod[prod_id]
            self.idx2prod[n_prod_id] = prod
            self.prod2idx[prod] = n_prod_id

        for user_id in list(self.idx2user.keys()):  # copy the keys: the dict is modified inside the loop
            user = self.idx2user[user_id]
            n_user_id = user_id - len(self.word_count) - len(self.prod2idx) - 1
            del self.idx2user[user_id]
            self.idx2user[n_user_id] = user
            self.user2idx[user] = n_user_id

    def train(self):
        data = []
        entity2id = {}
        id2entity = []

        for obj in self.data:
            doc = []
            obj_sents = obj["text_data"]
            entity = obj["prod"]
            if entity not in entity2id:
                entity2id[entity] = len(entity2id)
                id2entity.append(entity)
            doc_id = entity2id[entity]

            for obj_sent in obj_sents:
                for pair in obj_sent:
                    if pair[0] >= 0:
                        doc.append((pair[0], doc_id))
            data.append(doc)



        self.ldamodel = LdaModel(corpus=data, id2word=self.idx2word, num_topics=self.n_topic)

        f_entity = open("lda/prod.txt", "w")
        f_model = open("lda/model.txt", "w")
        f_model.write(str(len(entity2id)))
        f_model.write(" ")
        f_model.write(str(self.n_topic))
        f_model.write("\n")

        for entity in id2entity:
            f_entity.write(entity)
            f_entity.write("\n")

            f_model.write(entity)
            f_model.write(" ")

            distr = self.ldamodel.get_document_topics(data[entity2id[entity]], minimum_phi_value=0, minimum_probability=0)  # use this entity's document rather than a fixed index
            distr = [pair[1] for pair in distr]

            for prod in distr:
                f_model.write(str(prod))
                f_model.write(" ")

            f_model.write("\n")

        self.ldamodel.save("lda/model_200")
Example #24

    # the opening of this snippet is truncated; the LdaModel call is reconstructed from the arguments below
    model = LdaModel(
        corpus=corpus,
        id2word=dictionary.id2token,
        chunksize=1000,
        alpha='asymmetric',
        eta='auto',
        iterations=iterations,
        num_topics=args.num_topics,
        passes=passes,
        eval_every=None
    )

    topic_tokens = []
    for topicid in range(args.num_topics):
        topic_tokens.append([dictionary.id2token[k[0]] for i, k in enumerate(model.get_topic_terms(topicid, topn=4)) if i < 2 or k[1] > 0.025])

    paper_topic_data = []
    for paper, paper_bow in zip(data, corpus):
        topic_distr = model.get_document_topics(paper_bow, minimum_probability=0)
        paper_topic_data.append({
            "key": paper["key"],
            "year": paper["year"],
            "title": paper["title"],
            "topic_distr": {t: float(p) for t, p in topic_distr}
        })

    with open(args.outpath, 'w') as f:
        json.dump({
            "topics": topic_tokens,
            "paper_data": paper_topic_data 
        }, f)
Example #25
dct = Dictionary(corpus)
# transfer into corpus
corpus = [dct.doc2bow(doc) for doc in corpus]
# train lda model
lda = LdaModel(corpus, num_topics=2, id2word=dct)

# Signature: (bow, minimum_probability=None, minimum_phi_value=None, per_word_topics=False)
# Parameters:
#   bow (list) – Bag-of-words representation of the document to get topics for.
#   minimum_probability (float) – Ignore topics with probability below this value (None by default). If set to None, a value of 1e-8 is used to prevent 0s.
#   per_word_topics (bool) – If True, also returns a list of topics, sorted in descending order of most likely topics for that word. It also returns a list of word_ids and each words corresponding topics’ phi_values, multiplied by feature length (i.e, word count).
#   minimum_phi_value (float) – if per_word_topics is True, this represents a lower bound on the term probabilities that are included (None by default). If set to None, a value of 1e-8 is used to prevent 0s.
# Returns:
#   topic distribution for the given document bow, as a list of (topic_id, topic_probability) 2-tuples.
test = dct.doc2bow("I love Kitten".lower().strip().split())
print(lda.get_document_topics(test))
print(lda[test])

# Signature: (word_id, minimum_probability=None)
# Topics associated with the given word.
# Each topic is represented as a tuple of (topic_id, term_probability).
print(lda.get_term_topics(0))

# ----- print the composition of a given topic -----
# Signature: (topicid, topn=10); returns a list, format: [(word_id, probability), ...]
print(lda.get_topic_terms(0))
# Signature: (topicno, topn=10); returns a list, format: [(word, probability), ...]
print(lda.show_topic(0))
# Signature: (topicno, topn=10); returns a string, format: '-0.340 * "category" + 0.298 * "$M$" + 0.183 * "algebra" + ...'
print(lda.print_topic(0))
Example #26
def basicLDA(texts, num_topics):
    # build the dictionary and bag-of-words corpus from the texts
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, chunksize=500, passes=100, iterations=100)
    lda.save('patent_basiclda.model')
    topic = {}  # word distribution of each topic, ordered by topic id
    for i in lda.show_topics(num_topics=-1, num_words=20):
        s = str(i)
        pattern1 = "[\u4e00-\u9fa5]+"
        a = re.findall(pattern1, s)  # extract the Chinese word-cluster tags
        topic[i[0]] = i[1]  # store each topic's word distribution, keyed by topic id

    #     # word_list = []
    #     # for i in a:
    #     #     w = token2tag_dic[int(i)]
    #     #     word_list.append(w)
    #     # pattern2 = r'"\d+"'
    #     # st = re.split(pattern2, s)
    #     # topics.append(list(chain.from_iterable(zip(st, word_list))))
    #     # print(list(chain.from_iterable(zip(st, word_list))))
    ppp = []  # per document: alternating (probability, topic-description) pairs
    topics = []  # per document: dense topic-probability vector
    for i in lda.get_document_topics(corpus):  # i is one document's topic distribution, in corpus order
        s = str(i)
        vec = [0.0] * num_topics
        ppp.append([])
        for j in i:
            a = int(j[0])
            vec[a] = j[1]
            ppp[-1].extend([j[1], topic[j[0]]])
        topics.append(vec)
    for i in ppp:
        print(i)
    with open('basiclda文本的主题词', 'w', encoding='utf8') as f:
        f.write(str(ppp))
        # print(vec)
        # pattern1 = r'\((\d+),'
        # a = re.findall(pattern1, s)
        # print(a)  # the topic labels contained in each document
        # print(keyword)
        # guanjianci = ' '.join([str(token2tag_dic[int(j)]) for j in keyword])  # e.g. [1, 2, 3]: look up tags by index

        # word_list = []  # holds all topics contained in the current document
        # for idx in a:  # take the topic id
        #     w = topics[int(idx)]  # take that topic's word distribution
        #     word_list.append(w)  # store each topic's word distribution in order, keyed by topic label
        #
        # l = [list(k)[1] for k in i]  # list(k)[1] is each topic's probability
        # doc2top = {}
        # for num in range(len(l)):
        #     doc2top[l[num]]  = word_list[num]
        #
        # print(doc2top)
        # print(list(chain.from_iterable(zip(l, word_list))))
    # doctopic = []
    # for i in lda.get_document_topics(corpus)[:]:
    #     listj = []
    #     for j in i:
    #         listj.append(j[1])
    #     bz = listj.index(max(listj))
    #
    #     k = i[bz][0]
    #     doctopic.append(k)
    with open('basiclda_topic', 'w', encoding='utf8') as f:
        f.write(str(topics))
    elapsed = time.perf_counter() - start
    return lda, corpus, dictionary, elapsed
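
# Illustrative usage only: `toy_texts` is a made-up placeholder for a list of
# tokenised documents, and num_topics=2 is an arbitrary choice for the sketch.
toy_texts = [["patent", "claim", "device"], ["battery", "cell", "anode"]]
lda, corpus, dictionary, elapsed = basicLDA(toy_texts, num_topics=2)
print("training took %.2f s" % elapsed)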
Ejemplo n.º 27
0
lda_model = LdaModel(  # corpus= and id2word= are assumed; the opening of this call is truncated in the snippet
    corpus=corpus,
    id2word=dictionary,
    chunksize=10000,
    alpha='asymmetric',
    decay=0.5,
    offset=64,
    eta=None,
    eval_every=0,
    iterations=100,
    gamma_threshold=0.001,
    per_word_topics=True)

## See the topics
lda_model.print_topics(-1)  # prints every topic in the model
lda_model.get_topic_terms(0, topn=10)  # the top 10 words in topic 0
lda_model.log_perplexity(corpus)  # the log perplexity of the model on the corpus
lda_model.get_document_topics(
    corpus[0]
)  # the document-topic distribution; by default, topics with low probability for the document are omitted
lda_model.get_document_topics(
    corpus[0], minimum_probability=0
)  # the document-topic distribution with every topic and its probability included
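
# A minimal sketch, assuming the `lda_model` and `corpus` objects above: turn
# each document's (topic_id, probability) list into a fixed-length row, giving
# a documents-by-topics matrix and the dominant topic per document.
import numpy as np

doc_topic_matrix = np.zeros((len(corpus), lda_model.num_topics))
for d, doc in enumerate(corpus):
    for topic_id, prob in lda_model.get_document_topics(doc, minimum_probability=0):
        doc_topic_matrix[d, topic_id] = prob
dominant_topic = doc_topic_matrix.argmax(axis=1)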
### Document topic
####
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
pyLDAvis.show(vis)
Ejemplo n.º 28
0
print('Reading dataset')
data = pd.read_parquet(args.input_filepath)

print('Normalizing text')
data.text = data.text.map(nlp.normalize_text)

print('Building docterm matrix')
docterm, dictionary = nlp.get_docterm_matrix(data.text)
doclength = np.array([sum(x[1] for x in doc) for doc in docterm])

print('Training LDA model')
lda = LdaModel(docterm, num_topics=args.n_topics)

print('Getting document topics')
doctopics = corpus2csc([lda.get_document_topics(doc) for doc in docterm])
termtopics = lda.get_topics()

print('Computing topic volume time series')
topic_volume_over_time = nlp.get_topic_volume_over_time(data, doctopics, 20)

print('Computing topic coordinates')
topic_coordinates = nlp.get_topic_coordinates(termtopics, method='mds')
topic_proportions = nlp.get_topic_proportions(doctopics, doclength)

print('Computing term frequencies')
term_frequencies = nlp.get_term_frequencies(docterm, termtopics,
                                            topic_proportions, doclength)

print('Computing term ranks per topic')
term_ranks = nlp.get_topic_term_ranks(docterm, termtopics)
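
# A minimal sketch of the corpus2csc step used above, assuming the `lda` and
# `docterm` objects already built: passing num_terms keeps rows aligned with
# topic ids even when near-zero topics are omitted, and transposing yields a
# documents-by-topics matrix.
from gensim.matutils import corpus2csc

doctopics_sparse = corpus2csc(
    [lda.get_document_topics(doc, minimum_probability=0) for doc in docterm],
    num_terms=lda.num_topics)
doctopics_dense = doctopics_sparse.T.toarray()  # shape: (num_docs, num_topics)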
Ejemplo n.º 29
0
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]

from collections import defaultdict

frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1]  # keep only tokens that appear more than once
         for text in texts]

from pprint import pprint  # pretty-printer

dictionary = corpora.Dictionary(texts)
# dictionary.save('/tmp/deerwester.dict') # store the dictionary, for future reference
# print(dictionary)
corpus = [dictionary.doc2bow(text) for text in texts]
# corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)

lda = LdaModel(corpus, num_topics=2)

# on a new document:
new_doc = "pretty obvious that when i write my tellall memoir someday there will be four to six"
new_vec = dictionary.doc2bow(new_doc.lower().split())

print(lda.print_topic(0))
print(lda.show_topic(1))
print(lda.get_document_topics(new_vec))
Ejemplo n.º 30
0
def gensim_lda_topic_modelling(path,
                               documents,
                               num_of_topics=6,
                               passes=50,
                               verbose=True,
                               plotTopicsResults=True):
    dictionary = Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    if verbose:
        print("Cleaned documents:\n", documents)
        print("\nDictionary:\n", dictionary)
        print("\nCorpus in BoW form: \n", corpus)
    start = time.time()
    ldamodel = LdaModel(corpus=corpus,
                        num_topics=num_of_topics,
                        passes=passes,
                        id2word=dictionary)
    end = time.time()
    print("Completion time for building LDA model: %.3f s = %.3f min" %
          ((end - start), (end - start) / 60.0))

    ldatopics = ldamodel.show_topics(num_topics=num_of_topics, formatted=False)
    ldatopics_words = [[[word, prob] for word, prob in topic]
                       for topicid, topic in ldatopics]

    if verbose:
        print("\nList of words associated with each topic:\n")
        for i in range(len(ldatopics_words)):
            print("\nTopic %d:\n" % i)
            for w, p in ldatopics_words[i]:
                print(p, " - ", w)

    if plotTopicsResults:
        plot_top_10_words_per_topic(path,
                                    ldatopics_words,
                                    num_topics=num_of_topics,
                                    num_top_words=10)

    all_documents_topics = [
        (doc_topics, word_topics, word_phis)
        for doc_topics, word_topics, word_phis in ldamodel.get_document_topics(
            corpus, per_word_topics=True)
    ]
    all_doc_topics = []
    for i in range(len(all_documents_topics)):
        doc_topics, word_topics, phi_values = all_documents_topics[i]
        all_doc_topics.append(
            [doc_topics[i][1] for i in range(len(doc_topics))])
        if verbose:
            print('Document topics:', doc_topics)
            print('Word topics:', word_topics)
            print('Phi values:', phi_values)
            print('-------------- \n')

    if plotTopicsResults:
        plot_share_of_topics(path, all_doc_topics, no_random_tweets=10)

    # Plot words coloured differently depending on the topic
    for doc in documents[0:100]:
        if len(doc) > 4:
            color_words(ldamodel, doc)
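
# Illustrative usage only: `toy_docs` and the "plots/" path are made-up
# placeholders; plotTopicsResults=False keeps the sketch independent of the
# custom plotting helpers referenced above.
toy_docs = [["cat", "dog", "pet"], ["stock", "market", "trade"]]
gensim_lda_topic_modelling("plots/", toy_docs, num_of_topics=2, passes=10,
                           verbose=True, plotTopicsResults=False)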
Ejemplo n.º 31
0
class W2V_cpp2(W2V_base):
    def __init__(self, n_topic, path_review, path_business, folder):
        self.n_topic = n_topic

        print('Init W2V_base')
        W2V_base.__init__(self, path_review, path_business, folder)

        print('Process idx2prod')
        #process dict
        for prod_id in list(self.idx2prod.keys()):  # iterate over a copy; the dict is modified inside the loop
            prod = self.idx2prod[prod_id]
            n_prod_id = prod_id - len(self.word_count) - 1
            del self.idx2prod[prod_id]
            self.idx2prod[n_prod_id] = prod
            self.prod2idx[prod] = n_prod_id

        print('Process idx2user')
        for user_id in list(self.idx2user.keys()):  # iterate over a copy; the dict is modified inside the loop
            user = self.idx2user[user_id]
            n_user_id = user_id - len(self.word_count) - len(self.prod2idx) - 1
            del self.idx2user[user_id]
            self.idx2user[n_user_id] = user
            self.user2idx[user] = n_user_id

        print('Init W2V_cpp2 done')

    def train(self):
        data = []
        entity2id = {}
        id2entity = []

        print('Loading data')
        for obj in self.data:
            doc = []
            obj_sents = obj["text_data"]
            entity = obj["prod"]
            if entity not in entity2id:
                entity2id[entity] = len(entity2id)
                id2entity.append(entity)
            doc_id = entity2id[entity]

            for obj_sent in obj_sents:
                for pair in obj_sent:
                    if pair[0] >= 0:
                        doc.append((pair[0], doc_id))
            data.append(doc)

        print('Start training')
        self.ldamodel = LdaModel(corpus=data,
                                 id2word=self.idx2word,
                                 num_topics=self.n_topic)
        print('Training complete, start exporting')

        f_entity = open(home + "/Data/yelp/lda/prod.txt", "w")
        f_model = open(home + "/Data/yelp/lda/model.txt", "w")
        f_model.write(str(len(entity2id)))
        f_model.write(" ")
        f_model.write(str(self.n_topic))
        f_model.write("\n")

        for entity in id2entity:
            f_entity.write(entity)
            f_entity.write("\n")

            f_model.write(entity)
            f_model.write(" ")

            distr = self.ldamodel.get_document_topics(data[entity2id[entity]],
                                                      minimum_phi_value=0,
                                                      minimum_probability=0)
            distr = [pair[1] for pair in distr]

            for prod in distr:
                f_model.write(str(prod))
                f_model.write(" ")

            f_model.write("\n")

        f_entity.close()
        f_model.close()

        self.ldamodel.save(home + "/Data/yelp/lda/model_200")
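
# A minimal sketch, assuming the model.txt layout written by train() above
# (a "<num_entities> <num_topics>" header line, then one line per entity of the
# form "<entity> <p1> ... <pk>"); the parsing details are illustrative.
def load_entity_topic_distributions(path):
    distributions = {}
    with open(path) as f:
        n_entities, n_topics = map(int, f.readline().split())
        for line in f:
            parts = line.split()
            if parts:
                distributions[parts[0]] = [float(x) for x in parts[1:1 + n_topics]]
    return distributions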
Ejemplo n.º 32
0
train_data, train_labels = parse("wikisection_dataset_json/wikisection_en_city_train.json")
test_data, test_labels = parse("wikisection_dataset_json/wikisection_en_city_test.json")
dct, bow = create_bow(train_data)
print("Preprocessing is finished!")

lda_model = LdaModel(
    corpus=bow,
    num_topics=NUM_TOPICS,
    id2word=dct)
print("Lda is finished!")

document_topics = []
for doc_bow in bow:
    ls = []
    for top, prob in lda_model.get_document_topics(bow=doc_bow, minimum_probability=0.0):
        ls.append(prob)
    document_topics.append(ls)

t_document_topics = []
for doc in test_data:
    doc_bow = dct.doc2bow(doc)
    ls = []
    for top, prob in lda_model.get_document_topics(bow=doc_bow, minimum_probability=0.0):
        ls.append(prob)
    t_document_topics.append(ls)

clf = svm.LinearSVC()
clf.fit(document_topics, train_labels)

y_pred = clf.predict(t_document_topics)
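
# A natural follow-up sketch: score the held-out predictions (the import is
# added here because it is not visible in the truncated snippet above).
from sklearn import metrics
print("Test accuracy:", metrics.accuracy_score(test_labels, y_pred))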