def getRelationDetailByHDP(sentence_list):
    # Obtain the result via clustering
    corpus = []
    pairs_all, position_all = segmentor.segListWithNerTag(sentence_list)
    words_list = []
    for pairs in pairs_all:
        word_list = []
        for pair in pairs:
            if "v" in pair.flag or "n" in pair.flag:
                word_list.append(pair.word)
        words_list.append(word_list)
    # words_list = list(map(lambda pairs: map(lambda x: x.word, pairs), pairs_all))
    from gensim import corpora
    dictionary = corpora.Dictionary(words_list)
    for words in words_list:
        corpus.append(dictionary.doc2bow(words))
    from gensim.models import HdpModel
    hdp = HdpModel(corpus, dictionary)
    topics = hdp.print_topics()
    words = {}
    for topic in topics:
        word_details = str(topic[1]).split(" + ")
        for word_detail in word_details:
            word = str(word_detail[word_detail.index("*") + 1:])
            num = float(str(word_detail[:word_detail.index("*")]))
            if word not in words:
                words[word] = num
            else:
                words[word] += num
    words = sorted(words.items(), key=lambda d: d[1])
    return words  # afterwards, take the high-frequency verbs and nouns from the syntactic analysis
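Note: the string parsing above can be sidestepped; `show_topics(formatted=False)` returns the same information as (word, weight) tuples. A minimal sketch of the equivalent aggregation step, assuming the same `hdp` model:

# Sketch only: aggregate per-word weights without parsing formatted topic strings.
weights = {}
for _, topic in hdp.show_topics(num_topics=20, num_words=10, formatted=False):
    for word, weight in topic:
        weights[word] = weights.get(word, 0.0) + weight
words = sorted(weights.items(), key=lambda d: d[1])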
Example #2
    def runModels(self, number_of_topics, corpus, dictionary, start, end):

        #do hdp model

        hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)

        hdpmodel.print_topics(num_topics=int(number_of_topics), num_words=10)
        hdptopics = hdpmodel.show_topics(num_topics=int(number_of_topics))

        #   result_dict=addTotalTermResults(hdptopics)

        #add results to total kept in a list
        #   addToResults(result_dict)

        #output results
        self.printResults(number_of_topics, hdptopics, 'hdp', start, end)

        #do lda model
        ldamodel = LdaModel(corpus=corpus,
                            num_topics=number_of_topics,
                            id2word=dictionary,
                            random_state=100,
                            update_every=1,
                            chunksize=100,
                            passes=10,
                            alpha='auto',
                            per_word_topics=True)

        ldamodel.save('lda' + str(number_of_topics) + '.model')
        ldatopics = ldamodel.show_topics(num_topics=int(number_of_topics))

        #   result_dict=addTotalTermResults(ldatopics)
        #   addToResults(result_dict)
        self.printResults(number_of_topics, ldatopics, 'lda', start, end)

        visualisation = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

        location = os.path.join(pn, 'topic_model_results')

        #visualize outputs in html
        pyLDAvis.save_html(
            visualisation,
            os.path.join(
                location, 'LDA_Visualization' + str(number_of_topics) + "_" +
                start + "_" + end + '.html'))
Example #3
def create_hdp(num_topic, dictionary):
    print("__________________________Create HDP_________________________")
    corpus, dic = generate_corpus(dictionary)
    hdpmodel = HdpModel(corpus=corpus, id2word=dic)
    topics = hdpmodel.print_topics(num_topics=num_topic, num_words=7)
    # see list of topics
    for topic in topics:
        print(topic)

    return hdpmodel
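`generate_corpus` is not shown in this example. A minimal sketch of one plausible implementation, assuming its argument is a list of already-tokenized documents (this helper is hypothetical, not the original):

from gensim import corpora

def generate_corpus(docs):
    # docs: list of token lists, e.g. [["topic", "model"], ["dirichlet", "process"]]
    dic = corpora.Dictionary(docs)
    corpus = [dic.doc2bow(doc) for doc in docs]
    return corpus, dic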
Example #4
def train_topics(args):
    print(f"Arguments: {args}")

    nlp = spacy.load("en", disable=["parser", "ner"])

    files = args["text"]
    lines = extract_stories(files)

    def tokenize(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
        allowed_postags = set(allowed_postags)
        docs = nlp.pipe(texts)
        text_tokens = []
        for doc in docs:
            tokens = [
                token.lemma_ for token in doc if token.pos_ in allowed_postags
                and not token.is_punct and not token.is_stop
            ]
            text_tokens.append(tokens)
        return text_tokens

    docs = tokenize(lines, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    print("Preprocessed Docs")

    bigram = gensim.models.Phrases(docs, min_count=5, threshold=100)
    trigram = gensim.models.Phrases(bigram[docs], threshold=100)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    def make_bigrams(texts):
        return [bigram_mod[doc] for doc in texts]

    def make_trigrams(texts):
        return [trigram_mod[bigram_mod[doc]] for doc in texts]

    docs = make_bigrams(docs)
    docs = make_trigrams(docs)

    print("Create Dictionary")
    # Create Dictionary
    corpus_dict = corpora.Dictionary(docs)
    # Create Corpus
    texts = docs
    # Term Document Frequency
    corpus = [corpus_dict.doc2bow(text) for text in texts]

    print("Train Model")
    hdp = HdpModel(corpus, corpus_dict)

    print(hdp.print_topics(num_topics=50, num_words=20))

    hdp.save(args["target"])
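A possible follow-up (not part of the original script; the path and tokens below are placeholders): reload the saved model and infer a topic mixture for a new document.

from gensim.models import HdpModel

hdp = HdpModel.load("hdp_stories.model")            # whatever args["target"] pointed to
bow = hdp.id2word.doc2bow(["dragon", "castle", "knight"])
print(hdp[bow])                                     # [(topic_id, weight), ...]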
Example #5
    def get_num_topics(self):

        self.rev_train['title'] = self.strip_newline(self.rev_train.title)
        self.rev_test['title'] = self.strip_newline(self.rev_test.title)
        # rev_train.text[21:22].values

        words_tr = list(self.sent_to_words(self.rev_train.title))
        words_te = list(self.sent_to_words(self.rev_test.title))

        words_tr = self.remove_stopwords(words_tr)

        bigram_tr, trigram_tr = self.bigrams(words_tr)

        trigrams_tr = [trigram_tr[bigram_tr[review]] for review in words_tr]

        lemma_lg = self.lemmatization(trigrams_tr)

        with open(os.path.join('.', 'data', 'lemma_lg.pkl'), 'wb') as f:
            pickle.dump(lemma_lg, f)

        id2word_lg = gensim.corpora.Dictionary(lemma_lg)
        id2word_lg.filter_extremes(no_below=2, no_above=0.6)
        id2word_lg.compactify()
        id2word_lg.save(os.path.join('.', 'data', 'train_dict_lg'))
        corpus_lg = [id2word_lg.doc2bow(text) for text in lemma_lg]

        with open(os.path.join('.', 'data', 'corpus_lg.pkl'), 'wb') as f:
            pickle.dump(corpus_lg, f)

        hdp = HdpModel(corpus_lg, id2word_lg, chunksize=100)
        n_topics = len(hdp.print_topics())
        hdptopics = hdp.print_topics(num_topics=n_topics)

        for tp in hdptopics:
            print(tp)

        return n_topics
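Note: `print_topics()` with no arguments returns at most 20 topics (its default `num_topics`), so `n_topics` above is capped at 20. A hedged alternative for gauging how many topics actually carry weight uses the HDP-to-LDA approximation (the 0.01 threshold below is arbitrary):

alpha, beta = hdp.hdp_to_lda()        # per-topic global weights and topic-word distributions
n_topics = int((alpha > 0.01).sum())  # count topics with non-negligible weight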
Example #6
def try_news_cluster():

    docs = feed_doc()
    df_threshold_lower = 50
    df_threshold_upper = 500
    dictionary = corpora.Dictionary(doc for doc in docs)
    print('dictionary ready')
    low_df = [
        tokenid for tokenid, docfreq in dictionary.dfs.items()
        if docfreq <= df_threshold_lower
    ]
    high_df = [
        tokenid for tokenid, docfreq in dictionary.dfs.items()
        if docfreq > df_threshold_upper
    ]
    dictionary.filter_tokens(low_df + high_df)
    dictionary.compactify()
    corpus = [dictionary.doc2bow(doc) for doc in feed_doc()]
    print('corpus ready')
    hdp = HdpModel(corpus, dictionary)
    for topic in hdp.print_topics(num_topics=50, num_words=20):
        print(topic)
Example #7
def hierarchical_dirichlet_process_topic_extraction():
    """
    Function performs topic extraction on Tweets using the Gensim HDP model.

    :return: None.
    """
    from gensim.models import HdpModel

    # LDA can only use raw term counts because it is a probabilistic graphical model.
    tf_vectorizer = CountVectorizer(max_df=0.95,
                                    min_df=2,
                                    max_features=1000,
                                    stop_words='english')
    tf = tf_vectorizer.fit_transform(slo_feature_series)
    tf_feature_names = tf_vectorizer.get_feature_names()

    log.info(
        "\n.fit_transform - Learn the vocabulary dictionary and return term-document matrix."
    )
    log.info(f"{tf}\n")
    log.info(
        "\n.get_feature_names - Array mapping from feature integer indices to feature name"
    )
    log.info(f"{tf_feature_names}\n")

    # Train the HDP model (note: `corpus` and `dictionary` are not built in this
    # function, so they are assumed to be module-level gensim objects).
    hdp = HdpModel(corpus, dictionary)
    time.sleep(3)

    # # For use as wrapper with Scikit-Learn API.
    # model = HdpTransformer(id2word=dictionary)
    # distribution = model.fit_transform(corpus)

    # Display the top words for each topic.
    topic_info = hdp.print_topics(num_topics=20, num_words=10)

    for topic in topic_info:
        print(topic)
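If the intent is to reuse the scikit-learn term counts computed above rather than a separate module-level corpus, one hedged way to bridge them into gensim is sketched below (the variable names are illustrative; a plain id-to-word dict is usually accepted as `id2word`):

from gensim import matutils

# Rows of `tf` are documents, hence documents_columns=False.
sk_corpus = matutils.Sparse2Corpus(tf, documents_columns=False)
sk_id2word = {idx: word for word, idx in tf_vectorizer.vocabulary_.items()}
hdp = HdpModel(sk_corpus, id2word=sk_id2word)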
Example #8
class TcModel:
    """
    Using gensim LDA model to implement the topic cluster
    """
    def __init__(self):
        self.original_data = []
        self.text = []
        self.token = []
        self.corpus = []
        self.id2word = []
        self.model_name = ''
        self.num_topics = 10
        self.iterations = 100
        self.model = None
        self.stop_words = stopwords.words('english')
        self.stop_words.extend(
            ['be', 'say', '-PRON-', 'ms', 'mr', 'year', 'cent'])

    def _tokenize_words(self, text):
        token = []
        total = len(text)
        for i in range(total):
            token.append(gensim.utils.simple_preprocess(text[i], deacc=True))
        return token

    def _phrase(self, token):
        bigram = Phrases(token, min_count=5, threshold=100)
        bigram_mod = Phraser(bigram)
        trigram = Phrases(bigram_mod[token], min_count=5, threshold=100)
        trigram_mod = Phraser(trigram)
        return [trigram_mod[bigram_mod[doc]] for doc in token]

    def _lemmatization(self, token):
        nlp = spacy.load('en', disable=['parser', 'ner'])
        return_text = []
        allow_postags = ['NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN']
        for i in token:
            sentence = nlp(" ".join(i))
            return_text.append([
                token.lemma_ for token in sentence
                if token.pos_ in allow_postags
            ])
        return return_text

    def find_most_common(self, token, plot=False):
        word_list = []
        extra_stopwords = []
        for i in token:
            word_list.extend(i)
        word_dic = collections.Counter(word_list)
        #print(word_dic.most_common(100))
        tf = list(word_dic.values())
        tf.sort(reverse=True)
        if plot == True:
            print(tf[:100])
            plt.plot(range(500), tf[:500])
            plt.xlabel('word sequence')
            plt.ylabel('Term Frequency')
            plt.show()

        m_list = []
        for i in range(len(tf) - 1):
            m_list.append(tf[i] - tf[i + 1])
        k = tf[m_list.index(max(m_list))]
        print(k)
        k = 5000
        for i in word_dic:
            if word_dic[i] > k:
                extra_stopwords.append(i)

        print(extra_stopwords)
        return extra_stopwords

    def _remove_stopwords(self, token):
        return_text = []
        self.stop_words.extend(self.find_most_common(token))
        for i in token:
            return_text.append(
                [word for word in i if word not in self.stop_words])
        return return_text

    def _doc_topic(self):
        """
        Matrix = [doc_id,title,topic,probability,summary,content]
        """
        matrix = []
        for num in range(len(self.corpus)):
            row = self.model[self.corpus[num]]
            row = sorted(row, key=lambda x: x[1], reverse=True)
            for i, j in row:
                if float(j) < 0.05:
                    continue
                value = [
                    self.original_data.ix[num]['id'],
                    self.original_data.ix[num]['title'], i, j,
                    self.original_data.ix[num]['summary'],
                    self.original_data.ix[num]['content']
                ]
                if value not in matrix:
                    matrix.append(value)

        matrix = pd.DataFrame(matrix,
                              columns=[
                                  'doc_id', 'title', 'topic', 'probability',
                                  'summary', 'content'
                              ])
        self.doc_topic = matrix
        print(matrix)
        return matrix

    def _topic_doc(self):
        matrix = []
        for i in range(self.num_topics):
            doc_list = [
                i for i in list(self.doc_topic[
                    self.doc_topic.topic == i].sort_values(
                        by='probability', ascending=False)['doc_id'])
            ]
            if doc_list == []:
                self.num_topics = i
                break
            output = ",".join([str(i) for i in doc_list])
            print('topic {}: {}'.format(i, output))
            matrix.append([i, output])
        return matrix

    def _readable_topic(self, sent_num=3):
        output = []
        for i in range(self.num_topics):
            sent = ''
            content = []
            score_list = []
            topic_term = dict(self.model.show_topic(i, topn=1000))
            topic_list = self.doc_topic[self.doc_topic.topic == i]
            max_pro = heapq.nlargest(5, topic_list['probability'])
            for pro in max_pro:
                content.append(
                    list(topic_list[topic_list.probability == pro]['content'])
                    [0])
            content = ' '.join(content)

            content = [text for text in sent_tokenize(content)]
            for j in range(len(content)):
                words = gensim.utils.simple_preprocess(content[j], deacc=True)
                corpus = self.model.id2word.doc2bow(words)
                score = 0
                for word, num in corpus:
                    word = self.model.id2word.get(word)
                    if word in topic_term.keys():
                        score += num * topic_term[word]
                score_list.append(score)
            score_list = list(set(score_list))
            max_score = heapq.nlargest(sent_num, score_list)
            for j in range(len(max_score)):
                max_sent = score_list.index(max_score[j])
                print('topic {}: {}'.format(i, content[max_sent]))
                sent = sent + str('sentence {}: {}\n'.format(
                    j + 1, content[max_sent]))
            output.append([i, sent])
        return output

    def _topic_key(self):
        output = []
        for i in range(self.num_topics):
            output.append(
                [i, ','.join([item[0] for item in self.model.show_topic(i)])])
        print(output)
        return output

    def train(self,
              path,
              num_topics=20,
              iterations=500,
              n_gram=True,
              lemmatization=True,
              stop_words=True,
              tfidf=True,
              model='lda'):
        """
        Trian the topic cluster model.
        Input value: data: pd.DataFrame format ['id','title','content','summary']
                     num_topics: (int) the number of topics
                     iterations: (int) total number of iteration times
        example:
        >>> lda = LDA_Model
        >>> lda.train(text)
        """
        data = load_data(str(path + '/output/data.csv'))
        self.original_data = data
        self.text = list(data['content'])
        self.num_topics = num_topics
        self.iterations = iterations
        self.model_name = model

        print('tokenizing...')
        self.token = self._tokenize_words(self.text)
        if n_gram == True:
            print('phrasing...')
            self.token = self._phrase(self.token)
        if lemmatization == True:
            print('lemmatization...')
            self.token = self._lemmatization(self.token)
        if stop_words == True:
            print('remove stop words...')
            self.token = self._remove_stopwords(self.token)

        self.id2word = Dictionary(self.token)
        self.corpus = [self.id2word.doc2bow(text) for text in self.token]
        if tfidf == True:
            print('calculate tfidf...')
            tfidf_model = TfidfModel(self.corpus)
            self.corpus = [tfidf_model[i] for i in self.corpus]
        if model == 'lda':
            self.model = LdaModel(corpus=self.corpus,
                                  id2word=self.id2word,
                                  num_topics=self.num_topics,
                                  iterations=self.iterations)
        if model == 'lsi':
            self.model = LsiModel(corpus=self.corpus,
                                  id2word=self.id2word,
                                  num_topics=self.num_topics)
        if model == 'hdp':
            self.model = HdpModel(corpus=self.corpus, id2word=self.id2word)
            self.num_topics = self.model.get_topics().shape[0]

        self.topic_key = pd.DataFrame(self._topic_key(),
                                      columns=['topic_id', 'key_words'])
        self.doc_topic = self._doc_topic()
        self.topic_doc = pd.DataFrame(self._topic_doc(),
                                      columns=['topic_id', 'document_id'])
        self.topic_sent = pd.DataFrame(
            self._readable_topic(),
            columns=['topic_id', 'most relative sentence'])

    def save(self, path='default'):
        #timestr = time.strftime('%Y%m%d%H%M%S',time.localtime(time.time()))
        if path == 'default':
            path = 'model'
            try:
                os.mkdir(path)
            except:
                pass

        else:
            try:
                os.mkdir(path)
            except:
                pass

        if self.model_name == 'lda':
            self.model.save(str(path + '/lda.model'))
        if self.model_name == 'lsi':
            self.model.save(str(path + '/lsi.model'))
        if self.model_name == 'hdp':
            self.model.save(str(path + '/hdp.model'))

        f = open(str(path + '/original_data.pickle'), 'wb')
        pickle.dump(self.original_data, f)
        f.close()
        f = open(str(path + '/text.pickle'), 'wb')
        pickle.dump(self.text, f)
        f.close()
        f = open(str(path + '/token.pickle'), 'wb')
        pickle.dump(self.token, f)
        f.close()
        f = open(str(path + '/corpus.pickle'), 'wb')
        pickle.dump(self.corpus, f)
        f.close()
        path = path + '/result'
        self.save_result(path)

        avg, cosine_matrix = self.similarity()
        sns.set()
        label = []
        col = []
        for i in range(self.num_topics):
            cosine_matrix[i][i] = 0.5
            col.append('topic {}'.format(i))
        cosine_matrix = pd.DataFrame(cosine_matrix)
        cosine_matrix.columns = col
        cosine_matrix.index = col
        sns.heatmap(cosine_matrix, cmap='YlGnBu')
        plt.savefig(path + '/topic_similarity.jpg')
        cosine_matrix.to_csv(str(path + '/cosine_matrix.csv'))

    def save_result(self, path='default'):
        if path == 'default':
            path = 'model/result'
            try:
                os.mkdir(path)
            except:
                pass

        else:
            try:
                os.mkdir(path)
            except:
                pass

        # topic_key = pd.DataFrame(self.print_topics(num_topics=self.num_topics,num_words=10),columns=['topic id','key words'])
        # topic_key.to_csv(str(path+'/topic_key.csv'),index=False)
        # doc_topic = self._doc_topic()
        # doc_topic.to_csv(str(path+'/doc_topic.csv'))
        # topic_doc = pd.DataFrame(self._topic_doc(),columns=['topic id','document id'])
        # topic_doc.to_csv(str(path+'/topic_doc.csv'),index=False)
        # topic_sent = pd.DataFrame(self._readable_topic(),columns=['topic id','most relative sentence'])
        # topic_sent.to_csv(str(path+'/topic_sent.csv'),index=False)

        f = open(str(path + '/topic_key.pickle'), 'wb')
        pickle.dump(self.topic_key, f)
        f.close()

        f = open(str(path + '/doc_topic.pickle'), 'wb')
        pickle.dump(self.doc_topic, f)
        f.close()

        f = open(str(path + '/topic_doc.pickle'), 'wb')
        pickle.dump(self.topic_doc, f)
        f.close()

        f = open(str(path + '/topic_sent.pickle'), 'wb')
        pickle.dump(self.topic_sent, f)
        f.close()

    def load(self, path='default'):
        """
        :param path: the path of trained model.
        :return:
        """
        if path == 'default':
            path = 'model'
        file_list = os.listdir(path)
        for file in file_list:
            if file.endswith('.model'):
                self.model_name = file.split('.')[0]
        if self.model_name == 'lda':
            self.model = LdaModel.load(str(path + '/lda.model'))
        if self.model_name == 'lsi':
            self.model = LsiModel.load(str(path + '/lsi.model'))
        if self.model_name == 'hdp':
            self.model = HdpModel.load(str(path + '/hdp.model'))

        self.id2word = self.model.id2word
        if self.model_name == 'hdp':
            self.num_topics = self.model.get_topics().shape[0]
        else:
            self.num_topics = self.model.num_topics
        #self.iterations = self.model.iterations

        f = open(str(path + '/original_data.pickle'), 'rb')
        self.original_data = pickle.load(f)
        f.close()
        f = open(str(path + '/text.pickle'), 'rb')
        self.text = pickle.load(f)
        f.close()
        f = open(str(path + '/token.pickle'), 'rb')
        self.token = pickle.load(f)
        f.close()
        f = open(str(path + '/corpus.pickle'), 'rb')
        self.corpus = pickle.load(f)
        f.close()

        path = path + '/result'
        f = open(str(path + '/topic_key.pickle'), 'rb')
        self.topic_key = pickle.load(f)
        f.close()

        f = open(str(path + '/doc_topic.pickle'), 'rb')
        self.doc_topic = pickle.load(f)
        f.close()

        f = open(str(path + '/topic_doc.pickle'), 'rb')
        self.topic_doc = pickle.load(f)
        f.close()

        f = open(str(path + '/topic_sent.pickle'), 'rb')
        self.topic_sent = pickle.load(f)
        f.close()

        self.id2word = self.model.id2word
        if self.model_name == 'hdp':
            self.num_topics = self.topic_doc.shape[0]
        else:
            self.num_topics = self.model.num_topics

    def update(self,
               path,
               iterations=100,
               n_gram=True,
               lemmatization=True,
               stop_words=True,
               model='lda'):
        """
        :param path: The path of training file
        :param iterations: Only for lda model
        :param n_gram: choose if use n_gram feature, default is true
        :param lemmatization: choose if use lemmatization feature, default is true
        :param stop_words: choose if need to remove stop words, default is true
        :param model: choose what model to use, default is 'lda'
        :return:
        """
        data = load_data(path + '/output/data.csv')
        pd.concat([self.original_data, data], axis=0)
        text = list(data['content'])
        self.text.extend(text)

        print('tokenizing...')
        token = self._tokenize_words(text)
        self.token.extend(token)
        if n_gram == True:
            print('phrasing...')
            token = self._phrase(token)
            self.token.extend(token)
        if lemmatization == True:
            print('lemmatization...')
            token = self._lemmatization(token)
            self.token.extend(token)
        if stop_words == True:
            print('remove stop words...')
            token = self._remove_stopwords(token)
            self.token.extend(token)

        corpus = [self.id2word.doc2bow(text) for text in self.token]
        self.corpus.extend(corpus)
        self.model.update(corpus=corpus, iterations=iterations)

    def print_topics(self, num_topics=-1, num_words=10):
        """
        :param num_topics:(int, optional) – The number of topics to be selected
        :param num_words:(int, optional) – The number of words to be included per topic
        :return: list of (int, list of (str, float))
        """
        if num_topics == -1:
            num_topics = self.num_topics
        pprint.pprint(
            self.model.print_topics(num_topics=num_topics,
                                    num_words=num_words))
        return self.model.print_topics(num_topics=num_topics,
                                       num_words=num_words)

    def score(self):
        """
        Print the Coherence score of the model.

        """

        #print('\nPerplexity: ', self.model.log_perplexity(self.corpus))
        coherence_model_lda = CoherenceModel(model=self.model,
                                             texts=self.token,
                                             corpus=self.corpus,
                                             dictionary=self.id2word,
                                             coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()
        print('\nCoherence Score: ', coherence_lda)

    def vis(self):
        """
        Visualization of the data through browser.
        """

        vis = pyLDAvis.gensim.prepare(self.model, self.corpus, self.id2word)
        pyLDAvis.show(vis)

    def consine(self, v1, v2):
        cosine = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
        return cosine

    def similarity(self):
        topic_vector = self.model.get_topics()
        num_topics = topic_vector.shape[0]
        consine_matrix = np.diag(np.ones(num_topics))
        consine_list = []
        for i in range(num_topics - 1):
            for j in range(i + 1, num_topics):
                consine_matrix[i][j] = self.consine(topic_vector[i],
                                                    topic_vector[j])
                consine_matrix[j][i] = consine_matrix[i][j]
                consine_list.append(consine_matrix[i][j])
        average = np.average(consine_list)
        return average, consine_matrix

    def to_gephi(self):
        _, cosine_matrix = self.similarity()
        edge = []
        for i in range(self.num_topics - 1):
            for j in range(i + 1, self.num_topics):
                edge.append([
                    'topic {}'.format(i), 'topic {}'.format(j),
                    cosine_matrix[i][j]
                ])

        for i in range(self.doc_topic.shape[0]):
            edge.append([
                'topic {}'.format(self.doc_topic.ix[i]['topic']),
                self.doc_topic.ix[i]['doc_id'],
                self.doc_topic.ix[i]['probability']
            ])
        # edge = []
        # node = []
        # topic_vector = self.model.get_topics()

        #decomposition

    #     pca = PCA(n_components=1000)
    #     topic_vector = pca.fit_transform(topic_vector)
    #     print(len(topic_vector[0]))
    #     for i in range(len(topic_vector)):
    #         for j in range(len(topic_vector[i])):
    #             edge.append(['topic {}'.format(i),j,topic_vector[i][j]])
    #         node.append(['topic {}'.format(i),'topic {}'.format(i)])
    #
    #     return node,edge
    #
    def to_neo4j(self):
        output = []
        for i in range(self.num_topics):
            output.append('CREATE(:Topic{id:"topic %d"})' % i)
            for word, pro in self.model.show_topic(i):
                output.append(
                    'MATCH (t:Topic) where t.id = "topic %d" CREATE t-[:Include{probability:%f}]-> (:Word{word:"%s"})'
                    % (i, pro, word))

        for i in range(len(self.original_data)):
            output.append('CREATE(:Document{id:%d})' %
                          (self.original_data.ix[i]['id']))

        for i in range(len(self.doc_topic)):
            output.append(
                'MATCH (t:Topic),(d:Document) WHERE t.id = "topic %d" and d.id = %d CREATE t-[:Include{probability:%f}]->d'
                %
                (self.doc_topic.ix[i]['topic'], self.doc_topic.ix[i]['doc_id'],
                 self.doc_topic.ix[i]['probability']))

        return output
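A minimal usage sketch of the TcModel class above (the path is a placeholder; `train` expects `<path>/output/data.csv` with the columns noted in its docstring):

tc = TcModel()
tc.train('project_dir', num_topics=20, model='hdp')   # or model='lda' / 'lsi'
tc.print_topics(num_words=10)
tc.score()            # c_v coherence of the fitted model
tc.save('model_out')  # writes the model plus pickled intermediate data and a topic-similarity heatmap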
Example #9
class TcModel:
    """
    Using gensim LDA model to implement the topic cluster
    """

    def __init__(self):
        self.original_data = []
        self.text = []
        self.token = []
        self.corpus = []
        self.id2word = []
        self.model_name = ''
        self.num_topics = 10
        self.iterations = 100
        self.model = None
        self.stop_words = stopwords.words('english')
        self.stop_words.extend(['be', 'say', '-PRON-', 'ms','Mr','Ms','mr', 'year', 'cent', 'per', 'www', 'http', 'com'])

    def _phrase(self, token):
        bigram = Phrases(token, min_count=5, threshold=100)
        bigram_mod = Phraser(bigram)
        # trigram = Phrases(bigram_mod[token],min_count=5,threshold=100)
        # trigram_mod = Phraser(trigram)
        # return [trigram_mod[bigram_mod[doc]] for doc in token]
        return [bigram_mod[doc] for doc in token]

    def _tokenize_words(self,text):
        token = []
        total = len(text)
        for i in range(total):
            token.append(gensim.utils.simple_preprocess(text[i],deacc=True))
        return token

    def _preprocess(self, doc,lemma = True, stop_words = True):
        nlp = spacy.load('en')
        return_text = []
        allow_NER = ["NORP","FAC","ORG","GPE","LOC","PERSON","PRODUCT","LANGUAGE","EVENT"]
        allow_POS = ["ADJ","NOUN","VERB"]
        for i in doc:
            i = re.sub("[\!\/_,%^*(+\"\')]+|[+——()?【】'’“”!,。?、~@#¥%……&*()]+"," ",i)
            i = re.sub("[\s+]"," ",i)
            sentence = nlp(i,disable = ['parser'])
            return_text.append([ent.text for ent in sentence.ents if ent.label_ in allow_NER])
            if lemma == True and stop_words == True:
                return_text[-1].extend([token.lemma_ for token in sentence if token.ent_type_ == '' and token.lemma_ not in self.stop_words and token.pos_ in allow_POS])
            elif lemma == True and stop_words == False:
                return_text[-1].extend([token.lemma_ for token in sentence if token.ent_type_ == '' and token.pos_ in allow_POS])
            elif lemma == False and stop_words == False:
                return_text[-1].extend([token for token in sentence if token.ent_type_ == '' and token.pos_ in allow_POS])
            elif lemma == False and stop_words == True:
                return_text[-1].extend([token for token in sentence if token.ent_type_ == '' and token.lemma_ not in self.stop_words and token.pos_ in allow_POS])
        return return_text

    def find_most_common(self, token, plot=False):
        word_list = []
        extra_stopwords = []
        for i in token:
            word_list.extend(i)
        word_dic = collections.Counter(word_list)
        # print(word_dic.most_common(100))
        tf = list(word_dic.values())
        tf.sort(reverse=True)
        if plot == True:
            print(tf[:100])
            plt.plot(range(500), tf[:500])
            plt.xlabel('word sequence')
            plt.ylabel('Term Frequency')
            plt.show()

        m_list = []
        for i in range(len(tf) - 1):
            m_list.append(tf[i] - tf[i + 1])
        k = tf[m_list.index(max(m_list))]
        print(k)
        k = 5000
        for i in word_dic:
            if word_dic[i] > k:
                extra_stopwords.append(i)

        print(extra_stopwords)
        return extra_stopwords

    def _doc_topic(self):
        """
        Matrix = [doc_id,title,topic,probability,summary,content]
        """
        matrix = []
        for num in range(len(self.corpus)):
            row = self.model[self.corpus[num]]
            row = sorted(row, key=lambda x: x[1], reverse=True)
            for i, j in row:
                if float(j) < 0.05:
                    continue
                value = [self.original_data.ix[num]['id'], self.original_data.ix[num]['title'], i, j,
                         self.original_data.ix[num]['summary'], self.original_data.ix[num]['content']]
                if value not in matrix:
                    matrix.append(value)

        matrix = pd.DataFrame(matrix, columns=['doc_id', 'title', 'topic', 'probability', 'summary', 'content'])
        self.doc_topic = matrix
        print(matrix)
        return matrix

    def _topic_doc(self):
        matrix = []
        for i in range(self.num_topics):
            doc_list = [i for i in list(
                self.doc_topic[self.doc_topic.topic == i].sort_values(by='probability', ascending=False)['doc_id'])]
            if doc_list == []:
                self.num_topics = i
                break
            output = ",".join([str(i) for i in doc_list])
            print('topic {}: {}'.format(i, output))
            matrix.append([i, output])
        return matrix

    def _readable_topic(self, sent_num=5):
        output = []
        for i in range(self.num_topics):
            sent = ''
            content = []
            score_list = []
            topic_term = dict(self.model.show_topic(i, topn=1000))

            content = ' '.join(list(self.doc_topic[self.doc_topic['topic'] == i].drop_duplicates('doc_id').sort_values('probability',ascending=False)[:10]['content']))

            content = sent_tokenize(content)
            for j in range(len(content)):
                words = gensim.utils.simple_preprocess(content[j], deacc=True)
                corpus = self.model.id2word.doc2bow(words)
                score = 0
                for word, num in corpus:
                    word = self.model.id2word.get(word)
                    if word in topic_term.keys():
                        score += num * topic_term[word]
                score_list.append(score)
            #score_list = list(set(score_list))
            max_score = heapq.nlargest(sent_num, score_list)
            for j in range(len(max_score)):
                max_sent = score_list.index(max_score[j])
                print('topic {}: {}'.format(i, content[max_sent]))
                sent = sent + str('sentence {}: {}\n'.format(j + 1, content[max_sent]))
            output.append([i, sent])
        return output

    def _topic_key(self):
        output = []
        for i in range(self.num_topics):
            output.append([i, ','.join([item[0] for item in self.model.show_topic(i, topn=30)])])
        print(output)
        return output

    def train(self, path, num_topics=20, iterations=1000, n_gram=True, lemmatization=True, stop_words=True, tfidf=True,
              model='lda'):
        """
        Trian the topic cluster model.
        Input value: data: pd.DataFrame format ['id','title','content','summary']
                     num_topics: (int) the number of topics
                     iterations: (int) total number of iteration times
        example:
        >>> lda = LDA_Model
        >>> lda.train(text)
        """
        data = load_data(str(path + '/output/data.csv'))
        self.original_data = data
        self.text = list(data['content'])
        self.num_topics = num_topics
        self.iterations = iterations
        self.model_name = model

        print('preprocessing...')
        self.token = self._preprocess(self.text,lemma = lemmatization, stop_words = stop_words)

        self.id2word = Dictionary(self.token)
        self.corpus = [self.id2word.doc2bow(text) for text in self.token]
        if tfidf == True:
            print('calculate tfidf...')
            tfidf_model = TfidfModel(self.corpus)
            self.corpus = tfidf_model[self.corpus]

        if model == 'lda':
            self.model = LdaModel(corpus=self.corpus, id2word=self.id2word, num_topics=self.num_topics,
                                  iterations=self.iterations)
        if model == 'lsi':
            self.model = LsiModel(corpus=self.corpus, id2word=self.id2word, num_topics=self.num_topics)
        if model == 'hdp':
            self.model = HdpModel(corpus=self.corpus, id2word=self.id2word)
            self.num_topics = self.model.get_topics().shape[0]

        self.topic_key = pd.DataFrame(self._topic_key(), columns=['topic_id', 'key_words'])
        self.doc_topic = self._doc_topic()
        self.topic_doc = pd.DataFrame(self._topic_doc(), columns=['topic_id', 'document_id'])
        self.topic_sent = pd.DataFrame(self._readable_topic(), columns=['topic_id', 'most relative sentence'])

    def save(self, path='default'):
        # timestr = time.strftime('%Y%m%d%H%M%S',time.localtime(time.time()))
        if path == 'default':
            path = 'model'
            try:
                os.mkdir(path)
            except:
                pass

        else:
            try:
                os.mkdir(path)
            except:
                pass

        if self.model_name == 'lda':
            self.model.save(str(path + '/lda.model'))
        if self.model_name == 'lsi':
            self.model.save(str(path + '/lsi.model'))
        if self.model_name == 'hdp':
            self.model.save(str(path + '/hdp.model'))

        f = open(str(path + '/original_data.pickle'), 'wb')
        pickle.dump(self.original_data, f)
        f.close()
        f = open(str(path + '/text.pickle'), 'wb')
        pickle.dump(self.text, f)
        f.close()
        f = open(str(path + '/token.pickle'), 'wb')
        pickle.dump(self.token, f)
        f.close()
        f = open(str(path + '/corpus.pickle'), 'wb')
        pickle.dump(self.corpus, f)
        f.close()

        self.to_wordcloud(path)
        self.to_neo4j(path)

        path = path + '/result'
        self.save_result(path)

        avg, cosine_matrix = self.similarity()
        sns.set()
        label = []
        col = []
        for i in range(self.num_topics):
            cosine_matrix[i][i] = 1
            # for j in range(i,self.num_topics):
            #     cosine_matrix[i][j] = 0
            col.append('topic{}'.format(i))
        cosine_matrix = pd.DataFrame(cosine_matrix)
        cosine_matrix.columns = col
        cosine_matrix.index = col
        sns.heatmap(cosine_matrix, cmap='YlGnBu')
        plt.savefig(path + '/topic_similarity.jpg')
        cosine_matrix.to_csv(str(path + '/cosine_matrix.csv'))


    def save_result(self, path='default'):
        if path == 'default':
            path = 'model/result'
            try:
                os.mkdir(path)
            except:
                pass

        else:
            try:
                os.mkdir(path)
            except:
                pass

        # topic_key = pd.DataFrame(self.print_topics(num_topics=self.num_topics,num_words=10),columns=['topic id','key words'])
        # topic_key.to_csv(str(path+'/topic_key.csv'),index=False)
        # doc_topic = self._doc_topic()
        # doc_topic.to_csv(str(path+'/doc_topic.csv'))
        # topic_doc = pd.DataFrame(self._topic_doc(),columns=['topic id','document id'])
        # topic_doc.to_csv(str(path+'/topic_doc.csv'),index=False)
        # topic_sent = pd.DataFrame(self._readable_topic(),columns=['topic id','most relative sentence'])
        # topic_sent.to_csv(str(path+'/topic_sent.csv'),index=False)

        f = open(str(path + '/topic_key.pickle'), 'wb')
        pickle.dump(self.topic_key, f)
        f.close()

        f = open(str(path + '/doc_topic.pickle'), 'wb')
        pickle.dump(self.doc_topic, f)
        f.close()

        f = open(str(path + '/topic_doc.pickle'), 'wb')
        pickle.dump(self.topic_doc, f)
        f.close()

        f = open(str(path + '/topic_sent.pickle'), 'wb')
        pickle.dump(self.topic_sent, f)
        f.close()

    def load(self, path='default'):
        """
        :param path: the path of trained model.
        :return:
        """
        if path == 'default':
            path = 'model'
        file_list = os.listdir(path)
        for file in file_list:
            if file.endswith('.model'):
                self.model_name = file.split('.')[0]
        if self.model_name == 'lda':
            self.model = LdaModel.load(str(path + '/lda.model'))
        if self.model_name == 'lsi':
            self.model = LsiModel.load(str(path + '/lsi.model'))
        if self.model_name == 'hdp':
            self.model = HdpModel.load(str(path + '/hdp.model'))

        self.id2word = self.model.id2word
        if self.model_name == 'hdp':
            self.num_topics = self.model.get_topics().shape[0]
        else:
            self.num_topics = self.model.num_topics
        # self.iterations = self.model.iterations

        f = open(str(path + '/original_data.pickle'), 'rb')
        self.original_data = pickle.load(f)
        f.close()
        f = open(str(path + '/text.pickle'), 'rb')
        self.text = pickle.load(f)
        f.close()
        f = open(str(path + '/token.pickle'), 'rb')
        self.token = pickle.load(f)
        f.close()
        f = open(str(path + '/corpus.pickle'), 'rb')
        self.corpus = pickle.load(f)
        f.close()

        path = path + '/result'
        f = open(str(path + '/topic_key.pickle'), 'rb')
        self.topic_key = pickle.load(f)
        f.close()

        f = open(str(path + '/doc_topic.pickle'), 'rb')
        self.doc_topic = pickle.load(f)
        f.close()

        f = open(str(path + '/topic_doc.pickle'), 'rb')
        self.topic_doc = pickle.load(f)
        f.close()

        f = open(str(path + '/topic_sent.pickle'), 'rb')
        self.topic_sent = pickle.load(f)
        f.close()

        self.id2word = self.model.id2word
        if self.model_name == 'hdp':
            self.num_topics = self.topic_doc.shape[0]
        else:
            self.num_topics = self.model.num_topics

    def update(self, path, iterations=100, n_gram=True, lemmatization=True, stop_words=True, model='lda'):
        """
        :param path: The path of training file
        :param iterations: Only for lda model
        :param n_gram: choose if use n_gram feature, default is true
        :param lemmatization: choose if use lemmatization feature, default is true
        :param stop_words: choose if need to remove stop words, default is true
        :param model: choose what model to use, default is 'lda'
        :return:
        """
        data = load_data(path + '/output/data.csv')
        pd.concat([self.original_data, data], axis=0)
        text = list(data['content'])
        self.text.extend(text)

        print('preprocessing...')
        self.token = self._preprocess(self.text, lemma=lemmatization, stop_words=stop_words)

        corpus = [self.id2word.doc2bow(text) for text in self.token]
        self.corpus.extend(corpus)
        self.model.update(corpus=corpus, iterations=iterations)

    def print_topics(self, num_topics=-1, num_words=10):
        """
        :param num_topics:(int, optional) – The number of topics to be selected
        :param num_words:(int, optional) – The number of words to be included per topic
        :return: list of (int, list of (str, float))
        """
        if num_topics == -1:
            num_topics = self.num_topics
        pprint.pprint(self.model.print_topics(num_topics=num_topics, num_words=num_words))
        return self.model.print_topics(num_topics=num_topics, num_words=num_words)

    def score(self):
        """
        Print the Coherence score of the model.

        """

        # print('\nPerplexity: ', self.model.log_perplexity(self.corpus))
        coherence_model_lda = CoherenceModel(model=self.model, texts=self.token, corpus=self.corpus,
                                             dictionary=self.id2word, coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()
        print('\nCoherence Score: ', coherence_lda)

    def vis(self):
        """
        Visualization of the data through browser.
        """

        vis = pyLDAvis.gensim.prepare(self.model, self.corpus, self.id2word)
        pyLDAvis.show(vis)

    def consine(self, v1, v2):
        cosine = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
        return cosine

    def similarity(self):
        topic_vector = self.model.get_topics()
        num_topics = topic_vector.shape[0]
        consine_matrix = np.diag(np.ones(num_topics))
        consine_list = []
        for i in range(num_topics - 1):
            for j in range(i + 1, num_topics):
                consine_matrix[i][j] = self.consine(topic_vector[i], topic_vector[j])
                consine_matrix[j][i] = consine_matrix[i][j]
                consine_list.append(consine_matrix[i][j])
        average = np.average(consine_list)
        return average, consine_matrix

    def to_gephi(self):
        _, cosine_matrix = self.similarity()
        edge = []
        for i in range(self.num_topics - 1):
            for j in range(i + 1, self.num_topics):
                edge.append(['topic {}'.format(i), 'topic {}'.format(j), cosine_matrix[i][j]])

        for i in range(self.doc_topic.shape[0]):
            edge.append(['topic {}'.format(self.doc_topic.ix[i]['topic']), self.doc_topic.ix[i]['doc_id'],
                         self.doc_topic.ix[i][
                             'probability']])
        return edge

    def to_wordcloud(self,path):
        try:
            os.mkdir(path + '/wordcloud')
        except:
            pass
        path = path + '/wordcloud'
        cont = []
        for i in range(self.num_topics):
            key_word = dict(self.model.show_topic(i,topn=1000))
            #cont = " ".join([word * int(value*10000) for word,value in key_word])
            #cont = ",".join([(word + ",") * int(value*10000) for word,value in key_word])
            wordcloud = WordCloud(max_words=300, background_color="white",height=600,width=800).generate_from_frequencies(key_word)
            wordcloud.to_file(path+"/topic{}.png".format(i))

    def to_neo4j(self, path):
        try:
            os.mkdir(path + '/database')
        except:
            pass

        path = path + '/database'
        self.original_data.to_csv(path + '/document.csv', index=False)
        topic = []
        relationship = []
        words = []
        for i in range(self.num_topics):
            topic.append(['topic {}'.format(i)])
            for word, pro in self.model.show_topic(i):
                words.append([word])
                relationship.append(['topic {}'.format(i), pro, word])

        topic = pd.DataFrame(topic)
        topic.columns = ['id']
        topic.to_csv(path + '/topic.csv', index=False)
        words = pd.DataFrame(words)
        words.columns = ['word']
        words.to_csv(path + '/words.csv', index=False)

        for i in range(len(self.doc_topic)):
            relationship.append(['topic {}'.format(self.doc_topic.ix[i]['topic']), self.doc_topic.ix[i]['probability'],
                                 self.doc_topic.ix[i]['doc_id']])

        _, consine_matrix = self.similarity()
        for i in range(self.num_topics - 1):
            for j in range(i + 1, self.num_topics):
                relationship.append(['topic %d' % i, consine_matrix[i][j], 'topic %d' % j])

        relationship = pd.DataFrame(relationship)
        relationship.columns = ['source', 'probability', 'target']
        relationship.to_csv(path + '/relationship.csv', index=False)

        f = open(path + '/script.txt', 'w')
        f.write(
            'load csv with headers from "file:///document.csv" as line \nmerge (d:Document{id:toInteger(line.id),title:line.title,summary:line.title,content:line.content})\n\n')
        f.write('load csv with headers from "file:///topic.csv" as line\nmerge (t:Topic{id:line.id})\n\n')
        f.write('load csv with headers from "file:///words.csv" as line\nmerge (w:Word{id:line.word})\n\n')
        f.write(
            'load csv with headers from "file:///relationship.csv" as line\nmatch (from:Topic{id:line.source}),(to:Word{id:line.target})\nmerge (from)-[r:Key_word{probability:line.probability}]->(to)\n\n')
        f.write(
            'load csv with headers from "file:///relationship.csv" as line\nmatch (from:Topic{id:line.source}),(to:Document{id:toInteger(line.target)})\nmerge (from)-[r:Include{probability:line.probability}]->(to)\n\n')
        f.write(
            'load csv with headers from "file:///relationship.csv" as line\nmatch (from:Topic{id:line.source}),(to:Topic{id:line.target})\nmerge (from)<-[r:Similarity{probability:line.probability}]->(to)\n\n')
        f.close()
Example #10
from gensim.models import HdpModel
import os

from utils import read_traj_synthetic, ObsQuantizer

__author__ = 'Romain Tavenard romain.tavenard[at]univ-rennes2.fr'

corpus_raw = []
path = "data/toy_pierre"
max_iter = 100

for fname in os.listdir(path):
    if not fname.endswith(".txt"):
        continue
    fullname = os.path.join(path, fname)
    traj = read_traj_synthetic(fullname)
    corpus_raw.append(traj)

oq = ObsQuantizer(min_x=-25, max_x=5, min_y=-10, max_y=10)
corpus_gensim = oq.fit(corpus_raw)

hdp = HdpModel(corpus=corpus_gensim, id2word=oq.dictionary)
for iter in range(max_iter):
    hdp.update(corpus=corpus_gensim)

topic_info = hdp.print_topics(num_topics=20, num_words=10)

print(hdp)
for topic in topic_info:
    print(topic)
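A possible follow-up to the script above (not in the original): after the update loop, the fitted HDP can be collapsed into an approximately equivalent LdaModel, which is convenient for downstream tools that expect LDA.

lda_approx = hdp.suggested_lda_model()
print(lda_approx.print_topics(num_topics=10, num_words=10))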
Example #11
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File  : gensim_hdp.py
# @Author: TW
# @Date  : 2018/3/23
# @Desc  :
from gensim import corpora
from gensim.models import HdpModel

import tw_word2vec.word2vec as tw_w2v
from tw_segment import jiebaseg

default_model: dict = tw_w2v.get_word2vec_dic("../data/needed_zh_word2vec.bin")
keys = list(default_model.keys())
dictionary = corpora.Dictionary(keys)
corpus = []
with open("../data/rawZhData/news_raw_wc2017-12-19.txt", "r") as f:
    for line in f.readlines():
        line = line.strip()
        bow = dictionary.doc2bow(jiebaseg.segOnly(line))
        corpus.append(bow)

hdp = HdpModel(corpus, dictionary)
hdp.print_topics(num_topics=20, num_words=10)
Example #12
from gensim import corpora
from gensim.models import HdpModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

stop_words = set(stopwords.words('english'))

file = open("ap_changed.txt", "r+")
documents = file.readlines()

#word_tokens = word_tokenize([for doc in documents])

texts = [[
    re.sub(r'[^a-z]', '', text.lower()) for text in doc.split()
    if text not in stop_words and re.sub(r'[^a-z]', '', text)
] for doc in documents]
#print('TEXT: ', texts[0])

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)

print(hdpmodel.print_topics(1000, 1))

#print(hdptopics)
Example #13
def hlda(corpus, eldictionary, topic, probably_words):
    hdp = HdpModel(corpus, eldictionary)
    topic_info = hdp.print_topics(num_topics=20, num_words=10)
    print(topic_info)
Example #14
        #content.download()
        #content.parse()
        #content.nlp()
        article = {}

        # stemmed_words = set(stem_tokens(content.cleaned_text, stemmer))
        article['keywords'] = text
        article['url'] = u
        article['title'] = content.title
        articles.append(article)
    except:
        continue

#add existing articles to new articles
if len(timelines) > 0:
    recent_timelines = [*map(lambda t: t[0], timelines)]
    articles = recent_timelines + articles

texts = [*map(lambda x: x['keywords'], articles)]

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

hdp = HdpModel(corpus, dictionary)
#print(hdp.print_topics(num_topics=3, num_words=10))
print(hdp.show_topics(num_topics=-1, num_words=10))

topics = hdp.print_topics(num_topics=-1)
texst = 1