Esempi in Python per HdpModel.save

Linguaggio di programmazione: Python

Spazio dei nomi/nome del pacchetto: gensim.models

Classe/tipologia: HdpModel

Metodo/funzione: save

Esempi su hotexamples.com: 12

HdpModel.save in Python: 12 esempi trovati. Questi sono i migliori esempi reali in Python di gensim.models.HdpModel.save, estratti da progetti open source. Puoi valutarli per aiutarci a migliorare la qualità degli esempi.

Metodi utilizzati di frequente

Mostra / Nascondi

HdpModel(30)

show_topics(21)

print_topics(14)

save(10)

load(9)

get_topics(5)

update(5)

hdp_to_lda(3)

show_topic(2)

suggested_lda_model(2)

__getitem__(1)

Esempio n. 1

Mostra file

def train_topics(args):
    """Fit an HDP topic model on lemmatized text and save it.

    The value of ``args["text"]`` is handed to ``extract_stories``; the
    resulting lines are lemmatized with spaCy (keeping nouns, adjectives,
    verbs and adverbs that are neither punctuation nor stop words), passed
    through bigram and trigram phrasers, turned into a bag-of-words corpus
    and used to train a gensim ``HdpModel``.

    :param args: mapping providing the keys ``"text"`` and ``"target"``;
        the trained model is saved to ``args["target"]``.
    """
    print(f"Arguments: {args}")

    nlp = spacy.load("en", disable=["parser", "ner"])

    lines = extract_stories(args["text"])

    def lemmatize(texts, keep_pos):
        """Return one list of lemmas per text, filtered by part of speech."""
        keep_pos = set(keep_pos)
        lemmatized = []
        for parsed in nlp.pipe(texts):
            kept = []
            for tok in parsed:
                if tok.pos_ not in keep_pos:
                    continue
                if tok.is_punct or tok.is_stop:
                    continue
                kept.append(tok.lemma_)
            lemmatized.append(kept)
        return lemmatized

    docs = lemmatize(lines, ['NOUN', 'ADJ', 'VERB', 'ADV'])

    print("Preprocessed Docs")

    # Phrase detectors are fitted on the lemmatized documents, then frozen.
    bigram = gensim.models.Phrases(docs, min_count=5, threshold=100)
    trigram = gensim.models.Phrases(bigram[docs], threshold=100)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    # Same two passes as before: bigrams first, then bigram+trigram.
    docs = [bigram_mod[doc] for doc in docs]
    docs = [trigram_mod[bigram_mod[doc]] for doc in docs]

    print("Create Dictionary")
    corpus_dict = corpora.Dictionary(docs)
    # Bag-of-words (term id, count) representation of every document.
    corpus = []
    for doc in docs:
        corpus.append(corpus_dict.doc2bow(doc))

    print("Train Model")
    hdp = HdpModel(corpus, corpus_dict)

    print(hdp.print_topics(num_topics=50, num_words=20))

    hdp.save(args["target"])

Esempio n. 2

Mostra file

    def hdpmodel(self, corpus_t, save=False, savename=None):
        """
        Fit a Hierarchical Dirichlet Process model on ``corpus_t``.

        :param corpus_t: corpus passed to ``HdpModel`` as ``corpus``.
        :param save: when truthy, the fitted model is also written to disk.
        :param savename: destination passed to the model's ``save``; only
            used when ``save`` is truthy.
        :return: the fitted ``HdpModel``.
        """
        print('using Hierarchical Dirichlet Process model...')
        fitted = HdpModel(corpus=corpus_t, id2word=self.word_dict)
        if not save:
            return fitted
        # The message below announces the output file of the HDP model.
        print('输出hdp模型到文件：{}'.format(savename))
        fitted.save(savename)
        return fitted

Esempio n. 3

Mostra file

File: topicModeling.py Progetto: deakkon/TechDashboard

    def createHDP(self, fileName='', modelName=''):
        '''
        Train an HDP model from a stored dictionary/corpus pair and save it.

        fileName -> base name of the dictionary (.dict) and corpus (.mm)
                    files, appended to the instance's destination prefix;
                    '' falls back to the instance's default file name
        modelName -> base name of the saved model (.hdp), appended to the
                     same prefix; '' falls back to the default file name
        '''
        if fileName == '':
            fileName = self.__fileName

        if modelName == '':
            modelName = self.__fileName

        base_path = self.__destination + fileName
        # Named id2word rather than ``dict`` so the builtin is not shadowed.
        id2word = corpora.Dictionary.load(base_path + '.dict')
        mm = corpora.MmCorpus(base_path + '.mm')

        hdp = HdpModel(corpus=mm, id2word=id2word)
        hdp.save(self.__destination + modelName + '.hdp')
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(hdp)
        # Report the name the model was actually saved under.
        print('Created HDP model %s' % modelName)

Esempio n. 4

Mostra file

File: topicModeling.py Progetto: deakkon/TechDashboard

    def createHDP(self, fileName='', modelName=''):
        '''
        Train an HDP model from a stored dictionary/corpus pair and save it.

        fileName -> base name of the dictionary (.dict) and corpus (.mm)
                    files, appended to the instance's destination prefix;
                    '' falls back to the instance's default file name
        modelName -> base name of the saved model (.hdp), appended to the
                     same prefix; '' falls back to the default file name
        '''
        if fileName == '':
            fileName = self.__fileName

        if modelName == '':
            modelName = self.__fileName

        base_path = self.__destination + fileName
        # Named id2word rather than ``dict`` so the builtin is not shadowed.
        id2word = corpora.Dictionary.load(base_path + '.dict')
        mm = corpora.MmCorpus(base_path + '.mm')

        hdp = HdpModel(corpus=mm, id2word=id2word)
        hdp.save(self.__destination + modelName + '.hdp')
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(hdp)
        # Report the name the model was actually saved under.
        print('Created HDP model %s' % modelName)

Esempio n. 5

Mostra file

    def set_model(self, lang: str, data_version: int,
                  dictionary_version: float, model_version: str,
                  param_name: str, param_version: int, model_file_path: str,
                  language_processed_data: list):
        """Train an HDP model, save it, and record its evaluation metrics.

        The model is built from ``self.essentials.corpus`` and the
        ``id2token`` mapping of ``self.essentials.dictionary``, saved to
        ``model_file_path`` and stored on ``self.model``. Evaluation metrics
        (computed from ``language_processed_data``) and model parameters are
        then handed, together with the version/identifier arguments, to
        ``write_model_evaluation_metrics``.
        """
        logging.info("---- Creating HDP model")
        essentials = self.essentials
        # Index lookup kept for its side effect: presumably it makes the
        # dictionary build its id -> token mapping before ``id2token`` is
        # read below — TODO confirm against the dictionary implementation.
        _ = essentials.dictionary[0]
        hdp = HdpModel(corpus=essentials.corpus,
                       id2word=essentials.dictionary.id2token)
        hdp.save(model_file_path)
        self.model = hdp
        logging.info("---- HDP model is created")

        self.write_model_evaluation_metrics(
            lang, data_version, dictionary_version, model_version,
            param_name, param_version,
            self.get_model_evaluation_metrics(language_processed_data),
            self.get_model_parameters())

Esempio n. 6

Mostra file

class TcModel:
    """
    Using gensim LDA model to implement the topic cluster
    """
    def __init__(self):
        """Create an empty, untrained model holder with default settings."""
        # Raw input and the successive preprocessing products.
        self.original_data, self.text, self.token = [], [], []
        self.corpus, self.id2word = [], []
        # Model configuration and the fitted model itself.
        self.model_name = ''
        self.num_topics = 10
        self.iterations = 100
        self.model = None
        # English stop words from ``stopwords`` plus a few extra terms.
        extra_stop_words = ['be', 'say', '-PRON-', 'ms', 'mr', 'year', 'cent']
        self.stop_words = stopwords.words('english')
        self.stop_words.extend(extra_stop_words)

    def _tokenize_words(self, text):
        """Tokenize every document with gensim's ``simple_preprocess``.

        :param text: indexable sequence of raw document strings.
        :return: list holding one token list per document (``deacc=True``).
        """
        return [
            gensim.utils.simple_preprocess(text[pos], deacc=True)
            for pos in range(len(text))
        ]

    def _phrase(self, token):
        """Apply a bigram and then a trigram phraser to every document.

        Both phrase models are fitted on ``token`` itself with
        ``min_count=5`` and ``threshold=100``.

        :param token: list of token lists, one per document.
        :return: list of token lists after both phrasers have been applied.
        """
        bigram_mod = Phraser(Phrases(token, min_count=5, threshold=100))
        trigram_mod = Phraser(
            Phrases(bigram_mod[token], min_count=5, threshold=100))
        phrased = []
        for doc in token:
            phrased.append(trigram_mod[bigram_mod[doc]])
        return phrased

    def _lemmatization(self, token):
        """Replace tokens by their lemmas, keeping selected parts of speech.

        Each token list is joined with spaces, parsed with the spaCy ``en``
        pipeline (parser and NER disabled) and reduced to the lemmas of
        nouns, adjectives, verbs, adverbs and proper nouns.

        :param token: list of token lists, one per document.
        :return: list of lemma lists, one per document.
        """
        nlp = spacy.load('en', disable=['parser', 'ner'])
        allow_postags = ['NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN']
        lemmatized = []
        for words in token:
            parsed = nlp(" ".join(words))
            lemmas = []
            for tok in parsed:
                if tok.pos_ in allow_postags:
                    lemmas.append(tok.lemma_)
            lemmatized.append(lemmas)
        return lemmatized

    def find_most_common(self, token, plot=False, threshold=5000):
        """Return the words whose corpus frequency exceeds ``threshold``.

        :param token: iterable of token lists, one per document.
        :param plot: when truthy, print the 100 highest frequencies and plot
            up to the 500 highest ones with matplotlib.
        :param threshold: a word is reported when it occurs strictly more
            than this many times (5000 was previously hard-coded).
        :return: list of over-frequent words, in first-seen order.
        """
        word_dic = collections.Counter(
            word for doc in token for word in doc)

        tf = sorted(word_dic.values(), reverse=True)
        if plot:
            print(tf[:100])
            head = tf[:500]
            # x must match the data length; the vocabulary may hold fewer
            # than 500 distinct words.
            plt.plot(range(len(head)), head)
            plt.xlabel('word sequence')
            plt.ylabel('Term Frequency')
            plt.show()

        # Frequency sitting just before the largest drop in the ranked list.
        # It is only printed; the cut-off actually applied is ``threshold``.
        gaps = [tf[i] - tf[i + 1] for i in range(len(tf) - 1)]
        if gaps:  # fewer than two distinct words leaves nothing to compare
            print(tf[gaps.index(max(gaps))])

        extra_stopwords = [
            word for word, count in word_dic.items() if count > threshold
        ]

        print(extra_stopwords)
        return extra_stopwords

    def _remove_stopwords(self, token):
        return_text = []
        self.stop_words.extend(self.find_most_common(token))
        for i in token:
            return_text.append(
                [word for word in i if word not in self.stop_words])
        return return_text

    def _doc_topic(self):
        """
        Matrix = [doc_id,title,topic,probability,summary,content]
        """
        matrix = []
        for num in range(len(self.corpus)):
            row = self.model[self.corpus[num]]
            row = sorted(row, key=lambda x: x[1], reverse=True)
            for i, j in row:
                if float(j) < 0.05:
                    continue
                value = [
                    self.original_data.ix[num]['id'],
                    self.original_data.ix[num]['title'], i, j,
                    self.original_data.ix[num]['summary'],
                    self.original_data.ix[num]['content']
                ]
                if value not in matrix:
                    matrix.append(value)

        matrix = pd.DataFrame(matrix,
                              columns=[
                                  'doc_id', 'title', 'topic', 'probability',
                                  'summary', 'content'
                              ])
        self.doc_topic = matrix
        print(matrix)
        return matrix

    def _topic_doc(self):
        matrix = []
        for i in range(self.num_topics):
            doc_list = [
                i for i in list(self.doc_topic[
                    self.doc_topic.topic == i].sort_values(
                        by='probability', ascending=False)['doc_id'])
            ]
            if doc_list == []:
                self.num_topics = i
                break
            output = ",".join([str(i) for i in doc_list])
            print('topic {}: {}'.format(i, output))
            matrix.append([i, output])
        return matrix

    def _readable_topic(self, sent_num=3):
        """Pick the sentences that best represent every topic.

        For each topic, the content of the documents carrying the five
        highest probabilities is joined and split into sentences. Every
        sentence is scored by summing ``count * weight`` over its words that
        appear among the topic's top 1000 terms, and the sentences with the
        ``sent_num`` highest distinct scores are reported.

        :param sent_num: maximum number of sentences kept per topic.
        :return: list of ``[topic_id, text]`` rows, where ``text`` holds one
            ``"sentence N: ..."`` line per selected sentence.
        """
        output = []
        for i in range(self.num_topics):
            topic_term = dict(self.model.show_topic(i, topn=1000))
            topic_list = self.doc_topic[self.doc_topic.topic == i]
            max_pro = heapq.nlargest(5, topic_list['probability'])
            top_docs = [
                list(topic_list[topic_list.probability == pro]['content'])[0]
                for pro in max_pro
            ]
            sentences = sent_tokenize(' '.join(top_docs))

            score_list = []
            for sentence in sentences:
                words = gensim.utils.simple_preprocess(sentence, deacc=True)
                bow = self.model.id2word.doc2bow(words)
                score = 0
                for word_id, count in bow:
                    word = self.model.id2word.get(word_id)
                    if word in topic_term:
                        score += count * topic_term[word]
                score_list.append(score)

            # Remember where each distinct score first occurs. The previous
            # code deduplicated through a set and then reused positions of
            # that reordered list to index the sentences, which selected
            # unrelated sentences.
            first_index = {}
            for idx, score in enumerate(score_list):
                first_index.setdefault(score, idx)

            sent = ''
            best_scores = heapq.nlargest(sent_num, first_index)
            for rank, score in enumerate(best_scores, start=1):
                chosen = sentences[first_index[score]]
                print('topic {}: {}'.format(i, chosen))
                sent += 'sentence {}: {}\n'.format(rank, chosen)
            output.append([i, sent])
        return output

    def _topic_key(self):
        output = []
        for i in range(self.num_topics):
            output.append(
                [i, ','.join([item[0] for item in self.model.show_topic(i)])])
        print(output)
        return output

    def train(self,
              path,
              num_topics=20,
              iterations=500,
              n_gram=True,
              lemmatization=True,
              stop_words=True,
              tfidf=True,
              model='lda'):
        """
        Trian the topic cluster model.
        Input value: data: pd.DataFrame format ['id','title','content','summary']
                     num_topics: (int) the number of topics
                     iterations: (int) total number of iteration times
        example:
        >>> lda = LDA_Model
        >>> lda.train(text)
        """
        data = load_data(str(path + '/output/data.csv'))
        self.original_data = data
        self.text = list(data['content'])
        self.num_topics = num_topics
        self.iterations = iterations
        self.model_name = model

        print('tokenizing...')
        self.token = self._tokenize_words(self.text)
        if n_gram == True:
            print('phrasing...')
            self.token = self._phrase(self.token)
        if lemmatization == True:
            print('lemmatization...')
            self.token = self._lemmatization(self.token)
        if stop_words == True:
            print('remove stop words...')
            self.token = self._remove_stopwords(self.token)

        self.id2word = Dictionary(self.token)
        self.corpus = [self.id2word.doc2bow(text) for text in self.token]
        if tfidf == True:
            print('calculate tfidf...')
            tfidf_model = TfidfModel(self.corpus)
            self.corpus = [tfidf_model[i] for i in self.corpus]
        if model == 'lda':
            self.model = LdaModel(corpus=self.corpus,
                                  id2word=self.id2word,
                                  num_topics=self.num_topics,
                                  iterations=self.iterations)
        if model == 'lsi':
            self.model = LsiModel(corpus=self.corpus,
                                  id2word=self.id2word,
                                  num_topics=self.num_topics)
        if model == 'hdp':
            self.model = HdpModel(corpus=self.corpus, id2word=self.id2word)
            self.num_topics = self.model.get_topics().shape[0]

        self.topic_key = pd.DataFrame(self._topic_key(),
                                      columns=['topic_id', 'key_words'])
        self.doc_topic = self._doc_topic()
        self.topic_doc = pd.DataFrame(self._topic_doc(),
                                      columns=['topic_id', 'document_id'])
        self.topic_sent = pd.DataFrame(
            self._readable_topic(),
            columns=['topic_id', 'most relative sentence'])

    def save(self, path='default'):
        """Write the model, its training data and the reports to ``path``.

        Creates ``path`` if needed, saves the gensim model as
        ``<model_name>.model`` (for ``lda``, ``lsi`` and ``hdp``), pickles
        the raw data, texts, tokens and corpus, delegates the report tables
        to ``save_result(path + '/result')`` and finally stores the topic
        similarity matrix there as a heatmap image and a CSV file.

        :param path: target directory; ``'default'`` means ``'model'``.
        """
        if path == 'default':
            path = 'model'
        # Replaces the former bare ``except: pass`` around os.mkdir: an
        # existing directory is fine, any real failure is reported here
        # instead of surfacing later as a confusing open() error.
        os.makedirs(path, exist_ok=True)

        if self.model_name in ('lda', 'lsi', 'hdp'):
            self.model.save(path + '/' + self.model_name + '.model')

        snapshots = (
            ('original_data', self.original_data),
            ('text', self.text),
            ('token', self.token),
            ('corpus', self.corpus),
        )
        for stem, payload in snapshots:
            # ``with`` closes the file even when pickling fails.
            with open(path + '/' + stem + '.pickle', 'wb') as f:
                pickle.dump(payload, f)

        path = path + '/result'
        self.save_result(path)

        _, cosine_matrix = self.similarity()
        sns.set()
        col = []
        for i in range(self.num_topics):
            # The diagonal is overwritten before plotting (presumably so
            # that self-similarity does not dominate the colour scale —
            # TODO confirm the intent of 0.5).
            cosine_matrix[i][i] = 0.5
            col.append('topic {}'.format(i))
        cosine_matrix = pd.DataFrame(cosine_matrix)
        cosine_matrix.columns = col
        cosine_matrix.index = col
        sns.heatmap(cosine_matrix, cmap='YlGnBu')
        plt.savefig(path + '/topic_similarity.jpg')
        cosine_matrix.to_csv(path + '/cosine_matrix.csv')

    def save_result(self, path='default'):
        if path == 'default':
            path = 'model/result'
            try:
                os.mkdir(path)
            except:
                pass

        else:
            try:
                os.mkdir(path)
            except:
                pass

        # topic_key = pd.DataFrame(self.print_topics(num_topics=self.num_topics,num_words=10),columns=['topic id','key words'])
        # topic_key.to_csv(str(path+'/topic_key.csv'),index=False)
        # doc_topic = self._doc_topic()
        # doc_topic.to_csv(str(path+'/doc_topic.csv'))
        # topic_doc = pd.DataFrame(self._topic_doc(),columns=['topic id','document id'])
        # topic_doc.to_csv(str(path+'/topic_doc.csv'),index=False)
        # topic_sent = pd.DataFrame(self._readable_topic(),columns=['topic id','most relative sentence'])
        # topic_sent.to_csv(str(path+'/topic_sent.csv'),index=False)

        f = open(str(path + '/topic_key.pickle'), 'wb')
        pickle.dump(self.topic_key, f)
        f.close()

        f = open(str(path + '/doc_topic.pickle'), 'wb')
        pickle.dump(self.doc_topic, f)
        f.close()

        f = open(str(path + '/topic_doc.pickle'), 'wb')
        pickle.dump(self.topic_doc, f)
        f.close()

        f = open(str(path + '/topic_sent.pickle'), 'wb')
        pickle.dump(self.topic_sent, f)
        f.close()

    def load(self, path='default'):
        """
        :param path: the path of trained model.
        :return:
        """
        if path == 'default':
            path = 'model'
        file_list = os.listdir(path)
        for file in file_list:
            if file.endswith('.model'):
                self.model_name = file.split('.')[0]
        if self.model_name == 'lda':
            self.model = LdaModel.load(str(path + '/lda.model'))
        if self.model_name == 'lsi':
            self.model = LsiModel.load(str(path + '/lsi.model'))
        if self.model_name == 'hdp':
            self.model = HdpModel.load(str(path + '/hdp.model'))

        self.id2word = self.model.id2word
        if self.model_name == 'hdp':
            self.num_topics = self.model.get_topics().shape[0]
        else:
            self.num_topics = self.model.num_topics
        #self.iterations = self.model.iterations

        f = open(str(path + '/original_data.pickle'), 'rb')
        self.original_data = pickle.load(f)
        f.close()
        f = open(str(path + '/text.pickle'), 'rb')
        self.text = pickle.load(f)
        f.close()
        f = open(str(path + '/token.pickle'), 'rb')
        self.token = pickle.load(f)
        f.close()
        f = open(str(path + '/corpus.pickle'), 'rb')
        self.corpus = pickle.load(f)
        f.close()

        path = path + '/result'
        f = open(str(path + '/topic_key.pickle'), 'rb')
        self.topic_key = pickle.load(f)
        f.close()

        f = open(str(path + '/doc_topic.pickle'), 'rb')
        self.doc_topic = pickle.load(f)
        f.close()

        f = open(str(path + '/topic_doc.pickle'), 'rb')
        self.topic_doc = pickle.load(f)
        f.close()

        f = open(str(path + '/topic_sent.pickle'), 'rb')
        self.topic_sent = pickle.load(f)
        f.close()

        self.id2word = self.model.id2word
        if self.model_name == 'hdp':
            self.num_topics = self.topic_doc.shape[0]
        else:
            self.num_topics = self.model.num_topics

    def update(self,
               path,
               iterations=100,
               n_gram=True,
               lemmatization=True,
               stop_words=True,
               model='lda'):
        """
        Extend the trained model with additional documents.

        :param path: The path of training file
        :param iterations: Only for lda model
        :param n_gram: choose if use n_gram feature, default is true
        :param lemmatization: choose if use lemmatization feature, default is true
        :param stop_words: choose if need to remove stop words, default is true
        :param model: accepted for compatibility but not read; the model
            that is already loaded or trained is the one updated
        :return:
        """
        data = load_data(path + '/output/data.csv')
        # pd.concat returns a new frame; the result used to be thrown away,
        # so the stored data never grew. ignore_index keeps row positions
        # and labels aligned with the corpus positions.
        self.original_data = pd.concat([self.original_data, data],
                                       axis=0,
                                       ignore_index=True)
        text = list(data['content'])
        self.text.extend(text)

        print('tokenizing...')
        token = self._tokenize_words(text)
        if n_gram:
            print('phrasing...')
            token = self._phrase(token)
        if lemmatization:
            print('lemmatization...')
            token = self._lemmatization(token)
        if stop_words:
            print('remove stop words...')
            token = self._remove_stopwords(token)
        # Store only the fully processed tokens, once. Every intermediate
        # stage used to be appended as if it were a separate document.
        self.token.extend(token)

        # Only the new documents are converted. The old code rebuilt the
        # corpus from all tokens and appended it to the existing one.
        corpus = [self.id2word.doc2bow(doc) for doc in token]
        self.corpus.extend(corpus)
        if self.model_name == 'hdp':
            # gensim's HdpModel.update takes the corpus only.
            self.model.update(corpus=corpus)
        else:
            self.model.update(corpus=corpus, iterations=iterations)

    def print_topics(self, num_topics=-1, num_words=10):
        """
        :param num_topics:(int, optional) – The number of topics to be selected
        :param num_words:(int, optional) – The number of words to be included per topics
        :return: list of (int, list of (str, float))
        """
        if num_topics == -1:
            num_topics = self.num_topics
        pprint.pprint(
            self.model.print_topics(num_topics=num_topics,
                                    num_words=num_words))
        return self.model.print_topics(num_topics=num_topics,
                                       num_words=num_words)

    def score(self):
        """
        Print the ``c_v`` coherence score of the model.

        The score is computed by ``CoherenceModel`` from the model, the
        stored tokens, the corpus and the dictionary.
        """
        evaluator = CoherenceModel(model=self.model,
                                   texts=self.token,
                                   corpus=self.corpus,
                                   dictionary=self.id2word,
                                   coherence='c_v')
        print('\nCoherence Score: ', evaluator.get_coherence())

    def vis(self):
        """
        Show the pyLDAvis visualization of the model.

        The view is prepared from the model, the corpus and the dictionary,
        then handed to ``pyLDAvis.show``.
        """
        prepared = pyLDAvis.gensim.prepare(self.model, self.corpus,
                                           self.id2word)
        pyLDAvis.show(prepared)

    def consine(self, v1, v2):
        """Return the cosine similarity of the vectors ``v1`` and ``v2``."""
        norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
        return np.dot(v1, v2) / norm_product

    def similarity(self):
        topic_vector = self.model.get_topics()
        num_topics = topic_vector.shape[0]
        consine_matrix = np.diag(np.ones(num_topics))
        consine_list = []
        for i in range(num_topics - 1):
            for j in range(i + 1, num_topics):
                consine_matrix[i][j] = self.consine(topic_vector[i],
                                                    topic_vector[j])
                consine_matrix[j][i] = consine_matrix[i][j]
                consine_list.append(consine_matrix[i][j])
        average = np.average(consine_list)
        return average, consine_matrix

    def to_gephi(self):
        _, cosine_matrix = self.similarity()
        edge = []
        for i in range(self.num_topics - 1):
            for j in range(i + 1, self.num_topics):
                edge.append([
                    'topic {}'.format(i), 'topic {}'.format(j),
                    cosine_matrix[i][j]
                ])

        for i in range(self.doc_topic.shape[0]):
            edge.append([
                'topic {}'.format(self.doc_topic.ix[i]['topic']),
                self.doc_topic.ix[i]['doc_id'],
                self.doc_topic.ix[i]['probability']
            ])
        # edge = []
        # node = []
        # topic_vector = self.model.get_topics()

        #decomposition

    #     pca = PCA(n_components=1000)
    #     topic_vector = pca.fit_transform(topic_vector)
    #     print(len(topic_vector[0]))
    #     for i in range(len(topic_vector)):
    #         for j in range(len(topic_vector[i])):
    #             edge.append(['topic {}'.format(i),j,topic_vector[i][j]])
    #         node.append(['topic {}'.format(i),'topic {}'.format(i)])
    #
    #     return node,edge
    #
    def to_neo4j(self):
        output = []
        for i in range(self.num_topics):
            output.append('CREATE(:Topic{id:"topic %d"})' % i)
            for word, pro in self.model.show_topic(i):
                output.append(
                    'MATCH (t:Topic) where t.id = "topic %d" CREATE t-[:Include{probability:%f}]-> (:Word{word:"%s"})'
                    % (i, pro, word))

        for i in range(len(self.original_data)):
            output.append('CREATE(:Document{id:%d})' %
                          (self.original_data.ix[i]['id']))

        for i in range(len(self.doc_topic)):
            output.append(
                'MATCH (t:Topic),(d:Document) WHERE t.id = "topic %d" and d.id = %d CREATE t-[:Include{probability:%f}]->d'
                %
                (self.doc_topic.ix[i]['topic'], self.doc_topic.ix[i]['doc_id'],
                 self.doc_topic.ix[i]['probability']))

        return output

Esempio n. 7

Mostra file

File: TCmodel.py Progetto: n1ck404/Topic-Cluster

class TcModel:
    """
    Using gensim LDA model to implement the topic cluster
    """

    def __init__(self):
        """Create an empty, untrained holder with default settings."""
        # Model configuration; ``train`` overwrites these values.
        self.model_name = ''
        self.num_topics = 10
        self.iterations = 100
        self.model = None

        # Data containers, all empty until data is trained on or loaded.
        self.original_data = []
        self.text = []
        self.token = []
        self.corpus = []
        self.id2word = []

        # Stop-word list for 'english' plus extra tokens that should be ignored.
        extra_stop_words = ['be', 'say', '-PRON-', 'ms', 'Mr', 'Ms', 'mr', 'year', 'cent', 'per', 'www', 'http',
                            'com']
        self.stop_words = stopwords.words('english')
        self.stop_words.extend(extra_stop_words)

    def _phrase(self, token):
        """Fit a bigram phrase model on ``token`` and apply it to every document.

        :param token: collection of tokenised documents; it is iterated twice
                      (once to fit, once to transform).
        :return: list with one transformed document per input document.
        """
        phrase_model = Phraser(Phrases(token, min_count=5, threshold=100))
        merged_docs = []
        for doc in token:
            merged_docs.append(phrase_model[doc])
        return merged_docs

    def _tokenize_words(self, text):
        """Tokenise every entry of ``text`` with ``gensim.utils.simple_preprocess``.

        :param text: indexable collection of raw strings, read as
                     ``text[0]`` .. ``text[len(text) - 1]``.
        :return: list of token lists in input order (``deacc=True`` is passed
                 to the tokenizer).
        """
        return [
            gensim.utils.simple_preprocess(text[position], deacc=True)
            for position in range(len(text))
        ]

    def _preprocess(self, doc, lemma=True, stop_words=True):
        """Clean each document and reduce it to named entities plus content words.

        For every string in ``doc`` the characters matched by the two patterns
        below are replaced with spaces, the text is run through the spaCy
        ``'en'`` pipeline with the parser disabled, and the result is the list
        of entity texts whose label is in ``allow_NER`` followed by one entry
        for every non-entity token whose part of speech is in ``allow_POS``.

        :param doc: iterable of raw document strings.
        :param lemma: when true emit ``token.lemma_``; otherwise emit the
                      surface form ``token.text`` (previously the spaCy token
                      object itself was returned, which is not a string).
        :param stop_words: when true drop tokens whose lemma is in
                           ``self.stop_words``.
        :return: list of lists of str, one inner list per input document.
        """
        nlp = spacy.load('en')
        allow_NER = {"NORP", "FAC", "ORG", "GPE", "LOC", "PERSON", "PRODUCT", "LANGUAGE", "EVENT"}
        allow_POS = {"ADJ", "NOUN", "VERB"}

        # Raw strings keep the patterns identical while avoiding invalid-escape warnings;
        # compiling once hoists the work out of the per-document loop.
        punctuation = re.compile(r"""[\!\/_,%^*(+"')]+|[+——()?【】'’“”！，。？、~@#￥%……&*（）]+""")
        # Matches a single whitespace character or a literal '+'.
        spacing = re.compile(r"[\s+]")

        # A set gives constant-time membership tests; empty when filtering is off.
        ignored_lemmas = set(self.stop_words) if stop_words else set()

        return_text = []
        for raw in doc:
            cleaned = spacing.sub(" ", punctuation.sub(" ", raw))
            sentence = nlp(cleaned, disable=['parser'])
            terms = [ent.text for ent in sentence.ents if ent.label_ in allow_NER]
            for token in sentence:
                # Tokens inside an entity are already covered by the entity text above.
                if token.ent_type_ != '' or token.pos_ not in allow_POS:
                    continue
                if token.lemma_ in ignored_lemmas:
                    continue
                terms.append(token.lemma_ if lemma else token.text)
            return_text.append(terms)
        return return_text

    def find_most_common(self, token, plot=False):
        word_list = []
        extra_stopwords = []
        for i in token:
            word_list.extend(i)
        word_dic = collections.Counter(word_list)
        # print(word_dic.most_common(100))
        tf = list(word_dic.values())
        tf.sort(reverse=True)
        if plot == True:
            print(tf[:100])
            plt.plot(range(500), tf[:500])
            plt.xlabel('word sequence')
            plt.ylabel('Term Frequency')
            plt.show()

        m_list = []
        for i in range(len(tf) - 1):
            m_list.append(tf[i] - tf[i + 1])
        k = tf[m_list.index(max(m_list))]
        print(k)
        k = 5000
        for i in word_dic:
            if word_dic[i] > k:
                extra_stopwords.append(i)

        print(extra_stopwords)
        return extra_stopwords

    def _doc_topic(self):
        """
        Matrix = [doc_id,title,topic,probability,summary,content]
        """
        matrix = []
        for num in range(len(self.corpus)):
            row = self.model[self.corpus[num]]
            row = sorted(row, key=lambda x: x[1], reverse=True)
            for i, j in row:
                if float(j) < 0.05:
                    continue
                value = [self.original_data.ix[num]['id'], self.original_data.ix[num]['title'], i, j,
                         self.original_data.ix[num]['summary'], self.original_data.ix[num]['content']]
                if value not in matrix:
                    matrix.append(value)

        matrix = pd.DataFrame(matrix, columns=['doc_id', 'title', 'topic', 'probability', 'summary', 'content'])
        self.doc_topic = matrix
        print(matrix)
        return matrix

    def _topic_doc(self):
        matrix = []
        for i in range(self.num_topics):
            doc_list = [i for i in list(
                self.doc_topic[self.doc_topic.topic == i].sort_values(by='probability', ascending=False)['doc_id'])]
            if doc_list == []:
                self.num_topics = i
                break
            output = ",".join([str(i) for i in doc_list])
            print('topic {}: {}'.format(i, output))
            matrix.append([i, output])
        return matrix

    def _readable_topic(self, sent_num=5):
        """Pick, for every topic, the sentences that score highest against its terms.

        For each topic the ``content`` of its (at most) ten most probable,
        de-duplicated documents in ``self.doc_topic`` is joined and split into
        sentences. A sentence's score is the sum, over its words that appear
        among the topic's terms (``show_topic(..., topn=1000)``), of
        ``count * term weight``.

        Sentences are selected by index, so sentences with equal scores are all
        reported; looking the score up with ``list.index`` returned the first
        matching sentence again for every tie.

        :param sent_num: maximum number of sentences reported per topic.
        :return: list of ``[topic_id, text]`` where ``text`` holds one
                 ``"sentence <rank>: <sentence>\\n"`` entry per selected sentence.
        """
        output = []
        for i in range(self.num_topics):
            topic_term = dict(self.model.show_topic(i, topn=1000))

            top_docs = (self.doc_topic[self.doc_topic['topic'] == i]
                        .drop_duplicates('doc_id')
                        .sort_values('probability', ascending=False)[:10])
            content = sent_tokenize(' '.join(list(top_docs['content'])))

            score_list = []
            for sentence in content:
                words = gensim.utils.simple_preprocess(sentence, deacc=True)
                score = 0
                for word_id, num in self.model.id2word.doc2bow(words):
                    weight = topic_term.get(self.model.id2word.get(word_id))
                    if weight is not None:
                        score += num * weight
                score_list.append(score)

            # Rank sentence positions rather than scores so ties map to distinct sentences.
            best = heapq.nlargest(sent_num, range(len(score_list)), key=score_list.__getitem__)
            sent = ''
            for rank, position in enumerate(best, start=1):
                print('topic {}: {}'.format(i, content[position]))
                sent = sent + 'sentence {}: {}\n'.format(rank, content[position])
            output.append([i, sent])
        return output

    def _topic_key(self):
        output = []
        for i in range(self.num_topics):
            output.append([i, ','.join([item[0] for item in self.model.show_topic(i, topn=30)])])
        print(output)
        return output

    def train(self, path, num_topics=20, iterations=1000, n_gram=True, lemmatization=True, stop_words=True, tfidf=True,
              model='lda'):
        """
        Trian the topic cluster model.
        Input value: data: pd.DataFrame format ['id','title','content','summary']
                     num_topics: (int) the number of topics
                     iterations: (int) total number of iteration times
        example:
        >>> lda = LDA_Model
        >>> lda.train(text)
        """
        data = load_data(str(path + '/output/data.csv'))
        self.original_data = data
        self.text = list(data['content'])
        self.num_topics = num_topics
        self.iterations = iterations
        self.model_name = model

        print('preprocessing...')
        self.token = self._preprocess(self.text,lemma = lemmatization, stop_words = stop_words)

        self.id2word = Dictionary(self.token)
        self.corpus = [self.id2word.doc2bow(text) for text in self.token]
        if tfidf == True:
            print('calculate tfidf...')
            tfidf_model = TfidfModel(self.corpus)
            self.corpus = tfidf_model[self.corpus]

        if model == 'lda':
            self.model = LdaModel(corpus=self.corpus, id2word=self.id2word, num_topics=self.num_topics,
                                  iterations=self.iterations)
        if model == 'lsi':
            self.model = LsiModel(corpus=self.corpus, id2word=self.id2word, num_topics=self.num_topics)
        if model == 'hdp':
            self.model = HdpModel(corpus=self.corpus, id2word=self.id2word)
            self.num_topics = self.model.get_topics().shape[0]

        self.topic_key = pd.DataFrame(self._topic_key(), columns=['topic_id', 'key_words'])
        self.doc_topic = self._doc_topic()
        self.topic_doc = pd.DataFrame(self._topic_doc(), columns=['topic_id', 'document_id'])
        self.topic_sent = pd.DataFrame(self._readable_topic(), columns=['topic_id', 'most relative sentence'])

    def save(self, path='default'):
        """Persist the model, its data and all derived artefacts under ``path``.

        Written to ``path``: ``<model_name>.model`` (for 'lda', 'lsi' or 'hdp'),
        ``original_data.pickle``, ``text.pickle``, ``token.pickle`` and
        ``corpus.pickle``, plus whatever ``to_wordcloud`` and ``to_neo4j``
        produce. Written to ``path/result``: the files of ``save_result``,
        ``topic_similarity.jpg`` and ``cosine_matrix.csv``.

        :param path: target directory; ``'default'`` means ``'model'``. The
                     directory (including parents) is created when missing, and
                     real creation errors are no longer silently ignored.
        """
        if path == 'default':
            path = 'model'
        os.makedirs(path, exist_ok=True)

        if self.model_name in ('lda', 'lsi', 'hdp'):
            self.model.save(path + '/' + self.model_name + '.model')

        # ``with`` closes each file even if pickling fails part-way.
        for name in ('original_data', 'text', 'token', 'corpus'):
            with open(path + '/' + name + '.pickle', 'wb') as f:
                pickle.dump(getattr(self, name), f)

        self.to_wordcloud(path)
        self.to_neo4j(path)

        path = path + '/result'
        self.save_result(path)

        _, cosine_matrix = self.similarity()
        sns.set()
        cosine_matrix = pd.DataFrame(cosine_matrix)
        # Label every row/column of the matrix itself: its size comes from the model's
        # topics and can differ from ``self.num_topics`` after ``_topic_doc`` shrinks it,
        # in which case assigning a shorter label list would fail.
        col = ['topic{}'.format(i) for i in range(cosine_matrix.shape[0])]
        cosine_matrix.columns = col
        cosine_matrix.index = col
        sns.heatmap(cosine_matrix, cmap='YlGnBu')
        plt.savefig(path + '/topic_similarity.jpg')
        cosine_matrix.to_csv(path + '/cosine_matrix.csv')


    def save_result(self, path='default'):
        if path == 'default':
            path = 'model/result'
            try:
                os.mkdir(path)
            except:
                pass

        else:
            try:
                os.mkdir(path)
            except:
                pass

        # topic_key = pd.DataFrame(self.print_topics(num_topics=self.num_topics,num_words=10),columns=['topic id','key words'])
        # topic_key.to_csv(str(path+'/topic_key.csv'),index=False)
        # doc_topic = self._doc_topic()
        # doc_topic.to_csv(str(path+'/doc_topic.csv'))
        # topic_doc = pd.DataFrame(self._topic_doc(),columns=['topic id','document id'])
        # topic_doc.to_csv(str(path+'/topic_doc.csv'),index=False)
        # topic_sent = pd.DataFrame(self._readable_topic(),columns=['topic id','most relative sentence'])
        # topic_sent.to_csv(str(path+'/topic_sent.csv'),index=False)

        f = open(str(path + '/topic_key.pickle'), 'wb')
        pickle.dump(self.topic_key, f)
        f.close()

        f = open(str(path + '/doc_topic.pickle'), 'wb')
        pickle.dump(self.doc_topic, f)
        f.close()

        f = open(str(path + '/topic_doc.pickle'), 'wb')
        pickle.dump(self.topic_doc, f)
        f.close()

        f = open(str(path + '/topic_sent.pickle'), 'wb')
        pickle.dump(self.topic_sent, f)
        f.close()

    def load(self, path='default'):
        """
        :param path: the path of trained model.
        :return:
        """
        if path == 'default':
            path = 'model'
        file_list = os.listdir(path)
        for file in file_list:
            if file.endswith('.model'):
                self.model_name = file.split('.')[0]
        if self.model_name == 'lda':
            self.model = LdaModel.load(str(path + '/lda.model'))
        if self.model_name == 'lsi':
            self.model = LsiModel.load(str(path + '/lsi.model'))
        if self.model_name == 'hdp':
            self.model = HdpModel.load(str(path + '/hdp.model'))

        self.id2word = self.model.id2word
        if self.model_name == 'hdp':
            self.num_topics = self.model.get_topics().shape[0]
        else:
            self.num_topics = self.model.num_topics
        # self.iterations = self.model.iterations

        f = open(str(path + '/original_data.pickle'), 'rb')
        self.original_data = pickle.load(f)
        f.close()
        f = open(str(path + '/text.pickle'), 'rb')
        self.text = pickle.load(f)
        f.close()
        f = open(str(path + '/token.pickle'), 'rb')
        self.token = pickle.load(f)
        f.close()
        f = open(str(path + '/corpus.pickle'), 'rb')
        self.corpus = pickle.load(f)
        f.close()

        path = path + '/result'
        f = open(str(path + '/topic_key.pickle'), 'rb')
        self.topic_key = pickle.load(f)
        f.close()

        f = open(str(path + '/doc_topic.pickle'), 'rb')
        self.doc_topic = pickle.load(f)
        f.close()

        f = open(str(path + '/topic_doc.pickle'), 'rb')
        self.topic_doc = pickle.load(f)
        f.close()

        f = open(str(path + '/topic_sent.pickle'), 'rb')
        self.topic_sent = pickle.load(f)
        f.close()

        self.id2word = self.model.id2word
        if self.model_name == 'hdp':
            self.num_topics = self.topic_doc.shape[0]
        else:
            self.num_topics = self.model.num_topics

    def update(self, path, iterations=100, n_gram=True, lemmatization=True, stop_words=True, model='lda'):
        """
        Extend the trained model with the documents in ``<path>/output/data.csv``.

        Only the newly loaded documents are tokenised and passed to the model.
        Previously the whole text collection was re-tokenised and appended to
        ``self.corpus`` again, duplicating the older documents, and the result
        of ``pd.concat`` was discarded so ``self.original_data`` never grew.

        :param path: The path of training file
        :param iterations: Only for lda model
        :param n_gram: accepted for compatibility; not used by this method
        :param lemmatization: choose if use lemmatization feature, default is true
        :param stop_words: choose if need to remove stop words, default is true
        :param model: accepted for compatibility; not used by this method
        :return:

        NOTE(review): ``self.corpus.extend`` needs a list corpus; after
        ``train(tfidf=True)`` the corpus is a transformed view instead. The
        result tables (``doc_topic`` etc.) are not recomputed here. Confirm
        both against the intended workflow.
        """
        data = load_data(path + '/output/data.csv')
        # ignore_index keeps row labels equal to row positions after appending.
        self.original_data = pd.concat([self.original_data, data], axis=0, ignore_index=True)
        text = list(data['content'])
        self.text.extend(text)

        print('preprocessing...')
        new_token = self._preprocess(text, lemma=lemmatization, stop_words=stop_words)
        self.token.extend(new_token)

        corpus = [self.id2word.doc2bow(doc) for doc in new_token]
        self.corpus.extend(corpus)
        self.model.update(corpus=corpus, iterations=iterations)

    def print_topics(self, num_topics=-1, num_words=10):
        """
        :param num_topics:(int, optional) – The number of topics to be selected
        :param num_words:(int, optional) – The number of words to be included per topics
        :return: list of (int, list of (str, float))
        """
        if num_topics == -1:
            num_topics = self.num_topics
        pprint.pprint(self.model.print_topics(num_topics=num_topics, num_words=num_words))
        return self.model.print_topics(num_topics=num_topics, num_words=num_words)

    def score(self):
        """
        Compute the ``c_v`` coherence of the current model and print it.

        The score is built from the model, the tokenised texts, the corpus and
        the dictionary held on this instance. Nothing is returned.
        """
        coherence = CoherenceModel(
            model=self.model,
            texts=self.token,
            corpus=self.corpus,
            dictionary=self.id2word,
            coherence='c_v',
        ).get_coherence()
        print('\nCoherence Score: ', coherence)

    def vis(self):
        """
        Prepare a pyLDAvis view of the model, corpus and dictionary and show it.
        """
        prepared = pyLDAvis.gensim.prepare(self.model, self.corpus, self.id2word)
        pyLDAvis.show(prepared)

    def consine(self, v1, v2):
        cosine = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
        return cosine

    def similarity(self):
        topic_vector = self.model.get_topics()
        num_topics = topic_vector.shape[0]
        consine_matrix = np.diag(np.ones(num_topics))
        consine_list = []
        for i in range(num_topics - 1):
            for j in range(i + 1, num_topics):
                consine_matrix[i][j] = self.consine(topic_vector[i], topic_vector[j])
                consine_matrix[j][i] = consine_matrix[i][j]
                consine_list.append(consine_matrix[i][j])
        average = np.average(consine_list)
        return average, consine_matrix

    def to_gephi(self):
        _, cosine_matrix = self.similarity()
        edge = []
        for i in range(self.num_topics - 1):
            for j in range(i + 1, self.num_topics):
                edge.append(['topic {}'.format(i), 'topic {}'.format(j), cosine_matrix[i][j]])

        for i in range(self.doc_topic.shape[0]):
            edge.append(['topic {}'.format(self.doc_topic.ix[i]['topic']), self.doc_topic.ix[i]['doc_id'],
                         self.doc_topic.ix[i][
                             'probability']])
        return edge

    def to_wordcloud(self,path):
        try:
            os.mkdir(path + '/wordcloud')
        except:
            pass
        path = path + '/wordcloud'
        cont = []
        for i in range(self.num_topics):
            key_word = dict(self.model.show_topic(i,topn=1000))
            #cont = " ".join([word * int(value*10000) for word,value in key_word])
            #cont = ",".join([(word + ",") * int(value*10000) for word,value in key_word])
            wordcloud = WordCloud(max_words=300, background_color="white",height=600,width=800).generate_from_frequencies(key_word)
            wordcloud.to_file(path+"/topic{}.png".format(i))

    def to_neo4j(self, path):
        try:
            os.mkdir(path + '/database')
        except:
            pass

        path = path + '/database'
        self.original_data.to_csv(path + '/document.csv', index=False)
        topic = []
        relationship = []
        words = []
        for i in range(self.num_topics):
            topic.append(['topic {}'.format(i)])
            for word, pro in self.model.show_topic(i):
                words.append([word])
                relationship.append(['topic {}'.format(i), pro, word])

        topic = pd.DataFrame(topic)
        topic.columns = ['id']
        topic.to_csv(path + '/topic.csv', index=False)
        words = pd.DataFrame(words)
        words.columns = ['word']
        words.to_csv(path + '/words.csv', index=False)

        for i in range(len(self.doc_topic)):
            relationship.append(['topic {}'.format(self.doc_topic.ix[i]['topic']), self.doc_topic.ix[i]['probability'],
                                 self.doc_topic.ix[i]['doc_id']])

        _, consine_matrix = self.similarity()
        for i in range(self.num_topics - 1):
            for j in range(i + 1, self.num_topics):
                relationship.append(['topic %d' % i, consine_matrix[i][j], 'topic %d' % j])

        relationship = pd.DataFrame(relationship)
        relationship.columns = ['source', 'probability', 'target']
        relationship.to_csv(path + '/relationship.csv', index=False)

        f = open(path + '/script.txt', 'w')
        f.write(
            'load csv with headers from "file:///document.csv" as line \nmerge (d:Document{id:toInteger(line.id),title:line.title,summary:line.title,content:line.content})\n\n')
        f.write('load csv with headers from "file:///topic.csv" as line\nmerge (t:Topic{id:line.id})\n\n')
        f.write('load csv with headers from "file:///words.csv" as line\nmerge (w:Word{id:line.word})\n\n')
        f.write(
            'load csv with headers from "file:///relationship.csv" as line\nmatch (from:Topic{id:line.source}),(to:Word{id:line.target})\nmerge (from)-[r:Key_word{probability:line.probability}]->(to)\n\n')
        f.write(
            'load csv with headers from "file:///relationship.csv" as line\nmatch (from:Topic{id:line.source}),(to:Document{id:toInteger(line.target)})\nmerge (from)-[r:Include{probability:line.probability}]->(to)\n\n')
        f.write(
            'load csv with headers from "file:///relationship.csv" as line\nmatch (from:Topic{id:line.source}),(to:Topic{id:line.target})\nmerge (from)<-[r:Similarity{probability:line.probability}]->(to)\n\n')
        f.close()

Esempio n. 8

Mostra file

# Build the HDP model from the initial corpus; the remaining batches are fed in below.
hdp = HdpModel(start_corpus, id2word=common_dictionary)
print("Created.")

print("Iterating through docs...")
lastTime = time.time()
# Batch numbers are 1-based here and the loop starts at batch 2
# (presumably batch 1 produced start_corpus; confirm against the code above).
for batchIndex in range(2, len(batches) + 1):
    batch = batches[batchIndex - 1]

    # Every file of the batch holds one pickled document.
    # NOTE(review): pickle.load can run arbitrary code; only read trusted dumps.
    docs = []
    for filename in batch:
        with open(docDir + filename, 'rb') as fp:
            doc = pickle.load(fp)
        docs.append(doc)

    other_corpus = [common_dictionary.doc2bow(doc) for doc in docs]
    print("length of other_corpus: {}".format(len(other_corpus)))

    hdp.update(other_corpus)
    print("m_num: {}".format(hdp.m_num_docs_processed))

    # Running average time per processed batch, scaled by the batches still counted as open.
    batchTime = time.time() - lastTime
    timeElapsed += batchTime
    batchesDone = batchIndex - 1
    ETA = (timeElapsed / batchesDone) * (len(batches) - batchesDone)
    etaHours = int(ETA / 3600)
    etaMinutes = int((ETA % 3600) / 60)
    etaSeconds = int(ETA % 60)
    ETAstring = "{}:{}:{}".format(etaHours, etaMinutes, etaSeconds)

    print("Batch {} of {} | Batch time: {:.3} | ETA: {}".format(batchIndex, len(batches), batchTime, ETAstring))
    lastTime = time.time()

hdp.save(args.modelName)

Esempio n. 9

Mostra file

File: hdp_model.py Progetto: rbshaffer/Constitution_Similarity

__author__ = 'rbshaffer'

from gensim.models import HdpModel
from gensim.corpora import BleiCorpus
from gensim.corpora import Dictionary

# Input and output locations for this run.
corpus_path = '/home/rbshaffer/PycharmProjects/Constitution_Similarity/const_corpus_07242015.lda-c'
vocab_path = '/home/rbshaffer/PycharmProjects/Constitution_Similarity/const_corpus_07242015.lda-c.vocab'
dictionary_path = '/home/rbshaffer/PycharmProjects/Constitution_Similarity/const_dic_07242015.lda-c.dic'
model_path = '/home/rbshaffer/Desktop/hdp_output_0726015.pydata'

# Load the corpus and its dictionary, train the HDP model and store it on disk.
corpus = BleiCorpus(fname=corpus_path, fname_vocab=vocab_path)
dictionary = Dictionary.load(dictionary_path)
# 28800 = 8 * 3600; presumably a training time limit in seconds (see the gensim HdpModel docs).
hdp_model = HdpModel(corpus=corpus, id2word=dictionary, max_time=28800)
hdp_model.save(model_path)

Esempio n. 10

Mostra file

    if count % 100000 == 0:
        print(count, end=' ')

# Keep the tokenised texts on disk so later runs can reuse them.
with open("texts.txt", "wb") as fp:  # Pickling
    pickle.dump(texts, fp)
# The message now names the file that is actually written (it said texts.csv).
print('texts.txt created')

bigram = gensim.models.Phrases(texts)

# Build and persist the dictionary and the bag-of-words corpus.
dictionary = Dictionary(texts)
dictionary.save("hdp_dictionary.dict")
print("Dictionary saved as hdp_dictionary.dict")
corpus = [dictionary.doc2bow(text) for text in texts]
MmCorpus.serialize('hdp_corpus.mm', corpus)
print('Corpus saved as hdp_corpus.mm')

# Train the HDP model and store it.
hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)

hdpmodel.save('hdp_model_spacy.gensim')
print('hdp model created')

# Keep only the words of each topic returned by show_topics.
hdptopics = [[word for word, prob in topic]
             for topicid, topic in hdpmodel.show_topics(formatted=False)]

# Coherence is computed over the first ten topics only.
hdp_coherence = CoherenceModel(topics=hdptopics[:10],
                               texts=texts,
                               dictionary=dictionary,
                               window_size=10).get_coherence()

print(f"The topic coherence is {hdp_coherence}")

Esempio n. 11

Mostra file

File: topic_modeling.py Progetto: lbyiuou0329/MIT-NLP-project

    # Build the TF-IDF weighted bag of words over 1- to 3-grams of the stemmed tweets.
    print("Creating TFIDF BoW...")
    bow, features = create_bag_of_words(df['tweet_text_stemmed'],
                                        ngram_range=(1, 3),
                                        use_idf=True,
                                        min_df=MIN_DF)
    print('Done: %s features' % len(features))

    print('Training HDP model...')
    # DeprecationWarnings raised while the model is built are suppressed.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        hdp = HdpModel(Sparse2Corpus(bow, documents_columns=False),
                       Dictionary([features]))
    topic_dists = get_topic_distributions(
        hdp, Sparse2Corpus(bow, documents_columns=False),
        df['tweet_id'].values)
    print('Done')

    # Persist the model, the feature names (one per line) and the per-tweet topic table.
    print('Saving...')
    hdp.save(MODEL_PATH + date + '_topics.model')
    np.savetxt(MODEL_PATH + date + '_features.txt',
               features,
               fmt='%s',
               delimiter='\n',
               encoding="utf-8")
    topic_dists.to_csv(ASSIGNED_PATH + date + DATA_SUFFIX,
                       sep=SEP,
                       index=False)
    print('Done')

    # Drop the large objects. The name assigned above is ``topic_dists``;
    # ``topic_dist`` is never assigned in this fragment and would raise NameError.
    del hdp, topic_dists, df, bow, features

Esempio n. 12

Mostra file

File: topics_analysis.py Progetto: ochachacha/contextualLSTM

def topic_analysis(corpus, dictionary, models_path, technique):
    """Train the topic model(s) selected by ``technique`` and save them under ``models_path``.

    Every model trained in one call is saved with the same random UUID suffix:
    ``hdp_<uuid>``, ``lda_parallel_<uuid>``, ``lsa_<uuid>``,
    ``lda_online_<uuid>`` and ``lda_offline_<uuid>``. The training time of each
    model is printed.

    :param corpus: corpus passed unchanged to every model constructor.
    :param dictionary: passed to every model as ``id2word``.
    :param models_path: directory prefix used to build the output file names.
    :param technique: ``"all"`` or one of ``"hdp"``, ``"ldap"``, ``"lsa"``,
                      ``"ldao"``, ``"lda"``; other values train nothing in the
                      branches visible here.
    """

    import uuid
    # NOTE: this rebinds the local name ``uuid`` from the module to the generated id string.
    uuid = str(uuid.uuid4())
    print("[BLOCK] Starting models for context")
    sys.stdout.flush()

    if technique == "all" or technique == "hdp":
        t1 = time()
        # HDP model
        model = HdpModel(corpus, id2word=dictionary)
        model.save("%s/hdp_%s" % (models_path, uuid))
        # Release the model before the next one is trained.
        del model
        t2 = time()
        print("[BLOCK] Training time for HDP model: %s" % (round(t2 - t1, 2)))
        sys.stdout.flush()

    if technique == "all" or technique == "ldap":
        t1 = time()
        # Parallel LDA model
        model = LdaMulticore(corpus,
                             id2word=dictionary,
                             num_topics=100,
                             workers=23,
                             passes=20)
        model.save("%s/lda_parallel_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for LDA multicore: %s" %
              (round(t2 - t1, 2)))
    # This flush sits outside the branch above, so it runs on every call.
    sys.stdout.flush()

    if technique == "all" or technique == "lsa":
        t1 = time()
        # LSA model
        model = LsiModel(corpus, id2word=dictionary, num_topics=400)
        model.save("%s/lsa_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for LSA: %s" % (round(t2 - t1, 2)))
        sys.stdout.flush()

    if technique == "all" or technique == "ldao":
        t1 = time()
        # Online LDA model
        model = LdaModel(corpus,
                         id2word=dictionary,
                         num_topics=100,
                         update_every=1,
                         chunksize=10000,
                         passes=5)
        model.save("%s/lda_online_%s" % (models_path, uuid))
        # NOTE(review): unlike the other branches, the model is not deleted here.
        t2 = time()
        print("[BLOCK] Training time for LDA online: %s" % (round(t2 - t1, 2)))
        sys.stdout.flush()

    if technique == "all" or technique == "lda":
        t1 = time()
        # Offline LDA model
        model = LdaModel(corpus,
                         id2word=dictionary,
                         num_topics=100,
                         update_every=0,
                         passes=20)
        model.save("%s/lda_offline_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for LDA offline: %s" %
              (round(t2 - t1, 2)))
        sys.stdout.flush()