Example no. 1
    def model(self,
              topic_num_best: int = None,
              topic_num_list: List[int] = range(2, 22, 2)):
        docs = self.docs
        time_slice = self.time_slice

        pkuseg = PKUSegment()

        docs_segmented = list()
        word_segment_list = list()
        tag_segment_list = list()
        time_slice_segmented = list()

        time_doc_count_accumulate = 0
        for time_doc_count in time_slice:
            doc_list_part, word_segment_list_part, tag_segment_list_part = pkuseg.segment_docs(
                docs[time_doc_count_accumulate: time_doc_count_accumulate + time_doc_count],
                include_tag_list=['a', 'ad', 'j', 'l', 'n', 'ns', 'nt', 'nz', 'v', 'vd', 'vn'],
                min_length=2
            )
            docs_segmented.extend(doc_list_part)
            word_segment_list.extend(word_segment_list_part)
            tag_segment_list.extend(tag_segment_list_part)
            time_slice_segmented.append(len(word_segment_list_part))

            time_doc_count_accumulate += time_doc_count

        dictionary, corpus = word_segment_list_to_dictionary_corpus(word_segment_list)
        self.dictionary = dictionary
        self.corpus = corpus
        self.word_segment_list = word_segment_list
        self.tag_segment_list = tag_segment_list
        self.docs = docs_segmented

        lda_model = LdaModelSLab('中共', docs_segmented)
        lda_model.word_segment_list = word_segment_list
        lda_model.corpus = corpus
        lda_model.dictionary = dictionary

        # Determine the optimal number of topics
        if topic_num_best is None:
            coherence_list, coherence_best, model_best, topic_num_best = lda_model.select_best_topic_num(topic_num_list)

        # Train the model
        self.dtm_model = DtmModel('dtm-win64.exe', corpus, time_slice_segmented, num_topics=topic_num_best,
                                  id2word=dictionary, initialize_lda=True,
                                  lda_sequence_min_iter=30, lda_sequence_max_iter=100,
                                  lda_max_em_iter=50
                                  )

        # Get each document's dominant topic
        self.topic_index_list = np.argmax(self.dtm_model.gamma_, axis=1)

        self.topic_num = topic_num_best

        df = pd.DataFrame({'doc': docs_segmented, 'topic': self.topic_index_list})
        self.df = df
        return df
Example no. 2
def dtm_draw_topic(dtm_model: DtmModel, topic_index, time_num=None, topn=10):
    # Automatically determine the number of time slices by probing the model
    if time_num is None:
        time_num = 0
        while True:
            try:
                dtm_model.show_topic(topic_index, time_num, topn)
                time_num += 1
            except Exception:
                # show_topic raises once time_num runs past the last slice
                break

    x = range(time_num)

    # Collect the top keywords across all time slices
    word_set = set()
    for time_index in range(time_num):
        for prob, word in dtm_model.show_topic(topic_index, time_index, topn):
            word_set.add(word)
    word_stat = {word: [] for word in word_set}

    # For each time slice, look up each keyword's probability
    max_prob = 0

    for time_index in range(time_num):
        try:
            word_dict = {
                word: prob
                for prob, word in dtm_model.show_topic(topic_index, time_index,
                                                       topn)
            }
        except Exception:
            break
        for word in word_set:
            if word in word_dict:
                word_stat[word].append(word_dict[word])
                if word_dict[word] > max_prob:
                    max_prob = word_dict[word]
            else:
                word_stat[word].append(0)

    # Plot
    subplot_num = len(word_stat)
    subplot_col = 4
    subplot_row = math.ceil(float(subplot_num) / subplot_col)
    plt.figure(figsize=(4 * subplot_col, 4 * subplot_row))
    for word_index, (word, prob_list) in enumerate(word_stat.items()):
        plt.subplot(subplot_row, subplot_col, word_index + 1)
        plt.plot(x, prob_list, label=word)
        plt.ylim(0, max_prob)
        plt.legend()
Example no. 3
def dtm(dtm_path, corpus, dictionary, time_slices, num_topics=40, load=False):
    # dtm_path should point to your local binary of Blei's DTM
    print("Running DTM")
    if load:
        return DtmModel.load('DTM')
    model = DtmModel(dtm_path,
                     corpus,
                     time_slices,
                     num_topics=num_topics,
                     id2word=dictionary,
                     initialize_lda=True)
    model.save("DTM")
    return model
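
A minimal usage sketch for the function above, assuming `corpus`, `dictionary`, and `time_slices` were prepared with gensim and that the binary sits at a hypothetical local path:

# Hypothetical binary location; adjust to your platform.
dtm_binary = './dtm-linux64'
model = dtm(dtm_binary, corpus, dictionary, time_slices, num_topics=40)
# A later run can reload the saved model instead of retraining:
model = dtm(dtm_binary, corpus, dictionary, time_slices, load=True)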
Example no. 4
def dtm(time_query_list, topic_num):
    '''
    Train a DTM model from a list of query time ranges.
    Returns the word lists, dictionary, corpus, and trained model.

    :param time_query_list: list of time-range arguments passed to query_comment
    :param topic_num: number of topics
    :return: total word list, dictionary, corpus, DtmModel
    '''
    words_slice = []
    total_words_ls = []
    for time_query in time_query_list:
        words_ls, _, _ = segment_comment(query_comment(*time_query))

        words_slice.append(len(words_ls))
        total_words_ls.extend(words_ls)

    dictionary, corpus = dict_corpus_comment(total_words_ls)

    dtm_model = DtmModel('dtm-win64.exe',
                         corpus,
                         words_slice,
                         num_topics=topic_num,
                         id2word=dictionary,
                         initialize_lda=True,
                         lda_sequence_min_iter=30,
                         lda_sequence_max_iter=100,
                         lda_max_em_iter=50)

    return total_words_ls, dictionary, corpus, dtm_model
Example no. 5
def dtm_model(dtm_path,
              corpus=None,
              time_seq=None,
              num_topics=10,
              id2word=None,
              alpha=0.01,
              rng_seed=0,
              model='fixed'):
    """
    :param dtm_path: path to dtm wrapper, see: https://github.com/blei-lab/dtm
    :param corpus: documents in bag-of-words format
    :param time_seq: pre-defined timestamps
    :param num_topics: number of topics
    :param id2word: mapping between tokens ids and words from corpus
    :param alpha: hyperparameter of the Dirichlet distribution that affects the document-topics sparsity
    :param id2word: mapping between tokens ids and words from corpus
    :param rng_seed: random seed
    :param model: "fixed" if document influence needed, 'dtm' otherwise
    :return: dtm model trained with the available corpus
    """

    print("initializing the model...")
    model = DtmModel(dtm_path=dtm_path,
                     corpus=corpus,
                     time_slices=time_seq,
                     num_topics=num_topics,
                     id2word=id2word,
                     alpha=alpha,
                     rng_seed=rng_seed,
                     model='fixed')
    print('DTM model loaded')
    return model
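
A hedged usage sketch; the binary path, corpus, and dictionary below are assumptions standing in for real inputs:

# `bow_corpus` and `dictionary` built elsewhere with gensim.corpora
model = dtm_model('./dtm-linux64', corpus=bow_corpus,
                  time_seq=[10, 10, 10], num_topics=10,
                  id2word=dictionary, model='dtm')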
Example no. 6
def run_dtm(args, corpus, dictionary, time_slices, pre):
    """
        Function to run DTM over corpus.

        input:
            args (argparse object): input arguments
            corpus: corpus to run LDA over
            dictionary: dictionary from corpus
            time_slices (list): list containing number of files per time slice
            pre (str): path to save all results to

        returns DTM model
    """
    DTM_PATH = os.environ.get('DTM_PATH', None)
    if not DTM_PATH:
        raise ValueError("You need to set the DTM path.")
    # Run the model
    model = DtmModel(DTM_PATH,
                     corpus=corpus,
                     num_topics=args.num_topics,
                     id2word=dictionary,
                     time_slices=time_slices,
                     prefix=pre,
                     lda_sequence_max_iter=args.num_iterations)
    return model
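
A sketch of how this might be invoked; `args` is assumed to be an argparse namespace with `num_topics` and `num_iterations` (names taken from the function body), and the inputs are prepared elsewhere:

import os

os.environ['DTM_PATH'] = './dtm-linux64'   # assumption: local binary location
model = run_dtm(args, corpus, dictionary, time_slices, pre='./dtm_out/')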
Example no. 7
    def create_model(self):
        self.model = DtmModel(self.dtm_path,
                              self.corpus,
                              self.time_seq,
                              num_topics=15,
                              id2word=self.corpus.dictionary,
                              initialize_lda=True)
        return self.model
Example no. 8
def dtm_print_topic_all_time(dtm_model: DtmModel, topic_index, topn=10):
    time_index = 0
    while True:
        try:
            msg = dtm_model.print_topic(topic_index, time_index, topn)
            print(msg)
        except Exception:
            # print_topic raises once time_index runs past the last slice
            return
        time_index += 1
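
A brief usage sketch, assuming a model previously saved under the hypothetical name 'DTM':

model = DtmModel.load('DTM')   # hypothetical saved model
dtm_print_topic_all_time(model, topic_index=0, topn=10)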
Example no. 9
    def run_model(self):
        '''Run the selected topic model (LDA, DTM, or DIM) on the
        corpus and dictionary.
        '''
        if not hasattr(self, 'corpus'):
            # if there's no corpus present, read in saved corpus
            self.corpus = gensim.corpora.MmCorpus(
                os.path.join(_MODELS_DIR, self.corpus_file))

        if not hasattr(self.corpus, 'dictionary'):
            # if there's no dictionary present, read in saved dictionary
            self.corpus.dictionary = gensim.corpora.Dictionary.load(
                os.path.join(_MODELS_DIR, self.dict_file))
        if self.m_type == "LDA":
            # Run LDA model
            print("Running LDA Model")
            t0 = time.time()
            self.lda = gensim.models.LdaModel(self.corpus,
                                              id2word=self.corpus.dictionary,
                                              num_topics=self.num_topics)
            print(time.time() - t0)
        if self.m_type == "DTM":
            print("Running DTM Model")
            t0 = time.time()
            self.lda = DtmModel(self.dtm_path,
                                self.corpus,
                                self.time_seq,
                                num_topics=self.num_topics,
                                id2word=self.corpus.dictionary,
                                initialize_lda=True)
            print(time.time() - t0)
        if self.m_type == "DIM":
            print("Running DIM Model")
            t0 = time.time()
            self.lda = DtmModel(self.dtm_path,
                                self.corpus,
                                self.time_seq,
                                num_topics=self.num_topics,
                                model="fixed",
                                id2word=self.corpus.dictionary,
                                initialize_lda=True)
            print(time.time() - t0)
Example no. 10
def main(args):
    if args.model_type == "lda":
        loaded_model = LdaModel.load(args.model)

        for topic_num, topic_str in loaded_model.show_topics(num_topics=-1):
            print(str(topic_num) + ':', end=' ')
            for term in topic_str.split(' + '):
                weight, word = term.split('*')
                print(word, end=' ')
            print()

    elif args.model_type == "dtm":
        loaded_model = DtmModel.load(args.model)
        for topic_id in range(loaded_model.num_topics):
            for time in range(len(loaded_model.time_slices)):
                top_words = loaded_model.show_topic(topic_id, time, topn=10)

                print("Topic",
                      str(topic_id) + ", time slice",
                      str(time) + ':',
                      end=' ')
                for weight, word in top_words:
                    print(word, end=', ')
                print()
            print()
    elif args.model_type == "ldaseq":
        loaded_model = LdaSeqModel.load(args.model)
        # maybe use dtm_coherence?
        print(loaded_model.num_topics)
        print(loaded_model.time_slice)
        for topic_id in range(loaded_model.num_topics):
            for time in range(len(loaded_model.time_slice)):
                top_words = loaded_model.print_topic(topic=topic_id,
                                                     time=time,
                                                     top_terms=20)
                print("Topic",
                      str(topic_id) + ", time slice",
                      str(time) + ':',
                      end=' ')
                for word, weight in top_words:
                    print(word, end=' ')
                print()
            print()
    else:
        print("Unknown model type provided: " + args.model_type)
        sys.exit(1)
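
A hedged sketch of the argument parsing this entry point appears to expect; the flag names are inferred from the attribute accesses above:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('model', help='path to a saved model file')
    parser.add_argument('--model_type', choices=['lda', 'dtm', 'ldaseq'],
                        default='lda')
    main(parser.parse_args())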
Example no. 11
    def initialize_model(self):

        mydict = corpora.Dictionary()
        mycorpus = [
            mydict.doc2bow(doc, allow_update=True) for doc in self.flat_list
        ]

        start = time.time()
        model = DtmModel(dtm_path,
                         mycorpus,
                         self.time_slices,
                         num_topics=num_topics,
                         id2word=mydict,
                         initialize_lda=True,
                         top_chain_var=0.05,
                         lda_sequence_min_iter=15,
                         lda_sequence_max_iter=50)
        print(time.time() - start)

        return model, mycorpus, mydict
Example no. 12
def create_dtm_encoding(corpus, vector_size, dictionary, slices):
    path = './external/dtm_bin/'
    link = 'https://github.com/magsilva/dtm/tree/master/bin'
    content = [
        f for f in os.listdir(path)
        if os.path.isfile(os.path.join(path, f)) and 'dtm' in f
    ]
    if len(content) != 1:
        print(
            "Please place the appropriate binary file (and only this one) from {} into '{}'."
            .format(link, path))
        sys.exit(1)
    mod_path = path + content[0]
    dictionary.filter_extremes(keep_n=5000)
    bow_corpus = [dictionary.doc2bow(x) for x in corpus]
    mod = DtmModel(mod_path,
                   corpus=bow_corpus,
                   id2word=dictionary,
                   time_slices=slices,
                   num_topics=vector_size)
    return mod.gamma_, mod
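
A usage sketch; `texts` (a list of token lists), the dictionary, and the slice sizes below are stand-ins:

# gamma_ holds one row of topic proportions per document
doc_topics, model = create_dtm_encoding(texts, vector_size=20,
                                        dictionary=dictionary,
                                        slices=[100, 100, 100])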
Example no. 13
class DtmlModelSLab():
    def __init__(self, namespace: str, docs: List[str], time_slice: List[int]):
        self.namespace = namespace
        Path(namespace).mkdir(exist_ok=True, parents=True)

        self.docs = docs
        self.time_slice = time_slice

        self.dictionary = None
        self.corpus = None

        self.topic_num = None
        self.topic_index_list = None
        self.dtm_model = None

    def model(self,
              topic_num_best: int = None,
              topic_num_list: List[int] = range(2, 22, 2)):
        pkuseg = PKUSegment()

        docs_segmented = list()
        word_segment_list = list()
        tag_segment_list = list()
        time_slice_segmented = list()

        time_doc_count_accumulate = 0
        for time_doc_count in self.time_slice:
            doc_list_part, word_segment_list_part, tag_segment_list_part = pkuseg.segment_docs(
                self.docs[time_doc_count_accumulate:time_doc_count_accumulate +
                          time_doc_count],
                include_tag_list=[
                    'a', 'ad', 'j', 'l', 'n', 'ns', 'nt', 'nz', 'v', 'vd', 'vn'
                ],
                min_length=2)
            docs_segmented.extend(doc_list_part)
            word_segment_list.extend(word_segment_list_part)
            tag_segment_list.extend(tag_segment_list_part)
            time_slice_segmented.append(len(word_segment_list_part))

            time_doc_count_accumulate += time_doc_count

        dictionary, corpus = word_segment_list_to_dictionary_corpus(
            word_segment_list)

        self.dictionary = dictionary
        self.corpus = corpus
        self.word_segment_list = word_segment_list
        self.tag_segment_list = tag_segment_list
        self.docs = docs_segmented
        self.time_slice = time_slice_segmented

        lda_model = LdaModelSLab(self.namespace, docs_segmented)
        lda_model.word_segment_list = word_segment_list
        lda_model.corpus = corpus
        lda_model.dictionary = dictionary

        # Determine the optimal number of topics
        if topic_num_best is None:
            coherence_list, coherence_best, model_best, topic_num_best = lda_model.select_best_topic_num(
                topic_num_list)
        self.topic_num = topic_num_best

        # Train the model
        self.dtm_model = DtmModel('dtm-win64.exe',
                                  corpus,
                                  time_slice_segmented,
                                  num_topics=topic_num_best,
                                  id2word=dictionary,
                                  initialize_lda=True,
                                  lda_sequence_min_iter=30,
                                  lda_sequence_max_iter=100,
                                  lda_max_em_iter=50)

        # Get each document's dominant topic
        self.topic_index_list = np.argmax(self.dtm_model.gamma_, axis=1)

        df = pd.DataFrame({
            'doc': docs_segmented,
            'topic': self.topic_index_list
        })
        self.df = df
        return df

    def save(self):
        pickle_to_file(self, f'{self.namespace}/dtm_slab.pkl')

        # self.dtm_model.save(f'{self.namespace}/dtm_{self.topic_num}.model')
        # pickle_to_file(self.docs, f'{self.namespace}/docs.pkl')
        # pickle_to_file(self.df, f'{self.namespace}/dtm_df.pkl')

    @classmethod
    def load(cls, namespace: str):
        # docs = unpickle_from_file(f'{namespace}/docs.pkl')
        # instance = cls(namespace, docs)
        # instance.df = unpickle_from_file(f'{namespace}/dtm_df.pkl')

        instance = unpickle_from_file(f'{namespace}/dtm_slab.pkl')

        return instance

    def draw_topics(self, topn=10):
        for topic_index in range(self.topic_num):
            self.draw_topic(topic_index, topn)

        # Number of documents per topic
        df_topic = pd.DataFrame(np.argmax(self.dtm_model.gamma_, axis=1),
                                columns=['topic'])
        df_g = df_topic.groupby('topic').size()

        # a pandas Series has no .boxplot(); a bar chart shows the counts
        df_g.plot.bar()
        plt.savefig(f'{self.namespace}/dtm_topic_num.png')

    def draw_topic(self, topic_index: int, topn=10):
        time_length = len(self.time_slice)

        x = range(time_length)

        # Collect the top keywords across all time slices
        word_set = set()
        for time_index in range(time_length):
            for prob, word in self.dtm_model.show_topic(
                    topic_index, time_index, topn):
                word_set.add(word)
        word_stat = {word: [] for word in word_set}

        # For each time slice, look up each keyword's probability

        # Y-axis upper bound for the plots
        max_prob = 0

        for time_index in range(time_length):
            word_dict = {
                word: prob
                for prob, word in self.dtm_model.show_topic(
                    topic_index, time_index, topn)
            }
            for word in word_set:
                if word in word_dict:
                    word_stat[word].append(word_dict[word])
                    if word_dict[word] > max_prob:
                        max_prob = word_dict[word]
                else:
                    word_stat[word].append(0)

        # Count the documents assigned to the current topic
        current_topic_doc_num = pd.Series(
            np.argmax(self.dtm_model.gamma_,
                      axis=1)).value_counts().sort_index()[topic_index]
        total_doc_num = len(np.argmax(self.dtm_model.gamma_, axis=1))

        # Plot
        subplot_num = len(word_stat)
        subplot_col = 4
        subplot_row = math.ceil(float(subplot_num) / subplot_col)
        plt.figure(figsize=(4 * subplot_col, 4 * subplot_row))
        plt.suptitle(
            f'Topic {topic_index} of {self.dtm_model.num_topics}; documents in this topic: {current_topic_doc_num}/{total_doc_num}'
        )

        for word_index, (word, prob_list) in enumerate(word_stat.items()):
            plt.subplot(subplot_row, subplot_col, word_index + 1)
            plt.plot(x, prob_list, label=word)
            plt.xticks([*range(0, x[-1], 2), x[-1]])
            plt.ylim(0, max_prob)
            plt.legend()

        # save before show(); show() clears the current figure
        plt.savefig(f'{self.namespace}/dtm_topic{topic_index}.png')
        plt.show()

    def print_topic_all_time_slice(self, topic_index, topn=10):
        time_index = 0
        while True:
            try:
                msg = self.dtm_model.print_topic(topic_index, time_index, topn)
                print(msg)
            except Exception:
                # print_topic raises once time_index runs past the last slice
                return
            time_index += 1
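
A hedged end-to-end sketch of the class above; `load_raw_documents` is a hypothetical helper and the slice sizes are placeholders:

docs = load_raw_documents()       # hypothetical: returns List[str]
time_slice = [50, 60, 55]         # assumed documents per time slice
slab = DtmlModelSLab('dtm_demo', docs, time_slice)
df = slab.model()                 # segment, pick a topic count, train DTM
slab.save()
slab = DtmlModelSLab.load('dtm_demo')
slab.draw_topics(topn=10)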
Example no. 14
with open(MODEL_SAVE_NAME + "bow_paths.pk", "wb") as f:
    pickle.dump(bow_path_by_artist, f)


class BoWCorpus(object):
    # the default argument binds the module-level list at class-definition time
    def __iter__(self, bow_path_by_artist=bow_path_by_artist):
        for artist_id, artist_path, year in bow_path_by_artist:
            # Extract features for first song in the directory
            bow = np.load(BOW_DIR + artist_path +
                          os.listdir(BOW_DIR + artist_path)[0])
            # Convert to sparse encoding
            bow_sparse = [(idx, count) for (idx, count) in enumerate(bow)
                          if count > 0]
            yield bow_sparse


corpus = BoWCorpus()

start = time()

model = DtmModel(dtm_path,
                 corpus,
                 time_seq,
                 num_topics=NUM_TOPICS,
                 initialize_lda=True,
                 model='fixed')

# Save model
model.save(MODEL_SAVE_NAME)

print('Model fit in', ((time() - start) / 60.) / 60., 'hours')
Example no. 15
def DTMimplementForDatasets():
    cnxn = pyodbc.connect(
        'DRIVER={SQL Server};SERVER=DESKTOP-P61DTNE;DATABASE=Medline;UID=sa;PWD=0000'
    )
    cursor = cnxn.cursor()

    tokenizer = RegexpTokenizer(r'\w+')

    # create English stop words list
    en_stop = get_stop_words('en')

    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()

    with open('F:\\publication work\\Data\\temp\\sample_dataset.txt',
              'r') as f2:
        s2 = f2.read()
        all_dataset = ast.literal_eval(s2)

    for key, value in all_dataset.items():
        dataset_name = key
        stri = 'where'
        j = 0
        for i in value:
            if (j == 0):
                stri = stri + ' ' + 'TOPIC_NO=' + str(i)
                j = j + 1
            else:
                stri = stri + ' or ' + 'TOPIC_NO=' + str(i)
    # Declare list to create a list of the whole document set
        doc_set = list()
        list_of_topics = list()
        temp_dist_of_docs_over_topics = list()
        dist_of_docs_over_topics = list()
        pubmed_identifier_list = list()
        topic_doc_dictionary = {}
        cursor.execute(
            "SELECT [TOPIC_NO],[SERIAL_NO],[PUBMED_IDENTIFIER],[ABSTRACT] FROM [Medline].[dbo].[OnlyDeeplyRelaGeno2005]"
            + stri + ";")
        for row1 in cursor.fetchall():
            abstract = row1.ABSTRACT.strip()
            serial_no = row1.SERIAL_NO
            topic_no = row1.TOPIC_NO
            pubmed_identifier = row1.PUBMED_IDENTIFIER
            pubmed_identifier_list.append(pubmed_identifier)
            if topic_no in topic_doc_dictionary.keys():
                topic_doc_dictionary[topic_no].append(pubmed_identifier)
            else:
                topic_doc_dictionary[topic_no] = list()
                topic_doc_dictionary[topic_no].append(pubmed_identifier)
            number_of_topics_in_a_dataset = len(topic_doc_dictionary.keys())
            if not abstract:
                cursor1 = cnxn.cursor()
                cursor1.execute(
                    "SELECT [TITLE] FROM [Medline].[dbo].[OnlyDeeplyRelaGeno2005] where SERIAL_NO='"
                    + str(serial_no) + "';")
                for row in cursor1.fetchall():
                    abstract = row.TITLE.strip()
                    #print k
            doc_set.append(abstract)
        #number_of_topics_produced=10
        number_of_topics_produced = len(doc_set) // 24
        #number_of_topics_produced=len(doc_set)//70
        print('Number of documents: ' + str(len(doc_set)))
        print('Number of topics produced: ' + str(number_of_topics_produced))
        print('Number of passes: ' + str(number_of_passes))
        print('Number of clusters: ' + str(number_of_topics_in_a_dataset))

        # Declaring list for tokenized documents in loop
        texts = []

        # loop through document list
        for i in doc_set:

            # clean and tokenize document string
            raw = i.lower()
            tokens = tokenizer.tokenize(raw)
            #print(tokens)

            # remove stop words from tokens
            stopped_tokens = [i for i in tokens if not i in en_stop]

            # stem tokens
            stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]

            # add tokens to list
            texts.append(stemmed_tokens)

        # turn our tokenized documents into a id <-> term dictionary
        dictionary = corpora.Dictionary(texts)
        #print(dictionary.token2id)

        # convert tokenized documents into a document-term matrix
        Corpus = [dictionary.doc2bow(text) for text in texts]
        #print(corpus[0])
        """
        class DTMcorpus(corpora.textcorpus.TextCorpus):
        
            def get_texts(self):
                return self.input
        
            def __len__(self):
                return len(self.input)
        
        corpus = DTMcorpus(texts)
        
        """
        #if len(doc_set)*30%100:
        #    section30=len(doc_set)*30/100+1
        #else:
        #   section30=len(doc_set)*30/100
        #section70=len(doc_set)*70/100
        #time_seq = [section30, section70]
        time_seq = [len(doc_set), 0]
        dtm_path = r'C:\Program Files\DTM\dtm-win64.exe'  # raw string: keep backslashes literal
        dtmModel = DtmModel(dtm_path,
                            Corpus,
                            time_seq,
                            num_topics=number_of_topics_produced,
                            id2word=dictionary,
                            initialize_lda=True)

        for i in range(0, number_of_topics_produced):
            list_of_topics.append(dtmModel.show_topic(i, 1, 10))

        for i in range(0, len(doc_set)):
            temp_dist_of_docs_over_topics.append(dtmModel.gamma_[i])

        dist_of_docs_over_topics = []
        dist_of_docs_over_topicsindexer = -1
        for i in temp_dist_of_docs_over_topics:
            dist_of_docs_over_topicsindexer = dist_of_docs_over_topicsindexer + 1
            dist_of_docs_over_topics.append([])
            for j in i:
                dist_of_docs_over_topics[
                    dist_of_docs_over_topicsindexer].append(j)

        with open('F:\\publication work\\Data\\temp\\8.pubmed_identifier.txt',
                  'w') as f3:
            f3.write(str(pubmed_identifier_list))

        with open('F:\\publication work\\Data\\temp\\9.topics_list.txt',
                  'w') as f1:
            f1.write(str(list_of_topics))

        #print dist_of_new_docs_over_topics
        with open(
                'F:\\publication work\\Data\\temp\\12.distribution_of_topics_in_docs_bracket_replaced_only_prob.txt',
                'w') as f5:
            f5.write(str(dist_of_docs_over_topics))

        with open(
                'F:\\publication work\\Data\\temp\\17.topic_doc_dictionary.txt',
                'w') as f4:
            f4.write(str(topic_doc_dictionary))

        del doc_set
        del list_of_topics
        del dist_of_docs_over_topics
        del pubmed_identifier_list
        del stopped_tokens
        del stemmed_tokens
        del texts
        del dictionary
        del Corpus
        del tokens
        del dtmModel
        del topic_doc_dictionary

        #replaceBrackets()
        #print 'Bracket replacing completed'

        #keepingOnlyProbability(number_of_topics_produced)
        #print 'Keeping only probability completed'

        distanceFromJSD()
        print('Measuring distance completed')

        similarityFromJSD()
        print('Measuring similarity completed')

        spectralClustering(number_of_topics_in_a_dataset)
        print('Spectral clustering completed')

        combiningClusterResult(dataset_name)
        print('Combining clustering result completed')

        accuracyMeasure(dataset_name)
        print('Measuring accuracy completed')

    with open('F:\\publication work\\Data\\temp\\18.NMI_dictionary.txt',
              'w') as f6:
        f6.write(str(accuracy_dictionary))

    # the text files above were opened with `with`, so they are already closed
    cursor.close()
    cursor1.close()
    cnxn.close()
Example no. 16
# Read the time slices

with open(main_path + 'corpus/dtm_o/time_series.txt', 'r') as t:
    time_series = [int(i) for i in t.read().split()]

# Build the model

model_gen = DtmModel(dtm_path,
                     corpus=corpus,
                     time_slices=time_series,
                     mode=para['mode'],
                     model=para['model'],
                     num_topics=para['num_topics'],
                     id2word=corpus.dictionary,
                     prefix=None,
                     lda_sequence_min_iter=para['lda_sequence_min_iter'],
                     lda_sequence_max_iter=para['lda_sequence_max_iter'],
                     lda_max_em_iter=para['lda_max_em_iter'],
                     alpha=para['alpha'],
                     top_chain_var=para['top_chain_var'],
                     rng_seed=para['rng_seed'],
                     initialize_lda=para['initialize_lda'])

# model_gen = LdaSeqModel(corpus = corpus, time_slice=time_series, id2word = dictionary, num_topics = num_topics)
print('model training finished')
model_gen.save(main_path + 'result/dtm_o_' + sys.platform + '_topic_' +
               str(para['num_topics']) + '.model')
print('model saving finished')
#model1 = DtmModel.load('topic1.model')
#topics = model1.show_topic(topicid=0, time=0, topn=10)
Example no. 17
    #model_DTM = DtmModel(dtm_path, corpus_EI_toy, time_slices_EI_toy, num_topics=num_topics, id2word=corpus_EI_toy.dictionary,initialize_lda=True)
    #model_DTM.save('dtm_ei_10')
    print("Fin de l'entrainement du modèle DTM pour EI\n")

    print("\n---------------------\n")

    print("Début de l'entrainement du modèle DIM pour AE \n")
    #model_DIM = DtmModel(dtm_path, corpus_AE_toy, time_slices_AE_toy, num_topics=num_topics, id2word=corpus_AE_toy.dictionary, initialize_lda=True, model='fixed')
    #model_DIM.save('dim_ae_10')
    print("Fin de l'entrainement du modèle DIM pour AE\n")

    print("Début de l'entrainement du modèle DIM pour RI \n")
    model_DIM = DtmModel(dtm_path,
                         corpus_RI_toy,
                         time_slices_RI_toy,
                         num_topics=num_topics,
                         id2word=corpus_RI_toy.dictionary,
                         initialize_lda=True,
                         model='fixed')
    model_DIM.save('dim_ri_10')
    print("Fin de l'entrainement du modèle DIM pour RI\n")

    print("Début de l'entrainement du modèle DIM pour EI \n")
    model_DIM = DtmModel(dtm_path,
                         corpus_EI_toy,
                         time_slices_EI_toy,
                         num_topics=num_topics,
                         id2word=corpus_EI_toy.dictionary,
                         initialize_lda=True,
                         model='fixed')
    model_DIM.save('dim_ei_10')
Example no. 18
import pickle
import time
from gensim.models.wrappers.dtmmodel import DtmModel
from gensim import corpora
start_time = time.time()

dtm_path = "dtm-linux64"

# Load the pickled list of lemmatized texts
corpus = pickle.load(open('corpus_geo.pkl', 'rb'))

# Put the texts into gensim format
dictionary = corpora.Dictionary(corpus)
corpus = [dictionary.doc2bow(text) for text in corpus]

# For 10 topics
time_slice = [11468]*9
time_slice.append(11472)

# For 20 topics
#time_slice = [5734]*9
# time_slice.append(5738)

nb_topics = 10

model = DtmModel(dtm_path, corpus, time_slice, num_topics=nb_topics,
                 id2word=dictionary, initialize_lda=True)

model.save("DTMModel")

print("---- %s seconds ----" % (time.time() - start_time))
Example no. 19
def dtm_draw_topic(dtm_model: DtmModel,
                   topic_index: int,
                   time_num: int = None,
                   topn=10):
    # Automatically determine the number of time slices by probing the model
    if time_num is None:
        time_num = 0
        while True:
            try:
                dtm_model.show_topic(topic_index, time_num, topn)
                time_num += 1
            except Exception:
                # show_topic raises once time_num runs past the last slice
                break

    x = range(time_num)

    # Collect the top keywords across all time slices
    word_set = set()
    for time_index in range(time_num):
        for prob, word in dtm_model.show_topic(topic_index, time_index, topn):
            word_set.add(word)
    word_stat = {word: [] for word in word_set}

    # For each time slice, look up each keyword's probability
    max_prob = 0

    for time_index in range(time_num):
        word_dict = {
            word: prob
            for prob, word in dtm_model.show_topic(topic_index, time_index,
                                                   topn)
        }
        for word in word_set:
            if word in word_dict:
                word_stat[word].append(word_dict[word])
                if word_dict[word] > max_prob:
                    max_prob = word_dict[word]
            else:
                word_stat[word].append(0)

    # Count the documents assigned to the current topic
    current_topic_doc_num = pd.Series(np.argmax(
        dtm_model.gamma_, axis=1)).value_counts().sort_index()[topic_index]
    total_doc_num = len(np.argmax(dtm_model.gamma_, axis=1))

    # Plot
    subplot_num = len(word_stat)
    subplot_col = 4
    subplot_row = math.ceil(float(subplot_num) / subplot_col)
    plt.figure(figsize=(4 * subplot_col, 4 * subplot_row))
    plt.suptitle(
        f'Topic {topic_index} of {dtm_model.num_topics}; documents in this topic: {current_topic_doc_num}/{total_doc_num}'
    )

    for word_index, (word, prob_list) in enumerate(word_stat.items()):
        plt.subplot(subplot_row, subplot_col, word_index + 1)
        plt.plot(x, prob_list, label=word)
        plt.xticks([*range(0, x[-1], 2), x[-1]])
        plt.ylim(0, max_prob)
        plt.legend()
    plt.show()
Example no. 20
with open('NuclearEnergy/Data/total.txt', 'rb') as p:
    total = pickle.load(p)

total = total[total['press'].str.contains('한수원|원자력문화재단|원자력안전위원회|산자부', na=False)]
time_slice = list(total.groupby('pubtime')['pubtime'].count())

dic = corpora.Dictionary(total['article'])

tf = [dic.doc2bow(i) for i in total['article']]
tfidfm = models.TfidfModel(tf)
tfidf = tfidfm[tf]

# note: .corpus is the untransformed bag-of-words input; DTM expects integer
# counts, so the tf-idf weights computed above are not actually applied here
corpus = tfidf.corpus

model = DtmModel('C:/dtm-win64.exe', corpus, time_slice,
                 num_topics=20, id2word=dic)

with open('NuclearEnergy/Result/model1.txt', 'wb') as p:
    pickle.dump(model, p)

with open('NuclearEnergy/Result/model1.txt', 'rb') as p:
    model = pickle.load(p)

for i in range(0, 36):
    doc_topic_dists = pd.DataFrame(model.dtm_vis(corpus, i)[0])
    doc_topic_dists.index.name = 'doc'
    doc_topic_dists.columns.name = 'topic'

    doc_lengths = pd.Series(model.dtm_vis(corpus, i)[2])
    doc_lengths.name = 'doc_lengths'
    topic_freq = (doc_topic_dists.T * doc_lengths).T.sum()
Example no. 21
dialogues = [x for x in dialogues.values()]


no_topics = 2
#Create class wrapper
class DTMcorpus(corpora.textcorpus.TextCorpus):
    def get_texts(self):
        return self.input

    def __len__(self):
        return len(self.input)
corpus = DTMcorpus(dialogues)


logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
model = DtmModel(path_to_dtm, corpus, time_slices=[1, 1, 1, 1, 1, 1],
                 num_topics=no_topics, id2word=corpus.dictionary,
                 initialize_lda=True)


for i in range(0, no_topics):
    print(model.show_topic(topicid=i, time=0, num_words=8))


m = np.array(model.gamma_)

sns.set(style="whitegrid")
f, ax = plt.subplots()
for j in range(0, no_topics):
    plt.plot(np.arange(1, 7, 1), m[:, j], '-o', label="Topic {}".format(j))

f.legend(*ax.get_legend_handles_labels(), loc="center right",  fontsize='x-large')

Example no. 22

#Corpus class for DTM data load
class DTMcorpus(corpora.textcorpus.TextCorpus):
    def get_texts(self):
        return self.input

    def __len__(self):
        return len(self.input)


corpus = DTMcorpus(documents)

#path where dtm file is installed
dtm_path = "/home/ankit/NLP_Project/dtm/dtm/dtm"
model = DtmModel.load("DTMMOdel.txt")

#model.save("DTModel.txt")
#Show all topics (num_topics=-1) at the first time slice
tp = model.show_topics(num_topics=-1,
                       times=1,
                       num_words=100,
                       log=False,
                       formatted=False)
print(tp)
print(type(tp))
for i in tp:
    for j in i:
        print(type(j), j[1].decode("utf-8"))

    #print(i.decode("utf-8"))
Example no. 23
    tagged_words = pos_tag(words)
    lemmatized = [wnl.lemmatize(word.lower(), pos=penn2morphy(tag)) for word, tag in tagged_words]
    lemmatized = list(filter(lambda w: not any(p in w for p in punctuation) and w not in stopword_list and w not in punctuation and len(w) >= 3, lemmatized))
    return lemmatized

def timestamp():
    return datetime.now().strftime('%x %X')

print('({}) DTM training data preprocessing started'.format(timestamp()))
start = datetime.now()
orig_df = pd.read_pickle(r'dfs\2020-03-22-to-2020-11-18-1000-daily')
orig_texts = [preprocess(text) for text in orig_df['full_text']]
orig_dictionary = corpora.Dictionary(orig_texts)
orig_corpus = [orig_dictionary.doc2bow(text) for text in orig_texts]

dtm_model = DtmModel.load(r'dtm\2020-03-22-to-2020-11-18-1000-daily')
print('Time to preprocess training texts:', str(datetime.now() - start))

######################################
##### DAY-BY-DAY TOPIC LABELLING #####
######################################

conn = sqlite3.connect('database/tweets.db')

# df = pd.read_pickle(r'dfs\2020-03-22-to-2020-08-19-4000-daily')
# df = pd.read_sql_query('select * from tweets where "user.screen_name" in (select screen_name from labels) and created_at between \'2020-03-22\' and \'2020-11-18\'', conn)

# comment out n_tweets_per_day lines and cumulative_tweets lines when using full dataset
# n_tweets_per_day = df['created_at'].apply(lambda x: x[:10]).value_counts().sort_index().values.tolist()

# cumulative_tweets[i] is the index of the first Tweet from i days after START_DATE
Example no. 24
        u'competitive', u'package', u'bonus', u'corporate', u'equity',
        u'tuition', u'reimbursement', u'referral', u'bonus', u'holidays',
        u'insurance', u'flexible', u'disability', u'insurance',
        u'technologies', u'disability', u'accommodation', u'recruiter',
        u'techexpousa'
    ]
]

time_seq = [3, 7]  # first 3 documents are from time slice one
#  and the other 7 are from the second time slice.


class DTMcorpus(corpora.textcorpus.TextCorpus):
    def get_texts(self):
        return self.input

    def __len__(self):
        return len(self.input)


corpus = DTMcorpus(documents)

# trying to use compiled dtm C++ code but it won't work
dtm_path = "/Users/anselmscreen/github/dtm/gensim/dtm-darwin64.app"
model = DtmModel(dtm_path,
                 corpus,
                 time_seq,
                 num_topics=2,
                 id2word=corpus.dictionary,
                 initialize_lda=True)
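
A quick check on the toy model trained above:

# Inspect both topics at the first of the two time slices
for topic_id in range(2):
    print(model.show_topic(topic_id, 0, topn=5))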
Example no. 25
g.load_raw_corpus()
time_slices = [13, 11, 11, 10, 15]  # number of documents for each month


class DTMcorpus(corpora.textcorpus.TextCorpus):
    def get_texts(self):
        return self.input

    def __len__(self):
        return len(self.input)


corpus = DTMcorpus(g.corpus_raw)
model = DtmModel(dtm_compiled_path,
                 corpus,
                 time_slices,
                 num_topics=2,
                 id2word=corpus.dictionary,
                 initialize_lda=True)

# collect probabilities for chosen keyterms
words_of_interest = ['Tuerkei', 'Fluechtlinge', 'Oesterreich']
topic_choice = 0
results = {}
for w in words_of_interest:
    results[w] = []
for i in range(5):
    for (p, w) in model.show_topic(topic_choice, i):
        if w in words_of_interest:
            results[w].append(p)

# plot
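
The snippet ends at the plotting step; a minimal sketch of what that plot could look like, assuming matplotlib is available:

import matplotlib.pyplot as plt

for w, probs in results.items():
    plt.plot(range(len(probs)), probs, marker='o', label=w)
plt.xlabel('time slice')
plt.ylabel('probability')
plt.legend()
plt.show()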
Example no. 26
# save preprocessed corpus
with open(r'dtm\full-preprocessed-pickle', 'wb') as f:
    pickle.dump(texts, f)

# get time slices (number of tweets each day)
time_slices = df['created_at'].apply(
    lambda x: x[:10]).value_counts().sort_index().values.tolist()

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

################################################################################

dtm_exe_path = r'C:\Program Files\DTM\dtm-win64.exe'

print('({}) Model started training'.format(timestamp()))
start = datetime.now()
dtm_model = DtmModel(dtm_exe_path,
                     corpus=corpus[:],
                     time_slices=time_slices,
                     num_topics=20,
                     id2word=dictionary)
elapsed = datetime.now() - start
print('({}) Model finished training'.format(timestamp()))
print('Elapsed time:', elapsed)

print('Saving model...')
dtm_model.save(dtm_out_path)
print('({}) Model saved'.format(timestamp()))
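
`dtm_out_path` is defined elsewhere in this script; a later session could reload the saved model with:

dtm_model = DtmModel.load(dtm_out_path)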
Example no. 27
class Pipeline(object):
    def __init__(self,
                 key="Y02E_10",
                 m_type="LDA",
                 num_topics=7,
                 min_slice_size=200):
        self.num_topics = num_topics
        self.key = key
        self.data_file = '../Data/{}.csv'.format(key)
        self.dict_file = "{}.dict".format(key)
        self.corpus_file = "{}.mm".format(key)
        self.coords_file = "{}_coords.csv".format(key)
        self.topics_file = "{}_Topics.txt".format(key)
        self.approved_ids_file = "{}_approved_ids".format(key)
        self.m_type = m_type
        self.min_slice_size = min_slice_size

        dtm_home = os.environ.get('DTM_HOME', "dtm-master")
        self.dtm_path = os.path.join(dtm_home, 'bin',
                                     'dtm-darwin64') if dtm_home else None

    def make_corpus(self):
        # stop list from nltk
        stoplist = set(nltk.corpus.stopwords.words("english"))

        if self.m_type in ["DTM", "DIM"]:
            # time shape stuff
            self.time_seq, self.approved_ids = get_time_seq(
                self.data_file, self.min_slice_size)

            filehandler = open(_MODELS_DIR + self.approved_ids_file + ".obj",
                               "wb")
            pickle.dump(self.approved_ids, filehandler)
            filehandler.close()

            self.approved_ids = list(itertools.chain(*self.approved_ids))
            #self.corpus = DTMcorpus(self.corpus) # warning, this reads in the whole corpus to memory!
            self.corpus = MyCorpus(self.data_file, self.approved_ids)
        else:
            # instantiate corpus object
            self.corpus = MyCorpus(self.data_file)  # memory friendly corpus!

        print("Making dictionary")
        t0 = time.time()
        # create dictionary, remove stopwords and words only occurring once, apply stemming
        self.corpus.make_dictionary(
            stoplist=stoplist, minfreq=25)  # minfreq=25 to match Blei's paper
        print(time.time() - t0)

        print("saving dictionary")
        t0 = time.time()
        # save the dictionary
        self.corpus.dictionary.save(_MODELS_DIR + self.dict_file)
        print(time.time() - t0)

        print("Saving corpus")
        t0 = time.time()
        # save the corpus
        gensim.corpora.MmCorpus.serialize(_MODELS_DIR + self.corpus_file,
                                          self.corpus)
        print(time.time() - t0)

    def run_model(self):
        '''Run the selected topic model (LDA, DTM, or DIM) on the
        corpus and dictionary.
        '''
        if not hasattr(self, 'corpus'):
            # if there's no corpus present, read in saved corpus
            self.corpus = gensim.corpora.MmCorpus(
                os.path.join(_MODELS_DIR, self.corpus_file))

        if not hasattr(self.corpus, 'dictionary'):
            # if there's no dictionary present, read in saved dictionary
            self.corpus.dictionary = gensim.corpora.Dictionary.load(
                os.path.join(_MODELS_DIR, self.dict_file))
        if self.m_type == "LDA":
            # Run LDA model
            print("Running LDA Model")
            t0 = time.time()
            self.lda = gensim.models.LdaModel(self.corpus,
                                              id2word=self.corpus.dictionary,
                                              num_topics=self.num_topics)
            print(time.time() - t0)
        if self.m_type == "DTM":
            print("Running DTM Model")
            t0 = time.time()
            self.lda = DtmModel(self.dtm_path,
                                self.corpus,
                                self.time_seq,
                                num_topics=self.num_topics,
                                id2word=self.corpus.dictionary,
                                initialize_lda=True)
            print(time.time() - t0)
        if self.m_type == "DIM":
            print("Running DIM Model")
            t0 = time.time()
            self.lda = DtmModel(self.dtm_path,
                                self.corpus,
                                self.time_seq,
                                num_topics=self.num_topics,
                                model="fixed",
                                id2word=self.corpus.dictionary,
                                initialize_lda=True)
            print(time.time() - t0)

    def save_model(self, model_name="LDA_model"):
        '''Save the current LDA model to an object file in the saved
        models folder.
        '''
        filehandler = open(_MODELS_DIR + model_name + ".obj", "wb")
        pickle.dump(self.lda, filehandler)
        filehandler.close()

    def topics(self, model_name=None, save=True, viz=True):
        '''Print and optionally save the topics of a given LDA model.
        Uses the LDA model present on the object by default, unless an
        alternate saved version is specified.
        '''
        # If model != None:
        # 		try to get it from the folder
        #		except, it's not there, throw an error.

        if model_name is not None:
            # pickle files must be opened in binary mode
            filehandler = open(_MODELS_DIR + model_name + ".obj", 'rb')
            self.lda = pickle.load(filehandler)
            filehandler.close()

        if not hasattr(self, 'lda'):
            print("no LDA model detected")

        else:
            print(self.lda.print_topics(self.num_topics, num_words=25))
            self.topics = self.lda.show_topics(num_topics=self.num_topics,
                                               num_words=20)

            # save the topics and their constituent words
            if save:
                f = open(_MODELS_DIR + self.topics_file, 'w')
                print(self.topics, end="", file=f)
                f.close()

            if viz:
                if not hasattr(self, "topics"):
                    # read the saved topic string; the file object, not the
                    # string, is what needs closing
                    with open(os.path.join(_MODELS_DIR, self.topics_file),
                              'rb') as f:
                        self.topics = f.readlines()[0]
                # parsing topic string
                lines = str(self.topics).strip("[()").strip("]\n").split("(")
                lines = [i.strip("),").split(", u'")[1] for i in lines]

                # plotting word clouds of each topic
                curr_topic = 0
                #classes = np.array(target_labels)[np.array(list(manual_best)) - 1]
                for j, line in enumerate(lines):
                    scores = [
                        float(x.split("*")[0]) for x in line.split(" + ")
                    ]
                    words = [
                        x.split("*")[1].strip("'), ")
                        for x in line.split(" + ")
                    ]

                    freqs = []
                    for word, score in zip(words, scores):
                        freqs.append((word, score))

                    wc = WordCloud(max_words=100)
                    # fit_words expects a word -> weight mapping
                    elements = wc.fit_words(dict(freqs))
                    default_colors = wc.to_array()
                    plt.figure()
                    plt.title("Topic {}".format(j))  #classes[j])
                    plt.imshow(default_colors)
                    plt.axis("off")
                    plt.show()
                    curr_topic += 1
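
A hedged sketch of driving the pipeline end to end, using only the methods defined above:

p = Pipeline(key="Y02E_10", m_type="DTM", num_topics=7)
p.make_corpus()
p.run_model()
p.save_model(model_name="DTM_model")
p.topics(save=True, viz=False)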