Example 1
import math

import matplotlib.pyplot as plt
from gensim.models.wrappers.dtmmodel import DtmModel


def dtm_draw_topic(dtm_model: DtmModel, topic_index, time_num=None, topn=10):
    # Automatically determine the number of time slices by probing
    # show_topic until it fails
    if time_num is None:
        time_num = 0
        while True:
            try:
                dtm_model.show_topic(topic_index, time_num, topn)
                time_num += 1
            except Exception:
                break

    x = range(time_num)

    # Collect the keywords that appear in any time slice
    word_set = set()
    for time_index in range(time_num):
        for prob, word in dtm_model.show_topic(topic_index, time_index, topn):
            word_set.add(word)
    word_stat = {word: [] for word in word_set}

    # For each time slice, record each keyword's probability
    max_prob = 0

    for time_index in range(time_num):
        try:
            word_dict = {
                word: prob
                for prob, word in dtm_model.show_topic(topic_index, time_index,
                                                       topn)
            }
        except Exception:
            break
        for word in word_set:
            if word in word_dict:
                word_stat[word].append(word_dict[word])
                if word_dict[word] > max_prob:
                    max_prob = word_dict[word]
            else:
                word_stat[word].append(0)

    # Plot
    subplot_num = len(word_stat)
    subplot_col = 4
    subplot_row = math.ceil(float(subplot_num) / subplot_col)
    plt.figure(figsize=(4 * subplot_col, 4 * subplot_row))
    for word_index, (word, prob_list) in enumerate(word_stat.items()):
        plt.subplot(subplot_row, subplot_col, word_index + 1)
        plt.plot(x, prob_list, label=word)
        plt.ylim(0, max_prob)
        plt.legend()
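A minimal usage sketch for the helper above, assuming a locally compiled DTM binary (the 'dtm-win64.exe' path is a placeholder) and a tiny made-up corpus; only dtm_draw_topic itself comes from the example:

import matplotlib.pyplot as plt
from gensim import corpora
from gensim.models.wrappers.dtmmodel import DtmModel

texts = [['economy', 'bank', 'crisis'],
         ['bank', 'loan', 'crisis'],
         ['economy', 'market', 'loan'],
         ['market', 'bank', 'economy']]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Two time slices of two documents each; 'dtm-win64.exe' must point to
# a compiled DTM binary on this machine.
model = DtmModel('dtm-win64.exe', corpus, time_slices=[2, 2],
                 num_topics=2, id2word=dictionary, initialize_lda=True)
dtm_draw_topic(model, topic_index=0, topn=5)
plt.show()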
Example 2
import math
from pathlib import Path
from typing import List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim.models.wrappers.dtmmodel import DtmModel

# PKUSegment, LdaModelSLab, word_segment_list_to_dictionary_corpus,
# pickle_to_file and unpickle_from_file are helpers local to the
# source project.


class DtmlModelSLab():
    def __init__(self, namespace: str, docs: List[str], time_slice: List[int]):
        self.namespace = namespace
        Path(namespace).mkdir(exist_ok=True, parents=True)

        self.docs = docs
        self.time_slice = time_slice

        self.dictionary = None
        self.corpus = None

        self.topic_num = None
        self.topic_index_list = None
        self.dtm_model = None

    def model(self,
              topic_num_best: int = None,
              topic_num_list: List[int] = range(2, 22, 2)):
        pkuseg = PKUSegment()

        docs_segmented = list()
        word_segment_list = list()
        tag_segment_list = list()
        time_slice_segmented = list()

        time_doc_count_accumulate = 0
        for time_doc_count in self.time_slice:
            doc_list_part, word_segment_list_part, tag_segment_list_part = pkuseg.segment_docs(
                self.docs[time_doc_count_accumulate:time_doc_count_accumulate +
                          time_doc_count],
                include_tag_list=[
                    'a', 'ad', 'j', 'l', 'n', 'ns', 'nt', 'nz', 'v', 'vd', 'vn'
                ],
                min_length=2)
            docs_segmented.extend(doc_list_part)
            word_segment_list.extend(word_segment_list_part)
            tag_segment_list.extend(tag_segment_list_part)
            time_slice_segmented.append(len(word_segment_list_part))

            time_doc_count_accumulate += time_doc_count

        dictionary, corpus = word_segment_list_to_dictionary_corpus(
            word_segment_list)

        self.dictionary = dictionary
        self.corpus = corpus
        self.word_segment_list = word_segment_list
        self.tag_segment_list = tag_segment_list
        self.docs = docs_segmented
        self.time_slice = time_slice_segmented

        lda_model = LdaModelSLab(self.namespace, docs_segmented)
        lda_model.word_segment_list = word_segment_list
        lda_model.corpus = corpus
        lda_model.dictionary = dictionary

        # Compute the best number of topics
        if topic_num_best is None:
            coherence_list, coherence_best, model_best, topic_num_best = lda_model.select_best_topic_num(
                topic_num_list)
        self.topic_num = topic_num_best

        # Train the model
        self.dtm_model = DtmModel('dtm-win64.exe',
                                  corpus,
                                  time_slice_segmented,
                                  num_topics=topic_num_best,
                                  id2word=dictionary,
                                  initialize_lda=True,
                                  lda_sequence_min_iter=30,
                                  lda_sequence_max_iter=100,
                                  lda_max_em_iter=50)

        # Assign each document to its most probable topic
        self.topic_index_list = np.argmax(self.dtm_model.gamma_, axis=1)

        df = pd.DataFrame({
            'doc': docs_segmented,
            'topic': self.topic_index_list
        })
        self.df = df
        return df

    def save(self):
        pickle_to_file(self, f'{self.namespace}/dtm_slab.pkl')

        # self.dtm_model.save(f'{self.namespace}/dtm_{self.topic_num}.model')
        # pickle_to_file(self.docs, f'{self.namespace}/docs.pkl')
        # pickle_to_file(self.df, f'{self.namespace}/dtm_df.pkl')

    @classmethod
    def load(cls, namespace: str):
        # docs = unpickle_from_file(f'{namespace}/docs.pkl')
        # instance = cls(namespace, docs)
        # instance.df = unpickle_from_file(f'{namespace}/dtm_df.pkl')

        instance = unpickle_from_file(f'{namespace}/dtm_slab.pkl')

        return instance

    def draw_topics(self, topn=10):
        for topic_index in range(self.topic_num):
            self.draw_topic(topic_index, topn)

        # Document count per topic
        df_topic = pd.DataFrame(np.argmax(self.dtm_model.gamma_, axis=1),
                                columns=['topic'])
        df_g = df_topic.groupby('topic').size()

        # pd.Series has no boxplot(); use the plot accessor instead
        df_g.plot.box()
        plt.savefig(f'{self.namespace}/dtm_topic_num.png')

    def draw_topic(self, topic_index: int, topn=10):
        time_length = len(self.time_slice)

        x = range(time_length)

        # Collect the keywords that appear in any time slice
        word_set = set()
        for time_index in range(time_length):
            for prob, word in self.dtm_model.show_topic(
                    topic_index, time_index, topn):
                word_set.add(word)
        word_stat = {word: [] for word in word_set}

        # For each time slice, record each keyword's probability

        # Maximum Y value for the plots
        max_prob = 0

        for time_index in range(time_length):
            word_dict = {
                word: prob
                for prob, word in self.dtm_model.show_topic(
                    topic_index, time_index, topn)
            }
            for word in word_set:
                if word in word_dict:
                    word_stat[word].append(word_dict[word])
                    if word_dict[word] > max_prob:
                        max_prob = word_dict[word]
                else:
                    word_stat[word].append(0)

        # Count the documents assigned to the current topic
        current_topic_doc_num = pd.Series(
            np.argmax(self.dtm_model.gamma_,
                      axis=1)).value_counts().sort_index()[topic_index]
        total_doc_num = len(np.argmax(self.dtm_model.gamma_, axis=1))

        # Plot
        subplot_num = len(word_stat)
        subplot_col = 4
        subplot_row = math.ceil(float(subplot_num) / subplot_col)
        plt.figure(figsize=(4 * subplot_col, 4 * subplot_row))
        plt.suptitle(
            f'Topic {topic_index} of {self.dtm_model.num_topics}; '
            f'documents in this topic: {current_topic_doc_num}/{total_doc_num}'
        )

        for word_index, (word, prob_list) in enumerate(word_stat.items()):
            plt.subplot(subplot_row, subplot_col, word_index + 1)
            plt.plot(x, prob_list, label=word)
            plt.xticks([*range(0, x[-1], 2), x[-1]])
            plt.ylim(0, max_prob)
            plt.legend()

        # Save before show(): show() clears the current figure
        plt.savefig(f'{self.namespace}/dtm_topic{topic_index}.png')
        plt.show()

    def print_topic_all_time_slice(self, topic_index, topn=10):
        time_index = 0
        while True:
            try:
                msg = self.dtm_model.print_topic(topic_index, time_index, topn)
                print(msg)
            except Exception:
                return
            time_index += 1
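A hedged usage sketch for the class above; the documents, time_slice counts, and output directory are made-up placeholders, the project-local helpers must be importable, and model() still needs the dtm-win64.exe binary on the path:

docs = ['文本一', '文本二', '文本三', '文本四']  # raw documents
time_slice = [2, 2]  # number of documents per time period

slab = DtmlModelSLab('dtm_output', docs, time_slice)
df = slab.model(topic_num_best=4)  # skip the LDA topic-number search
slab.save()

reloaded = DtmlModelSLab.load('dtm_output')
reloaded.draw_topics(topn=10)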
Example 3
import ast

import pyodbc
from gensim import corpora
from gensim.models.wrappers.dtmmodel import DtmModel
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words

# distanceFromJSD, similarityFromJSD, spectralClustering,
# combiningClusterResult, accuracyMeasure, number_of_passes and
# accuracy_dictionary are defined elsewhere in the source project.


def DTMimplementForDatasets():
    cnxn = pyodbc.connect(
        'DRIVER={SQL Server};SERVER=DESKTOP-P61DTNE;DATABASE=Medline;UID=sa;PWD=0000'
    )
    cursor = cnxn.cursor()

    tokenizer = RegexpTokenizer(r'\w+')

    # create English stop words list
    en_stop = get_stop_words('en')

    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()

    with open('F:\\publication work\\Data\\temp\\sample_dataset.txt',
              'r') as f2:
        s2 = f2.read()
        all_dataset = ast.literal_eval(s2)

    for key, value in all_dataset.items():
        dataset_name = key
        stri = 'where ' + ' or '.join('TOPIC_NO=' + str(i) for i in value)
        # Lists covering the whole document set
        doc_set = list()
        list_of_topics = list()
        temp_dist_of_docs_over_topics = list()
        dist_of_docs_over_topics = list()
        pubmed_identifier_list = list()
        topic_doc_dictionary = {}
        cursor.execute(
            "SELECT [TOPIC_NO],[SERIAL_NO],[PUBMED_IDENTIFIER],[ABSTRACT] "
            "FROM [Medline].[dbo].[OnlyDeeplyRelaGeno2005] " + stri + ";")
        for row1 in cursor.fetchall():
            abstract = row1.ABSTRACT.strip()
            serial_no = row1.SERIAL_NO
            topic_no = row1.TOPIC_NO
            pubmed_identifier = row1.PUBMED_IDENTIFIER
            pubmed_identifier_list.append(pubmed_identifier)
            topic_doc_dictionary.setdefault(topic_no,
                                            []).append(pubmed_identifier)
            number_of_topics_in_a_dataset = len(topic_doc_dictionary)
            if not abstract:
                cursor1 = cnxn.cursor()
                cursor1.execute(
                    "SELECT [TITLE] FROM [Medline].[dbo].[OnlyDeeplyRelaGeno2005] where SERIAL_NO='"
                    + str(serial_no) + "';")
                for row in cursor1.fetchall():
                    abstract = row.TITLE.strip()
            doc_set.append(abstract)
        # One topic per 24 documents; num_topics must be an integer
        number_of_topics_produced = len(doc_set) // 24
        print('Number of documents: ' + str(len(doc_set)))
        print('Number of topics produced: ' + str(number_of_topics_produced))
        print('Number of passes: ' + str(number_of_passes))
        print('Number of clusters: ' + str(number_of_topics_in_a_dataset))

        # Declaring list for tokenized documents in loop
        texts = []

        # loop through document list
        for i in doc_set:

            # clean and tokenize document string
            raw = i.lower()
            tokens = tokenizer.tokenize(raw)
            #print(tokens)

            # remove stop words from tokens
            stopped_tokens = [i for i in tokens if i not in en_stop]

            # stem tokens
            stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]

            # add tokens to list
            texts.append(stemmed_tokens)

        # turn our tokenized documents into a id <-> term dictionary
        dictionary = corpora.Dictionary(texts)
        #print(dictionary.token2id)

        # convert tokenized documents into a document-term matrix
        Corpus = [dictionary.doc2bow(text) for text in texts]
        #print(corpus[0])
        """
        class DTMcorpus(corpora.textcorpus.TextCorpus):
        
            def get_texts(self):
                return self.input
        
            def __len__(self):
                return len(self.input)
        
        corpus = DTMcorpus(texts)
        
        """
        #if len(doc_set)*30%100:
        #    section30=len(doc_set)*30/100+1
        #else:
        #   section30=len(doc_set)*30/100
        #section70=len(doc_set)*70/100
        #time_seq = [section30, section70]
        # All documents fall into the first time slice; the second is empty
        time_seq = [len(doc_set), 0]
        dtm_path = r'C:\Program Files\DTM\dtm-win64.exe'
        dtmModel = DtmModel(dtm_path,
                            Corpus,
                            time_seq,
                            num_topics=number_of_topics_produced,
                            id2word=dictionary,
                            initialize_lda=True)

        for i in range(0, number_of_topics_produced):
            list_of_topics.append(dtmModel.show_topic(i, 1, 10))

        for i in range(0, len(doc_set)):
            temp_dist_of_docs_over_topics.append(dtmModel.gamma_[i])

        # Copy the per-document topic distributions into plain lists
        dist_of_docs_over_topics = [
            list(row) for row in temp_dist_of_docs_over_topics
        ]

        with open('F:\\publication work\\Data\\temp\\8.pubmed_identifier.txt',
                  'w') as f3:
            f3.write(str(pubmed_identifier_list))

        with open('F:\\publication work\\Data\\temp\\9.topics_list.txt',
                  'w') as f1:
            f1.write(str(list_of_topics))

        with open(
                'F:\\publication work\\Data\\temp\\12.distribution_of_topics_in_docs_bracket_replaced_only_prob.txt',
                'w') as f5:
            f5.write(str(dist_of_docs_over_topics))

        with open(
                'F:\\publication work\\Data\\temp\\17.topic_doc_dictionary.txt',
                'w') as f4:
            f4.write(str(topic_doc_dictionary))

        del doc_set
        del list_of_topics
        del dist_of_docs_over_topics
        del pubmed_identifier_list
        del stopped_tokens
        del stemmed_tokens
        del texts
        del dictionary
        del Corpus
        del tokens
        del dtmModel
        del topic_doc_dictionary

        #replaceBrackets()
        #print 'Bracket replacing completed'

        #keepingOnlyProbability(number_of_topics_produced)
        #print 'Keeping only probability completed'

        distanceFromJSD()
        print('Measuring distance completed')

        similarityFromJSD()
        print('Measuring similarity completed')

        spectralClustering(number_of_topics_in_a_dataset)
        print('Spectral clustering completed')

        combiningClusterResult(dataset_name)
        print('Combining clustering result completed')

        accuracyMeasure(dataset_name)
        print('Measuring accuracy completed')

    with open('F:\\publication work\\Data\\temp\\18.NMI_dictionary.txt',
              'w') as f6:
        f6.write(str(accuracy_dictionary))

    # Files opened with `with` are closed automatically
    cursor.close()
    cnxn.close()
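The WHERE clause above is assembled by string concatenation, which is brittle and injection-prone. A hedged sketch of the same query using pyodbc's `?` placeholders; fetch_abstracts is a hypothetical helper, while the table and column names come from the example:

import pyodbc


def fetch_abstracts(cnxn, topic_numbers):
    # One '?' placeholder per topic number; pyodbc binds the values safely
    placeholders = ' or '.join('TOPIC_NO=?' for _ in topic_numbers)
    sql = ('SELECT [TOPIC_NO],[SERIAL_NO],[PUBMED_IDENTIFIER],[ABSTRACT] '
           'FROM [Medline].[dbo].[OnlyDeeplyRelaGeno2005] '
           'where ' + placeholders + ';')
    cursor = cnxn.cursor()
    cursor.execute(sql, list(topic_numbers))
    return cursor.fetchall()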
Example 4
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim.models.wrappers.dtmmodel import DtmModel


def dtm_draw_topic(dtm_model: DtmModel,
                   topic_index: int,
                   time_num: int = None,
                   topn=10):
    # Automatically determine the number of time slices by probing
    # show_topic until it fails
    if time_num is None:
        time_num = 0
        while True:
            try:
                dtm_model.show_topic(topic_index, time_num, topn)
                time_num += 1
            except Exception:
                break

    x = range(time_num)

    # Collect the keywords that appear in any time slice
    word_set = set()
    for time_index in range(time_num):
        for prob, word in dtm_model.show_topic(topic_index, time_index, topn):
            word_set.add(word)
    word_stat = {word: [] for word in word_set}

    # For each time slice, record each keyword's probability
    max_prob = 0

    for time_index in range(time_num):
        word_dict = {
            word: prob
            for prob, word in dtm_model.show_topic(topic_index, time_index,
                                                   topn)
        }
        for word in word_set:
            if word in word_dict:
                word_stat[word].append(word_dict[word])
                if word_dict[word] > max_prob:
                    max_prob = word_dict[word]
            else:
                word_stat[word].append(0)

    # Count the documents assigned to the current topic
    current_topic_doc_num = pd.Series(np.argmax(
        dtm_model.gamma_, axis=1)).value_counts().sort_index()[topic_index]
    total_doc_num = len(np.argmax(dtm_model.gamma_, axis=1))

    # Plot
    subplot_num = len(word_stat)
    subplot_col = 4
    subplot_row = math.ceil(float(subplot_num) / subplot_col)
    plt.figure(figsize=(4 * subplot_col, 4 * subplot_row))
    plt.suptitle(
        f'Topic {topic_index} of {dtm_model.num_topics}; '
        f'documents in this topic: {current_topic_doc_num}/{total_doc_num}'
    )

    for word_index, (word, prob_list) in enumerate(word_stat.items()):
        plt.subplot(subplot_row, subplot_col, word_index + 1)
        plt.plot(x, prob_list, label=word)
        plt.xticks([*range(0, x[-1], 2), x[-1]])
        plt.ylim(0, max_prob)
        plt.legend()
    plt.show()
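The document-count bookkeeping above relies on gamma_ having one row per document and one column per topic. A small self-contained illustration of that argmax/value_counts step, with a made-up 4x2 matrix standing in for dtm_model.gamma_:

import numpy as np
import pandas as pd

# Stand-in for dtm_model.gamma_: 4 documents x 2 topics (made-up values)
gamma = np.array([[0.9, 0.1],
                  [0.2, 0.8],
                  [0.7, 0.3],
                  [0.4, 0.6]])

dominant = np.argmax(gamma, axis=1)  # dominant topic index per document
counts = pd.Series(dominant).value_counts().sort_index()
print(counts[0], '/', len(dominant))  # documents in topic 0 -> 2 / 4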
Example 5
import matplotlib.pyplot as plt
from matplotlib.legend_handler import HandlerLine2D
from gensim.models.wrappers.dtmmodel import DtmModel

# DTMcorpus is the TextCorpus wrapper shown in Example 6; g.corpus_raw,
# time_slices and dtm_compiled_path come from the surrounding project.
corpus = DTMcorpus(g.corpus_raw)
model = DtmModel(dtm_compiled_path,
                 corpus,
                 time_slices,
                 num_topics=2,
                 id2word=corpus.dictionary,
                 initialize_lda=True)

# collect probabilities for chosen keyterms
words_of_interest = ['Tuerkei', 'Fluechtlinge', 'Oesterreich']
topic_choice = 0
results = {}
for w in words_of_interest:
    results[w] = []
for i in range(5):
    for (p, w) in model.show_topic(topic_choice, i):
        if w in words_of_interest:
            results[w].append(p)

# plot
labels = ["January '16", "February '16", "March '16", "April '16", "May '16"]
line1, = plt.plot(results['Fluechtlinge'], label='"Refugee"', marker='o')
line2, = plt.plot(results['Oesterreich'], label='"Austria"', marker='o')
line3, = plt.plot(results['Tuerkei'], label='"Turkey"', marker='o')
plt.xticks([0, 1, 2, 3, 4], labels, rotation=70)
plt.ylabel('Word probability')
plt.xlabel('Publishing month of article')
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)},
           bbox_to_anchor=(1.3, 1))
plt.show()
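One caveat in the collection loop above: when a term of interest drops out of the top-n words for some time slice, nothing is appended and its series silently shifts left. A hedged variant that pads with 0.0 to keep all series aligned across the five slices:

results = {w: [] for w in words_of_interest}
for i in range(5):
    top_words = {w: p for (p, w) in model.show_topic(topic_choice, i)}
    for w in words_of_interest:
        # 0.0 keeps the series aligned when a term leaves the top n
        results[w].append(top_words.get(w, 0.0))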
Example 6
import logging

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from gensim import corpora
from gensim.models.wrappers.dtmmodel import DtmModel


# Corpus wrapper that hands the pre-tokenized input straight to gensim
class DTMcorpus(corpora.textcorpus.TextCorpus):
    def get_texts(self):
        return self.input

    def __len__(self):
        return len(self.input)


corpus = DTMcorpus(dialogues)


logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
model = DtmModel(path_to_dtm, corpus, time_slices=[1, 1, 1, 1, 1, 1],
                 num_topics=no_topics, id2word=corpus.dictionary,
                 initialize_lda=True)


for i in range(0, no_topics):
    print(model.show_topic(topicid=i, time=0, num_words=8))


m = np.array(model.gamma_)

sns.set(style="whitegrid")
f, ax = plt.subplots()
for j in range(0, no_topics):
    plt.plot(np.arange(1, 7, 1), m[:, j], '-o', label="Topic {}".format(j))

f.legend(*ax.get_legend_handles_labels(), loc="center right",
         fontsize='x-large')

ax.set_xlabel("Dialogue length", {'size': '16'})
ax.set_ylabel('Proportion of topics', {'size': '16'})

plt.tight_layout()
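For completeness, a minimal sketch of the inputs the snippet above assumes: dialogues is a list of six already-tokenized documents (matching time_slices=[1, 1, 1, 1, 1, 1]), and path_to_dtm and no_topics are placeholders:

dialogues = [
    ['hello', 'how', 'are', 'you'],
    ['fine', 'thanks', 'and', 'you'],
    ['great', 'what', 'about', 'work'],
    ['work', 'is', 'busy', 'lately'],
    ['busy', 'but', 'fine', 'overall'],
    ['good', 'to', 'hear', 'that'],
]
path_to_dtm = '/usr/local/bin/dtm-linux64'  # compiled DTM binary (placeholder)
no_topics = 2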