def get_lda_model():
    """
    (50,28767)
    获得话题
    :return:
    """
    text_array = list()

    with open("jobs-unigrams-filter") as f:
        for line in tqdm(f):
            line = line.strip().split(" ")
            line = line[1:]  # drop the leading token before the unigrams
            text_array.append(line)

    dictionary = Dictionary(text_array)
    # print(common_dictionary)
    common_corpus = [dictionary.doc2bow(text) for text in text_array]
    # Train the model on the corpus.
    lda = LdaModel(common_corpus,
                   id2word=dictionary,
                   num_topics=50,
                   passes=10,
                   iterations=1000)
    temp_file = datapath("LDA_twitter")
    lda.save(temp_file)
    topics = lda.get_topics()
    print(topics.shape)

    topic_list = lda.print_topics(50)
    for topic in topic_list:
        print(topic)
Example #2
def save_topic_word_matrix(lda: LdaModel, name: str):
    matrix = lda.get_topics()
    threshold = 1 / matrix.shape[1]
    matrix = np.where(matrix < threshold, 0, matrix)
    matrix = sp.csr_matrix(matrix)
    sp.save_npz(name, matrix)
    return matrix
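
A minimal usage sketch for reading the saved matrix back; the file name "topic_word.npz" and the trained `lda` variable are illustrative assumptions, not part of the example above:

import numpy as np
import scipy.sparse as sp

# Hypothetical usage: save the thresholded topic-word matrix, then reload it.
matrix = save_topic_word_matrix(lda, "topic_word.npz")

loaded = sp.load_npz("topic_word.npz")           # comes back as a CSR matrix
dense = loaded.toarray()                         # num_topics x vocab_size
top_word_ids = np.argsort(dense[0])[::-1][:10]   # ten strongest word ids for topic 0
print(top_word_ids)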
Example #3
    def __init__(self, normalized_text, visualize=False):
        _id = id(self)
        _name = self.__class__.__name__
        _length = len(normalized_text)

        print(_name, _id, "Establishing bigrams")
        bi_gram = Phrases(normalized_text)
        normalized_text = [bi_gram[line] for line in normalized_text]

        print(_name, _id, "Establishing word dictionary")
        dictionary = Dictionary(normalized_text)
        dictionary_words = [
            dictionary.doc2bow(text) for text in normalized_text
        ]

        print(_name, _id, "Constructing LDA model")
        lda_model = LdaModel(corpus=dictionary_words,
                             num_topics=10,
                             id2word=dictionary)

        if visualize:
            filename = 'visualization.json'
            visualization = pyLDAvis.gensim.prepare(lda_model,
                                                    dictionary_words,
                                                    dictionary)
            pyLDAvis.save_json(visualization, filename)
            with open(filename) as json_data:
                visual = json.load(json_data)
                self._topics = visual
        else:
            self._topics = lda_model.get_topics()
        print(_name, _id, "Topic modelling done")
Beispiel #4
0
def add_topical_network(
    result: res_pb.TopicQueryResult,
    topic_model: LdaModel,
    dictionary: Dictionary,
    graph_db: Sqlite3Graph,
    bow_db: Sqlite3Bow,
) -> None:
    """
  Adds the topical_network field to the result proto.
  Creates this network by the weighted jacquard of topics.

  The source and target words are going to be assigned indices -1 and -2.
  """
    # Shape: num_topics x vocab_size
    term_topic_mat = topic_model.get_topics()
    num_topics, vocab_size = term_topic_mat.shape

    source_word = estimate_plaintext_from_graph_key(
        graph_key=result.source,
        graph_db=graph_db,
        bow_db=bow_db,
    )
    assert source_word is not None, \
        f"Failed to find plaintext entry for {result.source}"
    source_word_idx = dictionary.token2id[source_word]
    source_graph_idx = -1
    source_vec = np.zeros(vocab_size)
    source_vec[source_word_idx] = 1

    target_word = estimate_plaintext_from_graph_key(
        graph_key=result.target,
        graph_db=graph_db,
        bow_db=bow_db,
    )
    assert target_word is not None, \
        f"Failed to find plaintext entry for {result.target}"
    target_word_idx = dictionary.token2id[target_word]
    target_graph_idx = -2
    target_vec = np.zeros(vocab_size)
    target_vec[target_word_idx] = 1

    graph_idx2vec = {
        topic_idx: term_topic_mat[topic_idx, :]
        for topic_idx in range(num_topics)
    }
    graph_idx2vec[source_graph_idx] = source_vec
    graph_idx2vec[target_graph_idx] = target_vec

    # Set all node names
    for idx in range(num_topics):
        result.topical_network.nodes[idx].name = f"Topic: {idx}"
    result.topical_network.nodes[source_graph_idx].name = \
        f"Source: '{result.source}' -- '{source_word}'"
    result.topical_network.nodes[target_graph_idx].name = \
        f"Target: '{result.target}' -- '{target_word}'"

    # Set all edges:
    for i, j, sim in _all_pairs_jaccard_comparisions(graph_idx2vec):
        result.topical_network.nodes[i].neighbors[j] = sim
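
The helper _all_pairs_jaccard_comparisions is referenced above but not shown. A plausible sketch of a weighted Jaccard over the graph_idx2vec entries, offered as an assumption rather than the project's actual implementation, could look like this:

from itertools import combinations
from typing import Dict, Iterable, Tuple

import numpy as np


def _all_pairs_jaccard_comparisions(
    graph_idx2vec: Dict[int, np.ndarray],
) -> Iterable[Tuple[int, int, float]]:
    """Yields (i, j, similarity) for every pair of vectors.

    Weighted Jaccard of nonnegative vectors u, v: sum(min(u, v)) / sum(max(u, v)).
    Sketch only; the real helper may normalize or filter pairs differently.
    """
    for i, j in combinations(sorted(graph_idx2vec), 2):
        u, v = graph_idx2vec[i], graph_idx2vec[j]
        denom = float(np.maximum(u, v).sum())
        sim = float(np.minimum(u, v).sum()) / denom if denom > 0 else 0.0
        yield i, j, sim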
Example #5
def get_LDA_model(text_array):
    """
    (30, 27445)
    """
    dictionary = Dictionary(text_array)
    # print(common_dictionary)
    common_corpus = [dictionary.doc2bow(text) for text in text_array]
    # Train the model on the corpus.
    lda = LdaModel(common_corpus, id2word=dictionary, num_topics=30, passes=5, iterations=500)
    temp_file = datapath("LDA_twitter")
    lda.save(temp_file)
    topics = lda.get_topics()
    print(topics.shape)
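
Since the model is persisted through datapath, it can be reloaded later. A short sketch, assuming the save above succeeded (standard gensim calls, not part of the original example):

from gensim.models import LdaModel
from gensim.test.utils import datapath

lda = LdaModel.load(datapath("LDA_twitter"))
print(lda.num_topics)      # 30 for the model trained above
print(lda.show_topic(0))   # [(word, probability), ...] for topic 0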
Example #6
class LDA():
    def __init__(self, K, data, AMask, params, name, dataName):
        self.K = K  # [int] number of topics
        self.AMask = AMask  # [n_a, n_d float] author participation matrix (1 if the author participated in the paper)
        self.n_a, self.n_d = self.AMask.shape  # [int] number of authors, number of documents
        self.D = data
        self.n_dic, self.n_d = self.D.shape
        self.name = name
        self.train_C_ = []
        self.train_param = params['train_param']
        for d in range(self.n_d):
            self.train_C_.append([(k, self.D[k, d])
                                  for k in range(self.n_dic)])

        self.dataName = dataName

    def train(self):
        self.LDA = LdaModel(self.train_C_,
                            num_topics=self.K,
                            decay=0.5,
                            offset=1024,
                            passes=80)
        self.phi = self.LDA.get_topics().transpose()
        self.theta = np.zeros((self.K, self.n_d))
        for d in range(self.n_d):
            tmp = self.LDA.get_document_topics(self.train_C_[d])
            ind = [c for (c, b) in tmp]
            self.theta[ind, d] = [b for (c, b) in tmp]
        self.D_reb = self.phi.dot(self.theta)
        self.A = normalize(self.AMask, 'l1', 0)
        return ()

    def save(self, path):
        '''
        Save theta, phi, A, K and train_param to <path><name>_<dataName>.pkl
        '''
        toSave = {}
        toSave['theta'] = self.theta
        toSave['phi'] = self.phi
        toSave['A'] = self.A
        toSave['K'] = self.K
        toSave['train_param'] = self.train_param
        with open(path + self.name + '_' + self.dataName + '.pkl',
                  'wb') as output:
            pickle.dump(toSave, output, pickle.HIGHEST_PROTOCOL)
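
A hypothetical usage sketch with synthetic counts; the shapes and the contents of params/AMask are assumptions read off the constructor comments, and the class's own imports (gensim's LdaModel, normalize, np) are assumed to be in scope:

import numpy as np

n_dic, n_d, n_a = 200, 30, 5                                  # vocabulary, documents, authors
D = np.random.randint(0, 4, size=(n_dic, n_d)).astype(float)  # term-by-document counts
AMask = (np.random.rand(n_a, n_d) < 0.3).astype(float)        # author participation matrix
params = {'train_param': {}}                                  # only 'train_param' is read above

model = LDA(K=10, data=D, AMask=AMask, params=params, name='lda', dataName='toy')
model.train()
print(model.phi.shape, model.theta.shape)   # (n_dic, K) and (K, n_d)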
Example #7
def lda_topics(processed_data: list, n_topics: int = 10, learning_decay: float = 0.5,
               learning_offset: float = 1.0, max_iter: int = 50, n_words: int = 10) -> Tuple[list, list]:
    """
    lda_topics perfoms LDA topic modeling on the input data

    :param processed_data: list of preprocessed segments
    :param n_topics: number of topics to extract form corpus
    :param learning_decay: learning decay parameter for LDA
    :param learning_offset: learning offset parameter for LDA
    :param max_iter: max. number of interations
    :param n_words: number of topic representatives

    :return:
        - topics - list of topics (and their representatives
        - doc_topics - list of predicted topics, one for each segment
    """

    dictionary = corpora.Dictionary(processed_data)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in processed_data]

    lda_model = LdaModel(doc_term_matrix, id2word=dictionary, num_topics=n_topics, offset=learning_offset,
                         random_state=42, update_every=1, iterations=max_iter,
                         passes=10, alpha='auto', eta="auto", decay=learning_decay, per_word_topics=True)

    topics = []
    for i_t, topic_word_dist in enumerate(lda_model.get_topics()):
        topic = [lda_model.id2word[w_id] for w_id, _ in lda_model.get_topic_terms(i_t, topn=n_words)]
        topics.append(topic)

    # getting documents topic labels
    doc_topics = []
    for doc in doc_term_matrix:

        doc_t_dist = sorted(lda_model.get_document_topics(doc), key=lambda item: item[1], reverse=True)
        t, _ = doc_t_dist[0]
        doc_topics.append(t)

    assert len(doc_topics) == len(processed_data)
    return topics, doc_topics
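
A brief usage example with toy segments (the token lists below are made up for illustration):

processed_data = [
    ["cell", "biology", "protein", "gene"],
    ["stock", "market", "price", "trade"],
    ["gene", "expression", "protein", "pathway"],
    ["market", "economy", "trade", "growth"],
]

topics, doc_topics = lda_topics(processed_data, n_topics=2, n_words=3)
print(topics)       # two lists of three representative words each
print(doc_topics)   # one predicted topic id per segment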
Example #8
class gensim_data(object):
    def __init__(self,mashup_descriptions, api_descriptions, mashup_categories=None, api_categories=None,tag_times=2,mashup_only=False,strict_train=False):
        self.mashup_only = mashup_only
        self.strict_train = strict_train
        # Merge text and tag information: each mashup/api is represented by one token list (one row)
        if tag_times > 0 and mashup_categories is not None:
            assert len(mashup_descriptions) == len(mashup_categories)
            self.mashup_dow = []
            for i in range(len(mashup_descriptions)):
                # Concatenate the description with the tags repeated tag_times
                # (simply repeating the tags increases their term counts).
                self.mashup_dow.append(mashup_descriptions[i] + mashup_categories[i] * tag_times)
        else:
            self.mashup_dow = mashup_descriptions
        self.mashup_dow = [[str(index) for index in indexes] for indexes in self.mashup_dow]  # 2-D list
        # print(self.mashup_dow[0])

        if tag_times > 0 and api_categories is not None:
            assert len(api_descriptions) == len(api_categories)
            self.api_dow = []
            for i in range(len(api_descriptions)):
                self.api_dow.append(api_descriptions[i] + api_categories[i] * tag_times)
        else:
            self.api_dow = api_descriptions
        self.api_dow = [[str(index) for index in indexes] for indexes in self.api_dow]

        if not self.mashup_only and not self.strict_train:
            self.dct = Dictionary(self.mashup_dow + self.api_dow)
        if self.mashup_only and self.strict_train:
            # BoW encoding of the mashups used for training
            self.train_mashup_dow = [self.mashup_dow[m_id] for m_id in dataset.crt_ds.his_mashup_ids]
            self.dct = Dictionary(self.train_mashup_dow)
            self.train_mashup_dow = [self.dct.doc2bow(mashup_info) for mashup_info in self.train_mashup_dow]  # (word id, count)
        # In any case, compute a feature for every mashup/api
        self.mashup_dow = [self.dct.doc2bow(mashup_info) for mashup_info in self.mashup_dow]  # (word id, count) for every mashup text
        print('self.mashup_dow, num:', len(self.mashup_dow))
        zero_num = sum([1 if len(mashup_info) == 0 else 0 for mashup_info in self.mashup_dow])
        print('zero_num', zero_num)
        self.api_dow = [self.dct.doc2bow(api_info) for api_info in self.api_dow]

        # print('len of self.mashup_dow,self.api_dow:{},{}'.format(len(self.mashup_dow),len(self.api_dow)))

        self.num_topics = 0
        self.model = None  # model used to process the text
        self._mashup_features = None  # feature vectors extracted from the text
        self._api_features = None

        self.mashup_topics = None  # the N highest-probability topics per text
        self.api_topics = None

    # Binary bag-of-words: only whether a word occurs in the text; used for cosine and Jaccard similarity
    def get_binary_v(self):
        dict_size = len(self.dct)
        mashup_binary_matrix = np.zeros((meta_data.mashup_num, dict_size))
        api_binary_matrix = np.zeros((meta_data.api_num, dict_size))
        mashup_words_list = []  # all words that occur in each mashup
        api_words_list = []
        for i in range(meta_data.mashup_num):
            temp_words_list, _ = zip(*self.mashup_dow[i])
            mashup_words_list.append(temp_words_list)
            for j in temp_words_list:  # indices of the words that occur
                mashup_binary_matrix[i][j] = 1.0

        for i in range(meta_data.api_num):
            temp_words_list, _ = zip(*self.api_dow[i])
            api_words_list.append(temp_words_list)
            for j in temp_words_list:  # indices of the words that occur
                api_binary_matrix[i][j] = 1.0
        return mashup_binary_matrix, api_binary_matrix, mashup_words_list, api_words_list

    def model_pcs(self,model_name,LDA_topic_num=None):
        # HDP output format: [(0, 0.032271167132309014), (1, 0.02362695056720504)]
        if self.mashup_only:
            if self.strict_train:
                train_corpus = self.train_mashup_dow
            else:
                train_corpus = self.mashup_dow
        else:
            if self.strict_train:
                train_corpus = self.train_mashup_dow + self.train_api_dow
            else:
                train_corpus = self.mashup_dow + self.api_dow

        if model_name == 'HDP':
            self.model = HdpModel(train_corpus, self.dct)
            self.num_topics = self.model.get_topics().shape[0]
            print('num_topics', self.num_topics)
        elif model_name == 'TF_IDF':
            self.model = TfidfModel(train_corpus)
            self.num_topics = len(self.dct)
        elif model_name == 'LDA':
            if LDA_topic_num is None:
                self.model = LdaModel(train_corpus)
            else:
                self.model = LdaModel(train_corpus, num_topics=LDA_topic_num)
            self.num_topics = self.model.get_topics().shape[0]

        else:
            raise ValueError('wrong gensim_model name!')

        # Apply the model to the texts, then convert to a dense np array (a value for every topic)
        # print(self.mashup_dow)
        self.mashup_features = [self.model[mashup_info] for mashup_info in self.mashup_dow]  # feature of each mashup/api
        # print(self.mashup_features)
        print('self.mashup_features, num:', len(self.mashup_features))
        zero_num1 = sum([1 if len(mashup_feature)==0 else 0 for mashup_feature in self.mashup_features])
        print('zero_num1',zero_num1)
        for i in range(len(self.mashup_features)):
            if len(self.mashup_features[i])==0:
                print(self.mashup_dow[i])

        self.api_features = [self.model[api_info] for api_info in self.api_dow]
        # print('when model-pcs,len of mashup_features and api_features:{},{}'.format(len(mashup_features),len(api_features)))
        self._mashup_features=np.zeros((meta_data.mashup_num, self.num_topics))
        self._api_features = np.zeros((meta_data.api_num, self.num_topics))
        for i in range(meta_data.mashup_num):  # only some dimensions have values; convert to a regular dense array
            for index,value in self.mashup_features[i]:
                self._mashup_features[i][index]=value
        for i in range(meta_data.api_num):
            for index,value in self.api_features[i]:
                self._api_features[i][index]=value
        return self._mashup_features, self._api_features

    def get_topTopics(self, topTopicNum=3):  # select the topTopicNum highest-probability topics, as [(),(),...]
        mashup_topics = []
        api_topics = []
        for index in range(meta_data.mashup_num):
            sorted_mashup_feature = sorted(self.mashup_features[index],key = lambda x:x[1],reverse=True)
            try:
                topic_indexes,_ = zip(*sorted_mashup_feature)
            except:
                # sometimes mashup_dow is non-empty but mashup_feature is empty
                topic_indexes = random.sample(range(meta_data.mashup_num),topTopicNum)
                # print(self.mashup_dow[index])
                # print(self.mashup_features[index])
                # print(sorted_mashup_feature)
                # raise ValueError('wrong 138!')
            num = min(len(topic_indexes),topTopicNum)
            mashup_topics.append(topic_indexes[:num])
        for index in range(meta_data.api_num):
            sorted_api_feature = sorted(self.api_features[index], key=lambda x: x[1], reverse=True)
            try:
                topic_indexes,_ = zip(*sorted_api_feature)
            except:
                topic_indexes = random.sample(range(meta_data.api_num), topTopicNum)
            num = min(len(topic_indexes),topTopicNum)
            api_topics.append(topic_indexes[:num])
        return mashup_topics,api_topics
Example #9
class gensim_data(object):
    def __init__(self, mashup_descriptions,mashup_categories,api_descriptions,api_categories,mashup_only=False,tag_times=2,strict_train=False):
        """
        使用gensim处理mashup和api的text和tag,得到特征表示,主题表示等
        :param tag_times: 编码时是否使用tag,以及tag的次数
        :param mashup_only: 是否只使用mashup信息: 用于LDA对mashup聚类
        :param strict_train: 是否仅使用训练集的信息
        """
        self.strict_train = strict_train
        self.mashup_only = mashup_only
        self.num_topics = 0
        self.model = None  # model used to process the text
        self.mashup_features,self.api_features = None,None # sparse feature vectors obtained from the model
        self.dense_mashup_features, self.dense_api_features = None, None  # dense feature vectors after conversion
        self.mashup_topics,self.api_topics = None,None  # the N highest-probability topics per text

        def initialize():
            # Merge text and tag information: each mashup/api is represented by one token list (one row)
            if tag_times > 0:
                assert len(mashup_descriptions) == len(mashup_categories)
                self.mashup_dow = []
                for i in range(len(mashup_descriptions)):
                    # Concatenate text and tags directly; is there a better way? Repetition increases the tag counts
                    self.mashup_dow.append(mashup_descriptions[i] + mashup_categories[i] * tag_times)
            else:
                self.mashup_dow = mashup_descriptions

            if tag_times > 0:
                assert len(api_descriptions) == len(api_categories)
                self.api_dow = []
                for i in range(len(api_descriptions)):
                    self.api_dow.append(api_descriptions[i] + api_categories[i] * tag_times)
            else:
                self.api_dow = api_descriptions

            if self.strict_train:
                # BoW encoding of the mashups used for training
                self.train_mashup_dow = [self.mashup_dow[m_id] for m_id in data_repository.get_ds().his_mashup_ids]
                self.dct = Dictionary(self.train_mashup_dow)
                self.train_mashup_dow = [self.dct.doc2bow(mashup_info) for mashup_info in
                                         self.train_mashup_dow]  # (word id, count)
            else:
                self.dct = Dictionary(self.mashup_dow + self.api_dow)

            # Compute a feature for every mashup/api
            self.mashup_dow = [self.dct.doc2bow(mashup_info) for mashup_info in self.mashup_dow]  # (word id, count) for every mashup text
            self.api_dow = [self.dct.doc2bow(api_info) for api_info in self.api_dow]
            # print('len of self.mashup_dow,self.api_dow:{},{}'.format(len(self.mashup_dow),len (self.api_dow)))

        initialize()

    def encode(self,docs):
        # Encode texts against the established dictionary
        return list(map(self.dct.doc2idx,docs))

    def get_feas(self,docs):
        # Encode and obtain feature vectors
        dows = list(map(self.dct.doc2idx,docs))
        feas = [self.model[dow] for dow in dows]
        return feas

    def get_all_encoded_comments(self):
        self.unpadded_encoded_mashup_texts = self.encode(get_iterable_values(data_repository.get_md().mashup_df,'final_description'))
        self.unpadded_encoded_mashup_tags = self.encode(get_iterable_values(data_repository.get_md().mashup_df,'Categories'))
        self.unpadded_encoded_api_texts = self.encode(get_iterable_values(data_repository.get_md().api_df,'final_description'))
        self.unpadded_encoded_api_tags = self.encode(get_iterable_values(data_repository.get_md().api_df,'Categories'))

    # Binary bag-of-words: only whether a word occurs in the text; used for cosine and Jaccard similarity
    def get_binary_v(self):
        dict_size = len(self.dct)
        mashup_binary_matrix = np.zeros((data_repository.get_md().mashup_num+1, dict_size))
        api_binary_matrix = np.zeros((data_repository.get_md().api_num+1, dict_size))
        mashup_words_list = []  # all words that occur in each mashup
        api_words_list = []
        for id in range(data_repository.get_md().mashup_num+1):
            temp_words_list, _ = zip(*self.mashup_dow[id])
            mashup_words_list.append(temp_words_list)
            for j in temp_words_list:  # indices of the words that occur
                mashup_binary_matrix[id][j] = 1.0

        for id in range(data_repository.get_md().api_num+1):
            temp_words_list, _ = zip(*self.api_dow[id])
            api_words_list.append(temp_words_list)
            for j in temp_words_list:  # indices of the words that occur
                api_binary_matrix[id][j] = 1.0
        return mashup_binary_matrix, api_binary_matrix, mashup_words_list, api_words_list

    def model_pcs(self, model_name, LDA_topic_num=None):
        # Apply a model and return mashup and api features; different models can be applied to the same corpus in turn
        # HDP output format: [(0, 0.032271167132309014), (1, 0.02362695056720504)]
        if self.mashup_only:
            if self.strict_train:
                train_corpus = self.train_mashup_dow
            else:
                train_corpus = self.mashup_dow
        else:
            if self.strict_train:
                train_corpus = self.train_mashup_dow + self.api_dow
            else:
                train_corpus = self.mashup_dow + self.api_dow

        if model_name == 'HDP':
            self.model = HdpModel(train_corpus, self.dct)
            self.num_topics = self.model.get_topics().shape[0]
            print('num_topics', self.num_topics)
        elif model_name == 'TF_IDF':
            self.model = TfidfModel(train_corpus)
            self.num_topics = len(self.dct)
        elif model_name == 'LDA':
            if LDA_topic_num is None:
                self.model = LdaModel(train_corpus)
            else:
                self.model = LdaModel(train_corpus, num_topics=LDA_topic_num)
            self.num_topics = self.model.get_topics().shape[0]
        else:
            raise ValueError('wrong gensim_model name!')

        # Apply the model to the texts to get sparse feature vectors, then convert to a dense np array (a value for every topic)
        # *** Since mashup_dow and api_dow cover all mashup/api texts by default, the feature lists can be indexed by global ids ***
        self.mashup_features = [self.model[mashup_info] for mashup_info in self.mashup_dow]  # feature of each mashup/api
        self.api_features = [self.model[api_info] for api_info in self.api_dow]
        self.dense_mashup_features = np.zeros((data_repository.get_md().mashup_num, self.num_topics))
        self.dense_api_features = np.zeros((data_repository.get_md().api_num, self.num_topics))
        for i in range(data_repository.get_md().mashup_num):  # only some dimensions have values; convert to a regular dense array
            for index, value in self.mashup_features[i]:
                self.dense_mashup_features[i][index] = value
        for i in range(data_repository.get_md().api_num):
            for index, value in self.api_features[i]:
                self.dense_api_features[i][index] = value
        return self.dense_mashup_features, self.dense_api_features

    def get_topTopics(self, topTopicNum=3):  # select the topTopicNum highest-probability topics, as [(),(),...]
        mashup_topics = []
        api_topics = []
        for index in range(data_repository.get_md().mashup_num):
            sorted_mashup_feature = sorted(self.mashup_features[index], key=lambda x: x[1], reverse=True)
            try:
                topic_indexes, _ = zip(*sorted_mashup_feature)
            except:
                # sometimes mashup_dow is non-empty but mashup_feature is empty
                topic_indexes = random.sample(range(data_repository.get_md().mashup_num), topTopicNum)
            num = min(len(topic_indexes), topTopicNum)
            mashup_topics.append(topic_indexes[:num])
        for index in range(data_repository.get_md().api_num):
            sorted_api_feature = sorted(self.api_features[index], key=lambda x: x[1], reverse=True)
            try:
                topic_indexes, _ = zip(*sorted_api_feature)
            except:
                topic_indexes = random.sample(range(data_repository.get_md().api_num), topTopicNum)
            num = min(len(topic_indexes), topTopicNum)
            api_topics.append(topic_indexes[:num])
        return mashup_topics, api_topics
Example #10
def abandon():
    stopWords = set(stopwords.words('english'))

    for w in string.punctuation:
        stopWords.add(w)

    stops_words = [
        "rt", "…", "...", "URL", "http", "https", "“", "”", "‘", "’", "get",
        "2", "new", "one", "i'm", "make", "go", "good", "say", "says", "know",
        "day", "..", "take", "got", "1", "going", "4", "3", "two", "n", "like",
        "via", "u", "would", "still", "first", "really", "watch", "see",
        "even", "that's", "look", "way", "last", "said", "let", "twitter",
        "ever", "always", "another", "many", "things", "may", "big", "come",
        "keep", "5", "time", "much", "want", "think", "us", "love", "people",
        "need"
    ]

    for w in stops_words:
        stopWords.add(w)

    tokenizer = CustomTweetTokenizer(preserve_case=False,
                                     reduce_len=True,
                                     strip_handles=False,
                                     normalize_usernames=False,
                                     normalize_urls=True,
                                     keep_allupper=False)

    cnt = Counter()
    texts = []
    # comm = json.load(open("data/louvain_rst.json"))
    # users_comm = {str(u) for u in comm if comm[u] == 0}
    # print(len(users_comm))

    # loading data
    data = pd.read_csv("data/ira-tweets-ele.csv",
                       usecols=["tweet_text", "userid"])
    for i, row in tqdm(data.iterrows()):
        # if row["userid"] not in users_comm:
        #     continue
        words = tokenizer.tokenize(row["tweet_text"])
        words = [w for w in words if w not in stopWords and w]
        # if words[0] == "RT":
        #     continue
        for w in words:
            cnt[w] += 1
        texts.append(words)
    print(len(texts))
    json.dump(cnt.most_common(), open("data/word_cloud.json", "w"), indent=2)

    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(t) for t in texts]

    def average_distance(v_tops):
        _sum = 0
        _cnt = 0
        for i in range(len(v_tops)):
            for j in range(i + 1, len(v_tops)):
                _sum += scipy.spatial.distance.cosine(v_tops[i], v_tops[j])
                _cnt += 1
        return _sum / _cnt

    with open("data/IRA_topics.txt", "w") as f:
        for n in range(2, 12):
            print(f"N = {n}")
            lda = LdaModel(corpus, num_topics=n, random_state=42)
            v_topics = lda.get_topics()
            lda.save(f"model/lda-ira-{n}.mod")
            # pprint(lda.print_topics())

            f.write(f"Perplexity: {lda.log_perplexity(corpus)}"
                    )  # a measure of how good the model is. lower the better.

            # Compute coherence score (c_v coherence needs the tokenized texts, not the BoW corpus)
            coherence_model_lda = CoherenceModel(model=lda,
                                                 texts=texts,
                                                 dictionary=dictionary,
                                                 coherence='c_v')
            coherence_lda = coherence_model_lda.get_coherence()
            f.write(f"Coherence Score: {coherence_lda}")
            f.write(f"~Average distance: {average_distance(v_topics)}\n")
            # show
            x = lda.show_topics(num_topics=n, num_words=20, formatted=False)
            topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]
            dictionary.id2token = {
                v: k
                for k, v in dictionary.token2id.items()
            }
            # Below Code Prints Topics and Words
            for topic, words in topics_words:
                f.write(
                    str(topic) + " :: " +
                    str([dictionary.id2token[int(w)] for w in words]) + "\n")
            f.write("\n")
Example #11
id2word = tokenizer.decoder
######################################
# Set training parameters.
num_topics = 20
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

model = LdaModel(corpus=topical_dataset,
                 id2word=id2word,
                 chunksize=chunksize,
                 alpha='auto',
                 eta='auto',
                 iterations=iterations,
                 num_topics=num_topics,
                 passes=passes,
                 eval_every=eval_every)

top_topics = model.top_topics(topical_dataset)

np.save("topical_dataset_topics.npy", model.get_topics())
# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint

pprint(top_topics)

pickle.dump(top_topics, open("top_topical_dataset_topics.p", "wb"))
Example #12
# data preparation
train_C_ = []
for d in range(n_doc):
    train_C_.append([(k, M_train[k, d]) for k in range(n_dic)])

#%%
l_k = [20]  #[20,50,100,150]
n_k = len(l_k)
store = np.zeros((n_k, 5))

#%%
i = 0
for K in l_k:
    t = time.time()
    LDA = LdaModel(train_C_, num_topics=K)
    phiLDA = LDA.get_topics().transpose()
    thetaLDA = np.zeros((K, n_doc))
    for d in range(n_doc):
        tmp = LDA.get_document_topics(train_C_[d])
        ind = [c for (c, b) in tmp]
        thetaLDA[ind, d] = [b for (c, b) in tmp]
    t1 = time.time()
    print('LDA for k = ' + str(K), ', time = ' + str(t1 - t))
    t = t1
    # aLDA on LDA
    init = {}
    init['A'] = np.eye(n_doc)
    init['theta'] = thetaLDA
    init['phi'] = phiLDA

    aLDALDA = aLDA_estimator(K, M_train, np.eye(n_doc), 5, 5, 1, True, init)
# Set training parameters.
num_topics = 20
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

model = LdaModel(
    corpus=wiki,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

top_topics = model.top_topics(wiki)


np.save("wiki_topics.npy", model.get_topics())
# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

pickle.dump(top_topics, open("top_topics.p", "wb"))
Example #14
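The snippet below queries an already-trained model; `lda` and `test` are not defined in the excerpt, so a minimal assumed setup that makes the calls runnable might be:

from gensim.corpora import Dictionary
from gensim.models import LdaModel

texts = [["human", "computer", "interface"],
         ["graph", "trees", "minors"],
         ["human", "system", "graph"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]
lda = LdaModel(corpus, id2word=dictionary, num_topics=2, passes=10)
test = dictionary.doc2bow(["human", "graph"])
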
print(lda.get_document_topics(test))
print(lda[test])

# get_term_topics(word_id, minimum_probability=None)
# Topics associated with the given word.
# Each topic is represented as a tuple of (topic_id, term_probability).
print(lda.get_term_topics(0))

# ----- Composition of a given topic -----
# get_topic_terms(topicid, topn=10)
# Output: list, format: [(word_id, probability), ...].
print(lda.get_topic_terms(0))
# show_topic(topicno, topn=10); output: [(word, probability), ...]
print(lda.show_topic(0))
# print_topic(topicno, topn=10)
# Output: string, format: '-0.340 * "category" + 0.298 * "$M$" + 0.183 * "algebra" + ...'.
print(lda.print_topic(0))

# ----- Composition of all topics -----
# Default parameters: (num_topics=10, num_words=10, log=False, formatted=True)
# Output: list of (topic_id, string) pairs, e.g. [(0, '-0.340 * "category" + 0.298 * "$M$" + ...'), ...]
print(lda.show_topics())
# [num_topics, vocabulary_size] array of floats (self.dtype)
# which represents the term-topic matrix learned during inference.
print(lda.get_topics())

# ----- Save and load the model -----
lda.save(fname="lda_model")
lda = LdaModel.load(fname="lda_model")
print(lda[test])
Example #15
def get_topic_word_matrix(lda: LdaModel) -> np.ndarray:
    return lda.get_topics()
Example #16
k_list = [10,15,20,30]
nb_k = len(k_list)
aLDA_store_train = np.zeros((nb_fold,nb_k))
aLDA_store_test = np.zeros((nb_fold,nb_k))
LDA_store_train = np.zeros((nb_fold,nb_k))
LDA_store_test = np.zeros((nb_fold,nb_k)) 

aLDAgen = aLDA_generator(n_dic, n_w, A_mask, K, alpha, beta, gamma)
aLDAgen.itialise()
for f in range(nb_fold):
      train_Z,train_C,train_D,train_C_  = aLDAgen.generate()
      for k in range(nb_k):
            aLDA = aLDA_estimator(k_list[k], train_C, A_mask, 3, 3, 1, False)
            aLDA.gd_ll(0.05, 60, 0,0.0,0,1)
            LDA = LdaModel(train_C_, num_topics=k_list[k])
            phiGen = LDA.get_topics().transpose()
            thetaGen = 0*aLDA.thetaStar
            for d in  range(n_a):
                  tmp = LDA.get_document_topics(train_C_[d])
                  ind = [c for (c,b) in tmp]
                  thetaGen[ind,d] = [b for (c,b) in tmp]
            aLDA_store_train[f,k] = aLDA.llgd[-1,0]
            LDA_store_train[f,k] = loglikaLDA(thetaGen, phiGen, A_mask, train_D,  alpha, beta,1)
            aLDA_store_test[f,k] = np.sum(np.sum(np.log(aLDA.phiStar.dot(aLDA.thetaStar).dot(aLDA.AStar))*(aLDAgen.phi.dot(aLDAgen.theta).dot(aLDAgen.A))))*n_w
            LDA_store_test[f,k] = np.sum(np.sum(np.log(phiGen.dot(thetaGen).dot(aLDA.AStar))*(aLDAgen.phi.dot(aLDAgen.theta).dot(aLDAgen.A))))*n_w
      print(f)


#plt.plot(aaa.llgd[:,0])
#print('elapsed'+str(time.time()-t1))   
Example #17
tpl = lda.print_topics(num_topics=6, num_words=5)
topic, contrib = zip(*tpl)
DTdist = pd.DataFrame(
    contrib,
    columns=[
        "Top 5 words that contribute to each topic with associated probability"
    ],
    index=indx)

distLatex = DTdist.to_latex(index=True, index_names="Topics")
# document distribution
doc_distribution = np.array([
    tup[1]
    for tup in lda.get_document_topics(bow=corpus, per_word_topics=False)
])
obj = lda.get_topics()
a = lda.inference(corpus)
print(doc_distribution[:853])
# training corpus document by topic matrix
doc_topic_dist_corpus = np.array([[tup[1] for tup in lst]
                                  for lst in lda[corpus]])
save_obj(lda, 'LDA_MODEL_APPLICATION')
#%%
lda = load_obj('LDA_MODEL_APPLICATION')
fig, axes = plt.subplots(2, 3, figsize=(20, 10), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)
    plt.imshow(
        WordCloud(background_color="white").fit_words(
            dict(lda.show_topic(i, 200))))
Example #18
                 num_topics=num_topics,
                 chunksize=chunksize,
                 passes=passes,
                 iterations=iterations,
                 eval_every=eval_every
                 #alpha='auto',
                 #eta='auto',
                 )

#top_topics = model.top_topics(corpus)
#print(top_topics)
#print(type(top_topics))

#avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
#print('Average topic coherence: %.4f.' % avg_topic_coherence)
tops = model.get_topics()
np.savetxt("lda_output.csv", tops, delimiter=",")
# print(len(tops[0]))
# print(tops)

#from pprint import pprint
#pprint(top_topics)

POLE_topics = {'has_POLE': {}, 'no_POLE': {}}

MSI_topics = {'MSI_high': {}, 'MSI_medium': {}, 'MSI_low': {}}

with open('counts_with_metadata.csv', mode='r') as f:
    reader = csv.reader(f)
    header = reader.__next__()
    index = 0
Example #19
                                                      cluster_range,
                                                      n_rand_init=1)
# plot_max_modularities(modularities, cluster_range)
nbre_cluster = modularities.index(max(modularities)) + cluster_range[0]

liste_doc_traite = pickle.load(open("docment_traites.p", 'rb'))
corpus = [
    objet_texte.txt_lemma_str for doc in liste_doc_traite
    for (objet_texte, nature, docID) in doc
]
bow = Text2BowTransformer()
corpus_2 = bow.fit_transform(corpus)
model_lda = LdaModel(corpus_2,
                     num_topics=nbre_cluster,
                     id2word=bow.gensim_model)
topics_lda = model_lda.get_topics()
model_lsi = LsiModel(corpus_2,
                     num_topics=nbre_cluster,
                     id2word=bow.gensim_model)
dico_id2mot = bow.gensim_model.id2token


def subsampling(*arg):
    m = max(arg)
    rep = [False] * len(arg)
    ind = arg.index(m)
    rep[ind] = m
    return rep


def mot_clustering(model, dico):
Example #20
print('Reading dataset')
data = pd.read_parquet(args.input_filepath)

print('Normalizing text')
data.text = data.text.map(nlp.normalize_text)

print('Building docterm matrix')
docterm, dictionary = nlp.get_docterm_matrix(data.text)
doclength = np.array([sum(x[1] for x in doc) for doc in docterm])

print('Training LDA model')
lda = LdaModel(docterm, num_topics=args.n_topics)

print('Getting document topics')
doctopics = corpus2csc([lda.get_document_topics(doc) for doc in docterm])
termtopics = lda.get_topics()

print('Computing topic volume time series')
topic_volume_over_time = nlp.get_topic_volume_over_time(data, doctopics, 20)

print('Computing topic coordinates')
topic_coordinates = nlp.get_topic_coordinates(termtopics, method='mds')
topic_proportions = nlp.get_topic_proportions(doctopics, doclength)

print('Computing term frequencies')
term_frequencies = nlp.get_term_frequencies(docterm, termtopics,
                                            topic_proportions, doclength)

print('Computing term ranks per topic')
term_ranks = nlp.get_topic_term_ranks(docterm, termtopics)
Example #21
def lda_train(p_generate, theta_generate, phi_generate, num_topics, num_docs):
    import matplotlib.pyplot as plt
    from gensim.models import LdaModel, LdaMulticore
    import gensim.downloader as api
    from gensim.utils import simple_preprocess, lemmatize
    import nltk
    from nltk.corpus import stopwords
    from gensim import corpora
    import re
    import pyLDAvis
    import logging
    import numpy as np
    import scipy
    import sys
    from itertools import permutations
    from gensim.models import CoherenceModel
    np.set_printoptions(threshold=sys.maxsize)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    if __name__ == '__main__':
        __spec__ = None
    # Load dictionary and corpus
    dct = corpora.Dictionary.load('dct.dict')
    corpus = corpora.MmCorpus('corpus.mm')
    num_words = len(dct)
    # Step 4: Train the LDA model
    lda_model = LdaModel(corpus=corpus,
                         id2word=None,
                         num_topics=num_topics,
                         random_state=100,
                         update_every=1,
                         chunksize=100,
                         passes=10,
                         alpha='auto',
                         per_word_topics=True,
                         minimum_probability=0)

    # save the model
    lda_model.save('lda_model.model')

    # See the topics
    i = 0
    theta_matrix = np.zeros((num_docs, num_topics))
    for c in lda_model[corpus]:
        print(i)
        print("Document Topics      : ", c[0])  # [(Topics, Perc Contrib)]
        for j in range(theta_matrix.shape[1]):
            theta_matrix[i, j] = c[0][j][1]
        i = i + 1

    #    print("Word id, Topics      : ", c[1][:])  # [(Word id, [Topics])]
    #print("Phi Values (word id) : ", c[2][:])  # [(Word id, [(Topic, Phi Value)])]
    #    print("Word, Topics         : ", [(dct[wd], topic) for wd, topic in c[1][:]])   # [(Word, [Topics])]
    #    print("Phi Values (word)    : ", [(dct[wd], topic) for wd, topic in c[2][:]])  # [(Word, [(Topic, Phi Value)])]
    #    print("------------------------------------------------------\n")

    for j in range(num_topics):
        print("Topic", j)
        for i in range(len(lda_model.get_topic_terms(j, 10))):
            print(dct[lda_model.get_topic_terms(j, 10)[i][0]],
                  lda_model.get_topic_terms(j, 10)[i][1])

    phi_matrix = lda_model.get_topics()
    row_sums = theta_matrix.sum(axis=1)
    theta_matrix_new = theta_matrix / row_sums[:, np.newaxis]
    p = np.matmul(theta_matrix_new, phi_matrix)
    p_logit = scipy.special.logit(p)

    for i in range(p_logit.shape[0]):
        print(i)
        print(p_logit[i, ])
    p_logit_generate = np.load('p_logit_generate.npy')
    p_generate = np.load('p_generate.npy')
    theta_generate = np.load('theta_generate.npy')
    phi_generate = np.load('phi_generate.npy')
    corr_p = np.zeros((1, num_docs))
    corr_p_logit = np.zeros((1, num_docs))
    cosine_p = np.zeros((1, num_docs))
    for i in range(p_logit.shape[0]):
        corr_p_logit[0, i] = np.corrcoef(p_logit[i, ],
                                         p_logit_generate[i, ])[1, 0]
        corr_p[0, i] = np.corrcoef(p[i, ], p_generate[i, ])[1, 0]
        cosine_p[0, i] = scipy.spatial.distance.cosine(p[i, ], p_generate[i, ])
    corr_avg_p_inter = np.mean(corr_p)
    cosine_avg_p_inter = np.mean(cosine_p)
    corr_avg_p_logit_inter = np.mean(corr_p_logit)
    corr_avg_p_wordDist = np.mean(
        np.corrcoef(p)
    )  #Average of the correlation matrix for the word distributions of each documents (shape numDocxnumDoc)
    corr_avg_p_docDist = np.mean(
        np.corrcoef(np.transpose(p))
    )  #Average of the correlation matrix for the document distributions for each words (shape dictLenxdictLen)

    corr_avg_pgenerate_wordDist = np.mean(
        np.corrcoef(p_generate)
    )  #Average of the correlation matrix for the word distributions of each documents (shape numDocxnumDoc)
    corr_avg_pgenerate_docDist = np.mean(
        np.corrcoef(np.transpose(p_generate))
    )  #Average of the correlation matrix for the document distributions for each words (shape dictLenxdictLen)

    theta = theta_matrix_new
    phi = phi_matrix

    # This section compiles the correlation and cosine for every column arrangement (topic permutation) of theta_matrix
    compilation_corr_theta = []
    compilation_cosine_theta = []
    compilation_corr_phi = []
    compilation_cosine_phi = []
    compilation_KL_theta = []
    compilation_KL_phi = []

    l = list(permutations(range(1, num_topics + 1)))

    for combi in range(len(l)):
        v_theta = np.zeros([num_docs, num_topics])
        v_phi = np.zeros([num_topics, num_words])
        for tid in range(num_topics):
            v_theta[:, tid] = theta[:, l[combi][tid] - 1]
            v_phi[tid, :] = phi[l[combi][tid] - 1, :]
        corr_theta = np.zeros((1, num_docs))
        cosine_theta = np.zeros((1, num_docs))
        KL_theta = np.zeros((1, num_docs))
        corr_phi = np.zeros((1, num_topics))
        cosine_phi = np.zeros((1, num_topics))
        KL_phi = np.zeros((1, num_topics))

        for i in range(theta_generate.shape[0]):
            corr_theta[0, i] = np.corrcoef(v_theta[i, :],
                                           theta_generate[i, :])[1, 0]
            cosine_theta[0, i] = scipy.spatial.distance.cosine(
                v_theta[i, :], theta_generate[i, :])
            KL_theta[0, i] = scipy.stats.entropy(theta_generate[i, :],
                                                 v_theta[i, :])
        compilation_corr_theta.append(corr_theta.mean())
        compilation_cosine_theta.append(cosine_theta.mean())
        compilation_KL_theta.append(KL_theta.mean())
        for i in range(phi_generate.shape[0]):
            corr_phi[0, i] = np.corrcoef(v_phi[i, :], phi_generate[i, :])[1, 0]
            cosine_phi[0, i] = scipy.spatial.distance.cosine(
                v_phi[i, :], phi_generate[i, :])
            KL_phi[0, i] = scipy.stats.entropy(phi_generate[i, :], v_phi[i, :])
        compilation_corr_phi.append(corr_phi.mean())
        compilation_cosine_phi.append(cosine_phi.mean())
        compilation_KL_phi.append(KL_phi.mean())

    compilation_cosine_phi = np.array(compilation_cosine_phi)
    compilation_corr_phi = np.array(compilation_corr_phi)
    compilation_KL_phi = np.array(compilation_KL_phi)
    compilation_cosine_theta = np.array(compilation_cosine_theta)
    compilation_corr_theta = np.array(compilation_corr_theta)
    compilation_KL_theta = np.array(compilation_KL_theta)

    alignment = compilation_KL_phi.argmin()
    if (alignment != compilation_cosine_phi.argmin()
            or alignment != compilation_cosine_theta.argmin()
            or alignment != compilation_corr_theta.argmax()
            or alignment != compilation_corr_phi.argmax()
            or alignment != compilation_KL_theta.argmin()):
        print('Warning!!! The alignments are not coherent.')

    #Determining the final correlation and cosine values
    v_theta = np.zeros([num_docs, num_topics])
    v_phi = np.zeros([num_topics, num_words])
    for tid in range(num_topics):
        v_theta[:, tid] = theta[:, l[alignment][tid] - 1]
        v_phi[tid, :] = phi[l[alignment][tid] - 1, :]
    corr_theta = np.zeros((1, num_docs))
    cosine_theta = np.zeros((1, num_docs))
    KL_theta = np.zeros((1, num_docs))
    corr_phi = np.zeros((1, num_topics))
    cosine_phi = np.zeros((1, num_topics))
    KL_phi = np.zeros((1, num_topics))
    for i in range(theta_generate.shape[0]):
        corr_theta[0, i] = np.corrcoef(v_theta[i, :], theta_generate[i, :])[1, 0]
        cosine_theta[0, i] = scipy.spatial.distance.cosine(
            v_theta[i, :], theta_generate[i, :])
        KL_theta[0, i] = scipy.stats.entropy(theta_generate[i, :], v_theta[i, :])
    for i in range(phi_generate.shape[0]):
        corr_phi[0, i] = np.corrcoef(v_phi[i, :], phi_generate[i, :])[1, 0]
        cosine_phi[0, i] = scipy.spatial.distance.cosine(v_phi[i, :],
                                                         phi_generate[i, :])
        KL_phi[0, i] = scipy.stats.entropy(phi_generate[i, :], v_phi[i, :])
    corr_theta = corr_theta.mean()
    cosine_theta = cosine_theta.mean()
    KL_theta = KL_theta.mean()
    corr_phi = corr_phi.mean()
    cosine_phi = cosine_phi.mean()
    KL_phi = KL_phi.mean()
    words_id = np.arange(num_words)
    #coherence_model_lda=CoherenceModel(model=lda_model,texts=corpus,dictionary=dct,coherence='c_v')
    #coherence_lda=coherence_model_lda.get_coherence()
    #print('\nCoherence Score: ', coherence_lda)
    return (v_phi, corr_theta, corr_phi, cosine_theta, cosine_phi, KL_theta,
            KL_phi)