Example 1
 def create_AT_model(self, num_topics):
     corpus = self.corpus
     dictionary = self.dictionary
     author2doc = self.author2doc
     model_list = []
     for i in range(1):
         print(i)
         model = AuthorTopicModel(corpus=corpus, num_topics=num_topics, id2word=dictionary.id2token, \
                         author2doc=author2doc, chunksize=2000, passes=100, gamma_threshold=1e-10, \
                         eval_every=0, iterations=1, random_state=i)
         top_topics = model.top_topics(corpus)
         tc = sum([t[1] for t in top_topics])
         model_list.append((model, tc))
     model, tc = max(model_list, key=lambda x: x[1])
     print('Topic coherence: %.3e' % tc)
     model.save(self.model_name)
     print('AT Model saved as %s' % self.model_name)
     self.model = model
     print('Creating author Vecs')
     self.create_author_vecs()
     print('\n Creating Clustering:')
     self.create_author_clustering(self.model.num_topics)
     #        print('\nCreating Classification from  cluster Data')
     #        self.create_classification_from_cluster_data()
     print('\nCreating TSNE embeddings')
     self.create_tsne_embeddings()
Example 2
    def atm_model(self):
        docs = []
        author2doc = {}
        index = 0
        for line in open(self.corpus, encoding="utf-8"):
            line = line.strip()
            if not line:
                continue
            author = line.split('\t')[0]
            if author not in author2doc:
                author2doc[author] = [index]
            else:
                author2doc[author].append(index)

            doc = line.split('\t')[1].replace(",", "").replace("。",
                                                               "").split(' ')
            docs.append(doc)
            index += 1
        print(len(docs))
        # Build the dictionary
        dictionary = corpora.Dictionary(docs)
        # Vectorize each document into a bag-of-words representation
        corpus = [dictionary.doc2bow(doc) for doc in docs]
        # Train the author-topic (ATM) model
        model = AuthorTopicModel(corpus,
                                 author2doc=author2doc,
                                 id2word=dictionary,
                                 num_topics=100)
        # Save the model
        model.save('topicmodel/author_topic.model')
Example 3
    def train(self):
        self.aTM = AuthorTopicModel(self.train_C_,
                                    author2doc=self.Adic,
                                    num_topics=self.K,
                                    passes=100)
        self.phi = self.aTM.get_topics().transpose()
        self.theta = np.zeros((self.K, self.n_a))
        self.A = normalize(self.AMask, 'l1', 0)
        for a in range(self.n_a):
            self.theta[:, a] = [
                b for (c, b) in self.aTM.get_author_topics(str(a), 0)
            ]

        self.D_reb = self.phi.dot(self.theta).dot(self.A)
Example 4
class aTM():
    def __init__(self, K, data, AMask, params, name, dataName):
        self.K = K  # [int] nb of topics
        self.AMask = AMask  # [n_a, n_d float] author-paper participation matrix (1 if the author contributed to the paper)
        self.n_a, self.n_d = self.AMask.shape  # [int] number of authors, number of documents
        self.D = data
        self.n_dic, self.n_d = self.D.shape
        self.name = name
        self.train_param = params['train_param']

        self.train_C_ = []
        for d in range(self.n_d):
            self.train_C_.append([(k, self.D[k, d])
                                  for k in range(self.n_dic)])

        self.Adic = {}
        for a in range(self.n_a):
            self.Adic[str(a)] = list(np.where(self.AMask[a, :] > 0)[0])

        self.dataName = dataName

    def train(self):
        self.aTM = AuthorTopicModel(self.train_C_,
                                    author2doc=self.Adic,
                                    num_topics=self.K,
                                    passes=100)
        self.phi = self.aTM.get_topics().transpose()
        self.theta = np.zeros((self.K, self.n_a))
        self.A = normalize(self.AMask, 'l1', 0)
        for a in range(self.n_a):
            self.theta[:, a] = [
                b for (c, b) in self.aTM.get_author_topics(str(a), 0)
            ]

        self.D_reb = self.phi.dot(self.theta).dot(self.A)

    def save(self, path):
        '''
        path example 
        '''
        toSave = {}
        toSave['theta'] = self.theta
        toSave['phi'] = self.phi
        toSave['A'] = self.A
        toSave['K'] = self.K
        toSave['train_param'] = self.train_param
        with open(path + self.name + '_' + self.dataName + '.pkl',
                  'wb') as output:
            pickle.dump(toSave, output, pickle.HIGHEST_PROTOCOL)
Example 5
def show():
    model = AuthorTopicModel.load('model/at_all.atmodel')

    author_list = model.id2author.values()
    t_res = [[] for i in range(10)]

    for a in author_list:
        res = model.get_author_topics(a, minimum_probability=0.0)
        for i in range(10):
            t_res[i].append((a, res[i][1]))

    res = []
    for i in range(10):
        res.append(
            sorted(t_res[i], key=lambda item: item[1], reverse=True)[:6])
    # for topic in model.show_topics(num_topics=10):
    #     print('Label: ')
    #     words = ''
    #     for word, prob in model.show_topic(topic[0]):
    #         words += word + ' '
    #     print('Words: ' + words)
    for i in range(len(res)):
        print('topic' + str(i))
        for j in res[i]:
            print(j[0], "%.6f" % (j[1] / 2974))
Example 6
def calAuthorSim():
    conn = sqlite3.connect(config.db_path)
    db = conn.cursor()

    model = AuthorTopicModel.load(config.author_model128_path)
    poets = list(model.id2author.values())
    print(len(poets))
    # vec = model.get_author_topics('苏轼')
    index = MatrixSimilarity(model[list(model.id2author.values())], num_best=30)
    index.save(config.author_simMatrix_path)
    # index = MatrixSimilarity.load(config.author_simMatrix_path)

    for name in poets:
        # print(name)
        sims = index[model[name]]
        sims = sorted(sims, key=lambda item: -item[1])
        sims = [ [poets[sim[0]] , sim[1]] for sim in sims]
        # print(sims)
        # sql_comment  = "UPDATE author SET sims=? WHERE id=?"
        # db.execute(sql_comment, (toJson(sims), name))

        sql_comment  = "UPDATE author SET sims=\'{}\' WHERE id=\'{}\'".format(toJson(sims), name)
        db.execute(sql_comment)
        # print(sql_comment)
    # print(len(poets))
    conn.commit()
Example 7
    def tsne_clusting(self):
        model = AuthorTopicModel.load('topicmodel/author_topic.model')
        tsne = TSNE(n_components=2, random_state=0)
        smallest_author = 200  # only keep poets with at least this many poems
        authors = [
            model.author2id[a] for a in model.author2id.keys()
            if len(model.author2doc[a]) >= smallest_author
        ]
        print(authors)
        embeddings = tsne.fit_transform(model.state.gamma[authors, :])
        # print(model.state.gamma[authors, :])
        # print(embeddings)
        authors_list = [model.id2author[k] for k in authors]

        print(authors_list)

        # plt.scatter(embeddings[:, 0], embeddings[:, 1], c=y_predict)
        #
        # plt.show()

        # labels = ['李世民', '李白', '白居易', '武则天', '白居易', '杜甫', '刘禹锡', '武元衡', '权德舆']  # use this to look up a specific set of poets
        # author_ids = [model.author2id[author] for author in labels]
        # print(author_ids)
        # author_embs = tsne.fit_transform([embeddings[i] for i in author_ids])
        # print(author_embs)
        # print(authors, author_ids, author_embs)

        self.plot_with_labels(embeddings, authors_list)  # adjust the poets to plot here
Example 8
def build_lda_models(course_corpus, course_dictionary, mapping, course_texts):
    # ==== Train Unsupervised LDA ====
    lda_model = LdaModel(corpus=course_corpus, id2word=course_dictionary)

    # ==== Train Unsupervised HDP-LDA ====
    hdp_model = HdpModel(corpus=course_corpus, id2word=course_dictionary)

    # ==== Train Author Topic Model ====
    author_to_doc = {}  # author topic LDA (authors are modules,lessons,items)
    for author_type in ["modules", "lessons", "items"]:
        entity_to_doc = mapping[author_type]
        for entity_name, entity_docs in entity_to_doc.items():
            author_to_doc["{}: {}".format(author_type[0].capitalize(),
                                          entity_name)] = entity_docs
    at_model = AuthorTopicModel(corpus=course_corpus,
                                id2word=course_dictionary,
                                author2doc=author_to_doc)

    # ==== Train Labeled LDA ====
    # explicitly supervised, labeled LDA
    llda_alpha = 0.01
    llda_beta = 0.001
    llda_iterations = 50
    llda_labels = []
    llda_corpus = []
    labelset = set()
    for course_text_id in range(0, len(course_texts)):
        doc_labels = []
        # get module label name
        for module_name, doc_vec in mapping["modules"].items():
            if course_text_id in doc_vec:
                doc_labels.append("M: {}".format(module_name))
                break

        # get lesson label name
        for lesson_name, doc_vec in mapping["lessons"].items():
            if course_text_id in doc_vec:
                doc_labels.append("L: {}".format(lesson_name))
                break

        for item_name, doc_vec in mapping["items"].items():
            if course_text_id in doc_vec:
                doc_labels.append("I: {}".format(item_name))
                break

        llda_labels.append(doc_labels)
        llda_corpus.append(course_texts[course_text_id])
        labelset = labelset.union(doc_labels)

    llda_model = LLDA(llda_alpha, llda_beta, K=len(llda_labels))
    llda_model.set_corpus(llda_corpus, llda_labels)
    llda_model.train(iteration=llda_iterations)

    # phi = llda.phi()
    # for k, label in enumerate(labelset):
    #     print ("\n-- label %d : %s" % (k + 1, label))
    #     for w in argsort(-phi[k + 1])[:10]:
    #         print("%s: %.4f" % (llda.vocas[w], phi[k + 1,w]))
    return lda_model, hdp_model, at_model, llda_model, llda_labels
Example 9
def ATM(period):
    print(period)
    if period == "period0":
        train_corpus, train_author2doc, train_dictionary = corpus_author2doc(
            period)
        global ATMModel
        ATMModel = AuthorTopicModel(corpus=train_corpus,
                                    num_topics=nb_topics,
                                    id2word=train_dictionary.id2token,
                                    author2doc=train_author2doc,
                                    chunksize=2000,
                                    passes=1,
                                    eval_every=0,
                                    iterations=1,
                                    random_state=1,
                                    minimum_probability=0)

        author_list = []
        for index, author in enumerate(train_author2doc):
            author_dict = show_author(ATMModel, index, author)
            author_list.append(author_dict)

        import json
        filename = "User_Topics_Distribution_" + period + ".json"
        with open(filename, 'w') as fout:
            json.dump(author_list, fout)
    else:
        global ATMModel
        cwd = os.getcwd()
        perioddictionary = cwd + "/Dataset/" + period
        if os.path.exists(perioddictionary):
            new_corpus, new_author2doc, new_dictionary = corpus_author2doc(
                period)
            ATMModel.update(new_corpus, new_author2doc)
            author_list = []
            for index, author in enumerate(ATMModel.author2doc):
                author_dict = show_author(ATMModel, index, author)
                author_list.append(author_dict)

            import json
            filename = "User_Topics_Distribution_" + period + ".json"
            with open(filename, 'w') as fout:
                json.dump(author_list, fout)

        else:
            print("No more text information to update!!!!")
Example 10
def train():
    author2doc = {}
    corpus = []
    count = 0
    files = os.listdir(poem_dir)  # list all files and directories under poem_dir
    for file in files:
        path = os.path.join(poem_dir, file)
        # if count>100 :
        #     break
        if os.path.isfile(path):
            # print(path)
            f = open(path, 'r', encoding='utf-8')
            file = json.loads(f.read())
            f.close()
            for poem_id in file:
                if count % 100000 == 0 and count != 0:
                    print(count, len(corpus))
                poem = file[poem_id]
                # print(poem_id)
                for shi_data in poem['ShiData']:
                    author = shi_data['Author']
                    if author == '无名氏':  # skip anonymous authors
                        continue
                    sentences = [
                        sentence['Content'] for sentence in shi_data['Clauses']
                    ]
                    content = ' '.join(sentences)
                    words = seg(content)  #+ segAll(content)
                    # words = dictionary.doc2bow(words)
                    corpus.append(words)
                    if author not in author2doc:
                        author2doc[author] = []
                    author2doc[author].append(count)
                    count += 1
                    # print(count)
    dictionary = corpora.Dictionary(corpus)
    corpus = [dictionary.doc2bow(doc) for doc in corpus]
    print('Starting training', len(corpus), len(author2doc.keys()))
    model = AuthorTopicModel(corpus,
                             author2doc=author2doc,
                             id2word=dictionary,
                             iterations=10)
    # model = AuthorTopicModel.load(author_model_path)
    # model.update(corpus, author2doc=author2doc,  iterations=10)
    model.save(author_model_path)
Example 11
def example_1():
    """
    Example code from Gensim documentation on author-topic class.
    :return:
    """
    author2doc = {
        'john': [0, 1, 2, 3, 4, 5, 6],
        'jane': [2, 3, 4, 5, 6, 7, 8],
        'jack': [0, 2, 4, 6, 8]
    }

    corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))

    print("Corpus contents:")
    print(f"{corpus}\n")

    print(f"Documents in the corpus: ")
    for document in corpus:
        print(f"{document}")

    print("\nDictionary contents:")
    print(f"{common_dictionary}\n")
    print(f"Dictionary contents with word index value:")
    print(f"{common_dictionary.token2id}\n")

    with temporary_file("serialized") as s_path:
        model = AuthorTopicModel(corpus,
                                 author2doc=author2doc,
                                 id2word=common_dictionary,
                                 num_topics=4,
                                 serialized=True,
                                 serialization_path=s_path)

        model.update(
            corpus, author2doc
        )  # update the author-topic model with additional documents

    # construct vectors for authors
    author_vecs = [
        model.get_author_topics(author) for author in model.id2author.values()
    ]
    print(f"Vectors for authors:")
    print(f"{author_vecs}\n")
Example 12
def ver():
    # per-word coherence metric for the model (u_mass)
    model = AuthorTopicModel.load('model/at_all.atmodel')
    dictionary = Dictionary.load('model/atdc.dict')
    from gensim.models.coherencemodel import CoherenceModel

    atcm = CoherenceModel(model,
                          corpus=model.corpus,
                          dictionary=dictionary,
                          coherence='u_mass')
    print(atcm.get_coherence())
Example 13
def calAuthorCat():
    model = AuthorTopicModel.load(
        './data_process/model/author2vec/author2vec.model')
    conn = sqlite3.connect(db_path)
    db = conn.cursor()

    rows = db.execute('SELECT id FROM author WHERE potery_num>=5')
    ids = set(model.id2author.values())

    ids = [row[0] for row in rows if row[0] in ids]
    print(len(ids))

    def getSpraceVec(id):
        topics = model.get_author_topics(id)
        vec = np.zeros(128)
        for topic in topics:
            vec[topic[0]] = topic[1]
        return vec

    vecs = [getSpraceVec(id) for id in ids]

    print('Starting clustering')
    id2label = {}
    label2id = {}
    labels = KMeans(n_clusters=50, max_iter=1000, n_jobs=-1).fit_predict(vecs)
    for index, label in enumerate(labels):
        label = str(label)
        id = ids[index]
        if label not in label2id:
            label2id[label] = []
        label2id[label].append(id)
        id2label[id] = label

    # print('Starting second-level clustering')
    # for label in label2id:
    #     print(label)
    #     sub_ids = label2id[label]
    #     sub_vecs = [getSpraceVec(id) for id in sub_ids]
    #     cluster_num = 10 if len(sub_ids)>10 else len(sub_ids)
    #     sub_labels = KMeans(n_clusters=cluster_num, max_iter=1000, n_jobs=-1).fit_predict(sub_vecs)
    #     for index, label in enumerate(sub_labels):
    #         label = str(label)
    #         id = sub_ids[index]

    #         id2label[id] += '-' + label

    for id in id2label:
        label = id2label[id]
        db.execute('UPDATE author SET cat=? WHERE id=?', (label, id))
    conn.commit()
    conn.close()
Example 14
    def author_cluster(self):
        model = AuthorTopicModel.load('author_topic.model')
        tsne = TSNE(n_components=2, random_state=0)
        smallest_author = 0
        authors = [
            model.author2id[a] for a in model.author2id.keys()
            if len(model.author2doc[a]) >= smallest_author
        ]
        embeddings = tsne.fit_transform(model.state.gamma[authors, :])
        authors = list(model.id2author.values())

        labels = ['柳永', '晏殊', '欧阳修', '李煜', '李清照', '范仲淹', '苏轼', '辛弃疾', '岳飞']
        author_ids = [model.author2id[author] for author in labels]
        author_embs = tsne.fit_transform([embeddings[i] for i in author_ids])
        print(authors, author_ids, author_embs)
        self.plot_with_labels(author_embs, labels)
Example 15
 def test_model(self):
     model = AuthorTopicModel.load('topicmodel/author_topic.model')
     # Each author's vector: its dimensionality differs per author, since each author is associated with a different set of topics, each with a probability.
     author_vecs = [
         model.get_author_topics(author)
         for author in model.id2author.values()
     ]
     print(len(author_vecs))  # 2610 poets; each poet has several topics (<100), each with a probability
     for author in author_vecs:
         print(author, len(author))  # the number of topics differs per author
     # list every author
     authors = model.id2author.values()
     print(len(authors), authors)
     # show one author's vector
     print(model['李白'])
     # show the model's topics
     for topic in model.show_topics(num_topics=100):
         print(topic)
Example 16
    def __init__(
        self,
        documents,
        attributions,
        topics,
        iterations,
        DOC_KEY,
        USER_KEY,
        DOC_VALUE='TARGET',
        minimum_attributions=2,
        maximum_attributions=50,
    ):
        attributions = self._remove_doubles(attributions, [DOC_KEY, USER_KEY])

        attributions = self._bound_authors(attributions, minimum_attributions,
                                           maximum_attributions, USER_KEY)

        documents, attributions = self._author_document_downselect(
            documents, attributions, DOC_KEY)

        train, test, vocab = self._get_train_test_and_vocab(
            documents, DOC_VALUE)
        self.train, self.test, self.vocab = train, test, vocab

        train_attr = self._attribution_table(self.train, attributions, DOC_KEY,
                                             USER_KEY)
        test_attr = self._attribution_table(self.test, attributions, DOC_KEY,
                                            USER_KEY)
        self.train_attr, self.test_attr = train_attr, test_attr

        self.train_corpus = self._processed_to_bow(self.train[DOC_VALUE],
                                                   self.vocab)
        self.test_corpus = self._processed_to_bow(self.test[DOC_VALUE],
                                                   self.vocab)

        self.topics = topics
        self.iterations = iterations

        self.model = AuthorTopicModel(
            corpus=self.train_corpus,
            author2doc=self.train_attr,
            num_topics=self.topics,
            iterations=self.iterations,
        )
Example 17
def run_model(corpus1, corpus2, dictionary, author2doc1, author2doc2):

    model = AuthorTopicModel(corpus=corpus1,
                             num_topics=10,
                             id2word=dictionary.id2token,
                             author2doc=author2doc1,
                             chunksize=2000,
                             passes=1,
                             eval_every=0,
                             serialized=True,
                             iterations=1,
                             random_state=1,
                             serialization_path='tmp/corpus1.mm')
    model.update(corpus=corpus2, author2doc=author2doc2)
    model.save('model/at_all.atmodel')
Example 18
nb_fold = 10
k_list = [10,15,20,30]
nb_k = len(k_list)
aLDA_store_train = np.zeros((nb_fold,nb_k))
aLDA_store_test = np.zeros((nb_fold,nb_k))
aTM_store_train = np.zeros((nb_fold,nb_k))
aTM_store_test = np.zeros((nb_fold,nb_k)) 

aLDAgen = aLDA_generator(n_dic, n_w, A_mask, K, alpha, beta, gamma)
aLDAgen.itialise()
for f in range(nb_fold):
      train_Z,train_C,train_D,train_C_  = aLDAgen.generate()
      for k in range(nb_k):
            aLDA = aLDA_estimator(k_list[k], train_C, A_mask, alpha, beta,1,False)
            aLDA.gd_ll(0.05, 60, 0,0.0,0,1)
            aTM = AuthorTopicModel(train_C_ , author2doc=Adic, num_topics=k_list[k])
            aTM_phi = aTM.get_topics().transpose()
            aTM_theta = 0*aLDA.thetaStar
            for a in  range(n_a):
                  aTM_theta[:,a] = [b for (c,b) in aTM.get_author_topics(str(a),0)]
            aLDA_store_train[f,k] = aLDA.llgd[-1,0]
            aTM_store_train[f,k] = loglikaLDA(aTM_theta, aTM_phi, A_mask, train_D,  alpha, beta,1)
            aLDA_store_test[f,k] = np.sum(np.sum(np.log(aLDA.phiStar.dot(aLDA.thetaStar).dot(aLDA.AStar))*(aLDAgen.phi.dot(aLDAgen.theta).dot(aLDAgen.A))))*n_w
            aTM_store_test[f,k] = np.sum(np.sum(np.log(aTM_phi.dot(aTM_theta).dot(aLDAgen.A))*(aLDAgen.phi.dot(aLDAgen.theta).dot(aLDAgen.A))))*n_w
      print(f)



#%%
max_ll = np.sum(np.sum(np.log(aLDAgen.phi.dot(aLDAgen.theta).dot(aLDA.AStar))*(aLDAgen.phi.dot(aLDAgen.theta).dot(aLDAgen.A))))*n_w
plt.figure()
Example 19
# In[ ]:

# TODO (Lee) review pat_inv_map workflow
# partitions data_1000 to size of training set (80/20 split so grabs first 800 rows)
data_800 = data_1000[:800]

# create inventor-to-doc mapping from original list of dicts in json api response
pat2inv = pat_inv_map(data_800)

# #### Construct author-topic model

# In[ ]:

# construct author-topic model
model_at = AuthorTopicModel(corpus=corpus_1000train,
                            doc2author=pat2inv,
                            id2word=id_to_word_1000train)

# In[ ]:

# construct vectors for authors
author_vecs = [
    model_at.get_author_topics(author)
    for author in model_at.id2author.values()
]
author_vecs

# In[ ]:

# retrieve the topic distribution for an author using the model[name] syntax;
# each topic has a probability of being expressed given the particular author.
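# A minimal sketch (not part of the original notebook) of that lookup:
# `some_inventor` is an illustrative variable; any key from model_at.id2author
# works, and the result is a list of (topic_id, probability) pairs.
some_inventor = next(iter(model_at.id2author.values()))
model_at[some_inventor]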
Example 20
    init['phi'] = phiLDA

    aLDALDA = aLDA_estimator(K, M_train, np.eye(n_doc), 5, 5, 1, True, init)
    aLDALDA.gd_ll(0.00004, 60, 0, 0, 0, 1, 0)
    plt.plot(aLDALDA.llgd[:, 0] / aLDALDA.llgd[0, 0])
    t1 = time.time()
    print('gd on LDA for k = ' + str(K), ', time = ' + str(t1 - t))
    t = t1

    # aTM ---------------------------------------------------

    Adic = {}
    for a in range(n_a):
        Adic[str(a)] = list(np.where(At[a, :] > 0)[0])

    aTM = AuthorTopicModel(train_C_, author2doc=Adic, num_topics=K)
    phiaTM = aTM.get_topics().transpose()
    thetaaTM = np.zeros((K, n_a))
    for a in range(n_a):
        thetaaTM[:, a] = [b for (c, b) in aTM.get_author_topics(str(a), 0)]
    t1 = time.time()
    print('aTM for k = ' + str(K), ', time = ' + str(t1 - t))
    t = t1

    # aLDA on aTM
    init2 = {}
    init2['A'] = normalize(At, 'l1', 0)
    init2['theta'] = thetaaTM
    init2['phi'] = phiaTM

    aLDATaTM = aLDA_estimator(K, M_train, At, 10, 10, 1, True, init2)
Example 21
from joblib import dump, load

# document embedding
from gensim.models import TfidfModel, AuthorTopicModel
from gensim.corpora import Dictionary
from gensim.matutils import corpus2csc
from gensim.test.utils import get_tmpfile
import bz2

from Preprocessing.preprocessing import Preprocessing
import itertools

print('Loading and preprocessing data')
with bz2.open('/gepris_data/train_filtered.csv.bz2', mode='rt') as f:
    csvreader = csv.reader(f)
    traindata = Preprocessing().fit_transform((row[1] for row in csvreader))
    f.seek(0)
    doc2author = {i: row[3:] for i, row in enumerate(csvreader)}

print('Building dict and training TfIdf model:')
dct = Dictionary(doc for doc in traindata)  # fit dictionary
tfidf_model = TfidfModel((dct.doc2bow(doc) for doc in traindata))  # fit model
dump(tfidf_model, '/models/tfidf/tfidf.joblib')
dump(dct, '/models/dict/dict.joblib')

print('ATM training:')
atm_model = AuthorTopicModel([dct.doc2bow(doc) for doc in traindata],
                             doc2author=doc2author,
                             id2word=dct)
dump(atm_model, '/models/atm/atm.joblib')
Example 22
def topicandresult(num, corpus, id2doc, dic):

    """
    model_list = []
    for i in range(5):
        model = AuthorTopicModel(corpus = corpus, num_topics = num, id2word=dic, \
                                 author2doc = id2doc, chunksize = 2000, passes = 1, eval_every = 0, \
                                 #random_state = int(random.random()*1000),
                                 iterations=100000
                                 )
        top_topics = model.top_topics(corpus)
        tc = sum([t[1] for t in top_topics])
        model_list.append((model, tc))
    
    model, tc = max(model_list, key=lambda x: x[1])
    """
    model = AuthorTopicModel(corpus = corpus,
                             num_topics = num,
                             id2word = dic,
                             author2doc = id2doc,
                             chunksize = 2000,
                             passes = 55,
                             eval_every = 0,
                             gamma_threshold=1e-11,
                             iterations = 10000000)

    with open('NuclearEnergy/Data/' + str(num) + 'topic.model', 'wb') as p:
        pickle.dump(model, p)

    '''
    with open('NuclearEnergy/Data/' + str(num) + 'topic.model', 'rb') as p:
        model = pickle.load(p)
    '''

    pubid = pd.Series(list(set([total['press'][i] + ', ' + total['pubtime'][i]
                                for i in range(0, len(total))])), name='id', dtype='category')

    result = {'publisher': [], 'time': [], 'docnum': [], 'corpusp': []}
    for i in range(0, num):
        result[i] = []

    def proportion(pubid):
        docs = model.author2doc[pubid]
        return sum([word[1] for doc in docs for word in corpus[doc]]) / vocnum

    for i in pubid:
        result['publisher'].append(re.split(', ', i)[0])
        result['time'].append(re.split(', ', i)[1])
        cr1 = total['press'].map(lambda x: x == re.split(', ', i)[0])
        cr2 = total['pubtime'].map(lambda x: x == re.split(', ', i)[1])
        result['docnum'].append(len(total[cr1 & cr2]))
        result['corpusp'].append(proportion(i))
        topics = {j[0]: j[1] for j in model[i]}
        for j in result:
            if j in topics:
                result[j].append(topics[j])
            elif j in ('publisher', 'time', 'docnum', 'corpusp'):
                pass
            else:
                result[j].append(0)

    result = pd.DataFrame(result)
    result = result.reindex(natsorted(result.columns, alg=ns.IC), axis=1)

    ratio = {}
    for i in range(0, num):
        ratio[i] = 0
    for index, row in result.iterrows():
        for i in ratio:
            ratio[i] += row[i] * row['corpusp']

    agg = sum(ratio.values())
    ratio = {i: str(ratio[i] * 100 / agg) + '%' for i in ratio}

    topics = pd.DataFrame({
                          i[0]: [re.split(r'\*', j)[1] + '*' + str(round(float(re.split(r'\*', j)[0]) * 100, 3)) + '%' for
                                 j in re.split(r' \+ ', re.sub('"', '', i[1]))] for i
                          in model.show_topics(num_topics=num, num_words=50)})
    topics.loc[num] = ratio
    topics = topics.reindex([num] + list(range(0, num)))
    topics.index = ['토픽 비율'] + list(range(1, num + 1))
    '''
    topic_labels = ['1 사이버 해킹과 공격', '2 ',
                    '3 원전에 대한 비판적 담론들', '4 원전의 안전과 기술적 문제들',
                    '5 후쿠시마 원전 사고와 안전', '6 지진과 원자력 발전소 및 정치권', '7 해외 동향',
                    '8 지진과 원전 위치 지역의 불안', '9 원자력 안전과 지역사회', '10 원자력 관련 기관 인사',
                    '11 원자력을 둘러싼 정치적 갈등', '12 원전 건설 사업과 관련된 이슈', '13 고리/월성 원전',
                    '14 영화 판도라 관련', '15 원자력 산업 관리', '16 원전 관련 투자와 에너지 산업',
                    '17 사회적 이슈들과 일본 지진','18 원전 관련 기술', '19 원전에 대한 사업적 접근',
                    '20 원전과 북한, 안보']
                    '''
    topic_labels = [str(i) for i in range(1, num + 1)]
    topics.columns = topic_labels
    result.columns = topic_labels + ['형태소의 비율', '기사 숫자', '발표 기관', '발표 시기']

    return(topics, result, model)
Example 23
print(dict(dictionary.items()))
dictionary.id2token
dictionary.token2id
dictionary.num_pos
dictionary.num_nnz
dictionary.num_docs
# Vectorize the documents: each poem becomes a sparse vector whose elements count how often each word occurs in that poem
corpus = [dictionary.doc2bow(doc) for doc in docs]



######### ATM model ##########
''' author topic model '''

# Train the ATM model with 100 topics
model = AuthorTopicModel(corpus, author2doc=author2doc, id2word=dictionary, num_topics=100)
# Save the model
model_file = '/Users/wxy/Documents/Sufepost/Courses/文本挖掘/report/MyPoemMining/atm_model/atm.model'
model.save(model_file)



######### Model info and dimensionality reduction ##########
''' model info and dimensionality reduction '''

model_file = '/Users/wxy/Documents/Sufepost/Courses/文本挖掘/report/MyPoemMining/atm_model/atm.model'
model = AuthorTopicModel.load(model_file)
# some information from the model
model.id2author
model.author2id
model.author2doc
Example 24
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 29 11:33:01 2017

@author: jadm
"""
#%%
from gensim.models import AuthorTopicModel
model = AuthorTopicModel.load('modelo1/model.atmodel')

#%%
top_topics = model.top_topics(model.corpus)
#%%

for i in range(len(top_topics)):
    print(top_topics[i][1])
Example 25
for index, row in df.iterrows():
    row = row.tolist()
    curr_doc = []
    for i in range(len(row)):
        curr_item = row[i]
        if curr_item > 0:
            curr_doc.append((i, curr_item))
    corpus.append(curr_doc)
# dictionary = Dictionary(data)
# corpus = [dictionary.doc2bow(text) for text in data]

num_topics = 12

model_POLE = AuthorTopicModel(
    corpus=corpus,
    author2doc=author2doc_POLE,
    #id2word=dictionary,
    num_topics=num_topics)

model_MSI = AuthorTopicModel(
    corpus=corpus,
    author2doc=author2doc_MSI,
    #id2word=dictionary,
    num_topics=num_topics)

# construct vectors for authors
# author_vecs = [model.get_author_topics(author) for author in model.id2author.values()]
#
# print(author_vecs)
POLE_tops = model_POLE.get_topics()
np.savetxt("authortopic_POLE_output.csv", POLE_tops, delimiter=",")
Example 26
        df, f_ini, f_fin = archivos_csv(sys.argv[1])
else:
    df, f_ini, f_fin = archivos_csv()
#%%

corpus, dictionary, author2doc = preprocesamiento(df)
print('Number of authors: %d' % len(author2doc))
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

print('Running the author-topic model')
model = AuthorTopicModel(corpus=corpus,
                         num_topics=100,
                         id2word=dictionary.id2token,
                         author2doc=author2doc,
                         chunksize=2000,
                         passes=55,
                         eval_every=0,
                         iterations=10000000,
                         gamma_threshold=1e-11)

f = 'modelo' + folder()
os.makedirs(f)
os.makedirs(f + '/LDA')

model.save(f + '/model.atmodel')

print("MODELO TERMINADO Y GUARDADO")
#  LDA
print('Corriendo LDA')
ldamodel = LdaModel(corpus=corpus, num_topics=100, id2word=dictionary)
Example 27
# Remove rare and common tokens.
# Filter out words that occur too frequently or too rarely.
max_freq = 0.5
min_wordcount = 20
dictionary.filter_extremes(no_below=min_wordcount, no_above=max_freq)

_ = dictionary[0]  # This sort of "initializes" dictionary.id2token.

corpus = [dictionary.doc2bow(doc) for doc in docs]
print('Number of authors: %d' % len(author2doc))
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

from gensim.models import AuthorTopicModel
model = AuthorTopicModel(corpus=corpus, num_topics=10, id2word=dictionary.id2token, \
                author2doc=author2doc, chunksize=2000, passes=1, eval_every=0, \
                iterations=1, random_state=1)

model_list = []
for i in range(5):
    model = AuthorTopicModel(corpus=corpus, num_topics=10, id2word=dictionary.id2token, \
                    author2doc=author2doc, chunksize=2000, passes=100, gamma_threshold=1e-10, \
                    eval_every=0, iterations=1, random_state=i)
    top_topics = model.top_topics(corpus)
    tc = sum([t[1] for t in top_topics])
    model_list.append((model, tc))

model, tc = max(model_list, key=lambda x: x[1])
print('Topic coherence: %.3e' % tc)

model.save('/tmp/model.atmodel')
Example 28
# In[143]:


patdf2inv = dict((df_pat_idx[key], value) for (key, value) in pat2inv.items())
patdf2inv


# #### Construct author-topic model

# In[144]:


# construct author-topic model
model_at = AuthorTopicModel(corpus=corpus_at,
                         doc2author=patdf2inv,
                         id2word=id_to_word_at, 
                         num_topics=25)


# In[145]:


# construct vectors for authors
author_vecs = [model_at.get_author_topics(author) for author in model_at.id2author.values()]
author_vecs


# In[146]:


# inspect topic distribution for author with id# 7788103-1
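# A minimal sketch (the line that presumably followed this comment), using the
# model[name] syntax with the author id named above:
model_at['7788103-1']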
Example 29
# Parameters:
#
# **num_topics**: The number of topics in the model. There is no 'correct' value here; it depends entirely on how many different topics occur in the corpus. 100 is generally a reasonable compromise for a corpus this size.
# **chunksize**: Controls the size of the mini-batches. This depends entirely on the corpus: 2000 is the default, but that obviously makes no sense if a corpus only contains 1000 documents.
# **passes**: 100 by default.
# **iterations**: The maximum number of times the model loops over each document.
# **alpha**: Can be set to 'asymmetric'.
# **eta**: Can be set to 'auto', which learns an asymmetric prior over words directly from the data.
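
# In[ ]:

# A minimal sketch (not part of the original notebook) of how the alpha and eta
# options described above can be passed. It reuses corpus, dictionary and
# author2doc_test from the surrounding cells; model_priors is only an
# illustrative name.
model_priors = AuthorTopicModel(corpus=corpus, num_topics=100,
                                id2word=dictionary.id2token,
                                author2doc=author2doc_test,
                                alpha='asymmetric', eta='auto',
                                chunksize=100, passes=1,
                                eval_every=0, iterations=1, random_state=1)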

# In[23]:

#get_ipython().run_cell_magic('time', '', 'model = AuthorTopicModel(corpus=corpus, num_topics=100, id2word=dictionary.id2token, \\\n                    author2doc=author2doc_test, chunksize=100, passes=100, gamma_threshold=0.001, \\\n                    eval_every=0, iterations=1, random_state=1)')

print('Training...')
model = AuthorTopicModel(corpus=corpus, num_topics=100, id2word=dictionary.id2token, \
                    author2doc=author2doc_test, chunksize=100, passes=100, gamma_threshold=0.001, \
                    eval_every=0, iterations=1, random_state=1)

# In[24]:

# Save model.
model.save('./results/model_presentation_github.atmodel')

# In[18]:

#import pandas as pd
#import spacy
#from gensim.models import Phrases
#from gensim.corpora import Dictionary
#from gensim.models import AuthorTopicModel
Example 30
    def load_AT_model(self, name='model.atmodel'):
        print("\nLoading model from %s" % self.model_name)

        self.model = AuthorTopicModel.load(self.model_name)