def create_AT_model(self, num_topics):
    corpus = self.corpus
    dictionary = self.dictionary
    author2doc = self.author2doc
    model_list = []
    for i in range(1):
        print(i)
        model = AuthorTopicModel(corpus=corpus, num_topics=num_topics, id2word=dictionary.id2token,
                                 author2doc=author2doc, chunksize=2000, passes=100, gamma_threshold=1e-10,
                                 eval_every=0, iterations=1, random_state=i)
        top_topics = model.top_topics(corpus)
        tc = sum([t[1] for t in top_topics])
        model_list.append((model, tc))
    model, tc = max(model_list, key=lambda x: x[1])
    print('Topic coherence: %.3e' % tc)
    model.save(self.model_name)
    print('AT Model saved as %s' % self.model_name)
    self.model = model
    print('Creating author Vecs')
    self.create_author_vecs()
    print('\nCreating Clustering:')
    self.create_author_clustering(self.model.num_topics)
    # print('\nCreating Classification from cluster Data')
    # self.create_classification_from_cluster_data()
    print('\nCreating TSNE embeddings')
    self.create_tsne_embeddings()
def atm_model(self):
    docs = []
    author2doc = {}
    index = 0
    for line in open(self.corpus, encoding="utf-8"):
        line = line.strip()
        if not line:
            continue
        author = line.split('\t')[0]
        if author not in author2doc:
            author2doc[author] = [index]
        else:
            author2doc[author].append(index)
        doc = line.split('\t')[1].replace(",", "").replace("。", "").split(' ')
        docs.append(doc)
        index += 1
    print(len(docs))
    # Build the dictionary
    dictionary = corpora.Dictionary(docs)
    # Vectorize the documents
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    # Train the author-topic model
    model = AuthorTopicModel(corpus, author2doc=author2doc, id2word=dictionary, num_topics=100)
    # Save the model
    model.save('topicmodel/author_topic.model')
def train(self):
    self.aTM = AuthorTopicModel(self.train_C_, author2doc=self.Adic, num_topics=self.K, passes=100)
    self.phi = self.aTM.get_topics().transpose()
    self.theta = np.zeros((self.K, self.n_a))
    self.A = normalize(self.AMask, 'l1', 0)
    for a in range(self.n_a):
        self.theta[:, a] = [b for (c, b) in self.aTM.get_author_topics(str(a), 0)]
    self.D_reb = self.phi.dot(self.theta).dot(self.A)
class aTM():
    def __init__(self, K, data, AMask, params, name, dataName):
        self.K = K          # [int] nb of topics
        self.AMask = AMask  # [n_a,n_d float] matrix of author participation to each paper (1 if author participated to paper)
        self.n_a, self.n_d = self.AMask.shape  # [int] nb authors, nb documents
        self.D = data
        self.n_dic, self.n_d = self.D.shape
        self.name = name
        self.train_param = params['train_param']
        self.train_C_ = []
        for d in range(self.n_d):
            self.train_C_.append([(k, self.D[k, d]) for k in range(self.n_dic)])
        self.Adic = {}
        for a in range(self.n_a):
            self.Adic[str(a)] = list(np.where(self.AMask[a, :] > 0)[0])
        self.dataName = dataName

    def train(self):
        self.aTM = AuthorTopicModel(self.train_C_, author2doc=self.Adic, num_topics=self.K, passes=100)
        self.phi = self.aTM.get_topics().transpose()
        self.theta = np.zeros((self.K, self.n_a))
        self.A = normalize(self.AMask, 'l1', 0)
        for a in range(self.n_a):
            self.theta[:, a] = [b for (c, b) in self.aTM.get_author_topics(str(a), 0)]
        self.D_reb = self.phi.dot(self.theta).dot(self.A)

    def save(self, path):
        '''
        path example
        '''
        toSave = {}
        toSave['theta'] = self.theta
        toSave['phi'] = self.phi
        toSave['A'] = self.A
        toSave['K'] = self.K
        toSave['train_param'] = self.train_param
        with open(path + self.name + '_' + self.dataName + '.pkl', 'wb') as output:
            pickle.dump(toSave, output, pickle.HIGHEST_PROTOCOL)
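# A minimal usage sketch for the aTM wrapper above (not part of the original source).
# The sizes, the random word-count matrix D, the author mask AMask and the params dict
# are all assumptions chosen to match the shapes __init__ expects (D is vocabulary x
# documents, AMask is authors x documents); the class itself assumes numpy, sklearn's
# normalize, pickle and gensim's AuthorTopicModel are already imported.
import numpy as np

rng = np.random.default_rng(0)
n_dic, n_d, n_a, K = 200, 50, 5, 10
D = rng.integers(0, 5, size=(n_dic, n_d))                    # word counts, one column per document
AMask = np.zeros((n_a, n_d))
AMask[rng.integers(0, n_a, size=n_d), np.arange(n_d)] = 1.0  # assign each document one author

atm = aTM(K, D, AMask, {'train_param': {}}, name='aTM', dataName='synthetic')
atm.train()     # fits gensim's AuthorTopicModel, then fills phi, theta, A and D_reb
atm.save('./')  # pickles the results to ./aTM_synthetic.pkl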
def show():
    model = AuthorTopicModel.load('model/at_all.atmodel')
    author_list = model.id2author.values()
    t_res = [[] for i in range(10)]
    for a in author_list:
        res = model.get_author_topics(a, minimum_probability=0.0)
        for i in range(10):
            t_res[i].append((a, res[i][1]))
    res = []
    for i in range(10):
        res.append(sorted(t_res[i], key=lambda item: item[1], reverse=True)[:6])
    # for topic in model.show_topics(num_topics=10):
    #     print('Label: ')
    #     words = ''
    #     for word, prob in model.show_topic(topic[0]):
    #         words += word + ' '
    #     print('Words: ' + words)
    for i in range(len(res)):
        print('topic' + str(i))
        for j in res[i]:
            print(j[0], "%.6f" % (j[1] / 2974))
def calAuthorSim():
    conn = sqlite3.connect(config.db_path)
    db = conn.cursor()
    model = AuthorTopicModel.load(config.author_model128_path)
    poets = list(model.id2author.values())
    print(len(poets))
    # vec = model.get_author_topics('苏轼')
    index = MatrixSimilarity(model[list(model.id2author.values())], num_best=30)
    index.save(config.author_simMatrix_path)
    # index = MatrixSimilarity.load(config.author_simMatrix_path)
    for name in poets:
        # print(name)
        sims = index[model[name]]
        sims = sorted(sims, key=lambda item: -item[1])
        sims = [[poets[sim[0]], sim[1]] for sim in sims]
        # print(sims)
        # sql_comment = "UPDATE author SET sims=? WHERE id=?"
        # db.execute(sql_comment, (toJson(sims), name))
        sql_comment = "UPDATE author SET sims='{}' WHERE id='{}'".format(toJson(sims), name)
        db.execute(sql_comment)
        # print(sql_comment)
    # print(len(poets))
    conn.commit()
def tsne_clusting(self):
    model = AuthorTopicModel.load('topicmodel/author_topic.model')
    tsne = TSNE(n_components=2, random_state=0)
    smallest_author = 200  # minimum number of poems an author must have to be included
    authors = [
        model.author2id[a] for a in model.author2id.keys()
        if len(model.author2doc[a]) >= smallest_author
    ]
    print(authors)
    embeddings = tsne.fit_transform(model.state.gamma[authors, :])
    # print(model.state.gamma[authors, :])
    # print(embeddings)
    authors_list = [model.id2author[k] for k in authors]
    print(authors_list)
    # plt.scatter(embeddings[:, 0], embeddings[:, 1], c=y_predict)
    # plt.show()
    # labels = ['李世民', '李白', '白居易', '武则天', '白居易', '杜甫', '刘禹锡', '武元衡', '权德舆']  # alternative: look up a specific set of poets
    # author_ids = [model.author2id[author] for author in labels]
    # print(author_ids)
    # author_embs = tsne.fit_transform([embeddings[i] for i in author_ids])
    # print(author_embs)
    # print(authors, author_ids, author_embs)
    self.plot_with_labels(embeddings, authors_list)  # adjust the list of poets above as needed
def build_lda_models(course_corpus, course_dictionary, mapping, course_texts):
    # ==== Train Unsupervised LDA ====
    lda_model = LdaModel(corpus=course_corpus, id2word=course_dictionary)

    # ==== Train Unsupervised HDP-LDA ====
    hdp_model = HdpModel(corpus=course_corpus, id2word=course_dictionary)

    # ==== Train Author Topic Model ====
    # author-topic LDA (authors are modules, lessons, items)
    author_to_doc = {}
    for author_type in ["modules", "lessons", "items"]:
        entity_to_doc = mapping[author_type]
        for entity_name, entity_docs in entity_to_doc.items():
            author_to_doc["{}: {}".format(author_type[0].capitalize(), entity_name)] = entity_docs
    at_model = AuthorTopicModel(corpus=course_corpus,
                                id2word=course_dictionary,
                                author2doc=author_to_doc)

    # ==== Train Labeled LDA ====
    # explicitly supervised, labeled LDA
    llda_alpha = 0.01
    llda_beta = 0.001
    llda_iterations = 50
    llda_labels = []
    llda_corpus = []
    labelset = set()
    for course_text_id in range(0, len(course_texts)):
        doc_labels = []
        # get module label name
        for module_name, doc_vec in mapping["modules"].items():
            if course_text_id in doc_vec:
                doc_labels.append("M: {}".format(module_name))
                break
        # get lesson label name
        for lesson_name, doc_vec in mapping["lessons"].items():
            if course_text_id in doc_vec:
                doc_labels.append("L: {}".format(lesson_name))
                break
        for item_name, doc_vec in mapping["items"].items():
            if course_text_id in doc_vec:
                doc_labels.append("I: {}".format(item_name))
                break
        llda_labels.append(doc_labels)
        llda_corpus.append(course_texts[course_text_id])
        labelset = labelset.union(doc_labels)

    llda_model = LLDA(llda_alpha, llda_beta, K=len(llda_labels))
    llda_model.set_corpus(llda_corpus, llda_labels)
    llda_model.train(iteration=llda_iterations)
    # phi = llda.phi()
    # for k, label in enumerate(labelset):
    #     print("\n-- label %d : %s" % (k + 1, label))
    #     for w in argsort(-phi[k + 1])[:10]:
    #         print("%s: %.4f" % (llda.vocas[w], phi[k + 1, w]))
    return lda_model, hdp_model, at_model, llda_model, llda_labels
def ATM(period):
    global ATMModel
    print(period)
    if period == "period0":
        train_corpus, train_author2doc, train_dictionary = corpus_author2doc(period)
        ATMModel = AuthorTopicModel(corpus=train_corpus,
                                    num_topics=nb_topics,
                                    id2word=train_dictionary.id2token,
                                    author2doc=train_author2doc,
                                    chunksize=2000,
                                    passes=1,
                                    eval_every=0,
                                    iterations=1,
                                    random_state=1,
                                    minimum_probability=0)
        author_list = []
        for index, author in enumerate(train_author2doc):
            dict = show_author(ATMModel, index, author)
            author_list.append(dict)
        import json
        filename = "User_Topics_Distribution_" + period + ".json"
        with open(filename, 'w') as fout:
            json.dump(author_list, fout)
    else:
        cwd = os.getcwd()
        perioddictionary = cwd + "/Dataset/" + period
        if os.path.exists(perioddictionary):
            new_corpus, new_author2doc, new_dictionary = corpus_author2doc(period)
            ATMModel.update(new_corpus, new_author2doc)
            author_list = []
            for index, author in enumerate(ATMModel.author2doc):
                dict = show_author(ATMModel, index, author)
                author_list.append(dict)
            import json
            filename = "User_Topics_Distribution_" + period + ".json"
            with open(filename, 'w') as fout:
                json.dump(author_list, fout)
        else:
            print("No more text information to update!!!!")
def train():
    author2doc = {}
    corpus = []
    count = 0
    files = os.listdir(poem_dir)  # list all files and directories in the folder
    for file in files:
        path = os.path.join(poem_dir, file)
        # if count > 100:
        #     break
        if os.path.isfile(path):
            # print(path)
            f = open(path, 'r', encoding='utf-8')
            file = json.loads(f.read())
            f.close()
            for poem_id in file:
                if count % 100000 == 0 and count != 0:
                    print(count, len(corpus))
                poem = file[poem_id]
                # print(poem_id)
                for shi_data in poem['ShiData']:
                    author = shi_data['Author']
                    if author == '无名氏':  # skip anonymous authors
                        continue
                    sentences = [sentence['Content'] for sentence in shi_data['Clauses']]
                    content = ' '.join(sentences)
                    words = seg(content)  # + segAll(content)
                    # words = dictionary.doc2bow(words)
                    corpus.append(words)
                    if author not in author2doc:
                        author2doc[author] = []
                    author2doc[author].append(count)
                    count += 1
    # print(count)
    dictionary = corpora.Dictionary(corpus)
    corpus = [dictionary.doc2bow(doc) for doc in corpus]
    print('Start training', len(corpus), len(author2doc.keys()))
    model = AuthorTopicModel(corpus, author2doc=author2doc, id2word=dictionary, iterations=10)
    # model = AuthorTopicModel.load(author_model_path)
    # model.update(corpus, author2doc=author2doc, iterations=10)
    model.save(author_model_path)
def example_1():
    """
    Example code from Gensim documentation on author-topic class.
    :return:
    """
    author2doc = {
        'john': [0, 1, 2, 3, 4, 5, 6],
        'jane': [2, 3, 4, 5, 6, 7, 8],
        'jack': [0, 2, 4, 6, 8]
    }
    corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))

    print("Corpus contents:")
    print(f"{corpus}\n")
    print("Documents in the corpus:")
    for document in corpus:
        print(f"{document}")

    print("\nDictionary contents:")
    print(f"{common_dictionary}\n")
    print("Dictionary contents with word index value:")
    print(f"{common_dictionary.token2id}\n")

    with temporary_file("serialized") as s_path:
        model = AuthorTopicModel(corpus,
                                 author2doc=author2doc,
                                 id2word=common_dictionary,
                                 num_topics=4,
                                 serialized=True,
                                 serialization_path=s_path)
        model.update(corpus, author2doc)  # update the author-topic model with additional documents

    # construct vectors for authors
    author_vecs = [model.get_author_topics(author) for author in model.id2author.values()]
    print("Vectors for authors:")
    print(f"{author_vecs}\n")
def ver():
    # per-word bound / coherence metric for the trained model
    model = AuthorTopicModel.load('model/at_all.atmodel')
    dictionary = Dictionary.load('model/atdc.dict')
    from gensim.models.coherencemodel import CoherenceModel
    atcm = CoherenceModel(model, corpus=model.corpus, dictionary=dictionary, coherence='u_mass')
    print(atcm.get_coherence())
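# A possible extension of the check above (not from the original snippet): 'u_mass'
# only needs the bag-of-words corpus, but 'c_v' coherence needs the tokenized texts.
# Assuming a list of token lists named `texts` is available, the topics can be pulled
# out of the author-topic model and scored like this:
from gensim.models.coherencemodel import CoherenceModel

def coherence_cv(model, texts, dictionary):
    # one token list of the top words per topic
    topics = [[word for word, _ in model.show_topic(topic_id, topn=20)]
              for topic_id in range(model.num_topics)]
    cm = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, coherence='c_v')
    return cm.get_coherence()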
def calAuthorCat():
    model = AuthorTopicModel.load('./data_process/model/author2vec/author2vec.model')
    conn = sqlite3.connect(db_path)
    db = conn.cursor()
    rows = db.execute('SELECT id FROM author WHERE potery_num>=5')
    ids = set(model.id2author.values())
    ids = [row[0] for row in rows if row[0] in ids]
    print(len(ids))

    def getSpraceVec(id):
        topics = model.get_author_topics(id)
        vec = np.zeros(128)
        for topic in topics:
            vec[topic[0]] = topic[1]
        return vec

    vecs = [getSpraceVec(id) for id in ids]
    print('Start computing')
    id2label = {}
    label2id = {}
    labels = KMeans(n_clusters=50, max_iter=1000, n_jobs=-1).fit_predict(vecs)
    for index, label in enumerate(labels):
        label = str(label)
        id = ids[index]
        if label not in label2id:
            label2id[label] = []
        label2id[label].append(id)
        id2label[id] = label
    # print('Start computing the second level')
    # for label in label2id:
    #     print(label)
    #     sub_ids = label2id[label]
    #     sub_vecs = [getSpraceVec(id) for id in sub_ids]
    #     cluster_num = 10 if len(sub_ids) > 10 else len(sub_ids)
    #     sub_labels = KMeans(n_clusters=cluster_num, max_iter=1000, n_jobs=-1).fit_predict(sub_vecs)
    #     for index, label in enumerate(sub_labels):
    #         label = str(label)
    #         id = sub_ids[index]
    #         id2label[id] += '-' + label
    for id in id2label:
        label = id2label[id]
        db.execute('UPDATE author SET cat=? WHERE id=?', (label, id))
    conn.commit()
    conn.close()
def author_cluster(self):
    model = AuthorTopicModel.load('author_topic.model')
    tsne = TSNE(n_components=2, random_state=0)
    smallest_author = 0
    authors = [
        model.author2id[a] for a in model.author2id.keys()
        if len(model.author2doc[a]) >= smallest_author
    ]
    embeddings = tsne.fit_transform(model.state.gamma[authors, :])
    authors = list(model.id2author.values())
    labels = ['柳永', '晏殊', '欧阳修', '李煜', '李清照', '范仲淹', '苏轼', '辛弃疾', '岳飞']
    author_ids = [model.author2id[author] for author in labels]
    author_embs = tsne.fit_transform([embeddings[i] for i in author_ids])
    print(authors, author_ids, author_embs)
    self.plot_with_labels(author_embs, labels)
def test_model(self):
    model = AuthorTopicModel.load('topicmodel/author_topic.model')
    # Per-author vectors: the dimensionality differs per author because each author
    # is associated with a different set of topics, each with its own probability.
    author_vecs = [model.get_author_topics(author) for author in model.id2author.values()]
    print(len(author_vecs))
    # 2610 poets; each poet has a few topics (<100) with the corresponding probabilities
    for author in author_vecs:
        print(author, len(author))  # the number of topics differs per author
    # list every author
    authors = model.id2author.values()
    print(len(authors), authors)
    # show the vector for a particular author
    print(model['李白'])
    # show the model's topics
    for topic in model.show_topics(num_topics=100):
        print(topic)
def __init__(
    self,
    documents,
    attributions,
    topics,
    iterations,
    DOC_KEY,
    USER_KEY,
    DOC_VALUE='TARGET',
    minimum_attributions=2,
    maximum_attributions=50,
):
    attributions = self._remove_doubles(attributions, [DOC_KEY, USER_KEY])
    attributions = self._bound_authors(attributions, minimum_attributions,
                                       maximum_attributions, USER_KEY)
    documents, attributions = self._author_document_downselect(documents, attributions, DOC_KEY)
    train, test, vocab = self._get_train_test_and_vocab(documents, DOC_VALUE)
    self.train, self.test, self.vocab = train, test, vocab
    train_attr = self._attribution_table(self.train, attributions, DOC_KEY, USER_KEY)
    test_attr = self._attribution_table(self.test, attributions, DOC_KEY, USER_KEY)
    self.train_attr, self.test_attr = train_attr, test_attr
    self.train_corpus = self._processed_to_bow(self.train[DOC_VALUE], self.vocab)
    self.test_corpus = self._processed_to_bow(self.test[DOC_VALUE], self.vocab)
    self.topics = topics
    self.iterations = iterations
    self.model = AuthorTopicModel(
        corpus=self.train_corpus,
        author2doc=self.train_attr,
        num_topics=self.topics,
        iterations=self.iterations,
    )
def run_model(corpus1, corpus2, dictionary, author2doc1, author2doc2):
    model = AuthorTopicModel(corpus=corpus1,
                             num_topics=10,
                             id2word=dictionary.id2token,
                             author2doc=author2doc1,
                             chunksize=2000,
                             passes=1,
                             eval_every=0,
                             serialized=True,
                             iterations=1,
                             random_state=1,
                             serialization_path='tmp/corpus1.mm')
    model.update(corpus=corpus2, author2doc=author2doc2)
    model.save('model/at_all.atmodel')
nb_fold = 10
k_list = [10, 15, 20, 30]
nb_k = len(k_list)
aLDA_store_train = np.zeros((nb_fold, nb_k))
aLDA_store_test = np.zeros((nb_fold, nb_k))
aTM_store_train = np.zeros((nb_fold, nb_k))
aTM_store_test = np.zeros((nb_fold, nb_k))

aLDAgen = aLDA_generator(n_dic, n_w, A_mask, K, alpha, beta, gamma)
aLDAgen.itialise()

for f in range(nb_fold):
    train_Z, train_C, train_D, train_C_ = aLDAgen.generate()
    for k in range(nb_k):
        aLDA = aLDA_estimator(k_list[k], train_C, A_mask, alpha, beta, 1, False)
        aLDA.gd_ll(0.05, 60, 0, 0.0, 0, 1)

        aTM = AuthorTopicModel(train_C_, author2doc=Adic, num_topics=k_list[k])
        aTM_phi = aTM.get_topics().transpose()
        aTM_theta = 0 * aLDA.thetaStar
        for a in range(n_a):
            aTM_theta[:, a] = [b for (c, b) in aTM.get_author_topics(str(a), 0)]

        aLDA_store_train[f, k] = aLDA.llgd[-1, 0]
        aTM_store_train[f, k] = loglikaLDA(aTM_theta, aTM_phi, A_mask, train_D, alpha, beta, 1)
        aLDA_store_test[f, k] = np.sum(np.sum(np.log(aLDA.phiStar.dot(aLDA.thetaStar).dot(aLDA.AStar))
                                              * (aLDAgen.phi.dot(aLDAgen.theta).dot(aLDAgen.A)))) * n_w
        aTM_store_test[f, k] = np.sum(np.sum(np.log(aTM_phi.dot(aTM_theta).dot(aLDAgen.A))
                                             * (aLDAgen.phi.dot(aLDAgen.theta).dot(aLDAgen.A)))) * n_w
    print(f)

#%%
max_ll = np.sum(np.sum(np.log(aLDAgen.phi.dot(aLDAgen.theta).dot(aLDA.AStar))
                       * (aLDAgen.phi.dot(aLDAgen.theta).dot(aLDAgen.A)))) * n_w
plt.figure()
# In[ ]:

# TODO (Lee) review pat_inv_map workflow
# partition data_1000 to the size of the training set (80/20 split, so grab the first 800 rows)
data_800 = data_1000[:800]

# create inventor-to-doc mapping from original list of dicts in json api response
pat2inv = pat_inv_map(data_800)

# #### Construct author-topic model

# In[ ]:

# construct author-topic model
model_at = AuthorTopicModel(corpus=corpus_1000train,
                            doc2author=pat2inv,
                            id2word=id_to_word_1000train)

# In[ ]:

# construct vectors for authors
author_vecs = [model_at.get_author_topics(author) for author in model_at.id2author.values()]
author_vecs

# In[ ]:

# retrieve topic distribution for an author using the model[name] syntax;
# each topic has a probability of being expressed given the particular author,
init['phi'] = phiLDA
aLDALDA = aLDA_estimator(K, M_train, np.eye(n_doc), 5, 5, 1, True, init)
aLDALDA.gd_ll(0.00004, 60, 0, 0, 0, 1, 0)
plt.plot(aLDALDA.llgd[:, 0] / aLDALDA.llgd[0, 0])
t1 = time.time()
print('gd on LDA for k = ' + str(K), ', time = ' + str(t1 - t))
t = t1

# aTM ---------------------------------------------------
Adic = {}
for a in range(n_a):
    Adic[str(a)] = list(np.where(At[a, :] > 0)[0])

aTM = AuthorTopicModel(train_C_, author2doc=Adic, num_topics=K)
phiaTM = aTM.get_topics().transpose()
thetaaTM = np.zeros((K, n_a))
for a in range(n_a):
    thetaaTM[:, a] = [b for (c, b) in aTM.get_author_topics(str(a), 0)]
t1 = time.time()
print('aTM for k = ' + str(K), ', time = ' + str(t1 - t))
t = t1

# aLDA on aTM
init2 = {}
init2['A'] = normalize(At, 'l1', 0)
init2['theta'] = thetaaTM
init2['phi'] = phiaTM
aLDATaTM = aLDA_estimator(K, M_train, At, 10, 10, 1, True, init2)
import csv
import bz2
import itertools

from joblib import dump, load

# document embedding
from gensim.models import TfidfModel, AuthorTopicModel
from gensim.corpora import Dictionary
from gensim.matutils import corpus2csc
from gensim.test.utils import get_tmpfile

from Preprocessing.preprocessing import Preprocessing

print('Loading and preprocessing data')
with bz2.open('/gepris_data/train_filtered.csv.bz2', mode='rt') as f:
    csvreader = csv.reader(f)
    traindata = Preprocessing().fit_transform((row[1] for row in csvreader))
    f.seek(0)
    doc2author = {i: row[3:] for i, row in enumerate(csvreader)}

print('Building dict and training TfIdf model:')
dct = Dictionary(doc for doc in traindata)  # fit dictionary
tfidf_model = TfidfModel((dct.doc2bow(doc) for doc in traindata))  # fit model
dump(tfidf_model, '/models/tfidf/tfidf.joblib')
dump(dct, '/models/dict/dict.joblib')

print('ATM training:')
atm_model = AuthorTopicModel([dct.doc2bow(doc) for doc in traindata],
                             doc2author=doc2author,
                             id2word=dct)
dump(atm_model, '/models/atm/atm.joblib')
def topicandresult(num, corpus, id2doc, dic):
    """
    model_list = []
    for i in range(5):
        model = AuthorTopicModel(corpus=corpus, num_topics=num, id2word=dic,
                                 author2doc=id2doc, chunksize=2000, passes=1, eval_every=0,
                                 # random_state=int(random.random()*1000),
                                 iterations=100000)
        top_topics = model.top_topics(corpus)
        tc = sum([t[1] for t in top_topics])
        model_list.append((model, tc))
    model, tc = max(model_list, key=lambda x: x[1])
    """
    model = AuthorTopicModel(corpus=corpus, num_topics=num, id2word=dic,
                             author2doc=id2doc, chunksize=2000, passes=55,
                             eval_every=0, gamma_threshold=1e-11, iterations=10000000)
    with open('NuclearEnergy/Data/' + str(num) + 'topic.model', 'wb') as p:
        pickle.dump(model, p)
    '''
    with open('NuclearEnergy/Data/' + str(num) + 'topic.model', 'rb') as p:
        model = pickle.load(p)
    '''
    pubid = pd.Series(list(set([total['press'][i] + ', ' + total['pubtime'][i]
                                for i in range(0, len(total))])),
                      name='id', dtype='category')
    result = {'publisher': [], 'time': [], 'docnum': [], 'corpusp': []}
    for i in range(0, num):
        result[i] = []

    def proportion(pubid):
        docs = model.author2doc[pubid]
        return sum([word[1] for doc in docs for word in corpus[doc]]) / vocnum

    for i in pubid:
        result['publisher'].append(re.split(', ', i)[0])
        result['time'].append(re.split(', ', i)[1])
        cr1 = total['press'].map(lambda x: x == re.split(', ', i)[0])
        cr2 = total['pubtime'].map(lambda x: x == re.split(', ', i)[1])
        result['docnum'].append(len(total[cr1 & cr2]))
        result['corpusp'].append(proportion(i))
        topics = {j[0]: j[1] for j in model[i]}
        for j in result:
            if j in topics:
                result[j].append(topics[j])
            elif j == 'publisher' or j == 'time' or j == 'docnum' or j == 'corpusp':
                pass
            else:
                result[j].append(0)

    result = pd.DataFrame(result)
    result = result.reindex(natsorted(result.columns, alg=ns.IC), axis=1)

    ratio = {}
    for i in range(0, num):
        ratio[i] = 0
    for index, row in result.iterrows():
        for i in ratio:
            ratio[i] += row[i] * row['corpusp']
    agg = sum(ratio.values())
    ratio = {i: str(ratio[i] * 100 / agg) + '%' for i in ratio}

    topics = pd.DataFrame({
        i[0]: [re.split(r'\*', j)[1] + '*' + str(round(float(re.split(r'\*', j)[0]) * 100, 3)) + '%'
               for j in re.split(r' \+ ', re.sub('"', '', i[1]))]
        for i in model.show_topics(num_topics=num, num_words=50)
    })
    topics.loc[30] = ratio
    topics = topics.reindex([30] + list(range(0, 30)))
    topics.index = ['토픽 비율'] + list(range(1, 31))
    '''
    topic_labels = ['1 사이버 해킹과 공격', '2 ', '3 원전에 대한 비판적 담론들', '4 원전의 안전과 기술적 문제들',
                    '5 후쿠시마 원전 사고와 안전', '6 지진과 원자력 발전소 및 정치권', '7 해외 동향',
                    '8 지진과 원전 위치 지역의 불안', '9 원자력 안전과 지역사회', '10 원자력 관련 기관 인사',
                    '11 원자력을 둘러싼 정치적 갈등', '12 원전 건설 사업과 관련된 이슈', '13 고리/월성 원전',
                    '14 영화 판도라 관련', '15 원자력 산업 관리', '16 원전 관련 투자와 에너지 산업',
                    '17 사회적 이슈들과 일본 지진', '18 원전 관련 기술', '19 원전에 대한 사업적 접근',
                    '20 원전과 북한, 안보']
    '''
    topic_labels = [str(i) for i in range(1, num + 1)]
    topics.columns = topic_labels
    result.columns = topic_labels + ['형태소의 비율', '기사 숫자', '발표 기관', '발표 시기']
    return (topics, result, model)
print(dict(dictionary.items()))
dictionary.id2token
dictionary.token2id
dictionary.num_pos
dictionary.num_nnz
dictionary.num_docs

# Vectorize the texts: each poem becomes a sparse vector whose elements are the
# counts of each word in that poem.
corpus = [dictionary.doc2bow(doc) for doc in docs]

######### ATM model ##########
'''
author topic model
'''
# Train the author-topic model with 100 topics
model = AuthorTopicModel(corpus, author2doc=author2doc, id2word=dictionary, num_topics=100)

# Save the model
model_file = '/Users/wxy/Documents/Sufepost/Courses/文本挖掘/report/MyPoemMining/atm_model/atm.model'
model.save(model_file)

######### Model information and dimensionality reduction ##########
'''
Model information and dimensionality reduction
'''
model_file = '/Users/wxy/Documents/Sufepost/Courses/文本挖掘/report/MyPoemMining/atm_model/atm.model'
model = AuthorTopicModel.load(model_file)

# Some information about the model
model.id2author
model.author2id
model.author2doc
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 29 11:33:01 2017

@author: jadm
"""
#%%
from gensim.models import AuthorTopicModel

model = AuthorTopicModel.load('modelo1/model.atmodel')

#%%
top_topics = model.top_topics(model.corpus)

#%%
for i in range(len(top_topics)):
    print(top_topics[i][1])
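#%%
# A possible extra cell (not in the original script): print the top words of each
# topic next to its coherence score. top_topics returns pairs of (topic
# representation, coherence), where the representation is a list of (probability,
# word) tuples sorted by probability.
for topic, coherence in top_topics:
    words = ', '.join(word for _, word in topic[:10])
    print('%.4f  %s' % (coherence, words))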
for index, row in df.iterrows():
    row = row.tolist()
    curr_doc = []
    for i in range(len(row)):
        curr_item = row[i]
        if curr_item > 0:
            curr_doc.append((i, curr_item))
    corpus.append(curr_doc)

# dictionary = Dictionary(data)
# corpus = [dictionary.doc2bow(text) for text in data]

num_topics = 12
model_POLE = AuthorTopicModel(
    corpus=corpus,
    author2doc=author2doc_POLE,
    # id2word=dictionary,
    num_topics=num_topics)
model_MSI = AuthorTopicModel(
    corpus=corpus,
    author2doc=author2doc_MSI,
    # id2word=dictionary,
    num_topics=num_topics)

# construct vectors for authors
# author_vecs = [model.get_author_topics(author) for author in model.id2author.values()]
# print(author_vecs)

POLE_tops = model_POLE.get_topics()
np.savetxt("authortopic_POLE_output.csv", POLE_tops, delimiter=",")
    df, f_ini, f_fin = archivos_csv(sys.argv[1])
else:
    df, f_ini, f_fin = archivos_csv()

#%%
corpus, dictionary, author2doc = preprocesamiento(df)

print('# of authors: %d' % len(author2doc))
print('# of unique tokens: %d' % len(dictionary))
print('# of documents: %d' % len(corpus))

print('Running model')
model = AuthorTopicModel(corpus=corpus,
                         num_topics=100,
                         id2word=dictionary.id2token,
                         author2doc=author2doc,
                         chunksize=2000,
                         passes=55,
                         eval_every=0,
                         iterations=10000000,
                         gamma_threshold=1e-11)

f = 'modelo' + folder()
os.makedirs(f)
os.makedirs(f + '/LDA')
model.save(f + '/model.atmodel')
print("MODEL FINISHED AND SAVED")

# LDA
print('Running LDA')
ldamodel = LdaModel(corpus=corpus, num_topics=100, id2word=dictionary)
# Remove rare and common tokens.
# Filter out words that occur too frequently or too rarely.
max_freq = 0.5
min_wordcount = 20
dictionary.filter_extremes(no_below=min_wordcount, no_above=max_freq)

_ = dictionary[0]  # This sort of "initializes" dictionary.id2token.

corpus = [dictionary.doc2bow(doc) for doc in docs]

print('Number of authors: %d' % len(author2doc))
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

from gensim.models import AuthorTopicModel

model = AuthorTopicModel(corpus=corpus, num_topics=10, id2word=dictionary.id2token,
                         author2doc=author2doc, chunksize=2000, passes=1, eval_every=0,
                         iterations=1, random_state=1)

model_list = []
for i in range(5):
    model = AuthorTopicModel(corpus=corpus, num_topics=10, id2word=dictionary.id2token,
                             author2doc=author2doc, chunksize=2000, passes=100, gamma_threshold=1e-10,
                             eval_every=0, iterations=1, random_state=i)
    top_topics = model.top_topics(corpus)
    tc = sum([t[1] for t in top_topics])
    model_list.append((model, tc))

model, tc = max(model_list, key=lambda x: x[1])
print('Topic coherence: %.3e' % tc)

model.save('/tmp/model.atmodel')
# In[143]:

patdf2inv = dict((df_pat_idx[key], value) for (key, value) in pat2inv.items())
patdf2inv

# #### Construct author-topic model

# In[144]:

# construct author-topic model
model_at = AuthorTopicModel(corpus=corpus_at,
                            doc2author=patdf2inv,
                            id2word=id_to_word_at,
                            num_topics=25)

# In[145]:

# construct vectors for authors
author_vecs = [model_at.get_author_topics(author) for author in model_at.id2author.values()]
author_vecs

# In[146]:

# inspect topic distribution for author with id# 7788103-1
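# A plausible continuation of the cell above (not in the original excerpt): the
# author id '7788103-1' comes from the comment, and the two calls are assumptions
# about how that inspection was done. Both return (topic_id, probability) pairs
# for the topics this author expresses.
print(model_at['7788103-1'])
print(model_at.get_author_topics('7788103-1', minimum_probability=0.0))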
# Parameters:
#
# **num_topics**: The number of topics in the model. There is no 'correct' value here; it depends entirely on how many different topics occur in the corpus. 100 is generally a reasonable compromise for a corpus this size. <br />
# **chunksize**: Controls the size of the mini-batches. This depends entirely on the corpus - 2000 is the default, but this obviously makes no sense if a corpus only contains 1000 documents. <br />
# **passes**: 100 by default <br />
# **iterations**: The maximum number of times the model loops over each document <br />
# **alpha**: Can be set to 'asymmetric' <br />
# **eta**: Can be set to 'auto', which learns an asymmetric prior over words directly from the data

# In[23]:

#get_ipython().run_cell_magic('time', '', 'model = AuthorTopicModel(corpus=corpus, num_topics=100, id2word=dictionary.id2token, \\\n author2doc=author2doc_test, chunksize=100, passes=100, gamma_threshold=0.001, \\\n eval_every=0, iterations=1, random_state=1)')
print('Training...')
model = AuthorTopicModel(corpus=corpus, num_topics=100, id2word=dictionary.id2token,
                         author2doc=author2doc_test, chunksize=100, passes=100, gamma_threshold=0.001,
                         eval_every=0, iterations=1, random_state=1)

# In[24]:

# Save model.
model.save('./results/model_presentation_github.atmodel')

# In[18]:

#import pandas as pd
#import spacy
#from gensim.models import Phrases
#from gensim.corpora import Dictionary
#from gensim.models import AuthorTopicModel
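# A variant of the training call in cell In[23] above, exercising the alpha/eta
# options mentioned in the parameter notes (a sketch, not part of the original
# notebook; all other arguments are kept unchanged):
model_auto_priors = AuthorTopicModel(corpus=corpus, num_topics=100, id2word=dictionary.id2token,
                                     author2doc=author2doc_test, chunksize=100, passes=100,
                                     gamma_threshold=0.001, eval_every=0, iterations=1,
                                     random_state=1, alpha='asymmetric', eta='auto')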
def load_AT_model(self, name='model.atmodel'):
    print("\nLoading model from %s" % self.model_name)
    self.model = AuthorTopicModel.load(self.model_name)