        # And increase the topic count
        increase_count(feature, word, topic, feature_topic, feature_topic_c, topic_word, topic_word_c)


if __name__ == '__main__':
    # hyperparameters
    feature = "category"
    alpha = 0.01
    beta = 0.1
    iterationNum = 50
    num_topics = 90

    # load preprocessed data
    doc2feature = prepro_file_load(f"doc2{feature}")
    num_feature = len(set(doc2feature.values()))
    doc2word = list(prepro_file_load("doc2word").items())
    doc2bow, dictionary, texts = prepro_file_load('doc2bow'), \
                                 prepro_file_load('corpora'), \
                                 list(prepro_file_load('doc2pre_text').values())
    D, W = dictionary.num_docs, len(dictionary)
    train_docs, test_docs = train_test_split(doc2word, test_size=0.33, random_state=1337)

    word_topic_assignment, feature_topic, feature_topic_c, topic_word, topic_word_c = random_initialize(doc2word)

    for i in tqdm(range(0, iterationNum)):
        gibbs_sampling(train_docs, feature_topic, feature_topic_c, topic_word, topic_word_c, word_topic_assignment)
        print(time.strftime('%X'), "Iteration: ", i, " Completed",
              " Perplexity: ", x_perplexity(test_docs, feature_topic, feature_topic_c,
                                            topic_word, topic_word_c, doc2feature),
              " Coherence: ", get_coherence(doc2bow, dictionary, texts, num_topics, topic_word),
              " Topic Diff: ", mean_topic_diff(topic_word))

    model = Model(num_topics, alpha, beta, feature_topic, feature_topic_c, topic_word, topic_word_c, feature)
    model.save_model()
    print(get_topics(dictionary, num_topics, topic_word))
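# The sampler above keeps four count structures in sync with the current topic
# assignments via increase_count (and, presumably, a matching decrease_count
# applied before re-sampling a word). A minimal sketch of what such helpers
# could look like, assuming count matrices shaped (num_feature, num_topics) and
# (num_topics, W); the signatures mirror the call above, but the bodies are an
# assumption, not the repository's definitions.
def increase_count_sketch(feature, word, topic,
                          feature_topic, feature_topic_c, topic_word, topic_word_c):
    feature_topic[feature][topic] += 1  # times this feature was assigned this topic
    feature_topic_c[feature] += 1       # total assignments for this feature
    topic_word[topic][word] += 1        # times this word was assigned this topic
    topic_word_c[topic] += 1            # total words assigned to this topic


def decrease_count_sketch(feature, word, topic,
                          feature_topic, feature_topic_c, topic_word, topic_word_c):
    feature_topic[feature][topic] -= 1
    feature_topic_c[feature] -= 1
    topic_word[topic][word] -= 1
    topic_word_c[topic] -= 1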
model_type = model_path.split("_")[-1]
model = load_model(model_path) if model_type != "MultiModel" else load_model(model_path, multi=True)
num_topics = model.num_topics
topic_word_dist = model.topic_word

if model_type == "geographic":
    id2category = pre.prepro_file_load('id2category', folder_name='full_geographic')
elif model_type == "topical":
    id2category = pre.prepro_file_load('id2category', folder_name='full_topical')
# geographic and topical models are treated as category models from here on
model_type = "category" if model_type in ("geographic", "topical") else model_type

item_top_topics = sample_and_sort_items(model, num_items=7)
topic_top_words = get_topics(corpora, num_topics, topic_word_dist)

# printing item-topic -> topic-word connections
if model_type == "standard":
    print("Random documents with top topics:")
elif model_type == "category":
    print("Random categories with top topics:")
elif model_type == "author":
    print("Random authors with top topics:")
elif model_type == "MultiModel":
    print("Random categories and authors with top topics:")
else:
    print(f"Model type '{model_type}' not known!")
    exit()

for item in item_top_topics.items():
    # loop body reconstructed from the matching printout in print_metadata_documents below
    for topic in item[1]:
        print(f"Topic ID/probability: {topic[0]}/{topic[1]:.2f} {topic_top_words[topic[0]]}")
    print()
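# sample_and_sort_items is called here with num_items and below (in
# print_metadata_documents) with item_id, and its result is iterated as
# {item_id: [(topic_id, probability), ...]}. A sketch under those assumptions,
# operating directly on an item-topic count matrix rather than the model
# object; the names and the top_n parameter are illustrative, not the
# repository's API.
import random

import numpy as np


def sample_and_sort_items_sketch(item_topic, num_items=7, item_id=None, top_n=3):
    ids = [item_id] if item_id is not None else random.sample(range(item_topic.shape[0]), num_items)
    result = {}
    for i in ids:
        dist = item_topic[i] / item_topic[i].sum()  # counts -> probabilities
        top = np.argsort(-dist)[:top_n]             # highest-probability topics first
        result[i] = [(int(t), float(dist[t])) for t in top]
    return result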
def print_top_topics_geographic_and_topical(num_top_topics: int = 20):
    model_path = "../model/models/90_0.01_0.1_author_category_MultiModel"
    model_type = model_path.split("_")[-1]
    model = load_model(model_path) if model_type != "MultiModel" else load_model(model_path, multi=True)
    corpora = pre.prepro_file_load("corpora", "full")
    id2category = pre.prepro_file_load('id2category', folder_name='full')
    num_topics = model.num_topics
    topic_word_dist = model.topic_word

    # names of categories that are based on a geographic location
    geographic_category_names = ["Frederikshavn-avis", "Morsø Debat", "Morsø-avis", "Rebild-avis",
                                 "Brønderslev-avis", "Thisted-avis", "Jammerbugt-avis",
                                 "Vesthimmerland-avis", "Hjørring-avis", "Aalborg-avis",
                                 "Morsø Sport", "Thisted sport", "Mariagerfjord-avis", "Udland-avis"]
    geographic_category_ids = get_category_ids_from_names(id2category, geographic_category_names)
    # categories not based on geographic locations are closer to real topics
    topical_category_ids = [id for id in id2category.keys() if id not in geographic_category_ids]

    if model_type == "MultiModel":
        category_topic = model.feature2_topic
    else:
        category_topic = model.doc_topic
    category_topic = row_distribution_normalization(category_topic)

    # separate the geographic and topical topic distributions and sort on the topics' summed distribution values
    sorted_geographic = delete_rows_and_sort(category_topic, topical_category_ids)
    sorted_topical = delete_rows_and_sort(category_topic, geographic_category_ids)

    # look for topic IDs that appear in both top topic lists and those unique to one list
    top_multiple_topics = []
    for index in range(num_top_topics):
        cur_topic = list(sorted_geographic.keys())[index]
        if cur_topic in list(sorted_topical.keys())[:num_top_topics]:
            top_multiple_topics.append(cur_topic)
    top_unique_topics = list(
        set(list(sorted_geographic.keys())[:num_top_topics] + list(sorted_topical.keys())[:num_top_topics]))
    for topic in top_multiple_topics:
        top_unique_topics.remove(topic)
    for index, topic in enumerate(top_unique_topics):
        if topic in list(sorted_geographic.keys())[:num_top_topics]:
            top_unique_topics[index] = ("geographic", top_unique_topics[index])
        else:
            top_unique_topics[index] = ("topical", top_unique_topics[index])

    topic_top_words = get_topics(corpora, num_topics, topic_word_dist)

    # print observations
    print("Top topics for only geographic categories:")
    for index in range(num_top_topics):
        topic = list(sorted_geographic.items())[index]
        print(f"{topic[1]:.2f}, {topic[0]}: {topic_top_words[topic[0]]}")
    print()
    print("Top topics for only topical categories:")
    for index in range(num_top_topics):
        topic = list(sorted_topical.items())[index]
        print(f"{topic[1]:.2f}, {topic[0]}: {topic_top_words[topic[0]]}")
    print()
    print("Appears in both lists:")
    print(f"{len(top_multiple_topics)}/{num_top_topics}")
    for topic in top_multiple_topics:
        print(f"{topic}: {topic_top_words[topic]}")
    print()
    print("Unique topics:")
    for topic in top_unique_topics:
        print(f"{topic}: {topic_top_words[topic[1]]}")
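# The function above leans on two helpers whose behavior follows from how their
# results are used: row_distribution_normalization turns the category-topic
# count matrix into per-row probability distributions, and delete_rows_and_sort
# drops the given rows, sums the remaining mass per topic, and returns the
# topics sorted by that mass (its keys are read in sorted order above). Minimal
# sketches under those assumptions, not the repository's definitions:
import numpy as np


def row_distribution_normalization_sketch(matrix):
    # divide each row by its sum so every row becomes a probability distribution
    return matrix / matrix.sum(axis=1, keepdims=True)


def delete_rows_and_sort_sketch(matrix, rows_to_delete):
    # drop the unwanted rows, sum the remaining per-topic mass, and return
    # {topic_id: summed_mass} ordered by mass, largest first
    kept = np.delete(matrix, rows_to_delete, axis=0)
    sums = kept.sum(axis=0)
    return {int(topic): float(sums[topic]) for topic in np.argsort(-sums)}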
    doc2word)

# things needed to calculate coherence
doc2bow, texts = prepro_file_load('doc2bow', folder_name=in_folder), \
                 list(prepro_file_load('doc2pre_text', folder_name=in_folder).values())

print("Starting Gibbs")
for i in range(0, iterationNum):
    gibbs_sampling(doc2word)
    print(time.strftime('%X'), "Iteration: ", i, " Completed",
          "Coherence: ", get_coherence(doc2bow, corpora, texts, layer_lengths[-1], topic_to_word))

topic_words = get_topics(corpora, layer_lengths[-1], topic_to_word)
if K is None:
    topic_words = {struct_root[mid_layers_num - 1][i]: topic_words[i] for i in range(len(topic_words))}
print(topic_words)

print('generating distributions')
# calculate document-topic distribution
doc_top_dists = [{} for _ in range(len(layer_lengths))]
for id, doc_info in tqdm(enumerate(word_topic_assignment)):
    for l in range(len(layer_lengths)):
        doc_dist = np.zeros(shape=layer_lengths[l])
        for word in doc_info:
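# By analogy with the flat-model script below, each layer's document
# distribution is presumably filled by tallying every word's layer-l topic
# assignment and normalizing; a hedged guess at the missing body, not the
# repository's code:
#
#     doc_dist[word[l]] += 1
#     doc_top_dists[l][id] = doc_dist / doc_dist.sum()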
                   doc_topic_c, topic_word, topic_word_c, word_topic_assignment)
    print(time.strftime('%X'), "Iteration: ", i, " Completed",
          " Perplexity: ", triple_perplexity(test_docs, author_topic, author_topic_c,
                                             doc_topic, doc_topic_c, topic_word, topic_word_c),
          " Coherence: ", get_coherence(doc2bow, dictionary, texts, num_topics, topic_word),
          " Topic Diff: ", mean_topic_diff(topic_word))

model = MultiModel(num_topics, alpha, beta,
                   author_topic, author_topic_c,
                   doc_topic, doc_topic_c,
                   topic_word, topic_word_c, "author_doc")
model.save_model()
topic_words = get_topics(dictionary, num_topics, topic_word)
print(topic_words)

print('generating distributions')
# calculate document-topic distribution; each doc_info holds one topic assignment per word
doc_top_dists = {}
for id, doc_info in tqdm(enumerate(word_topic_assignment)):
    doc_dist = np.zeros(shape=num_topics)
    for word in doc_info:
        doc_dist[word] += 1  # 'word' is that word's assigned topic ID
    doc_top_dists[id] = doc_dist / doc_dist.sum()

# calculate topic-word distribution; each topic gets a vector over the W vocabulary terms
top_word_dists = {i: np.zeros(W) for i in range(num_topics)}
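# Round-trip sketch: models written by save_model() are read back with
# load_model(), and MultiModel saves need multi=True, as in the evaluation
# scripts above. The file name follows the '<topics>_<alpha>_<beta>_<name>'
# pattern seen there, but this exact path is an assumption.
loaded = load_model("../model/models/90_0.01_0.1_author_doc_MultiModel", multi=True)
assert loaded.num_topics == num_topics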
def print_metadata_documents(metadata_type: str, metadata_name: str,
                             sample_size: int = 10, print_top_topics=False):
    # load necessary data
    doc2pre = pre.prepro_file_load('doc2pre_text', folder_name='full')
    doc2raw = pre.prepro_file_load('doc2raw_text', folder_name='full')
    id2doc = pre.prepro_file_load('id2doc', folder_name='full')
    doc2author = pre.prepro_file_load('doc2author', folder_name='full')
    doc2category = pre.prepro_file_load('doc2category', folder_name='full')
    doc2taxonomy = pre.prepro_file_load('doc2taxonomy', folder_name='full')
    id2category = pre.prepro_file_load('id2category', folder_name='full')
    id2author = pre.prepro_file_load('id2author', folder_name='full')
    id2taxonomy = pre.prepro_file_load('id2taxonomy', folder_name='full')

    # pick the metadata mapping matching the requested type (reusing the loads above)
    if metadata_type == "category":
        doc2meta = doc2category
        id2meta = id2category
    elif metadata_type == "author":
        doc2meta = doc2author
        id2meta = id2author
    elif metadata_type == "taxonomy":
        doc2meta = doc2taxonomy
        id2meta = id2taxonomy
    else:
        print(f"'{metadata_type}' not found!")
        exit()

    # get metadata ID from name (examples: '26. Frederik', 'System Administrator', 'EMNER')
    metaID = find_id_from_value(id2meta, metadata_name)

    # get document IDs for documents with the given metadata
    documentIDs = get_metadata_document_ids(doc2meta, metaID)

    documents = {}
    documentsRaw = {}
    docAuthors = {}
    docCategories = {}
    docTaxonomies = {}
    docFileNames = {}

    # get data based on document IDs
    for docID in documentIDs:
        documents[docID] = doc2pre[docID]
        documentsRaw[docID] = doc2raw[docID]
        docAuthors[docID] = doc2author[docID]
        docCategories[docID] = doc2category[docID]
        docTaxonomies[docID] = doc2taxonomy[docID]
        docFileNames[docID] = id2doc[docID]

    # document set information
    print(f"{len(documents)} documents found\n")
    print_meta_document_set_info(docAuthors, id2author, "authors")
    print_meta_document_set_info(docCategories, id2category, "categories")
    print_meta_document_set_info(docTaxonomies, id2taxonomy, "taxonomies")

    # random examples of documents with metadata information
    if print_top_topics:
        corpora = pre.prepro_file_load("corpora", "full")
        model_path = "../model/models/90_0.01_0.1_category"
        model = load_model(model_path)
        num_topics = model.num_topics
        topic_word_dist = model.topic_word
        topic_top_words = get_topics(corpora, num_topics, topic_word_dist)

    print("Random documents:")
    sampleIDs = random.sample(documentIDs, len(documentIDs))
    for count in range(sample_size):
        if count == len(sampleIDs):
            break
        id = sampleIDs[count]
        print(f"ID: {id}")
        print(f"Author: {id2author[docAuthors[id]]}")
        print(f"Category: {id2category[docCategories[id]]}")
        print(f"Taxonomy: {[id2taxonomy[taxID] for taxID in docTaxonomies[id]]}")
        print(f"File name: {docFileNames[id]}")
        print(documents[id])
        print(documentsRaw[id] + "\n")
        if print_top_topics:
            item_top_topics = sample_and_sort_items(model, item_id=docCategories[id])
            print("Top words in category top topics:")
            for item in item_top_topics.items():
                for topic in item[1]:
                    print(f"Topic ID/probability: {topic[0]}/{topic[1]:.2f} {topic_top_words[topic[0]]}")
        print()
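# Example invocation (a sketch): inspect documents from one of the geographic
# categories listed earlier and show their top topics. 'Aalborg-avis' is taken
# from the category names above; any valid category, author, or taxonomy name
# from the 'full' folder should work the same way.
if __name__ == '__main__':
    print_metadata_documents("category", "Aalborg-avis", sample_size=5, print_top_topics=True)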