def compute_metrics_on_saved_model(save_model, test_documents, multi: bool = False) -> dict:
    """
    Compute the three metrics (perplexity, coherence, and topic difference) on a saved model
    :param save_model: the name of a saved model, or an already loaded model object
    :param test_documents: the test documents we want to test on
    :param multi: whether the model is a MultiModel
    :return: a dict containing the results
    """
    if isinstance(save_model, str):
        loaded_model = load_model(save_model, multi)
    else:
        loaded_model = save_model

    doc2bow, dictionary, texts = prepro_file_load('doc2bow'), prepro_file_load('corpora'), list(
        prepro_file_load('doc2pre_text').values())

    # perplexity is computed differently for standard models and MultiModels
    if not multi:
        model_perplexity = perplexity(test_documents,
                                      loaded_model.doc_topic, loaded_model.doc_topic_count,
                                      loaded_model.topic_word, loaded_model.topic_word_count)
    else:
        model_perplexity = triple_perplexity(test_documents,
                                             loaded_model.feature_topic, loaded_model.feature_topic_count,
                                             loaded_model.feature2_topic, loaded_model.feature2_topic_count,
                                             loaded_model.topic_word, loaded_model.topic_word_count)
    # coherence and topic difference are computed the same way for both model types
    model_coherence = get_coherence(doc2bow, dictionary, texts, loaded_model.num_topics, loaded_model.topic_word)
    model_topic_difference = mean_topic_diff(loaded_model.topic_word)

    return {"perplexity": model_perplexity,
            "coherence": model_coherence,
            "topic_diff": model_topic_difference}
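# Usage sketch (added for illustration, not part of the original module). The model
# paths are reused from elsewhere in this repository; `test_docs` is an assumed
# stand-in for however the evaluation code loads its held-out test documents.
def example_metric_run(test_docs):
    standard_results = compute_metrics_on_saved_model("../model/models/90_0.01_0.1_category", test_docs)
    multi_results = compute_metrics_on_saved_model("../model/models/90_0.01_0.1_author_category_MultiModel",
                                                   test_docs, multi=True)
    print(standard_results)  # e.g. {"perplexity": ..., "coherence": ..., "topic_diff": ...}
    print(multi_results)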
            table_cell += word
        else:
            table_cell += f"{word} \\\\ "
    table_cell += "}"
    return table_cell


if __name__ == '__main__':
    corpora = pre.prepro_file_load("corpora", "full")
    doc2pre = pre.prepro_file_load('doc2pre_text', folder_name='full')
    doc2raw = pre.prepro_file_load('doc2raw_text', folder_name='full')
    id2category = pre.prepro_file_load('id2category', folder_name='full')
    id2author = pre.prepro_file_load('id2author', folder_name='full')

    model_path = "../model/models/90_0.01_0.1_author"
    model_type = model_path.split("_")[-1]
    model = load_model(model_path) if model_type != "MultiModel" else load_model(model_path, multi=True)
    num_topics = model.num_topics
    topic_word_dist = model.topic_word

    if model_type == "geographic":
        id2category = pre.prepro_file_load('id2category', folder_name='full_geographic')
    elif model_type == "topical":
        id2category = pre.prepro_file_load('id2category', folder_name='full_topical')
    model_type = "category" if model_type in ("geographic", "topical") else model_type

    item_top_topics = sample_and_sort_items(model, num_items=7)
    topic_top_words = get_topics(corpora, num_topics, topic_word_dist)

    # printing item-topic -> topic-word connections
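    # A minimal sketch of the printing step named in the comment above, assuming
    # sample_and_sort_items returns a mapping of the form
    # {item_id: [(topic_id, probability), ...]}, as its use in
    # print_metadata_documents below suggests:
    for item_id, topics in item_top_topics.items():
        item_name = id2author[item_id] if model_type == "author" else id2category[item_id]
        print(f"{model_type} '{item_name}' top topics:")
        for topic_id, probability in topics:
            print(f"  {probability:.2f} {topic_id}: {topic_top_words[topic_id]}")
        print()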
def print_top_topics_geographic_and_topical(num_top_topics: int = 20):
    model_path = "../model/models/90_0.01_0.1_author_category_MultiModel"
    model_type = model_path.split("_")[-1]
    model = load_model(model_path) if model_type != "MultiModel" else load_model(model_path, multi=True)
    corpora = pre.prepro_file_load("corpora", "full")
    id2category = pre.prepro_file_load('id2category', folder_name='full')
    num_topics = model.num_topics
    topic_word_dist = model.topic_word

    # names of categories that are based on a geographic location
    geographic_category_names = ["Frederikshavn-avis", "Morsø Debat", "Morsø-avis", "Rebild-avis",
                                 "Brønderslev-avis", "Thisted-avis", "Jammerbugt-avis", "Vesthimmerland-avis",
                                 "Hjørring-avis", "Aalborg-avis", "Morsø Sport", "Thisted sport",
                                 "Mariagerfjord-avis", "Udland-avis"]
    geographic_category_ids = get_category_ids_from_names(id2category, geographic_category_names)
    # categories not based on geographic locations are closer to real topics
    topical_category_ids = [category_id for category_id in id2category.keys()
                            if category_id not in geographic_category_ids]

    if model_type == "MultiModel":
        category_topic = model.feature2_topic
    else:
        category_topic = model.doc_topic
    category_topic = row_distribution_normalization(category_topic)

    # separate the geographic and topical topic distributions and sort on the topics' summed distribution values
    sorted_geographic = delete_rows_and_sort(category_topic, topical_category_ids)
    sorted_topical = delete_rows_and_sort(category_topic, geographic_category_ids)

    # find topic IDs that appear in both top topic lists, and those unique to one list
    top_geographic = list(sorted_geographic.keys())[:num_top_topics]
    top_topical = list(sorted_topical.keys())[:num_top_topics]
    top_multiple_topics = [topic for topic in top_geographic if topic in top_topical]
    top_unique_topics = list(set(top_geographic + top_topical))
    for topic in top_multiple_topics:
        top_unique_topics.remove(topic)
    for index, topic in enumerate(top_unique_topics):
        if topic in top_geographic:
            top_unique_topics[index] = ("geographic", topic)
        else:
            top_unique_topics[index] = ("topical", topic)

    topic_top_words = get_topics(corpora, num_topics, topic_word_dist)

    # print observations
    print("Top topics for only geographic categories:")
    for topic_id, value in list(sorted_geographic.items())[:num_top_topics]:
        print(f"{value:.2f}, {topic_id}: {topic_top_words[topic_id]}")
    print()
    print("Top topics for only topical categories:")
    for topic_id, value in list(sorted_topical.items())[:num_top_topics]:
        print(f"{value:.2f}, {topic_id}: {topic_top_words[topic_id]}")
    print()
    print("Appears in both lists:")
    print(f"{len(top_multiple_topics)}/{num_top_topics}")
    for topic in top_multiple_topics:
        print(f"{topic}: {topic_top_words[topic]}")
    print()
    print("Unique topics:")
    for topic in top_unique_topics:
        print(f"{topic}: {topic_top_words[topic[1]]}")
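# Minimal sketches (assumptions, not the repository's originals) of the two helpers
# the function above relies on, inferred from its comments: row-normalizing the
# category-topic matrix into distributions, then summing the remaining rows per topic
# and sorting topics by that summed mass.
import numpy as np

def row_distribution_normalization_sketch(matrix: np.ndarray) -> np.ndarray:
    # divide each row by its sum so every row becomes a probability distribution
    return matrix / matrix.sum(axis=1, keepdims=True)

def delete_rows_and_sort_sketch(matrix: np.ndarray, row_ids) -> dict:
    # drop the given rows, sum the remaining distribution mass per topic (column),
    # and return {topic_id: summed_value} sorted by value in descending order
    remaining = np.delete(matrix, list(row_ids), axis=0)
    column_sums = remaining.sum(axis=0)
    return {topic_id: value
            for topic_id, value in sorted(enumerate(column_sums), key=lambda pair: pair[1], reverse=True)}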
def print_metadata_documents(metadata_type: str, metadata_name: str, sample_size: int = 10, print_top_topics=False):
    # load necessary data
    doc2pre = pre.prepro_file_load('doc2pre_text', folder_name='full')
    doc2raw = pre.prepro_file_load('doc2raw_text', folder_name='full')
    id2doc = pre.prepro_file_load('id2doc', folder_name='full')
    doc2author = pre.prepro_file_load('doc2author', folder_name='full')
    doc2category = pre.prepro_file_load('doc2category', folder_name='full')
    doc2taxonomy = pre.prepro_file_load('doc2taxonomy', folder_name='full')
    id2category = pre.prepro_file_load('id2category', folder_name='full')
    id2author = pre.prepro_file_load('id2author', folder_name='full')
    id2taxonomy = pre.prepro_file_load('id2taxonomy', folder_name='full')

    # pick the metadata mappings matching the requested type (reusing the data loaded above)
    if metadata_type == "category":
        doc2meta, id2meta = doc2category, id2category
    elif metadata_type == "author":
        doc2meta, id2meta = doc2author, id2author
    elif metadata_type == "taxonomy":
        doc2meta, id2meta = doc2taxonomy, id2taxonomy
    else:
        print(f"'{metadata_type}' not found!")
        exit()

    # get metadata ID from name (examples: '26. Frederik', 'System Administrator', 'EMNER')
    metaID = find_id_from_value(id2meta, metadata_name)

    # get document IDs for documents with the given metadata
    documentIDs = get_metadata_document_ids(doc2meta, metaID)

    documents = {}
    documentsRaw = {}
    docAuthors = {}
    docCategories = {}
    docTaxonomies = {}
    docFileNames = {}
    # get data based on document IDs
    for docID in documentIDs:
        documents[docID] = doc2pre[docID]
        documentsRaw[docID] = doc2raw[docID]
        docAuthors[docID] = doc2author[docID]
        docCategories[docID] = doc2category[docID]
        docTaxonomies[docID] = doc2taxonomy[docID]
        docFileNames[docID] = id2doc[docID]

    # document set information
    print(f"{len(documents)} documents found\n")
    print_meta_document_set_info(docAuthors, id2author, "authors")
    print_meta_document_set_info(docCategories, id2category, "categories")
    print_meta_document_set_info(docTaxonomies, id2taxonomy, "taxonomies")

    # random examples of documents with metadata information
    if print_top_topics:
        corpora = pre.prepro_file_load("corpora", "full")
        model_path = "../model/models/90_0.01_0.1_category"
        model = load_model(model_path)
        num_topics = model.num_topics
        topic_word_dist = model.topic_word
        topic_top_words = get_topics(corpora, num_topics, topic_word_dist)

    print("Random documents:")
    sampleIDs = random.sample(documentIDs, len(documentIDs))
    for doc_id in sampleIDs[:sample_size]:
        print(f"ID: {doc_id}")
        print(f"Author: {id2author[docAuthors[doc_id]]}")
        print(f"Category: {id2category[docCategories[doc_id]]}")
        print(f"Taxonomy: {[id2taxonomy[taxID] for taxID in docTaxonomies[doc_id]]}")
        print(f"File name: {docFileNames[doc_id]}")
        print(documents[doc_id])
        print(documentsRaw[doc_id] + "\n")
        if print_top_topics:
            item_top_topics = sample_and_sort_items(model, item_id=docCategories[doc_id])
            print("Top words in category top topics:")
            for item in item_top_topics.items():
                for topic in item[1]:
                    print(f"Topic ID/probability: {topic[0]}/{topic[1]:.2f} {topic_top_words[topic[0]]}")
            print()
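# Example invocation (assumed, not in the original file). "Aalborg-avis" is one of the
# category names listed in print_top_topics_geographic_and_topical above.
if __name__ == '__main__':
    print_metadata_documents("category", "Aalborg-avis", sample_size=5, print_top_topics=True)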