def triple_perplexity(documents: List[tuple],
                      author_topic, author_topic_c,
                      category_topic, category_topic_c,
                      topic_word, topic_word_c) -> float:
    """
    Calculates the perplexity of the given documents under a multi-feature (author and category) model
    :param documents: a list of (document id, word ids) pairs
    :param author_topic: per-author topic counts
    :param author_topic_c: per-author count totals used for normalisation
    :param category_topic: per-category topic counts
    :param category_topic_c: per-category count totals used for normalisation
    :param topic_word: per-topic word counts
    :param topic_word_c: per-topic count totals used for normalisation
    :return: the perplexity of the documents given
    """
    n = 0
    ll = 0.0
    doc2author = prepro_file_load("doc2author")
    doc2category = prepro_file_load("doc2category")
    for d, doc in documents:
        author = doc2author[d]
        category = doc2category[d]
        for w in doc:
            # p(w | d): word probability marginalised over topics, combining the
            # topic-word, category-topic and author-topic distributions
            ll += np.log(((topic_word[:, w] / topic_word_c) *
                          (category_topic[category, :] / category_topic_c[category]) *
                          (author_topic[author, :] / author_topic_c[author])).sum())
            n += 1
    return np.exp(ll / (-n))
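# Readability sketch (not part of the original code): the per-word probability that
# triple_perplexity sums in log space, written out for a single word. In the notation
# of the function above,
#
#     p(w | d) = sum_z  p(w | z) * p(z | category_d) * p(z | author_d)
#     perplexity = exp(-(1 / N) * sum_{d, w} log p(w | d))
#
# where each conditional is the corresponding count row/column divided by its
# normalising total, exactly as in the vectorised expression above.
def _word_probability(w, author, category,
                      author_topic, author_topic_c,
                      category_topic, category_topic_c,
                      topic_word, topic_word_c) -> float:
    return float(((topic_word[:, w] / topic_word_c) *
                  (category_topic[category, :] / category_topic_c[category]) *
                  (author_topic[author, :] / author_topic_c[author])).sum())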
def compute_metrics_on_saved_model(save_model: str or Model, test_documents, multi: bool = False):
    """
    Computes the three metrics (perplexity, coherence and topic difference) on a saved model
    :param save_model: the name of a saved model (str) or an already loaded Model
    :param test_documents: the test documents to evaluate on
    :param multi: whether the model is a multi-feature model
    :return: a dict containing the results
    """
    if isinstance(save_model, str):
        loaded_model = load_model(save_model, True) if multi else load_model(save_model)
    else:
        loaded_model = save_model

    doc2bow = prepro_file_load('doc2bow')
    dictionary = prepro_file_load('corpora')
    texts = list(prepro_file_load('doc2pre_text').values())

    if not multi:
        model_perplexity = perplexity(test_documents,
                                      loaded_model.doc_topic, loaded_model.doc_topic_count,
                                      loaded_model.topic_word, loaded_model.topic_word_count)
    else:
        model_perplexity = triple_perplexity(test_documents,
                                             loaded_model.feature_topic, loaded_model.feature_topic_count,
                                             loaded_model.feature2_topic, loaded_model.feature2_topic_count,
                                             loaded_model.topic_word, loaded_model.topic_word_count)
    # coherence and topic difference are computed the same way for both model types
    model_coherence = get_coherence(doc2bow, dictionary, texts, loaded_model.num_topics, loaded_model.topic_word)
    model_topic_difference = mean_topic_diff(loaded_model.topic_word)

    return {"perplexity": model_perplexity,
            "coherence": model_coherence,
            "topic_diff": model_topic_difference}
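# Hedged usage sketch (illustrative only): evaluating a saved standard model and a saved
# multi-feature model on a held-out split. The model names below are placeholders for
# whatever models have actually been saved; test_documents is the held-out split in the
# same (document id, word ids) format used by perplexity/triple_perplexity above.
def _example_compute_metrics(test_documents):
    standard_results = compute_metrics_on_saved_model("90_0.01_0.1_category", test_documents)
    multi_results = compute_metrics_on_saved_model("90_0.01_0.1_author_category", test_documents, multi=True)
    # each result is a dict: {"perplexity": ..., "coherence": ..., "topic_diff": ...}
    print(standard_results)
    print(multi_results)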
def print_metadata_documents(metadata_type: str, metadata_name: str,
                             sample_size: int = 10, print_top_topics=False):
    # load necessary data
    doc2pre = pre.prepro_file_load('doc2pre_text', folder_name='full')
    doc2raw = pre.prepro_file_load('doc2raw_text', folder_name='full')
    id2doc = pre.prepro_file_load('id2doc', folder_name='full')
    doc2author = pre.prepro_file_load('doc2author', folder_name='full')
    doc2category = pre.prepro_file_load('doc2category', folder_name='full')
    doc2taxonomy = pre.prepro_file_load('doc2taxonomy', folder_name='full')
    id2category = pre.prepro_file_load('id2category', folder_name='full')
    id2author = pre.prepro_file_load('id2author', folder_name='full')
    id2taxonomy = pre.prepro_file_load('id2taxonomy', folder_name='full')

    # pick the metadata mapping matching the requested type
    if metadata_type == "category":
        doc2meta = doc2category
        id2meta = id2category
    elif metadata_type == "author":
        doc2meta = doc2author
        id2meta = id2author
    elif metadata_type == "taxonomy":
        doc2meta = doc2taxonomy
        id2meta = id2taxonomy
    else:
        print(f"'{metadata_type}' not found!")
        exit()

    # get metadata ID from name (examples: '26. Frederik', 'System Administrator', 'EMNER')
    metaID = find_id_from_value(id2meta, metadata_name)
    # get document IDs for documents with the given metadata
    documentIDs = get_metadata_document_ids(doc2meta, metaID)

    documents = {}
    documentsRaw = {}
    docAuthors = {}
    docCategories = {}
    docTaxonomies = {}
    docFileNames = {}
    # get data based on document IDs
    for docID in documentIDs:
        documents[docID] = doc2pre[docID]
        documentsRaw[docID] = doc2raw[docID]
        docAuthors[docID] = doc2author[docID]
        docCategories[docID] = doc2category[docID]
        docTaxonomies[docID] = doc2taxonomy[docID]
        docFileNames[docID] = id2doc[docID]

    # document set information
    print(f"{len(documents)} documents found\n")
    print_meta_document_set_info(docAuthors, id2author, "authors")
    print_meta_document_set_info(docCategories, id2category, "categories")
    print_meta_document_set_info(docTaxonomies, id2taxonomy, "taxonomies")

    # random examples of documents with metadata information
    if print_top_topics:
        corpora = pre.prepro_file_load("corpora", "full")
        model_path = "../model/models/90_0.01_0.1_category"
        model = load_model(model_path)
        num_topics = model.num_topics
        topic_word_dist = model.topic_word
        topic_top_words = get_topics(corpora, num_topics, topic_word_dist)

    print("Random documents:")
    sampleIDs = random.sample(documentIDs, len(documentIDs))
    for count in range(sample_size):
        if count == len(sampleIDs):
            break
        docID = sampleIDs[count]
        print(f"ID: {docID}")
        print(f"Author: {id2author[docAuthors[docID]]}")
        print(f"Category: {id2category[docCategories[docID]]}")
        print(f"Taxonomy: {[id2taxonomy[taxID] for taxID in docTaxonomies[docID]]}")
        print(f"File name: {docFileNames[docID]}")
        print(documents[docID])
        print(documentsRaw[docID] + "\n")
        if print_top_topics:
            item_top_topics = sample_and_sort_items(model, item_id=docCategories[docID])
            print("Top words in category top topics:")
            for item in item_top_topics.items():
                for topic in item[1]:
                    print(f"Topic ID/probability: {topic[0]}/{topic[1]:.2f} {topic_top_words[topic[0]]}")
            print()
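# Hedged usage sketch (illustrative only): printing a sample of documents written by the
# author "System Administrator" (one of the example values mentioned in the comment above),
# including the top topics of the saved category model referenced in the function.
def _example_print_author_documents():
    print_metadata_documents("author", "System Administrator",
                             sample_size=5, print_top_topics=True)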
_SQRT2 = np.sqrt(2)  # Hellinger normalisation constant (assumed module-level definition)


def hellinger_dis(p, q):
    # Hellinger distance between two probability distributions, normalised to [0, 1]
    return norm(np.sqrt(p) - np.sqrt(q)) / _SQRT2


def js_sim(p, q):
    # Jensen-Shannon similarity (1 minus the Jensen-Shannon distance)
    return 1 - jensenshannon(p, q)


def hellinger_sim(p, q):
    # Hellinger similarity (1 minus the Hellinger distance)
    return 1 - hellinger_dis(p, q)


if __name__ == '__main__':
    folder = "nice"
    doc2auth = prepro_file_load("doc2author", folder_name=folder)
    id2auth = prepro_file_load("id2author", folder_name=folder)

    # invert the document->author mapping to author->documents
    auth2doc = {}
    for doc, auth in doc2auth.items():
        auth2doc[auth] = auth2doc.get(auth, []) + [doc]

    # load the saved distributions from the generated files
    in_folder = "nice"
    path = f"../model/generated_files/{in_folder}/"
    with open(path + "wta.pickle", "rb") as file:
        wta = pickle.load(file)
    with open(path + "middle.pickle", "rb") as file:
        middle = pickle.load(file)
    with open(path + "topic_word.pickle", "rb") as file:
        topic_word = pickle.load(file)