Beispiel #1
0
def compute_metrics_on_saved_model(save_model: "str | Model", test_documents, multi: bool = False):
    """
    Compute the three metrics (perplexity, coherence and topic diff) on a saved model
    :param save_model: the name of a saved model to load, or an already loaded model object
    :param test_documents: the test documents we want to test on
    :param multi: if True the model is loaded with the multi flag and the perplexity is computed with
        triple_perplexity on the model's two feature-topic distributions instead of its doc-topic distribution
    :return: a dict with the keys "perplexity", "coherence" and "topic_diff"
    """
    # isinstance (instead of an exact type comparison) also accepts str subclasses as model names
    if isinstance(save_model, str):
        loaded_model = load_model(save_model, True) if multi else load_model(save_model)
    else:
        # anything that is not a name is used directly as the model
        loaded_model = save_model

    doc2bow = prepro_file_load('doc2bow')
    dictionary = prepro_file_load('corpora')
    texts = list(prepro_file_load('doc2pre_text').values())

    # only the perplexity computation differs between the two kinds of model
    if multi:
        model_perplexity = triple_perplexity(test_documents,
                                             loaded_model.feature_topic,
                                             loaded_model.feature_topic_count,
                                             loaded_model.feature2_topic,
                                             loaded_model.feature2_topic_count,
                                             loaded_model.topic_word,
                                             loaded_model.topic_word_count)
    else:
        model_perplexity = perplexity(test_documents,
                                      loaded_model.doc_topic,
                                      loaded_model.doc_topic_count,
                                      loaded_model.topic_word,
                                      loaded_model.topic_word_count)

    # coherence and topic difference only use the topic-word distribution, so they are shared
    model_coherence = get_coherence(doc2bow, dictionary, texts, loaded_model.num_topics, loaded_model.topic_word)
    model_topic_difference = mean_topic_diff(loaded_model.topic_word)
    return {"perplexity": model_perplexity,
            "coherence": model_coherence,
            "topic_diff": model_topic_difference}
            table_cell += word
        else:
            table_cell += f"{word} \\\\ "
    table_cell += "}"
    return table_cell


if __name__ == '__main__':
    # preprocessed data of the full data set
    corpora = pre.prepro_file_load("corpora", "full")
    doc2pre = pre.prepro_file_load('doc2pre_text', folder_name='full')
    doc2raw = pre.prepro_file_load('doc2raw_text', folder_name='full')
    id2category = pre.prepro_file_load('id2category', folder_name='full')
    id2author = pre.prepro_file_load('id2author', folder_name='full')

    # the part of the model path after the last underscore names the model type
    model_path = "../model/models/90_0.01_0.1_author"
    model_type = model_path.split("_")[-1]
    if model_type == "MultiModel":
        model = load_model(model_path, multi=True)
    else:
        model = load_model(model_path)
    num_topics = model.num_topics
    topic_word_dist = model.topic_word

    # geographic and topical models bring their own category mapping and are
    # treated as "category" models from here on
    if model_type == "geographic":
        id2category = pre.prepro_file_load('id2category',
                                           folder_name='full_geographic')
        model_type = "category"
    elif model_type == "topical":
        id2category = pre.prepro_file_load('id2category',
                                           folder_name='full_topical')
        model_type = "category"

    item_top_topics = sample_and_sort_items(model, num_items=7)
    topic_top_words = get_topics(corpora, num_topics, topic_word_dist)

    # printing item-topic -> topic-word connections
def print_top_topics_geographic_and_topical(
        num_top_topics: int = 20,
        model_path: str = "../model/models/90_0.01_0.1_author_category_MultiModel"):
    """
    Print the top topics of the geographic categories and of the topical categories, the topics that
    appear in both top lists and the topics that appear in only one of them
    :param num_top_topics: how many top topics to consider per category group; if fewer topics are
        available, all available topics are used
    :param model_path: path of the model to load. If the part after the last underscore is "MultiModel"
        the model is loaded with multi=True and its feature2_topic distribution is used, otherwise
        its doc_topic distribution is used
    """
    model_type = model_path.split("_")[-1]
    is_multi_model = model_type == "MultiModel"
    model = load_model(model_path, multi=True) if is_multi_model else load_model(model_path)
    corpora = pre.prepro_file_load("corpora", "full")
    id2category = pre.prepro_file_load('id2category', folder_name='full')
    num_topics = model.num_topics
    topic_word_dist = model.topic_word

    # names of categories that are based on a geographic location
    geographic_category_names = ["Frederikshavn-avis", "Morsø Debat", "Morsø-avis", "Rebild-avis", "Brønderslev-avis",
                                 "Thisted-avis", "Jammerbugt-avis", "Vesthimmerland-avis", "Hjørring-avis",
                                 "Aalborg-avis", "Morsø Sport", "Thisted sport", "Mariagerfjord-avis", "Udland-avis"]
    geographic_category_ids = get_category_ids_from_names(id2category, geographic_category_names)
    # categories not based on geographic locations are closer to real topics
    topical_category_ids = [category_id for category_id in id2category.keys()
                            if category_id not in geographic_category_ids]
    category_topic = model.feature2_topic if is_multi_model else model.doc_topic
    category_topic = row_distribution_normalization(category_topic)

    # separate the geographic and topical topic distributions and sort on the topics' summed distribution values
    sorted_geographic = delete_rows_and_sort(category_topic, topical_category_ids)
    sorted_topical = delete_rows_and_sort(category_topic, geographic_category_ids)

    # take the top entries once by slicing: this never indexes past the end when there are fewer
    # topics than requested, and avoids rebuilding the key/item lists inside every loop iteration
    limit = max(num_top_topics, 0)
    top_geographic = list(sorted_geographic.items())[:limit]
    top_topical = list(sorted_topical.items())[:limit]
    top_geographic_ids = [topic_id for topic_id, _ in top_geographic]
    top_topical_ids = [topic_id for topic_id, _ in top_topical]
    geographic_id_set = set(top_geographic_ids)
    topical_id_set = set(top_topical_ids)

    # look for topic ID appearances in both top topic lists and unique appearances
    top_multiple_topics = [topic_id for topic_id in top_geographic_ids if topic_id in topical_id_set]
    shared_ids = set(top_multiple_topics)
    # every remaining topic is in exactly one list, so it is tagged with the list it came from
    top_unique_topics = [("geographic" if topic_id in geographic_id_set else "topical", topic_id)
                         for topic_id in set(top_geographic_ids + top_topical_ids)
                         if topic_id not in shared_ids]

    topic_top_words = get_topics(corpora, num_topics, topic_word_dist)

    # print observations
    print("Top topics for only geographic categories:")
    for topic_id, value in top_geographic:
        print(f"{value:.2f}, {topic_id}: {topic_top_words[topic_id]}")

    print()
    print("Top topics for only topical categories:")
    for topic_id, value in top_topical:
        print(f"{value:.2f}, {topic_id}: {topic_top_words[topic_id]}")

    print()
    print("Appears in both lists:")
    print(f"{len(top_multiple_topics)}/{num_top_topics}")
    for topic in top_multiple_topics:
        print(f"{topic}: {topic_top_words[topic]}")

    print()
    print("Unique topics:")
    for topic in top_unique_topics:
        # topic is a (group name, topic ID) pair
        print(f"{topic}: {topic_top_words[topic[1]]}")
def print_metadata_documents(metadata_type: str,
                             metadata_name: str,
                             sample_size: int = 10,
                             print_top_topics: bool = False):
    """
    Print information about the set of documents that have a given metadata value, followed by a
    random sample of those documents
    :param metadata_type: the kind of metadata to look at: "category", "author" or "taxonomy"
    :param metadata_name: the name of the metadata value
        (examples: '26. Frederik', 'System Administrator', 'EMNER')
    :param sample_size: the maximum number of random documents to print
    :param print_top_topics: if True, the category model is loaded and the top words of the top topics
        found for each sampled document's category are printed as well
    :raises SystemExit: if metadata_type is not one of the supported types
    """
    # validate before loading anything, so a wrong type fails right away
    if metadata_type not in ("category", "author", "taxonomy"):
        print(f"'{metadata_type}' not found!")
        # exit() is a helper of the site module for interactive use; raise SystemExit directly
        raise SystemExit

    # load necessary data
    doc2pre = pre.prepro_file_load('doc2pre_text', folder_name='full')
    doc2raw = pre.prepro_file_load('doc2raw_text', folder_name='full')
    id2doc = pre.prepro_file_load('id2doc', folder_name='full')
    doc2author = pre.prepro_file_load('doc2author', folder_name='full')
    doc2category = pre.prepro_file_load('doc2category', folder_name='full')
    doc2taxonomy = pre.prepro_file_load('doc2taxonomy', folder_name='full')
    id2category = pre.prepro_file_load('id2category', folder_name='full')
    id2author = pre.prepro_file_load('id2author', folder_name='full')
    id2taxonomy = pre.prepro_file_load('id2taxonomy', folder_name='full')

    # reuse the mappings loaded above instead of reading the same files a second time
    metadata_mappings = {
        "category": (doc2category, id2category),
        "author": (doc2author, id2author),
        "taxonomy": (doc2taxonomy, id2taxonomy),
    }
    doc2meta, id2meta = metadata_mappings[metadata_type]

    # get metadata ID from name
    meta_id = find_id_from_value(id2meta, metadata_name)

    # get document IDs for documents with the given metadata
    document_ids = get_metadata_document_ids(doc2meta, meta_id)

    documents = {}
    documents_raw = {}
    doc_authors = {}
    doc_categories = {}
    doc_taxonomies = {}
    doc_file_names = {}
    # get data based on document IDs
    for doc_id in document_ids:
        documents[doc_id] = doc2pre[doc_id]
        documents_raw[doc_id] = doc2raw[doc_id]
        doc_authors[doc_id] = doc2author[doc_id]
        doc_categories[doc_id] = doc2category[doc_id]
        doc_taxonomies[doc_id] = doc2taxonomy[doc_id]
        doc_file_names[doc_id] = id2doc[doc_id]

    # document set information
    print(f"{len(documents)} documents found\n")
    print_meta_document_set_info(doc_authors, id2author, "authors")
    print_meta_document_set_info(doc_categories, id2category, "categories")
    print_meta_document_set_info(doc_taxonomies, id2taxonomy, "taxonomies")

    # the model and the topic top words are only needed when top topics are printed
    if print_top_topics:
        corpora = pre.prepro_file_load("corpora", "full")
        model_path = "../model/models/90_0.01_0.1_category"
        model = load_model(model_path)
        num_topics = model.num_topics
        topic_word_dist = model.topic_word
        topic_top_words = get_topics(corpora, num_topics, topic_word_dist)

    # random examples of documents with metadata information
    print("Random documents:")
    # sample only as many documents as are printed (never more than exist, never a negative amount)
    sample_count = max(0, min(sample_size, len(document_ids)))
    for doc_id in random.sample(document_ids, sample_count):
        print(f"ID: {doc_id}")
        print(f"Author: {id2author[doc_authors[doc_id]]}")
        print(f"Category: {id2category[doc_categories[doc_id]]}")
        print(
            f"Taxonomy: {[id2taxonomy[tax_id] for tax_id in doc_taxonomies[doc_id]]}")
        print(f"File name: {doc_file_names[doc_id]}")
        print(documents[doc_id])
        print(documents_raw[doc_id] + "\n")
        if print_top_topics:
            item_top_topics = sample_and_sort_items(model,
                                                    item_id=doc_categories[doc_id])
            print("Top words in category top topics:")
            for top_topics in item_top_topics.values():
                for topic in top_topics:
                    # topic[0] is the topic ID, topic[1] the value printed as its probability
                    print(
                        f"Topic ID/probability: {topic[0]}/{topic[1]:.2f} {topic_top_words[topic[0]]}"
                    )
            print()