# Example #1
def triple_perplexity(documents: "List[Tuple[int, np.ndarray]]",
                      author_topic, author_topic_c,
                      category_topic, category_topic_c,
                      topic_word, topic_word_c) -> float:
    """
    Calculates the perplexity based on the documents given.

    Perplexity is exp(-(1/N) * sum of per-word log-likelihoods), where each
    word's likelihood marginalizes over topics by summing the element-wise
    product of three normalized count vectors (topic-word, category-topic
    and author-topic).

    :param documents: pairs of (document id, array of word ids) — note the
                      loop unpacks each element as ``d, doc``, so the former
                      ``List[np.ndarray]`` annotation did not match usage
    :param author_topic: author-by-topic matrix  # presumably sampler counts — TODO confirm
    :param author_topic_c: per-author totals normalizing author_topic rows
    :param category_topic: category-by-topic matrix
    :param category_topic_c: per-category totals normalizing category_topic rows
    :param topic_word: topic-by-word matrix
    :param topic_word_c: per-topic totals normalizing topic_word columns
    :return: the perplexity of the documents given
    """
    n = 0      # total number of word tokens seen
    ll = 0.0   # accumulated log-likelihood
    # doc id -> author id / category id maps produced during preprocessing
    doc2author = prepro_file_load("doc2author")
    doc2category = prepro_file_load("doc2category")
    for d, doc in documents:
        author = doc2author[d]
        category = doc2category[d]
        for w in doc:
            # P(w) = sum over topics of P(w|t) * P(t|category) * P(t|author)
            ll += np.log(((topic_word[:, w] / topic_word_c) *
                          (category_topic[category, :] / category_topic_c[category]) *
                          (author_topic[author, :] / author_topic_c[author])).sum())
            n += 1
    # NOTE(review): raises ZeroDivisionError when documents contain no words
    return np.exp(ll / (-n))
# Example #2
def compute_metrics_on_saved_model(save_model: "Union[str, Model]", test_documents, multi: bool = False):
    """
    Compute the three metrics (perplexity, coherence and topic diff) on a saved model.

    :param save_model: the name of a saved model, or an already-loaded Model instance
    :param test_documents: the test documents we want to test on
    :param multi: when True, treat the model as a two-feature model and use
                  triple_perplexity instead of perplexity
    :return: a dict containing the results
    """
    # NOTE(review): the original annotation `str or Model` evaluates to just
    # `str` at definition time; a quoted Union spells out the real intent.
    if isinstance(save_model, str):
        # load_model's second argument selects the multi-feature loader.
        loaded_model = load_model(save_model, True) if multi else load_model(save_model)
    else:
        loaded_model = save_model

    doc2bow = prepro_file_load('doc2bow')
    dictionary = prepro_file_load('corpora')
    texts = list(prepro_file_load('doc2pre_text').values())

    if multi:
        model_perplexity = triple_perplexity(test_documents,
                                             loaded_model.feature_topic,
                                             loaded_model.feature_topic_count,
                                             loaded_model.feature2_topic,
                                             loaded_model.feature2_topic_count,
                                             loaded_model.topic_word,
                                             loaded_model.topic_word_count)
    else:
        model_perplexity = perplexity(test_documents,
                                      loaded_model.doc_topic,
                                      loaded_model.doc_topic_count,
                                      loaded_model.topic_word,
                                      loaded_model.topic_word_count)
    # Coherence and topic difference are identical for both model types,
    # so they are computed once instead of being duplicated per branch.
    model_coherence = get_coherence(doc2bow, dictionary, texts, loaded_model.num_topics, loaded_model.topic_word)
    model_topic_difference = mean_topic_diff(loaded_model.topic_word)
    return {"perplexity": model_perplexity,
            "coherence": model_coherence,
            "topic_diff": model_topic_difference}
def print_metadata_documents(metadata_type: str,
                             metadata_name: str,
                             sample_size: int = 10,
                             print_top_topics=False):
    """
    Print a random sample of documents carrying the given metadata value.

    :param metadata_type: one of "category", "author" or "taxonomy"
    :param metadata_name: the metadata value to look up
                          (examples: '26. Frederik', 'System Administrator', 'EMNER')
    :param sample_size: maximum number of random documents to print
    :param print_top_topics: when True, also load a trained model and print
                             the top words of each document category's top topics
    """
    # load necessary data
    doc2pre = pre.prepro_file_load('doc2pre_text', folder_name='full')
    doc2raw = pre.prepro_file_load('doc2raw_text', folder_name='full')
    id2doc = pre.prepro_file_load('id2doc', folder_name='full')
    doc2author = pre.prepro_file_load('doc2author', folder_name='full')
    doc2category = pre.prepro_file_load('doc2category', folder_name='full')
    doc2taxonomy = pre.prepro_file_load('doc2taxonomy', folder_name='full')
    id2category = pre.prepro_file_load('id2category', folder_name='full')
    id2author = pre.prepro_file_load('id2author', folder_name='full')
    id2taxonomy = pre.prepro_file_load('id2taxonomy', folder_name='full')

    # Reuse the maps loaded above instead of re-loading the chosen one from
    # disk a second time, as the original if/elif chain did.
    meta_lookup = {"category": (doc2category, id2category),
                   "author": (doc2author, id2author),
                   "taxonomy": (doc2taxonomy, id2taxonomy)}
    if metadata_type not in meta_lookup:
        print(f"'{metadata_type}' not found!")
        exit()
    doc2meta, id2meta = meta_lookup[metadata_type]

    # get metadata ID from name (examples: '26. Frederik', 'System Administrator', 'EMNER')
    metaID = find_id_from_value(id2meta, metadata_name)

    # get document IDs for documents with the given metadata
    documentIDs = get_metadata_document_ids(doc2meta, metaID)

    # get data based on document IDs
    documents = {docID: doc2pre[docID] for docID in documentIDs}
    documentsRaw = {docID: doc2raw[docID] for docID in documentIDs}
    docAuthors = {docID: doc2author[docID] for docID in documentIDs}
    docCategories = {docID: doc2category[docID] for docID in documentIDs}
    docTaxonomies = {docID: doc2taxonomy[docID] for docID in documentIDs}
    docFileNames = {docID: id2doc[docID] for docID in documentIDs}

    # document set information
    print(f"{len(documents)} documents found\n")
    print_meta_document_set_info(docAuthors, id2author, "authors")
    print_meta_document_set_info(docCategories, id2category, "categories")
    print_meta_document_set_info(docTaxonomies, id2taxonomy, "taxonomies")

    # random examples of documents with metadata information
    if print_top_topics:
        corpora = pre.prepro_file_load("corpora", "full")
        # NOTE(review): hard-coded model path — consider making it a parameter
        model_path = "../model/models/90_0.01_0.1_category"
        model = load_model(model_path)
        num_topics = model.num_topics
        topic_word_dist = model.topic_word
        topic_top_words = get_topics(corpora, num_topics, topic_word_dist)

    print("Random documents:")
    # random.sample over the full length is simply a shuffled copy of the IDs
    sampleIDs = random.sample(documentIDs, len(documentIDs))
    # 'doc_id' avoids shadowing the builtin 'id' the original used
    for doc_id in sampleIDs[:sample_size]:
        print(f"ID: {doc_id}")
        print(f"Author: {id2author[docAuthors[doc_id]]}")
        print(f"Category: {id2category[docCategories[doc_id]]}")
        print(
            f"Taxonomy: {[id2taxonomy[taxID] for taxID in docTaxonomies[doc_id]]}")
        print(f"File name: {docFileNames[doc_id]}")
        print(documents[doc_id])
        print(documentsRaw[doc_id] + "\n")
        if print_top_topics:
            item_top_topics = sample_and_sort_items(model,
                                                    item_id=docCategories[doc_id])
            print("Top words in category top topics:")
            for item in item_top_topics.items():
                for topic in item[1]:
                    print(
                        f"Topic ID/probability: {topic[0]}/{'{:.2f}'.format(topic[1])} {topic_top_words[topic[0]]}"
                    )
            print()
# Example #4
def hellinger_dis(p, q):
    """Return the Hellinger distance between discrete distributions p and q."""
    sqrt_diff = np.sqrt(p) - np.sqrt(q)
    return norm(sqrt_diff) / _SQRT2


def js_sim(p, q):
    """Return the Jensen-Shannon similarity of p and q (1 minus JS distance)."""
    distance = jensenshannon(p, q)
    return 1 - distance

def hellinger_sim(p, q):
    """Return the Hellinger similarity of p and q (1 minus Hellinger distance)."""
    distance = hellinger_dis(p, q)
    return 1 - distance


if __name__ == '__main__':

    # Which preprocessed dataset folder to read author metadata from.
    folder = "nice"
    # doc id -> author id, and author id -> author name lookups
    doc2auth = prepro_file_load("doc2author", folder_name=folder)
    id2auth = prepro_file_load("id2author", folder_name=folder)

    # Invert doc2auth into author id -> list of document ids.
    auth2doc = {}
    for doc, auth in doc2auth.items():
        auth2doc[auth] = auth2doc.get(auth, []) + [doc]

    in_folder = "nice"
    path = f"../model/generated_files/{in_folder}/"

    # NOTE(review): pickle.load executes arbitrary code if a file is
    # untrusted; these appear to be locally generated model files, so this is
    # presumably fine — confirm the pickles are never user-supplied.
    with open(path + "wta.pickle", "rb") as file:
        wta = pickle.load(file)
    with open(path + "middle.pickle", "rb") as file:
        middle = pickle.load(file)
    with open(path + "topic_word.pickle", "rb") as file:
        topic_word = pickle.load(file)