Esempio n. 1
0
            # And increase the topic count
            increase_count(feature, word, topic, feature_topic, feature_topic_c, topic_word, topic_word_c)


if __name__ == '__main__':
    # Run configuration: which metadata feature to model and the sampler
    # hyper-parameters.
    feature = "category"
    alpha = 0.01
    beta = 0.1
    iterationNum = 50
    num_topics = 90

    # Load the preprocessed inputs one by one (same load order as before).
    doc2feature = prepro_file_load(f"doc2{feature}")
    num_feature = len(set(doc2feature.values()))
    doc2word = list(prepro_file_load("doc2word").items())
    doc2bow = prepro_file_load('doc2bow')
    dictionary = prepro_file_load('corpora')
    texts = list(prepro_file_load('doc2pre_text').values())
    D = dictionary.num_docs
    W = len(dictionary)
    train_docs, test_docs = train_test_split(doc2word, test_size=0.33, random_state=1337)

    # Initial state for the sampler, built from the full document list.
    (word_topic_assignment,
     feature_topic,
     feature_topic_c,
     topic_word,
     topic_word_c) = random_initialize(doc2word)

    for i in tqdm(range(iterationNum)):
        gibbs_sampling(train_docs, feature_topic, feature_topic_c, topic_word, topic_word_c, word_topic_assignment)

        # The timestamp is taken before the metrics are computed, matching the
        # left-to-right argument evaluation of a single print(...) call.
        iteration_timestamp = time.strftime('%X')
        iteration_perplexity = x_perplexity(test_docs, feature_topic, feature_topic_c, topic_word, topic_word_c,
                                            doc2feature)
        iteration_coherence = get_coherence(doc2bow, dictionary, texts, num_topics, topic_word)
        iteration_topic_diff = mean_topic_diff(topic_word)
        print(iteration_timestamp, "Iteration: ", i, " Completed",
              " Perplexity: ", iteration_perplexity,
              " Coherence: ", iteration_coherence,
              " Topic Diff: ", iteration_topic_diff)

    # Persist the trained model, then show the words of every topic.
    model = Model(num_topics, alpha, beta, feature_topic, feature_topic_c, topic_word, topic_word_c, feature)
    model.save_model()
    print(get_topics(dictionary, num_topics, topic_word))
Esempio n. 2
0
    model_type = model_path.split("_")[-1]
    model = load_model(
        model_path) if model_type != "MultiModel" else load_model(model_path,
                                                                  multi=True)
    num_topics = model.num_topics
    topic_word_dist = model.topic_word
    if model_type == "geographic":
        id2category = pre.prepro_file_load('id2category',
                                           folder_name='full_geographic')
    elif model_type == "topical":
        id2category = pre.prepro_file_load('id2category',
                                           folder_name='full_topical')
    model_type = "category" if model_type == "geographic" or model_type == "topical" else model_type

    item_top_topics = sample_and_sort_items(model, num_items=7)
    topic_top_words = get_topics(corpora, num_topics, topic_word_dist)

    # printing item-topic -> topic-word connections
    if model_type == "standard":
        print("Random documents with top topics:")
    elif model_type == "category":
        print("Random categories with top topics:")
    elif model_type == "author":
        print("Random authors with top topics:")
    elif model_type == "MultiModel":
        print("Random categories and authors with top topics:")
    else:
        print(f"Model type '{model_type}' not known!")
        exit()

    for item in item_top_topics.items():
Esempio n. 3
0
def print_top_topics_geographic_and_topical(
        num_top_topics: int = 20,
        model_path: str = "../model/models/90_0.01_0.1_author_category_MultiModel"):
    """Print the top topics of geographic categories next to those of topical categories.

    The categories in ``id2category`` are split into a hard-coded list of
    geographically named categories and all remaining ("topical") ones. For
    each group the normalized category-topic rows are reduced and sorted by
    ``delete_rows_and_sort``, and the first ``num_top_topics`` entries of each
    result are printed together with the topics' top words. Finally the topics
    that occur in both top lists and those that occur in only one are printed.

    :param num_top_topics: how many top topics to show per group. If fewer
        topics are available, all of them are shown instead of raising an
        IndexError. Negative values are treated as 0.
    :param model_path: path of the model to load. The text after the last
        ``_`` is taken as the model type; ``"MultiModel"`` selects
        ``load_model(..., multi=True)`` and the ``feature2_topic`` matrix,
        anything else uses ``doc_topic``.
    """
    model_type = model_path.split("_")[-1]
    is_multi_model = model_type == "MultiModel"
    model = load_model(model_path, multi=True) if is_multi_model else load_model(model_path)
    corpora = pre.prepro_file_load("corpora", "full")
    id2category = pre.prepro_file_load('id2category', folder_name='full')
    num_topics = model.num_topics
    topic_word_dist = model.topic_word

    # names of categories that are based on a geographic location
    geographic_category_names = ["Frederikshavn-avis", "Morsø Debat", "Morsø-avis", "Rebild-avis", "Brønderslev-avis",
                                 "Thisted-avis", "Jammerbugt-avis", "Vesthimmerland-avis", "Hjørring-avis",
                                 "Aalborg-avis", "Morsø Sport", "Thisted sport", "Mariagerfjord-avis", "Udland-avis"]
    geographic_category_ids = get_category_ids_from_names(id2category, geographic_category_names)
    # categories not based on geographic locations are closer to real topics
    topical_category_ids = [category_id for category_id in id2category.keys()
                            if category_id not in geographic_category_ids]
    category_topic = model.feature2_topic if is_multi_model else model.doc_topic
    category_topic = row_distribution_normalization(category_topic)

    # separate the geographic and topical topic distributions and sort on the topics' summed distribution values
    # (the rows of the *other* group are the ones handed over for deletion)
    sorted_geographic = delete_rows_and_sort(category_topic, topical_category_ids)
    sorted_topical = delete_rows_and_sort(category_topic, geographic_category_ids)

    # Materialize each top list once. Slicing (instead of indexing inside a
    # range loop) cannot run past the end when fewer topics exist than requested.
    limit = max(num_top_topics, 0)
    top_geographic = list(sorted_geographic.items())[:limit]
    top_topical = list(sorted_topical.items())[:limit]
    top_geographic_ids = [entry[0] for entry in top_geographic]
    top_topical_ids = [entry[0] for entry in top_topical]
    geographic_id_set = set(top_geographic_ids)
    topical_id_set = set(top_topical_ids)

    # look for topic ID appearances in both top topic lists and unique appearances
    top_multiple_topics = [topic for topic in top_geographic_ids if topic in topical_id_set]
    shared_topics = set(top_multiple_topics)
    top_unique_topics = [
        ("geographic" if topic in geographic_id_set else "topical", topic)
        for topic in set(top_geographic_ids + top_topical_ids)
        if topic not in shared_topics
    ]

    topic_top_words = get_topics(corpora, num_topics, topic_word_dist)

    # print observations
    print("Top topics for only geographic categories:")
    for topic in top_geographic:
        print(f"{topic[1]:.2f}, {topic[0]}: {topic_top_words[topic[0]]}")

    print()
    print("Top topics for only topical categories:")
    for topic in top_topical:
        print(f"{topic[1]:.2f}, {topic[0]}: {topic_top_words[topic[0]]}")

    print()
    print("Appears in both lists:")
    print(f"{len(top_multiple_topics)}/{num_top_topics}")
    for topic in top_multiple_topics:
        print(f"{topic}: {topic_top_words[topic]}")

    print()
    print("Unique topics:")
    for topic in top_unique_topics:
        print(f"{topic}: {topic_top_words[topic[1]]}")
Esempio n. 4
0
        doc2word)

    # things needed to calculate coherence
    doc2bow, texts = prepro_file_load('doc2bow', folder_name=in_folder), \
                     list(prepro_file_load('doc2pre_text', folder_name=in_folder).values())

    print("Starting Gibbs")
    for i in range(0, iterationNum):
        gibbs_sampling(doc2word)
        print(
            time.strftime('%X'), "Iteration: ", i, " Completed", "Coherence: ",
            get_coherence(doc2bow, corpora, texts,
                          layer_lengths[len(layer_lengths) - 1],
                          topic_to_word))

    topic_words = get_topics(corpora, layer_lengths[len(layer_lengths) - 1],
                             topic_to_word)
    if K is None:
        topic_words = {
            struct_root[mid_layers_num - 1][i]: topic_words[i]
            for i in range(len(topic_words))
        }
    print(topic_words)

    print('generating distributions')

    # calculate document-topic distribution
    doc_top_dists = [{} for x in range(len(layer_lengths))]
    for id, doc_info in tqdm(enumerate(word_topic_assignment)):
        for l in range(len(layer_lengths)):
            doc_dist = np.zeros(shape=layer_lengths[l])
            for word in doc_info:
Esempio n. 5
0
                       doc_topic_c, topic_word, topic_word_c,
                       word_topic_assignment)
        print(
            time.strftime('%X'), "Iteration: ", i, " Completed",
            " Perplexity: ",
            triple_perplexity(test_docs, author_topic, author_topic_c,
                              doc_topic, doc_topic_c, topic_word,
                              topic_word_c), " Coherence: ",
            get_coherence(doc2bow, dictionary, texts, num_topics,
                          topic_word), " Topic Diff: ",
            mean_topic_diff(topic_word))
        model = MultiModel(num_topics, alpha, beta, author_topic,
                           author_topic_c, doc_topic, doc_topic_c, topic_word,
                           topic_word_c, "author_doc")
        model.save_model()
    topic_words = get_topics(dictionary, num_topics, topic_word)

    print(topic_words)

    print('generating distributions')

    # calculate document-topic distribution
    doc_top_dists = {}
    for id, doc_info in tqdm(enumerate(word_topic_assignment)):
        doc_dist = np.zeros(shape=num_topics)
        for word in doc_info:
            doc_dist[word] += 1
        doc_top_dists[id] = doc_dist / doc_dist.sum()

    # calculate topic-word distribution
    top_word_dists = {i: np.zeros(D) for i in range(num_topics)}
def print_metadata_documents(metadata_type: str,
                             metadata_name: str,
                             sample_size: int = 10,
                             print_top_topics=False):
    """Print statistics and random examples of the documents carrying a given metadata value.

    The metadata value is looked up by name, all documents tagged with it are
    collected, summary information about their authors, categories and
    taxonomies is printed, and up to ``sample_size`` randomly chosen documents
    are printed in full.

    :param metadata_type: one of ``"category"``, ``"author"`` or
        ``"taxonomy"``. Any other value prints a message and raises
        ``SystemExit`` before any data is loaded.
    :param metadata_name: name of the metadata value to look up
        (examples: '26. Frederik', 'System Administrator', 'EMNER').
    :param sample_size: maximum number of random documents to print.
    :param print_top_topics: if true, also load the category model and print,
        for every sampled document, the top words of the top topics of that
        document's category.
    """
    # Fail fast: reject an unknown metadata type before loading any file.
    if metadata_type not in ("category", "author", "taxonomy"):
        print(f"'{metadata_type}' not found!")
        # Same effect as the interactive helper exit(), without depending on
        # the site module having injected it.
        raise SystemExit

    # load necessary data
    doc2pre = pre.prepro_file_load('doc2pre_text', folder_name='full')
    doc2raw = pre.prepro_file_load('doc2raw_text', folder_name='full')
    id2doc = pre.prepro_file_load('id2doc', folder_name='full')
    doc2author = pre.prepro_file_load('doc2author', folder_name='full')
    doc2category = pre.prepro_file_load('doc2category', folder_name='full')
    doc2taxonomy = pre.prepro_file_load('doc2taxonomy', folder_name='full')
    id2category = pre.prepro_file_load('id2category', folder_name='full')
    id2author = pre.prepro_file_load('id2author', folder_name='full')
    id2taxonomy = pre.prepro_file_load('id2taxonomy', folder_name='full')

    # Reuse the mappings loaded above instead of reading the same file again.
    doc2meta, id2meta = {
        "category": (doc2category, id2category),
        "author": (doc2author, id2author),
        "taxonomy": (doc2taxonomy, id2taxonomy),
    }[metadata_type]

    # get metadata ID from name (examples: '26. Frederik', 'System Administrator', 'EMNER')
    meta_id = find_id_from_value(id2meta, metadata_name)

    # get document IDs for documents with the given metadata; copied into a
    # list so the collection can be iterated repeatedly and sampled from
    document_ids = list(get_metadata_document_ids(doc2meta, meta_id))

    # get data based on document IDs
    documents = {doc_id: doc2pre[doc_id] for doc_id in document_ids}
    documents_raw = {doc_id: doc2raw[doc_id] for doc_id in document_ids}
    doc_authors = {doc_id: doc2author[doc_id] for doc_id in document_ids}
    doc_categories = {doc_id: doc2category[doc_id] for doc_id in document_ids}
    doc_taxonomies = {doc_id: doc2taxonomy[doc_id] for doc_id in document_ids}
    doc_file_names = {doc_id: id2doc[doc_id] for doc_id in document_ids}

    # document set information
    print(f"{len(documents)} documents found\n")
    print_meta_document_set_info(doc_authors, id2author, "authors")
    print_meta_document_set_info(doc_categories, id2category, "categories")
    print_meta_document_set_info(doc_taxonomies, id2taxonomy, "taxonomies")

    # the topic model is only needed when top topics are requested
    if print_top_topics:
        corpora = pre.prepro_file_load("corpora", "full")
        model_path = "../model/models/90_0.01_0.1_category"
        model = load_model(model_path)
        num_topics = model.num_topics
        topic_word_dist = model.topic_word
        topic_top_words = get_topics(corpora, num_topics, topic_word_dist)

    # random examples of documents with metadata information
    print("Random documents:")
    # random.sample requires a sequence (sets are rejected from Python 3.11 on),
    # which the list copy above guarantees.
    sample_ids = random.sample(document_ids, len(document_ids))
    # max(..., 0) keeps a negative sample_size printing nothing.
    for doc_id in sample_ids[:max(sample_size, 0)]:
        print(f"ID: {doc_id}")
        print(f"Author: {id2author[doc_authors[doc_id]]}")
        print(f"Category: {id2category[doc_categories[doc_id]]}")
        print(
            f"Taxonomy: {[id2taxonomy[taxID] for taxID in doc_taxonomies[doc_id]]}")
        print(f"File name: {doc_file_names[doc_id]}")
        print(documents[doc_id])
        print(documents_raw[doc_id] + "\n")
        if print_top_topics:
            # the top topics are those of the document's category, whatever
            # metadata_type was requested
            item_top_topics = sample_and_sort_items(model,
                                                    item_id=doc_categories[doc_id])
            print("Top words in category top topics:")
            for top_topics in item_top_topics.values():
                for topic in top_topics:
                    print(
                        f"Topic ID/probability: {topic[0]}/{topic[1]:.2f} {topic_top_words[topic[0]]}"
                    )
            print()