Code Example #1
    def get_topic_distribution(corpus: Corpus, dataset: str, overwrite: bool = False):
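        # Build per-document LDA topic distributions for the corpus and cache them on
        # disk in doc2vec keyed-vector format (unless a cached file exists and
        # overwrite is False), then load and return the cached topic vectors.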
        if overwrite or not os.path.isfile(f'D:/models/topic_vectors/{dataset}.kv'):
            _, _, topic_model, lda_corpus, doc_ids = TopicModeller.train_lda_mem_eff(corpus)
            topic_vectors = {}
            # print(len(lda_corpus))
            # print(doc_ids)
            for i, doc_id in enumerate(doc_ids):
                doc = lda_corpus[i]
                topic_vectors[doc_id] = np.array([score for (topic, score) in topic_model[doc][0]])

            # print(topic_vectors)
            Vectorization.my_save_doc2vec_format(fname=f'D:/models/topic_vectors/{dataset}.kv',
                                                 doctag_vec=topic_vectors)

        topic_vecs, _ = Vectorization.my_load_doc2vec_format(f'D:/models/topic_vectors/{dataset}.kv')

        # print(topic_vecs.docvecs.doctags)
        # for doctag in topic_vecs.docvecs.doctags:
        #     print(doctag, topic_vecs.docvecs.most_similar(doctag, topn=None))
        # print(topic_model[lda_corpus[0]])
        # for document in topic_model:
        #     doc_id = ...
        #     gensim_doc_id = ...
        #     topic_vectors[doc_id] = topic_model[lda_corpus[gensim_doc_id]]
        return topic_vecs
Code Example #2
def loop_facets(vectors, corpus: Corpus):
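    # Compute facet-wise similarities for every unordered pair of corpus documents,
    # collect them in a long-format DataFrame, write it to CSV and return it.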
    tuples = []
    for i, doc_id_a in enumerate(corpus.documents.keys()):
        for j, doc_id_b in enumerate(corpus.documents.keys()):
            if j > i:
                tuples.extend(
                    Vectorization.get_facet_sims(vectors, corpus, doc_id_a,
                                                 doc_id_b))
    df = pd.DataFrame(tuples, columns=["Facet", "ID_A", "ID_B", "Similarity"])
    df.to_csv('results/facet_evaluation/facet_sims.csv')
    print(df)
    return df
Code Example #3
    def topic_evaluation(self, facet_name: str, doc_id: str,
                         is_series_corpus: bool):
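        # Rank all documents by similarity to doc_id once with the facet vectors and
        # once with the LDA topic vectors, then return the absolute Spearman
        # correlation between the two similarity profiles; the strict variant is set
        # to 0 if the correlation is not significant (p >= 0.05).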
        sim_docs = Vectorization.most_similar_documents(
            self.vectors,
            self.corpus,
            positives=doc_id,
            topn=len(self.corpus.documents.items()),
            feature_to_use=facet_name,
            print_results=False,
            series=is_series_corpus)

        sim_docs = {
            doctag.replace(f'_{facet_name}', ''): sim
            for (doctag, sim) in sim_docs
        }
        topic_sim_docs = Vectorization.most_similar_documents(
            self.topic_vectors,
            self.corpus,
            positives=doc_id,
            topn=len(self.corpus.documents.items()),
            print_results=False,
            series=is_series_corpus)
        topic_sim_docs = {doctag: sim for (doctag, sim) in topic_sim_docs}

        neural_sims = []
        topic_sims = []
        for doc_key in self.corpus.documents.keys():
            neural_sims.append(sim_docs[doc_key])
            topic_sims.append(topic_sim_docs[doc_key])

        spearman_corr, spearman_p = stats.spearmanr(np.array(neural_sims),
                                                    np.array(topic_sims))
        spearman_corr = abs(spearman_corr)
        spearman_corr_strict = spearman_corr
        if spearman_p >= 0.05:
            spearman_corr_strict = 0

        return spearman_corr, spearman_corr_strict
Code Example #4
def get_facet_sims_of_books(vectors, corpus: Corpus, doc_id_a: str,
                            doc_id_b: str):
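    # Compute all facet similarities between two documents and return them as a
    # single-row DataFrame with one column per facet.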
    tuples = Vectorization.get_facet_sims(vectors, corpus, doc_id_a, doc_id_b)
    df = pd.DataFrame(tuples, columns=["Facet", "ID_A", "ID_B", "Similarity"])
    df = df.pivot(index=["ID_A", "ID_B"], columns="Facet", values="Similarity")
    return df
Code Example #5
    # c = Corpus.fast_load(path="corpora/german_series", load_entities=False)
    #
    # vec_path = Vectorizer.build_vec_file_name("all",
    #                                           "no_limit",
    #                                           "german_series",
    #                                           "no_filter",
    #                                           "book2vec",
    #                                           "real")

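    # Load the classic_gutenberg corpus with its book2vec_adv vectors, print the most
    # similar documents to "cb_18" on the "atm" facet, compute all pairwise facet
    # similarities and draw radar charts for two example document pairs.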
    c = Corpus.fast_load(path="../corpora/classic_gutenberg",
                         load_entities=False)

    vec_path = Vectorization.build_vec_file_name("",
                                                 "",
                                                 "classic_gutenberg",
                                                 "no_filter",
                                                 "book2vec_adv",
                                                 "real",
                                                 allow_combination=True)

    vecs, _ = Vectorization.my_load_doc2vec_format(vec_path)

    Vectorization.most_similar_documents(vecs,
                                         c,
                                         positives="cb_18",
                                         feature_to_use="atm")

    big_df = loop_facets(vecs, c)

    radar_chart(get_facet_sims_of_books(vecs, c, c[0].doc_id, c[1].doc_id))
    radar_chart(get_facet_sims_of_books(vecs, c, c[0].doc_id, c[2].doc_id))
Code Example #6
    def eval_vec_loop_eff(cls, corpus, number_of_subparts, corpus_size,
                          data_set, filter_mode, vectorization_algorithm):
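        # Relate document-vector cosine similarity to text length: for every pair of
        # documents record cosine similarity, length similarity and length distance,
        # plot the joint distribution, and return Spearman correlations for the full
        # set as well as for the short / mid / long length subsets.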
        vec_path = Vectorization.build_vec_file_name(number_of_subparts,
                                                     corpus_size,
                                                     data_set,
                                                     filter_mode,
                                                     vectorization_algorithm,
                                                     "real",
                                                     allow_combination=True)
        summation_method = "NF"
        vectors, summation_method = Vectorization.my_load_doc2vec_format(
            vec_path)
        # try:
        #     vectors, _ = Vectorization.my_load_doc2vec_format(vec_path)
        # except FileNotFoundError:
        #     if "_o_" in vectorization_algorithm:
        #         vec_splitted = vectorization_algorithm.split("_o_")[0]
        #         focus_facette = vectorization_algorithm.split("_o_")[1]
        #         base_algorithm = vec_splitted
        #         vec_path = Vectorization.build_vec_file_name(number_of_subparts,
        #                                                      corpus_size,
        #                                                      data_set,
        #                                                      filter_mode,
        #                                                      base_algorithm,
        #                                                      "real",
        #                                                      allow_combination=True)
        #         vectors, _ = Vectorization.my_load_doc2vec_format(vec_path)
        #         summation_method = focus_facette
        #     else:
        #         raise FileNotFoundError

        doctags = vectors.docvecs.doctags.keys()
        doctags = [doctag for doctag in doctags if doctag[-1].isdigit()]
        length_vals = {
            doc_id: len(document.get_flat_tokens_from_disk())
            for doc_id, document in corpus.documents.items()
        }
        # length_vals = {doc_id: len(document.get_flat_document_tokens()) for doc_id, document in
        #                corpus.documents.items()}
        # print(length_vals)
        # histogram(length_vals)

        full_tuples = []
        for doc_id_a in doctags:
            for doc_id_b in doctags:
                cos_sim = vectors.docvecs.similarity(
                    cls.modified_doc_id(doc_id_a, summation_method),
                    cls.modified_doc_id(doc_id_b, summation_method))
                cos_sim = abs(cos_sim)  # use the magnitude of the cosine similarity
                length_sim = TextLengthExperiment.length_similarity(
                    length_vals, doc_id_a, doc_id_b)
                length_abs = TextLengthExperiment.length_abs(
                    length_vals, doc_id_a, doc_id_b)
                full_tuples.append(
                    (doc_id_a, doc_id_b, cos_sim, length_sim, length_abs,
                     length_vals[doc_id_a], length_vals[doc_id_b]))
        full_df = pd.DataFrame(full_tuples,
                               columns=[
                                   'Doc ID A', 'Doc ID B', 'Cosine Similarity',
                                   'Length Similarity', 'Length Distance',
                                   'Length A', 'Length B'
                               ])
        print(full_df)

        tria_cos = cls.triangle_values(full_df, 'Cosine Similarity')
        # tria_len_d = cls.triangle_values(full_df, 'Length Distance')
        tria_len = cls.triangle_values(full_df, 'Length Similarity')

        tuples = []
        for cosine, length, length_a in zip(
                full_df['Cosine Similarity'].to_numpy().flatten(),
                full_df['Length Similarity'].to_numpy().flatten(),
                full_df['Length A'].to_numpy().flatten()):
            tuples.append((cosine, length, length_a))

        # df = pd.DataFrame(tuples, columns=['Cosine Similarity', 'Length Similarity', 'Length A'])
        # pd.paper_plots_tables.scatter_matrix(df, hist_kwds={'bins': 50})
        # plt.show()

        HistScatter(full_df,
                    x0_label='Cosine Similarity',
                    x1_label='Length Similarity',
                    algorithm_name=vectorization_algorithm)

        # print(vectorization_algorithm, stats.pearsonr(full_df[['Cosine Similarity']].to_numpy().flatten(),
        #                                               full_df[['Length Simimarity']].to_numpy().flatten()))

        # print(vectorization_algorithm, stats.pearsonr(tria_cos, tria_len))

        filter_1q, filter_2q, filter_3q = get_short_mid_long(
            full_df, "Length A")

        # noinspection PyTypeChecker
        full_spearman = stats.spearmanr(tria_cos, tria_len)

        full_spearman = f'{full_spearman[0]} [{full_spearman[1]}]'

        short_spearman = stats.spearmanr(
            filter_1q[['Cosine Similarity']].to_numpy().flatten(),
            filter_1q[['Length Similarity']].to_numpy().flatten())

        short_spearman = f'{short_spearman[0]} [{short_spearman[1]}]'

        mid_spearman = stats.spearmanr(
            filter_2q[['Cosine Similarity']].to_numpy().flatten(),
            filter_2q[['Length Similarity']].to_numpy().flatten())

        mid_spearman = f'{mid_spearman[0]} [{mid_spearman[1]}]'

        long_spearman = stats.spearmanr(
            filter_3q[['Cosine Similarity']].to_numpy().flatten(),
            filter_3q[['Length Similarity']].to_numpy().flatten())

        long_spearman = f'{long_spearman[0]} [{long_spearman[1]}]'

        # print(vectorization_algorithm, stats.pearsonr(tria_cos, tria_len_d))
        # print(vectorization_algorithm, stats.spearmanr(tria_cos, tria_len_d))
        # cosine_df = pd.DataFrame(cosine_matrix, index=doctags, columns=doctags)

        return vectorization_algorithm, full_spearman, short_spearman, mid_spearman, long_spearman
Code Example #7
def force_directed_graph(model, corpus: Corpus):
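    # Build a document/word neighborhood graph: for each facet document vector
    # (excluding raw vectors), collect its most similar documents and its most similar
    # tf-idf-relevant words, turn documents and words into sized, colored nodes and
    # the similarities into weighted links, and dump the result as a D3-compatible
    # JSON graph.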
    top_n_docs = 3
    top_n_tfidf_words = 5000
    top_n_words = 50
    words_shown = 7

    labels = []
    word_labels = set()
    word_neighbors = {}
    document_neighbors = defaultdict(list)

    tokenized_document_corpus = CorpusDocumentIterator(corpus,
                                                       lemma=False,
                                                       lower=False)

    dictionary = corpora.Dictionary()
    bow_corpus = [
        dictionary.doc2bow(doc, allow_update=True)
        for doc in tokenized_document_corpus
    ]
    tf_idf_model = TfidfModel(bow_corpus)

    doc_id_mapping = {
        doc_id: i
        for i, doc_id in enumerate(tokenized_document_corpus.doc_ids)
    }

    relevant_words_of_doc = set()
    # tf_idf_score = {}
    relevant_words_with_sims = set()
    for doc_id, doc in zip(tokenized_document_corpus.doc_ids,
                           tf_idf_model[bow_corpus]):
        tuples = [(dictionary[word_id], sim) for word_id, sim in doc]
        # sort by tf-idf score in descending order so the slice below keeps the highest-scoring words
        tuples.sort(key=lambda x: x[1], reverse=True)
        relevant_words_with_sims.update(
            set([(doc_id, word, sim) for word, sim in tuples]))
        tuples = tuples[:top_n_tfidf_words]

        # tf_idf_score[doc_id] = {word: sim for word, sim in tuples}
        relevant_words = set([word for word, sim in tuples])

        relevant_words_of_doc.update(relevant_words)
        # relevant_words_of_doc[doc_id] = relevant_words

    tf_idf_lookup = defaultdict(dict)

    for (doc_id, word, sim) in relevant_words_with_sims:
        tf_idf_lookup[doc_id][word] = sim

    # print(tf_idf_lookup)

    for doc_id in model.docvecs.doctags:
        if str(doc_id)[-1].isalpha() and not str(doc_id).endswith("raw"):
            sim_docs = Vectorization.most_similar_documents(
                model,
                corpus,
                positives=[doc_id],
                topn=top_n_docs,
                print_results=False)
            sim_words = Vectorization.most_similar_words(model,
                                                         positives=[doc_id],
                                                         topn=top_n_words,
                                                         print_results=False)

            labels.append((doc_id, "doc"))
            # sim_words = [(sim_word[0], sim_word[1]) for sim_word in sim_words if
            #              sim_word[0] in relevant_words_of_doc['_'.join(doc_id.split('_')[:-1])]]
            sim_words = [(sim_word[0], sim_word[1]) for sim_word in sim_words
                         if sim_word[0] in relevant_words_of_doc]

            word_neighbors[doc_id] = sim_words
            sim_words = [sim_word[0] for sim_word in sim_words][:words_shown]
            word_labels.update(sim_words)
            document_neighbors[doc_id].extend(sim_docs[1:])

    reverted_word_neighbors = defaultdict(set)
    for doc_id, neighbors in word_neighbors.items():
        for (word_neighbor, sim) in neighbors:
            reverted_word_neighbors[word_neighbor].add(doc_id)
    reverted_word_neighbors = {
        word: len(documents)
        for word, documents in reverted_word_neighbors.items()
        if len(documents) >= 1
    }
    word_labels = [(word, "word") for word in word_labels
                   if word in reverted_word_neighbors.keys()]
    word_neighbors = {
        doc_id: [
            word_sim for word_sim in word_sims
            if word_sim[0] in reverted_word_neighbors.keys()
        ]
        for doc_id, word_sims in word_neighbors.items()
    }

    labels.extend(word_labels)
    # labels = [label.lower() for label in labels]
    # print(labels)
    nodes = []
    label2id = {}

    # word_doc_degree_dict = defaultdict(lambda: 1)
    # for label, neighbors in document_neighbors.items():
    #     for neighbor in neighbors:
    #         print(label, neighbor)
    #         word_doc_degree_dict[neighbor[0]] += 1
    # print(word_doc_degree_dict)

    word_degree_dict = defaultdict(lambda: 1)
    doc_degree_dict = defaultdict(lambda: 1)
    for label, neighbors in word_neighbors.items():
        for neighbor in neighbors:
            # print(label, neighbor)
            doc_degree_dict[label] += 1
            word_degree_dict[neighbor[0]] += 1
    print(word_degree_dict)
    # print(doc_degree_dict)

    for i, (label, typ) in enumerate(labels):
        size = 100
        degree = 2.0
        closeness = 1
        eigenvector = 1

        if typ == "word":
            size = 50
            degree = 1.0
            print(label, typ, label in word_degree_dict)
            if label in word_degree_dict:
                closeness += word_degree_dict[label]
                eigenvector += word_degree_dict[label]**3
        else:
            if label in doc_degree_dict:
                closeness += doc_degree_dict[label]
                eigenvector += doc_degree_dict[label]

        closeness = float(closeness)
        eigenvector = float(eigenvector)

        nodes.append({
            "small": 1.0,
            "documents": closeness,
            "standard": degree,
            "words": eigenvector,
            "colour": colors(label)[0],
            "fontcolour": colors(label)[0],
            "id": doc_id_replace(corpus, label),
            "name": label,
            "value": size
        })
        label2id[label] = i

    links = []
    # print(labels, list(word_neighbors.keys())[-10:])
    for (label, typ) in labels:
        if label in document_neighbors:
            for doc_neighbor in document_neighbors[label]:
                # print(doc_neighbor)
                links.append({
                    "source":
                    label2id[label],
                    "target":
                    label2id[doc_neighbor[0]],
                    "value":
                    int(doc_neighbor[1] * 100),
                    "colour":
                    link_color(label, neighbor=doc_neighbor[0])
                })

        if label in word_neighbors:
            for word_neighbor in word_neighbors[label]:
                if word_neighbor[0] in label2id:
                    # print(tf_idf_lookup['_'.join(label.split('_')[:-1])][word_neighbor[0]])
                    links.append({
                        "source": label2id[label],
                        "target": label2id[word_neighbor[0]],
                        "value": int(word_neighbor[1] * 100 / 2),
                        "colour": "#cccccc"
                    })

    # word_counter = defaultdict(lambda: 0)
    # for link in links:
    #     word_counter[link["target"]] += 1
    #
    # remove_words = set([word for word in word_labels if word_counter[label2id[word]] < 2])
    # remove_words_id = set([label2id[word] for word in remove_words])
    # links = [link for link in links if link["target"] not in remove_words_id]
    # nodes = [node for node in nodes if node["name"] not in remove_words]
    # all_link_nodes = set([link["target"] for link in links])
    # all_link_nodes.update([link["source"] for link in links])
    # nodes = [node for i, node in enumerate(nodes) if i in all_link_nodes]
    d3_graph = {"nodes": nodes, "links": links}

    with open('../d3/neighborhood.json', 'w', encoding="utf-8") as outfile:
        json.dump(d3_graph, outfile, indent=1)

    return d3_graph
Code Example #8
def neighbor_plot(model, corpus: Corpus):
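    # Project facet document vectors and their most similar words into 2D with UMAP,
    # plot them colored by facet, connect every document to its most similar other
    # document, and show the resulting scatter plot.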
    document_labels = []
    document_vectors = []
    plt.rcParams.update({'font.size': 6})
    neighbors = {}

    word_labels = []
    facets = {}
    word_neighbors = {}
    for doc_id in model.docvecs.doctags:
        if str(doc_id)[-1].isalpha():
            sim_docs = Vectorization.most_similar_documents(
                model, corpus, positives=[doc_id], topn=2, print_results=False)
            # print(sim_docs)
            print(doc_id_replace(corpus, doc_id),
                  doc_id_replace(corpus, sim_docs[-1][0]))

            sim_words = Vectorization.most_similar_words(model,
                                                         positives=[doc_id],
                                                         topn=2,
                                                         print_results=False)

            sim_words = [sim_word[0] for sim_word in sim_words]
            word_labels.extend(sim_words)
            word_neighbors[doc_id] = sim_words
            # the outer guard already ensures doc_id ends with a facet suffix,
            # so map each neighbor word to that facet
            for sim_word in sim_words:
                facets[sim_word] = str(doc_id).split('_')[-1]

            # print(doc_id, sim_docs)
            neighbors[doc_id] = sim_docs[-1][0]
            document_vectors.append(model.docvecs[doc_id])
            document_labels.append(doc_id_replace(corpus, doc_id))

    word_vectors = [model.wv[word] for word in word_labels]

    print(word_labels)
    # print(word_vectors)

    labels = []
    labels.extend(document_labels)
    labels.extend(word_labels)

    vectors = []
    vectors.extend(document_vectors)
    vectors.extend(word_vectors)

    # dim_reduced_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=42)
    dim_reduced_model = UMAP(n_components=2, init='spectral', random_state=42)
    new_values = dim_reduced_model.fit_transform(vectors)

    reduced_dict = {
        label: new_value
        for new_value, label in zip(new_values, labels)
    }
    # print(reduced_dict)

    new_vals = []
    new_labels = []
    new_lines = []
    for doc_id in model.docvecs.doctags:
        if not doc_id.endswith("raw"):
            # note: the trailing "or True" disables the subpart filter, so every doc_id passes
            if "_0_" in doc_id or "_1_" in doc_id or "_2_" in doc_id or "_3_" in doc_id or True:
                try:
                    sim_doc_id = neighbors[doc_id]

                    lab = doc_id_replace(corpus, doc_id)
                    sim_lab = doc_id_replace(corpus, sim_doc_id)
                    dot = reduced_dict[lab]
                    sim_dot = reduced_dict[sim_lab]
                    new_vals.append(dot)
                    new_vals.append(sim_dot)
                    new_labels.append(lab)
                    new_labels.append(sim_lab)

                    x = [dot[0], sim_dot[0]]
                    y = [dot[1], sim_dot[1]]
                    c = colors(doc_id)[0]
                    new_lines.append((x, y, c))
                except KeyError:
                    pass

    for word in word_labels:
        new_vals.append(reduced_dict[word])
        new_labels.append(word)

    new_values = new_vals
    document_labels = new_labels
    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])

    plt.figure(figsize=(16, 16))
    dots = []
    labs = []
    for i in range(len(x)):
        if document_labels[i] in facets:
            color, facet = colors(facets[document_labels[i]])
        else:
            color, facet = colors(document_labels[i])
        ax = plt.scatter(x[i], y[i], c=color)
        if facet not in labs:
            dots.append(ax)
            labs.append(facet)
        plt.annotate(document_labels[i].split(" - ")[0],
                     xy=(x[i], y[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')

    for i, line in enumerate(new_lines):
        plt.plot(line[0], line[1], color=line[2])
    labs = [label_replace(lab) for lab in labs]

    plt.legend(
        dots,
        labs,
        # scatterpoints=1,
        loc='best',
        ncol=2,
        fontsize=20)

    plt.show()
Code Example #9
if __name__ == '__main__':
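    # Load the vectors and corpus for the selected dataset, then draw a t-SNE plot
    # and write the force-directed neighborhood graph.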
    # data_set_name = "classic_gutenberg"
    # data_set_name = "german_books"
    data_set_name = "goodreads_genres"
    vectorization_algorithm = "book2vec"
    filter_mode = "no_filter"  # alternative: "specific_words_strict"
    vec_path = Vectorization.build_vec_file_name("all",
                                                 "no_limit",
                                                 data_set_name,
                                                 filter_mode,
                                                 vectorization_algorithm,
                                                 "real",
                                                 allow_combination=True)
    vecs, summation_method = Vectorization.my_load_doc2vec_format(vec_path)

    c = Corpus.fast_load("all",
                         "no_limit",
                         data_set_name,
                         filter_mode,
                         "real",
                         load_entities=False)

    tsne_plot(vecs, c)
    # neighbor_plot(vecs, c)
    force_directed_graph(vecs, c)
Code Example #10
def get_neighbors(data_sets: List[str], vector_names: List[str]):
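    # For each dataset and embedding type, retrieve the most similar documents for
    # every book on the selected facet(s) and write the neighbor table to CSV.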
    doc_top_n = 3
    facet_names = [
        #     "loc",
        #     "time",
        #     "atm",
        #     "sty",
        "cont",
        # "plot"
    ]
    is_series_corpus = False
    tuples = []
    columns = None
    for data_set in data_sets:
        corpus = Corpus.fast_load(path=os.path.join('../corpora', data_set),
                                  load_entities=False)
        for vector_name in tqdm(vector_names,
                                desc="Iterate through embedding types",
                                total=len(vector_names)):
            vec_path = Vectorization.build_vec_file_name(
                "all",
                "no_limit",
                data_set,
                "no_filter",
                vector_name,
                "real",
                allow_combination=True)

            vectors, _ = Vectorization.my_load_doc2vec_format(vec_path)

            for doc_id in corpus.documents.keys():
                for facet_name in facet_names:
                    sim_docs = Vectorization.most_similar_documents(
                        vectors,
                        corpus,
                        positives=doc_id,
                        topn=doc_top_n,
                        feature_to_use=facet_name,
                        print_results=False,
                        series=is_series_corpus)[1:]
                    if len(sim_docs) == 2:
                        tuples.append(
                            (data_set, vector_name, facet_name,
                             table_format(corpus.documents[doc_id]), 1,
                             table_format(corpus.documents[replace_sim_id(
                                 sim_docs[0][0])]),
                             table_format(corpus.documents[replace_sim_id(
                                 sim_docs[1][0])])))
                        columns = [
                            "Dataset", "Algorithm", "Facet", "Book", "Rank",
                            "First Neighbor", "Second Neighbor"
                        ]
                    else:
                        for i, (sim_doc_id, sim) in enumerate(sim_docs):
                            tuples.append(
                                (data_set, vector_name, facet_name,
                                 table_format(corpus.documents[doc_id]), i,
                                 table_format(corpus.documents[replace_sim_id(
                                     sim_doc_id)]), sim))
                        columns = [
                            "Dataset", "Algorithm", "Facet", "Book", "Rank",
                            "Similar Book", "Similarity"
                        ]
    df = pd.DataFrame(tuples, columns=columns)
    df.to_csv("results/neighbors/neighbors.csv")

    print(df)
Code Example #11
def calculate_facet_scores(data_sets: List[str],
                           vector_names: List[str],
                           facets: List[str],
                           use_topic_vecs: bool = False):
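    # Evaluate the facet separation of each embedding: per dataset and embedding type,
    # run the FacetEfficientEvaluation and aggregate the relaxed, strict and
    # facet-only scores per facet into a results table (CSV and LaTeX).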
    results = []
    for data_set in data_sets:
        corpus = Corpus.fast_load(path=os.path.join('../corpora', data_set),
                                  load_entities=False)
        topic_dict = None
        summary_dict = None
        if "cont" in facets:
            topic_dict = TopicModeller.topic_modelling(corpus)
        if "plot" in facets:
            summary_dict = Summarizer.get_summary(corpus)
        start_time = time.time()
        if use_topic_vecs:
            topic_vecs = TopicModeller.get_topic_distribution(corpus, data_set)
        else:
            topic_vecs = None
        for vector_name in tqdm(vector_names,
                                desc="Iterate through embedding types",
                                total=len(vector_names)):
            # print('---')
            vec_path = Vectorization.build_vec_file_name(
                "all",
                "no_limit",
                data_set,
                "no_filter",
                vector_name,
                "real",
                allow_combination=True)

            vecs, _ = Vectorization.my_load_doc2vec_format(vec_path)
            adv_mode = False
            if "_adv" in vector_name:
                adv_mode = True
            fee = FacetEfficientEvaluation(vectors=vecs,
                                           corpus=corpus,
                                           data_set_name=data_set,
                                           facet_names=facets,
                                           topic_vectors=topic_vecs)
            fac_relaxed_scores, fac_strict_scores, fac_strict_fac_only = fee.evaluate(
                word_top_n=100,
                topic_dict=topic_dict,
                summary_dict=summary_dict,
                adv_mode=adv_mode)

            for fac_name in facets:
                results.append(
                    (data_set, vector_name, fac_name,
                     fac_relaxed_scores[fac_name], fac_strict_scores[fac_name],
                     fac_strict_fac_only[fac_name]))

        tuples = []
        for result in results:
            data_set, vector_name, fac_name, relaxed_scores, strict_scores, fac_only_scores = result
            tuples.append((data_set, fac_name, vector_name,
                           sum(relaxed_scores) / len(relaxed_scores),
                           sum(strict_scores) / len(strict_scores),
                           sum(fac_only_scores) / len(fac_only_scores)))

        df = pd.DataFrame(tuples,
                          columns=[
                              'Corpus', 'Facet', 'Algorithm', 'Relaxed Score',
                              'Strict Score', 'Facet Only Score'
                          ])
        df = df.sort_values([
            'Corpus', 'Facet', 'Algorithm', 'Relaxed Score', 'Strict Score',
            'Facet Only Score'
        ])
        print(df)
        df.to_csv('results/facet_evaluation/facet_task_results.csv',
                  index=False)
        print(df.to_latex(index=False))
        results = []
        a_time = time.time() - start_time
        start_time = time.time()

        # for vector_name in tqdm(vector_names, desc="Iterate through embedding types", total=len(vector_names)):
        #     print('---')
        #     vec_path = Vectorizer.build_vec_file_name("all",
        #                                               "no_limit",
        #                                               data_set,
        #                                               "no_filter",
        #                                               vector_name,
        #                                               "real")
        #
        #     vecs = Vectorizer.my_load_doc2vec_format(vec_path)
        #
        #     for fac_name in tqdm(facets, total=len(facets), desc="Iterate through facetes"):
        #         fe = FacetEvaluation(fac_name, vecs, c, data_set)
        #         relaxed_scores, strict_scores = fe.evaluate()
        #         results.append((data_set, vector_name, fac_name, relaxed_scores, strict_scores))
        #
        # tuples = []
        # for result in results:
        #     data_set, vector_name, fac_name, relaxed_scores, strict_scores = result
        #     tuples.append((data_set, vector_name, fac_name,
        #                    sum(relaxed_scores) / len(relaxed_scores), sum(strict_scores) / len(strict_scores)))
        #
        # df = pd.DataFrame(tuples, columns=['Corpus', 'Algorithm', 'Facet', 'Relaxed Score', 'Strict Score'])
        # print(df)
        # df.to_csv('results/facet_evaluation/facet_task_results.csv', index=False)

        b_time = time.time() - start_time
        print(a_time, b_time)
Code Example #12
    def word_neighborhood_evaluation(self, facet_name: str, doc_id: str,
                                     doc_top_n: int, word_top_n: int,
                                     is_series_corpus: bool,
                                     document: Document, topic_dict,
                                     summary_dict, adv_mode: bool):
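        # Score one facet of one document via word-neighborhood overlap: compare the
        # facet's nearest words with the nearest words of each similar document,
        # weight the overlap by reciprocal rank, and return a relaxed score, a strict
        # (facet-vocabulary-filtered) score and the share of the document's top words
        # that belong to the facet vocabulary.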
        sim_docs = Vectorization.most_similar_documents(
            self.vectors,
            self.corpus,
            positives=doc_id,
            topn=doc_top_n,
            feature_to_use=facet_name,
            print_results=False,
            series=is_series_corpus)

        # print(doc_id, len(sim_docs))
        sim_words = Vectorization.most_similar_words(self.vectors,
                                                     positives=[doc_id],
                                                     topn=word_top_n,
                                                     feature_to_use=facet_name,
                                                     print_results=False)
        sim_words_relaxed = set([word for word, sim in sim_words])
        # print(sim_words, len(sim_words))
        facet_words = self.get_facet_words(document,
                                           facet_name,
                                           topic_dict,
                                           summary_dict,
                                           adv_mode=adv_mode)
        sim_words_strict = set(
            [word for word in sim_words_relaxed if word in facet_words])
        # print(sim_words)
        # print(facet_doc_id)
        # print(sim_docs, sim_words)
        # print('_____')
        shared_word_values_relaxed = []
        shared_word_values_strict = []
        reciprocal_ranks = []
        r = 1
        simple_facet_words = sim_words_strict

        for sim_doc_id, sim_doc in sim_docs:
            if str(sim_doc_id).startswith(doc_id):
                continue
            # print('>>', sim_doc_id, word_top_n)
            sim_doc_words = Vectorization.most_similar_words(
                self.vectors,
                positives=[sim_doc_id],
                topn=word_top_n,
                feature_to_use=None,
                print_results=False)
            # print(len(sim_doc_words))
            # print(sim_words_relaxed)
            # print(sim_doc_words)
            sim_doc_words_relaxed = set([word for word, sim in sim_doc_words])
            sim_doc_words_strict = set([
                word for word in sim_doc_words_relaxed if word in facet_words
            ])
            print(facet_name, sim_doc_words_strict)

            # if facet_name == "plot":
            #     print(facet_name, sim_words)
            #     print(facet_name, sim_words_strict)
            #     print(facet_name, sim_doc_words_strict)

            # if facet_name == "sty":
            #     print(facet_name, sim_words)
            #     print(facet_name, sim_words_strict)
            #     print(facet_name, sim_doc_words_strict)
            shared_words_relaxed = sim_words_relaxed.intersection(
                sim_doc_words_relaxed)
            shared_words_strict = sim_words_strict.intersection(
                sim_doc_words_strict)

            # print(len(sim_words_strict), len(sim_doc_words), len(shared_words_relaxed),
            #       len(shared_words_strict))
            # print(sim_words_relaxed)
            # print(sim_doc_words_relaxed)
            #
            # print('sty', facet_words)
            # print(sim_words_strict)
            # print(sim_doc_words_strict)
            # print()
            shared_word_values_relaxed.append(
                len(shared_words_relaxed) / word_top_n)
            shared_word_values_strict.append(
                len(shared_words_strict) / word_top_n)
            reciprocal_ranks.append(1 / r)
            r += 1
            # print(len(shared_words), shared_words)

        reciprocal_ranks = [
            rank / sum(reciprocal_ranks) for rank in reciprocal_ranks
        ]
        score_relaxed = sum([
            shared_val * rank_val for shared_val, rank_val in zip(
                shared_word_values_relaxed, reciprocal_ranks)
        ])
        score_strict = sum([
            shared_val * rank_val for shared_val, rank_val in zip(
                shared_word_values_strict, reciprocal_ranks)
        ])

        return score_relaxed, score_strict, len(
            simple_facet_words) / word_top_n
Code Example #13
def success_prediction_task(data_set_name: str, success_dict, vector_names):
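    # Predict book success from document vectors: for each embedding type, train SVM
    # classifiers on repeated train/test splits (or k-fold CV if enabled), collect
    # weighted F1 scores, run McNemar significance tests between selected algorithm
    # pairs, and write the score table as CSV and LaTeX.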
    result_tuples = []
    correctness_table = defaultdict(list)
    for vectorization_algorithm in vector_names:
        majority_class = None
        if vectorization_algorithm == "majority_class":
            majority_class = vectorization_algorithm
            vectorization_algorithm = "doc2vec"

        vec_path = Vectorization.build_vec_file_name("all",
                                                     "no_limit",
                                                     data_set_name,
                                                     "no_filter",
                                                     vectorization_algorithm,
                                                     "real",
                                                     allow_combination=True)
        vectors, summation_method = Vectorization.my_load_doc2vec_format(
            vec_path)

        x = []
        y = []
        doc_ids = []
        k_fold_cross_val = None
        for doctag in vectors.docvecs.doctags:
            try:
                if summation_method and f"_{summation_method}" in str(doctag):
                    x.append(vectors.docvecs[doctag])
                    # success = success_dict[doctag.replace(f"_{summation_method}", "")]
                    # if success == "failure":
                    #     success = 0
                    # else:
                    #     success = 1
                    # y.append(success)
                    y.append(0 if success_dict[doctag.replace(
                        f"_{summation_method}", "")] == "failure" else 1)
                    doc_ids.append(doctag.replace(f"_{summation_method}", ""))

                elif not summation_method and str(doctag)[-1].isdigit():
                    doc_splitted = doctag.split("_")
                    if len(doc_splitted) > 1 and doc_splitted[-1][-1].isdigit(
                    ) and doc_splitted[-2][-1].isdigit():
                        pass
                    else:
                        x.append(vectors.docvecs[doctag])
                        y.append(0 if success_dict[doctag] == "failure" else 1)
                        doc_ids.append(doctag)
                else:
                    pass
            except KeyError:
                pass

        counter = defaultdict(lambda: 0)
        print(len(y))
        for truth_val in y:
            counter[truth_val] += 1
        # print(len(doc_ids), len(y))
        # print(doc_ids)

        # print(y)
        classifiers = [
            # RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0),
            sk.svm.LinearSVC(max_iter=10000,
                             class_weight="balanced",
                             dual=False,
                             random_state=42,
                             C=1.5),
            sk.svm.SVC(max_iter=10000,
                       class_weight="balanced",
                       kernel="rbf",
                       random_state=42,
                       C=0.5),
            sk.svm.NuSVC(max_iter=10000,
                         class_weight="balanced",
                         kernel="rbf",
                         gamma="scale",
                         random_state=42,
                         nu=0.5),
            # MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(10, 4), random_state=42, max_iter=10000),
            # LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr', max_iter=1000),
        ]

        # classifiers = [make_pipeline(StandardScaler(), classifier) for classifier in classifiers]
        for classifier in classifiers:
            pipeline_classifier = make_pipeline(StandardScaler(), classifier)
            if k_fold_cross_val:
                weights = [counter[pred] for pred in y]
                # results = cross_val_scores_weighted(classifier, x, y, weights, cv=k_fold_cross_val,
                #                                     metrics=[f1_score, precision_score,
                #                                              recall_score, f1_score])

                scoring = {
                    'accuracy': make_scorer(accuracy_score),
                    'precision': make_scorer(precision_score),
                    'recall': make_scorer(recall_score),
                    'f1_score': make_scorer(f1_score)
                }
                kfold = sk.model_selection.KFold(n_splits=k_fold_cross_val,
                                                 random_state=42,
                                                 shuffle=True)
                results = sk.model_selection.cross_validate(
                    estimator=pipeline_classifier,
                    X=x,
                    y=y,
                    cv=kfold,
                    scoring=scoring)

                result_tuples.append(
                    (classifier.__class__.__name__, vectorization_algorithm,
                     np.mean(results["test_f1_score"]),
                     np.mean(results["test_precision"]),
                     np.mean(results["test_recall"]),
                     np.mean(results["test_accuracy"])))
            else:
                nr_iterations = 100
                f1s = []
                for i in range(nr_iterations):
                    x_train, x_test, y_train, y_test = train_test_split(
                        x, y, test_size=0.30, random_state=i)
                    pipeline_classifier.fit(x_train, y_train)
                    y_pred = pipeline_classifier.predict(x_test)

                    if majority_class:
                        # majority-class baseline: predict the positive class for every sample
                        y_pred = [1 for _ in y_pred]
                        vectorization_algorithm = majority_class

                    # weights = [counter[pred] for pred in y_test]
                    correctness_table[f"{vectorization_algorithm}-{classifier.__class__.__name__}"]\
                        .extend([pred == truth for pred, truth in zip(y_pred, y_test)])
                    f1s.append(
                        f1_score(y_test,
                                 y_pred,
                                 average="weighted",
                                 pos_label=None))
                    # print(weights)
                    # print(vectorization_algorithm, round(classifier.score(x_test, y_test), 4))

                if len(f1s) == 0:
                    f1s.append(0)
                result_tuples.append((
                    classifier.__class__.__name__,
                    vectorization_algorithm,
                    np.mean(f1s),
                    # precision_score(y_test, y_pred, average="weighted"),
                    # recall_score(y_test, y_pred,  average="weighted"),
                    # accuracy_score(y_test, y_pred)
                ))

    print(correctness_table.keys())
    classifiers = ["LinearSVC", "SVC", "NuSVC"]
    for cl in classifiers:
        algo1 = "xlm_pt"
        algo2 = "book2vec_concat"
        true_true = 0
        true_false = 0
        false_true = 0
        false_false = 0
        for e1, e2 in zip(correctness_table[f"{algo1}-{cl}"],
                          correctness_table[f"{algo2}-{cl}"]):
            if e1 and e2:
                true_true += 1
            elif e1 and not e2:
                true_false += 1
            elif not e1 and e2:
                false_true += 1
            elif not e1 and not e2:
                false_false += 1
            else:
                pass
        table = [[true_true, true_false], [false_true, false_false]]
        print(table)
        print(cl)
        mcnemar_sig_text(table)

    df = pd.DataFrame(
        result_tuples,
        columns=[
            "Classifier",
            "Algorithm",
            "Weighted F1",
            # "Precision", "Recall",
            # "Accuracy"
        ])
    print(df)
    df = df.pivot(index='Algorithm',
                  columns='Classifier',
                  values='Weighted F1')
    df.to_csv("../results/book_success_prediction/eval_scores.csv", index=True)
    print(df)
    print(df.to_latex())
Code Example #14
def genre_prediction_task(data_set_name: str, genre_dict, vector_names):
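    # Predict a book's genre from its document vectors: for each embedding type, train
    # a LinearSVC on a train/test split (or k-fold CV if enabled) and report weighted
    # F1, precision, recall and accuracy per algorithm.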
    result_tuples = []
    for vectorization_algorithm in vector_names:

        vec_path = Vectorization.build_vec_file_name("all",
                                                     "no_limit",
                                                     data_set_name,
                                                     "no_filter",
                                                     vectorization_algorithm,
                                                     "real",
                                                     allow_combination=True)
        vectors, summation_method = Vectorization.my_load_doc2vec_format(
            vec_path)

        x = []
        y = []

        k_fold_cross_val = None

        for doctag in vectors.docvecs.doctags:
            if summation_method and f"_{summation_method}" in str(doctag):
                x.append(vectors.docvecs[doctag])
                # success = success_dict[doctag.replace(f"_{summation_method}", "")]
                # if success == "failure":
                #     success = 0
                # else:
                #     success = 1
                # y.append(success)
                y.append(genre_dict[doctag.replace(f"_{summation_method}",
                                                   "")])

            else:
                if str(doctag)[-1].isdigit():
                    x.append(vectors.docvecs[doctag])
                    y.append(genre_dict[doctag])

        # print(y)
        classifiers = [
            # RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0),
            # sk.svm.LinearSVC(max_iter=10000, class_weight="balanced", dual=False, random_state=42),
            sk.svm.LinearSVC(max_iter=20000,
                             class_weight="balanced",
                             dual=True,
                             random_state=42),
            # sk.svm.LinearSVC(max_iter=10000, class_weight="balanced", dual=True, random_state=42, C=5),
            # sk.svm.LinearSVC(max_iter=10000, dual=False, random_state=42),
            # sk.svm.LinearSVC(max_iter=10000, dual=True, random_state=42),
            # MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=42, max_iter=10000),
            # LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr', max_iter=1000)
        ]
        for classifier in classifiers:
            if k_fold_cross_val:
                scoring = {
                    'accuracy': make_scorer(accuracy_score),
                    'precision': make_scorer(precision_score),
                    'recall': make_scorer(recall_score),
                    'f1_score': make_scorer(f1_score)
                }

                kfold = sk.model_selection.KFold(n_splits=k_fold_cross_val,
                                                 random_state=42,
                                                 shuffle=True)

                results = sk.model_selection.cross_validate(
                    estimator=classifier, X=x, y=y, cv=kfold, scoring=scoring)

                result_tuples.append(
                    (classifier.__class__.__name__, vectorization_algorithm,
                     np.mean(results["test_f1_score"]),
                     np.mean(results["test_precision"]),
                     np.mean(results["test_recall"]),
                     np.mean(results["test_accuracy"])))
            else:
                x_train, x_test, y_train, y_test = train_test_split(
                    x, y, test_size=0.30, random_state=42)
                classifier.fit(x_train, y_train)
                y_pred = classifier.predict(x_test)
                # print(z)
                # print(vectorization_algorithm, round(classifier.score(x_test, y_test), 4))
                result_tuples.append(
                    (classifier.__class__.__name__, vectorization_algorithm,
                     f1_score(y_test, y_pred, average='weighted'),
                     precision_score(y_test, y_pred, average='weighted'),
                     recall_score(y_test, y_pred, average='weighted'),
                     accuracy_score(y_test, y_pred)))
    df = pd.DataFrame(result_tuples,
                      columns=[
                          "Classifier", "Algorithm", "F1", "Precision",
                          "Recall", "Accuracy"
                      ])
    df = df.pivot(index='Algorithm', columns=['Classifier'], values='F1')
    df.to_csv("results/book_success_prediction/genre_eval_scores.csv",
              index=True)
    print(df)
    print(df.to_latex())