def build_series_corpus(corpus: Corpus, annotated_series_corpus_path: str,
                        number_of_subparts: int):
    corpus = Preprocesser.filter_too_small_docs_from_corpus(corpus)
    corpus.fake_series(series_corpus_dir=annotated_series_corpus_path,
                       number_of_sub_parts=number_of_subparts)
    return Corpus.fast_load(path=annotated_series_corpus_path,
                            load_entities=False)
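
Usage sketch (hedged: the corpus path and sub-part count below are illustrative assumptions, not values from the original repository):

# Illustrative only: load an annotated corpus, split every document into
# 5 pseudo-series parts, and work with the reloaded fake-series corpus.
corpus = Corpus.fast_load(path="../corpora/german_books", load_entities=False)
series_corpus = build_series_corpus(
    corpus,
    annotated_series_corpus_path="../corpora/german_books_fake_series_5",
    number_of_subparts=5)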
    @classmethod
    def filter_thresholds(cls, dir_path: str, parallel: bool = False):
        data_set_bar = tqdm(cls.data_sets, total=len(cls.data_sets), desc="2 Operate on dataset!!")
        for data_set in data_set_bar:
            data_set_bar.set_description(f'2 Operate on dataset >{data_set}<')
            data_set_bar.refresh()
            annotated_corpus_path = os.path.join(cls.config["system_storage"]["corpora"], data_set)
            try:
                corpus = Corpus.fast_load(path=annotated_corpus_path, load_entities=False)
            except FileNotFoundError:
                corpus = DataHandler.load_corpus(data_set)
                print('corpus loaded')
                # corpus = Preprocesser.annotate_corpus(corpus, without_spacy=False)
                # corpus.save_corpus_adv(annotated_corpus_path)
                Preprocesser.annotate_and_save(corpus,  corpus_dir=annotated_corpus_path, without_spacy=False)
                print('annotated corpus')
                del corpus
                corpus = Corpus.fast_load(path=annotated_corpus_path, load_entities=False)

                # print('saved corpus')

            if cls.absolute:
                thresholds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                              11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                              25, 50, 100, #1000, 2000, 3000,
                              len(corpus)
                              ]
            else:
                thresholds = cls.thresholds

            threshold_bar = tqdm(thresholds, total=len(thresholds), desc="3 Calculate filter_mode results")
            if parallel:
                Parallel(n_jobs=cls.num_cores)(
                    delayed(CommonWordsExperiment.calculate_vocab_sizes)(corpus, t, data_set=data_set,
                                                                         dir_path=dir_path)
                    for t in threshold_bar)
            else:
                res = {t: CommonWordsExperiment.calculate_vocab_sizes(corpus, t, data_set=data_set,
                                                                      dir_path=dir_path)
                       for t in threshold_bar}

                with open(os.path.join(dir_path, 'all.json'), 'w', encoding='utf-8') as outfile:
                    json.dump(res, outfile, indent=1)
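
A hedged call sketch; the output directory is an assumed placeholder:

# Assumed usage: compute vocabulary sizes for every configured dataset and
# threshold, writing the aggregated results to <dir_path>/all.json.
CommonWordsExperiment.filter_thresholds(dir_path="results/common_words",
                                        parallel=False)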
def corpus2plain_text_dir(source_path: str):
    corpus = Corpus.fast_load(path=source_path, load_entities=False)

    new_dir = os.path.join(config["system_storage"]["corpora"], 'plain_text',
                           f'{os.path.basename(source_path)}_plain')
    print(new_dir)
    if not os.path.isdir(new_dir):
        os.mkdir(new_dir)
    for doc_id, d in corpus.documents.items():
        doc_path = os.path.join(new_dir, f'{doc_id}_{d.language}.txt')
        with open(doc_path, 'w', encoding="utf-8") as writer:
            writer.write('\n'.join([
                ' '.join(sent.representation())
                for sent in d.get_sentences_from_disk()
            ]))
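
Example call (the dataset name is reused from the other snippets on this page):

# Writes one plain-text file per document (one sentence per line, tokens
# space-joined) into <corpora>/plain_text/classic_gutenberg_plain/.
corpus2plain_text_dir(os.path.join(config["system_storage"]["corpora"],
                                   "classic_gutenberg"))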
Example #4
    @classmethod
    def run_experiment(cls, parallel: bool = False):
        # res = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: dict())))
        for data_set in tqdm(cls.data_sets,
                             total=len(cls.data_sets),
                             desc=f"Evaluate datasets"):
            for filter_mode in tqdm(cls.filters,
                                    total=len(cls.filters),
                                    desc=f"Evaluate filters"):
                corpus = Corpus.fast_load("all",
                                          "no_limit",
                                          data_set,
                                          filter_mode,
                                          "real",
                                          load_entities=False)

                vec_bar = tqdm(cls.vectorization_algorithms,
                               total=len(cls.vectorization_algorithms),
                               desc=f"Evaluate algorithm")
                if parallel:
                    tuple_list_results = Parallel(n_jobs=cls.num_cores)(
                        delayed(TextLengthExperiment.eval_vec_loop_eff)(
                            corpus, "all", "no_limit", data_set, filter_mode,
                            vectorization_algorithm)
                        for vectorization_algorithm in vec_bar)
                else:
                    tuple_list_results = [
                        TextLengthExperiment.eval_vec_loop_eff(
                            corpus, "all", "no_limit", data_set, filter_mode,
                            vectorization_algorithm)
                        for vectorization_algorithm in vec_bar
                    ]

                full_df = pd.DataFrame(tuple_list_results,
                                       columns=[
                                           'Algorithm', 'Full Spearman [p]',
                                           'Short Spearman [p]',
                                           'Medium Spearman [p]',
                                           'Long Spearman [p]'
                                       ])

                full_df.to_csv(os.path.join('../results',
                                            'text_length_experiment',
                                            'text_length_spearman.csv'),
                               index=False)
                full_df.to_latex(os.path.join('../results',
                                              'text_length_experiment',
                                              'text_length_spearman.tex'),
                                 index=False)
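
A hedged usage sketch; the class name is taken from the calls inside the method itself:

# Evaluates every configured dataset, filter mode and vectorization
# algorithm, writing the Spearman results as CSV and LaTeX tables.
TextLengthExperiment.run_experiment(parallel=True)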
Example #5
    @staticmethod
    def get_summary(corpus: Corpus):
        if corpus.root_corpus_path is None:
            raise UserWarning("No root corpus set!")
        corpus_root_path = corpus.root_corpus_path
        summary_dict_path = os.path.join(corpus_root_path, "sent_ids.json")
        if not os.path.isfile(summary_dict_path):
            summary_dict = {}
            print("train summary")
            root_corpus = Corpus.fast_load(path=corpus_root_path, load_entities=False)
            for doc_id, doc in root_corpus.documents.items():
                sents, ids = Summarizer.generate_summary_of_corpus_doc(doc, 20)
                # print(doc_id, ":", ids, [' '.join(sent) for sent in sents])
                summary_dict[doc_id] = ids
            with open(summary_dict_path, 'w', encoding='utf-8') as fp:
                json.dump(summary_dict, fp, indent=1)
        else:
            with open(summary_dict_path) as json_file:
                summary_dict = json.load(json_file)
        return summary_dict
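
A minimal usage sketch (the corpus path is illustrative; corpus.root_corpus_path must already be set):

# Returns {doc_id: [selected sentence ids]}, computing and caching
# sent_ids.json next to the root corpus on first use.
corpus = Corpus.fast_load(path="../corpora/classic_gutenberg", load_entities=False)
summary_dict = Summarizer.get_summary(corpus)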
                                                   facet_pred_vals[facet])

    return complete_correlation, facet_correlation


if __name__ == '__main__':
    # c = Corpus.fast_load(path="corpora/german_series", load_entities=False)
    #
    # vec_path = Vectorizer.build_vec_file_name("all",
    #                                           "no_limit",
    #                                           "german_series",
    #                                           "no_filter",
    #                                           "book2vec",
    #                                           "real")

    c = Corpus.fast_load(path="../corpora/classic_gutenberg",
                         load_entities=False)

    vec_path = Vectorization.build_vec_file_name("",
                                                 "",
                                                 "classic_gutenberg",
                                                 "no_filter",
                                                 "book2vec_adv",
                                                 "real",
                                                 allow_combination=True)

    vecs, _ = Vectorization.my_load_doc2vec_format(vec_path)

    Vectorization.most_similar_documents(vecs,
                                         c,
                                         positives="cb_18",
                                         feature_to_use="atm")
def chunk_documents(data_set: str, number_of_subparts: int,
                    corpus_size: Union[int, str]):
    annotated_series_corpus_path = None
    if "_fake_series" in data_set:
        annotated_series_corpus_path = os.path.join(
            config["system_storage"]["corpora"],
            f'{data_set}_{number_of_subparts}_'
            f'{corpus_size}')
        data_set = data_set.replace("_fake_series", "")

    annotated_corpus_path = os.path.join(config["system_storage"]["corpora"],
                                         f'{data_set}')

    # print(annotated_series_corpus_path, annotated_corpus_path)
    if annotated_series_corpus_path:
        try:
            # check if series corpus exists
            # corpus = Corpus(annotated_series_corpus_path)
            corpus = Corpus.fast_load(path=annotated_series_corpus_path,
                                      load_entities=False)
        except FileNotFoundError:
            try:
                # check if general corpus exists
                corpus = Corpus.fast_load(path=annotated_corpus_path,
                                          load_entities=False)
                if corpus_size != "no_limit":
                    corpus = corpus.sample(corpus_size, seed=42)
                corpus = build_series_corpus(corpus,
                                             annotated_series_corpus_path,
                                             number_of_subparts)

                # corpus.save_corpus_adv(annotated_series_corpus_path)
            except FileNotFoundError:
                # load from raw data
                corpus = DataHandler.load_corpus(data_set)
                if corpus_size != "no_limit":
                    corpus = corpus.sample(corpus_size, seed=42)

                Preprocesser.annotate_and_save(
                    corpus,
                    corpus_dir=annotated_corpus_path,
                    without_spacy=False)
                # corpus = Preprocesser.annotate_corpus(corpus)
                # corpus.save_corpus_adv(annotated_corpus_path)

                corpus = build_series_corpus(
                    Corpus.fast_load(path=annotated_corpus_path,
                                     load_entities=False),
                    annotated_series_corpus_path, number_of_subparts)
    else:
        try:
            # check if general corpus exists
            corpus = Corpus.fast_load(path=annotated_corpus_path,
                                      load_entities=False)
            if corpus_size != "no_limit":
                corpus = corpus.sample(corpus_size, seed=42)

            # corpus.save_corpus_adv(annotated_series_corpus_path)
        except FileNotFoundError:
            # load from raw data
            corpus = DataHandler.load_corpus(data_set)
            if corpus_size != "no_limit":
                corpus = corpus.sample(corpus_size, seed=42)

            Preprocesser.annotate_and_save(corpus,
                                           corpus_dir=annotated_corpus_path,
                                           without_spacy=False)
            corpus = Corpus.fast_load(path=annotated_corpus_path,
                                      load_entities=False)

    return corpus
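
A hedged example call; the dataset name is illustrative, while "no_limit" and the "_fake_series" suffix come from the function body above:

# Loads or builds the annotated fake-series variant of the corpus, with
# every book cut into 10 sub-parts and no sampling limit.
corpus = chunk_documents("german_books_fake_series",
                         number_of_subparts=10,
                         corpus_size="no_limit")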
Example #8
def corpus_stats(data_sets: List[str]):
    tuples = []

    for data_set_name in data_sets:
        corpus = Corpus.fast_load("all",
                                  "no_limit",
                                  data_set_name,
                                  "no_filter",
                                  "real",
                                  load_entities=False)
        if corpus.language == Language.DE:
            language = "GER"
        else:
            language = "EN"
        nr_books = human_format(len(corpus.documents))

        document_tokens = [
            document.length for document in corpus.documents.values()
        ]
        tokens_total = human_format(sum(document_tokens))
        tokens_avg = f'{np.mean(document_tokens):.0f} ± {np.std(document_tokens):.0f}'
        # tokens_median = f'{np.median(document_tokens):.0f} ± {iqr(document_tokens):.0f}'
        tokens_median = f'{human_format(np.median(document_tokens))}'
        tokens_iqr = f'{human_format(iqr(document_tokens))}'
        tokens_min = f'{human_format(np.min(document_tokens))}'
        tokens_max = f'{human_format(np.max(document_tokens))}'
        document_vocab = [
            document.vocab_size for document in corpus.documents.values()
        ]
        vocab_total = human_format(sum(document_vocab))
        vocab_avg = f'{np.mean(document_vocab):.0f} ± {np.std(document_vocab):.0f}'
        # vocab_median = f'{np.median(document_vocab):.0f} ± {iqr(document_vocab):.0f}'
        vocab_median = f'{human_format(np.median(document_vocab))}'
        vocab_iqr = f'{human_format(iqr(document_vocab))}'
        # vocab_mix = f'[{human_format(np.min(document_vocab))}, {human_format(np.max(document_vocab))}]'
        vocab_min = f'{human_format(np.min(document_vocab))}'
        vocab_max = f'{human_format(np.max(document_vocab))}'

        document_sents = [
            document.sentences_nr for document in corpus.documents.values()
        ]
        sents_total = sum(document_sents)
        sents_avg = f'{np.mean(document_sents):.0f} ± {np.std(document_sents):.0f}'
        sents_median = f'{np.median(document_sents):.0f} ± {iqr(document_sents):.0f}'

        author_dict = defaultdict(list)
        for doc_id, document in corpus.documents.items():
            author_dict[document.authors].append(doc_id)

        print({
            author: len(doc_ids)
            for author, doc_ids in author_dict.items() if author is not None
        })
        author_vals = [
            len(doc_ids) for author, doc_ids in author_dict.items()
            if author is not None
        ]

        author_median = f'{np.median(author_vals):.0f} ± {iqr(author_vals):.0f} [{np.min(author_vals):.0f}, {np.max(author_vals):.0f}]'
        # author_mean = f'{np.mean(author_vals):.2f} ± {np.std(author_vals):.2f} [{np.min(author_vals):.0f}, {np.max(author_vals):.0f}]'
        author_mean = f'{np.mean(author_vals):.2f}'
        author_std = f'{np.std(author_vals):.2f}'
        author_mix = f'[{np.min(author_vals):.0f}, {np.max(author_vals):.0f}]'
        author_max = f'{np.max(author_vals):.0f}'

        print(data_set_name, "Author median iqr / mean std", author_median,
              author_mean)
        if corpus.series_dict and len(corpus.series_dict) > 0:
            series_vals = [
                len(doc_ids)
                for series_id, doc_ids in corpus.series_dict.items()
                if series_id is not None
            ]
            series_median = f'{np.median(series_vals):.0f} ± {iqr(series_vals):.0f} [{np.min(series_vals):.0f}, {np.max(series_vals):.0f}]'
            # series_mean = f'{np.mean(series_vals):.2f} ± {np.std(series_vals):.2f} [{np.min(series_vals):.0f}, {np.max(series_vals):.0f}]'

            series_mean = f'{np.mean(series_vals):.2f}'
            series_std = f'{np.std(series_vals):.2f}'
            series_mix = f'[{np.min(series_vals):.0f}, {np.max(series_vals):.0f}]'

            series_max = f'{np.max(series_vals):.0f}'
            print(data_set_name, "Series median iqr / mean std", series_median,
                  series_mean)
        else:
            series_median = "-"
            series_mean = "-"
            series_std = "-"
            series_mix = "-"

        if corpus.shared_attributes_dict is None:
            corpus.calculate_documents_with_shared_attributes()
        if corpus.shared_attributes_dict["same_genres"] and len(
                corpus.shared_attributes_dict["same_genres"]) > 1:
            genre_vals = [
                len(doc_ids) for genre, doc_ids in
                corpus.shared_attributes_dict["same_genres"].items()
                if genre is not None
            ]
            # print(genre_vals)
            genre_median = f'{np.median(genre_vals):.0f} ± {iqr(genre_vals):.0f} [{np.min(genre_vals):.0f}, {np.max(genre_vals):.0f}]'
            # genre_mean = f'{np.mean(genre_vals):.2f} ± {np.std(genre_vals):.2f} [{np.min(genre_vals):.0f}, {np.max(genre_vals):.0f}]'
            genre_mean = f'{np.mean(genre_vals):.2f}'
            genre_std = f'{np.std(genre_vals):.2f}'
            genre_mix = f'[{np.min(genre_vals):.0f}, {np.max(genre_vals):.0f}]'

            print(data_set_name, "Genre median iqr / mean std", genre_median,
                  genre_mean)
        else:
            genre_median = "-"
            genre_mean = "-"
            genre_std = "-"
            genre_mix = "-"

        # if corpus and len(corpus.series_dict) > 0:
        #     series_median = np.median([len(doc_ids) for series_id, doc_ids in corpus.series_dict.items()])

        tuples.append((
            data_set_name,
            nr_books,
            language,
            tokens_total,
            tokens_median,
            tokens_iqr,
            tokens_min,
            tokens_max,
            vocab_total,
            vocab_median,
            vocab_iqr,
            vocab_min,
            vocab_max,
            author_mean,
            author_std,
            author_mix,
            series_mean,
            series_std,
            series_mix,
            genre_mean,
            genre_std,
            genre_mix,
        ))
    df = pd.DataFrame(
        tuples,
        columns=[
            "Data set", "Amount of Books", "Language", "Total Tokens",
            "Tokens Median", "Tokens IQR", "Tokens Min", "Tokens Max",
            "Total Vocabulary", "Vocabulary Median", "Vocabulary IQR",
            "Vocabulary Min", "Vocabulary Max", "Author Mean", "Author STD",
            "Author [Min, Max]", "Series Mean", "Series STD",
            "Series [Min, Max]", "Genre Mean", "Genre STD", "Genre [Min, Max]"
            # "Books by Same Author ± STD [Min, Max]",
            # "Books by Same Series ± STD [Min, Max]",
            # "Books by Same Genre ± STD [Min, Max]",
            # "Total Sentences", "Sentences Mean [STD]", "Sentences Median [IQR]",
        ],
        index=data_sets)
    df = df.transpose()
    print(df)
    df.to_csv("results/dataset_stats/sizes.csv", index=True)
    print(df.to_latex(index=True))
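
Illustrative call (dataset names are reused from elsewhere in these examples):

# Prints per-dataset token, vocabulary, author, series and genre statistics
# and writes them to results/dataset_stats/sizes.csv.
corpus_stats(["classic_gutenberg", "german_books", "goodreads_genres"])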
        fontsize=20)

    plt.show()


if __name__ == '__main__':
    # data_set_name = "classic_gutenberg"
    # data_set_name = "german_books"
    data_set_name = "goodreads_genres"
    vectorization_algorithm = "book2vec"
    filter_mode = "no_filter"  # "specific_words_strict"  # "no_filter"
    vec_path = Vectorization.build_vec_file_name("all",
                                                 "no_limit",
                                                 data_set_name,
                                                 filter_mode,
                                                 vectorization_algorithm,
                                                 "real",
                                                 allow_combination=True)
    vecs, summation_method = Vectorization.my_load_doc2vec_format(vec_path)

    c = Corpus.fast_load("all",
                         "no_limit",
                         data_set_name,
                         filter_mode,
                         "real",
                         load_entities=False)

    tsne_plot(vecs, c)
    # neighbor_plot(vecs, c)
    force_directed_graph(vecs, c)
Example #10
def get_neighbors(data_sets: List[str], vector_names: List[str]):
    doc_top_n = 3
    facet_names = [
        #     "loc",
        #     "time",
        #     "atm",
        #     "sty",
        "cont",
        # "plot"
    ]
    is_series_corpus = False
    tuples = []
    columns = None
    for data_set in data_sets:
        corpus = Corpus.fast_load(path=os.path.join('../corpora', data_set),
                                  load_entities=False)
        for vector_name in tqdm(vector_names,
                                desc="Iterate through embedding types",
                                total=len(vector_names)):
            vec_path = Vectorization.build_vec_file_name(
                "all",
                "no_limit",
                data_set,
                "no_filter",
                vector_name,
                "real",
                allow_combination=True)

            vectors, _ = Vectorization.my_load_doc2vec_format(vec_path)

            for doc_id in corpus.documents.keys():
                for facet_name in facet_names:
                    sim_docs = Vectorization.most_similar_documents(
                        vectors,
                        corpus,
                        positives=doc_id,
                        topn=doc_top_n,
                        feature_to_use=facet_name,
                        print_results=False,
                        series=is_series_corpus)[1:]
                    if len(sim_docs) == 2:
                        tuples.append(
                            (data_set, vector_name, facet_name,
                             table_format(corpus.documents[doc_id]), 1,
                             table_format(corpus.documents[replace_sim_id(
                                 sim_docs[0][0])]),
                             table_format(corpus.documents[replace_sim_id(
                                 sim_docs[1][0])])))
                        columns = [
                            "Dataset", "Algorithm", "Facet", "Book", "Rank",
                            "First Neighbor", "Second Neighbor"
                        ]
                    else:
                        for i, (sim_doc_id, sim) in enumerate(sim_docs):
                            tuples.append(
                                (data_set, vector_name, facet_name,
                                 table_format(corpus.documents[doc_id]), i,
                                 table_format(corpus.documents[replace_sim_id(
                                     sim_doc_id)]), sim))
                        columns = [
                            "Dataset", "Algorithm", "Facet", "Book", "Rank",
                            "Similar Book", "Similarity"
                        ]
    df = pd.DataFrame(tuples, columns=columns)
    df.to_csv("results/neighbors/neighbors.csv")

    print(df)
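
A hedged usage sketch, reusing dataset and algorithm names that appear in the other snippets:

# Writes the nearest-neighbor books per document and facet to
# results/neighbors/neighbors.csv.
get_neighbors(data_sets=["classic_gutenberg"],
              vector_names=["book2vec", "book2vec_adv"])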
Example #11
def calculate_facet_scores(data_sets: List[str],
                           vector_names: List[str],
                           facets: List[str],
                           use_topic_vecs: bool = False):
    results = []
    for data_set in data_sets:
        corpus = Corpus.fast_load(path=os.path.join('../corpora', data_set),
                                  load_entities=False)
        topic_dict = None
        summary_dict = None
        if "cont" in facets:
            topic_dict = TopicModeller.topic_modelling(corpus)
        if "plot" in facets:
            summary_dict = Summarizer.get_summary(corpus)
        start_time = time.time()
        if use_topic_vecs:
            topic_vecs = TopicModeller.get_topic_distribution(corpus, data_set)
        else:
            topic_vecs = None
        for vector_name in tqdm(vector_names,
                                desc="Iterate through embedding types",
                                total=len(vector_names)):
            # print('---')
            vec_path = Vectorization.build_vec_file_name(
                "all",
                "no_limit",
                data_set,
                "no_filter",
                vector_name,
                "real",
                allow_combination=True)

            vecs, _ = Vectorization.my_load_doc2vec_format(vec_path)
            adv_mode = False
            if "_adv" in vector_name:
                adv_mode = True
            fee = FacetEfficientEvaluation(vectors=vecs,
                                           corpus=corpus,
                                           data_set_name=data_set,
                                           facet_names=facets,
                                           topic_vectors=topic_vecs)
            fac_relaxed_scores, fac_strict_scores, fac_strict_fac_only = fee.evaluate(
                word_top_n=100,
                topic_dict=topic_dict,
                summary_dict=summary_dict,
                adv_mode=adv_mode)

            for fac_name in facets:
                results.append(
                    (data_set, vector_name, fac_name,
                     fac_relaxed_scores[fac_name], fac_strict_scores[fac_name],
                     fac_strict_fac_only[fac_name]))

        tuples = []
        for result in results:
            data_set, vector_name, fac_name, relaxed_scores, strict_scores, fac_only_scores = result
            tuples.append((data_set, fac_name, vector_name,
                           sum(relaxed_scores) / len(relaxed_scores),
                           sum(strict_scores) / len(strict_scores),
                           sum(fac_only_scores) / len(fac_only_scores)))

        df = pd.DataFrame(tuples,
                          columns=[
                              'Corpus', 'Facet', 'Algorithm', 'Relaxed Score',
                              'Strict Score', 'Facet Only Score'
                          ])
        df = df.sort_values([
            'Corpus', 'Facet', 'Algorithm', 'Relaxed Score', 'Strict Score',
            'Facet Only Score'
        ])
        print(df)
        df.to_csv('results/facet_evaluation/facet_task_results.csv',
                  index=False)
        print(df.to_latex(index=False))
        results = []
        a_time = time.time() - start_time
        start_time = time.time()

        # for vector_name in tqdm(vector_names, desc="Iterate through embedding types", total=len(vector_names)):
        #     print('---')
        #     vec_path = Vectorizer.build_vec_file_name("all",
        #                                               "no_limit",
        #                                               data_set,
        #                                               "no_filter",
        #                                               vector_name,
        #                                               "real")
        #
        #     vecs = Vectorizer.my_load_doc2vec_format(vec_path)
        #
        #     for fac_name in tqdm(facets, total=len(facets), desc="Iterate through facetes"):
        #         fe = FacetEvaluation(fac_name, vecs, c, data_set)
        #         relaxed_scores, strict_scores = fe.evaluate()
        #         results.append((data_set, vector_name, fac_name, relaxed_scores, strict_scores))
        #
        # tuples = []
        # for result in results:
        #     data_set, vector_name, fac_name, relaxed_scores, strict_scores = result
        #     tuples.append((data_set, vector_name, fac_name,
        #                    sum(relaxed_scores) / len(relaxed_scores), sum(strict_scores) / len(strict_scores)))
        #
        # df = pd.DataFrame(tuples, columns=['Corpus', 'Algorithm', 'Facet', 'Relaxed Score', 'Strict Score'])
        # print(df)
        # df.to_csv('results/facet_evaluation/facet_task_results.csv', index=False)

        b_time = time.time() - start_time
        print(a_time, b_time)
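
A hedged example call; the facet abbreviations are taken from the get_neighbors snippet above:

# Computes relaxed/strict facet scores per embedding and writes
# results/facet_evaluation/facet_task_results.csv.
calculate_facet_scores(data_sets=["classic_gutenberg"],
                       vector_names=["book2vec", "book2vec_adv"],
                       facets=["loc", "time", "atm", "sty", "cont", "plot"],
                       use_topic_vecs=False)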