Code example #1
def build_series_corpus(corpus: Corpus, annotated_series_corpus_path: str,
                        number_of_subparts: int):
    corpus = Preprocesser.filter_too_small_docs_from_corpus(corpus)
    corpus.fake_series(series_corpus_dir=annotated_series_corpus_path,
                       number_of_sub_parts=number_of_subparts)
    return Corpus.fast_load(path=annotated_series_corpus_path,
                            load_entities=False)
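A minimal usage sketch; the corpus path and the "_fake_series_2_no_limit" suffix below are illustrative placeholders (loosely following the naming convention seen in code example #7), not values from the original code.

# Hedged usage example; the paths and the sub-part count are placeholders.
base_corpus = Corpus.fast_load(path="corpora/german_books", load_entities=False)
series_corpus = build_series_corpus(base_corpus,
                                    annotated_series_corpus_path="corpora/german_books_fake_series_2_no_limit",
                                    number_of_subparts=2)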
Code example #2
    @classmethod  # decorator assumed for this cls-based method
    def filter_thresholds(cls, dir_path: str, parallel: bool = False):
        data_set_bar = tqdm(cls.data_sets, total=len(cls.data_sets), desc="2 Operate on dataset!!")
        for data_set in data_set_bar:
            data_set_bar.set_description(f'2 Operate on dataset >{data_set}<')
            data_set_bar.refresh()
            annotated_corpus_path = os.path.join(cls.config["system_storage"]["corpora"], data_set)
            try:
                corpus = Corpus.fast_load(path=annotated_corpus_path, load_entities=False)
            except FileNotFoundError:
                corpus = DataHandler.load_corpus(data_set)
                print('corpus loaded')
                # corpus = Preprocesser.annotate_corpus(corpus, without_spacy=False)
                # corpus.save_corpus_adv(annotated_corpus_path)
                Preprocesser.annotate_and_save(corpus, corpus_dir=annotated_corpus_path, without_spacy=False)
                print('annotated corpus')
                del corpus
                corpus = Corpus.fast_load(path=annotated_corpus_path, load_entities=False)

                # print('saved corpus')

            if cls.absolute:
                thresholds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                              11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                              25, 50, 100, #1000, 2000, 3000,
                              len(corpus)
                              ]
            else:
                thresholds = cls.thresholds

            threshold_bar = tqdm(thresholds, total=len(thresholds), desc="3 Calculate filter_mode results")
            if parallel:
                Parallel(n_jobs=cls.num_cores)(
                    delayed(CommonWordsExperiment.calculate_vocab_sizes)(corpus, t, data_set=data_set,
                                                                         dir_path=dir_path)
                    for t in threshold_bar)
            else:
                res = {t: CommonWordsExperiment.calculate_vocab_sizes(corpus, t, data_set=data_set,
                                                                      dir_path=dir_path)
                       for t in threshold_bar}

                with open(os.path.join(dir_path, 'all.json'), 'w', encoding='utf-8') as outfile:
                    json.dump(res, outfile, indent=1)
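Presumably this method belongs to the same experiment class whose calculate_vocab_sizes it calls; a hedged invocation could look like the following (the output directory is a placeholder).

# Assumption: filter_thresholds is a classmethod of CommonWordsExperiment.
CommonWordsExperiment.filter_thresholds(dir_path="results/common_words", parallel=False)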
Code example #3
def corpus2plain_text_dir(source_path: str):
    corpus = Corpus.fast_load(path=source_path, load_entities=False)

    new_dir = os.path.join(config["system_storage"]["corpora"], 'plain_text',
                           f'{os.path.basename(source_path)}_plain')
    print(new_dir)
    if not os.path.isdir(new_dir):
        os.mkdir(new_dir)
    for doc_id, d in corpus.documents.items():
        doc_path = os.path.join(new_dir, f'{doc_id}_{d.language}.txt')
        with open(doc_path, 'w', encoding="utf-8") as writer:
            writer.write('\n'.join([
                ' '.join(sent.representation())
                for sent in d.get_sentences_from_disk()
            ]))
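A hedged call; the source path is a placeholder and must point to a corpus directory that Corpus.fast_load can read.

# Writes one plain-text file per document into <corpora>/plain_text/<corpus>_plain.
corpus2plain_text_dir("corpora/german_books")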
Code example #4
    @classmethod  # decorator assumed for this cls-based method
    def run_experiment(cls, parallel: bool = False):
        # res = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: dict())))
        for data_set in tqdm(cls.data_sets,
                             total=len(cls.data_sets),
                             desc=f"Evaluate datasets"):
            for filter_mode in tqdm(cls.filters,
                                    total=len(cls.filters),
                                    desc=f"Evaluate filters"):
                corpus = Corpus.fast_load("all",
                                          "no_limit",
                                          data_set,
                                          filter_mode,
                                          "real",
                                          load_entities=False)

                vec_bar = tqdm(cls.vectorization_algorithms,
                               total=len(cls.vectorization_algorithms),
                               desc=f"Evaluate algorithm")
                if parallel:
                    tuple_list_results = Parallel(n_jobs=cls.num_cores)(
                        delayed(TextLengthExperiment.eval_vec_loop_eff)(
                            corpus, "all", "no_limit", data_set, filter_mode,
                            vectorization_algorithm)
                        for vectorization_algorithm in vec_bar)
                else:
                    tuple_list_results = [
                        TextLengthExperiment.eval_vec_loop_eff(
                            corpus, "all", "no_limit", data_set, filter_mode,
                            vectorization_algorithm)
                        for vectorization_algorithm in vec_bar
                    ]

                full_df = pd.DataFrame(tuple_list_results,
                                       columns=[
                                           'Algorithm', 'Full Spearman [p]',
                                           'Short Spearman [p]',
                                           'Medium Spearman [p]',
                                           'Long Spearman [p]'
                                       ])

                # Note: these output paths are fixed, so each data_set/filter_mode
                # iteration overwrites the files written by the previous one.
                full_df.to_csv(os.path.join('../results',
                                            'text_length_experiment',
                                            'text_length_spearman.csv'),
                               index=False)
                full_df.to_latex(os.path.join('../results',
                                              'text_length_experiment',
                                              'text_length_spearman.tex'),
                                 index=False)
Code example #5
    @staticmethod  # decorator assumed: no self/cls parameter; called as Summarizer.get_summary elsewhere in these examples
    def get_summary(corpus: Corpus):
        if corpus.root_corpus_path is None:
            raise UserWarning("No root corpus set!")
        corpus_root_path = corpus.root_corpus_path
        summary_dict_path = os.path.join(corpus_root_path, "sent_ids.json")
        if not os.path.isfile(summary_dict_path):
            summary_dict = {}
            print("train summary")
            root_corpus = Corpus.fast_load(path=corpus_root_path, load_entities=False)
            for doc_id, doc in root_corpus.documents.items():
                sents, ids = Summarizer.generate_summary_of_corpus_doc(doc, 20)
                # print(doc_id, ":", ids, [' '.join(sent) for sent in sents])
                summary_dict[doc_id] = ids
            with open(summary_dict_path, 'w', encoding='utf-8') as fp:
                json.dump(summary_dict, fp, indent=1)
        else:
            with open(summary_dict_path, 'r', encoding='utf-8') as json_file:
                summary_dict = json.load(json_file)
        return summary_dict
Code example #6
                                                   facet_pred_vals[facet])

    return complete_correlation, facet_correlation


if __name__ == '__main__':
    # c = Corpus.fast_load(path="corpora/german_series", load_entities=False)
    #
    # vec_path = Vectorizer.build_vec_file_name("all",
    #                                           "no_limit",
    #                                           "german_series",
    #                                           "no_filter",
    #                                           "book2vec",
    #                                           "real")

    c = Corpus.fast_load(path="../corpora/classic_gutenberg",
                         load_entities=False)

    vec_path = Vectorization.build_vec_file_name("",
                                                 "",
                                                 "classic_gutenberg",
                                                 "no_filter",
                                                 "book2vec_adv",
                                                 "real",
                                                 allow_combination=True)

    vecs, _ = Vectorization.my_load_doc2vec_format(vec_path)

    Vectorization.most_similar_documents(vecs,
                                         c,
                                         positives="cb_18",
                                         feature_to_use="atm")
Code example #7
def chunk_documents(data_set: str, number_of_subparts: int,
                    corpus_size: Union[int, str]):
    annotated_series_corpus_path = None
    if "_fake_series" in data_set:
        annotated_series_corpus_path = os.path.join(
            config["system_storage"]["corpora"],
            f'{data_set}_{number_of_subparts}_'
            f'{corpus_size}')
        data_set = data_set.replace("_fake_series", "")

    annotated_corpus_path = os.path.join(config["system_storage"]["corpora"],
                                         f'{data_set}')

    # print(annotated_series_corpus_path, annotated_corpus_path)
    if annotated_series_corpus_path:
        try:
            # check if series corpus exists
            # corpus = Corpus(annotated_series_corpus_path)
            corpus = Corpus.fast_load(path=annotated_series_corpus_path,
                                      load_entities=False)
        except FileNotFoundError:
            try:
                # check if general corpus exists
                corpus = Corpus.fast_load(path=annotated_corpus_path,
                                          load_entities=False)
                if corpus_size != "no_limit":
                    corpus = corpus.sample(corpus_size, seed=42)
                corpus = build_series_corpus(corpus,
                                             annotated_series_corpus_path,
                                             number_of_subparts)

                # corpus.save_corpus_adv(annotated_series_corpus_path)
            except FileNotFoundError:
                # load from raw data
                corpus = DataHandler.load_corpus(data_set)
                if corpus_size != "no_limit":
                    corpus = corpus.sample(corpus_size, seed=42)

                Preprocesser.annotate_and_save(
                    corpus,
                    corpus_dir=annotated_corpus_path,
                    without_spacy=False)
                # corpus = Preprocesser.annotate_corpus(corpus)
                # corpus.save_corpus_adv(annotated_corpus_path)

                corpus = build_series_corpus(
                    Corpus.fast_load(path=annotated_corpus_path,
                                     load_entities=False),
                    annotated_series_corpus_path, number_of_subparts)
    else:
        try:
            # check if general corpus exists
            corpus = Corpus.fast_load(path=annotated_corpus_path,
                                      load_entities=False)
            if corpus_size != "no_limit":
                corpus = corpus.sample(corpus_size, seed=42)

            # corpus.save_corpus_adv(annotated_series_corpus_path)
        except FileNotFoundError:
            # load from raw data
            corpus = DataHandler.load_corpus(data_set)
            if corpus_size != "no_limit":
                corpus = corpus.sample(corpus_size, seed=42)

            Preprocesser.annotate_and_save(corpus,
                                           corpus_dir=annotated_corpus_path,
                                           without_spacy=False)
            corpus = Corpus.fast_load(path=annotated_corpus_path,
                                      load_entities=False)

    return corpus
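Two hedged invocations illustrating the naming convention handled above; the dataset names are placeholders.

# With the "_fake_series" suffix, documents are split into sub-parts and cached under a separate path.
series_corpus = chunk_documents("german_books_fake_series", number_of_subparts=2, corpus_size="no_limit")
# Without the suffix, the annotated base corpus is loaded and sampled
# (number_of_subparts is not used in this branch).
base_corpus = chunk_documents("german_books", number_of_subparts=2, corpus_size=100)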
Code example #8
def corpus_stats(data_sets: List[str]):
    tuples = []

    for data_set_name in data_sets:
        corpus = Corpus.fast_load("all",
                                  "no_limit",
                                  data_set_name,
                                  "no_filter",
                                  "real",
                                  load_entities=False)
        if corpus.language == Language.DE:
            language = "GER"
        else:
            language = "EN"
        nr_books = human_format(len(corpus.documents))

        document_tokens = [
            document.length for document in corpus.documents.values()
        ]
        tokens_total = human_format(sum(document_tokens))
        tokens_avg = f'{np.mean(document_tokens):.0f} ± {np.std(document_tokens):.0f}'
        # tokens_median = f'{np.median(document_tokens):.0f} ± {iqr(document_tokens):.0f}'
        tokens_median = f'{human_format(np.median(document_tokens))}'
        tokens_iqr = f'{human_format(iqr(document_tokens))}'
        tokens_min = f'{human_format(np.min(document_tokens))}'
        tokens_max = f'{human_format(np.max(document_tokens))}'
        document_vocab = [
            document.vocab_size for document in corpus.documents.values()
        ]
        vocab_total = human_format(sum(document_vocab))
        vocab_avg = f'{np.mean(document_vocab):.0f} ± {np.std(document_vocab):.0f}'
        # vocab_median = f'{np.median(document_vocab):.0f} ± {iqr(document_vocab):.0f}'
        vocab_median = f'{human_format(np.median(document_vocab))}'
        vocab_iqr = f'{human_format(iqr(document_vocab))}'
        # vocab_mix = f'[{human_format(np.min(document_vocab))}, {human_format(np.max(document_vocab))}]'
        vocab_min = f'{human_format(np.min(document_vocab))}'
        vocab_max = f'{human_format(np.max(document_vocab))}'

        document_sents = [
            document.sentences_nr for document in corpus.documents.values()
        ]
        sents_total = sum(document_sents)
        sents_avg = f'{np.mean(document_sents):.0f} ± {np.std(document_sents):.0f}'
        sents_median = f'{np.median(document_sents):.0f} ± {iqr(document_sents):.0f}'

        author_dict = defaultdict(list)
        for doc_id, document in corpus.documents.items():
            author_dict[document.authors].append(doc_id)

        print({
            author: len(doc_ids)
            for author, doc_ids in author_dict.items() if author is not None
        })
        author_vals = [
            len(doc_ids) for author, doc_ids in author_dict.items()
            if author is not None
        ]

        author_median = f'{np.median(author_vals):.0f} ± {iqr(author_vals):.0f} [{np.min(author_vals):.0f}, {np.max(author_vals):.0f}]'
        # author_mean = f'{np.mean(author_vals):.2f} ± {np.std(author_vals):.2f} [{np.min(author_vals):.0f}, {np.max(author_vals):.0f}]'
        author_mean = f'{np.mean(author_vals):.2f}'
        author_std = f'{np.std(author_vals):.2f}'
        author_mix = f'[{np.min(author_vals):.0f}, {np.max(author_vals):.0f}]'
        author_max = f'{np.max(author_vals):.0f}'

        print(data_set_name, "Author median iqr / mean std", author_median,
              author_mean)
        if corpus.series_dict and len(corpus.series_dict) > 0:
            series_vals = [
                len(doc_ids)
                for series_id, doc_ids in corpus.series_dict.items()
                if series_id is not None
            ]
            series_median = f'{np.median(series_vals):.0f} ± {iqr(series_vals):.0f} [{np.min(series_vals):.0f}, {np.max(series_vals):.0f}]'
            # series_mean = f'{np.mean(series_vals):.2f} ± {np.std(series_vals):.2f} [{np.min(series_vals):.0f}, {np.max(series_vals):.0f}]'

            series_mean = f'{np.mean(series_vals):.2f}'
            series_std = f'{np.std(series_vals):.2f}'
            series_mix = f'[{np.min(series_vals):.0f}, {np.max(series_vals):.0f}]'

            series_max = f'{np.max(series_vals):.0f}'
            print(data_set_name, "Series median iqr / mean std", series_median,
                  series_mean)
        else:
            series_median = "-"
            series_mean = "-"
            series_std = "-"
            series_mix = "-"

        if corpus.shared_attributes_dict is None:
            corpus.calculate_documents_with_shared_attributes()
        if corpus.shared_attributes_dict["same_genres"] and len(
                corpus.shared_attributes_dict["same_genres"]) > 1:
            genre_vals = [
                len(doc_ids) for genre, doc_ids in
                corpus.shared_attributes_dict["same_genres"].items()
                if genre is not None
            ]
            # print(genre_vals)
            genre_median = f'{np.median(genre_vals):.0f} ± {iqr(genre_vals):.0f} [{np.min(genre_vals):.0f}, {np.max(genre_vals):.0f}]'
            # genre_mean = f'{np.mean(genre_vals):.2f} ± {np.std(genre_vals):.2f} [{np.min(genre_vals):.0f}, {np.max(genre_vals):.0f}]'
            genre_mean = f'{np.mean(genre_vals):.2f}'
            genre_std = f'{np.std(genre_vals):.2f}'
            genre_mix = f'[{np.min(genre_vals):.0f}, {np.max(genre_vals):.0f}]'

            print(data_set_name, "Genre median iqr / mean std", genre_median,
                  genre_mean)
        else:
            genre_median = "-"
            genre_mean = "-"
            genre_std = "-"
            genre_mix = "-"

        # if corpus and len(corpus.series_dict) > 0:
        #     series_median = np.median([len(doc_ids) for series_id, doc_ids in corpus.series_dict.items()])

        tuples.append((
            data_set_name,
            nr_books,
            language,
            tokens_total,
            tokens_median,
            tokens_iqr,
            tokens_min,
            tokens_max,
            vocab_total,
            vocab_median,
            vocab_iqr,
            vocab_min,
            vocab_max,
            author_mean,
            author_std,
            author_mix,
            series_mean,
            series_std,
            series_mix,
            genre_mean,
            genre_std,
            genre_mix,
        ))
    df = pd.DataFrame(
        tuples,
        columns=[
            "Data set", "Amount of Books", "Language", "Total Tokens",
            "Tokens Median", "Tokens IQR", "Tokens Min", "Tokens Max",
            "Total Vocabulary", "Vocabulary Median", "Vocabulary IQR",
            "Vocabulary Min", "Vocabulary Max", "Author Mean", "Author STD",
            "Author [Min, Max]", "Series Mean", "Series STD",
            "Series [Min, Max]", "Genre Mean", "Genre STD", "Genre [Min, Max]"
            # "Books by Same Author ± STD [Min, Max]",
            # "Books by Same Series ± STD [Min, Max]",
            # "Books by Same Genre ± STD [Min, Max]",
            # "Total Sentences", "Sentences Mean [STD]", "Sentences Median [IQR]",
        ],
        index=data_sets)
    df = df.transpose()
    print(df)
    df.to_csv("results/dataset_stats/sizes.csv", index=True)
    print(df.to_latex(index=True))
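human_format is used above but not shown in this excerpt; the following is only a hypothetical sketch of what such a helper typically does, not the project's actual implementation.

# Hypothetical helper (assumption): render 12345 as "12.3K", 1234567 as "1.2M", etc.
def human_format(num) -> str:
    num = float(num)
    for unit in ['', 'K', 'M', 'B']:
        if abs(num) < 1000:
            # Drop a trailing ".0" so round values read as "12K" rather than "12.0K".
            return f'{num:.1f}{unit}'.replace('.0', '')
        num /= 1000
    return f'{num:.1f}T'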
Code example #9
        fontsize=20)

    plt.show()


if __name__ == '__main__':
    # data_set_name = "classic_gutenberg"
    # data_set_name = "german_books"
    data_set_name = "goodreads_genres"
    vectorization_algorithm = "book2vec"
    filter = "no_filter"  # "specific_words_strict"  # "no_filter"
    vec_path = Vectorization.build_vec_file_name("all",
                                                 "no_limit",
                                                 data_set_name,
                                                 filter,
                                                 vectorization_algorithm,
                                                 "real",
                                                 allow_combination=True)
    vecs, summation_method = Vectorization.my_load_doc2vec_format(vec_path)

    c = Corpus.fast_load("all",
                         "no_limit",
                         data_set_name,
                         filter,
                         "real",
                         load_entities=False)

    tsne_plot(vecs, c)
    # neighbor_plot(vecs, c)
    force_directed_graph(vecs, c)
Code example #10
def get_neighbors(data_sets: List[str], vector_names: List[str]):
    doc_top_n = 3
    facet_names = [
        #     "loc",
        #     "time",
        #     "atm",
        #     "sty",
        "cont",
        # "plot"
    ]
    is_series_corpus = False
    tuples = []
    columns = None
    for data_set in data_sets:
        corpus = Corpus.fast_load(path=os.path.join('../corpora', data_set),
                                  load_entities=False)
        for vector_name in tqdm(vector_names,
                                desc="Iterate through embedding types",
                                total=len(vector_names)):
            vec_path = Vectorization.build_vec_file_name(
                "all",
                "no_limit",
                data_set,
                "no_filter",
                vector_name,
                "real",
                allow_combination=True)

            vectors, _ = Vectorization.my_load_doc2vec_format(vec_path)

            for doc_id in corpus.documents.keys():
                for facet_name in facet_names:
                    sim_docs = Vectorization.most_similar_documents(
                        vectors,
                        corpus,
                        positives=doc_id,
                        topn=doc_top_n,
                        feature_to_use=facet_name,
                        print_results=False,
                        series=is_series_corpus)[1:]
                    if len(sim_docs) == 2:
                        tuples.append(
                            (data_set, vector_name, facet_name,
                             table_format(corpus.documents[doc_id]), 1,
                             table_format(corpus.documents[replace_sim_id(
                                 sim_docs[0][0])]),
                             table_format(corpus.documents[replace_sim_id(
                                 sim_docs[1][0])])))
                        columns = [
                            "Dataset", "Algorithm", "Facet", "Book", "Rank",
                            "First Neighbor", "Second Neighbor"
                        ]
                    else:
                        for i, (sim_doc_id, sim) in enumerate(sim_docs):
                            tuples.append(
                                (data_set, vector_name, facet_name,
                                 table_format(corpus.documents[doc_id]), i,
                                 table_format(corpus.documents[replace_sim_id(
                                     sim_doc_id)]), sim))
                        columns = [
                            "Dataset", "Algorithm", "Facet", "Book", "Rank",
                            "Similar Book", "Similarity"
                        ]
    df = pd.DataFrame(tuples, columns=columns)
    df.to_csv("results/neighbors/neighbors.csv")

    print(df)
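A hedged invocation; the dataset and algorithm names below appear elsewhere in these examples but are used here only as placeholders (table_format and replace_sim_id are project helpers not shown in this excerpt).

# Assumption: table_format and replace_sim_id are importable in the surrounding module.
get_neighbors(data_sets=["classic_gutenberg"], vector_names=["book2vec", "book2vec_adv"])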
Code example #11
def calculate_facet_scores(data_sets: List[str],
                           vector_names: List[str],
                           facets: List[str],
                           use_topic_vecs: bool = False):
    results = []
    for data_set in data_sets:
        corpus = Corpus.fast_load(path=os.path.join('../corpora', data_set),
                                  load_entities=False)
        topic_dict = None
        summary_dict = None
        if "cont" in facets:
            topic_dict = TopicModeller.topic_modelling(corpus)
        if "plot" in facets:
            summary_dict = Summarizer.get_summary(corpus)
        start_time = time.time()
        if use_topic_vecs:
            topic_vecs = TopicModeller.get_topic_distribution(corpus, data_set)
        else:
            topic_vecs = None
        for vector_name in tqdm(vector_names,
                                desc="Iterate through embedding types",
                                total=len(vector_names)):
            # print('---')
            vec_path = Vectorization.build_vec_file_name(
                "all",
                "no_limit",
                data_set,
                "no_filter",
                vector_name,
                "real",
                allow_combination=True)

            vecs, _ = Vectorization.my_load_doc2vec_format(vec_path)
            adv_mode = "_adv" in vector_name
            fee = FacetEfficientEvaluation(vectors=vecs,
                                           corpus=corpus,
                                           data_set_name=data_set,
                                           facet_names=facets,
                                           topic_vectors=topic_vecs)
            fac_relaxed_scores, fac_strict_scores, fac_strict_fac_only = fee.evaluate(
                word_top_n=100,
                topic_dict=topic_dict,
                summary_dict=summary_dict,
                adv_mode=adv_mode)

            for fac_name in facets:
                results.append(
                    (data_set, vector_name, fac_name,
                     fac_relaxed_scores[fac_name], fac_strict_scores[fac_name],
                     fac_strict_fac_only[fac_name]))

        tuples = []
        for result in results:
            data_set, vector_name, fac_name, relaxed_scores, strict_scores, fac_only_scores = result
            tuples.append((data_set, fac_name, vector_name,
                           sum(relaxed_scores) / len(relaxed_scores),
                           sum(strict_scores) / len(strict_scores),
                           sum(fac_only_scores) / len(fac_only_scores)))

        df = pd.DataFrame(tuples,
                          columns=[
                              'Corpus', 'Facet', 'Algorithm', 'Relaxed Score',
                              'Strict Score', 'Facet Only Score'
                          ])
        df = df.sort_values([
            'Corpus', 'Facet', 'Algorithm', 'Relaxed Score', 'Strict Score',
            'Facet Only Score'
        ])
        print(df)
        df.to_csv('results/facet_evaluation/facet_task_results.csv',
                  index=False)
        print(df.to_latex(index=False))
        results = []
        a_time = time.time() - start_time
        start_time = time.time()

        # for vector_name in tqdm(vector_names, desc="Iterate through embedding types", total=len(vector_names)):
        #     print('---')
        #     vec_path = Vectorizer.build_vec_file_name("all",
        #                                               "no_limit",
        #                                               data_set,
        #                                               "no_filter",
        #                                               vector_name,
        #                                               "real")
        #
        #     vecs = Vectorizer.my_load_doc2vec_format(vec_path)
        #
        #     for fac_name in tqdm(facets, total=len(facets), desc="Iterate through facetes"):
        #         fe = FacetEvaluation(fac_name, vecs, c, data_set)
        #         relaxed_scores, strict_scores = fe.evaluate()
        #         results.append((data_set, vector_name, fac_name, relaxed_scores, strict_scores))
        #
        # tuples = []
        # for result in results:
        #     data_set, vector_name, fac_name, relaxed_scores, strict_scores = result
        #     tuples.append((data_set, vector_name, fac_name,
        #                    sum(relaxed_scores) / len(relaxed_scores), sum(strict_scores) / len(strict_scores)))
        #
        # df = pd.DataFrame(tuples, columns=['Corpus', 'Algorithm', 'Facet', 'Relaxed Score', 'Strict Score'])
        # print(df)
        # df.to_csv('results/facet_evaluation/facet_task_results.csv', index=False)

        b_time = time.time() - start_time
        print(a_time, b_time)
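A hedged call for completeness; the facet and algorithm names are taken from strings that appear in these examples, but the exact set of supported values is an assumption.

# Assumption: "cont" and "plot" are valid facet names handled by the branches above.
calculate_facet_scores(data_sets=["classic_gutenberg"],
                       vector_names=["book2vec", "book2vec_adv"],
                       facets=["cont", "plot"],
                       use_topic_vecs=False)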