def build_series_corpus(corpus: Corpus, annotated_series_corpus_path: str, number_of_subparts: int):
    corpus = Preprocesser.filter_too_small_docs_from_corpus(corpus)
    corpus.fake_series(series_corpus_dir=annotated_series_corpus_path,
                       number_of_sub_parts=number_of_subparts)
    return Corpus.fast_load(path=annotated_series_corpus_path, load_entities=False)


def filter_thresholds(cls, dir_path: str, parallel: bool = False):
    data_set_bar = tqdm(cls.data_sets, total=len(cls.data_sets), desc="2 Operate on dataset!!")
    for data_set in data_set_bar:
        data_set_bar.set_description(f'2 Operate on dataset >{data_set}<')
        data_set_bar.refresh()
        annotated_corpus_path = os.path.join(cls.config["system_storage"]["corpora"], data_set)
        try:
            corpus = Corpus.fast_load(path=annotated_corpus_path, load_entities=False)
        except FileNotFoundError:
            corpus = DataHandler.load_corpus(data_set)
            print('corpus loaded')
            # corpus = Preprocesser.annotate_corpus(corpus, without_spacy=False)
            # corpus.save_corpus_adv(annotated_corpus_path)
            Preprocesser.annotate_and_save(corpus, corpus_dir=annotated_corpus_path, without_spacy=False)
            print('annotated corpus')
            del corpus
            corpus = Corpus.fast_load(path=annotated_corpus_path, load_entities=False)
            # print('saved corpus')

        if cls.absolute:
            thresholds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 25, 50, 100,
                          # 1000, 2000, 3000, len(corpus)
                          ]
        else:
            thresholds = cls.thresholds

        threshold_bar = tqdm(thresholds, total=len(thresholds), desc="3 Calculate filter_mode results")
        if parallel:
            Parallel(n_jobs=cls.num_cores)(
                delayed(CommonWordsExperiment.calculate_vocab_sizes)(corpus, t, data_set=data_set, dir_path=dir_path)
                for t in threshold_bar)
        else:
            res = {t: CommonWordsExperiment.calculate_vocab_sizes(corpus, t, data_set=data_set, dir_path=dir_path)
                   for t in threshold_bar}
            with open(os.path.join(dir_path, 'all.json'), 'w', encoding='utf-8') as outfile:
                json.dump(res, outfile, indent=1)


def corpus2plain_text_dir(source_path: str):
    corpus = Corpus.fast_load(path=source_path, load_entities=False)
    new_dir = os.path.join(config["system_storage"]["corpora"], 'plain_text',
                           f'{os.path.basename(source_path)}_plain')
    print(new_dir)
    if not os.path.isdir(new_dir):
        os.mkdir(new_dir)
    for doc_id, d in corpus.documents.items():
        doc_path = os.path.join(new_dir, f'{doc_id}_{d.language}.txt')
        with open(doc_path, 'w', encoding="utf-8") as writer:
            writer.write('\n'.join([' '.join(sent.representation())
                                    for sent in d.get_sentences_from_disk()]))


def run_experiment(cls, parallel: bool = False):
    # res = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: dict())))
    for data_set in tqdm(cls.data_sets, total=len(cls.data_sets), desc="Evaluate datasets"):
        for filter_mode in tqdm(cls.filters, total=len(cls.filters), desc="Evaluate filters"):
            corpus = Corpus.fast_load("all", "no_limit", data_set, filter_mode, "real",
                                      load_entities=False)
            vec_bar = tqdm(cls.vectorization_algorithms, total=len(cls.vectorization_algorithms),
                           desc="Evaluate algorithm")
            if parallel:
                tuple_list_results = Parallel(n_jobs=cls.num_cores)(
                    delayed(TextLengthExperiment.eval_vec_loop_eff)(
                        corpus, "all", "no_limit", data_set, filter_mode, vectorization_algorithm)
                    for vectorization_algorithm in vec_bar)
            else:
                tuple_list_results = [
                    TextLengthExperiment.eval_vec_loop_eff(
                        corpus, "all", "no_limit", data_set, filter_mode, vectorization_algorithm)
                    for vectorization_algorithm in vec_bar
                ]

            full_df = pd.DataFrame(tuple_list_results,
                                   columns=['Algorithm',
                                            'Full Spearman [p]',
                                            'Short Spearman [p]',
                                            'Medium Spearman [p]',
                                            'Long Spearman [p]'])
            full_df.to_csv(os.path.join('../results', 'text_length_experiment', 'text_length_spearman.csv'),
                           index=False)
            full_df.to_latex(os.path.join('../results', 'text_length_experiment', 'text_length_spearman.tex'),
                             index=False)


def get_summary(corpus: Corpus):
    if corpus.root_corpus_path is None:
        raise UserWarning("No root corpus set!")
    corpus_root_path = corpus.root_corpus_path
    summary_dict_path = os.path.join(corpus_root_path, "sent_ids.json")
    if not os.path.isfile(summary_dict_path):
        summary_dict = {}
        print("train summary")
        root_corpus = Corpus.fast_load(path=corpus_root_path, load_entities=False)
        for doc_id, doc in root_corpus.documents.items():
            sents, ids = Summarizer.generate_summary_of_corpus_doc(doc, 20)
            # print(doc_id, ":", ids, [' '.join(sent) for sent in sents])
            summary_dict[doc_id] = ids
        with open(summary_dict_path, 'w', encoding='utf-8') as fp:
            json.dump(summary_dict, fp, indent=1)
    else:
        with open(summary_dict_path) as json_file:
            summary_dict = json.load(json_file)
    return summary_dict


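# Illustrative sketch, not part of the original code: get_summary returns a mapping from
# document id to the sentence indices chosen by the summarizer (cached in "sent_ids.json").
# Assuming those indices refer to the document's on-disk sentence order, a caller could
# recover the plain-text summary of one document like this. The helper name is hypothetical.
def summary_text_for_doc(corpus: Corpus, doc_id: str) -> List[str]:
    summary_dict = get_summary(corpus)
    selected_ids = set(summary_dict[doc_id])
    document = corpus.documents[doc_id]
    # Keep only the sentences whose index was selected by the summarizer.
    return [' '.join(sent.representation())
            for i, sent in enumerate(document.get_sentences_from_disk())
            if i in selected_ids]

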
            facet_pred_vals[facet])
    return complete_correlation, facet_correlation


if __name__ == '__main__':
    # c = Corpus.fast_load(path="corpora/german_series", load_entities=False)
    #
    # vec_path = Vectorizer.build_vec_file_name("all",
    #                                           "no_limit",
    #                                           "german_series",
    #                                           "no_filter",
    #                                           "book2vec",
    #                                           "real")
    c = Corpus.fast_load(path="../corpora/classic_gutenberg", load_entities=False)
    vec_path = Vectorization.build_vec_file_name("", "", "classic_gutenberg", "no_filter",
                                                 "book2vec_adv", "real", allow_combination=True)
    vecs, _ = Vectorization.my_load_doc2vec_format(vec_path)
    Vectorization.most_similar_documents(vecs, c, positives="cb_18", feature_to_use="atm")


def chunk_documents(data_set: str, number_of_subparts: int, corpus_size: Union[int, str]):
    annotated_series_corpus_path = None
    if "_fake_series" in data_set:
        annotated_series_corpus_path = os.path.join(config["system_storage"]["corpora"],
                                                    f'{data_set}_{number_of_subparts}_'
                                                    f'{corpus_size}')
        data_set = data_set.replace("_fake_series", "")
    annotated_corpus_path = os.path.join(config["system_storage"]["corpora"], f'{data_set}')
    # print(annotated_series_corpus_path, annotated_corpus_path)

    if annotated_series_corpus_path:
        try:
            # check if series corpus exists
            # corpus = Corpus(annotated_series_corpus_path)
            corpus = Corpus.fast_load(path=annotated_series_corpus_path, load_entities=False)
        except FileNotFoundError:
            try:
                # check if general corpus exists
                corpus = Corpus.fast_load(path=annotated_corpus_path, load_entities=False)
                if corpus_size != "no_limit":
                    corpus = corpus.sample(corpus_size, seed=42)
                corpus = build_series_corpus(corpus, annotated_series_corpus_path, number_of_subparts)
                # corpus.save_corpus_adv(annotated_series_corpus_path)
            except FileNotFoundError:
                # load from raw data
                corpus = DataHandler.load_corpus(data_set)
                if corpus_size != "no_limit":
                    corpus = corpus.sample(corpus_size, seed=42)
                Preprocesser.annotate_and_save(corpus, corpus_dir=annotated_corpus_path, without_spacy=False)
                # corpus = Preprocesser.annotate_corpus(corpus)
                # corpus.save_corpus_adv(annotated_corpus_path)
                corpus = build_series_corpus(Corpus.fast_load(path=annotated_corpus_path, load_entities=False),
                                             annotated_series_corpus_path, number_of_subparts)
    else:
        try:
            # check if general corpus exists
            corpus = Corpus.fast_load(path=annotated_corpus_path, load_entities=False)
            if corpus_size != "no_limit":
                corpus = corpus.sample(corpus_size, seed=42)
            # corpus.save_corpus_adv(annotated_series_corpus_path)
        except FileNotFoundError:
            # load from raw data
            corpus = DataHandler.load_corpus(data_set)
            if corpus_size != "no_limit":
                corpus = corpus.sample(corpus_size, seed=42)
            Preprocesser.annotate_and_save(corpus, corpus_dir=annotated_corpus_path, without_spacy=False)
            corpus = Corpus.fast_load(path=annotated_corpus_path, load_entities=False)
    return corpus


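# Illustrative usage sketch, not part of the original code: the data set name "german_books"
# and the "_fake_series" suffix follow values used elsewhere in this code; number_of_subparts
# and corpus_size below are arbitrary example values.
if __name__ == '__main__':
    # Plain corpus, annotated on first use and cached under the configured corpora directory.
    full_corpus = chunk_documents("german_books", number_of_subparts=2, corpus_size="no_limit")
    # Fake-series variant: each book is split into sub-parts and stored as its own corpus.
    series_corpus = chunk_documents("german_books_fake_series", number_of_subparts=2, corpus_size="no_limit")

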
def corpus_stats(data_sets: List[str]):
    tuples = []
    for data_set_name in data_sets:
        corpus = Corpus.fast_load("all", "no_limit", data_set_name, "no_filter", "real",
                                  load_entities=False)
        if corpus.language == Language.DE:
            language = "GER"
        else:
            language = "EN"
        nr_books = human_format(len(corpus.documents))

        document_tokens = [document.length for document in corpus.documents.values()]
        tokens_total = human_format(sum(document_tokens))
        tokens_avg = f'{np.mean(document_tokens):.0f} ± {np.std(document_tokens):.0f}'
        # tokens_median = f'{np.median(document_tokens):.0f} ± {iqr(document_tokens):.0f}'
        tokens_median = f'{human_format(np.median(document_tokens))}'
        tokens_iqr = f'{human_format(iqr(document_tokens))}'
        tokens_min = f'{human_format(np.min(document_tokens))}'
        tokens_max = f'{human_format(np.max(document_tokens))}'

        document_vocab = [document.vocab_size for document in corpus.documents.values()]
        vocab_total = human_format(sum(document_vocab))
        vocab_avg = f'{np.mean(document_vocab):.0f} ± {np.std(document_vocab):.0f}'
        # vocab_median = f'{np.median(document_vocab):.0f} ± {iqr(document_vocab):.0f}'
        vocab_median = f'{human_format(np.median(document_vocab))}'
        vocab_iqr = f'{human_format(iqr(document_vocab))}'
        # vocab_mix = f'[{human_format(np.min(document_vocab))}, {human_format(np.max(document_vocab))}]'
        vocab_min = f'{human_format(np.min(document_vocab))}'
        vocab_max = f'{human_format(np.max(document_vocab))}'

        document_sents = [document.sentences_nr for document in corpus.documents.values()]
        sents_total = sum(document_sents)
        sents_avg = f'{np.mean(document_sents):.0f} ± {np.std(document_sents):.0f}'
        sents_median = f'{np.median(document_sents):.0f} ± {iqr(document_sents):.0f}'

        author_dict = defaultdict(list)
        for doc_id, document in corpus.documents.items():
            author_dict[document.authors].append(doc_id)
        print({author: len(doc_ids) for author, doc_ids in author_dict.items() if author is not None})
        author_vals = [len(doc_ids) for author, doc_ids in author_dict.items() if author is not None]
        author_median = f'{np.median(author_vals):.0f} ± {iqr(author_vals):.0f} [{np.min(author_vals):.0f}, {np.max(author_vals):.0f}]'
        # author_mean = f'{np.mean(author_vals):.2f} ± {np.std(author_vals):.2f} [{np.min(author_vals):.0f}, {np.max(author_vals):.0f}]'
        author_mean = f'{np.mean(author_vals):.2f}'
        author_std = f'{np.std(author_vals):.2f}'
        author_mix = f'[{np.min(author_vals):.0f}, {np.max(author_vals):.0f}]'
        author_max = f'{np.max(author_vals):.0f}'
        print(data_set_name, "Author median iqr / mean std", author_median, author_mean)

        if corpus.series_dict and len(corpus.series_dict) > 0:
            series_vals = [len(doc_ids) for series_id, doc_ids in corpus.series_dict.items()
                           if series_id is not None]
            series_median = f'{np.median(series_vals):.0f} ± {iqr(series_vals):.0f} [{np.min(series_vals):.0f}, {np.max(series_vals):.0f}]'
            # series_mean = f'{np.mean(series_vals):.2f} ± {np.std(series_vals):.2f} [{np.min(series_vals):.0f}, {np.max(series_vals):.0f}]'
            series_mean = f'{np.mean(series_vals):.2f}'
            series_std = f'{np.std(series_vals):.2f}'
            series_mix = f'[{np.min(series_vals):.0f}, {np.max(series_vals):.0f}]'
            series_max = f'{np.max(series_vals):.0f}'
            print(data_set_name, "Series median iqr / mean std", series_median, series_mean)
        else:
            series_median = "-"
            series_mean = "-"
            series_std = "-"
            series_mix = "-"

        if corpus.shared_attributes_dict is None:
            corpus.calculate_documents_with_shared_attributes()
        if corpus.shared_attributes_dict["same_genres"] and len(corpus.shared_attributes_dict["same_genres"]) > 1:
            genre_vals = [len(doc_ids)
                          for genre, doc_ids in corpus.shared_attributes_dict["same_genres"].items()
                          if genre is not None]
            # print(genre_vals)
            genre_median = f'{np.median(genre_vals):.0f} ± {iqr(genre_vals):.0f} [{np.min(genre_vals):.0f}, {np.max(genre_vals):.0f}]'
            # genre_mean = f'{np.mean(genre_vals):.2f} ± {np.std(genre_vals):.2f} [{np.min(genre_vals):.0f}, {np.max(genre_vals):.0f}]'
            genre_mean = f'{np.mean(genre_vals):.2f}'
            genre_std = f'{np.std(genre_vals):.2f}'
            genre_mix = f'[{np.min(genre_vals):.0f}, {np.max(genre_vals):.0f}]'
            print(data_set_name, "Genre median iqr / mean std", genre_median, genre_mean)
        else:
            genre_median = "-"
            genre_mean = "-"
            genre_std = "-"
            genre_mix = "-"

        # if corpus and len(corpus.series_dict) > 0:
        #     series_median = np.median([len(doc_ids) for series_id, doc_ids in corpus.series_dict.items()])

        tuples.append((data_set_name, nr_books, language,
                       tokens_total, tokens_median, tokens_iqr, tokens_min, tokens_max,
                       vocab_total, vocab_median, vocab_iqr, vocab_min, vocab_max,
                       author_mean, author_std, author_mix,
                       series_mean, series_std, series_mix,
                       genre_mean, genre_std, genre_mix))

    df = pd.DataFrame(tuples,
                      columns=["Data set", "Amount of Books", "Language",
                               "Total Tokens", "Tokens Median", "Tokens IQR", "Tokens Min", "Tokens Max",
                               "Total Vocabulary", "Vocabulary Median", "Vocabulary IQR",
                               "Vocabulary Min", "Vocabulary Max",
                               "Author Mean", "Author STD", "Author [Min, Max]",
                               "Series Mean", "Series STD", "Series [Min, Max]",
                               "Genre Mean", "Genre STD", "Genre [Min, Max]"
                               # "Books by Same Author ± STD [Min, Max]",
                               # "Books by Same Series ± STD [Min, Max]",
                               # "Books by Same Genre ± STD [Min, Max]",
                               # "Total Sentences", "Sentences Mean [STD]", "Sentences Median [IQR]",
                               ],
                      index=data_sets)
    df = df.transpose()
    print(df)
    df.to_csv("results/dataset_stats/sizes.csv", index=True)
    print(df.to_latex(index=True))


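# human_format is defined elsewhere in the project and is only used above to abbreviate large
# counts for the table. As an assumption about its behaviour, a minimal self-contained sketch
# of such a helper could look like the following (the suffix scheme is a guess, not the
# project's actual implementation):
def human_format_sketch(num: float) -> str:
    # Divide by 1000 until the value fits three digits, then attach the matching suffix.
    for suffix in ['', 'K', 'M', 'B', 'T']:
        if abs(num) < 1000:
            return f'{num:.1f}{suffix}'.replace('.0', '')
        num /= 1000
    return f'{num:.1f}P'

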
               fontsize=20)
    plt.show()


if __name__ == '__main__':
    # data_set_name = "classic_gutenberg"
    # data_set_name = "german_books"
    data_set_name = "goodreads_genres"
    vectorization_algorithm = "book2vec"
    filter = "no_filter"  # "specific_words_strict"  # "no_filter"
    vec_path = Vectorization.build_vec_file_name("all", "no_limit", data_set_name, filter,
                                                 vectorization_algorithm, "real", allow_combination=True)
    vecs, summation_method = Vectorization.my_load_doc2vec_format(vec_path)
    c = Corpus.fast_load("all", "no_limit", data_set_name, filter, "real", load_entities=False)

    tsne_plot(vecs, c)
    # neighbor_plot(vecs, c)
    force_directed_graph(vecs, c)


def get_neighbors(data_sets: List[str], vector_names: List[str]):
    doc_top_n = 3
    facet_names = [
        # "loc",
        # "time",
        # "atm",
        # "sty",
        "cont",
        # "plot"
    ]
    is_series_corpus = False
    tuples = []
    columns = None
    for data_set in data_sets:
        corpus = Corpus.fast_load(path=os.path.join('../corpora', data_set), load_entities=False)
        for vector_name in tqdm(vector_names, desc="Iterate through embedding types", total=len(vector_names)):
            vec_path = Vectorization.build_vec_file_name("all", "no_limit", data_set, "no_filter",
                                                         vector_name, "real", allow_combination=True)
            vectors, _ = Vectorization.my_load_doc2vec_format(vec_path)
            for doc_id in corpus.documents.keys():
                for facet_name in facet_names:
                    sim_docs = Vectorization.most_similar_documents(vectors, corpus, positives=doc_id,
                                                                    topn=doc_top_n,
                                                                    feature_to_use=facet_name,
                                                                    print_results=False,
                                                                    series=is_series_corpus)[1:]
                    if len(sim_docs) == 2:
                        tuples.append((data_set, vector_name, facet_name,
                                       table_format(corpus.documents[doc_id]),
                                       1,
                                       table_format(corpus.documents[replace_sim_id(sim_docs[0][0])]),
                                       table_format(corpus.documents[replace_sim_id(sim_docs[1][0])])))
                        columns = ["Dataset", "Algorithm", "Facet", "Book", "Rank",
                                   "First Neighbor", "Second Neighbor"]
                    else:
                        for i, (sim_doc_id, sim) in enumerate(sim_docs):
                            tuples.append((data_set, vector_name, facet_name,
                                           table_format(corpus.documents[doc_id]),
                                           i,
                                           table_format(corpus.documents[replace_sim_id(sim_doc_id)]),
                                           sim))
                        columns = ["Dataset", "Algorithm", "Facet", "Book", "Rank",
                                   "Similar Book", "Similarity"]

    df = pd.DataFrame(tuples, columns=columns)
    df.to_csv("results/neighbors/neighbors.csv")
    print(df)


def calculate_facet_scores(data_sets: List[str], vector_names: List[str], facets: List[str],
                           use_topic_vecs: bool = False):
    results = []
    for data_set in data_sets:
        corpus = Corpus.fast_load(path=os.path.join('../corpora', data_set), load_entities=False)
        topic_dict = None
        summary_dict = None
        if "cont" in facets:
            topic_dict = TopicModeller.topic_modelling(corpus)
        if "plot" in facets:
            summary_dict = Summarizer.get_summary(corpus)
        start_time = time.time()
        if use_topic_vecs:
            topic_vecs = TopicModeller.get_topic_distribution(corpus, data_set)
        else:
            topic_vecs = None

        for vector_name in tqdm(vector_names, desc="Iterate through embedding types", total=len(vector_names)):
            # print('---')
            vec_path = Vectorization.build_vec_file_name("all", "no_limit", data_set, "no_filter",
                                                         vector_name, "real", allow_combination=True)
            vecs, _ = Vectorization.my_load_doc2vec_format(vec_path)
            adv_mode = False
            if "_adv" in vector_name:
                adv_mode = True
            fee = FacetEfficientEvaluation(vectors=vecs, corpus=corpus, data_set_name=data_set,
                                           facet_names=facets, topic_vectors=topic_vecs)
            fac_relaxed_scores, fac_strict_scores, fac_strict_fac_only = fee.evaluate(word_top_n=100,
                                                                                      topic_dict=topic_dict,
                                                                                      summary_dict=summary_dict,
                                                                                      adv_mode=adv_mode)
            for fac_name in facets:
                results.append((data_set, vector_name, fac_name,
                                fac_relaxed_scores[fac_name], fac_strict_scores[fac_name],
                                fac_strict_fac_only[fac_name]))

        tuples = []
        for result in results:
            data_set, vector_name, fac_name, relaxed_scores, strict_scores, fac_only_scores = result
            tuples.append((data_set, fac_name, vector_name,
                           sum(relaxed_scores) / len(relaxed_scores),
                           sum(strict_scores) / len(strict_scores),
                           sum(fac_only_scores) / len(fac_only_scores)))

        df = pd.DataFrame(tuples, columns=['Corpus', 'Facet', 'Algorithm',
                                           'Relaxed Score', 'Strict Score', 'Facet Only Score'])
        df = df.sort_values(['Corpus', 'Facet', 'Algorithm', 'Relaxed Score', 'Strict Score', 'Facet Only Score'])
        print(df)
        df.to_csv('results/facet_evaluation/facet_task_results.csv', index=False)
        print(df.to_latex(index=False))
        results = []
        a_time = time.time() - start_time

        start_time = time.time()
        # for vector_name in tqdm(vector_names, desc="Iterate through embedding types", total=len(vector_names)):
        #     print('---')
        #     vec_path = Vectorizer.build_vec_file_name("all",
        #                                               "no_limit",
        #                                               data_set,
        #                                               "no_filter",
        #                                               vector_name,
        #                                               "real")
        #
        #     vecs = Vectorizer.my_load_doc2vec_format(vec_path)
        #
        #     for fac_name in tqdm(facets, total=len(facets), desc="Iterate through facetes"):
        #         fe = FacetEvaluation(fac_name, vecs, c, data_set)
        #         relaxed_scores, strict_scores = fe.evaluate()
        #         results.append((data_set, vector_name, fac_name, relaxed_scores, strict_scores))
        #
        #     tuples = []
        #     for result in results:
        #         data_set, vector_name, fac_name, relaxed_scores, strict_scores = result
        #         tuples.append((data_set, vector_name, fac_name,
        #                        sum(relaxed_scores) / len(relaxed_scores),
        #                        sum(strict_scores) / len(strict_scores)))
        #
        #     df = pd.DataFrame(tuples, columns=['Corpus', 'Algorithm', 'Facet', 'Relaxed Score', 'Strict Score'])
        #     print(df)
        #     df.to_csv('results/facet_evaluation/facet_task_results.csv', index=False)
        b_time = time.time() - start_time
        print(a_time, b_time)


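# Illustrative call, not part of the original code: the data set, algorithm and facet names are
# taken from values used elsewhere in this code ("classic_gutenberg", "book2vec"/"book2vec_adv",
# and the six facet identifiers); the combination itself is only an example.
if __name__ == '__main__':
    calculate_facet_scores(data_sets=["classic_gutenberg"],
                           vector_names=["book2vec", "book2vec_adv"],
                           facets=["loc", "time", "atm", "sty", "cont", "plot"])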