def get_topic_distribution(corpus: Corpus, dataset: str, overwrite: bool = False):
    # Train (or reuse) an LDA model, store one topic vector per document in
    # doc2vec keyed-vector format, then load and return those vectors.
    if overwrite or not os.path.isfile(f'D:/models/topic_vectors/{dataset}.kv'):
        _, _, topic_model, lda_corpus, doc_ids = TopicModeller.train_lda_mem_eff(corpus)
        topic_vectors = {}
        for i, doc_id in enumerate(doc_ids):
            doc = lda_corpus[i]
            # topic_model[doc][0] holds the per-document topic distribution
            topic_vectors[doc_id] = np.array([score for (topic, score) in topic_model[doc][0]])
        Vectorization.my_save_doc2vec_format(fname=f'D:/models/topic_vectors/{dataset}.kv',
                                             doctag_vec=topic_vectors)
    topic_vecs, _ = Vectorization.my_load_doc2vec_format(f'D:/models/topic_vectors/{dataset}.kv')
    return topic_vecs
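# Caveat (sketch, assuming topic_model is a gensim LdaModel; train_lda_mem_eff is
# project-specific): gensim's `model[bow]` only lists topics above the model's
# minimum_probability threshold, so per-document vectors built from it may differ
# in length. Fixed-length vectors can be obtained via the standard gensim call:
#
#     dense = np.array([prob for _, prob in
#                       topic_model.get_document_topics(doc, minimum_probability=0.0)])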
def loop_facets(vectors, corpus: Corpus):
    # Collect facet-wise similarities for every unordered pair of documents.
    tuples = []
    for i, doc_id_a in enumerate(corpus.documents.keys()):
        for j, doc_id_b in enumerate(corpus.documents.keys()):
            if j > i:
                tuples.extend(Vectorization.get_facet_sims(vectors, corpus, doc_id_a, doc_id_b))
    df = pd.DataFrame(tuples, columns=["Facet", "ID_A", "ID_B", "Similarity"])
    df.to_csv('results/facet_evaluation/facet_sims.csv')
    print(df)
    return df
def topic_evaluation(self, facet_name: str, doc_id: str, is_series_corpus: bool):
    # Compare the facet-specific neighbourhood of a document with its LDA topic
    # neighbourhood via Spearman rank correlation over all documents.
    sim_docs = Vectorization.most_similar_documents(self.vectors, self.corpus,
                                                    positives=doc_id,
                                                    topn=len(self.corpus.documents.items()),
                                                    feature_to_use=facet_name,
                                                    print_results=False,
                                                    series=is_series_corpus)
    sim_docs = {doctag.replace(f'_{facet_name}', ''): sim for (doctag, sim) in sim_docs}
    topic_sim_docs = Vectorization.most_similar_documents(self.topic_vectors, self.corpus,
                                                          positives=doc_id,
                                                          topn=len(self.corpus.documents.items()),
                                                          print_results=False,
                                                          series=is_series_corpus)
    topic_sim_docs = {doctag: sim for (doctag, sim) in topic_sim_docs}

    neural_sims = []
    topic_sims = []
    for doc_key in self.corpus.documents.keys():
        neural_sims.append(sim_docs[doc_key])
        topic_sims.append(topic_sim_docs[doc_key])

    spearman_corr, spearman_p = stats.spearmanr(np.array(neural_sims), np.array(topic_sims))
    spearman_corr = abs(spearman_corr)
    # The strict score only counts correlations that are significant at p < 0.05.
    spearman_corr_strict = spearman_corr
    if spearman_p >= 0.05:
        spearman_corr_strict = 0
    return spearman_corr, spearman_corr_strict
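# Minimal sketch of the correlation check above: scipy's spearmanr returns (rho, p-value)
# and the "strict" score zeroes out correlations that are not significant at the 5% level
# (toy numbers, for illustration only):
#
#     from scipy import stats
#     rho, p = stats.spearmanr([0.9, 0.5, 0.1], [0.8, 0.6, 0.2])
#     strict_rho = abs(rho) if p < 0.05 else 0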
def get_facet_sims_of_books(vectors, corpus: Corpus, doc_id_a: str, doc_id_b: str):
    # Facet similarities for a single document pair, reshaped to one column per facet.
    tuples = Vectorization.get_facet_sims(vectors, corpus, doc_id_a, doc_id_b)
    df = pd.DataFrame(tuples, columns=["Facet", "ID_A", "ID_B", "Similarity"])
    df = df.pivot(index=["ID_A", "ID_B"], columns="Facet", values="Similarity")
    return df
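# The pivot in get_facet_sims_of_books turns the long rows (Facet, ID_A, ID_B, Similarity)
# into one wide row per document pair with a column per facet, which radar_chart() consumes
# below. The same reshape on toy values:
#
#     long_df = pd.DataFrame([("atm", "a", "b", 0.7), ("sty", "a", "b", 0.4)],
#                            columns=["Facet", "ID_A", "ID_B", "Similarity"])
#     wide_df = long_df.pivot(index=["ID_A", "ID_B"], columns="Facet", values="Similarity")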
# c = Corpus.fast_load(path="corpora/german_series", load_entities=False)
# vec_path = Vectorizer.build_vec_file_name("all", "no_limit", "german_series",
#                                           "no_filter", "book2vec", "real")

c = Corpus.fast_load(path="../corpora/classic_gutenberg", load_entities=False)
vec_path = Vectorization.build_vec_file_name("", "", "classic_gutenberg", "no_filter",
                                             "book2vec_adv", "real", allow_combination=True)
vecs, _ = Vectorization.my_load_doc2vec_format(vec_path)

Vectorization.most_similar_documents(vecs, c, positives="cb_18", feature_to_use="atm")
big_df = loop_facets(vecs, c)
radar_chart(get_facet_sims_of_books(vecs, c, c[0].doc_id, c[1].doc_id))
radar_chart(get_facet_sims_of_books(vecs, c, c[0].doc_id, c[2].doc_id))
def eval_vec_loop_eff(cls, corpus, number_of_subparts, corpus_size, data_set, filter_mode,
                      vectorization_algorithm):
    vec_path = Vectorization.build_vec_file_name(number_of_subparts, corpus_size, data_set,
                                                 filter_mode, vectorization_algorithm, "real",
                                                 allow_combination=True)
    summation_method = "NF"
    vectors, summation_method = Vectorization.my_load_doc2vec_format(vec_path)
    # Fallback for focus-facet variants ("<base>_o_<facet>") whose combined vector file
    # does not exist: load the base algorithm's vectors and use the facet as summation method.
    # try:
    #     vectors, _ = Vectorization.my_load_doc2vec_format(vec_path)
    # except FileNotFoundError:
    #     if "_o_" in vectorization_algorithm:
    #         base_algorithm, focus_facette = vectorization_algorithm.split("_o_")
    #         vec_path = Vectorization.build_vec_file_name(number_of_subparts, corpus_size,
    #                                                      data_set, filter_mode, base_algorithm,
    #                                                      "real", allow_combination=True)
    #         vectors, _ = Vectorization.my_load_doc2vec_format(vec_path)
    #         summation_method = focus_facette
    #     else:
    #         raise FileNotFoundError

    doctags = vectors.docvecs.doctags.keys()
    doctags = [doctag for doctag in doctags if doctag[-1].isdigit()]
    length_vals = {doc_id: len(document.get_flat_tokens_from_disk())
                   for doc_id, document in corpus.documents.items()}
    # length_vals = {doc_id: len(document.get_flat_document_tokens())
    #                for doc_id, document in corpus.documents.items()}
    # histogram(length_vals)

    # Pairwise cosine similarities of the document vectors next to length-based similarities.
    full_tuples = []
    for doc_id_a in doctags:
        for doc_id_b in doctags:
            cos_sim = vectors.docvecs.similarity(cls.modified_doc_id(doc_id_a, summation_method),
                                                 cls.modified_doc_id(doc_id_b, summation_method))
            if cos_sim < 0:
                cos_sim = -1 * cos_sim
            length_sim = TextLengthExperiment.length_similarity(length_vals, doc_id_a, doc_id_b)
            length_abs = TextLengthExperiment.length_abs(length_vals, doc_id_a, doc_id_b)
            full_tuples.append((doc_id_a, doc_id_b, cos_sim, length_sim, length_abs,
                                length_vals[doc_id_a], length_vals[doc_id_b]))
    full_df = pd.DataFrame(full_tuples, columns=['Doc ID A', 'Doc ID B', 'Cosine Similarity',
                                                 'Length Similarity', 'Length Distance',
                                                 'Length A', 'Length B'])
    print(full_df)

    tria_cos = cls.triangle_values(full_df, 'Cosine Similarity')
    # tria_len_d = cls.triangle_values(full_df, 'Length Distance')
    tria_len = cls.triangle_values(full_df, 'Length Similarity')

    tuples = []
    for cosine, length, length_a in zip(full_df['Cosine Similarity'].to_numpy().flatten(),
                                        full_df['Length Similarity'].to_numpy().flatten(),
                                        full_df['Length A'].to_numpy().flatten()):
        tuples.append((cosine, length, length_a))
    # df = pd.DataFrame(tuples, columns=['Cosine Similarity', 'Length Similarity', 'Length A'])
    # pd.plotting.scatter_matrix(df, hist_kwds={'bins': 50})
    # plt.show()

    HistScatter(full_df, x0_label='Cosine Similarity', x1_label='Length Similarity',
                algorithm_name=vectorization_algorithm)
    # print(vectorization_algorithm, stats.pearsonr(full_df['Cosine Similarity'].to_numpy(),
    #                                               full_df['Length Similarity'].to_numpy()))
    # print(vectorization_algorithm, stats.pearsonr(tria_cos, tria_len))

    # Spearman correlation between cosine and length similarity, overall and per length quartile.
    filter_1q, filter_2q, filter_3q = get_short_mid_long(full_df, "Length A")
    # noinspection PyTypeChecker
    full_spearman = stats.spearmanr(tria_cos, tria_len)
    full_spearman = f'{full_spearman[0]} [{full_spearman[1]}]'
    short_spearman = stats.spearmanr(filter_1q[['Cosine Similarity']].to_numpy().flatten(),
                                     filter_1q[['Length Similarity']].to_numpy().flatten())
    short_spearman = f'{short_spearman[0]} [{short_spearman[1]}]'
    mid_spearman = stats.spearmanr(filter_2q[['Cosine Similarity']].to_numpy().flatten(),
                                   filter_2q[['Length Similarity']].to_numpy().flatten())
    mid_spearman = f'{mid_spearman[0]} [{mid_spearman[1]}]'
    long_spearman = stats.spearmanr(filter_3q[['Cosine Similarity']].to_numpy().flatten(),
                                    filter_3q[['Length Similarity']].to_numpy().flatten())
    long_spearman = f'{long_spearman[0]} [{long_spearman[1]}]'

    return vectorization_algorithm, full_spearman, short_spearman, mid_spearman, long_spearman
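# triangle_values presumably keeps each unordered document pair once (the upper triangle of
# the full pairwise matrix) before correlating cosine and length similarity. A numpy sketch
# of that idea, assuming a symmetric square similarity matrix m:
#
#     m = np.array([[1.0, 0.8, 0.3], [0.8, 1.0, 0.5], [0.3, 0.5, 1.0]])
#     upper = m[np.triu_indices(len(m), k=1)]   # array([0.8, 0.3, 0.5])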
def force_directed_graph(model, corpus: Corpus):
    top_n_docs = 3
    top_n_tfidf_words = 5000
    top_n_words = 50
    words_shown = 7

    labels = []
    word_labels = set()
    word_neighbors = {}
    document_neighbors = defaultdict(list)

    # Build a tf-idf model over the corpus to restrict word neighbours to relevant terms.
    tokenized_document_corpus = CorpusDocumentIterator(corpus, lemma=False, lower=False)
    dictionary = corpora.Dictionary()
    bow_corpus = [dictionary.doc2bow(doc, allow_update=True) for doc in tokenized_document_corpus]
    tf_idf_model = TfidfModel(bow_corpus)
    doc_id_mapping = {doc_id: i for i, doc_id in enumerate(tokenized_document_corpus.doc_ids)}

    relevant_words_of_doc = set()
    relevant_words_with_sims = set()
    for doc_id, doc in zip(tokenized_document_corpus.doc_ids, tf_idf_model[bow_corpus]):
        tuples = [(dictionary[word_id], sim) for word_id, sim in doc]
        # Sort by tf-idf score in descending order so the slice keeps the highest-scored words.
        tuples.sort(key=lambda x: x[1], reverse=True)
        relevant_words_with_sims.update(set([(doc_id, word, sim) for word, sim in tuples]))
        tuples = tuples[:top_n_tfidf_words]
        relevant_words = set([word for word, sim in tuples])
        relevant_words_of_doc.update(relevant_words)

    tf_idf_lookup = defaultdict(dict)
    for (doc_id, word, sim) in relevant_words_with_sims:
        tf_idf_lookup[doc_id][word] = sim

    # Collect document and word neighbours for every facet doctag (ids ending in a letter).
    for doc_id in model.docvecs.doctags:
        if str(doc_id)[-1].isalpha() and not str(doc_id).endswith("raw"):
            sim_docs = Vectorization.most_similar_documents(model, corpus, positives=[doc_id],
                                                            topn=top_n_docs, print_results=False)
            sim_words = Vectorization.most_similar_words(model, positives=[doc_id],
                                                         topn=top_n_words, print_results=False)
            labels.append((doc_id, "doc"))
            sim_words = [(sim_word[0], sim_word[1]) for sim_word in sim_words
                         if sim_word[0] in relevant_words_of_doc]
            word_neighbors[doc_id] = sim_words
            sim_words = [sim_word[0] for sim_word in sim_words][:words_shown]
            word_labels.update(sim_words)
            document_neighbors[doc_id].extend(sim_docs[1:])

    # Keep only words that are a neighbour of at least one document.
    reverted_word_neighbors = defaultdict(set)
    for doc_id, neighbors in word_neighbors.items():
        for (word_neighbor, sim) in neighbors:
            reverted_word_neighbors[word_neighbor].add(doc_id)
    reverted_word_neighbors = {word: len(documents)
                               for word, documents in reverted_word_neighbors.items()
                               if len(documents) >= 1}
    word_labels = [(word, "word") for word in word_labels
                   if word in reverted_word_neighbors.keys()]
    word_neighbors = {doc_id: [word_sim for word_sim in word_sims
                               if word_sim[0] in reverted_word_neighbors.keys()]
                      for doc_id, word_sims in word_neighbors.items()}
    labels.extend(word_labels)

    nodes = []
    label2id = {}

    # Degree counts steer node size and centrality attributes in the d3 layout.
    word_degree_dict = defaultdict(lambda: 1)
    doc_degree_dict = defaultdict(lambda: 1)
    for label, neighbors in word_neighbors.items():
        for neighbor in neighbors:
            doc_degree_dict[label] += 1
            word_degree_dict[neighbor[0]] += 1
    print(word_degree_dict)

    for i, (label, typ) in enumerate(labels):
        size = 100
        degree = 2.0
        closeness = 1
        eigenvector = 1
        if typ == "word":
            size = 50
            degree = 1.0
            print(label, typ, label in word_degree_dict)
            if label in word_degree_dict:
                closeness += word_degree_dict[label]
                eigenvector += word_degree_dict[label] ** 3
        else:
            if label in doc_degree_dict:
                closeness += doc_degree_dict[label]
                eigenvector += doc_degree_dict[label]
        closeness = float(closeness)
        eigenvector = float(eigenvector)
        nodes.append({"small": 1.0,
                      "documents": closeness,
                      "standard": degree,
                      "words": eigenvector,
                      "colour": colors(label)[0],
                      "fontcolour": colors(label)[0],
                      "id": doc_id_replace(corpus, label),
                      "name": label,
                      "value": size})
        label2id[label] = i

    links = []
    for (label, typ) in labels:
        if label in document_neighbors:
            for doc_neighbor in document_neighbors[label]:
                links.append({"source": label2id[label],
                              "target": label2id[doc_neighbor[0]],
                              "value": int(doc_neighbor[1] * 100),
                              "colour": link_color(label, neighbor=doc_neighbor[0])})
        if label in word_neighbors:
            for word_neighbor in word_neighbors[label]:
                if word_neighbor[0] in label2id:
                    links.append({"source": label2id[label],
                                  "target": label2id[word_neighbor[0]],
                                  "value": int(word_neighbor[1] * 100 / 2),
                                  "colour": "#cccccc"})

    d3_graph = {"nodes": nodes, "links": links}
    with open('../d3/neighborhood.json', 'w', encoding="utf-8") as outfile:
        json.dump(d3_graph, outfile, indent=1)
    return d3_graph
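# The exported neighborhood.json follows the node/link shape that the d3 force-directed
# layout expects; an illustrative (made-up) minimal instance looks like this:
#
#     {"nodes": [{"small": 1.0, "documents": 3.0, "standard": 2.0, "words": 9.0,
#                 "colour": "#1f77b4", "fontcolour": "#1f77b4",
#                 "id": "Dracula - Bram Stoker", "name": "cb_18_atm", "value": 100}],
#      "links": [{"source": 0, "target": 4, "value": 73, "colour": "#cccccc"}]}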
def neighbor_plot(model, corpus: Corpus):
    document_labels = []
    document_vectors = []
    plt.rcParams.update({'font.size': 6})
    neighbors = {}
    word_labels = []
    facets = {}
    word_neighbors = {}

    # Gather the nearest document and the nearest words for every facet doctag.
    for doc_id in model.docvecs.doctags:
        if str(doc_id)[-1].isalpha():
            sim_docs = Vectorization.most_similar_documents(model, corpus, positives=[doc_id],
                                                            topn=2, print_results=False)
            print(doc_id_replace(corpus, doc_id), doc_id_replace(corpus, sim_docs[-1][0]))
            sim_words = Vectorization.most_similar_words(model, positives=[doc_id], topn=2,
                                                         print_results=False)
            sim_words = [sim_word[0] for sim_word in sim_words]
            word_labels.extend(sim_words)
            word_neighbors[doc_id] = sim_words
            for sim_word in sim_words:
                if str(doc_id)[-1].isalpha():
                    facets[sim_word] = str(doc_id).split('_')[-1]
                else:
                    facets[sim_word] = "sum"
            neighbors[doc_id] = sim_docs[-1][0]
            document_vectors.append(model.docvecs[doc_id])
            document_labels.append(doc_id_replace(corpus, doc_id))

    word_vectors = [model.wv[word] for word in word_labels]
    print(word_labels)

    labels = []
    labels.extend(document_labels)
    labels.extend(word_labels)
    vectors = []
    vectors.extend(document_vectors)
    vectors.extend(word_vectors)

    # Project document and word vectors into 2D.
    # dim_reduced_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500,
    #                          random_state=42)
    dim_reduced_model = UMAP(n_components=2, init='spectral', random_state=42)
    new_values = dim_reduced_model.fit_transform(vectors)
    reduced_dict = {label: new_value for new_value, label in zip(new_values, labels)}

    new_vals = []
    new_labels = []
    new_lines = []
    for doc_id in model.docvecs.doctags:
        if not doc_id.endswith("raw"):
            # The trailing "or True" currently disables the facet-index filter.
            if "_0_" in doc_id or "_1_" in doc_id or "_2_" in doc_id or "_3_" in doc_id or True:
                try:
                    sim_doc_id = neighbors[doc_id]
                    lab = doc_id_replace(corpus, doc_id)
                    sim_lab = doc_id_replace(corpus, sim_doc_id)
                    dot = reduced_dict[lab]
                    sim_dot = reduced_dict[sim_lab]
                    new_vals.append(dot)
                    new_vals.append(sim_dot)
                    new_labels.append(lab)
                    new_labels.append(sim_lab)
                    x = [dot[0], sim_dot[0]]
                    y = [dot[1], sim_dot[1]]
                    c = colors(doc_id)[0]
                    new_lines.append((x, y, c))
                except KeyError:
                    pass

    for word in word_labels:
        new_vals.append(reduced_dict[word])
        new_labels.append(word)

    new_values = new_vals
    document_labels = new_labels
    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])

    plt.figure(figsize=(16, 16))
    dots = []
    labs = []
    for i in range(len(x)):
        if document_labels[i] in facets:
            color, facet = colors(facets[document_labels[i]])
        else:
            color, facet = colors(document_labels[i])
        ax = plt.scatter(x[i], y[i], c=color)
        if facet not in labs:
            dots.append(ax)
            labs.append(facet)
        plt.annotate(document_labels[i].split(" - ")[0],
                     xy=(x[i], y[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    for i, line in enumerate(new_lines):
        plt.plot(line[0], line[1], color=line[2])

    labs = [label_replace(lab) for lab in labs]
    plt.legend(dots, labs, loc='best', ncol=2, fontsize=20)
    plt.show()
if __name__ == '__main__':
    # data_set_name = "classic_gutenberg"
    # data_set_name = "german_books"
    data_set_name = "goodreads_genres"
    vectorization_algorithm = "book2vec"
    filter = "no_filter"  # "specific_words_strict" / "no_filter"
    vec_path = Vectorization.build_vec_file_name("all", "no_limit", data_set_name, filter,
                                                 vectorization_algorithm, "real",
                                                 allow_combination=True)
    vecs, summation_method = Vectorization.my_load_doc2vec_format(vec_path)
    c = Corpus.fast_load("all", "no_limit", data_set_name, filter, "real", load_entities=False)

    tsne_plot(vecs, c)
    # neighbor_plot(vecs, c)
    force_directed_graph(vecs, c)
def get_neighbors(data_sets: List[str], vector_names: List[str]):
    doc_top_n = 3
    facet_names = [
        # "loc",
        # "time",
        # "atm",
        # "sty",
        "cont",
        # "plot"
    ]
    is_series_corpus = False
    tuples = []
    columns = None
    for data_set in data_sets:
        corpus = Corpus.fast_load(path=os.path.join('../corpora', data_set), load_entities=False)
        for vector_name in tqdm(vector_names, desc="Iterate through embedding types",
                                total=len(vector_names)):
            vec_path = Vectorization.build_vec_file_name("all", "no_limit", data_set, "no_filter",
                                                         vector_name, "real",
                                                         allow_combination=True)
            vectors, _ = Vectorization.my_load_doc2vec_format(vec_path)
            for doc_id in corpus.documents.keys():
                for facet_name in facet_names:
                    # Drop the query document itself (first hit) and keep the top neighbours.
                    sim_docs = Vectorization.most_similar_documents(vectors, corpus,
                                                                    positives=doc_id,
                                                                    topn=doc_top_n,
                                                                    feature_to_use=facet_name,
                                                                    print_results=False,
                                                                    series=is_series_corpus)[1:]
                    if len(sim_docs) == 2:
                        tuples.append((data_set, vector_name, facet_name,
                                       table_format(corpus.documents[doc_id]), 1,
                                       table_format(corpus.documents[replace_sim_id(sim_docs[0][0])]),
                                       table_format(corpus.documents[replace_sim_id(sim_docs[1][0])])))
                        columns = ["Dataset", "Algorithm", "Facet", "Book", "Rank",
                                   "First Neighbor", "Second Neighbor"]
                    else:
                        for i, (sim_doc_id, sim) in enumerate(sim_docs):
                            tuples.append((data_set, vector_name, facet_name,
                                           table_format(corpus.documents[doc_id]), i,
                                           table_format(corpus.documents[replace_sim_id(sim_doc_id)]),
                                           sim))
                        columns = ["Dataset", "Algorithm", "Facet", "Book", "Rank",
                                   "Similar Book", "Similarity"]
    df = pd.DataFrame(tuples, columns=columns)
    df.to_csv("results/neighbors/neighbors.csv")
    print(df)
def calculate_facet_scores(data_sets: List[str], vector_names: List[str], facets: List[str],
                           use_topic_vecs: bool = False):
    results = []
    for data_set in data_sets:
        corpus = Corpus.fast_load(path=os.path.join('../corpora', data_set), load_entities=False)
        topic_dict = None
        summary_dict = None
        if "cont" in facets:
            topic_dict = TopicModeller.topic_modelling(corpus)
        if "plot" in facets:
            summary_dict = Summarizer.get_summary(corpus)

        start_time = time.time()
        if use_topic_vecs:
            topic_vecs = TopicModeller.get_topic_distribution(corpus, data_set)
        else:
            topic_vecs = None

        for vector_name in tqdm(vector_names, desc="Iterate through embedding types",
                                total=len(vector_names)):
            vec_path = Vectorization.build_vec_file_name("all", "no_limit", data_set, "no_filter",
                                                         vector_name, "real",
                                                         allow_combination=True)
            vecs, _ = Vectorization.my_load_doc2vec_format(vec_path)
            adv_mode = False
            if "_adv" in vector_name:
                adv_mode = True
            fee = FacetEfficientEvaluation(vectors=vecs, corpus=corpus, data_set_name=data_set,
                                           facet_names=facets, topic_vectors=topic_vecs)
            fac_relaxed_scores, fac_strict_scores, fac_strict_fac_only = fee.evaluate(
                word_top_n=100, topic_dict=topic_dict, summary_dict=summary_dict,
                adv_mode=adv_mode)

            for fac_name in facets:
                results.append((data_set, vector_name, fac_name,
                                fac_relaxed_scores[fac_name],
                                fac_strict_scores[fac_name],
                                fac_strict_fac_only[fac_name]))

        tuples = []
        for result in results:
            data_set, vector_name, fac_name, relaxed_scores, strict_scores, fac_only_scores = result
            tuples.append((data_set, fac_name, vector_name,
                           sum(relaxed_scores) / len(relaxed_scores),
                           sum(strict_scores) / len(strict_scores),
                           sum(fac_only_scores) / len(fac_only_scores)))

        df = pd.DataFrame(tuples, columns=['Corpus', 'Facet', 'Algorithm', 'Relaxed Score',
                                           'Strict Score', 'Facet Only Score'])
        df = df.sort_values(['Corpus', 'Facet', 'Algorithm', 'Relaxed Score', 'Strict Score',
                             'Facet Only Score'])
        print(df)
        df.to_csv('results/facet_evaluation/facet_task_results.csv', index=False)
        print(df.to_latex(index=False))
        results = []

        a_time = time.time() - start_time
        start_time = time.time()
        # Previous, slower per-facet variant kept for the timing comparison below:
        # for vector_name in tqdm(vector_names, desc="Iterate through embedding types",
        #                         total=len(vector_names)):
        #     vec_path = Vectorizer.build_vec_file_name("all", "no_limit", data_set, "no_filter",
        #                                               vector_name, "real")
        #     vecs = Vectorizer.my_load_doc2vec_format(vec_path)
        #     for fac_name in tqdm(facets, total=len(facets), desc="Iterate through facets"):
        #         fe = FacetEvaluation(fac_name, vecs, c, data_set)
        #         relaxed_scores, strict_scores = fe.evaluate()
        #         results.append((data_set, vector_name, fac_name, relaxed_scores, strict_scores))
        #
        # tuples = []
        # for result in results:
        #     data_set, vector_name, fac_name, relaxed_scores, strict_scores = result
        #     tuples.append((data_set, vector_name, fac_name,
        #                    sum(relaxed_scores) / len(relaxed_scores),
        #                    sum(strict_scores) / len(strict_scores)))
        # df = pd.DataFrame(tuples, columns=['Corpus', 'Algorithm', 'Facet', 'Relaxed Score',
        #                                    'Strict Score'])
        # print(df)
        # df.to_csv('results/facet_evaluation/facet_task_results.csv', index=False)
        b_time = time.time() - start_time
        print(a_time, b_time)
def word_neighborhood_evaluation(self, facet_name: str, doc_id: str, doc_top_n: int,
                                 word_top_n: int, is_series_corpus: bool, document: Document,
                                 topic_dict, summary_dict, adv_mode: bool):
    # Score how strongly the facet-specific word neighbourhood of a document is shared by its
    # most similar documents (relaxed: any shared word, strict: facet words only).
    sim_docs = Vectorization.most_similar_documents(self.vectors, self.corpus, positives=doc_id,
                                                    topn=doc_top_n, feature_to_use=facet_name,
                                                    print_results=False, series=is_series_corpus)
    sim_words = Vectorization.most_similar_words(self.vectors, positives=[doc_id],
                                                 topn=word_top_n, feature_to_use=facet_name,
                                                 print_results=False)
    sim_words_relaxed = set([word for word, sim in sim_words])
    facet_words = self.get_facet_words(document, facet_name, topic_dict, summary_dict,
                                       adv_mode=adv_mode)
    sim_words_strict = set([word for word in sim_words_relaxed if word in facet_words])

    shared_word_values_relaxed = []
    shared_word_values_strict = []
    reciprocal_ranks = []
    r = 1
    simple_facet_words = sim_words_strict
    for sim_doc_id, sim_doc in sim_docs:
        if str(sim_doc_id).startswith(doc_id):
            # Skip the query document itself.
            continue
        sim_doc_words = Vectorization.most_similar_words(self.vectors, positives=[sim_doc_id],
                                                         topn=word_top_n, feature_to_use=None,
                                                         print_results=False)
        sim_doc_words_relaxed = set([word for word, sim in sim_doc_words])
        sim_doc_words_strict = set([word for word in sim_doc_words_relaxed
                                    if word in facet_words])
        # print(facet_name, sim_doc_words_strict)

        shared_words_relaxed = sim_words_relaxed.intersection(sim_doc_words_relaxed)
        shared_words_strict = sim_words_strict.intersection(sim_doc_words_strict)

        shared_word_values_relaxed.append(len(shared_words_relaxed) / word_top_n)
        shared_word_values_strict.append(len(shared_words_strict) / word_top_n)
        reciprocal_ranks.append(1 / r)
        r += 1

    # Weight the per-neighbour overlap by normalised reciprocal ranks.
    reciprocal_ranks = [rank / sum(reciprocal_ranks) for rank in reciprocal_ranks]
    score_relaxed = sum([shared_val * rank_val for shared_val, rank_val
                         in zip(shared_word_values_relaxed, reciprocal_ranks)])
    score_strict = sum([shared_val * rank_val for shared_val, rank_val
                        in zip(shared_word_values_strict, reciprocal_ranks)])
    return score_relaxed, score_strict, len(simple_facet_words) / word_top_n
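# The returned scores are rank-weighted word overlaps: for the k-th nearest document the
# fraction of shared top words is weighted by a normalised reciprocal rank 1/k. The same
# weighting on toy numbers:
#
#     overlaps = [0.30, 0.10, 0.05]             # shared-word fraction per neighbour
#     rr = [1 / (i + 1) for i in range(3)]      # 1, 1/2, 1/3
#     rr = [r / sum(rr) for r in rr]            # normalise so the weights sum to 1
#     score = sum(o * r for o, r in zip(overlaps, rr))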
def success_prediction_task(data_set_name: str, success_dict, vector_names):
    result_tuples = []
    correctness_table = defaultdict(list)
    for vectorization_algorithm in vector_names:
        # The majority-class baseline reuses the doc2vec vectors but predicts the positive
        # class for every document.
        majority_class = None
        if vectorization_algorithm == "majority_class":
            majority_class = vectorization_algorithm
            vectorization_algorithm = "doc2vec"

        vec_path = Vectorization.build_vec_file_name("all", "no_limit", data_set_name,
                                                     "no_filter", vectorization_algorithm,
                                                     "real", allow_combination=True)
        vectors, summation_method = Vectorization.my_load_doc2vec_format(vec_path)

        x = []
        y = []
        doc_ids = []
        k_fold_cross_val = None
        for doctag in vectors.docvecs.doctags:
            try:
                if summation_method and f"_{summation_method}" in str(doctag):
                    x.append(vectors.docvecs[doctag])
                    y.append(0 if success_dict[doctag.replace(f"_{summation_method}", "")]
                             == "failure" else 1)
                    doc_ids.append(doctag.replace(f"_{summation_method}", ""))
                elif not summation_method and str(doctag)[-1].isdigit():
                    doc_splitted = doctag.split("_")
                    if len(doc_splitted) > 1 and doc_splitted[-1][-1].isdigit() \
                            and doc_splitted[-2][-1].isdigit():
                        # Skip chunk-level doctags; only keep whole documents.
                        pass
                    else:
                        x.append(vectors.docvecs[doctag])
                        y.append(0 if success_dict[doctag] == "failure" else 1)
                        doc_ids.append(doctag)
                else:
                    pass
            except KeyError:
                pass

        counter = defaultdict(lambda: 0)
        print(len(y))
        for truth_val in y:
            counter[truth_val] += 1

        classifiers = [
            # RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0),
            sk.svm.LinearSVC(max_iter=10000, class_weight="balanced", dual=False,
                             random_state=42, C=1.5),
            sk.svm.SVC(max_iter=10000, class_weight="balanced", kernel="rbf",
                       random_state=42, C=0.5),
            sk.svm.NuSVC(max_iter=10000, class_weight="balanced", kernel="rbf", gamma="scale",
                         random_state=42, nu=0.5),
            # MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(10, 4),
            #               random_state=42, max_iter=10000),
            # LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr',
            #                    max_iter=1000),
        ]
        for classifier in classifiers:
            pipeline_classifier = make_pipeline(StandardScaler(), classifier)
            if k_fold_cross_val:
                weights = [counter[pred] for pred in y]
                scoring = {'accuracy': make_scorer(accuracy_score),
                           'precision': make_scorer(precision_score),
                           'recall': make_scorer(recall_score),
                           'f1_score': make_scorer(f1_score)}
                kfold = sk.model_selection.KFold(n_splits=k_fold_cross_val, random_state=42,
                                                 shuffle=True)
                results = sk.model_selection.cross_validate(estimator=pipeline_classifier,
                                                            X=x, y=y, cv=kfold, scoring=scoring)
                result_tuples.append((classifier.__class__.__name__, vectorization_algorithm,
                                      np.mean(results["test_f1_score"]),
                                      np.mean(results["test_precision"]),
                                      np.mean(results["test_recall"]),
                                      np.mean(results["test_accuracy"])))
            else:
                # Repeated random train/test splits instead of k-fold cross-validation.
                nr_iterations = 100
                f1s = []
                for i in range(nr_iterations):
                    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30,
                                                                        random_state=i)
                    pipeline_classifier.fit(x_train, y_train)
                    y_pred = pipeline_classifier.predict(x_test)
                    if majority_class:
                        y_pred = [1 for _ in y_pred]
                        vectorization_algorithm = majority_class
                    correctness_table[f"{vectorization_algorithm}-{classifier.__class__.__name__}"] \
                        .extend([pred == truth for pred, truth in zip(y_pred, y_test)])
                    f1s.append(f1_score(y_test, y_pred, average="weighted", pos_label=None))
                if len(f1s) == 0:
                    f1s.append(0)
                result_tuples.append((classifier.__class__.__name__, vectorization_algorithm,
                                      np.mean(f1s),
                                      # precision_score(y_test, y_pred, average="weighted"),
                                      # recall_score(y_test, y_pred, average="weighted"),
                                      # accuracy_score(y_test, y_pred)
                                      ))

    print(correctness_table.keys())
    # McNemar's test on per-prediction agreement between two selected algorithms.
    classifiers = ["LinearSVC", "SVC", "NuSVC"]
    for cl in classifiers:
        algo1 = "xlm_pt"
        algo2 = "book2vec_concat"
        true_true = 0
        true_false = 0
        false_true = 0
        false_false = 0
        for e1, e2 in zip(correctness_table[f"{algo1}-{cl}"], correctness_table[f"{algo2}-{cl}"]):
            if e1 and e2:
                true_true += 1
            elif e1 and not e2:
                true_false += 1
            elif not e1 and e2:
                false_true += 1
            else:
                false_false += 1
        table = [[true_true, true_false], [false_true, false_false]]
        print(table)
        print(cl)
        mcnemar_sig_text(table)

    df = pd.DataFrame(result_tuples, columns=["Classifier", "Algorithm", "Weighted F1",
                                              # "Precision", "Recall", "Accuracy"
                                              ])
    print(df)
    df = df.pivot(index='Algorithm', columns='Classifier', values='Weighted F1')
    df.to_csv("../results/book_success_prediction/eval_scores.csv", index=True)
    print(df)
    print(df.to_latex())
def genre_prediction_task(data_set_name: str, genre_dict, vector_names):
    result_tuples = []
    for vectorization_algorithm in vector_names:
        vec_path = Vectorization.build_vec_file_name("all", "no_limit", data_set_name,
                                                     "no_filter", vectorization_algorithm,
                                                     "real", allow_combination=True)
        vectors, summation_method = Vectorization.my_load_doc2vec_format(vec_path)

        x = []
        y = []
        k_fold_cross_val = None
        for doctag in vectors.docvecs.doctags:
            if summation_method and f"_{summation_method}" in str(doctag):
                x.append(vectors.docvecs[doctag])
                y.append(genre_dict[doctag.replace(f"_{summation_method}", "")])
            else:
                if str(doctag)[-1].isdigit():
                    x.append(vectors.docvecs[doctag])
                    y.append(genre_dict[doctag])

        classifiers = [
            # RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0),
            # sk.svm.LinearSVC(max_iter=10000, class_weight="balanced", dual=False, random_state=42),
            sk.svm.LinearSVC(max_iter=20000, class_weight="balanced", dual=True, random_state=42),
            # sk.svm.LinearSVC(max_iter=10000, class_weight="balanced", dual=True, random_state=42, C=5),
            # sk.svm.LinearSVC(max_iter=10000, dual=False, random_state=42),
            # sk.svm.LinearSVC(max_iter=10000, dual=True, random_state=42),
            # MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=42,
            #               max_iter=10000),
            # LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr', max_iter=1000),
        ]
        for classifier in classifiers:
            if k_fold_cross_val:
                scoring = {'accuracy': make_scorer(accuracy_score),
                           'precision': make_scorer(precision_score),
                           'recall': make_scorer(recall_score),
                           'f1_score': make_scorer(f1_score)}
                kfold = sk.model_selection.KFold(n_splits=k_fold_cross_val, random_state=42,
                                                 shuffle=True)
                results = sk.model_selection.cross_validate(estimator=classifier, X=x, y=y,
                                                            cv=kfold, scoring=scoring)
                result_tuples.append((classifier.__class__.__name__, vectorization_algorithm,
                                      np.mean(results["test_f1_score"]),
                                      np.mean(results["test_precision"]),
                                      np.mean(results["test_recall"]),
                                      np.mean(results["test_accuracy"])))
            else:
                x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30,
                                                                    random_state=42)
                classifier.fit(x_train, y_train)
                y_pred = classifier.predict(x_test)
                result_tuples.append((classifier.__class__.__name__, vectorization_algorithm,
                                      f1_score(y_test, y_pred, average='weighted'),
                                      precision_score(y_test, y_pred, average='weighted'),
                                      recall_score(y_test, y_pred, average='weighted'),
                                      accuracy_score(y_test, y_pred)))

    df = pd.DataFrame(result_tuples, columns=["Classifier", "Algorithm", "F1", "Precision",
                                              "Recall", "Accuracy"])
    df = df.pivot(index='Algorithm', columns=['Classifier'], values='F1')
    df.to_csv("results/book_success_prediction/genre_eval_scores.csv", index=True)
    print(df)
    print(df.to_latex())