def descriptions_from_bow(descs, languages, translations, translate_policy):
    if translate_policy != "onlyorig" or languages != "en":
        raise NotImplementedError()
    desc_list = DescriptionList(add_title=False, add_subtitle=False, translate_policy=translate_policy,
                                additionals_names=list(descs["classes"].keys()))
    if get_setting("DEBUG"):
        descs["vecs"] = dict(list(descs["vecs"].items())[:get_setting("DEBUG_N_ITEMS")])
    for name, bow in descs["vecs"].items():
        desc_list.add(Description(lang=languages, text=None, title=name, subtitle=None, orig_textlang=None, bow=bow,
                                  additionals={k: v.get(name) for k, v in descs["classes"].items()}))
    desc_list.proc_steps.append("bow")
    return desc_list
def preprocess_raw_file(df, pp_components, min_ges_nwords=20):
    """Loads the given Siddata-style CSV into a pandas DataFrame, already performing some processing like dropping duplicates."""
    # TODO in exploration I also played around with Levenshtein-distance etc!
    # remove those for which the name (excluding stuff in parentheses) is equal...
    assert isinstance(df, pd.DataFrame)
    df = df.reset_index().drop(columns=["Unnamed: 0", "index"])
    # df = df[~df['description'].isnull()]
    df = df[df["description"] != "[]"]
    if get_setting("DEBUG"):
        df = df[:get_setting("DEBUG_N_ITEMS") * 2]
    df = Dataset.merge_multidescs(df, pp_components)
    df.loc[:, 'ges_nwords'] = df["description"].str.count(" ").fillna(0)
    df["subtitle"] = df["subtitle"] + df["subject"]  # TODO: maybe have an extra pp_comp for this?
    if pp_components.add_title:
        df.loc[:, 'ges_nwords'] += df["title"].str.count(" ").fillna(0)
    if pp_components.add_subtitle:
        df.loc[:, 'ges_nwords'] += df["subtitle"].str.count(" ").fillna(0)
    df = df[df["ges_nwords"] >= min_ges_nwords]
    with pd.option_context('mode.chained_assignment', None):  # TODO publisher to get uni
        for column in ["title", "description", "subtitle"]:
            df.loc[:, column] = df[column].copy().str.strip()
    return df
def run_lsi(pp_descriptions, filtered_dcm, verbose):
    """as in [VISR12: 4.2.1]"""
    if verbose:
        filtered_dcm.show_info(descriptions=pp_descriptions)
    if get_setting("DCM_QUANT_MEASURE") != "binary":
        logger.warn("VISR12 say it works best with binary!")
    orig_len = len(filtered_dcm.dtm)
    filtered_dcm.add_pseudo_keyworddocs()
    # https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html
    svd = TruncatedSVD(n_components=100, random_state=get_setting("RANDOM_SEED"))
    transformed = svd.fit_transform(filtered_dcm.as_csr().T)
    desc_psdoc_dists = cdist(transformed[:orig_len], transformed[orig_len:], "cosine")
    already_keywords = [[ind, j[0]] for ind, elem in enumerate(filtered_dcm.dtm[:orig_len]) for j in elem]
    # we don't gain information from those that are close but already keywords
    desc_psdoc_dists[list(zip(*already_keywords))] = np.inf
    WHICH_LOWEST = 30
    tenth_lowest = np.partition(desc_psdoc_dists.min(axis=1), WHICH_LOWEST)[WHICH_LOWEST]  # https://stackoverflow.com/a/43171216/5122790
    good_fits = np.where(desc_psdoc_dists.min(axis=1) < tenth_lowest)[0]
    for ndesc, keyword in zip(good_fits, np.argmin(desc_psdoc_dists[good_fits], axis=1)):
        assert not filtered_dcm.all_terms[keyword] in pp_descriptions._descriptions[ndesc]
        print(f"*b*{filtered_dcm.all_terms[keyword]}*b*", pp_descriptions._descriptions[ndesc])
    print()
def create_candidate_svm(embedding, term, quants, classifier, plot_svm=False, descriptions=None, quant_name=None, pgbar=None, **kwargs):
    #!! term is only used for visualization, and it must stay that way for CLUSTER_DIRECTION_ALGO = "reclassify" !
    bin_labels = np.array(quants, dtype=bool)  # ensure that regardless of quant_measure these are correct binary classification labels
    # (tmp := len(quants)/(2*np.bincount(bin_labels)))[0]/tmp[1] is roughly equal to bin_labels.mean(), so balancing is good
    if classifier == "SVM":
        svm = sklearn.svm.LinearSVC(class_weight="balanced", loss="hinge", max_iter=20000)
    elif classifier == "SVM_square":
        svm = sklearn.svm.LinearSVC(dual=False, class_weight="balanced")  # squared-hinge instead of hinge (but fastest!)
    elif classifier == "SVM2":
        warnings.warn("Using an SVM implementation that's slower for this kind of data!")
        svm = sklearn.svm.SVC(kernel="linear", class_weight="balanced", decision_function_shape="ovo")  # slower than LinearSVC, don't use!
        # see https://stackoverflow.com/q/33843981/5122790, https://stackoverflow.com/q/35076586/5122790
    else:
        raise NotImplementedError(f"Demanded classifier {classifier} not implemented!")
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        svm.fit(embedding, bin_labels)
        if w:
            assert issubclass(w[0].category, (sklearn.exceptions.ConvergenceWarning, DeprecationWarning))
    no_converge = (bool(w) and issubclass(w[0].category, sklearn.exceptions.ConvergenceWarning))
    tn, fp, fn, tp = confusion_matrix(bin_labels, svm.predict(embedding)).ravel()
    res = {"accuracy": (tp + tn) / len(quants), "precision": tp / (tp + fp), "recall": tp / (tp + fn), "did_converge": not no_converge}
    res["f_one"] = 2 * (res["precision"] * res["recall"]) / (res["precision"] + res["recall"])
    # now, in [DESC15:4.2.1], they compare the "ranking induced by \vec{v_t} with the number of times the term occurs in the entity's documents" with Cohen's Kappa.
    # see notebooks/proof_of_concept/get_svm_decisionboundary.ipynb#Checking-projection-methods-&-distance-measures-from-point-to-projection for the ranking
    decision_plane = NDPlane(svm.coef_[0], svm.intercept_[0])  # don't even need the plane class here
    dist = lambda x, plane: np.dot(plane.normal, x) + plane.intercept
    distances = [dist(point, decision_plane) for point in embedding]
    assert np.allclose(distances, svm.decision_function(embedding))
    # see https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC.decision_function, https://stats.stackexchange.com/a/14881
    distances /= np.linalg.norm(svm.coef_[0])  # TODO: add the links and this normalization to the distances-notebook
    # sanity check: do most of the points with label=0 have the same sign? `np.count_nonzero(np.sign(np.array(distances)[bin_labels])+1)`
    # bin_labels, np.array((np.sign(np.array(distances))+1)/2, dtype=bool)
    # quant_ranking = np.zeros(quants.shape); quant_ranking[np.where(quants > 0)] = np.argsort(quants[quants > 0])
    # TODO cohen's kappa has a sample_weight parameter!! DESC15 write they select Kappa "due to its tolerance to class imbalance." -> Does that mean I have to set the weight?!
    kappa_weights = get_setting("KAPPA_WEIGHTS") if get_setting("KAPPA_WEIGHTS") != "None" else None
    res["kappa_rank2rank_dense"] = cohen_kappa(rankdata(quants, method="dense"), rankdata(distances, method="dense"), weights=kappa_weights)  # if there are 14,900 zeros, the next one is a 1
    res["kappa_rank2rank_min"] = cohen_kappa(rankdata(quants, method="min"), rankdata(distances, method="dense"), weights=kappa_weights)  # if there are 14,900 zeros, the next one is a 14,901
    res["kappa_bin2bin"] = cohen_kappa(bin_labels, [i > 0 for i in distances], weights=kappa_weights)
    res["kappa_digitized"] = cohen_kappa(np.digitize(quants, np.histogram_bin_edges(quants)[1:]), np.digitize(distances, np.histogram_bin_edges(distances)[1:]), weights=kappa_weights)
    res["ndcg_all"] = ndcg_score(np.array([quants]), np.expand_dims(distances, 0))
    res["ndcg_onlypos"] = ndcg_score(np.array([quants]), np.expand_dims(distances, 0), k=np.count_nonzero(np.array(quants)))
    nonzero_indices = np.where(np.array(quants) > 0)[0]
    q2, d2 = np.array(quants)[nonzero_indices], np.array(distances)[nonzero_indices]
    with nullcontext():  # warnings.catch_warnings():  # TODO get rid of what causes the nans here!!!
        # warnings.filterwarnings('ignore', r'invalid value encountered in true_divide')
        if quant_name == "count":
            # in DESC15 they write "measure the correlation between the ranking induced by \vec{vt} and the number of times t appears
            # in the documents associated with each entity", so maybe compare ranking to count?!
            # res["kappa_count2rank"] = cohen_kappa(quants, rankdata(distances, method="dense"), weights=kappa_weights)
            res["kappa_count2rank_onlypos"] = cohen_kappa(q2, rankdata(d2, method="dense"), weights=kappa_weights)
        res["kappa_rank2rank_onlypos_dense"] = cohen_kappa(rankdata(q2, method="dense"), rankdata(d2, method="dense"), weights=kappa_weights)
        res["kappa_rank2rank_onlypos_min"] = cohen_kappa(rankdata(q2, method="min"), rankdata(d2, method="min"), weights=kappa_weights)
        res["kappa_rank2rank_onlypos_max"] = cohen_kappa(rankdata(q2, method="max"), rankdata(d2, method="max"), weights=kappa_weights)
        # res["kappa_digitized_onlypos_1"] = cohen_kappa(np.digitize(q2, np.histogram_bin_edges(quants)[1:]), np.digitize(d2, np.histogram_bin_edges(distances)[1:]), weights=kappa_weights)
        # ^ the one above uses the histogram-bins it would have for ALL data, the one below only those for the nonzero entries
        res["kappa_digitized_onlypos_2"] = cohen_kappa(np.digitize(q2, np.histogram_bin_edges(q2)[1:]), np.digitize(d2, np.histogram_bin_edges(d2)[1:]), weights=kappa_weights)
    if plot_svm and descriptions is not None:
        display_svm(embedding, np.array(bin_labels, dtype=int), svm, term=term, descriptions=descriptions,
                    name=term + " " + (", ".join(f"{k}: {round(v, 3)}" for k, v in res.items())), quants=quants, distances=distances, **kwargs)
    if pgbar is not None:
        pgbar.update(1)
    return res, decision_plane, term
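
# A minimal, self-contained sketch (not part of the pipeline) of the geometric fact used in create_candidate_svm above:
# the signed distance of a point to a LinearSVC decision hyperplane equals decision_function(x) divided by ||coef||.
# All data below is randomly generated and purely illustrative.
def _example_signed_distance_to_hyperplane():
    import numpy as np
    import sklearn.svm
    rng = np.random.default_rng(0)
    X = rng.normal(size=(50, 3))
    y = (X[:, 0] + 0.1 * rng.normal(size=50)) > 0
    svm = sklearn.svm.LinearSVC(class_weight="balanced", loss="hinge", max_iter=20000).fit(X, y)
    raw = X @ svm.coef_[0] + svm.intercept_[0]        # identical to svm.decision_function(X)
    assert np.allclose(raw, svm.decision_function(X))
    distances = raw / np.linalg.norm(svm.coef_[0])    # normalized: the actual euclidean distance to the plane
    return distances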
def create_candidate_svms(dcm, embedding, descriptions, verbose, continue_from=None):
    # TODO I am still not sure whether I am calculating with vectors somewhere where I should be working with points
    if hasattr(embedding, "embedding_"):
        embedding = embedding.embedding_
    decision_planes = {}
    metrics = {}
    terms = list(dcm.all_terms.values())
    metainf = {}
    if get_setting("DEBUG"):
        maxlen = min(len(terms), len(embedding), get_setting("DEBUG_N_ITEMS"), len(dcm.dtm))
        working_inds = [nterm for nterm, term in enumerate(terms[:maxlen]) if np.array(dcm.term_quants(term)[:maxlen], dtype=bool).std()]  # those with >1 class
        term_inds = unique(flatten([j[0] for j in dcm.dtm[i]] for i in working_inds))
        terms = [dcm.all_terms[i] for i in term_inds]
        embedding = embedding[working_inds]
        ind_translator = {v: k for k, v in enumerate(term_inds)}
        dcm = DocTermMatrix([[[ind_translator[j[0]], j[1]] for j in dcm.dtm[i]] for i in working_inds],
                            {ind_translator[i]: dcm.all_terms[i] for i in term_inds}, dcm.quant_name)
        print(f"Debug-Mode: Running for {len(working_inds)} Items and {len(terms)} Terms.")
        # warnings.warn("PRECOMMIT there's stuff here!")
        # assert all(i in terms for i in ['nature', 'ceiling', 'engine', 'athlete', 'seafood', 'shadows', 'skyscrapers', 'b737', 'monument', 'baby', 'sign', 'marine', 'iowa', 'field', 'buy', 'military', 'lounge', 'factory', 'road', 'education', '13thcentury', 'people', 'wait', 'travel', 'tunnel', 'treno', 'wings', 'hot', 'background', 'vintage', 'farmhouse', 'technology', 'building', 'horror', 'realestate', 'crane', 'slipway', 'ruin', 'national', 'morze'])
        # terms = ['nature', 'ceiling', 'engine', 'athlete', 'seafood', 'shadows', 'skyscrapers', 'b737', 'monument', 'baby', 'sign', 'marine', 'iowa', 'field', 'buy', 'military', 'lounge', 'factory', 'road', 'education', '13thcentury', 'people', 'wait', 'travel', 'tunnel', 'treno', 'wings', 'hot', 'background', 'vintage', 'farmhouse', 'technology', 'building', 'horror', 'realestate', 'crane', 'slipway', 'ruin', 'national', 'morze']
        # assert len([i for i in descriptions._descriptions if 'nature' in i]) == len([i for i in dcm.term_quants('nature') if i > 0])
        # print(f"Running only for the terms {terms}")
    else:
        assert all(len([i for i in descriptions._descriptions if term in i]) == len([i for i in dcm.term_quants(term) if i > 0]) for term in random.sample(terms, 5))
    if get_setting("DO_SANITYCHECKS"):
        assert all(dcm.term_quants(terms[i]) == list(dcm.as_csr()[i, :].toarray().squeeze()) for i in random.sample(range(len(terms)), 5))
    quants_s = dcm.as_csr().toarray().tolist()  # [dcm.term_quants(term) for term in tqdm(terms, desc="Counting Terms")]
    ncpu = get_ncpu(ram_per_core=10)  # TODO: make ram_per_core dependent on dataset-size
    if ncpu == 1:  # TODO Interruptible: for ncpu==1, I'm adding direct key-value-pairs, in the ncpu>1 version I'm appending to a list -> they are incompatible!
        with Interruptible(zip(terms, quants_s), ([], decision_planes, metrics), metainf, continue_from=continue_from,
                           pgbar="Creating Candidate SVMs [1 proc]", total=len(terms), name="SVMs") as iter:
            for term, quants in iter:  # in tqdm(zip(terms, quants_s), desc="Creating Candidate SVMs", total=len(terms))
                cand_mets, decision_plane, term = create_candidate_svm(embedding, term, quants, classifier=get_setting("CLASSIFIER"),
                                                                       descriptions=descriptions, quant_name=dcm.quant_name)
                metrics[term] = cand_mets
                decision_planes[term] = decision_plane
    else:
        print(f"Starting Multiprocessed with {ncpu} CPUs")
        with Interruptible(zip(terms, quants_s), [None, [], None], metainf, continue_from=continue_from, contains_mp=True, name="SVMs", total=len(quants_s)) as iter:
            with tqdm(total=iter.n_elems, desc=f"Creating Candidate SVMs [{ncpu} procs]") as pgbar, ThreadPool(ncpu, comqu=iter.comqu) as p:
                res, interrupted = p.starmap(create_candidate_svm,
                                             zip(repeat(embedding, iter.n_elems), repeat("next_0"), repeat("next_1"),
                                                 repeat(get_setting("CLASSIFIER")), repeat(False), repeat(None),
                                                 repeat(dcm.quant_name), repeat(pgbar)),
                                             draw_from=iter.iterable)
            _, res, _ = iter.notify([None, res, None], exception=interrupted)
        if interrupted is not False:
            return quants_s, res, None, metainf
        for cand_mets, decision_plane, term in res:
            metrics[term] = cand_mets
            decision_planes[term] = decision_plane
    assert set(terms) == set(metrics.keys())
    if (didnt_converge := len([1 for i in metrics.values() if i and not i["did_converge"]])):
        warnings.warn(f"{didnt_converge} of the {len(metrics)} SVMs did not converge!", sklearn.exceptions.ConvergenceWarning)
    if verbose:
        df = pd.DataFrame(metrics).T
        df.columns = df.columns.str.replace("kappa", "k").str.replace("rank2rank", "r2r").str.replace("bin2bin", "b2b").str.replace("f_one", "f1").str.replace("digitized", "dig")
        for metricname in df.columns:
            print(f"\nAverage *r*{metricname}*r*: {df[metricname].mean():.5f}")
            with pd.option_context('display.max_rows', 11, 'display.max_columns', 20, 'display.expand_frame_repr', False,
                                   'display.max_colwidth', 20, 'display.float_format', '{:.4f}'.format):
                print(str(df.sort_values(by=metricname, ascending=False)[:10]).replace(metricname, f"*r*{metricname}*r*"))
        if embedding.shape[1] == 3 and IS_INTERACTIVE:
            best_elem = max(metrics.items(), key=lambda x: (x[1] or {}).get("f_one", 0))
            create_candidate_svm(embedding, best_elem[0], dcm.term_quants(best_elem[0]), classifier=get_setting("CLASSIFIER"),
                                 quant_name=dcm.quant_name, plot_svm=True, descriptions=descriptions)
            while (another := input("Another one to display: ").strip()) != "":
                if "," in another:
                    highlight = [i.strip() for i in another.split(",")[1:]]
                    another = another.split(",")[0].strip()
                else:
                    highlight = []
                create_candidate_svm(embedding, another, dcm.term_quants(another), classifier=get_setting("CLASSIFIER"),
                                     quant_name=dcm.quant_name, plot_svm=True, descriptions=descriptions, highlight=highlight)
    return quants_s, decision_planes, metrics, metainf


class Comparer():
    def __init__(self, decision_planes, compare_fn):
        self.decision_planes = decision_planes
        self.already_compared = {}
        self.compare_fn = compare_fn
def run_lsi_gensim(pp_descriptions, filtered_dcm, verbose=False):
    """as in [VISR12: 4.2.1]"""
    # TODO options here:
    #  * if it should filter AFTER the LSI
    if verbose:
        filtered_dcm.show_info(descriptions=pp_descriptions)
    if get_setting("DCM_QUANT_MEASURE") != "binary":
        logger.warn("VISR12 say it works best with binary!")
    filtered_dcm.add_pseudo_keyworddocs()
    dictionary = corpora.Dictionary([list(filtered_dcm.all_terms.values())])
    print("Start creating the LSA-Model with MORE topics than terms...")
    lsamodel_manytopics = LsiModel(filtered_dcm.dtm, num_topics=len(filtered_dcm.all_terms) * 2, id2word=dictionary)
    print("Start creating the LSA-Model with FEWER topics than terms...")
    lsamodel_lesstopics = LsiModel(filtered_dcm.dtm, num_topics=len(filtered_dcm.all_terms) // 10, id2word=dictionary)
    print()
    import matplotlib.cm
    import matplotlib.pyplot as plt
    # TODO use the mpl_tools here as well to also save the plot!
    plt.imshow(lsamodel_lesstopics.get_topics()[:100, :200], vmin=lsamodel_lesstopics.get_topics().min(),
               vmax=lsamodel_lesstopics.get_topics().max(), cmap=matplotlib.cm.get_cmap("coolwarm"))
    plt.show()
def get_name_dict(clusters, cluster_reprs, clus_rep_algo=None):
    clus_rep_algo = clus_rep_algo or get_setting("CLUS_REP_ALGO")
    if clus_rep_algo.startswith("top"):
        topwhat = int(clus_rep_algo.split("_")[1])
        return {k: ",".join(([k] + v)[:topwhat]) for k, v in clusters.items()}
    elif clus_rep_algo in list(cluster_reprs.values())[0].keys():
        return {k: v[clus_rep_algo] for k, v in cluster_reprs.items()}
    raise NotImplementedError()
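
# A minimal usage sketch for get_name_dict. The cluster names, member terms, and the "keybert" representation
# key below are made up for illustration; they are not from the real pipeline.
def _example_get_name_dict():
    clusters = {"math": ["algebra", "calculus", "statistics"], "biology": ["genetics", "ecology"]}
    cluster_reprs = {"math": {"keybert": "mathematics"}, "biology": {"keybert": "life sciences"}}
    top3 = get_name_dict(clusters, cluster_reprs, clus_rep_algo="top_3")
    # -> {"math": "math,algebra,calculus", "biology": "biology,genetics,ecology"}
    reprs = get_name_dict(clusters, cluster_reprs, clus_rep_algo="keybert")
    # -> {"math": "mathematics", "biology": "life sciences"}
    return top3, reprs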
def create_dissim_mat(descriptions: DescriptionList, quantification_measure, verbose=False, **interrupt_kwargs):
    # Options here: get_setting("NGRAMS_IN_EMBEDDING"), get_setting("DISSIM_MAT_ONLY_PARTNERED")
    if get_setting("DEBUG"):
        descriptions._descriptions = descriptions._descriptions[:get_setting("DEBUG_N_ITEMS")]
    dtm, metainf = descriptions.generate_DocTermMatrix(min_df=2 if get_setting("DISSIM_MAT_ONLY_PARTNERED") else 1,
                                                       max_ngram=get_setting("MAX_NGRAM") if get_setting("NGRAMS_IN_EMBEDDING") else None,
                                                       do_tfidf=quantification_measure if quantification_measure in ["tfidf", "tf"] else None)
    assert any(" " in i for i in dtm.all_terms.values()) == (get_setting("NGRAMS_IN_EMBEDDING") and get_setting("MAX_NGRAM") > 1)
    quantification = dtm.apply_quant(quantification_measure, descriptions=descriptions, verbose=verbose) if not metainf.get("sklearn_tfidf") else dtm
    # this is now \textbf{v}_e with all e's as rows
    # cannot use ppmis directly, because a) too sparse, and b) we need a geometric representation with euclidean props (betweenness, parallelism, ..)
    assert all(len(set((lst := [i[0] for i in dtm]))) == len(lst) for dtm in quantification.dtm)
    dissim_mat, metainf = create_dissimilarity_matrix(quantification.as_csr(), dissim_measure=get_setting("dissim_measure"), metainf=metainf, **interrupt_kwargs)
    if metainf.get("NEWLY_INTERRUPTED"):
        return quantification, dissim_mat, metainf
    assert np.allclose(dissim_mat, dissim_mat.T)  # if so it's a correct dissimilarity-matrix and we can do squareform to compress
    metainf["is_dissim"] = np.allclose(np.diagonal(dissim_mat), 0, atol=1e-10)
    if verbose:
        show_close_descriptions(dissim_mat, descriptions)
    dissim_mat = squareform(dissim_mat, checks=True)  # saves > 50% storage space!
    return quantification, dissim_mat, metainf
    # TODO: When I calculate PPMI here, relative to all documents and all possible terms, is it relevant/unintended that
def create_mds(dissim_mat, embed_dimensions, metric=True, init_from_isomap=True):
    max_iter = 10000 if not get_setting("DEBUG") else 100
    if not init_from_isomap:
        warnings.warn("sklearn's MDS is broken!! Have to init from something, don't f*****g ask why!")
        n_inits = math.ceil((max(get_ncpu() * 2, (10 if not get_setting("DEBUG") else 3))) / get_ncpu()) * get_ncpu()
        # minimally 10, maximally ncpu*2, but in any case a multiple of ncpu
        print(f"Running {'non-' if not metric else ''}metric MDS {n_inits} times with {get_ncpu(ignore_debug=True)} jobs for max {max_iter} iterations.")
        embedding = MDS(n_components=embed_dimensions, dissimilarity="precomputed", metric=metric,
                        # TODO with metric=True it always breaks after the second step if n_components >> 2 (well, with metric=False as well^^)
                        n_jobs=get_ncpu(ignore_debug=True), verbose=1 if get_setting("VERBOSE") else 0, n_init=n_inits, max_iter=max_iter)
        mds = embedding.fit(dissim_mat)
    else:
        print(f"Running {'non-' if not metric else ''}metric MDS with {get_ncpu(ignore_debug=True)} jobs for max {max_iter} iterations, initialized from Isomap-Embeddings")
        embedding = MDS(n_components=embed_dimensions, dissimilarity="precomputed", metric=metric,
                        n_jobs=get_ncpu(ignore_debug=True), verbose=1 if get_setting("VERBOSE") else 0, n_init=1, max_iter=max_iter)
        try:
            isomap_init = create_isomap(dissim_mat, embed_dimensions, neighbor_factor=25).embedding_
        except ValueError:  # "There are significant negative eigenvalues..."
            isomap_init = np.random.random((len(dissim_mat), embed_dimensions)) * 0.01
        mds = embedding.fit(dissim_mat, init=isomap_init)
    return mds
def run_preprocessing_funcs(descriptions: DescriptionList, components, word_tokenizer=None):
    # components: sent_tokenize=True, lemmatize=True, remove_stopwords=True, convert_lower=True, remove_diacritics=True, remove_punctuation=True
    # TODO use TreeTagger? https://textmining.wp.hs-hannover.de/Preprocessing.html#Alternative:-Treetagger
    # https://textmining.wp.hs-hannover.de/Preprocessing.html#Satzerkennung-und-Tokenization
    assert components.convert_lower, "Stopwords are lower-case so not converting is not allowed (bad for german...!)"
    if components.remove_htmltags:
        descriptions.process_all(lambda data: re.compile(r'<.*?>').sub('', data), "remove_htmltags")
    if components.sent_tokenize:
        if not get_setting("USE_STANZA"):
            descriptions.process_all(nltk_sent_tokenize, "sent_tokenize",
                                     indiv_kwargs=dict(language=lambda desc: NLTK_LAN_TRANSLATOR[desc.lang]))
            # nltk suuucks!! sent_tokenize(*, language=german) even splits "...am Ende des 2. Semesters", or, even worse,
            # "Relevante Probleme wie z.B. Lautierungsregeln", but if there's no space after a dot it DOESN'T split obvious sentences!
            # Very visible in the description "!! FÄLLT AB 15.11. AUS !! Lektürekurs Spanisch I (Gruppe A und B)."  # TODO!
            # TODO maybe write a small rule-based post-processing that handles common cases like:
            #  * "z.B." is not split
            #  * "\d+\. Nomen" (e.g. "2. Semester") is not split
            #  * things like "15.11." (i.e. dates) are not split, ...
        else:
            logging.getLogger('stanza').setLevel(logging.ERROR)
            import stanza
            if len(descriptions.languages) > 1:
                raise NotImplementedError()
            nlp = stanza.Pipeline(lang='de', processors='tokenize')
            fn = lambda txt: [i._text for i in nlp(txt).sentences]
            descriptions.process_all(fn, "sent_tokenize", pgbar="Stanza Sentence-Tokenizing")
    if components.convert_lower:
        convert_lower_all(descriptions)
    # tokenization will happen anyway!
    if not components.lemmatize:
        word_tokenize_all(descriptions, word_tokenizer=word_tokenizer, remove_stopwords=components.remove_stopwords)
    else:
        word_tokenize_all(descriptions, word_tokenizer=word_tokenizer, remove_stopwords=False)
        lemmatize_all(descriptions, components.convert_lower, components.remove_punctuation)
        if components.remove_stopwords:
            descriptions.process_all(lambda txt, stopwords: [[lemma for lemma in sent if lemma not in stopwords] for sent in txt],
                                     "remove_stopwords", indiv_kwargs=dict(stopwords=lambda desc: get_stopwords(desc.lang)))
    if components.remove_diacritics:
        remove_diacritics_all(descriptions)
    if components.remove_punctuation:
        remove_punctuation_all(descriptions)
    return descriptions
def _filter_step2(dtm, used_terms_set, verbose=False, descriptions=None):
    all_terms_new = dict(enumerate([v for k, v in dtm.all_terms.items() if k in used_terms_set]))
    all_terms_new_rev = {v: k for k, v in all_terms_new.items()}
    dtm_translator = {k: all_terms_new_rev[v] for k, v in dtm.all_terms.items() if k in used_terms_set}
    doc_term_matrix = [[[dtm_translator.get(ind), num] for ind, num in doc if ind in used_terms_set] for doc in dtm.dtm]
    if descriptions:
        if get_setting("DO_SANITYCHECKS"):
            expected_bows = {ndoc: {all_terms_new[elem]: count for elem, count in doc} for ndoc, doc in enumerate(doc_term_matrix[:10])}
            assert all(all(v == descriptions._descriptions[i].bow()[k] for k, v in expected_bows[i].items() if not " " in k) for i in range(10))
            assert all(all(v == descriptions._descriptions[i].count_phrase(k) for k, v in expected_bows[i].items() if not " " in k) for i in range(10))
            assert all(all_terms_new[ind] in descriptions._descriptions[ndoc]
                       for ndoc, doc in enumerate(tqdm(doc_term_matrix, desc="Cross-checking filtered DCM with Descriptions [sanity-check]"))
                       for ind, count in doc)
        if verbose:
            shown = []
            for n_keyphrases in [0, 1, 20]:
                items = [[descriptions._descriptions[i], [all_terms_new[j[0]] for j in e]] for i, e in enumerate(doc_term_matrix) if len(e) <= n_keyphrases]
                if items:
                    print(f"Documents with max {n_keyphrases} keyphrases ({len(items)}):\n "
                          + "\n ".join(f"{i[0]}: {', '.join(i[1])}" for i in [j for j in items if j[0] not in shown][:5]))
                    shown += [i[0] for i in items]
    return DocTermMatrix(dtm=doc_term_matrix, all_terms=all_terms_new, quant_name="count", verbose=verbose)
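
# A toy re-derivation (not part of the pipeline) of the index-remapping that _filter_step2 performs: terms that
# survive the filter get new, dense indices, and every [term_index, count] pair in the doc-term matrix is rewritten
# accordingly. All data below is hypothetical.
def _example_filter_remap():
    all_terms = {0: "math", 1: "the", 2: "biology"}          # old index -> term
    dtm = [[[0, 2], [1, 5]], [[1, 3], [2, 1]]]               # per document: [old term index, count]
    used_terms = {0, 2}                                       # "the" was filtered out
    all_terms_new = dict(enumerate(v for k, v in all_terms.items() if k in used_terms))          # {0: "math", 1: "biology"}
    translator = {old: new for new, old in enumerate(k for k in all_terms if k in used_terms)}   # {0: 0, 2: 1}
    new_dtm = [[[translator[ind], num] for ind, num in doc if ind in used_terms] for doc in dtm]
    return all_terms_new, new_dtm   # ({0: "math", 1: "biology"}, [[[0, 2]], [[1, 1]]])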
def extract_coursetype(desc, coursetypes=None):
    raise NotImplementedError("Hard TODO: move this to dataset_spefics.siddata")
    coursetypes = coursetypes or get_setting("COURSE_TYPES")
    for type in coursetypes:
        if any(i in desc.unprocessed_text.lower() for i in [f"this {type}"]):
            return type
    counts = {i: desc.bow().get(i, 0) for i in coursetypes}
    if any(i > 0 for i in counts.values()):
        return max(counts.items(), key=lambda x: x[1])[0]
    counts = {i: desc.unprocessed_text.lower().count(i) for i in coursetypes}
    if any(i > 0 for i in counts.values()):
        return max(counts.items(), key=lambda x: x[1])[0]
    return None
def join_clusters_reclassify(clusters, dcm, embedding, verbose=False):
    if hasattr(embedding, "embedding_"):
        embedding = embedding.embedding_
    all_cand_mets = {}
    cluster_directions = {}
    for k, v in tqdm(clusters.items(), desc="Reclassifying Clusters"):
        embed = embedding
        dtm = DocTermMatrix.submat_forterms(dcm, [k] + v)
        combined_quants = dtm.as_csr().toarray().sum(axis=0)
        if any(i < get_setting("CANDIDATE_MIN_TERM_COUNT") or i > dtm.n_docs - get_setting("CANDIDATE_MIN_TERM_COUNT")
               for i in Counter(np.array(combined_quants, dtype=bool)).values()):
            # TODO have an option for doing this GENERALLY for the SVMs (and plot in 3D)
            c0_inds = np.where(combined_quants <= np.percentile(combined_quants, 30))[0]
            c1_inds = np.where(combined_quants >= np.percentile(combined_quants, 70))[0]
            used_inds = sorted(list(set(c0_inds) | set(c1_inds)))
            embed = embedding[used_inds]
            if verbose:
                print(f"For cluster {k}, the distribution is {dict(Counter(np.array(combined_quants, dtype=bool)))}, so we'll take the most distinct {get_setting('MOST_DISTINCT_PERCENT')}% ({len(c0_inds)} entities per class)")
            combined_quants = [combined_quants[i] if i in c1_inds else 0 for i in used_inds]
        cand_mets, decision_plane, _ = create_candidate_svm(embed, f"cluster:{k}", combined_quants, get_setting("CLASSIFIER"), quant_name=dtm.quant_name)
        all_cand_mets[k] = cand_mets
        cluster_directions[k] = decision_plane
    if verbose:
        print(f"Scores for {get_setting('CLASSIFIER_SUCCMETRIC')} per cluster:",
              ", ".join(f"{k}: {v[get_setting('CLASSIFIER_SUCCMETRIC')]:.2f}" for k, v in all_cand_mets.items()))
    return cluster_directions
def show_close_descriptions(dissim_mat, descriptions, is_embedding=False, num=10, title="Dissim-Mat"):
    # closest_entries = list(zip(*np.where(dissim_mat==min(dissim_mat[dissim_mat>0]))))
    # closest_entries = set(tuple(sorted(i)) for i in closest_entries)
    # print(f"Closest Nonequal Descriptions: \n", "\n".join(["*b*"+("*b* & *b*".join([descriptions._descriptions[i].title for i in j]))+"*b*" for j in closest_entries]))
    print(f"Closest {num} Descriptions in {title}:")
    if is_embedding:
        dissim_mat = _create_dissim_mat(dissim_mat, get_setting("DISSIM_MEASURE"), force_singlethread=len(dissim_mat) < 500, silent=len(dissim_mat) < 500)[0]
    is_dissim = np.allclose(np.diagonal(dissim_mat), 0, atol=1e-10)
    assert is_dissim, "TODO now it's a similarity matrix"
    min_vals = sorted(squareform(dissim_mat))[:num]
    min_indices = np.where(np.isin(dissim_mat, min_vals))
    min_indices = [(i, j) for i, j in zip(*min_indices) if i != j]
    min_indices = list({j: None for j in [tuple(sorted(i)) for i in min_indices]}.keys())  # remove duplicates ("aircraft cabin and airplane cabin" and "airplane cabin and aircraft cabin")
    for first, second in min_indices[:num]:
        print(f"  *b*{descriptions._descriptions[first].title}*b* and *b*{descriptions._descriptions[second].title}*b*")
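
# A self-contained sketch (toy data, not from the pipeline) of the pair-extraction idea used in
# show_close_descriptions: take the smallest values of the condensed upper triangle, then map them back
# to deduplicated (i, j) index pairs of the symmetric dissimilarity matrix.
def _example_closest_pairs():
    import numpy as np
    from scipy.spatial.distance import squareform
    d = np.array([[0.0, 0.2, 0.9],
                  [0.2, 0.0, 0.5],
                  [0.9, 0.5, 0.0]])
    condensed = squareform(d)             # upper triangle as a flat vector: [0.2, 0.9, 0.5]
    smallest = sorted(condensed)[:2]      # the two smallest pairwise dissimilarities
    pairs = {tuple(sorted((i, j))) for i, j in zip(*np.where(np.isin(d, smallest))) if i != j}
    return sorted(pairs, key=lambda p: d[p])   # [(0, 1), (1, 2)]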
def create_embedding(dissim_mat, embed_dimensions, embed_algo, verbose=False, pp_descriptions=None):
    dtm, dissim_mat = dissim_mat
    if get_setting("DEBUG"):
        dissim_mat = dissim_mat[:get_setting("DEBUG_N_ITEMS"), :get_setting("DEBUG_N_ITEMS")]
    is_dissim = np.allclose(np.diagonal(dissim_mat), 0, atol=1e-10)
    if not is_dissim:
        print("Seems like you had a similarity matrix, not a dissimilarity matrix! Fixing it.")
        assert np.allclose(np.diagonal(dissim_mat), 1, atol=1e-10)
        assert dissim_mat.min() >= 0 and dissim_mat.max() <= 1
        dissim_mat = 1 - dissim_mat
    if embed_algo == "mds":
        embed = create_mds(dissim_mat, embed_dimensions)
    elif embed_algo == "tsne":
        embed = create_tsne(dissim_mat, embed_dimensions)
    elif embed_algo == "isomap":
        embed = create_isomap(dissim_mat, embed_dimensions)
    else:
        raise NotImplementedError(f"Algorithm {embed_algo} is not implemented!")
    if verbose and pp_descriptions is not None:
        show_close_descriptions(embed.embedding_, pp_descriptions, is_embedding=True, num=10, title=f"Embedding-Distances ({get_setting('DISSIM_MEASURE')})")
    if hasattr(embed, "dissimilarity_matrix_") and np.allclose(embed.dissimilarity_matrix_, dissim_mat):
        print("Dropping the dissim-mat from the embedding - it only bloats and is the same as in the previous step.")
        embed.dissimilarity_matrix_ = None
    return embed
def get_stopwords(language, include_desc15_stopwords=True, include_custom=True, include_withoutdiacritics=True):
    if language in NLTK_LAN_TRANSLATOR:
        language = NLTK_LAN_TRANSLATOR[language]
    assert language in NLTK_LAN_TRANSLATOR.values(), f"Cannot deal with language {language}"
    stopwords = set(nlstopwords.words(language))
    if include_desc15_stopwords and language == "english":
        stopwords |= load_desc15_stopwords()
    if include_custom and language == "english":
        stopwords |= set(get_setting("CUSTOM_STOPWORDS"))
    if include_withoutdiacritics:
        stopwords |= set(strip_accents_unicode(i) for i in stopwords)
    return tuple(stopwords)
def select_salient_terms(metrics, decision_planes, dcm, embedding, prim_lambda, sec_lambda, metricname, verbose=False):
    # TODO waitwaitwait. Am I 100% sure that the intercepts of the decision_planes are irrelevant?!
    # TODO what about those with a high negative kappa? Just take the absolute value and consider them (ALREADY IN THE PREVIOUS STEP IF SO)
    print(f"Calculated Metrics: {list(list(metrics.values())[0].keys())}")
    print(f"Lambda1: {prim_lambda}, Lambda2: {sec_lambda}, compareto-metric: {metricname}")
    metrics = sorted(list({k: v[metricname] for k, v in metrics.items()}.items()), key=lambda x: x[1], reverse=True)
    get_tlambda = lambda metrics, lamb: [i[0] for i in metrics if i[1] >= prim_lambda]
    get_tlambda2 = lambda metrics, lamb1objs, seclamb: [i[0] for i in metrics if i[1] >= sec_lambda and i[0] not in lamb1objs]
    candidates = get_tlambda(metrics, prim_lambda)
    salient_directions = [metrics[0][0], ]
    n_terms = min(len(candidates), get_setting("NDIMS_NCANDS_FACTOR") * len(decision_planes[salient_directions[0]].coef))  # 2 in [DESC15]
    if get_setting("DEBUG"):
        n_terms = min(n_terms, 15)
    comparer = Comparer(decision_planes, vec_cos)
    # DESC15: "as the ith term, we select the term t minimising max_{j<i}cos(v_t_j, v_t)" - in other words, we repeatedly select
    # the term which is least similar to the terms that have already been selected
    for nterm in tqdm(range(1, n_terms), desc="Finding Salient Directions"):
        cands = set(candidates) - set(salient_directions)
        compares = {cand: min(comparer(cand, compareto) for compareto in salient_directions) for cand in cands}
        # vec_cos(decision_planes[next(iter(cands))].normal, decision_planes[salient_directions[0]].normal)
        salient_directions.append(max(compares.items(), key=lambda x: x[1])[0])
    print(f"Found {len(salient_directions)} salient directions: {', '.join(salient_directions)}")
    compare_vecs = [decision_planes[term].normal for term in salient_directions]
    clusters = {term: [] for term in salient_directions}
    # TODO optionally instead do the cluster-assignment with k-means!
    nongreats = get_tlambda2(metrics, salient_directions, sec_lambda)
    if get_setting("DEBUG"):
        nongreats = nongreats[:2000]
    for term in tqdm(nongreats, desc="Associating the rest to Clusters"):
        # "we then associate with each term d_i a Cluster C_i containing all terms from T^{0.1} which are more similar to d_i than to any of the
        #  other directions d_j." TODO: experiment with thresholds, if it's extremely unsimilar to ALL just effing discard it!
        clusters[salient_directions[np.argmin([vec_cos(decision_planes[term].normal, vec2) for vec2 in compare_vecs])]].append(term)
        # TODO maybe have a smart weighting function that takes into account the kappa-score of the term and/or the closeness to the
        #  original clustercenter (to threshold which cluster they are added to)
    # TODO an option here to either take the mean, or only the main one, or smartly-weighted (I think DESC15 did only the main one)
    if get_setting("CLUSTER_DIRECTION_ALGO") == "mean":
        cluster_directions = join_clusters_average(clusters, decision_planes)
    elif get_setting("CLUSTER_DIRECTION_ALGO") == "main":
        cluster_directions = {term: decision_planes[term] for term in clusters.keys()}
    elif get_setting("CLUSTER_DIRECTION_ALGO") == "reclassify":
        cluster_directions = join_clusters_reclassify(clusters, dcm, embedding, verbose=verbose)
    else:
        raise NotImplementedError("TODO: weighted and others")
        # missing: weighted-by-kappa-averaged, weighted-by-distance-to-center-averaged (cosine, cosine+coef)
    # regarding the mean-algorithm: taking the mean of the respective orthogonals seems reasonable, it's the mean direction. However we also care
    # about the actual position of the hyperplane (to get the actual ranking-wrt-this-feature), which is specified by orthogonal+intercept... and simply
    # averaging the intercepts of its clustercomponents seems really stupid. That however gives us another way to weight which-candidates-may-cluster:
    # the closer the orthogonals (cosine-dist) AND the closer their intercepts, the more we want to have them in a cluster.
    return clusters, cluster_directions
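
# A simplified, self-contained sketch of the greedy selection quoted from DESC15 in the comment above: start from
# the best-scoring direction, then repeatedly add the candidate whose maximum cosine similarity to the already
# selected directions is smallest. The toy vectors below are purely illustrative and not from the pipeline.
def _example_salient_direction_selection():
    import numpy as np
    cos = lambda a, b: float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
    directions = {"math": np.array([1.0, 0.0]), "algebra": np.array([0.9, 0.1]), "sports": np.array([0.0, 1.0])}
    selected = ["math"]                           # the best-scoring candidate comes first
    candidates = set(directions) - set(selected)
    while candidates:
        # for each candidate: how similar is it, at worst, to anything we already picked?
        worst_sim = {c: max(cos(directions[c], directions[s]) for s in selected) for c in candidates}
        best = min(worst_sim, key=worst_sim.get)  # pick the one least similar to everything selected so far
        selected.append(best)
        candidates.remove(best)
    return selected   # ['math', 'sports', 'algebra']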
def show_info(self, descriptions=None):
    occurs_in = [set(j[0] for j in i) if i else set() for i in self.dtm]
    num_occurences = [sum([term_ind in i for i in occurs_in]) for term_ind in tqdm(range(len(self.all_terms)), desc="Counting Occurences [verbose]")]
    show_hist(num_occurences, f"Docs per Keyword ({self.n_docs} docs, {len(self.all_terms)} terms)",
              xlabel="# Documents the Keyword appears in", ylabel="Count (log scale)", cutoff_percentile=98, log=True)
    above_threshold = len([i for i in num_occurences if i >= get_setting("CANDIDATE_MIN_TERM_COUNT", silent=True)])
    sorted_canditerms = sorted([[ind, elem] for ind, elem in enumerate(num_occurences)], key=lambda x: x[1], reverse=True)
    print(f"Found {len(self.all_terms)} candidate Terms, {above_threshold} ({round(above_threshold/len(self.all_terms)*100)}%) of which occur in at least {get_setting('CANDIDATE_MIN_TERM_COUNT', silent=True)} descriptions.")
    print("The 25 terms that occur in the most descriptions (incl the #descriptions they occur in):",
          ", ".join([f"{self.all_terms[ind]} ({occs})" for ind, occs in sorted_canditerms[:25]]))
    if descriptions is not None:
        max_ind = np.unravel_index(self.as_csr().argmax(), self.as_csr().shape)
        print(f"Max value: Term *b*{self.all_terms[max_ind[0]]}*b* has value *b*{dict(self.dtm[max_ind[1]])[max_ind[0]]:.3f}*b* for doc *b*{descriptions._descriptions[max_ind[1]].title}*b*")
def get_countvec(pp_components, max_ngram, language, min_df=1):
    if isinstance(pp_components, str):
        pp_components = PPComponents.from_str(pp_components)
    if pp_components.remove_stopwords and get_setting("TRANSLATE_POLICY") == "origlang":
        raise NotImplementedError("Cannot deal with per-language-stopwords when using sklearn's CountVectorizer!")
    cnt = CountVectorizer(strip_accents="unicode" if pp_components.remove_diacritics else None,
                          lowercase=pp_components.convert_lower,
                          ngram_range=(1, max_ngram),
                          min_df=min_df,  # if 2, every term has a "partner", making the dissimilarity-matrix more compact
                          stop_words=get_stopwords(language) if pp_components.remove_stopwords else None,
                          # TODO see https://scikit-learn.org/stable/modules/feature_extraction.html#stop-words
                          )
    # I cannot set min_df and max_df, as I need all words for the dissimilarity-matrix!
    # TODO when I set a preprocessor here I can override the preprocessing (strip_accents and lowercase) stage while preserving the tokenizing and n-gram generation steps
    # TODO check how many steps this already saves me - keyword extraction, basis for the dissim_matrix, ...? (Corollary: check which min_df/max_df-ranges are sensible for the dissim_matrix)
    # TODO I can merge this and the old one: if the PPComponents-Entry is uppercase, use a subcomponent of the CountVectorizer instead of the original one
    #  (it's both tokenization and occurrence counting in one class, see https://scikit-learn.org/stable/modules/feature_extraction.html#common-vectorizer-usage)
    return cnt
def postprocess_candidateterms(candidate_terms, descriptions, extraction_method):
    """
    In this method I'll try to fix candidate-terms and check if they are really in the descriptions they claim to be in.
    To count the descriptions they are in, I'll both generate a new doc-term-matrix with the respective ngram AND check if it's in the
    literal text of the description, such that after this, I can safely forget the original descriptions and focus on DTMs.
    """
    if get_setting("DEBUG"):
        maxlen = min(len(candidate_terms), len(descriptions._descriptions), get_setting("DEBUG_N_ITEMS"))
        descriptions._descriptions = descriptions._descriptions[:maxlen]
        candidate_terms = candidate_terms[:maxlen]
    assert len(candidate_terms) == len(descriptions), f"Candidate Terms: {len(candidate_terms)}, Descriptions: {len(descriptions)}"
    flattened = set(flatten(candidate_terms))
    print("Extracted Unique Terms: ",
          ", ".join([f"{k+1}-grams: {v}" for k, v in sorted(Counter([i.count(" ") for i in flattened]).items(), key=lambda x: x[0])]),
          "| sum:", len(flattened))
    print("Most often extracted Terms:",
          ", ".join(f"{i[0]} ({i[1]} times)" for i in sorted(list(Counter(flatten(candidate_terms)).items()), key=lambda x: x[1], reverse=True)[:5]))
    max_found_ngram = max(i.count(" ") for i in flatten(candidate_terms)) + 1
    dtm = descriptions.generate_DocTermMatrix(min_df=1, max_ngram=max_found_ngram)[0]  # TODO check if this works for all parameter-combis
    postprocessed_candidates = [[] for _ in candidate_terms]
    fails, changeds, toolong, ignores = set(), set(), set(), set()
    if extraction_method == "keybert":
        from derive_conceptualspace.create_spaces.preprocess_descriptions import PPComponents, get_countvec
        assert PPComponents.from_str(descriptions.recover_settings["pp_components"]).use_skcountvec
        # this is my try to reproduce the preprocessing for the terms from keybert (as it said in some TODO somewhere) - TODO do the non-skcountvec-method as well!!
        cnt = get_countvec(**descriptions.recover_settings, max_ngram=1, min_df=1)
        processor = lambda cand: " ".join(cnt.build_analyzer()(cand))
        try_edit_fns = (processor, strip_accents_unicode, fix_cand, lambda x: x.lower())
        # all PERMUTATIONS of these will be tried, that's a combinatorial explosion!
    else:
        try_edit_fns = ()
    all_edit_fns = flatten([list(permutations(try_edit_fns, i + 1)) for i in range(len(try_edit_fns))])
    for desc_ind, desc in enumerate(tqdm(descriptions._descriptions, desc="Checking extracted candidates per-description")):
        term_counts = {dtm.all_terms[ind]: count for ind, count in dtm.dtm[desc_ind]}
        for cand in candidate_terms[desc_ind]:
            if cand.count(" ") + 1 > (get_setting("MAX_NGRAM") or 1):
                toolong.add(cand)
                continue
            if "xxMA_SENTBORDERxx" in cand:
                ignores.add(cand)
                continue
            cond, ncand = check_cand(cand, desc, edit_fns=all_edit_fns)
            if cond:
                if not extracted_literally():
                    assert term_counts[ncand] == desc.count_phrase(ncand)  # !! this shows that the DTM contains exactly the bow !!
                if cand != ncand:
                    changeds.add((cand, ncand))
                postprocessed_candidates[desc_ind].append(ncand)
            else:
                fails.add(cand)
    if extracted_literally():
        assert not changeds and not toolong
    # changeds are for example when extract_coursetype extracted "seminar" from a description because it says "hauptseminar".
    # we can use that to make a mapping saying that a description containing the latter is defined to count as a positive sample for the former.
    changeds_dict = {k: [] for k, vs in changeds}
    for k, v in changeds:
        changeds_dict[k].append(v)
    for desc_ind, desc in enumerate(tqdm(descriptions._descriptions,
                                         desc="Checking a second time " + ("(quickly)" if descriptions.proc_steps == ["bow"] else "(slowly)"))):
        desc_txt = desc.processed_as_string(allow_shorten=True)
        desc_dtm = set(i[0] for i in dtm.dtm[desc_ind])
        for cand in postprocessed_candidates[desc_ind]:
            assert cand in desc
            if not descriptions.proc_steps == ["bow"]:  # superfluous check if it was created solely from the bow
                assert cand in desc_txt
                assert dtm.reverse_term_dict[cand] in desc_dtm
    if toolong:
        print(f"Had to drop {len(toolong)} out of {len(flatten(candidate_terms))} (non-unique) candidates because they were too long.")
    if ignores:
        print(f"Had to drop {len(ignores)} out of {len(flatten(candidate_terms))} (non-unique) candidates because they were across sentence borders.")
    print(f"Had to drop {len(fails)} out of {len(flatten(candidate_terms))} (non-unique) candidates" + (f" and edit {len(changeds)}." if changeds else "."))
    print("Postprocessed Unique Terms: ",
          ", ".join([f"{k+1}-grams: {v}" for k, v in sorted(Counter([i.count(" ") for i in set(flatten(postprocessed_candidates))]).items(), key=lambda x: x[0])]),
          "| sum:", len(set(flatten(postprocessed_candidates))))
    return postprocessed_candidates, changeds_dict
def extracted_literally():
    # some extraction-methods did extract literally, in which case I want to assert that no changes need to be done.
    return get_setting("EXTRACTION_METHOD") not in ["pp_keybert", "keybert"]
def translate_text(text, target="en", charlim=490000, origlans=None):  # and I can still use the data from THIS call!!
    """Translates text into the target language.

    Target must be an ISO 639-1 language code.
    See https://g.co/cloud/translate/v2/translate-reference#supported_languages
    Text can also be a sequence of strings, in which case this method will return a sequence of results for each text.
    """
    if not os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = get_setting("GOOGLE_CREDENTIALS_FILE")
    print(f"Translate-Charlim set to {charlim}")
    BYTELIM = int(204800 * 0.9)  # if a request is bigger than this, the google API will raise an error!
    SEGLIM = 128  # https://github.com/googleapis/google-cloud-python/issues/5425#issuecomment-562745220
    TEXT_LEN_LIM = 2800  # google, this is getting ridiculous.
    SUMMED_TEXT_LEN_LIM = 100000  # 102423 was too long...
    if isinstance(text, six.binary_type):
        text = text.decode("utf-8")
    translate_client = translate.Client()
    origtext = copy.deepcopy(text)
    if isinstance(text, (list, set, tuple)):
        accumulated_chars = list(accumulate([len(i) for i in text]))
        if accumulated_chars[-1] >= charlim:
            limit = [i > charlim for i in accumulated_chars].index(True)
            print(f"Have to drop {len(accumulated_chars)-limit} of {len(accumulated_chars)} elements!")
            if accumulated_chars[0] <= charlim:
                text = text[:limit - 1 if limit >= 0 else None]
            else:
                print("Limit already reached!")
                return
    prelimtext = copy.deepcopy(text)
    res = []
    rest = []
    while True:
        while len("|".join(text).encode('utf-8')) > BYTELIM or len(text) > SEGLIM:
            rest.insert(0, text[-1])
            text = text[:-1]
        if any([len(i) > TEXT_LEN_LIM for i in text]):
            print("F**k you google.")
            flatten = lambda l: [item for sublist in l for item in sublist]
            # splitfn = lambda txt, maxlen: [txt[i:i+maxlen] for i in range(0, len(txt)+maxlen, maxlen) if txt[i:i+maxlen]]
            # split_text = [i.split(". ") if len(i) > TEXT_LEN_LIM else [i] for i in text]
            split_text = [nltk.sent_tokenize(i) if len(i) > TEXT_LEN_LIM else [i] for i in text]  # sent_tokenize(x, language=origlans[n]) but whatever
            assert all(len(i) <= TEXT_LEN_LIM for i in split_text)
            longer_index = {ind: len(elem) - 1 for ind, elem in enumerate(split_text) if len(elem) > 1}
            # now we merge the split sentences until they are all at most text-len-lim long
            for ind in longer_index.keys():
                lens = [len(i) for i in split_text[ind]]
                index_mapper = {0: 0}  # startindex: nwords
                indexmappernum = 0
                for num, elem in enumerate(lens):
                    assert elem <= TEXT_LEN_LIM, "one sentence is already too long."
                    if index_mapper[indexmappernum] + elem >= TEXT_LEN_LIM:
                        indexmappernum = num
                        index_mapper[indexmappernum] = 0
                    index_mapper[indexmappernum] += elem
                indices = list(index_mapper.keys()) + [len(split_text[ind]) + 1]
                indices = [(indices[i], indices[i + 1]) for i in range(len(indices) - 1)]
                split_text[ind] = ["".join(split_text[ind][i1:i2]) for i1, i2 in indices]
                longer_index[ind] = len(split_text[ind]) - 1
            text = [i[0] if isinstance(i, list) else i for i in split_text]
            latterparts = flatten([i[1:] for i in split_text if isinstance(i, list) and len(i) > 1])
            assert len(latterparts) <= SEGLIM, "f**k this."
            assert all(len(i) < TEXT_LEN_LIM for i in text)
            assert all(len(i) < TEXT_LEN_LIM for i in latterparts)
            assert sum([len(i) for i in text]) <= SUMMED_TEXT_LEN_LIM, "geez google what the actual f**k"
            assert sum([len(i) for i in latterparts]) <= SUMMED_TEXT_LEN_LIM, "geez google what the actual f**k"
            try:
                translations = translate_client.translate(text, target_language=target)
                translations2 = translate_client.translate(latterparts, target_language=target)
            except:
                failed = True
            else:
                failed = False
                assert sum(longer_index.values()) == len(translations2)
                latterparts_iter = iter(translations2)
                for index, ntranslations in longer_index.items():
                    for i in range(ntranslations):
                        translations[index]["translatedText"] += next(latterparts_iter)["translatedText"]
                translated = translations
        else:
            assert all(len(i) < TEXT_LEN_LIM for i in text)
            try:
                translated = translate_client.translate(text, target_language=target)
            except:
                failed = True
            else:
                failed = False
        if not failed:
            assert len(translated) == len(text)
            res.extend(translated)
            assert len(res) + len(rest) == len(prelimtext)
            if rest:
                text = rest
                rest = []
            else:
                assert len(prelimtext) == len(res)
                break
        else:
            break
    # print(u"Text: {}".format(result["input"]))
    # print(u"Translation: {}".format(result["translatedText"]))
    # print(u"Detected source language: {}".format(result["detectedSourceLanguage"]))
    return [html.unescape(i["translatedText"]) for i in res]
def dtm_loader(doc_term_matrix):
    dtm = DocTermMatrix.fromstruct(doc_term_matrix[1][1])
    if get_setting("DEBUG"):
        if len(dtm.dtm) > get_setting("DEBUG_N_ITEMS"):
            warnings.warn("len(dtm) > DEBUG_N_ITEMS!!")
    return dtm
def classify(input, target, axnames, catnames, dt_depth, test_percentage_crossval, metric, do_plot=False,
             features_outvar=None, balance_classes=True, do_render=False, shuffle=False):
    # input[:, 99] = (target == "Shops&Services"); axnames[99] = "is_shop"
    # input[:, 98] = (target == "Food"); axnames[98] = "is_food"
    metric = "accuracy" if metric == "acc" else metric  # https://scikit-learn.org/stable/modules/model_evaluation.html#common-cases-predefined-values
    kwargs = dict(class_weight="balanced") if balance_classes else {}
    clf = DecisionTreeClassifier(random_state=get_setting("RANDOM_SEED"), max_depth=dt_depth, **kwargs)
    if test_percentage_crossval > 1:
        if metric == "f1" and len(catnames) > 2:
            metric = "f1_micro"
        cv = test_percentage_crossval if not shuffle else StratifiedKFold(n_splits=test_percentage_crossval, shuffle=True)
        # see the "cv" parameter of cross_val_score at https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html
        if isinstance(metric, (list, tuple)):
            if len(catnames) > 2:
                metric = [i if i != "f1" else "f1_macro" for i in metric]  # f1_macro would be = accuracy
            return None, None, cross_validate(clf, input, target, cv=cv, scoring=metric)
        else:
            scores = cross_val_score(clf, input, target, cv=cv, scoring=metric)
            score = scores.mean()
        clf.fit(input, target)
        if metric == "accuracy":
            assert get_score(clf, input, target, metric) == np.array([res == target[i] for i, res in enumerate(clf.predict(input))]).mean()
        else:
            get_score(clf, input, target, metric, is_multiclass=len(catnames) > 2)  # have to to be able to plot_tree
        # print(f"Doing {test_percentage_crossval}-fold cross-validation. Best Score: {scores.max():.2f}, Mean: {score}:.2f")
    elif test_percentage_crossval == 0:
        warnings.warn("Using the full data as training set without a test-set!")
        clf.fit(input, target)
        score = scores = get_score(clf, input, target, metric, is_multiclass=len(catnames) > 2)
    else:
        X_train, X_test, y_train, y_test = train_test_split(input, target, test_size=test_percentage_crossval)
        # TODO: stratify? https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
        clf.fit(X_train, y_train)
        score = scores = get_score(clf, X_test, y_test, metric, is_multiclass=len(catnames) > 2)
    if features_outvar is not None:
        features_outvar.append(clf)
    if do_plot:
        if catnames:
            assert len(clf.classes_) == len(catnames)
        return score, plot_tree(clf, axnames, (catnames or [str(i) for i in clf.classes_]), do_render=do_render), scores
    return score, None, scores
def generate_DocTermMatrix(self, min_df=1, max_ngram=None, do_tfidf=None):
    if self.proc_steps[-1] == "bow":
        assert max_ngram in [None, 1], "Can't do!"
        print("Preprocessing already produced a bag-of-words. Config `max_ngram` is useless!")
        forbid_setting("max_ngram")
        all_words = dict(enumerate(set(flatten(i.bow().keys() for i in self._descriptions))))
        rev = {v: k for k, v in all_words.items()}
        dtm = [[[rev[k], v] for k, v in i.bow().items()] for i in self._descriptions]
        dtm = DocTermMatrix(dtm=dtm, all_terms=all_words, quant_name="count")
        if min_df > 1:
            dtm = DocTermMatrix.filter(dtm, min_df, use_n_docs_count=get_setting("CANDS_USE_NDOCS_COUNT"), verbose=get_setting("VERBOSE"), descriptions=self)
        return dtm, {"ngrams_in_embedding": False}
    elif hasattr(self, "recover_settings"):
        from derive_conceptualspace.create_spaces.preprocess_descriptions import PPComponents, get_countvec
        pp_comps = PPComponents.from_str(self.recover_settings["pp_components"])
        if pp_comps.use_skcountvec:
            cnt = get_countvec(**self.recover_settings, max_ngram=(max_ngram or 1), min_df=min_df)
            fit_base = lambda: self.unprocessed_texts(remove_htmltags=pp_comps.remove_htmltags)
        else:
            raise NotImplementedError()
    else:
        cnt = CountVectorizer(strip_accents=None, lowercase=False, stop_words=None, ngram_range=(1, (max_ngram or 1)), min_df=min_df,
                              token_pattern=r"(?u)(\b\w\w+\b|\b\d\b)")  # this token_pattern allows for single-digit numbers
        fit_base = lambda: self.iter("processed_as_string", insert_sentborder=("sent_tokenize" in self.proc_steps))
        # if we sent_tokenize, we have something like "2. Semester" in the original, which due to nltk's suckiness becomes [["bla", "2"], ["Semester", "blub"]].
        # It shouldn't find stuff across sentence borders, so we insert detectable strings that we can remove later.
        # TODO If I can do sent_tokenize for the CountVectorizer I need to update this here as well!
    if do_tfidf is not None:
        # https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html
        # count = Pipeline([("count", cnt)]).fit(fit_base())  # to test if count already dies, otherwise we could try
        # https://discuss.analyticsvidhya.com/t/tfidf-on-sklearn-library-is-giving-me-a-huge-file-and-memory-error/78448/2
        # see also https://stackoverflow.com/a/64996792/5122790
        pipe = Pipeline([("count", cnt), ("tfidf", TfidfTransformer(use_idf=(do_tfidf == "tfidf")))]).fit(fit_base())
        aslist, all_words = csr_to_list(pipe.transform(fit_base()), pipe["count"].vocabulary_)
        return DocTermMatrix(dtm=aslist, all_terms=all_words, quant_name=do_tfidf), {"ngrams_in_embedding": any(" " in i for i in all_words.values()), "sklearn_tfidf": True}
    X = cnt.fit_transform(fit_base())
    aslist, all_words = csr_to_list(X, cnt.vocabulary_)
    return DocTermMatrix(dtm=aslist, all_terms=all_words, quant_name="count"), {"ngrams_in_embedding": any(" " in i for i in all_words.values())}
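
# A standalone sketch of the kind of conversion csr_to_list is assumed to perform above: turn sklearn's sparse
# doc-term counts plus CountVectorizer.vocabulary_ ({term: column index}) into per-document [term_index, count]
# pairs and an {index: term} lookup, i.e. the structure DocTermMatrix appears to be built from here.
# The corpus below is a made-up toy example, not pipeline data.
def _example_csr_to_list():
    from sklearn.feature_extraction.text import CountVectorizer
    corpus = ["linear algebra course", "algebra and number theory"]
    cnt = CountVectorizer()
    X = cnt.fit_transform(corpus)                     # scipy CSR matrix, shape (docs, terms)
    all_terms = {idx: term for term, idx in cnt.vocabulary_.items()}
    aslist = [[[int(ind), int(X[row, ind])] for ind in X[row].indices] for row in range(X.shape[0])]
    return aslist, all_terms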
def create_tsne(dissim_mat, embed_dimensions):
    embedding = TSNE(n_components=embed_dimensions, random_state=get_setting("RANDOM_SEED"), metric="precomputed")
    tsne = embedding.fit(dissim_mat)
    return tsne
def preprocess_descriptions_full(raw_descriptions, dataset_class, pp_components, for_language, translate_policy, languages, translations=None, verbose=True):
    # TODO should I assert a minimal number of PP-Components? If I don't word-tokenize, it all doesn't make much sense, does it?
    pp_components = PPComponents.from_str(pp_components)
    print("The following Pre-Processings will be performed:", ", ".join([k for k, v in pp_components.di.items() if v]))
    descriptions = dataset_class.preprocess_raw_file(raw_descriptions, pp_components)
    if get_setting("preprocessed_bow", default_false=True):
        descriptions = descriptions_from_bow(descriptions, languages, translations, translate_policy)
        if len(raw_descriptions["vecs"]) > len(descriptions):
            warnings.warn(f"Because of the min-words-per-desc setting, {len(raw_descriptions['vecs'])-len(descriptions)} of the original items needed to be removed!")
    else:
        if get_setting("DEBUG"):
            descriptions = descriptions[:get_setting("DEBUG_N_ITEMS")]
            # pd.DataFrame([descriptions.iloc[key] for key in random.sample(range(len(descriptions)), k=get_setting("DEBUG_N_ITEMS"))])
        if isinstance(languages, str):
            languages = {k: {k2: languages for k2 in descriptions[k]} if set(descriptions[k]) != {''} else None for k in descriptions.keys()}
        descriptions = create_bare_desclist(languages, translations, for_language, list(descriptions["title"]), list(descriptions["description"]),
                                            [i if str(i) != "nan" else None for i in descriptions["subtitle"]], translate_policy,
                                            pp_components=pp_components, assert_all_translated=False,
                                            additionals={i: [j if not (isinstance(j, float) and math.isnan(j)) else None for j in descriptions[i]]
                                                         for i in dataset_class.additionals} if pp_components.add_additionals else None)
    if pp_components.use_skcountvec:
        descriptions = pp_descriptions_countvec(descriptions, pp_components, for_language)
    else:
        descriptions = preprocess_descriptions(descriptions, pp_components)
    descriptions = descriptions.filter_words(min_words=get_setting("MIN_WORDS_PER_DESC"))
    if verbose:
        show_hist([i.n_words() for i in descriptions._descriptions], "Words per Description", xlabel="Number of Words")
    return descriptions, {"n_samples": len(descriptions)}