def _create_dissim_mat(arr, dissim_measure, force_singlethread=False, n_chunks=200, silent=False, metainf=None, continue_from=None):
    # return squareform(np.apply_along_axis(cos_to_normangdiff, 0, pdist(arr, metric="cosine")))
    # assert np.allclose(np.hstack([cdist(arr, arr[i*10:(i+1)*10], "cosine") for i in range(10)]), squareform(tmp))
    if dissim_measure in ["cosine", "norm_ang_dist"]:
        dist_func = "cosine"
    else:
        dist_func = dissim_measure
    metainf = metainf if metainf is not None else {}  # don't discard a metainf dict passed by the caller (e.g. when continuing an interrupted run)
    tmp = []
    RAM_PER_CORE = 15 #TODO make dependent on the dataset
    if not force_singlethread and get_ncpu(ram_per_core=RAM_PER_CORE) > 1: # max. 1 thread per RAM_PER_CORE GB of RAM
        # with WorkerPool(get_ncpu(ram_per_core=10), arr, pgbar="Creating dissimilarity matrix" if not silent else None) as pool:
        #     tmp = pool.work(list(np.array_split(arr, n_chunks)), lambda arr, chunk: cdist(arr, chunk, dist_func))
        with Interruptible(np.array_split(arr, n_chunks), [tmp], metainf, shutdown_time=210, continue_from=continue_from, contains_mp=True) as iter:
            with WorkerPool(get_ncpu(ram_per_core=RAM_PER_CORE), arr, pgbar="Creating dissimilarity matrix" if not silent else None, comqu=iter.comqu) as pool:
                tmpres, interrupted = pool.work(iter, lambda arr, chunk: cdist(arr, chunk, dist_func))
            tmp = iter.notify([tmpres], exception=interrupted)[0]
        if iter.interrupted:
            return tmp, metainf
    else:
        print("Running interruptible with one process")
        with Interruptible(np.array_split(arr, n_chunks), tmp, metainf, continue_from=continue_from, pgbar=None if silent else "Creating dissimilarity matrix") as iter:
            for chunk in iter:  #np.array_split(arr, n_chunks) if silent else tqdm(np.array_split(arr, n_chunks), desc="Creating dissimilarity matrix")
                tmp.append(cdist(arr, chunk, dist_func))
        if iter.interrupted:
            return tmp, metainf
    res = np.hstack(tmp)
    assert np.allclose(res, res.T), "The matrix must be symmetric!"
    if dissim_measure == "norm_ang_dist":
        flat = squareform(res, checks=False)  # checks=False because squareform's own symmetry check fails here even though res equals res.T
        res = squareform(np.apply_along_axis(cos_to_normangdiff, 0, flat))
    assert np.allclose(np.diagonal(res), 0, atol=1e-10) or np.allclose(np.diagonal(res), 1, atol=1e-10), "Diagonal must be 1 or 0!"
    assert np.allclose(res, res.T), "The matrix must be symmetric!"
    return res, metainf
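# Minimal usage sketch for _create_dissim_mat (a sketch, not part of the pipeline: it assumes this module's
# helpers -- Interruptible, get_ncpu, cos_to_normangdiff, scipy's cdist/squareform -- are importable;
# force_singlethread avoids the worker-pool path):
import numpy as np
vectors = np.random.default_rng(0).random((50, 30))   # 50 items with 30-dim embeddings
dissim, meta = _create_dissim_mat(vectors, "norm_ang_dist", force_singlethread=True, silent=True)
assert dissim.shape == (50, 50) and np.allclose(dissim, dissim.T)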
def pmi(doc_term_matrix, positive=False, verbose=False, descriptions=None):
    # PMI as defined by DESC15
    logger.info("Calculating PMIs...")
    arr = doc_term_matrix.as_csr()
    total_words = arr.sum()
    arr = arr / total_words  #now arr is p_{et}
    words_per_doc = arr.sum(axis=0)  #p_{e*}
    ges_occurs_per_term = arr.sum(axis=1)  #p_{*t}
    prod = ges_occurs_per_term * words_per_doc  #I'd like to convert this to scipy.sparse.csr_matrix(...), but that conversion completely exhausts my RAM
    res = arr / prod
    res[np.isnan(res)] = 0
    del arr
    del prod
    gc.collect()
    res = np.log1p(res)  # DESC15 say it's just the log, but with a plain log all values 0<val<1 become negative and [i for i in res[doc_term_matrix.reverse_term_dict["building"]].tolist()[0] if i > 0] shrinks considerably
    if positive:
        res[res < 0] = 0.0
    assert not np.isnan(res).any(), "There are NaNs in the PPMI!"
    quantifications = csr_to_list(res.T)
    del res
    gc.collect()
    if verbose:
        print("The counting that comes now takes long and only runs because verbose is set")
        print_quantification(doc_term_matrix, quantifications, descriptions)
    return quantifications
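# Dense-numpy sketch of the same (P)PMI computation, independent of the DocTermMatrix class
# (the toy matrix and variable names are made up; log1p and the clipping mirror the choices above):
import numpy as np
counts = np.array([[2, 0, 1], [0, 3, 1]], dtype=float)                 # terms x docs toy example
p = counts / counts.sum()                                              # p_{et}
outer = p.sum(axis=1, keepdims=True) * p.sum(axis=0, keepdims=True)    # per-term totals x per-doc totals
ppmi = np.log1p(np.nan_to_num(p / outer))                              # log1p keeps everything >= 0
ppmi[ppmi < 0] = 0.0                                                   # the positive=True branch above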
def create_isomap(dissim_mat, embed_dimensions, neighbor_factor=2, **kwargs):
    # https://scikit-learn.org/stable/modules/manifold.html#multidimensional-scaling says isomap better suited than MDS, but DESC15 say they compared it and it's worse ([15] of [DESC15])!
    n_neighbors=min(max(5, dissim_mat.shape[0]//neighbor_factor), dissim_mat.shape[0]-1)
    print(f"Running Isomap with {get_ncpu(ignore_debug=True)} jobs for max {n_neighbors} neighbors.")
    embedding = Isomap(n_jobs=get_ncpu(ignore_debug=True), n_neighbors=n_neighbors, n_components=embed_dimensions, metric="precomputed", **kwargs)
    isomap = embedding.fit(dissim_mat)
    return isomap
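# Usage sketch for create_isomap (assumes get_ncpu and this project's settings are initialized;
# the random vectors are only for illustration):
import numpy as np
from scipy.spatial.distance import cdist
vecs = np.random.default_rng(1).random((40, 20))
dissim = cdist(vecs, vecs, "cosine")                # square, symmetric, zero diagonal
isomap = create_isomap(dissim, embed_dimensions=3)
coords = isomap.embedding_                          # (40, 3) coordinates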
def run_lsi(pp_descriptions, filtered_dcm, verbose):
    """as in [VISR12: 4.2.1]"""
    if verbose:
        filtered_dcm.show_info(descriptions=pp_descriptions)
        if get_setting("DCM_QUANT_MEASURE") != "binary":
            logger.warning("VISR12 say it works best with binary!")
    orig_len = len(filtered_dcm.dtm)
    filtered_dcm.add_pseudo_keyworddocs()
    # https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html
    svd = TruncatedSVD(n_components=100,
                       random_state=get_setting("RANDOM_SEED"))
    transformed = svd.fit_transform(filtered_dcm.as_csr().T)
    desc_psdoc_dists = cdist(transformed[:orig_len], transformed[orig_len:],
                             "cosine")
    already_keywords = [
        [ind, j[0]] for ind, elem in enumerate(filtered_dcm.dtm[:orig_len])
        for j in elem
    ]  # we don't gain information from those that are close but already keywords
    desc_psdoc_dists[tuple(zip(*already_keywords))] = np.inf  # tuple, not list, for multidimensional fancy indexing
    WHICH_LOWEST = 30
    nth_lowest = np.partition(desc_psdoc_dists.min(axis=1), WHICH_LOWEST)[
        WHICH_LOWEST]  # https://stackoverflow.com/a/43171216/5122790
    good_fits = np.where(desc_psdoc_dists.min(axis=1) < nth_lowest)[0]
    for ndesc, keyword in zip(good_fits,
                              np.argmin(desc_psdoc_dists[good_fits], axis=1)):
        assert not filtered_dcm.all_terms[
            keyword] in pp_descriptions._descriptions[ndesc]
        print(f"*b*{filtered_dcm.all_terms[keyword]}*b*",
              pp_descriptions._descriptions[ndesc])
    print()
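# Stand-alone sketch of the LSI step above: TruncatedSVD on a sparse doc-term matrix, then cosine
# distances between descriptions and (pseudo-)keyword-docs (all sizes here are made up):
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import random as sparse_random
from scipy.spatial.distance import cdist
X = sparse_random(200, 50, density=0.05, random_state=0)       # 200 "documents" x 50 terms
Z = TruncatedSVD(n_components=10, random_state=0).fit_transform(X)
dists = cdist(Z[:150], Z[150:], "cosine")                       # descriptions vs. pseudo-docs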
def show_data_info(ctx):
    from derive_conceptualspace.cli.args_from_filename import LAST_RESULT
    ctx.obj[LAST_RESULT] = ctx.obj["json_persister"].load(
        None, LAST_RESULT
    )  #TODO: make the LAST_RESULT ONE THING used also in Snakefile and args_from_filename
    show_data_info_base(ctx)
    print()
def get_file_config(base_dir, filepath, dirname_vars):
    if not isfile(filepath) and isfile(join(base_dir, filepath)):
        filepath = join(base_dir, filepath)
    try:
        with open(filepath) as rfile:
            used_conf = next(ijson.items(rfile, "used_influentials")
                             )  #next(ijson.items(rfile, "used_config"))[0]
            rfile.seek(0)
            used_files = next(ijson.items(rfile, "loaded_files"))
    except Exception as e:
        print(f"Error for {filepath}")
        raise e
    used_conf = {
        k: v
        for k, v in used_conf.items()
        if k not in set(settings.MAY_DIFFER_IN_DEPENDENCIES) -
        set(standardize_config_name(i) for i in dirname_vars)
    }
    all_used_conf = dict(
        ChainMap(
            used_conf,
            *(i["metadata"].get("used_influentials", {})
              for i in used_files.values())))
    return {
        k: float(v) if isinstance(v, decimal.Decimal) else v
        for k, v in all_used_conf.items()
    }
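# Sketch of the streaming read in get_file_config: ijson.items(f, "used_influentials") yields the value
# of that top-level key without parsing the whole (potentially huge) result-JSON (demo payload made up):
import io
import ijson
demo = io.BytesIO(b'{"used_influentials": {"dataset": "siddata"}, "loaded_files": {}}')
print(next(ijson.items(demo, "used_influentials")))   # -> {'dataset': 'siddata'}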
def get_ncpu(ram_per_core=None, ignore_debug=False):
    import psutil
    if not ignore_debug and get_setting("DEBUG"):
        return 1
    ncpus = get_setting("N_CPUS")
    if os.getenv("NSLOTS"):
        if not os.getenv(f"{ENV_PREFIX}shutups_nslots"):
            print("This machine has been given NSLOTS and it is",
                  os.getenv("NSLOTS"))
        os.environ[f"{ENV_PREFIX}shutups_nslots"] = "1"
        ncpus = max(int(os.environ["NSLOTS"]) - 1, 1)
        # "To ensure that your job is scheduled on a host you are advised not to have request more  than $NCPU -1 parallel environments."
    if "GOTO_NUM_THREADS" in os.environ:  # see https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#threads
        print(
            f"Snakemake restricts the #Threads to {os.environ['GOTO_NUM_THREADS']}"
        )
        ncpus = min(ncpus, int(os.environ["GOTO_NUM_THREADS"]))
    if ram_per_core:  #TODO if I'm on the grid, I should have an env-var with the assigned ram and use that instead!!
        ncpus = min(
            ncpus,
            round(psutil.virtual_memory().total / 1024 / 1024 / 1024 /
                  ram_per_core))
        if "SGE_SMK_mem" in os.environ and os.environ["SGE_SMK_mem"].endswith(
                "G"):
            ncpus = min(ncpus,
                        round(
                            int(os.environ["SGE_SMK_mem"][:-1]) /
                            ram_per_core))  # max. 1 thread per XGB RAM
    return ncpus
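# Illustration of how the constraints in get_ncpu combine (env-var values here are made up and the
# settings context -- DEBUG, N_CPUS -- must already be initialized):
import os
os.environ["NSLOTS"] = "8"              # grid scheduler slots: use at most NSLOTS-1 = 7 workers
os.environ["GOTO_NUM_THREADS"] = "4"    # snakemake "threads" directive: cap at 4
print(get_ncpu(ram_per_core=16))        # -> min(N_CPUS, 7, 4, round(total_RAM_GB / 16))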
 def init_context(self, load_envfile=False, load_conffile=True, ignore_envs=False): #works for both a click-Context and my custom one
     if not self._initialized:
         #first of all, load settings from env-vars and, if you have it by then, from config-file
         if not ignore_envs:
             if load_envfile and os.environ.get(ENV_PREFIX+"_"+"ENV_FILE"):
                 load_dotenv(os.environ.get(ENV_PREFIX+"_"+"ENV_FILE"))
             relevant_envvars = {k[len(ENV_PREFIX)+1:]: v for k, v in os.environ.items() if k.startswith(ENV_PREFIX+"_")}
             for param, val in relevant_envvars.items():
                 if param.startswith("CONF_FORCE_"): #that's how snakemake enforces the config-file, in that situation conf-file has higher prio than env-var
                     self.set_config(param[len("CONF_FORCE_"):], val, "smk_wildcard")
                 else:
                     self.set_config(param, val, "env_vars")
         if self.get_config("conf_file"):
             if load_conffile:
                 self.read_configfile()
             else:
                 print("The env-vars contain the path to a config-file, but it intentionally isn't loaded!")
         self.obj["dataset_class"] = dataset_specifics.load_dataset_class(self.get_config("dataset"))
         if hasattr(self.obj["dataset_class"], "configs"):
             for param, val in self.obj["dataset_class"].configs.items():
                 self.set_config(param, val, "dataset_class")
         if hasattr(self.obj["dataset_class"], "init"):
             self.obj["dataset_class"].init(self)
         CustomIO.init(self)
         self.obj["json_persister"] = JsonPersister(self, settings.DIR_STRUCT)
         self.set_debug()
         if self.has_config("base_dir", include_default=False):
             os.chdir(self.get_config("base_dir"))
         self._init_time = datetime.now()
         self._initialized = True
def extract_candidateterms_keybert_preprocessed(descriptions,
                                                max_ngram,
                                                faster_keybert=False,
                                                verbose=False,
                                                **kwargs):
    from keybert import KeyBERT  # lazily loaded as it needs tensorflow/torch which takes some time to init
    model_name = "paraphrase-MiniLM-L6-v2" if faster_keybert else "paraphrase-mpnet-base-v2"
    print(f"Using model {model_name}")
    candidateterms = []
    kw_model = KeyBERT(model_name)
    descs = descriptions._descriptions if not get_setting(
        "DEBUG") else descriptions._descriptions[:get_setting("DEBUG_N_ITEMS")]
    for desc in tqdm(descs, desc="Running KeyBERT on descriptions"):
        stopwords = get_stopwords(desc.lang)
        candidates = set()
        for nwords in range(1, max_ngram):
            n_candidates = kw_model.extract_keywords(
                desc.processed_as_string(),
                keyphrase_ngram_range=(1, nwords),
                stop_words=stopwords)
            candidates |= set(i[0] for i in n_candidates)
        candidates = list(candidates)
        if (ct := extract_coursetype(desc)) and ct not in candidates:
            candidates += [ct]
        candidateterms.append(candidates)
    return candidateterms, {"model_name": model_name}  # NOTE: callers unpack (candidateterms, metainf); the metainf keys here are an assumption
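# Stand-alone KeyBERT sketch of the extraction step above (same model name as the faster_keybert variant;
# the sample sentence is made up):
from keybert import KeyBERT
kw_model = KeyBERT("paraphrase-MiniLM-L6-v2")
keywords = kw_model.extract_keywords("An introductory course on machine learning and statistics",
                                     keyphrase_ngram_range=(1, 2), stop_words="english")
print([kw for kw, score in keywords])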
def cli(ctx):
    """
    You can call this pipeline in many ways: With correct env-vars already set, with a provided `--env-file`, with a
    provided `--conf-file`, with command-line args (at the appropriate sub-command), or with default-values. If multiple
    values for a setting are given, the precedence-order is command-line-args > env-vars (--env-file > pre-existing) > conf-file > dataset_class > defaults
    """
    print("Starting up at", datetime.now().strftime("%d.%m.%Y, %H:%M:%S"))
    setup_logging(ctx.get_config("log"), ctx.get_config("logfile"))
    ctx.init_context(
    )  #after this point, no new env-vars should be set anymore (are not considered)
 def _filter_step2(dtm, used_terms_set, verbose=False, descriptions=None):
     all_terms_new = dict(
         enumerate(
             [v for k, v in dtm.all_terms.items() if k in used_terms_set]))
     all_terms_new_rev = {v: k for k, v in all_terms_new.items()}
     dtm_translator = {
         k: all_terms_new_rev[v]
         for k, v in dtm.all_terms.items() if k in used_terms_set
     }
     doc_term_matrix = [[[dtm_translator.get(ind), num] for ind, num in doc
                         if ind in used_terms_set] for doc in dtm.dtm]
     if descriptions:
         if get_setting("DO_SANITYCHECKS"):
             expected_bows = {
                 ndoc: {all_terms_new[elem]: count
                        for elem, count in doc}
                 for ndoc, doc in enumerate(doc_term_matrix[:10])
             }
             assert all(
                 all(v == descriptions._descriptions[i].bow()[k]
                     for k, v in expected_bows[i].items() if not " " in k)
                 for i in range(10))
             assert all(
                 all(v == descriptions._descriptions[i].count_phrase(k)
                     for k, v in expected_bows[i].items() if not " " in k)
                 for i in range(10))
             assert all(
                 all_terms_new[ind] in descriptions._descriptions[ndoc]
                 for ndoc, doc in enumerate(
                     tqdm(
                         doc_term_matrix,
                         desc=
                         "Cross-checking filtered DCM with Descriptions [sanity-check]"
                     )) for ind, count in doc)
         if verbose:
             shown = []
             for n_keyphrases in [0, 1, 20]:
                 items = [[
                     descriptions._descriptions[i],
                     [all_terms_new[j[0]] for j in e]
                 ] for i, e in enumerate(doc_term_matrix)
                          if len(e) <= n_keyphrases]
                 if items:
                     print(
                         f"Documents with max {n_keyphrases} keyphrases ({len(items)}):\n  "
                         + "\n  ".join(
                             f"{i[0]}: {', '.join(i[1])}" for i in
                             [j
                               for j in items if j[0] not in shown][:5]))
                     shown += [i[0] for i in items]
     return DocTermMatrix(dtm=doc_term_matrix,
                          all_terms=all_terms_new,
                          quant_name="count",
                          verbose=verbose)
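# Sketch of the index-remapping in _filter_step2: keep only terms whose old index is in used_terms_set
# and renumber the survivors densely (toy values, not project data):
old_terms = {0: "nature", 1: "ceiling", 2: "engine"}
used_terms_set = {0, 2}
new_terms = dict(enumerate(v for k, v in old_terms.items() if k in used_terms_set))  # {0: 'nature', 1: 'engine'}
new_rev = {v: k for k, v in new_terms.items()}
translator = {k: new_rev[v] for k, v in old_terms.items() if k in used_terms_set}    # {0: 0, 2: 1}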
def create_doc_cand_matrix(postprocessed_candidates,
                           descriptions,
                           verbose=False):
    postprocessed_candidates, changeds_dict = postprocessed_candidates.values()
    if get_setting("DEBUG"):
        maxlen = min(len(postprocessed_candidates), len(descriptions),
                     get_setting("DEBUG_N_ITEMS"))
        postprocessed_candidates = postprocessed_candidates[:maxlen]
        descriptions._descriptions = descriptions._descriptions[:maxlen]
    else:
        assert len(postprocessed_candidates) == len(descriptions)
    assert all(cand in desc
               for ndesc, desc in enumerate(descriptions._descriptions)
               for cand in postprocessed_candidates[ndesc])
    all_phrases = list(set(flatten(postprocessed_candidates)))
    if get_setting("DEBUG"):
        all_phrases = all_phrases[:get_setting("DEBUG_N_ITEMS")]
    # if I used gensim for this, it would be `dictionary,doc_term_matrix = corpora.Dictionary(descriptions), [dictionary.doc2bow(doc) for doc in descriptions]`
    # dictionary = corpora.Dictionary([all_phrases])
    dtm = [
        sorted(
            [(nphrase, desc.count_phrase(phrase))
             for nphrase, phrase in enumerate(all_phrases) if phrase in desc],
            key=lambda x: x[0])
        for ndesc, desc in enumerate(
            tqdm(descriptions._descriptions, desc="Creating Doc-Cand-Matrix"))
    ]
    #TODO instead of the above I could use SkLearn again
    if get_setting("DO_SANITYCHECKS"):
        assert all(
            [n for n, i in enumerate(descriptions._descriptions)
             if term in i] == [
                 n for n, i in enumerate(dtm)
                 if all_phrases.index(term) in [j[0] for j in i]
             ] for term in random.sample(all_phrases, 5))
    doc_term_matrix = DocTermMatrix(dtm=dtm,
                                    all_terms=all_phrases,
                                    verbose=verbose,
                                    quant_name="count")
    if get_setting("DO_SANITYCHECKS"):
        assert all(
            len([i for i in descriptions._descriptions if term in i]) == len(
                [i for i in doc_term_matrix.term_quants(term) if i > 0])
            for term in random.sample(all_phrases, 5))
    #TODO clarify why this filtering is needed at all
    if verbose and get_setting("EXTRACTION_METHOD") != "all":
        print(
            "The 25 terms that are most often detected as candidate terms (incl. their #detections):",
            ", ".join(f"{k} ({v})" for k, v in sorted(dict(
                Counter(flatten(postprocessed_candidates))).items(),
                                                      key=lambda x: x[1],
                                                      reverse=True)[:25]))
    return doc_term_matrix
def get_defaultsetting(key, silent=False, default_false=False):
    if "DEFAULT_" + key not in globals():
        if not default_false:
            raise ValueError(
                f"You didn't provide a value for {key} and there is no default-value!"
            )
        else:
            return False
    default = globals()["DEFAULT_" + key]
    if key not in NON_INFLUENTIAL_CONFIGS and not silent:
        if not os.getenv(f"{ENV_PREFIX}shutups_{key}"):
            print(f"returning {key} from default: *b*{default}*b*")
        os.environ[f"{ENV_PREFIX}shutups_{key}"] = "1"
    return default
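# Resolution sketch for get_defaultsetting (DEFAULT_* constants live in this settings module;
# the names below are hypothetical):
# DEFAULT_MAX_NGRAM = 5                                    # module-level default
# get_defaultsetting("MAX_NGRAM")                          # -> 5, printing the notice only once per process
# get_defaultsetting("NO_SUCH_KEY")                        # -> raises ValueError
# get_defaultsetting("NO_SUCH_KEY", default_false=True)    # -> False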
def plot_perweightingalgo(averaged, detailed, lambda1, do_print=True):
    # TODO what's missing:
    #  * maybe also plot standard deviation, maybe make a scatterplot for all of the param-combis instead of a simple barplot (or overlay both!)
    # TODO[i]: HAVE to add number of samples here, and SHOULD add info on what other configs (including which dataset!) are given
    # TODO[i]: The names of the actual kappa-funcs still sound pretty bad
    title = f"Number of candidate-directions with κ ≥ {lambda1} per weighting-algorithm,\n averaged over {round(len(detailed)/len(averaged))} parameter-combinations each (TODOsInHere!)"
    averaged = pd.DataFrame(averaged)
    if do_print: print("\n**"+title.replace("\n", "")+":**\n\n", averaged)
    ax = averaged.plot(kind="bar", logy=True)
    ax.set_xticklabels([i._text.replace("kappa_", "").replace("_", " ") for i in ax.get_xticklabels()], ha="right", rotation=45)
    plt.tight_layout()
    plt.subplots_adjust(top=0.88)
    plt.title(title)
    plt.show()
def create_candidate_svms(dcm, embedding, descriptions, verbose, continue_from=None):
    #TODO I am still not sure whether I am calculating with vectors somewhere where I should be working with points
    if hasattr(embedding, "embedding_"): embedding = embedding.embedding_
    decision_planes = {}
    metrics = {}
    terms = list(dcm.all_terms.values())
    metainf = {}
    if get_setting("DEBUG"):
        maxlen = min(len(terms), len(embedding), get_setting("DEBUG_N_ITEMS"), len(dcm.dtm))
        working_inds = [nterm for nterm, term in enumerate(terms[:maxlen]) if np.array(dcm.term_quants(term)[:maxlen], dtype=bool).std()] #those with >1 class
        term_inds = unique(flatten([j[0] for j in dcm.dtm[i]] for i in working_inds))
        terms = [dcm.all_terms[i] for i in term_inds]
        embedding = embedding[working_inds]
        ind_translator = {v: k for k, v in enumerate(term_inds)}
        dcm = DocTermMatrix([[[ind_translator[j[0]],j[1]] for j in dcm.dtm[i]] for i in working_inds],
                            {ind_translator[i]: dcm.all_terms[i] for i in term_inds}, dcm.quant_name)
        print(f"Debug-Mode: Running for {len(working_inds)} Items and {len(terms)} Terms.")
        # warnings.warn("PRECOMMIT there's stuff here!")
        # assert all(i in terms for i in ['nature', 'ceiling', 'engine', 'athlete', 'seafood', 'shadows', 'skyscrapers', 'b737', 'monument', 'baby', 'sign', 'marine', 'iowa', 'field', 'buy', 'military', 'lounge', 'factory', 'road', 'education', '13thcentury', 'people', 'wait', 'travel', 'tunnel', 'treno', 'wings', 'hot', 'background', 'vintage', 'farmhouse', 'technology', 'building', 'horror', 'realestate', 'crane', 'slipway', 'ruin', 'national', 'morze'])
        # terms = ['nature', 'ceiling', 'engine', 'athlete', 'seafood', 'shadows', 'skyscrapers', 'b737', 'monument', 'baby', 'sign', 'marine', 'iowa', 'field', 'buy', 'military', 'lounge', 'factory', 'road', 'education', '13thcentury', 'people', 'wait', 'travel', 'tunnel', 'treno', 'wings', 'hot', 'background', 'vintage', 'farmhouse', 'technology', 'building', 'horror', 'realestate', 'crane', 'slipway', 'ruin', 'national', 'morze']
        # assert len([i for i in descriptions._descriptions if 'nature' in i]) == len([i for i in dcm.term_quants('nature') if i > 0])
        # print(f"Running only for the terms {terms}")
    else:
        assert all(len([i for i in descriptions._descriptions if term in i]) == len([i for i in dcm.term_quants(term) if i > 0]) for term in random.sample(terms, 5))
    if get_setting("DO_SANITYCHECKS"):
        assert all(dcm.term_quants(terms[i]) == list(dcm.as_csr()[i,:].toarray().squeeze()) for i in random.sample(range(len(terms)), 5))

    quants_s = dcm.as_csr().toarray().tolist()  # [dcm.term_quants(term) for term in tqdm(terms, desc="Counting Terms")]
    ncpu = get_ncpu(ram_per_core=10) #TODO: make ram_per_core dependent on dataset-size
    if ncpu == 1:  #TODO Interruptible: for ncpu==1, I'm adding direct key-value-pairs, in the ncpu>1 version I'm appending to a list -> they are incompatible!
        with Interruptible(zip(terms, quants_s), ([], decision_planes, metrics), metainf, continue_from=continue_from, pgbar="Creating Candidate SVMs [1 proc]", total=len(terms), name="SVMs") as iter:
            for term, quants in iter: #in tqdm(zip(terms, quants_s), desc="Creating Candidate SVMs", total=len(terms))
                cand_mets, decision_plane, term = create_candidate_svm(embedding, term, quants, classifier=get_setting("CLASSIFIER"), descriptions=descriptions, quant_name=dcm.quant_name)
                metrics[term] = cand_mets
                decision_planes[term] = decision_plane
    else:
        print(f"Starting Multiprocessed with {ncpu} CPUs")
        with Interruptible(zip(terms, quants_s), [None, [], None], metainf, continue_from=continue_from, contains_mp=True, name="SVMs", total=len(quants_s)) as iter:
            with tqdm(total=iter.n_elems, desc=f"Creating Candidate SVMs [{ncpu} procs]") as pgbar, ThreadPool(ncpu, comqu=iter.comqu) as p:
                res, interrupted = p.starmap(create_candidate_svm, zip(repeat(embedding, iter.n_elems), repeat("next_0"), repeat("next_1"), repeat(get_setting("CLASSIFIER")), repeat(False), repeat(None), repeat(dcm.quant_name), repeat(pgbar)), draw_from=iter.iterable)
            _, res, _ = iter.notify([None, res, None], exception=interrupted)
            if interrupted is not False:
                return quants_s, res, None, metainf
        for cand_mets, decision_plane, term in res:
            metrics[term] = cand_mets
            decision_planes[term] = decision_plane
        assert set(terms) == set(metrics.keys())
    if (didnt_converge := len([1 for i in metrics.values() if i and not i["did_converge"]])):
        warnings.warn(f"{didnt_converge} of the {len(metrics)} SVMs did not converge!", sklearn.exceptions.ConvergenceWarning)
    return quants_s, decision_planes, metrics, metainf  # NOTE: assumed return shape, mirroring the interrupted-path return above
def show_close_descriptions(dissim_mat, descriptions, is_embedding=False, num=10, title="Dissim-Mat"):
    # closest_entries = list(zip(*np.where(dissim_mat==min(dissim_mat[dissim_mat>0]))))
    # closest_entries = set(tuple(sorted(i)) for i in closest_entries)
    # print(f"Closest Nonequal Descriptions: \n", "\n".join(["*b*"+("*b* & *b*".join([descriptions._descriptions[i].title for i in j]))+"*b*" for j in closest_entries]))
    print(f"Closest {num} Descriptions in {title}:")
    if is_embedding:
        dissim_mat = _create_dissim_mat(dissim_mat, get_setting("DISSIM_MEASURE"), force_singlethread=len(dissim_mat)<500, silent=len(dissim_mat)<500)[0]
    is_dissim = np.allclose(np.diagonal(dissim_mat), 0, atol=1e-10)
    assert is_dissim, "TODO now it's a similarity matrix"
    min_vals = sorted(squareform(dissim_mat))[:num]
    min_indices = np.where(np.isin(dissim_mat, min_vals))
    min_indices = [(i,j) for i,j in zip(*min_indices) if i!=j]
    min_indices = list({j: None for j in [tuple(sorted(i)) for i in min_indices]}.keys()) #remove duplicates ("aircraft cabin and airplane cabin" and "airplane cabin and aircraft cabin")
    for first, second in min_indices[:num]:
        print(f"  *b*{descriptions._descriptions[first].title}*b* and *b*{descriptions._descriptions[second].title}*b*")
 def __init__(self, dtm, all_terms, quant_name, verbose=False):
     self.includes_pseudodocs = False
     self.dtm = dtm
     self.all_terms = {n: elem
                       for n, elem in enumerate(all_terms)} if isinstance(
                           all_terms, list) else all_terms
     self.quant_name = quant_name
     # if "all_terms" in kwargs and "descriptions" in kwargs: assert hasattr(kwargs["descriptions"][0], "bow")
     # for desc in kwargs["descriptions"]: self.dtm.append([[self.reverse_term_dict[k], v] for k,v in desc.bow().items()])
     assert set(self.all_terms) == set(
         flatten([[elem[0] for elem in row] for row in self.dtm]))
     if verbose:
         print(
             f"Loaded Doc-Term-Matrix with {len(self.dtm)} documents and {len(self.all_terms)} items."
         )
         self.show_info()
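# Minimal construction sketch for DocTermMatrix: two documents over three terms, entries are
# [term-id, count] pairs (toy data; the consistency assert in __init__ passes for it):
toy_dtm = DocTermMatrix(dtm=[[[0, 2], [2, 1]], [[1, 3]]],
                        all_terms=["nature", "ceiling", "engine"],
                        quant_name="count")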
def json_load(fname, **kwargs):  #assert_meta=(), return_meta=False,
    try:
        if isinstance(fname, str):
            with open(fname, "r") as rfile:
                tmp = json.load(rfile, **kwargs)
        else:  #then it may be a sacred opened resource (https://sacred.readthedocs.io/en/stable/apidoc.html#sacred.Experiment.open_resource)
            tmp = json.load(fname, **kwargs)
        return npify_rek(tmp)
    except json.decoder.JSONDecodeError as e:
        print(f"{fname} doesn't work!")
        raise json.decoder.JSONDecodeError(msg=f"NAME:{fname}|||MSG:{e.msg}",
                                           doc=e.doc,
                                           pos=e.pos) from e
    except Exception as e:
        print(f"{fname} doesn't work!")
        raise e
def classify_shallowtree_multi(clusters,
                               embedding,
                               descriptions,
                               dataset_class,
                               classes=None,
                               verbose=False,
                               **kwargs):
    results = {}
    for classes in (([classes] if isinstance(classes, str) else classes)
                    or descriptions.additionals_names):
        for test_percentage_crossval in [0.33, 0.5, 4, 5]:
            for one_vs_rest in [True, False]:
                for dt_depth in [1, 2, 3, None]:
                    for balance_classes in [True, False]:
                        results[(classes, test_percentage_crossval,
                                 one_vs_rest, dt_depth, balance_classes)] = {}
                        for metric in ["accuracy", "f1"]:
                            print("==" * 50)
                            score = classify_shallowtree(
                                clusters,
                                embedding,
                                descriptions,
                                dataset_class,
                                one_vs_rest,
                                dt_depth,
                                test_percentage_crossval,
                                classes,
                                verbose=verbose,
                                return_features=False,
                                balance_classes=balance_classes,
                                metric=metric,
                                also_unweighted=True,
                                **kwargs)
                            results[(classes, test_percentage_crossval,
                                     one_vs_rest, dt_depth,
                                     balance_classes)][metric] = score
    df = pd.DataFrame(results,
                      columns=pd.MultiIndex.from_tuples(
                          [i for i in results.keys()],
                          names=("classes", "test%/Xval", "1vsRest",
                                 "Tree-Depth", "balanced")))
    with pd.option_context('display.max_rows', 51, 'display.max_columns', 20,
                           'display.expand_frame_repr', False,
                           'display.max_colwidth', 20, 'display.float_format',
                           '{:.4f}'.format):
        print(df)
    return df
 def __init__(self, **kwargs):
     assert kwargs.keys() == self.OPTION_LETTER.keys()
     if kwargs["use_skcountvec"]:
         must_override = [
             i for i in kwargs.keys() - self.SKCOUNTVEC_SUPPORTS -
             {"use_skcountvec"} if kwargs[i]
         ]
         if must_override:
             print(
                 f"Must overwrite the following PP-Components to False as SKLearn-CountVectorizer doesn't support it: {', '.join(must_override)}"
             )
             raise Exception("No can do!")
             kwargs = {
                 k: False if k in must_override else v
                 for k, v in kwargs.items()
             }
     self.di = kwargs
def main():
    DATASET = "siddata"
    args = parse_command_line_args()
    setup_logging()
    load_envfiles(DATASET)

    ctx = SnakeContext.loader_context(silent=False)
    descriptions = ctx.load("pp_descriptions")
    res = extract_classes(descriptions,
                          args.classes,
                          ctx.obj["dataset_class"],
                          use_name=args.named)
    fname = join(
        ctx.p.in_dir, "fb_classifier",
        f"{DATASET}_{args.classes}{'_named' if args.named else ''}.csv")
    res.reset_index().to_csv(fname)
    print(f"Saved under {fname}.")
def list_paramcombis(ctx):
    from derive_conceptualspace.cli.args_from_filename import LAST_RESULT
    # TODO get rid of this entirely.
    # TODO this should ONLY consider command-line-args as config to compare the candidates to
    candidates = [
        join(path, name)[len(ctx.p.in_dir):]
        for path, subdirs, files in os.walk(join(ctx.p.in_dir, ""))
        for name in files if name.startswith(f"{LAST_RESULT}.json")
    ]  #TODO better LAST_RESULT
    candidates = [
        i for i in candidates if i.startswith(
            ctx.p.get_subdir({
                i: ctx.get_config(i)
                for i in ["DEBUG", "DATASET", "LANGUAGE"]
            })[0])
    ]
    for cand in candidates:
        print(cand)
 def set_config(self, key, val, source, silent=False): #this is only a suggestion, it will only be finally set once it's accessed!
     key, val = standardize_config(key, val)
     if key in self.used_configs and val != self.used_configs[key]:
         raise ValueError(fmt(f"{source} is trying to overwrite config {key} with *r*{val}*r*, but it was already used with value *b*{self.used_configs[key]}*b*!"))
     self.toset_configs.append([key, val, source])
     existing_configs = list(zip(*[i for i in self.toset_configs if i[0] == key and i[2] not in ["defaults", "smk_args"]]))
     if existing_configs and len(set(existing_configs[1])) > 1 and existing_configs[0][0] not in settings.MAY_DIFFER_IN_DEPENDENCIES:
         #TODO this has become a mess. I originally only wanted this warning for dependency, but then expanded it for force and now it's BS. Overhaul this!!
         ordered_args = sorted(list(zip(*existing_configs[::-1][:2])), key=lambda x:CONF_PRIORITY.index(re.sub(r'\[.*?\]', '', x[0])))
         ordered_args = dict(sorted({v:k for k,v in list({v: k for k, v in ordered_args[::-1]}.items())}.items(), key=lambda x:CONF_PRIORITY.index(re.sub(r'\[.*?\]', '', x[0])))) # per value only keep the highest-priority-thing that demanded it
         if "dependency" in ordered_args and ordered_args["dependency"] != ordered_args.get("force", ordered_args["dependency"]):
             raise ValueError(f"A Dependency requires {existing_configs[0][0]} to be {dict(ordered_args)['dependency']} but your other config demands {[v for k,v in ordered_args.items() if k!='dependency'][0]}")
         # if "dataset_class" in ordered_args and bool([k for k, v in ordered_args.items() if v != ordered_args["dataset_class"]]): #if something of higher prio overwrites dataset_class
         #     raise ValueError(f"dataset_class requires {existing_configs[0][0]} to be {dict(ordered_args)['dataset_class']} but it will be overwritten by {[k for k, v in ordered_args.items() if v != ordered_args['dataset_class']]}")
         ordered_args = list(ordered_args.items())
         if f"{existing_configs[0][0]} from {ordered_args[1][1]} to {ordered_args[0][1]}" not in self._given_warnings:
             self._given_warnings.append(f"{existing_configs[0][0]} from {ordered_args[1][1]} to {ordered_args[0][1]}")
             if not (silent or (hasattr(self, "silent") and self.silent)):
                 print(f"{ordered_args[1][0]} demanded config {existing_configs[0][0]} to be *r*{ordered_args[1][1]}*r*, but {ordered_args[0][0]} overwrites it to *b*{ordered_args[0][1]}*b*")
def extract_classes(descriptions, classes, dataset_class, use_name=False):
    #TODO: merge this with the content of the very same thing in derive_conceptualspace.evaluate.shallow_trees.classify_shallowtree
    if classes is None:
        classes = descriptions.additionals_names[0]
    if classes in descriptions.additionals_names:
        catnames = None
        if hasattr(dataset_class,
                   "CATNAMES") and classes in dataset_class.CATNAMES:
            catnames = dataset_class.CATNAMES.get(classes)
        hascat = [
            n for n, i in enumerate(descriptions._descriptions)
            if i._additionals[classes] is not None
        ]
        getcat = lambda i: descriptions._descriptions[i]._additionals[classes]
    elif hasattr(dataset_class, "get_custom_class"):
        getcat, hascat, catnames = dataset_class.get_custom_class(classes,
                                                                  descriptions,
                                                                  verbose=True)
    else:
        raise Exception(f"The class {classes} does not exist!")
    if catnames and use_name:
        orig_getcat = getcat
        getcat = lambda x: catnames.get(int(orig_getcat(x)), orig_getcat(x))
    else:
        orig_getcat = getcat
        getcat = lambda x: int(orig_getcat(x)) - 1  #labels 0-9 instead of 1-10

    print(
        f"Using classes from {classes} - {len(hascat)}/{len(descriptions)} entities have a class"
    )
    cats = {i: getcat(i) for i in hascat}
    print(
        f"Labels ({len(set(cats.values()))} classes):",
        ", ".join(f"*b*{k}*b*: {v}"
                  for k, v in Counter(cats.values()).items()))
    return pd.DataFrame(
        {
            descriptions._descriptions[i].title:
            [descriptions._descriptions[i].unprocessed_text,
             getcat(i)]
            for i in hascat
        },
        index=["text", "class"]).T
 def term_freqs(self, verbose=False):
     """the number of documents containing a word, for all words"""
     if not hasattr(self, "_term_freqs"):
         # occurences = [set(i[0] for i in doc) for doc in self.dtm]
         # self._term_freqs = {term: sum(term in doc for doc in occurences) for term in tqdm(list(self.all_terms.keys()), desc="Calculating Doc-Frequencies")}
         self._term_freqs = dict(
             enumerate(
                 self.as_csr(binary=True).sum(
                     axis=1).squeeze().tolist()[0]))
         if verbose:
             most_freq = sorted(self._term_freqs.items(),
                                key=lambda x: x[1],
                                reverse=True)[:5]
             print(
                 "Most frequent terms:", ", ".join([
                     f"{self.all_terms[term]} ({num})"
                     for term, num in most_freq
                 ]))
     return self._term_freqs
def create_languages_file(raw_descriptions,
                          columns,
                          json_persister,
                          dataset_class,
                          declare_silent=False,
                          pp_components=None,
                          proc_descs=None):
    if isinstance(columns, str):
        columns = [columns]
    results = {}
    for col in columns:
        try:
            languages = json_persister.load(None,
                                            f"{col}_languages",
                                            loader=lambda langs: langs,
                                            silent=declare_silent)
        except FileNotFoundError:
            if proc_descs is None:
                proc_descs = dataset_class.preprocess_raw_file(
                    raw_descriptions,
                    pp_components=PPComponents.from_str(pp_components))
            langs = get_langs(proc_descs[col],
                              assert_len=False,
                              pgbar_name=f"Getting Language of {col}")
            langs = {
                i[col]: langs[i[col]]
                for _, i in proc_descs.iterrows() if not pd.isna(i[col])
            }
            json_persister.save(f"{col}_languages.json",
                                langs=langs,
                                ignore_confs=[
                                    "DEBUG", "PP_COMPONENTS",
                                    "TRANSLATE_POLICY", "LANGUAGE"
                                ])
            languages = json_persister.load(None,
                                            f"{col}_languages",
                                            loader=lambda langs: langs,
                                            silent=declare_silent)
        else:
            print(f"Languages-file for {col} already exists!")
        results[col] = languages
    return results
def create_mds(dissim_mat, embed_dimensions, metric=True, init_from_isomap=True):
    max_iter = 10000 if not get_setting("DEBUG") else 100
    if not init_from_isomap:
        warnings.warn("sklearn's MDS is broken!! Have to init from something, don't f*****g ask why!")
        n_inits = math.ceil((max(get_ncpu()*2, (10 if not get_setting("DEBUG") else 3)))/get_ncpu())*get_ncpu() # minimally 10, maximally ncpu*2, but in any case a multiple of ncpu
        print(f"Running {'non-' if not metric else ''}metric MDS {n_inits} times with {get_ncpu(ignore_debug=True)} jobs for max {max_iter} iterations.")
        embedding = MDS(n_components=embed_dimensions, dissimilarity="precomputed",
                        metric=metric, #TODO with metric=True it always breaks after the second step if n_components>>2 (it does with metric=False as well, though)
                        n_jobs=get_ncpu(ignore_debug=True), verbose=1 if get_setting("VERBOSE") else 0, n_init=n_inits, max_iter=max_iter)
        mds = embedding.fit(dissim_mat)
    else:
        print(f"Running {'non-' if not metric else ''}metric MDS with {get_ncpu(ignore_debug=True)} jobs for max {max_iter} iterations, initialized from Isomap-Embeddings")
        embedding = MDS(n_components=embed_dimensions, dissimilarity="precomputed", metric=metric,
                        n_jobs=get_ncpu(ignore_debug=True), verbose=1 if get_setting("VERBOSE") else 0, n_init=1, max_iter=max_iter)
        try:
            isomap_init = create_isomap(dissim_mat, embed_dimensions, neighbor_factor=25).embedding_
        except ValueError: #There are significant negative eigenvalues...
            isomap_init = np.random.random((len(dissim_mat), embed_dimensions))*0.01
        mds = embedding.fit(dissim_mat, init=isomap_init)
    return mds
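# Usage sketch for create_mds: embed a precomputed dissimilarity matrix into 3 dimensions
# (get_setting/get_ncpu must be configured; the random vectors are only for illustration):
import numpy as np
from scipy.spatial.distance import cdist
vecs = np.random.default_rng(2).random((30, 10))
dissim_mat = cdist(vecs, vecs, "cosine")
mds = create_mds(dissim_mat, embed_dimensions=3)
points = mds.embedding_                     # (30, 3) coordinates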
 def read_configfile(self):
     if self.get_config("conf_file"):
         conf_file = self.get_config("conf_file")
         configdir_candidate = join(os.getenv(f"{ENV_PREFIX}_CONFIGDIR", dirname(settings.__file__)), conf_file)
         fname = configdir_candidate if not isfile(conf_file) and isfile(configdir_candidate) else conf_file
         with open(fname, "r") as rfile:
             config = yaml.load(rfile, Loader=yaml.SafeLoader)
         if config.get("__perdataset__"):
             if config["__perdataset__"].get(self.get_config("dataset"), {}):
                 config.update(config.get("__perdataset__", {}).get(self.get_config("dataset"), {}))
             del config["__perdataset__"]
         for k, v in config.items():
             if isinstance(v, list): #IDEA: if v is a list and a cmd-arg or env-var has a value consistent with it, take that value
                 overwriters = [i[1:] for i in self.toset_configs if i[0]==standardize_config_name(k) and CONF_PRIORITY.index(re.sub(r'\[.*?\]', '', i[2])) < CONF_PRIORITY.index("conf_file")]
                 if overwriters and len(set([i[0] for i in overwriters])) > 1:
                     # assert len(overwriters) == 1 and overwriters[0][0] in v, "TODO: do this"
                     self.set_config(k, overwriters[0][0], "conf_file")
                 else:
                     self.set_config(k, v[0], "conf_file")
             else:
                 self.set_config(k, v, "conf_file")
         if not self.silent: print(f"Config-File {fname} loaded.")
def run_lsi_gensim(pp_descriptions, filtered_dcm, verbose=False):
    """as in [VISR12: 4.2.1]"""
    # TODO options here:
    # * if it should filter AFTER the LSI

    if verbose:
        filtered_dcm.show_info(descriptions=pp_descriptions)
        if get_setting("DCM_QUANT_MEASURE") != "binary":
            logger.warning("VISR12 say it works best with binary!")

    filtered_dcm.add_pseudo_keyworddocs()
    dictionary = corpora.Dictionary([list(filtered_dcm.all_terms.values())])
    print("Start creating the LSA-Model with MORE topics than terms...")
    lsamodel_manytopics = LsiModel(filtered_dcm.dtm,
                                   num_topics=len(filtered_dcm.all_terms) * 2,
                                   id2word=dictionary)
    print("Start creating the LSA-Model with FEWER topics than terms...")
    lsamodel_lesstopics = LsiModel(filtered_dcm.dtm,
                                   num_topics=len(filtered_dcm.all_terms) //
                                   10,
                                   id2word=dictionary)
    print()
    import matplotlib.cm
    import matplotlib.pyplot as plt
    # TODO use the mpl_tools here as well to also save plot!
    plt.imshow(lsamodel_lesstopics.get_topics()[:100, :200],
               vmin=lsamodel_lesstopics.get_topics().min(),
               vmax=lsamodel_lesstopics.get_topics().max(),
               cmap=matplotlib.cm.get_cmap("coolwarm"))
    plt.show()
def extract_candidateterms(pp_descriptions,
                           extraction_method,
                           max_ngram,
                           verbose=False,
                           **kwargs):
    if extraction_method == "keybert":
        candidateterms, metainf = extract_candidateterms_keybert_nopp(
            pp_descriptions,
            max_ngram,
            get_setting("faster_keybert"),
            verbose=verbose,
            **kwargs)
    elif extraction_method == "pp_keybert":
        candidateterms, metainf = extract_candidateterms_keybert_preprocessed(
            pp_descriptions,
            max_ngram,
            get_setting("faster_keybert"),
            verbose=verbose,
            **kwargs)
    elif extraction_method in ["tfidf", "tf", "all", "ppmi"]:
        candidateterms, metainf = extract_candidateterms_quantific(
            pp_descriptions,
            max_ngram,
            quantific=extraction_method,
            verbose=verbose,
            **kwargs)
    else:
        raise NotImplementedError()
    flattened = set(flatten(candidateterms))
    print(
        "Unique Terms I found: ", ", ".join([
            f"{k+1}-grams: {v}"
            for k, v in sorted(Counter([i.count(" ")
                                        for i in flattened]).items(),
                               key=lambda x: x[0])
        ]), "| sum:", len(flattened))
    metainf["n_candidateterms"] = len(flattened)
    return candidateterms, metainf
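# Stand-alone sketch of the n-gram summary printed above (toy candidate terms):
from collections import Counter
flattened = {"machine learning", "statistics", "deep neural network"}
summary = ", ".join(f"{k+1}-grams: {v}" for k, v in sorted(Counter(t.count(" ") for t in flattened).items()))
print("Unique Terms I found: ", summary, "| sum:", len(flattened))   # 1-grams: 1, 2-grams: 1, 3-grams: 1 | sum: 3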