Example n. 1
0
def get_features_chapter(data, params_vocab, data_raw, params_model,
                         params_features):
    """
    Compute HCT feature tables comparing each specific chapter against
    each known author, and write one CSV per (chapter, author) pair.

    Params:
    :data:  per-chapter dataset with 'author' and 'chapter' columns
    :params_vocab:  vocabulary-construction parameters
    :data_raw:  raw data used by Convert to map lemmas back to terms
    :params_model:  model parameters
    :params_features:  dict with 'known_authors', 'specific_chapters',
                       and 'out_path' keys

    Side effects:
        writes "<out_path><ch>_vs_<auth>.csv" for every pair.
    """
    lo_authors = params_features['known_authors']
    lo_chapters = params_features['specific_chapters']
    for ch in lo_chapters:
        for auth in lo_authors:

            logging.info(f"Checking features: {ch} vs. {auth}")
            # Hoist the chapter mask: it was computed three times before.
            chapter_mask = data.chapter.str.contains(fr"{ch}")
            # .copy() so the relabeling below cannot mutate `data` through
            # a view (avoids pandas SettingWithCopyWarning / aliasing).
            ds = data[(data.author == auth) | chapter_mask].copy()
            # WARNING: here we may have a match if chapter string
            # equals the author of ch
            ds.loc[chapter_mask, 'author'] = ch

            data1 = data.copy()
            data1.loc[chapter_mask, 'author'] = ch
            vocab = build_vocab(data1, params_vocab)
            md = build_model(ds, vocab, params_model)
            df = md[0].HCT_vs_many()
            df.loc[:, f'{auth}:freq'] = df[f'{auth}:n'] / df[f'{auth}:T']
            df.loc[:, f'{ch}:freq'] = df[f'{ch}:n'] / df[f'{ch}:T']
            df.loc[:, "freq_common"] = df['n'] / df['T']
            # only use features selected ("affinity" column nonzero) at
            # least once
            dfm = df[df.iloc[:, df.columns.str.contains('affinity')]
                     .abs().any(axis=1)].reset_index()

            cvr = Convert(data_raw)
            dfm['term'] = dfm['feature'].apply(cvr._lem2term)
            dfm.to_csv(f"{params_features['out_path']}{ch}_vs_{auth}.csv")
Example n. 2
0
def _val_pipeline(data_train: pd.DataFrame, data_test: pd.DataFrame,
                  vocabulary, model_params) -> pd.DataFrame:
    """
    Validation pipeline:
    1. model construction using training data
    2. prediction of testing data

    Returns the prediction frame with the true 'author' labels
    restored (merged back on 'doc_id').
    """
    md, _ = build_model(data_train, vocabulary, model_params)
    labels = data_test[['doc_id', 'author']].drop_duplicates()
    # Work on a copy so obscuring the labels does not mutate the
    # caller's frame (and cannot raise SettingWithCopyWarning).
    data_test = data_test.copy()
    data_test.loc[:, 'author'] = 'UNK'  # obscure true labels
    df1 = model_predict(data_test, md)
    df1 = df1.drop('author', axis=1).merge(labels, on='doc_id', how='left')
    return df1
Example n. 3
0
def _evaluate_discrepancies(data, vocab, model_params):
    """
    Build a word-frequency model from every row whose author is not
    '<TEST>' and evaluate the discrepancy of the '<TEST>' rows with
    respect to the other authors in the data.
    """
    is_test = data.author == "<TEST>"

    model, _ = build_model(data[~is_test], vocab, model_params)
    return model_predict(data[is_test], model)
Example n. 4
0
def _check_doc(ds, vocabulary, params_model) -> pd.DataFrame:
    """
    Check the 'TEST' document against a model built from everything else.

    Rows with ds['author'] == 'TEST' are held out; the model is fit on
    the remaining rows and the held-out rows are scored against it.

    Params:
    :ds:  data
    :vocabulary:  vocabulary to be used in model construction
    :params_model:  model parameters

    Returns:
         model prediction results (arranged as a dataframe)
    """
    held_out = ds.author == 'TEST'
    model, _ = build_model(ds[~held_out], vocabulary, params_model)
    return model_predict(ds[held_out], model)
Example n. 5
0
def get_features(data, vocab, data_raw, model_params, params_features):
    """
    Build one HCT feature table over the known authors and specific
    chapters, attach per-author frequency columns, and keep only the
    features flagged by at least one 'affinity' column.
    """
    authors = params_features['known_authors']
    chapters = params_features['specific_chapters']

    subset = data[data.author.isin(authors) | data.chapter.isin(chapters)]
    model = build_model(subset, vocab, model_params)
    df = model[0].HCT_vs_many()

    for author in authors:
        df.loc[:, f'{author}:freq'] = df[f'{author}:n'] / df[f'{author}:T']

    df.loc[:, "freq_common"] = df['n'] / df['T']

    # only use features selected at least once: any nonzero entry in
    # an 'affinity' column
    affinity_cols = df.iloc[:, df.columns.str.contains('affinity')]
    dfm = df[affinity_cols.abs().any(axis=1)].reset_index()

    converter = Convert(data_raw)
    dfm['term'] = dfm['feature'].apply(converter._lem2term)
    return dfm
Example n. 6
0
def _compare_doc_corpus(ds_doc, ds_corp, vocabulary, params_model):
    """
    Score document data against a model built from corpus data.

    Params:
    :ds_doc:  document data to evaluate
    :ds_corp:  corpus data used to fit the model
    :vocabulary:  vocabulary for model construction
    :params_model:  model parameters

    Returns:
        model prediction results
    """
    # build_model returns a tuple; every other call site in this file
    # unpacks it (or indexes [0]) before use, so pass the model itself
    # to model_predict, not the whole tuple.
    md, _ = build_model(ds_corp, vocabulary, params_model)
    res = model_predict(ds_doc, md)
    return res