def get_features_chapter(data, params_vocab, data_raw, params_model, params_features):
    """Run per-chapter feature extraction against each known author.

    For every (chapter, author) pair: relabel the chapter's rows so the
    chapter acts as its own "author", build a vocabulary and model for the
    pair, compute the HCT feature table with per-class frequencies, keep
    only features selected at least once, and write the result to CSV.

    Params:
        :data: corpus dataframe with at least 'author' and 'chapter' columns
        :params_vocab: vocabulary-construction parameters
        :data_raw: raw corpus used to map lemma features back to terms
        :params_model: model-construction parameters
        :params_features: dict with 'known_authors', 'specific_chapters'
                          and 'out_path' entries
    """
    lo_authors = params_features['known_authors']
    lo_chapters = params_features['specific_chapters']
    for ch in lo_chapters:
        for auth in lo_authors:
            logging.info(f"Checking features: {ch} vs. {auth}")
            # Hoist the chapter mask (the original recomputed the regex
            # match three times per pair).
            ch_mask = data.chapter.str.contains(fr"{ch}")
            # WARNING: here we may have a match if chapter string
            # equals the author of ch
            # .copy() so we mutate a real frame, not a view of `data`
            # (the original assignment triggered SettingWithCopyWarning
            # and could silently fail to stick).
            ds = data[(data.author == auth) | ch_mask].copy()
            # Index with ds's own mask rather than a full-`data` mask,
            # which only worked via implicit index alignment.
            ds.loc[ds.chapter.str.contains(fr"{ch}"), 'author'] = ch
            data1 = data.copy()
            data1.loc[ch_mask, 'author'] = ch
            vocab = build_vocab(data1, params_vocab)
            md = build_model(ds, vocab, params_model)
            df = md[0].HCT_vs_many()
            df.loc[:, f'{auth}:freq'] = df[f'{auth}:n'] / df[f'{auth}:T']
            df.loc[:, f'{ch}:freq'] = df[f'{ch}:n'] / df[f'{ch}:T']
            df.loc[:, "freq_common"] = df['n'] / df['T']
            # only use features selected at least once (non-zero affinity)
            dfm = df[df.iloc[:, df.columns.str.contains('affinity')]
                     .abs().any(axis=1)].reset_index()
            cvr = Convert(data_raw)
            dfm['term'] = dfm['feature'].apply(cvr._lem2term)
            dfm.to_csv(f"{params_features['out_path']}{ch}_vs_{auth}.csv")
def _val_pipeline(data_train: pd.DataFrame, data_test: pd.DataFrame,
                  vocabulary, model_params) -> pd.DataFrame:
    """Validation pipeline:
        1. model construction using training data
        2. prediction of testing data

    True 'author' labels of the test set are obscured before prediction
    and merged back into the result afterwards.

    Returns:
        prediction dataframe with the true 'author' labels restored
        (the original annotation claimed `float`, but a DataFrame is
        what is actually returned).
    """
    md, _ = build_model(data_train, vocabulary, model_params)
    labels = data_test[['doc_id', 'author']].drop_duplicates()
    # Work on a copy: the original wrote 'UNK' into the caller's frame,
    # silently destroying the caller's labels.
    data_test = data_test.copy()
    data_test.loc[:, 'author'] = 'UNK'  # obscure true labels
    df1 = model_predict(data_test, md)
    df1 = df1.drop('author', axis=1).merge(labels, on='doc_id', how='left')
    return df1
def _evaluate_discrepancies(data, vocab, model_params):
    """Build a word-frequency model from everything except the document
    labeled '<TEST>', then evaluate that document's discrepancy with
    respect to each of the other authors in the data.
    """
    is_test = data.author == "<TEST>"
    md, _ = build_model(data[~is_test], vocab, model_params)
    return model_predict(data[is_test], md)
def _check_doc(ds, vocabulary, params_model) -> pd.DataFrame:
    """Fit a model on all entries whose author is not 'TEST', then score
    the held-out entries (author == 'TEST') against that model.

    Params:
        :ds: data
        :vocabulary: vocabulary to be used in model construction
        :params_model: model parameters

    Returns:
        model prediction results (arranged as a dataframe)
    """
    train_rows = ds[ds.author != 'TEST']
    test_rows = ds[ds.author == 'TEST']
    model, _ = build_model(train_rows, vocabulary, params_model)
    return model_predict(test_rows, model)
def get_features(data, vocab, data_raw, model_params, params_features):
    """Fit a model on the known authors and specific chapters, then return
    the HCT feature table restricted to features selected at least once,
    augmented with per-author frequencies, the common frequency, and the
    surface term of each lemma feature.
    """
    authors = params_features['known_authors']
    chapters = params_features['specific_chapters']
    subset = data[data.author.isin(authors) | data.chapter.isin(chapters)]
    md = build_model(subset, vocab, model_params)
    df = md[0].HCT_vs_many()
    for author in authors:
        df.loc[:, f'{author}:freq'] = df[f'{author}:n'] / df[f'{author}:T']
    df.loc[:, "freq_common"] = df['n'] / df['T']
    # only use features selected at least once (non-zero affinity)
    affinity_cols = df.columns.str.contains('affinity')
    selected = df.iloc[:, affinity_cols].abs().any(axis=1)
    dfm = df[selected].reset_index()
    cvr = Convert(data_raw)
    dfm['term'] = dfm['feature'].apply(cvr._lem2term)
    return dfm
def _compare_doc_corpus(ds_doc, ds_corp, vocabulary, params_model):
    """Fit a model on the corpus `ds_corp` and score the document
    `ds_doc` against it.

    Returns:
        model prediction results

    NOTE(review): every other call site in this file unpacks the tuple
    returned by build_model before passing the model to model_predict;
    the original here passed the whole tuple through. Made consistent —
    confirm model_predict did not special-case a tuple argument.
    """
    md, _ = build_model(ds_corp, vocabulary, params_model)
    res = model_predict(ds_doc, md)
    return res