def create_viz_production(results_table_name='final_model_eval'):
    """
    Creates visualizations for models that are trained on the full training
    data set and are used in production.

    Parameters
    ----------
    results_table_name : str
        The name of a SQL table which contains results (from the test set)
        about the final models.
    """
    # set up required variables
    local_paths_env = load_local_paths('../pipeline/local_paths.yaml')
    env = load_psql_env(local_paths_env['pgpass_path'])
    ignition = load_config(local_paths_env['ignition_path'] +
                           '_1_baseline_ignition.yaml')

    # open sql connection
    connection = SQLConn(env)
    connection.open()

    # pull data from table
    query = f"select * from results.{results_table_name};"
    results_df = pd.read_sql_query(query, connection.conn)
    results_df['label'] = results_df['review_group']

    # create directory to store visualizations
    vis_dir = f'{local_paths_env["store_visualizations"]}/production_citations'
    if not os.path.exists(vis_dir):
        os.makedirs(vis_dir)

    # precision-recall plot for each review group
    for rg in tqdm(results_df['review_group'].unique()):
        plot = plot_precision_recall_curve_best(results_df, rg,
                                                plot_baseline=False)
        plt.savefig(f'{vis_dir}/pr_curve-{rg}.png')
        plt.close()

    # stacked bar workload plot
    plot = workload_relative_stackedbar(results_df)
    plt.savefig(f'{vis_dir}/workload_relative.png')
    plt.close()

    # average workload reduction plot
    plot_average_workload_reduction(results_df)
    plt.savefig(f'{vis_dir}/workload_average.png')
    plt.close()
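# The function above writes one precision-recall figure per review group. A
# minimal self-contained sketch of that save-one-figure-per-group pattern,
# with synthetic data standing in for the results table (group names and
# values are invented for illustration):
import os

import matplotlib.pyplot as plt
import pandas as pd


def _demo_per_group_plots(out_dir='demo_viz'):
    # synthetic stand-in for the results table
    demo_df = pd.DataFrame({
        'review_group': ['rg_a', 'rg_a', 'rg_b', 'rg_b'],
        'recall': [0.90, 0.95, 0.90, 0.95],
        'precision_at_recall': [0.40, 0.30, 0.55, 0.45],
    })
    os.makedirs(out_dir, exist_ok=True)
    # one figure per review group, mirroring the loop above
    for rg, grp in demo_df.groupby('review_group'):
        plt.plot(grp['recall'], grp['precision_at_recall'], marker='o')
        plt.xlabel('recall')
        plt.ylabel('precision at recall')
        plt.title(rg)
        plt.savefig(f'{out_dir}/pr_curve-{rg}.png')
        plt.close()


_demo_per_group_plots()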
def heatmap_data(igs, comp_ig='1', local_paths='../pipeline/local_paths.yaml'):
    """
    Prepares data to be plotted in a heatmap and stores it as a csv.

    Parameters
    ==========
    igs : list
        List of strings of ignition ids to be pulled from the database.
    comp_ig : str
        Ignition id whose averaged results serve as the comparison baseline.
    local_paths : str
        Path to the local paths yaml file.
    """
    out_dir = load_local_paths(local_paths)['tmp']

    # pull in results data
    results = pull_results(igs)
    best = get_best_hyperparam_all(results)
    best = best[[
        'algorithm', 'hyperparameters', 'label', 'recall',
        'precision_at_recall'
    ]]
    best['type'] = 'best'
    comp = get_avg_ignition(results, comp_ig)
    comp['type'] = 'comp'
    out = pd.concat([comp, best], axis=0)

    # pull review group data
    connection = SQLConn(
        load_psql_env(load_local_paths(local_paths)['pgpass_path']))
    connection.open()
    papers_rgs = connection.query(
        'select inregister as label, count(*) as n_papers '
        'from semantic.papers_rgs group by 1;')
    papers_revs = connection.query("""
        with tbl as (
            select a.*, b.cn
            from semantic.papers_rgs a
            left join semantic.papers_reviews b
            on a.recordid = b.recordid)
        select inregister as label, count(distinct cn) as n_revs
        from tbl
        group by 1;
        """)

    # final dataset
    out = pd.merge(out, papers_rgs, how='left', on='label')
    out = pd.merge(out, papers_revs, how='left', on='label')

    # output dataset
    out.to_csv(out_dir + 'heatmap_data.csv')
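# heatmap_data tags each row as 'best' or 'comp' before stacking, then
# left-merges per-label counts onto the stacked frame. A toy, runnable
# sketch of that reshaping step (all values invented for illustration):
import pandas as pd

best = pd.DataFrame({'label': ['a', 'b'], 'precision_at_recall': [0.6, 0.4]})
best['type'] = 'best'
comp = pd.DataFrame({'label': ['a', 'b'], 'precision_at_recall': [0.5, 0.3]})
comp['type'] = 'comp'

out = pd.concat([comp, best], axis=0)
counts = pd.DataFrame({'label': ['a', 'b'], 'n_papers': [120, 45]})
out = pd.merge(out, counts, how='left', on='label')
print(out)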
def pull_results(ignition_ids,
                 table_name='results.evaluate_rg',
                 metric_col='metrics',
                 metrics=['precision_at_recall'],
                 other_cols=[
                     'ignition_id', 'hash_id', 'algorithm', 'hyperparameters',
                     'fold', 'recall'
                 ]):
    """
    Pull results from PSQL table into long dataframe.

    Parameters
    ==========
    ignition_ids : list
        List of ignition_ids to pull into table.
    table_name : str
        Name of PSQL table with results.
    metric_col : str
        Name of column where metrics exist.
    metrics : list
        Metrics to be included in table. Will be parsed from jsonb.
    other_cols : list
        List of other columns to include in table as is.

    Returns
    =======
    results_df : pd.DataFrame
        Long dataframe with results for the specified ignition ids and
        metrics, one block of rows per label (labels are read from the
        baseline ignition file's classes).
    """
    local_paths_env = load_local_paths('../pipeline/local_paths.yaml')
    env = load_psql_env(local_paths_env['pgpass_path'])
    ignition = load_config(local_paths_env['ignition_path'] +
                           '_1_baseline_ignition.yaml')

    # establish SQL connection
    connection = SQLConn(env)
    connection.open()

    ### Set up ###
    results = {}
    ignition_ids_sql = "('" + "','".join(ignition_ids) + "')"
    other_cols_sql = ",".join(other_cols)

    ### Make one query per label and store the resulting df in a dict ###
    for label in ignition['classes']:
        metrics_sql = f"'{label}' as label"
        for metric in metrics:
            metrics_sql += (f",{metric_col} -> '{metric}' -> "
                            f"'{label.lower()}' as {metric}")
        qy = f"""
            select {other_cols_sql}, {metrics_sql}
            from {table_name}
            where ignition_id in {ignition_ids_sql};
            """
        results[label] = pd.read_sql_query(qy, connection.conn)

    ### Concatenate all dfs into one long df ###
    results_df = pd.concat(results.values(), ignore_index=True)
    connection.close()

    return results_df
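# pull_results builds its SELECT by chaining the Postgres jsonb '->'
# operator once per metric and label. A standalone sketch of the string it
# assembles for a single label (the label name here is hypothetical):
metric_col = 'metrics'
metrics = ['precision_at_recall']
label = 'Heart'

metrics_sql = f"'{label}' as label"
for metric in metrics:
    metrics_sql += f",{metric_col} -> '{metric}' -> '{label.lower()}' as {metric}"

print(f"select ignition_id, {metrics_sql} "
      f"from results.evaluate_rg where ignition_id in ('1','2');")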
# (fragment: the tail of plot_citations_features_small_multiples; the
# earlier part of the function, which defines `data`, `bad_cols`, and
# `save_path`, is not shown here)

    # collect review-group columns, skipping bad columns and cited-* features
    review_groups = []
    for col in list(data.columns):
        if col not in bad_cols and col[:5] != "cited":
            review_groups.append(col)

    ### Prep plot legend ###
    red_patch = mpatches.Patch(color='firebrick',
                               label='Non-review group paper')
    blue_patch = mpatches.Patch(color='#1f497d', label='Review group paper')

    ### Plot ###
    fig = plt.figure(figsize=(40, 40))
    for i, review_group in enumerate(review_groups):
        plt.subplot(8, 8, i + 1)
        plot_citations_histograms(data=data, review_group=review_group)
    # plt.suptitle('Distribution of proportion of cited papers belonging to '
    #              'that review group', fontsize=50, y=1.05)
    plt.figlegend(handles=[red_patch, blue_patch],
                  loc='lower right',
                  fontsize=30)
    plt.tight_layout()
    plt.savefig(save_path)


if __name__ == "__main__":
    connection = SQLConn(
        load_psql_env(
            load_local_paths('../pipeline/local_paths.yaml')['pgpass_path']))
    connection.open()
    plot_citations_features_small_multiples(conn=connection)
    connection.close()
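# A compact, self-contained version of the small-multiples pattern above,
# with random data standing in for the citation proportions (grid and
# figure size shrunk for the demo):
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np

fig = plt.figure(figsize=(8, 8))
rng = np.random.default_rng(0)
for i in range(4):
    plt.subplot(2, 2, i + 1)
    plt.hist(rng.random(100), color='#1f497d')
    plt.title(f'group_{i}')
blue_patch = mpatches.Patch(color='#1f497d', label='Review group paper')
plt.figlegend(handles=[blue_patch], loc='lower right')
plt.tight_layout()
plt.savefig('small_multiples_demo.png')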
def run_pipeline(ignition_file, persist_all, load_all_fresh):
    """
    An ad hoc pipeline created to mirror the standard ML pipeline and work
    with citations data.

    Parameters:
    ===========
    ignition_file : string
        name of the yaml file for the experiment you want to run
    persist_all : boolean
        True if you want to persist all data for future use
    load_all_fresh : boolean
        True if you want to avoid any persisted data and load new data
        from scratch

    Returns:
    ========
    None
    """
    model_parts = {}

    ##### 1. LOAD ENVIRONMENT DATA #####

    # load local paths
    local_paths_env = load_local_paths('local_paths.yaml')
    print('Local paths loaded.')

    # load ignition file
    ignition = load_config(local_paths_env['ignition_path'] + ignition_file)
    print('Ignition loaded.')

    # id used for persisting
    hash_id = create_hash_id(str(ignition['id']))
    print('Hash id created.')

    # create hyperparameter combinations (for k-folding)
    hyperparameters = expand_grid(ignition['hyperparameters'])

    # load environment file
    psql_env = load_psql_env(pgpass_path=local_paths_env['pgpass_path'])
    print('PSQL environment file loaded.')

    # initiate PSQL connection
    connection = SQLConn(psql_env)
    connection.open()

    ##### 2. LOAD TRAIN AND TEST DATA #####

    if check_persisted(local_paths_env['store_train_data'], f'{hash_id}_x',
                       load_all_fresh):

        print("Found data")
        # data loaded before: load from file
        X_train = load(local_paths_env['store_train_data'], f'{hash_id}_x')
        X_test = load(local_paths_env['store_test_data'], f'{hash_id}_x')
        y_train = load(local_paths_env['store_train_data'], f'{hash_id}_y')
        y_test = load(local_paths_env['store_test_data'], f'{hash_id}_y')
        print('Loaded data from file.')

    else:

        print("Data not found in storage - load from database")
        # data not loaded: pull from database and create features
        X_train, X_test, y_train, y_test = sample(
            ignition, connection, local_paths_env['store_features'])
        print(f"X_train shape: {X_train.shape}")
        print(f"X_test shape: {X_test.shape}")
        print(f"y_train shape: {y_train.shape}")
        print(f"y_test shape: {y_test.shape}")

        # add fold index column to data
        X_train, y_train = k_fold(X_train, y_train, ignition['k_folds'],
                                  ignition['k_folds_seed'])

        # save data to file for future use
        save(X_train, local_paths_env['store_train_data'], f'{hash_id}_x',
             persist_all)
        save(X_test, local_paths_env['store_test_data'], f'{hash_id}_x',
             persist_all)
        save(y_train, local_paths_env['store_train_data'], f'{hash_id}_y',
             persist_all)
        save(y_test, local_paths_env['store_test_data'], f'{hash_id}_y',
             persist_all)

    print('Data loading completed.')

    ##### 3. K-FOLDING #####

    # loop over folds
    for fold in tqdm(range(ignition['k_folds']), desc='Folds'):

        # get fold id hash (for persisting)
        fold_id = create_hash_id(str(ignition['id']) + str(fold))

        # get fold data
        fold_X_train = X_train[X_train['k'] != fold]
        fold_X_test = X_train[X_train['k'] == fold]
        fold_y_train = y_train[y_train['k'] != fold]
        fold_y_test = y_train[y_train['k'] == fold]

        # store fold features, if any
        fold_features = {}

        ##### 4. LOOP OVER HYPERPARAMETERS: TRAIN CLASSIFIER #####

        for hyperparam in tqdm(hyperparameters, desc='Hyperparameters'):

            # create hyperparam unique id and hyperparam-fold unique id
            hyperparam_id = create_hash_id(
                str(ignition['id']) + str(hyperparam))
            hyperparam_fold_id = create_hash_id(
                str(ignition['id']) + str(hyperparam) + str(fold))

            # if not check_val_in_db(connection,
            #                        ignition['results_table_name'],
            #                        'results', 'hash_id', hyperparam_fold_id,
            #                        len(ignition['recalls'])):

            # create classifier of specified type and with specified target
            classifier = select_classifier(ignition["model_type"],
                                           fold_id,
                                           ignition["target"],
                                           ignition["classes"],
                                           fold_features,
                                           hyperparameters=hyperparam,
                                           seed=ignition['seed'],
                                           env=local_paths_env,
                                           load_fresh=load_all_fresh)

            # train classifier
            classifier.train(fold_X_train, fold_y_train)

            ##### 5. TEST CLASSIFIER #####

            # generate predictions from classifier
            y_probs = classifier.predict(fold_X_test)

            ##### 6. EVALUATION #####

            for recall in tqdm(ignition['recalls'], desc='Evaluations'):

                # compute evaluation metrics
                all_metrics = compute_metrics(
                    metric_names=ignition['metrics'],
                    y_true=fold_y_test.drop(columns=['k']),
                    y_pred=y_probs,
                    k=recall)

                # store results in database
                unique_id = create_hash_id(
                    str(ignition['id']) + str(hyperparam) + str(fold) +
                    str(recall))
                results_to_db(metrics=all_metrics,
                              table_name=ignition['results_table_name'],
                              ignition_id=ignition['id'],
                              hash_id=hyperparam_fold_id,
                              algorithm=ignition['model_type'],
                              hyperparameters=hyperparam,
                              fold=str(fold),
                              recall=recall,
                              unique_id=unique_id,
                              connection=connection)

    connection.close()
    print(f"Done running pipeline for ignition id: {ignition['id']}!")
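# Step 3 of run_pipeline filters train and validation rows via a fold index
# column 'k' that k_fold() is assumed to append to the data. A toy sketch
# of that split:
import pandas as pd

X = pd.DataFrame({'feature': [1, 2, 3, 4], 'k': [0, 1, 0, 1]})
for fold in range(2):
    fold_train = X[X['k'] != fold]   # everything outside the held-out fold
    fold_valid = X[X['k'] == fold]   # the held-out fold
    print(fold, len(fold_train), len(fold_valid))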
def perform_model_selection(evaluate_best_models=True):
    """
    Run model selection pipeline.
    """
    # load local paths file
    local_paths = load_local_paths('local_paths.yaml')

    # load production config file
    prod_config = load_config('../prod/prod_config.yaml', append_static=False)

    # SQL set up
    psql_env = load_psql_env(pgpass_path=local_paths['pgpass_path'])
    connection = SQLConn(psql_env)
    connection.open()

    # pull data
    X_train, X_test, y_train, y_test = sample(
        ignition=prod_config,
        connection=connection,
        local_features_path=local_paths['store_features'])

    if evaluate_best_models:

        # test best models for each review group
        scored_papers_test = load(location=local_paths['store_scored_papers'],
                                  filename='scored_papers')
        y_pred_test = scored_papers_test[[
            col for col in scored_papers_test.columns
            if col.upper() in prod_config['review_groups_recall'].keys()
        ]]
        y_test = y_test[[
            col for col in y_test.columns
            if col.upper() in prod_config['review_groups_recall'].keys()
        ]]

        # calculate thresholds
        upper_thresholds, lower_thresholds, recall_at_thresholds = get_thresholds(
            y_test, y_pred_test, minimum_recall=0.99)

        # persist thresholds for production
        save(upper_thresholds, local_paths['store_production_models'],
             'upper_thresholds')
        save(lower_thresholds, local_paths['store_production_models'],
             'lower_thresholds')

        # calculate workload reductions
        keep, consider, discard = get_workload_reduction(
            y_test, y_pred_test, upper_thresholds, lower_thresholds)

        rg_list = []
        wrkld_reductions = []

        # loop over review groups
        for review_group in tqdm(prod_config['review_groups_recall'].keys(),
                                 desc='Review Group'):
            rg = review_group.lower()

            # get thresholds and the recall achieved at each
            thresholds = [upper_thresholds[rg], lower_thresholds[rg]]
            recall_at_threshold = [
                recall_at_thresholds[rg]['upper'],
                recall_at_thresholds[rg]['lower']
            ]
            workload_reduction = [keep[rg], consider[rg], discard[rg]]
            rg_list.append(rg)
            wrkld_reductions.append(workload_reduction)

        # plot average workload reduction across review groups
        d = {'review_group': rg_list, 'workload_reduction': wrkld_reductions}
        df = pd.DataFrame.from_dict(d)
        plot_average_workload_reduction(df)

    connection.close()
    print("Model selection pipeline complete.")
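# One plausible reading of the keep / consider / discard buckets returned
# by get_workload_reduction (an assumption, not confirmed by this code):
# papers scoring above the upper threshold are kept automatically, papers
# below the lower threshold are discarded, and the rest still need manual
# screening. A numeric sketch with invented scores and thresholds:
import numpy as np

scores = np.array([0.05, 0.20, 0.55, 0.80, 0.97])
upper, lower = 0.90, 0.10

keep = (scores >= upper).mean()     # fraction auto-included
discard = (scores <= lower).mean()  # fraction auto-excluded
consider = 1.0 - keep - discard     # fraction left for manual screening
print(keep, consider, discard)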
def perform_model_selection(evaluate_best_models=True):
    """
    Run model selection pipeline.
    """
    # load local paths file
    local_paths = load_local_paths('local_paths.yaml')

    # load production config file
    prod_config = load_config('../prod/prod_config.yaml', append_static=False)

    # load results table from database
    results_df = pull_results(ignition_ids=[
        '1', '2', '4', '5', '6', '10', '15', '16', '17', '18', '19', '20',
        '21'
    ])

    # get a dataframe of the best algorithm x hyperparameters for each
    # review group x recall
    best_df = get_best_algorithm_hyperparameter_onestep(results_df=results_df)

    # get a dictionary of algorithms and hyperparameters for each review
    # group based on the recall in the production config file
    best_models = choose_models_with_recall(
        models_df=best_df,
        group_min_recalls=prod_config['review_groups_recall'])

    # SQL set up
    psql_env = load_psql_env(pgpass_path=local_paths['pgpass_path'])
    connection = SQLConn(psql_env)
    connection.open()

    # pull data
    X_train, X_test, y_train, y_test = sample(
        ignition=prod_config,
        connection=connection,
        local_features_path=local_paths['store_features'])

    # train best models for each review group
    train_best_models_mp(X_train,
                         y_train,
                         best_models=best_models,
                         prod_config=prod_config,
                         local_paths=local_paths,
                         cores=3)

    if evaluate_best_models:

        # test best models for each review group
        scored_papers_test = score_papers(
            X_test,
            prod_config,
            models_path=local_paths['store_production_models'])
        save(object=scored_papers_test,
             location=local_paths['store_scored_papers'],
             filename='scored_papers_citations')
        y_pred_test = scored_papers_test[[
            col for col in scored_papers_test.columns
            if col.upper() in prod_config['review_groups_recall'].keys()
        ]]
        y_test = y_test[[
            col for col in y_test.columns
            if col.upper() in prod_config['review_groups_recall'].keys()
        ]]

        # calculate thresholds
        upper_thresholds, lower_thresholds, recall_at_thresholds = get_thresholds(
            y_test, y_pred_test, minimum_precision=0.95, minimum_recall=0.99)

        # persist thresholds for production
        save(upper_thresholds, local_paths['store_production_models'],
             'upper_thresholds')
        save(lower_thresholds, local_paths['store_production_models'],
             'lower_thresholds')

        # calculate workload reductions
        keep, consider, discard = get_workload_reduction(
            y_test, y_pred_test, upper_thresholds, lower_thresholds)

        # loop over review groups
        for review_group in tqdm(prod_config['review_groups_recall'].keys(),
                                 desc='Review Group'):
            rg = review_group.lower()

            # get thresholds and the recall achieved at each
            thresholds = [upper_thresholds[rg], lower_thresholds[rg]]
            recall_at_threshold = [
                recall_at_thresholds[rg]['upper'],
                recall_at_thresholds[rg]['lower']
            ]
            workload_reduction = [keep[rg], consider[rg], discard[rg]]

            # evaluate scores
            for recall in tqdm(prod_config['recalls'], desc='Evaluations'):

                # calculate precisions
                precisions = evaluate_precision_at_k_recall(
                    class_true=y_test, class_prob=y_pred_test, k=recall)

                # store results in database
                production_results_to_db(
                    table_name=prod_config['results_table_name'],
                    unique_id=f"{rg}_{recall}",
                    review_group=rg,
                    algorithm=best_models[review_group]['algorithm'],
                    hyperparameters=best_models[review_group]
                    ['hyperparameters'],
                    recall=recall,
                    precision=precisions[rg],
                    thresholds=thresholds,
                    recall_at_threshold=recall_at_threshold,
                    workload_reduction=workload_reduction,
                    connection=connection)

    connection.close()
    print("Model selection pipeline complete.")
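# evaluate_precision_at_k_recall is called once per target recall level. A
# self-contained sketch of precision at a fixed recall for one group, using
# sklearn's PR curve on synthetic labels and scores (the project helper's
# interface may differ):
import numpy as np
from sklearn.metrics import precision_recall_curve

y_true = np.array([0, 0, 1, 1, 1, 0, 1])
y_prob = np.array([0.1, 0.4, 0.35, 0.8, 0.65, 0.2, 0.9])

precision, recall, _ = precision_recall_curve(y_true, y_prob)
k = 0.95  # target recall
# best precision among operating points that still achieve recall >= k
print(precision[recall >= k].max())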