def heatmap_data(igs, comp_ig='1', local_paths='../pipeline/local_paths.yaml'):
    """
    Prepares the data to be plotted in the heatmap and stores it as a csv.

    Parameters
    ==========
    igs : list
        List of strings of ignition ids to be pulled from the database.
    comp_ig : str
        Ignition id of the run used as the comparison.
    local_paths : str
        Path to the local_paths.yaml file.
    """
    out_dir = load_local_paths(local_paths)['tmp']

    # pull in results data
    results = pull_results(igs)
    best = get_best_hyperparam_all(results)
    best = best[[
        'algorithm', 'hyperparameters', 'label', 'recall',
        'precision_at_recall'
    ]]
    best['type'] = 'best'
    comp = get_avg_ignition(results, comp_ig)
    comp['type'] = 'comp'
    out = pd.concat([comp, best], axis=0)

    # pull review group data
    connection = SQLConn(
        load_psql_env(load_local_paths(local_paths)['pgpass_path']))
    connection.open()
    papers_rgs = connection.query(
        'select inregister as label, count(*) as n_papers '
        'from semantic.papers_rgs group by 1;')
    papers_revs = connection.query("""
        with tbl as (
            select a.*, b.cn
            from semantic.papers_rgs a
            left join semantic.papers_reviews b
            on a.recordid = b.recordid)
        select inregister as label, count(distinct cn) as n_revs
        from tbl
        group by 1;
    """)
    connection.close()

    # final dataset
    out = pd.merge(out, papers_rgs, how='left', on='label')
    out = pd.merge(out, papers_revs, how='left', on='label')

    # output dataset
    out.to_csv(out_dir + 'heatmap_data.csv')
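# A minimal usage sketch (the ignition ids below are illustrative assumptions;
# '1' is presumably the baseline run used for comparison). The call writes
# heatmap_data.csv into the configured tmp directory.
def _example_heatmap_data():  # hypothetical helper, for illustration only
    heatmap_data(igs=['1', '2', '4'], comp_ig='1')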
def create_viz_production(results_table_name='final_model_eval'):
    """
    Creates visualizations for the models that are trained on the full
    training data set and used in production.

    Parameters
    ----------
    results_table_name : str
        Name of the SQL table that contains results (from the test set)
        for the final models.
    """
    # set up required variables
    local_paths_env = load_local_paths('../pipeline/local_paths.yaml')
    env = load_psql_env(local_paths_env['pgpass_path'])
    ignition = load_config(local_paths_env['ignition_path'] +
                           '_1_baseline_ignition.yaml')

    # open sql connection
    connection = SQLConn(env)
    connection.open()

    # pull data from table
    query = f"select * from results.{results_table_name};"
    results_df = pd.read_sql_query(query, connection.conn)
    results_df['label'] = results_df['review_group']
    connection.close()

    # create directory for visualizations
    vis_dir = f'{local_paths_env["store_visualizations"]}/production_citations'

    # create folders to store visualizations
    if not os.path.exists(vis_dir):
        os.makedirs(vis_dir)

    # precision-recall plots
    for rg in tqdm(results_df['review_group'].unique()):
        plot = plot_precision_recall_curve_best(results_df,
                                                rg,
                                                plot_baseline=False)
        plt.savefig(f'{vis_dir}/pr_curve-{rg}.png')
        plt.close()

    # stacked bar workload
    plot = workload_relative_stackedbar(results_df)
    plt.savefig(f'{vis_dir}/workload_relative.png')
    plt.close()

    plot_average_workload_reduction(results_df)
    plt.savefig(f'{vis_dir}/workload_average.png')
    plt.close()
def pull_results(ignition_ids,
                 table_name='results.evaluate_rg',
                 metric_col='metrics',
                 metrics=['precision_at_recall'],
                 other_cols=[
                     'ignition_id', 'hash_id', 'algorithm', 'hyperparameters',
                     'fold', 'recall'
                 ]):
    """
    Pull results from a PSQL table into a long dataframe.

    Parameters
    ==========
    ignition_ids : list
        List of ignition_ids to pull into the table.
    table_name : str
        Name of the PSQL table with results.
    metric_col : str
        Name of the column where the metrics are stored.
    metrics : list
        Metrics to be included in the table. Will be parsed from jsonb.
    other_cols : list
        List of other columns to be included in the table as is.

    Returns
    =======
    results_df : pd.DataFrame
        Long dataframe with results from the specified ignition files,
        metrics, and labels.
    """
    local_paths_env = load_local_paths('../pipeline/local_paths.yaml')
    env = load_psql_env(local_paths_env['pgpass_path'])
    ignition = load_config(local_paths_env['ignition_path'] +
                           '_1_baseline_ignition.yaml')

    # establish SQL connection
    connection = SQLConn(env)
    connection.open()

    ### Set up ###
    results = {}
    ignition_ids_sql = "('" + "','".join(ignition_ids) + "')"
    other_cols_sql = ",".join(other_cols)

    ### Make one query for each label and store the resulting df in a dict ###
    for label in ignition['classes']:
        metrics_sql = f"'{label}' as label"
        for metric in metrics:
            metrics_sql += (f",{metric_col} -> '{metric}' -> "
                            f"'{label.lower()}' as {metric}")
        qy = f"""
            select {other_cols_sql}, {metrics_sql}
            from {table_name}
            where ignition_id in {ignition_ids_sql};
            """
        results[label] = pd.read_sql_query(qy, connection.conn)

    ### Concatenate all dfs into one long df ###
    results_df = pd.concat(results.values(), ignore_index=True)

    connection.close()

    return results_df
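# A minimal usage sketch (the ignition ids are illustrative assumptions). The
# returned frame is long: one row per ignition_id x hash_id x fold x recall x
# label, with each requested metric parsed out of the jsonb metrics column.
def _example_pull_results():  # hypothetical helper, for illustration only
    results_df = pull_results(ignition_ids=['1', '2'])
    print(results_df[[
        'ignition_id', 'algorithm', 'label', 'recall', 'precision_at_recall'
    ]].head())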
def main(ignition_ids=[
    '1', '2', '4', '5', '7', '10', '15', '16', '17', '18', '19', '20', '21'
]):
    """
    Script for creating and storing a quick visualization of the results for
    models trained during cross-validation.

    Parameters
    ==========
    ignition_ids : list
        List of ignition ids (as strings) whose stored results should be
        taken into account for the visualization.
    """
    # load env file containing location to store visualizations
    local_paths_env = load_local_paths('../pipeline/local_paths.yaml')

    best_all_ignitions = []

    for ignition_id in tqdm(ignition_ids, desc='Ignition id'):
        vis_dir = f'{local_paths_env["store_visualizations"]}/{ignition_id}'

        # create folders to store visualizations
        if not os.path.exists(vis_dir):
            os.makedirs(vis_dir)

        # pull results
        results_table = pull_results(ignition_ids=[ignition_id],
                                     table_name='results.evaluate_rg')

        # calculate best results for this model
        results_best_hyperparam = get_best_hyperparam_algorithm(results_table)
        results_best_hyperparam.reset_index().to_csv(
            f'{local_paths_env["store_visualizations"]}/results_{ignition_id}.csv')
        best_all_ignitions.append(results_best_hyperparam)

        # plots for each review group separately
        for rgroup in results_table['label'].unique():

            # precision-recall curves for all hyperparameters
            plt = plot_precision_recall_curve_hyperparams(
                results_table, rgroup)
            plt.savefig(f'{vis_dir}/pr_curve_allhyperparam-{rgroup}.png')
            plt.close()

            # precision-recall curves for best hyperparameters
            plt = plot_optimal_precision_recall_curve(results_best_hyperparam,
                                                      rgroup)
            plt.savefig(f'{vis_dir}/pr_curve_besthyperparam-{rgroup}.png')
            plt.close()

        # plot distribution of precisions at specified recalls
        for recall in [0.9, 0.95, 0.97, 0.99]:
            plot = plot_distribution_precision(results_best_hyperparam,
                                               recall)
            plt.savefig(
                f'{vis_dir}/precision_distribution-recall_{recall}.png')
            plt.close()

    # concatenate best results from each model into one dataframe
    best_all_ignitions = pd.concat(best_all_ignitions)

    # calculate best results across models
    best_overall = get_best_hyperparam_all(best_all_ignitions)
    best_overall.reset_index().drop(columns=['index']).to_csv(
        f'{local_paths_env["store_visualizations"]}/results_overall.csv')

    # create directory for best visualizations
    vis_dir = f'{local_paths_env["store_visualizations"]}/overall'

    # create folders to store visualizations
    if not os.path.exists(vis_dir):
        os.makedirs(vis_dir)

    for group in best_overall['label'].unique():

        # precision-recall curves for the best models for each ignition id
        # (so the best hyperparameter combination is chosen for each model)
        plot = plot_precision_recall_curve_best(best_all_ignitions, group)
        plt.savefig(f'{vis_dir}/pr_curve-{group}.png')
        plt.close()

        # precision-recall curve for the top-5 models
        n = 5
        plot = plot_precision_recall_curve_best(best_all_ignitions,
                                                group,
                                                best_n=n)
        plt.savefig(f'{vis_dir}/pr_curve_best{n}-{group}.png')
        plt.close()
    review_groups = []
    for col in list(data.columns):
        if col not in bad_cols and col[:5] != "cited":
            review_groups.append(col)

    ### Prep plot legend ###
    red_patch = mpatches.Patch(color='firebrick',
                               label='Non-review group paper')
    blue_patch = mpatches.Patch(color='#1f497d', label='Review group paper')

    ### Plot ###
    fig = plt.figure(figsize=(40, 40))
    for i, review_group in enumerate(review_groups):
        plt.subplot(8, 8, i + 1)
        plot_citations_histograms(data=data, review_group=review_group)
    # plt.suptitle('Distribution of proportion of cited papers belonging to that review group', fontsize=50, y=1.05)
    plt.figlegend(handles=[red_patch, blue_patch],
                  loc='lower right',
                  fontsize=30)
    plt.tight_layout()
    plt.savefig(save_path)


if __name__ == "__main__":
    connection = SQLConn(
        load_psql_env(
            load_local_paths('../pipeline/local_paths.yaml')['pgpass_path']))
    connection.open()
    plot_citations_features_small_multiples(conn=connection)
    connection.close()
def run_pipeline(ignition_file, persist_all, load_all_fresh):
    """
    An ad hoc pipeline created to mirror the standard ML pipeline and work
    with citations data.

    Parameters
    ==========
    ignition_file : string
        Name of the yaml file for which you want to run an experiment.
    persist_all : boolean
        True if you want to persist all data for future use.
    load_all_fresh : boolean
        True if you want to avoid any persisted data and load new data from
        scratch.

    Returns
    =======
    None
    """
    model_parts = {}

    ##### 1. LOAD ENVIRONMENT DATA #####

    # load local paths
    local_paths_env = load_local_paths('local_paths.yaml')
    print('Local paths loaded.')

    # load ignition file
    ignition = load_config(local_paths_env['ignition_path'] + ignition_file)
    print('Ignition loaded.')

    # id used for persisting
    hash_id = create_hash_id(str(ignition['id']))
    print('Hash id created.')

    # create hyperparameter combinations (for k-folding)
    hyperparameters = expand_grid(ignition['hyperparameters'])

    # load environment file
    psql_env = load_psql_env(pgpass_path=local_paths_env['pgpass_path'])
    print('PSQL environment file loaded.')

    # initiate PSQL connection
    connection = SQLConn(psql_env)
    connection.open()

    ##### 2. LOAD TRAIN AND TEST DATA #####

    if check_persisted(local_paths_env['store_train_data'], f'{hash_id}_x',
                       load_all_fresh):
        print("Found data")
        # data loaded before: load from file
        X_train = load(local_paths_env['store_train_data'], f'{hash_id}_x')
        X_test = load(local_paths_env['store_test_data'], f'{hash_id}_x')
        y_train = load(local_paths_env['store_train_data'], f'{hash_id}_y')
        y_test = load(local_paths_env['store_test_data'], f'{hash_id}_y')
        print('Loaded data from file.')

    else:
        print("Data not found in storage - load from database")
        # data not loaded: pull from database and create features
        X_train, X_test, y_train, y_test = sample(
            ignition, connection, local_paths_env['store_features'])
        print(f"X_train shape: {X_train.shape}")
        print(f"X_test shape: {X_test.shape}")
        print(f"y_train shape: {y_train.shape}")
        print(f"y_test shape: {y_test.shape}")

        # add fold index column to data
        X_train, y_train = k_fold(X_train, y_train, ignition['k_folds'],
                                  ignition['k_folds_seed'])

        # save data to file for future use
        save(X_train, local_paths_env['store_train_data'], f'{hash_id}_x',
             persist_all)
        save(X_test, local_paths_env['store_test_data'], f'{hash_id}_x',
             persist_all)
        save(y_train, local_paths_env['store_train_data'], f'{hash_id}_y',
             persist_all)
        save(y_test, local_paths_env['store_test_data'], f'{hash_id}_y',
             persist_all)

    print('Data loading completed.')

    ##### 3. K-FOLDING #####

    # loop over folds
    for fold in tqdm(range(ignition['k_folds']), desc='Folds'):

        # get fold id hash (for persisting)
        fold_id = create_hash_id(str(ignition['id']) + str(fold))

        # get fold data
        fold_X_train = X_train[X_train['k'] != fold]
        fold_X_test = X_train[X_train['k'] == fold]
        fold_y_train = y_train[y_train['k'] != fold]
        fold_y_test = y_train[y_train['k'] == fold]

        # store fold features, if any
        fold_features = {}

        ##### 4. LOOP OVER HYPERPARAMETERS: TRAIN CLASSIFIER #####

        for hyperparam in tqdm(hyperparameters, desc='Hyperparameters'):

            # create hyperparam unique id and hyperparam-fold unique id
            hyperparam_id = create_hash_id(
                str(ignition['id']) + str(hyperparam))
            hyperparam_fold_id = create_hash_id(
                str(ignition['id']) + str(hyperparam) + str(fold))

            # if not check_val_in_db(connection, ignition['results_table_name'],
            #                        'results', 'hash_id', hyperparam_fold_id,
            #                        len(ignition['recalls'])):

            # create classifier of specified type and with specified target
            classifier = select_classifier(ignition["model_type"],
                                           fold_id,
                                           ignition["target"],
                                           ignition["classes"],
                                           fold_features,
                                           hyperparameters=hyperparam,
                                           seed=ignition['seed'],
                                           env=local_paths_env,
                                           load_fresh=load_all_fresh)
            # print('Classifier created.')

            # train classifier
            classifier.train(fold_X_train, fold_y_train)

            ##### 5. TEST CLASSIFIER #####

            # generate predictions from classifier
            y_probs = classifier.predict(fold_X_test)

            ##### 6. EVALUATION #####

            for recall in tqdm(ignition['recalls'], desc='Evaluations'):

                # compute evaluation metrics
                all_metrics = compute_metrics(
                    metric_names=ignition['metrics'],
                    y_true=fold_y_test.drop(columns=['k']),
                    y_pred=y_probs,
                    k=recall)

                # store results in database
                unique_id = create_hash_id(
                    str(ignition['id']) + str(hyperparam) + str(fold) +
                    str(recall))
                results_to_db(metrics=all_metrics,
                              table_name=ignition['results_table_name'],
                              ignition_id=ignition['id'],
                              hash_id=hyperparam_fold_id,
                              algorithm=ignition['model_type'],
                              hyperparameters=hyperparam,
                              fold=str(fold),
                              recall=recall,
                              unique_id=unique_id,
                              connection=connection)

    connection.close()

    print(f"Done running pipeline for ignition id: {ignition['id']}!")
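# For reference, a minimal sketch of the kind of grid expansion expand_grid()
# is expected to perform (an assumption -- the real helper lives elsewhere in
# the pipeline and may differ): it turns {'C': [0.1, 1], 'penalty': ['l1', 'l2']}
# into a list of four {'C': ..., 'penalty': ...} dicts via a Cartesian product,
# one dict per hyperparameter combination looped over above.
from itertools import product


def expand_grid_sketch(grid):
    """Return every combination of values in a {name: [values, ...]} dict."""
    keys = list(grid.keys())
    return [
        dict(zip(keys, combo)) for combo in product(*(grid[k] for k in keys))
    ]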
    print('semantic.citations_avg created and populated')

    conn.commit()
    cur.close()
    conn.close()

    end_time = datetime.datetime.now()
    print("End time is " + str(end_time))
    print("Elapsed time is " + str(end_time - start_time))


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print('Please specify one (and only one) argument for starting_point.')
    else:
        starting_point = sys.argv[1]
        dot_env = load_local_paths('../pipeline/local_paths.yaml')
        env = load_psql_env(pgpass_path=dot_env['pgpass_path'])
        print(env)
        if starting_point == 'no_sql':
            reviews_dir = '/data/raw/reviews/'
            citations_dict = {
                'citations': '/data/citations/TuringCitations.csv',
                'recordid_paperid': '/data/citations/TuringCRSPMRecords.csv'
            }
            run_etl(starting_point=starting_point,
                    env=env,
                    reviews_dir=reviews_dir,
                    citations_dict=citations_dict)
        else:
            run_etl(starting_point=starting_point, env=env)
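# Example invocations (a sketch; the script filename is an assumption):
#   python run_etl.py no_sql        # build from the raw review/citation csvs
#   python run_etl.py <step_name>   # resume the ETL from a later starting point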
def perform_model_selection(evaluate_best_models=True):
    """
    Run model selection pipeline.
    """
    # Load local paths file
    local_paths = load_local_paths('local_paths.yaml')

    # Load product config file
    prod_config = load_config('../prod/prod_config.yaml', append_static=False)

    # SQL set up
    psql_env = load_psql_env(pgpass_path=local_paths['pgpass_path'])
    connection = SQLConn(psql_env)
    connection.open()

    # Pull data
    X_train, X_test, y_train, y_test = sample(
        ignition=prod_config,
        connection=connection,
        local_features_path=local_paths['store_features'])

    if evaluate_best_models:

        # Test best models for each review group
        scored_papers_test = load(location=local_paths['store_scored_papers'],
                                  filename='scored_papers')

        y_pred_test = scored_papers_test[[
            col for col in scored_papers_test.columns
            if col.upper() in prod_config['review_groups_recall'].keys()
        ]]
        y_test = y_test[[
            col for col in y_test.columns
            if col.upper() in prod_config['review_groups_recall'].keys()
        ]]

        # calculate thresholds
        upper_thresholds, lower_thresholds, recall_at_thresholds = get_thresholds(
            y_test, y_pred_test, minimum_recall=0.99)

        # persist thresholds for production
        save(upper_thresholds, local_paths['store_production_models'],
             'upper_thresholds')
        save(lower_thresholds, local_paths['store_production_models'],
             'lower_thresholds')

        # calculate workload reductions
        keep, consider, discard = get_workload_reduction(
            y_test, y_pred_test, upper_thresholds, lower_thresholds)

        rg_list = []
        wrkld_reductions = []

        # loop over review groups
        for review_group in tqdm(prod_config['review_groups_recall'].keys(),
                                 desc='Review Group'):
            rg = review_group.lower()

            # get thresholds
            thresholds = [upper_thresholds[rg], lower_thresholds[rg]]
            recall_at_threshold = [
                recall_at_thresholds[rg]['upper'],
                recall_at_thresholds[rg]['lower']
            ]
            workload_reduction = [keep[rg], consider[rg], discard[rg]]
            rg_list.append(rg)
            wrkld_reductions.append(workload_reduction)

        d = {'review_group': rg_list, 'workload_reduction': wrkld_reductions}
        df = pd.DataFrame.from_dict(d)

        plot_average_workload_reduction(df)

    connection.close()

    print("Model selection pipeline complete.")
def perform_model_selection(evaluate_best_models=True):
    """
    Run model selection pipeline.
    """
    # Load local paths file
    local_paths = load_local_paths('local_paths.yaml')

    # Load product config file
    prod_config = load_config('../prod/prod_config.yaml', append_static=False)

    # Load results table from dbs
    results_df = pull_results(ignition_ids=[
        '1', '2', '4', '5', '6', '10', '15', '16', '17', '18', '19', '20',
        '21'
    ])

    # Get a dataframe of best algorithm x hyperparameters for each RG x recall
    best_df = get_best_algorithm_hyperparameter_onestep(results_df=results_df)

    # Get dictionary of algorithms and hyperparameters for each
    # review group based on recall in product config file
    best_models = choose_models_with_recall(
        models_df=best_df,
        group_min_recalls=prod_config['review_groups_recall'])

    # SQL set up
    psql_env = load_psql_env(pgpass_path=local_paths['pgpass_path'])
    connection = SQLConn(psql_env)
    connection.open()

    # Pull data
    X_train, X_test, y_train, y_test = sample(
        ignition=prod_config,
        connection=connection,
        local_features_path=local_paths['store_features'])

    # Train best models for each review group
    train_best_models_mp(X_train,
                         y_train,
                         best_models=best_models,
                         prod_config=prod_config,
                         local_paths=local_paths,
                         cores=3)

    if evaluate_best_models:

        # Test best models for each review group
        scored_papers_test = score_papers(
            X_test,
            prod_config,
            models_path=local_paths['store_production_models'])
        save(object=scored_papers_test,
             location=local_paths['store_scored_papers'],
             filename='scored_papers_citations')

        y_pred_test = scored_papers_test[[
            col for col in scored_papers_test.columns
            if col.upper() in prod_config['review_groups_recall'].keys()
        ]]
        y_test = y_test[[
            col for col in y_test.columns
            if col.upper() in prod_config['review_groups_recall'].keys()
        ]]

        # calculate thresholds
        upper_thresholds, lower_thresholds, recall_at_thresholds = get_thresholds(
            y_test, y_pred_test, minimum_precision=0.95, minimum_recall=0.99)

        # persist thresholds for production
        save(upper_thresholds, local_paths['store_production_models'],
             'upper_thresholds')
        save(lower_thresholds, local_paths['store_production_models'],
             'lower_thresholds')

        # calculate workload reductions
        keep, consider, discard = get_workload_reduction(
            y_test, y_pred_test, upper_thresholds, lower_thresholds)

        # loop over review groups
        for review_group in tqdm(prod_config['review_groups_recall'].keys(),
                                 desc='Review Group'):
            rg = review_group.lower()

            # get thresholds
            thresholds = [upper_thresholds[rg], lower_thresholds[rg]]
            recall_at_threshold = [
                recall_at_thresholds[rg]['upper'],
                recall_at_thresholds[rg]['lower']
            ]
            workload_reduction = [keep[rg], consider[rg], discard[rg]]

            # evaluate scores
            for recall in tqdm(prod_config['recalls'], desc='Evaluations'):

                # calculate precisions
                precisions = evaluate_precision_at_k_recall(
                    class_true=y_test, class_prob=y_pred_test, k=recall)

                # store results in database
                production_results_to_db(
                    table_name=prod_config['results_table_name'],
                    unique_id=f"{rg}_{recall}",
                    review_group=rg,
                    algorithm=best_models[review_group]['algorithm'],
                    hyperparameters=best_models[review_group]['hyperparameters'],
                    recall=recall,
                    precision=precisions[rg],
                    thresholds=thresholds,
                    recall_at_threshold=recall_at_threshold,
                    workload_reduction=workload_reduction,
                    connection=connection)

    connection.close()

    print("Model selection pipeline complete.")
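# A minimal sketch of how the per-group upper/lower score thresholds could
# translate into the keep / consider / discard workload-reduction fractions
# used above (an assumption -- the real get_workload_reduction() helper may
# compute these differently).
import numpy as np


def workload_split_sketch(scores, upper, lower):
    """Fractions of papers auto-kept, sent to manual screening, and auto-discarded."""
    scores = np.asarray(scores, dtype=float)
    keep = float(np.mean(scores >= upper))      # confidently relevant to the group
    discard = float(np.mean(scores < lower))    # confidently irrelevant
    consider = 1.0 - keep - discard             # left for manual screening
    return keep, consider, discard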