Code example #1
def heatmap_data(igs, comp_ig='1', local_paths='../pipeline/local_paths.yaml'):
    """
    
    Function that prepares data to be plotted in heatmap, and is stored as csv.

    Parameters
    ==========
    igs : list
        List of strings of ignition ids to be pulled from database.
    
    """

    out_dir = load_local_paths(local_paths)['tmp']

    # pull in results data
    results = pull_results(igs)
    best = get_best_hyperparam_all(results)
    best = best[[
        'algorithm', 'hyperparameters', 'label', 'recall',
        'precision_at_recall'
    ]]
    best['type'] = 'best'
    comp = get_avg_ignition(results, comp_ig)
    comp['type'] = 'comp'
    out = pd.concat([comp, best], axis=0)

    # pull rg data
    connection = SQLConn(
        load_psql_env(load_local_paths(local_paths)['pgpass_path']))
    connection.open()
    papers_rgs = connection.query(
        'select inregister as label, count(*) as n_papers from semantic.papers_rgs group by 1;'
    )
    papers_revs = connection.query(f"""
                    with tbl as(select a.*, b.cn from semantic.papers_rgs a
                    left join semantic.papers_reviews b on a.recordid=b.recordid)
                    select inregister as label, count(distinct cn) as n_revs from tbl group by 1;
                    """)

    # final dataset
    out = pd.merge(out, papers_rgs, how='left', on='label')
    out = pd.merge(out, papers_revs, how='left', on='label')

    # close connection and output dataset
    connection.close()
    out.to_csv(out_dir + 'heatmap_data.csv')
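
A minimal usage sketch, assuming the pipeline helpers (pull_results, get_best_hyperparam_all, get_avg_ignition) are importable and local_paths.yaml is in place; the ignition ids are illustrative:

# hypothetical call: compare the best results of ignitions 1, 2 and 4
# against the averaged comparison ignition '1'
heatmap_data(igs=['1', '2', '4'], comp_ig='1')
# the CSV lands in the 'tmp' directory from local_paths.yaml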
Code example #2
def create_viz_production(results_table_name='final_model_eval'):
    """
    Creates visualizations for models that are trained on the full training data set,
    and are used in production.

    Parameters
    ----------

    results_table_name : str
        The name of a SQL table which contains results (from the test set) about the final
        models.
    """

    # set up required variables
    local_paths_env = load_local_paths('../pipeline/local_paths.yaml')
    env = load_psql_env(local_paths_env['pgpass_path'])

    # open sql connection
    connection = SQLConn(env)
    connection.open()

    # pull data from table
    query = f"select * from results.{results_table_name};"
    results_df = pd.read_sql_query(query, connection.conn)
    results_df['label'] = results_df['review_group']

    # create directory for visualizations
    vis_dir = f'{local_paths_env["store_visualizations"]}/production_citations'

    # create folders to store visualizations
    if not os.path.exists(vis_dir):
        os.makedirs(vis_dir)

    # precision recall plots
    for rg in tqdm(results_df['review_group'].unique()):
        plot = plot_precision_recall_curve_best(results_df,
                                                rg,
                                                plot_baseline=False)
        plt.savefig(f'{vis_dir}/pr_curve-{rg}.png')
        plt.close()

    # stacked bar workload
    plot = workload_relative_stackedbar(results_df)
    plt.savefig(f'{vis_dir}/workload_relative.png')
    plt.close()

    plot_average_workload_reduction(results_df)
    plt.savefig(f'{vis_dir}/workload_average.png')
    plt.close()

    connection.close()
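
A minimal sketch of a call, assuming the production run has populated the default results.final_model_eval table:

# hypothetical: regenerate all production visualizations from the default table
create_viz_production(results_table_name='final_model_eval')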
Code example #3
def pull_results(ignition_ids,
                 table_name='results.evaluate_rg',
                 metric_col='metrics',
                 metrics=['precision_at_recall'],
                 other_cols=[
                     'ignition_id', 'hash_id', 'algorithm', 'hyperparameters',
                     'fold', 'recall'
                 ]):
    """
    Pull results from PSQL table into long dataframe.

    Parameters
    ==========
    ignition_ids : list
        List of ignition_ids to pull into the table.
    table_name : str
        Name of the PSQL table with results.
    metric_col : str
        Name of the column where the metrics live.
    metrics : list
        Metrics to be included in the table. Will be parsed from jsonb.
    other_cols : list
        List of other columns to be included in the table as is.

    Returns
    =======
    results_df : pd.DataFrame
        Long dataframe with results from specified ignition files, metrics, and labels.
    """

    local_paths_env = load_local_paths('../pipeline/local_paths.yaml')
    env = load_psql_env(local_paths_env['pgpass_path'])
    ignition = load_config(local_paths_env['ignition_path'] +
                           '_1_baseline_ignition.yaml')

    # establish SQL Connection
    connection = SQLConn(env)
    connection.open()

    ### Set up ###
    results = {}
    # build a SQL IN-list, e.g. ('1','2','4'), and the select-column string
    ignition_ids_sql = "('" + "','".join(ignition_ids) + "')"
    other_cols_sql = ",".join(other_cols)

    ### Make one query for each label and store the resulting df in a dict ###
    for label in ignition['classes']:
        metrics_sql = f"'{label}' as label"
        for metric in metrics:
            metrics_sql += f",{metric_col} -> '{metric}' -> '{label.lower()}' as {metric}"

        qy = f"""
        select {other_cols_sql},
        {metrics_sql}
        from
        {table_name}
        where ignition_id in {ignition_ids_sql};
        """

        results[label] = pd.read_sql_query(qy, connection.conn)

    ### Concatenate all dfs into one long df ###
    results_df = pd.concat(results.values(), ignore_index=True)

    connection.close()

    return results_df
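
A minimal usage sketch, assuming the database and ignition yaml are configured as above; the ids are illustrative:

# hypothetical: pull precision_at_recall for two experiments into one long df,
# one row per (label, algorithm, hyperparameters, fold, recall) combination
results_df = pull_results(ignition_ids=['1', '2'])
print(results_df[['label', 'recall', 'precision_at_recall']].head())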
Code example #4
def main(ignition_ids=[
    '1', '2', '4', '5', '7', '10', '15', '16', '17', '18', '19', '20', '21'
]):
    """
    Script for running a quick visualization of results for models
    trained during cross-validation and storing these visualizations.

    Parameters
    ==========
    ignition_ids : list
        List of ignition ids for which results are stored and which
        should be taken into account for visualization. The ignition
        ids should be given as strings.
    """

    # load env file containing location to store visualizations
    local_paths_env = load_local_paths('../pipeline/local_paths.yaml')

    best_all_ignitions = []

    for ig_id in tqdm(ignition_ids, desc='Ignition id'):

        vis_dir = f'{local_paths_env["store_visualizations"]}/{ig_id}'

        # create folders to store visualizations
        if not os.path.exists(vis_dir):
            os.makedirs(vis_dir)

        # pull results
        results_table = pull_results(ignition_ids=[ig_id],
                                     table_name='results.evaluate_rg')

        # calculate best results for this model
        results_best_hyperparam = get_best_hyperparam_algorithm(results_table)
        results_best_hyperparam.reset_index().to_csv(
            f'{local_paths_env["store_visualizations"]}/results_{ig_id}.csv')
        best_all_ignitions.append(results_best_hyperparam)

        # plots for each review group separately
        for rgroup in results_table['label'].unique():
            # precision-recall curves for all hyperparameters
            plot = plot_precision_recall_curve_hyperparams(
                results_table, rgroup)
            plt.savefig(f'{vis_dir}/pr_curve_allhyperparam-{rgroup}.png')
            plt.close()

            # precision-recall curves for best hyperparameters
            plot = plot_optimal_precision_recall_curve(
                results_best_hyperparam, rgroup)
            plt.savefig(f'{vis_dir}/pr_curve_besthyperparam-{rgroup}.png')
            plt.close()

        # plot distribution of precisions at specified recalls
        for recall in [0.9, 0.95, 0.97, 0.99]:
            plot = plot_distribution_precision(results_best_hyperparam, recall)
            plt.savefig(
                f'{vis_dir}/precision_distribution-recall_{recall}.png')
            plt.close()

    # concatenate best results from each model into one dataframe
    best_all_ignitions = pd.concat(best_all_ignitions)

    # calculate best results across models
    best_overall = get_best_hyperparam_all(best_all_ignitions)
    best_overall.reset_index(drop=True).to_csv(
        f'{local_paths_env["store_visualizations"]}/results_overall.csv')

    # create directory for best visualizations
    vis_dir = f'{local_paths_env["store_visualizations"]}/overall'

    # create folders to store visualizations
    if not os.path.exists(vis_dir):
        os.makedirs(vis_dir)

    for group in best_overall['label'].unique():
        # precision_recall curves for the best models for each ignition id
        # (so the best hyperparameter combination is chosen for each model)
        plot = plot_precision_recall_curve_best(best_all_ignitions, group)
        plt.savefig(f'{vis_dir}/pr_curve-{group}.png')
        plt.close()

        # precision-recall curve for the top-5 models
        n = 5
        plot = plot_precision_recall_curve_best(best_all_ignitions,
                                                group,
                                                best_n=n)
        plt.savefig(f'{vis_dir}/pr_curve_best{n}-{group}.png')
        plt.close()
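
A minimal sketch of the presumed script entry point; the __main__ guard is an assumption, since the excerpt omits it:

if __name__ == "__main__":
    # hypothetical: visualize a subset of the stored experiments
    main(ignition_ids=['1', '2', '4'])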
Code example #5
    review_groups = []
    for col in list(data.columns):
        if col not in bad_cols and not col.startswith("cited"):
            review_groups.append(col)

    ### Prep plot legend ###
    red_patch = mpatches.Patch(color='firebrick',
                               label='Non-review group paper')
    blue_patch = mpatches.Patch(color='#1f497d', label='Review group paper')

    ### Plot ###
    fig = plt.figure(figsize=(40, 40))
    for i, review_group in enumerate(review_groups):
        plt.subplot(8, 8, i + 1)
        plot_citations_histograms(data=data, review_group=review_group)
    # plt.suptitle('Distribution of proportion of cited papers belonging to that review group', fontsize=50, y=1.05)
    plt.figlegend(handles=[red_patch, blue_patch],
                  loc='lower right',
                  fontsize=30)
    plt.tight_layout()
    plt.savefig(save_path)


if __name__ == "__main__":
    connection = SQLConn(
        load_psql_env(
            load_local_paths('../pipeline/local_paths.yaml')['pgpass_path']))
    connection.open()
    plot_citations_features_small_multiples(conn=connection)
    connection.close()
Code example #6
def run_pipeline(ignition_file, persist_all, load_all_fresh):
    """
    An ad hoc pipeline created to mirror the standard ML pipeline and work
    with citations data.

    Parameters
    ==========
    ignition_file : string
        Name of the yaml file for which you want to run an experiment.

    persist_all : boolean
        True if you want to persist all data for future use.

    load_all_fresh : boolean
        True if you want to avoid any persisted data and load new data
        from scratch.

    Returns
    =======
    None
    """

    model_parts = {}

    ##### 1. LOAD ENVIRONMENT DATA #####

    # load local paths
    local_paths_env = load_local_paths('local_paths.yaml')
    print('Local paths loaded.')

    # load ignition file
    ignition = load_config(local_paths_env['ignition_path'] + ignition_file)
    print('Ignition loaded.')

    # id used for persisting
    hash_id = create_hash_id(str(ignition['id']))
    print('Hash id created.')

    # create hyperparameter combinations (for k-folding)
    hyperparameters = expand_grid(ignition['hyperparameters'])

    # load environment file
    psql_env = load_psql_env(pgpass_path=local_paths_env['pgpass_path'])
    print('PSQL environment file loaded.')

    # Initiate PSQL Connection
    connection = SQLConn(psql_env)
    connection.open()

    ##### 2. LOAD TRAIN AND TEST DATA #####

    if check_persisted(local_paths_env['store_train_data'], f'{hash_id}_x',
                       load_all_fresh):

        print("Found data")

        # data loaded before: load from file
        X_train = load(local_paths_env['store_train_data'], f'{hash_id}_x')
        X_test = load(local_paths_env['store_test_data'], f'{hash_id}_x')
        y_train = load(local_paths_env['store_train_data'], f'{hash_id}_y')
        y_test = load(local_paths_env['store_test_data'], f'{hash_id}_y')

        print('Loaded data from file.')

    else:

        print("Data not found in storage - load from database")

        # data not loaded: pull from database and create features
        X_train, X_test, y_train, y_test = sample(
            ignition, connection, local_paths_env['store_features'])
        print(f"X_train shape: {X_train.shape}")
        print(f"X_test shape: {X_test.shape}")
        print(f"y_train shape: {y_train.shape}")
        print(f"y_test shape: {y_test.shape}")

        # add fold index column to data
        X_train, y_train = k_fold(X_train, y_train, ignition['k_folds'],
                                  ignition['k_folds_seed'])

        # save data to file for future use
        save(X_train, local_paths_env['store_train_data'], f'{hash_id}_x',
             persist_all)
        save(X_test, local_paths_env['store_test_data'], f'{hash_id}_x',
             persist_all)
        save(y_train, local_paths_env['store_train_data'], f'{hash_id}_y',
             persist_all)
        save(y_test, local_paths_env['store_test_data'], f'{hash_id}_y',
             persist_all)

    print('Data loading completed.')

    ##### 3. K-FOLDING #####

    # loop over folds
    for fold in tqdm(range(ignition['k_folds']), desc='Folds'):

        # get fold id hash (for persisting)
        fold_id = create_hash_id(str(ignition['id']) + str(fold))

        # get fold data
        fold_X_train = X_train[X_train['k'] != fold]
        fold_X_test = X_train[X_train['k'] == fold]
        fold_y_train = y_train[y_train['k'] != fold]
        fold_y_test = y_train[y_train['k'] == fold]

        # store fold features, if any
        fold_features = {}

        ##### 4. LOOP OVER HYPERPARAMETERS: TRAIN CLASSIFIER #####

        for hyperparam in tqdm(hyperparameters, desc='Hyperparameters'):

            # create hyperparam unique id and hyperparam-fold unique id
            hyperparam_id = create_hash_id(
                str(ignition['id']) + str(hyperparam))
            hyperparam_fold_id = create_hash_id(
                str(ignition['id']) + str(hyperparam) + str(fold))

            # if not check_val_in_db(connection, ignition['results_table_name'],
            # 'results', 'hash_id', hyperparam_fold_id, len(ignition['recalls'])):

            # create classifier of specified type and with specified target
            classifier = select_classifier(ignition["model_type"],
                                           fold_id,
                                           ignition["target"],
                                           ignition["classes"],
                                           fold_features,
                                           hyperparameters=hyperparam,
                                           seed=ignition['seed'],
                                           env=local_paths_env,
                                           load_fresh=load_all_fresh)
            #print('Classifier created.')

            # train classifier
            classifier.train(fold_X_train, fold_y_train)

            ##### 5. TEST CLASSIFIER #####

            # generate predictions from classifier
            y_probs = classifier.predict(fold_X_test)

            ##### 6. EVALUATION #####

            for recall in tqdm(ignition['recalls'], desc='Evaluations'):

                # compute evaluation metrics
                all_metrics = compute_metrics(
                    metric_names=ignition['metrics'],
                    y_true=fold_y_test.drop(columns=['k']),
                    y_pred=y_probs,
                    k=recall)

                # store results in database
                unique_id = create_hash_id(
                    str(ignition['id']) + str(hyperparam) + str(fold) +
                    str(recall))

                results_to_db(metrics=all_metrics,
                              table_name=ignition['results_table_name'],
                              ignition_id=ignition['id'],
                              hash_id=hyperparam_fold_id,
                              algorithm=ignition['model_type'],
                              hyperparameters=hyperparam,
                              fold=str(fold),
                              recall=recall,
                              unique_id=unique_id,
                              connection=connection)

    connection.close()
    print(f"Done running pipeline for ignition id: {ignition['id']}!")
Code example #7
    print('semantic.citations_avg created and populated')

    conn.commit()
    cur.close()
    conn.close()
    end_time = datetime.datetime.now()
    print("End time is " + str(end_time))
    print("Elapsed time is " + str(end_time - start_time))


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print('Please specify one (and only one) argument for starting_point.')
    else:
        starting_point = sys.argv[1]
        dot_env = load_local_paths('../pipeline/local_paths.yaml')
        env = load_psql_env(pgpass_path=dot_env['pgpass_path'])
        print(env)

        if starting_point == 'no_sql':
            reviews_dir = '/data/raw/reviews/'
            citations_dict = {
                'citations': '/data/citations/TuringCitations.csv',
                'recordid_paperid': '/data/citations/TuringCRSPMRecords.csv'
            }
            run_etl(starting_point=starting_point,
                    env=env,
                    reviews_dir=reviews_dir,
                    citations_dict=citations_dict)
        else:
            run_etl(starting_point=starting_point, env=env)
Code example #8
def perform_model_selection(evaluate_best_models=True):
    """
    Run the model selection pipeline.

    Parameters
    ==========
    evaluate_best_models : boolean
        True if the stored scored papers should be evaluated to derive
        thresholds and workload reductions.
    """
    # Load local paths file
    local_paths = load_local_paths('local_paths.yaml')

    # Load product config file
    prod_config = load_config('../prod/prod_config.yaml', append_static=False)

    # SQL set up
    psql_env = load_psql_env(pgpass_path=local_paths['pgpass_path'])
    connection = SQLConn(psql_env)
    connection.open()

    # Pull data
    X_train, X_test, y_train, y_test = sample(
        ignition=prod_config,
        connection=connection,
        local_features_path=local_paths['store_features'])

    if evaluate_best_models:

        # Test best models for each review group
        scored_papers_test = load(location=local_paths['store_scored_papers'],
                                  filename='scored_papers')

        y_pred_test = scored_papers_test[[
            col for col in scored_papers_test.columns
            if col.upper() in prod_config['review_groups_recall'].keys()
        ]]
        y_test = y_test[[
            col for col in y_test.columns
            if col.upper() in prod_config['review_groups_recall'].keys()
        ]]

        # calculate thresholds
        upper_thresholds, lower_thresholds, recall_at_thresholds = get_thresholds(
            y_test, y_pred_test, minimum_recall=0.99)

        # persist thresholds for production
        save(upper_thresholds, local_paths['store_production_models'],
             'upper_thresholds')
        save(lower_thresholds, local_paths['store_production_models'],
             'lower_thresholds')

        # calculate workload reductions
        keep, consider, discard = get_workload_reduction(
            y_test, y_pred_test, upper_thresholds, lower_thresholds)

        rg_list = []
        wrkld_reductions = []

        # loop over review groups
        for review_group in tqdm(prod_config['review_groups_recall'].keys(),
                                 desc='Review Group'):

            rg = review_group.lower()

            # get workload reduction for this group
            workload_reduction = [keep[rg], consider[rg], discard[rg]]

            rg_list.append(rg)
            wrkld_reductions.append(workload_reduction)

        d = {'review_group': rg_list, 'workload_reduction': wrkld_reductions}
        df = pd.DataFrame.from_dict(d)
        plot_average_workload_reduction(df)

    connection.close()

    print("Model selection pipeline complete.")
Code example #9
def perform_model_selection(evaluate_best_models=True):
    """
    Run the model selection pipeline.

    Parameters
    ==========
    evaluate_best_models : boolean
        True if the newly trained models should be scored on the test set
        and the results written to the database.
    """
    # Load local paths file
    local_paths = load_local_paths('local_paths.yaml')

    # Load product config file
    prod_config = load_config('../prod/prod_config.yaml', append_static=False)

    # Load results table from dbs
    results_df = pull_results(ignition_ids=[
        '1', '2', '4', '5', '6', '10', '15', '16', '17', '18', '19', '20', '21'
    ])

    # Get a dataframe of best algorithm x hyperparameters for each RG x recall
    best_df = get_best_algorithm_hyperparameter_onestep(results_df=results_df)

    # Get dictionary of algorithms and hyperparameters for each
    # review group based on recall in product config file
    best_models = choose_models_with_recall(
        models_df=best_df,
        group_min_recalls=prod_config['review_groups_recall'])

    # SQL set up
    psql_env = load_psql_env(pgpass_path=local_paths['pgpass_path'])
    connection = SQLConn(psql_env)
    connection.open()

    # Pull data
    X_train, X_test, y_train, y_test = sample(
        ignition=prod_config,
        connection=connection,
        local_features_path=local_paths['store_features'])

    # Train best models for each review group
    train_best_models_mp(X_train,
                         y_train,
                         best_models=best_models,
                         prod_config=prod_config,
                         local_paths=local_paths,
                         cores=3)

    if evaluate_best_models:

        # Test best models for each review group
        scored_papers_test = score_papers(
            X_test,
            prod_config,
            models_path=local_paths['store_production_models'])
        save(object=scored_papers_test,
             location=local_paths['store_scored_papers'],
             filename='scored_papers_citations')

        y_pred_test = scored_papers_test[[
            col for col in scored_papers_test.columns
            if col.upper() in prod_config['review_groups_recall'].keys()
        ]]
        y_test = y_test[[
            col for col in y_test.columns
            if col.upper() in prod_config['review_groups_recall'].keys()
        ]]

        # calculate thresholds
        upper_thresholds, lower_thresholds, recall_at_thresholds = get_thresholds(
            y_test, y_pred_test, minimum_precision=0.95, minimum_recall=0.99)

        # persist thresholds for production
        save(upper_thresholds, local_paths['store_production_models'],
             'upper_thresholds')
        save(lower_thresholds, local_paths['store_production_models'],
             'lower_thresholds')

        # calculate workload reductions
        keep, consider, discard = get_workload_reduction(
            y_test, y_pred_test, upper_thresholds, lower_thresholds)

        # loop over review groups
        for review_group in tqdm(prod_config['review_groups_recall'].keys(),
                                 desc='Review Group'):

            rg = review_group.lower()

            # get thresholds
            thresholds = [upper_thresholds[rg], lower_thresholds[rg]]
            recall_at_threshold = [
                recall_at_thresholds[rg]['upper'],
                recall_at_thresholds[rg]['lower']
            ]
            workload_reduction = [keep[rg], consider[rg], discard[rg]]

            # evaluate scores
            for recall in tqdm(prod_config['recalls'], desc='Evaluations'):

                # calculate precisions
                precisions = evaluate_precision_at_k_recall(
                    class_true=y_test, class_prob=y_pred_test, k=recall)

                # store results in database
                production_results_to_db(
                    table_name=prod_config['results_table_name'],
                    unique_id=f"{rg}_{recall}",
                    review_group=rg,
                    algorithm=best_models[review_group]['algorithm'],
                    hyperparameters=best_models[review_group]['hyperparameters'],
                    recall=recall,
                    precision=precisions[rg],
                    thresholds=thresholds,
                    recall_at_threshold=recall_at_threshold,
                    workload_reduction=workload_reduction,
                    connection=connection)

    connection.close()

    print("Model selection pipeline complete.")