Example #1
import os

import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

# Project helpers (load_local_paths, load_psql_env, SQLConn,
# plot_precision_recall_curve_best, workload_relative_stackedbar,
# plot_average_workload_reduction, ...) are assumed to be imported from
# this repository's pipeline utilities.


def create_viz_production(results_table_name='final_model_eval'):
    """
    Creates visualizations for models that are trained on the full training data set,
    and are used in production.

    Parameters
    ----------

    results_table_name : str
        The name of a SQL table which contains results (from the test set) about the final
        models.
    """

    # set up required variables
    local_paths_env = load_local_paths('../pipeline/local_paths.yaml')
    env = load_psql_env(local_paths_env['pgpass_path'])

    # open sql connection
    connection = SQLConn(env)
    connection.open()

    # pull data from table
    query = f"select * from results.{results_table_name};"
    results_df = pd.read_sql_query(query, connection.conn)
    results_df['label'] = results_df['review_group']

    # create directory to store visualizations
    vis_dir = f'{local_paths_env["store_visualizations"]}/production_citations'
    os.makedirs(vis_dir, exist_ok=True)

    # precision recall plots
    for rg in tqdm(results_df['review_group'].unique()):
        plot_precision_recall_curve_best(results_df,
                                         rg,
                                         plot_baseline=False)
        plt.savefig(f'{vis_dir}/pr_curve-{rg}.png')
        plt.close()

    # stacked bar workload
    workload_relative_stackedbar(results_df)
    plt.savefig(f'{vis_dir}/workload_relative.png')
    plt.close()

    plot_average_workload_reduction(results_df)
    plt.savefig(f'{vis_dir}/workload_average.png')
    plt.close()

    # release the database connection
    connection.close()
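
A minimal invocation sketch, assuming the pipeline has already written test-set results to the default results.final_model_eval table:

# Hypothetical driver: regenerate all production visualizations.
if __name__ == "__main__":
    create_viz_production(results_table_name='final_model_eval')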
Example #2
def heatmap_data(igs, comp_ig='1', local_paths='../pipeline/local_paths.yaml'):
    """
    
    Function that prepares data to be plotted in heatmap, and is stored as csv.

    Parameters
    ==========
    igs : list
        List of strings of ignition ids to be pulled from database.
    
    """

    out_dir = load_local_paths(local_paths)['tmp']

    # pull in results data
    results = pull_results(igs)
    best = get_best_hyperparam_all(results)
    best = best[[
        'algorithm', 'hyperparameters', 'label', 'recall',
        'precision_at_recall'
    ]]
    best['type'] = 'best'
    comp = get_avg_ignition(results, comp_ig)
    comp['type'] = 'comp'
    out = pd.concat([comp, best], axis=0)

    # pull rg data
    connection = SQLConn(
        load_psql_env(load_local_paths(local_paths)['pgpass_path']))
    connection.open()
    papers_rgs = connection.query(
        'select inregister as label, count(*) as n_papers from semantic.papers_rgs group by 1;'
    )
    papers_revs = connection.query(f"""
                    with tbl as(select a.*, b.cn from semantic.papers_rgs a
                    left join semantic.papers_reviews b on a.recordid=b.recordid)
                    select inregister as label, count(distinct cn) as n_revs from tbl group by 1;
                    """)

    # final dataset
    out = pd.merge(out, papers_rgs, how='left', on='label')
    out = pd.merge(out, papers_revs, how='left', on='label')

    # close the connection before writing the output dataset
    connection.close()
    out.to_csv(out_dir + 'heatmap_data.csv')
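
A minimal usage sketch for heatmap_data; the ignition ids below are illustrative, not taken from these examples:

# Hypothetical call: compare ignitions '2' and '4' against the baseline
# ignition '1' and write <tmp>/heatmap_data.csv.
heatmap_data(igs=['1', '2', '4'], comp_ig='1')
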
def pull_results(ignition_ids,
                 table_name='results.evaluate_rg',
                 metric_col='metrics',
                 metrics=['precision_at_recall'],
                 other_cols=[
                     'ignition_id', 'hash_id', 'algorithm', 'hyperparameters',
                     'fold', 'recall'
                 ]):
    """
    Pull results from PSQL table into long dataframe.

    Parameters
    ==========
    ignition_ids : list
        List of ignition_ids to pull into table.
    table_name : str
        Name of PSQL table with results.
    metric_col : str
        Name of column where metrics exist.
    metrics : list
        Metrics to be included in table. Will be parsed from jsonb.
    other_cols : list
        List of other columns to include in the table as-is.

    Labels are taken from the `classes` entry of the baseline ignition file.

    Returns
    =======
    results_df : pd.DataFrame
        Long dataframe with results from specified ignition files, metrics, and labels.
    """

    local_paths_env = load_local_paths('../pipeline/local_paths.yaml')
    env = load_psql_env(local_paths_env['pgpass_path'])
    ignition = load_config(local_paths_env['ignition_path'] +
                           '_1_baseline_ignition.yaml')

    # establish SQL Connection
    connection = SQLConn(env)
    connection.open()

    ### Set up ###
    results = {}
    ignition_ids_sql = "('" + "','".join(ignition_ids) + "')"
    other_cols_sql = ",".join(other_cols)

    ### Make one query for each label and store the resulting df in a dict ###
    for label in ignition['classes']:
        metrics_sql = f"'{label}' as label"
        for metric in metrics:
            metrics_sql += f",{metric_col} -> '{metric}' -> '{label.lower()}' as {metric}"

        qy = f"""
        select {other_cols_sql},
        {metrics_sql}
        from
        {table_name}
        where ignition_id in {ignition_ids_sql};
        """

        results[label] = pd.read_sql_query(qy, connection.conn)

    ### Concatenate all dfs into one long df ###
    results_df = pd.concat(results.values(), ignore_index=True)

    connection.close()

    return results_df
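
For a single label and the default arguments, the per-label query assembled above has roughly this shape (the label and ignition ids are illustrative):

# Illustrative shape of the query built by pull_results for one label:
#
#   select ignition_id,hash_id,algorithm,hyperparameters,fold,recall,
#          'Airways' as label,
#          metrics -> 'precision_at_recall' -> 'airways' as precision_at_recall
#   from results.evaluate_rg
#   where ignition_id in ('1','2');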
Example #4
    review_groups = []
    for col in list(data.columns):
        if col not in bad_cols and col[:5] != "cited":
            review_groups.append(col)

    ### Prep plot legend ###
    red_patch = mpatches.Patch(color='firebrick',
                               label='Non-review group paper')
    blue_patch = mpatches.Patch(color='#1f497d', label='Review group paper')

    ### Plot ###
    fig = plt.figure(figsize=(40, 40))
    for i, review_group in enumerate(review_groups):
        plt.subplot(8, 8, i + 1)
        plot_citations_histograms(data=data, review_group=review_group)
    # plt.suptitle('Distribution of proportion of cited papers belonging to that review group', fontsize=50, y=1.05)
    plt.figlegend(handles=[red_patch, blue_patch],
                  loc='lower right',
                  fontsize=30)
    plt.tight_layout()
    plt.savefig(save_path)


if __name__ == "__main__":
    connection = SQLConn(
        load_psql_env(
            load_local_paths('../pipeline/local_paths.yaml')['pgpass_path']))
    connection.open()
    plot_citations_features_small_multiples(conn=connection)
    connection.close()
def run_pipeline(ignition_file, persist_all, load_all_fresh):
    """
    An ad hoc pipeline created to mirror the standard ML pipeline and work
    with citations data.

    Parameters:
    ===========
    ignition_file: string
        Name of the YAML ignition file for the experiment you want to run

    persist_all: boolean
        True if you want to persist all data for future use

    load_all_fresh: boolean
        True if you want to avoid any persisted data and load new data from scratch

    Returns:
    ========
    None
    """

    model_parts = {}

    ##### 1. LOAD ENVIRONMENT DATA #####

    # load local paths
    local_paths_env = load_local_paths('local_paths.yaml')
    print('Local paths loaded.')

    # load ignition file
    ignition = load_config(local_paths_env['ignition_path'] + ignition_file)
    print('Ignition loaded.')

    # id used for persisting
    hash_id = create_hash_id(str(ignition['id']))
    print('Hash id created.')

    # create hyperparameter combinations (for k-folding)
    hyperparameters = expand_grid(ignition['hyperparameters'])

    # load environment file
    psql_env = load_psql_env(pgpass_path=local_paths_env['pgpass_path'])
    print('PSQL environment file loaded.')

    # Initiate PSQL Connection
    connection = SQLConn(psql_env)
    connection.open()

    ##### 2. LOAD TRAIN AND TEST DATA #####

    if check_persisted(local_paths_env['store_train_data'], f'{hash_id}_x',
                       load_all_fresh):

        print("Found data")

        # data loaded before: load from file
        X_train = load(local_paths_env['store_train_data'], f'{hash_id}_x')
        X_test = load(local_paths_env['store_test_data'], f'{hash_id}_x')
        y_train = load(local_paths_env['store_train_data'], f'{hash_id}_y')
        y_test = load(local_paths_env['store_test_data'], f'{hash_id}_y')

        print('Loaded data from file.')

    else:

        print("Data not found in storage - load from database")

        # data not loaded: pull from database and create features
        X_train, X_test, y_train, y_test = sample(
            ignition, connection, local_paths_env['store_features'])
        print(f"X_train shape: {X_train.shape}")
        print(f"X_test shape: {X_test.shape}")
        print(f"y_train shape: {y_train.shape}")
        print(f"y_test shape: {y_test.shape}")

        # add fold index column to data
        X_train, y_train = k_fold(X_train, y_train, ignition['k_folds'],
                                  ignition['k_folds_seed'])

        # save data to file for future use
        save(X_train, local_paths_env['store_train_data'], f'{hash_id}_x',
             persist_all)
        save(X_test, local_paths_env['store_test_data'], f'{hash_id}_x',
             persist_all)
        save(y_train, local_paths_env['store_train_data'], f'{hash_id}_y',
             persist_all)
        save(y_test, local_paths_env['store_test_data'], f'{hash_id}_y',
             persist_all)

    print('Data loading completed.')

    ##### 3. K-FOLDING #####

    # loop over folds
    for fold in tqdm(range(ignition['k_folds']), desc='Folds'):

        # get fold id hash (for persisting)
        fold_id = create_hash_id(str(ignition['id']) + str(fold))

        # get fold data
        fold_X_train = X_train[X_train['k'] != fold]
        fold_X_test = X_train[X_train['k'] == fold]
        fold_y_train = y_train[y_train['k'] != fold]
        fold_y_test = y_train[y_train['k'] == fold]

        # store fold features, if any
        fold_features = {}

        ##### 4. LOOP OVER HYPERPARAMETERS: TRAIN CLASSIFIER #####

        for hyperparam in tqdm(hyperparameters, desc='Hyperparameters'):

            # create hyperparam unique id and hyperparam-fold unique id
            hyperparam_id = create_hash_id(
                str(ignition['id']) + str(hyperparam))
            hyperparam_fold_id = create_hash_id(
                str(ignition['id']) + str(hyperparam) + str(fold))

            # if not check_val_in_db(connection, ignition['results_table_name'],
            # 'results', 'hash_id', hyperparam_fold_id, len(ignition['recalls'])):

            # create classifier of specified type and with specified target
            classifier = select_classifier(ignition["model_type"],
                                           fold_id,
                                           ignition["target"],
                                           ignition["classes"],
                                           fold_features,
                                           hyperparameters=hyperparam,
                                           seed=ignition['seed'],
                                           env=local_paths_env,
                                           load_fresh=load_all_fresh)
            #print('Classifier created.')

            # train classifier
            classifier.train(fold_X_train, fold_y_train)

            ##### 5. TEST CLASSIFIER #####

            # generate predictions from classifier
            y_probs = classifier.predict(fold_X_test)

            ##### 6. EVALUATION #####

            for recall in tqdm(ignition['recalls'], desc='Evaluations'):

                # compute evaluation metrics
                all_metrics = compute_metrics(
                    metric_names=ignition['metrics'],
                    y_true=fold_y_test.drop(columns=['k']),
                    y_pred=y_probs,
                    k=recall)

                # store results in database
                unique_id = create_hash_id(
                    str(ignition['id']) + str(hyperparam) + str(fold) +
                    str(recall))

                results_to_db(metrics=all_metrics,
                              table_name=ignition['results_table_name'],
                              ignition_id=ignition['id'],
                              hash_id=hyperparam_fold_id,
                              algorithm=ignition['model_type'],
                              hyperparameters=hyperparam,
                              fold=str(fold),
                              recall=recall,
                              unique_id=unique_id,
                              connection=connection)

    connection.close()
    print(f"Done running pipeline for ignition id: {ignition['id']}!")
Example #6
    conn.commit()
    cur.close()
    conn.close()
    end_time = datetime.datetime.now()
    print("End time is " + str(end_time))
    print("Elapsed time is " + str(end_time - start_time))


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print('Please specify one (and only one) argument for starting_point.')
    else:
        starting_point = sys.argv[1]
        dot_env = load_local_paths('../pipeline/local_paths.yaml')
        env = load_psql_env(pgpass_path=dot_env['pgpass_path'])

        if starting_point == 'no_sql':
            reviews_dir = '/data/raw/reviews/'
            citations_dict = {
                'citations': '/data/citations/TuringCitations.csv',
                'recordid_paperid': '/data/citations/TuringCRSPMRecords.csv'
            }
            run_etl(starting_point=starting_point,
                    env=env,
                    reviews_dir=reviews_dir,
                    citations_dict=citations_dict)
        else:
            run_etl(starting_point=starting_point, env=env)
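
A minimal command-line sketch; the script's file name is an assumption, since it is not shown in the example:

# Hypothetical shell usage:
#   python run_etl_entry.py no_sql
# 'no_sql' rebuilds the database from the raw review and citation files;
# any other value is passed straight through to run_etl as starting_point.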
Example #7
def perform_model_selection(evaluate_best_models=True):
    """
    Run model selection pipeline.
    """
    # Load local paths file
    local_paths = load_local_paths('local_paths.yaml')

    # Load product config file
    prod_config = load_config('../prod/prod_config.yaml', append_static=False)

    # SQL set up
    psql_env = load_psql_env(pgpass_path=local_paths['pgpass_path'])
    connection = SQLConn(psql_env)
    connection.open()

    # Pull data
    X_train, X_test, y_train, y_test = sample(
        ignition=prod_config,
        connection=connection,
        local_features_path=local_paths['store_features'])

    if evaluate_best_models:

        # Test best models for each review group
        scored_papers_test = load(location=local_paths['store_scored_papers'],
                                  filename='scored_papers')

        y_pred_test = scored_papers_test[[
            col for col in scored_papers_test.columns
            if col.upper() in prod_config['review_groups_recall'].keys()
        ]]
        y_test = y_test[[
            col for col in y_test.columns
            if col.upper() in prod_config['review_groups_recall'].keys()
        ]]

        # calculate thresholds
        upper_thresholds, lower_thresholds, recall_at_thresholds = get_thresholds(
            y_test, y_pred_test, minimum_recall=0.99)

        # persist thresholds for production
        save(upper_thresholds, local_paths['store_production_models'],
             'upper_thresholds')
        save(lower_thresholds, local_paths['store_production_models'],
             'lower_thresholds')

        # calculate workload reductions
        keep, consider, discard = get_workload_reduction(
            y_test, y_pred_test, upper_thresholds, lower_thresholds)

        rg_list = []
        wrkld_reductions = []

        # loop over review groups
        for review_group in tqdm(prod_config['review_groups_recall'].keys(),
                                 desc='Review Group'):

            rg = review_group.lower()

            # get workload reduction for this group
            workload_reduction = [keep[rg], consider[rg], discard[rg]]

            rg_list.append(rg)
            wrkld_reductions.append(workload_reduction)

        d = {'review_group': rg_list, 'workload_reduction': wrkld_reductions}
        df = pd.DataFrame.from_dict(d)
        plot_average_workload_reduction(df)

    connection.close()

    print("Model selection pipeline complete.")
Example #8
def perform_model_selection(evaluate_best_models=True):
    """
    Run model selection pipeline.
    """
    # Load local paths file
    local_paths = load_local_paths('local_paths.yaml')

    # Load product config file
    prod_config = load_config('../prod/prod_config.yaml', append_static=False)

    # Load results table from dbs
    results_df = pull_results(ignition_ids=[
        '1', '2', '4', '5', '6', '10', '15', '16', '17', '18', '19', '20', '21'
    ])

    # Get a dataframe of best algorithm x hyperparameters for each RG x recall
    best_df = get_best_algorithm_hyperparameter_onestep(results_df=results_df)

    # Get dictionary of algorithms and hyperparameters for each
    # review group based on recall in product config file
    best_models = choose_models_with_recall(
        models_df=best_df,
        group_min_recalls=prod_config['review_groups_recall'])

    # SQL set up
    psql_env = load_psql_env(pgpass_path=local_paths['pgpass_path'])
    connection = SQLConn(psql_env)
    connection.open()

    # Pull data
    X_train, X_test, y_train, y_test = sample(
        ignition=prod_config,
        connection=connection,
        local_features_path=local_paths['store_features'])

    # Train best models for each review group
    train_best_models_mp(X_train,
                         y_train,
                         best_models=best_models,
                         prod_config=prod_config,
                         local_paths=local_paths,
                         cores=3)

    if evaluate_best_models:

        # Test best models for each review group
        scored_papers_test = score_papers(
            X_test,
            prod_config,
            models_path=local_paths['store_production_models'])
        save(object=scored_papers_test,
             location=local_paths['store_scored_papers'],
             filename='scored_papers_citations')

        y_pred_test = scored_papers_test[[
            col for col in scored_papers_test.columns
            if col.upper() in prod_config['review_groups_recall'].keys()
        ]]
        y_test = y_test[[
            col for col in y_test.columns
            if col.upper() in prod_config['review_groups_recall'].keys()
        ]]

        # calculate thresholds
        upper_thresholds, lower_thresholds, recall_at_thresholds = get_thresholds(
            y_test, y_pred_test, minimum_precision=0.95, minimum_recall=0.99)

        # persist thresholds for production
        save(upper_thresholds, local_paths['store_production_models'],
             'upper_thresholds')
        save(lower_thresholds, local_paths['store_production_models'],
             'lower_thresholds')

        # calculate workload reductions
        keep, consider, discard = get_workload_reduction(
            y_test, y_pred_test, upper_thresholds, lower_thresholds)

        # loop over review groups
        for review_group in tqdm(prod_config['review_groups_recall'].keys(),
                                 desc='Review Group'):

            rg = review_group.lower()

            # get thresholds
            thresholds = [upper_thresholds[rg], lower_thresholds[rg]]
            recall_at_threshold = [
                recall_at_thresholds[rg]['upper'],
                recall_at_thresholds[rg]['lower']
            ]
            workload_reduction = [keep[rg], consider[rg], discard[rg]]

            # evaluate scores
            for recall in tqdm(prod_config['recalls'], desc='Evaluations'):

                # calculate precisions
                precisions = evaluate_precision_at_k_recall(
                    class_true=y_test, class_prob=y_pred_test, k=recall)

                # store results in database
                production_results_to_db(
                    table_name=prod_config['results_table_name'],
                    unique_id=f"{rg}_{recall}",
                    review_group=rg,
                    algorithm=best_models[review_group]['algorithm'],
                    hyperparameters=best_models[review_group]['hyperparameters'],
                    recall=recall,
                    precision=precisions[rg],
                    thresholds=thresholds,
                    recall_at_threshold=recall_at_threshold,
                    workload_reduction=workload_reduction,
                    connection=connection)

    connection.close()

    print("Model selection pipeline complete.")