コード例 #1
0
def create_labels(args):
    """
    Build the label table for the experiment cohort and persist it locally.

    The cohort and label SQL fragments are read from the experiment
    configuration and injected into a query over ``semantic.labels`` joined
    with ``semantic.tenders``. Rows whose ``id_llamado`` is not present in the
    locally persisted features are dropped before saving.

    Parameters
    ----------
    args: dict
        Pipeline arguments. ``args['experiment_id']`` selects the experiment;
        the whole dict is forwarded to ``get_local`` and ``persist_local``.

    Return
    ---------
    None
        The resulting dataframe is written with ``persist_local`` under
        'labels'; nothing is returned to the caller.
    """
    experiment = get_experiment(args['experiment_id'])

    # Only tenders that have features are worth labelling.
    feature_ids = get_local(args, 'features')['id_llamado']

    label_query_template = """
        select distinct labels.id_llamado as id_llamado, tipo_procedimiento_codigo, 
        labels.reception_date, {label_target} as target
        from semantic.labels labels
        join semantic.tenders tenders
        on labels.id_llamado = tenders.id_llamado
        where labels.id_llamado in ({cohort})
    """
    label_query = label_query_template.format(
        cohort=experiment['cohort_config']['query'],
        label_target=experiment['label_config']['query'])

    connection = utils.connect_to_database()
    cohort_labels = pd.read_sql_query(label_query, connection)

    has_features = cohort_labels['id_llamado'].isin(feature_ids)
    persist_local(cohort_labels[has_features], args, 'labels')
コード例 #2
0
def generate_temporal_folds(experiment, args):
    """Build temporal train/test folds from the locally persisted labels.

    Starting at ``as_of_date``, every fold trains on the rows whose
    ``reception_date`` lies in ``[train_start, train_end]`` and tests on the
    rows in ``[as_of_date, as_of_date + test_lag]`` (both ends inclusive).
    The as-of date then moves forward by ``aod_lag`` days. Generation stops
    as soon as a test window would end after ``test_date_limit`` or, when
    ``number_of_folds`` is not None, once that many folds have been built.

    Parameters
    ----------
    experiment : dict
        Experiment configuration. ``experiment['validation']['parameters']``
        must provide ``as_of_date`` and ``test_date_limit`` (``YYYY-MM-DD``
        strings), ``test_lag``, ``blind_gap`` and ``aod_lag`` (days),
        ``train_lag`` (days, or the string ``'all'``) and ``number_of_folds``
        (int or None).
    args : dict
        Minimum set of parameters to run the pipeline; forwarded to
        ``get_local`` and ``persist_local``.

    Returns
    -------
    list of dicts
        One dict per fold with the keys ``'name'`` (as-of date formatted as
        ``YYYY-MM-DD``), ``'train'`` and ``'test'`` (lists of ``id_llamado``).
        The same list is persisted under 'folds' as '.p'.
    """
    date_format = '%Y-%m-%d'

    params = experiment['validation']['parameters']
    current_aod = dt.datetime.strptime(params['as_of_date'], date_format)

    # Only the id and the date are needed to assign rows to folds.
    labels = get_local(args, 'labels')
    X = labels[['id_llamado', 'reception_date']]

    # Loop invariants: computed once instead of on every iteration.
    test_lag = dt.timedelta(days=params['test_lag'])
    test_date_limit = dt.datetime.strptime(params['test_date_limit'],
                                           date_format)

    k = 1
    folds = []
    while True:
        test_end = current_aod + test_lag

        if test_end > test_date_limit:
            break

        max_folds = params['number_of_folds']
        if max_folds is not None and k > max_folds:
            break

        train_end = current_aod - dt.timedelta(days=params['blind_gap'])

        # If train_lag is 'all', train_start is set to a dummy old date
        # (2000-01-01) so that every earlier row is included.
        if params['train_lag'] == 'all':
            train_start = dt.datetime(2000, 1, 1)
        else:
            train_start = train_end - dt.timedelta(days=params['train_lag'])

        # NOTE(review): both windows are inclusive, so with blind_gap == 0 a
        # row dated exactly on the as-of date lands in train AND test.
        train_ids = X.query(
            f"reception_date >= '{train_start}' and reception_date <= '{train_end}'")['id_llamado']
        test_ids = X.query(
            f"reception_date >= '{current_aod}' and reception_date <= '{test_end}'")['id_llamado']

        folds.append({
            'name': current_aod.strftime(date_format),
            'train': train_ids.tolist(),
            'test': test_ids.tolist()
        })

        current_aod = current_aod + dt.timedelta(days=params['aod_lag'])
        k += 1

    persist_local(folds, args, 'folds', as_type='.p')

    return folds
コード例 #3
0
def plot_metric_by_fold(selector):
    """Plot, for every evaluation metric, the score per fold of each learner.

    One figure is created per distinct ``eval_metric`` and one line per
    distinct ``learner_id``. Each figure is saved with ``persist_local`` in
    the 'evaluation_plots' folder as '.png'.

    Parameters
    ----------
    selector : dict
        ``selector['data']`` is a dataframe with the columns
        ``experiment_id``, ``eval_metric``, ``fold``, ``learner_id``,
        ``score`` and ``name``; ``selector['name']`` is shown in the title.

    Returns
    -------
    None
    """
    data = selector['data']
    experiment_id = data['experiment_id'].unique()[0]
    selector_name = selector['name']

    metrics = data['eval_metric'].unique()

    title = """
Experiment: {experiment_id} 
Metric: {metric}
Selector: {selector_name}"""

    # Loop to create one fig per metric and a line per learner
    for metric in metrics:

        fig, axis = plt.subplots(figsize=(15, 8))

        axis.set_title(
            title.format(experiment_id=experiment_id,
                         selector_name=selector_name,
                         metric=metric))
        axis.set_ylabel('score')

        # Fold names containing '-' are treated as dates (temporal folds),
        # anything else as plain k-fold identifiers.
        if '-' in data['fold'].iloc[0]:
            axis.set_xlabel('time')
            axis.tick_params(axis='both', labelsize=12)
            axis.tick_params(axis='x', rotation=45)
        else:
            axis.set_xlabel('fold')

        for learner in data['learner_id'].unique():

            data_to_plot = data[(data['learner_id'] == learner)
                                & (data['eval_metric'] == metric)]

            # NOTE(review): `color` is not defined inside this function; it
            # must be an iterator available at module level -- confirm.
            axis.plot(data_to_plot['fold'],
                      data_to_plot['score'],
                      color=next(color),
                      label=data['name'].unique()[0] + '_' + str(learner))

        # The legend is built once, after all labelled lines exist.
        axis.legend()

        # The unformatted title template is part of the persisted id, exactly
        # as before, so existing file names do not change.
        persist_local(data=fig,
                      args={
                          'experiment_id': experiment_id,
                          'title': title,
                          'eval_metric': metric
                      },
                      folder='evaluation_plots',
                      id_keys=['experiment_id', 'title', 'eval_metric'],
                      as_type='.png')
コード例 #4
0
def create_features(args):
    """
    Assemble the feature table described in the experiment file.

    For every entry of ``experiment['features']`` the cohort is left-joined
    with the configured feature table and the requested columns are selected.
    The per-table results are inner-merged on ``id_llamado``, rows containing
    missing values are dropped and the outcome is persisted locally.

    Parameters:
    ------------
    args: dict
        Pipeline arguments. ``args['experiment_id']`` selects the experiment;
        the whole dict is forwarded to ``persist_local``.

    Return:
    ------------
    None
        The combined dataframe is written with ``persist_local`` under
        'features'; nothing is returned to the caller.
    """
    experiment = get_experiment(args['experiment_id'])

    feature_query_template = """with cd_tenders as (
            {cohort}
            )
                select cd_tenders.id_llamado, {columns}
                from cd_tenders
                left join {table} as feature_table
                on cd_tenders.id_llamado = feature_table.id_llamado
        """

    connection = utils.connect_to_database()

    combined = pd.DataFrame()
    for feature_config in experiment['features']:
        feature_query = feature_query_template.format(
            cohort=experiment['cohort_config']['query'],
            columns=','.join(feature_config['columns']),
            table=feature_config['table'])
        table_features = pd.read_sql_query(feature_query, connection)

        # An empty accumulator is simply replaced; otherwise keep only the
        # tenders present in every feature table seen so far.
        combined = (table_features if combined.empty else combined.merge(
            table_features, on='id_llamado', how='inner'))

    persist_local(combined.dropna(), args, 'features')
コード例 #5
0
def apply_preprocessing(approach, original_train_dict, original_test_dict, args):
    """Apply the approach's preprocessors to copies of the train/test data.

    The preprocessors are fitted on the train features and then applied,
    without refitting, to the test features. The inputs are deep-copied, so
    the original dictionaries are never modified.

    Parameters
    ----------
    approach : dict
        Approach configuration; preprocessing only runs when it has a
        'preprocessors' key.
    original_train_dict : dict
        Train data; its 'features' entry is what gets transformed.
    original_test_dict : dict
        Test data; its 'features' entry is what gets transformed.
    args : dict
        Pipeline arguments, forwarded to ``persist_local`` (id keys:
        'experiment_id', 'approach_id', 'fold').

    Returns
    -------
    dict dict
        Modified copies of the train and test dict.
    """
    train_dict = copy.deepcopy(original_train_dict)
    test_dict = copy.deepcopy(original_test_dict)

    # Stays None when the approach declares no preprocessors. Previously the
    # name was unbound in that case and persist_local raised
    # UnboundLocalError.
    preprocessing = None

    if 'preprocessors' in approach:

        # Fit on train only ...
        train_dict['features'], preprocessing = run(approach['preprocessors'], train_dict['features'])

        # ... and reuse the fitted preprocessors on test (fit=False).
        test_dict['features'], preprocessing = run(approach['preprocessors'], test_dict['features'], preprocessing, fit=False)

    # Always persisted (None when nothing was fitted) so that an artifact
    # exists for every experiment/approach/fold combination.
    persist_local(preprocessing, args, 'preprocessing', ['experiment_id', 'approach_id', 'fold'], '.dill')

    return train_dict, test_dict
コード例 #6
0
def save_args(production_path, ids, max_fold, k):
    """Persist the selected model's arguments under ``production_path``.

    The saved dict holds the experiment, approach and learner ids taken from
    ``ids``, the fold ``max_fold``, the result of ``get_features(ids)`` and
    ``k``. It is stored as '.p' under the name 'best_model_args'.
    """
    best_model_args = {
        id_key: ids[id_key]
        for id_key in ('experiment_id', 'approach_id', 'learner_id')
    }
    best_model_args['fold'] = max_fold
    best_model_args['features'] = get_features(ids)
    best_model_args['k'] = k

    # The file is identified only by its fixed name, not by any run id.
    file_identity = {'name': 'best_model_args'}

    persist_local(best_model_args,
                  file_identity,
                  folder=None,
                  id_keys=['name'],
                  as_type='.p',
                  save_path=production_path)
コード例 #7
0
def save_model(production_path, ids):
    """Copy a locally persisted model into ``production_path``.

    The model is looked up in the 'models' folder by experiment, approach and
    learner id (taken from ``ids.to_dict()``) and written again as '.p' with
    the extra 'preffix' value 'model' as the leading id key.
    """
    model_keys = ('experiment_id', 'approach_id', 'learner_id')

    trained_model = get_local(ids.to_dict(),
                              folder='models',
                              id_keys=list(model_keys),
                              as_type='.p')

    # Same ids as the lookup, plus the prefix used for the production file.
    production_args = dict(ids.to_dict(), preffix='model')

    persist_local(trained_model,
                  args=production_args,
                  folder=None,
                  id_keys=['preffix', *model_keys],
                  as_type='.p',
                  save_path=production_path)
コード例 #8
0
def apply_preprocessing(approach, original_train_dict, original_test_dict,
                        args):
    """Generic preprocessing implementation.

    The preprocessors listed in the approach are fitted on the train features
    and then applied, without refitting, to the test features. The inputs are
    deep-copied, so the original dictionaries are never modified. Which
    preprocessors are supported is decided by ``run`` (the original notes
    mention StandardScaler and OneHotEncoder).

    Parameters
    ----------
    approach : dict
        approach variables; preprocessing only runs when it has a
        'preprocessors' key
    original_train_dict : dict
        contains a dataframe with features and labels
    original_test_dict : dict
        contains a dataframe with features and labels
    args : dict
        generic variables of the pipeline, forwarded to ``persist_local``
        (id keys: 'experiment_id', 'approach_id', 'fold_name')

    Returns
    -------
    dict dict
        modified train and test dict
    """

    train_dict = copy.deepcopy(original_train_dict)
    test_dict = copy.deepcopy(original_test_dict)

    # Stays None when the approach declares no preprocessors. Previously the
    # name was unbound in that case and persist_local raised
    # UnboundLocalError.
    preprocessing = None

    if 'preprocessors' in approach:

        # Fit on train only ...
        train_dict['features'], preprocessing = run(approach['preprocessors'],
                                                    train_dict['features'])

        # ... and reuse the fitted preprocessors on test (fit=False).
        test_dict['features'], preprocessing = run(approach['preprocessors'],
                                                   test_dict['features'],
                                                   preprocessing,
                                                   fit=False)

    # Always persisted (None when nothing was fitted) so that an artifact
    # exists for every experiment/approach/fold combination.
    persist_local(preprocessing, args, 'preprocessing',
                  ['experiment_id', 'approach_id', 'fold_name'], '.dill')

    return train_dict, test_dict
コード例 #9
0
def save_preprocessor(production_path, ids, max_fold):
    """Copy a locally persisted preprocessing object into ``production_path``.

    The object is looked up in the 'preprocessing' folder by experiment id,
    approach id and fold (``max_fold``) and written again as '.dill' with the
    extra 'preffix' value 'prepro' as the leading id key.
    """
    lookup_keys = ('experiment_id', 'approach_id', 'fold')

    lookup_args = dict(ids.to_dict(), fold=max_fold)
    content = get_local(lookup_args,
                        folder='preprocessing',
                        id_keys=list(lookup_keys),
                        as_type='.dill')

    # Built from a fresh ids.to_dict() so the lookup dict is never shared.
    production_args = dict(ids.to_dict(), fold=max_fold, preffix='prepro')

    persist_local(content,
                  args=production_args,
                  folder=None,
                  id_keys=['preffix', *lookup_keys],
                  as_type='.dill',
                  save_path=production_path)
コード例 #10
0
def run_tfidf(fold, args):
    """Compute, or load from the local cache, the TFIDF features of a fold.

    The cache key is derived from the fold content together with
    ``args['params']``. When cached train features exist, both train and test
    features are loaded; otherwise the encoder is fitted on the train texts,
    both sets are transformed, and the encoder and both feature tables are
    persisted.

    Parameters
    ----------
    fold : dict
        id_llamado lists (keys 'train' and 'test') to filter features and
        labels
    args : dict
        dictionary of parameters; ``args['params']`` is passed to the TFIDF
        fit and ``args`` itself identifies the persisted encoder

    Returns
    -------
    tuple
        ``(train_features, test_features)`` -- the document features for the
        train and the test set (presumably pd.DataFrame, as they are stored
        as parquet)
    """
    cache_key = {'fold_id': generate_id(str(fold) + str(args['params']))}

    # Fast path: only the train table is checked, then both are loaded.
    if check_if_local_exists(cache_key, 'tfidf-train', ['fold_id']):
        cached_train = get_local(cache_key,
                                 'tfidf-train',
                                 id_keys=['fold_id'],
                                 as_type='.parquet.gz')
        cached_test = get_local(cache_key,
                                'tfidf-test',
                                id_keys=['fold_id'],
                                as_type='.parquet.gz')
        return cached_train, cached_test

    # Processed ids and texts for both splits
    train_ids, train_texts = tfidf_preprocess(fold['train'])
    test_ids, test_texts = tfidf_preprocess(fold['test'])

    spanish_stop_words = set(stopwords.words('spanish'))

    # The encoder only ever sees the train texts
    encoder = vector_fit(train_texts, args['params'], spanish_stop_words)

    train_features = vector_transform(train_texts, train_ids, encoder)
    test_features = vector_transform(test_texts, test_ids, encoder)

    persist_local(encoder, args, 'tfidf', ['experiment_id'], as_type='.p')
    persist_local(train_features, cache_key, 'tfidf-train', ['fold_id'])
    persist_local(test_features, cache_key, 'tfidf-test', ['fold_id'])

    return train_features, test_features
コード例 #11
0
def do_plots(experiment_id):
    """Plot the score per fold of every learner, one figure per metric.

    Evaluation results are read from the database, restricted to
    ``experiment_id`` and drawn with one line per (approach, learner) pair.
    All learners of an approach share one colour, and each approach appears
    exactly once in the legend. Each figure is saved with ``persist_local``
    in the 'evaluation_plots' folder as '.png'.

    Parameters
    ----------
    experiment_id : scalar
        Value matched against the ``experiment_id`` column of the
        evaluations.

    Returns
    -------
    None
    """
    # Get data on experiment results from database
    con = utils.connect_to_database()

    query = """
    select evaluation.*,approach.name
    from experiments.evaluations evaluation
    left join experiments.approaches approach
    on evaluation.approach_id = approach.approach_id
    """

    df = pd.read_sql_query(query, con)

    # Subselect data on specific experiment id
    data = df.loc[df['experiment_id'] == experiment_id]

    approaches = data['approach_id'].unique()
    learners = data['learner_id'].unique()

    # One colour per approach, spread over the whole colormap. The palette
    # used to be sized by the number of rows and consumed sequentially, so
    # the few colours actually used were almost identical and changed from
    # one metric to the next.
    palette = cm.rainbow(np.linspace(0, 1, len(approaches)))
    approach_colors = dict(zip(approaches, palette))

    # Set font size
    plt.rcParams.update({'font.size': 14})

    # Loop to create one fig per metric and a line per learner
    for metric in data['eval_metric'].unique():

        fig, ax1 = plt.subplots(figsize=(15, 8))

        ax1.set_title(f"Metric: {metric}")
        ax1.set_ylabel('score')

        # Fold names containing '-' are treated as dates (temporal folds),
        # anything else as plain k-fold identifiers.
        if '-' in data['fold'].iloc[0]:
            ax1.set_xlabel('time')
            plt.xticks(rotation=90)
        else:
            ax1.set_xlabel('fold')

        for approach in approaches:

            c = approach_colors[approach]
            labelled = False

            for learner in learners:

                data_to_plot = data[(data['learner_id'] == learner)
                                    & (data['approach_id'] == approach) &
                                    (data['eval_metric'] == metric)]

                # A learner that does not belong to this approach has
                # nothing to draw.
                if data_to_plot.empty:
                    continue

                # Label only the first line of each approach so the legend
                # holds one entry per approach.
                label = None
                if not labelled:
                    label = str(data_to_plot['name'].iloc[0])
                    labelled = True

                ax1.plot(data_to_plot['fold'],
                         data_to_plot['score'],
                         c=c,
                         label=label)

        # Built once from the labelled lines. Calling legend() with a list
        # inside the loop kept only the last approach name and attached it
        # to the first line.
        ax1.legend()

        persist_local(data=fig,
                      args={
                          'experiment_id': experiment_id,
                          'eval_metric': metric
                      },
                      folder='evaluation_plots',
                      id_keys=['experiment_id', 'eval_metric'],
                      as_type='.png')
コード例 #12
0
def loop_the_grid(args):
    """
    Train and evaluate every learner of the experiment on every fold.

    For each fold in ``args['folds']`` the train/test matrices are built once
    and shared by all approaches. For each approach the preprocessing is
    applied, and for each hyperparameter combination of that approach a model
    is fitted, used to predict on the test features and evaluated.
    Predictions, model, evaluations and feature importances are persisted.
    Failures are persisted with ``persist_errors`` and only re-raised when
    ``experiment['model_config']['errors']`` is truthy; otherwise the loop
    moves on to the next hyperparameter combination.

    Parameters
    ----------
    args: dictionary
        Minimum set of arguments to start functions. Must contain
        'experiment_id' and 'folds'. The dict is updated while looping
        ('fold_name', 'approach_id', 'approach_name', 'hyperparameters') and
        replaced by the return value of ``persist_learner``.
    """

    experiment = get_experiment(args['experiment_id'])
    approaches = get_approaches(args['experiment_id'])

    # Both tables are indexed by tender id (id_llamado).
    features = get_local(args, 'features').set_index('id_llamado')
    labels = get_local(args, 'labels').set_index('id_llamado')

    # TFIDF arguments are only filled in when the experiment configures text
    # processing; otherwise an empty dict is handed to the fold builder.
    if 'textprocessing' in experiment:
        args_tfidf = {}
        args_tfidf['params'] = experiment['textprocessing']['tfidf']
        args_tfidf['experiment_id'] = args['experiment_id']
    else:
        args_tfidf = {}

    print('Approaches: ', ', '.join([k['name'] for k in approaches]))

    for fold in tqdm(args['folds'], desc='Folds'):

        # The fold name is part of the id of several persisted artifacts.
        args['fold_name'] = fold['name']

        # Built once per fold and reused by every approach.
        original_train_dict, original_test_dict = generate_folds_matrices(
            features, labels, fold, args_tfidf)

        for approach in tqdm(approaches, desc='Approaches'):

            args['approach_id'] = approach['approach_id']
            args['approach_name'] = approach['name']

            train_dict, test_dict = \
            apply_preprocessing(approach, original_train_dict, original_test_dict,
                                                        args)

            for hyperparameters in tqdm(generate_hyperparameters_combinations(
                    approach['hyperparameters']),
                                        desc='Hyper'):

                args['hyperparameters'] = hyperparameters
                # persist_learner returns the args used from here on
                # (presumably with 'learner_id' added, since it is used as an
                # id key below -- TODO confirm).
                args = persist_learner(args)

                try:
                    # Presumably arms a time limit that surfaces as the
                    # TimeoutError handled below -- TODO confirm.
                    max_run_time(experiment['model_config']['max_seconds'])

                    # The approach module is resolved from 'python_path' with
                    # its last three characters dropped (presumably '.py').
                    mod = importlib.import_module(
                        f"pipeline.approaches.{approach['python_path'][:-3]}")
                    model = mod.fit(args, train_dict=train_dict)

                    predictions = mod.predict(
                        model, test_features=test_dict['features'])

                    evaluations = evaluate(obs=test_dict['labels'],
                                           pred=predictions,
                                           evaluation=experiment['evaluation'])

                    feature_importance = get_feature_importance(
                        model, test_dict['features'])

                    persist_local(predictions, args, 'predictions', [
                        'experiment_id', 'approach_id', 'learner_id',
                        'fold_name'
                    ])
                    # NOTE(review): the model id has no fold component, so
                    # later folds presumably overwrite the model of the same
                    # learner -- confirm against persist_local.
                    persist_local(
                        model, args, 'models',
                        ['experiment_id', 'approach_id', 'learner_id'], '.p')
                    persist_evaluation(evaluations, args)
                    persist_feature_importance(feature_importance, args)

                except TimeoutError as error:
                    # The exception object is replaced by a short message
                    # before being persisted; the bare `raise` below still
                    # re-raises the original TimeoutError.
                    error = f'timeout < {experiment["model_config"]["max_seconds"]}'
                    persist_errors(error, args)

                    if experiment['model_config']['errors']:
                        raise

                    continue

                except Exception as e:
                    # Any other failure is recorded and, unless errors are
                    # configured to propagate, skipped.
                    persist_errors(e, args)
                    if experiment['model_config']['errors']:
                        raise
                    continue
コード例 #13
0
def overall_performance_per_fold(args):
    """Plot label prevalence and score distributions per fold.

    The figure has a bar chart with the percentage of positive labels in the
    test set of every evaluated fold, followed by one box plot of the scores
    per fold for each evaluation metric. The figure is saved with
    ``persist_local`` in the 'evaluation_plots' folder as '.png'.

    Parameters
    ----------
    args : dict
        Pipeline arguments. ``args['experiment_id']`` selects the evaluation
        data and the folds file; the dict is also forwarded to ``get_local``.

    Returns
    -------
    None
    """

    title = f'Experiment : {args["experiment_id"]}'

    def complaints_per_fold(args, data):
        """Return the percentage of positive labels per evaluated fold."""

        # The handle is closed as soon as the folds are loaded.
        with open(f'/data/persist/folds/{args["experiment_id"]}.p',
                  'rb') as folds_file:
            folds = pickle.load(folds_file)
        labels = get_local(args, 'labels').set_index('id_llamado')[['target']]

        # Built once; only folds that have evaluation results are reported.
        evaluated_folds = set(data['fold'].unique())

        complaints = []
        for fold in folds:

            if fold['name'] not in evaluated_folds:
                continue

            test_labels = labels.loc[fold['test']]
            complaints.append({
                'complaints':
                100 * test_labels.sum().values[0] / len(test_labels),
                'fold':
                fold['name']
            })

        # Explicit columns keep the frame usable when no fold matched.
        return pd.DataFrame(complaints, columns=['complaints', 'fold'])

    data = fetch_data(args['experiment_id'])

    eval_metrics = data['eval_metric'].unique()

    nrows = len(eval_metrics)  # Number of rows
    ncols = 1  # Number of columns
    fontsize = 14  # General fontsize

    # Two grid rows per metric, plus two rows on top; the bar chart below
    # uses the second of them.
    grid = plt.GridSpec(nrows * 2 + 2, ncols)

    fig = plt.figure(figsize=(15, nrows * 7))

    fig.suptitle(title, fontsize=18, y=0.9)

    # Percentage of Complaints

    axis = plt.subplot(grid[1, 0])

    complaints = complaints_per_fold(args, data)
    axis.bar(complaints['fold'],
             complaints['complaints'],
             label='% of true labels per fold',
             align='edge')

    axis.get_xaxis().set_visible(False)

    axis.set_ylabel('%', fontsize=fontsize)

    axis.legend()

    for row, eval_metric in enumerate(eval_metrics):

        axis = plt.subplot(grid[2 + row * 2:2 + (row + 1) * 2, 0])

        df = data.query(f'eval_metric == "{eval_metric}"')

        # Plot something, it can be more than one
        axis = sns.boxplot(x='fold',
                           y='score',
                           hue='eval_metric',
                           data=df,
                           ax=axis,
                           boxprops=dict(alpha=0.3))

        # Only the last panel keeps its x axis visible.
        if nrows - 1 > row:
            axis.get_xaxis().set_visible(False)

        # Set tick params size
        axis.tick_params(axis='both', labelsize=12)
        axis.tick_params(axis='x', rotation=45)

    fig.tight_layout()

    # The figure covers every metric, so a fixed 'all' marker is used for the
    # 'eval_metric' id. The previous code referenced an undefined name
    # `metric` here and raised NameError.
    persist_local(data=fig,
                  args={
                      'experiment_id': args['experiment_id'],
                      'title': title,
                      'eval_metric': 'all'
                  },
                  folder='evaluation_plots',
                  id_keys=['experiment_id', 'title', 'eval_metric'],
                  as_type='.png')
コード例 #14
0
def plot_approaches_by_fold(experiment_id, best_learner=''):
    """
    Given a experiment_id and learner_id (optional), creates a plot
    for each evaluation metric.
    The plot shows the score for the metric for each learner at each fold.
    Learners that use the same approach have the same color; the learner
    equal to ``best_learner`` is drawn dashed and thicker.

    Parameters
    ----------
    experiment_id : int
        experiment_id to evaluate.
    best_learner: int
        learner_id selected as the best learner. With the default ('') no
        learner is highlighted unless a learner id equals ''.

    Returns
    -------
    None
        Every figure is saved with ``persist_local`` in the
        'evaluation_plots' folder as '.png'.
    """

    data = fetch_data(experiment_id)
    metrics = data['eval_metric'].unique()
    approaches = data['name'].unique()

    # The cycle has exactly one colour per approach, so every metric figure
    # assigns the same colour to the same approach.
    colors = cycle(cm.get_cmap('tab10', len(approaches)).colors)

    title = """
    Experiment: {experiment_id} 
    Metric: {metric}"""

    # Loop to create one fig per metric and a line per learner
    for metric in metrics:

        fig, axis = plt.subplots(figsize=(10, 4))

        axis.set_title(title.format(experiment_id=experiment_id,
                                    metric=metric))
        axis.set_ylabel('score')
        axis.set_ylim([0, 1])
        axis.spines['top'].set_visible(False)
        axis.spines['right'].set_visible(False)

        # Fold names containing '-' are treated as dates (temporal folds),
        # anything else as plain k-fold identifiers.
        if '-' in data['fold'].iloc[0]:
            axis.set_xlabel('time')
            axis.tick_params(axis='both', labelsize=12)
            axis.tick_params(axis='x', rotation=45)
        else:
            axis.set_xlabel('fold')

        for approach in approaches:

            c = next(colors)

            for i, learner in enumerate(data['learner_id'].unique()):

                if learner == best_learner:
                    linestyle = 'dashed'
                    linewidth = 3
                    alpha = 1
                else:
                    linestyle = 'solid'
                    linewidth = 1
                    alpha = 0.8

                data_to_plot = data[(data['learner_id'] == learner)
                                    & (data['eval_metric'] == metric) &
                                    (data['name'] == approach)]

                # Only the first line of each approach carries a label, so
                # the legend holds one entry per approach.
                axis.plot(data_to_plot['fold'],
                          data_to_plot['score'],
                          color=c,
                          alpha=alpha,
                          linestyle=linestyle,
                          linewidth=linewidth,
                          label=approach if i == 0 else "")

        # Single legend, created after the lines exist and on this figure's
        # own axis rather than through pyplot's global state.
        # NOTE(review): the entries are approach names while the title reads
        # 'metric' -- confirm the intended title.
        axis.legend(loc='center left',
                    bbox_to_anchor=(1, 0.5),
                    frameon=False,
                    title='metric')

        # The unformatted title template is part of the persisted id, exactly
        # as before, so existing file names do not change.
        persist_local(data=fig,
                      args={
                          'experiment_id': experiment_id,
                          'title': title,
                          'eval_metric': metric
                      },
                      folder='evaluation_plots',
                      id_keys=['experiment_id', 'title', 'eval_metric'],
                      as_type='.png')