def create_labels(args):
    """
    Build the label table for the experiment cohort and persist it locally.

    The cohort and label SQL fragments are read from the experiment
    configuration, the query result is restricted to the tenders that are
    present in the locally stored features, and the outcome is stored with
    ``persist_local`` under 'labels'.

    Parameters
    ----------
    args: dict
        Pipeline arguments. ``args['experiment_id']`` selects the experiment;
        the whole dict is forwarded to ``get_local`` and ``persist_local``.

    Return
    ---------
    None
        The dataframe of IDs and labels is persisted, not returned.
    """
    experiment = get_experiment(args['experiment_id'])
    feature_ids = get_local(args, 'features')['id_llamado']

    query_template = (
        " select distinct labels.id_llamado as id_llamado,"
        " tipo_procedimiento_codigo, labels.reception_date,"
        " {label_target} as target"
        " from semantic.labels labels"
        " join semantic.tenders tenders"
        " on labels.id_llamado = tenders.id_llamado"
        " where labels.id_llamado in ({cohort}) "
    )
    query = query_template.format(
        cohort=experiment['cohort_config']['query'],
        label_target=experiment['label_config']['query'])

    con = utils.connect_to_database()
    all_labels = pd.read_sql_query(query, con)

    # Keep only the tenders for which features were generated.
    has_features = all_labels['id_llamado'].isin(feature_ids)
    persist_local(all_labels[has_features], args, 'labels')
def generate_temporal_folds(experiment, args):
    """
    Given a label table and temporal parameters, it generates temporal folds.

    Starting at ``as_of_date`` the as-of-date is moved forward by ``aod_lag``
    days per fold. Each fold trains on tenders received in
    ``[train_start, as_of_date - blind_gap]`` and tests on tenders received
    in ``[as_of_date, as_of_date + test_lag]``. Generation stops as soon as
    the test window would end after ``test_date_limit`` or once
    ``number_of_folds`` folds exist (when that parameter is not None).

    Parameters
    ----------
    experiment : dict
        Parameters to perform the experiment. Only
        ``experiment['validation']['parameters']`` is read.
    args : dict
        Minimum set of parameters to run the pipeline

    Returns
    -------
    list of dicts
        All folds information, by as of date. It carries the ids to filter
        the tables ('name', 'train' and 'test' keys). The list is also
        persisted locally under 'folds'.

    Raises
    ------
    ValueError
        If ``aod_lag`` is not positive while ``number_of_folds`` is None,
        because the as-of-date would never reach ``test_date_limit``.
    """
    params = experiment['validation']['parameters']
    date_format = '%Y-%m-%d'

    current_aod = dt.datetime.strptime(params['as_of_date'], date_format)
    # Loop-invariant values are parsed once instead of on every iteration.
    test_date_limit = dt.datetime.strptime(params['test_date_limit'],
                                           date_format)
    test_lag = dt.timedelta(days=params['test_lag'])

    labels = get_local(args, 'labels')
    X = labels[['id_llamado', 'reception_date']]

    folds = []
    k = 1
    while True:
        test_end = current_aod + test_lag
        if test_end > test_date_limit:
            break

        max_folds = params['number_of_folds']
        if max_folds is not None and k > max_folds:
            break

        train_end = current_aod - dt.timedelta(days=params['blind_gap'])
        if params['train_lag'] == 'all':
            # If train_lag is 'all', train_start is set to a dummy old date
            # (2000-01-01)
            train_start = dt.datetime(2000, 1, 1)
        else:
            train_start = train_end - dt.timedelta(days=params['train_lag'])

        train_ids = X.query(
            f"reception_date >= '{train_start}' and reception_date <= '{train_end}'")['id_llamado']
        test_ids = X.query(
            f"reception_date >= '{current_aod}' and reception_date <= '{test_end}'")['id_llamado']

        folds.append({
            'name': dt.datetime.strftime(current_aod, '%Y-%m-%d'),
            'train': train_ids.tolist(),
            'test': test_ids.tolist()
        })

        step = dt.timedelta(days=params['aod_lag'])
        if max_folds is None and step <= dt.timedelta(0):
            # Without a fold cap, a non-advancing as-of-date would keep
            # appending folds forever.
            raise ValueError(
                "'aod_lag' must be positive when 'number_of_folds' is None")
        current_aod = current_aod + step
        k += 1

    persist_local(folds, args, 'folds', as_type='.p')
    return folds
def plot_metric_by_fold(selector):
    """
    Plot, for every evaluation metric, the score of each learner per fold.

    One figure is created per metric with one line per learner, and each
    figure is persisted as a '.png' under 'evaluation_plots'.

    Parameters
    ----------
    selector : dict
        ``selector['data']`` is a dataframe from which the columns
        'experiment_id', 'eval_metric', 'fold', 'learner_id', 'score' and
        'name' are read; ``selector['name']`` is shown in the figure title.

    Returns
    -------
    None
    """
    data = selector['data']
    experiment_id = data['experiment_id'].unique()[0]
    selector_name = selector['name']
    metrics = data['eval_metric'].unique()
    learners = data['learner_id'].unique()

    # NOTE(review): the first value of 'name' prefixes every learner label;
    # presumably the data holds a single approach -- confirm with callers.
    label_prefix = data['name'].unique()[0]

    # Fold names containing '-' are treated as dates (temporal folds),
    # anything else as k-fold identifiers.
    is_temporal = '-' in data['fold'].iloc[0]

    title = """ Experiment: {experiment_id} Metric: {metric} Selector: {selector_name}"""

    # Loop to create one fig per metric and a line per learner
    for metric in metrics:
        fig, axis = plt.subplots(figsize=(15, 8))
        axis.set_title(
            title.format(experiment_id=experiment_id,
                         selector_name=selector_name,
                         metric=metric))
        axis.set_ylabel('score')

        if is_temporal:
            axis.set_xlabel('time')
            axis.tick_params(axis='both', labelsize=12)
            axis.tick_params(axis='x', rotation=45)
        else:
            axis.set_xlabel('fold')

        for learner in learners:
            data_to_plot = data[(data['learner_id'] == learner) &
                                (data['eval_metric'] == metric)]
            # NOTE(review): `color` is not defined in this function; it is
            # assumed to be an iterator of colors available at module level
            # -- TODO confirm.
            axis.plot(data_to_plot['fold'],
                      data_to_plot['score'],
                      color=next(color),
                      label=label_prefix + '_' + str(learner))

        # The legend is built once, after every line has been drawn.
        axis.legend()

        # NOTE(review): 'title' is the unformatted template, so it does not
        # distinguish selectors in the persisted identifier -- verify that
        # this is intended.
        persist_local(data=fig,
                      args={
                          'experiment_id': experiment_id,
                          'title': title,
                          'eval_metric': metric
                      },
                      folder='evaluation_plots',
                      id_keys=['experiment_id', 'title', 'eval_metric'],
                      as_type='.png')
def create_features(args):
    """
    Collect every feature table listed in the experiment file and persist
    the combined result locally.

    One query is issued per entry of ``experiment['features']``; each query
    left-joins the cohort with the feature table. The per-table results are
    inner-merged on 'id_llamado', rows containing missing values are dropped
    and the outcome is stored with ``persist_local`` under 'features'.

    Parameters:
    ------------
    args: dict
        Pipeline arguments. ``args['experiment_id']`` selects the experiment;
        the whole dict is forwarded to ``persist_local``.

    Return:
    ------------
    None
        The dataframe of features is persisted, not returned.
    """
    experiment = get_experiment(args['experiment_id'])

    query_template = (
        "with cd_tenders as ( {cohort} )"
        " select cd_tenders.id_llamado, {columns}"
        " from cd_tenders"
        " left join {table} as feature_table"
        " on cd_tenders.id_llamado = feature_table.id_llamado "
    )

    con = utils.connect_to_database()

    combined = pd.DataFrame()
    for feature_config in experiment['features']:
        sql = query_template.format(
            cohort=experiment['cohort_config']['query'],
            columns=','.join(feature_config['columns']),
            table=feature_config['table'])
        table_features = pd.read_sql_query(sql, con)

        # The first non-empty result seeds the frame; later ones are merged.
        combined = (table_features if combined.empty else combined.merge(
            table_features, on='id_llamado', how='inner'))

    combined = combined.dropna()
    persist_local(combined, args, 'features')
def apply_preprocessing(approach, original_train_dict, original_test_dict, args):
    """
    Apply the approach's preprocessors to copies of the train and test data.

    Parameters
    ----------
    approach : dict
        Approach variables; preprocessing only happens when it contains the
        key 'preprocessors'.
    original_train_dict : dict
        Training data; its 'features' entry is preprocessed (with fitting).
    original_test_dict : dict
        Test data; its 'features' entry is preprocessed with ``fit=False``.
    args : dict
        Pipeline arguments forwarded to ``persist_local``.

    Returns
    -------
    tuple of dict
        Deep copies of the train and test dicts, preprocessed if requested.
        The inputs are left untouched.
    """
    train_dict = copy.deepcopy(original_train_dict)
    test_dict = copy.deepcopy(original_test_dict)

    # Nothing to do for approaches without preprocessors.
    if 'preprocessors' not in approach:
        return train_dict, test_dict

    train_features, preprocessing = run(approach['preprocessors'],
                                        train_dict['features'])
    train_dict['features'] = train_features

    test_features, preprocessing = run(approach['preprocessors'],
                                       test_dict['features'],
                                       preprocessing,
                                       fit=False)
    test_dict['features'] = test_features

    persist_local(preprocessing, args, 'preprocessing',
                  ['experiment_id', 'approach_id', 'fold'], '.dill')
    return train_dict, test_dict
def save_args(production_path, ids, max_fold, k):
    """
    Persist the arguments that describe the selected model.

    Parameters
    ----------
    production_path : object
        Passed to ``persist_local`` as ``save_path``.
    ids : mapping
        Must provide 'experiment_id', 'approach_id' and 'learner_id'; it is
        also handed to ``get_features``.
    max_fold : object
        Stored under the 'fold' key.
    k : object
        Stored under the 'k' key.

    Returns
    -------
    None
    """
    best_model_args = {
        key: ids[key]
        for key in ('experiment_id', 'approach_id', 'learner_id')
    }
    best_model_args['fold'] = max_fold
    best_model_args['features'] = get_features(ids)
    best_model_args['k'] = k

    persist_local(best_model_args, {'name': 'best_model_args'},
                  folder=None,
                  id_keys=['name'],
                  as_type='.p',
                  save_path=production_path)
def save_model(production_path, ids):
    """
    Copy the locally stored model identified by ``ids`` to the production
    path.

    Parameters
    ----------
    production_path : object
        Passed to ``persist_local`` as ``save_path``.
    ids : object
        Must expose ``to_dict()`` returning at least 'experiment_id',
        'approach_id' and 'learner_id'.

    Returns
    -------
    None
    """
    learner_keys = ['experiment_id', 'approach_id', 'learner_id']

    model = get_local(ids.to_dict(),
                      folder='models',
                      id_keys=learner_keys,
                      as_type='.p')

    # The extra 'preffix' entry becomes part of the identifier of the copy.
    target_args = dict(ids.to_dict(), preffix='model')
    persist_local(model,
                  args=target_args,
                  folder=None,
                  id_keys=['preffix', *learner_keys],
                  as_type='.p',
                  save_path=production_path)
def apply_preprocessing(approach, original_train_dict, original_test_dict,
                        args):
    """Run the preprocessors of an approach on copies of train and test data.

    The preprocessors are fitted on the training features and then applied,
    without refitting, to the test features. The resulting preprocessing
    object is persisted locally, keyed by experiment, approach and fold name.

    Parameters
    ----------
    approach : dict
        approach variables; nothing is preprocessed unless it has the key
        'preprocessors'
    original_train_dict : dict
        contains a dataframe with features and labels
    original_test_dict : dict
        contains a dataframe with features and labels
    args : dict
        generic variables of the pipeline

    Returns
    -------
    tuple of dict
        modified copies of the train and test dicts
    """
    train_dict = copy.deepcopy(original_train_dict)
    test_dict = copy.deepcopy(original_test_dict)

    if 'preprocessors' in approach:
        preprocessors = approach['preprocessors']

        # Fit on the training features ...
        fitted = run(preprocessors, train_dict['features'])
        train_dict['features'], preprocessing = fitted

        # ... and reuse the fitted state on the test features.
        applied = run(preprocessors,
                      test_dict['features'],
                      preprocessing,
                      fit=False)
        test_dict['features'], preprocessing = applied

        persist_local(preprocessing, args, 'preprocessing',
                      ['experiment_id', 'approach_id', 'fold_name'], '.dill')

    return train_dict, test_dict
def save_preprocessor(production_path, ids, max_fold):
    """
    Copy the locally stored preprocessing object of a fold to the production
    path.

    Parameters
    ----------
    production_path : object
        Passed to ``persist_local`` as ``save_path``.
    ids : object
        Must expose ``to_dict()`` returning at least 'experiment_id' and
        'approach_id'.
    max_fold : object
        Fold identifier used to locate and to name the preprocessing object.

    Returns
    -------
    None
    """
    fold_keys = ['experiment_id', 'approach_id', 'fold']

    source_args = dict(ids.to_dict(), fold=max_fold)
    content = get_local(source_args,
                        folder='preprocessing',
                        id_keys=fold_keys,
                        as_type='.dill')

    # The extra 'preffix' entry becomes part of the identifier of the copy.
    target_args = dict(ids.to_dict(), fold=max_fold, preffix='prepro')
    persist_local(content,
                  args=target_args,
                  folder=None,
                  id_keys=['preffix', *fold_keys],
                  as_type='.dill',
                  save_path=production_path)
def run_tfidf(fold, args):
    """Wrapper function that runs the process of TFIDF after preprocessing

    Results are cached locally under an id derived from the fold and the
    TFIDF parameters. The cache is only used when BOTH the train and the test
    features are present; otherwise everything is recomputed and persisted
    again, so a partially written cache cannot break the load.

    Parameters
    ----------
    fold : dict
        id_llamado lists to filter features and labels ('train' and 'test')
    args : dict
        dictionary of parameters to be passed into the TFIDF algo;
        ``args['params']`` is read here and the whole dict is forwarded to
        ``persist_local`` when the encoder is stored

    Returns
    -------
    tuple of pd.DataFrame
        train and test dataframes for train and test document features
    """
    fold_id = {'fold_id': generate_id(str(fold) + str(args['params']))}

    cached = all(
        check_if_local_exists(fold_id, folder, ['fold_id'])
        for folder in ('tfidf-train', 'tfidf-test'))

    if cached:
        tfidf_features_train = get_local(fold_id,
                                         'tfidf-train',
                                         id_keys=['fold_id'],
                                         as_type='.parquet.gz')
        tfidf_features_test = get_local(fold_id,
                                        'tfidf-test',
                                        id_keys=['fold_id'],
                                        as_type='.parquet.gz')
    else:
        # Get the processed list of texts for both train and test
        train_id, train_text = tfidf_preprocess(fold['train'])
        test_id, test_text = tfidf_preprocess(fold['test'])

        stop_words = set(stopwords.words('spanish'))

        # Get TFIDF encoder, fitted on the training texts only
        tfidf_encode = vector_fit(train_text, args['params'], stop_words)

        # Get train and test dataframes
        tfidf_features_train = vector_transform(train_text, train_id,
                                                tfidf_encode)
        tfidf_features_test = vector_transform(test_text, test_id,
                                               tfidf_encode)

        persist_local(tfidf_encode,
                      args,
                      'tfidf', ['experiment_id'],
                      as_type='.p')
        persist_local(tfidf_features_train, fold_id, 'tfidf-train',
                      ['fold_id'])
        persist_local(tfidf_features_test, fold_id, 'tfidf-test',
                      ['fold_id'])

    return tfidf_features_train, tfidf_features_test
def do_plots(experiment_id):
    """
    Plot the score per fold of every learner of an experiment.

    One figure is created per evaluation metric. Every learner gets a line,
    lines of the same approach share a color, and the legend holds one entry
    per approach. Each figure is persisted as a '.png' under
    'evaluation_plots'.

    Parameters
    ----------
    experiment_id : object
        Value compared against the 'experiment_id' column of the evaluations.

    Returns
    -------
    None
    """
    # Get data on experiment results from database
    con = utils.connect_to_database()
    query = (
        " select evaluation.*,approach.name"
        " from experiments.evaluations evaluation"
        " left join experiments.approaches approach"
        " on evaluation.approach_id = approach.approach_id "
    )
    df = pd.read_sql_query(query, con)

    # Subselect data on specific experiment id
    data = df.loc[df['experiment_id'] == experiment_id]

    # One color per approach, identical in every figure. Spreading the color
    # map over the number of rows made neighbouring approaches almost
    # indistinguishable.
    approach_ids = data['approach_id'].unique()
    palette = cm.rainbow(np.linspace(0, 1, len(approach_ids)))
    learner_ids = data['learner_id'].unique()

    # Set font size
    plt.rcParams.update({'font.size': 14})

    # Loop to create one fig per metric and a line per learner
    for metric in data['eval_metric'].unique():
        fig, ax1 = plt.subplots(figsize=(15, 8))
        ax1.set_title(f"Metric: {metric}")
        ax1.set_ylabel('score')

        # check if it is k-fold or temporal-fold
        if '-' in data['fold'].iloc[0]:
            ax1.set_xlabel('time')
            # tick_params also applies to ticks created by later plot calls,
            # unlike plt.xticks(rotation=...) issued on empty axes.
            ax1.tick_params(axis='x', rotation=90)
        else:
            ax1.set_xlabel('fold')

        for approach, c in zip(approach_ids, palette):
            labelled = False
            for learner in learner_ids:
                data_to_plot = data[(data['learner_id'] == learner) &
                                    (data['approach_id'] == approach) &
                                    (data['eval_metric'] == metric)]
                if data_to_plot.empty:
                    # This learner has no rows for this approach and metric.
                    continue

                # Only the first line of an approach carries its name, so the
                # legend shows each approach exactly once.
                label = '' if labelled else str(data_to_plot['name'].iloc[0])
                labelled = True
                ax1.plot(data_to_plot['fold'],
                         data_to_plot['score'],
                         c=c,
                         label=label)

        # Build the legend once, from the labelled lines.
        ax1.legend()

        persist_local(data=fig,
                      args={
                          'experiment_id': experiment_id,
                          'eval_metric': metric
                      },
                      folder='evaluation_plots',
                      id_keys=['experiment_id', 'eval_metric'],
                      as_type='.png')
def loop_the_grid(args):
    """
    Train, predict and evaluate every learner of an experiment on every fold.

    For each fold in ``args['folds']`` the train/test matrices are built
    once; for each approach they are preprocessed; and for each
    hyperparameter combination of the approach a learner is fitted,
    used to predict on the test features, evaluated and persisted
    (predictions, model, evaluations and feature importance).

    Failures of a single learner are recorded with ``persist_errors``. They
    are re-raised only when ``experiment['model_config']['errors']`` is
    truthy; otherwise the loop moves on to the next combination.

    Parameters
    ----------
    args: dictionary
        Minimum set of arguments to start functions. 'experiment_id' and
        'folds' are read. The dict is updated in place with 'fold_name',
        'approach_id', 'approach_name' and 'hyperparameters', and is
        replaced by the return value of ``persist_learner`` for every
        combination.

    Returns
    -------
    None
    """
    experiment = get_experiment(args['experiment_id'])
    approaches = get_approaches(args['experiment_id'])
    features = get_local(args, 'features').set_index('id_llamado')
    labels = get_local(args, 'labels').set_index('id_llamado')

    # Check if text processing is needed: when it is, the TFIDF parameters
    # and the experiment id are handed to the fold-matrix builder; otherwise
    # an empty dict is passed.
    if 'textprocessing' in experiment:
        args_tfidf = {}
        args_tfidf['params'] = experiment['textprocessing']['tfidf']
        args_tfidf['experiment_id'] = args['experiment_id']
    else:
        args_tfidf = {}

    print('Approaches: ', ', '.join([k['name'] for k in approaches]))

    for fold in tqdm(args['folds'], desc='Folds'):
        # The fold name is part of the identifiers used when persisting.
        args['fold_name'] = fold['name']
        original_train_dict, original_test_dict = generate_folds_matrices(
            features, labels, fold, args_tfidf)

        for approach in tqdm(approaches, desc='Approaches'):
            args['approach_id'] = approach['approach_id']
            args['approach_name'] = approach['name']

            # Preprocessing is applied per approach to the fold matrices.
            train_dict, test_dict = \
                apply_preprocessing(approach, original_train_dict, original_test_dict, args)

            for hyperparameters in tqdm(generate_hyperparameters_combinations(
                    approach['hyperparameters']), desc='Hyper'):
                args['hyperparameters'] = hyperparameters
                # NOTE(review): presumably persist_learner adds 'learner_id'
                # to args, which the id_keys below rely on -- confirm.
                args = persist_learner(args)

                try:
                    # NOTE(review): presumably arms a time limit that raises
                    # TimeoutError when exceeded -- confirm in max_run_time.
                    max_run_time(experiment['model_config']['max_seconds'])

                    # The module name is 'python_path' without its last three
                    # characters (presumably the '.py' extension).
                    mod = importlib.import_module(
                        f"pipeline.approaches.{approach['python_path'][:-3]}")

                    model = mod.fit(args, train_dict=train_dict)
                    predictions = mod.predict(
                        model, test_features=test_dict['features'])
                    evaluations = evaluate(obs=test_dict['labels'],
                                           pred=predictions,
                                           evaluation=experiment['evaluation'])
                    feature_importance = get_feature_importance(
                        model, test_dict['features'])

                    persist_local(predictions, args, 'predictions', [
                        'experiment_id', 'approach_id', 'learner_id', 'fold_name'
                    ])
                    # The model identifier has no fold component, unlike the
                    # predictions above.
                    persist_local(
                        model, args, 'models',
                        ['experiment_id', 'approach_id', 'learner_id'], '.p')
                    persist_evaluation(evaluations, args)
                    persist_feature_importance(feature_importance, args)

                except TimeoutError as error:
                    # The exception is replaced by a short message before it
                    # is recorded; the bare raise below still re-raises the
                    # original TimeoutError.
                    error = f'timeout < {experiment["model_config"]["max_seconds"]}'
                    persist_errors(error, args)
                    if experiment['model_config']['errors']:
                        raise
                    continue
                except Exception as e:
                    # Any other failure is recorded and, unless errors are
                    # configured to propagate, the loop continues.
                    persist_errors(e, args)
                    if experiment['model_config']['errors']:
                        raise
                    continue
def overall_performance_per_fold(args):
    """
    Plot the share of positive labels and the score distribution per fold.

    The figure has a bar chart with the percentage of true labels in the
    test set of every evaluated fold, followed by one box plot of scores per
    fold for each evaluation metric. The figure is persisted as a '.png'
    under 'evaluation_plots'.

    Parameters
    ----------
    args : dict
        Pipeline arguments. ``args['experiment_id']`` selects the evaluation
        data and the folds file; the whole dict is forwarded to
        ``get_local``.

    Returns
    -------
    None
    """
    title = f'Experiment : {args["experiment_id"]}'

    def complaints_per_fold(args, data):
        """Return the percentage of positive test labels of each fold that
        appears in ``data['fold']``."""
        # NOTE(review): pickle must only be used on trusted files; this path
        # is assumed to be written by the pipeline itself.
        with open(f'/data/persist/folds/{args["experiment_id"]}.p',
                  'rb') as folds_file:
            folds = pickle.load(folds_file)

        labels = get_local(args, 'labels').set_index('id_llamado')[['target']]
        evaluated_folds = set(data['fold'].unique())

        complaints = []
        for fold in folds:
            if fold['name'] not in evaluated_folds:
                continue
            test_labels = labels.loc[fold['test']]
            complaints.append({
                'complaints':
                100 * test_labels.sum().values[0] / len(test_labels),
                'fold':
                fold['name']
            })

        # Explicit columns keep the frame usable when no fold matched.
        return pd.DataFrame(complaints, columns=['complaints', 'fold'])

    data = fetch_data(args['experiment_id'])
    eval_metrics = data['eval_metric'].unique()

    nrows = len(eval_metrics)  # Number of rows
    ncols = 1  # Number of columns
    fontsize = 14  # General fontsize

    grid = plt.GridSpec(nrows * 2 + 2, ncols)
    fig = plt.figure(figsize=(15, nrows * 7))
    fig.suptitle(title, fontsize=18, y=0.9)

    # Percentage of Complaints
    axis = plt.subplot(grid[1, 0])
    complaints = complaints_per_fold(args, data)
    axis.bar(complaints['fold'],
             complaints['complaints'],
             label='% of true labels per fold',
             align='edge')
    axis.get_xaxis().set_visible(False)
    axis.set_ylabel('%', fontsize=fontsize)
    axis.legend()

    for row, eval_metric in enumerate(eval_metrics):
        axis = plt.subplot(grid[2 + row * 2:2 + (row + 1) * 2, 0])
        df = data.query(f'eval_metric == "{eval_metric}"')

        axis = sns.boxplot(x='fold',
                           y='score',
                           hue='eval_metric',
                           data=df,
                           ax=axis,
                           boxprops=dict(alpha=0.3))

        # Only the last panel shows the fold names on its x axis.
        if nrows - 1 > row:
            axis.get_xaxis().set_visible(False)

        # Set tick params size
        axis.tick_params(axis='both', labelsize=12)
        axis.tick_params(axis='x', rotation=45)

    fig.tight_layout()

    # The figure covers every metric, so all of them go into the identifier.
    # The original code referenced an undefined name `metric` here.
    persist_local(data=fig,
                  args={
                      'experiment_id': args['experiment_id'],
                      'title': title,
                      'eval_metric': '_'.join(str(m) for m in eval_metrics)
                  },
                  folder='evaluation_plots',
                  id_keys=['experiment_id', 'title', 'eval_metric'],
                  as_type='.png')
def plot_approaches_by_fold(experiment_id, best_learner=''):
    """
    Given a experiment_id and learner_id (optional), creates a plot for each
    evaluation metric. The plot shows the score for the metric for each
    learner at each fold. Learners that use the same approach have the same
    color; the learner equal to ``best_learner`` is drawn dashed and thicker.

    Each figure is persisted as a '.png' under 'evaluation_plots'.

    Parameters
    ----------
    experiment_id : int
        experiment_id to evaluate.
    best_learner: int
        learner_id selected as the best learner. The default '' highlights
        no learner.

    Returns
    -------
    None
    """
    data = fetch_data(experiment_id)

    metrics = data['eval_metric'].unique()
    approaches = data['name'].unique()
    learners = data['learner_id'].unique()

    # The cycle has exactly one color per approach, so every figure assigns
    # the same color to the same approach.
    colors = cycle(cm.get_cmap('tab10', len(approaches)).colors)

    title = """ Experiment: {experiment_id} Metric: {metric}"""

    # Loop to create one fig per metric and a line per learner
    for metric in metrics:
        fig, axis = plt.subplots(figsize=(10, 4))

        axis.set_title(title.format(experiment_id=experiment_id,
                                    metric=metric))
        axis.set_ylabel('score')
        axis.set_ylim([0, 1])
        axis.spines['top'].set_visible(False)
        axis.spines['right'].set_visible(False)

        # check if it is k-fold or temporal-fold
        if '-' in data['fold'].iloc[0]:
            axis.set_xlabel('time')
            axis.tick_params(axis='both', labelsize=12)
            axis.tick_params(axis='x', rotation=45)
        else:
            axis.set_xlabel('fold')

        for approach in approaches:
            c = next(colors)
            for i, learner in enumerate(learners):
                is_best = learner == best_learner

                data_to_plot = data[(data['learner_id'] == learner) &
                                    (data['eval_metric'] == metric) &
                                    (data['name'] == approach)]

                # Only the first line of an approach is labelled, so the
                # legend lists every approach once.
                axis.plot(data_to_plot['fold'],
                          data_to_plot['score'],
                          color=c,
                          alpha=1 if is_best else 0.8,
                          linestyle='dashed' if is_best else 'solid',
                          linewidth=3 if is_best else 1,
                          label=approach if i == 0 else "")

        # Built once, after every line exists; calling legend() on the empty
        # axes beforehand only produced a "no handles" warning.
        axis.legend(loc='center left',
                    bbox_to_anchor=(1, 0.5),
                    frameon=False,
                    title='metric')

        persist_local(data=fig,
                      args={
                          'experiment_id': experiment_id,
                          'title': title,
                          'eval_metric': metric
                      },
                      folder='evaluation_plots',
                      id_keys=['experiment_id', 'title', 'eval_metric'],
                      as_type='.png')