def save_model(self, explicit_path=None):
        """
        Save the model to disk.
        The model path is derived from data_name, target_name and split_index,
        but can be overridden by passing explicit_path.
        """

        if explicit_path is not None:
            path_to_model = explicit_path
        else:
            path_to_model = self.path_files()[0]

        helpers.ensure_dir_exists(os.path.dirname(path_to_model))
        model_io.write(self.model, path_to_model, fmt='txt')
        if self.use_gpu: self.model.to_cupy()
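
        # Usage sketch (hypothetical call sites and path, not from the original source):
        #   model.save_model()                                       # path derived from data_name, target_name, split_index
        #   model.save_model(explicit_path='/tmp/model_backup.txt')  # override the default location
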
#Opens mapping parameters for the team build dataset
with open(player_config / 'team_player_mappings.yml', 'r') as stream:
    try:
        param_config = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(exc)

data_dir = os.path.join(home_dir, 'NBA_Pro_Line_Analytics/model_build_data/')
data_dir = Path(data_dir)
model_build_team_stats = data_dir / 'NBA_Team_Stats_2010-2019_Rolling_Avg_Win_Loss_pts_diff'
model_build_player_stats = data_dir / 'NBA_Player_Stats_2010-2019_rollling_avg'

team_output_dir = data_dir / 'NBA_Team_Stats_2010-2019_Rolling_Avg_Win_Loss_pts_diff_player_features'

helpers.ensure_dir_exists(team_output_dir)

counter = 0
for dataset in dataset_config[
        'NBA_Team_Stats_2010-2019_Rolling_Avg_Win_Loss_pts_diff']:
    print(f'Now mapping player features for dataset {dataset}')
    team_dataset = pd.read_csv(model_build_team_stats / dataset)
    player_dataset = pd.read_csv(
        model_build_player_stats /
        dataset_config['NBA_Player_stats_w_rolling_avg'][counter])
    counter += 1
    for key, value in param_config.items():
        print(f'Now mapping feature {key}')
        team_dataset = helpers.map_player_to_team(
            team_df=team_dataset,
            player_df=player_dataset,
def run_train_test_cycle(X,
                         Y,
                         L,
                         LS,
                         S,
                         P,
                         model_class,
                         output_root_dir,
                         data_name,
                         target_name,
                         training_programme=None,
                         do_this_if_model_exists='skip',
                         save_data_in_output_dir=True,
                         force_device_for_training=None,
                         force_device_for_evaluation=None,
                         do_xval=True,
                         decision_tree=False):
    """
    This script trains and evaluates a model using the given data X,Y over all splits as determined in S

    Parameters:
    -----------

    X : np.ndarray - A numpy.ndarray shaped (N, T, C), where N is the number of samples, T is the number
        of time points in the data and C is the number of channels per time point.

    Y : np.ndarray - A numpy.ndarray shaped (N, L), where N is the number of samples and L is the number of classes/labels.

    L : list - a list of channel labels of length C, where C is the number of channels in the data.
        L holds textual descriptions of the data's channels.

    LS: np.ndarray - A numpy.ndarray shaped (N, S), where N is the number of samples and S is the number of existing subjects.
        Identifies the subject belonging to each datum in X.
        Runs in parallel to the training labels Y.

    S : list of lists - Contains indices determining the partitioning of the data.
        The outer list groups the splits (i.e. len(S) groups of data) and each element of S contains the sample indices belonging to that split.

    P : np.ndarray - A numpy.ndarray shaped (N,) describing the permutation applied to the input data X and the target labels Y.
        This allows referencing LS to Y and X.

    model_class: model_db.Model - a CLASS providing a set of required functions and the model architecture for executing the training and evaluation loop.

    output_root_dir: str - a string pointing towards the root folder for writing results into.

    data_name: str - what is the data/feature type called? e.g. GRF or JA_X_Lower, ...

    target_name: str - what is the prediction target called? e.g. Subject, Gender or Injury, ...

    training_programme: (optional) ModelTraining class - If this parameter is not None, the model's default training regime will be overwritten
        with the passed ModelTraining class' train_model() function

    do_this_if_model_exists: str - variable controlling the training/evaluation behaviour if a trained model already exists
        at the model output location. options:
        retrain (do everything from scratch)
        load (load model and skip training, perform evaluation only)
        skip (completely skip, do nothing)

    save_data_in_output_dir: bool - controls whether to save the experimental data (X, Y, L, LS, S) in the output directory.

    force_device_for_training: str - values can be either gpu or cpu. force the use of this device during training.

    force_device_for_evaluation: str - values can be either gpu or cpu. force the use of this device during evaluation.
        here, the use of the GPU is almost always recommended due to the large batch size to be processed.

    do_xval: bool - controls whether all data splits are run through a cross-validation scheme, or only data splits 0-2 are to be treated as dedicated training, validation and test splits.

    decision_tree: bool - if True trains a decision tree model as a baseline/comparison option for the target model
    """

    # some basic sanity checks
    assert Y.shape[0] == X.shape[0] == LS.shape[
        0], 'Number of samples differs between labels Y (n={}), data X (n={}) and subject labels LS (n={})'.format(
            Y.shape[0], X.shape[0], LS.shape[0])
    assert len(L) == X.shape[
        2], 'Number of provided channel names/labels in L (c={}) differs from number of channels in data X (c={})'.format(
            len(L), X.shape[2])
    assert sum([len(s) for s in S]) == X.shape[
        0], 'Number of samples distributed over splits in S (n={}) differs from number of samples in X ({})'.format(
            sum([len(s) for s in S]), X.shape[0])

    # save data, labels and split information in output directory.
    if save_data_in_output_dir:
        print('Saving training and evaluation data to {}'.format(
            output_root_dir))
        helpers.ensure_dir_exists(output_root_dir)
        scipy.io.savemat('{}/data.mat'.format(output_root_dir), {'X': X})
        scipy.io.savemat('{}/targets.mat'.format(output_root_dir), {'Y': Y})
        scipy.io.savemat('{}/channel_labels.mat'.format(output_root_dir),
                         {'L': L})
        scipy.io.savemat('{}/subject_labels.mat'.format(output_root_dir),
                         {'LS': LS})
        scipy.io.savemat('{}/splits.mat'.format(output_root_dir), {'S': S})
        scipy.io.savemat('{}/permutation.mat'.format(output_root_dir),
                         {'P': P})

    #prepare log to append anything happening in this session. kinda deprecated.
    logfile = open('{}/log.txt'.format(output_root_dir), 'a')

    # start main loop and execute training/evaluation for all the splits defined in S
    for split_index in range(len(S)):
        if split_index > 0 and not do_xval:
            cprint(
                colored(
                    'Cross-Validation has been disabled. Terminating after first iteration.',
                    'yellow'))
            #terminate here after one iteration, e.g. in case predetermined splits have been given.
            break

        model = model_class(output_root_dir, data_name, target_name,
                            split_index)
        model_dir = model.path_dir()
        helpers.ensure_dir_exists(model_dir)

        # this case: do nothing.
        if model.exists() and do_this_if_model_exists == 'skip':
            print('Model already exists at {}. skipping'.format(model_dir))
            continue  #skip remaining code, there is nothing to be done. please move along.

        # other cases: split data in any case. measure time. set output log
        t_start = time.time()

        # collect data indices from split table
        j_test = split_index
        i_test = S[j_test]
        j_val = (split_index + 1) % len(S)
        i_val = S[j_val]
        j_train = list(set(range(len(S))) - {j_test, j_val})
        i_train = []
        for j in j_train:
            i_train.extend(S[j])
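
        # Example of the split rotation used above (hypothetical, with len(S) == 5 and split_index == 2):
        #   test  indices -> S[2]
        #   val   indices -> S[3]               (the next split, wrapping around)
        #   train indices -> S[0] + S[1] + S[4] (all remaining splits)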

        # collect data from indices
        x_train = X[i_train, ...]
        y_train = Y[i_train, ...]
        x_test = X[i_test, ...]
        y_test = Y[i_test, ...]
        x_val = X[i_val, ...]
        y_val = Y[i_val, ...]

        # remember shape of test data as originally given
        x_test_shape_orig = x_test.shape

        # model-specific data processing
        x_train, x_val, x_test, y_train, y_val, y_test =\
            model.preprocess_data(x_train, x_val, x_test, y_train, y_val, y_test)

        if not model.exists() or (model.exists()
                                  and do_this_if_model_exists == 'retrain'):
            model.build_model(x_train.shape, y_train.shape)
            if training_programme is not None:
                #this instance-based monkey-patching is not the best way to do it, but probably the most flexible one.
                model.train_model = types.MethodType(
                    training_programme.train_model, model)
            model.train_model(x_train,
                              y_train,
                              x_val,
                              y_val,
                              force_device=force_device_for_training)
            model.save_model()
        else:
            model.load_model()

        # compute test scores and relevance maps for model.
        results = model.evaluate_model(
            x_test,
            y_test,
            force_device=force_device_for_evaluation,
            lower_upper=helpers.get_channel_wise_bounds(x_train)
        )  # compute and give data bounds computed from training data.

        # measure time for training/evaluation cycle
        t_end = time.time()

        # write report for terminal printing
        report = '\n{}\n'.format(model.path_dir().replace('/', ' '))
        report += 'test accuracy : {}\n'.format(results['acc'])
        report += 'test loss (l1): {}\n'.format(results['loss_l1'])
        report += 'train-evaluation-sequence done after {}s\n\n'.format(
            t_end - t_start)
        print(report)

        #dump results to output of this run
        with open('{}/scores.txt'.format(model.path_dir()), 'w') as f:
            f.write(report)

        #also write results to parsable log file for eval_score_logs module
        logfile.write(report)
        logfile.flush()

        #dump evaluation results to mat file
        scipy.io.savemat('{}/outputs.mat'.format(model.path_dir()), results)

        if decision_tree:  # and (not model.exists() or (model.exists() and do_this_if_model_exists == 'retrain')):
            # DTree training and evaluation is currently limited to settings where the target model is also trained.
            print('Training and evaluating alternative decision tree model')
            t_start = time.time()

            # make sure all data lives in CPU space for the DT model
            x_train, x_val, x_test, y_train, y_val, y_test =\
                 helpers.arrays_to_numpy(x_train, x_val, x_test, y_train, y_val, y_test)

            random_state = 42
            #prep data for DT models
            x_train_dt = np.reshape(x_train, [x_train.shape[0], -1])
            x_val_dt = np.reshape(x_val, [x_val.shape[0], -1])
            x_test_dt = np.reshape(x_test, [x_test.shape[0], -1])

            #some models (e.g. SVM) flatten y_train to a 1-D label vector. restore the one-hot encoding here
            if len(y_train.shape) == 1:
                tmp = np.zeros((y_train.shape[0],
                                y_val.shape[1]))  # n_samples x n_classes
                tmp[np.arange(y_train.shape[0]), y_train] = 1
                y_train = tmp
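                # e.g. (hypothetical) y_train == [2, 0] with 3 classes becomes
                # [[0, 0, 1],
                #  [1, 0, 0]]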

            clf = tree.DecisionTreeClassifier(random_state=random_state)
            clf.fit(x_train_dt, y_train)

            y_pred_train = clf.predict(x_train_dt)
            acc_train = helpers.accuracy(y_pred_train, y_train)

            y_pred_val = clf.predict(x_val_dt)
            acc_val = helpers.accuracy(y_pred_val, y_val)

            y_pred_test = clf.predict(x_test_dt)
            acc_test = helpers.accuracy(y_pred_test, y_test)

            importances = clf.feature_importances_

            #collect results
            dtree_results = {
                'acc_train': acc_train,
                'acc_test': acc_test,
                'acc_val': acc_val,
                'y_pred_train': y_pred_train,
                'y_pred_test': y_pred_test,
                'y_pred_val': y_pred_val,
                'importances': importances
            }

            t_end = time.time()

            #save results in file, in parallel to outputs.mat for the target model
            scipy.io.savemat('{}/outputs_dtree.mat'.format(model.path_dir()),
                             dtree_results)

            # write report for terminal printing. only test_accuracy (i.e. the first line after the header) will be parsed by eval_score_logs
            dtree_report = '\n{}\n'.format(model.path_dir().replace(
                '/',
                ' ').replace(model_class.__name__,
                             'comp.DTree:{}'.format(model_class.__name__)))
            dtree_report += 'test accuracy : {}\n'.format(
                dtree_results['acc_test'])
            dtree_report += 'val accuracy : {}\n'.format(
                dtree_results['acc_val'])
            dtree_report += 'train accuracy : {}\n'.format(
                dtree_results['acc_train'])
            dtree_report += 'train-evaluation-sequence done after {}s\n\n'.format(
                t_end - t_start)
            print(dtree_report)

            #dump results to output of this run
            #again, in parallel to scores.txt for the target model
            with open('{}/scores_dtree.txt'.format(model.path_dir()),
                      'w') as f:
                f.write(dtree_report)

            #also write dtree report into logfile
            logfile.write(dtree_report)
            logfile.flush()
def double_digit_ind(df, stat_cols=['PTS', 'A', 'TOT', 'BL', 'ST']):
    '''
    Function used to generate indicators for whether the player has recorded double
    digits in each of the stats listed in stat_cols.
    '''
    cols_needed = []
    for stat in stat_cols:
        col_name = stat + '_doub_digit_ind'
        cols_needed.append(col_name)
        df[col_name] = df[stat].apply(lambda x: 1 if x >= 10 else 0)

    df['num_cats_doub_digit'] = df[cols_needed].sum(axis = 1)
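
# Minimal usage sketch (hypothetical box-score values, not from the real data):
#   df = pd.DataFrame({'PTS': [25, 8], 'A': [11, 3], 'TOT': [4, 12], 'BL': [0, 1], 'ST': [2, 0]})
#   double_digit_ind(df)              # mutates df in place, returns None
#   df['num_cats_doub_digit']         # -> [2, 1]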

player_dir = home_dir / 'NBA_Pro_Line_Analytics/raw_data/NBA_Player_Stats_2010-2019/'

output_dir = helpers.ensure_dir_exists(home_dir / 'NBA_Pro_Line_Analytics/model_build_data/NBA_Player_Stats_2010-2019_rollling_avg/')

i = 0

for data in dataset_config['NBA_Player_stats_raw']:
    print(f'Now reading in dataset {data}')
    player_stats = pd.read_excel(player_dir / data, sheet_name=0)

    print(f'Renaming features:')
    player_stats = player_stats.rename(columns = {'PLAYER \nFULL NAME': 'player_name',
                                                  'OWN \nTEAM': 'player_team',
                                                  'OPPONENT \nTEAM': 'opposing_team',
                                                  'VENUE\n(R/H)': 'venue',
                                                  'STARTER\n(Y/N)': 'starter_ind',
                                                  'USAGE \nRATE (%)': 'usage_rate',
                                                  'DAYS\nREST': 'days_rested',
def run_train_test_cycle_single(X_train,
                                Y_train,
                                X_test,
                                Y_test,
                                X_val,
                                Y_val,
                                L,
                                model_class,
                                output_dir,
                                data_name,
                                target_name,
                                training_programme=None,
                                do_this_if_model_exists='skip',
                                save_data_in_output_dir=True,
                                force_device_for_training=None,
                                force_device_for_evaluation=None):
    """
    This script trains and evaluates a model on the given, pre-split training, validation and test data.

    Parameters:
    -----------

    X_train : np.ndarray - Training data. A numpy.ndarray shaped (N, T, C), where N is the number of samples, T is the number
        of time points in the data and C is the number of channels per time point.

    Y_train : np.ndarray - Training labels. A numpy.ndarray shaped (N, L), where N is the number of samples and L is the number of classes/labels.

    X_test : np.ndarray - Test data. See X_train

    Y_test : np.ndarray - Test labels. See Y_train

    X_val : np.ndarray - Validation data. See X_train

    Y_val : np.ndarray - Validation labels. See Y_train

    L : list - a list of channel labels of length C, where C is the number of channels in the data.
        L holds textual descriptions of the data's channels

    model_class: model_db.Model - a CLASS providing a set of required functions and the model architecture for executing the training and evaluation loop

    output_dir: str - a string pointing towards the folder for writing results and data into.

    data_name: str - what is the data/feature type called? e.g. GRF or JA_X_Lower, ...

    target_name: str - what is the prediction target called? e.g. Subject, Gender or Injury, ...

    training_programme: (optional) ModelTraining class - If this parameter is not None, the model's default training regime will be overwritten
        with the passed ModelTraining class' train_model() function

    do_this_if_model_exists: str - variable controlling the training/evaluation behaviour if a trained model already exists
        at the model output location. options:
        retrain (do everything from scratch)
        load (load model and skip training, perform evaluation only)
        skip (completely skip, do nothing)

    save_data_in_output_dir: bool - controls whether to save the experimental data (X/Y train, val and test splits, and L) in the output directory.

    force_device_for_training: str - values can be either gpu or cpu. force the use of this device during training.

    force_device_for_evaluation: str - values can be either gpu or cpu. force the use of this device during evaluation.
        here, the use of the GPU is almost always recommended due to the large batch size to be processed.
    """

    # save data, labels and split information in output directory.
    if save_data_in_output_dir:
        print('Saving training and evaluation data to {}'.format(output_dir))
        helpers.ensure_dir_exists(output_dir)
        scipy.io.savemat('{}/data_train.mat'.format(output_dir),
                         {'X': X_train})
        scipy.io.savemat('{}/targets_train.mat'.format(output_dir),
                         {'Y': Y_train})
        scipy.io.savemat('{}/data_test.mat'.format(output_dir), {'X': X_test})
        scipy.io.savemat('{}/targets_test.mat'.format(output_dir),
                         {'Y': Y_test})
        scipy.io.savemat('{}/data_val.mat'.format(output_dir), {'X': X_val})
        scipy.io.savemat('{}/targets_val.mat'.format(output_dir), {'Y': Y_val})
        scipy.io.savemat('{}/channel_labels.mat'.format(output_dir), {'L': L})
    #prepare log to append anything happening in this session. kinda deprecated.
    logfile = open('{}/log.txt'.format(output_dir), 'a')

    # start main procedure
    model = model_class(output_dir, data_name, target_name,
                        0)  # note here: split index will always be zero.
    model_dir = model.path_dir()
    helpers.ensure_dir_exists(model_dir)

    # this case: do nothing.
    if model.exists() and do_this_if_model_exists == 'skip':
        print('Model already exists at {}. skipping'.format(model_dir))
        return  #skip remaining code, there is nothing to be done. please move along.

    # other cases: split data in any case. measure time. set output log
    t_start = time.time()

    # remember shape of test data as originally given
    X_test_shape_orig = X_test.shape

    # model-specific data processing
    X_train, X_val, X_test, Y_train, Y_val, Y_test =\
        model.preprocess_data(X_train, X_val, X_test, Y_train, Y_val, Y_test)

    if not model.exists() or (model.exists()
                              and do_this_if_model_exists == 'retrain'):
        model.build_model(X_train.shape, Y_train.shape)
        if training_programme is not None:
            #this instance-based monkey-patching is not the best way to do it, but probably the most flexible one.
            model.train_model = types.MethodType(
                training_programme.train_model, model)
        model.train_model(X_train,
                          Y_train,
                          X_val,
                          Y_val,
                          force_device=force_device_for_training)
        model.save_model()
    else:
        model.load_model()

    # compute test scores and relevance maps for model.
    results = model.evaluate_model(
        X_test,
        Y_test,
        force_device=force_device_for_evaluation,
        lower_upper=helpers.get_channel_wise_bounds(X_train)
    )  # compute and give data bounds computed from training data.

    # measure time for training/evaluation cycle
    t_end = time.time()

    # write report for terminal printing
    report = '\n{}\n'.format(model.path_dir().replace('/', ' '))
    report += 'test accuracy : {}\n'.format(results['acc'])
    report += 'test loss (l1): {}\n'.format(results['loss_l1'])
    report += 'train-evaluation-sequence done after {}s\n\n'.format(t_end -
                                                                    t_start)
    print(report)

    #dump results to output of this run
    with open('{}/scores.txt'.format(model.path_dir()), 'w') as f:
        f.write(report)

    #also write results to parsable log file for eval_score_logs module
    logfile.write(report)
    logfile.flush()

    #dump evaluation results to mat file
    scipy.io.savemat('{}/outputs.mat'.format(model.path_dir()), results)

run_train_test_cycle_single(
                                                    X_train = loaded_data['X_train'],
                                                    Y_train = loaded_data['Y_train'],
                                                    X_test  = loaded_data['X_test'],
                                                    Y_test  = loaded_data['Y_test'],
                                                    X_val   = loaded_data['X_val'],
                                                    Y_val   = loaded_data['Y_val'],
                                                    L       = loaded_data['X_train_channel_labels'],
                                                    model_class = arch,
                                                    output_dir  = ARGS.output_dir,
                                                    data_name   = ARGS.data_name,
                                                    target_name = ARGS.target_name,
                                                    training_programme = training_regime,
                                                    do_this_if_model_exists = ARGS.model_exists,
                                                    save_data_in_output_dir = ARGS.save_data,
                                                    force_device_for_training = ARGS.force_training_device,
                                                    force_device_for_evaluation = ARGS.force_evaluation_device
                                                )

eval_score_logs.run(ARGS.output_dir)

#record function call and parameters if we arrived here

if ARGS.record_call:
    print('Recording current call configuration to {}'.format(ARGS.record_file))
    helpers.ensure_dir_exists(os.path.dirname(ARGS.record_file))
    with open(ARGS.record_file, 'a') as f:
        argline = ' '.join(['--{} {}'.format(a, getattr(ARGS,a)) for a in vars(ARGS)])
        line = '{} : python {} {}'.format(current_datetime,
                                       sys.modules[__name__].__file__,
                                       argline)
        f.write('{}\n\n'.format(line))
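
    # The recorded line looks roughly like (hypothetical timestamp, script name and flags):
    #   2021-03-01_14-22-05 : python train_models.py --output_dir ./output --data_name GRF --target_name Injury ...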


scripts_dir = os.path.join(gbm_build_dir, 'model_build_scripts')
sys.path.insert(1, scripts_dir)

import helpers

def map_score_diff(df):
    if df['TEAM_HT'] == team:
        return df['score_diff']
    elif df['TEAM_RT'] == team:
        return -1 * df['score_diff']
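
# Note: map_score_diff reads the module-level `team` variable set in the loop further below.
# Sign convention sketch (hypothetical row, assuming score_diff = Final_Score_HT - Final_Score_RT):
# if team == 'BOS' and TEAM_HT == 'BOS' with score_diff == +7, the mapped value is +7;
# if instead TEAM_RT == 'BOS', the mapped value is -7, i.e. the margin from BOS's perspective.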

output_dir = helpers.ensure_dir_exists(os.path.join(home_dir, 'NBA_Pro_Line_Analytics/', 'model_build_data/NBA_Team_Stats_2010-2019_Rolling_Avg_Win_Loss_pts_diff/'))


for df_data in config['NBA_Team_Stats_2010-2019_rolling_avg_win_loss']:
    data = pd.read_csv(os.path.join(home_dir,
                                    'NBA_Pro_Line_Analytics/model_build_data/NBA_Team_Stats_2010-2019_rolling_avg_win_loss/', df_data))
    data['TOT_Final_Score'] = data['Final_Score_HT'] + data['Final_Score_RT']
    for key, value in config_pt_diff_params.items():
        print(f'Now computing feature {key}')
        if key.startswith('AVG_PTdiff'):
            data['HT_' + key] = ""
            data['RT_' + key] = ""
            for team in data.TEAM_HT.unique():
                aa = data[(data.TEAM_HT == team) | (data.TEAM_RT == team)][['TEAM_HT', 'TEAM_RT', 'outcome', 'score_diff','Final_Score_HT', 'Final_Score_RT']]
                aa[team + '_scorediff'] = aa.apply(map_score_diff, axis=1)
                aa[team + '_' + key] = aa[team + '_scorediff'].rolling(value['rolling_window_required']).mean().shift()
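                # .shift() delays the rolling mean by one game, so the feature for a given game only
                # uses strictly earlier games (no leakage of the current result). Hypothetical example
                # with rolling_window_required == 3: score diffs [5, -2, 9, 4] yield
                # [NaN, NaN, NaN, 4.0] after rolling(3).mean().shift().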
def analyze_dataset(catalog_id,
                    catalog,
                    dataset_identifier,
                    datasets_output_dir,
                    debug_mode=False,
                    replace=True,
                    debug_distribution_ids=None):
    res = {
        "dataset_status": None,
        "distributions_ok": [],
        "distributions_error": [],
    }

    dataset_meta = catalog.get_dataset(dataset_identifier)

    if dataset_meta:
        dataset_dir = os.path.join(datasets_output_dir, dataset_identifier)
        helpers.ensure_dir_exists(dataset_dir)
        res["dataset_status"] = "OK"
    else:
        res["dataset_status"] = "ERROR: metadata"
        return res

    distribution_ids = [
        distribution["identifier"]
        for distribution in dataset_meta["distribution"]
    ]

    # in debug mode, optionally restrict processing to the specified distribution ids
    if debug_mode and debug_distribution_ids:
        distribution_ids = [
            distribution_id for distribution_id in distribution_ids
            if distribution_id in debug_distribution_ids
        ]

    # create each of the dataset's distributions
    for distribution_identifier in distribution_ids:
        msg = "Distribución {}: {} ({})"
        try:
            distrib_meta = catalog.get_distribution(distribution_identifier)

            # use fileName if the distribution specifies it, otherwise build one
            distribution_name = title_to_name(distrib_meta["title"])
            distribution_file_name = distrib_meta.get(
                "fileName", "{}.csv".format(distribution_name))
            dist_path = os.path.join(dataset_dir, "distribution",
                                     distribution_identifier, "download",
                                     "{}".format(distribution_file_name))
            dist_url = get_distribution_url(dist_path)
            # print("esta es la URL QUE VA AL CATALOGO", dist_url)
            distrib_meta["downloadURL"] = dist_url

            # if the file already exists, decide whether to replace it or skip it
            if not os.path.exists(dist_path) or replace:
                status = "Replaced" if os.path.exists(dist_path) else "Created"
                origin_dist_path, df = analyze_distribution(
                    catalog_id, catalog, dataset_identifier,
                    distribution_identifier)

                helpers.ensure_dir_exists(os.path.dirname(dist_path))
                shutil.copyfile(origin_dist_path, dist_path)
            else:
                status = "Skipped"

            res["distributions_ok"].append((distribution_identifier, status))
            logger.info(msg.format(distribution_identifier, "OK", status))

        except Exception as e:
            if isinstance(e, KeyboardInterrupt):
                raise
            res["distributions_error"].append((distribution_identifier,
                                               repr(e).encode("utf8")))

            trace_string = traceback.format_exc()
            logger.error(
                msg.format(distribution_identifier, "ERROR",
                           repr(e).encode("utf8")))
            for line in trace_string.splitlines():
                logger.error(line)

            if debug_mode:
                raise
            res["dataset_status"] = "ERROR: scraping"

    return res
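
# Sketch of the summary dict returned by analyze_dataset (and scrape_dataset below);
# identifiers and error messages are hypothetical:
#   {'dataset_status': 'OK',
#    'distributions_ok': [('125.1', 'Created'), ('125.2', 'Skipped')],
#    'distributions_error': [('125.3', b"KeyError('title')")]}
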
def scrape_dataset(xl,
                   catalog,
                   dataset_identifier,
                   datasets_dir,
                   debug_mode=False,
                   replace=True,
                   debug_distribution_ids=None,
                   catalog_id=None):
    res = {
        "dataset_status": None,
        "distributions_ok": [],
        "distributions_error": [],
    }

    dataset_meta = catalog.get_dataset(dataset_identifier)

    if dataset_meta:
        dataset_dir = os.path.join(datasets_dir, dataset_identifier)
        helpers.ensure_dir_exists(dataset_dir)
        res["dataset_status"] = "OK"
    else:
        res["dataset_status"] = "ERROR: metadata"
        return res

    # filter the parameters for one particular dataset
    distribution_ids = [
        distribution["identifier"]
        for distribution in dataset_meta["distribution"]
    ]

    # in debug mode, optionally restrict processing to the specified distribution ids
    if debug_mode and debug_distribution_ids:
        distribution_ids = [
            distribution_id for distribution_id in distribution_ids
            if distribution_id in debug_distribution_ids
        ]

    # create each of the dataset's distributions
    for distribution_identifier in distribution_ids:
        msg = "Distribución {}: {} ({})"
        try:
            distrib_meta = catalog.get_distribution(distribution_identifier)
            distribution_name = title_to_name(distrib_meta["title"])
            distribution_file_name = distrib_meta.get(
                "fileName", "{}.csv".format(distribution_name))
            dist_download_dir = os.path.join(dataset_dir, "distribution",
                                             distribution_identifier,
                                             "download")
            dist_path = os.path.join(dist_download_dir,
                                     "{}".format(distribution_file_name))
            dist_url = get_distribution_url(dist_path)
            # print("esta es la URL QUE VA AL CATALOGO", dist_url)
            distrib_meta["downloadURL"] = dist_url

            # if the file already exists, decide whether to replace it or skip it
            if not os.path.exists(dist_path) or replace:
                status = "Replaced" if os.path.exists(dist_path) else "Created"
                distribution = scrape_distribution(xl, catalog,
                                                   distribution_identifier)

                if isinstance(distribution, list):
                    distribution_complete = pd.concat(distribution)
                else:
                    distribution_complete = distribution

                helpers.remove_other_files(os.path.dirname(dist_path))
                distribution_complete.to_csv(
                    dist_path, encoding="utf-8", index_label="indice_tiempo")
            else:
                status = "Skipped"

            res["distributions_ok"].append((distribution_identifier, status))
            logger.info(msg.format(distribution_identifier, "OK", status))

        except Exception as e:
            if isinstance(e, KeyboardInterrupt):
                raise

            res["distributions_error"].append((distribution_identifier,
                                               repr(e).encode("utf8")))

            trace_string = traceback.format_exc()
            print(
                msg.format(distribution_identifier, "ERROR",
                           repr(e).encode("utf8")))
            print(trace_string)
            if debug_mode:
                raise
            res["dataset_status"] = "ERROR: scraping"

            # if there is no previous version of the distribution, remove it from the catalog
            try:
                get_distribution_path(catalog_id, dataset_identifier,
                                      distribution_identifier)
            except Exception:
                catalog.remove_distribution(distribution_identifier,
                                            dataset_identifier)

    return res