def save_model(self, explicit_path=None):
    """
    save the model to disk.
    the model path is determined via data_name, target_name and split_index,
    and can be overridden by passing explicit_path.
    """
    if explicit_path is not None:
        path_to_model = explicit_path
    else:
        path_to_model = self.path_files()[0]
    helpers.ensure_dir_exists(os.path.dirname(path_to_model))
    model_io.write(self.model, path_to_model, fmt='txt')
    if self.use_gpu:
        # move the model (back) to cupy/GPU memory after writing
        self.model.to_cupy()
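# Hedged usage sketch (not part of the original source): how save_model might be invoked on a
# trained model instance. `model` and the explicit path below are purely illustrative assumptions.
#
#   model.save_model()                                        # write to the path derived from
#                                                             # data_name, target_name, split_index
#   model.save_model(explicit_path='/tmp/example_model.txt')  # override the destination file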
# open the mapping parameters for the team build dataset
with open(player_config / 'team_player_mappings.yml', 'r') as stream:
    try:
        param_config = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(exc)

data_dir = os.path.join(home_dir, 'NBA_Pro_Line_Analytics/model_build_data/')
data_dir = Path(data_dir)
model_build_team_stats = data_dir / 'NBA_Team_Stats_2010-2019_Rolling_Avg_Win_Loss_pts_diff'
model_build_player_stats = data_dir / 'NBA_Player_Stats_2010-2019_rollling_avg'
team_output_dir = data_dir / 'NBA_Team_Stats_2010-2019_Rolling_Avg_Win_Loss_pts_diff_player_features'
helpers.ensure_dir_exists(team_output_dir)

counter = 0
for dataset in dataset_config['NBA_Team_Stats_2010-2019_Rolling_Avg_Win_Loss_pts_diff']:
    print(f'Now mapping player features for dataset {dataset}')
    team_dataset = pd.read_csv(model_build_team_stats / dataset)
    player_dataset = pd.read_csv(
        model_build_player_stats / dataset_config['NBA_Player_stats_w_rolling_avg'][counter])
    counter += 1
    for key, value in param_config.items():
        print(f'Now mapping feature {key}')
        team_dataset = helpers.map_player_to_team(
            team_df=team_dataset,
            player_df=player_dataset,
def run_train_test_cycle(X, Y, L, LS, S, P, model_class, output_root_dir,
                         data_name, target_name, training_programme=None,
                         do_this_if_model_exists='skip',
                         save_data_in_output_dir=True,
                         force_device_for_training=None,
                         force_device_for_evaluation=None,
                         do_xval=True, decision_tree=False):
    """
    Trains and evaluates a model using the given data X, Y over all splits determined in S.

    Parameters:
    -----------
    X : np.ndarray - A numpy.ndarray shaped (N, T, C), where N is the number of samples, T is
        the number of time points in the data and C is the number of channels per time point.

    Y : np.ndarray - A numpy.ndarray shaped (N, L), where N is the number of samples and L is
        the number of classes/labels.

    L : list - A list of channel labels of length C, where C is the number of channels in the
        data. L holds textual descriptions of the data's channels.

    LS : np.ndarray - A numpy.ndarray shaped (N, S), where N is the number of samples and S is
        the number of existing subjects. Identifies the subject each datum in X belongs to.
        Runs in parallel to the training labels Y.

    S : list of lists - Contains indices determining the partitioning of the data. The outer
        list groups the splits (i.e. len(S) groups of data) and each element of S contains the
        indices belonging to that split.

    P : np.ndarray - A numpy.ndarray shaped (N,) describing the permutation applied to the
        input data X and the target labels Y. This allows referencing LS to Y and X.

    model_class : model_db.Model - a CLASS providing a set of required functions and the model
        architecture for executing the training and evaluation loop.

    output_root_dir : str - a string pointing towards the root folder for writing results into.

    data_name : str - what is the data/feature type called? e.g. GRF or JA_X_Lower, ...

    target_name : str - what is the prediction target called? e.g. Subject, Gender or Injury, ...

    training_programme : (optional) ModelTraining class - if this parameter is not None, the
        model's default training regime will be overridden with the passed ModelTraining
        class' train_model() function.

    do_this_if_model_exists : str - controls the training/evaluation behaviour if a trained
        model already exists at the model output location. Options:
            retrain (do everything from scratch)
            load    (load the model and skip training, perform evaluation only)
            skip    (completely skip, do nothing)

    save_data_in_output_dir : bool - controls whether to save the experimental data
        (X, Y, L, LS, S) in the output directory.

    force_device_for_training : str - either 'gpu' or 'cpu'. Forces the use of this device
        during training.

    force_device_for_evaluation : str - either 'gpu' or 'cpu'. Forces the use of this device
        during evaluation. Here, the use of the GPU is almost always recommended due to the
        large batch size to be processed.
    do_xval : bool - controls whether all data splits are run through a cross-validation
        scheme, or only data splits 0-2 are treated as dedicated training, validation and
        test splits.

    decision_tree : bool - if True, additionally trains a decision tree model as a
        baseline/comparison option for the target model.
    """

    # some basic sanity checks
    assert Y.shape[0] == X.shape[0] == LS.shape[0], \
        'Number of samples differs between labels Y (n={}), data X (n={}) and subject labels LS (n={})'.format(
            Y.shape[0], X.shape[0], LS.shape[0])
    assert len(L) == X.shape[2], \
        'Number of provided channel names/labels in L (c={}) differs from number of channels in data X (c={})'.format(
            len(L), X.shape[2])
    assert sum([len(s) for s in S]) == X.shape[0], \
        'Number of samples distributed over splits in S (n={}) differs from number of samples in X ({})'.format(
            sum([len(s) for s in S]), X.shape[0])

    # save data, labels and split information in the output directory.
    if save_data_in_output_dir:
        print('Saving training and evaluation data to {}'.format(output_root_dir))
        helpers.ensure_dir_exists(output_root_dir)
        scipy.io.savemat('{}/data.mat'.format(output_root_dir), {'X': X})
        scipy.io.savemat('{}/targets.mat'.format(output_root_dir), {'Y': Y})
        scipy.io.savemat('{}/channel_labels.mat'.format(output_root_dir), {'L': L})
        scipy.io.savemat('{}/subject_labels.mat'.format(output_root_dir), {'LS': LS})
        scipy.io.savemat('{}/splits.mat'.format(output_root_dir), {'S': S})
        scipy.io.savemat('{}/permutation.mat'.format(output_root_dir), {'P': P})

    # prepare log to append anything happening in this session. kinda deprecated.
    logfile = open('{}/log.txt'.format(output_root_dir), 'a')

    # start main loop and execute training/evaluation for all the splits defined in S
    for split_index in range(len(S)):
        if split_index > 0 and not do_xval:
            cprint(
                colored(
                    'Cross-validation has been disabled. Terminating after first iteration.',
                    'yellow'))
            # terminate here after one iteration, e.g. in case predetermined splits have been given.
            break

        model = model_class(output_root_dir, data_name, target_name, split_index)
        model_dir = model.path_dir()
        helpers.ensure_dir_exists(model_dir)

        # this case: do nothing.
        if model.exists() and do_this_if_model_exists == 'skip':
            print('Model already exists at {}. skipping'.format(model_dir))
            continue  # skip remaining code, there is nothing to be done. please move along.

        # other cases: split data in any case. measure time. set output log
        t_start = time.time()

        # collect data indices from the split table
        j_test = split_index
        i_test = S[j_test]
        j_val = (split_index + 1) % len(S)
        i_val = S[j_val]
        j_train = list(set(range(len(S))) - {j_test, j_val})
        i_train = []
        for j in j_train:
            i_train.extend(S[j])

        # collect data from indices
        x_train = X[i_train, ...]
        y_train = Y[i_train, ...]
        x_test = X[i_test, ...]
        y_test = Y[i_test, ...]
        x_val = X[i_val, ...]
        y_val = Y[i_val, ...]

        # remember the shape of the test data as originally given
        x_test_shape_orig = x_test.shape

        # model-specific data processing
        x_train, x_val, x_test, y_train, y_val, y_test = \
            model.preprocess_data(x_train, x_val, x_test, y_train, y_val, y_test)

        if not model.exists() or (model.exists() and do_this_if_model_exists == 'retrain'):
            model.build_model(x_train.shape, y_train.shape)
            if training_programme is not None:
                # this instance-based monkey-patching is not the best way to do it,
                # but probably the most flexible one.
                model.train_model = types.MethodType(
                    training_programme.train_model, model)
            model.train_model(x_train, y_train, x_val, y_val,
                              force_device=force_device_for_training)
            model.save_model()
        else:
            model.load_model()

        # compute test scores and relevance maps for the model.
        results = model.evaluate_model(
            x_test, y_test,
            force_device=force_device_for_evaluation,
            lower_upper=helpers.get_channel_wise_bounds(x_train)
        )  # compute and pass data bounds computed from the training data.

        # measure time for the training/evaluation cycle
        t_end = time.time()

        # write report for terminal printing
        report = '\n{}\n'.format(model.path_dir().replace('/', ' '))
        report += 'test accuracy : {}\n'.format(results['acc'])
        report += 'test loss (l1): {}\n'.format(results['loss_l1'])
        report += 'train-evaluation-sequence done after {}s\n\n'.format(t_end - t_start)
        print(report)

        # dump results to the output of this run
        with open('{}/scores.txt'.format(model.path_dir()), 'w') as f:
            f.write(report)

        # also write results to a parsable log file for the eval_score_logs module
        logfile.write(report)
        logfile.flush()

        # dump evaluation results to a mat file
        scipy.io.savemat('{}/outputs.mat'.format(model.path_dir()), results)

        if decision_tree:  # and (not model.exists() or (model.exists() and do_this_if_model_exists == 'retrain')):
            # DTree training and evaluation is currently limited to settings where the target model is also trained.
            print('Training and evaluating alternative decision tree model')
            t_start = time.time()

            # make sure all data lives in CPU space for the DT model
            x_train, x_val, x_test, y_train, y_val, y_test = \
                helpers.arrays_to_numpy(x_train, x_val, x_test, y_train, y_val, y_test)

            random_state = 42
            # prep data for the DT models
            x_train_dt = np.reshape(x_train, [x_train.shape[0], -1])
            x_val_dt = np.reshape(x_val, [x_val.shape[0], -1])
            x_test_dt = np.reshape(x_test, [x_test.shape[0], -1])

            # some models (e.g. SVMs) flatten y_train; we need to reinstate the one-hot encoding here.
            if len(y_train.shape) == 1:
                tmp = np.zeros((y_train.shape[0], y_val.shape[1]))  # n_samples x n_classes
                tmp[np.arange(y_train.shape[0]), y_train] = 1
                y_train = tmp

            clf = tree.DecisionTreeClassifier(random_state=random_state)
            clf.fit(x_train_dt, y_train)

            y_pred_train = clf.predict(x_train_dt)
            acc_train = helpers.accuracy(y_pred_train, y_train)

            y_pred_val = clf.predict(x_val_dt)
            acc_val = helpers.accuracy(y_pred_val, y_val)

            y_pred_test = clf.predict(x_test_dt)
            acc_test = helpers.accuracy(y_pred_test, y_test)

            importances = clf.feature_importances_

            # collect results
            dtree_results = {
                'acc_train': acc_train,
                'acc_test': acc_test,
                'acc_val': acc_val,
                'y_pred_train': y_pred_train,
                'y_pred_test': y_pred_test,
                'y_pred_val': y_pred_val,
                'importances': importances
            }
            t_end = time.time()

            # save results in a file, in parallel to outputs.mat for the target model
            scipy.io.savemat('{}/outputs_dtree.mat'.format(model.path_dir()), dtree_results)

            # write report for terminal printing.
            # only test accuracy (i.e. the first line after the header) will be parsed by eval_score_logs
            dtree_report = '\n{}\n'.format(model.path_dir().replace(
                '/', ' ').replace(model_class.__name__,
                                  'comp.DTree:{}'.format(model_class.__name__)))
            dtree_report += 'test accuracy : {}\n'.format(dtree_results['acc_test'])
            dtree_report += 'val accuracy : {}\n'.format(dtree_results['acc_val'])
            dtree_report += 'train accuracy : {}\n'.format(dtree_results['acc_train'])
            dtree_report += 'train-evaluation-sequence done after {}s\n\n'.format(t_end - t_start)
            print(dtree_report)

            # dump results to the output of this run,
            # again in parallel to scores.txt for the target model
            with open('{}/scores_dtree.txt'.format(model.path_dir()), 'w') as f:
                f.write(dtree_report)

            # also write the dtree report into the logfile
            logfile.write(dtree_report)
            logfile.flush()
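# Hedged usage sketch (not part of the original source): a minimal call of run_train_test_cycle
# on synthetic data, following the docstring above. `SomeModel` is a hypothetical stand-in for an
# actual model class from model_db, and the output path is purely illustrative.
import numpy as np

N, T, C, n_classes, n_subjects, n_splits = 60, 101, 6, 3, 10, 5
X = np.random.randn(N, T, C)                                    # (N, T, C) data
Y = np.eye(n_classes)[np.random.randint(n_classes, size=N)]     # (N, L) one-hot targets
L = ['ch{}'.format(i) for i in range(C)]                        # channel labels
LS = np.eye(n_subjects)[np.random.randint(n_subjects, size=N)]  # (N, S) one-hot subject labels
P = np.arange(N)                                                # identity permutation
S = [list(s) for s in np.array_split(np.arange(N), n_splits)]   # index groups covering all samples

run_train_test_cycle(X, Y, L, LS, S, P,
                     model_class=SomeModel,                     # hypothetical model_db class
                     output_root_dir='/tmp/example_output',
                     data_name='GRF', target_name='Subject',
                     do_this_if_model_exists='retrain',
                     do_xval=True, decision_tree=True)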
def double_digit_ind(df, stat_cols=['PTS', 'A', 'TOT', 'BL', 'ST']):
    '''
    Generates indicators for whether the player has recorded double
    digits in each of the stats listed in stat_cols.
    '''
    cols_needed = []
    for stat in stat_cols:
        col_name = stat + '_doub_digit_ind'
        cols_needed.append(col_name)
        df[col_name] = df[stat].apply(lambda x: 1 if x >= 10 else 0)
    df['num_cats_doub_digit'] = df[cols_needed].sum(axis=1)


player_dir = home_dir / 'NBA_Pro_Line_Analytics/raw_data/NBA_Player_Stats_2010-2019/'
output_dir = helpers.ensure_dir_exists(
    home_dir / 'NBA_Pro_Line_Analytics/model_build_data/NBA_Player_Stats_2010-2019_rollling_avg/')

i = 0
for data in dataset_config['NBA_Player_stats_raw']:
    print(f'Now reading in dataset {data}')
    player_stats = pd.read_excel(player_dir / data, sheet_name=0)
    print('Renaming features:')
    player_stats = player_stats.rename(columns={
        'PLAYER \nFULL NAME': 'player_name',
        'OWN \nTEAM': 'player_team',
        'OPPONENT \nTEAM': 'opposing_team',
        'VENUE\n(R/H)': 'venue',
        'STARTER\n(Y/N)': 'starter_ind',
        'USAGE \nRATE (%)': 'usage_rate',
        'DAYS\nREST': 'days_rested',
def run_train_test_cycle_single(X_train, Y_train, X_test, Y_test, X_val, Y_val,
                                L, model_class, output_dir, data_name, target_name,
                                training_programme=None,
                                do_this_if_model_exists='skip',
                                save_data_in_output_dir=True,
                                force_device_for_training=None,
                                force_device_for_evaluation=None):
    """
    Trains and evaluates a model using the given, pre-split training, validation and test data.

    Parameters:
    -----------
    X_train : np.ndarray - Training data. A numpy.ndarray shaped (N, T, C), where N is the
        number of samples, T is the number of time points in the data and C is the number of
        channels per time point.

    Y_train : np.ndarray - Training labels. A numpy.ndarray shaped (N, L), where N is the
        number of samples and L is the number of classes/labels.

    X_test : np.ndarray - Test data. See X_train.

    Y_test : np.ndarray - Test labels. See Y_train.

    X_val : np.ndarray - Validation data. See X_train.

    Y_val : np.ndarray - Validation labels. See Y_train.

    L : list - A list of channel labels of length C, where C is the number of channels in the
        data. L holds textual descriptions of the data's channels.

    model_class : model_db.Model - a CLASS providing a set of required functions and the model
        architecture for executing the training and evaluation loop.

    output_dir : str - a string pointing towards the folder for writing results and data into.

    data_name : str - what is the data/feature type called? e.g. GRF or JA_X_Lower, ...

    target_name : str - what is the prediction target called? e.g. Subject, Gender or Injury, ...

    training_programme : (optional) ModelTraining class - if this parameter is not None, the
        model's default training regime will be overridden with the passed ModelTraining
        class' train_model() function.

    do_this_if_model_exists : str - controls the training/evaluation behaviour if a trained
        model already exists at the model output location. Options:
            retrain (do everything from scratch)
            load    (load the model and skip training, perform evaluation only)
            skip    (completely skip, do nothing)

    save_data_in_output_dir : bool - controls whether to save the experimental data
        (X_train, Y_train, X_test, Y_test, X_val, Y_val, L) in the output directory.

    force_device_for_training : str - either 'gpu' or 'cpu'. Forces the use of this device
        during training.

    force_device_for_evaluation : str - either 'gpu' or 'cpu'. Forces the use of this device
        during evaluation. Here, the use of the GPU is almost always recommended due to the
        large batch size to be processed.
    """

    # save data, labels and split information in the output directory.
    if save_data_in_output_dir:
        print('Saving training and evaluation data to {}'.format(output_dir))
        helpers.ensure_dir_exists(output_dir)
        scipy.io.savemat('{}/data_train.mat'.format(output_dir), {'X': X_train})
        scipy.io.savemat('{}/targets_train.mat'.format(output_dir), {'Y': Y_train})
        scipy.io.savemat('{}/data_test.mat'.format(output_dir), {'X': X_test})
        scipy.io.savemat('{}/targets_test.mat'.format(output_dir), {'Y': Y_test})
        scipy.io.savemat('{}/data_val.mat'.format(output_dir), {'X': X_val})
        scipy.io.savemat('{}/targets_val.mat'.format(output_dir), {'Y': Y_val})
        scipy.io.savemat('{}/channel_labels.mat'.format(output_dir), {'L': L})

    # prepare log to append anything happening in this session. kinda deprecated.
    logfile = open('{}/log.txt'.format(output_dir), 'a')

    # start main procedure
    model = model_class(output_dir, data_name, target_name, 0)  # note: the split index will always be zero here.
    model_dir = model.path_dir()
    helpers.ensure_dir_exists(model_dir)

    # this case: do nothing.
    if model.exists() and do_this_if_model_exists == 'skip':
        print('Model already exists at {}. skipping'.format(model_dir))
        return  # skip remaining code, there is nothing to be done. please move along.

    # other cases: process the data in any case. measure time. set output log
    t_start = time.time()

    # remember the shape of the test data as originally given
    X_test_shape_orig = X_test.shape

    # model-specific data processing
    X_train, X_val, X_test, Y_train, Y_val, Y_test = \
        model.preprocess_data(X_train, X_val, X_test, Y_train, Y_val, Y_test)

    if not model.exists() or (model.exists() and do_this_if_model_exists == 'retrain'):
        model.build_model(X_train.shape, Y_train.shape)
        if training_programme is not None:
            # this instance-based monkey-patching is not the best way to do it,
            # but probably the most flexible one.
            model.train_model = types.MethodType(
                training_programme.train_model, model)
        model.train_model(X_train, Y_train, X_val, Y_val,
                          force_device=force_device_for_training)
        model.save_model()
    else:
        model.load_model()

    # compute test scores and relevance maps for the model.
    results = model.evaluate_model(
        X_test, Y_test,
        force_device=force_device_for_evaluation,
        lower_upper=helpers.get_channel_wise_bounds(X_train)
    )  # compute and pass data bounds computed from the training data.

    # measure time for the training/evaluation cycle
    t_end = time.time()

    # write report for terminal printing
    report = '\n{}\n'.format(model.path_dir().replace('/', ' '))
    report += 'test accuracy : {}\n'.format(results['acc'])
    report += 'test loss (l1): {}\n'.format(results['loss_l1'])
    report += 'train-evaluation-sequence done after {}s\n\n'.format(t_end - t_start)
    print(report)

    # dump results to the output of this run
    with open('{}/scores.txt'.format(model.path_dir()), 'w') as f:
        f.write(report)

    # also write results to a parsable log file for the eval_score_logs module
    logfile.write(report)
    logfile.flush()

    # dump evaluation results to a mat file
    scipy.io.savemat('{}/outputs.mat'.format(model.path_dir()), results)
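# Hedged illustration (not part of the original source) of the instance-based monkey-patching
# used above: types.MethodType binds a plain function to a single instance, so only that
# object's behaviour is replaced while the class and all other instances keep the original method.
import types

class Greeter:
    def greet(self):
        return 'hello from the class'

def custom_greet(self):
    return 'hello from the patched instance'

a, b = Greeter(), Greeter()
a.greet = types.MethodType(custom_greet, a)  # patch only instance `a`
print(a.greet())                             # 'hello from the patched instance'
print(b.greet())                             # 'hello from the class' (unaffected)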
    Y_train=loaded_data['Y_train'],
    X_test=loaded_data['X_test'],
    Y_test=loaded_data['Y_test'],
    X_val=loaded_data['X_val'],
    Y_val=loaded_data['Y_val'],
    L=loaded_data['X_train_channel_labels'],
    model_class=arch,
    output_dir=ARGS.output_dir,
    data_name=ARGS.data_name,
    target_name=ARGS.target_name,
    training_programme=training_regime,
    do_this_if_model_exists=ARGS.model_exists,
    save_data_in_output_dir=ARGS.save_data,
    force_device_for_training=ARGS.force_training_device,
    force_device_for_evaluation=ARGS.force_evaluation_device
)
eval_score_logs.run(ARGS.output_dir)

# record the function call and parameters if we arrived here
if ARGS.record_call:
    print('Recording current call configuration to {}'.format(ARGS.record_file))
    helpers.ensure_dir_exists(os.path.dirname(ARGS.record_file))
    with open(ARGS.record_file, 'a') as f:
        argline = ' '.join(['--{} {}'.format(a, getattr(ARGS, a)) for a in vars(ARGS)])
        line = '{} : python {} {}'.format(current_datetime,
                                          sys.modules[__name__].__file__,
                                          argline)
        f.write('{}\n\n'.format(line))
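# Hedged sketch (not part of the original source) of how the '--{} {}' join above turns an
# argparse namespace back into a reproducible command line. ARGS here is a hand-built
# Namespace with illustrative values only.
import argparse

ARGS = argparse.Namespace(output_dir='/tmp/out', data_name='GRF', target_name='Subject')
argline = ' '.join(['--{} {}'.format(a, getattr(ARGS, a)) for a in vars(ARGS)])
print(argline)  # --output_dir /tmp/out --data_name GRF --target_name Subject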
    except yaml.YAMLError as exc:
        print(exc)

scripts_dir = os.path.join(gbm_build_dir, 'model_build_scripts')
sys.path.insert(1, scripts_dir)
import helpers


def map_score_diff(df):
    # express the score differential from the perspective of the current `team`
    # (set in the loop below): as-is for home games, negated for road games.
    if df['TEAM_HT'] == team:
        return df['score_diff']
    elif df['TEAM_RT'] == team:
        return -1 * df['score_diff']


output_dir = helpers.ensure_dir_exists(os.path.join(
    home_dir, 'NBA_Pro_Line_Analytics/',
    'model_build_data/NBA_Team_Stats_2010-2019_Rolling_Avg_Win_Loss_pts_diff/'))

for df_data in config['NBA_Team_Stats_2010-2019_rolling_avg_win_loss']:
    data = pd.read_csv(os.path.join(
        home_dir,
        'NBA_Pro_Line_Analytics/model_build_data/NBA_Team_Stats_2010-2019_rolling_avg_win_loss/',
        df_data))
    data['TOT_Final_Score'] = data['Final_Score_HT'] + data['Final_Score_RT']

    for key, value in config_pt_diff_params.items():
        print(f'Now computing feature {key}')
        if key[0:10] == 'AVG_PTdiff':
            data['HT_' + key] = ""
            data['RT_' + key] = ""
            for team in data.TEAM_HT.unique():
                aa = data[(data.TEAM_HT == team) | (data.TEAM_RT == team)][
                    ['TEAM_HT', 'TEAM_RT', 'outcome', 'score_diff',
                     'Final_Score_HT', 'Final_Score_RT']]
                aa[team + '_scorediff'] = aa.apply(map_score_diff, axis=1)
                aa[team + '_' + key] = aa[team + '_scorediff'].rolling(
                    value['rolling_window_required']).mean().shift()
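# Hedged sketch (not part of the original source) of the rolling-average pattern used above:
# .rolling(w).mean().shift() averages the previous w games and then shifts the result down one
# row, so each game's feature only uses information from games played before it.
import pandas as pd

score_diff = pd.Series([5, -3, 10, 2, -7, 4])
rolling_avg = score_diff.rolling(3).mean().shift()
print(rolling_avg)
# 0         NaN
# 1         NaN
# 2         NaN
# 3    4.000000   <- mean of games 0-2
# 4    3.000000   <- mean of games 1-3
# 5    1.666667   <- mean of games 2-4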
def analyze_dataset(catalog_id, catalog, dataset_identifier, datasets_output_dir,
                    debug_mode=False, replace=True, debug_distribution_ids=None):
    res = {
        "dataset_status": None,
        "distributions_ok": [],
        "distributions_error": [],
    }
    dataset_meta = catalog.get_dataset(dataset_identifier)

    if dataset_meta:
        dataset_dir = os.path.join(datasets_output_dir, dataset_identifier)
        helpers.ensure_dir_exists(dataset_dir)
        res["dataset_status"] = "OK"
    else:
        res["dataset_status"] = "ERROR: metadata"
        return res

    distribution_ids = [
        distribution["identifier"]
        for distribution in dataset_meta["distribution"]
    ]

    # in debug mode, a subset of distribution ids can be specified
    if debug_mode and debug_distribution_ids:
        distribution_ids = [
            distribution_id for distribution_id in distribution_ids
            if distribution_id in debug_distribution_ids
        ]

    # create each of the dataset's distributions
    for distribution_identifier in distribution_ids:
        msg = "Distribución {}: {} ({})"
        try:
            distrib_meta = catalog.get_distribution(distribution_identifier)

            # use fileName if the distribution specifies it, otherwise build one
            distribution_name = title_to_name(distrib_meta["title"])
            distribution_file_name = distrib_meta.get(
                "fileName", "{}.csv".format(distribution_name))
            dist_path = os.path.join(dataset_dir, "distribution",
                                     distribution_identifier, "download",
                                     "{}".format(distribution_file_name))
            dist_url = get_distribution_url(dist_path)
            # print("esta es la URL QUE VA AL CATALOGO", dist_url)
            distrib_meta["downloadURL"] = dist_url

            # if the file already exists, check whether it should be replaced or skipped
            if not os.path.exists(dist_path) or replace:
                status = "Replaced" if os.path.exists(dist_path) else "Created"
                origin_dist_path, df = analyze_distribution(
                    catalog_id, catalog, dataset_identifier,
                    distribution_identifier)

                helpers.ensure_dir_exists(os.path.dirname(dist_path))
                shutil.copyfile(origin_dist_path, dist_path)
            else:
                status = "Skipped"

            res["distributions_ok"].append((distribution_identifier, status))
            logger.info(msg.format(distribution_identifier, "OK", status))

        except Exception as e:
            if isinstance(e, KeyboardInterrupt):
                raise
            res["distributions_error"].append((distribution_identifier,
                                               repr(e).encode("utf8")))
            trace_string = traceback.format_exc()
            logger.error(
                msg.format(distribution_identifier, "ERROR",
                           repr(e).encode("utf8")))
            for line in trace_string.splitlines():
                logger.error(line)
            if debug_mode:
                raise
            res["dataset_status"] = "ERROR: scraping"

    return res
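# Hedged sketch (not part of the original source): helpers.ensure_dir_exists is called before
# every write in these scripts. Assuming it simply creates the directory (and its parents) when
# missing and returns the path, a minimal equivalent could look like this:
import os

def ensure_dir_exists_sketch(path):
    """Create `path` (including parents) if it does not exist yet and return it."""
    os.makedirs(path, exist_ok=True)
    return path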
def scrape_dataset(xl, catalog, dataset_identifier, datasets_dir, debug_mode=False,
                   replace=True, debug_distribution_ids=None, catalog_id=None):
    res = {
        "dataset_status": None,
        "distributions_ok": [],
        "distributions_error": [],
    }

    dataset_meta = catalog.get_dataset(dataset_identifier)

    if dataset_meta:
        dataset_dir = os.path.join(datasets_dir, dataset_identifier)
        helpers.ensure_dir_exists(dataset_dir)
        res["dataset_status"] = "OK"
    else:
        res["dataset_status"] = "ERROR: metadata"
        return res

    # filter the parameters for this particular dataset
    distribution_ids = [
        distribution["identifier"]
        for distribution in dataset_meta["distribution"]
    ]

    # in debug mode, a subset of distribution ids can be specified
    if debug_mode and debug_distribution_ids:
        distribution_ids = [
            distribution_id for distribution_id in distribution_ids
            if distribution_id in debug_distribution_ids
        ]

    # create each of the dataset's distributions
    for distribution_identifier in distribution_ids:
        msg = "Distribución {}: {} ({})"
        try:
            distrib_meta = catalog.get_distribution(distribution_identifier)

            # use fileName if the distribution specifies it, otherwise build one
            distribution_name = title_to_name(distrib_meta["title"])
            distribution_file_name = distrib_meta.get(
                "fileName", "{}.csv".format(distribution_name))
            dist_download_dir = os.path.join(dataset_dir, "distribution",
                                             distribution_identifier, "download")
            dist_path = os.path.join(dist_download_dir,
                                     "{}".format(distribution_file_name))
            dist_url = get_distribution_url(dist_path)
            # print("esta es la URL QUE VA AL CATALOGO", dist_url)
            distrib_meta["downloadURL"] = dist_url

            # if the file already exists, check whether it should be replaced or skipped
            if not os.path.exists(dist_path) or replace:
                status = "Replaced" if os.path.exists(dist_path) else "Created"
                distribution = scrape_distribution(xl, catalog, distribution_identifier)

                # a scraped distribution may come back as a list of DataFrames
                if isinstance(distribution, list):
                    distribution_complete = pd.concat(distribution)
                else:
                    distribution_complete = distribution

                helpers.remove_other_files(os.path.dirname(dist_path))
                distribution_complete.to_csv(
                    dist_path, encoding="utf-8", index_label="indice_tiempo")
            else:
                status = "Skipped"

            res["distributions_ok"].append((distribution_identifier, status))
            logger.info(msg.format(distribution_identifier, "OK", status))

        except Exception as e:
            if isinstance(e, KeyboardInterrupt):
                raise
            res["distributions_error"].append((distribution_identifier,
                                               repr(e).encode("utf8")))
            trace_string = traceback.format_exc()
            print(
                msg.format(distribution_identifier, "ERROR",
                           repr(e).encode("utf8")))
            print(trace_string)
            if debug_mode:
                raise
            res["dataset_status"] = "ERROR: scraping"

            # if there is no old version of the distribution, remove it from the catalog
            try:
                get_distribution_path(catalog_id, dataset_identifier,
                                      distribution_identifier)
            except Exception:
                catalog.remove_distribution(distribution_identifier,
                                            dataset_identifier)

    return res