def create_data_key_dimension(self, file_name, data_type, chunk, mandator, personal=False, parquet=False):
    """Build the full target path for a dimension data_type.

    For parquet exports only the partition directories are returned;
    otherwise file_name is appended as the final path component.

    return -- filename for data_type with path
    """
    path_parts = [
        f"personal={str(personal)}",
        f"master_data={data_type}",
        f"partition_mandator={mandator}",
        f"partition_chunk={chunk}",
    ]
    if not parquet:
        path_parts.append(file_name)
    return gu.get_target_path(path_parts)
def get_importance_plots(self):
    """Save two feature-importance charts to the local folder as PNGs:
    imp1 (plain bar chart of model.feature_importances_) and
    imp2 (xgboost's built-in importance plot).
    """
    bar_path = gu.get_target_path([self.local_folder, 'imp1'], file_extension='png')
    plt.rcParams.update(plt.rcParamsDefault)
    importances = self.model.feature_importances_
    plt.bar(range(len(importances)), importances)
    plt.savefig(bar_path, bbox_inches='tight')
    plt.close()

    xgb_path = gu.get_target_path([self.local_folder, 'imp2'], file_extension='png')
    plt.rcParams.update(plt.rcParamsDefault)
    xgb.plot_importance(self.model)
    plt.savefig(xgb_path, bbox_inches='tight')
    plt.close()
def create_partial_data_key_deletion(self, data_type, mandantor, personal=False, parquet=None):
    """Build the partial path (directory prefix) for data_type.

    return -- partial filename for data_type with path to delete files under this directory
    """
    # NOTE(review): `parquet` is accepted but unused here — kept for signature compatibility.
    prefix_parts = [
        f"personal={str(personal)}",
        f"master_data={data_type}",
        f"partition_mandator={mandantor}",
    ]
    return gu.get_target_path(prefix_parts)
def export_dict_as_text(self, data_dict, fname):
    """Write data_dict to <local_folder>/<fname>.csv, one `key,value` row per item.

    data_dict -- mapping to export
    fname     -- target file name (without extension)
    """
    file_path = gu.get_target_path([self.local_folder, fname], file_extension='csv')
    # newline='' is required when handing a file to csv.writer; without it the
    # csv module's own line endings are translated and Windows gets blank rows.
    # 'w' (not 'w+') — the file is only written, never read back.
    with open(file_path, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        for key, value in data_dict.items():
            writer.writerow([key, value])
def create_data_key(self, partition_date, file_name, data_type, mandator, personal=False, parquet=None, month=False, report=False):
    """Build the full target path for the given partition_date and data_type.

    return -- filename for data_type and partition_date with path
    """
    # platformx.data.etl.raw/{publishing_group}/{mandator}/{year}/{month}/{day}/
    if parquet:
        # Parquet keys are directory partitions only — no file name component.
        return gu.get_target_path([
            f"personal={personal}",
            f"transaction={data_type}",
            f"partition_mandator={mandator}",
            gu.create_time_partition(partition_date),
        ])
    time_partition = gu.create_time_partition(partition_date, month)
    if report:
        return gu.get_target_path([
            "report",
            "partition",
            f"mandator={mandator}",
            time_partition,
            file_name,
        ])
    return gu.get_target_path([
        f"personal={personal}",
        f"transaction={data_type}",
        f"partition_mandator={mandator}",
        time_partition,
        file_name,
    ])
def create_report_log_key(self, process_start_date, export_type, index, data_type, mandator):
    """Build the report-log key (path plus filename) for the given data_type.

    return -- path and filename for the report-log for data_type
    """
    time_partition = gu.create_time_partition(process_start_date)
    start_time = process_start_date.strftime('%H%M%S')
    log_name = f"{data_type}_{mandator}_{str(index)}_{start_time}.json"
    return gu.get_target_path([f"type={export_type}", time_partition, log_name])
def export_dict_as_one_row_text(self, data_dict, fname):
    """Write data_dict to <local_folder>/<fname>.csv as a header row (the keys)
    followed by a single row of values.

    data_dict -- mapping to export
    fname     -- target file name (without extension)
    """
    file_path = gu.get_target_path([self.local_folder, fname], file_extension='csv')
    # newline='' is required when handing a file to csv.DictWriter; without it
    # the csv module's line endings are translated and Windows gets blank rows.
    # 'w' (not 'w+') — the file is only written, never read back.
    with open(file_path, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=data_dict.keys())
        writer.writeheader()
        writer.writerow(data_dict)
def get_tree_plot(self):
    """Render the model's first tree (num_trees=0) on a wide canvas and save it
    as tree.png in the local folder."""
    target = gu.get_target_path([self.local_folder, 'tree'], file_extension='png')
    plt.rcParams.update(plt.rcParamsDefault)
    plt.rcParams['figure.figsize'] = [50, 10]
    xgb.plot_tree(self.model, num_trees=0)
    plt.savefig(target, bbox_inches='tight')
    plt.close()
def create_s3_uri(self, bucket, key, tmpFileName, FileType=None):
    """Create an S3 URI: s3://bucket/key/tmpFileName

    Arguments:
        bucket {string} -- bucket name
        key {string} -- key path
        tmpFileName {string} -- final file name component
        FileType -- optional extension forwarded to gu.get_target_path
    """
    uri_parts = ["s3://", bucket, key, tmpFileName]
    return gu.get_target_path(uri_parts, FileType)
def get_metric_data_key(self, metric): keys = config['s3_structure'] values = [ metric, self.atomic_metrics['project'], self.atomic_metrics['dataset'], self.atomic_metrics['use_case'], self.atomic_metrics['setup'] ] datakeys = [k + '=' + v for k, v in zip(keys, values)] datakeys = ['ML_Analysis'] + datakeys return gu.get_target_path(datakeys)
def get_roc_plot(self):
    """Plot the ROC curve from the stored fpr/tpr arrays and save it as
    roc.png in the local folder."""
    target = gu.get_target_path([self.local_folder, 'roc'], file_extension='png')
    roc_data = self.plots['roc']
    plt.rcParams.update(plt.rcParamsDefault)
    plt.plot(roc_data['fpr'], roc_data['tpr'], marker='.', label='xgboost')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.savefig(target, bbox_inches='tight')
    plt.close()
def get_pr_plot(self):
    """Plot the precision-recall curve from the stored arrays and save it as
    pr.png in the local folder."""
    target = gu.get_target_path([self.local_folder, 'pr'], file_extension='png')
    plt.rcParams.update(plt.rcParamsDefault)
    pr_data = self.plots['pr']
    plt.plot(pr_data['recall'], pr_data['precision'], marker='.', label='xgboost')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend()
    plt.savefig(target, bbox_inches='tight')
    plt.close()
def get_validation_metrics_plots(self):
    """For every tracked validation metric, plot its test/train curves over
    epochs and save the figure as val_<metric>.png in the local folder."""
    results = self.model_metrics['validation_results']
    for metric in self.validation_metrics:
        target = gu.get_target_path([self.local_folder, f'val_{metric}'],
                                    file_extension='png')
        results[[f'test_{metric}', f'train_{metric}']].plot()
        plt.ylabel(metric)
        plt.xlabel('epochs')
        plt.title(f'XGBoost {metric}')
        plt.savefig(target, bbox_inches='tight')
        plt.close()
def get_prob_plot(self):
    """Histogram the predicted positive-class probabilities, overlaying the
    actual Negatives and Positives, and save the figure as proba.png."""
    target = gu.get_target_path([self.local_folder, 'proba'], file_extension='png')
    df = self.plots['prob']
    negatives = df.loc[df.classification == 'Negatives', 'prob_class_1']
    positives = df.loc[df.classification == 'Positives', 'prob_class_1']
    plt.hist(negatives, bins=100, label='Negatives')
    plt.hist(positives, bins=100, label='Positives', alpha=0.7, color='r')
    plt.xlabel('Probability of being Positive Class')
    plt.ylabel('Number of records in each bucket')
    plt.legend()
    plt.tick_params(axis='both', pad=5)
    plt.savefig(target, bbox_inches='tight')
    plt.close()
def export_df_as_text(self, df, fname):
    """Dump df to <local_folder>/<fname>.csv with a header row and no index column."""
    target = gu.get_target_path([self.local_folder, fname], file_extension='csv')
    df.to_csv(target, header=True, index=False)
def prepare_local_folder(self, model_name):
    """Create (if needed) and remember a working folder <cwd>/<model_name>
    used by the export_* and get_*_plot methods.

    model_name -- subfolder name under the current working directory
    """
    self.local_folder = gu.get_target_path([os.getcwd(), model_name])
    # exist_ok=True replaces the check-then-create (os.path.exists + makedirs)
    # pattern, which races if the folder appears between the two calls.
    os.makedirs(self.local_folder, exist_ok=True)
def export_tree_as_text(self):
    """Dump the booster's trees in text form to tree.csv in the local folder."""
    target = gu.get_target_path([self.local_folder, 'tree'], file_extension='csv')
    self.booster.dump_model(target)