Ejemplo n.º 1
0
    def create_data_key_dimension(self, file_name, data_type, chunk, mandator, personal=False, parquet=False):
        """
        Build the target path for one chunk of master data.

        The path segments are, in order: ``personal=``, ``master_data=``,
        ``partition_mandator=`` and ``partition_chunk=``. When ``parquet`` is
        truthy the path stops at the chunk partition; otherwise ``file_name``
        is appended as the final segment.

        return -- result of gu.get_target_path for those segments
        """
        segments = [
            f"personal={personal!s}",
            f"master_data={data_type}",
            f"partition_mandator={mandator}",
            f"partition_chunk={chunk}",
        ]
        # Only the non-parquet layout addresses an individual file.
        if not parquet:
            segments.append(file_name)
        return gu.get_target_path(segments)
Ejemplo n.º 2
0
    def get_importance_plots(self):
        """
        Save two feature-importance figures under ``self.local_folder``.

        'imp1' is a bar chart with one bar per entry of
        ``self.model.feature_importances_``; 'imp2' is the figure drawn by
        ``xgb.plot_importance``. Matplotlib's rcParams are reset to their
        defaults before each figure and the figure is closed after saving.
        """
        # Figure 1: raw importances, indexed by position.
        bar_chart_path = gu.get_target_path([self.local_folder, 'imp1'],
                                            file_extension='png')
        plt.rcParams.update(plt.rcParamsDefault)
        importances = self.model.feature_importances_
        plt.bar(range(len(importances)), importances)
        plt.savefig(bar_chart_path, bbox_inches='tight')
        plt.close()

        # Figure 2: xgboost's own importance plot.
        xgb_chart_path = gu.get_target_path([self.local_folder, 'imp2'],
                                            file_extension='png')
        plt.rcParams.update(plt.rcParamsDefault)
        xgb.plot_importance(self.model)
        plt.savefig(xgb_chart_path, bbox_inches='tight')
        plt.close()
Ejemplo n.º 3
0
    def create_partial_data_key_deletion(self, data_type, mandantor, personal=False, parquet=None):
        """
        Build the master-data path prefix for a single mandator.

        The prefix consists of the ``personal=``, ``master_data=`` and
        ``partition_mandator=`` segments only. The ``parquet`` argument is
        accepted but not used by this method.

        return -- partial path for data_type, meant for deleting the files below it
        """
        prefix_segments = [
            f"personal={personal!s}",
            f"master_data={data_type}",
            f"partition_mandator={mandantor}",
        ]
        return gu.get_target_path(prefix_segments)
Ejemplo n.º 4
0
 def export_dict_as_text(self, data_dict, fname):
     """Write ``data_dict`` to a CSV file, one ``key,value`` row per item.

     The target path is obtained from ``gu.get_target_path`` using
     ``self.local_folder`` and ``fname`` with ``file_extension='csv'``.
     An existing file at that path is overwritten.

     Arguments:
         data_dict {dict} -- mapping whose items become the rows
         fname {string} -- file name passed to gu.get_target_path
     """
     file_path = gu.get_target_path([self.local_folder, fname],
                                    file_extension='csv')
     # newline='' hands line-ending control to the csv module; without it
     # every row ends in '\r\r\n' on platforms that translate '\n'.
     with open(file_path, 'w+', newline='') as csv_file:
         csv.writer(csv_file).writerows(data_dict.items())
Ejemplo n.º 5
0
    def create_data_key(self, partition_date, file_name, data_type, mandator, personal=False,  parquet=None, month=False, report=False):
        """
        Build the target path for transaction data of one partition date.

        Three layouts are produced, checked in this order:

        * ``parquet`` truthy -- personal/transaction/mandator segments plus the
          time partition; ``month`` and ``file_name`` are not used.
        * ``report`` truthy -- "report"/"partition"/mandator segments, the time
          partition (created with ``month``) and ``file_name``.
        * otherwise -- personal/transaction/mandator segments, the time
          partition (created with ``month``) and ``file_name``.

        return -- result of gu.get_target_path for the selected layout
        """
        # platformx.data.etl.raw/{publishing_group}/{mandator}/{year}/{month}/{day}/
        if parquet:
            # The parquet layout is the only one that ignores the month flag.
            date_segment = gu.create_time_partition(partition_date)
            return gu.get_target_path([
                f"personal={personal}",
                f"transaction={data_type}",
                f"partition_mandator={mandator}",
                date_segment,
            ])

        date_segment = gu.create_time_partition(partition_date, month)
        if report:
            return gu.get_target_path([
                "report",
                "partition",
                f"mandator={mandator}",
                date_segment,
                file_name,
            ])

        return gu.get_target_path([
            f"personal={personal}",
            f"transaction={data_type}",
            f"partition_mandator={mandator}",
            date_segment,
            file_name,
        ])
Ejemplo n.º 6
0
    def create_report_log_key(self, process_start_date, export_type, index, data_type, mandator):
        """
        Build the path, including file name, of a report-log JSON file.

        The file name has the form
        ``{data_type}_{mandator}_{index}_{HHMMSS}.json`` where HHMMSS is taken
        from ``process_start_date``. It is placed below a ``type=`` segment
        and the time partition created from ``process_start_date``.

        return -- path and filename for the report-log for data_type
        """
        time_partition = gu.create_time_partition(process_start_date)
        time_suffix = process_start_date.strftime('%H%M%S')
        log_file_name = f"{data_type}_{mandator}_{index!s}_{time_suffix}.json"
        return gu.get_target_path([f"type={export_type}", time_partition, log_file_name])
Ejemplo n.º 7
0
 def export_dict_as_one_row_text(self, data_dict, fname):
     """Write ``data_dict`` to a CSV file as a header row plus one data row.

     The dictionary keys become the column names and the values form the
     single data row. The target path is obtained from
     ``gu.get_target_path`` using ``self.local_folder`` and ``fname`` with
     ``file_extension='csv'``. An existing file at that path is overwritten.

     Arguments:
         data_dict {dict} -- mapping of column name to value
         fname {string} -- file name passed to gu.get_target_path
     """
     csv_columns = list(data_dict)
     file_path = gu.get_target_path([self.local_folder, fname],
                                    file_extension='csv')
     # newline='' hands line-ending control to the csv module; without it
     # every row ends in '\r\r\n' on platforms that translate '\n'.
     with open(file_path, 'w+', newline='') as csvfile:
         writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
         writer.writeheader()
         writer.writerow(data_dict)
Ejemplo n.º 8
0
 def get_tree_plot(self):
     """Plot tree index 0 of ``self.model`` and save it as 'tree' in ``self.local_folder``.

     rcParams are reset to matplotlib's defaults and the figure size is set
     to 50 x 10 before drawing. The figure is closed after saving.
     """
     tree_png = gu.get_target_path([self.local_folder, 'tree'],
                                   file_extension='png')

     # Reset first so the figure size below is the only override in effect.
     plt.rcParams.update(plt.rcParamsDefault)
     plt.rcParams['figure.figsize'] = [50, 10]

     xgb.plot_tree(self.model, num_trees=0)
     plt.savefig(tree_png, bbox_inches='tight')
     plt.close()
Ejemplo n.º 9
0
    def create_s3_uri(self, bucket, key, tmpFileName, FileType=None):
        """Create an S3 URI from the "s3://" prefix, bucket, key and file name.

        The four parts are joined by ``gu.get_target_path``; ``FileType`` is
        forwarded to it as the second positional argument.

        Arguments:
                bucket {string} -- bucket name
                key {string} -- key path
                tmpFileName {string} -- file name appended after the key
                FileType -- optional value passed through to gu.get_target_path
        """
        uri_parts = ["s3://", bucket, key, tmpFileName]
        return gu.get_target_path(uri_parts, FileType)
Ejemplo n.º 10
0
 def get_metric_data_key(self, metric):
     """Build the 'ML_Analysis' data key for ``metric``.

     The entries of ``config['s3_structure']`` are paired positionally with
     ``metric`` and the 'project', 'dataset', 'use_case' and 'setup' values
     of ``self.atomic_metrics``. Each pair becomes a ``name=value`` segment,
     and the segments are passed to ``gu.get_target_path`` after a leading
     'ML_Analysis' segment.
     """
     partition_names = config['s3_structure']
     atomic = self.atomic_metrics
     partition_values = (
         metric,
         atomic['project'],
         atomic['dataset'],
         atomic['use_case'],
         atomic['setup'],
     )
     return gu.get_target_path([
         'ML_Analysis',
         *(name + '=' + value
           for name, value in zip(partition_names, partition_values)),
     ])
Ejemplo n.º 11
0
 def get_roc_plot(self):
     """Plot the ROC curve stored in ``self.plots['roc']`` and save it as 'roc'.

     The curve is drawn from the 'fpr' (x axis) and 'tpr' (y axis) entries
     with matplotlib's default rcParams. The figure is written to
     ``self.local_folder`` and closed afterwards.
     """
     roc_png = gu.get_target_path([self.local_folder, 'roc'],
                                  file_extension='png')
     roc_data = self.plots['roc']
     false_positive_rate = roc_data['fpr']
     true_positive_rate = roc_data['tpr']

     plt.rcParams.update(plt.rcParamsDefault)
     plt.plot(false_positive_rate, true_positive_rate,
              marker='.', label='xgboost')
     plt.xlabel('False Positive Rate')
     plt.ylabel('True Positive Rate')
     plt.legend()

     plt.savefig(roc_png, bbox_inches='tight')
     plt.close()
Ejemplo n.º 12
0
 def get_pr_plot(self):
     """Plot the precision-recall curve in ``self.plots['pr']`` and save it as 'pr'.

     Recall is drawn on the x axis and precision on the y axis, using
     matplotlib's default rcParams. The figure is written to
     ``self.local_folder`` and closed afterwards.
     """
     pr_png = gu.get_target_path([self.local_folder, 'pr'],
                                 file_extension='png')
     plt.rcParams.update(plt.rcParamsDefault)

     pr_data = self.plots['pr']
     precision_values = pr_data['precision']
     recall_values = pr_data['recall']

     plt.plot(recall_values, precision_values, marker='.', label='xgboost')
     plt.xlabel('Recall')
     plt.ylabel('Precision')
     plt.legend()

     plt.savefig(pr_png, bbox_inches='tight')
     plt.close()
Ejemplo n.º 13
0
 def get_validation_metrics_plots(self):
     """Save one 'val_<metric>' figure per entry of ``self.validation_metrics``.

     For each metric, the ``test_<metric>`` and ``train_<metric>`` columns
     of ``self.model_metrics['validation_results']`` are plotted together,
     labelled, saved into ``self.local_folder`` and the figure is closed.
     """
     for metric in self.validation_metrics:
         figure_path = gu.get_target_path(
             [self.local_folder, f'val_{metric}'], file_extension='png')

         results = self.model_metrics['validation_results']
         curve_columns = [f'test_{metric}', f'train_{metric}']
         results[curve_columns].plot()

         plt.ylabel(metric)
         plt.xlabel('epochs')
         plt.title(f'XGBoost {metric}')
         plt.savefig(figure_path, bbox_inches='tight')
         plt.close()
Ejemplo n.º 14
0
 def get_prob_plot(self):
     """Save overlaid histograms of 'prob_class_1' for both classes as 'proba'.

     Rows of ``self.plots['prob']`` are split by their ``classification``
     value ('Negatives' / 'Positives'). Each group's 'prob_class_1' values
     are drawn as a 100-bin histogram, the positives in semi-transparent
     red on top of the negatives. The figure is written to
     ``self.local_folder`` and closed afterwards.
     """
     proba_png = gu.get_target_path([self.local_folder, 'proba'],
                                    file_extension='png')
     scores = self.plots['prob']

     is_negative = scores.classification == 'Negatives'
     negatives = scores[is_negative]['prob_class_1']
     is_positive = scores.classification == 'Positives'
     positives = scores[is_positive]['prob_class_1']

     plt.hist(negatives, bins=100, label='Negatives')
     plt.hist(positives, bins=100, label='Positives', alpha=0.7, color='r')
     plt.xlabel('Probability of being Positive Class')
     plt.ylabel('Number of records in each bucket')
     plt.legend()
     plt.tick_params(axis='both', pad=5)

     plt.savefig(proba_png, bbox_inches='tight')
     plt.close()
Ejemplo n.º 15
0
 def export_df_as_text(self, df, fname):
     """Write ``df`` to a CSV file with a header row and without the index.

     The target path is obtained from ``gu.get_target_path`` using
     ``self.local_folder`` and ``fname`` with ``file_extension='csv'``.
     """
     csv_path = gu.get_target_path([self.local_folder, fname],
                                   file_extension='csv')
     df.to_csv(csv_path, header=True, index=False)
Ejemplo n.º 16
0
 def prepare_local_folder(self, model_name):
     """Set ``self.local_folder`` to ``<cwd>/<model_name>`` and make sure it exists.

     The path is built by ``gu.get_target_path`` from the current working
     directory and ``model_name``. Missing directories are created.

     Arguments:
         model_name {string} -- name of the folder below the working directory

     Raises:
         FileExistsError -- if the path exists but is not a directory
     """
     self.local_folder = gu.get_target_path([os.getcwd(), model_name])
     # exist_ok=True avoids the check-then-create race of testing
     # os.path.exists first (another process may create the folder in between).
     os.makedirs(self.local_folder, exist_ok=True)
Ejemplo n.º 17
0
 def export_tree_as_text(self):
     """Dump ``self.booster`` via ``dump_model`` to the 'tree' file in ``self.local_folder``.

     The target path is requested from ``gu.get_target_path`` with
     ``file_extension='csv'``.
     """
     dump_path = gu.get_target_path([self.local_folder, 'tree'],
                                    file_extension='csv')
     self.booster.dump_model(dump_path)