def transform(self):
    """Fit the model on the sample data and hand its results to the CSV writers.

    Builds a ``Model`` from the workflow settings, runs its ``set_*`` steps
    in order, logs the score, metrics and classification report, and then
    passes four row sets to ``save_rows``/``save_row``: predictions, scores,
    the classification report and the ROC metrics.  Every row gets
    ``self.model_version`` appended as its last column.
    """
    # The bare names ``table_columns`` / ``model_version`` are not defined in
    # this scope; the values live on the instance like the other settings.
    version = self.model_version

    def output_path(suffix):
        # All outputs share the '<file>_<model_name>_' prefix.
        return '{file}_{model_name}_{suffix}.csv'.format(
            file=self.file, model_name=self.model_name, suffix=suffix)

    def as_list(values):
        # Array-like metrics expose tolist(); without the conversion,
        # ``array + [version]`` would add element-wise instead of appending.
        to_list = getattr(values, 'tolist', None)
        return to_list() if to_list is not None else list(values)

    self.model = Model(raw_data=self.sample,
                       table_columns=self.table_columns,
                       feature_columns=self.feature_columns,
                       target_column=self.target_column,
                       model_type=self.model_type)
    self.model.set_data_frame(separator=self.separator, header=self.header)
    self.model.set_features()
    self.model.set_target()
    self.model.set_train_test()
    self.model.set_fitted_model()
    self.model.set_predictions()
    self.model.set_score()
    self.model.set_classification_report()
    self.model.set_metrics()

    logger.info(self.model.score)
    logger.info(self.model.metrics)
    logger.info(self.model.classification_report)

    # One row per test sample: feature values, prediction, model version.
    feature_lists = self.format_features_list()
    predictions = self.model.predictions.tolist()
    samples_and_predictions = [
        features + [prediction, version]
        for features, prediction in zip(feature_lists, predictions)
    ]
    save_rows(file=output_path('predictions'),
              rows=samples_and_predictions)

    save_row(file=output_path('scores'),
             row=self.model.score.tolist() + [version])

    save_rows(file=output_path('classification_report'),
              rows=[row + [version]
                    for row in self.format_classification_report()])

    metrics = self.model.metrics
    metric_rows = [
        as_list(metrics['false_positive_rate']),
        as_list(metrics['true_positive_rate']),
        as_list(metrics['thresholds']),
        [metrics['roc_area_under_curve']],
    ]
    save_rows(file=output_path('metrics_report'),
              rows=[row + [version] for row in metric_rows])
def transform(self):
    """Fit the model on the sample data and hand its results to the CSV writers.

    Builds a ``Model`` from the workflow settings, runs its ``set_*`` steps
    in order, logs the score, metrics and classification report, and then
    passes four row sets to ``save_rows``/``save_row``: predictions, scores,
    the classification report and the ROC metrics.  Every row gets
    ``self.model_version`` appended as its last column.
    """
    # The bare names ``table_columns`` / ``model_version`` are not defined in
    # this scope; the values live on the instance like the other settings.
    version = self.model_version

    def output_path(suffix):
        # All outputs share the '<file>_<model_name>_' prefix.
        return '{file}_{model_name}_{suffix}.csv'.format(
            file=self.file, model_name=self.model_name, suffix=suffix)

    def as_list(values):
        # Array-like metrics expose tolist(); without the conversion,
        # ``array + [version]`` would add element-wise instead of appending.
        to_list = getattr(values, 'tolist', None)
        return to_list() if to_list is not None else list(values)

    self.model = Model(raw_data=self.sample,
                       table_columns=self.table_columns,
                       feature_columns=self.feature_columns,
                       target_column=self.target_column,
                       model_type=self.model_type)
    self.model.set_data_frame(separator=self.separator, header=self.header)
    self.model.set_features()
    self.model.set_target()
    self.model.set_train_test()
    self.model.set_fitted_model()
    self.model.set_predictions()
    self.model.set_score()
    self.model.set_classification_report()
    self.model.set_metrics()

    logger.info(self.model.score)
    logger.info(self.model.metrics)
    logger.info(self.model.classification_report)

    # One row per test sample: feature values, prediction, model version.
    feature_lists = self.format_features_list()
    predictions = self.model.predictions.tolist()
    samples_and_predictions = [
        features + [prediction, version]
        for features, prediction in zip(feature_lists, predictions)
    ]
    save_rows(file=output_path('predictions'),
              rows=samples_and_predictions)

    save_row(file=output_path('scores'),
             row=self.model.score.tolist() + [version])

    save_rows(file=output_path('classification_report'),
              rows=[row + [version]
                    for row in self.format_classification_report()])

    metrics = self.model.metrics
    metric_rows = [
        as_list(metrics['false_positive_rate']),
        as_list(metrics['true_positive_rate']),
        as_list(metrics['thresholds']),
        [metrics['roc_area_under_curve']],
    ]
    save_rows(file=output_path('metrics_report'),
              rows=[row + [version] for row in metric_rows])
class ModelBuilderWorkflow(BaseETLWorkflow):
    """ETL-style workflow that trains a ``Model`` and saves its outputs as CSV.

    ``extract`` and ``load`` are no-ops; all the work happens in
    ``transform``.
    """

    def __init__(self, sample=None, separator=None, header=None,
                 model_name=None, model_type=None, model_version=1.0,
                 table_columns=None, feature_columns=None,
                 target_column=None, start_date=None):
        """Store the model settings and resolve the output paths.

        :param sample: file name of the sample data; it is joined onto the
            ``source_data`` entry of ``self.config.path``.
        :param separator: passed to ``Model.set_data_frame``.
        :param header: passed to ``Model.set_data_frame``.
        :param model_name: used in the names of the output files.
        :param model_type: passed to the ``Model`` constructor.
        :param model_version: appended as the last column of every saved row.
        :param table_columns: passed to the ``Model`` constructor.
        :param feature_columns: passed to the ``Model`` constructor.
        :param target_column: passed to the ``Model`` constructor.
        :param start_date: accepted but not used by this class.
        """
        super(ModelBuilderWorkflow, self).__init__(
            workflow_name='model_builder')
        self.sample = '{path}/{sample}'.format(
            path=self.config.path.get('source_data', None), sample=sample)
        self.separator = separator
        self.header = header
        self.model_type = model_type
        self.model_name = model_name
        self.model_version = model_version
        self.file_name = self.table
        self.file = None
        self.table_columns = table_columns
        self.feature_columns = feature_columns
        self.target_column = target_column
        self.model = None
        self.get_paths()

    def get_paths(self, query_name=None):
        """Resolve the output paths and make sure the base directory exists.

        Sets ``self.file`` and ``self.compressed_file`` from
        ``self.get_file_path()``.  ``query_name`` is accepted but not used.

        :return: the base directory returned by ``get_file_path``.
        """
        (base_dir, self.file, _raw_file,
         self.compressed_file) = self.get_file_path()
        command.create_directories(base_dir)
        return base_dir

    def extract(self):
        """No-op: this workflow has no extract step."""
        pass

    def transform(self):
        """Fit the model and hand its results to the CSV writers.

        Builds a ``Model`` from the workflow settings, runs its ``set_*``
        steps in order, logs the score, metrics and classification report,
        and then passes four row sets to ``save_rows``/``save_row``:
        predictions, scores, the classification report and the ROC metrics.
        Every row gets ``self.model_version`` appended as its last column.
        """
        # Use the values stored by __init__; the bare names
        # ``table_columns`` / ``model_version`` are not defined here.
        version = self.model_version

        self.model = Model(raw_data=self.sample,
                           table_columns=self.table_columns,
                           feature_columns=self.feature_columns,
                           target_column=self.target_column,
                           model_type=self.model_type)
        self.model.set_data_frame(separator=self.separator,
                                  header=self.header)
        self.model.set_features()
        self.model.set_target()
        self.model.set_train_test()
        self.model.set_fitted_model()
        self.model.set_predictions()
        self.model.set_score()
        self.model.set_classification_report()
        self.model.set_metrics()

        logger.info(self.model.score)
        logger.info(self.model.metrics)
        logger.info(self.model.classification_report)

        # One row per test sample: feature values, prediction, version.
        feature_lists = self.format_features_list()
        predictions = self.model.predictions.tolist()
        samples_and_predictions = [
            features + [prediction, version]
            for features, prediction in zip(feature_lists, predictions)
        ]
        save_rows(file=self._output_path('predictions'),
                  rows=samples_and_predictions)

        save_row(file=self._output_path('scores'),
                 row=self.model.score.tolist() + [version])

        save_rows(file=self._output_path('classification_report'),
                  rows=[row + [version]
                        for row in self.format_classification_report()])

        metrics = self.model.metrics
        metric_rows = [
            self._as_list(metrics['false_positive_rate']),
            self._as_list(metrics['true_positive_rate']),
            self._as_list(metrics['thresholds']),
            [metrics['roc_area_under_curve']],
        ]
        save_rows(file=self._output_path('metrics_report'),
                  rows=[row + [version] for row in metric_rows])

    def _output_path(self, suffix):
        """Return '<file>_<model_name>_<suffix>.csv' for an output file."""
        return '{file}_{model_name}_{suffix}.csv'.format(
            file=self.file, model_name=self.model_name, suffix=suffix)

    @staticmethod
    def _as_list(values):
        """Return ``values`` as a plain list.

        Array-like metrics expose ``tolist()``; without the conversion,
        ``array + [version]`` would add element-wise instead of appending.
        """
        to_list = getattr(values, 'tolist', None)
        return to_list() if to_list is not None else list(values)

    def format_features_list(self):
        """Return the test features as a list of per-row value lists."""
        features = self.model.test['features']
        return [features.iloc[i].tolist() for i in range(len(features))]

    def format_classification_report(self):
        """Split the text classification report into four CSV-ready rows.

        The rows are: the header line prefixed with ``'class'``, report
        lines 2 and 3, and report line 5 with its first three tokens joined
        into a single label.

        NOTE(review): the fixed line indexes assume a two-class report laid
        out as header, blank, two class rows, blank, three-word summary row
        -- confirm against the report format actually produced.
        """
        lines = self.model.classification_report.split('\n')
        # Tokenise once instead of re-splitting the report for every row.
        tokens = [line.split() for line in lines]
        summary = tokens[5]
        return [
            ['class'] + tokens[0],
            tokens[2],
            tokens[3],
            [''.join(summary[0:3])] + summary[3:],
        ]

    def format_metrics(self):
        """Convert the array-valued ROC metrics to plain lists in place."""
        metrics = self.model.metrics
        for key in ('false_positive_rate', 'true_positive_rate',
                    'thresholds'):
            metrics[key] = metrics[key].tolist()

    def load(self):
        """No-op: this workflow has no load step."""
        pass
class ModelBuilderWorkflow(BaseETLWorkflow):
    """ETL-style workflow that trains a ``Model`` and saves its outputs as CSV.

    ``extract`` and ``load`` are no-ops; all the work happens in
    ``transform``.
    """

    def __init__(self, sample=None, separator=None, header=None,
                 model_name=None, model_type=None, model_version=1.0,
                 table_columns=None, feature_columns=None,
                 target_column=None, start_date=None):
        """Store the model settings and resolve the output paths.

        :param sample: file name of the sample data; it is joined onto the
            ``source_data`` entry of ``self.config.path``.
        :param separator: passed to ``Model.set_data_frame``.
        :param header: passed to ``Model.set_data_frame``.
        :param model_name: used in the names of the output files.
        :param model_type: passed to the ``Model`` constructor.
        :param model_version: appended as the last column of every saved row.
        :param table_columns: passed to the ``Model`` constructor.
        :param feature_columns: passed to the ``Model`` constructor.
        :param target_column: passed to the ``Model`` constructor.
        :param start_date: accepted but not used by this class.
        """
        super(ModelBuilderWorkflow, self).__init__(
            workflow_name='model_builder')
        self.sample = '{path}/{sample}'.format(
            path=self.config.path.get('source_data', None), sample=sample)
        self.separator = separator
        self.header = header
        self.model_type = model_type
        self.model_name = model_name
        self.model_version = model_version
        self.file_name = self.table
        self.file = None
        self.table_columns = table_columns
        self.feature_columns = feature_columns
        self.target_column = target_column
        self.model = None
        self.get_paths()

    def get_paths(self, query_name=None):
        """Resolve the output paths and make sure the base directory exists.

        Sets ``self.file`` and ``self.compressed_file`` from
        ``self.get_file_path()``.  ``query_name`` is accepted but not used.

        :return: the base directory returned by ``get_file_path``.
        """
        (base_dir, self.file, _raw_file,
         self.compressed_file) = self.get_file_path()
        command.create_directories(base_dir)
        return base_dir

    def extract(self):
        """No-op: this workflow has no extract step."""
        pass

    def transform(self):
        """Fit the model and hand its results to the CSV writers.

        Builds a ``Model`` from the workflow settings, runs its ``set_*``
        steps in order, logs the score, metrics and classification report,
        and then passes four row sets to ``save_rows``/``save_row``:
        predictions, scores, the classification report and the ROC metrics.
        Every row gets ``self.model_version`` appended as its last column.
        """
        # Use the values stored by __init__; the bare names
        # ``table_columns`` / ``model_version`` are not defined here.
        version = self.model_version

        self.model = Model(raw_data=self.sample,
                           table_columns=self.table_columns,
                           feature_columns=self.feature_columns,
                           target_column=self.target_column,
                           model_type=self.model_type)
        self.model.set_data_frame(separator=self.separator,
                                  header=self.header)
        self.model.set_features()
        self.model.set_target()
        self.model.set_train_test()
        self.model.set_fitted_model()
        self.model.set_predictions()
        self.model.set_score()
        self.model.set_classification_report()
        self.model.set_metrics()

        logger.info(self.model.score)
        logger.info(self.model.metrics)
        logger.info(self.model.classification_report)

        # One row per test sample: feature values, prediction, version.
        feature_lists = self.format_features_list()
        predictions = self.model.predictions.tolist()
        samples_and_predictions = [
            features + [prediction, version]
            for features, prediction in zip(feature_lists, predictions)
        ]
        save_rows(file=self._output_path('predictions'),
                  rows=samples_and_predictions)

        save_row(file=self._output_path('scores'),
                 row=self.model.score.tolist() + [version])

        save_rows(file=self._output_path('classification_report'),
                  rows=[row + [version]
                        for row in self.format_classification_report()])

        metrics = self.model.metrics
        metric_rows = [
            self._as_list(metrics['false_positive_rate']),
            self._as_list(metrics['true_positive_rate']),
            self._as_list(metrics['thresholds']),
            [metrics['roc_area_under_curve']],
        ]
        save_rows(file=self._output_path('metrics_report'),
                  rows=[row + [version] for row in metric_rows])

    def _output_path(self, suffix):
        """Return '<file>_<model_name>_<suffix>.csv' for an output file."""
        return '{file}_{model_name}_{suffix}.csv'.format(
            file=self.file, model_name=self.model_name, suffix=suffix)

    @staticmethod
    def _as_list(values):
        """Return ``values`` as a plain list.

        Array-like metrics expose ``tolist()``; without the conversion,
        ``array + [version]`` would add element-wise instead of appending.
        """
        to_list = getattr(values, 'tolist', None)
        return to_list() if to_list is not None else list(values)

    def format_features_list(self):
        """Return the test features as a list of per-row value lists."""
        features = self.model.test['features']
        return [features.iloc[i].tolist() for i in range(len(features))]

    def format_classification_report(self):
        """Split the text classification report into four CSV-ready rows.

        The rows are: the header line prefixed with ``'class'``, report
        lines 2 and 3, and report line 5 with its first three tokens joined
        into a single label.

        NOTE(review): the fixed line indexes assume a two-class report laid
        out as header, blank, two class rows, blank, three-word summary row
        -- confirm against the report format actually produced.
        """
        lines = self.model.classification_report.split('\n')
        # Tokenise once instead of re-splitting the report for every row.
        tokens = [line.split() for line in lines]
        summary = tokens[5]
        return [
            ['class'] + tokens[0],
            tokens[2],
            tokens[3],
            [''.join(summary[0:3])] + summary[3:],
        ]

    def format_metrics(self):
        """Convert the array-valued ROC metrics to plain lists in place."""
        metrics = self.model.metrics
        for key in ('false_positive_rate', 'true_positive_rate',
                    'thresholds'):
            metrics[key] = metrics[key].tolist()

    def load(self):
        """No-op: this workflow has no load step."""
        pass