class LoadPredictionsMetadata(CopyToTable):
    """Record bookkeeping metadata about a finished predictions load.

    Depends on CreatePredictions and writes a single audit row to
    ``predictions.metadata`` describing when, by whom, and from where the
    predictions were produced.
    """

    year = luigi.IntParameter()
    month = luigi.IntParameter()
    day = luigi.IntParameter()
    # NOTE(review): generated once at import time and shared by every instance
    # in this process (it is a plain class attribute, not a luigi.Parameter) —
    # confirm that a single per-process matrix id is the intended behavior.
    matrix_uuid = str(uuid.uuid4())
    pipeline_type = luigi.Parameter()

    host, database, user, password = get_database_connection_parameters()
    table = "predictions.metadata"
    schema = "predictions"
    columns = [
        ("executed_at", "timestamp"),
        ("matrix_uuid", "varchar"),
        ("task_params", "varchar"),
        ("total_predictions", "integer"),
        ("execution_user", "varchar"),
        ("source_ip", "varchar"),
        ("script_tag", "varchar"),
    ]

    def requires(self):
        return CreatePredictions(self.year, self.month, self.day,
                                 self.matrix_uuid, self.pipeline_type)

    def run(self):
        # Count what the upstream task actually inserted before logging it.
        meta = MetadataHelper(self.year, self.month, self.day)
        self.inserted_record_count = meta.get_inserted_predictions()
        super().run()

    def rows(self):
        task_params = "year={} month={} day={}".format(
            self.year, self.month, self.day)
        yield (
            str(datetime.now(tz=None)),
            self.matrix_uuid,
            task_params,
            self.inserted_record_count,
            get_os_user(),
            get_current_ip(),
            "https://github.com/dpa-2020-equipo-5/nyc-ccci-etl/blob/master/nyc_ccci_etl/predict/predictions_creator.py",
        )
class LoadCleanInspectionsMetadata(CopyToTable):
    """Write an audit row to ``clean.metadata`` after the clean load finishes."""

    year = luigi.IntParameter()
    month = luigi.IntParameter()
    day = luigi.IntParameter()

    host, database, user, password = get_database_connection_parameters()
    table = "clean.metadata"
    schema = "clean"
    columns = [
        ("executed_at", "timestamp"),
        ("task_params", "varchar"),
        ("record_count", "integer"),
        ("execution_user", "varchar"),
        ("source_ip", "varchar"),
        ("database_name", "varchar"),
        ("database_schema", "varchar"),
        ("database_table", "varchar"),
        ("database_user", "varchar"),
        ("vars", "varchar"),
        ("script_tag", "varchar"),
    ]

    def requires(self):
        return LoadCleanInspections(self.year, self.month, self.day)

    def run(self):
        # Collect what the clean load actually inserted before recording it.
        meta = MetadataHelper(self.year, self.month, self.day)
        self.inserted_columns = meta.get_inserted_clean_columns()
        self.inserted_record_count = meta.get_inserted_clean_records()
        super().run()

    def rows(self):
        task_params = "year={} month={} day={}".format(
            self.year, self.month, self.day)
        yield (
            str(datetime.now(tz=None)),
            task_params,
            self.inserted_record_count,
            get_os_user(),
            get_current_ip(),
            self.database,
            self.schema,
            self.table,
            self.user,
            self.inserted_columns,
            "etl",
        )
class PipelineRoot(CopyToTable):
    """Pipeline entry point: fan out to the requested sub-pipeline.

    On completion, writes a synthetic marker row to ``table_updates`` so the
    whole run is considered done.
    """

    year = luigi.IntParameter()
    month = luigi.IntParameter()
    day = luigi.IntParameter()
    pipeline_type = luigi.Parameter()

    host, database, user, password = get_database_connection_parameters()
    table = "table_updates"
    columns = [
        ('update_id', 'text'),
        ('target_table', 'text'),
        ('inserted', 'timestamp'),
    ]

    def requires(self):
        kind = str(self.pipeline_type)
        if kind == 'train':
            return LoadBiasFairnessMetadata(self.year, self.month, self.day,
                                            self.pipeline_type)
        if kind == 'predict':
            return LoadPredictionsMetadata(self.year, self.month, self.day,
                                           self.pipeline_type)
        if kind == 'load':
            return (
                LoadTransformedInspectionsMetadata(self.year, self.month, self.day),
                LoadUpdateCentersMetadata(self.year, self.month, self.day),
            )
        # NOTE(review): any other pipeline_type silently yields no
        # dependencies (implicit None) — confirm this is intentional.

    def rows(self):
        update_id = "{}_{}{}{}".format(self.pipeline_type, self.year,
                                       self.month, self.day)
        yield (update_id, "table_updates",
               datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
class ModelMetadata(CopyToTable):
    """Run the random-forest grid search and persist the winning parameters."""

    year = luigi.IntParameter()
    month = luigi.IntParameter()
    day = luigi.IntParameter()

    host, database, user, password = get_database_connection_parameters()
    table = "modeling.model_parameters"
    schema = "modeling"

    def requires(self):
        return (
            LoadTransformedInspectionsMetadata(self.year, self.month, self.day),
            LoadUpdateCentersMetadata(self.year, self.month, self.day),
        )

    def run(self):
        # The grid search has to finish before rows() is meaningful, so the
        # score/params (and columns) are set here rather than at class level.
        self.model_params, self.score = RandomForestGridSearch.find_best_params()
        self.columns = [
            ('task_id', 'varchar'),
            ('created_at', 'timestamp'),
            ('score', 'float'),
            ('n_estimators', 'integer'),
            ('bootstrap', 'boolean'),
            ('class_weight', 'varchar'),
            ('max_depth', 'integer'),
            ('criterion', 'varchar'),
        ]
        super().run()

    def rows(self):
        best = self.model_params
        yield (
            "{}{}{}".format(self.year, self.month, self.day),
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            self.score,
            best['n_estimators'],
            best['bootstrap'],
            best['class_weight'],
            best['max_depth'],
            best['criterion'],
        )
def __init__(self, year, month, day):
    """Precompute per-schema date filters and open a SQLAlchemy engine.

    Each downstream table stores the inspection date in a slightly different
    textual format, hence the four filter variants.
    """
    host, database, user, password = get_database_connection_parameters()
    yy = str(year).zfill(2)   # no-op for 4-digit years; kept for parity
    mm = str(month).zfill(2)
    dd = str(day).zfill(2)
    self.date_filter = "{}-{}-{}T00:00:00.000".format(yy, mm, dd)
    self.date_filter_clean = "{}_{}_{}t00:00:00.000".format(yy, mm, dd)
    self.date_filter_transformed = "{}-{}-{} 00:00:00".format(yy, mm, dd)
    # Aequitas uses the unpadded year on purpose (matches its table format).
    self.date_filter_aequitas = "{}-{}-{}".format(year, mm, dd)
    self.engine = create_engine(
        "postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}".format(
            user=user,
            password=password,
            host=host,
            port=5432,
            database=database,
        )
    )
class InspectionDatesMatchRequestDateValidation(CopyToTable):
    """Unit-test task: extracted inspection dates must match the request date.

    On failure the result is printed and the process exits before anything is
    persisted; on success the outcome is logged to ``testing.extractions``.
    """

    year = luigi.IntParameter()
    month = luigi.IntParameter()
    day = luigi.IntParameter()

    host, database, user, password = get_database_connection_parameters()
    table = "testing.extractions"
    schema = "testing"
    columns = [
        ('test', 'varchar'),
        ('ran_at', 'timestamp'),
        ('params', 'varchar'),
        ('status', 'varchar'),
        ('note', 'varchar'),
    ]

    def run(self):
        extractor_tests = TestInspectionsExtractor()
        self.test_result = extractor_tests.test_inspection_date_should_match_params_date(
            self.year, self.month, self.day)
        if self.test_result['status'] == 'failed':
            # Abort the pipeline: nothing downstream should run on bad data.
            print_test_failed(self.test_result['test'], self.test_result['note'])
            sys.exit()
        print_test_passed(self.test_result['test'])
        super().run()

    def rows(self):
        params = "year={} month={} day={}".format(self.year, self.month,
                                                  self.day)
        yield (
            self.test_result['test'],
            self.test_result['ran_at'],
            params,
            self.test_result['status'],
            self.test_result['note'],
        )
def __init__(self):
    """Open a SQLAlchemy engine against the project's Postgres database."""
    host, database, user, password = get_database_connection_parameters()
    self.engine = create_engine(
        "postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}".format(
            user=user,
            password=password,
            host=host,
            port=5432,
            database=database,
        )
    )
def __init__(self, year, month, day):
    """Derive the task id from the run date and open a SQLAlchemy engine."""
    # e.g. year=2020, month=3, day=1 -> "202031" (no zero padding).
    self.task_id = "{}{}{}".format(year, month, day)
    host, database, user, password = get_database_connection_parameters()
    self.engine = create_engine(
        "postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}".format(
            user=user,
            password=password,
            host=host,
            port=5432,
            database=database,
        )
    )
def __init__(self, year, month, day):
    """Capture the S3 bucket, a zero-padded date string, and a DB engine."""
    host, database, user, password = get_database_connection_parameters()
    self.bucket = get_aws_bucket()
    # Year is deliberately unpadded; month/day are zero-padded to two digits.
    self.date_param = "{}-{}-{}".format(year, str(month).zfill(2),
                                        str(day).zfill(2))
    self.engine = create_engine(
        "postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}".format(
            user=user,
            password=password,
            host=host,
            port=5432,
            database=database,
        )
    )
class PredictionsColumnsValidation(CopyToTable):
    """Unit-test task: the prediction matrix must expose the columns that the
    latest trained model expects.

    Downloads the most recently modified model pickle from S3, runs the column
    check, and records the outcome in ``testing.predictions``. On failure the
    process exits so nothing downstream consumes bad predictions.
    """

    year = luigi.IntParameter()
    month = luigi.IntParameter()
    day = luigi.IntParameter()
    matrix_uuid = luigi.Parameter()

    host, database, user, password = get_database_connection_parameters()
    # BUG FIX: the original read ``bucket = get_aws_bucket`` which assigned
    # the *function object* instead of calling it; boto3 would then receive a
    # function as the Bucket argument. It must be called, as in the other
    # tasks that resolve the bucket name.
    bucket = get_aws_bucket()
    table = "testing.predictions"
    schema = "testing"
    columns = [
        ('test', 'varchar'),
        ('ran_at', 'timestamp'),
        ('params', 'varchar'),
        ('status', 'varchar'),
        ('note', 'varchar'),
    ]

    def get_lastest_model(self, session):
        """Return the key of the most recently modified object in the bucket
        (assumed to be the newest model pickle)."""
        s3_client = session.client('s3')
        response = s3_client.list_objects_v2(Bucket=self.bucket)
        all_models = response['Contents']
        latest = max(all_models, key=lambda obj: obj['LastModified'])
        return latest['Key']

    def run(self):
        ses = boto3.session.Session(profile_name='default',
                                    region_name='us-east-1')
        latest_model = self.get_lastest_model(ses)
        s3_resource = ses.resource('s3')
        with BytesIO() as data:
            s3_resource.Bucket(self.bucket).download_fileobj(latest_model, data)
            data.seek(0)
            # NOTE: pickle.load is acceptable only because the bucket is
            # project-controlled; never point this at untrusted data.
            model = pickle.load(data)
        test_preds = TestPredictions()
        self.test_result = test_preds.test_predictions_have_correct_columns(
            model, self.year, self.month, self.day, self.matrix_uuid)
        if self.test_result['status'] == 'failed':
            # Abort the pipeline so bad predictions are never consumed.
            print_test_failed(self.test_result['test'],
                              self.test_result['note'])
            sys.exit()
        else:
            print_test_passed(self.test_result['test'])
        super().run()

    def rows(self):
        params = "year={} month={} day={}".format(self.year, self.month,
                                                  self.day)
        yield (self.test_result['test'], self.test_result['ran_at'], params,
               self.test_result['status'], self.test_result['note'])
def __init__(self, model, year, month, day, uuid):
    """Store the trained model and run identifiers, and open a DB engine.

    Note: the ``uuid`` parameter name shadows the stdlib ``uuid`` module
    inside this method; the name is kept for caller compatibility.
    """
    host, database, user, password = get_database_connection_parameters()
    self.engine = create_engine(
        "postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}".format(
            user=user,
            password=password,
            host=host,
            port=5432,
            database=database,
        )
    )
    self.model = model
    self.year = year
    self.month = month
    self.day = day
    self.matrix_uuid = uuid
class CreatePredictions(CopyToTable):
    """Download the latest model pickle from S3 and write its predictions to
    ``predictions.predictions``."""

    # Single source of truth for the model bucket: the original repeated the
    # magic string "nyc-ccci" in two places. NOTE(review): sibling tasks
    # resolve the bucket via get_aws_bucket(); consider switching here too
    # once it is confirmed to return the same bucket.
    BUCKET = "nyc-ccci"

    year = luigi.IntParameter()
    month = luigi.IntParameter()
    day = luigi.IntParameter()
    matrix_uuid = luigi.Parameter()
    pipeline_type = luigi.Parameter()

    host, database, user, password = get_database_connection_parameters()
    table = "predictions.predictions"
    schema = "predictions"

    def requires(self):
        return (PredictionsValidationMetadata(self.year, self.month, self.day,
                                              self.matrix_uuid,
                                              self.pipeline_type))

    def get_lastest_model(self, session):
        """Return the key of the most recently modified object in the model
        bucket (assumed to be the newest model pickle)."""
        s3_client = session.client('s3')
        response = s3_client.list_objects_v2(Bucket=self.BUCKET)
        all_models = response['Contents']
        latest = max(all_models, key=lambda obj: obj['LastModified'])
        return latest['Key']

    def run(self):
        ses = boto3.session.Session(profile_name='default',
                                    region_name='us-east-1')
        latest_model = self.get_lastest_model(ses)
        s3_resource = ses.resource('s3')
        with BytesIO() as data:
            s3_resource.Bucket(self.BUCKET).download_fileobj(latest_model, data)
            data.seek(0)
            # NOTE: pickle.load is acceptable only because the bucket is
            # project-controlled; never point this at untrusted data.
            model = pickle.load(data)
        predictor = PredictionsCreator(model, self.year, self.month, self.day,
                                       self.matrix_uuid)
        # The predictor supplies both the rows and their column definitions.
        self._rows, self.columns = predictor.create_predictions()
        super().run()

    def rows(self):
        yield from self._rows
class LoadBiasFairnessMetadata(CopyToTable):
    """Record an audit row in ``aequitas.metadata`` once all three Aequitas
    loads (groups, bias, fairness) have completed."""

    year = luigi.IntParameter()
    month = luigi.IntParameter()
    day = luigi.IntParameter()
    pipeline_type = luigi.Parameter()

    host, database, user, password = get_database_connection_parameters()
    table = "aequitas.metadata"
    schema = "aequitas"
    columns = [
        ("executed_at", "timestamp"),
        ("task_params", "varchar"),
        ("bias_records", "integer"),
        ("fairness_records", "integer"),
        ("groups_records", "integer"),
        ("execution_user", "varchar"),
        ("source_ip", "varchar"),
    ]

    def requires(self):
        args = (self.year, self.month, self.day, self.pipeline_type)
        return (
            LoadAequitasGroups(*args),
            LoadAequitasBias(*args),
            LoadAequitasFairness(*args),
        )

    def run(self):
        # Count what each Aequitas load inserted before recording metadata.
        meta = MetadataHelper(self.year, self.month, self.day)
        self.inserted_bias_records = meta.get_inserted_aequitas_bias()
        self.inserted_fairness_records = meta.get_inserted_aequitas_fairness()
        self.inserted_groups_records = meta.get_inserted_aequitas_groups()
        super().run()

    def rows(self):
        task_params = "year={} month={} day={}".format(
            self.year, self.month, self.day)
        yield (
            str(datetime.now(tz=None)),
            task_params,
            self.inserted_bias_records,
            self.inserted_fairness_records,
            self.inserted_groups_records,
            get_os_user(),
            get_current_ip(),
        )
class LoadAequitasGroups(CopyToTable):
    """Compute Aequitas group metrics and load them into ``aequitas.groups``."""

    year = luigi.IntParameter()
    month = luigi.IntParameter()
    day = luigi.IntParameter()
    pipeline_type = luigi.Parameter()

    host, database, user, password = get_database_connection_parameters()
    table = "aequitas.groups"
    schema = "aequitas"

    def requires(self):
        # Only the training pipeline needs a freshly fitted model first;
        # other pipeline types implicitly have no dependency (None).
        if str(self.pipeline_type) == 'train':
            return FitRandomForestAndCreatePickle(self.year, self.month,
                                                  self.day)

    def run(self):
        metrics = GroupMetrics(self.year, self.month, self.day)
        # NOTE(review): 'execeute' looks like a typo for 'execute', but the
        # call must match GroupMetrics' actual method name — verify against
        # that class before renaming.
        self._rows, self.columns = metrics.execeute()
        super().run()

    def rows(self):
        yield from self._rows
class UpdateCenters(CopyToTable):
    """Refresh the ``transformed.centers`` table for the given run date."""

    year = luigi.IntParameter()
    month = luigi.IntParameter()
    day = luigi.IntParameter()

    host, database, user, password = get_database_connection_parameters()
    table = "transformed.centers"

    def requires(self):
        return FeatureEngineeringValidationMetadata(self.year, self.month,
                                                    self.day)

    def run(self):
        centers_updater = CentersUpdater(self.year, self.month, self.day)
        # Renamed from the ambiguous ``self.rs`` to ``self._rows`` for
        # consistency with the sibling load tasks in this module; the updater
        # supplies both the rows and their column definitions.
        self._rows, self.columns = centers_updater.execute()
        super().run()

    def rows(self):
        yield from self._rows
class LoadRawInspections(CopyToTable):
    """Extract the day's inspections and store each record as raw JSON in
    ``raw.inspections``."""

    year = luigi.IntParameter()
    month = luigi.IntParameter()
    day = luigi.IntParameter()

    host, database, user, password = get_database_connection_parameters()
    table = "raw.inspections"
    columns = [("inspection", "json")]

    def requires(self):
        return ExtractionValidationMetadata(self.year, self.month, self.day)

    def rows(self):
        extractor = InspectionsExtractor(self.year, self.month, self.day)
        # The extractor returns a list of dicts, one JSON document each.
        for record in extractor.execute():
            # Escape single quotes so the value survives the SQL insert.
            yield (json.dumps(record).replace("'", "''"),)
class LoadTransformedInspections(CopyToTable):
    """Feature-engineer the inspections and load them into
    ``transformed.inspections``."""

    year = luigi.IntParameter()
    month = luigi.IntParameter()
    day = luigi.IntParameter()

    host, database, user, password = get_database_connection_parameters()
    table = "transformed.inspections"

    def requires(self):
        return FeatureEngineeringValidationMetadata(self.year, self.month,
                                                    self.day)

    def run(self):
        transformer = InspectionsTransformer(self.year, self.month, self.day)
        # The transformer supplies both the rows and their column definitions.
        self._rows, self.columns = transformer.execute()
        super().run()

    def rows(self):
        yield from self._rows
class LoadCleanInspections(CopyToTable):
    """Extract the day's inspections, clean them, and load the result into
    ``clean.inspections``."""

    year = luigi.IntParameter()
    month = luigi.IntParameter()
    day = luigi.IntParameter()

    host, database, user, password = get_database_connection_parameters()
    table = "clean.inspections"

    def requires(self):
        return ExtractionValidationMetadata(self.year, self.month, self.day)

    def run(self):
        raw_records = InspectionsExtractor(self.year, self.month,
                                           self.day).execute()
        cleaner = InspectionsCleaner(raw_records)
        self._rows, self._columns = cleaner.execute()
        # Column names come from the cleaner; every column is loaded as text.
        self.columns = [(name, 'VARCHAR') for name in self._columns]
        super().run()

    def rows(self):
        yield from self._rows
class ColumnsOneHotEncodingValidation(CopyToTable):
    """Unit-test task: the feature-engineered columns must be correctly
    one-hot encoded.

    Runs after both the raw and clean metadata loads; records the outcome in
    ``testing.feature_engineering``. On failure the result is printed and the
    process exits before anything is persisted.

    Changes vs. original: removed leftover commented-out code
    (``#if self.test_result['status'] == 'failed':``); behavior unchanged.
    """

    year = luigi.IntParameter()
    month = luigi.IntParameter()
    day = luigi.IntParameter()

    host, database, user, password = get_database_connection_parameters()
    table = "testing.feature_engineering"
    schema = "testing"
    columns = [
        ('test', 'varchar'),
        ('ran_at', 'timestamp'),
        ('params', 'varchar'),
        ('status', 'varchar'),
        ('note', 'varchar'),
    ]

    def requires(self):
        return (LoadRawInspectionsMetadata(self.year, self.month, self.day),
                LoadCleanInspectionsMetadata(self.year, self.month, self.day))

    def run(self):
        test_feature_engineering = TestFeatureEngineering()
        self.test_result = test_feature_engineering.test_columns_one_hot_encoding(
            self.year, self.month, self.day)
        if self.test_result['status'] == 'failed':
            # Abort the pipeline: nothing downstream should run on bad data.
            print_test_failed(self.test_result['test'],
                              self.test_result['note'])
            sys.exit()
        else:
            print_test_passed(self.test_result['test'])
        super().run()

    def rows(self):
        params = "year={} month={} day={}".format(self.year, self.month,
                                                  self.day)
        yield (self.test_result['test'], self.test_result['ran_at'], params,
               self.test_result['status'], self.test_result['note'])