Code example #1
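These snippets are taken from the nyc-ccci-etl project (https://github.com/dpa-2020-equipo-5/nyc-ccci-etl). They share a common set of imports, sketched below; the project's own helpers (get_database_connection_parameters, MetadataHelper, get_os_user, get_current_ip, the extractor/transformer classes, and so on) live in the repository's packages, and their exact import paths are not shown here.

# Shared standard-library and third-party imports assumed by the snippets below.
import json
import pickle
import sys
import uuid
from datetime import datetime
from io import BytesIO

import boto3
import luigi
from luigi.contrib.postgres import CopyToTable
from sqlalchemy import create_engine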
class LoadPredictionsMetadata(CopyToTable):
    year = luigi.IntParameter()
    month = luigi.IntParameter()
    day = luigi.IntParameter()
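    # Note: this UUID is generated once, at class-definition time, so every
    # instance of the task in this process shares the same matrix_uuid.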
    matrix_uuid = str(uuid.uuid4())
    pipeline_type = luigi.Parameter()

    def requires(self):
        return CreatePredictions(self.year, self.month, self.day,
                                 self.matrix_uuid, self.pipeline_type)

    host, database, user, password = get_database_connection_parameters()
    table = "predictions.metadata"
    schema = "predictions"
    columns = [("executed_at", "timestamp"), ("matrix_uuid", "varchar"),
               ("task_params", "varchar"), ("total_predictions", "integer"),
               ("execution_user", "varchar"), ("source_ip", "varchar"),
               ("script_tag", "varchar")]

    def run(self):
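        # Compute the values rows() will emit before CopyToTable.run() copies them.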
        helper = MetadataHelper(self.year, self.month, self.day)
        self.inserted_record_count = helper.get_inserted_predictions()
        super().run()

    def rows(self):
        params_string = "year={} month={} day={}".format(
            str(self.year), str(self.month), str(self.day))
        row = (
            str(datetime.now(tz=None)), self.matrix_uuid, params_string,
            self.inserted_record_count, get_os_user(), get_current_ip(),
            "https://github.com/dpa-2020-equipo-5/nyc-ccci-etl/blob/master/nyc_ccci_etl/predict/predictions_creator.py"
        )
        yield row
Code example #2
class LoadCleanInspectionsMetadata(CopyToTable):
    year = luigi.IntParameter()
    month = luigi.IntParameter()
    day = luigi.IntParameter()

    def requires(self):
        return LoadCleanInspections(self.year, self.month, self.day)

    host, database, user, password = get_database_connection_parameters()
    table = "clean.metadata"
    schema = "clean"

    columns = [("executed_at", "timestamp"), ("task_params", "varchar"),
               ("record_count", "integer"), ("execution_user", "varchar"),
               ("source_ip", "varchar"), ("database_name", "varchar"),
               ("database_schema", "varchar"), ("database_table", "varchar"),
               ("database_user", "varchar"), ("vars", "varchar"),
               ("script_tag", "varchar")]

    def run(self):
        helper = MetadataHelper(self.year, self.month, self.day)
        self.inserted_columns = helper.get_inserted_clean_columns()
        self.inserted_record_count = helper.get_inserted_clean_records()
        super().run()

    def rows(self):
        params_string = "year={} month={} day={}".format(
            str(self.year), str(self.month), str(self.day))
        row = (str(datetime.now(tz=None)),
               params_string, self.inserted_record_count, get_os_user(),
               get_current_ip(), self.database, self.schema, self.table,
               self.user, self.inserted_columns, "etl")
        yield row
Code example #3
class PipelineRoot(CopyToTable):
    year = luigi.IntParameter()
    month = luigi.IntParameter()
    day = luigi.IntParameter()
    pipeline_type = luigi.Parameter()

    def requires(self):
        if str(self.pipeline_type) == 'train':
            return LoadBiasFairnessMetadata(self.year, self.month, self.day,
                                            self.pipeline_type)
        elif str(self.pipeline_type) == 'predict':
            return LoadPredictionsMetadata(self.year, self.month, self.day,
                                           self.pipeline_type)
        elif str(self.pipeline_type) == 'load':
            return (LoadTransformedInspectionsMetadata(self.year, self.month,
                                                       self.day),
                    LoadUpdateCentersMetadata(self.year, self.month, self.day))

    columns = [
        ('update_id', 'text'),
        ('target_table', 'text'),
        ('inserted', 'timestamp'),
    ]
    host, database, user, password = get_database_connection_parameters()
    table = "table_updates"
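    # "table_updates" is also the default marker table that luigi.contrib.postgres
    # uses to record completed updates, so this task writes its row there directly.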

    def rows(self):
        update_id = "{}_{}{}{}".format(str(self.pipeline_type), str(self.year),
                                       str(self.month), str(self.day))
        yield (update_id, "table_updates",
               datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
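A root task like this would typically be launched through luigi itself; here is a minimal sketch, assuming hypothetical parameter values and the local scheduler.

# Sketch only: the date and pipeline_type values are illustrative.
import luigi

if __name__ == "__main__":
    luigi.build(
        [PipelineRoot(year=2020, month=5, day=1, pipeline_type="predict")],
        local_scheduler=True,
    )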
Code example #4
class ModelMetadata(CopyToTable):
    year = luigi.IntParameter()
    month = luigi.IntParameter()
    day = luigi.IntParameter()

    def requires(self):
        return (LoadTransformedInspectionsMetadata(self.year, self.month,
                                                   self.day),
                LoadUpdateCentersMetadata(self.year, self.month, self.day))

    host, database, user, password = get_database_connection_parameters()
    table = "modeling.model_parameters"
    schema = "modeling"

    def run(self):
        self.model_params, self.score = RandomForestGridSearch.find_best_params(
        )
        self.columns = [('task_id', 'varchar'), ('created_at', 'timestamp'),
                        ('score', 'float'), ('n_estimators', 'integer'),
                        ('bootstrap', 'boolean'), ('class_weight', 'varchar'),
                        ('max_depth', 'integer'), ('criterion', 'varchar')]
        super().run()

    def rows(self):
        yield ("{}{}{}".format(str(self.year), str(self.month), str(self.day)),
               datetime.now().strftime('%Y-%m-%d %H:%M:%S'), self.score,
               self.model_params['n_estimators'],
               self.model_params['bootstrap'],
               self.model_params['class_weight'],
               self.model_params['max_depth'], self.model_params['criterion'])
Code example #5
 def __init__(self, year, month, day):
     host, database, user, password = get_database_connection_parameters()
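     # The requested date, pre-rendered in the formats that the raw, clean,
     # transformed, and aequitas filters expect.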
     self.date_filter = "{}-{}-{}T00:00:00.000".format(
         str(year).zfill(2),
         str(month).zfill(2),
         str(day).zfill(2))
     self.date_filter_clean = "{}_{}_{}t00:00:00.000".format(
         str(year).zfill(2),
         str(month).zfill(2),
         str(day).zfill(2))
     self.date_filter_transformed = "{}-{}-{} 00:00:00".format(
         str(year).zfill(2),
         str(month).zfill(2),
         str(day).zfill(2))
     self.date_filter_aequitas = "{}-{}-{}".format(str(year),
                                                   str(month).zfill(2),
                                                   str(day).zfill(2))
     engine_string = "postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}".format(
         user=user,
         password=password,
         host=host,
         port=5432,
         database=database,
     )
     self.engine = create_engine(engine_string)
Code example #6
class InspectionDatesMatchRequestDateValidation(CopyToTable):
    year = luigi.IntParameter()
    month = luigi.IntParameter()
    day = luigi.IntParameter()

    host, database, user, password = get_database_connection_parameters()
    table = "testing.extractions"
    schema = "testing"

    columns = [('test', 'varchar'), ('ran_at', 'timestamp'),
               ('params', 'varchar'), ('status', 'varchar'),
               ('note', 'varchar')]

    def run(self):
        test_inspections_extractor = TestInspectionsExtractor()
        self.test_result = test_inspections_extractor.test_inspection_date_should_match_params_date(
            self.year, self.month, self.day)
        if self.test_result['status'] == 'failed':
            print_test_failed(self.test_result['test'],
                              self.test_result['note'])
            sys.exit()
        else:
            print_test_passed(self.test_result['test'])
        super().run()

    def rows(self):
        params = "year={} month={} day={}".format(self.year, self.month,
                                                  self.day)
        yield (self.test_result['test'], self.test_result['ran_at'], params,
               self.test_result['status'], self.test_result['note'])
Code example #7
 def __init__(self):
     host, database, user, password = get_database_connection_parameters()
     engine_string = "postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}".format(
         user=user,
         password=password,
         host=host,
         port=5432,
         database=database,
     )
     self.engine = create_engine(engine_string)
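The engine created here is what the helper classes issue their queries through; a minimal usage sketch follows (the method name and query are illustrative, though clean.inspections is one of the pipeline's tables).

# Sketch only: a hypothetical helper method that queries through the engine
# created above.
from sqlalchemy import text

def count_clean_inspections(self):
    with self.engine.connect() as connection:
        return connection.execute(
            text("SELECT COUNT(*) FROM clean.inspections")).scalar()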
Code example #8
 def __init__(self, year, month, day):
     self.task_id = "{}{}{}".format(str(year), str(month), str(day))
     host, database, user, password = get_database_connection_parameters()
     engine_string = "postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}".format(
         user=user,
         password=password,
         host=host,
         port=5432,
         database=database,
     )
     self.engine = create_engine(engine_string)
Code example #9
 def __init__(self, year, month, day):
     host, database, user, password = get_database_connection_parameters()
     self.bucket = get_aws_bucket()
     self.date_param = "{}-{}-{}".format(year, str(month).zfill(2), str(day).zfill(2))
     engine_string = "postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}".format(
         user=user,
         password=password,
         host=host,
         port=5432,
         database=database,
     )
     self.engine = create_engine(engine_string)
Code example #10
class PredictionsColumnsValidation(CopyToTable):
    year = luigi.IntParameter()
    month = luigi.IntParameter()
    day = luigi.IntParameter()
    matrix_uuid = luigi.Parameter()

    host, database, user, password = get_database_connection_parameters()
    bucket = get_aws_bucket()
    table = "testing.predictions"
    schema = "testing"

    columns = [('test', 'varchar'), ('ran_at', 'timestamp'),
               ('params', 'varchar'), ('status', 'varchar'),
               ('note', 'varchar')]

    def get_lastest_model(self, session):
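        # Assumes the bucket is not empty and holds at most 1,000 keys
        # (list_objects_v2 responses are not paginated here).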
        s3_client = session.client('s3')
        response = s3_client.list_objects_v2(Bucket=self.bucket)
        all_models = response['Contents']
        latest = max(all_models, key=lambda x: x['LastModified'])
        return latest['Key']

    def run(self):
        ses = boto3.session.Session(profile_name='default',
                                    region_name='us-east-1')
        latest_model = self.get_lastest_model(ses)
        s3_resource = ses.resource('s3')
        with BytesIO() as data:
            s3_resource.Bucket(self.bucket).download_fileobj(
                latest_model, data)
            data.seek(0)
            model = pickle.load(data)

        test_preds = TestPredictions()
        self.test_result = test_preds.test_predictions_have_correct_columns(
            model, self.year, self.month, self.day, self.matrix_uuid)
        if self.test_result['status'] == 'failed':
            print_test_failed(self.test_result['test'],
                              self.test_result['note'])
            sys.exit()
        else:
            print_test_passed(self.test_result['test'])

        super().run()

    def rows(self):
        params = "year={} month={} day={}".format(self.year, self.month,
                                                  self.day)
        yield (self.test_result['test'], self.test_result['ran_at'], params,
               self.test_result['status'], self.test_result['note'])
Code example #11
 def __init__(self, model, year, month, day, uuid):
     host, database, user, password = get_database_connection_parameters()
     engine_string = "postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}".format(
         user=user,
         password=password,
         host=host,
         port=5432,
         database=database,
     )
     self.engine = create_engine(engine_string)
     self.model = model
     self.year = year
     self.month = month
     self.day = day
     self.matrix_uuid = uuid
Code example #12
class CreatePredictions(CopyToTable):
    year = luigi.IntParameter()
    month = luigi.IntParameter()
    day = luigi.IntParameter()
    matrix_uuid = luigi.Parameter()
    pipeline_type = luigi.Parameter()

    def requires(self):
        return (PredictionsValidationMetadata(self.year, self.month, self.day,
                                              self.matrix_uuid,
                                              self.pipeline_type))

    host, database, user, password = get_database_connection_parameters()
    table = "predictions.predictions"
    schema = "predictions"

    def run(self):
        ses = boto3.session.Session(profile_name='default',
                                    region_name='us-east-1')

        latest_model = self.get_lastest_model(ses)

        s3_resource = ses.resource('s3')

        with BytesIO() as data:
            s3_resource.Bucket("nyc-ccci").download_fileobj(latest_model, data)
            data.seek(0)
            model = pickle.load(data)

        predictor = PredictionsCreator(model, self.year, self.month, self.day,
                                       self.matrix_uuid)
        self._rows, self.columns = predictor.create_predictions()

        super().run()

    def rows(self):
        for element in self._rows:
            yield element

    def get_lastest_model(self, session):
        s3_client = session.client('s3')
        response = s3_client.list_objects_v2(Bucket='nyc-ccci')
        all_models = response['Contents']
        latest = max(all_models, key=lambda x: x['LastModified'])
        return latest['Key']
Code example #13
class LoadBiasFairnessMetadata(CopyToTable):
    year = luigi.IntParameter()
    month = luigi.IntParameter()
    day = luigi.IntParameter()
    pipeline_type = luigi.Parameter()
    
    def requires(self):
        return (
            LoadAequitasGroups(self.year, self.month, self.day, self.pipeline_type),
            LoadAequitasBias(self.year, self.month, self.day, self.pipeline_type),
            LoadAequitasFairness(self.year, self.month, self.day, self.pipeline_type)
        )

    host, database, user, password = get_database_connection_parameters()
    table = "aequitas.metadata"
    schema = "aequitas"
    columns = [ 
        ("executed_at", "timestamp"),
        ("task_params", "varchar"),
        ("bias_records", "integer"),
        ("fairness_records", "integer"),
        ("groups_records", "integer"),
        ("execution_user", "varchar"),
        ("source_ip", "varchar"),
    ]

    def run(self):
        helper = MetadataHelper(self.year, self.month, self.day)
        self.inserted_bias_records = helper.get_inserted_aequitas_bias()
        self.inserted_fairness_records = helper.get_inserted_aequitas_fairness()
        self.inserted_groups_records = helper.get_inserted_aequitas_groups()
        super().run()
    

    def rows(self):
        params_string = "year={} month={} day={}".format(
            str(self.year), str(self.month), str(self.day))
        row = (
            str(datetime.now(tz=None)),
            params_string,
            self.inserted_bias_records,
            self.inserted_fairness_records,
            self.inserted_groups_records,
            get_os_user(),
            get_current_ip()
        )
        yield row
Code example #14
class LoadAequitasGroups(CopyToTable):
    year = luigi.IntParameter()
    month = luigi.IntParameter()
    day = luigi.IntParameter()
    pipeline_type = luigi.Parameter()

    def requires(self):
        if str(self.pipeline_type) == 'train':
            return FitRandomForestAndCreatePickle(self.year, self.month,
                                                  self.day)

    host, database, user, password = get_database_connection_parameters()
    table = "aequitas.groups"
    schema = "aequitas"

    def run(self):
        g = GroupMetrics(self.year, self.month, self.day)
        self._rows, self.columns = g.execeute()
        super().run()

    def rows(self):
        for element in self._rows:
            yield element
Code example #15
class UpdateCenters(CopyToTable):
    year = luigi.IntParameter()
    month = luigi.IntParameter()
    day = luigi.IntParameter()

    def requires(self):
        return FeatureEngineeringValidationMetadata(self.year, self.month,
                                                    self.day)

    host, database, user, password = get_database_connection_parameters()
    table = "transformed.centers"

    def run(self):
        centers_updater = CentersUpdater(self.year, self.month, self.day)
        self.rs, self.columns = centers_updater.execute()
        super().run()

    def rows(self):
        for element in self.rs:
            yield element
Code example #16
class LoadRawInspections(CopyToTable):
    year = luigi.IntParameter()
    month = luigi.IntParameter()
    day = luigi.IntParameter()

    def requires(self):
        return ExtractionValidationMetadata(self.year, self.month, self.day)

    host, database, user, password = get_database_connection_parameters()
    table = "raw.inspections"
    columns = [("inspection", "json")]

    def rows(self):
        etl_extraction = InspectionsExtractor(self.year, self.month, self.day)

        # Run the extraction; it returns a list of dictionaries (JSON records).
        inspections_json_data = etl_extraction.execute()

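        # Double any single quotes so each JSON document can be inserted safely
        # as a SQL literal.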
        r = [(json.dumps(d).replace("'", "''"), )
             for d in inspections_json_data]
        for element in r:
            yield element
Code example #17
class LoadTransformedInspections(CopyToTable):
    year = luigi.IntParameter()
    month = luigi.IntParameter()
    day = luigi.IntParameter()

    host, database, user, password = get_database_connection_parameters()
    table = "transformed.inspections"

    def requires(self):
        return FeatureEngineeringValidationMetadata(self.year, self.month,
                                                    self.day)

    def run(self):
        transform_inspections = InspectionsTransformer(self.year, self.month,
                                                       self.day)
        self._rows, self.columns = transform_inspections.execute()

        super().run()

    def rows(self):
        for element in self._rows:
            yield element
Code example #18
class LoadCleanInspections(CopyToTable):
    year = luigi.IntParameter()
    month = luigi.IntParameter()
    day = luigi.IntParameter()

    def requires(self):
        return ExtractionValidationMetadata(self.year, self.month, self.day)

    host, database, user, password = get_database_connection_parameters()
    table = "clean.inspections"

    def run(self):
        etl_extraction = InspectionsExtractor(self.year, self.month, self.day)
        inspections_json_data = etl_extraction.execute()
        cleaner = InspectionsCleaner(inspections_json_data)
        self._rows, self._columns = cleaner.execute()
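        # Every cleaned column is declared as VARCHAR in the target table.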
        self.columns = [(c, 'VARCHAR') for c in self._columns]
        super().run()

    def rows(self):
        for element in self._rows:
            yield element
Code example #19
class ColumnsOneHotEncodingValidation(CopyToTable):
    year = luigi.IntParameter()
    month = luigi.IntParameter()
    day = luigi.IntParameter()

    def requires(self):
        return (LoadRawInspectionsMetadata(self.year, self.month, self.day),
                LoadCleanInspectionsMetadata(self.year, self.month, self.day))

    host, database, user, password = get_database_connection_parameters()
    table = "testing.feature_engineering"
    schema = "testing"

    columns = [('test', 'varchar'), ('ran_at', 'timestamp'),
               ('params', 'varchar'), ('status', 'varchar'),
               ('note', 'varchar')]

    def run(self):
        test_feature_engineering = TestFeatureEngineering()
        self.test_result = test_feature_engineering.test_columns_one_hot_encoding(
            self.year, self.month, self.day)
        if self.test_result['status'] == 'failed':
            print_test_failed(self.test_result['test'],
                              self.test_result['note'])
            sys.exit()
        else:
            print_test_passed(self.test_result['test'])

        super().run()

    def rows(self):
        params = "year={} month={} day={}".format(self.year, self.month,
                                                  self.day)
        yield (self.test_result['test'], self.test_result['ran_at'], params,
               self.test_result['status'], self.test_result['note'])