Beispiel #1
0
    def __init__(self, schema, start_date, end_date):
        """Bind this object to a feature schema and a date range.

        Opens a raw database connection, switches it to ``schema`` and
        stores ``start_date`` / ``end_date`` on the instance.

        Raises SchemaMissing when ``schema`` is not among the names
        returned by existing_feature_schemas().
        """
        known_schemas = existing_feature_schemas()
        if schema not in known_schemas:
            raise SchemaMissing(schema)

        # Every query goes through a raw connection whose active schema
        # is switched below, so features are read from the right schema.
        db_engine = create_engine(uri)
        self.con = db_engine.raw_connection()
        set_schema_sql = "SET SCHEMA '{}'".format(schema)
        schema_cursor = self.con.cursor()
        schema_cursor.execute(set_schema_sql)

        self.start_date, self.end_date = start_date, end_date
Beispiel #2
0
    def __init__(self, schema, start_date, end_date):
        """Bind this object to a feature schema and a date range.

        Opens a raw database connection, switches it to ``schema`` and
        stores ``start_date`` / ``end_date`` on the instance.

        Raises SchemaMissing when ``schema`` is not among the names
        returned by existing_feature_schemas().
        """
        known_schemas = existing_feature_schemas()
        if schema not in known_schemas:
            raise SchemaMissing(schema)

        # Every query goes through a raw connection whose active schema
        # is switched below, so features are read from the right schema.
        db_engine = create_engine(uri)
        self.con = db_engine.raw_connection()
        set_schema_sql = "SET SCHEMA '{}'".format(schema)
        schema_cursor = self.con.cursor()
        schema_cursor.execute(set_schema_sql)

        self.start_date, self.end_date = start_date, end_date
Beispiel #3
0
def generate_features(features_to_generate, n_months, max_dist,
                     inspection_date=None, insp_set='all_inspections'):
    """
    Generate labels and features for all inspections
    in the inspections database.

    If inspection_date is passed, features will be generated as if
    an inspection will occur on that day.

    Arguments:
    features_to_generate -- iterable of objects exposing a ``table`` name
        and a ``generator_function`` callable. The callable is first tried
        as ``generator_function(con, n_months, max_dist)`` and, if that
        raises, as ``generator_function(con)``.
    n_months, max_dist -- forwarded to the generator functions and used
        in the name of the table the result is stored in.
    inspection_date -- None, or an object with ``strftime`` whose
        formatted value becomes the suffix of the schema name.
    insp_set -- 'all_inspections' or 'field_test'.

    Raises ValueError if insp_set is not one of the two supported values.
    """
    # Select the schema depending on the inspection set and, when given,
    # the inspection date.
    if insp_set == 'all_inspections':
        schema = "features"
    elif insp_set == 'field_test':
        schema = "features_field_test"
    else:
        # Without this guard ``schema`` stays unbound and the function
        # fails much later with an unrelated-looking error.
        raise ValueError(
            "Unknown insp_set {!r}: expected 'all_inspections' "
            "or 'field_test'".format(insp_set))
    if inspection_date is not None:
        schema = "{}_{}".format(
            schema, inspection_date.strftime('%d%b%Y')).lower()

    # use this engine for all data storing (somehow does
    # not work with the raw connection we create below)
    engine = create_engine(uri)

    # all querying is done using a raw connection. in this
    # connection set to use the relevant schema
    # this makes sure that we grab the "parcels_inspections"
    # table from the correct schema in all feature creators
    con = engine.raw_connection()

    if schema not in existing_feature_schemas():
        # Create schema here
        cur = con.cursor()
        cur.execute("CREATE  SCHEMA %s;" % schema)
        con.commit()
        cur.close()
        logging.info('Creating schema %s' % schema)
    else:
        logging.info('Using existing schema')

    # Note on SQL injection: schema is built above from fixed prefixes
    # plus the output of inspection_date.strftime, so it is not free-form
    # caller text.
    cur = con.cursor()
    cur.execute("SET SCHEMA '{}'".format(schema))

    # Print the current schema by reading it from the db
    cur.execute('SELECT current_schema;')
    current_schema = cur.fetchone()[0]
    logger.info(('Starting feature generation in {}. '
                 'n_months={}. max_dist={}').format(current_schema, n_months,
                                                    max_dist))
    # Get existing tables
    existing_tables = tables_in_schema(con, schema)

    # make a new table that contains one row for every parcel in Cincinnati
    # this table has three columns: parcel_id, inspection_date, viol_outcome
    # inspection_date is the one given as a parameter and
    # is the same for all parcels
    if 'parcels_inspections' not in existing_tables:
        logger.info('Creating parcels_inspections table...')

        if inspection_date is None:
            inspections = outcome.generate_labels()
        elif insp_set == 'all_inspections':
            inspections = outcome.make_fake_inspections_all_parcels_cincy(
                inspection_date)
        else:
            # insp_set was validated above, so this is 'field_test'
            inspections = outcome.load_inspections_from_field_test(
                inspection_date)

        inspections.to_sql("parcels_inspections", engine, chunksize=50000,
                           if_exists='fail', index=False, schema=schema)
        logging.debug("... table has {} rows".format(len(inspections)))
        # Create indexes to make joins with events_Xmonths_* tables faster.
        # They must target the schema the table was just written to, not a
        # hard-coded "features" schema.
        for column in ('parcel_id', 'inspection_date'):
            cur.execute('CREATE INDEX ON {}.parcels_inspections ({});'.format(
                schema, column))
        con.commit()
    else:
        logger.info('parcels_inspections table already exists, skipping...')

    for feature in features_to_generate:
        logging.info("Generating {} features".format(feature.table))
        # Try generating features with the n_months argument
        try:
            logging.info(("Generating {} "
                          "features for {} months "
                          "and within {} m").format(feature.table, n_months,
                                                    max_dist))
            feature_data = feature.generator_function(con, n_months, max_dist)
            table_to_save = '{}_{}m_{}months'.format(feature.table, max_dist,
                                                     n_months)
        # If it fails, feature is not spatiotemporal, send only connection.
        # NOTE(review): the broad except also retries after genuine failures
        # inside the generator, not only after a signature mismatch.
        except Exception as e:
            table_to_save = feature.table
            logging.info(
                "Failed to call function with months and dist: {}".format(
                    str(e)))
            feature_data = feature.generator_function(con)
        # Every generator function must have a column with parcel_id,
        # inspection_date and the correct number of rows as their
        # corresponding parcels_inspections table in the schema being used
        # TO DO: check that feature_data has the right shape and indexes
        if table_to_save in existing_tables:
            # Report the table name that was actually looked up.
            logger.info('Features table {} already exists. Skipping...'.format(
                table_to_save))
        else:
            feature_data.to_sql(
                table_to_save, engine, chunksize=50000,
                if_exists='replace', index=True, schema=schema,
                # Force saving inspection_date as timestamp without timezone
                dtype={'inspection_date': types.TIMESTAMP(timezone=False)})
            logging.debug("{} table has {} rows".format(table_to_save,
                                                        len(feature_data)))
Beispiel #4
0
def generate_features(features_to_generate,
                      n_months,
                      max_dist,
                      inspection_date=None,
                      insp_set='all_inspections'):
    """
    Generate labels and features for all inspections
    in the inspections database.

    If inspection_date is passed, features will be generated as if
    an inspection will occur on that day.

    Arguments:
    features_to_generate -- iterable of objects exposing a ``table`` name
        and a ``generator_function`` callable. The callable is first tried
        as ``generator_function(con, n_months, max_dist)`` and, if that
        raises, as ``generator_function(con)``.
    n_months, max_dist -- forwarded to the generator functions and used
        in the name of the table the result is stored in.
    inspection_date -- None, or an object with ``strftime`` whose
        formatted value becomes the suffix of the schema name.
    insp_set -- 'all_inspections' or 'field_test'.

    Existing feature tables are replaced; the parcels_inspections table is
    only created when it is missing.

    Raises ValueError if insp_set is not one of the two supported values.
    """
    # Select the schema depending on the inspection set and, when given,
    # the inspection date.
    if insp_set == 'all_inspections':
        schema = "features"
    elif insp_set == 'field_test':
        schema = "features_field_test"
    else:
        # Without this guard ``schema`` stays unbound and the function
        # fails much later with an unrelated-looking error.
        raise ValueError(
            "Unknown insp_set {!r}: expected 'all_inspections' "
            "or 'field_test'".format(insp_set))
    if inspection_date is not None:
        schema = "{}_{}".format(
            schema, inspection_date.strftime('%d%b%Y')).lower()

    # use this engine for all data storing (somehow does
    # not work with the raw connection we create below)
    engine = create_engine(uri)

    # all querying is done using a raw connection. in this
    # connection set to use the relevant schema
    # this makes sure that we grab the "parcels_inspections"
    # table from the correct schema in all feature creators
    con = engine.raw_connection()

    if schema not in existing_feature_schemas():
        # Create schema here
        cur = con.cursor()
        cur.execute("CREATE  SCHEMA %s;" % schema)
        con.commit()
        cur.close()
        logging.info('Creating schema %s' % schema)
    else:
        logging.info('Using existing schema')

    # Note on SQL injection: schema is built above from fixed prefixes
    # plus the output of inspection_date.strftime, so it is not free-form
    # caller text.
    cur = con.cursor()
    cur.execute("SET SCHEMA '{}'".format(schema))

    # Print the current schema by reading it from the db
    cur.execute('SELECT current_schema;')
    current_schema = cur.fetchone()[0]
    logger.info(('Starting feature generation in {}. '
                 'n_months={}. max_dist={}').format(current_schema, n_months,
                                                    max_dist))
    # Get existing tables
    existing_tables = tables_in_schema(schema)

    # set the search path, otherwise won't find ST_DWithin()
    cur.execute("SET search_path TO {schema}, public;".format(schema=schema))
    con.commit()

    # make a new table that contains one row for every parcel in Cincinnati
    # this table has three columns: parcel_id, inspection_date, viol_outcome
    # inspection_date is the one given as a parameter and
    # is the same for all parcels
    if 'parcels_inspections' not in existing_tables:
        logger.info('Creating parcels_inspections table...')

        if inspection_date is None:
            inspections = outcome.generate_labels()
        elif insp_set == 'all_inspections':
            inspections = outcome.make_fake_inspections_all_parcels_cincy(
                inspection_date)
        else:
            # insp_set was validated above, so this is 'field_test'
            inspections = outcome.load_inspections_from_field_test(
                inspection_date)

        inspections.to_sql("parcels_inspections",
                           engine,
                           chunksize=50000,
                           if_exists='fail',
                           index=False,
                           schema=schema)
        logging.debug("... table has {} rows".format(len(inspections)))
        # Create indexes to make joins with events_Xmonths_* tables faster.
        # The unqualified table name relies on the search path set above.
        cur.execute('CREATE INDEX ON parcels_inspections (parcel_id);')
        cur.execute('CREATE INDEX ON parcels_inspections (inspection_date);')
        cur.execute(
            'CREATE INDEX ON parcels_inspections (parcel_id, inspection_date);'
        )
        con.commit()
    else:
        logger.info('parcels_inspections table already exists, skipping...')

    for feature in features_to_generate:
        logging.info("Generating {} features".format(feature.table))
        # Try generating features with the n_months argument
        try:
            logging.info(("Generating {} "
                          "features for {} months "
                          "and within {} m").format(feature.table, n_months,
                                                    max_dist))
            feature_data = feature.generator_function(con, n_months, max_dist)
            table_to_save = '{}_{}m_{}months'.format(feature.table, max_dist,
                                                     n_months)
        # If it fails, feature is not spatiotemporal, send only connection.
        # NOTE(review): the broad except also retries after genuine failures
        # inside the generator, not only after a signature mismatch.
        except Exception as e:
            table_to_save = feature.table
            logging.info(
                "Failed to call function with months and dist: {}".format(
                    str(e)))
            feature_data = feature.generator_function(con)
        # Every generator function must have a column with parcel_id,
        # inspection_date and the correct number of rows as their
        # corresponding parcels_inspections table in the schema being used
        # TO DO: check that feature_data has the right shape and indexes
        if table_to_save in existing_tables:
            # Report the table name that was actually looked up.
            logger.info(
                'Features table {} already exists. Replacing...'.format(
                    table_to_save))

        feature_data.to_sql(
            table_to_save,
            engine,
            chunksize=50000,
            if_exists='replace',
            index=True,
            schema=schema,
            # Force saving inspection_date as timestamp without timezone
            dtype={'inspection_date': types.TIMESTAMP(timezone=False)})
        logging.debug("{} table has {} rows".format(table_to_save,
                                                    len(feature_data)))