Example #1
def make_nmonths_table_from_template(con, dataset, date_column,
                                    min_insp_date, max_insp_date,
                                    n_months, max_dist,
                                    template, load=False, columns='all'):
    '''
        Create (if needed) a table matching inspections with events that
        happened up to n_months before, within max_dist meters. When
        load=True, returns a pandas DataFrame with the loaded data.
    '''
    #Create a cursor
    cur = con.cursor()

    #Get the current schema
    cur.execute('SELECT current_schema;')
    current_schema = cur.fetchone()[0]

    #Build the table name
    table_name = ('insp2{dataset}_{n_months}months'
                  '_{max_dist}m').format(dataset=dataset,
                                         n_months=n_months,
                                         max_dist=max_dist)
    #Check if table already exists in current schema
    #If not, create it
    if table_name not in tables_in_schema(current_schema):
        logger.info('Table {} does not exist... Creating it'.format(table_name))
        path_to_template = os.path.join(os.environ['ROOT_FOLDER'],
                                        'model',
                                        'features',
                                        template)
        #Load template with SQL statement
        with open(path_to_template, 'r') as f:
            sql_script = Template(f.read())
        #Replace values in template
        sql_script = sql_script.substitute(TABLE_NAME=table_name,
                                           DATASET=dataset,
                                           DATE_COLUMN=date_column,
                                           N_MONTHS=n_months,
                                           MAX_DIST=max_dist,
                                           MIN_INSP_DATE=min_insp_date,
                                           MAX_INSP_DATE=max_insp_date)
        #Run the code using the connection
        #this is going to take a while
        cur.execute(sql_script)
        #Commit changes to db
        con.commit()

        #If the created table has a geom column whose type is
        #USER-DEFINED, delete it; we don't need it here
        cols = columns_for_table_in_schema(table_name, current_schema)
        if ('geom', 'USER-DEFINED') in cols:
            #Important: this is not production ready since it's
            #vulnerable to SQL injection; I haven't found a way
            #to dynamically pass table names as parameters in psycopg2,
            #so the only option seems to be preventing SQL injection
            #in the calling code (see the note after this example)
            q = ('ALTER TABLE {} DROP COLUMN geom').format(table_name)
            cur.execute(q)
            con.commit()
            logger.info('Table {} has a PostGIS column, deleting...'.format(table_name))
    else:
        logger.info('Table {} already exists. Skipping...'.format(table_name))

    cur.close()
    #Load data
    e = create_engine(uri)
    logger.info('Loading {} month table...'.format(table_name))
    if columns == 'all':
        #If the table still has a geometry column, subselect the columns
        #to load, otherwise pandas will complain
        cols = columns_for_table_in_schema(table_name, current_schema)
        valid_cols = filter(lambda x: x[1] != 'USER-DEFINED', cols)
        cols_to_load = [x[0] for x in valid_cols]
    #If the user passed an array in the columns parameter, only
    #select those columns
    else:
        cols_to_load = columns

    if load:
        df = pd.read_sql_table(table_name, e,
                            schema=current_schema,
                            columns=cols_to_load)
        return df
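
A note on the SQL-injection comment in this example: psycopg2 does provide a supported way to splice identifiers since version 2.7, the psycopg2.sql module. A minimal sketch of how the ALTER TABLE above could be composed safely, reusing the same cur, con and table_name (this is an alternative, not what the original code does):

from psycopg2 import sql

#Build the statement with a properly quoted identifier instead of
#interpolating table_name into the query string ourselves
cur.execute(
    sql.SQL('ALTER TABLE {} DROP COLUMN geom').format(
        sql.Identifier(table_name)))
con.commit()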
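The helpers tables_in_schema and columns_for_table_in_schema are used throughout these examples but never shown. A plausible reconstruction, assuming they query PostgreSQL's information_schema through a module-level connection con (names and return shapes are inferred from the call sites, so the real helpers may differ):

def tables_in_schema(schema):
    #Hypothetical sketch: list table names in a schema
    cur = con.cursor()
    cur.execute('SELECT table_name FROM information_schema.tables '
                'WHERE table_schema = %s', (schema,))
    return [row[0] for row in cur.fetchall()]

def columns_for_table_in_schema(table_name, schema):
    #Hypothetical sketch: return (column_name, data_type) pairs;
    #PostGIS geometry columns report data_type 'USER-DEFINED',
    #which is what the examples filter on
    cur = con.cursor()
    cur.execute('SELECT column_name, data_type '
                'FROM information_schema.columns '
                'WHERE table_name = %s AND table_schema = %s',
                (table_name, schema))
    return cur.fetchall()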
Example #4
def generate_features(features_to_generate, n_months, max_dist,
                     inspection_date=None, insp_set='all_inspections'):
    """
    Generate labels and features for all inspections
    in the inspections database.

    If inspection_date is passed, features will be generated as if
    an inspection will occur on that day
    """
    #select schema
    #depending on the value of inspection date
    
    if insp_set == 'all_inspections':
        if inspection_date is None:
            schema = "features"
        else:
            schema = "features_{}".format(inspection_date.strftime('%d%b%Y')).lower()
    elif insp_set == 'field_test':
        if inspection_date is None:
            schema = "features_field_test"
        else:
            schema = "features_field_test_{}".format(inspection_date.strftime('%d%b%Y')).lower()

    # use this engine for all data storing (somehow does
    # not work with the raw connection we create below)
    engine = create_engine(uri)

    # all querying is done using a raw connection; on this
    # connection we set the relevant schema, which makes sure
    # that we grab the "parcels_inspections" table from the
    # correct schema in all feature creators
    con = engine.raw_connection()

    if schema not in existing_feature_schemas():
        #Create schema here
        cur = con.cursor()
        cur.execute("CREATE  SCHEMA %s;" % schema)
        con.commit()
        cur.close()
        logging.info('Creating schema %s' % schema)
    else:
        logging.info('Using existing schema')

    #Note on SQL injection: schema is either features or features_DATE
    #date is generated using datetime.datetime.strptime, so if somebody
    #tries to inject SQL there, it will fail
    con.cursor().execute("SET SCHEMA '{}'".format(schema))

    #Print the current schema by reading it from the db
    cur = con.cursor()
    cur.execute('SELECT current_schema;')
    current_schema = cur.fetchone()[0]
    logger.info(('Starting feature generation in {}. '
                 'n_months={}. max_dist={}').format(current_schema, n_months, max_dist))
    #Get existing tables
    existing_tables = tables_in_schema(schema)
    
    # make a new table that contains one row for every parcel in Cincinnati
    # this table has three columns: parcel_id, inspection_date, viol_outcome
    # inspection_date is the one given as a parameter and
    # is the same for all parcels
    if 'parcels_inspections' not in existing_tables:
        logger.info('Creating parcels_inspections table...')

        if inspection_date is None:
            inspections = outcome.generate_labels()
        else:
            if insp_set == 'all_inspections':
                inspections = outcome.make_fake_inspections_all_parcels_cincy(inspection_date)
            elif insp_set == 'field_test':
                inspections = outcome.load_inspections_from_field_test(inspection_date)

        inspections.to_sql("parcels_inspections", engine, chunksize=50000,
                           if_exists='fail', index=False, schema=schema)
        logging.debug("... table has {} rows".format(len(inspections)))
        #Create an index to make joins with events_Xmonths_* tables faster
        cur.execute('CREATE INDEX ON parcels_inspections (parcel_id);')
        cur.execute('CREATE INDEX ON parcels_inspections (inspection_date);')
        con.commit()
    else:
        logger.info('parcels_inspections table already exists, skipping...')

    for feature in features_to_generate:
        logging.info("Generating {} features".format(feature.table))
        #Try generating features with the n_months argument
        try:
            logging.info(("Generating {} "
                          "features for {} months "
                          "and within {} m").format(feature.table, n_months, max_dist))
            feature_data = feature.generator_function(con, n_months, max_dist)
            table_to_save = '{}_{}m_{}months'.format(feature.table, max_dist, n_months)
        #If it fails, feature is not spatiotemporal, send only connection
        except Exception as e:
            table_to_save = feature.table
            logging.info("Failed to call function with months and dist: {}".format(str(e)))
            feature_data = feature.generator_function(con)
        #Every generator function must have a column with parcel_id,
        #inspection_date and the correct number of rows as their
        #corresponding parcels_inspections table in the schema being used
        # TO DO: check that feature_data has the right shape and indexes
        if table_to_save in existing_tables:
            logger.info('Features table {} already exists. Replacing...'.format(feature.table))

        feature_data.to_sql(table_to_save, engine, chunksize=50000,
                            if_exists='replace', index=True, schema=schema,
                            #Force saving inspection_date as timestamp without timezone
                            dtype={'inspection_date': types.TIMESTAMP(timezone=False)})
        logging.debug("{} table has {} rows".format(table_to_save, len(feature_data)))
Example #5
def generate_features(features_to_generate,
                      n_months,
                      max_dist,
                      inspection_date=None,
                      insp_set='all_inspections'):
    """
    Generate labels and features for all inspections
    in the inspections database.

    If inspection_date is passed, features will be generated as if
    an inspection will occur on that day
    """
    #select schema
    #depending on the value of inspection date

    if insp_set == 'all_inspections':
        if inspection_date is None:
            schema = "features"
        else:
            schema = "features_{}".format(
                inspection_date.strftime('%d%b%Y')).lower()
    elif insp_set == 'field_test':
        if inspection_date is None:
            schema = "features_field_test"
        else:
            schema = "features_field_test_{}".format(
                inspection_date.strftime('%d%b%Y')).lower()

    # use this engine for all data storing (somehow does
    # not work with the raw connection we create below)
    engine = create_engine(uri)

    # all querying is done using a raw connection; on this
    # connection we set the relevant schema, which makes sure
    # that we grab the "parcels_inspections" table from the
    # correct schema in all feature creators
    con = engine.raw_connection()
    # con = engine.connect()

    if schema not in existing_feature_schemas():
        #Create schema here
        cur = con.cursor()
        cur.execute("CREATE  SCHEMA %s;" % schema)
        con.commit()
        cur.close()
        logging.info('Creating schema %s' % schema)
    else:
        logging.info('Using existing schema')

    #Note on SQL injection: schema is either features or features_DATE
    #date is generated using datetime.datetime.strptime, so if somebody
    #tries to inject SQL there, it will fail
    con.cursor().execute("SET SCHEMA '{}'".format(schema))

    #Print the current schema by reading it from the db
    cur = con.cursor()
    cur.execute('SELECT current_schema;')
    current_schema = cur.fetchone()[0]
    logger.info(('Starting feature generation in {}. '
                 'n_months={}. max_dist={}').format(current_schema, n_months,
                                                    max_dist))
    #Get existing tables
    existing_tables = tables_in_schema(schema)

    # set the search path, otherwise won't find ST_DWithin()
    cur = con.cursor()
    cur.execute("SET search_path TO {schema}, public;".format(schema=schema))
    con.commit()

    # make a new table that contains one row for every parcel in Cincinnati
    # this table has three columns: parcel_id, inspection_date, viol_outcome
    # inspection_date is the one given as a parameter and
    # is the same for all parcels
    if 'parcels_inspections' not in existing_tables:
        logger.info('Creating parcels_inspections table...')

        if inspection_date is None:
            inspections = outcome.generate_labels()
        else:
            if insp_set == 'all_inspections':
                inspections = outcome.make_fake_inspections_all_parcels_cincy(
                    inspection_date)
            elif insp_set == 'field_test':
                inspections = outcome.load_inspections_from_field_test(
                    inspection_date)

        inspections.to_sql("parcels_inspections",
                           engine,
                           chunksize=50000,
                           if_exists='fail',
                           index=False,
                           schema=schema)
        logging.debug("... table has {} rows".format(len(inspections)))
        #Create an index to make joins with events_Xmonths_* tables faster
        cur.execute('CREATE INDEX ON parcels_inspections (parcel_id);')
        cur.execute('CREATE INDEX ON parcels_inspections (inspection_date);')
        cur.execute(
            'CREATE INDEX ON parcels_inspections (parcel_id, inspection_date);'
        )
        con.commit()
    else:
        logger.info('parcels_inspections table already exists, skipping...')

    for feature in features_to_generate:
        logging.info("Generating {} features".format(feature.table))
        #Try generating features with the n_months argument
        try:
            logging.info(("Generating {} "
                          "features for {} months "
                          "and within {} m").format(feature.table, n_months,
                                                    max_dist))
            feature_data = feature.generator_function(con, n_months, max_dist)
            table_to_save = '{}_{}m_{}months'.format(feature.table, max_dist,
                                                     n_months)
        #If it fails, feature is not spatiotemporal, send only connection
        except Exception as e:
            table_to_save = feature.table
            logging.info(
                "Failed to call function with months and dist: {}".format(
                    str(e)))
            feature_data = feature.generator_function(con)
        #Every generator function must have a column with parcel_id,
        #inspection_date and the correct number of rows as their
        #corresponding parcels_inspections table in the schema being used
        # TO DO: check that feature_data has the right shape and indexes
        if table_to_save in existing_tables:
            logger.info(
                'Features table {} already exists. Replacing...'.format(
                    feature.table))

        feature_data.to_sql(
            table_to_save,
            engine,
            chunksize=50000,
            if_exists='replace',
            index=True,
            schema=schema,
            #Force saving inspection_date as timestamp without timezone
            dtype={'inspection_date': types.TIMESTAMP(timezone=False)})
        logging.debug("{} table has {} rows".format(table_to_save,
                                                    len(feature_data)))
def make_inspections_features(con, n_months, max_dist):
    """
    Make inspections features

    Input:
    con: connection to a postgres database.
         "set schema ..." must have been called on this connection
         to select the correct schema from which to load inspections

    Output:
    A pandas dataframe, with one row per inspection and one column per feature.
    """
    dataset = 'inspections_views.events_parcel_id'
    date_column = 'date'

    ## ------------------------------------------------------------------------
    ## Make the parcel_id-to-nearby-houses table, if it's not there yet.
    ## ------------------------------------------------------------------------

    query = """
        CREATE TABLE insp2houses_{max_dist}m AS
            SELECT  
                feature_y.parcel_id,
                count(*) as parcels
            FROM (
                SELECT t.parcel_id,
                       p.geom
                FROM (SELECT DISTINCT parcel_id FROM parcels_inspections) t
                LEFT JOIN shape_files.parcels_cincy p
                ON t.parcel_id=p.parcelid
            ) feature_y
            LEFT JOIN shape_files.parcels_cincy parcels
            ON ST_DWithin(feature_y.geom, parcels.geom, {max_dist}*3.281::double precision)
            AND feature_y.parcel_id <> parcels.parcelid
            GROUP BY feature_y.parcel_id
        ;
        CREATE INDEX ON insp2houses_{max_dist}m (parcel_id);
        """.format(max_dist=max_dist)

    #Create a cursor
    cur = con.cursor()

    #Get the current schema
    cur.execute('SELECT current_schema;')
    current_schema = cur.fetchone()[0]

    #Build the table name
    table_name = 'insp2houses_{max_dist}m'.format(max_dist=max_dist)
    # check if table already exists in current schema;
    # if not, create it
    if table_name not in tables_in_schema(current_schema):
        logging.info("Table %s does not exist yet, generating." % table_name)
        cur.execute(query)
    else:
        logging.info("Table %s already exists, skipping." % table_name)

    con.commit()

    ## ------------------------------------------------------------------------
    ## Make the table of nearby events, and the features.
    ## ------------------------------------------------------------------------

    #Get the time window for which you can generate features
    min_insp, max_insp = check_date_boundaries(con, n_months, dataset,
                                               date_column)

    query = """
        DROP TABLE IF EXISTS inspfeatures1_{n_months}months_{max_dist}m;
        CREATE TEMP TABLE inspfeatures1_{n_months}months_{max_dist}m ON COMMIT DROP AS
            SELECT t2.parcel_id, t2.inspection_date,
                   t2.event,
                   coalesce(t1.count, 0) as count,
                   (coalesce(t1.count, 0)+1.0) / (coalesce(t2.parcels,0)+5.0) as regularized_count_per_houses 
            FROM (
                SELECT  
                    feature_y.parcel_id,
                    feature_y.inspection_date,
                    coalesce(realinspections.event,'missing') as event,
                    count(*) as count
                FROM (
                    SELECT t.*, p.geom, ih.parcels
                    FROM parcels_inspections t
                    LEFT JOIN shape_files.parcels_cincy p
                    ON t.parcel_id=p.parcelid
                    LEFT JOIN insp2houses_{max_dist}m ih
                    USING (parcel_id)
                ) feature_y
                JOIN (
                    SELECT insp.*, p.geom
                    FROM inspections_views.events_parcel_id insp
                    JOIN shape_files.parcels_cincy p
                    ON insp.parcel_no=p.parcelid
                ) realinspections
                ON realinspections.date < feature_y.inspection_date
                AND (feature_y.inspection_date - '{n_months} month'::interval) <= realinspections.date
                AND ST_DWithin(feature_y.geom, realinspections.geom, {max_dist}*3.281::double precision)
                WHERE feature_y.inspection_date BETWEEN '{min_date}' AND '{max_date}'
                GROUP BY feature_y.parcel_id, feature_y.inspection_date, realinspections.event
            ) t1
            RIGHT JOIN
            (SELECT parcel_id, inspection_date, ft.event, parcels
                FROM parcels_inspections
                JOIN 
                    (select distinct coalesce(event,'missing') as event from inspections_views.events_parcel_id) ft
                ON true
                JOIN insp2houses_{max_dist}m
                USING (parcel_id)
            ) t2
            USING (parcel_id, inspection_date, event)
        ;

        CREATE TEMP TABLE inspfeatures2_{n_months}months_{max_dist}m ON COMMIT DROP AS (
        SELECT parcel_id, inspection_date, event, count
        FROM inspfeatures1_{n_months}months_{max_dist}m
        UNION ALL (
            SELECT parcel_id, inspection_date, 
                   event||'_per_houses' as event,
                   regularized_count_per_houses AS count
            FROM inspfeatures1_{n_months}months_{max_dist}m
            )
        ) ;
        CREATE INDEX ON inspfeatures2_{n_months}months_{max_dist}m (parcel_id, inspection_date);
        
        -- Now call the pivot function to create columns with the 
        -- different inspection events
        SELECT colpivot('insppivot_{n_months}months_{max_dist}m',
                        'select * from inspfeatures2_{n_months}months_{max_dist}m',
                        array['parcel_id','inspection_date'],
                        array['event'],
                        '#.count',
                        null
        ); -- Note: Not coalescing the counts, as the _per_houses shouldn't be
           --       set to 0. We'll have to leave it to later imputation.
        CREATE INDEX ON insppivot_{n_months}months_{max_dist}m (parcel_id, inspection_date);

        -- still need to 'save' the tables into a permanent table
        DROP TABLE IF EXISTS inspfeatures_{n_months}months_{max_dist}m;
        CREATE TABLE inspfeatures_{n_months}months_{max_dist}m AS
            SELECT * FROM insppivot_{n_months}months_{max_dist}m ip1
        ;
        """.format(n_months=str(n_months),
                   max_dist=max_dist,
                   min_date=str(min_insp),
                   max_date=str(max_insp))

    cur.execute(query)
    con.commit()

    # fetch the data
    query = """
        SELECT * FROM inspfeatures_{n_months}months_{max_dist}m;
    """.format(n_months=str(n_months), max_dist=max_dist)

    df = pd.read_sql(query, con, index_col=['parcel_id', 'inspection_date'])

    # clean up the column names
    df.columns = [x.replace(' ', '_').lower() for x in df.columns]
    df.columns = [''.join(c for c in x if c.isalnum() or c == '_')
                  for x in df.columns]

    # drop the last interim table
    query = 'drop table inspfeatures_{n_months}months_{max_dist}m'.format(
        n_months=str(n_months), max_dist=str(max_dist))
    cur.execute(query)
    con.commit()

    return df
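
The colpivot(...) call in the script above is not built-in SQL; it matches the signature of the open-source colpivot PL/pgSQL helper, which pivots long-format rows into one column per distinct value of a category column. For intuition, the same reshaping of the (parcel_id, inspection_date, event, count) rows in pandas would look roughly like this sketch (illustrative data, not a drop-in replacement for the SQL):

import pandas as pd

#Long format: one row per (parcel_id, inspection_date, event),
#mirroring the inspfeatures2_* table above
long_df = pd.DataFrame({
    'parcel_id': ['a', 'a', 'b'],
    'inspection_date': ['2015-01-01'] * 3,
    'event': ['violation', 'violation_per_houses', 'violation'],
    'count': [2.0, 0.3, 1.0],
})

#colpivot-style reshape: one column per distinct event value;
#missing combinations become NaN (the SQL likewise leaves them
#null for later imputation)
wide = long_df.pivot_table(index=['parcel_id', 'inspection_date'],
                           columns='event', values='count')
print(wide)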
Example #7
    try:
        url = params['source']['url']
    except Exception as e:
        logger.info('URL was not present in the configuration file...')
    else:
        logger.info('Downloading file...')
        data_file = urllib2.urlopen(url)
        #Download the file, replacing it if it already exists
        with open(params['source']['filename'],'wb') as output:
            output.write(data_file.read())

    #Step two: check most recent entry in the database
    engine = create_engine(uri)
    table_name = params['storage']['table']
    
    #Check if table exists
    if table_name not in tables_in_schema(schema):
        db_most_recent = None
        logger.info('Table does not exist, diff file will be a copy of source file')
    else:
        query = 'SELECT MAX({}) FROM {}'.format(db_column, table_name) #TMP FIX
        db_most_recent = engine.execute(query).fetchone()[0]
        logger.info('Most recent record in database is: {}'.format(db_most_recent))
    
    #Step three: load and subset the file to include new entries
    #TO DO: avoid loading if most_recent_row is None
    logger.info('Loading {}'.format(params['source']['filename']))
    df = pd.read_csv(params['source']['filename'])
    df[file_column] = pd.to_datetime(df[file_column])

    #Subset only if db_most_recent has a value    
    new_entries = df[df[file_column] > db_most_recent] if db_most_recent else df
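
The excerpt ends right after computing new_entries. A plausible continuation, assuming the goal is to append only the new rows to the storage table via the SQLAlchemy engine created above (hypothetical: the original code past this point is not shown):

    #Hypothetical next step: append the diff to the table checked in step two
    if len(new_entries) > 0:
        new_entries.to_sql(table_name, engine, schema=schema,
                           if_exists='append', index=False)
        logger.info('Appended {} new rows to {}'.format(len(new_entries),
                                                        table_name))
    else:
        logger.info('No new entries found, nothing to append')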