def make_nmonths_table_from_template(con, dataset, date_column,
                                     min_insp_date, max_insp_date,
                                     n_months, max_dist, template,
                                     load=False, columns='all'):
    '''
    Create (if missing) the table that matches inspections with events
    that happened up to n_months before and, optionally, load it.

    The table is named 'insp2<dataset>_<n_months>months_<max_dist>m' and
    lives in the current schema of `con`. When it does not exist yet, the
    SQL template found at $ROOT_FOLDER/model/features/<template> is filled
    in and executed on `con`.

    Parameters
    ----------
    con : DB-API connection used to create the table
    dataset, date_column, min_insp_date, max_insp_date, n_months, max_dist :
        values substituted into the SQL template
        (DATASET, DATE_COLUMN, MIN_INSP_DATE, MAX_INSP_DATE, N_MONTHS,
        MAX_DIST)
    template : file name of the SQL template
    load : when True, read the table and return it
    columns : 'all' to load every column whose type is not USER-DEFINED,
        or a list with the column names to load

    Returns
    -------
    pandas DataFrame with the table contents if load is True,
    None otherwise.
    '''
    cur = con.cursor()
    # try/finally so the cursor is released even if the (slow) SQL fails
    try:
        #Get the current schema
        cur.execute('SELECT current_schema;')
        current_schema = cur.fetchone()[0]

        #Build the table name
        table_name = ('insp2{dataset}_{n_months}months'
                      '_{max_dist}m').format(dataset=dataset,
                                             n_months=n_months,
                                             max_dist=max_dist)

        #Check if table already exists in current schema
        #If not, create it
        if table_name not in tables_in_schema(current_schema):
            logger.info(
                'Table {} does not exist... Creating it'.format(table_name))
            path_to_template = os.path.join(os.environ['ROOT_FOLDER'],
                                            'model', 'features', template)
            #Load template with SQL statement
            with open(path_to_template, 'r') as f:
                sql_script = Template(f.read())
            #Replace values in template
            sql_script = sql_script.substitute(TABLE_NAME=table_name,
                                               DATASET=dataset,
                                               DATE_COLUMN=date_column,
                                               N_MONTHS=n_months,
                                               MAX_DIST=max_dist,
                                               MIN_INSP_DATE=min_insp_date,
                                               MAX_INSP_DATE=max_insp_date)
            #Run the code using the connection
            #this is going to take a while
            cur.execute(sql_script)
            #Commit changes to db
            con.commit()

            #If the table just created has a geom column of type
            #USER-DEFINED, delete it, we don't need it here
            cols = columns_for_table_in_schema(table_name, current_schema)
            if ('geom', 'USER-DEFINED') in cols:
                logger.info(('Table {} has a PostGIS column, '
                             'deleting...').format(table_name))
                #Important: this is not production ready since it's
                #vulnerable to SQL injection: the table name is
                #interpolated into the statement because identifiers
                #cannot be passed as regular query parameters.
                #Callers must make sure dataset/n_months/max_dist are
                #trusted values.
                q = ('ALTER TABLE {} DROP COLUMN geom').format(table_name)
                cur.execute(q)
                con.commit()
        else:
            logger.info(
                'Table {} already exists. Skipping...'.format(table_name))
    finally:
        cur.close()

    #Nothing else to do if the caller only wanted the table to exist
    if not load:
        return None

    #Load data
    e = create_engine(uri)
    logger.info('Loading {} month table...'.format(table_name))
    if columns == 'all':
        #Since the table may contain a geom column, you need to subselect
        #columns to load otherwise pandas will complain
        cols = columns_for_table_in_schema(table_name, current_schema)
        cols_to_load = [name for name, col_type in cols
                        if col_type != 'USER-DEFINED']
    #If the user passed an array in the columns parameter, only
    #select those columns
    else:
        cols_to_load = columns

    return pd.read_sql_table(table_name, e,
                             schema=current_schema,
                             columns=cols_to_load)
url = params['source']['url'] except Exception, e: logger.info('URL was not present in the configuration file...') else: logger.info('Downloading file...'.format(folder)) data_file = urllib2.urlopen(url) #Dowload file replacing it if already exists with open(params['source']['filename'], 'wb') as output: output.write(data_file.read()) #Step two: check most recent entry in the database engine = create_engine(uri) table_name = params['storage']['table'] #Check if table exists if table_name not in tables_in_schema(schema): db_most_recent = None logger.info( 'Table does not exist, diff file will be a copy of source file') else: query = 'SELECT MAX({}) FROM {}'.format(db_column, table_name) #TMP FIX db_most_recent = engine.execute(query).fetchone()[0] logger.info( 'Most recent record in database is: {}'.format(db_most_recent)) #Step three: load and subset the file to include new entries #TO DO: avoid loading if most_recent_row is None logger.info('Loading {}'.format(params['source']['filename'])) df = pd.read_csv(params['source']['filename'], dtype=object) df[file_column] = pd.to_datetime(df[file_column])
def generate_features(features_to_generate, n_months, max_dist,
                      inspection_date=None, insp_set='all_inspections'):
    """
    Generate labels and features for all inspections
    in the inspections database.

    If inspection_date is passed, features will be generated as if
    an inspection will occur on that day.

    Parameters
    ----------
    features_to_generate : iterable of objects exposing `table` (name of
        the table to store) and `generator_function` (callable returning
        the feature data)
    n_months, max_dist : passed to every generator function that accepts
        them and used to build the name of the stored table
    inspection_date : datetime-like or None; selects the target schema
    insp_set : 'all_inspections' or 'field_test'

    Raises
    ------
    ValueError if insp_set is not one of the supported values.
    """
    #select schema depending on the inspections set
    #and the value of inspection date
    if insp_set == 'all_inspections':
        schema = "features"
    elif insp_set == 'field_test':
        schema = "features_field_test"
    else:
        #Fail early with a clear message instead of hitting an
        #unbound `schema` variable further down
        raise ValueError(("Unknown insp_set {!r}: expected "
                          "'all_inspections' or "
                          "'field_test'").format(insp_set))
    if inspection_date is not None:
        schema = "{}_{}".format(
            schema, inspection_date.strftime('%d%b%Y')).lower()

    # use this engine for all data storing (somehow does
    # not work with the raw connection we create below)
    engine = create_engine(uri)

    # all querying is done using a raw connection. in this
    # connection set to use the relevant schema
    # this makes sure that we grab the "inspections_parcels"
    # table from the correct schema in all feature creators
    con = engine.raw_connection()

    if schema not in existing_feature_schemas():
        #Create schema here
        cur = con.cursor()
        cur.execute("CREATE SCHEMA %s;" % schema)
        con.commit()
        cur.close()
        logging.info('Creating schema %s' % schema)
    else:
        logging.info('Using existing schema')

    #Note on SQL injection: schema is built from a fixed prefix plus
    #(optionally) the output of inspection_date.strftime, so there is
    #no free-form text going into these statements
    cur = con.cursor()
    cur.execute("SET SCHEMA '{}'".format(schema))
    #Print the current schema by reading it from the db
    cur.execute('SELECT current_schema;')
    current_schema = cur.fetchone()[0]
    logger.info(('Starting feature generation in {}. '
                 'n_months={}. max_dist={}').format(current_schema,
                                                    n_months, max_dist))
    #Get existing tables
    existing_tables = tables_in_schema(schema)

    # make a new table that contains one row for every parcel in Cincinnati
    # this table has three columns: parcel_id, inspection_date, viol_outcome
    # inspection_date is the one given as a parameter and
    # is the same for all parcels
    if 'parcels_inspections' not in existing_tables:
        logger.info('Creating parcels_inspections table...')

        if inspection_date is None:
            inspections = outcome.generate_labels()
        elif insp_set == 'all_inspections':
            inspections = outcome.make_fake_inspections_all_parcels_cincy(
                inspection_date)
        else:
            # insp_set == 'field_test' (validated above)
            inspections = outcome.load_inspections_from_field_test(
                inspection_date)

        inspections.to_sql("parcels_inspections", engine, chunksize=50000,
                           if_exists='fail', index=False, schema=schema)
        logging.debug("... table has {} rows".format(len(inspections)))
        #Create an index to make joins with events_Xmonths_* tables faster.
        #The table must be qualified with the schema it was just written
        #to, which is not always "features"
        cur.execute(('CREATE INDEX ON {}.parcels_inspections '
                     '(parcel_id);').format(schema))
        cur.execute(('CREATE INDEX ON {}.parcels_inspections '
                     '(inspection_date);').format(schema))
        con.commit()
    else:
        logger.info('parcels_inspections table already exists, skipping...')

    for feature in features_to_generate:
        logging.info("Generating {} features".format(feature.table))
        #Try generating features with the n_months argument
        try:
            logging.info(("Generating {} "
                          "features for {} months "
                          "and within {} m").format(feature.table,
                                                    n_months, max_dist))
            feature_data = feature.generator_function(con, n_months,
                                                      max_dist)
            table_to_save = '{}_{}m_{}months'.format(feature.table,
                                                     max_dist, n_months)
        #If it fails, feature is not spatiotemporal, send only connection
        except Exception as e:
            table_to_save = feature.table
            logging.info(("Failed to call function with months "
                          "and dist: {}").format(str(e)))
            feature_data = feature.generator_function(con)
        #Every generator function must have a column with parcel_id,
        #inspection_date and the correct number of rows as their
        #corresponding parcels_inspections table in the schema being used
        # TO DO: check that feature_data has the right shape and indexes
        if table_to_save in existing_tables:
            logger.info(('Features table {} already exists. '
                         'Replacing...').format(feature.table))

        feature_data.to_sql(
            table_to_save, engine, chunksize=50000,
            if_exists='replace', index=True, schema=schema,
            #Force saving inspection_date as timestamp without timezone
            dtype={'inspection_date': types.TIMESTAMP(timezone=False)})
        logging.debug("{} table has {} rows".format(table_to_save,
                                                    len(feature_data)))
def generate_features(features_to_generate, n_months, max_dist,
                      inspection_date=None, insp_set='all_inspections'):
    """
    Generate labels and features for all inspections
    in the inspections database.

    If inspection_date is passed, features will be generated as if
    an inspection will occur on that day.

    Parameters
    ----------
    features_to_generate : iterable of objects exposing `table` (name of
        the table to store) and `generator_function` (callable returning
        the feature data)
    n_months, max_dist : passed to every generator function that accepts
        them and used to build the name of the stored table
    inspection_date : datetime-like or None; selects the target schema
    insp_set : 'all_inspections' or 'field_test'

    Raises
    ------
    ValueError if insp_set is not one of the supported values.
    """
    #select schema depending on the inspections set
    #and the value of inspection date
    if insp_set == 'all_inspections':
        schema = "features"
    elif insp_set == 'field_test':
        schema = "features_field_test"
    else:
        #Fail early with a clear message instead of hitting an
        #unbound `schema` variable further down
        raise ValueError(("Unknown insp_set {!r}: expected "
                          "'all_inspections' or "
                          "'field_test'").format(insp_set))
    if inspection_date is not None:
        schema = "{}_{}".format(
            schema, inspection_date.strftime('%d%b%Y')).lower()

    # use this engine for all data storing (somehow does
    # not work with the raw connection we create below)
    engine = create_engine(uri)

    # all querying is done using a raw connection. in this
    # connection set to use the relevant schema
    # this makes sure that we grab the "inspections_parcels"
    # table from the correct schema in all feature creators
    con = engine.raw_connection()

    if schema not in existing_feature_schemas():
        #Create schema here
        cur = con.cursor()
        cur.execute("CREATE SCHEMA %s;" % schema)
        con.commit()
        cur.close()
        logging.info('Creating schema %s' % schema)
    else:
        logging.info('Using existing schema')

    #Note on SQL injection: schema is built from a fixed prefix plus
    #(optionally) the output of inspection_date.strftime, so there is
    #no free-form text going into these statements
    cur = con.cursor()
    cur.execute("SET SCHEMA '{}'".format(schema))
    #Print the current schema by reading it from the db
    cur.execute('SELECT current_schema;')
    current_schema = cur.fetchone()[0]
    logger.info(('Starting feature generation in {}. '
                 'n_months={}. max_dist={}').format(current_schema,
                                                    n_months, max_dist))
    #Get existing tables
    existing_tables = tables_in_schema(schema)

    # set the search path, otherwise won't find ST_DWithin()
    cur.execute("SET search_path TO {schema}, public;".format(schema=schema))
    con.commit()

    # make a new table that contains one row for every parcel in Cincinnati
    # this table has three columns: parcel_id, inspection_date, viol_outcome
    # inspection_date is the one given as a parameter and
    # is the same for all parcels
    if 'parcels_inspections' not in existing_tables:
        logger.info('Creating parcels_inspections table...')

        if inspection_date is None:
            inspections = outcome.generate_labels()
        elif insp_set == 'all_inspections':
            inspections = outcome.make_fake_inspections_all_parcels_cincy(
                inspection_date)
        else:
            # insp_set == 'field_test' (validated above)
            inspections = outcome.load_inspections_from_field_test(
                inspection_date)

        inspections.to_sql("parcels_inspections", engine, chunksize=50000,
                           if_exists='fail', index=False, schema=schema)
        logging.debug("... table has {} rows".format(len(inspections)))
        #Create indexes to make joins with events_Xmonths_* tables faster.
        #The unqualified table name is resolved through the search_path
        #set above
        cur.execute('CREATE INDEX ON parcels_inspections (parcel_id);')
        cur.execute('CREATE INDEX ON parcels_inspections (inspection_date);')
        cur.execute(
            'CREATE INDEX ON parcels_inspections (parcel_id, inspection_date);'
        )
        con.commit()
    else:
        logger.info('parcels_inspections table already exists, skipping...')

    for feature in features_to_generate:
        logging.info("Generating {} features".format(feature.table))
        #Try generating features with the n_months argument
        try:
            logging.info(("Generating {} "
                          "features for {} months "
                          "and within {} m").format(feature.table,
                                                    n_months, max_dist))
            feature_data = feature.generator_function(con, n_months,
                                                      max_dist)
            table_to_save = '{}_{}m_{}months'.format(feature.table,
                                                     max_dist, n_months)
        #If it fails, feature is not spatiotemporal, send only connection
        except Exception as e:
            table_to_save = feature.table
            logging.info(
                "Failed to call function with months and dist: {}".format(
                    str(e)))
            feature_data = feature.generator_function(con)
        #Every generator function must have a column with parcel_id,
        #inspection_date and the correct number of rows as their
        #corresponding parcels_inspections table in the schema being used
        # TO DO: check that feature_data has the right shape and indexes
        if table_to_save in existing_tables:
            logger.info(
                'Features table {} already exists. Replacing...'.format(
                    feature.table))

        feature_data.to_sql(
            table_to_save,
            engine,
            chunksize=50000,
            if_exists='replace',
            index=True,
            schema=schema,
            #Force saving inspection_date as timestamp without timezone
            dtype={'inspection_date': types.TIMESTAMP(timezone=False)})
        logging.debug("{} table has {} rows".format(table_to_save,
                                                    len(feature_data)))
def make_inspections_features(con, n_months, max_dist):
    """
    Make inspections features

    Input:
    con: connection to postgres database.
         "set schema ..." must have been called on this connection
         to select the correct schema from which to load inspections
    n_months: size of the time window (in months) before each inspection
         in which past inspection events are counted
    max_dist: radius used in the ST_DWithin lookups (the SQL multiplies
         it by 3.281 -- presumably a meters-to-feet conversion,
         TODO confirm)

    Output:
    A pandas dataframe indexed by (parcel_id, inspection_date),
    with one row per inspection and one column per feature.
    """
    dataset = 'inspections_views.events_parcel_id'
    date_column = 'date'

    ## ------------------------------------------------------------------------
    ## Make the parcel_id-to-nearby-houses table, if it's not there yet.
    ## ------------------------------------------------------------------------
    query = """
        CREATE TABLE insp2houses_{max_dist}m AS
        SELECT feature_y.parcel_id, count(*) as parcels
        FROM (
            SELECT t.parcel_id, p.geom
            FROM (SELECT DISTINCT parcel_id FROM parcels_inspections) t
            LEFT JOIN shape_files.parcels_cincy p
            ON t.parcel_id=p.parcelid
        ) feature_y
        LEFT JOIN shape_files.parcels_cincy parcels
        ON ST_DWithin(feature_y.geom, parcels.geom, {max_dist}*3.281::double precision)
        AND feature_y.parcel_id <> parcels.parcelid
        GROUP BY feature_y.parcel_id
        ;
        CREATE INDEX ON insp2houses_{max_dist}m (parcel_id);
    """.format(max_dist=max_dist)

    #Create a cursor
    cur = con.cursor()
    # try/finally so the cursor is released even if one of the
    # statements below fails
    try:
        #Get the current schema
        cur.execute('SELECT current_schema;')
        current_schema = cur.fetchone()[0]
        #Build the table name
        table_name = 'insp2houses_{max_dist}m'.format(max_dist=max_dist)

        # check if table already exists in current schema;
        # if not, create it
        if table_name not in tables_in_schema(current_schema):
            logging.info(
                "Table %s does not exist yet, generating." % table_name)
            cur.execute(query)
        else:
            logging.info("Table %s already exists, skipping." % table_name)
        con.commit()

        ## --------------------------------------------------------------------
        ## Make the table of nearby events, and the features.
        ## --------------------------------------------------------------------

        #Get the time window for which you can generate features
        min_insp, max_insp = check_date_boundaries(con, n_months, dataset,
                                                   date_column)

        query = """
            DROP TABLE IF EXISTS inspfeatures1_{n_months}months_{max_dist}m;

            CREATE TEMP TABLE inspfeatures1_{n_months}months_{max_dist}m ON COMMIT DROP AS
            SELECT t2.parcel_id, t2.inspection_date, t2.event,
                   coalesce(t1.count, 0) as count,
                   (coalesce(t1.count, 0)+1.0) / (coalesce(t2.parcels,0)+5.0) as regularized_count_per_houses
            FROM (
                SELECT feature_y.parcel_id, feature_y.inspection_date,
                       coalesce(realinspections.event,'missing') as event,
                       count(*) as count
                FROM (
                    SELECT t.*, p.geom, ih.parcels
                    FROM parcels_inspections t
                    LEFT JOIN shape_files.parcels_cincy p
                    ON t.parcel_id=p.parcelid
                    LEFT JOIN insp2houses_{max_dist}m ih
                    USING (parcel_id)
                ) feature_y
                JOIN (
                    SELECT insp.*, p.geom
                    FROM inspections_views.events_parcel_id insp
                    JOIN shape_files.parcels_cincy p
                    ON insp.parcel_no=p.parcelid
                ) realinspections
                ON realinspections.date < feature_y.inspection_date
                AND (feature_y.inspection_date - '{n_months} month'::interval) <= realinspections.date
                AND ST_DWithin(feature_y.geom, realinspections.geom, {max_dist}*3.281::double precision)
                WHERE feature_y.inspection_date BETWEEN '{min_date}' AND '{max_date}'
                GROUP BY feature_y.parcel_id, feature_y.inspection_date, realinspections.event
            ) t1
            RIGHT JOIN
            (SELECT parcel_id, inspection_date, ft.event, parcels
             FROM parcels_inspections
             JOIN (select distinct coalesce(event,'missing') as event
                   from inspections_views.events_parcel_id) ft
             ON true
             JOIN insp2houses_{max_dist}m
             USING (parcel_id)
            ) t2
            USING (parcel_id, inspection_date, event)
            ;

            CREATE TEMP TABLE inspfeatures2_{n_months}months_{max_dist}m ON COMMIT DROP AS (
                SELECT parcel_id, inspection_date, event, count
                FROM inspfeatures1_{n_months}months_{max_dist}m
                UNION ALL (
                    SELECT parcel_id, inspection_date,
                           event||'_per_houses' as event,
                           regularized_count_per_houses AS count
                    FROM inspfeatures1_{n_months}months_{max_dist}m
                )
            )
            ;

            CREATE INDEX ON inspfeatures2_{n_months}months_{max_dist}m (parcel_id, inspection_date);

            -- Now call the pivot function to create columns with the
            -- different inspection events
            SELECT colpivot('insppivot_{n_months}months_{max_dist}m',
                            'select * from inspfeatures2_{n_months}months_{max_dist}m',
                            array['parcel_id','inspection_date'],
                            array['event'],
                            '#.count',
                            null
            );

            -- Note: Not coalescing the counts, as the _per_houses shouldn't be
            -- set to 0. We'll have to leave it to later imputation.

            CREATE INDEX ON insppivot_{n_months}months_{max_dist}m (parcel_id, inspection_date);

            -- still need to 'save' the tables into a permanent table
            DROP TABLE IF EXISTS inspfeatures_{n_months}months_{max_dist}m;
            CREATE TABLE inspfeatures_{n_months}months_{max_dist}m AS
            SELECT * FROM insppivot_{n_months}months_{max_dist}m ip1
            ;
        """.format(n_months=str(n_months), max_dist=max_dist,
                   min_date=str(min_insp), max_date=str(max_insp))

        cur.execute(query)
        con.commit()

        # fetch the data
        query = """
            SELECT * FROM inspfeatures_{n_months}months_{max_dist}m;
        """.format(n_months=str(n_months), max_dist=max_dist)

        df = pd.read_sql(query, con,
                         index_col=['parcel_id', 'inspection_date'])

        # clean up the column names: spaces become underscores, everything
        # is lowercased and any character that is neither alphanumeric nor
        # an underscore is removed
        df.columns = [
            ''.join(c for c in name.replace(' ', '_').lower()
                    if c.isalnum() or c == '_')
            for name in df.columns
        ]

        # drop the last interim table
        query = 'drop table inspfeatures_{n_months}months_{max_dist}m'.format(
            n_months=str(n_months), max_dist=str(max_dist))
        cur.execute(query)
        con.commit()
    finally:
        cur.close()

    return df
url = params['source']['url'] except Exception, e: logger.info('URL was not present in the configuration file...') else: logger.info('Downloading file...'.format(folder)) data_file = urllib2.urlopen(url) #Dowload file replacing it if already exists with open(params['source']['filename'],'wb') as output: output.write(data_file.read()) #Step two: check most recent entry in the database engine = create_engine(uri) table_name = params['storage']['table'] #Check if table exists if table_name not in tables_in_schema(schema): db_most_recent = None logger.info('Table does not exist, diff file will be a copy of source file') else: query = 'SELECT MAX({}) FROM {}'.format(db_column, table_name) #TMP FIX db_most_recent = engine.execute(query).fetchone()[0] logger.info('Most recent record in database is: {}'.format(db_most_recent)) #Step three: load and subset the file to include new entries #TO DO: avoid loading if most_recent_row is None logger.info('Loading {}'.format(params['source']['filename'])) df = pd.read_csv(params['source']['filename']) df[file_column] = pd.to_datetime(df[file_column]) #Subset only if db_most_recent has a value new_entries = df[df[file_column] > db_most_recent] if db_most_recent else df