def dat_crime(fpath=None):
    """Create ``dat_chicago_crimes_all`` and load it into ``MasterTable``.

    Calls ``raw_crime``, ``dedupe_crime`` and ``src_crime`` in that order,
    creates the dat table (only if it does not exist yet) with the extra
    ``chicago_crimes_all_row_id``, ``start_date``, ``end_date`` and
    ``current_flag`` columns, copies every row of the src table into it,
    inserts the columns returned by ``crime_master_cols`` into
    ``MasterTable`` and finally calls ``cleanup_temp_tables``.

    :param fpath: forwarded unchanged to ``raw_crime``.
    :returns: the status string ``'DAT crime created'``.
    """
    # Step Zero: Create dat_crime table
    raw_crime(fpath=fpath)
    dedupe_crime()
    src_crime()
    src_crime_table = Table(
        'src_chicago_crimes_all', Base.metadata,
        autoload=True, autoload_with=engine, extend_existing=True)
    dat_crime_table = crime_table('dat_chicago_crimes_all', Base.metadata)
    dat_crime_table.append_column(
        Column('chicago_crimes_all_row_id', Integer, primary_key=True))
    dat_crime_table.append_column(
        Column('start_date', TIMESTAMP,
               server_default=text('CURRENT_TIMESTAMP')))
    dat_crime_table.append_column(
        Column('end_date', TIMESTAMP, server_default=text('NULL')))
    dat_crime_table.append_column(
        Column('current_flag', Boolean, server_default=text('TRUE')))
    dat_crime_table.append_constraint(UniqueConstraint('id', 'start_date'))
    dat_crime_table.create(bind=engine, checkfirst=True)

    # Columns added above are left out of the INSERT so that the database
    # fills them from their server defaults / primary key.
    new_cols = ['start_date', 'end_date', 'current_flag',
                'chicago_crimes_all_row_id']
    # NOTE(review): relies on the remaining dat columns lining up, in order,
    # with the reflected src columns -- confirm against crime_table().
    dat_ins = dat_crime_table.insert().from_select(
        [c for c in dat_crime_table.columns.keys() if c not in new_cols],
        select([c for c in src_crime_table.columns])
    )

    # One connection serves both statements and is always released; the
    # previous code opened two connections and never closed either.
    conn = engine.contextual_connect()
    try:
        conn.execute(dat_ins)
        cols = crime_master_cols(dat_crime_table)
        master_ins = MasterTable.insert().from_select(
            [c for c in MasterTable.columns.keys() if c != 'master_row_id'],
            select(cols).select_from(dat_crime_table)
        )
        conn.execute(master_ins)
    finally:
        conn.close()

    cleanup_temp_tables()
    return 'DAT crime created'
def dat_crime(fpath=None):
    """Create ``dat_chicago_crimes_all`` and load it into ``MasterTable``.

    Calls ``raw_crime``, ``dedupe_crime`` and ``src_crime`` in that order,
    creates the dat table (only if it does not exist yet) with the extra
    ``chicago_crimes_all_row_id``, ``start_date``, ``end_date`` and
    ``current_flag`` columns, copies every row of the src table into it and
    then inserts the columns returned by ``crime_master_cols`` into
    ``MasterTable``.

    :param fpath: forwarded unchanged to ``raw_crime``.
    :returns: the status string ``'DAT crime created'``.
    """
    # Step Zero: Create dat_crime table
    raw_crime(fpath=fpath)
    dedupe_crime()
    src_crime()
    src_crime_table = Table(
        'src_chicago_crimes_all', Base.metadata,
        autoload=True, autoload_with=engine, extend_existing=True)
    dat_crime_table = crime_table('dat_chicago_crimes_all', Base.metadata)
    dat_crime_table.append_column(
        Column('chicago_crimes_all_row_id', Integer, primary_key=True))
    dat_crime_table.append_column(
        Column('start_date', TIMESTAMP,
               server_default=text('CURRENT_TIMESTAMP')))
    dat_crime_table.append_column(
        Column('end_date', TIMESTAMP, server_default=text('NULL')))
    dat_crime_table.append_column(
        Column('current_flag', Boolean, server_default=text('TRUE')))
    dat_crime_table.append_constraint(UniqueConstraint('id', 'start_date'))
    dat_crime_table.create(bind=engine, checkfirst=True)

    # Columns added above are left out of the INSERT so that the database
    # fills them from their server defaults / primary key.
    new_cols = ['start_date', 'end_date', 'current_flag',
                'chicago_crimes_all_row_id']
    # NOTE(review): relies on the remaining dat columns lining up, in order,
    # with the reflected src columns -- confirm against crime_table().
    dat_ins = dat_crime_table.insert().from_select(
        [c for c in dat_crime_table.columns.keys() if c not in new_cols],
        select([c for c in src_crime_table.columns])
    )

    # One connection serves both statements and is always released; the
    # previous code opened two connections and never closed either.
    conn = engine.contextual_connect()
    try:
        conn.execute(dat_ins)
        cols = crime_master_cols(dat_crime_table)
        master_ins = MasterTable.insert().from_select(
            [c for c in MasterTable.columns.keys() if c != 'master_row_id'],
            select(cols).select_from(dat_crime_table)
        )
        conn.execute(master_ins)
    finally:
        conn.close()

    return 'DAT crime created'
def src_crime():
    """Rebuild ``src_chicago_crimes_all`` from the raw and dedup tables.

    Reflects ``raw_chicago_crimes_all`` and ``dedup_chicago_crimes_all``,
    drops and recreates the src table, then inserts every raw row whose
    ``dup_row_id`` also appears in the dedup table. The ``dup_row_id``
    column itself is not copied.

    :returns: the status string ``'Source table created'``.
    """
    # Step Three: Create New table with unique ids
    raw_crime_table = Table(
        'raw_chicago_crimes_all', Base.metadata,
        autoload=True, autoload_with=engine, extend_existing=True)
    dedupe_crime_table = Table(
        'dedup_chicago_crimes_all', Base.metadata,
        autoload=True, autoload_with=engine, extend_existing=True)
    src_crime_table = crime_table('src_chicago_crimes_all', Base.metadata)
    src_crime_table.drop(bind=engine, checkfirst=True)
    src_crime_table.create(bind=engine)

    # NOTE(review): relies on the raw columns (minus dup_row_id) lining up,
    # in order, with the src columns -- confirm against crime_table().
    ins = src_crime_table.insert().from_select(
        src_crime_table.columns.keys(),
        select([c for c in raw_crime_table.columns
                if c.name != 'dup_row_id'])
        .where(raw_crime_table.c.dup_row_id ==
               dedupe_crime_table.c.dup_row_id)
    )

    # Release the connection even if the INSERT fails; it used to be leaked.
    conn = engine.contextual_connect()
    try:
        conn.execute(ins)
    finally:
        conn.close()
    return 'Source table created'
def raw_crime(fpath=None, tablename='raw_chicago_crimes_all'):
    """Load a gzipped crime CSV into a freshly created raw table.

    Drops any existing table called ``tablename``, recreates it with an
    additional ``dup_row_id`` integer primary key, and streams the gzipped
    CSV into it with a ``COPY ... FROM STDIN`` statement.

    :param fpath: path of the gzipped CSV file; when falsy, the path
        returned by ``download_crime()`` is used instead.
    :param tablename: name of the table to (re)create and fill. It is
        interpolated into the SQL text as an identifier, so it must come
        from trusted code, never from user input.
    :returns: the status string ``'Raw Crime data inserted'``.
    """
    # Step One: Load raw downloaded data
    if not fpath:
        fpath = download_crime()
    raw_crime_table = crime_table(tablename, Base.metadata)
    raw_crime_table.drop(bind=engine, checkfirst=True)
    raw_crime_table.append_column(
        Column('dup_row_id', Integer, primary_key=True))
    raw_crime_table.create(bind=engine)

    # Built with implicit concatenation so that no source indentation ends
    # up inside the statement. dup_row_id is absent from the column list on
    # purpose: it is not part of the CSV.
    copy_sql = (
        "COPY %s "
        "(id, case_number, orig_date, block, iucr, primary_type, "
        "description, location_description, arrest, domestic, "
        "beat, district, ward, community_area, fbi_code, "
        "x_coordinate, y_coordinate, year, updated_on, "
        "latitude, longitude, location) FROM STDIN WITH "
        "(FORMAT CSV, HEADER true, DELIMITER ',')"
    ) % tablename

    # The raw DBAPI connection and its cursor are always released, even
    # when the COPY fails; they used to be leaked.
    conn = engine.raw_connection()
    try:
        cursor = conn.cursor()
        try:
            with gzip.open(fpath, 'rb') as f:
                cursor.copy_expert(copy_sql, f)
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()
    return 'Raw Crime data inserted'
def raw_crime(fpath=None, tablename='raw_chicago_crimes_all'):
    """Load a gzipped crime CSV into a freshly created raw table.

    Drops any existing table called ``tablename``, recreates it with an
    additional ``dup_row_id`` integer primary key, and streams the gzipped
    CSV into it with a ``COPY ... FROM STDIN`` statement.

    :param fpath: path of the gzipped CSV file; when falsy, the path
        returned by ``download_crime()`` is used instead.
    :param tablename: name of the table to (re)create and fill. It is
        interpolated into the SQL text as an identifier, so it must come
        from trusted code, never from user input.
    :returns: the table object that was created and filled.
    """
    # Step One: Load raw downloaded data
    if not fpath:
        fpath = download_crime()
    raw_crime_table = crime_table(tablename, Base.metadata)
    raw_crime_table.drop(bind=engine, checkfirst=True)
    raw_crime_table.append_column(
        Column('dup_row_id', Integer, primary_key=True))
    raw_crime_table.create(bind=engine)

    # Built with implicit concatenation so that no source indentation ends
    # up inside the statement. dup_row_id is absent from the column list on
    # purpose: it is not part of the CSV.
    copy_sql = (
        "COPY %s "
        "(id, case_number, orig_date, block, iucr, primary_type, "
        "description, location_description, arrest, domestic, "
        "beat, district, ward, community_area, fbi_code, "
        "x_coordinate, y_coordinate, year, updated_on, "
        "latitude, longitude, location) FROM STDIN WITH "
        "(FORMAT CSV, HEADER true, DELIMITER ',')"
    ) % tablename

    # The raw DBAPI connection and its cursor are always released, even
    # when the COPY fails; they used to be leaked.
    conn = engine.raw_connection()
    try:
        cursor = conn.cursor()
        try:
            with gzip.open(fpath, 'rb') as f:
                cursor.copy_expert(copy_sql, f)
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()
    return raw_crime_table
def dat_crime():
    """Create ``dat_chicago_crimes_all`` and fill it from the src table.

    Reflects ``src_chicago_crimes_all``, building it first through
    ``src_crime()`` when it does not exist. Then creates the dat table
    (only if it does not exist yet) with the extra
    ``chicago_crimes_all_row_id``, ``start_date``, ``end_date`` and
    ``current_flag`` columns, and copies every src row into it.

    :returns: the dat table object.
    """
    # Step Zero: Create dat_crime table
    reflect_opts = dict(autoload=True, autoload_with=engine,
                        extend_existing=True)
    try:
        src_crime_table = Table('src_chicago_crimes_all', Base.metadata,
                                **reflect_opts)
    except NoSuchTableError:
        # Build the table, then reflect it again rather than trusting the
        # return value of src_crime(), which is not guaranteed to be a
        # Table (it may be a plain status string).
        src_crime()
        src_crime_table = Table('src_chicago_crimes_all', Base.metadata,
                                **reflect_opts)

    dat_crime_table = crime_table('dat_chicago_crimes_all', Base.metadata)
    dat_crime_table.append_column(
        Column('chicago_crimes_all_row_id', Integer, primary_key=True))
    dat_crime_table.append_column(
        Column('start_date', TIMESTAMP, default=datetime.now))
    dat_crime_table.append_column(
        Column('end_date', TIMESTAMP, default=None))
    dat_crime_table.append_column(
        Column('current_flag', Boolean, default=True))
    dat_crime_table.append_constraint(UniqueConstraint('id', 'start_date'))
    dat_crime_table.create(bind=engine, checkfirst=True)

    # The columns added above are excluded from the INSERT column list.
    # NOTE(review): they only carry Python-side defaults; whether those are
    # applied to an INSERT ... FROM SELECT depends on the SQLAlchemy
    # version in use -- confirm.
    new_cols = ['start_date', 'end_date', 'current_flag',
                'chicago_crimes_all_row_id']
    dat_ins = dat_crime_table.insert().from_select(
        [c for c in dat_crime_table.columns.keys() if c not in new_cols],
        select([c for c in src_crime_table.columns])
    )

    # Release the connection even if the INSERT fails; it used to be leaked.
    conn = engine.connect()
    try:
        conn.execute(dat_ins)
    finally:
        conn.close()
    return dat_crime_table