def get_connection(source):
    '''
    Get a DB connection from the CLI args or defaults to postgres:///mydb
    '''
    source.engine = create_engine(source.db_name)
    ui.header('Connecting to database %s' % source.db_name)

    if not database_exists(source.engine.url):
        create_database(source.engine.url)
        ui.item("Creating database %s" % source.db_name)

    Session = sessionmaker()
    Session.configure(bind=source.engine)
    source.session = Session()

    # Check for PostGIS support
    gis_q = 'SELECT PostGIS_version();'
    try:
        source.session.execute(gis_q)
        source.geo = True
    except (OperationalError, ProgrammingError):
        source.geo = False
    source.session.commit()

    if source.geo:
        ui.item('PostGIS is installed. Geometries will be imported '
                'as PostGIS geoms.')
def __get_socrata_data(self, page_size=5000):
    '''
    Iterate over a dataset's pages using the Socrata API
    '''
    ui.item("Gathering data (this can take a bit for large datasets).")
    page_num = 0
    more_pages = True

    while more_pages:
        try:
            api_data = self.client.get(
                self.dataset_id,
                limit=page_size,
                offset=page_size * page_num,
            )

            if len(api_data) < page_size:
                more_pages = False

            page_num += 1
            yield api_data
        except Exception:
            ui.item("Sleeping for 10 seconds to avoid timeout")
            time.sleep(10)
def create_metadata(data, mappings):
    '''
    Given a list of record dicts, maps the Python type of each value to a
    SQLAlchemy type.
    '''
    ui.item("Gathering metadata")
    print()
    metadata = []

    for col_name in data[0].keys():
        for record in data:
            if col_name == 'geometry':
                metadata.append(
                    (col_name, Geometry(geometry_type='GEOMETRY', srid=4326)))
                break
            elif record[col_name]:
                try:
                    py_type = type(record[col_name])
                    print(col_name, ":", py_type)
                    metadata.append((col_name, mappings[py_type]))
                    break
                except KeyError:
                    warnings.warn(
                        'Unable to map "%s" to a SQL type.' % col_name)
                    break

    return metadata
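# Illustrative only: the `mappings` argument above pairs Python types with
# SQLAlchemy column types. The real dict lives with each source class; a
# hypothetical version might look like this (the exact types chosen here are
# assumptions, not the project's actual mapping):
#
#     example_mappings = {
#         str: Text,
#         int: Integer,
#         float: Float,
#         bool: Boolean,
#     }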
def __get_metadata(self):
    '''
    Maps provided esriFieldTypes to SQLAlchemy types.
    '''
    ui.item("Gathering metadata")
    print()
    metadata = []

    for col in self.data_info['fields']:
        col_name = col['name'].lower().replace(" ", "_")
        print(col_name, ": ", col['type'])
        metadata.append((col_name, self.col_mappings[col['type']]))

    metadata.append(
        ('geometry', Geometry(geometry_type='GEOMETRY', srid=self.srid)))
    return metadata
def geojson_data(geojson):
    '''
    Parses ['features'] within GeoJSON data and reformats all property names.
    '''
    ui.item("Gathering data (this can take a bit for large datasets).")
    new_data = []
    data = geojson['features']

    for row in data:
        output = {
            k.lower().replace(" ", "_"): v
            for k, v in row['properties'].items()
        }
        output['geometry'] = row['geometry']
        new_data.append(output)

    return new_data
def spreadsheet_metadata(spreadsheet):
    '''
    Given a spreadsheet object, maps column types as interpreted by pandas
    into SQLAlchemy types.
    '''
    ui.item("Gathering metadata")
    print()
    metadata = []

    for col_name, col_type in dict(spreadsheet.df.dtypes).items():
        print(col_name, ":", col_type)
        try:
            metadata.append((col_name, spreadsheet.col_mappings[col_type]))
        except KeyError:
            warnings.warn('Unable to map "%s" to a SQL type.' % col_name)
            continue

    return metadata
def __get_metadata(self):
    '''
    Uses provided metadata to map column types to SQLAlchemy types.
    '''
    ui.item("Gathering metadata")
    print()
    metadata = []

    for col in self.client.get_metadata(self.dataset_id)['columns']:
        print(col['fieldName'], ":", col['dataTypeName'])
        try:
            metadata.append(
                (col['fieldName'], self.col_mappings[col['dataTypeName']]))
        except KeyError:
            warnings.warn(
                'Unable to map "%s" to a SQL type.' % col['fieldName'])
            continue

    return metadata
def get_binding(source):
    '''
    Translate the source's metadata into a SQLAlchemy binding

    This looks at each column type in the metadata and creates a SQLAlchemy
    binding with columns to match. For now it fails loudly if it encounters a
    column type we've yet to map to its SQLAlchemy type.
    '''
    record_fields = {
        '__tablename__': source.tbl_name,
        '_pk_': Column(Integer, primary_key=True)
    }

    ui.header('Setting up new table, "%s", from %s source fields' %
              (source.tbl_name, source.name))

    for col_name, col_type in source.metadata:
        if isinstance(col_type, type(Geometry())) and not source.geo:
            # Try to enable PostGIS on the fly for geometry columns
            try:
                source.session.execute("CREATE EXTENSION POSTGIS;")
                ui.item("Adding PostGIS extension to support %s column."
                        % col_name)
                source.session.commit()
                source.geo = True
            except Exception:
                msg = ('"%s" is a %s column but your database doesn\'t '
                       'support PostGIS so it\'ll be skipped.') % (
                           col_name, col_type)
                ui.item(msg)
                continue

        if col_name.startswith(':@computed'):
            ui.item('Ignoring computed column "%s".' % col_name)
            continue

        try:
            col_name = utils.clean_string(col_name)
            assert col_type, 'Unable to map %s type to a SQL type.' % (
                source.name)
            record_fields[col_name] = Column(col_type)
        except NotImplementedError as e:
            ui.item('%s' % str(e))

    source.binding = type('DataRecord', (declarative_base(),), record_fields)
def insert_source(source):
    '''
    Gets the connection and binding and inserts data.
    '''
    get_connection(source)
    if not isinstance(source, sc.CenPy):
        get_binding(source)

    if source.engine.dialect.has_table(source.engine, source.tbl_name):
        print()
        warnings.warn(("Destination table already exists. Current table "
                       "will be dropped and replaced."))
        print()
        if not isinstance(source, sc.CenPy):
            source.binding.__table__.drop(source.engine)

    try:
        if not isinstance(source, sc.CenPy):
            source.binding.__table__.create(source.engine)
    except ProgrammingError as e:
        raise CLIError('Error creating destination table: %s' % str(e))

    circle_bar = FillingCirclesBar(' ▶ Loading from source',
                                   max=source.num_rows)
    source.insert(circle_bar)
    circle_bar.finish()

    ui.item('Committing rows (this can take a bit for large datasets).')
    source.session.commit()

    success = 'Successfully imported %s rows.' % (source.num_rows)
    ui.header(success, color='\033[92m')

    if source.name == "Socrata" and source.client:
        source.client.close()

    return
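# Illustrative usage only (not executed here): building a Socrata source and
# loading it the same way load_yaml() below does for each dataset. The portal
# URL, dataset id, app token, and table name are placeholders.
#
#     src = sc.SocrataPortal('data.example.gov', 'abcd-1234', None, 'my_table')
#     src.db_name = 'postgres:///mydb'
#     insert_source(src)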
def __extract_file(self):
    '''
    Extracts data from zip files if a hyperlink is provided and reads the
    saved .shp file. Creates the default table name using a sanitized
    version of the file's name.
    '''
    try:
        z = zipfile.ZipFile(io.BytesIO(requests.get(self.location).content))
        ui.item("Extracting shapefile to folder")
        z.extractall()
        # Index 2 picks the .shp member when the archive holds dbf/prj/shp/shx
        shp = [y for y in sorted(z.namelist())
               for ending in ['dbf', 'prj', 'shp', 'shx']
               if y.endswith(ending)][2]
    except Exception:
        shp = self.location

    ui.item("Reading shapefile")

    # Set the default table name from the file's base name
    tbl_name = shp[shp.rfind("/") + 1:-4].lower()
    tbl_name = utils.clean_string(tbl_name)

    return tbl_name, shapefile.Reader(shp).__geo_interface__
def create_df(self, product, year, place_type, place, level, variables):
    if product == 'Decennial2010':
        cen_prod = products.Decennial2010()
    elif product == 'ACS' and year:
        cen_prod = products.ACS(year)
    elif product == 'ACS':
        cen_prod = products.ACS()

    place_mapper = {
        'msa': cen_prod.from_msa,
        'csa': cen_prod.from_csa,
        'county': cen_prod.from_county,
        'state': cen_prod.from_state,
        'placename': cen_prod.from_place
    }

    ui.item(("Retrieving variables %s for all %ss in %s. "
             "This can take some time for large datasets.") %
            (variables, level, place))
    print()
    print(place, level, variables)

    df = place_mapper[place_type](place, level=level, variables=variables)
    df.columns = [utils.clean_string(x) for x in df.columns]
    return df
def load_yaml():
    output = yaml.load(open('bulk_load.yaml'), Loader=Loader)
    db_name = output['DATABASE']
    source_mapper = {
        'GEOJSONS': sc.GeoJson,
        'SHAPEFILES': sc.Shape,
        'CSVS': sc.Csv,
        'EXCELS': sc.Excel,
        'HUD_TABLES': sc.HudPortal
    }

    def parse_items(output_dict):
        try:
            for dataset in output[output_dict]:
                if dataset:
                    location, tbl_name = list(dataset.items())[0]
                    source = source_mapper[output_dict](location)
                    if tbl_name:
                        source.tbl_name = tbl_name
                    if db_name:
                        source.db_name = db_name
                    insert_source(source)
                else:
                    continue
        except Exception as e:
            ui.item(("Skipping %s load due to error: \"%s\". Double check "
                     "formatting of bulk_load.yaml if this was "
                     "unintentional.") % (output_dict, e))
            print()

    for output_dict in source_mapper.keys():
        parse_items(output_dict)

    try:
        socrata_sites = output.get('SOCRATA').get('sites')
        app_token = output.get('SOCRATA').get('app_token')
        if socrata_sites:
            for site in socrata_sites:
                url = site['url']
                for dataset in site['datasets']:
                    dataset_id, tbl_name = list(dataset.items())[0]
                    source = sc.SocrataPortal(url, dataset_id, app_token,
                                              tbl_name)
                    if db_name:
                        source.db_name = db_name
                    if tbl_name:
                        source.tbl_name = tbl_name
                    insert_source(source)
    except Exception as e:
        ui.item(("Skipping Socrata load due to error: \"%s\". Double check "
                 "formatting of bulk_load.yaml if this was "
                 "unintentional.") % e)
        print()

    try:
        place_type = output['CENSUS'].get('place_type')
        place_name = output['CENSUS'].get('place_name')
        level = output['CENSUS'].get('level')
        for dataset in output['CENSUS']['datasets']:
            if dataset.get('ACS'):
                product = 'ACS'
            if dataset.get('DECENNIAL2010'):
                product = 'Decennial2010'
            year = dataset[product].get('year')
            tbl_name = dataset[product]['tbl_name']
            variables = dataset[product]['variables']
            source = sc.CenPy(product, year, place_type, place_name, level,
                              variables)
            if db_name:
                source.db_name = db_name
            if tbl_name:
                source.tbl_name = tbl_name
            insert_source(source)
    except Exception as e:
        ui.item(("Skipping Census load due to error: \"%s\". Double check "
                 "formatting of bulk_load.yaml if this was unintentional.")
                % e)
        print()
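# Illustrative bulk_load.yaml layout, inferred from the keys load_yaml() reads
# above. Paths, tokens, ids, table names, and Census values are placeholders,
# not defaults shipped with the project:
#
#     DATABASE: "postgres:///mydb"
#     CSVS:
#       - /path/to/data.csv: my_csv_table
#     SHAPEFILES:
#       - https://example.org/shapes.zip: my_shape_table
#     SOCRATA:
#       app_token: YOUR_APP_TOKEN
#       sites:
#         - url: data.example.gov
#           datasets:
#             - abcd-1234: my_socrata_table
#     CENSUS:
#       place_type: county
#       place_name: Example County
#       level: tract
#       datasets:
#         - ACS:
#             year: 2017
#             tbl_name: my_acs_table
#             variables: ['B01001_001E']
#
# GEOJSONS, EXCELS, and HUD_TABLES follow the same "location: table name"
# list pattern as CSVS.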
def insert(self, circle_bar):
    ui.item("Inserting into PostGIS.")
    self.df.postgis.to_postgis(con=self.engine,
                               table_name=self.tbl_name,
                               geometry='geometry',
                               if_exists='replace')