def load_xls(cursor, datadir, schema_name, config_path, config_name):
    """Load multiple xls/xlsx files into Postgres."""
    files = os.listdir(datadir)
    files_xls = [f for f in files if f.split('.')[-1] in ('xlsx', 'xls')]
    logger.info(files_xls)
    for filename in files_xls:
        df = pd.read_excel(datadir + '/' + filename, skiprows=1)
        if df.empty:
            logger.info('No data')
            continue
        df.columns = map(str.lower, df.columns)
        logger.info('added ' + filename)
        logger.info(df.columns)
        # Load the data into Postgres.
        engine = postgres_engine_pandas(config_path, config_name)
        table_name = filename.split('.')[0]
        create_pg_schema(cursor, schema_name)
        df.to_sql(table_name, engine, schema=schema_name, if_exists='replace')
        # ,dtype={geom: Geometry('POINT', srid='4326')})
        logger.info(filename + ' added as ' + table_name)
        create_geoms(cursor, schema_name, table_name,
                     'x-coordinaat', 'y-coordinaat')
        cursor.execute(
            sql.SQL("""ALTER TABLE {}.{} ADD COLUMN id SERIAL PRIMARY KEY;""")
            .format(sql.Identifier(schema_name), sql.Identifier(table_name)))

def psycopg_connection_string(config_full_path, db_config_name):
    """
    Postgres connection string for psycopg2.

    Args:
        1. config_full_path: location of the config.ini file including the
           name of the file, for example authentication/config.ini
        2. db_config_name: dev or docker to get the ip user/password and
           port values.

    Returns:
        The connection string psycopg2 requires:
        'host= port= user= dbname= password='
    """
    config = get_config(config_full_path)
    logger.info('Config names: {}'.format(config.sections()))
    logger.info(db_config_name)
    host = config.get(db_config_name, 'host')
    logger.info(host)
    port = config.get(db_config_name, 'port')
    user = config.get(db_config_name, 'user')
    dbname = config.get(db_config_name, 'dbname')
    password = config.get(db_config_name, 'password')
    return 'host={} port={} user={} dbname={} password={}'.format(
        host, port, user, dbname, password)

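# Usage sketch (illustrative, not part of the original module): build a
# connection string and open a psycopg2 connection with it. The config path
# 'authentication/config.ini' and section name 'dev' are assumed examples.
#
#     pg_str = psycopg_connection_string('authentication/config.ini', 'dev')
#     with psycopg2.connect(pg_str) as conn:
#         with conn.cursor() as cursor:
#             cursor.execute('SELECT 1;')
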
def get_kvk_json(url, params, api_key=None):
    """
    Get a json response from a url, provided params + api_key.

    Args:
        url: api endpoint
        params: kvkNumber, branchNumber, rsin, street, houseNumber,
            postalCode, city, tradeName, or provide lists/dicts of values
        api_key: kvk api_key. Add KVK_API_KEY to your ENV variables.

    Returns:
        parsed json or error message
    """
    api_key = api_key or os.environ.get('KVK_API_KEY')
    if not api_key:
        logger.error('please provide api_key')
        return None
    url += '&user_key={}'.format(api_key)

    response = requests.get(url, params)
    try:
        response.raise_for_status()  # Raises HTTPError, if one occurred.
    except requests.exceptions.HTTPError as e:
        raise errors.InvalidResponse(response) from e
    json_response = response_to_json(response)
    logger.info('received data from {} '.format(url))
    return json_response

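# Usage sketch (illustrative; the endpoint url below is a placeholder, not a
# verified KvK API url, and the param name follows the docstring). Note that
# the function appends '&user_key=', so the url must already carry a query
# string.
#
#     result = get_kvk_json('https://example-kvk-endpoint/search?format=json',
#                           params={'kvkNumber': '12345678'})
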
def find_resource_id_if_exists(url, dataset_name, file_name):
    """Return the resource id for file_name in dataset_name, if it exists."""
    metadata = requests.get(url + '/api/3/action/package_show?id=' + dataset_name)
    package_metadata = metadata.json()
    for resource in package_metadata['result']['resources']:
        if resource['name'] == file_name:
            logger.info('Found existing filename: {}, will update it now...'
                        .format(resource['name']))
            return resource['id']

def upload_file(connection, container_path, files_directory):
    """
    Upload all files in a directory to the objectstore.

    Args:
        1. connection: Objectstore connection based on
           from helpers.connection import objectstore_connection
        2. container_path: Name of container/prefix/subfolder, for example
           Dataservices/aanvalsplan_schoon/crow
        3. files_directory: path to the directory holding the files to
           upload, for example: data

    Uses mime for content_type:
    https://stackoverflow.com/questions/43580/how-to-find-the-mime-type-of-a-file-in-python

    Result:
        Uploads every file to the objectstore and checks that it exists in
        the defined container_path.
    """
    files = [
        f for f in listdir(files_directory)
        if isfile(join(files_directory, f))
    ]
    for filename in files:
        filename_path = os.path.join(files_directory, filename)
        with open(filename_path, 'rb') as contents:
            mime = MimeTypes()
            content_type = mime.guess_type(filename_path)[0]
            logger.info("Found content type '{}'".format(content_type))
            put_object(connection, container_path, filename,
                       contents, content_type)
        check_existence_object(connection, container_path, filename)

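# Usage sketch (illustrative): upload every file in a local 'data' directory.
# The container path follows the docstring example; the config path and
# section name are assumptions.
#
#     connection = objectstore_connection('config.ini', 'objectstore')
#     upload_file(connection, 'Dataservices/aanvalsplan_schoon/crow', 'data')
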
def load_xls(datadir, config_path, db_config_name):
    """Load multiple xls/xlsx files into Postgres."""
    files = os.listdir(datadir)
    files_xls = [f for f in files if f.split('.')[-1] in ('xlsx', 'xls')]
    logger.info(files_xls)
    for filename in files_xls:
        df = pd.read_excel(datadir + '/' + filename)
        if df.empty:
            logger.info('No data')
            continue
        logger.info('added ' + filename)
        logger.info(df.columns)
        # Load the data into Postgres.
        engine = postgres_engine_pandas(config_path, db_config_name)
        # TODO: link to to_sql function
        table_name = filename.split('.')[0]
        df.to_sql(table_name, engine, if_exists='replace',
                  index=True, index_label='idx')
        # ,dtype={geom: Geometry('POINT', srid='4326')})
        logger.info(filename + ' added as ' + table_name)

def objectstore_connection(config_full_path, config_name,
                           print_config_vars=None):
    """
    Get an objectstore connection.

    Args:
        1. config_full_path: /path_to_config/config.ini or config.ini if in root.
        2. config_name: objectstore
        3. print_config_vars: if set to True: print all variables from the
           config file

    Returns:
        An objectstore connection session.
    """
    assert os.environ['OBJECTSTORE_PASSWORD']
    config = get_config(config_full_path)
    if print_config_vars:
        logger.info('config variables.. :{}'.format(
            dict(config.items(config_name))))
    conn = Connection(
        authurl=config.get(config_name, 'AUTHURL'),
        user=config.get(config_name, 'USER'),
        key=os.environ['OBJECTSTORE_PASSWORD'],
        tenant_name=config.get(config_name, 'TENANT_NAME'),
        auth_version=config.get(config_name, 'VERSION'),
        os_options={
            'tenant_id': config.get(config_name, 'TENANT_ID'),
            'region_name': config.get(config_name, 'REGION_NAME'),
            # 'endpoint_type': 'internalURL'
        })
    logger.info('Established successful connection to {}'.format(
        config.get(config_name, 'TENANT_NAME')))
    return conn

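# Usage sketch (illustrative): open a connection and list a container. The
# OBJECTSTORE_PASSWORD env variable must be set first; the container name
# 'Dataservices' is an assumed example.
#
#     conn = objectstore_connection('config.ini', 'objectstore')
#     _, objects = conn.get_container('Dataservices')
#     for obj in objects:
#         logger.info(obj['name'])
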
def flatten_rounds(endpoint, key):
    """Flatten inspection rounds from an endpoint into a DataFrame."""
    list_uris = get_objects(endpoint, key)
    total_objects = []
    n = 1
    for uri in list_uris:
        inspection_round = get_json(uri)
        inspections = inspection_round.pop('inspections')
        m = 0
        for inspection in inspections:
            inspection.pop('closingUserDisplayName')
            geojson = inspection['location']['geoJsonFeature'].pop('geometry')
            inspection.update(inspection_round)
            results = inspection.pop('results')
            for result in results:
                result.pop('creatingUserDisplayName')
                result.update(inspection)
                result = flatten_json(result)
                result.update(geojson=json.dumps(geojson))
                total_objects.append(result)
            m += 1
            logger.info('{}: {} of {} rounds, {} of {} inspections'.format(
                endpoint, n, len(list_uris), m, len(inspections)))
        n += 1
    df = pd.DataFrame.from_dict(total_objects, orient='columns', dtype=None)
    return df

def getJsonData(url, accessToken):
    """
    Get a json response from a url with access token.

    Args:
        1. url: api endpoint
        2. accessToken: access token generated using the auth helper:
           GetAccessToken().getAccessToken(usertype='employee_plus',
           scopes='BRK/RS,BRK/RSN/,BRK/RO')

    Returns:
        parsed json or error message
    """  # noqa
    response = requests.get(url, headers=accessToken)  # Get first page for count
    if response.status_code != 200:
        if response.status_code in (404, 401):
            logger.info('Error status: {} {}'.format(
                str(response.status_code), 'trying with trailing / ...'))
            response = requests.get(url + '/', headers=accessToken)
        else:
            logger.info('Error status: ' + str(response.status_code))
            return None
    jsonData = response.json()
    logger.info('received data from {} '.format(url))
    return jsonData

def save_geojson(cursor, pg_string, table_name, output_folder):
    """Export a Postgres table to GeoJSON with ogr2ogr."""
    create_dir_if_not_exists(output_folder)
    full_path = os.path.join(output_folder, table_name + '.geojson')
    cmd = [
        'ogr2ogr', '-F', 'GeoJSON', full_path, 'PG:' + pg_string, table_name
    ]
    # Wait for ogr2ogr to finish before reporting the file as written.
    subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate()
    logger.info('Written GeoJSON to: {}'.format(full_path))

def get_table_names(mdb_file):
    """Get table names."""
    table_names = subprocess.Popen(
        ['mdb-tables', '-1', mdb_file],
        stdout=subprocess.PIPE).communicate()[0].decode('utf-8')
    tables = table_names.strip().split('\n')
    logger.info('Tables: {}'.format(tables))
    return tables

def run_command_sync(cmd, allow_fail=False):
    """Run a shell command and return the command line output."""
    encoding = sys.stdout.encoding
    logger.info('Running %s', scrub(cmd))
    stdout = subprocess.Popen(
        cmd, stdout=subprocess.PIPE).communicate()[0].replace(
            b'\r', b'').decode(encoding).splitlines()
    return stdout

def import_shapefiles(cursor, pg_string, data_folder, shp_dirs):
    """Load every shapefile found in shp_dirs into Postgres."""
    for shp_dir in shp_dirs:
        create_pg_schema(cursor, shp_dir['schema'])
        full_path = os.path.join(data_folder, shp_dir['path'], '*.shp')
        for shp_filename in glob.glob(full_path):
            logger.info('Found: ' + shp_filename + ', saving to Postgres')
            shp2psql(shp_filename, pg_string,
                     shp_filename.split('/')[-1][:-4], shp_dir['schema'])
    cursor.close()

def save_table_to_postgres(engine, dataframe, tablename):
    """Load a flattened dataframe into a table."""
    logger.info('Loading {} to postgres.'.format(tablename))
    dataframe.to_sql(tablename, engine, if_exists='replace',
                     index=True, index_label='idx')
    logger.info('{} added to postgres.'.format(tablename))

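# Usage sketch (illustrative; creating the engine via postgres_engine_pandas
# with 'config.ini' and section 'dev' is an assumption):
#
#     engine = postgres_engine_pandas('config.ini', 'dev')
#     df = pd.DataFrame({'name': ['a', 'b'], 'value': [1, 2]})
#     save_table_to_postgres(engine, df, 'demo_table')
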
def retrywithtrailingslash(url, access_token):
    """GET the url; on a 404 or 401, retry once with a trailing slash."""
    response = requests.get(url, headers=access_token)  # Get first page for count
    if response.status_code != 200:
        if response.status_code in (404, 401):
            logger.info('Error status: {} {}'.format(
                str(response.status_code), 'trying with trailing / ...'))
            response = requests.get(url + '/', headers=access_token)
            return response
        else:
            logger.info('Error status: ' + str(response.status_code))
            return None
    return response

def get_tables_mdb(mdb_file):
    """
    Get the list of table names with "mdb-tables" for a *.mdb file
    using latin1 as encoding.
    """
    table_names_binary_string = subprocess.Popen(
        ['mdb-tables', '-1', mdb_file],
        stdout=subprocess.PIPE).communicate()[0]
    table_names = table_names_binary_string.decode(
        'latin1')  # another option could be 'ascii'
    tables = table_names.split('\n')
    logger.info('Available tables:{}'.format(tables))
    return tables

def create_pg_tables(cursor, schema_name, mdb_file):
    """
    Create Postgres tables from an mdb schema. Example command:
    mdb-schema -Nnoord ../data/beheerassets/noord/vm_stadsdeel_noord.mdb postgres
    """
    cmd = ['mdb-schema', '-N' + schema_name, mdb_file, 'postgres']
    get_tables_from_mdb = run_command_sync(cmd)
    schema = cleanup_table_create(get_tables_from_mdb)
    cursor.execute(schema)
    logger.info(schema)
    logger.info('Created tables')

def your_first_function(argname1):
    """
    Does some great stuff.

    Args:
        argname1: path/in/store

    Returns:
        A file or check, show some examples.
    """
    something_2 = 'test2'
    logger.info('Success!')
    return something_2

def get_data(url_api, endpoint, metadata, accessToken, limit):
    """
    Get and flatten all the data from the api.

    Args:
        1. url_api: the main api url::

               https://api.data.amsterdam.nl/tellus

        2. endpoint: one endpoint:: tellus
        3. metadata: a list of dictionaries from other endpoints, in this
           case: for tellus location, speed and length.
        4. accessToken: access token generated using the auth helper:
           GetAccessToken().getAccessToken()
        5. limit: the number of pages you want to retrieve,
           ideal for testing first:: 10

    Returns:
        A list containing multiple items which are all reformatted to a
        flattened json with added metadata.
    """
    data = []
    url = url_api + '/' + endpoint
    startPage = 1
    has_next_key = False
    cvalues = conversionListCvalues(metadata)

    json_data = getJsonData(url, accessToken)
    number_of_items = json_data['count']
    logger.info('number of items {}'.format(number_of_items))
    number_of_pages = int(abs(number_of_items / 100))

    if "next" in json_data["_links"].keys():
        has_next_key = True
        url = json_data["_links"]["next"]
        logger.info(url)

    while has_next_key and startPage < limit:
        response = getJsonData(url, accessToken)
        if "next" in response["_links"].keys():
            url = response["_links"]["next"]
            logger.info(url)
        else:
            has_next_key = False  # No next key: stop the loop.

        for item in response["_embedded"]:
            # Reformat each item and add the c-waarde row.
            newRow = reformatData(item, metadata['tellus']['_embedded'],
                                  cvalues)
            # Append to the main data array.
            data.append(newRow)
        logger.info('Page {} of {}'.format(startPage, number_of_pages))
        startPage += 1
    return data

def getAreaCodes(item, lat, lon):
    """
    Get specific information, like area codes, based on the radius to the
    nearest address for a lat/lon value, for example:
    https://api.data.amsterdam.nl/geosearch/search/?item=verblijfsobject&lat=52.3731750&lon=4.8924655&radius=50

    It is currently coded to work with:
        - "buurt"
        - "buurtcombinatie"
        - "stadsdeel"
    """
    if item in ["buurt", "buurtcombinatie", "stadsdeel"]:
        url = ("https://api.data.amsterdam.nl/geosearch/search/"
               "?item=%s&lat=%s&lon=%s&radius=1" % (item, lat, lon))
        logger.info(url)
        jsonData = getJson(url)
        logger.info(jsonData)
        if "features" in jsonData and len(jsonData["features"]) > 0:
            uri = jsonData["features"][0]["properties"]["uri"]
            data = getJson(uri)
            if item in ("buurt", "buurtcombinatie"):
                return [data["volledige_code"], data["naam"]]
            if item == "stadsdeel":
                return [data["code"], data["naam"]]
        else:
            logger.info('Falls outside Amsterdam')
            return None
    else:
        logger.info('Invalid item')
        return None

def load_wfs_layer_into_postgres(pg_str, url_wfs, layer_name, srs,
                                 retry_count=3):
    """
    Get layer from a wfs service and load it into Postgres.

    Args:
        1. url_wfs: full url of the WFS including https, excluding /?::

               https://map.data.amsterdam.nl/maps/gebieden

        2. layer_name: Title of the layer:: stadsdeel
        3. srs: coordinate system number, excluding EPSG:: 28992

    Returns:
        The layer loaded into postgres
    """  # noqa
    parameters = {
        "REQUEST": "GetFeature",
        "TYPENAME": layer_name,
        "SERVICE": "WFS",
        "VERSION": "2.0.0",
        # "SRSNAME": "EPSG:{}".format(srs)
    }
    logger.info("Requesting data from {}, layer: {}".format(
        url_wfs, layer_name))
    url = url_wfs + '?' + urlencode(parameters)
    srs = "EPSG:{}".format(srs)
    cmd = [
        'ogr2ogr',
        # Replace the existing table.
        '-overwrite',
        # srs to use: 4326, 28992.
        '-t_srs', srs,
        # Layer name.
        '-nln', layer_name,
        # Geometry target column name: new versions of ogr2ogr use different
        # column names, so force it here. (TODO: make this an option?)
        '-lco', 'GEOMETRY_NAME=wkb_geometry',
        '-F', 'PostgreSQL',
        'PG:' + pg_str,
        url
    ]
    run_command_sync(cmd)

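# Usage sketch (illustrative, following the docstring examples; the config
# path and 'dev' section are assumptions):
#
#     pg_str = psycopg_connection_string('authentication/config.ini', 'dev')
#     load_wfs_layer_into_postgres(
#         pg_str, 'https://map.data.amsterdam.nl/maps/gebieden',
#         'stadsdeel', 28992)
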
def get_config(full_path):
    """
    Get config file with all login credentials, port numbers, etc.

    Args:
        full_path: provide the full path to the config.ini file, for example
            authentication/config.ini

    Returns:
        The entire configuration file to use them with
        ``config.get(config_name, 'AUTHURL')``
    """
    config = configparser.RawConfigParser()
    config.read(full_path)
    logger.info('Found these configs.. {}'.format(config.sections()))
    return config

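# Usage sketch (illustrative; the section and key names follow the docstring
# example):
#
#     config = get_config('authentication/config.ini')
#     authurl = config.get('objectstore', 'AUTHURL')
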
def your_second_function(argname1, argname2):
    """
    Does some great stuff.

    Args:
        1. argname1: path/in/store
        2. argname2: your_file_name.txt

    Returns:
        A file or check, show some examples.
    """
    data = argname1
    something = data
    logger.info('Another Success!')
    return something

def get_layers_from_wfs(url_wfs):
    """
    Get all layer names in WFS service, print and return them in a list.
    """
    layer_names = []
    parameters = {"REQUEST": "GetCapabilities", "SERVICE": "WFS"}
    getcapabilities = requests.get(url_wfs, params=parameters)
    root = ET.fromstring(getcapabilities.text)
    for neighbor in root.iter('{http://www.opengis.net/wfs/2.0}FeatureType'):
        # neighbor[0] == name, neighbor[1] == title
        logger.info("layername: " + neighbor[1].text)
        layer_names.append(neighbor[1].text)
    return layer_names

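# Usage sketch (illustrative, using the WFS url that appears in the examples
# elsewhere in this module):
#
#     layers = get_layers_from_wfs('https://map.data.amsterdam.nl/maps/gebieden')
#     for layer in layers:
#         logger.info(layer)
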
def create_geoms(cursor, schema_name, table_name, x_name, y_name):
    """Add a POINT geometry column (EPSG:28992) built from x/y columns."""
    cursor.execute(
        sql.SQL("""
            ALTER TABLE {0}.{1} DROP COLUMN IF EXISTS geom;
            SELECT AddGeometryColumn ({2},{3},'geom',28992,'POINT',2);
            UPDATE {0}.{1}
            SET geom =
                CASE
                    -- Fix Microstation offset. Used MSLINK 9688 Noord to
                    -- match offset mldnr E06 with dgn vs access db.
                    WHEN {4} is not null and {4} < 100000
                    THEN ST_PointFromText('POINT('||{4}+123835.77100000000791624+2023648.989000000059605||' '||{5}+489390.68800000002374873+1658091.672999999951571||')',28992)
                    WHEN {4} is not null and {4} > 100000
                    THEN ST_PointFromText('POINT('||{4}||' '||{5}||')',28992)
                END""").format(
            sql.Identifier(schema_name), sql.Identifier(table_name),
            sql.Literal(schema_name), sql.Literal(table_name),
            sql.Identifier(x_name), sql.Identifier(y_name)))
    logger.info('added geometry column to {}'.format(table_name))

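# Usage sketch (illustrative; the schema and table names are assumed example
# values, while the coordinate column names follow the load_xls call above):
#
#     create_geoms(cursor, 'my_schema', 'my_table',
#                  'x-coordinaat', 'y-coordinaat')
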
def getJsonData(url, access_token):
    """
    Get a json response from a url with access token.

    Args:
        1. url: api endpoint
        2. access_token: access token generated using the auth helper:
           GetAccessToken().getAccessToken(usertype='employee_plus',
           scopes='BRK/RS,BRK/RSN/,BRK/RO')

    Returns:
        parsed json or error message
    """
    response = retrywithtrailingslash(url, access_token)
    json_data = response.json()
    logger.info('received data from {} '.format(url))
    return json_data

def execute_sql(pg_str, sql):
    """
    Execute a sql query with psycopg2.

    Args:
        1. pg_str: connection string using helper function
           psycopg_connection_string, returning::

               host= port= user= dbname= password=

        2. sql: SQL string in triple quotes::

               CREATE TABLE foo (bar text)

    Returns:
        Executed sql with conn.cursor().execute(sql)
    """
    with psycopg2.connect(pg_str) as conn:
        logger.info('connected to database')
        with conn.cursor() as cursor:
            logger.info('start executing sql query')
            cursor.execute(sql)

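# Usage sketch (illustrative; the config path and 'dev' section are assumed
# examples):
#
#     pg_str = psycopg_connection_string('authentication/config.ini', 'dev')
#     execute_sql(pg_str, """CREATE TABLE IF NOT EXISTS foo (bar text);""")
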
def upload_file_to_ckan(url, dataset_name, file_path):
    """
    Upload a file to the CKAN datastore.

    Args:
        1. url: url of the catalog::

               https://api.data.amsterdam.nl/catalogus

        2. dataset_name: name of the dataset, which can be found in the ckan
           page url::

               https://api.data.amsterdam.nl/catalogus/dataset/afvalcontainers

        3. file_path: location of the file including filename::

               /path/to/file/to/upload.csv

    Your private api key must be set as the CKAN_API_KEY environment
    variable; it can be found on your ckan user profile page.

    Returns:
        An uploaded file to the CKAN datastore.
    """
    assert os.environ['CKAN_API_KEY']
    api_key = os.environ['CKAN_API_KEY']
    file_name = file_path.split('/')[-1]
    resource_id = find_resource_id_if_exists(url, dataset_name, file_name)
    if resource_id:
        data_upload_url = url + '/api/action/resource_update'
        data = {"id": resource_id}
    else:
        data_upload_url = url + '/api/action/resource_create'
        data = {"package_id": dataset_name, "name": file_name}
    logger.info('Uploading {}...'.format(file_name))
    with open(file_path, 'rb') as upload:
        response = requests.post(data_upload_url, data=data,
                                 headers={"X-CKAN-API-Key": api_key},
                                 files=[('upload', upload)])
    assert response.status_code == requests.codes.ok
    logger.info(
        'Uploaded {} to https://api.data.amsterdam.nl/catalogus/dataset/{}'
        .format(file_name, dataset_name))

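# Usage sketch (illustrative, following the docstring examples; requires the
# CKAN_API_KEY env variable, and the local file path is an assumption):
#
#     upload_file_to_ckan('https://api.data.amsterdam.nl/catalogus',
#                         'afvalcontainers', 'data/afvalcontainers.csv')
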
def put_object(connection, container: str, filename: str,
               contents, content_type: str) -> None:
    """
    Upload a file to objectstore.

    Args:
        1. container: path/in/store
        2. filename: your_file_name.txt
        3. contents: contents of file, with use of
           with open('your_file_name.txt', 'rb') as contents:
        4. content_type: 'text/csv', 'application/json', ... Is retrieved
           by using the mime package.

    Returns:
        A saved file in the container of the objectstore.
    """
    logger.info('Uploading file...')
    connection.put_object(container, filename, contents=contents,
                          content_type=content_type)

def dump_mdb_tables_to_csv(mdb_file, output_folder, table_names):
    """
    Dump each table as a CSV file using "mdb-export", converting " " in
    table names to "_" for the CSV filenames.
    """
    create_dir_if_not_exists(output_folder)
    if table_names is None:
        tables = get_tables_mdb(mdb_file)
    else:
        tables = table_names[0].split(',')

    for table in tables:
        if table != '':
            filename = os.path.join(output_folder,
                                    table.replace(' ', '_') + '.csv')
            logger.info('Dumping ' + table)
            contents = subprocess.Popen(
                ['mdb-export', mdb_file, table],
                stdout=subprocess.PIPE).communicate()[0]
            with open(filename, 'wb') as file:
                file.write(contents)

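# Usage sketch (illustrative; the mdb path follows the example in
# create_pg_tables above, and the output folder is an assumption):
#
#     dump_mdb_tables_to_csv(
#         '../data/beheerassets/noord/vm_stadsdeel_noord.mdb',
#         'data/csv', None)  # None dumps every table found in the file
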
def executeScriptsFromFile(pg_str, filename):
    """
    Execute every ';'-separated command from a SQL file.

    WIP: the naive split breaks on ';' inside string literals or function
    bodies.
    """
    # Open and read the file as a single buffer.
    with open(filename, 'r') as fd:
        sqlFile = fd.read()
    # All SQL commands (split on ';').
    sqlCommands = sqlFile.split(';')
    # Execute every command from the input file.
    for command in sqlCommands:
        logger.info(command)
        # This will skip and report errors.
        # For example, if the tables do not yet exist, this will skip over
        # the DROP TABLE commands.
        try:
            execute_sql(pg_str, command)
        except psycopg2.OperationalError as msg:
            logger.info('Command skipped: {}'.format(msg))

def getJsonData(url, accessToken):
    """
    Get a json response from a url with access token.

    Args:
        1. url: api endpoint
        2. accessToken: access token generated using the auth helper:
           GetAccessToken().getAccessToken(usertype='employee',
           scopes='TLLS/R')

    Returns:
        parsed json or error message
    """
    response = requests.get(url, headers=accessToken)  # Get first page for count
    if response.status_code != 200:
        if response.status_code in (404, 401):
            logger.info('Error status: {} {}'.format(
                str(response.status_code), 'trying with trailing / ...'))
            response = requests.get(url + '/', headers=accessToken)
        else:
            logger.info('Error status: ' + str(response.status_code))
            return None
    jsonData = response.json()
    logger.info('received data from {} '.format(url))
    return jsonData

def main():
    # Return all arguments in a list.
    args = parser().parse_args()
    logger.info('Getting Access token.')
    access_token = GetAccessToken().getAccessToken(usertype=args.usertype,
                                                   scopes=args.scopes)
    logger.info('Setup temp database to store requests, to speed up '
                'restarting the download if the network fails.')
    requests_cache.install_cache('requests_db', backend='sqlite')
    logger.info('Getting data with Access token.')
    json_data = getJsonData(args.url, access_token)
    logger.info(json_data)
    save_file(json_data, args.output_folder, args.filename)

def get_layer_from_wfs(url_wfs, layer_name, srs, outputformat):
    """
    Get layer from a wfs service.

    Args:
        1. url_wfs: full url of the WFS including https, excluding /?::

               https://map.data.amsterdam.nl/maps/gebieden

        2. layer_name: Title of the layer:: stadsdeel
        3. srs: coordinate system number, excluding EPSG:: 28992
        4. outputformat: leave empty to return standard GML, else define
           json, geojson, txt, shapezip:: geojson

    Returns:
        The layer in the specified output format.
    """
    parameters = {
        "REQUEST": "GetFeature",
        "TYPENAME": layer_name,
        "SERVICE": "WFS",
        "VERSION": "2.0.0",
        "SRSNAME": "EPSG:{}".format(srs),
        "OUTPUTFORMAT": outputformat
    }
    logger.info("Requesting data from {}, layer: {}".format(
        url_wfs, layer_name))
    response = requests.get(url_wfs, params=parameters)
    if outputformat in ('geojson', 'json'):
        geojson = response.json()
        logger.info("{} features returned.".format(
            str(len(geojson["features"]))))
        return geojson
    return response

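# Usage sketch (illustrative, following the docstring examples):
#
#     geojson = get_layer_from_wfs(
#         'https://map.data.amsterdam.nl/maps/gebieden',
#         'stadsdeel', 28992, 'geojson')
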
def load_layers(pg_str):
    """
    Load layers into Postgres using a list of titles of each layer within
    the WFS service.

    Args:
        pg_str: psycopg2 connection string::

            'PG:host= port= user= dbname= password='
    """
    layerNames = ['stadsdeel', 'buurt', 'buurtcombinatie',
                  'gebiedsgerichtwerken']
    srsName = 'EPSG:28992'
    for areaName in layerNames:
        WFS = ("https://map.data.amsterdam.nl/maps/gebieden"
               "?REQUEST=GetFeature&SERVICE=wfs&Version=2.0.0&SRSNAME="
               + srsName + "&typename=" + areaName)
        wfs2psql(WFS, pg_str, areaName)
        logger.info(areaName + ' loaded into PG.')

def main():
    # Return all arguments in a list.
    args = parser().parse_args()
    logger.info('Getting Access token.')
    accessToken = GetAccessToken().getAccessToken(usertype='employee',
                                                  scopes='TLLS/R')
    logger.info('Setup temp database to store requests, to speed up '
                'restarting the download if the network fails.')
    requests_cache.install_cache('requests_db', backend='sqlite')
    endpoints = ['tellus', 'snelheidscategorie', 'lengtecategorie']
    metadata = {}
    for endpoint in endpoints:
        json_data = getJsonData(args.url + '/' + endpoint, accessToken)
        metadata.update({endpoint: json_data})
        logger.info('retrieved {}'.format(endpoint))
    data = get_data(args.url, 'tellusdata', metadata, accessToken, args.limit)
    save_file(data, args.output_folder, args.filename)