def load_xls(datadir, config_path, db_config_name):
    """
    Load xlsx into postgres for multiple files.

    Args:
        1. datadir: directory containing .xlsx / .xls files
        2. config_path: location of the config.ini file
        3. db_config_name: dev or docker section name in the config

    Result:
        Every non-empty spreadsheet in datadir is written to Postgres as a
        table named after the file (extension stripped), replacing any
        existing table of that name.
    """
    files = os.listdir(datadir)
    files_xls = [f for f in files if f.split('.')[-1] in ('xlsx', 'xls')]
    logger.info(files_xls)
    # Fix: the engine was recreated on every loop iteration; create it
    # lazily once, only when there is actually data to load.
    engine = None
    for filename in files_xls:
        df = pd.read_excel(os.path.join(datadir, filename))
        if df.empty:
            logger.info('No data')
            continue
        logger.info("added " + filename)
        logger.info(df.columns)
        # load the data into pg
        if engine is None:
            engine = postgres_engine_pandas(config_path, db_config_name)
        # TODO: link to to_sql function
        table_name = filename.split('.')[0]
        df.to_sql(table_name, engine, if_exists='replace', index=True,
                  index_label='idx')
        # ,dtype={geom: Geometry('POINT', srid='4326')})
        logger.info(filename + ' added as ' + table_name)
def psycopg_connection_string(config_full_path, db_config_name):
    """
    Postgres connection string for psycopg2.

    Args:
        1. config_full_path: location of the config.ini file including the
           name of the file, for example authentication/config.ini
        2. db_config_name: dev or docker to get the ip user/password and
           port values.

    Returns:
        The psycopg2 required connection string:
        'host= port= user= dbname= password='
    """
    # NOTE(review): this block was corrupted in the source (the password
    # masker swallowed the config-loading lines); reconstructed from the
    # sibling get_config helper. Confirm against version control.
    config = configparser.RawConfigParser()
    config.read(config_full_path)
    logger.info('Config names: {}'.format(config.sections()))
    # Fix: stray debug print(db_config_name) replaced with a logger call so
    # all output goes through the module logger.
    logger.info(db_config_name)
    host = config.get(db_config_name, 'host')
    logger.info(host)
    port = config.get(db_config_name, 'port')
    user = config.get(db_config_name, 'user')
    dbname = config.get(db_config_name, 'dbname')
    password = config.get(db_config_name, 'password')
    return 'host={} port={} user={} dbname={} password={}'.format(
        host, port, user, dbname, password)
def get_kvk_json(url, params, api_key=None):
    """
    Get a json response from a url, provided params + api_key.

    Args:
        url: api endpoint
        params: kvkNumber, branchNumber, rsin, street, houseNumber,
            postalCode, city, tradeName, or provide lists/dicts of values
        api_key: kvk api_key. add KVK_API_KEY to your ENV variables

    Returns:
        parsed json or error message
    """
    # Fix: the original ignored the api_key argument and indexed the
    # environment directly (os.environ['kvk_api_key']), which raised
    # KeyError before the error branch below could ever run.
    API_KEY = api_key or os.environ.get('kvk_api_key')
    if API_KEY:
        url += '&user_key={}'.format(API_KEY)
    else:
        return logger.error('please provide api_key')
    response = requests.get(url, params)
    try:
        response.raise_for_status()  # Raises 'HTTPError', if one occurred
    except requests.exceptions.HTTPError as e:
        raise errors.InvalidResponse(response) from e
    json_response = response_to_json(response)
    logger.info("received data from {} ".format(url))
    return json_response
def retrywithtrailingslash(url, access_token):
    """
    GET the url; on a 404 or 401 retry once with a trailing '/' appended.

    Args:
        1. url: api endpoint
        2. access_token: headers dict carrying the access token

    Returns:
        The (possibly retried) response object. NOTE(review): for any other
        non-200 status this returns the result of logger.info, i.e. None.
    """
    response = requests.get(url, headers=access_token)  # Get first page for count
    if response.status_code == 200:
        return response
    if response.status_code in (404, 401):
        logger.info('Error status: {} {}'.format(
            str(response.status_code), "trying with trailing / ..."))
        return requests.get(url + '/', headers=access_token)
    return logger.info('Error status: ' + str(response.status_code))
def your_first_function(argname1):
    """
    Does some great stuff.

    Args:
        argname1: path/in/store

    Returns:
        A file or check, show some examples.
    """
    result = 'test2'
    logger.info('Succes!')
    return result
def get_data(url_api, endpoint, metadata, accessToken, limit):
    """
    Get and flatten all the data from the api.

    Args:
        1. url_api: get the main api url::
           https://api.data.amsterdam.nl/tellus
        2. endpoint: get one endpoint:: tellus
        3. metadata: a dict of dictionaries from other endpoints, in this
           case: for tellus location, speed and length.
        4. accessToken: acces token generated using the auth helper:
           GetAccessToken().getAccessToken()
        5. limit: set the number of pages you want to retrieve, ideal for
           testing first:: 10

    Returns:
        A list containing multiple items which are all reformatted to a
        flattened json with added metadata.
    """
    data = []
    url = url_api + '/' + endpoint
    startPage = 1
    has_next_key = False
    cvalues = conversionListCvalues(metadata)
    json_data = getJsonData(url, accessToken)
    number_of_items = json_data['count']
    logger.info("number of items {}".format(number_of_items))
    # Rough page count (100 items per page) -- used for progress logging only.
    number_of_pages = int(abs(number_of_items / 100))
    if "next" in json_data["_links"].keys():
        has_next_key = True
        url = json_data["_links"]["next"]
        logger.info(url)
    while has_next_key and startPage < limit:
        response = getJsonData(url, accessToken)
        if "next" in response["_links"].keys():
            url = response["_links"]["next"]
            # Fix: the original logged a stale, always-empty nextKey
            # variable here; log the next page url instead.
            logger.info(url)
        else:
            has_next_key = False  # no next_key, stop the loop
        for item in response["_embedded"]:
            # Flatten the row and add c-waarde metadata.
            newRow = reformatData(item, metadata['tellus']['_embedded'], cvalues)
            # append to main data array
            data.append(newRow)
        logger.info('Page {} of {}'.format(startPage, number_of_pages))
        startPage += 1
    return data
def get_config(full_path):
    """
    Get config file with all login credentials, port numbers, etc.

    Args:
        full_path: provide the full path to the config.ini file,
            for example authentication/config.ini

    Returns:
        The entire configuration file to use them with
        ``config.get(config_name, 'AUTHURL')``
    """
    parsed = configparser.RawConfigParser()
    parsed.read(full_path)
    logger.info('Found these configs.. {}'.format(parsed.sections()))
    return parsed
def your_second_function(argname1, argname2):
    """
    Does some great stuff.

    Args:
        1. argname1: path/in/store
        2. argname2: your_file_name.txt

    Returns:
        A file or check, show some examples.
    """
    logger.info('Another Succes!')
    return argname1
def get_layers_from_wfs(url_wfs):
    """ Get all layer names in WFS service, print and return them in a list. """
    layer_names = []
    parameters = {"REQUEST": "GetCapabilities", "SERVICE": "WFS"}
    getcapabilities = requests.get(url_wfs, params=parameters)
    # print(getcapabilities.text)
    root = ET.fromstring(getcapabilities.text)
    feature_tag = '{http://www.opengis.net/wfs/2.0}FeatureType'
    for feature in root.iter(feature_tag):
        # feature[0] == Name element, feature[1] == Title element
        title = feature[1].text
        logger.info("layername: " + title)
        layer_names.append(title)
    return layer_names
def getJsonData(url, access_token):
    """
    Get a json response from a url with accesstoken.

    Args:
        1. url: api endpoint
        2. access_token: acces token generated using the auth helper:
           GetAccessToken().getAccessToken(usertype='employee_plus',
           scopes='BRK/RS,BRK/RSN/,BRK/RO')

    Returns:
        parsed json or error message
    """
    # Delegates the 404/401 retry-with-slash handling to the helper.
    response = retrywithtrailingslash(url, access_token)
    payload = response.json()
    logger.info("recieved data from {} ".format(url))
    return payload
def execute_sql(pg_str, sql):
    """
    Execute a sql query with psycopg2.

    Args:
        1. pg_str: connection string using helper function
           psycopg_connection_string, returning:
           ``host= port= user= dbname= password=``
        2. sql: SQL string in triple quotes::

            ```CREATE TABLE foo (bar text)```

    Returns:
        Executed sql with conn.cursor().execute(sql)
    """
    # NOTE: psycopg2's connection context manager commits on success and
    # rolls back on error, but it does NOT close the connection.
    with psycopg2.connect(pg_str) as conn:
        logger.info('connected to database')
        with conn.cursor() as cursor:
            logger.info('start exectuting sql query')
            cursor.execute(sql)
def put_object(
        connection, container: str, filename: str, contents,
        content_type: str) -> None:
    """
    Upload a file to objectstore.

    Args:
        1. container: path/in/store
        2. filename: your_file_name.txt
        3. contents: contents of file with use of
           with open('your_file_name.txt', 'rb') as contents:
        4. content_type: 'text/csv', 'application/json', ...
           Is retrievd by using the mime package.

    Returns:
        A saved file in the container of the objectstore.
    """
    logger.info('Uploading file...')
    connection.put_object(
        container,
        filename,
        contents=contents,
        content_type=content_type)
def executeScriptsFromFile(pg_str, filename):
    """WIP does not work yet

    Run every ';'-separated statement from a .sql file, logging and
    skipping statements that fail with an OperationalError.

    Args:
        1. pg_str: psycopg2 connection string
        2. filename: path to the .sql script
    """
    # Fix: use a context manager so the file handle is closed even if
    # read() raises (the original used bare open/read/close).
    with open(filename, 'r') as fd:
        sqlFile = fd.read()
    # all SQL commands (split on ';') -- naive: breaks on ';' inside string
    # literals, acceptable for this WIP helper.
    sqlCommands = sqlFile.split(';')
    # Execute every command from the input file
    for command in sqlCommands:
        logger.info(command)
        # This will skip and report errors
        # For example, if the tables do not yet exist, this will skip over
        # the DROP TABLE commands
        try:
            execute_sql(pg_str, command)
        except psycopg2.OperationalError as msg:
            logger.info("Command skipped: {}".format(msg))
def upload_file(connection, container_path, filename_path):
    """
    Upload file to the objectstore.

    Args:
        1. connection = Objectstore connection based on
           from helpers.connection import objectstore_connection
        2. container_path = Name of container/prefix/subfolder, for example
           Dataservices/aanvalsplan_schoon/crow
        3. filename_path = full path including the name of file, for
           example: data/test.csv

    Uses mime for content_type:
    https://stackoverflow.com/questions/43580/how-to-find-the-mime-type-of-a-file-in-python

    Result:
        Uploads a file to the objectstore and checks if it exists on in the
        defined container_path.
    """
    # Determine content type and target name before touching the file.
    content_type = MimeTypes().guess_type(filename_path)[0]
    logger.info("Found content type '{}'".format(content_type))
    filename = os.path.basename(filename_path)
    with open(filename_path, 'rb') as contents:
        put_object(connection, container_path, filename, contents,
                   content_type)
        check_existence_object(connection, container_path, filename)
def get_layer_from_wfs(url_wfs, layer_name, srs, outputformat):
    """
    Get layer from a wfs service.

    Args:
        1. url_wfs: full url of the WFS including https, excluding /?::
           https://map.data.amsterdam.nl/maps/gebieden
        2. layer_name: Title of the layer:: stadsdeel
        3. srs: coordinate system number, excluding EPSG:: 28992
        4. outputformat: leave empty to return standard GML, else define
           json, geojson, txt, shapezip:: geojson

    Returns:
        The layer in the specified output format (parsed dict for
        json/geojson, raw response otherwise).
    """
    parameters = {
        "REQUEST": "GetFeature",
        "TYPENAME": layer_name,
        "SERVICE": "WFS",
        "VERSION": "2.0.0",
        "SRSNAME": "EPSG:{}".format(srs),
        "OUTPUTFORMAT": outputformat
    }
    logger.info("Requesting data from {}, layer: {}".format(
        url_wfs, layer_name))
    response = requests.get(url_wfs, params=parameters)
    # Fix: the original tested membership in the single string
    # ('geojson, json'), i.e. a substring check that also matched 'son',
    # ', ', etc.  Use a proper tuple of the two supported formats.
    if outputformat in ('geojson', 'json'):
        geojson = response.json()
        logger.info("{} features returned.".format(
            str(len(geojson["features"]))))
        return geojson
    return response
def main():
    """Load WFS layers, then run the area-code SQL steps in order."""
    args = parser().parse_args()
    pg_str = psycopg_connection_string(args.full_config_path,
                                       args.db_config_name)
    load_wfs_to_postgres.main()  # Reuses pg_str
    # The following statements must run in this order: table, then its
    # geometry column, then the derived fields, then the view on top.
    execute_sql(pg_str, createDummyTable)
    logger.info('created createDummyTable')
    execute_sql(pg_str, createGeom)
    logger.info('geometry field created')
    execute_sql(pg_str, addAreaCodes)
    logger.info('areaCode fields added')
    execute_sql(pg_str, test_areacodesView)
    logger.info('csv view Created')
def getJsonData(url, accessToken):
    """
    Get a json response from a url with accesstoken.

    Args:
        1. url: api endpoint
        2. accessToken: acces token generated using the auth helper:
           GetAccessToken().getAccessToken(usertype='employee',
           scopes='TLLS/R')

    Returns:
        parsed json or error message

    NOTE(review): this duplicates an earlier getJsonData definition in the
    mangled source; the later definition shadows the earlier one.
    """
    response = requests.get(url, headers=accessToken)  # Get first page for count
    status = response.status_code
    if status != 200:
        if status not in (404, 401):
            return logger.info('Error status: ' + str(status))
        logger.info('Error status: {} {}'.format(
            str(status), "trying with trailing / ..."))
        response = requests.get(url + '/', headers=accessToken)
    parsed = response.json()
    logger.info("recieved data from {} ".format(url))
    return parsed
def objectstore_connection(config_full_path, config_name,
                           print_config_vars=None):
    """
    Get an objectsctore connection.

    Args:
        1. config_full_path: /path_to_config/config.ini or config.ini if
           in root.
        2. config_name: objectstore
        3. print_config_vars: if set to True: print all variables from the
           config file

    Returns:
        An objectstore connection session.
    """
    # Password must come from the environment, never from the config file.
    assert os.environ['OBJECTSTORE_PASSWORD']
    config = get_config(config_full_path)
    if print_config_vars:
        # Fix: the original referenced an undefined global OBJECTSTORE
        # here, raising NameError.  Log the section's option names instead
        # (names only, so no secrets end up in the log).
        logger.info('config variables.. :{}'.format(
            config.options(config_name)))
    conn = Connection(
        authurl=config.get(config_name, 'AUTHURL'),
        user=config.get(config_name, 'USER'),
        key=os.environ['OBJECTSTORE_PASSWORD'],
        tenant_name=config.get(config_name, 'TENANT_NAME'),
        auth_version=config.get(config_name, 'VERSION'),
        os_options={
            'tenant_id': config.get(config_name, 'TENANT_ID'),
            'region_name': config.get(config_name, 'REGION_NAME'),
            # 'endpoint_type': 'internalURL'
        })
    logger.info('Established successfull connection to {}'.format(
        config.get(config_name, 'TENANT_NAME')))
    return conn
def load_layers(pg_str, layer_names=None):
    """
    Load layers into Postgres using a list of titles of each layer within
    the WFS service.

    Args:
        1. pg_str: psycopg2 connection string::
           'PG:host= port= user= dbname= password='
        2. layer_names: optional list of layer titles to load; defaults to
           the gebieden layers (backward-compatible generalization).

    Result:
        Each layer is fetched from the gebieden WFS and loaded into
        Postgres via wfs2psql.
    """
    # NOTE(review): this block was corrupted in the source (the password
    # masker swallowed code); the default layer list is reconstructed from
    # the values visible in the mangled line.  Confirm against version
    # control.
    layerNames = layer_names or ['stadsdeel', 'buurt', 'buurtcombinatie',
                                 'gebiedsgerichtwerken']
    srsName = 'EPSG:28992'
    for areaName in layerNames:
        WFS = ("https://map.data.amsterdam.nl/maps/gebieden"
               "?REQUEST=GetFeature&SERVICE=wfs&Version=2.0.0"
               "&SRSNAME=" + srsName + "&typename=" + areaName)
        wfs2psql(WFS, pg_str, areaName)
        logger.info(areaName + ' loaded into PG.')
def main():
    """Get an access token, download json from the API and save it to file."""
    # Return all arguments in a list
    args = parser().parse_args()
    logger.info("Getting Access token.")
    access_token = GetAccessToken().getAccessToken(usertype=args.usertype,
                                                   scopes=args.scopes)
    logger.info(
        "Setup temp database to store requests to speed up restart download if network fails."
    )
    # Responses are cached on disk, so a restarted run re-uses pages that
    # were already downloaded.
    requests_cache.install_cache('requests_db', backend='sqlite')
    logger.info("Getting data with Access token.")
    json_data = getJsonData(args.url, access_token)
    logger.info(json_data)
    save_file(json_data, args.output_folder, args.filename)
def main():
    """Download tellus metadata endpoints, then the tellusdata, and save it."""
    # Return all arguments in a list
    args = parser().parse_args()
    logger.info("Getting Access token.")
    accessToken = GetAccessToken().getAccessToken(usertype='employee',
                                                  scopes='TLLS/R')
    logger.info(
        "Setup temp database to store requests to speed up restart download if network fails."
    )
    requests_cache.install_cache('requests_db', backend='sqlite')
    # Metadata endpoints are fetched first; get_data uses them to flatten
    # and annotate the tellusdata rows.
    endpoints = ['tellus', 'snelheidscategorie', 'lengtecategorie']
    metadata = {}
    for endpoint in endpoints:
        json_data = getJsonData(args.url + '/' + endpoint, accessToken)
        # logger.info(json_data)
        metadata.update({endpoint: json_data})
        logger.info("retrieved {}".format(endpoint))
    data = get_data(args.url, 'tellusdata', metadata, accessToken, args.limit)
    save_file(data, args.output_folder, args.filename)
def check_existence_object(connection, container_path, filename):
    """
    Check if the file is present on the objectstore container_path,

    Args:
        1. connection = Objectstore connection based on
           from helpers.connection import objectstore_connection
        2. container_path = Name of container/prefix/subfolder
        3. filename = Name of file, for example test.csv

    Result (logged, not returned):
        - 'The object was successfully created'
        - 'The object was not found'
        - 'Error finding the object'
    """
    try:
        # head_object raises ClientException when the object is missing;
        # the headers themselves were never used, so don't keep them.
        connection.head_object(container_path, filename)
        logger.info('The object was successfully created')
    except ClientException as e:
        # Fix: swiftclient's http_status is an int; the original compared
        # it to the string '404', so the not-found branch was unreachable.
        if e.http_status == 404:
            logger.info('The object was not found')
        else:
            logger.info('An error occurred checking for the existence of the object')