Code example #1
def load_xls(cursor, datadir, schema_name, config_path, config_name):
    """Load xlsx into postgres for multiple files"""
    files = os.listdir(datadir)
    files_xls = [f for f in files if f.split('.')[-1] in ('xlsx', 'xls')]
    logger.info(files_xls)

    for filename in files_xls:
        df = pd.read_excel(datadir + '/' + filename, skiprows=1)
        if df.empty:
            logger.info('No data')
            continue
        df.columns = map(str.lower, df.columns)
        logger.info("added " + filename)
        logger.info(df.columns)

        # load the data into pg
        engine = postgres_engine_pandas(config_path, config_name)
        table_name = filename.split('.')[0]
        create_pg_schema(cursor, schema_name)
        df.to_sql(table_name, engine, schema=schema_name,
                  if_exists='replace')  # dtype={geom: Geometry('POINT', srid='4326')}
        logger.info(filename + ' added as ' + table_name)
        create_geoms(cursor, schema_name, table_name, 'x-coordinaat',
                     'y-coordinaat')
        cursor.execute(
            sql.SQL("""ALTER TABLE {}.{} ADD COLUMN id SERIAL PRIMARY KEY;""").
            format(sql.Identifier(schema_name), sql.Identifier(table_name)), )
Code example #2
def psycopg_connection_string(config_full_path, db_config_name):
    """
    Postgres connection string for psycopg2.

    Args:
      1. config_full_path: location of the config.ini file including the name of the file, for example authentication/config.ini
      2. db_config_name: dev or docker to get the ip user/password and port values.

    Returns:
        Returns the psycopg required connection string: 'PG:host= port= user= dbname= password='
    """
    config = get_config(config_full_path)
    logger.info('Config names: {}'.format(config.sections()))
    print(db_config_name)
    host = config.get(db_config_name, 'host')
    logger.info(host)
    port = config.get(db_config_name, 'port')
    user = config.get(db_config_name, 'user')
    dbname = config.get(db_config_name, 'dbname')
    password = config.get(db_config_name, 'password')

    return 'host={} port={} user={} dbname={} password={}'.format(
        host, port, user, dbname, password)
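
A usage sketch (hypothetical config path and section name; assumes a config.ini with a [dev] section holding host, port, user, dbname and password):

import psycopg2

pg_str = psycopg_connection_string('authentication/config.ini', 'dev')
with psycopg2.connect(pg_str) as conn:
    with conn.cursor() as cursor:
        cursor.execute('SELECT 1')  # sanity check that the connection works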
Code example #3
def get_kvk_json(url, params, api_key=None):
    """
    Get a json response from a url, provided params + api_key.
    Args:
        url: api endpoint
        params: kvkNumber, branchNumber, rsin, street, houseNumber, postalCode,
                city, tradeName, or provide lists/dicts of values
        api_key: kvk api_key. add KVK_API_KEY to your ENV variables
    Returns:
        parsed json or error message
    """

    # Prefer the explicit api_key argument; fall back to the environment.
    API_KEY = api_key or os.environ.get('KVK_API_KEY')

    if API_KEY:
        url += '&user_key={}'.format(API_KEY)
    else:
        return logger.error('please provide api_key')

    response = requests.get(url, params)

    try:
        response.raise_for_status() # Raises 'HTTPError', if one occurred
    except requests.exceptions.HTTPError as e:
        raise errors.InvalidResponse(response) from e
    json_response = response_to_json(response)

    logger.info("received data from {} ".format(url))

    return json_response
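
A usage sketch; the search endpoint and query parameter below are illustrative, not confirmed against the KvK API (note the function appends '&user_key=', so the url must already contain a query string):

url = 'https://api.kvk.nl/api/v2/search/companies?q=test'  # hypothetical endpoint
result = get_kvk_json(url, params={'kvkNumber': '12345678'})  # assumes KVK_API_KEY is set in your environment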
Code example #4
def find_resource_id_if_exists(url, dataset_name, file_name):
    metadata = requests.get(url+'/api/3/action/package_show?id='+dataset_name)
    package_metadata = metadata.json()
    for resource in package_metadata['result']['resources']:
        if resource['name'] == file_name:
            logger.info('Found existing filename: {}, will update it now...'.format(resource['name']))
            return resource['id']
Code example #5
def upload_file(connection, container_path, files_directory):
    """
    Upload files to the objectstore.

    Args:
        1. connection = Objectstore connection based on from helpers.connection import objectstore_connection
        2. container_path = Name of container/prefix/subfolder, for example Dataservices/aanvalsplan_schoon/crow
        3. files_directory = directory containing the files to upload, for example: data/

        Uses mime for content_type: https://stackoverflow.com/questions/43580/how-to-find-the-mime-type-of-a-file-in-python

    Result:
        Uploads each file to the objectstore and checks if it exists in the defined container_path.
    """

    files = [
        f for f in listdir(files_directory) if isfile(join(files_directory, f))
    ]
    for filename in files:
        filename_path = os.path.join(files_directory, filename)
        with open(filename_path, 'rb') as contents:
            mime = MimeTypes()
            content_type = mime.guess_type(filename_path)[0]
            logger.info("Found content type '{}'".format(content_type))
            put_object(connection, container_path, filename, contents,
                       content_type)
            check_existence_object(connection, container_path, filename)
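
A usage sketch combining this helper with the objectstore_connection helper shown in a later example; the container path and data directory are the docstring's own examples:

connection = objectstore_connection('authentication/config.ini', 'objectstore')
upload_file(connection, 'Dataservices/aanvalsplan_schoon/crow', 'data')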
Code example #6
File: connections.py Project: Lytrix/data-processing
def psycopg_connection_string(config_full_path, db_config_name):
    """
    Postgres connection string for psycopg2.

    Args:
      1. config_full_path: location of the config.ini file including the name of the file, for example authentication/config.ini
      2. db_config_name: dev or docker to get the ip user/password and port values.

    Returns:
        Returns the psycopg required connection string: 'PG:host= port= user= dbname= password='
    """
    config = get_config(config_full_path)
    logger.info('Config names: {}'.format(config.sections()))
    print(db_config_name)
    host = config.get(db_config_name,'host')
    logger.info(host)
    port = config.get(db_config_name,'port')
    user = config.get(db_config_name,'user')
    dbname = config.get(db_config_name,'dbname')
    password = config.get(db_config_name,'password')

    return 'host={} port={} user={} dbname={} password={}'.format(
        host, port, user, dbname, password
    )
Code example #7
def load_xls(datadir, config_path, db_config_name):
    """Load xlsx into postgres for multiple files"""
    files = os.listdir(datadir)
    files_xls = [f for f in files if f.split('.')[-1] in ('xlsx', 'xls')]
    logger.info(files_xls)

    for filename in files_xls:
        df = pd.read_excel(datadir + '/' + filename)
        if df.empty:
            logger.info('No data')
            continue

        logger.info("added " + filename)
        logger.info(df.columns)

        # load the data into pg
        engine = postgres_engine_pandas(config_path, db_config_name)
        # TODO: link to to_sql function
        table_name = filename.split('.')[0]
        df.to_sql(table_name,
                  engine,
                  if_exists='replace',
                  index=True,
                  index_label='idx'
                  )  # ,dtype={geom: Geometry('POINT', srid='4326')})
        logger.info(filename + ' added as ' + table_name)
Code example #8
File: connections.py Project: Lytrix/data-processing
def objectstore_connection(config_full_path, config_name, print_config_vars=None):
    """
    Get an objectstore connection.

    Args:
        1. config_full_path: /path_to_config/config.ini or config.ini if in root.
        2. config_name: objectstore
        3. print_config_vars: if set to True: print all variables from the config file

    Returns:
        An objectstore connection session.
      """

    assert os.environ['OBJECTSTORE_PASSWORD']

    config = get_config(config_full_path)

    if print_config_vars:
        # Log every key/value pair of the requested config section.
        logger.info('config variables.. :{}'.format(dict(config.items(config_name))))

    conn = Connection(authurl=config.get(config_name, 'AUTHURL'),
                      user=config.get(config_name, 'USER'),
                      key=os.environ['OBJECTSTORE_PASSWORD'],
                      tenant_name=config.get(config_name, 'TENANT_NAME'),
                      auth_version=config.get(config_name, 'VERSION'),
                      os_options={'tenant_id': config.get(config_name, 'TENANT_ID'),
                                  'region_name': config.get(config_name, 'REGION_NAME'),
                                  # 'endpoint_type': 'internalURL'
                                  })
    logger.info('Established successful connection to {}'.format(config.get(config_name, 'TENANT_NAME')))

    return conn
Code example #9
def flatten_rounds(endpoint, key):
    list_uris = get_objects(endpoint, key)
    total_objects = []
    n = 1
    for uri in list_uris:
        inspection_round = get_json(uri)
        # pp.pprint(inspection_round.keys())
        inspections = inspection_round.pop('inspections')
        m = 0
        for inspection in inspections:
            inspection.pop('closingUserDisplayName')
            geojson = inspection['location']['geoJsonFeature'].pop('geometry')
            inspection.update(inspection_round)
            #pp.pprint(inspection.keys())
            results = inspection.pop('results')

            for result in results:
                result.pop('creatingUserDisplayName')
                result.update(inspection)
                result = flatten_json(result)
                result.update(geojson=json.dumps(geojson))
                # pp.pprint(result)
                total_objects.append(result)
            m += 1
            logger.info('{}: {} of {} rounds, {} of {} inspections'.format(
                endpoint, n, len(list_uris), m, len(inspections)))
        n += 1
    df = pd.DataFrame.from_dict(total_objects, orient='columns', dtype=None)
    return df
Code example #10
def getJsonData(url, accessToken):
    """
    Get a json response from a url with accesstoken.

    Args:
        1. url: api endpoint
        2. accessToken: access token generated using the auth helper:
           GetAccessToken().getAccessToken(usertype='employee_plus',
                                           scopes='BRK/RS,BRK/RSN/,BRK/RO')

    Returns:
        parsed json or error message
    """  # noqa
    response = requests.get(url,
                            headers=accessToken)  # Get first page for count
    if response.status_code != 200:
        if response.status_code == 404 or response.status_code == 401:
            logger.info('Error status: {} {}'.format(
                str(response.status_code), "trying with trailing / ..."))
            response = requests.get(url + '/', headers=accessToken)
        else:
            return logger.info('Error status: ' + str(response.status_code))
    jsonData = response.json()
    logger.info("recieved data from {} ".format(url))
    return jsonData
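
A usage sketch, reusing the usertype, scopes and endpoint that appear in other examples on this page; getAccessToken is assumed to return a headers dict, since the function passes it straight to requests.get(headers=...):

accessToken = GetAccessToken().getAccessToken(usertype='employee', scopes='TLLS/R')
json_data = getJsonData('https://api.data.amsterdam.nl/tellus', accessToken)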
Code example #11
def save_geojson(cursor, pg_string, table_name, output_folder):
    create_dir_if_not_exists(output_folder)
    full_path = os.path.join(output_folder, table_name + '.geojson')
    cmd = [
        'ogr2ogr', '-F', 'GeoJSON', full_path, 'PG:' + pg_string, table_name
    ]
    subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate()  # wait for ogr2ogr to finish
    logger.info("Written GeoJSON to: {}".format(full_path))
Code example #12
def get_table_names(mdb_file):
    '''Get table names'''
    table_names = subprocess.Popen(
        ['mdb-tables', '-1', mdb_file],
        stdout=subprocess.PIPE).communicate()[0].decode('utf-8')
    tables = table_names.strip().split('\n')
    logger.info('Tables: {}'.format(tables))
    return tables
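
A usage sketch; the .mdb path is the one used in the create_pg_tables example further below:

tables = get_table_names('../data/beheerassets/noord/vm_stadsdeel_noord.mdb')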
Code example #13
def run_command_sync(cmd, allow_fail=False):
    '''run a shell command and return the command line output'''
    encoding = sys.stdout.encoding
    logger.info('Running %s', scrub(cmd))
    stdout = subprocess.Popen(cmd,
                              stdout=subprocess.PIPE).communicate()[0].replace(
                                  b'\r', b'').decode(encoding).splitlines()
    return stdout
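
A usage sketch, reusing the mdb-schema command from the create_pg_tables example below:

cmd = ['mdb-schema', '-Nnoord', '../data/beheerassets/noord/vm_stadsdeel_noord.mdb', 'postgres']
output_lines = run_command_sync(cmd)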
Code example #14
def import_shapefiles(cursor, pg_string, data_folder, shp_dirs):
    for shp_dir in shp_dirs:
        create_pg_schema(cursor, shp_dir['schema'])
        full_path = os.path.join(data_folder, shp_dir['path'], "*.shp")
        for shp_filename in glob.glob(full_path):
            logger.info('Found: ' + shp_filename + ', saving to Postgres')
            shp2psql(shp_filename, pg_str,
                     shp_filename.split('/')[-1][:-4], shp_dir['schema'])
    # Close the cursor only after all schemas and shapefiles are processed,
    # otherwise the next create_pg_schema call would fail.
    cursor.close()
Code example #15
def save_table_to_postgres(engine, dataframe, tablename):
    """Load a flattened dataframe into a table"""
    logger.info('Loading {} to postgres.'.format(tablename))
    dataframe.to_sql(tablename,
                     engine,
                     if_exists='replace',
                     index=True,
                     index_label='idx')
    logger.info('{} added to postgres.'.format(tablename))
Code example #16
def retrywithtrailingslash(url, access_token):
    response = requests.get(url, headers=access_token)  # Get first page for count
    if response.status_code != 200:
        if response.status_code == 404 or response.status_code == 401:
            logger.info('Error status: {} {}'.format(str(response.status_code), "trying with trailing / ..."))
            response = requests.get(url + '/', headers=access_token)
            return response
        else:
            return logger.info('Error status: ' + str(response.status_code))
    return response
Code example #17
def find_resource_id_if_exists(url, dataset_name, file_name):
    metadata = requests.get(url + '/api/3/action/package_show?id=' +
                            dataset_name)
    package_metadata = metadata.json()
    for resource in package_metadata['result']['resources']:
        if resource['name'] == file_name:
            logger.info(
                'Found existing filename: {}, will update it now...'.format(
                    resource['name']))
            return resource['id']
Code example #18
File: mdb_to_csv.py Project: vri3z/data-processing
def get_tables_mdb(mdb_file):
    """Get the list of table names with "mdb-tables" for a *.mdb file using latin1 as encoding.
    """
    table_names_binary_string = subprocess.Popen(
        ["mdb-tables", "-1", mdb_file],
        stdout=subprocess.PIPE).communicate()[0]
    table_names = table_names_binary_string.decode(
        'latin1')  # other option could be 'ascii'
    tables = table_names.split('\n')
    logger.info("Available tables:{}".format(tables))
    return tables
Code example #19
def create_pg_tables(cursor, schema_name, mdb_file):
    """
    Example command:
    mdb-schema -Nnoord ../data/beheerassets/noord/vm_stadsdeel_noord.mdb postgres
    """
    cmd = ['mdb-schema', '-N' + schema_name, mdb_file, 'postgres']
    get_tables_from_mdb = run_command_sync(cmd)
    schema = cleanup_table_create(get_tables_from_mdb)
    cursor.execute(schema)
    print(schema)
    logger.info('Created tables')
Code example #20
def retrywithtrailingslash(url, access_token):
    response = requests.get(url,
                            headers=access_token)  # Get first page for count
    if response.status_code != 200:
        if response.status_code == 404 or response.status_code == 401:
            logger.info('Error status: {} {}'.format(
                str(response.status_code), "trying with trailing / ..."))
            response = requests.get(url + '/', headers=access_token)
            return response
        else:
            return logger.info('Error status: ' + str(response.status_code))
    return response
Code example #21
def your_first_function(argname1):
    """
    Does some great stuff.

    Args:
        argname1: path/in/store

    Returns:
        A file or check, show some examples.
    """
    something_2 = 'test2'
    logger.info('Success!')
    return something_2
Code example #22
def your_first_function(argname1):
    """
    Does some great stuff.

    Args:
        argname1: path/in/store

    Returns:
        A file or check, show some examples.
    """
    something_2 = 'test2'
    logger.info('Success!')
    return something_2
Code example #23
def get_data(url_api, endpoint, metadata, accessToken, limit):
    """
    Get and flatten all the data from the api.

    Args:
        1. url_api: the main api url::

            https://api.data.amsterdam.nl/tellus
        2. endpoint: one endpoint name::

            tellus

        3. metadata: a list of dictionaries from other endpoints, in this case: for tellus location, speed and length.
        4. accessToken: access token generated using the auth helper: GetAccessToken().getAccessToken()
        5. limit: set the number of pages you want to retrieve, ideal for testing first::

           10

    Returns:
        A list containing multiple items which are all reformatted to a flattened json with added metadata.
    """
    data = []
    url = url_api + '/' + endpoint
    startPage = 1
    has_next_key = False
    nextKey = ""
    cvalues = conversionListCvalues(metadata)
    json_data = getJsonData(url, accessToken)

    number_of_items = json_data['count']
    logger.info("number of items {}".format(number_of_items))
    number_of_pages = int(abs(number_of_items / 100))

    if "next" in json_data["_links"].keys():
        has_next_key = True
        url = json_data["_links"]["next"]
        logger.info(url)
    while has_next_key and startPage < limit:
        response = getJsonData(url, accessToken)
        if "next" in response["_links"].keys():
            url = response["_links"]["next"]
            logger.info(url)
        else:
            has_next_key = False
            # no next_key, stop the loop
        # logger.info('status: ' + str(response.status_code))

        for item in response["_embedded"]:
            #logger.info(item)
            newRow = reformatData(item, metadata['tellus']['_embedded'],
                                  cvalues)
            # Add c-waarde row
            #values = list(newRow.values())
            # append to main data array
            data.append(newRow)
        # json.dump(data, outputFile, indent=4, sort_keys=True)
        logger.info('Page {} of {}'.format(startPage, number_of_pages))
        startPage += 1
    #logger.info(data)
    return data
Code example #24
def getAreaCodes(item, lat, lon):
    """
    Get specific information, like area codes, based on a radius search to the nearest address for a given lat/lon value,
       e.g. https://api.data.amsterdam.nl/geosearch/search/?item=verblijfsobject&lat=52.3731750&lon=4.8924655&radius=50
    It is currently coded to retrieve:
    - "buurt"
    - "buurtcombinatie"
    - "stadsdeel"
    """
    if item in ["buurt", "buurtcombinatie", "stadsdeel"]:
        url = "https://api.data.amsterdam.nl/geosearch/search/?item=%s&lat=%s&lon=%s&radius=1" % (item, lat, lon)
        logger.info(url)
        jsonData = getJson(url)
        logger.info(jsonData)

        if "features" in jsonData and len(jsonData["features"]) > 0:
            uri = jsonData["features"][0]["properties"]["uri"]
            data = getJson(uri)
            if item == "buurt" or item == "buurtcombinatie":
                return [data["volledige_code"], data["naam"]]
            if item == "stadsdeel":
                return [data["code"], data["naam"]]
        else:
            logger.info('Valt buiten Amsterdam')  # "Falls outside Amsterdam"
            return None
    else:
        logger.info("Ongeldig item")
        return None
Code example #25
def load_wfs_layer_into_postgres(pg_str, url_wfs, layer_name, srs, retry_count=3):
    """
    Get a layer from a WFS service and load it into Postgres.
    Args:
        1. pg_str: psycopg2 connection string, ``host= port= user= dbname= password=``
        2. url_wfs: full url of the WFS including https, excluding /?::

            https://map.data.amsterdam.nl/maps/gebieden

        3. layer_name: Title of the layer::

            stadsdeel

        4. srs: coordinate system number, excluding EPSG::

            28992

        5. retry_count: currently unused.

    Returns:
        The layer loaded into postgres
    """  # noqa

    parameters = {
        "REQUEST": "GetFeature",
        "TYPENAME": layer_name,
        "SERVICE": "WFS",
        "VERSION": "2.0.0",
        #"SRSNAME": "EPSG:{}".format(srs)
    }

    logger.info("Requesting data from {}, layer: {}".format(
        url_wfs, layer_name))
    url = url_wfs + '?' + urlencode(parameters)
    srs = "EPSG:{}".format(srs)

    cmd = [
        'ogr2ogr',
        # replace existing table
        '-overwrite',
        # target srs, e.g. 4326 or 28992
        '-t_srs', srs,
        # layer name
        '-nln', layer_name,
        # geometry target column name:
        # new versions of ogr2ogr use different column names,
        # so force it here. (todo make option?)
        '-lco', 'GEOMETRY_NAME=wkb_geometry',
        '-F', 'PostgreSQL', 'PG:' + pg_str,
        url
    ]

    run_command_sync(cmd)
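
A usage sketch with the docstring's own example values; the config path and section name passed to psycopg_connection_string are illustrative:

pg_str = psycopg_connection_string('authentication/config.ini', 'dev')
load_wfs_layer_into_postgres(pg_str,
                             'https://map.data.amsterdam.nl/maps/gebieden',
                             'stadsdeel', 28992)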
Code example #26
def get_data(url_api, endpoint, metadata, accessToken, limit):
    """
    Get and flatten all the data from the api.

    Args:
        1. url_api: the main api url::

            https://api.data.amsterdam.nl/tellus
        2. endpoint: one endpoint name::

            tellus

        3. metadata: a list of dictionaries from other endpoints, in this case: for tellus location, speed and length.
        4. accessToken: access token generated using the auth helper: GetAccessToken().getAccessToken()
        5. limit: set the number of pages you want to retrieve, ideal for testing first::

           10

    Returns:
        A list containing multiple items which are all reformatted to a flattened json with added metadata.
    """
    data = []
    url = url_api + '/' + endpoint
    startPage = 1
    has_next_key = False
    nextKey = ""
    cvalues = conversionListCvalues(metadata)
    json_data = getJsonData(url, accessToken)

    number_of_items = json_data['count']
    logger.info("number of items {}".format(number_of_items))
    number_of_pages = int(abs(number_of_items/100))

    if "next" in json_data["_links"].keys():
        has_next_key = True
        url = json_data["_links"]["next"]
        logger.info(url)
    while has_next_key and startPage < limit:
        response = getJsonData(url, accessToken)
        if "next" in response["_links"].keys():
            url = response["_links"]["next"]
            logger.info(url)
        else:
            has_next_key = False
            # no next_key, stop the loop
        # logger.info('status: ' + str(response.status_code))

        for item in response["_embedded"]:
            #logger.info(item)
            newRow = reformatData(item, metadata['tellus']['_embedded'], cvalues)
            # Add c-waarde row
            #values = list(newRow.values())
            # append to main data array
            data.append(newRow)
        # json.dump(data, outputFile, indent=4, sort_keys=True)
        logger.info('Page {} of {}'.format(startPage, number_of_pages))
        startPage += 1
    #logger.info(data)
    return data
Code example #27
def get_config(full_path):
    """
    Get config file with all login credentials, port numbers, etc.

    Args:
        full_path: provide the full path to the config.ini file, for example authentication/config.ini

    Returns:
        The entire configuration file to use them with ``config.get(config_name, 'AUTHURL')``
     """
    config = configparser.RawConfigParser()
    config.read(full_path)
    logger.info('Found these configs.. {}'.format(config.sections()))

    return config
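
A sketch of a matching config.ini; the section and key names are illustrative, chosen to line up with how psycopg_connection_string reads them:

# authentication/config.ini
#
# [dev]
# host = localhost
# port = 5432
# user = postgres
# dbname = mydb
# password = secret

config = get_config('authentication/config.ini')
host = config.get('dev', 'host')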
Code example #28
File: connections.py Project: Lytrix/data-processing
def get_config(full_path):
    """
    Get config file with all login credentials, port numbers, etc.

    Args:
        full_path: provide the full path to the config.ini file, for example authentication/config.ini

    Returns:
        The entire configuration file to use them with ``config.get(config_name, 'AUTHURL')``
     """
    config = configparser.RawConfigParser()
    config.read(full_path)
    logger.info('Found these configs.. {}'.format(config.sections()))

    return config
Code example #29
def your_second_function(argname1, argname2):
    """
    Does some great stuff.

    Args:
        1. argname1: path/in/store
        2. argname2: your_file_name.txt

    Returns:
        A file or check, show some examples.
    """

    data = argname1
    something = data
    logger.info('Another success!')
    return something
Code example #30
def get_layers_from_wfs(url_wfs):
    """
        Get all layer names in WFS service, print and return them in a list.
    """
    layer_names = []
    parameters = {"REQUEST": "GetCapabilities", "SERVICE": "WFS"}
    getcapabilities = requests.get(url_wfs, params=parameters)
    # print(getcapabilities.text)
    root = ET.fromstring(getcapabilities.text)

    for neighbor in root.iter('{http://www.opengis.net/wfs/2.0}FeatureType'):
        # print(neighbor.tag, neighbor.attrib)
        logger.info("layername: " +
                    neighbor[1].text)  # neighbor[0]==name, neighbor[1]==title
        layer_names.append(neighbor[1].text)
    return layer_names
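
A usage sketch with a WFS endpoint that appears elsewhere in these examples:

layer_names = get_layers_from_wfs('https://map.data.amsterdam.nl/maps/gebieden')
logger.info(layer_names)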
Code example #31
def your_second_function(argname1, argname2):
    """
    Does some great stuff.

    Args:
        1. argname1: path/in/store
        2. argname2: your_file_name.txt

    Returns:
        A file or check, show some examples.
    """

    data = argname1
    something = data
    logger.info('Another success!')
    return something
Code example #32
def create_geoms(cursor, schema_name, table_name, x_name, y_name):
    cursor.execute(
        sql.SQL("""
    ALTER TABLE {0}.{1} DROP COLUMN IF EXISTS geom;
    SELECT AddGeometryColumn ({2},{3},'geom',28992,'POINT',2);
    UPDATE {0}.{1}
    SET geom =
       CASE
          -- FIX Microstation offset. Used MSLINK 9688 Noord to match offset mldnr : E06 with dgn vs access db
          WHEN {4} is not null and {4} < 100000 THEN
            ST_PointFromText('POINT('||{4}+123835.77100000000791624+2023648.989000000059605||' '||{5}+489390.68800000002374873+1658091.672999999951571||')',28992)
          WHEN {4} is not null and {4} > 100000 THEN
            ST_PointFromText('POINT('||{4}||' '||{5}||')',28992)
       END""").format(sql.Identifier(schema_name), sql.Identifier(table_name),
                      sql.Literal(schema_name), sql.Literal(table_name),
                      sql.Identifier(x_name), sql.Identifier(y_name)), )
    logger.info('added geometry column to {}'.format(table_name))
Code example #33
def getJsonData(url, access_token):
    """
    Get a json response from a url with accesstoken.

    Args:
        1. url: api endpoint
        2. access_token: access token generated using the auth helper: GetAccessToken().getAccessToken(usertype='employee_plus', scopes='BRK/RS,BRK/RSN/,BRK/RO')

    Returns:
        parsed json or error message
    """

    response = retrywithtrailingslash(url, access_token)

    json_data = response.json()
    logger.info("recieved data from {} ".format(url))
    return json_data
Code example #34
def get_layers_from_wfs(url_wfs):
    """
        Get all layer names in WFS service, print and return them in a list.
    """
    layer_names = []
    parameters = {"REQUEST": "GetCapabilities",
                  "SERVICE": "WFS"
                  }
    getcapabilities = requests.get(url_wfs, params=parameters)
    # print(getcapabilities.text)
    root = ET.fromstring(getcapabilities.text)

    for neighbor in root.iter('{http://www.opengis.net/wfs/2.0}FeatureType'):
        # print(neighbor.tag, neighbor.attrib)
        logger.info("layername: " + neighbor[1].text)  # neighbor[0]==name, neighbor[1]==title
        layer_names.append(neighbor[1].text)
    return layer_names
Code example #35
def getJsonData(url, access_token):
    """
    Get a json response from a url with accesstoken.

    Args:
        1. url: api endpoint
        2. access_token: access token generated using the auth helper: GetAccessToken().getAccessToken(usertype='employee_plus', scopes='BRK/RS,BRK/RSN/,BRK/RO')

    Returns:
        parsed json or error message
    """

    response = retrywithtrailingslash(url, access_token)

    json_data = response.json()
    logger.info("recieved data from {} ".format(url))
    return json_data
Code example #36
def execute_sql(pg_str, sql):
    """
    Execute a sql query with psycopg2.

    Args:
        1. pg_str: connection string using helper function psycopg_connection_string, returning:``host= port= user= dbname= password=``
        2. sql: SQL string in triple quotes::

            CREATE TABLE foo (bar text)

    Returns:
        Executed sql with conn.cursor().execute(sql)
    """
    with psycopg2.connect(pg_str) as conn:
        logger.info('connected to database')
        with conn.cursor() as cursor:
            logger.info('start executing sql query')
            cursor.execute(sql)
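
A usage sketch, reusing the docstring's own example statement; the config path and section name are illustrative:

pg_str = psycopg_connection_string('authentication/config.ini', 'dev')
execute_sql(pg_str, """CREATE TABLE IF NOT EXISTS foo (bar text)""")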
Code example #37
File: connections.py Project: Lytrix/data-processing
def execute_sql(pg_str, sql):
    """
    Execute a sql query with psycopg2.

    Args:
        1. pg_str: connection string using helper function psycopg_connection_string, returning:``host= port= user= dbname= password=``
        2. sql: SQL string in triple quotes::

            CREATE TABLE foo (bar text)

    Returns:
        Executed sql with conn.cursor().execute(sql)
    """
    with psycopg2.connect(pg_str) as conn:
        logger.info('connected to database')
        with conn.cursor() as cursor:
            logger.info('start executing sql query')
            cursor.execute(sql)
Code example #38
def upload_file_to_ckan(url, dataset_name, file_path):
    """
    Upload a file to the CKAN datastore.

    Args:
        1. url: url of the catalog::

            https://api.data.amsterdam.nl/catalogus

        2. dataset_name: name of the dataset, which can be found on the ckan page url::

            https://api.data.amsterdam.nl/catalogus/dataset/afvalcontainers

        3. file_path: location of the file including filename::

            /path/to/file/to/upload.csv

        The API key (your private user key, found on the user profile page) is read from the CKAN_API_KEY environment variable.

    Returns:
        An uploaded file to the CKAN datastore.
    """

    assert os.environ['CKAN_API_KEY']

    api_key = os.environ['CKAN_API_KEY']
    file_name = file_path.split('/')[-1]
    resource_id = find_resource_id_if_exists(url, dataset_name, file_name)

    if resource_id:
        data_upload_url = url+'/api/action/resource_update'
        data = {"id":resource_id}
    else:
        data_upload_url = url+'/api/action/resource_create'
        data={"package_id": dataset_name,
              "name":file_name}
        logger.info('Uploading {}...'.format(file_name))

    with open(file_path, 'rb') as upload:
        response = requests.post(data_upload_url,
                                 data=data,
                                 headers={"X-CKAN-API-Key": api_key},
                                 files=[('upload', upload)])
    assert response.status_code == requests.codes.ok
    logger.info('Uploaded {} to https://api.data.amsterdam.nl/catalogus/dataset/{}'.format(file_name, dataset_name))
Code example #39
def upload_file_to_ckan(url, dataset_name, file_path):
    """
    Upload a file to the CKAN datastore.

    Args:
        1. url: url of the catalog::

            https://api.data.amsterdam.nl/catalogus

        2. dataset_name: name of the dataset, which can be found on the ckan page url::

            https://api.data.amsterdam.nl/catalogus/dataset/afvalcontainers

        3. file_path: location of the file including filename::

            /path/to/file/to/upload.csv

        The API key (your private user key, found on the user profile page) is read from the CKAN_API_KEY environment variable.

    Returns:
        An uploaded file to the CKAN datastore.
    """

    assert os.environ['CKAN_API_KEY']

    api_key = os.environ['CKAN_API_KEY']
    file_name = file_path.split('/')[-1]
    resource_id = find_resource_id_if_exists(url, dataset_name, file_name)

    if resource_id:
        data_upload_url = url + '/api/action/resource_update'
        data = {"id": resource_id}
    else:
        data_upload_url = url + '/api/action/resource_create'
        data = {"package_id": dataset_name, "name": file_name}
        logger.info('Uploading {}...'.format(file_name))

    with open(file_path, 'rb') as upload:
        response = requests.post(data_upload_url,
                                 data=data,
                                 headers={"X-CKAN-API-Key": api_key},
                                 files=[('upload', upload)])
    assert response.status_code == requests.codes.ok
    logger.info(
        'Uploaded {} to https://api.data.amsterdam.nl/catalogus/dataset/{}'.
        format(file_name, dataset_name))
Code example #40
def put_object(connection, container: str, filename: str, contents,
               content_type: str) -> None:
    """
    Upload a file to objectstore.

    Args:
        1. container: path/in/store
        2. filename: your_file_name.txt
        3. contents: contents of the file, for example from ``with open('your_file_name.txt', 'rb') as contents:``
        4. content_type: 'text/csv', 'application/json', ... Is retrieved by using the mime package.

    Returns:
        A saved file in the container of the objectstore.
    """
    logger.info('Uploading file...')
    connection.put_object(container,
                          filename,
                          contents=contents,
                          content_type=content_type)
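
A usage sketch mirroring the upload_file helper shown earlier; the paths are the docstring's own examples:

from mimetypes import MimeTypes

connection = objectstore_connection('authentication/config.ini', 'objectstore')
with open('data/test.csv', 'rb') as contents:
    content_type = MimeTypes().guess_type('data/test.csv')[0]
    put_object(connection, 'Dataservices/aanvalsplan_schoon/crow', 'test.csv',
               contents, content_type)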
Code example #41
File: mdb_to_csv.py Project: vri3z/data-processing
def dump_mdb_tables_to_csv(mdb_file, output_folder, table_names):
    """Dump each table as a CSV file using "mdb-export"
       and converting " " in table names to "_" for the CSV filenames."""
    create_dir_if_not_exists(output_folder)
    if table_names is None:
        tables = get_tables_mdb(mdb_file)
    else:
        tables = table_names[0].split(',')

    for table in tables:
        if table != '':
            filename = os.path.join(output_folder,
                                    table.replace(" ", "_") + ".csv")
            logger.info("Dumping " + table)
            contents = subprocess.Popen(
                ["mdb-export", mdb_file, table],
                stdout=subprocess.PIPE).communicate()[0]
            with open(filename, 'wb') as csv_file:
                csv_file.write(contents)
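
A usage sketch; passing None as table_names dumps every table that get_tables_mdb finds, and the paths are illustrative:

dump_mdb_tables_to_csv('../data/beheerassets/noord/vm_stadsdeel_noord.mdb',
                       'output', None)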
Code example #42
def executeScriptsFromFile(pg_str, filename):
    """WIP does not work yet"""
    # Open and read the file as a single buffer
    with open(filename, 'r') as fd:
        sqlFile = fd.read()

    # all SQL commands (split on ';')
    sqlCommands = sqlFile.split(';')

    # Execute every command from the input file
    for command in sqlCommands:
        logger.info(command)
        # This will skip and report errors
        # For example, if the tables do not yet exist, this will skip over
        # the DROP TABLE commands
        try:
            execute_sql(pg_str, command)
        except psycopg2.OperationalError as msg:
            logger.info("Command skipped: {}".format(msg))
Code example #43
def load_xls(datadir, config_path, db_config_name):
    """Load xlsx into postgres for multiple files"""
    files = os.listdir(datadir)
    files_xls = [f for f in files if f.split('.')[-1] in ('xlsx', 'xls')]
    logger.info(files_xls)

    for filename in files_xls:
        df = pd.read_excel(datadir + '/' + filename)
        if df.empty:
            logger.info('No data')
            continue

        logger.info("added " + filename)
        logger.info(df.columns)

        # load the data into pg
        engine = postgres_engine_pandas(config_path, db_config_name)
        # TODO: link to to_sql function
        table_name = filename.split('.')[0]
        df.to_sql(table_name, engine, if_exists='replace', index=True, index_label='idx')  # ,dtype={geom: Geometry('POINT', srid='4326')})
        logger.info(filename + ' added as ' + table_name)
Code example #44
def getJsonData(url, accessToken):
    """
    Get a json response from a url with accesstoken.

    Args:
        1. url: api endpoint
        2. accessToken: access token generated using the auth helper: GetAccessToken().getAccessToken(usertype='employee', scopes='TLLS/R')

    Returns:
        parsed json or error message
    """
    response = requests.get(url, headers=accessToken)  # Get first page for count
    if response.status_code != 200:
        if response.status_code == 404 or response.status_code == 401:
            logger.info('Error status: {} {}'.format(str(response.status_code), "trying with trailing / ..."))
            response = requests.get(url + '/', headers=accessToken)
        else:
            return logger.info('Error status: ' + str(response.status_code))
    jsonData = response.json()
    logger.info("recieved data from {} ".format(url))
    return jsonData
Code example #45
def main():
    # Return all arguments in a list
    args = parser().parse_args()

    logger.info("Getting Access token.")
    access_token = GetAccessToken().getAccessToken(usertype=args.usertype, scopes=args.scopes)

    logger.info("Setup temp database to store requests to speed up restart download if network fails.")
    requests_cache.install_cache('requests_db', backend='sqlite')

    logger.info("Getting data with Access token.")
    json_data = getJsonData(args.url, access_token)
    logger.info(json_data)

    save_file(json_data, args.output_folder, args.filename)
Code example #46
def get_layer_from_wfs(url_wfs, layer_name, srs, outputformat):
    """
    Get layer from a wfs service.
    Args:
        1. url_wfs: full url of the WFS including https, excluding /?::

            https://map.data.amsterdam.nl/maps/gebieden

        2. layer_name: Title of the layer::

            stadsdeel

        3. srs: coordinate system number, excluding EPSG::

            28992

        4. outputformat: leave empty to return standard GML, else define json, geojson, txt, shapezip::

            geojson

    Returns:
        The layer in the specified output format.
    """
    parameters = {"REQUEST": "GetFeature",
                  "TYPENAME": layer_name,
                  "SERVICE": "WFS",
                  "VERSION": "2.0.0",
                  "SRSNAME": "EPSG:{}".format(srs),
                  "OUTPUTFORMAT": outputformat
                  }
    logger.info("Requesting data from {}, layer: {}".format(url_wfs, layer_name))
    response = requests.get(url_wfs, params=parameters)
    if outputformat in ('geojson', 'json'):
        geojson = response.json()
        logger.info("{} features returned.".format(str(len(geojson["features"]))))
        return geojson
    return response
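
A usage sketch with the docstring's own example values:

geojson = get_layer_from_wfs('https://map.data.amsterdam.nl/maps/gebieden',
                             'stadsdeel', 28992, 'geojson')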
Code example #47
def load_layers(pg_str):
    """
    Load layers into Postgres using a list of titles of each layer within the WFS service.

    Args:
        pg_str: psycopg2 connection string::

            'PG:host= port= user= dbname= password='
    """
    layerNames = ['stadsdeel',
                  'buurt',
                  'buurtcombinatie',
                  'gebiedsgerichtwerken']

    srsName = 'EPSG:28992'

    for areaName in layerNames:
        WFS = "https://map.data.amsterdam.nl/maps/gebieden?REQUEST=GetFeature&SERVICE=wfs&Version=2.0.0&SRSNAME=" + srsName + "&typename=" + areaName
        wfs2psql(WFS, pg_str, areaName)
        logger.info(areaName + ' loaded into PG.')
Code example #48
def main():
    # Return all arguments in a list
    args = parser().parse_args()
    logger.info("Getting Access token.")
    accessToken = GetAccessToken().getAccessToken(usertype='employee', scopes='TLLS/R')
    logger.info("Setup temp database to store requests to speed up restart download if network fails.")
    requests_cache.install_cache('requests_db', backend='sqlite')

    endpoints = ['tellus', 'snelheidscategorie', 'lengtecategorie']
    metadata = {}
    for endpoint in endpoints:
        json_data = getJsonData(args.url + '/' + endpoint, accessToken)
        # logger.info(json_data)
        metadata.update({endpoint: json_data})
        logger.info("retrieved {}".format(endpoint))
    data = get_data(args.url, 'tellusdata', metadata, accessToken, args.limit)
    save_file(data, args.output_folder, args.filename)