Ejemplo n.º 1
0
def update_CSV_source(source, timestamp, dry_run, verbose):
    # Load the CSV file
    with open(source['path'], 'r') as f:
        delimiter = str(source['delimiter']) # requires string, not unicode
        reader = csv.reader(f, delimiter=delimiter)

        # Extract column names from header line and then the actual data
        header = next(reader)
        column_names = [column_name.decode('utf-8') for column_name in header]
        data = [tuple(row) for row in reader]
    if verbose:
        print('Loaded CSV file with %d columns and %d data rows' % (len(column_names), len(data)))

    # Create postgres schema
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    schema = 'source_' + source['name'] + '_' + timestamp
    q = 'CREATE SCHEMA %s; SET search_path="%s";' % (schema, schema)
    db.execute(q)

    # Compute normalised column names, saving original names in a separate table
    column_names_normalised = map(normalise_CSV_column_name, column_names)
    q = 'CREATE TABLE column_names (name_original text, name_normalised text);'
    db.execute(q)
    q = """INSERT INTO column_names VALUES %s;"""
    q_data = [(original, normalised) for original, normalised in zip(column_names, column_names_normalised)]
    db.execute_values(q, q_data)

    # Create table containing the actual data from the CSV file
    table = source['table_name']
    table_columns = ', '.join(['%s text' % (name) for name in column_names_normalised])
    q = 'CREATE TABLE %s (%s);' % (table, table_columns)
    db.execute(q)

    # Populate the table with data
    q = 'INSERT INTO ' + table + ' VALUES %s;'
    db.execute_values(q, data)
    if verbose:
        print('Inserted %d rows into %s.%s%s' % (len(data), schema, table, ' (dry run)' if dry_run else ''))

    # Grant privileges to user data for data/SourceDataInfo to work properly
    db.grant_usage_and_select_on_schema(schema, 'data')

    # Commit and close database connection
    if not dry_run:
        db.commit()
    db.close()
Ejemplo n.º 2
0
def update_JSON_source(source, timestamp, dry_run, verbose):
    # Load the JSON file
    data = json_load(source['path'])

    # Obtain column names appearing anywhere in the JSON
    columns = sorted(list(set(chain.from_iterable([datum.keys() for datum in data]))))
    if verbose:
        print('Loaded JSON files with %d columns and %d data rows' % (len(columns), len(data)))

    # Reorganise data into a list of tuples
    data = [tuple(datum[column] if column in datum else "" for column in columns) for datum in data]

    # Create postgres schema
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    schema = 'source_' + source['name'] + '_' + timestamp
    q = 'CREATE SCHEMA "%s"; SET search_path="%s";' % (schema, schema)
    db.execute(q)

    # Create table containing the actual data from the CSV file
    table = source['table_name']
    table_columns = ', '.join(['%s text' % (name) for name in columns])
    q = 'CREATE TABLE %s (%s);' % (table, table_columns)
    db.execute(q)

    # Populate the table with data
    q = 'INSERT INTO ' + table + ' VALUES %s;'
    db.execute_values(q, data)
    if verbose:
        print('Inserted %d rows into %s.%s%s' % (len(data), schema, table, ' (dry run)' if dry_run else ''))

    # Grant privileges to user data for data/SourceDataInfo to work properly
    db.grant_usage_and_select_on_schema(schema, 'data')

    # Commit and close database connection
    if not dry_run:
        db.commit()
    db.close()