import csv
from itertools import chain

# DatabaseConnection and json_load are project-local helpers, assumed to be
# imported elsewhere in this module.


def update_CSV_source(source, timestamp, dry_run, verbose):
    # Load the CSV file
    with open(source['path'], 'r') as f:
        delimiter = str(source['delimiter'])  # requires string, not unicode
        reader = csv.reader(f, delimiter=delimiter)
        # Extract column names from the header line, then the actual data
        header = next(reader)
        column_names = [column_name.decode('utf-8') for column_name in header]  # Python 2 csv yields byte strings
        data = [tuple(row) for row in reader]
    if verbose:
        print('Loaded CSV file with %d columns and %d data rows' % (len(column_names), len(data)))

    # Create postgres schema
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    schema = 'source_' + source['name'] + '_' + timestamp
    q = 'CREATE SCHEMA "%s"; SET search_path="%s";' % (schema, schema)
    db.execute(q)

    # Compute normalised column names, saving the original names in a separate
    # table. A list comprehension (rather than map) so the result can safely be
    # iterated twice below.
    column_names_normalised = [normalise_CSV_column_name(name) for name in column_names]
    q = 'CREATE TABLE column_names (name_original text, name_normalised text);'
    db.execute(q)
    q = 'INSERT INTO column_names VALUES %s;'
    q_data = list(zip(column_names, column_names_normalised))
    db.execute_values(q, q_data)

    # Create the table containing the actual data from the CSV file
    table = source['table_name']
    table_columns = ', '.join(['%s text' % name for name in column_names_normalised])
    q = 'CREATE TABLE %s (%s);' % (table, table_columns)
    db.execute(q)

    # Populate the table with data
    q = 'INSERT INTO ' + table + ' VALUES %s;'
    db.execute_values(q, data)
    if verbose:
        print('Inserted %d rows into %s.%s%s' % (len(data), schema, table, ' (dry run)' if dry_run else ''))

    # Grant privileges to user "data" for data/SourceDataInfo to work properly
    db.grant_usage_and_select_on_schema(schema, 'data')

    # Commit and close the database connection
    if not dry_run:
        db.commit()
    db.close()
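
# normalise_CSV_column_name is referenced above but not defined in this
# excerpt. The sketch below is a hypothetical stand-in, assuming the helper
# only has to turn raw CSV headers into postgres-safe identifiers; the real
# project helper may behave differently.
import re


def normalise_CSV_column_name(name):
    # Lower-case the header, collapse runs of non-alphanumeric characters
    # into single underscores, and strip leading/trailing underscores.
    name = re.sub(r'[^0-9a-zA-Z]+', '_', name.lower())
    return name.strip('_')
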
def update_JSON_source(source, timestamp, dry_run, verbose):
    # Load the JSON file
    data = json_load(source['path'])

    # Obtain the column names appearing anywhere in the JSON
    columns = sorted(set(chain.from_iterable(datum.keys() for datum in data)))
    if verbose:
        print('Loaded JSON file with %d columns and %d data rows' % (len(columns), len(data)))

    # Reorganise the data into a list of tuples, one per row, filling in
    # missing fields with the empty string
    data = [tuple(datum.get(column, '') for column in columns) for datum in data]

    # Create postgres schema
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    schema = 'source_' + source['name'] + '_' + timestamp
    q = 'CREATE SCHEMA "%s"; SET search_path="%s";' % (schema, schema)
    db.execute(q)

    # Create the table containing the actual data from the JSON file
    table = source['table_name']
    table_columns = ', '.join(['%s text' % name for name in columns])
    q = 'CREATE TABLE %s (%s);' % (table, table_columns)
    db.execute(q)

    # Populate the table with data
    q = 'INSERT INTO ' + table + ' VALUES %s;'
    db.execute_values(q, data)
    if verbose:
        print('Inserted %d rows into %s.%s%s' % (len(data), schema, table, ' (dry run)' if dry_run else ''))

    # Grant privileges to user "data" for data/SourceDataInfo to work properly
    db.grant_usage_and_select_on_schema(schema, 'data')

    # Commit and close the database connection
    if not dry_run:
        db.commit()
    db.close()
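
# Hedged usage sketch: the 'source' dict below and the timestamp format are
# illustrative assumptions, not taken from the project's actual
# configuration. It only shows which keys the functions above read.
if __name__ == '__main__':
    import datetime

    source = {
        'name': 'example',           # hypothetical source name
        'path': 'data/example.csv',  # hypothetical input file
        'delimiter': ',',
        'table_name': 'example_data',
    }
    timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    # dry_run=True exercises the full pipeline but skips the final commit
    update_CSV_source(source, timestamp, dry_run=True, verbose=True)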