def main(): """Generates public data dump files from the latest prod data.""" # Connect to the latest schemas. db = DatabaseConnection(path_config='db_config.yaml') schema = db.get_latest_schema('prod_') schema_profil = db.get_latest_schema('source_internal_profil_') db.execute('SET search_path="' + schema + '", "' + schema_profil + '";') timestamp = schema[schema.rfind('_') + 1:] print('[OK] Dumping from schemas "%s" and "%s"...' % (schema, schema_profil)) # Read YAML configuration file. config = yaml_load('public_dumps.yaml') dir_save = config['save_directory'] dumps = config['dumps'] # Process all dumps. for dump_name in dumps: save_path = os.path.join(dir_save, '%s_%s.csv' % (dump_name, timestamp)) db.dump_to_CSV(dumps[dump_name]['query'], save_path) print('[OK] Saved dump "%s" to %s' % (dump_name, save_path)) stage_path = os.path.join(dir_save, dump_name + '.csv') shutil.copyfile(save_path, stage_path) print('[OK] Copied dump "%s" to %s' % (dump_name, stage_path)) # Close database connection. db.close()
def get_source_data_info():
    """ Returns a list of data sources, together with information about the
    latest update (timestamp, list of table names and columns). """

    # Establish connection to the database
    db = DatabaseConnection(path_config='db_config_data.yaml')

    # Iterate through sources listed in sources.json
    sources = json_load('../data/sources.json')
    result = []
    for source in sources:
        # Obtain schema with the last update
        try:
            schema = db.get_latest_schema('source_' + source['name'])
        except Exception as exception:
            print('[WARNING] %s' % (exception))
            continue

        # Store information to be returned
        result.append({
            'description': source['description'],
            'name': source['name'],
            'schema': schema,
            'tables': _get_tables_and_columns_in_schema(db, schema),
            'update': _datetimestr_from_schema(schema),
        })

    # Close database connection and return the result
    db.close()
    return result
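# A hedged sketch of one entry in ../data/sources.json as consumed by
# get_source_data_info() above: only the 'name' and 'description' keys are
# read, and 'name' is used to locate the latest 'source_<name>_*' schema.
# The values are hypothetical.
EXAMPLE_SOURCE_ENTRY = {
    'name': 'example_register',
    'description': 'Hypothetical example data source',
}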
def consolidate_entities(read_only):
    db = DatabaseConnection(path_config='utils/db_config.yaml', search_path='mysql')
    ids1, eids1, edges = consolidate_people(db)
    ids2, eids2 = consolidate_companies(db)
    cur = db.dict_cursor()

    # 1. Reset eids to be equal to ids in entities.
    print "Reset eids"
    cur.execute("update entities set eid = id;")

    # 2. Consolidate people
    print "Update DB with people eids"
    update_eids_of_ids(cur, ids1, eids1)

    # 3. Consolidate companies
    print "Update DB with companies eids"
    update_eids_of_ids(cur, ids2, eids2)

    # 4. Remove neighbour edges
    print "Delete neighbour edges"
    cur.execute("DELETE from related where source=%s", ("neighbour", ))

    # 5. Add new neighbour edges
    add_neighbour_edges(cur, edges)

    # 6. Update related table
    print "Updating related eids"
    cur.execute(
        "UPDATE related SET eid1=entities.eid FROM entities WHERE related.id1=entities.id;"
    )
    cur.execute(
        "UPDATE related SET eid2=entities.eid FROM entities WHERE related.id2=entities.id;"
    )
    cur.close()

    if not read_only:
        db.commit()
    db.close()
def get(self):
    # Return politicians augmented with property counts as JSON
    db = DatabaseConnection(path_config='db_config.yaml', search_path='kataster')
    politicians = get_politicians_with_Folio_counts(db)
    db.close()

    self.returnJSON(politicians)
def reveal_connection_to_politics(max_relations_to_load, num_contracts,
                                  max_distance, path_output):
    """Reveals connections between recent contract winners and politics.

    Args:
        max_relations_to_load: Maximum number of relations to load from
            production table `related`. Use a smaller number for faster
            debugging only.
        num_contracts: Number of most recent contracts to analyse.
        max_distance: Maximum distance at which connections are reported.
        path_output: Path where to write the resulting report.
    """

    # Connect to the database:
    db = DatabaseConnection(path_config='db_config.yaml')
    schema = db.get_latest_schema('prod_')
    db.execute('SET search_path to ' + schema + ';')

    # Load relations and notable eIDs:
    relations = server._initialise_relations(db, max_relations_to_load)
    notable_eids = server._initialise_notable_eids(db)

    # Retrieve most recent contracts with positive price:
    q = """
        SELECT
            supplier_eid,
            entities.name AS supplier_name,
            contract_price_amount,
            contract_price_total_amount,
            signed_on,
            effective_from,
            effective_to,
            status_id,
            contract_id
        FROM contracts
        INNER JOIN entities ON entities.id=contracts.supplier_eid
        WHERE
            signed_on IS NOT NULL
            AND signed_on <= now()
            AND (contract_price_amount > 0 OR contract_price_total_amount > 0)
            AND entities.name LIKE '%%.'
            AND entities.name NOT LIKE '%%lovensk%%'
        ORDER BY signed_on DESC
        LIMIT %s;
        """
    with open(path_output, "w") as file_output:
        rows = db.query(q, [num_contracts])
        for row in tqdm.tqdm(rows):
            report_on(row, relations, notable_eids, max_distance, db,
                      file_output)

    db.close()
def main(args_dict):
    test_mode = not args_dict['disable_test_mode']
    if test_mode:
        print "======================="
        print "=======TEST MODE======="
        print "======================="

    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')

    # Write output into prod_schema_name
    prod_schema_name = "prod_" + timestamp
    print "prod_schema_name", prod_schema_name

    # Create database connections:
    # Read / write address cache from this one
    db_address_cache = DatabaseConnection(
        path_config='db_config_update_source.yaml',
        search_path='address_cache')
    # Write prod tables into this one
    db_prod = DatabaseConnection(path_config='db_config_update_source.yaml')
    CreateAndSetProdSchema(db_prod, prod_schema_name)

    # Initialize geocoder
    geocoder = geocoder_lib.Geocoder(db_address_cache, db_prod, test_mode)
    # Initialize entity lookup
    entities_lookup = entities.Entities(db_prod)

    # The file prod_tables.yaml defines the specifications of the SQL selects
    # that read source data and describes the additional tables to be created.
    with open('prod_tables.yaml', 'r') as stream:
        config = yaml.load(stream)

    # This is where all the population happens!!!
    # Go through all the specified data sources and process them, adding data
    # as needed. We process them in sorted key order!
    for key in sorted(config.keys()):
        config_per_source = config[key]
        print "Working on source:", key
        ProcessSource(db_prod, geocoder, entities_lookup, config_per_source,
                      test_mode)
    print "GEOCODER STATS"
    geocoder.PrintStats()

    # Grant apps read-only access to the newly created schema and the tables
    # within it.
    db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'data')
    db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'verejne')
    db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'kataster')
    db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'prepojenia')
    db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'obstaravania')

    # Commit database changes and close database connections
    db_address_cache.commit()
    db_address_cache.close()
    if test_mode:
        db_prod.conn.rollback()
    else:
        db_prod.commit()
    db_prod.close()
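# A minimal sketch, under the assumption that main() above is invoked from
# the command line, of how an args_dict carrying the 'disable_test_mode' key
# could be produced. This is an illustration only, not necessarily the
# module's actual entry point.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--disable_test_mode', action='store_true',
                        help='Commit into a new prod_ schema instead of rolling back.')
    main(vars(parser.parse_args()))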
def get_prod_data_info():
    """ Return list of tables and column names from the current production
    tables, and the time when these were generated. """

    db = DatabaseConnection(path_config='db_config_data.yaml')
    schema = db.get_latest_schema('prod_')
    response = {
        'schema': schema,
        'tables': _get_tables_and_columns_in_schema(db, schema),
        'update': _datetimestr_from_schema(schema),
    }
    db.close()
    return response
def get(self):
    # Parse politician id
    try:
        politician_id = int(self.request.GET["id"])
    except:
        self.abort(400, detail="Could not parse parameter 'id' as int")

    # Find Parcels owned by this politician in the database
    db = DatabaseConnection(path_config='db_config.yaml', search_path='kataster')
    Parcels = get_Parcels_owned_by_Person(db, politician_id)
    db.close()

    self.returnJSON(Parcels)
def get(self):
    # Parse politician id
    try:
        politician_id = int(self.request.GET["id"])
    except:
        self.abort(400, detail="Could not parse parameter 'id' as int")

    # Find asset declarations of this politician in the database
    db = DatabaseConnection(path_config='db_config.yaml', search_path='kataster')
    declarations = get_asset_declarations(db, politician_id)
    db.close()

    self.returnJSON(declarations)
def get(self):
    # Parse politician id
    try:
        politician_id = int(self.request.GET["id"])
    except:
        self.abort(400, detail="Could not parse parameter 'id' as int")

    # Get politician information from database
    db = DatabaseConnection(path_config='db_config.yaml', search_path='kataster')
    politician = get_politician_by_PersonId(db, politician_id)
    db.close()

    if politician is None:
        self.abort(404, detail="Could not find politician with provided 'id'")

    self.returnJSON(politician)
def update_CSV_source(source, timestamp, dry_run, verbose):
    # Load the CSV file
    with open(source['path'], 'r') as f:
        delimiter = str(source['delimiter'])  # requires string, not unicode
        reader = csv.reader(f, delimiter=delimiter)

        # Extract column names from header line and then the actual data
        header = next(reader)
        column_names = [column_name.decode('utf-8') for column_name in header]
        data = [tuple(row) for row in reader]
    if verbose:
        print('Loaded CSV file with %d columns and %d data rows' % (
            len(column_names), len(data)))

    # Create postgres schema
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    schema = 'source_' + source['name'] + '_' + timestamp
    q = 'CREATE SCHEMA %s; SET search_path="%s";' % (schema, schema)
    db.execute(q)

    # Compute normalised column names, saving original names in a separate table
    column_names_normalised = map(normalise_CSV_column_name, column_names)
    q = 'CREATE TABLE column_names (name_original text, name_normalised text);'
    db.execute(q)
    q = """INSERT INTO column_names VALUES %s;"""
    q_data = [(original, normalised) for original, normalised in
              zip(column_names, column_names_normalised)]
    db.execute_values(q, q_data)

    # Create table containing the actual data from the CSV file
    table = source['table_name']
    table_columns = ', '.join(['%s text' % (name) for name in column_names_normalised])
    q = 'CREATE TABLE %s (%s);' % (table, table_columns)
    db.execute(q)

    # Populate the table with data
    q = 'INSERT INTO ' + table + ' VALUES %s;'
    db.execute_values(q, data)
    if verbose:
        print('Inserted %d rows into %s.%s%s' % (
            len(data), schema, table, ' (dry run)' if dry_run else ''))

    # Grant privileges to user data for data/SourceDataInfo to work properly
    db.grant_usage_and_select_on_schema(schema, 'data')

    # Commit and close database connection
    if not dry_run:
        db.commit()
    db.close()
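# A hedged sketch of the `source` dict that update_CSV_source() above
# expects, listing only the keys the function actually reads ('name',
# 'path', 'delimiter', 'table_name'). The concrete values are hypothetical.
EXAMPLE_CSV_SOURCE = {
    'name': 'example_csv',          # schema becomes source_example_csv_<timestamp>
    'path': 'data/example.csv',     # hypothetical local path
    'delimiter': ',',
    'table_name': 'example_table',
}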
def subgraph(self, set_A, set_B):
    # Compute distance of each vertex from A and from B
    dists_A = self.dijkstra(set_A, set_B, return_all=True)
    dists_B = self.dijkstra(set_B, set_A, return_all=True)
    dists_AB = [dists_A[v] for v in set_B if v in dists_A]
    if len(dists_AB) == 0:
        return {'vertices': [], 'edges': []}
    dist_AB = min(dists_AB)

    # Determine subgraph's vertices (eIDs)
    vertices_eids = set()
    vertices_eids.update(set_A)
    vertices_eids.update(set_B)
    tolerance = 1
    for v in dists_A:
        if (v in dists_B) and (dists_A[v] + dists_B[v] <= dist_AB + tolerance):
            vertices_eids.add(v)

    # Obtain entity name for chosen vertices
    db = DatabaseConnection(search_path='mysql')
    q = """
        SELECT eid, entity_name FROM entities
        WHERE entities.eid IN %s;
        """
    q_data = (tuple(vertices_eids),)
    rows = db.query(q, q_data)
    db.close()
    eid_to_name = {row['eid']: row['entity_name'] for row in rows}

    # Add entity names and distances to vertices
    vertices = []
    for eid in vertices_eids:
        vertices.append({
            'eid': eid,
            'entity_name': eid_to_name[eid],
            'distance_from_A': dists_A.get(eid, None),
            'distance_from_B': dists_B.get(eid, None),
        })

    # Build subgraph's edges
    edges = []
    for v1, v2, length in self.edges:
        if (v1 in vertices_eids) and (v2 in vertices_eids):
            edges.append((v1, v2, length))

    return {'vertices': vertices, 'edges': edges}
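# A hedged illustration of the return value of subgraph() above: a dict with
# 'vertices' (entity records annotated with distances from set_A and set_B)
# and 'edges' (triples of eid1, eid2, length). All concrete values here are
# hypothetical.
EXAMPLE_SUBGRAPH_RESULT = {
    'vertices': [
        {'eid': 123, 'entity_name': 'Example, s.r.o.',
         'distance_from_A': 0, 'distance_from_B': 2},
        {'eid': 456, 'entity_name': 'Example Holding, a.s.',
         'distance_from_A': 2, 'distance_from_B': 0},
    ],
    'edges': [(123, 456, 1.0)],
}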
def __init__(self):
    log('Connecting to the database...')
    db = DatabaseConnection(path_config='db_config.yaml', search_path='mysql')
    with open("db_config.yaml", "r") as stream:
        config = yaml.load(stream)

    # Adjacency list of edges and index of the first edge of each vertex.
    self.edges = []
    self.start_index = {}

    log('Relations constructor...')
    q = """SELECT eid1, eid2, length FROM related LIMIT %s"""
    q_data = [int(config["relations_to_load"])]
    for row in db.query(q, q_data):
        self.edges.append((row["eid1"], row["eid2"], float(row["length"])))
        self.edges.append((row["eid2"], row["eid1"], float(row["length"])))
    db.close()

    log('Sorting edges...')
    self.edges.sort()

    log('Creating start indices...')
    for i in xrange(len(self.edges)):
        cur = self.edges[i][0]
        if cur in self.start_index:
            continue
        self.start_index[cur] = i
def generate_public_data_dumps(limit=None, verbose=False):
    """ Generates the public data dump files from the latest production data. """

    # Connect to the latest production data schema
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    schema = db.get_latest_schema('prod_')
    db.execute('SET search_path="' + schema + '";')
    timestamp = schema[schema.rfind('_') + 1:]
    if verbose:
        print('[OK] Dumping from schema "%s"...' % (schema))
    if limit is not None:
        print('[WARNING] Dumping with row limit %d!' % (limit))

    # Read YAML configuration file
    config = yaml_load('public_dumps.yaml')
    dir_save = config['save_directory']
    dumps = config['dumps']

    # Process all dumps
    for dump_name in dumps:
        # Construct dump query
        q = dumps[dump_name]['query']
        q = q.rstrip().rstrip(';')  # possibly remove ; ending
        if limit is not None:
            q += ' LIMIT %d' % (limit)

        # Dump to CSV without timestamp
        path_output = '%s%s.csv' % (dir_save, dump_name)
        db.dump_to_CSV(q, path_output)
        if verbose:
            print('[OK] Created dump "%s" in %s' % (dump_name, path_output))

        # Dump to CSV with timestamp
        path_output = '%s%s_%s.csv' % (dir_save, dump_name, timestamp)
        db.dump_to_CSV(q, path_output)
        if verbose:
            print('[OK] Created dump "%s" in %s' % (dump_name, path_output))

    # Close database connection
    db.close()
def update_JSON_source(source, timestamp, dry_run, verbose):
    # Load the JSON file
    data = json_load(source['path'])

    # Obtain column names appearing anywhere in the JSON
    columns = sorted(list(set(chain.from_iterable([datum.keys() for datum in data]))))
    if verbose:
        print('Loaded JSON file with %d columns and %d data rows' % (
            len(columns), len(data)))

    # Reorganise data into a list of tuples
    data = [tuple(datum[column] if column in datum else "" for column in columns)
            for datum in data]

    # Create postgres schema
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    schema = 'source_' + source['name'] + '_' + timestamp
    q = 'CREATE SCHEMA "%s"; SET search_path="%s";' % (schema, schema)
    db.execute(q)

    # Create table containing the actual data from the JSON file
    table = source['table_name']
    table_columns = ', '.join(['%s text' % (name) for name in columns])
    q = 'CREATE TABLE %s (%s);' % (table, table_columns)
    db.execute(q)

    # Populate the table with data
    q = 'INSERT INTO ' + table + ' VALUES %s;'
    db.execute_values(q, data)
    if verbose:
        print('Inserted %d rows into %s.%s%s' % (
            len(data), schema, table, ' (dry run)' if dry_run else ''))

    # Grant privileges to user data for data/SourceDataInfo to work properly
    db.grant_usage_and_select_on_schema(schema, 'data')

    # Commit and close database connection
    if not dry_run:
        db.commit()
    db.close()
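# A hedged sketch of the `source` dict that update_JSON_source() above
# expects; only 'name', 'path' and 'table_name' are read. Values are
# hypothetical.
EXAMPLE_JSON_SOURCE = {
    'name': 'example_json',
    'path': 'data/example.json',
    'table_name': 'example_table',
}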
def update_SQL_source(source, timestamp, dry_run, verbose):
    # Check that the (temporary) schema names created by this data source
    # do not conflict with existing schemas in the database
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    q = """SELECT schema_name FROM information_schema.schemata
           WHERE schema_name IN %s LIMIT 1;"""
    q_data = (tuple(source['schemas']),)
    res = db.query(q, q_data, return_dicts=False)
    db.close()
    if len(res) >= 1:
        raise Exception('Schema "%s" that source "%s" reads into already exists' % (
            res[0][0], source['name']))
    if verbose:
        print('[OK] No conflicting schema names found')

    # Download online resource if a URL is specified, storing it at the
    # location specified in source['path']
    if ('url' in source):
        urllib.urlretrieve(source['url'], source['path'])
        if verbose:
            print('[OK] Downloaded from %s to %s' % (source['url'], source['path']))

    if dry_run:
        print('[WARNING] --dry_run option not implemented for the entire '
              'pipeline of updating an SQL source')
        return

    # Load into postgres, unzipping along the way
    if source['path'].endswith('.sql.gz'):
        p1 = subprocess.Popen(['gunzip', '-c', source['path']],
                              stdout=subprocess.PIPE)
        subprocess.check_output(['psql', '-d', 'vd', '-q'], stdin=p1.stdout)
    # Load into postgres directly
    else:
        # The options -q -o /dev/null just suppress output
        subprocess.call(['psql', '-d', 'vd', '-f', source['path'],
                         '-q', '-o', '/dev/null'])

    # Rename loaded schema(s) to the desired schema name(s).
    # If there is a single schema, rename it to source_NAME_TIMESTAMP.
    # If there are multiple schemas, rename them to source_NAME_SCHEMA_TIMESTAMP.
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    if len(source['schemas']) == 1:
        schema_old = source['schemas'][0]
        schema_new = 'source_' + source['name'] + '_' + timestamp
        db.rename_schema(schema_old, schema_new, verbose)
        # Grant privileges to user data for data/SourceDataInfo to work properly
        db.grant_usage_and_select_on_schema(schema_new, 'data')
    else:
        for schema_old in source['schemas']:
            schema_new = 'source_' + source['name'] + '_' + schema_old + '_' + timestamp
            db.rename_schema(schema_old, schema_new, verbose)
            # Grant privileges to user data for data/SourceDataInfo to work properly
            db.grant_usage_and_select_on_schema(schema_new, 'data')

    # Commit and close database connection
    db.commit()
    db.close()
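# A hedged sketch of the `source` dict that update_SQL_source() above
# expects: 'schemas' lists the schema names the SQL dump creates (which are
# then renamed to source_<name>_[<schema>_]<timestamp>), while 'url' is
# optional. The values are hypothetical.
EXAMPLE_SQL_SOURCE = {
    'name': 'example_sql',
    'schemas': ['example_schema'],
    'url': 'https://example.com/dump.sql.gz',  # optional; downloaded to 'path'
    'path': 'data/dump.sql.gz',
}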
def ProcessSource(db_prod, geocoder, entities, config, test_mode):
    """ Process one source table (read from db_source) using the config and
    perform normalization using the given geocoder and entities lookup.

    The produced data are written into the db_prod connection. The function
    writes new entities and addresses into the Entities and Address tables.
    It also creates and populates supplementary tables as specified by the
    config.
    """

    # Connect to the most recent schema from the current source
    db_source = DatabaseConnection(path_config='db_config_update_source.yaml')
    source_schema_name = db_source.get_latest_schema('source_' + config["source_schema"])
    print "Processing source_schema_name", source_schema_name
    db_source.execute('SET search_path="' + source_schema_name + '";')

    columns_for_table = {}
    with db_prod.dict_cursor() as cur:
        # Create supplementary tables using the provided command.
        # Also store the columns of the table for later use.
        for table in config["tables"]:
            table_config = config["tables"][table]
            columns_for_table[table] = table_config["columns"]
            cur.execute(table_config["create_command"])

    def AddValuesToTable(columns, values, eid, supplier_eid=None):
        if eid is not None:
            columns += ["eid"]
            values += [eid]
        if supplier_eid is not None:
            columns += ["supplier_eid"]
            values += [supplier_eid]
        if all(v is None for v in values):
            # Ignore this entry, all meaningful values are None
            return

        # TODO: find out how to build SQL statement properly
        column_names = ",".join(columns)
        values_params = ",".join(["%s"] * (len(columns)))
        command = ("INSERT INTO %s (" + column_names + ") " +
                   "VALUES (" + values_params + ") " +
                   "ON CONFLICT DO NOTHING")
        with db_prod.dict_cursor() as cur:
            cur.execute(command, [AsIs(table)] + values)

    def AddToTable(row, table, eid, years, supplier_eid=None):
        """ Add values for the given row into the supplementary table 'table'.

        It reads the corresponding values from the row and adds them into the
        table with the corresponding eid.
        """
        columns = list(columns_for_table[table])
        if years:
            for year in years:
                values = []
                columns_per_year = columns[:]
                for column in columns:
                    col_name = column + "_" + str(year)
                    if col_name in row:
                        values.append(row[col_name])
                    else:
                        values.append(None)
                columns_per_year.append("year")
                values.append(year)
                AddValuesToTable(columns_per_year, values, eid)
        else:
            values = [row[column] for column in columns]
            AddValuesToTable(columns, values, eid, supplier_eid)

    with db_source.dict_cursor() as cur:
        # Read data using the given command.
        print "Executing SQL command ..."
        suffix_for_testing = ""
        if test_mode:
            suffix_for_testing = " LIMIT 1000"
        cur.execute(config["command"] + suffix_for_testing)
        print "Done."

        missed = 0
        found = 0
        empty = 0
        missed_eid = 0
        found_eid = 0
        missed_addresses = set([])
        for row in cur:
            # Read entries one by one and try to geocode them. If the address
            # lookup succeeds, try to normalize the entities. If it succeeds,
            # insert into Entities and supplementary tables.
            address = ""
            if "address" in row:
                address = row["address"]
                if address is None:
                    continue
            name = ""
            if "name" in row:
                name = row["name"]
                if name is None:
                    continue
            # Sometimes FirstName and Surname are joined. Let's try the
            # simplest splitting on capital letters.
            if (len(name.split()) == 1):
                name = ' '.join(re.findall('[A-Z][^A-Z]*', name))

            addressId = geocoder.GetAddressId(address.encode("utf8"))
            if addressId is None:
                if address == "":
                    empty += 1
                else:
                    if test_mode and missed < 10:
                        print "MISSING ADDRESS", address.encode("utf8")
                    missed_addresses.add(address)
                    missed += 1
                continue
            found += 1

            eid = None
            if config.get("no_entity_id"):
                # TODO(rasto): is the address lookup necessary here?
                eid = None
            else:
                eid = entities.GetEntity(row["ico"], name, addressId)

            if found % 20000 == 0:
                print "Progress:", found
                sys.stdout.flush()

            if config.get("save_org_id"):
                entities.AddOrg2Eid(row["org_id"], eid)
            if config.get("use_org_id_as_eid_relation"):
                eid2 = entities.GetEidForOrgId(row["eid_relation"])
                if eid2 is None:
                    continue
                row["eid_relation"] = eid2
            if config.get("extract_description_from_body"):
                row["body"] = ExtractDescriptionFromBody(row["body"])

            supplier_eid = None
            if config.get("supplier_eid"):
                supplier_address_id = None
                if "supplier_address" in row and row["supplier_address"] is not None:
                    supplier_address = row["supplier_address"]
                    if supplier_address:
                        supplier_address_id = geocoder.GetAddressId(
                            supplier_address.encode("utf8"))
                        if supplier_address_id is None:
                            missed_addresses.add(supplier_address)
                            missed += 1
                            continue
                else:
                    empty += 1
                supplier_name = ""
                if "supplier_name" in row and row["supplier_name"] is not None:
                    supplier_name = row["supplier_name"]
                supplier_eid = entities.GetEntity(
                    row["supplier_ico"], supplier_name, supplier_address_id)

            if table_config.get("strip_html"):
                for strip_html_column in table_config["strip_html"]:
                    if row.get(strip_html_column):
                        row[strip_html_column] = StripHtml(row[strip_html_column])

            if eid is None:
                missed_eid += 1
            found_eid += 1
            AddToTable(row, table, eid, table_config.get("years"), supplier_eid)

    print "FOUND", found
    print "MISSED", missed
    print "EMPTY", empty
    print "MISSED UNIQUE", len(missed_addresses)
    print "FOUND EID", found_eid
    print "MISSED EID", missed_eid
    db_source.close()
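# A hedged sketch of one data-source entry in prod_tables.yaml as consumed by
# ProcessSource() above, shown as the parsed Python dict. Only keys that the
# function actually reads are included; the table, column and query names are
# hypothetical.
EXAMPLE_PROD_TABLES_ENTRY = {
    'source_schema': 'example_register',   # matched against source_example_register_*
    'command': 'SELECT ico, name, address FROM example_table',  # hypothetical select
    'tables': {
        'exampleinfo': {
            'create_command': 'CREATE TABLE exampleinfo (eid INTEGER, info TEXT);',
            'columns': ['info'],
            # Optional keys read via table_config.get(): 'years', 'strip_html'
        },
    },
    # Optional flags read via config.get(): 'no_entity_id', 'save_org_id',
    # 'use_org_id_as_eid_relation', 'extract_description_from_body',
    # 'supplier_eid'
}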