def reveal_connection_to_politics(max_relations_to_load, num_contracts, max_distance, path_output):
    """Reveals connections between recent contract winners and politics.

    Args:
        max_relations_to_load: Maximum number of relations to load from
            production table `related`. Use a smaller number for faster
            debugging only.
        num_contracts: Number of most recent contracts to analyse.
        max_distance: Maximum distance at which connections are reported.
        path_output: Path where to write the resulting report.
    """
    # Open a connection to the latest production schema.
    db = DatabaseConnection(path_config='db_config.yaml')
    schema = db.get_latest_schema('prod_')
    db.execute('SET search_path to ' + schema + ';')

    # Precompute the relations graph and the set of notable eIDs.
    relations = server._initialise_relations(db, max_relations_to_load)
    notable_eids = server._initialise_notable_eids(db)

    # Most recent contracts with a positive price; the LIKE filters
    # restrict suppliers by name (%% escapes a literal % for the driver).
    query = """
        SELECT
            supplier_eid,
            entities.name AS supplier_name,
            contract_price_amount,
            contract_price_total_amount,
            signed_on,
            effective_from,
            effective_to,
            status_id,
            contract_id
        FROM contracts
        INNER JOIN entities ON entities.id=contracts.supplier_eid
        WHERE signed_on IS NOT NULL
            AND signed_on <= now()
            AND (contract_price_amount > 0 OR contract_price_total_amount > 0)
            AND entities.name LIKE '%%.'
            AND entities.name NOT LIKE '%%lovensk%%'
        ORDER BY signed_on DESC
        LIMIT %s;
    """

    # Report on each contract, streaming results into the output file.
    with open(path_output, "w") as file_output:
        for contract in tqdm.tqdm(db.query(query, [num_contracts])):
            report_on(contract, relations, notable_eids, max_distance, db, file_output)
    db.close()
def generate_unmovable_asset_count_plots():
    """Generates and saves asset count plots for all persons."""
    # Connect to the most recent profil source schema in the database.
    db = DatabaseConnection(path_config='db_config.yaml')
    profil_schema = db.get_latest_schema('source_internal_profil_')
    db.execute('SET search_path="' + profil_schema + '";')

    # Load declarations, keeping only rows where all counts are present.
    declarations = db.query("""
        SELECT
            PersonId,
            Persons.FirstName AS firstname,
            Persons.Surname AS surname,
            year,
            num_houses,
            num_fields,
            num_others
        FROM AssetDeclarations
        INNER JOIN Persons ON Persons.Id=AssetDeclarations.PersonId
        WHERE (num_houses IS NOT NULL)
            AND (num_fields IS NOT NULL)
            AND (num_others IS NOT NULL)
        ;""")

    # Full range of years spanned by the declarations.
    declaration_years = [d['year'] for d in declarations]
    years = list(range(min(declaration_years), max(declaration_years) + 1))

    # Group the declarations by person.
    user_declarations = collections.defaultdict(list)
    for d in declarations:
        user_declarations[d['personid']].append(d)

    # Matplotlib font
    matplotlib.rc('font', **{'size': 11, 'sans-serif': 'Arial', 'family': 'sans-serif'})

    # Plot every person's declarations, reporting progress periodically.
    num_persons = len(user_declarations)
    for index, person_id in enumerate(user_declarations, start=1):
        plot_unmovable_asset_counts(user_declarations[person_id], years, DIR_SAVE)
        if index == num_persons or index % 50 == 0:
            print('Plotted %d/%d persons' % (index, num_persons))

    print('\nDeploy generated plots using\n'
          'sudo cp %s* '
          '/data/www/verejne.digital/resources/profil_asset_plots' % (DIR_SAVE))
def initialise_app(max_relations_to_load):
    """Precomputes values shared across requests to this app.

    The registry property is intended for storing these precomputed values,
    so as to avoid global variables.
    """
    # Connect to the latest production schema and keep the connection.
    db = DatabaseConnection(path_config='db_config.yaml')
    schema = db.get_latest_schema('prod_')
    db.execute('SET search_path to ' + schema + ';')
    app.registry['db'] = db

    # Retrieve the relationship edges. Each relation contributes a directed
    # edge in both directions, signed by its stakeholder type (0 if absent).
    rows = db.query(
        """
        SELECT eid, eid_relation, stakeholder_type_id
        FROM related
        LIMIT %s;
        """,
        [max_relations_to_load])
    edge_list = []
    for row in rows:
        edge_type = row['stakeholder_type_id'] or 0
        edge_list.append((row['eid'], row['eid_relation'], edge_type))
        edge_list.append((row['eid_relation'], row['eid'], -edge_type))

    # Construct a Relations object from the edge list.
    app.registry['relations'] = Relations(edge_list)

    # TEMP: Construct Relations using old database data.
    db_old = DatabaseConnection(path_config='db_config_old.yaml', search_path='mysql')
    app.registry['db_old'] = db_old
    edge_list_old = []
    rows_old = db_old.query(
        """SELECT eid1, eid2, length FROM related LIMIT %s;""",
        [max_relations_to_load])
    for row in rows_old:
        length = float(row['length'])
        edge_list_old.append((row['eid1'], row['eid2'], length))
        edge_list_old.append((row['eid2'], row['eid1'], length))
    app.registry['relations_old'] = Relations(edge_list_old)
def update_SQL_source(source, timestamp, dry_run, verbose):
    """Loads an SQL dump data source into postgres and renames its schemas.

    Args:
        source: Dict describing the data source; uses keys 'name', 'schemas',
            'path' and (optionally) 'url'.
        timestamp: String appended to the renamed schema name(s).
        dry_run: If True, stops after the download step (full dry-run pipeline
            is not implemented).
        verbose: If True, prints progress messages.

    Raises:
        Exception: If any schema the source loads into already exists.
    """
    # Check that the (temporary) schema names created by this data source
    # do not conflict with existing schemas in the database.
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    q = """SELECT schema_name FROM information_schema.schemata
           WHERE schema_name IN %s LIMIT 1;"""
    q_data = (tuple(source['schemas']),)
    res = db.query(q, q_data, return_dicts=False)
    db.close()
    if len(res) >= 1:
        raise Exception('Schema "%s" that source "%s" reads into already exists' % (res[0][0], source['name']))
    if verbose:
        print('[OK] No conflicting schema names found')

    # Download online resource if a URL is specified, storing it at the
    # location specified in source['path'].
    if 'url' in source:
        urllib.urlretrieve(source['url'], source['path'])
        if verbose:
            print('[OK] Downloaded from %s to %s' % (source['url'], source['path']))

    if dry_run:
        print('[WARNING] --dry_run option not implemented for entire pipeline of updating an SQL source')
        # FIX: removed a redundant db.close() here; the connection was
        # already closed right after the conflict check above.
        return

    # Load into postgres, unzipping along the way.
    if source['path'].endswith('.sql.gz'):
        p1 = subprocess.Popen(['gunzip', '-c', source['path']],
                              stdout=subprocess.PIPE)
        subprocess.check_output(['psql', '-d', 'vd', '-q'], stdin=p1.stdout)
        # FIX: close our copy of the pipe and reap the gunzip process;
        # previously the child was left as a zombie and the fd leaked.
        p1.stdout.close()
        p1.wait()
    # Load into postgres directly.
    else:
        # The options -q -o /dev/null just suppress output.
        subprocess.call(['psql', '-d', 'vd', '-f', source['path'], '-q', '-o', '/dev/null'])

    # Rename loaded schema(s) to the desired schema name(s):
    # - a single schema becomes source_NAME_TIMESTAMP;
    # - multiple schemas become source_NAME_SCHEMA_TIMESTAMP.
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    if len(source['schemas']) == 1:
        schema_old = source['schemas'][0]
        schema_new = 'source_' + source['name'] + '_' + timestamp
        db.rename_schema(schema_old, schema_new, verbose)
        # Grant privileges to user data for data/SourceDataInfo to work properly.
        db.grant_usage_and_select_on_schema(schema_new, 'data')
    else:
        for schema_old in source['schemas']:
            schema_new = 'source_' + source['name'] + '_' + schema_old + '_' + timestamp
            db.rename_schema(schema_old, schema_new, verbose)
            # Grant privileges to user data for data/SourceDataInfo to work properly.
            db.grant_usage_and_select_on_schema(schema_new, 'data')

    # Commit and close database connection.
    db.commit()
    db.close()
def test_a_shortest_path_of_unit_length(self):
    """Tests finding a shortest path between endpoints of an edge."""
    # Grab an arbitrary edge from the production `related` table:
    db = DatabaseConnection(path_config='db_config.yaml')
    schema = db.get_latest_schema('prod_')
    db.execute('SET search_path to ' + schema + ';')
    relation = db.query('SELECT eid, eid_relation FROM related LIMIT 1')[0]
    source = int(relation["eid"])
    target = int(relation["eid_relation"])

    # The shortest path between an edge's endpoints is that edge itself,
    # so the API should return exactly the two endpoints:
    url = '/a_shortest_path?eid1=%d&eid2=%d' % (source, target)
    content = _request_json(url, self)
    print('AShortestPath:\n%s' % (content))
    self.assertListEqual(content, [source, target])
def subgraph(self, set_A, set_B):
    """Returns the subgraph spanned by near-shortest paths between A and B.

    Args:
        set_A: Collection of source eIDs.
        set_B: Collection of target eIDs.

    Returns:
        Dict with keys:
            'vertices': list of dicts with the vertex eid, its entity name,
                and its distances from A and from B (None if unreachable);
            'edges': list of (v1, v2, length) triples between chosen vertices.
        Both lists are empty if no vertex of B is reachable from A.
    """
    # Compute distance of each vertex from A and from B.
    dists_A = self.dijkstra(set_A, set_B, return_all=True)
    dists_B = self.dijkstra(set_B, set_A, return_all=True)

    # Distances from A to the reachable vertices of B.
    dists_AB = [dists_A[v] for v in set_B if v in dists_A]
    if len(dists_AB) == 0:
        return {'vertices': [], 'edges': []}
    # FIX: reuse the list computed above instead of rebuilding the same
    # comprehension a second time inside min().
    dist_AB = min(dists_AB)

    # Determine subgraph's vertices (eIDs): the endpoint sets, plus every
    # vertex on a path at most `tolerance` longer than the shortest A-B path.
    vertices_eids = set()
    vertices_eids.update(set_A)
    vertices_eids.update(set_B)
    tolerance = 1
    for v in dists_A:
        if (v in dists_B) and (dists_A[v] + dists_B[v] <= dist_AB + tolerance):
            vertices_eids.add(v)

    # Obtain entity name for chosen vertices.
    db = DatabaseConnection(search_path='mysql')
    q = """
        SELECT eid, entity_name FROM entities
        WHERE entities.eid IN %s;
    """
    q_data = (tuple(vertices_eids),)
    rows = db.query(q, q_data)
    db.close()
    eid_to_name = {row['eid']: row['entity_name'] for row in rows}

    # Add entity names and distances to vertices.
    # NOTE(review): eid_to_name[eid] raises KeyError if a chosen eid is
    # missing from `entities` — presumably the table covers all eIDs; verify.
    vertices = []
    for eid in vertices_eids:
        vertices.append({
            'eid': eid,
            'entity_name': eid_to_name[eid],
            'distance_from_A': dists_A.get(eid, None),
            'distance_from_B': dists_B.get(eid, None),
        })

    # Build subgraph's edges: all edges between chosen vertices.
    edges = []
    for v1, v2, length in self.edges:
        if (v1 in vertices_eids) and (v2 in vertices_eids):
            edges.append((v1, v2, length))

    return {'vertices': vertices, 'edges': edges}
def __init__(self):
    # Loads the `related` edge list from the database, stores each relation
    # as two directed edges, then sorts the edges and builds an index of the
    # first edge position for each source vertex (for fast neighbour lookup).
    log('Connecting to the database...')
    db = DatabaseConnection(path_config='db_config.yaml', search_path='mysql')
    # NOTE(review): yaml.load without an explicit Loader can execute
    # arbitrary constructors; db_config.yaml is a local config file here,
    # but consider yaml.safe_load.
    with open("db_config.yaml", "r") as stream:
        config = yaml.load(stream)
    log('Relations constructor...')
    # Limit the number of relations loaded, as configured.
    q = """SELECT eid1, eid2, length FROM related LIMIT %s"""
    q_data = [int(config["relations_to_load"])]
    # Each relation becomes two directed edges (one in each direction).
    # NOTE(review): self.edges and self.start_index are never initialised in
    # this constructor — presumably they are class attributes defined
    # elsewhere; if so, mutable class-level containers would be shared
    # across instances. Confirm against the class definition.
    for row in db.query(q, q_data):
        self.edges.append((row["eid1"], row["eid2"], float(row["length"])))
        self.edges.append((row["eid2"], row["eid1"], float(row["length"])))
    db.close()
    log('Sorting edges...')
    # Sort by source vertex so all edges leaving a vertex are contiguous.
    self.edges.sort()
    log('Creating start indices...')
    # start_index[v] = index of the first edge leaving vertex v in self.edges.
    for i in xrange(len(self.edges)):
        cur = self.edges[i][0]
        if cur in self.start_index:
            continue
        self.start_index[cur] = i