def reveal_connection_to_politics(max_relations_to_load, num_contracts,
                                  max_distance, path_output):
    """Reveals connections between recent contract winners and politics.

    Args:
        max_relations_to_load: Maximum number of relations to load from
            production table `related`. Use a smaller number only for
            faster debugging.
        num_contracts: Number of most recent contracts to analyse.
        max_distance: Maximum distance at which connections are reported.
        path_output: Path to the file where the resulting report is written.
    """

    # Connect to the database:
    db = DatabaseConnection(path_config='db_config.yaml')
    schema = db.get_latest_schema('prod_')
    db.execute('SET search_path to ' + schema + ';')

    # Load relations and notable eIDs:
    relations = server._initialise_relations(db, max_relations_to_load)
    notable_eids = server._initialise_notable_eids(db)

    # Retrieve most recent contracts with positive price:
    q = """
      SELECT
        supplier_eid,
        entities.name AS supplier_name,
        contract_price_amount,
        contract_price_total_amount,
        signed_on,
        effective_from,
        effective_to,
        status_id,
        contract_id
      FROM
        contracts
      INNER JOIN
        entities ON entities.id=contracts.supplier_eid
      WHERE
        signed_on IS NOT NULL
        AND signed_on <= now()
        AND (
          contract_price_amount > 0 OR contract_price_total_amount > 0
        )
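        -- The following name filters presumably restrict results to company
        -- suppliers (names ending in '.', such as 's.r.o.' or 'a.s.') and
        -- exclude entities whose names contain 'lovensk':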
        AND entities.name LIKE '%%.'
        AND entities.name NOT LIKE '%%lovensk%%'
      ORDER BY
        signed_on DESC
      LIMIT %s;
  """
    with open(path_output, "w") as file_output:
        rows = db.query(q, [num_contracts])
        for row in tqdm.tqdm(rows):
            report_on(row, relations, notable_eids, max_distance, db,
                      file_output)

    db.close()
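
# Illustrative call of the function above (a sketch only: the argument values
# are hypothetical, and db_config.yaml must point at a reachable database):
if __name__ == '__main__':
    reveal_connection_to_politics(max_relations_to_load=100000,
                                  num_contracts=100,
                                  max_distance=3,
                                  path_output='/tmp/connections_report.txt')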
Example No. 2
def generate_unmovable_asset_count_plots():
    """Generates and saves asset count plots for all persons."""

    # Connect to most recent profil source schema in the database.
    db = DatabaseConnection(path_config='db_config.yaml')
    schema_profil = db.get_latest_schema('source_internal_profil_')
    db.execute('SET search_path="' + schema_profil + '";')

    # Load declarations data from the database.
    declarations = db.query("""
    SELECT
      PersonId,
      Persons.FirstName AS firstname,
      Persons.Surname AS surname,
      year,
      num_houses,
      num_fields,
      num_others
    FROM
      AssetDeclarations
    INNER JOIN
      Persons ON Persons.Id=AssetDeclarations.PersonId
    WHERE
      (num_houses IS NOT NULL) AND
      (num_fields IS NOT NULL) AND
      (num_others IS NOT NULL)
  ;""")

    # Compute range of years present in the declarations.
    years = [declaration['year'] for declaration in declarations]
    years = list(range(min(years), max(years) + 1))

    # Group declarations by person.
    user_declarations = collections.defaultdict(list)
    for declaration in declarations:
        user_declarations[declaration['personid']].append(declaration)

    # Matplotlib font
    matplotlib.rc(
        'font', **{
            'size': 11,
            'sans-serif': 'Arial',
            'family': 'sans-serif'
        })

    # Iterate through all persons, and plot.
    for ui, person_id in enumerate(user_declarations):
        # if person_id != 913:
        #   continue
        declarations = user_declarations[person_id]
        plot_unmovable_asset_counts(declarations, years, DIR_SAVE)
        if ui + 1 == len(user_declarations) or (ui + 1) % 50 == 0:
            print('Plotted %d/%d persons' % (ui + 1, len(user_declarations)))
    print('\nDeploy generated plots using\n'
          'sudo cp %s* '
          '/data/www/verejne.digital/resources/profil_asset_plots' %
          (DIR_SAVE))
Example No. 3
def initialise_app(max_relations_to_load):
    """Precomputes values shared across requests to this app.

    The registry property is intended for storing these precomputed
    values, so as to avoid global variables.
    """

    # Connect to the database:
    db = DatabaseConnection(path_config='db_config.yaml')
    schema = db.get_latest_schema('prod_')
    db.execute('SET search_path to ' + schema + ';')
    app.registry['db'] = db

    # Retrieve list of relationship edges:
    q = """
      SELECT eid, eid_relation, stakeholder_type_id
      FROM related
      LIMIT %s;
      """
    q_data = [max_relations_to_load]
    edge_list = []
    for row in db.query(q, q_data):
        edge_type = row['stakeholder_type_id'] or 0
        edge_list.append((row['eid'], row['eid_relation'], +1 * edge_type))
        edge_list.append((row['eid_relation'], row['eid'], -1 * edge_type))

    # Construct Relations object from the edge list:
    relations = Relations(edge_list)
    app.registry['relations'] = relations

    # TEMP: Construct Relations using old database data:
    db_old = DatabaseConnection(path_config='db_config_old.yaml',
                                search_path='mysql')
    app.registry['db_old'] = db_old
    q = """SELECT eid1, eid2, length FROM related LIMIT %s;"""
    q_data = [max_relations_to_load]
    edge_list_old = []
    for row in db_old.query(q, q_data):
        edge_list_old.append((row['eid1'], row['eid2'], float(row['length'])))
        edge_list_old.append((row['eid2'], row['eid1'], float(row['length'])))
    relations_old = Relations(edge_list_old)
    app.registry['relations_old'] = relations_old
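
# A minimal sketch of how a request handler might read these precomputed
# values back (assuming a webapp2-style application, where app.registry is a
# plain dict available on handlers as self.app.registry; the handler name and
# route are hypothetical):
import webapp2

class AShortestPath(webapp2.RequestHandler):
    def get(self):
        db = self.app.registry['db']
        relations = self.app.registry['relations']
        # ... use `relations` and `db` to compute and write the response ...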
Example No. 4
def update_SQL_source(source, timestamp, dry_run, verbose):
    # Check that the (temporary) schema names created by this data source
    # do not conflict with existing schemas in the database
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    q = """SELECT schema_name FROM information_schema.schemata WHERE schema_name IN %s LIMIT 1;"""
    q_data = (tuple(source['schemas']),)
    res = db.query(q, q_data, return_dicts=False)
    db.close()
    if len(res) >= 1:
        raise Exception('Schema "%s" that source "%s" reads into already exists' % (res[0][0], source['name']))
    if verbose:
        print('[OK] No conflicting schema names found')

    # Download online resource if a URL is specified, storing it at the
    # location specified in source['path']
    if ('url' in source):
        urllib.urlretrieve(source['url'], source['path'])
        if verbose:
            print('[OK] Downloaded from %s to %s' % (source['url'], source['path']))

    if dry_run:
        print('[WARNING] --dry_run option not implemented for entire pipeline of updating an SQL source')
        return

    # Load into postgres, unzipping along the way
    if source['path'].endswith('.sql.gz'):
        p1 = subprocess.Popen(['gunzip', '-c', source['path']], stdout=subprocess.PIPE)
        subprocess.check_output(['psql', '-d', 'vd', '-q'], stdin=p1.stdout)
    # Load into postgres directly
    else:
        # The options -q -o /dev/null just suppress output
        subprocess.call(['psql', '-d', 'vd', '-f', source['path'], '-q', '-o', '/dev/null'])

    # Rename loaded schema(s) to the desired schema name(s)
    # If there is a single schema, rename it to source_NAME_TIMESTAMP
    # If there are multiple schemas, rename them to source_NAME_SCHEMA_TIMESTAMP
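    # For illustration (hypothetical values): a source named 'foo' with a single
    # schema loaded at timestamp '20190101120000' is renamed to
    # 'source_foo_20190101120000'; if it loads schemas 'a' and 'b', they become
    # 'source_foo_a_20190101120000' and 'source_foo_b_20190101120000'.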
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    if len(source['schemas']) == 1:
        schema_old = source['schemas'][0]
        schema_new = 'source_' + source['name'] + '_' + timestamp
        db.rename_schema(schema_old, schema_new, verbose)
        # Grant privileges to user data for data/SourceDataInfo to work properly
        db.grant_usage_and_select_on_schema(schema_new, 'data')
    else:
        for schema_old in source['schemas']:
            schema_new = 'source_' + source['name'] + '_' + schema_old + '_' + timestamp
            db.rename_schema(schema_old, schema_new, verbose)
            # Grant privileges to user data for data/SourceDataInfo to work properly
            db.grant_usage_and_select_on_schema(schema_new, 'data')

    # Commit and close database connection
    db.commit()
    db.close()
Example No. 5
  def test_a_shortest_path_of_unit_length(self):
    """Tests finding a shortest path between endpoints of an edge."""

    # Find a relation in the database:
    db = DatabaseConnection(path_config='db_config.yaml')
    schema = db.get_latest_schema('prod_')
    db.execute('SET search_path to ' + schema + ';')
    rel = db.query('SELECT eid, eid_relation FROM related LIMIT 1')[0]
    source = int(rel["eid"])
    target = int(rel["eid_relation"])

    # Check that the shortest path of length 1 is found:
    url = '/a_shortest_path?eid1=%d&eid2=%d' % (source, target)
    content = _request_json(url, self)
    print('AShortestPath:\n%s' % (content))
    self.assertListEqual(content, [source, target])
Example No. 6
    def subgraph(self, set_A, set_B):
        # Compute distance of each vertex from A and from B
        dists_A = self.dijkstra(set_A, set_B, return_all=True)
        dists_B = self.dijkstra(set_B, set_A, return_all=True)
        dists_AB = [dists_A[v] for v in set_B if v in dists_A]
        if len(dists_AB) == 0:
            return {'vertices': [], 'edges': []}
        dist_AB = min(dists_AB)

        # Determine subgraph's vertices (eIDs)
        vertices_eids = set()
        vertices_eids.update(set_A)
        vertices_eids.update(set_B)
        tolerance = 1
        for v in dists_A:
            if (v in dists_B) and (dists_A[v] + dists_B[v] <= dist_AB + tolerance):
                vertices_eids.add(v)

        # Obtain entity name for chosen vertices
        db = DatabaseConnection(search_path='mysql')
        q = """
            SELECT eid, entity_name FROM entities
            WHERE entities.eid IN %s;
            """
        q_data = (tuple(vertices_eids),)
        rows = db.query(q, q_data)
        db.close()
        eid_to_name = {row['eid']: row['entity_name'] for row in rows}

        # Add entity names and distances to vertices
        vertices = []
        for eid in vertices_eids:
            vertices.append({
                'eid': eid,
                'entity_name': eid_to_name[eid],
                'distance_from_A': dists_A.get(eid, None),
                'distance_from_B': dists_B.get(eid, None),
                })

        # Build subgraph's edges
        edges = []
        for v1, v2, length in self.edges:
            if (v1 in vertices_eids) and (v2 in vertices_eids):
                edges.append((v1, v2, length))

        return {'vertices': vertices, 'edges': edges}
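
# Illustrative usage of subgraph() (a sketch only: the eIDs are hypothetical
# and `relations` is assumed to be an already constructed instance of the
# surrounding class):
result = relations.subgraph(set_A={101, 102}, set_B={202})
for vertex in result['vertices']:
    print('%s: distance_from_A=%s, distance_from_B=%s' % (
        vertex['entity_name'],
        vertex['distance_from_A'],
        vertex['distance_from_B']))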
Example No. 7
    def __init__(self):
        # Initialise the containers filled below: `edges` holds
        # (source eid, target eid, length) tuples, and `start_index` maps each
        # eid to the index of its first outgoing edge in the sorted edge list.
        self.edges = []
        self.start_index = {}
        log('Connecting to the database...')
        db = DatabaseConnection(path_config='db_config.yaml', search_path='mysql')
        with open("db_config.yaml", "r") as stream:
            config = yaml.load(stream)

        log('Relations constructor...')
        q = """SELECT eid1, eid2, length FROM related LIMIT %s"""
        q_data = [int(config["relations_to_load"])]
        for row in db.query(q, q_data):
            self.edges.append((row["eid1"], row["eid2"], float(row["length"])))
            self.edges.append((row["eid2"], row["eid1"], float(row["length"])))
        db.close()

        log('Sorting edges...')
        self.edges.sort()
        log('Creating start indices...')
        for i in xrange(len(self.edges)):
            cur = self.edges[i][0]
            if cur in self.start_index: continue
            self.start_index[cur] = i
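
# A minimal sketch (not part of the class above) showing how the sorted edge
# list together with `start_index` supports neighbour lookups: all outgoing
# edges of a vertex are stored contiguously, starting at start_index[vertex].
def outgoing_edges(relations, vertex):
    """Yields (target_eid, length) for every edge leaving `vertex`."""
    i = relations.start_index.get(vertex)
    if i is None:
        return
    while i < len(relations.edges) and relations.edges[i][0] == vertex:
        yield relations.edges[i][1], relations.edges[i][2]
        i += 1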