def main():
    """Generates public data dump files from the latest prod data."""

    # Connect to the latest schemas.
    db = DatabaseConnection(path_config='db_config.yaml')
    schema = db.get_latest_schema('prod_')
    schema_profil = db.get_latest_schema('source_internal_profil_')
    db.execute('SET search_path="' + schema + '", "' + schema_profil + '";')
    timestamp = schema[schema.rfind('_') + 1:]
    print('[OK] Dumping from schemas "%s" and "%s"...' %
          (schema, schema_profil))

    # Read YAML configuration file.
    config = yaml_load('public_dumps.yaml')
    dir_save = config['save_directory']
    dumps = config['dumps']

    # Process all dumps.
    for dump_name in dumps:
        save_path = os.path.join(dir_save,
                                 '%s_%s.csv' % (dump_name, timestamp))
        db.dump_to_CSV(dumps[dump_name]['query'], save_path)
        print('[OK] Saved dump "%s" to %s' % (dump_name, save_path))

        stage_path = os.path.join(dir_save, dump_name + '.csv')
        shutil.copyfile(save_path, stage_path)
        print('[OK] Copied dump "%s" to %s' % (dump_name, stage_path))

    # Close database connection.
    db.close()
Example #2
0
def initialise_app():
    """Precomputes values shared across requests to this app."""

    # Connect to the database:
    db = DatabaseConnection(path_config='db_config.yaml')
    schema = db.get_latest_schema('prod_')
    db.execute('SET search_path to ' + schema + ',public;')
    app.registry['db'] = db
Example #3
0
def initialise_app():
  """Stores values shared across requests in the app registry."""

  # Database connection:
  db = DatabaseConnection()
  schema = db.get_latest_schema('prod_')
  db.execute('SET search_path to ' + schema + ';')
  app.registry['db'] = db
Example #4
0
def generate_unmovable_asset_count_plots():
    """Generates and saves asset count plots for all persons."""

    # Connect to most recent profil source schema in the database.
    db = DatabaseConnection(path_config='db_config.yaml')
    schema_profil = db.get_latest_schema('source_internal_profil_')
    db.execute('SET search_path="' + schema_profil + '";')

    # Load declarations data from the database.
    declarations = db.query("""
    SELECT
      PersonId,
      Persons.FirstName AS firstname,
      Persons.Surname AS surname,
      year,
      num_houses,
      num_fields,
      num_others
    FROM
      AssetDeclarations
    INNER JOIN
      Persons ON Persons.Id=AssetDeclarations.PersonId
    WHERE
      (num_houses IS NOT NULL) AND
      (num_fields IS NOT NULL) AND
      (num_others IS NOT NULL)
  ;""")

    # Compute range of years present in the declarations.
    years = [declaration['year'] for declaration in declarations]
    years = list(range(min(years), max(years) + 1))

    # Group declarations by person.
    user_declarations = collections.defaultdict(list)
    for declaration in declarations:
        user_declarations[declaration['personid']].append(declaration)

    # Matplotlib font
    matplotlib.rc(
        'font', **{
            'size': 11,
            'sans-serif': 'Arial',
            'family': 'sans-serif'
        })

    # Iterate through all persons, and plot.
    for ui, person_id in enumerate(user_declarations):
        # if person_id != 913:
        #   continue
        declarations = user_declarations[person_id]
        plot_unmovable_asset_counts(declarations, years, DIR_SAVE)
        if ui + 1 == len(user_declarations) or (ui + 1) % 50 == 0:
            print('Plotted %d/%d persons' % (ui + 1, len(user_declarations)))
    print('\nDeploy generated plots using\n'
          'sudo cp %s* '
          '/data/www/verejne.digital/resources/profil_asset_plots' %
          (DIR_SAVE))
def reveal_connection_to_politics(max_relations_to_load, num_contracts,
                                  max_distance, path_output):
    """Reveals connections between recent contract winners and politics.

  Args:
    max_relations_to_load: Maximum number of relations to load from
        production table `related`. Use a smaller number for faster
        debugging only.
    num_contracts: Number of most recent contracts to analyse.
    max_distance: Maximum distance at which connections are reported.
    path_output: Path where to write the resulting report.
  """

    # Connect to the database:
    db = DatabaseConnection(path_config='db_config.yaml')
    schema = db.get_latest_schema('prod_')
    db.execute('SET search_path to ' + schema + ';')

    # Load relations and notable eIDs:
    relations = server._initialise_relations(db, max_relations_to_load)
    notable_eids = server._initialise_notable_eids(db)

    # Retrieve most recent contracts with positive price:
    q = """
      SELECT
        supplier_eid,
        entities.name AS supplier_name,
        contract_price_amount,
        contract_price_total_amount,
        signed_on,
        effective_from,
        effective_to,
        status_id,
        contract_id
      FROM
        contracts
      INNER JOIN
        entities ON entities.id=contracts.supplier_eid
      WHERE
        signed_on IS NOT NULL
        AND signed_on <= now()
        AND (
          contract_price_amount > 0 OR contract_price_total_amount > 0
        )
        AND entities.name LIKE '%%.'
        AND entities.name NOT LIKE '%%lovensk%%'
      ORDER BY
        signed_on DESC
      LIMIT %s;
  """
    with open(path_output, "w") as file_output:
        rows = db.query(q, [num_contracts])
        for row in tqdm.tqdm(rows):
            report_on(row, relations, notable_eids, max_distance, db,
                      file_output)

    db.close()
Example #6
0
def initialise_app():
    """Precomputes values to be shared across requests."""

    # Maintain database connection into two schemas: the profil source
    # schema, and the latest production schema. Note these two must
    # be kept consistent for `profilmapping` to make sense; to this end,
    # the source_internal_profil_* schema is made visible to user
    # `kataster` together with prod data generation.

    db = DatabaseConnection(path_config='db_config.yaml')
    schema = db.get_latest_schema('prod_')
    schema_profil = db.get_latest_schema('source_internal_profil_')
    db.execute('SET search_path="' + schema + '", "' + schema_profil +
               '", public;')
    app.registry['db'] = db
Example #7
0
  def test_a_shortest_path_of_unit_length(self):
    """Tests finding a shortest path between endpoints of an edge."""

    # Find a relation in the database:
    db = DatabaseConnection(path_config='db_config.yaml')
    schema = db.get_latest_schema('prod_')
    db.execute('SET search_path to ' + schema + ';')
    rel = db.query('SELECT eid, eid_relation FROM related LIMIT 1')[0]
    source = int(rel["eid"])
    target = int(rel["eid_relation"])

    # Check that the shortest path of length 1 is found:
    url = '/a_shortest_path?eid1=%d&eid2=%d' % (source, target)
    content = _request_json(url, self)
    print('AShortestPath:\n%s' % (content))
    self.assertListEqual(content, [source, target])
Example #8
0
def initialise_app(max_relations_to_load):
    """Precomputes values shared across requests to this app.

  The registry property is intended for storing these precomputed
  values, so as to avoid global variables.
  """

    # Connect to the database:
    db = DatabaseConnection(path_config='db_config.yaml')
    schema = db.get_latest_schema('prod_')
    db.execute('SET search_path to ' + schema + ';')
    app.registry['db'] = db

    # Build Relations object and a set of notable eIDs:
    app.registry['relations'] = _initialise_relations(db,
                                                      max_relations_to_load)
    app.registry['notable_eids'] = _initialise_notable_eids(db)
Example #9
0
def initialise_app(max_relations_to_load):
    """Precomputes values shared across requests to this app.

  The registry property is intended for storing these precomputed
  values, so as to avoid global variables.
  """

    # Connect to the database:
    db = DatabaseConnection(path_config='db_config.yaml')
    schema = db.get_latest_schema('prod_')
    db.execute('SET search_path to ' + schema + ';')
    app.registry['db'] = db

    # Retrieve list of relationship edges:
    q = """
      SELECT eid, eid_relation, stakeholder_type_id
      FROM related
      LIMIT %s;
      """
    q_data = [max_relations_to_load]
    edge_list = []
    for row in db.query(q, q_data):
        edge_type = row['stakeholder_type_id'] or 0
        edge_list.append((row['eid'], row['eid_relation'], +1 * edge_type))
        edge_list.append((row['eid_relation'], row['eid'], -1 * edge_type))

    # Construct Relations object from the edge list:
    relations = Relations(edge_list)
    app.registry['relations'] = relations

    # TEMP: Construct Relations using old database data:
    db_old = DatabaseConnection(path_config='db_config_old.yaml',
                                search_path='mysql')
    app.registry['db_old'] = db_old
    q = """SELECT eid1, eid2, length FROM related LIMIT %s;"""
    q_data = [max_relations_to_load]
    edge_list_old = []
    for row in db_old.query(q, q_data):
        edge_list_old.append((row['eid1'], row['eid2'], float(row['length'])))
        edge_list_old.append((row['eid2'], row['eid1'], float(row['length'])))
    relations_old = Relations(edge_list_old)
    app.registry['relations_old'] = relations_old
Example #10
0
def initialise_app(serving_directory):
    """ Procedure for initialising the app with precomputed values that
        are shared across different requests. The registry property is
        intended for this purpose, in order to avoid global variables.
    """

    # database
    db = DatabaseConnection(path_config='db_config.yaml')
    schema = db.get_latest_schema('prod_')
    db.execute('SET search_path to ' + schema + ';')
    app.registry['db'] = db

    # data_sources
    data_sources = yaml_load('datasources.yaml')
    app.registry['data_sources'] = data_sources

    # entities
    entities = Entities()
    entities.loadFromDirectory(serving_directory)
    app.registry['entities'] = entities
Example #11
0
def generate_public_data_dumps(limit=None, verbose=False):
    """ Generates the public data dump files from the latest production data """

    # Connect to the latest production data schema
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    schema = db.get_latest_schema('prod_')
    db.execute('SET search_path="' + schema + '";')
    timestamp = schema[schema.rfind('_') + 1:]
    if verbose:
        print('[OK] Dumping from schema "%s"...' % (schema))
    if limit is not None:
        print('[WARNING] Dumping with row limit %d!' % (limit))

    # Read YAML configuration file
    config = yaml_load('public_dumps.yaml')
    dir_save = config['save_directory']
    dumps = config['dumps']

    # Process all dumps
    for dump_name in dumps:
        # Construct dump query
        q = dumps[dump_name]['query']
        q = q.rstrip().rstrip(';')  # possibly remove ; ending
        if limit is not None:
            q += ' LIMIT %d' % (limit)

        # Dump to CSV without timestamp
        path_output = '%s%s.csv' % (dir_save, dump_name)
        db.dump_to_CSV(q, path_output)
        if verbose:
            print('[OK] Created dump "%s" in %s' % (dump_name, path_output))

        # Dump to CSV with timestamp
        path_output = '%s%s_%s.csv' % (dir_save, dump_name, timestamp)
        db.dump_to_CSV(q, path_output)
        if verbose:
            print('[OK] Created dump "%s" in %s' % (dump_name, path_output))

    # Close database connection
    db.close()
Example #12
0
def update_JSON_source(source, timestamp, dry_run, verbose):
    # Load the JSON file
    data = json_load(source['path'])

    # Obtain column names appearing anywhere in the JSON
    columns = sorted(list(set(chain.from_iterable([datum.keys() for datum in data]))))
    if verbose:
        print('Loaded JSON files with %d columns and %d data rows' % (len(columns), len(data)))

    # Reorganise data into a list of tuples
    data = [tuple(datum[column] if column in datum else "" for column in columns) for datum in data]

    # Create postgres schema
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    schema = 'source_' + source['name'] + '_' + timestamp
    q = 'CREATE SCHEMA "%s"; SET search_path="%s";' % (schema, schema)
    db.execute(q)

    # Create table containing the actual data from the CSV file
    table = source['table_name']
    table_columns = ', '.join(['%s text' % (name) for name in columns])
    q = 'CREATE TABLE %s (%s);' % (table, table_columns)
    db.execute(q)

    # Populate the table with data
    q = 'INSERT INTO ' + table + ' VALUES %s;'
    db.execute_values(q, data)
    if verbose:
        print('Inserted %d rows into %s.%s%s' % (len(data), schema, table, ' (dry run)' if dry_run else ''))

    # Grant privileges to user data for data/SourceDataInfo to work properly
    db.grant_usage_and_select_on_schema(schema, 'data')

    # Commit and close database connection
    if not dry_run:
        db.commit()
    db.close()
Example #13
0
def update_CSV_source(source, timestamp, dry_run, verbose):
    # Load the CSV file
    with open(source['path'], 'r') as f:
        delimiter = str(source['delimiter']) # requires string, not unicode
        reader = csv.reader(f, delimiter=delimiter)

        # Extract column names from header line and then the actual data
        header = next(reader)
        column_names = [column_name.decode('utf-8') for column_name in header]
        data = [tuple(row) for row in reader]
    if verbose:
        print('Loaded CSV file with %d columns and %d data rows' % (len(column_names), len(data)))

    # Create postgres schema
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    schema = 'source_' + source['name'] + '_' + timestamp
    q = 'CREATE SCHEMA %s; SET search_path="%s";' % (schema, schema)
    db.execute(q)

    # Compute normalised column names, saving original names in a separate table
    column_names_normalised = map(normalise_CSV_column_name, column_names)
    q = 'CREATE TABLE column_names (name_original text, name_normalised text);'
    db.execute(q)
    q = """INSERT INTO column_names VALUES %s;"""
    q_data = [(original, normalised) for original, normalised in zip(column_names, column_names_normalised)]
    db.execute_values(q, q_data)

    # Create table containing the actual data from the CSV file
    table = source['table_name']
    table_columns = ', '.join(['%s text' % (name) for name in column_names_normalised])
    q = 'CREATE TABLE %s (%s);' % (table, table_columns)
    db.execute(q)

    # Populate the table with data
    q = 'INSERT INTO ' + table + ' VALUES %s;'
    db.execute_values(q, data)
    if verbose:
        print('Inserted %d rows into %s.%s%s' % (len(data), schema, table, ' (dry run)' if dry_run else ''))

    # Grant privileges to user data for data/SourceDataInfo to work properly
    db.grant_usage_and_select_on_schema(schema, 'data')

    # Commit and close database connection
    if not dry_run:
        db.commit()
    db.close()
Example #14
0
def ProcessSource(db_prod, geocoder, entities, config, test_mode):
    """ Process one source table (read from db_source) using the config and
    performing normalization using the given geocoder and entities lookup.

    The produced data are written into db_prod connection. The function writes
    new entities and addresses in to the Entities and Address tables. It also
    creates and populates supplementary tables as specified by a config.
    """

    # Connect to the most recent schema from the current source
    db_source = DatabaseConnection(path_config='db_config_update_source.yaml')
    source_schema_name = db_source.get_latest_schema('source_' +
                                                     config["source_schema"])
    print "Processing source_schema_name", source_schema_name
    db_source.execute('SET search_path="' + source_schema_name + '";')

    columns_for_table = {}
    with db_prod.dict_cursor() as cur:
        # Create supplementaty tables using the provided command.
        # Also store the columns of the table for later use.
        for table in config["tables"]:
            table_config = config["tables"][table]
            columns_for_table[table] = table_config["columns"]
            cur.execute(table_config["create_command"])

    def AddValuesToTable(columns, values, eid, supplier_eid=None):
        if eid is not None:
            columns += ["eid"]
            values += [eid]
        if supplier_eid is not None:
            columns += ["supplier_eid"]
            values += [supplier_eid]

        if all(v is None for v in values):
            # Ignore this entry, all meaningful values are None
            return

        # TODO: find out how to build SQL statement properly
        column_names = ",".join(columns)
        values_params = ",".join(["%s"] * (len(columns)))
        command = ("INSERT INTO %s (" + column_names + ") " + "VALUES (" +
                   values_params + ") " + "ON CONFLICT DO NOTHING")
        with db_prod.dict_cursor() as cur:
            cur.execute(command, [AsIs(table)] + values)

    def AddToTable(row, table, eid, years, supplier_eid=None):
        """ Add values for the given row into the supplementary table 'table'.

        It reads the corresponding values from the row and adds them into the
        table with the corresponding eid.
        """
        columns = list(columns_for_table[table])
        if years:
            for year in years:
                values = []
                columns_per_year = columns[:]
                for column in columns:
                    col_name = column + "_" + str(year)
                    if col_name in row:
                        values.append(row[col_name])
                    else:
                        values.append(None)
                columns_per_year.append("year")
                values.append(year)
                AddValuesToTable(columns_per_year, values, eid)
        else:
            values = [row[column] for column in columns]
            AddValuesToTable(columns, values, eid, supplier_eid)

    with db_source.dict_cursor() as cur:
        # Read data using the given command.
        print "Executing SQL command ..."
        suffix_for_testing = ""
        if test_mode:
            suffix_for_testing = " LIMIT 1000"
        cur.execute(config["command"] + suffix_for_testing)
        print "Done."
        missed = 0
        found = 0
        empty = 0

        missed_eid = 0
        found_eid = 0

        missed_addresses = set([])
        for row in cur:
            # Read entries one by one and try to geocode them. If the address
            # lookup succeeds, try to normalize the entities. If it succeeds,
            # insert into Entities and supplementary tables.
            address = ""
            if "address" in row:
                address = row["address"]
                if address is None: continue
            name = ""
            if "name" in row:
                name = row["name"]
                if name is None: continue
            # Sometimes FirstName and Surname are joined. Lets try the simplest splitting on Capital
            # letters.
            if (len(name.split()) == 1):
                name = ' '.join(re.findall('[A-Z][^A-Z]*', name))
            addressId = geocoder.GetAddressId(address.encode("utf8"))
            if addressId is None:
                if address == "":
                    empty += 1
                else:
                    if test_mode and missed < 10:
                        print "MISSING ADDRESS", address.encode("utf8")
                    missed_addresses.add(address)
                    missed += 1
                    continue
            found += 1

            eid = None
            if config.get("no_entity_id"):
                # TODO(rasto): is the address lookup necessary here?
                eid = None
            else:
                eid = entities.GetEntity(row["ico"], name, addressId)

            if found % 20000 == 0:
                print "Progress:", found
                sys.stdout.flush()

            if config.get("save_org_id"):
                entities.AddOrg2Eid(row["org_id"], eid)
            if config.get("use_org_id_as_eid_relation"):
                eid2 = entities.GetEidForOrgId(row["eid_relation"])
                if eid2 is None:
                    continue
                row["eid_relation"] = eid2
            if config.get("extract_description_from_body"):
                row["body"] = ExtractDescriptionFromBody(row["body"])
            supplier_eid = None
            if config.get("supplier_eid"):
                supplier_address_id = None
                if "supplier_address" in row and not row[
                        "supplier_address"] is None:
                    supplier_address = row["supplier_address"]
                    if supplier_address:
                        supplier_address_id = geocoder.GetAddressId(
                            supplier_address.encode("utf8"))
                        if supplier_address_id is None:
                            missed_addresses.add(supplier_address)
                            missed += 1
                            continue
                    else:
                        empty += 1
                supplier_name = ""
                if "supplier_name" in row and not row["supplier_name"] is None:
                    supplier_name = row["supplier_name"]
                supplier_eid = entities.GetEntity(row["supplier_ico"],
                                                  supplier_name,
                                                  supplier_address_id)
            if table_config.get("strip_html"):
                for strip_html_column in table_config["strip_html"]:
                    if row.get(strip_html_column):
                        row[strip_html_column] = StripHtml(
                            row[strip_html_column])
            if eid is None: missed_eid += 1
            found_eid += 1
            AddToTable(row, table, eid, table_config.get("years"),
                       supplier_eid)

    print "FOUND", found
    print "MISSED", missed
    print "EMPTY", empty
    print "MISSED UNIQUE", len(missed_addresses)
    print "FOUND EID", found_eid
    print "MISSED EID", missed_eid
    db_source.close()