def main():
    """Writes a timestamped CSV and a staging copy for every public dump.

    The dump queries run against the most recent `prod_` and
    `source_internal_profil_` schemas. The dump definitions and the
    output directory are read from `public_dumps.yaml`.
    """

    # Open the connection and resolve the newest schema of each kind.
    db = DatabaseConnection(path_config='db_config.yaml')
    schema = db.get_latest_schema('prod_')
    schema_profil = db.get_latest_schema('source_internal_profil_')

    # Make both schemas visible to the dump queries.
    search_path_sql = ''.join(
        ['SET search_path="', schema, '", "', schema_profil, '";'])
    db.execute(search_path_sql)

    # Whatever follows the last underscore of the prod schema name is
    # used as the timestamp in the dump file names.
    timestamp = schema.rpartition('_')[2]
    print('[OK] Dumping from schemas "%s" and "%s"...' %
          (schema, schema_profil))

    # Load the dump definitions.
    config = yaml_load('public_dumps.yaml')
    dir_save = config['save_directory']
    dumps = config['dumps']

    for dump_name in dumps:
        query = dumps[dump_name]['query']

        # First write the timestamped dump...
        timestamped_name = '%s_%s.csv' % (dump_name, timestamp)
        save_path = os.path.join(dir_save, timestamped_name)
        db.dump_to_CSV(query, save_path)
        print('[OK] Saved dump "%s" to %s' % (dump_name, save_path))

        # ...then copy it to the name without a timestamp.
        stage_path = os.path.join(dir_save, dump_name + '.csv')
        shutil.copyfile(save_path, stage_path)
        print('[OK] Copied dump "%s" to %s' % (dump_name, stage_path))

    # NOTE(review): the connection is meant to be closed here, but no
    # close call is visible in this copy of the function.
Esempio n. 2
def initialise_app():
    """Opens the database connection that all requests to this app share.

    The search path is set to the most recent `prod_` schema followed by
    `public`, and the connection is stored under the `db` key of the
    app registry.
    """

    connection = DatabaseConnection(path_config='db_config.yaml')
    latest_schema = connection.get_latest_schema('prod_')

    # Build the statement first, then run it on the fresh connection.
    set_search_path = ''.join(
        ('SET search_path to ', latest_schema, ',public;'))
    connection.execute(set_search_path)

    app.registry['db'] = connection
Esempio n. 3
def initialise_app():
  """Opens the shared database connection and registers it with the app.

  The search path of the connection is set to the most recent `prod_`
  schema, and the connection is stored under the `db` key of the app
  registry.
  """

  connection = DatabaseConnection()
  latest_schema = connection.get_latest_schema('prod_')
  connection.execute(''.join(('SET search_path to ', latest_schema, ';')))
  app.registry['db'] = connection
Esempio n. 4
def generate_unmovable_asset_count_plots():
    """Generates and saves asset count plots for all persons."""

    # Connect to most recent profil source schema in the database.
    db = DatabaseConnection(path_config='db_config.yaml')
    schema_profil = db.get_latest_schema('source_internal_profil_')
    db.execute('SET search_path="' + schema_profil + '";')

    # Load declarations data from the database.
    # NOTE(review): this copy of the function looks truncated -- the query
    # literal below has no SELECT/FROM/WHERE keywords and is never closed,
    # the grouping loop has no body, the `'font', **{` fragment has no
    # enclosing call, and the final print stops after `%`. Restore the
    # missing lines from the original file before relying on this code.
    declarations = db.query("""
      Persons.FirstName AS firstname,
      Persons.Surname AS surname,
      Persons ON Persons.Id=AssetDeclarations.PersonId
      (num_houses IS NOT NULL) AND
      (num_fields IS NOT NULL) AND
      (num_others IS NOT NULL)

    # Compute range of years present in the declarations.
    years = [declaration['year'] for declaration in declarations]
    years = list(range(min(years), max(years) + 1))

    # Group declarations by person.
    user_declarations = collections.defaultdict(list)
    for declaration in declarations:

    # Matplotlib font
        'font', **{
            'size': 11,
            'sans-serif': 'Arial',
            'family': 'sans-serif'

    # Iterate through all persons, and plot.
    for ui, person_id in enumerate(user_declarations):
        # if person_id != 913:
        #   continue
        declarations = user_declarations[person_id]
        plot_unmovable_asset_counts(declarations, years, DIR_SAVE)
        if ui + 1 == len(user_declarations) or (ui + 1) % 50 == 0:
            print('Plotted %d/%d persons' % (ui + 1, len(user_declarations)))
    print('\nDeploy generated plots using\n'
          'sudo cp %s* '
          '/data/www/' %
def reveal_connection_to_politics(max_relations_to_load, num_contracts,
                                  max_distance, path_output):
    """Reveals connections between recent contract winners and politics.

    max_relations_to_load: Maximum number of relations to load from
        production table `related`. Use a smaller number for faster
        debugging only.
    num_contracts: Number of most recent contracts to analyse.
    max_distance: Maximum distance at which connections are reported.
    path_output: Path where to write the resulting report.

    # NOTE(review): lines appear to be missing from this copy -- the
    # docstring above and the query literal below have no closing triple
    # quotes, the query has lost several keywords and column names, and
    # the final `report_on(` call is cut off. Restore them from the
    # original file before relying on this code.
    # Connect to the database:
    db = DatabaseConnection(path_config='db_config.yaml')
    schema = db.get_latest_schema('prod_')
    db.execute('SET search_path to ' + schema + ';')

    # Load relations and notable eIDs:
    relations = server._initialise_relations(db, max_relations_to_load)
    notable_eids = server._initialise_notable_eids(db)

    # Retrieve most recent contracts with positive price:
    q = """
        supplier_eid, AS supplier_name,
        entities ON
        signed_on IS NOT NULL
        AND signed_on <= now()
        AND (
          contract_price_amount > 0 OR contract_price_total_amount > 0
        AND LIKE '%%.'
        AND NOT LIKE '%%lovensk%%'
      ORDER BY
        signed_on DESC
      LIMIT %s;
    with open(path_output, "w") as file_output:
        rows = db.query(q, [num_contracts])
        for row in tqdm.tqdm(rows):
            report_on(row, relations, notable_eids, max_distance, db,

Esempio n. 6
def initialise_app():
    """Sets up the database connection shared by all requests.

    A single connection serves two schemas at once: the latest production
    schema and the latest profil source schema (followed by `public`).
    The two must be kept consistent with each other for `profilmapping`
    to make sense; to this end, the source_internal_profil_* schema is
    made visible to user `kataster` together with prod data generation.
    """

    db = DatabaseConnection(path_config='db_config.yaml')
    latest_prod = db.get_latest_schema('prod_')
    latest_profil = db.get_latest_schema('source_internal_profil_')

    # Both schema names are double-quoted; `public` goes last.
    statement_parts = (
        'SET search_path="', latest_prod,
        '", "', latest_profil,
        '", public;',
    )
    db.execute(''.join(statement_parts))

    app.registry['db'] = db
Esempio n. 7
  def test_a_shortest_path_of_unit_length(self):
    """Checks that the endpoints of one edge yield a two-node path."""

    # Pick one edge from the latest production schema:
    connection = DatabaseConnection(path_config='db_config.yaml')
    latest_schema = connection.get_latest_schema('prod_')
    connection.execute(''.join(('SET search_path to ', latest_schema, ';')))
    edge = connection.query('SELECT eid, eid_relation FROM related LIMIT 1')[0]
    source, target = int(edge["eid"]), int(edge["eid_relation"])

    # The response must be exactly the two endpoints, in order:
    content = _request_json(
        '/a_shortest_path?eid1=%d&eid2=%d' % (source, target), self)
    print('AShortestPath:\n%s' % content)
    self.assertListEqual(content, [source, target])
Esempio n. 8
def initialise_app(max_relations_to_load):
    """Precomputes values shared across requests to this app.

    The registry property is intended for storing these precomputed
    values, so as to avoid global variables. Three entries are stored:
    `db`, `relations` and `notable_eids`.

    Args:
        max_relations_to_load: Passed through to `_initialise_relations`;
            presumably caps how many relations are loaded (TODO confirm
            against that helper).
    """

    # Connect to the database:
    db = DatabaseConnection(path_config='db_config.yaml')
    schema = db.get_latest_schema('prod_')
    db.execute('SET search_path to ' + schema + ';')
    app.registry['db'] = db

    # Build Relations object and a set of notable eIDs:
    app.registry['relations'] = _initialise_relations(
        db, max_relations_to_load)
    app.registry['notable_eids'] = _initialise_notable_eids(db)
Esempio n. 9
def initialise_app(max_relations_to_load):
    """Precomputes values shared across requests to this app.

  The registry property is intended for storing these precomputed
  values, so as to avoid global variables.

    # NOTE(review): lines appear to be missing from this copy -- the
    # docstring above and the first query literal below have no closing
    # triple quotes, and the `DatabaseConnection(` call for `db_old` is
    # cut off after its first argument. Restore them from the original
    # file before relying on this code.
    # Connect to the database:
    db = DatabaseConnection(path_config='db_config.yaml')
    schema = db.get_latest_schema('prod_')
    db.execute('SET search_path to ' + schema + ';')
    app.registry['db'] = db

    # Retrieve list of relationship edges:
    q = """
      SELECT eid, eid_relation, stakeholder_type_id
      FROM related
      LIMIT %s;
    q_data = [max_relations_to_load]
    edge_list = []
    for row in db.query(q, q_data):
        # A missing (falsy) stakeholder type is stored as 0. Every row
        # yields two directed edges: the forward one keeps the type, the
        # reverse one carries the negated type.
        edge_type = row['stakeholder_type_id'] or 0
        edge_list.append((row['eid'], row['eid_relation'], +1 * edge_type))
        edge_list.append((row['eid_relation'], row['eid'], -1 * edge_type))

    # Construct Relations object from the edge list:
    relations = Relations(edge_list)
    app.registry['relations'] = relations

    # TEMP: Construct Relations using old database data:
    db_old = DatabaseConnection(path_config='db_config_old.yaml',
    app.registry['db_old'] = db_old
    q = """SELECT eid1, eid2, length FROM related LIMIT %s;"""
    q_data = [max_relations_to_load]
    edge_list_old = []
    for row in db_old.query(q, q_data):
        # The old data has one `length` per pair; it is added in both
        # directions with the same float value.
        edge_list_old.append((row['eid1'], row['eid2'], float(row['length'])))
        edge_list_old.append((row['eid2'], row['eid1'], float(row['length'])))
    relations_old = Relations(edge_list_old)
    app.registry['relations_old'] = relations_old
Esempio n. 10
def initialise_app(serving_directory):
    """Initialises the app with precomputed values shared across requests.

    The registry property is intended for this purpose, in order to
    avoid global variables. Three entries are stored: `db`,
    `data_sources` and `entities`.

    Args:
        serving_directory: Not referenced in this version of the
            function.
    """

    # Database connection, pointed at the latest production schema.
    db = DatabaseConnection(path_config='db_config.yaml')
    schema = db.get_latest_schema('prod_')
    db.execute('SET search_path to ' + schema + ';')
    app.registry['db'] = db

    # Data sources, as read from `datasources.yaml`.
    data_sources = yaml_load('datasources.yaml')
    app.registry['data_sources'] = data_sources

    # Entities object.
    entities = Entities()
    app.registry['entities'] = entities
Esempio n. 11
def generate_public_data_dumps(limit=None, verbose=False):
    """Generates the public data dump files from the latest production data.

    Every dump configured in `public_dumps.yaml` is written twice into
    the configured `save_directory`: once as `<name>.csv` and once as
    `<name>_<timestamp>.csv`, where the timestamp is whatever follows the
    last underscore in the name of the latest `prod_` schema.

    Output paths are built with `os.path.join`, so `save_directory` works
    with or without a trailing path separator.

    Args:
        limit: Optional maximum number of rows per dump, appended to each
            query as a `LIMIT` clause. `None` applies no limit.
        verbose: Whether to print progress messages.
    """
    import os  # Function-scope import keeps this block self-contained.

    # Connect to the latest production data schema.
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    schema = db.get_latest_schema('prod_')
    db.execute('SET search_path="' + schema + '";')
    timestamp = schema[schema.rfind('_') + 1:]
    if verbose:
        print('[OK] Dumping from schema "%s"...' % (schema))
    if limit is not None:
        # Printed even when not verbose; presumably so that a row-limited
        # dump is never produced silently.
        print('[WARNING] Dumping with row limit %d!' % (limit))

    # Read YAML configuration file.
    config = yaml_load('public_dumps.yaml')
    dir_save = config['save_directory']
    dumps = config['dumps']

    # Process all dumps.
    for dump_name in dumps:
        # Construct dump query; a trailing `;` is removed so that the
        # LIMIT clause can be appended.
        q = dumps[dump_name]['query']
        q = q.rstrip().rstrip(';')
        if limit is not None:
            q += ' LIMIT %d' % (limit)

        # Dump first without, then with the timestamp in the file name.
        file_names = (
            '%s.csv' % (dump_name,),
            '%s_%s.csv' % (dump_name, timestamp),
        )
        for file_name in file_names:
            path_output = os.path.join(dir_save, file_name)
            db.dump_to_CSV(q, path_output)
            if verbose:
                print('[OK] Created dump "%s" in %s' %
                      (dump_name, path_output))

    # NOTE(review): the connection is meant to be closed here, but no
    # close call is visible in this copy of the function.
Esempio n. 12
def update_JSON_source(source, timestamp, dry_run, verbose):
    """Loads a JSON data source and inserts its rows via `db.execute_values`.

    Every datum loaded from the JSON file becomes one row; the columns are
    the sorted union of keys over all data, and a key missing from a datum
    is stored as an empty string.

    Args:
        source: Mapping read for the keys `path`, `name` and `table_name`.
        timestamp: String appended to the schema name
            (`source_<name>_<timestamp>`).
        dry_run: Only consulted for the "(dry run)" log suffix and the
            trailing `if not dry_run:` check in the visible code.
        verbose: Whether to print progress messages.

    NOTE(review): this copy looks truncated -- the CREATE SCHEMA and
    CREATE TABLE statements assigned to `q` are overwritten without being
    executed, and the final `if not dry_run:` has no body. Restore the
    missing lines from the original file before relying on this code.
    """
    # Load the JSON file
    data = json_load(source['path'])

    # Obtain column names appearing anywhere in the JSON
    columns = sorted(list(set(chain.from_iterable([datum.keys() for datum in data]))))
    if verbose:
        print('Loaded JSON files with %d columns and %d data rows' % (len(columns), len(data)))

    # Reorganise data into a list of tuples, one per datum, ordered like
    # `columns`; keys absent from a datum become ""
    data = [tuple(datum[column] if column in datum else "" for column in columns) for datum in data]

    # Create postgres schema
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    schema = 'source_' + source['name'] + '_' + timestamp
    q = 'CREATE SCHEMA "%s"; SET search_path="%s";' % (schema, schema)

    # Create table containing the actual data from the JSON file; every
    # column is declared as text
    table = source['table_name']
    table_columns = ', '.join(['%s text' % (name) for name in columns])
    q = 'CREATE TABLE %s (%s);' % (table, table_columns)

    # Populate the table with data
    q = 'INSERT INTO ' + table + ' VALUES %s;'
    db.execute_values(q, data)
    if verbose:
        print('Inserted %d rows into %s.%s%s' % (len(data), schema, table, ' (dry run)' if dry_run else ''))

    # Grant privileges to user data for data/SourceDataInfo to work properly
    db.grant_usage_and_select_on_schema(schema, 'data')

    # Commit and close database connection
    if not dry_run:
Esempio n. 13
def update_CSV_source(source, timestamp, dry_run, verbose):
    """Loads a CSV data source and inserts its rows via `db.execute_values`.

    The first CSV row is treated as the header. Pairs of original and
    normalised column names are inserted into `column_names`, and the
    remaining rows into the table named by `source['table_name']`.

    Args:
        source: Mapping read for the keys `path`, `delimiter`, `name` and
            `table_name`.
        timestamp: String appended to the schema name
            (`source_<name>_<timestamp>`).
        dry_run: Only consulted for the "(dry run)" log suffix and the
            trailing `if not dry_run:` check in the visible code.
        verbose: Whether to print progress messages.

    NOTE(review): this copy looks truncated -- the CREATE SCHEMA and
    CREATE TABLE statements assigned to `q` are overwritten without being
    executed, and the final `if not dry_run:` has no body. Restore the
    missing lines from the original file before relying on this code.
    """
    # Load the CSV file
    with open(source['path'], 'r') as f:
        delimiter = str(source['delimiter']) # requires string, not unicode
        reader = csv.reader(f, delimiter=delimiter)

        # Extract column names from header line and then the actual data
        header = next(reader)
        # NOTE(review): `.decode` on the header cells implies byte strings,
        # i.e. Python 2 -- confirm before running under Python 3.
        column_names = [column_name.decode('utf-8') for column_name in header]
        data = [tuple(row) for row in reader]
    if verbose:
        print('Loaded CSV file with %d columns and %d data rows' % (len(column_names), len(data)))

    # Create postgres schema
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    schema = 'source_' + source['name'] + '_' + timestamp
    q = 'CREATE SCHEMA %s; SET search_path="%s";' % (schema, schema)

    # Compute normalised column names, saving original names in a separate table
    # NOTE(review): the result of map() is consumed twice below (in zip and
    # in the CREATE TABLE column list), which relies on map() returning a
    # list rather than a one-shot iterator.
    column_names_normalised = map(normalise_CSV_column_name, column_names)
    q = 'CREATE TABLE column_names (name_original text, name_normalised text);'
    q = """INSERT INTO column_names VALUES %s;"""
    q_data = [(original, normalised) for original, normalised in zip(column_names, column_names_normalised)]
    db.execute_values(q, q_data)

    # Create table containing the actual data from the CSV file; every
    # column is declared as text
    table = source['table_name']
    table_columns = ', '.join(['%s text' % (name) for name in column_names_normalised])
    q = 'CREATE TABLE %s (%s);' % (table, table_columns)

    # Populate the table with data
    q = 'INSERT INTO ' + table + ' VALUES %s;'
    db.execute_values(q, data)
    if verbose:
        print('Inserted %d rows into %s.%s%s' % (len(data), schema, table, ' (dry run)' if dry_run else ''))

    # Grant privileges to user data for data/SourceDataInfo to work properly
    db.grant_usage_and_select_on_schema(schema, 'data')

    # Commit and close database connection
    if not dry_run:
Esempio n. 14
def ProcessSource(db_prod, geocoder, entities, config, test_mode):
    """ Process one source table (read from db_source) using the config and
    performing normalization using the given geocoder and entities lookup.

    The produced data are written into db_prod connection. The function writes
    new entities and addresses in to the Entities and Address tables. It also
    creates and populates supplementary tables as specified by a config.

    # NOTE(review): this copy of the function looks heavily truncated --
    # both docstrings lack their closing triple quotes, several calls are
    # cut off after an opening parenthesis, and a number of `if` branches
    # have lost their bodies or their `else:`/`continue` lines. The code
    # also uses Python 2 print statements. Restore the missing lines from
    # the original file before relying on this code.
    # Connect to the most recent schema from the current source
    db_source = DatabaseConnection(path_config='db_config_update_source.yaml')
    source_schema_name = db_source.get_latest_schema('source_' +
    print "Processing source_schema_name", source_schema_name
    db_source.execute('SET search_path="' + source_schema_name + '";')

    columns_for_table = {}
    with db_prod.dict_cursor() as cur:
        # Create supplementary tables using the provided command.
        # Also store the columns of the table for later use.
        for table in config["tables"]:
            table_config = config["tables"][table]
            columns_for_table[table] = table_config["columns"]

    def AddValuesToTable(columns, values, eid, supplier_eid=None):
        # `+=` extends the lists passed in by the caller in place.
        if eid is not None:
            columns += ["eid"]
            values += [eid]
        if supplier_eid is not None:
            columns += ["supplier_eid"]
            values += [supplier_eid]

        if all(v is None for v in values):
            # Ignore this entry, all meaningful values are None

        # TODO: find out how to build SQL statement properly
        column_names = ",".join(columns)
        values_params = ",".join(["%s"] * (len(columns)))
        command = ("INSERT INTO %s (" + column_names + ") " + "VALUES (" +
                   values_params + ") " + "ON CONFLICT DO NOTHING")
        # NOTE(review): `table` is not a parameter of this helper; it is
        # read from the enclosing scope, where the visible code only binds
        # it as the loop variable over config["tables"] -- confirm which
        # table is intended here.
        with db_prod.dict_cursor() as cur:
            cur.execute(command, [AsIs(table)] + values)

    def AddToTable(row, table, eid, years, supplier_eid=None):
        """ Add values for the given row into the supplementary table 'table'.

        It reads the corresponding values from the row and adds them into the
        table with the corresponding eid.
        columns = list(columns_for_table[table])
        if years:
            for year in years:
                values = []
                columns_per_year = columns[:]
                for column in columns:
                    col_name = column + "_" + str(year)
                    if col_name in row:
                AddValuesToTable(columns_per_year, values, eid)
            values = [row[column] for column in columns]
            AddValuesToTable(columns, values, eid, supplier_eid)

    with db_source.dict_cursor() as cur:
        # Read data using the given command.
        print "Executing SQL command ..."
        suffix_for_testing = ""
        if test_mode:
            suffix_for_testing = " LIMIT 1000"
        cur.execute(config["command"] + suffix_for_testing)
        print "Done."
        # Counters reported at the end of the function.
        missed = 0
        found = 0
        empty = 0

        missed_eid = 0
        found_eid = 0

        missed_addresses = set([])
        for row in cur:
            # Read entries one by one and try to geocode them. If the address
            # lookup succeeds, try to normalize the entities. If it succeeds,
            # insert into Entities and supplementary tables.
            address = ""
            if "address" in row:
                address = row["address"]
                if address is None: continue
            name = ""
            if "name" in row:
                name = row["name"]
                if name is None: continue
            # Sometimes FirstName and Surname are joined. Lets try the simplest splitting on Capital
            # letters.
            if (len(name.split()) == 1):
                name = ' '.join(re.findall('[A-Z][^A-Z]*', name))
            addressId = geocoder.GetAddressId(address.encode("utf8"))
            if addressId is None:
                if address == "":
                    empty += 1
                    if test_mode and missed < 10:
                        print "MISSING ADDRESS", address.encode("utf8")
                    missed += 1
            found += 1

            eid = None
            if config.get("no_entity_id"):
                # TODO(rasto): is the address lookup necessary here?
                eid = None
                eid = entities.GetEntity(row["ico"], name, addressId)

            if found % 20000 == 0:
                print "Progress:", found

            if config.get("save_org_id"):
                entities.AddOrg2Eid(row["org_id"], eid)
            if config.get("use_org_id_as_eid_relation"):
                eid2 = entities.GetEidForOrgId(row["eid_relation"])
                if eid2 is None:
                row["eid_relation"] = eid2
            if config.get("extract_description_from_body"):
                row["body"] = ExtractDescriptionFromBody(row["body"])
            supplier_eid = None
            if config.get("supplier_eid"):
                supplier_address_id = None
                if "supplier_address" in row and not row[
                        "supplier_address"] is None:
                    supplier_address = row["supplier_address"]
                    if supplier_address:
                        supplier_address_id = geocoder.GetAddressId(
                        if supplier_address_id is None:
                            missed += 1
                        empty += 1
                supplier_name = ""
                if "supplier_name" in row and not row["supplier_name"] is None:
                    supplier_name = row["supplier_name"]
                supplier_eid = entities.GetEntity(row["supplier_ico"],
            if table_config.get("strip_html"):
                for strip_html_column in table_config["strip_html"]:
                    if row.get(strip_html_column):
                        row[strip_html_column] = StripHtml(
            if eid is None: missed_eid += 1
            found_eid += 1
            AddToTable(row, table, eid, table_config.get("years"),

    # Summary of the counters collected above.
    print "FOUND", found
    print "MISSED", missed
    print "EMPTY", empty
    print "MISSED UNIQUE", len(missed_addresses)
    print "FOUND EID", found_eid
    print "MISSED EID", missed_eid