def main():
    """Generates public data dump files from the latest prod data."""

    # Connect to the latest schemas.
    db = DatabaseConnection(path_config='db_config.yaml')
    schema = db.get_latest_schema('prod_')
    schema_profil = db.get_latest_schema('source_internal_profil_')
    db.execute('SET search_path="' + schema + '", "' + schema_profil + '";')
    timestamp = schema[schema.rfind('_') + 1:]
    print('[OK] Dumping from schemas "%s" and "%s"...' %
          (schema, schema_profil))

    # Read YAML configuration file.
    config = yaml_load('public_dumps.yaml')
    dir_save = config['save_directory']
    dumps = config['dumps']

    # Process all dumps.
    for dump_name in dumps:
        save_path = os.path.join(dir_save,
                                 '%s_%s.csv' % (dump_name, timestamp))
        db.dump_to_CSV(dumps[dump_name]['query'], save_path)
        print('[OK] Saved dump "%s" to %s' % (dump_name, save_path))

        stage_path = os.path.join(dir_save, dump_name + '.csv')
        shutil.copyfile(save_path, stage_path)
        print('[OK] Copied dump "%s" to %s' % (dump_name, stage_path))

    # Close database connection.
    db.close()
Example #2
0
def get_source_data_info():
    """ Returns a list of data sources, together with information about
        the latest update (timestamp, list of table names and columns) """

    # Establish connection to the database
    db = DatabaseConnection(path_config='db_config_data.yaml')

    # Iterate through sources listed in sources.json
    sources = json_load('../data/sources.json')
    result = []
    for source in sources:
        # Obtain schema with the last update
        try:
            schema = db.get_latest_schema('source_' + source['name'])
        except Exception as exception:
            print('[WARNING] %s' % (exception))
            continue

        # Store information to be returned
        result.append({
            'description': source['description'],
            'name': source['name'],
            'schema': schema,
            'tables': _get_tables_and_columns_in_schema(db, schema),
            'update': _datetimestr_from_schema(schema),
        })

    # Close database connection and return the result
    db.close()
    return result
Example #3
0
def consolidate_entities(read_only):
    db = DatabaseConnection(path_config='utils/db_config.yaml',
                            search_path='mysql')

    ids1, eids1, edges = consolidate_people(db)
    ids2, eids2 = consolidate_companies(db)

    cur = db.dict_cursor()
    # 1. Reset eids to be equal to ids in entities.
    print "Reset eids"
    cur.execute("update entities set eid = id;")
    # 2. Consolidate people
    print "Update DB with people eids"
    update_eids_of_ids(cur, ids1, eids1)
    # 3. Consolidate companies
    print "Update DB with companies eids"
    update_eids_of_ids(cur, ids2, eids2)
    # 4. Remove neighbour edges
    print "Delete neighbour edges"
    cur.execute("DELETE from related where source=%s", ("neighbour", ))
    # 5. Add new neighbour edges
    add_neighbour_edges(cur, edges)
    # 6. Update related table
    print "Updating related eids"
    cur.execute(
        "UPDATE related SET eid1=entities.eid FROM entities WHERE related.id1=entities.id;"
    )
    cur.execute(
        "UPDATE related SET eid2=entities.eid FROM entities WHERE related.id2=entities.id;"
    )
    cur.close()
    if not read_only:
        db.commit()
    db.close()
Example #4
0
 def get(self):
     # Return politicians augmented with property counts as JSON
     db = DatabaseConnection(path_config='db_config.yaml',
                             search_path='kataster')
     politicians = get_politicians_with_Folio_counts(db)
     db.close()
     self.returnJSON(politicians)
def reveal_connection_to_politics(max_relations_to_load, num_contracts,
                                  max_distance, path_output):
    """Reveals connections between recent contract winners and politics.

  Args:
    max_relations_to_load: Maximum number of relations to load from
        production table `related`. Use a smaller number for faster
        debugging only.
    num_contracts: Number of most recent contracts to analyse.
    max_distance: Maximum distance at which connections are reported.
    path_output: Path where to write the resulting report.
  """

    # Connect to the database:
    db = DatabaseConnection(path_config='db_config.yaml')
    schema = db.get_latest_schema('prod_')
    db.execute('SET search_path to ' + schema + ';')

    # Load relations and notable eIDs:
    relations = server._initialise_relations(db, max_relations_to_load)
    notable_eids = server._initialise_notable_eids(db)

    # Retrieve most recent contracts with positive price:
    q = """
      SELECT
        supplier_eid,
        entities.name AS supplier_name,
        contract_price_amount,
        contract_price_total_amount,
        signed_on,
        effective_from,
        effective_to,
        status_id,
        contract_id
      FROM
        contracts
      INNER JOIN
        entities ON entities.id=contracts.supplier_eid
      WHERE
        signed_on IS NOT NULL
        AND signed_on <= now()
        AND (
          contract_price_amount > 0 OR contract_price_total_amount > 0
        )
        AND entities.name LIKE '%%.'
        AND entities.name NOT LIKE '%%lovensk%%'
      ORDER BY
        signed_on DESC
      LIMIT %s;
  """
    with open(path_output, "w") as file_output:
        rows = db.query(q, [num_contracts])
        for row in tqdm.tqdm(rows):
            report_on(row, relations, notable_eids, max_distance, db,
                      file_output)

    db.close()
Example #6
0
def main(args_dict):
    test_mode = not args_dict['disable_test_mode']
    if test_mode:
        print "======================="
        print "=======TEST MODE======="
        print "======================="

    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
    # Write output into prod_schema_name
    prod_schema_name = "prod_" + timestamp
    print "prod_schema_name", prod_schema_name

    # Create database connections:
    # Read / write address cache from this one
    db_address_cache = DatabaseConnection(
        path_config='db_config_update_source.yaml',
        search_path='address_cache')
    # Write prod tables into this one
    db_prod = DatabaseConnection(path_config='db_config_update_source.yaml')
    CreateAndSetProdSchema(db_prod, prod_schema_name)

    # Initialize geocoder
    geocoder = geocoder_lib.Geocoder(db_address_cache, db_prod, test_mode)
    # Initialize entity lookup
    entities_lookup = entities.Entities(db_prod)
    # Table prod_tables.yaml defines a specifications of SQL selects to read
    # source data and describtion of additional tables to be created.
    with open('prod_tables.yaml', 'r') as stream:
        config = yaml.load(stream)
    # This is where all the population happens!!!
    # Go through all the specified data sources and process them, adding data
    # as needed. We process them in the order!
    for key in sorted(config.keys()):
        config_per_source = config[key]
        print "Working on source:", key
        ProcessSource(db_prod, geocoder, entities_lookup, config_per_source,
                      test_mode)
        print "GEOCODER STATS"
        geocoder.PrintStats()

    # Grant apps read-only access to the newly created schema and tables within
    db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'data')
    db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'verejne')
    db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'kataster')
    db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'prepojenia')
    db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'obstaravania')

    # Commit database changes and close database connections
    db_address_cache.commit()
    db_address_cache.close()
    if test_mode:
        db_prod.conn.rollback()
    else:
        db_prod.commit()
        db_prod.close()
Example #7
0
def get_prod_data_info():
    """ Return list of tables and column names from the current production tables,
        and the time when these were generated. """
    db = DatabaseConnection(path_config='db_config_data.yaml')
    schema = db.get_latest_schema('prod_')
    response = {
        'schema': schema,
        'tables': _get_tables_and_columns_in_schema(db, schema),
        'update': _datetimestr_from_schema(schema),
    }
    db.close()
    return response
Example #8
0
    def get(self):
        # Parse politician id
        try:
            politician_id = int(self.request.GET["id"])
        except:
            self.abort(400, detail="Could not parse parameter 'id' as int")

        # Find Parcels owned by this politician in the database
        db = DatabaseConnection(path_config='db_config.yaml',
                                search_path='kataster')
        Parcels = get_Parcels_owned_by_Person(db, politician_id)
        db.close()
        self.returnJSON(Parcels)
Example #9
0
    def get(self):
        # Parse politician id
        try:
            politician_id = int(self.request.GET["id"])
        except:
            self.abort(400, detail="Could not parse parameter 'id' as int")

        # Find asset declarations of this politician in the database
        db = DatabaseConnection(path_config='db_config.yaml',
                                search_path='kataster')
        declarations = get_asset_declarations(db, politician_id)
        db.close()
        self.returnJSON(declarations)
Example #10
0
    def get(self):
        # Parse politician id
        try:
            politician_id = int(self.request.GET["id"])
        except:
            self.abort(400, detail="Could not parse parameter 'id' as int")

        # Get politician information from database
        db = DatabaseConnection(path_config='db_config.yaml',
                                search_path='kataster')
        politician = get_politician_by_PersonId(db, politician_id)
        db.close()
        if politician is None:
            self.abort(404,
                       detail="Could not find politician with provided 'id'")
        self.returnJSON(politician)
Example #11
0
def update_CSV_source(source, timestamp, dry_run, verbose):
    # Load the CSV file
    with open(source['path'], 'r') as f:
        delimiter = str(source['delimiter']) # requires string, not unicode
        reader = csv.reader(f, delimiter=delimiter)

        # Extract column names from header line and then the actual data
        header = next(reader)
        column_names = [column_name.decode('utf-8') for column_name in header]
        data = [tuple(row) for row in reader]
    if verbose:
        print('Loaded CSV file with %d columns and %d data rows' % (len(column_names), len(data)))

    # Create postgres schema
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    schema = 'source_' + source['name'] + '_' + timestamp
    q = 'CREATE SCHEMA %s; SET search_path="%s";' % (schema, schema)
    db.execute(q)

    # Compute normalised column names, saving original names in a separate table
    column_names_normalised = map(normalise_CSV_column_name, column_names)
    q = 'CREATE TABLE column_names (name_original text, name_normalised text);'
    db.execute(q)
    q = """INSERT INTO column_names VALUES %s;"""
    q_data = [(original, normalised) for original, normalised in zip(column_names, column_names_normalised)]
    db.execute_values(q, q_data)

    # Create table containing the actual data from the CSV file
    table = source['table_name']
    table_columns = ', '.join(['%s text' % (name) for name in column_names_normalised])
    q = 'CREATE TABLE %s (%s);' % (table, table_columns)
    db.execute(q)

    # Populate the table with data
    q = 'INSERT INTO ' + table + ' VALUES %s;'
    db.execute_values(q, data)
    if verbose:
        print('Inserted %d rows into %s.%s%s' % (len(data), schema, table, ' (dry run)' if dry_run else ''))

    # Grant privileges to user data for data/SourceDataInfo to work properly
    db.grant_usage_and_select_on_schema(schema, 'data')

    # Commit and close database connection
    if not dry_run:
        db.commit()
    db.close()
Example #12
0
    def subgraph(self, set_A, set_B):
        # Compute distance of each vertex from A and from B
        dists_A = self.dijkstra(set_A, set_B, return_all=True)
        dists_B = self.dijkstra(set_B, set_A, return_all=True)
        dists_AB = [dists_A[v] for v in set_B if v in dists_A]
        if len(dists_AB) == 0:
            return {'vertices': [], 'edges': []}
        dist_AB = min([dists_A[v] for v in set_B if v in dists_A])

        # Determine subgraph's vertices (eIDs)
        vertices_eids = set()
        vertices_eids.update(set_A)
        vertices_eids.update(set_B)
        tolerance = 1
        for v in dists_A:
            if (v in dists_B) and (dists_A[v] + dists_B[v] <= dist_AB + tolerance):
                vertices_eids.add(v)

        # Obtain entity name for chosen vertices
        db = DatabaseConnection(search_path='mysql')
        q = """
            SELECT eid, entity_name FROM entities
            WHERE entities.eid IN %s;
            """
        q_data = (tuple(vertices_eids),)
        rows = db.query(q, q_data)
        db.close()
        eid_to_name = {row['eid']: row['entity_name'] for row in rows}

        # Add entity names and distances to vertices
        vertices = []
        for eid in vertices_eids:
            vertices.append({
                'eid': eid,
                'entity_name': eid_to_name[eid],
                'distance_from_A': dists_A.get(eid, None),
                'distance_from_B': dists_B.get(eid, None),
                })

        # Build subgraph's edges
        edges = []
        for v1, v2, length in self.edges:
            if (v1 in vertices_eids) and (v2 in vertices_eids):
                edges.append((v1, v2, length))

        return {'vertices': vertices, 'edges': edges}
Example #13
0
    def __init__(self):
        log('Connecting to the database...')
        db = DatabaseConnection(path_config='db_config.yaml', search_path='mysql')
        with open("db_config.yaml", "r") as stream:
            config = yaml.load(stream)

        log('Relations constructor...')
        q = """SELECT eid1, eid2, length FROM related LIMIT %s"""
        q_data = [int(config["relations_to_load"])]
        for row in db.query(q, q_data):
            self.edges.append((row["eid1"], row["eid2"], float(row["length"])))
            self.edges.append((row["eid2"], row["eid1"], float(row["length"])))
        db.close()

        log('Sorting edges...')
        self.edges.sort()
        log('Creating start indices...')
        for i in xrange(len(self.edges)):
            cur = self.edges[i][0]
            if cur in self.start_index: continue
            self.start_index[cur] = i
Example #14
0
def generate_public_data_dumps(limit=None, verbose=False):
    """ Generates the public data dump files from the latest production data """

    # Connect to the latest production data schema
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    schema = db.get_latest_schema('prod_')
    db.execute('SET search_path="' + schema + '";')
    timestamp = schema[schema.rfind('_') + 1:]
    if verbose:
        print('[OK] Dumping from schema "%s"...' % (schema))
    if limit is not None:
        print('[WARNING] Dumping with row limit %d!' % (limit))

    # Read YAML configuration file
    config = yaml_load('public_dumps.yaml')
    dir_save = config['save_directory']
    dumps = config['dumps']

    # Process all dumps
    for dump_name in dumps:
        # Construct dump query
        q = dumps[dump_name]['query']
        q = q.rstrip().rstrip(';')  # possibly remove ; ending
        if limit is not None:
            q += ' LIMIT %d' % (limit)

        # Dump to CSV without timestamp
        path_output = '%s%s.csv' % (dir_save, dump_name)
        db.dump_to_CSV(q, path_output)
        if verbose:
            print('[OK] Created dump "%s" in %s' % (dump_name, path_output))

        # Dump to CSV with timestamp
        path_output = '%s%s_%s.csv' % (dir_save, dump_name, timestamp)
        db.dump_to_CSV(q, path_output)
        if verbose:
            print('[OK] Created dump "%s" in %s' % (dump_name, path_output))

    # Close database connection
    db.close()
Example #15
0
def update_JSON_source(source, timestamp, dry_run, verbose):
    # Load the JSON file
    data = json_load(source['path'])

    # Obtain column names appearing anywhere in the JSON
    columns = sorted(list(set(chain.from_iterable([datum.keys() for datum in data]))))
    if verbose:
        print('Loaded JSON files with %d columns and %d data rows' % (len(columns), len(data)))

    # Reorganise data into a list of tuples
    data = [tuple(datum[column] if column in datum else "" for column in columns) for datum in data]

    # Create postgres schema
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    schema = 'source_' + source['name'] + '_' + timestamp
    q = 'CREATE SCHEMA "%s"; SET search_path="%s";' % (schema, schema)
    db.execute(q)

    # Create table containing the actual data from the CSV file
    table = source['table_name']
    table_columns = ', '.join(['%s text' % (name) for name in columns])
    q = 'CREATE TABLE %s (%s);' % (table, table_columns)
    db.execute(q)

    # Populate the table with data
    q = 'INSERT INTO ' + table + ' VALUES %s;'
    db.execute_values(q, data)
    if verbose:
        print('Inserted %d rows into %s.%s%s' % (len(data), schema, table, ' (dry run)' if dry_run else ''))

    # Grant privileges to user data for data/SourceDataInfo to work properly
    db.grant_usage_and_select_on_schema(schema, 'data')

    # Commit and close database connection
    if not dry_run:
        db.commit()
    db.close()
Example #16
0
def update_SQL_source(source, timestamp, dry_run, verbose):
    # Check that the (temporary) schema names created by this data source
    # do not conflict with existing schemas in the database
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    q = """SELECT schema_name FROM information_schema.schemata WHERE schema_name IN %s LIMIT 1;"""
    q_data = (tuple(source['schemas']),)
    res = db.query(q, q_data, return_dicts=False)
    db.close()
    if len(res) >= 1:
        raise Exception('Schema "%s" that source "%s" reads into already exists' % (res[0][0], source['name']))
    if verbose:
        print('[OK] No conflicting schema names found')

    # Download online resource if a URL is specified, storing it at the
    # location specified in source['path']
    if ('url' in source):
        urllib.urlretrieve(source['url'], source['path'])
        if verbose:
            print('[OK] Downloaded from %s to %s' % (source['url'], source['path']))

    if dry_run:
        print('[WARNING] --dry_run option not implemented for entire pipeline of updating an SQL source')
        db.close()
        return

    # Load into postgres, unzipping along the way
    if source['path'].endswith('.sql.gz'):
        p1 = subprocess.Popen(['gunzip', '-c', source['path']], stdout=subprocess.PIPE)
        subprocess.check_output(['psql', '-d', 'vd', '-q'], stdin=p1.stdout)
    # Load into postgres directly
    else:
        # The options -q -o /dev/null just suppress output
        subprocess.call(['psql', '-d', 'vd', '-f', source['path'], '-q', '-o', '/dev/null'])

    # Rename loaded schema(s) to the desired schema name(s)
    # If there is a single schema, rename it to source_NAME_TIMESTAMP
    # If there are multiple schemas, rename them to source_NAME_SCHEMA_TIMESTAMP
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    if len(source['schemas']) == 1:
        schema_old = source['schemas'][0]
        schema_new = 'source_' + source['name'] + '_' + timestamp
        db.rename_schema(schema_old, schema_new, verbose)
        # Grant privileges to user data for data/SourceDataInfo to work properly
        db.grant_usage_and_select_on_schema(schema_new, 'data')
    else:
        for schema_old in source['schemas']:
            schema_new = 'source_' + source['name'] + '_' + schema_old + '_' + timestamp
            db.rename_schema(schema_old, schema_new, verbose)
            # Grant privileges to user data for data/SourceDataInfo to work properly
            db.grant_usage_and_select_on_schema(schema_new, 'data')

    # Commit and close database connection
    db.commit()
    db.close()
Example #17
0
def ProcessSource(db_prod, geocoder, entities, config, test_mode):
    """ Process one source table (read from db_source) using the config and
    performing normalization using the given geocoder and entities lookup.

    The produced data are written into db_prod connection. The function writes
    new entities and addresses in to the Entities and Address tables. It also
    creates and populates supplementary tables as specified by a config.
    """

    # Connect to the most recent schema from the current source
    db_source = DatabaseConnection(path_config='db_config_update_source.yaml')
    source_schema_name = db_source.get_latest_schema('source_' +
                                                     config["source_schema"])
    print "Processing source_schema_name", source_schema_name
    db_source.execute('SET search_path="' + source_schema_name + '";')

    columns_for_table = {}
    with db_prod.dict_cursor() as cur:
        # Create supplementaty tables using the provided command.
        # Also store the columns of the table for later use.
        for table in config["tables"]:
            table_config = config["tables"][table]
            columns_for_table[table] = table_config["columns"]
            cur.execute(table_config["create_command"])

    def AddValuesToTable(columns, values, eid, supplier_eid=None):
        if eid is not None:
            columns += ["eid"]
            values += [eid]
        if supplier_eid is not None:
            columns += ["supplier_eid"]
            values += [supplier_eid]

        if all(v is None for v in values):
            # Ignore this entry, all meaningful values are None
            return

        # TODO: find out how to build SQL statement properly
        column_names = ",".join(columns)
        values_params = ",".join(["%s"] * (len(columns)))
        command = ("INSERT INTO %s (" + column_names + ") " + "VALUES (" +
                   values_params + ") " + "ON CONFLICT DO NOTHING")
        with db_prod.dict_cursor() as cur:
            cur.execute(command, [AsIs(table)] + values)

    def AddToTable(row, table, eid, years, supplier_eid=None):
        """ Add values for the given row into the supplementary table 'table'.

        It reads the corresponding values from the row and adds them into the
        table with the corresponding eid.
        """
        columns = list(columns_for_table[table])
        if years:
            for year in years:
                values = []
                columns_per_year = columns[:]
                for column in columns:
                    col_name = column + "_" + str(year)
                    if col_name in row:
                        values.append(row[col_name])
                    else:
                        values.append(None)
                columns_per_year.append("year")
                values.append(year)
                AddValuesToTable(columns_per_year, values, eid)
        else:
            values = [row[column] for column in columns]
            AddValuesToTable(columns, values, eid, supplier_eid)

    with db_source.dict_cursor() as cur:
        # Read data using the given command.
        print "Executing SQL command ..."
        suffix_for_testing = ""
        if test_mode:
            suffix_for_testing = " LIMIT 1000"
        cur.execute(config["command"] + suffix_for_testing)
        print "Done."
        missed = 0
        found = 0
        empty = 0

        missed_eid = 0
        found_eid = 0

        missed_addresses = set([])
        for row in cur:
            # Read entries one by one and try to geocode them. If the address
            # lookup succeeds, try to normalize the entities. If it succeeds,
            # insert into Entities and supplementary tables.
            address = ""
            if "address" in row:
                address = row["address"]
                if address is None: continue
            name = ""
            if "name" in row:
                name = row["name"]
                if name is None: continue
            # Sometimes FirstName and Surname are joined. Lets try the simplest splitting on Capital
            # letters.
            if (len(name.split()) == 1):
                name = ' '.join(re.findall('[A-Z][^A-Z]*', name))
            addressId = geocoder.GetAddressId(address.encode("utf8"))
            if addressId is None:
                if address == "":
                    empty += 1
                else:
                    if test_mode and missed < 10:
                        print "MISSING ADDRESS", address.encode("utf8")
                    missed_addresses.add(address)
                    missed += 1
                    continue
            found += 1

            eid = None
            if config.get("no_entity_id"):
                # TODO(rasto): is the address lookup necessary here?
                eid = None
            else:
                eid = entities.GetEntity(row["ico"], name, addressId)

            if found % 20000 == 0:
                print "Progress:", found
                sys.stdout.flush()

            if config.get("save_org_id"):
                entities.AddOrg2Eid(row["org_id"], eid)
            if config.get("use_org_id_as_eid_relation"):
                eid2 = entities.GetEidForOrgId(row["eid_relation"])
                if eid2 is None:
                    continue
                row["eid_relation"] = eid2
            if config.get("extract_description_from_body"):
                row["body"] = ExtractDescriptionFromBody(row["body"])
            supplier_eid = None
            if config.get("supplier_eid"):
                supplier_address_id = None
                if "supplier_address" in row and not row[
                        "supplier_address"] is None:
                    supplier_address = row["supplier_address"]
                    if supplier_address:
                        supplier_address_id = geocoder.GetAddressId(
                            supplier_address.encode("utf8"))
                        if supplier_address_id is None:
                            missed_addresses.add(supplier_address)
                            missed += 1
                            continue
                    else:
                        empty += 1
                supplier_name = ""
                if "supplier_name" in row and not row["supplier_name"] is None:
                    supplier_name = row["supplier_name"]
                supplier_eid = entities.GetEntity(row["supplier_ico"],
                                                  supplier_name,
                                                  supplier_address_id)
            if table_config.get("strip_html"):
                for strip_html_column in table_config["strip_html"]:
                    if row.get(strip_html_column):
                        row[strip_html_column] = StripHtml(
                            row[strip_html_column])
            if eid is None: missed_eid += 1
            found_eid += 1
            AddToTable(row, table, eid, table_config.get("years"),
                       supplier_eid)

    print "FOUND", found
    print "MISSED", missed
    print "EMPTY", empty
    print "MISSED UNIQUE", len(missed_addresses)
    print "FOUND EID", found_eid
    print "MISSED EID", missed_eid
    db_source.close()