def main(args_dict): test_mode = not args_dict['disable_test_mode'] if test_mode: print "=======================" print "=======TEST MODE=======" print "=======================" timestamp = datetime.now().strftime('%Y%m%d%H%M%S') # Write output into prod_schema_name prod_schema_name = "prod_" + timestamp print "prod_schema_name", prod_schema_name # Create database connections: # Read / write address cache from this one db_address_cache = DatabaseConnection( path_config='db_config_update_source.yaml', search_path='address_cache') # Write prod tables into this one db_prod = DatabaseConnection(path_config='db_config_update_source.yaml') CreateAndSetProdSchema(db_prod, prod_schema_name) # Initialize geocoder geocoder = geocoder_lib.Geocoder(db_address_cache, db_prod, test_mode) # Initialize entity lookup entities_lookup = entities.Entities(db_prod) # Table prod_tables.yaml defines a specifications of SQL selects to read # source data and describtion of additional tables to be created. with open('prod_tables.yaml', 'r') as stream: config = yaml.load(stream) # This is where all the population happens!!! # Go through all the specified data sources and process them, adding data # as needed. We process them in the order! 
for key in sorted(config.keys()): config_per_source = config[key] print "Working on source:", key ProcessSource(db_prod, geocoder, entities_lookup, config_per_source, test_mode) print "GEOCODER STATS" geocoder.PrintStats() # Grant apps read-only access to the newly created schema and tables within db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'data') db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'verejne') db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'kataster') db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'prepojenia') db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'obstaravania') # Commit database changes and close database connections db_address_cache.commit() db_address_cache.close() if test_mode: db_prod.conn.rollback() else: db_prod.commit() db_prod.close()
def update_SQL_source(source, timestamp, dry_run, verbose):
    """Downloads (if needed) and loads an SQL-dump data source into postgres,
    then renames the loaded schema(s) to timestamped source schema names.

    Args:
        source: Dict describing the source ('name', 'schemas', 'path',
            optionally 'url').
        timestamp: String timestamp appended to the new schema name(s).
        dry_run: If True, stop after the download step.
        verbose: If True, print progress messages.

    Raises:
        Exception: If any schema this source loads into already exists.
    """
    # Check that the (temporary) schema names created by this data source
    # do not conflict with existing schemas in the database.
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    q = """SELECT schema_name FROM information_schema.schemata
           WHERE schema_name IN %s LIMIT 1;"""
    q_data = (tuple(source['schemas']),)
    res = db.query(q, q_data, return_dicts=False)
    db.close()
    if len(res) >= 1:
        raise Exception(
            'Schema "%s" that source "%s" reads into already exists'
            % (res[0][0], source['name']))
    if verbose:
        print('[OK] No conflicting schema names found')

    # Download online resource if a URL is specified, storing it at the
    # location specified in source['path'].
    if ('url' in source):
        urllib.urlretrieve(source['url'], source['path'])
        if verbose:
            print('[OK] Downloaded from %s to %s'
                  % (source['url'], source['path']))

    if dry_run:
        print('[WARNING] --dry_run option not implemented for entire '
              'pipeline of updating an SQL source')
        # Note: the connection was already closed above; the original code
        # called db.close() a second time here, which was redundant.
        return

    # Load into postgres, unzipping along the way.
    if source['path'].endswith('.sql.gz'):
        p1 = subprocess.Popen(['gunzip', '-c', source['path']],
                              stdout=subprocess.PIPE)
        subprocess.check_output(['psql', '-d', 'vd', '-q'], stdin=p1.stdout)
        # Close our copy of the pipe (so gunzip can receive SIGPIPE if psql
        # exits early) and reap the gunzip process.
        p1.stdout.close()
        p1.wait()
    # Load into postgres directly.
    else:
        # The options -q -o /dev/null just suppress output.
        subprocess.call(['psql', '-d', 'vd', '-f', source['path'],
                         '-q', '-o', '/dev/null'])

    # Rename loaded schema(s) to the desired schema name(s).
    # If there is a single schema, rename it to source_NAME_TIMESTAMP.
    # If there are multiple schemas, rename them to
    # source_NAME_SCHEMA_TIMESTAMP.
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    if len(source['schemas']) == 1:
        schema_old = source['schemas'][0]
        schema_new = 'source_' + source['name'] + '_' + timestamp
        db.rename_schema(schema_old, schema_new, verbose)
        # Grant privileges to user data for data/SourceDataInfo to work
        # properly.
        db.grant_usage_and_select_on_schema(schema_new, 'data')
    else:
        for schema_old in source['schemas']:
            schema_new = ('source_' + source['name'] + '_' + schema_old
                          + '_' + timestamp)
            db.rename_schema(schema_old, schema_new, verbose)
            # Grant privileges to user data for data/SourceDataInfo to work
            # properly.
            db.grant_usage_and_select_on_schema(schema_new, 'data')

    # Commit and close database connection.
    db.commit()
    db.close()
def modify_item():
    """Deletes or updates an item, depending on the endpoint that was hit.

    The item must exist and must belong to a list owned by the
    authenticated user.
    """
    payload = request.get_json()
    conn = DatabaseConnection()
    auth_error = check_auth(payload, conn)
    if auth_error:
        return auth_error

    item_id = payload.get("itemid", 0)
    if not item_id:
        return {"err": "itemid must not be empty"}, 400

    user = conn.get_user(username=payload["username"])
    item = conn.get_item(item_id)
    if not item:
        return {"err": "item does not exist"}, 400

    parent_list = conn.get_list(item["listid"])
    if parent_list["userid"] != user["id"]:
        return {"err": "item does not belong to user"}, 409

    # Dispatch on the request path.
    if request.path == "/api/deleteitem":
        conn.delete_item(item_id)
        return {"msg": "successfully deleted item"}, 200
    if request.path == "/api/updateitem":
        if not conn.update_item(item_id, payload):
            return {"err": "attempted to give item duplicate name"}, 409
        return {"msg": "successfully updated item"}, 200
    return {"err": "invalid method used"}, 405
def add_item():
    """Adds a new item to one of the authenticated user's lists.

    All of listid, label, descr, img, url and price must be present and
    non-empty in the request body.
    """
    payload = request.get_json()
    conn = DatabaseConnection()
    auth_error = check_auth(payload, conn)
    if auth_error:
        return auth_error

    # Collect all required fields, rejecting any that are missing/empty.
    field_names = ["listid", "label", "descr", "img", "url", "price"]
    values = []
    for name in field_names:
        value = payload.get(name, None)
        if not value:
            return {"err": name + " must not be empty"}, 400
        values.append(value)

    user = conn.get_user(username=payload["username"])
    target_list = conn.get_list(values[0])
    if not target_list:
        return {"err": "list does not exist"}, 409
    if user["id"] != target_list["userid"]:
        return {"err": "list does not belong to user"}, 400

    if not conn.add_item(values[0], values[1], values[2], values[3],
                         values[4], values[5]):
        return {
            "err": "attempting to add item with duplicate label to list"
        }, 409
    return {"msg": "successfully added item to list"}, 201
def modify_list():
    """Deletes or renames a list owned by the authenticated user."""
    payload = request.get_json()
    conn = DatabaseConnection()
    auth_error = check_auth(payload, conn)
    if auth_error:
        return auth_error

    list_id = payload.get("listid", None)
    if not list_id:
        return {"err": "listid must not be empty"}, 400

    target = conn.get_list(list_id)
    if not target:
        return {"err": "list does not exist"}, 409

    owner = conn.get_user(payload["username"])
    if target["userid"] != owner["id"]:
        return {"err": "list does not belong to user"}, 400

    # Dispatch on the request path.
    if request.path == "/api/deletelist":
        print("deleting list")
        conn.delete_list(list_id)
        return {"msg": "successfully deleted list"}, 200
    if request.path == "/api/updatelist":
        label = payload.get("label", "")
        if not label:
            return {"err": "label must not be empty"}, 400
        if not conn.update_list(list_id, label):
            return {"err": "attempted to give list duplicate name"}, 409
        return {"msg": "successfully updated list"}, 200
    return {"err": "invalid method used"}, 405
def insert_many(entities):
    """Inserts many location entities into the database in one batch.

    :param entities: a list of tuples of the form ->
        [(street_name, intersection_1, intersection_2, longitude, latitude,
          city, neighbourhood, is_intersection)]
    :return: None
    """
    db = DatabaseConnection()
    sql_insert = """INSERT INTO dimension_pre_stage.location_dimension_pre_stage (
        street_name, intersection_1, intersection_2, longitude, latitude,
        city, neighbourhood, is_intersection)
        VALUES %s;"""
    # execute_values expands the single VALUES %s placeholder into one
    # multi-row INSERT for the whole argslist.
    with db.get_connection().cursor() as cursor:
        execute_values(cur=cursor, sql=sql_insert, argslist=entities)
def main():
    """Generates public data dump files from the latest prod data."""
    # Connect to the latest prod and profil schemas.
    db = DatabaseConnection(path_config='db_config.yaml')
    schema = db.get_latest_schema('prod_')
    schema_profil = db.get_latest_schema('source_internal_profil_')
    db.execute('SET search_path="' + schema + '", "' + schema_profil + '";')
    timestamp = schema[schema.rfind('_') + 1:]
    print('[OK] Dumping from schemas "%s" and "%s"...'
          % (schema, schema_profil))

    # Read YAML configuration file.
    config = yaml_load('public_dumps.yaml')
    dir_save = config['save_directory']

    # For each configured dump, save a timestamped CSV plus a
    # stable-named copy.
    for dump_name, dump in config['dumps'].items():
        save_path = os.path.join(dir_save,
                                 '%s_%s.csv' % (dump_name, timestamp))
        db.dump_to_CSV(dump['query'], save_path)
        print('[OK] Saved dump "%s" to %s' % (dump_name, save_path))
        stage_path = os.path.join(dir_save, dump_name + '.csv')
        shutil.copyfile(save_path, stage_path)
        print('[OK] Copied dump "%s" to %s' % (dump_name, stage_path))

    # Close database connection.
    db.close()
def get_source_data_info():
    """Returns a list of data sources, together with information about
    the latest update (timestamp, list of table names and columns).
    """
    # Establish connection to the database.
    db = DatabaseConnection(path_config='db_config_data.yaml')

    # Iterate through sources listed in sources.json.
    sources = json_load('../data/sources.json')
    source_infos = []
    for source in sources:
        # Skip sources for which no schema has been loaded yet.
        try:
            schema = db.get_latest_schema('source_' + source['name'])
        except Exception as exception:
            print('[WARNING] %s' % (exception))
            continue
        # Store information to be returned.
        source_infos.append({
            'description': source['description'],
            'name': source['name'],
            'schema': schema,
            'tables': _get_tables_and_columns_in_schema(db, schema),
            'update': _datetimestr_from_schema(schema),
        })

    # Close database connection and return the result.
    db.close()
    return source_infos
def populate():
    """Populates the accident fact table by joining the accident, weather,
    location and hour dimensions through their relation tables.
    """
    connection = DatabaseConnection()
    query = """INSERT INTO accidents_weather_data_mart.accident_fact(
        hour_key, location_key, accident_key, weather_key, is_fatal,
        is_intersection)
        SELECT AH.hour_key, AL.location_key, AH.accident_key, WL.weather_key,
            A.collision_classification = 'fatal' as is_fatal,
            L.is_intersection
        FROM relations.accident_hour_relation AH,
            relations.accident_location_relation AL,
            relations.weather_hour_relation WH,
            relations.weather_location_relation WL,
            accidents_weather_data_mart.accident_dimension A,
            accidents_weather_data_mart.location_dimension L
        WHERE AL.accident_key = AH.accident_key
            AND A.accident_key = AL.accident_key
            AND AH.hour_key = WH.hour_key
            AND WH.weather_key = WL.weather_key
            AND AL.location_key = WL.location_key
            AND L.location_key = AL.location_key"""
    with connection.get_connection().cursor() as cur:
        cur.execute(query)
def get_count(sql):
    """Executes `sql` (expected to select a `count` column) and returns
    the count from the first row."""
    connection = DatabaseConnection()
    with connection.get_connection().cursor(
            cursor_factory=psycopg2.extras.DictCursor) as cur:
        cur.execute(sql)
        row = cur.fetchone()
    return row['count']
def consolidate_entities(read_only): db = DatabaseConnection(path_config='utils/db_config.yaml', search_path='mysql') ids1, eids1, edges = consolidate_people(db) ids2, eids2 = consolidate_companies(db) cur = db.dict_cursor() # 1. Reset eids to be equal to ids in entities. print "Reset eids" cur.execute("update entities set eid = id;") # 2. Consolidate people print "Update DB with people eids" update_eids_of_ids(cur, ids1, eids1) # 3. Consolidate companies print "Update DB with companies eids" update_eids_of_ids(cur, ids2, eids2) # 4. Remove neighbour edges print "Delete neighbour edges" cur.execute("DELETE from related where source=%s", ("neighbour", )) # 5. Add new neighbour edges add_neighbour_edges(cur, edges) # 6. Update related table print "Updating related eids" cur.execute( "UPDATE related SET eid1=entities.eid FROM entities WHERE related.id1=entities.id;" ) cur.execute( "UPDATE related SET eid2=entities.eid FROM entities WHERE related.id2=entities.id;" ) cur.close() if not read_only: db.commit() db.close()
def setUpClass(self):
    """Prepares the test client, a DB connection, and fixture users;
    pre-registers one user via the signup endpoint."""
    self.client = app.test_client()
    self.conn_object = DatabaseConnection()
    self.present_location = {'present_location': 'Bukumiro'}
    # Fixture credentials (values are redacted placeholders).
    self.admin_user = {
        'username': '******',
        'password': '******',
        'email': '*****@*****.**',
    }
    self.user = {
        "username": "******",
        "password": "******",
        "email": "*****@*****.**",
    }
    self.user2 = {
        "username": "******",
        "password": "******",
        "email": "*****@*****.**",
    }
    # Register user2 up front so tests can authenticate against it.
    self.client.post(
        '/api/v2/signup',
        data=json.dumps(self.user2),
        content_type="application/json")
def get(self):
    """Serves politicians augmented with property (Folio) counts as JSON."""
    connection = DatabaseConnection(
        path_config='db_config.yaml', search_path='kataster')
    politicians = get_politicians_with_Folio_counts(connection)
    connection.close()
    self.returnJSON(politicians)
def initialise_app():
    """Stores values shared across requests in the app registry."""
    # Open a connection pointed at the most recent production schema.
    connection = DatabaseConnection()
    latest_schema = connection.get_latest_schema('prod_')
    connection.execute('SET search_path to ' + latest_schema + ';')
    app.registry['db'] = connection
def initialise_app():
    """Precomputes values shared across requests to this app."""
    # Connect and point the search path at the latest prod schema
    # (keeping `public` on the path as well).
    connection = DatabaseConnection(path_config='db_config.yaml')
    latest_schema = connection.get_latest_schema('prod_')
    connection.execute('SET search_path to ' + latest_schema + ',public;')
    app.registry['db'] = connection
def insert_many(entities):
    """Inserts many weather entities into the database in one batch.

    :param entities: a list of tuples matching the inserted columns ->
        [(station_name, longitude, latitude, elevation, date, time,
          temperature, temperature_flag, dew_point_temp, dew_point_temp_flag,
          relative_humidity, relative_humidity_flag, wind_direction,
          wind_direction_flag, wind_speed, wind_speed_flag, visibility,
          visibility_flag, station_pressure, station_pressure_flag, humidex,
          humidex_flag, wind_chill, wind_chill_flag, weather, weather_flag)]
    :return: None
    """
    connection = DatabaseConnection()
    insert_query = """INSERT INTO dimension_pre_stage.weather_dimension_pre_stage (
        station_name, longitude, latitude, elevation, date, time,
        temperature, temperature_flag, dew_point_temp, dew_point_temp_flag,
        relative_humidity, relative_humidity_flag, wind_direction,
        wind_direction_flag, wind_speed, wind_speed_flag, visibility,
        visibility_flag, station_pressure, station_pressure_flag, humidex,
        humidex_flag, wind_chill, wind_chill_flag, weather, weather_flag)
        VALUES %s"""
    with connection.get_connection().cursor() as cur:
        execute_values(cur=cur, sql=insert_query, argslist=entities)
def insert_many_temp(entities):
    """Bulk-inserts (station_name, location_key, date, time) rows into the
    temporary weather/location relation table."""
    connection = DatabaseConnection()
    insert_query = """INSERT INTO relations.weather_location_temp_relation
        (station_name, location_key, date, time) VALUES %s;"""
    with connection.get_connection().cursor() as cur:
        execute_values(cur=cur, sql=insert_query, argslist=entities)
def generate_unmovable_asset_count_plots():
    """Generates and saves asset count plots for all persons."""
    # Connect to most recent profil source schema in the database.
    db = DatabaseConnection(path_config='db_config.yaml')
    schema_profil = db.get_latest_schema('source_internal_profil_')
    db.execute('SET search_path="' + schema_profil + '";')

    # Load declarations data from the database; only rows with all three
    # asset counts present are usable for plotting.
    declarations = db.query("""
        SELECT
            PersonId,
            Persons.FirstName AS firstname,
            Persons.Surname AS surname,
            year,
            num_houses,
            num_fields,
            num_others
        FROM AssetDeclarations
        INNER JOIN Persons ON Persons.Id=AssetDeclarations.PersonId
        WHERE (num_houses IS NOT NULL)
            AND (num_fields IS NOT NULL)
            AND (num_others IS NOT NULL)
        ;""")

    # Compute range of years present in the declarations.
    all_years = [d['year'] for d in declarations]
    years = list(range(min(all_years), max(all_years) + 1))

    # Group declarations by person.
    user_declarations = collections.defaultdict(list)
    for declaration in declarations:
        user_declarations[declaration['personid']].append(declaration)

    # Configure the matplotlib font.
    matplotlib.rc('font', **{
        'size': 11,
        'sans-serif': 'Arial',
        'family': 'sans-serif',
    })

    # Iterate through all persons, and plot.
    num_persons = len(user_declarations)
    for index, person_id in enumerate(user_declarations):
        plot_unmovable_asset_counts(
            user_declarations[person_id], years, DIR_SAVE)
        done = index + 1
        if done == num_persons or done % 50 == 0:
            print('Plotted %d/%d persons' % (done, num_persons))

    print('\nDeploy generated plots using\n'
          'sudo cp %s* '
          '/data/www/verejne.digital/resources/profil_asset_plots'
          % (DIR_SAVE))
def reveal_connection_to_politics(max_relations_to_load, num_contracts,
                                  max_distance, path_output):
    """Reveals connections between recent contract winners and politics.

    Args:
        max_relations_to_load: Maximum number of relations to load from
            production table `related`. Use a smaller number for faster
            debugging only.
        num_contracts: Number of most recent contracts to analyse.
        max_distance: Maximum distance at which connections are reported.
        path_output: Path where to write the resulting report.
    """
    # Connect to the database and select the latest prod schema:
    db = DatabaseConnection(path_config='db_config.yaml')
    latest_schema = db.get_latest_schema('prod_')
    db.execute('SET search_path to ' + latest_schema + ';')

    # Load relations and notable eIDs:
    relations = server._initialise_relations(db, max_relations_to_load)
    notable_eids = server._initialise_notable_eids(db)

    # Retrieve most recent contracts with positive price:
    query = """
        SELECT
            supplier_eid,
            entities.name AS supplier_name,
            contract_price_amount,
            contract_price_total_amount,
            signed_on,
            effective_from,
            effective_to,
            status_id,
            contract_id
        FROM contracts
        INNER JOIN entities ON entities.id=contracts.supplier_eid
        WHERE signed_on IS NOT NULL
            AND signed_on <= now()
            AND (
                contract_price_amount > 0
                OR contract_price_total_amount > 0
            )
            AND entities.name LIKE '%%.'
            AND entities.name NOT LIKE '%%lovensk%%'
        ORDER BY signed_on DESC
        LIMIT %s;
        """

    # Report on each contract, writing results to `path_output`:
    with open(path_output, "w") as file_output:
        contracts = db.query(query, [num_contracts])
        for contract in tqdm.tqdm(contracts):
            report_on(contract, relations, notable_eids, max_distance, db,
                      file_output)
    db.close()
def check_user_exist(self):
    """Checks whether this user's username or email is already registered.

    :return: an error message string if the username or email is taken,
        otherwise None.
    """
    db = DatabaseConnection()
    username = db.check_username(self.username)
    email = db.check_email(self.email)
    # Use identity comparison with None (PEP 8) instead of `!= None`.
    if username is not None:
        return 'Username is taken.'
    if email is not None:
        return 'Email already has an account.'
    # Explicitly signal "no conflict" rather than falling off the end.
    return None
def api_login():
    """Authenticates a user, returning an error response on failure or a
    success message otherwise."""
    payload = request.get_json()
    conn = DatabaseConnection()
    auth_error = check_auth(payload, conn)
    if auth_error:
        return auth_error
    return {"msg": "successfully authenicated user"}, 200
def connect_accident_hour_dimension():
    """Populates the accident/hour relation by rounding each accident's
    timestamp to the nearest hour and matching it to an hour row."""
    connection = DatabaseConnection()
    query = """INSERT INTO relations.accident_hour_relation (accident_key, hour_key)
        SELECT A.accident_key, H.hour_key
        FROM dimension_pre_stage.accident_dimension_pre_stage A,
            dimension_pre_stage.hour_dimension_pre_stage H
        WHERE date_trunc('hour', A.date + A.time + INTERVAL '30 minute') = H.date + H.hour_start;"""
    with connection.get_connection().cursor() as cur:
        cur.execute(query)
def delete_user():
    """Deletes the authenticated user's account."""
    payload = request.get_json()
    conn = DatabaseConnection()
    auth_error = check_auth(payload, conn)
    if auth_error:
        return auth_error
    conn.delete_user(payload["username"])
    return {"msg": "successfully deleted user"}, 200
def connect_weather_to_location_dimension():
    """Populates the weather/location relation by joining weather readings
    to the temporary relation on station name, date and time."""
    connection = DatabaseConnection()
    query = """INSERT INTO relations.weather_location_relation (weather_key, location_key)
        SELECT W.weather_key, T.location_key
        FROM dimension_pre_stage.weather_dimension_pre_stage W,
            relations.weather_location_temp_relation T
        WHERE W.station_name = T.station_name
            AND W.date = T.date
            AND W.time = T.time;"""
    with connection.get_connection().cursor() as cur:
        cur.execute(query)
def insert_many(entities):
    """Inserts many accident entities into the database in one batch.

    :param entities: a list of tuples matching the inserted columns ->
        [(longitude, latitude, date, time, street_name, street1, street2,
          environment, environment_flag, road_surface, road_surface_flag,
          traffic_control, traffic_control_flag, visibility, visibility_flag,
          collision_classification, collision_classification_flag,
          impact_type, impact_type_flag)]
    :return: None
    """
    db = DatabaseConnection()
    # The column list below is the authoritative ordering for each tuple.
    sql_insert = """INSERT INTO dimension_pre_stage.accident_dimension_pre_stage (
        longitude, latitude, date, time, street_name, street1, street2,
        environment, environment_flag, road_surface, road_surface_flag,
        traffic_control, traffic_control_flag, visibility, visibility_flag,
        collision_classification, collision_classification_flag,
        impact_type, impact_type_flag)
        VALUES %s"""
    with db.get_connection().cursor() as cursor:
        execute_values(cur=cursor, sql=sql_insert, argslist=entities)
def initialise_app(max_relations_to_load):
    """Precomputes values shared across requests to this app.

    The registry property is intended for storing these precomputed values,
    so as to avoid global variables.
    """
    # Connect to the database and point it at the latest prod schema:
    db = DatabaseConnection(path_config='db_config.yaml')
    schema = db.get_latest_schema('prod_')
    db.execute('SET search_path to ' + schema + ';')
    app.registry['db'] = db

    # Build a signed, directed edge list from the relationship table:
    q = """
        SELECT eid, eid_relation, stakeholder_type_id
        FROM related
        LIMIT %s;
        """
    edges = []
    for row in db.query(q, [max_relations_to_load]):
        edge_type = row['stakeholder_type_id'] or 0
        edges.append((row['eid'], row['eid_relation'], +1 * edge_type))
        edges.append((row['eid_relation'], row['eid'], -1 * edge_type))

    # Construct Relations object from the edge list:
    app.registry['relations'] = Relations(edges)

    # TEMP: Construct Relations using old database data:
    db_old = DatabaseConnection(path_config='db_config_old.yaml',
                                search_path='mysql')
    app.registry['db_old'] = db_old
    q_old = """SELECT eid1, eid2, length FROM related LIMIT %s;"""
    edges_old = []
    for row in db_old.query(q_old, [max_relations_to_load]):
        weight = float(row['length'])
        edges_old.append((row['eid1'], row['eid2'], weight))
        edges_old.append((row['eid2'], row['eid1'], weight))
    app.registry['relations_old'] = Relations(edges_old)
def connect_event_location_dimension():
    """Populates the event/location relation by matching events and
    locations on city."""
    connection = DatabaseConnection()
    query = """INSERT INTO relations.event_location_relation (event_key, location_key)
        SELECT E.event_key, L.location_key
        FROM dimension_pre_stage.event_dimension_pre_stage E,
            dimension_pre_stage.location_dimension_pre_stage L
        WHERE E.city = L.city"""
    with connection.get_connection().cursor() as cur:
        cur.execute(query)
def connect_event_hour_dimension():
    """Populates the event/hour relation, matching an event to every hour
    row whose date equals the event's start or end date."""
    connection = DatabaseConnection()
    query = """INSERT INTO relations.event_hour_relation (event_key, hour_key)
        SELECT E.event_key, H.hour_key
        FROM dimension_pre_stage.event_dimension_pre_stage E,
            dimension_pre_stage.hour_dimension_pre_stage H
        WHERE E.start_date = H.date OR E.end_date = H.date"""
    with connection.get_connection().cursor() as cur:
        cur.execute(query)
def connect_weather_hour_dimension():
    """Populates the weather/hour relation by matching each weather reading
    to the hour row whose interval contains the reading's time."""
    connection = DatabaseConnection()
    query = """INSERT INTO relations.weather_hour_relation (weather_key, hour_key)
        SELECT W.weather_key, H.hour_key
        FROM dimension_pre_stage.weather_dimension_pre_stage W,
            dimension_pre_stage.hour_dimension_pre_stage H
        WHERE W.date = H.date
            AND H.hour_start <= W.time
            AND W.time <= H.hour_end"""
    with connection.get_connection().cursor() as cur:
        cur.execute(query)
def connect_accident_location_dimension():
    """Populates the accident/location relation by matching accidents and
    locations on exact longitude/latitude."""
    connection = DatabaseConnection()
    query = """INSERT INTO relations.accident_location_relation (accident_key, location_key)
        SELECT A.accident_key, L.location_key
        FROM dimension_pre_stage.accident_dimension_pre_stage A,
            dimension_pre_stage.location_dimension_pre_stage L
        WHERE L.longitude = A.longitude AND L.latitude = A.latitude"""
    with connection.get_connection().cursor() as cur:
        cur.execute(query)