# Imports assumed by the functions below. The standard-library and third-party
# imports are reconstructed from usage; the tsx helper functions used throughout
# (get_session, run_parallel, reproject, make_alpha_hull, subdivide_geometry,
# sql_list_placeholder, sql_list_argument, etc.) live in tsx modules whose exact
# paths are not shown in this excerpt.
import binascii
import functools
import logging

import fiona
import pyproj
import shapely.wkb
from shapely.geometry import shape
from tqdm import tqdm

import tsx.config
import tsx.db.connect

log = logging.getLogger(__name__)


def process_database(species=None, commit=False):
    session = get_session()

    if species is None:
        taxa = [taxon_id for (taxon_id,) in session.execute(
            "SELECT DISTINCT taxon_id FROM processing_method WHERE data_type = 2").fetchall()]
    else:
        taxa = [taxon_id for (taxon_id,) in session.execute(
            "SELECT DISTINCT taxon_id FROM processing_method, taxon WHERE taxon.id = taxon_id AND data_type = 2 AND spno IN (%s)" % sql_list_placeholder('species', species),
            sql_list_argument('species', species)).fetchall()]

    create_region_lookup_table(session)

    log.info("Step 1/2: Monthly aggregation")
    fn = functools.partial(aggregate_by_month, commit=commit)
    for result, error in tqdm(run_parallel(fn, taxa), total=len(taxa)):
        pass

    log.info("Step 2/2: Yearly aggregation")
    fn = functools.partial(aggregate_by_year, commit=commit)
    for result, error in tqdm(run_parallel(fn, taxa), total=len(taxa)):
        pass

    cleanup_region_lookup_table(session)
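# The queries above build "IN (...)" clauses via sql_list_placeholder and
# sql_list_argument. A minimal sketch of how such helpers could work - the real
# tsx implementations are not shown in this excerpt, so this is an assumption
# based on usage:
def sql_list_placeholder(name, items):
    # e.g. sql_list_placeholder('species', [1, 2]) -> ":species_0, :species_1"
    return ', '.join(':%s_%d' % (name, i) for i in range(len(items)))

def sql_list_argument(name, items):
    # e.g. sql_list_argument('species', [1, 2]) -> {'species_0': 1, 'species_1': 2}
    return {'%s_%d' % (name, i): item for i, item in enumerate(items)}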
def process_database(species=None, commit=False):
    session = get_session()

    if species is None:
        taxa = [taxon_id for (taxon_id,) in session.execute(
            "SELECT DISTINCT taxon_id FROM t1_sighting").fetchall()]
    else:
        taxa = [taxon_id for (taxon_id,) in session.execute(
            "SELECT DISTINCT taxon_id FROM t1_sighting, taxon WHERE taxon.id = taxon_id AND spno IN (%s)" % sql_list_placeholder('species', species),
            sql_list_argument('species', species)).fetchall()]

    create_region_lookup_table(session)

    # Process in parallel
    tasks = [(taxon_id, commit) for taxon_id in taxa]

    log.info("Step 1/2: Monthly aggregation")
    for result, error in tqdm(run_parallel(aggregate_monthly, tasks), total=len(tasks)):
        if error:
            print(error)

    log.info("Step 2/2: Yearly aggregation")
    for result, error in tqdm(run_parallel(aggregate_yearly, tasks), total=len(tasks)):
        if error:
            print(error)

    cleanup_region_lookup_table(session)
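# Every step in this file iterates over run_parallel(fn, tasks), which yields
# (result, error) pairs as tasks finish. A minimal sketch of that contract using
# concurrent.futures - an assumption, not the actual tsx implementation (which
# also accepts use_processes=True, as seen later in this file):
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed

def run_parallel(fn, tasks, use_processes=False):
    executor_cls = ProcessPoolExecutor if use_processes else ThreadPoolExecutor
    with executor_cls() as executor:
        # Tuple tasks are unpacked into positional arguments, matching how both
        # bare taxon ids and (taxon_id, commit) tuples are submitted in this file
        futures = [executor.submit(fn, *task) if isinstance(task, tuple)
                   else executor.submit(fn, task) for task in tasks]
        for future in as_completed(futures):
            try:
                yield future.result(), None
            except Exception as e:
                yield None, e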
def process_database(species=None, commit=False):
    session = get_session()

    # Just get taxa that have range polygons
    if species is None:
        taxa = session.execute(
            "SELECT DISTINCT taxon_id FROM taxon_range").fetchall()
    else:
        taxa = session.execute(
            "SELECT DISTINCT taxon_id FROM taxon_range, taxon WHERE taxon_id = taxon.id AND spno IN (%s)" % sql_list_placeholder('species', species),
            sql_list_argument('species', species)).fetchall()

    # Unwrap tuples
    taxa = [taxon_id for (taxon_id,) in taxa]

    # Delete old results
    if commit and len(taxa) > 0:
        session.execute(
            "DELETE FROM t2_ultrataxon_sighting WHERE taxon_id IN (%s)" % sql_list_placeholder('taxa', taxa),
            sql_list_argument('taxa', taxa))
        session.commit()

    # Process in parallel
    tasks = [(taxon_id, commit) for taxon_id in taxa]

    # This is important because we are about to spawn child processes, and this stops them
    # attempting to share the same database connection pool
    session.close()

    for result, error in tqdm(run_parallel(process_taxon, tasks), total=len(tasks)):
        if error:
            print(error)
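# Why session.close() matters before run_parallel: forked or spawned workers
# must not inherit live connections from the parent's SQLAlchemy pool, or two
# processes end up reading and writing the same MySQL socket. A sketch of the
# pattern used in this file (engine.dispose() appears in the later functions;
# bundling the two calls into a helper like this is illustrative only):
def prepare_for_workers(session):
    session.close()                  # return this session's connection to the pool
    tsx.db.connect.engine.dispose()  # then discard every pooled connection outright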
def process_database(species=None, commit=False):
    """
    Calculates spatial representativeness using alpha hulls

    Generates alpha hulls from each source x taxon combination

    Intersects alpha hulls with range layers, and then calculates the percentage of range covered
    """
    session = get_session()

    if commit:
        if species is None:
            session.execute("DELETE FROM taxon_source_alpha_hull")
        else:
            session.execute(
                """DELETE FROM taxon_source_alpha_hull
                   WHERE taxon_id IN (SELECT id FROM taxon WHERE spno IN (%s))""" % sql_list_placeholder('species', species),
                sql_list_argument('species', species))
        session.commit()

    # Load coastal shapefile
    coastal_shape_filename = tsx.config.config.get("processing.alpha_hull", "coastal_shp")
    with fiona.Env(OSR_WKT_FORMAT="WKT2_2018"), fiona.open(coastal_shape_filename, 'r') as coastal_shape:
        # Convert from fiona dictionary to shapely geometry and reproject
        shp_to_working_transformer = pyproj.Transformer.from_proj(
            pyproj.CRS.from_wkt(coastal_shape.crs_wkt),
            working_proj,
            always_xy=True)
        coastal_shape = reproject(shape(coastal_shape[0]['geometry']), shp_to_working_transformer)

    # Simplify coastal boundary - makes things run ~20X faster
    log.info("Simplifying coastal boundary")
    coastal_shape = coastal_shape.buffer(10000).simplify(10000)

    log.info("Generating alpha shapes")

    for data_type in (1, 2):
        log.info("Processing type %s data" % data_type)

        taxa = get_taxa(session, data_type, species)
        tasks = [(taxon_id, coastal_shape, data_type, commit) for taxon_id in taxa]

        # This is important because we are about to spawn child processes, and this stops them
        # attempting to share the same database connection pool
        session.close()  # TODO: not sure if this is needed now

        # Process all the species in parallel
        for result, error in tqdm(run_parallel(process, tasks), total=len(tasks)):
            if error:
                print(error)
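# reproject is used in two forms in this file: the newer one above takes a
# pyproj Transformer, while the older functions below pass source and target
# Proj objects. A minimal sketch of the Transformer-based variant, assuming it
# simply wraps shapely.ops.transform:
from shapely.ops import transform as shapely_transform

def reproject(geom, transformer):
    # transformer.transform maps (x, y) -> (x', y'); shapely applies it per vertex
    return shapely_transform(transformer.transform, geom)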
def process_database(species=None, commit=False):
    """
    Generates alpha hulls from raw sighting data in the database

    Intersects alpha hulls with range layers, and inserts the result back into the database
    """
    session = get_session()

    if species is None:
        species = get_all_spno(session)

    if commit:
        for spno in species:
            session.execute("""
                DELETE FROM taxon_presence_alpha_hull
                WHERE taxon_id IN (SELECT id FROM taxon WHERE spno = :spno)
                """, {'spno': spno})
            session.execute("""
                DELETE FROM taxon_presence_alpha_hull_subdiv
                WHERE taxon_id IN (SELECT id FROM taxon WHERE spno = :spno)
                """, {'spno': spno})
        session.commit()

    db_proj = pyproj.Proj('+init=EPSG:4326')  # Database always uses WGS84
    working_proj = pyproj.Proj('+init=EPSG:3112')  # GDA94 / Geoscience Australia Lambert - so that we can buffer in metres

    # Load coastal shapefile
    coastal_shape_filename = tsx.config.config.get("processing.alpha_hull", "coastal_shp")
    with fiona.open(coastal_shape_filename, 'r') as coastal_shape:
        # Convert from fiona dictionary to shapely geometry and reproject
        coastal_shape = reproject(shape(coastal_shape[0]['geometry']), pyproj.Proj(coastal_shape.crs), working_proj)

    # Simplify coastal boundary - makes things run ~20X faster
    log.info("Simplifying coastal boundary")
    coastal_shape = coastal_shape.buffer(10000).simplify(10000)

    log.info("Generating alpha shapes")

    # Process a single species.
    # This gets run off the main thread.
    def process_spno(spno):
        session = get_session()
        try:
            # Get raw points from DB
            raw_points = get_species_points(session, spno)

            if len(raw_points) < 4:
                # Not enough points to create an alpha hull
                return

            # Read points from database
            points = [reproject(p, db_proj, working_proj) for p in raw_points]

            # Generate alpha shape
            alpha_shp = make_alpha_hull(
                points=points,
                coastal_shape=coastal_shape,
                thinning_distance=tsx.config.config.getfloat('processing.alpha_hull', 'thinning_distance'),
                alpha=tsx.config.config.getfloat('processing.alpha_hull', 'alpha'),
                hullbuffer_distance=tsx.config.config.getfloat('processing.alpha_hull', 'hullbuffer_distance'),
                isolatedbuffer_distance=tsx.config.config.getfloat('processing.alpha_hull', 'isolatedbuffer_distance'))

            # Convert back to DB projection
            alpha_shp = reproject(alpha_shp, working_proj, db_proj)

            # Clean up geometry
            alpha_shp = alpha_shp.buffer(0)

            # Get range polygons to intersect with alpha shape
            for taxon_id, range_id, breeding_range_id, geom_wkb in get_species_range_polygons(session, spno):
                # Intersect and insert into DB
                geom = shapely.wkb.loads(binascii.unhexlify(geom_wkb)).buffer(0)
                geom = to_multipolygon(geom.intersection(alpha_shp))  # slow
                if len(geom) > 0:
                    session.execute("""INSERT INTO taxon_presence_alpha_hull (taxon_id, range_id, breeding_range_id, geometry)
                        VALUES (:taxon_id, :range_id, :breeding_range_id, ST_GeomFromWKB(_BINARY :geom_wkb))""", {
                            'taxon_id': taxon_id,
                            'range_id': range_id,
                            'breeding_range_id': breeding_range_id,
                            'geom_wkb': shapely.wkb.dumps(geom)
                        })

                    # We also subdivide the geometries into small pieces and insert these into the
                    # database. This allows for much faster spatial queries in the database.
                    for subgeom in subdivide_geometry(geom, max_points=100):
                        session.execute("""INSERT INTO taxon_presence_alpha_hull_subdiv (taxon_id, range_id, breeding_range_id, geometry)
                            VALUES (:taxon_id, :range_id, :breeding_range_id, ST_GeomFromWKB(_BINARY :geom_wkb))""", {
                                'taxon_id': taxon_id,
                                'range_id': range_id,
                                'breeding_range_id': breeding_range_id,
                                'geom_wkb': shapely.wkb.dumps(subgeom)
                            })

            if commit:
                session.commit()
        except Exception:
            log.exception("Exception processing alpha hull")
            raise
        finally:
            session.close()

    # This is important because we are about to spawn child processes, and this stops them
    # attempting to share the same database connection pool
    session.close()
    tsx.db.connect.engine.dispose()

    # Process all the species in parallel
    for result, error in tqdm(run_parallel(process_spno, species, use_processes=True), total=len(species)):
        if error:
            print(error)
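# subdivide_geometry, used above with max_points=100, breaks a (multi)polygon
# into vertex-limited pieces so the _subdiv table supports fast spatial queries.
# One plausible implementation - recursive quadtree-style splitting on the
# bounding box - sketched here as an assumption, not the actual tsx helper:
from shapely.geometry import box

def count_points(geom):
    # Count exterior and interior ring vertices across all polygon parts
    parts = [g for g in getattr(geom, 'geoms', [geom]) if g.geom_type == 'Polygon']
    return sum(len(p.exterior.coords) + sum(len(r.coords) for r in p.interiors)
               for p in parts)

def subdivide_geometry(geom, max_points=100):
    if geom.is_empty or geom.area == 0:
        return
    if count_points(geom) <= max_points:
        yield geom
        return
    # Split the bounding box into four quadrants and recurse into each
    minx, miny, maxx, maxy = geom.bounds
    midx, midy = (minx + maxx) / 2.0, (miny + maxy) / 2.0
    for quadrant in (box(minx, miny, midx, midy), box(midx, miny, maxx, midy),
                     box(minx, midy, midx, maxy), box(midx, midy, maxx, maxy)):
        yield from subdivide_geometry(geom.intersection(quadrant), max_points)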
def process_sites(session):
    run_sql(session, "Populate t2_survey_site",
        """INSERT INTO t2_survey_site (survey_id, site_id)
           SELECT t2_survey.id, t2_site.id
           FROM t2_survey, t2_site
           WHERE site_id = t2_site.id""")
    # Query OK, 14073 rows affected (0.68 sec)
    # Records: 14073  Duplicates: 0  Warnings: 0

    run_sql(session, "Populate t2_survey_site (spatial)",
        """INSERT INTO t2_survey_site
           SELECT t2_survey.id, t2_site.id
           FROM t2_site STRAIGHT_JOIN t2_survey USE INDEX (coords)
           WHERE site_id IS NULL
           AND t2_survey.search_type_id = t2_site.search_type_id
           AND ST_Intersects(geometry, coords)""")
    # Query OK, 278678 rows affected (15.77 sec)
    # Records: 278678  Duplicates: 0  Warnings: 0

    run_sql(session, "Populate standardised site surveys",
        """INSERT INTO t2_processed_survey (raw_survey_id, site_id, search_type_id, start_date_y, start_date_m, source_id, experimental_design_type_id)
           SELECT t2_survey.id, t2_survey_site.site_id, search_type_id, start_date_y, start_date_m, source_id, 1
           FROM t2_survey, t2_survey_site
           WHERE t2_survey.id = t2_survey_site.survey_id""")
    # Query OK, 292751 rows affected (5.00 sec)
    # Records: 292751  Duplicates: 0  Warnings: 0

    run_sql(session, "Populate presences / non-pseudo-absences",
        """INSERT INTO t2_processed_sighting (survey_id, taxon_id, count, unit_id, pseudo_absence)
           SELECT t2_processed_survey.id, t2_ultrataxon_sighting.taxon_id, count, unit_id, 0
           FROM t2_ultrataxon_sighting, t2_sighting, t2_processed_survey
           WHERE t2_ultrataxon_sighting.sighting_id = t2_sighting.id
           AND t2_sighting.survey_id = t2_processed_survey.raw_survey_id
           AND t2_processed_survey.experimental_design_type_id = 1""")
    # Query OK, 5775718 rows affected (2 min 35.55 sec)
    # Records: 5775718  Duplicates: 0  Warnings: 0

    log.info("Identify taxa for each site based on alpha hulls")

    taxa = [taxon_id for (taxon_id,) in session.execute(
        "SELECT DISTINCT taxon_id FROM taxon_presence_alpha_hull_subdiv").fetchall()]

    session.execute("""DROP TABLE IF EXISTS tmp_taxon_site""")

    # Note: for some reason CREATE TEMPORARY TABLE doesn't work as expected - the table seems to be
    # empty by the time we get to the next step. I'm guessing after some kind of timeout the
    # transaction gets rolled back(?)
    session.execute("""CREATE TABLE tmp_taxon_site (
        site_id INT NOT NULL,
        taxon_id CHAR(6) NOT NULL,
        INDEX (taxon_id, site_id)
    )""")

    # The next step was originally a very slow query, directly populating the tmp_taxon_site table.
    # Instead, I've broken the query down to process one taxon at a time.
    # Furthermore, we process taxa in parallel threads to fully utilise the CPU, and the results of
    # each query are then inserted in bulk on the main thread.
    # Time taken: 17 min 4 s
    for result, error in tqdm(run_parallel(get_taxon_sites, taxa), total=len(taxa)):
        # Perform bulk insert on main thread
        if len(result) > 0:
            insert_data = [{'site_id': site_id, 'taxon_id': taxon_id} for site_id, taxon_id in result]
            session.execute(
                """INSERT INTO tmp_taxon_site (site_id, taxon_id) VALUES (:site_id, :taxon_id)""",
                insert_data)

    log.info("Insert pseudo absences")

    # This next step originally ran in about 15 minutes on my laptop as a single query, but took
    # forever on the server (I gave up after a couple of hours), so I've split it up by taxon, which
    # is a bit slower overall but at least you can see progress.

    # Create a temporary table so we aren't inserting into and selecting from the same table
    session.execute("""CREATE TEMPORARY TABLE tmp_processed_sighting
        SELECT survey_id, taxon_id, count, unit_id, pseudo_absence
        FROM t2_processed_sighting
        WHERE FALSE""")

    # Running this in parallel originally resulted in MySQL deadlock errors, but at the time we were
    # inserting into and selecting from the same table. It might be worth trying a parallel approach
    # again.
    for taxon_id in tqdm(taxa):
        # This query is a bit tricky. We do a left join to find taxa that are not present for a
        # survey, and match on t2_processed_sighting.id IS NULL to generate the pseudo-absences.
        session.execute("""INSERT INTO tmp_processed_sighting (survey_id, taxon_id, count, unit_id, pseudo_absence)
            SELECT t2_processed_survey.id, tmp_taxon_site.taxon_id, 0, 2, 1
            FROM t2_survey
            INNER JOIN t2_survey_site ON t2_survey.id = t2_survey_site.survey_id
            INNER JOIN t2_processed_survey ON t2_survey.id = t2_processed_survey.raw_survey_id
                AND t2_processed_survey.experimental_design_type_id = 1
            INNER JOIN tmp_taxon_site ON t2_survey_site.site_id = tmp_taxon_site.site_id
                AND taxon_id = :taxon_id
            LEFT JOIN t2_processed_sighting ON t2_processed_sighting.survey_id = t2_processed_survey.id
                AND t2_processed_sighting.taxon_id = tmp_taxon_site.taxon_id
            WHERE t2_processed_sighting.id IS NULL""",
            {'taxon_id': taxon_id})

    session.execute("""INSERT INTO t2_processed_sighting (survey_id, taxon_id, count, unit_id, pseudo_absence)
        SELECT survey_id, taxon_id, count, unit_id, pseudo_absence
        FROM tmp_processed_sighting""")

    session.execute("""DROP TABLE tmp_taxon_site""")
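# get_taxon_sites, run on worker threads above, is assumed to return
# (site_id, taxon_id) pairs for sites intersecting the taxon's subdivided alpha
# hull. A hypothetical sketch of such a worker (the actual query is not shown in
# this excerpt):
def get_taxon_sites(taxon_id):
    session = get_session()
    try:
        return session.execute("""SELECT DISTINCT t2_site.id, hull.taxon_id
            FROM t2_site, taxon_presence_alpha_hull_subdiv hull
            WHERE hull.taxon_id = :taxon_id
            AND ST_Intersects(t2_site.geometry, hull.geometry)""",
            {'taxon_id': taxon_id}).fetchall()
    finally:
        session.close()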
def process_database(species=None, commit=False):
    """
    Calculates spatial representativeness using alpha hulls

    Generates alpha hulls from each source x taxon combination

    Intersects alpha hulls with range layers, and then calculates the percentage of range covered
    """
    session = get_session()

    if commit:
        if species is None:
            session.execute("DELETE FROM taxon_source_alpha_hull")
        else:
            session.execute(
                """DELETE FROM taxon_source_alpha_hull
                   WHERE taxon_id IN (SELECT id FROM taxon WHERE spno IN (%s))""" % sql_list_placeholder('species', species),
                sql_list_argument('species', species))
        session.commit()

    db_proj = pyproj.Proj('+init=EPSG:4326')  # Database always uses WGS84
    working_proj = pyproj.Proj('+init=EPSG:3112')  # GDA94 / Geoscience Australia Lambert - so that we can buffer in metres

    # Load coastal shapefile
    coastal_shape_filename = tsx.config.config.get("processing.alpha_hull", "coastal_shp")
    with fiona.open(coastal_shape_filename, 'r') as coastal_shape:
        # Convert from fiona dictionary to shapely geometry and reproject
        coastal_shape = reproject(shape(coastal_shape[0]['geometry']), pyproj.Proj(coastal_shape.crs), working_proj)

    # Simplify coastal boundary - makes things run ~20X faster
    log.info("Simplifying coastal boundary")
    coastal_shape = coastal_shape.buffer(10000).simplify(10000)

    log.info("Generating alpha shapes")

    for data_type in (1, 2):
        log.info("Processing type %s data" % data_type)

        # Process a single taxon.
        # This gets run off the main thread.
        def process(taxon_id):
            session = get_session()
            try:
                # Load core range geometry
                core_range_geom = reproject(get_core_range_geometry(session, taxon_id), db_proj, working_proj).buffer(0).intersection(coastal_shape)

                for source_id in get_source_ids(session, data_type, taxon_id):
                    log.info("Processing taxon_id: %s, source_id: %s" % (taxon_id, source_id))

                    # Get raw points from DB
                    raw_points = get_raw_points(session, data_type, taxon_id, source_id)

                    empty = len(raw_points) < 4
                    if empty:
                        log.info("Taxon %s: not enough points to create alpha hull (%s)" % (taxon_id, len(raw_points)))

                    if not empty:
                        # Read points from database
                        points = [reproject(p, db_proj, working_proj) for p in raw_points]

                        # Generate alpha shape
                        alpha_shp = make_alpha_hull(
                            points=points,
                            coastal_shape=None,
                            thinning_distance=tsx.config.config.getfloat('processing.alpha_hull', 'thinning_distance'),
                            alpha=tsx.config.config.getfloat('processing.alpha_hull', 'alpha'),
                            hullbuffer_distance=tsx.config.config.getfloat('processing.alpha_hull', 'hullbuffer_distance'),
                            isolatedbuffer_distance=tsx.config.config.getfloat('processing.alpha_hull', 'isolatedbuffer_distance'))

                        # Clean up geometry
                        alpha_shp = alpha_shp.buffer(0)

                        if core_range_geom.area == 0:
                            log.info("Core range geometry area is zero")
                            empty = True
                        else:
                            # Intersect alpha hull with core range
                            intersected_alpha = to_multipolygon(core_range_geom.intersection(alpha_shp))
                            empty = intersected_alpha.is_empty

                    if empty:
                        session.execute("""INSERT INTO taxon_source_alpha_hull (source_id, taxon_id, data_type, core_range_area_in_m2, alpha_hull_area_in_m2)
                            VALUES (:source_id, :taxon_id, :data_type, 0, 0)""", {
                                'source_id': source_id,
                                'taxon_id': taxon_id,
                                'data_type': data_type
                            })
                    else:
                        session.execute("""INSERT INTO taxon_source_alpha_hull (source_id, taxon_id, data_type, geometry, core_range_area_in_m2, alpha_hull_area_in_m2)
                            VALUES (:source_id, :taxon_id, :data_type, ST_GeomFromWKB(_BINARY :geom_wkb), :core_range_area, :alpha_hull_area)""", {
                                'source_id': source_id,
                                'taxon_id': taxon_id,
                                'data_type': data_type,
                                'geom_wkb': shapely.wkb.dumps(reproject(intersected_alpha, working_proj, db_proj)),
                                'core_range_area': core_range_geom.area,
                                'alpha_hull_area': intersected_alpha.area
                            })

                if commit:
                    session.commit()
            except Exception:
                log.exception("Exception processing alpha hull")
                raise
            finally:
                session.close()

        taxa = get_taxa(session, data_type, species)

        # This is important because we are about to spawn child processes, and this stops them
        # attempting to share the same database connection pool
        session.close()
        tsx.db.connect.engine.dispose()

        # Process all the species in parallel
        for result, error in tqdm(run_parallel(process, taxa, use_processes=True), total=len(taxa)):
            if error:
                print(error)
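# Example invocation, e.g. from an interactive session or a hypothetical CLI
# wrapper (the spno values below are placeholders, not real species numbers):
if __name__ == '__main__':
    # Dry run for two species; pass commit=True to persist results to the database
    process_database(species=[123, 456], commit=False)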