def main(args, session):
    if args.debug_recording_ids:
        logging.info('Loading specified recordings')
        recording_ids = args.debug_recording_ids.split(',')
        recordings = session.query(Recording)\
            .filter(Recording.recording_id.in_(recording_ids))\
            .all()
        for recording in recordings:
            logging.info(f'Processing recording {recording.recording_id}')
            trim_recording(recording,
                           skip_if_exists=False,
                           skip_write=True,
                           debug_otsu_threshold=args.debug_otsu_threshold,
                           debug_utterances=args.debug_utterances)
        return

    logging.info('Loading selected recordings')
    selected_recordings = session.query(Recording).join(SelectedRecording).all()

    logging.info('Fetching and trimming recordings')
    # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python#35134329
    original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    with multiprocessing.pool.Pool(args.trim_recordings_process_jobs) as pool:
        signal.signal(signal.SIGINT, original_sigint_handler)
        for _output_file_name in progress.percent(
                pool.imap(_process_recording,
                          [([selected_recording],
                            {'skip_if_exists': not args.retrim_recordings})
                           for selected_recording in selected_recordings]),
                len(selected_recordings)):
            pass
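

# Illustrative sketch (hypothetical; the real _process_recording in this
# module is not shown above): pool.imap only passes a single argument per
# item, so each work item is packed as an (args, kwargs) tuple and the worker
# presumably unpacks it and forwards to trim_recording. The name and exact
# return value are assumptions; the loop above ignores the result anyway.
def _process_recording_sketch(packed):
    trim_args, trim_kwargs = packed
    return trim_recording(*trim_args, **trim_kwargs)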
def main(args, session):
    global _args  # pylint: disable=global-statement
    _args = args

    logging.info('Fetching image records for selected species')
    images = session.query(Image)\
        .join(Species, Species.species_id == Image.species_id)\
        .join(SelectedSpecies)\
        .all()

    logging.info('Listing existing images')
    old_images = set(os.listdir(args.image_output_dir))

    logging.info('Resizing images')
    # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python#35134329
    original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    with multiprocessing.pool.Pool(args.image_process_jobs) as pool:
        signal.signal(signal.SIGINT, original_sigint_handler)
        for image_file_name in progress.percent(
                pool.imap(_process_image, images), len(images)):
            if image_file_name:
                old_images.discard(image_file_name)

    logging.info(f'Deleting {len(old_images)} old images')
    for old_image in old_images:
        try:
            os.remove(os.path.join(args.image_output_dir, old_image))
        except OSError as ex:
            logging.warning(f'Could not delete {old_image}: {ex}')
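

# Illustrative sketch (hypothetical; the actual _process_image worker in this
# module is not shown): it receives one Image record, resizes the source
# picture into _args.image_output_dir, and returns the output file name so the
# loop above can exclude it from the stale-file cleanup (or None on failure).
# image.image_url and _args.image_size are assumed names for this sketch; the
# use of Pillow is also an assumption, not the project's confirmed approach.
def _process_image_sketch(image):
    import io
    import urllib.request
    from PIL import Image as PILImage  # Local imports keep the sketch self-contained.
    try:
        with urllib.request.urlopen(image.image_url) as response:
            data = response.read()
        with PILImage.open(io.BytesIO(data)) as im:
            # Shrink in place to fit within the configured bounding box.
            im.thumbnail((_args.image_size, _args.image_size))
            im.save(os.path.join(_args.image_output_dir, image.output_file_name))
        return image.output_file_name
    except Exception as ex:  # pylint: disable=broad-except
        logging.warning(f'Could not process {image.output_file_name}: {ex}')
        return None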
def main(args, session):
    if args.reanalyze_recordings:
        logging.info('Deleting all sonogram analyses')
        session.query(SonogramAnalysis).delete()

    logging.info('Fetching all recordings for selected species')
    recordings = session.query(Recording)\
        .join(Species, Species.scientific_name == Recording.scientific_name)\
        .join(SelectedSpecies)\
        .filter(Recording.sonogram_url_small != None,  # pylint: disable=singleton-comparison
                Recording.sonogram_url_small != '',
                ~Recording.sonogram_analysis.has())\
        .all()

    logging.info('Analyzing recordings')
    # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python#35134329
    original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    with multiprocessing.pool.Pool(args.analysis_jobs) as pool:
        signal.signal(signal.SIGINT, original_sigint_handler)
        for recording_id, sonogram_quality in progress.percent(
                pool.imap(_analyze,
                          [(r.recording_id, r.sonogram_url_small)
                           for r in recordings]),
                len(recordings)):
            session.add(SonogramAnalysis(
                recording_id=recording_id,
                sonogram_quality=sonogram_quality))
            session.commit()
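

# Illustrative sketch (hypothetical; the real _analyze is defined elsewhere in
# this module): the pool.imap call above passes (recording_id,
# sonogram_url_small) tuples and expects (recording_id, sonogram_quality)
# tuples back, where sonogram_quality is the numeric score stored in
# SonogramAnalysis. The quality heuristic below is a placeholder only.
def _analyze_sketch(item):
    import urllib.request  # Local import to keep the sketch self-contained.
    recording_id, sonogram_url_small = item
    with urllib.request.urlopen(sonogram_url_small) as response:
        sonogram_bytes = response.read()
    # Placeholder score; the real analysis presumably inspects the sonogram
    # image itself rather than just its size.
    sonogram_quality = float(len(sonogram_bytes))
    return recording_id, sonogram_quality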
def main(args, session):
    logging.info('Deleting existing xeno-canto recordings')
    session.query(Recording).filter(Recording.source == 'xc').delete()

    fetcher = Fetcher(cache_group='xc_api',
                      pool_size=args.recording_load_jobs,
                      clear_cache=args.clear_recordings_cache)
    query = XcQuery({'nr': f'{args.start_xc_id}-{args.end_xc_id}'}, fetcher)
    first_page = query.fetch_page(1)
    num_pages = first_page['numPages']
    num_recordings = int(first_page['numRecordings'])
    logging.info(f'Found {num_pages} pages, {num_recordings} recordings')
    with multiprocessing.pool.ThreadPool(args.recording_load_jobs) as pool:
        for page in progress.percent(
                itertools.chain([first_page],
                                pool.imap(query.fetch_page,
                                          range(2, num_pages + 1))),
                num_pages):
            try:
                # Allow replacements in case the API shifts pages around
                # (it seems to do that, probably when new recordings are
                # added during the run).
                recordings = [_parse_recording(r) for r in page['recordings']]
                session.bulk_save_objects_with_replace(recordings)
            except Exception:
                logging.error(
                    f'Error parsing page:\n{json.dumps(page, indent=" ")}',
                    exc_info=True)
                raise
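

# Illustrative sketch (hypothetical; the real _parse_recording is not shown):
# it maps one xeno-canto API record (a JSON dict) onto the Recording model.
# The field names below follow the public xeno-canto API ('id', 'gen', 'sp',
# 'file', 'sono', 'url'), but the exact mapping and the full set of Recording
# columns are assumptions, not the project's confirmed schema.
def _parse_recording_sketch(r):
    return Recording(
        recording_id=f'xc:{r["id"]}',
        source='xc',
        scientific_name=f'{r["gen"]} {r["sp"]}',
        url=r['url'],
        audio_url=r['file'],
        sonogram_url_small=r['sono']['small'])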
def main(args, session):
    logging.info('Deleting existing regions')
    session.query(Region).delete()

    logging.info('Loading species')
    clements_to_ioc = {
        species.scientific_name_clements: species.scientific_name
        for species in session.query(Species)
        if species.scientific_name_clements
    }

    logging.info('Processing regions')
    regions = []
    warned_scientific_names = set()
    with open(args.ebd_regions_file, 'rt') as input_file:
        # Hardcoding the CSV length here is awful, but it's just for progress
        # reporting anyway.
        for row in progress.percent(csv.DictReader(input_file), 14835):
            region_id = int(row['region_id'])
            centroid_lat = float(row['centroid_lat'])
            centroid_lon = float(row['centroid_lon'])
            observations_by_scientific_name = json.loads(
                row['observations_by_scientific_name'])
            species_weight_by_scientific_name = {}
            for (scientific_name_clements,
                 num_observations) in observations_by_scientific_name.items():
                scientific_name = clements_to_ioc.get(scientific_name_clements)
                if not scientific_name:
                    if (scientific_name_clements not in warned_scientific_names
                            and '/' not in scientific_name_clements  # Uncertainties.
                            and 'sp.' not in scientific_name_clements.split(' ')  # Only genus, not species.
                            and 'x' not in scientific_name_clements.split(' ')  # Hybrids.
                            and 'undescribed' not in scientific_name_clements):  # Undescribed forms.
                        # This happens a fair bit; in the "IOC vs other lists"
                        # spreadsheet these rows are typically reddish brown,
                        # indicating "species not recognized by IOC".
                        logging.warning(
                            f'Scientific name {scientific_name_clements} not found '
                            '(probably recognized by Clements but not IOC)')
                        warned_scientific_names.add(scientific_name_clements)
                    continue
                species_weight_by_scientific_name[scientific_name] = num_observations
            regions.append(
                Region(region_id=region_id,
                       lat_start=centroid_lat - _SIZE_LAT / 2,
                       lat_end=centroid_lat + _SIZE_LAT / 2,
                       lon_start=centroid_lon - _SIZE_LON / 2,
                       lon_end=centroid_lon + _SIZE_LON / 2,
                       centroid_lat=centroid_lat,
                       centroid_lon=centroid_lon,
                       species_weight_by_scientific_name=species_weight_by_scientific_name))
    session.bulk_save_objects(regions)
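

# Hypothetical sketch: _SIZE_LAT and _SIZE_LON are module-level constants
# giving the extent of each region cell in degrees, so every Region spans a
# fixed-size box centred on the eBird centroid above. The values below are
# placeholders for illustration, not the project's actual grid size.
_SIZE_LAT = 1.0  # Degrees of latitude per region cell (assumed value).
_SIZE_LON = 1.0  # Degrees of longitude per region cell (assumed value).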
def main(args, session):
    global _args  # pylint: disable=global-statement
    _args = args

    logging.info('Deleting existing image records')
    session.query(Image).delete()

    logging.info('Loading species list')
    species_list = session.query(Species).all()

    logging.info('Fetching image metadata')
    # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python#35134329
    original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    with multiprocessing.pool.Pool(args.image_load_jobs) as pool:
        signal.signal(signal.SIGINT, original_sigint_handler)
        for image in progress.percent(pool.imap(_process_image, species_list),
                                      len(species_list)):
            if image:
                session.add(image)
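

# Illustrative sketch (hypothetical; the real metadata worker is not shown):
# the contract implied by the loop above is Species in, Image (or None) out.
# The module-level _args global set in main() is how configuration reaches the
# pool workers, since imap passes only one argument and forked workers inherit
# module globals. The lookup below is an empty placeholder only.
def _process_image_metadata_sketch(species):
    # Placeholder; the real implementation presumably queries an external
    # source (configured via _args) and fills in the Image columns referenced
    # elsewhere (output_file_name, license_name, image_width, image_height).
    metadata = {}
    if not metadata:
        return None
    return Image(species_id=species.species_id, **metadata)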
def main(_args, session):
    logging.info('Deleting all recording selections')
    session.query(SelectedRecording).delete()

    logging.info('Ordering selected species by importance')
    selected_species = session.query(Species)\
        .join(SelectedSpecies)\
        .order_by(SelectedSpecies.ranking)\
        .all()

    logging.info('Loading recording overrides')
    recording_overrides = RecordingOverrides()

    logging.info('Selecting best recordings for each species')
    # Not parallelized, because it's mostly database work.
    for species in progress.percent(selected_species):
        select_recordings(session, species, recording_overrides,
                          assume_deleted=True)
def main(args, _session):
    output_dir = args.map_tiles_output_dir

    logging.info('Deleting existing map tiles')
    if os.path.isdir(output_dir):
        shutil.rmtree(output_dir)

    tiles = []
    for z in range(0, args.max_zoom_level + 1):
        n = 2**z
        for x in range(n):
            for y in range(n):
                tiles.append({'z': z, 'x': x, 'y': y})
    side = 256 * 2**args.max_zoom_level
    logging.info(f'Largest zoom level: {args.max_zoom_level} ({side} pixels)')

    logging.info(f'Fetching and optimizing {len(tiles)} map tiles')
    fetcher = Fetcher('map_tiles', pool_size=1)
    tile_format = '{z}_{x}_{y}.png'
    orig_data_size = 0
    opt_data_size = 0
    os.makedirs(output_dir, exist_ok=True)
    for tile in progress.percent(tiles):
        data = fetcher.fetch_cached(args.map_tiles_url_format.format(**tile))
        output_file = os.path.join(output_dir, tile_format.format(**tile))
        with open(output_file, 'wb') as f:
            f.write(data)
        subprocess.run(['optipng', '-quiet', output_file], check=True)
        orig_data_size += len(data)
        opt_data_size += os.path.getsize(output_file)

    logging.info(
        f'Total size of map tiles: {orig_data_size} bytes originally, '
        f'{opt_data_size} bytes after optipng')
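

# Hypothetical usage note: args.map_tiles_url_format is expected to be a
# slippy-map style template with {z}, {x} and {y} placeholders, mirroring
# tile_format above. An example value (illustrative only, not the project's
# actual tile source):
#
#   --map_tiles_url_format 'https://example.org/tiles/{z}/{x}/{y}.png'
#
# so args.map_tiles_url_format.format(z=0, x=0, y=0) yields
# 'https://example.org/tiles/0/0/0.png' and the tile is written as '0_0_0.png'.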
def main(args, session):
    logging.info('Deleting existing species selections')
    session.query(SelectedSpecies).delete()

    logging.info(
        'Filtering species by sufficient available recordings and images')
    candidate_species = session.query(Species)\
        .filter(text(
            '''
            (
                select count(*)
                from recordings
                where recordings.scientific_name = species.scientific_name
                and recordings.url is not null and recordings.url <> ''
                and recordings.audio_url is not null and recordings.audio_url <> ''
                and recordings.sonogram_url_small is not null
                and recordings.sonogram_url_small <> ''
            ) >= :min_num_recordings
            and exists (
                select *
                from images
                where images.species_id = species.species_id
                and output_file_name is not null and output_file_name <> ''
                and license_name is not null and license_name <> ''
                and image_width >= :min_image_size
                and image_height >= :min_image_size
            )
            '''))\
        .params(
            num_selected_species=args.num_selected_species,
            min_image_size=args.min_image_size,
            min_num_recordings=args.min_num_recordings)\
        .all()

    logging.info('Counting number of regions in which species occur')
    num_regions_by_species = collections.defaultdict(int)
    for region in progress.percent(session.query(Region).all()):
        for scientific_name in region.scientific_names:
            num_regions_by_species[scientific_name] += 1

    logging.info('Sorting candidate species by number of regions')
    candidate_species.sort(
        key=lambda s: num_regions_by_species.get(s.scientific_name, 0),
        reverse=True)

    logging.info('Selecting top species')
    selected_species = candidate_species[:args.num_selected_species]
    for index, species in enumerate(progress.percent(selected_species)):
        session.add(
            SelectedSpecies(species_id=species.species_id, ranking=index))

    logging.info(
        f'Selected {session.query(SelectedSpecies).count()} species; top 10: '
        f'{", ".join(s.scientific_name for s in selected_species[:10])}')
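

# Hypothetical sketch: the region-counting loop above reads
# region.scientific_names, presumably derived from the
# species_weight_by_scientific_name mapping stored when regions are created
# (see the region-loading main above). A minimal property with that behaviour
# might look like this; the class name and storage are assumptions, not the
# actual Region model definition.
class _RegionSketch:
    def __init__(self, species_weight_by_scientific_name):
        self.species_weight_by_scientific_name = species_weight_by_scientific_name

    @property
    def scientific_names(self):
        # One entry per species observed in the region, regardless of weight.
        return list(self.species_weight_by_scientific_name.keys())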
def main(args, session):
    if session.query(City).count() and not args.reselect_cities:
        logging.info('Selected cities exist and reselection not requested')
        return

    logging.info('Deleting existing cities')
    session.query(City).delete()

    cities = []
    cities_by_id = {}
    points = {}
    with zipfile.ZipFile(args.cities_file, 'r') as zip_file:
        file_name = zip_file.namelist()[0]
        logging.info(f'Loading {file_name} from {args.cities_file}')
        with zip_file.open(file_name, 'r') as tsv_file:
            for line in io.TextIOWrapper(tsv_file):
                row = line.rstrip('\n').split('\t')
                city = City(city_id=int(row[0]),
                            name=row[1],
                            lat=float(row[4]),
                            lon=float(row[5]),
                            population=int(row[14]))
                if city.population <= 0:
                    continue
                cities.append(city)
                cities_by_id[city.city_id] = city
                points[city.city_id] = _lat_lon_to_point(city.lat, city.lon)
                # if len(cities) >= 10000:
                #     break
    max_population = max(city.population for city in cities)
    logging.info(
        f'Loaded {len(cities)} cities; maximum population is {max_population}')

    logging.info('Indexing cities')
    prop = rtree.index.Property()
    prop.dimension = 3
    index = rtree.index.Index(properties=prop)
    for city in progress.percent(cities):
        city_point = points[city.city_id]
        index.insert(city.city_id, (*city_point, *city_point))

    logging.info('Computing city weights')
    sigma = args.cities_select_population_sigma_km
    max_distance = _great_circle_to_cartesian_distance(3.0 * sigma)
    weights = {}
    for city in progress.percent(cities):
        city_point = points[city.city_id]
        region_population = 0
        for nearby_id in index.intersection(_box_around(city_point, max_distance)):
            nearby_distance = _cartesian_distance(points[nearby_id], city_point)
            if nearby_distance <= max_distance:
                d = nearby_distance / sigma
                population = cities_by_id[nearby_id].population * math.exp(-0.5 * d * d)
                region_population += population
        weights[city.city_id] = city.population / region_population

    logging.info(f'Selecting {args.cities_to_select} cities')
    cities.sort(key=lambda city: weights[city.city_id], reverse=True)
    selected_cities = cities[:args.cities_to_select]

    logging.info(f'Storing {len(selected_cities)} selected cities')
    session.bulk_save_objects(selected_cities)
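

# Illustrative sketches (hypothetical; the real helpers are defined elsewhere
# in this module): the weighting code above assumes each city is embedded as a
# 3D cartesian point so an rtree box query can find neighbours, with the
# embedding scaled so that chord (cartesian) distance approximates great-circle
# distance in km for nearby points. Names, the _sketch suffixes, and the Earth
# radius constant are assumptions for illustration only.

_EARTH_RADIUS_KM = 6371.0  # Assumed scale for the embedding.


def _lat_lon_to_point_sketch(lat, lon):
    '''Maps (lat, lon) in degrees to a 3D point on a sphere of Earth radius.'''
    lat_rad = math.radians(lat)
    lon_rad = math.radians(lon)
    return (_EARTH_RADIUS_KM * math.cos(lat_rad) * math.cos(lon_rad),
            _EARTH_RADIUS_KM * math.cos(lat_rad) * math.sin(lon_rad),
            _EARTH_RADIUS_KM * math.sin(lat_rad))


def _cartesian_distance_sketch(p, q):
    '''Straight-line (chord) distance between two 3D points.'''
    return math.sqrt(sum((a - b)**2 for a, b in zip(p, q)))


def _great_circle_to_cartesian_distance_sketch(distance_km):
    '''Chord length corresponding to a great-circle distance in km.'''
    return 2.0 * _EARTH_RADIUS_KM * math.sin(distance_km / (2.0 * _EARTH_RADIUS_KM))


def _box_around_sketch(point, radius):
    '''Axis-aligned 3D bounding box around a point, in rtree (mins, maxs) order.'''
    return (point[0] - radius, point[1] - radius, point[2] - radius,
            point[0] + radius, point[1] + radius, point[2] + radius)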