Example #1
def main(args, session):
    if args.debug_recording_ids:
        logging.info('Loading specified recordings')
        recording_ids = args.debug_recording_ids.split(',')
        recordings = session.query(Recording)\
            .filter(Recording.recording_id.in_(recording_ids))\
            .all()
        for recording in recordings:
            logging.info(f'Processing recording {recording.recording_id}')
            trim_recording(recording,
                           skip_if_exists=False,
                           skip_write=True,
                           debug_otsu_threshold=args.debug_otsu_threshold,
                           debug_utterances=args.debug_utterances)
        return

    logging.info('Loading selected recordings')
    selected_recordings = session.query(Recording).join(
        SelectedRecording).all()

    logging.info('Fetching and trimming recordings')
    # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python#35134329
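    # Setting SIG_IGN before creating the pool makes the worker processes
    # inherit it, so Ctrl+C is delivered only to this parent process, which
    # restores its own handler once the pool exists.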
    original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    with multiprocessing.pool.Pool(args.trim_recordings_process_jobs) as pool:
        signal.signal(signal.SIGINT, original_sigint_handler)
        for _output_file_name in progress.percent(
                pool.imap(_process_recording,
                          [([selected_recording], {
                              'skip_if_exists': not args.retrim_recordings
                          }) for selected_recording in selected_recordings]),
                len(selected_recordings)):
            pass
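
The worker `_process_recording` is not shown here. From the call site it receives a single (positional args, keyword args) tuple, because `Pool.imap` passes only one argument per item, and the loop treats its return value as an output file name. A minimal sketch under those assumptions:

def _process_recording(args_and_kwargs):
    # Hypothetical sketch: unpack the ([recording], {options}) tuple built in
    # main() and forward it to trim_recording, whose return value main()
    # consumes as _output_file_name.
    args, kwargs = args_and_kwargs
    return trim_recording(*args, **kwargs)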
Example #2
def main(args, session):
    global _args  # pylint: disable=global-statement
    _args = args

    logging.info('Fetching image records for selected species')
    images = session.query(Image)\
        .join(Species, Species.species_id == Image.species_id)\
        .join(SelectedSpecies)\
        .all()

    logging.info('Listing existing images')
    old_images = set(os.listdir(args.image_output_dir))

    logging.info('Resizing images')
    # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python#35134329
    original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    with multiprocessing.pool.Pool(args.image_process_jobs) as pool:
        signal.signal(signal.SIGINT, original_sigint_handler)
        for image_file_name in progress.percent(
                pool.imap(_process_image, images), len(images)):
            if image_file_name:
                old_images.discard(image_file_name)

    logging.info(f'Deleting {len(old_images)} old images')
    for old_image in old_images:
        try:
            os.remove(os.path.join(args.image_output_dir, old_image))
        except OSError as ex:
            logging.warning(f'Could not delete {old_image}: {ex}')
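
`_process_image` is defined elsewhere in the module; judging from the call site, it reads the global `_args`, resizes one image, and returns the written file name (or something falsy when the image was skipped), which main() then uses to protect that file from the cleanup pass. A rough sketch assuming a Pillow-based resize and made-up `image_input_dir` and `image_size` arguments:

from PIL import Image as PilImage  # assumed dependency; aliased to avoid the Image model

def _process_image(image):
    # Hypothetical sketch; the real resizing logic is not shown. Assumes an
    # output_file_name column (see the SQL in Example #9) plus hypothetical
    # _args.image_input_dir and _args.image_size parameters.
    if not image.output_file_name:
        return None
    input_path = os.path.join(_args.image_input_dir, image.output_file_name)
    output_path = os.path.join(_args.image_output_dir, image.output_file_name)
    try:
        with PilImage.open(input_path) as pil_image:
            pil_image.thumbnail((_args.image_size, _args.image_size))
            pil_image.save(output_path)
    except OSError as ex:
        logging.warning(f'Could not resize {image.output_file_name}: {ex}')
        return None
    return image.output_file_name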
Example #3
def main(args, session):
    if args.reanalyze_recordings:
        logging.info('Deleting all sonogram analyses')
        session.query(SonogramAnalysis).delete()

    logging.info('Fetching all recordings for selected species')
    recordings = session.query(Recording)\
        .join(Species, Species.scientific_name == Recording.scientific_name)\
        .join(SelectedSpecies)\
        .filter(Recording.sonogram_url_small != None, # pylint: disable=singleton-comparison
                Recording.sonogram_url_small != '',
                ~Recording.sonogram_analysis.has())\
        .all()

    logging.info('Analyzing recordings')
    # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python#35134329
    original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    with multiprocessing.pool.Pool(args.analysis_jobs) as pool:
        signal.signal(signal.SIGINT, original_sigint_handler)

        for recording_id, sonogram_quality in progress.percent(
                pool.imap(_analyze, [(r.recording_id, r.sonogram_url_small) for r in recordings]),
                len(recordings)):
            session.add(SonogramAnalysis(
                recording_id=recording_id,
                sonogram_quality=sonogram_quality))
            session.commit()
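
`_analyze` receives (recording_id, sonogram_url_small) tuples and must return (recording_id, sonogram_quality) pairs, since that is what the loop unpacks. A sketch with a stand-in quality metric; the real scoring heuristic is not shown in the examples:

_fetcher = Fetcher(cache_group='xc_sonograms', pool_size=1)  # hypothetical cache group

def _analyze(recording_id_and_sonogram_url):
    # Hypothetical sketch: download the small sonogram and score it. Using
    # the raw image size as the score is only a placeholder for the real
    # analysis.
    recording_id, sonogram_url = recording_id_and_sonogram_url
    image_data = _fetcher.fetch_cached(sonogram_url)
    return recording_id, len(image_data)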
Example #4
def main(args, session):
    logging.info('Deleting existing xeno-canto recordings')
    session.query(Recording).filter(Recording.source == 'xc').delete()

    fetcher = Fetcher(cache_group='xc_api',
                      pool_size=args.recording_load_jobs,
                      clear_cache=args.clear_recordings_cache)
    query = XcQuery({'nr': f'{args.start_xc_id}-{args.end_xc_id}'}, fetcher)
    first_page = query.fetch_page(1)
    num_pages = first_page['numPages']
    num_recordings = int(first_page['numRecordings'])
    logging.info(f'Found {num_pages} pages, {num_recordings} recordings')
    with multiprocessing.pool.ThreadPool(args.recording_load_jobs) as pool:
        for page in progress.percent(
                itertools.chain([first_page],
                                pool.imap(query.fetch_page,
                                          range(2, num_pages + 1))),
                num_pages):
            try:
                # Allow replacements in case the API shifts pages around
                # (it seems to do that, probably when new recordings are
                # added during the run).
                recordings = [_parse_recording(r) for r in page['recordings']]
                session.bulk_save_objects_with_replace(recordings)
            except Exception:
                logging.error(
                    f'Error parsing page:\n{json.dumps(page, indent="  ")}',
                    exc_info=True)
                raise
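
`_parse_recording` turns one JSON record from the xeno-canto API into a `Recording`. The real field mapping is not included in this example; a minimal sketch using a few documented API fields (the recording_id scheme and the column set are assumptions based on the other examples):

def _parse_recording(r):
    # Hypothetical sketch; 'id', 'gen', 'sp', 'file' and 'sono' are fields of
    # the public xeno-canto API response, while the column names are assumed
    # from the SQL in Example #9 and the 'xc:' id prefix is made up.
    return Recording(recording_id=f'xc:{r["id"]}',
                     source='xc',
                     scientific_name=f'{r["gen"]} {r["sp"]}',
                     url=r.get('url', ''),
                     audio_url=r.get('file', ''),
                     sonogram_url_small=r.get('sono', {}).get('small', ''))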
Example #5
def main(args, session):
    logging.info('Deleting existing regions')
    session.query(Region).delete()

    logging.info('Loading species')
    clements_to_ioc = {
        species.scientific_name_clements: species.scientific_name
        for species in session.query(Species)
        if species.scientific_name_clements
    }

    logging.info('Processing regions')
    regions = []
    warned_scientific_names = set()
    with open(args.ebd_regions_file, 'rt') as input_file:
        # Hardcoding the CSV length here is awful but it's just for progress reporting anyway.
        for row in progress.percent(csv.DictReader(input_file), 14835):
            region_id = int(row['region_id'])
            centroid_lat = float(row['centroid_lat'])
            centroid_lon = float(row['centroid_lon'])
            observations_by_scientific_name = json.loads(
                row['observations_by_scientific_name'])
            species_weight_by_scientific_name = {}
            for (scientific_name_clements,
                 num_observations) in observations_by_scientific_name.items():
                scientific_name = clements_to_ioc.get(scientific_name_clements)
                if not scientific_name:
                    if (scientific_name_clements not in warned_scientific_names
                            # Skip uncertainties ('/'), genus-only names
                            # ('sp.'), hybrids ('x'), and undescribed forms.
                            and '/' not in scientific_name_clements
                            and 'sp.' not in scientific_name_clements.split(' ')
                            and 'x' not in scientific_name_clements.split(' ')
                            and 'undescribed' not in scientific_name_clements):
                        # This happens a fair bit; in the "IOC vs other lists"
                        # these rows are typically reddish brown, indicating
                        # "species not recognized by IOC".
                        logging.warning(
                            f'Scientific name {scientific_name_clements} not found '
                            '(probably recognized by Clements but not IOC)')
                        warned_scientific_names.add(scientific_name_clements)
                    continue
                species_weight_by_scientific_name[scientific_name] = num_observations
            regions.append(
                Region(region_id=region_id,
                       lat_start=centroid_lat - _SIZE_LAT / 2,
                       lat_end=centroid_lat + _SIZE_LAT / 2,
                       lon_start=centroid_lon - _SIZE_LON / 2,
                       lon_end=centroid_lon + _SIZE_LON / 2,
                       centroid_lat=centroid_lat,
                       centroid_lon=centroid_lon,
                       species_weight_by_scientific_name=species_weight_by_scientific_name))

    session.bulk_save_objects(regions)
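
The heart of the loop above is the remapping from Clements to IOC scientific names before the observation counts are stored as weights. A tiny self-contained illustration of that pattern (names invented for the example):

# Standalone illustration of the Clements-to-IOC remapping above.
clements_to_ioc = {'Regulus calendula': 'Corthylio calendula'}
observations = {'Regulus calendula': 12, 'Gallus sp.': 5}

weights = {}
for clements_name, num_observations in observations.items():
    ioc_name = clements_to_ioc.get(clements_name)
    if ioc_name:
        weights[ioc_name] = num_observations

print(weights)  # {'Corthylio calendula': 12}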
Example #6
def main(args, session):
    global _args  # pylint: disable=global-statement
    _args = args

    logging.info('Deleting existing image records')
    session.query(Image).delete()

    logging.info('Loading species list')
    species_list = session.query(Species).all()

    logging.info('Fetching image metadata')
    # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python#35134329
    original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    with multiprocessing.pool.Pool(args.image_load_jobs) as pool:
        signal.signal(signal.SIGINT, original_sigint_handler)
        for image in progress.percent(pool.imap(_process_image, species_list),
                                      len(species_list)):
            if image:
                session.add(image)
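
The `_process_image` here (a different module from Example #2) turns a `Species` into an `Image` record or `None`. The metadata lookup itself is not shown; a sketch of the expected shape, with the lookup left as an assumed helper:

def _process_image(species):
    # Hypothetical sketch: _find_photo is an assumed helper standing in for
    # whatever image provider the real code queries. The column names are
    # taken from the SQL in Example #9.
    metadata = _find_photo(species.scientific_name)
    if metadata is None:
        return None
    return Image(species_id=species.species_id,
                 output_file_name=metadata['file_name'],
                 license_name=metadata['license'],
                 image_width=metadata['width'],
                 image_height=metadata['height'])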
Example #7
def main(_args, session):
    logging.info('Deleting all recording selections')
    session.query(SelectedRecording).delete()

    logging.info('Ordering selected species by importance')
    selected_species = session.query(Species)\
        .join(SelectedSpecies)\
        .order_by(SelectedSpecies.ranking)\
        .all()

    logging.info('Loading recording overrides')
    recording_overrides = RecordingOverrides()

    logging.info('Selecting best recordings for each species')
    # Not parallelized, because it's mostly database work.
    for species in progress.percent(selected_species):
        select_recordings(session,
                          species,
                          recording_overrides,
                          assume_deleted=True)
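
`select_recordings` does the actual per-species work; its implementation is not part of this example. The `assume_deleted=True` flag presumably lets it skip clearing the species' previous selections, since the whole table was just emptied above. A plausible, heavily simplified sketch that ranks by the sonogram quality computed in Example #3:

def select_recordings(session, species, recording_overrides, assume_deleted=False):
    # Hypothetical sketch; the real ranking logic and the role of
    # recording_overrides are not shown in these examples. assume_deleted=True
    # signals that the caller already wiped SelectedRecording, so no
    # per-species cleanup is attempted here.
    best_recordings = session.query(Recording)\
        .join(SonogramAnalysis)\
        .filter(Recording.scientific_name == species.scientific_name)\
        .order_by(SonogramAnalysis.sonogram_quality.desc())\
        .limit(10)  # assumed cap per species
    for recording in best_recordings:
        session.add(SelectedRecording(recording_id=recording.recording_id))
    session.commit()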
Example #8
def main(args, _session):
    output_dir = args.map_tiles_output_dir

    logging.info('Deleting existing map tiles')
    if os.path.isdir(output_dir):
        shutil.rmtree(output_dir)

    tiles = []
    for z in range(0, args.max_zoom_level + 1):
        n = 2**z
        for x in range(n):
            for y in range(n):
                tiles.append({'z': z, 'x': x, 'y': y})

    side = 256 * 2**args.max_zoom_level
    logging.info(f'Largest zoom level: {args.max_zoom_level} ({side} pixels)')
    logging.info(f'Fetching and optimizing {len(tiles)} map tiles')
    fetcher = Fetcher('map_tiles', pool_size=1)

    tile_format = '{z}_{x}_{y}.png'
    orig_data_size = 0
    opt_data_size = 0
    os.makedirs(output_dir, exist_ok=True)
    for tile in progress.percent(tiles):
        data = fetcher.fetch_cached(args.map_tiles_url_format.format(**tile))
        output_file = os.path.join(output_dir, tile_format.format(**tile))
        with open(output_file, 'wb') as f:
            f.write(data)
        subprocess.run(['optipng', '-quiet', output_file], check=True)
        orig_data_size += len(data)
        opt_data_size += os.path.getsize(output_file)

    logging.info(
        f'Total size of map tiles: {orig_data_size} bytes originally, '
        f'{opt_data_size} bytes after optipng')
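
`args.map_tiles_url_format` is expanded with `str.format` from the per-tile dicts, so it is expected to contain `{z}`, `{x}` and `{y}` placeholders. For example, with a hypothetical tile server:

url_format = 'https://tiles.example.org/{z}/{x}/{y}.png'  # hypothetical URL
tile = {'z': 2, 'x': 1, 'y': 3}
print(url_format.format(**tile))  # https://tiles.example.org/2/1/3.png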
Example #9
def main(args, session):
    logging.info('Deleting existing species selections')
    session.query(SelectedSpecies).delete()

    logging.info(
        'Filtering species by sufficient available recordings and images')
    candidate_species = session.query(Species)\
        .filter(text(
            '''
            (
                select count(*)
                from recordings
                where
                    recordings.scientific_name = species.scientific_name
                    and recordings.url is not null
                    and recordings.url <> ''
                    and recordings.audio_url is not null
                    and recordings.audio_url <> ''
                    and recordings.sonogram_url_small is not null
                    and recordings.sonogram_url_small <> ''
            ) >= :min_num_recordings
            and
            exists (
                select *
                from images
                where
                    images.species_id = species.species_id
                    and output_file_name is not null
                    and output_file_name <> ''
                    and license_name is not null
                    and license_name <> ''
                    and image_width >= :min_image_size
                    and image_height >= :min_image_size
            )
            '''))\
        .params(
            min_image_size=args.min_image_size,
            min_num_recordings=args.min_num_recordings)\
        .all()

    logging.info('Counting number of regions in which species occur')
    num_regions_by_species = collections.defaultdict(int)
    for region in progress.percent(session.query(Region).all()):
        for scientific_name in region.scientific_names:
            num_regions_by_species[scientific_name] += 1

    logging.info('Sorting candidate species by number of regions')
    candidate_species.sort(
        key=lambda s: num_regions_by_species.get(s.scientific_name, 0),
        reverse=True)

    logging.info('Selecting top species')
    selected_species = candidate_species[:args.num_selected_species]
    for index, species in enumerate(progress.percent(selected_species)):
        session.add(
            SelectedSpecies(species_id=species.species_id, ranking=index))

    logging.info(
        f'Selected {session.query(SelectedSpecies).count()} species; top 10: '
        f'{", ".join(s.scientific_name for s in selected_species[:10])}')
Example #10
def main(args, session):
    if session.query(City).count() and not args.reselect_cities:
        logging.info('Selected cities exist and reselection not requested')
        return

    logging.info('Deleting existing cities')
    session.query(City).delete()

    cities = []
    cities_by_id = {}
    points = {}
    with zipfile.ZipFile(args.cities_file, 'r') as zip_file:
        file_name = zip_file.namelist()[0]
        logging.info(f'Loading {file_name} from {args.cities_file}')
        with zip_file.open(file_name, 'r') as tsv_file:
            # Explicit encoding: the file is expected to be UTF-8, so avoid
            # depending on the platform's default.
            for line in io.TextIOWrapper(tsv_file, encoding='utf-8'):
                row = line.rstrip('\n').split('\t')
                city = City(city_id=int(row[0]),
                            name=row[1],
                            lat=float(row[4]),
                            lon=float(row[5]),
                            population=int(row[14]))
                if city.population <= 0:
                    continue
                cities.append(city)
                cities_by_id[city.city_id] = city
                points[city.city_id] = _lat_lon_to_point(city.lat, city.lon)
                # Uncomment to cap the number of cities for faster test runs:
                # if len(cities) >= 10000:
                #     break
    max_population = max(city.population for city in cities)
    logging.info(
        f'Loaded {len(cities)} cities; maximum population is {max_population}')

    logging.info('Indexing cities')
    prop = rtree.index.Property()
    prop.dimension = 3
    index = rtree.index.Index(properties=prop)
    for city in progress.percent(cities):
        city_point = points[city.city_id]
        index.insert(city.city_id, (*city_point, *city_point))

    logging.info('Computing city weights')
    sigma = args.cities_select_population_sigma_km
    max_distance = _great_circle_to_cartesian_distance(3.0 * sigma)
    weights = {}
    for city in progress.percent(cities):
        city_point = points[city.city_id]
        region_population = 0
        for nearby_id in index.intersection(
                _box_around(city_point, max_distance)):
            nearby_distance = _cartesian_distance(points[nearby_id],
                                                  city_point)
            if nearby_distance <= max_distance:
                d = nearby_distance / sigma
                population = cities_by_id[nearby_id].population * math.exp(
                    -0.5 * d * d)
                region_population += population
        weights[city.city_id] = city.population / region_population

    logging.info(f'Selecting {args.cities_to_select} cities')
    cities.sort(key=lambda city: weights[city.city_id], reverse=True)
    selected_cities = cities[:args.cities_to_select]

    logging.info(f'Storing {len(selected_cities)} selected cities')
    session.bulk_save_objects(selected_cities)
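
The geometry helpers are not part of the example. The 3-dimensional rtree index and the `_great_circle_to_cartesian_distance` conversion suggest cities are embedded as 3D points on a sphere and compared by chord (straight-line) distance. A sketch of plausible implementations; the Earth radius value is an assumption:

import math

_EARTH_RADIUS_KM = 6371.0  # assumed; mean Earth radius

def _lat_lon_to_point(lat, lon):
    # Map latitude/longitude onto a 3D point on a sphere of Earth's radius.
    lat_rad = math.radians(lat)
    lon_rad = math.radians(lon)
    return (_EARTH_RADIUS_KM * math.cos(lat_rad) * math.cos(lon_rad),
            _EARTH_RADIUS_KM * math.cos(lat_rad) * math.sin(lon_rad),
            _EARTH_RADIUS_KM * math.sin(lat_rad))

def _cartesian_distance(p, q):
    # Straight-line (chord) distance between two 3D points.
    return math.dist(p, q)

def _great_circle_to_cartesian_distance(distance_km):
    # Convert a distance along the surface into the corresponding chord
    # length, so thresholds can be compared against _cartesian_distance.
    return 2.0 * _EARTH_RADIUS_KM * math.sin(distance_km / (2.0 * _EARTH_RADIUS_KM))

def _box_around(point, radius):
    # Axis-aligned 3D bounding box around a point, in the interleaved
    # (min..., max...) form that rtree's intersection() expects.
    return (point[0] - radius, point[1] - radius, point[2] - radius,
            point[0] + radius, point[1] + radius, point[2] + radius)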