Example #1
def main():
    """
    Start the scheduler script, which behaves like a finite state machine (FSM) responsible for running scheduled jobs. Primarily invoked by Heroku Scheduler.
    """
    logger.info('Scheduler script started.')

    # Get data from the config file into a flat dictionary
    cfg: ConfigParser = ConfigParser()
    cfg.read('config/config.cfg')
    cfg_dict: dict = cfg._sections['base']
    cfg_dict['collection_name'] = 'metadata'

    handler_class: DBHandlerInterface = next(
        (handler for handler in DBHandlerInterface.__subclasses__()
         if handler.name == cfg_dict['used_db']), None)
    if handler_class is None:
        raise Exception(f'DBHandler called "{cfg_dict["used_db"]}" not found.')

    heroku_api_key: str = os.getenv('HEROKU_API_KEY')
    if heroku_api_key is None:
        raise Exception(
            'Environment variable HEROKU_API_KEY not specified.')

    with handler_class(**cfg_dict) as handler:
        try:
            handle_update(handler,
                          config=cfg_dict,
                          heroku_api_key=heroku_api_key)
        except Exception as ex:
            logger.error('Unknown error occurred')
            logger.error(traceback.format_exc())

    logger.info('Scheduler script ended.')
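
For reference, the mains in this listing read a `config/config.cfg` file with ConfigParser and flatten its `[base]` section into a plain dict. The concrete file is not shown, so the following is only a sketch of what the flattened dict might contain, built solely from the keys referenced in these functions (all values are placeholders):

# Hypothetical contents of cfg._sections['base'] after cfg.read('config/config.cfg');
# keys are inferred from this listing, values are illustrative only.
cfg_dict = {
    'used_db': 'mongodb',                     # selects the DBHandlerInterface subclass by its `name`
    'min_delay': '10',                        # ConfigParser stores raw strings; cast to float where needed
    'default_crontab_schedule': '0 3 * * *',  # fallback schedule read by handle_update()
}
# main() then adds e.g. cfg_dict['collection_name'] = 'metadata' before instantiating the handler.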
def main():
    """
    Run the Zoo Prague lexicon web scraper.

    Reads the `config.cfg` config file and runs the desired Zoo Prague lexicon scraper.
    """

    # Get data from the config file into a flat dictionary
    cfg: ConfigParser = ConfigParser()
    cfg.read('config/config.cfg')
    cfg_dict: dict = cfg._sections['base']
    cfg_dict["min_delay"] = float(os.getenv('MIN_SCRAPING_DELAY', cfg_dict["min_delay"]))
    cfg_dict['collection_name'] = 'animals_data'

    if cfg_dict.get('used_db') is None:
        raise Exception('No DBHandler specified in config file.')

    # Get the required db_handler instance
    handler: DBHandlerInterface = next((handler for handler in DBHandlerInterface.__subclasses__() if handler.name == cfg_dict['used_db']), None)
    if handler is None:
        raise Exception(f'DBHandler called "{cfg_dict["used_db"]}" not found.')

    with requests.Session() as session, handler(**cfg_dict) as handler_instance:
        try:
            run_web_scraper(session, db_handler=handler_instance, **cfg_dict)
        except Exception as ex:
            logger.error('Unknown error occurred')
            logger.error(traceback.format_exc())
        finally:
            # Work is done either successfully or unsuccessfully. Update scheduler_state
            logger.info('Setting scheduler_state to WORK_DONE.')
            handler_instance.update_one({"_id": 0}, {"$set": {"scheduler_state": SchedulerStates.WORK_DONE}}, collection_name='metadata')
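
Each main() above locates a concrete handler by scanning DBHandlerInterface.__subclasses__() for a class whose `name` matches cfg_dict['used_db'], then uses it as a context manager. The real interface is not part of this listing, so the sketch below is an assumption inferred only from how the handlers are used here; the class and parameter names are hypothetical and the method signatures are approximations.

class DBHandlerInterface:
    # Stand-in for the project's real base class, so this sketch is self-contained.
    name: str = ''

class MongoDBHandler(DBHandlerInterface):
    name = 'mongodb'                            # compared against cfg_dict['used_db']

    def __init__(self, collection_name: str, **kwargs):
        self.collection_name = collection_name  # default collection when none is passed explicitly

    def __enter__(self):                        # `with handler(**cfg_dict) as handler_instance:`
        # open the database client/connection here
        return self

    def __exit__(self, exc_type, exc_value, tb):
        # close the database client/connection here
        return False

    # Methods exercised elsewhere in this listing (signatures approximated, not authoritative):
    # find(filter_, collection_name=None), insert_one(doc, collection_name=None),
    # insert_many(docs, collection_name=None), update_one(filter_, update, upsert=False, collection_name=None),
    # drop_collection(collection_name=None), rename_collection(collection_new_name, collection_name=None),
    # collection_exists(collection_name=None)

# Lookup pattern used by the mains above:
handler_class = next(
    (h for h in DBHandlerInterface.__subclasses__() if h.name == 'mongodb'), None)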
def get_settings() -> SimpleNamespace:
    """
    Initialize global settings for FastAPI endpoints.

    Raises:
        RuntimeError: Raised when some environment or config variables aren't set.

    Returns:
        SimpleNamespace: An object containing all required global settings for FastAPI endpoints.
    """
    # Check S3 environment vars
    s3_env_vars = ['AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_DEFAULT_REGION', 'AWS_STORAGE_BUCKET_NAME']
    s3_missing_vars = list(filter(lambda env_var: env_var not in os.environ, s3_env_vars))
    if s3_missing_vars:
        raise RuntimeError(f'Environment variables "{s3_missing_vars}" not set.')

    # Get data from the config file into a flat dictionary
    cfg: ConfigParser = ConfigParser()
    cfg.read('config/config.cfg')
    cfg_dict: dict = cfg._sections['base']
    cfg_dict['collection_name'] = 'animals_data'

    if cfg_dict.get('used_db') is None:
        raise RuntimeError('No DBHandler specified in config file.')

    # Get the required db_handler instance
    handler: DBHandlerInterface = next((handler for handler in DBHandlerInterface.__subclasses__() if handler.name == cfg_dict['used_db']), None)
    if handler is None:
        raise RuntimeError(f'DBHandler called "{cfg_dict["used_db"]}" not found.')

    res = {
        'aws_storage_bucket_name': os.getenv('AWS_STORAGE_BUCKET_NAME'),
        'map_file_prefix': cfg['mbtiles_downloader']['output'],
        'handler_class': handler,
        'config_data': cfg_dict
    }
    
    return SimpleNamespace(**res)
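
get_settings() is evidently meant to provide shared state to FastAPI endpoints; how it is wired in is not shown in this listing, so the snippet below is only a plausible usage sketch. The endpoint path, the lru_cache wrapper, and the returned payload are assumptions.

from functools import lru_cache
from types import SimpleNamespace

from fastapi import Depends, FastAPI

app = FastAPI()

# Cache the settings object so the config file is read once, not on every request (assumed intent).
cached_settings = lru_cache()(get_settings)

@app.get('/health')                     # hypothetical endpoint
def health(settings: SimpleNamespace = Depends(cached_settings)):
    # settings.handler_class, settings.config_data and settings.map_file_prefix are available here
    return {'bucket': settings.aws_storage_bucket_name}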
Example #4
def main():
    """
    Run RSS Parser of Zoo Prague news.
    """

    # Get data from the config file into a flat dictionary
    cfg: ConfigParser = ConfigParser()
    cfg.read('config/config.cfg')
    cfg_dict: dict = cfg._sections['base']
    cfg_dict['collection_name'] = 'news'

    handler: DBHandlerInterface = next(
        (handler for handler in DBHandlerInterface.__subclasses__()
         if handler.name == cfg_dict['used_db']), None)
    if handler is None:
        raise Exception(f'DBHandler called "{cfg_dict["used_db"]}" not found.')

    with handler(**cfg_dict) as handler_instance:
        try:
            update_news(handler_instance)
        except Exception as ex:
            logger.error('Unknown error occurred')
            logger.error(traceback.format_exc())
def run_web_scraper(session: requests.Session, db_handler: DBHandlerInterface, collection_name: str, min_delay: float = 10, **kwargs):
    """
    Run a Zoo Prague lexicon web scraper to fill the provided DB with data about animals.

    Args:
        session (requests.Session): HTTP session for running requests.
        db_handler (DBHandlerInterface): A DBHandlerInterface instance of the chosen database used to store data from the Zoo Prague lexicon.
        collection_name (str): Name of the collection in which the scraped animal data is stored.
        min_delay (float): Minimum time in seconds to wait between downloads of pages to scrape.
    """
    animal_pens: list[dict] = db_handler.find(filter_={}, collection_name='animal_pens')
    buildings: list[dict] = db_handler.find(filter_={}, collection_name='zoo_parts')
    tmp_coll_name: str = f'tmp_{collection_name}'
    db_handler.update_one({'_id': 0}, {'$set': {'last_update_start': datetime.now()}}, upsert=True, collection_name='metadata')
    db_handler.drop_collection(collection_name=tmp_coll_name)

    for i, url in enumerate(get_animal_urls(session)):
        page = session.get(url.geturl())
        start_time: float = time.time()
        soup: BeautifulSoup = BeautifulSoup(page.content, 'html.parser')

        logger.info(f'{i}. {url.geturl()}')
        try:
            animal_data = parse_animal_data(soup, url, animal_pens, buildings)
            db_handler.insert_one(animal_data.__dict__, collection_name=tmp_coll_name)
        except Exception:
            logger.error(f'Error occurred when parsing: {url.geturl()}')
            logger.error(traceback.format_exc())
            continue

        elapsed_time: float = time.time() - start_time
        time_to_sleep: float = min_delay - elapsed_time
        logger.info(f'\t\tElapsed time: {elapsed_time} s')
        if time_to_sleep > 0:
            time.sleep(time_to_sleep)
    
    db_handler.drop_collection(collection_name=collection_name)
    db_handler.rename_collection(collection_new_name=collection_name, collection_name=tmp_coll_name)
    db_handler.update_one({'_id': 0}, {'$set': {'last_update_end': datetime.now()}}, upsert=True, collection_name='metadata')
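
The loop above rate-limits scraping by measuring how long parsing and storing a page took and sleeping for the remainder of min_delay. The same idea, pulled out into a standalone helper for clarity (the helper name is hypothetical and not part of the project):

import time

def throttle(iterable, min_delay: float):
    """Yield items while keeping at least `min_delay` seconds between consecutive iterations."""
    for item in iterable:
        start_time = time.time()
        yield item                                      # caller does its per-item work here
        time_to_sleep = min_delay - (time.time() - start_time)
        if time_to_sleep > 0:
            time.sleep(time_to_sleep)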
def main():
    """
    Run MBTiles & GeoJSON map data downloader/parser.

    Downloads vector map data in MBTiles file and GeoJSON files. Stores the MBTiles file in AWS S3 DB and other data in the main DB.
    """

    # Check S3 environment vars
    s3_env_vars = [
        'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_DEFAULT_REGION',
        'AWS_STORAGE_BUCKET_NAME'
    ]
    s3_missing_vars = list(
        filter(lambda env_var: env_var not in os.environ, s3_env_vars))
    if s3_missing_vars:
        raise RuntimeError(
            f'Environment variables "{s3_missing_vars}" not set.')

    # Get data from the config file into a flat dictionary
    cfg: ConfigParser = ConfigParser()
    cfg.read('config/config.cfg')
    cfg_dict: dict = cfg._sections['mbtiles_downloader'] | cfg._sections['base']
    cfg_dict["min_delay"] = float(
        os.getenv('MIN_SCRAPING_DELAY', cfg_dict["min_delay"]))
    cfg_dict['collection_name'] = 'animal_pens'

    mapzen_url_prefix = os.getenv('MAPZEN_URL_PREFIX',
                                  'https://tile.nextzen.org/tilezen')
    mapzen_api_key = os.getenv('MAPZEN_API_KEY', None)
    if mapzen_api_key is None:
        raise RuntimeError('Environment variable MAPZEN_API_KEY is not set.')

    handler: DBHandlerInterface = next(
        (handler for handler in DBHandlerInterface.__subclasses__()
         if handler.name == cfg_dict['used_db']), None)
    if handler is None:
        raise Exception(f'DBHandler called "{cfg_dict["used_db"]}" not found.')

    args = {
        'min_lon': float(cfg_dict['min_lon']),
        'min_lat': float(cfg_dict['min_lat']),
        'max_lon': float(cfg_dict['max_lon']),
        'max_lat': float(cfg_dict['max_lat']),
        'min_zoom': int(cfg_dict['min_zoom']),
        'max_zoom': int(cfg_dict['max_zoom']),
        'output': cfg_dict['output'],
        'tile_size': 512,
        'tile_format': 'json',
        'output_formats': ['mbtiles', 'zipfile'],
        'layer': 'all',
        'type': 'vector',
        'tile_compression': False,
        'concurrency': 1,
        'api_key': mapzen_api_key,
        'url_prefix': mapzen_url_prefix
    }

    with requests.Session() as session, handler(
            **cfg_dict) as handler_instance:
        try:
            folder_path: Path = download_map_data(
                args, os.getenv('AWS_STORAGE_BUCKET_NAME'))
            pens = parse_map_data(folder_path, handler_instance)

            update_animal_tables(session, handler_instance, pens,
                                 cfg_dict["min_delay"])
        except ClientError as ex:
            logger.error('Error occurred when uploading files to AWS S3.')
            logger.error(traceback.format_exc())
        except Exception as ex:
            logger.error('Unknown error occurred')
            logger.error(traceback.format_exc())
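
Note that this main() merges the two config sections with a dict union, which is right-biased: on duplicate keys the value from cfg._sections['base'] wins over cfg._sections['mbtiles_downloader']. A minimal illustration (keys and values here are purely illustrative):

mbtiles_section = {'output': 'zoo_map', 'min_delay': '5'}
base_section = {'used_db': 'mongodb', 'min_delay': '10'}

merged = mbtiles_section | base_section
assert merged['min_delay'] == '10'   # the right-hand operand ([base]) takes precedence on collisions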
def update_animal_tables(session: requests.Session,
                         db_handler: DBHandlerInterface,
                         pens: list[dict], min_delay: float):
    """
    Use new map data to update the DB tables that:

    - hold the transformations between singular and plural forms of animal names (used to connect Zoo Prague lexicon data with the map data).
    - hold data about animal pens, connecting those pens to Zoo Prague lexicon entries via singular forms where possible.

    Args:
        session (requests.Session): HTTP session
        db_handler (DBHandlerInterface): A DBHandlerInterface instance of the chosen database used to store data from the Zoo Prague lexicon.
        pens (list[dict]): Map data of the located animal pens in Zoo Prague.
        min_delay (float): Delay between HTTP requests.
    """
    collection_name: str = 'singular_plural'
    if (not db_handler.collection_exists(collection_name=collection_name)):
        # Init singular_plural collection
        data = __get_csv_data__()
        db_handler.insert_many(data, collection_name=collection_name)

    singular_plural_data = db_handler.find({}, collection_name=collection_name)
    singular_plural_data = {
        d["_id"]: d["singulars"]
        for d in singular_plural_data
    }

    for pen in pens:
        # Some pens can have multiple animals
        pen_animal_names: list = list()
        names = pen['name'].strip().split(',')
        for name in names:
            # Some names can have noun and pronoun
            words = name.strip().split(' ')
            if (len(words) == 1):
                # Has only noun
                singulars = get_singular(words[0],
                                         session,
                                         singular_plural_data,
                                         collection_name=collection_name,
                                         min_delay=min_delay)
                if (singulars is not None):
                    db_handler.update_one({"_id": words[0]},
                                          {"$set": {
                                              "singulars": singulars
                                          }},
                                          upsert=True,
                                          collection_name=collection_name)
                    pen_animal_names.append(singulars[0])
            elif (len(words) == 2):
                # Has noun and pronoun
                singular_noun = get_singular(words[0],
                                             session,
                                             singular_plural_data,
                                             collection_name=collection_name,
                                             min_delay=min_delay)
                singular_pronouns = get_singular(
                    words[1],
                    session,
                    singular_plural_data,
                    collection_name=collection_name,
                    min_delay=min_delay)
                if (singular_noun is not None
                        and singular_pronouns is not None):
                    db_handler.update_one(
                        {"_id": words[0]},
                        {"$set": {
                            "singulars": singular_noun
                        }},
                        upsert=True,
                        collection_name=collection_name)
                    db_handler.update_one(
                        {"_id": words[1]},
                        {"$set": {
                            "singulars": singular_pronouns
                        }},
                        upsert=True,
                        collection_name=collection_name)
                    for pair in itertools.product(singular_noun,
                                                  singular_pronouns):
                        pen_animal_names.append(' '.join(pair))

        pen["singular_names"] = pen_animal_names

    db_handler.drop_collection()
    db_handler.insert_many(pens)
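
The singular_plural collection consulted above appears to map a plural word (stored as _id) to its known singular forms, which is also how the in-memory lookup dict is built. A purely illustrative document shape (the Czech words are example data, not taken from the project):

example_singular_plural_doc = {
    '_id': 'lvi',            # plural form as found in the pen name
    'singulars': ['lev'],    # singular forms used to match Zoo Prague lexicon entries
}
# In update_animal_tables() these documents become the lookup dict: {'lvi': ['lev'], ...}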
Example #8
def handle_update(handler: DBHandlerInterface, heroku_api_key: str,
                  config: dict, **kwargs):
    """
    The FSM that handles scripts which need a Heroku worker dyno to run.

    Handles scheduling updates and switching the worker dyno on/off.

    Args:
        handler (DBHandlerInterface): A DBHandlerInterface instance of the chosen database used to store data from the Zoo Prague lexicon.
        heroku_api_key (str): An API key for Heroku.
        config (dict): Data from the configuration file.

    Raises:
        RuntimeError: Raised if a scheduler_state received from DB is unknown.
    """
    metadata: dict = next(iter(handler.find({"_id": 0})), None)
    logger.info(f'Received metadata: {metadata}')

    next_update: datetime = metadata.get('next_update', datetime.now())
    raw_state = metadata.get('scheduler_state')
    if raw_state is None:
        # No state stored yet: default to WAIT and persist it.
        scheduler_state: SchedulerStates = SchedulerStates.WAIT
        handler.update_one({"_id": 0},
                           data={'$set': {
                               'scheduler_state': scheduler_state
                           }})
    else:
        scheduler_state = SchedulerStates(raw_state)

    if (scheduler_state == SchedulerStates.WAIT):
        if (next_update <= datetime.now()):
            # It is time to update the database
            logger.info('WAIT -> UPDATING')
            __schedule_long_job__()
            # Start worker dyno and update scheduler_state in DB
            __change_worker_dyno_state__(DynoStates.UP, heroku_api_key)
            handler.update_one(
                {"_id": 0},
                {"$set": {
                    "scheduler_state": SchedulerStates.UPDATING
                }})
        else:
            logger.info('WAIT')

    elif (scheduler_state == SchedulerStates.UPDATING):
        logger.info('UPDATING')
        diff = datetime.now() - metadata['last_update_start']
        if ((diff.days * 24 + diff.seconds / 3600) > 8.0):
            # The script probably got stuck, reset needed
            logger.info('UPDATING -> WAIT')
            __change_worker_dyno_state__(DynoStates.DOWN, heroku_api_key)
            handler.update_one(
                {"_id": 0},
                {"$set": {
                    "scheduler_state": SchedulerStates.WAIT
                }})

    elif (scheduler_state == SchedulerStates.WORK_DONE):
        # This state should be set only by zoo_scraper
        logger.info('WORK DONE -> WAIT')
        __change_worker_dyno_state__(DynoStates.DOWN, heroku_api_key)

        # Schedule next update
        crontab_schedule: str = os.getenv('CRONTAB_SCHEDULE',
                                          config['default_crontab_schedule'])
        crontab: croniter = croniter(crontab_schedule, datetime.now())
        handler.update_one({"_id": 0}, {
            "$set": {
                "next_update": crontab.get_next(datetime),
                "scheduler_state": SchedulerStates.WAIT
            }
        })
    else:
        raise RuntimeError(f'Unknown scheduler state: {scheduler_state}')
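
handle_update() drives the FSM with SchedulerStates (WAIT, UPDATING, WORK_DONE) and toggles the worker dyno via DynoStates (UP, DOWN). Their definitions are not included in this listing, so the following is only a plausible sketch; the member values are guesses and the real enums may differ.

from enum import Enum

class SchedulerStates(Enum):
    # Hypothetical definition; member values are assumptions.
    WAIT = 'WAIT'
    UPDATING = 'UPDATING'
    WORK_DONE = 'WORK_DONE'

class DynoStates(Enum):
    # Hypothetical definition; member values are assumptions.
    UP = 'UP'
    DOWN = 'DOWN'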