def check_request_actual_status(index_in_db, request, request_db):
    req_id = int(request[REQUEST_ID_FIELD])
    res = rc.get_status(req_id)

    status = res['status'].lower()
    if status == 'error':
        logger.info("Request file_id: {} has failed".format(req_id))
        request_db.loc[index_in_db,
                       REQUEST_STATUS_FIELD] = RequestStatus.FAILED.value

    elif status == 'ok':
        request_status = res['result']['status']
        logger.info("Status of request {0} is {1}".format(
            req_id, request_status))
        if request_status == RequestStatus.ERROR.value:
            request_db.loc[index_in_db,
                           REQUEST_STATUS_FIELD] = RequestStatus.ERROR.value
            logger.error(res)
        elif request_status == RequestStatus.COMPLETED.value:
            request_db.loc[
                index_in_db,
                REQUEST_STATUS_FIELD] = RequestStatus.COMPLETED.value

    else:
        logger.error("Unhandled request status: {0} for request {1}".format(
            res['status'], req_id))
    request_db.to_csv(REQ_ID_PATH)
def download_request(req_id: str, target_dir):
    logger.info("Downloading files from request {0} into {1}".format(
        req_id, target_dir))
    try:
        download(req_id, target_dir)
    except Exception:
        logger.error("Downloading failed", exc_info=True)
        raise
def prepare_requests(**kwargs):
    if kwargs['input_file'] is not None:
        params_to_fetch = read_params_from_input_file(kwargs['input_file'])
        logger.info(f"Preparing requests for {len(params_to_fetch)} parameters...")
    else:
        params_to_fetch = [{
            PARAM_FIELD: kwargs['gfs_parameter'],
            "level": kwargs['gfs_level'],
            HOURS_TYPE_FIELD: kwargs[HOURS_TYPE_FIELD]
        }]

    if kwargs["bulk"]:
        prepare_bulk_region_request(params_to_fetch, **kwargs)
    else:
        prepare_points_request(params_to_fetch, **kwargs)
def process_csv_files():
    logger.info("Processing csv files...")
    for root, dirs, filenames in os.walk("download/csv"):
        for dir_with_csvs in dirs:
            latlon_search = re.search(r'(\d+(_\d)?)-(\d+(_\d)?)',
                                      dir_with_csvs)
            if latlon_search is None:
                logger.warning(
                    "Skipping {}: directory name does not contain coordinates".format(dir_with_csvs))
                continue
            latitude = latlon_search.group(1)
            longitude = latlon_search.group(3)
            prepare_final_csvs_from_csvs(dir_with_csvs, latitude, longitude,
                                         datetime.datetime(2015, 1, 15))
        # do not take subdirs
        break
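
# Illustration of the coordinate regex used above: csv directories are assumed to
# be named "<lat>-<lon>" with "_" as the decimal separator (the sample name below
# is made up); group 1 is the latitude, group 3 the longitude.
import re

match = re.search(r'(\d+(_\d)?)-(\d+(_\d)?)', "52_5-21_0")
assert match.group(1) == "52_5"   # latitude part of the directory name
assert match.group(3) == "21_0"   # longitude part of the directory name
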
def print_db_stats(db):
    sent = db[db[REQUEST_STATUS_FIELD] == RequestStatus.SENT.value]
    logger.info("{} requests are sent and awaiting completion".format(len(sent)))
    completed = db[db[REQUEST_STATUS_FIELD] == RequestStatus.COMPLETED.value]
    logger.info("{} requests are completed".format(len(completed)))
    downloaded = db[db[REQUEST_STATUS_FIELD] == RequestStatus.DOWNLOADED.value]
    logger.info("{} requests are downloaded, but not processed yet".format(
        len(downloaded)))
    finished = db[db[REQUEST_STATUS_FIELD] == RequestStatus.FINISHED.value]
    logger.info("{} requests are already processed".format(len(finished)))
    failed = db[db[REQUEST_STATUS_FIELD] == RequestStatus.FAILED.value]
    logger.info("{} requests have failed".format(len(failed)))
def process_netCDF_files_to_csv():
    logger.info("Processing netCDF files...")
    netCDF_dir = "download/netCDF"
    for root, param_dirs, filenames in os.walk(netCDF_dir):
        for param_dir in param_dirs:
            logger.info(f"Processing {param_dir} parameter...")
            for root, level_dirs, filenames in os.walk(
                    os.path.join(netCDF_dir, param_dir)):
                for level_dir in level_dirs:
                    logger.info(f"Processing {level_dir} level...")
                    files_in_directory = [
                        f for f in os.listdir(
                            os.path.join(netCDF_dir, param_dir, level_dir))
                        if isfile(join(netCDF_dir, param_dir, level_dir, f))
                    ]
                    for file in tqdm.tqdm(files_in_directory):
                        file_name_match = re.match(RAW_NETCDF_FILENAME_REGEX,
                                                   file)
                        if file_name_match is not None:
                            process_netCDF_file_to_csv(file, param_dir,
                                                       level_dir)
                # do not take subdirs
                break
        # do not take subdirs
        break

    logger.info("Processing done.")
def prepare_and_start_processor(**kwargs):
    if not kwargs['send_only']:
        prepare_requests(**kwargs)
    job = None
    try:
        logger.info("Scheduling sender job.")
        job = schedule.every(60).minutes.do(send_prepared_requests, kwargs)
    except Exception as e:
        logger.error(e, exc_info=True)

    if job is not None:
        job.run()
    while True:
        schedule.run_pending()
        time.sleep(60)
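
# Sketch of the scheduling pattern used above, based on the `schedule` library:
# every(...).do(...) returns a Job, job.run() executes the callable once
# immediately, and run_pending() fires it again whenever the interval has
# elapsed. Seconds are used here instead of minutes only to keep the demo short.
import time
import schedule

def _tick():
    print("sender would run here")

demo_job = schedule.every(5).seconds.do(_tick)
demo_job.run()                 # run once right away
for _ in range(3):
    schedule.run_pending()     # runs _tick again once 5 seconds have passed
    time.sleep(5)
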
def prepare_coordinates(coords_data):
    """
    Round GFS coordinates for provided data. Filter duplicates.
    :param coords_data: Pandas dataFrame
    :return:
    """
    coordinates = coords_data.apply(lambda x: [round(x[NLAT_FIELD], 1), round(x[SLAT_FIELD], 1), round(x[WLON_FIELD], 1), round(x[ELON_FIELD], 1)], axis=1)
    coords_data[[NLAT_FIELD, SLAT_FIELD, WLON_FIELD, ELON_FIELD]] = [x for x in coordinates]
    before_duplicates_filter = len(coords_data)
    coords_data = coords_data.drop_duplicates(subset=[NLAT_FIELD, SLAT_FIELD, WLON_FIELD, ELON_FIELD])
    num_dup = before_duplicates_filter - len(coords_data)

    logger.info("Removed {} duplicates rows".format(num_dup))

    return coords_data
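
# Minimal usage sketch for prepare_coordinates. The column names below are
# illustrative stand-ins for the project's NLAT_FIELD/SLAT_FIELD/WLON_FIELD/
# ELON_FIELD constants; both rows round to the same 0.1-degree box, so one of
# them is dropped.
import pandas as pd

NLAT_FIELD, SLAT_FIELD, WLON_FIELD, ELON_FIELD = "nlat", "slat", "wlon", "elon"

sample = pd.DataFrame([
    {"nlat": 56.04, "slat": 48.01, "wlon": 13.02, "elon": 26.08},
    {"nlat": 56.01, "slat": 48.04, "wlon": 13.04, "elon": 26.12},
])
deduplicated = prepare_coordinates(sample)  # one row remains after rounding
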
def find_coordinates(path, output_file_name="city_geo.csv"):
    location_list = pd.read_csv(path, encoding="ISO-8859-1",
                                names=['file_id', 'city_name', 'meteo_code'])

    city_list = location_list["city_name"].to_list()
    geolocator = Nominatim(user_agent='gfs_fetch_processor')
    geo_list = []

    logger.info("Downloading coordinates for provided cities")
    for city_name in tqdm(city_list):
        geo = geolocator.geocode(city_name)
        if geo:
            geo_list.append([city_name, geo.latitude, geo.longitude])

    path_to_write = os.path.join("../city_coordinates", output_file_name)
    data = pd.DataFrame(geo_list, columns=["city_name", "latitude", "longitude"])
    data.to_csv(path_to_write)
    return data
def save_request_to_pseudo_db(request_type: RequestType,
                              request_status: RequestStatus, **kwargs):
    if not os.path.isfile(REQ_ID_PATH):
        pseudo_db = pd.DataFrame(columns=[
            REQUEST_ID_FIELD, REQUEST_TYPE_FIELD, REQUEST_STATUS_FIELD,
            NLAT_FIELD, SLAT_FIELD, WLON_FIELD, ELON_FIELD,
            PARAM_FIELD, LEVEL_FIELD, HOURS_TYPE_FIELD
        ])
    else:
        pseudo_db = pd.read_csv(REQ_ID_PATH, index_col=[0])
    new_row = pd.DataFrame([{
        REQUEST_ID_FIELD: kwargs[REQUEST_ID_FIELD],
        REQUEST_TYPE_FIELD: request_type.value,
        REQUEST_STATUS_FIELD: request_status.value,
        NLAT_FIELD: kwargs[NLAT_FIELD],
        SLAT_FIELD: kwargs[SLAT_FIELD],
        WLON_FIELD: kwargs[WLON_FIELD],
        ELON_FIELD: kwargs[ELON_FIELD],
        PARAM_FIELD: kwargs[PARAM_FIELD],
        LEVEL_FIELD: kwargs[LEVEL_FIELD],
        HOURS_TYPE_FIELD: kwargs[HOURS_TYPE_FIELD]
    }])
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
    # to add a row.
    pseudo_db = pd.concat([pseudo_db, new_row], ignore_index=True)
    logger.info(f"Saving a new request of type {request_type} for coords (lat: {kwargs[NLAT_FIELD]}-{kwargs[SLAT_FIELD]}, "
                f"lon: {kwargs[WLON_FIELD]}-{kwargs[ELON_FIELD]}), param {kwargs[PARAM_FIELD]}, level {kwargs[LEVEL_FIELD]}, hours_type {kwargs[HOURS_TYPE_FIELD]}...")
    pseudo_db.to_csv(REQ_ID_PATH)
def send_prepared_requests(kwargs):

    start_date = datetime.strptime(kwargs["start_date"], '%Y-%m-%d %H:%M')
    end_date = datetime.strptime(kwargs["end_date"], '%Y-%m-%d %H:%M')
    request_db = pd.read_csv(REQ_ID_PATH, index_col=0)

    requests_to_send = request_db[request_db[REQUEST_STATUS_FIELD] == RequestStatus.PENDING.value]

    for index, request in requests_to_send.iterrows():
        nlat, slat, elon, wlon = request[[NLAT_FIELD, SLAT_FIELD, ELON_FIELD, WLON_FIELD]]
        request_type = request[REQUEST_TYPE_FIELD]
        param = request[PARAM_FIELD]
        level = request[LEVEL_FIELD]
        hours_type = request[HOURS_TYPE_FIELD]
        product = generate_product_description(kwargs['forecast_start'], kwargs['forecast_end'], hours_type=hours_type)

        template = build_template(nlat, slat, elon, wlon, start_date, end_date,
                                  param, product, level,
                                  'csv' if request_type == RequestType.POINT.value else 'netCDF')
        response = submit_json(template)
        if response['status'] == 'ok':
            request_id = response['result']['request_id']
            request_db.loc[index, REQUEST_STATUS_FIELD] = RequestStatus.SENT.value
            request_db.loc[index, REQUEST_ID_FIELD] = str(int(request_id))
        else:
            logger.info("Rda has returned error.")
            if response['status'] == 'error' and TOO_MANY_REQUESTS in response['messages']:
                logger.info("Too many requests. Request will be sent on next scheduler trigger.")
                break
            else:
                request_db.loc[index, REQUEST_STATUS_FIELD] = RequestStatus.FAILED.value
        logger.info(response)
    request_db.to_csv(REQ_ID_PATH)
    print("Sending requests done. Waiting for next scheduler trigger.")
def extract_files_from_tar(download_target_path,
                           extract_target_path,
                           file_type: str,
                           tidy=False):
    tars = [
        f for f in os.listdir(download_target_path)
        if os.path.isfile(os.path.join(download_target_path, f))
        and f.endswith("tar")
    ]
    logger.info("Unpacking {0} tars into {1} directory".format(
        len(tars), extract_target_path))
    for file in tars:
        tar_path = os.path.join(download_target_path, file)
        with tarfile.open(tar_path, "r:") as tar:
            tar.extractall(extract_target_path)
        if tidy:
            os.remove(tar_path)

    if file_type == "csv":
        new_file_pattern = re.compile(RAW_CSV_FILENAME_WITH_REQUEST_REGEX)
        for file in [
                f for f in os.listdir(extract_target_path)
                if new_file_pattern.match(f)
        ]:
            final_csv_name = re.sub(RAW_CSV_FILENAME_WITH_REQUEST_REGEX,
                                    r"\1\5", file)  # remove request number
            os.replace(os.path.join(extract_target_path, file),
                       os.path.join(extract_target_path, final_csv_name))
    else:
        new_file_pattern = re.compile(RAW_NETCDF_FILENAME_WITH_REQUEST_REGEX)
        for file in [
                f for f in os.listdir(extract_target_path)
                if new_file_pattern.match(f)
        ]:
            final_file_name = re.sub(RAW_NETCDF_FILENAME_WITH_REQUEST_REGEX,
                                     r"\1\5", file)  # remove request number
            os.replace(os.path.join(extract_target_path, file),
                       os.path.join(extract_target_path, final_file_name))
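
# The renames above keep groups 1 and 5 of the filename regex, cutting the
# request number out of each extracted filename. The pattern below is a
# hypothetical stand-in (the real one is RAW_CSV_FILENAME_WITH_REQUEST_REGEX);
# it only demonstrates the group-1-plus-group-5 mechanics.
import re

HYPOTHETICAL_PATTERN = r"(gfs_)(req)(\d+)(_)(.*\.csv)"
renamed = re.sub(HYPOTHETICAL_PATTERN, r"\1\5", "gfs_req123456_2015-01-15-00_TMP.csv")
assert renamed == "gfs_2015-01-15-00_TMP.csv"
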
def processor(purge_requests: bool, tidy: bool):
    logger.info("Starting rda download processor")
    request_db = read_pseudo_rda_request_db()
    print_db_stats(request_db)
    print("Checking actual status of pending requests...")

    not_completed = request_db[request_db[REQUEST_STATUS_FIELD] ==
                               RequestStatus.SENT.value]

    for index, request in not_completed.iterrows():
        check_request_actual_status(index, request, request_db)

    completed = request_db[request_db[REQUEST_STATUS_FIELD] ==
                           RequestStatus.COMPLETED.value]
    for index, request in completed.iterrows():
        download_completed_request(index, request, request_db)

    ready_for_unpacking = request_db[request_db[REQUEST_STATUS_FIELD] ==
                                     RequestStatus.DOWNLOADED.value]
    for index, request in ready_for_unpacking.iterrows():
        process_tars(index, request, request_db, tidy)

    request_db.to_csv(REQ_ID_PATH)

    if purge_requests:
        done_requests = request_db[request_db[REQUEST_STATUS_FIELD].isin([
            RequestStatus.ERROR.value, RequestStatus.FAILED.value,
            RequestStatus.DOWNLOADED.value, RequestStatus.FINISHED.value
        ])]
        for index, request in done_requests.iterrows():
            purge(str(int(request[REQUEST_ID_FIELD])))
            request_db.loc[index,
                           REQUEST_STATUS_FIELD] = RequestStatus.PURGED.value
        request_db.to_csv(REQ_ID_PATH)

    print("Done. Waiting for next scheduler trigger.")
def process_netCDF_files_to_npy(output_dir: str):
    for param in GFS_PARAMETERS:
        logger.info(f"Converting parameter {param['name']} {param['level']}")
        process_to_numpy_array(param, Coords(56, 48, 13, 26), output_dir)
def purge(req_id: str):
    logger.info("Purging request {}".format(req_id))
    rc.purge_request(req_id)