def test_get_download_variables_missing_origin(
        local_config_file_missing_origin):
    """A config file without the origin entry must raise KeyError."""
    config_path = str(local_config_file_missing_origin)
    with pytest.raises(KeyError):
        utils.get_download_variables(
            'TileMovement', 'Britain', datetime.now(), config_path)
def test_get_download_variables_works(local_config_file):
    """A valid config yields a dict holding dataset id and date bounds."""
    end = datetime.now()

    result = utils.get_download_variables(
        'TileMovement', 'Britain', end, str(local_config_file))

    # Exact-type check kept deliberately (not isinstance): the helper is
    # expected to return a plain dict.
    assert type(result) is dict
    assert result['dataset_id'] == '1671212783027520'
    assert result['start_date'] == datetime(2020, 3, 10, 0)
    assert result['end_date'] == end
# Ejemplo n.º 3
# 0
def pull_fb(
        dataset_name,
        area,
        outdir: str = None,
        end_date: datetime = None,
        frequency: int = 8,
        driver_path: str = "/Applications/chromedriver",
        config_path:
    str = "https://raw.githubusercontent.com/hamishgibbs/pull_facebook_data_for_good/master/.config",
        username: str = None,
        password: str = None,
        driver_flags: list = None,
        driver_prefs: dict = None):
    """Download Facebook Data for Good files for a dataset and area.

    Resolves the dataset configuration, computes which dates still need
    downloading, authenticates a Selenium webdriver and downloads each file
    into ``outdir``.

    Parameters
    ----------
    dataset_name : name of the dataset (e.g. 'TileMovement').
    area : named area of the dataset (e.g. 'Britain').
    outdir : output directory; defaults to the current working directory.
    end_date : last date to download; defaults to "now" at call time.
    frequency : hours between dataset updates (default 8).
    driver_path : path to the chromedriver binary.
    config_path : URL or path of the dataset configuration file.
    username, password : Facebook credentials; prompted for if None.
    driver_flags : Chrome flags; defaults to ["--headless"].
    driver_prefs : Chrome preferences; defaults to downloading into the
        current working directory.
    """
    # Resolve defaults at call time. The previous defaults were evaluated
    # once at import time (datetime.now(), os.getcwd()) and used shared
    # mutable list/dict objects — classic Python default-argument pitfalls.
    if outdir is None:
        outdir = os.getcwd()
    if end_date is None:
        end_date = datetime.now()
    if driver_flags is None:
        driver_flags = ["--headless"]
    if driver_prefs is None:
        driver_prefs = {"download.default_directory": os.getcwd()}

    print("Reading dataset configuration...")
    # Get config variables from repository
    config = utils.get_download_variables(dataset_name, area, end_date,
                                          config_path)

    # Get date sequence between start and end dates
    data_dates = utils.get_file_dates(config["start_date"], config["end_date"],
                                      frequency)

    # Get downloaded dates from outdir
    existing_dates = utils.get_existing_dates(outdir, area)

    # Only download dates that have not already been downloaded
    download_dates = list(set(data_dates).difference(set(existing_dates)))

    # Sort so files are fetched in deterministic chronological order
    download_dates.sort()

    # Get url of each of dataset
    download_urls = url.format_urls(dataset_name, config["dataset_id"],
                                    download_dates)

    # Get credentials here
    keys = credentials.get_credentials(username, password)

    # Authenticate webdriver
    request_cookies_browser = driver.authenticate_driver(
        keys, driver_path, driver_flags, driver_prefs)

    # Download url sequence and move to output directory
    driver.download_data(download_urls, area, outdir, request_cookies_browser)

    # Success message
    print('Done.')
# Ejemplo n.º 4
# 0
def cli(dataset_name,
        area,
        outdir=None,
        end_date=None,
        frequency=None,
        driver_path=None,
        config_path=None):
    """
    Entry point for the pull_fb cli.

    Downloads all not-yet-downloaded dates of a dataset/area into outdir,
    then removes empty files produced by a web-portal bug.

    Add args to manually pass start date, end date, id, and frequency

    """

    print("Reading dataset configuration...")
    # Get config variables from repository
    config = utils.get_download_variables(dataset_name, area, end_date,
                                          config_path)

    # Get date sequence between start and end dates
    data_dates = utils.get_file_dates(config["start_date"], config["end_date"],
                                      frequency)

    # Get downloaded dates from outdir
    existing_dates = utils.get_existing_dates(outdir, area)

    # Only download dates that have not already been downloaded
    download_dates = list(set(data_dates).difference(set(existing_dates)))

    # Sort for a deterministic chronological download order
    # (consistent with pull_fb; set difference ordering is arbitrary)
    download_dates.sort()

    # Get url of each of dataset
    download_urls = url.format_urls(dataset_name, config["dataset_id"],
                                    download_dates)

    # Get credentials here
    keys = credentials.get_credentials()

    # Download url sequence and move to output directory
    driver.download_data(download_urls, area, driver_path, keys, outdir)

    # Remove files with no rows (bug with web portal)
    clean_up.remove_empty_files(outdir)

    # Success message
    print('Success.')