Beispiel #1
0
def pull_fb(
        dataset_name,
        area,
        outdir: str = None,
        end_date: datetime = None,
        frequency: int = 8,
        driver_path: str = "/Applications/chromedriver",
        config_path: str = "https://raw.githubusercontent.com/hamishgibbs/pull_facebook_data_for_good/master/.config",
        username: str = None,
        password: str = None,
        driver_flags: list = None,
        driver_prefs: dict = None):
    """Download the files of a dataset that are not yet present in outdir.

    The function reads the dataset configuration, builds the sequence of
    file dates, drops the dates already found in ``outdir``, and downloads
    the remaining files through an authenticated webdriver session.

    Args:
        dataset_name: Dataset name, forwarded to the config lookup and to
            the URL formatter.
        area: Area identifier, forwarded to the config lookup, the
            existing-file scan and the downloader.
        outdir: Output directory. Defaults to the current working
            directory at call time.
        end_date: Last date to consider. Defaults to ``datetime.now()``
            at call time.
        frequency: Spacing of the file dates, forwarded to
            ``utils.get_file_dates`` (presumably hours - TODO confirm).
        driver_path: Path to the chromedriver executable.
        config_path: Location of the dataset configuration file.
        username: Optional username forwarded to
            ``credentials.get_credentials``.
        password: Optional password forwarded to
            ``credentials.get_credentials``.
        driver_flags: Webdriver flags. Defaults to ``["--headless"]``.
        driver_prefs: Webdriver preferences. Defaults to
            ``{"download.default_directory": os.getcwd()}`` computed at
            call time.
    """
    # Defaults are resolved per call: expressions in a ``def`` signature
    # are evaluated only once, which would freeze the date and working
    # directory at import time and share one list/dict across all calls.
    if outdir is None:
        outdir = os.getcwd()
    if end_date is None:
        end_date = datetime.now()
    if driver_flags is None:
        driver_flags = ["--headless"]
    if driver_prefs is None:
        driver_prefs = {"download.default_directory": os.getcwd()}

    print("Reading dataset configuration...")
    # Get config variables from repository
    config = utils.get_download_variables(dataset_name, area, end_date,
                                          config_path)

    # Get date sequence between start and end dates
    data_dates = utils.get_file_dates(config["start_date"], config["end_date"],
                                      frequency)

    # Get downloaded dates from outdir
    existing_dates = utils.get_existing_dates(outdir, area)

    # Only download dates that have not already been downloaded, in
    # ascending order
    download_dates = sorted(set(data_dates).difference(existing_dates))

    # Get url of each of dataset
    download_urls = url.format_urls(dataset_name, config["dataset_id"],
                                    download_dates)

    # Get credentials here
    keys = credentials.get_credentials(username, password)

    # Authenticate webdriver
    request_cookies_browser = driver.authenticate_driver(
        keys, driver_path, driver_flags, driver_prefs)

    # Download url sequence and move to output directory
    driver.download_data(download_urls, area, outdir, request_cookies_browser)

    # Success message
    print('Done.')
Beispiel #2
0
def cli(dataset_name,
        area,
        outdir=None,
        end_date=None,
        frequency=None,
        driver_path=None,
        config_path=None):
    """
    Entry point for the pull_fb cli.

    Reads the dataset configuration, works out which file dates are not
    yet present in ``outdir``, downloads them and removes empty files
    afterwards.

    TODO: Add args to manually pass start date, end date, id, and frequency

    Args:
        dataset_name: Dataset name, forwarded to the config lookup and to
            the URL formatter.
        area: Area identifier, forwarded to the config lookup, the
            existing-file scan and the downloader.
        outdir: Output directory scanned for existing files and cleaned
            up after the download.
        end_date: Last date to consider, forwarded to the config lookup.
        frequency: Spacing of the file dates, forwarded to
            ``utils.get_file_dates``.
        driver_path: Webdriver path, forwarded to the downloader.
        config_path: Location of the dataset configuration file.
    """

    print("Reading dataset configuration...")
    # Get config variables from repository
    config = utils.get_download_variables(dataset_name, area, end_date,
                                          config_path)

    # Get date sequence between start and end dates
    data_dates = utils.get_file_dates(config["start_date"], config["end_date"],
                                      frequency)

    # Get downloaded dates from outdir
    existing_dates = utils.get_existing_dates(outdir, area)

    # Only download dates that have not already been downloaded. The set
    # difference has no defined order, so sort to download in ascending
    # date order on every run.
    download_dates = sorted(set(data_dates).difference(existing_dates))

    # Get url of each of dataset
    download_urls = url.format_urls(dataset_name, config["dataset_id"],
                                    download_dates)

    # Get credentials here
    keys = credentials.get_credentials()

    # Download url sequence and move to output directory
    driver.download_data(download_urls, area, driver_path, keys, outdir)

    # Remove files with no rows (bug with web portal)
    clean_up.remove_empty_files(outdir)

    # Success message
    print('Success.')
Beispiel #3
0
def test_get_file_dates_type(example_date_config):
    """The first value returned by get_file_dates is exactly a datetime."""
    start = example_date_config["start_date"]
    end = example_date_config["end_date"]

    file_dates = utils.get_file_dates(start, end, 12)
    first_date = file_dates[0]

    # Strict type identity on purpose: subclasses of datetime do not pass.
    assert type(first_date) is datetime
Beispiel #4
0
def test_get_file_dates_12h(example_date_config):
    """A frequency of 12 over the example config range yields two dates."""
    start = example_date_config["start_date"]
    end = example_date_config["end_date"]

    file_dates = utils.get_file_dates(start, end, 12)
    n_dates = len(file_dates)

    assert n_dates == 2