def pull_fb(dataset_name,
            area,
            outdir: str = None,
            end_date: datetime = None,
            frequency: int = 8,
            driver_path: str = "/Applications/chromedriver",
            config_path: str = "https://raw.githubusercontent.com/hamishgibbs/pull_facebook_data_for_good/master/.config",
            username: str = None,
            password: str = None,
            driver_flags: list = None,
            driver_prefs: dict = None):
    """Download a Facebook Data for Good dataset for a given area.

    Reads the dataset configuration, determines which dated files are not
    yet present in ``outdir``, authenticates a Selenium webdriver, and
    downloads the missing files.

    Parameters
    ----------
    dataset_name : name of the dataset to download.
    area : area identifier used in configuration and filenames.
    outdir : output directory; defaults to the current working directory
        (resolved at call time).
    end_date : last date to download; defaults to ``datetime.now()``
        (resolved at call time).
    frequency : hours between successive files (default 8).
    driver_path : path to the chromedriver binary.
    config_path : URL of the dataset configuration file.
    username, password : portal credentials; prompted for if None
        (handled by ``credentials.get_credentials``).
    driver_flags : extra Chrome flags; defaults to ``["--headless"]``.
    driver_prefs : Chrome preferences; defaults to a download directory
        of the current working directory.
    """
    # Resolve defaults at call time. The original signature evaluated
    # os.getcwd()/datetime.now() once at import (stale values) and used
    # mutable list/dict defaults shared across calls — both are bugs.
    if outdir is None:
        outdir = os.getcwd()
    if end_date is None:
        end_date = datetime.now()
    if driver_flags is None:
        driver_flags = ["--headless"]
    if driver_prefs is None:
        driver_prefs = {"download.default_directory": os.getcwd()}

    print("Reading dataset configuration...")

    # Get config variables from repository
    config = utils.get_download_variables(dataset_name,
                                          area,
                                          end_date,
                                          config_path)

    # Get date sequence between start and end dates
    data_dates = utils.get_file_dates(config["start_date"],
                                      config["end_date"],
                                      frequency)

    # Get downloaded dates from outdir
    existing_dates = utils.get_existing_dates(outdir, area)

    # Only download dates that have not already been downloaded
    download_dates = list(set(data_dates).difference(set(existing_dates)))
    download_dates.sort()

    # Get url of each of dataset
    download_urls = url.format_urls(dataset_name,
                                    config["dataset_id"],
                                    download_dates)

    # Get credentials here
    keys = credentials.get_credentials(username, password)

    # Authenticate webdriver
    request_cookies_browser = driver.authenticate_driver(keys,
                                                         driver_path,
                                                         driver_flags,
                                                         driver_prefs)

    # Download url sequence and move to output directory
    driver.download_data(download_urls, area, outdir,
                         request_cookies_browser)

    # Success message
    print('Done.')
def cli(dataset_name, area, outdir=None, end_date=None, frequency=None,
        driver_path=None, config_path=None):
    """
    Entry point for the pull_fb cli.

    Add args to manually pass start date, end date, id, and frequency
    """
    print("Reading dataset configuration...")

    # Pull the dataset's configuration variables from the remote repo
    config = utils.get_download_variables(dataset_name, area, end_date,
                                          config_path)

    # Build the sequence of file timestamps between start and end dates
    wanted = utils.get_file_dates(config["start_date"],
                                  config["end_date"],
                                  frequency)

    # Timestamps already present in the output directory
    have = utils.get_existing_dates(outdir, area)

    # Restrict the download list to timestamps we do not yet have
    download_dates = list(set(wanted).difference(set(have)))

    # Construct one download URL per missing timestamp
    download_urls = url.format_urls(dataset_name, config["dataset_id"],
                                    download_dates)

    # Obtain portal credentials (prompting if necessary)
    keys = credentials.get_credentials()

    # Fetch each URL and move the result into the output directory
    driver.download_data(download_urls, area, driver_path, keys, outdir)

    # Drop zero-row files (bug with web portal)
    clean_up.remove_empty_files(outdir)

    # Success message
    print('Success.')
def test_get_file_dates_type(example_date_config):
    # Elements of the generated date sequence must be datetime instances
    dates = utils.get_file_dates(example_date_config["start_date"],
                                 example_date_config["end_date"],
                                 12)
    first = dates[0]
    assert type(first) is datetime
def test_get_file_dates_12h(example_date_config):
    # A 12-hour frequency over the example range yields exactly two dates
    start = example_date_config["start_date"]
    end = example_date_config["end_date"]
    assert len(utils.get_file_dates(start, end, 12)) == 2