def update(): """Download and update file""" save_file = os.path.join(save_dir, 'pubmed_oa_*_*_*.parquet') file_list = list(filter(os.path.isdir, glob(save_file))) if file_list: d = re.search('[0-9]+_[0-9]+_[0-9]+', file_list[0]).group(0) date_file = datetime.strptime(d, '%Y_%m_%d') date_update = get_update_date(option='oa') # if update is newer is_update = date_update > date_file if is_update: print("MEDLINE update available!") subprocess.call(['rm', '-rf', os.path.join(save_dir, 'pubmed_oa_*_*_*.parquet')]) # remove subprocess.call(['rm', '-rf', download_dir, 'pubmed_oa']) subprocess.call(['wget', 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/non_comm_use.A-B.xml.tar.gz', '--directory', download_dir]) if not os.path.isdir(unzip_dir): os.mkdir(unzip_dir) subprocess.call(['tar', '-xzf', os.path.join(download_dir, 'non_comm_use.A-B.xml.tar.gz'), '--directory', unzip_dir]) else: print("No update available") else: print("Download Pubmed Open-Access for the first time") is_update = True date_update = get_update_date(option='oa') subprocess.call(['wget', 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/non_comm_use.A-B.xml.tar.gz', '--directory', download_dir]) if not os.path.isdir(unzip_dir): os.mkdir(unzip_dir) subprocess.call(['tar', '-xzf', os.path.join(download_dir, 'non_comm_use.A-B.xml.tar.gz'), '--directory', unzip_dir]) return is_update, date_update
def update(): """Download and update file""" save_file = os.path.join(save_dir, 'medline*_*_*_*.parquet') file_list = list(filter(os.path.isdir, glob(save_file))) if file_list: d = re.search('[0-9]+_[0-9]+_[0-9]+', file_list[0]).group(0) date_file = datetime.strptime(d, '%Y_%m_%d') date_update = get_update_date(option='medline') # if update is newer is_update = date_update > date_file if is_update: print("MEDLINE update available!") subprocess.call(['rm', '-rf', os.path.join(save_dir, 'medline_*.parquet')]) # remove subprocess.call(['rm', '-rf', download_dir]) # only example for 3 files, change to ftp://ftp.nlm.nih.gov/nlmdata/.medleasebaseline/gz/*.xml.gz to download all subprocess.call(['wget', 'ftp://ftp.nlm.nih.gov/nlmdata/.medleasebaseline/gz/medline16n0001.xml.gz', '--directory', download_dir]) subprocess.call(['wget', 'ftp://ftp.nlm.nih.gov/nlmdata/.medleasebaseline/gz/medline16n0166.xml.gz', '--directory', download_dir]) subprocess.call(['wget', 'ftp://ftp.nlm.nih.gov/nlmdata/.medleasebaseline/gz/medline16n0718.xml.gz', '--directory', download_dir]) else: print("No update available") else: print("Download MEDLINE for the first time") is_update = True date_update = get_update_date(option='medline') subprocess.call(['wget', 'ftp://ftp.nlm.nih.gov/nlmdata/.medleasebaseline/gz/medline16n0001.xml.gz', '--directory', download_dir]) subprocess.call(['wget', 'ftp://ftp.nlm.nih.gov/nlmdata/.medleasebaseline/gz/medline16n0166.xml.gz', '--directory', download_dir]) subprocess.call(['wget', 'ftp://ftp.nlm.nih.gov/nlmdata/.medleasebaseline/gz/medline16n0718.xml.gz', '--directory', download_dir]) return is_update, date_update
def update(): """Download and update file""" save_file = os.path.join(save_dir, 'medline*_*_*_*.csv') file_list = list(filter(os.path.isdir, glob(save_file))) if file_list: d = re.search('[0-9]+_[0-9]+_[0-9]+', file_list[0]).group(0) date_file = datetime.strptime(d, '%Y_%m_%d') date_update = get_update_date(option='medline') # if update is newer is_update = date_update > date_file if is_update: print("MEDLINE update available!") subprocess.call( ['rm', '-rf', os.path.join(save_dir, 'medline_*.csv')]) # remove subprocess.call(['rm', '-rf', download_dir]) subprocess.call([ 'wget', 'ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/*.xml.gz', '--directory', download_dir ]) else: print("No update available") else: print("Download MEDLINE for the first time") is_update = True date_update = get_update_date(option='medline') subprocess.call([ 'wget', 'ftp://ftp.ncbi.nlm.nih.gov/pubmed/baseline/*.xml.gz', '--directory', download_dir ]) return is_update, date_update
def update(): """Download and update file""" save_file = os.path.join(save_dir, 'medline*_*_*_*.parquet') file_list = list(filter(os.path.isdir, glob(save_file))) if file_list: d = re.search('[0-9]+_[0-9]+_[0-9]+', file_list[0]).group(0) date_file = datetime.strptime(d, '%Y_%m_%d') date_update = get_update_date(option='medline') # if update is newer is_update = date_update > date_file if is_update: print("MEDLINE update available!") subprocess.call( ['rm', '-rf', os.path.join(save_dir, 'medline_*.parquet')]) # remove subprocess.call(['rm', '-rf', download_dir]) # only example for 3 files, change to ftp://ftp.nlm.nih.gov/nlmdata/.medleasebaseline/gz/*.xml.gz to download all subprocess.call([ 'wget', 'ftp://ftp.nlm.nih.gov/nlmdata/.medleasebaseline/gz/medline16n0001.xml.gz', '--directory', download_dir ]) subprocess.call([ 'wget', 'ftp://ftp.nlm.nih.gov/nlmdata/.medleasebaseline/gz/medline16n0166.xml.gz', '--directory', download_dir ]) subprocess.call([ 'wget', 'ftp://ftp.nlm.nih.gov/nlmdata/.medleasebaseline/gz/medline16n0718.xml.gz', '--directory', download_dir ]) else: print("No update available") else: print("Download MEDLINE for the first time") is_update = True date_update = get_update_date(option='medline') subprocess.call([ 'wget', 'ftp://ftp.nlm.nih.gov/nlmdata/.medleasebaseline/gz/medline16n0001.xml.gz', '--directory', download_dir ]) subprocess.call([ 'wget', 'ftp://ftp.nlm.nih.gov/nlmdata/.medleasebaseline/gz/medline16n0166.xml.gz', '--directory', download_dir ]) subprocess.call([ 'wget', 'ftp://ftp.nlm.nih.gov/nlmdata/.medleasebaseline/gz/medline16n0718.xml.gz', '--directory', download_dir ]) return is_update, date_update
def update(): """Download and update file""" save_file = os.path.join(save_dir, 'pubmed_oa_*_*_*.parquet') file_list = list(filter(os.path.isdir, glob(save_file))) if file_list: d = re.search('[0-9]+_[0-9]+_[0-9]+', file_list[0]).group(0) date_file = datetime.strptime(d, '%Y_%m_%d') date_update = get_update_date(option='oa') # if update is newer is_update = date_update > date_file if is_update: print("MEDLINE update available!") subprocess.call([ 'rm', '-rf', os.path.join(save_dir, 'pubmed_oa_*_*_*.parquet') ]) # remove subprocess.call(['rm', '-rf', download_dir, 'pubmed_oa']) subprocess.call([ 'wget', 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/non_comm_use.A-B.xml.tar.gz', '--directory', download_dir ]) if not os.path.isdir(unzip_dir): os.mkdir(unzip_dir) subprocess.call([ 'tar', '-xzf', os.path.join(download_dir, 'non_comm_use.A-B.xml.tar.gz'), '--directory', unzip_dir ]) else: print("No update available") else: print("Download Pubmed Open-Access for the first time") is_update = True date_update = get_update_date(option='oa') subprocess.call([ 'wget', 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/non_comm_use.A-B.xml.tar.gz', '--directory', download_dir ]) if not os.path.isdir(unzip_dir): os.mkdir(unzip_dir) subprocess.call([ 'tar', '-xzf', os.path.join(download_dir, 'non_comm_use.A-B.xml.tar.gz'), '--directory', unzip_dir ]) return is_update, date_update
import sys
from datetime import datetime
from getpass import getpass
from itertools import compress

import pandas as pd


# get_file_dates, get_urls, download_data, move_most_recent_files, and
# get_update_date are module-level helpers defined elsewhere
def main(_args):
    '''
    Download colocation data.

    Parameters
    ----------
    _args : list
        Argument list: _args[1] is a csv file specifying the download
        countries, dataset ids, and origin dates; the last element is the
        output directory.

    Returns
    -------
    None.
    '''
    # prompt for portal credentials and whether to update existing datasets
    username = input("Username: ")
    password = getpass("Password: ")
    keys = [username, password]
    update = input("Update datasets? (y/n): ")
    if update == 'y':
        update = True
    elif update == 'n':
        update = False
    else:
        sys.exit('Unknown update input. Choose "y", "n". Exiting.')
    # read target datasets
    data_target = pd.read_csv(_args[1])
    for i, dataset_id in enumerate(data_target['id']):
        country_output = (_args[len(_args) - 1] + "/"
                          + data_target.loc[i, 'country'] + '_mobility')
        base_url = ('https://www.facebook.com/geoinsights-portal/downloads/vector/?id='
                    + str(dataset_id) + '&ds=')
        earliest_date = datetime(int(data_target.loc[i, 'year']),
                                 int(data_target.loc[i, 'month']),
                                 int(data_target.loc[i, 'day']),
                                 int(data_target.loc[i, 'hour']))
        data_dates = get_file_dates(earliest_date)
        if update:
            # keep only dates newer than the last downloaded file
            data_dates = list(compress(
                data_dates,
                [x > get_update_date(country_output) for x in data_dates]))
        if len(data_dates) == 0:
            sys.exit('No datasets to download. Exiting.')
        urls = get_urls(base_url, data_dates)
        download_data(urls, keys)
        move_most_recent_files(country_output, urls)
    print('Success.')
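# A minimal invocation sketch for main() above. The file names are
# hypothetical; the column list comes from the fields the loop actually reads.
if __name__ == '__main__':
    # targets.csv needs at least: country, id, year, month, day, hour, e.g.
    #   country,id,year,month,day,hour
    #   Chile,123456789,2020,3,1,0
    main(['download_colocation.py', 'targets.csv', './output'])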
# Runs inside a unittest.TestCase (module-level imports assumed: os,
# pandas as pd, datetime, and the get_update_date under test)
def test_get_update_date(self):
    self.data = pd.DataFrame({'data': [1, 2, 3, 4, 5]})
    os.makedirs('./tmp1', exist_ok=True)  # the fixture directory must exist
    # an empty directory has no dated files, so a ValueError is expected
    self.assertRaises(ValueError, get_update_date, './tmp1')
    self.data.to_csv('./tmp1/test_2020_01_01.csv')
    # once a dated file exists, a datetime should come back
    self.assertIsInstance(get_update_date('./tmp1'), datetime)
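# One implementation consistent with the test above (a sketch, not the
# project's actual code): scan a directory for files carrying a YYYY_MM_DD
# stamp and return the most recent date, raising ValueError when none match.
import os
import re
from datetime import datetime


def get_update_date(directory):
    dates = []
    for name in os.listdir(directory):
        match = re.search(r'([0-9]{4}_[0-9]{2}_[0-9]{2})', name)
        if match:
            dates.append(datetime.strptime(match.group(1), '%Y_%m_%d'))
    if not dates:
        raise ValueError('No dated files found in %s' % directory)
    return max(dates)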
def pull_population(outdir, keys, country, dl_variables, update, population_type):
    '''
    Download population datasets from the Geoinsights portal.

    Parameters
    ----------
    outdir : str
        Output directory.
    keys : list
        User credentials [username, password].
    country : str
        Country name; must match the .config file exactly (names with spaces
        must replace ' ' with '_').
    dl_variables : dict
        Download-specific variables: 'id' is the dataset id, 'origin' is the
        dataset origin as a datetime.datetime object.
    update : bool
        Whether an existing dataset is being updated.
    population_type : str
        Population dataset type, appended to the output folder name.

    Returns
    -------
    None.
    '''
    country_output = outdir + "/" + country + '_' + population_type
    base_url = ('https://www.facebook.com/geoinsights-portal/downloads/raster/?id='
                + str(dl_variables['id']) + '&ds=')
    earliest_date = dl_variables['origin']
    data_dates = get_file_dates(earliest_date)
    if update:
        # keep only dates newer than the last downloaded file
        data_dates = list(compress(
            data_dates,
            [x > get_update_date(country_output) for x in data_dates]))
    if len(data_dates) == 0:
        sys.exit('No datasets to download. Exiting.')
    urls = get_urls(base_url, data_dates)
    start_time = download_data(urls, keys)
    move_most_recent_files(country_output, urls, start_time)
    remove_empty_files(country_output)
    print('Success.')
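# A hedged example call for pull_population(); the dataset id, origin date,
# credentials, and 'tile' population_type are all placeholder assumptions.
if __name__ == '__main__':
    from datetime import datetime

    pull_population(outdir='./output',
                    keys=['user@example.com', 'password'],
                    country='United_Kingdom',
                    dl_variables={'id': 123456789,
                                  'origin': datetime(2020, 3, 1, 0)},
                    update=False,
                    population_type='tile')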