# Save dataset metadata as json file
filename_metadata = down_dataset_dir + '/ds_' + identifier + '_metadata.json'
write_file(filename_metadata, json.dumps(resp_ds['data']))

# Loop over all datafiles of a dataset
for df in resp_ds['data']['latestVersion']['files']:
    file_id = str(df['dataFile']['id'])

    # Create directory for datafile
    datafile_dir = down_dataset_dir + '/df_' + file_id
    if not os.path.isdir(datafile_dir):
        os.mkdir(datafile_dir)

    # Download and save datafile
    resp = api_down.get_datafile(file_id, 'content')
    filename_datafile = datafile_dir + '/df_' + str(df['dataFile']['filename'])
    write_file(filename_datafile, resp.content, 'wb')

if UPLOAD_DATA:
    CREATE_DV = False
    DELETE_DV = False
    CREATE_DS = False
    ADD_FILE = False
    DELETE_DS = False
    CREATE_DF = False

    api_token_up = os.environ["API_TOKEN_UP"]
    api_host_up = os.environ["API_HOST_UP"]
    api_up = Api(api_host_up, api_token=api_token_up, use_https=False)
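# Note: write_file() is called above but not defined in this excerpt. Below is
# a minimal sketch of what such a helper could look like; the
# (filename, data, mode) signature is assumed from the calls above, not taken
# from the original source.
def write_file(filename, data, mode='w'):
    """Write data to filename; pass mode='wb' for binary payloads."""
    with open(filename, mode) as f:
        f.write(data)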
print(api.status)

# get the digital object identifier for the Harvard Dataverse dataset
DOI = "doi:10.7910/DVN/HIDLTK"

# retrieve the contents of the dataset
covid = api.get_dataset(DOI)
covid_files_list = covid.json()['data']['latestVersion']['files']

for fileObject in covid_files_list:
    print("File name is {}; id is {}".format(
        fileObject["dataFile"]["filename"], fileObject["dataFile"]["id"]))

# get data file
US_states_cases_file = api.get_datafile("4201597")

# write the downloaded bytes to a tab-separated file on disk
in_text = US_states_cases_file.content
tmp = "US_states_cases.tab"
f = open(tmp, "wb")
f.write(in_text)
f.close()

US_states_cases = pd.read_csv(tmp, sep='\t')
print(US_states_cases.head(10))

## Cleaning data
# select columns of interest
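# The excerpt stops at "select columns of interest" without showing which
# columns are kept. A hypothetical sketch of that step is shown below; the
# column names are illustrative placeholders, not taken from the dataset.
cols_of_interest = ["NAME", "POP10"]  # hypothetical column names
existing_cols = [c for c in cols_of_interest if c in US_states_cases.columns]
US_states_cases_subset = US_states_cases[existing_cols]
print(US_states_cases_subset.head())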
def get_fsp_data_through_api(base_url, identifier):
    '''
    Takes the base URL and identifier of the FSP data, and returns a Pandas
    dataframe of the file

    Input
        base_url (str): URL of the website
        identifier (str): identifier of the desired data file

    Output
        df (Pandas dataframe): dataframe of the FSP data
    '''
    dtype_col = {
        'FormName': 'str',
        'County': 'str',
        'GPSLatitude': 'float32',
        'GPSLongitude': 'float32'
    }
    geo_columns = list(dtype_col.keys())

    # Query the Dataverse API for the dataset and list its files
    api = Api(base_url)
    resp_dataset = api.get_dataset(identifier)
    files = json.loads(resp_dataset.text)['data']['latestVersion']['files']

    df = pd.DataFrame({col: [] for col in geo_columns})
    for file in files:
        file_id = file['dataFile']['id']
        resp_datafile = api.get_datafile(file_id)
        file_extension = file['dataFile']['filename'].split('.')[-1]

        if file_extension == 'tab':
            # Parse tab-separated text, dropping rows without a GPS latitude
            rows = resp_datafile.text.split('\n')
            headers = rows[0].split('\t')
            data_rows = [
                row.replace('"', '').split('\t') for row in rows[1:]
                if row != ''
                and row.split('\t')[headers.index('GPSLatitude')] != ''
            ]
            df_file = pd.DataFrame(
                data_rows, columns=headers)[geo_columns].astype(dtype_col)
        elif file_extension == 'xlsx':
            # Read the first worksheet of the Excel file with xlrd
            workbook = xlrd.open_workbook(file_contents=resp_datafile.content)
            worksheet = workbook.sheet_by_index(0)
            col_names = [
                col_name.replace(" ", "")
                for col_name in worksheet.row_values(0)
            ]
            df_file = pd.DataFrame({col: [] for col in geo_columns})
            for col in geo_columns:
                data_col = worksheet.col_values(col_names.index(col),
                                                start_rowx=1)
                for idx_data, data in enumerate(data_col):
                    if type(data) == str:
                        data_col[idx_data] = data.replace('"', '')
                    if data in ['', '--']:
                        data_col[idx_data] = 'nan'
                df_file[col] = pd.Series(data_col, dtype=dtype_col[col])
        else:
            # Skip files that are neither .tab nor .xlsx
            continue

        df = df.append(df_file[df_file['County'] != 'nan'], ignore_index=True)

    # Build point geometries from the longitude/latitude columns
    df['geometry'] = df.apply(
        lambda x: Point(float(x['GPSLongitude']), float(x['GPSLatitude'])),
        axis=1)

    return df
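# Example usage (a sketch, not from the original source). The imports below are
# the ones the function above relies on; the base URL and DOI are illustrative
# placeholders for the Dataverse installation that hosts the FSP data.
import json

import pandas as pd
import xlrd
from pyDataverse.api import Api
from shapely.geometry import Point

if __name__ == '__main__':
    fsp_df = get_fsp_data_through_api(
        'https://dataverse.harvard.edu',   # assumed Dataverse host
        'doi:10.7910/DVN/XXXXXX')          # placeholder identifier
    print(fsp_df.head())
    print('Rows with coordinates:', len(fsp_df))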