def retrieve(search, search_type):  # pragma: no cover
    """Fetch and flatten BacDive records for one or more search values.

    For every search value a lookup URL is built with ``get_url`` and
    queried through ``Dive(url).call()``.  Each entry of the response is
    then fetched individually (``'<entry url>?format=json'``), flattened
    with ``flatten_df`` and indexed by the string
    ``DSMZ_id||Section||Subsection||Field_ID``.  The per-entry frames of one
    search value are concatenated and passed to ``implode_fattened_df``.

    Parameters
    ----------
    search : object or list
        A single search value or a list of them; a non-list value is
        wrapped in a one-element list.
    search_type : object
        Forwarded unchanged to ``get_url``.

    Returns
    -------
    pandas.DataFrame
        The per-search results concatenated column-wise (``axis=1``) and
        sorted by index.  An empty DataFrame is returned when no search
        value produced any record.
    """
    if not isinstance(search, list):
        search = [search]

    build_final = []
    for search_value in search:
        url = get_url(search_value, search_type)
        try:
            locations = Dive(url).call()
        except Exception:
            # Best-effort lookup: a failing search value is reported and
            # skipped.  ``Exception`` (not a bare ``except``) so that
            # Ctrl-C / SystemExit still abort a long retrieval.
            warn('no information found for: ' + str(search_value))
            continue

        # The response is handled in two shapes: a dict carrying the
        # entries under 'results', or a plain list of entries.
        if isinstance(locations, dict):
            entries = locations['results']
        elif isinstance(locations, list):
            entries = locations
        else:
            entries = []

        flat_dfs = []
        for ulrloc in tqdm(entries):
            df_ = flatten_df(Dive('%s?format=json' % ulrloc['url']).call())
            # The record id is the second-to-last path segment of the URL.
            df_['DSMZ_id'] = [ulrloc['url'].split('/')[-2]] * df_.shape[0]
            df_.index = (df_['DSMZ_id'] + '||' + df_['Section'] + '||'
                         + df_['Subsection'] + '||' + df_['Field_ID']).values
            flat_dfs.append(df_)

        if not flat_dfs:
            # pd.concat raises ValueError on an empty list; treat "no
            # records" like a failed lookup instead of crashing.
            warn('no information found for: ' + str(search_value))
            continue

        build_final.append(implode_fattened_df(pd.concat(flat_dfs)))

    if not build_final:
        # Nothing was retrieved for any search value.
        return pd.DataFrame()
    return pd.concat(build_final, axis=1).sort_index()
def test_flatten_df(self):
    """Check flatten_df + implode_fattened_df against the stored CSV.

    The JSON fixture is flattened, keyed the same way ``retrieve`` keys
    its records, imploded and sorted; the 'Section' and 'Field' index
    levels must then match those of the expected CSV fixture.
    """
    record_url = ('https://bacdive.dsmz.de/api/'
                  'bacdive/bacdive_id/132558/?format=json')
    json_path = get_data_path('test.json')
    expected = pd.read_csv(get_data_path('test.csv'), index_col=[0, 1, 2])
    expected = expected.sort_index()

    with open(json_path) as handle:
        raw_record = json.load(handle)

    observed = flatten_df(raw_record)
    # The record id is the second-to-last path segment of the URL.
    dsmz_id = record_url.split('/')[-2]
    observed['DSMZ_id'] = [dsmz_id] * observed.shape[0]

    # Build the 'DSMZ_id||Section||Subsection||Field_ID' row key.
    row_key = observed['DSMZ_id']
    for column in ('Section', 'Subsection', 'Field_ID'):
        row_key = row_key + '||' + observed[column]
    observed.index = row_key.values

    observed = implode_fattened_df(observed).sort_index()

    for level in ('Section', 'Field'):
        self.assertListEqual(
            list(expected.index.get_level_values(level)),
            list(observed.index.get_level_values(level)))
def retrieve(search, search_type):  # pragma: no cover
    """Fetch BacDive records for one or more search values.

    For every search value a URL is built with ``get_url``.  When
    ``search_type`` is ``'bacdive_id'`` that URL is used directly as the
    single record to fetch; otherwise it is queried through
    ``Dive(url).call()`` to obtain the list of matching entries.  Each
    entry is fetched individually (``'<entry url>?format=json'``),
    flattened with ``flatten_df`` and indexed by the string
    ``DSMZ_id||Section||Subsection||Field_ID``.  The ``'Field'`` values of
    one search value are then grouped by that index into lists.

    Parameters
    ----------
    search : object or list
        A single search value or a list of them; a non-list value is
        wrapped in a one-element list.
    search_type : object
        Forwarded to ``get_url``; the value ``'bacdive_id'`` skips the
        search request.

    Returns
    -------
    object
        Whatever ``clean_cat`` returns for the transposed DataFrame built
        from the per-search grouped series.
    """
    if not isinstance(search, list):
        search = [search]

    build_final = []
    for search_value in search:
        url = get_url(search_value, search_type)
        if search_type == 'bacdive_id':
            # No search request for an id lookup: wrap the URL in the same
            # dict layout that the entry loop below reads.
            locations = {
                'count': 1,
                'next': None,
                'previous': None,
                'results': [{'url': url}],
            }
        else:
            try:
                locations = Dive(url).call()
            except Exception:
                # Best-effort lookup: a failing search value is reported
                # and skipped.  ``Exception`` (not a bare ``except``) so
                # that Ctrl-C / SystemExit still abort a long retrieval.
                warn('no information found for: ' + str(search_value))
                continue

        # The response is handled in two shapes: a dict carrying the
        # entries under 'results', or a plain list of entries.
        if isinstance(locations, dict):
            entries = locations['results']
        elif isinstance(locations, list):
            entries = locations
        else:
            entries = []

        flat_dfs = []
        for ulrloc in tqdm(entries):
            df_ = flatten_df(Dive('%s?format=json' % ulrloc['url']).call())
            # The record id is the second-to-last path segment of the URL.
            df_['DSMZ_id'] = [ulrloc['url'].split('/')[-2]] * df_.shape[0]
            df_.index = (df_['DSMZ_id'] + '||' + df_['Section'] + '||'
                         + df_['Subsection'] + '||' + df_['Field_ID']).values
            flat_dfs.append(df_)

        if not flat_dfs:
            # pd.concat raises ValueError on an empty list; treat "no
            # records" like a failed lookup instead of crashing.
            warn('no information found for: ' + str(search_value))
            continue

        combined = pd.concat(flat_dfs)
        # Collect every 'Field' value sharing the same row key into a list.
        build_final.append(
            combined.groupby(combined.index)['Field'].apply(list))

    return clean_cat(pd.DataFrame(build_final).T)