import json

from firecloud import api as fapi


def copy_workspace_entities_sushma(destination_workspace_namespace, destination_workspace_name, source_workspace_namespace, source_workspace_name, destination_workspace_bucket):
    """Copy workspace data tables to destination workspace."""
    source_etypes = fapi.list_entity_types(source_workspace_namespace, source_workspace_name)
    if source_etypes.status_code != 200:  # getting list of data tables fails
        message = f"Failed to retrieve list of data tables (entity types) from: {source_workspace_namespace}/{source_workspace_name}. API error: {source_etypes.text}."
        print(message)
        return False, message

    source_set_etypes = [s for s in list(source_etypes.json().keys()) if s.endswith("_set")]
    source_single_etypes = [s for s in list(source_etypes.json().keys()) if not s.endswith("_set")]

    # for each table that is not a set
    for etype in source_single_etypes:
        # get entity names for etype
        entities = fapi.get_entities(source_workspace_namespace, source_workspace_name, etype)
        if entities.status_code != 200:  # getting an etype's entities fails
            message = f"Failed to retrieve entities (row names) for {etype}. API error: {entities.text}"
            print(message)
            return False, message
        entity_names = [ent["name"] for ent in entities.json()]

        # copy single etype (with entities) to destination workspace
        copy_response = fapi.copy_entities(source_workspace_namespace, source_workspace_name, destination_workspace_namespace, destination_workspace_name, etype, entity_names, link_existing_entities=True)
        # 201 (created) and 409 (conflict: entities already exist) are both acceptable
        if copy_response.status_code not in [201, 409]:
            message = f"Failed to copy {etype} with entities({entity_names}) to {destination_workspace_namespace}/{destination_workspace_name}. API error: {copy_response.text}."
            print(message)
            return False, message

    # for each set table
    for set_etype in source_set_etypes:
        # get entity names for set etype
        set_entities = fapi.get_entities(source_workspace_namespace, source_workspace_name, set_etype)
        if set_entities.status_code != 200:  # getting a set etype's entities fails
            message = f"Failed to retrieve entities (row names) for {set_etype}. API error: {set_entities.text}"
            print(message)
            return False, message
        set_entity_names = [ent["name"] for ent in set_entities.json()]

        # copy set etype (with entities) to destination workspace
        set_copy_response = fapi.copy_entities(source_workspace_namespace, source_workspace_name, destination_workspace_namespace, destination_workspace_name, set_etype, set_entity_names, link_existing_entities=True)
        if set_copy_response.status_code not in [201, 409]:  # if copying set table with entities fails
            message = f"Failed to copy {set_etype} with entities({set_entity_names}) to {destination_workspace_namespace}/{destination_workspace_name}. API error: {set_copy_response.text}."
            print(message)
            return False, message

    print(f"Successfully copied data tables to {destination_workspace_namespace}/{destination_workspace_name}: {list(source_etypes.json().keys())}")

    # get original workspace bucket id
    get_bucket_success, get_bucket_message = get_workspace_bucket(source_workspace_name, source_workspace_namespace)
    # TODO: handle if getting workspace bucket fails
    source_bucket = json.loads(get_bucket_message)["workspace"]["bucketName"]
    destination_bucket = destination_workspace_bucket.replace("gs://", "")

    # update bucket links in the destination workspace so that they match the path structure
    # the WDL generates when it migrates data: gs://new_bucket_id/original_bucket_id/[original data structure]
    update_entities(destination_workspace_name, destination_workspace_namespace, replace_this=source_bucket, with_this=f"{destination_bucket}/{source_bucket}")

    print(f"Successfully updated data tables with new bucket paths in {destination_workspace_namespace}/{destination_workspace_name}.")
    return True, list(source_etypes.json().keys())
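
# A minimal usage sketch for copy_workspace_entities_sushma; the namespaces,
# workspace names, and destination bucket below are hypothetical placeholders.
success, details = copy_workspace_entities_sushma(
    destination_workspace_namespace="my-dest-namespace",   # hypothetical
    destination_workspace_name="my-dest-workspace",        # hypothetical
    source_workspace_namespace="my-source-namespace",      # hypothetical
    source_workspace_name="my-source-workspace",           # hypothetical
    destination_workspace_bucket="gs://fc-new-bucket-id")  # hypothetical
if success:
    print(f"Copied entity types: {details}")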
def __get_entities(self, etype):
    """Helper to get entities for a given type."""
    r = fapi.get_entities(self.namespace, self.name, etype, self.api_url)
    fapi._check_response_code(r, 200)
    return [Entity(e['entityType'], e['name'], e['attributes']) for e in r.json()]
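
# Minimal sketch of the Entity container assumed by __get_entities above; the
# real class in the source project may carry more behavior than shown here.
class Entity:
    def __init__(self, etype, name, attributes):
        self.etype = etype            # entity type, e.g. "sample"
        self.name = name              # entity (row) name
        self.attributes = attributes  # {attribute name: attribute value}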
def test_get_entities(self):
    """Test get_entities()."""
    r = fapi.get_entities(self.project, self.workspace, "participant")
    print(r.status_code, r.content)
    self.assertEqual(r.status_code, 200)
from attrdict import AttrDict  # assumption: AttrDict from the attrdict package; the source may define its own
from firecloud import api as FAPI


def get_entities(namespace='anvil-datastorage', workspace=None, entity_name=None):
    """Return all entities in a workspace."""
    entities = [AttrDict(e) for e in FAPI.get_entities(namespace, workspace, entity_name).json()]
    return entities
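
# Example (hypothetical workspace and table names): fetch all "sample" rows
# and read fields with dot access via AttrDict.
samples = get_entities(workspace="my-workspace", entity_name="sample")  # hypothetical names
for sample in samples:
    print(sample.name, sample.attributes)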
import pandas as pd
from firecloud import api as fapi


def load_table(namespace, workspace, table_name, store_membership=False):
    """Load a workspace data table into a pandas DataFrame, optionally capturing set membership."""
    ent_old = fapi.get_entities(namespace, workspace, table_name).json()
    tbl_old = None
    membership = None
    if len(ent_old) > 0:
        tbl_old = pd.DataFrame(list(map(lambda e: e['attributes'], ent_old)))
        tbl_old[f"entity:{table_name}_id"] = list(map(lambda f: f['name'], ent_old))
        if store_membership:
            # record the set of member entity names from each row's 'samples' attribute
            membership = list(map(lambda g: set(map(lambda h: h['entityName'], g['items'])), tbl_old['samples']))
            del tbl_old['samples']
        # move the entity:table_name_id column to the front
        c = list(tbl_old.columns)
        c.remove(f"entity:{table_name}_id")
        c = [f"entity:{table_name}_id"] + c
        tbl_old = tbl_old[c]
        tbl_old = tbl_old.astype(str)
    return tbl_old, membership
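
# Sketch: load a plain table, then a set table with membership captured; the
# namespace and workspace names are hypothetical. store_membership assumes the
# table has a 'samples' attribute, as in a sample_set table.
sample_tbl, _ = load_table("my-namespace", "my-workspace", "sample")
set_tbl, set_membership = load_table("my-namespace", "my-workspace", "sample_set", store_membership=True)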
from firecloud import api as fapi


def get_sample_sets(namespace, workspace, batches):
    """Return sample_set entities whose names appear in the given batches."""
    response = fapi.get_entities(namespace, workspace, 'sample_set')
    fapi._check_response_code(response, 200)
    return [entity for entity in response.json() if entity['name'] in batches]
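
# Sketch: select sample sets matching a known batch list (all names hypothetical).
selected = get_sample_sets("my-namespace", "my-workspace", batches={"batch_01", "batch_02"})
print([entity["name"] for entity in selected])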
import pandas as pd
from firecloud import api as fapi


def gather_and_concatenate_data_model_tsvs(input_file, entity_name):
    """Get a data table from a list of workspaces and concatenate the results into tsv reports."""
    # read full excel sheet into dataframe - all rows of workspace project and workspace names
    workspace_info = pd.read_excel(input_file, sheet_name="Sheet1", index_col=None)

    # instantiate empty lists to hold entity information from all workspaces
    all_workspace_entities = []
    failed_workspaces = []

    # for each workspace_name, workspace_project pair
    for index, workspace in workspace_info.iterrows():
        # get workspace details
        workspace_name = workspace["workspace_name"]
        workspace_project = workspace["workspace_project"]

        # get a response with all attributes for each row in entity table
        entities = fapi.get_entities(workspace_project, workspace_name, entity_name)

        # if the get entities call fails, record the workspace details and skip to the next workspace
        if entities.status_code != 200:
            print(f"{entity_name} table in {workspace_project}/{workspace_name} does not exist or user does not have workspace access.")
            failed_workspaces.append({"workspace_project": workspace_project, "workspace_name": workspace_name})
            continue

        # for each row in entity table, re-format nested response json
        for entity in entities.json():
            entity_attributes = entity["attributes"]  # {attr name: attr value} for each row
            entity_id = entity["name"]  # name of entity

            # insert entity_id, workspace_project, and workspace_name into the attribute dictionary
            entity_attributes[f"entity:{entity_name}_id"] = entity_id
            entity_attributes["workspace_project"] = workspace_project
            entity_attributes["workspace_name"] = workspace_name

            # add entity information (dictionary) to list
            all_workspace_entities.append(entity_attributes)

        print(f"{entity_name} table in {workspace_project}/{workspace_name} successfully gathered.")

    # successful entity dictionaries -> df - one dict per row (entity) across all workspaces
    succeeded_data = pd.DataFrame(all_workspace_entities)
    # failed workspaces -> df
    failed_data = pd.DataFrame(failed_workspaces)

    # reorder dataframe so the entity:table_name_id column is first
    ent_id_col = succeeded_data.pop(f"entity:{entity_name}_id")
    succeeded_data.insert(0, ent_id_col.name, ent_id_col)

    # write final dataframes to tsv files - separate files for succeeded and failed data
    succeeded_output_filename = input_file.split("/")[-1].split(".")[0] + "_succeeded.tsv"
    failed_output_filename = input_file.split("/")[-1].split(".")[0] + "_failed.tsv"
    succeeded_data.to_csv(succeeded_output_filename, sep="\t", index=None)
    failed_data.to_csv(failed_output_filename, sep="\t", index=None)

    # if any failures, print a warning message
    if len(failed_workspaces) > 0:
        print(f"Warning: Completed gather and concatenate with the exception of some workspace(s). Please examine details in {failed_output_filename}.")
        return

    # else print success message
    print(f"Successfully completed gather and concatenate for all workspaces. Results can be found in {succeeded_output_filename}.")
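
# Sketch invocation, assuming an Excel file (name hypothetical) whose Sheet1
# has "workspace_name" and "workspace_project" columns; this writes
# workspaces_succeeded.tsv and workspaces_failed.tsv to the working directory.
gather_and_concatenate_data_model_tsvs("workspaces.xlsx", "sample")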
from collections import defaultdict

from firecloud import api
import hail as hl
import os
import pandas as pd
import re
import tqdm

hl.init(log="/dev/null")

#%%

entities = api.get_entities("cmg-exomes-gcnv", "cmg_gcnv", "sample_set").json()

#%%

# copied from https://stackabuse.com/python-how-to-flatten-list-of-lists/
def flatten(list_of_lists):
    """Recursively flatten an arbitrarily nested list."""
    if len(list_of_lists) == 0:
        return list_of_lists
    if isinstance(list_of_lists[0], list):
        return flatten(list_of_lists[0]) + flatten(list_of_lists[1:])
    return list_of_lists[:1] + flatten(list_of_lists[1:])

#%%

samples_counter = 0
gcnv_cluster_to_sample_bed_paths = {}
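
#%%

# Quick sanity check of flatten() on an arbitrarily nested list (illustrative only).
assert flatten([[1, [2]], [3], 4]) == [1, 2, 3, 4]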