def import_tsv(self, tsv_file):
    """Upload entity data to workspace from tsv loadfile.

    Args:
        tsv_file (file): Tab-delimited file of entity data
    """
    # Pass the tsv_file argument, not a (nonexistent) self.tsv_file attribute.
    r = fapi.upload_entities_tsv(self.namespace, self.name,
                                 tsv_file, self.api_url)
    fapi._check_response_code(r, 201)
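# A hypothetical usage sketch for the method above. It assumes
# `from firecloud import api as fapi` and that Workspace is the FISS class
# providing import_tsv; the constructor arguments shown are placeholders.
ws = Workspace("my-billing-project", "my-workspace")
ws.import_tsv("participants.tsv")  # _check_response_code raises FireCloudServerError on non-201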
def upload_entities_from_tsv(namespace, workspace, entities_tsv_file):
    """Upload entities from tsv file.

    Args:
        namespace: workspace namespace (billing project)
        workspace: workspace name
        entities_tsv_file: path to tsv file

    Returns:
        HTTP Response
    """
    res = firecloud_api.upload_entities_tsv(namespace, workspace,
                                            entities_tsv=entities_tsv_file)
    return res
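# Minimal usage sketch for the wrapper above, assuming
# `from firecloud import api as firecloud_api`; the workspace identifiers and
# filename are placeholders.
res = upload_entities_from_tsv("my-billing-project", "my-workspace", "samples.tsv")
if res.status_code != 200:
    print(f"Upload failed: {res.text}")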
def api_upload_entities(tsv, workspace, project):
    """Call API and create/update data tables."""
    response = fapi.upload_entities_tsv(project, workspace, tsv, model="flexible")
    if response.status_code != 200:
        print(f'ERROR UPLOADING: See full error message: {response.text}')
    else:
        print(f"Upload complete. Check your workspace for new {tsv.replace('.tsv', '')} table!")
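# Hypothetical invocation of the helper above, assuming
# `from firecloud import api as fapi`; the file and workspace names are
# placeholders. With model="flexible", the target table name comes from the
# TSV's first header column, e.g. an "entity:sample_id" header creates or
# updates a "sample" table.
api_upload_entities("sample.tsv", "my-workspace", "my-billing-project")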
# Imports needed by this script. FILE_TYPE_DICT, build_filter_json,
# download_manifest, prepare_workspace_attribute_list, list_downloadable_attrs,
# and create_method_configs are defined elsewhere in the original script.
import argparse
import os

from firecloud import api


def main():
    parser = argparse.ArgumentParser(
        description='create FireCloud workspace for a specific project + cohort + access type')
    parser.add_argument("project_name",
                        help="the name of the project, e.g. TCGA")
    parser.add_argument("cohort_name",
                        help="name of the cancer cohort, e.g. LUAD")
    parser.add_argument("billing_project",
                        help="name of the billing project to create the workspace under, "
                             "e.g. broad-firecloud-tcga")
    parser.add_argument("ws_suffix",
                        help="descriptive suffix to add to the auto-generated workspace "
                             "name, e.g. ControlledAccess_hg38_V1-0_DATA")
    parser.add_argument("-a", "--auth_domain",
                        help="authorization domain; for dbGaP controlled access the "
                             "domain name is TCGA-dbGaP-Authorized",
                        default="")
    args = parser.parse_args()

    # Step 1:
    # Create a new directory for the cohort and switch the working directory to it.
    if args.auth_domain:
        new_dir_name = args.project_name + "-" + args.cohort_name + "_" + args.auth_domain
    else:
        new_dir_name = args.project_name + "-" + args.cohort_name
    os.mkdir(new_dir_name)
    print("Created new directory for the {0} cohort".format(args.cohort_name))
    os.chdir(new_dir_name)
    print("Switched working directory to ./{0}".format(new_dir_name))

    # Step 2:
    # Create criteria for downloading the manifest, then download it.
    # Right now the file types selected for a new workspace depend on whether that
    # workspace has open or controlled access to the GDC data portal. This code will
    # need to be redesigned, or new keys added to the dictionary, if that assumption
    # ever changes.
    if args.auth_domain:
        file_types = FILE_TYPE_DICT[args.auth_domain]
    else:
        file_types = FILE_TYPE_DICT["default"]
    filters = dict()
    filters["cases.project.program.name"] = [args.project_name]
    filters["cases.project.project_id"] = [args.project_name + "-" + args.cohort_name]
    filters["files.access"] = file_types
    # Following directions from the GDC, controlled-access workspaces should not
    # contain BAM files, so BAM is excluded from the allowed data formats in all
    # cases (the original code set the same list on both branches).
    filters["files.data_format"] = [
        "BCR XML", "TXT", "VCF", "TSV", "MAF", "XLSX"
    ]
    filt_json = build_filter_json(filters)

    # Download manifest file to the new directory.
    manifest_filename = download_manifest(filt_json)
    print("manifest downloaded")

    # Step 3:
    # Run fcgdctools on the manifest file.
    if args.project_name == "TARGET":
        fcgdctools_command = "genFcWsLoadFiles -c " + manifest_filename + " > genFcWsLoadFiles_output.txt"
    else:
        fcgdctools_command = "genFcWsLoadFiles " + manifest_filename + " > genFcWsLoadFiles_output.txt"
    print("Executing command {0}\nPlease check the output file to see progress and check for errors."
          .format(fcgdctools_command))
    os.system(fcgdctools_command)

    # Step 4:
    # Prepare attributes to be loaded.
    workspace_attribute_filename = manifest_filename.split(".")[0] + "_workspace_attributes.txt"
    attribute_list = prepare_workspace_attribute_list(workspace_attribute_filename,
                                                      args.auth_domain)

    # Step 5:
    # Create the new workspace on FireCloud.
    workspace_name = "{0}_{1}_{2}".format(args.project_name, args.cohort_name,
                                          args.ws_suffix)
    print("New workspace name is: {0}\nPreparing to create workspace.".format(workspace_name))
    api.create_workspace(args.billing_project, workspace_name,
                         args.auth_domain, attribute_list)

    # Step 6:
    # Upload data model .tsv files to the newly created workspace.
    data_model_file_prefix = manifest_filename.split(".")[0]
    data_files = [
        "participants", "participant_sets_membership", "samples",
        "sample_sets_membership", "pairs", "pair_sets_membership"
    ]
    for filetype in data_files:
        full_name = data_model_file_prefix + "_" + filetype + ".txt"
        if os.path.exists(full_name):
            print("Uploading file {0}".format(full_name))
            api.upload_entities_tsv(args.billing_project, workspace_name, full_name)

    # Step 7:
    # Create and upload method configurations for downloading files to the new workspace.
    downloadable_attrs = list_downloadable_attrs(data_model_file_prefix,
                                                 ["participant", "sample", "pair"])
    print("The downloadable attributes are:")
    for attr in downloadable_attrs:
        print(attr[0])
    create_method_configs(args.billing_project, workspace_name,
                          downloadable_attrs, args.auth_domain)
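# Standard entry-point guard so the workflow above can be run from the command
# line (the script filename below is a placeholder), e.g.:
#   python make_workspace.py TCGA LUAD broad-firecloud-tcga ControlledAccess_hg38_V1-0_DATA -a TCGA-dbGaP-Authorized
if __name__ == "__main__":
    main()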
        f.write(line + '\n')
        i += 1

# ### Check output

# This is optional, and you may want to skip it if you have a lot of files.

# In[ ]:

with open(TABLE_NAME, 'r') as f:
    print(f.read())

# ### Upload TSV to Terra

# In[ ]:

response = fapi.upload_entities_tsv(BILLING_PROJECT_ID, WORKSPACE, TABLE_NAME, "flexible")
fapi._check_response_code(response, 200)

# ## Option 2: Multiple children

# Unlike File Finder or Option 1, this parses the output of `gsutil ls` directly.
# **As a result, if your filenames contain unusual characters (i.e., anything
# besides A-Z, a-z, underscores, and dashes) or bizarre characters such as
# newlines, there is a chance this will not work as expected.**

# In[ ]:

logger = logging.getLogger('')
logger.setLevel(logging.INFO)


def baseID(filename_string, child_extension):
    global PARENT_FILETYPE
    global INCLUDE_PARENT_EXTENSION
# Imports needed by this function; fapi is the FISS API module.
import csv
import re
from io import StringIO

from firecloud import api as fapi

# GUID_PATTERN is defined at module level in the original script; the
# UUID-shaped regex below is an assumption for illustration.
GUID_PATTERN = re.compile(
    r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$")


def update_entities_to_compact_identifier(workspace, project, single_etypes_list, dry_run):
    """Update Data Model entity attributes to DRS 1.1 Compact Identifiers."""
    for etype in single_etypes_list:
        print(f'Starting TCGA DRS updates for entity: {etype}')

        # get entity table response for API call
        res_etype = fapi.get_entities_tsv(project, workspace, etype, model="flexible")

        # save current/original data model tsv files for provenance
        print(f'Saving original {etype} TSV.')
        original_tsv_name = f"original_{etype}_{project}-{workspace}_table.tsv"
        with open(original_tsv_name, "w") as f:
            f.write(res_etype.text)

        # read entity table response into dictionaries to perform DRS URL updates
        dict_etype = list(csv.DictReader(StringIO(res_etype.text), delimiter='\t'))

        # create an empty list for updated rows and a set to capture the columns
        # that were modified
        drs_dict_table = []
        modified_cols = set()

        # each row is a dict of column:value pairs
        for row in dict_etype:
            drs_row = row.copy()
            for col in row:
                # check if the col value is a dataguids.org URL and parse out the guid
                if row[col].startswith("drs://dataguids.org"):
                    guid = row[col].split("/")[3]
                    # only modify col if guid is valid and exists
                    if guid and GUID_PATTERN.match(guid):
                        drs_url = "drs://dg.4DFC:" + guid
                        drs_row[col] = drs_url
                        modified_cols.add(col)
            # append the new row with updated drs values to the new list
            drs_dict_table.append(drs_row)

        # save new/drs-updated data model tsv files for provenance
        print(f'Saving updated {etype} TSV.')
        updated_tsv_name = f"updated_{etype}_{project}-{workspace}_table.tsv"
        tsv_headers = drs_dict_table[0].keys()

        with open(updated_tsv_name, 'w') as outfile:
            # get keys from the OrderedDict and write rows, separated by tabs
            writer = csv.DictWriter(outfile, tsv_headers, delimiter="\t")
            writer.writeheader()
            writer.writerows(drs_dict_table)

        # list of the columns that are scoped to be updated if re-run without
        # the --dry_run flag
        modified_cols = list(modified_cols)

        if dry_run:
            print(f"Columns in the {etype} table that *will be* updated when the "
                  "script is re-run without the `--dry_run` flag:")
            if not modified_cols:
                print("\t" * 4 + f"No columns to update in the {etype} table." + "\n\n")
            else:
                print('\n'.join(['\t' * 4 + c for c in modified_cols]))
                print(f"To view in detail what will be updated, inspect the "
                      f"{updated_tsv_name} file." + "\n\n")
        else:
            # upload the newly created tsv file containing drs urls
            print(f"Starting update of the {etype} table with compact DRS "
                  "identifiers (drs://dg.4DFC:GUID).")
            res_update = fapi.upload_entities_tsv(project, workspace,
                                                  updated_tsv_name, model="flexible")
            if res_update.status_code != 200:
                print(f"Could not update existing {etype} table. "
                      f"Error message: {res_update.text}")
            print(f"Finished uploading TCGA DRS updated .tsv for entity: {etype}" + "\n")
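# Sketch of a driver for the updater above; the workspace identifiers and
# entity types are placeholders. A dry run writes the provenance TSVs and
# reports which columns would change, without uploading anything.
update_entities_to_compact_identifier(workspace="my-workspace",
                                      project="my-billing-project",
                                      single_etypes_list=["participant", "sample"],
                                      dry_run=True)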