Example #1
    def import_tsv(self, tsv_file):
        """Upload entity data to workspace from tsv loadfile.

        Args:
            tsv_file (file): Tab-delimited file of entity data
        """
        r = fapi.upload_entities_tsv(self.namespace, self.name,
                                     tsv_file, self.api_url)
        fapi._check_response_code(r, 201)
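
A hypothetical usage sketch (not from the original source), assuming import_tsv is a method of a workspace wrapper class that exposes namespace, name, and api_url attributes:

# Hypothetical wrapper class and names, shown only to illustrate how import_tsv would be called.
ws = Workspace(namespace="my-billing-project", name="my-workspace")
ws.import_tsv("participants.tsv")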
Example #2
def upload_entities_from_tsv(namespace, workspace, entities_tsv_file):
    """Upload entities to a workspace from a tsv file.

    Args:
        namespace: workspace namespace (billing project)
        workspace: workspace name
        entities_tsv_file: path to the tsv file of entity data
    Returns:
        HTTP response from the FireCloud API
    """
    res = firecloud_api.upload_entities_tsv(namespace,
                                            workspace,
                                            entities_tsv=entities_tsv_file)
    return res
Example #3
def api_upload_entities(tsv, workspace, project):
    """Call API and create/update data tables."""

    response = fapi.upload_entities_tsv(project,
                                        workspace,
                                        tsv,
                                        model="flexible")
    if response.status_code != 200:
        print(f'ERROR UPLOADING: See full error message: {response.text}')
    else:
        print(
            f"Upload complete. Check your workspace for new {tsv.replace('.tsv', '')} table!"
        )
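
A hypothetical call (file, workspace, and project names are placeholders); with model="flexible", the entity type is taken from the TSV's first column header (for example entity:sample_id), which is why the message above assumes the file is named after the table:

# Hypothetical invocation of the function above; all names are placeholders.
api_upload_entities("sample.tsv", "my-workspace", "my-billing-project")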
Example #4
def main():

    parser = argparse.ArgumentParser(
        description=
        'create FireCloud workspace for a specific project + cohort + access type'
    )
    parser.add_argument("project_name",
                        help="the name of the project. e.g: TCGA")
    parser.add_argument("cohort_name", help="name_of_cancer_cohort. e.g: LUAD")
    parser.add_argument(
        "billing_project",
        help=
        "name of billing project to create the workspace under. e.g: broad-firecloud-tcga"
    )
    parser.add_argument(
        "ws_suffix",
        help=
        "descriptive suffix to add to the workspace auto-generated name. e.g: ControlledAccess_hg38_V1-0_DATA"
    )
    parser.add_argument(
        "-a",
        "--auth_domain",
        help=
        "authorization domain. for dbGaP controlled access the domain name is TCGA-dbGaP-Authorized.",
        default="")

    args = parser.parse_args()

    #STEP 1:
    #Create new directory for the cohort and switch wd to this directory
    if args.auth_domain:
        new_dir_name = args.project_name + "-" + args.cohort_name + "_" + args.auth_domain
    else:
        new_dir_name = args.project_name + "-" + args.cohort_name
    os.mkdir(new_dir_name)
    print("Created new directory for the {0} cohort".format(args.cohort_name))
    os.chdir(new_dir_name)
    print("Switched working directory to ./{0}".format(new_dir_name))

    #STEP 2:
    #Create criteria for downloading the manifest and then download it.
    #Right now the file types that are selected for a new workspace depend on whether that workspace is to have open/controlled access to the GDC data portal.
    #This code will need to be redesigned, or new keys will have to be added to the dictionary, if this assumption ever changes (a hypothetical sketch of the dictionary's shape follows the if/else below).
    if args.auth_domain:
        file_types = FILE_TYPE_DICT[args.auth_domain]
    else:
        file_types = FILE_TYPE_DICT["default"]
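    # A hypothetical sketch (an assumption, not the script's actual definition) of
    # the shape FILE_TYPE_DICT is expected to have: keys are auth domain names plus
    # a "default" entry, and values are lists of GDC access levels, e.g.
    # FILE_TYPE_DICT = {"TCGA-dbGaP-Authorized": ["open", "controlled"],
    #                   "default": ["open"]}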

    filters = dict()
    filters["cases.project.program.name"] = [args.project_name]
    filters["cases.project.project_id"] = [
        args.project_name + "-" + args.cohort_name
    ]
    filters["files.access"] = file_types
    #Following directions from the GDC, we were told that controlled access workspaces should not contain BAM files.
    #Open and controlled access workspaces currently use the same list of data formats (which excludes BAM).
    filters["files.data_format"] = [
        "BCR XML", "TXT", "VCF", "TSV", "MAF", "XLSX"
    ]

    filt_json = build_filter_json(filters)

    #Download manifest file to the new directory
    manifest_filename = download_manifest(filt_json)
    print("manifest downloaded")

    #Step 3:
    #Run fcgdctools on the manifest file
    if args.project_name == "TARGET":
        fcgdctools_command = "genFcWsLoadFiles -c " + manifest_filename + ">genFcWsLoadFiles_output.txt"
    else:
        fcgdctools_command = "genFcWsLoadFiles " + manifest_filename + ">genFcWsLoadFiles_output.txt"

    print(
        "Executing command {0}\nPlease check the output file to see progress and check for errors."
        .format(fcgdctools_command))
    os.system(fcgdctools_command)

    #Step 4:
    #Prepare attributes to be loaded
    workspace_attribute_filename = manifest_filename.split(
        ".")[0] + "_workspace_attributes.txt"
    attribute_list = prepare_workspace_attribute_list(
        workspace_attribute_filename, args.auth_domain)

    #Step 5:
    #Create the new workspace on FireCloud
    workspace_name = "{0}_{1}_{2}".format(args.project_name, args.cohort_name,
                                          args.ws_suffix)
    print("New workspace name is: {0}\nPreparing to create workspace.".format(
        workspace_name))
    api.create_workspace(args.billing_project, workspace_name,
                         args.auth_domain, attribute_list)

    #Step 6:
    #Upload data model .tsv files to the newly created workspace
    data_model_file_prefix = manifest_filename.split(".")[0]
    data_files = [
        "participants", "participant_sets_membership", "samples",
        "sample_sets_membership", "pairs", "pair_sets_membership"
    ]
    for filetype in data_files:
        full_name = data_model_file_prefix + "_" + filetype + ".txt"
        if os.path.exists(full_name):
            print("Uploading file {0}".format(full_name))
            api.upload_entities_tsv(args.billing_project, workspace_name,
                                    full_name)

    #Step 7:
    #Create and Upload method configurations for downloading files to the new workspace
    downloadable_attrs = list_downloadable_attrs(
        data_model_file_prefix, ["participant", "sample", "pair"])
    print("The downloadable attributes are:")
    for attr in downloadable_attrs:
        print(attr[0])
    create_method_configs(args.billing_project, workspace_name,
                          downloadable_attrs, args.auth_domain)
Example #5
                    f.write(line + '\n')
                    i += 1

# ### Check output
# This is optional and you may want to skip it if you have a lot of files.

# In[ ]:

with open(TABLE_NAME, 'r') as f:
    print(f.read())

# ### Upload TSV to Terra

# In[ ]:

response = fapi.upload_entities_tsv(BILLING_PROJECT_ID, WORKSPACE, TABLE_NAME,
                                    "flexible")
fapi._check_response_code(response, 200)

# ## Option 2: Multiple children

# Unlike File Finder or Option 1, this parses the output of `gsutil ls` directly. **As a result, if your filenames contain non-ASCII characters (i.e., anything besides A-Z, a-z, underscores, and dashes) or bizarre characters (e.g., newlines), there is a chance this will not work as expected.** A hypothetical validation sketch follows the logger setup below.

# In[ ]:

logger = logging.getLogger('')
logger.setLevel(logging.INFO)
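
# A minimal, hypothetical sketch (not part of the original notebook) showing how
# object names parsed from `gsutil ls` could be screened for unexpected
# characters before building the TSV; the SAFE_NAME pattern is an assumption.
import re

SAFE_NAME = re.compile(r'^[A-Za-z0-9_.\-]+$')


def looks_safe(gs_path):
    """Return True if the object's basename uses only conservative characters."""
    return bool(SAFE_NAME.match(gs_path.rstrip('/').rsplit('/', 1)[-1]))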


def baseID(filename_string, child_extension):
    global PARENT_FILETYPE
    global INCLUDE_PARENT_EXTENSION
def update_entities_to_compact_identifier(workspace, project,
                                          single_etypes_list, dry_run):
    """Update Data Model entity attributes to DRS 1.1 Compact Identifier."""

    for etype in single_etypes_list:
        print(f'Starting TCGA DRS updates for entity: {etype}')

        # get entity table response for API call
        res_etype = fapi.get_entities_tsv(project,
                                          workspace,
                                          etype,
                                          model="flexible")

        # save current/original data model tsv files for provenance
        print(f'Saving original {etype} TSV.')
        original_tsv_name = f"original_{etype}_{project}-{workspace}_table.tsv"
        with open(original_tsv_name, "w") as f:
            f.write(res_etype.text)

        # read entity table response into dictionary to perform DRS URL updates
        dict_etype = list(
            csv.DictReader(StringIO(res_etype.text), delimiter='\t'))

        # list to collect updated rows and a set to capture which columns were modified
        drs_dict_table = []
        modified_cols = set()
        # each row from DictReader is a dict of {column: value}
        for row in dict_etype:
            drs_row = row.copy()
            # for each column in row
            for col in row:
                # check if the col values are dataguids.org URLs and parse out guid
                if row[col].startswith("drs://dataguids.org"):
                    guid = row[col].split("/")[3]
                    # only modify col if guid is valid and exists
                    if guid and GUID_PATTERN.match(guid):
                        drs_url = "drs://dg.4DFC:" + guid
                        drs_row[col] = drs_url
                        modified_cols.add(col)

            # append new "row" with updated drs values to new list
            drs_dict_table.append(drs_row)

        # save new/drs updated data model tsv files for provenance
        print(f'Saving updated {etype} TSV.')
        updated_tsv_name = f"updated_{etype}_{project}-{workspace}_table.tsv"
        tsv_headers = drs_dict_table[0].keys()

        with open(updated_tsv_name, 'w') as outfile:
            # get keys from OrderedDictionary and write rows, separate with tabs
            writer = csv.DictWriter(outfile, tsv_headers, delimiter="\t")
            writer.writeheader()
            writer.writerows(drs_dict_table)

        # list of the columns that are scoped to be updated if re-run without --dry_run flag
        modified_cols = list(modified_cols)
        if dry_run:
            print(
                f"Columns in the {etype} table that *will be* updated when the script is re-run without the `--dry_run` flag:"
            )
            if not modified_cols:
                print("\t" * 4 +
                      f"No columns to update in the {etype} table." + "\n\n")
            else:
                print('\n'.join(['\t' * 4 + c for c in modified_cols]))
                print(
                    f"To view in detail what will be updated, inspect the {updated_tsv_name} file."
                    + "\n\n")
        else:
            # upload newly created tsv file containing drs urls
            print(
                f"Starting update of the {etype} table with compact DRS identifiers (drs://df.4DFC:GUID)."
            )

            res_update = fapi.upload_entities_tsv(project,
                                                  workspace,
                                                  updated_tsv_name,
                                                  model="flexible")
            if res_update.status_code != 200:
                print(
                    f"Could not update existing {etype} table. Error message: {res_update.text}"
                )

            print(
                f"Finished uploading TCGA DRS updated .tsv for entity: {etype}"
                + "\n")