Example No. 1
def get_subject_dats_material(cache, p_subject, gh_subject, var_lookup):
    subj_id = p_subject['SUBJID']['mapped_value']

    # retrieve id reference for the Identifier of the DATS Dimension for the "all subjects" consent group version of the variable
    def get_var_id(name):
        return var_lookup[name]['dim'].get("identifier").getIdRef()

    # human experimental subject/patient
    subject_sex = DatsObj(
        "Dimension", [("name", util.get_value_annotation("Gender", cache)),
                      ("description", "Gender of the subject"),
                      ("identifier", get_var_id("SEX")),
                      ("values", [p_subject['SEX']['mapped_value']])])

    subject_age = DatsObj(
        "Dimension", [("name", util.get_value_annotation("Age range", cache)),
                      ("description", "Age range of the subject"),
                      ("identifier", get_var_id("AGE")),
                      ("values", [p_subject['AGE']['mapped_value']])])

    subject_hardy_scale = DatsObj(
        "Dimension",
        [("name", util.get_value_annotation("Hardy scale", cache)),
         ("description", "Hardy scale death classification for the subject"),
         ("identifier", get_var_id("DTHHRDY")),
         ("values", [p_subject['DTHHRDY']['mapped_value']])])

    subject_characteristics = [subject_sex, subject_age, subject_hardy_scale]

    # use URI from GTEx id dump if present
    identifier = subj_id
    if gh_subject is not None:
        identifier = gh_subject['Destination URL']['raw_value']

    # human experimental subject/patient
    subject_material = DatsObj(
        "Material",
        [("name", subj_id),
         ("identifier", DatsObj("Identifier", [("identifier", identifier)])),
         ("description", "GTEx subject " + subj_id),
         ("characteristics", subject_characteristics),
         ("taxonomy", [util.get_taxon_human(cache)]),
         ("roles", util.get_donor_roles(cache))])

    # add to the cache
    subj_key = ":".join(["Material", subj_id])
    dats_subj = cache.get_obj_or_ref(subj_key, lambda: subject_material)

    return dats_subj
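
The cache.get_obj_or_ref call used above memoizes DATS objects by key, so repeated requests for the same subject return a lightweight id reference rather than a duplicate object. A minimal sketch of that pattern, assuming a simplified cache class (the real DatsObjCache in this codebase may differ):

# Hypothetical, simplified sketch of the memoization pattern; the real
# DatsObjCache implementation may differ from this.
class SimpleDatsObjCache:
    def __init__(self):
        self._objs = {}

    def get_obj_or_ref(self, key, make_obj):
        # First request builds and stores the full object; later requests
        # return only an id reference to the stored object.
        if key in self._objs:
            return self._objs[key].getIdRef()
        obj = make_obj()
        self._objs[key] = obj
        return obj
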
Example No. 2
def make_data_standard(format):
    return DatsObj("DataStandard",
                   [("name", format),
                    ("type", util.get_value_annotation("format", cache)),
                    ("description", format + " file format")])
Example No. 3
def get_files_dats_datasets(cache, dats_samples_d, p_samples, gh_samples,
                            protected_cram_files, no_circular_links):
    file_datasets = []

    rnaseq_datatype = DatsObj("DataType", [
        ("information", util.get_annotation("transcription profiling", cache)),
        ("method", util.get_annotation("RNA-seq assay", cache)),
        ("platform", util.get_annotation("Illumina", cache))
    ])

    def get_rnaseq_datatype():
        dkey = ".".join(["DataType", "RNA-seq"])
        return cache.get_obj_or_ref(dkey, lambda: rnaseq_datatype)

    wgs_datatype = DatsObj(
        "DataType",
        [("information", util.get_annotation("DNA sequencing", cache)),
         ("method", util.get_annotation("whole genome sequencing assay",
                                        cache)),
         ("platform", util.get_annotation("Illumina", cache))])

    def get_wgs_datatype():
        dkey = ".".join(["DataType", "WGS"])
        return cache.get_obj_or_ref(dkey, lambda: wgs_datatype)

    broad_key = ":".join(["Organization", "Broad Institute"])
    broad = cache.get_obj_or_ref(
        broad_key,
        lambda: DatsObj("Organization", [("name", "Broad Institute")]))
    creators = [broad]

    def make_data_standard(format):
        return DatsObj("DataStandard",
                       [("name", format),
                        ("type", util.get_value_annotation("format", cache)),
                        ("description", format + " file format")])

    cram_ds_key = ":".join(["DataStandard", "CRAM"])
    cram_ds = cache.get_obj_or_ref(cram_ds_key,
                                   lambda: make_data_standard("CRAM"))

    crai_ds_key = ":".join(["DataStandard", "CRAI"])
    crai_ds = cache.get_obj_or_ref(crai_ds_key,
                                   lambda: make_data_standard("CRAI"))

    for sample_id in protected_cram_files:
        file = protected_cram_files[sample_id]
        material_type = None
        ds_types = None

        # determine file type
        if re.search(r'wgs\/', file['cram_file_aws']['raw_value']):
            material_type = 'DNA'
            ds_types = get_wgs_datatype()
            gcp_suffix = '_gcp'
        elif re.search(r'rnaseq\/', file['cram_file_aws']['raw_value']):
            material_type = 'RNA'
            ds_types = get_rnaseq_datatype()
            gcp_suffix = ''
        else:
            logging.fatal(
                "unable to determine material/sequence type from cram_file_aws="
                + file['cram_file_aws']['raw_value'])
            sys.exit(1)

        # RNA-Seq keys = sample_id, cram_file, cram_file_md5, cram_file_size, cram_index, cram_file_aws, cram_index_aws
        # WGS keys = same as above + firecloud_id
        cram_file = file['cram_file' + gcp_suffix]['raw_value']
        cram_file_md5 = file['cram_file_md5']['raw_value']

        # TODO - review the following encoding decisions:
        #  - storing .crai URI as relatedIdentifier of the DatasetDistribution for the .cram file
        #  - storing MD5 checksum of the .cram file as an extraProperty of the DatasetDistribution
        #  - storing firecloud_id as a relatedIdentifier of the Dataset (not the DatasetDistribution)

        # Google Cloud Platform / Google Storage copy
        gs_access = DatsObj(
            "Access",
            [("landingPage", file['cram_file' + gcp_suffix]['raw_value'])])
        gs_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", gs_access),
                ("identifier",
                 DatsObj("Identifier",
                         [("identifier",
                           file['cram_file' + gcp_suffix]['raw_value'])])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier",
                            [("identifier",
                              file['cram_index' + gcp_suffix]['raw_value']),
                             ("relationType", "cram_index")])
                ]),
                ("size", int(file['cram_file_size']['raw_value'])),
                # TODO - add unit for bytes, include IRI?
                #                ("unit", util.get_value_annotation("bytes", cache))
                ("conformsTo", [
                    cache.get_obj_or_ref(cram_ds_key,
                                         lambda: make_data_standard("CRAM"))
                ])
            ])

        # AWS / S3 copy
        s3_access = DatsObj(
            "Access", [("landingPage", file['cram_file_aws']['raw_value'])])
        s3_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", s3_access),
                ("identifier",
                 DatsObj("Identifier", [
                     ("identifier", file['cram_file_aws']['raw_value'])
                 ])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier", [
                        ("identifier", file['cram_index_aws']['raw_value']),
                        ("relationType", "cram_index")
                    ])
                ]),
                ("size", int(file['cram_file_size']['raw_value'])),
                # TODO - add unit for bytes, include IRI?
                #                ("unit", util.get_value_annotation("bytes", cache))
                ("conformsTo", [
                    cache.get_obj_or_ref(cram_ds_key,
                                         lambda: make_data_standard("CRAM"))
                ])
            ])

        m = re.match(r'^.*\/([^\/]+)$', cram_file)
        if m is None:
            logging.fatal("unable to parse filename from CRAM file URI " +
                          cram_file)
            sys.exit(1)
        filename = m.group(1)

        # TODO - replace this with DATS-specific MD5 checksum encoding (TBD)
        md5_dimension = DatsObj(
            "Dimension", [("name", util.get_value_annotation("MD5", cache)),
                          ("values", [file['cram_file_md5']['raw_value']])])

        ds = DatsObj("Dataset", [
            ("distributions", [gs_distro, s3_distro]),
            ("dimensions", [md5_dimension]),
            ("title", filename),
            ("types", [ds_types]),
            ("creators", creators),
        ])

        # add firecloud_id for WGS
        if 'firecloud_id' in file:
            f_id = DatsObj("RelatedIdentifier",
                           [("identifier", file['firecloud_id']['raw_value']),
                            ("identifierSource", "FireCloud")])
            ds.set("relatedIdentifiers", [f_id])

        # input RNA/DNA extract that was sequenced
        if sample_id not in dats_samples_d:
            logging.fatal("no sample exists for " + sample_id +
                          " found in file " +
                          file['cram_file_aws']['raw_value'])
            sys.exit(1)

        dats_sample = dats_samples_d[sample_id]
        dats_samp_key = ":".join(["Material", dats_sample.get("name")])
        dats_samp = cache.get_obj_or_ref(dats_samp_key, lambda: dats_sample)

        da = DatsObj(
            "DataAcquisition",
            [("name", filename), ("input", [dats_samp])
             #                ("uses", [])                          # software used
             ])

        if not no_circular_links:
            # circular link back to enclosing Dataset as the output
            da.set("output", [ds.getIdRef()])

        ds.set("producedBy", da)
        file_datasets.append(ds)

    return file_datasets
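
The Dataset title above is the last path component of the CRAM URI, extracted with the regex shown; a standalone illustration with a placeholder URI (not real GTEx data):

import re

# Placeholder URI, for illustration only.
uri = "gs://example-bucket/wgs/SAMPLE-0001.cram"
m = re.match(r'^.*\/([^\/]+)$', uri)
filename = m.group(1)  # -> "SAMPLE-0001.cram"
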
Example No. 4
def get_files_dats_datasets(cache, dats_samples_d, sample_manifest, file_guids,
                            no_circular_links):
    file_datasets_l = []

    wgs_datatype = DatsObj(
        "DataType",
        [("information", util.get_annotation("DNA sequencing", cache)),
         ("method", util.get_annotation("whole genome sequencing assay",
                                        cache)),
         ("platform", util.get_annotation("Illumina", cache))])

    def get_wgs_datatype():
        dkey = ".".join(["DataType", "WGS"])
        return cache.get_obj_or_ref(dkey, lambda: wgs_datatype)

    snp_datatype = DatsObj(
        "DataType", [("information", util.get_annotation("SNP", cache)),
                     ("method", util.get_annotation("SNP analysis", cache))])

    cnv_datatype = DatsObj(
        "DataType", [("information", util.get_annotation("CNV", cache)),
                     ("method", util.get_annotation("CNV analysis", cache))])

    def get_snp_datatype():
        dkey = ".".join(["DataType", "SNP"])
        return cache.get_obj_or_ref(dkey, lambda: snp_datatype)

    def get_cnv_datatype():
        dkey = ".".join(["DataType", "CNV"])
        return cache.get_obj_or_ref(dkey, lambda: cnv_datatype)

    nhlbi_key = ":".join(["Organization", "NHLBI"])
    nhlbi = cache.get_obj_or_ref(nhlbi_key, lambda: NIH_NHLBI)
    creators = [nhlbi]

    def make_data_standard(format):
        return DatsObj("DataStandard",
                       [("name", format),
                        ("type", util.get_value_annotation("format", cache)),
                        ("description", format + " file format")])

    cram_ds_key = ":".join(["DataStandard", "CRAM"])
    cram_dstan = cache.get_obj_or_ref(cram_ds_key,
                                      lambda: make_data_standard("CRAM"))

    vcf_ds_key = ":".join(["DataStandard", "VCF"])
    vcf_dstan = cache.get_obj_or_ref(vcf_ds_key,
                                     lambda: make_data_standard("VCF"))

    n_samples = len(dats_samples_d)
    n_samples_found = 0

    for sample_id in dats_samples_d:
        dats_sample = dats_samples_d[sample_id]

        # look up corresponding file paths in manifest file
        if sample_id not in sample_manifest:
            logging.debug("sample not found in manifest - " + sample_id)
            continue

        n_samples_found += 1
        ms = sample_manifest[sample_id]

        material_type = 'DNA'
        wgs_type = get_wgs_datatype()
        snp_type = get_snp_datatype()
        cnv_type = get_cnv_datatype()

        # ------------------------------------------------
        # WGS sequence - CRAM and CRAI files
        # ------------------------------------------------

        def get_filename(gs_uri):
            m = re.match(r'^.*\/([^\/]+)$', gs_uri)
            if m is None:
                logging.fatal("unable to parse filename from " + gs_uri)
                sys.exit(1)
            filename = m.group(1)
            return filename

        gs_cram = ms['gs_cram']['mapped_value']
        gs_crai = ms['gs_crai']['mapped_value']

        # GUID lookup
        cram_file = get_filename(gs_cram)
        crai_file = get_filename(gs_crai)

        cram_doi = file_guids[cram_file]['Sodium_GUID']['raw_value']
        cram_size = file_guids[cram_file]['File size']['raw_value']
        cram_md5 = file_guids[cram_file]['md5sum']['raw_value']

        crai_doi = file_guids[crai_file]['Sodium_GUID']['raw_value']
        crai_md5 = file_guids[crai_file]['md5sum']['raw_value']

        # handle file size values written in scientific notation (containing "e")
        def filesize_to_int(size):
            if re.match(r'.*e.*', size):
                size = int(float(size))
            else:
                size = int(size)
            return size

        # Google Cloud Platform / Google Storage copy
        gs_cram_access = DatsObj("Access", [("accessURL", gs_cram)])
        gs_cram_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", gs_cram_access),
                ("identifier", DatsObj("Identifier",
                                       [("identifier", cram_doi)])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier",
                            [("identifier", crai_doi),
                             ("relationType", "cram_index")])
                ]),
                ("size", filesize_to_int(cram_size)),
                # TODO - add file size units
                ("conformsTo", [cram_dstan])
            ])

        # AWS / S3 copy
        s3_cram = ms['s3_cram']['mapped_value']
        s3_crai = ms['s3_crai']['mapped_value']
        s3_cram_access = DatsObj("Access", [("accessURL", s3_cram)])
        s3_cram_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", s3_cram_access),
                ("identifier", DatsObj("Identifier",
                                       [("identifier", cram_doi)])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier",
                            [("identifier", crai_doi),
                             ("relationType", "cram_index")])
                ]),
                ("size", filesize_to_int(cram_size)),
                # TODO - add file size units
                ("conformsTo", [cram_dstan])
            ])

        m = re.match(r'^.*\/([^\/]+)$', gs_cram)
        if m is None:
            logging.fatal("unable to parse filename from CRAM file URI " +
                          gs_cram)
            sys.exit(1)
        filename = m.group(1)

        # TODO - replace this with DATS-specific MD5 checksum encoding (TBD)
        md5_dimension = DatsObj(
            "Dimension", [("name", util.get_value_annotation("MD5", cache)),
                          ("values", [cram_md5])])

        cram_dataset = DatsObj("Dataset", [
            ("distributions", [gs_cram_distro, s3_cram_distro]),
            ("dimensions", [md5_dimension]),
            ("title", filename),
            ("types", [wgs_type]),
            ("creators", creators),
        ])

        cram_da = DatsObj(
            "DataAcquisition",
            [("name", filename), ("input", [dats_sample.getIdRef()])
             #            ("uses", [])                          # software used
             ])

        cram_dataset.set("producedBy", cram_da)
        # circular link back to enclosing Dataset as the output
        if not no_circular_links:
            cram_da.set("output", [cram_dataset.getIdRef()])
        file_datasets_l.append(cram_dataset)

        # ------------------------------------------------
        # Variant calls - VCF and CSI files
        # ------------------------------------------------

        gs_vcf = ms['gs_vcf']['mapped_value']
        gs_csi = ms['gs_csi']['mapped_value']

        if gs_vcf is None:
            logging.warning("no VCF file found for " + sample_id)
            continue

        # GUID lookup
        vcf_file = get_filename(gs_vcf)
        csi_file = get_filename(gs_csi)

        vcf_doi = file_guids[vcf_file]['Sodium_GUID']['raw_value']
        vcf_size = file_guids[vcf_file]['File size']['raw_value']
        vcf_md5 = file_guids[vcf_file]['md5sum']['raw_value']

        csi_doi = file_guids[csi_file]['Sodium_GUID']['raw_value']
        csi_md5 = file_guids[csi_file]['md5sum']['raw_value']

        # Google Cloud Platform / Google Storage copy
        gs_vcf_access = DatsObj("Access", [("accessURL", gs_vcf)])
        gs_vcf_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", gs_vcf_access),
                ("identifier", DatsObj("Identifier",
                                       [("identifier", vcf_doi)])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier",
                            [("identifier", csi_doi),
                             ("relationType", "vcf_index")])
                ]),
                ("size", filesize_to_int(vcf_size)),
                # TODO - add file size units
                ("conformsTo", [vcf_dstan])
            ])

        # AWS / S3 copy
        s3_vcf = ms['s3_vcf']['mapped_value']
        s3_csi = ms['s3_csi']['mapped_value']
        s3_vcf_access = DatsObj("Access", [("accessURL", s3_vcf)])
        s3_vcf_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", s3_vcf_access),
                ("identifier", DatsObj("Identifier",
                                       [("identifier", vcf_doi)])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier",
                            [("identifier", csi_doi),
                             ("relationType", "vcf_index")])
                ]),
                ("size", filesize_to_int(vcf_size)),
                # TODO - add file size units
                ("conformsTo", [vcf_dstan])
            ])

        m = re.match(r'^.*\/([^\/]+)$', gs_vcf)
        if m is None:
            logging.fatal("unable to parse filename from VCF file URI " +
                          gs_vcf)
            sys.exit(1)
        filename = m.group(1)

        # TODO - replace this with DATS-specific MD5 checksum encoding (TBD)
        md5_dimension = DatsObj(
            "Dimension", [("name", util.get_value_annotation("MD5", cache)),
                          ("values", [vcf_md5])])

        vcf_dataset = DatsObj("Dataset", [
            ("distributions", [gs_vcf_distro, s3_vcf_distro]),
            ("dimensions", [md5_dimension]),
            ("title", filename),
            ("types", [snp_type, cnv_type]),
            ("creators", creators),
        ])

        vcf_da = DatsObj(
            "DataAcquisition",
            [("name", filename), ("input", [dats_sample.getIdRef()])
             #            ("uses", [])                          # software used
             ])

        vcf_dataset.set("producedBy", vcf_da)
        # circular link back to enclosing Dataset as the output
        if not no_circular_links:
            vcf_da.set("output", [vcf_dataset.getIdRef()])
        file_datasets_l.append(vcf_dataset)

    logging.info("found " + str(n_samples_found) + " / " + str(n_samples) +
                 " sample(s) in TOPMed file manifest")
    return file_datasets_l
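
The filesize_to_int helper above normalizes manifest file sizes that may be written in scientific notation. A standalone restatement of that logic, with illustrative values rather than real manifest entries:

import re

# Standalone restatement of the size-normalization logic, for illustration.
def filesize_to_int(size):
    if re.match(r'.*e.*', size):
        return int(float(size))
    return int(size)

assert filesize_to_int("123456789") == 123456789
assert filesize_to_int("1.5e9") == 1500000000
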
Example No. 5
def get_sample_dats_material(cache, dats_subject, p_sample, gh_sample,
                             var_lookup):
    samp_id = p_sample['SAMPID']['mapped_value']
    subj_id = p_sample['SUBJID']['mapped_value']

    # retrieve id reference for the Identifier of the DATS Dimension for the "all subjects" consent group version of the variable
    def get_var_id(name):
        return var_lookup[name]['dim'].get("identifier").getIdRef()

    # Uberon id (or EFO id, contrary to the documentation)
    anat_id = p_sample['SMUBRID']['mapped_value']
    if anat_id is None:
        print("No Uberon/anatomy ID specified for sample " + samp_id)
        sys.exit(1)

    anatomy_identifier = None
    anatomy_alt_ids = None
    # TODO - query anatomy term from UBERON/EFO instead?
    anatomy_name = p_sample['SMTSD']['mapped_value']

    def make_anat_part(anat_id, anatomy_name):
        # EFO id
        if re.match(r'^EFO_\d+', anat_id):
            anatomy_identifier = OrderedDict([("identifier", anat_id),
                                              ("identifierSource", "EFO")])
            anatomy_alt_ids = [
                OrderedDict([(
                    "identifier",
                    "https://www.ebi.ac.uk/ols/ontologies/efo/terms?short_form="
                    + str(anat_id)), ("identifierSource", "EFO")])
            ]
        # Uberon id
        else:
            anatomy_identifier = OrderedDict([("identifier",
                                               "UBERON:" + str(anat_id)),
                                              ("identifierSource", "UBERON")])
            anatomy_alt_ids = [
                OrderedDict([
                    ("identifier",
                     "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)),
                    ("identifierSource", "UBERON")
                ])
            ]

        # anatomical part
        anatomical_part = DatsObj("AnatomicalPart",
                                  [("name", anatomy_name),
                                   ("identifier", anatomy_identifier),
                                   ("alternateIdentifiers", anatomy_alt_ids)])

        return anatomical_part

    # use cached value for AnatomicalPart if possible
    anat_part_key = ":".join(["AnatomicalPart", anatomy_name])
    anatomical_part = cache.get_obj_or_ref(
        anat_part_key, lambda: make_anat_part(anat_id, anatomy_name))

    # use URI from GitHub GTEx id dump if available
    identifier = samp_id
    if gh_sample is not None:
        identifier = gh_sample['Destination URL']['raw_value']

    subj_key = ":".join(["Material", dats_subject.get("name")])
    dats_subj = cache.get_obj_or_ref(subj_key, lambda: dats_subject)

    # add sample characteristics from p_sample metadata
    sample_chars = []
    for key in p_sample:
        if re.match(r'^(subject|id)$', key):
            continue
        # TODO - currently including only a small subset of the available values for demonstration purposes
        if not re.match(r'^(SMATSSCR|SMRIN|SMMAPRT|SMGNSDTC)$', key):
            continue
        var = p_sample[key]
        #        print("got key=" + key + " var=" + str(var))
        mapped_val = var['mapped_value']
        char = DatsObj("Dimension",
                       [("name", util.get_value_annotation(key, cache)),
                        ("identifier", get_var_id(key)),
                        ("values", [mapped_val])])
        sample_chars.append(char)

    # biological/tissue sample
    biological_sample_material = DatsObj(
        "Material",
        [("name", samp_id), ("identifier", {
            "identifier": identifier
        }),
         ("description",
          anatomy_name + " specimen collected from subject " + subj_id),
         ("characteristics", sample_chars),
         ("taxonomy", [util.get_taxon_human(cache)]),
         ("roles", [util.get_annotation("specimen", cache)]),
         ("derivesFrom", [dats_subj, anatomical_part])])

    # analysis freeze classification
    smafrze = p_sample['SMAFRZE']['mapped_value']
    # expected sequence type depending on data freeze classification
    expected_stype = None

    stype = None
    if smafrze == "RNASEQ":
        expected_stype = "RNA"
    elif smafrze == "WGS":
        expected_stype = "DNA"
    elif smafrze == "WES":
        expected_stype = "DNA"
    # Illumina OMNI SNP Array
    elif smafrze == "OMNI":
        expected_stype = "DNA"
    elif smafrze == "EXCLUDE":
        pass
    else:
        logging.fatal("unknown SMAFRZE " + smafrze)
        sys.exit(1)

    # sample type - DNA or RNA
    stype = None
    smnabtcht = p_sample['SMNABTCHT']['mapped_value']
    if re.match(r'^DNA ([iI]solation|[eE]xtraction).*', smnabtcht):
        stype = 'DNA'
    elif re.match(r'^RNA ([iI]solation|[eE]xtraction).*', smnabtcht):
        stype = 'RNA'
    elif re.match(
            r'DNA or RNA Extraction from Paxgene-derived Lysate Plate Based',
            smnabtcht):
        stype = 'RNA'
    elif re.match(r'Transfer To Matrix \(Manual\)', smnabtcht):
        stype = 'DNA'

    if stype is None:
        if expected_stype is not None:
            stype = expected_stype
        else:
            print("couldn't determine sequence type for smafrze=" + smafrze +
                  " smnabtcht=" + smnabtcht)
            return None
    else:
        if (expected_stype is not None) and (stype != expected_stype):
            logging.fatal("seq type " + stype +
                          " doesn't match expected stype " + expected_stype)
            sys.exit(1)

    # DNA or RNA extract
    dna_or_rna_material = DatsObj(
        "Material",
        [("name", stype + " from " + samp_id),
         ("description", "total " + stype + " extracted from " + anatomy_name +
          " specimen collected from subject " + subj_id),
         ("taxonomy", [util.get_taxon_human(cache)]),
         ("roles", [util.get_annotation(stype + " extract", cache)]),
         ("derivesFrom", [biological_sample_material])])

    return dna_or_rna_material
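
The make_anat_part helper above branches on whether the SMUBRID value looks like an EFO term or a bare UBERON number. A minimal sketch of that branch, using placeholder IDs rather than real GTEx metadata:

import re

# Placeholder anatomy IDs, for illustration only.
for anat_id in ("EFO_0000001", "0002107"):
    if re.match(r'^EFO_\d+', anat_id):
        print(anat_id + " -> EFO identifier")
    else:
        print(anat_id + " -> UBERON:" + anat_id)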