Example #1
from collections import OrderedDict
# DatsObj is assumed to come from the project's DATS object module (not shown here)


def make_anat_part(anat_id, anatomy_name):
    # build a DATS AnatomicalPart object for the given anatomy term
    anatomical_part = DatsObj("AnatomicalPart", [("name", anatomy_name)])

    # if a UBERON id is available, attach it as the primary identifier and
    # add the corresponding OBO PURL as an alternate identifier
    if anat_id is not None:
        anatomy_identifier = OrderedDict([("identifier",
                                           "UBERON:" + str(anat_id)),
                                          ("identifierSource", "UBERON")])
        anatomy_alt_ids = [
            OrderedDict([
                ("identifier",
                 "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)),
                ("identifierSource", "UBERON")
            ])
        ]

        anatomical_part.set("identifier", anatomy_identifier)
        anatomical_part.set("alternateIdentifiers", anatomy_alt_ids)

    return anatomical_part
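
A minimal usage sketch (hypothetical calls; the anatomy id/name pairs are illustrative, and DatsObj is assumed to come from the project's DATS object module):

# Hypothetical usage of make_anat_part(); values are illustrative.
heart_part = make_anat_part("0000948", "heart")        # UBERON id + OBO PURL alternate id
unnamed_part = make_anat_part(None, "adipose tissue")  # name only, no identifiers attached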
Example #2
def get_files_dats_datasets(cache, dats_samples_d, p_samples, gh_samples,
                            protected_cram_files, no_circular_links):
    file_datasets = []

    rnaseq_datatype = DatsObj("DataType", [
        ("information", util.get_annotation("transcription profiling", cache)),
        ("method", util.get_annotation("RNA-seq assay", cache)),
        ("platform", util.get_annotation("Illumina", cache))
    ])

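    # get_obj_or_ref() presumably memoizes the object under dkey the first time
    # it is requested and hands back a lightweight id reference on later calls,
    # so the file Datasets built below share one DataType/DataStandard/Organization
    # instance instead of embedding duplicate copies. (Inferred from usage, not verified.)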
    def get_rnaseq_datatype():
        dkey = ".".join(["DataType", "RNA-seq"])
        return cache.get_obj_or_ref(dkey, lambda: rnaseq_datatype)

    wgs_datatype = DatsObj(
        "DataType",
        [("information", util.get_annotation("DNA sequencing", cache)),
         ("method", util.get_annotation("whole genome sequencing assay",
                                        cache)),
         ("platform", util.get_annotation("Illumina", cache))])

    def get_wgs_datatype():
        dkey = ".".join(["DataType", "WGS"])
        return cache.get_obj_or_ref(dkey, lambda: wgs_datatype)

    broad_key = ":".join(["Organization", "Broad Institute"])
    broad = cache.get_obj_or_ref(
        broad_key,
        lambda: DatsObj("Organization", [("name", "Broad Institute")]))
    creators = [broad]

    def make_data_standard(format):
        return DatsObj("DataStandard",
                       [("name", format),
                        ("type", util.get_value_annotation("format", cache)),
                        ("description", format + " file format")])

    cram_ds_key = ":".join(["DataStandard", "CRAM"])
    cram_ds = cache.get_obj_or_ref(cram_ds_key,
                                   lambda: make_data_standard("CRAM"))

    crai_ds_key = ":".join(["DataStandard", "CRAI"])
    crai_ds = cache.get_obj_or_ref(crai_ds_key,
                                   lambda: make_data_standard("CRAI"))

    for sample_id in protected_cram_files:
        file = protected_cram_files[sample_id]
        material_type = None
        ds_types = None

        # determine file type
        if re.search(r'wgs\/', file['cram_file_aws']['raw_value']):
            material_type = 'DNA'
            ds_types = get_wgs_datatype()
            gcp_suffix = '_gcp'
        elif re.search(r'rnaseq\/', file['cram_file_aws']['raw_value']):
            material_type = 'RNA'
            ds_types = get_rnaseq_datatype()
            gcp_suffix = ''
        else:
            logging.fatal(
                "unable to determine material/sequence type from cram_file_aws="
                + file['cram_file_aws']['raw_value'])
            sys.exit(1)

        # RNA-Seq keys: sample_id, cram_file, cram_file_md5, cram_file_size,
        #   cram_index, cram_file_aws, cram_index_aws
        # WGS keys: same as above + firecloud_id
        cram_file = file['cram_file' + gcp_suffix]['raw_value']
        cram_file_md5 = file['cram_file_md5']['raw_value']

        # TODO - review the following encoding decisions:
        #  - storing .crai URI as relatedIdentifier of the DatasetDistribution for the .cram file
        #  - storing MD5 checksum of the .cram file as an extraProperty of the DatasetDistribution
        #  - storing firecloud_id as a relatedIdentifier of the Dataset (not the DatasetDistribution)

        # Google Cloud Platform / Google Storage copy
        gs_access = DatsObj(
            "Access",
            [("landingPage", file['cram_file' + gcp_suffix]['raw_value'])])
        gs_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", gs_access),
                ("identifier",
                 DatsObj("Identifier",
                         [("identifier",
                           file['cram_file' + gcp_suffix]['raw_value'])])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier",
                            [("identifier",
                              file['cram_index' + gcp_suffix]['raw_value']),
                             ("relationType", "cram_index")])
                ]),
                ("size", int(file['cram_file_size']['raw_value'])),
                # TODO - add unit for bytes, include IRI?
                #                ("unit", util.get_value_annotation("bytes", cache))
                ("conformsTo", [
                    cache.get_obj_or_ref(cram_ds_key,
                                         lambda: make_data_standard("CRAM"))
                ])
            ])

        # AWS / S3 copy
        s3_access = DatsObj(
            "Access", [("landingPage", file['cram_file_aws']['raw_value'])])
        s3_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", s3_access),
                ("identifier",
                 DatsObj("Identifier", [
                     ("identifier", file['cram_file_aws']['raw_value'])
                 ])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier", [
                        ("identifier", file['cram_index_aws']['raw_value']),
                        ("relationType", "cram_index")
                    ])
                ]),
                ("size", int(file['cram_file_size']['raw_value'])),
                # TODO - add unit for bytes, include IRI?
                #                ("unit", util.get_value_annotation("bytes", cache))
                ("conformsTo", [
                    cache.get_obj_or_ref(cram_ds_key,
                                         lambda: make_data_standard("CRAM"))
                ])
            ])

        m = re.match(r'^.*\/([^\/]+)$', cram_file)
        if m is None:
            logging.fatal("unable to parse filename from CRAM file URI " +
                          cram_file)
            sys.exit(1)
        filename = m.group(1)

        # TODO - replace this with DATS-specific MD5 checksum encoding (TBD)
        md5_dimension = DatsObj(
            "Dimension", [("name", util.get_value_annotation("MD5", cache)),
                          ("values", [file['cram_file_md5']['raw_value']])])

        ds = DatsObj("Dataset", [
            ("distributions", [gs_distro, s3_distro]),
            ("dimensions", [md5_dimension]),
            ("title", filename),
            ("types", [ds_types]),
            ("creators", creators),
        ])

        # add firecloud_id for WGS
        if 'firecloud_id' in file:
            f_id = DatsObj("RelatedIdentifier",
                           [("identifier", file['firecloud_id']['raw_value']),
                            ("identifierSource", "FireCloud")])
            ds.set("relatedIdentifiers", [f_id])

        # input RNA/DNA extract that was sequenced
        if sample_id not in dats_samples_d:
            logging.fatal("no sample exists for " + sample_id +
                          " found in file " +
                          file['cram_file_aws']['raw_value'])
            sys.exit(1)

        dats_sample = dats_samples_d[sample_id]
        dats_samp_key = ":".join(["Material", dats_sample.get("name")])
        dats_samp = cache.get_obj_or_ref(dats_samp_key, lambda: dats_sample)

        da = DatsObj(
            "DataAcquisition",
            [("name", filename), ("input", [dats_samp])
             #                ("uses", [])                          # software used
             ])

        if not no_circular_links:
            # circular link back to enclosing Dataset as the output
            da.set("output", [ds.getIdRef()])

        ds.set("producedBy", da)
        file_datasets.append(ds)

    return file_datasets
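
For reference, a hedged sketch of the per-sample entry structure that get_files_dats_datasets() expects in protected_cram_files, inferred from the key lookups above; every value below is a placeholder:

# Hypothetical RNA-seq entry (gcp_suffix == ''); a WGS entry would additionally
# carry 'cram_file_gcp', 'cram_index_gcp' and 'firecloud_id'. Values are placeholders.
protected_cram_files = {
    "SAMPLE-1": {
        "cram_file":      {"raw_value": "gs://example-bucket/rnaseq/SAMPLE-1.cram"},
        "cram_index":     {"raw_value": "gs://example-bucket/rnaseq/SAMPLE-1.cram.crai"},
        "cram_file_md5":  {"raw_value": "d41d8cd98f00b204e9800998ecf8427e"},
        "cram_file_size": {"raw_value": "123456789"},
        "cram_file_aws":  {"raw_value": "s3://example-bucket/rnaseq/SAMPLE-1.cram"},
        "cram_index_aws": {"raw_value": "s3://example-bucket/rnaseq/SAMPLE-1.cram.crai"},
    }
}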
Example #3
def get_files_dats_datasets(cache, dats_samples_d, sample_manifest, file_guids,
                            no_circular_links):
    file_datasets_l = []

    wgs_datatype = DatsObj(
        "DataType",
        [("information", util.get_annotation("DNA sequencing", cache)),
         ("method", util.get_annotation("whole genome sequencing assay",
                                        cache)),
         ("platform", util.get_annotation("Illumina", cache))])

    def get_wgs_datatype():
        dkey = ".".join(["DataType", "WGS"])
        return cache.get_obj_or_ref(dkey, lambda: wgs_datatype)

    snp_datatype = DatsObj(
        "DataType", [("information", util.get_annotation("SNP", cache)),
                     ("method", util.get_annotation("SNP analysis", cache))])

    cnv_datatype = DatsObj(
        "DataType", [("information", util.get_annotation("CNV", cache)),
                     ("method", util.get_annotation("CNV analysis", cache))])

    def get_snp_datatype():
        dkey = ".".join(["DataType", "SNP"])
        return cache.get_obj_or_ref(dkey, lambda: snp_datatype)

    def get_cnv_datatype():
        dkey = ".".join(["DataType", "CNV"])
        return cache.get_obj_or_ref(dkey, lambda: cnv_datatype)

    nhlbi_key = ":".join(["Organization", "NHLBI"])
    nhlbi = cache.get_obj_or_ref(nhlbi_key, lambda: NIH_NHLBI)
    creators = [nhlbi]

    def make_data_standard(format):
        return DatsObj("DataStandard",
                       [("name", format),
                        ("type", util.get_value_annotation("format", cache)),
                        ("description", format + " file format")])

    cram_ds_key = ":".join(["DataStandard", "CRAM"])
    cram_dstan = cache.get_obj_or_ref(cram_ds_key,
                                      lambda: make_data_standard("CRAM"))

    vcf_ds_key = ":".join(["DataStandard", "VCF"])
    vcf_dstan = cache.get_obj_or_ref(vcf_ds_key,
                                     lambda: make_data_standard("VCF"))

    n_samples = len(dats_samples_d)
    n_samples_found = 0

    for sample_id in dats_samples_d:
        dats_sample = dats_samples_d[sample_id]

        # look up corresponding file paths in manifest file
        if sample_id not in sample_manifest:
            logging.debug("sample not found in manifest - " + sample_id)
            continue

        n_samples_found += 1
        ms = sample_manifest[sample_id]

        material_type = 'DNA'
        wgs_type = get_wgs_datatype()
        snp_type = get_snp_datatype()
        cnv_type = get_cnv_datatype()

        # ------------------------------------------------
        # WGS sequence - CRAM and CRAI files
        # ------------------------------------------------

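        # get_filename() below strips everything up to the last '/', e.g.
        # "gs://example-bucket/SAMPLE-1.cram" -> "SAMPLE-1.cram" (illustrative URI)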
        def get_filename(gs_uri):
            m = re.match(r'^.*\/([^\/]+)$', gs_uri)
            if m is None:
                logging.fatal("unable to parse filename from " + gs_uri)
                sys.exit(1)
            filename = m.group(1)
            return filename

        gs_cram = ms['gs_cram']['mapped_value']
        gs_crai = ms['gs_crai']['mapped_value']

        # GUID lookup
        cram_file = get_filename(gs_cram)
        crai_file = get_filename(gs_crai)

        cram_doi = file_guids[cram_file]['Sodium_GUID']['raw_value']
        cram_size = file_guids[cram_file]['File size']['raw_value']
        cram_md5 = file_guids[cram_file]['md5sum']['raw_value']

        crai_doi = file_guids[crai_file]['Sodium_GUID']['raw_value']
        crai_md5 = file_guids[crai_file]['md5sum']['raw_value']

        # handle file size values given in scientific notation (contain an "e")
        def filesize_to_int(size):
            if re.match(r'.*e.*', size):
                # e.g. "3.2e+09" -> 3200000000
                size = int(float(size))
            else:
                size = int(size)
            return size

        # Google Cloud Platform / Google Storage copy
        gs_cram_access = DatsObj("Access", [("accessURL", gs_cram)])
        gs_cram_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", gs_cram_access),
                ("identifier", DatsObj("Identifier",
                                       [("identifier", cram_doi)])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier",
                            [("identifier", crai_doi),
                             ("relationType", "cram_index")])
                ]),
                ("size", filesize_to_int(cram_size)),
                # TODO - add file size units
                ("conformsTo", [cram_dstan])
            ])

        # AWS / S3 copy
        s3_cram = ms['s3_cram']['mapped_value']
        s3_crai = ms['s3_crai']['mapped_value']
        s3_cram_access = DatsObj("Access", [("accessURL", s3_cram)])
        s3_cram_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", s3_cram_access),
                ("identifier", DatsObj("Identifier",
                                       [("identifier", cram_doi)])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier",
                            [("identifier", crai_doi),
                             ("relationType", "cram_index")])
                ]),
                ("size", filesize_to_int(cram_size)),
                # TODO - add file size units
                ("conformsTo", [cram_dstan])
            ])

        m = re.match(r'^.*\/([^\/]+)$', gs_cram)
        if m is None:
            logging.fatal("unable to parse filename from CRAM file URI " +
                          gs_cram)
            sys.exit(1)
        filename = m.group(1)

        # TODO - replace this with DATS-specific MD5 checksum encoding (TBD)
        md5_dimension = DatsObj(
            "Dimension", [("name", util.get_value_annotation("MD5", cache)),
                          ("values", [cram_md5])])

        cram_dataset = DatsObj("Dataset", [
            ("distributions", [gs_cram_distro, s3_cram_distro]),
            ("dimensions", [md5_dimension]),
            ("title", filename),
            ("types", [wgs_type]),
            ("creators", creators),
        ])

        cram_da = DatsObj(
            "DataAcquisition",
            [("name", filename), ("input", [dats_sample.getIdRef()])
             #            ("uses", [])                          # software used
             ])

        cram_dataset.set("producedBy", cram_da)
        # circular link back to enclosing Dataset as the output
        if not no_circular_links:
            cram_da.set("output", [cram_dataset.getIdRef()])
        file_datasets_l.append(cram_dataset)

        # ------------------------------------------------
        # Variant calls - VCF and CSI files
        # ------------------------------------------------

        gs_vcf = ms['gs_vcf']['mapped_value']
        gs_csi = ms['gs_csi']['mapped_value']

        if gs_vcf is None:
            logging.warning("no VCF file found for " + sample_id)
            continue

        # GUID lookup
        vcf_file = get_filename(gs_vcf)
        csi_file = get_filename(gs_csi)

        vcf_doi = file_guids[vcf_file]['Sodium_GUID']['raw_value']
        vcf_size = file_guids[vcf_file]['File size']['raw_value']
        vcf_md5 = file_guids[vcf_file]['md5sum']['raw_value']

        csi_doi = file_guids[csi_file]['Sodium_GUID']['raw_value']
        csi_md5 = file_guids[csi_file]['md5sum']['raw_value']

        # Google Cloud Platform / Google Storage copy
        gs_vcf_access = DatsObj("Access", [("accessURL", gs_vcf)])
        gs_vcf_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", gs_vcf_access),
                ("identifier", DatsObj("Identifier",
                                       [("identifier", vcf_doi)])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier",
                            [("identifier", csi_doi),
                             ("relationType", "vcf_index")])
                ]),
                ("size", filesize_to_int(vcf_size)),
                # TODO - add file size units
                ("conformsTo", [vcf_dstan])
            ])

        # AWS / S3 copy
        s3_vcf = ms['s3_vcf']['mapped_value']
        s3_csi = ms['s3_csi']['mapped_value']
        s3_vcf_access = DatsObj("Access", [("accessURL", s3_vcf)])
        s3_vcf_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", s3_vcf_access),
                ("identifier", DatsObj("Identifier",
                                       [("identifier", vcf_doi)])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier",
                            [("identifier", csi_doi),
                             ("relationType", "vcf_index")])
                ]),
                ("size", filesize_to_int(vcf_size)),
                # TODO - add file size units
                ("conformsTo", [vcf_dstan])
            ])

        m = re.match(r'^.*\/([^\/]+)$', gs_vcf)
        if m is None:
            logging.fatal("unable to parse filename from VCF file URI " +
                          gs_vcf)
            sys.exit(1)
        filename = m.group(1)

        # TODO - replace this with DATS-specific MD5 checksum encoding (TBD)
        md5_dimension = DatsObj(
            "Dimension", [("name", util.get_value_annotation("MD5", cache)),
                          ("values", [vcf_md5])])

        vcf_dataset = DatsObj("Dataset", [
            ("distributions", [gs_vcf_distro, s3_vcf_distro]),
            ("dimensions", [md5_dimension]),
            ("title", filename),
            ("types", [snp_type, cnv_type]),
            ("creators", creators),
        ])

        vcf_da = DatsObj(
            "DataAcquisition",
            [("name", filename), ("input", [dats_sample.getIdRef()])
             #            ("uses", [])                          # software used
             ])

        vcf_dataset.set("producedBy", vcf_da)
        # circular link back to enclosing Dataset as the output
        if not no_circular_links:
            vcf_da.set("output", [vcf_dataset.getIdRef()])
        file_datasets_l.append(vcf_dataset)

    logging.info("found " + str(n_samples_found) + " / " + str(n_samples) +
                 " sample(s) in TOPMed file manifest")
    return file_datasets_l
Example #4
            imi_project = DatsObj("Dataset", [
                ("identifier",
                 DatsObj("Identifier", [("identifier", "IMI-Cat#" + str(i))])),
                ("title", df["Project Acronym"][i]),
                ("description",
                 df["ShortDescription"][i] + ". SUMMARY: " + df["Summary"][i]),
                ("distributions", []),
                ("creators", [d_orgs, d_person]),
                ("keywords", d_kwds),
                ("dates", [start_date, end_date]),
                ("types", []),
                ("producedBy", []),
                ("storedIn", ""),
                ("isAbout", [d_kwds]),
                ("version", ""),
                ("extraProperties", [dataset_extra_props])
            ])

            imi_projects.append(imi_project)

        imi_project_catalogue.set("hasPart", imi_projects)

        DATSEncoder().encode(imi_project_catalogue)
        # imi_data_jstr = json.dumps(cls=DATSEncoder)

        # print(imi_project_catalogue.toJSON())
        # imi_data_jstr = json.dumps(imi_project_catalogue.__dict__)

        # print("FULL CATALOGUE:", imi_data_jstr)

        script_dir = os.path.dirname(__file__)
        print("DIR", script_dir)
        filename = 'IMI_datacatalogue_as_DATS.json'
        with open(join(output_dir, filename), 'w', encoding='utf-8') as f:
            json.dump(imi_project_catalogue.toJSON(), f)