def make_anat_part(anat_id, anatomy_name):
    # anatomical part
    anatomical_part = DatsObj("AnatomicalPart", [("name", anatomy_name)])

    if anat_id is not None:
        anatomy_identifier = OrderedDict([
            ("identifier", "UBERON:" + str(anat_id)),
            ("identifierSource", "UBERON")])
        anatomy_alt_ids = [OrderedDict([
            ("identifier", "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)),
            ("identifierSource", "UBERON")])]

        anatomical_part.set("identifier", anatomy_identifier)
        anatomical_part.set("alternateIdentifiers", anatomy_alt_ids)

    return anatomical_part
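# Usage sketch (illustrative, not part of the original module). Assuming the
# DatsObj and OrderedDict imports used above, passing a UBERON id attaches both
# the CURIE and the OBO PURL as identifiers; "0002107" is the UBERON id for liver:
#
#   liver = make_anat_part("0002107", "liver")   # identifier + alternateIdentifiers set
#   heart = make_anat_part(None, "heart")        # name only, no identifiers attached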
def get_files_dats_datasets(cache, dats_samples_d, p_samples, gh_samples, protected_cram_files, no_circular_links):
    file_datasets = []

    rnaseq_datatype = DatsObj("DataType", [
        ("information", util.get_annotation("transcription profiling", cache)),
        ("method", util.get_annotation("RNA-seq assay", cache)),
        ("platform", util.get_annotation("Illumina", cache))
    ])

    def get_rnaseq_datatype():
        dkey = ".".join(["DataType", "RNA-seq"])
        return cache.get_obj_or_ref(dkey, lambda: rnaseq_datatype)

    wgs_datatype = DatsObj("DataType", [
        ("information", util.get_annotation("DNA sequencing", cache)),
        ("method", util.get_annotation("whole genome sequencing assay", cache)),
        ("platform", util.get_annotation("Illumina", cache))
    ])

    def get_wgs_datatype():
        dkey = ".".join(["DataType", "WGS"])
        return cache.get_obj_or_ref(dkey, lambda: wgs_datatype)

    broad_key = ":".join(["Organization", "Broad Institute"])
    broad = cache.get_obj_or_ref(
        broad_key, lambda: DatsObj("Organization", [("name", "Broad Institute")]))
    creators = [broad]

    def make_data_standard(format):
        return DatsObj("DataStandard", [
            ("name", format),
            ("type", util.get_value_annotation("format", cache)),
            ("description", format + " file format")
        ])

    cram_ds_key = ":".join(["DataStandard", "CRAM"])
    cram_ds = cache.get_obj_or_ref(cram_ds_key, lambda: make_data_standard("CRAM"))

    crai_ds_key = ":".join(["DataStandard", "CRAI"])
    crai_ds = cache.get_obj_or_ref(crai_ds_key, lambda: make_data_standard("CRAI"))

    for sample_id in protected_cram_files:
        file = protected_cram_files[sample_id]
        material_type = None
        ds_types = None

        # determine file type from the AWS URI
        if re.search(r'wgs\/', file['cram_file_aws']['raw_value']):
            material_type = 'DNA'
            ds_types = get_wgs_datatype()
            gcp_suffix = '_gcp'
        elif re.search(r'rnaseq\/', file['cram_file_aws']['raw_value']):
            material_type = 'RNA'
            ds_types = get_rnaseq_datatype()
            gcp_suffix = ''
        else:
            logging.fatal("unable to determine material/sequence type from cram_file_aws=" +
                          file['cram_file_aws']['raw_value'])
            sys.exit(1)

        # RNA-Seq keys: sample_id, cram_file, cram_file_md5, cram_file_size, cram_index,
        #               cram_file_aws, cram_index_aws
        # WGS keys: same as above, plus firecloud_id
        cram_file = file['cram_file' + gcp_suffix]['raw_value']
        cram_file_md5 = file['cram_file_md5']['raw_value']

        # TODO - review the following encoding decisions:
        #  - storing .crai URI as relatedIdentifier of the DatasetDistribution for the .cram file
        #  - storing MD5 checksum of the .cram file as an extraProperty of the DatasetDistribution
        #  - storing firecloud_id as a relatedIdentifier of the Dataset (not the DatasetDistribution)

        # Google Cloud Platform / Google Storage copy
        gs_access = DatsObj("Access", [
            ("landingPage", file['cram_file' + gcp_suffix]['raw_value'])
        ])

        gs_distro = DatsObj("DatasetDistribution", [
            ("access", gs_access),
            ("identifier", DatsObj("Identifier", [
                ("identifier", file['cram_file' + gcp_suffix]['raw_value'])
            ])),
            ("relatedIdentifiers", [
                DatsObj("RelatedIdentifier", [
                    ("identifier", file['cram_index' + gcp_suffix]['raw_value']),
                    ("relationType", "cram_index")
                ])
            ]),
            ("size", int(file['cram_file_size']['raw_value'])),
            # TODO - add unit for bytes, include IRI?
            # ("unit", util.get_value_annotation("bytes", cache))
            ("conformsTo", [
                cache.get_obj_or_ref(cram_ds_key, lambda: make_data_standard("CRAM"))
            ])
        ])

        # AWS / S3 copy
        s3_access = DatsObj("Access", [
            ("landingPage", file['cram_file_aws']['raw_value'])
        ])

        s3_distro = DatsObj("DatasetDistribution", [
            ("access", s3_access),
            ("identifier", DatsObj("Identifier", [
                ("identifier", file['cram_file_aws']['raw_value'])
            ])),
            ("relatedIdentifiers", [
                DatsObj("RelatedIdentifier", [
                    ("identifier", file['cram_index_aws']['raw_value']),
                    ("relationType", "cram_index")
                ])
            ]),
            ("size", int(file['cram_file_size']['raw_value'])),
            # TODO - add unit for bytes, include IRI?
            # ("unit", util.get_value_annotation("bytes", cache))
            ("conformsTo", [
                cache.get_obj_or_ref(cram_ds_key, lambda: make_data_standard("CRAM"))
            ])
        ])

        m = re.match(r'^.*\/([^\/]+)$', cram_file)
        if m is None:
            logging.fatal("unable to parse filename from CRAM file URI " + cram_file)
            sys.exit(1)
        filename = m.group(1)

        # TODO - replace this with DATS-specific MD5 checksum encoding (TBD)
        md5_dimension = DatsObj("Dimension", [
            ("name", util.get_value_annotation("MD5", cache)),
            ("values", [file['cram_file_md5']['raw_value']])
        ])

        ds = DatsObj("Dataset", [
            ("distributions", [gs_distro, s3_distro]),
            ("dimensions", [md5_dimension]),
            ("title", filename),
            ("types", [ds_types]),
            ("creators", creators),
        ])

        # add firecloud_id for WGS
        if 'firecloud_id' in file:
            f_id = DatsObj("RelatedIdentifier", [
                ("identifier", file['firecloud_id']['raw_value']),
                ("identifierSource", "FireCloud")
            ])
            ds.set("relatedIdentifiers", [f_id])

        # input RNA/DNA extract that was sequenced
        if sample_id not in dats_samples_d:
            logging.fatal("no sample exists for " + sample_id + " referenced in file " +
                          file['cram_file_aws']['raw_value'])
            sys.exit(1)

        dats_sample = dats_samples_d[sample_id]
        dats_samp_key = ":".join(["Material", dats_sample.get("name")])
        dats_samp = cache.get_obj_or_ref(dats_samp_key, lambda: dats_sample)

        da = DatsObj("DataAcquisition", [
            ("name", filename),
            ("input", [dats_samp])
            # ("uses", [])  # software used
        ])

        if not no_circular_links:
            # circular link back to the enclosing Dataset as the output
            da.set("output", [ds.getIdRef()])

        ds.set("producedBy", da)
        file_datasets.append(ds)

    return file_datasets
def get_files_dats_datasets(cache, dats_samples_d, sample_manifest, file_guids, no_circular_links):
    file_datasets_l = []

    wgs_datatype = DatsObj("DataType", [
        ("information", util.get_annotation("DNA sequencing", cache)),
        ("method", util.get_annotation("whole genome sequencing assay", cache)),
        ("platform", util.get_annotation("Illumina", cache))
    ])

    def get_wgs_datatype():
        dkey = ".".join(["DataType", "WGS"])
        return cache.get_obj_or_ref(dkey, lambda: wgs_datatype)

    snp_datatype = DatsObj("DataType", [
        ("information", util.get_annotation("SNP", cache)),
        ("method", util.get_annotation("SNP analysis", cache))
    ])

    cnv_datatype = DatsObj("DataType", [
        ("information", util.get_annotation("CNV", cache)),
        ("method", util.get_annotation("CNV analysis", cache))
    ])

    def get_snp_datatype():
        dkey = ".".join(["DataType", "SNP"])
        return cache.get_obj_or_ref(dkey, lambda: snp_datatype)

    def get_cnv_datatype():
        dkey = ".".join(["DataType", "CNV"])
        return cache.get_obj_or_ref(dkey, lambda: cnv_datatype)

    nhlbi_key = ":".join(["Organization", "NHLBI"])
    nhlbi = cache.get_obj_or_ref(nhlbi_key, lambda: NIH_NHLBI)
    creators = [nhlbi]

    def make_data_standard(format):
        return DatsObj("DataStandard", [
            ("name", format),
            ("type", util.get_value_annotation("format", cache)),
            ("description", format + " file format")
        ])

    cram_ds_key = ":".join(["DataStandard", "CRAM"])
    cram_dstan = cache.get_obj_or_ref(cram_ds_key, lambda: make_data_standard("CRAM"))

    vcf_ds_key = ":".join(["DataStandard", "VCF"])
    vcf_dstan = cache.get_obj_or_ref(vcf_ds_key, lambda: make_data_standard("VCF"))

    n_samples = len(dats_samples_d)
    n_samples_found = 0

    for sample_id in dats_samples_d:
        dats_sample = dats_samples_d[sample_id]

        # look up corresponding file paths in manifest file
        if sample_id not in sample_manifest:
            logging.debug("sample not found in manifest - " + sample_id)
            continue

        n_samples_found += 1
        ms = sample_manifest[sample_id]

        material_type = 'DNA'
        wgs_type = get_wgs_datatype()
        snp_type = get_snp_datatype()
        cnv_type = get_cnv_datatype()

        # ------------------------------------------------
        # WGS sequence - CRAM and CRAI files
        # ------------------------------------------------

        def get_filename(gs_uri):
            m = re.match(r'^.*\/([^\/]+)$', gs_uri)
            if m is None:
                logging.fatal("unable to parse filename from " + gs_uri)
                sys.exit(1)
            filename = m.group(1)
            return filename

        gs_cram = ms['gs_cram']['mapped_value']
        gs_crai = ms['gs_crai']['mapped_value']

        # GUID lookup
        cram_file = get_filename(gs_cram)
        crai_file = get_filename(gs_crai)

        cram_doi = file_guids[cram_file]['Sodium_GUID']['raw_value']
        cram_size = file_guids[cram_file]['File size']['raw_value']
        cram_md5 = file_guids[cram_file]['md5sum']['raw_value']

        crai_doi = file_guids[crai_file]['Sodium_GUID']['raw_value']
        crai_md5 = file_guids[crai_file]['md5sum']['raw_value']

        # handle file size values with "e" (scientific notation) in them
        def filesize_to_int(size):
            if re.match(r'.*e.*', size):
                return int(float(size))
            return int(size)

        # Google Cloud Platform / Google Storage copy
        gs_cram_access = DatsObj("Access", [("accessURL", gs_cram)])

        gs_cram_distro = DatsObj("DatasetDistribution", [
            ("access", gs_cram_access),
            ("identifier", DatsObj("Identifier", [("identifier", cram_doi)])),
            ("relatedIdentifiers", [
                DatsObj("RelatedIdentifier", [
                    ("identifier", crai_doi),
                    ("relationType", "cram_index")
                ])
            ]),
            ("size", filesize_to_int(cram_size)),
            # TODO - add file size units
            ("conformsTo", [cram_dstan])
        ])

        # AWS / S3 copy
        s3_cram = ms['s3_cram']['mapped_value']
        s3_crai = ms['s3_crai']['mapped_value']

        s3_cram_access = DatsObj("Access", [("accessURL", s3_cram)])

        s3_cram_distro = DatsObj("DatasetDistribution", [
            ("access", s3_cram_access),
            ("identifier", DatsObj("Identifier", [("identifier", cram_doi)])),
            ("relatedIdentifiers", [
                DatsObj("RelatedIdentifier", [
                    ("identifier", crai_doi),
                    ("relationType", "cram_index")
                ])
            ]),
            ("size", filesize_to_int(cram_size)),
            # TODO - add file size units
            ("conformsTo", [cram_dstan])
        ])

        m = re.match(r'^.*\/([^\/]+)$', gs_cram)
        if m is None:
            logging.fatal("unable to parse filename from CRAM file URI " + gs_cram)
            sys.exit(1)
        filename = m.group(1)

        # TODO - replace this with DATS-specific MD5 checksum encoding (TBD)
        md5_dimension = DatsObj("Dimension", [
            ("name", util.get_value_annotation("MD5", cache)),
            ("values", [cram_md5])
        ])

        cram_dataset = DatsObj("Dataset", [
            ("distributions", [gs_cram_distro, s3_cram_distro]),
            ("dimensions", [md5_dimension]),
            ("title", filename),
            ("types", [wgs_type]),
            ("creators", creators),
        ])

        cram_da = DatsObj("DataAcquisition", [
            ("name", filename),
            ("input", [dats_sample.getIdRef()])
            # ("uses", [])  # software used
        ])

        cram_dataset.set("producedBy", cram_da)

        # circular link back to enclosing Dataset as the output
        if not no_circular_links:
            cram_da.set("output", [cram_dataset.getIdRef()])

        file_datasets_l.append(cram_dataset)

        # ------------------------------------------------
        # Variant calls - VCF and CSI files
        # ------------------------------------------------
        gs_vcf = ms['gs_vcf']['mapped_value']
        gs_csi = ms['gs_csi']['mapped_value']

        if gs_vcf is None:
            logging.warning("no VCF file found for " + sample_id)
            continue

        # GUID lookup
        vcf_file = get_filename(gs_vcf)
        csi_file = get_filename(gs_csi)

        vcf_doi = file_guids[vcf_file]['Sodium_GUID']['raw_value']
        vcf_size = file_guids[vcf_file]['File size']['raw_value']
        vcf_md5 = file_guids[vcf_file]['md5sum']['raw_value']

        csi_doi = file_guids[csi_file]['Sodium_GUID']['raw_value']
        csi_md5 = file_guids[csi_file]['md5sum']['raw_value']

        # Google Cloud Platform / Google Storage copy
        gs_vcf_access = DatsObj("Access", [("accessURL", gs_vcf)])

        gs_vcf_distro = DatsObj("DatasetDistribution", [
            ("access", gs_vcf_access),
            ("identifier", DatsObj("Identifier", [("identifier", vcf_doi)])),
            ("relatedIdentifiers", [
                DatsObj("RelatedIdentifier", [
                    ("identifier", csi_doi),
                    ("relationType", "vcf_index")
                ])
            ]),
            ("size", filesize_to_int(vcf_size)),
            # TODO - add file size units
            ("conformsTo", [vcf_dstan])
        ])

        # AWS / S3 copy
        s3_vcf = ms['s3_vcf']['mapped_value']
        s3_csi = ms['s3_csi']['mapped_value']

        s3_vcf_access = DatsObj("Access", [("accessURL", s3_vcf)])

        s3_vcf_distro = DatsObj("DatasetDistribution", [
            ("access", s3_vcf_access),
            ("identifier", DatsObj("Identifier", [("identifier", vcf_doi)])),
            ("relatedIdentifiers", [
                DatsObj("RelatedIdentifier", [
                    ("identifier", csi_doi),
                    ("relationType", "vcf_index")
                ])
            ]),
            ("size", filesize_to_int(vcf_size)),
            # TODO - add file size units
            ("conformsTo", [vcf_dstan])
        ])

        m = re.match(r'^.*\/([^\/]+)$', gs_vcf)
        if m is None:
            logging.fatal("unable to parse filename from VCF file URI " + gs_vcf)
            sys.exit(1)
        filename = m.group(1)

        # TODO - replace this with DATS-specific MD5 checksum encoding (TBD)
        md5_dimension = DatsObj("Dimension", [
            ("name", util.get_value_annotation("MD5", cache)),
            ("values", [vcf_md5])
        ])

        vcf_dataset = DatsObj("Dataset", [
            ("distributions", [gs_vcf_distro, s3_vcf_distro]),
            ("dimensions", [md5_dimension]),
            ("title", filename),
            ("types", [snp_type, cnv_type]),
            ("creators", creators),
        ])

        vcf_da = DatsObj("DataAcquisition", [
            ("name", filename),
            ("input", [dats_sample.getIdRef()])
            # ("uses", [])  # software used
        ])

        vcf_dataset.set("producedBy", vcf_da)

        # circular link back to enclosing Dataset as the output
        if not no_circular_links:
            vcf_da.set("output", [vcf_dataset.getIdRef()])

        file_datasets_l.append(vcf_dataset)

    logging.info("found " + str(n_samples_found) + " / " + str(n_samples) +
                 " sample(s) in TOPMed file manifest")
    return file_datasets_l
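# Standalone sketch (illustrative, not in the original source) of the size
# parsing used inside get_files_dats_datasets above: TOPMed manifest/GUID file
# sizes occasionally arrive in scientific notation (e.g. "1.2e9"), which int()
# cannot parse directly, hence the float() round trip.
def _filesize_to_int_example(size):
    # mirror of the nested filesize_to_int helper, exposed here for illustration
    return int(float(size)) if re.search(r'e', size) else int(size)

# _filesize_to_int_example("1048576")  ->  1048576
# _filesize_to_int_example("1.2e9")    ->  1200000000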
imi_project = DatsObj("Dataset", [
    ("identifier", DatsObj("Identifier", [("identifier", "IMI-Cat#" + str(i))])),
    ("title", df["Project Acronym"][i]),
    ("description", df["ShortDescription"][i] + ". SUMMARY: " + df["Summary"][i]),
    ("distributions", []),
    ("creators", [d_orgs, d_person]),
    ("keywords", d_kwds),
    ("dates", [start_date, end_date]),
    ("types", []),
    ("producedBy", []),
    ("storedIn", ""),
    ("isAbout", [d_kwds]),  # duplicate empty ("isAbout", []) entry removed
    ("version", ""),
    ("extraProperties", [dataset_extra_props])
])
imi_projects.append(imi_project)

imi_project_catalogue.set("hasPart", imi_projects)
DATSEncoder().encode(imi_project_catalogue)

# imi_data_jstr = json.dumps(cls=DATSEncoder)
# print(imi_project_catalogue.toJSON())
# imi_data_jstr = json.dumps(imi_project_catalogue.__dict__)
# print("FULL CATALOGUE:", imi_data_jstr)

script_dir = os.path.dirname(__file__)
print("DIR", script_dir)

filename = 'IMI_datacatalogue_as_DATS.json'
with open(join(output_dir, filename), 'w', encoding='utf-8') as f:
    json.dump(imi_project_catalogue.toJSON(), f)
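# Illustrative follow-up (not in the original script): re-load the file just
# written to confirm it is valid JSON; reuses the same output_dir and filename
# as above.
with open(join(output_dir, filename), encoding='utf-8') as f:
    _reloaded_catalogue = json.load(f)
print("reloaded catalogue with", len(_reloaded_catalogue.get("hasPart", [])), "project(s)")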