# Standard-library imports used throughout this file. The DATS helpers
# (DatsObj, DatsObjCache, DATSEncoder, util, NIH_NHLBI), the ccmm.gtex.*
# modules, portal_files, github_files, cross_check_ids, add_restricted_data,
# and the V7_*/RNASEQ_*/WGS_* filename constants are assumed to be supplied
# by the enclosing package.
import argparse
import json
import logging
import re
import sys


# TOPMed version of get_files_dats_datasets() (a GTEx version appears below):
# creates one DATS Dataset per CRAM file and per VCF file listed in the
# TOPMed sample manifest.
def get_files_dats_datasets(cache, dats_samples_d, sample_manifest, file_guids,
                            no_circular_links):
    file_datasets_l = []

    wgs_datatype = DatsObj(
        "DataType",
        [("information", util.get_annotation("DNA sequencing", cache)),
         ("method", util.get_annotation("whole genome sequencing assay", cache)),
         ("platform", util.get_annotation("Illumina", cache))])

    def get_wgs_datatype():
        dkey = ".".join(["DataType", "WGS"])
        return cache.get_obj_or_ref(dkey, lambda: wgs_datatype)

    snp_datatype = DatsObj(
        "DataType",
        [("information", util.get_annotation("SNP", cache)),
         ("method", util.get_annotation("SNP analysis", cache))])

    cnv_datatype = DatsObj(
        "DataType",
        [("information", util.get_annotation("CNV", cache)),
         ("method", util.get_annotation("CNV analysis", cache))])

    def get_snp_datatype():
        dkey = ".".join(["DataType", "SNP"])
        return cache.get_obj_or_ref(dkey, lambda: snp_datatype)

    def get_cnv_datatype():
        dkey = ".".join(["DataType", "CNV"])
        return cache.get_obj_or_ref(dkey, lambda: cnv_datatype)

    nhlbi_key = ":".join(["Organization", "NHLBI"])
    nhlbi = cache.get_obj_or_ref(nhlbi_key, lambda: NIH_NHLBI)
    creators = [nhlbi]

    def make_data_standard(format):
        return DatsObj(
            "DataStandard",
            [("name", format),
             ("type", util.get_value_annotation("format", cache)),
             ("description", format + " file format")])

    cram_ds_key = ":".join(["DataStandard", "CRAM"])
    cram_dstan = cache.get_obj_or_ref(cram_ds_key,
                                      lambda: make_data_standard("CRAM"))

    vcf_ds_key = ":".join(["DataStandard", "VCF"])
    vcf_dstan = cache.get_obj_or_ref(vcf_ds_key,
                                     lambda: make_data_standard("VCF"))

    n_samples = len(dats_samples_d)
    n_samples_found = 0

    for sample_id in dats_samples_d:
        dats_sample = dats_samples_d[sample_id]

        # look up corresponding file paths in manifest file
        if sample_id not in sample_manifest:
            logging.debug("sample not found in manifest - " + sample_id)
            continue

        n_samples_found += 1
        ms = sample_manifest[sample_id]

        material_type = 'DNA'
        wgs_type = get_wgs_datatype()
        snp_type = get_snp_datatype()
        cnv_type = get_cnv_datatype()

        # ------------------------------------------------
        # WGS sequence - CRAM and CRAI files
        # ------------------------------------------------

        def get_filename(gs_uri):
            m = re.match(r'^.*\/([^\/]+)$', gs_uri)
            if m is None:
                logging.fatal("unable to parse filename from " + gs_uri)
                sys.exit(1)
            filename = m.group(1)
            return filename

        gs_cram = ms['gs_cram']['mapped_value']
        gs_crai = ms['gs_crai']['mapped_value']

        # GUID lookup
        cram_file = get_filename(gs_cram)
        crai_file = get_filename(gs_crai)

        cram_doi = file_guids[cram_file]['Sodium_GUID']['raw_value']
        cram_size = file_guids[cram_file]['File size']['raw_value']
        cram_md5 = file_guids[cram_file]['md5sum']['raw_value']
        crai_doi = file_guids[crai_file]['Sodium_GUID']['raw_value']
        crai_md5 = file_guids[crai_file]['md5sum']['raw_value']

        # handle file size values with "e" in them (scientific notation)
        def filesize_to_int(size):
            if re.match(r'.*e.*', size):
                size = int(float(size))
            else:
                size = int(size)
            return size

        # Google Cloud Platform / Google Storage copy
        gs_cram_access = DatsObj("Access", [("accessURL", gs_cram)])
        gs_cram_distro = DatsObj("DatasetDistribution", [
            ("access", gs_cram_access),
            ("identifier", DatsObj("Identifier", [("identifier", cram_doi)])),
            ("relatedIdentifiers", [
                DatsObj("RelatedIdentifier", [("identifier", crai_doi),
                                              ("relationType", "cram_index")])
            ]),
            ("size", filesize_to_int(cram_size)),
            # TODO - add file size units
            ("conformsTo", [cram_dstan])
        ])

        # AWS / S3 copy
        s3_cram = ms['s3_cram']['mapped_value']
        s3_crai = ms['s3_crai']['mapped_value']
        s3_cram_access = DatsObj("Access", [("accessURL", s3_cram)])
        s3_cram_distro = DatsObj("DatasetDistribution", [
            ("access", s3_cram_access),
            ("identifier", DatsObj("Identifier", [("identifier", cram_doi)])),
            ("relatedIdentifiers", [
                DatsObj("RelatedIdentifier", [("identifier", crai_doi),
                                              ("relationType", "cram_index")])
            ]),
            ("size", filesize_to_int(cram_size)),
            # TODO - add file size units
            ("conformsTo", [cram_dstan])
        ])

        m = re.match(r'^.*\/([^\/]+)$', gs_cram)
        if m is None:
            logging.fatal("unable to parse filename from CRAM file URI " + gs_cram)
            sys.exit(1)
        filename = m.group(1)

        # TODO - replace this with DATS-specific MD5 checksum encoding (TBD)
        md5_dimension = DatsObj(
            "Dimension",
            [("name", util.get_value_annotation("MD5", cache)),
             ("values", [cram_md5])])

        cram_dataset = DatsObj("Dataset", [
            ("distributions", [gs_cram_distro, s3_cram_distro]),
            ("dimensions", [md5_dimension]),
            ("title", filename),
            ("types", [wgs_type]),
            ("creators", creators),
        ])

        cram_da = DatsObj("DataAcquisition", [
            ("name", filename),
            ("input", [dats_sample.getIdRef()])
            # ("uses", [])  # software used
        ])
        cram_dataset.set("producedBy", cram_da)

        # circular link back to enclosing Dataset as the output
        if not no_circular_links:
            cram_da.set("output", [cram_dataset.getIdRef()])
        file_datasets_l.append(cram_dataset)

        # ------------------------------------------------
        # Variant calls - VCF and CSI files
        # ------------------------------------------------

        gs_vcf = ms['gs_vcf']['mapped_value']
        gs_csi = ms['gs_csi']['mapped_value']

        if gs_vcf is None:
            logging.warn("no VCF file found for " + sample_id)
            continue

        # GUID lookup
        vcf_file = get_filename(gs_vcf)
        csi_file = get_filename(gs_csi)

        vcf_doi = file_guids[vcf_file]['Sodium_GUID']['raw_value']
        vcf_size = file_guids[vcf_file]['File size']['raw_value']
        vcf_md5 = file_guids[vcf_file]['md5sum']['raw_value']
        csi_doi = file_guids[csi_file]['Sodium_GUID']['raw_value']
        csi_md5 = file_guids[csi_file]['md5sum']['raw_value']

        # Google Cloud Platform / Google Storage copy
        gs_vcf_access = DatsObj("Access", [("accessURL", gs_vcf)])
        gs_vcf_distro = DatsObj("DatasetDistribution", [
            ("access", gs_vcf_access),
            ("identifier", DatsObj("Identifier", [("identifier", vcf_doi)])),
            ("relatedIdentifiers", [
                DatsObj("RelatedIdentifier", [("identifier", csi_doi),
                                              ("relationType", "vcf_index")])
            ]),
            ("size", filesize_to_int(vcf_size)),
            # TODO - add file size units
            ("conformsTo", [vcf_dstan])
        ])

        # AWS / S3 copy
        s3_vcf = ms['s3_vcf']['mapped_value']
        s3_csi = ms['s3_csi']['mapped_value']
        s3_vcf_access = DatsObj("Access", [("accessURL", s3_vcf)])
        s3_vcf_distro = DatsObj("DatasetDistribution", [
            ("access", s3_vcf_access),
            ("identifier", DatsObj("Identifier", [("identifier", vcf_doi)])),
            ("relatedIdentifiers", [
                DatsObj("RelatedIdentifier", [("identifier", csi_doi),
                                              ("relationType", "vcf_index")])
            ]),
            ("size", filesize_to_int(vcf_size)),
            # TODO - add file size units
            ("conformsTo", [vcf_dstan])
        ])

        m = re.match(r'^.*\/([^\/]+)$', gs_vcf)
        if m is None:
            logging.fatal("unable to parse filename from VCF file URI " + gs_vcf)
            sys.exit(1)
        filename = m.group(1)

        # TODO - replace this with DATS-specific MD5 checksum encoding (TBD)
        md5_dimension = DatsObj(
            "Dimension",
            [("name", util.get_value_annotation("MD5", cache)),
             ("values", [vcf_md5])])

        vcf_dataset = DatsObj("Dataset", [
            ("distributions", [gs_vcf_distro, s3_vcf_distro]),
            ("dimensions", [md5_dimension]),
            ("title", filename),
            ("types", [snp_type, cnv_type]),
            ("creators", creators),
        ])

        vcf_da = DatsObj("DataAcquisition", [
            ("name", filename),
            ("input", [dats_sample.getIdRef()])
            # ("uses", [])  # software used
        ])
        vcf_dataset.set("producedBy", vcf_da)

        # circular link back to enclosing Dataset as the output
        if not no_circular_links:
            vcf_da.set("output", [vcf_dataset.getIdRef()])
        file_datasets_l.append(vcf_dataset)

    logging.info("found " + str(n_samples_found) + " / " + str(n_samples) +
                 " sample(s) in TOPMed file manifest")
    return file_datasets_l

def make_consent_group(args, group_name, group_index, subject_l, dats_subject_d):
    # find DATS subject that corresponds to each named subject
    dats_subjects_l = []
    # parallel array in which existing subjects are represented by idref
    dats_subjects_idrefs_l = []

    for s in subject_l:
        if s['SUBJID'] not in dats_subject_d:
            logging.warn("GTEx subject " + s['SUBJID'] +
                         " not found in public metadata, creating new subject Material")
            # create a new placeholder Material, register it in dats_subject_d,
            # and add it to this group
            subject = DatsObj("Material",
                              [("name", s['SUBJID']),
                               ("characteristics", []),
                               ("description", "GTEx subject " + s['SUBJID'])])
            dats_subject_d[s['SUBJID']] = subject
            dats_subjects_l.append(subject)
            dats_subjects_idrefs_l.append(subject)
        else:
            ds = dats_subject_d[s['SUBJID']]
            dats_subjects_l.append(ds)
            dats_subjects_idrefs_l.append(ds.getIdRef())

    # create StudyGroup and associated ConsentInfo
    # TODO - determine if/where to store group_index (0 or 1)
    # only 2 consent groups in GTEx study:
    #  0 - Subjects did not participate in the study, did not complete a consent document and
    #      are included only for the pedigree structure and/or genotype controls, such as HapMap subjects
    #  1 - General Research Use (GRU)
    consent_info = None

    if group_name == "General Research Use (GRU)":
        # Data Use Ontology for consent info - http://www.obofoundry.org/ontology/duo.html
        #  http://purl.obolibrary.org/obo/DUO_0000005 - "general research use and clinical care"
        #  "This primary category consent code indicates that use is allowed for health/medical/biomedical
        #   purposes and other biological research, including the study of population origins or ancestry."
        consent_info = DatsObj("ConsentInfo", [
            ("name", group_name),
            ("abbreviation", "GRU"),
            ("description", group_name),
            ("relatedIdentifiers", [
                DatsObj("RelatedIdentifier",
                        [("identifier", "http://purl.obolibrary.org/obo/DUO_0000005")])
            ])
        ])
    elif group_name == "Subjects did not participate in the study, did not complete a consent document and are included only for the pedigree structure and/or genotype controls, such as HapMap subjects":
        consent_info = DatsObj("ConsentInfo",
                               [("name", group_name), ("description", group_name)])
    else:
        logging.fatal("unrecognized consent group " + group_name)
        sys.exit(1)

    group = DatsObj("StudyGroup", [
        ("name", group_name),
        ("members", dats_subjects_idrefs_l),
        ("size", len(dats_subjects_idrefs_l)),
        ("consentInformation", [consent_info])
    ])

    # create link back from each subject to the parent StudyGroup
    if args.no_circular_links:
        logging.warn("not creating Subject level circular links because of --no_circular_links option")
    else:
        for s in dats_subjects_l:
            cl = s.get("characteristics")
            cl.append(DatsObj("Dimension",
                              [("name", "member of study group"),
                               ("values", [group.getIdRef()])]))

    return group
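
# Hypothetical usage of make_consent_group(); the subject rows and registry
# below are illustrative only — in practice subject_l comes from the
# restricted-access dbGaP consent metadata and dats_subject_d from the
# public-metadata subject Materials built in main():
#
#   gru_subject_l = [{"SUBJID": "GTEX-EX01"}, {"SUBJID": "GTEX-EX02"}]
#   gru_group = make_consent_group(args, "General Research Use (GRU)", 1,
#                                  gru_subject_l, dats_subject_d)
#   dats_study.set("studyGroups", [all_subjects, gru_group])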

def main():

    # input
    parser = argparse.ArgumentParser(
        description='Create DATS JSON for dbGaP GTEx public metadata.')
    parser.add_argument('--output_file', required=True,
                        help='Output file path for the DATS JSON file containing the top-level DATS Dataset.')
    parser.add_argument('--dbgap_public_xml_path', required=True,
                        help='Path to directory that contains public dbGaP metadata files e.g., *.data_dict.xml and *.var_report.xml')
    parser.add_argument('--dbgap_protected_metadata_path', required=False,
                        help='Path to directory that contains access-controlled dbGaP tab-delimited metadata files.')
    parser.add_argument('--max_output_samples', required=False, type=int,
                        help='Impose a limit on the number of sample Materials in the output DATS. For testing purposes only.')
    parser.add_argument('--subject_phenotypes_path', default=V7_SUBJECT_PHENOTYPES_FILE, required=False,
                        help='Path to ' + V7_SUBJECT_PHENOTYPES_FILE)
    parser.add_argument('--sample_attributes_path', default=V7_SAMPLE_ATTRIBUTES_FILE, required=False,
                        help='Path to ' + V7_SAMPLE_ATTRIBUTES_FILE)
    parser.add_argument('--data_stewards_repo_path', default='data-stewards', required=False,
                        help='Path to local copy of https://github.com/dcppc/data-stewards')
    parser.add_argument('--no_circular_links', action='store_true',
                        help='Whether to disallow circular links/paths within the JSON-LD output.')
    parser.add_argument('--use_all_dbgap_subject_vars', action='store_true',
                        help='Whether to store all available dbGaP variable values as characteristics of the DATS subject Materials.')
    # parser.add_argument('--use_all_dbgap_sample_vars', action='store_true',
    #                     help='Whether to store all available dbGaP variable values as characteristics of the DATS sample Materials.')
    args = parser.parse_args()

    # logging
    logging.basicConfig(level=logging.INFO)
    # logging.basicConfig(level=logging.DEBUG)

    # read portal metadata for subjects and samples
    p_subjects = portal_files.read_subject_phenotypes_file(args.subject_phenotypes_path)
    p_samples = portal_files.read_sample_attributes_file(args.sample_attributes_path)
    portal_files.link_samples_to_subjects(p_samples, p_subjects)

    # read id dump and manifest files from GitHub data-stewards repo

    # id dumps
    subject_id_file = args.data_stewards_repo_path + "/gtex/v7/id_dumps/gtex_v7_subject_ids.txt"
    gh_subjects = github_files.read_subject_id_file(subject_id_file)
    sample_id_file = args.data_stewards_repo_path + "/gtex/v7/id_dumps/gtex_v7_sample_ids.txt"
    gh_samples = github_files.read_sample_id_file(sample_id_file)
    tissue_id_file = args.data_stewards_repo_path + "/gtex/v7/id_dumps/gtex_v7_tissue_ids.txt"
    gh_tissues = github_files.read_tissue_id_file(tissue_id_file)

    # manifest files
    protected_rnaseq_manifest = args.data_stewards_repo_path + "/gtex/v7/manifests/protected_data/" + RNASEQ_MANIFEST_FILE
    protected_rnaseq_files = github_files.read_protected_rnaseq_manifest(protected_rnaseq_manifest)
    protected_wgs_manifest = args.data_stewards_repo_path + "/gtex/v7/manifests/protected_data/" + WGS_MANIFEST_FILE
    protected_wgs_files = github_files.read_protected_wgs_manifest(protected_wgs_manifest)

    # DOIs
    rnaseq_dois_file = args.data_stewards_repo_path + "/gtex/v7/manifests/protected_data/" + RNASEQ_DOIS_FILE
    rnaseq_dois = github_files.read_dois_manifest(rnaseq_dois_file)
    wgs_dois_file = args.data_stewards_repo_path + "/gtex/v7/manifests/protected_data/" + WGS_DOIS_FILE
    wgs_dois = github_files.read_dois_manifest(wgs_dois_file)

    # compare GitHub manifest files with GitHub id dumps
    cross_check_ids(gh_subjects, gh_samples, protected_rnaseq_files,
                    protected_rnaseq_manifest, "RNA-Seq", "GitHub id dumps")
    cross_check_ids(gh_subjects, gh_samples, protected_wgs_files,
                    protected_wgs_manifest, "WGS", "GitHub id dumps")

    # compare GitHub manifest files with GTEx Portal metadata files
    cross_check_ids(p_subjects, p_samples, protected_rnaseq_files,
                    protected_rnaseq_manifest, "RNA-Seq", "GTEx Portal metadata")
    cross_check_ids(p_subjects, p_samples, protected_wgs_files,
                    protected_wgs_manifest, "WGS", "GTEx Portal metadata")

    # create top-level dataset
    gtex_dataset = ccmm.gtex.wgs_datasets.get_dataset_json()

    # index dbGaP study Datasets by id
    dbgap_study_datasets_by_id = {}
    for tds in gtex_dataset.get("hasPart"):
        dbgap_study_id = tds.get("identifier").get("identifier")
        if dbgap_study_id in dbgap_study_datasets_by_id:
            logging.fatal("encountered duplicate study_id " + dbgap_study_id)
            sys.exit(1)
        m = re.match(r'^(phs\d+\.v\d+)\.p\d+$', dbgap_study_id)
        if m is None:
            logging.fatal("unable to parse study_id " + dbgap_study_id)
            sys.exit(1)
        dbgap_study_datasets_by_id[m.group(1)] = tds

    # dbGaP metadata paths
    pub_xp = args.dbgap_public_xml_path
    restricted_mp = args.dbgap_protected_metadata_path

    # read public metadata
    dbgap_study_pub_md = ccmm.gtex.public_metadata.read_study_metadata(pub_xp)

    # there should be only one study; check the count before indexing
    study_ids = [k for k in dbgap_study_pub_md.keys()]
    n_study_ids = len(study_ids)
    if n_study_ids != 1:
        logging.fatal("read " + str(n_study_ids) + " dbGaP studies from " + pub_xp)
        sys.exit(1)
    study_id = study_ids[0]

    dbgap_study_dataset = dbgap_study_datasets_by_id[study_id]
    dbgap_study_md = dbgap_study_pub_md[study_id]

    sv = ccmm.gtex.public_metadata.add_study_vars(dbgap_study_dataset, dbgap_study_md)
    dbgap_study_md['id_to_var'] = sv['id_to_var']
    dbgap_study_md['type_name_cg_to_var'] = sv['type_name_cg_to_var']

    # set 2nd level types to be the same as the top-level types: WGS and RNA-Seq
    dbgap_study_dataset.set("types", gtex_dataset.get("types"))

    # cache used to minimize duplication of JSON objects in JSON-LD output
    cache = DatsObjCache()

    # --------------------------
    # subjects
    # --------------------------

    # create subjects based on GTEx Portal subject phenotype file and GitHub data-stewards id dump
    dats_subjects_d = ccmm.gtex.subjects.get_subjects_dats_materials(
        cache, p_subjects, gh_subjects,
        dbgap_study_md['type_name_cg_to_var']['Subject_Phenotypes'])

    # sorted list of subjects
    dats_subjects_l = sorted([dats_subjects_d[s] for s in dats_subjects_d],
                             key=lambda s: s.get("name"))

    # TODO - add consent groups, of which GTEx has 2: 0=didn't participate, 1=General Research Use (GRU)

    # create StudyGroup that lists all the subjects
    logging.info("creating 'all subjects' StudyGroup containing " +
                 str(len(dats_subjects_l)) + " subject(s) from public metadata")
    all_subjects = DatsObj("StudyGroup", [
        ("name", "all subjects"),
        # subjects appear in full here, but id references will be used elsewhere in the instance:
        ("members", dats_subjects_l),
        ("size", len(dats_subjects_l))
    ])

    # create link back from each subject to the parent StudyGroup
    if args.no_circular_links:
        logging.warn("not creating Subject level circular links because of --no_circular_links option")
    else:
        for s in dats_subjects_l:
            cl = s.get("characteristics")
            cl.append(DatsObj("Dimension",
                              [("name", "member of study group"),
                               ("values", [all_subjects.getIdRef()])]))

    dats_study = DatsObj("Study",
                         [("name", "GTEx"), ("studyGroups", [all_subjects])])

    # link Study to Dataset
    dbgap_study_dataset.set("producedBy", dats_study)

    # --------------------------
    # sample Materials
    # --------------------------

    # create samples based on GTEx Portal sample attributes file and GitHub data-stewards id dump
    dats_samples_d = ccmm.gtex.samples.get_samples_dats_materials(
        cache, dats_subjects_d, p_samples, gh_samples,
        dbgap_study_md['type_name_cg_to_var']['Sample_Attributes'])

    # sorted list of samples
    dats_samples_l = sorted([dats_samples_d[s] for s in dats_samples_d],
                            key=lambda s: s.get("name"))

    if args.max_output_samples is not None:
        dats_samples_l = dats_samples_l[0:int(args.max_output_samples)]
        logging.warn("limiting output to " + str(len(dats_samples_l)) +
                     " sample(s) due to value of --max_output_samples")
    dbgap_study_dataset.set("isAbout", dats_samples_l)

    # --------------------------
    # file Datasets
    # --------------------------

    file_datasets_l = []

    # WGS CRAM
    wgs_dats_file_datasets_l = ccmm.gtex.samples.get_files_dats_datasets(
        cache, dats_samples_d, p_samples, gh_samples, protected_wgs_files,
        wgs_dois, args.no_circular_links)
    logging.info("adding Datasets for " + str(len(wgs_dats_file_datasets_l)) +
                 " WGS CRAM files")
    file_datasets_l.extend(wgs_dats_file_datasets_l)

    # RNA-Seq CRAM
    rnaseq_dats_file_datasets_l = ccmm.gtex.samples.get_files_dats_datasets(
        cache, dats_samples_d, p_samples, gh_samples, protected_rnaseq_files,
        rnaseq_dois, args.no_circular_links)
    logging.info("adding Datasets for " + str(len(rnaseq_dats_file_datasets_l)) +
                 " RNA-Seq CRAM files")
    file_datasets_l.extend(rnaseq_dats_file_datasets_l)

    dbgap_study_dataset.set("hasPart", file_datasets_l)

    # augment public (meta)data with restricted-access (meta)data
    if restricted_mp is not None:
        # create study groups and update subjects/samples with restricted phenotype data
        add_restricted_data(cache, args, dbgap_study_md, dats_subjects_l,
                            dats_samples_d, dats_study, study_id)

    # write Dataset to DATS JSON file
    with open(args.output_file, mode="w") as jf:
        jf.write(json.dumps(gtex_dataset, indent=2, cls=DATSEncoder))
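
# Example invocation of main() above (script name and paths are illustrative only):
#
#   python gtex_v7_to_dats.py \
#       --dbgap_public_xml_path=/path/to/phs000424.v7/public_metadata \
#       --dbgap_protected_metadata_path=/path/to/phs000424.v7/protected_metadata \
#       --data_stewards_repo_path=./data-stewards \
#       --output_file=gtex_v7_dats.json \
#       --no_circular_links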

# GTEx version of get_files_dats_datasets(): creates one DATS Dataset per
# protected CRAM file (WGS or RNA-Seq). The dois parameter matches the call
# in main() above; the DOI manifest is not yet used below (the distributions'
# identifiers are still the file URIs - see the encoding TODOs in the loop).
def get_files_dats_datasets(cache, dats_samples_d, p_samples, gh_samples,
                            protected_cram_files, dois, no_circular_links):
    file_datasets = []

    rnaseq_datatype = DatsObj("DataType", [
        ("information", util.get_annotation("transcription profiling", cache)),
        ("method", util.get_annotation("RNA-seq assay", cache)),
        ("platform", util.get_annotation("Illumina", cache))
    ])

    def get_rnaseq_datatype():
        dkey = ".".join(["DataType", "RNA-seq"])
        return cache.get_obj_or_ref(dkey, lambda: rnaseq_datatype)

    wgs_datatype = DatsObj("DataType", [
        ("information", util.get_annotation("DNA sequencing", cache)),
        ("method", util.get_annotation("whole genome sequencing assay", cache)),
        ("platform", util.get_annotation("Illumina", cache))
    ])

    def get_wgs_datatype():
        dkey = ".".join(["DataType", "WGS"])
        return cache.get_obj_or_ref(dkey, lambda: wgs_datatype)

    broad_key = ":".join(["Organization", "Broad Institute"])
    broad = cache.get_obj_or_ref(
        broad_key, lambda: DatsObj("Organization", [("name", "Broad Institute")]))
    creators = [broad]

    def make_data_standard(format):
        return DatsObj(
            "DataStandard",
            [("name", format),
             ("type", util.get_value_annotation("format", cache)),
             ("description", format + " file format")])

    cram_ds_key = ":".join(["DataStandard", "CRAM"])
    cram_ds = cache.get_obj_or_ref(cram_ds_key,
                                   lambda: make_data_standard("CRAM"))

    crai_ds_key = ":".join(["DataStandard", "CRAI"])
    crai_ds = cache.get_obj_or_ref(crai_ds_key,
                                   lambda: make_data_standard("CRAI"))

    for sample_id in protected_cram_files:
        file = protected_cram_files[sample_id]
        material_type = None
        ds_types = None

        # determine file type
        if re.search(r'wgs\/', file['cram_file_aws']['raw_value']):
            material_type = 'DNA'
            ds_types = get_wgs_datatype()
            gcp_suffix = '_gcp'
        elif re.search(r'rnaseq\/', file['cram_file_aws']['raw_value']):
            material_type = 'RNA'
            ds_types = get_rnaseq_datatype()
            gcp_suffix = ''
        else:
            logging.fatal("unable to determine material/sequence type from cram_file_aws=" +
                          file['cram_file_aws']['raw_value'])
            sys.exit(1)

        # RNA-Seq keys: sample_id, cram_file, cram_file_md5, cram_file_size,
        #               cram_index, cram_file_aws, cram_index_aws
        # WGS keys: same as above, plus firecloud_id
        cram_file = file['cram_file' + gcp_suffix]['raw_value']
        cram_file_md5 = file['cram_file_md5']['raw_value']

        # TODO - review the following encoding decisions:
        #  - storing .crai URI as relatedIdentifier of the DatasetDistribution for the .cram file
        #  - storing MD5 checksum of the .cram file as an extraProperty of the DatasetDistribution
        #  - storing firecloud_id as a relatedIdentifier of the Dataset (not the DatasetDistribution)

        # Google Cloud Platform / Google Storage copy
        gs_access = DatsObj(
            "Access",
            [("landingPage", file['cram_file' + gcp_suffix]['raw_value'])])
        gs_distro = DatsObj("DatasetDistribution", [
            ("access", gs_access),
            ("identifier", DatsObj("Identifier",
                                   [("identifier", file['cram_file' + gcp_suffix]['raw_value'])])),
            ("relatedIdentifiers", [
                DatsObj("RelatedIdentifier",
                        [("identifier", file['cram_index' + gcp_suffix]['raw_value']),
                         ("relationType", "cram_index")])
            ]),
            ("size", int(file['cram_file_size']['raw_value'])),
            # TODO - add unit for bytes, include IRI?
            # ("unit", util.get_value_annotation("bytes", cache))
            ("conformsTo", [
                cache.get_obj_or_ref(cram_ds_key, lambda: make_data_standard("CRAM"))
            ])
        ])

        # AWS / S3 copy
        s3_access = DatsObj(
            "Access",
            [("landingPage", file['cram_file_aws']['raw_value'])])
        s3_distro = DatsObj("DatasetDistribution", [
            ("access", s3_access),
            ("identifier", DatsObj("Identifier",
                                   [("identifier", file['cram_file_aws']['raw_value'])])),
            ("relatedIdentifiers", [
                DatsObj("RelatedIdentifier",
                        [("identifier", file['cram_index_aws']['raw_value']),
                         ("relationType", "cram_index")])
            ]),
            ("size", int(file['cram_file_size']['raw_value'])),
            # TODO - add unit for bytes, include IRI?
            # ("unit", util.get_value_annotation("bytes", cache))
            ("conformsTo", [
                cache.get_obj_or_ref(cram_ds_key, lambda: make_data_standard("CRAM"))
            ])
        ])

        m = re.match(r'^.*\/([^\/]+)$', cram_file)
        if m is None:
            logging.fatal("unable to parse filename from CRAM file URI " + cram_file)
            sys.exit(1)
        filename = m.group(1)

        # TODO - replace this with DATS-specific MD5 checksum encoding (TBD)
        md5_dimension = DatsObj(
            "Dimension",
            [("name", util.get_value_annotation("MD5", cache)),
             ("values", [file['cram_file_md5']['raw_value']])])

        ds = DatsObj("Dataset", [
            ("distributions", [gs_distro, s3_distro]),
            ("dimensions", [md5_dimension]),
            ("title", filename),
            ("types", [ds_types]),
            ("creators", creators),
        ])

        # add firecloud_id for WGS
        if 'firecloud_id' in file:
            f_id = DatsObj("RelatedIdentifier",
                           [("identifier", file['firecloud_id']['raw_value']),
                            ("identifierSource", "FireCloud")])
            ds.set("relatedIdentifiers", [f_id])

        # input RNA/DNA extract that was sequenced
        if sample_id not in dats_samples_d:
            logging.fatal("no sample exists for " + sample_id +
                          " found in file " + file['cram_file_aws']['raw_value'])
            sys.exit(1)
        dats_sample = dats_samples_d[sample_id]
        dats_samp_key = ":".join(["Material", dats_sample.get("name")])
        dats_samp = cache.get_obj_or_ref(dats_samp_key, lambda: dats_sample)

        da = DatsObj("DataAcquisition", [
            ("name", filename),
            ("input", [dats_samp])
            # ("uses", [])  # software used
        ])

        if not no_circular_links:
            # circular link back to enclosing Dataset as the output
            da.set("output", [ds.getIdRef()])
        ds.set("producedBy", da)
        file_datasets.append(ds)

    return file_datasets