def get_single_sample_json(sample, dats_obj_cache):
    # print("converting sample to json: " + str(sample))
    samp_id = sample['SAMPID']['mapped_value']
    subj_id = sample['SUBJID']['mapped_value']
    subject = sample['subject']

    # Uberon id (or EFO id, contrary to the documentation)
    anat_id = sample['SMUBRID']['mapped_value']
    if anat_id is None:
        logging.fatal("No Uberon/anatomy ID specified for sample " + samp_id)
        sys.exit(1)

    anatomy_identifier = None
    anatomy_alt_ids = None
    # TODO - query anatomy term from UBERON/EFO instead?
    anatomy_name = sample['SMTSD']['mapped_value']

    # EFO id
    if re.match(r'^EFO_\d+', anat_id):
        anatomy_identifier = OrderedDict([
            ("identifier", anat_id),
            ("identifierSource", "EFO")])
        anatomy_alt_ids = [
            OrderedDict([
                ("identifier", "https://www.ebi.ac.uk/ols/ontologies/efo/terms?short_form=" + str(anat_id)),
                ("identifierSource", "EFO")
            ])
        ]
    # Uberon id
    else:
        anatomy_identifier = OrderedDict([
            ("identifier", "UBERON:" + str(anat_id)),
            ("identifierSource", "UBERON")])
        anatomy_alt_ids = [
            OrderedDict([
                ("identifier", "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)),
                ("identifierSource", "UBERON")
            ])
        ]

    # anatomical part
    anat_part_key = ":".join(["AnatomicalPart", anatomy_name])
    if anat_part_key in dats_obj_cache:
        anatomical_part = dats_obj_cache[anat_part_key]
    else:
        anatomical_part = DatsObj("AnatomicalPart",
                                  [("name", anatomy_name),
                                   ("identifier", anatomy_identifier),
                                   ("alternateIdentifiers", anatomy_alt_ids)])
        dats_obj_cache[anat_part_key] = anatomical_part

    # human experimental subject/patient
    subject_sex = DatsObj(
        "Dimension",
        [("name", DatsObj("Annotation", [("value", "Gender")])),
         ("description", "Gender of the subject"),
         ("identifier", DatsObj("Identifier", [("identifier", "SEX"),
                                               ("identifierSource", "GTEx")])),
         ("values", [subject['SEX']['mapped_value']])])

    subject_age = DatsObj(
        "Dimension",
        [("name", DatsObj("Annotation", [("value", "Age range")])),
         ("description", "Age range of the subject"),
         ("identifier", DatsObj("Identifier", [("identifier", "AGE"),
                                               ("identifierSource", "GTEx")])),
         ("values", [subject['AGE']['mapped_value']])])

    subject_hardy_scale = DatsObj(
        "Dimension",
        [("name", DatsObj("Annotation", [("value", "Hardy scale")])),
         ("description", "Hardy scale death classification for the subject"),
         ("identifier", DatsObj("Identifier", [("identifier", "DTHHRDY"),
                                               ("identifierSource", "GTEx")])),
         ("values", [subject['DTHHRDY']['mapped_value']])])

    subject_characteristics = [subject_sex, subject_age, subject_hardy_scale]

    # human experimental subject/patient
    subj_key = ":".join(["Material", subj_id])
    if subj_key in dats_obj_cache:
        subject_material = dats_obj_cache[subj_key]
    else:
        subject_material = DatsObj(
            "Material",
            [("name", subj_id),
             ("identifier", {"identifier": subj_id}),
             ("description", "GTEx subject " + subj_id),
             ("characteristics", subject_characteristics),
             ("taxonomy", [util.get_taxon_human(dats_obj_cache)]),
             ("roles", util.get_donor_roles(dats_obj_cache))])
        dats_obj_cache[subj_key] = subject_material

    specimen_annot = util.get_annotation("specimen", dats_obj_cache)
    rna_extract_annot = util.get_annotation("RNA extract", dats_obj_cache)

    # biological/tissue sample
    sample_name = samp_id
    biological_sample_material = DatsObj(
        "Material",
        [("name", sample_name),
         ("identifier", {"identifier": samp_id}),
         ("description", anatomy_name + " specimen collected from subject " + subj_id),
         ("taxonomy", [util.get_taxon_human(dats_obj_cache)]),
         ("roles", [specimen_annot]),
         ("derivesFrom", [subject_material, anatomical_part])])

    # RNA extracted from tissue sample
    rna_material = DatsObj(
        "Material",
        [("name", "RNA from " + sample_name),
         ("description", "total RNA extracted from " + anatomy_name + " specimen collected from subject " + subj_id),
         ("taxonomy", [util.get_taxon_human(dats_obj_cache)]),
         ("roles", [rna_extract_annot]),
         ("derivesFrom", [biological_sample_material])])

    return rna_material
def get_files_dats_datasets(cache, dats_samples_d, sample_manifest, file_guids, no_circular_links):
    file_datasets_l = []

    wgs_datatype = DatsObj(
        "DataType",
        [("information", util.get_annotation("DNA sequencing", cache)),
         ("method", util.get_annotation("whole genome sequencing assay", cache)),
         ("platform", util.get_annotation("Illumina", cache))])

    def get_wgs_datatype():
        dkey = ".".join(["DataType", "WGS"])
        return cache.get_obj_or_ref(dkey, lambda: wgs_datatype)

    snp_datatype = DatsObj(
        "DataType",
        [("information", util.get_annotation("SNP", cache)),
         ("method", util.get_annotation("SNP analysis", cache))])

    cnv_datatype = DatsObj(
        "DataType",
        [("information", util.get_annotation("CNV", cache)),
         ("method", util.get_annotation("CNV analysis", cache))])

    def get_snp_datatype():
        dkey = ".".join(["DataType", "SNP"])
        return cache.get_obj_or_ref(dkey, lambda: snp_datatype)

    def get_cnv_datatype():
        dkey = ".".join(["DataType", "CNV"])
        return cache.get_obj_or_ref(dkey, lambda: cnv_datatype)

    nhlbi_key = ":".join(["Organization", "NHLBI"])
    nhlbi = cache.get_obj_or_ref(nhlbi_key, lambda: NIH_NHLBI)
    creators = [nhlbi]

    def make_data_standard(format):
        return DatsObj("DataStandard",
                       [("name", format),
                        ("type", util.get_value_annotation("format", cache)),
                        ("description", format + " file format")])

    cram_ds_key = ":".join(["DataStandard", "CRAM"])
    cram_dstan = cache.get_obj_or_ref(cram_ds_key, lambda: make_data_standard("CRAM"))

    vcf_ds_key = ":".join(["DataStandard", "VCF"])
    vcf_dstan = cache.get_obj_or_ref(vcf_ds_key, lambda: make_data_standard("VCF"))

    n_samples = len(dats_samples_d)
    n_samples_found = 0

    for sample_id in dats_samples_d:
        dats_sample = dats_samples_d[sample_id]

        # look up corresponding file paths in manifest file
        if sample_id not in sample_manifest:
            logging.debug("sample not found in manifest - " + sample_id)
            continue

        n_samples_found += 1
        ms = sample_manifest[sample_id]

        material_type = 'DNA'
        wgs_type = get_wgs_datatype()
        snp_type = get_snp_datatype()
        cnv_type = get_cnv_datatype()

        # ------------------------------------------------
        # WGS sequence - CRAM and CRAI files
        # ------------------------------------------------

        def get_filename(gs_uri):
            m = re.match(r'^.*\/([^\/]+)$', gs_uri)
            if m is None:
                logging.fatal("unable to parse filename from " + gs_uri)
                sys.exit(1)
            filename = m.group(1)
            return filename

        gs_cram = ms['gs_cram']['mapped_value']
        gs_crai = ms['gs_crai']['mapped_value']

        # GUID lookup
        cram_file = get_filename(gs_cram)
        crai_file = get_filename(gs_crai)

        cram_doi = file_guids[cram_file]['Sodium_GUID']['raw_value']
        cram_size = file_guids[cram_file]['File size']['raw_value']
        cram_md5 = file_guids[cram_file]['md5sum']['raw_value']

        crai_doi = file_guids[crai_file]['Sodium_GUID']['raw_value']
        crai_md5 = file_guids[crai_file]['md5sum']['raw_value']

        # handle file size values with "e" in them
        def filesize_to_int(size):
            if re.match(r'.*e.*', size):
                size = int(float(size))
            else:
                size = int(size)
            return size

        # Google Cloud Platform / Google Storage copy
        gs_cram_access = DatsObj("Access", [("accessURL", gs_cram)])
        gs_cram_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", gs_cram_access),
                ("identifier", DatsObj("Identifier", [("identifier", cram_doi)])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier", [("identifier", crai_doi),
                                                  ("relationType", "cram_index")])
                ]),
                ("size", filesize_to_int(cram_size)),
                # TODO - add file size units
                ("conformsTo", [cram_dstan])
            ])

        # AWS / S3 copy
        s3_cram = ms['s3_cram']['mapped_value']
        s3_crai = ms['s3_crai']['mapped_value']
        s3_cram_access = DatsObj("Access", [("accessURL", s3_cram)])
        s3_cram_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", s3_cram_access),
                ("identifier", DatsObj("Identifier", [("identifier", cram_doi)])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier", [("identifier", crai_doi),
                                                  ("relationType", "cram_index")])
                ]),
                ("size", filesize_to_int(cram_size)),
                # TODO - add file size units
                ("conformsTo", [cram_dstan])
            ])

        m = re.match(r'^.*\/([^\/]+)$', gs_cram)
        if m is None:
            logging.fatal("unable to parse filename from CRAM file URI " + gs_cram)
            sys.exit(1)
        filename = m.group(1)

        # TODO - replace this with DATS-specific MD5 checksum encoding (TBD)
        md5_dimension = DatsObj(
            "Dimension",
            [("name", util.get_value_annotation("MD5", cache)),
             ("values", [cram_md5])])

        cram_dataset = DatsObj("Dataset", [
            ("distributions", [gs_cram_distro, s3_cram_distro]),
            ("dimensions", [md5_dimension]),
            ("title", filename),
            ("types", [wgs_type]),
            ("creators", creators),
        ])

        cram_da = DatsObj(
            "DataAcquisition",
            [("name", filename),
             ("input", [dats_sample.getIdRef()])
             # ("uses", [])  # software used
             ])

        cram_dataset.set("producedBy", cram_da)
        # circular link back to enclosing Dataset as the output
        if not no_circular_links:
            cram_da.set("output", [cram_dataset.getIdRef()])
        file_datasets_l.append(cram_dataset)

        # ------------------------------------------------
        # Variant calls - VCF and CSI files
        # ------------------------------------------------

        gs_vcf = ms['gs_vcf']['mapped_value']
        gs_csi = ms['gs_csi']['mapped_value']

        if gs_vcf is None:
            logging.warning("no VCF file found for " + sample_id)
            continue

        # GUID lookup
        vcf_file = get_filename(gs_vcf)
        csi_file = get_filename(gs_csi)

        vcf_doi = file_guids[vcf_file]['Sodium_GUID']['raw_value']
        vcf_size = file_guids[vcf_file]['File size']['raw_value']
        vcf_md5 = file_guids[vcf_file]['md5sum']['raw_value']

        csi_doi = file_guids[csi_file]['Sodium_GUID']['raw_value']
        csi_md5 = file_guids[csi_file]['md5sum']['raw_value']

        # Google Cloud Platform / Google Storage copy
        gs_vcf_access = DatsObj("Access", [("accessURL", gs_vcf)])
        gs_vcf_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", gs_vcf_access),
                ("identifier", DatsObj("Identifier", [("identifier", vcf_doi)])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier", [("identifier", csi_doi),
                                                  ("relationType", "vcf_index")])
                ]),
                ("size", filesize_to_int(vcf_size)),
                # TODO - add file size units
                ("conformsTo", [vcf_dstan])
            ])

        # AWS / S3 copy
        s3_vcf = ms['s3_vcf']['mapped_value']
        s3_csi = ms['s3_csi']['mapped_value']
        s3_vcf_access = DatsObj("Access", [("accessURL", s3_vcf)])
        s3_vcf_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", s3_vcf_access),
                ("identifier", DatsObj("Identifier", [("identifier", vcf_doi)])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier", [("identifier", csi_doi),
                                                  ("relationType", "vcf_index")])
                ]),
                ("size", filesize_to_int(vcf_size)),
                # TODO - add file size units
                ("conformsTo", [vcf_dstan])
            ])

        m = re.match(r'^.*\/([^\/]+)$', gs_vcf)
        if m is None:
            logging.fatal("unable to parse filename from VCF file URI " + gs_vcf)
            sys.exit(1)
        filename = m.group(1)

        # TODO - replace this with DATS-specific MD5 checksum encoding (TBD)
        md5_dimension = DatsObj(
            "Dimension",
            [("name", util.get_value_annotation("MD5", cache)),
             ("values", [vcf_md5])])

        vcf_dataset = DatsObj("Dataset", [
            ("distributions", [gs_vcf_distro, s3_vcf_distro]),
            ("dimensions", [md5_dimension]),
            ("title", filename),
            ("types", [snp_type, cnv_type]),
            ("creators", creators),
        ])

        vcf_da = DatsObj(
            "DataAcquisition",
            [("name", filename),
             ("input", [dats_sample.getIdRef()])
             # ("uses", [])  # software used
             ])

        vcf_dataset.set("producedBy", vcf_da)
        # circular link back to enclosing Dataset as the output
        if not no_circular_links:
            vcf_da.set("output", [vcf_dataset.getIdRef()])
        file_datasets_l.append(vcf_dataset)

    logging.info("found " + str(n_samples_found) + " / " + str(n_samples) + " sample(s) in TOPMed file manifest")
    return file_datasets_l
def get_single_dna_extract_json(cache, study, study_md, subj_var_values, samp_var_values):
    # Almost all samples in the TOPMed WGS phase are blood samples, named "Blood",
    # "Peripheral Blood", etc. A few samples are saliva samples, probably due to
    # sample collection issues.
    name = None
    if 'BODY_SITE' in samp_var_values:
        name = 'BODY_SITE'
    elif 'Body_Site' in samp_var_values:
        name = 'Body_Site'
    elif 'Body Site' in samp_var_values:
        name = 'Body Site'

    anat_id = None
    anatomy_name = None

    if name is not None:
        if "blood" in samp_var_values[name]['value'].lower():
            anatomy_name = "blood"
            anat_id = "0000178"
        elif samp_var_values[name]['value'].lower() == "saliva":
            anatomy_name = "saliva"
            anat_id = "0001836"
        else:
            logging.fatal("encountered BODY_SITE other than 'Blood' and 'Saliva' in TOPMed sample metadata - " +
                          samp_var_values[name]['value'])
            sys.exit(1)

    if anat_id is not None:
        anatomy_identifier = OrderedDict([
            ("identifier", "UBERON:" + str(anat_id)),
            ("identifierSource", "UBERON")])
        anatomy_alt_ids = [
            OrderedDict([
                ("identifier", "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)),
                ("identifierSource", "UBERON")
            ])
        ]

    # extract subject attributes
    gender = None
    age = None
    visit_year = None
    sys_bp = None
    dias_bp = None
    disease = {}
    disease['hypertension'] = "unknown"

    for name in subj_var_values:
        name_upper = name.upper()
        if name_upper == "GENDER" or name_upper == "SEX":
            gender = subj_var_values[name]['value'].lower()
        elif name_upper == "VISIT_AGE" or name_upper == "AGE" or name_upper == "AGE_ENROLL":
            # need to confirm that these all mean the same thing
            age = subj_var_values[name]['value']
        elif name_upper == "VISIT_YEAR":
            visit_year = subj_var_values[name]['value']
        elif name_upper == "SYSBP":
            sys_bp = subj_var_values[name]['value']
        elif name_upper == "DIASBP":
            dias_bp = subj_var_values[name]['value']
        elif name_upper == "HYPERTENSION" or name_upper == "HIGHBLOODPRES":
            if subj_var_values[name]['value'].lower() == "yes" or subj_var_values[name]['value'] == '1':
                disease['hypertension'] = "yes"
            elif re.match(r'\S', subj_var_values[name]['value']):
                disease['hypertension'] = "no"

    # anatomical part
    anatomical_part = None
    if anatomy_name is not None:
        anatomical_part = DatsObj("AnatomicalPart",
                                  [("name", anatomy_name),
                                   ("identifier", anatomy_identifier),
                                   ("alternateIdentifiers", anatomy_alt_ids)])

    subject_characteristics = []
    subject_bearerOfDisease = []

    # harmonized/standardized characteristics
    if gender is not None:
        subject_sex = DatsObj(
            "Dimension",
            [("name", DatsObj("Annotation", [("value", "Gender")])),
             ("description", "Gender of the subject"),
             ("values", [gender])])
        subject_characteristics.append(subject_sex)

    if age is not None:
        subject_age = DatsObj(
            "Dimension",
            [("name", DatsObj("Annotation", [("value", "Age")])),
             ("description", "Age of the subject"),
             ("values", [age])])
        subject_characteristics.append(subject_age)

    if visit_year is not None:
        subject_visitYear = DatsObj(
            "Dimension",
            [("name", DatsObj("Annotation", [("value", "Visit year")])),
             ("description", "Year of visit, to use for longitudinal analysis"),
             ("values", [visit_year])])
        subject_characteristics.append(subject_visitYear)

    if sys_bp is not None:
        subject_sysBP = DatsObj(
            "Dimension",
            [("name", DatsObj("Annotation", [("value", "Systolic blood pressure")])),
             ("description", "Systolic blood pressure of subject, measured in mmHg"),
             ("values", [sys_bp])])
        subject_characteristics.append(subject_sysBP)

    if dias_bp is not None:
        subject_diasBP = DatsObj(
            "Dimension",
            [("name", DatsObj("Annotation", [("value", "Diastolic blood pressure")])),
             ("description", "Diastolic blood pressure of subject, measured in mmHg"),
             ("values", [dias_bp])])
        subject_characteristics.append(subject_diasBP)

    if disease['hypertension'] != "unknown":
        disease_name = "hypertension"
        disease_id = "10763"
        disease_identifier = OrderedDict([
            ("identifier", "DOID:" + str(disease_id)),
            ("identifierSource", "Disease Ontology")])
        disease_alt_ids = [
            OrderedDict([
                ("identifier", "http://purl.obolibrary.org/obo/DOID_" + str(disease_id)),
                ("identifierSource", "Disease Ontology")
            ])
        ]
        subject_hypertension = DatsObj("Disease", [
            ("name", "Hypertension"),
            ("identifier", disease_identifier),
            ("alternateIdentifiers", disease_alt_ids),
            ("diseaseStatus", DatsObj("Annotation", [("value", disease['hypertension']),
                                                     ("valueIRI", "")])),
        ])
        subject_bearerOfDisease.append(subject_hypertension)

    # create a DATS Dimension from a dbGaP variable value
    def make_var_dimension(name, var_value):
        value = var_value["value"]
        dim = DatsObj("Dimension",
                      [("name", DatsObj("Annotation", [("value", name)])),
                       ("values", [value])])
        # find existing DATS identifier for the corresponding Dataset Dimension
        if "var" in var_value:
            id = var_value["var"]["id"]
            dbgap_var_dim = study_md['dbgap_vars'][id]
            dim.setProperty("identifier", dbgap_var_dim.get("identifier").getIdRef())
        return dim

    # create DATS Dimensions for dbGaP subject metadata
    subject_dimensions = [
        make_var_dimension(vname, subj_var_values[vname])
        for vname in sorted(subj_var_values)
    ]

    # create DATS Dimensions for dbGaP sample metadata
    sample_dimensions = [
        make_var_dimension(vname, samp_var_values[vname])
        for vname in sorted(samp_var_values)
    ]

    # "raw" characteristics from dbGaP metadata
    subject_characteristics.extend(subject_dimensions)
    sample_characteristics = sample_dimensions

    human_t = util.get_taxon_human(cache)
    subj_id = subj_var_values['SUBJECT_ID']['value']
    dbgap_subj_id = subj_var_values['dbGaP_Subject_ID']['value']
    samp_id = samp_var_values['SAMPLE_ID']['value']
    dbgap_samp_id = samp_var_values['dbGaP_Sample_ID']['value']
    study_title = study.get("title")

    # human experimental subject/patient
    subject_material = DatsObj(
        "Material",
        [("name", subj_id),
         ("identifier", {"identifier": subj_id}),
         ("alternateIdentifiers", [util.get_alt_id(dbgap_subj_id, "dbGaP")]),
         ("description", study_title + " subject " + subj_id),
         ("characteristics", subject_characteristics),
         ("bearerOfDisease", subject_bearerOfDisease),
         ("taxonomy", [human_t]),
         ("roles", util.get_donor_roles(cache))])

    # TODO - use DatsObjCache
    specimen_annot = util.get_annotation("specimen")
    dna_extract_annot = util.get_annotation("DNA extract")

    # biological/tissue sample
    sample_name = samp_id
    sample_derives_from = [subject_material]
    sample_descr = "specimen collected from subject " + subj_id
    if anatomical_part is not None:
        sample_derives_from.append(anatomical_part)
        sample_descr = anatomy_name + " " + sample_descr

    biological_sample_material = DatsObj(
        "Material",
        [("name", sample_name),
         ("identifier", {"identifier": samp_id}),
         ("alternateIdentifiers", [util.get_alt_id(dbgap_samp_id, "dbGaP")]),
         ("description", sample_descr),
         ("characteristics", sample_characteristics),
         ("taxonomy", [human_t]),
         ("roles", [specimen_annot]),
         ("derivesFrom", sample_derives_from)])

    # DNA extracted from tissue sample
    dna_descr = "DNA extracted from specimen collected from subject " + subj_id
    if anatomical_part is not None:
        dna_descr = "DNA extracted from " + anatomy_name + " specimen collected from subject " + subj_id

    dna_material = DatsObj("Material",
                           [("name", "DNA from " + sample_name),
                            ("description", dna_descr),
                            ("taxonomy", [human_t]),
                            ("roles", [dna_extract_annot]),
                            ("derivesFrom", [biological_sample_material])])

    return dna_material
def get_sample_dats_material(cache, dats_subject, study, study_md, samp_var_values):
    # Almost all samples in the TOPMed WGS phase are blood samples, named "Blood",
    # "Peripheral Blood", etc. A few samples are saliva samples, probably due to
    # sample collection issues.
    name = None
    if 'BODY_SITE' in samp_var_values:
        name = 'BODY_SITE'
    elif 'Body_Site' in samp_var_values:
        name = 'Body_Site'
    elif 'Body Site' in samp_var_values:
        name = 'Body Site'

    anat_id = None
    anatomy_name = None

    if name is not None:
        if "blood" in samp_var_values[name]['value'].lower():
            anatomy_name = "blood"
            anat_id = "0000178"
        elif samp_var_values[name]['value'].lower() == "saliva":
            anatomy_name = "saliva"
            anat_id = "0001836"
        else:
            logging.fatal("encountered BODY_SITE other than 'Blood' and 'Saliva' in TOPMed sample metadata - " +
                          samp_var_values[name]['value'])
            sys.exit(1)

    def make_anat_part(anat_id, anatomy_name):
        # anatomical part
        anatomical_part = DatsObj("AnatomicalPart", [("name", anatomy_name)])
        if anat_id is not None:
            anatomy_identifier = OrderedDict([
                ("identifier", "UBERON:" + str(anat_id)),
                ("identifierSource", "UBERON")])
            anatomy_alt_ids = [
                OrderedDict([
                    ("identifier", "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)),
                    ("identifierSource", "UBERON")
                ])
            ]
            anatomical_part.set("identifier", anatomy_identifier)
            anatomical_part.set("alternateIdentifiers", anatomy_alt_ids)
        return anatomical_part

    if anatomy_name is not None:
        # use cached value for AnatomicalPart if possible
        anat_part_key = ":".join(["AnatomicalPart", anatomy_name])
        anatomical_part = cache.get_obj_or_ref(
            anat_part_key, lambda: make_anat_part(anat_id, anatomy_name))
    else:
        anatomical_part = None

    # create a DATS Dimension from a dbGaP variable value
    def make_var_dimension(name, var_value):
        value = var_value["value"]
        dim = DatsObj("Dimension",
                      [("name", DatsObj("Annotation", [("value", name)])),
                       ("values", [value])])
        # find existing DATS identifier for the corresponding Dataset Dimension
        if "var" in var_value:
            dbgap_var_dim = var_value["var"]["dim"]
            dim.setProperty("identifier", dbgap_var_dim.get("identifier").getIdRef())
        return dim

    # create DATS Dimensions for dbGaP sample metadata
    sample_dimensions = [
        make_var_dimension(vname, samp_var_values[vname])
        for vname in sorted(samp_var_values)
    ]
    sample_characteristics = sample_dimensions

    samp_id = samp_var_values['SAMPLE_ID']['value']
    dbgap_samp_id = samp_var_values['dbGaP_Sample_ID']['value']
    study_title = study.get("title")
    specimen_annot = util.get_annotation("specimen", cache)

    # corresponding DATS subject Material
    subj_key = ":".join(["Material", dats_subject.get("name")])
    dats_subj = cache.get_obj_or_ref(subj_key, lambda: dats_subject)
    dats_subj_name = dats_subject.get("name")

    # biological/tissue sample
    sample_name = samp_id
    sample_derives_from = [dats_subj]
    sample_descr = "specimen collected from subject " + dats_subj_name
    if anatomical_part is not None:
        sample_derives_from.append(anatomical_part)
        sample_descr = anatomy_name + " " + sample_descr

    biological_sample_material = DatsObj(
        "Material",
        [("name", sample_name),
         ("identifier", {"identifier": samp_id}),
         ("alternateIdentifiers", [util.get_alt_id(dbgap_samp_id, "dbGaP")]),
         ("description", sample_descr),
         ("characteristics", sample_characteristics),
         ("taxonomy", [util.get_taxon_human(cache)]),
         ("roles", [specimen_annot]),
         ("derivesFrom", sample_derives_from)])

    # RNA or DNA extracted from tissue sample
    stype = "DNA"  # TODO - check if RNA, not DNA
    dna_or_rna_descr = stype + " extracted from specimen collected from subject " + dats_subj_name
    if anatomical_part is not None:
        dna_or_rna_descr = stype + " extracted from " + anatomy_name + " specimen collected from subject " + dats_subj_name

    dna_or_rna_material = DatsObj(
        "Material",
        [("name", stype + " from " + sample_name),
         ("description", dna_or_rna_descr),
         ("taxonomy", [util.get_taxon_human(cache)]),
         ("roles", [util.get_annotation(stype + " extract", cache)]),
         ("derivesFrom", [biological_sample_material])])

    return dna_or_rna_material
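# The cache.get_obj_or_ref(key, fn) calls used throughout these functions follow
# one pattern: on the first lookup of a key, the object built by fn() is cached
# and returned; later lookups return a DATS id reference to it, so repeated
# structures (data types, organizations, annotations) are emitted once and
# referenced thereafter. A minimal sketch of that contract (hypothetical, not
# the real DatsObjCache implementation):
#
#   class DatsObjCache:
#       def __init__(self):
#           self.cache = {}
#       def get_obj_or_ref(self, key, fn):
#           if key in self.cache:
#               return self.cache[key].getIdRef()
#           obj = fn()
#           self.cache[key] = obj
#           return obj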
def get_dbgap_studies(qterm):
    studies = []
    study = None
    lnum = 0

    # add newline before each occurrence of "Versions" if not already present
    lines = []
    for line in GTEX_STUDIES_STR.split('\n'):
        m = re.match(r'^(\S+.*)(Versions?.*)$', line)
        if m is None:
            lines.append(line)
        else:
            lines.append(m.group(1))
            lines.append(m.group(2))

    for line in lines:
        lnum += 1
        # blank line
        if re.match(r'^\s*$', line):
            continue
        # study id
        m = re.match(r'^(phs\S+)$', line)
        if m is not None:
            study = {'id': m.group(1)}
            studies.append(study)
            continue
        # study description
        m = re.match(r'^Genotype-Tissue Expression(.*)$', line)
        if m is not None:
            study['descr'] = m.group(1)
            continue
        # embargo release(s)
        m = re.match(r'^(Version.*)$', line)
        if m is not None:
            if 'versions' not in study:
                study['versions'] = []
            study['versions'].append(m.group(1))
            continue
        # details/participants/type of study
        m = re.match(r'^VDAS(\d+)(\D.*)Links$', line)
        if m is not None:
            study['n_participants'] = int(m.group(1))
            study['study_type'] = m.group(2)
            continue
        # platform
        m = re.match(r'^(HiSeq.*)$', line)
        if m is not None:
            study['platform'] = m.group(1)
            continue
        # parse error
        logging.fatal("unexpected content at line " + str(lnum) + " of dbGaP studies: " + line)
        sys.exit(1)

    n_studies = len(studies)
    logging.info("found " + str(n_studies) + " GTEx study(ies) in dbGaP")

    # convert studies to DATS Datasets
    datasets = []
    for s in studies:
        m = re.match(r'^phs\d+\.(v\d+)\.p\d+$', s['id'])
        if m is None:
            logging.fatal("unable to parse dataset/study version from study id " + s['id'])
            sys.exit(1)
        version = m.group(1)

        dimensions = [
            DatsObj("Dimension", [
                ("name", {"value": "Actual Subject Count"}),
                ("description", "The actual number of subjects entered into a clinical trial."),
                ("types", [util.get_annotation("Actual Subject Number")]),
                ("values", [s['n_participants']])
            ])
        ]

        types = [
            OrderedDict([
                ("information", util.get_annotation("DNA sequencing")),
                ("method", util.get_annotation("whole genome sequencing assay")),
                ("platform", HISEQ_TYPES[s['platform']])
            ])
        ]

        # TODO - Specify creators and release date(s) of this particular dataset.
        #  This may require parsing some of the metadata files and/or documents.
        # TODO - required field - using NIH NHGRI as a placeholder, but need to
        #  revisit and assign a specific study-level creator
        creators = [NIH_NHGRI]

        # TODO - find better location for study_type?
        extra_props = [
            DatsObj("CategoryValuesPair",
                    [("category", "study_type"),
                     ("values", [s['study_type']])])
        ]

        # Dataset
        dataset = DatsObj(
            "Dataset",
            [
                ("identifier", DatsObj("Identifier", [("identifier", s['id'])])),
                ("version", version),
                # ("dates", []),
                # ("title", s['descr']),
                ("title", "Genotype-Tissue Expression Project (GTEx) WGS and RNA-Seq data"),
                ("storedIn", DB_GAP),
                ("types", types),
                ("creators", creators),
                ("dimensions", dimensions),
                ("extraProperties", extra_props)
                # ("producedBy", data_analysis),
                # ("distributions", [DatsObj("DatasetDistribution", [
                #     ("access", DatsObj("Access", [
                #         ("landingPage", GTEX_DATASETS_URL)
                #     ]))
                # ])]),
            ])
        datasets.append(dataset)

    return datasets
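# Worked example of the "Versions" split above (hypothetical listing line):
#
#   "phs000424.v7.p2Versions 1-7: passed embargo"
#
# matches r'^(\S+.*)(Versions?.*)$' and is emitted as two lines,
# "phs000424.v7.p2" and "Versions 1-7: passed embargo", so the study id and the
# embargo releases can be parsed independently by the line-by-line loop that
# follows.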
# TODO - incorporate additional platforms:
#   HumanOmni5-Quad
#   HumanOmni2.5
#   Infinium HumanExome BeadChip
#   HiSeq 2000
#   HiSeq 2000
#   GeneChip Human Gene 1.0 ST Array
#   HiSeq 2000
#   HiSeq X Ten

DBGAP_QUERY_URL_PREFIX = 'https://www.ncbi.nlm.nih.gov/gap/?term='
DBGAP_GTEX_QUERY_URL = DBGAP_QUERY_URL_PREFIX + 'phs000424'

HISEQ_TYPES = {
    "HiSeq 2000": util.get_annotation("Illumina HiSeq 2000"),
    "HiSeq X Ten": util.get_annotation("Illumina HiSeq X Ten"),
}

# TODO - duplicated from rnaseq_datasets.py
DB_GAP = DatsObj("DataRepository", [("name", "dbGaP")])

NIH_NHGRI = DatsObj("Organization",
                    [("name", "National Human Genome Research Institute"),
                     ("abbreviation", "NHGRI")])

# TODO - use DatsObjCache
cache = None
GTEX_V7_RNASEQ_TYPE = DatsObj(
    "DataType",
    [("information", util.get_annotation("transcription profiling", cache)),
     ("method", util.get_annotation("RNA-seq assay", cache)),
     ("platform", util.get_annotation("Illumina", cache))])
def get_files_dats_datasets(cache, dats_samples_d, p_samples, gh_samples, protected_cram_files, no_circular_links):
    file_datasets = []

    rnaseq_datatype = DatsObj("DataType", [
        ("information", util.get_annotation("transcription profiling", cache)),
        ("method", util.get_annotation("RNA-seq assay", cache)),
        ("platform", util.get_annotation("Illumina", cache))
    ])

    def get_rnaseq_datatype():
        dkey = ".".join(["DataType", "RNA-seq"])
        return cache.get_obj_or_ref(dkey, lambda: rnaseq_datatype)

    wgs_datatype = DatsObj(
        "DataType",
        [("information", util.get_annotation("DNA sequencing", cache)),
         ("method", util.get_annotation("whole genome sequencing assay", cache)),
         ("platform", util.get_annotation("Illumina", cache))])

    def get_wgs_datatype():
        dkey = ".".join(["DataType", "WGS"])
        return cache.get_obj_or_ref(dkey, lambda: wgs_datatype)

    broad_key = ":".join(["Organization", "Broad Institute"])
    broad = cache.get_obj_or_ref(
        broad_key, lambda: DatsObj("Organization", [("name", "Broad Institute")]))
    creators = [broad]

    def make_data_standard(format):
        return DatsObj("DataStandard",
                       [("name", format),
                        ("type", util.get_value_annotation("format", cache)),
                        ("description", format + " file format")])

    cram_ds_key = ":".join(["DataStandard", "CRAM"])
    cram_ds = cache.get_obj_or_ref(cram_ds_key, lambda: make_data_standard("CRAM"))

    crai_ds_key = ":".join(["DataStandard", "CRAI"])
    crai_ds = cache.get_obj_or_ref(crai_ds_key, lambda: make_data_standard("CRAI"))

    for sample_id in protected_cram_files:
        file = protected_cram_files[sample_id]
        material_type = None
        ds_types = None

        # determine file type
        if re.search(r'wgs\/', file['cram_file_aws']['raw_value']):
            material_type = 'DNA'
            ds_types = get_wgs_datatype()
            gcp_suffix = '_gcp'
        elif re.search(r'rnaseq\/', file['cram_file_aws']['raw_value']):
            material_type = 'RNA'
            ds_types = get_rnaseq_datatype()
            gcp_suffix = ''
        else:
            logging.fatal("unable to determine material/sequence type from cram_file_aws=" +
                          file['cram_file_aws']['raw_value'])
            sys.exit(1)

        # RNA-Seq keys = sample_id cram_file cram_file_md5 cram_file_size cram_index cram_file_aws cram_index_aws
        # WGS keys = same as above + firecloud_id
        cram_file = file['cram_file' + gcp_suffix]['raw_value']
        cram_file_md5 = file['cram_file_md5']['raw_value']

        # TODO - review the following encoding decisions:
        #  - storing .crai URI as relatedIdentifier of the DatasetDistribution for the .cram file
        #  - storing MD5 checksum of the .cram file as an extraProperty of the DatasetDistribution
        #  - storing firecloud_id as a relatedIdentifier of the Dataset (not the DatasetDistribution)

        # Google Cloud Platform / Google Storage copy
        gs_access = DatsObj(
            "Access", [("landingPage", file['cram_file' + gcp_suffix]['raw_value'])])
        gs_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", gs_access),
                ("identifier", DatsObj("Identifier", [
                    ("identifier", file['cram_file' + gcp_suffix]['raw_value'])
                ])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier", [
                        ("identifier", file['cram_index' + gcp_suffix]['raw_value']),
                        ("relationType", "cram_index")
                    ])
                ]),
                ("size", int(file['cram_file_size']['raw_value'])),
                # TODO - add unit for bytes, include IRI?
                # ("unit", util.get_value_annotation("bytes", cache))
                ("conformsTo", [
                    cache.get_obj_or_ref(cram_ds_key, lambda: make_data_standard("CRAM"))
                ])
            ])

        # AWS / S3 copy
        s3_access = DatsObj(
            "Access", [("landingPage", file['cram_file_aws']['raw_value'])])
        s3_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", s3_access),
                ("identifier", DatsObj("Identifier", [
                    ("identifier", file['cram_file_aws']['raw_value'])
                ])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier", [
                        ("identifier", file['cram_index_aws']['raw_value']),
                        ("relationType", "cram_index")
                    ])
                ]),
                ("size", int(file['cram_file_size']['raw_value'])),
                # TODO - add unit for bytes, include IRI?
                # ("unit", util.get_value_annotation("bytes", cache))
                ("conformsTo", [
                    cache.get_obj_or_ref(cram_ds_key, lambda: make_data_standard("CRAM"))
                ])
            ])

        m = re.match(r'^.*\/([^\/]+)$', cram_file)
        if m is None:
            logging.fatal("unable to parse filename from CRAM file URI " + cram_file)
            sys.exit(1)
        filename = m.group(1)

        # TODO - replace this with DATS-specific MD5 checksum encoding (TBD)
        md5_dimension = DatsObj(
            "Dimension",
            [("name", util.get_value_annotation("MD5", cache)),
             ("values", [file['cram_file_md5']['raw_value']])])

        ds = DatsObj("Dataset", [
            ("distributions", [gs_distro, s3_distro]),
            ("dimensions", [md5_dimension]),
            ("title", filename),
            ("types", [ds_types]),
            ("creators", creators),
        ])

        # add firecloud_id for WGS
        if 'firecloud_id' in file:
            f_id = DatsObj("RelatedIdentifier",
                           [("identifier", file['firecloud_id']['raw_value']),
                            ("identifierSource", "FireCloud")])
            ds.set("relatedIdentifiers", [f_id])

        # input RNA/DNA extract that was sequenced
        if sample_id not in dats_samples_d:
            logging.fatal("no sample exists for " + sample_id + " found in file " +
                          file['cram_file_aws']['raw_value'])
            sys.exit(1)

        dats_sample = dats_samples_d[sample_id]
        dats_samp_key = ":".join(["Material", dats_sample.get("name")])
        dats_samp = cache.get_obj_or_ref(dats_samp_key, lambda: dats_sample)

        da = DatsObj(
            "DataAcquisition",
            [("name", filename),
             ("input", [dats_samp])
             # ("uses", [])  # software used
             ])

        if not no_circular_links:
            # circular link back to enclosing Dataset as the output
            da.set("output", [ds.getIdRef()])

        ds.set("producedBy", da)
        file_datasets.append(ds)

    return file_datasets
def get_sample_dats_material(cache, dats_subject, p_sample, gh_sample, var_lookup):
    samp_id = p_sample['SAMPID']['mapped_value']
    subj_id = p_sample['SUBJID']['mapped_value']

    # retrieve id reference for the Identifier of the DATS Dimension for the
    # "all subjects" consent group version of the variable
    def get_var_id(name):
        return var_lookup[name]['dim'].get("identifier").getIdRef()

    # Uberon id (or EFO id, contrary to the documentation)
    anat_id = p_sample['SMUBRID']['mapped_value']
    if anat_id is None:
        logging.fatal("No Uberon/anatomy ID specified for sample " + samp_id)
        sys.exit(1)

    anatomy_identifier = None
    anatomy_alt_ids = None
    # TODO - query anatomy term from UBERON/EFO instead?
    anatomy_name = p_sample['SMTSD']['mapped_value']

    def make_anat_part(anat_id, anatomy_name):
        # EFO id
        if re.match(r'^EFO_\d+', anat_id):
            anatomy_identifier = OrderedDict([
                ("identifier", anat_id),
                ("identifierSource", "EFO")])
            anatomy_alt_ids = [
                OrderedDict([
                    ("identifier", "https://www.ebi.ac.uk/ols/ontologies/efo/terms?short_form=" + str(anat_id)),
                    ("identifierSource", "EFO")])
            ]
        # Uberon id
        else:
            anatomy_identifier = OrderedDict([
                ("identifier", "UBERON:" + str(anat_id)),
                ("identifierSource", "UBERON")])
            anatomy_alt_ids = [
                OrderedDict([
                    ("identifier", "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)),
                    ("identifierSource", "UBERON")
                ])
            ]

        # anatomical part
        anatomical_part = DatsObj("AnatomicalPart",
                                  [("name", anatomy_name),
                                   ("identifier", anatomy_identifier),
                                   ("alternateIdentifiers", anatomy_alt_ids)])
        return anatomical_part

    # use cached value for AnatomicalPart if possible
    anat_part_key = ":".join(["AnatomicalPart", anatomy_name])
    anatomical_part = cache.get_obj_or_ref(
        anat_part_key, lambda: make_anat_part(anat_id, anatomy_name))

    # use URI from GitHub GTEx id dump if available
    identifier = samp_id
    if gh_sample is not None:
        identifier = gh_sample['Destination URL']['raw_value']

    subj_key = ":".join(["Material", dats_subject.get("name")])
    dats_subj = cache.get_obj_or_ref(subj_key, lambda: dats_subject)

    # add sample characteristics from p_sample metadata
    sample_chars = []
    for key in p_sample:
        if re.match(r'^(subject|id)$', key):
            continue
        # TODO - currently including only a small subset of the available values for demonstration purposes
        if not re.match(r'^(SMATSSCR|SMRIN|SMMAPRT|SMGNSDTC)$', key):
            continue
        var = p_sample[key]
        # print("got key=" + key + " var=" + str(var))
        mapped_val = var['mapped_value']
        char = DatsObj("Dimension",
                       [("name", util.get_value_annotation(key, cache)),
                        ("identifier", get_var_id(key)),
                        ("values", [mapped_val])])
        sample_chars.append(char)

    # biological/tissue sample
    biological_sample_material = DatsObj(
        "Material",
        [("name", samp_id),
         ("identifier", {"identifier": identifier}),
         ("description", anatomy_name + " specimen collected from subject " + subj_id),
         ("characteristics", sample_chars),
         ("taxonomy", [util.get_taxon_human(cache)]),
         ("roles", [util.get_annotation("specimen", cache)]),
         ("derivesFrom", [dats_subj, anatomical_part])])

    # analysis freeze classification
    smafrze = p_sample['SMAFRZE']['mapped_value']
    # expected sequence type depending on data freeze classification
    expected_stype = None

    stype = None
    if smafrze == "RNASEQ":
        expected_stype = "RNA"
    elif smafrze == "WGS":
        expected_stype = "DNA"
    elif smafrze == "WES":
        expected_stype = "DNA"
    # Illumina OMNI SNP Array
    elif smafrze == "OMNI":
        expected_stype = "DNA"
    elif smafrze == "EXCLUDE":
        pass
    else:
        logging.fatal("unknown SMAFRZE " + smafrze)
        sys.exit(1)

    # sample type - DNA or RNA
    stype = None
    smnabtcht = p_sample['SMNABTCHT']['mapped_value']
    if re.match(r'^DNA ([iI]solation|[eE]xtraction).*', smnabtcht):
        stype = 'DNA'
    elif re.match(r'^RNA ([iI]solation|[eE]xtraction).*', smnabtcht):
        stype = 'RNA'
    elif re.match(r'DNA or RNA Extraction from Paxgene-derived Lysate Plate Based', smnabtcht):
        stype = 'RNA'
    elif re.match(r'Transfer To Matrix \(Manual\)', smnabtcht):
        stype = 'DNA'

    if stype is None:
        if expected_stype is not None:
            stype = expected_stype
        else:
            logging.warning("couldn't determine sequence type for smafrze=" + smafrze +
                            " smnabtcht=" + smnabtcht)
            return None
    else:
        if (expected_stype is not None) and (stype != expected_stype):
            logging.fatal("seq type " + stype + " doesn't match expected stype " + expected_stype)
            sys.exit(1)

    # DNA or RNA extract
    dna_or_rna_material = DatsObj(
        "Material",
        [("name", stype + " from " + samp_id),
         ("description", "total " + stype + " extracted from " + anatomy_name +
          " specimen collected from subject " + subj_id),
         ("taxonomy", [util.get_taxon_human(cache)]),
         ("roles", [util.get_annotation(stype + " extract", cache)]),
         ("derivesFrom", [biological_sample_material])])

    return dna_or_rna_material
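# Minimal usage sketch (hypothetical inputs): p_sample mirrors a row of the GTEx
# sample attribute dump, gh_sample a row of the GitHub id dump, and var_lookup
# maps variable names to their "all subjects" consent-group Dimensions. The
# function returns None when the sequence type cannot be determined:
#
#   material = get_sample_dats_material(cache, dats_subject, p_sample, gh_sample, var_lookup)
#   if material is not None:
#       dats_samples_d[p_sample['SAMPID']['mapped_value']] = material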