def get_subject_dats_material(cache, p_subject, gh_subject, var_lookup):
    """Build the DATS Material describing one GTEx subject.

    Args:
        cache: object cache; handed to the ``util`` helpers and used through
            ``get_obj_or_ref`` to register the subject Material.
        p_subject: subject record; the ``SUBJID``, ``SEX``, ``AGE`` and
            ``DTHHRDY`` entries are read through their ``'mapped_value'``.
        gh_subject: optional record from the GTEx id dump. When it is not
            None, its ``'Destination URL'`` raw value is used as the Material
            identifier instead of the subject id.
        var_lookup: maps a variable name to an entry whose ``'dim'`` is the
            DATS Dimension for the "all subjects" consent group version of
            that variable.

    Returns:
        The result of ``cache.get_obj_or_ref`` for the key
        ``"Material:<subject id>"`` (presumably the Material itself or a
        reference to it -- confirm against the cache implementation).
    """
    subject_id = p_subject['SUBJID']['mapped_value']

    def build_dimension(label, description, var_name):
        # The Dimension reuses (by id reference) the Identifier of the
        # matching "all subjects" dataset-level Dimension.
        annotation = util.get_value_annotation(label, cache)
        id_ref = var_lookup[var_name]['dim'].get("identifier").getIdRef()
        return DatsObj("Dimension", [
            ("name", annotation),
            ("description", description),
            ("identifier", id_ref),
            ("values", [p_subject[var_name]['mapped_value']]),
        ])

    # (annotation label, description, GTEx variable name), in output order
    dimension_specs = (
        ("Gender", "Gender of the subject", "SEX"),
        ("Age range", "Age range of the subject", "AGE"),
        ("Hardy scale", "Hardy scale death classification for the subject", "DTHHRDY"),
    )
    characteristics = [build_dimension(*spec) for spec in dimension_specs]

    # prefer the URI from the GTEx id dump when one was supplied
    material_identifier = (
        subject_id if gh_subject is None
        else gh_subject['Destination URL']['raw_value']
    )

    # human experimental subject/patient
    material = DatsObj("Material", [
        ("name", subject_id),
        ("identifier", DatsObj("Identifier", [("identifier", material_identifier)])),
        ("description", "GTEx subject " + subject_id),
        ("characteristics", characteristics),
        ("taxonomy", [util.get_taxon_human(cache)]),
        ("roles", util.get_donor_roles(cache)),
    ])

    # register with the cache and hand back whatever the cache returns
    cache_key = ":".join(["Material", subject_id])
    return cache.get_obj_or_ref(cache_key, lambda: material)
def get_single_sample_json(sample, dats_obj_cache):
    """Convert one GTEx sample record into a DATS Material for its RNA extract.

    The returned Material derives from a biological-sample Material, which in
    turn derives from the subject Material and an AnatomicalPart. The
    AnatomicalPart and subject Material are stored in, and reused from,
    ``dats_obj_cache`` under the keys ``"AnatomicalPart:<name>"`` and
    ``"Material:<subject id>"``.

    Args:
        sample: sample record; ``SAMPID``, ``SUBJID``, ``SMUBRID`` and
            ``SMTSD`` are read through ``'mapped_value'`` and ``'subject'``
            holds the subject record (``SEX``, ``AGE``, ``DTHHRDY``).
        dats_obj_cache: mapping used with ``in`` / item access as an object
            cache; also passed to the ``util`` helpers.

    Returns:
        The DatsObj "Material" for the RNA extracted from the sample.

    Exits the process with status 1 (after printing a message) when the
    sample has no ``SMUBRID`` value.
    """
    sample_id = sample['SAMPID']['mapped_value']
    subject_id = sample['SUBJID']['mapped_value']
    subject_rec = sample['subject']

    # Uberon id (or EFO id, contrary to the documentation)
    ontology_id = sample['SMUBRID']['mapped_value']
    if ontology_id is None:
        print("No Uberon/anatomy ID specified for sample " + sample_id)
        sys.exit(1)

    # TODO - query anatomy term from UBERON/EFO instead?
    tissue_name = sample['SMTSD']['mapped_value']

    if re.match(r'^EFO_\d+', ontology_id):
        # EFO id: used verbatim as the identifier
        id_source = "EFO"
        primary_id = ontology_id
        term_url = "https://www.ebi.ac.uk/ols/ontologies/efo/terms?short_form=" + str(ontology_id)
    else:
        # Uberon id: bare accession, so prefix it
        id_source = "UBERON"
        primary_id = "UBERON:" + str(ontology_id)
        term_url = "http://purl.obolibrary.org/obo/UBERON_" + str(ontology_id)

    anatomy_identifier = OrderedDict([
        ("identifier", primary_id),
        ("identifierSource", id_source),
    ])
    anatomy_alt_ids = [
        OrderedDict([("identifier", term_url), ("identifierSource", id_source)]),
    ]

    # anatomical part (one shared object per tissue name)
    part_key = ":".join(["AnatomicalPart", tissue_name])
    if part_key in dats_obj_cache:
        anatomical_part = dats_obj_cache[part_key]
    else:
        anatomical_part = dats_obj_cache[part_key] = DatsObj("AnatomicalPart", [
            ("name", tissue_name),
            ("identifier", anatomy_identifier),
            ("alternateIdentifiers", anatomy_alt_ids),
        ])

    def gtex_dimension(label, description, var_name):
        # Dimension identified by the GTEx variable it was taken from.
        return DatsObj("Dimension", [
            ("name", DatsObj("Annotation", [("value", label)])),
            ("description", description),
            ("identifier", DatsObj("Identifier", [
                ("identifier", var_name),
                ("identifierSource", "GTEx"),
            ])),
            ("values", [subject_rec[var_name]['mapped_value']]),
        ])

    # Built before the cache lookup, exactly as the original code does.
    subject_characteristics = [
        gtex_dimension("Gender", "Gender of the subject", "SEX"),
        gtex_dimension("Age range", "Age range of the subject", "AGE"),
        gtex_dimension("Hardy scale", "Hardy scale death classification for the subject", "DTHHRDY"),
    ]

    # human experimental subject/patient (one shared object per subject)
    subject_key = ":".join(["Material", subject_id])
    if subject_key in dats_obj_cache:
        subject_material = dats_obj_cache[subject_key]
    else:
        subject_material = dats_obj_cache[subject_key] = DatsObj("Material", [
            ("name", subject_id),
            ("identifier", {"identifier": subject_id}),
            ("description", "GTEx subject " + subject_id),
            ("characteristics", subject_characteristics),
            ("taxonomy", [util.get_taxon_human(dats_obj_cache)]),
            ("roles", util.get_donor_roles(dats_obj_cache)),
        ])

    specimen_role = util.get_annotation("specimen", dats_obj_cache)
    rna_extract_role = util.get_annotation("RNA extract", dats_obj_cache)

    # biological/tissue sample
    tissue_sample = DatsObj("Material", [
        ("name", sample_id),
        ("identifier", {"identifier": sample_id}),
        ("description", tissue_name + " specimen collected from subject " + subject_id),
        ("taxonomy", [util.get_taxon_human(dats_obj_cache)]),
        ("roles", [specimen_role]),
        ("derivesFrom", [subject_material, anatomical_part]),
    ])

    # RNA extracted from tissue sample
    return DatsObj("Material", [
        ("name", "RNA from " + sample_id),
        ("description", "total RNA extracted from " + tissue_name
                        + " specimen collected from subject " + subject_id),
        ("taxonomy", [util.get_taxon_human(dats_obj_cache)]),
        ("roles", [rna_extract_role]),
        ("derivesFrom", [tissue_sample]),
    ])
def get_subject_dats_material(cache, study, study_md, subj_var_values):
    """Build the DATS Material describing one dbGaP subject.

    A handful of well-known variables (sex, age, visit year, blood pressure,
    hypertension) are mapped to harmonized characteristics / a Disease entry.
    Every variable in ``subj_var_values`` is additionally emitted as a "raw"
    Dimension, in sorted name order.

    Args:
        cache: object cache; passed to the ``util`` helpers and used through
            ``get_obj_or_ref`` to register the subject Material.
        study: object whose ``get("title")`` supplies the study title used in
            the Material description.
        study_md: not used by this function; kept for interface
            compatibility.
        subj_var_values: maps variable name to a dict with a ``'value'`` and,
            optionally, a ``'var'`` whose ``'dim'`` is the matching
            dataset-level Dimension. Must contain ``SUBJECT_ID`` and
            ``dbGaP_Subject_ID``.

    Returns:
        The result of ``cache.get_obj_or_ref`` for the key
        ``"Material:<subject id>"`` (presumably the Material itself or a
        reference to it -- confirm against the cache implementation).

    Raises:
        KeyError: if ``SUBJECT_ID`` or ``dbGaP_Subject_ID`` is missing.
    """
    # extract subject attributes
    gender = None
    age = None
    visit_year = None
    sys_bp = None
    dias_bp = None
    hypertension = "unknown"

    for var_name in subj_var_values:
        var_upper = var_name.upper()
        entry = subj_var_values[var_name]
        if var_upper in ("GENDER", "SEX"):
            gender = entry['value'].lower()
        elif var_upper in ("VISIT_AGE", "AGE", "AGE_ENROLL"):
            # need to confirm that these all mean the same thing
            age = entry['value']
        elif var_upper == "VISIT_YEAR":
            visit_year = entry['value']
        elif var_upper == "SYSBP":
            sys_bp = entry['value']
        elif var_upper == "DIASBP":
            dias_bp = entry['value']
        elif var_upper in ("HYPERTENSION", "HIGHBLOODPRES"):
            raw = entry['value']
            if raw.lower() == "yes" or raw == '1':
                hypertension = "yes"
            elif re.match(r'\S', raw):
                # any other value starting with a non-blank character counts
                # as "no"; empty/blank-leading values leave it "unknown"
                hypertension = "no"

    subject_characteristics = []
    subject_bearerOfDisease = []

    def add_characteristic(label, description, value):
        # Append a harmonized Dimension only when the variable was present.
        if value is None:
            return
        subject_characteristics.append(DatsObj("Dimension", [
            ("name", DatsObj("Annotation", [("value", label)])),
            ("description", description),
            ("values", [value]),
        ]))

    # harmonized/standardized characteristics
    add_characteristic("Gender", "Gender of the subject", gender)
    add_characteristic("Age", "Age of the subject", age)
    add_characteristic("Visit year",
                       "Year of visit, to use for longitudinal analysis",
                       visit_year)
    add_characteristic("Systolic blood pressure",
                       "Systolic blood pressure of subject, measured in mmHg",
                       sys_bp)
    add_characteristic("Diastolic blood pressure",
                       "Diastolic blood pressure of subject, measured in mmHg",
                       dias_bp)

    if hypertension != "unknown":
        disease_id = "10763"
        disease_identifier = OrderedDict([
            ("identifier", "DOID:" + disease_id),
            ("identifierSource", "Disease Ontology"),
        ])
        disease_alt_ids = [OrderedDict([
            ("identifier", "http://purl.obolibrary.org/obo/DOID_" + disease_id),
            ("identifierSource", "Disease Ontology"),
        ])]
        subject_bearerOfDisease.append(DatsObj("Disease", [
            ("name", "Hypertension"),
            ("identifier", disease_identifier),
            ("alternateIdentifiers", disease_alt_ids),
            ("diseaseStatus", DatsObj("Annotation", [
                ("value", hypertension),
                ("valueIRI", ""),
            ])),
        ]))

    # create a DATS Dimension from a dbGaP variable value
    def make_var_dimension(name, var_value):
        dim = DatsObj("Dimension", [
            ("name", DatsObj("Annotation", [("value", name)])),
            ("values", [var_value["value"]]),
        ])
        # find existing DATS identifier for the corresponding Dataset Dimension
        if "var" in var_value:
            dbgap_var_dim = var_value["var"]["dim"]
            dim.setProperty("identifier", dbgap_var_dim.get("identifier").getIdRef())
        return dim

    # "raw" characteristics from dbGaP metadata, after the harmonized ones
    subject_characteristics.extend(
        make_var_dimension(vname, subj_var_values[vname])
        for vname in sorted(subj_var_values)
    )

    human_t = util.get_taxon_human(cache)
    subj_id = subj_var_values['SUBJECT_ID']['value']
    dbgap_subj_id = subj_var_values['dbGaP_Subject_ID']['value']
    study_title = study.get("title")

    # human experimental subject/patient
    subject_material = DatsObj("Material", [
        ("name", subj_id),
        ("identifier", {"identifier": subj_id}),
        ("alternateIdentifiers", [util.get_alt_id(dbgap_subj_id, "dbGaP")]),
        ("description", study_title + " subject " + subj_id),
        ("characteristics", subject_characteristics),
        ("bearerOfDisease", subject_bearerOfDisease),
        ("taxonomy", [human_t]),
        ("roles", util.get_donor_roles(cache)),
    ])

    # add to the cache
    subj_key = ":".join(["Material", subj_id])
    return cache.get_obj_or_ref(subj_key, lambda: subject_material)
def get_single_dna_extract_json(cache, study, study_md, subj_var_values, samp_var_values):
    """Build the DATS Material for the DNA extract of one TOPMed sample.

    The returned Material derives from a biological-sample Material, which
    derives from the subject Material and, when a body site is recorded, an
    AnatomicalPart (blood or saliva).

    Args:
        cache: object cache passed to ``util.get_taxon_human`` and
            ``util.get_donor_roles``.
        study: object whose ``get("title")`` supplies the study title.
        study_md: study metadata; ``study_md['dbgap_vars']`` is indexed by
            variable id to find the dataset-level Dimension of a variable.
        subj_var_values: maps subject variable name to a dict with a
            ``'value'`` and, optionally, a ``'var'`` carrying an ``'id'``.
            Must contain ``SUBJECT_ID`` and ``dbGaP_Subject_ID``.
        samp_var_values: same structure for sample variables. Must contain
            ``SAMPLE_ID`` and ``dbGaP_Sample_ID``; the body site is read from
            ``BODY_SITE``, ``Body_Site`` or ``Body Site`` (first one present).

    Returns:
        The DatsObj "Material" for the extracted DNA.

    Raises:
        KeyError: if one of the required id variables is missing.

    Exits the process with status 1 (after ``logging.fatal``) when a body
    site is present but is neither a blood variant nor saliva.
    """
    # Almost all samples in TOPMed WGS phase are blood samples, named "Blood",
    # "Peripheral Blood"... Few samples are saliva samples, probably due to
    # sample collection issues.
    body_site_key = next(
        (key for key in ('BODY_SITE', 'Body_Site', 'Body Site')
         if key in samp_var_values),
        None)

    anat_id = None
    anatomy_name = None

    if body_site_key is not None:
        body_site = samp_var_values[body_site_key]['value']
        body_site_lc = body_site.lower()
        if "blood" in body_site_lc:
            anatomy_name = "blood"
            anat_id = "0000178"
        elif body_site_lc == "saliva":
            anatomy_name = "saliva"
            anat_id = "0001836"
        else:
            # Report the value under the key that was actually found; the
            # literal 'BODY_SITE' key may not exist in this record.
            logging.fatal(
                "encountered BODY_SITE other than 'Blood' and 'Saliva' in TOPMed sample metadata - "
                + body_site)
            sys.exit(1)

    if anat_id is not None:
        anatomy_identifier = OrderedDict([
            ("identifier", "UBERON:" + anat_id),
            ("identifierSource", "UBERON"),
        ])
        anatomy_alt_ids = [OrderedDict([
            ("identifier", "http://purl.obolibrary.org/obo/UBERON_" + anat_id),
            ("identifierSource", "UBERON"),
        ])]

    # extract subject attributes
    gender = None
    age = None
    visit_year = None
    sys_bp = None
    dias_bp = None
    hypertension = "unknown"

    for var_name in subj_var_values:
        var_upper = var_name.upper()
        entry = subj_var_values[var_name]
        if var_upper in ("GENDER", "SEX"):
            gender = entry['value'].lower()
        elif var_upper in ("VISIT_AGE", "AGE", "AGE_ENROLL"):
            # need to confirm that these all mean the same thing
            age = entry['value']
        elif var_upper == "VISIT_YEAR":
            visit_year = entry['value']
        elif var_upper == "SYSBP":
            sys_bp = entry['value']
        elif var_upper == "DIASBP":
            dias_bp = entry['value']
        elif var_upper in ("HYPERTENSION", "HIGHBLOODPRES"):
            raw = entry['value']
            if raw.lower() == "yes" or raw == '1':
                hypertension = "yes"
            elif re.match(r'\S', raw):
                # any other value starting with a non-blank character counts
                # as "no"; empty/blank-leading values leave it "unknown"
                hypertension = "no"

    # anatomical part (only set when a recognised body site was recorded)
    anatomical_part = None
    if anatomy_name is not None:
        anatomical_part = DatsObj("AnatomicalPart", [
            ("name", anatomy_name),
            ("identifier", anatomy_identifier),
            ("alternateIdentifiers", anatomy_alt_ids),
        ])

    subject_characteristics = []
    subject_bearerOfDisease = []

    def add_characteristic(label, description, value):
        # Append a harmonized Dimension only when the variable was present.
        if value is None:
            return
        subject_characteristics.append(DatsObj("Dimension", [
            ("name", DatsObj("Annotation", [("value", label)])),
            ("description", description),
            ("values", [value]),
        ]))

    # harmonized/standardized characteristics
    add_characteristic("Gender", "Gender of the subject", gender)
    add_characteristic("Age", "Age of the subject", age)
    add_characteristic("Visit year",
                       "Year of visit, to use for longitudinal analysis",
                       visit_year)
    add_characteristic("Systolic blood pressure",
                       "Systolic blood pressure of subject, measured in mmHg",
                       sys_bp)
    add_characteristic("Diastolic blood pressure",
                       "Diastolic blood pressure of subject, measured in mmHg",
                       dias_bp)

    if hypertension != "unknown":
        disease_id = "10763"
        disease_identifier = OrderedDict([
            ("identifier", "DOID:" + disease_id),
            ("identifierSource", "Disease Ontology"),
        ])
        disease_alt_ids = [OrderedDict([
            ("identifier", "http://purl.obolibrary.org/obo/DOID_" + disease_id),
            ("identifierSource", "Disease Ontology"),
        ])]
        subject_bearerOfDisease.append(DatsObj("Disease", [
            ("name", "Hypertension"),
            ("identifier", disease_identifier),
            ("alternateIdentifiers", disease_alt_ids),
            ("diseaseStatus", DatsObj("Annotation", [
                ("value", hypertension),
                ("valueIRI", ""),
            ])),
        ]))

    # create a DATS Dimension from a dbGaP variable value
    def make_var_dimension(name, var_value):
        dim = DatsObj("Dimension", [
            ("name", DatsObj("Annotation", [("value", name)])),
            ("values", [var_value["value"]]),
        ])
        # find existing DATS identifier for the corresponding Dataset Dimension
        if "var" in var_value:
            var_id = var_value["var"]["id"]
            dbgap_var_dim = study_md['dbgap_vars'][var_id]
            dim.setProperty("identifier", dbgap_var_dim.get("identifier").getIdRef())
        return dim

    # create DATS Dimensions for dbGaP subject metadata
    subject_dimensions = [
        make_var_dimension(vname, subj_var_values[vname])
        for vname in sorted(subj_var_values)
    ]

    # create DATS Dimensions for dbGaP sample metadata
    sample_characteristics = [
        make_var_dimension(vname, samp_var_values[vname])
        for vname in sorted(samp_var_values)
    ]

    # "raw" characteristics from dbGaP metadata, after the harmonized ones
    subject_characteristics.extend(subject_dimensions)

    human_t = util.get_taxon_human(cache)
    subj_id = subj_var_values['SUBJECT_ID']['value']
    dbgap_subj_id = subj_var_values['dbGaP_Subject_ID']['value']
    samp_id = samp_var_values['SAMPLE_ID']['value']
    dbgap_samp_id = samp_var_values['dbGaP_Sample_ID']['value']
    study_title = study.get("title")

    # human experimental subject/patient
    subject_material = DatsObj("Material", [
        ("name", subj_id),
        ("identifier", {"identifier": subj_id}),
        ("alternateIdentifiers", [util.get_alt_id(dbgap_subj_id, "dbGaP")]),
        ("description", study_title + " subject " + subj_id),
        ("characteristics", subject_characteristics),
        ("bearerOfDisease", subject_bearerOfDisease),
        ("taxonomy", [human_t]),
        ("roles", util.get_donor_roles(cache)),
    ])

    # TODO - use DatsObjCache
    specimen_annot = util.get_annotation("specimen")
    dna_extract_annot = util.get_annotation("DNA extract")

    # biological/tissue sample
    sample_name = samp_id
    sample_derives_from = [subject_material]
    sample_descr = "specimen collected from subject " + subj_id
    dna_descr = "DNA extracted from specimen collected from subject " + subj_id
    if anatomical_part is not None:
        sample_derives_from.append(anatomical_part)
        sample_descr = anatomy_name + " " + sample_descr
        dna_descr = ("DNA extracted from " + anatomy_name
                     + " specimen collected from subject " + subj_id)

    biological_sample_material = DatsObj("Material", [
        ("name", sample_name),
        ("identifier", {"identifier": samp_id}),
        ("alternateIdentifiers", [util.get_alt_id(dbgap_samp_id, "dbGaP")]),
        ("description", sample_descr),
        ("characteristics", sample_characteristics),
        ("taxonomy", [human_t]),
        ("roles", [specimen_annot]),
        ("derivesFrom", sample_derives_from),
    ])

    # DNA extracted from tissue sample
    dna_material = DatsObj("Material", [
        ("name", "DNA from " + sample_name),
        ("description", dna_descr),
        ("taxonomy", [human_t]),
        ("roles", [dna_extract_annot]),
        ("derivesFrom", [biological_sample_material]),
    ])
    return dna_material
def get_single_dna_extract_json(study, subj_var_values, samp_var_values):
    """Build the DATS Material for the DNA extract of one TOPMed blood sample.

    The returned Material derives from a biological-sample Material, which
    derives from the subject Material and a "blood" AnatomicalPart. All
    original dbGaP subject/sample variables are copied into
    ``extraProperties`` as CategoryValuesPair objects, in sorted name order.

    Args:
        study: object whose ``get("title")`` supplies the study title.
        subj_var_values: maps subject variable name directly to its value.
            Must contain ``SUBJECT_ID`` and ``dbGaP_Subject_ID``; ``GENDER``
            and ``VISIT_AGE`` are used when present.
        samp_var_values: maps sample variable name directly to its value.
            Must contain ``BODY_SITE``, ``SAMPLE_ID`` and ``dbGaP_Sample_ID``.

    Returns:
        The DatsObj "Material" for the extracted DNA.

    Raises:
        KeyError: if one of the required variables is missing.

    Exits the process with status 1 (after ``logging.fatal``) when
    ``BODY_SITE`` is anything other than ``'Blood'``.
    """
    # all samples in TOPMed WGS phase are blood samples
    body_site = samp_var_values['BODY_SITE']
    if body_site != 'Blood':
        # %s formatting keeps the report working for non-string values
        logging.fatal(
            "encountered BODY_SITE other than 'Blood' in TOPMed sample metadata - %s",
            body_site)
        sys.exit(1)

    anatomy_name = "blood"
    anat_id = "0000178"
    anatomy_identifier = OrderedDict([
        ("identifier", "UBERON:" + anat_id),
        ("identifierSource", "UBERON"),
    ])
    anatomy_alt_ids = [OrderedDict([
        ("identifier", "http://purl.obolibrary.org/obo/UBERON_" + anat_id),
        ("identifierSource", "UBERON"),
    ])]

    # extract subject attributes
    # TODO - determine what other subject attributes can be mapped directly to core DATS objects
    gender = subj_var_values["GENDER"].lower() if "GENDER" in subj_var_values else None
    age = subj_var_values["VISIT_AGE"] if "VISIT_AGE" in subj_var_values else None

    # place original dbGaP subject metadata into extraProperties
    # TODO - consider alternative of doing this only for un-harmonized metadata
    subj_extra_props = [
        DatsObj("CategoryValuesPair", [
            ("category", xp),
            ("values", [subj_var_values[xp]]),
        ])
        for xp in sorted(subj_var_values)
    ]

    # extract sample attributes
    # TODO - determine which DATS objects (e.g., biological sample, DNA prep,
    #        sequence data) SEQUENCING_CENTER should attach to
    # TODO - determine what other sample attributes can be mapped directly to core DATS objects
    #        e.g., IS_TUMOR -> bearerOfDisease ("the pathology affecting the material...")

    # place original dbGaP sample metadata into extraProperties
    samp_extra_props = [
        DatsObj("CategoryValuesPair", [
            ("category", xp),
            ("values", [samp_var_values[xp]]),
        ])
        for xp in sorted(samp_var_values)
    ]

    # anatomical part
    anatomical_part = DatsObj("AnatomicalPart", [
        ("name", anatomy_name),
        ("identifier", anatomy_identifier),
        ("alternateIdentifiers", anatomy_alt_ids),
    ])

    # NOTE(review): both Dimensions are always emitted, so a record without
    # GENDER / VISIT_AGE yields values == [None] -- confirm this is intended.
    subject_sex = DatsObj("Dimension", [
        ("name", {"value": "Gender"}),
        ("description", "Gender of the subject"),
        ("values", [gender]),
    ])
    subject_age = DatsObj("Dimension", [
        ("name", {"value": "Age"}),
        ("description", "Age of the subject"),
        ("values", [age]),
    ])
    subject_characteristics = [subject_sex, subject_age]

    human_t = util.get_taxon_human()
    subj_id = subj_var_values['SUBJECT_ID']
    dbgap_subj_id = subj_var_values['dbGaP_Subject_ID']
    samp_id = samp_var_values['SAMPLE_ID']
    dbgap_samp_id = samp_var_values['dbGaP_Sample_ID']
    study_title = study.get("title")

    # human experimental subject/patient
    subject_material = DatsObj("Material", [
        ("name", subj_id),
        ("identifier", {"identifier": subj_id}),
        ("alternateIdentifiers", [util.get_alt_id(dbgap_subj_id, "dbGaP")]),
        ("description", study_title + " subject " + subj_id),
        ("characteristics", subject_characteristics),
        ("taxonomy", human_t),
        ("roles", util.get_donor_roles()),
        ("extraProperties", subj_extra_props),
    ])

    # biological/tissue sample
    sample_name = samp_id
    biological_sample_material = DatsObj("Material", [
        ("name", sample_name),
        ("identifier", {"identifier": samp_id}),
        ("alternateIdentifiers", [util.get_alt_id(dbgap_samp_id, "dbGaP")]),
        ("description", anatomy_name + " specimen collected from subject " + subj_id),
        ("taxonomy", human_t),
        ("roles", [OrderedDict([("value", "specimen"), ("valueIRI", "")])]),
        ("derivesFrom", [subject_material, anatomical_part]),
        ("extraProperties", samp_extra_props),
    ])

    # DNA extracted from tissue sample
    dna_material = DatsObj("Material", [
        ("name", "DNA from " + sample_name),
        ("description", "DNA extracted from " + anatomy_name
                        + " specimen collected from subject " + subj_id),
        ("taxonomy", human_t),
        ("roles", [OrderedDict([("value", "DNA extract"), ("valueIRI", "")])]),
        ("derivesFrom", [biological_sample_material]),
    ])
    return dna_material