def get_subject_dats_material(cache, p_subject, gh_subject, var_lookup):
    """Build the DATS Material for one GTEx subject and register it in the cache.

    Args:
        cache: object cache; handed to the ``util`` helpers and queried through
            ``get_obj_or_ref``.
        p_subject: subject record; the ``SUBJID``, ``SEX``, ``AGE`` and
            ``DTHHRDY`` entries are read through their ``'mapped_value'`` field.
        gh_subject: optional record from the GTEx id dump. When it is not None,
            its ``'Destination URL'`` raw value is used as the identifier
            instead of the subject id.
        var_lookup: mapping from variable name to an entry whose ``'dim'`` is
            the DATS Dimension already created for that variable.

    Returns:
        The result of ``cache.get_obj_or_ref`` for the key
        ``"Material:<subject id>"``.
    """
    subject_id = p_subject['SUBJID']['mapped_value']

    def build_dimension(var_name, label, descr):
        # The identifier is an id reference to the Identifier of the DATS
        # Dimension for the "all subjects" consent group version of the variable.
        return DatsObj("Dimension", [
            ("name", util.get_value_annotation(label, cache)),
            ("description", descr),
            ("identifier", var_lookup[var_name]['dim'].get("identifier").getIdRef()),
            ("values", [p_subject[var_name]['mapped_value']]),
        ])

    # human experimental subject/patient characteristics
    characteristics = [
        build_dimension("SEX", "Gender", "Gender of the subject"),
        build_dimension("AGE", "Age range", "Age range of the subject"),
        build_dimension("DTHHRDY", "Hardy scale",
                        "Hardy scale death classification for the subject"),
    ]

    # use URI from GTEx id dump if present
    if gh_subject is None:
        identifier = subject_id
    else:
        identifier = gh_subject['Destination URL']['raw_value']

    # human experimental subject/patient
    material = DatsObj("Material", [
        ("name", subject_id),
        ("identifier", DatsObj("Identifier", [("identifier", identifier)])),
        ("description", "GTEx subject " + subject_id),
        ("characteristics", characteristics),
        ("taxonomy", [util.get_taxon_human(cache)]),
        ("roles", util.get_donor_roles(cache)),
    ])

    # add to the cache
    cache_key = ":".join(["Material", subject_id])
    return cache.get_obj_or_ref(cache_key, lambda: material)
def get_single_dna_extract_json(cache, study, study_md, subj_var_values, samp_var_values):
    """Build the DATS Material describing the DNA extract of one TOPMed sample.

    The returned Material is the head of a ``derivesFrom`` chain:
    DNA extract -> biological sample -> [subject, anatomical part]. The
    anatomical part is only present when the sample metadata names a body site.

    Args:
        cache: object cache, passed to ``util.get_taxon_human`` and
            ``util.get_donor_roles``.
        study: study object; only ``study.get("title")`` is read.
        study_md: study metadata; ``study_md['dbgap_vars']`` is indexed by dbGaP
            variable id to find the Dimension created for that variable.
        subj_var_values: dict of subject variable name -> dict holding a
            ``'value'`` entry and, optionally, a ``'var'`` entry with an ``'id'``.
        samp_var_values: same structure for the sample variables.

    Returns:
        The "Material" ``DatsObj`` for the DNA extract.

    Note:
        Calls ``sys.exit(1)`` after logging when the body site is neither a
        blood variant nor saliva.
    """
    # Almost all samples in TOPMed WGS phase are blood samples, named "Blood", "Peripheral Blood"...
    # Few samples are saliva samples probably due to sample collection issues
    body_site_key = None
    for candidate in ('BODY_SITE', 'Body_Site', 'Body Site'):
        if candidate in samp_var_values:
            body_site_key = candidate
            break

    anat_id = None
    anatomy_name = None
    if body_site_key is not None:
        body_site = samp_var_values[body_site_key]['value']
        if "blood" in body_site.lower():
            anatomy_name = "blood"
            anat_id = "0000178"
        elif body_site.lower() == "saliva":
            anatomy_name = "saliva"
            anat_id = "0001836"
        else:
            # Report the value stored under the key that was actually found:
            # indexing 'BODY_SITE' unconditionally raised KeyError for the
            # 'Body_Site' / 'Body Site' spellings and hid the real message.
            logging.fatal(
                "encountered BODY_SITE other than 'Blood' and 'Saliva' in TOPMed sample metadata - "
                + body_site)
            sys.exit(1)

    # anatomical part
    anatomical_part = None
    if anat_id is not None:
        anatomy_identifier = OrderedDict([("identifier", "UBERON:" + str(anat_id)),
                                          ("identifierSource", "UBERON")])
        anatomy_alt_ids = [
            OrderedDict([
                ("identifier", "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)),
                ("identifierSource", "UBERON"),
            ])
        ]
        anatomical_part = DatsObj("AnatomicalPart", [
            ("name", anatomy_name),
            ("identifier", anatomy_identifier),
            ("alternateIdentifiers", anatomy_alt_ids),
        ])

    # extract subject attributes
    gender = None
    age = None
    visit_year = None
    sys_bp = None
    dias_bp = None
    hypertension = "unknown"

    for var_name in subj_var_values:
        name_upper = var_name.upper()
        if name_upper in ("GENDER", "SEX"):
            gender = subj_var_values[var_name]['value'].lower()
        elif name_upper in ("VISIT_AGE", "AGE", "AGE_ENROLL"):
            # need to confirm that these all mean the same thing
            age = subj_var_values[var_name]['value']
        elif name_upper == "VISIT_YEAR":
            visit_year = subj_var_values[var_name]['value']
        elif name_upper == "SYSBP":
            sys_bp = subj_var_values[var_name]['value']
        elif name_upper == "DIASBP":
            dias_bp = subj_var_values[var_name]['value']
        elif name_upper in ("HYPERTENSION", "HIGHBLOODPRES"):
            raw = subj_var_values[var_name]['value']
            if raw.lower() == "yes" or raw == '1':
                hypertension = "yes"
            elif re.match(r'\S', raw):
                # any other value that starts with a non-blank character counts as "no"
                hypertension = "no"

    def make_characteristic(label, descr, value):
        return DatsObj("Dimension", [
            ("name", DatsObj("Annotation", [("value", label)])),
            ("description", descr),
            ("values", [value]),
        ])

    # harmonized/standardized characteristics (only those with a value)
    harmonized = [
        ("Gender", "Gender of the subject", gender),
        ("Age", "Age of the subject", age),
        ("Visit year", "Year of visit, to use for longitudinal analysis", visit_year),
        ("Systolic blood pressure",
         "Systolic blood pressure of subject, measured in mmHg", sys_bp),
        ("Diastolic blood pressure",
         "Diastolic blood pressure of subject, measured in mmHg", dias_bp),
    ]
    subject_characteristics = [
        make_characteristic(label, descr, value)
        for (label, descr, value) in harmonized
        if value is not None
    ]

    subject_bearerOfDisease = []
    if hypertension != "unknown":
        disease_id = "10763"
        disease_identifier = OrderedDict([
            ("identifier", "DOID:" + str(disease_id)),
            ("identifierSource", "Disease Ontology"),
        ])
        disease_alt_ids = [
            OrderedDict([
                ("identifier", "http://purl.obolibrary.org/obo/DOID_" + str(disease_id)),
                ("identifierSource", "Disease Ontology"),
            ])
        ]
        subject_bearerOfDisease.append(DatsObj("Disease", [
            ("name", "Hypertension"),
            ("identifier", disease_identifier),
            ("alternateIdentifiers", disease_alt_ids),
            ("diseaseStatus", DatsObj("Annotation", [("value", hypertension),
                                                     ("valueIRI", "")])),
        ]))

    # create a DATS Dimension from a dbGaP variable value
    def make_var_dimension(name, var_value):
        dim = DatsObj("Dimension", [
            ("name", DatsObj("Annotation", [("value", name)])),
            ("values", [var_value["value"]]),
        ])
        # find existing DATS identifier for the corresponding Dataset Dimension
        if "var" in var_value:
            var_id = var_value["var"]["id"]
            dbgap_var_dim = study_md['dbgap_vars'][var_id]
            dim.setProperty("identifier", dbgap_var_dim.get("identifier").getIdRef())
        return dim

    # "raw" characteristics from dbGaP subject and sample metadata
    subject_characteristics.extend(
        make_var_dimension(vname, subj_var_values[vname])
        for vname in sorted(subj_var_values))
    sample_characteristics = [
        make_var_dimension(vname, samp_var_values[vname])
        for vname in sorted(samp_var_values)
    ]

    human_t = util.get_taxon_human(cache)
    subj_id = subj_var_values['SUBJECT_ID']['value']
    dbgap_subj_id = subj_var_values['dbGaP_Subject_ID']['value']
    samp_id = samp_var_values['SAMPLE_ID']['value']
    dbgap_samp_id = samp_var_values['dbGaP_Sample_ID']['value']
    study_title = study.get("title")

    # human experimental subject/patient
    subject_material = DatsObj("Material", [
        ("name", subj_id),
        ("identifier", {"identifier": subj_id}),
        ("alternateIdentifiers", [util.get_alt_id(dbgap_subj_id, "dbGaP")]),
        ("description", study_title + " subject " + subj_id),
        ("characteristics", subject_characteristics),
        ("bearerOfDisease", subject_bearerOfDisease),
        ("taxonomy", [human_t]),
        ("roles", util.get_donor_roles(cache)),
    ])

    # TODO - use DatsObjCache
    specimen_annot = util.get_annotation("specimen")
    dna_extract_annot = util.get_annotation("DNA extract")

    # biological/tissue sample
    sample_derives_from = [subject_material]
    sample_descr = "specimen collected from subject " + subj_id
    dna_descr = "DNA extracted from specimen collected from subject " + subj_id
    if anatomical_part is not None:
        sample_derives_from.append(anatomical_part)
        sample_descr = anatomy_name + " " + sample_descr
        dna_descr = ("DNA extracted from " + anatomy_name
                     + " specimen collected from subject " + subj_id)

    biological_sample_material = DatsObj("Material", [
        ("name", samp_id),
        ("identifier", {"identifier": samp_id}),
        ("alternateIdentifiers", [util.get_alt_id(dbgap_samp_id, "dbGaP")]),
        ("description", sample_descr),
        ("characteristics", sample_characteristics),
        ("taxonomy", [human_t]),
        ("roles", [specimen_annot]),
        ("derivesFrom", sample_derives_from),
    ])

    # DNA extracted from tissue sample
    return DatsObj("Material", [
        ("name", "DNA from " + samp_id),
        ("description", dna_descr),
        ("taxonomy", [human_t]),
        ("roles", [dna_extract_annot]),
        ("derivesFrom", [biological_sample_material]),
    ])
def get_single_sample_json(sample, dats_obj_cache):
    """Build the DATS Material for the RNA extract of one GTEx sample.

    The returned Material derives from the biological sample, which in turn
    derives from the subject Material and the AnatomicalPart. Subject and
    AnatomicalPart objects are stored in and reused from ``dats_obj_cache``.

    Args:
        sample: sample record; ``SAMPID``, ``SUBJID``, ``SMUBRID`` and ``SMTSD``
            are read through ``'mapped_value'``, and ``sample['subject']`` is the
            subject record (``SEX``, ``AGE``, ``DTHHRDY``).
        dats_obj_cache: cache supporting ``in``, item get and item set; also
            passed to the ``util`` helpers.

    Returns:
        The "Material" ``DatsObj`` for the RNA extract.

    Note:
        Prints a message and calls ``sys.exit(1)`` when the sample has no
        anatomy id.
    """
    samp_id = sample['SAMPID']['mapped_value']
    subj_id = sample['SUBJID']['mapped_value']
    subject = sample['subject']

    # Uberon id (or EFO id, contrary to the documentation)
    anat_id = sample['SMUBRID']['mapped_value']
    if anat_id is None:
        print("No Uberon/anatomy ID specified for sample " + samp_id)
        sys.exit(1)

    # TODO - query anatomy term from UBERON/EFO instead?
    anatomy_name = sample['SMTSD']['mapped_value']

    if re.match(r'^EFO_\d+', anat_id):
        # EFO id
        source = "EFO"
        primary_id = anat_id
        alt_id = "https://www.ebi.ac.uk/ols/ontologies/efo/terms?short_form=" + str(anat_id)
    else:
        # Uberon id
        source = "UBERON"
        primary_id = "UBERON:" + str(anat_id)
        alt_id = "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)
    anatomy_identifier = OrderedDict([("identifier", primary_id),
                                      ("identifierSource", source)])
    anatomy_alt_ids = [OrderedDict([("identifier", alt_id),
                                    ("identifierSource", source)])]

    # anatomical part, shared between samples through the cache
    anat_part_key = ":".join(["AnatomicalPart", anatomy_name])
    if anat_part_key not in dats_obj_cache:
        anatomical_part = DatsObj("AnatomicalPart", [
            ("name", anatomy_name),
            ("identifier", anatomy_identifier),
            ("alternateIdentifiers", anatomy_alt_ids),
        ])
        dats_obj_cache[anat_part_key] = anatomical_part
    else:
        anatomical_part = dats_obj_cache[anat_part_key]

    def gtex_dimension(label, descr, var_name):
        return DatsObj("Dimension", [
            ("name", DatsObj("Annotation", [("value", label)])),
            ("description", descr),
            ("identifier", DatsObj("Identifier", [("identifier", var_name),
                                                  ("identifierSource", "GTEx")])),
            ("values", [subject[var_name]['mapped_value']]),
        ])

    # human experimental subject/patient characteristics
    subject_characteristics = [
        gtex_dimension("Gender", "Gender of the subject", "SEX"),
        gtex_dimension("Age range", "Age range of the subject", "AGE"),
        gtex_dimension("Hardy scale",
                       "Hardy scale death classification for the subject", "DTHHRDY"),
    ]

    # human experimental subject/patient, shared between samples through the cache
    subj_key = ":".join(["Material", subj_id])
    if subj_key not in dats_obj_cache:
        subject_material = DatsObj("Material", [
            ("name", subj_id),
            ("identifier", {"identifier": subj_id}),
            ("description", "GTEx subject " + subj_id),
            ("characteristics", subject_characteristics),
            ("taxonomy", [util.get_taxon_human(dats_obj_cache)]),
            ("roles", util.get_donor_roles(dats_obj_cache)),
        ])
        dats_obj_cache[subj_key] = subject_material
    else:
        subject_material = dats_obj_cache[subj_key]

    specimen_annot = util.get_annotation("specimen", dats_obj_cache)
    rna_extract_annot = util.get_annotation("RNA extract", dats_obj_cache)

    # biological/tissue sample
    specimen_descr = anatomy_name + " specimen collected from subject " + subj_id
    biological_sample_material = DatsObj("Material", [
        ("name", samp_id),
        ("identifier", {"identifier": samp_id}),
        ("description", specimen_descr),
        ("taxonomy", [util.get_taxon_human(dats_obj_cache)]),
        ("roles", [specimen_annot]),
        ("derivesFrom", [subject_material, anatomical_part]),
    ])

    # RNA extracted from tissue sample
    return DatsObj("Material", [
        ("name", "RNA from " + samp_id),
        ("description", "total RNA extracted from " + specimen_descr),
        ("taxonomy", [util.get_taxon_human(dats_obj_cache)]),
        ("roles", [rna_extract_annot]),
        ("derivesFrom", [biological_sample_material]),
    ])
def get_sample_dats_material(cache, dats_subject, study, study_md, samp_var_values):
    """Build the DATS Material for the DNA extract of one TOPMed sample.

    The returned Material derives from the biological sample, which derives
    from the (cached) subject Material and, when the sample metadata names a
    body site, from a cached AnatomicalPart.

    Args:
        cache: object cache; queried through ``get_obj_or_ref`` and passed to
            the ``util`` helpers.
        dats_subject: subject Material; its ``"name"`` is used for the cache
            key and in the descriptions.
        study: not used by this function; kept for interface compatibility.
        study_md: not used by this function; kept for interface compatibility.
        samp_var_values: dict of sample variable name -> dict holding a
            ``'value'`` entry and, optionally, a ``'var'`` entry whose ``'dim'``
            is the Dimension created for the variable.

    Returns:
        The "Material" ``DatsObj`` for the DNA extract.

    Note:
        Calls ``sys.exit(1)`` after logging when the body site is neither a
        blood variant nor saliva.
    """
    # Almost all samples in TOPMed WGS phase are blood samples, named "Blood", "Peripheral Blood"...
    # Few samples are saliva samples probably due to sample collection issues
    body_site_key = None
    for candidate in ('BODY_SITE', 'Body_Site', 'Body Site'):
        if candidate in samp_var_values:
            body_site_key = candidate
            break

    anat_id = None
    anatomy_name = None
    if body_site_key is not None:
        body_site = samp_var_values[body_site_key]['value']
        if "blood" in body_site.lower():
            anatomy_name = "blood"
            anat_id = "0000178"
        elif body_site.lower() == "saliva":
            anatomy_name = "saliva"
            anat_id = "0001836"
        else:
            # Report the value stored under the key that was actually found:
            # indexing 'BODY_SITE' unconditionally raised KeyError for the
            # 'Body_Site' / 'Body Site' spellings and hid the real message.
            logging.fatal(
                "encountered BODY_SITE other than 'Blood' and 'Saliva' in TOPMed sample metadata - "
                + body_site)
            sys.exit(1)

    def make_anat_part(anat_id, anatomy_name):
        # anatomical part; the ontology identifiers are attached only when an id is known
        anatomical_part = DatsObj("AnatomicalPart", [("name", anatomy_name)])
        if anat_id is not None:
            anatomy_identifier = OrderedDict([("identifier", "UBERON:" + str(anat_id)),
                                              ("identifierSource", "UBERON")])
            anatomy_alt_ids = [
                OrderedDict([
                    ("identifier", "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)),
                    ("identifierSource", "UBERON"),
                ])
            ]
            anatomical_part.set("identifier", anatomy_identifier)
            anatomical_part.set("alternateIdentifiers", anatomy_alt_ids)
        return anatomical_part

    anatomical_part = None
    if anatomy_name is not None:
        # use cached value for AnatomicalPart if possible
        anat_part_key = ":".join(["AnatomicalPart", anatomy_name])
        anatomical_part = cache.get_obj_or_ref(
            anat_part_key, lambda: make_anat_part(anat_id, anatomy_name))

    # create a DATS Dimension from a dbGaP variable value
    def make_var_dimension(name, var_value):
        dim = DatsObj("Dimension", [
            ("name", DatsObj("Annotation", [("value", name)])),
            ("values", [var_value["value"]]),
        ])
        # find existing DATS identifier for the corresponding Dataset Dimension
        if "var" in var_value:
            dbgap_var_dim = var_value["var"]["dim"]
            dim.setProperty("identifier", dbgap_var_dim.get("identifier").getIdRef())
        return dim

    # create DATS Dimensions for dbGaP sample metadata
    sample_characteristics = [
        make_var_dimension(vname, samp_var_values[vname])
        for vname in sorted(samp_var_values)
    ]

    samp_id = samp_var_values['SAMPLE_ID']['value']
    dbgap_samp_id = samp_var_values['dbGaP_Sample_ID']['value']
    specimen_annot = util.get_annotation("specimen", cache)

    # corresponding DATS subject Material
    dats_subj_name = dats_subject.get("name")
    subj_key = ":".join(["Material", dats_subj_name])
    dats_subj = cache.get_obj_or_ref(subj_key, lambda: dats_subject)

    # RNA or DNA extracted from tissue sample
    stype = "DNA"
    # TODO - check if RNA, not DNA

    # biological/tissue sample
    sample_derives_from = [dats_subj]
    sample_descr = "specimen collected from subject " + dats_subj_name
    dna_or_rna_descr = stype + " extracted from specimen collected from subject " + dats_subj_name
    if anatomical_part is not None:
        sample_derives_from.append(anatomical_part)
        sample_descr = anatomy_name + " " + sample_descr
        dna_or_rna_descr = (stype + " extracted from " + anatomy_name
                            + " specimen collected from subject " + dats_subj_name)

    biological_sample_material = DatsObj("Material", [
        ("name", samp_id),
        ("identifier", {"identifier": samp_id}),
        ("alternateIdentifiers", [util.get_alt_id(dbgap_samp_id, "dbGaP")]),
        ("description", sample_descr),
        ("characteristics", sample_characteristics),
        ("taxonomy", [util.get_taxon_human(cache)]),
        ("roles", [specimen_annot]),
        ("derivesFrom", sample_derives_from),
    ])

    return DatsObj("Material", [
        ("name", stype + " from " + samp_id),
        ("description", dna_or_rna_descr),
        ("taxonomy", [util.get_taxon_human(cache)]),
        ("roles", [util.get_annotation(stype + " extract", cache)]),
        ("derivesFrom", [biological_sample_material]),
    ])
def get_subject_dats_material(cache, study, study_md, subj_var_values):
    """Build the DATS Material for one TOPMed subject and register it in the cache.

    A handful of well-known variables (gender, age, visit year, blood
    pressures, hypertension) are turned into harmonized characteristics; every
    subject variable is additionally attached as a "raw" Dimension.

    Args:
        cache: object cache; queried through ``get_obj_or_ref`` and passed to
            the ``util`` helpers.
        study: study object; only ``study.get("title")`` is read.
        study_md: not referenced in this function.
        subj_var_values: dict of subject variable name -> dict holding a
            ``'value'`` entry and, optionally, a ``'var'`` entry whose ``'dim'``
            is the Dimension created for the variable.

    Returns:
        The result of ``cache.get_obj_or_ref`` for the key
        ``"Material:<subject id>"``.
    """
    # extract subject attributes
    gender = age = visit_year = sys_bp = dias_bp = None
    disease = {'hypertension': "unknown"}

    for var_name in subj_var_values:
        upper = var_name.upper()
        if upper in ("GENDER", "SEX"):
            gender = subj_var_values[var_name]['value'].lower()
        elif upper in ("VISIT_AGE", "AGE", "AGE_ENROLL"):
            # need to confirm that these all mean the same thing
            age = subj_var_values[var_name]['value']
        elif upper == "VISIT_YEAR":
            visit_year = subj_var_values[var_name]['value']
        elif upper == "SYSBP":
            sys_bp = subj_var_values[var_name]['value']
        elif upper == "DIASBP":
            dias_bp = subj_var_values[var_name]['value']
        elif upper in ("HYPERTENSION", "HIGHBLOODPRES"):
            raw = subj_var_values[var_name]['value']
            if raw.lower() == "yes" or raw == '1':
                disease['hypertension'] = "yes"
            elif re.match(r'\S', raw):
                disease['hypertension'] = "no"

    # harmonized/standardized characteristics, in a fixed order; skipped when absent
    harmonized = (
        ("Gender", "Gender of the subject", gender),
        ("Age", "Age of the subject", age),
        ("Visit year", "Year of visit, to use for longitudinal analysis", visit_year),
        ("Systolic blood pressure",
         "Systolic blood pressure of subject, measured in mmHg", sys_bp),
        ("Diastolic blood pressure",
         "Diastolic blood pressure of subject, measured in mmHg", dias_bp),
    )
    subject_characteristics = []
    for label, descr, value in harmonized:
        if value is None:
            continue
        subject_characteristics.append(DatsObj("Dimension", [
            ("name", DatsObj("Annotation", [("value", label)])),
            ("description", descr),
            ("values", [value]),
        ]))

    subject_bearerOfDisease = []
    if disease['hypertension'] != "unknown":
        disease_id = "10763"
        disease_identifier = OrderedDict([
            ("identifier", "DOID:" + str(disease_id)),
            ("identifierSource", "Disease Ontology"),
        ])
        disease_alt_ids = [OrderedDict([
            ("identifier", "http://purl.obolibrary.org/obo/DOID_" + str(disease_id)),
            ("identifierSource", "Disease Ontology"),
        ])]
        status = DatsObj("Annotation", [("value", disease['hypertension']),
                                        ("valueIRI", "")])
        subject_bearerOfDisease.append(DatsObj("Disease", [
            ("name", "Hypertension"),
            ("identifier", disease_identifier),
            ("alternateIdentifiers", disease_alt_ids),
            ("diseaseStatus", status),
        ]))

    # create a DATS Dimension from a dbGaP variable value
    def make_var_dimension(name, var_value):
        annotation = DatsObj("Annotation", [("value", name)])
        dim = DatsObj("Dimension", [("name", annotation),
                                    ("values", [var_value["value"]])])
        # find existing DATS identifier for the corresponding Dataset Dimension
        if "var" in var_value:
            source_dim = var_value["var"]["dim"]
            dim.setProperty("identifier", source_dim.get("identifier").getIdRef())
        return dim

    # "raw" characteristics from dbGaP metadata, ordered by variable name
    for vname in sorted(subj_var_values):
        subject_characteristics.append(make_var_dimension(vname, subj_var_values[vname]))

    human_t = util.get_taxon_human(cache)
    subj_id = subj_var_values['SUBJECT_ID']['value']
    dbgap_subj_id = subj_var_values['dbGaP_Subject_ID']['value']
    study_title = study.get("title")

    # human experimental subject/patient
    subject_material = DatsObj("Material", [
        ("name", subj_id),
        ("identifier", {"identifier": subj_id}),
        ("alternateIdentifiers", [util.get_alt_id(dbgap_subj_id, "dbGaP")]),
        ("description", study_title + " subject " + subj_id),
        ("characteristics", subject_characteristics),
        ("bearerOfDisease", subject_bearerOfDisease),
        ("taxonomy", [human_t]),
        ("roles", util.get_donor_roles(cache)),
    ])

    # add to the cache
    return cache.get_obj_or_ref(":".join(["Material", subj_id]), lambda: subject_material)
def get_sample_dats_material(cache, dats_subject, p_sample, gh_sample, var_lookup):
    """Build the DATS Material for the DNA or RNA extract of one GTEx sample.

    The returned Material derives from the biological sample, which derives
    from the cached subject Material and a cached AnatomicalPart.

    Args:
        cache: object cache; queried through ``get_obj_or_ref`` and passed to
            the ``util`` helpers.
        dats_subject: subject Material; its ``"name"`` forms the cache key.
        p_sample: sample record; entries are read through ``'mapped_value'``
            (``SAMPID``, ``SUBJID``, ``SMUBRID``, ``SMTSD``, ``SMAFRZE``,
            ``SMNABTCHT`` plus the selected characteristics).
        gh_sample: optional record from the GitHub GTEx id dump. When it is not
            None, its ``'Destination URL'`` raw value is used as the sample
            identifier instead of the sample id.
        var_lookup: mapping from variable name to an entry whose ``'dim'`` is
            the DATS Dimension already created for that variable.

    Returns:
        The "Material" ``DatsObj`` for the extract, or None when the sequence
        type can be determined neither from ``SMNABTCHT`` nor from ``SMAFRZE``.

    Note:
        Calls ``sys.exit(1)`` when the anatomy id is missing, when ``SMAFRZE``
        is unknown, or when the detected and expected sequence types disagree.
    """
    samp_id = p_sample['SAMPID']['mapped_value']
    subj_id = p_sample['SUBJID']['mapped_value']

    # retrieve id reference for the Identifier of the DATS Dimension for the
    # "all subjects" consent group version of the variable
    def get_var_id(name):
        return var_lookup[name]['dim'].get("identifier").getIdRef()

    # Uberon id (or EFO id, contrary to the documentation)
    anat_id = p_sample['SMUBRID']['mapped_value']
    if anat_id is None:
        print("No Uberon/anatomy ID specified for sample " + samp_id)
        sys.exit(1)

    # TODO - query anatomy term from UBERON/EFO instead?
    anatomy_name = p_sample['SMTSD']['mapped_value']

    def make_anat_part(anat_id, anatomy_name):
        if re.match(r'^EFO_\d+', anat_id):
            # EFO id
            anatomy_identifier = OrderedDict([("identifier", anat_id),
                                              ("identifierSource", "EFO")])
            anatomy_alt_ids = [
                OrderedDict([
                    ("identifier",
                     "https://www.ebi.ac.uk/ols/ontologies/efo/terms?short_form=" + str(anat_id)),
                    ("identifierSource", "EFO"),
                ])
            ]
        else:
            # Uberon id
            anatomy_identifier = OrderedDict([("identifier", "UBERON:" + str(anat_id)),
                                              ("identifierSource", "UBERON")])
            anatomy_alt_ids = [
                OrderedDict([
                    ("identifier", "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)),
                    ("identifierSource", "UBERON"),
                ])
            ]
        # anatomical part
        return DatsObj("AnatomicalPart", [
            ("name", anatomy_name),
            ("identifier", anatomy_identifier),
            ("alternateIdentifiers", anatomy_alt_ids),
        ])

    # use cached value for AnatomicalPart if possible
    anat_part_key = ":".join(["AnatomicalPart", anatomy_name])
    anatomical_part = cache.get_obj_or_ref(
        anat_part_key, lambda: make_anat_part(anat_id, anatomy_name))

    # use URI from GitHub GTEx id dump if available
    # (the URL used to be stored in an unused variable, so it never reached
    # the Material and the plain sample id was always emitted)
    identifier = samp_id
    if gh_sample is not None:
        identifier = gh_sample['Destination URL']['raw_value']

    subj_key = ":".join(["Material", dats_subject.get("name")])
    dats_subj = cache.get_obj_or_ref(subj_key, lambda: dats_subject)

    # add sample characteristics from p_sample metadata
    sample_chars = []
    for key in p_sample:
        if re.match(r'^(subject|id)$', key):
            continue
        # TODO - currently including only a small subset of the available values for demonstration purposes
        if not re.match(r'^(SMATSSCR|SMRIN|SMMAPRT|SMGNSDTC)$', key):
            continue
        mapped_val = p_sample[key]['mapped_value']
        sample_chars.append(DatsObj("Dimension", [
            ("name", util.get_value_annotation(key, cache)),
            ("identifier", get_var_id(key)),
            ("values", [mapped_val]),
        ]))

    # biological/tissue sample
    biological_sample_material = DatsObj("Material", [
        ("name", samp_id),
        ("identifier", {"identifier": identifier}),
        ("description", anatomy_name + " specimen collected from subject " + subj_id),
        ("characteristics", sample_chars),
        ("taxonomy", [util.get_taxon_human(cache)]),
        ("roles", [util.get_annotation("specimen", cache)]),
        ("derivesFrom", [dats_subj, anatomical_part]),
    ])

    # analysis freeze classification -> expected sequence type
    # (OMNI is the Illumina OMNI SNP Array; EXCLUDE carries no expectation)
    smafrze = p_sample['SMAFRZE']['mapped_value']
    expected_by_freeze = {
        "RNASEQ": "RNA",
        "WGS": "DNA",
        "WES": "DNA",
        "OMNI": "DNA",
        "EXCLUDE": None,
    }
    if smafrze not in expected_by_freeze:
        logging.fatal("unknown SMAFRZE " + smafrze)
        sys.exit(1)
    expected_stype = expected_by_freeze[smafrze]

    # sample type - DNA or RNA, from the nucleic acid isolation batch type;
    # the first matching rule wins
    smnabtcht = p_sample['SMNABTCHT']['mapped_value']
    batch_type_rules = (
        (r'^DNA ([iI]solation|[eE]xtraction).*', 'DNA'),
        (r'^RNA ([iI]solation|[eE]xtraction).*', 'RNA'),
        (r'DNA or RNA Extraction from Paxgene-derived Lysate Plate Based', 'RNA'),
        (r'Transfer To Matrix \(Manual\)', 'DNA'),
    )
    stype = None
    for pattern, batch_stype in batch_type_rules:
        if re.match(pattern, smnabtcht):
            stype = batch_stype
            break

    if stype is None:
        if expected_stype is None:
            print("couldn't determine sequence type for smafrze=" + smafrze
                  + " smnabtcht=" + smnabtcht)
            return None
        stype = expected_stype
    elif expected_stype is not None and stype != expected_stype:
        logging.fatal("seq type " + stype + " doesn't match expected stype " + expected_stype)
        sys.exit(1)

    # DNA or RNA extract
    return DatsObj("Material", [
        ("name", stype + " from " + samp_id),
        ("description", "total " + stype + " extracted from " + anatomy_name
         + " specimen collected from subject " + subj_id),
        ("taxonomy", [util.get_taxon_human(cache)]),
        ("roles", [util.get_annotation(stype + " extract", cache)]),
        ("derivesFrom", [biological_sample_material]),
    ])
def get_single_dna_extract_json(study, subj_var_values, samp_var_values):
    """Build the DATS Material for the DNA extract of one TOPMed blood sample.

    The returned Material derives from the biological sample, which derives
    from the subject Material and the "blood" AnatomicalPart. The original
    dbGaP subject and sample metadata are attached as ``extraProperties``.

    Args:
        study: study object; only ``study.get("title")`` is read.
        subj_var_values: dict of subject variable name -> value.
        samp_var_values: dict of sample variable name -> value.

    Returns:
        The "Material" ``DatsObj`` for the DNA extract.

    Note:
        Calls ``sys.exit(1)`` after logging when ``BODY_SITE`` is not
        ``'Blood'``.
    """
    # all samples in TOPMed WGS phase are blood samples
    body_site = samp_var_values['BODY_SITE']
    if body_site != 'Blood':
        logging.fatal(
            "encountered BODY_SITE other than 'Blood' in TOPMed sample metadata - " +
            samp_var_values['BODY_SITE'])
        sys.exit(1)

    anatomy_name = "blood"
    anat_id = "0000178"
    uberon_curie = OrderedDict([("identifier", "UBERON:" + str(anat_id)),
                                ("identifierSource", "UBERON")])
    uberon_purl = OrderedDict([
        ("identifier", "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)),
        ("identifierSource", "UBERON"),
    ])

    # extract subject attributes
    gender = subj_var_values["GENDER"].lower() if "GENDER" in subj_var_values else None
    age = subj_var_values["VISIT_AGE"] if "VISIT_AGE" in subj_var_values else None
    # TODO - determine what other subject attributes can be mapped directly to core DATS objects

    def category_values_pairs(var_values):
        return [
            DatsObj("CategoryValuesPair", [("category", key), ("values", [var_values[key]])])
            for key in sorted(var_values)
        ]

    # place original dbGaP subject metadata into extraProperties
    # TODO - consider alternative of doing this only for un-harmonized metadata
    subj_extra_props = category_values_pairs(subj_var_values)

    # extract sample attributes
    # TODO - determine which DATS objects (e.g., biological sample, DNA prep, sequence data)
    #        the SEQUENCING_CENTER property should attach to
    # TODO - determine what other subject attributes can be mapped directly to core DATS objects
    # e.g., IS_TUMOR -> bearerOfDisease ("the pathology affecting the material...")

    # place original dbGaP sample metadata into extraProperties
    samp_extra_props = category_values_pairs(samp_var_values)

    # anatomical part
    anatomical_part = DatsObj("AnatomicalPart", [
        ("name", anatomy_name),
        ("identifier", uberon_curie),
        ("alternateIdentifiers", [uberon_purl]),
    ])

    subject_characteristics = [
        DatsObj("Dimension", [("name", {"value": "Gender"}),
                              ("description", "Gender of the subject"),
                              ("values", [gender])]),
        DatsObj("Dimension", [("name", {"value": "Age"}),
                              ("description", "Age of the subject"),
                              ("values", [age])]),
    ]

    human_t = util.get_taxon_human()
    subj_id = subj_var_values['SUBJECT_ID']
    dbgap_subj_id = subj_var_values['dbGaP_Subject_ID']
    samp_id = samp_var_values['SAMPLE_ID']
    dbgap_samp_id = samp_var_values['dbGaP_Sample_ID']
    study_title = study.get("title")

    # human experimental subject/patient
    subject_material = DatsObj("Material", [
        ("name", subj_id),
        ("identifier", {"identifier": subj_id}),
        ("alternateIdentifiers", [util.get_alt_id(dbgap_subj_id, "dbGaP")]),
        ("description", study_title + " subject " + subj_id),
        ("characteristics", subject_characteristics),
        ("taxonomy", human_t),
        ("roles", util.get_donor_roles()),
        ("extraProperties", subj_extra_props),
    ])

    # biological/tissue sample
    specimen_descr = anatomy_name + " specimen collected from subject " + subj_id
    biological_sample_material = DatsObj("Material", [
        ("name", samp_id),
        ("identifier", {"identifier": samp_id}),
        ("alternateIdentifiers", [util.get_alt_id(dbgap_samp_id, "dbGaP")]),
        ("description", specimen_descr),
        ("taxonomy", human_t),
        ("roles", [OrderedDict([("value", "specimen"), ("valueIRI", "")])]),
        ("derivesFrom", [subject_material, anatomical_part]),
        ("extraProperties", samp_extra_props),
    ])

    # DNA extracted from tissue sample
    return DatsObj("Material", [
        ("name", "DNA from " + samp_id),
        ("description", "DNA extracted from " + specimen_descr),
        ("taxonomy", human_t),
        ("roles", [OrderedDict([("value", "DNA extract"), ("valueIRI", "")])]),
        ("derivesFrom", [biological_sample_material]),
    ])
def _make_publication(identifier, source):
    """Return a DATS Publication wrapping a single Identifier."""
    # NOTE(review): the Publication property is spelled "Identifier" (capital I),
    # unlike the lowercase "identifier" used on MolecularEntity below - confirm
    # against the DATS schema before changing it.
    return DatsObj("Publication", [
        ("Identifier", DatsObj("Identifier", [("identifier", identifier),
                                              ("identifierSource", source)]))
    ])


def _date_assigned(date):
    """Return a DATS Date annotated as "Date Assigned"."""
    return DatsObj("Date", [
        ("date", date),
        ("type", DatsObj("Annotation", [("value", "Date Assigned")])),
    ])


def _encode_gene_diseases(gene_diseases):
    """Return one related-entity dict per distinct disease id of a gene."""
    relations = []
    # sorted() keeps the output order stable between runs; a bare set iterates
    # strings in an order that depends on hash randomization.
    for do_id in sorted(set(rec['do_id'] for rec in gene_diseases)):
        disease_annot = DatsObj("Annotation", [
            ("value", do_id),
            ("valueIRI", "http://purl.obolibrary.org/obo/DOID_" + do_id[5:]),
        ])
        records = search_dict('do_id', do_id, gene_diseases)
        relation = DatsObj("Annotation", [
            ("value", "Disease"),
            ("valueIRI", "http://purl.obolibrary.org/obo/DOID_4"),
        ])

        # NOTE(review): only the evidence codes of the first matching record
        # are encoded - confirm that all records of one disease id share them.
        evidence_codes = [
            DatsObj("Annotation", [
                ("value", code),
                ("valueIRI", "http://purl.obolibrary.org/obo/" + EVID[code]),
            ])
            for code in records[0]['evidence_codes']
        ]

        # account for multiple publications per disease id:
        # all PubMed ids first, then all MOD publication ids
        publications = [_make_publication(rec['pubmed_id'], "PubMed") for rec in records]
        publications.extend(
            _make_publication(rec['mod_pub_id'], rec['mod_pub_id'][:3]) for rec in records)

        relation_evidence = OrderedDict([
            ("evidenceCodes", evidence_codes),
            ("publications", publications),
            ("dateEstablished", _date_assigned(records[0]['date_ass'])),
        ])
        relations.append(OrderedDict([
            ("object", disease_annot),
            ("relation", relation),
            ("resultingFrom",
             DatsObj("Activity", [("name", records[0]['association_type'])])),
            ("relationEvidence", relation_evidence),
        ]))
    return relations


def _encode_gene_phenotypes(gene_phenotypes):
    """Return one related-entity dict per distinct phenotype term of a gene."""
    # assumes one phenotype termID per record as is in RGD and MGI phenotype JSONs
    relations = []
    # sorted() for a run-to-run stable order, as for the diseases
    for term in sorted(set(rec['phe_term_ids'] for rec in gene_phenotypes)):
        records = search_dict('phe_term_ids', term, gene_phenotypes)
        term_annot = DatsObj("Annotation", [
            ("value", records[0]['phe_term_ids']),
            ("valueIRI",
             "http://purl.obolibrary.org/obo/MP_" + records[0]['phe_term_ids'][3:]),
        ])
        relation = DatsObj("Annotation", [
            ("value", "Phenotype"),
            ("valueIRI", "http://purl.obolibrary.org/obo/OGMS_0000023"),
        ])

        # account for multiple publications per phenotype; records without a
        # PubMed id still contribute their MOD publication id
        publications = [
            _make_publication(rec['pubmed_id'], "PubMed")
            for rec in records
            if rec['pubmed_id'] != ""
        ]
        publications.extend(
            _make_publication(rec['mod_pub_id'], rec['mod_pub_id'][:3]) for rec in records)

        relation_evidence = OrderedDict([
            ("publications", publications),
            ("dateEstablished", _date_assigned(records[0]['date_ass'])),
        ])
        relations.append(OrderedDict([
            ("object", term_annot),
            ("relation", relation),
            ("relationEvidence", relation_evidence),
        ]))
    return relations


def _encode_gene_orthologs(cache, gene_orthologs):
    """Return one related-entity dict per (human) ortholog of a gene."""
    relations = []
    for ortho in gene_orthologs:
        if '9606' not in ortho['ortho_taxon']:
            logging.fatal("encountered taxonomy other human - " + ortho['ortho_taxon'])
            sys.exit(1)
        ortho_taxon = util.get_taxon_human(cache)

        ortholog = DatsObj("MolecularEntity", [
            ("identifier", DatsObj("Identifier", [("identifier", ortho['ortho_gene_id'])])),
            ("name", ortho['ortho_gene_id']),
            ("taxonomy", [ortho_taxon]),
            # alternateIdentifiers is a list everywhere else (including on the
            # gene itself); a bare object was emitted here before.
            ("alternateIdentifiers",
             [util.get_alt_id(ortho['ortho_gene_symbol'], "Gene Symbol")]),
        ])
        relations.append(OrderedDict([
            ("object", ortholog),
            ("relation", DatsObj("Annotation", [
                ("value", "Orthology"),
                ("valueIRI", "http://purl.obolibrary.org/obo/HOM_0000017"),
            ])),
        ]))
    return relations


def get_gene_json(cache, mod, gff3_json_path, orthologs):
    """Build one DATS MolecularEntity per gene feature of a model organism database.

    Each gene carries its genome location, role, alternate identifiers and a
    ``relatedEntities`` list made of its diseases, phenotypes and orthologs
    (in that order).

    Args:
        cache: object cache; passed to the readers and to
            ``util.get_taxon_human``.
        mod: passed through to ``read_bgi``, ``read_disease`` and
            ``read_phenotype``.
        gff3_json_path: passed through to the same three readers.
        orthologs: ortholog records, searched by ``'mod_gene_id'``.

    Returns:
        A list of "MolecularEntity" ``DatsObj``, one per feature returned by
        ``read_bgi``.

    Note:
        Calls ``sys.exit(1)`` after logging when an ortholog's taxon does not
        contain ``'9606'``.
    """
    # read gene features from BGI file
    features = read_bgi(cache, mod, gff3_json_path)
    # read disease info from disease JSON file
    diseases = read_disease(cache, mod, gff3_json_path)
    # read phenotype info from phenotype JSON file
    phenotypes = read_phenotype(cache, mod, gff3_json_path)

    # TODO - read gene features from GFF3 file
    genes = []
    for f in features:
        genome_location = DatsObj("GenomeLocation", [
            ("assembly", f['assembly']),
            ("chromosome", f['chr']),
            ("startPosition", f['start']),
            ("endPosition", f['end']),
            ("strand", f['strand']),
        ])
        roles = [
            DatsObj("Annotation", [
                ("value", SOID[f['soid']]),
                ("valueIRI", "http://purl.obolibrary.org/obo/SO_" + f['soid'][3:]),
            ])
        ]

        # alternate ids are "<source>:<id>" strings, or the literal "NA" when absent
        alternate_ids = []
        if f['alt_ids'] != "NA":
            for alt in f['alt_ids']:
                source, local_id = alt.split(':')
                alternate_ids.append(util.get_alt_id(local_id, source))

        # encode disease, phenotype and ortholog relations
        disease_list = _encode_gene_diseases(
            search_dict('object_id', f['primaryId'], diseases))
        phenotype_list = _encode_gene_phenotypes(
            search_dict('object_id', f['primaryId'], phenotypes))
        ortholog_list = _encode_gene_orthologs(
            cache, search_dict('mod_gene_id', f['primaryId'], orthologs))
        related_entities = disease_list + phenotype_list + ortholog_list

        genes.append(DatsObj("MolecularEntity", [
            ("identifier", DatsObj("Identifier", [("identifier", f['primaryId'])])),
            ("name", f['primaryId']),
            ("description", f['descr']),
            ("roles", roles),
            ("taxonomy", [f['taxon']]),
            ("genomeLocations", [genome_location]),
            ("alternateIdentifiers", alternate_ids),
            ("relatedEntities", related_entities),
        ]))

    return genes