Esempio n. 1
0
def get_subject_dats_material(cache, p_subject, gh_subject, var_lookup):
    """Build the DATS "Material" describing one GTEx subject.

    Args:
        cache: object handed to the ``util`` helpers and whose
            ``get_obj_or_ref`` is called with the key
            ``"Material:<SUBJID>"`` and a factory returning the new Material.
        p_subject: mapping of variable name to a dict with a
            ``'mapped_value'`` entry; ``SUBJID``, ``SEX``, ``AGE`` and
            ``DTHHRDY`` are read.
        gh_subject: entry from the GTEx id dump, or None. When present, its
            ``['Destination URL']['raw_value']`` replaces the subject id as
            the Material's identifier.
        var_lookup: mapping of variable name to a dict whose ``'dim'`` entry
            supports ``get("identifier").getIdRef()``.

    Returns:
        Whatever ``cache.get_obj_or_ref`` returns for the subject key
        (presumably the Material itself or a reference to it - confirm
        against the cache implementation).
    """
    subject_name = p_subject['SUBJID']['mapped_value']

    def dimension_for(label, description, var_name):
        # One Dimension per subject variable. Its identifier is the id
        # reference of the Dimension registered in var_lookup for the same
        # variable (the "all subjects" consent group version).
        return DatsObj("Dimension", [
            ("name", util.get_value_annotation(label, cache)),
            ("description", description),
            ("identifier",
             var_lookup[var_name]['dim'].get("identifier").getIdRef()),
            ("values", [p_subject[var_name]['mapped_value']]),
        ])

    # (annotation label, description, GTEx variable name)
    dimension_specs = (
        ("Gender", "Gender of the subject", "SEX"),
        ("Age range", "Age range of the subject", "AGE"),
        ("Hardy scale", "Hardy scale death classification for the subject",
         "DTHHRDY"),
    )
    characteristics = [dimension_for(*spec) for spec in dimension_specs]

    # use URI from GTEx id dump if present, otherwise the bare subject id
    if gh_subject is None:
        identifier = subject_name
    else:
        identifier = gh_subject['Destination URL']['raw_value']

    # human experimental subject/patient
    material = DatsObj("Material", [
        ("name", subject_name),
        ("identifier", DatsObj("Identifier", [("identifier", identifier)])),
        ("description", "GTEx subject " + subject_name),
        ("characteristics", characteristics),
        ("taxonomy", [util.get_taxon_human(cache)]),
        ("roles", util.get_donor_roles(cache)),
    ])

    # add to the cache
    cache_key = ":".join(["Material", subject_name])
    return cache.get_obj_or_ref(cache_key, lambda: material)
Esempio n. 2
0
def get_single_dna_extract_json(cache, study, study_md, subj_var_values,
                                samp_var_values):
    """Build the DATS "Material" for the DNA extract of one TOPMed sample.

    The returned Material derives from a biological-sample Material, which in
    turn derives from the subject Material and, when a body site is present,
    from an AnatomicalPart.

    Args:
        cache: passed to ``util.get_taxon_human`` and ``util.get_donor_roles``.
        study: object whose ``get("title")`` value prefixes the subject
            description.
        study_md: mapping with a ``'dbgap_vars'`` entry, consulted only for
            variable values that carry a ``"var"`` entry.
        subj_var_values: mapping of dbGaP subject variable name to a dict with
            a ``'value'`` entry (and optionally ``'var'``). ``SUBJECT_ID`` and
            ``dbGaP_Subject_ID`` are required.
        samp_var_values: same shape, for sample variables. ``SAMPLE_ID`` and
            ``dbGaP_Sample_ID`` are required; the body site is read from
            ``BODY_SITE``, ``Body_Site`` or ``Body Site`` if present.

    Returns:
        The DNA extract ``DatsObj`` Material.

    Exits the process with status 1 (after logging) when the body site is
    neither a blood variant nor saliva.
    """
    # Almost all samples in TOPMed WGS phase are blood samples, named "Blood", "Peripheral Blood"...
    # Few samples are saliva samples probably due to sample collection issues
    body_site_key = None
    for candidate in ('BODY_SITE', 'Body_Site', 'Body Site'):
        if candidate in samp_var_values:
            body_site_key = candidate
            break

    anat_id = None
    anatomy_name = None

    if body_site_key is not None:
        body_site = samp_var_values[body_site_key]['value']
        if "blood" in body_site.lower():
            anatomy_name = "blood"
            anat_id = "0000178"
        elif body_site.lower() == "saliva":
            anatomy_name = "saliva"
            anat_id = "0001836"
        else:
            # Report the value under the key spelling that was actually found;
            # indexing 'BODY_SITE' unconditionally raised KeyError for the
            # 'Body_Site' / 'Body Site' spellings.
            logging.fatal(
                "encountered BODY_SITE other than 'Blood' and 'Saliva' in TOPMed sample metadata - "
                + body_site)
            sys.exit(1)

    # extract subject attributes
    gender = None
    age = None
    visit_year = None
    sys_bp = None
    dias_bp = None
    hypertension = "unknown"

    for var_name in subj_var_values:
        name_upper = var_name.upper()
        if name_upper == "GENDER" or name_upper == "SEX":
            gender = subj_var_values[var_name]['value'].lower()
        elif name_upper == "VISIT_AGE" or name_upper == "AGE" or name_upper == "AGE_ENROLL":  # need to confirm that these all mean the same thing
            age = subj_var_values[var_name]['value']
        elif name_upper == "VISIT_YEAR":
            visit_year = subj_var_values[var_name]['value']
        elif name_upper == "SYSBP":
            sys_bp = subj_var_values[var_name]['value']
        elif name_upper == "DIASBP":
            dias_bp = subj_var_values[var_name]['value']
        elif name_upper == "HYPERTENSION" or name_upper == "HIGHBLOODPRES":
            answer = subj_var_values[var_name]['value']
            if answer.lower() == "yes" or answer == '1':
                hypertension = "yes"
            elif re.match(r'\S', answer):
                # any other value starting with a non-blank character counts
                # as "no"; values that do not leave the status "unknown"
                hypertension = "no"

    # anatomical part
    anatomical_part = None
    if anatomy_name is not None:
        anatomy_identifier = OrderedDict([("identifier",
                                           "UBERON:" + str(anat_id)),
                                          ("identifierSource", "UBERON")])
        anatomy_alt_ids = [
            OrderedDict([
                ("identifier",
                 "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)),
                ("identifierSource", "UBERON")
            ])
        ]
        anatomical_part = DatsObj("AnatomicalPart",
                                  [("name", anatomy_name),
                                   ("identifier", anatomy_identifier),
                                   ("alternateIdentifiers", anatomy_alt_ids)])

    subject_characteristics = []
    subject_bearerOfDisease = []

    # harmonized/standardized characteristics, emitted only for the
    # attributes that were found above
    harmonized = (
        ("Gender", "Gender of the subject", gender),
        ("Age", "Age of the subject", age),
        ("Visit year", "Year of visit, to use for longitudinal analysis",
         visit_year),
        ("Systolic blood pressure",
         "Systolic blood pressure of subject, measured in mmHg", sys_bp),
        ("Diastolic blood pressure",
         "Diastolic blood pressure of subject, measured in mmHg", dias_bp),
    )
    for label, description, value in harmonized:
        if value is None:
            continue
        subject_characteristics.append(DatsObj(
            "Dimension",
            [("name", DatsObj("Annotation", [("value", label)])),
             ("description", description),
             ("values", [value])]))

    if hypertension != "unknown":
        disease_id = "10763"
        disease_identifier = OrderedDict([
            ("identifier", "DOID:" + str(disease_id)),
            ("identifierSource", "Disease Ontology")
        ])
        disease_alt_ids = [
            OrderedDict([
                ("identifier",
                 "http://purl.obolibrary.org/obo/DOID_" + str(disease_id)),
                ("identifierSource", "Disease Ontology")
            ])
        ]
        subject_hypertension = DatsObj("Disease", [
            ("name", "Hypertension"),
            ("identifier", disease_identifier),
            ("alternateIdentifiers", disease_alt_ids),
            ("diseaseStatus",
             DatsObj("Annotation", [("value", hypertension),
                                    ("valueIRI", "")])),
        ])
        subject_bearerOfDisease.append(subject_hypertension)

    # create a DATS Dimension from a dbGaP variable value
    def make_var_dimension(name, var_value):
        value = var_value["value"]

        dim = DatsObj("Dimension",
                      [("name", DatsObj("Annotation", [("value", name)])),
                       ("values", [value])])

        # find existing DATS identifier for the corresponding Dataset Dimension
        if "var" in var_value:
            var_id = var_value["var"]["id"]
            dbgap_var_dim = study_md['dbgap_vars'][var_id]
            dim.setProperty("identifier",
                            dbgap_var_dim.get("identifier").getIdRef())

        return dim

    # create DATS Dimensions for dbGaP subject metadata
    subject_dimensions = [
        make_var_dimension(vname, subj_var_values[vname])
        for vname in sorted(subj_var_values)
    ]

    # create DATS Dimensions for dbGaP sample metadata
    sample_dimensions = [
        make_var_dimension(vname, samp_var_values[vname])
        for vname in sorted(samp_var_values)
    ]

    # "raw" characteristics from dbGaP metadata
    subject_characteristics.extend(subject_dimensions)
    sample_characteristics = sample_dimensions

    human_t = util.get_taxon_human(cache)
    subj_id = subj_var_values['SUBJECT_ID']['value']
    dbgap_subj_id = subj_var_values['dbGaP_Subject_ID']['value']
    samp_id = samp_var_values['SAMPLE_ID']['value']
    dbgap_samp_id = samp_var_values['dbGaP_Sample_ID']['value']

    study_title = study.get("title")

    # human experimental subject/patient
    subject_material = DatsObj(
        "Material",
        [("name", subj_id), ("identifier", {
            "identifier": subj_id
        }),
         ("alternateIdentifiers", [util.get_alt_id(dbgap_subj_id, "dbGaP")]),
         ("description", study_title + " subject " + subj_id),
         ("characteristics", subject_characteristics),
         ("bearerOfDisease", subject_bearerOfDisease), ("taxonomy", [human_t]),
         ("roles", util.get_donor_roles(cache))])

    # TODO - use DatsObjCache
    specimen_annot = util.get_annotation("specimen")
    dna_extract_annot = util.get_annotation("DNA extract")

    # biological/tissue sample
    sample_name = samp_id
    sample_derives_from = [subject_material]
    sample_descr = "specimen collected from subject " + subj_id
    if anatomical_part is not None:
        sample_derives_from.append(anatomical_part)
        sample_descr = anatomy_name + " " + sample_descr

    biological_sample_material = DatsObj(
        "Material",
        [("name", sample_name), ("identifier", {
            "identifier": samp_id
        }),
         ("alternateIdentifiers", [util.get_alt_id(dbgap_samp_id, "dbGaP")]),
         ("description", sample_descr),
         ("characteristics", sample_characteristics), ("taxonomy", [human_t]),
         ("roles", [specimen_annot]), ("derivesFrom", sample_derives_from)])

    # DNA extracted from tissue sample
    dna_descr = "DNA extracted from specimen collected from subject " + subj_id
    if anatomical_part is not None:
        dna_descr = "DNA extracted from " + anatomy_name + " specimen collected from subject " + subj_id

    dna_material = DatsObj("Material",
                           [("name", "DNA from " + sample_name),
                            ("description", dna_descr),
                            ("taxonomy", [human_t]),
                            ("roles", [dna_extract_annot]),
                            ("derivesFrom", [biological_sample_material])])

    return dna_material
Esempio n. 3
0
def get_single_sample_json(sample, dats_obj_cache):
    """Build the DATS "Material" for the RNA extract of one GTEx sample.

    Args:
        sample: mapping of variable name to a dict with a ``'mapped_value'``
            entry (``SAMPID``, ``SUBJID``, ``SMUBRID`` and ``SMTSD`` are
            read), plus a ``'subject'`` entry of the same shape providing
            ``SEX``, ``AGE`` and ``DTHHRDY``.
        dats_obj_cache: mapping used to reuse the AnatomicalPart (key
            ``"AnatomicalPart:<name>"``) and subject Material (key
            ``"Material:<subject id>"``) across calls; also passed to the
            ``util`` helpers.

    Returns:
        The RNA extract ``DatsObj`` Material, which derives from the
        biological sample Material.

    Exits the process with status 1 when the sample has no anatomy id.
    """
    sample_id = sample['SAMPID']['mapped_value']
    subject_id = sample['SUBJID']['mapped_value']
    subject = sample['subject']

    # Uberon id (or EFO id, contrary to the documentation)
    anat_id = sample['SMUBRID']['mapped_value']
    if anat_id is None:
        print("No Uberon/anatomy ID specified for sample " + sample_id)
        sys.exit(1)

    # TODO - query anatomy term from UBERON/EFO instead?
    anatomy_name = sample['SMTSD']['mapped_value']

    # Pick the ontology from the shape of the id: ids starting with
    # "EFO_<digits>" are EFO terms, everything else is treated as Uberon.
    if re.match(r'^EFO_\d+', anat_id):
        ontology = "EFO"
        primary_id = anat_id
        term_uri = ("https://www.ebi.ac.uk/ols/ontologies/efo/terms?short_form=" +
                    str(anat_id))
    else:
        ontology = "UBERON"
        primary_id = "UBERON:" + str(anat_id)
        term_uri = "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)

    anatomy_identifier = OrderedDict([("identifier", primary_id),
                                      ("identifierSource", ontology)])
    anatomy_alt_ids = [
        OrderedDict([("identifier", term_uri),
                     ("identifierSource", ontology)])
    ]

    def fetch_or_build(key, build):
        # Return the cached object for key, building and storing it on a miss.
        if key in dats_obj_cache:
            return dats_obj_cache[key]
        built = build()
        dats_obj_cache[key] = built
        return built

    # anatomical part
    anatomical_part = fetch_or_build(
        ":".join(["AnatomicalPart", anatomy_name]),
        lambda: DatsObj("AnatomicalPart",
                        [("name", anatomy_name),
                         ("identifier", anatomy_identifier),
                         ("alternateIdentifiers", anatomy_alt_ids)]))

    def gtex_dimension(label, description, var_name):
        # Dimension for one subject variable, identified by its GTEx name.
        return DatsObj("Dimension", [
            ("name", DatsObj("Annotation", [("value", label)])),
            ("description", description),
            ("identifier",
             DatsObj("Identifier", [("identifier", var_name),
                                    ("identifierSource", "GTEx")])),
            ("values", [subject[var_name]['mapped_value']]),
        ])

    # Built unconditionally, before the subject cache lookup.
    subject_characteristics = [
        gtex_dimension("Gender", "Gender of the subject", "SEX"),
        gtex_dimension("Age range", "Age range of the subject", "AGE"),
        gtex_dimension("Hardy scale",
                       "Hardy scale death classification for the subject",
                       "DTHHRDY"),
    ]

    # human experimental subject/patient
    subject_material = fetch_or_build(
        ":".join(["Material", subject_id]),
        lambda: DatsObj("Material", [
            ("name", subject_id),
            ("identifier", {"identifier": subject_id}),
            ("description", "GTEx subject " + subject_id),
            ("characteristics", subject_characteristics),
            ("taxonomy", [util.get_taxon_human(dats_obj_cache)]),
            ("roles", util.get_donor_roles(dats_obj_cache)),
        ]))

    specimen_role = util.get_annotation("specimen", dats_obj_cache)
    rna_extract_role = util.get_annotation("RNA extract", dats_obj_cache)

    # biological/tissue sample
    tissue_sample = DatsObj("Material", [
        ("name", sample_id),
        ("identifier", {"identifier": sample_id}),
        ("description",
         anatomy_name + " specimen collected from subject " + subject_id),
        ("taxonomy", [util.get_taxon_human(dats_obj_cache)]),
        ("roles", [specimen_role]),
        ("derivesFrom", [subject_material, anatomical_part]),
    ])

    # RNA extracted from tissue sample
    return DatsObj("Material", [
        ("name", "RNA from " + sample_id),
        ("description", "total RNA extracted from " + anatomy_name +
         " specimen collected from subject " + subject_id),
        ("taxonomy", [util.get_taxon_human(dats_obj_cache)]),
        ("roles", [rna_extract_role]),
        ("derivesFrom", [tissue_sample]),
    ])
Esempio n. 4
0
def get_sample_dats_material(cache, dats_subject, study, study_md,
                             samp_var_values):
    """Build the DATS "Material" for the DNA extract of one TOPMed sample.

    Args:
        cache: object passed to the ``util`` helpers and whose
            ``get_obj_or_ref`` is used for the AnatomicalPart (key
            ``"AnatomicalPart:<name>"``) and the subject (key
            ``"Material:<subject name>"``).
        dats_subject: subject Material; only its ``get("name")`` value is
            read here.
        study: not used by this function body.
        study_md: not used by this function body.
        samp_var_values: mapping of dbGaP sample variable name to a dict with
            a ``'value'`` entry (and optionally ``'var'``). ``SAMPLE_ID`` and
            ``dbGaP_Sample_ID`` are required; the body site is read from
            ``BODY_SITE``, ``Body_Site`` or ``Body Site`` if present.

    Returns:
        The extract ``DatsObj`` Material, which derives from the biological
        sample Material.

    Exits the process with status 1 (after logging) when the body site is
    neither a blood variant nor saliva.
    """
    # Almost all samples in TOPMed WGS phase are blood samples, named "Blood", "Peripheral Blood"...
    # Few samples are saliva samples probably due to sample collection issues
    body_site_key = None
    for candidate in ('BODY_SITE', 'Body_Site', 'Body Site'):
        if candidate in samp_var_values:
            body_site_key = candidate
            break

    anat_id = None
    anatomy_name = None

    if body_site_key is not None:
        body_site = samp_var_values[body_site_key]['value']
        if "blood" in body_site.lower():
            anatomy_name = "blood"
            anat_id = "0000178"
        elif body_site.lower() == "saliva":
            anatomy_name = "saliva"
            anat_id = "0001836"
        else:
            # Report the value under the key spelling that was actually found;
            # indexing 'BODY_SITE' unconditionally raised KeyError for the
            # 'Body_Site' / 'Body Site' spellings.
            logging.fatal(
                "encountered BODY_SITE other than 'Blood' and 'Saliva' in TOPMed sample metadata - "
                + body_site)
            sys.exit(1)

    def make_anat_part(anat_id, anatomy_name):
        # anatomical part
        anatomical_part = DatsObj("AnatomicalPart", [("name", anatomy_name)])

        if anat_id is not None:
            anatomy_identifier = OrderedDict([("identifier",
                                               "UBERON:" + str(anat_id)),
                                              ("identifierSource", "UBERON")])
            anatomy_alt_ids = [
                OrderedDict([
                    ("identifier",
                     "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)),
                    ("identifierSource", "UBERON")
                ])
            ]

            anatomical_part.set("identifier", anatomy_identifier)
            anatomical_part.set("alternateIdentifiers", anatomy_alt_ids)

        return anatomical_part

    if anatomy_name is not None:
        # use cached value for AnatomicalPart if possible
        anat_part_key = ":".join(["AnatomicalPart", anatomy_name])
        anatomical_part = cache.get_obj_or_ref(
            anat_part_key, lambda: make_anat_part(anat_id, anatomy_name))
    else:
        anatomical_part = None

    # create a DATS Dimension from a dbGaP variable value
    def make_var_dimension(name, var_value):
        value = var_value["value"]

        dim = DatsObj("Dimension",
                      [("name", DatsObj("Annotation", [("value", name)])),
                       ("values", [value])])

        # find existing DATS identifier for the corresponding Dataset Dimension
        if "var" in var_value:
            dbgap_var_dim = var_value["var"]["dim"]
            dim.setProperty("identifier",
                            dbgap_var_dim.get("identifier").getIdRef())
        return dim

    # create DATS Dimensions for dbGaP sample metadata
    sample_characteristics = [
        make_var_dimension(vname, samp_var_values[vname])
        for vname in sorted(samp_var_values)
    ]

    samp_id = samp_var_values['SAMPLE_ID']['value']
    dbgap_samp_id = samp_var_values['dbGaP_Sample_ID']['value']
    specimen_annot = util.get_annotation("specimen", cache)

    # corresponding DATS subject Material
    dats_subj_name = dats_subject.get("name")
    subj_key = ":".join(["Material", dats_subj_name])
    dats_subj = cache.get_obj_or_ref(subj_key, lambda: dats_subject)

    # biological/tissue sample
    sample_name = samp_id
    sample_derives_from = [dats_subj]

    sample_descr = "specimen collected from subject " + dats_subj_name
    if anatomical_part is not None:
        sample_derives_from.append(anatomical_part)
        sample_descr = anatomy_name + " " + sample_descr

    biological_sample_material = DatsObj(
        "Material",
        [("name", sample_name), ("identifier", {
            "identifier": samp_id
        }),
         ("alternateIdentifiers", [util.get_alt_id(dbgap_samp_id, "dbGaP")]),
         ("description", sample_descr),
         ("characteristics", sample_characteristics),
         ("taxonomy", [util.get_taxon_human(cache)]),
         ("roles", [specimen_annot]), ("derivesFrom", sample_derives_from)])

    # RNA or DNA extracted from tissue sample
    stype = "DNA"
    # TODO - check if RNA, not DNA

    dna_or_rna_descr = stype + " extracted from specimen collected from subject " + dats_subj_name
    if anatomical_part is not None:
        dna_or_rna_descr = stype + " extracted from " + anatomy_name + " specimen collected from subject " + dats_subj_name

    dna_or_rna_material = DatsObj(
        "Material",
        [("name", stype + " from " + sample_name),
         ("description", dna_or_rna_descr),
         ("taxonomy", [util.get_taxon_human(cache)]),
         ("roles", [util.get_annotation(stype + " extract", cache)]),
         ("derivesFrom", [biological_sample_material])])
    return dna_or_rna_material
Esempio n. 5
0
def get_subject_dats_material(cache, study, study_md, subj_var_values):
    """Build the DATS "Material" describing one TOPMed subject.

    A handful of well-known dbGaP variables (sex, age, visit year, blood
    pressures, hypertension) are turned into standardized characteristics;
    every variable is additionally emitted as a "raw" Dimension.

    Args:
        cache: object passed to the ``util`` helpers and whose
            ``get_obj_or_ref`` is called with the key
            ``"Material:<SUBJECT_ID>"``.
        study: object whose ``get("title")`` value prefixes the description.
        study_md: not used by this function body.
        subj_var_values: mapping of dbGaP subject variable name to a dict with
            a ``'value'`` entry (and optionally ``'var'``). ``SUBJECT_ID`` and
            ``dbGaP_Subject_ID`` are required.

    Returns:
        Whatever ``cache.get_obj_or_ref`` returns for the subject key.
    """
    # extract subject attributes; when several variables map to the same
    # attribute the last one iterated wins
    gender = age = visit_year = sys_bp = dias_bp = None
    hypertension_status = "unknown"

    for var_name, var in subj_var_values.items():
        key = var_name.upper()
        if key in ("GENDER", "SEX"):
            gender = var['value'].lower()
        elif key in ("VISIT_AGE", "AGE", "AGE_ENROLL"):
            # need to confirm that these all mean the same thing
            age = var['value']
        elif key == "VISIT_YEAR":
            visit_year = var['value']
        elif key == "SYSBP":
            sys_bp = var['value']
        elif key == "DIASBP":
            dias_bp = var['value']
        elif key in ("HYPERTENSION", "HIGHBLOODPRES"):
            answer = var['value']
            if answer.lower() == "yes" or answer == '1':
                hypertension_status = "yes"
            elif re.match(r'\S', answer):
                # any other value starting with a non-blank character
                hypertension_status = "no"

    # harmonized/standardized characteristics, in a fixed order, skipping
    # attributes that were not found
    harmonized_specs = (
        ("Gender", "Gender of the subject", gender),
        ("Age", "Age of the subject", age),
        ("Visit year", "Year of visit, to use for longitudinal analysis",
         visit_year),
        ("Systolic blood pressure",
         "Systolic blood pressure of subject, measured in mmHg", sys_bp),
        ("Diastolic blood pressure",
         "Diastolic blood pressure of subject, measured in mmHg", dias_bp),
    )
    subject_characteristics = []
    for label, description, value in harmonized_specs:
        if value is None:
            continue
        subject_characteristics.append(DatsObj("Dimension", [
            ("name", DatsObj("Annotation", [("value", label)])),
            ("description", description),
            ("values", [value]),
        ]))

    subject_bearerOfDisease = []
    if hypertension_status != "unknown":
        disease_id = "10763"
        doid_identifier = OrderedDict([
            ("identifier", "DOID:" + str(disease_id)),
            ("identifierSource", "Disease Ontology"),
        ])
        doid_alt_ids = [OrderedDict([
            ("identifier",
             "http://purl.obolibrary.org/obo/DOID_" + str(disease_id)),
            ("identifierSource", "Disease Ontology"),
        ])]
        status_annot = DatsObj("Annotation", [("value", hypertension_status),
                                              ("valueIRI", "")])
        subject_bearerOfDisease.append(DatsObj("Disease", [
            ("name", "Hypertension"),
            ("identifier", doid_identifier),
            ("alternateIdentifiers", doid_alt_ids),
            ("diseaseStatus", status_annot),
        ]))

    # create a DATS Dimension from a dbGaP variable value
    def make_var_dimension(name, var_value):
        dim = DatsObj("Dimension", [
            ("name", DatsObj("Annotation", [("value", name)])),
            ("values", [var_value["value"]]),
        ])
        # find existing DATS identifier for the corresponding Dataset Dimension
        if "var" in var_value:
            dataset_dim = var_value["var"]["dim"]
            dim.setProperty("identifier",
                            dataset_dim.get("identifier").getIdRef())
        return dim

    # "raw" characteristics from dbGaP metadata, in sorted variable-name order
    for vname in sorted(subj_var_values):
        subject_characteristics.append(
            make_var_dimension(vname, subj_var_values[vname]))

    human_t = util.get_taxon_human(cache)
    subj_id = subj_var_values['SUBJECT_ID']['value']
    dbgap_subj_id = subj_var_values['dbGaP_Subject_ID']['value']
    study_title = study.get("title")

    # human experimental subject/patient
    subject_material = DatsObj("Material", [
        ("name", subj_id),
        ("identifier", {"identifier": subj_id}),
        ("alternateIdentifiers", [util.get_alt_id(dbgap_subj_id, "dbGaP")]),
        ("description", study_title + " subject " + subj_id),
        ("characteristics", subject_characteristics),
        ("bearerOfDisease", subject_bearerOfDisease),
        ("taxonomy", [human_t]),
        ("roles", util.get_donor_roles(cache)),
    ])

    # add to the cache
    return cache.get_obj_or_ref(":".join(["Material", subj_id]),
                                lambda: subject_material)
Esempio n. 6
0
def get_sample_dats_material(cache, dats_subject, p_sample, gh_sample,
                             var_lookup):
    """Build the DATS "Material" for the DNA or RNA extract of a GTEx sample.

    Args:
        cache: object passed to the ``util`` helpers and whose
            ``get_obj_or_ref`` is used for the AnatomicalPart (key
            ``"AnatomicalPart:<name>"``) and the subject (key
            ``"Material:<subject name>"``).
        dats_subject: subject Material; its ``get("name")`` value forms the
            cache key.
        p_sample: mapping of variable name to a dict with a
            ``'mapped_value'`` entry. ``SAMPID``, ``SUBJID``, ``SMUBRID``,
            ``SMTSD``, ``SMAFRZE`` and ``SMNABTCHT`` are read.
        gh_sample: entry from the GitHub GTEx id dump, or None. When present,
            its ``['Destination URL']['raw_value']`` is used as the sample
            identifier instead of the sample id.
        var_lookup: mapping of variable name to a dict whose ``'dim'`` entry
            supports ``get("identifier").getIdRef()``.

    Returns:
        The extract ``DatsObj`` Material, or None when the sequence type can
        be determined from neither ``SMNABTCHT`` nor ``SMAFRZE``.

    Exits the process with status 1 when the anatomy id is missing, when
    ``SMAFRZE`` has an unknown value, or when the sequence type implied by
    ``SMNABTCHT`` contradicts the one implied by ``SMAFRZE``.
    """
    samp_id = p_sample['SAMPID']['mapped_value']
    subj_id = p_sample['SUBJID']['mapped_value']

    # retrieve id reference for the Identifier of the DATS Dimension for the "all subjects" consent group version of the variable
    def get_var_id(name):
        return var_lookup[name]['dim'].get("identifier").getIdRef()

    # Uberon id (or EFO id, contrary to the documentation)
    anat_id = p_sample['SMUBRID']['mapped_value']
    if anat_id is None:
        print("No Uberon/anatomy ID specified for sample " + samp_id)
        sys.exit(1)

    # TODO - query anatomy term from UBERON/EFO instead?
    anatomy_name = p_sample['SMTSD']['mapped_value']

    def make_anat_part(anat_id, anatomy_name):
        # EFO id
        if re.match(r'^EFO_\d+', anat_id):
            anatomy_identifier = OrderedDict([("identifier", anat_id),
                                              ("identifierSource", "EFO")])
            anatomy_alt_ids = [
                OrderedDict([(
                    "identifier",
                    "https://www.ebi.ac.uk/ols/ontologies/efo/terms?short_form="
                    + str(anat_id)), ("identifierSource", "EFO")])
            ]
        # Uberon id
        else:
            anatomy_identifier = OrderedDict([("identifier",
                                               "UBERON:" + str(anat_id)),
                                              ("identifierSource", "UBERON")])
            anatomy_alt_ids = [
                OrderedDict([
                    ("identifier",
                     "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)),
                    ("identifierSource", "UBERON")
                ])
            ]

        # anatomical part
        return DatsObj("AnatomicalPart",
                       [("name", anatomy_name),
                        ("identifier", anatomy_identifier),
                        ("alternateIdentifiers", anatomy_alt_ids)])

    # use cached value for AnatomicalPart if possible
    anat_part_key = ":".join(["AnatomicalPart", anatomy_name])
    anatomical_part = cache.get_obj_or_ref(
        anat_part_key, lambda: make_anat_part(anat_id, anatomy_name))

    # use URI from GitHub GTEx id dump if available
    # (the URI used to be stored in an unused variable, so the sample id was
    # always emitted as the identifier)
    identifier = samp_id
    if gh_sample is not None:
        identifier = gh_sample['Destination URL']['raw_value']

    subj_key = ":".join(["Material", dats_subject.get("name")])
    dats_subj = cache.get_obj_or_ref(subj_key, lambda: dats_subject)

    # add sample characteristics from p_sample metadata
    sample_chars = []
    for key in p_sample:
        if re.match(r'^(subject|id)$', key):
            continue
        # TODO - currently including only a small subset of the available values for demonstration purposes
        if not re.match(r'^(SMATSSCR|SMRIN|SMMAPRT|SMGNSDTC)$', key):
            continue
        mapped_val = p_sample[key]['mapped_value']
        char = DatsObj("Dimension",
                       [("name", util.get_value_annotation(key, cache)),
                        ("identifier", get_var_id(key)),
                        ("values", [mapped_val])])
        sample_chars.append(char)

    # biological/tissue sample
    biological_sample_material = DatsObj(
        "Material",
        [("name", samp_id), ("identifier", {
            "identifier": identifier
        }),
         ("description",
          anatomy_name + " specimen collected from subject " + subj_id),
         ("characteristics", sample_chars),
         ("taxonomy", [util.get_taxon_human(cache)]),
         ("roles", [util.get_annotation("specimen", cache)]),
         ("derivesFrom", [dats_subj, anatomical_part])])

    # analysis freeze classification
    smafrze = p_sample['SMAFRZE']['mapped_value']

    # expected sequence type depending on data freeze classification;
    # "EXCLUDE" is a known value that implies no particular type
    # (OMNI is the Illumina OMNI SNP Array)
    expected_stype_by_freeze = {
        "RNASEQ": "RNA",
        "WGS": "DNA",
        "WES": "DNA",
        "OMNI": "DNA",
        "EXCLUDE": None,
    }
    if smafrze not in expected_stype_by_freeze:
        logging.fatal("unknown SMAFRZE " + smafrze)
        sys.exit(1)
    expected_stype = expected_stype_by_freeze[smafrze]

    # sample type - DNA or RNA
    stype = None
    smnabtcht = p_sample['SMNABTCHT']['mapped_value']
    if re.match(r'^DNA ([iI]solation|[eE]xtraction).*', smnabtcht):
        stype = 'DNA'
    elif re.match(r'^RNA ([iI]solation|[eE]xtraction).*', smnabtcht):
        stype = 'RNA'
    elif re.match(
            r'DNA or RNA Extraction from Paxgene-derived Lysate Plate Based',
            smnabtcht):
        stype = 'RNA'
    elif re.match(r'Transfer To Matrix \(Manual\)', smnabtcht):
        stype = 'DNA'

    if stype is None:
        # batch type was inconclusive: fall back on the freeze classification
        if expected_stype is not None:
            stype = expected_stype
        else:
            print("couldn't determine sequence type for smafrze=" + smafrze +
                  " smnabtcht=" + smnabtcht)
            return None
    else:
        if (expected_stype is not None) and (stype != expected_stype):
            logging.fatal("seq type " + stype +
                          " doesn't match expected stype " + expected_stype)
            sys.exit(1)

    # DNA or RNA extract
    dna_or_rna_material = DatsObj(
        "Material",
        [("name", stype + " from " + samp_id),
         ("description", "total " + stype + " extracted from " + anatomy_name +
          " specimen collected from subject " + subj_id),
         ("taxonomy", [util.get_taxon_human(cache)]),
         ("roles", [util.get_annotation(stype + " extract", cache)]),
         ("derivesFrom", [biological_sample_material])])

    return dna_or_rna_material
Esempio n. 7
0
def get_single_dna_extract_json(study, subj_var_values, samp_var_values):
    """Build the DATS Material describing the DNA extract of one TOPMed sample.

    Three Materials are created and chained through ``derivesFrom``:
    subject -> blood specimen -> DNA extract. Only the DNA extract is
    returned; the specimen and subject are reachable from it.

    Args:
        study: object exposing ``get("title")``; the title is used as the
            prefix of the subject's description.
        subj_var_values: mapping of dbGaP subject variable name to value.
            'SUBJECT_ID' and 'dbGaP_Subject_ID' are required; 'GENDER'
            (lower-cased) and 'VISIT_AGE' are used when present. Every entry
            is also copied into the subject's ``extraProperties``.
        samp_var_values: mapping of dbGaP sample variable name to value.
            'BODY_SITE', 'SAMPLE_ID' and 'dbGaP_Sample_ID' are required.
            Every entry is also copied into the specimen's
            ``extraProperties``.

    Returns:
        The DatsObj "Material" for the DNA extract.

    Raises:
        KeyError: if one of the required variables is missing.
        SystemExit: (status 1, after logging) if BODY_SITE is not 'Blood'.
    """

    # all samples in TOPMed WGS phase are blood samples
    body_site = samp_var_values['BODY_SITE']
    if body_site != 'Blood':
        logging.fatal(
            "encountered BODY_SITE other than 'Blood' in TOPMed sample metadata - "
            + body_site)
        sys.exit(1)

    anatomy_name = "blood"
    anat_id = "0000178"

    anatomy_identifier = OrderedDict([("identifier", "UBERON:" + anat_id),
                                      ("identifierSource", "UBERON")])
    anatomy_alt_ids = [
        OrderedDict([("identifier",
                      "http://purl.obolibrary.org/obo/UBERON_" + anat_id),
                     ("identifierSource", "UBERON")])
    ]

    # extract subject attributes; both are optional and default to None
    gender = subj_var_values.get("GENDER")
    if gender is not None:
        gender = gender.lower()
    age = subj_var_values.get("VISIT_AGE")
    # TODO - determine what other subject attributes can be mapped directly to core DATS objects

    def extra_props(var_values):
        # one CategoryValuesPair per original dbGaP variable, sorted by name
        return [
            DatsObj("CategoryValuesPair", [("category", var_name),
                                           ("values", [var_values[var_name]])])
            for var_name in sorted(var_values)
        ]

    # place original dbGaP subject metadata into extraProperties
    # TODO - consider alternative of doing this only for un-harmonized metadata
    subj_extra_props = extra_props(subj_var_values)

    # TODO - determine which DATS objects (e.g., biological sample, DNA prep,
    # sequence data) the SEQUENCING_CENTER sample property should attach to
    # TODO - determine what other sample attributes can be mapped directly to core DATS objects
    # e.g., IS_TUMOR -> bearerOfDisease ("the pathology affecting the material...")

    # place original dbGaP sample metadata into extraProperties
    samp_extra_props = extra_props(samp_var_values)

    # anatomical part
    anatomical_part = DatsObj("AnatomicalPart",
                              [("name", anatomy_name),
                               ("identifier", anatomy_identifier),
                               ("alternateIdentifiers", anatomy_alt_ids)])

    subject_sex = DatsObj("Dimension",
                          [("name", {"value": "Gender"}),
                           ("description", "Gender of the subject"),
                           ("values", [gender])])

    subject_age = DatsObj("Dimension",
                          [("name", {"value": "Age"}),
                           ("description", "Age of the subject"),
                           ("values", [age])])

    subject_characteristics = [subject_sex, subject_age]

    human_t = util.get_taxon_human()
    subj_id = subj_var_values['SUBJECT_ID']
    dbgap_subj_id = subj_var_values['dbGaP_Subject_ID']
    samp_id = samp_var_values['SAMPLE_ID']
    dbgap_samp_id = samp_var_values['dbGaP_Sample_ID']

    study_title = study.get("title")

    # human experimental subject/patient
    subject_material = DatsObj(
        "Material",
        [("name", subj_id),
         ("identifier", {"identifier": subj_id}),
         ("alternateIdentifiers", [util.get_alt_id(dbgap_subj_id, "dbGaP")]),
         ("description", study_title + " subject " + subj_id),
         ("characteristics", subject_characteristics),
         ("taxonomy", human_t),
         ("roles", util.get_donor_roles()),
         ("extraProperties", subj_extra_props)])

    # biological/tissue sample; the sample id doubles as its name
    specimen_description = (anatomy_name + " specimen collected from subject "
                            + subj_id)
    biological_sample_material = DatsObj(
        "Material",
        [("name", samp_id),
         ("identifier", {"identifier": samp_id}),
         ("alternateIdentifiers", [util.get_alt_id(dbgap_samp_id, "dbGaP")]),
         ("description", specimen_description),
         ("taxonomy", human_t),
         ("roles", [OrderedDict([("value", "specimen"), ("valueIRI", "")])]),
         ("derivesFrom", [subject_material, anatomical_part]),
         ("extraProperties", samp_extra_props)])

    # DNA extracted from tissue sample
    dna_material = DatsObj(
        "Material",
        [("name", "DNA from " + samp_id),
         ("description", "DNA extracted from " + specimen_description),
         ("taxonomy", human_t),
         ("roles", [OrderedDict([("value", "DNA extract"),
                                 ("valueIRI", "")])]),
         ("derivesFrom", [biological_sample_material])])

    return dna_material
Esempio n. 8
0
def _gene_publication(identifier, source):
    """Return a DATS Publication holding one Identifier issued by `source`."""
    # NOTE(review): the property key is capitalized ("Identifier") while the
    # MolecularEntity objects built in get_gene_json use "identifier". Kept
    # as-is so the emitted JSON does not change - confirm against the DATS
    # Publication schema.
    return DatsObj("Publication",
                   [("Identifier",
                     DatsObj("Identifier", [("identifier", identifier),
                                            ("identifierSource", source)]))])


def _gene_date_assigned(date):
    """Return a DATS Date of type "Date Assigned" for `date`."""
    return DatsObj("Date",
                   [("date", date),
                    ("type",
                     DatsObj("Annotation", [("value", "Date Assigned")]))])


def _gene_disease_relations(gene_id, diseases):
    """Return one relatedEntities entry per distinct disease id of `gene_id`."""
    gene_diseases = search_dict('object_id', gene_id, diseases)
    relations = []

    # de-duplicate in first-seen order so the output order is reproducible
    for do_id in OrderedDict.fromkeys(d['do_id'] for d in gene_diseases):
        disease_id = DatsObj(
            "Annotation",
            [("value", do_id),
             # do_id[5:] drops the first five characters (presumably a
             # "DOID:" prefix - verify against the disease JSON)
             ("valueIRI", "http://purl.obolibrary.org/obo/DOID_" + do_id[5:])])

        select_diseases = search_dict('do_id', do_id, gene_diseases)

        relation = DatsObj(
            "Annotation",
            [("value", "Disease"),
             ("valueIRI", "http://purl.obolibrary.org/obo/DOID_4")])

        # NOTE(review): only the evidence codes of the FIRST matching record
        # are emitted, even when several records share this disease id.
        evd_ids = [
            DatsObj("Annotation",
                    [("value", code),
                     ("valueIRI",
                      "http://purl.obolibrary.org/obo/" + EVID[code])])
            for code in select_diseases[0]['evidence_codes']
        ]

        # account for multiple publications per disease id:
        # all PubMed ids first, then all MOD-specific publication ids
        pub_ids = [
            _gene_publication(rec['pubmed_id'], "PubMed")
            for rec in select_diseases
        ]
        for rec in select_diseases:
            mod_pub = rec['mod_pub_id']
            # the first three characters of the id name its source
            pub_ids.append(_gene_publication(mod_pub, mod_pub[:3]))

        relation_evidence = OrderedDict([
            ("evidenceCodes", evd_ids),
            ("publications", pub_ids),
            ("dateEstablished",
             _gene_date_assigned(select_diseases[0]['date_ass'])),
        ])

        relations.append(OrderedDict([
            ("object", disease_id),
            ("relation", relation),
            ("resultingFrom",
             DatsObj("Activity",
                     [("name", select_diseases[0]['association_type'])])),
            ("relationEvidence", relation_evidence),
        ]))

    return relations


def _gene_phenotype_relations(gene_id, phenotypes):
    """Return one relatedEntities entry per distinct phenotype term of `gene_id`.

    Assumes one phenotype term id per record, as in the RGD and MGI phenotype
    JSONs.
    """
    gene_phenotypes = search_dict('object_id', gene_id, phenotypes)
    relations = []

    # de-duplicate in first-seen order so the output order is reproducible
    for term in OrderedDict.fromkeys(
            p['phe_term_ids'] for p in gene_phenotypes):
        select_phenotypes = search_dict('phe_term_ids', term, gene_phenotypes)
        first_term = select_phenotypes[0]['phe_term_ids']

        term_id = DatsObj(
            "Annotation",
            [("value", first_term),
             # first_term[3:] drops the first three characters (presumably an
             # "MP:" prefix - verify against the phenotype JSON)
             ("valueIRI",
              "http://purl.obolibrary.org/obo/MP_" + first_term[3:])])

        relation = DatsObj("Annotation", [
            ("value", "Phenotype"),
            ("valueIRI", "http://purl.obolibrary.org/obo/OGMS_0000023")
        ])

        # account for multiple publications per phenotype; records without a
        # PubMed id carry an empty string and are skipped
        pub_ids = [
            _gene_publication(rec['pubmed_id'], "PubMed")
            for rec in select_phenotypes
            if rec['pubmed_id'] != ""
        ]
        for rec in select_phenotypes:
            mod_pub = rec['mod_pub_id']
            # the first three characters of the id name its source
            pub_ids.append(_gene_publication(mod_pub, mod_pub[:3]))

        relation_evidence = OrderedDict([
            ("publications", pub_ids),
            ("dateEstablished",
             _gene_date_assigned(select_phenotypes[0]['date_ass'])),
        ])

        relations.append(OrderedDict([("object", term_id),
                                      ("relation", relation),
                                      ("relationEvidence",
                                       relation_evidence)]))

    return relations


def _gene_ortholog_relations(cache, gene_id, orthologs):
    """Return one relatedEntities entry per human ortholog of `gene_id`.

    Logs and exits the process with status 1 if an ortholog's taxon does not
    contain '9606'.
    """
    relations = []

    for o in search_dict('mod_gene_id', gene_id, orthologs):
        if '9606' not in o['ortho_taxon']:
            logging.fatal("encountered taxonomy other human - " +
                          o['ortho_taxon'])
            sys.exit(1)
        o_taxon = util.get_taxon_human(cache)

        mol_entity_ortholog = DatsObj("MolecularEntity", [
            ("identifier",
             DatsObj("Identifier", [("identifier", o['ortho_gene_id'])])),
            ("name", o['ortho_gene_id']),
            ("taxonomy", [o_taxon]),
            # alternateIdentifiers is a list everywhere else it is emitted,
            # so the single alternate id is wrapped in one here too
            ("alternateIdentifiers",
             [util.get_alt_id(o['ortho_gene_symbol'], "Gene Symbol")]),
        ])

        relations.append(OrderedDict([
            ("object", mol_entity_ortholog),
            ("relation",
             DatsObj("Annotation",
                     [("value", "Orthology"),
                      ("valueIRI",
                       "http://purl.obolibrary.org/obo/HOM_0000017")]))
        ]))

    return relations


def get_gene_json(cache, mod, gff3_json_path, orthologs):
    """Build one DATS MolecularEntity per gene feature of a model organism DB.

    Gene features, disease annotations and phenotype annotations are loaded
    with read_bgi, read_disease and read_phenotype respectively (each called
    with `cache`, `mod` and `gff3_json_path`). For every feature the matching
    disease, phenotype and ortholog records are attached as
    ``relatedEntities``, in that order.

    Args:
        cache: passed through to the readers and to util.get_taxon_human.
        mod: passed through to the readers.
        gff3_json_path: passed through to the readers.
        orthologs: ortholog records, searched by their 'mod_gene_id' against
            each feature's 'primaryId'.

    Returns:
        A list of DatsObj "MolecularEntity", one per feature, in feature
        order.

    Raises:
        SystemExit: (status 1, after logging) if an ortholog's taxon does not
            contain '9606'.
    """

    # read gene features from BGI file
    features = read_bgi(cache, mod, gff3_json_path)

    # read disease info from disease JSON file
    diseases = read_disease(cache, mod, gff3_json_path)

    # read phenotype info from phenotype JSON file
    phenotypes = read_phenotype(cache, mod, gff3_json_path)

    # TODO - read gene features from GFF3 file

    genes = []

    for f in features:
        gene_id = f['primaryId']

        genome_location = DatsObj("GenomeLocation",
                                  [("assembly", f['assembly']),
                                   ("chromosome", f['chr']),
                                   ("startPosition", f['start']),
                                   ("endPosition", f['end']),
                                   ("strand", f['strand'])])

        roles = [
            DatsObj("Annotation",
                    [("value", SOID[f['soid']]),
                     # f['soid'][3:] drops the first three characters
                     # (presumably an "SO:" prefix - verify against the BGI)
                     ("valueIRI",
                      "http://purl.obolibrary.org/obo/SO_" + f['soid'][3:])])
        ]

        # "NA" marks a feature without alternate ids; otherwise each entry is
        # a "<source>:<id>" string
        alternate_ids = []
        if f['alt_ids'] != "NA":
            for alt in f['alt_ids']:
                source, local_id = alt.split(':')
                alternate_ids.append(util.get_alt_id(local_id, source))

        related_entities = (
            _gene_disease_relations(gene_id, diseases) +
            _gene_phenotype_relations(gene_id, phenotypes) +
            _gene_ortholog_relations(cache, gene_id, orthologs))

        gene = DatsObj(
            "MolecularEntity",
            [("identifier",
              DatsObj("Identifier", [("identifier", gene_id)])),
             ("name", gene_id),
             ("description", f['descr']),
             ("roles", roles),
             ("taxonomy", [f['taxon']]),
             ("genomeLocations", [genome_location]),
             ("alternateIdentifiers", alternate_ids),
             ("relatedEntities", related_entities)])
        genes.append(gene)

    return genes