예제 #1
0
def get_single_dna_extract_json(cache, study, study_md, subj_var_values,
                                samp_var_values):
    """Build the DATS Material for the DNA extracted from one sample.

    The returned "Material" derives from a biological sample Material, which
    in turn derives from the subject Material and, when the sample has a
    recognized body site, from an AnatomicalPart.

    Args:
        cache: passed through to ``util.get_taxon_human`` and
            ``util.get_donor_roles``.
        study: object whose ``get("title")`` is used in the subject
            description.
        study_md: study metadata; ``study_md['dbgap_vars']`` is indexed by
            dbGaP variable id to find the corresponding Dataset Dimension.
        subj_var_values: mapping of dbGaP subject variable name to a dict
            with a ``'value'`` entry and, optionally, a ``'var'`` entry.
            Must contain ``SUBJECT_ID`` and ``dbGaP_Subject_ID``.
        samp_var_values: same structure for the sample variables. Must
            contain ``SAMPLE_ID`` and ``dbGaP_Sample_ID``.

    Returns:
        The DatsObj "Material" describing the DNA extract.

    Logs a fatal error and exits the process with status 1 if a body site is
    present whose value neither contains "blood" nor equals "saliva"
    (case-insensitive).
    """
    # Almost all samples in TOPMed WGS phase are blood samples, named "Blood",
    # "Peripheral Blood"... A few samples are saliva samples, probably due to
    # sample collection issues.
    body_site_key = None
    for candidate in ('BODY_SITE', 'Body_Site', 'Body Site'):
        if candidate in samp_var_values:
            body_site_key = candidate
            break

    anat_id = None
    anatomy_name = None

    if body_site_key is not None:
        body_site = samp_var_values[body_site_key]['value']
        body_site_lower = body_site.lower()
        if "blood" in body_site_lower:
            anatomy_name = "blood"
            anat_id = "0000178"
        elif body_site_lower == "saliva":
            anatomy_name = "saliva"
            anat_id = "0001836"
        else:
            # Report the value stored under the key that was actually found;
            # a hard-coded 'BODY_SITE' lookup raises KeyError for the other
            # spellings and hides the intended fatal message.
            logging.fatal(
                "encountered BODY_SITE other than 'Blood' and 'Saliva' in TOPMed sample metadata - "
                + body_site)
            sys.exit(1)

    # extract subject attributes
    gender = None
    age = None
    visit_year = None
    sys_bp = None
    dias_bp = None
    hypertension_status = "unknown"

    for var_name in subj_var_values:
        name_upper = var_name.upper()
        if name_upper in ("GENDER", "SEX"):
            gender = subj_var_values[var_name]['value'].lower()
        # TODO - need to confirm that these all mean the same thing
        elif name_upper in ("VISIT_AGE", "AGE", "AGE_ENROLL"):
            age = subj_var_values[var_name]['value']
        elif name_upper == "VISIT_YEAR":
            visit_year = subj_var_values[var_name]['value']
        elif name_upper == "SYSBP":
            sys_bp = subj_var_values[var_name]['value']
        elif name_upper == "DIASBP":
            dias_bp = subj_var_values[var_name]['value']
        elif name_upper in ("HYPERTENSION", "HIGHBLOODPRES"):
            answer = subj_var_values[var_name]['value']
            if answer.lower() == "yes" or answer == '1':
                hypertension_status = "yes"
            # any other value that starts with a non-whitespace character
            # counts as "no"; blank values leave the status "unknown"
            elif re.match(r'\S', answer):
                hypertension_status = "no"

    # anatomical part
    anatomical_part = None
    if anatomy_name is not None:
        anatomy_identifier = OrderedDict([
            ("identifier", "UBERON:" + anat_id),
            ("identifierSource", "UBERON")
        ])
        anatomy_alt_ids = [
            OrderedDict([
                ("identifier",
                 "http://purl.obolibrary.org/obo/UBERON_" + anat_id),
                ("identifierSource", "UBERON")
            ])
        ]
        anatomical_part = DatsObj("AnatomicalPart",
                                  [("name", anatomy_name),
                                   ("identifier", anatomy_identifier),
                                   ("alternateIdentifiers", anatomy_alt_ids)])

    # create a DATS Dimension for a harmonized/standardized characteristic
    def make_harmonized_dimension(label, description, value):
        return DatsObj("Dimension",
                       [("name", DatsObj("Annotation", [("value", label)])),
                        ("description", description),
                        ("values", [value])])

    # harmonized/standardized characteristics, emitted only when present
    harmonized = [
        ("Gender", "Gender of the subject", gender),
        ("Age", "Age of the subject", age),
        ("Visit year", "Year of visit, to use for longitudinal analysis",
         visit_year),
        ("Systolic blood pressure",
         "Systolic blood pressure of subject, measured in mmHg", sys_bp),
        ("Diastolic blood pressure",
         "Diastolic blood pressure of subject, measured in mmHg", dias_bp),
    ]
    subject_characteristics = [
        make_harmonized_dimension(label, description, value)
        for label, description, value in harmonized
        if value is not None
    ]

    subject_bearerOfDisease = []
    if hypertension_status != "unknown":
        disease_id = "10763"
        disease_identifier = OrderedDict([
            ("identifier", "DOID:" + disease_id),
            ("identifierSource", "Disease Ontology")
        ])
        disease_alt_ids = [
            OrderedDict([
                ("identifier",
                 "http://purl.obolibrary.org/obo/DOID_" + disease_id),
                ("identifierSource", "Disease Ontology")
            ])
        ]
        subject_hypertension = DatsObj("Disease", [
            ("name", "Hypertension"),
            ("identifier", disease_identifier),
            ("alternateIdentifiers", disease_alt_ids),
            ("diseaseStatus",
             DatsObj("Annotation", [("value", hypertension_status),
                                    ("valueIRI", "")])),
        ])
        subject_bearerOfDisease.append(subject_hypertension)

    # create a DATS Dimension from a dbGaP variable value
    def make_var_dimension(name, var_value):
        value = var_value["value"]

        dim = DatsObj("Dimension",
                      [("name", DatsObj("Annotation", [("value", name)])),
                       ("values", [value])])

        # find existing DATS identifier for the corresponding Dataset Dimension
        if "var" in var_value:
            var_id = var_value["var"]["id"]
            dbgap_var_dim = study_md['dbgap_vars'][var_id]
            dim.setProperty("identifier",
                            dbgap_var_dim.get("identifier").getIdRef())

        return dim

    # create DATS Dimensions for dbGaP subject metadata
    subject_dimensions = [
        make_var_dimension(vname, subj_var_values[vname])
        for vname in sorted(subj_var_values)
    ]

    # create DATS Dimensions for dbGaP sample metadata
    sample_dimensions = [
        make_var_dimension(vname, samp_var_values[vname])
        for vname in sorted(samp_var_values)
    ]

    # "raw" characteristics from dbGaP metadata
    subject_characteristics.extend(subject_dimensions)
    sample_characteristics = sample_dimensions

    human_t = util.get_taxon_human(cache)
    subj_id = subj_var_values['SUBJECT_ID']['value']
    dbgap_subj_id = subj_var_values['dbGaP_Subject_ID']['value']
    samp_id = samp_var_values['SAMPLE_ID']['value']
    dbgap_samp_id = samp_var_values['dbGaP_Sample_ID']['value']

    study_title = study.get("title")

    # human experimental subject/patient
    subject_material = DatsObj(
        "Material",
        [("name", subj_id),
         ("identifier", {"identifier": subj_id}),
         ("alternateIdentifiers", [util.get_alt_id(dbgap_subj_id, "dbGaP")]),
         ("description", study_title + " subject " + subj_id),
         ("characteristics", subject_characteristics),
         ("bearerOfDisease", subject_bearerOfDisease),
         ("taxonomy", [human_t]),
         ("roles", util.get_donor_roles(cache))])

    # TODO - use DatsObjCache
    specimen_annot = util.get_annotation("specimen")
    dna_extract_annot = util.get_annotation("DNA extract")

    # biological/tissue sample
    sample_name = samp_id
    sample_derives_from = [subject_material]
    sample_descr = "specimen collected from subject " + subj_id
    if anatomical_part is not None:
        sample_derives_from.append(anatomical_part)
        sample_descr = anatomy_name + " " + sample_descr

    biological_sample_material = DatsObj(
        "Material",
        [("name", sample_name),
         ("identifier", {"identifier": samp_id}),
         ("alternateIdentifiers", [util.get_alt_id(dbgap_samp_id, "dbGaP")]),
         ("description", sample_descr),
         ("characteristics", sample_characteristics),
         ("taxonomy", [human_t]),
         ("roles", [specimen_annot]),
         ("derivesFrom", sample_derives_from)])

    # DNA extracted from tissue sample
    dna_descr = "DNA extracted from specimen collected from subject " + subj_id
    if anatomical_part is not None:
        dna_descr = ("DNA extracted from " + anatomy_name +
                     " specimen collected from subject " + subj_id)

    dna_material = DatsObj("Material",
                           [("name", "DNA from " + sample_name),
                            ("description", dna_descr),
                            ("taxonomy", [human_t]),
                            ("roles", [dna_extract_annot]),
                            ("derivesFrom", [biological_sample_material])])

    return dna_material
예제 #2
0
def get_sample_dats_material(cache, dats_subject, study, study_md,
                             samp_var_values):
    """Build the DATS Material for the DNA extracted from one sample.

    The returned "Material" derives from a biological sample Material, which
    in turn derives from the (cached) subject Material and, when the sample
    has a recognized body site, from a (cached) AnatomicalPart.

    Args:
        cache: object cache; its ``get_obj_or_ref`` is used for the
            AnatomicalPart and the subject, and it is passed through to the
            ``util`` lookups.
        dats_subject: DATS subject Material; its ``get("name")`` supplies the
            subject name used in the cache key and the descriptions.
        study: not used by this function; kept for interface compatibility.
        study_md: not used by this function; kept for interface
            compatibility.
        samp_var_values: mapping of dbGaP sample variable name to a dict with
            a ``'value'`` entry and, optionally, a ``'var'`` entry whose
            ``'dim'`` is the corresponding Dataset Dimension. Must contain
            ``SAMPLE_ID`` and ``dbGaP_Sample_ID``.

    Returns:
        The DatsObj "Material" describing the DNA extract.

    Logs a fatal error and exits the process with status 1 if a body site is
    present whose value neither contains "blood" nor equals "saliva"
    (case-insensitive).
    """
    # Almost all samples in TOPMed WGS phase are blood samples, named "Blood",
    # "Peripheral Blood"... A few samples are saliva samples, probably due to
    # sample collection issues.
    body_site_key = None
    for candidate in ('BODY_SITE', 'Body_Site', 'Body Site'):
        if candidate in samp_var_values:
            body_site_key = candidate
            break

    anat_id = None
    anatomy_name = None

    if body_site_key is not None:
        body_site = samp_var_values[body_site_key]['value']
        body_site_lower = body_site.lower()
        if "blood" in body_site_lower:
            anatomy_name = "blood"
            anat_id = "0000178"
        elif body_site_lower == "saliva":
            anatomy_name = "saliva"
            anat_id = "0001836"
        else:
            # Report the value stored under the key that was actually found;
            # a hard-coded 'BODY_SITE' lookup raises KeyError for the other
            # spellings and hides the intended fatal message.
            logging.fatal(
                "encountered BODY_SITE other than 'Blood' and 'Saliva' in TOPMed sample metadata - "
                + body_site)
            sys.exit(1)

    def make_anat_part(anat_id, anatomy_name):
        # anatomical part
        anatomical_part = DatsObj("AnatomicalPart", [("name", anatomy_name)])

        if anat_id is not None:
            anatomy_identifier = OrderedDict([
                ("identifier", "UBERON:" + str(anat_id)),
                ("identifierSource", "UBERON")
            ])
            anatomy_alt_ids = [
                OrderedDict([
                    ("identifier",
                     "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)),
                    ("identifierSource", "UBERON")
                ])
            ]

            anatomical_part.set("identifier", anatomy_identifier)
            anatomical_part.set("alternateIdentifiers", anatomy_alt_ids)

        return anatomical_part

    if anatomy_name is not None:
        # use cached value for AnatomicalPart if possible
        anat_part_key = ":".join(["AnatomicalPart", anatomy_name])
        anatomical_part = cache.get_obj_or_ref(
            anat_part_key, lambda: make_anat_part(anat_id, anatomy_name))
    else:
        anatomical_part = None

    # create a DATS Dimension from a dbGaP variable value
    def make_var_dimension(name, var_value):
        value = var_value["value"]

        dim = DatsObj("Dimension",
                      [("name", DatsObj("Annotation", [("value", name)])),
                       ("values", [value])])

        # find existing DATS identifier for the corresponding Dataset Dimension
        if "var" in var_value:
            dbgap_var_dim = var_value["var"]["dim"]
            dim.setProperty("identifier",
                            dbgap_var_dim.get("identifier").getIdRef())
        return dim

    # create DATS Dimensions for dbGaP sample metadata
    sample_characteristics = [
        make_var_dimension(vname, samp_var_values[vname])
        for vname in sorted(samp_var_values)
    ]

    samp_id = samp_var_values['SAMPLE_ID']['value']
    dbgap_samp_id = samp_var_values['dbGaP_Sample_ID']['value']
    specimen_annot = util.get_annotation("specimen", cache)

    # corresponding DATS subject Material
    dats_subj_name = dats_subject.get("name")
    subj_key = ":".join(["Material", dats_subj_name])
    dats_subj = cache.get_obj_or_ref(subj_key, lambda: dats_subject)

    # biological/tissue sample
    sample_name = samp_id
    sample_derives_from = [dats_subj]

    sample_descr = "specimen collected from subject " + dats_subj_name
    if anatomical_part is not None:
        sample_derives_from.append(anatomical_part)
        sample_descr = anatomy_name + " " + sample_descr

    biological_sample_material = DatsObj(
        "Material",
        [("name", sample_name),
         ("identifier", {"identifier": samp_id}),
         ("alternateIdentifiers", [util.get_alt_id(dbgap_samp_id, "dbGaP")]),
         ("description", sample_descr),
         ("characteristics", sample_characteristics),
         ("taxonomy", [util.get_taxon_human(cache)]),
         ("roles", [specimen_annot]),
         ("derivesFrom", sample_derives_from)])

    # RNA or DNA extracted from tissue sample
    stype = "DNA"
    # TODO - check if RNA, not DNA

    dna_or_rna_descr = (stype +
                        " extracted from specimen collected from subject " +
                        dats_subj_name)
    if anatomical_part is not None:
        dna_or_rna_descr = (stype + " extracted from " + anatomy_name +
                            " specimen collected from subject " +
                            dats_subj_name)

    dna_or_rna_material = DatsObj(
        "Material",
        [("name", stype + " from " + sample_name),
         ("description", dna_or_rna_descr),
         ("taxonomy", [util.get_taxon_human(cache)]),
         ("roles", [util.get_annotation(stype + " extract", cache)]),
         ("derivesFrom", [biological_sample_material])])
    return dna_or_rna_material
예제 #3
0
def get_subject_dats_material(cache, study, study_md, subj_var_values):
    """Create the DATS Material for one dbGaP subject and register it in the cache.

    Args:
        cache: object cache; passed to the ``util`` lookups and used, via
            ``get_obj_or_ref``, to register the subject under the key
            ``"Material:<SUBJECT_ID>"``.
        study: object whose ``get("title")`` is used in the description.
        study_md: not used by this function.
        subj_var_values: mapping of dbGaP subject variable name to a dict
            with a ``'value'`` entry and, optionally, a ``'var'`` entry whose
            ``'dim'`` is the corresponding Dataset Dimension. Must contain
            ``SUBJECT_ID`` and ``dbGaP_Subject_ID``.

    Returns:
        Whatever ``cache.get_obj_or_ref`` returns for the subject Material.
    """
    # upper-cased dbGaP variable names feeding each harmonized attribute
    sex_vars = ("GENDER", "SEX")
    # TODO - need to confirm that these all mean the same thing
    age_vars = ("VISIT_AGE", "AGE", "AGE_ENROLL")
    hypertension_vars = ("HYPERTENSION", "HIGHBLOODPRES")

    # extract subject attributes
    sex = None
    subject_age = None
    year_of_visit = None
    systolic = None
    diastolic = None
    hypertension_status = "unknown"

    for var_name in subj_var_values:
        key = var_name.upper()
        if key in sex_vars:
            sex = subj_var_values[var_name]['value'].lower()
        elif key in age_vars:
            subject_age = subj_var_values[var_name]['value']
        elif key == "VISIT_YEAR":
            year_of_visit = subj_var_values[var_name]['value']
        elif key == "SYSBP":
            systolic = subj_var_values[var_name]['value']
        elif key == "DIASBP":
            diastolic = subj_var_values[var_name]['value']
        elif key in hypertension_vars:
            answer = subj_var_values[var_name]['value']
            if answer.lower() == "yes" or answer == '1':
                hypertension_status = "yes"
            # anything else starting with a non-whitespace character is "no"
            elif re.match(r'\S', answer):
                hypertension_status = "no"

    # build one harmonized/standardized Dimension
    def harmonized_dimension(label, description, value):
        annotation = DatsObj("Annotation", [("value", label)])
        return DatsObj("Dimension", [
            ("name", annotation),
            ("description", description),
            ("values", [value]),
        ])

    # harmonized/standardized characteristics, in output order; attributes
    # that were not found are left out
    harmonized = (
        ("Gender", "Gender of the subject", sex),
        ("Age", "Age of the subject", subject_age),
        ("Visit year", "Year of visit, to use for longitudinal analysis",
         year_of_visit),
        ("Systolic blood pressure",
         "Systolic blood pressure of subject, measured in mmHg", systolic),
        ("Diastolic blood pressure",
         "Diastolic blood pressure of subject, measured in mmHg", diastolic),
    )
    characteristics = [
        harmonized_dimension(label, description, value)
        for label, description, value in harmonized
        if value is not None
    ]

    diseases = []
    if hypertension_status != "unknown":
        doid = "10763"
        status_annotation = DatsObj("Annotation", [
            ("value", hypertension_status),
            ("valueIRI", ""),
        ])
        diseases.append(DatsObj("Disease", [
            ("name", "Hypertension"),
            ("identifier", OrderedDict([
                ("identifier", "DOID:" + doid),
                ("identifierSource", "Disease Ontology"),
            ])),
            ("alternateIdentifiers", [OrderedDict([
                ("identifier", "http://purl.obolibrary.org/obo/DOID_" + doid),
                ("identifierSource", "Disease Ontology"),
            ])]),
            ("diseaseStatus", status_annotation),
        ]))

    # create a DATS Dimension from a dbGaP variable value
    def raw_dimension(var_name, var_value):
        annotation = DatsObj("Annotation", [("value", var_name)])
        dimension = DatsObj("Dimension", [
            ("name", annotation),
            ("values", [var_value["value"]]),
        ])
        # find existing DATS identifier for the corresponding Dataset Dimension
        if "var" in var_value:
            dataset_dim = var_value["var"]["dim"]
            dimension.setProperty("identifier",
                                  dataset_dim.get("identifier").getIdRef())
        return dimension

    # "raw" characteristics from dbGaP metadata, sorted by variable name
    for var_name in sorted(subj_var_values):
        characteristics.append(
            raw_dimension(var_name, subj_var_values[var_name]))

    human_taxon = util.get_taxon_human(cache)
    subject_id = subj_var_values['SUBJECT_ID']['value']
    dbgap_subject_id = subj_var_values['dbGaP_Subject_ID']['value']
    title = study.get("title")

    # human experimental subject/patient
    alt_ids = [util.get_alt_id(dbgap_subject_id, "dbGaP")]
    description = title + " subject " + subject_id
    material = DatsObj("Material", [
        ("name", subject_id),
        ("identifier", {"identifier": subject_id}),
        ("alternateIdentifiers", alt_ids),
        ("description", description),
        ("characteristics", characteristics),
        ("bearerOfDisease", diseases),
        ("taxonomy", [human_taxon]),
        ("roles", util.get_donor_roles(cache)),
    ])

    # add to the cache
    return cache.get_obj_or_ref(":".join(["Material", subject_id]),
                                lambda: material)
예제 #4
0
def get_single_dna_extract_json(study, subj_var_values, samp_var_values):
    """Build the DATS Material for the DNA extracted from one blood sample.

    Args:
        study: object whose ``get("title")`` is used in the subject
            description.
        subj_var_values: mapping of dbGaP subject variable name to its value.
            Must contain ``SUBJECT_ID`` and ``dbGaP_Subject_ID``; ``GENDER``
            and ``VISIT_AGE`` are used when present.
        samp_var_values: mapping of dbGaP sample variable name to its value.
            Must contain ``BODY_SITE``, ``SAMPLE_ID`` and
            ``dbGaP_Sample_ID``.

    Returns:
        The DatsObj "Material" for the DNA extract; it derives from the
        biological sample, which derives from the subject and the blood
        AnatomicalPart.

    Logs a fatal error and exits the process with status 1 if ``BODY_SITE``
    is anything other than ``'Blood'``.
    """
    # all samples in TOPMed WGS phase are blood samples
    body_site = samp_var_values['BODY_SITE']
    if body_site != 'Blood':
        logging.fatal(
            "encountered BODY_SITE other than 'Blood' in TOPMed sample metadata - "
            + body_site)
        sys.exit(1)

    anatomy_name = "blood"
    uberon_id = "0000178"

    # extract subject attributes
    gender = None
    if "GENDER" in subj_var_values:
        gender = subj_var_values["GENDER"].lower()
    age = None
    if "VISIT_AGE" in subj_var_values:
        age = subj_var_values["VISIT_AGE"]
    # TODO - determine what other subject attributes can be mapped directly to core DATS objects

    # wrap every original dbGaP variable as a CategoryValuesPair, sorted by name
    def to_extra_props(var_values):
        return [
            DatsObj("CategoryValuesPair", [
                ("category", var_name),
                ("values", [var_values[var_name]]),
            ])
            for var_name in sorted(var_values)
        ]

    # place original dbGaP subject metadata into extraProperties
    # TODO - consider alternative of doing this only for un-harmonized metadata
    subj_extra_props = to_extra_props(subj_var_values)

    # TODO - determine which DATS objects (e.g., biological sample, DNA prep, sequence data)
    # the SEQUENCING_CENTER sample property should attach to
    # TODO - determine what other subject attributes can be mapped directly to core DATS objects
    # e.g., IS_TUMOR -> bearerOfDisease ("the pathology affecting the material...")

    # place original dbGaP sample metadata into extraProperties
    samp_extra_props = to_extra_props(samp_var_values)

    # anatomical part
    anatomical_part = DatsObj("AnatomicalPart", [
        ("name", anatomy_name),
        ("identifier", OrderedDict([
            ("identifier", "UBERON:" + uberon_id),
            ("identifierSource", "UBERON"),
        ])),
        ("alternateIdentifiers", [OrderedDict([
            ("identifier",
             "http://purl.obolibrary.org/obo/UBERON_" + uberon_id),
            ("identifierSource", "UBERON"),
        ])]),
    ])

    # Gender and Age are always emitted, even when the value is None
    subject_characteristics = [
        DatsObj("Dimension", [
            ("name", {"value": "Gender"}),
            ("description", "Gender of the subject"),
            ("values", [gender]),
        ]),
        DatsObj("Dimension", [
            ("name", {"value": "Age"}),
            ("description", "Age of the subject"),
            ("values", [age]),
        ]),
    ]

    human_taxon = util.get_taxon_human()
    subject_id = subj_var_values['SUBJECT_ID']
    dbgap_subject_id = subj_var_values['dbGaP_Subject_ID']
    sample_id = samp_var_values['SAMPLE_ID']
    dbgap_sample_id = samp_var_values['dbGaP_Sample_ID']
    title = study.get("title")

    # human experimental subject/patient
    subject = DatsObj("Material", [
        ("name", subject_id),
        ("identifier", {"identifier": subject_id}),
        ("alternateIdentifiers", [util.get_alt_id(dbgap_subject_id, "dbGaP")]),
        ("description", title + " subject " + subject_id),
        ("characteristics", subject_characteristics),
        ("taxonomy", human_taxon),
        ("roles", util.get_donor_roles()),
        ("extraProperties", subj_extra_props),
    ])

    # biological/tissue sample
    specimen_descr = (anatomy_name + " specimen collected from subject " +
                      subject_id)
    specimen = DatsObj("Material", [
        ("name", sample_id),
        ("identifier", {"identifier": sample_id}),
        ("alternateIdentifiers", [util.get_alt_id(dbgap_sample_id, "dbGaP")]),
        ("description", specimen_descr),
        ("taxonomy", human_taxon),
        ("roles", [OrderedDict([("value", "specimen"), ("valueIRI", "")])]),
        ("derivesFrom", [subject, anatomical_part]),
        ("extraProperties", samp_extra_props),
    ])

    # DNA extracted from tissue sample
    return DatsObj("Material", [
        ("name", "DNA from " + sample_id),
        ("description", "DNA extracted from " + specimen_descr),
        ("taxonomy", human_taxon),
        ("roles", [OrderedDict([("value", "DNA extract"), ("valueIRI", "")])]),
        ("derivesFrom", [specimen]),
    ])
예제 #5
0
def update_single_subject(cache, study, study_md, subj, subj_var_values,
                          use_all_dbgap_vars):
    """Update an existing subject in place from its dbGaP variable values.

    Sets the subject's ``alternateIdentifiers``, ``bearerOfDisease`` and
    ``characteristics``. Of the existing characteristics only those whose
    name is the plain string ``'member of study group'`` are kept; the
    harmonized (and optionally raw) dbGaP characteristics are appended.

    Args:
        cache: not used by this function.
        study: not used by this function.
        study_md: not used by this function.
        subj: subject object exposing ``get`` and ``set``.
        subj_var_values: mapping of dbGaP subject variable name to a dict
            with a ``'value'`` entry and, optionally, a ``'var'`` entry whose
            ``'dim'`` is the corresponding Dataset Dimension. Must contain
            ``dbGaP_Subject_ID``.
        use_all_dbgap_vars: when true, also add one "raw" Dimension per
            dbGaP subject variable.

    Returns:
        None.
    """
    # upper-cased dbGaP variable names feeding each harmonized attribute
    sex_vars = ("GENDER", "SEX")
    # TODO - need to confirm that these all mean the same thing
    age_vars = ("VISIT_AGE", "AGE", "AGE_ENROLL")
    hypertension_vars = ("HYPERTENSION", "MHHTN")

    # extract subject attributes
    gender = None
    subject_age = None
    year_of_visit = None
    systolic = None
    diastolic = None
    hypertension_status = "unknown"

    for var_name in subj_var_values:
        key = var_name.upper()
        if key in sex_vars:
            gender = subj_var_values[var_name]['value'].lower()
        elif key in age_vars:
            subject_age = subj_var_values[var_name]['value']
        elif key == "VISIT_YEAR":
            year_of_visit = subj_var_values[var_name]['value']
        elif key == "SYSBP":
            systolic = subj_var_values[var_name]['value']
        elif key == "DIASBP":
            diastolic = subj_var_values[var_name]['value']
        elif key in hypertension_vars:
            answer = subj_var_values[var_name]['value']
            if answer.lower() == "yes" or answer == '1':
                hypertension_status = "yes"
            # anything else starting with a non-whitespace character is "no"
            elif re.match(r'\S', answer):
                hypertension_status = "no"

    # build one harmonized/standardized Dimension
    def harmonized_dimension(label, description, value):
        annotation = DatsObj("Annotation", [("value", label)])
        return DatsObj("Dimension", [
            ("name", annotation),
            ("description", description),
            ("values", [value]),
        ])

    harmonized_chars = []

    if gender is not None:
        # TODO - this
        # NOTE(review): only the codes "1" and "2" are translated; any other
        # value yields a Gender Dimension whose single value is None.
        sex_label = {"1": "male", "2": "female"}.get(gender)
        harmonized_chars.append(
            harmonized_dimension("Gender", "Gender of the subject",
                                 sex_label))

    # remaining harmonized characteristics, in output order; attributes that
    # were not found are left out
    optional_attributes = (
        ("Age", "Age of the subject", subject_age),
        ("Visit year", "Year of visit, to use for longitudinal analysis",
         year_of_visit),
        ("Systolic blood pressure",
         "Systolic blood pressure of subject, measured in mmHg", systolic),
        ("Diastolic blood pressure",
         "Diastolic blood pressure of subject, measured in mmHg", diastolic),
    )
    for label, description, value in optional_attributes:
        if value is not None:
            harmonized_chars.append(
                harmonized_dimension(label, description, value))

    diseases = []
    if hypertension_status != "unknown":
        doid = "10763"
        status_annotation = DatsObj("Annotation",
                                    [("value", hypertension_status)])
        diseases.append(DatsObj("Disease", [
            ("name", "Hypertension"),
            ("identifier", OrderedDict([
                ("identifier", "DOID:" + doid),
                ("identifierSource", "Disease Ontology"),
            ])),
            ("alternateIdentifiers", [OrderedDict([
                ("identifier", "http://purl.obolibrary.org/obo/DOID_" + doid),
                ("identifierSource", "Disease Ontology"),
            ])]),
            ("diseaseStatus", status_annotation),
        ]))

    # create a DATS Dimension from a dbGaP variable value
    def raw_dimension(var_name, var_value):
        annotation = DatsObj("Annotation", [("value", var_name)])
        dimension = DatsObj("Dimension", [
            ("name", annotation),
            ("values", [var_value["value"]]),
        ])
        # find existing DATS identifier for the corresponding Dataset Dimension
        if "var" in var_value:
            dataset_dim = var_value["var"]["dim"]
            dimension.setProperty("identifier",
                                  dataset_dim.get("identifier").getIdRef())
        return dimension

    # add "raw" characteristics / DATS Dimensions for _all_ dbGaP subject metadata
    if use_all_dbgap_vars:
        for var_name in sorted(subj_var_values):
            harmonized_chars.append(
                raw_dimension(var_name, subj_var_values[var_name]))

    # update subject
    dbgap_subject_id = subj_var_values['dbGaP_Subject_ID']['value']
    subj.set("alternateIdentifiers",
             [util.get_alt_id(dbgap_subject_id, "dbGaP")])
    subj.set("bearerOfDisease", diseases)

    # TODO - 'member of study group' is string, not Annotation
    def is_study_group_membership(characteristic):
        char_name = characteristic.get("name")
        if isinstance(char_name, DatsObj):
            return False
        return char_name == 'member of study group'

    # update characteristics: drop values from the public data release that
    # will be overwritten, keeping only the study-group membership entries
    updated_chars = [
        existing for existing in subj.get("characteristics")
        if is_study_group_membership(existing)
    ]
    updated_chars.extend(harmonized_chars)
    subj.set("characteristics", updated_chars)
예제 #6
0
def _gene_publication(identifier, source):
    """Return a DATS Publication wrapping a single identifier/source pair."""
    # NOTE(review): the property key is capitalised ("Identifier") whereas the
    # MolecularEntity objects built in get_gene_json use "identifier". Kept
    # unchanged to preserve the emitted JSON - confirm against the DATS schema.
    return DatsObj("Publication", [
        ("Identifier",
         DatsObj("Identifier", [("identifier", identifier),
                                ("identifierSource", source)]))
    ])


def _gene_date_assigned(date):
    """Return a DATS Date of type 'Date Assigned' for the given date value."""
    return DatsObj("Date", [
        ("date", date),
        ("type", DatsObj("Annotation", [("value", "Date Assigned")])),
    ])


def _gene_disease_relations(gene_id, diseases):
    """Return the disease relatedEntities entries for the gene `gene_id`."""
    related = []
    gene_diseases = search_dict('object_id', gene_id, diseases)
    if not gene_diseases:
        return related

    # de-duplicate disease ids in first-seen order so output is reproducible
    # (a plain set() yields a hash-dependent order for strings)
    uniq_do_ids = OrderedDict.fromkeys(rec['do_id'] for rec in gene_diseases)

    for do_id in uniq_do_ids:
        disease_id = DatsObj(
            "Annotation",
            [("value", do_id),
             ("valueIRI",
              "http://purl.obolibrary.org/obo/DOID_" + do_id[5:])])

        select_diseases = search_dict('do_id', do_id, gene_diseases)

        relation = DatsObj(
            "Annotation",
            [("value", "Disease"),
             ("valueIRI", "http://purl.obolibrary.org/obo/DOID_4")])

        # a record may carry several evidence codes
        # NOTE(review): only the first matching record's evidence codes are
        # used, as in the original code - confirm this is intended.
        evd_ids = [
            DatsObj("Annotation",
                    [("value", code),
                     ("valueIRI",
                      "http://purl.obolibrary.org/obo/" + EVID[code])])
            for code in select_diseases[0]['evidence_codes']
        ]

        # account for multiple publications per disease id
        # NOTE(review): unlike the phenotype path, empty PubMed ids are not
        # skipped here; behaviour kept unchanged.
        pub_ids = [
            _gene_publication(rec['pubmed_id'], "PubMed")
            for rec in select_diseases
        ]
        # MOD-specific publication ids; the source is the 3-character prefix
        pub_ids.extend(
            _gene_publication(rec['mod_pub_id'], rec['mod_pub_id'][:3])
            for rec in select_diseases)

        relation_evidence = OrderedDict([
            ("evidenceCodes", evd_ids),
            ("publications", pub_ids),
            ("dateEstablished",
             _gene_date_assigned(select_diseases[0]['date_ass'])),
        ])

        related.append(OrderedDict([
            ("object", disease_id),
            ("relation", relation),
            ("resultingFrom",
             DatsObj("Activity",
                     [("name", select_diseases[0]['association_type'])])),
            ("relationEvidence", relation_evidence),
        ]))

    return related


def _gene_phenotype_relations(gene_id, phenotypes):
    """Return the phenotype relatedEntities entries for the gene `gene_id`."""
    # assumes one phenotype termID per record as is in RGD and MGI phenotype JSONs
    related = []
    gene_phenotypes = search_dict('object_id', gene_id, phenotypes)
    if not gene_phenotypes:
        return related

    # de-duplicate term ids in first-seen order so output is reproducible
    uniq_phe_ids = OrderedDict.fromkeys(
        rec['phe_term_ids'] for rec in gene_phenotypes)

    for phe_id in uniq_phe_ids:
        select_phenotypes = search_dict('phe_term_ids', phe_id,
                                        gene_phenotypes)
        first_term = select_phenotypes[0]['phe_term_ids']

        term_id = DatsObj(
            "Annotation",
            [("value", first_term),
             ("valueIRI",
              "http://purl.obolibrary.org/obo/MP_" + first_term[3:])])

        relation = DatsObj("Annotation", [
            ("value", "Phenotype"),
            ("valueIRI", "http://purl.obolibrary.org/obo/OGMS_0000023")
        ])

        # account for multiple publications per phenotype; records without a
        # PubMed id carry an empty string and are skipped
        pub_ids = [
            _gene_publication(rec['pubmed_id'], "PubMed")
            for rec in select_phenotypes
            if rec['pubmed_id'] != ""
        ]
        # MOD-specific publication ids; the source is the 3-character prefix
        pub_ids.extend(
            _gene_publication(rec['mod_pub_id'], rec['mod_pub_id'][:3])
            for rec in select_phenotypes)

        relation_evidence = OrderedDict([
            ("publications", pub_ids),
            ("dateEstablished",
             _gene_date_assigned(select_phenotypes[0]['date_ass'])),
        ])

        related.append(OrderedDict([
            ("object", term_id),
            ("relation", relation),
            ("relationEvidence", relation_evidence),
        ]))

    return related


def _gene_ortholog_relations(cache, gene_id, orthologs):
    """Return the ortholog relatedEntities entries for the gene `gene_id`."""
    related = []

    for ortho in search_dict('mod_gene_id', gene_id, orthologs):
        # only human (NCBI taxon 9606) orthologs are supported
        if '9606' not in ortho['ortho_taxon']:
            logging.fatal("encountered taxonomy other than human - %s",
                          ortho['ortho_taxon'])
            sys.exit(1)
        o_taxon = util.get_taxon_human(cache)

        mol_entity_ortholog = DatsObj("MolecularEntity", [
            ("identifier",
             DatsObj("Identifier",
                     [("identifier", ortho['ortho_gene_id'])])),
            ("name", ortho['ortho_gene_id']),
            ("taxonomy", [o_taxon]),
            # alternateIdentifiers is a list everywhere else in this file
            # (see the gene entity below), so wrap the single alt id too
            ("alternateIdentifiers",
             [util.get_alt_id(ortho['ortho_gene_symbol'], "Gene Symbol")]),
        ])

        related.append(OrderedDict([
            ("object", mol_entity_ortholog),
            ("relation",
             DatsObj("Annotation",
                     [("value", "Orthology"),
                      ("valueIRI",
                       "http://purl.obolibrary.org/obo/HOM_0000017")]))
        ]))

    return related


def get_gene_json(cache, mod, gff3_json_path, orthologs):
    """Build DATS MolecularEntity objects for the gene features of one MOD.

    Gene features, disease records and phenotype records are loaded through
    read_bgi, read_disease and read_phenotype using the same
    (cache, mod, gff3_json_path) arguments. Each feature becomes one
    MolecularEntity whose relatedEntities list holds, in this order, its
    disease, phenotype and ortholog relations.

    Args:
        cache: passed through to the reader functions and to
            util.get_taxon_human.
        mod: passed through to the reader functions.
        gff3_json_path: passed through to the reader functions.
        orthologs: ortholog records, searched by 'mod_gene_id' for each
            feature's primaryId.

    Returns:
        A list of DatsObj "MolecularEntity" instances, one per feature.

    Raises:
        SystemExit: if an ortholog record's 'ortho_taxon' does not contain
            '9606' (human).
    """
    # read gene features from BGI file
    features = read_bgi(cache, mod, gff3_json_path)

    # read disease info from disease JSON file
    diseases = read_disease(cache, mod, gff3_json_path)

    # read phenotype info from phenotype JSON file
    phenotypes = read_phenotype(cache, mod, gff3_json_path)

    # TODO - read gene features from GFF3 file

    genes = []

    for f in features:
        genome_location = DatsObj("GenomeLocation",
                                  [("assembly", f['assembly']),
                                   ("chromosome", f['chr']),
                                   ("startPosition", f['start']),
                                   ("endPosition", f['end']),
                                   ("strand", f['strand'])])

        # f['soid'] has the form "SO:nnnnnnn"; strip the "SO:" prefix for the IRI
        roles = [
            DatsObj("Annotation",
                    [("value", SOID[f['soid']]),
                     ("valueIRI",
                      "http://purl.obolibrary.org/obo/SO_" + f['soid'][3:])])
        ]

        # "NA" marks a feature without alternate ids; otherwise each entry is
        # "<source>:<id>". Split on the first colon only so that ids which
        # themselves contain a colon do not raise.
        alternate_ids = []
        if f['alt_ids'] != "NA":
            for alt in f['alt_ids']:
                source, local_id = alt.split(':', 1)
                alternate_ids.append(util.get_alt_id(local_id, source))

        gene_id = f['primaryId']
        related_entities = (
            _gene_disease_relations(gene_id, diseases) +
            _gene_phenotype_relations(gene_id, phenotypes) +
            _gene_ortholog_relations(cache, gene_id, orthologs))

        gene = DatsObj(
            "MolecularEntity",
            [("identifier",
              DatsObj("Identifier", [("identifier", gene_id)])),
             ("name", gene_id), ("description", f['descr']),
             ("roles", roles), ("taxonomy", [f['taxon']]),
             ("genomeLocations", [genome_location]),
             ("alternateIdentifiers", alternate_ids),
             ("relatedEntities", related_entities)])
        genes.append(gene)

    return genes