Example #1
0
    def _process_nlx_157874_1_view(self, raw, limit=None):
        """
        Load the Elements of Morphology (EOM) data that has been
        screen-scraped into DISCO.

        ``raw`` is read as a tab-delimited file: the first line is skipped
        as a header and every following row must have exactly 20 columns
        (a row of any other width raises ``ValueError`` on unpacking).

        Since it is bad form to have two definitions, the objective and
        subjective definitions are each terminated with a period and then
        concatenated into one string.

        Calls made per row, keyed on the morphology term id:
            model.addClassToGraph(term id, term label)
            model.addDefinition(term id, "objective def.  subjective def.")
            model.addDepiction(term id, small figure url)   (if non-empty)
            model.addDepiction(term id, large figure url)   (if non-empty)
            model.addComment(term id, comments)             (if non-empty)
            model.addSynonym(term id, synonym, hasExactSynonym)
                for each ';'-separated entry of the synonyms column
            model.addSynonym(term id, synonym, hasRelatedSynonym)
                for each ';'-separated entry of the replaces column,
                unless that column is identical to the synonyms column
            reference.addPage(term id, term url)

        :param raw: path of the tab-delimited file to read
        :param limit: optional row limit.  The check runs after a row has
            been processed and uses ``>``, so up to ``limit + 1`` rows are
            processed.
        :return: None
        """

        def _as_sentence(text):
            # Strip first, then test for the period: testing the raw text
            # doubled the period on values with trailing whitespace or
            # embedded newlines, and turned blank values into '.'.
            text = text.strip()
            if text and not text.endswith('.'):
                text += '.'
            return text

        model = Model(self.graph)
        exact_synonym = model.annotation_properties['hasExactSynonym']
        related_synonym = model.annotation_properties['hasRelatedSynonym']

        with open(raw, 'r') as f1:
            f1.readline()  # read the header row; skip
            filereader = csv.reader(f1, delimiter='\t', quotechar='\"')
            for line_counter, line in enumerate(filereader, start=1):
                # e_uid, v_uid, v_uuid, v_last_modified, v_status,
                # v_lastmodified_epoch, morphology_term_num, subcategory
                # and the terminology_category_* columns are currently
                # unused; they are unpacked only to pin the row layout.
                (morphology_term_id, morphology_term_num,
                 morphology_term_label, morphology_term_url,
                 terminology_category_label, terminology_category_url,
                 subcategory, objective_definition, subjective_definition,
                 comments, synonyms, replaces, small_figure_url,
                 large_figure_url, e_uid, v_uid, v_uuid,
                 v_last_modified, v_status, v_lastmodified_epoch) = line

                # Add morphology term to graph as a class with its label.
                model.addClassToGraph(morphology_term_id,
                                      morphology_term_label)

                # Assemble the description text from whichever of the two
                # definitions are present.
                definition = '  '.join(
                    part for part in (
                        _as_sentence(objective_definition),
                        _as_sentence(subjective_definition))
                    if part)
                model.addDefinition(morphology_term_id, definition)

                # Both the small and the large image are attached.
                for figure_url in (small_figure_url, large_figure_url):
                    if figure_url != '':
                        model.addDepiction(morphology_term_id, figure_url)

                if comments != '':
                    model.addComment(morphology_term_id, comments.strip())

                if synonyms != '':
                    for syn in synonyms.split(';'):
                        syn = syn.strip()
                        if syn:  # ignore blanks left by stray ';'
                            model.addSynonym(
                                morphology_term_id, syn, exact_synonym)

                # "replaces" entries are related synonyms, unless the
                # column merely repeats the synonyms column.
                if replaces != '' and replaces != synonyms:
                    for syn in replaces.split(';'):
                        syn = syn.strip()
                        if syn:
                            model.addSynonym(
                                morphology_term_id, syn, related_synonym)

                # morphology_term_id has page morphology_term_url
                reference = Reference(self.graph)
                reference.addPage(morphology_term_id, morphology_term_url)

                if limit is not None and line_counter > limit:
                    break
Example #2
0
    def process_omia_phenotypes(self, limit):
        """
        Add OMIA disease-to-phenotype associations from every ``*.txt``
        file found in ``<rawdir>/OMIA-disease-phenotype``.

        Each file is read as tab-delimited text; its first row is skipped as
        a header and rows that do not have exactly 22 columns are logged and
        skipped.  Rows without a phenotype id are counted and reported per
        file once the file has been read.

        For every usable row a ``D2PAssoc`` is created between
        ``OMIA:<6-digit disease number>[-<species id>]`` and the phenotype
        id.  Its sources are the listed PubMed ids or, when there are none,
        an OMIA url built from the disease number and species id.  The
        phenotype description, breed and assay are added as descriptions of
        the association, and the curator notes as a comment.

        :param limit: accepted but not used by this method
        :return: None
        """
        g = self.testgraph if self.testMode else self.graph
        model = Model(g)

        logger.info(
            "Processing Monarch OMIA Animal disease-phenotype associations")

        # get file listing
        mypath = '/'.join((self.rawdir, 'OMIA-disease-phenotype'))
        # endswith() requires a literal '.txt' suffix; the former regex
        # r'.txt$' let the dot match any character.
        file_list = [
            f for f in listdir(mypath)
            if isfile(join(mypath, f)) and f.endswith('.txt')]

        for f in file_list:
            logger.info("Processing %s", f)
            line_counter = 0
            count_missing = 0
            bad_rows = []
            fname = '/'.join((mypath, f))
            with open(fname, 'r') as csvfile:
                filereader = csv.reader(
                    csvfile, delimiter='\t', quotechar='\"')
                for row in filereader:
                    line_counter += 1
                    if line_counter <= 1:
                        continue  # skip header
                    if len(row) != 22:
                        logger.info("Not enough cols (%d) in %s - please fix",
                                    len(row), f)
                        continue
                    (disease_num, species_id, breed_name, variant, inheritance,
                     phenotype_id, phenotype_name, entity_id, entity_name,
                     quality_id, quality_name, related_entity_id,
                     related_entity_name, abnormal_id, abnormal_name,
                     phenotype_description, assay, frequency, pubmed_id,
                     pub_description, curator_notes, date_created) = row

                    if phenotype_id == '':
                        count_missing += 1
                        bad_rows.append(row)
                        continue

                    # Strip before padding so surrounding whitespace is not
                    # counted towards the six digits.
                    disease_num = str(disease_num).strip().zfill(6)
                    species_id = species_id.strip()
                    disease_id = 'OMIA:' + disease_num
                    if species_id != '':
                        disease_id = '-'.join((disease_id, species_id))

                    assoc = D2PAssoc(g, self.name, disease_id, phenotype_id)

                    # Drop blank pieces (e.g. from a trailing separator) so
                    # that no bare 'PMID:' source is recorded.
                    pmids = [
                        p.strip() for p in re.split(r'[,;]', pubmed_id)
                        if p.strip()]
                    if pmids:
                        for p in pmids:
                            assoc.add_source('PMID:' + p)
                    else:
                        assoc.add_source(
                            '/'.join(('http://omia.angis.org.au/OMIA' +
                                      disease_num, species_id)))
                    assoc.add_association_to_graph()
                    aid = assoc.get_association_id()

                    if phenotype_description != '':
                        model.addDescription(aid, phenotype_description)
                    if breed_name != '':
                        model.addDescription(
                            aid, breed_name.strip() + ' [observed in]')
                    if assay != '':
                        model.addDescription(aid, assay.strip() + ' [assay]')
                    if curator_notes != '':
                        model.addComment(aid, curator_notes.strip())

                    # Entity/quality annotations are only reported,
                    # not added to the graph.
                    if entity_id != '' or quality_id != '':
                        logger.info("EQ not empty for %s: %s + %s", disease_id,
                                    entity_name, quality_name)
            if count_missing > 0:
                logger.warning(
                    "You are missing %d/%d D2P annotations from id %s",
                    count_missing, line_counter - 1, f)
                logger.warning(
                    "Bad rows:\n%s", "\n".join(str(r) for r in bad_rows))
            # finish loop through all files
Example #3
0
    def _add_g2p_assoc(self, g, strain_id, sex, assay_id, phenotypes, comment):
        """
        Create an association between a sex-specific strain id
        and each of the phenotypes.
        Here, we create a genotype from the strain,
        and a sex-specific genotype.
        Both genotype ids are derived from the strain id (colon removed)
        and prefixed with '_'.

        The evidence code is hardcoded to be:
            ECO:experimental_phenotypic_evidence.

        When the strain has no entry in ``self.idlabel_hash`` the strain id
        itself is used in both genotype labels.

        :param g: graph the genotypes and associations are added to
        :param strain_id: identifier of the strain
        :param sex: 'm' or 'f' select the male/female genotype type;
            any other value yields a generic sex-qualified genotype
        :param assay_id: added as evidence on every association
        :param phenotypes: a list of phenotypes to associate with the
            strain; ``None`` means no associations are created
        :param comment: comment added to every association
        :return: None

        """
        geno = Genotype(g)
        model = Model(g)
        eco_id = "ECO:0000059"  # experimental_phenotypic_evidence
        strain_label = self.idlabel_hash.get(strain_id)

        strain_key = strain_id.replace(':', '')
        genotype_id = '_' + '-'.join((strain_key, 'genotype'))
        sex_specific_genotype_id = '_' + '-'.join(
            (strain_key, sex, 'genotype'))

        if strain_label is not None:
            genotype_label = '[' + strain_label + ']'
            sex_specific_genotype_label = strain_label + ' (' + sex + ')'
        else:
            # The label lookup can miss; concatenating None used to raise
            # TypeError here, so fall back to the strain id.
            genotype_label = '[' + strain_id + ']'
            sex_specific_genotype_label = strain_id + '(' + sex + ')'

        if sex == 'm':
            genotype_type = Genotype.genoparts['male_genotype']
        elif sex == 'f':
            genotype_type = Genotype.genoparts['female_genotype']
        else:
            genotype_type = Genotype.genoparts['sex_qualified_genotype']

        # add the genotype to strain connection
        geno.addGenotype(
            genotype_id, genotype_label,
            Genotype.genoparts['genomic_background'])
        g.addTriple(
            strain_id, Genotype.object_properties['has_genotype'], genotype_id)

        geno.addGenotype(
            sex_specific_genotype_id, sex_specific_genotype_label,
            genotype_type)

        # link the sex-specific genotype to its sex-agnostic part
        g.addTriple(
            sex_specific_genotype_id,
            Genotype.object_properties['has_sex_agnostic_genotype_part'],
            genotype_id)

        # #############    BUILD THE G2P ASSOC    #############
        # TODO add more provenance info when that model is completed

        if phenotypes is None:
            return

        for phenotype_id in phenotypes:
            assoc = G2PAssoc(
                g, self.name, sex_specific_genotype_id, phenotype_id)
            assoc.add_evidence(assay_id)
            assoc.add_evidence(eco_id)
            assoc.add_association_to_graph()
            model.addComment(assoc.get_association_id(), comment)
Example #4
0
    def process_omia_phenotypes(self, limit):
        """
        Add OMIA disease-to-phenotype associations from every ``*.txt``
        file found in ``<rawdir>/OMIA-disease-phenotype``.

        The expected header is ``self.files['omia_d2p']['columns']``; each
        file's first row must equal it exactly.  Data rows whose width
        differs from the header are logged and skipped, and rows without a
        phenotype id are counted and reported per file.

        For every usable row a ``D2PAssoc`` is created between
        ``OMIA:<6-digit disease number>[-<species id>]`` and the phenotype
        id.  Its sources are the PubMed ids (digits only) or, when there
        are none, a url built from ``self.curie_map['OMIA']``, the disease
        number and the species id.  Description, breed and assay are added
        as descriptions of the association, curator notes as a comment.

        :param limit: accepted but not used by this method
        :return: None
        :raises AssertionError: when a file's header differs from the
            configured columns
        :raises ValueError: when the configured columns lack a required name
        """
        graph = self.testgraph if self.test_mode else self.graph
        model = Model(graph)

        LOG.info("Processing Monarch OMIA Animal disease-phenotype associations")

        src_key = 'omia_d2p'

        # get file listing
        mypath = '/'.join((self.rawdir, 'OMIA-disease-phenotype'))
        # endswith() requires a literal '.txt' suffix; the former regex
        # r'.txt$' let the dot match any character.
        file_list = [
            f for f in listdir(mypath)
            if isfile(join(mypath, f)) and f.endswith('.txt')]

        col = self.files[src_key]['columns']
        # Resolve the column positions once instead of once per row.
        # Columns not listed here (Variant, Inheritance, Phenotype Name,
        # Related Entity ID/Name, Abnormal ID/Name, Phenotype Desc,
        # Frequency, Date Created) are not read.
        idx = {
            name: col.index(name) for name in (
                'Disease ID', 'Species ID', 'Breed Name', 'Phenotype ID',
                'Entity ID', 'Entity Name', 'Quality ID', 'Quality Name',
                'Assay', 'Pubmed ID', 'Pub Desc', 'Curator Notes')}

        for filename in file_list:
            LOG.info("Processing %s", filename)
            count_missing = 0
            bad_rows = []
            fname = '/'.join((mypath, filename))
            with open(fname, 'r') as csvfile:
                filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
                fileheader = next(filereader)
                if fileheader != col:
                    LOG.error('Expected  %s to have columns: %s', fname, col)
                    LOG.error('But Found %s to have columns: %s', fname, fileheader)
                    raise AssertionError('Incomming data headers have changed.')

                for row in filereader:
                    if len(row) != len(col):
                        LOG.info(
                            "Not enough cols %d in %s - please fix", len(row), filename)
                        continue

                    disease_num = row[idx['Disease ID']].strip()
                    species_id = row[idx['Species ID']].strip()
                    breed_name = row[idx['Breed Name']].strip()
                    phenotype_id = row[idx['Phenotype ID']].strip()
                    entity_id = row[idx['Entity ID']].strip()
                    entity_name = row[idx['Entity Name']]
                    quality_id = row[idx['Quality ID']].strip()
                    quality_name = row[idx['Quality Name']]
                    assay = row[idx['Assay']].strip()
                    pubmed_id = row[idx['Pubmed ID']].strip()
                    # NOTE(review): the description is taken from 'Pub Desc',
                    # not 'Phenotype Desc' -- confirm this is intended.
                    phenotype_description = row[idx['Pub Desc']].strip()
                    curator_notes = row[idx['Curator Notes']].strip()

                    if phenotype_id == '':
                        count_missing += 1
                        bad_rows.append(row)
                        continue

                    disease_num = disease_num.zfill(6)
                    disease_id = 'OMIA:' + disease_num
                    if species_id != '':
                        disease_id = '-'.join((disease_id, species_id))
                    assoc = D2PAssoc(graph, self.name, disease_id, phenotype_id)

                    # Keep only the digits of each listed id and drop pieces
                    # with none, so no bare 'PMID:' source is recorded.
                    pmids = [
                        re.sub(r'[^0-9]', '', pnum)
                        for pnum in re.split(r'[,;]', pubmed_id)]
                    pmids = [pnum for pnum in pmids if pnum]
                    if pmids:
                        for pnum in pmids:
                            assoc.add_source('PMID:' + pnum)
                    else:
                        assoc.add_source(
                            '/'.join((
                                self.curie_map['OMIA'] + disease_num, species_id)))
                    assoc.add_association_to_graph()
                    aid = assoc.get_association_id()

                    if phenotype_description != '':
                        model.addDescription(aid, phenotype_description)
                    if breed_name != '':
                        model.addDescription(aid, breed_name + ' [observed in]')
                    if assay != '':
                        model.addDescription(aid, assay + ' [assay]')
                    if curator_notes != '':
                        model.addComment(aid, curator_notes)

                    # Entity/quality annotations are only reported,
                    # not added to the graph.
                    if entity_id != '' or quality_id != '':
                        LOG.info(
                            "EQ not empty for %s: %s + %s",
                            disease_id, entity_name, quality_name)
            if count_missing > 0:
                LOG.warning(
                    "We are missing %d of %d D2P annotations from id %s",
                    count_missing, filereader.line_num-1, filename)
                LOG.warning("Bad rows:\n%s", '\n'.join([str(x) for x in bad_rows]))
            # finish loop through all files
Example #5
0
class ModelTestCase(unittest.TestCase):
    """Tests for ``Model.addIndividualToGraph`` and ``Model.addComment``."""

    def setUp(self):
        """Create a fresh model and the URIs the tests query for."""
        self.model = Model(RDFGraph())
        self.cutil = CurieUtil(curie_map.get())

        # stuff to make test triples
        self.test_cat_subj_curie = "MGI:1234"
        self.test_cat_subj = self.cutil.get_uri("MGI:1234")
        self.test_cat_default_pred = self.cutil.get_uri("biolink:category")
        self.test_named_indiv = self.cutil.get_uri("owl:NamedIndividual")
        self.test_label_pred = self.cutil.get_uri("rdfs:label")
        self.test_label = "some label"

        self.test_comment_IRI = self.cutil.get_uri("dcterms:comment")
        self.test_comment = 'bonus eruptus'

    def tearDown(self):
        # Release the fixture created in setUp (setUp never assigns
        # self.graph, which is what used to be cleared here).
        self.model = None

    def _triples(self, subj=None, pred=None, obj=None):
        """Return the model graph's triples matching the pattern as a list.

        ``None`` in any position is passed through unchanged to
        ``graph.triples`` (the tests use it as a wildcard).
        """
        return list(self.model.graph.triples((subj, pred, obj)))

    def test_addIndividualToGraph_assign_label(self):
        self.model.addIndividualToGraph(
            self.test_cat_subj_curie, self.test_label)

        label_triple = self._triples(
            URIRef(self.test_cat_subj), URIRef(self.test_label_pred))

        self.assertEqual(len(label_triple), 1, "method didn't assign label")
        self.assertEqual(str(label_triple[0][2]), self.test_label,
                         "method didn't assign correct label")

    def test_addIndividualToGraph_assign_type_named_individual(self):
        self.model.addIndividualToGraph(
            self.test_cat_subj_curie, self.test_label)

        triples = self._triples(
            URIRef(self.test_cat_subj), None, URIRef(self.test_named_indiv))

        self.assertEqual(len(triples), 1,
                         "method didn't assign type as named individual")

    def test_addIndividualToGraph_assign_category(self):
        self.model.addIndividualToGraph(self.test_cat_subj_curie,
                                        self.test_label,
                                        ind_category=blv.terms['Genotype'])

        triples = self._triples(
            URIRef(self.test_cat_subj), URIRef(self.test_cat_default_pred))

        self.assertEqual(len(triples), 1, "method didn't assign category")

    def test_add_comment(self):
        self.model.addComment(self.test_cat_subj, self.test_comment)

        triples = self._triples(
            URIRef(self.test_cat_subj), URIRef(self.test_comment_IRI),
            Literal(self.test_comment))

        self.assertEqual(len(triples), 1, "method didn't assign comment")

    def test_add_comment_assign_subject_category(self):
        self.model.addComment(self.test_cat_subj,
                              self.test_comment,
                              subject_category=blv.terms['Genotype'])

        triples = self._triples(
            URIRef(self.test_cat_subj), URIRef(self.test_cat_default_pred))

        self.assertEqual(len(triples), 1, "method didn't assign category")
Example #6
0
    def process_omia_phenotypes(self, limit):
        """
        Add OMIA disease-to-phenotype associations from every ``*.txt``
        file found in ``<rawdir>/OMIA-disease-phenotype``.

        Each file is read as tab-delimited text with a header row.  Data
        rows that do not have exactly 22 columns, or whose width differs
        from the header's, are logged and skipped.  Rows without a
        phenotype id are counted and reported per file.  A completely empty
        file is logged and skipped.

        For every usable row a ``D2PAssoc`` is created between
        ``OMIA:<6-digit disease number>[-<species id>]`` and the phenotype
        id.  Its sources are the listed PubMed ids or, when there are none,
        an OMIA url built from the disease number and species id.  The
        phenotype description, breed and assay are added as descriptions of
        the association, and the curator notes as a comment.

        :param limit: accepted but not used by this method
        :return: None
        """
        graph = self.testgraph if self.test_mode else self.graph
        model = Model(graph)

        LOG.info(
            "Processing Monarch OMIA Animal disease-phenotype associations")

        # get file listing
        mypath = '/'.join((self.rawdir, 'OMIA-disease-phenotype'))
        # endswith() requires a literal '.txt' suffix; the former regex
        # r'.txt$' let the dot match any character.
        file_list = [
            f for f in listdir(mypath)
            if isfile(join(mypath, f)) and f.endswith('.txt')
        ]

        for filename in file_list:
            LOG.info("Processing %s", filename)
            count_missing = 0
            bad_rows = []
            fname = '/'.join((mypath, filename))
            with open(fname, 'r') as csvfile:
                filereader = csv.reader(csvfile,
                                        delimiter='\t',
                                        quotechar='\"')
                # An empty file has no header; skip it rather than letting
                # StopIteration abort the remaining files.
                header = next(filereader, None)
                if header is None:
                    LOG.warning("No header row in %s - skipping", filename)
                    continue

                for row in filereader:
                    if len(row) != 22 or len(row) != len(header):
                        LOG.info("Not enough cols %d in %s - please fix",
                                 len(row), filename)
                        continue
                    (disease_num, species_id, breed_name, variant, inheritance,
                     phenotype_id, phenotype_name, entity_id, entity_name,
                     quality_id, quality_name, related_entity_id,
                     related_entity_name, abnormal_id, abnormal_name,
                     phenotype_description, assay, frequency, pubmed_id,
                     pub_description, curator_notes, date_created) = row

                    if phenotype_id == '':
                        count_missing += 1
                        bad_rows.append(row)
                        continue

                    # Strip before padding so surrounding whitespace is not
                    # counted towards the six digits.
                    disease_num = str(disease_num).strip().zfill(6)
                    species_id = species_id.strip()
                    disease_id = 'OMIA:' + disease_num
                    if species_id != '':
                        disease_id = '-'.join((disease_id, species_id))
                    assoc = D2PAssoc(graph, self.name, disease_id,
                                     phenotype_id)

                    # Drop blank pieces (e.g. from a trailing separator) so
                    # that no bare 'PMID:' source is recorded.
                    pmids = [
                        p.strip() for p in re.split(r'[,;]', pubmed_id)
                        if p.strip()]
                    if pmids:
                        for p in pmids:
                            assoc.add_source('PMID:' + p)
                    else:
                        assoc.add_source('/'.join(
                            ('http://omia.angis.org.au/OMIA' + disease_num,
                             species_id)))
                    assoc.add_association_to_graph()
                    aid = assoc.get_association_id()

                    if phenotype_description != '':
                        model.addDescription(aid, phenotype_description)
                    if breed_name != '':
                        model.addDescription(
                            aid,
                            breed_name.strip() + ' [observed in]')
                    if assay != '':
                        model.addDescription(aid, assay.strip() + ' [assay]')
                    if curator_notes != '':
                        model.addComment(aid, curator_notes.strip())

                    # Entity/quality annotations are only reported,
                    # not added to the graph.
                    if entity_id != '' or quality_id != '':
                        LOG.info("EQ not empty for %s: %s + %s", disease_id,
                                 entity_name, quality_name)
            if count_missing > 0:
                LOG.warning(
                    "We are missing %d of %d D2P annotations from id %s",
                    count_missing, filereader.line_num - 1, filename)
                LOG.warning("Bad rows:\n%s",
                            '\n'.join([str(x) for x in bad_rows]))
            # finish loop through all files
Example #7
0
    def _add_g2p_assoc(self, g, strain_id, sex, assay_id, phenotypes, comment):
        """
        Create an association between a sex-specific strain id
        and each of the phenotypes.
        Here, we create a genotype from the strain,
        and a sex-specific genotype.
        Both genotype ids are derived from the strain id (colon removed)
        and prefixed with '_'.

        The evidence code is hardcoded to be:
            ECO:experimental_phenotypic_evidence.

        When the strain has no entry in ``self.idlabel_hash`` the strain id
        itself is used in both genotype labels.

        :param g: graph the genotypes and associations are added to
        :param strain_id: identifier of the strain
        :param sex: 'm' or 'f' select the male/female genotype type;
            any other value yields a generic sex-qualified genotype
        :param assay_id: added as evidence on every association
        :param phenotypes: a list of phenotypes to associate with the
            strain; ``None`` means no associations are created
        :param comment: comment added to every association
        :return: None

        """
        geno = Genotype(g)
        model = Model(g)
        eco_id = "ECO:0000059"  # experimental_phenotypic_evidence
        strain_label = self.idlabel_hash.get(strain_id)

        strain_key = strain_id.replace(':', '')
        genotype_id = '_' + '-'.join((strain_key, 'genotype'))
        sex_specific_genotype_id = '_' + '-'.join(
            (strain_key, sex, 'genotype'))

        if strain_label is not None:
            genotype_label = '[' + strain_label + ']'
            sex_specific_genotype_label = strain_label + ' (' + sex + ')'
        else:
            # The label lookup can miss; concatenating None used to raise
            # TypeError here, so fall back to the strain id.
            genotype_label = '[' + strain_id + ']'
            sex_specific_genotype_label = strain_id + '(' + sex + ')'

        if sex == 'm':
            genotype_type = Genotype.genoparts['male_genotype']
        elif sex == 'f':
            genotype_type = Genotype.genoparts['female_genotype']
        else:
            genotype_type = Genotype.genoparts['sex_qualified_genotype']

        # add the genotype to strain connection
        geno.addGenotype(
            genotype_id, genotype_label,
            Genotype.genoparts['genomic_background'])
        g.addTriple(
            strain_id, Genotype.object_properties['has_genotype'], genotype_id)

        geno.addGenotype(
            sex_specific_genotype_id, sex_specific_genotype_label,
            genotype_type)

        # link the sex-specific genotype to its sex-agnostic part
        g.addTriple(
            sex_specific_genotype_id,
            Genotype.object_properties['has_sex_agnostic_genotype_part'],
            genotype_id)

        # #############    BUILD THE G2P ASSOC    #############
        # TODO add more provenance info when that model is completed

        if phenotypes is None:
            return

        for phenotype_id in phenotypes:
            assoc = G2PAssoc(
                g, self.name, sex_specific_genotype_id, phenotype_id)
            assoc.add_evidence(assay_id)
            assoc.add_evidence(eco_id)
            assoc.add_association_to_graph()
            model.addComment(assoc.get_association_id(), comment)