Esempio n. 1
0
    def process_pub_xrefs(self, limit=None):

        raw = '/'.join((self.rawdir, self.files['pub_xrefs']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        model = Model(g)
        logger.info("Processing publication xrefs")
        line_counter = 0
        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (wb_ref, xref) = row
                # WBPaper00000009 pmid8805<BR>
                # WBPaper00000011 doi10.1139/z78-244<BR>
                # WBPaper00000012 cgc12<BR>

                if self.testMode and wb_ref not in self.test_ids['pub']:
                    continue

                ref_id = 'WormBase:' + wb_ref
                xref_id = None
                r = None
                xref = re.sub(r'<BR>', '', xref)
                xref = xref.strip()
                if re.match(r'pmid', xref):
                    xref_id = 'PMID:' + re.sub(r'pmid\s*', '', xref)
                    reference = Reference(
                        g, xref_id, Reference.ref_types['journal_article'])
                elif re.search(r'[\(\)\<\>\[\]\s]', xref):
                    continue
                elif re.match(r'doi', xref):
                    xref_id = 'DOI:' + re.sub(r'doi', '', xref.strip())
                    reference = Reference(g, xref_id)
                elif re.match(r'cgc', xref):
                    # TODO not sure what to do here with cgc xrefs
                    continue
                else:
                    # logger.debug("Other xrefs like %s", xref)
                    continue

                if xref_id is not None:
                    reference.addRefToGraph()
                    model.addSameIndividual(ref_id, xref_id)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return
Esempio n. 2
0
    def _get_pubs(self, entry, g):
        """
        Extract mentioned publications from the reference list
        :param entry:
        :return:
        """

        ref_to_pmid = {}
        entry_num = entry['mimNumber']
        model = Model(g)
        if 'referenceList' in entry:
            reflist = entry['referenceList']
            for r in reflist:
                if 'pubmedID' in r['reference']:
                    pub_id = 'PMID:' + str(r['reference']['pubmedID'])
                    ref = \
                        Reference(
                            g, pub_id,
                            Reference.ref_types['journal_article'])
                else:
                    # make blank node for internal reference
                    pub_id = \
                        '_:OMIM' + str(entry_num) + 'ref' + \
                        str(r['reference']['referenceNumber'])

                    ref = Reference(g, pub_id)
                    title = author_list = source = citation = None
                    if 'title' in r['reference']:
                        title = r['reference']['title']
                        ref.setTitle(title)
                    if 'authors' in r['reference']:
                        author_list = r['reference']['authors']
                        ref.setAuthorList(author_list)
                        citation = re.split(r'\.\,', author_list)[0] + ' et al'
                    if 'source' in r['reference']:
                        source = r['reference']['source']
                    citation = '; '.join(
                        list(filter(None.__ne__, [citation, title, source])))
                    ref.setShortCitation(citation)
                ref.addRefToGraph()
                ref_to_pmid[r['reference']['referenceNumber']] = pub_id

                # add is_about for the pub
                omim_id = 'OMIM:'+str(entry_num)
                g.addTriple(omim_id, model.object_properties['mentions'],
                            pub_id)

        return ref_to_pmid
Esempio n. 3
0
    def parse(self, limit=None):
        zfin_parser = ZFIN(self.graph_type, self.are_bnodes_skized)
        model = Model(self.graph)
        zp_file = '/'.join((self.rawdir, self.files['zpmap']['file']))
        g2p_file = '/'.join((self.rawdir, self.files['g2p_clean']['file']))
        zfin_parser.zp_map = zfin_parser._load_zp_mappings(zp_file)

        with open(g2p_file, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:

                (internal_id, symbol, gene_id, subterm1_id, subterm1_label,
                 pc_rel_id, pc_rel_label, superterm1_id, superterm1_label,
                 quality_id, quality_name, modifier, subterm2_id,
                 subterm2_label, pc_rel2_id, pc_rel2_id, superterm2_id,
                 superterm2_label, fish_id, fish_label, start_stage, end_stage,
                 environment, pub_id, figure_id, unknown_field) = row

                zp_id = zfin_parser._map_sextuple_to_phenotype(
                    superterm1_id, subterm1_id, quality_id, superterm2_id,
                    subterm2_id, modifier)

                gene_curie = "ZFIN:{0}".format(gene_id)
                model.makeLeader(gene_curie)
                pub_curie = "ZFIN:{0}".format(pub_id)
                if zp_id:
                    assoc = G2PAssoc(self.graph, self.name, gene_curie, zp_id)
                    if pub_id:
                        reference = Reference(self.graph, pub_curie,
                                              Reference.ref_types['document'])
                        reference.addRefToGraph()
                        assoc.add_source(pub_curie)

                    assoc.add_evidence('ECO:0000059')
                    assoc.add_association_to_graph()
Esempio n. 4
0
    def _parse_curated_chem_disease(self, limit):
        model = Model(self.graph)
        line_counter = 0
        file_path = '/'.join(
            (self.rawdir, self.static_files['publications']['file']))
        with open(file_path, 'r') as tsvfile:
            reader = csv.reader(tsvfile, delimiter="\t")
            for row in reader:
                # catch comment lines
                if re.match(r'^#', ' '.join(row)):
                    continue
                line_counter += 1
                self._check_list_len(row, 10)
                (pub_id, disease_label, disease_id, disease_cat, evidence,
                 chem_label, chem_id, cas_rn, gene_symbol, gene_acc) = row

                if disease_id.strip() == '' or chem_id.strip() == '':
                    continue

                rel_id = self.resolve(evidence)
                chem_id = 'MESH:' + chem_id
                model.addClassToGraph(chem_id, chem_label)
                model.addClassToGraph(disease_id, None)
                if pub_id != '':
                    pub_id = 'PMID:' + pub_id
                    r = Reference(pub_id, self.globaltt['journal article'])
                    r.addRefToGraph(self.graph)
                    pubids = [pub_id]
                else:
                    pubids = None
                self._make_association(chem_id, disease_id, rel_id, pubids)

                if not self.testMode and limit is not None and line_counter >= limit:
                    break
        return
Esempio n. 5
0
    def _make_association(self, subject_id, object_id, rel_id, pubmed_ids):
        """
        Make a reified association given an array of pubmed identifiers.

        Args:
            :param subject_id  id of the subject of the association (gene/chem)
            :param object_id  id of the object of the association (disease)
            :param rel_id  relationship id
            :param pubmed_ids an array of pubmed identifiers
        Returns:
            :return None

        """

        # TODO pass in the relevant Assoc class rather than relying on G2P
        assoc = G2PAssoc(self.g, self.name, subject_id, object_id, rel_id)
        if pubmed_ids is not None and len(pubmed_ids) > 0:
            eco = self._get_evidence_code('TAS')
            for pmid in pubmed_ids:
                r = Reference(
                    self.g, pmid, Reference.ref_types['journal_article'])
                r.addRefToGraph()
                assoc.add_source(pmid)
                assoc.add_evidence(eco)

        assoc.add_association_to_graph()
        return
Esempio n. 6
0
    def _process_article_row(self, row):
        model = Model(self.g)
        # don't bother in test mode
        if self.testMode:
            return

        iarticle_id = self._make_internal_id('article', row['article_id'])
        self.id_hash['article'][row['article_id']] = iarticle_id
        rtype = None
        if row['journal'] != '':
            rtype = Reference.ref_types['journal_article']
        reference = Reference(self.g, iarticle_id, rtype)

        if row['title'] is not None:
            reference.setTitle(row['title'].strip())
        if row['year'] is not None:
            reference.setYear(row['year'])
        reference.addRefToGraph()

        if row['pubmed_id'] is not None:
            pmid = 'PMID:'+str(row['pubmed_id'])
            self.id_hash['article'][row['article_id']] = pmid
            model.addSameIndividual(iarticle_id, pmid)
            model.addComment(pmid, iarticle_id.replace("_:", ''))

        return
Esempio n. 7
0
    def _process_pathway_pubmed(self, limit):
        """
        Indicate that a pathway is annotated directly to a paper (is about)
            via it's pubmed id.
        :param limit:
        :return:
        """
        LOG.info("Processing KEGG pathways to pubmed ids")
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        line_counter = 0
        raw = '/'.join((self.rawdir, self.files['pathway_pubmed']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (pubmed_id, kegg_pathway_num) = row

                if self.test_mode and kegg_pathway_num not in self.test_ids['pathway']:
                    continue

                pubmed_id = pubmed_id.upper()
                # will look like KEGG-path:map04130
                kegg_id = 'KEGG-' + kegg_pathway_num

                r = Reference(graph, pubmed_id, self.globaltt['journal article'])
                r.addRefToGraph()
                graph.addTriple(pubmed_id, self.globaltt['is_about'], kegg_id)

                if not self.test_mode and limit is not None and line_counter > limit:
                    break

        return
Esempio n. 8
0
    def _process_collection(self, collection_id, label, page):
        """
        This function will process the data supplied internally
        about the repository from Coriell.

        Triples:
            Repository a ERO:collection
            rdf:label Literal(label)
            foaf:page Literal(page)

        :param collection_id:
        :param label:
        :param page:
        :return:
        """
        # #############    BUILD THE CELL LINE REPOSITORY    #############
        for graph in [self.graph, self.testgraph]:
            # TODO: How to devise a label for each repository?
            model = Model(graph)
            reference = Reference(graph)
            repo_id = 'CoriellCollection:' + collection_id
            repo_label = label
            repo_page = page

            model.addIndividualToGraph(repo_id, repo_label,
                                       self.globaltt['collection'])
            reference.addPage(repo_id, repo_page)

        return
Esempio n. 9
0
File: CTD.py Progetto: sgml/dipper
    def _make_association(self, subject_id, object_id, rel_id, pubmed_ids):
        """
        Make a reified association given an array of pubmed identifiers.

        Args:
            :param subject_id  id of the subject of the association (gene/chem)
            :param object_id  id of the object of the association (disease)
            :param rel_id  relationship id
            :param pubmed_ids an array of pubmed identifiers
        Returns:
            :return None

        """

        # TODO pass in the relevant Assoc class rather than relying on G2P
        assoc = G2PAssoc(self.graph, self.name, subject_id, object_id, rel_id)
        if pubmed_ids is not None and len(pubmed_ids) > 0:
            for pmid in pubmed_ids:
                ref = Reference(
                    self.graph, pmid, self.globaltt['journal article'])
                ref.addRefToGraph()
                assoc.add_source(pmid)
                assoc.add_evidence(self.globaltt['traceable author statement'])

        assoc.add_association_to_graph()
        return
Esempio n. 10
0
    def _get_pubs(self, entry, graph):
        """
        Extract mentioned publications from the reference list
        :param entry:
        :return:
        """

        ref_to_pmid = {}
        entry_num = entry['mimNumber']

        if 'referenceList' in entry:
            reflist = entry['referenceList']
            for rlst in reflist:
                if 'pubmedID' in rlst['reference']:
                    pub_id = 'PMID:' + str(rlst['reference']['pubmedID'])
                    ref = Reference(graph, pub_id,
                                    self.globaltt['journal article'])
                else:
                    # make blank node for internal reference
                    pub_id = '_:OMIM' + str(entry_num) + 'ref' + str(
                        rlst['reference']['referenceNumber'])

                    ref = Reference(graph, pub_id)
                    title = author_list = source = citation = None
                    if 'title' in rlst['reference']:
                        title = rlst['reference']['title']
                        ref.setTitle(title)
                    if 'authors' in rlst['reference']:
                        author_list = rlst['reference']['authors']
                        ref.setAuthorList(author_list)
                        citation = re.split(r'\.\,', author_list)[0] + ' et al'
                    if 'source' in rlst['reference']:
                        source = rlst['reference']['source']
                    citation = '; '.join([
                        tok for tok in [citation, title, source]
                        if tok is not None
                    ])
                    ref.setShortCitation(citation)
                ref.addRefToGraph()
                ref_to_pmid[rlst['reference']['referenceNumber']] = pub_id

                # add is_about for the pub
                omim_id = 'OMIM:' + str(entry_num)
                graph.addTriple(omim_id, self.globaltt['mentions'], pub_id)

        return ref_to_pmid
Esempio n. 11
0
    def _add_variant_trait_association(self,
                                       variant_id,
                                       mapped_trait_uri,
                                       mapped_trait,
                                       mondo_data,
                                       pubmed_id,
                                       description=None):
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)

        # Use mapped traits for labels, hope that labels do not contain commas

        mapped_traits = [trait.strip() for trait in mapped_trait.split(',')]
        mapped_trait_uris = [
            iri.strip() for iri in mapped_trait_uri.split(',')
        ]
        if mapped_trait_uris:
            for index, trait in enumerate(mapped_trait_uris):
                trait_curie = trait.replace("http://www.ebi.ac.uk/efo/EFO_",
                                            "EFO:")

                if not DipperUtil.is_id_in_mondo(trait_curie, mondo_data):
                    if re.match(r'^EFO', trait_curie):
                        model.addClassToGraph(
                            trait_curie,
                            mapped_traits[index],
                            self.globaltt['phenotype'],
                            class_category=blv.terms['PhenotypicFeature'])
                    LOG.debug("{} not in mondo".format(trait_curie))
                else:
                    LOG.debug("{} in mondo".format(trait_curie))

                pubmed_curie = 'PMID:' + pubmed_id

                ref = Reference(graph, pubmed_curie,
                                self.globaltt['journal article'])
                ref.addRefToGraph()

                if trait_curie is not None and trait_curie != '' and \
                        variant_id is not None and variant_id != '':
                    assoc = G2PAssoc(
                        graph, self.name, variant_id, trait_curie,
                        model.globaltt['contributes to condition'])
                    assoc.add_source(pubmed_curie)

                    assoc.add_evidence(self.globaltt[
                        'combinatorial evidence used in automatic assertion'])

                    if description is not None:
                        assoc.set_description(description)

                    # FIXME score should get added to provenance/study
                    # assoc.set_score(pvalue)

                    assoc.add_association_to_graph()
Esempio n. 12
0
    def parse(self, limit=None):

        count = 0
        for num in range(10, 100):
            fuzzy_gene = "MGI:{0}*".format(num)
            gene = "MGI:{0}".format(num)
            service = Service("http://www.mousemine.org/mousemine/service")
            logging.getLogger('Model').setLevel(logging.CRITICAL)
            logging.getLogger('JSONIterator').setLevel(logging.CRITICAL)
            query = service.new_query("OntologyAnnotation")
            query.add_constraint("subject", "SequenceFeature")
            query.add_constraint("ontologyTerm", "MPTerm")
            query.add_view("subject.primaryIdentifier", "subject.symbol",
                           "subject.sequenceOntologyTerm.name",
                           "ontologyTerm.identifier", "ontologyTerm.name",
                           "evidence.publications.pubMedId",
                           "evidence.comments.type",
                           "evidence.comments.description")
            query.add_sort_order("OntologyAnnotation.ontologyTerm.name", "ASC")
            query.add_constraint("subject.organism.taxonId",
                                 "=",
                                 "10090",
                                 code="A")
            query.add_constraint("subject", "LOOKUP", fuzzy_gene, code="B")
            query.add_constraint("subject.primaryIdentifier",
                                 "CONTAINS",
                                 gene,
                                 code="C")
            query.outerjoin("evidence.comments")

            for row in query.rows():
                mgi_curie = row["subject.primaryIdentifier"]
                mp_curie = row["ontologyTerm.identifier"]
                pub_curie = "PMID:{0}".format(
                    row["evidence.publications.pubMedId"])
                assoc = G2PAssoc(self.graph, self.name, mgi_curie, mp_curie)
                if row["evidence.publications.pubMedId"]:
                    reference = Reference(
                        self.graph, pub_curie,
                        Reference.ref_types['journal_article'])
                    reference.addRefToGraph()
                    assoc.add_source(pub_curie)

                assoc.add_evidence('ECO:0000059')
                assoc.add_association_to_graph()

            if not count % 10 and count != 0:
                count_from = count - 10
                logger.info(
                    "{0} processed ids from MGI:{1}* to MGI:{2}*".format(
                        datetime.datetime.now(), count_from, count))

            count += 1
            if limit and count >= limit:
                break

        return
Esempio n. 13
0
    def _add_variant_trait_association(self,
                                       variant_id,
                                       mapped_trait_uri,
                                       efo_ontology,
                                       pubmed_id,
                                       description=None):
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        # make associations to the EFO terms; there can be >1
        if mapped_trait_uri.strip() != '':
            for trait in re.split(r',', mapped_trait_uri):
                trait = trait.strip()

                trait_curie = trait.replace("http://www.ebi.ac.uk/efo/EFO_",
                                            "EFO:")

                phenotype_query = """
                    SELECT ?trait
                    WHERE {{
                        <{0}> rdfs:subClassOf+ <http://www.ebi.ac.uk/efo/EFO_0000651> .
                        <{0}> rdfs:label ?trait .
                    }}
                """.format(trait)

                query_result = efo_ontology.query(phenotype_query)
                if len(list(query_result)) > 0:
                    if re.match(r'^EFO', trait_curie):
                        model.addClassToGraph(trait_curie,
                                              list(query_result)[0][0],
                                              self.globaltt['Phenotype'])

                pubmed_curie = 'PMID:' + pubmed_id

                ref = Reference(graph, pubmed_curie,
                                self.globaltt['journal article'])
                ref.addRefToGraph()

                assoc = G2PAssoc(graph, self.name, variant_id, trait_curie,
                                 model.globaltt['contributes to condition'])
                assoc.add_source(pubmed_curie)

                assoc.add_evidence(self.globaltt[
                    'combinatorial evidence used in automatic assertion'])

                if description is not None:
                    assoc.set_description(description)

                # FIXME score should get added to provenance/study
                # assoc.set_score(pvalue)
                if trait_curie is not None:
                    assoc.add_association_to_graph()
Esempio n. 14
0
    def process_pub_xrefs(self, limit=None):
        src_key = 'pub_xrefs'
        raw = '/'.join((self.rawdir, self.files[src_key]['file']))
        LOG.info("Processing: %s", self.files[src_key]['file'])
        graph = self.graph
        model = Model(graph)
        col = self.files[src_key]['columns']
        with open(raw, 'r') as csvfile:
            reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in reader:
                wb_ref = row[col.index('wb_ref')].strip()
                xref = row[col.index(
                    'xref')].strip()[:-4]  # strips trailing '<BR>'

                # WBPaper00000009 pmid8805<BR>
                # WBPaper00000011 doi10.1139/z78-244<BR>
                # WBPaper00000012 cgc12<BR>

                ref_curie = 'WormBase:' + wb_ref
                dbxref_curie = None
                if xref[:4] == 'pmid':
                    dbxref_curie = 'PMID:' + xref[4:]
                    reference = Reference(graph, dbxref_curie,
                                          self.globaltt['journal article'])
                elif re.search(r'[\(\)\<\>\[\]\s]', xref):
                    continue
                elif xref[:3] == 'doi':
                    dbxref_curie = 'DOI:' + xref[3:]
                    reference = Reference(graph, dbxref_curie)
                else:
                    # LOG.debug("Other xrefs like %s", xref)
                    continue

                if dbxref_curie is not None:
                    reference.addRefToGraph()
                    model.addSameIndividual(ref_curie, dbxref_curie)

                if limit is not None and reader.line_num > limit:
                    break
Esempio n. 15
0
    def make_association(self, record):
        """
        contstruct the association
        :param record:
        :return: modeled association of  genotype to mammalian phenotype
        """
        model = Model(self.graph)

        record['relation']['id'] = self.resolve("has phenotype")
        # define the triple
        gene = record['subject']['id']
        relation = record['relation']['id']
        phenotype = record['object']['id']

        # instantiate the association
        g2p_assoc = Assoc(self.graph,
                          self.name,
                          sub=gene,
                          obj=phenotype,
                          pred=relation)

        # add the references
        references = record['evidence']['has_supporting_reference']
        # created RGDRef prefix in curie map to route to proper reference URL in RGD
        references = [
            x.replace('RGD', 'RGDRef') if 'PMID' not in x else x
            for x in references
        ]

        if len(references) > 0:
            # make first ref in list the source
            g2p_assoc.add_source(identifier=references[0])
            ref_model = Reference(self.graph, references[0],
                                  self.globaltt['publication'])
            ref_model.addRefToGraph()

        if len(references) > 1:
            # create equivalent source for any other refs in list
            # This seems to be specific to this source and
            # there could be non-equivalent references in this list
            for ref in references[1:]:
                model.addSameIndividual(sub=references[0], obj=ref)

        # add the date created on
        g2p_assoc.add_date(date=record['date'])
        g2p_assoc.add_evidence(self.resolve(
            record['evidence']['type']))  # ?set where?
        g2p_assoc.add_association_to_graph()

        return
Esempio n. 16
0
    def _parse_curated_chem_disease(self, file_path, limit):
        model = Model(self.graph)
        src_key = 'publications'
        col = self.api_fetch[src_key]['columns']

        with open(file_path, 'r') as tsvfile:
            reader = csv.reader(tsvfile, delimiter="\t")
            row = next(reader)
            row[0] = row[0][2:].strip()
            if not self.check_fileheader(col, row):
                pass

            for row in reader:
                # catch comment lines
                if row[0][0] == '#':
                    continue
                self._check_list_len(row, len(col))

                pub_id = row[col.index('Input')]
                # disease_label = row[col.index('DiseaseName')]
                disease_id = row[col.index('DiseaseID')]
                # disease_cat = row[col.index('DiseaseCategories')]
                evidence = row[col.index('DirectEvidence')]
                chem_label = row[col.index('ChemicalName')]
                chem_id = row[col.index('ChemicalID')]
                # cas_rn = row[col.index('CasRN')]
                # gene_symbol = row[col.index('GeneSymbol')]
                # gene_acc = row[col.index('GeneAcc')]

                if disease_id.strip() == '' or chem_id.strip() == '':
                    continue

                rel_id = self.resolve(evidence)
                chem_id = 'MESH:' + chem_id
                model.addClassToGraph(chem_id, chem_label)
                model.addClassToGraph(disease_id, None)
                if pub_id != '':
                    pub_id = 'PMID:' + pub_id
                    ref = Reference(self.graph,
                                    pub_id,
                                    ref_type=self.globaltt['journal article'])
                    ref.addRefToGraph()
                    pubids = [pub_id]
                else:
                    pubids = None
                self._make_association(chem_id, disease_id, rel_id, pubids)

                if not self.test_mode and limit is not None and \
                        reader.line_num >= limit:
                    break
Esempio n. 17
0
    def process_gaf(self, gaffile, limit, id_map=None, eco_map=None):

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph

        model = Model(graph)
        geno = Genotype(graph)
        LOG.info("Processing Gene Associations from %s", gaffile)
        uniprot_hit = 0
        uniprot_miss = 0
        col = self.gaf_columns

        with gzip.open(gaffile, 'rb') as csvfile:
            reader = csv.reader(
                io.TextIOWrapper(csvfile, newline=""), delimiter='\t', quotechar='\"')
            for row in reader:
                # comments start with exclamation
                if row[0][0] == '!':
                    continue

                if len(row) != len(col):
                    LOG.error(
                        "Wrong number of columns %i, expected ... got:\n\t%s",
                        len(col), row)
                    exit(1)

                dbase = row[col.index('DB')].strip()
                gene_num = row[col.index('DB_Object_ID')].strip()
                gene_symbol = row[col.index('DB_Object_Symbol')].strip()
                qualifier = row[col.index('Qualifier')]
                go_id = row[col.index('GO_ID')].strip()
                ref = row[col.index('DB:Reference')].strip()
                eco_symbol = row[col.index('Evidence Code')].strip()
                with_or_from = row[col.index('With (or) From')]
                aspect = row[col.index('Aspect')].strip()
                gene_name = row[col.index('DB_Object_Name')]
                gene_synonym = row[col.index('DB_Object_Synonym')]
                # object_type = row[col.index('DB_Object_Type')].strip()
                taxon = row[col.index('Taxon and Interacting taxon')].strip()
                # date = row[col.index('Date')].strip()
                # assigned_by = row[col.index('Assigned_By')].strip()
                # annotation_extension = row[col.index('Annotation_Extension')]
                # gene_product_form_id = row[col.index('Gene_Product_Form_ID')]

                # test for required fields
                if '' in [row[:10], row[12]]:
                    LOG.error(
                        "Missing required part of annotation on row %i:\n%s",
                        reader.line_num, str(row[:-4]))
                    continue

                # (Don't) deal with qualifier NOT, contributes_to, colocalizes_with
                if re.search(r'NOT', qualifier):
                    continue

                if dbase in self.localtt:
                    dbase = self.localtt[dbase]
                uniprotid = None
                gene_id = None
                if dbase == 'UniProtKB':
                    if id_map is not None and gene_num in id_map:
                        gene_id = id_map[gene_num]
                        uniprotid = ':'.join((dbase, gene_num))
                        (dbase, gene_num) = gene_id.split(':')
                        uniprot_hit += 1
                    else:
                        # LOG.warning(
                        #   "UniProt id %s is without a 1:1 mapping to entrez/ensembl",
                        #    gene_num)
                        uniprot_miss += 1
                        continue
                else:
                    gene_num = gene_num.split(':')[-1]  # last
                    gene_id = ':'.join((dbase, gene_num))

                if self.test_mode and gene_id[:9] != 'NCBIGene:' and\
                        gene_num not in self.test_ids:
                    continue

                model.addClassToGraph(gene_id, gene_symbol)
                if gene_name != '':
                    model.addDescription(gene_id, gene_name)
                if gene_synonym != '':
                    for syn in re.split(r'\|', gene_synonym):
                        syn = syn.strip()
                        if syn[:10] == 'UniProtKB:':
                            model.addTriple(
                                gene_id, self.globaltt['has gene product'], syn)
                        elif re.fullmatch(graph.curie_regexp, syn) is not None:
                            LOG.warning(
                                'possible curie "%s" as a literal synomym for %s',
                                syn, gene_id)
                            model.addSynonym(gene_id, syn)
                        else:
                            model.addSynonym(gene_id, syn)

                for txid in taxon.split('|'):
                    tax_curie = re.sub(r'taxon:', 'NCBITaxon:', txid)
                    geno.addTaxon(tax_curie, gene_id)

                assoc = Assoc(graph, self.name)
                assoc.set_subject(gene_id)
                assoc.set_object(go_id)

                try:
                    eco_id = eco_map[eco_symbol]
                    assoc.add_evidence(eco_id)
                except KeyError:
                    LOG.error("Evidence code (%s) not mapped", eco_symbol)

                refs = re.split(r'\|', ref)
                for ref in refs:
                    ref = ref.strip()
                    if ref != '':
                        prefix = ref.split(':')[0]  # sidestep 'MGI:MGI:'
                        if prefix in self.localtt:
                            prefix = self.localtt[prefix]
                        ref = ':'.join((prefix, ref.split(':')[-1]))
                        refg = Reference(graph, ref)
                        if prefix == 'PMID':
                            ref_type = self.globaltt['journal article']
                            refg.setType(ref_type)
                        refg.addRefToGraph()
                        assoc.add_source(ref)

                # TODO add the source of the annotations from assigned by?

                rel = self.resolve(aspect, mandatory=False)
                if rel is not None and aspect == rel:
                    if aspect == 'F' and re.search(r'contributes_to', qualifier):
                        assoc.set_relationship(self.globaltt['contributes to'])
                    else:
                        LOG.error(
                            "Aspect: %s with qualifier: %s  is not recognized",
                            aspect, qualifier)
                elif rel is not None:
                    assoc.set_relationship(rel)
                    assoc.add_association_to_graph()
                else:
                    LOG.warning("No predicate for association \n%s\n", str(assoc))

                if uniprotid is not None:
                    assoc.set_description('Mapped from ' + uniprotid)
                # object_type should be one of:
                # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
                # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
                # If the precise product type is unknown,
                # gene_product should be used
                ########################################################################

                # Derive G2P Associations from IMP annotations
                # in version 2.1 Pipe will indicate 'OR'
                # and Comma will indicate 'AND'.
                # in version 2.0, multiple values are separated by pipes
                # where the pipe has been used to mean 'AND'
                if eco_symbol == 'IMP' and with_or_from != '':
                    withitems = with_or_from.split('|')
                    phenotypeid = go_id + 'PHENOTYPE'
                    # create phenotype associations
                    for itm in withitems:
                        if itm == '' or re.match(
                                r'(UniProtKB|WBPhenotype|InterPro|HGNC)', itm):
                            LOG.warning(
                                "Skipping  %s from or with %s", uniprotid, itm)
                            continue
                        itm = re.sub(r'MGI\:MGI\:', 'MGI:', itm)
                        itm = re.sub(r'WB:', 'WormBase:', itm)

                        # for worms and fish, they might give a RNAi or MORPH
                        # in these cases make a reagent-targeted gene
                        if re.search('MRPHLNO|CRISPR|TALEN', itm):
                            targeted_gene_id = self.zfin.make_targeted_gene_id(
                                gene_id, itm)
                            geno.addReagentTargetedGene(itm, gene_id, targeted_gene_id)
                            # TODO PYLINT why is this needed?
                            # Redefinition of assoc type from
                            # dipper.models.assoc.Association.Assoc to
                            # dipper.models.assoc.G2PAssoc.G2PAssoc
                            assoc = G2PAssoc(
                                graph, self.name, targeted_gene_id, phenotypeid)
                        elif re.search(r'WBRNAi', itm):
                            targeted_gene_id = self.wbase.make_reagent_targeted_gene_id(
                                gene_id, itm)
                            geno.addReagentTargetedGene(itm, gene_id, targeted_gene_id)
                            assoc = G2PAssoc(
                                graph, self.name, targeted_gene_id, phenotypeid)
                        else:
                            assoc = G2PAssoc(graph, self.name, itm, phenotypeid)
                        for ref in refs:
                            ref = ref.strip()
                            if ref != '':
                                prefix = ref.split(':')[0]
                                if prefix in self.localtt:
                                    prefix = self.localtt[prefix]
                                ref = ':'.join((prefix, ref.split(':')[-1]))
                                assoc.add_source(ref)
                                # experimental phenotypic evidence
                                assoc.add_evidence(
                                    self.globaltt['experimental phenotypic evidence'])
                        assoc.add_association_to_graph()
                        # TODO should the G2PAssoc be the evidence for the GO assoc?

                if not self.test_mode and limit is not None and \
                        reader.line_num > limit:
                    break
            uniprot_tot = (uniprot_hit + uniprot_miss)
            uniprot_per = 0.0
            if uniprot_tot != 0:
                uniprot_per = 100.0 * uniprot_hit / uniprot_tot
            LOG.info(
                "Uniprot: %.2f%% of %i benefited from the 1/4 day id mapping download",
                uniprot_per, uniprot_tot)
Esempio n. 18
0
    def process_nbk_html(self, limit):
        """
        Here we process the gene reviews books to fetch
        the clinical descriptions to include in the ontology.
        We only use books that have been acquired manually,
        as NCBI Bookshelf does not permit automated downloads.
        This parser will only process the books that are found in
        the ```raw/genereviews/books``` directory,
        permitting partial completion.

        :param limit:
        :return:
        """
        model = Model(self.graph)
        cnt = 0
        books_not_found = set()
        clin_des_regx = re.compile(r".*Summary.sec0")
        lit_cite_regex = re.compile(r".*Literature_Cited")
        pubmed_regex = re.compile(r"pubmed")  # ??? for a static string?
        for nbk in self.book_ids:
            cnt += 1
            nbk_id = 'GeneReviews:' + nbk
            book_item = self.all_books.get(nbk)
            url = '/'.join((self.rawdir, book_item['file']))

            # figure out if the book is there; if so, process, otherwise skip
            book_dir = '/'.join((self.rawdir, 'books'))
            book_files = os.listdir(book_dir)
            if ''.join((nbk, '.html')) not in book_files:
                # LOG.warning("No book found locally for %s; skipping", nbk)
                books_not_found.add(nbk)
                continue
            LOG.info("Processing %s", nbk)

            page = open(url)
            soup = BeautifulSoup(page.read())

            # sec0 == clinical description
            clin_summary = soup.find('div', id=clin_des_regx)
            if clin_summary is not None:
                ptext = clin_summary.find('p').text
                ptext = re.sub(r'\s+', ' ', ptext)

                unlst = clin_summary.find('ul')
                if unlst is not None:
                    item_text = list()
                    for lst_itm in unlst.find_all('li'):
                        item_text.append(re.sub(r'\s+', ' ', lst_itm.text))
                    ptext += ' '.join(item_text)

                # add in the copyright and citation info to description
                ptext = ' '.join(
                    (ptext, '[GeneReviews:NBK1116, GeneReviews:NBK138602, ' +
                     nbk_id + ']'))

                model.addDefinition(nbk_id, ptext.strip())

            # get the pubs
            pmid_set = set()
            pub_div = soup.find('div', id=lit_cite_regex)
            if pub_div is not None:
                ref_list = pub_div.find_all('div', attrs={'class': "bk_ref"})
                for ref in ref_list:
                    for anchor in ref.find_all('a',
                                               attrs={'href': pubmed_regex}):
                        if re.match(r'PubMed:', anchor.text):
                            pmnum = re.sub(r'PubMed:\s*', '', anchor.text)
                        else:
                            pmnum = re.search(r'\/pubmed\/(\d+)$',
                                              anchor['href']).group(1)
                        if pmnum is not None:
                            pmid = 'PMID:' + str(pmnum)
                            self.graph.addTriple(pmid,
                                                 self.globaltt['is_about'],
                                                 nbk_id)
                            pmid_set.add(pmnum)
                            reference = Reference(
                                self.graph, pmid,
                                self.globaltt['journal article'])
                            reference.addRefToGraph()

            # TODO add author history, copyright, license to dataset

            # TODO get PMID-NBKID equivalence (near foot of page),
            # and make it "is about" link
            # self.gu.addTriple(
            #   self.graph, pmid,
            #   self.globaltt['is_about'], nbk_id)
            # for example: NBK1191 PMID:20301370

            # add the book to the dataset
            self.dataset.setFileAccessUrl(book_item['url'])

            if limit is not None and cnt > limit:
                break

            # finish looping through books

        bknfd = len(books_not_found)
        if len(books_not_found) > 0:
            if bknfd > 100:
                LOG.warning("There were %d books not found.", bknfd)
            else:
                LOG.warning(
                    "The following %d books were not found locally: %s", bknfd,
                    str(books_not_found))
        LOG.info("Finished processing %d books for clinical descriptions",
                 cnt - bknfd)

        return
Esempio n. 19
0
    def process_gaf(self, file, limit, id_map=None, eco_map=None):

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph

        model = Model(graph)
        geno = Genotype(graph)
        LOG.info("Processing Gene Associations from %s", file)
        line_counter = 0
        uniprot_hit = 0
        uniprot_miss = 0
        if '7955' in self.tax_ids:
            zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
        if '6239' in self.tax_ids:
            wbase = WormBase(self.graph_type, self.are_bnodes_skized)

        with gzip.open(file, 'rb') as csvfile:
            filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                    delimiter='\t',
                                    quotechar='\"')
            for row in filereader:
                line_counter += 1
                # comments start with exclamation
                if re.match(r'!', ''.join(row)):
                    continue

                if len(row) > 17 or len(row) < 15:
                    LOG.warning(
                        "Wrong number of columns %i, expected 15 or 17\n%s",
                        len(row), row)
                    continue

                if 17 > len(row) >= 15:
                    row += [""] * (17 - len(row))

                (dbase, gene_num, gene_symbol, qualifier, go_id, ref,
                 eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
                 object_type, taxon, date, assigned_by, annotation_extension,
                 gene_product_form_id) = row

                # test for required fields
                if (dbase == '' or gene_num == '' or gene_symbol == ''
                        or go_id == '' or ref == '' or eco_symbol == ''
                        or aspect == '' or object_type == '' or taxon == ''
                        or date == '' or assigned_by == ''):
                    LOG.error(
                        "Missing required part of annotation on row %d:\n" +
                        '\t'.join(row), line_counter)
                    continue

                # deal with qualifier NOT, contributes_to, colocalizes_with
                if re.search(r'NOT', qualifier):
                    continue

                if dbase in self.localtt:
                    dbase = self.localtt[dbase]
                uniprotid = None
                gene_id = None
                if dbase == 'UniProtKB':
                    if id_map is not None and gene_num in id_map:
                        gene_id = id_map[gene_num]
                        uniprotid = ':'.join((dbase, gene_num))
                        (dbase, gene_num) = gene_id.split(':')
                        uniprot_hit += 1
                    else:
                        # LOG.warning(
                        #   "UniProt id %s  is without a 1:1 mapping to entrez/ensembl",
                        #    gene_num)
                        uniprot_miss += 1
                        continue
                else:
                    gene_num = gene_num.split(':')[-1]  # last
                    gene_id = ':'.join((dbase, gene_num))

                if self.test_mode and not (re.match(r'NCBIGene', gene_id)
                                           and int(gene_num) in self.test_ids):
                    continue

                model.addClassToGraph(gene_id, gene_symbol)
                if gene_name != '':
                    model.addDescription(gene_id, gene_name)
                if gene_synonym != '':
                    for syn in re.split(r'\|', gene_synonym):
                        model.addSynonym(gene_id, syn.strip())
                if re.search(r'\|', taxon):
                    # TODO add annotations with >1 taxon
                    LOG.info(">1 taxon (%s) on line %d.  skipping", taxon,
                             line_counter)
                else:
                    tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                    geno.addTaxon(tax_id, gene_id)

                assoc = Assoc(graph, self.name)
                assoc.set_subject(gene_id)
                assoc.set_object(go_id)

                try:
                    eco_id = eco_map[eco_symbol]
                    assoc.add_evidence(eco_id)
                except KeyError:
                    LOG.error("Evidence code (%s) not mapped", eco_symbol)

                refs = re.split(r'\|', ref)
                for ref in refs:
                    ref = ref.strip()
                    if ref != '':
                        prefix = ref.split(':')[0]  # sidestep 'MGI:MGI:'
                        if prefix in self.localtt:
                            prefix = self.localtt[prefix]
                        ref = ':'.join((prefix, ref.split(':')[-1]))
                        refg = Reference(graph, ref)
                        if prefix == 'PMID':
                            ref_type = self.globaltt['journal article']
                            refg.setType(ref_type)
                        refg.addRefToGraph()
                        assoc.add_source(ref)

                # TODO add the source of the annotations from assigned by?

                rel = self.resolve(aspect, mandatory=False)
                if rel is not None and aspect == rel:
                    if aspect == 'F' and re.search(r'contributes_to',
                                                   qualifier):
                        assoc.set_relationship(self.globaltt['contributes to'])
                    else:
                        LOG.error(
                            "Aspect: %s with qualifier: %s  is not recognized",
                            aspect, qualifier)
                elif rel is not None:
                    assoc.set_relationship(rel)
                    assoc.add_association_to_graph()
                else:
                    LOG.warning("No predicate for association \n%s\n",
                                str(assoc))

                if uniprotid is not None:
                    assoc.set_description('Mapped from ' + uniprotid)
                # object_type should be one of:
                # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
                # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
                # If the precise product type is unknown,
                # gene_product should be used
                #######################################################################

                # Derive G2P Associations from IMP annotations
                # in version 2.1 Pipe will indicate 'OR'
                # and Comma will indicate 'AND'.
                # in version 2.0, multiple values are separated by pipes
                # where the pipe has been used to mean 'AND'
                if eco_symbol == 'IMP' and with_or_from != '':
                    withitems = re.split(r'\|', with_or_from)
                    phenotypeid = go_id + 'PHENOTYPE'
                    # create phenotype associations
                    for i in withitems:
                        if i == '' or re.match(
                                r'(UniProtKB|WBPhenotype|InterPro|HGNC)', i):
                            LOG.warning(
                                "Don't know what having a uniprot id " +
                                "in the 'with' column means of %s", uniprotid)
                            continue
                        i = re.sub(r'MGI\:MGI\:', 'MGI:', i)
                        i = re.sub(r'WB:', 'WormBase:', i)

                        # for worms and fish, they might give a RNAi or MORPH
                        # in these cases make a reagent-targeted gene
                        if re.search('MRPHLNO|CRISPR|TALEN', i):
                            targeted_gene_id = zfin.make_targeted_gene_id(
                                gene_id, i)
                            geno.addReagentTargetedGene(
                                i, gene_id, targeted_gene_id)
                            # TODO PYLINT why is this needed?
                            # Redefinition of assoc type from
                            # dipper.models.assoc.Association.Assoc to
                            # dipper.models.assoc.G2PAssoc.G2PAssoc
                            assoc = G2PAssoc(graph, self.name,
                                             targeted_gene_id, phenotypeid)
                        elif re.search(r'WBRNAi', i):
                            targeted_gene_id = wbase.make_reagent_targeted_gene_id(
                                gene_id, i)
                            geno.addReagentTargetedGene(
                                i, gene_id, targeted_gene_id)
                            assoc = G2PAssoc(graph, self.name,
                                             targeted_gene_id, phenotypeid)
                        else:
                            assoc = G2PAssoc(graph, self.name, i, phenotypeid)
                        for ref in refs:
                            ref = ref.strip()
                            if ref != '':
                                prefix = ref.split(':')[0]
                                if prefix in self.localtt:
                                    prefix = self.localtt[prefix]
                                ref = ':'.join((prefix, ref.split(':')[-1]))
                                assoc.add_source(ref)
                                # experimental phenotypic evidence
                                assoc.add_evidence(self.globaltt[
                                    'experimental phenotypic evidence'])
                        assoc.add_association_to_graph()
                        # TODO should the G2PAssoc be
                        # the evidence for the GO assoc?

                if not self.test_mode and limit is not None and line_counter > limit:
                    break
            uniprot_tot = (uniprot_hit + uniprot_miss)
            uniprot_per = 0.0
            if uniprot_tot != 0:
                uniprot_per = 100.0 * uniprot_hit / uniprot_tot
            LOG.info(
                "Uniprot: %.2f%% of %i benefited from the 1/4 day id mapping download",
                uniprot_per, uniprot_tot)
        return
Esempio n. 20
0
    def _process_qtls_genomic_location(
            self, raw, src_key, txid, build_id, build_label, common_name, limit=None):
        """
        This method

        Triples created:

        :param limit:
        :return:
        """
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        geno = Genotype(graph)
        # assume that chrs get added to the genome elsewhere

        taxon_curie = 'NCBITaxon:' + txid
        eco_id = self.globaltt['quantitative trait analysis evidence']
        LOG.info("Processing QTL locations for %s from %s", taxon_curie, raw)
        with gzip.open(raw, 'rt', encoding='ISO-8859-1') as tsvfile:
            reader = csv.reader(tsvfile, delimiter="\t")
            # no header in GFF, so no header checking
            col = self.files[src_key]['columns']
            col_len = len(col)
            for row in reader:
                if row[0][0] == '#':
                    # LOG.info(row)
                    continue

                if len(row) != col_len and ''.join(row[col_len:]) != '':
                    LOG.warning(
                        "Problem parsing in %s row %s\n"
                        "got %s cols but expected %s",
                        raw, reader.line_num, len(row), col_len)
                    LOG.info(row)
                    continue

                chromosome = row[col.index('SEQNAME')].strip()
                # qtl_source = row[col.index('SOURCE')].strip()
                # qtl_type = row[col.index('FEATURE')].strip()
                start_bp = row[col.index('START')].strip()
                stop_bp = row[col.index('END')].strip()
                # score = row[col.index('SCORE')].strip()
                strand = row[col.index('STRAND')].strip()
                # frame = row[col.index('FRAME')].strip()
                attr = row[col.index('ATTRIBUTE')].strip()

                example = '''
Chr.Z   Animal QTLdb    Production_QTL  33954873      34023581...
QTL_ID=2242;Name="Spleen percentage";Abbrev="SPLP";PUBMED_ID=17012160;trait_ID=2234;
trait="Spleen percentage";breed="leghorn";"FlankMarkers=ADL0022";VTO_name="spleen mass";
MO_name="spleen weight to body weight ratio";Map_Type="Linkage";Model="Mendelian";
Test_Base="Chromosome-wise";Significance="Significant";P-value="<0.05";F-Stat="5.52";
Variance="2.94";Dominance_Effect="-0.002";Additive_Effect="0.01
                '''
                str(example)
                # make dictionary of attributes
                # keys are:
                # QTL_ID,Name,Abbrev,PUBMED_ID,trait_ID,trait,FlankMarkers,
                # VTO_name,Map_Type,Significance,P-value,Model,
                # Test_Base,Variance, Bayes-value,PTO_name,gene_IDsrc,peak_cM,
                # CMO_name,gene_ID,F-Stat,LOD-score,Additive_Effect,
                # Dominance_Effect,Likelihood_Ratio,LS-means,Breed,
                # trait (duplicate with Name),Variance,Bayes-value,
                # F-Stat,LOD-score,Additive_Effect,Dominance_Effect,
                # Likelihood_Ratio,LS-means

                # deal with poorly formed attributes
                if re.search(r'"FlankMarkers";', attr):
                    attr = re.sub(r'FlankMarkers;', '', attr)
                attr_items = re.sub(r'"', '', attr).split(";")
                bad_attrs = set()
                for attributes in attr_items:
                    if not re.search(r'=', attributes):
                        # remove this attribute from the list
                        bad_attrs.add(attributes)

                attr_set = set(attr_items) - bad_attrs
                attribute_dict = dict(item.split("=") for item in attr_set)

                qtl_num = attribute_dict.get('QTL_ID')
                if self.test_mode and int(qtl_num) not in self.test_ids:
                    continue
                # make association between QTL and trait based on taxon

                qtl_id = common_name + 'QTL:' + str(qtl_num)
                model.addIndividualToGraph(qtl_id, None, self.globaltt['QTL'])
                geno.addTaxon(taxon_curie, qtl_id)

                #
                trait_id = 'AQTLTrait:' + attribute_dict.get('trait_ID')

                # if pub is in attributes, add it to the association
                pub_id = None
                if 'PUBMED_ID' in attribute_dict.keys():
                    pub_id = attribute_dict.get('PUBMED_ID')
                    if re.match(r'ISU.*', pub_id):
                        pub_id = 'AQTLPub:' + pub_id.strip()
                        reference = Reference(graph, pub_id)
                    else:
                        pub_id = 'PMID:' + pub_id.strip()
                        reference = Reference(
                            graph, pub_id, self.globaltt['journal article'])
                    reference.addRefToGraph()

                # Add QTL to graph
                assoc = G2PAssoc(
                    graph, self.name, qtl_id, trait_id,
                    self.globaltt['is marker for'])
                assoc.add_evidence(eco_id)
                assoc.add_source(pub_id)
                if 'P-value' in attribute_dict.keys():
                    scr = re.sub(r'<', '', attribute_dict.get('P-value'))
                    if ',' in scr:
                        scr = re.sub(r',', '.', scr)
                    if scr.isnumeric():
                        score = float(scr)
                        assoc.set_score(score)

                assoc.add_association_to_graph()
                # TODO make association to breed
                # (which means making QTL feature in Breed background)

                # get location of QTL
                chromosome = re.sub(r'Chr\.', '', chromosome)
                chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')

                chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
                geno.addChromosomeInstance(
                    chromosome, build_id, build_label, chrom_id)
                qtl_feature = Feature(graph, qtl_id, None, self.globaltt['QTL'])
                if start_bp == '':
                    start_bp = None
                qtl_feature.addFeatureStartLocation(
                    start_bp, chrom_in_build_id, strand,
                    [self.globaltt['FuzzyPosition']])
                if stop_bp == '':
                    stop_bp = None
                qtl_feature.addFeatureEndLocation(
                    stop_bp, chrom_in_build_id, strand,
                    [self.globaltt['FuzzyPosition']])
                qtl_feature.addTaxonToFeature(taxon_curie)
                qtl_feature.addFeatureToGraph()

                if not self.test_mode and limit is not None and reader.line_num > limit:
                    break

        # LOG.warning("Bad attribute flags in this file")  # what does this even mean??
        LOG.info("Done with QTL genomic mappings for %s", taxon_curie)
Esempio n. 21
0
    def _process_phenotype_data(self, limit):
        """
        NOTE: If a Strain carries more than one mutation,
        then each Mutation description,
        i.e., the set: (
            Mutation Type - Chromosome - Gene Symbol -
            Gene Name - Allele Symbol - Allele Name)
        will require a separate line.

        Note that MMRRC curates phenotypes to alleles,
        even though they distribute only one file with the
        phenotypes appearing to be associated with a strain.

        So, here we process the allele-to-phenotype relationships separately
        from the strain-to-allele relationships.

        :param limit:
        :return:

        """

        src_key = 'catalog'
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        fname = '/'.join((self.rawdir, self.files[src_key]['file']))

        self.strain_hash = {}
        self.id_label_hash = {}
        genes_with_no_ids = set()
        stem_cell_class = self.globaltt['stem cell']
        mouse_taxon = self.globaltt['Mus musculus']
        geno = Genotype(graph)
        with open(fname, 'r', encoding="utf8") as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            # First line is header not date/version info. This changed recently,
            # apparently as of Sep 2019. Also, 3rd line is no longer blank.
            row = [x.strip() for x in next(reader)]  # messy messy
            col = self.files['catalog']['columns']
            strain_missing_allele = []  # to count the ones w/insufficent info
            if not self.check_fileheader(col, row):
                pass

            for row in reader:
                strain_id = row[col.index('STRAIN/STOCK_ID')].strip()
                strain_label = row[col.index('STRAIN/STOCK_DESIGNATION')]
                # strain_type_symbol = row[col.index('STRAIN_TYPE')]
                strain_state = row[col.index('STATE')]
                mgi_allele_id = row[col.index(
                    'MGI_ALLELE_ACCESSION_ID')].strip()
                mgi_allele_symbol = row[col.index('ALLELE_SYMBOL')]
                # mgi_allele_name = row[col.index('ALLELE_NAME')]
                # mutation_type = row[col.index('MUTATION_TYPE')]
                # chrom = row[col.index('CHROMOSOME')]
                mgi_gene_id = row[col.index('MGI_GENE_ACCESSION_ID')].strip()
                mgi_gene_symbol = row[col.index('GENE_SYMBOL')].strip()
                mgi_gene_name = row[col.index('GENE_NAME')]
                # sds_url = row[col.index('SDS_URL')]
                # accepted_date = row[col.index('ACCEPTED_DATE')]
                mpt_ids = row[col.index('MPT_IDS')].strip()
                pubmed_nums = row[col.index('PUBMED_IDS')].strip()
                research_areas = row[col.index('RESEARCH_AREAS')].strip()

                if self.test_mode and (strain_id not in self.test_ids) \
                        or mgi_gene_name == 'withdrawn':
                    continue

                # strip off stuff after the dash -
                # is the holding center important?
                # MMRRC:00001-UNC --> MMRRC:00001
                strain_id = re.sub(r'-\w+$', '', strain_id)

                self.id_label_hash[strain_id] = strain_label

                # get the variant or gene to save for later building of
                # the genotype
                if strain_id not in self.strain_hash:
                    self.strain_hash[strain_id] = {
                        'variants': set(),
                        'genes': set()
                    }

                # flag bad ones
                if mgi_allele_id[:4] != 'MGI:' and mgi_allele_id != '':
                    LOG.error("Erroneous MGI allele id: %s", mgi_allele_id)
                    if mgi_allele_id[:3] == 'MG:':
                        mgi_allele_id = 'MGI:' + mgi_allele_id[3:]
                    else:
                        mgi_allele_id = ''

                if mgi_allele_id != '':
                    self.strain_hash[strain_id]['variants'].add(mgi_allele_id)
                    self.id_label_hash[mgi_allele_id] = mgi_allele_symbol

                    # use the following if needing to add the sequence alteration types
                    # var_type = self.localtt[mutation_type]
                    # make a sequence alteration for this variant locus,
                    # and link the variation type to it
                    # sa_id = '_'+re.sub(r':','',mgi_allele_id)+'SA'
                    # if self.nobnodes:
                    #     sa_id = ':'+sa_id
                    # gu.addIndividualToGraph(g, sa_id, None, var_type)
                    # geno.addSequenceAlterationToVariantLocus(sa_id, mgi_allele_id)

                # scrub out any spaces, fix known issues
                mgi_gene_id = re.sub(r'\s+', '', mgi_gene_id)
                if mgi_gene_id == 'NULL':
                    mgi_gene_id = ''
                elif mgi_gene_id[:7] == 'GeneID:':
                    mgi_gene_id = 'NCBIGene:' + mgi_gene_id[7:]

                if mgi_gene_id != '':
                    try:
                        [curie, localid] = mgi_gene_id.split(':')
                    except ValueError as verror:
                        LOG.warning(
                            "Problem parsing mgi_gene_id %s from file %s: %s",
                            mgi_gene_id, fname, verror)
                    if curie not in ['MGI', 'NCBIGene']:
                        LOG.info("MGI Gene id not recognized: %s", mgi_gene_id)
                    self.strain_hash[strain_id]['genes'].add(mgi_gene_id)
                    self.id_label_hash[mgi_gene_id] = mgi_gene_symbol

                # catch some errors - too many. report summary at the end
                # some things have gene labels, but no identifiers - report
                if mgi_gene_symbol != '' and mgi_gene_id == '':
                    # LOG.error(
                    #    "Gene label with no MGI identifier for strain %s: %s",
                    #    strain_id, mgi_gene_symbol)
                    genes_with_no_ids.add(mgi_gene_symbol)
                    # make a temp id for genes that aren't identified ... err wow.
                    # tmp_gene_id = '_' + mgi_gene_symbol
                    # self.id_label_hash[tmp_gene_id.strip()] = mgi_gene_symbol
                    # self.strain_hash[strain_id]['genes'].add(tmp_gene_id)

                # split apart the mp ids
                # ataxia [MP:0001393] ,hypoactivity [MP:0001402] ...
                # mpt_ids are a comma delimited list
                # labels with MP terms following in brackets
                phenotype_ids = []
                if mpt_ids != '':
                    for lb_mp in mpt_ids.split(r','):
                        lb_mp = lb_mp.strip()
                        if lb_mp[-1:] == ']' and lb_mp[-12:-8] == '[MP:':
                            phenotype_ids.append(lb_mp[-11:-2])

                # pubmed ids are space delimited
                pubmed_ids = []
                if pubmed_nums != '':
                    for pm_num in re.split(r'\s+', pubmed_nums):
                        pmid = 'PMID:' + pm_num.strip()
                        pubmed_ids.append(pmid)
                        ref = Reference(graph, pmid,
                                        self.globaltt['journal article'])
                        ref.addRefToGraph()

                # https://www.mmrrc.org/catalog/sds.php?mmrrc_id=00001
                # is a good example of 4 genotype parts

                model.addClassToGraph(mouse_taxon, None)
                if research_areas == '':
                    research_areas = None
                else:
                    research_areas = 'Research Areas: ' + research_areas
                strain_type = mouse_taxon
                if strain_state == 'ES':
                    strain_type = stem_cell_class
                model.addIndividualToGraph(  # an inst of mouse??
                    strain_id, strain_label, strain_type, research_areas)
                model.makeLeader(strain_id)

                # phenotypes are associated with the alleles
                for pid in phenotype_ids:
                    # assume the phenotype label is in some ontology
                    model.addClassToGraph(pid, None)
                    if mgi_allele_id is not None and mgi_allele_id != '':
                        assoc = G2PAssoc(graph, self.name, mgi_allele_id, pid,
                                         self.globaltt['has phenotype'])
                        for p in pubmed_ids:
                            assoc.add_source(p)
                        assoc.add_association_to_graph()
                    else:
                        # too chatty here. report aggregate
                        # LOG.info("Phenotypes and no allele for %s", strain_id)
                        strain_missing_allele.append(strain_id)

                if not self.test_mode and (limit is not None
                                           and reader.line_num > limit):
                    break

            # report misses
            if strain_missing_allele:
                LOG.info("Phenotypes and no allele for %i strains",
                         len(strain_missing_allele))

            # now that we've collected all of the variant information, build it
            # we don't know their zygosities
            for s in self.strain_hash:
                h = self.strain_hash.get(s)
                variants = h['variants']
                genes = h['genes']
                vl_set = set()
                # make variant loci for each gene
                if variants:
                    for var in variants:
                        vl_id = var.strip()
                        vl_symbol = self.id_label_hash[vl_id]
                        geno.addAllele(vl_id, vl_symbol,
                                       self.globaltt['variant_locus'])
                        vl_set.add(vl_id)
                        if len(variants) == 1 and len(genes) == 1:
                            for gene in genes:
                                geno.addAlleleOfGene(vl_id, gene)
                        else:
                            geno.addAllele(vl_id, vl_symbol)
                else:  # len(vars) == 0
                    # it's just anonymous variants in some gene
                    for gene in genes:
                        vl_id = '_:' + re.sub(r':', '', gene) + '-VL'
                        vl_symbol = self.id_label_hash[gene] + '<?>'
                        self.id_label_hash[vl_id] = vl_symbol
                        geno.addAllele(vl_id, vl_symbol,
                                       self.globaltt['variant_locus'])
                        geno.addGene(gene, self.id_label_hash[gene])
                        geno.addAlleleOfGene(vl_id, gene)
                        vl_set.add(vl_id)

                # make the vslcs
                vl_list = sorted(vl_set)
                vslc_list = []
                for vl in vl_list:
                    # for unknown zygosity
                    vslc_id = re.sub(r'^_', '', vl) + 'U'
                    vslc_id = re.sub(r':', '', vslc_id)
                    vslc_id = '_:' + vslc_id
                    vslc_label = self.id_label_hash[vl] + '/?'
                    self.id_label_hash[vslc_id] = vslc_label
                    vslc_list.append(vslc_id)
                    geno.addPartsToVSLC(vslc_id, vl, None,
                                        self.globaltt['indeterminate'],
                                        self.globaltt['has_variant_part'],
                                        None)
                    model.addIndividualToGraph(
                        vslc_id, vslc_label,
                        self.globaltt['variant single locus complement'])
                if vslc_list:
                    if len(vslc_list) > 1:
                        gvc_id = '-'.join(vslc_list)
                        gvc_id = re.sub(r'_|:', '', gvc_id)
                        gvc_id = '_:' + gvc_id
                        gvc_label = '; '.join(self.id_label_hash[v]
                                              for v in vslc_list)
                        model.addIndividualToGraph(
                            gvc_id, gvc_label,
                            self.globaltt['genomic_variation_complement'])
                        for vslc_id in vslc_list:
                            geno.addVSLCtoParent(vslc_id, gvc_id)
                    else:
                        # the GVC == VSLC, so don't have to make an extra piece
                        gvc_id = vslc_list.pop()
                        gvc_label = self.id_label_hash[gvc_id]

                    genotype_label = gvc_label + ' [n.s.]'
                    bkgd_id = re.sub(
                        r':', '', '-'.join(
                            (self.globaltt['unspecified_genomic_background'],
                             s)))
                    genotype_id = '-'.join((gvc_id, bkgd_id))
                    bkgd_id = '_:' + bkgd_id
                    geno.addTaxon(mouse_taxon, bkgd_id)
                    geno.addGenomicBackground(
                        bkgd_id, 'unspecified (' + s + ')',
                        self.globaltt['unspecified_genomic_background'],
                        "A placeholder for the unspecified genetic background for "
                        + s)
                    geno.addGenomicBackgroundToGenotype(
                        bkgd_id, genotype_id,
                        self.globaltt['unspecified_genomic_background'])
                    geno.addParts(gvc_id, genotype_id,
                                  self.globaltt['has_variant_part'])
                    geno.addGenotype(genotype_id, genotype_label)
                    graph.addTriple(s, self.globaltt['has_genotype'],
                                    genotype_id)
                else:
                    # LOG.debug(
                    #   "Strain %s is not making a proper genotype.", s)
                    pass

            LOG.warning(
                "The following gene symbols did not list identifiers: %s",
                str(sorted(list(genes_with_no_ids))))
            LOG.error('%i symbols given are missing their gene identifiers',
                      len(genes_with_no_ids))

        return
Esempio n. 22
0
    def process_allele_phenotype(self, limit=None):
        """
        This file compactly lists variant to phenotype associations,
        such that in a single row, there may be >1 variant listed
        per phenotype and paper.  This indicates that each variant is
        individually assocated with the given phenotype,
        as listed in 1+ papers.
        (Not that the combination of variants is producing the phenotype.)
        :param limit:
        :return:

        """

        raw = '/'.join((self.rawdir, self.files['allele_pheno']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        logger.info("Processing Allele phenotype associations")
        line_counter = 0
        geno = Genotype(g)
        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                if re.match(r'!', ''.join(row)):  # header
                    continue
                line_counter += 1
                (db, gene_num, gene_symbol, is_not, phenotype_id, ref,
                 eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
                 gene_class, taxon, date, assigned_by, blank, blank2) = row

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                # TODO add NOT phenotypes
                if is_not == 'NOT':
                    continue

                eco_id = None
                if eco_symbol == 'IMP':
                    eco_id = 'ECO:0000015'
                elif eco_symbol.strip() != '':
                    logger.warning("Encountered an ECO code we don't have: %s",
                                   eco_symbol)

                # according to the GOA spec, persons are not allowed to be
                # in the reference column, therefore they the variant and
                # persons are swapped between the reference and with column.
                # we unswitch them here.
                temp_var = temp_ref = None
                if re.search(r'WBVar|WBRNAi', ref):
                    temp_var = ref
                    # move the paper from the with column into the ref
                if re.search(r'WBPerson', with_or_from):
                    temp_ref = with_or_from
                if temp_var is not None or temp_ref is not None:
                    with_or_from = temp_var
                    ref = temp_ref

                allele_list = re.split(r'\|', with_or_from)
                if len(allele_list) == 0:
                    logger.error(
                        "Missing alleles from phenotype assoc at line %d",
                        line_counter)
                    continue
                else:
                    for a in allele_list:
                        allele_num = re.sub(r'WB:', '', a.strip())
                        allele_id = 'WormBase:' + allele_num
                        gene_id = 'WormBase:' + gene_num

                        if re.search(r'WBRNAi', allele_id):
                            # make the reagent-targeted gene,
                            # & annotate that instead of the RNAi item directly
                            rnai_num = re.sub(r'WormBase:', '', allele_id)
                            rnai_id = allele_id
                            rtg_id = self.make_reagent_targeted_gene_id(
                                gene_num, rnai_num)
                            geno.addReagentTargetedGene(
                                rnai_id, 'WormBase:' + gene_num, rtg_id)
                            geno.addGeneTargetingReagent(
                                rnai_id, None, geno.genoparts['RNAi_reagent'],
                                gene_id)
                            allele_id = rtg_id
                        elif re.search(r'WBVar', allele_id):
                            # this may become deprecated by using wormmine
                            # make the allele to gene relationship
                            # the WBVars are really sequence alterations

                            # the public name will come from elsewhere
                            geno.addSequenceAlteration(allele_id, None)
                            vl_id = '_:' + '-'.join((gene_num, allele_num))
                            geno.addSequenceAlterationToVariantLocus(
                                allele_id, vl_id)
                            geno.addAlleleOfGene(vl_id, gene_id)
                        else:
                            logger.warning(
                                "Some kind of allele I don't recognize: %s",
                                allele_num)
                            continue
                        assoc = G2PAssoc(g, self.name, allele_id, phenotype_id)

                        if eco_id is not None:
                            assoc.add_evidence(eco_id)

                        if ref is not None and ref != '':
                            ref = re.sub(r'(WB:|WB_REF:)', 'WormBase:', ref)
                            reference = Reference(g, ref)
                            if re.search(r'Person', ref):
                                reference.setType(
                                    reference.ref_types['person'])
                                # also add
                                # inferred from background scientific knowledge
                                assoc.add_evidence('ECO:0000001')
                            reference.addRefToGraph()
                            assoc.add_source(ref)

                        assoc.add_association_to_graph()

                        # finish looping through all alleles

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return
Esempio n. 23
0
    def _get_process_allelic_variants(self, entry, g):
        model = Model(g)
        reference = Reference(g)
        geno = Genotype(g)
        if entry is not None:
            # to hold the entry-specific publication mentions
            # for the allelic variants
            publist = {}
            entry_num = entry['mimNumber']

            # process the ref list just to get the pmids
            ref_to_pmid = self._get_pubs(entry, g)

            if 'allelicVariantList' in entry:
                allelicVariantList = entry['allelicVariantList']
                for al in allelicVariantList:
                    al_num = al['allelicVariant']['number']
                    al_id = 'OMIM:'+str(entry_num)+'.'+str(al_num).zfill(4)
                    al_label = None
                    al_description = None
                    if al['allelicVariant']['status'] == 'live':
                        publist[al_id] = set()
                        if 'mutations' in al['allelicVariant']:
                            al_label = al['allelicVariant']['mutations']
                        if 'text' in al['allelicVariant']:
                            al_description = al['allelicVariant']['text']
                            m = re.findall(r'\{(\d+)\:', al_description)
                            publist[al_id] = set(m)
                        geno.addAllele(
                            al_id, al_label, geno.genoparts['variant_locus'],
                            al_description)
                        geno.addAlleleOfGene(
                            al_id, 'OMIM:'+str(entry_num),
                            geno.object_properties[
                                'is_sequence_variant_instance_of'])
                        for r in publist[al_id]:
                            pmid = ref_to_pmid[int(r)]
                            g.addTriple(
                                pmid, model.object_properties['is_about'],
                                al_id)
                        # look up the pubmed id in the list of references
                        if 'dbSnps' in al['allelicVariant']:
                            dbsnp_ids = \
                                re.split(r',', al['allelicVariant']['dbSnps'])
                            for dnum in dbsnp_ids:
                                did = 'dbSNP:'+dnum.strip()
                                model.addIndividualToGraph(did, None)
                                model.addSameIndividual(al_id, did)
                        if 'clinvarAccessions' in al['allelicVariant']:
                            # clinvarAccessions triple semicolon delimited
                            # each >1 like RCV000020059;;;
                            rcv_ids = \
                                re.split(
                                    r';;;',
                                    al['allelicVariant']['clinvarAccessions'])
                            rcv_ids = [
                                (re.match(r'(RCV\d+);*', r)).group(1)
                                for r in rcv_ids]
                            for rnum in rcv_ids:
                                rid = 'ClinVar:'+rnum
                                model.addXref(al_id, rid)
                        reference.addPage(
                            al_id, "http://omim.org/entry/" +
                            str(entry_num)+"#" + str(al_num).zfill(4))
                    elif re.search(
                            r'moved', al['allelicVariant']['status']):
                        # for both 'moved' and 'removed'
                        moved_ids = None
                        if 'movedTo' in al['allelicVariant']:
                            moved_id = 'OMIM:'+al['allelicVariant']['movedTo']
                            moved_ids = [moved_id]
                        model.addDeprecatedIndividual(al_id, moved_ids)
                    else:
                        logger.error('Uncaught alleleic variant status %s',
                                     al['allelicVariant']['status'])
                # end loop allelicVariantList

        return
Esempio n. 24
0
    def _process_data(self, source, limit=None):
        """
        This function will process the data files from Coriell.
        We make the assumption that any alleles listed are variants
        (alternates to w.t.)

        Triples: (examples)

        :NIGMSrepository a CLO_0000008 #repository
        label : NIGMS Human Genetic Cell Repository
        foaf:page
         https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

        line_id a CL_0000057,  #fibroblast line
            derives_from patient_id
            part_of :NIGMSrepository
            RO:model_of OMIM:disease_id

        patient id a foaf:person,
            label: "fibroblast from patient 12345 with disease X"
            member_of family_id  #what is the right thing here?
            SIO:race EFO:caucasian  #subclass of EFO:0001799
            in_taxon NCBITaxon:9606
            dc:description Literal(remark)
            RO:has_phenotype OMIM:disease_id
            GENO:has_genotype genotype_id

        family_id a owl:NamedIndividual
            foaf:page
             "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

        genotype_id a intrinsic_genotype
            GENO:has_alternate_part allelic_variant_id
            we don't necessarily know much about the genotype,
            other than the allelic variant. also there's the sex here

        pub_id mentions cell_line_id

        :param raw:
        :param limit:
        :return:

        """
        raw = '/'.join((self.rawdir, self.files[source]['file']))

        LOG.info("Processing Data from %s", raw)

        if self.testMode:  # set the graph to build
            graph = self.testgraph
        else:
            graph = self.graph

        family = Family(graph)
        model = Model(graph)

        line_counter = 1
        geno = Genotype(graph)
        diputil = DipperUtil()
        col = self.files[source]['columns']
        # affords access with
        # x = row[col.index('x')].strip()

        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar=r'"')
            # we can keep a close watch on changing file formats
            fileheader = next(filereader, None)
            fileheader = [c.lower() for c in fileheader]
            if col != fileheader:  # assert
                LOG.error('Expected  %s to have columns: %s', raw, col)
                LOG.error('But Found %s to have columns: %s', raw, fileheader)
                raise AssertionError('Incomming data headers have changed.')

            for row in filereader:
                line_counter += 1
                if len(row) != len(col):
                    LOG.warning('Expected %i values but find %i in  row %i',
                                len(col), len(row), line_counter)
                    continue

                # (catalog_id, description, omim_number, sample_type,
                # cell_line_available, dna_in_stock, dna_ref, gender, age,
                # race, ethnicity, affected, karyotype, relprob, mutation,
                # gene, family_id, collection, url, cat_remark, pubmed_ids,
                # family_member, variant_id, dbsnp_id, species) = row

                # example:
                # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,
                #       ,Female,26 YR,Caucasian,,,,
                # parent,,,39,NIGMS Human Genetic Cell Repository,
                # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
                # 46;XX; clinically normal mother of a child with Hurler syndrome;
                #       proband not in Repository,,
                # 2,,18343,H**o sapiens

                catalog_id = row[col.index('catalog_id')].strip()

                if self.testMode and catalog_id not in self.test_lines:
                    # skip rows not in our test lines, when in test mode
                    continue

                # ###########    BUILD REQUIRED VARIABLES    ###########

                # Make the cell line ID
                cell_line_id = 'Coriell:' + catalog_id
                # Map the cell/sample type
                cell_type = self.resolve(row[col.index('sample_type')].strip())
                # on fail cell_type = self.globaltt['cell'] ?

                # Make a cell line label
                collection = row[col.index('collection')].strip()
                line_label = collection.partition(' ')[0] + '-' + catalog_id

                # Map the repository/collection
                repository = self.localtt[collection]

                # patients are uniquely identified by one of:
                # dbsnp id (which is == an individual haplotype)
                # family id + family member (if present) OR
                # probands are usually family member zero
                # cell line id
                # since some patients have >1 cell line derived from them,
                # we must make sure that the genotype is attached to
                # the patient, and can be inferred to the cell line
                # examples of repeated patients are:
                #   famid=1159, member=1; fam=152,member=1

                # Make the patient ID

                # make an anonymous patient
                patient_id = '_:person'
                fam_id = row[col.index('fam')].strip()
                fammember = row[col.index('fammember')].strip()
                if fam_id != '':
                    patient_id = '-'.join((patient_id, fam_id, fammember))
                else:
                    # make an anonymous patient
                    patient_id = '-'.join((patient_id, catalog_id))

                # properties of the individual patients:  sex, family id,
                # member/relproband, description descriptions are
                # really long and ugly SCREAMING text, so need to clean up
                # the control cases are so odd with this labeling scheme;
                # but we'll deal with it as-is for now.
                description = row[col.index('description')].strip()
                short_desc = (description.split(';')[0]).capitalize()

                gender = row[col.index('gender')].strip().lower()
                affected = row[col.index('affected')].strip()
                relprob = row[col.index('relprob')].strip()

                if affected == '':
                    affected = 'unspecified'
                elif affected in self.localtt:
                    affected = self.localtt[affected]
                else:
                    LOG.warning('Novel Affected status  %s at row: %i of %s',
                                affected, line_counter, raw)
                patient_label = ' '.join((affected, gender, relprob))
                if relprob == 'proband':
                    patient_label = ' '.join(
                        (patient_label.strip(), 'with', short_desc))
                else:
                    patient_label = ' '.join(
                        (patient_label.strip(), 'of proband with', short_desc))

                # #############    BUILD THE CELL LINE    #############

                # Adding the cell line as a typed individual.
                cell_line_reagent_id = self.globaltt['cell line']

                model.addIndividualToGraph(cell_line_id, line_label,
                                           cell_line_reagent_id)

                # add the equivalent id == dna_ref
                dna_ref = row[col.index('dna_ref')].strip()
                if dna_ref != '' and dna_ref != catalog_id:
                    equiv_cell_line = 'Coriell:' + dna_ref
                    # some of the equivalent ids are not defined
                    # in the source data; so add them
                    model.addIndividualToGraph(equiv_cell_line, None,
                                               cell_line_reagent_id)
                    model.addSameIndividual(cell_line_id, equiv_cell_line)

                # Cell line derives from patient
                geno.addDerivesFrom(cell_line_id, patient_id)
                geno.addDerivesFrom(cell_line_id, cell_type)

                # Cell line a member of repository
                family.addMember(repository, cell_line_id)

                cat_remark = row[col.index('cat_remark')].strip()

                if cat_remark != '':
                    model.addDescription(cell_line_id, cat_remark)

                # Cell age_at_sampling
                # TODO add the age nodes when modeled properly in #78
                # if (age != ''):
                # this would give a BNode that is an instance of Age.
                # but i don't know how to connect
                # the age node to the cell line? we need to ask @mbrush
                # age_id = '_'+re.sub('\s+','_',age)
                # gu.addIndividualToGraph(
                #   graph,age_id,age,self.globaltt['age'])
                # gu.addTriple(
                #   graph,age_id,self.globaltt['has measurement value'],age,
                #   True)

                # #############    BUILD THE PATIENT    #############

                # Add the patient ID as an individual.
                model.addPerson(patient_id, patient_label)
                # TODO map relationship to proband as a class
                # (what ontology?)

                # Add race of patient
                # FIXME: Adjust for subcategories based on ethnicity field
                # EDIT: There are 743 different entries for ethnicity...
                # Too many to map?
                # Add ethnicity as literal in addition to the mapped race?
                # Adjust the ethnicity txt (if using)
                # to initial capitalization to remove ALLCAPS

                # TODO race should go into the individual's background
                # and abstracted out to the Genotype class punting for now.
                # if race != '':
                #    mapped_race = self.resolve(race)
                #    if mapped_race is not None:
                #        gu.addTriple(
                #           g,patient_id,self.globaltt['race'], mapped_race)
                #        model.addSubClass(
                #           mapped_race,self.globaltt['ethnic_group'])

                # #############    BUILD THE FAMILY    #############

                # Add triples for family_id, if present.
                if fam_id != '':
                    family_comp_id = 'CoriellFamily:' + fam_id

                    family_label = ' '.join(
                        ('Family of proband with', short_desc))

                    # Add the family ID as a named individual
                    model.addIndividualToGraph(family_comp_id, family_label,
                                               self.globaltt['family'])

                    # Add the patient as a member of the family
                    family.addMemberOf(patient_id, family_comp_id)

                # #############    BUILD THE GENOTYPE   #############

                # the important things to pay attention to here are:
                # karyotype = chr rearrangements  (somatic?)
                # mutation = protein-level mutation as a label,
                # often from omim
                # gene = gene symbol - TODO get id
                # variant_id = omim variant ids (; delimited)
                # dbsnp_id = snp individual ids = full genotype?

                # note GM00633 is a good example of chromosomal variation
                # - do we have enough to capture this?
                # GM00325 has both abnormal karyotype and variation

                # make an assumption that if the taxon is blank,
                # that it is human!
                species = row[col.index('species')].strip()
                if species is None or species == '':
                    species = 'H**o sapiens'
                taxon = self.resolve(species)

                # if there's a dbSNP id,
                # this is actually the individual's genotype
                genotype_id = None
                genotype_label = None

                dbsnp_id = row[col.index('dbsnp_id')].strip()
                if dbsnp_id != '':
                    genotype_id = 'dbSNPIndividual:' + dbsnp_id

                omim_map = {}
                gvc_id = None

                # some of the karyotypes are encoded
                # with terrible hidden codes. remove them here
                # i've seen a <98> character
                karyotype = row[col.index('karyotype')].strip()
                karyotype = diputil.remove_control_characters(karyotype)
                karyotype_id = None
                if karyotype.strip() != '':
                    karyotype_id = '_:' + re.sub('MONARCH:', '',
                                                 self.make_id(karyotype))
                    # add karyotype as karyotype_variation_complement
                    model.addIndividualToGraph(
                        karyotype_id, karyotype,
                        self.globaltt['karyotype_variation_complement'])
                    # TODO break down the karyotype into parts
                    # and map into GENO. depends on #77

                    # place the karyotype in a location(s).
                    karyo_chrs = self._get_affected_chromosomes_from_karyotype(
                        karyotype)
                    for chrom in karyo_chrs:
                        chr_id = makeChromID(chrom, taxon, 'CHR')
                        # add an anonymous sequence feature,
                        # each located on chr
                        karyotype_feature_id = '-'.join((karyotype_id, chrom))
                        karyotype_feature_label = \
                            'some karyotype alteration on chr' + str(chrom)
                        feat = Feature(graph, karyotype_feature_id,
                                       karyotype_feature_label,
                                       self.globaltt['sequence_alteration'])
                        feat.addFeatureStartLocation(None, chr_id)
                        feat.addFeatureToGraph()
                        geno.addParts(karyotype_feature_id, karyotype_id,
                                      self.globaltt['has_variant_part'])

                gene = row[col.index('gene')].strip()
                mutation = row[col.index('mutation')].strip()
                if gene != '':
                    vl = gene + '(' + mutation + ')'

                # fix the variant_id so it's always in the same order
                variant_id = row[col.index('variant_id')].strip()
                vids = variant_id.split(';')
                variant_id = ';'.join(sorted(list(set(vids))))

                if karyotype.strip() != '' and not self._is_normal_karyotype(
                        karyotype):

                    gvc_id = karyotype_id
                    if variant_id != '':
                        gvc_id = '_:' + variant_id.replace(';', '-') + '-' \
                            + re.sub(r'\w*:', '', karyotype_id)
                    if mutation.strip() != '':
                        gvc_label = '; '.join((vl, karyotype))
                    else:
                        gvc_label = karyotype
                elif variant_id.strip() != '':
                    gvc_id = '_:' + variant_id.replace(';', '-')
                    gvc_label = vl
                else:
                    # wildtype?
                    pass

                # add the karyotype to the gvc.
                # use reference if normal karyotype
                karyo_rel = self.globaltt['has_variant_part']
                if self._is_normal_karyotype(karyotype):
                    karyo_rel = self.globaltt['has_reference_part']
                if karyotype_id is not None \
                        and not self._is_normal_karyotype(karyotype) \
                        and gvc_id is not None and karyotype_id != gvc_id:
                    geno.addParts(karyotype_id, gvc_id, karyo_rel)

                if variant_id.strip() != '':
                    # split the variants & add them as part of the genotype
                    # we don't necessarily know their zygosity,
                    # just that they are part of the genotype variant ids
                    # are from OMIM, so prefix as such we assume that the
                    # sequence alts will be defined in OMIM not here
                    # TODO sort the variant_id list, if the omim prefix is
                    # the same, then assume it's the locus make a hashmap
                    # of the omim id to variant id list;
                    # then build the genotype hashmap is also useful for
                    # removing the "genes" from the list of "phenotypes"

                    # will hold gene/locus id to variant list
                    omim_map = {}

                    locus_num = None
                    for var in variant_id.split(';'):
                        # handle omim-style and odd var ids
                        # like 610661.p.R401X
                        mch = re.match(r'(\d+)\.+(.*)', var.strip())
                        if mch is not None and len(mch.groups()) == 2:
                            (locus_num, var_num) = mch.groups()

                        if locus_num is not None and locus_num not in omim_map:
                            omim_map[locus_num] = [var_num]
                        else:
                            omim_map[locus_num] += [var_num]

                    for omim in omim_map:
                        # gene_id = 'OMIM:' + omim  # TODO unused
                        vslc_id = '_:' + '-'.join(
                            [omim + '.' + a for a in omim_map.get(omim)])
                        vslc_label = vl
                        # we don't really know the zygosity of
                        # the alleles at all.
                        # so the vslcs are just a pot of them
                        model.addIndividualToGraph(
                            vslc_id, vslc_label,
                            self.globaltt['variant single locus complement'])
                        for var in omim_map.get(omim):
                            # this is actually a sequence alt
                            allele1_id = 'OMIM:' + omim + '.' + var
                            geno.addSequenceAlteration(allele1_id, None)

                            # assume that the sa -> var_loc -> gene
                            # is taken care of in OMIM
                            geno.addPartsToVSLC(
                                vslc_id, allele1_id, None,
                                self.globaltt['indeterminate'],
                                self.globaltt['has_variant_part'])

                        if vslc_id != gvc_id:
                            geno.addVSLCtoParent(vslc_id, gvc_id)

                if affected == 'unaffected':
                    # let's just say that this person is wildtype
                    model.addType(patient_id, self.globaltt['wildtype'])
                elif genotype_id is None:
                    # make an anonymous genotype id (aka blank node)
                    genotype_id = '_:geno' + catalog_id.strip()

                # add the gvc
                if gvc_id is not None:
                    model.addIndividualToGraph(
                        gvc_id, gvc_label,
                        self.globaltt['genomic_variation_complement'])

                    # add the gvc to the genotype
                    if genotype_id is not None:
                        if affected == 'unaffected':
                            rel = self.globaltt['has_reference_part']
                        else:
                            rel = self.globaltt['has_variant_part']
                        geno.addParts(gvc_id, genotype_id, rel)

                    if karyotype_id is not None \
                            and self._is_normal_karyotype(karyotype):
                        if gvc_label is not None and gvc_label != '':
                            genotype_label = '; '.join((gvc_label, karyotype))
                        elif karyotype is not None:
                            genotype_label = karyotype
                        if genotype_id is None:
                            genotype_id = karyotype_id
                        else:
                            geno.addParts(karyotype_id, genotype_id,
                                          self.globaltt['has_reference_part'])
                    else:
                        genotype_label = gvc_label
                        # use the catalog id as the background
                    genotype_label += ' [' + catalog_id.strip() + ']'

                if genotype_id is not None and gvc_id is not None:
                    # only add the genotype if it has some parts
                    geno.addGenotype(genotype_id, genotype_label,
                                     self.globaltt['intrinsic_genotype'])
                    geno.addTaxon(taxon, genotype_id)
                    # add that the patient has the genotype
                    # TODO check if the genotype belongs to
                    # the cell line or to the patient
                    graph.addTriple(patient_id, self.globaltt['has_genotype'],
                                    genotype_id)
                else:
                    geno.addTaxon(taxon, patient_id)

                # TODO: Add sex/gender  (as part of the karyotype?)
                # = row[col.index('')].strip()
                # #############    DEAL WITH THE DISEASES   #############
                omim_num = row[col.index('omim_num')].strip()

                # we associate the disease to the patient
                if affected == 'affected' and omim_num != '':
                    for d in omim_num.split(';'):
                        if d is not None and d != '':
                            # if the omim number is in omim_map,
                            # then it is a gene not a pheno

                            # TEC - another place to use the mimTitle omim
                            # classifier omia & genereviews are using

                            if d not in omim_map:
                                disease_id = 'OMIM:' + d.strip()
                                # assume the label is taken care of in OMIM
                                model.addClassToGraph(disease_id, None)

                                # add the association:
                                #   the patient has the disease
                                assoc = G2PAssoc(graph, self.name, patient_id,
                                                 disease_id)
                                assoc.add_association_to_graph()

                                # this line is a model of this disease
                                # TODO abstract out model into
                                # it's own association class?
                                graph.addTriple(cell_line_id,
                                                self.globaltt['is model of'],
                                                disease_id)
                            else:
                                LOG.info('drop gene %s from disease list', d)

                # #############    ADD PUBLICATIONS   #############
                pubmed_ids = row[col.index('pubmed_ids')].strip()
                if pubmed_ids != '':
                    for s in pubmed_ids.split(';'):
                        pubmed_id = 'PMID:' + s.strip()
                        ref = Reference(graph, pubmed_id)
                        ref.setType(self.globaltt['journal article'])
                        ref.addRefToGraph()
                        graph.addTriple(pubmed_id, self.globaltt['mentions'],
                                        cell_line_id)

                if not self.testMode and (limit is not None
                                          and line_counter > limit):
                    break
        return
Esempio n. 25
0
    def process_allele_phenotype(self, limit=None):
        """
        This file compactly lists variant to phenotype associations,
        such that in a single row, there may be >1 variant listed
        per phenotype and paper.  This indicates that each variant is
        individually assocated with the given phenotype,
        as listed in 1+ papers.
        (Not that the combination of variants is producing the phenotype.)
        :param limit:
        :return:

        """
        src_key = 'allele_pheno'
        raw = '/'.join((self.rawdir, self.files[src_key]['file']))
        col = self.files[src_key]['columns']

        graph = self.graph
        model = Model(self.graph)

        LOG.info("Processing Allele phenotype associations")
        geno = Genotype(graph)
        with open(raw, 'r') as csvfile:
            reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            row = next(reader)
            if row[0] != '!gaf-version: 2.0':
                LOG.error('Not a vlaid gaf v2.0 formatted file: %s', raw)
                # raise
            for row in reader:
                if row[0][0] == '!':
                    continue
                # db = row[col.index('DB')]
                gene_num = row[col.index('DB Object ID')]
                # gene_symbol = row[col.index('DB Object Symbol')]
                is_not = row[col.index('Qualifier')]
                phenotype_id = row[col.index('GO ID')]
                ref = row[col.index('DB:Reference (|DB:Reference)')].strip()
                eco_symbol = row[col.index('Evidence Code')]
                with_or_from = row[col.index('With (or) From')]
                # aspect = row[col.index('Aspect')]
                # gene_name = row[col.index('DB Object Name')]
                # gene_synonym = row[col.index('DB Object Synonym (|Synonym)')]
                # gene_class = row[col.index('DB Object Type')]
                # taxon = row[col.index('Taxon(|taxon)')]
                # date = row[col.index('Date')]
                # assigned_by = row[col.index('Assigned By')]
                # blank = row[col.index('Annotation Extension')]
                # blank2 = row[col.index('Gene Product Form ID')]

                # TODO add NOT phenotypes
                if is_not == 'NOT':
                    continue

                eco_symbol = eco_symbol.strip()
                eco_curie = None
                if eco_symbol.strip() != '' and eco_symbol in self.gaf_eco:
                    eco_curie = self.gaf_eco[eco_symbol]
                else:
                    LOG.warning(
                        'Evidence code %s is not found in the (gaf) gaf_eco',
                        eco_symbol)

                # according to the GOA spec, persons are not allowed to be
                # in the reference column, therefore they the variant and
                # persons are swapped between the reference and with column.
                # we unswitch them here.
                temp_var = temp_ref = None
                if re.search(r'WBVar|WBRNAi', ref):
                    temp_var = ref
                    # move the paper from the with column into the ref
                if re.search(r'WBPerson', with_or_from):
                    temp_ref = with_or_from
                if temp_var is not None or temp_ref is not None:
                    with_or_from = temp_var
                    ref = temp_ref

                allele_list = re.split(r'\|', with_or_from)
                if len(allele_list) == 0:
                    LOG.error(
                        "Missing alleles from phenotype assoc at line %d",
                        reader.line_num)
                    continue
                else:
                    for allele in allele_list:
                        allele_num = re.sub(r'WB:', '', allele.strip())
                        allele_id = 'WormBase:' + allele_num
                        gene_id = 'WormBase:' + gene_num

                        if re.search(r'WBRNAi', allele_id):

                            # @kshefchek - removing this blank node
                            # in favor of simpler modeling
                            # make the WormBase:WBRNAi* id
                            # a self.globaltt['reagent_targeted_gene'], and attach
                            # phenotype to this ID

                            # Previous model - make a bnode reagent-targeted gene,
                            # & annotate that instead of the RNAi item directly
                            # rnai_num = re.sub(r'WormBase:', '', allele_id)
                            # rnai_id = allele_id
                            # rtg_id = self.make_reagent_targeted_gene_id(
                            #    gene_num, rnai_num)
                            # geno.addReagentTargetedGene(
                            #    rnai_id, 'WormBase:' + gene_num, rtg_id)
                            # allele_id = rtg_id

                            # Could type the IRI as both the reagant and reagant
                            # targeted gene but not sure if this needed
                            # geno.addGeneTargetingReagent(
                            #   allele_id, None, self.globaltt['RNAi_reagent'], gene_id)

                            model.addIndividualToGraph(
                                allele_id, None,
                                self.globaltt['reagent_targeted_gene'])

                            self.graph.addTriple(
                                allele_id,
                                self.globaltt['is_expression_variant_of'],
                                gene_id)

                        elif re.search(r'WBVar', allele_id):
                            # this may become deprecated by using wormmine
                            # make the allele to gene relationship
                            # the WBVars are really sequence alterations
                            # the public name will come from elsewhere

                            # @kshefchek - removing this blank node
                            # in favor of simpler modeling, treat variant
                            # like an allele
                            # vl_id = '_:'+'-'.join((gene_num, allele_num))
                            # geno.addSequenceAlterationToVariantLocus(
                            #    allele_id, vl_id)
                            # geno.addAlleleOfGene(vl_id, gene_id)

                            geno.addSequenceAlteration(allele_id, None)
                            geno.addAlleleOfGene(allele_id, gene_id)
                        else:
                            LOG.warning(
                                "Some kind of allele I don't recognize: %s",
                                allele_num)
                            continue
                        assoc = G2PAssoc(graph, self.name, allele_id,
                                         phenotype_id)

                        if eco_curie is not None:
                            assoc.add_evidence(eco_curie)

                        if ref is not None and ref != '':
                            ref = re.sub(r'(WB:|WB_REF:)', 'WormBase:', ref)
                            reference = Reference(graph, ref)
                            if re.search(r'Person', ref):
                                reference.setType(self.globaltt['person'])
                                assoc.add_evidence(self.globaltt[
                                    'inference from background scientific knowledge']
                                                   )
                            reference.addRefToGraph()
                            assoc.add_source(ref)

                        assoc.add_association_to_graph()

                        # finish looping through all alleles

                if limit is not None and reader.line_num > limit:
                    break
Esempio n. 26
0
    def _process_allele_phenotype(self, limit):
        """
        Make allele to phenotype associations using derived_pheno_class
        and derived_pheno_manifest cvterm in the flybase db, an example entry is:

        FBal0257663    @FBcv0000351:lethal@ | @FBcv0000308:female limited@,
                       with @FBal0130657:Scer\GAL4<up>dome-PG14</up>@

        The first term is the phenotype, and all follow up terms are qualifiers,
        self.globaltt['has_qualifier'])

        Our previous approach was to use the genotype id associated with
        FBal0257663/FBal0130657 , however, this required us to create blank
        nodes and was considered unnecessarily granular

        Note that sometimes identifiers do not exist for a term, eg
        @:heat sensitive | tetracycline conditional@

        derived_pheno_class - FBcv terms, these are phenotypes
        derived_pheno_manifest -  Anatomy terms FBbt, we currently
        make phenotype IRI equivalents that end up in UPheno, but
        this is being developed and updated, see
        https://github.com/monarch-initiative/dipper/issues/770

        Adds triples to self.graph

        :param limit: number of rows to process
        :return: None

        """
        model = Model(self.graph)
        src_key = 'allele_phenotype'
        raw = '/'.join((self.rawdir, self.queries[src_key]['file']))
        LOG.info("processing allele phenotype associations")
        col = self.queries[src_key]['columns']

        transgenic_alleles = self._get_foreign_transgenic_alleles()

        # flybase terms - terms we prefix with FlyBase:
        fly_prefixes = ['FBal', 'FBti', 'FBab', 'FBba', 'FBtp']

        # a alphanumeric id followed by a colon then
        # any character but a colon bordered by @s
        term_regex = re.compile(r'@([\w]*):([^:@]*)@')
        id_regex = re.compile(r'([a-zA-Z]+)(\d+)')

        with open(raw, 'r') as tsvfile:
            reader = csv.reader(tsvfile, delimiter='\t')
            row = next(reader)  # headers
            self.check_fileheader(col, row)

            for row in reader:
                allele_id = row[col.index('allele_id')]
                pheno_desc = row[col.index('pheno_desc')]
                pheno_type = row[col.index('pheno_type')]
                pub_id = row[col.index('pub_id')]
                pub_title = row[col.index('pub_title')]
                pmid_id = row[col.index('pmid_id')]

                # Don't get phenotypes for transgenic alleles
                if allele_id in transgenic_alleles:
                    continue

                allele_curie = 'FlyBase:' + allele_id

                terms = re.findall(term_regex, pheno_desc)

                if not terms:
                    LOG.warning('Could not @terms@ in description: %s',
                                pheno_desc)
                    continue

                term_ids, term_labels = zip(*terms)
                id_match = re.match(id_regex, term_ids[0])

                if id_match is not None:
                    prefix, reference = id_match.group(1, 2)
                else:
                    raise ValueError("Could not parse id {}".format(
                        term_ids[0]))

                # derived_pheno_class should all start with a FBcv term
                if pheno_type == 'derived_pheno_class' and prefix != 'FBcv':
                    LOG.warning(
                        'derived_pheno_class does not '
                        'start with FBcv: %s', pheno_desc)
                    continue

                # Create phenotype curie
                if pheno_type == 'derived_pheno_class':
                    phenotype_curie = prefix + ':' + reference
                elif pheno_type == 'derived_pheno_manifest':
                    # These are not proper FBcv phenotype terms
                    # but rather anatomical entities, go terms, sometimes free text
                    # skip parsing for now
                    continue
                else:
                    raise ValueError(
                        "Unexpected phenotype type: {}".format(pheno_type))

                if pmid_id:
                    ref_curie = 'PMID:' + pmid_id
                else:
                    ref_curie = 'FlyBase:' + pub_id
                    reference = Reference(self.graph, ref_curie)
                    reference.setTitle(pub_title)
                    reference.addRefToGraph()

                assoc = G2PAssoc(self.graph, self.name, allele_curie,
                                 phenotype_curie,
                                 self.globaltt['has phenotype'])
                assoc.add_source(ref_curie)
                # Associations need to be disambiguated via their qualifiers
                # see http://flybase.org/reports/FBal0207398 as an example
                assoc.set_association_id(
                    assoc.make_association_id(self.name, allele_curie,
                                              self.globaltt['has phenotype'],
                                              phenotype_curie, term_ids[1:]))
                assoc.add_association_to_graph()
                assoc_id = assoc.get_association_id()

                # add the rest as qualifiers
                for term in term_ids[1:]:
                    if term:
                        # FBal, GO, FBti, FBab ...
                        id_match = re.match(id_regex, term)
                        if id_match is not None:
                            prefix, reference = id_match.group(1, 2)
                            if prefix in fly_prefixes:
                                term_curie = 'FlyBase:' + term
                            else:
                                term_curie = prefix + ':' + reference
                        else:
                            raise ValueError(
                                "Could not parse id {}".format(term))

                    else:
                        # There is not an id for a term,
                        # eg @:heat sensitive | tetracycline conditional@
                        continue

                    self.graph.addTriple(assoc_id,
                                         self.globaltt['has_qualifier'],
                                         term_curie)

                if limit is not None and reader.line_num > limit:
                    break
Esempio n. 27
0
    def _get_gene2pubmed(self, limit):
        """
        Loops through the gene2pubmed file and adds a simple triple to say
        that a given publication is_about a gene.
        Publications are added as NamedIndividuals.

        These are filtered on the taxon.

        :param limit:
        :return:

        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)
        logger.info("Processing Gene records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['gene2pubmed']['file']))
        logger.info("FILE: %s", myfile)
        assoc_counter = 0
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match(r'^#', line):
                    continue
                (tax_num, gene_num, pubmed_num) = line.split('\t')

                # ## set filter=None in init if you don't want to have a filter
                # if self.filter is not None:
                #     if ((self.filter == 'taxids' and \
                #          (int(tax_num) not in self.tax_ids))
                #        or (self.filter == 'geneids' and \
                #            (int(gene_num) not in self.gene_ids))):
                #         continue
                # #### end filter

                if self.testMode and int(gene_num) not in self.gene_ids:
                    continue

                if not self.testMode and int(tax_num) not in self.tax_ids:
                    continue

                if gene_num == '-' or pubmed_num == '-':
                    continue

                line_counter += 1
                gene_id = ':'.join(('NCBIGene', gene_num))
                pubmed_id = ':'.join(('PMID', pubmed_num))

                if self.class_or_indiv.get(gene_id) == 'C':
                    model.addClassToGraph(gene_id, None)
                else:
                    model.addIndividualToGraph(gene_id, None)
                # add the publication as a NamedIndividual
                # add type publication
                model.addIndividualToGraph(pubmed_id, None, None)
                reference = Reference(g, pubmed_id,
                                      Reference.ref_types['journal_article'])
                reference.addRefToGraph()
                g.addTriple(pubmed_id, model.object_properties['is_about'],
                            gene_id)
                assoc_counter += 1
                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        logger.info("Processed %d pub-gene associations", assoc_counter)

        return
Esempio n. 28
0
    def _process_nlx_157874_1_view(self, raw, limit=None):
        """
        This table contains the Elements of Morphology data .
        Note that foaf:depiction is inverse of foaf:depicts relationship.

        Since it is bad form to have two definitions,
        we concatenate the two into one string.

        Turtle:
            <eom id> a owl:Class
                rdf:label Literal(eom label)
                oboInOwl:has_related_synonym Literal(synonym list)
                IAO:definition Literal(objective_def. subjective def)
                foaf:depiction Literal(small_image_url),
                               Literal(large_image_url)
                foaf:page Literal(page_url)
                rdfs:comment Literal(long commented text)

        TEC_note: URL are not literals.


        :param raw:
        :param limit:
        :return:
        """

        src_key = 'tables'
        model = Model(self.graph)
        col = self.resources[src_key]['columns']
        with open(raw, 'r') as rawread:
            reader = csv.reader(rawread, delimiter='\t', quotechar='\"')
            row = next(reader)
            if not self.check_fileheader(col, row):
                pass

            for row in reader:
                # head -1 dvp.pr_nlx_157874_1|tr '\t' '\n'|
                # sed "s|\(.*\)|# \1 = row[col.index('\1')]|g"

                morphology_term_id = row[col.index(
                    'morphology_term_id')].strip()
                # morphology_term_num = row[col.index('morphology_term_num')]
                morphology_term_label = row[col.index(
                    'morphology_term_label')].strip()
                morphology_term_url = row[col.index(
                    'morphology_term_url')].strip()
                # terminology_category_label = row[
                #   col.index('terminology_category_label')]
                # terminology_category_url = row[col.index('terminology_category_url')]
                # subcategory = row[col.index('subcategory')]
                objective_definition = row[col.index(
                    'objective_definition')].strip()
                subjective_definition = row[col.index(
                    'subjective_definition')].strip()
                comments = row[col.index('comments')].strip()
                synonyms = row[col.index('synonyms')].strip()
                replaces = row[col.index('replaces')].strip()
                small_figure_url = row[col.index('small_figure_url')].strip()
                large_figure_url = row[col.index('large_figure_url')].strip()
                # e_uid = row[col.index('e_uid')]
                # v_uid = row[col.index('v_uid')]
                # v_uuid = row[col.index('v_uuid')]
                # v_lastmodified = row[col.index('v_lastmodified')]
                # v_status = row[col.index('v_status')]
                # v_lastmodified_epoch = row[col.index('v_lastmodified_epoch')]

                # Add morphology term to graph as a class
                # with label, type, and description.
                model.addClassToGraph(morphology_term_id,
                                      morphology_term_label)

                # Assemble the description text

                if subjective_definition != '' and not (re.match(
                        r'.+\.$', subjective_definition)):
                    # add a trailing period.
                    subjective_definition = subjective_definition + '.'
                if objective_definition != '' and not (re.match(
                        r'.+\.$', objective_definition)):
                    # add a trailing period.
                    objective_definition = objective_definition + '.'

                definition = '  '.join(
                    (objective_definition, subjective_definition))

                model.addDefinition(morphology_term_id, definition)

                # <term id> FOAF:depicted_by literal url
                # <url> type foaf:depiction

                # do we want both images?
                # morphology_term_id has depiction small_figure_url
                if small_figure_url != '':
                    model.addDepiction(morphology_term_id, small_figure_url)

                # morphology_term_id has depiction large_figure_url
                if large_figure_url != '':
                    model.addDepiction(morphology_term_id, large_figure_url)

                # morphology_term_id has comment comments
                if comments != '':
                    model.addComment(morphology_term_id, comments)

                for syn in synonyms.split(';'):
                    model.addSynonym(morphology_term_id, syn.strip(),
                                     self.globaltt['has_exact_synonym'])

                # morphology_term_id has_related_synonym replaces (; delimited)
                if replaces not in ['', synonyms]:
                    for syn in replaces.split(';'):
                        model.addSynonym(morphology_term_id, syn.strip(),
                                         self.globaltt['has_related_synonym'])

                # <morphology_term_id> <foaf:page> morphology_term_url
                if morphology_term_id is not None:
                    reference = Reference(self.graph, morphology_term_id,
                                          self.globaltt['web page'])

                    # TEC 201905:
                    # Not so sure we need explicit   <eom_uri> <webpage> <eom_url>.
                    # since <eom_uri> IS the <eom_url>.

                    reference.addPage(morphology_term_id, morphology_term_url)

                if limit is not None and reader.line_num > limit:
                    break
Esempio n. 29
0
    def process_allele_phenotype(self, limit=None):
        """
        This file compactly lists variant to phenotype associations,
        such that in a single row, there may be >1 variant listed
        per phenotype and paper.  This indicates that each variant is
        individually assocated with the given phenotype,
        as listed in 1+ papers.
        (Not that the combination of variants is producing the phenotype.)
        :param limit:
        :return:

        """
        raw = '/'.join((self.rawdir, self.files['allele_pheno']['file']))

        graph = self.graph
        model = Model(self.graph)

        LOG.info("Processing Allele phenotype associations")
        line_counter = 0
        geno = Genotype(graph)
        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                if re.match(r'!', ''.join(row)):  # header
                    continue
                line_counter += 1
                (db, gene_num, gene_symbol, is_not, phenotype_id, ref,
                 eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
                 gene_class, taxon, date, assigned_by, blank, blank2) = row

                # TODO add NOT phenotypes
                if is_not == 'NOT':
                    continue

                eco_symbol = eco_symbol.strip()
                eco_id = None
                if eco_symbol.strip() != '':
                    eco_id = self.resolve(eco_symbol)

                # according to the GOA spec, persons are not allowed to be
                # in the reference column, therefore they the variant and
                # persons are swapped between the reference and with column.
                # we unswitch them here.
                temp_var = temp_ref = None
                if re.search(r'WBVar|WBRNAi', ref):
                    temp_var = ref
                    # move the paper from the with column into the ref
                if re.search(r'WBPerson', with_or_from):
                    temp_ref = with_or_from
                if temp_var is not None or temp_ref is not None:
                    with_or_from = temp_var
                    ref = temp_ref

                allele_list = re.split(r'\|', with_or_from)
                if len(allele_list) == 0:
                    LOG.error(
                        "Missing alleles from phenotype assoc at line %d",
                        line_counter)
                    continue
                else:
                    for allele in allele_list:
                        allele_num = re.sub(r'WB:', '', allele.strip())
                        allele_id = 'WormBase:' + allele_num
                        gene_id = 'WormBase:' + gene_num

                        if re.search(r'WBRNAi', allele_id):

                            # @kshefchek - removing this blank node
                            # in favor of simpler modeling
                            # make the WormBase:WBRNAi* id
                            # a self.globaltt['reagent_targeted_gene'], and attach
                            # phenotype to this ID

                            # Previous model - make a bnode reagent-targeted gene,
                            # & annotate that instead of the RNAi item directly
                            #rnai_num = re.sub(r'WormBase:', '', allele_id)
                            #rnai_id = allele_id
                            #rtg_id = self.make_reagent_targeted_gene_id(
                            #    gene_num, rnai_num)
                            #geno.addReagentTargetedGene(
                            #    rnai_id, 'WormBase:' + gene_num, rtg_id)
                            # allele_id = rtg_id

                            # Could type the IRI as both the reagant and reagant
                            # targeted gene but not sure if this needed
                            # geno.addGeneTargetingReagent(
                            #   allele_id, None, self.globaltt['RNAi_reagent'], gene_id)

                            model.addIndividualToGraph(
                                allele_id, None,
                                self.globaltt['reagent_targeted_gene'])

                            self.graph.addTriple(
                                allele_id, self.globaltt['is_expression_variant_of'],
                                gene_id)

                        elif re.search(r'WBVar', allele_id):
                            # this may become deprecated by using wormmine
                            # make the allele to gene relationship
                            # the WBVars are really sequence alterations
                            # the public name will come from elsewhere

                            # @kshefchek - removing this blank node
                            # in favor of simpler modeling, treat variant
                            # like an allele
                            #vl_id = '_:'+'-'.join((gene_num, allele_num))
                            #geno.addSequenceAlterationToVariantLocus(
                            #    allele_id, vl_id)
                            #geno.addAlleleOfGene(vl_id, gene_id)

                            geno.addSequenceAlteration(allele_id, None)
                            geno.addAlleleOfGene(allele_id, gene_id)
                        else:
                            LOG.warning(
                                "Some kind of allele I don't recognize: %s", allele_num)
                            continue
                        assoc = G2PAssoc(graph, self.name, allele_id, phenotype_id)

                        if eco_id is not None:
                            assoc.add_evidence(eco_id)

                        if ref is not None and ref != '':
                            ref = re.sub(r'(WB:|WB_REF:)', 'WormBase:', ref)
                            reference = Reference(graph, ref)
                            if re.search(r'Person', ref):
                                reference.setType(self.globaltt['person'])
                                assoc.add_evidence(
                                    self.globaltt[
                                        'inference from background scientific knowledge'
                                    ])
                            reference.addRefToGraph()
                            assoc.add_source(ref)

                        assoc.add_association_to_graph()

                        # finish looping through all alleles

                if limit is not None and line_counter > limit:
                    break

        return
Esempio n. 30
0
    def _get_var_citations(self, limit):

        # Generated weekly, the first of the week
        # A tab-delimited report of citations associated with data in ClinVar,
        # connected to the AlleleID, the VariationID, and either rs# from dbSNP
        # or nsv in dbVar.
        #
        # AlleleID          int value  (xpath //Measure/@ID )
        # VariationID       ID ClinVar uses to anchor default display.
        #                   (xpath  //MeasureSet/@ID)
        # rs			    rs identifier from dbSNP
        # nsv				nsv identifier from dbVar
        # citation_source	The source of the citation, either PubMed,
        #                   PubMedCentral, or the NCBI Bookshelf
        # citation_id		The identifier used by that source

        logger.info("Processing Citations for variants")
        line_counter = 0
        myfile = \
            '/'.join((self.rawdir, self.files['variant_citations']['file']))
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)

        with open(myfile, 'r', encoding="utf8") as f:
            filereader = csv.reader(f, delimiter='\t', quotechar='\"')

            for line in filereader:
                # skip comments
                line = line
                if re.match(r'^#', line[0]):
                    continue
                (allele_num, variant_num, rs_num, nsv_num, citation_source,
                 citation_id) = line

                line_counter += 1

                if self.testMode:
                    if int(variant_num) not in self.variant_ids:
                        continue

                if citation_id.strip() == '':
                    logger.info(
                        "Skipping blank citation for ClinVarVariant:%s",
                        str(variant_num))
                    continue

                # the citation for a variant is made to some kind of
                # combination of the ids here.
                # but i'm not sure which, we don't know what the
                # citation is for exactly, other than the variant.
                # so use mentions

                var_id = 'ClinVarVariant:' + variant_num

                # citation source: PubMed | PubMedCentral | citation_source
                # citation id:
                # format the citation id:
                ref_id = None
                if citation_source == 'PubMed':
                    ref_id = 'PMID:' + str(citation_id.replace(" ", ""))
                    model.makeLeader(ref_id)
                elif citation_source == 'PubMedCentral':
                    ref_id = 'PMCID:' + str(citation_id)
                if ref_id is not None:
                    r = Reference(self.graph, ref_id,
                                  Reference.ref_types['journal_article'])
                    r.addRefToGraph()
                    g.addTriple(ref_id, self.properties['is_about'], var_id)

                if not self.testMode \
                        and (limit is not None and line_counter > limit):
                    break

        logger.info("Finished processing citations for variants")

        return