Exemple #1
0
    def _map_eom_terms(self, raw, limit=None):
        """
        This table contains the HP ID mappings from the local tsv file.
        Triples:
            <eom id> owl:equivalentClass <hp id>
        :param raw:
        :param limit:
        :return:
        """

        model = Model(self.graph)
        line_counter = 0
        with open(raw, 'r') as f1:
            f1.readline()  # read the header row; skip
            for line in f1:
                line_counter += 1

                (morphology_term_id, morphology_term_label, hp_id, hp_label,
                 notes) = line.split('\t')

                # Sub out the underscores for colons.
                hp_id = re.sub('_', ':', hp_id)
                if re.match(".*HP:.*", hp_id):
                    # add the HP term as a class
                    model.addClassToGraph(hp_id, None)
                    # Add the HP ID as an equivalent class
                    model.addEquivalentClass(morphology_term_id, hp_id)
                else:
                    logger.warning('No matching HP term for %s',
                                   morphology_term_label)

                if limit is not None and line_counter > limit:
                    break

        return
Exemple #2
0
    def _map_eom_terms(self, raw, limit=None):
        """
        This table contains the HP ID mappings from the local tsv file.
        Triples:
            <eom id> owl:equivalentClass <hp id>
        :param raw:
        :param limit:
        :return:
        """

        model = Model(self.graph)
        line_counter = 0
        with open(raw, 'r') as f1:
            f1.readline()  # read the header row; skip
            for line in f1:
                line_counter += 1
                row = line.split('\t')
                (
                    morphology_term_id, morphology_term_label, hp_id, hp_label,
                    notes) = row

                # Sub out the underscores for colons.
                hp_id = re.sub('_', ':', hp_id)
                if re.match(".*HP:.*", hp_id):
                    # add the HP term as a class
                    model.addClassToGraph(hp_id, None)
                    # Add the HP ID as an equivalent class
                    model.addEquivalentClass(morphology_term_id, hp_id)
                else:
                    LOG.warning('No matching HP term for %s', morphology_term_label)

                if limit is not None and line_counter > limit:
                    break

        return
    def _add_gene_equivalencies(self, xrefs, gene_id, taxon):
        """
        Add equivalentClass and sameAs relationships

        Uses external resource map located in
        /resources/clique_leader.yaml to determine
        if an NCBITaxon ID space is a clique leader
        """

        clique_map = self.open_and_parse_yaml(self.resources['clique_leader'])

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        filter_out = ['Vega', 'IMGT/GENE-DB', 'Araport']

        # deal with the dbxrefs
        # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696

        for dbxref in xrefs.strip().split('|'):
            prefix = ':'.join(dbxref.split(':')[:-1]).strip()
            if prefix in self.localtt:
                prefix = self.localtt[prefix]
            dbxref_curie = ':'.join((prefix, dbxref.split(':')[-1]))

            if dbxref_curie is not None and prefix != '':
                if prefix == 'HPRD':  # proteins are not == genes.
                    model.addTriple(
                        gene_id, self.globaltt['has gene product'], dbxref_curie)
                    continue
                    # skip some of these for now based on curie prefix
                if prefix in filter_out:
                    continue

                if prefix == 'ENSEMBL':
                    model.addXref(gene_id, dbxref_curie)
                if prefix == 'OMIM':
                    if dbxref_curie in self.omim_replaced:
                        repl = self.omim_replaced[dbxref_curie]
                        for omim in repl:
                            if omim in self.omim_type and \
                                    self.omim_type[omim] == self.globaltt['gene']:
                                dbxref_curie = omim
                    if dbxref_curie in self.omim_type and \
                            self.omim_type[dbxref_curie] != self.globaltt['gene']:
                        continue
                try:
                    if self.class_or_indiv.get(gene_id) == 'C':
                        model.addEquivalentClass(gene_id, dbxref_curie)
                        if taxon in clique_map:
                            if clique_map[taxon] == prefix:
                                model.makeLeader(dbxref_curie)
                            elif clique_map[taxon] == gene_id.split(':')[0]:
                                model.makeLeader(gene_id)
                    else:
                        model.addSameIndividual(gene_id, dbxref_curie)
                except AssertionError as err:
                    LOG.warning("Error parsing %s: %s", gene_id, err)
Exemple #4
0
    def _add_gene_equivalencies(self, xrefs, gene_id, taxon):
        """
        Add equivalentClass and sameAs relationships

        Uses external resource map located in
        /resources/clique_leader.yaml to determine
        if an NCBITaxon ID space is a clique leader
        """

        clique_map = self.open_and_parse_yaml(self.resources['clique_leader'])

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        filter_out = ['Vega', 'IMGT/GENE-DB', 'Araport']

        # deal with the dbxrefs
        # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696

        for dbxref in xrefs.strip().split('|'):
            prefix = ':'.join(dbxref.split(':')[:-1]).strip()
            if prefix in self.localtt:
                prefix = self.localtt[prefix]
            dbxref_curie = ':'.join((prefix, dbxref.split(':')[-1]))

            if dbxref_curie is not None and prefix != '':
                if prefix == 'HPRD':  # proteins are not == genes.
                    model.addTriple(gene_id, self.globaltt['has gene product'],
                                    dbxref_curie)
                    continue
                    # skip some of these for now based on curie prefix
                if prefix in filter_out:
                    continue

                if prefix == 'ENSEMBL':
                    model.addXref(gene_id, dbxref_curie)
                if prefix == 'OMIM':
                    if dbxref_curie in self.omim_replaced:
                        repl = self.omim_replaced[dbxref_curie]
                        for omim in repl:
                            if omim in self.omim_type and \
                                    self.omim_type[omim] == self.globaltt['gene']:
                                dbxref_curie = omim
                    if dbxref_curie in self.omim_type and \
                            self.omim_type[dbxref_curie] != self.globaltt['gene']:
                        continue
                try:
                    if self.class_or_indiv.get(gene_id) == 'C':
                        model.addEquivalentClass(gene_id, dbxref_curie)
                        if taxon in clique_map:
                            if clique_map[taxon] == prefix:
                                model.makeLeader(dbxref_curie)
                            elif clique_map[taxon] == gene_id.split(':')[0]:
                                model.makeLeader(gene_id)
                    else:
                        model.addSameIndividual(gene_id, dbxref_curie)
                except AssertionError as err:
                    LOG.warning("Error parsing %s: %s", gene_id, err)
Exemple #5
0
    def _add_gene_equivalencies(self, xrefs, gene_id, taxon):
        """
        Add equivalentClass and sameAs relationships

        Uses external resource map located in
        /resources/clique_leader.yaml to determine
        if an ID space is a clique leader
        """

        clique_map = self.open_and_parse_yaml(self.resources['clique_leader'])

        if self.testMode:
            graph = self.testgraph
        else:
            graph = self.graph

        filter_out = ['Vega', 'IMGT/GENE-DB', 'Araport']
        # These will be made xrefs
        taxon_spec_xref_filters = {'10090': ['ENSEMBL'], '9606': ['ENSEMBL']}
        if taxon in taxon_spec_xref_filters:
            taxon_spec_filters = taxon_spec_xref_filters[taxon]
        else:
            taxon_spec_filters = []

        model = Model(graph)
        # deal with the xrefs
        # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696
        for ref in xrefs.strip().split('|'):
            xref_curie = self._cleanup_id(ref)
            if xref_curie is not None and xref_curie.strip() != '':
                if re.match(r'HPRD', xref_curie):
                    # proteins are not == genes.
                    model.addTriple(gene_id,
                                    self.properties['has_gene_product'],
                                    xref_curie)
                    continue
                    # skip some of these for now
                if xref_curie.split(':')[0] in filter_out:
                    continue
                if xref_curie.split(':')[0] in taxon_spec_xref_filters:
                    model.addXref(gene_id, xref_curie)
                if re.match(r'^OMIM', xref_curie):
                    if DipperUtil.is_omim_disease(xref_curie):
                        continue
                try:
                    if self.class_or_indiv.get(gene_id) == 'C':
                        model.addEquivalentClass(gene_id, xref_curie)
                        if int(taxon) in clique_map:
                            if clique_map[int(taxon)] == xref_curie.split(
                                    ':')[0]:
                                model.makeLeader(xref_curie)
                            elif clique_map[int(taxon)] == gene_id.split(
                                    ':')[0]:
                                model.makeLeader(gene_id)
                    else:
                        model.addSameIndividual(gene_id, xref_curie)
                except AssertionError as e:
                    logger.warn("Error parsing {0}: {1}".format(gene_id, e))
        return
    def _process_trait_mappings(self, raw, src_key, limit=None):
        """
        This method mapps traits from/to ...

        Triples created:

        :param limit:
        :return:
        """
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        line_counter = 0
        model = Model(graph)

        col = self.files[src_key]['columns']

        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            header = next(filereader, None)
            self.check_fileheader(col, header)
            for row in filereader:
                line_counter += 1
                # need to skip the last line
                if len(row) != len(col):
                    LOG.info("skipping line %d: %s", line_counter, '\t'.join(row))
                    continue
                vto_id = row[col.index('VT')].strip()
                pto_id = row[col.index('LPT')].strip()
                cmo_id = row[col.index('CMO')].strip()
                ato_column = row[col.index('ATO')].strip()
                # species = row[col.index('Species')].strip()
                # trait_class = row[col.index('Class')].strip()
                # trait_type = row[col.index('Type')].strip()
                # qtl_count = row[col.index('QTL_Count')].strip()

                ato_id = re.sub(
                    r'ATO #', 'AQTLTrait:', re.sub(
                        r'\].*', '', re.sub(r'\[', '', ato_column)))
                ato_id = ato_id.strip()

                ato_label = re.sub(r'.*\]\s*', '', ato_column)

                model.addClassToGraph(ato_id, ato_label.strip())

                if re.match(r'VT:.*', vto_id):
                    model.addClassToGraph(vto_id, None)
                    model.addEquivalentClass(ato_id, vto_id)
                if re.match(r'LPT:.*', pto_id):
                    model.addClassToGraph(pto_id, None)
                    model.addXref(ato_id, pto_id)
                if re.match(r'CMO:.*', cmo_id):
                    model.addClassToGraph(cmo_id, None)
                    model.addXref(ato_id, cmo_id)

        LOG.info("Done with trait mappings")
        return
Exemple #7
0
    def _process_genes_kegg2ncbi(self, limit=None):
        """
        This method maps the KEGG human gene IDs
            to the corresponding NCBI Gene IDs.

        Triples created:
        <kegg_gene_id> is a class
        <ncbi_gene_id> is a class
        <kegg_gene_id> equivalentClass <ncbi_gene_id>

        <kegg_gene_id> biolink:category biolink:Gene
        <ncbi_gene_id> biolink:category biolink:Gene
        :param limit:
        :return:

        """
        src_key = 'ncbi'
        LOG.info("Processing KEGG gene IDs to NCBI gene IDs")
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)

        raw = '/'.join((self.rawdir, self.files[src_key]['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in reader:
                (kegg_gene_id, ncbi_gene_id, link_type) = row

                if self.test_mode and kegg_gene_id not in self.test_ids[
                        'genes']:
                    continue

                # Adjust the NCBI gene ID prefix.
                ncbi_gene_id = re.sub(r'ncbi-geneid', 'NCBIGene', ncbi_gene_id)
                kegg_gene_id = 'KEGG-' + kegg_gene_id

                # Adding the KEGG gene ID to the graph here is redundant,
                # unless there happens to be additional gene IDs in this table
                # not present in the genes table.
                model.addClassToGraph(kegg_gene_id,
                                      None,
                                      class_category=blv.terms['Gene'])
                model.addClassToGraph(ncbi_gene_id,
                                      None,
                                      class_category=blv.terms['Gene'])
                model.addEquivalentClass(kegg_gene_id,
                                         ncbi_gene_id,
                                         subject_category=blv.terms['Gene'],
                                         object_category=blv.terms['Gene'])

                if not self.test_mode and (limit is not None
                                           and reader.line_num > limit):
                    break
        LOG.info("Done with KEGG gene IDs to NCBI gene IDs")
Exemple #8
0
    def _add_gene_equivalencies(self, xrefs, gene_id, taxon):
        """
        Add equivalentClass and sameAs relationships

        Uses external resource map located in
        /resources/clique_leader.yaml to determine
        if an ID space is a clique leader
        """

        clique_map = self.open_and_parse_yaml(self.resources['clique_leader'])

        if self.testMode:
            graph = self.testgraph
        else:
            graph = self.graph

        filter_out = ['Vega', 'IMGT/GENE-DB', 'Araport']
        taxon_spec_filters = {
            '10090': ['ENSEMBL']
        }
        if taxon in taxon_spec_filters:
            filter_out += taxon_spec_filters[taxon]

        model = Model(graph)
        # deal with the xrefs
        # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696
        for ref in xrefs.strip().split('|'):
            xref_curie = self._cleanup_id(ref)
            if xref_curie is not None and xref_curie.strip() != '':
                if re.match(r'HPRD', xref_curie):
                    # proteins are not == genes.
                    model.addTriple(
                        gene_id,
                        self.properties['has_gene_product'], xref_curie)
                    continue
                    # skip some of these for now
                if xref_curie.split(':')[0] in filter_out:
                    continue
                if re.match(r'^OMIM', xref_curie):
                    if DipperUtil.is_omim_disease(xref_curie):
                        continue
                try:
                    if self.class_or_indiv.get(gene_id) == 'C':
                        model.addEquivalentClass(
                            gene_id, xref_curie)
                        if int(taxon) in clique_map:
                            if clique_map[int(taxon)] == xref_curie.split(':')[0]:
                                model.makeLeader(xref_curie)
                            elif clique_map[int(taxon)] == gene_id.split(':')[0]:
                                model.makeLeader(gene_id)
                    else:
                        model.addSameIndividual(gene_id, xref_curie)
                except AssertionError as e:
                    logger.warn("Error parsing {0}: {1}".format(gene_id, e))
        return
Exemple #9
0
    def _process_trait_mappings(self, raw, limit=None):
        """
        This method mapps traits from/to ...

        Triples created:

        :param limit:
        :return:
        """
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        line_counter = 0
        model = Model(graph)

        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            next(filereader, None)  # skip header line
            for row in filereader:
                line_counter += 1
                # need to skip the last line
                if len(row) < 8:
                    LOG.info("skipping line %d: %s", line_counter,
                             '\t'.join(row))
                    continue
                (vto_id, pto_id, cmo_id, ato_column, species, trait_class,
                 trait_type, qtl_count) = row

                ato_id = re.sub(
                    r'ATO #', 'AQTLTrait:',
                    re.sub(r'\].*', '', re.sub(r'\[', '', ato_column)))
                ato_id = ato_id.strip()

                ato_label = re.sub(r'.*\]\s*', '', ato_column)

                model.addClassToGraph(ato_id, ato_label.strip())

                if re.match(r'VT:.*', vto_id):
                    model.addClassToGraph(vto_id, None)
                    model.addEquivalentClass(ato_id, vto_id)
                if re.match(r'LPT:.*', pto_id):
                    model.addClassToGraph(pto_id, None)
                    model.addXref(ato_id, pto_id)
                if re.match(r'CMO:.*', cmo_id):
                    model.addClassToGraph(cmo_id, None)
                    model.addXref(ato_id, cmo_id)

        LOG.info("Done with trait mappings")
        return
Exemple #10
0
    def _process_genes_kegg2ncbi(self, limit=None):
        """
        This method maps the KEGG human gene IDs
            to the corresponding NCBI Gene IDs.

        Triples created:
        <kegg_gene_id> is a class
        <ncbi_gene_id> is a class
        <kegg_gene_id> equivalentClass <ncbi_gene_id>
        :param limit:
        :return:

        """

        logger.info("Processing KEGG gene IDs to NCBI gene IDs")
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)
        line_counter = 0

        raw = '/'.join((self.rawdir, self.files['ncbi']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (kegg_gene_id, ncbi_gene_id, link_type) = row

                if self.testMode and \
                        kegg_gene_id not in self.test_ids['genes']:
                    continue

                # Adjust the NCBI gene ID prefix.
                ncbi_gene_id = re.sub(r'ncbi-geneid', 'NCBIGene', ncbi_gene_id)
                kegg_gene_id = 'KEGG-'+kegg_gene_id

                # Adding the KEGG gene ID to the graph here is redundant,
                # unless there happens to be additional gene IDs in this table
                # not present in the genes table.
                model.addClassToGraph(kegg_gene_id, None)
                model.addClassToGraph(ncbi_gene_id, None)
                model.addEquivalentClass(kegg_gene_id, ncbi_gene_id)

                if (not self.testMode) and (
                        limit is not None and line_counter > limit):
                    break

        logger.info("Done with KEGG gene IDs to NCBI gene IDs")
        return
Exemple #11
0
    def _process_trait_mappings(self, raw, limit=None):
        """
        This method mapps traits from/to ...

        Triples created:

        :param limit:
        :return:
        """
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        line_counter = 0
        model = Model(graph)

        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            next(filereader, None)  # skip header line
            for row in filereader:
                line_counter += 1
                # need to skip the last line
                if len(row) < 8:
                    LOG.info("skipping line %d: %s", line_counter, '\t'.join(row))
                    continue
                (vto_id, pto_id, cmo_id, ato_column, species, trait_class,
                 trait_type, qtl_count) = row

                ato_id = re.sub(
                    r'ATO #', 'AQTLTrait:', re.sub(
                        r'\].*', '', re.sub(r'\[', '', ato_column)))
                ato_id = ato_id.strip()

                ato_label = re.sub(r'.*\]\s*', '', ato_column)

                model.addClassToGraph(ato_id, ato_label.strip())

                if re.match(r'VT:.*', vto_id):
                    model.addClassToGraph(vto_id, None)
                    model.addEquivalentClass(ato_id, vto_id)
                if re.match(r'LPT:.*', pto_id):
                    model.addClassToGraph(pto_id, None)
                    model.addXref(ato_id, pto_id)
                if re.match(r'CMO:.*', cmo_id):
                    model.addClassToGraph(cmo_id, None)
                    model.addXref(ato_id, cmo_id)

        LOG.info("Done with trait mappings")
        return
Exemple #12
0
    def _process_gene_xref(self, limit):
        """
        Make equivalentClass axioms between flybase gene ids
            and NCBIGene
            and HGNC noting these are not expected to be orthologs just bad renaming.

        Note that there are a lot of genes in flybase from other organisms
        we make the eq axioms so that they clique merge in our large graph
        (for example, human genes should merge with HGNC)

        Adds triples to self.graph

        :param limit: number of rows to process
        :return: None

        """
        model = Model(self.graph)
        src_key = 'gene_xref'
        raw = '/'.join((self.rawdir, self.queries[src_key]['file']))
        LOG.info("processing gene xrefs")

        col = self.queries[src_key]['columns']

        with open(raw, 'r') as tsvfile:
            reader = csv.reader(tsvfile, delimiter='\t')
            row = next(reader)  # headers
            self.check_fileheader(col, row)

            for row in reader:
                gene_id = row[col.index('gene_id')]
                xref_id = row[col.index('xref_id')]
                xref_source = row[col.index('xref_source')]

                gene_curie = 'FlyBase:' + gene_id
                xref_prefix = None
                if xref_source == 'EntrezGene':
                    xref_prefix = 'NCBIGene'
                elif xref_source == 'HGNC':
                    xref_prefix = 'HGNC'
                    # gene_taxon = self.globaltt['H**o sapiens']
                xref_curie = xref_prefix + ':' + xref_id

                model.addEquivalentClass(gene_curie,
                                         xref_curie,
                                         object_category=blv.terms['Gene'])

                if limit is not None and reader.line_num > limit:
                    break
Exemple #13
0
 def _get_mapped_gene_ids(self, entry, graph):
     gene_ids = []
     model = Model(graph)
     omim_num = str(entry['mimNumber'])
     omim_curie = 'OMIM:' + omim_num
     if 'externalLinks' in entry:
         links = entry['externalLinks']
         omimtype = self.omim_type[omim_num]
         if 'geneIDs' in links:
             entrez_mappings = links['geneIDs']
             gene_ids = entrez_mappings.split(',')
             self.omim_ncbigene_idmap[omim_curie] = gene_ids
             if omimtype in [
                     self.globaltt['gene'], self.globaltt['has_affected_feature']]:
                 for ncbi in gene_ids:
                     model.addEquivalentClass(omim_curie, 'NCBIGene:' + str(ncbi))
     return gene_ids
Exemple #14
0
    def _get_mapped_gene_ids(self, entry, g):

        gene_ids = []
        model = Model(g)
        omimid = 'OMIM:'+str(entry['mimNumber'])
        if 'externalLinks' in entry:
            links = entry['externalLinks']
            omimtype = self._get_omimtype(entry)
            if 'geneIDs' in links:
                entrez_mappings = links['geneIDs']
                gene_ids = entrez_mappings.split(',')
                self.omim_ncbigene_idmap[omimid] = gene_ids
                if omimtype == Genotype.genoparts['gene']:
                    for i in gene_ids:
                        model.addEquivalentClass(omimid, 'NCBIGene:'+str(i))

        return gene_ids
Exemple #15
0
    def _process_genes_kegg2ncbi(self, limit=None):
        """
        This method maps the KEGG human gene IDs
            to the corresponding NCBI Gene IDs.

        Triples created:
        <kegg_gene_id> is a class
        <ncbi_gene_id> is a class
        <kegg_gene_id> equivalentClass <ncbi_gene_id>
        :param limit:
        :return:

        """

        LOG.info("Processing KEGG gene IDs to NCBI gene IDs")
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)

        raw = '/'.join((self.rawdir, self.files['ncbi']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in reader:
                (kegg_gene_id, ncbi_gene_id, link_type) = row

                if self.test_mode and kegg_gene_id not in self.test_ids['genes']:
                    continue

                # Adjust the NCBI gene ID prefix.
                ncbi_gene_id = re.sub(r'ncbi-geneid', 'NCBIGene', ncbi_gene_id)
                kegg_gene_id = 'KEGG-' + kegg_gene_id

                # Adding the KEGG gene ID to the graph here is redundant,
                # unless there happens to be additional gene IDs in this table
                # not present in the genes table.
                model.addClassToGraph(kegg_gene_id, None)
                model.addClassToGraph(ncbi_gene_id, None)
                model.addEquivalentClass(kegg_gene_id, ncbi_gene_id)

                if not self.test_mode and (
                        limit is not None and reader.line_num > limit):
                    break
        LOG.info("Done with KEGG gene IDs to NCBI gene IDs")
Exemple #16
0
    def _map_eom_terms(self, raw, limit=None):
        """
        This table contains the HP ID mappings from the local tsv file.
        Triples:
            <eom id> owl:equivalentClass <hp id>
        :param raw:
        :param limit:
        :return:
        """

        model = Model(self.graph)
        line_counter = 0
        col = self.files['map']['columns']
        with open(raw, 'r') as reader:
            line = reader.readline().strip()
            line = line.strip('/n')
            if self.check_fileheader(col, line.split('\t')):
                pass
            for line in reader:
                line_counter += 1
                row = line.strip('\n').split('\t')

                morphology_term_id = row[col.index(
                    'morphology_term_id')].strip()
                # morphology_term_label = row[col.index('morphology_term_label')]
                hp_id = row[col.index('HP ID')].strip()
                # hp_label = row[col.index('HP_Label')]
                # notes = row[col.index('Notes')]

                # Sub out the underscores for colons.
                hp_id = re.sub('_', ':', hp_id)
                if re.match(".*HP:.*", hp_id):
                    # add the HP term as a class
                    model.addClassToGraph(hp_id,
                                          None)  # TEC subclass of phenotype??
                    # Add the HP ID as an equivalent class
                    model.addEquivalentClass(morphology_term_id, hp_id)
                else:
                    LOG.warning('No matching HP term for %s',
                                morphology_term_id)

                if limit is not None and line_counter > limit:
                    break
Exemple #17
0
    def _process_pathway_pathway(self, limit):
        """
        There are "map" and "ko" identifiers for pathways.
        This makes equivalence mapping between them, where they exist.
        :param limit:
        :return:

        """
        logger.info("Processing KEGG pathways to other ids")
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        line_counter = 0

        model = Model(g)
        raw = '/'.join((self.rawdir, self.files['pathway_pathway']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (pathway_id_1, pathway_id_2) = row

                if self.testMode and \
                        pathway_id_1 not in self.test_ids['pathway']:
                    continue

                pathway_id_1 = 'KEGG-'+pathway_id_1
                # will look like KEGG-path:map04130 or KEGG-path:ko04130
                pathway_id_2 = 'KEGG-'+pathway_id_2

                if pathway_id_1 != pathway_id_2:
                    model.addEquivalentClass(pathway_id_1, pathway_id_2)

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        return
Exemple #18
0
    def _process_genes(self, limit=None):

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        geno = Genotype(g)
        model = Model(g)
        raw = '/'.join((self.rawdir, self.files['genes']['file']))
        line_counter = 0
        logger.info("Processing HGNC genes")

        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            # curl -s ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt | head -1 | tr '\t' '\n' | grep -n  .
            for row in filereader:
                (hgnc_id,
                 symbol,
                 name,
                 locus_group,
                 locus_type,
                 status,
                 location,
                 location_sortable,
                 alias_symbol,
                 alias_name,
                 prev_symbol,
                 prev_name,
                 gene_family,
                 gene_family_id,
                 date_approved_reserved,
                 date_symbol_changed,
                 date_name_changed,
                 date_modified,
                 entrez_id,
                 ensembl_gene_id,
                 vega_id,
                 ucsc_id,
                 ena,
                 refseq_accession,
                 ccds_id,
                 uniprot_ids,
                 pubmed_id,
                 mgd_id,
                 rgd_id,
                 lsdb,
                 cosmic,
                 omim_id,
                 mirbase,
                 homeodb,
                 snornabase,
                 bioparadigms_slc,
                 orphanet,
                 pseudogene_org,
                 horde_id,
                 merops,
                 imgt,
                 iuphar,
                 kznf_gene_catalog,
                 mamit_trnadb,
                 cd,
                 lncrnadb,
                 enzyme_id,
                 intermediate_filament_db,
                 rna_central_ids) = row

                line_counter += 1

                # skip header
                if line_counter <= 1:
                    continue

                if self.testMode and entrez_id != '' \
                        and int(entrez_id) not in self.gene_ids:
                    continue

                if name == '':
                    name = None
                gene_type_id = self._get_gene_type(locus_type)
                model.addClassToGraph(hgnc_id, symbol, gene_type_id, name)
                if locus_type == 'withdrawn':
                    model.addDeprecatedClass(hgnc_id)
                else:
                    model.makeLeader(hgnc_id)
                if entrez_id != '':
                    model.addEquivalentClass(
                        hgnc_id, 'NCBIGene:' + entrez_id)
                if ensembl_gene_id != '':
                    model.addEquivalentClass(
                        hgnc_id, 'ENSEMBL:' + ensembl_gene_id)
                if omim_id != '' and "|" not in omim_id:
                    omim_curie = 'OMIM:' + omim_id
                    if not DipperUtil.is_omim_disease(omim_curie):
                        model.addEquivalentClass(hgnc_id, omim_curie)

                geno.addTaxon('NCBITaxon:9606', hgnc_id)

                # add pubs as "is about"
                if pubmed_id != '':
                    for p in re.split(r'\|', pubmed_id.strip()):
                        if str(p) != '':
                            g.addTriple(
                                'PMID:' + str(p.strip()),
                                model.object_properties['is_about'], hgnc_id)

                # add chr location
                # sometimes two are listed, like: 10p11.2 or 17q25
                # -- there are only 2 of these FRA10A and MPFD
                # sometimes listed like "1 not on reference assembly"
                # sometimes listed like 10q24.1-q24.3
                # sometimes like 11q11 alternate reference locus
                band = chrom = None
                chr_pattern = r'(\d+|X|Y|Z|W|MT)[pq$]'
                chr_match = re.match(chr_pattern, location)
                if chr_match is not None and len(chr_match.groups()) > 0:
                    chrom = chr_match.group(1)
                    chrom_id = makeChromID(chrom, 'NCBITaxon:9606', 'CHR')
                    band_pattern = r'([pq][A-H\d]?\d?(?:\.\d+)?)'
                    band_match = re.search(band_pattern, location)
                    f = Feature(g, hgnc_id, None, None)
                    if band_match is not None and len(band_match.groups()) > 0:
                        band = band_match.group(1)
                        band = chrom + band
                        # add the chr band as the parent to this gene
                        # as a feature but assume that the band is created
                        # as a class with properties elsewhere in Monochrom
                        # TEC Monoch? Monarchdom??
                        band_id = makeChromID(band, 'NCBITaxon:9606', 'CHR')
                        model.addClassToGraph(band_id, None)
                        f.addSubsequenceOfFeature(band_id)
                    else:
                        model.addClassToGraph(chrom_id, None)
                        f.addSubsequenceOfFeature(chrom_id)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

            # end loop through file

        return
Exemple #19
0
    def _add_gene_equivalencies(self, dbxrefs, gene_id, taxon):
        """
        Add equivalentClass and sameAs relationships

        Uses external resource map located in
        /resources/clique_leader.yaml to determine
        if an NCBITaxon ID space is a clique leader
        """

        clique_map = self.open_and_parse_yaml(self.resources['clique_leader'])

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        filter_out = ['Vega', 'IMGT/GENE-DB', 'Araport', '', None]

        # deal with the dbxrefs
        # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696

        for dbxref in dbxrefs.strip().split('|'):
            dbxref = dbxref.strip()
            # de stutter dbxref
            (prefix, local_id) = dbxref.split(':')[-2:]
            prefix = prefix.strip()
            local_id = local_id.strip()

            # skip some of these based on curie prefix or malformatting
            if prefix is None or prefix in filter_out or \
                    local_id is None or local_id == '':
                continue

            if prefix in self.localtt:
                prefix = self.localtt[prefix]

            if prefix == 'AnimalQTLdb' and taxon in self.informal_species:
                prefix = self.informal_species[taxon] + 'QTL'
            elif prefix == 'AnimalQTLdb':
                LOG.warning('Unknown AnimalQTLdb species %s for %s:%s', taxon,
                            prefix, local_id)
            # else: # taxon is not in informal species (not unexpected)

            dbxref_curie = ':'.join((prefix, local_id))

            if dbxref_curie is not None:
                if prefix == 'HPRD':  # proteins are not == genes.
                    model.addTriple(gene_id, self.globaltt['has gene product'],
                                    dbxref_curie)
                    continue
                if prefix == 'ENSEMBL':
                    model.addXref(gene_id, dbxref_curie)
                    # For Ensembl xrefs, don't proceed to equivalent class code
                    # these are more loose xrefs than equivalent identifiers
                    continue
                if prefix == 'OMIM':
                    omim_num = dbxref_curie[5:]
                    if omim_num in self.omim_replaced:
                        repl = self.omim_replaced[omim_num]
                        for omim in repl:
                            if omim in self.omim_type and \
                                    self.omim_type[omim] == self.globaltt['gene']:
                                dbxref_curie = 'OMIM:' + omim
                                omim_num = omim  # last "gene" wins (is never > 2)

                    if omim_num in self.omim_type and\
                            self.omim_type[omim_num] == self.globaltt['gene']:
                        model.addXref(gene_id, dbxref_curie)
                    else:
                        # OMIM disease/phenotype is not considered a gene at all
                        # no equivilance between ncbigene and omin-nongene
                        # and ncbi is never a human clique leader in any case
                        dbxref_curie = None
                        continue

                # designate clique leaders and equivalentClass/sameAs triples
                # (perhaps premature as this ingest can't know what else exists)
                try:
                    if self.class_or_indiv.get(gene_id) == 'C' and \
                            dbxref_curie is not None:
                        model.addEquivalentClass(gene_id, dbxref_curie)
                        if taxon in clique_map:
                            if clique_map[taxon] == prefix:
                                model.makeLeader(dbxref_curie)
                            elif clique_map[taxon] == gene_id.split(':')[0]:
                                model.makeLeader(gene_id)
                    elif dbxref_curie is not None:
                        model.addSameIndividual(gene_id, dbxref_curie)
                except AssertionError as err:
                    LOG.warning("Error parsing %s: %s", gene_id, err)
Exemple #20
0
    def _get_identifiers(self, limit):
        """
        This will process the id mapping file provided by Biogrid.
        The file has a very large header, which we scan past,
        then pull the identifiers, and make equivalence axioms

        :param limit:
        :return:

        """

        LOG.info("getting identifier mapping")
        line_counter = 0
        f = '/'.join((self.rawdir, self.files['identifiers']['file']))
        myzip = ZipFile(f, 'r')
        # assume that the first entry is the item
        fname = myzip.namelist()[0]
        foundheader = False

        # TODO align this species filter with the one above
        # speciesfilters = 'H**o sapiens,Mus musculus,Drosophila melanogaster,
        # Danio rerio, Caenorhabditis elegans,Xenopus laevis'.split(',')

        speciesfilters = 'H**o sapiens,Mus musculus'.split(',')
        with myzip.open(fname, 'r') as csvfile:
            for line in csvfile:
                # skip header lines
                if not foundheader:
                    if re.match(r'BIOGRID_ID', line.decode()):
                        foundheader = True
                    continue

                line = line.decode().strip()
                # BIOGRID_ID
                # IDENTIFIER_VALUE
                # IDENTIFIER_TYPE
                # ORGANISM_OFFICIAL_NAME
                # 1	814566	ENTREZ_GENE	Arabidopsis thaliana
                (biogrid_num, id_num, id_type,
                 organism_label) = line.split('\t')

                if self.test_mode:
                    graph = self.testgraph
                    # skip any genes that don't match our test set
                    if int(biogrid_num) not in self.biogrid_ids:
                        continue
                else:
                    graph = self.graph

                model = Model(graph)

                # for each one of these,
                # create the node and add equivalent classes
                biogrid_id = 'BIOGRID:' + biogrid_num
                prefix = self.localtt[id_type]

                # TODO make these filters available as commandline options
                # geneidtypefilters='NCBIGene,OMIM,MGI,FlyBase,ZFIN,MGI,HGNC,
                #                   WormBase,XenBase,ENSEMBL,miRBase'.split(',')
                geneidtypefilters = 'NCBIGene,MGI,ENSEMBL,ZFIN,HGNC'.split(',')
                # proteinidtypefilters='HPRD,Swiss-Prot,NCBIProtein'
                if (speciesfilters is not None) \
                        and (organism_label.strip() in speciesfilters):
                    line_counter += 1
                    if (geneidtypefilters is not None) \
                            and (prefix in geneidtypefilters):
                        mapped_id = ':'.join((prefix, id_num))
                        model.addEquivalentClass(biogrid_id, mapped_id)
                    # this symbol will only get attached to the biogrid class
                    elif id_type == 'OFFICIAL_SYMBOL':
                        model.addClassToGraph(biogrid_id, id_num)
                    # elif (id_type == 'SYNONYM'):
                    #   FIXME - i am not sure these are synonyms, altids?
                    #   gu.addSynonym(g,biogrid_id,id_num)

                if not self.test_mode and limit is not None and line_counter > limit:
                    break

        myzip.close()

        return
Exemple #21
0
    def _process_diseasegene(self, limit):
        """
        :param limit:
        :return:
        """
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        line_counter = 0

        model = Model(graph)

        myfile = '/'.join((self.rawdir, self.files['disease-gene']['file']))

        for event, elem in ET.iterparse(myfile):
            if elem.tag == 'Disorder':
                # get the element name and id, ignore element name
                # id = elem.get('id') # some internal identifier
                disorder_num = elem.find('OrphaNumber').text
                disorder_id = 'ORPHA:' + str(disorder_num)

                if self.test_mode and disorder_id not in self.all_test_ids[
                        'disease']:
                    continue
                disorder_label = elem.find('Name').text

                # assuming that these are in the ontology (...any particular one?)
                model.addClassToGraph(disorder_id, disorder_label)
                assoc_list = elem.find('DisorderGeneAssociationList')
                expected_genes = assoc_list.get('count')
                LOG.info('Expecting %s genes assdciated with disorder %s.',
                         expected_genes, disorder_id)
                processed_genes = 0
                for assoc in assoc_list.findall('DisorderGeneAssociation'):
                    processed_genes += 1
                    gene = assoc.find('Gene')

                    # get gene's curie  HGNC or Ensembl ...

                    lclid = gene.find('OrphaNumber').text
                    gene_curie = 'ORPHA:' + lclid
                    gene_set = {'ORPHA': lclid}
                    for gene_ref in gene.findall(
                            './ExternalReferenceList/ExternalReference'):
                        gene_set[gene_ref.find('Source').text] = \
                            gene_ref.find('Reference').text

                    # set priority (clique leader if available) but default to OPRHA
                    for pfx in ('HGNC', 'Ensembl', 'SwissProt'):
                        #       'OMIM', 'Genatlas','Reactome', 'IUPHAR'):
                        if pfx in gene_set:
                            if pfx in self.localtt:
                                pfx = self.localtt[pfx]
                            gene_curie = pfx + ':' + gene_set[pfx]
                            gene_set.pop(pfx)
                            model.addClassToGraph(gene_curie, None)
                            break

                    # TEC have reservations w.r.t aggerator links being gene classes
                    for prefix in gene_set:
                        lclid = gene_set[prefix]
                        if prefix in self.localtt:
                            prefix = self.localtt[prefix]

                        dbxref = prefix + ':' + lclid

                        if gene_curie != dbxref:
                            model.addClassToGraph(dbxref, None)
                            model.addEquivalentClass(gene_curie, dbxref)

                    # TEC. would prefer this not happen here. let HGNC handle it
                    # except there are some w/o explicit external links ...

                    # gene_name = gene.find('Name').text
                    gene_symbol = gene.find('Symbol').text
                    # gene_iid = assoc.find('DisorderGeneAssociationType').get('id')
                    # gene_type_id = self.resolve(gene_iid)
                    # don't  know the 'type' of the gene for this class anymore
                    # model.addClassToGraph(
                    #    gene_curie, gene_symbol, gene_type_id, gene_name)

                    syn_list = gene.find('./SynonymList')
                    if int(syn_list.get('count')) > 0:
                        for syn in syn_list.findall('./Synonym'):
                            model.addSynonym(gene_curie, syn.text)

                    dg_label = assoc.find(
                        './DisorderGeneAssociationType/Name').text
                    # rel_id = self.resolve(dg_label)

                    # alt_locus_id = '_:' + gene_num + '-' + disorder_num + 'VL'
                    # alt_label = ' '.join((
                    #    'some variant of', gene_symbol.strip(), disorder_label))
                    # model.addIndividualToGraph(
                    #    alt_locus_id, alt_label, self.globaltt['variant_locus'])
                    # geno.addAffectedLocus(alt_locus_id, gene_id)
                    # model.addBlankNodeAnnotation(alt_locus_id)
                    # consider typing the gain/loss-of-function variants like:
                    # http://sequenceontology.org/browser/current_svn/term/SO:0002054
                    # http://sequenceontology.org/browser/current_svn/term/SO:0002053

                    # use dg association status to issue an evidence code
                    # FIXME I think that these codes are sub-optimal
                    eco_id = self.resolve(
                        assoc.find('DisorderGeneAssociationStatus/Name').text)

                    # assoc = G2PAssoc(
                    #    graph, self.name, alt_locus_id, disorder_id, rel_id)
                    # assoc.add_evidence(eco_id)
                    # assoc.add_association_to_graph()

                    self.add_gene_to_disease(dg_label, gene_curie, gene_symbol,
                                             disorder_id, eco_id)

                elem.clear()  # empty the element
                if int(expected_genes) != processed_genes:
                    LOG.warning(
                        '% expected %s associated genes but we processed %i',
                        disorder_id, expected_genes, processed_genes)

            if self.test_mode and limit is not None and line_counter > limit:
                return

        return
Exemple #22
0
    def _process_genes(self, taxid, limit=None):
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph

        model = Model(graph)
        geno = Genotype(graph)

        raw = '/'.join((self.rawdir, self.files[taxid]['file']))
        line_counter = 0
        LOG.info("Processing Ensembl genes for tax %s", taxid)
        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t')
            for row in filereader:
                if len(row) < 4:
                    LOG.warning("Too few columns in: " + row)
                    raise ValueError("Data error for file %s", raw)
                (ensembl_gene_id, external_gene_name, description, gene_biotype,
                 entrezgene, ensembl_peptide_id, uniprotswissprot) = row[0:7]

                # in the case of human genes, we also get the hgnc id,
                # and is the last col
                if taxid == '9606':
                    hgnc_id = row[7]
                else:
                    hgnc_id = None

                if self.test_mode and entrezgene != '' and \
                        int(entrezgene) not in self.gene_ids:
                    continue

                line_counter += 1
                gene_id = 'ENSEMBL:' + ensembl_gene_id
                peptide_curie = 'ENSEMBL:{}'.format(ensembl_peptide_id)
                uniprot_curie = 'UniProtKB:{}'.format(uniprotswissprot)
                entrez_curie = 'NCBIGene:{}'.format(entrezgene)

                if description == '':
                    description = None
                gene_biotype = gene_biotype.strip()
                gene_type_id = self.resolve(gene_biotype, False)
                if gene_type_id == gene_biotype.strip():   # did not resolve
                    gene_type_id = self.globaltt['polypeptide']

                model.addClassToGraph(
                    gene_id, external_gene_name, gene_type_id, description)
                model.addIndividualToGraph(peptide_curie, None, gene_type_id)
                model.addIndividualToGraph(uniprot_curie, None, gene_type_id)

                if entrezgene != '':
                    if taxid == '9606':
                        # Use HGNC for eq in human data
                        model.addXref(gene_id, entrez_curie)
                    else:
                        model.addEquivalentClass(gene_id, entrez_curie)
                if hgnc_id is not None and hgnc_id != '':
                    model.addEquivalentClass(gene_id, hgnc_id)
                geno.addTaxon('NCBITaxon:'+taxid, gene_id)
                if ensembl_peptide_id != '':
                    geno.addGeneProduct(gene_id, peptide_curie)
                    if uniprotswissprot != '':
                        geno.addGeneProduct(gene_id, uniprot_curie)
                        model.addXref(peptide_curie, uniprot_curie)

                if not self.test_mode and limit is not None and line_counter > limit:
                    break

        return
Exemple #23
0
    def _process_diseasegene(self, limit):
        """
        :param limit:
        :return:
        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        line_counter = 0
        geno = Genotype(g)
        model = Model(g)

        myfile = '/'.join((self.rawdir, self.files['disease-gene']['file']))

        # PYLINT complains iterparse deprecated,
        # but as of py 3.4 only the optional & unsupplied parse arg is.
        for event, elem in ET.iterparse(myfile):
            if elem.tag == 'Disorder':
                # get the element name and id, ignoreS element name
                # id = elem.get('id') # some internal identifier
                disorder_num = elem.find('OrphaNumber').text

                disorder_id = 'Orphanet:' + str(disorder_num)

                if self.testMode and \
                        disorder_id not in \
                        config.get_config()['test_ids']['disease']:
                    continue

                disorder_label = elem.find('Name').text

                # make a hash of internal gene id to type for later lookup
                gene_iid_to_type = {}
                gene_list = elem.find('GeneList')
                for gene in gene_list.findall('Gene'):
                    gene_iid = gene.get('id')
                    gene_type = gene.find('GeneType').get('id')
                    gene_iid_to_type[gene_iid] = gene_type

                # assuming that these are in the ontology
                model.addClassToGraph(disorder_id, disorder_label)

                assoc_list = elem.find('DisorderGeneAssociationList')
                for a in assoc_list.findall('DisorderGeneAssociation'):
                    gene_iid = a.find('.//Gene').get('id')
                    gene_name = a.find('.//Gene/Name').text
                    gene_symbol = a.find('.//Gene/Symbol').text
                    gene_num = a.find('./Gene/OrphaNumber').text
                    gene_id = 'Orphanet:' + str(gene_num)
                    gene_type_id = \
                        self._map_gene_type_id(gene_iid_to_type[gene_iid])
                    model.addClassToGraph(gene_id, gene_symbol, gene_type_id,
                                          gene_name)
                    syn_list = a.find('./Gene/SynonymList')
                    if int(syn_list.get('count')) > 0:
                        for s in syn_list.findall('./Synonym'):
                            model.addSynonym(gene_id, s.text)

                    dgtype = a.find('DisorderGeneAssociationType').get('id')
                    rel_id = self._map_rel_id(dgtype)
                    dg_label = \
                        a.find('./DisorderGeneAssociationType/Name').text
                    if rel_id is None:
                        logger.warning(
                            "Cannot map association type (%s) to RO " +
                            "for association (%s | %s).  Skipping.", dg_label,
                            disorder_label, gene_symbol)
                        continue

                    alt_locus_id = '_:' + gene_num + '-' + disorder_num + 'VL'
                    alt_label = \
                        ' '.join(('some variant of', gene_symbol.strip(),
                                  'that is a', dg_label.lower(),
                                  disorder_label))

                    model.addIndividualToGraph(alt_locus_id, alt_label,
                                               geno.genoparts['variant_locus'])
                    geno.addAffectedLocus(alt_locus_id, gene_id)
                    model.addBlankNodeAnnotation(alt_locus_id)

                    # consider typing the gain/loss-of-function variants like:
                    # http://sequenceontology.org/browser/current_svn/term/SO:0002054
                    # http://sequenceontology.org/browser/current_svn/term/SO:0002053

                    # use "assessed" status to issue an evidence code
                    # FIXME I think that these codes are sub-optimal
                    status_code = \
                        a.find('DisorderGeneAssociationStatus').get('id')
                    # imported automatically asserted information
                    # used in automatic assertion
                    eco_id = 'ECO:0000323'
                    # Assessed
                    # TODO are these internal ids stable between releases?
                    if status_code == '17991':
                        # imported manually asserted information
                        # used in automatic assertion
                        eco_id = 'ECO:0000322'
                    # Non-traceable author statement ECO_0000034
                    # imported information in automatic assertion ECO_0000313

                    assoc = G2PAssoc(g, self.name, alt_locus_id, disorder_id,
                                     rel_id)
                    assoc.add_evidence(eco_id)
                    assoc.add_association_to_graph()

                    rlist = a.find('./Gene/ExternalReferenceList')
                    eqid = None

                    for r in rlist.findall('ExternalReference'):
                        if r.find('Source').text == 'Ensembl':
                            eqid = 'ENSEMBL:' + r.find('Reference').text
                        elif r.find('Source').text == 'HGNC':
                            eqid = 'HGNC:' + r.find('Reference').text
                        elif r.find('Source').text == 'OMIM':
                            eqid = 'OMIM:' + r.find('Reference').text
                        else:
                            pass  # skip the others for now
                        if eqid is not None:
                            model.addClassToGraph(eqid, None)
                            model.addEquivalentClass(gene_id, eqid)
                elem.clear()  # empty the element

            if self.testMode and limit is not None and line_counter > limit:
                return

        return
Exemple #24
0
    def _process_trait_mappings(self, raw, limit=None):
        """
        This method mapps traits from/to ...

        Triples created:

        :param limit:
        :return:
        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        line_counter = 0
        model = Model(g)

        # with open(raw, 'r') as csvfile:
        #     filereader = csv.reader(csvfile, delimiter=',')
        #     row_count = sum(1 for row in filereader)
        #     row_count = row_count - 1

        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            next(filereader, None)  # skip header line
            for row in filereader:
                line_counter += 1
                # need to skip the last line
                if len(row) < 8:
                    logger.info(
                        "skipping line %d: %s", line_counter, '\t'.join(row))
                    continue
                (vto_id, pto_id, cmo_id, ato_column, species, trait_class,
                 trait_type, qtl_count) = row

                ato_id = re.sub(r'ATO #', 'AQTLTrait:',
                                re.sub(r'\].*', '',
                                       re.sub(r'\[', '', ato_column)))
                ato_id = ato_id.strip()

                ato_label = re.sub(r'.*\]\s*', '', ato_column)

                # if species == 'Cattle':
                #     ato_id = re.sub(r'ATO:', 'AQTLTraitCattle:', ato_id)
                # elif species == 'Chicken':
                #     ato_id = re.sub(r'ATO:', 'AQTLTraitChicken:', ato_id)
                # elif species == 'Sheep':
                #     ato_id = re.sub(r'ATO:', 'AQTLTraitSheep:', ato_id)
                # elif species == 'Horse':
                #     ato_id = re.sub(r'ATO:', 'AQTLTraitHorse:', ato_id)
                # elif species == 'Pig':
                #     ato_id = re.sub(r'ATO:', 'AQTLTraitPig:', ato_id)
                # elif species == 'Rainbow trout':
                #     ato_id = re.sub(
                #       r'ATO:', 'AQTLTraitRainbowTrout:', ato_id)
                # else:
                #     logger.warning(
                #       'Unknown species %s foufnd in trait mapping file.',
                #       species)
                #     continue
                # print(ato_label)

                model.addClassToGraph(ato_id, ato_label.strip())

                if re.match(r'VT:.*', vto_id):
                    model.addClassToGraph(vto_id, None)
                    model.addEquivalentClass(ato_id, vto_id)
                if re.match(r'LPT:.*', pto_id):
                    model.addClassToGraph(pto_id, None)
                    model.addXref(ato_id, pto_id)
                if re.match(r'CMO:.*', cmo_id):
                    model.addClassToGraph(cmo_id, None)
                    model.addXref(ato_id, cmo_id)

        logger.info("Done with trait mappings")
        return
Exemple #25
0
    def _process_genes(self, taxid, limit=None):
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        model = Model(g)
        geno = Genotype(g)

        raw = '/'.join((self.rawdir, self.files[taxid]['file']))
        line_counter = 0
        logger.info("Processing Ensembl genes for tax %s", taxid)
        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t')
            for row in filereader:
                if len(row) < 4:
                    raise ValueError("Data error for file %s", raw)
                (ensembl_gene_id, external_gene_name,
                 description, gene_biotype, entrezgene,
                 peptide_id, uniprot_swissprot) = row[0:7]

                # in the case of human genes, we also get the hgnc id,
                # and is the last col
                if taxid == '9606':
                    hgnc_id = row[7]
                else:
                    hgnc_id = None

                if self.testMode and entrezgene != '' \
                        and int(entrezgene) not in self.gene_ids:
                    continue

                line_counter += 1
                gene_id = 'ENSEMBL:' + ensembl_gene_id
                peptide_curie = 'ENSEMBL:{}'.format(peptide_id)
                uniprot_curie = 'UniProtKB:{}'.format(uniprot_swissprot)
                entrez_curie = 'NCBIGene:{}'.format(entrezgene)

                if description == '':
                    description = None
                # gene_type_id = self._get_gene_type(gene_biotype)
                gene_type_id = None
                model.addClassToGraph(
                    gene_id, external_gene_name, gene_type_id, description)
                model.addIndividualToGraph(peptide_curie, None, self._get_gene_type("polypeptide"))
                model.addIndividualToGraph(uniprot_curie, None, self._get_gene_type("polypeptide"))

                if entrezgene != '':
                    model.addEquivalentClass(gene_id, entrez_curie)
                if hgnc_id is not None and hgnc_id != '':
                    model.addEquivalentClass(gene_id, hgnc_id)
                geno.addTaxon('NCBITaxon:'+taxid, gene_id)
                if peptide_id != '':
                    geno.addGeneProduct(gene_id, peptide_curie)
                    if uniprot_swissprot != '':
                        geno.addGeneProduct(gene_id, uniprot_curie)
                        model.addXref(peptide_curie, uniprot_curie)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return
Exemple #26
0
    def _process_omim2gene(self, limit=None):
        """
        This method maps the OMIM IDs and KEGG gene ID.
        Currently split based on the link_type field.
        Equivalent link types are mapped as gene XRefs.
        Reverse link types are mapped as disease to gene associations.
        Original link types are currently skipped.

        Triples created:
        <kegg_gene_id> is a Gene
        <omim_gene_id> is a Gene
        <kegg_gene_id>> hasXref <omim_gene_id>

        <assoc_id> has subject <omim_disease_id>
        <assoc_id> has object <kegg_gene_id>
        :param limit:

        :return:
        """

        LOG.info("Processing OMIM to KEGG gene")
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        line_counter = 0
        geno = Genotype(graph)
        raw = '/'.join((self.rawdir, self.files['omim2gene']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (kegg_gene_id, omim_id, link_type) = row

                if self.test_mode and kegg_gene_id not in self.test_ids['genes']:
                    continue

                kegg_gene_id = 'KEGG-' + kegg_gene_id.strip()
                omim_id = re.sub(r'omim', 'OMIM', omim_id)
                if link_type == 'equivalent':
                    # these are genes!
                    # so add them as a class then make equivalence
                    model.addClassToGraph(omim_id, None)
                    geno.addGene(kegg_gene_id, None)
                    if not DipperUtil.is_omim_disease(omim_id):
                        model.addEquivalentClass(kegg_gene_id, omim_id)
                elif link_type == 'reverse':
                    # make an association between an OMIM ID & the KEGG gene ID
                    # we do this with omim ids because
                    # they are more atomic than KEGG ids

                    alt_locus_id = self._make_variant_locus_id(kegg_gene_id, omim_id)
                    alt_label = self.label_hash[alt_locus_id]
                    model.addIndividualToGraph(
                        alt_locus_id, alt_label, self.globaltt['variant_locus'])
                    geno.addAffectedLocus(alt_locus_id, kegg_gene_id)
                    model.addBlankNodeAnnotation(alt_locus_id)

                    # Add the disease to gene relationship.
                    rel = self.globaltt['is marker for']
                    assoc = G2PAssoc(graph, self.name, alt_locus_id, omim_id, rel)
                    assoc.add_association_to_graph()

                elif link_type == 'original':
                    # these are sometimes a gene, and sometimes a disease
                    LOG.info(
                        'Unable to handle original link for %s-%s',
                        kegg_gene_id, omim_id)
                else:
                    # don't know what these are
                    LOG.warning(
                        'Unhandled link type for %s-%s: %s',
                        kegg_gene_id, omim_id, link_type)

                if (not self.test_mode) and (
                        limit is not None and line_counter > limit):
                    break

        LOG.info("Done with OMIM to KEGG gene")

        return
Exemple #27
0
    def _process_diseasegene(self, limit):
        """
        :param limit:
        :return:
        """
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        line_counter = 0

        model = Model(graph)

        myfile = '/'.join((self.rawdir, self.files['disease-gene']['file']))

        for event, elem in ET.iterparse(myfile):
            if elem.tag == 'Disorder':
                # get the element name and id, ignore element name
                # id = elem.get('id') # some internal identifier
                disorder_num = elem.find('OrphaNumber').text
                disorder_id = 'ORPHA:' + str(disorder_num)

                if self.test_mode and disorder_id not in self.all_test_ids['disease']:
                    continue
                disorder_label = elem.find('Name').text

                # assuming that these are in the ontology (...any particular one?)
                model.addClassToGraph(disorder_id, disorder_label)
                assoc_list = elem.find('DisorderGeneAssociationList')
                expected_genes = assoc_list.get('count')
                LOG.info(
                    'Expecting %s genes associated with disorder %s.',
                    expected_genes, disorder_id)
                processed_genes = 0
                for assoc in assoc_list.findall('DisorderGeneAssociation'):
                    processed_genes += 1
                    gene = assoc.find('Gene')

                    # get gene's curie  HGNC or Ensembl ...

                    lclid = gene.find('OrphaNumber').text
                    gene_curie = 'ORPHA:' + lclid
                    gene_set = {'ORPHA': lclid}
                    for gene_ref in gene.findall(
                            './ExternalReferenceList/ExternalReference'):
                        gene_set[gene_ref.find('Source').text] = \
                            gene_ref.find('Reference').text

                    # set priority (clique leader if available) but default to OPRHA
                    for pfx in ('HGNC', 'Ensembl', 'SwissProt'):
                        if pfx in gene_set:
                            if pfx in self.localtt:
                                pfx = self.localtt[pfx]
                            gene_curie = pfx + ':' + gene_set[pfx]
                            gene_set.pop(pfx)
                            model.addClassToGraph(gene_curie, None)
                            break

                    # TEC have reservations w.r.t aggerator links being gene classes
                    for prefix in gene_set:
                        lclid = gene_set[prefix]
                        if prefix in self.localtt:
                            prefix = self.localtt[prefix]

                        dbxref = prefix + ':' + lclid

                        if gene_curie != dbxref:
                            model.addClassToGraph(dbxref, None)
                            model.addEquivalentClass(gene_curie, dbxref)

                    # TEC. would prefer this not happen here. let HGNC handle it
                    # except there are some w/o explicit external links ...

                    gene_symbol = gene.find('Symbol').text

                    syn_list = gene.find('./SynonymList')
                    if int(syn_list.get('count')) > 0:
                        for syn in syn_list.findall('./Synonym'):
                            model.addSynonym(gene_curie, syn.text)

                    dg_label = assoc.find('./DisorderGeneAssociationType/Name').text

                    # use dg association status to issue an evidence code
                    # FIXME I think that these codes are sub-optimal
                    eco_id = self.resolve(
                        assoc.find('DisorderGeneAssociationStatus/Name').text)

                    rel_id = self.resolve(dg_label)
                    
                    g2p_assoc = G2PAssoc(self.graph, self.name, gene_curie, disorder_id, rel_id)
                    g2p_assoc.add_evidence(eco_id)
                    g2p_assoc.add_association_to_graph()

                elem.clear()  # empty the element
                if int(expected_genes) != processed_genes:
                    LOG.warning(
                        '% expected %s associated genes but we processed %i',
                        disorder_id, expected_genes, processed_genes)

            if self.test_mode and limit is not None and line_counter > limit:
                return

        return
Exemple #28
0
    def _process_omim2disease(self, limit=None):
        """
        This method maps the KEGG disease IDs to
        the corresponding OMIM disease IDs.
        Currently this only maps KEGG diseases and OMIM diseases that are 1:1.

        Triples created:
        <kegg_disease_id> is a class
        <omim_disease_id> is a class
        <kegg_disease_id> hasXref <omim_disease_id>
        :param limit:

        :return:

        """

        LOG.info("Processing 1:1 KEGG disease to OMIM disease mappings")
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        line_counter = 0
        model = Model(graph)
        raw = '/'.join((self.rawdir, self.files['omim2disease']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                (omim_disease_id, kegg_disease_id, link_type) = row

                kegg_disease_id = 'KEGG-' + kegg_disease_id.strip()
                omim_disease_id = re.sub(r'omim', 'OMIM', omim_disease_id)

                # Create hash for the links from OMIM ID -> KEGG ID
                if omim_disease_id not in self.omim_disease_hash:
                    self.omim_disease_hash[omim_disease_id] = [kegg_disease_id]
                else:
                    self.omim_disease_hash[omim_disease_id].append(kegg_disease_id)

                # Create hash for the links from KEGG ID -> OMIM ID
                if kegg_disease_id not in self.kegg_disease_hash:
                    self.kegg_disease_hash[kegg_disease_id] = [omim_disease_id]
                else:
                    self.kegg_disease_hash[kegg_disease_id].append(omim_disease_id)

        # Now process the disease hashes
        # and only pass 1:1 omim disease:KEGG disease entries.
        for omim_disease_id in self.omim_disease_hash:
            if self.test_mode and omim_disease_id not in self.test_ids['disease']:
                continue

            if (not self.test_mode) and (limit is not None and line_counter > limit):
                break
            line_counter += 1

            if len(self.omim_disease_hash[omim_disease_id]) == 1:
                kegg_disease_id = ''.join(self.omim_disease_hash.get(omim_disease_id))
                if len(self.kegg_disease_hash[kegg_disease_id]) == 1:
                    # add ids, and deal with the labels separately
                    model.addClassToGraph(kegg_disease_id, None)
                    model.addClassToGraph(omim_disease_id, None)
                    # TODO is this safe?
                    model.addEquivalentClass(kegg_disease_id, omim_disease_id)
            else:
                pass
                # gu.addXref(g, omim_disease_id, kegg_disease_id)
                # TODO add xrefs if >1:1 mapping?

        LOG.info("Done with KEGG disease to OMIM disease mappings.")
        return
Exemple #29
0
    def _process_omim2gene(self, limit=None):
        """
        This method maps the OMIM IDs and KEGG gene ID.
        Currently split based on the link_type field.
        Equivalent link types are mapped as gene XRefs.
        Reverse link types are mapped as disease to gene associations.
        Original link types are currently skipped.

        Triples created:
        <kegg_gene_id> is a Gene
        <omim_gene_id> is a Gene
        <kegg_gene_id>> hasXref <omim_gene_id>

        <assoc_id> has subject <omim_disease_id>
        <assoc_id> has object <kegg_gene_id>
        :param limit:

        :return:
        """

        LOG.info("Processing OMIM to KEGG gene")
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        geno = Genotype(graph)
        raw = '/'.join((self.rawdir, self.files['omim2gene']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in reader:
                (kegg_gene_id, omim_id, link_type) = row

                if self.test_mode and kegg_gene_id not in self.test_ids['genes']:
                    continue

                kegg_gene_id = 'KEGG-' + kegg_gene_id.strip()
                omim_id = re.sub(r'omim', 'OMIM', omim_id)
                if link_type == 'equivalent':
                    # these are genes!
                    # so add them as a class then make equivalence
                    model.addClassToGraph(omim_id, None)
                    geno.addGene(kegg_gene_id, None)

                    # previous: if omim type is not disease-ish then use
                    # now is:   if omim type is gene then use

                    if omim_id in self.omim_replaced:
                        repl = self.omim_replaced[omim_id]
                        for omim in repl:
                            if omim in self.omim_type and \
                                    self.omim_type[omim] == self.globaltt['gene']:
                                omim_id = omim
                    if omim_id in self.omim_type and \
                            self.omim_type[omim_id] == self.globaltt['gene']:
                        model.addEquivalentClass(kegg_gene_id, omim_id)
                elif link_type == 'reverse':
                    # make an association between an OMIM ID & the KEGG gene ID
                    # we do this with omim ids because
                    # they are more atomic than KEGG ids

                    alt_locus_id = self._make_variant_locus_id(kegg_gene_id, omim_id)
                    alt_label = self.label_hash[alt_locus_id]
                    model.addIndividualToGraph(
                        alt_locus_id, alt_label, self.globaltt['variant_locus'])
                    geno.addAffectedLocus(alt_locus_id, kegg_gene_id)
                    model.addBlankNodeAnnotation(alt_locus_id)

                    # Add the disease to gene relationship.
                    rel = self.globaltt['is marker for']
                    assoc = G2PAssoc(graph, self.name, alt_locus_id, omim_id, rel)
                    assoc.add_association_to_graph()

                elif link_type == 'original':
                    # these are sometimes a gene, and sometimes a disease
                    LOG.info(
                        'Unable to handle original link for %s-%s',
                        kegg_gene_id, omim_id)
                else:
                    # don't know what these are
                    LOG.warning(
                        'Unhandled link type for %s-%s: %s',
                        kegg_gene_id, omim_id, link_type)

                if (not self.test_mode) and (
                        limit is not None and reader.line_num > limit):
                    break
        LOG.info("Done with OMIM to KEGG gene")
Exemple #30
0
    def _process_genes(self, limit=None):

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph

        geno = Genotype(graph)
        model = Model(graph)

        raw = '/'.join((self.rawdir, self.files['genes']['file']))
        col = self.files['genes']['columns']
        LOG.info("Processing HGNC genes")

        chr_pattern = re.compile(r'(\d+|X|Y|Z|W|MT)[pq$]')
        band_pattern = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')

            row = next(filereader)
            if not self.check_fileheader(col, row):
                exit(-1)

            for row in filereader:
                # To generate:
                # head -1 hgnc_complete_set.txt.1 | tr '\t' '\n' |
                # sed "s/\(.*\)/\1 = row[col.index(\'\1\')]/g"

                hgnc_id = row[col.index('hgnc_id')].strip()
                symbol = row[col.index('symbol')].strip()
                name = row[col.index('name')].strip()
                # locus_group = row[col.index('locus_group')]
                locus_type = row[col.index('locus_type')].strip()
                # status = row[col.index('status')]
                location = row[col.index('location')].strip()
                # location_sortable = row[col.index('location_sortable')]
                # alias_symbol = row[col.index('alias_symbol')]
                # alias_name = row[col.index('alias_name')]
                # prev_symbol = row[col.index('prev_symbol')]
                # prev_name = row[col.index('prev_name')]
                # gene_family = row[col.index('gene_family')]
                # gene_family_id = row[col.index('gene_family_id')]
                # date_approved_reserved = row[col.index('date_approved_reserved')]
                # date_symbol_changed = row[col.index('date_symbol_changed')]
                # date_name_changed = row[col.index('date_name_changed')]
                # date_modified = row[col.index('date_modified')]
                entrez_id = row[col.index('entrez_id')].strip()
                ensembl_gene_id = row[col.index('ensembl_gene_id')].strip()
                # vega_id = row[col.index('vega_id')]
                # ucsc_id = row[col.index('ucsc_id')]
                # ena = row[col.index('ena')]
                # refseq_accession = row[col.index('refseq_accession')]
                # ccds_id = row[col.index('ccds_id')]
                # uniprot_ids = row[col.index('uniprot_ids')]
                pubmed_ids = row[col.index('pubmed_id')].strip()  # pipe seperated!
                # mgd_id = row[col.index('mgd_id')]
                # rgd_id = row[col.index('rgd_id')]
                # lsdb = row[col.index('lsdb')]
                # cosmic = row[col.index('cosmic')]
                omim_ids = row[col.index('omim_id')].strip()  # pipe seperated!
                # mirbase = row[col.index('mirbase')]
                # homeodb = row[col.index('homeodb')]
                # snornabase = row[col.index('snornabase')]
                # bioparadigms_slc = row[col.index('bioparadigms_slc')]
                # orphanet = row[col.index('orphanet')]
                # pseudogene.org = row[col.index('pseudogene.org')]
                # horde_id = row[col.index('horde_id')]
                # merops = row[col.index('merops')]
                # imgt = row[col.index('imgt')]
                # iuphar = row[col.index('iuphar')]
                # kznf_gene_catalog = row[col.index('kznf_gene_catalog')]
                # mamit_trnadb = row[col.index('mamit-trnadb')]
                # cd = row[col.index('cd')]
                # lncrnadb = row[col.index('lncrnadb')]
                # enzyme_id = row[col.index('enzyme_id')]
                # intermediate_filament_db = row[col.index('intermediate_filament_db')]
                # rna_central_ids = row[col.index('rna_central_ids')]
                # lncipedia = row[col.index('lncipedia')]
                # gtrnadb = row[col.index('gtrnadb')]

                if self.test_mode and entrez_id != '' and \
                        entrez_id not in self.gene_ids:
                    continue

                if name == '':
                    name = None

                if locus_type == 'withdrawn':
                    model.addDeprecatedClass(hgnc_id)
                else:
                    gene_type_id = self.resolve(locus_type, False)  # withdrawn -> None?
                    if gene_type_id != locus_type:
                        model.addClassToGraph(hgnc_id, symbol, gene_type_id, name)
                    model.makeLeader(hgnc_id)

                if entrez_id != '':
                    model.addEquivalentClass(hgnc_id, 'NCBIGene:' + entrez_id)

                if ensembl_gene_id != '':
                    model.addEquivalentClass(hgnc_id, 'ENSEMBL:' + ensembl_gene_id)

                for omim_id in omim_ids.split('|'):
                    if omim_id in self.omim_replaced:
                        repl = self.omim_replaced[omim_id]
                        LOG.warning('%s is replaced with %s', omim_id, repl)
                        for omim in repl:
                            if self.omim_type[omim] == self.globaltt['gene']:
                                omim_id = omim

                    if omim_id in self.omim_type and \
                            self.omim_type[omim_id] == self.globaltt['gene']:
                        model.addEquivalentClass(hgnc_id, 'OMIM:' + omim_id)

                geno.addTaxon(self.hs_txid, hgnc_id)

                # add pubs as "is about"
                for pubmed_id in pubmed_ids.split('|'):
                    graph.addTriple(
                        'PMID:' + pubmed_id, self.globaltt['is_about'], hgnc_id)

                # add chr location
                # sometimes two are listed, like: 10p11.2 or 17q25
                # -- there are only 2 of these FRA10A and MPFD
                # sometimes listed like "1 not on reference assembly"
                # sometimes listed like 10q24.1-q24.3
                # sometimes like 11q11 alternate reference locus
                band = chrom = None
                chr_match = chr_pattern.match(location)
                if chr_match is not None and len(chr_match.groups()) > 0:
                    chrom = chr_match.group(1)
                    chrom_id = makeChromID(chrom, self.hs_txid, 'CHR')
                    band_match = band_pattern.search(location)
                    feat = Feature(graph, hgnc_id, None, None)
                    if band_match is not None and len(band_match.groups()) > 0:
                        band = band_match.group(1)
                        band = chrom + band
                        # add the chr band as the parent to this gene
                        # as a feature but assume that the band is created
                        # as a class with properties elsewhere in Monochrom
                        band_id = makeChromID(band, self.hs_txid, 'CHR')
                        model.addClassToGraph(band_id, None)
                        feat.addSubsequenceOfFeature(band_id)
                    else:
                        model.addClassToGraph(chrom_id, None)
                        feat.addSubsequenceOfFeature(chrom_id)

                if not self.test_mode and limit is not None and \
                        filereader.line_num > limit:
                    break
Exemple #31
0
    def _process_omim2disease(self, limit=None):
        """
        This method maps the KEGG disease IDs to
        the corresponding OMIM disease IDs.
        Currently this only maps KEGG diseases and OMIM diseases that are 1:1.

        Triples created:
        <kegg_disease_id> is a class
        <omim_disease_id> is a class
        <kegg_disease_id> hasXref <omim_disease_id>
        :param limit:

        :return:

        """

        LOG.info("Processing 1:1 KEGG disease to OMIM disease mappings")
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        raw = '/'.join((self.rawdir, self.files['omim2disease']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in reader:
                (omim_disease_id, kegg_disease_id, link_type) = row

                kegg_disease_id = 'KEGG-' + kegg_disease_id.strip()
                omim_disease_id = re.sub(r'omim', 'OMIM', omim_disease_id)

                # Create hash for the links from OMIM ID -> KEGG ID
                if omim_disease_id not in self.omim_disease_hash:
                    self.omim_disease_hash[omim_disease_id] = [kegg_disease_id]
                else:
                    self.omim_disease_hash[omim_disease_id].append(kegg_disease_id)

                # Create hash for the links from KEGG ID -> OMIM ID
                if kegg_disease_id not in self.kegg_disease_hash:
                    self.kegg_disease_hash[kegg_disease_id] = [omim_disease_id]
                else:
                    self.kegg_disease_hash[kegg_disease_id].append(omim_disease_id)

        # Now process the disease hashes
        # and only pass 1:1 omim disease:KEGG disease entries.
        for omim_disease_id in self.omim_disease_hash:
            if self.test_mode and omim_disease_id not in self.test_ids['disease']:
                continue

            if (not self.test_mode) and (limit is not None and reader.line_num > limit):
                break

            if len(self.omim_disease_hash[omim_disease_id]) == 1:
                kegg_disease_id = ''.join(self.omim_disease_hash.get(omim_disease_id))
                if len(self.kegg_disease_hash[kegg_disease_id]) == 1:
                    # add ids, and deal with the labels separately
                    model.addClassToGraph(kegg_disease_id, None)
                    model.addClassToGraph(omim_disease_id, None)
                    # TODO is this safe?
                    model.addEquivalentClass(kegg_disease_id, omim_disease_id)
            else:
                pass
                # gu.addXref(g, omim_disease_id, kegg_disease_id)
                # TODO add xrefs if >1:1 mapping?

        LOG.info("Done with KEGG disease to OMIM disease mappings.")
Exemple #32
0
    def _process_diseasegene(self, limit):
        """
        :param limit:
        :return:
        """
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        line_counter = 0

        model = Model(graph)

        myfile = '/'.join((self.rawdir, self.files['disease-gene']['file']))

        for event, elem in ET.iterparse(myfile):
            if elem.tag == 'Disorder':
                # get the element name and id, ignore element name
                # id = elem.get('id') # some internal identifier
                disorder_num = elem.find('OrphaNumber').text
                disorder_id = 'ORPHA:' + str(disorder_num)

                if self.test_mode and disorder_id not in self.all_test_ids[
                        'disease']:
                    continue
                disorder_label = elem.find('Name').text

                # assuming that these are in the ontology (...any particular one?)
                model.addClassToGraph(disorder_id, disorder_label)
                assoc_list = elem.find('DisorderGeneAssociationList')
                expected_genes = assoc_list.get('count')
                LOG.info('Expecting %s genes associated with disorder %s.',
                         expected_genes, disorder_id)
                processed_genes = 0
                for assoc in assoc_list.findall('DisorderGeneAssociation'):
                    processed_genes += 1
                    gene = assoc.find('Gene')

                    # get gene's curie  HGNC or Ensembl ...

                    lclid = gene.find('OrphaNumber').text
                    gene_curie = 'ORPHA:' + lclid
                    gene_set = {'ORPHA': lclid}
                    for gene_ref in gene.findall(
                            './ExternalReferenceList/ExternalReference'):
                        gene_set[gene_ref.find('Source').text] = \
                            gene_ref.find('Reference').text

                    # set priority (clique leader if available) but default to OPRHA
                    for pfx in ('HGNC', 'Ensembl', 'SwissProt'):
                        if pfx in gene_set:
                            if pfx in self.localtt:
                                pfx = self.localtt[pfx]
                            gene_curie = pfx + ':' + gene_set[pfx]
                            gene_set.pop(pfx)
                            model.addClassToGraph(gene_curie, None)
                            break

                    # TEC have reservations w.r.t aggerator links being gene classes
                    for prefix in gene_set:
                        lclid = gene_set[prefix]
                        if prefix in self.localtt:
                            prefix = self.localtt[prefix]

                        dbxref = prefix + ':' + lclid

                        if gene_curie != dbxref:
                            model.addClassToGraph(dbxref, None)
                            model.addEquivalentClass(gene_curie, dbxref)

                    # TEC. would prefer this not happen here. let HGNC handle it
                    # except there are some w/o explicit external links ...

                    gene_symbol = gene.find('Symbol').text

                    syn_list = gene.find('./SynonymList')
                    if int(syn_list.get('count')) > 0:
                        for syn in syn_list.findall('./Synonym'):
                            model.addSynonym(gene_curie, syn.text)

                    dg_label = assoc.find(
                        './DisorderGeneAssociationType/Name').text

                    # use dg association status to issue an evidence code
                    # FIXME I think that these codes are sub-optimal
                    eco_id = self.resolve(
                        assoc.find('DisorderGeneAssociationStatus/Name').text)

                    rel_id = self.resolve(dg_label)

                    g2p_assoc = G2PAssoc(self.graph, self.name, gene_curie,
                                         disorder_id, rel_id)
                    g2p_assoc.add_evidence(eco_id)
                    g2p_assoc.add_association_to_graph()

                elem.clear()  # empty the element
                if int(expected_genes) != processed_genes:
                    LOG.warning(
                        '% expected %s associated genes but we processed %i',
                        disorder_id, expected_genes, processed_genes)

            if self.test_mode and limit is not None and line_counter > limit:
                return

        return
Exemple #33
0
    def _add_gene_equivalencies(self, dbxrefs, gene_id, taxon):
        """
        Add equivalentClass and sameAs relationships

        Uses external resource map located in
        /resources/clique_leader.yaml to determine
        if an NCBITaxon ID space is a clique leader
        """

        clique_map = self.open_and_parse_yaml(self.resources['clique_leader'])

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        filter_out = ['Vega', 'IMGT/GENE-DB', 'Araport', '']

        # deal with the dbxrefs
        # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696

        for dbxref in dbxrefs.strip().split('|'):
            prefix = ':'.join(
                dbxref.split(':')[:-1]).strip()  # restore nonterminal ':'

            if prefix in self.localtt:
                prefix = self.localtt[prefix]

            # skip some of these for now based on curie prefix
            if prefix in filter_out:
                continue

            if prefix == 'AnimalQTLdb' and taxon in self.informal_species:
                prefix = self.informal_species[taxon] + 'QTL'

            dbxref_curie = ':'.join((prefix, dbxref.split(':')[-1]))
            if dbxref_curie is not None:
                if prefix == 'HPRD':  # proteins are not == genes.
                    model.addTriple(gene_id, self.globaltt['has gene product'],
                                    dbxref_curie)
                    continue

                if prefix == 'ENSEMBL':
                    model.addXref(gene_id, dbxref_curie)
                if prefix == 'OMIM':
                    omim_num = dbxref_curie[5:]
                    if omim_num in self.omim_replaced:
                        repl = self.omim_replaced[omim_num]
                        for omim in repl:
                            if omim in self.omim_type and \
                                    self.omim_type[omim] == self.globaltt['gene']:
                                dbxref_curie = 'OMIM:' + omim
                                model.addXref(gene_id, dbxref_curie)
                                omim_num = omim  # last wins

                    elif omim_num in self.omim_type and\
                            self.omim_type[omim_num] == self.globaltt['gene']:
                        model.addXref(gene_id, dbxref_curie)
                    else:
                        continue  # no equivilance between ncbigene and omin-nongene
                # designate clique leaders
                # (perhaps premature as this ingest can't know what else exists)
                try:
                    if self.class_or_indiv.get(gene_id) == 'C':
                        model.addEquivalentClass(gene_id, dbxref_curie)
                        if taxon in clique_map:
                            if clique_map[taxon] == prefix:
                                model.makeLeader(dbxref_curie)
                            elif clique_map[taxon] == gene_id.split(':')[0]:
                                model.makeLeader(gene_id)
                    else:
                        model.addSameIndividual(gene_id, dbxref_curie)
                except AssertionError as err:
                    LOG.warning("Error parsing %s: %s", gene_id, err)
Exemple #34
0
    def _process_diseasegene(self, limit):
        """
        :param limit:
        :return:
        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        line_counter = 0
        geno = Genotype(g)
        model = Model(g)

        myfile = '/'.join((self.rawdir, self.files['disease-gene']['file']))

        # PYLINT complains iterparse deprecated,
        # but as of py 3.4 only the optional & unsupplied parse arg is.
        for event, elem in ET.iterparse(myfile):
            if elem.tag == 'Disorder':
                # get the element name and id, ignoreS element name
                # id = elem.get('id') # some internal identifier
                disorder_num = elem.find('OrphaNumber').text

                disorder_id = 'Orphanet:'+str(disorder_num)

                if self.testMode and \
                        disorder_id not in \
                        config.get_config()['test_ids']['disease']:
                    continue

                disorder_label = elem.find('Name').text

                # make a hash of internal gene id to type for later lookup
                gene_iid_to_type = {}
                gene_list = elem.find('GeneList')
                for gene in gene_list.findall('Gene'):
                    gene_iid = gene.get('id')
                    gene_type = gene.find('GeneType').get('id')
                    gene_iid_to_type[gene_iid] = gene_type

                # assuming that these are in the ontology
                model.addClassToGraph(disorder_id, disorder_label)

                assoc_list = elem.find('DisorderGeneAssociationList')
                for a in assoc_list.findall('DisorderGeneAssociation'):
                    gene_iid = a.find('.//Gene').get('id')
                    gene_name = a.find('.//Gene/Name').text
                    gene_symbol = a.find('.//Gene/Symbol').text
                    gene_num = a.find('./Gene/OrphaNumber').text
                    gene_id = 'Orphanet:'+str(gene_num)
                    gene_type_id = \
                        self._map_gene_type_id(gene_iid_to_type[gene_iid])
                    model.addClassToGraph(
                        gene_id, gene_symbol, gene_type_id, gene_name)
                    syn_list = a.find('./Gene/SynonymList')
                    if int(syn_list.get('count')) > 0:
                        for s in syn_list.findall('./Synonym'):
                            model.addSynonym(gene_id, s.text)

                    dgtype = a.find('DisorderGeneAssociationType').get('id')
                    rel_id = self._map_rel_id(dgtype)
                    dg_label = \
                        a.find('./DisorderGeneAssociationType/Name').text
                    if rel_id is None:
                        logger.warning(
                            "Cannot map association type (%s) to RO " +
                            "for association (%s | %s).  Skipping.",
                            dg_label, disorder_label, gene_symbol)
                        continue

                    alt_locus_id = '_:'+gene_num+'-'+disorder_num+'VL'
                    alt_label = \
                        ' '.join(('some variant of', gene_symbol.strip(),
                                  'that is a', dg_label.lower(),
                                  disorder_label))

                    model.addIndividualToGraph(alt_locus_id, alt_label,
                                               geno.genoparts['variant_locus'])
                    geno.addAffectedLocus(alt_locus_id, gene_id)
                    model.addBlankNodeAnnotation(alt_locus_id)

                    # consider typing the gain/loss-of-function variants like:
                    # http://sequenceontology.org/browser/current_svn/term/SO:0002054
                    # http://sequenceontology.org/browser/current_svn/term/SO:0002053

                    # use "assessed" status to issue an evidence code
                    # FIXME I think that these codes are sub-optimal
                    status_code = \
                        a.find('DisorderGeneAssociationStatus').get('id')
                    # imported automatically asserted information
                    # used in automatic assertion
                    eco_id = 'ECO:0000323'
                    # Assessed
                    # TODO are these internal ids stable between releases?
                    if status_code == '17991':
                        # imported manually asserted information
                        # used in automatic assertion
                        eco_id = 'ECO:0000322'
                    # Non-traceable author statement ECO_0000034
                    # imported information in automatic assertion ECO_0000313

                    assoc = G2PAssoc(g, self.name, alt_locus_id,
                                     disorder_id, rel_id)
                    assoc.add_evidence(eco_id)
                    assoc.add_association_to_graph()

                    rlist = a.find('./Gene/ExternalReferenceList')
                    eqid = None

                    for r in rlist.findall('ExternalReference'):
                        if r.find('Source').text == 'Ensembl':
                            eqid = 'ENSEMBL:'+r.find('Reference').text
                        elif r.find('Source').text == 'HGNC':
                            eqid = 'HGNC:'+r.find('Reference').text
                        elif r.find('Source').text == 'OMIM':
                            eqid = 'OMIM:'+r.find('Reference').text
                        else:
                            pass  # skip the others for now
                        if eqid is not None:
                            model.addClassToGraph(eqid, None)
                            model.addEquivalentClass(gene_id, eqid)
                elem.clear()  # empty the element

            if self.testMode and limit is not None and line_counter > limit:
                return

        return
Exemple #35
0
    def _get_identifiers(self, limit):
        """
        This will process the id mapping file provided by Biogrid.
        The file has a very large header, which we scan past,
        then pull the identifiers, and make equivalence axioms

        :param limit:
        :return:

        """

        logger.info("getting identifier mapping")
        line_counter = 0
        f = '/'.join((self.rawdir, self.files['identifiers']['file']))
        myzip = ZipFile(f, 'r')
        # assume that the first entry is the item
        fname = myzip.namelist()[0]
        foundheader = False

        # TODO align this species filter with the one above
        # speciesfilters = 'H**o sapiens,Mus musculus,Drosophila melanogaster,
        # Danio rerio, Caenorhabditis elegans,Xenopus laevis'.split(',')

        speciesfilters = 'H**o sapiens,Mus musculus'.split(',')
        with myzip.open(fname, 'r') as csvfile:
            for line in csvfile:
                # skip header lines
                if not foundheader:
                    if re.match(r'BIOGRID_ID', line.decode()):
                        foundheader = True
                    continue

                line = line.decode().strip()
                # BIOGRID_ID
                # IDENTIFIER_VALUE
                # IDENTIFIER_TYPE
                # ORGANISM_OFFICIAL_NAME
                # 1	814566	ENTREZ_GENE	Arabidopsis thaliana
                (biogrid_num, id_num, id_type,
                 organism_label) = line.split('\t')

                if self.testMode:
                    g = self.testgraph
                    # skip any genes that don't match our test set
                    if int(biogrid_num) not in self.biogrid_ids:
                        continue
                else:
                    g = self.graph

                model = Model(g)

                # for each one of these,
                # create the node and add equivalent classes
                biogrid_id = 'BIOGRID:'+biogrid_num
                prefix = self._map_idtype_to_prefix(id_type)

                # TODO make these filters available as commandline options
                # geneidtypefilters='NCBIGene,OMIM,MGI,FlyBase,ZFIN,MGI,HGNC,
                #                   WormBase,XenBase,ENSEMBL,miRBase'.split(',')
                geneidtypefilters = 'NCBIGene,MGI,ENSEMBL,ZFIN,HGNC'.split(',')
                # proteinidtypefilters='HPRD,Swiss-Prot,NCBIProtein'
                if (speciesfilters is not None) \
                        and (organism_label.strip() in speciesfilters):
                    line_counter += 1
                    if (geneidtypefilters is not None) \
                            and (prefix in geneidtypefilters):
                        mapped_id = ':'.join((prefix, id_num))
                        model.addEquivalentClass(biogrid_id, mapped_id)
                    # this symbol will only get attached to the biogrid class
                    elif id_type == 'OFFICIAL_SYMBOL':
                        model.addClassToGraph(biogrid_id, id_num)
                    # elif (id_type == 'SYNONYM'):
                    #   FIXME - i am not sure these are synonyms, altids?
                    #   gu.addSynonym(g,biogrid_id,id_num)

                if not self.testMode and limit is not None \
                        and line_counter > limit:
                    break

        myzip.close()

        return
Exemple #36
0
    def _process_genes(self, taxid, limit=None):
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph

        model = Model(graph)
        geno = Genotype(graph)

        raw = '/'.join((self.rawdir, self.files[taxid]['file']))
        col = list(self.columns['bmq_attributes'])
        if taxid != '9606' and 'hgnc_id' in col:
            col.remove('hgnc_id')
        col_exp = [
            self.columns['bmq_headers'][self.columns['bmq_attributes'].index(x)]
            for x in col]

        LOG.info("Processing Ensembl genes for NCBITaxon:%s", taxid)
        with open(raw, 'r', encoding="utf8") as csvfile:
            reader = csv.reader(csvfile, delimiter='\t')
            row = next(reader)
            if not self.check_fileheader(col_exp, row):
                pass
            for row in reader:
                ensembl_gene_id = row[col.index('ensembl_gene_id')]
                external_gene_name = row[col.index('external_gene_name')]
                description = row[col.index('description')].strip()
                gene_biotype = row[col.index('gene_biotype')].strip()
                entrezgene = row[col.index('entrezgene_id')].strip()
                ensembl_peptide_id = row[col.index('ensembl_peptide_id')].strip()
                uniprotswissprot = row[col.index('uniprotswissprot')].strip()
                hgnc_curie = None
                # in the case of human genes, we also get the hgnc id,
                if taxid == '9606' and 'hgnc_id' in col:
                    hgnc_curie = row[col.index('hgnc_id')].strip()

                if self.test_mode and entrezgene != '' and \
                        entrezgene not in self.gene_ids:
                    continue

                gene_id = 'ENSEMBL:' + ensembl_gene_id
                entrez_curie = 'NCBIGene:{}'.format(entrezgene)

                if description == '':
                    description = None

                gene_type_id = self.resolve(
                    gene_biotype, mandatory=False,
                    default=self.globaltt['polypeptide'])

                model.addClassToGraph(
                    gene_id, external_gene_name, gene_type_id, description)

                if entrezgene != '':
                    if taxid == '9606':
                        # Use HGNC for eq in human data
                        model.addXref(gene_id, entrez_curie)
                    else:
                        model.addEquivalentClass(gene_id, entrez_curie)

                if hgnc_curie is not None and hgnc_curie != '':
                    model.addEquivalentClass(gene_id, hgnc_curie)
                geno.addTaxon('NCBITaxon:' + taxid, gene_id)
                if ensembl_peptide_id is not None and ensembl_peptide_id != '':
                    peptide_curie = 'ENSEMBL:{}'.format(ensembl_peptide_id)
                    model.addIndividualToGraph(peptide_curie, None, gene_type_id)
                    geno.addGeneProduct(gene_id, peptide_curie)
                    if uniprotswissprot != '':
                        uniprot_curie = 'UniProtKB:{}'.format(uniprotswissprot)
                        model.addIndividualToGraph(uniprot_curie, None, gene_type_id)
                        geno.addGeneProduct(gene_id, uniprot_curie)
                        model.addXref(peptide_curie, uniprot_curie)

                if not self.test_mode and limit is not None and reader.line_num > limit:
                    break
Exemple #37
0
    def _process_genes(self, limit=None):

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph

        geno = Genotype(graph)
        model = Model(graph)

        raw = '/'.join((self.rawdir, self.files['genes']['file']))
        col = self.files['genes']['columns']
        LOG.info("Processing HGNC genes")

        chr_pattern = re.compile(r'(\d+|X|Y|Z|W|MT)[pq$]')
        band_pattern = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')

            row = next(filereader)
            if not self.check_fileheader(col, row):
                pass

            for row in filereader:
                # To generate:
                # head -1 hgnc_complete_set.txt.1 | tr '\t' '\n' |
                # sed "s/\(.*\)/\1 = row[col.index(\'\1\')]/g"

                hgnc_id = row[col.index('hgnc_id')].strip()
                symbol = row[col.index('symbol')].strip()
                name = row[col.index('name')].strip()
                # locus_group = row[col.index('locus_group')]
                locus_type = row[col.index('locus_type')].strip()
                # status = row[col.index('status')]
                location = row[col.index('location')].strip()
                # location_sortable = row[col.index('location_sortable')]
                # alias_symbol = row[col.index('alias_symbol')]
                # alias_name = row[col.index('alias_name')]
                # prev_symbol = row[col.index('prev_symbol')]
                # prev_name = row[col.index('prev_name')]
                # gene_family = row[col.index('gene_family')]
                # gene_family_id = row[col.index('gene_family_id')]
                # date_approved_reserved = row[col.index('date_approved_reserved')]
                # date_symbol_changed = row[col.index('date_symbol_changed')]
                # date_name_changed = row[col.index('date_name_changed')]
                # date_modified = row[col.index('date_modified')]
                entrez_id = row[col.index('entrez_id')].strip()
                ensembl_gene_id = row[col.index('ensembl_gene_id')].strip()
                # vega_id = row[col.index('vega_id')]
                # ucsc_id = row[col.index('ucsc_id')]
                # ena = row[col.index('ena')]
                # refseq_accession = row[col.index('refseq_accession')]
                # ccds_id = row[col.index('ccds_id')]
                # uniprot_ids = row[col.index('uniprot_ids')]
                pubmed_ids = row[col.index(
                    'pubmed_id')].strip()  # pipe separated!
                # mgd_id = row[col.index('mgd_id')]
                # rgd_id = row[col.index('rgd_id')]
                # lsdb = row[col.index('lsdb')]
                # cosmic = row[col.index('cosmic')]
                omim_ids = row[col.index('omim_id')].strip()  # pipe separated!
                # mirbase = row[col.index('mirbase')]
                # homeodb = row[col.index('homeodb')]
                # snornabase = row[col.index('snornabase')]
                # bioparadigms_slc = row[col.index('bioparadigms_slc')]
                # orphanet = row[col.index('orphanet')]
                # pseudogene.org = row[col.index('pseudogene.org')]
                # horde_id = row[col.index('horde_id')]
                # merops = row[col.index('merops')]
                # imgt = row[col.index('imgt')]
                # iuphar = row[col.index('iuphar')]
                # kznf_gene_catalog = row[col.index('kznf_gene_catalog')]
                # mamit_trnadb = row[col.index('mamit-trnadb')]
                # cd = row[col.index('cd')]
                # lncrnadb = row[col.index('lncrnadb')]
                # enzyme_id = row[col.index('enzyme_id')]
                # intermediate_filament_db = row[col.index('intermediate_filament_db')]
                # rna_central_ids = row[col.index('rna_central_ids')]
                # lncipedia = row[col.index('lncipedia')]
                # gtrnadb = row[col.index('gtrnadb')]

                if self.test_mode and entrez_id != '' and \
                        entrez_id not in self.gene_ids:
                    continue

                if name == '':
                    name = None

                if locus_type == 'withdrawn':
                    model.addDeprecatedClass(hgnc_id)
                elif symbol[
                        -1] == '@':  # 10)  region (HOX), RNA cluster, gene (PCDH)
                    continue

                else:
                    gene_type_id = self.resolve(locus_type, mandatory=False)
                    if gene_type_id != locus_type:
                        model.addClassToGraph(hgnc_id, symbol, gene_type_id,
                                              name)
                    model.makeLeader(hgnc_id)

                if entrez_id != '':
                    model.addEquivalentClass(hgnc_id, 'NCBIGene:' + entrez_id)

                if ensembl_gene_id != '':
                    model.addEquivalentClass(hgnc_id,
                                             'ENSEMBL:' + ensembl_gene_id)

                for omim_id in omim_ids.split('|'):
                    if omim_id in self.omim_replaced:
                        repl = self.omim_replaced[omim_id]
                        LOG.warning('%s is replaced with %s', omim_id, repl)
                        for omim in repl:
                            if self.omim_type[omim] == self.globaltt['gene']:
                                omim_id = omim

                    if omim_id in self.omim_type and \
                            self.omim_type[omim_id] == self.globaltt['gene']:
                        model.addEquivalentClass(hgnc_id, 'OMIM:' + omim_id)

                geno.addTaxon(self.hs_txid, hgnc_id)

                # add pubs as "is about"
                for pubmed_id in pubmed_ids.split('|'):
                    graph.addTriple('PMID:' + pubmed_id,
                                    self.globaltt['is_about'], hgnc_id)

                # add chr location
                # sometimes two are listed, like: 10p11.2 or 17q25
                # -- there are only 2 of these FRA10A and MPFD
                # sometimes listed like "1 not on reference assembly"
                # sometimes listed like 10q24.1-q24.3
                # sometimes like 11q11 alternate reference locus
                band = chrom = None
                chr_match = chr_pattern.match(location)
                if chr_match is not None and chr_match.groups():
                    chrom = chr_match.group(1)
                    chrom_id = makeChromID(chrom, self.hs_txid, 'CHR')
                    band_match = band_pattern.search(location)
                    feat = Feature(graph, hgnc_id, None, None)
                    if band_match is not None and band_match.groups():
                        band = band_match.group(1)
                        band = chrom + band
                        # add the chr band as the parent to this gene
                        # as a feature but assume that the band is created
                        # as a class with properties elsewhere in Monochrom
                        band_id = makeChromID(band, self.hs_txid, 'CHR')
                        model.addClassToGraph(band_id, None)
                        feat.addSubsequenceOfFeature(band_id)
                    else:
                        model.addClassToGraph(chrom_id, None)
                        feat.addSubsequenceOfFeature(chrom_id)

                if not self.test_mode and limit is not None and \
                        filereader.line_num > limit:
                    break
Exemple #38
0
    def _process_genes(self, limit=None):

        if self.testMode:
            graph = self.testgraph
        else:
            graph = self.graph

        geno = Genotype(graph)
        model = Model(graph)
        raw = '/'.join((self.rawdir, self.files['genes']['file']))
        line_counter = 0
        logger.info("Processing HGNC genes")

        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            # curl -s ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt | head -1 | tr '\t' '\n' | grep -n  .
            for row in filereader:
                (hgnc_id, symbol, name, locus_group, locus_type, status,
                 location, location_sortable, alias_symbol, alias_name,
                 prev_symbol, prev_name, gene_family, gene_family_id,
                 date_approved_reserved, date_symbol_changed,
                 date_name_changed, date_modified, entrez_id, ensembl_gene_id,
                 vega_id, ucsc_id, ena, refseq_accession, ccds_id, uniprot_ids,
                 pubmed_id, mgd_id, rgd_id, lsdb, cosmic, omim_id, mirbase,
                 homeodb, snornabase, bioparadigms_slc, orphanet,
                 pseudogene_org, horde_id, merops, imgt, iuphar,
                 kznf_gene_catalog, mamit_trnadb, cd, lncrnadb, enzyme_id,
                 intermediate_filament_db, rna_central_ids) = row

                line_counter += 1

                # skip header
                if line_counter <= 1:
                    continue

                if self.testMode and entrez_id != ''  and \
                        int(entrez_id) not in self.gene_ids:
                    continue

                if name == '':
                    name = None
                gene_type_id = self.resolve(locus_type,
                                            False)  # withdrawn -> None?
                if gene_type_id != locus_type:
                    model.addClassToGraph(hgnc_id, symbol, gene_type_id, name)
                if locus_type == 'withdrawn':
                    model.addDeprecatedClass(hgnc_id)
                else:
                    model.makeLeader(hgnc_id)
                if entrez_id != '':
                    model.addEquivalentClass(hgnc_id, 'NCBIGene:' + entrez_id)
                if ensembl_gene_id != '':
                    model.addEquivalentClass(hgnc_id,
                                             'ENSEMBL:' + ensembl_gene_id)
                if omim_id != '' and "|" not in omim_id:
                    omim_curie = 'OMIM:' + omim_id
                    if not DipperUtil.is_omim_disease(omim_curie):
                        model.addEquivalentClass(hgnc_id, omim_curie)

                geno.addTaxon(self.hs_txid, hgnc_id)

                # add pubs as "is about"
                if pubmed_id != '':
                    for p in re.split(r'\|', pubmed_id.strip()):
                        if str(p) != '':
                            graph.addTriple('PMID:' + str(p.strip()),
                                            self.globaltt['is_about'], hgnc_id)

                # add chr location
                # sometimes two are listed, like: 10p11.2 or 17q25
                # -- there are only 2 of these FRA10A and MPFD
                # sometimes listed like "1 not on reference assembly"
                # sometimes listed like 10q24.1-q24.3
                # sometimes like 11q11 alternate reference locus
                band = chrom = None
                chr_pattern = r'(\d+|X|Y|Z|W|MT)[pq$]'
                chr_match = re.match(chr_pattern, location)
                if chr_match is not None and len(chr_match.groups()) > 0:
                    chrom = chr_match.group(1)
                    chrom_id = makeChromID(chrom, self.hs_txid, 'CHR')
                    band_pattern = r'([pq][A-H\d]?\d?(?:\.\d+)?)'
                    band_match = re.search(band_pattern, location)
                    feat = Feature(graph, hgnc_id, None, None)
                    if band_match is not None and len(band_match.groups()) > 0:
                        band = band_match.group(1)
                        band = chrom + band
                        # add the chr band as the parent to this gene
                        # as a feature but assume that the band is created
                        # as a class with properties elsewhere in Monochrom
                        band_id = makeChromID(band, self.hs_txid, 'CHR')
                        model.addClassToGraph(band_id, None)
                        feat.addSubsequenceOfFeature(band_id)
                    else:
                        model.addClassToGraph(chrom_id, None)
                        feat.addSubsequenceOfFeature(chrom_id)

                if not self.testMode and limit is not None and line_counter > limit:
                    break

            # end loop through file

        return
Exemple #39
0
    def _process_genes(self, taxid, limit=None):
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        model = Model(g)
        geno = Genotype(g)

        raw = '/'.join((self.rawdir, self.files[taxid]['file']))
        line_counter = 0
        logger.info("Processing Ensembl genes for tax %s", taxid)
        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t')
            for row in filereader:
                if len(row) < 4:
                    raise ValueError("Data error for file %s", raw)
                (ensembl_gene_id, external_gene_name, description,
                 gene_biotype, entrezgene, peptide_id,
                 uniprot_swissprot) = row[0:7]

                # in the case of human genes, we also get the hgnc id,
                # and is the last col
                if taxid == '9606':
                    hgnc_id = row[7]
                else:
                    hgnc_id = None

                if self.testMode and entrezgene != '' \
                        and int(entrezgene) not in self.gene_ids:
                    continue

                line_counter += 1
                gene_id = 'ENSEMBL:' + ensembl_gene_id
                peptide_curie = 'ENSEMBL:{}'.format(peptide_id)
                uniprot_curie = 'UniProtKB:{}'.format(uniprot_swissprot)
                entrez_curie = 'NCBIGene:{}'.format(entrezgene)

                if description == '':
                    description = None
                # gene_type_id = self._get_gene_type(gene_biotype)
                gene_type_id = None
                model.addClassToGraph(gene_id, external_gene_name,
                                      gene_type_id, description)
                model.addIndividualToGraph(peptide_curie, None,
                                           self._get_gene_type("polypeptide"))
                model.addIndividualToGraph(uniprot_curie, None,
                                           self._get_gene_type("polypeptide"))

                if entrezgene != '':
                    model.addEquivalentClass(gene_id, entrez_curie)
                if hgnc_id is not None and hgnc_id != '':
                    model.addEquivalentClass(gene_id, hgnc_id)
                geno.addTaxon('NCBITaxon:' + taxid, gene_id)
                if peptide_id != '':
                    geno.addGeneProduct(gene_id, peptide_curie)
                    if uniprot_swissprot != '':
                        geno.addGeneProduct(gene_id, uniprot_curie)
                        model.addXref(peptide_curie, uniprot_curie)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return