Beispiel #1
0
 def _add_deprecated_snp(
         self, snp_id, snp_id_current, merged, chrom_num, chrom_pos):
     if self.test_mode:
         graph = self.testgraph
     else:
         graph = self.graph
     model = Model(graph)
     location = self._make_location_curie(chrom_num, chrom_pos)
     # add deprecation information
     if merged == '1' and str(snp_id_current.strip()) != '':
         # get the current rs_id
         current_rs_id = 'dbSNP:'
         if not re.match(r'rs', snp_id_current):
             current_rs_id += 'rs'
         current_rs_id += str(snp_id_current)
         if location is not None:
             if location not in self.id_location_map:
                 self.id_location_map[location] = set(current_rs_id)
             else:
                 self.id_location_map[location].add(current_rs_id)
         model.addDeprecatedIndividual(snp_id, current_rs_id)
         # TODO check on this
         # should we add the annotations to the current
         # or orig?
         model.makeLeader(current_rs_id)
     else:
         model.makeLeader(snp_id)
Beispiel #2
0
 def _add_deprecated_snp(self, snp_id, snp_id_current, merged,
                         chrom_num, chrom_pos):
     if self.testMode:
         g = self.testgraph
     else:
         g = self.graph
     model = Model(g)
     location = self._make_location_curie(chrom_num, chrom_pos)
     # add deprecation information
     if merged == '1' and str(snp_id_current.strip()) != '':
         # get the current rs_id
         current_rs_id = 'dbSNP:'
         if not re.match(r'rs', snp_id_current):
             current_rs_id += 'rs'
         current_rs_id += str(snp_id_current)
         if location is not None:
             if location not in self.id_location_map:
                 self.id_location_map[location] = set(current_rs_id)
             else:
                 self.id_location_map[location].add(current_rs_id)
         model.addDeprecatedIndividual(snp_id, current_rs_id)
         # TODO check on this
         # should we add the annotations to the current
         # or orig?
         model.makeLeader(current_rs_id)
     else:
         model.makeLeader(snp_id)
Beispiel #3
0
    def _add_deprecated_snp(self, snp_id, snp_id_current, merged, chrom_num,
                            chrom_pos):
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        location = self._make_location_curie(chrom_num, chrom_pos)
        # add deprecation information
        if merged == '1' and snp_id_current != '':
            current_rs_id = 'dbSNP:rs' + snp_id_current
            if location is not None:
                if location not in self.id_location_map:
                    self.id_location_map[location] = set(current_rs_id)
                else:
                    self.id_location_map[location].add(current_rs_id)
            model.addDeprecatedIndividual(
                snp_id,
                current_rs_id,
                old_id_category=blv.terms['SequenceVariant'])

            # TODO check on this
            # should we add the annotations to the current
            # or orig?
            model.makeLeader(current_rs_id)
        else:
            model.makeLeader(snp_id)
Beispiel #4
0
    def _get_gene_history(self, limit):
        """
        Loops through the gene_history file and adds the old gene ids
        as deprecated classes, where the new gene id is the replacement for it.
        The old gene symbol is added as a synonym to the gene.
        :param limit:
        :return:

        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)
        logger.info("Processing Gene records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['gene_history']['file']))
        logger.info("FILE: %s", myfile)
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match(r'^#', line):
                    continue
                (tax_num, gene_num, discontinued_num, discontinued_symbol,
                 discontinued_date) = line.split('\t')

                # set filter=None in init if you don't want to have a filter
                # if self.filter is not None:
                #     if ((self.filter == 'taxids' and \
                #          (int(tax_num) not in self.tax_ids))
                #             or (self.filter == 'geneids' and \
                #                 (int(gene_num) not in self.gene_ids))):
                #         continue
                #  end filter

                if gene_num == '-' or discontinued_num == '-':
                    continue

                if self.testMode and int(gene_num) not in self.gene_ids:
                    continue

                if not self.testMode and int(tax_num) not in self.tax_ids:
                    continue

                line_counter += 1
                gene_id = ':'.join(('NCBIGene', gene_num))
                discontinued_gene_id = ':'.join(('NCBIGene', discontinued_num))

                # add the two genes
                if self.class_or_indiv.get(gene_id) == 'C':
                    model.addClassToGraph(gene_id, None)
                    model.addClassToGraph(discontinued_gene_id,
                                          discontinued_symbol)

                    # add the new gene id to replace the old gene id
                    model.addDeprecatedClass(discontinued_gene_id, [gene_id])
                else:
                    model.addIndividualToGraph(gene_id, None)
                    model.addIndividualToGraph(discontinued_gene_id,
                                               discontinued_symbol)
                    model.addDeprecatedIndividual(discontinued_gene_id,
                                                  [gene_id])

                # also add the old symbol as a synonym of the new gene
                model.addSynonym(gene_id, discontinued_symbol)

                if (not self.testMode) and\
                        (limit is not None and line_counter > limit):
                    break

        return
Beispiel #5
0
    def _get_process_allelic_variants(self, entry, g):
        model = Model(g)
        reference = Reference(g)
        geno = Genotype(g)
        if entry is not None:
            # to hold the entry-specific publication mentions
            # for the allelic variants
            publist = {}
            entry_num = entry['mimNumber']

            # process the ref list just to get the pmids
            ref_to_pmid = self._get_pubs(entry, g)

            if 'allelicVariantList' in entry:
                allelicVariantList = entry['allelicVariantList']
                for al in allelicVariantList:
                    al_num = al['allelicVariant']['number']
                    al_id = 'OMIM:'+str(entry_num)+'.'+str(al_num).zfill(4)
                    al_label = None
                    al_description = None
                    if al['allelicVariant']['status'] == 'live':
                        publist[al_id] = set()
                        if 'mutations' in al['allelicVariant']:
                            al_label = al['allelicVariant']['mutations']
                        if 'text' in al['allelicVariant']:
                            al_description = al['allelicVariant']['text']
                            m = re.findall(r'\{(\d+)\:', al_description)
                            publist[al_id] = set(m)
                        geno.addAllele(
                            al_id, al_label, geno.genoparts['variant_locus'],
                            al_description)
                        geno.addAlleleOfGene(
                            al_id, 'OMIM:'+str(entry_num),
                            geno.object_properties[
                                'is_sequence_variant_instance_of'])
                        for r in publist[al_id]:
                            pmid = ref_to_pmid[int(r)]
                            g.addTriple(
                                pmid, model.object_properties['is_about'],
                                al_id)
                        # look up the pubmed id in the list of references
                        if 'dbSnps' in al['allelicVariant']:
                            dbsnp_ids = \
                                re.split(r',', al['allelicVariant']['dbSnps'])
                            for dnum in dbsnp_ids:
                                did = 'dbSNP:'+dnum.strip()
                                model.addIndividualToGraph(did, None)
                                model.addSameIndividual(al_id, did)
                        if 'clinvarAccessions' in al['allelicVariant']:
                            # clinvarAccessions triple semicolon delimited
                            # each >1 like RCV000020059;;;
                            rcv_ids = \
                                re.split(
                                    r';;;',
                                    al['allelicVariant']['clinvarAccessions'])
                            rcv_ids = [
                                (re.match(r'(RCV\d+);*', r)).group(1)
                                for r in rcv_ids]
                            for rnum in rcv_ids:
                                rid = 'ClinVar:'+rnum
                                model.addXref(al_id, rid)
                        reference.addPage(
                            al_id, "http://omim.org/entry/" +
                            str(entry_num)+"#" + str(al_num).zfill(4))
                    elif re.search(
                            r'moved', al['allelicVariant']['status']):
                        # for both 'moved' and 'removed'
                        moved_ids = None
                        if 'movedTo' in al['allelicVariant']:
                            moved_id = 'OMIM:'+al['allelicVariant']['movedTo']
                            moved_ids = [moved_id]
                        model.addDeprecatedIndividual(al_id, moved_ids)
                    else:
                        logger.error('Uncaught alleleic variant status %s',
                                     al['allelicVariant']['status'])
                # end loop allelicVariantList

        return
Beispiel #6
0
    def _get_gene_history(self, limit):
        """
        Loops through the gene_history file and adds the old gene ids
        as deprecated classes, where the new gene id is the replacement for it.
        The old gene symbol is added as a synonym to the gene.
        :param limit:
        :return:

        """
        src_key = 'gene_history'
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        LOG.info("Processing Gene records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files[src_key]['file']))
        LOG.info("FILE: %s", myfile)
        col = self.files[src_key]['columns']
        with gzip.open(myfile, 'rb') as tsv:
            row = tsv.readline().decode().strip().split('\t')
            row[0] = row[0][1:]  # strip comment
            if not self.check_fileheader(col, row):
                pass

            for line in tsv:
                # skip comments
                row = line.decode().strip().split('\t')
                if row[0][0] == '#':
                    continue

                tax_num = row[col.index('tax_id')].strip()
                gene_num = row[col.index('GeneID')].strip()
                discontinued_num = row[col.index(
                    'Discontinued_GeneID')].strip()
                discontinued_symbol = row[col.index(
                    'Discontinued_Symbol')].strip()
                # discontinued_date = row[col.index('Discontinue_Date')]

                # set filter=None in init if you don't want to have a filter
                # if self.id_filter is not None:
                #     if ((self.id_filter == 'taxids' and \
                #          (int(tax_num) not in self.tax_ids))
                #             or (self.id_filter == 'geneids' and \
                #                 (int(gene_num) not in self.gene_ids))):
                #         continue
                #  end filter

                if gene_num == '-' or discontinued_num == '-':
                    continue

                if self.test_mode and gene_num not in self.gene_ids:
                    continue

                if not self.test_mode and tax_num not in self.tax_ids:
                    continue

                line_counter += 1
                gene_id = ':'.join(('NCBIGene', gene_num))
                discontinued_gene_id = ':'.join(('NCBIGene', discontinued_num))
                # add the two genes
                if self.class_or_indiv.get(gene_id) == 'C':
                    model.addClassToGraph(gene_id, None)
                    model.addClassToGraph(discontinued_gene_id,
                                          discontinued_symbol,
                                          class_category=blv.terms['Gene'])

                    # add the new gene id to replace the old gene id
                    model.addDeprecatedClass(discontinued_gene_id, [gene_id],
                                             old_id_category=blv.terms['Gene'])
                else:
                    model.addIndividualToGraph(gene_id, None)
                    model.addIndividualToGraph(discontinued_gene_id,
                                               discontinued_symbol,
                                               ind_category=blv.terms['Gene'])
                    model.addDeprecatedIndividual(
                        discontinued_gene_id, [gene_id],
                        old_id_category=blv.terms['Gene'])

                # also add the old symbol as a synonym of the new gene
                model.addSynonym(gene_id, discontinued_symbol)

                if not self.test_mode and (limit is not None
                                           and line_counter > limit):
                    break
Beispiel #7
0
    def _get_gene_history(self, limit):
        """
        Loops through the gene_history file and adds the old gene ids
        as deprecated classes, where the new gene id is the replacement for it.
        The old gene symbol is added as a synonym to the gene.
        :param limit:
        :return:

        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)
        logger.info("Processing Gene records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['gene_history']['file']))
        logger.info("FILE: %s", myfile)
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match(r'^#', line):
                    continue
                (tax_num, gene_num, discontinued_num, discontinued_symbol,
                 discontinued_date) = line.split('\t')

                # set filter=None in init if you don't want to have a filter
                # if self.filter is not None:
                #     if ((self.filter == 'taxids' and \
                #          (int(tax_num) not in self.tax_ids))
                #             or (self.filter == 'geneids' and \
                #                 (int(gene_num) not in self.gene_ids))):
                #         continue
                #  end filter

                if gene_num == '-' or discontinued_num == '-':
                    continue

                if self.testMode and int(gene_num) not in self.gene_ids:
                    continue

                if not self.testMode and int(tax_num) not in self.tax_ids:
                    continue

                line_counter += 1
                gene_id = ':'.join(('NCBIGene', gene_num))
                discontinued_gene_id = ':'.join(('NCBIGene', discontinued_num))

                # add the two genes
                if self.class_or_indiv.get(gene_id) == 'C':
                    model.addClassToGraph(gene_id, None)
                    model.addClassToGraph(
                        discontinued_gene_id, discontinued_symbol)

                    # add the new gene id to replace the old gene id
                    model.addDeprecatedClass(discontinued_gene_id, [gene_id])
                else:
                    model.addIndividualToGraph(gene_id, None)
                    model.addIndividualToGraph(
                        discontinued_gene_id, discontinued_symbol)
                    model.addDeprecatedIndividual(
                        discontinued_gene_id, [gene_id])

                # also add the old symbol as a synonym of the new gene
                model.addSynonym(gene_id, discontinued_symbol)

                if (not self.testMode) and\
                        (limit is not None and line_counter > limit):
                    break

        return
Beispiel #8
0
    def _get_process_allelic_variants(self, entry, graph):
        model = Model(graph)
        reference = Reference(graph)
        geno = Genotype(graph)
        if entry is not None:
            # to hold the entry-specific publication mentions
            # for the allelic variants
            publist = {}
            entry_num = entry['mimNumber']

            # process the ref list just to get the pmids
            ref_to_pmid = self._get_pubs(entry, graph)

            if 'allelicVariantList' in entry:
                for alv in entry['allelicVariantList']:
                    al_num = alv['allelicVariant']['number']
                    al_id = 'OMIM:' + str(entry_num) + '.' + str(al_num).zfill(
                        4)
                    al_label = None
                    al_description = None
                    if alv['allelicVariant']['status'] == 'live':
                        publist[al_id] = set()
                        if 'mutations' in alv['allelicVariant']:
                            al_label = alv['allelicVariant']['mutations']
                        if 'text' in alv['allelicVariant']:
                            al_description = alv['allelicVariant']['text']
                            mch = re.findall(r'\{(\d+)\:', al_description)
                            publist[al_id] = set(mch)
                        geno.addAllele(al_id, al_label,
                                       self.globaltt['variant_locus'],
                                       al_description)
                        geno.addAlleleOfGene(al_id, 'OMIM:' + str(entry_num),
                                             self.globaltt['is_allele_of'])
                        for ref in publist[al_id]:
                            pmid = ref_to_pmid[int(ref)]
                            graph.addTriple(pmid, self.globaltt['is_about'],
                                            al_id)
                        # look up the pubmed id in the list of references
                        if 'dbSnps' in alv['allelicVariant']:
                            dbsnp_ids = re.split(
                                r',', alv['allelicVariant']['dbSnps'])
                            for dnum in dbsnp_ids:
                                did = 'dbSNP:' + dnum.strip()
                                model.addIndividualToGraph(did, None)
                                model.addSameIndividual(al_id, did)

                        # Note that RCVs are variant to disease associations
                        # in ClinVar, rather than variant entries
                        # so we make these xrefs instead of equivalents
                        if 'clinvarAccessions' in alv['allelicVariant']:
                            # clinvarAccessions triple semicolon delimited
                            # each >1 like RCV000020059;;;
                            rcv_ids = \
                                alv['allelicVariant']['clinvarAccessions'].split(';;;')
                            rcv_ids = [rcv[:12]
                                       for rcv in rcv_ids]  # incase more cruft

                            for rnum in rcv_ids:
                                rid = 'ClinVar:' + rnum
                                model.addXref(al_id, rid)
                        reference.addPage(
                            al_id, "http://omim.org/entry/" + '#'.join(
                                (str(entry_num), str(al_num).zfill(4))))
                    elif re.search(r'moved', alv['allelicVariant']['status']):
                        # for both 'moved' and 'removed'
                        moved_ids = None
                        if 'movedTo' in alv['allelicVariant']:
                            moved_id = 'OMIM:' + alv['allelicVariant'][
                                'movedTo']
                            moved_ids = [moved_id]
                        model.addDeprecatedIndividual(al_id, moved_ids)
                    else:
                        LOG.error('Uncaught alleleic variant status %s',
                                  alv['allelicVariant']['status'])
Beispiel #9
0
    def _get_gene_history(self, limit):
        """
        Loops through the gene_history file and adds the old gene ids
        as deprecated classes, where the new gene id is the replacement for it.
        The old gene symbol is added as a synonym to the gene.
        :param limit:
        :return:

        """
        src_key = 'gene_history'
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        LOG.info("Processing Gene records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files[src_key]['file']))
        LOG.info("FILE: %s", myfile)
        col = self.files[src_key]['columns']
        with gzip.open(myfile, 'rb') as tsv:
            row = tsv.readline().decode().strip().split('\t')
            row[0] = row[0][1:]  # strip comment
            if not self.check_fileheader(col, row):
                pass

            for line in tsv:
                # skip comments
                row = line.decode().strip().split('\t')
                if row[0][0] == '#':
                    continue

                tax_num = row[col.index('tax_id')].strip()
                gene_num = row[col.index('GeneID')].strip()
                discontinued_num = row[col.index('Discontinued_GeneID')].strip()
                discontinued_symbol = row[col.index('Discontinued_Symbol')].strip()
                # discontinued_date = row[col.index('Discontinue_Date')]

                # set filter=None in init if you don't want to have a filter
                # if self.id_filter is not None:
                #     if ((self.id_filter == 'taxids' and \
                #          (int(tax_num) not in self.tax_ids))
                #             or (self.id_filter == 'geneids' and \
                #                 (int(gene_num) not in self.gene_ids))):
                #         continue
                #  end filter

                if gene_num == '-' or discontinued_num == '-':
                    continue

                if self.test_mode and gene_num not in self.gene_ids:
                    continue

                if not self.test_mode and tax_num not in self.tax_ids:
                    continue

                line_counter += 1
                gene_id = ':'.join(('NCBIGene', gene_num))
                discontinued_gene_id = ':'.join(('NCBIGene', discontinued_num))
                # add the two genes
                if self.class_or_indiv.get(gene_id) == 'C':
                    model.addClassToGraph(gene_id, None)
                    model.addClassToGraph(discontinued_gene_id, discontinued_symbol)

                    # add the new gene id to replace the old gene id
                    model.addDeprecatedClass(discontinued_gene_id, [gene_id])
                else:
                    model.addIndividualToGraph(gene_id, None)
                    model.addIndividualToGraph(
                        discontinued_gene_id, discontinued_symbol)
                    model.addDeprecatedIndividual(discontinued_gene_id, [gene_id])

                # also add the old symbol as a synonym of the new gene
                model.addSynonym(gene_id, discontinued_symbol)

                if not self.test_mode and (limit is not None and line_counter > limit):
                    break