Exemple #1
0
    def _process_all(self, limit):
        """
        Fetch and transform all current OMIM entries, then deprecate the
        replaced ones.

        The identifiers to fetch are the keys of ``self.omim_type`` minus the
        keys of ``self.omim_replaced``.  They are handed to
        ``self.process_entries`` together with ``self._transform_entry`` as
        the per-entry callback and an ``includes`` set containing ``'all'``.

        Before fetching, the human genome and taxon are added to the graph.
        Afterwards every key of ``self.omim_replaced`` is added as a
        deprecated class whose replacements are the OMIM curies listed for it
        (replaced records are not fetched, so this is the only place they
        enter the graph).

        When ``self.test_mode`` is set, everything is written to
        ``self.testgraph`` instead of ``self.graph``.

        :param limit: passed through unchanged to ``self.process_entries``
        """
        omimids = list(self.omim_type.keys() - self.omim_replaced.keys())

        LOG.info('Have %i omim numbers to fetch records from their API',
                 len(omimids))
        LOG.info('Have %i omim types ', len(self.omim_type))

        graph = self.testgraph if self.test_mode else self.graph
        geno = Genotype(graph)
        model = Model(graph)

        # The label is also the key into the global translation table, so it
        # has to be the exact species name (the previous literal was mangled
        # to 'H**o sapiens' and could not be a valid lookup key).
        tax_label = 'Homo sapiens'
        tax_id = self.globaltt[tax_label]

        # add genome and taxon
        geno.addGenome(tax_id, tax_label)
        model.addClassToGraph(tax_id, tax_label)

        includes = {'all'}

        self.process_entries(omimids, self._transform_entry, includes, graph,
                             limit)

        # since we are not fetching obsolete records any more add them all in here
        for omim_id, replacements in self.omim_replaced.items():
            model.addDeprecatedClass(
                'OMIM:' + omim_id,
                ['OMIM:' + o for o in replacements])
Exemple #2
0
    def process_gene_ids(self, limit):
        """
        Add the genes listed in the gzipped ``gene_ids`` CSV to the graph.

        Every row must hold exactly six comma-separated fields
        (taxon number, gene number, symbol, synonym, live/dead status and
        gene type); any other field count raises ``ValueError`` from the
        unpacking.  For each row the gene is added as a class labelled with
        its symbol (falling back to the synonym, then to no label), marked
        deprecated when its status is ``'Dead'``, linked to its taxon, and
        given its synonym when one is present.

        In test mode the test graph is used, only genes listed in
        ``self.test_ids['gene']`` are kept, and ``limit`` is ignored.

        :param limit: when not None (and not in test mode), stop after the
            first row whose 1-based position exceeds this number
        :return: None
        """
        source_name = self.files['gene_ids']['file']
        raw = '/'.join((self.rawdir, source_name))

        g = self.testgraph if self.testMode else self.graph

        model = Model(g)
        logger.info("Processing: %s", source_name)
        geno = Genotype(g)

        with gzip.open(raw, 'rb') as csvfile:
            text_stream = io.TextIOWrapper(csvfile, newline="")
            rows = csv.reader(text_stream, delimiter=',', quotechar='\"')
            for line_counter, row in enumerate(rows, start=1):
                # 6239,WBGene00000001,aap-1,Y110A7A.10,Live,protein_coding_gene
                taxon_num, gene_num, gene_symbol, gene_synonym, live, _ = row

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                gene_id = 'WormBase:' + gene_num
                # symbol first, then synonym, else leave the class unlabelled
                label = gene_symbol or gene_synonym or None

                model.addClassToGraph(
                    gene_id, label, Genotype.genoparts['gene'])
                if live == 'Dead':
                    model.addDeprecatedClass(gene_id)
                geno.addTaxon('NCBITaxon:' + taxon_num, gene_id)
                if gene_synonym:
                    model.addSynonym(gene_id, gene_synonym)

                if self.testMode or limit is None:
                    continue
                if line_counter > limit:
                    break
Exemple #3
0
    def process_gene_ids(self, limit):
        """
        Add the genes listed in the gzipped ``gene_ids`` CSV to the graph.

        Field positions come from ``self.files['gene_ids']['columns']``.
        For each row the gene is added as a class labelled with its symbol
        (falling back to the synonym, then to no label), marked deprecated
        when its status is ``'Dead'``, linked to its taxon, and given its
        synonym when one is present.

        A row whose field count differs from the configured column count is
        logged as an error.  If such a row is also too short to supply the
        fields read here it is skipped, where previously it raised
        ``IndexError``; other mismatched rows are still processed.

        :param limit: when not None, stop after the first row whose reader
            line number exceeds this value
        :raises ValueError: if a required column name is missing from the
            configured column list
        """
        src_key = 'gene_ids'
        src_file = self.files[src_key]['file']
        raw = '/'.join((self.rawdir, src_file))
        graph = self.graph
        model = Model(graph)
        geno = Genotype(graph)
        col = self.files[src_key]['columns']
        LOG.info("Processing: %s", src_file)

        # Column positions are fixed for the whole file: resolve them once
        # instead of calling col.index() five times per row.
        collen = len(col)
        taxon_idx = col.index('taxon_num')
        gene_idx = col.index('gene_num')
        symbol_idx = col.index('gene_symbol')
        synonym_idx = col.index('gene_synonym')
        live_idx = col.index('live')
        # shortest row that can still be indexed safely
        min_len = 1 + max(
            taxon_idx, gene_idx, symbol_idx, synonym_idx, live_idx)

        with gzip.open(raw, 'rb') as csvfile:
            reader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                delimiter=',',
                                quotechar='\"')
            # no header row to check
            for row in reader:
                if len(row) != collen:
                    LOG.error('In %s line %i expected %i colums but got %s.',
                              src_file, reader.line_num,
                              collen, row)
                    if len(row) < min_len:
                        # e.g. a blank line: nothing usable, and indexing
                        # it would raise IndexError
                        continue
                taxon_num = row[taxon_idx]
                gene_num = row[gene_idx]
                gene_symbol = row[symbol_idx]
                gene_synonym = row[synonym_idx]
                live = row[live_idx]
                # 6239,WBGene00000001,aap-1,Y110A7A.10,Live,protein_coding_gene

                taxon_curie = 'NCBITaxon:' + taxon_num
                gene_curie = 'WormBase:' + gene_num

                if gene_symbol == '':
                    # a synonym is not really a symbol, but it is the best
                    # label available
                    gene_symbol = gene_synonym
                if gene_symbol == '':
                    gene_symbol = None
                model.addClassToGraph(gene_curie, gene_symbol,
                                      self.globaltt['gene'])
                if live == 'Dead':
                    model.addDeprecatedClass(gene_curie,
                                             old_id_category=blv.terms['Gene'])
                geno.addTaxon(taxon_curie, gene_curie)
                if gene_synonym is not None and gene_synonym != '':
                    model.addSynonym(gene_curie, gene_synonym)

                if limit is not None and reader.line_num > limit:
                    break
Exemple #4
0
    def process_gene_ids(self, limit):
        """
        Add the genes listed in the gzipped ``gene_ids`` CSV to the graph.

        Each row must unpack into exactly six fields (taxon number, gene
        number, symbol, synonym, live/dead status, gene type); otherwise the
        unpacking raises ``ValueError``.  The gene becomes a class typed with
        ``self.globaltt['gene']`` and labelled with the symbol, or the
        synonym when the symbol is empty, or nothing when both are empty.
        ``'Dead'`` genes are additionally deprecated, every gene is linked to
        its taxon, and a non-empty synonym is recorded.

        In test mode the test graph is used, rows whose gene number is not
        in ``self.test_ids['gene']`` are skipped, and ``limit`` is ignored.

        :param limit: when not None (and not in test mode), stop after the
            first row whose 1-based position exceeds this number
        :return: None
        """
        file_name = self.files['gene_ids']['file']
        raw = '/'.join((self.rawdir, file_name))

        graph = self.testgraph if self.testMode else self.graph

        model = Model(graph)
        logger.info("Processing: %s", file_name)
        geno = Genotype(graph)

        with gzip.open(raw, 'rb') as csvfile:
            filereader = csv.reader(
                io.TextIOWrapper(csvfile, newline=""),
                delimiter=',', quotechar='\"')
            for line_counter, fields in enumerate(filereader, start=1):
                # 6239,WBGene00000001,aap-1,Y110A7A.10,Live,protein_coding_gene
                (taxon_num, gene_num, gene_symbol,
                 gene_synonym, live, _gene_type) = fields

                in_test = self.testMode
                if in_test and gene_num not in self.test_ids['gene']:
                    continue

                gene_id = 'WormBase:' + gene_num
                taxon_id = 'NCBITaxon:' + taxon_num

                # prefer the symbol, then the synonym, otherwise no label
                if gene_symbol:
                    gene_label = gene_symbol
                elif gene_synonym:
                    gene_label = gene_synonym
                else:
                    gene_label = None

                model.addClassToGraph(gene_id, gene_label,
                                      self.globaltt['gene'])
                if live == 'Dead':
                    model.addDeprecatedClass(gene_id)
                geno.addTaxon(taxon_id, gene_id)
                if gene_synonym:
                    model.addSynonym(gene_id, gene_synonym)

                over_limit = limit is not None and line_counter > limit
                if over_limit and not in_test:
                    break
Exemple #5
0
    def _process_genes(self, limit=None):
        """
        Load the tab-separated HGNC gene file into the graph.

        Every row, including the header, must unpack into exactly 49 fields;
        any other count raises ``ValueError``.  The first row is treated as
        the header and skipped.  For each remaining row this adds:

        * the gene class (only when ``self.resolve`` maps the locus type to
          something different from the raw locus type),
        * a deprecation for ``'withdrawn'`` entries, otherwise the gene is
          made the leader of its equivalence clique,
        * equivalent classes for the Entrez, Ensembl and (single, non-disease)
          OMIM identifiers,
        * the human taxon,
        * an ``is_about`` triple from each listed PubMed id, and
        * the chromosome band (or whole chromosome) as the parent feature.

        In test mode the test graph is used, rows with an Entrez id that is
        not in ``self.gene_ids`` are skipped, and ``limit`` is ignored.

        :param limit: when not None (and not in test mode), stop after the
            first row whose 1-based position exceeds this number
        :return: None
        """
        graph = self.testgraph if self.testMode else self.graph

        geno = Genotype(graph)
        model = Model(graph)
        raw = '/'.join((self.rawdir, self.files['genes']['file']))
        line_counter = 0
        logger.info("Processing HGNC genes")

        # Loop-invariant patterns, built once instead of once per row.
        # NOTE(review): inside a character class '$' is a literal dollar
        # sign, not an end-of-string anchor, so a location without a p/q arm
        # never matches.  Left as is to keep the existing output.
        chr_pattern = re.compile(r'(\d+|X|Y|Z|W|MT)[pq$]')
        band_pattern = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            # curl -s ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt | head -1 | tr '\t' '\n' | grep -n  .
            for row in filereader:
                # The full unpack doubles as a column-count check, which is
                # why the unused fields are still named here.
                (hgnc_id, symbol, name, locus_group, locus_type, status,
                 location, location_sortable, alias_symbol, alias_name,
                 prev_symbol, prev_name, gene_family, gene_family_id,
                 date_approved_reserved, date_symbol_changed,
                 date_name_changed, date_modified, entrez_id, ensembl_gene_id,
                 vega_id, ucsc_id, ena, refseq_accession, ccds_id, uniprot_ids,
                 pubmed_id, mgd_id, rgd_id, lsdb, cosmic, omim_id, mirbase,
                 homeodb, snornabase, bioparadigms_slc, orphanet,
                 pseudogene_org, horde_id, merops, imgt, iuphar,
                 kznf_gene_catalog, mamit_trnadb, cd, lncrnadb, enzyme_id,
                 intermediate_filament_db, rna_central_ids) = row

                line_counter += 1

                # skip header
                if line_counter <= 1:
                    continue

                if self.testMode and entrez_id != '' and \
                        int(entrez_id) not in self.gene_ids:
                    continue

                if name == '':
                    name = None
                # presumably resolve() hands back its input when no mapping
                # exists, which is what the inequality test detects
                # -- TODO confirm (withdrawn -> None?)
                gene_type_id = self.resolve(locus_type, False)
                if gene_type_id != locus_type:
                    model.addClassToGraph(hgnc_id, symbol, gene_type_id, name)
                if locus_type == 'withdrawn':
                    model.addDeprecatedClass(hgnc_id)
                else:
                    model.makeLeader(hgnc_id)
                if entrez_id != '':
                    model.addEquivalentClass(hgnc_id, 'NCBIGene:' + entrez_id)
                if ensembl_gene_id != '':
                    model.addEquivalentClass(hgnc_id,
                                             'ENSEMBL:' + ensembl_gene_id)
                # pipe-separated (multiple) OMIM ids are deliberately ignored
                if omim_id != '' and "|" not in omim_id:
                    omim_curie = 'OMIM:' + omim_id
                    if not DipperUtil.is_omim_disease(omim_curie):
                        model.addEquivalentClass(hgnc_id, omim_curie)

                geno.addTaxon(self.hs_txid, hgnc_id)

                # add pubs as "is about"
                # Strip each token *before* testing it, so that a
                # whitespace-only token cannot produce a bare 'PMID:' curie.
                for pmid in pubmed_id.split('|'):
                    pmid = pmid.strip()
                    if pmid:
                        graph.addTriple('PMID:' + pmid,
                                        self.globaltt['is_about'], hgnc_id)

                # add chr location
                # sometimes two are listed, like: 10p11.2 or 17q25
                # -- there are only 2 of these FRA10A and MPFD
                # sometimes listed like "1 not on reference assembly"
                # sometimes listed like 10q24.1-q24.3
                # sometimes like 11q11 alternate reference locus
                chr_match = chr_pattern.match(location)
                if chr_match is not None:
                    chrom = chr_match.group(1)
                    chrom_id = makeChromID(chrom, self.hs_txid, 'CHR')
                    band_match = band_pattern.search(location)
                    feat = Feature(graph, hgnc_id, None, None)
                    if band_match is not None:
                        band = chrom + band_match.group(1)
                        # add the chr band as the parent to this gene
                        # as a feature but assume that the band is created
                        # as a class with properties elsewhere in Monochrom
                        band_id = makeChromID(band, self.hs_txid, 'CHR')
                        model.addClassToGraph(band_id, None)
                        feat.addSubsequenceOfFeature(band_id)
                    else:
                        model.addClassToGraph(chrom_id, None)
                        feat.addSubsequenceOfFeature(chrom_id)

                if not self.testMode and limit is not None \
                        and line_counter > limit:
                    break

            # end loop through file

        return
Exemple #6
0
    def _get_gene_history(self, limit):
        """
        Record discontinued gene ids from the gzipped gene_history file.

        Lines starting with ``#`` are skipped.  Every other line must split
        on tabs into exactly five fields, otherwise the unpacking raises
        ``ValueError``.  Rows with ``'-'`` as either gene number are ignored.
        In test mode only genes in ``self.gene_ids`` are kept; otherwise only
        taxa in ``self.tax_ids`` are kept.

        For each kept row the current and the discontinued gene are added
        (as classes when ``self.class_or_indiv`` marks the current gene
        ``'C'``, as individuals otherwise), the discontinued one is
        deprecated in favour of the current one, and the discontinued symbol
        is added as a synonym of the current gene.

        :param limit: when not None (and not in test mode), stop once more
            than this many rows have been kept
        :return: None
        """
        graph = self.testgraph if self.testMode else self.graph
        model = Model(graph)
        logger.info("Processing Gene records")
        history_path = '/'.join(
            (self.rawdir, self.files['gene_history']['file']))
        logger.info("FILE: %s", history_path)

        kept = 0
        with gzip.open(history_path, 'rb') as handle:
            for raw_line in handle:
                text = raw_line.decode().strip()
                # skip comments
                if text.startswith('#'):
                    continue

                (tax_num, gene_num, discontinued_num,
                 discontinued_symbol, _discontinued_date) = text.split('\t')

                if '-' in (gene_num, discontinued_num):
                    continue

                # test mode filters on gene, normal mode filters on taxon
                if self.testMode:
                    if int(gene_num) not in self.gene_ids:
                        continue
                elif int(tax_num) not in self.tax_ids:
                    continue

                kept += 1
                gene_id = 'NCBIGene:' + gene_num
                discontinued_gene_id = 'NCBIGene:' + discontinued_num

                # add the two genes, then point the old id at the new one
                as_class = self.class_or_indiv.get(gene_id) == 'C'
                if as_class:
                    model.addClassToGraph(gene_id, None)
                    model.addClassToGraph(
                        discontinued_gene_id, discontinued_symbol)
                    model.addDeprecatedClass(discontinued_gene_id, [gene_id])
                else:
                    model.addIndividualToGraph(gene_id, None)
                    model.addIndividualToGraph(
                        discontinued_gene_id, discontinued_symbol)
                    model.addDeprecatedIndividual(
                        discontinued_gene_id, [gene_id])

                # also add the old symbol as a synonym of the new gene
                model.addSynonym(gene_id, discontinued_symbol)

                if self.testMode or limit is None:
                    continue
                if kept > limit:
                    break
Exemple #7
0
    def _process_genes(self, limit=None):
        """
        Load the tab-separated HGNC gene file into the graph.

        The first row is validated against the configured column list with
        ``self.check_fileheader``; on a mismatch the process exits.  Field
        positions are taken from ``self.files['genes']['columns']``.  For
        each data row this adds:

        * a deprecation for ``'withdrawn'`` entries; otherwise the gene class
          (only when ``self.resolve`` maps the locus type to something
          different from the raw locus type) and clique leadership,
        * equivalent classes for the Entrez and Ensembl ids,
        * equivalent classes for each OMIM id typed as a gene in
          ``self.omim_type`` (following ``self.omim_replaced`` first),
        * the human taxon,
        * an ``is_about`` triple from each non-empty PubMed id, and
        * the chromosome band (or whole chromosome) as the parent feature.

        In test mode the test graph is used, rows with an Entrez id that is
        not in ``self.gene_ids`` are skipped, and ``limit`` is ignored.

        :param limit: when not None (and not in test mode), stop after the
            first row whose reader line number exceeds this value
        """
        graph = self.testgraph if self.test_mode else self.graph

        geno = Genotype(graph)
        model = Model(graph)

        raw = '/'.join((self.rawdir, self.files['genes']['file']))
        col = self.files['genes']['columns']
        LOG.info("Processing HGNC genes")

        # NOTE(review): inside a character class '$' is a literal dollar
        # sign, not an end-of-string anchor, so a location without a p/q arm
        # never matches.  Left as is to keep the existing output.
        chr_pattern = re.compile(r'(\d+|X|Y|Z|W|MT)[pq$]')
        band_pattern = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')

            row = next(filereader)
            if not self.check_fileheader(col, row):
                exit(-1)

            # Positions of the columns actually used; resolved once here
            # rather than with a linear col.index() per field per row.
            # To list every available column:
            # head -1 hgnc_complete_set.txt.1 | tr '\t' '\n'
            idx = {
                field: col.index(field) for field in (
                    'hgnc_id', 'symbol', 'name', 'locus_type', 'location',
                    'entrez_id', 'ensembl_gene_id', 'pubmed_id', 'omim_id')
            }

            for row in filereader:
                hgnc_id = row[idx['hgnc_id']].strip()
                symbol = row[idx['symbol']].strip()
                name = row[idx['name']].strip()
                locus_type = row[idx['locus_type']].strip()
                location = row[idx['location']].strip()
                entrez_id = row[idx['entrez_id']].strip()
                ensembl_gene_id = row[idx['ensembl_gene_id']].strip()
                pubmed_ids = row[idx['pubmed_id']].strip()  # pipe separated!
                omim_ids = row[idx['omim_id']].strip()  # pipe separated!

                if self.test_mode and entrez_id != '' and \
                        entrez_id not in self.gene_ids:
                    continue

                if name == '':
                    name = None

                if locus_type == 'withdrawn':
                    model.addDeprecatedClass(hgnc_id)
                else:
                    # presumably resolve() hands back its input when no
                    # mapping exists -- TODO confirm
                    gene_type_id = self.resolve(locus_type, False)
                    if gene_type_id != locus_type:
                        model.addClassToGraph(
                            hgnc_id, symbol, gene_type_id, name)
                    model.makeLeader(hgnc_id)

                if entrez_id != '':
                    model.addEquivalentClass(hgnc_id, 'NCBIGene:' + entrez_id)

                if ensembl_gene_id != '':
                    model.addEquivalentClass(
                        hgnc_id, 'ENSEMBL:' + ensembl_gene_id)

                for omim_id in omim_ids.split('|'):
                    if omim_id in self.omim_replaced:
                        repl = self.omim_replaced[omim_id]
                        LOG.warning('%s is replaced with %s', omim_id, repl)
                        for omim in repl:
                            # a replacement id may be absent from the type
                            # table; treat that as "not a gene" rather than
                            # raising KeyError
                            if omim in self.omim_type and \
                                    self.omim_type[omim] == \
                                    self.globaltt['gene']:
                                omim_id = omim

                    if omim_id in self.omim_type and \
                            self.omim_type[omim_id] == self.globaltt['gene']:
                        model.addEquivalentClass(hgnc_id, 'OMIM:' + omim_id)

                geno.addTaxon(self.hs_txid, hgnc_id)

                # add pubs as "is about"
                # ''.split('|') yields [''], so empty tokens must be skipped
                # or every gene without publications gets a bare 'PMID:'.
                for pubmed_id in pubmed_ids.split('|'):
                    pubmed_id = pubmed_id.strip()
                    if pubmed_id:
                        graph.addTriple(
                            'PMID:' + pubmed_id, self.globaltt['is_about'],
                            hgnc_id)

                # add chr location
                # sometimes two are listed, like: 10p11.2 or 17q25
                # -- there are only 2 of these FRA10A and MPFD
                # sometimes listed like "1 not on reference assembly"
                # sometimes listed like 10q24.1-q24.3
                # sometimes like 11q11 alternate reference locus
                chr_match = chr_pattern.match(location)
                if chr_match is not None:
                    chrom = chr_match.group(1)
                    chrom_id = makeChromID(chrom, self.hs_txid, 'CHR')
                    band_match = band_pattern.search(location)
                    feat = Feature(graph, hgnc_id, None, None)
                    if band_match is not None:
                        band = chrom + band_match.group(1)
                        # add the chr band as the parent to this gene
                        # as a feature but assume that the band is created
                        # as a class with properties elsewhere in Monochrom
                        band_id = makeChromID(band, self.hs_txid, 'CHR')
                        model.addClassToGraph(band_id, None)
                        feat.addSubsequenceOfFeature(band_id)
                    else:
                        model.addClassToGraph(chrom_id, None)
                        feat.addSubsequenceOfFeature(chrom_id)

                if not self.test_mode and limit is not None and \
                        filereader.line_num > limit:
                    break
Exemple #8
0
    def _transform_entry(self, e, graph):
        """
        Add one OMIM API entry to the graph.

        The record is read from ``e['entry']``: ``mimNumber``, ``titles``
        (``preferredTitle`` plus optional ``alternativeTitles`` and
        ``includedTitles``), ``status``, and, when ``geneMapExists`` is
        truthy, ``geneMap``.

        An entry whose status is ``'removed'`` is only added as a deprecated
        class.  Every other entry is added as a class with its synonyms and
        definition, optionally gets a located feature built from its gene
        map, is deprecated with replacements when its status is ``'moved'``,
        and is then passed to the helper methods that handle phenotypic
        series, mapped ids, mapped gene ids, publications and allelic
        variants.

        :param e: dict with an ``'entry'`` key holding the OMIM record
        :param graph: graph object that all statements are written to
        :return: None
        """
        g = graph
        model = Model(g)
        geno = Genotype(graph)

        # taxon and genome build are hard-coded for this source
        tax_num = '9606'
        tax_id = 'NCBITaxon:9606'
        tax_label = 'Human'
        build_num = "GRCh38"
        build_id = "NCBIGenome:"+build_num

        # get the numbers, labels, and descriptions
        omimnum = e['entry']['mimNumber']
        titles = e['entry']['titles']
        label = titles['preferredTitle']

        other_labels = []
        if 'alternativeTitles' in titles:
            other_labels += self._get_alt_labels(titles['alternativeTitles'])
        if 'includedTitles' in titles:
            other_labels += self._get_alt_labels(titles['includedTitles'])

        # add synonyms of alternate labels
        # preferredTitle": "PFEIFFER SYNDROME",
        # "alternativeTitles":
        #   "ACROCEPHALOSYNDACTYLY, TYPE V; ACS5;;\nACS V;;\nNOACK SYNDROME",
        # "includedTitles":
        #   "CRANIOFACIAL-SKELETAL-DERMATOLOGIC DYSPLASIA, INCLUDED"

        # remove the abbreviation (comes after the ;) from the preferredTitle,
        # and add it as a synonym
        abbrev = None
        if len(re.split(r';', label)) > 1:
            abbrev = (re.split(r';', label)[1].strip())
        newlabel = self._cleanup_label(label)

        description = self._get_description(e['entry'])
        omimid = 'OMIM:'+str(omimnum)

        if e['entry']['status'] == 'removed':
            # removed entries get nothing but the deprecation
            model.addDeprecatedClass(omimid)
        else:
            omimtype = self._get_omimtype(e['entry'])
            nodelabel = newlabel
            # this uses our cleaned-up label
            if omimtype == Genotype.genoparts['heritable_phenotypic_marker']:
                if abbrev is not None:
                    nodelabel = abbrev
                # in this special case,
                # make it a disease by not declaring it as a gene/marker
                model.addClassToGraph(omimid, nodelabel, None, newlabel)
            elif omimtype == Genotype.genoparts['gene']:
                if abbrev is not None:
                    nodelabel = abbrev
                model.addClassToGraph(omimid, nodelabel, omimtype, newlabel)
            else:
                model.addClassToGraph(omimid, newlabel, omimtype)

            # add the original screaming-caps OMIM label as a synonym
            model.addSynonym(omimid, label)

            # add the alternate labels and includes as synonyms
            for l in other_labels:
                model.addSynonym(omimid, l, 'OIO:hasRelatedSynonym')

            # for OMIM, we're adding the description as a definition
            model.addDefinition(omimid, description)
            if abbrev is not None:
                model.addSynonym(omimid, abbrev, 'OIO:hasRelatedSynonym')

            # if this is a genetic locus (but not sequenced)
            #   then add the chrom loc info
            # but add it to the ncbi gene identifier,
            # not to the omim id (we reserve the omim id to be the phenotype)
            feature_id = None
            feature_label = None
            if 'geneMapExists' in e['entry'] and e['entry']['geneMapExists']:
                genemap = e['entry']['geneMap']
                is_gene = False

                if omimtype == \
                        Genotype.genoparts['heritable_phenotypic_marker']:
                    # get the ncbigene ids
                    ncbifeature = self._get_mapped_gene_ids(e['entry'], g)
                    if len(ncbifeature) == 1:
                        feature_id = 'NCBIGene:'+str(ncbifeature[0])
                        # add this feature as a cause for the omim disease
                        # TODO SHOULD I EVEN DO THIS HERE?
                        assoc = G2PAssoc(g, self.name, feature_id, omimid)
                        assoc.add_association_to_graph()

                    elif len(ncbifeature) > 1:
                        # feature_id stays None here, so no feature is
                        # located for an ambiguous mapping
                        logger.info(
                            "Its ambiguous when %s maps to >1 gene id: %s",
                            omimid, str(ncbifeature))
                    else:  # no ncbi feature, make an anonymous one
                        feature_id = self._make_anonymous_feature(str(omimnum))
                        feature_label = abbrev

                elif omimtype == Genotype.genoparts['gene']:
                    feature_id = omimid
                    is_gene = True
                else:
                    # 158900 falls into this category
                    feature_id = self._make_anonymous_feature(str(omimnum))
                    if abbrev is not None:
                        feature_label = abbrev
                    # from here on the entry is handled as a phenotypic marker
                    omimtype = \
                        Genotype.genoparts[
                            'heritable_phenotypic_marker']

                if feature_id is not None:
                    if 'comments' in genemap:
                        # add a comment to this feature
                        comment = genemap['comments']
                        if comment.strip() != '':
                            model.addDescription(feature_id, comment)
                    if 'cytoLocation' in genemap:
                        cytoloc = genemap['cytoLocation']
                        # parse the cytoloc.
                        # add this omim thing as
                        # a subsequence of the cytofeature
                        # 18p11.3-p11.2
                        # FIXME
                        # add the other end of the range,
                        # but not sure how to do that
                        # not sure if saying subsequence of feature
                        # is the right relationship

                        f = Feature(g, feature_id, feature_label, omimtype)
                        # note: the feature is only written to the graph
                        # when a chromosomeSymbol is present as well
                        if 'chromosomeSymbol' in genemap:
                            chrom_num = str(genemap['chromosomeSymbol'])
                            chrom = makeChromID(chrom_num, tax_num, 'CHR')
                            geno.addChromosomeClass(
                                chrom_num, tax_id, tax_label)

                            # add the positional information, if available
                            # (-1 marks a coordinate missing from the genemap)
                            fstart = fend = -1
                            if 'chromosomeLocationStart' in genemap:
                                fstart = genemap['chromosomeLocationStart']
                            if 'chromosomeLocationEnd' in genemap:
                                fend = genemap['chromosomeLocationEnd']
                            if fstart >= 0:
                                # make the build-specific chromosome
                                chrom_in_build = makeChromID(chrom_num,
                                                             build_num,
                                                             'MONARCH')
                                # then, add the chromosome instance
                                # (from the given build)
                                geno.addChromosomeInstance(
                                    chrom_num, build_id, build_num, chrom)
                                if omimtype == \
                                        Genotype.genoparts[
                                            'heritable_phenotypic_marker']:
                                    postypes = [Feature.types['FuzzyPosition']]
                                else:
                                    postypes = None
                                # NOTE that no strand information
                                # is available in the API
                                f.addFeatureStartLocation(
                                    fstart, chrom_in_build, None, postypes)
                                if fend >= 0:
                                    f.addFeatureEndLocation(
                                        fend, chrom_in_build, None, postypes)
                                # because fend defaults to -1, this also logs
                                # when only the start coordinate is present
                                if fstart > fend:
                                    logger.info(
                                        "start>end (%d>%d) for %s",
                                        fstart, fend, omimid)
                            # add the cytogenic location too
                            # for now, just take the first one
                            cytoloc = cytoloc.split('-')[0]
                            loc = makeChromID(cytoloc, tax_num, 'CHR')
                            model.addClassToGraph(loc, None)
                            f.addSubsequenceOfFeature(loc)
                            f.addFeatureToGraph(True, None, is_gene)

                # end adding causative genes/features

            # check if moved, if so,
            # make it deprecated and
            # replaced consider class to the other thing(s)
            # some entries have been moved to multiple other entries and
            # use the joining raw word "and"
            # 612479 is movedto:  "603075 and 603029"  OR
            # others use a comma-delimited list, like:
            # 610402 is movedto: "609122,300870"
            if e['entry']['status'] == 'moved':
                if re.search(r'and', str(e['entry']['movedTo'])):
                    # split the movedTo entry on 'and'
                    newids = re.split(r'and', str(e['entry']['movedTo']))
                elif len(str(e['entry']['movedTo']).split(',')) > 0:
                    # split on the comma
                    # (str.split always returns at least one element, so this
                    # branch also covers a single id and the else below is
                    # never reached)
                    newids = str(e['entry']['movedTo']).split(',')
                else:
                    # make a list of one
                    newids = [str(e['entry']['movedTo'])]
                # cleanup whitespace and add OMIM prefix to numeric portion
                fixedids = []
                for i in newids:
                    fixedids.append('OMIM:'+i.strip())

                model.addDeprecatedClass(omimid, fixedids)

            self._get_phenotypicseries_parents(e['entry'], g)
            self._get_mappedids(e['entry'], g)
            # NOTE(review): the return value is discarded here; presumably
            # this call is made for its effect on the graph -- confirm
            self._get_mapped_gene_ids(e['entry'], g)

            self._get_pubs(e['entry'], g)

            self._get_process_allelic_variants(e['entry'], g)  # temp gag

        return
Exemple #9
0
    def _get_gene_history(self, limit):
        """
        Loops through the gene_history file and adds the old gene ids
        as deprecated classes (or deprecated individuals, depending on the
        entry for the new gene in self.class_or_indiv), where the new gene id
        is the replacement for it.
        The old gene symbol is added as a synonym to the new gene.

        A row is skipped when either gene id is '-', when (in test mode) the
        new gene id is not in self.gene_ids, or when (outside test mode) the
        taxon id is not in self.tax_ids.

        :param limit: outside test mode, stop after more than this many
            records have been processed; None means no limit
        :return: None

        """
        src_key = 'gene_history'
        graph = self.testgraph if self.test_mode else self.graph
        model = Model(graph)
        LOG.info("Processing Gene records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files[src_key]['file']))
        LOG.info("FILE: %s", myfile)
        col = self.files[src_key]['columns']
        with gzip.open(myfile, 'rb') as tsv:
            row = tsv.readline().decode().strip().split('\t')
            row[0] = row[0][1:]  # strip comment
            # the outcome of the header check is not acted upon here
            self.check_fileheader(col, row)

            # resolve the column positions once instead of once per row
            # (the Discontinue_Date column is not used)
            tax_idx = col.index('tax_id')
            gene_idx = col.index('GeneID')
            discontinued_idx = col.index('Discontinued_GeneID')
            symbol_idx = col.index('Discontinued_Symbol')

            for line in tsv:
                text = line.decode().strip()
                # skip blank lines (which used to raise IndexError) and comments
                if not text or text.startswith('#'):
                    continue
                row = text.split('\t')

                tax_num = row[tax_idx].strip()
                gene_num = row[gene_idx].strip()
                discontinued_num = row[discontinued_idx].strip()
                discontinued_symbol = row[symbol_idx].strip()

                # '-' marks a missing id on either side of the replacement
                if gene_num == '-' or discontinued_num == '-':
                    continue

                if self.test_mode and gene_num not in self.gene_ids:
                    continue

                if not self.test_mode and tax_num not in self.tax_ids:
                    continue

                line_counter += 1
                gene_id = ':'.join(('NCBIGene', gene_num))
                discontinued_gene_id = ':'.join(('NCBIGene', discontinued_num))
                # add the two genes
                if self.class_or_indiv.get(gene_id) == 'C':
                    model.addClassToGraph(gene_id, None)
                    model.addClassToGraph(discontinued_gene_id,
                                          discontinued_symbol,
                                          class_category=blv.terms['Gene'])

                    # add the new gene id to replace the old gene id
                    model.addDeprecatedClass(discontinued_gene_id, [gene_id],
                                             old_id_category=blv.terms['Gene'])
                else:
                    model.addIndividualToGraph(gene_id, None)
                    model.addIndividualToGraph(discontinued_gene_id,
                                               discontinued_symbol,
                                               ind_category=blv.terms['Gene'])
                    model.addDeprecatedIndividual(
                        discontinued_gene_id, [gene_id],
                        old_id_category=blv.terms['Gene'])

                # also add the old symbol as a synonym of the new gene
                model.addSynonym(gene_id, discontinued_symbol)

                if not self.test_mode and (limit is not None
                                           and line_counter > limit):
                    break
Exemple #10
0
    def _get_gene_history(self, limit):
        """
        Loops through the gene_history file and adds the old gene ids
        as deprecated classes (or deprecated individuals, depending on the
        entry for the new gene in self.class_or_indiv), where the new gene id
        is the replacement for it.
        The old gene symbol is added as a synonym to the new gene.

        A row is skipped when either gene id is '-', when (in test mode) the
        new gene id is not in self.gene_ids, or when (outside test mode) the
        taxon id is not in self.tax_ids.

        :param limit: outside test mode, stop after more than this many
            records have been processed; None means no limit
        :return: None

        """
        g = self.testgraph if self.testMode else self.graph
        model = Model(g)
        logger.info("Processing Gene records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['gene_history']['file']))
        logger.info("FILE: %s", myfile)
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                line = line.decode().strip()
                # skip blank lines and comments
                if not line or line.startswith('#'):
                    continue

                # Only the first four columns are used.  Slicing (rather than
                # unpacking all five) keeps rows usable when the trailing date
                # column is empty and its tab was removed by strip() above.
                (tax_num, gene_num, discontinued_num,
                 discontinued_symbol) = line.split('\t')[:4]

                # '-' marks a missing id on either side of the replacement
                if gene_num == '-' or discontinued_num == '-':
                    continue

                if self.testMode and int(gene_num) not in self.gene_ids:
                    continue

                if not self.testMode and int(tax_num) not in self.tax_ids:
                    continue

                line_counter += 1
                gene_id = ':'.join(('NCBIGene', gene_num))
                discontinued_gene_id = ':'.join(('NCBIGene', discontinued_num))

                # add the two genes
                if self.class_or_indiv.get(gene_id) == 'C':
                    model.addClassToGraph(gene_id, None)
                    model.addClassToGraph(
                        discontinued_gene_id, discontinued_symbol)

                    # add the new gene id to replace the old gene id
                    model.addDeprecatedClass(discontinued_gene_id, [gene_id])
                else:
                    model.addIndividualToGraph(gene_id, None)
                    model.addIndividualToGraph(
                        discontinued_gene_id, discontinued_symbol)
                    model.addDeprecatedIndividual(
                        discontinued_gene_id, [gene_id])

                # also add the old symbol as a synonym of the new gene
                model.addSynonym(gene_id, discontinued_symbol)

                if (not self.testMode) and \
                        (limit is not None and line_counter > limit):
                    break
Exemple #11
0
    def _process_genes(self, limit=None):
        """
        Reads the HGNC genes file and adds one set of statements per gene row.

        For each row this:
        * deprecates the HGNC id when locus_type is 'withdrawn'; skips rows
          whose symbol ends in '@'; otherwise adds the gene class (only when
          self.resolve() returns something other than the raw locus_type)
          and makes it a leader
        * adds equivalences to the NCBIGene, ENSEMBL and (gene-typed) OMIM ids
        * adds the taxon, the 'is_about' publications and, when the location
          can be parsed, the chromosome or chromosome band as the parent
          feature of the gene

        In test mode, rows with an entrez id that is not in self.gene_ids
        are skipped.

        :param limit: outside test mode, stop once the csv reader's line
            number exceeds this value; None means no limit
        :return: None
        """
        graph = self.testgraph if self.test_mode else self.graph

        geno = Genotype(graph)
        model = Model(graph)

        raw = '/'.join((self.rawdir, self.files['genes']['file']))
        col = self.files['genes']['columns']
        LOG.info("Processing HGNC genes")

        # NOTE(review): '$' inside a character class is a literal dollar sign,
        # not end-of-string, so a bare chromosome without an arm (e.g. 'X')
        # does not match -- confirm this is intended.
        chr_pattern = re.compile(r'(\d+|X|Y|Z|W|MT)[pq$]')
        band_pattern = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')

            row = next(filereader)
            # the outcome of the header check is not acted upon here
            self.check_fileheader(col, row)

            # Only these columns are consumed; resolve their positions once
            # rather than calling col.index() for every row.
            idx = {
                name: col.index(name) for name in (
                    'hgnc_id', 'symbol', 'name', 'locus_type', 'location',
                    'entrez_id', 'ensembl_gene_id', 'pubmed_id', 'omim_id')
            }

            for row in filereader:
                hgnc_id = row[idx['hgnc_id']].strip()
                symbol = row[idx['symbol']].strip()
                name = row[idx['name']].strip()
                locus_type = row[idx['locus_type']].strip()
                location = row[idx['location']].strip()
                entrez_id = row[idx['entrez_id']].strip()
                ensembl_gene_id = row[idx['ensembl_gene_id']].strip()
                pubmed_ids = row[idx['pubmed_id']].strip()  # pipe separated!
                omim_ids = row[idx['omim_id']].strip()  # pipe separated!

                if self.test_mode and entrez_id != '' and \
                        entrez_id not in self.gene_ids:
                    continue

                if name == '':
                    name = None

                if locus_type == 'withdrawn':
                    model.addDeprecatedClass(hgnc_id)
                elif symbol.endswith('@'):
                    # region (HOX), RNA cluster, gene (PCDH);
                    # endswith() also tolerates an empty symbol,
                    # where symbol[-1] raised IndexError
                    continue
                else:
                    gene_type_id = self.resolve(locus_type, mandatory=False)
                    # only add the class when the locus type was translated
                    if gene_type_id != locus_type:
                        model.addClassToGraph(hgnc_id, symbol, gene_type_id,
                                              name)
                    model.makeLeader(hgnc_id)

                if entrez_id != '':
                    model.addEquivalentClass(hgnc_id, 'NCBIGene:' + entrez_id)

                if ensembl_gene_id != '':
                    model.addEquivalentClass(hgnc_id,
                                             'ENSEMBL:' + ensembl_gene_id)

                for omim_id in omim_ids.split('|'):
                    if omim_id in self.omim_replaced:
                        repl = self.omim_replaced[omim_id]
                        LOG.warning('%s is replaced with %s', omim_id, repl)
                        for omim in repl:
                            # .get(): a replacement id missing from the type
                            # map is ignored instead of raising KeyError
                            if self.omim_type.get(omim) == \
                                    self.globaltt['gene']:
                                omim_id = omim

                    if omim_id in self.omim_type and \
                            self.omim_type[omim_id] == self.globaltt['gene']:
                        model.addEquivalentClass(hgnc_id, 'OMIM:' + omim_id)

                geno.addTaxon(self.hs_txid, hgnc_id)

                # add pubs as "is about"
                for pubmed_id in pubmed_ids.split('|'):
                    pubmed_id = pubmed_id.strip()
                    if not pubmed_id:
                        # ''.split('|') yields [''], which would otherwise
                        # produce a triple for the bare curie 'PMID:'
                        continue
                    graph.addTriple('PMID:' + pubmed_id,
                                    self.globaltt['is_about'], hgnc_id)

                # add chr location
                # sometimes two are listed, like: 10p11.2 or 17q25
                # -- there are only 2 of these FRA10A and MPFD
                # sometimes listed like "1 not on reference assembly"
                # sometimes listed like 10q24.1-q24.3
                # sometimes like 11q11 alternate reference locus
                chr_match = chr_pattern.match(location)
                if chr_match is not None and chr_match.groups():
                    chrom = chr_match.group(1)
                    chrom_id = makeChromID(chrom, self.hs_txid, 'CHR')
                    band_match = band_pattern.search(location)
                    feat = Feature(graph, hgnc_id, None, None)
                    if band_match is not None and band_match.groups():
                        band = chrom + band_match.group(1)
                        # add the chr band as the parent to this gene
                        # as a feature but assume that the band is created
                        # as a class with properties elsewhere in Monochrom
                        band_id = makeChromID(band, self.hs_txid, 'CHR')
                        model.addClassToGraph(band_id, None)
                        feat.addSubsequenceOfFeature(band_id)
                    else:
                        model.addClassToGraph(chrom_id, None)
                        feat.addSubsequenceOfFeature(chrom_id)

                if not self.test_mode and limit is not None and \
                        filereader.line_num > limit:
                    break
Exemple #12
0
    def _process_genes(self, limit=None):
        """
        Walk the HGNC genes file and add each gene row to the graph.

        Every row (the header included) must unpack into exactly the 49
        columns listed below.  For each data row the gene is added as a
        class typed via self._get_gene_type(locus_type); withdrawn genes
        are deprecated and all others are made leaders.  Equivalences to
        NCBIGene, ENSEMBL and single-valued non-disease OMIM ids, the human
        taxon, 'is_about' publications and the chromosome (band) parent
        feature are added as well.

        In test mode, rows with an entrez id that is not in self.gene_ids
        are skipped.

        :param limit: outside test mode, stop once the running row count
            (header included) exceeds this value; None means no limit
        :return: None
        """
        target_graph = self.testgraph if self.testMode else self.graph

        geno = Genotype(target_graph)
        model = Model(target_graph)
        raw = '/'.join((self.rawdir, self.files['genes']['file']))
        logger.info("Processing HGNC genes")

        human_taxon = 'NCBITaxon:9606'
        chrom_re = re.compile(r'(\d+|X|Y|Z|W|MT)[pq$]')
        band_re = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            # column names can be listed with:
            # curl -s ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt | head -1 | tr '\t' '\n' | grep -n  .
            for line_counter, row in enumerate(filereader, start=1):
                (hgnc_id, symbol, name, locus_group, locus_type, status,
                 location, location_sortable, alias_symbol, alias_name,
                 prev_symbol, prev_name, gene_family, gene_family_id,
                 date_approved_reserved, date_symbol_changed,
                 date_name_changed, date_modified, entrez_id,
                 ensembl_gene_id, vega_id, ucsc_id, ena, refseq_accession,
                 ccds_id, uniprot_ids, pubmed_id, mgd_id, rgd_id, lsdb,
                 cosmic, omim_id, mirbase, homeodb, snornabase,
                 bioparadigms_slc, orphanet, pseudogene_org, horde_id,
                 merops, imgt, iuphar, kznf_gene_catalog, mamit_trnadb, cd,
                 lncrnadb, enzyme_id, intermediate_filament_db,
                 rna_central_ids) = row

                if line_counter == 1:
                    # the first row is the header
                    continue

                if (self.testMode and entrez_id
                        and int(entrez_id) not in self.gene_ids):
                    continue

                if not name:
                    name = None
                model.addClassToGraph(
                    hgnc_id, symbol, self._get_gene_type(locus_type), name)

                if locus_type == 'withdrawn':
                    model.addDeprecatedClass(hgnc_id)
                else:
                    model.makeLeader(hgnc_id)

                if entrez_id:
                    model.addEquivalentClass(
                        hgnc_id, 'NCBIGene:' + entrez_id)
                if ensembl_gene_id:
                    model.addEquivalentClass(
                        hgnc_id, 'ENSEMBL:' + ensembl_gene_id)
                # multi-valued (pipe separated) OMIM fields are ignored
                if omim_id and "|" not in omim_id:
                    omim_curie = 'OMIM:' + omim_id
                    if not DipperUtil.is_omim_disease(omim_curie):
                        model.addEquivalentClass(hgnc_id, omim_curie)

                geno.addTaxon(human_taxon, hgnc_id)

                # publications are pipe separated; link each as "is about"
                if pubmed_id:
                    for pmid in pubmed_id.strip().split('|'):
                        if pmid:
                            target_graph.addTriple(
                                'PMID:' + pmid.strip(),
                                model.object_properties['is_about'], hgnc_id)

                # chromosome location, which comes in several shapes:
                #   two listed, like: 10p11.2 or 17q25
                #     (only FRA10A and MPFD)
                #   "1 not on reference assembly"
                #   a range like 10q24.1-q24.3
                #   "11q11 alternate reference locus"
                chr_match = chrom_re.match(location)
                if chr_match is not None and chr_match.groups():
                    chrom = chr_match.group(1)
                    chrom_id = makeChromID(chrom, human_taxon, 'CHR')
                    band_match = band_re.search(location)
                    gene_feature = Feature(target_graph, hgnc_id, None, None)
                    if band_match is not None and band_match.groups():
                        # the band becomes the parent feature of the gene;
                        # the band class itself is assumed to be fully
                        # described elsewhere (Monochrom)
                        band_id = makeChromID(
                            chrom + band_match.group(1), human_taxon, 'CHR')
                        model.addClassToGraph(band_id, None)
                        gene_feature.addSubsequenceOfFeature(band_id)
                    else:
                        model.addClassToGraph(chrom_id, None)
                        gene_feature.addSubsequenceOfFeature(chrom_id)

                if (not self.testMode
                        and limit is not None and line_counter > limit):
                    break
Exemple #13
0
    def _get_gene_history(self, limit):
        """
        Loops through the gene_history file and adds the old gene ids
        as deprecated classes (or deprecated individuals, depending on the
        entry for the new gene in self.class_or_indiv), where the new gene id
        is the replacement for it.
        The old gene symbol is added as a synonym to the new gene.

        A row is skipped when either gene id is '-', when (in test mode) the
        new gene id is not in self.gene_ids, or when (outside test mode) the
        taxon id is not in self.tax_ids.

        :param limit: outside test mode, stop after more than this many
            records have been processed; None means no limit
        :return: None

        """
        src_key = 'gene_history'
        graph = self.testgraph if self.test_mode else self.graph
        model = Model(graph)
        LOG.info("Processing Gene records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files[src_key]['file']))
        LOG.info("FILE: %s", myfile)
        col = self.files[src_key]['columns']
        with gzip.open(myfile, 'rb') as tsv:
            row = tsv.readline().decode().strip().split('\t')
            row[0] = row[0][1:]  # strip comment
            # the outcome of the header check is not acted upon here
            self.check_fileheader(col, row)

            # resolve the column positions once instead of once per row
            # (the Discontinue_Date column is not used)
            tax_idx = col.index('tax_id')
            gene_idx = col.index('GeneID')
            discontinued_idx = col.index('Discontinued_GeneID')
            symbol_idx = col.index('Discontinued_Symbol')

            for line in tsv:
                text = line.decode().strip()
                # skip blank lines (which used to raise IndexError) and comments
                if not text or text.startswith('#'):
                    continue
                row = text.split('\t')

                tax_num = row[tax_idx].strip()
                gene_num = row[gene_idx].strip()
                discontinued_num = row[discontinued_idx].strip()
                discontinued_symbol = row[symbol_idx].strip()

                # '-' marks a missing id on either side of the replacement
                if gene_num == '-' or discontinued_num == '-':
                    continue

                if self.test_mode and gene_num not in self.gene_ids:
                    continue

                if not self.test_mode and tax_num not in self.tax_ids:
                    continue

                line_counter += 1
                gene_id = ':'.join(('NCBIGene', gene_num))
                discontinued_gene_id = ':'.join(('NCBIGene', discontinued_num))
                # add the two genes
                if self.class_or_indiv.get(gene_id) == 'C':
                    model.addClassToGraph(gene_id, None)
                    model.addClassToGraph(discontinued_gene_id, discontinued_symbol)

                    # add the new gene id to replace the old gene id
                    model.addDeprecatedClass(discontinued_gene_id, [gene_id])
                else:
                    model.addIndividualToGraph(gene_id, None)
                    model.addIndividualToGraph(
                        discontinued_gene_id, discontinued_symbol)
                    model.addDeprecatedIndividual(discontinued_gene_id, [gene_id])

                # also add the old symbol as a synonym of the new gene
                model.addSynonym(gene_id, discontinued_symbol)

                if not self.test_mode and (limit is not None and line_counter > limit):
                    break