Esempio n. 1
0
    def _process_straininfo(self, limit):
        """
        Read the strain-info file and add each strain to the graph.

        Every row is added as an individual typed with the mouse taxon
        (NCBITaxon:10090).  A non-blank short name becomes a synonym,
        vendor stock numbers are linked either as the same individual
        (vendors 'J' and 'Rbrc') or as xrefs (any other vendor), and a
        non-blank panel is recorded as a description.  In test mode only
        strains whose 'MPD:' identifier is in self.test_ids are kept.

        :param limit: not consulted by this method
        :return: None
        """
        graph = self.testgraph if self.testMode else self.graph
        model = Model(graph)

        logger.info("Processing measurements ...")
        raw = '/'.join((self.rawdir, self.files['straininfo']['file']))

        tax_id = 'NCBITaxon:10090'

        with open(raw, 'r') as infile:
            reader = csv.reader(infile, delimiter=',', quotechar='\"')
            # the first physical line is the header; validate it, then
            # let the csv reader continue from the second line
            self.check_header(
                self.files['straininfo']['file'], infile.readline())
            for fields in reader:
                # example row:
                # C57BL/6J,J,000664,,7,IN,225,17,,http://jaxmice.jax.org/strain/000664.html
                (strain_name, vendor, stocknum, panel, mpd_strainid,
                 _straintype, _n_proj, _n_snp_datasets, mpdshortname,
                 url) = fields

                if self.testMode and \
                        'MPD:' + str(mpd_strainid) not in self.test_ids:
                    continue

                # the strain is an instance of the taxon
                strain_id = 'MPD-strain:' + str(mpd_strainid)
                model.addIndividualToGraph(strain_id, strain_name, tax_id)

                short_name = mpdshortname.strip()
                if short_name != '':
                    model.addSynonym(strain_id, short_name)
                self.idlabel_hash[strain_id] = strain_name

                # tie the strain to the vendor + stock number, if any
                has_stock = stocknum != ''
                if has_stock and vendor == 'J':
                    model.addSameIndividual(strain_id, 'JAX:'+stocknum)
                elif has_stock and vendor == 'Rbrc':
                    # RIKEN stock numbers may already carry the prefix
                    model.addSameIndividual(
                        strain_id, 'RBRC:'+re.sub(r'RBRC', '', stocknum))
                elif has_stock:
                    if url != '':
                        model.addXref(strain_id, url, True)
                    if vendor != '':
                        model.addXref(
                            strain_id, ':'.join((vendor, stocknum)), True)

                # record the panel name as free text
                if panel != '':
                    model.addDescription(strain_id, panel+' [panel]')

                # TODO make the panels as a resource collection

        return
Esempio n. 2
0
    def process_gene_ids(self, limit):
        """
        Read the gzipped, comma-delimited gene-id file and add each gene
        to the graph.

        Each row yields a 'WormBase:' gene class labelled with the gene
        symbol (falling back to the synonym, or no label when both are
        blank), a taxon link, an optional synonym, and a deprecation
        when the row's status column is 'Dead'.  In test mode only genes
        listed in self.test_ids['gene'] are kept.

        :param limit: outside of test mode, stop once more than this many
            rows have been read; None means no limit
        :return: None
        """
        raw = '/'.join((self.rawdir, self.files['gene_ids']['file']))

        graph = self.testgraph if self.testMode else self.graph

        model = Model(graph)
        logger.info("Processing: %s", self.files['gene_ids']['file'])
        geno = Genotype(graph)
        with gzip.open(raw, 'rb') as gzfile:
            rows = csv.reader(
                io.TextIOWrapper(gzfile, newline=""), delimiter=',',
                quotechar='\"')
            for row_count, fields in enumerate(rows, start=1):
                # example row:
                # 6239,WBGene00000001,aap-1,Y110A7A.10,Live,protein_coding_gene
                (taxon_num, gene_num, gene_symbol, gene_synonym,
                 live, _gene_type) = fields

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                taxon_id = 'NCBITaxon:'+taxon_num
                gene_id = 'WormBase:'+gene_num
                # prefer the symbol, then the synonym, else leave unlabeled
                label = gene_symbol or gene_synonym or None
                model.addClassToGraph(
                    gene_id, label, Genotype.genoparts['gene'])
                if live == 'Dead':
                    model.addDeprecatedClass(gene_id)
                geno.addTaxon(taxon_id, gene_id)
                if gene_synonym:
                    model.addSynonym(gene_id, gene_synonym)

                limit_reached = limit is not None and row_count > limit
                if limit_reached and not self.testMode:
                    break

        return
Esempio n. 3
0
    def _get_titles(self, limit):
        """
        Add a class for every GeneReviews book listed in the titles file.

        The file is read as tab-delimited latin-1 text.  Its first row is
        a header that must have exactly four columns, and every data row
        is unpacked in this order:
            GR_shortname, GR_Title, NBK_id, PMID

        For each row the NBK number is always remembered in
        self.book_ids; the class (labelled with the title, with the short
        name as a synonym) is only added while the line count, which
        includes the header line, is below ``limit``.

        The process exits with status -1 when the header or any row does
        not have the expected number of columns.

        :param limit: stop adding classes once the line count reaches
            this value; None means no limit
        :return: None
        """
        raw = '/'.join((self.rawdir, self.files['titles']['file']))
        model = Model(self.graph)
        with open(raw, 'r', encoding='latin-1') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            header = next(filereader)
            line_counter = 1  # the header counts as the first line
            colcount = len(header)
            if colcount != 4:  # ('GR_shortname', 'GR_Title', 'NBK_id', 'PMID')
                # the message needs a %s placeholder, otherwise logging
                # fails to format it and the header is never reported
                logger.error("Unexpected Header %s", header)
                exit(-1)
            for row in filereader:
                line_counter += 1
                if len(row) != colcount:
                    logger.error("Unexpected row. got: %s", row)
                    logger.error("Expected data for: %s", header)
                    exit(-1)
                (shortname, title, nbk_num, _pmid) = row
                gr_id = 'GeneReviews:'+nbk_num

                self.book_ids.add(nbk_num)  # a global set of the book nums

                if limit is None or line_counter < limit:
                    model.addClassToGraph(gr_id, title)
                    model.addSynonym(gr_id, shortname)
                # TODO include the new PMID?

        return
Esempio n. 4
0
    def _get_titles(self, limit):
        """
        Add a class for every GeneReviews book listed in the titles file.

        The file is read as tab-delimited latin-1 text.  The first
        character of the first header cell is dropped (presumably a
        leading '#' -- confirm against the source file) and the header is
        then validated with self.check_fileheader against:
            GR_shortname, GR_Title, NBK_id, PMID

        Every NBK number is recorded in self.book_ids.  The class
        (labelled with the title, with the short name as a synonym) is
        only added while the reader's line number is below ``limit``.

        The process exits with status -1 when the header check fails or
        a row has the wrong number of columns.

        :param limit: stop adding classes once the reader's line number
            reaches this value; None means no limit
        :return: None
        """
        raw = '/'.join((self.rawdir, self.files['titles']['file']))

        model = Model(self.graph)
        col = ['GR_shortname', 'GR_Title', 'NBK_id', 'PMID']
        # resolve the column positions once instead of once per row
        shortname_at = col.index('GR_shortname')
        title_at = col.index('GR_Title')
        nbk_at = col.index('NBK_id')
        with open(raw, 'r', encoding='latin-1') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            header = next(filereader)
            header[0] = header[0][1:]
            expected_width = len(col)
            if not self.check_fileheader(col, header):
                exit(-1)
            for fields in filereader:
                if len(fields) != expected_width:
                    LOG.error("Unexpected row. got: %s", fields)
                    LOG.error("Expected data for: %s", col)
                    exit(-1)
                nbk_num = fields[nbk_at]
                gr_id = 'GeneReviews:' + nbk_num
                self.book_ids.add(nbk_num)  # a global set of the book nums
                if limit is not None and not filereader.line_num < limit:
                    continue
                model.addClassToGraph(gr_id, fields[title_at])
                model.addSynonym(gr_id, fields[shortname_at])
Esempio n. 5
0
    def _process_diseasegene(self, limit):
        """
        Stream the disease-gene XML file and add disorder-to-gene
        associations to the graph.

        For every ``Disorder`` element this adds the disorder class, each
        associated gene class (with its synonyms), a variant locus
        of that gene flagged as a blank node, a G2P association between
        that locus and the disorder, and an equivalence from the gene to
        each of its Ensembl, HGNC and OMIM external references.
        Associations whose type ``_map_rel_id`` cannot map are logged and
        skipped.  In test mode only disorders listed in the configured
        disease test ids are processed.

        :param limit: outside of test mode, stop once more than this many
            Disorder elements have been seen; None means no limit
        :return: None
        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        line_counter = 0  # number of Disorder elements seen so far
        geno = Genotype(g)
        model = Model(g)

        myfile = '/'.join((self.rawdir, self.files['disease-gene']['file']))

        # external reference sources that are asserted as equivalent
        # classes, mapped to the curie prefix to use; others are skipped
        eq_prefixes = {
            'Ensembl': 'ENSEMBL:',
            'HGNC': 'HGNC:',
            'OMIM': 'OMIM:',
        }

        # PYLINT complains iterparse deprecated,
        # but as of py 3.4 only the optional & unsupplied parse arg is.
        for _event, elem in ET.iterparse(myfile):
            if elem.tag == 'Disorder':
                # count the disorder so that the limit check below can
                # actually fire (the counter used to stay at zero)
                line_counter += 1

                # get the element name and id, ignoreS element name
                # id = elem.get('id') # some internal identifier
                disorder_num = elem.find('OrphaNumber').text

                disorder_id = 'Orphanet:'+str(disorder_num)

                if self.testMode and \
                        disorder_id not in \
                        config.get_config()['test_ids']['disease']:
                    # release the skipped element too, so memory does not
                    # grow with the size of the file in test mode
                    elem.clear()
                    continue

                disorder_label = elem.find('Name').text

                # make a hash of internal gene id to type for later lookup
                gene_iid_to_type = {}
                gene_list = elem.find('GeneList')
                for gene in gene_list.findall('Gene'):
                    gene_iid = gene.get('id')
                    gene_type = gene.find('GeneType').get('id')
                    gene_iid_to_type[gene_iid] = gene_type

                # assuming that these are in the ontology
                model.addClassToGraph(disorder_id, disorder_label)

                assoc_list = elem.find('DisorderGeneAssociationList')
                for a in assoc_list.findall('DisorderGeneAssociation'):
                    gene_iid = a.find('.//Gene').get('id')
                    gene_name = a.find('.//Gene/Name').text
                    gene_symbol = a.find('.//Gene/Symbol').text
                    gene_num = a.find('./Gene/OrphaNumber').text
                    gene_id = 'Orphanet:'+str(gene_num)
                    gene_type_id = \
                        self._map_gene_type_id(gene_iid_to_type[gene_iid])
                    model.addClassToGraph(
                        gene_id, gene_symbol, gene_type_id, gene_name)
                    syn_list = a.find('./Gene/SynonymList')
                    if int(syn_list.get('count')) > 0:
                        for s in syn_list.findall('./Synonym'):
                            model.addSynonym(gene_id, s.text)

                    dgtype = a.find('DisorderGeneAssociationType').get('id')
                    rel_id = self._map_rel_id(dgtype)
                    dg_label = \
                        a.find('./DisorderGeneAssociationType/Name').text
                    if rel_id is None:
                        logger.warning(
                            "Cannot map association type (%s) to RO "
                            "for association (%s | %s).  Skipping.",
                            dg_label, disorder_label, gene_symbol)
                        continue

                    alt_locus_id = '_:'+gene_num+'-'+disorder_num+'VL'
                    alt_label = \
                        ' '.join(('some variant of', gene_symbol.strip(),
                                  'that is a', dg_label.lower(),
                                  disorder_label))

                    model.addIndividualToGraph(alt_locus_id, alt_label,
                                               geno.genoparts['variant_locus'])
                    geno.addAffectedLocus(alt_locus_id, gene_id)
                    model.addBlankNodeAnnotation(alt_locus_id)

                    # consider typing the gain/loss-of-function variants like:
                    # http://sequenceontology.org/browser/current_svn/term/SO:0002054
                    # http://sequenceontology.org/browser/current_svn/term/SO:0002053

                    # use "assessed" status to issue an evidence code
                    # FIXME I think that these codes are sub-optimal
                    status_code = \
                        a.find('DisorderGeneAssociationStatus').get('id')
                    # imported automatically asserted information
                    # used in automatic assertion
                    eco_id = 'ECO:0000323'
                    # Assessed
                    # TODO are these internal ids stable between releases?
                    if status_code == '17991':
                        # imported manually asserted information
                        # used in automatic assertion
                        eco_id = 'ECO:0000322'
                    # Non-traceable author statement ECO_0000034
                    # imported information in automatic assertion ECO_0000313

                    assoc = G2PAssoc(g, self.name, alt_locus_id,
                                     disorder_id, rel_id)
                    assoc.add_evidence(eco_id)
                    assoc.add_association_to_graph()

                    rlist = a.find('./Gene/ExternalReferenceList')
                    for r in rlist.findall('ExternalReference'):
                        prefix = eq_prefixes.get(r.find('Source').text)
                        if prefix is None:
                            # skip the others for now; previously such a
                            # reference re-asserted the id left over from
                            # the preceding reference
                            continue
                        eqid = prefix+r.find('Reference').text
                        model.addClassToGraph(eqid, None)
                        model.addEquivalentClass(gene_id, eqid)
                elem.clear()  # empty the element

            # same convention as the other loaders: the limit applies to
            # regular runs, test mode is restricted by the test ids instead
            if not self.testMode \
                    and limit is not None and line_counter > limit:
                return

        return
Esempio n. 6
0
    def _transform_entry(self, e, graph):
        """
        Add one OMIM record to the graph.

        A record whose status is 'removed' is only marked as deprecated.
        Any other record is added as a class with its cleaned-up label,
        the original label, alternate titles and abbreviation as
        synonyms, and its description as the definition.  When the record
        has a gene map, the mapped feature (an NCBI gene, the OMIM id
        itself, or an anonymous feature) additionally gets its comment
        and, when both a cytogenetic location and a chromosome symbol
        are present, its chromosome, start/end position and cytogenetic
        location.  Records with status 'moved' are deprecated in favour
        of the ids listed in 'movedTo'.  Finally the phenotypic series,
        mapped ids, mapped gene ids, publications and allelic variants
        helpers are invoked for the record.

        :param e: dict whose 'entry' value holds the OMIM record; this
            method reads 'mimNumber', 'titles', 'status' and, when
            present, 'geneMapExists', 'geneMap' and 'movedTo'
        :param graph: the graph that all triples are added to
        :return: None
        """
        g = graph
        model = Model(g)
        geno = Genotype(graph)

        tax_num = '9606'
        tax_id = 'NCBITaxon:9606'
        tax_label = 'Human'
        build_num = "GRCh38"
        build_id = "NCBIGenome:"+build_num

        entry = e['entry']

        # get the numbers, labels, and descriptions
        omimnum = entry['mimNumber']
        titles = entry['titles']
        label = titles['preferredTitle']

        other_labels = []
        if 'alternativeTitles' in titles:
            other_labels += self._get_alt_labels(titles['alternativeTitles'])
        if 'includedTitles' in titles:
            other_labels += self._get_alt_labels(titles['includedTitles'])

        # add synonyms of alternate labels
        # preferredTitle": "PFEIFFER SYNDROME",
        # "alternativeTitles":
        #   "ACROCEPHALOSYNDACTYLY, TYPE V; ACS5;;\nACS V;;\nNOACK SYNDROME",
        # "includedTitles":
        #   "CRANIOFACIAL-SKELETAL-DERMATOLOGIC DYSPLASIA, INCLUDED"

        # remove the abbreviation (comes after the ;) from the preferredTitle,
        # and add it as a synonym
        abbrev = None
        label_parts = label.split(';')
        if len(label_parts) > 1:
            abbrev = label_parts[1].strip()
        newlabel = self._cleanup_label(label)

        description = self._get_description(entry)
        omimid = 'OMIM:'+str(omimnum)

        if entry['status'] == 'removed':
            model.addDeprecatedClass(omimid)
        else:
            omimtype = self._get_omimtype(entry)
            nodelabel = newlabel
            # this uses our cleaned-up label
            if omimtype == Genotype.genoparts['heritable_phenotypic_marker']:
                if abbrev is not None:
                    nodelabel = abbrev
                # in this special case,
                # make it a disease by not declaring it as a gene/marker
                model.addClassToGraph(omimid, nodelabel, None, newlabel)
            elif omimtype == Genotype.genoparts['gene']:
                if abbrev is not None:
                    nodelabel = abbrev
                model.addClassToGraph(omimid, nodelabel, omimtype, newlabel)
            else:
                model.addClassToGraph(omimid, newlabel, omimtype)

            # add the original screaming-caps OMIM label as a synonym
            model.addSynonym(omimid, label)

            # add the alternate labels and includes as synonyms
            for other_label in other_labels:
                model.addSynonym(omimid, other_label, 'OIO:hasRelatedSynonym')

            # for OMIM, we're adding the description as a definition
            model.addDefinition(omimid, description)
            if abbrev is not None:
                model.addSynonym(omimid, abbrev, 'OIO:hasRelatedSynonym')

            # if this is a genetic locus (but not sequenced)
            #   then add the chrom loc info
            # but add it to the ncbi gene identifier,
            # not to the omim id (we reserve the omim id to be the phenotype)
            feature_id = None
            feature_label = None
            if 'geneMapExists' in entry and entry['geneMapExists']:
                genemap = entry['geneMap']
                is_gene = False

                if omimtype == \
                        Genotype.genoparts['heritable_phenotypic_marker']:
                    # get the ncbigene ids
                    ncbifeature = self._get_mapped_gene_ids(entry, g)
                    if len(ncbifeature) == 1:
                        feature_id = 'NCBIGene:'+str(ncbifeature[0])
                        # add this feature as a cause for the omim disease
                        # TODO SHOULD I EVEN DO THIS HERE?
                        assoc = G2PAssoc(g, self.name, feature_id, omimid)
                        assoc.add_association_to_graph()

                    elif len(ncbifeature) > 1:
                        logger.info(
                            "Its ambiguous when %s maps to >1 gene id: %s",
                            omimid, str(ncbifeature))
                    else:  # no ncbi feature, make an anonymous one
                        feature_id = self._make_anonymous_feature(str(omimnum))
                        feature_label = abbrev

                elif omimtype == Genotype.genoparts['gene']:
                    feature_id = omimid
                    is_gene = True
                else:
                    # 158900 falls into this category
                    feature_id = self._make_anonymous_feature(str(omimnum))
                    if abbrev is not None:
                        feature_label = abbrev
                    omimtype = \
                        Genotype.genoparts[
                            'heritable_phenotypic_marker']

                if feature_id is not None:
                    if 'comments' in genemap:
                        # add a comment to this feature
                        comment = genemap['comments']
                        if comment.strip() != '':
                            model.addDescription(feature_id, comment)
                    if 'cytoLocation' in genemap:
                        cytoloc = genemap['cytoLocation']
                        # parse the cytoloc.
                        # add this omim thing as
                        # a subsequence of the cytofeature
                        # 18p11.3-p11.2
                        # FIXME
                        # add the other end of the range,
                        # but not sure how to do that
                        # not sure if saying subsequence of feature
                        # is the right relationship

                        f = Feature(g, feature_id, feature_label, omimtype)
                        # NOTE: the feature is only written to the graph
                        # when a chromosome symbol is present as well
                        if 'chromosomeSymbol' in genemap:
                            chrom_num = str(genemap['chromosomeSymbol'])
                            chrom = makeChromID(chrom_num, tax_num, 'CHR')
                            geno.addChromosomeClass(
                                chrom_num, tax_id, tax_label)

                            # add the positional information, if available
                            # (-1 marks a coordinate that was not supplied)
                            fstart = fend = -1
                            if 'chromosomeLocationStart' in genemap:
                                fstart = genemap['chromosomeLocationStart']
                            if 'chromosomeLocationEnd' in genemap:
                                fend = genemap['chromosomeLocationEnd']
                            if fstart >= 0:
                                # make the build-specific chromosome
                                chrom_in_build = makeChromID(chrom_num,
                                                             build_num,
                                                             'MONARCH')
                                # then, add the chromosome instance
                                # (from the given build)
                                geno.addChromosomeInstance(
                                    chrom_num, build_id, build_num, chrom)
                                if omimtype == \
                                        Genotype.genoparts[
                                            'heritable_phenotypic_marker']:
                                    postypes = [Feature.types['FuzzyPosition']]
                                else:
                                    postypes = None
                                # NOTE that no strand information
                                # is available in the API
                                f.addFeatureStartLocation(
                                    fstart, chrom_in_build, None, postypes)
                                if fend >= 0:
                                    f.addFeatureEndLocation(
                                        fend, chrom_in_build, None, postypes)
                                    # only compare real coordinates; a
                                    # missing end (-1) is not an inversion
                                    if fstart > fend:
                                        logger.info(
                                            "start>end (%d>%d) for %s",
                                            fstart, fend, omimid)
                            # add the cytogenic location too
                            # for now, just take the first one
                            cytoloc = cytoloc.split('-')[0]
                            loc = makeChromID(cytoloc, tax_num, 'CHR')
                            model.addClassToGraph(loc, None)
                            f.addSubsequenceOfFeature(loc)
                            f.addFeatureToGraph(True, None, is_gene)

                # end adding causative genes/features

            # check if moved, if so,
            # make it deprecated and
            # replaced consider class to the other thing(s)
            # some entries have been moved to multiple other entries and
            # use the joining raw word "and"
            # 612479 is movedto:  "603075 and 603029"  OR
            # others use a comma-delimited list, like:
            # 610402 is movedto: "609122,300870"
            if entry['status'] == 'moved':
                # split on either delimiter in one pass; a value with
                # neither yields a list of one, and a value mixing both
                # no longer leaves commas inside an id
                moved_to = str(entry['movedTo'])
                # cleanup whitespace and add OMIM prefix to numeric portion
                fixedids = [
                    'OMIM:'+part.strip()
                    for part in re.split(r'and|,', moved_to)]

                model.addDeprecatedClass(omimid, fixedids)

            self._get_phenotypicseries_parents(entry, g)
            self._get_mappedids(entry, g)
            self._get_mapped_gene_ids(entry, g)

            self._get_pubs(entry, g)

            self._get_process_allelic_variants(entry, g)  # temp gag

        return
Esempio n. 7
0
    def _process_nlx_157874_1_view(self, raw, limit=None):
        """
        This table contains the Elements of Morphology data that has been
        screen-scraped into DISCO.
        Note that foaf:depiction is inverse of foaf:depicts relationship.

        Since it is bad form to have two definitions,
        we concatenate the two into one string.

        Triples:
            <eom id> a owl:Class
                rdfs:label Literal(eom label)
                OIO:hasExactSynonym Literal(each ;-delimited synonym)
                OIO:hasRelatedSynonym Literal(each ;-delimited replaced term)
                IAO:definition Literal(objective_def.  subjective def.)
                foaf:depiction Literal(small_image_url),
                               Literal(large_image_url)
                foaf:page Literal(page_url)
                rdfs:comment Literal(long commented text)

        The file is tab-delimited; its first line is a header and is
        skipped.  Every other row must have exactly 20 columns.

        :param raw: path of the file to read
        :param limit: stop once more than this many rows have been
            processed; None means no limit
        :return: None
        """

        def _as_sentence(text):
            # Strip the text and terminate non-empty text with a period.
            # Stripping first (and testing the last character rather than
            # matching '.+\.$') keeps multi-line text, or text with
            # trailing whitespace, from getting a second period.
            text = text.strip()
            if text != '' and not text.endswith('.'):
                text += '.'
            return text

        model = Model(self.graph)
        line_counter = 0
        with open(raw, 'r') as f1:
            f1.readline()  # read the header row; skip
            filereader = csv.reader(f1, delimiter='\t', quotechar='\"')
            for line in filereader:
                line_counter += 1
                (morphology_term_id, morphology_term_num,
                 morphology_term_label, morphology_term_url,
                 terminology_category_label, terminology_category_url,
                 subcategory, objective_definition, subjective_definition,
                 comments, synonyms, replaces, small_figure_url,
                 large_figure_url, e_uid, v_uid, v_uuid,
                 v_last_modified, v_status, v_lastmodified_epoch) = line

                # note:
                # e_uid v_uid v_uuid v_last_modified v_status
                # v_lastmodified_epoch terminology_category_label
                # terminology_category_url subcategory morphology_term_num
                # are currently unused.

                # Add morphology term to graph as a class with its label.
                model.addClassToGraph(morphology_term_id,
                                      morphology_term_label)

                # Assemble the definition text: objective part first,
                # then the subjective part, each ending in a period.
                definition = \
                    '  '.join(
                        (_as_sentence(objective_definition),
                         _as_sentence(subjective_definition))).strip()

                model.addDefinition(morphology_term_id, definition)

                # <term id> FOAF:depicted_by literal url
                # <url> type foaf:depiction

                # do we want both images?
                # morphology_term_id has depiction small_figure_url
                if small_figure_url != '':
                    model.addDepiction(morphology_term_id,
                                       small_figure_url)

                # morphology_term_id has depiction large_figure_url
                if large_figure_url != '':
                    model.addDepiction(morphology_term_id,
                                       large_figure_url)

                # morphology_term_id has comment comments
                if comments != '':
                    model.addComment(morphology_term_id,
                                     comments.strip())

                # morphology_term_id hasExactSynonym synonyms (; delimited)
                if synonyms != '':
                    for s in synonyms.split(';'):
                        model.addSynonym(
                            morphology_term_id, s.strip(),
                            model.annotation_properties['hasExactSynonym'])

                # morphology_term_id hasRelatedSynonym replaces (; delimited)
                # skipped when it merely repeats the synonyms column
                if replaces != '' and replaces != synonyms:
                    for s in replaces.split(';'):
                        model.addSynonym(
                            morphology_term_id, s.strip(),
                            model.annotation_properties['hasRelatedSynonym'])

                # morphology_term_id has page morphology_term_url
                reference = Reference(self.graph)
                reference.addPage(morphology_term_id, morphology_term_url)

                if limit is not None and line_counter > limit:
                    break
        return
Esempio n. 8
0
    def process_feature_loc(self, limit):

        raw = '/'.join((self.rawdir, self.files['feature_loc']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)
        logger.info("Processing Feature location and attributes")
        line_counter = 0
        geno = Genotype(g)
        strain_to_variant_map = {}
        build_num = self.version_num
        build_id = 'WormBase:'+build_num
        with gzip.open(raw, 'rb') as csvfile:
            filereader = csv.reader(
                io.TextIOWrapper(csvfile, newline=""), delimiter='\t',
                quotechar='\"')
            for row in filereader:
                if re.match(r'\#', ''.join(row)):
                    continue
                (chrom, db, feature_type_label, start, end, score, strand,
                 phase, attributes) = row

# I	interpolated_pmap_position	gene	1	559768	.	.	.	ID=gmap:spe-13;gmap=spe-13;status=uncloned;Note=-21.3602 cM (+/- 1.84 cM)
# I	WormBase	gene	3747	3909	.	-	.	ID=Gene:WBGene00023193;Name=WBGene00023193;interpolated_map_position=-21.9064;sequence_name=Y74C9A.6;biotype=snoRNA;Alias=Y74C9A.6
# I	absolute_pmap_position	gene	4119	10230	.	.	.	ID=gmap:homt-1;gmap=homt-1;status=cloned;Note=-21.8252 cM (+/- 0.00 cM)

                # dbs = re.split(
                #   r' ', 'assembly_component expressed_sequence_match Coding_transcript Genomic_canonical Non_coding_transcript Orfeome Promoterome Pseudogene RNAi_primary RNAi_secondary Reference Transposon Transposon_CDS cDNA_for_RNAi miRanda ncRNA operon polyA_signal_sequence polyA_site snlRNA')
                #
                # if db not in dbs:
                #     continue

                if feature_type_label not in [
                        'gene', 'point_mutation', 'deletion', 'RNAi_reagent',
                        'duplication', 'enhancer', 'binding_site',
                        'biological_region', 'complex_substitution',
                        'substitution', 'insertion', 'inverted_repeat']:
                    # note biological_regions include balancers
                    # other options here: promoter, regulatory_region, reagent
                    continue
                line_counter += 1

                attribute_dict = {}
                if attributes != '':
                    attribute_dict = dict(
                        item.split("=")for item in
                        re.sub(r'"', '', attributes).split(";"))

                fid = flabel = desc = None
                if 'ID' in attribute_dict:
                    fid = attribute_dict.get('ID')
                    if re.search(r'WB(Gene|Var|sf)', fid):
                        fid = re.sub(r'^\w+:WB', 'WormBase:WB', fid)
                    elif re.match(r'(gmap|landmark)', fid):
                        continue
                    else:
                        logger.info('other identifier %s', fid)
                        fid = None
                elif 'variation' in attribute_dict:
                    fid = 'WormBase:'+attribute_dict.get('variation')
                    flabel = attribute_dict.get('public_name')
                    sub = attribute_dict.get('substitution')
                    ins = attribute_dict.get('insertion')
                    # if it's a variation:
                    # variation=WBVar00604246;public_name=gk320600;strain=VC20384;substitution=C/T
                    desc = ''
                    if sub is not None:
                        desc = 'substitution='+sub
                    if ins is not None:
                        desc = 'insertion='+ins

                    # keep track of the strains with this variation,
                    # for later processing
                    strain_list = attribute_dict.get('strain')
                    if strain_list is not None:
                        for s in re.split(r',', strain_list):
                            if s.strip() not in strain_to_variant_map:
                                strain_to_variant_map[s.strip()] = set()
                            strain_to_variant_map[s.strip()].add(fid)

                # if feature_type_label == 'RNAi_reagent':
                    # Target=WBRNAi00096030 1 4942
                    # this will tell us where the RNAi is actually binding
                    # target = attribute_dict.get('Target') # TODO unused
                    # rnai_num = re.split(r' ', target)[0]  # TODO unused
                    # it will be the reagent-targeted-gene that has a position,
                    # (i think)
                    # TODO finish the RNAi binding location

                name = attribute_dict.get('Name')
                polymorphism = attribute_dict.get('polymorphism')

                if fid is None:
                    if name is not None and re.match(r'WBsf', name):
                        fid = 'WormBase:'+name
                        name = None
                    else:
                        continue

                if self.testMode \
                        and re.sub(r'WormBase:', '', fid) \
                        not in self.test_ids['gene']+self.test_ids['allele']:
                    continue

                # these really aren't that interesting
                if polymorphism is not None:
                    continue

                if name is not None and not re.search(name, fid):
                    if flabel is None:
                        flabel = name
                    else:
                        model.addSynonym(fid, name)

                if desc is not None:
                    model.addDescription(fid, desc)

                alias = attribute_dict.get('Alias')

                biotype = attribute_dict.get('biotype')
                note = attribute_dict.get('Note')
                other_name = attribute_dict.get('other_name')
                for n in [alias, other_name]:
                    if n is not None:
                        model.addSynonym(fid, other_name)

                ftype = self.get_feature_type_by_class_and_biotype(
                    feature_type_label, biotype)

                chr_id = makeChromID(chrom, build_id, 'CHR')
                geno.addChromosomeInstance(chrom, build_id, build_num)

                feature = Feature(g, fid, flabel, ftype)
                feature.addFeatureStartLocation(start, chr_id, strand)
                feature.addFeatureEndLocation(start, chr_id, strand)

                feature_is_class = False
                if feature_type_label == 'gene':
                    feature_is_class = True

                feature.addFeatureToGraph(True, None, feature_is_class)

                if note is not None:
                    model.addDescription(fid, note)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

                # RNAi reagents:
# I	RNAi_primary	RNAi_reagent	4184	10232	.	+	.	Target=WBRNAi00001601 1 6049 +;laboratory=YK;history_name=SA:yk326e10
# I	RNAi_primary	RNAi_reagent	4223	10147	.	+	.	Target=WBRNAi00033465 1 5925 +;laboratory=SV;history_name=MV_SV:mv_G_YK5052
# I	RNAi_primary	RNAi_reagent	5693	9391	.	+	.	Target=WBRNAi00066135 1 3699 +;laboratory=CH

                # TODO TF bindiing sites and network:
# I	TF_binding_site_region	TF_binding_site	1861	2048	.	+	.	Name=WBsf292777;tf_id=WBTranscriptionFactor000025;tf_name=DAF-16
# I	TF_binding_site_region	TF_binding_site	3403	4072	.	+	.	Name=WBsf331847;tf_id=WBTranscriptionFactor000703;tf_name=DPL-1

        return
Esempio n. 9
0
    def _get_gene_info(self, limit):
        """
        Loop through the gzipped gene_info file and add its genes to the graph.

        Each kept row is added as a class, or as an individual when its
        mapped type is 'SO:0000110' (a sequence feature, not a gene).
        The symbol is used as the label (unless it is 'NEWENTRY'),
        the description column is passed along, and the full name,
        the synonyms and the other designations are added as synonyms.
        The dbXrefs column is handed to ``_add_gene_equivalencies``.
        When the chromosome placement is unambiguous (a single chromosome,
        or X|Y), the gene is made a subsequence of the band in ``map_loc``
        if that looks like a regular band, and of the chromosome otherwise.

        In test mode only rows whose gene number is in ``self.gene_ids``
        are kept; otherwise only rows whose taxon is in ``self.tax_ids``.

        :param limit: maximum number of kept rows to add to the graph
            (ignored in test mode); None means no limit.  Rows beyond
            the limit are still read and recorded in ``self.class_or_indiv``
            but nothing is added to the graph for them.
        :return: None

        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        geno = Genotype(g)
        model = Model(g)

        # not unzipping the file
        logger.info("Processing 'Gene Info' records")
        line_counter = 0
        gene_info = '/'.join((self.rawdir, self.files['gene_info']['file']))
        logger.info("FILE: %s", gene_info)
        # Add taxa and genome classes for those in our filter
        for tax_num in self.tax_ids:
            tax_id = ':'.join(('NCBITaxon', str(tax_num)))
            # tax label can get added elsewhere
            geno.addGenome(tax_id, str(tax_num))
            # label added elsewhere
            model.addClassToGraph(tax_id, None)
        with gzip.open(gene_info, 'rb') as f:
            # the first line is the header; it is only used for logging
            row = f.readline().decode().strip().split('\t')
            logger.info("Header has %i columns", len(row))
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match(r'^#', line):
                    continue
                (tax_num, gene_num, symbol, locustag, synonyms, xrefs, chrom,
                 map_loc, desc, gtype, authority_symbol, name,
                 nomenclature_status, other_designations,
                 modification_date, feature_type) = line.split('\t')

                if self.testMode and int(gene_num) not in self.gene_ids:
                    continue

                if not self.testMode and int(tax_num) not in self.tax_ids:
                    continue

                line_counter += 1

                gene_id = ':'.join(('NCBIGene', gene_num))
                tax_id = ':'.join(('NCBITaxon', tax_num))
                gene_type_id = self.map_type_of_gene(gtype.strip())

                if symbol == 'NEWENTRY':
                    label = None
                else:
                    label = symbol
                # sequence feature, not a gene
                if gene_type_id == 'SO:0000110':
                    self.class_or_indiv[gene_id] = 'I'
                else:
                    self.class_or_indiv[gene_id] = 'C'

                # past the limit we keep scanning (class_or_indiv was already
                # recorded above) but add nothing more to the graph
                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    continue

                if self.class_or_indiv[gene_id] == 'C':
                    model.addClassToGraph(gene_id, label, gene_type_id, desc)
                    # NCBI will be the default leader,
                    # so we will not add the leader designation here.
                else:
                    model.addIndividualToGraph(
                        gene_id, label, gene_type_id, desc)
                    # in this case, they aren't genes.
                    # so we want someone else to be the leader.

                if name != '-':
                    model.addSynonym(gene_id, name)
                if synonyms.strip() != '-':
                    for s in synonyms.split('|'):
                        model.addSynonym(
                            gene_id, s.strip(),
                            Assoc.annotation_properties['hasRelatedSynonym'])
                if other_designations.strip() != '-':
                    for s in other_designations.split('|'):
                        model.addSynonym(
                            gene_id, s.strip(),
                            Assoc.annotation_properties['hasRelatedSynonym'])
                if xrefs.strip() != '-':
                    self._add_gene_equivalencies(xrefs, gene_id, tax_num)

                # edge cases of id | symbol | chr | map_loc:
                # 263     AMD1P2    X|Y  with   Xq28 and Yq12
                # 438     ASMT      X|Y  with   Xp22.3 or Yp11.3    # in PAR
                # no idea why there's two bands listed - possibly 2 assemblies
                # 419     ART3      4    with   4q21.1|4p15.1-p14
                # 28227   PPP2R3B   X|Y  Xp22.33; Yp11.3            # in PAR
                # this is of "unknown" type == susceptibility
                # 619538  OMS     10|19|3 10q26.3;19q13.42-q13.43;3p25.3
                # unlocated scaffold
                # 101928066       LOC101928066    1|Un    -\
                # mouse --> 2C3
                # 11435   Chrna1  2       2 C3|2 43.76 cM
                # mouse --> 11B1.1
                # 11548   Adra1b  11      11 B1.1|11 25.81 cM
                # 11717   Ampd3   7       7 57.85 cM|7 E2-E3        # mouse
                # 14421   B4galnt1        10      10 D3|10 74.5 cM  # mouse
                # 323212  wu:fb92e12      19|20   -                 # fish
                # 323368  ints10  6|18    -                         # fish
                # 323666  wu:fc06e02      11|23   -                 # fish

                # feel that the chr placement can't be trusted in this table
                # when there is > 1 listed
                # with the exception of human X|Y,
                # we will only take those that align to one chr

                # FIXME remove the chr mapping below
                # when we pull in the genomic coords
                if str(chrom) != '-' and str(chrom) != '':
                    if re.search(r'\|', str(chrom)) and \
                            str(chrom) not in ['X|Y', 'X; Y']:
                        # means that there's uncertainty in the mapping.
                        # so skip it
                        # TODO we'll need to figure out how to deal with
                        # >1 loc mapping
                        # (report the chromosome column, not the builtin chr)
                        logger.info(
                            '%s is non-uniquely mapped to %s.' +
                            ' Skipping for now.',
                            gene_id, str(chrom))
                        continue
                        # X|Y	Xp22.33;Yp11.3

                    if str(chrom) == 'X; Y':
                        chrom = 'X|Y'  # rewrite the PAR regions for processing
                    # do this in a loop to allow PAR regions like X|Y
                    for c in re.split(r'\|', str(chrom)):
                        # assume that the chromosome label is added elsewhere
                        geno.addChromosomeClass(c, tax_id, None)
                        mychrom = makeChromID(c, tax_num, 'CHR')
                        # temporarily use taxnum for the disambiguating label
                        mychrom_syn = makeChromLabel(c, tax_num)
                        model.addSynonym(mychrom, mychrom_syn)
                        band_match = re.match(
                            r'[0-9A-Z]+[pq](\d+)?(\.\d+)?$', map_loc)
                        if band_match is not None and \
                                len(band_match.groups()) > 0:
                            # this matches the regular kind of chrs,
                            # so make that kind of band
                            # not sure why this matches?
                            #   chrX|Y or 10090chr12|Un"
                            # TODO we probably need a different regex
                            # per organism
                            # the maploc_id already has the numeric chromosome
                            # in it, strip it first
                            # (escape c so it is matched as literal text)
                            bid = re.sub(r'^' + re.escape(c), '', map_loc)
                            # the generic location (no coordinates)
                            maploc_id = makeChromID(c+bid, tax_num, 'CHR')
                            # Assume it's type will be added elsewhere
                            band = Feature(g, maploc_id, None, None)
                            band.addFeatureToGraph()
                            # add the band as the containing feature
                            g.addTriple(
                                gene_id,
                                Feature.object_properties['is_subsequence_of'],
                                maploc_id)
                        else:
                            # TODO handle these cases: examples are:
                            # 15q11-q22,Xp21.2-p11.23,15q22-qter,10q11.1-q24,
                            # 12p13.3-p13.2|12p13-p12,1p13.3|1p21.3-p13.1,
                            # 12cen-q21,22q13.3|22q13.3
                            logger.debug(
                                'not regular band pattern for %s: %s',
                                gene_id, map_loc)
                            # add the gene as a subsequence of the chromosome
                            g.addTriple(
                                gene_id,
                                Feature.object_properties['is_subsequence_of'],
                                mychrom)

                geno.addTaxon(tax_id, gene_id)

        return
Esempio n. 10
0
    def _process_genes(self, limit=None):
        """
        Process the KEGG gene listing (the 'hsa_genes' source file).

        Each row is a (gene id, gene name) pair, where the name looks like
            DST, BP240, BPA, ...; dystonin; K10382 dystonin
        i.e. a semicolon-delimited (symbols, name, gene_class) triple whose
        first part is a comma-delimited list of symbols.

        For every gene:
          * the first symbol is recorded in ``self.label_hash`` and used
            as the label when the gene is added with ``geno.addGene``;
          * the remaining symbols are added as synonyms;
          * the long-form name (second part), when present, is added
            as the definition;
          * a KEGG orthology id (``K`` followed by digits) found in the
            third part makes the gene a member of 'KEGG-ko:<id>'.

        In test mode only genes listed in ``self.test_ids['genes']``
        are added to the graph (their label is still recorded).

        :param limit: stop after this many lines (ignored in test mode);
            None means no limit
        :return: None

        """
        src_key = 'hsa_genes'
        LOG.info("Processing genes")
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        family = Family(graph)
        geno = Genotype(graph)
        raw = '/'.join((self.rawdir, self.files[src_key]['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in reader:
                (gene_id, gene_name) = row

                gene_id = 'KEGG-' + gene_id.strip()

                # split the listing on ';' into (symbols, name, gene_class),
                # then the symbols on ','.
                # we take the first abbreviation and make it the symbol,
                # then take the rest as synonyms
                gene_stuff = gene_name.split(';')
                symbollist = gene_stuff[0].split(',')
                first_symbol = symbollist[0].strip()

                if gene_id not in self.label_hash:
                    self.label_hash[gene_id] = first_symbol

                if self.test_mode and gene_id not in self.test_ids['genes']:
                    continue

                # Add the gene as a class.
                geno.addGene(gene_id, first_symbol)

                # add the long name as the description
                if len(gene_stuff) > 1:
                    description = gene_stuff[1].strip()
                    model.addDefinition(gene_id, description)

                # add the rest of the symbols (everything after the first,
                # which is already the label) as synonyms
                for symbol in symbollist[1:]:
                    model.addSynonym(gene_id, symbol.strip())

                if len(gene_stuff) > 2:
                    ko_part = gene_stuff[2]
                    ko_match = re.search(r'K\d+', ko_part)
                    if ko_match is not None:
                        # group(0) is the whole match, e.g. 'K10382'
                        ko = 'KEGG-ko:' + ko_match.group(0)
                        family.addMemberOf(gene_id, ko)

                if not self.test_mode and limit is not None and reader.line_num > limit:
                    break

        LOG.info("Done with genes")
Esempio n. 11
0
    def _process_nlx_157874_1_view(self, raw, limit=None):
        """
        This table contains the Elements of Morphology data that has been
        screen-scraped into DISCO.
        Note that foaf:depiction is inverse of foaf:depicts relationship.

        Since it is bad form to have two definitions,
        we concatenate the objective and subjective definitions
        into one string.

        For each row of the tab-delimited file (the header row is skipped)
        the morphology term is added as a class with its label, and then:
            definition   <- "objective_def.  subjective_def."
            depiction    <- small_figure_url, large_figure_url (if non-empty)
            comment      <- comments (if non-empty)
            synonyms     <- each ';'-delimited entry of ``synonyms``,
                            typed with globaltt['has_exact_synonym']
            synonyms     <- each ';'-delimited entry of ``replaces``,
                            typed with globaltt['has_related_synonym']
                            (only if ``replaces`` differs from ``synonyms``)
            page         <- morphology_term_url (if non-empty)

        :param raw: path to the tab-delimited input file
        :param limit: stop after this many lines; None means no limit
        :return: None
        """

        model = Model(self.graph)
        with open(raw, 'r') as f1:
            f1.readline()  # read the header row; skip
            reader = csv.reader(f1, delimiter='\t', quotechar='\"')
            for line in reader:
                (morphology_term_id, morphology_term_num,
                 morphology_term_label, morphology_term_url,
                 terminology_category_label, terminology_category_url,
                 subcategory, objective_definition, subjective_definition,
                 comments, synonyms, replaces, small_figure_url,
                 large_figure_url, e_uid, v_uid, v_uuid, v_last_modified,
                 v_status, v_lastmodified_epoch) = line

                # note: most of the remaining columns
                # (e_uid, v_uid, v_uuid, v_last_modified, v_status,
                # v_lastmodified_epoch, morphology_term_num, subcategory,
                # terminology_category_label, terminology_category_url)
                # are currently unused.

                # Add morphology term to graph as a class with its label.
                model.addClassToGraph(morphology_term_id,
                                      morphology_term_label)

                # Assemble the description text.
                # Strip *before* testing for the trailing period, so that
                # "text. " does not end up as "text..".
                subjective_definition = subjective_definition.strip()
                if subjective_definition != '' and \
                        not subjective_definition.endswith('.'):
                    subjective_definition += '.'
                objective_definition = objective_definition.strip()
                if objective_definition != '' and \
                        not objective_definition.endswith('.'):
                    objective_definition += '.'

                # the outer strip drops the separator
                # when either half is empty
                definition = '  '.join(
                    (objective_definition, subjective_definition)).strip()

                model.addDefinition(morphology_term_id, definition)

                # <term id> FOAF:depicted_by literal url
                # <url> type foaf:depiction

                # do we want both images?
                # morphology_term_id has depiction small_figure_url
                if small_figure_url != '':
                    model.addDepiction(morphology_term_id, small_figure_url)

                # morphology_term_id has depiction large_figure_url
                if large_figure_url != '':
                    model.addDepiction(morphology_term_id, large_figure_url)

                # morphology_term_id has comment comments
                if comments != '':
                    model.addComment(morphology_term_id, comments.strip())

                # morphology_term_id has_exact_synonym synonyms (; delimited)
                # an empty column (or a stray ';') must not yield
                # an empty synonym
                for syn in synonyms.split(';'):
                    syn = syn.strip()
                    if syn != '':
                        model.addSynonym(morphology_term_id, syn,
                                         self.globaltt['has_exact_synonym'])

                # morphology_term_id has_related_synonym replaces (; delimited)
                if replaces != '' and replaces != synonyms:
                    for syn in replaces.split(';'):
                        syn = syn.strip()
                        if syn != '':
                            model.addSynonym(
                                morphology_term_id, syn,
                                self.globaltt['has_related_synonym'])

                # <morphology_term_id> <foaf:page> morphology_term_url
                # csv fields are never None, so test for empty strings
                if morphology_term_id != '' and morphology_term_url != '':
                    reference = Reference(self.graph, morphology_term_url,
                                          self.globaltt['web page'])
                    reference.addPage(morphology_term_id, morphology_term_url)

                if limit is not None and reader.line_num > limit:
                    break
        return
Esempio n. 12
0
    def process_gaf(self, file, limit, id_map=None, eco_map=None):
        """
        Read a gzipped, tab-delimited gene association (GAF) file and add
        the gene-to-GO associations found in it to the graph.

        Rows are skipped when they
          * are comments (start with '!'),
          * have fewer than 15 or more than 17 columns
            (15 and 16 column rows are padded with empty strings),
          * lack one of the required fields,
          * carry a 'NOT' qualifier,
          * are UniProtKB rows without an entry in ``id_map``, or
          * (test mode only) are not NCBIGene ids listed in ``self.test_ids``.

        For the remaining rows the gene is added as a class (with name and
        synonyms), linked to its taxon when exactly one is given, and an
        association to the GO term is built with its evidence and references.
        For 'IMP' rows with a non-empty with/from column, a G2PAssoc to
        ``go_id + 'PHENOTYPE'`` is additionally created per with/from item.

        :param file: path of the gzipped GAF file
        :param limit: stop after this many lines (ignored in test mode);
            None means no limit
        :param id_map: optional mapping from a UniProtKB accession to a
            'prefix:id' gene curie
        :param eco_map: optional mapping from evidence code symbol to
            evidence id; unmapped codes are logged and contribute no evidence
        :return: None
        """

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph

        model = Model(graph)
        geno = Genotype(graph)
        LOG.info("Processing Gene Associations from %s", file)
        line_counter = 0
        uniprot_hit = 0
        uniprot_miss = 0
        # Helper sources used to mint reagent-targeted gene ids.
        # They are built up front for the taxa we expect to need them for;
        # binding both names here avoids an UnboundLocalError should a
        # matching 'with' item show up for another taxon (see below).
        zfin = None
        wbase = None
        if 7955 in self.tax_ids:
            zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
        if 6239 in self.tax_ids:
            wbase = WormBase(self.graph_type, self.are_bnodes_skized)

        with gzip.open(file, 'rb') as csvfile:
            filereader = csv.reader(
                io.TextIOWrapper(csvfile, newline=""), delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                # comments start with exclamation
                if re.match(r'!', ''.join(row)):
                    continue

                if len(row) > 17 or len(row) < 15:
                    LOG.warning(
                        "Wrong number of columns %i, expected 15 or 17\n%s",
                        len(row), row)
                    continue

                # pad the optional trailing columns
                if 17 > len(row) >= 15:
                    row += [""] * (17 - len(row))

                (dbase,
                 gene_num,
                 gene_symbol,
                 qualifier,
                 go_id,
                 ref,
                 eco_symbol,
                 with_or_from,
                 aspect,
                 gene_name,
                 gene_synonym,
                 object_type,
                 taxon,
                 date,
                 assigned_by,
                 annotation_extension,
                 gene_product_form_id) = row

                # test for required fields
                if (dbase == '' or gene_num == '' or gene_symbol == '' or
                        go_id == '' or ref == '' or eco_symbol == '' or
                        aspect == '' or object_type == '' or taxon == '' or
                        date == '' or assigned_by == ''):
                    # pass the row as an argument: it may contain '%'
                    LOG.error(
                        "Missing required part of annotation on row %d:\n%s",
                        line_counter, '\t'.join(row))
                    continue

                # deal with qualifier NOT, contributes_to, colocalizes_with
                if re.search(r'NOT', qualifier):
                    continue

                if dbase in self.localtt:
                    dbase = self.localtt[dbase]
                uniprotid = None
                gene_id = None
                if dbase == 'UniProtKB':
                    if id_map is not None and gene_num in id_map:
                        gene_id = id_map[gene_num]
                        uniprotid = ':'.join((dbase, gene_num))
                        (dbase, gene_num) = gene_id.split(':')
                        uniprot_hit += 1
                    else:
                        # UniProt id without a 1:1 mapping to a gene id
                        uniprot_miss += 1
                        continue
                else:
                    gene_num = gene_num.split(':')[-1]  # last
                    gene_id = ':'.join((dbase, gene_num))

                if self.test_mode and not(
                        re.match(r'NCBIGene', gene_id) and
                        int(gene_num) in self.test_ids):
                    continue

                model.addClassToGraph(gene_id, gene_symbol)
                if gene_name != '':
                    model.addDescription(gene_id, gene_name)
                if gene_synonym != '':
                    for syn in re.split(r'\|', gene_synonym):
                        model.addSynonym(gene_id, syn.strip())
                if re.search(r'\|', taxon):
                    # TODO add annotations with >1 taxon
                    LOG.info(
                        ">1 taxon (%s) on line %d.  skipping", taxon, line_counter)
                else:
                    tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                    geno.addTaxon(tax_id, gene_id)

                assoc = Assoc(graph, self.name)
                assoc.set_subject(gene_id)
                assoc.set_object(go_id)

                # eco_map defaults to None; treat that like "nothing mapped"
                try:
                    eco_id = (eco_map or {})[eco_symbol]
                except KeyError:
                    LOG.error("Evidence code (%s) not mapped", eco_symbol)
                else:
                    assoc.add_evidence(eco_id)

                # normalize the references once;
                # the curies are reused for the G2P associations below
                ref_curies = []
                for ref_part in re.split(r'\|', ref):
                    ref_part = ref_part.strip()
                    if ref_part == '':
                        continue
                    prefix = ref_part.split(':')[0]  # sidestep 'MGI:MGI:'
                    if prefix in self.localtt:
                        prefix = self.localtt[prefix]
                    ref_curie = ':'.join((prefix, ref_part.split(':')[-1]))
                    ref_curies.append(ref_curie)
                    refg = Reference(graph, ref_curie)
                    if prefix == 'PMID':
                        ref_type = self.globaltt['journal article']
                        refg.setType(ref_type)
                    refg.addRefToGraph()
                    assoc.add_source(ref_curie)

                # TODO add the source of the annotations from assigned by?

                rel = self.resolve(aspect, mandatory=False)
                if rel is not None and aspect == rel:
                    if aspect == 'F' and re.search(r'contributes_to', qualifier):
                        assoc.set_relationship(self.globaltt['contributes to'])
                    else:
                        LOG.error(
                            "Aspect: %s with qualifier: %s  is not recognized",
                            aspect, qualifier)
                elif rel is not None:
                    assoc.set_relationship(rel)
                    assoc.add_association_to_graph()
                else:
                    LOG.warning("No predicate for association \n%s\n", str(assoc))

                # NOTE(review): this is set after add_association_to_graph()
                # may already have run above -- confirm the description
                # still reaches the graph.
                if uniprotid is not None:
                    assoc.set_description('Mapped from ' + uniprotid)
                # object_type should be one of:
                # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
                # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
                # If the precise product type is unknown,
                # gene_product should be used
                #######################################################################

                # Derive G2P Associations from IMP annotations
                # in version 2.1 Pipe will indicate 'OR'
                # and Comma will indicate 'AND'.
                # in version 2.0, multiple values are separated by pipes
                # where the pipe has been used to mean 'AND'
                if eco_symbol == 'IMP' and with_or_from != '':
                    withitems = re.split(r'\|', with_or_from)
                    phenotypeid = go_id+'PHENOTYPE'
                    # create phenotype associations
                    for i in withitems:
                        if i == '' or re.match(
                                r'(UniProtKB|WBPhenotype|InterPro|HGNC)', i):
                            LOG.warning(
                                "Don't know what having a uniprot id " +
                                "in the 'with' column means of %s", uniprotid)
                            continue
                        i = re.sub(r'MGI\:MGI\:', 'MGI:', i)
                        i = re.sub(r'WB:', 'WormBase:', i)

                        # for worms and fish, they might give a RNAi or MORPH
                        # in these cases make a reagent-targeted gene
                        if re.search('MRPHLNO|CRISPR|TALEN', i):
                            if zfin is None:
                                # not built up front: create it on demand
                                zfin = ZFIN(
                                    self.graph_type, self.are_bnodes_skized)
                            targeted_gene_id = zfin.make_targeted_gene_id(gene_id, i)
                            geno.addReagentTargetedGene(i, gene_id, targeted_gene_id)
                            # TODO PYLINT why is this needed?
                            # Redefinition of assoc type from
                            # dipper.models.assoc.Association.Assoc to
                            # dipper.models.assoc.G2PAssoc.G2PAssoc
                            assoc = G2PAssoc(
                                graph, self.name, targeted_gene_id, phenotypeid)
                        elif re.search(r'WBRNAi', i):
                            if wbase is None:
                                # not built up front: create it on demand
                                wbase = WormBase(
                                    self.graph_type, self.are_bnodes_skized)
                            targeted_gene_id = wbase.make_reagent_targeted_gene_id(
                                gene_id, i)
                            geno.addReagentTargetedGene(i, gene_id, targeted_gene_id)
                            assoc = G2PAssoc(
                                graph, self.name, targeted_gene_id, phenotypeid)
                        else:
                            assoc = G2PAssoc(graph, self.name, i, phenotypeid)
                        for ref_curie in ref_curies:
                            assoc.add_source(ref_curie)
                            # experimental phenotypic evidence
                            assoc.add_evidence(
                                self.globaltt['experimental phenotypic evidence'])
                        assoc.add_association_to_graph()
                        # TODO should the G2PAssoc be
                        # the evidence for the GO assoc?

                if not self.test_mode and limit is not None and line_counter > limit:
                    break
            uniprot_tot = (uniprot_hit + uniprot_miss)
            uniprot_per = 0.0
            if uniprot_tot != 0:
                uniprot_per = 100.0 * uniprot_hit / uniprot_tot
            LOG.info(
                "Uniprot: %.2f%% of %i benifited from the 1/4 day id mapping download",
                uniprot_per, uniprot_tot)
        return
Esempio n. 13
0
    def _get_equivids(self, limit):
        """
        Read the GeneReviews-to-OMIM id map and add the classes to the graph.

        The file processed here is of the format:
        #NBK_id GR_shortname    OMIM
        NBK1103 trimethylaminuria       136132
        NBK1103 trimethylaminuria       602079
        NBK1104 cdls    122470
        Where each of the rows represents a mapping between
        a gr id and an omim id. These are a 1:many relationship,
        and some of the omim ids are genes (not diseases).
        Therefore, we need to create a loose coupling here.
        We make the assumption that these NBKs are generally higher-level
        grouping classes; therefore the OMIM ids are treated as subclasses.

        Only OMIM ids typed as phenotypes in self.omim_type are made
        subclasses of the GeneReviews class; replaced and obsolete OMIM ids
        are reported and dropped from the candidate set first.

        :param limit: maximum number of file lines to read when not in
            test mode; None reads the whole file
        :return: None

        """
        raw = '/'.join((self.rawdir, self.files['idmap']['file']))
        model = Model(self.graph)
        line_counter = 0

        id_map = {}  # NBK number -> set of bare OMIM numbers
        allomimids = set()  # every bare OMIM number accepted from the file
        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                if line_counter == 1:  # skip header
                    continue
                (nbk_num, shortname, omim_num) = row
                # strip before building any identifier, so the test-mode
                # filter below is not defeated by stray whitespace
                omim_num = omim_num.strip()
                gr_id = 'GeneReviews:' + nbk_num
                omim_id = 'OMIM:' + omim_num

                # in test mode keep only rows for the test identifiers
                # (an empty test_ids therefore keeps nothing)
                if self.testMode and omim_id not in self.test_ids:
                    continue

                # sometimes there's bad omim nums
                if len(omim_num) > 6:
                    LOG.warning(
                        "OMIM number incorrectly formatted in row %d; skipping:\n%s",
                        line_counter, '\t'.join(row))
                    continue

                # build up a hashmap of the mappings; then process later
                id_map.setdefault(nbk_num, set()).add(omim_num)

                # add the class along with the shortname
                model.addClassToGraph(gr_id, None)
                model.addSynonym(gr_id, shortname)

                allomimids.add(omim_num)

                if not self.testMode and limit is not None and line_counter > limit:
                    break

        # given all_omim_ids from GR,
        # we want to update any which are changed or removed
        # before deciding which are disease / phenotypes
        replaced = allomimids & self.omim_replaced.keys()
        if replaced:
            LOG.warning("These OMIM ID's are past their pull date: %s",
                        str(replaced))
            for oid in replaced:
                allomimids.remove(oid)
                for rep in self.omim_replaced[oid]:
                    # set.update() on a string would add each character
                    # separately, so a string identifier is added whole
                    if isinstance(rep, str):
                        allomimids.add(rep)
                    else:
                        allomimids.update(rep)
        # NOTE(review): id_map still refers to the replaced numbers, so the
        # replacements only influence the counts logged below -- confirm
        # whether id_map should be rewritten as well.

        # guard against omim identifiers which have been removed
        obsolete_type = self.globaltt['obsolete']
        obsolete = {
            o for o in self.omim_type if self.omim_type[o] == obsolete_type}
        removed = allomimids & obsolete
        if removed:
            LOG.warning("These OMIM ID's are gone: %s", str(removed))
            allomimids -= removed

        # filter for disease /phenotype types (we can argue about what is included)
        phenotype_types = (
            self.globaltt['Phenotype'],
            self.globaltt['has_affected_feature'],  # both a gene and a phenotype
            self.globaltt['heritable_phenotypic_marker'])  # probable phenotype
        omim_phenotypes = {
            omim for omim in self.omim_type
            if self.omim_type[omim] in phenotype_types}
        LOG.info("Have %i omim_ids globally typed as phenotypes from OMIM",
                 len(omim_phenotypes))

        entries_that_are_phenotypes = allomimids & omim_phenotypes
        LOG.info("Filtered out %d/%d entries that are genes or features",
                 len(allomimids - entries_that_are_phenotypes),
                 len(allomimids))

        for nbk_num in self.book_ids:
            gr_id = 'GeneReviews:' + nbk_num
            for omim_num in id_map.get(nbk_num, ()):
                # add the gene reviews as a superclass to the omim id,
                # but only if the omim id is not a gene.
                # entries_that_are_phenotypes holds bare OMIM numbers, so
                # the membership test must use the number, not the curie.
                if omim_num in entries_that_are_phenotypes:
                    omim_id = 'OMIM:' + omim_num
                    model.addClassToGraph(omim_id, None)
                    model.addSubClass(omim_id, gr_id)
            # add this as a generic subclass of DOID:4
            model.addSubClass(gr_id, 'DOID:4')
Esempio n. 14
0
    def _process_nlx_157874_1_view(self, raw, limit=None):
        """
        Add the Elements of Morphology terms found in a table to the graph.

        This table contains the Elements of Morphology data .
        Note that foaf:depiction is inverse of foaf:depicts relationship.

        Since it is bad form to have two definitions,
        we concatenate the two into one string.

        Turtle:
            <eom id> a owl:Class
                rdfs:label Literal(eom label)
                oboInOwl:has_related_synonym Literal(synonym list)
                IAO:definition Literal(objective_def. subjective def)
                foaf:depiction Literal(small_image_url),
                               Literal(large_image_url)
                foaf:page Literal(page_url)
                rdfs:comment Literal(long commented text)

        TEC_note: URL are not literals.

        :param raw: path of the tab-delimited file to read
        :param limit: stop once the reader has passed this many lines;
            None reads the whole file
        :return: None
        """

        src_key = 'tables'
        model = Model(self.graph)
        col = self.resources[src_key]['columns']
        with open(raw, 'r') as rawread:
            reader = csv.reader(rawread, delimiter='\t', quotechar='\"')
            row = next(reader)
            if not self.check_fileheader(col, row):
                pass

            # the column positions do not change from row to row,
            # so resolve them once instead of for every field of every row
            idx = {
                name: col.index(name) for name in (
                    'morphology_term_id', 'morphology_term_label',
                    'morphology_term_url', 'objective_definition',
                    'subjective_definition', 'comments', 'synonyms',
                    'replaces', 'small_figure_url', 'large_figure_url')
            }

            for row in reader:
                morphology_term_id = row[idx['morphology_term_id']].strip()
                morphology_term_label = row[
                    idx['morphology_term_label']].strip()
                morphology_term_url = row[idx['morphology_term_url']].strip()
                objective_definition = row[
                    idx['objective_definition']].strip()
                subjective_definition = row[
                    idx['subjective_definition']].strip()
                comments = row[idx['comments']].strip()
                synonyms = row[idx['synonyms']].strip()
                replaces = row[idx['replaces']].strip()
                small_figure_url = row[idx['small_figure_url']].strip()
                large_figure_url = row[idx['large_figure_url']].strip()

                # Add morphology term to graph as a class
                # with label, type, and description.
                model.addClassToGraph(morphology_term_id,
                                      morphology_term_label,
                                      blv.terms['PhenotypicFeature'])

                # Assemble the description text:
                # each non-empty definition gets a trailing period if missing
                if subjective_definition != '' and not re.match(
                        r'.+\.$', subjective_definition):
                    subjective_definition = subjective_definition + '.'
                if objective_definition != '' and not re.match(
                        r'.+\.$', objective_definition):
                    objective_definition = objective_definition + '.'

                definition = '  '.join(
                    (objective_definition, subjective_definition))

                model.addDefinition(
                    morphology_term_id,
                    definition,
                    class_category=blv.terms['PhenotypicFeature'])

                # <term id> FOAF:depicted_by literal url
                # <url> type foaf:depiction

                # do we want both images?
                # morphology_term_id has depiction small_figure_url
                if small_figure_url != '':
                    model.addDepiction(morphology_term_id, small_figure_url)

                # morphology_term_id has depiction large_figure_url
                if large_figure_url != '':
                    model.addDepiction(morphology_term_id, large_figure_url)

                # morphology_term_id has comment comments
                if comments != '':
                    model.addComment(morphology_term_id, comments)

                # morphology_term_id has_exact_synonym synonyms (; delimited)
                # an empty field splits into [''], which is not a synonym
                for syn in synonyms.split(';'):
                    syn = syn.strip()
                    if syn != '':
                        model.addSynonym(morphology_term_id, syn,
                                         self.globaltt['has_exact_synonym'])

                # morphology_term_id has_related_synonym replaces (; delimited)
                if replaces not in ['', synonyms]:
                    for syn in replaces.split(';'):
                        # str.strip() returns a new string; keep the result
                        syn = syn.strip()
                        if syn != '':
                            model.addSynonym(
                                morphology_term_id, syn,
                                self.globaltt['has_related_synonym'])

                # <morphology_term_id> <foaf:page> morphology_term_url
                # NOTE(review): the id is a stripped string and so never None;
                # this guard is always true as written.
                if morphology_term_id is not None:
                    reference = Reference(self.graph, morphology_term_id,
                                          self.globaltt['web page'])

                    # TEC 201905:
                    # Not so sure we need explicit   <eom_uri> <webpage> <eom_url>.
                    # since <eom_uri> IS the <eom_url>.

                    reference.addPage(morphology_term_id, morphology_term_url)

                if limit is not None and reader.line_num > limit:
                    break
Esempio n. 15
0
    def _transform_entry(self, ent, graph):
        """
        Add a single OMIM entry to the graph.

        The entry is added as a class whose typing depends on
        self.omim_type for its MIM number, followed by its preferred title
        and alternative/included titles as synonyms.  When the entry has a
        gene map, a feature is built for it carrying the gene map comment,
        the cytogenetic location and, if start/end coordinates are given,
        its position on the GRCh38 chromosome.  Finally the _get_* helpers
        are run on the entry.

        :param ent: dict holding the OMIM record under the 'entry' key
        :param graph: graph the triples are added to; also stored as
            self.graph
        :return: None
        """
        self.graph = graph
        model = Model(graph)
        geno = Genotype(graph)
        # used both as the taxon label and as the key into self.globaltt
        tax_label = 'Homo sapiens'
        tax_id = self.globaltt[tax_label]
        build_num = "GRCh38"
        asm_curie = ':'.join(('NCBIAssembly', build_num))

        entry = ent['entry']

        # get the numbers, labels, and descriptions
        omim_num = str(entry['mimNumber'])
        titles = entry['titles']
        label = titles['preferredTitle']

        other_labels = []
        if 'alternativeTitles' in titles:
            other_labels += self._get_alt_labels(titles['alternativeTitles'])
        if 'includedTitles' in titles:
            other_labels += self._get_alt_labels(titles['includedTitles'])

        # the abbreviation (comes after the ;) of the preferredTitle
        abbrev = None
        lab_lst = label.split(';')
        if len(lab_lst) > 1:
            abbrev = lab_lst[1].strip()
        newlabel = self._cleanup_label(label)

        omim_curie = 'OMIM:' + omim_num
        omimtype = self.omim_type[omim_num]
        nodelabel = newlabel
        # this uses our cleaned-up label
        if omimtype == self.globaltt['heritable_phenotypic_marker']:
            # in this special case,
            # make it a disease by not declaring it as a gene/marker
            # ??? and if abbrev is None?
            model.addClassToGraph(omim_curie,
                                  nodelabel,
                                  description=newlabel,
                                  class_category=blv.terms['Disease'])

        elif omimtype in [
                self.globaltt['gene'], self.globaltt['has_affected_feature']
        ]:
            omimtype = self.globaltt['gene']
            if abbrev is not None:
                nodelabel = abbrev
            # omim is subclass_of gene (provide type term)
            model.addClassToGraph(omim_curie,
                                  nodelabel,
                                  self.globaltt['gene'],
                                  newlabel,
                                  class_category=blv.terms['Gene'])
        elif omimtype == self.globaltt['phenotype']:
            model.addClassToGraph(omim_curie,
                                  nodelabel,
                                  description=newlabel,
                                  class_category=blv.terms['Disease'])
        else:
            # omim is NOT subclass_of D|P|or ?...
            model.addClassToGraph(omim_curie, newlabel)

        # the full, uncleaned preferred title is kept as a synonym
        model.addSynonym(omim_curie, label)

        # add the alternate labels and includes as synonyms
        # (added once each; the loop variable must not clobber `label`)
        for alt_label in other_labels:
            model.addSynonym(omim_curie, alt_label,
                             model.globaltt['has_related_synonym'])

        # KS: commenting out, we will get disease descriptions
        # from MONDO, and gene descriptions from the mygene API

        # if this is a genetic locus (not sequenced) then
        #  add the chrom loc info to the ncbi gene identifier,
        # not to the omim id (we reserve the omim id to be the phenotype)
        #################################################################
        # the above makes no sense to me. (TEC)
        # For Monarch, OMIM is authoritative for disease / phenotype
        #   if they say a phenotype is associated with a locus
        #   that is what dipper should report.
        # OMIM is not authoritative for NCBI gene locations, locus or otherwise.
        # and dipper should not be reporting gene locations via OMIM.

        feature_id = None
        feature_label = None
        if 'geneMapExists' in entry and entry['geneMapExists']:
            genemap = entry['geneMap']
            is_gene = False

            if omimtype == self.globaltt['heritable_phenotypic_marker']:
                # get the ncbigene ids
                ncbifeature = self._get_mapped_gene_ids(entry, graph)
                if len(ncbifeature) == 1:
                    feature_id = 'NCBIGene:' + str(ncbifeature[0])
                    # add this feature as a cause for the omim disease
                    # TODO SHOULD I EVEN DO THIS HERE?
                    assoc = G2PAssoc(graph, self.name, feature_id, omim_curie)
                    assoc.add_association_to_graph()
                else:
                    LOG.info(
                        "Its ambiguous when %s maps to not one gene id: %s",
                        omim_curie, str(ncbifeature))
            elif omimtype in [
                    self.globaltt['gene'],
                    self.globaltt['has_affected_feature']
            ]:
                feature_id = omim_curie
                is_gene = True
                omimtype = self.globaltt['gene']
            else:
                # 158900 falls into this category
                feature_id = self._make_anonymous_feature(omim_num)
                if abbrev is not None:
                    feature_label = abbrev
                omimtype = self.globaltt['heritable_phenotypic_marker']

            if feature_id is not None:
                if 'comments' in genemap:
                    # add a comment to this feature
                    comment = genemap['comments']
                    if comment.strip() != '':
                        model.addDescription(feature_id, comment)
                if 'cytoLocation' in genemap:
                    cytoloc = genemap['cytoLocation']
                    # parse the cytoloc.
                    # add this omim thing as
                    # a subsequence of the cytofeature
                    # 18p11.3-p11.2
                    # FIXME
                    # add the other end of the range,
                    # but not sure how to do that
                    # not sure if saying subsequence of feature
                    # is the right relationship

                    feat = Feature(graph, feature_id, feature_label, omimtype)
                    if 'chromosomeSymbol' in genemap:
                        chrom_num = str(genemap['chromosomeSymbol'])
                        chrom = makeChromID(chrom_num, tax_id, 'CHR')
                        geno.addChromosomeClass(chrom_num, tax_id, tax_label)

                        # add the positional information, if available
                        # (-1 marks a coordinate missing from the gene map)
                        fstart = fend = -1
                        if 'chromosomeLocationStart' in genemap:
                            fstart = genemap['chromosomeLocationStart']
                        if 'chromosomeLocationEnd' in genemap:
                            fend = genemap['chromosomeLocationEnd']
                        if fstart >= 0:
                            # make the build-specific chromosome
                            chrom_in_build = makeChromID(
                                chrom_num, build_num, 'MONARCH')
                            # then, add the chromosome instance
                            # (from the given build)
                            geno.addChromosomeInstance(chrom_num, asm_curie,
                                                       build_num, chrom)
                            if omimtype == self.globaltt[
                                    'heritable_phenotypic_marker']:
                                postypes = [self.globaltt['FuzzyPosition']]
                            else:
                                postypes = None
                            # NOTE that no strand information
                            # is available in the API
                            feat.addFeatureStartLocation(
                                fstart, chrom_in_build, None, postypes)
                            if fend >= 0:
                                feat.addFeatureEndLocation(
                                    fend, chrom_in_build, None, postypes)
                                # only meaningful when an end was supplied;
                                # the -1 sentinel would always look inverted
                                if fstart > fend:
                                    LOG.info("start>end (%d>%d) for %s",
                                             fstart, fend, omim_curie)
                        # add the cytogenic location too
                        # for now, just take the first one
                        cytoloc = cytoloc.split('-')[0]
                        loc = makeChromID(cytoloc, tax_id, 'CHR')
                        model.addClassToGraph(loc, None)
                        feat.addSubsequenceOfFeature(loc)
                        feat.addFeatureToGraph(True, None, is_gene)

            # end adding causative genes/features

            if entry['status'] in ['moved', 'removed']:
                LOG.warning('UNEXPECTED! not expecting obsolete record %s',
                            omim_curie)

        self._get_phenotypicseries_parents(entry, graph)
        self._get_mappedids(entry, graph)
        # return value unused here; presumably called for what it adds to
        # the graph -- TODO confirm
        self._get_mapped_gene_ids(entry, graph)
        self._get_pubs(entry, graph)
        self._get_process_allelic_variants(entry, graph)
Esempio n. 16
0
    def _get_gene_history(self, limit):
        """
        Loops through the gene_history file and adds the old gene ids
        as deprecated classes, where the new gene id is the replacement for it.
        The old gene symbol is added as a synonym to the gene.

        Rows are skipped when either id is '-', when (in test mode) the gene
        is not in self.gene_ids, or when (outside test mode) the taxon is
        not in self.tax_ids.

        :param limit: maximum number of accepted rows to process when not
            in test mode; None processes the whole file
        :return: None

        """
        src_key = 'gene_history'
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        LOG.info("Processing Gene records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files[src_key]['file']))
        LOG.info("FILE: %s", myfile)
        col = self.files[src_key]['columns']
        with gzip.open(myfile, 'rb') as tsv:
            row = tsv.readline().decode().strip().split('\t')
            row[0] = row[0][1:]  # strip comment
            if not self.check_fileheader(col, row):
                pass

            # column positions are the same for every row: resolve them once
            tax_col = col.index('tax_id')
            gene_col = col.index('GeneID')
            disc_id_col = col.index('Discontinued_GeneID')
            disc_sym_col = col.index('Discontinued_Symbol')
            # discontinued_date = row[col.index('Discontinue_Date')]

            for line in tsv:
                row = line.decode().strip().split('\t')
                # skip blank lines (which would otherwise fail on row[0][0])
                # and comments
                if row[0] == '' or row[0][0] == '#':
                    continue

                tax_num = row[tax_col].strip()
                gene_num = row[gene_col].strip()
                discontinued_num = row[disc_id_col].strip()
                discontinued_symbol = row[disc_sym_col].strip()

                # set filter=None in init if you don't want to have a filter
                # if self.id_filter is not None:
                #     if ((self.id_filter == 'taxids' and \
                #          (int(tax_num) not in self.tax_ids))
                #             or (self.id_filter == 'geneids' and \
                #                 (int(gene_num) not in self.gene_ids))):
                #         continue
                #  end filter

                if gene_num == '-' or discontinued_num == '-':
                    continue

                if self.test_mode and gene_num not in self.gene_ids:
                    continue

                if not self.test_mode and tax_num not in self.tax_ids:
                    continue

                # only rows that pass the filters count towards the limit
                line_counter += 1
                gene_id = ':'.join(('NCBIGene', gene_num))
                discontinued_gene_id = ':'.join(('NCBIGene', discontinued_num))
                # add the two genes
                if self.class_or_indiv.get(gene_id) == 'C':
                    model.addClassToGraph(gene_id, None)
                    model.addClassToGraph(discontinued_gene_id,
                                          discontinued_symbol)

                    # add the new gene id to replace the old gene id
                    model.addDeprecatedClass(discontinued_gene_id, [gene_id])
                else:
                    model.addIndividualToGraph(gene_id, None)
                    model.addIndividualToGraph(discontinued_gene_id,
                                               discontinued_symbol)
                    model.addDeprecatedIndividual(discontinued_gene_id,
                                                  [gene_id])

                # also add the old symbol as a synonym of the new gene
                model.addSynonym(gene_id, discontinued_symbol)

                if not self.test_mode and (limit is not None
                                           and line_counter > limit):
                    break
Esempio n. 17
0
    def _process_ortholog_classes(self, limit=None):
        """
        This method add the KEGG orthology classes to the graph.

        The class name is split on ';' and ','.  The first part becomes the
        label, the remaining parts become synonyms, and the last part is
        also used as the description.
        If there's an embedded enzyme commission number in the description,
        that is added as an xref.

        Triples created:
        <orthology_class_id> is a class
        <orthology_class_id> has label <orthology_symbols>
        <orthology_class_id> has description <orthology_description>

        :param limit: stop once the reader has passed this many lines when
            not in test mode; None reads the whole file
        :return: None
        """
        src_key = 'ortholog_classes'
        LOG.info("Processing ortholog classes")
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        raw = '/'.join((self.rawdir, self.files[src_key]['file']))

        # enzyme commission number (EC:1.2.99.5);
        # sometimes there's two, like [EC:1.3.5.1 1.3.5.4]
        # can also have a dash, like EC:1.10.3.-
        ec_pattern = re.compile(r'((?:\d+|\.|-){5,7})')

        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in reader:
                (orthology_class_id, orthology_class_name) = row

                if self.test_mode and orthology_class_id \
                        not in self.test_ids[src_key]:
                    continue

                # The orthology class is essentially a KEGG gene ID
                # that is species agnostic.
                # Add the ID and label as a gene family class

                other_labels = re.split(r'[;,]', orthology_class_name)
                # the first one is the label we'll use
                orthology_label = other_labels[0]

                orthology_class_id = 'KEGG-' + orthology_class_id.strip()

                orthology_type = self.globaltt['gene_family']
                model.addClassToGraph(orthology_class_id, orthology_label,
                                      orthology_type)
                if len(other_labels) > 1:
                    # add the rest as synonyms; the first part is already
                    # the label, so it is not repeated as a synonym
                    for syn in other_labels[1:]:
                        model.addSynonym(orthology_class_id, syn.strip())

                    # add the last one as the description,
                    # stripped like the synonyms are
                    descr = other_labels[-1].strip()
                    model.addDescription(orthology_class_id, descr)

                    # findall() returns an empty list when nothing matches
                    for ecm in ec_pattern.findall(descr):
                        model.addXref(orthology_class_id, 'EC:' + ecm)

                if not self.test_mode and limit is not None and reader.line_num > limit:
                    break
        LOG.info("Done with ortholog classes")
Esempio n. 18
0
    def process_gaf(self, file, limit, id_map=None, eco_map=None):

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph

        model = Model(graph)
        geno = Genotype(graph)
        LOG.info("Processing Gene Associations from %s", file)
        line_counter = 0
        uniprot_hit = 0
        uniprot_miss = 0
        if 7955 in self.tax_ids:
            zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
        if 6239 in self.tax_ids:
            wbase = WormBase(self.graph_type, self.are_bnodes_skized)

        with gzip.open(file, 'rb') as csvfile:
            filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                    delimiter='\t',
                                    quotechar='\"')
            for row in filereader:
                line_counter += 1
                # comments start with exclamation
                if re.match(r'!', ''.join(row)):
                    continue

                if len(row) > 17 or len(row) < 15:
                    LOG.warning(
                        "Wrong number of columns %i, expected 15 or 17\n%s",
                        len(row), row)
                    continue

                if 17 > len(row) >= 15:
                    row += [""] * (17 - len(row))

                (dbase, gene_num, gene_symbol, qualifier, go_id, ref,
                 eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
                 object_type, taxon, date, assigned_by, annotation_extension,
                 gene_product_form_id) = row

                # test for required fields
                if (dbase == '' or gene_num == '' or gene_symbol == ''
                        or go_id == '' or ref == '' or eco_symbol == ''
                        or aspect == '' or object_type == '' or taxon == ''
                        or date == '' or assigned_by == ''):
                    LOG.error(
                        "Missing required part of annotation on row %d:\n" +
                        '\t'.join(row), line_counter)
                    continue

                # deal with qualifier NOT, contributes_to, colocalizes_with
                if re.search(r'NOT', qualifier):
                    continue

                if dbase in self.localtt:
                    dbase = self.localtt[dbase]
                uniprotid = None
                gene_id = None
                if dbase == 'UniProtKB':
                    if id_map is not None and gene_num in id_map:
                        gene_id = id_map[gene_num]
                        uniprotid = ':'.join((dbase, gene_num))
                        (dbase, gene_num) = gene_id.split(':')
                        uniprot_hit += 1
                    else:
                        # LOG.warning(
                        #   "UniProt id %s  is without a 1:1 mapping to entrez/ensembl",
                        #    gene_num)
                        uniprot_miss += 1
                        continue
                else:
                    gene_num = gene_num.split(':')[-1]  # last
                    gene_id = ':'.join((dbase, gene_num))

                if self.test_mode and not (re.match(r'NCBIGene', gene_id)
                                           and int(gene_num) in self.test_ids):
                    continue

                model.addClassToGraph(gene_id, gene_symbol)
                if gene_name != '':
                    model.addDescription(gene_id, gene_name)
                if gene_synonym != '':
                    for syn in re.split(r'\|', gene_synonym):
                        model.addSynonym(gene_id, syn.strip())
                if re.search(r'\|', taxon):
                    # TODO add annotations with >1 taxon
                    LOG.info(">1 taxon (%s) on line %d.  skipping", taxon,
                             line_counter)
                else:
                    tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                    geno.addTaxon(tax_id, gene_id)

                assoc = Assoc(graph, self.name)
                assoc.set_subject(gene_id)
                assoc.set_object(go_id)

                try:
                    eco_id = eco_map[eco_symbol]
                    assoc.add_evidence(eco_id)
                except KeyError:
                    LOG.error("Evidence code (%s) not mapped", eco_symbol)

                refs = re.split(r'\|', ref)
                for ref in refs:
                    ref = ref.strip()
                    if ref != '':
                        prefix = ref.split(':')[0]  # sidestep 'MGI:MGI:'
                        if prefix in self.localtt:
                            prefix = self.localtt[prefix]
                        ref = ':'.join((prefix, ref.split(':')[-1]))
                        refg = Reference(graph, ref)
                        if prefix == 'PMID':
                            ref_type = self.globaltt['journal article']
                            refg.setType(ref_type)
                        refg.addRefToGraph()
                        assoc.add_source(ref)

                # TODO add the source of the annotations from assigned by?

                rel = self.resolve(aspect, mandatory=False)
                if rel is not None and aspect == rel:
                    if aspect == 'F' and re.search(r'contributes_to',
                                                   qualifier):
                        assoc.set_relationship(self.globaltt['contributes to'])
                    else:
                        LOG.error(
                            "Aspect: %s with qualifier: %s  is not recognized",
                            aspect, qualifier)
                elif rel is not None:
                    assoc.set_relationship(rel)
                    assoc.add_association_to_graph()
                else:
                    LOG.warning("No predicate for association \n%s\n",
                                str(assoc))

                if uniprotid is not None:
                    assoc.set_description('Mapped from ' + uniprotid)
                # object_type should be one of:
                # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
                # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
                # If the precise product type is unknown,
                # gene_product should be used
                #######################################################################

                # Derive G2P Associations from IMP annotations
                # in version 2.1 Pipe will indicate 'OR'
                # and Comma will indicate 'AND'.
                # in version 2.0, multiple values are separated by pipes
                # where the pipe has been used to mean 'AND'
                if eco_symbol == 'IMP' and with_or_from != '':
                    withitems = re.split(r'\|', with_or_from)
                    phenotypeid = go_id + 'PHENOTYPE'
                    # create phenotype associations
                    for i in withitems:
                        if i == '' or re.match(
                                r'(UniProtKB|WBPhenotype|InterPro|HGNC)', i):
                            LOG.warning(
                                "Don't know what having a uniprot id " +
                                "in the 'with' column means of %s", uniprotid)
                            continue
                        i = re.sub(r'MGI\:MGI\:', 'MGI:', i)
                        i = re.sub(r'WB:', 'WormBase:', i)

                        # for worms and fish, they might give a RNAi or MORPH
                        # in these cases make a reagent-targeted gene
                        if re.search('MRPHLNO|CRISPR|TALEN', i):
                            targeted_gene_id = zfin.make_targeted_gene_id(
                                gene_id, i)
                            geno.addReagentTargetedGene(
                                i, gene_id, targeted_gene_id)
                            # TODO PYLINT why is this needed?
                            # Redefinition of assoc type from
                            # dipper.models.assoc.Association.Assoc to
                            # dipper.models.assoc.G2PAssoc.G2PAssoc
                            assoc = G2PAssoc(graph, self.name,
                                             targeted_gene_id, phenotypeid)
                        elif re.search(r'WBRNAi', i):
                            targeted_gene_id = wbase.make_reagent_targeted_gene_id(
                                gene_id, i)
                            geno.addReagentTargetedGene(
                                i, gene_id, targeted_gene_id)
                            assoc = G2PAssoc(graph, self.name,
                                             targeted_gene_id, phenotypeid)
                        else:
                            assoc = G2PAssoc(graph, self.name, i, phenotypeid)
                        for ref in refs:
                            ref = ref.strip()
                            if ref != '':
                                prefix = ref.split(':')[0]
                                if prefix in self.localtt:
                                    prefix = self.localtt[prefix]
                                ref = ':'.join((prefix, ref.split(':')[-1]))
                                assoc.add_source(ref)
                                # experimental phenotypic evidence
                                assoc.add_evidence(self.globaltt[
                                    'experimental phenotypic evidence'])
                        assoc.add_association_to_graph()
                        # TODO should the G2PAssoc be
                        # the evidence for the GO assoc?

                if not self.test_mode and limit is not None and line_counter > limit:
                    break
            uniprot_tot = (uniprot_hit + uniprot_miss)
            uniprot_per = 0.0
            if uniprot_tot != 0:
                uniprot_per = 100.0 * uniprot_hit / uniprot_tot
            LOG.info(
                "Uniprot: %f.2%% of %i benifited from the 1/4 day id mapping download",
                uniprot_per, uniprot_tot)
        return
Esempio n. 19
0
    def _process_ortholog_classes(self, limit=None):
        """
        Add the KEGG orthology classes to the graph.

        Each row of the tab-delimited 'ortholog_classes' file is an
        (id, name) pair; the name holds one or more labels separated
        by ';' or ','.  The first label becomes the class label, the
        remaining labels are added as synonyms and the last one doubles
        as the description.

        If there's an embedded enzyme commission number in the description,
        that is added as an xref.

        Triples created:
        <orthology_class_id> is a class
        <orthology_class_id> has label <orthology_symbols>
        <orthology_class_id> has description <orthology_description>
        :param limit: outside of test mode, stop once the csv reader has
            consumed more than this many lines (None means no limit)

        :return: None
        """

        LOG.info("Processing ortholog classes")
        graph = self.testgraph if self.test_mode else self.graph
        model = Model(graph)
        raw = '/'.join((self.rawdir, self.files['ortholog_classes']['file']))

        # enzyme commission number (EC:1.2.99.5)
        # sometimes there's two, like [EC:1.3.5.1 1.3.5.4]
        # can also have a dash, like EC:1.10.3.-
        ec_pattern = re.compile(r'((?:\d+|\.|-){5,7})')

        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in reader:
                (orthology_class_id, orthology_class_name) = row

                # the test id list holds the raw (un-prefixed) identifiers
                if self.test_mode and orthology_class_id \
                        not in self.test_ids['orthology_classes']:
                    continue

                # The orthology class is essentially a KEGG gene ID
                # that is species agnostic.
                # Add the ID and label as a gene family class

                other_labels = re.split(r'[;,]', orthology_class_name)
                # the first one is the label we'll use
                orthology_label = other_labels[0]

                orthology_class_id = 'KEGG-' + orthology_class_id.strip()

                orthology_type = self.globaltt['gene_family']
                model.addClassToGraph(
                    orthology_class_id, orthology_label, orthology_type)

                if len(other_labels) > 1:
                    # add the rest as synonyms; the first entry is already
                    # the class label so it is not repeated as a synonym
                    for synonym in other_labels[1:]:
                        model.addSynonym(orthology_class_id, synonym.strip())

                    # add the last one as the description
                    description = other_labels[-1]
                    model.addDescription(orthology_class_id, description)

                    # findall always yields a (possibly empty) list,
                    # so no None check is needed before iterating
                    for ecm in ec_pattern.findall(description):
                        model.addXref(orthology_class_id, 'EC:' + ecm)

                if not self.test_mode and limit is not None \
                        and reader.line_num > limit:
                    break
        LOG.info("Done with ortholog classes")
Esempio n. 20
0
    def _process_diseasegene(self, limit):
        """
        Parse the 'disease-gene' XML file and add its content to the graph.

        For every ``Disorder`` element the disorder is added as a class.
        Each ``DisorderGeneAssociation`` below it contributes:
          * the gene as a class (typed via ``self.resolve``, described by
            its name, with its synonyms),
          * a gene-to-disease link through ``self.add_gene_to_disease``
            carrying an evidence code derived from the association status,
          * an equivalent class for each Ensembl / HGNC / OMIM
            external reference of the gene.

        :param limit: outside of test mode, stop once more than this many
            Disorder elements have been processed (None means no limit).
            In test mode only the configured test disorders are processed
            and the limit is ignored, as in the sibling parsers.
        :return: None
        """
        graph = self.testgraph if self.testMode else self.graph
        model = Model(graph)

        myfile = '/'.join((self.rawdir, self.files['disease-gene']['file']))

        # external reference sources we make equivalent -> CURIE prefix
        # (all other sources are skipped for now)
        xref_prefix = {'Ensembl': 'ENSEMBL:', 'HGNC': 'HGNC:', 'OMIM': 'OMIM:'}

        # number of Disorder elements seen; drives the limit check
        line_counter = 0

        for _, elem in ET.iterparse(myfile):
            if elem.tag != 'Disorder':
                continue
            line_counter += 1

            # get the element name and id, ignores element name
            # id = elem.get('id') # some internal identifier
            disorder_num = elem.find('OrphaNumber').text
            disorder_id = 'Orphanet:' + str(disorder_num)

            if self.testMode and disorder_id \
                    not in config.get_config()['test_ids']['disease']:
                elem.clear()  # release the skipped subtree as well
                continue

            disorder_label = elem.find('Name').text

            # make a hash of internal gene id to type for later lookup
            gene_iid_to_type = {
                gene.get('id'): gene.find('GeneType').get('id')
                for gene in elem.find('GeneList').findall('Gene')}

            # assuming that these are in the ontology
            model.addClassToGraph(disorder_id, disorder_label)

            assoc_list = elem.find('DisorderGeneAssociationList')
            for assoc in assoc_list.findall('DisorderGeneAssociation'):
                gene_iid = assoc.find('.//Gene').get('id')
                gene_name = assoc.find('.//Gene/Name').text
                gene_symbol = assoc.find('.//Gene/Symbol').text
                gene_num = assoc.find('./Gene/OrphaNumber').text
                gene_id = 'Orphanet:' + str(gene_num)
                gene_type_id = self.resolve(gene_iid_to_type[gene_iid])
                model.addClassToGraph(
                    gene_id, gene_symbol, gene_type_id, gene_name)

                syn_list = assoc.find('./Gene/SynonymList')
                if int(syn_list.get('count')) > 0:
                    for synonym in syn_list.findall('./Synonym'):
                        model.addSynonym(gene_id, synonym.text)

                # The association type *label* is handed on as is.
                # (Resolving the type id to a relation, and modelling the
                # link through an anonymous variant locus with a G2PAssoc,
                # were both tried earlier and are disabled for now.)
                dg_label = assoc.find(
                    './DisorderGeneAssociationType/Name').text

                # use "assessed" status to issue an evidence code
                # FIXME I think that these codes are sub-optimal
                # TODO are these internal ids stable between releases?
                status_code = assoc.find(
                    'DisorderGeneAssociationStatus').get('id')
                if status_code == '17991':  # Assessed
                    eco_label = (
                        'imported manually asserted information '
                        'used in automatic assertion')
                else:
                    eco_label = (
                        'imported automatically asserted information '
                        'used in automatic assertion')
                eco_id = self.globaltt[eco_label]

                self.add_gene_to_disease(
                    dg_label, gene_id, gene_symbol, disorder_id, eco_id)

                rlist = assoc.find('./Gene/ExternalReferenceList')
                for xref in rlist.findall('ExternalReference'):
                    prefix = xref_prefix.get(xref.find('Source').text)
                    if prefix is None:
                        # unrecognised source: nothing to add
                        # (and do not re-add the previous reference)
                        continue
                    eqid = prefix + xref.find('Reference').text
                    model.addClassToGraph(eqid, None)
                    model.addEquivalentClass(gene_id, eqid)

            elem.clear()  # empty the element

            if not self.testMode and limit is not None \
                    and line_counter > limit:
                return
Esempio n. 21
0
    def _get_chrbands(self, limit, taxon, genome_id=None):
        """
        For the given taxon, read its gzip'd chr band file and build the
        chromosome / band partonomy.
        We will not deal with the coordinate information with this parser.
        Here, we only are concerned with building the partonomy.

        Only placed chromosomes (``chr`` followed by a number or one of
        X, Y, Z, W, MT, M) are processed; unlocalized and unplaced
        scaffolds are logged and skipped.

        :param limit: stop after a processed line whose line number
            exceeds this value (None means no limit)
        :param taxon: taxon number as a string; it is the key into
            ``self.files`` and is appended to 'NCBITaxon:'
        :param genome_id: identifier for the genome; when None one is
            made with ``Genotype.makeGenomeID``
        :return: None

        """
        model = Model(self.graph)
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
        LOG.info("Processing Chr bands from FILE: %s", myfile)
        geno = Genotype(self.graph)

        # build the organism's genome from the taxon
        genome_label = self.files[taxon]['genome_label']
        taxon_id = 'NCBITaxon:' + taxon

        # add the taxon as a class.  adding the class label elsewhere
        model.addClassToGraph(taxon_id, None)
        model.addSynonym(taxon_id, genome_label)

        if genome_id is None:
            genome_id = geno.makeGenomeID(taxon_id)  # makes a blank node always
        geno.addGenome(taxon_id, genome_label, genome_id)
        model.addOWLPropertyClassRestriction(
            genome_id, self.globaltt['in taxon'], taxon_id)

        placed_scaffold_pattern = r'chr(\d+|X|Y|Z|W|MT|M)'
        # currently unused patterns
        # unlocalized_scaffold_pattern = placed_scaffold_pattern + r'_(\w+)_random'
        # unplaced_scaffold_pattern = r'chrUn_(\w+)'
        placed_scaffold_re = re.compile(placed_scaffold_pattern + r'$')

        # column layout of the band file; look the positions up once
        col = ['chrom', 'start', 'stop', 'band', 'rtype']
        chrom_idx = col.index('chrom')
        band_idx = col.index('band')
        rtype_idx = col.index('rtype')

        with gzip.open(myfile, 'rb') as reader:
            for line in reader:
                line_counter += 1
                line = line.decode().strip()
                # skip blank lines and comments
                if not line or line.startswith('#'):
                    continue
                # chr13	4500000	10000000	p12	stalk
                row = line.split('\t')
                chrom = row[chrom_idx]
                band = row[band_idx]
                rtype = row[rtype_idx]
                # NOTE
                # some less-finished genomes have placed and unplaced scaffolds
                # * Placed scaffolds:
                #    Scaffold has an oriented location within a chromosome.
                # * Unlocalized scaffolds:
                #     scaffold's chromosome is known,
                #     scaffold's position, orientation or both is not known.
                # * Unplaced scaffolds:
                #   it is not known which chromosome the scaffold belongs to.
                # find out if the thing is a full on chromosome, or a scaffold:
                # ex: unlocalized scaffold: chr10_KL568008v1_random
                # ex: unplaced scaffold: chrUn_AABR07022428v1

                if placed_scaffold_re.match(chrom) is None:
                    # let's skip over anything that isn't a placed_scaffold
                    LOG.info("Skipping non-placed chromosome %s", chrom)
                    continue

                # the chrom class, taxon as the reference
                cclassid = makeChromID(chrom, taxon, 'CHR')

                # add the chromosome as a class
                geno.addChromosomeClass(chrom, taxon_id, genome_label)
                model.addOWLPropertyClassRestriction(
                    cclassid, self.globaltt['member of'], genome_id)

                # add the band(region) as a class
                maplocclass_id = cclassid + band
                maplocclass_label = makeChromLabel(chrom + band, genome_label)
                if band.strip() != '':
                    region_type_id = self.map_type_of_region(rtype)
                    model.addClassToGraph(
                        maplocclass_id, maplocclass_label, region_type_id)
                else:
                    # no band given: the row describes the chromosome itself
                    region_type_id = self.globaltt['chromosome']

                # add the staining intensity of the band
                if re.match(r'g(neg|pos|var)', rtype):
                    if region_type_id in [
                            self.globaltt['chromosome_band'],
                            self.globaltt['chromosome_subband']]:
                        stain_type = self.resolve(rtype)
                        if stain_type is not None:
                            model.addOWLPropertyClassRestriction(
                                maplocclass_id,
                                self.globaltt['has_sequence_attribute'],
                                stain_type)
                    else:
                        # usually happens if it's a chromosome because
                        # they don't actually have banding info
                        LOG.info("feature type %s != chr band", region_type_id)
                else:
                    LOG.info('staining type not found for: %s', rtype)

                # get the parent bands, and make them unique;
                # reverse alphabetical sort puts them smallest to biggest
                parents = sorted(
                    self.make_parent_bands(band, set()), reverse=True)
                last_idx = len(parents) - 1

                # add the parents to the graph, in hierarchical order
                # TODO this is somewhat inefficient due to
                # re-adding upper-level nodes when iterating over the file
                for idx, prnt in enumerate(parents):
                    parent = prnt.strip()
                    if parent == "":
                        continue
                    pclassid = cclassid + parent  # class chr parts
                    pclass_label = makeChromLabel(chrom + parent, genome_label)
                    rti = getChrPartTypeByNotation(parent, self.graph)
                    model.addClassToGraph(pclassid, pclass_label, rti)

                    # for canonical chromosomes,
                    # then the subbands are subsequences of the full band
                    # add the subsequence stuff as restrictions
                    if idx != last_idx:
                        # the next entry in the list is the enclosing band
                        pid = cclassid + parents[idx + 1]
                    else:
                        # the last one (p or q usually)
                        # is attached to the chromosome itself
                        pid = cclassid
                    model.addOWLPropertyClassRestriction(
                        pclassid, self.globaltt['is subsequence of'], pid)
                    model.addOWLPropertyClassRestriction(
                        pid, self.globaltt['has subsequence'], pclassid)

                # connect the band here to the first one in the parent list
                if parents:
                    model.addOWLPropertyClassRestriction(
                        maplocclass_id, self.globaltt['is subsequence of'],
                        cclassid + parents[0])
                    model.addOWLPropertyClassRestriction(
                        cclassid + parents[0], self.globaltt['has subsequence'],
                        maplocclass_id)

                if limit is not None and line_counter > limit:
                    break
Esempio n. 22
0
    def process_gaf(self, file, limit, id_map=None):
        """
        Read a gzip'd, tab-delimited gene association file
        and add its annotations to the graph.

        Every non-comment row must have 17 columns.  For each usable row:
          * the gene is added as a class (with description, synonyms
            and taxon),
          * a gene -> GO term association is added, with evidence,
            references and a relationship chosen from the aspect column,
          * for 'IMP' evidence with a non-empty with/from column,
            genotype-to-phenotype associations are derived as well.

        Rows with a 'NOT' qualifier, rows missing a required field and
        UniProtKB rows without exactly one mapped gene are skipped.

        :param file: path of the gzip compressed association file
        :param limit: outside of test mode, stop after a processed row
            whose row number exceeds this value (None means no limit)
        :param id_map: optional mapping of a UniProtKB accession to a list
            of gene identifiers; without it UniProtKB rows are skipped
        :return: None
        """
        g = self.testgraph if self.testMode else self.graph

        model = Model(g)
        geno = Genotype(g)
        logger.info("Processing Gene Associations from %s", file)
        line_counter = 0

        # helpers to mint reagent-targeted gene ids; only built for the
        # taxa requested, but both names are always bound
        zfin = None
        wbase = None
        if 7955 in self.tax_ids:
            zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
        if 6239 in self.tax_ids:
            wbase = WormBase(self.graph_type, self.are_bnodes_skized)

        # the aspect column selects the relationship (same for every row)
        aspect_rel_map = {
            'P': model.object_properties['involved_in'],  # involved in
            'F': model.object_properties['enables'],  # enables
            'C': model.object_properties['part_of']  # part of
        }

        with gzip.open(file, 'rb') as csvfile:
            filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                    delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                # skip blank lines; comments start with exclamation
                if not row or ''.join(row).startswith('!'):
                    continue
                (db, gene_num, gene_symbol, qualifier, go_id, ref, eco_symbol,
                 with_or_from, aspect, gene_name, gene_synonym, object_type,
                 taxon, date, assigned_by, annotation_extension,
                 gene_product_form_id) = row

                # test for required fields
                required = (
                    db, gene_num, gene_symbol, go_id, ref, eco_symbol,
                    aspect, object_type, taxon, date, assigned_by)
                if any(field == '' for field in required):
                    # the row is passed as an argument so that a '%' in the
                    # data cannot be mistaken for a format directive
                    logger.error(
                        "Missing required part of annotation on row %d:\n%s",
                        line_counter, '\t'.join(row))
                    continue

                # deal with qualifier NOT, contributes_to, colocalizes_with
                if 'NOT' in qualifier:
                    continue

                db = self.clean_db_prefix(db)
                uniprotid = None
                gene_id = None
                if db == 'UniProtKB':
                    mapped_ids = None
                    if id_map is not None:
                        mapped_ids = id_map.get(gene_num)
                    # only an unambiguous 1:1 mapping is usable;
                    # unmapped, empty and >1 gene mappings are skipped
                    if mapped_ids is None or len(mapped_ids) != 1:
                        continue
                    gene_id = mapped_ids[0]
                    uniprotid = ':'.join((db, gene_num))
                    gene_num = re.sub(r'\w+\:', '', gene_id)
                elif db == 'MGI':
                    gene_num = gene_num.replace('MGI:', '')
                    gene_id = ':'.join((db, gene_num))
                    gene_id = re.sub(r'MGI\:MGI\:', 'MGI:', gene_id)
                else:
                    gene_id = ':'.join((db, gene_num))

                # in test mode only NCBIGene ids from the test set pass
                if self.testMode and not (
                        gene_id.startswith('NCBIGene')
                        and int(gene_num) in self.test_ids):
                    continue

                model.addClassToGraph(gene_id, gene_symbol)
                if gene_name != '':
                    model.addDescription(gene_id, gene_name)
                if gene_synonym != '':
                    for synonym in gene_synonym.split('|'):
                        model.addSynonym(gene_id, synonym.strip())
                if '|' in taxon:
                    # TODO add annotations with >1 taxon
                    logger.info(">1 taxon (%s) on line %d.  skipping", taxon,
                                line_counter)
                else:
                    tax_id = taxon.replace('taxon:', 'NCBITaxon:')
                    geno.addTaxon(tax_id, gene_id)

                assoc = Assoc(g, self.name)

                assoc.set_subject(gene_id)
                assoc.set_object(go_id)

                eco_id = self.map_go_evidence_code_to_eco(eco_symbol)
                if eco_id is not None:
                    assoc.add_evidence(eco_id)

                # normalize the pipe separated references once;
                # they are used for the GO and the G2P associations
                clean_refs = []
                for raw_ref in ref.split('|'):
                    raw_ref = raw_ref.strip()
                    if raw_ref == '':
                        continue
                    prefix = raw_ref.split(':')[0]
                    # swap the leading prefix only (no regex involved, so
                    # neither special characters nor repeats can misfire)
                    cleaned = (
                        self.clean_db_prefix(prefix) + raw_ref[len(prefix):])
                    cleaned = re.sub(r'MGI\:MGI\:', 'MGI:', cleaned)
                    clean_refs.append(cleaned)

                for cleaned in clean_refs:
                    reference = Reference(g, cleaned)
                    if cleaned.startswith('PMID'):
                        ref_type = Reference.ref_types['journal_article']
                        reference.setType(ref_type)
                    reference.addRefToGraph()
                    assoc.add_source(cleaned)

                # TODO add the source of the annotations from assigned by?

                if aspect not in aspect_rel_map:
                    # NOTE(review): the association is still added below
                    # with a relationship of None -- confirm that is wanted
                    logger.error("Aspect not recognized: %s", aspect)

                rel = aspect_rel_map.get(aspect)
                if aspect == 'F' and 'contributes_to' in qualifier:
                    rel = model.object_properties['contributes_to']
                assoc.set_relationship(rel)
                if uniprotid is not None:
                    assoc.set_description('Mapped from ' + uniprotid)
                # object_type should be one of:
                # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
                # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
                # If the precise product type is unknown,
                # gene_product should be used

                assoc.add_association_to_graph()

                # Derive G2P Associations from IMP annotations
                # in version 2.1 Pipe will indicate 'OR'
                # and Comma will indicate 'AND'.
                # in version 2.0, multiple values are separated by pipes
                # where the pipe has been used to mean 'AND'
                if eco_symbol == 'IMP' and with_or_from != '':
                    phenotypeid = go_id + 'PHENOTYPE'
                    # create phenotype associations
                    for i in with_or_from.split('|'):
                        if i == '' or re.match(
                                r'(UniProtKB|WBPhenotype|InterPro|HGNC)', i):
                            logger.warning(
                                "Don't know what having a uniprot id "
                                "in the 'with' column means of %s",
                                uniprotid)
                            continue
                        i = re.sub(r'MGI\:MGI\:', 'MGI:', i)
                        i = re.sub(r'WB:', 'WormBase:', i)

                        # for worms and fish, they might give a RNAi or MORPH
                        # in these cases make a reagent-targeted gene
                        if re.search('MRPHLNO|CRISPR|TALEN', i):
                            if zfin is None:
                                logger.warning(
                                    "Taxon 7955 was not requested; "
                                    "skipping targeting reagent %s", i)
                                continue
                            targeted_gene_id = zfin.make_targeted_gene_id(
                                gene_id, i)
                            geno.addReagentTargetedGene(
                                i, gene_id, targeted_gene_id)
                            g2p_subject = targeted_gene_id
                        elif re.search(r'WBRNAi', i):
                            if wbase is None:
                                logger.warning(
                                    "Taxon 6239 was not requested; "
                                    "skipping targeting reagent %s", i)
                                continue
                            targeted_gene_id = \
                                wbase.make_reagent_targeted_gene_id(
                                    gene_id, i)
                            geno.addReagentTargetedGene(
                                i, gene_id, targeted_gene_id)
                            g2p_subject = targeted_gene_id
                        else:
                            g2p_subject = i

                        # kept under its own name so the GO association
                        # above is not rebound to a different type
                        g2p_assoc = G2PAssoc(
                            g, self.name, g2p_subject, phenotypeid)
                        for cleaned in clean_refs:
                            g2p_assoc.add_source(cleaned)
                            # experimental phenotypic evidence
                            g2p_assoc.add_evidence("ECO:0000059")
                        g2p_assoc.add_association_to_graph()
                        # TODO should the G2PAssoc be
                        # the evidence for the GO assoc?

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break
Esempio n. 23
0
    def _process_diseasegene(self, limit):
        """
        Stream the disease-gene XML file and add one gene-to-disorder
        association to the graph per ``DisorderGeneAssociation`` element.

        For every ``Disorder`` element:
          * the disorder is added as a class (``ORPHA:<OrphaNumber>``),
          * each associated gene gets a curie, preferring an HGNC, Ensembl
            or SwissProt external reference (in that order) and falling
            back to the gene's own ORPHA number,
          * the remaining external references are added as equivalent
            classes of that gene curie,
          * gene synonyms are attached to the gene curie,
          * a G2PAssoc is created whose relation comes from the association
            type name and whose evidence comes from the association status
            name (both translated with ``self.resolve``).

        :param limit: when not in test mode, stop after more than ``limit``
            disorders have been processed; ``None`` means no limit
        :return: None
        """
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        line_counter = 0

        model = Model(graph)

        myfile = '/'.join((self.rawdir, self.files['disease-gene']['file']))

        for _event, elem in ET.iterparse(myfile):
            if elem.tag == 'Disorder':
                # get the element name and id, ignore element name
                # id = elem.get('id') # some internal identifier
                disorder_num = elem.find('OrphaNumber').text
                disorder_id = 'ORPHA:' + str(disorder_num)

                if self.test_mode and disorder_id not in self.all_test_ids[
                        'disease']:
                    continue
                line_counter += 1
                disorder_label = elem.find('Name').text

                # assuming that these are in the ontology (...any particular one?)
                model.addClassToGraph(disorder_id, disorder_label)
                assoc_list = elem.find('DisorderGeneAssociationList')
                expected_genes = assoc_list.get('count')
                LOG.info('Expecting %s genes associated with disorder %s.',
                         expected_genes, disorder_id)
                processed_genes = 0
                for assoc in assoc_list.findall('DisorderGeneAssociation'):
                    processed_genes += 1
                    gene = assoc.find('Gene')

                    # get gene's curie  HGNC or Ensembl ...

                    lclid = gene.find('OrphaNumber').text
                    gene_curie = 'ORPHA:' + lclid
                    gene_set = {'ORPHA': lclid}
                    for gene_ref in gene.findall(
                            './ExternalReferenceList/ExternalReference'):
                        gene_set[gene_ref.find('Source').text] = \
                            gene_ref.find('Reference').text

                    # set priority (clique leader if available)
                    # but default to ORPHA
                    for source in ('HGNC', 'Ensembl', 'SwissProt'):
                        if source in gene_set:
                            # look the id up under the source name as found
                            # in the file; only the curie prefix is
                            # translated through localtt
                            leader_lclid = gene_set.pop(source)
                            if source in self.localtt:
                                pfx = self.localtt[source]
                            else:
                                pfx = source
                            gene_curie = pfx + ':' + leader_lclid
                            model.addClassToGraph(gene_curie, None)
                            break

                    # TEC have reservations w.r.t aggregator links
                    # being gene classes
                    for source, lclid in gene_set.items():
                        if source in self.localtt:
                            prefix = self.localtt[source]
                        else:
                            prefix = source

                        dbxref = prefix + ':' + lclid

                        if gene_curie != dbxref:
                            model.addClassToGraph(dbxref, None)
                            model.addEquivalentClass(gene_curie, dbxref)

                    # TEC. would prefer this not happen here. let HGNC handle it
                    # except there are some w/o explicit external links ...

                    syn_list = gene.find('./SynonymList')
                    if syn_list is not None and \
                            int(syn_list.get('count')) > 0:
                        for syn in syn_list.findall('./Synonym'):
                            model.addSynonym(gene_curie, syn.text)

                    dg_label = assoc.find(
                        './DisorderGeneAssociationType/Name').text

                    # use dg association status to issue an evidence code
                    # FIXME I think that these codes are sub-optimal
                    eco_id = self.resolve(
                        assoc.find('DisorderGeneAssociationStatus/Name').text)

                    rel_id = self.resolve(dg_label)

                    # write to the graph selected above so that test-mode
                    # associations land in the test graph
                    g2p_assoc = G2PAssoc(graph, self.name, gene_curie,
                                         disorder_id, rel_id)
                    g2p_assoc.add_evidence(eco_id)
                    g2p_assoc.add_association_to_graph()

                elem.clear()  # empty the element
                if int(expected_genes) != processed_genes:
                    LOG.warning(
                        '%s expected %s associated genes but we processed %i',
                        disorder_id, expected_genes, processed_genes)

            if not self.test_mode and \
                    limit is not None and line_counter > limit:
                return

        return
Esempio n. 24
0
    def _process_genes(self, limit=None):
        """
        This method processes the KEGG gene IDs.
        The label for the gene is pulled as
        the first symbol in the list of gene symbols;
        the rest are added as synonyms.
        The long-form of the gene name is added as a definition.
        If a third, semicolon-delimited part carries a KO identifier
        (``K`` followed by digits), the gene is made a member of that
        ``KEGG-ko:`` family.
        This is hardcoded to just process human genes
        (it reads the ``hsa_genes`` file).

        Triples created:
        <gene_id> is a SO:gene
        <gene_id> rdfs:label <gene_name>

        :param limit: when not in test mode, stop once the reader has
            passed this many lines; ``None`` means no limit
        :return:

        """

        LOG.info("Processing genes")
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        family = Family(graph)
        geno = Genotype(graph)
        raw = '/'.join((self.rawdir, self.files['hsa_genes']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in reader:
                (gene_id, gene_name) = row

                gene_id = 'KEGG-'+gene_id.strip()

                # the gene listing has a bunch of labels
                # that are delimited, as:
                # DST, BP240, BPA, BPAG1, CATX-15, CATX15, D6S1101, DMH, DT,
                # EBSB2, HSAN6, MACF2; dystonin; K10382 dystonin
                # it looks like the list is semicolon delimited
                # (symbol, name, gene_class)
                # where the symbol is a comma-delimited list

                # here, we split them up.
                # we will take the first abbreviation and make it the symbol
                # then take the rest as synonyms

                gene_stuff = gene_name.split(';')
                symbollist = gene_stuff[0].split(',')
                first_symbol = symbollist[0].strip()

                # the label is recorded for every gene, even those that the
                # test-mode filter below skips
                if gene_id not in self.label_hash:
                    self.label_hash[gene_id] = first_symbol

                if self.test_mode and gene_id not in self.test_ids['genes']:
                    continue

                # Add the gene as a class.
                geno.addGene(gene_id, first_symbol)

                # add the long name as the description
                if len(gene_stuff) > 1:
                    description = gene_stuff[1].strip()
                    model.addDefinition(gene_id, description)

                # add the rest of the symbols as synonyms
                # (the first one is already the label)
                for symbol in symbollist[1:]:
                    model.addSynonym(gene_id, symbol.strip())

                if len(gene_stuff) > 2:
                    ko_part = gene_stuff[2]
                    ko_match = re.search(r'K\d+', ko_part)
                    if ko_match is not None:
                        # group(0) is the whole match, e.g. K10382
                        ko = 'KEGG-ko:'+ko_match.group(0)
                        family.addMemberOf(gene_id, ko)

                if not self.test_mode and limit is not None and reader.line_num > limit:
                    break

        LOG.info("Done with genes")
Esempio n. 25
0
    def _get_gene_info(self, limit):
        """
        Currently loops through the gzipped, tab-delimited gene_info file
        and creates the genes as classes (or as individuals when the mapped
        type is SO:0000110), typed with SO.  It will add their label,
        any alternate labels as synonyms, and hand the dbxrefs column to
        ``_add_gene_equivalencies``.
        The chromosome and chr band get added as regions,
        and the gene is made a subsequence of the chr band
        (or of the chromosome when the map location is not a plain band).

        Side effect: ``self.class_or_indiv[gene_id]`` is set to 'C' or 'I'
        for every row that passes the taxon/gene filter.

        :param limit: when not in test mode, rows beyond this count are
            no longer added to the graph; ``None`` means no limit
        :return:

        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        geno = Genotype(g)
        model = Model(g)

        # not unzipping the file
        logger.info("Processing 'Gene Info' records")
        line_counter = 0
        gene_info = '/'.join((self.rawdir, self.files['gene_info']['file']))
        logger.info("FILE: %s", gene_info)
        # Add taxa and genome classes for those in our filter
        for tax_num in self.tax_ids:
            tax_id = ':'.join(('NCBITaxon', str(tax_num)))
            # tax label can get added elsewhere
            geno.addGenome(tax_id, str(tax_num))
            # label added elsewhere
            model.addClassToGraph(tax_id, None)
        with gzip.open(gene_info, 'rb') as f:
            # the first line is consumed as the header
            header = f.readline().decode().strip().split('\t')
            logger.info("Header has %i columns", len(header))
            for line in f:
                # skip comments
                line = line.decode().strip()
                if line.startswith('#'):
                    continue
                (tax_num, gene_num, symbol, locustag, synonyms, xrefs, chrom,
                 map_loc, desc, gtype, authority_symbol, name,
                 nomenclature_status, other_designations, modification_date,
                 feature_type) = line.split('\t')

                if self.testMode and int(gene_num) not in self.gene_ids:
                    continue

                if not self.testMode and int(tax_num) not in self.tax_ids:
                    continue

                line_counter += 1

                gene_id = ':'.join(('NCBIGene', gene_num))
                tax_id = ':'.join(('NCBITaxon', tax_num))
                gene_type_id = self.map_type_of_gene(gtype.strip())

                if symbol == 'NEWENTRY':
                    label = None
                else:
                    label = symbol
                # sequence feature, not a gene
                if gene_type_id == 'SO:0000110':
                    self.class_or_indiv[gene_id] = 'I'
                else:
                    self.class_or_indiv[gene_id] = 'C'

                # NOTE(review): this is a `continue`, not a `break`, so
                # class_or_indiv keeps being filled for rows past the limit;
                # presumably deliberate -- confirm before changing
                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    continue

                if self.class_or_indiv[gene_id] == 'C':
                    model.addClassToGraph(gene_id, label, gene_type_id, desc)
                    # NCBI will be the default leader,
                    # so we will not add the leader designation here.
                else:
                    model.addIndividualToGraph(gene_id, label, gene_type_id,
                                               desc)
                    # in this case, they aren't genes.
                    # so we want someone else to be the leader.

                if name != '-':
                    model.addSynonym(gene_id, name)
                if synonyms.strip() != '-':
                    for s in synonyms.split('|'):
                        model.addSynonym(
                            gene_id, s.strip(),
                            Assoc.annotation_properties['hasRelatedSynonym'])
                if other_designations.strip() != '-':
                    for s in other_designations.split('|'):
                        model.addSynonym(
                            gene_id, s.strip(),
                            Assoc.annotation_properties['hasRelatedSynonym'])
                if xrefs.strip() != '-':
                    self._add_gene_equivalencies(xrefs, gene_id, tax_num)

                # edge cases of id | symbol | chr | map_loc:
                # 263     AMD1P2    X|Y  with   Xq28 and Yq12
                # 438     ASMT      X|Y  with   Xp22.3 or Yp11.3    # in PAR
                # no idea why there's two bands listed - possibly 2 assemblies
                # 419     ART3      4    with   4q21.1|4p15.1-p14
                # 28227   PPP2R3B   X|Y  Xp22.33; Yp11.3            # in PAR
                # this is of "unknown" type == susceptibility
                # 619538  OMS     10|19|3 10q26.3;19q13.42-q13.43;3p25.3
                # unlocated scaffold
                # 101928066       LOC101928066    1|Un    -\
                # mouse --> 2C3
                # 11435   Chrna1  2       2 C3|2 43.76 cM
                # mouse --> 11B1.1
                # 11548   Adra1b  11      11 B1.1|11 25.81 cM
                # 11717   Ampd3   7       7 57.85 cM|7 E2-E3        # mouse
                # 14421   B4galnt1        10      10 D3|10 74.5 cM  # mouse
                # 323212  wu:fb92e12      19|20   -                 # fish
                # 323368  ints10  6|18    -                         # fish
                # 323666  wu:fc06e02      11|23   -                 # fish

                # feel that the chr placement can't be trusted in this table
                # when there is > 1 listed
                # with the exception of human X|Y,
                # we will only take those that align to one chr

                # FIXME remove the chr mapping below
                # when we pull in the genomic coords
                if chrom not in ('-', ''):
                    if '|' in chrom and chrom not in ('X|Y', 'X; Y'):
                        # means that there's uncertainty in the mapping.
                        # so skip it
                        # TODO we'll need to figure out how to deal with
                        # >1 loc mapping
                        logger.info(
                            '%s is non-uniquely mapped to %s.'
                            ' Skipping for now.', gene_id, chrom)
                        # note: this also skips the addTaxon call below
                        continue
                        # X|Y	Xp22.33;Yp11.3

                    if chrom == 'X; Y':
                        chrom = 'X|Y'  # rewrite the PAR regions for processing
                    # do this in a loop to allow PAR regions like X|Y
                    for c in chrom.split('|'):
                        # assume that the chromosome label is added elsewhere
                        geno.addChromosomeClass(c, tax_id, None)
                        mychrom = makeChromID(c, tax_num, 'CHR')
                        # temporarily use taxnum for the disambiguating label
                        mychrom_syn = makeChromLabel(c, tax_num)
                        model.addSynonym(mychrom, mychrom_syn)
                        band_match = re.match(r'[0-9A-Z]+[pq](\d+)?(\.\d+)?$',
                                              map_loc)
                        if band_match is not None:
                            # this matches the regular kind of chrs,
                            # so make that kind of band
                            # not sure why this matches?
                            #   chrX|Y or 10090chr12|Un"
                            # TODO we probably need a different regex
                            # per organism
                            # the maploc_id already has the numeric chromosome
                            # in it, strip it first
                            # (escaped so the name is matched literally)
                            bid = re.sub(r'^' + re.escape(c), '', map_loc)
                            # the generic location (no coordinates)
                            maploc_id = makeChromID(c + bid, tax_num, 'CHR')
                            # Assume it's type will be added elsewhere
                            band = Feature(g, maploc_id, None, None)
                            band.addFeatureToGraph()
                            # add the band as the containing feature
                            g.addTriple(
                                gene_id,
                                Feature.object_properties['is_subsequence_of'],
                                maploc_id)
                        else:
                            # TODO handle these cases: examples are:
                            # 15q11-q22,Xp21.2-p11.23,15q22-qter,10q11.1-q24,
                            # 12p13.3-p13.2|12p13-p12,1p13.3|1p21.3-p13.1,
                            # 12cen-q21,22q13.3|22q13.3
                            logger.debug('not regular band pattern for %s: %s',
                                         gene_id, map_loc)
                            # add the gene as a subsequence of the chromosome
                            g.addTriple(
                                gene_id,
                                Feature.object_properties['is_subsequence_of'],
                                mychrom)

                geno.addTaxon(tax_id, gene_id)

        return
Esempio n. 26
0
    def _get_equivids(self, limit):
        """
        The file processed here is of the format:
        #NBK_id GR_shortname    OMIM
        NBK1103 trimethylaminuria       136132
        NBK1103 trimethylaminuria       602079
        NBK1104 cdls    122470
        Where each of the rows represents a mapping between
        a gr id and an omim id. These are a 1:many relationship,
        and some of the omim ids are genes(not diseases).
        Therefore, we need to create a loose coupling here.
        We make the assumption that these NBKs are generally higher-level
        grouping classes; therefore the OMIM ids are treated as subclasses.

        Processing happens in two passes: the file is read into an
        NBK -> {OMIM number} map (adding each GeneReviews class and its
        short name as a synonym along the way), then the OMIM numbers are
        updated for replacements, stripped of obsolete ids and filtered to
        phenotype-like types before the subclass axioms are written.

        :param limit: when not in test mode, stop reading once the reader
            has passed this many lines; ``None`` means no limit

        """
        raw = '/'.join((self.rawdir, self.files['idmap']['file']))
        model = Model(self.graph)
        LOG.info('Looping over %s', raw)
        # we look some stuff up in OMIM, so initialize here
        # omim = OMIM(self.graph_type, self.are_bnodes_skized)
        id_map = {}
        allomimids = set()
        col = ['NBK_id', 'GR_shortname', 'OMIM']

        with open(raw, 'r', encoding="utf8") as csvfile:
            reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            row = next(reader)
            # drop the leading '#' of the header line before comparing
            row[0] = row[0][1:]
            if not self.check_fileheader(col, row):
                exit(-1)

            for row in reader:

                nbk_num = row[col.index('NBK_id')]
                shortname = row[col.index('GR_shortname')]
                omim_num = row[col.index('OMIM')].strip()
                gr_id = 'GeneReviews:' + nbk_num
                omim_id = 'OMIM:' + omim_num
                # in test mode only keep rows whose OMIM id is a test id
                if self.test_mode and omim_id not in self.test_ids:
                    continue

                # sometimes there's bad omim nums
                if len(omim_num) != 6:
                    LOG.warning(
                        "OMIM number incorrectly formatted in row %i; skipping:\n%s",
                        reader.line_num, '\t'.join(row))
                    continue

                # build up a hashmap of the mappings; then process later
                if nbk_num not in id_map:
                    id_map[nbk_num] = set()
                id_map[nbk_num].add(omim_num)

                # add the class along with the shortname
                model.addClassToGraph(gr_id, None)
                model.addSynonym(gr_id, shortname)

                allomimids.add(omim_num)

                if not self.test_mode and limit is not None and reader.line_num > limit:
                    break

            # end looping through file

        # given all_omim_ids from GR,
        # we want to update any which are changed or removed
        # before deciding which are disease / phenotypes
        replaced = allomimids & self.omim_replaced.keys()
        if replaced:
            LOG.warning("These OMIM ID's are past their pull date: %s", str(replaced))
            for oid in replaced:
                allomimids.remove(oid)
                for rep in self.omim_replaced[oid]:
                    # set.update() on a string would insert its individual
                    # characters, so a single id must go in with add()
                    if isinstance(rep, str):
                        allomimids.add(rep)
                    else:
                        allomimids.update(rep)
        # guard against omim identifiers which have been removed
        obsolete = {
            o for o in self.omim_type
            if self.omim_type[o] == self.globaltt['obsolete']}
        removed = allomimids & obsolete
        if removed:
            LOG.warning("These OMIM ID's are gone: %s", str(removed))
            for oid in removed:
                allomimids.remove(oid)
        # filter for disease /phenotype types (we can argue about what is included)
        omim_phenotypes = {
            omim for omim in self.omim_type if self.omim_type[omim] in (
                self.globaltt['Phenotype'],
                self.globaltt['has_affected_feature'],  # both a gene and a phenotype
                self.globaltt['heritable_phenotypic_marker'])}  # probable phenotype
        LOG.info(
            "Have %i omim_ids globally typed as phenotypes from OMIM",
            len(omim_phenotypes))

        entries_that_are_phenotypes = allomimids & omim_phenotypes
        LOG.info(
            "Filtered out %d/%d entries that are genes or features",
            len(allomimids - entries_that_are_phenotypes), len(allomimids))

        for nbk_num in self.book_ids:
            gr_id = 'GeneReviews:'+nbk_num
            if nbk_num in id_map:
                omim_ids = id_map.get(nbk_num)
                for omim_num in omim_ids:
                    omim_id = 'OMIM:'+omim_num
                    # add the gene reviews as a superclass to the omim id,
                    # but only if the omim id is not a gene.
                    # entries_that_are_phenotypes holds bare OMIM numbers
                    # (it is derived from allomimids), so test the number,
                    # not the prefixed curie
                    if omim_num in entries_that_are_phenotypes:
                        model.addClassToGraph(omim_id, None)
                        model.addSubClass(omim_id, gr_id)
            # add this as a generic subclass  -- TEC: this is the job of inference
            model.addSubClass(gr_id, self.globaltt['disease'])
Esempio n. 27
0
    def _get_gene_history(self, limit):
        """
        Walk the gzipped, tab-delimited gene_history file and record each
        discontinued gene id as deprecated in favor of its current gene id.
        Whether the pair is written as classes or as individuals follows
        the 'C' flag stored in ``self.class_or_indiv`` for the current id.
        The discontinued symbol is also attached to the current gene
        as a synonym.

        :param limit: outside of test mode, stop once more than this many
            rows have been accepted; ``None`` disables the cap
        :return:

        """
        graph = self.testgraph if self.testMode else self.graph
        model = Model(graph)
        logger.info("Processing Gene records")
        accepted = 0
        history_path = '/'.join(
            (self.rawdir, self.files['gene_history']['file']))
        logger.info("FILE: %s", history_path)
        with gzip.open(history_path, 'rb') as handle:
            for raw_line in handle:
                text = raw_line.decode().strip()
                # comment lines carry no data
                if text.startswith('#'):
                    continue
                (tax_num, gene_num, discontinued_num, discontinued_symbol,
                 discontinued_date) = text.split('\t')

                # a dash on either side means there is no pair to record
                if '-' in (gene_num, discontinued_num):
                    continue

                # test mode filters on gene ids, normal mode on taxon ids
                if self.testMode:
                    if int(gene_num) not in self.gene_ids:
                        continue
                elif int(tax_num) not in self.tax_ids:
                    continue

                accepted += 1
                gene_id = ':'.join(('NCBIGene', gene_num))
                discontinued_gene_id = ':'.join(('NCBIGene', discontinued_num))

                # pick the class flavor or the individual flavor of the
                # model calls, then add both genes and deprecate the old one
                if self.class_or_indiv.get(gene_id) == 'C':
                    add_entity = model.addClassToGraph
                    deprecate = model.addDeprecatedClass
                else:
                    add_entity = model.addIndividualToGraph
                    deprecate = model.addDeprecatedIndividual
                add_entity(gene_id, None)
                add_entity(discontinued_gene_id, discontinued_symbol)
                # the current gene id replaces the discontinued one
                deprecate(discontinued_gene_id, [gene_id])

                # the retired symbol stays findable on the current gene
                model.addSynonym(gene_id, discontinued_symbol)

                if not self.testMode and limit is not None \
                        and accepted > limit:
                    break

        return
Esempio n. 28
0
    def _get_chrbands(self, limit, taxon):
        """
        Read the gzipped, tab-delimited chr band file registered under
        ``self.files[taxon]`` and build the chromosome partonomy from it.
        The start/stop coordinates in each row are read but not used;
        only the containment of bands within their parent bands and
        within the chromosome is written to the graph.

        :param limit: stop once more than this many data rows have been
            read; ``None`` disables the cap
        :param taxon: key into ``self.files``; it is also appended to
            'NCBITaxon:' to form the taxon id
        :return:

        """
        model = Model(self.graph)
        family = Family(self.graph)
        line_counter = 0
        band_path = '/'.join((self.rawdir, self.files[taxon]['file']))
        logger.info("Processing Chr bands from FILE: %s", band_path)
        geno = Genotype(self.graph)

        # the organism's genome hangs off the taxon
        genome_label = self.files[taxon]['genome_label']
        taxon_id = 'NCBITaxon:'+taxon

        # the taxon goes in as a class; its label is added elsewhere
        model.addClassToGraph(taxon_id, None)
        model.addSynonym(taxon_id, genome_label)

        genome_id = geno.makeGenomeID(taxon_id)
        geno.addGenome(taxon_id, genome_label)
        model.addOWLPropertyClassRestriction(
            genome_id, Genotype.object_properties['in_taxon'],
            taxon_id)

        # NOTE
        # some less-finished genomes have placed and unplaced scaffolds
        # * Placed scaffolds:
        #    Scaffold has an oriented location within a chromosome.
        # * Unlocalized scaffolds:
        #     scaffold's chromosome is known,
        #     scaffold's position, orientation or both is not known.
        # * Unplaced scaffolds:
        #   it is not known which chromosome the scaffold belongs to.
        # ex: unlocalized scaffold: chr10_KL568008v1_random
        # ex: unplaced scaffold: chrUn_AABR07022428v1
        # Only whole, placed chromosomes match the pattern below.
        placed_chrom = re.compile(r'chr(\d+|X|Y|Z|W|MT|M)' + r'$')

        subseq_of = 'is_subsequence_of'
        has_subseq = 'has_subsequence'

        with gzip.open(band_path, 'rb') as handle:
            for raw_line in handle:
                text = raw_line.decode().strip()
                # comment lines carry no data
                if text.startswith('#'):
                    continue

                # chr13	4500000	10000000	p12	stalk
                (chrom, _start, _stop, band, rtype) = text.split('\t')
                line_counter += 1

                hit = placed_chrom.match(chrom)
                if hit is None or len(hit.groups()) != 1:
                    # anything that isn't a placed scaffold is skipped
                    # at the class level
                    logger.info("Skipping non-placed chromosome %s", chrom)
                    continue

                # the chrom class, taxon as the reference
                cclassid = makeChromID(chrom, taxon, 'CHR')

                # the chromosome itself, as a member of the genome
                geno.addChromosomeClass(chrom, taxon_id, genome_label)
                model.addOWLPropertyClassRestriction(
                    cclassid, family.object_properties['member_of'], genome_id)

                # the band (region) as a class, when the row names one
                maplocclass_id = cclassid+band
                maplocclass_label = makeChromLabel(chrom+band, genome_label)
                if band is None or band.strip() == '':
                    region_type_id = Feature.types['chromosome']
                else:
                    region_type_id = self.map_type_of_region(rtype)
                    model.addClassToGraph(
                        maplocclass_id, maplocclass_label,
                        region_type_id)

                # the staining intensity of the band
                if not re.match(r'g(neg|pos|var)', rtype):
                    logger.warning('staining type not found: %s', rtype)
                elif region_type_id not in (
                        Feature.types['chromosome_band'],
                        Feature.types['chromosome_subband']):
                    # usually happens if it's a chromosome because
                    # they don't actually have banding info
                    logger.info("feature type %s != chr band",
                                region_type_id)
                else:
                    stain_type = Feature.types.get(rtype)
                    if stain_type is not None:
                        model.addOWLPropertyClassRestriction(
                            maplocclass_id,
                            Feature.properties['has_staining_intensity'],
                            stain_type)

                # unique parent bands; the reverse alphabetical sort
                # orders them from smallest to biggest
                parents = sorted(
                    self.make_parent_bands(band, set()), reverse=True)

                # add the parents to the graph, in hierarchical order
                # TODO this is somewhat inefficient due to
                # re-adding upper-level nodes when iterating over the file
                last_index = len(parents) - 1
                for index, part in enumerate(parents):
                    pclassid = cclassid+part  # class chr parts
                    pclass_label = makeChromLabel(chrom+part, genome_label)
                    rti = getChrPartTypeByNotation(part)
                    model.addClassToGraph(pclassid, pclass_label, rti)

                    # each part sits inside the next bigger part; the last
                    # one (p or q usually) sits inside the chromosome
                    if index < last_index:
                        container_id = cclassid+parents[index+1]
                    else:
                        container_id = cclassid
                    model.addOWLPropertyClassRestriction(
                        pclassid,
                        Feature.object_properties[subseq_of],
                        container_id)
                    model.addOWLPropertyClassRestriction(
                        container_id,
                        Feature.object_properties[has_subseq],
                        pclassid)

                # connect the band here to the first one in the parent list
                if parents:
                    nearest_id = cclassid+parents[0]
                    model.addOWLPropertyClassRestriction(
                        maplocclass_id,
                        Feature.object_properties[subseq_of],
                        nearest_id)
                    model.addOWLPropertyClassRestriction(
                        nearest_id,
                        Feature.object_properties[has_subseq],
                        maplocclass_id)

                if limit is not None and line_counter > limit:
                    break

        # TODO figure out the staining intensities for the encompassing bands

        return
Esempio n. 29
0
    def _get_gene_history(self, limit):
        """
        Loops through the gene_history file and adds the old gene ids
        as deprecated classes, where the new gene id is the replacement for it.
        The old gene symbol is added as a synonym to the gene.

        Rows are skipped when either gene id is '-', when (in test mode) the
        gene id is not in ``self.gene_ids``, or when (outside test mode) the
        taxon is not in ``self.tax_ids``.

        :param limit: outside of test mode, stop once more than this many
            rows have been accepted; None means no limit
        :return: None

        """
        src_key = 'gene_history'
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        LOG.info("Processing Gene records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files[src_key]['file']))
        LOG.info("FILE: %s", myfile)
        col = self.files[src_key]['columns']
        with gzip.open(myfile, 'rb') as tsv:
            row = tsv.readline().decode().strip().split('\t')
            row[0] = row[0][1:]  # strip comment
            # the header is checked, but a mismatch does not stop processing
            self.check_fileheader(col, row)

            # resolve the column positions once rather than on every row
            tax_idx = col.index('tax_id')
            gene_idx = col.index('GeneID')
            disc_num_idx = col.index('Discontinued_GeneID')
            disc_sym_idx = col.index('Discontinued_Symbol')

            for line in tsv:
                text = line.decode().strip()
                # skip blank lines (which used to raise an IndexError on
                # row[0][0]) as well as comments
                if not text or text.startswith('#'):
                    continue
                row = text.split('\t')

                tax_num = row[tax_idx].strip()
                gene_num = row[gene_idx].strip()
                discontinued_num = row[disc_num_idx].strip()
                discontinued_symbol = row[disc_sym_idx].strip()
                # discontinued_date = row[col.index('Discontinue_Date')]

                # set filter=None in init if you don't want to have a filter
                # if self.id_filter is not None:
                #     if ((self.id_filter == 'taxids' and \
                #          (int(tax_num) not in self.tax_ids))
                #             or (self.id_filter == 'geneids' and \
                #                 (int(gene_num) not in self.gene_ids))):
                #         continue
                #  end filter

                if gene_num == '-' or discontinued_num == '-':
                    continue

                if self.test_mode and gene_num not in self.gene_ids:
                    continue

                if not self.test_mode and tax_num not in self.tax_ids:
                    continue

                line_counter += 1
                gene_id = ':'.join(('NCBIGene', gene_num))
                discontinued_gene_id = ':'.join(('NCBIGene', discontinued_num))
                # add the two genes, as classes or as individuals depending
                # on how the current gene id was previously recorded
                if self.class_or_indiv.get(gene_id) == 'C':
                    model.addClassToGraph(gene_id, None)
                    model.addClassToGraph(discontinued_gene_id, discontinued_symbol)

                    # add the new gene id to replace the old gene id
                    model.addDeprecatedClass(discontinued_gene_id, [gene_id])
                else:
                    model.addIndividualToGraph(gene_id, None)
                    model.addIndividualToGraph(
                        discontinued_gene_id, discontinued_symbol)
                    model.addDeprecatedIndividual(discontinued_gene_id, [gene_id])

                # also add the old symbol as a synonym of the new gene
                model.addSynonym(gene_id, discontinued_symbol)

                if not self.test_mode and (limit is not None and line_counter > limit):
                    break
Esempio n. 30
0
    def _get_gene_history(self, limit):
        """
        Read the gzipped gene_history file and record every discontinued
        gene id as deprecated in favour of its current gene id; the
        discontinued symbol becomes a synonym of the current gene.

        :param limit: outside of test mode, stop once more than this many
            rows have been accepted; None means no limit
        :return: None

        """
        model = Model(self.testgraph if self.testMode else self.graph)
        logger.info("Processing Gene records")
        history_path = '/'.join(
            (self.rawdir, self.files['gene_history']['file']))
        logger.info("FILE: %s", history_path)

        accepted = 0
        with gzip.open(history_path, 'rb') as handle:
            for raw_line in handle:
                text = raw_line.decode().strip()
                # comment lines start with a hash
                if text.startswith('#'):
                    continue

                # exactly five tab-separated fields per row;
                # the discontinued date is read but not used
                (tax_num, gene_num, discontinued_num, discontinued_symbol,
                 _discontinued_date) = text.split('\t')

                # set filter=None in init if you don't want to have a filter
                # if self.filter is not None:
                #     if ((self.filter == 'taxids' and \
                #          (int(tax_num) not in self.tax_ids))
                #             or (self.filter == 'geneids' and \
                #                 (int(gene_num) not in self.gene_ids))):
                #         continue
                #  end filter

                # a dash means there is no usable id on one side
                if '-' in (gene_num, discontinued_num):
                    continue

                # test mode filters on gene ids, normal mode on taxon ids
                if self.testMode:
                    if int(gene_num) not in self.gene_ids:
                        continue
                elif int(tax_num) not in self.tax_ids:
                    continue

                accepted += 1
                gene_id = ':'.join(('NCBIGene', gene_num))
                discontinued_gene_id = ':'.join(('NCBIGene', discontinued_num))

                # both genes are added the same way the current gene is
                # known: as classes when flagged 'C', else as individuals
                if self.class_or_indiv.get(gene_id) == 'C':
                    add_node = model.addClassToGraph
                    deprecate = model.addDeprecatedClass
                else:
                    add_node = model.addIndividualToGraph
                    deprecate = model.addDeprecatedIndividual

                add_node(gene_id, None)
                add_node(discontinued_gene_id, discontinued_symbol)
                # the current gene id replaces the discontinued one
                deprecate(discontinued_gene_id, [gene_id])

                # the old symbol stays reachable as a synonym
                model.addSynonym(gene_id, discontinued_symbol)

                if not self.testMode and limit is not None \
                        and accepted > limit:
                    break

        return
Esempio n. 31
0
    def process_feature_loc(self, limit):
        """
        Read the gzipped, tab-delimited feature location file (nine columns
        per row) and add the selected features, with their start and end
        locations on a chromosome of the current build, to the graph.

        Only a fixed set of feature types is processed.  Identifiers come
        from the ``ID`` attribute, else the ``variation`` attribute, else a
        ``Name`` beginning with ``WBsf``; rows without a usable identifier,
        gmap/landmark rows and polymorphisms are skipped.  Genes are added
        as classes, everything else as individuals.

        :param limit: outside of test mode, stop once more than this many
            rows of a wanted feature type have been read; None means no limit
        :return: None

        """

        raw = '/'.join((self.rawdir, self.files['feature_loc']['file']))

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        LOG.info("Processing Feature location and attributes")
        line_counter = 0
        geno = Genotype(graph)
        # NOTE(review): filled in below "for later processing" but neither
        # returned nor stored on self within this method
        strain_to_variant_map = {}
        build_num = self.version_num
        build_id = 'WormBase:' + build_num

        # built once instead of once per row
        # note biological_regions include balancers
        # other options here: promoter, regulatory_region, reagent
        wanted_feature_types = frozenset((
            'gene', 'point_mutation', 'deletion', 'RNAi_reagent',
            'duplication', 'enhancer', 'binding_site',
            'biological_region', 'complex_substitution',
            'substitution', 'insertion', 'inverted_repeat'))

        with gzip.open(raw, 'rb') as csvfile:
            filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                    delimiter='\t',
                                    quotechar='\"')
            for row in filereader:
                # skip blank rows and comment rows
                if not row or ''.join(row).startswith('#'):
                    continue
                (chrom, db, feature_type_label, start, end, score, strand,
                 phase, attributes) = row

                # example rows (tab-delimited):
                # I  interpolated_pmap_position  gene  1  559768  .  .  .  ID=gmap:spe-13;gmap=spe-13;status=uncloned;Note=-21.3602 cM (+/- 1.84 cM)
                # I  WormBase  gene  3747  3909  .  -  .  ID=Gene:WBGene00023193;Name=WBGene00023193;interpolated_map_position=-21.9064;sequence_name=Y74C9A.6;biotype=snoRNA;Alias=Y74C9A.6
                # I  absolute_pmap_position  gene  4119  10230  .  .  .  ID=gmap:homt-1;gmap=homt-1;status=cloned;Note=-21.8252 cM (+/- 0.00 cM)

                # dbs = re.split(
                #   r' ', 'assembly_component expressed_sequence_match
                #   Coding_transcript Genomic_canonical Non_coding_transcript
                #   Orfeome Promoterome Pseudogene RNAi_primary RNAi_secondary
                #   Reference Transposon Transposon_CDS cDNA_for_RNAi miRanda
                #   ncRNA operon polyA_signal_sequence polyA_site snlRNA')
                #
                # if db not in dbs:
                #     continue

                if feature_type_label not in wanted_feature_types:
                    continue
                line_counter += 1

                # split each 'key=value' item on the first '=' only, so a
                # value containing '=' (or an empty item left by a trailing
                # ';') no longer raises while building the dict
                attribute_dict = {}
                for item in attributes.replace('"', '').split(';'):
                    key, sep, value = item.partition('=')
                    if sep:
                        attribute_dict[key] = value

                fid = flabel = desc = None
                if 'ID' in attribute_dict:
                    fid = attribute_dict.get('ID')
                    if re.search(r'WB(Gene|Var|sf)', fid):
                        fid = re.sub(r'^\w+:WB', 'WormBase:WB', fid)
                    elif re.match(r'(gmap|landmark)', fid):
                        continue
                    else:
                        LOG.info('other identifier %s', fid)
                        fid = None
                elif 'variation' in attribute_dict:
                    fid = 'WormBase:' + attribute_dict.get('variation')
                    flabel = attribute_dict.get('public_name')
                    sub = attribute_dict.get('substitution')
                    ins = attribute_dict.get('insertion')
                    # if it's a variation:
                    # variation=WBVar00604246;public_name=gk320600;strain=VC20384;substitution=C/T
                    # NOTE(review): desc stays '' when neither attribute is
                    # present, and an insertion overrides a substitution --
                    # confirm both are intended
                    desc = ''
                    if sub is not None:
                        desc = 'substitution=' + sub
                    if ins is not None:
                        desc = 'insertion=' + ins

                    # keep track of the strains with this variation,
                    # for later processing
                    strain_list = attribute_dict.get('strain')
                    if strain_list is not None:
                        for strain in strain_list.split(','):
                            strain_to_variant_map.setdefault(
                                strain.strip(), set()).add(fid)

                # if feature_type_label == 'RNAi_reagent':
                # Target=WBRNAi00096030 1 4942
                # this will tell us where the RNAi is actually binding
                # target = attribute_dict.get('Target') # TODO unused
                # rnai_num = re.split(r' ', target)[0]  # TODO unused
                # it will be the reagent-targeted-gene that has a position,
                # (i think)
                # TODO finish the RNAi binding location

                name = attribute_dict.get('Name')
                polymorphism = attribute_dict.get('polymorphism')

                if fid is None:
                    if name is not None and re.match(r'WBsf', name):
                        fid = 'WormBase:' + name
                        name = None
                    else:
                        continue

                if self.test_mode \
                        and re.sub(r'WormBase:', '', fid) \
                        not in self.test_ids['gene']+self.test_ids['allele']:
                    continue

                # these really aren't that interesting
                if polymorphism is not None:
                    continue

                # plain substring test: the name is a label, not a regular
                # expression, so it must not be handed to re.search as one
                if name is not None and name not in fid:
                    if flabel is None:
                        flabel = name
                    else:
                        model.addSynonym(fid, name)

                if desc is not None:
                    model.addDescription(fid, desc)

                alias = attribute_dict.get('Alias')

                biotype = attribute_dict.get('biotype')
                note = attribute_dict.get('Note')
                other_name = attribute_dict.get('other_name')
                # add whichever of the two is present; previously
                # other_name was added on every pass, so an alias was never
                # recorded and None could be added as a synonym
                for synonym in (alias, other_name):
                    if synonym is not None:
                        model.addSynonym(fid, synonym)

                if feature_type_label == 'gene':
                    ftype_id = self.resolve(biotype)
                else:
                    # so far, they all come with SO label syntax. resolve if need be.
                    ftype_id = self.globaltt[feature_type_label]
                chr_id = makeChromID(chrom, build_id, 'CHR')
                geno.addChromosomeInstance(chrom, build_id, build_num)

                feature = Feature(graph, fid, flabel, ftype_id)
                feature.addFeatureStartLocation(start, chr_id, strand)
                # the end location uses the end coordinate (it was being
                # written with the start coordinate)
                feature.addFeatureEndLocation(end, chr_id, strand)

                # only genes are added as classes
                feature_is_class = feature_type_label == 'gene'

                feature.addFeatureToGraph(True, None, feature_is_class)

                if note is not None:
                    model.addDescription(fid, note)

                if not self.test_mode and limit is not None and line_counter > limit:
                    break

        # RNAi reagents (example rows, tab-delimited):
        # I  RNAi_primary  RNAi_reagent  4184  10232  .  +  .  Target=WBRNAi00001601 1 6049 +;laboratory=YK;history_name=SA:yk326e10
        # I  RNAi_primary  RNAi_reagent  4223  10147  .  +  .  Target=WBRNAi00033465 1 5925 +;laboratory=SV;history_name=MV_SV:mv_G_YK5052
        # I  RNAi_primary  RNAi_reagent  5693  9391  .  +  .  Target=WBRNAi00066135 1 3699 +;laboratory=CH

        # TODO TF binding sites and network:
        # I  TF_binding_site_region  TF_binding_site  1861  2048  .  +  .  Name=WBsf292777;tf_id=WBTranscriptionFactor000025;tf_name=DAF-16
        # I  TF_binding_site_region  TF_binding_site  3403  4072  .  +  .  Name=WBsf331847;tf_id=WBTranscriptionFactor000703;tf_name=DPL-1

        return
Esempio n. 32
0
    def _get_equivids(self, limit):
        """
        The file processed here is of the format:
        #NBK_id GR_shortname    OMIM
        NBK1103 trimethylaminuria       136132
        NBK1103 trimethylaminuria       602079
        NBK1104 cdls    122470
        Where each of the rows represents a mapping between
        a gr id and an omim id. These are a 1:many relationship,
        and some of the omim ids are genes(not diseases).
        Therefore, we need to create a loose coupling here.
        We make the assumption that these NBKs are generally higher-level
        grouping classes; therefore the OMIM ids are treated as subclasses.

        After the file is read, the collected OMIM numbers are swapped for
        their replacements (``self.omim_replaced``), stripped of obsolete
        entries and filtered down to phenotype-like types
        (``self.omim_type``) before any subclass axioms are written.

        :param limit: outside of test mode, stop reading once the reader's
            line number exceeds this value; None means no limit
        :return: None

        """
        raw = '/'.join((self.rawdir, self.files['idmap']['file']))
        model = Model(self.graph)
        LOG.info('Looping over %s', raw)
        # we look some stuff up in OMIM, so initialize here
        # omim = OMIM(self.graph_type, self.are_bnodes_skized)
        id_map = {}
        allomimids = set()
        col = ['NBK_id', 'GR_shortname', 'OMIM']
        # resolve the column positions once rather than on every row
        nbk_idx = col.index('NBK_id')
        shortname_idx = col.index('GR_shortname')
        omim_idx = col.index('OMIM')

        with open(raw, 'r', encoding="utf8") as csvfile:
            reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            row = next(reader)
            row[0] = row[0][1:]  # drop the leading comment character
            # the header is checked, but a mismatch does not stop processing
            self.check_fileheader(col, row)

            for row in reader:

                nbk_num = row[nbk_idx]
                shortname = row[shortname_idx]
                omim_num = row[omim_idx]
                gr_id = 'GeneReviews:' + nbk_num
                omim_id = 'OMIM:' + omim_num
                # in test mode keep only the rows for the test identifiers
                if self.test_mode and omim_id not in self.test_ids:
                    continue

                # sometimes there's bad omim nums
                omim_num = omim_num.strip()
                if len(omim_num) != 6:
                    LOG.warning(
                        "OMIM number incorrectly formatted in row %i; skipping:\n%s",
                        reader.line_num, '\t'.join(row))
                    continue

                # build up a hashmap of the mappings; then process later
                id_map.setdefault(nbk_num, set()).add(omim_num)

                # add the class along with the shortname
                model.addClassToGraph(gr_id, None)
                model.addSynonym(gr_id, shortname)

                allomimids.add(omim_num)

                if not self.test_mode and limit is not None and reader.line_num > limit:
                    break

            # end looping through file

        # given all_omim_ids from GR,
        # we want to update any which are changed or removed
        # before deciding which are disease / phenotypes
        replaced = allomimids & self.omim_replaced.keys()
        if replaced:
            LOG.warning("These OMIM ID's are past their pull date: %s",
                        str(replaced))
            for oid in replaced:
                allomimids.remove(oid)
                replacements = self.omim_replaced[oid]
                if isinstance(replacements, str):
                    replacements = [replacements]
                for rep in replacements:
                    # set.update() on a string adds its individual
                    # characters, so a string id has to be add()-ed whole
                    if isinstance(rep, str):
                        allomimids.add(rep)
                    else:
                        allomimids.update(rep)
        # guard against omim identifiers which have been removed
        obsolete_type = self.globaltt['obsolete']
        obsolete = {
            o for o, otype in self.omim_type.items() if otype == obsolete_type
        }
        removed = allomimids & obsolete
        if removed:
            LOG.warning("These OMIM ID's are gone: %s", str(removed))
            allomimids -= removed
        # filter for disease /phenotype types (we can argue about what is included)
        phenotype_types = (
            self.globaltt['phenotype'],
            self.globaltt['has_affected_feature'],  # both a gene and a phenotype
            self.globaltt['heritable_phenotypic_marker'],  # probable phenotype
        )
        omim_phenotypes = {
            omim for omim, otype in self.omim_type.items()
            if otype in phenotype_types
        }
        LOG.info("Have %i omim_ids globally typed as phenotypes from OMIM",
                 len(omim_phenotypes))

        entries_that_are_phenotypes = allomimids & omim_phenotypes
        LOG.info("Filtered out %d/%d entries that are genes or features",
                 len(allomimids - entries_that_are_phenotypes),
                 len(allomimids))

        for nbk_num in self.book_ids:
            gr_id = 'GeneReviews:' + nbk_num
            for omim_num in id_map.get(nbk_num, ()):
                # add the gene reviews as a superclass to the omim id,
                # but only if the omim id is not a gene.
                # entries_that_are_phenotypes is built from the bare OMIM
                # numbers read above, so membership is tested with the bare
                # number rather than the 'OMIM:'-prefixed curie
                if omim_num in entries_that_are_phenotypes:
                    omim_id = 'OMIM:' + omim_num
                    model.addClassToGraph(omim_id, None)
                    model.addSubClass(omim_id, gr_id)
            # add this as a generic subclass  -- TEC: this is the job of inference
            model.addSubClass(gr_id, self.globaltt['disease'])
Esempio n. 33
0
    def _process_diseasegene(self, limit):
        """
        Stream the disease-gene XML file and, for every ``Disorder``
        element, add the disorder class and each associated gene to the
        graph, then hand the pair to ``self.add_gene_to_disease``.

        A gene is identified by the first available of its HGNC, Ensembl or
        SwissProt references (prefixes translated through ``self.localtt``
        when listed there), falling back to its ORPHA number; all remaining
        references are added as equivalent classes.

        :param limit: outside of test mode, stop once more than this many
            disorders have been processed; None means no limit
        :return: None
        """
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        line_counter = 0

        model = Model(graph)

        myfile = '/'.join((self.rawdir, self.files['disease-gene']['file']))

        for _event, elem in ET.iterparse(myfile):
            if elem.tag != 'Disorder':
                continue

            # get the element name and id, ignore element name
            # id = elem.get('id') # some internal identifier
            disorder_num = elem.find('OrphaNumber').text
            disorder_id = 'ORPHA:' + str(disorder_num)

            if self.test_mode and disorder_id not in self.all_test_ids[
                    'disease']:
                continue
            # count processed disorders so that limit can take effect
            # (the counter was previously never advanced)
            line_counter += 1
            disorder_label = elem.find('Name').text

            # assuming that these are in the ontology (...any particular one?)
            model.addClassToGraph(disorder_id, disorder_label)
            assoc_list = elem.find('DisorderGeneAssociationList')
            expected_genes = assoc_list.get('count')
            LOG.info('Expecting %s genes assdciated with disorder %s.',
                     expected_genes, disorder_id)
            processed_genes = 0
            for assoc in assoc_list.findall('DisorderGeneAssociation'):
                processed_genes += 1
                gene = assoc.find('Gene')

                # get gene's curie  HGNC or Ensembl ...

                lclid = gene.find('OrphaNumber').text
                gene_curie = 'ORPHA:' + lclid
                gene_set = {'ORPHA': lclid}
                for gene_ref in gene.findall(
                        './ExternalReferenceList/ExternalReference'):
                    gene_set[gene_ref.find('Source').text] = \
                        gene_ref.find('Reference').text

                # set priority (clique leader if available) but default to ORPHA
                for pfx in ('HGNC', 'Ensembl', 'SwissProt'):
                    #       'OMIM', 'Genatlas','Reactome', 'IUPHAR'):
                    if pfx in gene_set:
                        # take the id out under the source's own prefix
                        # BEFORE translating the prefix; looking it up
                        # under the translated prefix raised a KeyError
                        # whenever the translation differed
                        leader_id = gene_set.pop(pfx)
                        if pfx in self.localtt:
                            pfx = self.localtt[pfx]
                        gene_curie = pfx + ':' + leader_id
                        model.addClassToGraph(gene_curie, None)
                        break

                # TEC have reservations w.r.t aggerator links being gene classes
                for prefix, lclid in gene_set.items():
                    if prefix in self.localtt:
                        prefix = self.localtt[prefix]

                    dbxref = prefix + ':' + lclid

                    if gene_curie != dbxref:
                        model.addClassToGraph(dbxref, None)
                        model.addEquivalentClass(gene_curie, dbxref)

                # TEC. would prefer this not happen here. let HGNC handle it
                # except there are some w/o explicit external links ...

                # gene_name = gene.find('Name').text
                gene_symbol = gene.find('Symbol').text
                # gene_iid = assoc.find('DisorderGeneAssociationType').get('id')
                # gene_type_id = self.resolve(gene_iid)
                # don't  know the 'type' of the gene for this class anymore
                # model.addClassToGraph(
                #    gene_curie, gene_symbol, gene_type_id, gene_name)

                # tolerate a gene without a SynonymList element
                syn_list = gene.find('./SynonymList')
                if syn_list is not None and int(syn_list.get('count', 0)) > 0:
                    for syn in syn_list.findall('./Synonym'):
                        model.addSynonym(gene_curie, syn.text)

                dg_label = assoc.find(
                    './DisorderGeneAssociationType/Name').text
                # rel_id = self.resolve(dg_label)

                # alt_locus_id = '_:' + gene_num + '-' + disorder_num + 'VL'
                # alt_label = ' '.join((
                #    'some variant of', gene_symbol.strip(), disorder_label))
                # model.addIndividualToGraph(
                #    alt_locus_id, alt_label, self.globaltt['variant_locus'])
                # geno.addAffectedLocus(alt_locus_id, gene_id)
                # model.addBlankNodeAnnotation(alt_locus_id)
                # consider typing the gain/loss-of-function variants like:
                # http://sequenceontology.org/browser/current_svn/term/SO:0002054
                # http://sequenceontology.org/browser/current_svn/term/SO:0002053

                # use dg association status to issue an evidence code
                # FIXME I think that these codes are sub-optimal
                eco_id = self.resolve(
                    assoc.find('DisorderGeneAssociationStatus/Name').text)

                # assoc = G2PAssoc(
                #    graph, self.name, alt_locus_id, disorder_id, rel_id)
                # assoc.add_evidence(eco_id)
                # assoc.add_association_to_graph()

                self.add_gene_to_disease(dg_label, gene_curie, gene_symbol,
                                         disorder_id, eco_id)

            elem.clear()  # empty the element
            if int(expected_genes) != processed_genes:
                # '%s' for the disorder id: the former '% expected' parsed
                # as a '% e' float conversion and failed at format time
                LOG.warning(
                    '%s expected %s associated genes but we processed %i',
                    disorder_id, expected_genes, processed_genes)

            # same guard as the other ingest methods: limit applies
            # outside of test mode
            if not self.test_mode and limit is not None \
                    and line_counter > limit:
                return

        return
Esempio n. 34
0
    def _get_chrbands(self, limit, taxon):
        """
        :param limit:
        :return:

        """
        model = Model(self.graph)
        # TODO PYLINT figure out what limit was for and why it is unused
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
        logger.info("Processing Chr bands from FILE: %s", myfile)
        geno = Genotype(self.graph)
        monochrom = Monochrom(self.graph_type, self.are_bnodes_skized)

        # used to hold band definitions for a chr
        # in order to compute extent of encompasing bands

        mybands = {}
        # build the organism's genome from the taxon
        genome_label = self.files[taxon]['genome_label']
        taxon_id = 'NCBITaxon:'+taxon

        # add the taxon as a class.  adding the class label elsewhere
        model.addClassToGraph(taxon_id, None)
        model.addSynonym(taxon_id, genome_label)

        geno.addGenome(taxon_id, genome_label)

        # add the build and the taxon it's in
        build_num = self.files[taxon]['build_num']
        build_id = 'UCSC:'+build_num
        geno.addReferenceGenome(build_id, build_num, taxon_id)

        # process the bands
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match('^#', line):
                    continue

                # chr13	4500000	10000000	p12	stalk
                (scaffold, start, stop, band_num, rtype) = line.split('\t')
                line_counter += 1

                # NOTE some less-finished genomes have
                # placed and unplaced scaffolds
                # * Placed scaffolds:
                #       the scaffolds have been placed within a chromosome.
                # * Unlocalized scaffolds:
                #   although the chromosome within which the scaffold occurs
                #   is known, the scaffold's position or orientation
                #   is not known.
                # * Unplaced scaffolds:
                #   it is not known which chromosome the scaffold belongs to
                #
                # find out if the thing is a full on chromosome, or a scaffold:
                # ex: unlocalized scaffold: chr10_KL568008v1_random
                # ex: unplaced scaffold: chrUn_AABR07022428v1
                placed_scaffold_pattern = r'(chr(?:\d+|X|Y|Z|W|M))'
                unlocalized_scaffold_pattern = \
                    placed_scaffold_pattern+r'_(\w+)_random'
                unplaced_scaffold_pattern = r'chr(Un(?:_\w+)?)'

                m = re.match(placed_scaffold_pattern+r'$', scaffold)
                if m is not None and len(m.groups()) == 1:
                    # the chromosome is the first match of the pattern
                    chrom_num = m.group(1)
                else:
                    # skip over anything that isn't a placed_scaffold
                    # at the class level
                    logger.info("Found non-placed chromosome %s", scaffold)
                    chrom_num = None

                m_chr_unloc = re.match(unlocalized_scaffold_pattern, scaffold)
                m_chr_unplaced = re.match(unplaced_scaffold_pattern, scaffold)

                scaffold_num = None
                if m:
                    pass
                elif m_chr_unloc is not None and\
                        len(m_chr_unloc.groups()) == 2:
                    chrom_num = m_chr_unloc.group(1)
                    scaffold_num = chrom_num+'_'+m_chr_unloc.group(2)
                elif m_chr_unplaced is not None and\
                        len(m_chr_unplaced.groups()) == 1:
                    scaffold_num = m_chr_unplaced.group(1)
                else:
                    logger.error(
                        "There's a chr pattern that we aren't matching: %s",
                        scaffold)

                if chrom_num is not None:
                    # the chrom class (generic) id
                    chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')

                    # first, add the chromosome class (in the taxon)
                    geno.addChromosomeClass(
                        chrom_num, taxon_id, self.files[taxon]['genome_label'])

                    # then, add the chromosome instance (from the given build)
                    geno.addChromosomeInstance(chrom_num, build_id, build_num,
                                               chrom_class_id)

                    # add the chr to the hashmap of coordinates for this build
                    # the chromosome coordinate space is itself
                    if chrom_num not in mybands.keys():
                        mybands[chrom_num] = {
                            'min': 0,
                            'max': int(stop),
                            'chr': chrom_num,
                            'ref': build_id,
                            'parent': None,
                            'stain': None,
                            'type': Feature.types['chromosome']}

                if scaffold_num is not None:
                    # this will put the coordinates of the scaffold
                    # in the scaffold-space and make sure that the scaffold
                    # is part of the correct parent.
                    # if chrom_num is None,
                    # then it will attach it to the genome,
                    # just like a reg chrom
                    mybands[scaffold_num] = {
                        'min': start,
                        'max': stop,
                        'chr': scaffold_num,
                        'ref': build_id,
                        'parent': chrom_num,
                        'stain': None,
                        'type': Feature.types['assembly_component'],
                        'synonym': scaffold}

                if band_num is not None and band_num.strip() != '':
                    # add the specific band
                    mybands[chrom_num+band_num] = {'min': start,
                                                   'max': stop,
                                                   'chr': chrom_num,
                                                   'ref': build_id,
                                                   'parent': None,
                                                   'stain': None,
                                                   'type': None}

                    # add the staining intensity of the band
                    if re.match(r'g(neg|pos|var)', rtype):
                        mybands[chrom_num+band_num]['stain'] = \
                            Feature.types.get(rtype)

                    # get the parent bands, and make them unique
                    parents = list(
                        monochrom.make_parent_bands(band_num, set()))
                    # alphabetical sort will put them in smallest to biggest,
                    # so we reverse
                    parents.sort(reverse=True)
                    # print('parents of',chrom,band,':',parents)

                    if len(parents) > 0:
                        mybands[chrom_num+band_num]['parent'] = \
                            chrom_num+parents[0]
                else:
                    # TODO PYLINT why is 'parent'
                    # a list() a couple of lines up and a set() here?
                    parents = set()

                # loop through the parents and add them to the hash
                # add the parents to the graph, in hierarchical order
                # TODO PYLINT Consider using enumerate
                # instead of iterating with range and len
                for i in range(len(parents)):
                    rti = getChrPartTypeByNotation(parents[i])

                    pnum = chrom_num+parents[i]
                    sta = int(start)
                    sto = int(stop)
                    if pnum not in mybands.keys():
                        # add the parental band to the hash
                        b = {'min': min(sta, sto),
                             'max': max(sta, sto),
                             'chr': chrom_num,
                             'ref': build_id,
                             'parent': None,
                             'stain': None,
                             'type': rti}
                        mybands[pnum] = b
                    else:
                        # band already in the hash means it's a grouping band
                        # need to update the min/max coords
                        b = mybands.get(pnum)
                        b['min'] = min(sta, sto, b['min'])
                        b['max'] = max(sta, sto, b['max'])
                        mybands[pnum] = b

                        # also, set the max for the chrom
                        c = mybands.get(chrom_num)
                        c['max'] = max(sta, sto, c['max'])
                        mybands[chrom_num] = c

                    # add the parent relationships to each
                    if i < len(parents) - 1:
                        mybands[pnum]['parent'] = chrom_num+parents[i+1]
                    else:
                        # add the last one (p or q usually)
                        # as attached to the chromosome
                        mybands[pnum]['parent'] = chrom_num

        f.close()  # end looping through file

        # loop through the hash and add the bands to the graph
        for b in mybands.keys():
            myband = mybands.get(b)
            band_class_id = makeChromID(b, taxon, 'CHR')
            band_class_label = makeChromLabel(b, genome_label)
            band_build_id = makeChromID(b, build_num, 'MONARCH')
            band_build_label = makeChromLabel(b, build_num)
            # the build-specific chrom
            chrom_in_build_id = makeChromID(
                myband['chr'], build_num, 'MONARCH')
            # if it's != part, then add the class
            if myband['type'] != Feature.types['assembly_component']:
                model.addClassToGraph(band_class_id,
                                      band_class_label, myband['type'])
                bfeature = Feature(self.graph, band_build_id, band_build_label,
                                   band_class_id)
            else:
                bfeature = Feature(self.graph, band_build_id, band_build_label,
                                   myband['type'])
                if 'synonym' in myband:
                    model.addSynonym(band_build_id, myband['synonym'])

            if myband['parent'] is None:
                if myband['type'] == Feature.types['assembly_component']:
                    # since we likely don't know the chr,
                    # add it as a part of the build
                    geno.addParts(band_build_id, build_id)
            elif myband['type'] == Feature.types['assembly_component']:
                # geno.addParts(band_build_id, chrom_in_build_id)
                parent_chrom_in_build = makeChromID(myband['parent'],
                                                    build_num, 'MONARCH')
                bfeature.addSubsequenceOfFeature(parent_chrom_in_build)

            # add the band as a feature
            # (which also instantiates the owl:Individual)
            bfeature.addFeatureStartLocation(myband['min'], chrom_in_build_id)
            bfeature.addFeatureEndLocation(myband['max'], chrom_in_build_id)
            if 'stain' in myband and myband['stain'] is not None:
                # TODO 'has_staining_intensity' being dropped by MB
                bfeature.addFeatureProperty(
                    Feature.properties['has_staining_intensity'],
                    myband['stain'])

            # type the band as a faldo:Region directly (add_region=False)
            # bfeature.setNoBNodes(self.nobnodes)
            # to come when we merge in ZFIN.py
            bfeature.addFeatureToGraph(False)

        return
Esempio n. 35
0
    def _get_chrbands(self, limit, taxon):
        """
        Read the gzipped, tab-separated band file registered for ``taxon``
        in ``self.files`` and add its chromosomes, scaffolds and bands
        (together with the parent bands computed for them) to ``self.graph``.

        Every data row is expected to carry five columns, in this order:
        scaffold, start, stop, band_num, rtype.
        Blank lines and lines starting with '#' are ignored.

        :param limit: maximum number of lines to read (skipped lines are
            counted as well); None means read the whole file
        :param taxon: key into ``self.files``; it is also appended to
            'NCBITaxon:' to build the taxon id
        :return: None

        """

        if limit is None:
            limit = sys.maxsize  # practical limit anyway
        model = Model(self.graph)
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
        LOG.info("Processing Chr bands from FILE: %s", myfile)
        geno = Genotype(self.graph)
        monochrom = Monochrom(self.graph_type, self.are_bnodes_skized)

        # used to hold band definitions for a chr
        # in order to compute extent of encompassing bands
        mybands = {}

        # build the organism's genome from the taxon
        genome_label = self.files[taxon]['genome_label']
        taxon_id = 'NCBITaxon:' + taxon

        # add the taxon as a class.  adding the class label elsewhere
        model.addClassToGraph(taxon_id, None)
        model.addSynonym(taxon_id, genome_label)

        geno.addGenome(taxon_id, genome_label)

        # add the build and the taxon it's in
        build_num = self.files[taxon]['build_num']
        build_id = 'UCSC:' + build_num
        geno.addReferenceGenome(build_id, build_num, taxon_id)

        # NOTE some less-finished genomes have
        # placed and unplaced scaffolds
        # * Placed scaffolds:
        #       the scaffolds have been placed within a chromosome.
        # * Unlocalized scaffolds:
        #   although the chromosome within which the scaffold occurs
        #   is known, the scaffold's position or orientation
        #   is not known.
        # * Unplaced scaffolds:
        #   it is not known which chromosome the scaffold belongs to
        #
        # ex: unlocalized scaffold: chr10_KL568008v1_random
        # ex: unplaced scaffold: chrUn_AABR07022428v1
        #
        # The patterns do not depend on the row, so compile them once
        # instead of rebuilding them for every line of the file.
        placed_scaffold_pattern = r'(chr(?:\d+|X|Y|Z|W|M))'
        placed_re = re.compile(placed_scaffold_pattern + r'$')
        unlocalized_re = re.compile(
            placed_scaffold_pattern + r'_(\w+)_random')
        unplaced_re = re.compile(r'chr(Un(?:_\w+)?)')
        stain_re = re.compile(r'g(neg|pos|var)')

        # process the bands
        col = ['scaffold', 'start', 'stop', 'band_num', 'rtype']
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                line_counter += 1
                # the counter only grows, so nothing after this point
                # could be accepted any more: stop reading
                if line_counter > limit:
                    break
                line = line.decode().strip()
                # skip blank lines (line[0] would raise) and comments
                if not line or line.startswith('#'):
                    continue
                # chr13	4500000	10000000	p12	stalk
                row = line.split('\t')
                scaffold = row[col.index('scaffold')]
                start = row[col.index('start')]
                stop = row[col.index('stop')]
                band_num = row[col.index('band_num')].strip()
                rtype = row[col.index('rtype')]

                # find out if the thing is a full on chromosome, or a scaffold
                chrom_num = None
                scaffold_num = None
                mch = placed_re.match(scaffold)
                if mch is not None:
                    # the chromosome is the first match of the pattern
                    chrom_num = mch.group(1)
                else:
                    # anything that isn't a placed scaffold
                    # gets no chromosome at the class level
                    LOG.info("Found non-placed chromosome %s", scaffold)
                    m_chr_unloc = unlocalized_re.match(scaffold)
                    m_chr_unplaced = unplaced_re.match(scaffold)
                    if m_chr_unloc is not None:
                        chrom_num = m_chr_unloc.group(1)
                        scaffold_num = chrom_num + '_' + m_chr_unloc.group(2)
                    elif m_chr_unplaced is not None:
                        scaffold_num = m_chr_unplaced.group(1)
                    else:
                        LOG.error(
                            "There's a chr pattern that we aren't matching: %s",
                            scaffold)

                if chrom_num is not None:
                    # the chrom class (generic) id
                    chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')

                    # first, add the chromosome class (in the taxon)
                    geno.addChromosomeClass(chrom_num, taxon_id, genome_label)

                    # then, add the chromosome instance (from the given build)
                    geno.addChromosomeInstance(chrom_num, build_id, build_num,
                                               chrom_class_id)

                    # add the chr to the hashmap of coordinates for this build
                    # the chromosome coordinate space is itself
                    if chrom_num not in mybands:
                        mybands[chrom_num] = {
                            'min': 0,
                            'max': int(stop),
                            'chr': chrom_num,
                            'ref': build_id,
                            'parent': None,
                            'stain': None,
                            'type': self.globaltt['chromosome']
                        }

                if scaffold_num is not None:
                    # this will put the coordinates of the scaffold
                    # in the scaffold-space and make sure that the scaffold
                    # is part of the correct parent.
                    # if chrom_num is None,
                    # then it will attach it to the genome,
                    # just like a reg chrom
                    mybands[scaffold_num] = {
                        'min': start,
                        'max': stop,
                        'chr': scaffold_num,
                        'ref': build_id,
                        'parent': chrom_num,
                        'stain': None,
                        'type': self.globaltt['assembly_component'],
                        'synonym': scaffold
                    }

                if not band_num:
                    continue

                if chrom_num is None:
                    # band ids are built as chrom_num + band_num, which
                    # cannot be done without a chromosome
                    LOG.warning(
                        "Band %s on %s has no resolvable chromosome; "
                        "skipping band", band_num, scaffold)
                    continue

                # add the specific band
                band_key = chrom_num + band_num
                mybands[band_key] = {
                    'min': start,
                    'max': stop,
                    'chr': chrom_num,
                    'ref': build_id,
                    'parent': None,
                    'stain': None,
                    'type': None
                }

                # add the staining intensity of the band
                if stain_re.match(rtype):
                    mybands[band_key]['stain'] = self.resolve(rtype)

                # get the parent bands, and make them unique
                parents = list(monochrom.make_parent_bands(band_num, set()))
                # alphabetical sort will put them in smallest to biggest,
                # so we reverse
                parents.sort(reverse=True)
                if not parents:
                    continue

                mybands[band_key]['parent'] = chrom_num + parents[0]

                sta = int(start)
                sto = int(stop)
                low = min(sta, sto)
                high = max(sta, sto)

                # loop through the parents and add them to the hash
                # add the parents to the graph, in hierarchical order
                for i, parent in enumerate(parents):
                    rti = getChrPartTypeByNotation(parent)
                    pnum = chrom_num + parent

                    if pnum not in mybands:
                        # add the parental band to the hash
                        mybands[pnum] = {
                            'min': low,
                            'max': high,
                            'chr': chrom_num,
                            'ref': build_id,
                            'parent': None,
                            'stain': None,
                            'type': rti
                        }
                    else:
                        # band already in the hash means it's a grouping band
                        # need to update the min/max coords
                        grouping = mybands[pnum]
                        grouping['min'] = min(low, grouping['min'])
                        grouping['max'] = max(high, grouping['max'])

                        # also, set the max for the chrom
                        chrom = mybands[chrom_num]
                        chrom['max'] = max(high, chrom['max'])

                    # add the parent relationships to each
                    if i < len(parents) - 1:
                        mybands[pnum]['parent'] = chrom_num + parents[i + 1]
                    else:
                        # add the last one (p or q usually)
                        # as attached to the chromosome
                        mybands[pnum]['parent'] = chrom_num

        # the with-block has already closed the file at this point

        # loop through the hash and add the bands to the graph
        for bnd, myband in mybands.items():
            band_class_id = makeChromID(bnd, taxon, 'CHR')
            band_class_label = makeChromLabel(bnd, genome_label)
            band_build_id = makeChromID(bnd, build_num, 'MONARCH')
            band_build_label = makeChromLabel(bnd, build_num)
            # the build-specific chrom
            chrom_in_build_id = makeChromID(myband['chr'], build_num,
                                            'MONARCH')
            is_component = \
                myband['type'] == self.globaltt['assembly_component']

            if is_component:
                bfeature = Feature(self.graph, band_build_id, band_build_label,
                                   myband['type'])
                if 'synonym' in myband:
                    model.addSynonym(band_build_id, myband['synonym'])

                if myband['parent'] is None:
                    # since we likely don't know the chr,
                    # add it as a part of the build
                    geno.addParts(band_build_id, build_id)
                else:
                    parent_chrom_in_build = makeChromID(myband['parent'],
                                                        build_num, 'MONARCH')
                    bfeature.addSubsequenceOfFeature(parent_chrom_in_build)
            else:
                # if it's != part, then add the class
                model.addClassToGraph(band_class_id, band_class_label,
                                      myband['type'])
                bfeature = Feature(self.graph, band_build_id, band_build_label,
                                   band_class_id)

            # add the band as a feature
            # (which also instantiates the owl:Individual)
            bfeature.addFeatureStartLocation(myband['min'], chrom_in_build_id)
            bfeature.addFeatureEndLocation(myband['max'], chrom_in_build_id)
            if 'stain' in myband and myband['stain'] is not None:
                bfeature.addFeatureProperty(
                    self.globaltt['has_sequence_attribute'], myband['stain'])

            # type the band as a faldo:Region directly (add_region=False)
            # bfeature.setNoBNodes(self.nobnodes)
            # to come when we merge in ZFIN.py
            bfeature.addFeatureToGraph(False)
Esempio n. 36
0
    def _process_diseasegene(self, limit):
        """
        Stream the 'disease-gene' XML file and, for every ``Disorder``
        element, add to the graph:
          * the disorder as a class ('ORPHA:' + its OrphaNumber),
          * each associated gene, its external references as equivalent
            classes, and its synonyms,
          * one gene-to-disorder association per
            ``DisorderGeneAssociation``, with the relationship and the
            evidence code obtained through ``self.resolve``.

        In test mode only disorders listed in
        ``self.all_test_ids['disease']`` are processed and everything is
        written to ``self.testgraph``.

        :param limit: currently has no practical effect; see the
            NOTE(review) at the bottom of the loop
        :return: None
        """
        graph = self.testgraph if self.test_mode else self.graph
        line_counter = 0

        model = Model(graph)

        myfile = '/'.join((self.rawdir, self.files['disease-gene']['file']))

        for _, elem in ET.iterparse(myfile):
            if elem.tag == 'Disorder':
                # get the element name and id, ignore element name
                # id = elem.get('id') # some internal identifier
                disorder_num = elem.find('OrphaNumber').text
                disorder_id = 'ORPHA:' + str(disorder_num)

                if self.test_mode and \
                        disorder_id not in self.all_test_ids['disease']:
                    continue
                disorder_label = elem.find('Name').text

                # assuming that these are in the ontology (...any particular one?)
                model.addClassToGraph(disorder_id, disorder_label)
                assoc_list = elem.find('DisorderGeneAssociationList')
                expected_genes = assoc_list.get('count')
                LOG.info(
                    'Expecting %s genes associated with disorder %s.',
                    expected_genes, disorder_id)
                processed_genes = 0
                for assoc in assoc_list.findall('DisorderGeneAssociation'):
                    processed_genes += 1
                    gene = assoc.find('Gene')

                    # get gene's curie  HGNC or Ensembl ...

                    lclid = gene.find('OrphaNumber').text
                    gene_curie = 'ORPHA:' + lclid
                    gene_set = {'ORPHA': lclid}
                    for gene_ref in gene.findall(
                            './ExternalReferenceList/ExternalReference'):
                        gene_set[gene_ref.find('Source').text] = \
                            gene_ref.find('Reference').text

                    # set priority (clique leader if available)
                    # but default to ORPHA
                    for pfx in ('HGNC', 'Ensembl', 'SwissProt'):
                        if pfx in gene_set:
                            # gene_set is keyed by the source name found in
                            # the file, so look up / remove with that key;
                            # only the curie prefix goes through localtt
                            if pfx in self.localtt:
                                curie_prefix = self.localtt[pfx]
                            else:
                                curie_prefix = pfx
                            gene_curie = curie_prefix + ':' + gene_set.pop(pfx)
                            model.addClassToGraph(gene_curie, None)
                            break

                    # TEC have reservations w.r.t aggregator links
                    # being gene classes
                    for prefix, lclid in gene_set.items():
                        if prefix in self.localtt:
                            prefix = self.localtt[prefix]

                        dbxref = prefix + ':' + lclid

                        if gene_curie != dbxref:
                            model.addClassToGraph(dbxref, None)
                            model.addEquivalentClass(gene_curie, dbxref)

                    # TEC. would prefer this not happen here. let HGNC handle it
                    # except there are some w/o explicit external links ...

                    syn_list = gene.find('./SynonymList')
                    if int(syn_list.get('count')) > 0:
                        for syn in syn_list.findall('./Synonym'):
                            model.addSynonym(gene_curie, syn.text)

                    dg_label = assoc.find(
                        './DisorderGeneAssociationType/Name').text

                    # use dg association status to issue an evidence code
                    # FIXME I think that these codes are sub-optimal
                    eco_id = self.resolve(
                        assoc.find('DisorderGeneAssociationStatus/Name').text)

                    rel_id = self.resolve(dg_label)

                    # write the association to the same graph as the classes
                    # above (the test graph when in test mode)
                    g2p_assoc = G2PAssoc(
                        graph, self.name, gene_curie, disorder_id, rel_id)
                    g2p_assoc.add_evidence(eco_id)
                    g2p_assoc.add_association_to_graph()

                elem.clear()  # empty the element
                if int(expected_genes) != processed_genes:
                    LOG.warning(
                        '%s expected %s associated genes but we processed %i',
                        disorder_id, expected_genes, processed_genes)

            # NOTE(review): line_counter is never incremented, so this
            # condition can only hold for a negative limit; as written,
            # `limit` does not bound processing. Left unchanged because the
            # intended unit (elements vs. disorders) and the test-mode
            # polarity are not evident from this method -- confirm and fix.
            if self.test_mode and limit is not None and line_counter > limit:
                return
Esempio n. 37
0
    def _get_equivids(self, limit):
        """
        Load the 'idmap' file and attach OMIM phenotype ids to their
        GeneReviews (NBK) ids.

        The tab-separated input has a header row followed by rows like:
        #NBK_id GR_shortname    OMIM
        NBK1103 trimethylaminuria       136132
        NBK1103 trimethylaminuria       602079
        NBK1104 cdls    122470
        Each row maps one gr id to one omim id; a gr id may appear on many
        rows (1:many), and some omim ids are genes rather than diseases.
        The coupling is therefore kept loose: NBK ids are assumed to be
        higher-level grouping classes and the OMIM ids are added as their
        subclasses. (That assumption is poor for OMIM ids that are really
        genes, which cannot be recognised from this file; they are
        filtered afterwards through the OMIM lookup.)

        :param limit: outside test mode, stop reading once the row count
            (header included) exceeds this value; None means no limit
        :return: None

        """
        raw = '/'.join((self.rawdir, self.files['idmap']['file']))
        model = Model(self.graph)

        # we look some stuff up in OMIM, so initialize here
        omim = OMIM(self.graph_type, self.are_bnodes_skized)
        nbk_to_omim = {}
        seen_omim_nums = set()
        with open(raw, 'r', encoding="utf8") as csvfile:
            rows = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row_num, row in enumerate(rows, start=1):
                if row_num == 1:  # skip header
                    continue
                nbk_num, shortname, omim_num = row
                omim_id = 'OMIM:' + omim_num

                # in test mode keep only rows whose omim id is a test id
                if self.testMode and (
                        len(self.test_ids) == 0 or
                        omim_id not in self.test_ids):
                    continue

                # sometimes there's bad omim nums
                if len(omim_num) > 6:
                    logger.warning(
                        "OMIM number incorrectly formatted "
                        "in row %d; skipping:\n%s",
                        row_num, '\t'.join(row))
                    continue

                # build up a hashmap of the mappings; then process later
                nbk_to_omim.setdefault(nbk_num, set()).add(omim_num)

                # add the class along with the shortname
                gr_id = 'GeneReviews:' + nbk_num
                model.addClassToGraph(gr_id, None)
                model.addSynonym(gr_id, shortname)

                seen_omim_nums.add(omim_num)

                if limit is not None and not self.testMode \
                        and row_num > limit:
                    break

        # get the omim ids that are not genes
        entries_that_are_phenotypes = omim.process_entries(
            list(seen_omim_nums), filter_keep_phenotype_entry_ids,
            None, None, limit)

        logger.info("Filtered out %d/%d entries that are genes or features",
                    len(seen_omim_nums) - len(entries_that_are_phenotypes),
                    len(seen_omim_nums))

        for nbk_num in self.book_ids:
            gr_id = 'GeneReviews:' + nbk_num
            for omim_num in nbk_to_omim.get(nbk_num, ()):
                omim_id = 'OMIM:' + omim_num
                # add the gene reviews as a superclass to the omim id,
                # but only if the omim id is not a gene
                if omim_id in entries_that_are_phenotypes:
                    model.addClassToGraph(omim_id, None)
                    model.addSubClass(omim_id, gr_id)
            # add this as a generic subclass of DOID:4
            model.addSubClass(gr_id, 'DOID:4')
Esempio n. 38
0
    def _get_variants(self, limit):
        """
        Currently loops through the variant_summary file.

        :param limit:
        :return:

        """

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        model = Model(g)

        geno = Genotype(g)
        f = Feature(g, None, None, None)

        # add the taxon and the genome
        tax_num = '9606'  # HARDCODE
        tax_id = 'NCBITaxon:'+tax_num
        tax_label = 'Human'
        model.addClassToGraph(tax_id, None)
        geno.addGenome(tax_id, tax_label)  # label gets added elsewhere

        # not unzipping the file
        logger.info("Processing Variant records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['variant_summary']['file']))
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match(r'^#', line):
                    continue

                # AlleleID               integer value as stored in the AlleleID field in ClinVar  (//Measure/@ID in the XML)
                # Type                   character, the type of variation
                # Name                   character, the preferred name for the variation
                # GeneID                 integer, GeneID in NCBI's Gene database
                # GeneSymbol             character, comma-separated list of GeneIDs overlapping the variation
                # ClinicalSignificance   character, comma-separated list of values of clinical significance reported for this variation
                #                          for the mapping between the terms listed here and the integers in the .VCF files, see
                #                          http://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/
                # RS# (dbSNP)            integer, rs# in dbSNP
                # nsv (dbVar)            character, the NSV identifier for the region in dbVar
                # RCVaccession           character, list of RCV accessions that report this variant
                # TestedInGTR            character, Y/N for Yes/No if there is a test registered as specific to this variation in the NIH Genetic Testing Registry (GTR)
                # PhenotypeIDs           character, list of db names and identifiers for phenotype(s) reported for this variant
                # Origin                 character, list of all allelic origins for this variation
                # Assembly               character, name of the assembly on which locations are based
                # Chromosome             character, chromosomal location
                # Start                  integer, starting location, in pter->qter orientation
                # Stop                   integer, end location, in pter->qter orientation
                # Cytogenetic            character, ISCN band
                # ReviewStatus           character, highest review status for reporting this measure. For the key to the terms,
                #                            and their relationship to the star graphics ClinVar displays on its web pages,
                #                            see http://www.ncbi.nlm.nih.gov/clinvar/docs/variation_report/#interpretation
                # HGVS(c.)               character, RefSeq cDNA-based HGVS expression
                # HGVS(p.)               character, RefSeq protein-based HGVS expression
                # NumberSubmitters       integer, number of submissions with this variant
                # LastEvaluated          datetime, the latest time any submitter reported clinical significance
                # Guidelines             character, ACMG only right now, for the reporting of incidental variation in a Gene
                #                                (NOTE: if ACMG, not a specific to the allele but to the Gene)
                # OtherIDs               character, list of other identifiers or sources of information about this variant
                # VariantID              integer, the value used to build the URL for the current default report,
                #                            e.g. http://www.ncbi.nlm.nih.gov/clinvar/variation/1756/
                #

                # a crude check that there's an expected number of cols.
                # if not, error out because something changed.
                num_cols = len(line.split('\t'))
                expected_numcols = 29
                if num_cols != expected_numcols:
                    logger.error(
                        "Unexpected number of columns in raw file " +
                        "(%d actual vs %d expected)",
                        num_cols, expected_numcols)

                (allele_num, allele_type, allele_name, gene_num, gene_symbol,
                 clinical_significance, dbsnp_num, dbvar_num, rcv_nums,
                 tested_in_gtr, phenotype_ids, origin, assembly, chr, start,
                 stop, cytogenetic_loc, review_status, hgvs_c, hgvs_p,
                 number_of_submitters, last_eval, guidelines, other_ids,
                 variant_num, reference_allele, alternate_allele, categories,
                 ChromosomeAccession) = line.split('\t')

                # ###set filter=None in init if you don't want to have a filter
                # if self.filter is not None:
                #    if ((self.filter == 'taxids' and\
                #            (int(tax_num) not in self.tax_ids)) or\
                #            (self.filter == 'geneids' and\
                #             (int(gene_num) not in self.gene_ids))):
                #        continue
                # #### end filter

                line_counter += 1

                pheno_list = []
                if phenotype_ids != '-':
                    # trim any leading/trailing semicolons/commas
                    phenotype_ids = re.sub(r'^[;,]', '', phenotype_ids)
                    phenotype_ids = re.sub(r'[;,]$', '', phenotype_ids)
                    pheno_list = re.split(r'[,;]', phenotype_ids)

                if self.testMode:
                    # get intersection of test disease ids
                    # and these phenotype_ids
                    intersect = \
                        list(
                            set([str(i)
                                for i in self.disease_ids]) & set(pheno_list))
                    if int(gene_num) not in self.gene_ids and\
                            int(variant_num) not in self.variant_ids and\
                            len(intersect) < 1:
                        continue

                # TODO may need to switch on assembly to create correct
                # assembly/build identifiers
                build_id = ':'.join(('NCBIGenome', assembly))

                # make the reference genome build
                geno.addReferenceGenome(build_id, assembly, tax_id)

                allele_type_id = self._map_type_of_allele(allele_type)
                bandinbuild_id = None
                if str(chr) == '':
                    # check cytogenic location
                    if str(cytogenetic_loc).strip() != '':
                        # use cytogenic location to get the apx location
                        # oddly, they still put an assembly number even when
                        # there's no numeric location
                        if not re.search(r'-', str(cytogenetic_loc)):
                            band_id = makeChromID(
                                re.split(r'-', str(cytogenetic_loc)),
                                tax_num, 'CHR')
                            geno.addChromosomeInstance(
                                cytogenetic_loc, build_id, assembly, band_id)
                            bandinbuild_id = makeChromID(
                                re.split(r'-', str(cytogenetic_loc)),
                                assembly, 'MONARCH')
                        else:
                            # can't deal with ranges yet
                            pass
                else:
                    # add the human chromosome class to the graph,
                    # and add the build-specific version of it
                    chr_id = makeChromID(str(chr), tax_num, 'CHR')
                    geno.addChromosomeClass(str(chr), tax_id, tax_label)
                    geno.addChromosomeInstance(
                        str(chr), build_id, assembly, chr_id)
                    chrinbuild_id = makeChromID(str(chr), assembly, 'MONARCH')

                seqalt_id = ':'.join(('ClinVarVariant', variant_num))
                gene_id = None

                # they use -1 to indicate unknown gene
                if str(gene_num) != '-1' and str(gene_num) != 'more than 10':
                    if re.match(r'^Gene:', gene_num):
                        gene_num = "NCBI" + gene_num
                    else:
                        gene_id = ':'.join(('NCBIGene', str(gene_num)))

                # FIXME there are some "variants" that are actually haplotypes
                # probably will get taken care of when we switch to processing
                # the xml for example, variant_num = 38562
                # but there's no way to tell if it's a haplotype
                # in the csv data so the dbsnp or dbvar
                # should probably be primary,
                # and the variant num be the vslc,
                # with each of the dbsnps being added to it

                # TODO clinical significance needs to be mapped to
                # a list of terms
                # first, make the variant:
                f = Feature(seqalt_id, allele_name, allele_type_id)

                if start != '-' and start.strip() != '':
                    f.addFeatureStartLocation(start, chrinbuild_id)
                if stop != '-' and stop.strip() != '':
                    f.addFeatureEndLocation(stop, chrinbuild_id)

                f.addFeatureToGraph()
                f.addTaxonToFeature(tax_id)
                # make the ClinVarVariant the clique leader
                model.makeLeader(seqalt_id)

                if bandinbuild_id is not None:
                    f.addSubsequenceOfFeature(bandinbuild_id)

                # CHECK - this makes the assumption that there is
                # only one affected chromosome per variant what happens with
                # chromosomal rearrangement variants?
                # shouldn't both chromosomes be here?

                # add the hgvs as synonyms
                if hgvs_c != '-' and hgvs_c.strip() != '':
                    model.addSynonym(seqalt_id, hgvs_c)
                if hgvs_p != '-' and hgvs_p.strip() != '':
                    model.addSynonym(seqalt_id, hgvs_p)

                # add the dbsnp and dbvar ids as equivalent
                if dbsnp_num != '-' and int(dbsnp_num) != -1:
                    dbsnp_id = 'dbSNP:rs'+str(dbsnp_num)
                    model.addIndividualToGraph(dbsnp_id, None)
                    model.addSameIndividual(seqalt_id, dbsnp_id)
                if dbvar_num != '-':
                    dbvar_id = 'dbVar:'+dbvar_num
                    model.addIndividualToGraph(dbvar_id, None)
                    model.addSameIndividual(seqalt_id, dbvar_id)

                # TODO - not sure if this is right... add as xref?
                # the rcv is like the combo of the phenotype with the variant
                if rcv_nums != '-':
                    for rcv_num in re.split(r';', rcv_nums):
                        rcv_id = 'ClinVar:' + rcv_num
                        model.addIndividualToGraph(rcv_id, None)
                        model.addXref(seqalt_id, rcv_id)

                if gene_id is not None:
                    # add the gene
                    model.addClassToGraph(gene_id, gene_symbol)
                    # make a variant locus
                    vl_id = '_'+gene_num+'-'+variant_num
                    if self.nobnodes:
                        vl_id = ':'+vl_id
                    vl_label = allele_name
                    model.addIndividualToGraph(
                        vl_id, vl_label, geno.genoparts['variant_locus'])
                    geno.addSequenceAlterationToVariantLocus(seqalt_id, vl_id)
                    geno.addAlleleOfGene(vl_id, gene_id)
                else:
                    # some basic reporting
                    gmatch = re.search(r'\(\w+\)', allele_name)
                    if gmatch is not None and len(gmatch.groups()) > 0:
                        logger.info(
                            "Gene found in allele label, but no id provided: %s",
                            gmatch.group(1))
                    elif re.match(r'more than 10', gene_symbol):
                        logger.info(
                            "More than 10 genes found; "
                            "need to process XML to fetch (variant=%d)",
                            int(variant_num))
                    else:
                        logger.info(
                            "No gene listed for variant %d",
                            int(variant_num))

                # parse the list of "phenotypes" which are diseases.
                # add them as an association
                # ;GeneReviews:NBK1440,MedGen:C0392514,OMIM:235200,SNOMED CT:35400008;MedGen:C3280096,OMIM:614193;MedGen:CN034317,OMIM:612635;MedGen:CN169374
                # the list is both semicolon delimited and comma delimited,
                # but i don't know why! some are bad, like:
                # Orphanet:ORPHA ORPHA319705,SNOMED CT:49049000
                if phenotype_ids != '-':
                    for phenotype in pheno_list:
                        m = re.match(
                            r"(Orphanet:ORPHA(?:\s*ORPHA)?)", phenotype)
                        if m is not None and len(m.groups()) > 0:
                            phenotype = re.sub(
                                m.group(1), 'Orphanet:', phenotype.strip())
                        elif re.match(r'ORPHA:\d+', phenotype):
                            phenotype = re.sub(
                                r'^ORPHA', 'Orphanet', phenotype.strip())
                        elif re.match(r'Human Phenotype Ontology', phenotype):
                            phenotype = re.sub(
                                r'^Human Phenotype Ontology', '',
                                phenotype.strip())
                        elif re.match(r'SNOMED CT:\s?', phenotype):
                            phenotype = re.sub(
                                r'SNOMED CT:\s?', 'SNOMED:', phenotype.strip())
                        elif re.match(r'^Gene:', phenotype):
                            continue

                        assoc = G2PAssoc(
                            g, self.name, seqalt_id, phenotype.strip())
                        assoc.add_association_to_graph()

                if other_ids != '-':
                    id_list = other_ids.split(',')
                    # process the "other ids" ex:
                    # CFTR2:F508del,HGMD:CD890142,OMIM Allelic Variant:602421.0001
                    # TODO make more xrefs
                    for xrefid in id_list:
                        prefix = xrefid.split(':')[0].strip()
                        if prefix == 'OMIM Allelic Variant':
                            xrefid = 'OMIM:'+xrefid.split(':')[1]
                            model.addIndividualToGraph(xrefid, None)
                            model.addSameIndividual(seqalt_id, xrefid)
                        elif prefix == 'HGMD':
                            model.addIndividualToGraph(xrefid, None)
                            model.addSameIndividual(seqalt_id, xrefid)
                        elif prefix == 'dbVar' \
                                and dbvar_num == xrefid.split(':')[1].strip():
                            pass  # skip over this one
                        elif re.search(r'\s', prefix):
                            pass
                            # logger.debug(
                            #   'xref prefix has a space: %s', xrefid)
                        else:
                            # should be a good clean prefix
                            # note that HGMD variants are in here as Xrefs
                            # because we can't resolve URIs for them
                            # logger.info("Adding xref: %s", xrefid)
                            # gu.addXref(g, seqalt_id, xrefid)
                            # logger.info("xref prefix to add: %s", xrefid)
                            pass

                if not self.testMode and limit is not None \
                        and line_counter > limit:
                    break

        logger.info("Finished parsing variants")

        return
Esempio n. 39
0
    def process_gaf(self, file, limit, id_map=None):
        """
        Parse a gzipped, tab-delimited gene association (GAF) file and add
        gene-to-GO associations to the graph.

        For every usable row this adds the gene class (with description,
        synonyms and taxon), one association from the gene to the GO term
        with its evidence and references, and, for 'IMP' annotations that
        carry a with/from value, one G2PAssoc per with/from item.

        Rows are skipped when they are blank or comments (start with '!'),
        lack a required field, carry a NOT qualifier, are UniProtKB rows
        that do not map to exactly one gene through id_map, or (in test
        mode) are not an NCBIGene row whose number is in self.test_ids.

        :param file: path to the gzipped GAF file
        :param limit: stop once more than this many rows have been read
            (ignored in test mode, or when None); every row read counts,
            including comments, but the check only runs on rows that are
            processed to the end
        :param id_map: optional mapping of UniProtKB accession to a
            sequence of gene curies; without it UniProtKB rows are skipped
        :return: None
        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        model = Model(g)
        geno = Genotype(g)
        logger.info("Processing Gene Associations from %s", file)
        line_counter = 0

        # Helper sources used to mint reagent-targeted gene ids.
        # They are created up front for the configured taxon (as before),
        # and lazily below if a row needs one that was not created here,
        # so that the names are never unbound.
        zfin = None
        wbase = None
        if 7955 in self.tax_ids:
            zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
        elif 6239 in self.tax_ids:
            wbase = WormBase(self.graph_type, self.are_bnodes_skized)

        # GO aspect code -> relationship; does not change between rows
        aspect_rel_map = {
            'P': model.object_properties['involved_in'],  # involved in
            'F': model.object_properties['enables'],  # enables
            'C': model.object_properties['part_of']  # part of
        }

        with gzip.open(file, 'rb') as csvfile:
            filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                    delimiter='\t',
                                    quotechar='\"')
            for row in filereader:
                line_counter += 1
                # blank lines have nothing to unpack
                if not row:
                    continue
                # comments start with exclamation
                if ''.join(row).startswith('!'):
                    continue
                (db, gene_num, gene_symbol, qualifier, go_id, ref, eco_symbol,
                 with_or_from, aspect, gene_name, gene_synonym, object_type,
                 taxon, date, assigned_by, annotation_extension,
                 gene_product_form_id) = row

                # test for required fields
                if (db == '' or gene_num == '' or gene_symbol == ''
                        or go_id == '' or ref == '' or eco_symbol == ''
                        or aspect == '' or object_type == '' or taxon == ''
                        or date == '' or assigned_by == ''):
                    # the row is passed as an argument (not concatenated
                    # into the format) so a '%' in the data cannot break
                    # the log formatting
                    logger.error(
                        "Missing required part of annotation on row %d:\n%s",
                        line_counter, '\t'.join(row))
                    continue

                # deal with qualifier NOT, contributes_to, colocalizes_with
                if re.search(r'NOT', qualifier):
                    continue

                db = self.clean_db_prefix(db)
                uniprotid = None
                gene_id = None
                if db == 'UniProtKB':
                    # only look into the map when one was supplied
                    mapped_ids = None
                    if id_map is not None:
                        mapped_ids = id_map.get(gene_num)
                    # skip proteins that are unmapped, or that map to
                    # zero or to more than one gene
                    if mapped_ids is None or len(mapped_ids) != 1:
                        continue
                    gene_id = mapped_ids[0]
                    uniprotid = ':'.join((db, gene_num))
                    gene_num = re.sub(r'\w+\:', '', gene_id)
                elif db == 'MGI':
                    gene_num = re.sub(r'MGI:', '', gene_num)
                    gene_id = ':'.join((db, gene_num))
                    gene_id = re.sub(r'MGI\:MGI\:', 'MGI:', gene_id)
                else:
                    gene_id = ':'.join((db, gene_num))

                if self.testMode \
                        and not(
                            re.match(r'NCBIGene', gene_id) and
                            int(gene_num) in self.test_ids):
                    continue

                model.addClassToGraph(gene_id, gene_symbol)
                if gene_name != '':
                    model.addDescription(gene_id, gene_name)
                if gene_synonym != '':
                    for s in re.split(r'\|', gene_synonym):
                        model.addSynonym(gene_id, s.strip())
                if re.search(r'\|', taxon):
                    # TODO add annotations with >1 taxon
                    logger.info(">1 taxon (%s) on line %d.  skipping", taxon,
                                line_counter)
                else:
                    tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                    geno.addTaxon(tax_id, gene_id)

                assoc = Assoc(g, self.name)

                assoc.set_subject(gene_id)
                assoc.set_object(go_id)

                eco_id = self.map_go_evidence_code_to_eco(eco_symbol)
                if eco_id is not None:
                    assoc.add_evidence(eco_id)

                # Normalize the pipe-separated references once; the same
                # list is reused for the derived G2P associations below.
                # Only the leading prefix is swapped (by slicing, not by
                # regex) so the local id part is never rewritten.
                clean_refs = []
                for raw_ref in ref.split('|'):
                    raw_ref = raw_ref.strip()
                    if raw_ref == '':
                        continue
                    prefix = raw_ref.split(':')[0]
                    ref_id = \
                        self.clean_db_prefix(prefix) + raw_ref[len(prefix):]
                    ref_id = re.sub(r'MGI\:MGI\:', 'MGI:', ref_id)
                    clean_refs.append(ref_id)

                for ref_id in clean_refs:
                    reference = Reference(g, ref_id)
                    if re.match(r'PMID', ref_id):
                        ref_type = Reference.ref_types['journal_article']
                        reference.setType(ref_type)
                    reference.addRefToGraph()
                    assoc.add_source(ref_id)

                # TODO add the source of the annotations from assigned by?

                if aspect not in aspect_rel_map:
                    logger.error("Aspect not recognized: %s", aspect)

                rel = aspect_rel_map.get(aspect)
                if aspect == 'F' and re.search(r'contributes_to', qualifier):
                    rel = model.object_properties['contributes_to']
                assoc.set_relationship(rel)
                if uniprotid is not None:
                    assoc.set_description('Mapped from ' + uniprotid)
                # object_type should be one of:
                # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
                # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
                # If the precise product type is unknown,
                # gene_product should be used

                assoc.add_association_to_graph()

                # Derive G2P Associations from IMP annotations
                # in version 2.1 Pipe will indicate 'OR'
                # and Comma will indicate 'AND'.
                # in version 2.0, multiple values are separated by pipes
                # where the pipe has been used to mean 'AND'
                if eco_symbol == 'IMP' and with_or_from != '':
                    withitems = re.split(r'\|', with_or_from)
                    phenotypeid = go_id + 'PHENOTYPE'
                    # create phenotype associations
                    for i in withitems:
                        if i == '' or \
                                re.match(
                                    r'(UniProtKB|WBPhenotype|InterPro|HGNC)',
                                    i):
                            logger.warning(
                                "Don't know what having a uniprot id " +
                                "in the 'with' column means of %s", uniprotid)
                            continue
                        i = re.sub(r'MGI\:MGI\:', 'MGI:', i)
                        i = re.sub(r'WB:', 'WormBase:', i)

                        # for worms and fish, they might give a RNAi or MORPH
                        # in these cases make a reagent-targeted gene
                        if re.search('MRPHLNO|CRISPR|TALEN', i):
                            if zfin is None:
                                zfin = ZFIN(
                                    self.graph_type, self.are_bnodes_skized)
                            targeted_gene_id = zfin.make_targeted_gene_id(
                                gene_id, i)
                            geno.addReagentTargetedGene(
                                i, gene_id, targeted_gene_id)
                            g2p_assoc = G2PAssoc(
                                g, self.name, targeted_gene_id, phenotypeid)
                        elif re.search(r'WBRNAi', i):
                            if wbase is None:
                                wbase = WormBase(
                                    self.graph_type, self.are_bnodes_skized)
                            targeted_gene_id = \
                                wbase.make_reagent_targeted_gene_id(
                                    gene_id, i)
                            geno.addReagentTargetedGene(
                                i, gene_id, targeted_gene_id)
                            g2p_assoc = G2PAssoc(
                                g, self.name, targeted_gene_id, phenotypeid)
                        else:
                            g2p_assoc = G2PAssoc(
                                g, self.name, i, phenotypeid)
                        for ref_id in clean_refs:
                            g2p_assoc.add_source(ref_id)
                            # experimental phenotypic evidence
                            g2p_assoc.add_evidence("ECO:0000059")
                        g2p_assoc.add_association_to_graph()
                        # TODO should the G2PAssoc be
                        # the evidence for the GO assoc?

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        return