Ejemplo n.º 1
0
    def __init__(self, graph_type, are_bnodes_skolemized):
        super().__init__(
            graph_type,
            are_bnodes_skolemized,
            'decipher',
            ingest_title='Development Disorder Genotype Phenotype Database',
            ingest_url='https://decipher.sanger.ac.uk/',
            license_url='https://decipher.sanger.ac.uk/legal',
            data_rights='https://decipher.sanger.ac.uk/datasharing',
            # file_handle=None
        )

        if 'disease' not in self.all_test_ids:
            LOG.warning("not configured with disease test ids.")
            self.test_ids = []
        else:
            self.test_ids = self.all_test_ids['disease']

        self.graph = self.graph
        self.geno = Genotype(self.graph)
        self.model = Model(self.graph)
        self.graph_type = graph_type
        self.are_bnodes_skolemized = are_bnodes_skolemized

        return
Ejemplo n.º 2
0
    def _add_snp_gene_relation(self, snp_id, snp_gene_nums,
                               upstream_gene_num, downstream_gene_num):
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        geno = Genotype(g)
        # add the feature as a sequence alteration
        # affecting various genes
        # note that intronic variations don't necessarily list
        # the genes such as for rs10448080  FIXME
        if snp_gene_nums != '':
            for s in re.split(r',', snp_gene_nums):
                s = s.strip()
                # still have to test for this,
                # because sometimes there's a leading comma
                if s != '':
                    gene_id = 'NCBIGene:' + s
                    geno.addAffectedLocus(snp_id, gene_id)

        # add the up and downstream genes if they are available
        if upstream_gene_num != '':
            downstream_gene_id = 'NCBIGene:' + downstream_gene_num
            g.addTriple(
                snp_id,
                Feature.object_properties[
                    r'upstream_of_sequence_of'],
                downstream_gene_id)
        if downstream_gene_num != '':
            upstream_gene_id = 'NCBIGene:' + upstream_gene_num
            g.addTriple(
                snp_id,
                Feature.object_properties[
                    'downstream_of_sequence_of'],
                upstream_gene_id)
Ejemplo n.º 3
0
    def _add_snp_gene_relation(self, snp_id, snp_gene_nums, upstream_gene_num,
                               downstream_gene_num):
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        geno = Genotype(graph)
        # add the feature as a sequence alteration
        # affecting various genes
        # note that intronic variations don't necessarily list
        # the genes such as for rs10448080  FIXME
        if snp_gene_nums != '':
            for geneid in re.split(r',', snp_gene_nums):
                geneid = geneid.strip()
                # still have to test for this,
                # because sometimes there's a leading comma
                if geneid != '':
                    geno.addAffectedLocus(snp_id, 'ENSEMBL:' + geneid)

        # add the up and downstream genes if they are available
        if upstream_gene_num != '':
            downstream_gene_id = 'ENSEMBL:' + downstream_gene_num
            graph.addTriple(snp_id,
                            self.globaltt['is upstream of sequence of'],
                            downstream_gene_id)
        if downstream_gene_num != '':
            upstream_gene_id = 'ENSEMBL:' + upstream_gene_num
            graph.addTriple(snp_id,
                            self.globaltt['is downstream of sequence of'],
                            upstream_gene_id)
Ejemplo n.º 4
0
    def __init__(self, graph_type, are_bnodes_skolemized):
        super().__init__(graph_type, are_bnodes_skolemized, 'ctd')
        self.dataset = Dataset(
            'ctd', 'CTD', 'http://ctdbase.org', None,
            'http://ctdbase.org/about/legal.jsp')

        if 'test_ids' not in config.get_config() \
                or 'gene' not in config.get_config()['test_ids']:
            logger.warning("not configured with gene test ids.")
            self.test_geneids = []
        else:
            self.test_geneids = config.get_config()['test_ids']['gene']

        if 'test_ids' not in config.get_config() \
                or 'disease' not in config.get_config()['test_ids']:
            logger.warning("not configured with disease test ids.")
            self.test_diseaseids = []
        else:
            self.test_diseaseids = config.get_config()['test_ids']['disease']

        self.g = self.graph
        self.geno = Genotype(self.graph)
        self.pathway = Pathway(self.graph)

        return
Ejemplo n.º 5
0
    def _add_snp_gene_relation(
            self, snp_id, snp_gene_nums, upstream_gene_num, downstream_gene_num):
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        geno = Genotype(graph)
        # add the feature as a sequence alteration
        # affecting various genes
        # note that intronic variations don't necessarily list
        # the genes such as for rs10448080  FIXME
        if snp_gene_nums != '':
            for geneid in re.split(r',', snp_gene_nums):
                geneid = geneid.strip()
                # still have to test for this,
                # because sometimes there's a leading comma
                if geneid != '':
                    geno.addAffectedLocus(snp_id, 'NCBIGene:' + geneid)

        # add the up and downstream genes if they are available
        if upstream_gene_num != '':
            downstream_gene_id = 'NCBIGene:' + downstream_gene_num
            graph.addTriple(
                snp_id, self.globaltt['is upstream of sequence of'], downstream_gene_id)
        if downstream_gene_num != '':
            upstream_gene_id = 'NCBIGene:' + upstream_gene_num
            graph.addTriple(
                snp_id, self.globaltt['is downstream of sequence of'], upstream_gene_id)
Ejemplo n.º 6
0
    def process_disease_association(self, limit):

        raw = '/'.join((self.rawdir, self.files['disease_assoc']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        gu = GraphUtils(curie_map.get())

        logger.info("Processing disease models")
        geno = Genotype(g, self.nobnodes)
        line_counter = 0
        worm_taxon = 'NCBITaxon:6239'
        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                if re.match(r'!', ''.join(row)):  # header
                    continue
                line_counter += 1
                (db, gene_num, gene_symbol, is_not, disease_id, ref,
                 eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
                 gene_class, taxon, date, assigned_by, blank, blank2) = row

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                # TODO add NOT phenotypes
                if is_not == 'NOT':
                    continue

                # WB	WBGene00000001	aap-1		DOID:2583	PMID:19029536	IEA	ENSEMBL:ENSG00000145675|OMIM:615214	D		Y110A7A.10	gene	taxon:6239	20150612	WB
                gene_id = 'WormBase:'+gene_num

                # make a variant of the gene
                vl = '_'+'-'.join((gene_num, 'unspecified'))
                if self.nobnodes:
                    vl = ':'+vl
                vl_label = 'some variant of '+gene_symbol
                geno.addAlleleOfGene(vl, gene_id)
                animal_id = geno.make_experimental_model_with_genotype(
                    g, vl, vl_label, worm_taxon, 'worm')

                assoc = G2PAssoc(
                    self.name, animal_id,
                    disease_id, gu.object_properties['model_of'])
                ref = re.sub(r'WB_REF:', 'WormBase:', ref)
                if ref != '':
                    assoc.add_source(ref)
                eco_id = None
                if eco_symbol == 'IEA':
                    eco_id = 'ECO:0000501'  # IEA is this now
                if eco_id is not None:
                    assoc.add_evidence(eco_id)

                assoc.add_association_to_graph(g)

        return
Ejemplo n.º 7
0
    def _build_gene_disease_model(self,
                                  gene_id,
                                  relation_id,
                                  disease_id,
                                  variant_label,
                                  consequence_predicate=None,
                                  consequence_id=None,
                                  allelic_requirement=None,
                                  pmids=None):
        """
        Builds gene variant disease model

        :return: None
        """
        model = Model(self.graph)
        geno = Genotype(self.graph)

        pmids = [] if pmids is None else pmids

        is_variant = False
        variant_or_gene = gene_id

        variant_id_string = variant_label
        variant_bnode = self.make_id(variant_id_string, "_")

        if consequence_predicate is not None \
                and consequence_id is not None:
            is_variant = True
            model.addTriple(variant_bnode, consequence_predicate,
                            consequence_id)
            # Hack to add labels to terms that
            # don't exist in an ontology
            if consequence_id.startswith(':'):
                model.addLabel(consequence_id,
                               consequence_id.strip(':').replace('_', ' '))

        if is_variant:
            variant_or_gene = variant_bnode
            # Typically we would type the variant using the
            # molecular consequence, but these are not specific
            # enough for us to make mappings (see translation table)
            model.addIndividualToGraph(variant_bnode, variant_label,
                                       self.globaltt['variant_locus'])
            geno.addAffectedLocus(variant_bnode, gene_id)
            model.addBlankNodeAnnotation(variant_bnode)

        assoc = G2PAssoc(self.graph, self.name, variant_or_gene, disease_id,
                         relation_id)
        assoc.source = pmids
        assoc.add_association_to_graph()

        if allelic_requirement is not None and is_variant is False:
            model.addTriple(assoc.assoc_id,
                            self.globaltt['has_allelic_requirement'],
                            allelic_requirement)
            if allelic_requirement.startswith(':'):
                model.addLabel(
                    allelic_requirement,
                    allelic_requirement.strip(':').replace('_', ' '))
Ejemplo n.º 8
0
    def _get_process_allelic_variants(self, entry, g):
        gu = GraphUtils(curie_map.get())
        geno = Genotype(g)
        du = DipperUtil()
        if entry is not None:
            publist = {}  # to hold the entry-specific publication mentions for the allelic variants
            entry_num = entry['mimNumber']

            # process the ref list just to get the pmids
            ref_to_pmid = self._get_pubs(entry, g)

            if 'allelicVariantList' in entry:
                allelicVariantList = entry['allelicVariantList']
                for al in allelicVariantList:
                    al_num = al['allelicVariant']['number']
                    al_id = 'OMIM:'+str(entry_num)+'.'+str(al_num).zfill(4)
                    al_label = None
                    al_description = None
                    if al['allelicVariant']['status'] == 'live':
                        publist[al_id] = set()
                        if 'mutations' in al['allelicVariant']:
                            al_label = al['allelicVariant']['mutations']
                        if 'text' in al['allelicVariant']:
                            al_description = al['allelicVariant']['text']
                            m = re.findall('\{(\d+)\:', al_description)
                            publist[al_id] = set(m)
                        geno.addAllele(al_id, al_label, geno.genoparts['variant_locus'], al_description)
                        geno.addAlleleOfGene(al_id, 'OMIM:'+str(entry_num),
                                             geno.object_properties['is_sequence_variant_instance_of'])
                        for r in publist[al_id]:
                            pmid = ref_to_pmid[int(r)]
                            gu.addTriple(g, pmid, gu.object_properties['is_about'], al_id)
                        # look up the pubmed id in the list of references
                        if 'dbSnps' in al['allelicVariant']:
                            dbsnp_ids = re.split(',', al['allelicVariant']['dbSnps'])
                            for dnum in dbsnp_ids:
                                did = 'dbSNP:'+dnum.strip()
                                gu.addIndividualToGraph(g, did, None)
                                gu.addEquivalentClass(g, al_id, did)
                        if 'clinvarAccessions' in al['allelicVariant']:
                            # clinvarAccessions triple semicolon delimited, each lik eRCV000020059;;1
                            rcv_ids = re.split(';;;', al['allelicVariant']['clinvarAccessions'])
                            rcv_ids = [(re.match('(RCV\d+)\;\;', r)).group(1) for r in rcv_ids]
                            for rnum in rcv_ids:
                                rid = 'ClinVar:'+rnum
                                gu.addXref(g, al_id, rid)
                        gu.addPage(g, al_id, "http://omim.org/entry/"+str(entry_num)+"#"+str(al_num).zfill(4))
                    elif re.search('moved', al['allelicVariant']['status']):
                        # for both 'moved' and 'removed'
                        moved_ids = None
                        if 'movedTo' in al['allelicVariant']:
                            moved_id = 'OMIM:'+al['allelicVariant']['movedTo']
                            moved_ids = [moved_id]
                        gu.addDeprecatedIndividual(g, al_id, moved_ids)
                    else:
                        logger.error('Uncaught alleleic variant status %s', al['allelicVariant']['status'])
                # end loop allelicVariantList

        return
Ejemplo n.º 9
0
    def parse(self, limit=None):
        """

        :param limit:
        :return:
        """
        if limit is not None:
            logger.info("Only parsing first %s rows fo each file", str(limit))

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True
            g = self.testgraph
        else:
            g = self.graph

        tmap = '/'.join((self.rawdir, self.files['trait_mappings']['file']))
        self._process_trait_mappings(tmap, limit)

        geno = Genotype(g)
        # organisms  = ['chicken']
        organisms = [
            'chicken', 'pig', 'horse', 'rainbow_trout', 'sheep', 'cattle']

        for o in organisms:
            tax_id = self._get_tax_by_common_name(o)
            geno.addGenome(tax_id, o)
            build_id = None
            build = None

            k = o+'_bp'
            if k in self.files:
                file = self.files[k]['file']
                m = re.search(r'QTL_([\w\.]+)\.gff.txt.gz', file)
                if m is None:
                    logger.error("Can't match a gff build")
                else:
                    build = m.group(1)
                    build_id = self._map_build_by_abbrev(build)
                    logger.info("Build = %s", build_id)
                    geno.addReferenceGenome(build_id, build, tax_id)
                if build_id is not None:
                    self._process_QTLs_genomic_location(
                        '/'.join((self.rawdir, file)), tax_id, build_id, build,
                        limit)

            k = o+'_cm'
            if k in self.files:
                file = self.files[k]['file']
                self._process_QTLs_genetic_location(
                    '/'.join((self.rawdir, file)), tax_id, o, limit)

        logger.info("Finished parsing")

        self.load_bindings()

        logger.info("Found %d nodes", len(self.graph))
        return
Ejemplo n.º 10
0
    def _process_genes(self, taxid, limit=None):
        gu = GraphUtils(curie_map.get())

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        geno = Genotype(g)

        raw = '/'.join((self.rawdir, self.files[taxid]['file']))
        line_counter = 0
        logger.info("Processing Ensembl genes for tax %s", taxid)
        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t')
            for row in filereader:
                if len(row) < 4:
                    logger.error("Data error for file %s", raw)
                    return
                (ensembl_gene_id, external_gene_name, description,
                 gene_biotype, entrezgene) = row[0:5]

                # in the case of human genes, we also get the hgnc id,
                # and is the last col
                if taxid == '9606':
                    hgnc_id = row[5]
                else:
                    hgnc_id = None

                if self.testMode and entrezgene != '' \
                        and int(entrezgene) not in self.gene_ids:
                    continue

                line_counter += 1
                gene_id = 'ENSEMBL:'+ensembl_gene_id
                if description == '':
                    description = None
                gene_type_id = self._get_gene_type(gene_biotype)
                gene_type_id = None
                gu.addClassToGraph(
                    g, gene_id, external_gene_name, gene_type_id, description)

                if entrezgene != '':
                    gu.addEquivalentClass(g, gene_id, 'NCBIGene:'+entrezgene)
                if hgnc_id is not None and hgnc_id != '':
                    gu.addEquivalentClass(g, gene_id, hgnc_id)
                geno.addTaxon('NCBITaxon:'+taxid, gene_id)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        gu.loadProperties(g, Feature.object_properties, gu.OBJPROP)
        gu.loadProperties(g, Feature.data_properties, gu.DATAPROP)
        gu.loadProperties(g, Genotype.object_properties, gu.OBJPROP)
        gu.loadAllProperties(g)

        return
Ejemplo n.º 11
0
    def process_disease_association(self, limit):

        raw = '/'.join((self.rawdir, self.files['disease_assoc']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        model = Model(g)
        logger.info("Processing disease models")
        geno = Genotype(g)
        line_counter = 0
        worm_taxon = 'NCBITaxon:6239'
        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                if re.match(r'!', ''.join(row)):  # header
                    continue
                line_counter += 1
                (db, gene_num, gene_symbol, is_not, disease_id, ref,
                 eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
                 gene_class, taxon, date, assigned_by, blank, blank2) = row

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                # TODO add NOT phenotypes
                if is_not == 'NOT':
                    continue

                # WB	WBGene00000001	aap-1		DOID:2583	PMID:19029536	IEA	ENSEMBL:ENSG00000145675|OMIM:615214	D		Y110A7A.10	gene	taxon:6239	20150612	WB
                gene_id = 'WormBase:' + gene_num

                # make a variant of the gene
                vl = '_:' + '-'.join((gene_num, 'unspecified'))
                vl_label = 'some variant of ' + gene_symbol
                geno.addAffectedLocus(vl, gene_id)
                model.addBlankNodeAnnotation(vl)
                animal_id = geno.make_experimental_model_with_genotype(
                    vl, vl_label, worm_taxon, 'worm')

                assoc = G2PAssoc(g, self.name, animal_id, disease_id,
                                 model.object_properties['model_of'])
                ref = re.sub(r'WB_REF:', 'WormBase:', ref)
                if ref != '':
                    assoc.add_source(ref)
                eco_id = None
                if eco_symbol == 'IEA':
                    eco_id = 'ECO:0000501'  # IEA is this now
                if eco_id is not None:
                    assoc.add_evidence(eco_id)

                assoc.add_association_to_graph()

        return
Ejemplo n.º 12
0
 def setUp(self):
     self.graph = RDFGraph()
     self.curie_map = curie_map.get()
     self.genotype = Genotype(self.graph)
     self.cutil = CurieUtil(self.curie_map)
     self.test_cat_pred = self.cutil.get_uri(blv.terms['category'])
     self.test_cat_genotype_category = self.cutil.get_uri(
         blv.terms['Genotype'])
     self.test_cat_background_category = self.cutil.get_uri(
         blv.terms['PopulationOfIndividualOrganisms'])
Ejemplo n.º 13
0
    def _make_pheno_assoc(self, g, gene_id, gene_symbol, disorder_num,
                          disorder_label, phene_key):

        geno = Genotype(g)
        model = Model(g)
        disorder_id = ':'.join(('OMIM', disorder_num))
        rel_id = model.object_properties['has_phenotype']  # default
        rel_label = 'causes'
        if re.match(r'\[', disorder_label):
            rel_id = model.object_properties['is_marker_for']
            rel_label = 'is a marker for'
        elif re.match(r'\{', disorder_label):
            rel_id = model.object_properties['contributes_to']
            rel_label = 'contributes to'
        elif re.match(r'\?', disorder_label):
            # this is a questionable mapping!  skip?
            rel_id = model.object_properties['contributes_to']
            rel_label = 'contributes to'

        evidence = self._map_phene_mapping_code_to_eco(phene_key)

        # we actually want the association between the gene and the disease
        # to be via an alternate locus not the "wildtype" gene itself.
        # so we make an anonymous alternate locus,
        # and put that in the association.
        # but we only need to do that in the cases when it's not an NCBIGene
        # (as that is a sequence feature itself)
        if re.match(r'OMIM:', gene_id):
            alt_locus = '_:' + re.sub(r':', '',
                                      gene_id) + '-' + disorder_num + 'VL'
            alt_label = gene_symbol.strip()
            if alt_label is not None and alt_label != '':
                alt_label = \
                    ' '.join(('some variant of', alt_label,
                              'that', rel_label, disorder_label))
            else:
                alt_label = None

            model.addIndividualToGraph(alt_locus, alt_label,
                                       Genotype.genoparts['variant_locus'])
            geno.addAffectedLocus(alt_locus, gene_id)
            model.addBlankNodeAnnotation(alt_locus)

        else:
            # assume it's already been added
            alt_locus = gene_id

        assoc = G2PAssoc(g, self.name, alt_locus, disorder_id, rel_id)
        assoc.add_evidence(evidence)
        assoc.add_association_to_graph()

        return
Ejemplo n.º 14
0
    def process_disease_association(self, limit):

        raw = '/'.join((self.rawdir, self.files['disease_assoc']['file']))

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph

        model = Model(graph)
        LOG.info("Processing disease models")
        geno = Genotype(graph)
        line_counter = 0
        worm_taxon = self.globaltt['Caenorhabditis elegans']
        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                if re.match(r'!', ''.join(row)):  # header
                    continue
                line_counter += 1
                (db, gene_num, gene_symbol, is_not, disease_id, ref,
                 eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
                 gene_class, taxon, date, assigned_by, blank, blank2) = row

                if self.test_mode and gene_num not in self.test_ids['gene']:
                    continue

                # TODO add NOT phenotypes
                if is_not == 'NOT':
                    continue

                # WB	WBGene00000001	aap-1		DOID:2583	PMID:19029536	IEA	ENSEMBL:ENSG00000145675|OMIM:615214	D		Y110A7A.10	gene	taxon:6239	20150612	WB
                gene_id = 'WormBase:'+gene_num

                # make a variant of the gene
                vl = '_:'+'-'.join((gene_num, 'unspecified'))
                vl_label = 'some variant of '+gene_symbol
                geno.addAffectedLocus(vl, gene_id)
                model.addBlankNodeAnnotation(vl)
                animal_id = geno.make_experimental_model_with_genotype(
                    vl, vl_label, worm_taxon, 'worm')

                assoc = G2PAssoc(
                    graph, self.name, animal_id,
                    disease_id, self.globaltt['is model of'])
                ref = re.sub(r'WB_REF:', 'WormBase:', ref)
                if ref != '':
                    assoc.add_source(ref)
                assoc.add_evidence(self.resolve(eco_symbol))
                assoc.add_association_to_graph()

        return
Ejemplo n.º 15
0
    def _make_pheno_assoc(self, g, gene_id, gene_symbol, disorder_num,
                          disorder_label, phene_key):

        geno = Genotype(g)
        model = Model(g)
        disorder_id = ':'.join(('OMIM', disorder_num))
        rel_id = model.object_properties['has_phenotype']  # default
        rel_label = 'causes'
        if re.match(r'\[', disorder_label):
            rel_id = model.object_properties['is_marker_for']
            rel_label = 'is a marker for'
        elif re.match(r'\{', disorder_label):
            rel_id = model.object_properties['contributes_to']
            rel_label = 'contributes to'
        elif re.match(r'\?', disorder_label):
            # this is a questionable mapping!  skip?
            rel_id = model.object_properties['contributes_to']
            rel_label = 'contributes to'

        evidence = self._map_phene_mapping_code_to_eco(phene_key)

        # we actually want the association between the gene and the disease
        # to be via an alternate locus not the "wildtype" gene itself.
        # so we make an anonymous alternate locus,
        # and put that in the association.
        # but we only need to do that in the cases when it's not an NCBIGene
        # (as that is a sequence feature itself)
        if re.match(r'OMIM:', gene_id):
            alt_locus = '_:'+re.sub(r':', '', gene_id)+'-'+disorder_num+'VL'
            alt_label = gene_symbol.strip()
            if alt_label is not None and alt_label != '':
                alt_label = \
                    ' '.join(('some variant of', alt_label,
                              'that', rel_label, disorder_label))
            else:
                alt_label = None

            model.addIndividualToGraph(
                alt_locus, alt_label, Genotype.genoparts['variant_locus'])
            geno.addAffectedLocus(alt_locus, gene_id)
            model.addBlankNodeAnnotation(alt_locus)

        else:
            # assume it's already been added
            alt_locus = gene_id

        assoc = G2PAssoc(g, self.name, alt_locus, disorder_id, rel_id)
        assoc.add_evidence(evidence)
        assoc.add_association_to_graph()

        return
Ejemplo n.º 16
0
    def _process_all(self, limit):
        """
        This takes the list of omim identifiers from the omimTitles file,
        excludes those designated as obsolete and iteratively queries the omim api
        in batches of 20 for the json-formatted data.

        This will create OMIM classes, with the label & definition.
        If an entry is "removed",
            it is added as a deprecated class.
        If an entry is "moved",
            it is deprecated and consider annotations are added.

        Additionally, we extract:
        *phenotypicSeries ids as superclasses
        *equivalent ids for Orphanet and UMLS

        If set to testMode,
            it will write only those items in the test_ids to the testgraph.

        :param limit:
        """
        omimids = list(self.omim_type.keys() - self.omim_replaced.keys())

        LOG.info('Have %i omim numbers to fetch records from their API',
                 len(omimids))
        LOG.info('Have %i omim types ', len(self.omim_type))

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        geno = Genotype(graph)
        model = Model(graph)
        tax_label = 'H**o sapiens'
        tax_id = self.globaltt[tax_label]

        # add genome and taxon
        geno.addGenome(tax_id, tax_label)
        model.addClassToGraph(tax_id, tax_label)

        includes = set()
        includes.add('all')

        self.process_entries(omimids, self._transform_entry, includes, graph,
                             limit)

        # since we are not fetching obsolete records any more add them all in here
        for omim_id in self.omim_replaced:
            model.addDeprecatedClass(
                'OMIM:' + omim_id,
                ['OMIM:' + o for o in self.omim_replaced[omim_id]])
Ejemplo n.º 17
0
Archivo: OMIA.py Proyecto: sgml/dipper
 def _process_gene_row(self, row):
     model = Model(self.graph)
     geno = Genotype(self.graph)
     if self.test_mode and row['gene_id'] not in self.test_ids['gene']:
         return
     gene_id = 'NCBIGene:' + str(row['gene_id'])
     self.id_hash['gene'][row['gene_id']] = gene_id
     gene_label = row['symbol']
     self.label_hash[gene_id] = gene_label
     tax_id = 'NCBITaxon:' + str(row['gb_species_id'])
     if row['gene_type'] is not None:
         gene_type_id = self.resolve(row['gene_type'])
         model.addClassToGraph(gene_id, gene_label, gene_type_id)
     geno.addTaxon(tax_id, gene_id)
Ejemplo n.º 18
0
    def _process_phene_gene_row(self, row):
        geno = Genotype(self.g)
        model = Model(self.g)
        gene_id = self.id_hash['gene'].get(row['gene_id'])
        phene_id = self.id_hash['phene'].get(row['phene_id'])

        omia_id = self._get_omia_id_from_phene_id(phene_id)

        if self.testMode and not (
                omia_id in self.test_ids['disease'] and
                row['gene_id'] in self.test_ids['gene']) or\
                gene_id is None or phene_id is None:
            return

        # occasionally some phenes are missing!  (ex: 406)
        if phene_id is None:
            logger.warning("Phene id %s is missing", str(row['phene_id']))
            return

        gene_label = self.label_hash[gene_id]
        # some variant of gene_id has phenotype d
        vl = '_:'+re.sub(r'NCBIGene:', '', str(gene_id)) + 'VL'
        geno.addAllele(vl, 'some variant of ' + gene_label)
        geno.addAlleleOfGene(vl, gene_id)
        geno.addAffectedLocus(vl, gene_id)
        model.addBlankNodeAnnotation(vl)
        assoc = G2PAssoc(self.g, self.name, vl, phene_id)
        assoc.add_association_to_graph()

        # add the gene id to the set of annotated genes
        # for later lookup by orthology
        self.annotated_genes.add(gene_id)

        return
Ejemplo n.º 19
0
    def _process_phene_gene_row(self, row):
        geno = Genotype(self.graph)
        model = Model(self.graph)
        gene_id = self.id_hash['gene'].get(row['gene_id'])
        phene_id = self.id_hash['phene'].get(row['phene_id'])

        omia_id = self._get_omia_id_from_phene_id(phene_id)

        if self.test_mode and not (omia_id in self.test_ids['disease']
                                   and row['gene_id'] in self.test_ids['gene']
                                   ) or gene_id is None or phene_id is None:
            return

        # occasionally some phenes are missing!  (ex: 406)
        if phene_id is None:
            LOG.warning("Phene id %s is missing", str(row['phene_id']))
            return

        gene_label = self.label_hash[gene_id]
        # some variant of gene_id has phenotype d
        var = self.make_id(gene_id.split(':')[-1] + 'VL', '_')
        geno.addAllele(var, 'some variant of ' + gene_label)
        geno.addAlleleOfGene(var, gene_id)
        geno.addAffectedLocus(var, gene_id)
        model.addBlankNodeAnnotation(var)
        assoc = G2PAssoc(self.graph, self.name, var, phene_id)
        assoc.add_association_to_graph()

        # add the gene id to the set of annotated genes
        # for later lookup by orthology
        self.annotated_genes.add(gene_id)
Ejemplo n.º 20
0
    def process_gene_ids(self, limit):
        raw = '/'.join((self.rawdir, self.files['gene_ids']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        model = Model(g)
        logger.info("Processing: %s", self.files['gene_ids']['file'])
        line_counter = 0
        geno = Genotype(g)
        with gzip.open(raw, 'rb') as csvfile:
            filereader = csv.reader(
                io.TextIOWrapper(csvfile, newline=""), delimiter=',',
                quotechar='\"')
            for row in filereader:
                line_counter += 1
                (taxon_num,
                 gene_num,
                 gene_symbol,
                 gene_synonym,
                 live,
                 gene_type) = row
                # 6239,WBGene00000001,aap-1,Y110A7A.10,Live,protein_coding_gene

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                taxon_id = 'NCBITaxon:'+taxon_num
                gene_id = 'WormBase:'+gene_num
                if gene_symbol == '':
                    gene_symbol = gene_synonym
                if gene_symbol == '':
                    gene_symbol = None
                model.addClassToGraph(
                    gene_id, gene_symbol, Genotype.genoparts['gene'])
                if live == 'Dead':
                    model.addDeprecatedClass(gene_id)
                geno.addTaxon(taxon_id, gene_id)
                if gene_synonym != '' and gene_synonym is not None:
                    model.addSynonym(gene_id, gene_synonym)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return
Ejemplo n.º 21
0
    def parse(self, limit=None):
        """
        Override Source.parse()
        Parses version and interaction information from CTD
        Args:
        :param limit (int, optional) limit the number of rows processed
        Returns:
        :return None
        """
        if limit is not None:
            LOG.info("Only parsing first %d rows", limit)

        LOG.info("Parsing files...")

        if self.test_only:
            self.test_mode = True

        self.geno = Genotype(self.graph)
        self.pathway = Pathway(self.graph)

        src_key = 'chemical_disease_associations'
        self._parse_ctd_file(limit, src_key)

        # self._parse_ctd_file(limit, 'gene_pathway')
        # self._parse_ctd_file(limit, 'gene_disease')

        src_key = 'publications'
        file_path = '/'.join((self.rawdir, self.api_fetch[src_key]['file']))
        if os.path.exists(file_path) is True:
            self._parse_curated_chem_disease(file_path, limit)
        else:
            LOG.error('Batch Query file "%s" does not exist', file_path)
        LOG.info("Done parsing files.")
Ejemplo n.º 22
0
    def __init__(self):
        Source.__init__(self, 'ctd')
        self.dataset = Dataset(
            'ctd', 'CTD', 'http://ctdbase.org', None,
            'http://ctdbase.org/about/legal.jsp')

        if 'test_ids' not in config.get_config() \
                or 'gene' not in config.get_config()['test_ids']:
            logger.warning("not configured with gene test ids.")
            self.test_geneids = []
        else:
            self.test_geneids = config.get_config()['test_ids']['gene']

        if 'test_ids' not in config.get_config() \
                or 'disease' not in config.get_config()['test_ids']:
            logger.warning("not configured with disease test ids.")
            self.test_diseaseids = []
        else:
            self.test_diseaseids = config.get_config()['test_ids']['disease']

        self.gu = GraphUtils(curie_map.get())
        self.g = self.graph
        self.geno = Genotype(self.g)

        return
Ejemplo n.º 23
0
    def __init__(self, graph_type, are_bnodes_skolemized):
        super().__init__(
            graph_type,
            are_bnodes_skolemized,
            'decipher',
            ingest_title='Development Disorder Genotype Phenotype Database',
            ingest_url='https://decipher.sanger.ac.uk/',
            license_url='https://decipher.sanger.ac.uk/legal',
            data_rights='https://decipher.sanger.ac.uk/datasharing',
            # file_handle=None
        )

        if 'disease' not in self.all_test_ids:
            LOG.warning("not configured with disease test ids.")
            self.test_ids = []
        else:
            self.test_ids = self.all_test_ids['disease']

        self.graph = self.graph
        self.geno = Genotype(self.graph)
        self.model = Model(self.graph)
        self.graph_type = graph_type
        self.are_bnodes_skolemized = are_bnodes_skolemized

        return
Ejemplo n.º 24
0
    def __init__(self):
        Source.__init__(self, 'mpd')
        # @N, not sure if this step is required
        self.namespaces.update(curie_map.get())
        self.stdevthreshold = 2

        self.nobnodes = True  # FIXME

        # update the dataset object with details about this resource
        # @N: Note that there is no license as far as I can tell
        self.dataset = Dataset(
            'mpd', 'MPD', 'http://phenome.jax.org', None, None)

        # TODO add a citation for mpd dataset as a whole
        self.dataset.set_citation('PMID:15619963')

        self.assayhash = {}
        self.idlabel_hash = {}
        # to store the mean/zscore of each measure by strain+sex
        self.score_means_by_measure = {}
        # to store the mean value for each measure by strain+sex
        self.strain_scores_by_measure = {}

        self.geno = Genotype(self.graph)
        self.gu = GraphUtils(curie_map.get())

        return
Ejemplo n.º 25
0
Archivo: CTD.py Proyecto: sgml/dipper
    def __init__(self, graph_type, are_bnodes_skolemized):
        super().__init__(
            graph_type,
            are_bnodes_skolemized,
            'ctd',
            ingest_title='Comparative Toxicogenomics Database',
            ingest_url='http://ctdbase.org',
            license_url=None,
            data_rights='http://ctdbase.org/about/legal.jsp'
            # file_handle=None
        )

        if 'gene' not in self.all_test_ids:
            LOG.warning("not configured with gene test ids.")
            self.test_geneids = []
        else:
            self.test_geneids = self.all_test_ids['gene']

        if 'disease' not in self.all_test_ids:
            LOG.warning("not configured with disease test ids.")
            self.test_diseaseids = []
        else:
            self.test_diseaseids = self.all_test_ids['disease']

        self.geno = Genotype(self.graph)
        self.pathway = Pathway(self.graph)

        return
Ejemplo n.º 26
0
    def process_gene_ids(self, limit):
        src_key = 'gene_ids'
        raw = '/'.join((self.rawdir, self.files[src_key]['file']))
        graph = self.graph
        model = Model(graph)
        geno = Genotype(graph)
        col = self.files[src_key]['columns']
        LOG.info("Processing: %s", self.files[src_key]['file'])

        with gzip.open(raw, 'rb') as csvfile:
            reader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                delimiter=',',
                                quotechar='\"')
            # no header row to check
            collen = len(col)
            for row in reader:
                if len(row) != collen:
                    LOG.error('In %s line %i expected %i colums but got %s.',
                              self.files[src_key]['file'], reader.line_num,
                              collen, row)
                    pass
                taxon_num = row[col.index('taxon_num')]
                gene_num = row[col.index('gene_num')]
                gene_symbol = row[col.index('gene_symbol')]
                gene_synonym = row[col.index('gene_synonym')]
                live = row[col.index('live')]
                # gene_type = row[col.index('gene_type')]
                # 6239,WBGene00000001,aap-1,Y110A7A.10,Live,protein_coding_gene

                taxon_curie = 'NCBITaxon:' + taxon_num
                gene_curie = 'WormBase:' + gene_num

                if gene_symbol == '':
                    gene_symbol = gene_synonym  # these are not the same in my book tec.
                if gene_symbol == '':
                    gene_symbol = None
                model.addClassToGraph(gene_curie, gene_symbol,
                                      self.globaltt['gene'])
                if live == 'Dead':
                    model.addDeprecatedClass(gene_curie,
                                             old_id_category=blv.terms['Gene'])
                geno.addTaxon(taxon_curie, gene_curie)
                if gene_synonym is not None and gene_synonym != '':
                    model.addSynonym(gene_curie, gene_synonym)

                if limit is not None and reader.line_num > limit:
                    break
Ejemplo n.º 27
0
 def _add_variant_gene_relationship(self, variant_id, hgnc_symbol):
     """
     :param variant_id
     :param hgnc_symbol
     :return: None
     """
     gu = GraphUtils(curie_map.get())
     geno = Genotype(self.graph)
     if hgnc_symbol in self.gene_map:
         gene_id = self.gene_map[hgnc_symbol]
     else:
         gene_id = self.make_cgd_id("{0}{1}".format(variant_id, hgnc_symbol))
         logger.warn("Can't map gene symbol {0} "
                     "to entrez ID".format(hgnc_symbol))
     gu.addClassToGraph(self.graph, gene_id, hgnc_symbol)
     geno.addAlleleOfGene(variant_id, gene_id)
     return
Ejemplo n.º 28
0
    def _process_all(self, limit):
        """
        This takes the list of omim identifiers from the omim.txt.Z file,
        and iteratively queries the omim api for the json-formatted data.
        This will create OMIM classes, with the label,
        definition, and some synonyms.
        If an entry is "removed",
            it is added as a deprecated class.
        If an entry is "moved",
            it is deprecated and consider annotations are added.

        Additionally, we extract:
        *phenotypicSeries ids as superclasses
        *equivalent ids for Orphanet and UMLS

        If set to testMode,
            it will write only those items in the test_ids to the testgraph.

        :param limit:
        :return:
        """
        omimids = self._get_omim_ids()  # store the set of omim identifiers

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        geno = Genotype(g)
        model = Model(g)
        # tax_num = '9606'   # TODO PYLINT unused
        tax_id = 'NCBITaxon:9606'
        tax_label = 'Human'

        # add genome and taxon
        geno.addGenome(tax_id, tax_label)   # tax label can get added elsewhere
        model.addClassToGraph(tax_id, None)   # label added elsewhere

        includes = set()
        includes.add('all')

        self.process_entries(
            omimids, self._transform_entry, includes, g, limit)

        return
Ejemplo n.º 29
0
    def process_gene_ids(self, limit):
        raw = '/'.join((self.rawdir, self.files['gene_ids']['file']))

        if self.testMode:
            graph = self.testgraph
        else:
            graph = self.graph

        model = Model(graph)
        logger.info("Processing: %s", self.files['gene_ids']['file'])
        line_counter = 0
        geno = Genotype(graph)
        with gzip.open(raw, 'rb') as csvfile:
            filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                    delimiter=',',
                                    quotechar='\"')
            for row in filereader:
                line_counter += 1
                (taxon_num, gene_num, gene_symbol, gene_synonym, live,
                 gene_type) = row
                # 6239,WBGene00000001,aap-1,Y110A7A.10,Live,protein_coding_gene

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                taxon_id = 'NCBITaxon:' + taxon_num
                gene_id = 'WormBase:' + gene_num
                if gene_symbol == '':
                    gene_symbol = gene_synonym
                if gene_symbol == '':
                    gene_symbol = None
                model.addClassToGraph(gene_id, gene_symbol,
                                      self.globaltt['gene'])
                if live == 'Dead':
                    model.addDeprecatedClass(gene_id)
                geno.addTaxon(taxon_id, gene_id)
                if gene_synonym != '' and gene_synonym is not None:
                    model.addSynonym(gene_id, gene_synonym)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return
Ejemplo n.º 30
0
    def _create_genome_builds(self):
        """
        Various resources will map variations to either UCSC (hg*)
        or to NCBI assemblies. Here we create the equivalences between them.
        Data taken from:
        https://genome.ucsc.edu/FAQ/FAQreleases.html#release1

        :return:

        """

        # TODO add more species

        graph = self.graph
        geno = Genotype(graph)
        model = Model(graph)
        LOG.info("Adding equivalent assembly identifiers")
        for sp in self.species:
            tax_id = self.globaltt[sp]
            txid_num = tax_id.split(':')[1]
            for key in self.files[txid_num]['assembly']:
                ucsc_id = key
                try:
                    ucsc_label = ucsc_id.split(':')[1]
                except IndexError:
                    LOG.error('%s Assembly id:  "%s" is problematic', sp, key)
                    continue
                if key in self.localtt:
                    mapped_id = self.localtt[key]
                else:
                    LOG.error(
                        '%s Assembly id:  "%s" is not in local translation table',
                        sp, key)

                mapped_label = mapped_id.split(':')[1]

                mapped_label = 'NCBI build ' + str(mapped_label)
                geno.addReferenceGenome(ucsc_id, ucsc_label, tax_id)
                geno.addReferenceGenome(mapped_id, mapped_label, tax_id)
                model.addSameIndividual(ucsc_id, mapped_id)

        return
Ejemplo n.º 31
0
class GenotypeTestCase(unittest.TestCase):
    def setUp(self):
        self.graph = RDFGraph()
        self.curie_map = curie_map.get()
        self.genotype = Genotype(self.graph)

    def tearDown(self):
        self.genotype = None

    def test_addGenotype(self):
        from rdflib.namespace import RDFS, URIRef
        from rdflib import Literal
        from dipper.utils.CurieUtil import CurieUtil
        cutil = CurieUtil(self.curie_map)
        gid = 'MGI:5515892'
        label = \
            'Pmp22<Tr-2J>/Pmp22<+> [C57BL/6J-Pmp22<Tr-2J>/GrsrJ]'
        self.genotype.addGenotype(gid, label)
        self.assertTrue((URIRef(cutil.get_uri(gid)), RDFS['label'],
                         Literal(label)) in self.genotype.graph)
Ejemplo n.º 32
0
    def __init__(self, graph_type, are_bnodes_skolemized):
        super().__init__(graph_type, are_bnodes_skolemized, 'decipher')

        self.dataset = Dataset(
            'decipher', 'Development Disorder Genotype – Phenotype Database',
            'https://decipher.sanger.ac.uk/', None,
            'https://decipher.sanger.ac.uk/legal')

        if 'test_ids' not in config.get_config() \
                or 'disease' not in config.get_config()['test_ids']:
            logger.warning("not configured with disease test ids.")
            self.test_ids = []
        else:
            self.test_ids = config.get_config()['test_ids']['disease']

        self.g = self.graph
        self.geno = Genotype(self.g)
        self.model = Model(self.g)

        return
Ejemplo n.º 33
0
    def parse(self, limit=None):
        if limit is not None:
            LOG.info("Only parsing first %s rows", limit)

        LOG.info("Parsing files...")

        if self.test_only:
            self.test_mode = True
            self.graph = self.testgraph
        else:
            self.graph = self.graph

        self.geno = Genotype(self.graph)

        # rare disease-phenotype associations
        self._process_ddg2p_annotations(limit)

        LOG.info("Finished parsing.")

        return
Ejemplo n.º 34
0
class GenotypeTestCase(unittest.TestCase):

    def setUp(self):
        self.graph = Graph()
        self.curie_map = curie_map.get()
        self.genotype = Genotype(self.graph)

    def tearDown(self):
        self.genotype = None

    def test_addGenotype(self):
        from rdflib.namespace import RDFS,URIRef
        from rdflib import Literal
        from dipper.utils.CurieUtil import CurieUtil
        cu = CurieUtil(self.curie_map)
        id = 'MGI:5515892'
        label = \
            'Pmp22<Tr-2J>/Pmp22<+> [C57BL/6J-Pmp22<Tr-2J>/GrsrJ]'
        self.genotype.addGenotype(id, label)
        self.assertTrue((URIRef(cu.get_uri(id)), RDFS['label'],
                         Literal(label)) in self.genotype.graph)
Ejemplo n.º 35
0
    def parse(self, limit=None):
        """
        Override Source.parse()
        Parses version and interaction information from CTD
        Args:
        :param limit (int, optional) limit the number of rows processed
        Returns:
        :return None
        """
        if limit is not None:
            logger.info("Only parsing first %d rows", limit)

        logger.info("Parsing files...")
        # pub_map = dict()
        # file_path = '/'.join((self.rawdir,
        # self.static_files['publications']['file']))
        # if os.path.exists(file_path) is True:
        #     pub_map = self._parse_publication_file(
        #         self.static_files['publications']['file']
        #     )

        if self.testOnly:
            self.testMode = True

        if self.testMode:
            self.g = self.testgraph
        else:
            self.g = self.graph
        self.geno = Genotype(self.g)
        self.pathway = Pathway(self.g)

        self._parse_ctd_file(
            limit, self.files['chemical_disease_interactions']['file'])
        self._parse_ctd_file(limit, self.files['gene_pathway']['file'])
        self._parse_ctd_file(limit, self.files['gene_disease']['file'])
        self._parse_curated_chem_disease(limit)

        logger.info("Done parsing files.")

        return
Ejemplo n.º 36
0
    def parse(self, limit=None):
        """
        Override Source.parse()
        Parses version and interaction information from CTD
        Args:
            :param limit (int, optional) limit the number of rows processed
        Returns:
            :return None
        """
        if limit is not None:
            logger.info("Only parsing first %d rows", limit)

        logger.info("Parsing files...")
        # pub_map = dict()
        # file_path = '/'.join((self.rawdir,
        # self.static_files['publications']['file']))
        # if os.path.exists(file_path) is True:
        #     pub_map = self._parse_publication_file(
        #         self.static_files['publications']['file']
        #     )

        if self.testOnly:
            self.testMode = True

        if self.testMode:
            self.g = self.testgraph
        else:
            self.g = self.graph
        self.geno = Genotype(self.g)
        self.path = Pathway(self.g, self.nobnodes)

        self._parse_ctd_file(
            limit, self.files['chemical_disease_interactions']['file'])
        self._parse_ctd_file(limit, self.files['gene_pathway']['file'])
        self._parse_ctd_file(limit, self.files['gene_disease']['file'])
        self._parse_curated_chem_disease(limit)
        self.gu.loadAllProperties(self.g)

        self.gu.loadProperties(
            self.g, G2PAssoc.object_properties, self.gu.OBJPROP)
        self.gu.loadProperties(
            self.g, G2PAssoc.datatype_properties, self.gu.DATAPROP)
        self.gu.loadProperties(
            self.g, G2PAssoc.annotation_properties, self.gu.ANNOTPROP)
        self.gu.loadProperties(
            self.g, Pathway.object_properties, self.gu.OBJPROP)

        self.load_bindings()
        logger.info("Done parsing files.")

        return
Ejemplo n.º 37
0
    def parse(self, limit=None):
        """
        MPD data is delivered in four separate csv files and one xml file,
        which we process iteratively and write out as
        one large graph.

        :param limit:
        :return:
        """
        if limit is not None:
            logger.info("Only parsing first %s rows fo each file", str(limit))

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True
            g = self.testgraph
            self.geno = Genotype(self.testgraph)
        else:
            g = self.graph

        self._process_straininfo(limit)
        # the following will provide us the hash-lookups
        # These must be processed in a specific order

        # mapping between assays and ontology terms
        self._process_ontology_mappings_file(limit)
        # this is the metadata about the measurements
        self._process_measurements_file(limit)
        # get all the measurements per strain
        self._process_strainmeans_file(limit)

        # The following will use the hash populated above
        # to lookup the ids when filling in the graph
        self._fill_provenance_graph(limit)

        logger.info("Finished parsing.")

        self.load_bindings()

        gu = GraphUtils(curie_map.get())
        gu.loadAllProperties(g)
        gu.loadProperties(g, G2PAssoc.object_properties, GraphUtils.OBJPROP)
        gu.loadProperties(g, G2PAssoc.datatype_properties, GraphUtils.OBJPROP)
        gu.loadProperties(
            g, G2PAssoc.annotation_properties, GraphUtils.ANNOTPROP)

        logger.info("Found %d nodes", len(self.graph))
        return
Ejemplo n.º 38
0
    def parse(self, limit=None):
        """

        :param limit:
        :return:
        """
        if limit is not None:
            logger.info("Only parsing first %s rows fo each file", str(limit))

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True
            g = self.testgraph
        else:
            g = self.graph

        tmap = '/'.join((self.rawdir, self.files['trait_mappings']['file']))
        self._process_trait_mappings(tmap, limit)

        geno = Genotype(g)
        # organisms  = ['chicken']
        organisms = [
            'chicken', 'pig', 'horse', 'rainbow_trout', 'sheep', 'cattle']

        for o in organisms:
            tax_id = self._get_tax_by_common_name(o)
            geno.addGenome(tax_id, o)
            build_id = None
            build = None

            k = o + '_bp'
            if k in self.files:
                file = self.files[k]['file']
                m = re.search(r'QTL_([\w\.]+)\.gff.txt.gz', file)
                if m is None:
                    logger.error("Can't match a gff build")
                else:
                    build = m.group(1)
                    build_id = self._map_build_by_abbrev(build)
                    logger.info("Build = %s", build_id)
                    geno.addReferenceGenome(build_id, build, tax_id)
                if build_id is not None:
                    self._process_QTLs_genomic_location(
                        '/'.join((self.rawdir, file)), tax_id, build_id, build,
                        limit)

            k = o+'_cm'
            if k in self.files:
                file = self.files[k]['file']
                self._process_QTLs_genetic_location(
                    '/'.join((self.rawdir, file)), tax_id, o, limit)

        logger.info("Finished parsing")
        return
Ejemplo n.º 39
0
    def parse(self, limit=None):
        # names of tables to iterate - probably don't need all these:
        # Article_Breed, Article_Keyword, Article_Gene, Article_Keyword,
        # Article_People, Article_Phene, Articles, Breed, Breed_Phene,
        # Genes_gb, Group_Categories, Group_MPO, Inherit_Type, Keywords,
        # Landmark, Lida_Links, OMIA_Group, OMIA_author, Omim_Xref, People,
        # Phene, Phene_Gene, Publishers, Resources, Species_gb, Synonyms

        self.scrub()

        if limit is not None:
            logger.info("Only parsing first %d rows", limit)

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True

        if self.testMode:
            self.g = self.testgraph
        else:
            self.g = self.graph
        self.geno = Genotype(self.g)

        # we do three passes through the file
        # first process species (two others reference this one)
        self.process_species(limit)

        # then, process the breeds, genes, articles, and other static stuff
        self.process_classes(limit)

        # next process the association data
        self.process_associations(limit)

        # process the vertebrate orthology for genes
        # that are annotated with phenotypes
        ncbi = NCBIGene()
        ncbi.add_orthologs_by_gene_group(self.g, self.annotated_genes)

        self.load_core_bindings()
        self.load_bindings()

        logger.info("Done parsing.")

        self.write_molgen_report()

        return
Ejemplo n.º 40
0
    def __init__(self):
        Source.__init__(self, 'omia')

        self.load_bindings()

        self.dataset = Dataset(
            'omia', 'Online Mendelian Inheritance in Animals',
            'http://omia.angis.org.au', None, None,
            'http://sydney.edu.au/disclaimer.shtml')

        self.id_hash = {
            'article': {},
            'phene': {},
            'breed': {},
            'taxon': {},
            'gene': {}
        }
        self.label_hash = {}
        self.gu = GraphUtils(curie_map.get())
        # used to store the omia to omim phene mappings
        self.omia_omim_map = {}
        # used to store the unique genes that have phenes
        # (for fetching orthology)
        self.annotated_genes = set()

        self.test_ids = {
            'disease': [
                'OMIA:001702', 'OMIA:001867', 'OMIA:000478', 'OMIA:000201',
                'OMIA:000810', 'OMIA:001400'],
            'gene': [
                492297, 434, 492296, 3430235, 200685834, 394659996, 200685845,
                28713538, 291822383],
            'taxon': [9691, 9685, 9606, 9615, 9913, 93934, 37029, 9627, 9825],
            # to be filled in during parsing of breed table
            # for lookup by breed-associations
            'breed': []
        }
        # to store a map of omia ids and any molecular info
        # to write a report for curation
        self.stored_omia_mol_gen = {}
        self.g = self.graph
        self.geno = Genotype(self.g)
        return
Ejemplo n.º 41
0
    def parse(self, limit=None):
        if limit is not None:
            logger.info("Only parsing first %s rows", limit)

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True
            self.g = self.testgraph
        else:
            self.g = self.graph

        self.geno = Genotype(self.g)

        # rare disease-phenotype associations
        self._process_ddg2p_annotations(limit)

        logger.info("Finished parsing.")

        return
Ejemplo n.º 42
0
    def __init__(self):
        Source.__init__(self, 'decipher')

        self.load_bindings()

        self.dataset = Dataset(
            'decipher', 'Development Disorder Genotype – Phenotype Database',
            'https://decipher.sanger.ac.uk/', None,
            'https://decipher.sanger.ac.uk/legal')

        if 'test_ids' not in config.get_config() \
                or 'disease' not in config.get_config()['test_ids']:
            logger.warning("not configured with disease test ids.")
            self.test_ids = []
        else:
            self.test_ids = config.get_config()['test_ids']['disease']

        self.gu = GraphUtils(curie_map.get())
        self.g = self.graph
        self.geno = Genotype(self.g)

        return
Ejemplo n.º 43
0
    def parse(self, limit=None):
        """
        Override Source.parse()
        Parses version and interaction information from CTD
        Args:
        :param limit (int, optional) limit the number of rows processed
        Returns:
        :return None
        """
        if limit is not None:
            LOG.info("Only parsing first %d rows", limit)

        LOG.info("Parsing files...")

        if self.test_only:
            self.test_mode = True

        self.geno = Genotype(self.graph)
        self.pathway = Pathway(self.graph)

        src_key = 'chemical_disease_associations'
        self._parse_ctd_file(limit, src_key)
Ejemplo n.º 44
0
 def _add_gene_anatomy_association(self, gene_id, anatomy_curie, rank):
     """
     :param gene_id: str Non curified ID
     :param gene_label: str Gene symbol
     :param anatomy_curie: str curified anatomy term
     :param rank: str rank
     :return: None
     """
     g2a_association = Assoc(self.graph, self.name)
     genotype = Genotype(self.graph)
     model = Model(self.graph)
     gene_curie = "ENSEMBL:{}".format(gene_id)
     rank = re.sub(r',', '', rank)
     model.addIndividualToGraph(ind_id=gene_curie, label=None,
                                ind_type=genotype.genoparts['gene'])
     g2a_association.sub = gene_curie
     g2a_association.obj = anatomy_curie
     g2a_association.rel = Assoc.object_properties['expressed_in']
     g2a_association.add_association_to_graph()
     g2a_association.add_predicate_object(
         Assoc.datatype_properties['has_quantifier'],
         float(rank), 'Literal', 'xsd:float')
     return
Ejemplo n.º 45
0
    def parse(self, limit=None):
        """
        Override Source.parse()
        Parses version and interaction information from CTD
        Args:
        :param limit (int, optional) limit the number of rows processed
        Returns:
        :return None
        """
        if limit is not None:
            logger.info("Only parsing first %d rows", limit)

        logger.info("Parsing files...")
        # pub_map = dict()
        # file_path = '/'.join((self.rawdir,
        # self.static_files['publications']['file']))
        # if os.path.exists(file_path) is True:
        #     pub_map = self._parse_publication_file(
        #         self.static_files['publications']['file']
        #     )

        if self.testOnly:
            self.testMode = True

        self.geno = Genotype(self.graph)
        self.pathway = Pathway(self.graph)

        self._parse_ctd_file(
            limit, self.files['chemical_disease_interactions']['file'])
        self._parse_ctd_file(limit, self.files['gene_pathway']['file'])
        self._parse_ctd_file(limit, self.files['gene_disease']['file'])
        self._parse_curated_chem_disease(limit)

        logger.info("Done parsing files.")

        return
Ejemplo n.º 46
0
    def _create_genome_builds(self):
        """
        Various resources will map variations to either UCSC (hg*)
        or to NCBI assemblies. Here we create the equivalences between them.
        Data taken from:
        https://genome.ucsc.edu/FAQ/FAQreleases.html#release1

        :return:

        """

        # TODO add more species

        graph = self.graph
        geno = Genotype(graph)
        model = Model(graph)
        logger.info("Adding equivalent assembly identifiers")
        for sp in self.species:
            tax_id = self.resolve(sp)
            txid_num = tax_id.split(':')[1]
            for key in self.files[txid_num]['assembly']:
                ucsc_id = key
                try:
                    ucsc_label = ucsc_id.split(':')[1]
                except IndexError:
                    logger.error('%s Assembly id:  "%s" is problematic', sp,
                                 key)
                    continue
                if key in self.localtt:
                    mapped_id = self.localtt[key]
                else:
                    logger.error(
                        '%s Assembly id:  "%s" is not in local translation table',
                        sp, key)

                mapped_label = mapped_id.split(':')[1]

                mapped_label = 'NCBI build ' + str(mapped_label)
                geno.addReferenceGenome(ucsc_id, ucsc_label, tax_id)
                geno.addReferenceGenome(mapped_id, mapped_label, tax_id)
                model.addSameIndividual(ucsc_id, mapped_id)

        return
Ejemplo n.º 47
0
    def _parse_genepage2gene(self, limit) -> Dict[str, List[str]]:
        """
        :return:
        """
        src_key = 'genepage2gene'
        columns = self.files[src_key]['columns']
        raw = '/'.join((self.rawdir, self.files[src_key]['file']))

        geno = Genotype(self.graph)
        genepage2gene = {}

        LOG.info("Processing GenePage to Gene file")

        with open(raw, 'r', encoding="utf8") as csvfile:
            reader = csv.reader(csvfile, delimiter='\t')

            for row in reader:

                gene_page = row[columns.index('gene_page_id')]
                # gene_page_label = row[columns.index('gene_page_label')]
                tropicalis_id = row[columns.index('tropicalis_id')]
                tropicalis_label = row[columns.index('tropicalis_label')]
                laevis_l_id = row[columns.index('laevis_l_id')]
                laevis_l_label = row[columns.index('laevis_l_label')]
                laevis_s_id = row[columns.index('laevis_s_id')]
                laevis_s_label = row[columns.index('laevis_s_label')]

                tropicalis_curie = 'Xenbase:' + tropicalis_id
                laevis_l_curie = 'Xenbase:' + laevis_l_id
                laevis_s_curie = 'Xenbase:' + laevis_s_id

                genepage2gene[gene_page] = [tropicalis_curie, laevis_l_curie, laevis_s_curie]

                geno.addGene(tropicalis_curie, tropicalis_label)
                geno.addGene(laevis_l_curie, laevis_l_label)
                geno.addGene(laevis_s_curie, laevis_s_label)

                if not self.test_mode and limit is not None and reader.line_num > limit:
                    break

        return genepage2gene
Ejemplo n.º 48
0
    def _add_g2p_assoc(self, graph, strain_id, sex, assay_id, phenotypes,
                       comment):
        """
        Create an association between a sex-specific strain id
        and each of the phenotypes.
        Here, we create a genotype from the strain,
        and a sex-specific genotype.
        Each of those genotypes are created as anonymous nodes.

        The evidence code is hardcoded to be:
            ECO:experimental_phenotypic_evidence.

        :param g:
        :param strain_id:
        :param sex:
        :param assay_id:
        :param phenotypes: a list of phenotypes to association with the strain
        :param comment:
        :return:

        """
        geno = Genotype(graph)
        model = Model(graph)
        eco_id = self.globaltt['experimental phenotypic evidence']
        strain_label = self.idlabel_hash.get(strain_id)
        # strain genotype
        genotype_id = '_' + '-'.join((re.sub(r':', '', strain_id), 'genotype'))
        genotype_label = '[' + strain_label + ']'

        sex_specific_genotype_id = '_' + '-'.join(
            (re.sub(r':', '', strain_id), sex, 'genotype'))
        if strain_label is not None:
            sex_specific_genotype_label = strain_label + ' (' + sex + ')'
        else:
            sex_specific_genotype_label = strain_id + '(' + sex + ')'

        genotype_type = self.globaltt['sex_qualified_genotype']
        if sex == 'm':
            genotype_type = self.globaltt['male_genotype']
        elif sex == 'f':
            genotype_type = self.globaltt['female_genotype']

        # add the genotype to strain connection
        geno.addGenotype(genotype_id, genotype_label,
                         self.globaltt['genomic_background'])
        graph.addTriple(strain_id, self.globaltt['has_genotype'], genotype_id)

        geno.addGenotype(sex_specific_genotype_id, sex_specific_genotype_label,
                         genotype_type)

        # add the strain as the background for the genotype
        graph.addTriple(sex_specific_genotype_id,
                        self.globaltt['has_sex_agnostic_part'], genotype_id)

        # #############    BUILD THE G2P ASSOC    #############
        # TODO add more provenance info when that model is completed

        if phenotypes is not None:
            for phenotype_id in phenotypes:
                assoc = G2PAssoc(graph, self.name, sex_specific_genotype_id,
                                 phenotype_id)
                assoc.add_evidence(assay_id)
                assoc.add_evidence(eco_id)
                assoc.add_association_to_graph()
                assoc_id = assoc.get_association_id()
                model.addComment(assoc_id, comment)
                model._addSexSpecificity(assoc_id, self.resolve(sex))

        return
Ejemplo n.º 49
0
 def setUp(self):
     self.graph = RDFGraph()
     self.curie_map = curie_map.get()
     self.genotype = Genotype(self.graph)
Ejemplo n.º 50
0
    def process_gaf(self, file, limit, id_map=None, eco_map=None):

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph

        model = Model(graph)
        geno = Genotype(graph)
        LOG.info("Processing Gene Associations from %s", file)
        line_counter = 0
        uniprot_hit = 0
        uniprot_miss = 0
        if '7955' in self.tax_ids:
            zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
        if '6239' in self.tax_ids:
            wbase = WormBase(self.graph_type, self.are_bnodes_skized)

        with gzip.open(file, 'rb') as csvfile:
            filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                    delimiter='\t',
                                    quotechar='\"')
            for row in filereader:
                line_counter += 1
                # comments start with exclamation
                if re.match(r'!', ''.join(row)):
                    continue

                if len(row) > 17 or len(row) < 15:
                    LOG.warning(
                        "Wrong number of columns %i, expected 15 or 17\n%s",
                        len(row), row)
                    continue

                if 17 > len(row) >= 15:
                    row += [""] * (17 - len(row))

                (dbase, gene_num, gene_symbol, qualifier, go_id, ref,
                 eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
                 object_type, taxon, date, assigned_by, annotation_extension,
                 gene_product_form_id) = row

                # test for required fields
                if (dbase == '' or gene_num == '' or gene_symbol == ''
                        or go_id == '' or ref == '' or eco_symbol == ''
                        or aspect == '' or object_type == '' or taxon == ''
                        or date == '' or assigned_by == ''):
                    LOG.error(
                        "Missing required part of annotation on row %d:\n" +
                        '\t'.join(row), line_counter)
                    continue

                # deal with qualifier NOT, contributes_to, colocalizes_with
                if re.search(r'NOT', qualifier):
                    continue

                if dbase in self.localtt:
                    dbase = self.localtt[dbase]
                uniprotid = None
                gene_id = None
                if dbase == 'UniProtKB':
                    if id_map is not None and gene_num in id_map:
                        gene_id = id_map[gene_num]
                        uniprotid = ':'.join((dbase, gene_num))
                        (dbase, gene_num) = gene_id.split(':')
                        uniprot_hit += 1
                    else:
                        # LOG.warning(
                        #   "UniProt id %s  is without a 1:1 mapping to entrez/ensembl",
                        #    gene_num)
                        uniprot_miss += 1
                        continue
                else:
                    gene_num = gene_num.split(':')[-1]  # last
                    gene_id = ':'.join((dbase, gene_num))

                if self.test_mode and not (re.match(r'NCBIGene', gene_id)
                                           and int(gene_num) in self.test_ids):
                    continue

                model.addClassToGraph(gene_id, gene_symbol)
                if gene_name != '':
                    model.addDescription(gene_id, gene_name)
                if gene_synonym != '':
                    for syn in re.split(r'\|', gene_synonym):
                        model.addSynonym(gene_id, syn.strip())
                if re.search(r'\|', taxon):
                    # TODO add annotations with >1 taxon
                    LOG.info(">1 taxon (%s) on line %d.  skipping", taxon,
                             line_counter)
                else:
                    tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                    geno.addTaxon(tax_id, gene_id)

                assoc = Assoc(graph, self.name)
                assoc.set_subject(gene_id)
                assoc.set_object(go_id)

                try:
                    eco_id = eco_map[eco_symbol]
                    assoc.add_evidence(eco_id)
                except KeyError:
                    LOG.error("Evidence code (%s) not mapped", eco_symbol)

                refs = re.split(r'\|', ref)
                for ref in refs:
                    ref = ref.strip()
                    if ref != '':
                        prefix = ref.split(':')[0]  # sidestep 'MGI:MGI:'
                        if prefix in self.localtt:
                            prefix = self.localtt[prefix]
                        ref = ':'.join((prefix, ref.split(':')[-1]))
                        refg = Reference(graph, ref)
                        if prefix == 'PMID':
                            ref_type = self.globaltt['journal article']
                            refg.setType(ref_type)
                        refg.addRefToGraph()
                        assoc.add_source(ref)

                # TODO add the source of the annotations from assigned by?

                rel = self.resolve(aspect, mandatory=False)
                if rel is not None and aspect == rel:
                    if aspect == 'F' and re.search(r'contributes_to',
                                                   qualifier):
                        assoc.set_relationship(self.globaltt['contributes to'])
                    else:
                        LOG.error(
                            "Aspect: %s with qualifier: %s  is not recognized",
                            aspect, qualifier)
                elif rel is not None:
                    assoc.set_relationship(rel)
                    assoc.add_association_to_graph()
                else:
                    LOG.warning("No predicate for association \n%s\n",
                                str(assoc))

                if uniprotid is not None:
                    assoc.set_description('Mapped from ' + uniprotid)
                # object_type should be one of:
                # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
                # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
                # If the precise product type is unknown,
                # gene_product should be used
                #######################################################################

                # Derive G2P Associations from IMP annotations
                # in version 2.1 Pipe will indicate 'OR'
                # and Comma will indicate 'AND'.
                # in version 2.0, multiple values are separated by pipes
                # where the pipe has been used to mean 'AND'
                if eco_symbol == 'IMP' and with_or_from != '':
                    withitems = re.split(r'\|', with_or_from)
                    phenotypeid = go_id + 'PHENOTYPE'
                    # create phenotype associations
                    for i in withitems:
                        if i == '' or re.match(
                                r'(UniProtKB|WBPhenotype|InterPro|HGNC)', i):
                            LOG.warning(
                                "Don't know what having a uniprot id " +
                                "in the 'with' column means of %s", uniprotid)
                            continue
                        i = re.sub(r'MGI\:MGI\:', 'MGI:', i)
                        i = re.sub(r'WB:', 'WormBase:', i)

                        # for worms and fish, they might give a RNAi or MORPH
                        # in these cases make a reagent-targeted gene
                        if re.search('MRPHLNO|CRISPR|TALEN', i):
                            targeted_gene_id = zfin.make_targeted_gene_id(
                                gene_id, i)
                            geno.addReagentTargetedGene(
                                i, gene_id, targeted_gene_id)
                            # TODO PYLINT why is this needed?
                            # Redefinition of assoc type from
                            # dipper.models.assoc.Association.Assoc to
                            # dipper.models.assoc.G2PAssoc.G2PAssoc
                            assoc = G2PAssoc(graph, self.name,
                                             targeted_gene_id, phenotypeid)
                        elif re.search(r'WBRNAi', i):
                            targeted_gene_id = wbase.make_reagent_targeted_gene_id(
                                gene_id, i)
                            geno.addReagentTargetedGene(
                                i, gene_id, targeted_gene_id)
                            assoc = G2PAssoc(graph, self.name,
                                             targeted_gene_id, phenotypeid)
                        else:
                            assoc = G2PAssoc(graph, self.name, i, phenotypeid)
                        for ref in refs:
                            ref = ref.strip()
                            if ref != '':
                                prefix = ref.split(':')[0]
                                if prefix in self.localtt:
                                    prefix = self.localtt[prefix]
                                ref = ':'.join((prefix, ref.split(':')[-1]))
                                assoc.add_source(ref)
                                # experimental phenotypic evidence
                                assoc.add_evidence(self.globaltt[
                                    'experimental phenotypic evidence'])
                        assoc.add_association_to_graph()
                        # TODO should the G2PAssoc be
                        # the evidence for the GO assoc?

                if not self.test_mode and limit is not None and line_counter > limit:
                    break
            uniprot_tot = (uniprot_hit + uniprot_miss)
            uniprot_per = 0.0
            if uniprot_tot != 0:
                uniprot_per = 100.0 * uniprot_hit / uniprot_tot
            LOG.info(
                "Uniprot: %.2f%% of %i benefited from the 1/4 day id mapping download",
                uniprot_per, uniprot_tot)
        return
Ejemplo n.º 51
0
    def process_rnai_phenotypes(self, limit=None):

        raw = '/'.join((self.rawdir, self.files['rnai_pheno']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        # gu = GraphUtils(curie_map.get())  # TODO unused

        logger.info("Processing RNAi phenotype associations")
        line_counter = 0
        geno = Genotype(g)
        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (gene_num, gene_alt_symbol, phenotype_label, phenotype_id,
                 rnai_and_refs) = row
# WBGene00001908	F17E9.9	locomotion variant	WBPhenotype:0000643	WBRNAi00025129|WBPaper00006395 WBRNAi00025631|WBPaper00006395
# WBGene00001908	F17E9.9	avoids bacterial lawn	WBPhenotype:0000402	WBRNAi00095640|WBPaper00040984
# WBGene00001908	F17E9.9	RAB-11 recycling endosome localization variant	WBPhenotype:0002107	WBRNAi00090830|WBPaper00041129

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                gene_id = 'WormBase:'+gene_num
                # refs = list()  # TODO unused

                # the rnai_and_refs has this so that
                # WBRNAi00008687|WBPaper00005654 WBRNAi00025197|WBPaper00006395 WBRNAi00045381|WBPaper00025054
                # space delimited between RNAi sets;
                # then each RNAi should have a paper

                rnai_sets = re.split(r' ', rnai_and_refs)

                for s in rnai_sets:

                    # get the rnai_id
                    (rnai_num, ref_num) = re.split(r'\|', s)
                    if len(re.split(r'\|', s)) > 2:
                        logger.warning(
                            "There's an unexpected number of items in %s", s)
                    if rnai_num not in self.rnai_gene_map:
                        self.rnai_gene_map[rnai_num] = set()

                    # to use for looking up later
                    self.rnai_gene_map[rnai_num].add(gene_num)

                    rnai_id = 'WormBase:'+rnai_num
                    geno.addGeneTargetingReagent(
                        rnai_id, None, geno.genoparts['RNAi_reagent'], gene_id)

                    # make the "allele" of the gene
                    # that is targeted by the reagent
                    allele_id = self.make_reagent_targeted_gene_id(
                        gene_num, rnai_num, self.nobnodes)
                    allele_label = gene_alt_symbol+'<'+rnai_num+'>'
                    geno.addReagentTargetedGene(
                        rnai_id, gene_id, allele_id, allele_label)

                    assoc = G2PAssoc(self.name, allele_id, phenotype_id)
                    assoc.add_source('WormBase:'+ref_num)
                    # eco_id = 'ECO:0000019'  # RNAi evidence  # TODO unused
                    assoc.add_association_to_graph(g)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return
Ejemplo n.º 52
0
    def process_feature_loc(self, limit):

        raw = '/'.join((self.rawdir, self.files['feature_loc']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        gu = GraphUtils(curie_map.get())

        logger.info("Processing Feature location and attributes")
        line_counter = 0
        geno = Genotype(g)
        strain_to_variant_map = {}
        build_num = self.version_num
        build_id = 'WormBase:'+build_num
        with gzip.open(raw, 'rb') as csvfile:
            filereader = csv.reader(
                io.TextIOWrapper(csvfile, newline=""), delimiter='\t',
                quotechar='\"')
            for row in filereader:
                if re.match(r'\#', ''.join(row)):
                    continue
                (chrom, db, feature_type_label, start, end, score, strand,
                 phase, attributes) = row

# I	interpolated_pmap_position	gene	1	559768	.	.	.	ID=gmap:spe-13;gmap=spe-13;status=uncloned;Note=-21.3602 cM (+/- 1.84 cM)
# I	WormBase	gene	3747	3909	.	-	.	ID=Gene:WBGene00023193;Name=WBGene00023193;interpolated_map_position=-21.9064;sequence_name=Y74C9A.6;biotype=snoRNA;Alias=Y74C9A.6
# I	absolute_pmap_position	gene	4119	10230	.	.	.	ID=gmap:homt-1;gmap=homt-1;status=cloned;Note=-21.8252 cM (+/- 0.00 cM)

                # dbs = re.split(
                #   r' ', 'assembly_component expressed_sequence_match Coding_transcript Genomic_canonical Non_coding_transcript Orfeome Promoterome Pseudogene RNAi_primary RNAi_secondary Reference Transposon Transposon_CDS cDNA_for_RNAi miRanda ncRNA operon polyA_signal_sequence polyA_site snlRNA')
                #
                # if db not in dbs:
                #     continue

                if feature_type_label not in [
                        'gene', 'point_mutation', 'deletion', 'RNAi_reagent',
                        'duplication', 'enhancer', 'binding_site',
                        'biological_region', 'complex_substitution',
                        'substitution', 'insertion', 'inverted_repeat']:
                    # note biological_regions include balancers
                    # other options here: promoter, regulatory_region, reagent
                    continue
                line_counter += 1

                attribute_dict = {}
                if attributes != '':
                    attribute_dict = dict(
                        item.split("=")for item in
                        re.sub(r'"', '', attributes).split(";"))

                fid = flabel = desc = None
                if 'ID' in attribute_dict:
                    fid = attribute_dict.get('ID')
                    if re.search(r'WB(Gene|Var|sf)', fid):
                        fid = re.sub(r'^\w+:WB', 'WormBase:WB', fid)
                    elif re.match(r'(gmap|landmark)', fid):
                        continue
                    else:
                        logger.info('other identifier %s', fid)
                        fid = None
                elif 'variation' in attribute_dict:
                    fid = 'WormBase:'+attribute_dict.get('variation')
                    flabel = attribute_dict.get('public_name')
                    sub = attribute_dict.get('substitution')
                    ins = attribute_dict.get('insertion')
                    # if it's a variation:
                    # variation=WBVar00604246;public_name=gk320600;strain=VC20384;substitution=C/T
                    desc = ''
                    if sub is not None:
                        desc = 'substitution='+sub
                    if ins is not None:
                        desc = 'insertion='+ins

                    # keep track of the strains with this variation,
                    # for later processing
                    strain_list = attribute_dict.get('strain')
                    if strain_list is not None:
                        for s in re.split(r',', strain_list):
                            if s.strip() not in strain_to_variant_map:
                                strain_to_variant_map[s.strip()] = set()
                            strain_to_variant_map[s.strip()].add(fid)

                # if feature_type_label == 'RNAi_reagent':
                    # Target=WBRNAi00096030 1 4942
                    # this will tell us where the RNAi is actually binding
                    # target = attribute_dict.get('Target') # TODO unused
                    # rnai_num = re.split(r' ', target)[0]  # TODO unused
                    # it will be the reagent-targeted-gene that has a position,
                    # (i think)
                    # TODO finish the RNAi binding location

                name = attribute_dict.get('Name')
                polymorphism = attribute_dict.get('polymorphism')

                if fid is None:
                    if name is not None and re.match(r'WBsf', name):
                        fid = 'WormBase:'+name
                        name = None
                    else:
                        continue

                if self.testMode \
                        and re.sub(r'WormBase:', '', fid) \
                        not in self.test_ids['gene']+self.test_ids['allele']:
                    continue

                # these really aren't that interesting
                if polymorphism is not None:
                    continue

                if name is not None and not re.search(name, fid):
                    if flabel is None:
                        flabel = name
                    else:
                        gu.addSynonym(g, fid, name)

                if desc is not None:
                    gu.addDescription(g, fid, desc)

                alias = attribute_dict.get('Alias')

                biotype = attribute_dict.get('biotype')
                note = attribute_dict.get('Note')
                other_name = attribute_dict.get('other_name')
                for n in [alias, other_name]:
                    if n is not None:
                        gu.addSynonym(g, fid, other_name)

                ftype = self.get_feature_type_by_class_and_biotype(
                    feature_type_label, biotype)

                chr_id = makeChromID(chrom, build_id, 'CHR')
                geno.addChromosomeInstance(chrom, build_id, build_num)

                f = Feature(fid, flabel, ftype)
                f.addFeatureStartLocation(start, chr_id, strand)
                f.addFeatureEndLocation(start, chr_id, strand)

                feature_is_class = False
                if feature_type_label == 'gene':
                    feature_is_class = True

                f.addFeatureToGraph(g, True, None, feature_is_class)

                if note is not None:
                    gu.addDescription(g, fid, note)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

                # RNAi reagents:
# I	RNAi_primary	RNAi_reagent	4184	10232	.	+	.	Target=WBRNAi00001601 1 6049 +;laboratory=YK;history_name=SA:yk326e10
# I	RNAi_primary	RNAi_reagent	4223	10147	.	+	.	Target=WBRNAi00033465 1 5925 +;laboratory=SV;history_name=MV_SV:mv_G_YK5052
# I	RNAi_primary	RNAi_reagent	5693	9391	.	+	.	Target=WBRNAi00066135 1 3699 +;laboratory=CH

                # TODO TF bindiing sites and network:
# I	TF_binding_site_region	TF_binding_site	1861	2048	.	+	.	Name=WBsf292777;tf_id=WBTranscriptionFactor000025;tf_name=DAF-16
# I	TF_binding_site_region	TF_binding_site	3403	4072	.	+	.	Name=WBsf331847;tf_id=WBTranscriptionFactor000703;tf_name=DPL-1

        return
Ejemplo n.º 53
0
    def process_allele_phenotype(self, limit=None):
        """
        This file compactly lists variant to phenotype associations,
        such that in a single row, there may be >1 variant listed
        per phenotype and paper.  This indicates that each variant is
        individually assocated with the given phenotype,
        as listed in 1+ papers.
        (Not that the combination of variants is producing the phenotype.)
        :param limit:
        :return:

        """

        raw = '/'.join((self.rawdir, self.files['allele_pheno']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        # gu = GraphUtils(curie_map.get())  # TODO unused

        logger.info("Processing Allele phenotype associations")
        line_counter = 0
        geno = Genotype(g)
        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                if re.match(r'!', ''.join(row)):  # header
                    continue
                line_counter += 1
                (db, gene_num, gene_symbol, is_not, phenotype_id, ref,
                 eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
                 gene_class, taxon, date, assigned_by, blank, blank2) = row

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                # TODO add NOT phenotypes
                if is_not == 'NOT':
                    continue

                eco_id = None
                if eco_symbol == 'IMP':
                    eco_id = 'ECO:0000015'
                elif eco_symbol.strip() != '':
                    logger.warning(
                        "Encountered an ECO code we don't have: %s",
                        eco_symbol)

                # according to the GOA spec, persons are not allowed to be
                # in the reference column, therefore they the variant and
                # persons are swapped between the reference and with column.
                # we unswitch them here.
                temp_var = temp_ref = None
                if re.search(r'WBVar|WBRNAi', ref):
                    temp_var = ref
                    # move the paper from the with column into the ref
                if re.search(r'WBPerson', with_or_from):
                    temp_ref = with_or_from
                if temp_var is not None or temp_ref is not None:
                    with_or_from = temp_var
                    ref = temp_ref

                allele_list = re.split(r'\|', with_or_from)
                if len(allele_list) == 0:
                    logger.error(
                        "Missing alleles from phenotype assoc at line %d",
                        line_counter)
                    continue
                else:
                    for a in allele_list:
                        allele_num = re.sub(r'WB:', '', a.strip())
                        allele_id = 'WormBase:'+allele_num
                        gene_id = 'WormBase:'+gene_num

                        if re.search(r'WBRNAi', allele_id):
                            # make the reagent-targeted gene,
                            # & annotate that instead of the RNAi item directly
                            rnai_num = re.sub(r'WormBase:', '', allele_id)
                            rnai_id = allele_id
                            rtg_id = self.make_reagent_targeted_gene_id(
                                gene_num, rnai_num, self.nobnodes)
                            geno.addReagentTargetedGene(
                                rnai_id, 'WormBase:'+gene_num, rtg_id)
                            geno.addGeneTargetingReagent(
                                rnai_id, None, geno.genoparts['RNAi_reagent'],
                                gene_id)
                            allele_id = rtg_id
                        elif re.search(r'WBVar', allele_id):
                            # this may become deprecated by using wormmine
                            # make the allele to gene relationship
                            # the WBVars are really sequence alterations

                            # the public name will come from elsewhere
                            geno.addSequenceAlteration(allele_id, None)
                            vl_id = '_'+'-'.join((gene_num, allele_num))
                            if self.nobnodes:
                                vl_id = ':'+vl_id
                            geno.addSequenceAlterationToVariantLocus(
                                allele_id, vl_id)
                            geno.addAlleleOfGene(vl_id, gene_id)
                        else:
                            logger.warning(
                                "Some kind of allele I don't recognize: %s",
                                allele_num)
                            continue
                        assoc = G2PAssoc(self.name, allele_id, phenotype_id)

                        if eco_id is not None:
                            assoc.add_evidence(eco_id)

                        if ref is not None and ref != '':
                            ref = re.sub(r'(WB:|WB_REF:)', 'WormBase:', ref)
                            r = Reference(ref)
                            if re.search(r'Person', ref):
                                r.setType(r.ref_types['person'])
                                # also add
                                # inferred from background scientific knowledge
                                assoc.add_evidence('ECO:0000001')
                            r.addRefToGraph(g)
                            assoc.add_source(ref)

                        assoc.add_association_to_graph(g)

                        # finish looping through all alleles

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return
Ejemplo n.º 54
0
    def _get_variants(self, limit):
        """
        Currently loops through the variant_summary file.

        :param limit:
        :return:

        """

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        model = Model(g)

        geno = Genotype(g)
        f = Feature(g, None, None, None)

        # add the taxon and the genome
        tax_num = '9606'  # HARDCODE
        tax_id = 'NCBITaxon:'+tax_num
        tax_label = 'Human'
        model.addClassToGraph(tax_id, None)
        geno.addGenome(tax_id, tax_label)  # label gets added elsewhere

        # not unzipping the file
        logger.info("Processing Variant records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['variant_summary']['file']))
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match(r'^#', line):
                    continue

                # AlleleID               integer value as stored in the AlleleID field in ClinVar  (//Measure/@ID in the XML)
                # Type                   character, the type of variation
                # Name                   character, the preferred name for the variation
                # GeneID                 integer, GeneID in NCBI's Gene database
                # GeneSymbol             character, comma-separated list of GeneIDs overlapping the variation
                # ClinicalSignificance   character, comma-separated list of values of clinical significance reported for this variation
                #                          for the mapping between the terms listed here and the integers in the .VCF files, see
                #                          http://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/
                # RS# (dbSNP)            integer, rs# in dbSNP
                # nsv (dbVar)            character, the NSV identifier for the region in dbVar
                # RCVaccession           character, list of RCV accessions that report this variant
                # TestedInGTR            character, Y/N for Yes/No if there is a test registered as specific to this variation in the NIH Genetic Testing Registry (GTR)
                # PhenotypeIDs           character, list of db names and identifiers for phenotype(s) reported for this variant
                # Origin                 character, list of all allelic origins for this variation
                # Assembly               character, name of the assembly on which locations are based
                # Chromosome             character, chromosomal location
                # Start                  integer, starting location, in pter->qter orientation
                # Stop                   integer, end location, in pter->qter orientation
                # Cytogenetic            character, ISCN band
                # ReviewStatus           character, highest review status for reporting this measure. For the key to the terms,
                #                            and their relationship to the star graphics ClinVar displays on its web pages,
                #                            see http://www.ncbi.nlm.nih.gov/clinvar/docs/variation_report/#interpretation
                # HGVS(c.)               character, RefSeq cDNA-based HGVS expression
                # HGVS(p.)               character, RefSeq protein-based HGVS expression
                # NumberSubmitters       integer, number of submissions with this variant
                # LastEvaluated          datetime, the latest time any submitter reported clinical significance
                # Guidelines             character, ACMG only right now, for the reporting of incidental variation in a Gene
                #                                (NOTE: if ACMG, not a specific to the allele but to the Gene)
                # OtherIDs               character, list of other identifiers or sources of information about this variant
                # VariantID              integer, the value used to build the URL for the current default report,
                #                            e.g. http://www.ncbi.nlm.nih.gov/clinvar/variation/1756/
                #

                # a crude check that there's an expected number of cols.
                # if not, error out because something changed.
                num_cols = len(line.split('\t'))
                expected_numcols = 29
                if num_cols != expected_numcols:
                    logger.error(
                        "Unexpected number of columns in raw file " +
                        "(%d actual vs %d expected)",
                        num_cols, expected_numcols)

                (allele_num, allele_type, allele_name, gene_num, gene_symbol,
                 clinical_significance, dbsnp_num, dbvar_num, rcv_nums,
                 tested_in_gtr, phenotype_ids, origin, assembly, chr, start,
                 stop, cytogenetic_loc, review_status, hgvs_c, hgvs_p,
                 number_of_submitters, last_eval, guidelines, other_ids,
                 variant_num, reference_allele, alternate_allele, categories,
                 ChromosomeAccession) = line.split('\t')

                # ###set filter=None in init if you don't want to have a filter
                # if self.filter is not None:
                #    if ((self.filter == 'taxids' and\
                #            (int(tax_num) not in self.tax_ids)) or\
                #            (self.filter == 'geneids' and\
                #             (int(gene_num) not in self.gene_ids))):
                #        continue
                # #### end filter

                line_counter += 1

                pheno_list = []
                if phenotype_ids != '-':
                    # trim any leading/trailing semicolons/commas
                    phenotype_ids = re.sub(r'^[;,]', '', phenotype_ids)
                    phenotype_ids = re.sub(r'[;,]$', '', phenotype_ids)
                    pheno_list = re.split(r'[,;]', phenotype_ids)

                if self.testMode:
                    # get intersection of test disease ids
                    # and these phenotype_ids
                    intersect = \
                        list(
                            set([str(i)
                                for i in self.disease_ids]) & set(pheno_list))
                    if int(gene_num) not in self.gene_ids and\
                            int(variant_num) not in self.variant_ids and\
                            len(intersect) < 1:
                        continue

                # TODO may need to switch on assembly to create correct
                # assembly/build identifiers
                build_id = ':'.join(('NCBIGenome', assembly))

                # make the reference genome build
                geno.addReferenceGenome(build_id, assembly, tax_id)

                allele_type_id = self._map_type_of_allele(allele_type)
                bandinbuild_id = None
                if str(chr) == '':
                    # check cytogenic location
                    if str(cytogenetic_loc).strip() != '':
                        # use cytogenic location to get the apx location
                        # oddly, they still put an assembly number even when
                        # there's no numeric location
                        if not re.search(r'-', str(cytogenetic_loc)):
                            band_id = makeChromID(
                                re.split(r'-', str(cytogenetic_loc)),
                                tax_num, 'CHR')
                            geno.addChromosomeInstance(
                                cytogenetic_loc, build_id, assembly, band_id)
                            bandinbuild_id = makeChromID(
                                re.split(r'-', str(cytogenetic_loc)),
                                assembly, 'MONARCH')
                        else:
                            # can't deal with ranges yet
                            pass
                else:
                    # add the human chromosome class to the graph,
                    # and add the build-specific version of it
                    chr_id = makeChromID(str(chr), tax_num, 'CHR')
                    geno.addChromosomeClass(str(chr), tax_id, tax_label)
                    geno.addChromosomeInstance(
                        str(chr), build_id, assembly, chr_id)
                    chrinbuild_id = makeChromID(str(chr), assembly, 'MONARCH')

                seqalt_id = ':'.join(('ClinVarVariant', variant_num))
                gene_id = None

                # they use -1 to indicate unknown gene
                if str(gene_num) != '-1' and str(gene_num) != 'more than 10':
                    if re.match(r'^Gene:', gene_num):
                        gene_num = "NCBI" + gene_num
                    else:
                        gene_id = ':'.join(('NCBIGene', str(gene_num)))

                # FIXME there are some "variants" that are actually haplotypes
                # probably will get taken care of when we switch to processing
                # the xml for example, variant_num = 38562
                # but there's no way to tell if it's a haplotype
                # in the csv data so the dbsnp or dbvar
                # should probably be primary,
                # and the variant num be the vslc,
                # with each of the dbsnps being added to it

                # TODO clinical significance needs to be mapped to
                # a list of terms
                # first, make the variant:
                f = Feature(seqalt_id, allele_name, allele_type_id)

                if start != '-' and start.strip() != '':
                    f.addFeatureStartLocation(start, chrinbuild_id)
                if stop != '-' and stop.strip() != '':
                    f.addFeatureEndLocation(stop, chrinbuild_id)

                f.addFeatureToGraph()
                f.addTaxonToFeature(tax_id)
                # make the ClinVarVariant the clique leader
                model.makeLeader(seqalt_id)

                if bandinbuild_id is not None:
                    f.addSubsequenceOfFeature(bandinbuild_id)

                # CHECK - this makes the assumption that there is
                # only one affected chromosome per variant what happens with
                # chromosomal rearrangement variants?
                # shouldn't both chromosomes be here?

                # add the hgvs as synonyms
                if hgvs_c != '-' and hgvs_c.strip() != '':
                    model.addSynonym(seqalt_id, hgvs_c)
                if hgvs_p != '-' and hgvs_p.strip() != '':
                    model.addSynonym(seqalt_id, hgvs_p)

                # add the dbsnp and dbvar ids as equivalent
                if dbsnp_num != '-' and int(dbsnp_num) != -1:
                    dbsnp_id = 'dbSNP:rs'+str(dbsnp_num)
                    model.addIndividualToGraph(dbsnp_id, None)
                    model.addSameIndividual(seqalt_id, dbsnp_id)
                if dbvar_num != '-':
                    dbvar_id = 'dbVar:'+dbvar_num
                    model.addIndividualToGraph(dbvar_id, None)
                    model.addSameIndividual(seqalt_id, dbvar_id)

                # TODO - not sure if this is right... add as xref?
                # the rcv is like the combo of the phenotype with the variant
                if rcv_nums != '-':
                    for rcv_num in re.split(r';', rcv_nums):
                        rcv_id = 'ClinVar:' + rcv_num
                        model.addIndividualToGraph(rcv_id, None)
                        model.addXref(seqalt_id, rcv_id)

                if gene_id is not None:
                    # add the gene
                    model.addClassToGraph(gene_id, gene_symbol)
                    # make a variant locus
                    vl_id = '_'+gene_num+'-'+variant_num
                    if self.nobnodes:
                        vl_id = ':'+vl_id
                    vl_label = allele_name
                    model.addIndividualToGraph(
                        vl_id, vl_label, geno.genoparts['variant_locus'])
                    geno.addSequenceAlterationToVariantLocus(seqalt_id, vl_id)
                    geno.addAlleleOfGene(vl_id, gene_id)
                else:
                    # some basic reporting
                    gmatch = re.search(r'\(\w+\)', allele_name)
                    if gmatch is not None and len(gmatch.groups()) > 0:
                        logger.info(
                            "Gene found in allele label, but no id provided: %s",
                            gmatch.group(1))
                    elif re.match(r'more than 10', gene_symbol):
                        logger.info(
                            "More than 10 genes found; "
                            "need to process XML to fetch (variant=%d)",
                            int(variant_num))
                    else:
                        logger.info(
                            "No gene listed for variant %d",
                            int(variant_num))

                # parse the list of "phenotypes" which are diseases.
                # add them as an association
                # ;GeneReviews:NBK1440,MedGen:C0392514,OMIM:235200,SNOMED CT:35400008;MedGen:C3280096,OMIM:614193;MedGen:CN034317,OMIM:612635;MedGen:CN169374
                # the list is both semicolon delimited and comma delimited,
                # but i don't know why! some are bad, like:
                # Orphanet:ORPHA ORPHA319705,SNOMED CT:49049000
                if phenotype_ids != '-':
                    for phenotype in pheno_list:
                        m = re.match(
                            r"(Orphanet:ORPHA(?:\s*ORPHA)?)", phenotype)
                        if m is not None and len(m.groups()) > 0:
                            phenotype = re.sub(
                                m.group(1), 'Orphanet:', phenotype.strip())
                        elif re.match(r'ORPHA:\d+', phenotype):
                            phenotype = re.sub(
                                r'^ORPHA', 'Orphanet', phenotype.strip())
                        elif re.match(r'Human Phenotype Ontology', phenotype):
                            phenotype = re.sub(
                                r'^Human Phenotype Ontology', '',
                                phenotype.strip())
                        elif re.match(r'SNOMED CT:\s?', phenotype):
                            phenotype = re.sub(
                                r'SNOMED CT:\s?', 'SNOMED:', phenotype.strip())
                        elif re.match(r'^Gene:', phenotype):
                            continue

                        assoc = G2PAssoc(
                            g, self.name, seqalt_id, phenotype.strip())
                        assoc.add_association_to_graph()

                if other_ids != '-':
                    id_list = other_ids.split(',')
                    # process the "other ids" ex:
                    # CFTR2:F508del,HGMD:CD890142,OMIM Allelic Variant:602421.0001
                    # TODO make more xrefs
                    for xrefid in id_list:
                        prefix = xrefid.split(':')[0].strip()
                        if prefix == 'OMIM Allelic Variant':
                            xrefid = 'OMIM:'+xrefid.split(':')[1]
                            model.addIndividualToGraph(xrefid, None)
                            model.addSameIndividual(seqalt_id, xrefid)
                        elif prefix == 'HGMD':
                            model.addIndividualToGraph(xrefid, None)
                            model.addSameIndividual(seqalt_id, xrefid)
                        elif prefix == 'dbVar' \
                                and dbvar_num == xrefid.split(':')[1].strip():
                            pass  # skip over this one
                        elif re.search(r'\s', prefix):
                            pass
                            # logger.debug(
                            #   'xref prefix has a space: %s', xrefid)
                        else:
                            # should be a good clean prefix
                            # note that HGMD variants are in here as Xrefs
                            # because we can't resolve URIs for them
                            # logger.info("Adding xref: %s", xrefid)
                            # gu.addXref(g, seqalt_id, xrefid)
                            # logger.info("xref prefix to add: %s", xrefid)
                            pass

                if not self.testMode and limit is not None \
                        and line_counter > limit:
                    break

        logger.info("Finished parsing variants")

        return
Ejemplo n.º 55
0
    def _process_data(self, raw, limit=None):
        LOG.info("Processing Data from %s", raw)

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        geno = Genotype(graph)

        # Add the taxon as a class
        taxon_id = self.globaltt['Mus musculus']
        model.addClassToGraph(taxon_id, None)

        # with open(raw, 'r', encoding="utf8") as csvfile:
        col = self.files['all']['columns']
        with gzip.open(raw, 'rt') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            row = next(reader)  # presumed header
            if not self.check_fileheader(col, row):
                pass

            for row in reader:
                # | head -1 | tr ',' '\n' | sed "s|\(.*\)|# \1 = row[col.index('\1')]|g"
                marker_accession_id = row[col.index('marker_accession_id')].strip()
                marker_symbol = row[col.index('marker_symbol')].strip()
                phenotyping_center = row[col.index('phenotyping_center')].strip()
                colony_raw = row[col.index('colony_id')].strip()
                sex = row[col.index('sex')].strip()
                zygosity = row[col.index('zygosity')].strip()
                allele_accession_id = row[col.index('allele_accession_id')].strip()
                allele_symbol = row[col.index('allele_symbol')].strip()
                # allele_name = row[col.index('allele_name')]
                strain_accession_id = row[col.index('strain_accession_id')].strip()
                strain_name = row[col.index('strain_name')].strip()
                # project_name = row[col.index('project_name')]
                project_fullname = row[col.index('project_fullname')].strip()
                pipeline_name = row[col.index('pipeline_name')].strip()
                pipeline_stable_id = row[col.index('pipeline_stable_id')].strip()
                procedure_stable_id = row[col.index('procedure_stable_id')].strip()
                procedure_name = row[col.index('procedure_name')].strip()
                parameter_stable_id = row[col.index('parameter_stable_id')].strip()
                parameter_name = row[col.index('parameter_name')].strip()
                # top_level_mp_term_id = row[col.index('top_level_mp_term_id')]
                # top_level_mp_term_name = row[col.index('top_level_mp_term_name')]
                mp_term_id = row[col.index('mp_term_id')].strip()
                mp_term_name = row[col.index('mp_term_name')].strip()
                p_value = row[col.index('p_value')].strip()
                percentage_change = row[col.index('percentage_change')].strip()
                effect_size = row[col.index('effect_size')].strip()
                statistical_method = row[col.index('statistical_method')].strip()
                resource_name = row[col.index('resource_name')].strip()

                if self.test_mode and marker_accession_id not in self.gene_ids:
                    continue

                # ##### cleanup some of the identifiers ######
                zygosity = zygosity.strip()
                zygosity_id = self.resolve(zygosity)
                if zygosity_id == zygosity:
                    LOG.warning(
                        "Zygosity '%s' unmapped. detting to indeterminate", zygosity)
                    zygosity_id = self.globaltt['indeterminate']

                # colony ids sometimes have <> in them, spaces,
                # or other non-alphanumerics and break our system;
                # replace these with underscores
                colony_id = '_:' + re.sub(r'\W+', '_', colony_raw)

                if not re.match(r'MGI', allele_accession_id):
                    allele_accession_id = '_:IMPC-'+re.sub(
                        r':', '', allele_accession_id)

                if re.search(r'EUROCURATE', strain_accession_id):
                    # the eurocurate links don't resolve at IMPC
                    # TODO blank nodes do not maintain identifiers
                    strain_accession_id = '_:' + strain_accession_id

                elif not re.match(r'MGI', strain_accession_id):
                    LOG.info(
                        "Found a strange strain accession...%s", strain_accession_id)
                    strain_accession_id = 'IMPC:'+strain_accession_id

                ######################
                # first, add the marker and variant to the graph as with MGI,
                # the allele is the variant locus.  IF the marker is not known,
                # we will call it a sequence alteration.  otherwise,
                # we will create a BNode for the sequence alteration.
                sequence_alteration_id = variant_locus_id = None
                variant_locus_name = sequence_alteration_name = None

                # extract out what's within the <> to get the symbol
                if re.match(r'.*<.*>', allele_symbol):
                    sequence_alteration_name = re.match(
                        r'.*<(.*)>', allele_symbol)
                    if sequence_alteration_name is not None:
                        sequence_alteration_name = sequence_alteration_name.group(1)
                else:
                    sequence_alteration_name = allele_symbol

                if marker_accession_id is not None and marker_accession_id == '':
                    LOG.warning("Marker unspecified on row %d", reader.line_num)
                    marker_accession_id = None

                if marker_accession_id is not None:
                    variant_locus_id = allele_accession_id
                    variant_locus_name = allele_symbol
                    variant_locus_type = self.globaltt['variant_locus']
                    geno.addGene(
                        marker_accession_id, marker_symbol, self.globaltt['gene'])

                    geno.addAllele(
                        variant_locus_id, variant_locus_name, variant_locus_type, None)
                    geno.addAlleleOfGene(variant_locus_id, marker_accession_id)

                    # TAG bnode
                    sequence_alteration_id = '_:seqalt' + re.sub(
                        r':', '', allele_accession_id)
                    geno.addSequenceAlterationToVariantLocus(
                        sequence_alteration_id, variant_locus_id)

                else:
                    sequence_alteration_id = allele_accession_id

                # IMPC contains targeted mutations with either gene traps,
                # knockouts, insertion/intragenic deletions.
                # but I don't really know what the SeqAlt is here,
                # so I don't add it.
                geno.addSequenceAlteration(
                    sequence_alteration_id, sequence_alteration_name)

                # #############    BUILD THE COLONY    #############
                # First, let's describe the colony that the animals come from
                # The Colony ID refers to the ES cell clone
                #   used to generate a mouse strain.
                # Terry sez: we use this clone ID to track
                #   ES cell -> mouse strain -> mouse phenotyping.
                # The same ES clone maybe used at multiple centers,
                # so we have to concatenate the two to have a unique ID.
                # some useful reading about generating mice from ES cells:
                # http://ki.mit.edu/sbc/escell/services/details

                # here, we'll make a genotype
                # that derives from an ES cell with a given allele.
                # the strain is not really attached to the colony.

                # the colony/clone is reflective of the allele,  with unknown zygosity

                stem_cell_class = self.globaltt['embryonic stem cell line']

                if colony_id is None:
                    print(colony_raw, stem_cell_class, "\nline:\t", reader.line_num)
                model.addIndividualToGraph(colony_id, colony_raw, stem_cell_class)

                # vslc of the colony has unknown zygosity
                # note that we will define the allele
                # (and it's relationship to the marker, etc.) later
                # FIXME is it really necessary to create this vslc
                # when we always know it's unknown zygosity?
                vslc_colony = '_:'+re.sub(
                    r':', '', allele_accession_id + self.globaltt['indeterminate'])
                vslc_colony_label = allele_symbol + '/<?>'
                # for ease of reading, we make the colony genotype variables.
                # in the future, it might be desired to keep the vslcs
                colony_genotype_id = vslc_colony
                colony_genotype_label = vslc_colony_label
                geno.addGenotype(colony_genotype_id, colony_genotype_label)
                geno.addParts(
                    allele_accession_id, colony_genotype_id,
                    self.globaltt['has_variant_part'])

                geno.addPartsToVSLC(
                    vslc_colony, allele_accession_id, None,
                    self.globaltt['indeterminate'], self.globaltt['has_variant_part'])
                graph.addTriple(
                    colony_id, self.globaltt['has_genotype'], colony_genotype_id)

                # ##########    BUILD THE ANNOTATED GENOTYPE    ##########
                # now, we'll build the genotype of the individual that derives
                # from the colony/clone genotype that is attached to
                # phenotype = colony_id + strain + zygosity + sex
                # (and is derived from a colony)

                # this is a sex-agnostic genotype
                genotype_id = self.make_id(
                    (colony_id + phenotyping_center + zygosity + strain_accession_id))
                geno.addSequenceDerivesFrom(genotype_id, colony_id)

                # build the VSLC of the sex-agnostic genotype
                # based on the zygosity
                allele1_id = allele_accession_id
                allele2_id = allele2_rel = None
                allele1_label = allele_symbol
                allele2_label = '<?>'
                # Making VSLC labels from the various parts,
                # can change later if desired.
                if zygosity == 'heterozygote':
                    allele2_label = re.sub(r'<.*', '<+>', allele1_label)
                    allele2_id = None
                elif zygosity == 'homozygote':
                    allele2_label = allele1_label
                    allele2_id = allele1_id
                    allele2_rel = self.globaltt['has_variant_part']
                elif zygosity == 'hemizygote':
                    allele2_label = re.sub(r'<.*', '<0>', allele1_label)
                    allele2_id = None
                elif zygosity == 'not_applicable':
                    allele2_label = re.sub(r'<.*', '<?>', allele1_label)
                    allele2_id = None
                else:
                    LOG.warning("found unknown zygosity %s", zygosity)
                    break
                vslc_name = '/'.join((allele1_label, allele2_label))

                # Add the VSLC
                vslc_id = '-'.join(
                    (marker_accession_id, allele_accession_id, zygosity))
                vslc_id = re.sub(r':', '', vslc_id)
                vslc_id = '_:'+vslc_id
                model.addIndividualToGraph(
                    vslc_id, vslc_name,
                    self.globaltt['variant single locus complement'])
                geno.addPartsToVSLC(
                    vslc_id, allele1_id, allele2_id, zygosity_id,
                    self.globaltt['has_variant_part'], allele2_rel)

                # add vslc to genotype
                geno.addVSLCtoParent(vslc_id, genotype_id)

                # note that the vslc is also the gvc
                model.addType(vslc_id, self.globaltt['genomic_variation_complement'])

                # Add the genomic background
                # create the genomic background id and name
                if strain_accession_id != '':
                    genomic_background_id = strain_accession_id
                else:
                    genomic_background_id = None

                genotype_name = vslc_name
                if genomic_background_id is not None:
                    geno.addGenotype(
                        genomic_background_id, strain_name,
                        self.globaltt['genomic_background'])

                    # make a phenotyping-center-specific strain
                    # to use as the background
                    pheno_center_strain_label = strain_name + '-' + phenotyping_center \
                        + '-' + colony_raw
                    pheno_center_strain_id = '-'.join((
                        re.sub(r':', '', genomic_background_id),
                        re.sub(r'\s', '_', phenotyping_center),
                        re.sub(r'\W+', '', colony_raw)))
                    if not re.match(r'^_', pheno_center_strain_id):
                        # Tag bnode
                        pheno_center_strain_id = '_:' + pheno_center_strain_id

                    geno.addGenotype(
                        pheno_center_strain_id, pheno_center_strain_label,
                        self.globaltt['genomic_background'])
                    geno.addSequenceDerivesFrom(
                        pheno_center_strain_id, genomic_background_id)

                    # Making genotype labels from the various parts,
                    # can change later if desired.
                    # since the genotype is reflective of the place
                    # it got made, should put that in to disambiguate
                    genotype_name = \
                        genotype_name + ' [' + pheno_center_strain_label + ']'
                    geno.addGenomicBackgroundToGenotype(
                        pheno_center_strain_id, genotype_id)
                    geno.addTaxon(taxon_id, pheno_center_strain_id)
                # this is redundant, but i'll keep in in for now
                geno.addSequenceDerivesFrom(genotype_id, colony_id)
                geno.addGenotype(genotype_id, genotype_name)

                # Make the sex-qualified genotype,
                # which is what the phenotype is associated with
                sex_qualified_genotype_id = \
                    self.make_id((
                        colony_id + phenotyping_center + zygosity +
                        strain_accession_id + sex))
                sex_qualified_genotype_label = genotype_name + ' (' + sex + ')'

                sq_type_id = self.resolve(sex, False)

                if sq_type_id == sex:
                    sq_type_id = self.globaltt['intrinsic_genotype']
                    LOG.warning(
                        "Unknown sex qualifier %s, adding as intrinsic_genotype",
                        sex)

                geno.addGenotype(
                    sex_qualified_genotype_id, sex_qualified_genotype_label, sq_type_id)
                geno.addParts(
                    genotype_id, sex_qualified_genotype_id,
                    self.globaltt['has_variant_part'])

                if genomic_background_id is not None and genomic_background_id != '':
                    # Add the taxon to the genomic_background_id
                    geno.addTaxon(taxon_id, genomic_background_id)
                else:
                    # add it as the genomic background
                    geno.addTaxon(taxon_id, genotype_id)

                # #############    BUILD THE G2P ASSOC    #############
                # from an old email dated July 23 2014:
                # Phenotypes associations are made to
                # imits colony_id+center+zygosity+gender

                # sometimes phenotype ids are missing.  (about 711 early 2020)
                if mp_term_id is None or mp_term_id == '':
                    LOG.warning(
                        "No phenotype id specified for row %d", reader.line_num)
                    continue
                # hard coded ECO code
                eco_id = self.globaltt['mutant phenotype evidence']

                # the association comes as a result of a g2p from
                # a procedure in a pipeline at a center and parameter tested

                assoc = G2PAssoc(
                    graph, self.name, sex_qualified_genotype_id, mp_term_id)
                assoc.add_evidence(eco_id)
                # assoc.set_score(float(p_value))

                # TODO add evidence instance using
                # pipeline_stable_id +
                # procedure_stable_id +
                # parameter_stable_id

                assoc.add_association_to_graph()
                assoc_id = assoc.get_association_id()

                model._addSexSpecificity(assoc_id, self.resolve(sex))

                # add a free-text description
                try:
                    description = ' '.join((
                        mp_term_name, 'phenotype determined by', phenotyping_center,
                        'in an', procedure_name, 'assay where', parameter_name.strip(),
                        'was measured with an effect_size of',
                        str(round(float(effect_size), 5)),
                        '(p =', "{:.4e}".format(float(p_value)), ').'))
                except ValueError:
                    description = ' '.join((
                        mp_term_name, 'phenotype determined by', phenotyping_center,
                        'in an', procedure_name, 'assay where', parameter_name.strip(),
                        'was measured with an effect_size of', str(effect_size),
                        '(p =', "{0}".format(p_value), ').'))

                study_bnode = self._add_study_provenance(
                    phenotyping_center, colony_raw, project_fullname, pipeline_name,
                    pipeline_stable_id, procedure_stable_id, procedure_name,
                    parameter_stable_id, parameter_name, statistical_method,
                    resource_name)

                evidence_line_bnode = self._add_evidence(
                    assoc_id, eco_id, p_value, percentage_change, effect_size,
                    study_bnode)

                self._add_assertion_provenance(assoc_id, evidence_line_bnode)

                model.addDescription(evidence_line_bnode, description)

                # resource_id = resource_name
                # assoc.addSource(graph, assoc_id, resource_id)

                if not self.test_mode and limit is not None and reader.line_num > limit:
                    break
Ejemplo n.º 56
0
    def _process_diseasegene(self, limit):
        """
        :param limit:
        :return:
        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        line_counter = 0
        geno = Genotype(g)
        model = Model(g)

        myfile = '/'.join((self.rawdir, self.files['disease-gene']['file']))

        # PYLINT complains iterparse deprecated,
        # but as of py 3.4 only the optional & unsupplied parse arg is.
        for event, elem in ET.iterparse(myfile):
            if elem.tag == 'Disorder':
                # get the element name and id, ignoreS element name
                # id = elem.get('id') # some internal identifier
                disorder_num = elem.find('OrphaNumber').text

                disorder_id = 'Orphanet:'+str(disorder_num)

                if self.testMode and \
                        disorder_id not in \
                        config.get_config()['test_ids']['disease']:
                    continue

                disorder_label = elem.find('Name').text

                # make a hash of internal gene id to type for later lookup
                gene_iid_to_type = {}
                gene_list = elem.find('GeneList')
                for gene in gene_list.findall('Gene'):
                    gene_iid = gene.get('id')
                    gene_type = gene.find('GeneType').get('id')
                    gene_iid_to_type[gene_iid] = gene_type

                # assuming that these are in the ontology
                model.addClassToGraph(disorder_id, disorder_label)

                assoc_list = elem.find('DisorderGeneAssociationList')
                for a in assoc_list.findall('DisorderGeneAssociation'):
                    gene_iid = a.find('.//Gene').get('id')
                    gene_name = a.find('.//Gene/Name').text
                    gene_symbol = a.find('.//Gene/Symbol').text
                    gene_num = a.find('./Gene/OrphaNumber').text
                    gene_id = 'Orphanet:'+str(gene_num)
                    gene_type_id = \
                        self._map_gene_type_id(gene_iid_to_type[gene_iid])
                    model.addClassToGraph(
                        gene_id, gene_symbol, gene_type_id, gene_name)
                    syn_list = a.find('./Gene/SynonymList')
                    if int(syn_list.get('count')) > 0:
                        for s in syn_list.findall('./Synonym'):
                            model.addSynonym(gene_id, s.text)

                    dgtype = a.find('DisorderGeneAssociationType').get('id')
                    rel_id = self._map_rel_id(dgtype)
                    dg_label = \
                        a.find('./DisorderGeneAssociationType/Name').text
                    if rel_id is None:
                        logger.warning(
                            "Cannot map association type (%s) to RO " +
                            "for association (%s | %s).  Skipping.",
                            dg_label, disorder_label, gene_symbol)
                        continue

                    alt_locus_id = '_:'+gene_num+'-'+disorder_num+'VL'
                    alt_label = \
                        ' '.join(('some variant of', gene_symbol.strip(),
                                  'that is a', dg_label.lower(),
                                  disorder_label))

                    model.addIndividualToGraph(alt_locus_id, alt_label,
                                               geno.genoparts['variant_locus'])
                    geno.addAffectedLocus(alt_locus_id, gene_id)
                    model.addBlankNodeAnnotation(alt_locus_id)

                    # consider typing the gain/loss-of-function variants like:
                    # http://sequenceontology.org/browser/current_svn/term/SO:0002054
                    # http://sequenceontology.org/browser/current_svn/term/SO:0002053

                    # use "assessed" status to issue an evidence code
                    # FIXME I think that these codes are sub-optimal
                    status_code = \
                        a.find('DisorderGeneAssociationStatus').get('id')
                    # imported automatically asserted information
                    # used in automatic assertion
                    eco_id = 'ECO:0000323'
                    # Assessed
                    # TODO are these internal ids stable between releases?
                    if status_code == '17991':
                        # imported manually asserted information
                        # used in automatic assertion
                        eco_id = 'ECO:0000322'
                    # Non-traceable author statement ECO_0000034
                    # imported information in automatic assertion ECO_0000313

                    assoc = G2PAssoc(g, self.name, alt_locus_id,
                                     disorder_id, rel_id)
                    assoc.add_evidence(eco_id)
                    assoc.add_association_to_graph()

                    rlist = a.find('./Gene/ExternalReferenceList')
                    eqid = None

                    for r in rlist.findall('ExternalReference'):
                        if r.find('Source').text == 'Ensembl':
                            eqid = 'ENSEMBL:'+r.find('Reference').text
                        elif r.find('Source').text == 'HGNC':
                            eqid = 'HGNC:'+r.find('Reference').text
                        elif r.find('Source').text == 'OMIM':
                            eqid = 'OMIM:'+r.find('Reference').text
                        else:
                            pass  # skip the others for now
                        if eqid is not None:
                            model.addClassToGraph(eqid, None)
                            model.addEquivalentClass(gene_id, eqid)
                elem.clear()  # empty the element

            if self.testMode and limit is not None and line_counter > limit:
                return

        return
Ejemplo n.º 57
0
    def _process_diseasegene(self, limit):
        """
        :param limit:
        :return:
        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        line_counter = 0
        geno = Genotype(g)
        gu = GraphUtils(curie_map.get())

        myfile = "/".join((self.rawdir, self.files["disease-gene"]["file"]))

        for event, elem in ET.iterparse(myfile):
            if elem.tag == "Disorder":
                # get the element name and id
                # id = elem.get('id') # some internal identifier
                disorder_num = elem.find("OrphaNumber").text

                disorder_id = "Orphanet:" + str(disorder_num)

                if self.testMode and disorder_id not in config.get_config()["test_ids"]["disease"]:
                    continue

                disorder_label = elem.find("Name").text

                # make a hash of internal gene id to type for later lookup
                gene_iid_to_type = {}
                gene_list = elem.find("GeneList")
                for gene in gene_list.findall("Gene"):
                    gene_iid = gene.get("id")
                    gene_type = gene.find("GeneType").get("id")
                    gene_iid_to_type[gene_iid] = gene_type

                gu.addClassToGraph(g, disorder_id, disorder_label)  # assuming that these are in the ontology

                assoc_list = elem.find("DisorderGeneAssociationList")
                for a in assoc_list.findall("DisorderGeneAssociation"):
                    gene_iid = a.find(".//Gene").get("id")
                    gene_name = a.find(".//Gene/Name").text
                    gene_symbol = a.find(".//Gene/Symbol").text
                    gene_num = a.find("./Gene/OrphaNumber").text
                    gene_id = "Orphanet:" + str(gene_num)
                    gene_type_id = self._map_gene_type_id(gene_iid_to_type[gene_iid])
                    gu.addClassToGraph(g, gene_id, gene_symbol, gene_type_id, gene_name)
                    syn_list = a.find("./Gene/SynonymList")
                    if int(syn_list.get("count")) > 0:
                        for s in syn_list.findall("./Synonym"):
                            gu.addSynonym(g, gene_id, s.text)

                    dgtype = a.find("DisorderGeneAssociationType").get("id")
                    rel_id = self._map_rel_id(dgtype)
                    dg_label = a.find("./DisorderGeneAssociationType/Name").text
                    if rel_id is None:
                        logger.warn(
                            "Cannot map association type (%s) to RO for association (%s | %s).  Skipping.",
                            dg_label,
                            disorder_label,
                            gene_symbol,
                        )
                        continue

                    alt_locus_id = "_" + gene_num + "-" + disorder_num + "VL"
                    alt_label = " ".join(
                        ("some variant of", gene_symbol.strip(), "that is a", dg_label.lower(), disorder_label)
                    )
                    if self.nobnodes:
                        alt_locus_id = ":" + alt_locus_id
                    gu.addIndividualToGraph(g, alt_locus_id, alt_label, geno.genoparts["variant_locus"])
                    geno.addAlleleOfGene(alt_locus_id, gene_id)

                    # consider typing the gain/loss-of-function variants like:
                    # http://sequenceontology.org/browser/current_svn/term/SO:0002054
                    # http://sequenceontology.org/browser/current_svn/term/SO:0002053

                    # use "assessed" status to issue an evidence code
                    # FIXME I think that these codes are sub-optimal
                    status_code = a.find("DisorderGeneAssociationStatus").get("id")
                    eco_id = "ECO:0000323"  # imported automatically asserted information used in automatic assertion
                    if status_code == "17991":  # Assessed  # TODO are these internal ids stable between releases?
                        eco_id = "ECO:0000322"  # imported manually asserted information used in automatic assertion
                    # Non-traceable author statement ECO_0000034
                    # imported information in automatic assertion ECO_0000313

                    assoc = G2PAssoc(self.name, alt_locus_id, disorder_id, rel_id)
                    assoc.add_evidence(eco_id)
                    assoc.add_association_to_graph(g)

                    rlist = a.find("./Gene/ExternalReferenceList")
                    eqid = None

                    for r in rlist.findall("ExternalReference"):
                        if r.find("Source").text == "Ensembl":
                            eqid = "ENSEMBL:" + r.find("Reference").text
                        elif r.find("Source").text == "HGNC":
                            eqid = "HGNC:" + r.find("Reference").text
                        elif r.find("Source").text == "OMIM":
                            eqid = "OMIM:" + r.find("Reference").text
                        else:
                            pass  # skip the others for now
                        if eqid is not None:
                            gu.addClassToGraph(g, eqid, None)
                            gu.addEquivalentClass(g, gene_id, eqid)
                            pass
                elem.clear()  # discard the element

            if self.testMode and limit is not None and line_counter > limit:
                return

        gu.loadProperties(g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP)
        gu.loadProperties(g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP)
        gu.loadProperties(g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP)
        gu.loadAllProperties(g)

        return