Ejemplo n.º 1
0
    def parse(self, limit=None):
        zfin_parser = ZFIN(self.graph_type, self.are_bnodes_skized)
        model = Model(self.graph)
        zp_file = '/'.join((self.rawdir, self.files['zpmap']['file']))
        g2p_file = '/'.join((self.rawdir, self.files['g2p_clean']['file']))
        zfin_parser.zp_map = zfin_parser._load_zp_mappings(zp_file)

        with open(g2p_file, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:

                (internal_id, symbol, gene_id, subterm1_id, subterm1_label,
                 pc_rel_id, pc_rel_label, superterm1_id, superterm1_label,
                 quality_id, quality_name, modifier, subterm2_id,
                 subterm2_label, pc_rel2_id, pc_rel2_id, superterm2_id,
                 superterm2_label, fish_id, fish_label, start_stage, end_stage,
                 environment, pub_id, figure_id, unknown_field) = row

                zp_id = zfin_parser._map_sextuple_to_phenotype(
                    superterm1_id, subterm1_id, quality_id, superterm2_id,
                    subterm2_id, modifier)

                gene_curie = "ZFIN:{0}".format(gene_id)
                model.makeLeader(gene_curie)
                pub_curie = "ZFIN:{0}".format(pub_id)
                if zp_id:
                    assoc = G2PAssoc(self.graph, self.name, gene_curie, zp_id)
                    if pub_id:
                        reference = Reference(self.graph, pub_curie,
                                              Reference.ref_types['document'])
                        reference.addRefToGraph()
                        assoc.add_source(pub_curie)

                    assoc.add_evidence('ECO:0000059')
                    assoc.add_association_to_graph()
Ejemplo n.º 2
0
    def __init__(self,
                 graph_type,
                 are_bnodes_skolemized,
                 data_release_version=None,
                 tax_ids=None):
        super().__init__(
            graph_type=graph_type,
            are_bnodes_skized=are_bnodes_skolemized,
            data_release_version=data_release_version,
            name='go',
            ingest_title='Gene Ontology',
            ingest_url='http://www.geneontology.org',
            ingest_logo='source-geneontology.png',
            license_url=None,
            data_rights='http://geneontology.org/page/use-and-license'
            # file_handle=None
        )
        self.test_ids = []
        # note: dipper-etl defaults tax_ids to '9606'
        # note: sorting tax_ids for stable digest
        if tax_ids is not None and [] != set(tax_ids).difference(['9606']):
            LOG.info('Have %s given as taxon to ingest', str(tax_ids))
            self.tax_ids = sorted([str(x) for x in tax_ids])
            nottax = set(tax_ids) - set(self.files.keys())
            if nottax:
                LOG.error('Cant process taxon number(s):\t%s', str(nottax))
                self.tax_ids = list(set(self.tax_ids) - nottax)
        else:
            self.tax_ids = sorted(['9606', '10090', '7955'])

        LOG.info("Filtering to the following taxa: %s", self.tax_ids)

        # moving this from process_gaf() to avoid repeating this for each
        # file to be processed.
        if '7955' in self.tax_ids:
            self.zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
        if '6239' in self.tax_ids:
            self.wbase = WormBase(self.graph_type, self.are_bnodes_skized)

        if 'gene' not in self.all_test_ids:
            LOG.warning("not configured with gene test ids.")
        else:
            self.test_ids = self.all_test_ids['gene']

        # build the id map for mapping uniprot ids to genes ... ONCE
        self.uniprot_entrez_id_map = self.get_uniprot_entrez_id_map()

        # gaf evidence code mapping is built in parse(), after the file is fetched.
        self.gaf_eco = {}
Ejemplo n.º 3
0
    def process_gaf(self, file, limit, id_map=None, eco_map=None):

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph

        model = Model(graph)
        geno = Genotype(graph)
        LOG.info("Processing Gene Associations from %s", file)
        line_counter = 0
        uniprot_hit = 0
        uniprot_miss = 0
        if '7955' in self.tax_ids:
            zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
        if '6239' in self.tax_ids:
            wbase = WormBase(self.graph_type, self.are_bnodes_skized)

        with gzip.open(file, 'rb') as csvfile:
            filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                    delimiter='\t',
                                    quotechar='\"')
            for row in filereader:
                line_counter += 1
                # comments start with exclamation
                if re.match(r'!', ''.join(row)):
                    continue

                if len(row) > 17 or len(row) < 15:
                    LOG.warning(
                        "Wrong number of columns %i, expected 15 or 17\n%s",
                        len(row), row)
                    continue

                if 17 > len(row) >= 15:
                    row += [""] * (17 - len(row))

                (dbase, gene_num, gene_symbol, qualifier, go_id, ref,
                 eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
                 object_type, taxon, date, assigned_by, annotation_extension,
                 gene_product_form_id) = row

                # test for required fields
                if (dbase == '' or gene_num == '' or gene_symbol == ''
                        or go_id == '' or ref == '' or eco_symbol == ''
                        or aspect == '' or object_type == '' or taxon == ''
                        or date == '' or assigned_by == ''):
                    LOG.error(
                        "Missing required part of annotation on row %d:\n" +
                        '\t'.join(row), line_counter)
                    continue

                # deal with qualifier NOT, contributes_to, colocalizes_with
                if re.search(r'NOT', qualifier):
                    continue

                if dbase in self.localtt:
                    dbase = self.localtt[dbase]
                uniprotid = None
                gene_id = None
                if dbase == 'UniProtKB':
                    if id_map is not None and gene_num in id_map:
                        gene_id = id_map[gene_num]
                        uniprotid = ':'.join((dbase, gene_num))
                        (dbase, gene_num) = gene_id.split(':')
                        uniprot_hit += 1
                    else:
                        # LOG.warning(
                        #   "UniProt id %s  is without a 1:1 mapping to entrez/ensembl",
                        #    gene_num)
                        uniprot_miss += 1
                        continue
                else:
                    gene_num = gene_num.split(':')[-1]  # last
                    gene_id = ':'.join((dbase, gene_num))

                if self.test_mode and not (re.match(r'NCBIGene', gene_id)
                                           and int(gene_num) in self.test_ids):
                    continue

                model.addClassToGraph(gene_id, gene_symbol)
                if gene_name != '':
                    model.addDescription(gene_id, gene_name)
                if gene_synonym != '':
                    for syn in re.split(r'\|', gene_synonym):
                        model.addSynonym(gene_id, syn.strip())
                if re.search(r'\|', taxon):
                    # TODO add annotations with >1 taxon
                    LOG.info(">1 taxon (%s) on line %d.  skipping", taxon,
                             line_counter)
                else:
                    tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                    geno.addTaxon(tax_id, gene_id)

                assoc = Assoc(graph, self.name)
                assoc.set_subject(gene_id)
                assoc.set_object(go_id)

                try:
                    eco_id = eco_map[eco_symbol]
                    assoc.add_evidence(eco_id)
                except KeyError:
                    LOG.error("Evidence code (%s) not mapped", eco_symbol)

                refs = re.split(r'\|', ref)
                for ref in refs:
                    ref = ref.strip()
                    if ref != '':
                        prefix = ref.split(':')[0]  # sidestep 'MGI:MGI:'
                        if prefix in self.localtt:
                            prefix = self.localtt[prefix]
                        ref = ':'.join((prefix, ref.split(':')[-1]))
                        refg = Reference(graph, ref)
                        if prefix == 'PMID':
                            ref_type = self.globaltt['journal article']
                            refg.setType(ref_type)
                        refg.addRefToGraph()
                        assoc.add_source(ref)

                # TODO add the source of the annotations from assigned by?

                rel = self.resolve(aspect, mandatory=False)
                if rel is not None and aspect == rel:
                    if aspect == 'F' and re.search(r'contributes_to',
                                                   qualifier):
                        assoc.set_relationship(self.globaltt['contributes to'])
                    else:
                        LOG.error(
                            "Aspect: %s with qualifier: %s  is not recognized",
                            aspect, qualifier)
                elif rel is not None:
                    assoc.set_relationship(rel)
                    assoc.add_association_to_graph()
                else:
                    LOG.warning("No predicate for association \n%s\n",
                                str(assoc))

                if uniprotid is not None:
                    assoc.set_description('Mapped from ' + uniprotid)
                # object_type should be one of:
                # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
                # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
                # If the precise product type is unknown,
                # gene_product should be used
                #######################################################################

                # Derive G2P Associations from IMP annotations
                # in version 2.1 Pipe will indicate 'OR'
                # and Comma will indicate 'AND'.
                # in version 2.0, multiple values are separated by pipes
                # where the pipe has been used to mean 'AND'
                if eco_symbol == 'IMP' and with_or_from != '':
                    withitems = re.split(r'\|', with_or_from)
                    phenotypeid = go_id + 'PHENOTYPE'
                    # create phenotype associations
                    for i in withitems:
                        if i == '' or re.match(
                                r'(UniProtKB|WBPhenotype|InterPro|HGNC)', i):
                            LOG.warning(
                                "Don't know what having a uniprot id " +
                                "in the 'with' column means of %s", uniprotid)
                            continue
                        i = re.sub(r'MGI\:MGI\:', 'MGI:', i)
                        i = re.sub(r'WB:', 'WormBase:', i)

                        # for worms and fish, they might give a RNAi or MORPH
                        # in these cases make a reagent-targeted gene
                        if re.search('MRPHLNO|CRISPR|TALEN', i):
                            targeted_gene_id = zfin.make_targeted_gene_id(
                                gene_id, i)
                            geno.addReagentTargetedGene(
                                i, gene_id, targeted_gene_id)
                            # TODO PYLINT why is this needed?
                            # Redefinition of assoc type from
                            # dipper.models.assoc.Association.Assoc to
                            # dipper.models.assoc.G2PAssoc.G2PAssoc
                            assoc = G2PAssoc(graph, self.name,
                                             targeted_gene_id, phenotypeid)
                        elif re.search(r'WBRNAi', i):
                            targeted_gene_id = wbase.make_reagent_targeted_gene_id(
                                gene_id, i)
                            geno.addReagentTargetedGene(
                                i, gene_id, targeted_gene_id)
                            assoc = G2PAssoc(graph, self.name,
                                             targeted_gene_id, phenotypeid)
                        else:
                            assoc = G2PAssoc(graph, self.name, i, phenotypeid)
                        for ref in refs:
                            ref = ref.strip()
                            if ref != '':
                                prefix = ref.split(':')[0]
                                if prefix in self.localtt:
                                    prefix = self.localtt[prefix]
                                ref = ':'.join((prefix, ref.split(':')[-1]))
                                assoc.add_source(ref)
                                # experimental phenotypic evidence
                                assoc.add_evidence(self.globaltt[
                                    'experimental phenotypic evidence'])
                        assoc.add_association_to_graph()
                        # TODO should the G2PAssoc be
                        # the evidence for the GO assoc?

                if not self.test_mode and limit is not None and line_counter > limit:
                    break
            uniprot_tot = (uniprot_hit + uniprot_miss)
            uniprot_per = 0.0
            if uniprot_tot != 0:
                uniprot_per = 100.0 * uniprot_hit / uniprot_tot
            LOG.info(
                "Uniprot: %.2f%% of %i benefited from the 1/4 day id mapping download",
                uniprot_per, uniprot_tot)
        return
Ejemplo n.º 4
0
 def setUp(self):
     self.source = ZFIN('rdf_graph', True)
     self.source.settestonly(True)
     self._setDirToSource()
Ejemplo n.º 5
0
    def process_gaf(self, file, limit, id_map=None):

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        model = Model(g)
        geno = Genotype(g)
        logger.info("Processing Gene Associations from %s", file)
        line_counter = 0

        if 7955 in self.tax_ids:
            zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
        elif 6239 in self.tax_ids:
            wbase = WormBase(self.graph_type, self.are_bnodes_skized)

        with gzip.open(file, 'rb') as csvfile:
            filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                    delimiter='\t',
                                    quotechar='\"')
            for row in filereader:
                line_counter += 1
                # comments start with exclamation
                if re.match(r'!', ''.join(row)):
                    continue
                (db, gene_num, gene_symbol, qualifier, go_id, ref, eco_symbol,
                 with_or_from, aspect, gene_name, gene_synonym, object_type,
                 taxon, date, assigned_by, annotation_extension,
                 gene_product_form_id) = row

                # test for required fields
                if (db == '' or gene_num == '' or gene_symbol == ''
                        or go_id == '' or ref == '' or eco_symbol == ''
                        or aspect == '' or object_type == '' or taxon == ''
                        or date == '' or assigned_by == ''):
                    logger.error(
                        "Missing required part of annotation " +
                        "on row %d:\n" + '\t'.join(row), line_counter)
                    continue

                # deal with qualifier NOT, contributes_to, colocalizes_with
                if re.search(r'NOT', qualifier):
                    continue

                db = self.clean_db_prefix(db)
                uniprotid = None
                gene_id = None
                if db == 'UniProtKB':
                    mapped_ids = id_map.get(gene_num)
                    if id_map is not None and mapped_ids is not None:
                        if len(mapped_ids) == 1:
                            gene_id = mapped_ids[0]
                            uniprotid = ':'.join((db, gene_num))
                            gene_num = re.sub(r'\w+\:', '', gene_id)
                        elif len(mapped_ids) > 1:
                            # logger.warning(
                            #   "Skipping gene id mapped for >1 gene %s -> %s",
                            #    gene_num, str(mapped_ids))
                            continue
                    else:
                        continue
                elif db == 'MGI':
                    gene_num = re.sub(r'MGI:', '', gene_num)
                    gene_id = ':'.join((db, gene_num))
                    gene_id = re.sub(r'MGI\:MGI\:', 'MGI:', gene_id)
                else:
                    gene_id = ':'.join((db, gene_num))

                if self.testMode \
                        and not(
                            re.match(r'NCBIGene', gene_id) and
                            int(gene_num) in self.test_ids):
                    continue

                model.addClassToGraph(gene_id, gene_symbol)
                if gene_name != '':
                    model.addDescription(gene_id, gene_name)
                if gene_synonym != '':
                    for s in re.split(r'\|', gene_synonym):
                        model.addSynonym(gene_id, s.strip())
                if re.search(r'\|', taxon):
                    # TODO add annotations with >1 taxon
                    logger.info(">1 taxon (%s) on line %d.  skipping", taxon,
                                line_counter)
                else:
                    tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                    geno.addTaxon(tax_id, gene_id)

                assoc = Assoc(g, self.name)

                assoc.set_subject(gene_id)
                assoc.set_object(go_id)

                eco_id = self.map_go_evidence_code_to_eco(eco_symbol)
                if eco_id is not None:
                    assoc.add_evidence(eco_id)

                refs = re.split(r'\|', ref)
                for r in refs:
                    r = r.strip()
                    if r != '':
                        prefix = re.split(r':', r)[0]
                        r = re.sub(prefix, self.clean_db_prefix(prefix), r)
                        r = re.sub(r'MGI\:MGI\:', 'MGI:', r)
                        ref = Reference(g, r)
                        if re.match(r'PMID', r):
                            ref_type = Reference.ref_types['journal_article']
                            ref.setType(ref_type)
                        ref.addRefToGraph()
                        assoc.add_source(r)

                # TODO add the source of the annotations from assigned by?

                aspect_rel_map = {
                    'P': model.object_properties['involved_in'],  # involved in
                    'F': model.object_properties['enables'],  # enables
                    'C': model.object_properties['part_of']  # part of
                }

                if aspect not in aspect_rel_map:
                    logger.error("Aspect not recognized: %s", aspect)

                rel = aspect_rel_map.get(aspect)
                if aspect == 'F' and re.search(r'contributes_to', qualifier):
                    rel = model.object_properties['contributes_to']
                assoc.set_relationship(rel)
                if uniprotid is not None:
                    assoc.set_description('Mapped from ' + uniprotid)
                # object_type should be one of:
                # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
                # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
                # If the precise product type is unknown,
                # gene_product should be used

                assoc.add_association_to_graph()

                # Derive G2P Associations from IMP annotations
                # in version 2.1 Pipe will indicate 'OR'
                # and Comma will indicate 'AND'.
                # in version 2.0, multiple values are separated by pipes
                # where the pipe has been used to mean 'AND'
                if eco_symbol == 'IMP' and with_or_from != '':
                    withitems = re.split(r'\|', with_or_from)
                    phenotypeid = go_id + 'PHENOTYPE'
                    # create phenotype associations
                    for i in withitems:
                        if i == '' or \
                                re.match(
                                    r'(UniProtKB|WBPhenotype|InterPro|HGNC)',
                                    i):
                            logger.warning(
                                "Don't know what having a uniprot id " +
                                "in the 'with' column means of %s", uniprotid)
                            continue
                        i = re.sub(r'MGI\:MGI\:', 'MGI:', i)
                        i = re.sub(r'WB:', 'WormBase:', i)

                        # for worms and fish, they might give a RNAi or MORPH
                        # in these cases make a reagent-targeted gene
                        if re.search('MRPHLNO|CRISPR|TALEN', i):
                            targeted_gene_id = zfin.make_targeted_gene_id(
                                gene_id, i)
                            geno.addReagentTargetedGene(
                                i, gene_id, targeted_gene_id)
                            # TODO PYLINT why is this:
                            # Redefinition of assoc type from
                            # dipper.models.assoc.Association.Assoc to
                            # dipper.models.assoc.G2PAssoc.G2PAssoc
                            assoc = G2PAssoc(g, self.name, targeted_gene_id,
                                             phenotypeid)
                        elif re.search(r'WBRNAi', i):
                            targeted_gene_id = \
                                wbase.make_reagent_targeted_gene_id(
                                    gene_id, i)
                            geno.addReagentTargetedGene(
                                i, gene_id, targeted_gene_id)
                            assoc = G2PAssoc(g, self.name, targeted_gene_id,
                                             phenotypeid)
                        else:
                            assoc = G2PAssoc(g, self.name, i, phenotypeid)
                        for r in refs:
                            r = r.strip()
                            if r != '':
                                prefix = re.split(r':', r)[0]
                                r = re.sub(prefix,
                                           self.clean_db_prefix(prefix), r)
                                r = re.sub(r'MGI\:MGI\:', 'MGI:', r)
                                assoc.add_source(r)
                                # experimental phenotypic evidence
                                assoc.add_evidence("ECO:0000059")
                        assoc.add_association_to_graph()
                        # TODO should the G2PAssoc be
                        # the evidence for the GO assoc?

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        return
Ejemplo n.º 6
0
    def parse(self, limit=None):
        zfin_parser = ZFIN(self.graph_type, self.are_bnodes_skized)
        model = Model(self.graph)

        src_key = 'zpmap'  # keep same-as zfin.files[key]
        zfin_parser.zp_map = zfin_parser._load_zp_mappings(src_key)

        src_key = 'g2p_clean'
        raw = '/'.join((self.rawdir, self.files[src_key]['file']))
        LOG.info("Processing clean Geno to Pheno from file: %s", raw)
        col = self.files[src_key]['columns']
        collen = len(col)
        with open(raw, 'r', encoding="utf8") as csvfile:
            reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in reader:
                if len(row) != collen:
                    LOG.warning('Row: %i has unexpected format',
                                reader.line_num)
                # internal_id = row[col.index('ID')]
                # symbol = row[col.index('Gene Symbol')]
                gene_id = row[col.index('Gene ID')]
                subterm1_id = row[col.index(
                    'Affected Structure or Process 1 subterm ID')]
                # subterm1_label = row[col.index(
                #    'Affected Structure or Process 1 subterm Name')]
                pc_rel_id = row[col.index(
                    'Post-composed Relationship ID')].strip()
                # pc_rel_label = row[col.index('Post-composed Relationship Name')]
                superterm1_id = row[col.index(
                    'Affected Structure or Process 1 superterm ID')].strip()
                # superterm1_label = row[col.index(
                #    'Affected Structure or Process 1 superterm Name')]
                quality_id = row[col.index('Phenotype Keyword ID')].strip()
                # quality_name = row[col.index('Phenotype Keyword Name')]
                modifier = row[col.index('Phenotype Tag')].strip()
                subterm2_id = row[col.index(
                    'Affected Structure or Process 2 subterm ID')].strip()
                # subterm2_label = row[col.index(
                #    'Affected Structure or Process 2 subterm name')]
                pc_rel2_id = row[col.index(
                    'Post-composed Relationship (rel) ID')]
                # pc_rel2_label = row[col.index(
                #   'Post-composed Relationship (rel) Name')]
                superterm2_id = row[col.index(
                    'Affected Structure or Process 2 superterm ID')].strip()
                # superterm2_label = row[col.index(
                #    'Affected Structure or Process 2 superterm name')]
                # fish_id = row[col.index('Fish ID')]
                # fish_label = row[col.index('Fish Display Name')]
                start_stage = row[col.index('Start Stage ID')]
                # end_stage = row[col.index('End Stage ID')]
                # environment = row[col.index('Fish Environment ID')]
                pub_id = row[col.index('Publication ID')].strip()
                # figure_id = row[col.index('Figure ID')]

                if modifier != 'abnormal':
                    LOG.warning(
                        "skipping phenotype with modifier %s != abnormal ",
                        modifier)
                    continue

                zp_id = zfin_parser._map_octuple_to_phenotype(
                    subterm1_id, pc_rel_id, superterm1_id, quality_id,
                    subterm2_id, pc_rel2_id, superterm2_id, modifier)

                gene_curie = "ZFIN:{0}".format(gene_id)
                model.makeLeader(gene_curie)
                pub_curie = "ZFIN:{0}".format(pub_id)
                if zp_id:
                    assoc = G2PAssoc(self.graph, self.name, gene_curie, zp_id)
                    if pub_id:
                        reference = Reference(self.graph, pub_curie,
                                              self.globaltt['document'])
                        reference.addRefToGraph()
                        assoc.add_source(pub_curie)

                    assoc.add_evidence(
                        self.globaltt['experimental phenotypic evidence'])
                    assoc.add_association_to_graph()