Python Reference.setType Examples

Programming Language: Python

Namespace/Package Name: dipper.models.Reference

Class/Type: Reference

Method/Function: setType

Examples at hotexamples.com: 12

Python Reference.setType - 12 examples found. These are the top rated real world Python examples of dipper.models.Reference.Reference.setType extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

Reference(30)

addRefToGraph(30)

setType(7)

addPage(5)

setTitle(4)

setAuthorList(2)

setShortCitation(2)

setYear(1)

Example #1

Show file

File: GeneOntology.py Project: david-w-millar/dipper

    def process_gaf(self, file, limit, id_map=None, eco_map=None):

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph

        model = Model(graph)
        geno = Genotype(graph)
        LOG.info("Processing Gene Associations from %s", file)
        line_counter = 0
        uniprot_hit = 0
        uniprot_miss = 0
        if '7955' in self.tax_ids:
            zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
        if '6239' in self.tax_ids:
            wbase = WormBase(self.graph_type, self.are_bnodes_skized)

        with gzip.open(file, 'rb') as csvfile:
            filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                    delimiter='\t',
                                    quotechar='\"')
            for row in filereader:
                line_counter += 1
                # comments start with exclamation
                if re.match(r'!', ''.join(row)):
                    continue

                if len(row) > 17 or len(row) < 15:
                    LOG.warning(
                        "Wrong number of columns %i, expected 15 or 17\n%s",
                        len(row), row)
                    continue

                if 17 > len(row) >= 15:
                    row += [""] * (17 - len(row))

                (dbase, gene_num, gene_symbol, qualifier, go_id, ref,
                 eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
                 object_type, taxon, date, assigned_by, annotation_extension,
                 gene_product_form_id) = row

                # test for required fields
                if (dbase == '' or gene_num == '' or gene_symbol == ''
                        or go_id == '' or ref == '' or eco_symbol == ''
                        or aspect == '' or object_type == '' or taxon == ''
                        or date == '' or assigned_by == ''):
                    LOG.error(
                        "Missing required part of annotation on row %d:\n" +
                        '\t'.join(row), line_counter)
                    continue

                # deal with qualifier NOT, contributes_to, colocalizes_with
                if re.search(r'NOT', qualifier):
                    continue

                if dbase in self.localtt:
                    dbase = self.localtt[dbase]
                uniprotid = None
                gene_id = None
                if dbase == 'UniProtKB':
                    if id_map is not None and gene_num in id_map:
                        gene_id = id_map[gene_num]
                        uniprotid = ':'.join((dbase, gene_num))
                        (dbase, gene_num) = gene_id.split(':')
                        uniprot_hit += 1
                    else:
                        # LOG.warning(
                        #   "UniProt id %s  is without a 1:1 mapping to entrez/ensembl",
                        #    gene_num)
                        uniprot_miss += 1
                        continue
                else:
                    gene_num = gene_num.split(':')[-1]  # last
                    gene_id = ':'.join((dbase, gene_num))

                if self.test_mode and not (re.match(r'NCBIGene', gene_id)
                                           and int(gene_num) in self.test_ids):
                    continue

                model.addClassToGraph(gene_id, gene_symbol)
                if gene_name != '':
                    model.addDescription(gene_id, gene_name)
                if gene_synonym != '':
                    for syn in re.split(r'\|', gene_synonym):
                        model.addSynonym(gene_id, syn.strip())
                if re.search(r'\|', taxon):
                    # TODO add annotations with >1 taxon
                    LOG.info(">1 taxon (%s) on line %d.  skipping", taxon,
                             line_counter)
                else:
                    tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                    geno.addTaxon(tax_id, gene_id)

                assoc = Assoc(graph, self.name)
                assoc.set_subject(gene_id)
                assoc.set_object(go_id)

                try:
                    eco_id = eco_map[eco_symbol]
                    assoc.add_evidence(eco_id)
                except KeyError:
                    LOG.error("Evidence code (%s) not mapped", eco_symbol)

                refs = re.split(r'\|', ref)
                for ref in refs:
                    ref = ref.strip()
                    if ref != '':
                        prefix = ref.split(':')[0]  # sidestep 'MGI:MGI:'
                        if prefix in self.localtt:
                            prefix = self.localtt[prefix]
                        ref = ':'.join((prefix, ref.split(':')[-1]))
                        refg = Reference(graph, ref)
                        if prefix == 'PMID':
                            ref_type = self.globaltt['journal article']
                            refg.setType(ref_type)
                        refg.addRefToGraph()
                        assoc.add_source(ref)

                # TODO add the source of the annotations from assigned by?

                rel = self.resolve(aspect, mandatory=False)
                if rel is not None and aspect == rel:
                    if aspect == 'F' and re.search(r'contributes_to',
                                                   qualifier):
                        assoc.set_relationship(self.globaltt['contributes to'])
                    else:
                        LOG.error(
                            "Aspect: %s with qualifier: %s  is not recognized",
                            aspect, qualifier)
                elif rel is not None:
                    assoc.set_relationship(rel)
                    assoc.add_association_to_graph()
                else:
                    LOG.warning("No predicate for association \n%s\n",
                                str(assoc))

                if uniprotid is not None:
                    assoc.set_description('Mapped from ' + uniprotid)
                # object_type should be one of:
                # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
                # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
                # If the precise product type is unknown,
                # gene_product should be used
                #######################################################################

                # Derive G2P Associations from IMP annotations
                # in version 2.1 Pipe will indicate 'OR'
                # and Comma will indicate 'AND'.
                # in version 2.0, multiple values are separated by pipes
                # where the pipe has been used to mean 'AND'
                if eco_symbol == 'IMP' and with_or_from != '':
                    withitems = re.split(r'\|', with_or_from)
                    phenotypeid = go_id + 'PHENOTYPE'
                    # create phenotype associations
                    for i in withitems:
                        if i == '' or re.match(
                                r'(UniProtKB|WBPhenotype|InterPro|HGNC)', i):
                            LOG.warning(
                                "Don't know what having a uniprot id " +
                                "in the 'with' column means of %s", uniprotid)
                            continue
                        i = re.sub(r'MGI\:MGI\:', 'MGI:', i)
                        i = re.sub(r'WB:', 'WormBase:', i)

                        # for worms and fish, they might give a RNAi or MORPH
                        # in these cases make a reagent-targeted gene
                        if re.search('MRPHLNO|CRISPR|TALEN', i):
                            targeted_gene_id = zfin.make_targeted_gene_id(
                                gene_id, i)
                            geno.addReagentTargetedGene(
                                i, gene_id, targeted_gene_id)
                            # TODO PYLINT why is this needed?
                            # Redefinition of assoc type from
                            # dipper.models.assoc.Association.Assoc to
                            # dipper.models.assoc.G2PAssoc.G2PAssoc
                            assoc = G2PAssoc(graph, self.name,
                                             targeted_gene_id, phenotypeid)
                        elif re.search(r'WBRNAi', i):
                            targeted_gene_id = wbase.make_reagent_targeted_gene_id(
                                gene_id, i)
                            geno.addReagentTargetedGene(
                                i, gene_id, targeted_gene_id)
                            assoc = G2PAssoc(graph, self.name,
                                             targeted_gene_id, phenotypeid)
                        else:
                            assoc = G2PAssoc(graph, self.name, i, phenotypeid)
                        for ref in refs:
                            ref = ref.strip()
                            if ref != '':
                                prefix = ref.split(':')[0]
                                if prefix in self.localtt:
                                    prefix = self.localtt[prefix]
                                ref = ':'.join((prefix, ref.split(':')[-1]))
                                assoc.add_source(ref)
                                # experimental phenotypic evidence
                                assoc.add_evidence(self.globaltt[
                                    'experimental phenotypic evidence'])
                        assoc.add_association_to_graph()
                        # TODO should the G2PAssoc be
                        # the evidence for the GO assoc?

                if not self.test_mode and limit is not None and line_counter > limit:
                    break
            uniprot_tot = (uniprot_hit + uniprot_miss)
            uniprot_per = 0.0
            if uniprot_tot != 0:
                uniprot_per = 100.0 * uniprot_hit / uniprot_tot
            LOG.info(
                "Uniprot: %.2f%% of %i benefited from the 1/4 day id mapping download",
                uniprot_per, uniprot_tot)
        return

Example #2

Show file

    def process_allele_phenotype(self, limit=None):
        """
        This file compactly lists variant to phenotype associations,
        such that in a single row, there may be >1 variant listed
        per phenotype and paper.  This indicates that each variant is
        individually assocated with the given phenotype,
        as listed in 1+ papers.
        (Not that the combination of variants is producing the phenotype.)
        :param limit:
        :return:

        """

        raw = '/'.join((self.rawdir, self.files['allele_pheno']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        logger.info("Processing Allele phenotype associations")
        line_counter = 0
        geno = Genotype(g)
        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                if re.match(r'!', ''.join(row)):  # header
                    continue
                line_counter += 1
                (db, gene_num, gene_symbol, is_not, phenotype_id, ref,
                 eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
                 gene_class, taxon, date, assigned_by, blank, blank2) = row

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                # TODO add NOT phenotypes
                if is_not == 'NOT':
                    continue

                eco_id = None
                if eco_symbol == 'IMP':
                    eco_id = 'ECO:0000015'
                elif eco_symbol.strip() != '':
                    logger.warning("Encountered an ECO code we don't have: %s",
                                   eco_symbol)

                # according to the GOA spec, persons are not allowed to be
                # in the reference column, therefore they the variant and
                # persons are swapped between the reference and with column.
                # we unswitch them here.
                temp_var = temp_ref = None
                if re.search(r'WBVar|WBRNAi', ref):
                    temp_var = ref
                    # move the paper from the with column into the ref
                if re.search(r'WBPerson', with_or_from):
                    temp_ref = with_or_from
                if temp_var is not None or temp_ref is not None:
                    with_or_from = temp_var
                    ref = temp_ref

                allele_list = re.split(r'\|', with_or_from)
                if len(allele_list) == 0:
                    logger.error(
                        "Missing alleles from phenotype assoc at line %d",
                        line_counter)
                    continue
                else:
                    for a in allele_list:
                        allele_num = re.sub(r'WB:', '', a.strip())
                        allele_id = 'WormBase:' + allele_num
                        gene_id = 'WormBase:' + gene_num

                        if re.search(r'WBRNAi', allele_id):
                            # make the reagent-targeted gene,
                            # & annotate that instead of the RNAi item directly
                            rnai_num = re.sub(r'WormBase:', '', allele_id)
                            rnai_id = allele_id
                            rtg_id = self.make_reagent_targeted_gene_id(
                                gene_num, rnai_num)
                            geno.addReagentTargetedGene(
                                rnai_id, 'WormBase:' + gene_num, rtg_id)
                            geno.addGeneTargetingReagent(
                                rnai_id, None, geno.genoparts['RNAi_reagent'],
                                gene_id)
                            allele_id = rtg_id
                        elif re.search(r'WBVar', allele_id):
                            # this may become deprecated by using wormmine
                            # make the allele to gene relationship
                            # the WBVars are really sequence alterations

                            # the public name will come from elsewhere
                            geno.addSequenceAlteration(allele_id, None)
                            vl_id = '_:' + '-'.join((gene_num, allele_num))
                            geno.addSequenceAlterationToVariantLocus(
                                allele_id, vl_id)
                            geno.addAlleleOfGene(vl_id, gene_id)
                        else:
                            logger.warning(
                                "Some kind of allele I don't recognize: %s",
                                allele_num)
                            continue
                        assoc = G2PAssoc(g, self.name, allele_id, phenotype_id)

                        if eco_id is not None:
                            assoc.add_evidence(eco_id)

                        if ref is not None and ref != '':
                            ref = re.sub(r'(WB:|WB_REF:)', 'WormBase:', ref)
                            reference = Reference(g, ref)
                            if re.search(r'Person', ref):
                                reference.setType(
                                    reference.ref_types['person'])
                                # also add
                                # inferred from background scientific knowledge
                                assoc.add_evidence('ECO:0000001')
                            reference.addRefToGraph()
                            assoc.add_source(ref)

                        assoc.add_association_to_graph()

                        # finish looping through all alleles

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return

Example #3

Show file

File: WormBase.py Project: JervenBolleman/dipper

    def process_allele_phenotype(self, limit=None):
        """
        This file compactly lists variant to phenotype associations,
        such that in a single row, there may be >1 variant listed
        per phenotype and paper.  This indicates that each variant is
        individually assocated with the given phenotype,
        as listed in 1+ papers.
        (Not that the combination of variants is producing the phenotype.)
        :param limit:
        :return:

        """

        raw = '/'.join((self.rawdir, self.files['allele_pheno']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        # gu = GraphUtils(curie_map.get())  # TODO unused

        logger.info("Processing Allele phenotype associations")
        line_counter = 0
        geno = Genotype(g)
        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                if re.match(r'!', ''.join(row)):  # header
                    continue
                line_counter += 1
                (db, gene_num, gene_symbol, is_not, phenotype_id, ref,
                 eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
                 gene_class, taxon, date, assigned_by, blank, blank2) = row

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                # TODO add NOT phenotypes
                if is_not == 'NOT':
                    continue

                eco_id = None
                if eco_symbol == 'IMP':
                    eco_id = 'ECO:0000015'
                elif eco_symbol.strip() != '':
                    logger.warning(
                        "Encountered an ECO code we don't have: %s",
                        eco_symbol)

                # according to the GOA spec, persons are not allowed to be
                # in the reference column, therefore they the variant and
                # persons are swapped between the reference and with column.
                # we unswitch them here.
                temp_var = temp_ref = None
                if re.search(r'WBVar|WBRNAi', ref):
                    temp_var = ref
                    # move the paper from the with column into the ref
                if re.search(r'WBPerson', with_or_from):
                    temp_ref = with_or_from
                if temp_var is not None or temp_ref is not None:
                    with_or_from = temp_var
                    ref = temp_ref

                allele_list = re.split(r'\|', with_or_from)
                if len(allele_list) == 0:
                    logger.error(
                        "Missing alleles from phenotype assoc at line %d",
                        line_counter)
                    continue
                else:
                    for a in allele_list:
                        allele_num = re.sub(r'WB:', '', a.strip())
                        allele_id = 'WormBase:'+allele_num
                        gene_id = 'WormBase:'+gene_num

                        if re.search(r'WBRNAi', allele_id):
                            # make the reagent-targeted gene,
                            # & annotate that instead of the RNAi item directly
                            rnai_num = re.sub(r'WormBase:', '', allele_id)
                            rnai_id = allele_id
                            rtg_id = self.make_reagent_targeted_gene_id(
                                gene_num, rnai_num, self.nobnodes)
                            geno.addReagentTargetedGene(
                                rnai_id, 'WormBase:'+gene_num, rtg_id)
                            geno.addGeneTargetingReagent(
                                rnai_id, None, geno.genoparts['RNAi_reagent'],
                                gene_id)
                            allele_id = rtg_id
                        elif re.search(r'WBVar', allele_id):
                            # this may become deprecated by using wormmine
                            # make the allele to gene relationship
                            # the WBVars are really sequence alterations

                            # the public name will come from elsewhere
                            geno.addSequenceAlteration(allele_id, None)
                            vl_id = '_'+'-'.join((gene_num, allele_num))
                            if self.nobnodes:
                                vl_id = ':'+vl_id
                            geno.addSequenceAlterationToVariantLocus(
                                allele_id, vl_id)
                            geno.addAlleleOfGene(vl_id, gene_id)
                        else:
                            logger.warning(
                                "Some kind of allele I don't recognize: %s",
                                allele_num)
                            continue
                        assoc = G2PAssoc(self.name, allele_id, phenotype_id)

                        if eco_id is not None:
                            assoc.add_evidence(eco_id)

                        if ref is not None and ref != '':
                            ref = re.sub(r'(WB:|WB_REF:)', 'WormBase:', ref)
                            r = Reference(ref)
                            if re.search(r'Person', ref):
                                r.setType(r.ref_types['person'])
                                # also add
                                # inferred from background scientific knowledge
                                assoc.add_evidence('ECO:0000001')
                            r.addRefToGraph(g)
                            assoc.add_source(ref)

                        assoc.add_association_to_graph(g)

                        # finish looping through all alleles

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return

Example #4

Show file

File: WormBase.py Project: nicholsn/dipper

    def process_allele_phenotype(self, limit=None):
        """
        This file compactly lists variant to phenotype associations,
        such that in a single row, there may be >1 variant listed
        per phenotype and paper.  This indicates that each variant is
        individually assocated with the given phenotype,
        as listed in 1+ papers.
        (Not that the combination of variants is producing the phenotype.)
        :param limit:
        :return:

        """
        src_key = 'allele_pheno'
        raw = '/'.join((self.rawdir, self.files[src_key]['file']))
        col = self.files[src_key]['columns']

        graph = self.graph
        model = Model(self.graph)

        LOG.info("Processing Allele phenotype associations")
        geno = Genotype(graph)
        with open(raw, 'r') as csvfile:
            reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            row = next(reader)
            if row[0] != '!gaf-version: 2.0':
                LOG.error('Not a vlaid gaf v2.0 formatted file: %s', raw)
                # raise
            for row in reader:
                if row[0][0] == '!':
                    continue
                # db = row[col.index('DB')]
                gene_num = row[col.index('DB Object ID')]
                # gene_symbol = row[col.index('DB Object Symbol')]
                is_not = row[col.index('Qualifier')]
                phenotype_id = row[col.index('GO ID')]
                ref = row[col.index('DB:Reference (|DB:Reference)')].strip()
                eco_symbol = row[col.index('Evidence Code')]
                with_or_from = row[col.index('With (or) From')]
                # aspect = row[col.index('Aspect')]
                # gene_name = row[col.index('DB Object Name')]
                # gene_synonym = row[col.index('DB Object Synonym (|Synonym)')]
                # gene_class = row[col.index('DB Object Type')]
                # taxon = row[col.index('Taxon(|taxon)')]
                # date = row[col.index('Date')]
                # assigned_by = row[col.index('Assigned By')]
                # blank = row[col.index('Annotation Extension')]
                # blank2 = row[col.index('Gene Product Form ID')]

                # TODO add NOT phenotypes
                if is_not == 'NOT':
                    continue

                eco_symbol = eco_symbol.strip()
                eco_curie = None
                if eco_symbol.strip() != '' and eco_symbol in self.gaf_eco:
                    eco_curie = self.gaf_eco[eco_symbol]
                else:
                    LOG.warning(
                        'Evidence code %s is not found in the (gaf) gaf_eco',
                        eco_symbol)

                # according to the GOA spec, persons are not allowed to be
                # in the reference column, therefore they the variant and
                # persons are swapped between the reference and with column.
                # we unswitch them here.
                temp_var = temp_ref = None
                if re.search(r'WBVar|WBRNAi', ref):
                    temp_var = ref
                    # move the paper from the with column into the ref
                if re.search(r'WBPerson', with_or_from):
                    temp_ref = with_or_from
                if temp_var is not None or temp_ref is not None:
                    with_or_from = temp_var
                    ref = temp_ref

                allele_list = re.split(r'\|', with_or_from)
                if len(allele_list) == 0:
                    LOG.error(
                        "Missing alleles from phenotype assoc at line %d",
                        reader.line_num)
                    continue
                else:
                    for allele in allele_list:
                        allele_num = re.sub(r'WB:', '', allele.strip())
                        allele_id = 'WormBase:' + allele_num
                        gene_id = 'WormBase:' + gene_num

                        if re.search(r'WBRNAi', allele_id):

                            # @kshefchek - removing this blank node
                            # in favor of simpler modeling
                            # make the WormBase:WBRNAi* id
                            # a self.globaltt['reagent_targeted_gene'], and attach
                            # phenotype to this ID

                            # Previous model - make a bnode reagent-targeted gene,
                            # & annotate that instead of the RNAi item directly
                            # rnai_num = re.sub(r'WormBase:', '', allele_id)
                            # rnai_id = allele_id
                            # rtg_id = self.make_reagent_targeted_gene_id(
                            #    gene_num, rnai_num)
                            # geno.addReagentTargetedGene(
                            #    rnai_id, 'WormBase:' + gene_num, rtg_id)
                            # allele_id = rtg_id

                            # Could type the IRI as both the reagant and reagant
                            # targeted gene but not sure if this needed
                            # geno.addGeneTargetingReagent(
                            #   allele_id, None, self.globaltt['RNAi_reagent'], gene_id)

                            model.addIndividualToGraph(
                                allele_id, None,
                                self.globaltt['reagent_targeted_gene'])

                            self.graph.addTriple(
                                allele_id,
                                self.globaltt['is_expression_variant_of'],
                                gene_id)

                        elif re.search(r'WBVar', allele_id):
                            # this may become deprecated by using wormmine
                            # make the allele to gene relationship
                            # the WBVars are really sequence alterations
                            # the public name will come from elsewhere

                            # @kshefchek - removing this blank node
                            # in favor of simpler modeling, treat variant
                            # like an allele
                            # vl_id = '_:'+'-'.join((gene_num, allele_num))
                            # geno.addSequenceAlterationToVariantLocus(
                            #    allele_id, vl_id)
                            # geno.addAlleleOfGene(vl_id, gene_id)

                            geno.addSequenceAlteration(allele_id, None)
                            geno.addAlleleOfGene(allele_id, gene_id)
                        else:
                            LOG.warning(
                                "Some kind of allele I don't recognize: %s",
                                allele_num)
                            continue
                        assoc = G2PAssoc(graph, self.name, allele_id,
                                         phenotype_id)

                        if eco_curie is not None:
                            assoc.add_evidence(eco_curie)

                        if ref is not None and ref != '':
                            ref = re.sub(r'(WB:|WB_REF:)', 'WormBase:', ref)
                            reference = Reference(graph, ref)
                            if re.search(r'Person', ref):
                                reference.setType(self.globaltt['person'])
                                assoc.add_evidence(self.globaltt[
                                    'inference from background scientific knowledge']
                                                   )
                            reference.addRefToGraph()
                            assoc.add_source(ref)

                        assoc.add_association_to_graph()

                        # finish looping through all alleles

                if limit is not None and reader.line_num > limit:
                    break

Example #5

Show file

    def _process_data(self, source, limit=None):
        """
        This function will process the data files from Coriell.
        We make the assumption that any alleles listed are variants
        (alternates to w.t.)

        Triples: (examples)

        :NIGMSrepository a CLO_0000008 #repository
        label : NIGMS Human Genetic Cell Repository
        foaf:page
         https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

        line_id a CL_0000057,  #fibroblast line
            derives_from patient_id
            part_of :NIGMSrepository
            RO:model_of OMIM:disease_id

        patient id a foaf:person,
            label: "fibroblast from patient 12345 with disease X"
            member_of family_id  #what is the right thing here?
            SIO:race EFO:caucasian  #subclass of EFO:0001799
            in_taxon NCBITaxon:9606
            dc:description Literal(remark)
            RO:has_phenotype OMIM:disease_id
            GENO:has_genotype genotype_id

        family_id a owl:NamedIndividual
            foaf:page
             "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

        genotype_id a intrinsic_genotype
            GENO:has_alternate_part allelic_variant_id
            we don't necessarily know much about the genotype,
            other than the allelic variant. also there's the sex here

        pub_id mentions cell_line_id

        :param raw:
        :param limit:
        :return:

        """
        raw = '/'.join((self.rawdir, self.files[source]['file']))

        LOG.info("Processing Data from %s", raw)

        if self.testMode:  # set the graph to build
            graph = self.testgraph
        else:
            graph = self.graph

        family = Family(graph)
        model = Model(graph)

        line_counter = 1
        geno = Genotype(graph)
        diputil = DipperUtil()
        col = self.files[source]['columns']
        # affords access with
        # x = row[col.index('x')].strip()

        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar=r'"')
            # we can keep a close watch on changing file formats
            fileheader = next(filereader, None)
            fileheader = [c.lower() for c in fileheader]
            if col != fileheader:  # assert
                LOG.error('Expected  %s to have columns: %s', raw, col)
                LOG.error('But Found %s to have columns: %s', raw, fileheader)
                raise AssertionError('Incomming data headers have changed.')

            for row in filereader:
                line_counter += 1
                if len(row) != len(col):
                    LOG.warning('Expected %i values but find %i in  row %i',
                                len(col), len(row), line_counter)
                    continue

                # (catalog_id, description, omim_number, sample_type,
                # cell_line_available, dna_in_stock, dna_ref, gender, age,
                # race, ethnicity, affected, karyotype, relprob, mutation,
                # gene, family_id, collection, url, cat_remark, pubmed_ids,
                # family_member, variant_id, dbsnp_id, species) = row

                # example:
                # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,
                #       ,Female,26 YR,Caucasian,,,,
                # parent,,,39,NIGMS Human Genetic Cell Repository,
                # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
                # 46;XX; clinically normal mother of a child with Hurler syndrome;
                #       proband not in Repository,,
                # 2,,18343,H**o sapiens

                catalog_id = row[col.index('catalog_id')].strip()

                if self.testMode and catalog_id not in self.test_lines:
                    # skip rows not in our test lines, when in test mode
                    continue

                # ###########    BUILD REQUIRED VARIABLES    ###########

                # Make the cell line ID
                cell_line_id = 'Coriell:' + catalog_id
                # Map the cell/sample type
                cell_type = self.resolve(row[col.index('sample_type')].strip())
                # on fail cell_type = self.globaltt['cell'] ?

                # Make a cell line label
                collection = row[col.index('collection')].strip()
                line_label = collection.partition(' ')[0] + '-' + catalog_id

                # Map the repository/collection
                repository = self.localtt[collection]

                # patients are uniquely identified by one of:
                # dbsnp id (which is == an individual haplotype)
                # family id + family member (if present) OR
                # probands are usually family member zero
                # cell line id
                # since some patients have >1 cell line derived from them,
                # we must make sure that the genotype is attached to
                # the patient, and can be inferred to the cell line
                # examples of repeated patients are:
                #   famid=1159, member=1; fam=152,member=1

                # Make the patient ID

                # make an anonymous patient
                patient_id = '_:person'
                fam_id = row[col.index('fam')].strip()
                fammember = row[col.index('fammember')].strip()
                if fam_id != '':
                    patient_id = '-'.join((patient_id, fam_id, fammember))
                else:
                    # make an anonymous patient
                    patient_id = '-'.join((patient_id, catalog_id))

                # properties of the individual patients:  sex, family id,
                # member/relproband, description descriptions are
                # really long and ugly SCREAMING text, so need to clean up
                # the control cases are so odd with this labeling scheme;
                # but we'll deal with it as-is for now.
                description = row[col.index('description')].strip()
                short_desc = (description.split(';')[0]).capitalize()

                gender = row[col.index('gender')].strip().lower()
                affected = row[col.index('affected')].strip()
                relprob = row[col.index('relprob')].strip()

                if affected == '':
                    affected = 'unspecified'
                elif affected in self.localtt:
                    affected = self.localtt[affected]
                else:
                    LOG.warning('Novel Affected status  %s at row: %i of %s',
                                affected, line_counter, raw)
                patient_label = ' '.join((affected, gender, relprob))
                if relprob == 'proband':
                    patient_label = ' '.join(
                        (patient_label.strip(), 'with', short_desc))
                else:
                    patient_label = ' '.join(
                        (patient_label.strip(), 'of proband with', short_desc))

                # #############    BUILD THE CELL LINE    #############

                # Adding the cell line as a typed individual.
                cell_line_reagent_id = self.globaltt['cell line']

                model.addIndividualToGraph(cell_line_id, line_label,
                                           cell_line_reagent_id)

                # add the equivalent id == dna_ref
                dna_ref = row[col.index('dna_ref')].strip()
                if dna_ref != '' and dna_ref != catalog_id:
                    equiv_cell_line = 'Coriell:' + dna_ref
                    # some of the equivalent ids are not defined
                    # in the source data; so add them
                    model.addIndividualToGraph(equiv_cell_line, None,
                                               cell_line_reagent_id)
                    model.addSameIndividual(cell_line_id, equiv_cell_line)

                # Cell line derives from patient
                geno.addDerivesFrom(cell_line_id, patient_id)
                geno.addDerivesFrom(cell_line_id, cell_type)

                # Cell line a member of repository
                family.addMember(repository, cell_line_id)

                cat_remark = row[col.index('cat_remark')].strip()

                if cat_remark != '':
                    model.addDescription(cell_line_id, cat_remark)

                # Cell age_at_sampling
                # TODO add the age nodes when modeled properly in #78
                # if (age != ''):
                # this would give a BNode that is an instance of Age.
                # but i don't know how to connect
                # the age node to the cell line? we need to ask @mbrush
                # age_id = '_'+re.sub('\s+','_',age)
                # gu.addIndividualToGraph(
                #   graph,age_id,age,self.globaltt['age'])
                # gu.addTriple(
                #   graph,age_id,self.globaltt['has measurement value'],age,
                #   True)

                # #############    BUILD THE PATIENT    #############

                # Add the patient ID as an individual.
                model.addPerson(patient_id, patient_label)
                # TODO map relationship to proband as a class
                # (what ontology?)

                # Add race of patient
                # FIXME: Adjust for subcategories based on ethnicity field
                # EDIT: There are 743 different entries for ethnicity...
                # Too many to map?
                # Add ethnicity as literal in addition to the mapped race?
                # Adjust the ethnicity txt (if using)
                # to initial capitalization to remove ALLCAPS

                # TODO race should go into the individual's background
                # and abstracted out to the Genotype class punting for now.
                # if race != '':
                #    mapped_race = self.resolve(race)
                #    if mapped_race is not None:
                #        gu.addTriple(
                #           g,patient_id,self.globaltt['race'], mapped_race)
                #        model.addSubClass(
                #           mapped_race,self.globaltt['ethnic_group'])

                # #############    BUILD THE FAMILY    #############

                # Add triples for family_id, if present.
                if fam_id != '':
                    family_comp_id = 'CoriellFamily:' + fam_id

                    family_label = ' '.join(
                        ('Family of proband with', short_desc))

                    # Add the family ID as a named individual
                    model.addIndividualToGraph(family_comp_id, family_label,
                                               self.globaltt['family'])

                    # Add the patient as a member of the family
                    family.addMemberOf(patient_id, family_comp_id)

                # #############    BUILD THE GENOTYPE   #############

                # the important things to pay attention to here are:
                # karyotype = chr rearrangements  (somatic?)
                # mutation = protein-level mutation as a label,
                # often from omim
                # gene = gene symbol - TODO get id
                # variant_id = omim variant ids (; delimited)
                # dbsnp_id = snp individual ids = full genotype?

                # note GM00633 is a good example of chromosomal variation
                # - do we have enough to capture this?
                # GM00325 has both abnormal karyotype and variation

                # make an assumption that if the taxon is blank,
                # that it is human!
                species = row[col.index('species')].strip()
                if species is None or species == '':
                    species = 'H**o sapiens'
                taxon = self.resolve(species)

                # if there's a dbSNP id,
                # this is actually the individual's genotype
                genotype_id = None
                genotype_label = None

                dbsnp_id = row[col.index('dbsnp_id')].strip()
                if dbsnp_id != '':
                    genotype_id = 'dbSNPIndividual:' + dbsnp_id

                omim_map = {}
                gvc_id = None

                # some of the karyotypes are encoded
                # with terrible hidden codes. remove them here
                # i've seen a <98> character
                karyotype = row[col.index('karyotype')].strip()
                karyotype = diputil.remove_control_characters(karyotype)
                karyotype_id = None
                if karyotype.strip() != '':
                    karyotype_id = '_:' + re.sub('MONARCH:', '',
                                                 self.make_id(karyotype))
                    # add karyotype as karyotype_variation_complement
                    model.addIndividualToGraph(
                        karyotype_id, karyotype,
                        self.globaltt['karyotype_variation_complement'])
                    # TODO break down the karyotype into parts
                    # and map into GENO. depends on #77

                    # place the karyotype in a location(s).
                    karyo_chrs = self._get_affected_chromosomes_from_karyotype(
                        karyotype)
                    for chrom in karyo_chrs:
                        chr_id = makeChromID(chrom, taxon, 'CHR')
                        # add an anonymous sequence feature,
                        # each located on chr
                        karyotype_feature_id = '-'.join((karyotype_id, chrom))
                        karyotype_feature_label = \
                            'some karyotype alteration on chr' + str(chrom)
                        feat = Feature(graph, karyotype_feature_id,
                                       karyotype_feature_label,
                                       self.globaltt['sequence_alteration'])
                        feat.addFeatureStartLocation(None, chr_id)
                        feat.addFeatureToGraph()
                        geno.addParts(karyotype_feature_id, karyotype_id,
                                      self.globaltt['has_variant_part'])

                gene = row[col.index('gene')].strip()
                mutation = row[col.index('mutation')].strip()
                if gene != '':
                    vl = gene + '(' + mutation + ')'

                # fix the variant_id so it's always in the same order
                variant_id = row[col.index('variant_id')].strip()
                vids = variant_id.split(';')
                variant_id = ';'.join(sorted(list(set(vids))))

                if karyotype.strip() != '' and not self._is_normal_karyotype(
                        karyotype):

                    gvc_id = karyotype_id
                    if variant_id != '':
                        gvc_id = '_:' + variant_id.replace(';', '-') + '-' \
                            + re.sub(r'\w*:', '', karyotype_id)
                    if mutation.strip() != '':
                        gvc_label = '; '.join((vl, karyotype))
                    else:
                        gvc_label = karyotype
                elif variant_id.strip() != '':
                    gvc_id = '_:' + variant_id.replace(';', '-')
                    gvc_label = vl
                else:
                    # wildtype?
                    pass

                # add the karyotype to the gvc.
                # use reference if normal karyotype
                karyo_rel = self.globaltt['has_variant_part']
                if self._is_normal_karyotype(karyotype):
                    karyo_rel = self.globaltt['has_reference_part']
                if karyotype_id is not None \
                        and not self._is_normal_karyotype(karyotype) \
                        and gvc_id is not None and karyotype_id != gvc_id:
                    geno.addParts(karyotype_id, gvc_id, karyo_rel)

                if variant_id.strip() != '':
                    # split the variants & add them as part of the genotype
                    # we don't necessarily know their zygosity,
                    # just that they are part of the genotype variant ids
                    # are from OMIM, so prefix as such we assume that the
                    # sequence alts will be defined in OMIM not here
                    # TODO sort the variant_id list, if the omim prefix is
                    # the same, then assume it's the locus make a hashmap
                    # of the omim id to variant id list;
                    # then build the genotype hashmap is also useful for
                    # removing the "genes" from the list of "phenotypes"

                    # will hold gene/locus id to variant list
                    omim_map = {}

                    locus_num = None
                    for var in variant_id.split(';'):
                        # handle omim-style and odd var ids
                        # like 610661.p.R401X
                        mch = re.match(r'(\d+)\.+(.*)', var.strip())
                        if mch is not None and len(mch.groups()) == 2:
                            (locus_num, var_num) = mch.groups()

                        if locus_num is not None and locus_num not in omim_map:
                            omim_map[locus_num] = [var_num]
                        else:
                            omim_map[locus_num] += [var_num]

                    for omim in omim_map:
                        # gene_id = 'OMIM:' + omim  # TODO unused
                        vslc_id = '_:' + '-'.join(
                            [omim + '.' + a for a in omim_map.get(omim)])
                        vslc_label = vl
                        # we don't really know the zygosity of
                        # the alleles at all.
                        # so the vslcs are just a pot of them
                        model.addIndividualToGraph(
                            vslc_id, vslc_label,
                            self.globaltt['variant single locus complement'])
                        for var in omim_map.get(omim):
                            # this is actually a sequence alt
                            allele1_id = 'OMIM:' + omim + '.' + var
                            geno.addSequenceAlteration(allele1_id, None)

                            # assume that the sa -> var_loc -> gene
                            # is taken care of in OMIM
                            geno.addPartsToVSLC(
                                vslc_id, allele1_id, None,
                                self.globaltt['indeterminate'],
                                self.globaltt['has_variant_part'])

                        if vslc_id != gvc_id:
                            geno.addVSLCtoParent(vslc_id, gvc_id)

                if affected == 'unaffected':
                    # let's just say that this person is wildtype
                    model.addType(patient_id, self.globaltt['wildtype'])
                elif genotype_id is None:
                    # make an anonymous genotype id (aka blank node)
                    genotype_id = '_:geno' + catalog_id.strip()

                # add the gvc
                if gvc_id is not None:
                    model.addIndividualToGraph(
                        gvc_id, gvc_label,
                        self.globaltt['genomic_variation_complement'])

                    # add the gvc to the genotype
                    if genotype_id is not None:
                        if affected == 'unaffected':
                            rel = self.globaltt['has_reference_part']
                        else:
                            rel = self.globaltt['has_variant_part']
                        geno.addParts(gvc_id, genotype_id, rel)

                    if karyotype_id is not None \
                            and self._is_normal_karyotype(karyotype):
                        if gvc_label is not None and gvc_label != '':
                            genotype_label = '; '.join((gvc_label, karyotype))
                        elif karyotype is not None:
                            genotype_label = karyotype
                        if genotype_id is None:
                            genotype_id = karyotype_id
                        else:
                            geno.addParts(karyotype_id, genotype_id,
                                          self.globaltt['has_reference_part'])
                    else:
                        genotype_label = gvc_label
                        # use the catalog id as the background
                    genotype_label += ' [' + catalog_id.strip() + ']'

                if genotype_id is not None and gvc_id is not None:
                    # only add the genotype if it has some parts
                    geno.addGenotype(genotype_id, genotype_label,
                                     self.globaltt['intrinsic_genotype'])
                    geno.addTaxon(taxon, genotype_id)
                    # add that the patient has the genotype
                    # TODO check if the genotype belongs to
                    # the cell line or to the patient
                    graph.addTriple(patient_id, self.globaltt['has_genotype'],
                                    genotype_id)
                else:
                    geno.addTaxon(taxon, patient_id)

                # TODO: Add sex/gender  (as part of the karyotype?)
                # = row[col.index('')].strip()
                # #############    DEAL WITH THE DISEASES   #############
                omim_num = row[col.index('omim_num')].strip()

                # we associate the disease to the patient
                if affected == 'affected' and omim_num != '':
                    for d in omim_num.split(';'):
                        if d is not None and d != '':
                            # if the omim number is in omim_map,
                            # then it is a gene not a pheno

                            # TEC - another place to use the mimTitle omim
                            # classifier omia & genereviews are using

                            if d not in omim_map:
                                disease_id = 'OMIM:' + d.strip()
                                # assume the label is taken care of in OMIM
                                model.addClassToGraph(disease_id, None)

                                # add the association:
                                #   the patient has the disease
                                assoc = G2PAssoc(graph, self.name, patient_id,
                                                 disease_id)
                                assoc.add_association_to_graph()

                                # this line is a model of this disease
                                # TODO abstract out model into
                                # it's own association class?
                                graph.addTriple(cell_line_id,
                                                self.globaltt['is model of'],
                                                disease_id)
                            else:
                                LOG.info('drop gene %s from disease list', d)

                # #############    ADD PUBLICATIONS   #############
                pubmed_ids = row[col.index('pubmed_ids')].strip()
                if pubmed_ids != '':
                    for s in pubmed_ids.split(';'):
                        pubmed_id = 'PMID:' + s.strip()
                        ref = Reference(graph, pubmed_id)
                        ref.setType(self.globaltt['journal article'])
                        ref.addRefToGraph()
                        graph.addTriple(pubmed_id, self.globaltt['mentions'],
                                        cell_line_id)

                if not self.testMode and (limit is not None
                                          and line_counter > limit):
                    break
        return

Example #6

Show file

File: Coriell.py Project: JervenBolleman/dipper

    def _process_data(self, raw, limit=None):
        """
        This function will process the data files from Coriell.
        We make the assumption that any alleles listed are variants
        (alternates to w.t.)

        Triples: (examples)

        :NIGMSrepository a CLO_0000008 #repository
        label : NIGMS Human Genetic Cell Repository
        foaf:page https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

            line_id a CL_0000057,  #fibroblast line
                derives_from patient_id
                part_of :NIGMSrepository
                RO:model_of OMIM:disease_id

            patient id a foaf:person,
                label: "fibroblast from patient 12345 with disease X"
                member_of family_id  #what is the right thing here?
                SIO:race EFO:caucasian  #subclass of EFO:0001799
                in_taxon NCBITaxon:9606
                dc:description Literal(remark)
                RO:has_phenotype OMIM:disease_id
                GENO:has_genotype genotype_id

            family_id a owl:NamedIndividual
                foaf:page "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

            genotype_id a intrinsic_genotype
                GENO:has_alternate_part allelic_variant_id
                we don't necessarily know much about the genotype,
                other than the allelic variant. also there's the sex here

            pub_id mentions cell_line_id

        :param raw:
        :param limit:
        :return:
        """
        logger.info("Processing Data from %s", raw)
        gu = GraphUtils(curie_map.get())

        if self.testMode:      # set the graph to build
            g = self.testgraph
        else:
            g = self.graph

        line_counter = 0
        geno = Genotype(g)
        du = DipperUtil()

        gu.loadProperties(g, geno.object_properties, gu.OBJPROP)
        gu.loadAllProperties(g)

        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            next(filereader, None)  # skip the header row
            for row in filereader:
                if not row:
                    pass
                else:
                    line_counter += 1

                    (catalog_id, description, omim_number, sample_type,
                     cell_line_available, dna_in_stock, dna_ref, gender, age,
                     race, ethnicity, affected, karyotype, relprob, mutation,
                     gene, family_id, collection, url, cat_remark, pubmed_ids,
                     family_member, variant_id, dbsnp_id, species) = row

                    # example:
                    # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,,Female,26 YR,Caucasian,,,,
                    # parent,,,39,NIGMS Human Genetic Cell Repository,
                    # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
                    # 46;XX; clinically normal mother of a child with Hurler syndrome; proband not in Repository,,
                    # 2,,18343,H**o sapiens

                    if self.testMode and catalog_id not in self.test_lines:
                        # skip rows not in our test lines, when in test mode
                        continue

                    # ###########    BUILD REQUIRED VARIABLES    ###########

                    # Make the cell line ID
                    cell_line_id = 'Coriell:'+catalog_id.strip()

                    # Map the cell/sample type
                    cell_type = self._map_cell_type(sample_type)

                    # Make a cell line label
                    line_label = \
                        collection.partition(' ')[0]+'-'+catalog_id.strip()

                    # Map the repository/collection
                    repository = self._map_collection(collection)

                    # patients are uniquely identified by one of:
                    # dbsnp id (which is == an individual haplotype)
                    # family id + family member (if present) OR
                    # probands are usually family member zero
                    # cell line id
                    # since some patients have >1 cell line derived from them,
                    # we must make sure that the genotype is attached to
                    # the patient, and can be inferred to the cell line
                    # examples of repeated patients are:
                    #   famid=1159, member=1; fam=152,member=1

                    # Make the patient ID

                    # make an anonymous patient
                    patient_id = '_person'
                    if self.nobnodes:
                        patient_id = ':'+patient_id
                    if family_id != '':
                        patient_id = \
                            '-'.join((patient_id, family_id, family_member))
                    else:
                        # make an anonymous patient
                        patient_id = '-'.join((patient_id, catalog_id.strip()))

                    # properties of the individual patients:  sex, family id,
                    # member/relproband, description descriptions are
                    # really long and ugly SCREAMING text, so need to clean up
                    # the control cases are so odd with this labeling scheme;
                    # but we'll deal with it as-is for now.
                    short_desc = (description.split(';')[0]).capitalize()
                    if affected == 'Yes':
                        affected = 'affected'
                    elif affected == 'No':
                        affected = 'unaffected'
                    gender = gender.lower()
                    patient_label = ' '.join((affected, gender, relprob))
                    if relprob == 'proband':
                        patient_label = \
                            ' '.join(
                                (patient_label.strip(), 'with', short_desc))
                    else:
                        patient_label = \
                            ' '.join(
                                (patient_label.strip(), 'of proband with',
                                 short_desc))

                    # #############    BUILD THE CELL LINE    #############

                    # Adding the cell line as a typed individual.
                    cell_line_reagent_id = 'CLO:0000031'

                    gu.addIndividualToGraph(
                        g, cell_line_id, line_label, cell_line_reagent_id)

                    # add the equivalent id == dna_ref
                    if dna_ref != '' and dna_ref != catalog_id:
                        equiv_cell_line = 'Coriell:'+dna_ref
                        # some of the equivalent ids are not defined
                        # in the source data; so add them
                        gu.addIndividualToGraph(
                            g, equiv_cell_line, None, cell_line_reagent_id)
                        gu.addSameIndividual(g, cell_line_id, equiv_cell_line)

                    # Cell line derives from patient
                    geno.addDerivesFrom(cell_line_id, patient_id)
                    geno.addDerivesFrom(cell_line_id, cell_type)

                    # Cell line a member of repository
                    gu.addMember(g, repository, cell_line_id)

                    if cat_remark != '':
                        gu.addDescription(g, cell_line_id, cat_remark)

                    # Cell age_at_sampling
                    # TODO add the age nodes when modeled properly in #78
                    # if (age != ''):
                        # this would give a BNode that is an instance of Age.
                        # but i don't know how to connect
                        # the age node to the cell line? we need to ask @mbrush
                        # age_id = '_'+re.sub('\s+','_',age)
                        # gu.addIndividualToGraph(
                        #   g,age_id,age,self.terms['age'])
                        # gu.addTriple(
                        #   g,age_id,self.properties['has_measurement'],age,
                        #   True)

                    # #############    BUILD THE PATIENT    #############

                    # Add the patient ID as an individual.
                    gu.addPerson(g, patient_id, patient_label)
                    # TODO map relationship to proband as a class
                    # (what ontology?)

                    # Add race of patient
                    # FIXME: Adjust for subcategories based on ethnicity field
                    # EDIT: There are 743 different entries for ethnicity...
                    # Too many to map?
                    # Add ethnicity as literal in addition to the mapped race?
                    # Adjust the ethnicity txt (if using)
                    # to initial capitalization to remove ALLCAPS

                    # TODO race should go into the individual's background
                    # and abstracted out to the Genotype class punting for now.
                    # if race != '':
                    #    mapped_race = self._map_race(race)
                    #    if mapped_race is not None:
                    #        gu.addTriple(
                    #           g,patient_id,self.terms['race'],mapped_race)
                    #        gu.addSubclass(
                    #           g,self.terms['ethnic_group'],mapped_race)

                    # #############    BUILD THE FAMILY    #############

                    # Add triples for family_id, if present.
                    if family_id != '':
                        family_comp_id = 'CoriellFamily:'+family_id

                        family_label = \
                            ' '.join(('Family of proband with', short_desc))

                        # Add the family ID as a named individual
                        gu.addIndividualToGraph(
                            g, family_comp_id, family_label,
                            geno.genoparts['family'])

                        # Add the patient as a member of the family
                        gu.addMemberOf(g, patient_id, family_comp_id)

                    # #############    BUILD THE GENOTYPE   #############

                    # the important things to pay attention to here are:
                    # karyotype = chr rearrangements  (somatic?)
                    # mutation = protein-level mutation as a label,
                    # often from omim
                    # gene = gene symbol - TODO get id
                    # variant_id = omim variant ids (; delimited)
                    # dbsnp_id = snp individual ids = full genotype?

                    # note GM00633 is a good example of chromosomal variation
                    # - do we have enough to capture this?
                    # GM00325 has both abnormal karyotype and variation

                    # make an assumption that if the taxon is blank,
                    # that it is human!
                    if species is None or species == '':
                        species = 'H**o sapiens'
                    taxon = self._map_species(species)

                    # if there's a dbSNP id,
                    # this is actually the individual's genotype
                    genotype_id = None
                    genotype_label = None
                    if dbsnp_id != '':
                        genotype_id = 'dbSNPIndividual:'+dbsnp_id.strip()

                    omim_map = {}
                    gvc_id = None

                    # some of the karyotypes are encoded
                    # with terrible hidden codes. remove them here
                    # i've seen a <98> character
                    karyotype = du.remove_control_characters(karyotype)
                    karyotype_id = None
                    if karyotype.strip() != '':
                        karyotype_id = \
                            '_'+re.sub('MONARCH:', '', self.make_id(karyotype))
                        if self.nobnodes:
                            karyotype_id = ':'+karyotype_id
                        # add karyotype as karyotype_variation_complement
                        gu.addIndividualToGraph(
                            g, karyotype_id, karyotype,
                            geno.genoparts['karyotype_variation_complement'])
                        # TODO break down the karyotype into parts
                        # and map into GENO. depends on #77

                        # place the karyotype in a location(s).
                        karyo_chrs = \
                            self._get_affected_chromosomes_from_karyotype(
                                karyotype)
                        for c in karyo_chrs:
                            chr_id = makeChromID(c, taxon, 'CHR')
                            # add an anonymous sequence feature,
                            # each located on chr
                            karyotype_feature_id = '-'.join((karyotype_id, c))
                            karyotype_feature_label = \
                                'some karyotype alteration on chr'+str(c)
                            f = Feature(
                                karyotype_feature_id, karyotype_feature_label,
                                geno.genoparts['sequence_alteration'])
                            f.addFeatureStartLocation(None, chr_id)
                            f.addFeatureToGraph(g)
                            f.loadAllProperties(g)
                            geno.addParts(
                                karyotype_feature_id, karyotype_id,
                                geno.object_properties['has_alternate_part'])

                    if gene != '':
                        vl = gene+'('+mutation+')'

                    # fix the variant_id so it's always in the same order
                    vids = variant_id.split(';')
                    variant_id = ';'.join(sorted(list(set(vids))))

                    if karyotype.strip() != '' \
                            and not self._is_normal_karyotype(karyotype):
                        mutation = mutation.strip()
                        gvc_id = karyotype_id
                        if variant_id != '':
                            gvc_id = '_' + variant_id.replace(';', '-') + '-' \
                                    + re.sub(r'\w*:', '', karyotype_id)
                        if mutation.strip() != '':
                            gvc_label = '; '.join((vl, karyotype))
                        else:
                            gvc_label = karyotype
                    elif variant_id.strip() != '':
                        gvc_id = '_' + variant_id.replace(';', '-')
                        gvc_label = vl
                    else:
                        # wildtype?
                        pass

                    if gvc_id is not None and gvc_id != karyotype_id \
                            and self.nobnodes:
                        gvc_id = ':'+gvc_id

                    # add the karyotype to the gvc.
                    # use reference if normal karyotype
                    karyo_rel = geno.object_properties['has_alternate_part']
                    if self._is_normal_karyotype(karyotype):
                        karyo_rel = \
                            geno.object_properties['has_reference_part']
                    if karyotype_id is not None \
                            and not self._is_normal_karyotype(karyotype) \
                            and gvc_id is not None and karyotype_id != gvc_id:
                        geno.addParts(karyotype_id, gvc_id, karyo_rel)

                    if variant_id.strip() != '':
                        # split the variants & add them as part of the genotype
                        # we don't necessarily know their zygosity,
                        # just that they are part of the genotype variant ids
                        # are from OMIM, so prefix as such we assume that the
                        # sequence alts will be defined in OMIM not here
                        # TODO sort the variant_id list, if the omim prefix is
                        # the same, then assume it's the locus make a hashmap
                        # of the omim id to variant id list;
                        # then build the genotype hashmap is also useful for
                        # removing the "genes" from the list of "phenotypes"

                        # will hold gene/locus id to variant list
                        omim_map = {}

                        locus_num = None
                        for v in variant_id.split(';'):
                            # handle omim-style and odd var ids
                            # like 610661.p.R401X
                            m = re.match(r'(\d+)\.+(.*)', v.strip())
                            if m is not None and len(m.groups()) == 2:
                                (locus_num, var_num) = m.groups()

                            if locus_num is not None \
                                    and locus_num not in omim_map:
                                omim_map[locus_num] = [var_num]
                            else:
                                omim_map[locus_num] += [var_num]

                        for o in omim_map:
                            # gene_id = 'OMIM:' + o  # TODO unused
                            vslc_id = \
                                '_' + '-'.join(
                                    [o + '.' + a for a in omim_map.get(o)])
                            if self.nobnodes:
                                vslc_id = ':'+vslc_id
                            vslc_label = vl
                            # we don't really know the zygosity of
                            # the alleles at all.
                            # so the vslcs are just a pot of them
                            gu.addIndividualToGraph(
                                g, vslc_id, vslc_label,
                                geno.genoparts[
                                    'variant_single_locus_complement'])
                            for v in omim_map.get(o):
                                # this is actually a sequence alt
                                allele1_id = 'OMIM:'+o+'.'+v
                                geno.addSequenceAlteration(allele1_id, None)

                                # assume that the sa -> var_loc -> gene
                                # is taken care of in OMIM
                                geno.addPartsToVSLC(
                                    vslc_id, allele1_id, None,
                                    geno.zygosity['indeterminate'],
                                    geno.object_properties[
                                        'has_alternate_part'])

                            if vslc_id != gvc_id:
                                geno.addVSLCtoParent(vslc_id, gvc_id)

                    if affected == 'unaffected':
                        # let's just say that this person is wildtype
                        gu.addType(g, patient_id, geno.genoparts['wildtype'])
                    elif genotype_id is None:
                        # make an anonymous genotype id
                        genotype_id = '_geno'+catalog_id.strip()
                        if self.nobnodes:
                            genotype_id = ':'+genotype_id

                    # add the gvc
                    if gvc_id is not None:
                        gu.addIndividualToGraph(
                            g, gvc_id, gvc_label,
                            geno.genoparts['genomic_variation_complement'])

                        # add the gvc to the genotype
                        if genotype_id is not None:
                            if affected == 'unaffected':
                                rel = \
                                    geno.object_properties[
                                        'has_reference_part']
                            else:
                                rel = \
                                    geno.object_properties[
                                        'has_alternate_part']
                            geno.addParts(gvc_id, genotype_id, rel)
                        if karyotype_id is not None \
                                and self._is_normal_karyotype(karyotype):
                            if gvc_label is not None and gvc_label != '':
                                genotype_label = \
                                    '; '.join((gvc_label, karyotype))
                            else:
                                genotype_label = karyotype
                            if genotype_id is None:
                                genotype_id = karyotype_id
                            else:
                                geno.addParts(
                                    karyotype_id, genotype_id,
                                    geno.object_properties[
                                        'has_reference_part'])
                        else:
                            genotype_label = gvc_label
                            # use the catalog id as the background
                        genotype_label += ' ['+catalog_id.strip()+']'

                    if genotype_id is not None and gvc_id is not None:
                        # only add the genotype if it has some parts
                        geno.addGenotype(
                            genotype_id, genotype_label,
                            geno.genoparts['intrinsic_genotype'])
                        geno.addTaxon(taxon, genotype_id)
                        # add that the patient has the genotype
                        # TODO check if the genotype belongs to
                        # the cell line or to the patient
                        gu.addTriple(
                            g, patient_id,
                            geno.properties['has_genotype'], genotype_id)
                    else:
                        geno.addTaxon(taxon, patient_id)

                    # TODO: Add sex/gender  (as part of the karyotype?)

                    # #############    DEAL WITH THE DISEASES   #############

                    # we associate the disease to the patient
                    if affected == 'affected':
                        if omim_number != '':
                            for d in omim_number.split(';'):
                                if d is not None and d != '':
                                    # if the omim number is in omim_map,
                                    # then it is a gene not a pheno
                                    if d not in omim_map:
                                        disease_id = 'OMIM:'+d.strip()
                                        # assume the label is taken care of
                                        gu.addClassToGraph(g, disease_id, None)

                                        # add the association:
                                        #   the patient has the disease
                                        assoc = G2PAssoc(
                                            self.name, patient_id, disease_id)
                                        assoc.add_association_to_graph(g)

                                        # this line is a model of this disease
                                        # TODO abstract out model into
                                        # it's own association class?
                                        gu.addTriple(
                                            g, cell_line_id,
                                            gu.properties['model_of'],
                                            disease_id)
                                    else:
                                        logger.info(
                                            'removing %s from disease list ' +
                                            'since it is a gene', d)

                    # #############    ADD PUBLICATIONS   #############

                    if pubmed_ids != '':
                        for s in pubmed_ids.split(';'):
                            pubmed_id = 'PMID:'+s.strip()
                            ref = Reference(pubmed_id)
                            ref.setType(Reference.ref_types['journal_article'])
                            ref.addRefToGraph(g)
                            gu.addTriple(
                                g, pubmed_id, gu.properties['mentions'],
                                cell_line_id)

                    if not self.testMode \
                            and (limit is not None and line_counter > limit):
                        break

            Assoc(self.name).load_all_properties(g)

        return

Example #7

Show file

    def process_gaf(self, gaffile, limit, id_map=None, eco_map=None):

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph

        model = Model(graph)
        geno = Genotype(graph)
        LOG.info("Processing Gene Associations from %s", gaffile)
        uniprot_hit = 0
        uniprot_miss = 0
        col = self.gaf_columns

        with gzip.open(gaffile, 'rb') as csvfile:
            reader = csv.reader(
                io.TextIOWrapper(csvfile, newline=""), delimiter='\t', quotechar='\"')
            for row in reader:
                # comments start with exclamation
                if row[0][0] == '!':
                    continue

                if len(row) != len(col):
                    LOG.error(
                        "Wrong number of columns %i, expected ... got:\n\t%s",
                        len(col), row)
                    exit(1)

                dbase = row[col.index('DB')].strip()
                gene_num = row[col.index('DB_Object_ID')].strip()
                gene_symbol = row[col.index('DB_Object_Symbol')].strip()
                qualifier = row[col.index('Qualifier')]
                go_id = row[col.index('GO_ID')].strip()
                ref = row[col.index('DB:Reference')].strip()
                eco_symbol = row[col.index('Evidence Code')].strip()
                with_or_from = row[col.index('With (or) From')]
                aspect = row[col.index('Aspect')].strip()
                gene_name = row[col.index('DB_Object_Name')]
                gene_synonym = row[col.index('DB_Object_Synonym')]
                # object_type = row[col.index('DB_Object_Type')].strip()
                taxon = row[col.index('Taxon and Interacting taxon')].strip()
                # date = row[col.index('Date')].strip()
                # assigned_by = row[col.index('Assigned_By')].strip()
                # annotation_extension = row[col.index('Annotation_Extension')]
                # gene_product_form_id = row[col.index('Gene_Product_Form_ID')]

                # test for required fields
                if '' in [row[:10], row[12]]:
                    LOG.error(
                        "Missing required part of annotation on row %i:\n%s",
                        reader.line_num, str(row[:-4]))
                    continue

                # (Don't) deal with qualifier NOT, contributes_to, colocalizes_with
                if re.search(r'NOT', qualifier):
                    continue

                if dbase in self.localtt:
                    dbase = self.localtt[dbase]
                uniprotid = None
                gene_id = None
                if dbase == 'UniProtKB':
                    if id_map is not None and gene_num in id_map:
                        gene_id = id_map[gene_num]
                        uniprotid = ':'.join((dbase, gene_num))
                        (dbase, gene_num) = gene_id.split(':')
                        uniprot_hit += 1
                    else:
                        # LOG.warning(
                        #   "UniProt id %s is without a 1:1 mapping to entrez/ensembl",
                        #    gene_num)
                        uniprot_miss += 1
                        continue
                else:
                    gene_num = gene_num.split(':')[-1]  # last
                    gene_id = ':'.join((dbase, gene_num))

                if self.test_mode and gene_id[:9] != 'NCBIGene:' and\
                        gene_num not in self.test_ids:
                    continue

                model.addClassToGraph(gene_id, gene_symbol)
                if gene_name != '':
                    model.addDescription(gene_id, gene_name)
                if gene_synonym != '':
                    for syn in re.split(r'\|', gene_synonym):
                        syn = syn.strip()
                        if syn[:10] == 'UniProtKB:':
                            model.addTriple(
                                gene_id, self.globaltt['has gene product'], syn)
                        elif re.fullmatch(graph.curie_regexp, syn) is not None:
                            LOG.warning(
                                'possible curie "%s" as a literal synomym for %s',
                                syn, gene_id)
                            model.addSynonym(gene_id, syn)
                        else:
                            model.addSynonym(gene_id, syn)

                for txid in taxon.split('|'):
                    tax_curie = re.sub(r'taxon:', 'NCBITaxon:', txid)
                    geno.addTaxon(tax_curie, gene_id)

                assoc = Assoc(graph, self.name)
                assoc.set_subject(gene_id)
                assoc.set_object(go_id)

                try:
                    eco_id = eco_map[eco_symbol]
                    assoc.add_evidence(eco_id)
                except KeyError:
                    LOG.error("Evidence code (%s) not mapped", eco_symbol)

                refs = re.split(r'\|', ref)
                for ref in refs:
                    ref = ref.strip()
                    if ref != '':
                        prefix = ref.split(':')[0]  # sidestep 'MGI:MGI:'
                        if prefix in self.localtt:
                            prefix = self.localtt[prefix]
                        ref = ':'.join((prefix, ref.split(':')[-1]))
                        refg = Reference(graph, ref)
                        if prefix == 'PMID':
                            ref_type = self.globaltt['journal article']
                            refg.setType(ref_type)
                        refg.addRefToGraph()
                        assoc.add_source(ref)

                # TODO add the source of the annotations from assigned by?

                rel = self.resolve(aspect, mandatory=False)
                if rel is not None and aspect == rel:
                    if aspect == 'F' and re.search(r'contributes_to', qualifier):
                        assoc.set_relationship(self.globaltt['contributes to'])
                    else:
                        LOG.error(
                            "Aspect: %s with qualifier: %s  is not recognized",
                            aspect, qualifier)
                elif rel is not None:
                    assoc.set_relationship(rel)
                    assoc.add_association_to_graph()
                else:
                    LOG.warning("No predicate for association \n%s\n", str(assoc))

                if uniprotid is not None:
                    assoc.set_description('Mapped from ' + uniprotid)
                # object_type should be one of:
                # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
                # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
                # If the precise product type is unknown,
                # gene_product should be used
                ########################################################################

                # Derive G2P Associations from IMP annotations
                # in version 2.1 Pipe will indicate 'OR'
                # and Comma will indicate 'AND'.
                # in version 2.0, multiple values are separated by pipes
                # where the pipe has been used to mean 'AND'
                if eco_symbol == 'IMP' and with_or_from != '':
                    withitems = with_or_from.split('|')
                    phenotypeid = go_id + 'PHENOTYPE'
                    # create phenotype associations
                    for itm in withitems:
                        if itm == '' or re.match(
                                r'(UniProtKB|WBPhenotype|InterPro|HGNC)', itm):
                            LOG.warning(
                                "Skipping  %s from or with %s", uniprotid, itm)
                            continue
                        itm = re.sub(r'MGI\:MGI\:', 'MGI:', itm)
                        itm = re.sub(r'WB:', 'WormBase:', itm)

                        # for worms and fish, they might give a RNAi or MORPH
                        # in these cases make a reagent-targeted gene
                        if re.search('MRPHLNO|CRISPR|TALEN', itm):
                            targeted_gene_id = self.zfin.make_targeted_gene_id(
                                gene_id, itm)
                            geno.addReagentTargetedGene(itm, gene_id, targeted_gene_id)
                            # TODO PYLINT why is this needed?
                            # Redefinition of assoc type from
                            # dipper.models.assoc.Association.Assoc to
                            # dipper.models.assoc.G2PAssoc.G2PAssoc
                            assoc = G2PAssoc(
                                graph, self.name, targeted_gene_id, phenotypeid)
                        elif re.search(r'WBRNAi', itm):
                            targeted_gene_id = self.wbase.make_reagent_targeted_gene_id(
                                gene_id, itm)
                            geno.addReagentTargetedGene(itm, gene_id, targeted_gene_id)
                            assoc = G2PAssoc(
                                graph, self.name, targeted_gene_id, phenotypeid)
                        else:
                            assoc = G2PAssoc(graph, self.name, itm, phenotypeid)
                        for ref in refs:
                            ref = ref.strip()
                            if ref != '':
                                prefix = ref.split(':')[0]
                                if prefix in self.localtt:
                                    prefix = self.localtt[prefix]
                                ref = ':'.join((prefix, ref.split(':')[-1]))
                                assoc.add_source(ref)
                                # experimental phenotypic evidence
                                assoc.add_evidence(
                                    self.globaltt['experimental phenotypic evidence'])
                        assoc.add_association_to_graph()
                        # TODO should the G2PAssoc be the evidence for the GO assoc?

                if not self.test_mode and limit is not None and \
                        reader.line_num > limit:
                    break
            uniprot_tot = (uniprot_hit + uniprot_miss)
            uniprot_per = 0.0
            if uniprot_tot != 0:
                uniprot_per = 100.0 * uniprot_hit / uniprot_tot
            LOG.info(
                "Uniprot: %.2f%% of %i benefited from the 1/4 day id mapping download",
                uniprot_per, uniprot_tot)

Example #8

Show file

File: WormBase.py Project: matthewbrickley/dipper

    def process_allele_phenotype(self, limit=None):
        """
        This file compactly lists variant to phenotype associations,
        such that in a single row, there may be >1 variant listed
        per phenotype and paper.  This indicates that each variant is
        individually assocated with the given phenotype,
        as listed in 1+ papers.
        (Not that the combination of variants is producing the phenotype.)
        :param limit:
        :return:

        """
        raw = '/'.join((self.rawdir, self.files['allele_pheno']['file']))

        graph = self.graph
        model = Model(self.graph)

        LOG.info("Processing Allele phenotype associations")
        line_counter = 0
        geno = Genotype(graph)
        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                if re.match(r'!', ''.join(row)):  # header
                    continue
                line_counter += 1
                (db, gene_num, gene_symbol, is_not, phenotype_id, ref,
                 eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
                 gene_class, taxon, date, assigned_by, blank, blank2) = row

                # TODO add NOT phenotypes
                if is_not == 'NOT':
                    continue

                eco_symbol = eco_symbol.strip()
                eco_id = None
                if eco_symbol.strip() != '':
                    eco_id = self.resolve(eco_symbol)

                # according to the GOA spec, persons are not allowed to be
                # in the reference column, therefore they the variant and
                # persons are swapped between the reference and with column.
                # we unswitch them here.
                temp_var = temp_ref = None
                if re.search(r'WBVar|WBRNAi', ref):
                    temp_var = ref
                    # move the paper from the with column into the ref
                if re.search(r'WBPerson', with_or_from):
                    temp_ref = with_or_from
                if temp_var is not None or temp_ref is not None:
                    with_or_from = temp_var
                    ref = temp_ref

                allele_list = re.split(r'\|', with_or_from)
                if len(allele_list) == 0:
                    LOG.error(
                        "Missing alleles from phenotype assoc at line %d",
                        line_counter)
                    continue
                else:
                    for allele in allele_list:
                        allele_num = re.sub(r'WB:', '', allele.strip())
                        allele_id = 'WormBase:' + allele_num
                        gene_id = 'WormBase:' + gene_num

                        if re.search(r'WBRNAi', allele_id):

                            # @kshefchek - removing this blank node
                            # in favor of simpler modeling
                            # make the WormBase:WBRNAi* id
                            # a self.globaltt['reagent_targeted_gene'], and attach
                            # phenotype to this ID

                            # Previous model - make a bnode reagent-targeted gene,
                            # & annotate that instead of the RNAi item directly
                            #rnai_num = re.sub(r'WormBase:', '', allele_id)
                            #rnai_id = allele_id
                            #rtg_id = self.make_reagent_targeted_gene_id(
                            #    gene_num, rnai_num)
                            #geno.addReagentTargetedGene(
                            #    rnai_id, 'WormBase:' + gene_num, rtg_id)
                            # allele_id = rtg_id

                            # Could type the IRI as both the reagant and reagant
                            # targeted gene but not sure if this needed
                            # geno.addGeneTargetingReagent(
                            #   allele_id, None, self.globaltt['RNAi_reagent'], gene_id)

                            model.addIndividualToGraph(
                                allele_id, None,
                                self.globaltt['reagent_targeted_gene'])

                            self.graph.addTriple(
                                allele_id, self.globaltt['is_expression_variant_of'],
                                gene_id)

                        elif re.search(r'WBVar', allele_id):
                            # this may become deprecated by using wormmine
                            # make the allele to gene relationship
                            # the WBVars are really sequence alterations
                            # the public name will come from elsewhere

                            # @kshefchek - removing this blank node
                            # in favor of simpler modeling, treat variant
                            # like an allele
                            #vl_id = '_:'+'-'.join((gene_num, allele_num))
                            #geno.addSequenceAlterationToVariantLocus(
                            #    allele_id, vl_id)
                            #geno.addAlleleOfGene(vl_id, gene_id)

                            geno.addSequenceAlteration(allele_id, None)
                            geno.addAlleleOfGene(allele_id, gene_id)
                        else:
                            LOG.warning(
                                "Some kind of allele I don't recognize: %s", allele_num)
                            continue
                        assoc = G2PAssoc(graph, self.name, allele_id, phenotype_id)

                        if eco_id is not None:
                            assoc.add_evidence(eco_id)

                        if ref is not None and ref != '':
                            ref = re.sub(r'(WB:|WB_REF:)', 'WormBase:', ref)
                            reference = Reference(graph, ref)
                            if re.search(r'Person', ref):
                                reference.setType(self.globaltt['person'])
                                assoc.add_evidence(
                                    self.globaltt[
                                        'inference from background scientific knowledge'
                                    ])
                            reference.addRefToGraph()
                            assoc.add_source(ref)

                        assoc.add_association_to_graph()

                        # finish looping through all alleles

                if limit is not None and line_counter > limit:
                    break

        return

Example #9

Show file

File: HPOAnnotations.py Project: d3borah/dipper

    def _process_phenotype_tab(self, raw, limit):
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        line_counter = 0
        gu = GraphUtils(curie_map.get())
        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (db, num, name, qual, pheno_id, publist, eco, onset, freq, w, asp, syn, date, curator) = row
                disease_id = db + ":" + str(num)

                if self.testMode and disease_id.strip() not in config.get_config()['test_ids']['disease']:
                    continue

                # logger.info('adding %s', disease_id)

                gu.addClassToGraph(g, disease_id, None)
                gu.addClassToGraph(g, pheno_id, None)
                eco_id = self._map_evidence_to_codes(eco)
                gu.addClassToGraph(g, eco_id, None)
                if onset is not None and onset.strip() != '':
                    gu.addClassToGraph(g, onset, None)

                # we want to do things differently depending on the aspect of the annotation
                if asp == 'O' or asp == 'M':  # organ abnormality or mortality
                    assoc = D2PAssoc(self.name, disease_id, pheno_id, onset, freq)
                elif asp == 'I':  # inheritance patterns for the whole disease
                    assoc = DispositionAssoc(self.name, disease_id, pheno_id)
                elif asp == 'C':  # clinical course / onset
                    assoc = DispositionAssoc(self.name, disease_id, pheno_id)
                else:
                    logger.error("I don't know what this aspect is:", asp)

                assoc.add_evidence(eco_id)

                publist = publist.split(';')
                # blow these apart if there is a list of pubs
                for pub in publist:
                    pub = pub.strip()
                    if pub != '':
                        # if re.match('http://www.ncbi.nlm.nih.gov/bookshelf/br\.fcgi\?book=gene', pub):
                        #     #http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=gene&part=ced
                        #     m = re.search('part\=(\w+)', pub)
                        #     pub_id = 'GeneReviews:'+m.group(1)
                        # elif re.search('http://www.orpha.net/consor/cgi-bin/OC_Exp\.php\?lng\=en\&Expert\=', pub):
                        #     m = re.search('Expert=(\d+)', pub)
                        #     pub_id = 'Orphanet:'+m.group(1)
                        if not re.match('http', pub):
                            r = Reference(pub)
                            if re.match('PMID', pub):
                                r.setType(Reference.ref_types['journal_article'])
                            r.addRefToGraph(g)
                        # TODO add curator
                        assoc.add_source(pub)

                assoc.add_association_to_graph(g)

                if not self.testMode and limit is not None and line_counter > limit:
                    break

            Assoc(None).load_all_properties(g)

        return

Example #10

Show file

File: GeneOntology.py Project: TomConlin/dipper

    def process_gaf(self, file, limit, id_map=None, eco_map=None):

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph

        model = Model(graph)
        geno = Genotype(graph)
        LOG.info("Processing Gene Associations from %s", file)
        line_counter = 0
        uniprot_hit = 0
        uniprot_miss = 0
        if 7955 in self.tax_ids:
            zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
        if 6239 in self.tax_ids:
            wbase = WormBase(self.graph_type, self.are_bnodes_skized)

        with gzip.open(file, 'rb') as csvfile:
            filereader = csv.reader(
                io.TextIOWrapper(csvfile, newline=""), delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                # comments start with exclamation
                if re.match(r'!', ''.join(row)):
                    continue

                if len(row) > 17 or len(row) < 15:
                    LOG.warning(
                        "Wrong number of columns %i, expected 15 or 17\n%s",
                        len(row), row)
                    continue

                if 17 > len(row) >= 15:
                    row += [""] * (17 - len(row))

                (dbase,
                 gene_num,
                 gene_symbol,
                 qualifier,
                 go_id,
                 ref,
                 eco_symbol,
                 with_or_from,
                 aspect,
                 gene_name,
                 gene_synonym,
                 object_type,
                 taxon,
                 date,
                 assigned_by,
                 annotation_extension,
                 gene_product_form_id) = row

                # test for required fields
                if (dbase == '' or gene_num == '' or gene_symbol == '' or
                        go_id == '' or ref == '' or eco_symbol == '' or
                        aspect == '' or object_type == '' or taxon == '' or
                        date == '' or assigned_by == ''):
                    LOG.error(
                        "Missing required part of annotation on row %d:\n"+'\t'
                        .join(row), line_counter)
                    continue

                # deal with qualifier NOT, contributes_to, colocalizes_with
                if re.search(r'NOT', qualifier):
                    continue

                if dbase in self.localtt:
                    dbase = self.localtt[dbase]
                uniprotid = None
                gene_id = None
                if dbase == 'UniProtKB':
                    if id_map is not None and gene_num in id_map:
                        gene_id = id_map[gene_num]
                        uniprotid = ':'.join((dbase, gene_num))
                        (dbase, gene_num) = gene_id.split(':')
                        uniprot_hit += 1
                    else:
                        # LOG.warning(
                        #   "UniProt id %s  is without a 1:1 mapping to entrez/ensembl",
                        #    gene_num)
                        uniprot_miss += 1
                        continue
                else:
                    gene_num = gene_num.split(':')[-1]  # last
                    gene_id = ':'.join((dbase, gene_num))

                if self.test_mode and not(
                        re.match(r'NCBIGene', gene_id) and
                        int(gene_num) in self.test_ids):
                    continue

                model.addClassToGraph(gene_id, gene_symbol)
                if gene_name != '':
                    model.addDescription(gene_id, gene_name)
                if gene_synonym != '':
                    for syn in re.split(r'\|', gene_synonym):
                        model.addSynonym(gene_id, syn.strip())
                if re.search(r'\|', taxon):
                    # TODO add annotations with >1 taxon
                    LOG.info(
                        ">1 taxon (%s) on line %d.  skipping", taxon, line_counter)
                else:
                    tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                    geno.addTaxon(tax_id, gene_id)

                assoc = Assoc(graph, self.name)
                assoc.set_subject(gene_id)
                assoc.set_object(go_id)

                try:
                    eco_id = eco_map[eco_symbol]
                    assoc.add_evidence(eco_id)
                except KeyError:
                    LOG.error("Evidence code (%s) not mapped", eco_symbol)

                refs = re.split(r'\|', ref)
                for ref in refs:
                    ref = ref.strip()
                    if ref != '':
                        prefix = ref.split(':')[0]  # sidestep 'MGI:MGI:'
                        if prefix in self.localtt:
                            prefix = self.localtt[prefix]
                        ref = ':'.join((prefix, ref.split(':')[-1]))
                        refg = Reference(graph, ref)
                        if prefix == 'PMID':
                            ref_type = self.globaltt['journal article']
                            refg.setType(ref_type)
                        refg.addRefToGraph()
                        assoc.add_source(ref)

                # TODO add the source of the annotations from assigned by?

                rel = self.resolve(aspect, mandatory=False)
                if rel is not None and aspect == rel:
                    if aspect == 'F' and re.search(r'contributes_to', qualifier):
                        assoc.set_relationship(self.globaltt['contributes to'])
                    else:
                        LOG.error(
                            "Aspect: %s with qualifier: %s  is not recognized",
                            aspect, qualifier)
                elif rel is not None:
                    assoc.set_relationship(rel)
                    assoc.add_association_to_graph()
                else:
                    LOG.warning("No predicate for association \n%s\n", str(assoc))

                if uniprotid is not None:
                    assoc.set_description('Mapped from ' + uniprotid)
                # object_type should be one of:
                # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
                # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
                # If the precise product type is unknown,
                # gene_product should be used
                #######################################################################

                # Derive G2P Associations from IMP annotations
                # in version 2.1 Pipe will indicate 'OR'
                # and Comma will indicate 'AND'.
                # in version 2.0, multiple values are separated by pipes
                # where the pipe has been used to mean 'AND'
                if eco_symbol == 'IMP' and with_or_from != '':
                    withitems = re.split(r'\|', with_or_from)
                    phenotypeid = go_id+'PHENOTYPE'
                    # create phenotype associations
                    for i in withitems:
                        if i == '' or re.match(
                                r'(UniProtKB|WBPhenotype|InterPro|HGNC)', i):
                            LOG.warning(
                                "Don't know what having a uniprot id " +
                                "in the 'with' column means of %s", uniprotid)
                            continue
                        i = re.sub(r'MGI\:MGI\:', 'MGI:', i)
                        i = re.sub(r'WB:', 'WormBase:', i)

                        # for worms and fish, they might give a RNAi or MORPH
                        # in these cases make a reagent-targeted gene
                        if re.search('MRPHLNO|CRISPR|TALEN', i):
                            targeted_gene_id = zfin.make_targeted_gene_id(gene_id, i)
                            geno.addReagentTargetedGene(i, gene_id, targeted_gene_id)
                            # TODO PYLINT why is this needed?
                            # Redefinition of assoc type from
                            # dipper.models.assoc.Association.Assoc to
                            # dipper.models.assoc.G2PAssoc.G2PAssoc
                            assoc = G2PAssoc(
                                graph, self.name, targeted_gene_id, phenotypeid)
                        elif re.search(r'WBRNAi', i):
                            targeted_gene_id = wbase.make_reagent_targeted_gene_id(
                                gene_id, i)
                            geno.addReagentTargetedGene(i, gene_id, targeted_gene_id)
                            assoc = G2PAssoc(
                                graph, self.name, targeted_gene_id, phenotypeid)
                        else:
                            assoc = G2PAssoc(graph, self.name, i, phenotypeid)
                        for ref in refs:
                            ref = ref.strip()
                            if ref != '':
                                prefix = ref.split(':')[0]
                                if prefix in self.localtt:
                                    prefix = self.localtt[prefix]
                                ref = ':'.join((prefix, ref.split(':')[-1]))
                                assoc.add_source(ref)
                                # experimental phenotypic evidence
                                assoc.add_evidence(
                                    self.globaltt['experimental phenotypic evidence'])
                        assoc.add_association_to_graph()
                        # TODO should the G2PAssoc be
                        # the evidence for the GO assoc?

                if not self.test_mode and limit is not None and line_counter > limit:
                    break
            uniprot_tot = (uniprot_hit + uniprot_miss)
            uniprot_per = 0.0
            if uniprot_tot != 0:
                uniprot_per = 100.0 * uniprot_hit / uniprot_tot
            LOG.info(
                "Uniprot: %f.2%% of %i benifited from the 1/4 day id mapping download",
                uniprot_per, uniprot_tot)
        return

Example #11

Show file

File: GeneOntology.py Project: kshefchek/dipper

    def process_gaf(self, file, limit, id_map=None):

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        model = Model(g)
        geno = Genotype(g)
        logger.info("Processing Gene Associations from %s", file)
        line_counter = 0

        if 7955 in self.tax_ids:
            zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
        elif 6239 in self.tax_ids:
            wbase = WormBase(self.graph_type, self.are_bnodes_skized)

        with gzip.open(file, 'rb') as csvfile:
            filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                    delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                # comments start with exclamation
                if re.match(r'!', ''.join(row)):
                    continue
                (db, gene_num, gene_symbol, qualifier, go_id, ref, eco_symbol,
                 with_or_from, aspect, gene_name, gene_synonym, object_type,
                 taxon, date, assigned_by, annotation_extension,
                 gene_product_form_id) = row

                # test for required fields
                if (db == '' or gene_num == '' or gene_symbol == '' or
                        go_id == '' or ref == '' or eco_symbol == '' or
                        aspect == '' or object_type == '' or taxon == '' or
                        date == '' or assigned_by == ''):
                    logger.error(
                        "Missing required part of annotation " +
                        "on row %d:\n"+'\t'.join(row),
                        line_counter)
                    continue

                # deal with qualifier NOT, contributes_to, colocalizes_with
                if re.search(r'NOT', qualifier):
                    continue

                db = self.clean_db_prefix(db)
                uniprotid = None
                gene_id = None
                if db == 'UniProtKB':
                    mapped_ids = id_map.get(gene_num)
                    if id_map is not None and mapped_ids is not None:
                        if len(mapped_ids) == 1:
                            gene_id = mapped_ids[0]
                            uniprotid = ':'.join((db, gene_num))
                            gene_num = re.sub(r'\w+\:', '', gene_id)
                        elif len(mapped_ids) > 1:
                            # logger.warning(
                            #   "Skipping gene id mapped for >1 gene %s -> %s",
                            #    gene_num, str(mapped_ids))
                            continue
                    else:
                        continue
                elif db == 'MGI':
                    gene_num = re.sub(r'MGI:', '', gene_num)
                    gene_id = ':'.join((db, gene_num))
                    gene_id = re.sub(r'MGI\:MGI\:', 'MGI:', gene_id)
                else:
                    gene_id = ':'.join((db, gene_num))

                if self.testMode \
                        and not(
                            re.match(r'NCBIGene', gene_id) and
                            int(gene_num) in self.test_ids):
                    continue

                model.addClassToGraph(gene_id, gene_symbol)
                if gene_name != '':
                    model.addDescription(gene_id, gene_name)
                if gene_synonym != '':
                    for s in re.split(r'\|', gene_synonym):
                        model.addSynonym(gene_id, s.strip())
                if re.search(r'\|', taxon):
                    # TODO add annotations with >1 taxon
                    logger.info(">1 taxon (%s) on line %d.  skipping", taxon,
                                line_counter)
                else:
                    tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                    geno.addTaxon(tax_id, gene_id)

                assoc = Assoc(g, self.name)

                assoc.set_subject(gene_id)
                assoc.set_object(go_id)

                eco_id = self.map_go_evidence_code_to_eco(eco_symbol)
                if eco_id is not None:
                    assoc.add_evidence(eco_id)

                refs = re.split(r'\|', ref)
                for r in refs:
                    r = r.strip()
                    if r != '':
                        prefix = re.split(r':', r)[0]
                        r = re.sub(prefix, self.clean_db_prefix(prefix), r)
                        r = re.sub(r'MGI\:MGI\:', 'MGI:', r)
                        ref = Reference(g, r)
                        if re.match(r'PMID', r):
                            ref_type = Reference.ref_types['journal_article']
                            ref.setType(ref_type)
                        ref.addRefToGraph()
                        assoc.add_source(r)

                # TODO add the source of the annotations from assigned by?

                aspect_rel_map = {
                    'P': model.object_properties['involved_in'],  # involved in
                    'F': model.object_properties['enables'],  # enables
                    'C': model.object_properties['part_of']  # part of
                }

                if aspect not in aspect_rel_map:
                    logger.error("Aspect not recognized: %s", aspect)

                rel = aspect_rel_map.get(aspect)
                if aspect == 'F' and re.search(r'contributes_to', qualifier):
                    rel = model.object_properties['contributes_to']
                assoc.set_relationship(rel)
                if uniprotid is not None:
                    assoc.set_description('Mapped from '+uniprotid)
                # object_type should be one of:
                # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
                # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
                # If the precise product type is unknown,
                # gene_product should be used

                assoc.add_association_to_graph()

                # Derive G2P Associations from IMP annotations
                # in version 2.1 Pipe will indicate 'OR'
                # and Comma will indicate 'AND'.
                # in version 2.0, multiple values are separated by pipes
                # where the pipe has been used to mean 'AND'
                if eco_symbol == 'IMP' and with_or_from != '':
                    withitems = re.split(r'\|', with_or_from)
                    phenotypeid = go_id+'PHENOTYPE'
                    # create phenotype associations
                    for i in withitems:
                        if i == '' or \
                                re.match(
                                    r'(UniProtKB|WBPhenotype|InterPro|HGNC)',
                                    i):
                            logger.warning(
                                "Don't know what having a uniprot id " +
                                "in the 'with' column means of %s",
                                uniprotid)
                            continue
                        i = re.sub(r'MGI\:MGI\:', 'MGI:', i)
                        i = re.sub(r'WB:', 'WormBase:', i)

                        # for worms and fish, they might give a RNAi or MORPH
                        # in these cases make a reagent-targeted gene
                        if re.search('MRPHLNO|CRISPR|TALEN', i):
                            targeted_gene_id = zfin.make_targeted_gene_id(
                                gene_id, i)
                            geno.addReagentTargetedGene(i, gene_id,
                                                        targeted_gene_id)
                            # TODO PYLINT why is this:
                            # Redefinition of assoc type from
                            # dipper.models.assoc.Association.Assoc to
                            # dipper.models.assoc.G2PAssoc.G2PAssoc
                            assoc = G2PAssoc(g, self.name, targeted_gene_id,
                                             phenotypeid)
                        elif re.search(r'WBRNAi', i):
                            targeted_gene_id = \
                                wbase.make_reagent_targeted_gene_id(
                                    gene_id, i)
                            geno.addReagentTargetedGene(
                                i, gene_id, targeted_gene_id)
                            assoc = G2PAssoc(
                                g, self.name, targeted_gene_id, phenotypeid)
                        else:
                            assoc = G2PAssoc(g, self.name, i, phenotypeid)
                        for r in refs:
                            r = r.strip()
                            if r != '':
                                prefix = re.split(r':', r)[0]
                                r = re.sub(
                                    prefix, self.clean_db_prefix(prefix), r)
                                r = re.sub(r'MGI\:MGI\:', 'MGI:', r)
                                assoc.add_source(r)
                                # experimental phenotypic evidence
                                assoc.add_evidence("ECO:0000059")
                        assoc.add_association_to_graph()
                        # TODO should the G2PAssoc be
                        # the evidence for the GO assoc?

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        return

Example #12

Show file

    def process_gaf(self, file, limit, id_map=None):

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        model = Model(g)
        geno = Genotype(g)
        logger.info("Processing Gene Associations from %s", file)
        line_counter = 0

        if 7955 in self.tax_ids:
            zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
        elif 6239 in self.tax_ids:
            wbase = WormBase(self.graph_type, self.are_bnodes_skized)

        with gzip.open(file, 'rb') as csvfile:
            filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                    delimiter='\t',
                                    quotechar='\"')
            for row in filereader:
                line_counter += 1
                # comments start with exclamation
                if re.match(r'!', ''.join(row)):
                    continue
                (db, gene_num, gene_symbol, qualifier, go_id, ref, eco_symbol,
                 with_or_from, aspect, gene_name, gene_synonym, object_type,
                 taxon, date, assigned_by, annotation_extension,
                 gene_product_form_id) = row

                # test for required fields
                if (db == '' or gene_num == '' or gene_symbol == ''
                        or go_id == '' or ref == '' or eco_symbol == ''
                        or aspect == '' or object_type == '' or taxon == ''
                        or date == '' or assigned_by == ''):
                    logger.error(
                        "Missing required part of annotation " +
                        "on row %d:\n" + '\t'.join(row), line_counter)
                    continue

                # deal with qualifier NOT, contributes_to, colocalizes_with
                if re.search(r'NOT', qualifier):
                    continue

                db = self.clean_db_prefix(db)
                uniprotid = None
                gene_id = None
                if db == 'UniProtKB':
                    mapped_ids = id_map.get(gene_num)
                    if id_map is not None and mapped_ids is not None:
                        if len(mapped_ids) == 1:
                            gene_id = mapped_ids[0]
                            uniprotid = ':'.join((db, gene_num))
                            gene_num = re.sub(r'\w+\:', '', gene_id)
                        elif len(mapped_ids) > 1:
                            # logger.warning(
                            #   "Skipping gene id mapped for >1 gene %s -> %s",
                            #    gene_num, str(mapped_ids))
                            continue
                    else:
                        continue
                elif db == 'MGI':
                    gene_num = re.sub(r'MGI:', '', gene_num)
                    gene_id = ':'.join((db, gene_num))
                    gene_id = re.sub(r'MGI\:MGI\:', 'MGI:', gene_id)
                else:
                    gene_id = ':'.join((db, gene_num))

                if self.testMode \
                        and not(
                            re.match(r'NCBIGene', gene_id) and
                            int(gene_num) in self.test_ids):
                    continue

                model.addClassToGraph(gene_id, gene_symbol)
                if gene_name != '':
                    model.addDescription(gene_id, gene_name)
                if gene_synonym != '':
                    for s in re.split(r'\|', gene_synonym):
                        model.addSynonym(gene_id, s.strip())
                if re.search(r'\|', taxon):
                    # TODO add annotations with >1 taxon
                    logger.info(">1 taxon (%s) on line %d.  skipping", taxon,
                                line_counter)
                else:
                    tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                    geno.addTaxon(tax_id, gene_id)

                assoc = Assoc(g, self.name)

                assoc.set_subject(gene_id)
                assoc.set_object(go_id)

                eco_id = self.map_go_evidence_code_to_eco(eco_symbol)
                if eco_id is not None:
                    assoc.add_evidence(eco_id)

                refs = re.split(r'\|', ref)
                for r in refs:
                    r = r.strip()
                    if r != '':
                        prefix = re.split(r':', r)[0]
                        r = re.sub(prefix, self.clean_db_prefix(prefix), r)
                        r = re.sub(r'MGI\:MGI\:', 'MGI:', r)
                        ref = Reference(g, r)
                        if re.match(r'PMID', r):
                            ref_type = Reference.ref_types['journal_article']
                            ref.setType(ref_type)
                        ref.addRefToGraph()
                        assoc.add_source(r)

                # TODO add the source of the annotations from assigned by?

                aspect_rel_map = {
                    'P': model.object_properties['involved_in'],  # involved in
                    'F': model.object_properties['enables'],  # enables
                    'C': model.object_properties['part_of']  # part of
                }

                if aspect not in aspect_rel_map:
                    logger.error("Aspect not recognized: %s", aspect)

                rel = aspect_rel_map.get(aspect)
                if aspect == 'F' and re.search(r'contributes_to', qualifier):
                    rel = model.object_properties['contributes_to']
                assoc.set_relationship(rel)
                if uniprotid is not None:
                    assoc.set_description('Mapped from ' + uniprotid)
                # object_type should be one of:
                # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
                # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
                # If the precise product type is unknown,
                # gene_product should be used

                assoc.add_association_to_graph()

                # Derive G2P Associations from IMP annotations
                # in version 2.1 Pipe will indicate 'OR'
                # and Comma will indicate 'AND'.
                # in version 2.0, multiple values are separated by pipes
                # where the pipe has been used to mean 'AND'
                if eco_symbol == 'IMP' and with_or_from != '':
                    withitems = re.split(r'\|', with_or_from)
                    phenotypeid = go_id + 'PHENOTYPE'
                    # create phenotype associations
                    for i in withitems:
                        if i == '' or \
                                re.match(
                                    r'(UniProtKB|WBPhenotype|InterPro|HGNC)',
                                    i):
                            logger.warning(
                                "Don't know what having a uniprot id " +
                                "in the 'with' column means of %s", uniprotid)
                            continue
                        i = re.sub(r'MGI\:MGI\:', 'MGI:', i)
                        i = re.sub(r'WB:', 'WormBase:', i)

                        # for worms and fish, they might give a RNAi or MORPH
                        # in these cases make a reagent-targeted gene
                        if re.search('MRPHLNO|CRISPR|TALEN', i):
                            targeted_gene_id = zfin.make_targeted_gene_id(
                                gene_id, i)
                            geno.addReagentTargetedGene(
                                i, gene_id, targeted_gene_id)
                            # TODO PYLINT why is this:
                            # Redefinition of assoc type from
                            # dipper.models.assoc.Association.Assoc to
                            # dipper.models.assoc.G2PAssoc.G2PAssoc
                            assoc = G2PAssoc(g, self.name, targeted_gene_id,
                                             phenotypeid)
                        elif re.search(r'WBRNAi', i):
                            targeted_gene_id = \
                                wbase.make_reagent_targeted_gene_id(
                                    gene_id, i)
                            geno.addReagentTargetedGene(
                                i, gene_id, targeted_gene_id)
                            assoc = G2PAssoc(g, self.name, targeted_gene_id,
                                             phenotypeid)
                        else:
                            assoc = G2PAssoc(g, self.name, i, phenotypeid)
                        for r in refs:
                            r = r.strip()
                            if r != '':
                                prefix = re.split(r':', r)[0]
                                r = re.sub(prefix,
                                           self.clean_db_prefix(prefix), r)
                                r = re.sub(r'MGI\:MGI\:', 'MGI:', r)
                                assoc.add_source(r)
                                # experimental phenotypic evidence
                                assoc.add_evidence("ECO:0000059")
                        assoc.add_association_to_graph()
                        # TODO should the G2PAssoc be
                        # the evidence for the GO assoc?

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        return