コード例 #1
0
ファイル: ClinVar.py プロジェクト: DoctorBud/dipper
    def _get_variants(self, limit):
        """
        Currently loops through the variant_summary file.

        :param limit:
        :return:

        """

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        model = Model(g)

        geno = Genotype(g)
        f = Feature(g, None, None, None)

        # add the taxon and the genome
        tax_num = '9606'  # HARDCODE
        tax_id = 'NCBITaxon:'+tax_num
        tax_label = 'Human'
        model.addClassToGraph(tax_id, None)
        geno.addGenome(tax_id, tax_label)  # label gets added elsewhere

        # not unzipping the file
        logger.info("Processing Variant records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['variant_summary']['file']))
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match(r'^#', line):
                    continue

                # AlleleID               integer value as stored in the AlleleID field in ClinVar  (//Measure/@ID in the XML)
                # Type                   character, the type of variation
                # Name                   character, the preferred name for the variation
                # GeneID                 integer, GeneID in NCBI's Gene database
                # GeneSymbol             character, comma-separated list of GeneIDs overlapping the variation
                # ClinicalSignificance   character, comma-separated list of values of clinical significance reported for this variation
                #                          for the mapping between the terms listed here and the integers in the .VCF files, see
                #                          http://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/
                # RS# (dbSNP)            integer, rs# in dbSNP
                # nsv (dbVar)            character, the NSV identifier for the region in dbVar
                # RCVaccession           character, list of RCV accessions that report this variant
                # TestedInGTR            character, Y/N for Yes/No if there is a test registered as specific to this variation in the NIH Genetic Testing Registry (GTR)
                # PhenotypeIDs           character, list of db names and identifiers for phenotype(s) reported for this variant
                # Origin                 character, list of all allelic origins for this variation
                # Assembly               character, name of the assembly on which locations are based
                # Chromosome             character, chromosomal location
                # Start                  integer, starting location, in pter->qter orientation
                # Stop                   integer, end location, in pter->qter orientation
                # Cytogenetic            character, ISCN band
                # ReviewStatus           character, highest review status for reporting this measure. For the key to the terms,
                #                            and their relationship to the star graphics ClinVar displays on its web pages,
                #                            see http://www.ncbi.nlm.nih.gov/clinvar/docs/variation_report/#interpretation
                # HGVS(c.)               character, RefSeq cDNA-based HGVS expression
                # HGVS(p.)               character, RefSeq protein-based HGVS expression
                # NumberSubmitters       integer, number of submissions with this variant
                # LastEvaluated          datetime, the latest time any submitter reported clinical significance
                # Guidelines             character, ACMG only right now, for the reporting of incidental variation in a Gene
                #                                (NOTE: if ACMG, not a specific to the allele but to the Gene)
                # OtherIDs               character, list of other identifiers or sources of information about this variant
                # VariantID              integer, the value used to build the URL for the current default report,
                #                            e.g. http://www.ncbi.nlm.nih.gov/clinvar/variation/1756/
                #

                # a crude check that there's an expected number of cols.
                # if not, error out because something changed.
                num_cols = len(line.split('\t'))
                expected_numcols = 29
                if num_cols != expected_numcols:
                    logger.error(
                        "Unexpected number of columns in raw file " +
                        "(%d actual vs %d expected)",
                        num_cols, expected_numcols)

                (allele_num, allele_type, allele_name, gene_num, gene_symbol,
                 clinical_significance, dbsnp_num, dbvar_num, rcv_nums,
                 tested_in_gtr, phenotype_ids, origin, assembly, chr, start,
                 stop, cytogenetic_loc, review_status, hgvs_c, hgvs_p,
                 number_of_submitters, last_eval, guidelines, other_ids,
                 variant_num, reference_allele, alternate_allele, categories,
                 ChromosomeAccession) = line.split('\t')

                # ###set filter=None in init if you don't want to have a filter
                # if self.filter is not None:
                #    if ((self.filter == 'taxids' and\
                #            (int(tax_num) not in self.tax_ids)) or\
                #            (self.filter == 'geneids' and\
                #             (int(gene_num) not in self.gene_ids))):
                #        continue
                # #### end filter

                line_counter += 1

                pheno_list = []
                if phenotype_ids != '-':
                    # trim any leading/trailing semicolons/commas
                    phenotype_ids = re.sub(r'^[;,]', '', phenotype_ids)
                    phenotype_ids = re.sub(r'[;,]$', '', phenotype_ids)
                    pheno_list = re.split(r'[,;]', phenotype_ids)

                if self.testMode:
                    # get intersection of test disease ids
                    # and these phenotype_ids
                    intersect = \
                        list(
                            set([str(i)
                                for i in self.disease_ids]) & set(pheno_list))
                    if int(gene_num) not in self.gene_ids and\
                            int(variant_num) not in self.variant_ids and\
                            len(intersect) < 1:
                        continue

                # TODO may need to switch on assembly to create correct
                # assembly/build identifiers
                build_id = ':'.join(('NCBIGenome', assembly))

                # make the reference genome build
                geno.addReferenceGenome(build_id, assembly, tax_id)

                allele_type_id = self._map_type_of_allele(allele_type)
                bandinbuild_id = None
                if str(chr) == '':
                    # check cytogenic location
                    if str(cytogenetic_loc).strip() != '':
                        # use cytogenic location to get the apx location
                        # oddly, they still put an assembly number even when
                        # there's no numeric location
                        if not re.search(r'-', str(cytogenetic_loc)):
                            band_id = makeChromID(
                                re.split(r'-', str(cytogenetic_loc)),
                                tax_num, 'CHR')
                            geno.addChromosomeInstance(
                                cytogenetic_loc, build_id, assembly, band_id)
                            bandinbuild_id = makeChromID(
                                re.split(r'-', str(cytogenetic_loc)),
                                assembly, 'MONARCH')
                        else:
                            # can't deal with ranges yet
                            pass
                else:
                    # add the human chromosome class to the graph,
                    # and add the build-specific version of it
                    chr_id = makeChromID(str(chr), tax_num, 'CHR')
                    geno.addChromosomeClass(str(chr), tax_id, tax_label)
                    geno.addChromosomeInstance(
                        str(chr), build_id, assembly, chr_id)
                    chrinbuild_id = makeChromID(str(chr), assembly, 'MONARCH')

                seqalt_id = ':'.join(('ClinVarVariant', variant_num))
                gene_id = None

                # they use -1 to indicate unknown gene
                if str(gene_num) != '-1' and str(gene_num) != 'more than 10':
                    if re.match(r'^Gene:', gene_num):
                        gene_num = "NCBI" + gene_num
                    else:
                        gene_id = ':'.join(('NCBIGene', str(gene_num)))

                # FIXME there are some "variants" that are actually haplotypes
                # probably will get taken care of when we switch to processing
                # the xml for example, variant_num = 38562
                # but there's no way to tell if it's a haplotype
                # in the csv data so the dbsnp or dbvar
                # should probably be primary,
                # and the variant num be the vslc,
                # with each of the dbsnps being added to it

                # TODO clinical significance needs to be mapped to
                # a list of terms
                # first, make the variant:
                f = Feature(seqalt_id, allele_name, allele_type_id)

                if start != '-' and start.strip() != '':
                    f.addFeatureStartLocation(start, chrinbuild_id)
                if stop != '-' and stop.strip() != '':
                    f.addFeatureEndLocation(stop, chrinbuild_id)

                f.addFeatureToGraph()
                f.addTaxonToFeature(tax_id)
                # make the ClinVarVariant the clique leader
                model.makeLeader(seqalt_id)

                if bandinbuild_id is not None:
                    f.addSubsequenceOfFeature(bandinbuild_id)

                # CHECK - this makes the assumption that there is
                # only one affected chromosome per variant what happens with
                # chromosomal rearrangement variants?
                # shouldn't both chromosomes be here?

                # add the hgvs as synonyms
                if hgvs_c != '-' and hgvs_c.strip() != '':
                    model.addSynonym(seqalt_id, hgvs_c)
                if hgvs_p != '-' and hgvs_p.strip() != '':
                    model.addSynonym(seqalt_id, hgvs_p)

                # add the dbsnp and dbvar ids as equivalent
                if dbsnp_num != '-' and int(dbsnp_num) != -1:
                    dbsnp_id = 'dbSNP:rs'+str(dbsnp_num)
                    model.addIndividualToGraph(dbsnp_id, None)
                    model.addSameIndividual(seqalt_id, dbsnp_id)
                if dbvar_num != '-':
                    dbvar_id = 'dbVar:'+dbvar_num
                    model.addIndividualToGraph(dbvar_id, None)
                    model.addSameIndividual(seqalt_id, dbvar_id)

                # TODO - not sure if this is right... add as xref?
                # the rcv is like the combo of the phenotype with the variant
                if rcv_nums != '-':
                    for rcv_num in re.split(r';', rcv_nums):
                        rcv_id = 'ClinVar:' + rcv_num
                        model.addIndividualToGraph(rcv_id, None)
                        model.addXref(seqalt_id, rcv_id)

                if gene_id is not None:
                    # add the gene
                    model.addClassToGraph(gene_id, gene_symbol)
                    # make a variant locus
                    vl_id = '_'+gene_num+'-'+variant_num
                    if self.nobnodes:
                        vl_id = ':'+vl_id
                    vl_label = allele_name
                    model.addIndividualToGraph(
                        vl_id, vl_label, geno.genoparts['variant_locus'])
                    geno.addSequenceAlterationToVariantLocus(seqalt_id, vl_id)
                    geno.addAlleleOfGene(vl_id, gene_id)
                else:
                    # some basic reporting
                    gmatch = re.search(r'\(\w+\)', allele_name)
                    if gmatch is not None and len(gmatch.groups()) > 0:
                        logger.info(
                            "Gene found in allele label, but no id provided: %s",
                            gmatch.group(1))
                    elif re.match(r'more than 10', gene_symbol):
                        logger.info(
                            "More than 10 genes found; "
                            "need to process XML to fetch (variant=%d)",
                            int(variant_num))
                    else:
                        logger.info(
                            "No gene listed for variant %d",
                            int(variant_num))

                # parse the list of "phenotypes" which are diseases.
                # add them as an association
                # ;GeneReviews:NBK1440,MedGen:C0392514,OMIM:235200,SNOMED CT:35400008;MedGen:C3280096,OMIM:614193;MedGen:CN034317,OMIM:612635;MedGen:CN169374
                # the list is both semicolon delimited and comma delimited,
                # but i don't know why! some are bad, like:
                # Orphanet:ORPHA ORPHA319705,SNOMED CT:49049000
                if phenotype_ids != '-':
                    for phenotype in pheno_list:
                        m = re.match(
                            r"(Orphanet:ORPHA(?:\s*ORPHA)?)", phenotype)
                        if m is not None and len(m.groups()) > 0:
                            phenotype = re.sub(
                                m.group(1), 'Orphanet:', phenotype.strip())
                        elif re.match(r'ORPHA:\d+', phenotype):
                            phenotype = re.sub(
                                r'^ORPHA', 'Orphanet', phenotype.strip())
                        elif re.match(r'Human Phenotype Ontology', phenotype):
                            phenotype = re.sub(
                                r'^Human Phenotype Ontology', '',
                                phenotype.strip())
                        elif re.match(r'SNOMED CT:\s?', phenotype):
                            phenotype = re.sub(
                                r'SNOMED CT:\s?', 'SNOMED:', phenotype.strip())
                        elif re.match(r'^Gene:', phenotype):
                            continue

                        assoc = G2PAssoc(
                            g, self.name, seqalt_id, phenotype.strip())
                        assoc.add_association_to_graph()

                if other_ids != '-':
                    id_list = other_ids.split(',')
                    # process the "other ids" ex:
                    # CFTR2:F508del,HGMD:CD890142,OMIM Allelic Variant:602421.0001
                    # TODO make more xrefs
                    for xrefid in id_list:
                        prefix = xrefid.split(':')[0].strip()
                        if prefix == 'OMIM Allelic Variant':
                            xrefid = 'OMIM:'+xrefid.split(':')[1]
                            model.addIndividualToGraph(xrefid, None)
                            model.addSameIndividual(seqalt_id, xrefid)
                        elif prefix == 'HGMD':
                            model.addIndividualToGraph(xrefid, None)
                            model.addSameIndividual(seqalt_id, xrefid)
                        elif prefix == 'dbVar' \
                                and dbvar_num == xrefid.split(':')[1].strip():
                            pass  # skip over this one
                        elif re.search(r'\s', prefix):
                            pass
                            # logger.debug(
                            #   'xref prefix has a space: %s', xrefid)
                        else:
                            # should be a good clean prefix
                            # note that HGMD variants are in here as Xrefs
                            # because we can't resolve URIs for them
                            # logger.info("Adding xref: %s", xrefid)
                            # gu.addXref(g, seqalt_id, xrefid)
                            # logger.info("xref prefix to add: %s", xrefid)
                            pass

                if not self.testMode and limit is not None \
                        and line_counter > limit:
                    break

        logger.info("Finished parsing variants")

        return
コード例 #2
0
ファイル: WormBase.py プロジェクト: JervenBolleman/dipper
    def process_allele_phenotype(self, limit=None):
        """
        This file compactly lists variant to phenotype associations,
        such that in a single row, there may be >1 variant listed
        per phenotype and paper.  This indicates that each variant is
        individually assocated with the given phenotype,
        as listed in 1+ papers.
        (Not that the combination of variants is producing the phenotype.)
        :param limit:
        :return:

        """

        raw = '/'.join((self.rawdir, self.files['allele_pheno']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        # gu = GraphUtils(curie_map.get())  # TODO unused

        logger.info("Processing Allele phenotype associations")
        line_counter = 0
        geno = Genotype(g)
        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                if re.match(r'!', ''.join(row)):  # header
                    continue
                line_counter += 1
                (db, gene_num, gene_symbol, is_not, phenotype_id, ref,
                 eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
                 gene_class, taxon, date, assigned_by, blank, blank2) = row

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                # TODO add NOT phenotypes
                if is_not == 'NOT':
                    continue

                eco_id = None
                if eco_symbol == 'IMP':
                    eco_id = 'ECO:0000015'
                elif eco_symbol.strip() != '':
                    logger.warning(
                        "Encountered an ECO code we don't have: %s",
                        eco_symbol)

                # according to the GOA spec, persons are not allowed to be
                # in the reference column, therefore they the variant and
                # persons are swapped between the reference and with column.
                # we unswitch them here.
                temp_var = temp_ref = None
                if re.search(r'WBVar|WBRNAi', ref):
                    temp_var = ref
                    # move the paper from the with column into the ref
                if re.search(r'WBPerson', with_or_from):
                    temp_ref = with_or_from
                if temp_var is not None or temp_ref is not None:
                    with_or_from = temp_var
                    ref = temp_ref

                allele_list = re.split(r'\|', with_or_from)
                if len(allele_list) == 0:
                    logger.error(
                        "Missing alleles from phenotype assoc at line %d",
                        line_counter)
                    continue
                else:
                    for a in allele_list:
                        allele_num = re.sub(r'WB:', '', a.strip())
                        allele_id = 'WormBase:'+allele_num
                        gene_id = 'WormBase:'+gene_num

                        if re.search(r'WBRNAi', allele_id):
                            # make the reagent-targeted gene,
                            # & annotate that instead of the RNAi item directly
                            rnai_num = re.sub(r'WormBase:', '', allele_id)
                            rnai_id = allele_id
                            rtg_id = self.make_reagent_targeted_gene_id(
                                gene_num, rnai_num, self.nobnodes)
                            geno.addReagentTargetedGene(
                                rnai_id, 'WormBase:'+gene_num, rtg_id)
                            geno.addGeneTargetingReagent(
                                rnai_id, None, geno.genoparts['RNAi_reagent'],
                                gene_id)
                            allele_id = rtg_id
                        elif re.search(r'WBVar', allele_id):
                            # this may become deprecated by using wormmine
                            # make the allele to gene relationship
                            # the WBVars are really sequence alterations

                            # the public name will come from elsewhere
                            geno.addSequenceAlteration(allele_id, None)
                            vl_id = '_'+'-'.join((gene_num, allele_num))
                            if self.nobnodes:
                                vl_id = ':'+vl_id
                            geno.addSequenceAlterationToVariantLocus(
                                allele_id, vl_id)
                            geno.addAlleleOfGene(vl_id, gene_id)
                        else:
                            logger.warning(
                                "Some kind of allele I don't recognize: %s",
                                allele_num)
                            continue
                        assoc = G2PAssoc(self.name, allele_id, phenotype_id)

                        if eco_id is not None:
                            assoc.add_evidence(eco_id)

                        if ref is not None and ref != '':
                            ref = re.sub(r'(WB:|WB_REF:)', 'WormBase:', ref)
                            r = Reference(ref)
                            if re.search(r'Person', ref):
                                r.setType(r.ref_types['person'])
                                # also add
                                # inferred from background scientific knowledge
                                assoc.add_evidence('ECO:0000001')
                            r.addRefToGraph(g)
                            assoc.add_source(ref)

                        assoc.add_association_to_graph(g)

                        # finish looping through all alleles

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return
コード例 #3
0
    def process_allele_phenotype(self, limit=None):
        """
        This file compactly lists variant to phenotype associations,
        such that in a single row, there may be >1 variant listed
        per phenotype and paper.  This indicates that each variant is
        individually assocated with the given phenotype,
        as listed in 1+ papers.
        (Not that the combination of variants is producing the phenotype.)
        :param limit:
        :return:

        """

        raw = '/'.join((self.rawdir, self.files['allele_pheno']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        logger.info("Processing Allele phenotype associations")
        line_counter = 0
        geno = Genotype(g)
        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                if re.match(r'!', ''.join(row)):  # header
                    continue
                line_counter += 1
                (db, gene_num, gene_symbol, is_not, phenotype_id, ref,
                 eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
                 gene_class, taxon, date, assigned_by, blank, blank2) = row

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                # TODO add NOT phenotypes
                if is_not == 'NOT':
                    continue

                eco_id = None
                if eco_symbol == 'IMP':
                    eco_id = 'ECO:0000015'
                elif eco_symbol.strip() != '':
                    logger.warning("Encountered an ECO code we don't have: %s",
                                   eco_symbol)

                # according to the GOA spec, persons are not allowed to be
                # in the reference column, therefore they the variant and
                # persons are swapped between the reference and with column.
                # we unswitch them here.
                temp_var = temp_ref = None
                if re.search(r'WBVar|WBRNAi', ref):
                    temp_var = ref
                    # move the paper from the with column into the ref
                if re.search(r'WBPerson', with_or_from):
                    temp_ref = with_or_from
                if temp_var is not None or temp_ref is not None:
                    with_or_from = temp_var
                    ref = temp_ref

                allele_list = re.split(r'\|', with_or_from)
                if len(allele_list) == 0:
                    logger.error(
                        "Missing alleles from phenotype assoc at line %d",
                        line_counter)
                    continue
                else:
                    for a in allele_list:
                        allele_num = re.sub(r'WB:', '', a.strip())
                        allele_id = 'WormBase:' + allele_num
                        gene_id = 'WormBase:' + gene_num

                        if re.search(r'WBRNAi', allele_id):
                            # make the reagent-targeted gene,
                            # & annotate that instead of the RNAi item directly
                            rnai_num = re.sub(r'WormBase:', '', allele_id)
                            rnai_id = allele_id
                            rtg_id = self.make_reagent_targeted_gene_id(
                                gene_num, rnai_num)
                            geno.addReagentTargetedGene(
                                rnai_id, 'WormBase:' + gene_num, rtg_id)
                            geno.addGeneTargetingReagent(
                                rnai_id, None, geno.genoparts['RNAi_reagent'],
                                gene_id)
                            allele_id = rtg_id
                        elif re.search(r'WBVar', allele_id):
                            # this may become deprecated by using wormmine
                            # make the allele to gene relationship
                            # the WBVars are really sequence alterations

                            # the public name will come from elsewhere
                            geno.addSequenceAlteration(allele_id, None)
                            vl_id = '_:' + '-'.join((gene_num, allele_num))
                            geno.addSequenceAlterationToVariantLocus(
                                allele_id, vl_id)
                            geno.addAlleleOfGene(vl_id, gene_id)
                        else:
                            logger.warning(
                                "Some kind of allele I don't recognize: %s",
                                allele_num)
                            continue
                        assoc = G2PAssoc(g, self.name, allele_id, phenotype_id)

                        if eco_id is not None:
                            assoc.add_evidence(eco_id)

                        if ref is not None and ref != '':
                            ref = re.sub(r'(WB:|WB_REF:)', 'WormBase:', ref)
                            reference = Reference(g, ref)
                            if re.search(r'Person', ref):
                                reference.setType(
                                    reference.ref_types['person'])
                                # also add
                                # inferred from background scientific knowledge
                                assoc.add_evidence('ECO:0000001')
                            reference.addRefToGraph()
                            assoc.add_source(ref)

                        assoc.add_association_to_graph()

                        # finish looping through all alleles

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return
コード例 #4
0
    def _process_data(self, raw, limit=None):
        LOG.info("Processing Data from %s", raw)

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        geno = Genotype(graph)

        # Add the taxon as a class
        taxon_id = self.globaltt['Mus musculus']
        model.addClassToGraph(taxon_id, None)

        # with open(raw, 'r', encoding="utf8") as csvfile:
        col = self.files['all']['columns']
        with gzip.open(raw, 'rt') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            row = next(reader)  # presumed header
            if not self.check_fileheader(col, row):
                pass

            for row in reader:
                # | head -1 | tr ',' '\n' | sed "s|\(.*\)|# \1 = row[col.index('\1')]|g"
                marker_accession_id = row[col.index('marker_accession_id')].strip()
                marker_symbol = row[col.index('marker_symbol')].strip()
                phenotyping_center = row[col.index('phenotyping_center')].strip()
                colony_raw = row[col.index('colony_id')].strip()
                sex = row[col.index('sex')].strip()
                zygosity = row[col.index('zygosity')].strip()
                allele_accession_id = row[col.index('allele_accession_id')].strip()
                allele_symbol = row[col.index('allele_symbol')].strip()
                # allele_name = row[col.index('allele_name')]
                strain_accession_id = row[col.index('strain_accession_id')].strip()
                strain_name = row[col.index('strain_name')].strip()
                # project_name = row[col.index('project_name')]
                project_fullname = row[col.index('project_fullname')].strip()
                pipeline_name = row[col.index('pipeline_name')].strip()
                pipeline_stable_id = row[col.index('pipeline_stable_id')].strip()
                procedure_stable_id = row[col.index('procedure_stable_id')].strip()
                procedure_name = row[col.index('procedure_name')].strip()
                parameter_stable_id = row[col.index('parameter_stable_id')].strip()
                parameter_name = row[col.index('parameter_name')].strip()
                # top_level_mp_term_id = row[col.index('top_level_mp_term_id')]
                # top_level_mp_term_name = row[col.index('top_level_mp_term_name')]
                mp_term_id = row[col.index('mp_term_id')].strip()
                mp_term_name = row[col.index('mp_term_name')].strip()
                p_value = row[col.index('p_value')].strip()
                percentage_change = row[col.index('percentage_change')].strip()
                effect_size = row[col.index('effect_size')].strip()
                statistical_method = row[col.index('statistical_method')].strip()
                resource_name = row[col.index('resource_name')].strip()

                if self.test_mode and marker_accession_id not in self.gene_ids:
                    continue

                # ##### cleanup some of the identifiers ######
                zygosity = zygosity.strip()
                zygosity_id = self.resolve(zygosity)
                if zygosity_id == zygosity:
                    LOG.warning(
                        "Zygosity '%s' unmapped. detting to indeterminate", zygosity)
                    zygosity_id = self.globaltt['indeterminate']

                # colony ids sometimes have <> in them, spaces,
                # or other non-alphanumerics and break our system;
                # replace these with underscores
                colony_id = '_:' + re.sub(r'\W+', '_', colony_raw)

                if not re.match(r'MGI', allele_accession_id):
                    allele_accession_id = '_:IMPC-'+re.sub(
                        r':', '', allele_accession_id)

                if re.search(r'EUROCURATE', strain_accession_id):
                    # the eurocurate links don't resolve at IMPC
                    # TODO blank nodes do not maintain identifiers
                    strain_accession_id = '_:' + strain_accession_id

                elif not re.match(r'MGI', strain_accession_id):
                    LOG.info(
                        "Found a strange strain accession...%s", strain_accession_id)
                    strain_accession_id = 'IMPC:'+strain_accession_id

                ######################
                # first, add the marker and variant to the graph as with MGI,
                # the allele is the variant locus.  IF the marker is not known,
                # we will call it a sequence alteration.  otherwise,
                # we will create a BNode for the sequence alteration.
                sequence_alteration_id = variant_locus_id = None
                variant_locus_name = sequence_alteration_name = None

                # extract out what's within the <> to get the symbol
                if re.match(r'.*<.*>', allele_symbol):
                    sequence_alteration_name = re.match(
                        r'.*<(.*)>', allele_symbol)
                    if sequence_alteration_name is not None:
                        sequence_alteration_name = sequence_alteration_name.group(1)
                else:
                    sequence_alteration_name = allele_symbol

                if marker_accession_id is not None and marker_accession_id == '':
                    LOG.warning("Marker unspecified on row %d", reader.line_num)
                    marker_accession_id = None

                if marker_accession_id is not None:
                    variant_locus_id = allele_accession_id
                    variant_locus_name = allele_symbol
                    variant_locus_type = self.globaltt['variant_locus']
                    geno.addGene(
                        marker_accession_id, marker_symbol, self.globaltt['gene'])

                    geno.addAllele(
                        variant_locus_id, variant_locus_name, variant_locus_type, None)
                    geno.addAlleleOfGene(variant_locus_id, marker_accession_id)

                    # TAG bnode
                    sequence_alteration_id = '_:seqalt' + re.sub(
                        r':', '', allele_accession_id)
                    geno.addSequenceAlterationToVariantLocus(
                        sequence_alteration_id, variant_locus_id)

                else:
                    sequence_alteration_id = allele_accession_id

                # IMPC contains targeted mutations with either gene traps,
                # knockouts, insertion/intragenic deletions.
                # but I don't really know what the SeqAlt is here,
                # so I don't add it.
                geno.addSequenceAlteration(
                    sequence_alteration_id, sequence_alteration_name)

                # #############    BUILD THE COLONY    #############
                # First, let's describe the colony that the animals come from
                # The Colony ID refers to the ES cell clone
                #   used to generate a mouse strain.
                # Terry sez: we use this clone ID to track
                #   ES cell -> mouse strain -> mouse phenotyping.
                # The same ES clone maybe used at multiple centers,
                # so we have to concatenate the two to have a unique ID.
                # some useful reading about generating mice from ES cells:
                # http://ki.mit.edu/sbc/escell/services/details

                # here, we'll make a genotype
                # that derives from an ES cell with a given allele.
                # the strain is not really attached to the colony.

                # the colony/clone is reflective of the allele,  with unknown zygosity

                stem_cell_class = self.globaltt['embryonic stem cell line']

                if colony_id is None:
                    print(colony_raw, stem_cell_class, "\nline:\t", reader.line_num)
                model.addIndividualToGraph(colony_id, colony_raw, stem_cell_class)

                # vslc of the colony has unknown zygosity
                # note that we will define the allele
                # (and it's relationship to the marker, etc.) later
                # FIXME is it really necessary to create this vslc
                # when we always know it's unknown zygosity?
                vslc_colony = '_:'+re.sub(
                    r':', '', allele_accession_id + self.globaltt['indeterminate'])
                vslc_colony_label = allele_symbol + '/<?>'
                # for ease of reading, we make the colony genotype variables.
                # in the future, it might be desired to keep the vslcs
                colony_genotype_id = vslc_colony
                colony_genotype_label = vslc_colony_label
                geno.addGenotype(colony_genotype_id, colony_genotype_label)
                geno.addParts(
                    allele_accession_id, colony_genotype_id,
                    self.globaltt['has_variant_part'])

                geno.addPartsToVSLC(
                    vslc_colony, allele_accession_id, None,
                    self.globaltt['indeterminate'], self.globaltt['has_variant_part'])
                graph.addTriple(
                    colony_id, self.globaltt['has_genotype'], colony_genotype_id)

                # ##########    BUILD THE ANNOTATED GENOTYPE    ##########
                # now, we'll build the genotype of the individual that derives
                # from the colony/clone genotype that is attached to
                # phenotype = colony_id + strain + zygosity + sex
                # (and is derived from a colony)

                # this is a sex-agnostic genotype
                genotype_id = self.make_id(
                    (colony_id + phenotyping_center + zygosity + strain_accession_id))
                geno.addSequenceDerivesFrom(genotype_id, colony_id)

                # build the VSLC of the sex-agnostic genotype
                # based on the zygosity
                allele1_id = allele_accession_id
                allele2_id = allele2_rel = None
                allele1_label = allele_symbol
                allele2_label = '<?>'
                # Making VSLC labels from the various parts,
                # can change later if desired.
                if zygosity == 'heterozygote':
                    allele2_label = re.sub(r'<.*', '<+>', allele1_label)
                    allele2_id = None
                elif zygosity == 'homozygote':
                    allele2_label = allele1_label
                    allele2_id = allele1_id
                    allele2_rel = self.globaltt['has_variant_part']
                elif zygosity == 'hemizygote':
                    allele2_label = re.sub(r'<.*', '<0>', allele1_label)
                    allele2_id = None
                elif zygosity == 'not_applicable':
                    allele2_label = re.sub(r'<.*', '<?>', allele1_label)
                    allele2_id = None
                else:
                    LOG.warning("found unknown zygosity %s", zygosity)
                    break
                vslc_name = '/'.join((allele1_label, allele2_label))

                # Add the VSLC
                vslc_id = '-'.join(
                    (marker_accession_id, allele_accession_id, zygosity))
                vslc_id = re.sub(r':', '', vslc_id)
                vslc_id = '_:'+vslc_id
                model.addIndividualToGraph(
                    vslc_id, vslc_name,
                    self.globaltt['variant single locus complement'])
                geno.addPartsToVSLC(
                    vslc_id, allele1_id, allele2_id, zygosity_id,
                    self.globaltt['has_variant_part'], allele2_rel)

                # add vslc to genotype
                geno.addVSLCtoParent(vslc_id, genotype_id)

                # note that the vslc is also the gvc
                model.addType(vslc_id, self.globaltt['genomic_variation_complement'])

                # Add the genomic background
                # create the genomic background id and name
                if strain_accession_id != '':
                    genomic_background_id = strain_accession_id
                else:
                    genomic_background_id = None

                genotype_name = vslc_name
                if genomic_background_id is not None:
                    geno.addGenotype(
                        genomic_background_id, strain_name,
                        self.globaltt['genomic_background'])

                    # make a phenotyping-center-specific strain
                    # to use as the background
                    pheno_center_strain_label = strain_name + '-' + phenotyping_center \
                        + '-' + colony_raw
                    pheno_center_strain_id = '-'.join((
                        re.sub(r':', '', genomic_background_id),
                        re.sub(r'\s', '_', phenotyping_center),
                        re.sub(r'\W+', '', colony_raw)))
                    if not re.match(r'^_', pheno_center_strain_id):
                        # Tag bnode
                        pheno_center_strain_id = '_:' + pheno_center_strain_id

                    geno.addGenotype(
                        pheno_center_strain_id, pheno_center_strain_label,
                        self.globaltt['genomic_background'])
                    geno.addSequenceDerivesFrom(
                        pheno_center_strain_id, genomic_background_id)

                    # Making genotype labels from the various parts,
                    # can change later if desired.
                    # since the genotype is reflective of the place
                    # it got made, should put that in to disambiguate
                    genotype_name = \
                        genotype_name + ' [' + pheno_center_strain_label + ']'
                    geno.addGenomicBackgroundToGenotype(
                        pheno_center_strain_id, genotype_id)
                    geno.addTaxon(taxon_id, pheno_center_strain_id)
                # this is redundant, but i'll keep in in for now
                geno.addSequenceDerivesFrom(genotype_id, colony_id)
                geno.addGenotype(genotype_id, genotype_name)

                # Make the sex-qualified genotype,
                # which is what the phenotype is associated with
                sex_qualified_genotype_id = \
                    self.make_id((
                        colony_id + phenotyping_center + zygosity +
                        strain_accession_id + sex))
                sex_qualified_genotype_label = genotype_name + ' (' + sex + ')'

                sq_type_id = self.resolve(sex, False)

                if sq_type_id == sex:
                    sq_type_id = self.globaltt['intrinsic_genotype']
                    LOG.warning(
                        "Unknown sex qualifier %s, adding as intrinsic_genotype",
                        sex)

                geno.addGenotype(
                    sex_qualified_genotype_id, sex_qualified_genotype_label, sq_type_id)
                geno.addParts(
                    genotype_id, sex_qualified_genotype_id,
                    self.globaltt['has_variant_part'])

                if genomic_background_id is not None and genomic_background_id != '':
                    # Add the taxon to the genomic_background_id
                    geno.addTaxon(taxon_id, genomic_background_id)
                else:
                    # add it as the genomic background
                    geno.addTaxon(taxon_id, genotype_id)

                # #############    BUILD THE G2P ASSOC    #############
                # from an old email dated July 23 2014:
                # Phenotypes associations are made to
                # imits colony_id+center+zygosity+gender

                # sometimes phenotype ids are missing.  (about 711 early 2020)
                if mp_term_id is None or mp_term_id == '':
                    LOG.warning(
                        "No phenotype id specified for row %d", reader.line_num)
                    continue
                # hard coded ECO code
                eco_id = self.globaltt['mutant phenotype evidence']

                # the association comes as a result of a g2p from
                # a procedure in a pipeline at a center and parameter tested

                assoc = G2PAssoc(
                    graph, self.name, sex_qualified_genotype_id, mp_term_id)
                assoc.add_evidence(eco_id)
                # assoc.set_score(float(p_value))

                # TODO add evidence instance using
                # pipeline_stable_id +
                # procedure_stable_id +
                # parameter_stable_id

                assoc.add_association_to_graph()
                assoc_id = assoc.get_association_id()

                model._addSexSpecificity(assoc_id, self.resolve(sex))

                # add a free-text description
                try:
                    description = ' '.join((
                        mp_term_name, 'phenotype determined by', phenotyping_center,
                        'in an', procedure_name, 'assay where', parameter_name.strip(),
                        'was measured with an effect_size of',
                        str(round(float(effect_size), 5)),
                        '(p =', "{:.4e}".format(float(p_value)), ').'))
                except ValueError:
                    description = ' '.join((
                        mp_term_name, 'phenotype determined by', phenotyping_center,
                        'in an', procedure_name, 'assay where', parameter_name.strip(),
                        'was measured with an effect_size of', str(effect_size),
                        '(p =', "{0}".format(p_value), ').'))

                study_bnode = self._add_study_provenance(
                    phenotyping_center, colony_raw, project_fullname, pipeline_name,
                    pipeline_stable_id, procedure_stable_id, procedure_name,
                    parameter_stable_id, parameter_name, statistical_method,
                    resource_name)

                evidence_line_bnode = self._add_evidence(
                    assoc_id, eco_id, p_value, percentage_change, effect_size,
                    study_bnode)

                self._add_assertion_provenance(assoc_id, evidence_line_bnode)

                model.addDescription(evidence_line_bnode, description)

                # resource_id = resource_name
                # assoc.addSource(graph, assoc_id, resource_id)

                if not self.test_mode and limit is not None and reader.line_num > limit:
                    break
コード例 #5
0
ファイル: AnimalQTLdb.py プロジェクト: TomConlin/dipper
    def _process_qtls_genetic_location(
            self, raw, txid, common_name, limit=None):
        """
        This function processes

        Triples created:

        :param limit:
        :return:

        """
        aql_curie = self.files[common_name + '_cm']['curie']

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        line_counter = 0
        geno = Genotype(graph)
        model = Model(graph)
        eco_id = self.globaltt['quantitative trait analysis evidence']

        taxon_curie = 'NCBITaxon:' + txid

        LOG.info("Processing genetic location for %s from %s", taxon_curie, raw)
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (qtl_id,
                 qtl_symbol,
                 trait_name,
                 assotype,
                 empty,
                 chromosome,
                 position_cm,
                 range_cm,
                 flankmark_a2,
                 flankmark_a1,
                 peak_mark,
                 flankmark_b1,
                 flankmark_b2,
                 exp_id,
                 model_id,
                 test_base,
                 sig_level,
                 lod_score,
                 ls_mean,
                 p_values,
                 f_statistics,
                 variance,
                 bayes_value,
                 likelihood_ratio,
                 trait_id, dom_effect,
                 add_effect,
                 pubmed_id,
                 gene_id,
                 gene_id_src,
                 gene_id_type,
                 empty2) = row

                if self.test_mode and int(qtl_id) not in self.test_ids:
                    continue

                qtl_id = common_name + 'QTL:' + qtl_id.strip()
                trait_id = ':'.join((aql_curie, trait_id.strip()))

                # Add QTL to graph
                feature = Feature(graph, qtl_id, qtl_symbol, self.globaltt['QTL'])
                feature.addTaxonToFeature(taxon_curie)

                # deal with the chromosome
                chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')

                # add a version of the chromosome which is defined as
                # the genetic map
                build_id = 'MONARCH:'+common_name.strip()+'-linkage'
                build_label = common_name+' genetic map'
                geno.addReferenceGenome(build_id, build_label, taxon_curie)
                chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
                geno.addChromosomeInstance(
                    chromosome, build_id, build_label, chrom_id)
                start = stop = None
                # range_cm sometimes ends in "(Mb)"  (i.e pig 2016 Nov)
                range_mb = re.split(r'\(', range_cm)
                if range_mb is not None:
                    range_cm = range_mb[0]

                if re.search(r'[0-9].*-.*[0-9]', range_cm):
                    range_parts = re.split(r'-', range_cm)

                    # check for poorly formed ranges
                    if len(range_parts) == 2 and\
                            range_parts[0] != '' and range_parts[1] != '':
                        (start, stop) = [
                            int(float(x.strip())) for x in re.split(r'-', range_cm)]
                    else:
                        LOG.info(
                            "A cM range we can't handle for QTL %s: %s",
                            qtl_id, range_cm)
                elif position_cm != '':
                    match = re.match(r'([0-9]*\.[0-9]*)', position_cm)
                    if match is not None:
                        position_cm = match.group()
                        start = stop = int(float(position_cm))

                # FIXME remove converion to int for start/stop
                # when schema can handle floats add in the genetic location
                # based on the range
                feature.addFeatureStartLocation(
                    start, chrom_in_build_id, None,
                    [self.globaltt['FuzzyPosition']])
                feature.addFeatureEndLocation(
                    stop, chrom_in_build_id, None,
                    [self.globaltt['FuzzyPosition']])
                feature.addFeatureToGraph()

                # sometimes there's a peak marker, like a rsid.
                # we want to add that as a variant of the gene,
                # and xref it to the qtl.
                dbsnp_id = None
                if peak_mark != '' and peak_mark != '.' and \
                        re.match(r'rs', peak_mark.strip()):
                    dbsnp_id = 'dbSNP:'+peak_mark.strip()

                    model.addIndividualToGraph(
                        dbsnp_id, None,
                        self.globaltt['sequence_alteration'])
                    model.addXref(qtl_id, dbsnp_id)

                gene_id = gene_id.replace('uncharacterized ', '').strip()
                if gene_id is not None and gene_id != '' and gene_id != '.'\
                        and re.fullmatch(r'[^ ]*', gene_id) is not None:

                    # we assume if no src is provided and gene_id is an integer,
                    # then it is an NCBI gene ... (okay, lets crank that back a notch)
                    if gene_id_src == '' and gene_id.isdigit() and \
                            gene_id in self.gene_info:
                        # LOG.info(
                        #    'Warm & Fuzzy saying %s is a NCBI gene for %s',
                        #    gene_id, common_name)
                        gene_id_src = 'NCBIgene'
                    elif gene_id_src == '' and gene_id.isdigit():
                        LOG.warning(
                            'Cold & Prickely saying %s is a NCBI gene for %s',
                            gene_id, common_name)
                        gene_id_src = 'NCBIgene'
                    elif gene_id_src == '':
                        LOG.error(
                            ' "%s" is a NOT NCBI gene for %s', gene_id, common_name)
                        gene_id_src = None

                    if gene_id_src == 'NCBIgene':
                        gene_id = 'NCBIGene:' + gene_id
                        # we will expect that these will get labels elsewhere
                        geno.addGene(gene_id, None)
                        # FIXME what is the right relationship here?
                        geno.addAffectedLocus(qtl_id, gene_id)

                        if dbsnp_id is not None:
                            # add the rsid as a seq alt of the gene_id
                            vl_id = '_:' + re.sub(
                                r':', '', gene_id) + '-' + peak_mark.strip()
                            geno.addSequenceAlterationToVariantLocus(
                                dbsnp_id, vl_id)
                            geno.addAffectedLocus(vl_id, gene_id)

                # add the trait
                model.addClassToGraph(trait_id, trait_name)

                # Add publication
                reference = None
                if re.match(r'ISU.*', pubmed_id):
                    pub_id = 'AQTLPub:'+pubmed_id.strip()
                    reference = Reference(graph, pub_id)
                elif pubmed_id != '':
                    pub_id = 'PMID:' + pubmed_id.strip()
                    reference = Reference(
                        graph, pub_id, self.globaltt['journal article'])

                if reference is not None:
                    reference.addRefToGraph()

                # make the association to the QTL
                assoc = G2PAssoc(
                    graph, self.name, qtl_id, trait_id, self.globaltt['is marker for'])
                assoc.add_evidence(eco_id)
                assoc.add_source(pub_id)

                # create a description from the contents of the file
                # desc = ''

                # assoc.addDescription(g, assoc_id, desc)

                # TODO add exp_id as evidence
                # if exp_id != '':
                #     exp_id = 'AQTLExp:'+exp_id
                #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                if p_values != '':
                    scr = re.sub(r'<', '', p_values)
                    scr = re.sub(r',', '.', scr)  # international notation
                    if scr.isnumeric():
                        score = float(scr)
                        assoc.set_score(score)  # todo add score type
                # TODO add LOD score?
                assoc.add_association_to_graph()

                # make the association to the dbsnp_id, if found
                if dbsnp_id is not None:
                    # make the association to the dbsnp_id
                    assoc = G2PAssoc(
                        graph, self.name, dbsnp_id, trait_id,
                        self.globaltt['is marker for'])
                    assoc.add_evidence(eco_id)
                    assoc.add_source(pub_id)

                    # create a description from the contents of the file
                    # desc = ''
                    # assoc.addDescription(g, assoc_id, desc)

                    # TODO add exp_id
                    # if exp_id != '':
                    #     exp_id = 'AQTLExp:'+exp_id
                    #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                    if p_values != '':
                        scr = re.sub(r'<', '', p_values)
                        scr = re.sub(r',', '.', scr)
                        if scr.isnumeric():
                            score = float(scr)
                            assoc.set_score(score)  # todo add score type
                    # TODO add LOD score?

                    assoc.add_association_to_graph()

                if not self.test_mode and limit is not None and line_counter > limit:
                    break

        LOG.info("Done with QTL genetic info")
        return
コード例 #6
0
ファイル: ClinVar.py プロジェクト: putmantime/dipper
    def _get_variants(self, limit):
        """
        Currently loops through the variant_summary file.

        :param limit:
        :return:

        """

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        model = Model(g)

        geno = Genotype(g)
        f = Feature(g, None, None, None)

        # add the taxon and the genome
        tax_num = '9606'  # HARDCODE
        tax_id = 'NCBITaxon:' + tax_num
        tax_label = 'Human'
        model.addClassToGraph(tax_id, None)
        geno.addGenome(tax_id, tax_label)  # label gets added elsewhere

        # not unzipping the file
        logger.info("Processing Variant records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['variant_summary']['file']))
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match(r'^#', line):
                    continue

                # AlleleID               integer value as stored in the AlleleID field in ClinVar  (//Measure/@ID in the XML)
                # Type                   character, the type of variation
                # Name                   character, the preferred name for the variation
                # GeneID                 integer, GeneID in NCBI's Gene database
                # GeneSymbol             character, comma-separated list of GeneIDs overlapping the variation
                # ClinicalSignificance   character, comma-separated list of values of clinical significance reported for this variation
                #                          for the mapping between the terms listed here and the integers in the .VCF files, see
                #                          http://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/
                # RS# (dbSNP)            integer, rs# in dbSNP
                # nsv (dbVar)            character, the NSV identifier for the region in dbVar
                # RCVaccession           character, list of RCV accessions that report this variant
                # TestedInGTR            character, Y/N for Yes/No if there is a test registered as specific to this variation in the NIH Genetic Testing Registry (GTR)
                # PhenotypeIDs           character, list of db names and identifiers for phenotype(s) reported for this variant
                # Origin                 character, list of all allelic origins for this variation
                # Assembly               character, name of the assembly on which locations are based
                # Chromosome             character, chromosomal location
                # Start                  integer, starting location, in pter->qter orientation
                # Stop                   integer, end location, in pter->qter orientation
                # Cytogenetic            character, ISCN band
                # ReviewStatus           character, highest review status for reporting this measure. For the key to the terms,
                #                            and their relationship to the star graphics ClinVar displays on its web pages,
                #                            see http://www.ncbi.nlm.nih.gov/clinvar/docs/variation_report/#interpretation
                # HGVS(c.)               character, RefSeq cDNA-based HGVS expression
                # HGVS(p.)               character, RefSeq protein-based HGVS expression
                # NumberSubmitters       integer, number of submissions with this variant
                # LastEvaluated          datetime, the latest time any submitter reported clinical significance
                # Guidelines             character, ACMG only right now, for the reporting of incidental variation in a Gene
                #                                (NOTE: if ACMG, not a specific to the allele but to the Gene)
                # OtherIDs               character, list of other identifiers or sources of information about this variant
                # VariantID              integer, the value used to build the URL for the current default report,
                #                            e.g. http://www.ncbi.nlm.nih.gov/clinvar/variation/1756/
                #

                # a crude check that there's an expected number of cols.
                # if not, error out because something changed.
                num_cols = len(line.split('\t'))
                expected_numcols = 29
                if num_cols != expected_numcols:
                    logger.error(
                        "Unexpected number of columns in raw file " +
                        "(%d actual vs %d expected)", num_cols,
                        expected_numcols)

                (allele_num, allele_type, allele_name, gene_num, gene_symbol,
                 clinical_significance, dbsnp_num, dbvar_num, rcv_nums,
                 tested_in_gtr, phenotype_ids, origin, assembly, chr, start,
                 stop, cytogenetic_loc, review_status, hgvs_c, hgvs_p,
                 number_of_submitters, last_eval, guidelines, other_ids,
                 variant_num, reference_allele, alternate_allele, categories,
                 ChromosomeAccession) = line.split('\t')

                # ###set filter=None in init if you don't want to have a filter
                # if self.filter is not None:
                #    if ((self.filter == 'taxids' and\
                #            (int(tax_num) not in self.tax_ids)) or\
                #            (self.filter == 'geneids' and\
                #             (int(gene_num) not in self.gene_ids))):
                #        continue
                # #### end filter

                line_counter += 1

                pheno_list = []
                if phenotype_ids != '-':
                    # trim any leading/trailing semicolons/commas
                    phenotype_ids = re.sub(r'^[;,]', '', phenotype_ids)
                    phenotype_ids = re.sub(r'[;,]$', '', phenotype_ids)
                    pheno_list = re.split(r'[,;]', phenotype_ids)

                if self.testMode:
                    # get intersection of test disease ids
                    # and these phenotype_ids
                    intersect = \
                        list(
                            set([str(i)
                                for i in self.disease_ids]) & set(pheno_list))
                    if int(gene_num) not in self.gene_ids and\
                            int(variant_num) not in self.variant_ids and\
                            len(intersect) < 1:
                        continue

                # TODO may need to switch on assembly to create correct
                # assembly/build identifiers
                build_id = ':'.join(('NCBIGenome', assembly))

                # make the reference genome build
                geno.addReferenceGenome(build_id, assembly, tax_id)

                allele_type_id = self._map_type_of_allele(allele_type)
                bandinbuild_id = None
                if str(chr) == '':
                    # check cytogenic location
                    if str(cytogenetic_loc).strip() != '':
                        # use cytogenic location to get the apx location
                        # oddly, they still put an assembly number even when
                        # there's no numeric location
                        if not re.search(r'-', str(cytogenetic_loc)):
                            band_id = makeChromID(
                                re.split(r'-', str(cytogenetic_loc)), tax_num,
                                'CHR')
                            geno.addChromosomeInstance(cytogenetic_loc,
                                                       build_id, assembly,
                                                       band_id)
                            bandinbuild_id = makeChromID(
                                re.split(r'-', str(cytogenetic_loc)), assembly,
                                'MONARCH')
                        else:
                            # can't deal with ranges yet
                            pass
                else:
                    # add the human chromosome class to the graph,
                    # and add the build-specific version of it
                    chr_id = makeChromID(str(chr), tax_num, 'CHR')
                    geno.addChromosomeClass(str(chr), tax_id, tax_label)
                    geno.addChromosomeInstance(str(chr), build_id, assembly,
                                               chr_id)
                    chrinbuild_id = makeChromID(str(chr), assembly, 'MONARCH')

                seqalt_id = ':'.join(('ClinVarVariant', variant_num))
                gene_id = None

                # they use -1 to indicate unknown gene
                if str(gene_num) != '-1' and str(gene_num) != 'more than 10':
                    if re.match(r'^Gene:', gene_num):
                        gene_num = "NCBI" + gene_num
                    else:
                        gene_id = ':'.join(('NCBIGene', str(gene_num)))

                # FIXME there are some "variants" that are actually haplotypes
                # probably will get taken care of when we switch to processing
                # the xml for example, variant_num = 38562
                # but there's no way to tell if it's a haplotype
                # in the csv data so the dbsnp or dbvar
                # should probably be primary,
                # and the variant num be the vslc,
                # with each of the dbsnps being added to it

                # TODO clinical significance needs to be mapped to
                # a list of terms
                # first, make the variant:
                f = Feature(seqalt_id, allele_name, allele_type_id)

                if start != '-' and start.strip() != '':
                    f.addFeatureStartLocation(start, chrinbuild_id)
                if stop != '-' and stop.strip() != '':
                    f.addFeatureEndLocation(stop, chrinbuild_id)

                f.addFeatureToGraph()
                f.addTaxonToFeature(tax_id)
                # make the ClinVarVariant the clique leader
                model.makeLeader(seqalt_id)

                if bandinbuild_id is not None:
                    f.addSubsequenceOfFeature(bandinbuild_id)

                # CHECK - this makes the assumption that there is
                # only one affected chromosome per variant what happens with
                # chromosomal rearrangement variants?
                # shouldn't both chromosomes be here?

                # add the hgvs as synonyms
                if hgvs_c != '-' and hgvs_c.strip() != '':
                    model.addSynonym(seqalt_id, hgvs_c)
                if hgvs_p != '-' and hgvs_p.strip() != '':
                    model.addSynonym(seqalt_id, hgvs_p)

                # add the dbsnp and dbvar ids as equivalent
                if dbsnp_num != '-' and int(dbsnp_num) != -1:
                    dbsnp_id = 'dbSNP:rs' + str(dbsnp_num)
                    model.addIndividualToGraph(dbsnp_id, None)
                    model.addSameIndividual(seqalt_id, dbsnp_id)
                if dbvar_num != '-':
                    dbvar_id = 'dbVar:' + dbvar_num
                    model.addIndividualToGraph(dbvar_id, None)
                    model.addSameIndividual(seqalt_id, dbvar_id)

                # TODO - not sure if this is right... add as xref?
                # the rcv is like the combo of the phenotype with the variant
                if rcv_nums != '-':
                    for rcv_num in re.split(r';', rcv_nums):
                        rcv_id = 'ClinVar:' + rcv_num
                        model.addIndividualToGraph(rcv_id, None)
                        model.addXref(seqalt_id, rcv_id)

                if gene_id is not None:
                    # add the gene
                    model.addClassToGraph(gene_id, gene_symbol)
                    # make a variant locus
                    vl_id = '_' + gene_num + '-' + variant_num
                    if self.nobnodes:
                        vl_id = ':' + vl_id
                    vl_label = allele_name
                    model.addIndividualToGraph(vl_id, vl_label,
                                               geno.genoparts['variant_locus'])
                    geno.addSequenceAlterationToVariantLocus(seqalt_id, vl_id)
                    geno.addAlleleOfGene(vl_id, gene_id)
                else:
                    # some basic reporting
                    gmatch = re.search(r'\(\w+\)', allele_name)
                    if gmatch is not None and len(gmatch.groups()) > 0:
                        logger.info(
                            "Gene found in allele label, but no id provided: %s",
                            gmatch.group(1))
                    elif re.match(r'more than 10', gene_symbol):
                        logger.info(
                            "More than 10 genes found; "
                            "need to process XML to fetch (variant=%d)",
                            int(variant_num))
                    else:
                        logger.info("No gene listed for variant %d",
                                    int(variant_num))

                # parse the list of "phenotypes" which are diseases.
                # add them as an association
                # ;GeneReviews:NBK1440,MedGen:C0392514,OMIM:235200,SNOMED CT:35400008;MedGen:C3280096,OMIM:614193;MedGen:CN034317,OMIM:612635;MedGen:CN169374
                # the list is both semicolon delimited and comma delimited,
                # but i don't know why! some are bad, like:
                # Orphanet:ORPHA ORPHA319705,SNOMED CT:49049000
                if phenotype_ids != '-':
                    for phenotype in pheno_list:
                        m = re.match(r"(Orphanet:ORPHA(?:\s*ORPHA)?)",
                                     phenotype)
                        if m is not None and len(m.groups()) > 0:
                            phenotype = re.sub(m.group(1), 'Orphanet:',
                                               phenotype.strip())
                        elif re.match(r'ORPHA:\d+', phenotype):
                            phenotype = re.sub(r'^ORPHA', 'Orphanet',
                                               phenotype.strip())
                        elif re.match(r'Human Phenotype Ontology', phenotype):
                            phenotype = re.sub(r'^Human Phenotype Ontology',
                                               '', phenotype.strip())
                        elif re.match(r'SNOMED CT:\s?', phenotype):
                            phenotype = re.sub(r'SNOMED CT:\s?', 'SNOMED:',
                                               phenotype.strip())
                        elif re.match(r'^Gene:', phenotype):
                            continue

                        assoc = G2PAssoc(g, self.name, seqalt_id,
                                         phenotype.strip())
                        assoc.add_association_to_graph()

                if other_ids != '-':
                    id_list = other_ids.split(',')
                    # process the "other ids" ex:
                    # CFTR2:F508del,HGMD:CD890142,OMIM Allelic Variant:602421.0001
                    # TODO make more xrefs
                    for xrefid in id_list:
                        prefix = xrefid.split(':')[0].strip()
                        if prefix == 'OMIM Allelic Variant':
                            xrefid = 'OMIM:' + xrefid.split(':')[1]
                            model.addIndividualToGraph(xrefid, None)
                            model.addSameIndividual(seqalt_id, xrefid)
                        elif prefix == 'HGMD':
                            model.addIndividualToGraph(xrefid, None)
                            model.addSameIndividual(seqalt_id, xrefid)
                        elif prefix == 'dbVar' \
                                and dbvar_num == xrefid.split(':')[1].strip():
                            pass  # skip over this one
                        elif re.search(r'\s', prefix):
                            pass
                            # logger.debug(
                            #   'xref prefix has a space: %s', xrefid)
                        else:
                            # should be a good clean prefix
                            # note that HGMD variants are in here as Xrefs
                            # because we can't resolve URIs for them
                            # logger.info("Adding xref: %s", xrefid)
                            # gu.addXref(g, seqalt_id, xrefid)
                            # logger.info("xref prefix to add: %s", xrefid)
                            pass

                if not self.testMode and limit is not None \
                        and line_counter > limit:
                    break

        logger.info("Finished parsing variants")

        return
コード例 #7
0
    def _process_qtls_genetic_location(
            self, raw, src_key, txid, common_name, limit=None):
        """
        This function processes

        Triples created:

        :param limit:
        :return:

        """
        aql_curie = self.files[src_key]['curie']
        common_name = common_name.strip()
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        geno = Genotype(graph)
        model = Model(graph)
        eco_id = self.globaltt['quantitative trait analysis evidence']
        taxon_curie = 'NCBITaxon:' + txid

        LOG.info("Processing genetic location for %s from %s", taxon_curie, raw)
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            # no header in these files, so no header checking
            col = self.files[src_key]['columns']
            col_len = len(col)
            for row in reader:
                if len(row) != col_len and ''.join(row[col_len:]) != '':
                    LOG.warning(
                        "Problem parsing %s line %i containing: \n%s\n"
                        "got %i cols but expected %i",
                        raw, reader.line_num, row, len(row), col_len)
                    # LOG.info(row)
                    continue

                qtl_id = row[col.index('QTL_ID')].strip()
                qtl_symbol = row[col.index('QTL_symbol')].strip()
                trait_name = row[col.index('Trait_name')].strip()
                # assotype = row[col.index('assotype')].strip()
                chromosome = row[col.index('Chromosome')].strip()
                position_cm = row[col.index('Position_cm')].strip()
                range_cm = row[col.index('range_cm')].strip()
                # flankmark_a2 = row[col.index('FlankMark_A2')].strip()
                # flankmark_a1 = row[col.index('FlankMark_A1')].strip()
                peak_mark = row[col.index('Peak_Mark')].strip()
                # flankmark_b1 = row[col.index('FlankMark_B1')].strip()
                # flankmark_b2 = row[col.index('FlankMark_B2')].strip()
                # exp_id = row[col.index('Exp_ID')].strip()
                # model_id = row[col.index('Model')].strip()
                # test_base = row[col.index('testbase')].strip()
                # sig_level = row[col.index('siglevel')].strip()
                # lod_score = row[col.index('LOD_score')].strip()
                # ls_mean = row[col.index('LS_mean')].strip()
                p_values = row[col.index('P_values')].strip()
                # f_statistics = row[col.index('F_Statistics')].strip()
                # variance = row[col.index('VARIANCE')].strip()
                # bayes_value = row[col.index('Bayes_value')].strip()
                # likelihood_ratio = row[col.index('LikelihoodR')].strip()
                trait_id = row[col.index('TRAIT_ID')].strip()
                # dom_effect = row[col.index('Dom_effect')].strip()
                # add_effect = row[col.index('Add_effect')].strip()
                pubmed_id = row[col.index('PUBMED_ID')].strip()
                gene_id = row[col.index('geneID')].strip()
                gene_id_src = row[col.index('geneIDsrc')].strip()
                # gene_id_type = row[col.index('geneIDtype')].strip()

                if self.test_mode and int(qtl_id) not in self.test_ids:
                    continue

                qtl_id = common_name + 'QTL:' + qtl_id.strip()
                trait_id = ':'.join((aql_curie, trait_id.strip()))

                # Add QTL to graph
                feature = Feature(graph, qtl_id, qtl_symbol, self.globaltt['QTL'])
                feature.addTaxonToFeature(taxon_curie)

                # deal with the chromosome
                chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')

                # add a version of the chromosome which is defined as
                # the genetic map
                build_id = 'MONARCH:' + common_name + '-linkage'
                build_label = common_name + ' genetic map'
                geno.addReferenceGenome(build_id, build_label, taxon_curie)
                chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
                geno.addChromosomeInstance(
                    chromosome, build_id, build_label, chrom_id)
                start = stop = None
                # range_cm sometimes ends in "(Mb)"  (i.e pig 2016 Nov)
                range_mb = re.split(r'\(', range_cm)
                if range_mb is not None:
                    range_cm = range_mb[0]

                if re.search(r'[0-9].*-.*[0-9]', range_cm):
                    range_parts = re.split(r'-', range_cm)

                    # check for poorly formed ranges
                    if len(range_parts) == 2 and\
                            range_parts[0] != '' and range_parts[1] != '':
                        (start, stop) = [
                            int(float(x.strip())) for x in re.split(r'-', range_cm)]
                    else:
                        LOG.info(
                            "A cM range we can't handle for QTL %s: %s",
                            qtl_id, range_cm)
                elif position_cm != '':
                    match = re.match(r'([0-9]*\.[0-9]*)', position_cm)
                    if match is not None:
                        position_cm = match.group()
                        start = stop = int(float(position_cm))

                # FIXME remove converion to int for start/stop
                # when schema can handle floats add in the genetic location
                # based on the range
                feature.addFeatureStartLocation(
                    start, chrom_in_build_id, None,
                    [self.globaltt['FuzzyPosition']])
                feature.addFeatureEndLocation(
                    stop, chrom_in_build_id, None,
                    [self.globaltt['FuzzyPosition']])
                feature.addFeatureToGraph()

                # sometimes there's a peak marker, like a rsid.
                # we want to add that as a variant of the gene,
                # and xref it to the qtl.
                dbsnp_id = None
                if peak_mark != '' and peak_mark != '.' and \
                        re.match(r'rs', peak_mark.strip()):
                    dbsnp_id = 'dbSNP:' + peak_mark.strip()

                    model.addIndividualToGraph(
                        dbsnp_id, None, self.globaltt['sequence_alteration'])

                    model.addXref(
                        qtl_id, dbsnp_id, xref_category=blv.terms['SequenceVariant'])

                gene_id = gene_id.replace('uncharacterized ', '').strip()
                gene_id = gene_id.strip(',')  # for "100157483,"  in pig_QTLdata.txt
                if gene_id is not None and gene_id != '' and gene_id != '.'\
                        and re.fullmatch(r'[^ ]*', gene_id) is not None:

                    # we assume if no src is provided and gene_id is an integer,
                    # then it is an NCBI gene ... (okay, lets crank that back a notch)
                    if gene_id_src == '' and gene_id.isdigit() and \
                            gene_id in self.gene_info:
                        # LOG.info(
                        #    'Warm & Fuzzy saying %s is a NCBI gene for %s',
                        #    gene_id, common_name)
                        gene_id_src = 'NCBIgene'
                    elif gene_id_src == '' and gene_id.isdigit():
                        LOG.warning(
                            'Cold & Prickely saying %s is a NCBI gene for %s',
                            gene_id, common_name)
                        gene_id_src = 'NCBIgene'
                    elif gene_id_src == '':
                        LOG.error(
                            ' "%s" is a NOT NCBI gene for %s', gene_id, common_name)
                        gene_id_src = None

                    if gene_id_src == 'NCBIgene':
                        gene_id = 'NCBIGene:' + gene_id
                        # we will expect that these will get labels elsewhere
                        geno.addGene(gene_id, None)
                        # FIXME what is the right relationship here?
                        geno.addAffectedLocus(qtl_id, gene_id)

                        if dbsnp_id is not None:
                            # add the rsid as a seq alt of the gene_id as a bnode
                            vl_id = self.make_id(re.sub(
                                r':', '', gene_id) + '-' + peak_mark.strip(), '_')
                            geno.addSequenceAlterationToVariantLocus(dbsnp_id, vl_id)
                            geno.addAffectedLocus(vl_id, gene_id)

                # add the trait
                model.addClassToGraph(
                    trait_id,
                    trait_name,
                    class_category=blv.terms['PhenotypicFeature'])

                # Add publication
                reference = None
                if re.match(r'ISU.*', pubmed_id):
                    pub_id = 'AQTLPub:' + pubmed_id.strip()
                    reference = Reference(graph, pub_id)
                elif pubmed_id != '':
                    pub_id = 'PMID:' + pubmed_id.strip()
                    reference = Reference(
                        graph, pub_id, self.globaltt['journal article'])

                if reference is not None:
                    reference.addRefToGraph()

                # make the association to the QTL
                assoc = G2PAssoc(
                    graph, self.name, qtl_id, trait_id, self.globaltt['is marker for'])
                assoc.add_evidence(eco_id)
                assoc.add_source(pub_id)

                # create a description from the contents of the file
                # desc = ''

                # assoc.addDescription(g, assoc_id, desc)

                # TODO add exp_id as evidence
                # if exp_id != '':
                #     exp_id = 'AQTLExp:'+exp_id
                #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                if p_values != '':
                    scr = re.sub(r'<', '', p_values)
                    scr = re.sub(r',', '.', scr)  # international notation
                    if scr.isnumeric():
                        score = float(scr)
                        assoc.set_score(score)  # todo add score type
                # TODO add LOD score?
                assoc.add_association_to_graph()

                # make the association to the dbsnp_id, if found
                if dbsnp_id is not None:
                    # make the association to the dbsnp_id
                    assoc = G2PAssoc(
                        graph, self.name, dbsnp_id, trait_id,
                        self.globaltt['is marker for'])
                    assoc.add_evidence(eco_id)
                    assoc.add_source(pub_id)

                    # create a description from the contents of the file
                    # desc = ''
                    # assoc.addDescription(g, assoc_id, desc)

                    # TODO add exp_id
                    # if exp_id != '':
                    #     exp_id = 'AQTLExp:'+exp_id
                    #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                    if p_values != '':
                        scr = re.sub(r'<', '', p_values)
                        scr = re.sub(r',', '.', scr)
                        if scr.isnumeric():
                            score = float(scr)
                            assoc.set_score(score)  # todo add score type
                    # TODO add LOD score?

                    assoc.add_association_to_graph()

                # off by one - the following actually gives us (limit + 1) records
                if not self.test_mode and limit is not None and reader.line_num > limit:
                    break

        LOG.info("Done with QTL genetic info")
コード例 #8
0
ファイル: IMPC.py プロジェクト: JervenBolleman/dipper
    def _process_data(self, raw, limit=None):
        logger.info("Processing Data from %s", raw)
        gu = GraphUtils(curie_map.get())

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        geno = Genotype(g)
        line_counter = 0
        gu.loadAllProperties(g)
        gu.loadObjectProperties(g, geno.object_properties)

        # Add the taxon as a class
        taxon_id = 'NCBITaxon:10090'  # map to Mus musculus
        gu.addClassToGraph(g, taxon_id, None)

        # with open(raw, 'r', encoding="utf8") as csvfile:
        with gzip.open(raw, 'rt') as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            next(filereader, None)  # skip the header row
            for row in filereader:
                line_counter += 1

                (marker_accession_id, marker_symbol, phenotyping_center,
                 colony, sex, zygosity, allele_accession_id, allele_symbol,
                 allele_name, strain_accession_id, strain_name, project_name,
                 project_fullname, pipeline_name, pipeline_stable_id,
                 procedure_stable_id, procedure_name, parameter_stable_id,
                 parameter_name, top_level_mp_term_id, top_level_mp_term_name,
                 mp_term_id, mp_term_name, p_value, percentage_change,
                 effect_size, statistical_method, resource_name) = row

                if self.testMode and marker_accession_id not in self.test_ids:
                    continue

                # ##### cleanup some of the identifiers ######
                zygosity_id = self._map_zygosity(zygosity)

                # colony ids sometimes have <> in them, spaces,
                # or other non-alphanumerics and break our system;
                # replace these with underscores
                colony_id = '_'+re.sub(r'\W+', '_', colony)
                if self.nobnodes:
                    colony_id = ':'+colony_id

                if not re.match(r'MGI', allele_accession_id):
                    allele_accession_id = \
                        '_IMPC-'+re.sub(r':', '', allele_accession_id)
                    if self.nobnodes:
                        allele_accession_id = ':'+allele_accession_id
                if re.search(r'EUROCURATE', strain_accession_id):
                    # the eurocurate links don't resolve at IMPC
                    strain_accession_id = '_'+strain_accession_id
                    if self.nobnodes:
                        strain_accession_id = ':'+strain_accession_id
                elif not re.match(r'MGI', strain_accession_id):
                    logger.info(
                        "Found a strange strain accession...%s",
                        strain_accession_id)
                    strain_accession_id = 'IMPC:'+strain_accession_id

                ######################
                # first, add the marker and variant to the graph as with MGI,
                # the allele is the variant locus.  IF the marker is not known,
                # we will call it a sequence alteration.  otherwise,
                # we will create a BNode for the sequence alteration.
                sequence_alteration_id = variant_locus_id = None
                variant_locus_name = sequence_alteration_name = None

                # extract out what's within the <> to get the symbol
                if re.match(r'.*<.*>', allele_symbol):
                    sequence_alteration_name = \
                        re.match(r'.*<(.*)>', allele_symbol).group(1)
                else:
                    sequence_alteration_name = allele_symbol

                if marker_accession_id is not None and \
                        marker_accession_id == '':
                    logger.warning(
                        "Marker unspecified on row %d", line_counter)
                    marker_accession_id = None

                if marker_accession_id is not None:
                    variant_locus_id = allele_accession_id
                    variant_locus_name = allele_symbol
                    variant_locus_type = geno.genoparts['variant_locus']
                    geno.addGene(marker_accession_id, marker_symbol,
                                 geno.genoparts['gene'])
                    geno.addAllele(variant_locus_id, variant_locus_name,
                                   variant_locus_type, None)
                    geno.addAlleleOfGene(variant_locus_id, marker_accession_id)

                    sequence_alteration_id = \
                        '_seqalt'+re.sub(r':', '', allele_accession_id)
                    if self.nobnodes:
                        sequence_alteration_id = ':'+sequence_alteration_id
                    geno.addSequenceAlterationToVariantLocus(
                        sequence_alteration_id, variant_locus_id)

                else:
                    sequence_alteration_id = allele_accession_id

                # IMPC contains targeted mutations with either gene traps,
                # knockouts, insertion/intragenic deletions.
                # but I don't really know what the SeqAlt is here,
                # so I don't add it.
                geno.addSequenceAlteration(sequence_alteration_id,
                                           sequence_alteration_name)

                # #############    BUILD THE COLONY    #############
                # First, let's describe the colony that the animals come from
                # The Colony ID refers to the ES cell clone
                #   used to generate a mouse strain.
                # Terry sez: we use this clone ID to track
                #   ES cell -> mouse strain -> mouse phenotyping.
                # The same ES clone maybe used at multiple centers,
                # so we have to concatenate the two to have a unique ID.
                # some useful reading about generating mice from ES cells:
                # http://ki.mit.edu/sbc/escell/services/details

                # here, we'll make a genotype
                # that derives from an ES cell with a given allele.
                # the strain is not really attached to the colony.

                # the colony/clone is reflective of the allele,
                # with unknown zygosity
                stem_cell_class = 'ERO:0002002'
                gu.addIndividualToGraph(g, colony_id, colony, stem_cell_class)

                # vslc of the colony has unknown zygosity
                # note that we will define the allele
                # (and it's relationship to the marker, etc.) later
                # FIXME is it really necessary to create this vslc
                # when we always know it's unknown zygosity?
                vslc_colony = \
                    '_'+allele_accession_id+geno.zygosity['indeterminate']
                vslc_colony = re.sub(r':', '', vslc_colony)
                if self.nobnodes:
                    vslc_colony = ':'+vslc_colony
                vslc_colony_label = allele_symbol+'/<?>'
                # for ease of reading, we make the colony genotype variables.
                # in the future, it might be desired to keep the vslcs
                colony_genotype_id = vslc_colony
                colony_genotype_label = vslc_colony_label
                geno.addGenotype(colony_genotype_id, colony_genotype_label)
                geno.addParts(allele_accession_id, colony_genotype_id,
                              geno.object_properties['has_alternate_part'])
                geno.addPartsToVSLC(
                    vslc_colony, allele_accession_id, None,
                    geno.zygosity['indeterminate'],
                    geno.object_properties['has_alternate_part'])
                gu.addTriple(
                    g, colony_id,
                    geno.object_properties['has_genotype'],
                    colony_genotype_id)

                # ##########    BUILD THE ANNOTATED GENOTYPE    ##########
                # now, we'll build the genotype of the individual that derives
                # from the colony/clone genotype that is attached to
                # phenotype = colony_id + strain + zygosity + sex
                # (and is derived from a colony)

                # this is a sex-agnostic genotype
                genotype_id = \
                    self.make_id(
                        (colony_id + phenotyping_center + zygosity +
                         strain_accession_id))
                geno.addSequenceDerivesFrom(genotype_id, colony_id)

                # build the VSLC of the sex-agnostic genotype
                # based on the zygosity
                allele1_id = allele_accession_id
                allele2_id = allele2_rel = None
                allele1_label = allele_symbol
                allele2_label = '<?>'
                # Making VSLC labels from the various parts,
                # can change later if desired.
                if zygosity == 'heterozygote':
                    allele2_label = re.sub(r'<.*', '<+>', allele1_label)
                    allele2_id = None
                elif zygosity == 'homozygote':
                    allele2_label = allele1_label
                    allele2_id = allele1_id
                    allele2_rel = geno.object_properties['has_alternate_part']
                elif zygosity == 'hemizygote':
                    allele2_label = re.sub(r'<.*', '<0>', allele1_label)
                    allele2_id = None
                elif zygosity == 'not_applicable':
                    allele2_label = re.sub(r'<.*', '<?>', allele1_label)
                    allele2_id = None
                else:
                    logger.warning("found unknown zygosity %s", zygosity)
                    break
                vslc_name = '/'.join((allele1_label, allele2_label))

                # Add the VSLC
                vslc_id = '_' + '-'.join((marker_accession_id,
                                          allele_accession_id, zygosity))
                vslc_id = re.sub(r':', '', vslc_id)
                if self.nobnodes:
                    vslc_id = ':'+vslc_id
                gu.addIndividualToGraph(
                    g, vslc_id, vslc_name,
                    geno.genoparts['variant_single_locus_complement'])
                geno.addPartsToVSLC(
                    vslc_id, allele1_id, allele2_id, zygosity_id,
                    geno.object_properties['has_alternate_part'],
                    allele2_rel)

                # add vslc to genotype
                geno.addVSLCtoParent(vslc_id, genotype_id)

                # note that the vslc is also the gvc
                gu.addType(
                    g, vslc_id,
                    Genotype.genoparts['genomic_variation_complement'])

                # Add the genomic background
                # create the genomic background id and name
                if strain_accession_id != '':
                    genomic_background_id = strain_accession_id
                else:
                    genomic_background_id = None

                genotype_name = vslc_name
                if genomic_background_id is not None:
                    geno.addGenotype(
                        genomic_background_id, strain_name,
                        geno.genoparts['genomic_background'])

                    # make a phenotyping-center-specific strain
                    # to use as the background
                    pheno_center_strain_label = \
                        strain_name + '/' + phenotyping_center
                    pheno_center_strain_id = \
                        '-'.join((re.sub(r':', '', genomic_background_id),
                                  re.sub(r'\s', '_', phenotyping_center)))
                    if not re.match(r'^_', pheno_center_strain_id):
                        pheno_center_strain_id = '_'+pheno_center_strain_id
                    if self.nobnodes:
                        pheno_center_strain_id = ':'+pheno_center_strain_id
                    geno.addGenotype(pheno_center_strain_id,
                                     pheno_center_strain_label,
                                     geno.genoparts['genomic_background'])
                    geno.addSequenceDerivesFrom(pheno_center_strain_id,
                                                genomic_background_id)

                    # Making genotype labels from the various parts,
                    # can change later if desired.
                    # since the genotype is reflective of the place
                    # it got made, should put that in to disambiguate
                    genotype_name = \
                        genotype_name+' ['+pheno_center_strain_label+']'
                    geno.addGenomicBackgroundToGenotype(
                        pheno_center_strain_id, genotype_id)
                    geno.addTaxon(pheno_center_strain_id, taxon_id)
                # this is redundant, but i'll keep in in for now
                geno.addSequenceDerivesFrom(genotype_id, colony_id)
                genotype_name += '['+colony+']'
                geno.addGenotype(genotype_id, genotype_name)

                # Make the sex-qualified genotype,
                # which is what the phenotype is associated with
                sex_qualified_genotype_id = \
                    self.make_id(
                        (colony_id + phenotyping_center + zygosity +
                         strain_accession_id+sex))
                sex_qualified_genotype_label = genotype_name+' ('+sex+')'
                if sex == 'male':
                    sq_type_id = geno.genoparts['male_genotype']
                elif sex == 'female':
                    sq_type_id = geno.genoparts['female_genotype']
                else:
                    sq_type_id = geno.genoparts['sex_qualified_genotype']

                geno.addGenotype(
                    sex_qualified_genotype_id,
                    sex_qualified_genotype_label, sq_type_id)
                geno.addParts(
                    genotype_id, sex_qualified_genotype_id,
                    geno.object_properties['has_alternate_part'])

                if genomic_background_id is not None and \
                        genomic_background_id != '':
                    # Add the taxon to the genomic_background_id
                    geno.addTaxon(taxon_id, genomic_background_id)
                else:
                    # add it as the genomic background
                    geno.addTaxon(taxon_id, genotype_id)

                # #############    BUILD THE G2P ASSOC    #############
                # from an old email dated July 23 2014:
                # Phenotypes associations are made to
                # imits colony_id+center+zygosity+gender

                phenotype_id = mp_term_id

                # it seems that sometimes phenotype ids are missing.
                # indicate here
                if phenotype_id is None or phenotype_id == '':
                    logger.warning(
                        "No phenotype id specified for row %d: %s",
                        line_counter, str(row))
                    continue
                # experimental_phenotypic_evidence This was used in ZFIN
                eco_id = "ECO:0000059"

                # the association comes as a result of a g2p from
                # a procedure in a pipeline at a center and parameter tested

                assoc = G2PAssoc(self.name, sex_qualified_genotype_id,
                                 phenotype_id)
                assoc.add_evidence(eco_id)
                # assoc.set_score(float(p_value))

                # TODO add evidence instance using
                # pipeline_stable_id +
                # procedure_stable_id +
                # parameter_stable_id

                assoc.add_association_to_graph(g)
                assoc_id = assoc.get_association_id()

                # add a free-text description
                description = \
                    ' '.join((mp_term_name, 'phenotype determined by',
                              phenotyping_center, 'in an',
                              procedure_name, 'assay where',
                              parameter_name.strip(),
                              'was measured with an effect_size of',
                              str(round(float(effect_size), 5)),
                              '(p =', "{:.4e}".format(float(p_value)), ').'))

                gu.addDescription(g, assoc_id, description)

                # TODO add provenance information
                # resource_id = resource_name
                # assoc.addSource(g, assoc_id, resource_id)

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        gu.loadProperties(g, G2PAssoc.object_properties, gu.OBJPROP)
        gu.loadProperties(g, G2PAssoc.annotation_properties, gu.ANNOTPROP)
        gu.loadProperties(g, G2PAssoc.datatype_properties, gu.DATAPROP)

        return
コード例 #9
0
    def _process_qtls_genetic_location(
            self, raw, txid, common_name, limit=None):
        """
        This function processes

        Triples created:

        :param limit:
        :return:

        """
        if self.testMode:
            graph = self.testgraph
        else:
            graph = self.graph
        line_counter = 0
        geno = Genotype(graph)
        model = Model(graph)
        eco_id = self.globaltt['quantitative trait analysis evidence']

        taxon_curie = 'NCBITaxon:' + txid

        LOG.info("Processing genetic location for %s from %s", taxon_curie, raw)
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (qtl_id,
                 qtl_symbol,
                 trait_name,
                 assotype,
                 empty,
                 chromosome,
                 position_cm,
                 range_cm,
                 flankmark_a2,
                 flankmark_a1,
                 peak_mark,
                 flankmark_b1,
                 flankmark_b2,
                 exp_id,
                 model_id,
                 test_base,
                 sig_level,
                 lod_score,
                 ls_mean,
                 p_values,
                 f_statistics,
                 variance,
                 bayes_value,
                 likelihood_ratio,
                 trait_id, dom_effect,
                 add_effect,
                 pubmed_id,
                 gene_id,
                 gene_id_src,
                 gene_id_type,
                 empty2) = row

                if self.testMode and int(qtl_id) not in self.test_ids:
                    continue

                qtl_id = common_name + 'QTL:' + qtl_id.strip()
                trait_id = 'AQTLTrait:' + trait_id.strip()

                # Add QTL to graph
                feature = Feature(graph, qtl_id, qtl_symbol, self.globaltt['QTL'])
                feature.addTaxonToFeature(taxon_curie)

                # deal with the chromosome
                chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')

                # add a version of the chromosome which is defined as
                # the genetic map
                build_id = 'MONARCH:'+common_name.strip()+'-linkage'
                build_label = common_name+' genetic map'
                geno.addReferenceGenome(build_id, build_label, taxon_curie)
                chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
                geno.addChromosomeInstance(
                    chromosome, build_id, build_label, chrom_id)
                start = stop = None
                # range_cm sometimes ends in "(Mb)"  (i.e pig 2016 Nov)
                range_mb = re.split(r'\(', range_cm)
                if range_mb is not None:
                    range_cm = range_mb[0]

                if re.search(r'[0-9].*-.*[0-9]', range_cm):
                    range_parts = re.split(r'-', range_cm)

                    # check for poorly formed ranges
                    if len(range_parts) == 2 and\
                            range_parts[0] != '' and range_parts[1] != '':
                        (start, stop) = [
                            int(float(x.strip())) for x in re.split(r'-', range_cm)]
                    else:
                        LOG.info(
                            "A cM range we can't handle for QTL %s: %s",
                            qtl_id, range_cm)
                elif position_cm != '':
                    match = re.match(r'([0-9]*\.[0-9]*)', position_cm)
                    if match is not None:
                        position_cm = match.group()
                        start = stop = int(float(position_cm))

                # FIXME remove converion to int for start/stop
                # when schema can handle floats add in the genetic location
                # based on the range
                feature.addFeatureStartLocation(
                    start, chrom_in_build_id, None,
                    [self.globaltt['FuzzyPosition']])
                feature.addFeatureEndLocation(
                    stop, chrom_in_build_id, None,
                    [self.globaltt['FuzzyPosition']])
                feature.addFeatureToGraph()

                # sometimes there's a peak marker, like a rsid.
                # we want to add that as a variant of the gene,
                # and xref it to the qtl.
                dbsnp_id = None
                if peak_mark != '' and peak_mark != '.' and \
                        re.match(r'rs', peak_mark.strip()):
                    dbsnp_id = 'dbSNP:'+peak_mark.strip()

                    model.addIndividualToGraph(
                        dbsnp_id, None,
                        self.globaltt['sequence_alteration'])
                    model.addXref(qtl_id, dbsnp_id)

                gene_id = gene_id.replace('uncharacterized ', '').strip()
                if gene_id is not None and gene_id != '' and gene_id != '.'\
                        and re.fullmatch(r'[^ ]*', gene_id) is not None:

                    # we assume if no src is provided and gene_id is an integer,
                    # then it is an NCBI gene ... (okay, lets crank that back a notch)
                    if gene_id_src == '' and gene_id.isdigit() and \
                            gene_id in self.gene_info:
                        # LOG.info(
                        #    'Warm & Fuzzy saying %s is a NCBI gene for %s',
                        #    gene_id, common_name)
                        gene_id_src = 'NCBIgene'
                    elif gene_id_src == '' and gene_id.isdigit():
                        LOG.warning(
                            'Cold & Prickely saying %s is a NCBI gene for %s',
                            gene_id, common_name)
                        gene_id_src = 'NCBIgene'
                    elif gene_id_src == '':
                        LOG.error(
                            ' "%s" is a NOT NCBI gene for %s', gene_id, common_name)
                        gene_id_src = None

                    if gene_id_src == 'NCBIgene':
                        gene_id = 'NCBIGene:' + gene_id
                        # we will expect that these will get labels elsewhere
                        geno.addGene(gene_id, None)
                        # FIXME what is the right relationship here?
                        geno.addAffectedLocus(qtl_id, gene_id)

                        if dbsnp_id is not None:
                            # add the rsid as a seq alt of the gene_id
                            vl_id = '_:' + re.sub(
                                r':', '', gene_id) + '-' + peak_mark.strip()
                            geno.addSequenceAlterationToVariantLocus(
                                dbsnp_id, vl_id)
                            geno.addAffectedLocus(vl_id, gene_id)

                # add the trait
                model.addClassToGraph(trait_id, trait_name)

                # Add publication
                reference = None
                if re.match(r'ISU.*', pubmed_id):
                    pub_id = 'AQTLPub:'+pubmed_id.strip()
                    reference = Reference(graph, pub_id)
                elif pubmed_id != '':
                    pub_id = 'PMID:' + pubmed_id.strip()
                    reference = Reference(
                        graph, pub_id, self.globaltt['journal article'])

                if reference is not None:
                    reference.addRefToGraph()

                # make the association to the QTL
                assoc = G2PAssoc(
                    graph, self.name, qtl_id, trait_id, self.globaltt['is marker for'])
                assoc.add_evidence(eco_id)
                assoc.add_source(pub_id)

                # create a description from the contents of the file
                # desc = ''

                # assoc.addDescription(g, assoc_id, desc)

                # TODO add exp_id as evidence
                # if exp_id != '':
                #     exp_id = 'AQTLExp:'+exp_id
                #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                if p_values != '':
                    scr = re.sub(r'<', '', p_values)
                    scr = re.sub(r',', '.', scr)  # international notation
                    if scr.isnumeric():
                        score = float(scr)
                        assoc.set_score(score)  # todo add score type
                # TODO add LOD score?
                assoc.add_association_to_graph()

                # make the association to the dbsnp_id, if found
                if dbsnp_id is not None:
                    # make the association to the dbsnp_id
                    assoc = G2PAssoc(
                        graph, self.name, dbsnp_id, trait_id,
                        self.globaltt['is marker for'])
                    assoc.add_evidence(eco_id)
                    assoc.add_source(pub_id)

                    # create a description from the contents of the file
                    # desc = ''
                    # assoc.addDescription(g, assoc_id, desc)

                    # TODO add exp_id
                    # if exp_id != '':
                    #     exp_id = 'AQTLExp:'+exp_id
                    #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                    if p_values != '':
                        scr = re.sub(r'<', '', p_values)
                        scr = re.sub(r',', '.', scr)
                        if scr.isnumeric():
                            score = float(scr)
                            assoc.set_score(score)  # todo add score type
                    # TODO add LOD score?

                    assoc.add_association_to_graph()

                if not self.testMode and limit is not None and line_counter > limit:
                    break

        LOG.info("Done with QTL genetic info")
        return
コード例 #10
0
ファイル: IMPC.py プロジェクト: lwinfree/dipper
    def _process_data(self, raw, limit=None):
        logger.info("Processing Data from %s", raw)

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)
        geno = Genotype(g)
        line_counter = 0

        impc_map = self.open_and_parse_yaml(self.map_files['impc_map'])
        impress_map = json.loads(
            self.fetch_from_url(
                self.map_files['impress_map']).read().decode('utf-8'))

        # Add the taxon as a class
        taxon_id = 'NCBITaxon:10090'  # map to Mus musculus
        model.addClassToGraph(taxon_id, None)

        # with open(raw, 'r', encoding="utf8") as csvfile:
        with gzip.open(raw, 'rt') as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            next(filereader, None)  # skip the header row
            for row in filereader:
                line_counter += 1

                (marker_accession_id, marker_symbol, phenotyping_center,
                 colony, sex, zygosity, allele_accession_id, allele_symbol,
                 allele_name, strain_accession_id, strain_name, project_name,
                 project_fullname, pipeline_name, pipeline_stable_id,
                 procedure_stable_id, procedure_name, parameter_stable_id,
                 parameter_name, top_level_mp_term_id, top_level_mp_term_name,
                 mp_term_id, mp_term_name, p_value, percentage_change,
                 effect_size, statistical_method, resource_name) = row

                if self.testMode and marker_accession_id not in self.test_ids:
                    continue

                # ##### cleanup some of the identifiers ######
                zygosity_id = self._map_zygosity(zygosity)

                # colony ids sometimes have <> in them, spaces,
                # or other non-alphanumerics and break our system;
                # replace these with underscores
                colony_id = '_:' + re.sub(r'\W+', '_', colony)

                if not re.match(r'MGI', allele_accession_id):
                    allele_accession_id = \
                        '_:IMPC-'+re.sub(r':', '', allele_accession_id)

                if re.search(r'EUROCURATE', strain_accession_id):
                    # the eurocurate links don't resolve at IMPC
                    strain_accession_id = '_:' + strain_accession_id

                elif not re.match(r'MGI', strain_accession_id):
                    logger.info("Found a strange strain accession...%s",
                                strain_accession_id)
                    strain_accession_id = 'IMPC:' + strain_accession_id

                ######################
                # first, add the marker and variant to the graph as with MGI,
                # the allele is the variant locus.  IF the marker is not known,
                # we will call it a sequence alteration.  otherwise,
                # we will create a BNode for the sequence alteration.
                sequence_alteration_id = variant_locus_id = None
                variant_locus_name = sequence_alteration_name = None

                # extract out what's within the <> to get the symbol
                if re.match(r'.*<.*>', allele_symbol):
                    sequence_alteration_name = \
                        re.match(r'.*<(.*)>', allele_symbol).group(1)
                else:
                    sequence_alteration_name = allele_symbol

                if marker_accession_id is not None and \
                        marker_accession_id == '':
                    logger.warning("Marker unspecified on row %d",
                                   line_counter)
                    marker_accession_id = None

                if marker_accession_id is not None:
                    variant_locus_id = allele_accession_id
                    variant_locus_name = allele_symbol
                    variant_locus_type = geno.genoparts['variant_locus']
                    geno.addGene(marker_accession_id, marker_symbol,
                                 geno.genoparts['gene'])
                    geno.addAllele(variant_locus_id, variant_locus_name,
                                   variant_locus_type, None)
                    geno.addAlleleOfGene(variant_locus_id, marker_accession_id)

                    sequence_alteration_id = \
                        '_:seqalt'+re.sub(r':', '', allele_accession_id)
                    geno.addSequenceAlterationToVariantLocus(
                        sequence_alteration_id, variant_locus_id)

                else:
                    sequence_alteration_id = allele_accession_id

                # IMPC contains targeted mutations with either gene traps,
                # knockouts, insertion/intragenic deletions.
                # but I don't really know what the SeqAlt is here,
                # so I don't add it.
                geno.addSequenceAlteration(sequence_alteration_id,
                                           sequence_alteration_name)

                # #############    BUILD THE COLONY    #############
                # First, let's describe the colony that the animals come from
                # The Colony ID refers to the ES cell clone
                #   used to generate a mouse strain.
                # Terry sez: we use this clone ID to track
                #   ES cell -> mouse strain -> mouse phenotyping.
                # The same ES clone maybe used at multiple centers,
                # so we have to concatenate the two to have a unique ID.
                # some useful reading about generating mice from ES cells:
                # http://ki.mit.edu/sbc/escell/services/details

                # here, we'll make a genotype
                # that derives from an ES cell with a given allele.
                # the strain is not really attached to the colony.

                # the colony/clone is reflective of the allele,
                # with unknown zygosity
                stem_cell_class = 'ERO:0002002'
                model.addIndividualToGraph(colony_id, colony, stem_cell_class)

                # vslc of the colony has unknown zygosity
                # note that we will define the allele
                # (and it's relationship to the marker, etc.) later
                # FIXME is it really necessary to create this vslc
                # when we always know it's unknown zygosity?
                vslc_colony = \
                    '_:'+re.sub(r':', '', allele_accession_id+geno.zygosity['indeterminate'])
                vslc_colony_label = allele_symbol + '/<?>'
                # for ease of reading, we make the colony genotype variables.
                # in the future, it might be desired to keep the vslcs
                colony_genotype_id = vslc_colony
                colony_genotype_label = vslc_colony_label
                geno.addGenotype(colony_genotype_id, colony_genotype_label)
                geno.addParts(allele_accession_id, colony_genotype_id,
                              geno.object_properties['has_alternate_part'])
                geno.addPartsToVSLC(
                    vslc_colony, allele_accession_id, None,
                    geno.zygosity['indeterminate'],
                    geno.object_properties['has_alternate_part'])
                g.addTriple(colony_id, geno.object_properties['has_genotype'],
                            colony_genotype_id)

                # ##########    BUILD THE ANNOTATED GENOTYPE    ##########
                # now, we'll build the genotype of the individual that derives
                # from the colony/clone genotype that is attached to
                # phenotype = colony_id + strain + zygosity + sex
                # (and is derived from a colony)

                # this is a sex-agnostic genotype
                genotype_id = \
                    self.make_id(
                        (colony_id + phenotyping_center + zygosity +
                         strain_accession_id))
                geno.addSequenceDerivesFrom(genotype_id, colony_id)

                # build the VSLC of the sex-agnostic genotype
                # based on the zygosity
                allele1_id = allele_accession_id
                allele2_id = allele2_rel = None
                allele1_label = allele_symbol
                allele2_label = '<?>'
                # Making VSLC labels from the various parts,
                # can change later if desired.
                if zygosity == 'heterozygote':
                    allele2_label = re.sub(r'<.*', '<+>', allele1_label)
                    allele2_id = None
                elif zygosity == 'homozygote':
                    allele2_label = allele1_label
                    allele2_id = allele1_id
                    allele2_rel = geno.object_properties['has_alternate_part']
                elif zygosity == 'hemizygote':
                    allele2_label = re.sub(r'<.*', '<0>', allele1_label)
                    allele2_id = None
                elif zygosity == 'not_applicable':
                    allele2_label = re.sub(r'<.*', '<?>', allele1_label)
                    allele2_id = None
                else:
                    logger.warning("found unknown zygosity %s", zygosity)
                    break
                vslc_name = '/'.join((allele1_label, allele2_label))

                # Add the VSLC
                vslc_id = '-'.join(
                    (marker_accession_id, allele_accession_id, zygosity))
                vslc_id = re.sub(r':', '', vslc_id)
                vslc_id = '_:' + vslc_id
                model.addIndividualToGraph(
                    vslc_id, vslc_name,
                    geno.genoparts['variant_single_locus_complement'])
                geno.addPartsToVSLC(
                    vslc_id, allele1_id, allele2_id, zygosity_id,
                    geno.object_properties['has_alternate_part'], allele2_rel)

                # add vslc to genotype
                geno.addVSLCtoParent(vslc_id, genotype_id)

                # note that the vslc is also the gvc
                model.addType(
                    vslc_id,
                    Genotype.genoparts['genomic_variation_complement'])

                # Add the genomic background
                # create the genomic background id and name
                if strain_accession_id != '':
                    genomic_background_id = strain_accession_id
                else:
                    genomic_background_id = None

                genotype_name = vslc_name
                if genomic_background_id is not None:
                    geno.addGenotype(genomic_background_id, strain_name,
                                     geno.genoparts['genomic_background'])

                    # make a phenotyping-center-specific strain
                    # to use as the background
                    pheno_center_strain_label = \
                        strain_name + '-' + phenotyping_center + '-' + colony
                    pheno_center_strain_id = \
                        '-'.join((re.sub(r':', '', genomic_background_id),
                                  re.sub(r'\s', '_', phenotyping_center),
                                  re.sub(r'\W+', '', colony)))
                    if not re.match(r'^_', pheno_center_strain_id):
                        pheno_center_strain_id = '_:' + pheno_center_strain_id

                    geno.addGenotype(pheno_center_strain_id,
                                     pheno_center_strain_label,
                                     geno.genoparts['genomic_background'])
                    geno.addSequenceDerivesFrom(pheno_center_strain_id,
                                                genomic_background_id)

                    # Making genotype labels from the various parts,
                    # can change later if desired.
                    # since the genotype is reflective of the place
                    # it got made, should put that in to disambiguate
                    genotype_name = \
                        genotype_name+' ['+pheno_center_strain_label+']'
                    geno.addGenomicBackgroundToGenotype(
                        pheno_center_strain_id, genotype_id)
                    geno.addTaxon(taxon_id, pheno_center_strain_id)
                # this is redundant, but i'll keep in in for now
                geno.addSequenceDerivesFrom(genotype_id, colony_id)
                geno.addGenotype(genotype_id, genotype_name)

                # Make the sex-qualified genotype,
                # which is what the phenotype is associated with
                sex_qualified_genotype_id = \
                    self.make_id(
                        (colony_id + phenotyping_center + zygosity +
                         strain_accession_id+sex))
                sex_qualified_genotype_label = genotype_name + ' (' + sex + ')'
                if sex == 'male':
                    sq_type_id = geno.genoparts['male_genotype']
                elif sex == 'female':
                    sq_type_id = geno.genoparts['female_genotype']
                else:
                    sq_type_id = geno.genoparts['sex_qualified_genotype']

                geno.addGenotype(sex_qualified_genotype_id,
                                 sex_qualified_genotype_label, sq_type_id)
                geno.addParts(genotype_id, sex_qualified_genotype_id,
                              geno.object_properties['has_alternate_part'])

                if genomic_background_id is not None and \
                        genomic_background_id != '':
                    # Add the taxon to the genomic_background_id
                    geno.addTaxon(taxon_id, genomic_background_id)
                else:
                    # add it as the genomic background
                    geno.addTaxon(taxon_id, genotype_id)

                # #############    BUILD THE G2P ASSOC    #############
                # from an old email dated July 23 2014:
                # Phenotypes associations are made to
                # imits colony_id+center+zygosity+gender

                phenotype_id = mp_term_id

                # it seems that sometimes phenotype ids are missing.
                # indicate here
                if phenotype_id is None or phenotype_id == '':
                    logger.warning("No phenotype id specified for row %d: %s",
                                   line_counter, str(row))
                    continue
                # hard coded ECO code
                eco_id = "ECO:0000015"

                # the association comes as a result of a g2p from
                # a procedure in a pipeline at a center and parameter tested

                assoc = G2PAssoc(g, self.name, sex_qualified_genotype_id,
                                 phenotype_id)
                assoc.add_evidence(eco_id)
                # assoc.set_score(float(p_value))

                # TODO add evidence instance using
                # pipeline_stable_id +
                # procedure_stable_id +
                # parameter_stable_id

                assoc.add_association_to_graph()
                assoc_id = assoc.get_association_id()

                # add a free-text description
                try:
                    description = \
                        ' '.join((mp_term_name, 'phenotype determined by',
                                  phenotyping_center, 'in an',
                                  procedure_name, 'assay where',
                                  parameter_name.strip(),
                                  'was measured with an effect_size of',
                                  str(round(float(effect_size), 5)),
                                  '(p =', "{:.4e}".format(float(p_value)), ').'))
                except ValueError:
                    description = \
                        ' '.join((mp_term_name, 'phenotype determined by',
                                  phenotyping_center, 'in an',
                                  procedure_name, 'assay where',
                                  parameter_name.strip(),
                                  'was measured with an effect_size of',
                                  str(effect_size),
                                  '(p =', "{0}".format(p_value), ').'))

                study_bnode = \
                    self._add_study_provenance(
                        impc_map, impress_map, phenotyping_center, colony,
                        project_fullname, pipeline_name, pipeline_stable_id,
                        procedure_stable_id, procedure_name,
                        parameter_stable_id, parameter_name,
                        statistical_method, resource_name)

                evidence_line_bnode = \
                    self._add_evidence(
                        assoc_id, eco_id, impc_map, p_value, percentage_change,
                        effect_size, study_bnode)

                self._add_assertion_provenance(assoc_id, evidence_line_bnode,
                                               impc_map)

                model.addDescription(evidence_line_bnode, description)

                # resource_id = resource_name
                # assoc.addSource(g, assoc_id, resource_id)

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        return
コード例 #11
0
ファイル: AnimalQTLdb.py プロジェクト: d3borah/dipper
    def _process_QTLs_genetic_location(self, raw, taxon_id, common_name, limit=None):
        """
        This function processes

        Triples created:

        :param limit:
        :return:
        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        line_counter = 0
        geno = Genotype(g)
        gu = GraphUtils(curie_map.get())
        eco_id = "ECO:0000061"  # Quantitative Trait Analysis Evidence

        logger.info("Processing genetic location for %s", taxon_id)
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (qtl_id, qtl_symbol, trait_name, assotype, empty, chromosome, position_cm, range_cm,
                 flankmark_a2, flankmark_a1, peak_mark, flankmark_b1, flankmark_b2, exp_id, model, test_base,
                 sig_level, lod_score, ls_mean, p_values, f_statistics, variance, bayes_value, likelihood_ratio,
                 trait_id, dom_effect, add_effect, pubmed_id, gene_id, gene_id_src, gene_id_type, empty2) = row

                if self.testMode and int(qtl_id) not in self.test_ids:
                    continue

                qtl_id = 'AQTL:'+qtl_id
                trait_id = 'AQTLTrait:'+trait_id

                # Add QTL to graph
                f = Feature(qtl_id, qtl_symbol, geno.genoparts['QTL'])
                f.addTaxonToFeature(g, taxon_id)

                # deal with the chromosome
                chrom_id = makeChromID(chromosome, taxon_id, 'CHR')

                # add a version of the chromosome which is defined as the genetic map
                build_id = 'MONARCH:'+common_name.strip()+'-linkage'
                build_label = common_name+' genetic map'
                geno.addReferenceGenome(build_id, build_label, taxon_id)
                chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
                geno.addChromosomeInstance(chromosome, build_id, build_label, chrom_id)
                start = stop = None
                if re.search('-', range_cm):
                    range_parts = re.split('-', range_cm)
                    # check for poorly formed ranges
                    if len(range_parts) == 2 and range_parts[0] != '' and range_parts[1] != '':
                        (start, stop) = [int(float(x.strip())) for x in re.split('-', range_cm)]
                    else:
                        logger.info("There's a cM range we can't handle for QTL %s: %s", qtl_id, range_cm)
                elif position_cm != '':
                    start = stop = int(float(position_cm))

                # FIXME remove converion to int for start/stop when schema can handle floats
                # add in the genetic location based on the range
                f.addFeatureStartLocation(start, chrom_in_build_id, None, [Feature.types['FuzzyPosition']])
                f.addFeatureEndLocation(stop, chrom_in_build_id, None, [Feature.types['FuzzyPosition']])
                f.addFeatureToGraph(g)

                # sometimes there's a peak marker, like a rsid.  we want to add that as a variant of the gene,
                # and xref it to the qtl.
                dbsnp_id = None
                if peak_mark != '' and peak_mark != '.' and re.match('rs', peak_mark.strip()):
                    dbsnp_id = 'dbSNP:'+peak_mark.strip()

                    gu.addIndividualToGraph(g, dbsnp_id, None, geno.genoparts['sequence_alteration'])
                    gu.addXref(g, qtl_id, dbsnp_id)

                if gene_id is not None and gene_id != '' and gene_id != '.':
                    if gene_id_src == 'NCBIgene' or gene_id_src == '':  # we assume if no src is provided, it's NCBI
                        gene_id = 'NCBIGene:'+gene_id.strip()
                        geno.addGene(gene_id, None)  # we will expect that these labels provided elsewhere
                        geno.addAlleleOfGene(qtl_id, gene_id, geno.object_properties['feature_to_gene_relation'])   # FIXME what is the right relationship here?

                        if dbsnp_id is not None:
                            # add the rsid as a seq alt of the gene_id
                            vl_id = '_' + re.sub(':', '', gene_id) + '-' + peak_mark
                            if self.nobnodes:
                                vl_id = ':' + vl_id
                            geno.addSequenceAlterationToVariantLocus(dbsnp_id, vl_id)
                            geno.addAlleleOfGene(vl_id, gene_id)

                # add the trait
                gu.addClassToGraph(g, trait_id, trait_name)

                # Add publication
                r = None
                if re.match('ISU.*', pubmed_id):
                    pub_id = 'AQTLPub:'+pubmed_id.strip()
                    r = Reference(pub_id)
                elif pubmed_id != '':
                    pub_id = 'PMID:'+pubmed_id.strip()
                    r = Reference(pub_id, Reference.ref_types['journal_article'])

                if r is not None:
                    r.addRefToGraph(g)

                # make the association to the QTL
                assoc = G2PAssoc(self.name, qtl_id, trait_id, gu.object_properties['is_marker_for'])
                assoc.add_evidence(eco_id)
                assoc.add_source(pub_id)

                # create a description from the contents of the file
                # desc = ''

                # assoc.addDescription(g, assoc_id, desc)

                # TODO add exp_id as evidence
                # if exp_id != '':
                #     exp_id = 'AQTLExp:'+exp_id
                #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                if p_values != '':
                    score = float(re.sub('<', '', p_values))
                    assoc.set_score(score)  # todo add score type
                # TODO add LOD score?
                assoc.add_association_to_graph(g)

                # make the association to the dbsnp_id, if found
                if dbsnp_id is not None:
                    # make the association to the dbsnp_id
                    assoc = G2PAssoc(self.name, dbsnp_id, trait_id, gu.object_properties['is_marker_for'])
                    assoc.add_evidence(eco_id)
                    assoc.add_source(pub_id)

                    # create a description from the contents of the file
                    # desc = ''
                    # assoc.addDescription(g, assoc_id, desc)

                    # TODO add exp_id
                    # if exp_id != '':
                    #     exp_id = 'AQTLExp:'+exp_id
                    #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                    if p_values != '':
                        score = float(re.sub('<', '', p_values))
                        assoc.set_score(score)  # todo add score type
                    # TODO add LOD score?

                    assoc.add_association_to_graph(g)

                if not self.testMode and limit is not None and line_counter > limit:
                    break

        logger.info("Done with QTL genetic info")
        return