Ejemplo n.º 1
0
    def _add_variant_gene_relationship(self, patient_var_map,
                                       gene_coordinate_map):
        """
        Connect every variant in patient_var_map to its gene(s) of interest.

        Right now it is unclear the best approach on how to connect
        variants to genes.  In most cases has_affected_locus/GENO:0000418
        is accurate; however, there are cases where a variant is in the intron
        of one gene and is purported to causally affect another gene down or
        upstream.  In these cases we must first disambiguate which gene
        is the affected locus, and which gene(s) are predicted to be
        causally influenced by it (RO:0002566)

        UPDATE 8-30: In the latest dataset we no longer have 1-many mappings
        between variants and genes, but leaving this here in case we see
        these in the future

        The logic followed here is:
          * exactly one gene of interest: link it with 'has_affected_feature'
          * several genes and the mutation type mentions upstream/downstream:
            look up each gene and compare the variant position with the
            gene coordinates
              - position inside [start, end]  -> reference gene
              - coordinates known, outside    -> up/downstream neighbor
              - no id or no coordinates       -> unmatched
            then
              - one distinct reference gene: 'has_affected_feature' and the
                variant label is rebuilt with that gene
              - two reference genes on opposite strands:
                'has_affected_feature' for both
              - two reference genes otherwise: warn, 'causally_influences'
              - neighbors and unmatched genes: 'causally_influences'
          * several genes and any other mutation type: nothing is added

        :param patient_var_map: dict of patient -> {variant_id: variant};
            each variant is a dict read for the keys 'genes_of_interest',
            'type', 'position', 'build', 'chromosome', 'reference_allele'
            and 'variant_allele'
        :param gene_coordinate_map: mapping of gene id -> dict with
            'start', 'end' and 'strand' ("plus"/"minus")
        :return: None
        """
        dipper_util = DipperUtil()
        model = Model(self.graph)

        for variants in patient_var_map.values():
            for variant_id, variant in variants.items():
                variant_bnode = self.make_id("{0}".format(variant_id), "_")
                genes_of_interest = variant['genes_of_interest']

                if len(genes_of_interest) == 1:
                    # Assume variant is variant allele of gene
                    gene = genes_of_interest[0]
                    gene_id = dipper_util.get_ncbi_id_from_symbol(gene)
                    self._add_gene_to_graph(
                        gene, variant_bnode, gene_id,
                        self.globaltt['has_affected_feature'])
                    continue

                if not re.search(r'upstream|downstream',
                                 variant['type'],
                                 flags=re.I):
                    continue

                # Attempt to disambiguate.  Every gene carries its own id;
                # previously the id was never looked up in this branch, so
                # a stale (or unbound) gene_id was used for all genes.
                ref_genes = []
                up_down_genes = []
                unmatched_genes = []
                for gene in genes_of_interest:
                    gene_id = dipper_util.get_ncbi_id_from_symbol(gene)
                    if not gene_id or gene_id not in gene_coordinate_map:
                        unmatched_genes.append((gene, gene_id))
                        continue
                    coords = gene_coordinate_map[gene_id]
                    if coords['start'] <= variant['position'] <= coords['end']:
                        ref_genes.append({
                            'symbol': gene,
                            'id': gene_id,
                            'strand': coords['strand'],
                        })
                    else:
                        up_down_genes.append((gene, gene_id))

                # A single reference gene, or the same gene repeated because
                # of dupe rows in the source.
                # Credit http://stackoverflow.com/a/3844832
                if ref_genes and ref_genes[1:] == ref_genes[:-1]:
                    ref = ref_genes[0]
                    self._add_gene_to_graph(
                        ref['symbol'], variant_bnode, ref['id'],
                        self.globaltt['has_affected_feature'])

                    # update label with gene; build label expects a list
                    variant_label = self._build_variant_label(
                        variant['build'], variant['chromosome'],
                        variant['position'], variant['reference_allele'],
                        variant['variant_allele'], [ref['symbol']])
                    model.addLabel(variant_bnode, variant_label)

                # Check if reference genes are on different strands
                elif len(ref_genes) == 2:
                    strands = [ref['strand'] for ref in ref_genes]
                    if "minus" in strands and "plus" in strands:
                        relation = self.globaltt['has_affected_feature']
                    else:
                        logger.warning(
                            "unable to map intron variant"
                            " to gene coordinates: %s", variant)
                        relation = self.globaltt['causally_influences']
                    for ref in ref_genes:
                        self._add_gene_to_graph(
                            ref['symbol'], variant_bnode, ref['id'], relation)

                elif re.search(r'intron', variant['type'], flags=re.I):
                    logger.warning(
                        "unable to map intron variant"
                        " to gene coordinates: %s", variant)

                # Neighbors first, then unmatched genes.  Unmatched genes are
                # likely because we cannot map to an NCBIGene or we do not
                # have coordinate information; their (possibly empty) id is
                # passed through just like in the single-gene case above.
                for symbol, gene_id in up_down_genes + unmatched_genes:
                    self._add_gene_to_graph(
                        symbol, variant_bnode, gene_id,
                        self.globaltt['causally_influences'])

        return
Ejemplo n.º 2
0
    def _add_variant_gene_relationship(self, patient_var_map, gene_coordinate_map):
        """
        Link the variants of every patient to their gene(s) of interest.

        Right now it is unclear the best approach on how to connect
        variants to genes.  In most cases has_affected_locus/GENO:0000418
        is accurate; however, there are cases where a variant is in the intron
        of one gene and is purported to causally affect another gene down or
        upstream.  In these cases we must first disambiguate which gene
        is the affected locus, and which gene(s) are predicted to be
        causally influenced by it (RO:0002566)

        UPDATE 8-30: In the latest dataset we no longer have 1-many mappings
        between variants and genes, but leaving this here in case we see
        these in the future

        The logic followed here is:
        if there is a single gene of interest it is linked with
        'has_affected_feature'.  Otherwise, if the mutation type contains
        downstream/upstream, the coordinates of all genes are investigated
        to see if we can disambiguate which genes are which: genes that
        contain the variant position are reference genes, genes with known
        coordinates elsewhere are neighbors, and genes without an id or
        without coordinates are unmatched.  Neighbors and unmatched genes
        are always linked with 'causally_influences'.  Variants with several
        genes and any other mutation type are left unlinked.

        :param patient_var_map: dict of patient -> {variant_id: variant};
            each variant is a dict read for the keys 'genes_of_interest',
            'type', 'position', 'build', 'chromosome', 'reference_allele'
            and 'variant_allele'
        :param gene_coordinate_map: mapping of gene id -> dict with
            'start', 'end' and 'strand' ("plus"/"minus")
        :return: None
        """
        dipper_util = DipperUtil()
        model = Model(self.graph)

        for variants in patient_var_map.values():
            for variant_id, variant in variants.items():
                variant_bnode = self.make_id("{0}".format(variant_id), "_")
                genes_of_interest = variant['genes_of_interest']

                if len(genes_of_interest) == 1:
                    # Assume variant is variant allele of gene
                    gene = genes_of_interest[0]
                    gene_id = dipper_util.get_ncbi_id_from_symbol(gene)
                    self._add_gene_to_graph(
                        gene, variant_bnode, gene_id,
                        self.globaltt['has_affected_feature'])
                    continue

                if not re.search(r'upstream|downstream', variant['type'], flags=re.I):
                    continue

                # Attempt to disambiguate.  The id is resolved per gene and
                # kept next to its symbol; the earlier code never assigned
                # gene_id here and so reused a stale (or unbound) value.
                ref_genes = []
                up_down_genes = []
                unmatched_genes = []
                for gene in genes_of_interest:
                    gene_id = dipper_util.get_ncbi_id_from_symbol(gene)
                    if not gene_id or gene_id not in gene_coordinate_map:
                        unmatched_genes.append((gene, gene_id))
                        continue
                    coords = gene_coordinate_map[gene_id]
                    if coords['start'] <= variant['position'] <= coords['end']:
                        ref_genes.append({
                            'symbol': gene,
                            'id': gene_id,
                            'strand': coords['strand'],
                        })
                    else:
                        up_down_genes.append((gene, gene_id))

                # Exactly one reference gene, or multiple instances of the
                # same gene from dupe rows in the source.
                # Credit http://stackoverflow.com/a/3844832
                if ref_genes and ref_genes[1:] == ref_genes[:-1]:
                    ref = ref_genes[0]
                    self._add_gene_to_graph(
                        ref['symbol'], variant_bnode, ref['id'],
                        self.globaltt['has_affected_feature'])

                    # update label with gene; build label expects a list
                    variant_label = self._build_variant_label(
                        variant['build'], variant['chromosome'],
                        variant['position'], variant['reference_allele'],
                        variant['variant_allele'], [ref['symbol']])
                    model.addLabel(variant_bnode, variant_label)

                # Check if reference genes are on different strands
                elif len(ref_genes) == 2:
                    strands = [ref['strand'] for ref in ref_genes]
                    if "minus" in strands and "plus" in strands:
                        relation = self.globaltt['has_affected_feature']
                    else:
                        LOG.warning(
                            "unable to map intron variant to gene coordinates: %s",
                            variant)
                        relation = self.globaltt['causally_influences']
                    for ref in ref_genes:
                        self._add_gene_to_graph(
                            ref['symbol'], variant_bnode, ref['id'], relation)

                elif re.search(r'intron', variant['type'], flags=re.I):
                    LOG.warning(
                        "unable to map intron variant to gene coordinates_2: %s",
                        variant)

                # Neighbors first, then unmatched genes.  Unmatched genes are
                # likely because we cannot map to an NCBIGene or we do not
                # have coordinate information; their (possibly empty) id is
                # passed through just like in the single-gene case above.
                for symbol, gene_id in up_down_genes + unmatched_genes:
                    self._add_gene_to_graph(
                        symbol, variant_bnode, gene_id,
                        self.globaltt['causally_influences'])

        return
Ejemplo n.º 3
0
    def _process_haplotype(
            self, hap_id, hap_label, chrom_num, chrom_pos, context,
            risk_allele_frequency, mapped_gene, so_ontology):
        """
        Add a haplotype, its member SNPs and their gene links to the graph.

        hap_label, chrom_num, chrom_pos, context and mapped_gene are each
        split on ';' (optionally followed by one whitespace character) and
        are expected to line up position by position.

        :param hap_id: identifier of the haplotype node
        :param hap_label: ';'-separated SNP labels; also used (stripped)
            as the label of the haplotype
        :param chrom_num: ';'-separated chromosome per SNP
        :param chrom_pos: ';'-separated position per SNP
        :param context: ';'-separated variant context per SNP
        :param risk_allele_frequency: attached as the haplotype description
            unless it is '' or 'NR'
        :param mapped_gene: ';'-separated gene symbol per SNP
        :param so_ontology: object whose query() method accepts a SPARQL
            string and returns an iterable of results
            (presumably an rdflib graph of SO - TODO confirm)
        :raises ValueError: if a context cannot be mapped to an SO class
        :return: None
        """
        tax_id = 'NCBITaxon:9606'

        g = self.testgraph if self.testMode else self.graph
        geno = Genotype(g)
        model = Model(g)

        # add the feature to the graph
        hap_description = None
        if risk_allele_frequency not in ('', 'NR'):
            hap_description = \
                str(risk_allele_frequency) + ' [risk allele frequency]'

        model.addIndividualToGraph(hap_id, hap_label.strip(),
                                   Feature.types['haplotype'], hap_description)
        geno.addTaxon(tax_id, hap_id)

        snp_labels = re.split(r';\s?', hap_label)
        chrom_nums = re.split(r';\s?', chrom_num)
        chrom_positions = re.split(r';\s?', chrom_pos)
        context_list = re.split(r';\s?', context)
        mapped_genes = re.split(r';\s?', mapped_gene)
        snp_curies = list()

        for snp in snp_labels:
            snp_curie, snp_type = self._get_curie_and_type_from_id(snp)
            if snp_type is None:
                # make blank node
                snp_curie = self.make_id(snp, "_")

            g.addTriple(hap_id, geno.object_properties['has_variant_part'],
                        snp_curie)
            snp_curies.append(snp_curie)

        # courtesy http://stackoverflow.com/a/16720915
        length = len(snp_labels)
        if not all(len(lst) == length
                   for lst in [chrom_nums, chrom_positions, context_list]):
            logger.warning(
                "Unexpected data field for haplotype %s \n "
                "will not add snp details", hap_label)
            return

        # Genes can only be paired with SNPs when the counts agree.  The
        # check does not depend on the SNP, so do it (and warn) once
        # instead of once per SNP.  Note it fires on any count mismatch.
        genes_align = len(mapped_genes) == length
        if not genes_align:
            logger.warning(
                "More mapped genes than snps, "
                "cannot disambiguate for %s", hap_label)

        # variant context -> key of the positional relation to the gene
        positional_relations = {
            'upstream_gene_variant': 'upstream_of_sequence_of',
            'downstream_gene_variant': 'downstream_of_sequence_of',
        }

        variant_in_gene_count = 0
        for index, snp_curie in enumerate(snp_curies):
            snp_context = context_list[index]
            self._add_snp_to_graph(
                snp_curie, snp_labels[index], chrom_nums[index],
                chrom_positions[index], snp_context)

            if genes_align:
                so_class = self._map_variant_type(snp_context)

                if so_class is None:
                    raise ValueError("Unknown SO class {} in haplotype {}"
                                     .format(snp_context, hap_label))
                so_query = """
                    SELECT ?variant_label
                    WHERE {{
                        {0} rdfs:subClassOf+ SO:0001564 ;
                            rdfs:label ?variant_label .
                    }}
                """.format(so_class)

                # any hit means the SO class is a subclass of SO:0001564
                in_gene = len(list(so_ontology.query(so_query))) > 0
                relation_key = positional_relations.get(snp_context)

                # Resolve the gene only when a triple may be written,
                # and only once per SNP.
                if in_gene or relation_key is not None:
                    gene_id = DipperUtil.get_ncbi_id_from_symbol(
                        mapped_genes[index])
                    if gene_id is not None:
                        if in_gene:
                            geno.addAffectedLocus(snp_curie, gene_id)
                            geno.addAffectedLocus(hap_id, gene_id)
                            variant_in_gene_count += 1
                        if relation_key is not None:
                            g.addTriple(
                                snp_curie,
                                Feature.object_properties[relation_key],
                                gene_id)

        # Separate in case we want to apply a different relation
        # If not this is redundant with triples added above
        if len(mapped_genes) == variant_in_gene_count \
                and len(set(mapped_genes)) == 1:
            gene_id = DipperUtil.get_ncbi_id_from_symbol(mapped_genes[0])
            geno.addAffectedLocus(hap_id, gene_id)

        return
Ejemplo n.º 4
0
    def _process_haplotype(
            self, hap_id, hap_label, chrom_num, chrom_pos, context,
            risk_allele_frequency, mapped_gene, so_ontology):

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        geno = Genotype(graph)
        model = Model(graph)
        # add the feature to the graph
        hap_description = None
        if risk_allele_frequency != '' and risk_allele_frequency != 'NR':
            hap_description = str(risk_allele_frequency) + ' [risk allele frequency]'

        model.addIndividualToGraph(
            hap_id, hap_label.strip(), self.globaltt['haplotype'], hap_description)
        geno.addTaxon(self.globaltt["H**o sapiens"], hap_id)

        snp_labels = re.split(r';\s?', hap_label)
        chrom_nums = re.split(r';\s?', chrom_num)
        chrom_positions = re.split(r';\s?', chrom_pos)
        context_list = re.split(r';\s?', context)
        mapped_genes = re.split(r';\s?', mapped_gene)
        snp_curies = list()

        for index, snp in enumerate(snp_labels):
            snp_curie, snp_type = self._get_curie_and_type_from_id(snp)
            if snp_type is None:
                # make blank node
                snp_curie = self.make_id(snp, "_")

            graph.addTriple(hap_id, self.globaltt['has_variant_part'], snp_curie)
            snp_curies.append(snp_curie)

        # courtesy http://stackoverflow.com/a/16720915
        length = len(snp_labels)
        if not all(len(lst) == length
                   for lst in [chrom_nums, chrom_positions, context_list]):
            LOG.warning(
                "Unexpected data field for haplotype %s \n "
                "will not add snp details", hap_label)
            return

        variant_in_gene_count = 0
        for index, snp_curie in enumerate(snp_curies):
            self._add_snp_to_graph(
                snp_curie, snp_labels[index], chrom_nums[index],
                chrom_positions[index], context_list[index])

            if len(mapped_genes) == len(snp_labels):
                so_class = self.resolve(context_list[index])
                # removed the '+' for recursive  one-or-more rdfs:subClassOf  paths
                # just so it did not return an empty graph
                so_query = """
SELECT ?variant_label
    WHERE {{
        {0} rdfs:subClassOf {1} ;
        rdfs:label ?variant_label .
    }}
                """.format(so_class, self.globaltt['gene_variant'])

                query_result = so_ontology.query(so_query)

                if len(list(query_result)) == 1:
                    gene_id = DipperUtil.get_ncbi_id_from_symbol(mapped_genes[index])

                    if gene_id is not None:
                        geno.addAffectedLocus(snp_curie, gene_id)
                        geno.addAffectedLocus(hap_id, gene_id)
                        variant_in_gene_count += 1

                gene_id = DipperUtil.get_ncbi_id_from_symbol(mapped_genes[index])
                if gene_id is not None:
                    graph.addTriple(
                        snp_curie, self.resolve(context_list[index]), gene_id)

            else:
                LOG.warning(
                    "More mapped genes than snps, cannot disambiguate for %s",
                    hap_label)

        # Seperate in case we want to apply a different relation
        # If not this is redundant with triples added above
        if len(mapped_genes) == variant_in_gene_count and len(set(mapped_genes)) == 1:
            gene_id = DipperUtil.get_ncbi_id_from_symbol(mapped_genes[0])
            geno.addAffectedLocus(hap_id, gene_id)

        return
Ejemplo n.º 5
0
    def _process_haplotype(self, hap_id, hap_label, chrom_num, chrom_pos,
                           context, risk_allele_frequency, mapped_gene,
                           so_ontology):

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        geno = Genotype(graph)
        model = Model(graph)
        # add the feature to the graph
        hap_description = None
        if risk_allele_frequency != '' and risk_allele_frequency != 'NR':
            hap_description = str(
                risk_allele_frequency) + ' [risk allele frequency]'

        model.addIndividualToGraph(hap_id, hap_label.strip(),
                                   self.globaltt['haplotype'], hap_description)
        geno.addTaxon(self.globaltt["H**o sapiens"], hap_id)

        snp_labels = re.split(r';\s?', hap_label)
        chrom_nums = re.split(r';\s?', chrom_num)
        chrom_positions = re.split(r';\s?', chrom_pos)
        context_list = re.split(r';\s?', context)
        mapped_genes = re.split(r';\s?', mapped_gene)
        snp_curies = list()

        for index, snp in enumerate(snp_labels):
            snp_curie, snp_type = self._get_curie_and_type_from_id(snp)
            if snp_type is None:
                # make blank node
                snp_curie = self.make_id(snp, "_")

            graph.addTriple(hap_id, self.globaltt['has_variant_part'],
                            snp_curie)
            snp_curies.append(snp_curie)

        # courtesy http://stackoverflow.com/a/16720915
        length = len(snp_labels)
        if not all(
                len(lst) == length
                for lst in [chrom_nums, chrom_positions, context_list]):
            LOG.warning(
                "Unexpected data field for haplotype %s \n "
                "will not add snp details", hap_label)
            return

        variant_in_gene_count = 0
        for index, snp_curie in enumerate(snp_curies):
            self._add_snp_to_graph(snp_curie, snp_labels[index],
                                   chrom_nums[index], chrom_positions[index],
                                   context_list[index])

            if len(mapped_genes) == len(snp_labels):
                so_class = self.resolve(context_list[index])
                # removed the '+' for recursive  one-or-more rdfs:subClassOf  paths
                # just so it did not return an empty graph
                so_query = """
SELECT ?variant_label
    WHERE {{
        {0} rdfs:subClassOf {1} ;
        rdfs:label ?variant_label .
    }}
                """.format(so_class, self.globaltt['gene_variant'])

                query_result = so_ontology.query(so_query)

                if len(list(query_result)) == 1:
                    gene_id = DipperUtil.get_ncbi_id_from_symbol(
                        mapped_genes[index])

                    if gene_id is not None:
                        geno.addAffectedLocus(snp_curie, gene_id)
                        geno.addAffectedLocus(hap_id, gene_id)
                        variant_in_gene_count += 1

                gene_id = DipperUtil.get_ncbi_id_from_symbol(
                    mapped_genes[index])
                if gene_id is not None:
                    graph.addTriple(snp_curie,
                                    self.resolve(context_list[index]), gene_id)

            else:
                LOG.warning(
                    "More mapped genes than snps, cannot disambiguate for %s",
                    hap_label)

        # Seperate in case we want to apply a different relation
        # If not this is redundant with triples added above
        if len(mapped_genes) == variant_in_gene_count and len(
                set(mapped_genes)) == 1:
            gene_id = DipperUtil.get_ncbi_id_from_symbol(mapped_genes[0])
            geno.addAffectedLocus(hap_id, gene_id)

        return