Exemple #1
0
    def get_variants(self,
                     project_id,
                     family_id,
                     genotype_filter=None,
                     variant_filter=None,
                     quality_filter=None,
                     indivs_to_consider=None,
                     user=None):
        """Yield a family's variants that pass ``variant_filter``, sorted by xpos.

        The family collection is queried with the conditions built by
        ``self._make_db_query``. Each hit is converted with
        ``Variant.fromJSON``, tagged with the project and family ids,
        annotated, and yielded only if ``passes_variant_filter`` accepts it.

        ``quality_filter``, ``indivs_to_consider`` and ``user`` are accepted
        but not used by this method.

        If no collection is found for the family, an error is logged and
        nothing is yielded.

        Raises:
            Exception: when the query returns more than
                ``settings.VARIANT_QUERY_RESULTS_LIMIT`` documents. Variants
                before that point have already been yielded.
        """
        db_query = self._make_db_query(genotype_filter, variant_filter)
        collection = self._get_family_collection(project_id, family_id)
        # Compare with None explicitly: recent pymongo Collection objects raise
        # NotImplementedError on truth testing.
        # NOTE(review): assumes a missing collection is reported as None.
        if collection is None:
            logger.error(
                "Error: mongodb collection not found for project %s family %s ",
                project_id, family_id)
            return

        # MongoDB rejects an empty $and array, so an empty db_query must be
        # sent as a plain match-everything query instead.
        clauses = [{k: v} for k, v in db_query.items()]
        mongo_query = {'$and': clauses} if clauses else {}

        results_limit = settings.VARIANT_QUERY_RESULTS_LIMIT
        # Fetch slightly more than the limit so that exceeding it is detectable.
        cursor = collection.find(mongo_query).sort('xpos').limit(
            results_limit + 5)
        for i, variant_dict in enumerate(cursor):
            if i >= results_limit:
                raise Exception(
                    "ERROR: this search exceeded the %s variant result size limit. Please set additional filters and try again."
                    % results_limit)

            variant = Variant.fromJSON(variant_dict)
            variant.set_extra('project_id', project_id)
            variant.set_extra('family_id', family_id)
            self.add_annotations_to_variants([variant], project_id)

            if passes_variant_filter(variant, variant_filter)[0]:
                yield variant
Exemple #2
0
    def get_project_variants_in_gene(self,
                                     project_id,
                                     gene_id,
                                     variant_filter=None):
        """Return the project's variants in ``gene_id``, sorted by unique_tuple().

        A deep copy of ``variant_filter`` (or a fresh ``VariantFilter`` when
        none is given) is restricted to ``gene_id`` so the caller's filter is
        never mutated. Each document found in the project collection is
        converted with ``Variant.fromJSON``, annotated, and kept only if it
        passes the gene-restricted filter.

        Returns:
            list: the matching variants, sorted by ``unique_tuple()``.
        """
        if variant_filter is None:
            modified_variant_filter = VariantFilter()
        else:
            modified_variant_filter = copy.deepcopy(variant_filter)
        modified_variant_filter.add_gene(gene_id)

        db_query = self._make_db_query(None, modified_variant_filter)
        sys.stderr.write("Project Gene Search: " + str(project_id) +
                         " all variants query: " + str(db_query))
        collection = self._get_project_collection(project_id)
        # we have to collect list in memory here because mongo can't sort on xpos,
        # as result size can get too big.
        # need to find a better way to do this.
        cursor = collection.find(db_query).hint([
            ('db_gene_ids', pymongo.ASCENDING), ('xpos', pymongo.ASCENDING)
        ])
        variants = []
        for variant_dict in cursor:
            variant = Variant.fromJSON(variant_dict)
            self.add_annotations_to_variant(variant, project_id)
            # Element 0 is the pass/fail flag; testing the whole return value
            # would always be truthy and let every variant through.
            # NOTE(review): confirm the return shape of passes_variant_filter.
            if passes_variant_filter(variant, modified_variant_filter)[0]:
                variants.append(variant)
        variants.sort(key=lambda v: v.unique_tuple())
        return variants
Exemple #3
0
    def get_variants_in_gene(self,
                             project_id,
                             family_id,
                             gene_id,
                             genotype_filter=None,
                             variant_filter=None):
        """Yield a family's variants in ``gene_id``, sorted by unique_tuple().

        A deep copy of ``variant_filter`` (or a fresh ``VariantFilter`` when
        none is given) is restricted to ``gene_id`` so the caller's filter is
        never mutated. All matching documents are collected in memory,
        filtered, sorted, and only then yielded.

        Nothing is yielded when no collection is found for the family.
        """
        if variant_filter is None:
            modified_variant_filter = VariantFilter()
        else:
            modified_variant_filter = copy.deepcopy(variant_filter)
        modified_variant_filter.add_gene(gene_id)

        db_query = self._make_db_query(genotype_filter,
                                       modified_variant_filter)
        collection = self._get_family_collection(project_id, family_id)
        # Compare with None explicitly: recent pymongo Collection objects raise
        # NotImplementedError on truth testing.
        # NOTE(review): assumes a missing collection is reported as None.
        if collection is None:
            return

        # we have to collect list in memory here because mongo can't sort on xpos,
        # as result size can get too big.
        # need to find a better way to do this.
        cursor = collection.find(db_query).hint([
            ('db_gene_ids', pymongo.ASCENDING), ('xpos', pymongo.ASCENDING)
        ])
        variants = []
        for variant_dict in cursor:
            variant = Variant.fromJSON(variant_dict)
            self.add_annotations_to_variant(variant, project_id)
            # Element 0 is the pass/fail flag; testing the whole return value
            # would always be truthy and let every variant through.
            # NOTE(review): confirm the return shape of passes_variant_filter.
            if passes_variant_filter(variant, modified_variant_filter)[0]:
                variants.append(variant)
        variants.sort(key=lambda v: v.unique_tuple())
        for v in variants:
            yield v
Exemple #4
0
    def get_variants(self,
                     project_id,
                     family_id,
                     genotype_filter=None,
                     variant_filter=None):
        """Yield a family's variants that pass ``variant_filter``, sorted by xpos.

        The family collection is queried with the query built by
        ``self._make_db_query``. Each hit is converted with
        ``Variant.fromJSON``, annotated, and yielded only if
        ``passes_variant_filter`` accepts it.

        If no collection is found for the family, a message is printed and
        nothing is yielded.

        Raises:
            Exception: when the query returns more than
                ``MONGO_QUERY_RESULTS_LIMIT`` documents. Variants before that
                point have already been yielded.
        """
        db_query = self._make_db_query(genotype_filter, variant_filter)
        collection = self._get_family_collection(project_id, family_id)
        # Compare with None explicitly: recent pymongo Collection objects raise
        # NotImplementedError on truth testing.
        # NOTE(review): assumes a missing collection is reported as None.
        if collection is None:
            print(
                "Error: mongodb collection not found for project %s family %s "
                % (project_id, family_id))
            return

        # Fetch slightly more than the limit so that exceeding it is detectable.
        cursor = collection.find(db_query).sort('xpos').limit(
            MONGO_QUERY_RESULTS_LIMIT + 5)
        for i, variant_dict in enumerate(cursor):
            if i >= MONGO_QUERY_RESULTS_LIMIT:
                raise Exception(
                    "ERROR: this search exceeded the %s variant result size limit. Please set additional filters and try again."
                    % MONGO_QUERY_RESULTS_LIMIT)

            variant = Variant.fromJSON(variant_dict)
            self.add_annotations_to_variant(variant, project_id)
            if passes_variant_filter(variant, variant_filter)[0]:
                yield variant
Exemple #5
0
    def get_variants(self, project_id, family_id, genotype_filter=None, variant_filter=None):
        """Yield a family's variants that pass ``variant_filter``, sorted by xpos.

        Documents matching the query from ``_make_db_query`` are read from the
        family collection, converted with ``Variant.fromJSON``, and yielded
        only when ``passes_variant_filter`` reports a pass in element 0.
        """
        query = _make_db_query(genotype_filter, variant_filter)
        family_collection = self._get_family_collection(project_id, family_id)
        cursor = family_collection.find(query).sort('xpos')
        for record in cursor:
            candidate = Variant.fromJSON(record)
            passed = passes_variant_filter(candidate, variant_filter)[0]
            if not passed:
                continue
            yield candidate
    def get_variants(self, project_id, variant_filter=None):
        """Yield the project's variants that pass ``variant_filter``, sorted by xpos.

        ``variant_filter`` is expanded as keyword arguments into a
        ``VariantFilter``; ``None`` (or any falsy value) yields a
        default-constructed one. When ``variant_filter`` is ``None`` every
        variant returned by the query is yielded exactly once, without further
        filtering.
        """
        variant_filter_t = VariantFilter(**(variant_filter if variant_filter else {}))

        # NOTE(review): the raw variant_filter (not variant_filter_t) is what
        # goes to _make_db_query here - confirm that is intended.
        db_query = self._make_db_query(None, variant_filter)
        collection = self._get_project_collection(project_id)
        for variant_dict in collection.find(db_query).sort('xpos'):
            variant = Variant.fromJSON(variant_dict)
            # A single combined condition: with no filter the variant is
            # yielded once instead of falling through to a second yield.
            if variant_filter is None or passes_variant_filter(variant, variant_filter_t)[0]:
                yield variant
    def get_variants(self, project_id, family_id, genotype_filter=None, variant_filter=None):
        """Yield a family's variants that pass ``variant_filter``, sorted by xpos.

        The family collection is queried with the query built by
        ``self._make_db_query``. Each hit is converted with
        ``Variant.fromJSON``, annotated, and yielded only if
        ``passes_variant_filter`` accepts it.

        If no collection is found for the family, a message is printed and
        nothing is yielded.
        """
        db_query = self._make_db_query(genotype_filter, variant_filter)
        collection = self._get_family_collection(project_id, family_id)
        # Compare with None explicitly: recent pymongo Collection objects raise
        # NotImplementedError on truth testing.
        # NOTE(review): assumes a missing collection is reported as None.
        if collection is None:
            print("Error: mongodb collection not found for project %s family %s " % (project_id, family_id))
            return
        for variant_dict in collection.find(db_query).sort('xpos'):
            variant = Variant.fromJSON(variant_dict)
            self.add_annotations_to_variant(variant, project_id)
            if passes_variant_filter(variant, variant_filter)[0]:
                yield variant
Exemple #8
0
    def get_variants(self, project_id, family_id, genotype_filter=None, variant_filter=None):
        """Yield a family's variants that pass ``variant_filter``, sorted by xpos.

        The family collection is queried with the query built by
        ``self._make_db_query``. Each hit is converted with
        ``Variant.fromJSON``, annotated, and yielded only if
        ``passes_variant_filter`` accepts it.

        If no collection is found for the family, a message is printed and
        nothing is yielded.

        Raises:
            Exception: when the query returns more than
                ``MONGO_QUERY_RESULTS_LIMIT`` documents. Variants before that
                point have already been yielded.
        """
        db_query = self._make_db_query(genotype_filter, variant_filter)
        collection = self._get_family_collection(project_id, family_id)
        # Compare with None explicitly: recent pymongo Collection objects raise
        # NotImplementedError on truth testing.
        # NOTE(review): assumes a missing collection is reported as None.
        if collection is None:
            print("Error: mongodb collection not found for project %s family %s " % (project_id, family_id))
            return

        # Fetch slightly more than the limit so that exceeding it is detectable.
        cursor = collection.find(db_query).sort('xpos').limit(MONGO_QUERY_RESULTS_LIMIT + 5)
        for i, variant_dict in enumerate(cursor):
            if i >= MONGO_QUERY_RESULTS_LIMIT:
                raise Exception("ERROR: this search exceeded the %s variant result size limit. Please set additional filters and try again." % MONGO_QUERY_RESULTS_LIMIT)

            variant = Variant.fromJSON(variant_dict)
            self.add_annotations_to_variant(variant, project_id)
            if passes_variant_filter(variant, variant_filter)[0]:
                yield variant
    def get_variants(self, project_id, family_id, genotype_filter=None, variant_filter=None, quality_filter=None, indivs_to_consider=None, user=None):
        """Yield a family's variants that pass ``variant_filter``, sorted by xpos.

        The family collection is queried with the conditions built by
        ``self._make_db_query``. Each hit is converted with
        ``Variant.fromJSON``, tagged with the project and family ids,
        annotated, and yielded only if ``passes_variant_filter`` accepts it.

        ``quality_filter``, ``indivs_to_consider`` and ``user`` are accepted
        but not used by this method.

        If no collection is found for the family, an error is logged and
        nothing is yielded.

        Raises:
            Exception: when the query returns more than
                ``settings.VARIANT_QUERY_RESULTS_LIMIT`` documents. Variants
                before that point have already been yielded.
        """
        db_query = self._make_db_query(genotype_filter, variant_filter)
        collection = self._get_family_collection(project_id, family_id)
        # Compare with None explicitly: recent pymongo Collection objects raise
        # NotImplementedError on truth testing.
        # NOTE(review): assumes a missing collection is reported as None.
        if collection is None:
            logger.error("Error: mongodb collection not found for project %s family %s ", project_id, family_id)
            return

        # MongoDB rejects an empty $and array, so an empty db_query must be
        # sent as a plain match-everything query instead.
        clauses = [{k: v} for k, v in db_query.items()]
        mongo_query = {'$and': clauses} if clauses else {}

        results_limit = settings.VARIANT_QUERY_RESULTS_LIMIT
        # Fetch slightly more than the limit so that exceeding it is detectable.
        cursor = collection.find(mongo_query).sort('xpos').limit(results_limit + 5)
        for i, variant_dict in enumerate(cursor):
            if i >= results_limit:
                raise Exception("ERROR: this search exceeded the %s variant result size limit. Please set additional filters and try again." % results_limit)

            variant = Variant.fromJSON(variant_dict)
            variant.set_extra('project_id', project_id)
            variant.set_extra('family_id', family_id)
            self.add_annotations_to_variants([variant], project_id)

            if passes_variant_filter(variant, variant_filter)[0]:
                yield variant
    def get_variants_in_gene(self, project_id, gene_id, variant_filter=None):
        """Return the project's variants in ``gene_id``, sorted by unique_tuple().

        A deep copy of ``variant_filter`` (or a fresh ``VariantFilter`` when
        none is given) is restricted to ``gene_id`` so the caller's filter is
        never mutated. Each document found in the project collection is
        converted with ``Variant.fromJSON`` and kept only if it passes the
        gene-restricted filter.

        Returns:
            list: the matching variants, sorted by ``unique_tuple()``.
        """
        if variant_filter is None:
            modified_variant_filter = VariantFilter()
        else:
            modified_variant_filter = copy.deepcopy(variant_filter)
        modified_variant_filter.add_gene(gene_id)

        db_query = self._make_db_query(None, modified_variant_filter)
        collection = self._get_project_collection(project_id)

        cursor = collection.find(db_query).hint([('gene_ids', pymongo.ASCENDING), ('xpos', pymongo.ASCENDING)])
        variants = []
        for variant_dict in cursor:
            variant = Variant.fromJSON(variant_dict)
            # Element 0 is the pass/fail flag; testing the whole return value
            # would always be truthy and let every variant through.
            # NOTE(review): confirm the return shape of passes_variant_filter.
            if passes_variant_filter(variant, modified_variant_filter)[0]:
                variants.append(variant)
        variants.sort(key=lambda v: v.unique_tuple())
        return variants
Exemple #11
0
    def get_project_variants_in_gene(self, project_id, gene_id, variant_filter=None):
        """Return the project's variants in ``gene_id``, sorted by unique_tuple().

        A deep copy of ``variant_filter`` (or a fresh ``VariantFilter`` when
        none is given) is restricted to ``gene_id`` so the caller's filter is
        never mutated. All matching documents are converted with
        ``Variant.fromJSON``, annotated in one batch, and then filtered.

        Returns:
            list: the matching variants, sorted by ``unique_tuple()``.
        """
        if variant_filter is None:
            modified_variant_filter = VariantFilter()
        else:
            modified_variant_filter = copy.deepcopy(variant_filter)
        modified_variant_filter.add_gene(gene_id)

        db_query = self._make_db_query(None, modified_variant_filter)
        logger.info("Project Gene Search: %s all variants query: %s", project_id, db_query)
        collection = self._get_project_collection(project_id)
        # we have to collect list in memory here because mongo can't sort on xpos,
        # as result size can get too big.
        # need to find a better way to do this.
        cursor = collection.find(db_query).hint([('db_gene_ids', pymongo.ASCENDING), ('xpos', pymongo.ASCENDING)])
        variants = [Variant.fromJSON(variant_dict) for variant_dict in cursor]
        self.add_annotations_to_variants(variants, project_id)
        # Element 0 is the pass/fail flag; testing the whole return value
        # would always be truthy and let every variant through.
        # NOTE(review): confirm the return shape of passes_variant_filter.
        variants = [
            variant for variant in variants
            if passes_variant_filter(variant, modified_variant_filter)[0]
        ]
        variants.sort(key=lambda v: v.unique_tuple())
        return variants
Exemple #12
0
    def get_variants_in_gene(self, project_id, family_id, gene_id, genotype_filter=None, variant_filter=None):
        """Yield a family's variants in ``gene_id``, sorted by unique_tuple().

        A deep copy of ``variant_filter`` (or a fresh ``VariantFilter`` when
        none is given) is restricted to ``gene_id`` so the caller's filter is
        never mutated. All matching documents are collected in memory,
        filtered, sorted, and only then yielded.
        """
        if variant_filter is None:
            modified_variant_filter = VariantFilter()
        else:
            modified_variant_filter = copy.deepcopy(variant_filter)
        modified_variant_filter.add_gene(gene_id)

        db_query = _make_db_query(genotype_filter, modified_variant_filter)
        collection = self._get_family_collection(project_id, family_id)

        # we have to collect list in memory here because mongo can't sort on xpos,
        # as result size can get too big.
        # need to find a better way to do this.
        cursor = collection.find(db_query).hint([('gene_ids', pymongo.ASCENDING), ('xpos', pymongo.ASCENDING)])
        variants = []
        for variant_dict in cursor:
            variant = Variant.fromJSON(variant_dict)
            # Element 0 is the pass/fail flag; testing the whole return value
            # would always be truthy and let every variant through.
            # NOTE(review): confirm the return shape of passes_variant_filter.
            if passes_variant_filter(variant, modified_variant_filter)[0]:
                variants.append(variant)
        variants.sort(key=lambda v: v.unique_tuple())
        for v in variants:
            yield v
Exemple #13
0
    def get_variants(self, project_id, family_id, genotype_filter=None, variant_filter=None):
        """Yield a family's variants that pass ``variant_filter``, sorted by xpos.

        The family collection is queried with the conditions built by
        ``self._make_db_query``. Each hit is converted with
        ``Variant.fromJSON``, annotated, and yielded only if
        ``passes_variant_filter`` accepts it. Once the results are exhausted,
        the number of documents returned by the query and the number that
        passed the filter are written to stderr.

        If no collection is found for the family, a message is printed and
        nothing is yielded.

        Raises:
            Exception: when the query returns more than
                ``MONGO_QUERY_RESULTS_LIMIT`` documents. Variants before that
                point have already been yielded.
        """
        db_query = self._make_db_query(genotype_filter, variant_filter)
        collection = self._get_family_collection(project_id, family_id)
        # Compare with None explicitly: recent pymongo Collection objects raise
        # NotImplementedError on truth testing.
        # NOTE(review): assumes a missing collection is reported as None.
        if collection is None:
            print("Error: mongodb collection not found for project %s family %s " % (project_id, family_id))
            return

        # MongoDB rejects an empty $and array, so an empty db_query must be
        # sent as a plain match-everything query instead.
        clauses = [{k: v} for k, v in db_query.items()]
        mongo_query = {'$and': clauses} if clauses else {}

        counters = OrderedDict([('returned_by_query', 0), ('passes_variant_filter', 0)])
        # Fetch slightly more than the limit so that exceeding it is detectable.
        cursor = collection.find(mongo_query).sort('xpos').limit(MONGO_QUERY_RESULTS_LIMIT + 5)
        for i, variant_dict in enumerate(cursor):
            if i >= MONGO_QUERY_RESULTS_LIMIT:
                raise Exception("ERROR: this search exceeded the %s variant result size limit. Please set additional filters and try again." % MONGO_QUERY_RESULTS_LIMIT)

            variant = Variant.fromJSON(variant_dict)
            self.add_annotations_to_variant(variant, project_id)
            counters["returned_by_query"] += 1
            if passes_variant_filter(variant, variant_filter)[0]:
                counters["passes_variant_filter"] += 1
                yield variant

        for k, v in counters.items():
            sys.stderr.write("    %s: %s\n" % (k, v))
Exemple #14
0
def get_de_novo_variants(datastore,
                         reference,
                         family,
                         variant_filter=None,
                         quality_filter=None):
    """
    Yield variants that follow de novo inheritance in family.

    The family collection is queried with the genotype filter from
    ``inheritance.get_de_novo_filter`` combined with ``variant_filter``.
    Each hit is annotated and must pass ``variant_filter``. The individuals
    named in the de novo filter must then pass genotype quality checks:

    * if the family does not contain exactly two parent ids, every individual
      is checked against ``quality_filter`` as given;
    * otherwise (trio case) a per-individual copy of ``quality_filter`` is
      tightened with hard-coded thresholds before checking.

    ``reference`` is accepted but not used. ``quality_filter`` may be None;
    the trio branch then starts from an empty filter.

    Raises (on iteration, since this is a generator):
        ValueError: if no collection is found for the family.
        Exception: if the query returns more than 5000 documents.
    """
    de_novo_filter = inheritance.get_de_novo_filter(family)
    db_query = datastore._make_db_query(de_novo_filter, variant_filter)

    collection = datastore._get_family_collection(family.project_id,
                                                  family.family_id)
    # Compare with None explicitly: recent pymongo Collection objects raise
    # NotImplementedError on truth testing.
    # NOTE(review): assumes a missing collection is reported as None.
    if collection is None:
        raise ValueError(
            "Error: mongodb collection not found for project %s family %s " %
            (family.project_id, family.family_id))

    MONGO_QUERY_RESULTS_LIMIT = 5000
    # Fetch slightly more than the limit so that exceeding it is detectable.
    variant_iter = collection.find(db_query).sort('xpos').limit(
        MONGO_QUERY_RESULTS_LIMIT + 5)

    # get ids of parents in this family
    valid_ids = set(family.individuals)
    paternal_ids = set(i.paternal_id for i in family.get_individuals()
                       if i.paternal_id in valid_ids)
    maternal_ids = set(i.maternal_id for i in family.get_individuals()
                       if i.maternal_id in valid_ids)
    parental_ids = paternal_ids | maternal_ids

    # loop over all variants returned
    for i, variant_dict in enumerate(variant_iter):
        # i is zero-based, so i == LIMIT already means LIMIT + 1 results.
        if i >= MONGO_QUERY_RESULTS_LIMIT:
            raise Exception(
                "MONGO_QUERY_RESULTS_LIMIT of %s exceeded for query: %s" %
                (MONGO_QUERY_RESULTS_LIMIT, db_query))

        variant = Variant.fromJSON(variant_dict)
        datastore.add_annotations_to_variant(variant, family.project_id)
        if not passes_variant_filter(variant, variant_filter)[0]:
            continue

        # handle genotype filters
        if len(parental_ids) != 2:
            # ordinary filters for non-trios
            for indiv_id in de_novo_filter.keys():
                genotype = variant.get_genotype(indiv_id)
                if not passes_genotype_filter(genotype, quality_filter):
                    break
            else:
                # loop finished without break: every individual passed
                yield variant
        else:
            # for trios use Mark's recommended filters for de-novo variants:
            # Hard-coded thresholds:
            #   1) Child must have > 10% of combined Parental Read Depth
            #   2) MinimumChildGQscore >= 20
            #   3) MaximumParentAlleleBalance <= 5%
            # Adjustable filters:
            #   Variants should PASS
            #   Child AB should be >= 20

            # compute parental read depth for filter 1
            total_parental_read_depth = 0
            for indiv_id in parental_ids:
                genotype = variant.get_genotype(indiv_id)
                if genotype.extras and 'dp' in genotype.extras and genotype.extras[
                        'dp'] != '.':
                    total_parental_read_depth += int(genotype.extras['dp'])
                else:
                    total_parental_read_depth = None  # both parents must have DP to use the parental_read_depth filters
                    break

            for indiv_id in de_novo_filter.keys():
                # copy before modifying; with no quality_filter start from an
                # empty one instead of failing on None.copy()
                if quality_filter is None:
                    quality_filter_temp = {}
                else:
                    quality_filter_temp = quality_filter.copy()
                if indiv_id in parental_ids:
                    # handle one of the parents
                    quality_filter_temp['max_ab'] = 5
                else:
                    # handle child
                    quality_filter_temp['min_gq'] = 20
                    if total_parental_read_depth is not None:
                        quality_filter_temp[
                            'min_dp'] = total_parental_read_depth * 0.1

                genotype = variant.get_genotype(indiv_id)
                if not passes_genotype_filter(genotype, quality_filter_temp):
                    break
            else:
                # loop finished without break: every individual passed
                yield variant
Exemple #15
0
def get_de_novo_variants(datastore, reference, family, variant_filter=None, quality_filter=None):
    """
    Yield variants that follow de novo inheritance in family.

    The family collection is queried with the genotype filter from
    ``inheritance.get_de_novo_filter`` combined with ``variant_filter``.
    Each hit is annotated and must pass ``variant_filter``. The individuals
    named in the de novo filter must then pass genotype quality checks:

    * if the family does not contain exactly two parent ids, every individual
      is checked against ``quality_filter`` as given;
    * otherwise (trio case) a per-individual copy of ``quality_filter`` is
      tightened with hard-coded thresholds before checking.

    ``reference`` is accepted but not used. ``quality_filter`` may be None;
    the trio branch then starts from an empty filter.

    Raises (on iteration, since this is a generator):
        ValueError: if no collection is found for the family.
    """
    de_novo_filter = inheritance.get_de_novo_filter(family)
    db_query = datastore._make_db_query(de_novo_filter, variant_filter)

    collection = datastore._get_family_collection(family.project_id, family.family_id)
    # Compare with None explicitly: recent pymongo Collection objects raise
    # NotImplementedError on truth testing.
    # NOTE(review): assumes a missing collection is reported as None.
    if collection is None:
        raise ValueError("Error: mongodb collection not found for project %s family %s " % (family.project_id, family.family_id))

    variant_iter = collection.find(db_query).sort('xpos')

    # get ids of parents in this family
    valid_ids = set(family.individuals)
    paternal_ids = set(i.paternal_id for i in family.get_individuals() if i.paternal_id in valid_ids)
    maternal_ids = set(i.maternal_id for i in family.get_individuals() if i.maternal_id in valid_ids)
    parental_ids = paternal_ids | maternal_ids

    # loop over all variants returned
    for variant_dict in variant_iter:
        variant = Variant.fromJSON(variant_dict)
        datastore.add_annotations_to_variant(variant, family.project_id)
        if not passes_variant_filter(variant, variant_filter)[0]:
            continue

        # handle genotype filters
        if len(parental_ids) != 2:
            # ordinary filters for non-trios
            for indiv_id in de_novo_filter.keys():
                genotype = variant.get_genotype(indiv_id)
                if not passes_genotype_filter(genotype, quality_filter):
                    break
            else:
                # loop finished without break: every individual passed
                yield variant
        else:
            # for trios use Mark's recommended filters for de-novo variants:
            # Hard-coded thresholds:
            #   1) Child must have > 10% of combined Parental Read Depth
            #   2) MinimumChildGQscore >= 20
            #   3) MaximumParentAlleleBalance <= 5%
            # Adjustable filters:
            #   Variants should PASS
            #   Child AB should be >= 20

            # compute parental read depth for filter 1
            total_parental_read_depth = 0
            for indiv_id in parental_ids:
                genotype = variant.get_genotype(indiv_id)
                # a '.' placeholder is treated as a missing DP, so int() is
                # never called on it
                if genotype.extras and 'dp' in genotype.extras and genotype.extras['dp'] != '.':
                    total_parental_read_depth += int(genotype.extras['dp'])
                else:
                    total_parental_read_depth = None  # both parents must have DP to use the parental_read_depth filters
                    break

            for indiv_id in de_novo_filter.keys():
                # copy before modifying; with no quality_filter start from an
                # empty one instead of failing on None.copy()
                if quality_filter is None:
                    quality_filter_temp = {}
                else:
                    quality_filter_temp = quality_filter.copy()
                if indiv_id in parental_ids:
                    # handle one of the parents
                    quality_filter_temp['max_ab'] = 5
                else:
                    # handle child
                    quality_filter_temp['min_gq'] = 20
                    if total_parental_read_depth is not None:
                        quality_filter_temp['min_dp'] = total_parental_read_depth * 0.1

                genotype = variant.get_genotype(indiv_id)
                if not passes_genotype_filter(genotype, quality_filter_temp):
                    break
            else:
                # loop finished without break: every individual passed
                yield variant