Example #1
0
def get_compound_het_genes(datastore, reference, family, variant_filter=None, quality_filter=None):
    """
    Gene-based inheritance; genes with variants that follow compound het inheritance in a family

    Compound het implies two variants, so every pair of variants within a gene is
    checked with is_family_compound_het_for_combo. A variant is returned for a gene
    if it takes part in at least one valid pair.

    Args:
        datastore: passed through to get_variants
        reference: passed through to stream_utils.variant_stream_to_gene_stream
        family: family whose individuals (and their affected_status) drive the search
        variant_filter: optional; handed to get_variants and re-applied per gene
        quality_filter: optional; handed to get_variants

    Yields:
        (gene_name, variant_list) tuples, one per gene with at least one valid pair.
        variant_list is a list with each variant appearing once, keyed by unique_tuple().
    """

    # only ask for variants that are het in all affected
    initial_filter = {
        indiv_id: 'ref_alt'
        for indiv_id, individual in family.individuals.items()
        if individual.affected_status == 'affected'
    }

    het_variants = get_variants(
        datastore, family, initial_filter, variant_filter, quality_filter,
        indivs_to_consider=family.indiv_id_list())
    for gene_name, raw_variants in stream_utils.variant_stream_to_gene_stream(het_variants, reference):

        variants = search_utils.filter_gene_variants_by_variant_filter(raw_variants, gene_name, variant_filter)

        # a pair is required, so genes with fewer than 2 variants can never qualify
        if len(variants) < 2:
            continue

        # keyed by unique_tuple() so a variant that is part of several valid
        # pairs is only reported once
        variants_to_return = {}
        for combo in itertools.combinations(variants, 2):
            if is_family_compound_het_for_combo(combo, family):
                variants_to_return[combo[0].unique_tuple()] = combo[0]
                variants_to_return[combo[1].unique_tuple()] = combo[1]

        if variants_to_return:
            # dict.values() is only a view on Python 3; hand callers the real
            # list that the docstring promises
            yield (gene_name, list(variants_to_return.values()))
Example #2
0
def get_genes(db,
              reference,
              family,
              burden_filter=None,
              variant_filter=None,
              quality_filter=None):
    """
    Stream the genes of a family that satisfy the given burden filter.

    A burden filter is the gene-burden analogue of a genotype filter:
    a dict of indiv_id -> key.
    Currently available keys are: at_least_1, at_least_2, less_than_2, none
    (all refer to allele counts).

    Each yielded item is (gene_id, variant_list), where variant_list holds only
    the variants accepted by passes_quality_filter for the individuals named in
    burden_filter. Genes left with no variants are skipped. When burden_filter
    is None every remaining gene is yielded; otherwise only those accepted by
    _passes_burden_filter.

    Food for thought: should "compound_het" be a burden_filter in the future? Or does that go somewhere else?
    TODO: this is really slow right now, we need to optimize
    """
    if burden_filter:
        indivs_to_consider = burden_filter.keys()
    else:
        indivs_to_consider = []

    all_variants = get_variants(db,
                                family,
                                variant_filter=variant_filter,
                                quality_filter=quality_filter)
    gene_stream = stream_utils.variant_stream_to_gene_stream(all_variants,
                                                             reference)
    for gene_id, gene_variants in gene_stream:
        # drop variants that fail the quality filter before judging the gene
        passing = []
        for variant in gene_variants:
            if passes_quality_filter(variant, quality_filter,
                                     indivs_to_consider):
                passing.append(variant)

        if not passing:
            continue

        if burden_filter is None or _passes_burden_filter(passing,
                                                          burden_filter):
            yield gene_id, passing
Example #3
0
def get_recessive_genes(datastore,
                        reference,
                        family,
                        variant_filter=None,
                        quality_filter=None):
    """
    Gene stream combining homozygous recessive, x-linked, and compound het
    inheritance for a family.

    Gene-based: the combined stream is passed through
    stream_utils.remove_duplicate_variants_from_gene_stream, so genes are
    unique and so are the variants within them.
    """
    # the two single-variant inheritance modes are merged at the variant
    # level first, then grouped into genes
    per_variant_streams = [
        get_homozygous_recessive_variants(datastore, reference, family,
                                          variant_filter, quality_filter),
        get_x_linked_variants(datastore, reference, family, variant_filter,
                              quality_filter),
    ]
    merged_variants = stream_utils.combine_variant_streams(per_variant_streams)
    genes_from_single_variants = stream_utils.variant_stream_to_gene_stream(
        merged_variants, reference)

    # compound het is already gene-based, so it joins at the gene level
    genes_from_compound_het = get_compound_het_genes(
        datastore, reference, family, variant_filter, quality_filter)
    all_genes = stream_utils.combine_gene_streams(
        [genes_from_single_variants, genes_from_compound_het], reference)

    # the same gene/variant can come from both sources; uniqify on the way out
    deduplicated = stream_utils.remove_duplicate_variants_from_gene_stream(
        all_genes)
    for gene_item in deduplicated:
        yield gene_item
Example #4
0
def get_compound_het_genes(datastore, reference, family, variant_filter=None, quality_filter=None):
    """
    Gene-based inheritance; genes with variants that follow compound het inheritance in a family

    Note that compound het implies two variants, so we look at all variant pairs
    within a gene (via is_family_compound_het_for_combo) and keep every variant
    that belongs to at least one valid pair.

    Args:
        datastore: passed through to get_variants
        reference: passed through to stream_utils.variant_stream_to_gene_stream
        family: family whose individuals (and their affected_status) drive the search
        variant_filter: optional; handed to get_variants and re-applied per gene
        quality_filter: optional; handed to get_variants

    Yields:
        (gene_name, variant_list) tuples for genes with at least one valid pair;
        variant_list is a list in which each variant (by unique_tuple()) occurs once.
    """

    # only ask for variants that are het in all affected
    initial_filter = {
        indiv_id: 'ref_alt'
        for indiv_id, individual in family.individuals.items()
        if individual.affected_status == 'affected'
    }

    het_variants = get_variants(
        datastore, family, initial_filter, variant_filter, quality_filter,
        indivs_to_consider=family.indiv_id_list())
    for gene_name, raw_variants in stream_utils.variant_stream_to_gene_stream(het_variants, reference):

        variants = search_utils.filter_gene_variants_by_variant_filter(raw_variants, gene_name, variant_filter)

        # don't care about genes w less than 2 variants: no pair can be formed
        if len(variants) < 2:
            continue

        # dict keyed by unique_tuple() de-duplicates variants shared by several pairs
        variants_to_return = {}
        for combo in itertools.combinations(variants, 2):
            if is_family_compound_het_for_combo(combo, family):
                variants_to_return[combo[0].unique_tuple()] = combo[0]
                variants_to_return[combo[1].unique_tuple()] = combo[1]

        if variants_to_return:
            # materialize the values: on Python 3 dict.values() is a view,
            # not the list the docstring promises
            yield (gene_name, list(variants_to_return.values()))
Example #5
0
def get_genes(datastore, reference, cohort, variant_filter=None):
    """
    Stream a cohort's variants grouped by gene.

    Variants are fetched from the datastore for (cohort.project_id,
    cohort.cohort_id) with the optional variant_filter, grouped with
    stream_utils.variant_stream_to_gene_stream, and yielded as
    (gene_id, variant_list) tuples.

    TODO: quality filter. Need to set to null genotype instead of removing variant
    """
    cohort_variants = datastore.get_variants(
        cohort.project_id,
        cohort.cohort_id,
        variant_filter=variant_filter)
    genes = stream_utils.variant_stream_to_gene_stream(cohort_variants, reference)
    for gene_id, gene_variants in genes:
        yield gene_id, gene_variants
Example #6
0
def get_genes(datastore, reference, cohort, variant_filter=None):
    """
    Yield (gene_id, variant_list) tuples covering the variants of a cohort.

    The cohort is identified to the datastore by its project_id and cohort_id;
    variant_filter, when given, is forwarded to datastore.get_variants.
    Grouping by gene is delegated to stream_utils.variant_stream_to_gene_stream.

    TODO: quality filter. Need to set to null genotype instead of removing variant
    """
    project_id, cohort_id = cohort.project_id, cohort.cohort_id
    variant_stream = datastore.get_variants(project_id, cohort_id, variant_filter=variant_filter)
    for gene_id, variants_in_gene in stream_utils.variant_stream_to_gene_stream(variant_stream, reference):
        yield gene_id, variants_in_gene
Example #7
0
def get_recessive_genes(datastore, reference, family, variant_filter=None, quality_filter=None):
    """
    Union of the homozygous recessive, x-linked, and compound het inheritance searches

    Gene-based. The merged gene stream goes through
    stream_utils.remove_duplicate_variants_from_gene_stream before being yielded,
    so genes are unique and variants within them are unique too.
    """
    search_args = (datastore, reference, family, variant_filter, quality_filter)

    # hom rec + x linked are variant streams: merge them, then group by gene
    hom_rec = get_homozygous_recessive_variants(*search_args)
    x_linked = get_x_linked_variants(*search_args)
    variant_level_genes = stream_utils.variant_stream_to_gene_stream(
        stream_utils.combine_variant_streams([hom_rec, x_linked]),
        reference,
    )

    # compound het already comes back grouped by gene
    comp_het_genes = get_compound_het_genes(*search_args)
    merged_genes = stream_utils.combine_gene_streams([variant_level_genes, comp_het_genes], reference)

    # return uniqified
    for gene_item in stream_utils.remove_duplicate_variants_from_gene_stream(merged_genes):
        yield gene_item
Example #8
0
def get_genes(db, reference, family, burden_filter=None, variant_filter=None, quality_filter=None):
    """
    Get gene stream for a family that meets the burden filter.

    Burden filters are analogous to genotype filters, but for gene burden:
    a dict of indiv_id -> key
    Currently available keys are: at_least_1, at_least_2, less_than_2, none
    All refer to allele counts

    Yields (gene_id, variant_list). variant_list contains only the variants that
    pass passes_quality_filter for the individuals in burden_filter; genes with
    nothing left are dropped. With burden_filter=None all remaining genes are
    yielded, otherwise only those for which _passes_burden_filter is true.

    Food for thought: should "compound_het" be a burden_filter in the future? Or does that go somewhere else?
    TODO: this is really slow right now, we need to optimize
    """
    indivs_to_consider = burden_filter.keys() if burden_filter else []
    family_variants = get_variants(db, family, variant_filter=variant_filter, quality_filter=quality_filter)
    genes = stream_utils.variant_stream_to_gene_stream(family_variants, reference)
    for gene_id, gene_variants in genes:
        kept = [
            variant for variant in gene_variants
            if passes_quality_filter(variant, quality_filter, indivs_to_consider)
        ]
        # an empty gene is never yielded; the burden check only runs when a filter was supplied
        if kept and (burden_filter is None or _passes_burden_filter(kept, burden_filter)):
            yield gene_id, kept