def get_compound_het_genes(datastore, reference, family, variant_filter=None, quality_filter=None):
    """
    Gene-based inheritance; genes with variants that follow compound het
    inheritance in a family.

    Compound het implies two variants, so every pair of variants within a gene
    is checked with is_family_compound_het_for_combo.

    Args:
        datastore: passed through to get_variants
        reference: passed through to stream_utils.variant_stream_to_gene_stream
        family: family being analysed; individuals whose affected_status is
            'affected' are constrained to the 'ref_alt' genotype
        variant_filter: optional; passed to get_variants and applied again per
            gene via search_utils.filter_gene_variants_by_variant_filter
        quality_filter: optional; passed to get_variants

    Yields:
        (gene_name, variant_list) tuples. variant_list is a list of every
        variant that takes part in at least one valid pair, de-duplicated by
        unique_tuple(). Genes without any valid pair are not yielded.
    """
    # only ask for variants that are het in all affected
    initial_filter = {
        indiv_id: 'ref_alt'
        for indiv_id, individual in family.individuals.items()
        if individual.affected_status == 'affected'
    }

    het_variants = get_variants(
        datastore, family, initial_filter, variant_filter, quality_filter,
        indivs_to_consider=family.indiv_id_list()
    )

    for gene_name, raw_variants in stream_utils.variant_stream_to_gene_stream(het_variants, reference):
        variants = search_utils.filter_gene_variants_by_variant_filter(raw_variants, gene_name, variant_filter)

        # don't care about genes w less than 2 variants
        if len(variants) < 2:
            continue

        # keyed by unique_tuple() so a variant that appears in several valid
        # pairs is only returned once
        variants_to_return = {}
        for combo in itertools.combinations(variants, 2):
            if is_family_compound_het_for_combo(combo, family):
                variants_to_return[combo[0].unique_tuple()] = combo[0]
                variants_to_return[combo[1].unique_tuple()] = combo[1]

        if variants_to_return:
            # materialise the values: on Python 3 dict.values() is a view, not
            # the list that the documented return shape promises
            yield (gene_name, list(variants_to_return.values()))
def get_genes(db, reference, family, burden_filter=None, variant_filter=None, quality_filter=None):
    """
    Stream the genes of a family that satisfy the given burden filter.

    Burden filters are analogous to genotype filters, but for gene burden:
    a dict of indiv_id -> key.
    Currently available keys are: at_least_1, at_least_2, less_than_2, none
    All refer to allele counts.

    Yields (gene_id, variant_list) pairs. variant_list holds only the variants
    that pass passes_quality_filter; genes left with no variants are skipped.
    When burden_filter is None every remaining gene is yielded, otherwise only
    those accepted by _passes_burden_filter.

    Food for thought: should "compound_het" be a burden_filter in the future?
    Or does that go somewhere else?

    TODO: this is really slow right now, we need to optimize
    """
    # the quality check is restricted to the individuals named in the burden filter
    if burden_filter:
        indivs_to_consider = burden_filter.keys()
    else:
        indivs_to_consider = []

    variant_stream = get_variants(db, family, variant_filter=variant_filter, quality_filter=quality_filter)
    gene_stream = stream_utils.variant_stream_to_gene_stream(variant_stream, reference)

    for gene_id, variant_list in gene_stream:
        passing = [
            variant for variant in variant_list
            if passes_quality_filter(variant, quality_filter, indivs_to_consider)
        ]
        if not passing:
            continue

        if burden_filter is None or _passes_burden_filter(passing, burden_filter):
            yield gene_id, passing
def get_recessive_genes(datastore, reference, family, variant_filter=None, quality_filter=None):
    """
    Combination of homozygous recessive, x-linked, and compound het inheritances.

    Gene-based, but genes are unique and variants within them unique too.
    Yields whatever items stream_utils.remove_duplicate_variants_from_gene_stream
    produces for the combined gene stream.
    """
    # hom rec and x linked are variant-level: merge them into one variant
    # stream, then group that stream by gene
    single_variant_streams = [
        get_homozygous_recessive_variants(datastore, reference, family, variant_filter, quality_filter),
        get_x_linked_variants(datastore, reference, family, variant_filter, quality_filter),
    ]
    single_variant_genes = stream_utils.variant_stream_to_gene_stream(
        stream_utils.combine_variant_streams(single_variant_streams),
        reference
    )

    # compound het is already gene-level, so it joins at the gene stream stage
    gene_streams = [
        single_variant_genes,
        get_compound_het_genes(datastore, reference, family, variant_filter, quality_filter),
    ]
    merged_genes = stream_utils.combine_gene_streams(gene_streams, reference)

    # return uniqified
    for gene in stream_utils.remove_duplicate_variants_from_gene_stream(merged_genes):
        yield gene
def get_genes(datastore, reference, cohort, variant_filter=None):
    """
    Stream a cohort's variants grouped by gene, as (gene_id, variant_list) pairs.

    Variants are fetched with datastore.get_variants for the cohort's
    project_id and cohort_id, then grouped by
    stream_utils.variant_stream_to_gene_stream.

    TODO: quality filter. Need to set to null genotype instead of removing variant
    """
    cohort_variants = datastore.get_variants(
        cohort.project_id,
        cohort.cohort_id,
        variant_filter=variant_filter
    )
    gene_stream = stream_utils.variant_stream_to_gene_stream(cohort_variants, reference)

    for gene_id, gene_variants in gene_stream:
        yield gene_id, gene_variants
def get_recessive_genes(datastore, reference, family, variant_filter=None, quality_filter=None):
    """
    Combination of homozygous recessive, x-linked, and compound het inheritances.

    Gene-based, but genes are unique and variants within them unique too.
    Yields the items of the combined gene stream after it has been passed
    through stream_utils.remove_duplicate_variants_from_gene_stream.
    """
    # every inheritance getter below takes the same positional arguments
    inheritance_args = (datastore, reference, family, variant_filter, quality_filter)

    # single-variant inheritances: one variant stream, regrouped by gene
    hom_rec = get_homozygous_recessive_variants(*inheritance_args)
    x_linked = get_x_linked_variants(*inheritance_args)
    per_variant = stream_utils.combine_variant_streams([hom_rec, x_linked])
    per_variant_by_gene = stream_utils.variant_stream_to_gene_stream(per_variant, reference)

    # add the compound het genes; the same gene or variant may now appear twice
    compound_het = get_compound_het_genes(*inheritance_args)
    with_duplicates = stream_utils.combine_gene_streams([per_variant_by_gene, compound_het], reference)

    # return uniqified
    unique_genes = stream_utils.remove_duplicate_variants_from_gene_stream(with_duplicates)
    for entry in unique_genes:
        yield entry
def get_genes(db, reference, family, burden_filter=None, variant_filter=None, quality_filter=None):
    """
    Get gene stream for a family that meets the burden filter.

    Burden filters are analogous to genotype filters, but for gene burden:
    a dict of indiv_id -> key.
    Currently available keys are: at_least_1, at_least_2, less_than_2, none
    All refer to allele counts.

    Yields (gene_id, variant_list) pairs in which variant_list contains only
    variants accepted by passes_quality_filter. A gene is skipped when no
    variant is accepted, or when a burden filter is given and
    _passes_burden_filter rejects the gene.

    Food for thought: should "compound_het" be a burden_filter in the future?
    Or does that go somewhere else?

    TODO: this is really slow right now, we need to optimize
    """
    indivs_to_consider = [] if not burden_filter else burden_filter.keys()

    all_variants = get_variants(db, family, variant_filter=variant_filter, quality_filter=quality_filter)

    for gene_id, gene_variants in stream_utils.variant_stream_to_gene_stream(all_variants, reference):
        kept = []
        for variant in gene_variants:
            if passes_quality_filter(variant, quality_filter, indivs_to_consider):
                kept.append(variant)

        # nothing survived the quality filter for this gene
        if not kept:
            continue

        # a burden filter was supplied and this gene does not satisfy it
        if burden_filter is not None and not _passes_burden_filter(kept, burden_filter):
            continue

        yield gene_id, kept