def add_custom_populations_to_variants(variants, population_slug_list):
    """Ask the custom population store to add the given populations to variants.

    Does nothing when ``population_slug_list`` is empty or None. This is a
    best-effort step: any exception raised while contacting the store is
    reported as a warning on stdout and is not propagated to the caller.

    Args:
        variants: passed through unchanged to the store's
            ``add_populations_to_variants``.
        population_slug_list: slugs of the populations to add; falsy means
            there is nothing to do.
    """
    if not population_slug_list:
        return

    try:
        mall.get_custom_population_store().add_populations_to_variants(
            variants, population_slug_list)
    except Exception as e:
        # Deliberately broad: a failure here must not abort the caller.
        print("WARNING: got unexpected error in add_custom_populations_to_variants: %s" % e)
def handle(self, *args, **options):
    """List the known reference populations, or load one into the custom population store.

    With no positional arguments, prints the global slugs (from
    ``settings.ANNOTATOR_REFERENCE_POPULATION_SLUGS``) and the private slugs
    (from ``ReferencePopulation`` rows) and returns.

    Otherwise ``args[0]`` is the slug of the population to load. The
    population definition is looked up in
    ``settings.ANNOTATOR_REFERENCE_POPULATIONS`` and in the
    ``ReferencePopulation`` table, and must match exactly one entry.

    Options read:
        AF_key: stored as ``vcf_info_key`` on the population definition.
        AC_key / AN_key: used when ``AF_key`` is not set; stored as
            ``ac_info_key`` and ``an_info_key``.

    Exits via ``sys.exit`` when the slug does not match exactly one
    population, or when neither ``AF_key`` nor both ``AC_key`` and ``AN_key``
    are given.
    """
    from xbrowse_server import mall

    if not args:
        print("Global: " + str(list(settings.ANNOTATOR_REFERENCE_POPULATION_SLUGS)))
        print("Private: " + str([p.slug for p in ReferencePopulation.objects.all()]))
        return

    pop_store = mall.get_custom_population_store()
    pop_store._ensure_indices()

    population_id = args[0]
    print("Loading population: " + population_id)

    populations = [p for p in settings.ANNOTATOR_REFERENCE_POPULATIONS if p["slug"] == population_id] + \
        [p.to_dict() for p in ReferencePopulation.objects.all() if p.slug == population_id]

    # Explicit check rather than `assert`: asserts are stripped under `python -O`,
    # which would turn a bad slug into an IndexError or a silent wrong pick.
    if len(populations) != 1:
        sys.exit("ERROR: expected exactly one population with slug %r but found %d" % (
            population_id, len(populations)))

    # Work on a copy so the entry held in settings is not modified in place.
    population_dict = dict(populations[0])

    print(options)

    if options["AF_key"]:
        population_dict["vcf_info_key"] = options["AF_key"]
    elif options["AC_key"] and options["AN_key"]:
        population_dict["ac_info_key"] = options["AC_key"]
        population_dict["an_info_key"] = options["AN_key"]
    else:
        sys.exit("Must specify either --AF-key or both --AC-key and --AN-key")

    pop_store.load_population(population_dict)
def handle(self, *args, **options):
    """List the known reference populations, or load one into the custom population store.

    With no positional arguments, prints the global slugs (from
    ``settings.ANNOTATOR_REFERENCE_POPULATION_SLUGS``) and the private slugs
    (from ``ReferencePopulation`` rows) and returns.

    Otherwise ``args[0]`` is the slug of the population to load. The
    population definition is looked up in
    ``settings.ANNOTATOR_REFERENCE_POPULATIONS`` and in the
    ``ReferencePopulation`` table, and must match exactly one entry.

    Options read:
        AF_key: stored as ``vcf_info_key`` on the population definition.
        AC_key / AN_key: used when ``AF_key`` is not set; stored as
            ``ac_info_key`` and ``an_info_key``.

    Exits via ``sys.exit`` when the slug does not match exactly one
    population, or when neither ``AF_key`` nor both ``AC_key`` and ``AN_key``
    are given.
    """
    from xbrowse_server import mall

    if not args:
        print("Global: " + str(list(settings.ANNOTATOR_REFERENCE_POPULATION_SLUGS)))
        print("Private: " + str([p.slug for p in ReferencePopulation.objects.all()]))
        return

    pop_store = mall.get_custom_population_store()
    pop_store._ensure_indices()

    population_id = args[0]
    populations = [p for p in settings.ANNOTATOR_REFERENCE_POPULATIONS if p["slug"] == population_id] + \
        [p.to_dict() for p in ReferencePopulation.objects.all() if p.slug == population_id]

    # Explicit check rather than `assert`: asserts are stripped under `python -O`,
    # which would turn a bad slug into an IndexError or a silent wrong pick.
    if len(populations) != 1:
        sys.exit("ERROR: expected exactly one population with slug %r but found %d" % (
            population_id, len(populations)))

    # Work on a copy so the entry held in settings is not modified in place.
    population_dict = dict(populations[0])

    if options["AF_key"]:
        population_dict["vcf_info_key"] = options["AF_key"]
    elif options["AC_key"] and options["AN_key"]:
        population_dict["ac_info_key"] = options["AC_key"]
        population_dict["an_info_key"] = options["AN_key"]
    else:
        sys.exit("Must specify either --AF-key or both --AC-key and --AN-key")

    print("Loading population: " + population_id)
    pop_store.load_population(population_dict)
def handle(self, *args, **options):
    """Write a gzipped TSV report of family variants for one project.

    Expects exactly one positional argument, the project id, and exits via
    ``sys.exit`` otherwise. The report is written to
    ``family_variants_<project_id>.tsv.gz`` (relative path). For each
    inheritance mode, family and variant one row is written, followed by a
    group of genotype columns for each of the first 10 individuals of the
    family. Individuals whose genotype is None are reported on stdout and
    contribute no columns.

    Raises:
        AssertionError: when a variant or genotype violates one of the
            module-level frequency / quality thresholds. The output file is
            closed before the exception propagates.
    """
    if not args:
        sys.exit("ERROR: please specify project id on the command line")
    if len(args) > 1:
        sys.exit("ERROR: too many args: %s. Only one project id should be provided." % " ".join(args))

    project_id = args[0]

    # create family_variants.tsv
    family_variants_f = gzip.open("family_variants_%s.tsv.gz" % project_id, "w")
    # try/finally so the gzip stream is closed even when an assertion below fires
    try:
        writer = csv.writer(family_variants_f, dialect="excel", delimiter="\t")

        header_fields = [
            "#inheritance_mode",
            "project_id",
            "family_id",
            "gene",
            "chrom",
            "pos",
            "ref",
            "alt",
            "rsid",
            "annotation",
            "1kg_af",
            "1kg_popmax_af",
            "exac_af",
            "exac_popmax_af",
            "",
        ]

        genotype_headers = ["sample_id", "str", "num_alt", "allele_balance", "AD", "DP", "GQ", "PL"]

        # one group of genotype columns per individual slot (10 slots)
        for slot in range(0, 10):
            for h in genotype_headers:
                header_fields.append("genotype%d_%s" % (slot, h))

        writer.writerow(header_fields)
        family_variants_f.flush()

        for inheritance_mode in ["dominant", "homozygous_recessive", "compound_het", "de_novo", "x_linked_recessive"]:
            # collect the resources that we'll need here
            # NOTE(review): annotator and families are not used below; the calls are
            # kept unchanged in case the accessors have side effects - confirm and remove.
            annotator = mall.get_annotator()
            custom_population_store = mall.get_custom_population_store()
            project = Project.objects.get(project_id=project_id)
            families = project.get_families()

            # get the variants for this inheritance / project combination
            for family, variant_list in get_variants_for_inheritance_for_project(project, inheritance_mode):
                for variant in variant_list:
                    # NOTE(review): the custom population frequencies are looked up but
                    # not written to this report.
                    custom_populations = custom_population_store.get_frequencies(
                        variant.xpos, variant.ref, variant.alt)

                    freqs = variant.annotation["freqs"]
                    g1k_freq = freqs["1kg_wgs_phase3"]
                    g1k_popmax_freq = freqs["1kg_wgs_phase3_popmax"]
                    exac_freq = freqs["exac_v3"]
                    exac_popmax_freq = freqs["exac_v3_popmax"]

                    # sanity checks: variants are expected to already satisfy the thresholds
                    assert g1k_freq <= g1k_freq_threshold, "g1k freq %s > %s" % (g1k_freq, g1k_freq_threshold)
                    assert g1k_popmax_freq <= g1k_popmax_freq_threshold, "g1k popmax freq %s > %s" % (
                        g1k_popmax_freq, g1k_popmax_freq_threshold)
                    assert exac_freq <= exac_freq_threshold, "Exac freq %s > %s" % (exac_freq, exac_freq_threshold)
                    assert exac_popmax_freq <= exac_popmax_threshold, "Exac popmax freq %s > %s" % (
                        exac_popmax_freq, exac_popmax_threshold)

                    row = [
                        inheritance_mode,
                        project_id,
                        family.family_id,
                        get_gene_symbol(variant),
                        variant.chr,
                        str(variant.pos),
                        variant.ref,
                        variant.alt,
                        variant.vcf_id,
                        variant.annotation["vep_group"],
                        g1k_freq,
                        g1k_popmax_freq,
                        exac_freq,
                        exac_popmax_freq,
                        "",
                    ]

                    for indiv_index, individual in enumerate(family.get_individuals()):
                        if indiv_index >= 10:
                            # the header only has room for 10 individuals
                            break

                        genotype = variant.get_genotype(individual.indiv_id)
                        if genotype is None:
                            print("WARNING: %s variant genotype for %s is None" % (variant, individual.indiv_id))
                            continue

                        assert genotype.filter == "pass", "%s %s - filter is %s " % (
                            variant.chr, variant.pos, genotype.filter)
                        assert genotype.gq >= GQ_threshold, "%s %s - GQ is %s " % (
                            variant.chr, variant.pos, genotype.gq)
                        assert genotype.extras["dp"] >= DP_threshold, "%s %s - DP is %s " % (
                            variant.chr, variant.pos, genotype.extras["dp"])
                        if genotype.num_alt == 1:
                            # AB_threshold is a percentage; genotype.ab is compared as a fraction
                            assert genotype.ab >= AB_threshold / 100.0, "%s %s - AB is %s " % (
                                variant.chr, variant.pos, genotype.ab)

                        genotype_str = "/".join(genotype.alleles) if genotype.alleles else "./."

                        row.extend([
                            individual.indiv_id,
                            genotype_str,
                            genotype.num_alt,
                            genotype.ab,
                            genotype.extras["ad"],
                            genotype.extras["dp"],
                            genotype.gq,
                            genotype.extras["pl"],
                        ])

                    writer.writerow(row)
                    family_variants_f.flush()
    finally:
        family_variants_f.close()
def add_custom_populations_to_variants(variants, population_slug_list):
    """Ask the custom population store to add the given populations to variants.

    Does nothing when ``population_slug_list`` is empty or None. This is a
    best-effort step: any exception raised while contacting the store is
    reported as a warning on stdout and is not propagated to the caller.

    Args:
        variants: passed through unchanged to the store's
            ``add_populations_to_variants``.
        population_slug_list: slugs of the populations to add; falsy means
            there is nothing to do.
    """
    if not population_slug_list:
        return

    try:
        store = mall.get_custom_population_store()
        store.add_populations_to_variants(variants, population_slug_list)
    except Exception as e:
        # Deliberately broad: a failure here must not abort the caller.
        print("WARNING: got unexpected error in add_custom_populations_to_variants: %s" % e)
def add_custom_populations_to_variants(variants, population_slug_list):
    """Ask the custom population store to add the given populations to variants.

    Nothing happens when ``population_slug_list`` is empty or None. Errors
    raised by the store propagate to the caller.
    """
    if not population_slug_list:
        return

    store = mall.get_custom_population_store()
    store.add_populations_to_variants(variants, population_slug_list)
def handle_project(project_id):
    """Write a gzipped TSV report of family variants for one project.

    The report goes to ``family_variants_<project_id>.tsv.gz`` (relative
    path). For each inheritance mode, family and variant one row is written,
    followed by a group of genotype columns for each of the first 10
    individuals of the family. Individuals without a genotype get a
    placeholder group ("./." and empty values).

    Threshold violations (frequency, GQ, DP, allele balance) do not abort the
    report: the failed assertion's traceback is printed and the row is still
    written. Families with no individuals are reported on stdout and their
    variants are skipped.

    Args:
        project_id: id used to look up the ``Project`` and to name the file.
    """
    import traceback

    filename = 'family_variants_%s.tsv.gz' % project_id
    print("Generating report: " + filename)

    # create family_variants.tsv
    family_variants_f = gzip.open(filename, 'w')
    # try/finally so the gzip stream is closed even if an unexpected error occurs
    try:
        writer = csv.writer(family_variants_f, dialect='excel', delimiter='\t')

        header_fields = [
            '#inheritance_mode',
            'project_id',
            'family_id',
            'gene',
            'chrom',
            'pos',
            'ref',
            'alt',
            'rsid',
            'filter',
            'clinvar_status',
            'annotation',
            '1kg_af',
            '1kg_popmax_af',
            'exac_af',
            'exac_popmax_af',
            'merck_wgs_3793_af',
            'merck_wgs_144_af',
            'multiallelic_site_alt_alleles (* = spanning deletion)',
            '',
        ]

        genotype_headers = [
            'sample_id',
            'str',
            'num_alt',
            'allele_balance',
            'AD',
            'DP',
            'GQ',
            'PL',
        ]

        # one group of genotype columns per individual slot (10 slots)
        for slot in range(0, 10):
            for h in genotype_headers:
                header_fields.append("genotype%d_%s" % (slot, h))

        writer.writerow(header_fields)

        for inheritance_mode in [
                'homozygous_recessive', 'dominant', 'compound_het', 'de_novo',
                'x_linked_recessive', 'all_variants']:
            # collect the resources that we'll need here
            # NOTE(review): annotator is not used below; the call is kept unchanged
            # in case the accessor has side effects - confirm and remove.
            annotator = mall.get_annotator()
            custom_population_store = mall.get_custom_population_store()
            project = Project.objects.get(project_id=project_id)

            # get the variants for this inheritance / project combination
            for family, family_results in get_variants_for_inheritance_for_project(project, inheritance_mode):
                for variant in family_results:
                    custom_populations = custom_population_store.get_frequencies(
                        variant.xpos, variant.ref, variant.alt)

                    freqs = variant.annotation['freqs']
                    g1k_freq = freqs['1kg_wgs_phase3']
                    g1k_popmax_freq = freqs['1kg_wgs_phase3_popmax']
                    exac_freq = freqs['exac_v3']
                    exac_popmax_freq = freqs['exac_v3_popmax']
                    merck_wgs_3793_freq = custom_populations.get('merck-wgs-3793', 0.0)
                    merck_wgs_144_freq = custom_populations.get('merck-pcr-free-wgs-144', 0.0)

                    # threshold violations are only logged here; the row is still written
                    try:
                        assert g1k_freq <= g1k_freq_threshold, "g1k freq %s > %s" % (
                            g1k_freq, g1k_freq_threshold)
                        assert g1k_popmax_freq <= g1k_popmax_freq_threshold, "g1k popmax freq %s > %s" % (
                            g1k_popmax_freq, g1k_popmax_freq_threshold)
                        assert exac_freq <= exac_freq_threshold, "Exac freq %s > %s" % (
                            exac_freq, exac_freq_threshold)
                        assert exac_popmax_freq <= exac_popmax_threshold, "Exac popmax freq %s > %s" % (
                            exac_popmax_freq, exac_popmax_threshold)
                    except AssertionError:
                        traceback.print_exc()

                    # fetch the individuals once per variant instead of once per use
                    individuals = family.get_individuals()

                    # filter value is stored in the genotypes
                    if len(individuals) == 0:
                        print("Family has 0 individuals: %s - skipping..." % str(family))
                        continue

                    genotype = variant.get_genotype(individuals[0].indiv_id)
                    if genotype is not None:
                        filter_value = genotype.filter
                    else:
                        filter_value = 'unknown'

                    multiallelic_site_other_alleles = []
                    if len(variant.extras['orig_alt_alleles']) > 1:
                        multiallelic_site_other_alleles = variant.extras['orig_alt_alleles']

                    # last entry for this variant, or "" when there is none
                    clinvar_significance = get_clinvar_variants().get(variant.unique_tuple(), [""])[-1]

                    row = [
                        inheritance_mode,
                        project_id,
                        family.family_id,
                        get_gene_symbol(variant),
                        variant.chr,
                        str(variant.pos),
                        variant.ref,
                        variant.alt,
                        variant.vcf_id,
                        filter_value,
                        clinvar_significance,
                        variant.annotation['vep_group'],
                        g1k_freq,
                        g1k_popmax_freq,
                        exac_freq,
                        exac_popmax_freq,
                        merck_wgs_3793_freq,
                        merck_wgs_144_freq,
                        ", ".join(multiallelic_site_other_alleles),
                        '',
                    ]

                    for indiv_index, individual in enumerate(individuals):
                        if indiv_index >= 10:
                            # the header only has room for 10 individuals
                            break

                        genotype = variant.get_genotype(individual.indiv_id)
                        if genotype is None:
                            # placeholder group keeps later individuals in their own columns
                            row.extend([individual.indiv_id, "./.", "", "", "", "", "", ""])
                            continue

                        try:
                            assert genotype.gq >= GQ_threshold, "%s %s - GQ is %s " % (
                                variant.chr, variant.pos, genotype.gq)
                            assert genotype.extras["dp"] >= DP_threshold, "%s %s - DP is %s " % (
                                variant.chr, variant.pos, genotype.extras["dp"])
                            if genotype.num_alt == 1:
                                # AB_threshold is a percentage; genotype.ab is compared as a fraction
                                assert genotype.ab is None or genotype.ab >= AB_threshold / 100., "%s %s - AB is %s " % (
                                    variant.chr, variant.pos, genotype.ab)
                        except AssertionError:
                            traceback.print_exc()

                        genotype_str = "/".join(genotype.alleles) if genotype.alleles else "./."

                        row.extend([
                            individual.indiv_id,
                            genotype_str,
                            genotype.num_alt,
                            genotype.ab if genotype.ab is not None else '',
                            genotype.extras["ad"],
                            genotype.extras["dp"],
                            genotype.gq,
                            genotype.extras["pl"],
                        ])

                    writer.writerow(row)
                    family_variants_f.flush()
    finally:
        family_variants_f.close()

    print("Done with " + filename)
def add_custom_populations_to_variants(variants, population_slug_list):
    """Forward variants and population slugs to the custom population store.

    Returns immediately when ``population_slug_list`` is empty or None;
    exceptions raised by the store are not caught here.
    """
    if not population_slug_list:
        return

    population_store = mall.get_custom_population_store()
    population_store.add_populations_to_variants(variants, population_slug_list)
def handle(self, *args, **options):
    """Write a gzipped TSV report of family variants for one project.

    Expects exactly one positional argument, the project id, and exits via
    ``sys.exit`` otherwise. The report is written to
    ``family_variants_<project_id>.tsv.gz`` (relative path). For each
    inheritance mode, family and variant one row is written, followed by a
    group of genotype columns for each of the first 10 individuals of the
    family. Individuals whose genotype is None are reported on stdout and
    contribute no columns.

    Raises:
        AssertionError: when a variant or genotype violates one of the
            module-level frequency / quality thresholds. The output file is
            closed before the exception propagates.
    """
    if not args:
        sys.exit("ERROR: please specify project id on the command line")
    if len(args) > 1:
        sys.exit("ERROR: too many args: %s. Only one project id should be provided." % " ".join(args))

    project_id = args[0]

    # create family_variants.tsv
    family_variants_f = gzip.open('family_variants_%s.tsv.gz' % project_id, 'w')
    # try/finally so the gzip stream is closed even when an assertion below fires
    try:
        writer = csv.writer(family_variants_f, dialect='excel', delimiter='\t')

        header_fields = [
            '#inheritance_mode',
            'project_id',
            'family_id',
            'gene',
            'chrom',
            'pos',
            'ref',
            'alt',
            'rsid',
            'annotation',
            '1kg_af',
            '1kg_popmax_af',
            'exac_af',
            'exac_popmax_af',
            '',
        ]

        genotype_headers = [
            'sample_id',
            'str',
            'num_alt',
            'allele_balance',
            'AD',
            'DP',
            'GQ',
            'PL',
        ]

        # one group of genotype columns per individual slot (10 slots)
        for slot in range(0, 10):
            for h in genotype_headers:
                header_fields.append("genotype%d_%s" % (slot, h))

        writer.writerow(header_fields)
        family_variants_f.flush()

        for inheritance_mode in ['dominant', 'homozygous_recessive', 'compound_het', 'de_novo', 'x_linked_recessive']:
            # collect the resources that we'll need here
            # NOTE(review): annotator and families are not used below; the calls are
            # kept unchanged in case the accessors have side effects - confirm and remove.
            annotator = mall.get_annotator()
            custom_population_store = mall.get_custom_population_store()
            project = Project.objects.get(project_id=project_id)
            families = project.get_families()

            # get the variants for this inheritance / project combination
            for family, variant_list in get_variants_for_inheritance_for_project(project, inheritance_mode):
                for variant in variant_list:
                    # NOTE(review): the custom population frequencies are looked up but
                    # not written to this report.
                    custom_populations = custom_population_store.get_frequencies(
                        variant.xpos, variant.ref, variant.alt)

                    freqs = variant.annotation['freqs']
                    g1k_freq = freqs['1kg_wgs_phase3']
                    g1k_popmax_freq = freqs['1kg_wgs_phase3_popmax']
                    exac_freq = freqs['exac_v3']
                    exac_popmax_freq = freqs['exac_v3_popmax']

                    # sanity checks: variants are expected to already satisfy the thresholds
                    assert g1k_freq <= g1k_freq_threshold, "g1k freq %s > %s" % (g1k_freq, g1k_freq_threshold)
                    assert g1k_popmax_freq <= g1k_popmax_freq_threshold, "g1k popmax freq %s > %s" % (
                        g1k_popmax_freq, g1k_popmax_freq_threshold)
                    assert exac_freq <= exac_freq_threshold, "Exac freq %s > %s" % (exac_freq, exac_freq_threshold)
                    assert exac_popmax_freq <= exac_popmax_threshold, "Exac popmax freq %s > %s" % (
                        exac_popmax_freq, exac_popmax_threshold)

                    row = [
                        inheritance_mode,
                        project_id,
                        family.family_id,
                        get_gene_symbol(variant),
                        variant.chr,
                        str(variant.pos),
                        variant.ref,
                        variant.alt,
                        variant.vcf_id,
                        variant.annotation['vep_group'],
                        g1k_freq,
                        g1k_popmax_freq,
                        exac_freq,
                        exac_popmax_freq,
                        '',
                    ]

                    for indiv_index, individual in enumerate(family.get_individuals()):
                        if indiv_index >= 10:
                            # the header only has room for 10 individuals
                            break

                        genotype = variant.get_genotype(individual.indiv_id)
                        if genotype is None:
                            print("WARNING: %s variant genotype for %s is None" % (variant, individual.indiv_id))
                            continue

                        assert genotype.filter == "pass", "%s %s - filter is %s " % (
                            variant.chr, variant.pos, genotype.filter)
                        assert genotype.gq >= GQ_threshold, "%s %s - GQ is %s " % (
                            variant.chr, variant.pos, genotype.gq)
                        assert genotype.extras["dp"] >= DP_threshold, "%s %s - DP is %s " % (
                            variant.chr, variant.pos, genotype.extras["dp"])
                        if genotype.num_alt == 1:
                            # AB_threshold is a percentage; genotype.ab is compared as a fraction
                            assert genotype.ab >= AB_threshold / 100., "%s %s - AB is %s " % (
                                variant.chr, variant.pos, genotype.ab)

                        genotype_str = "/".join(genotype.alleles) if genotype.alleles else "./."

                        row.extend([
                            individual.indiv_id,
                            genotype_str,
                            genotype.num_alt,
                            genotype.ab,
                            genotype.extras["ad"],
                            genotype.extras["dp"],
                            genotype.gq,
                            genotype.extras["pl"],
                        ])

                    writer.writerow(row)
                    family_variants_f.flush()
    finally:
        family_variants_f.close()
def handle_project(project_id):
    """Write a gzipped TSV report of family variants for one project.

    The report goes to ``family_variants_<project_id>.tsv.gz`` (relative
    path). For each inheritance mode, family and variant one row is written,
    followed by a group of genotype columns for each of the first 10
    individuals of the family. Individuals without a genotype get a
    placeholder group ("./." and empty values).

    Threshold violations (frequency, GQ, DP, allele balance) do not abort the
    report: the failed assertion's traceback is printed and the row is still
    written. Families with no individuals are reported on stdout and their
    variants are skipped.

    Args:
        project_id: id used to look up the ``Project`` and to name the file.
    """
    import traceback

    filename = 'family_variants_%s.tsv.gz' % project_id
    print("Generating report: " + filename)

    # create family_variants.tsv
    family_variants_f = gzip.open(filename, 'w')
    # try/finally so the gzip stream is closed even if an unexpected error occurs
    try:
        writer = csv.writer(family_variants_f, dialect='excel', delimiter='\t')

        header_fields = [
            '#inheritance_mode',
            'project_id',
            'family_id',
            'gene',
            'chrom',
            'pos',
            'ref',
            'alt',
            'rsid',
            'filter',
            'clinvar_status',
            'annotation',
            '1kg_af',
            '1kg_popmax_af',
            'exac_af',
            'exac_popmax_af',
            'merck_wgs_3793_af',
            'merck_wgs_144_af',
            'multiallelic_site_alt_alleles (* = spanning deletion)',
            '',
        ]

        genotype_headers = [
            'sample_id',
            'str',
            'num_alt',
            'allele_balance',
            'AD',
            'DP',
            'GQ',
            'PL',
        ]

        # one group of genotype columns per individual slot (10 slots)
        for slot in range(0, 10):
            for h in genotype_headers:
                header_fields.append("genotype%d_%s" % (slot, h))

        writer.writerow(header_fields)

        for inheritance_mode in ['homozygous_recessive', 'dominant', 'compound_het', 'de_novo',
                                 'x_linked_recessive', 'all_variants']:
            # collect the resources that we'll need here
            # NOTE(review): annotator is not used below; the call is kept unchanged
            # in case the accessor has side effects - confirm and remove.
            annotator = mall.get_annotator()
            custom_population_store = mall.get_custom_population_store()
            project = Project.objects.get(project_id=project_id)

            # get the variants for this inheritance / project combination
            for family, family_results in get_variants_for_inheritance_for_project(project, inheritance_mode):
                for variant in family_results:
                    custom_populations = custom_population_store.get_frequencies(
                        variant.xpos, variant.ref, variant.alt)

                    freqs = variant.annotation['freqs']
                    g1k_freq = freqs['1kg_wgs_phase3']
                    g1k_popmax_freq = freqs['1kg_wgs_phase3_popmax']
                    exac_freq = freqs['exac_v3']
                    exac_popmax_freq = freqs['exac_v3_popmax']
                    merck_wgs_3793_freq = custom_populations.get('merck-wgs-3793', 0.0)
                    merck_wgs_144_freq = custom_populations.get('merck-pcr-free-wgs-144', 0.0)

                    # threshold violations are only logged here; the row is still written
                    try:
                        assert g1k_freq <= g1k_freq_threshold, "g1k freq %s > %s" % (
                            g1k_freq, g1k_freq_threshold)
                        assert g1k_popmax_freq <= g1k_popmax_freq_threshold, "g1k popmax freq %s > %s" % (
                            g1k_popmax_freq, g1k_popmax_freq_threshold)
                        assert exac_freq <= exac_freq_threshold, "Exac freq %s > %s" % (
                            exac_freq, exac_freq_threshold)
                        assert exac_popmax_freq <= exac_popmax_threshold, "Exac popmax freq %s > %s" % (
                            exac_popmax_freq, exac_popmax_threshold)
                    except AssertionError:
                        traceback.print_exc()

                    # fetch the individuals once per variant instead of once per use
                    individuals = family.get_individuals()

                    # filter value is stored in the genotypes
                    if len(individuals) == 0:
                        print("Family has 0 individuals: %s - skipping..." % str(family))
                        continue

                    # The first individual may have no genotype for this variant;
                    # report 'unknown' instead of failing on None.filter.
                    first_genotype = variant.get_genotype(individuals[0].indiv_id)
                    if first_genotype is not None:
                        filter_value = first_genotype.filter
                    else:
                        filter_value = 'unknown'

                    multiallelic_site_other_alleles = []
                    if len(variant.extras['orig_alt_alleles']) > 1:
                        multiallelic_site_other_alleles = variant.extras['orig_alt_alleles']

                    # last entry for this variant, or "" when there is none
                    clinvar_significance = CLINVAR_VARIANTS.get(variant.unique_tuple(), [""])[-1]

                    row = [
                        inheritance_mode,
                        project_id,
                        family.family_id,
                        get_gene_symbol(variant),
                        variant.chr,
                        str(variant.pos),
                        variant.ref,
                        variant.alt,
                        variant.vcf_id,
                        filter_value,
                        clinvar_significance,
                        variant.annotation['vep_group'],
                        g1k_freq,
                        g1k_popmax_freq,
                        exac_freq,
                        exac_popmax_freq,
                        merck_wgs_3793_freq,
                        merck_wgs_144_freq,
                        ", ".join(multiallelic_site_other_alleles),
                        '',
                    ]

                    for indiv_index, individual in enumerate(individuals):
                        if indiv_index >= 10:
                            # the header only has room for 10 individuals
                            break

                        genotype = variant.get_genotype(individual.indiv_id)
                        if genotype is None:
                            # placeholder group keeps later individuals in their own columns
                            row.extend([individual.indiv_id, "./.", "", "", "", "", "", ""])
                            continue

                        try:
                            assert genotype.gq >= GQ_threshold, "%s %s - GQ is %s " % (
                                variant.chr, variant.pos, genotype.gq)
                            assert genotype.extras["dp"] >= DP_threshold, "%s %s - DP is %s " % (
                                variant.chr, variant.pos, genotype.extras["dp"])
                            if genotype.num_alt == 1:
                                # AB_threshold is a percentage; genotype.ab is compared as a fraction
                                assert genotype.ab is None or genotype.ab >= AB_threshold / 100., "%s %s - AB is %s " % (
                                    variant.chr, variant.pos, genotype.ab)
                        except AssertionError:
                            traceback.print_exc()

                        genotype_str = "/".join(genotype.alleles) if genotype.alleles else "./."

                        row.extend([
                            individual.indiv_id,
                            genotype_str,
                            genotype.num_alt,
                            genotype.ab if genotype.ab is not None else '',
                            genotype.extras["ad"],
                            genotype.extras["dp"],
                            genotype.gq,
                            genotype.extras["pl"],
                        ])

                    writer.writerow(row)
                    family_variants_f.flush()
    finally:
        family_variants_f.close()

    print("Done with " + filename)
def handle(self, *args, **options):
    """Write a gzipped TSV of filtered variants for selected individuals of a project.

    Positional arguments (exactly two, otherwise the command exits):
        args[0]: project id.
        args[1]: path of a text file with one individual id per line. Only
            the first tab-separated field of each line is used; blank lines
            and lines starting with "#" are ignored.

    Every id in the file must belong to the project; otherwise the command
    exits listing the invalid and the valid ids. The report is written to
    ``individuals_in_<project_id>.tsv.gz`` (relative path), one row per
    individual and variant that passes the variant and quality filters.

    Raises:
        AssertionError: when a returned variant or genotype violates one of
            the module-level thresholds. The output file is closed before
            the exception propagates.
    """
    if len(args) != 2:
        sys.exit("ERROR: please specify the project_id and file of individual ids as command line args.")

    project_id = args[0]
    individuals_file = args[1]

    # init objects
    project = Project.objects.get(project_id=project_id)
    all_individual_ids_in_project = set(i.indiv_id for i in project.get_individuals())

    individuals_of_interest = []
    invalid_individual_ids = []
    with open(individuals_file) as f:
        for line in f:
            line = line.strip('\n')
            if not line or line.startswith("#"):
                continue
            individual_id = line.split("\t")[0]
            if individual_id in all_individual_ids_in_project:
                individuals_of_interest.append(individual_id)
            else:
                invalid_individual_ids.append(individual_id)

    print("Processing %s: %d individuals " % (project_id, len(individuals_of_interest)))
    if invalid_individual_ids:
        # Explicit mapping instead of locals(); the total counts the ids read from
        # the file, which is what "out of" refers to in the message.
        sys.exit(("ERROR: %(individuals_file)s: %(num_invalid)s out of %(total_ids)s ids are invalid. \nThe invalid ids are: "
                  "%(invalid_individual_ids)s.\nValid ids are: %(individuals_of_interest)s") % {
            "individuals_file": individuals_file,
            "num_invalid": len(invalid_individual_ids),
            "total_ids": len(invalid_individual_ids) + len(individuals_of_interest),
            "invalid_individual_ids": invalid_individual_ids,
            "individuals_of_interest": individuals_of_interest,
        })

    # set for O(1) membership tests in the per-individual loop below
    ids_of_interest = set(individuals_of_interest)

    # filter
    variant_filter = get_default_variant_filter('moderate_impact')
    variant_filter.ref_freqs.append(('1kg_wgs_phase3', g1k_freq_threshold))
    variant_filter.ref_freqs.append(('1kg_wgs_phase3_popmax', g1k_popmax_freq_threshold))
    variant_filter.ref_freqs.append(('exac_v3', exac_freq_threshold))
    variant_filter.ref_freqs.append(('exac_v3_popmax', exac_popmax_threshold))
    variant_filter.ref_freqs.append(('merck-wgs-3793', merck_wgs_3793_threshold))
    quality_filter = {
        'vcf_filter': 'pass',
        'min_gq': GQ_threshold,
        'min_ab': AB_threshold,
    }

    # create individuals_variants.tsv
    individual_variants_f = gzip.open('individuals_in_%s.tsv.gz' % project_id, 'w')
    # try/finally so the gzip stream is closed even when an assertion below fires
    try:
        writer = csv.writer(individual_variants_f, dialect='excel', delimiter='\t')

        header_fields = [
            'project_id',
            'family_id',
            'individual_id',
            'gene',
            'chrom',
            'pos',
            'ref',
            'alt',
            'rsid',
            'annotation',
            '1kg_af',
            '1kg_popmax_af',
            'exac_af',
            'exac_popmax_af',
            'merck_wgs_3793_af',
            'genotype_str',
            'genotype_num_alt',
            'genotype_allele_balance',
            'genotype_AD',
            'genotype_DP',
            'genotype_GQ',
            'genotype_PL',
            'genotype_filter',
        ]

        writer.writerow(header_fields)

        # collect the resources that we'll need here
        # NOTE(review): annotator is not used below; the call is kept unchanged
        # in case the accessor has side effects - confirm and remove.
        annotator = get_annotator()
        custom_population_store = get_custom_population_store()

        individual_counter = 0
        for family in project.get_families():
            for individual in family.get_individuals():
                if individual.indiv_id not in ids_of_interest:
                    continue
                individual_counter += 1
                print("%s: %s, individual %s" % (individual_counter, family.family_id, individual.indiv_id))

                for variant in get_variants(get_datastore(project.project_id),
                                            family.xfamily(),
                                            variant_filter=variant_filter,
                                            quality_filter=quality_filter,
                                            indivs_to_consider=[individual.indiv_id]):
                    genotype = variant.get_genotype(individual.indiv_id)
                    # skip missing / no-call genotypes, low depth and hom-ref calls
                    if (genotype is None or len(genotype.alleles) == 0
                            or genotype.extras["dp"] < DP_threshold or genotype.num_alt == 0):
                        continue

                    custom_populations = custom_population_store.get_frequencies(
                        variant.xpos, variant.ref, variant.alt)

                    genotype_str = "/".join(genotype.alleles) if genotype.alleles else "./."

                    freqs = variant.annotation['freqs']
                    g1k_freq = freqs['1kg_wgs_phase3']
                    g1k_popmax_freq = freqs['1kg_wgs_phase3_popmax']
                    exac_freq = freqs['exac_v3']
                    exac_popmax_freq = freqs['exac_v3_popmax']
                    merck_wgs_3793_freq = custom_populations.get('merck-wgs-3793', 0.0)

                    # sanity checks: the filters above should already guarantee all of these
                    assert g1k_freq <= g1k_freq_threshold, "g1k freq %s > %s" % (g1k_freq, g1k_freq_threshold)
                    assert g1k_popmax_freq <= g1k_popmax_freq_threshold, "g1k popmax freq %s > %s" % (
                        g1k_popmax_freq, g1k_popmax_freq_threshold)
                    assert exac_freq <= exac_freq_threshold, "Exac freq %s > %s" % (exac_freq, exac_freq_threshold)
                    assert exac_popmax_freq <= exac_popmax_threshold, "Exac popmax freq %s > %s" % (
                        exac_popmax_freq, exac_popmax_threshold)
                    assert merck_wgs_3793_freq <= merck_wgs_3793_threshold, "merck-wgs-3793 freq %s > %s" % (
                        merck_wgs_3793_freq, merck_wgs_3793_threshold)

                    assert genotype.gq >= GQ_threshold, "%s %s - GQ is %s " % (
                        variant.chr, variant.pos, genotype.gq)
                    assert genotype.extras["dp"] >= DP_threshold, "%s %s - DP is %s " % (
                        variant.chr, variant.pos, genotype.extras["dp"])
                    if genotype.num_alt == 1:
                        # AB_threshold is a percentage; genotype.ab is compared as a fraction
                        assert genotype.ab >= AB_threshold / 100., "%s %s - AB is %s " % (
                            variant.chr, variant.pos, genotype.ab)
                    assert genotype.filter == "pass", "%s %s - filter is %s " % (
                        variant.chr, variant.pos, genotype.filter)

                    row = [
                        project_id,
                        family.family_id,
                        individual.indiv_id,
                        get_gene_symbol(variant),
                        variant.chr,
                        variant.pos,
                        variant.ref,
                        variant.alt,
                        variant.vcf_id,
                        variant.annotation['vep_group'],
                        g1k_freq,
                        g1k_popmax_freq,
                        exac_freq,
                        exac_popmax_freq,
                        merck_wgs_3793_freq,
                        genotype_str,
                        genotype.num_alt,
                        genotype.ab,
                        genotype.extras["ad"],
                        genotype.extras["dp"],
                        genotype.gq,
                        genotype.extras["pl"],
                        genotype.filter,
                    ]
                    writer.writerow([str(value) for value in row])
                    individual_variants_f.flush()
    finally:
        individual_variants_f.close()
def handle(self, *args, **options):
    """Write a gzipped TSV of filtered variants for selected individuals of a project.

    Positional arguments (exactly two, otherwise the command exits):
        args[0]: project id.
        args[1]: path of a text file with one individual id per line. Only
            the first tab-separated field of each line is used; blank lines
            and lines starting with "#" are ignored.

    Every id in the file must belong to the project; otherwise the command
    exits listing the invalid and the valid ids. The report is written to
    ``individuals_in_<project_id>.tsv.gz`` (relative path), one row per
    individual and variant that passes the variant and quality filters.

    Raises:
        AssertionError: when a returned variant or genotype violates one of
            the module-level thresholds. The output file is closed before
            the exception propagates.
    """
    if len(args) != 2:
        sys.exit(
            "ERROR: please specify the project_id and file of individual ids as command line args."
        )

    project_id = args[0]
    individuals_file = args[1]

    # init objects
    project = Project.objects.get(project_id=project_id)
    all_individual_ids_in_project = set(
        i.indiv_id for i in project.get_individuals())

    individuals_of_interest = []
    invalid_individual_ids = []
    with open(individuals_file) as f:
        for line in f:
            line = line.strip('\n')
            if not line or line.startswith("#"):
                continue
            individual_id = line.split("\t")[0]
            if individual_id in all_individual_ids_in_project:
                individuals_of_interest.append(individual_id)
            else:
                invalid_individual_ids.append(individual_id)

    print("Processing %s: %d individuals " %
          (project_id, len(individuals_of_interest)))
    if invalid_individual_ids:
        # Explicit mapping instead of locals(); the total counts the ids read from
        # the file, which is what "out of" refers to in the message.
        message_values = {
            "individuals_file": individuals_file,
            "num_invalid": len(invalid_individual_ids),
            "total_ids": len(invalid_individual_ids) + len(individuals_of_interest),
            "invalid_individual_ids": invalid_individual_ids,
            "individuals_of_interest": individuals_of_interest,
        }
        sys.exit((
            "ERROR: %(individuals_file)s: %(num_invalid)s out of %(total_ids)s ids are invalid. \nThe invalid ids are: "
            "%(invalid_individual_ids)s.\nValid ids are: %(individuals_of_interest)s"
        ) % message_values)

    # set for O(1) membership tests in the per-individual loop below
    ids_of_interest = set(individuals_of_interest)

    # filter
    variant_filter = get_default_variant_filter('moderate_impact')
    variant_filter.ref_freqs.append(('1kg_wgs_phase3', g1k_freq_threshold))
    variant_filter.ref_freqs.append(
        ('1kg_wgs_phase3_popmax', g1k_popmax_freq_threshold))
    variant_filter.ref_freqs.append(('exac_v3', exac_freq_threshold))
    variant_filter.ref_freqs.append(
        ('exac_v3_popmax', exac_popmax_threshold))
    variant_filter.ref_freqs.append(
        ('merck-wgs-3793', merck_wgs_3793_threshold))
    quality_filter = {
        'vcf_filter': 'pass',
        'min_gq': GQ_threshold,
        'min_ab': AB_threshold,
    }

    # create individuals_variants.tsv
    individual_variants_f = gzip.open(
        'individuals_in_%s.tsv.gz' % project_id, 'w')
    # try/finally so the gzip stream is closed even when an assertion below fires
    try:
        writer = csv.writer(individual_variants_f,
                            dialect='excel',
                            delimiter='\t')

        header_fields = [
            'project_id',
            'family_id',
            'individual_id',
            'gene',
            'chrom',
            'pos',
            'ref',
            'alt',
            'rsid',
            'annotation',
            '1kg_af',
            '1kg_popmax_af',
            'exac_af',
            'exac_popmax_af',
            'merck_wgs_3793_af',
            'genotype_str',
            'genotype_num_alt',
            'genotype_allele_balance',
            'genotype_AD',
            'genotype_DP',
            'genotype_GQ',
            'genotype_PL',
            'genotype_filter',
        ]

        writer.writerow(header_fields)

        # collect the resources that we'll need here
        # NOTE(review): annotator is not used below; the call is kept unchanged
        # in case the accessor has side effects - confirm and remove.
        annotator = get_annotator()
        custom_population_store = get_custom_population_store()

        individual_counter = 0
        for family in project.get_families():
            for individual in family.get_individuals():
                if individual.indiv_id not in ids_of_interest:
                    continue
                individual_counter += 1
                print("%s: %s, individual %s" %
                      (individual_counter, family.family_id,
                       individual.indiv_id))

                for variant in get_variants(
                        get_datastore(project.project_id),
                        family.xfamily(),
                        variant_filter=variant_filter,
                        quality_filter=quality_filter,
                        indivs_to_consider=[individual.indiv_id]):
                    genotype = variant.get_genotype(individual.indiv_id)
                    # skip missing / no-call genotypes, low depth and hom-ref calls
                    if (genotype is None or len(genotype.alleles) == 0
                            or genotype.extras["dp"] < DP_threshold
                            or genotype.num_alt == 0):
                        continue

                    custom_populations = custom_population_store.get_frequencies(
                        variant.xpos, variant.ref, variant.alt)

                    genotype_str = "/".join(
                        genotype.alleles) if genotype.alleles else "./."

                    freqs = variant.annotation['freqs']
                    g1k_freq = freqs['1kg_wgs_phase3']
                    g1k_popmax_freq = freqs['1kg_wgs_phase3_popmax']
                    exac_freq = freqs['exac_v3']
                    exac_popmax_freq = freqs['exac_v3_popmax']
                    merck_wgs_3793_freq = custom_populations.get(
                        'merck-wgs-3793', 0.0)

                    # sanity checks: the filters above should already guarantee all of these
                    assert g1k_freq <= g1k_freq_threshold, "g1k freq %s > %s" % (
                        g1k_freq, g1k_freq_threshold)
                    assert g1k_popmax_freq <= g1k_popmax_freq_threshold, "g1k popmax freq %s > %s" % (
                        g1k_popmax_freq, g1k_popmax_freq_threshold)
                    assert exac_freq <= exac_freq_threshold, "Exac freq %s > %s" % (
                        exac_freq, exac_freq_threshold)
                    assert exac_popmax_freq <= exac_popmax_threshold, "Exac popmax freq %s > %s" % (
                        exac_popmax_freq, exac_popmax_threshold)
                    assert merck_wgs_3793_freq <= merck_wgs_3793_threshold, "merck-wgs-3793 freq %s > %s" % (
                        merck_wgs_3793_freq, merck_wgs_3793_threshold)

                    assert genotype.gq >= GQ_threshold, "%s %s - GQ is %s " % (
                        variant.chr, variant.pos, genotype.gq)
                    assert genotype.extras["dp"] >= DP_threshold, "%s %s - DP is %s " % (
                        variant.chr, variant.pos, genotype.extras["dp"])
                    if genotype.num_alt == 1:
                        # AB_threshold is a percentage; genotype.ab is compared as a fraction
                        assert genotype.ab >= AB_threshold / 100., "%s %s - AB is %s " % (
                            variant.chr, variant.pos, genotype.ab)
                    assert genotype.filter == "pass", "%s %s - filter is %s " % (
                        variant.chr, variant.pos, genotype.filter)

                    row = [
                        project_id,
                        family.family_id,
                        individual.indiv_id,
                        get_gene_symbol(variant),
                        variant.chr,
                        variant.pos,
                        variant.ref,
                        variant.alt,
                        variant.vcf_id,
                        variant.annotation['vep_group'],
                        g1k_freq,
                        g1k_popmax_freq,
                        exac_freq,
                        exac_popmax_freq,
                        merck_wgs_3793_freq,
                        genotype_str,
                        genotype.num_alt,
                        genotype.ab,
                        genotype.extras["ad"],
                        genotype.extras["dp"],
                        genotype.gq,
                        genotype.extras["pl"],
                        genotype.filter,
                    ]
                    writer.writerow([str(value) for value in row])
                    individual_variants_f.flush()
    finally:
        individual_variants_f.close()