def handle(self, *args, **options):
    """Load CADD scores into the annotator store.

    Behaviour depends on ``options``:

    * ``cadd_file`` is set: the file is handed to ``load_from_cadd_file`` and
      no per-variant lookup is done.
    * ``project_id`` is set: variants in that project's collection that lack
      ``annotation.cadd_phred`` are looked up one by one.
    * otherwise: every variant in ``annotator_store.variants`` that lacks
      ``annotation.cadd_phred`` is looked up.

    For the per-variant modes, each score returned by ``fetch`` is written to
    ``annotator_store.variants``.
    """
    annotator_store = mall.get_annotator().get_annotator_datastore()
    missing_cadd_query = {'annotation.cadd_phred': {'$exists': False}}

    # None means "nothing to iterate": the cadd_file branch loads everything
    # through load_from_cadd_file. Previously this branch left
    # variant_collection unbound and the loop below raised.
    variant_collection = None
    if options['cadd_file']:
        print("Loading " + options['cadd_file'])
        load_from_cadd_file(options['cadd_file'])
    elif options['project_id']:
        print("Loading " + options['project_id'])
        project = Project.objects.get(project_id=options['project_id'])
        project_collection = get_project_datastore(project)._get_project_collection(
            options['project_id'])
        variant_collection = project_collection.find(missing_cadd_query)
    else:
        variant_collection = annotator_store.variants.find(missing_cadd_query)

    if variant_collection is not None:
        for r in tqdm.tqdm(variant_collection, unit=' variants'):
            chrom, pos = genomeloc.get_chr_pos(r['xpos'])
            cadd_phred = fetch(chrom, pos, r['ref'], r['alt'])
            if cadd_phred is None:
                continue

            # NOTE(review): the update always targets annotator_store.variants,
            # even when the cursor came from a project collection - confirm.
            result = annotator_store.variants.update(
                {'xpos': r['xpos'], 'ref': r['ref'], 'alt': r['alt']},
                {'$set': {'annotation.cadd_phred': cadd_phred}},
                upsert=False)
            assert result['updatedExisting']

    print("Done")
def get_x_linked_variants(datastore, reference, family, variant_filter=None, quality_filter=None):
    """
    Yield the variants of a family that pass the x-linked genotype filter
    and whose chromosome (from genomeloc.get_chr_pos) is 'chrX'.

    `reference` is accepted but not used by this function.
    """
    candidates = get_variants(
        datastore,
        family,
        genotype_filter=inheritance.get_x_linked_filter(family),
        variant_filter=variant_filter,
        quality_filter=quality_filter,
        indivs_to_consider=family.indiv_id_list(),
    )
    for candidate in candidates:
        chrom = genomeloc.get_chr_pos(candidate.xpos)[0]
        if chrom != 'chrX':
            continue
        yield candidate
def handle(self, *args, **options):
    """Spot-check that project VCF variants are present in the annotator cache.

    Positional ``args`` are project ids. When none are given, every project
    from ``Project.objects.all()`` is checked, in reverse order.

    For each project the variants of its VCF files are looked up through
    ``get_mall(project_id).annotator.get_annotation``; a ``ValueError`` from
    that lookup counts the variant as missing. Once
    ``number_of_variants_to_check`` variants (option, default 20000) have been
    seen, the scan of the current VCF stops and, if more than 10 variants were
    missing, an error is printed together with up to 30 example variants.
    """
    number_of_variants_to_check = int(options.get("number_of_variants_to_check") or 20000)

    if not args:
        args = [p.project_id for p in Project.objects.all()]
        args.reverse()

    for project_id in args:
        try:
            project = Project.objects.get(project_id=project_id)
        except Project.DoesNotExist:
            # Only a missing project is skipped; other errors propagate
            # instead of being reported as "not found".
            print("ERROR: Project not found. Skipping..")
            continue

        all_counter = 0
        not_found_counter = 0
        not_found_variants = []
        for vcf_file in project.get_all_vcf_files():
            path = vcf_file.file_path
            # Fall back to the gzipped file when the plain .vcf is absent.
            if not os.path.isfile(path) and path.endswith(".vcf"):
                path = path + ".gz"

            f = gzip.open(path) if path.endswith(".gz") else open(path)
            try:
                for variant in vcf_stuff.iterate_vcf(f):
                    all_counter += 1
                    try:
                        get_mall(project_id).annotator.get_annotation(
                            variant.xpos, variant.ref, variant.alt)
                    except ValueError:
                        not_found_counter += 1
                        # Keep only a bounded number of examples for the report.
                        if len(not_found_variants) < 30:
                            chrom, pos = genomeloc.get_chr_pos(variant.xpos)
                            chrom = chrom.replace("chr", "")
                            not_found_variants.append(
                                "%s-%s-%s-%s" % (chrom, pos, variant.ref, variant.alt))

                    if all_counter >= number_of_variants_to_check:
                        if not_found_counter > 10:
                            # Scale to a real percentage; the raw fraction used
                            # to be printed with a '%' sign.
                            percent_missing = 100.0 * not_found_counter / all_counter
                            print("---- ERROR: (%0.2f%%) %s / %s variants not found. Project %s should be reloaded. Examples: " % (
                                percent_missing, not_found_counter, all_counter, project_id))
                            for v in not_found_variants:
                                print("http://exac.broadinstitute.org/variant/" + v)
                        break
            finally:
                # Close the handle even if parsing or the lookup raises.
                f.close()
def calculate_combine_mendelian_families(family_group, search_spec, user=None):
    """
    Compute the combined mendelian search results for a family group.

    Intended to run after the cache has been checked - all computation
    happens here.

    Returns a list of gene dicts (gene_info, gene_id, gene_name, chr, start,
    end, family_id_list). Genes the reference does not know are skipped;
    chr/start/end are None when the gene bounds lookup raises KeyError.
    """
    xfamilygroup = family_group.xfamilygroup()
    families_by_gene = get_families_by_gene(
        get_mall(family_group.project),
        xfamilygroup,
        search_spec.inheritance_mode,
        search_spec.variant_filter,
        search_spec.quality_filter,
        user=user,
    )

    genes = []
    for gene_id, family_id_list in families_by_gene:
        xgene = get_reference().get_gene(gene_id)
        if xgene is None:
            continue

        try:
            start_pos, end_pos = get_reference().get_gene_bounds(gene_id)
            chrom, start = genomeloc.get_chr_pos(start_pos)
            end = genomeloc.get_chr_pos(end_pos)[1]
        except KeyError:
            chrom = start = end = None

        genes.append({
            'gene_info': xgene,
            'gene_id': gene_id,
            'gene_name': xgene['symbol'],
            'chr': chrom,
            'start': start,
            'end': end,
            'family_id_list': family_id_list,
        })

    return genes
def get_recessive_individuals(gene_variation, indiv_id_list):
    """
    Return the set of individual ids that show *any* recessive pattern:
    homozygous recessive or compound het, plus x-linked recessive when the
    start of the gene bounds is on chrX.
    """
    groups = [
        get_homozygous_recessive_individuals(gene_variation, indiv_id_list),
        get_compound_het_individuals(gene_variation, indiv_id_list),
    ]

    gene_start = gene_variation.get_gene_bounds()[0]
    gene_chrom = genomeloc.get_chr_pos(gene_start)[0]
    if gene_chrom == 'chrX':
        groups.append(get_x_linked_recessive_individuals(gene_variation, indiv_id_list))

    recessive_ids = set()
    for group in groups:
        recessive_ids.update(group)
    return recessive_ids
def write_map(filename, snp_panel):
    """
    Write a MAP file to filename, one tab-separated line per SNP in snp_panel.

    Each line holds four fields: the chromosome name with its first three
    characters dropped (presumably the 'chr' prefix), the raw snp['pos']
    value, a constant '0', and the position returned by
    genomeloc.get_chr_pos.

    Note that the current implementation does not consider genetic distance
    (the third field is always '0'); may want to fix that.
    """
    # 'with' guarantees the file is closed even if a lookup or write raises.
    with open(filename, 'w') as map_file:
        for snp in snp_panel:
            chrom, pos = genomeloc.get_chr_pos(snp['pos'])
            fields = [chrom[3:], str(snp['pos']), '0', str(pos)]
            map_file.write('\t'.join(fields) + '\n')
def write_sites_vcf(f, sites_list):
    """
    Write a sites-only VCF to the writable object f.

    Args:
        f: object with a write() method that receives the VCF text
        sites_list: iterator of (xpos, ref, alt) tuples

    Returns:
        True once all sites have been written
    """
    column_names = ("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO")
    f.write("##fileformat=VCFv4.0\n")
    f.write("\t".join(column_names) + "\n")

    for xpos, ref, alt in ((s[0], s[1], s[2]) for s in sites_list):
        chrom, pos = genomeloc.get_chr_pos(xpos)
        # chrom[3:] drops the first three characters of the chromosome name
        record = (chrom[3:], str(pos), '.', ref, alt, '.', '.', '.')
        f.write('\t'.join(record) + '\n')

    return True
def handle(self, *args, **options):
    """Load CADD scores for variants that do not have one yet.

    * With ``options['cadd_file']``: delegate to ``load_from_cadd_file`` and
      skip the per-variant pass.
    * With ``options['project_id']``: walk that project's variants that lack
      ``annotation.cadd_phred``.
    * Otherwise: walk all variants in ``annotator_store.variants`` that lack
      ``annotation.cadd_phred``.

    In the per-variant modes, each score returned by ``fetch`` is stored on
    the matching document in ``annotator_store.variants``.
    """
    annotator_store = mall.get_annotator().get_annotator_datastore()
    without_cadd = {'annotation.cadd_phred': {'$exists': False}}

    if options['cadd_file']:
        print("Loading " + options['cadd_file'])
        load_from_cadd_file(options['cadd_file'])
        # Nothing to iterate in this mode. Before this fix the loop below ran
        # with variant_collection unbound and raised.
        variant_collection = ()
    elif options['project_id']:
        print("Loading " + options['project_id'])
        project = Project.objects.get(project_id=options['project_id'])
        variant_collection = get_project_datastore(
            project)._get_project_collection(options['project_id']).find(without_cadd)
    else:
        variant_collection = annotator_store.variants.find(without_cadd)

    if options['cadd_file']:
        # Avoid starting an empty progress bar for the file-based mode.
        print("Done")
        return

    for r in tqdm.tqdm(variant_collection, unit=' variants'):
        chrom, pos = genomeloc.get_chr_pos(r['xpos'])
        cadd_phred = fetch(chrom, pos, r['ref'], r['alt'])
        if cadd_phred is not None:
            selector = {'xpos': r['xpos'], 'ref': r['ref'], 'alt': r['alt']}
            result = annotator_store.variants.update(
                selector,
                {'$set': {'annotation.cadd_phred': cadd_phred}},
                upsert=False)
            assert result['updatedExisting']

    print("Done")
def calculate_cohort_gene_search(cohort, search_spec):
    """
    Compute the cohort gene search results described by search_spec.

    Intended to run after the cache has been checked - all computation
    happens here.

    Returns a list of gene dicts. Genes with fewer than two hits, or that the
    reference does not know, are left out; chr/start/end are None when the
    gene bounds lookup raises KeyError.
    """
    xcohort = cohort.xcohort()
    cohort_size = len(xcohort.individuals)
    indiv_id_list = xcohort.indiv_id_list()

    gene_hits = cohort_get_genes_with_inheritance(
        get_datastore(cohort.project.project_id),
        get_reference(),
        xcohort,
        search_spec.inheritance_mode,
        search_spec.variant_filter,
        search_spec.genotype_quality_filter,
    )

    genes = []
    for gene_id, indivs_with_inheritance, gene_variation in gene_hits:
        num_hits = len(indivs_with_inheritance)
        # a single hit is not reported
        if num_hits < 2:
            continue

        try:
            start_pos, end_pos = get_reference().get_gene_bounds(gene_id)
            chrom, start = genomeloc.get_chr_pos(start_pos)
            end = genomeloc.get_chr_pos(end_pos)[1]
        except KeyError:
            chrom = start = end = None

        # project-specific control cohort wins over the site-wide default
        if cohort.project.default_control_cohort:
            control_cohort = cohort.project.default_control_cohort
        else:
            control_cohort = settings.DEFAULT_CONTROL_COHORT

        control_comparison = population_controls.control_comparison(
            control_cohort,
            gene_id,
            num_hits,
            cohort_size,
            search_spec.inheritance_mode,
            search_spec.variant_filter,
            search_spec.genotype_quality_filter)

        xgene = get_reference().get_gene(gene_id)
        if xgene is None:
            continue

        sys.stderr.write(
            " cohort_gene_search - found gene: %s, gene_id: %s \n" % (xgene['symbol'], gene_id))

        genes.append({
            'gene_info': xgene,
            'gene_id': gene_id,
            'gene_name': xgene['symbol'],
            'num_hits': num_hits,
            'num_unique_variants': len(
                gene_variation.get_relevant_variants_for_indiv_ids(indiv_id_list)),
            'chr': chrom,
            'start': start,
            'end': end,
            'control_comparison': control_comparison,
        })

    sys.stderr.write(
        " cohort_gene_search - finished. (cohort_genes_with_inheritance iterator)")
    return genes
def generate_rows(project, errors):
    """Build summary row dicts for the families of ``project``.

    Args:
        project: project whose families, individuals, samples and variant
            tags are queried.
        errors: list that human-readable problem descriptions are appended to
            (mutated in place).

    Returns:
        A list of row dicts. It is empty when the project has no loaded
        "VARIANTS" dataset. A family without tier 1 / tier 2 / known-gene
        variant tags contributes one row; otherwise one row is appended per
        gene that has such tags (plus one base row for every tag that could
        not be mapped to a gene).
    """
    rows = []
    loaded_datasets = list(
        Dataset.objects.filter(project=project, analysis_type="VARIANTS", is_loaded=True))
    if not loaded_datasets:
        errors.append("No data loaded for project: %s" % project)
        logger.info("No data loaded for project: %s" % project)
        return []

    for d in loaded_datasets:
        print("Loaded time %s: %s" % (d, d.loaded_date))

    #project_variant_tag_filter = Q(family__project=project) & (
    #    Q(variant_tag_type__name__icontains="tier 1") |
    #    Q(variant_tag_type__name__icontains="tier 2") |
    #    Q(variant_tag_type__name__icontains="known gene for phenotype"))
    #project_variant_tags = list(VariantTag.objects.select_related('variant_tag_type').filter(project_variant_tag_filter))
    #project_variant_tag_names = [vt.variant_tag_type.name.lower() for vt in project_variant_tags]
    #project_has_tier1 = any([vt_name.startswith("tier 1") for vt_name in project_variant_tag_names])
    #project_has_tier2 = any([vt_name.startswith("tier 2") for vt_name in project_variant_tag_names])
    #project_has_known_gene_for_phenotype = any([(vt_name == "known gene for phenotype") for vt_name in project_variant_tag_names])

    # The sequencing approach is derived from substrings of the project id:
    #   "external" or "reprocessed" -> REAN
    #   "rna"                       -> RNA
    #   "wgs" or "genome"           -> WGS
    #   anything else               -> WES
    lower_case_project_id = project.deprecated_project_id.lower()
    if "external" in lower_case_project_id or "reprocessed" in lower_case_project_id:
        sequencing_approach = "REAN"
    elif "rna" in lower_case_project_id:
        sequencing_approach = "RNA"
    elif "wgs" in lower_case_project_id or "genome" in lower_case_project_id:
        sequencing_approach = "WGS"
    else:
        sequencing_approach = "WES"

    now = timezone.now()
    for family in Family.objects.filter(project=project):
        individuals = list(Individual.objects.filter(family=family))
        samples = list(Sample.objects.filter(individual__family=family))

        # Parsed phenotips JSON for every individual that has any
        phenotips_individual_data_records = [
            json.loads(i.phenotips_data) for i in individuals
            if i.phenotips_data
        ]
        phenotips_individual_features = [
            phenotips_data.get("features", [])
            for phenotips_data in phenotips_individual_data_records
        ]
        phenotips_individual_mim_disorders = [
            phenotips_data.get("disorders", [])
            for phenotips_data in phenotips_individual_data_records
        ]
        phenotips_individual_expected_inheritance_model = [
            inheritance_mode["label"]
            for phenotips_data in phenotips_individual_data_records
            for inheritance_mode in phenotips_data.get(
                "global_mode_of_inheritance", [])
        ]

        omim_ids = [
            disorder.get("id")
            for disorders in phenotips_individual_mim_disorders
            for disorder in disorders if "id" in disorder
        ]
        # Only the first OMIM id is used, with its "MIM:" prefix removed
        omim_number_initial = omim_ids[0].replace("MIM:", "") if omim_ids else ""

        if omim_number_initial:
            # Replace the OMIM number by its phenotypic series id when the
            # OMIM entry page links to one. Results (including "no series
            # found") are memoized in PHENOTYPIC_SERIES_CACHE.
            if omim_number_initial in PHENOTYPIC_SERIES_CACHE:
                omim_number_initial = PHENOTYPIC_SERIES_CACHE[
                    omim_number_initial]
            else:
                try:
                    response = requests.get(
                        'https://www.omim.org/entry/' + omim_number_initial,
                        headers={
                            'Host': 'www.omim.org',
                            'Connection': 'keep-alive',
                            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
                            'Upgrade-Insecure-Requests': '1',
                            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                            'Accept-Encoding': 'gzip, deflate, br',
                            'Accept-Language': 'en-US,en;q=0.9,ru;q=0.8',
                        })
                    if not response.ok:
                        raise ValueError("omim request failed: %s %s" %
                                         (response, response.reason))
                    omim_page_html = response.content

                    # The page is expected to contain a link such as:
                    # <a href="/phenotypicSeries/PS613280" class="btn btn-info" role="button"> Phenotypic Series </a>
                    match = re.search("/phenotypicSeries/([a-zA-Z0-9]+)",
                                      omim_page_html)
                    if not match:
                        logger.info(
                            "No phenotypic series found for OMIM initial # %s"
                            % omim_number_initial)
                        PHENOTYPIC_SERIES_CACHE[
                            omim_number_initial] = omim_number_initial
                    else:
                        phenotypic_series_id = match.group(1)
                        logger.info(
                            "Will replace OMIM initial # %s with phenotypic series %s"
                            % (omim_number_initial, phenotypic_series_id))
                        PHENOTYPIC_SERIES_CACHE[
                            omim_number_initial] = phenotypic_series_id
                    omim_number_initial = PHENOTYPIC_SERIES_CACHE[
                        omim_number_initial]
                except Exception as e:
                    # don't change omim_number_initial
                    logger.info(
                        "Unable to look up phenotypic series for OMIM initial number: %s. %s"
                        % (omim_number_initial, e))

        # True if any individual of the family has mme_submitted_data
        submitted_to_mme = any([
            individual.mme_submitted_data for individual in individuals
            if individual.mme_submitted_data
        ])

        #samples
        #print([s for s in samples])
        #print([(dataset, dataset.is_loaded, dataset.loaded_date) for sample in samples for dataset in sample.dataset_set.all()])
        datesets_loaded_date_for_family = [
            dataset.loaded_date for sample in samples
            for dataset in sample.dataset_set.filter(analysis_type="VARIANTS")
            if dataset.loaded_date is not None
        ]
        if not datesets_loaded_date_for_family:
            errors.append("No data loaded for family: %s. Skipping..." % family)
            continue

        # t0 is the earliest load date among the family's VARIANTS datasets
        t0 = min(datesets_loaded_date_for_family)
        t0_diff = rdelta.relativedelta(now, t0)
        t0_months_since_t0 = t0_diff.years * 12 + t0_diff.months

        analysis_complete_status = "first_pass_in_progress"
        if t0_months_since_t0 >= 12:  # or (project_has_tier1 or project_has_tier2 or project_has_known_gene_for_phenotype):
            analysis_complete_status = "complete"

        # Base row for the family; gene-specific values are filled in further
        # down when matching variant tags exist.
        row = {
            "extras_pedigree_url": family.pedigree_image.url if family.pedigree_image else "",
            "project_id": project.deprecated_project_id,
            "project_name": project.name,
            "t0": t0,
            "months_since_t0": t0_months_since_t0,
            "family_id": family.family_id,
            "coded_phenotype": family.coded_phenotype or "",  # "Coded Phenotype" field - Ben will add a field that only staff can edit. Will be on the family page, above short description.
            "sequencing_approach": sequencing_approach,  # WES, WGS, RNA, REAN, GENO - Ben will do this using a script based off project name - may need to backfill some
            "sample_source": "CMG",  # CMG, NHLBI-X01, NHLBI-nonX01, NEI - Most are CMG so default to them all being CMG.
            "n_kindreds": "1",
            "actual_inheritance_model": "",
            "expected_inheritance_model": "".join(set(phenotips_individual_expected_inheritance_model)) if len(set(phenotips_individual_expected_inheritance_model)) == 1 else "multiple",  # example: 20161205_044436_852786_MAN_0851_05_1 - AR-homozygote, AR, AD, de novo, X-linked, UPD, other, multiple - phenotips - Global mode of inheritance:
            "omim_number_initial": omim_number_initial or "NA",
            "omim_number_post_discovery": family.post_discovery_omim_number or "NA",
            "collaborator": project.name,  # TODO use email addresses?
            "analysis_summary": family.analysis_summary.strip('" \n'),
            "phenotype_class": "Known" if omim_number_initial else "New",  # "disorders" UE, NEW, MULTI, EXPAN, KNOWN - If there is a MIM number enter "Known" - otherwise put "New" and then we will need to edit manually for the other possible values
            "solved": "N",  # TIER 1 GENE (or known gene for phenotype also record as TIER 1 GENE), TIER 2 GENE, N - Pull from seqr using tags
            "submitted_to_mme": "Y" if submitted_to_mme else "NS",
            "pubmed_ids": "",
            "posted_publicly": "NS",
            "gene_name": "NS",
            "gene_count": "NA",
            "novel_mendelian_gene": "NS",
            "analysis_complete_status": analysis_complete_status,  # If known gene for phenotype, tier 1 or tier 2 tag is used on any variant in project, or 1 year past t0 = complete. If less than a year and none of the tags above = first pass in progress
            "genome_wide_linkage": "NS",
            "p_value": "NS",
            "n_kindreds_overlapping_sv_similar_phenotype": "NS",
            "n_unrelated_kindreds_with_causal_variants_in_gene": "NS",
            "biochemical_function": "NS",
            "protein_interaction": "NS",
            "expression": "NS",
            "patient_cells": "NS",
            "non_patient_cell_model": "NS",
            "animal_model": "NS",
            "non_human_cell_culture_model": "NS",
            "rescue": "NS",
        }

        #for hpo_category_id, hpo_category_name in HPO_CATEGORY_NAMES.items():
        #    row[hpo_category_name.lower().replace(" ", "_").replace("/", "_")] = "N"

        # Every HPO category column starts as "N" and is flipped to "Y" below
        # when an observed feature falls into it.
        for hpo_category_name in [
                "connective_tissue",
                "voice",
                "nervous_system",
                "breast",
                "eye_defects",
                "prenatal_development_or_birth",
                "neoplasm",
                "endocrine_system",
                "head_or_neck",
                "immune_system",
                "growth",
                "limbs",
                "thoracic_cavity",
                "blood",
                "musculature",
                "cardiovascular_system",
                "abdomen",
                "skeletal_system",
                "respiratory",
                "ear_defects",
                "metabolism_homeostasis",
                "genitourinary_system",
                "integument",
        ]:
            row[hpo_category_name] = "N"

        category_not_set_on_some_features = False
        for features_list in phenotips_individual_features:
            for feature in features_list:
                if "category" not in feature:
                    # Remember the problem once per family (reported below)
                    category_not_set_on_some_features = True
                    continue

                if feature["observed"].lower() == "yes":
                    hpo_category_id = feature["category"]
                    hpo_category_name = HPO_CATEGORY_NAMES[hpo_category_id]
                    # Column key: lower-case name with spaces and slashes
                    # replaced by underscores
                    key = hpo_category_name.lower().replace(" ", "_").replace(
                        "/", "_")
                    row[key] = "Y"
                elif feature["observed"].lower() == "no":
                    continue
                else:
                    raise ValueError("Unexpected value for 'observed' in %s" %
                                     (feature, ))

        if category_not_set_on_some_features:
            errors.append(
                "HPO category field not set for some HPO terms in %s" % family)

        # Only tags whose type name contains "tier 1", "tier 2" or
        # "known gene for phenotype" are considered
        variant_tag_filter = Q(family=family) & (
            Q(variant_tag_type__name__icontains="tier 1")
            | Q(variant_tag_type__name__icontains="tier 2")
            | Q(variant_tag_type__name__icontains="known gene for phenotype"))
        variant_tags = list(
            VariantTag.objects.select_related('variant_tag_type').filter(
                variant_tag_filter))
        if not variant_tags:
            rows.append(row)
            continue

        # Group the tags by gene. A tag that cannot be mapped to a gene
        # records an error and appends the base row as-is.
        gene_ids_to_variant_tags = defaultdict(list)
        for vt in variant_tags:
            if not vt.saved_variant_json:
                errors.append("%s - variant annotation not found" % vt)
                rows.append(row)
                continue

            # The JSON string on the tag object is replaced by its parsed form
            vt.saved_variant_json = json.loads(vt.saved_variant_json)

            if "coding_gene_ids" not in vt.saved_variant_json[
                    "annotation"] and "gene_ids" not in vt.saved_variant_json[
                        "annotation"]:
                errors.append("%s - no gene_ids" % vt)
                rows.append(row)
                continue

            # Prefer coding_gene_ids; fall back to gene_ids when empty
            gene_ids = vt.saved_variant_json["annotation"].get(
                "coding_gene_ids", [])
            if not gene_ids:
                gene_ids = vt.saved_variant_json["annotation"].get(
                    "gene_ids", [])

            if not gene_ids:
                errors.append("%s - gene_ids not specified" % vt)
                rows.append(row)
                continue

            # get the shortest gene_id
            gene_id = list(sorted(gene_ids, key=lambda gene_id: len(gene_id)))[0]

            gene_ids_to_variant_tags[gene_id].append(vt)

        # One output row per gene that has tagged variants
        for gene_id, variant_tags in gene_ids_to_variant_tags.items():
            gene_symbol = get_reference().get_gene_symbol(gene_id)

            lower_case_variant_tag_type_names = [
                vt.variant_tag_type.name.lower() for vt in variant_tags
            ]
            has_tier1 = any(
                name.startswith("tier 1")
                for name in lower_case_variant_tag_type_names)
            has_tier2 = any(
                name.startswith("tier 2")
                for name in lower_case_variant_tag_type_names)
            has_known_gene_for_phenotype = any(
                name == "known gene for phenotype"
                for name in lower_case_variant_tag_type_names)

            has_tier1_phenotype_expansion_or_novel_mode_of_inheritance = any(
                name.startswith("tier 1") and
                ('expansion' in name.lower() or 'novel mode' in name.lower())
                for name in lower_case_variant_tag_type_names)
            has_tier_1_or_2_phenotype_not_delineated = any(
                (name.startswith("tier 1") or name.startswith("tier 2")) and (
                    'not delineated' in name.lower())
                for name in lower_case_variant_tag_type_names)

            analysis_complete_status = row["analysis_complete_status"]
            if has_tier1 or has_tier2 or has_known_gene_for_phenotype:
                analysis_complete_status = "complete"

            # Strings of the form "<chr>-<pos>-<ref>-<alt> <gene symbol> <tag type>"
            variant_tag_list = [
                ("%s %s %s" % ("-".join(
                    map(
                        str,
                        list(genomeloc.get_chr_pos(vt.xpos_start)) +
                        [vt.ref, vt.alt])), gene_symbol,
                               vt.variant_tag_type.name.lower()))
                for vt in variant_tags
            ]

            actual_inheritance_models = set()
            potential_compound_hets = defaultdict(
                int)  # gene_id to compound_hets counter
            for vt in variant_tags:
                # Genotype tallies for this variant, split by affected status
                affected_indivs_with_hom_alt_variants = set()
                affected_indivs_with_het_variants = set()
                affected_total_individuals = 0
                unaffected_indivs_with_hom_alt_variants = set()
                unaffected_indivs_with_het_variants = set()
                unaffected_total_individuals = 0
                is_x_linked = False
                if vt.saved_variant_json["genotypes"]:
                    chrom, pos = genomeloc.get_chr_pos(vt.xpos_start)
                    is_x_linked = "X" in chrom
                    for indiv_id, genotype in json.loads(
                            vt.saved_variant_json["genotypes"]).items():
                        try:
                            i = Individual.objects.get(family=family,
                                                       individual_id=indiv_id)
                        except ObjectDoesNotExist as e:
                            logger.warn(
                                "WARNING: Couldn't find individual: %s, %s" %
                                (family, indiv_id))
                            continue

                        if i.affected == "A":
                            affected_total_individuals += 1
                        elif i.affected == "N":
                            unaffected_total_individuals += 1

                        if genotype["num_alt"] == 2 and i.affected == "A":
                            affected_indivs_with_hom_alt_variants.add(indiv_id)
                        elif genotype["num_alt"] == 1 and i.affected == "A":
                            affected_indivs_with_het_variants.add(indiv_id)
                        elif genotype["num_alt"] == 2 and i.affected == "N":
                            unaffected_indivs_with_hom_alt_variants.add(
                                indiv_id)
                        elif genotype["num_alt"] == 1 and i.affected == "N":
                            unaffected_indivs_with_het_variants.add(indiv_id)

                # AR-homozygote, AR-comphet, AR, AD, de novo, X-linked, UPD, other, multiple
                # Hom-alt only in affected individuals: X-linked or AR-homozygote
                if not unaffected_indivs_with_hom_alt_variants and affected_indivs_with_hom_alt_variants:
                    if "AR-comphet" not in actual_inheritance_models:
                        if is_x_linked:
                            actual_inheritance_models.add("X-linked")
                        else:
                            actual_inheritance_models.add("AR-homozygote")

                # Het only in affected individuals: "de novo" when there are
                # unaffected individuals to compare against, else "AD"
                if not unaffected_indivs_with_hom_alt_variants and not unaffected_indivs_with_het_variants and affected_indivs_with_het_variants:
                    if "AR-comphet" not in actual_inheritance_models:
                        if unaffected_total_individuals > 0:
                            actual_inheritance_models.add("de novo")
                        else:
                            actual_inheritance_models.add("AD")

                # Count variants that could be one half of a compound het; two
                # or more in the gene override every other model with AR-comphet
                if not unaffected_indivs_with_hom_alt_variants and (
                        unaffected_total_individuals < 2
                        or unaffected_indivs_with_het_variants
                ) and affected_indivs_with_het_variants and not affected_indivs_with_hom_alt_variants:
                    potential_compound_hets[gene_id] += 1
                    print("%s incremented compound het for %s to %s" %
                          (vt, gene_id, potential_compound_hets[gene_id]))
                    if potential_compound_hets[gene_id] >= 2:
                        actual_inheritance_models.clear()
                        actual_inheritance_models.add("AR-comphet")

            # NOTE(review): this first value is overwritten by the join just
            # below (the "+ actual_inheritance_model" suffix is commented out),
            # so the counts string is currently unused.
            actual_inheritance_model = " (%d aff hom, %d aff het, %d unaff hom, %d unaff het) " % (
                #affected_total_individuals,
                #unaffected_total_individuals,
                len(affected_indivs_with_hom_alt_variants),
                len(affected_indivs_with_het_variants),
                len(unaffected_indivs_with_hom_alt_variants),
                len(unaffected_indivs_with_het_variants),
            )

            actual_inheritance_model = ", ".join(
                actual_inheritance_models)  #+ actual_inheritance_model
            NA_or_KPG_or_NS = "NA" if has_tier1 or has_tier2 else (
                "KPG" if has_known_gene_for_phenotype else "NS")
            KPG_or_blank_or_NS = "KPG" if has_known_gene_for_phenotype else (
                "" if has_tier1 or has_tier2 else "NS")

            # "disorders" UE, NEW, MULTI, EXPAN, KNOWN - If there is a MIM number enter "Known" - otherwise put "New" and then we will need to edit manually for the other possible values
            phenotype_class = "EXPAN" if has_tier1_phenotype_expansion_or_novel_mode_of_inheritance else (
                "UE" if has_tier_1_or_2_phenotype_not_delineated else
                ("Known" if omim_number_initial else "New"))

            # create a copy of the row dict
            # (each gene after the first copies the previous gene's row)
            row = dict(row)

            row.update({
                "extras_variant_tag_list": variant_tag_list,
                "extras_num_variant_tags": len(variant_tags),
                "gene_name": str(gene_symbol) if gene_symbol and (has_tier1 or has_tier2 or has_known_gene_for_phenotype) else "NS",
                "gene_count": len(gene_ids_to_variant_tags.keys()) if len(gene_ids_to_variant_tags.keys()) > 1 else "NA",
                "novel_mendelian_gene": "Y" if any("novel gene" in name for name in lower_case_variant_tag_type_names) else ("N" if has_tier1 or has_tier2 or has_known_gene_for_phenotype else "NS"),
                "solved": ("TIER 1 GENE" if (has_tier1 or has_known_gene_for_phenotype) else ("TIER 2 GENE" if has_tier2 else "N")),
                "posted_publicly": ("" if has_tier1 or has_tier2 or has_known_gene_for_phenotype else "NS"),
                "submitted_to_mme": "Y" if submitted_to_mme else ("TBD" if has_tier1 or has_tier2 else ("KPG" if has_known_gene_for_phenotype else "NS")),
                "actual_inheritance_model": actual_inheritance_model,
                "analysis_complete_status": analysis_complete_status,  # If known gene for phenotype, tier 1 or tier 2 tag is used on any variant in project, or 1 year past t0 = complete. If less than a year and none of the tags above = first pass in progress
                "genome_wide_linkage": NA_or_KPG_or_NS,
                "p_value": NA_or_KPG_or_NS,
                "n_kindreds_overlapping_sv_similar_phenotype": NA_or_KPG_or_NS,
                "n_unrelated_kindreds_with_causal_variants_in_gene": "1" if has_tier1 or has_tier2 else ("KPG" if has_known_gene_for_phenotype else "NS"),
                "biochemical_function": KPG_or_blank_or_NS,
                "protein_interaction": KPG_or_blank_or_NS,
                "expression": KPG_or_blank_or_NS,
                "patient_cells": KPG_or_blank_or_NS,
                "non_patient_cell_model": KPG_or_blank_or_NS,
                "animal_model": KPG_or_blank_or_NS,
                "non_human_cell_culture_model": KPG_or_blank_or_NS,
                "rescue": KPG_or_blank_or_NS,
                "phenotype_class": phenotype_class,
            })

            rows.append(row)

    return rows
def calculate_cohort_gene_search(cohort, search_spec):
    """
    Run the cohort gene search defined by search_spec and collect the genes.

    Meant to be called once the cache has been checked - this is where all of
    the computation happens.

    Returns a list with one dict per reported gene. A gene is dropped when it
    has fewer than two hits or when the reference has no record of it; its
    chr/start/end are None if looking up the gene bounds raises KeyError.
    """
    xcohort = cohort.xcohort()
    cohort_size = len(xcohort.individuals)
    indiv_id_list = xcohort.indiv_id_list()

    genes = []
    candidates = cohort_get_genes_with_inheritance(
        get_datastore(cohort.project.project_id),
        get_reference(),
        xcohort,
        search_spec.inheritance_mode,
        search_spec.variant_filter,
        search_spec.quality_filter,
    )
    for gene_id, indivs_with_inheritance, gene_variation in candidates:
        num_hits = len(indivs_with_inheritance)
        if num_hits >= 2:
            try:
                start_pos, end_pos = get_reference().get_gene_bounds(gene_id)
                chrom, start = genomeloc.get_chr_pos(start_pos)
                end = genomeloc.get_chr_pos(end_pos)[1]
            except KeyError:
                chrom, start, end = None, None, None

            # fall back to the configured default when the project has no
            # control cohort of its own
            if cohort.project.default_control_cohort:
                control_cohort = cohort.project.default_control_cohort
            else:
                control_cohort = settings.DEFAULT_CONTROL_COHORT

            control_comparison = population_controls.control_comparison(
                control_cohort,
                gene_id,
                num_hits,
                cohort_size,
                search_spec.inheritance_mode,
                search_spec.variant_filter,
                search_spec.quality_filter
            )

            xgene = get_reference().get_gene(gene_id)
            if xgene is not None:
                message = " cohort_gene_search - found gene: %s, gene_id: %s \n" % (
                    xgene['symbol'], gene_id)
                sys.stderr.write(message)

                gene = dict(
                    gene_info=xgene,
                    gene_id=gene_id,
                    gene_name=xgene['symbol'],
                    num_hits=num_hits,
                    num_unique_variants=len(
                        gene_variation.get_relevant_variants_for_indiv_ids(indiv_id_list)),
                    chr=chrom,
                    start=start,
                    end=end,
                    control_comparison=control_comparison,
                )
                genes.append(gene)

    sys.stderr.write(" cohort_gene_search - finished. (cohort_genes_with_inheritance iterator)")
    return genes
def get_output_row(self, variant, xpos, ref, alt, individual_id, family, all_fields=False, comments="", gene_id=""):
    """Build one output row (list of strings) describing a variant in an individual.

    Args:
        variant: variant object; its ``genotypes``, ``annotation`` and
            ``toJSON()`` are read.
        xpos: position value passed to ``genomeloc.get_chr_pos``.
        ref, alt: reference and alternate alleles.
        individual_id: key into ``variant.genotypes``.
        family: only ``family.family_id`` is used, in a skip message.
        all_fields: when true (and a ClinVar significance was found), the
            ClinVar web page is fetched to read the number of review stars.
        comments: value copied into the last column of the row.
        gene_id: optional gene id; anything after the first "." is dropped.

    Returns:
        ``None`` when the variant is skipped (individual not genotyped,
        genotype quality missing, or no protein-coding transcript when no
        gene_id was given); otherwise the result of ``map(str, [...])`` over
        the 15 column values.

    Raises:
        ValueError: if the individual's genotype has ``num_alt`` of None.
    """
    v = variant
    if individual_id not in v.genotypes:
        print("skipping variant: %s because individual %s not in %s" %
              (str(xpos) + " " + ref + ">" + alt, individual_id,
               family.family_id))
        return None

    gene_id = gene_id.split(
        "."
    )[0] if gene_id else None  # strip off the gene_id suffix (eg. '.3')

    genotype = v.genotypes[individual_id]
    if genotype.gq is None:
        print(
            "skipping variant: %s because this variant is not called in this individual (%s)"
            % (str(xpos) + " " + ref + ">" + alt,
               individual_id))  #, str(genotype)))
        return None

    chrom, pos = genomeloc.get_chr_pos(xpos)
    chrom_without_chr = chrom.replace("chr", "")

    annot = v.annotation
    if gene_id:
        worst_vep_annotation_index = vep_annotations.get_worst_vep_annotation_index(
            annot["vep_annotation"], gene_id=gene_id)
    else:
        # No gene given: restrict to the genes of protein-coding transcripts
        protein_coding_gene_ids = set(a['gene']
                                      for a in annot["vep_annotation"]
                                      if a['biotype'] == 'protein_coding')
        if not protein_coding_gene_ids:
            print(
                "skipping variant %s in this individual (%s) because none of the transcripts are protein coding: %s"
                % (str(xpos) + " " + ref + ">" + alt, individual_id, annot))
            return None

        worst_vep_annotation_index = vep_annotations.get_worst_vep_annotation_index(
            annot["vep_annotation"], gene_id=protein_coding_gene_ids)
        if len(protein_coding_gene_ids) > 1:
            # NOTE(review): selected_gene_id is assigned but not read again
            # within this function.
            selected_gene_id = annot["vep_annotation"][
                worst_vep_annotation_index]['gene']
            print("Selected %s from %s" %
                  (annot["vep_annotation"][worst_vep_annotation_index]
                   ['symbol'],
                   set([
                       a['symbol'] for a in annot["vep_annotation"]
                       if a['gene'] in protein_coding_gene_ids
                   ])))

    # Keys seen on a vep annotation entry:
    # ea_maf, swissprot, existing_variation, pubmed, aa_maf, ccds, high_inf_pos, cdna_position, canonical, tsl, feature_type, intron, trembl, feature, codons, polyphen, clin_sig, motif_pos, protein_position, afr_maf, amino_acids, cds_position, symbol, uniparc, eur_maf, hgnc_id, consequence, sift, exon, biotype, is_nc, gmaf, motif_name, strand, motif_score_change, distance, hgvsp, ensp, allele, symbol_source, amr_maf, somatic, hgvsc, asn_maf, is_nmd, domains, gene
    vep = annot["vep_annotation"][worst_vep_annotation_index]

    # NOTE(review): the index is recomputed here without a gene_id, which
    # replaces the gene-specific choice made above - confirm this is intended.
    worst_vep_annotation_index = vep_annotations.get_worst_vep_annotation_index(
        annot["vep_annotation"])
    vep = annot["vep_annotation"][worst_vep_annotation_index]

    if "symbol" in vep and "consequence" in vep:
        gene_name = vep["symbol"]  # vep["gene"]
        functional_class = vep["consequence"]
    else:
        gene_name = functional_class = ""
        print(
            "ERROR: gene_name and functional_class not found in annot['vep_annotation'][annot['worst_vep_annotation_index']]: %(vep)s"
            % locals())

    if genotype.num_alt is None:
        # Include every genotype of the variant in the error for debugging
        s = "\n\n"
        for i, g in v.genotypes.items():
            s += str(i) + ": " + str(g) + "\n"
        raise ValueError("genotype.num_alt is None: " + str(genotype) + "\n" +
                         str(v.toJSON()) + "\n" + s)

    genotype_str = genotype_map[genotype.num_alt]

    variant_str = "%s:%s %s>%s" % (chrom, pos, ref, alt)
    if "hgvsc" in vep and "hgvsp" in vep:
        #print("hgvs_c and/or hgvs_p WAS found in annot['vep_annotation'][annot['worst_vep_annotation_index']]: %(vep)s" % locals())
        hgvs_c = urllib.unquote(vep["hgvsc"])
        hgvs_p = urllib.unquote(vep["hgvsp"])
    else:
        hgvs_c = hgvs_p = ""
        #print("ERROR: hgvs_c and/or hgvs_p not found in annot['vep_annotation'][annot['worst_vep_annotation_index']]: %(vep)s" % locals())

    rsid = annot["rsid"] or ""
    #rsid = vep["clinvar_rs"]

    exac_global_af, exac_popmax_af, exac_popmax_population = get_exac_af(
        chrom, pos, ref, alt)
    if exac_global_af is None:
        exac_global_af, exac_popmax_af, exac_popmax_population = 0, 0, "[variant not found in ExACv0.3]"
    else:
        # Cross-check against the frequency stored in the annotation and only
        # report (not fail) when they differ by more than 0.01
        exac_global_af_annot = str(annot["freqs"]["exac_v3"])
        if abs(float(exac_global_af) - float(exac_global_af_annot)) > 0.01:
            print(
                "Error annot['freqs']['exac_v3'] (%s) doesn't match %s" %
                (float(exac_global_af), float(exac_global_af_annot)))

    clinvar_clinsig = ""
    clinvar_clnrevstat = ""

    # NOTE(review): clinvar_clinsig_from_dbnsfp is only referenced by the
    # commented-out checks below.
    if "clin_sig" in vep:
        clinvar_clinsig_from_dbnsfp = vep["clin_sig"]
    else:
        clinvar_clinsig_from_dbnsfp = ""
        #print("ERROR: clin_sig not found in annot['vep_annotation'][annot['worst_vep_annotation_index']]: %(vep)s" % locals())

    # ClinVar records at exactly this position with the same REF allele
    clinvar_records = [
        record
        for record in clinvar_vcf_file.fetch(chrom_without_chr, pos, pos)
        if record.POS == pos and record.REF == ref
    ]

    #if clinvar_clinsig_from_dbnsfp or clinvar_records:
    # defensive programming
    #if clinvar_clinsig_from_dbnsfp and not clinvar_records:
    #    raise ValueError("record has dbNSFP clinvar entry but is not in clinvar vcf: %s" % variant_str)
    #if not clinvar_clinsig_from_dbnsfp and clinvar_records:
    #    raise ValueError("record doesn't have a dbNSFP clinvar entry but is in clinvar vcf: %s" % variant_str)
    if clinvar_records:
        #if len(clinvar_records) > 1:
        #    raise ValueError("multiple clinvar records found for variant: %s" % variant_str)
        # When several records match, the last one is used
        clinvar_record = clinvar_records[-1]
        clinvar_allele_indexes = map(int, clinvar_record.INFO["CLNALLE"])
        clinvar_alleles = map(str, [clinvar_record.REF] + clinvar_record.ALT)
        xbrowse_alleles = map(str, [ref] + [alt])
        # Positions within the CLN* value lists whose allele is this
        # variant's ref or alt
        clinvar_value_indexes_to_use = [
            i for i, clinvar_allele_index in enumerate(
                clinvar_allele_indexes)
            if str(clinvar_alleles[clinvar_allele_index]).upper() in
            xbrowse_alleles
        ]
        clnrevstat = clinvar_record.INFO["CLNREVSTAT"]
        clnrevstat = [clnrevstat[i] for i in clinvar_value_indexes_to_use]
        clnsig = clinvar_record.INFO["CLNSIG"]
        clnsig = [clnsig[i] for i in clinvar_value_indexes_to_use]
        # print("Fetched clinvar %s: %s"% (clinvar_record, clinvar_record.INFO))
        if clnsig:
            # Only the first matching entry is used; its "|"-separated codes
            # are translated through clinsig_map and de-duplicated
            clinvar_clinsig_numbers = map(int, clnsig[0].split("|"))
            clinvar_clinsig = "|".join(
                set([
                    clinsig_map[clinvar_clinsig_number][0]
                    for clinvar_clinsig_number in clinvar_clinsig_numbers
                ]))
            clinvar_clnrevstat = "|".join(set(clnrevstat[0].split("|")))

    # get number of stars
    number_of_stars = "[not found]" if all_fields else "[not retrieved to save time]"
    clinvar_url = "http://www.ncbi.nlm.nih.gov/clinvar/?term=" + chrom_without_chr + "[chr]+AND+" + str(
        pos) + "[chrpos37]"
    if clinvar_clinsig and all_fields:
        # Scrape the ClinVar search page for the review-star count
        print("Reading from: " + clinvar_url)
        url_opener = urllib2.build_opener()
        url_opener.addheaders = [(
            'User-agent',
            "Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11"
        )]
        page_contents = url_opener.open(clinvar_url).read()
        match = re.search("(\d) star.? out of maximum of 4 stars",
                          page_contents)
        if match:
            number_of_stars = int(match.group(1))
        else:
            print("No match in page: " + clinvar_url)
            for line in page_contents.split("\n"):
                if "rev_stat_text hide" in line:
                    print(
                        " -- this line was expected to contain number of stars: "
                        + line)

    row = map(str, [
        gene_name, genotype_str, variant_str, functional_class, hgvs_c,
        hgvs_p, rsid, exac_global_af, exac_popmax_af,
        exac_popmax_population, clinvar_clinsig, clinvar_clnrevstat,
        number_of_stars, clinvar_url, comments
    ])
    return row
def add_preannotated_vcf_file(self, vcf_file_path, force=False, start_from_chrom=None, end_with_chrom=None):
    """
    Add the variants in vcf_file_path to annotator
    Convenience wrapper around add_variants_to_annotator

    Every allele parsed from the VCF is upserted into ``self._db.variants``
    (keyed on xpos/ref/alt) with its VEP annotation, population frequencies
    and derived predictor fields. On success the VCF path is recorded in
    ``self._db.vcf_files`` so later calls can skip it.

    Args:
        vcf_file_path: path of the VEP-annotated VCF. Read through tabix when
            a chromosome range is given, otherwise opened directly (with gzip
            when the path ends in '.gz').
        force: when true, load even if the path is already recorded in
            ``self._db.vcf_files``.
        start_from_chrom: optional first chromosome to load ('1'-'22', 'X',
            'Y', with or without a 'chr' prefix).
        end_with_chrom: optional last chromosome to load (inclusive).

    Raises:
        ValueError: if the VCF has no CSQ INFO field, if the CSQ format is
            missing any expected sub-field, or if start_from_chrom /
            end_with_chrom is not in the 1-22/X/Y list (from ``list.index``).
    """
    # Function-scope import: only needed for the periodic progress dump below.
    import pprint

    if not force and self._db.vcf_files.find_one({'vcf_file_path': vcf_file_path}):
        print("VCF %s already loaded into db.variants cache" % vcf_file_path)
        return

    r = vcf.VCFReader(filename=vcf_file_path)
    if "CSQ" not in r.infos:
        raise ValueError("ERROR: CSQ field not found in %s. Was this VCF annotated with VEP?" % vcf_file_path)

    expected_csq_fields = set(
        "Allele|Gene|Feature|Feature_type|Consequence|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|ALLELE_NUM|DISTANCE|STRAND|SYMBOL|SYMBOL_SOURCE|HGNC_ID|BIOTYPE|CANONICAL|TSL|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|SIFT|PolyPhen|EXON|INTRON|DOMAINS|HGVSc|HGVSp|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|LoF_info|LoF_flags|LoF_filter|LoF|Polyphen2_HVAR_pred|CADD_phred|MutationTaster_pred|MetaSVM_pred|SIFT_pred|FATHMM_pred"
        .split("|"))

    # The CSQ header description ends with "Format: A|B|C..." listing the sub-fields.
    actual_csq_fields_string = str(r.infos["CSQ"].desc).split("Format:")[1].strip()
    actual_csq_fields = set(actual_csq_fields_string.split("|"))
    missing_csq_fields = expected_csq_fields - actual_csq_fields
    if missing_csq_fields:
        raise ValueError(
            "ERROR: VEP did not add all expected CSQ fields to the VCF. The VCF's CSQ = %s and is missing these fields: %s"
            % (actual_csq_fields_string, missing_csq_fields))

    # Everything opened here is closed in the finally-block, even if loading fails.
    open_handles = []
    allele_count = 0
    try:
        if start_from_chrom or end_with_chrom:
            if start_from_chrom:
                print("Start chrom: chr%s" % start_from_chrom)
            if end_with_chrom:
                print("End chrom: chr%s" % end_with_chrom)

            chrom_list = list(map(str, range(1, 23))) + ['X', 'Y']
            chrom_list_start_index = 0
            if start_from_chrom:
                chrom_list_start_index = chrom_list.index(start_from_chrom.replace("chr", "").upper())
            chrom_list_end_index = len(chrom_list)
            if end_with_chrom:
                chrom_list_end_index = chrom_list.index(end_with_chrom.replace("chr", "").upper())

            tabix_file = pysam.TabixFile(vcf_file_path)
            open_handles.append(tabix_file)

            # Start with the header lines, then append the records of each requested chromosome.
            vcf_iter = tabix_file.header
            for chrom in chrom_list[chrom_list_start_index:chrom_list_end_index + 1]:
                print("Will load chrom: " + chrom)
                try:
                    vcf_iter = itertools.chain(vcf_iter, tabix_file.fetch(chrom))
                except ValueError as e:
                    # NOTE(review): presumably raised when the chromosome is absent from the
                    # tabix index; such chromosomes are skipped with a warning.
                    print("WARNING: add_preannotated_vcf_file: " + str(e))

            vcf_file_obj = vcf_iter
        else:
            print("Loading pre-annotated VCF file: %s into db.variants cache" % vcf_file_path)
            vcf_file_obj = gzip.open(vcf_file_path) if vcf_file_path.endswith('.gz') else open(vcf_file_path)
            open_handles.append(vcf_file_obj)

        for variant, vep_annotation in vep_annotations.parse_vep_annotations_from_vcf(vcf_file_obj):
            variant_t = variant.unique_tuple()  # indexed below as (xpos, ref, alt)
            allele_count += 1

            annotation = {
                'vep_annotation': vep_annotation,
                'freqs': self._population_frequency_store.get_frequencies(
                    variant_t[0], variant_t[1], variant_t[2]),
            }

            # Expected to populate annotation["worst_vep_annotation_index"], which is read next.
            add_convenience_annotations(annotation)

            worst_annotation = vep_annotation[annotation["worst_vep_annotation_index"]]
            annotation.update(get_predictors(worst_annotation))

            # Progress indicator: dump every 10000th variant tuple.
            if allele_count % 10000 == 0:
                pprint.pprint(variant_t)

            self._db.variants.update(
                {'xpos': variant_t[0], 'ref': variant_t[1], 'alt': variant_t[2]},
                {'$set': {'annotation': annotation}},
                upsert=True)
    finally:
        for handle in open_handles:
            handle.close()

    print("Finished parsing %s alleles from %s" % (allele_count, vcf_file_path))

    # Record the file as loaded so that later non-forced calls return early.
    self._db.vcf_files.update(
        {'vcf_file_path': vcf_file_path},
        {'vcf_file_path': vcf_file_path, 'date_added': datetime.datetime.utcnow()},
        upsert=True)
def get_output_row(self, variant, xpos, ref, alt, individual_id, family, all_fields=False, comments="", gene_id=""):
    """Build one report row for a variant as seen in one individual.

    Args:
        variant: variant object exposing ``genotypes``, ``annotation`` and ``toJSON()``.
        xpos: variant position, decoded with ``genomeloc.get_chr_pos``.
        ref: reference allele.
        alt: alternate allele.
        individual_id: key into ``variant.genotypes``.
        family: only ``family.family_id`` is used, for the skip message.
        all_fields: when true (and a ClinVar significance was found), the
            ClinVar web page is fetched to scrape the star rating.
        comments: free text copied into the last column.
        gene_id: optional gene id; any '.N' version suffix is stripped. When
            given, the worst VEP annotation is chosen within that gene,
            otherwise among protein-coding genes.

    Returns:
        A list of strings: gene_name, genotype_str, variant_str,
        functional_class, hgvs_c, hgvs_p, rsid, exac_global_af,
        exac_popmax_af, exac_popmax_population, clinvar_clinsig,
        clinvar_clnrevstat, number_of_stars, clinvar_url, comments.
        None (after printing the reason) when the individual has no genotype
        entry, the genotype has no GQ, or no gene_id was given and none of the
        variant's transcripts are protein coding.

    Raises:
        ValueError: if the individual's genotype has ``num_alt`` of None.
    """
    v = variant
    if individual_id not in v.genotypes:
        print("skipping variant: %s because individual %s not in %s" % (str(xpos) + " " + ref + ">" + alt, individual_id, family.family_id))
        return None

    gene_id = gene_id.split(".")[0] if gene_id else None  # strip off the gene_id suffix (eg. '.3')

    genotype = v.genotypes[individual_id]
    if genotype.gq is None:
        print("skipping variant: %s because this variant is not called in this individual (%s)" % (str(xpos) + " " + ref + ">" + alt, individual_id))
        return None

    chrom, pos = genomeloc.get_chr_pos(xpos)
    chrom_without_chr = chrom.replace("chr", "")

    annot = v.annotation
    if gene_id:
        worst_vep_annotation_index = vep_annotations.get_worst_vep_annotation_index(annot["vep_annotation"], gene_id=gene_id)
    else:
        # no gene requested: restrict the choice to protein-coding genes
        protein_coding_gene_ids = set(a['gene'] for a in annot["vep_annotation"] if a['biotype'] == 'protein_coding')
        if not protein_coding_gene_ids:
            print("skipping variant %s in this individual (%s) because none of the transcripts are protein coding: %s" % (str(xpos) + " " + ref + ">" + alt, individual_id, annot))
            return None

        worst_vep_annotation_index = vep_annotations.get_worst_vep_annotation_index(annot["vep_annotation"], gene_id=protein_coding_gene_ids)
        if len(protein_coding_gene_ids) > 1:
            print("Selected %s from %s" % (
                annot["vep_annotation"][worst_vep_annotation_index]['symbol'],
                set([a['symbol'] for a in annot["vep_annotation"] if a['gene'] in protein_coding_gene_ids])))

    # NOTE(review): the previous code recomputed worst_vep_annotation_index here without any
    # gene restriction, which discarded the gene-aware choice made above (so gene_id had no
    # effect on the row and the "Selected ..." message could name a different gene than the
    # one reported). The gene-aware index is now the one that is used.
    #
    # Keys seen in a vep annotation dict: ea_maf, swissprot, existing_variation, pubmed, aa_maf,
    # ccds, high_inf_pos, cdna_position, canonical, tsl, feature_type, intron, trembl, feature,
    # codons, polyphen, clin_sig, motif_pos, protein_position, afr_maf, amino_acids, cds_position,
    # symbol, uniparc, eur_maf, hgnc_id, consequence, sift, exon, biotype, is_nc, gmaf, motif_name,
    # strand, motif_score_change, distance, hgvsp, ensp, allele, symbol_source, amr_maf, somatic,
    # hgvsc, asn_maf, is_nmd, domains, gene
    vep = annot["vep_annotation"][worst_vep_annotation_index]

    if "symbol" in vep and "consequence" in vep:
        gene_name = vep["symbol"]
        functional_class = vep["consequence"]
    else:
        gene_name = functional_class = ""
        print("ERROR: gene_name and functional_class not found in annot['vep_annotation'][annot['worst_vep_annotation_index']]: %(vep)s" % {"vep": vep})

    if genotype.num_alt is None:
        # include every individual's genotype in the error to help debugging
        s = "\n\n" + "".join(str(i) + ": " + str(g) + "\n" for i, g in v.genotypes.items())
        raise ValueError("genotype.num_alt is None: " + str(genotype) + "\n" + str(v.toJSON()) + "\n" + s)

    genotype_str = genotype_map[genotype.num_alt]

    variant_str = "%s:%s %s>%s" % (chrom, pos, ref, alt)

    if "hgvsc" in vep and "hgvsp" in vep:
        hgvs_c = urllib.unquote(vep["hgvsc"])
        hgvs_p = urllib.unquote(vep["hgvsp"])
    else:
        hgvs_c = hgvs_p = ""

    rsid = annot["rsid"] or ""

    exac_global_af, exac_popmax_af, exac_popmax_population = get_exac_af(chrom, pos, ref, alt)
    if exac_global_af is None:
        exac_global_af, exac_popmax_af, exac_popmax_population = 0, 0, "[variant not found in ExACv0.3]"
    else:
        # sanity check: the cached annotation frequency should agree with the ExAC lookup
        exac_global_af_annot = str(annot["freqs"]["exac_v3"])
        if abs(float(exac_global_af) - float(exac_global_af_annot)) > 0.01:
            # first value is the annotation's frequency (as the message labels it), second the ExAC lookup
            print("Error annot['freqs']['exac_v3'] (%s) doesn't match %s" % (float(exac_global_af_annot), float(exac_global_af)))

    clinvar_clinsig = ""
    clinvar_clnrevstat = ""

    clinvar_records = [
        record for record in clinvar_vcf_file.fetch(chrom_without_chr, pos, pos)
        if record.POS == pos and record.REF == ref
    ]

    if clinvar_records:
        # when several records match, the last one is used
        clinvar_record = clinvar_records[-1]
        clinvar_allele_indexes = [int(allele_index) for allele_index in clinvar_record.INFO["CLNALLE"]]
        clinvar_alleles = [str(allele) for allele in [clinvar_record.REF] + clinvar_record.ALT]
        xbrowse_alleles = [str(ref), str(alt)]

        # Positions within the per-allele CLN* lists that refer to this variant's alleles.
        # CLNALLE uses -1 for "no matching allele"; the range check keeps that from being
        # interpreted as a Python negative index (which would silently pick the last ALT).
        clinvar_value_indexes_to_use = [
            i for i, clinvar_allele_index in enumerate(clinvar_allele_indexes)
            if 0 <= clinvar_allele_index < len(clinvar_alleles)
            and clinvar_alleles[clinvar_allele_index].upper() in xbrowse_alleles
        ]

        clnrevstat = clinvar_record.INFO["CLNREVSTAT"]
        clnrevstat = [clnrevstat[i] for i in clinvar_value_indexes_to_use]
        clnsig = clinvar_record.INFO["CLNSIG"]
        clnsig = [clnsig[i] for i in clinvar_value_indexes_to_use]
        if clnsig:
            clinvar_clinsig_numbers = [int(number) for number in clnsig[0].split("|")]
            # sorted() keeps the joined output stable from run to run
            clinvar_clinsig = "|".join(sorted(set(
                clinsig_map[clinvar_clinsig_number][0] for clinvar_clinsig_number in clinvar_clinsig_numbers)))
            clinvar_clnrevstat = "|".join(sorted(set(clnrevstat[0].split("|"))))

    # get number of stars by scraping the ClinVar web page (only when all_fields is set)
    number_of_stars = "[not found]" if all_fields else "[not retrieved to save time]"
    clinvar_url = "http://www.ncbi.nlm.nih.gov/clinvar/?term=" + chrom_without_chr + "[chr]+AND+" + str(pos) + "[chrpos37]"
    if clinvar_clinsig and all_fields:
        print("Reading from: " + clinvar_url)
        url_opener = urllib2.build_opener()
        url_opener.addheaders = [('User-agent', "Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11")]
        response = url_opener.open(clinvar_url)
        try:
            page_contents = response.read()
        finally:
            response.close()

        match = re.search(r"(\d) star.? out of maximum of 4 stars", page_contents)
        if match:
            number_of_stars = int(match.group(1))
        else:
            print("No match in page: " + clinvar_url)
            for line in page_contents.split("\n"):
                if "rev_stat_text hide" in line:
                    print(" -- this line was expected to contain number of stars: " + line)

    row = [str(value) for value in [
        gene_name, genotype_str, variant_str, functional_class, hgvs_c, hgvs_p, rsid,
        exac_global_af, exac_popmax_af, exac_popmax_population,
        clinvar_clinsig, clinvar_clnrevstat, number_of_stars, clinvar_url, comments,
    ]]
    return row
def handle(self, *args, **options):
    """Check that the variants in each project's VCF files are present in the annotator.

    For every project, variants are read from the project's VCF files and looked
    up with ``get_mall(project).annotator.get_annotation``; a ``ValueError`` from
    that lookup counts the variant as missing. Scanning of a project stops once
    ``number_of_variants_to_check`` variants have been examined. If more than 10
    variants were missing, an error is printed with up to 30 example links.

    Args:
        *args: project ids to check. When empty, all projects are checked, in
            reverse of the order returned by ``Project.objects.all()``.
        **options: ``number_of_variants_to_check`` (default 20000) caps the
            number of variants examined per project.
    """
    number_of_variants_to_check = int(options.get("number_of_variants_to_check") or 20000)

    if args:
        # *args arrives as a tuple, which has no .reverse(); use it as given
        project_ids = list(args)
    else:
        project_ids = [p.project_id for p in Project.objects.all()]
        project_ids.reverse()

    for project_id in project_ids:
        try:
            project = Project.objects.get(project_id=project_id)
        except Exception:
            # best-effort: a project that can't be loaded is skipped, but
            # KeyboardInterrupt / SystemExit are no longer swallowed
            print("ERROR: Project not found. Skipping..")
            continue

        all_counter = 0
        not_found_counter = 0
        not_found_variants = []
        reached_limit = False
        for vcf_file in project.get_all_vcf_files():
            path = vcf_file.file_path
            # fall back to the gzipped file when the plain .vcf is not on disk
            if not os.path.isfile(path) and path.endswith(".vcf"):
                path = path + ".gz"

            f = gzip.open(path) if path.endswith(".gz") else open(path)
            try:
                for variant in vcf_stuff.iterate_vcf(f):
                    all_counter += 1
                    try:
                        get_mall(project).annotator.get_annotation(variant.xpos, variant.ref, variant.alt)
                    except ValueError:
                        not_found_counter += 1
                        if len(not_found_variants) < 30:
                            # keep a few examples, formatted as chrom-pos-ref-alt
                            chrom, pos = genomeloc.get_chr_pos(variant.xpos)
                            chrom = chrom.replace("chr", "")
                            not_found_variants.append("%s-%s-%s-%s" % (chrom, pos, variant.ref, variant.alt))

                    if all_counter >= number_of_variants_to_check:
                        reached_limit = True
                        break
            finally:
                f.close()

            if reached_limit:
                # stop scanning this project's remaining VCF files as well
                break

        # Report once per project. not_found_counter > 10 implies all_counter > 0.
        if not_found_counter > 10:
            percent_missing = 100.0 * not_found_counter / all_counter
            print("---- ERROR: (%0.2f%%) %s / %s variants not found. Project %s should be reloaded. Examples: " % (
                percent_missing, not_found_counter, all_counter, project_id))
            for v in not_found_variants:
                print("http://exac.broadinstitute.org/variant/" + v)
def __str__(self):
    """Render as ``<chrom>-<pos>-<ref>-<alt>:<tag>``.

    chrom and pos come from decoding ``self.xpos`` with ``genomeloc.get_chr_pos``;
    the tag is ``self.project_tag.tag``.
    """
    chrom, position = genomeloc.get_chr_pos(self.xpos)
    fields = (chrom, position, self.ref, self.alt, self.project_tag.tag)
    return "%s-%s-%s-%s:%s" % fields