def update_annotator_variants_table(self):
    """Refresh the population-frequency fields on every record in db.variants."""
    annotator = mall.get_annotator()
    frequency_store = annotator.get_population_frequency_store()
    annotator_store = annotator.get_annotator_datastore()

    # Slugs of the reference populations whose frequencies get copied over.
    slugs = [spec["slug"] for spec in annotator_settings.reference_populations]

    for num_processed, variant_dict in enumerate(annotator_store.variants.find(), 1):
        if num_processed % 10000 == 0:
            # Progress heartbeat every 10k variants.
            print("%s: %s processed" % (datetime.datetime.now(), num_processed))

        freqs = frequency_store.get_frequencies(
            variant_dict["xpos"], variant_dict["ref"], variant_dict["alt"])
        # Dotted keys so the mongo $set writes nested annotation.freqs fields.
        update_fields = dict(
            ("annotation.freqs." + slug, freqs.get(slug, 0)) for slug in slugs)

        # only update if atleast one of the freqs is > 0
        if sum(update_fields.values()) > 0:
            annotator_store.variants.update(
                {"xpos": variant_dict["xpos"],
                 "ref": variant_dict["ref"],
                 "alt": variant_dict["alt"]},
                {"$set": update_fields},
                upsert=False)
def update_annotator_variants_table(self):
    """Updates all db.variants population frequencies based on population_frequency"""
    # Store mapping (xpos, ref, alt) -> per-population allele frequencies.
    population_frequency_store = mall.get_annotator().get_population_frequency_store()
    population_slugs_to_load = [
        population_spec['slug']
        for population_spec in annotator_settings.reference_populations
    ]
    annotator_store = mall.get_annotator().get_annotator_datastore()
    counter = 0
    for variant_dict in annotator_store.variants.find():
        counter += 1
        if counter % 10000 == 0:
            # Progress heartbeat every 10k variants.
            print("%s: %s processed" % (datetime.datetime.now(), counter))
        freqs = population_frequency_store.get_frequencies(
            variant_dict['xpos'], variant_dict['ref'], variant_dict['alt'])
        # Dotted keys so the mongo $set writes nested annotation.freqs fields.
        full_freqs = {
            'annotation.freqs.' + population_slug: freqs.get(population_slug, 0)
            for population_slug in population_slugs_to_load
        }
        if sum(full_freqs.values()) > 0:
            # only update if atleast one of the freqs is > 0
            annotator_store.variants.update(
                {
                    'xpos': variant_dict['xpos'],
                    'ref': variant_dict['ref'],
                    'alt': variant_dict['alt']
                },
                {'$set': full_freqs},
                upsert=False)
def load_project_variants_from_vcf(project_id, vcf_files):
    """
    Load any families and cohorts in this project that aren't loaded already
    """
    print("Called load_project_variants_from_vcf on " + str(vcf_files))
    print "Loading project %s" % project_id
    print(date.strftime(datetime.now(), "%m/%d/%Y %H:%M:%S -- loading project: " + project_id + " - db.variants cache"))

    project = Project.objects.get(project_id=project_id)
    for vcf_file in vcf_files:
        r = vcf.VCFReader(filename=vcf_file)
        if "CSQ" in r.infos:
            # VCF already carries VEP annotations -- load them directly.
            mall.get_annotator().add_preannotated_vcf_file(vcf_file)
        else:
            # Un-annotated VCF -- hand it to the annotator for annotation.
            mall.get_annotator().add_vcf_file_to_annotator(vcf_file)

    # batch load families by VCF file
    print("project.families_by_vcf(): " + str(project.families_by_vcf()))
    for vcf_file, families in project.families_by_vcf().items():
        if vcf_file not in vcf_files:
            # Only load families backed by one of the requested VCFs.
            print("Skipping %(vcf_file)s since its not in %(vcf_files)s" % locals())
            continue

        #families = [f for f in families if get_mall(project.project_id).variant_store.get_family_status(project_id, f.family_id) != 'loaded']
        print("Loading families for VCF file: " + vcf_file)
        # Families are loaded in batches of FAMILY_LOAD_BATCH_SIZE.
        for i in xrange(0, len(families), settings.FAMILY_LOAD_BATCH_SIZE):
            #print(date.strftime(datetime.now(), "%m/%d/%Y %H:%M:%S -- loading project: " + project_id + " - families batch %d - %d families" % (i, len(families[i:i+settings.FAMILY_LOAD_BATCH_SIZE]))))
            load_variants_for_family_list(project, families[i:i+settings.FAMILY_LOAD_BATCH_SIZE], vcf_file, mark_as_loaded=True)

    print(date.strftime(datetime.now(), "%m/%d/%Y %H:%M:%S -- finished loading project: " + project_id))
def load_project_variants_from_vcf(project_id, vcf_files, mark_as_loaded=True, start_from_chrom=None, end_with_chrom=None):
    """
    Load any families and cohorts in this project that aren't loaded already

    Args:
        project_id: the project id as a string
        vcf_files: a list of one or more vcf file paths
        mark_as_loaded: passed through to load_variants_for_family_list
        start_from_chrom: optional chromosome at which to start loading
        end_with_chrom: optional chromosome at which to stop loading

    Raises:
        ValueError: if an existing VCF lacks VEP ("CSQ") annotations.
    """
    project = Project.objects.get(project_id=project_id)
    for vcf_file in vcf_files:
        if not os.path.isfile(vcf_file):
            print("Skipping " + vcf_file)
            continue
        r = vcf.VCFReader(filename=vcf_file)
        if "CSQ" not in r.infos:
            raise ValueError("VEP annotations not found in VCF: " + vcf_file)
        # FIX: removed the redundant `if vcf_file in vcf_files:` guard -- it is
        # always true inside this loop over vcf_files.
        mall.get_annotator().add_preannotated_vcf_file(
            vcf_file, start_from_chrom=start_from_chrom, end_with_chrom=end_with_chrom)

    # batch load families by VCF file
    print("project.families_by_vcf(): " + str(project.families_by_vcf()))
    for vcf_file, families in project.families_by_vcf().items():
        if vcf_file not in vcf_files:
            # Only load families backed by one of the requested VCFs.
            print("Skipping %(vcf_file)s since its not in %(vcf_files)s" % locals())
            continue

        #families = [f for f in families if get_mall(project.project_id).variant_store.get_family_status(project_id, f.family_id) != 'loaded']
        print("Loading families for VCF file: " + vcf_file)
        # Families are loaded in batches of FAMILY_LOAD_BATCH_SIZE.
        for i in xrange(0, len(families), settings.FAMILY_LOAD_BATCH_SIZE):
            load_variants_for_family_list(
                project,
                families[i:i+settings.FAMILY_LOAD_BATCH_SIZE],
                vcf_file,
                mark_as_loaded=mark_as_loaded,
                start_from_chrom=start_from_chrom,
                end_with_chrom=end_with_chrom)
def load_project_variants(project_id, force_annotations=False, ignore_csq_in_vcf=False): """ Load any families and cohorts in this project that aren't loaded already """ print "Loading project %s" % project_id print(date.strftime(datetime.now(), "%m/%d/%Y %H:%M:%S -- loading project: " + project_id + " - db.variants cache")) project = Project.objects.get(project_id=project_id) for vcf_obj in project.get_all_vcf_files(): r = vcf.VCFReader(filename=vcf_obj.path()) if not ignore_csq_in_vcf and "CSQ" not in r.infos: raise ValueError("VEP annotations not found in VCF: " + vcf_file) mall.get_annotator().add_preannotated_vcf_file(vcf_obj.path(), force=force_annotations) # batch load families by VCF file for vcf_file, families in project.families_by_vcf().items(): families = [f for f in families if get_mall(project.project_id).variant_store.get_family_status(project_id, f.family_id) != 'loaded'] for i in xrange(0, len(families), settings.FAMILY_LOAD_BATCH_SIZE): print(date.strftime(datetime.now(), "%m/%d/%Y %H:%M:%S -- loading project: " + project_id + " - families batch %d - %d families" % (i, len(families[i:i+settings.FAMILY_LOAD_BATCH_SIZE])) )) load_variants_for_family_list(project, families[i:i+settings.FAMILY_LOAD_BATCH_SIZE], vcf_file) # now load cohorts load_cohorts(project_id)
def load_project_variants(project_id, force_load_annotations=False, force_load_variants=False, ignore_csq_in_vcf=False, start_from_chrom=None, end_with_chrom=None):
    """
    Load any families and cohorts in this project that aren't loaded already
    """
    print "Loading project %s" % project_id
    print(date.strftime(datetime.now(), "%m/%d/%Y %H:%M:%S -- loading project: " + project_id + " - db.variants cache"))
    project = Project.objects.get(project_id=project_id)
    # Sort VCFs by path so loading order is deterministic across runs.
    for vcf_obj in sorted(project.get_all_vcf_files(), key=lambda v:v.path()):
        if not os.path.isfile(vcf_obj.path()):
            print("Skipping " + vcf_obj.path())
            continue
        r = vcf.VCFReader(filename=vcf_obj.path())
        if not ignore_csq_in_vcf and "CSQ" not in r.infos:
            raise ValueError("VEP annotations not found in VCF: " + vcf_obj.path())
        mall.get_annotator().add_preannotated_vcf_file(vcf_obj.path(), force=force_load_annotations, start_from_chrom=start_from_chrom, end_with_chrom=end_with_chrom)

    # batch load families by VCF file
    for vcf_file, families in project.families_by_vcf().items():
        if not force_load_variants:
            # filter out families that have already finished loading
            # NOTE(review): passes `project` to get_mall() here, while other
            # versions pass project.project_id -- confirm which is expected.
            families = [f for f in families if get_mall(project).variant_store.get_family_status(project_id, f.family_id) != 'loaded']
        for i in xrange(0, len(families), settings.FAMILY_LOAD_BATCH_SIZE):
            load_variants_for_family_list(project, families[i:i+settings.FAMILY_LOAD_BATCH_SIZE], vcf_file, start_from_chrom=start_from_chrom, end_with_chrom=end_with_chrom)

    # now load cohorts
    load_cohorts(project_id)
def get_variants_from_variant_tuples(project, variant_tuples, user=None):
    """Resolve (xpos, ref, alt, family_id) tuples to Variant objects.

    Tuples are grouped by family so the datastore is queried once per family;
    tuples missing from the datastore become freshly annotated stub Variants.
    """
    datastore = get_datastore(project)
    population_slugs = project.get_reference_population_slugs()

    # Group the requested tuples by family id.
    tuples_by_family = {}
    for xpos, ref, alt, family_id in variant_tuples:
        tuples_by_family.setdefault(family_id, []).append((xpos, ref, alt))

    results = []
    for family_id, family_tuples in tuples_by_family.items():
        found = datastore.get_multiple_variants(
            project.project_id, family_id, family_tuples, user=user)
        for (xpos, ref, alt), variant in zip(family_tuples, found):
            if not variant:
                # Not in the datastore: synthesize and annotate a stub.
                variant = Variant(xpos, ref, alt)
                get_annotator().annotate_variant(variant, population_slugs)
                variant.set_extra('created_variant', True)
            variant.set_extra('family_id', family_id)
            variant.set_extra('project_id', project.project_id)
            results.append(variant)
    return results
def get_variants_from_variant_tuples(project, variant_tuples, user=None):
    """Resolve (xpos, ref, alt, family_id) tuples to Variant objects.

    Tuples are grouped by family so the datastore is queried once per family;
    variants missing from the datastore are created and annotated on the fly.
    """
    datastore = get_datastore(project)
    population_slugs = project.get_reference_population_slugs()

    # Group the requested tuples by family id.
    variant_tuples_by_family_id = {}
    for xpos, ref, alt, family_id in variant_tuples:
        if family_id not in variant_tuples_by_family_id:
            variant_tuples_by_family_id[family_id] = []
        variant_tuples_by_family_id[family_id].append((xpos, ref, alt))

    variants = []
    for family_id, variant_tuples in variant_tuples_by_family_id.items():
        variants_for_family = datastore.get_multiple_variants(
            project.project_id, family_id, variant_tuples, user=user)
        for (xpos, ref, alt), variant in zip(variant_tuples, variants_for_family):
            if not variant:
                # Not in the datastore: synthesize and annotate a stub.
                variant = Variant(xpos, ref, alt)
                get_annotator().annotate_variant(variant, population_slugs)
                variant.set_extra('created_variant', True)
            variant.set_extra('family_id', family_id)
            variant.set_extra('project_id', project.project_id)
            variants.append(variant)
    return variants
def load_project_variants(project_id, force_load_annotations=False, force_load_variants=False, ignore_csq_in_vcf=False, start_from_chrom=None, end_with_chrom=None):
    """
    Load any families and cohorts in this project that aren't loaded already
    """
    print "Loading project %s" % project_id
    print(
        date.strftime(
            datetime.now(), "%m/%d/%Y %H:%M:%S -- loading project: " +
            project_id + " - db.variants cache"))
    project = Project.objects.get(project_id=project_id)
    # Sort VCFs by path so loading order is deterministic across runs.
    for vcf_obj in sorted(project.get_all_vcf_files(), key=lambda v: v.path()):
        if not os.path.isfile(vcf_obj.path()):
            print("Skipping " + vcf_obj.path())
            continue
        r = vcf.VCFReader(filename=vcf_obj.path())
        if not ignore_csq_in_vcf and "CSQ" not in r.infos:
            raise ValueError("VEP annotations not found in VCF: " +
                             vcf_obj.path())
        mall.get_annotator().add_preannotated_vcf_file(
            vcf_obj.path(),
            force=force_load_annotations,
            start_from_chrom=start_from_chrom,
            end_with_chrom=end_with_chrom)

    # batch load families by VCF file
    for vcf_file, families in project.families_by_vcf().items():
        if not force_load_variants:
            # filter out families that have already finished loading
            families = [
                f for f in families
                if get_mall(project.project_id).variant_store.
                get_family_status(project_id, f.family_id) != 'loaded'
            ]
        for i in xrange(0, len(families), settings.FAMILY_LOAD_BATCH_SIZE):
            print(
                date.strftime(
                    datetime.now(),
                    "%m/%d/%Y %H:%M:%S -- loading project: " + project_id +
                    " - families batch %d - %d families" %
                    (i, len(families[i:i + settings.FAMILY_LOAD_BATCH_SIZE]))))
            load_variants_for_family_list(
                project,
                families[i:i + settings.FAMILY_LOAD_BATCH_SIZE],
                vcf_file,
                start_from_chrom=start_from_chrom,
                end_with_chrom=end_with_chrom)

    # now load cohorts
    load_cohorts(project_id)
def load_project_variants_from_vcf(project_id, vcf_files, mark_as_loaded=True, start_from_chrom=None, end_with_chrom=None):
    """
    Load any families and cohorts in this project that aren't loaded already

    Args:
        project_id: the project id as a string
        vcf_files: a list of one or more vcf file paths
    """
    print("Called load_project_variants_from_vcf on " + str(vcf_files))
    print(
        date.strftime(
            datetime.now(), "%m/%d/%Y %H:%M:%S -- loading project: " +
            project_id + " - db.variants cache"))
    project = Project.objects.get(project_id=project_id)
    for vcf_file in vcf_files:
        if not os.path.isfile(vcf_file):
            print("Skipping " + vcf_file)
            continue
        r = vcf.VCFReader(filename=vcf_file)
        if "CSQ" not in r.infos:
            raise ValueError("VEP annotations not found in VCF: " + vcf_file)
        # NOTE(review): this condition is always true (the loop iterates
        # vcf_files), so the guard is redundant -- confirm before removing.
        if vcf_file in vcf_files:
            mall.get_annotator().add_preannotated_vcf_file(
                vcf_file,
                start_from_chrom=start_from_chrom,
                end_with_chrom=end_with_chrom)

    # batch load families by VCF file
    print("project.families_by_vcf(): " + str(project.families_by_vcf()))
    for vcf_file, families in project.families_by_vcf().items():
        if vcf_file not in vcf_files:
            # Only load families backed by one of the requested VCFs.
            print("Skipping %(vcf_file)s since its not in %(vcf_files)s" % locals())
            continue

        #families = [f for f in families if get_mall(project.project_id).variant_store.get_family_status(project_id, f.family_id) != 'loaded']
        print("Loading families for VCF file: " + vcf_file)
        # Families are loaded in batches of FAMILY_LOAD_BATCH_SIZE.
        for i in xrange(0, len(families), settings.FAMILY_LOAD_BATCH_SIZE):
            #print(date.strftime(datetime.now(), "%m/%d/%Y %H:%M:%S -- loading project: " + project_id + " - families batch %d - %d families" % (i, len(families[i:i+settings.FAMILY_LOAD_BATCH_SIZE]))))
            load_variants_for_family_list(
                project,
                families[i:i + settings.FAMILY_LOAD_BATCH_SIZE],
                vcf_file,
                mark_as_loaded=mark_as_loaded,
                start_from_chrom=start_from_chrom,
                end_with_chrom=end_with_chrom)

    print(
        date.strftime(
            datetime.now(),
            "%m/%d/%Y %H:%M:%S -- finished loading project: " + project_id))
def add_populations_to_variants(variants, population_slug_list): if population_slug_list: try: mall.get_annotator().get_population_frequency_store( ).add_populations_to_variants(variants, population_slug_list) except Exception, e: print( "WARNING: got unexpected error in add_custom_populations_to_variants: %s" % e)
def handle(self, *args, **options):
    """Load the custom annotator (when configured), then reference and annotator data."""
    if settings.CUSTOM_ANNOTATOR_SETTINGS is not None:
        print("Load dbNSFP.. ")
        # note that you could use mall.get_custom_annotator() here too
        CustomAnnotator(settings.CUSTOM_ANNOTATOR_SETTINGS).load()
    get_reference().load()
    mall.get_annotator().load()
def handle(self, *args, **options):
    """Load the custom annotator (when configured), then reference and annotator data."""
    if settings.CUSTOM_ANNOTATOR_SETTINGS is not None:
        print("Load dbNSFP.. ")
        # note that you could use mall.get_custom_annotator() here too
        custom_annotator = CustomAnnotator(settings.CUSTOM_ANNOTATOR_SETTINGS)
        custom_annotator.load()
    get_reference().load()
    mall.get_annotator().load()
def get_variants_from_note_tuples(project, note_tuples):
    """Resolve (xpos, ref, alt, family_id) note tuples to Variant objects.

    Tuples missing from the datastore become freshly annotated stub Variants.
    """
    resolved = []
    for xpos, ref, alt, family_id in note_tuples:
        variant = get_datastore(project.project_id).get_single_variant(
            project.project_id, family_id, xpos, ref, alt)
        if not variant:
            # Not stored: build a bare Variant and annotate it on the fly.
            variant = Variant(xpos, ref, alt)
            get_annotator().annotate_variant(
                variant, project.get_reference_population_slugs())
        variant.set_extra("family_id", family_id)
        variant.set_extra("project_id", project.project_id)
        resolved.append(variant)
    return resolved
def inheritance_matrix_for_gene(project, gene_id):
    """
    Run get_family_matrix_for_gene for the families in this project
    """
    # Default filters: moderate-impact variants, high-quality genotypes.
    variant_filter = get_default_variant_filter('moderate_impact', mall.get_annotator().reference_population_slugs)
    quality_filter = get_default_quality_filter('high_quality', mall.get_annotator().reference_population_slugs)
    # NOTE(review): get_mall() takes no argument here, whereas another version
    # of this function passes project.project_id -- confirm which is current.
    matrix = get_family_matrix_for_gene(
        get_mall(),
        [f.xfamily() for f in project.get_active_families()],
        gene_id,
        variant_filter,
        quality_filter
    )
    return matrix
def get_variants_from_variant_tuples(project, variant_tuples):
    """Resolve (xpos, ref, alt, family_id) tuples to Variant objects.

    Tuples missing from the datastore become freshly annotated stub Variants.
    """
    resolved = []
    for xpos, ref, alt, family_id in variant_tuples:
        variant = get_datastore(project.project_id).get_single_variant(
            project.project_id, family_id, xpos, ref, alt)
        if not variant:
            # Missing from the datastore: build a stub and annotate it.
            variant = Variant(xpos, ref, alt)
            get_annotator().annotate_variant(
                variant, project.get_reference_population_slugs())
        variant.set_extra('family_id', family_id)
        variant.set_extra('project_id', project.project_id)
        resolved.append(variant)
    return resolved
def inheritance_matrix_for_gene(project, gene_id):
    """
    Run get_family_matrix_for_gene for the families in this project
    """
    # Default filters: moderate-impact variants, high-quality genotypes.
    variant_filter = get_default_variant_filter(
        'moderate_impact', mall.get_annotator().reference_population_slugs)
    quality_filter = get_default_quality_filter(
        'high_quality', mall.get_annotator().reference_population_slugs)
    matrix = get_family_matrix_for_gene(
        get_mall(project.project_id),
        [f.xfamily() for f in project.get_active_families()],
        gene_id,
        variant_filter,
        quality_filter)
    return matrix
def family_group_gene(request, project_id, family_group_slug, gene_id):
    """Django view: variants in one gene across the families of a family group."""
    project = get_object_or_404(Project, project_id=project_id)
    family_group = get_object_or_404(FamilyGroup, project=project, slug=family_group_slug)
    if not project.can_view(request.user):
        return HttpResponse('unauthorized')

    # Normalize whatever gene identifier came in the URL to a canonical id.
    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)

    # Restrict to coding variants, using the configured reference populations.
    varfilter = get_default_variant_filter(
        'all_coding', mall.get_annotator().reference_population_slugs)

    variants_by_family = family_group_analysis.get_variants_in_gene(
        family_group, gene_id, variant_filter=varfilter)

    return render(
        request, 'family_group/family_group_gene.html', {
            'project': project,
            'family_group': family_group,
            'family_group_json': json.dumps(family_group.toJSON()),
            'gene_json': json.dumps(gene),
            'gene': gene,
            'variants_by_family_json': json.dumps(variants_by_family),
        })
def get_knockouts_in_gene(project, gene_id, quality_filter=None):
    """
    Get all the variants in a gene, but filter out quality_filter genotypes

    Returns a (knockouts, variation) pair: the individuals matching a
    recessive inheritance model in this gene, and the CohortGeneVariation used.
    """
    indiv_id_list = [i.indiv_id for i in project.get_individuals()]

    # filter out variants > 0.01 AF in any of the reference populations
    reference_populations = mall.get_annotator().reference_population_slugs
    variant_filter = get_default_variant_filter('moderate_impact', reference_populations)

    variant_list = get_project_datastore(
        project.project_id).get_project_variants_in_gene(
            project.project_id,
            gene_id,
            variant_filter=variant_filter,
        )
    # Re-apply the gene/variant filter client-side as well.
    variant_list = search_utils.filter_gene_variants_by_variant_filter(
        variant_list, gene_id, variant_filter)

    variation = CohortGeneVariation(
        get_reference(),
        gene_id,
        variant_list,
        indiv_id_list,
        # NOTE(review): the quality_filter parameter is never used -- an empty
        # filter is always passed here. Confirm whether that is intentional.
        quality_filter={},
    )
    knockouts = get_individuals_with_inheritance('recessive', variation, indiv_id_list)
    return knockouts, variation
def handle(self, *args, **options):
    """load CADD scores for all variants in a project, or all variants in the annotator_store."""
    annotator_store = mall.get_annotator().get_annotator_datastore()

    if options['cadd_file']:
        # Bulk-load scores straight from a CADD file, then stop.
        print("Loading " + options['cadd_file'])
        load_from_cadd_file(options['cadd_file'])
        # BUGFIX: previously execution fell through to the loop below with
        # `variant_collection` undefined, raising a NameError.
        print("Done")
        return
    elif options['project_id']:
        # Annotate only this project's variants that lack a cadd_phred score.
        print("Loading " + options['project_id'])
        project = Project.objects.get(project_id=options['project_id'])
        variant_collection = get_project_datastore(project)._get_project_collection(options['project_id']).find({'annotation.cadd_phred': {'$exists' : False}})
    else:
        # Default: every annotator-store variant still missing a cadd_phred score.
        variant_collection = annotator_store.variants.find({'annotation.cadd_phred': {'$exists' : False}})

    #print("Variant collection: " + str(variant_collection))
    #print("Annotating %s variants" % variant_collection.count())
    for r in tqdm.tqdm(variant_collection, unit=' variants'):  #, total=variant_collection.count()):
        chrom, pos = genomeloc.get_chr_pos(r['xpos'])
        cadd_phred = fetch(chrom, pos, r['ref'], r['alt'])
        if cadd_phred is not None:
            # NOTE(review): updates always target the annotator store, even when
            # variants were read from a project collection -- confirm intent.
            result = annotator_store.variants.update(
                {'xpos': r['xpos'], 'ref': r['ref'], 'alt': r['alt']},
                {'$set': {'annotation.cadd_phred': cadd_phred}},
                upsert=False)
            assert result['updatedExisting']
    print("Done")
def look_up_vcf_loaded_date(vcf_path):
    """Return the datetime at which `vcf_path` was loaded into the annotator.

    The date is taken from the record's '_id' generation_time (presumably a
    mongo ObjectId). Raises ValueError when no record exists for the path.
    """
    record = get_annotator().get_vcf_file_from_annotator(vcf_path)
    if record is None:
        raise ValueError("Couldn't find loaded date for %s" % vcf_path)
    loaded_date = record['_id'].generation_time
    logger.info("%s data-loaded date: %s" % (vcf_path, loaded_date))
    return loaded_date
def look_up_vcf_loaded_date(vcf_path):
    """Return the datetime at which `vcf_path` was loaded into the annotator.

    The date is taken from the record's '_id' generation_time (presumably a
    mongo ObjectId). Raises ValueError when no record exists for the path.
    """
    vcf_record = get_annotator().get_vcf_file_from_annotator(vcf_path)
    if vcf_record is None:
        raise ValueError("Couldn't find loaded date for %s" % vcf_path)
    loaded_date = vcf_record['_id'].generation_time
    # logger.info("%s data-loaded date: %s" % (vcf_path, loaded_date))
    return loaded_date
def handle(self, *args, **options):
    """Force-reload the pre-annotated (CSQ-bearing) VCFs of each given project."""
    if not args:
        print("Must provide at least one project_id")
        return
    for project_id in args:
        print(date.strftime(datetime.now(), "%m/%d/%Y %H:%M:%S -- loading project: " + project_id + " - db.variants cache"))
        project = Project.objects.get(project_id=project_id)
        for vcf_obj in project.get_all_vcf_files():
            r = vcf.VCFReader(filename=vcf_obj.path())
            if "CSQ" not in r.infos:
                # Can't load un-annotated VCFs -- report and skip.
                print("VCF %s isn't annotated (eg. doesn't have a CSQ)" % str(vcf_obj.path()))
            else:
                print("Loading VCF %s with CSQ: %s" % (vcf_obj.path(), r.infos["CSQ"]))
                # force=True re-loads even if this VCF was loaded before.
                mall.get_annotator().add_preannotated_vcf_file(vcf_obj.path(), force=True)
        print(date.strftime(datetime.now(), "%m/%d/%Y %H:%M:%S -- loading project: " + project_id + " - db.variants cache"))
def get_variants_from_variant_tuples(project, variant_tuples):
    """Resolve (xpos, ref, alt, family_id) tuples to Variant objects.

    Variants missing from the datastore are created and annotated on the fly.
    """
    variants = []
    for t in variant_tuples:
        # t = (xpos, ref, alt, family_id)
        variant = get_datastore(project.project_id).get_single_variant(
            project.project_id, t[3], t[0], t[1], t[2]
        )
        if not variant:
            variant = Variant(t[0], t[1], t[2])
            get_annotator().annotate_variant(variant, project.get_reference_population_slugs())
        variant.set_extra('family_id', t[3])
        variant.set_extra('project_id', project.project_id)
        variants.append(variant)
    return variants
def update_pop_freqs_in_family_tables(self):
    """Update db_freqs.* population frequencies in every per-family variant table.

    Progress is checkpointed in a local sqlite db so the job can be resumed,
    and pending work is claimed in random order (suitable for several workers).
    """
    # Load family tables
    population_frequency_store = mall.get_annotator().get_population_frequency_store()
    db = sqlite3.connect("reference_populations_family_tables.db", isolation_level=None)
    db.execute(
        "CREATE TABLE if not exists all_projects(project_id varchar(200), family_id varchar(200), started bool, finished bool)"
    )
    db.execute("CREATE UNIQUE INDEX IF NOT EXISTS all_projects_idx ON all_projects(project_id, family_id)")
    # Seed the worklist with every (project, family) pair.
    for project in Project.objects.all().order_by("-last_accessed_date"):
        project_id = project.project_id
        datastore = get_datastore(project_id)
        for i, family_info in enumerate(datastore._get_family_info(project_id)):
            family_id = family_info["family_id"]
            db.execute("INSERT OR IGNORE INTO all_projects VALUES (?, ?, 0, 0)", (project_id, family_id))

    # Go through each project in decending order
    population_slugs_to_load = [
        population_spec["slug"] for population_spec in annotator_settings.reference_populations_to_load
    ]
    while True:
        remaining_work = list(
            db.execute("SELECT project_id, family_id FROM all_projects WHERE started=0 ORDER BY RANDOM()")
        )
        print("%d projects / families remaining" % len(remaining_work))
        if not remaining_work:
            print("Done with all projects/families")
            break

        project_id, family_id = remaining_work[0]
        datastore = get_datastore(project_id)
        print(" updating %s / %s" % (project_id, family_id))
        # Mark as claimed before processing so other workers skip it.
        db.execute("UPDATE all_projects SET started=1 WHERE project_id=? AND family_id=?", (project_id, family_id))
        family_collection = datastore._get_family_collection(project_id, family_id)
        for variant_dict in family_collection.find():
            freqs = population_frequency_store.get_frequencies(
                variant_dict["xpos"], variant_dict["ref"], variant_dict["alt"]
            )
            # Dotted keys so the mongo $set writes nested db_freqs fields.
            full_freqs = {
                "db_freqs." + population_slug: freqs.get(population_slug, 0)
                for population_slug in population_slugs_to_load
            }
            family_collection.update(
                {"xpos": variant_dict["xpos"], "ref": variant_dict["ref"], "alt": variant_dict["alt"]},
                {"$set": full_freqs},
                upsert=False,
            )
            # print("---------\nvariant_dict: %s, \nfreqs: %s, \nupdated_variant_dict: %s" % (variant_dict, full_freqs, str(family_collection.find_one(
            #    {'xpos':variant_dict['xpos'], 'ref' :variant_dict['ref'], 'alt': variant_dict['alt']}))))
        print(" ---> done updating project_id: %s, family_id: %s" % (project_id, family_id))
        db.execute("UPDATE all_projects SET finished=1 WHERE project_id=? AND family_id=?", (project_id, family_id))
def load_project_variants_from_vcf(project_id, vcf_files):
    """
    Load any families and cohorts in this project that aren't loaded already
    """
    print("Called load_project_variants_from_vcf on " + str(vcf_files))
    print "Loading project %s" % project_id
    print(
        date.strftime(
            datetime.now(), "%m/%d/%Y %H:%M:%S -- loading project: " +
            project_id + " - db.variants cache"))
    project = Project.objects.get(project_id=project_id)
    for vcf_file in vcf_files:
        r = vcf.VCFReader(filename=vcf_file)
        if "CSQ" in r.infos:
            # VCF already carries VEP annotations -- load them directly.
            mall.get_annotator().add_preannotated_vcf_file(vcf_file)
        else:
            # Un-annotated VCF -- hand it to the annotator for annotation.
            mall.get_annotator().add_vcf_file_to_annotator(vcf_file)

    # batch load families by VCF file
    print("project.families_by_vcf(): " + str(project.families_by_vcf()))
    for vcf_file, families in project.families_by_vcf().items():
        if vcf_file not in vcf_files:
            # Only load families backed by one of the requested VCFs.
            print("Skipping %(vcf_file)s since its not in %(vcf_files)s" % locals())
            continue

        #families = [f for f in families if get_mall(project.project_id).variant_store.get_family_status(project_id, f.family_id) != 'loaded']
        print("Loading families for VCF file: " + vcf_file)
        # Families are loaded in batches of FAMILY_LOAD_BATCH_SIZE.
        for i in xrange(0, len(families), settings.FAMILY_LOAD_BATCH_SIZE):
            #print(date.strftime(datetime.now(), "%m/%d/%Y %H:%M:%S -- loading project: " + project_id + " - families batch %d - %d families" % (i, len(families[i:i+settings.FAMILY_LOAD_BATCH_SIZE]))))
            load_variants_for_family_list(
                project,
                families[i:i + settings.FAMILY_LOAD_BATCH_SIZE],
                vcf_file,
                mark_as_loaded=True)

    print(
        date.strftime(
            datetime.now(),
            "%m/%d/%Y %H:%M:%S -- finished loading project: " + project_id))
def gene_quicklook(request, project_id, gene_id):
    """
    Summary of a gene in a project

    Renders rare coding variants in the gene plus individuals that fit a
    recessive "knockout" model.
    """
    project = get_object_or_404(Project, project_id=project_id)
    if not project.can_view(request.user):
        return HttpResponse("Unauthorized")

    # Normalize whatever gene identifier came in the URL to a canonical id.
    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)
    sys.stderr.write(project_id + " - staring gene search for: %s %s \n" % (gene_id, gene))

    variant_filter = get_default_variant_filter(
        'all_coding', mall.get_annotator().reference_population_slugs)

    num_indivs = len(
        [i for i in project.get_individuals() if i.has_variant_data()])
    # Alt-allele-count cutoff scales with cohort size: 20% of individuals + 5.
    aac_threshold = (.2 * num_indivs) + 5
    rare_variants = []
    for variant in project_analysis.get_variants_in_gene(
            project, gene_id, variant_filter=variant_filter):
        aac = get_alt_allele_count(variant)
        max_af = max(variant.annotation['freqs'].values())
        # "Rare" = uncommon in this cohort AND <1% in every reference population.
        if aac <= aac_threshold and max_af < .01:
            rare_variants.append(variant)
    add_extra_info_to_variants_project(get_reference(), project, rare_variants)

    knockouts = []
    knockout_ids, variation = get_knockouts_in_gene(project, gene_id)
    for kid in knockout_ids:
        variants = variation.get_relevant_variants_for_indiv_ids([kid])
        add_extra_info_to_variants_project(get_reference(), project, variants)
        knockouts.append({
            'indiv_id': kid,
            'variants': [v.toJSON() for v in variants],
        })
    sys.stderr.write("Retrieved %s variants \n" % len(rare_variants))
    return render(
        request, 'project/gene_quicklook.html', {
            'gene': gene,
            'gene_json': json.dumps(gene),
            'project': project,
            'rare_variants_json':
            json.dumps([v.toJSON() for v in rare_variants]),
            'individuals_json':
            json.dumps([i.get_json_obj() for i in project.get_individuals()]),
            'knockouts_json': json.dumps(knockouts),
        })
def update_pop_freqs_in_project_tables(self):
    """Update db_freqs.* population frequencies in every per-project variant table.

    Progress is checkpointed in a local sqlite db so the job can be resumed.
    """
    # Load project tables
    population_frequency_store = mall.get_annotator().get_population_frequency_store()
    db = sqlite3.connect("reference_populations_project_tables.db", isolation_level=None)
    db.execute("CREATE TABLE if not exists all_projects(project_id varchar(200), started bool, finished bool)")
    db.execute("CREATE UNIQUE INDEX IF NOT EXISTS all_projects_idx ON all_projects(project_id)")
    import random
    # myoseq_v11 is processed first; the remaining projects are shuffled.
    other_project_ids = [p.project_id for p in Project.objects.all() if p.project_id != "myoseq_v11"]
    random.shuffle(other_project_ids)
    project_ids = ["myoseq_v11"] + other_project_ids
    for project_id in project_ids:
        db.execute("INSERT OR IGNORE INTO all_projects VALUES (?, 0, 0)", (project_id,))

    # Go through each project and update the variant records
    population_slugs_to_load = [
        population_spec["slug"] for population_spec in annotator_settings.reference_populations
    ]
    while True:
        remaining_work = list(db.execute("SELECT project_id FROM all_projects WHERE started=0"))
        print("%d projects remaining" % len(remaining_work))
        if not remaining_work:
            print("Done with all projects")
            break

        project_id, = remaining_work[0]
        project_store = get_project_datastore(project_id)
        print(" updating %s " % project_id)
        # Mark as claimed before processing so reruns skip it.
        db.execute("UPDATE all_projects SET started=1 WHERE project_id=?", (project_id,))
        project_collection = project_store._get_project_collection(project_id)
        for variant_dict in project_collection.find():
            freqs = population_frequency_store.get_frequencies(
                variant_dict["xpos"], variant_dict["ref"], variant_dict["alt"]
            )
            # Dotted keys so the mongo $set writes nested db_freqs fields.
            full_freqs = {
                "db_freqs." + population_slug: freqs.get(population_slug, 0)
                for population_slug in population_slugs_to_load
            }
            project_collection.update(
                {"xpos": variant_dict["xpos"], "ref": variant_dict["ref"], "alt": variant_dict["alt"]},
                {"$set": full_freqs},
                upsert=False,
            )
        print(" ---> done updating project_id: %s" % project_id)
        db.execute("UPDATE all_projects SET finished=1 WHERE project_id=?", (project_id,))
def load_project_variants(project_id, force_annotations=False):
    """
    Load any families and cohorts in this project that aren't loaded already
    """
    print "Loading project %s" % project_id
    project = Project.objects.get(project_id=project_id)
    for vcf in project.get_all_vcf_files():
        # NOTE(review): the loop variable `vcf` shadows the vcf module inside
        # this function -- confirm nothing below needs the module name.
        mall.get_annotator().add_vcf_file_to_annotator(vcf.path(), force_all=force_annotations)

    # batch load families by VCF file
    for vcf_file, families in project.families_by_vcf().items():
        # Skip families whose variants are already marked 'loaded'.
        families = [f for f in families if get_mall().variant_store.get_family_status(project_id, f.family_id) != 'loaded']
        for i in xrange(0, len(families), settings.FAMILY_LOAD_BATCH_SIZE):
            load_variants_for_family_list(project, families[i:i+settings.FAMILY_LOAD_BATCH_SIZE], vcf_file)

    # now load cohorts
    # TODO: load cohorts and families together
    for vcf_file, cohorts in project.cohorts_by_vcf().items():
        cohorts = [c for c in cohorts if get_mall().variant_store.get_family_status(project_id, c.cohort_id) != 'loaded']
        for i in xrange(0, len(cohorts), settings.FAMILY_LOAD_BATCH_SIZE):
            load_variants_for_cohort_list(project, cohorts[i:i+settings.FAMILY_LOAD_BATCH_SIZE], vcf_file)

    print "Finished loading project %s!" % project_id
def update_pop_freqs_in_project_tables(self):
    """Update db_freqs.* population frequencies in every per-project variant table.

    Progress is checkpointed in a local sqlite db so the job can be resumed.
    """
    # Load project tables
    population_frequency_store = mall.get_annotator().get_population_frequency_store()
    db = sqlite3.connect("reference_populations_project_tables.db", isolation_level=None)
    db.execute("CREATE TABLE if not exists all_projects(project_id varchar(200), started bool, finished bool)")
    db.execute("CREATE UNIQUE INDEX IF NOT EXISTS all_projects_idx ON all_projects(project_id)")
    import random
    # myoseq_v11 is processed first; the remaining projects are shuffled.
    other_project_ids = [p.project_id for p in Project.objects.all() if p.project_id != "myoseq_v11"]
    random.shuffle(other_project_ids)
    project_ids = ["myoseq_v11"] + other_project_ids
    for project_id in project_ids:
        db.execute("INSERT OR IGNORE INTO all_projects VALUES (?, 0, 0)", (project_id,))

    # Go through each project and update the variant records
    population_slugs_to_load = [population_spec['slug'] for population_spec in annotator_settings.reference_populations]
    while True:
        remaining_work = list(db.execute("SELECT project_id FROM all_projects WHERE started=0"))
        print("%d projects remaining" % len(remaining_work))
        if not remaining_work:
            print("Done with all projects")
            break

        project_id, = remaining_work[0]
        project_store = get_project_datastore(project_id)
        print(" updating %s " % project_id)
        # Mark as claimed before processing so reruns skip it.
        db.execute("UPDATE all_projects SET started=1 WHERE project_id=?", (project_id,))
        project_collection = project_store._get_project_collection(project_id)
        for variant_dict in project_collection.find():
            freqs = population_frequency_store.get_frequencies(variant_dict['xpos'], variant_dict['ref'], variant_dict['alt'])
            # Dotted keys so the mongo $set writes nested db_freqs fields.
            full_freqs = {'db_freqs.'+population_slug: freqs.get(population_slug, 0) for population_slug in population_slugs_to_load}
            project_collection.update({'xpos':variant_dict['xpos'], 'ref' :variant_dict['ref'], 'alt': variant_dict['alt']}, {'$set': full_freqs}, upsert=False)
        print(" ---> done updating project_id: %s" % project_id)
        db.execute("UPDATE all_projects SET finished=1 WHERE project_id=?", (project_id,))
def update_pop_freqs_in_family_tables(self):
    """Refresh the 'db_freqs.*' population-frequency fields on every variant record
    in each per-family Mongo collection, across all projects.

    Like update_pop_freqs_in_project_tables, a local sqlite database is used as a
    resumable work queue keyed on (project_id, family_id).
    """
    # Load family tables
    population_frequency_store = mall.get_annotator().get_population_frequency_store()
    # isolation_level=None == autocommit: each UPDATE is durable immediately
    db = sqlite3.connect("reference_populations_family_tables.db", isolation_level=None)
    db.execute("CREATE TABLE if not exists all_projects(project_id varchar(200), family_id varchar(200), started bool, finished bool)")
    db.execute("CREATE UNIQUE INDEX IF NOT EXISTS all_projects_idx ON all_projects(project_id, family_id)")
    for project in Project.objects.all().order_by('-last_accessed_date'):
        project_id = project.project_id
        datastore = get_datastore(project_id)
        for i, family_info in enumerate(datastore._get_family_info(project_id)):
            family_id = family_info['family_id']
            # INSERT OR IGNORE keeps progress flags from an earlier, interrupted run
            db.execute("INSERT OR IGNORE INTO all_projects VALUES (?, ?, 0, 0)", (project_id, family_id))

    # Go through each project in decending order
    # NOTE(review): this uses annotator_settings.reference_populations_to_load, while the
    # project-table variant of this method uses .reference_populations — confirm intended.
    population_slugs_to_load = [population_spec['slug'] for population_spec in annotator_settings.reference_populations_to_load]
    while True:
        # ORDER BY RANDOM() lets multiple workers share the queue with few collisions
        remaining_work = list(db.execute("SELECT project_id, family_id FROM all_projects WHERE started=0 ORDER BY RANDOM()"))
        print("%d projects / families remaining" % len(remaining_work))
        if not remaining_work:
            print("Done with all projects/families")
            break

        project_id, family_id = remaining_work[0]
        datastore = get_datastore(project_id)
        print(" updating %s / %s" % (project_id, family_id))
        # claim this family before processing it
        db.execute("UPDATE all_projects SET started=1 WHERE project_id=? AND family_id=?", (project_id, family_id))
        family_collection = datastore._get_family_collection(project_id, family_id)
        for variant_dict in family_collection.find():
            freqs = population_frequency_store.get_frequencies(variant_dict['xpos'], variant_dict['ref'], variant_dict['alt'])
            # populations absent from the store default to 0
            full_freqs = {'db_freqs.'+population_slug: freqs.get(population_slug, 0) for population_slug in population_slugs_to_load}
            family_collection.update({'xpos':variant_dict['xpos'], 'ref' :variant_dict['ref'], 'alt': variant_dict['alt']}, {'$set': full_freqs}, upsert=False)
            #print("---------\nvariant_dict: %s, \nfreqs: %s, \nupdated_variant_dict: %s" % (variant_dict, full_freqs, str(family_collection.find_one(
            #    {'xpos':variant_dict['xpos'], 'ref' :variant_dict['ref'], 'alt': variant_dict['alt']}))))
        print(" ---> done updating project_id: %s, family_id: %s" % (project_id, family_id))
        db.execute("UPDATE all_projects SET finished=1 WHERE project_id=? AND family_id=?", (project_id, family_id))
def handle(self, *args, **options):
    """load CADD scores for all variants in a project, or all variants in the annotator_store."""
    annotator_store = mall.get_annotator().get_annotator_datastore()

    if options['cadd_file']:
        # bulk path: load scores straight from a CADD file
        print("Loading " + options['cadd_file'])
        load_from_cadd_file(options['cadd_file'])
    elif options['project_id']:
        print("Loading " + options['project_id'])
        project = Project.objects.get(project_id=options['project_id'])
        # only variants still missing a cadd_phred annotation
        # NOTE(review): get_project_datastore is given the Project object here, while other
        # call sites in this file pass a project_id string — confirm which it expects.
        variant_collection = get_project_datastore(
            project)._get_project_collection(options['project_id']).find(
                {'annotation.cadd_phred': {
                    '$exists': False
                }})
    else:
        variant_collection = annotator_store.variants.find(
            {'annotation.cadd_phred': {
                '$exists': False
            }})

    #print("Variant collection: " + str(variant_collection))
    #print("Annotating %s variants" % variant_collection.count())
    # NOTE(review): in the project_id branch the cursor comes from the project collection,
    # but the $set below is always applied to annotator_store.variants — confirm intended.
    for r in tqdm.tqdm(
            variant_collection,
            unit=' variants'):  #, total=variant_collection.count()):
        chrom, pos = genomeloc.get_chr_pos(r['xpos'])
        # look up the CADD phred score from the tabix-indexed source
        cadd_phred = fetch(chrom, pos, r['ref'], r['alt'])
        if cadd_phred is not None:
            result = annotator_store.variants.update(
                {
                    'xpos': r['xpos'],
                    'ref': r['ref'],
                    'alt': r['alt']
                }, {'$set': {
                    'annotation.cadd_phred': cadd_phred
                }},
                upsert=False)
            # every updated variant must already exist in the annotator store
            assert result['updatedExisting']

    print("Done")
def get_knockouts_in_gene(project, gene_id, gene_variants):
    """Find individuals with a recessive (knockout) genotype pattern in a gene.

    Filters the supplied gene_variants down to moderate-impact variants, wraps
    them in a CohortGeneVariation over all project individuals (no quality
    filtering), and returns (knockout individual ids, the variation object).
    """
    all_indiv_ids = [individual.indiv_id for individual in project.get_individuals()]

    # filter out variants > 0.01 AF in any of the reference populations
    moderate_impact_filter = get_default_variant_filter(
        'moderate_impact',
        mall.get_annotator().reference_population_slugs,
    )
    filtered_variants = search_utils.filter_gene_variants_by_variant_filter(
        gene_variants, gene_id, moderate_impact_filter)

    variation = CohortGeneVariation(
        get_reference(),
        gene_id,
        filtered_variants,
        all_indiv_ids,
        quality_filter={},
    )
    knockouts = get_individuals_with_inheritance('recessive', variation, all_indiv_ids)
    return knockouts, variation
def gene_quicklook(request, project_id, gene_id):
    """
    Summary of a gene in a project
    """
    project = get_object_or_404(Project, project_id=project_id)
    if not project.can_view(request.user):
        return HttpResponse("Unauthorized")

    # resolve a symbol or ensembl id string to a canonical gene id
    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)
    sys.stderr.write(project_id + " - staring gene search for: %s %s \n" % (gene_id, gene))

    variant_filter = get_default_variant_filter('all_coding', mall.get_annotator().reference_population_slugs)

    # allele-count threshold scales with cohort size: 20% of individuals plus 5
    num_indivs = len([i for i in project.get_individuals() if i.has_variant_data()])
    aac_threshold = (.2 * num_indivs) + 5
    rare_variants = []
    for variant in project_analysis.get_variants_in_gene(project, gene_id, variant_filter=variant_filter):
        aac = get_alt_allele_count(variant)
        max_af = max(variant.annotation['freqs'].values())
        # keep variants rare both within the cohort (aac) and in reference populations (af)
        if aac <= aac_threshold and max_af < .01:
            rare_variants.append(variant)
    add_extra_info_to_variants_project(get_reference(), project, rare_variants)

    knockouts = []
    # NOTE(review): called with two args here, but a sibling definition in this file is
    # get_knockouts_in_gene(project, gene_id, gene_variants) — confirm which signature
    # this module actually imports.
    knockout_ids, variation = get_knockouts_in_gene(project, gene_id)
    for kid in knockout_ids:
        variants = variation.get_relevant_variants_for_indiv_ids([kid])
        add_extra_info_to_variants_project(get_reference(), project, variants)
        knockouts.append({
            'indiv_id': kid,
            'variants': [v.toJSON() for v in variants],
        })
    sys.stderr.write("Retrieved %s variants \n" % len(rare_variants))

    return render(request, 'project/gene_quicklook.html', {
        'gene': gene,
        'gene_json': json.dumps(gene),
        'project': project,
        'rare_variants_json': json.dumps([v.toJSON() for v in rare_variants]),
        'individuals_json': json.dumps([i.get_json_obj() for i in project.get_individuals()]),
        'knockouts_json': json.dumps(knockouts),
    })
def family_group_gene(request, project_id, family_group_slug, gene_id):
    """Render a per-family breakdown of coding variants in one gene for a family group."""
    project = get_object_or_404(Project, project_id=project_id)
    family_group = get_object_or_404(FamilyGroup, project=project, slug=family_group_slug)
    if not project.can_view(request.user):
        return HttpResponse('unauthorized')

    # canonicalize the gene identifier, then fetch the gene record
    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)

    coding_filter = get_default_variant_filter(
        'all_coding', mall.get_annotator().reference_population_slugs)
    variants_by_family = family_group_analysis.get_variants_in_gene(
        family_group, gene_id, variant_filter=coding_filter)

    context = {
        'project': project,
        'family_group': family_group,
        'family_group_json': json.dumps(family_group.toJSON()),
        'gene_json': json.dumps(gene),
        'gene': gene,
        'variants_by_family_json': json.dumps(variants_by_family),
    }
    return render(request, 'family_group/family_group_gene.html', context)
def default_variant_filters_json():
    """Return the default variant filter descriptors, with each 'variant_filter'
    value converted to its JSON-serializable form in place."""
    slugs = mall.get_annotator().reference_population_slugs
    filters = get_default_variant_filters(slugs)
    for descriptor in filters:
        descriptor['variant_filter'] = descriptor['variant_filter'].toJSON()
    return filters
def default_variant_filters_json():
    """Serialize the default variant filters: replaces each entry's
    'variant_filter' object with its toJSON() representation and returns the list."""
    annotator = mall.get_annotator()
    filters = get_default_variant_filters(annotator.reference_population_slugs)
    for entry in filters:
        entry['variant_filter'] = entry['variant_filter'].toJSON()
    return filters
def preload_vep_vcf_annotations(vcf_file_path):
    """Preload VEP annotations from the VCF at *vcf_file_path* into the annotator.

    Args:
        vcf_file_path: path to a VEP-annotated VCF file readable as text.
    """
    # Fix: the original called open() without ever closing the handle; use a
    # context manager so the file is closed even if preloading raises.
    with open(vcf_file_path) as vcf_file:
        mall.get_annotator().preload_vep_annotated_vcf(vcf_file)
def handle(self, *args, **options):
    """Management command: write a gzipped TSV of rare, high-quality, moderate-impact
    variant genotypes for a listed subset of individuals in one project.

    args[0] = project_id; args[1] = path to a file of individual ids (first
    tab-separated column, '#' comments and blank lines ignored).
    """
    if len(args) != 2:
        sys.exit("ERROR: please specify the project_id and file of individual ids as command line args.")
    project_id = args[0]
    individuals_file = args[1]

    # init objects
    project = Project.objects.get(project_id=project_id)
    all_individual_ids_in_project = set([i.indiv_id for i in project.get_individuals()])

    # split the listed ids into those present in the project and those not
    individuals_of_interest = []
    invalid_individual_ids = []
    with open(individuals_file) as f:
        for line in f:
            line = line.strip('\n')
            if not line or line.startswith("#"):
                continue
            individual_id = line.split("\t")[0]
            if individual_id in all_individual_ids_in_project:
                individuals_of_interest.append(individual_id)
            else:
                invalid_individual_ids.append(individual_id)

    print("Processing %s: %d individuals " % (project_id, len(individuals_of_interest)))
    if invalid_individual_ids:
        num_invalid = len(invalid_individual_ids)
        total_ids = len(all_individual_ids_in_project)
        # %(name)s placeholders are filled from locals()
        sys.exit(("ERROR: %(individuals_file)s: %(num_invalid)s out of %(total_ids)s ids are invalid. \nThe invalid ids are: "
                  "%(invalid_individual_ids)s.\nValid ids are: %(individuals_of_interest)s") % locals())

    # filter
    variant_filter = get_default_variant_filter('moderate_impact')
    # cap allele frequency per reference population at the module-level thresholds
    variant_filter.ref_freqs.append(('1kg_wgs_phase3', g1k_freq_threshold))
    variant_filter.ref_freqs.append(('1kg_wgs_phase3_popmax', g1k_popmax_freq_threshold))
    variant_filter.ref_freqs.append(('exac_v3', exac_freq_threshold))
    variant_filter.ref_freqs.append(('exac_v3_popmax', exac_popmax_threshold))
    variant_filter.ref_freqs.append(('merck-wgs-3793', merck_wgs_3793_threshold))
    quality_filter = {
        'vcf_filter': 'pass',
        'min_gq': GQ_threshold,
        'min_ab': AB_threshold,
    }

    # create individuals_variants.tsv
    individual_variants_f = gzip.open('individuals_in_%s.tsv.gz' % project_id, 'w')
    writer = csv.writer(individual_variants_f, dialect='excel', delimiter='\t')
    header_fields = [
        'project_id',
        'family_id',
        'individual_id',
        'gene',
        'chrom',
        'pos',
        'ref',
        'alt',
        'rsid',
        'annotation',
        '1kg_af',
        '1kg_popmax_af',
        'exac_af',
        'exac_popmax_af',
        'merck_wgs_3793_af',
        'genotype_str',
        'genotype_num_alt',
        'genotype_allele_balance',
        'genotype_AD',
        'genotype_DP',
        'genotype_GQ',
        'genotype_PL',
        'genotype_filter',
    ]
    writer.writerow(header_fields)

    # collect the resources that we'll need here
    annotator = get_annotator()
    custom_population_store = get_custom_population_store()

    individual_counter = 0
    for i, family in enumerate(project.get_families()):
        for individual in family.get_individuals():
            if individual.indiv_id not in individuals_of_interest:
                continue
            individual_counter += 1
            print("%s: %s, individual %s" % (individual_counter, family.family_id, individual.indiv_id))
            for variant in get_variants(get_datastore(project.project_id),
                                        family.xfamily(),
                                        variant_filter = variant_filter,
                                        quality_filter = quality_filter,
                                        indivs_to_consider = [individual.indiv_id]
                                        ):
                genotype = variant.get_genotype(individual.indiv_id)

                # skip no-call, low-depth, and hom-ref genotypes
                if len(genotype.alleles) == 0 or genotype.extras["dp"] < DP_threshold or genotype.num_alt == 0:
                    continue

                custom_populations = custom_population_store.get_frequencies(variant.xpos, variant.ref, variant.alt)

                genotype_str = "/".join(genotype.alleles) if genotype.alleles else "./."

                g1k_freq = variant.annotation['freqs']['1kg_wgs_phase3']
                g1k_popmax_freq = variant.annotation['freqs']['1kg_wgs_phase3_popmax']
                exac_freq = variant.annotation['freqs']['exac_v3']
                exac_popmax_freq = variant.annotation['freqs']['exac_v3_popmax']
                merck_wgs_3793_freq = custom_populations.get('merck-wgs-3793', 0.0)

                # sanity checks: the datastore-side filters should already have enforced these
                assert g1k_freq <= g1k_freq_threshold, "g1k freq %s > %s" % (g1k_freq, g1k_freq_threshold)
                assert g1k_popmax_freq <= g1k_popmax_freq_threshold, "g1k popmax freq %s > %s" % (g1k_popmax_freq, g1k_popmax_freq_threshold)
                assert exac_freq <= exac_freq_threshold, "Exac freq %s > %s" % (exac_freq, exac_freq_threshold)
                assert exac_popmax_freq <= exac_popmax_threshold, "Exac popmax freq %s > %s" % (exac_popmax_freq, exac_popmax_threshold)
                assert merck_wgs_3793_freq <= merck_wgs_3793_threshold

                assert genotype.gq >= GQ_threshold, "%s %s - GQ is %s " % (variant.chr, variant.pos, genotype.gq)
                assert genotype.extras["dp"] >= DP_threshold, "%s %s - GQ is %s " % (variant.chr, variant.pos, genotype.extras["dp"])
                if genotype.num_alt == 1:
                    # allele-balance check only makes sense for hets
                    assert genotype.ab >= AB_threshold/100., "%s %s - AB is %s " % (variant.chr, variant.pos, genotype.ab)
                assert genotype.filter == "pass", "%s %s - filter is %s " % (variant.chr, variant.pos, genotype.filter)

                writer.writerow(map(str, [
                    project_id,
                    family.family_id,
                    individual.indiv_id,
                    get_gene_symbol(variant),
                    variant.chr,
                    variant.pos,
                    variant.ref,
                    variant.alt,
                    variant.vcf_id,
                    variant.annotation['vep_group'],
                    g1k_freq,
                    g1k_popmax_freq,
                    exac_freq,
                    exac_popmax_freq,
                    merck_wgs_3793_freq,
                    genotype_str,
                    genotype.num_alt,
                    genotype.ab,
                    genotype.extras["ad"],
                    genotype.extras["dp"],
                    genotype.gq,
                    genotype.extras["pl"],
                    genotype.filter,
                ]))

    individual_variants_f.flush()
    individual_variants_f.close()
def handle(self, *args, **options):
    """Management command entry point: trigger a load of the annotator's data."""
    annotator = mall.get_annotator()
    annotator.load()
def gene_quicklook(request, project_id, gene_id):
    """
    Summary of a gene in a project

    Multi-project variant: staff (or collaborators) may search additional loaded
    projects via ?selected_projects=...; supports CSV download of knockouts or
    rare variants via ?download=knockouts|rare_variants.
    """
    project = get_object_or_404(Project, project_id=project_id)
    if not project.can_view(request.user):
        return HttpResponse("Unauthorized")
    if project.project_status == Project.NEEDS_MORE_PHENOTYPES and not request.user.is_staff:
        return render(request, 'analysis_unavailable.html', {'reason': 'Awaiting phenotype data.'})

    # other projects this user can view
    if request.user.is_staff:
        other_projects = [p for p in Project.objects.all()]  # if p != project
    else:
        other_projects = [
            c.project
            for c in ProjectCollaborator.objects.filter(user=request.user)
        ]  # if c.project != project

    # keep only projects whose variant collections are actually loaded
    # NOTE(review): relies on Python 2 filter() returning a list (truth-tested below)
    other_projects = filter(
        lambda p: get_project_datastore(p.project_id).
        project_collection_is_loaded(p.project_id), other_projects)

    if other_projects:
        other_projects_json = json.dumps([{
            'project_id': p.project_id,
            'project_name': p.project_name
        } for p in sorted(other_projects, key=lambda p: p.project_id)])
    else:
        other_projects_json = None

    # no gene selected yet: render the empty search page
    if gene_id is None:
        return render(
            request, 'project/gene_quicklook.html', {
                'project': project,
                'gene': None,
                'gene_json': None,
                'rare_variants_json': None,
                'individuals_json': None,
                'knockouts_json': None,
                'other_projects_json': other_projects_json,
            })

    projects_to_search_param = request.GET.get('selected_projects')
    if projects_to_search_param:
        # every selected project must be viewable by this user
        projects_to_search = []
        project_ids = projects_to_search_param.split(",")
        for project_id in project_ids:
            project = get_object_or_404(Project, project_id=project_id)
            if not project.can_view(request.user):
                return HttpResponse("Unauthorized")
            projects_to_search.append(project)
    else:
        projects_to_search = [project]

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)
    sys.stderr.write(
        project_id + " - staring gene search for: %s in projects: %s\n" %
        (gene_id, ",".join([p.project_id for p in projects_to_search]) + "\n"))

    # all rare coding variants
    variant_filter = get_default_variant_filter(
        'all_coding',
        mall.get_annotator().reference_population_slugs)

    indiv_id_to_project_id = {}
    rare_variant_dict = {}
    rare_variants = []
    for project in projects_to_search:
        project_variants = []
        for variant in project_analysis.get_variants_in_gene(
                project, gene_id, variant_filter=variant_filter):
            max_af = max(variant.annotation['freqs'].values())
            # skip variants no one in the project actually carries
            if not any([
                    indiv_id
                    for indiv_id, genotype in variant.genotypes.items()
                    if genotype.num_alt > 0
            ]):
                continue
            if max_af >= .01:
                continue

            # add project id to genotypes
            for indiv_id in variant.genotypes:
                indiv_id_to_project_id[indiv_id] = project.project_id

            # save this variant (or just the genotypes from this variant if the variant if it's been seen already in another project)
            variant_id = "%s-%s-%s-%s" % (variant.chr, variant.pos, variant.ref, variant.alt)
            if variant_id not in rare_variant_dict:
                rare_variant_dict[variant_id] = variant
                project_variants.append(variant)
            else:
                rare_variant_dict[variant_id].genotypes.update(variant.genotypes)

            #sys.stderr.write("gene_id: %s, variant: %s\n" % (gene_id, variant.toJSON()['annotation']['vep_annotation']))
        add_extra_info_to_variants_project(get_reference(), project, project_variants)
        rare_variants.extend(project_variants)

    sys.stderr.write("Retreived %s rare variants\n" % len(rare_variants))

    # compute knockout individuals
    individ_ids_and_variants = []
    for project in projects_to_search:
        knockout_ids, variation = get_knockouts_in_gene(project, gene_id)
        for indiv_id in knockout_ids:
            variants = variation.get_relevant_variants_for_indiv_ids([indiv_id])
            add_extra_info_to_variants_project(get_reference(), project, variants)
            individ_ids_and_variants.append({
                'indiv_id': indiv_id,
                'variants': variants,
            })
            #sys.stderr.write("%s : %s: Retrieved %s knockout variants\n" % (project.project_id, indiv_id, len(variants), ))

    download_csv = request.GET.get('download', '')
    if download_csv:
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="{}_{}.csv"'.format(
            download_csv, gene["transcript_name"])

        if download_csv == 'knockouts':
            # columns: one genotype column per knockout individual
            individuals_to_include = [
                individ_id_and_variants["indiv_id"]
                for individ_id_and_variants in individ_ids_and_variants
            ]
            rows = []
            for individ_id_and_variants in individ_ids_and_variants:
                rare_variants = individ_id_and_variants["variants"]
                for variant in rare_variants:
                    worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"][gene_id]
                    worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]
                    genotypes = []
                    all_genotypes_string = ""
                    for indiv_id in individuals_to_include:
                        if indiv_id in variant.genotypes and variant.genotypes[indiv_id].num_alt > 0:
                            genotype = variant.genotypes[indiv_id]
                            allele_string = ">".join(genotype.alleles)
                            all_genotypes_string += indiv_id + ":" + allele_string + " "
                            genotypes.append(allele_string + " (" + str(genotype.gq) + ")")
                        else:
                            genotypes.append("")

                    measureset_id, clinvar_significance = settings.CLINVAR_VARIANTS.get(
                        variant.unique_tuple(), ("", ""))
                    rows.append(
                        map(str, [
                            gene["symbol"],
                            variant.chr,
                            variant.pos,
                            variant.ref,
                            variant.alt,
                            variant.vcf_id or "",
                            variant.annotation.get("vep_consequence", ""),
                            worst_annotation.get("hgvsc", ""),
                            # VEP url-encodes '=' in hgvsp as %3D
                            worst_annotation.get("hgvsp", "").replace("%3D", "="),
                            worst_annotation.get("sift", ""),
                            worst_annotation.get("polyphen", ""),
                            worst_annotation.get("mutationtaster_pred", ""),
                            ";".join(set(worst_annotation.get("fathmm_pred", "").split('%3B'))),
                            measureset_id,
                            clinvar_significance,
                            variant.annotation["freqs"].get("1kg_wgs_phase3", ""),
                            variant.annotation["freqs"].get("1kg_wgs_phase3_popmax", ""),
                            variant.annotation["freqs"].get("exac_v3", ""),
                            variant.annotation["freqs"].get("exac_v3_popmax", ""),
                            all_genotypes_string,
                        ] + genotypes))
        elif download_csv == 'rare_variants':
            # columns: one genotype column per carrier of any rare variant
            individuals_to_include = []
            for variant in rare_variants:
                for indiv_id, genotype in variant.genotypes.items():
                    if genotype.num_alt > 0 and indiv_id not in individuals_to_include:
                        individuals_to_include.append(indiv_id)
            rows = []
            for variant in rare_variants:
                worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"][gene_id]
                worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]
                genotypes = []
                all_genotypes_string = ""
                for indiv_id in individuals_to_include:
                    if indiv_id in variant.genotypes and variant.genotypes[indiv_id].num_alt > 0:
                        genotype = variant.genotypes[indiv_id]
                        allele_string = ">".join(genotype.alleles)
                        all_genotypes_string += indiv_id + ":" + allele_string + " "
                        genotypes.append(allele_string + " (" + str(genotype.gq) + ")")
                    else:
                        genotypes.append("")

                measureset_id, clinvar_significance = settings.CLINVAR_VARIANTS.get(
                    variant.unique_tuple(), ("", ""))
                rows.append(
                    map(str, [
                        gene["symbol"],
                        variant.chr,
                        variant.pos,
                        variant.ref,
                        variant.alt,
                        variant.vcf_id or "",
                        variant.annotation.get("vep_consequence", ""),
                        worst_annotation.get("hgvsc", ""),
                        worst_annotation.get("hgvsp", "").replace("%3D", "="),
                        worst_annotation.get("sift", ""),
                        worst_annotation.get("polyphen", ""),
                        worst_annotation.get("mutationtaster_pred", ""),
                        ";".join(set(worst_annotation.get("fathmm_pred", "").split('%3B'))),
                        measureset_id,
                        clinvar_significance,
                        variant.annotation["freqs"].get("1kg_wgs_phase3", ""),
                        variant.annotation["freqs"].get("1kg_wgs_phase3_popmax", ""),
                        variant.annotation["freqs"].get("exac_v3", ""),
                        variant.annotation["freqs"].get("exac_v3_popmax", ""),
                        all_genotypes_string,
                    ] + genotypes))

        # per-individual columns are labelled with their source project
        header = [
            "gene", "chr", "pos", "ref", "alt", "rsID", "impact", "HGVS.c",
            "HGVS.p", "sift", "polyphen", "muttaster", "fathmm", "clinvar_id",
            "clinvar_clinical_sig", "freq_1kg_wgs_phase3",
            "freq_1kg_wgs_phase3_popmax", "freq_exac_v3",
            "freq_exac_v3_popmax", "all_genotypes"
        ] + list(
            map(lambda i: i + " (from %s)" % indiv_id_to_project_id[i],
                individuals_to_include))

        writer = csv.writer(response)
        writer.writerow(header)
        for row in rows:
            writer.writerow(row)
        return response
    else:
        # JSON page render: serialize the knockout variant objects in place
        for individ_id_and_variants in individ_ids_and_variants:
            variants = individ_id_and_variants["variants"]
            individ_id_and_variants["variants"] = [v.toJSON() for v in variants]

        return render(
            request, 'project/gene_quicklook.html', {
                'gene': gene,
                'gene_json': json.dumps(gene),
                'project': project,
                'rare_variants_json':
                json.dumps([v.toJSON() for v in rare_variants]),
                'individuals_json':
                json.dumps([
                    i.get_json_obj() for project in projects_to_search
                    for i in project.get_individuals()
                ]),
                'knockouts_json': json.dumps(individ_ids_and_variants),
                'other_projects_json': other_projects_json,
            })
def handle(self, *args, **options):
    """Management command: write a gzipped TSV of variants per family for each
    inheritance mode in one project (args[0] = project_id), with up to 10
    genotype column groups per row.
    """
    if not args:
        sys.exit("ERROR: please specify project id on the command line")
    if len(args) > 1:
        sys.exit("ERROR: too many args: %s. Only one project id should be provided." % " ".join(args) )
    project_id = args[0]

    # create family_variants.tsv
    family_variants_f = gzip.open('family_variants_%s.tsv.gz' % project_id, 'w')
    writer = csv.writer(family_variants_f, dialect='excel', delimiter='\t')

    header_fields = [
        '#inheritance_mode',
        'project_id',
        'family_id',
        'gene',
        'chrom',
        'pos',
        'ref',
        'alt',
        'rsid',
        'annotation',
        '1kg_af',
        '1kg_popmax_af',
        'exac_af',
        'exac_popmax_af',
        '',
    ]
    genotype_headers = [
        'sample_id',
        'str',
        'num_alt',
        'allele_balance',
        'AD',
        'DP',
        'GQ',
        'PL',
    ]
    # up to 10 individuals per family get their own genotype column group
    for i in range(0, 10):
        for h in genotype_headers:
            header_fields.append("genotype%d_%s" % (i, h))
    writer.writerow(header_fields)
    family_variants_f.flush()

    for inheritance_mode in ['dominant', 'homozygous_recessive', 'compound_het', 'de_novo', 'x_linked_recessive']:
        # collect the resources that we'll need here
        annotator = mall.get_annotator()
        custom_population_store = mall.get_custom_population_store()

        project = Project.objects.get(project_id=project_id)
        families = project.get_families()

        # get the variants for this inheritance / project combination
        for i, (family, variant_list) in enumerate(get_variants_for_inheritance_for_project(project, inheritance_mode)):
            for variant in variant_list:
                #if variant.annotation['vep_group'] != "missense":
                #    continue
                custom_populations = custom_population_store.get_frequencies(variant.xpos, variant.ref, variant.alt)

                g1k_freq = variant.annotation['freqs']['1kg_wgs_phase3']
                g1k_popmax_freq = variant.annotation['freqs']['1kg_wgs_phase3_popmax']
                exac_freq = variant.annotation['freqs']['exac_v3']
                exac_popmax_freq = variant.annotation['freqs']['exac_v3_popmax']

                # sanity checks against the module-level frequency thresholds
                assert g1k_freq <= g1k_freq_threshold, "g1k freq %s > %s" % (g1k_freq, g1k_freq_threshold)
                assert g1k_popmax_freq <= g1k_popmax_freq_threshold, "g1k freq %s > %s" % (g1k_popmax_freq, g1k_popmax_freq_threshold)
                assert exac_freq <= exac_freq_threshold, "Exac freq %s > %s" % (exac_freq, exac_freq_threshold)
                assert exac_popmax_freq <= exac_popmax_threshold, "Exac popmax freq %s > %s" % (exac_popmax_freq, exac_popmax_threshold)

                row = [
                    inheritance_mode,
                    project_id,
                    family.family_id,
                    get_gene_symbol(variant),
                    variant.chr,
                    str(variant.pos),
                    variant.ref,
                    variant.alt,
                    variant.vcf_id,
                    variant.annotation['vep_group'],
                    g1k_freq,
                    g1k_popmax_freq,
                    exac_freq,
                    exac_popmax_freq,
                    '',
                ]
                for i, individual in enumerate(family.get_individuals()):
                    if i >= 10:
                        # only the first 10 individuals fit the fixed header
                        break

                    genotype = variant.get_genotype(individual.indiv_id)
                    if genotype is None:
                        print("WARNING: %s variant genotype for %s is None" % (variant, individual.indiv_id))
                        continue
                    assert genotype.filter == "pass", "%s %s - filter is %s " % (variant.chr, variant.pos, genotype.filter)
                    assert genotype.gq >= GQ_threshold, "%s %s - GQ is %s " % (variant.chr, variant.pos, genotype.gq)
                    assert genotype.extras["dp"] >= DP_threshold, "%s %s - GQ is %s " % (variant.chr, variant.pos, genotype.extras["dp"])
                    if genotype.num_alt == 1:
                        # allele-balance check only applies to het calls
                        assert genotype.ab >= AB_threshold/100., "%s %s - AB is %s " % (variant.chr, variant.pos, genotype.ab)
                    genotype_str = "/".join(genotype.alleles) if genotype.alleles else "./."

                    row.extend([
                        individual.indiv_id,
                        genotype_str,
                        genotype.num_alt,
                        genotype.ab,
                        genotype.extras["ad"],
                        genotype.extras["dp"],
                        genotype.gq,
                        genotype.extras["pl"],])

                writer.writerow(row)

    family_variants_f.flush()
    family_variants_f.close()
def gene_quicklook(request, project_id, gene_id):
    """
    Summary of a gene in a project

    Single-project variant: shows rare coding variants and knockout individuals
    for one gene; supports CSV download via ?download=knockouts|rare_variants.
    """
    project = get_object_or_404(Project, project_id=project_id)
    if not project.can_view(request.user):
        return HttpResponse("Unauthorized")
    if project.project_status == Project.NEEDS_MORE_PHENOTYPES and not request.user.is_staff:
        return render(request, 'analysis_unavailable.html', {'reason': 'Awaiting phenotype data.'})

    # no gene selected yet: render the empty search page
    if gene_id is None:
        return render(
            request, 'project/gene_quicklook.html', {
                'project': project,
                'gene': None,
                'gene_json': None,
                'rare_variants_json': None,
                'individuals_json': None,
                'knockouts_json': None,
            })

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)
    sys.stderr.write(project_id + " - staring gene search for: %s %s \n" % (gene_id, gene))

    # all rare coding variants
    variant_filter = get_default_variant_filter(
        'all_coding',
        mall.get_annotator().reference_population_slugs)

    rare_variants = []
    for variant in project_analysis.get_variants_in_gene(
            project, gene_id, variant_filter=variant_filter):
        max_af = max(variant.annotation['freqs'].values())
        # skip variants that no individual in the project actually carries
        if not any([
                indiv_id for indiv_id, genotype in variant.genotypes.items()
                if genotype.num_alt > 0
        ]):
            continue
        if max_af < .01:
            rare_variants.append(variant)
        #sys.stderr.write("gene_id: %s, variant: %s\n" % (gene_id, variant.toJSON()['annotation']['vep_annotation']))
    add_extra_info_to_variants_project(get_reference(), project, rare_variants)

    # compute knockout individuals
    individ_ids_and_variants = []
    knockout_ids, variation = get_knockouts_in_gene(project, gene_id)
    for indiv_id in knockout_ids:
        variants = variation.get_relevant_variants_for_indiv_ids([indiv_id])
        add_extra_info_to_variants_project(get_reference(), project, variants)
        individ_ids_and_variants.append({
            'indiv_id': indiv_id,
            'variants': variants,
        })

    sys.stderr.write(
        "Project-wide gene search retrieved %s rare variants for gene: %s \n" %
        (len(rare_variants), gene_id))

    download_csv = request.GET.get('download', '')
    if download_csv:
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="{}_{}.csv"'.format(
            download_csv, gene["transcript_name"])

        if download_csv == 'knockouts':
            # one genotype column per knockout individual
            individuals_to_include = [
                individ_id_and_variants["indiv_id"]
                for individ_id_and_variants in individ_ids_and_variants
            ]
            rows = []
            for individ_id_and_variants in individ_ids_and_variants:
                rare_variants = individ_id_and_variants["variants"]
                for variant in rare_variants:
                    worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"][gene_id]
                    worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]
                    genotypes = []
                    all_genotypes_string = ""
                    for indiv_id in individuals_to_include:
                        # NOTE(review): unlike the multi-project version, this indexes
                        # variant.genotypes[indiv_id] without a membership check — a
                        # missing genotype would raise KeyError; confirm all knockout
                        # individuals always have genotypes here.
                        genotype = variant.genotypes[indiv_id]
                        allele_string = ">".join(genotype.alleles)
                        all_genotypes_string += indiv_id + ":" + allele_string + " "
                        if genotype.num_alt > 0:
                            genotypes.append(allele_string + " (" + str(genotype.gq) + ")")
                        else:
                            genotypes.append("")
                    measureset_id, clinvar_significance = settings.CLINVAR_VARIANTS.get(
                        variant.unique_tuple(), ("", ""))
                    rows.append(
                        map(str, [
                            gene["symbol"],
                            variant.chr,
                            variant.pos,
                            variant.ref,
                            variant.alt,
                            variant.vcf_id or "",
                            variant.annotation.get("vep_consequence", ""),
                            worst_annotation.get("hgvsc", ""),
                            # VEP url-encodes '=' in hgvsp as %3D
                            worst_annotation.get("hgvsp", "").replace("%3D", "="),
                            worst_annotation.get("sift", ""),
                            worst_annotation.get("polyphen", ""),
                            worst_annotation.get("mutationtaster_pred", ""),
                            ";".join(set(worst_annotation.get("fathmm_pred", "").split('%3B'))),
                            measureset_id,
                            clinvar_significance,
                            variant.annotation["freqs"].get("1kg_wgs_phase3", ""),
                            variant.annotation["freqs"].get("1kg_wgs_phase3_popmax", ""),
                            variant.annotation["freqs"].get("exac_v3", ""),
                            variant.annotation["freqs"].get("exac_v3_popmax", ""),
                            all_genotypes_string,
                        ] + genotypes))
        elif download_csv == 'rare_variants':
            # one genotype column per carrier of any rare variant
            individuals_to_include = []
            for variant in rare_variants:
                for indiv_id, genotype in variant.genotypes.items():
                    if genotype.num_alt > 0 and indiv_id not in individuals_to_include:
                        individuals_to_include.append(indiv_id)
            rows = []
            for variant in rare_variants:
                worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"][gene_id]
                worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]
                genotypes = []
                all_genotypes_string = ""
                for indiv_id in individuals_to_include:
                    genotype = variant.genotypes[indiv_id]
                    allele_string = ">".join(genotype.alleles)
                    all_genotypes_string += indiv_id + ":" + allele_string + " "
                    if genotype.num_alt > 0:
                        genotypes.append(allele_string + " (" + str(genotype.gq) + ")")
                    else:
                        genotypes.append("")
                measureset_id, clinvar_significance = settings.CLINVAR_VARIANTS.get(
                    variant.unique_tuple(), ("", ""))
                rows.append(
                    map(str, [
                        gene["symbol"],
                        variant.chr,
                        variant.pos,
                        variant.ref,
                        variant.alt,
                        variant.vcf_id or "",
                        variant.annotation.get("vep_consequence", ""),
                        worst_annotation.get("hgvsc", ""),
                        worst_annotation.get("hgvsp", "").replace("%3D", "="),
                        worst_annotation.get("sift", ""),
                        worst_annotation.get("polyphen", ""),
                        worst_annotation.get("mutationtaster_pred", ""),
                        ";".join(set(worst_annotation.get("fathmm_pred", "").split('%3B'))),
                        measureset_id,
                        clinvar_significance,
                        variant.annotation["freqs"].get("1kg_wgs_phase3", ""),
                        variant.annotation["freqs"].get("1kg_wgs_phase3_popmax", ""),
                        variant.annotation["freqs"].get("exac_v3", ""),
                        variant.annotation["freqs"].get("exac_v3_popmax", ""),
                        all_genotypes_string,
                    ] + genotypes))

        header = [
            "gene", "chr", "pos", "ref", "alt", "rsID", "impact", "HGVS.c",
            "HGVS.p", "sift", "polyphen", "muttaster", "fathmm", "clinvar_id",
            "clinvar_clinical_sig", "freq_1kg_wgs_phase3",
            "freq_1kg_wgs_phase3_popmax", "freq_exac_v3",
            "freq_exac_v3_popmax", "all_genotypes"
        ] + individuals_to_include

        writer = csv.writer(response)
        writer.writerow(header)
        for row in rows:
            writer.writerow(row)
        return response
    else:
        # JSON page render: serialize the knockout variant objects in place
        for individ_id_and_variants in individ_ids_and_variants:
            variants = individ_id_and_variants["variants"]
            individ_id_and_variants["variants"] = [v.toJSON() for v in variants]

        return render(
            request, 'project/gene_quicklook.html', {
                'gene': gene,
                'gene_json': json.dumps(gene),
                'project': project,
                'rare_variants_json':
                json.dumps([v.toJSON() for v in rare_variants]),
                'individuals_json':
                json.dumps(
                    [i.get_json_obj() for i in project.get_individuals()]),
                'knockouts_json': json.dumps(individ_ids_and_variants),
            })
def handle(self, *args, **options):
    """Management command: write a gzipped TSV of rare, high-quality, moderate-impact
    variant genotypes for a listed subset of individuals in one project.

    args[0] = project_id; args[1] = path to a file of individual ids (first
    tab-separated column, '#' comments and blank lines ignored).
    """
    if len(args) != 2:
        sys.exit(
            "ERROR: please specify the project_id and file of individual ids as command line args."
        )
    project_id = args[0]
    individuals_file = args[1]

    # init objects
    project = Project.objects.get(project_id=project_id)
    all_individual_ids_in_project = set(
        [i.indiv_id for i in project.get_individuals()])

    # split the listed ids into those present in the project and those not
    individuals_of_interest = []
    invalid_individual_ids = []
    with open(individuals_file) as f:
        for line in f:
            line = line.strip('\n')
            if not line or line.startswith("#"):
                continue
            individual_id = line.split("\t")[0]
            if individual_id in all_individual_ids_in_project:
                individuals_of_interest.append(individual_id)
            else:
                invalid_individual_ids.append(individual_id)

    print("Processing %s: %d individuals " % (project_id, len(individuals_of_interest)))
    if invalid_individual_ids:
        num_invalid = len(invalid_individual_ids)
        total_ids = len(all_individual_ids_in_project)
        # %(name)s placeholders are filled from locals()
        sys.exit((
            "ERROR: %(individuals_file)s: %(num_invalid)s out of %(total_ids)s ids are invalid. \nThe invalid ids are: "
            "%(invalid_individual_ids)s.\nValid ids are: %(individuals_of_interest)s"
        ) % locals())

    # filter
    variant_filter = get_default_variant_filter('moderate_impact')
    # cap allele frequency per reference population at the module-level thresholds
    variant_filter.ref_freqs.append(('1kg_wgs_phase3', g1k_freq_threshold))
    variant_filter.ref_freqs.append(
        ('1kg_wgs_phase3_popmax', g1k_popmax_freq_threshold))
    variant_filter.ref_freqs.append(('exac_v3', exac_freq_threshold))
    variant_filter.ref_freqs.append(
        ('exac_v3_popmax', exac_popmax_threshold))
    variant_filter.ref_freqs.append(
        ('merck-wgs-3793', merck_wgs_3793_threshold))
    quality_filter = {
        'vcf_filter': 'pass',
        'min_gq': GQ_threshold,
        'min_ab': AB_threshold,
    }

    # create individuals_variants.tsv
    individual_variants_f = gzip.open(
        'individuals_in_%s.tsv.gz' % project_id, 'w')
    writer = csv.writer(individual_variants_f, dialect='excel', delimiter='\t')
    header_fields = [
        'project_id',
        'family_id',
        'individual_id',
        'gene',
        'chrom',
        'pos',
        'ref',
        'alt',
        'rsid',
        'annotation',
        '1kg_af',
        '1kg_popmax_af',
        'exac_af',
        'exac_popmax_af',
        'merck_wgs_3793_af',
        'genotype_str',
        'genotype_num_alt',
        'genotype_allele_balance',
        'genotype_AD',
        'genotype_DP',
        'genotype_GQ',
        'genotype_PL',
        'genotype_filter',
    ]
    writer.writerow(header_fields)

    # collect the resources that we'll need here
    annotator = get_annotator()
    custom_population_store = get_custom_population_store()

    individual_counter = 0
    for i, family in enumerate(project.get_families()):
        for individual in family.get_individuals():
            if individual.indiv_id not in individuals_of_interest:
                continue
            individual_counter += 1
            print("%s: %s, individual %s" %
                  (individual_counter, family.family_id, individual.indiv_id))
            for variant in get_variants(
                    get_datastore(project.project_id),
                    family.xfamily(),
                    variant_filter=variant_filter,
                    quality_filter=quality_filter,
                    indivs_to_consider=[individual.indiv_id]):
                genotype = variant.get_genotype(individual.indiv_id)

                # skip no-call, low-depth, and hom-ref genotypes
                if len(genotype.alleles) == 0 or genotype.extras[
                        "dp"] < DP_threshold or genotype.num_alt == 0:
                    continue

                custom_populations = custom_population_store.get_frequencies(
                    variant.xpos, variant.ref, variant.alt)

                genotype_str = "/".join(
                    genotype.alleles) if genotype.alleles else "./."

                g1k_freq = variant.annotation['freqs']['1kg_wgs_phase3']
                g1k_popmax_freq = variant.annotation['freqs'][
                    '1kg_wgs_phase3_popmax']
                exac_freq = variant.annotation['freqs']['exac_v3']
                exac_popmax_freq = variant.annotation['freqs'][
                    'exac_v3_popmax']
                merck_wgs_3793_freq = custom_populations.get(
                    'merck-wgs-3793', 0.0)

                # sanity checks: the datastore-side filters should already have enforced these
                assert g1k_freq <= g1k_freq_threshold, "g1k freq %s > %s" % (
                    g1k_freq, g1k_freq_threshold)
                assert g1k_popmax_freq <= g1k_popmax_freq_threshold, "g1k popmax freq %s > %s" % (
                    g1k_popmax_freq, g1k_popmax_freq_threshold)
                assert exac_freq <= exac_freq_threshold, "Exac freq %s > %s" % (
                    exac_freq, exac_freq_threshold)
                assert exac_popmax_freq <= exac_popmax_threshold, "Exac popmax freq %s > %s" % (
                    exac_popmax_freq, exac_popmax_threshold)
                assert merck_wgs_3793_freq <= merck_wgs_3793_threshold

                assert genotype.gq >= GQ_threshold, "%s %s - GQ is %s " % (
                    variant.chr, variant.pos, genotype.gq)
                assert genotype.extras[
                    "dp"] >= DP_threshold, "%s %s - GQ is %s " % (
                        variant.chr, variant.pos, genotype.extras["dp"])
                if genotype.num_alt == 1:
                    # allele-balance check only makes sense for hets
                    assert genotype.ab >= AB_threshold / 100., "%s %s - AB is %s " % (
                        variant.chr, variant.pos, genotype.ab)
                assert genotype.filter == "pass", "%s %s - filter is %s " % (
                    variant.chr, variant.pos, genotype.filter)

                writer.writerow(
                    map(str, [
                        project_id,
                        family.family_id,
                        individual.indiv_id,
                        get_gene_symbol(variant),
                        variant.chr,
                        variant.pos,
                        variant.ref,
                        variant.alt,
                        variant.vcf_id,
                        variant.annotation['vep_group'],
                        g1k_freq,
                        g1k_popmax_freq,
                        exac_freq,
                        exac_popmax_freq,
                        merck_wgs_3793_freq,
                        genotype_str,
                        genotype.num_alt,
                        genotype.ab,
                        genotype.extras["ad"],
                        genotype.extras["dp"],
                        genotype.gq,
                        genotype.extras["pl"],
                        genotype.filter,
                    ]))

    individual_variants_f.flush()
    individual_variants_f.close()
def handle_project(project_id):
    """Write a gzipped TSV report of filtered family variants for one project.

    For every inheritance mode, fetches each family's variant results,
    records population frequencies and per-individual genotype fields
    (up to 10 individuals per family), and writes one row per variant to
    ``family_variants_<project_id>.tsv.gz``.

    Relies on module-level thresholds (GQ_threshold, DP_threshold,
    AB_threshold, *_freq_threshold) and helpers (mall, Project,
    get_variants_for_inheritance_for_project, CLINVAR_VARIANTS,
    get_gene_symbol) defined elsewhere in this file.
    """
    filename = 'family_variants_%s.tsv.gz' % project_id
    print("Generating report: " + filename)
    # create family_variants.tsv
    family_variants_f = gzip.open(filename, 'w')
    writer = csv.writer(family_variants_f, dialect='excel', delimiter='\t')
    header_fields = [
        '#inheritance_mode',
        'project_id',
        'family_id',
        'gene',
        'chrom',
        'pos',
        'ref',
        'alt',
        'rsid',
        'filter',
        'clinvar_status',
        'annotation',
        '1kg_af',
        '1kg_popmax_af',
        'exac_af',
        'exac_popmax_af',
        'merck_wgs_3793_af',
        'merck_wgs_144_af',
        'multiallelic_site_alt_alleles (* = spanning deletion)',
        '',
    ]
    genotype_headers = [
        'sample_id',
        'str',
        'num_alt',
        'allele_balance',
        'AD',
        'DP',
        'GQ',
        'PL',
    ]
    # Repeat the genotype column group for up to 10 individuals per family.
    for i in range(0, 10):
        for h in genotype_headers:
            header_fields.append("genotype%d_%s" % (i, h))
    writer.writerow(header_fields)
    for inheritance_mode in ['homozygous_recessive', 'dominant', 'compound_het', 'de_novo', 'x_linked_recessive', 'all_variants']:
        # collect the resources that we'll need here
        annotator = mall.get_annotator()
        custom_population_store = mall.get_custom_population_store()
        project = Project.objects.get(project_id=project_id)
        # get the variants for this inheritance / project combination
        for i, (family, family_results) in enumerate(get_variants_for_inheritance_for_project(project, inheritance_mode)):
            for variant in family_results:
                custom_populations = custom_population_store.get_frequencies(variant.xpos, variant.ref, variant.alt)
                g1k_freq = variant.annotation['freqs']['1kg_wgs_phase3']
                g1k_popmax_freq = variant.annotation['freqs']['1kg_wgs_phase3_popmax']
                exac_freq = variant.annotation['freqs']['exac_v3']
                exac_popmax_freq = variant.annotation['freqs']['exac_v3_popmax']
                merck_wgs_3793_freq = custom_populations.get('merck-wgs-3793', 0.0)
                merck_wgs_144_freq = custom_populations.get('merck-pcr-free-wgs-144', 0.0)
                # Sanity checks only: assertion failures are printed, not fatal.
                try:
                    assert g1k_freq <= g1k_freq_threshold, "g1k freq %s > %s" % (g1k_freq, g1k_freq_threshold)
                    # NOTE(review): message says "g1k freq" but this checks the popmax frequency.
                    assert g1k_popmax_freq <= g1k_popmax_freq_threshold, "g1k freq %s > %s" % (g1k_popmax_freq, g1k_popmax_freq_threshold)
                    assert exac_freq <= exac_freq_threshold, "Exac freq %s > %s" % (exac_freq, exac_freq_threshold)
                    assert exac_popmax_freq <= exac_popmax_threshold, "Exac popmax freq %s > %s" % (exac_popmax_freq, exac_popmax_threshold)
                    #assert merck_wgs_3793_freq <= merck_wgs_3793_threshold, "Merck WGS 3793 threshold %s > %s" % (merck_wgs_3793_freq, merck_wgs_3793_threshold)
                    #assert merck_wgs_144_freq <= merck_wgs_144_threshold, "Merck PCR free 144 threshold %s > %s" % (merck_wgs_144_freq, merck_wgs_144_threshold)
                except AssertionError as e:
                    import traceback
                    traceback.print_exc()
                # filter value is stored in the genotypes
                if len(family.get_individuals()) == 0:
                    print("Family has 0 individuals: %s - skipping..." % str(family))
                    continue
                # NOTE(review): unguarded attribute access — raises AttributeError if
                # get_genotype() returns None; a sibling version of this function
                # None-checks this lookup. Confirm whether None is possible here.
                filter_value = variant.get_genotype(family.get_individuals()[0].indiv_id).filter
                multiallelic_site_other_alleles = []
                if len(variant.extras['orig_alt_alleles']) > 1:
                    multiallelic_site_other_alleles = variant.extras['orig_alt_alleles']
                clinvar_significance = CLINVAR_VARIANTS.get(variant.unique_tuple(), [""])[-1]
                row = [
                    inheritance_mode,
                    project_id,
                    family.family_id,
                    get_gene_symbol(variant),
                    variant.chr,
                    str(variant.pos),
                    variant.ref,
                    variant.alt,
                    variant.vcf_id,
                    filter_value,
                    clinvar_significance,
                    variant.annotation['vep_group'],
                    g1k_freq,
                    g1k_popmax_freq,
                    exac_freq,
                    exac_popmax_freq,
                    merck_wgs_3793_freq,
                    merck_wgs_144_freq,
                    ", ".join(multiallelic_site_other_alleles),
                    '',
                ]
                # Append per-individual genotype columns (cap at 10 individuals,
                # matching the header layout above).
                for i, individual in enumerate(family.get_individuals()):
                    if i >= 10:
                        break
                    genotype = variant.get_genotype(individual.indiv_id)
                    if genotype is None:
                        # No call for this individual: emit placeholder columns.
                        row.extend([individual.indiv_id, "./.", "", "", "", "", "", ""])
                        continue
                    else:
                        #assert genotype.filter == "pass", "%s %s - filter is %s " % (variant.chr, variant.pos, genotype.filter)
                        try:
                            assert genotype.gq >= GQ_threshold, "%s %s - GQ is %s " % (variant.chr, variant.pos, genotype.gq)
                            # NOTE(review): message says "GQ is" but this checks DP.
                            assert genotype.extras["dp"] >= DP_threshold, "%s %s - GQ is %s " % (variant.chr, variant.pos, genotype.extras["dp"])
                            if genotype.num_alt == 1:
                                assert genotype.ab is None or genotype.ab >= AB_threshold/100., "%s %s - AB is %s " % (variant.chr, variant.pos, genotype.ab)
                        except AssertionError as e:
                            import traceback
                            traceback.print_exc()
                        genotype_str = "/".join(genotype.alleles) if genotype.alleles else "./."
                        row.extend([
                            individual.indiv_id,
                            genotype_str,
                            genotype.num_alt,
                            genotype.ab if genotype.ab is not None else '',
                            genotype.extras["ad"],
                            genotype.extras["dp"],
                            genotype.gq,
                            genotype.extras["pl"],
                        ])
                writer.writerow(row)
    family_variants_f.flush()
    family_variants_f.close()
    print("Done with " + filename)
def gene_quicklook(request, project_id, gene_id):
    """
    Summary of a gene in a project.

    Renders the gene quicklook page with rare coding variants and knockout
    individuals across the selected project(s), or — when ``?download=``
    is set to ``knockouts`` or ``rare_variants`` — streams the same data
    as a CSV attachment.
    """
    main_project = get_object_or_404(Project, project_id=project_id)
    if not main_project.can_view(request.user):
        return HttpResponse("Unauthorized")

    # other projects this user can view
    other_projects = get_loaded_projects_for_user(
        request.user, fields=['project_id', 'project_name'])

    if other_projects:
        other_projects_json = json.dumps([{
            'project_id': p.project_id,
            'project_name': p.project_name
        } for p in sorted(other_projects, key=lambda p: p.project_id.lower())])
    else:
        other_projects_json = None

    if gene_id is None:
        # No gene selected yet: render the empty search page.
        return render(
            request, 'project/gene_quicklook.html', {
                'project': main_project,
                'gene': None,
                'gene_json': None,
                'rare_variants_json': None,
                'individuals_json': None,
                'knockouts_json': None,
                'other_projects_json': other_projects_json,
            })

    projects_to_search_param = request.GET.get('selected_projects')
    if projects_to_search_param:
        project_ids = projects_to_search_param.split(",")
        projects_to_search = [
            project for project in other_projects
            if project.project_id in project_ids
        ]
        if len(projects_to_search) < len(project_ids):
            # If not all the specified project ids are in the other projects list then they are not authorized
            return HttpResponse("Unauthorized")
    else:
        project_ids = [main_project.project_id]
        projects_to_search = [main_project]

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)
    sys.stderr.write(
        project_id + " - staring gene search for: %s in projects: %s\n" %
        (gene_id,
         ",".join([p.project_id for p in projects_to_search]) + "\n"))

    # all rare coding variants
    variant_filter = get_default_variant_filter(
        'all_coding',
        mall.get_annotator().reference_population_slugs)

    indiv_id_to_project_id = {}
    rare_variant_dict = {}
    rare_variants = []
    individ_ids_and_variants = []
    for project in projects_to_search:
        all_project_variants = project_analysis.get_variants_in_gene(
            project, gene_id, variant_filter=variant_filter)

        # compute knockout individuals
        knockout_ids, variation = get_knockouts_in_gene(
            project, gene_id, all_project_variants)
        for indiv_id in knockout_ids:
            variants = variation.get_relevant_variants_for_indiv_ids(
                [indiv_id])
            individ_ids_and_variants.append({
                'indiv_id': indiv_id,
                'variants': variants,
            })

        # compute rare variants
        project_variants = []
        for i, variant in enumerate(all_project_variants):
            max_af = max([
                freq for label, freq in variant.annotation['freqs'].items()
                if label != "AF"
            ])  # don't filter on within-cohort AF
            # skip variants with no alt genotypes in this project
            if not any([
                    indiv_id
                    for indiv_id, genotype in variant.genotypes.items()
                    if genotype.num_alt > 0
            ]):
                continue
            if max_af >= .01:
                continue

            # add project id to genotypes
            for indiv_id in variant.genotypes:
                indiv_id_to_project_id[indiv_id] = project.project_id

            # save this variant (or just the genotypes from this variant if the variant if it's been seen already in another project)
            variant_id = "%s-%s-%s-%s" % (variant.chr, variant.pos,
                                          variant.ref, variant.alt)
            if variant_id not in rare_variant_dict:
                rare_variant_dict[variant_id] = variant
                project_variants.append(variant)
            else:
                rare_variant_dict[variant_id].genotypes.update(
                    variant.genotypes)

        rare_variants.extend(project_variants)

    # knockout variants + rare variants, concatenated into one list
    all_variants = sum([i['variants'] for i in individ_ids_and_variants],
                       rare_variants)
    # NOTE(review): 'project' here is whatever the loop above left bound (the
    # last searched project); a newer variant of this view passes main_project
    # instead — confirm which is intended.
    add_extra_info_to_variants_project(get_reference(), project, all_variants)

    download_csv = request.GET.get('download', '')
    if download_csv:
        response = HttpResponse(content_type='text/csv')
        response[
            'Content-Disposition'] = 'attachment; filename="{}_{}.csv"'.format(
                download_csv,
                gene.get("symbol") or gene.get("transcript_name"))

        if download_csv == 'knockouts':
            individuals_to_include = [
                individ_id_and_variants["indiv_id"]
                for individ_id_and_variants in individ_ids_and_variants
            ]
            rows = []
            for individ_id_and_variants in individ_ids_and_variants:
                rare_variants = individ_id_and_variants["variants"]
                for variant in rare_variants:
                    worst_annotation_idx = variant.annotation[
                        "worst_vep_index_per_gene"][gene_id]
                    worst_annotation = variant.annotation["vep_annotation"][
                        worst_annotation_idx]
                    genotypes = []
                    all_genotypes_string = ""
                    for indiv_id in individuals_to_include:
                        if indiv_id in variant.genotypes and variant.genotypes[
                                indiv_id].num_alt > 0:
                            genotype = variant.genotypes[indiv_id]
                            allele_string = ">".join(genotype.alleles)
                            all_genotypes_string += indiv_id + ":" + allele_string + " "
                            genotypes.append(allele_string + " (" +
                                             str(genotype.gq) + ")")
                        else:
                            genotypes.append("")

                    measureset_id, clinvar_significance = get_reference(
                    ).get_clinvar_info(*variant.unique_tuple())
                    rows.append(
                        map(str, [
                            gene["symbol"],
                            variant.chr,
                            variant.pos,
                            variant.ref,
                            variant.alt,
                            variant.vcf_id or "",
                            variant.annotation.get("vep_consequence", ""),
                            worst_annotation.get("hgvsc", ""),
                            worst_annotation.get("hgvsp", "").replace(
                                "%3D", "="),
                            worst_annotation.get("sift", ""),
                            worst_annotation.get("polyphen", ""),
                            worst_annotation.get("mutationtaster_pred", ""),
                            ";".join(
                                set(
                                    worst_annotation.get("fathmm_pred",
                                                         "").split('%3B'))),
                            measureset_id,
                            clinvar_significance,
                            variant.annotation["freqs"].get(
                                "1kg_wgs_phase3", ""),
                            variant.annotation["freqs"].get(
                                "1kg_wgs_phase3_popmax", ""),
                            variant.annotation["freqs"].get("exac_v3", ""),
                            variant.annotation["freqs"].get(
                                "exac_v3_popmax", ""),
                            all_genotypes_string,
                        ] + genotypes))
        elif download_csv == 'rare_variants':
            # Only include individuals that carry at least one alt allele.
            individuals_to_include = []
            for variant in rare_variants:
                for indiv_id, genotype in variant.genotypes.items():
                    if genotype.num_alt > 0 and indiv_id not in individuals_to_include:
                        individuals_to_include.append(indiv_id)
            rows = []
            for variant in rare_variants:
                worst_annotation_idx = variant.annotation[
                    "worst_vep_index_per_gene"][gene_id]
                worst_annotation = variant.annotation["vep_annotation"][
                    worst_annotation_idx]
                genotypes = []
                all_genotypes_string = ""
                for indiv_id in individuals_to_include:
                    if indiv_id in variant.genotypes and variant.genotypes[
                            indiv_id].num_alt > 0:
                        genotype = variant.genotypes[indiv_id]
                        allele_string = ">".join(genotype.alleles)
                        all_genotypes_string += indiv_id + ":" + allele_string + " "
                        genotypes.append(allele_string + " (" +
                                         str(genotype.gq) + ")")
                    else:
                        genotypes.append("")

                measureset_id, clinvar_significance = get_reference(
                ).get_clinvar_info(*variant.unique_tuple())
                rows.append(
                    map(str, [
                        gene["symbol"],
                        variant.chr,
                        variant.pos,
                        variant.ref,
                        variant.alt,
                        variant.vcf_id or "",
                        variant.annotation.get("vep_consequence", ""),
                        worst_annotation.get("hgvsc", ""),
                        worst_annotation.get("hgvsp", "").replace("%3D", "="),
                        worst_annotation.get("sift", ""),
                        worst_annotation.get("polyphen", ""),
                        worst_annotation.get("mutationtaster_pred", ""),
                        ";".join(
                            set(
                                worst_annotation.get("fathmm_pred",
                                                     "").split('%3B'))),
                        measureset_id,
                        clinvar_significance,
                        variant.annotation["freqs"].get("1kg_wgs_phase3", ""),
                        variant.annotation["freqs"].get(
                            "1kg_wgs_phase3_popmax", ""),
                        variant.annotation["freqs"].get("exac_v3", ""),
                        variant.annotation["freqs"].get("exac_v3_popmax", ""),
                        all_genotypes_string,
                    ] + genotypes))

        header = [
            "gene", "chr", "pos", "ref", "alt", "rsID", "impact", "HGVS.c",
            "HGVS.p", "sift", "polyphen", "muttaster", "fathmm", "clinvar_id",
            "clinvar_clinical_sig", "freq_1kg_wgs_phase3",
            "freq_1kg_wgs_phase3_popmax", "freq_exac_v3",
            "freq_exac_v3_popmax", "all_genotypes"
        ] + list(
            map(lambda i: i + " (from %s)" % indiv_id_to_project_id[i],
                individuals_to_include))

        writer = csv.writer(response)
        writer.writerow(header)
        for row in rows:
            writer.writerow(row)
        return response
    else:
        # JSON page render: serialize knockout variants in place.
        for individ_id_and_variants in individ_ids_and_variants:
            variants = individ_id_and_variants["variants"]
            individ_id_and_variants["variants"] = [
                v.toJSON() for v in variants
            ]

        individ_ids = {i['indiv_id'] for i in individ_ids_and_variants}
        for var in rare_variants:
            individ_ids.update(var.genotypes.keys())

        individuals = Individual.objects.filter(
            indiv_id__in=individ_ids,
            project__project_id__in=project_ids).select_related(
                'project').select_related('family').only(
                    'project__project_id', 'family__family_id',
                    *Individual.INDIVIDUAL_JSON_FIELDS_NO_IDS)

        return render(
            request, 'project/gene_quicklook.html', {
                'gene': gene,
                'gene_json': json.dumps(gene),
                'project': main_project,
                'rare_variants_json':
                json.dumps([v.toJSON() for v in rare_variants]),
                'individuals_json':
                json.dumps([
                    i.get_json_obj(skip_has_variant_data=True)
                    for i in individuals
                ]),
                'knockouts_json': json.dumps(individ_ids_and_variants),
                'other_projects_json': other_projects_json,
            })
def search_for_gene(self, search_gene_id, project_id_list, max_af=0.01):
    '''
    Search for a gene across project(s) and write matching rare variants
    to a tab-separated file named ``results_<gene_id>.tsv``.

    Args:
        1. search_gene_id: Gene ID to search for
        2. project_id_list: An optional list of project ids to narrow the
           search to; when empty/None, all projects are searched
        3. max_af: variants whose maximum annotated population frequency is
           >= this value are skipped (default 0.01)
    '''
    gene_id = get_gene_id_from_str(search_gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)
    print("Staring gene search for: %s %s in projects: %s\n" %
          (search_gene_id, gene['gene_id'], ", ".join(project_id_list)))
    print("Max AF threshold: %s" % max_af)

    # all rare coding variants
    variant_filter = get_default_variant_filter(
        'all_coding',
        mall.get_annotator().reference_population_slugs)
    print("All Filters: ")
    pprint(variant_filter.toJSON())

    output_filename = 'results_' + search_gene_id + '.tsv'
    outfile = open(output_filename, 'w')

    header = [
        "project_id", "gene", "chr", "pos", "ref", "alt", "rsID", "filter",
        "impact", "HGVS.c", "HGVS.p", "sift", "polyphen", "muttaster",
        "fathmm", "clinvar_id", "clinvar_clinical_sig",
        "freq_1kg_wgs_phase3", "freq_1kg_wgs_phase3_popmax", "freq_exac_v3",
        "freq_exac_v3_popmax", "all_genotypes"
    ]

    writer = csv.writer(outfile, delimiter='\t')
    writer.writerow(header)

    if project_id_list:
        # NOTE(review): this loop only rebinds 'project' and appears to be a
        # leftover validation stub — the real lookup happens again below.
        for project_id in project_id_list:
            project = Project.objects.filter(
                project_id=project_id)[0]  # TODO validate
    else:
        project_id_list = [p.project_id for p in Project.objects.all()]

    for project_id in project_id_list:
        project = Project.objects.filter(project_id=project_id)[0]
        if get_project_datastore(project_id).project_collection_is_loaded(
                project_id):
            print("Running on project %s" % project_id)
        else:
            print(
                "Skipping project %s - gene search is not enabled for this project"
                % project_id)
            continue
        for variant in project_analysis.get_variants_in_gene(
                project, gene_id, variant_filter=variant_filter):
            # skip common variants
            if max(variant.annotation['freqs'].values()) >= max_af:
                continue
            #pprint(variant.toJSON())
            add_extra_info_to_variants_project(get_reference(), project,
                                               [variant])
            worst_annotation_idx = variant.annotation[
                "worst_vep_index_per_gene"][gene_id]
            worst_annotation = variant.annotation["vep_annotation"][
                worst_annotation_idx]

            all_genotypes_list = []
            pass_filter = "N/A"
            for indiv_id, genotype in variant.genotypes.items():
                pass_filter = genotype.filter  # filter value is stored in the genotypes even though it's the same for all individuals
                if genotype.num_alt > 0:
                    all_genotypes_list.append(
                        "%s[gt:%s GQ:%s AB:%0.3f]" %
                        (indiv_id, ">".join(genotype.alleles), genotype.gq,
                         genotype.ab
                         if genotype.ab is not None else float('NaN')))

            measureset_id, clinvar_significance = settings.CLINVAR_VARIANTS.get(
                variant.unique_tuple(), ("", ""))
            row = map(str, [
                project_id,
                gene["symbol"],
                variant.chr,
                variant.pos,
                variant.ref,
                variant.alt,
                variant.vcf_id or "",
                pass_filter,
                variant.annotation.get("vep_consequence", ""),
                worst_annotation.get("hgvsc", ""),
                worst_annotation.get("hgvsp", "").replace("%3D", "="),
                worst_annotation.get("sift", ""),
                worst_annotation.get("polyphen", ""),
                worst_annotation.get("mutationtaster_pred", ""),
                ";".join(
                    set(
                        worst_annotation.get("fathmm_pred",
                                             "").split('%3B'))),
                measureset_id,
                clinvar_significance,
                variant.annotation["freqs"].get("1kg_wgs_phase3", ""),
                variant.annotation["freqs"].get("1kg_wgs_phase3_popmax",
                                                ""),
                variant.annotation["freqs"].get("exac_v3", ""),
                variant.annotation["freqs"].get("exac_v3_popmax", ""),
                ", ".join(all_genotypes_list),
            ])
            writer.writerow(row)

    outfile.close()
    print("Wrote out %s" % output_filename)
def add_populations_to_variants(variants, population_slug_list):
    """Annotate variants in place with frequencies for the given populations.

    Delegates to the annotator's population frequency store. Errors are
    logged and swallowed deliberately (best effort) so a bad population
    lookup never fails the caller.

    Args:
        variants: list of variant objects to annotate in place.
        population_slug_list: population slugs to add; no-op when falsy.
    """
    if not population_slug_list:
        return
    try:
        mall.get_annotator().get_population_frequency_store().add_populations_to_variants(
            variants, population_slug_list)
    except Exception as e:  # was Python-2-only `except Exception, e` syntax
        # Message previously named the wrong function (add_custom_populations_to_variants).
        print("WARNING: got unexpected error in add_populations_to_variants: %s" % e)
def load_population_frequency_store(self):
    """Load every configured reference population into the frequency store.

    Iterates annotator_settings.reference_populations_to_load and loads
    each spec into the annotator's population frequency store, logging
    progress per population.
    """
    store = mall.get_annotator().get_population_frequency_store()
    for spec in annotator_settings.reference_populations_to_load:
        print("Loading " + str(spec))
        store.load_population(spec)
def handle_project(project_id):
    """Write a gzipped TSV report of filtered family variants for one project.

    For every inheritance mode, fetches each family's variant results,
    records population frequencies and per-individual genotype fields
    (up to 10 individuals per family), and writes one row per variant to
    ``family_variants_<project_id>.tsv.gz``.

    Relies on module-level thresholds (GQ_threshold, DP_threshold,
    AB_threshold, *_freq_threshold) and helpers (mall, Project,
    get_variants_for_inheritance_for_project, get_clinvar_variants,
    get_gene_symbol) defined elsewhere in this file.
    """
    filename = 'family_variants_%s.tsv.gz' % project_id
    print("Generating report: " + filename)
    # create family_variants.tsv
    family_variants_f = gzip.open(filename, 'w')
    writer = csv.writer(family_variants_f, dialect='excel', delimiter='\t')
    header_fields = [
        '#inheritance_mode',
        'project_id',
        'family_id',
        'gene',
        'chrom',
        'pos',
        'ref',
        'alt',
        'rsid',
        'filter',
        'clinvar_status',
        'annotation',
        '1kg_af',
        '1kg_popmax_af',
        'exac_af',
        'exac_popmax_af',
        'merck_wgs_3793_af',
        'merck_wgs_144_af',
        'multiallelic_site_alt_alleles (* = spanning deletion)',
        '',
    ]
    genotype_headers = [
        'sample_id',
        'str',
        'num_alt',
        'allele_balance',
        'AD',
        'DP',
        'GQ',
        'PL',
    ]
    # Repeat the genotype column group for up to 10 individuals per family.
    for i in range(0, 10):
        for h in genotype_headers:
            header_fields.append("genotype%d_%s" % (i, h))
    writer.writerow(header_fields)
    for inheritance_mode in [
            'homozygous_recessive', 'dominant', 'compound_het', 'de_novo',
            'x_linked_recessive', 'all_variants'
    ]:
        # collect the resources that we'll need here
        annotator = mall.get_annotator()
        custom_population_store = mall.get_custom_population_store()
        project = Project.objects.get(project_id=project_id)
        # get the variants for this inheritance / project combination
        for i, (family, family_results) in enumerate(
                get_variants_for_inheritance_for_project(
                    project, inheritance_mode)):
            for variant in family_results:
                custom_populations = custom_population_store.get_frequencies(
                    variant.xpos, variant.ref, variant.alt)
                g1k_freq = variant.annotation['freqs']['1kg_wgs_phase3']
                g1k_popmax_freq = variant.annotation['freqs'][
                    '1kg_wgs_phase3_popmax']
                exac_freq = variant.annotation['freqs']['exac_v3']
                exac_popmax_freq = variant.annotation['freqs'][
                    'exac_v3_popmax']
                merck_wgs_3793_freq = custom_populations.get(
                    'merck-wgs-3793', 0.0)
                merck_wgs_144_freq = custom_populations.get(
                    'merck-pcr-free-wgs-144', 0.0)
                # Sanity checks only: assertion failures are printed, not fatal.
                try:
                    assert g1k_freq <= g1k_freq_threshold, "g1k freq %s > %s" % (
                        g1k_freq, g1k_freq_threshold)
                    # NOTE(review): message says "g1k freq" but this checks popmax.
                    assert g1k_popmax_freq <= g1k_popmax_freq_threshold, "g1k freq %s > %s" % (
                        g1k_popmax_freq, g1k_popmax_freq_threshold)
                    assert exac_freq <= exac_freq_threshold, "Exac freq %s > %s" % (
                        exac_freq, exac_freq_threshold)
                    assert exac_popmax_freq <= exac_popmax_threshold, "Exac popmax freq %s > %s" % (
                        exac_popmax_freq, exac_popmax_threshold)
                    #assert merck_wgs_3793_freq <= merck_wgs_3793_threshold, "Merck WGS 3793 threshold %s > %s" % (merck_wgs_3793_freq, merck_wgs_3793_threshold)
                    #assert merck_wgs_144_freq <= merck_wgs_144_threshold, "Merck PCR free 144 threshold %s > %s" % (merck_wgs_144_freq, merck_wgs_144_threshold)
                except AssertionError as e:
                    import traceback
                    traceback.print_exc()
                # filter value is stored in the genotypes
                if len(family.get_individuals()) == 0:
                    print("Family has 0 individuals: %s - skipping..." %
                          str(family))
                    continue
                genotype = variant.get_genotype(
                    family.get_individuals()[0].indiv_id)
                if genotype is not None:
                    filter_value = genotype.filter
                else:
                    filter_value = 'unknown'
                multiallelic_site_other_alleles = []
                if len(variant.extras['orig_alt_alleles']) > 1:
                    multiallelic_site_other_alleles = variant.extras[
                        'orig_alt_alleles']
                clinvar_significance = get_clinvar_variants().get(
                    variant.unique_tuple(), [""])[-1]
                row = [
                    inheritance_mode,
                    project_id,
                    family.family_id,
                    get_gene_symbol(variant),
                    variant.chr,
                    str(variant.pos),
                    variant.ref,
                    variant.alt,
                    variant.vcf_id,
                    filter_value,
                    clinvar_significance,
                    variant.annotation['vep_group'],
                    g1k_freq,
                    g1k_popmax_freq,
                    exac_freq,
                    exac_popmax_freq,
                    merck_wgs_3793_freq,
                    merck_wgs_144_freq,
                    ", ".join(multiallelic_site_other_alleles),
                    '',
                ]
                # Append per-individual genotype columns (cap at 10 individuals,
                # matching the header layout above).
                for i, individual in enumerate(family.get_individuals()):
                    if i >= 10:
                        break
                    genotype = variant.get_genotype(individual.indiv_id)
                    if genotype is None:
                        # No call for this individual: emit placeholder columns.
                        row.extend([
                            individual.indiv_id, "./.", "", "", "", "", "",
                            ""
                        ])
                        continue
                    else:
                        #assert genotype.filter == "pass", "%s %s - filter is %s " % (variant.chr, variant.pos, genotype.filter)
                        try:
                            assert genotype.gq >= GQ_threshold, "%s %s - GQ is %s " % (
                                variant.chr, variant.pos, genotype.gq)
                            # NOTE(review): message says "GQ is" but this checks DP.
                            assert genotype.extras[
                                "dp"] >= DP_threshold, "%s %s - GQ is %s " % (
                                    variant.chr, variant.pos,
                                    genotype.extras["dp"])
                            if genotype.num_alt == 1:
                                assert genotype.ab is None or genotype.ab >= AB_threshold / 100., "%s %s - AB is %s " % (
                                    variant.chr, variant.pos, genotype.ab)
                        except AssertionError as e:
                            import traceback
                            traceback.print_exc()
                        genotype_str = "/".join(
                            genotype.alleles) if genotype.alleles else "./."
                        row.extend([
                            individual.indiv_id,
                            genotype_str,
                            genotype.num_alt,
                            genotype.ab if genotype.ab is not None else '',
                            genotype.extras["ad"],
                            genotype.extras["dp"],
                            genotype.gq,
                            genotype.extras["pl"],
                        ])
                writer.writerow(row)
    family_variants_f.flush()
    family_variants_f.close()
    print("Done with " + filename)
def gene_quicklook(request, project_id, gene_id):
    """
    Summary of a gene in a project.

    Renders the gene quicklook page with rare coding variants and knockout
    individuals across the selected project(s), or — when ``?download=``
    is ``knockouts`` or ``rare_variants`` — streams the same data as a
    CSV attachment. Also exposes a link to the new search page when the
    linked seqr project supports it.
    """
    main_project = get_object_or_404(Project, project_id=project_id)
    if not main_project.can_view(request.user):
        return HttpResponse("Unauthorized")

    new_page_url = '/variant_search/project/{}'.format(main_project.seqr_project.guid) if main_project.seqr_project and main_project.seqr_project.has_new_search else None

    # other projects this user can view
    other_projects = get_loaded_projects_for_user(request.user, fields=['project_id', 'project_name'])

    if other_projects:
        other_projects_json = json.dumps([{'project_id': p.project_id, 'project_name': p.project_name} for p in sorted(other_projects, key=lambda p: p.project_id.lower())])
    else:
        other_projects_json = None

    if gene_id is None:
        # No gene selected yet: render the empty search page.
        return render(request, 'project/gene_quicklook.html', {
            'project': main_project,
            'gene': None,
            'gene_json': None,
            'rare_variants_json': None,
            'individuals_json': None,
            'knockouts_json': None,
            'other_projects_json': other_projects_json,
            'new_page_url': new_page_url,
        })

    projects_to_search_param = request.GET.get('selected_projects')
    if projects_to_search_param:
        project_ids = projects_to_search_param.split(",")
        projects_to_search = [project for project in other_projects if project.project_id in project_ids]
        if len(projects_to_search) < len(project_ids):
            # If not all the specified project ids are in the other projects list then they are not authorized
            return HttpResponse("Unauthorized")
    else:
        project_ids = [main_project.project_id]
        projects_to_search = [main_project]

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)

    # all rare coding variants
    variant_filter = get_default_variant_filter('all_coding', mall.get_annotator().reference_population_slugs)

    indiv_id_to_project_id = {}
    rare_variant_dict = {}
    rare_variants = []
    individ_ids_and_variants = []
    for project in projects_to_search:
        all_project_variants = project_analysis.get_variants_in_gene(project, gene_id, variant_filter=variant_filter)

        # compute knockout individuals
        knockout_ids, variation = get_knockouts_in_gene(project, gene_id, all_project_variants)
        for indiv_id in knockout_ids:
            variants = variation.get_relevant_variants_for_indiv_ids([indiv_id])
            individ_ids_and_variants.append({
                'indiv_id': indiv_id,
                'variants': variants,
            })

        # compute rare variants
        project_variants = []
        for i, variant in enumerate(all_project_variants):
            max_af = max([freq for label, freq in variant.annotation['freqs'].items() if label != "AF"])  # don't filter on within-cohort AF
            # skip variants with no alt genotypes in this project
            if not any([indiv_id for indiv_id, genotype in variant.genotypes.items() if genotype.num_alt > 0]):
                continue
            if max_af >= .01:
                continue

            # add project id to genotypes
            for indiv_id in variant.genotypes:
                indiv_id_to_project_id[indiv_id] = project.project_id

            # save this variant (or just the genotypes from this variant if the variant if it's been seen already in another project)
            variant_id = "%s-%s-%s-%s" % (variant.chr,variant.pos, variant.ref, variant.alt)
            if variant_id not in rare_variant_dict:
                rare_variant_dict[variant_id] = variant
                project_variants.append(variant)
            else:
                # merge genotypes into the first-seen copy, preferring real
                # calls over missing (num_alt == -1) ones
                for indiv_id, genotype in variant.genotypes.items():
                    existing_genotype = rare_variant_dict[variant_id].genotypes.get(indiv_id)
                    if not existing_genotype or existing_genotype.num_alt == -1:
                        rare_variant_dict[variant_id].genotypes[indiv_id] = genotype

        if project != main_project:
            add_extra_info_to_variants_project(get_reference(), project, project_variants)

        rare_variants.extend(project_variants)

    # knockout variants + rare variants, concatenated into one list
    all_variants = sum([i['variants'] for i in individ_ids_and_variants], rare_variants)
    add_extra_info_to_variants_project(get_reference(), main_project, all_variants, add_family_tags=True)

    download_csv = request.GET.get('download', '')
    if download_csv:
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="{}_{}.csv"'.format(download_csv, gene.get("symbol") or gene.get("transcript_name"))

        def get_row(variant, worst_annotation):
            # Builds one CSV row for a variant; reads individuals_to_include
            # from the enclosing scope (set per download branch below).
            if 'clinvar_allele_id' in variant.extras:
                measureset_id = variant.extras['clinvar_allele_id']
                clinvar_significance = variant.extras['clinvar_clinsig']
            else:
                measureset_id, clinvar_significance = get_reference().get_clinvar_info(*variant.unique_tuple())
            genotypes = []
            all_genotypes_string = ""
            for indiv_id in individuals_to_include:
                if indiv_id in variant.genotypes and variant.genotypes[indiv_id].num_alt > 0:
                    genotype = variant.genotypes[indiv_id]
                    allele_string = ">".join(genotype.alleles)
                    all_genotypes_string += indiv_id + ":" + allele_string + " "
                    genotypes.append(allele_string + " (" + str(genotype.gq) + ")")
                else:
                    genotypes.append("")
            return [
                gene["symbol"],
                variant.chr,
                variant.pos,
                variant.ref,
                variant.alt,
                variant.vcf_id or variant.annotation.get("rsid") or "",
                variant.annotation.get("vep_consequence") or "",
                worst_annotation.get("hgvsc") or "",
                (worst_annotation.get("hgvsp") or "").replace("%3D", "="),
                variant.annotation.get("sift") or "",
                variant.annotation.get("polyphen") or "",
                variant.annotation.get("mutationtaster_pred") or variant.annotation.get("muttaster") or "",
                (";".join(set((worst_annotation.get("fathmm_pred") or "").split('%3B')))) or variant.annotation.get("fathmm") or "",
                measureset_id or "",
                clinvar_significance or "",
                variant.annotation["freqs"].get("1kg_wgs_phase3") or variant.annotation["freqs"].get("1kg_wgs_AF") or "",
                variant.annotation["freqs"].get("1kg_wgs_phase3_popmax") or variant.annotation["freqs"].get("1kg_wgs_popmax_AF") or "",
                variant.annotation["freqs"].get("exac_v3") or variant.annotation["freqs"].get("exac_v3_AF") or "",
                variant.annotation["freqs"].get("exac_v3_popmax") or variant.annotation["freqs"].get("exac_v3_popmax_AF") or "",
                variant.annotation["freqs"].get("gnomad_exomes_AF") or "",
                variant.annotation["freqs"].get("gnomad_exomes_popmax_AF") or "",
                variant.annotation["freqs"].get("gnomad_genomes_AF") or "",
                variant.annotation["freqs"].get("gnomad_genomes_popmax_AF") or "",
                all_genotypes_string,
            ] + genotypes

        if download_csv == 'knockouts':
            individuals_to_include = [individ_id_and_variants["indiv_id"] for individ_id_and_variants in individ_ids_and_variants]

            rows = []
            for individ_id_and_variants in individ_ids_and_variants:
                rare_variants = individ_id_and_variants["variants"]
                for variant in rare_variants:
                    worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"][gene_id]
                    worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]

                    # NOTE(review): this genotype loop duplicates work done
                    # inside get_row(); its results are unused here.
                    genotypes = []
                    all_genotypes_string = ""
                    for indiv_id in individuals_to_include:
                        if indiv_id in variant.genotypes and variant.genotypes[indiv_id].num_alt > 0:
                            genotype = variant.genotypes[indiv_id]
                            allele_string = ">".join(genotype.alleles)
                            all_genotypes_string += indiv_id + ":" + allele_string + " "
                            genotypes.append(allele_string + " (" + str(genotype.gq) + ")")
                        else:
                            genotypes.append("")
                    rows.append(map(str, get_row(variant, worst_annotation)))

        elif download_csv == 'rare_variants':
            # Only include individuals that carry at least one alt allele.
            individuals_to_include = []
            for variant in rare_variants:
                for indiv_id, genotype in variant.genotypes.items():
                    if genotype.num_alt > 0 and indiv_id not in individuals_to_include:
                        individuals_to_include.append(indiv_id)
            rows = []
            for variant in rare_variants:
                worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"][gene_id]
                worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]
                rows.append(map(str, get_row(variant, worst_annotation)))

        header = ["gene", "chr", "pos", "ref", "alt", "rsID", "impact", "HGVS.c", "HGVS.p", "sift", "polyphen", "muttaster", "fathmm", "clinvar_id", "clinvar_clinical_sig",
                  "freq_1kg_wgs_phase3", "freq_1kg_wgs_phase3_popmax", "freq_exac_v3", "freq_exac_v3_popmax",
                  "freq_gnomad_exomes", "freq_gnomad_exomes_popmax", "freq_gnomad_genomes", "freq_gnomad_genomes_popmax",
                  "all_genotypes"] + list(map(lambda i: i + " (from %s)" % indiv_id_to_project_id[i], individuals_to_include))

        writer = csv.writer(response)
        writer.writerow(header)
        for row in rows:
            writer.writerow(row)
        return response
    else:
        # JSON page render: serialize knockout variants in place.
        for individ_id_and_variants in individ_ids_and_variants:
            variants = individ_id_and_variants["variants"]
            individ_id_and_variants["variants"] = [v.toJSON() for v in variants]

        individ_ids = {i['indiv_id'] for i in individ_ids_and_variants}
        for var in rare_variants:
            individ_ids.update(var.genotypes.keys())

        individuals = Individual.objects.filter(
            indiv_id__in=individ_ids, project__project_id__in=project_ids
        ).select_related('project').select_related('family').only('project__project_id', 'family__family_id', *Individual.INDIVIDUAL_JSON_FIELDS_NO_IDS)

        return render(request, 'project/gene_quicklook.html', {
            'gene': gene,
            'gene_json': json.dumps(gene),
            'project': main_project,
            'rare_variants_json': json.dumps([v.toJSON() for v in rare_variants]),
            'individuals_json': json.dumps([i.get_json_obj(skip_has_variant_data=True) for i in individuals]),
            'knockouts_json': json.dumps(individ_ids_and_variants),
            'other_projects_json': other_projects_json,
            'new_page_url': new_page_url,
        })
def _gene_quicklook_csv_row(gene, gene_id, variant, individuals_to_include):
    """Build one CSV row (via map(str, ...)) for *variant*.

    Columns: fixed variant/annotation/frequency fields, the concatenated
    all-genotypes string, then one genotype column per individual in
    *individuals_to_include* (blank when the individual is hom-ref or has
    no genotype recorded for this variant).
    """
    worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"][gene_id]
    worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]

    genotypes = []
    all_genotypes_string = ""
    for indiv_id in individuals_to_include:
        # Guard against individuals with no genotype entry for this variant
        # (e.g. loaded from a different VCF) - previously a KeyError.
        genotype = variant.genotypes.get(indiv_id)
        if genotype is None:
            genotypes.append("")
            continue
        allele_string = ">".join(genotype.alleles)
        all_genotypes_string += indiv_id + ":" + allele_string + " "
        if genotype.num_alt > 0:
            genotypes.append(allele_string + " (" + str(genotype.gq) + ")")
        else:
            genotypes.append("")

    measureset_id, clinvar_significance = settings.CLINVAR_VARIANTS.get(variant.unique_tuple(), ("", ""))
    return map(str, [
        gene["symbol"],
        variant.chr,
        variant.pos,
        variant.ref,
        variant.alt,
        variant.vcf_id or "",
        variant.annotation.get("vep_consequence", ""),
        worst_annotation.get("hgvsc", ""),
        worst_annotation.get("hgvsp", "").replace("%3D", "="),
        worst_annotation.get("sift", ""),
        worst_annotation.get("polyphen", ""),
        worst_annotation.get("mutationtaster_pred", ""),
        ";".join(set(worst_annotation.get("fathmm_pred", "").split('%3B'))),
        measureset_id,
        clinvar_significance,
        variant.annotation["freqs"].get("1kg_wgs_phase3", ""),
        variant.annotation["freqs"].get("1kg_wgs_phase3_popmax", ""),
        variant.annotation["freqs"].get("exac_v3", ""),
        variant.annotation["freqs"].get("exac_v3_popmax", ""),
        all_genotypes_string,
    ] + genotypes)


def gene_quicklook(request, project_id, gene_id):
    """
    Summary of a gene in a project.

    Renders the gene quicklook page with rare (AF < 1%) coding variants and
    knockout individuals for *gene_id*, or - when the ``download`` GET param
    is set to 'knockouts' or 'rare_variants' - streams the corresponding CSV.

    Args:
        request: Django request; ``?download=`` selects CSV export.
        project_id (string): project to search.
        gene_id (string): gene symbol or ENSG id; None renders an empty page.
    """
    project = get_object_or_404(Project, project_id=project_id)
    if not project.can_view(request.user):
        return HttpResponse("Unauthorized")

    if gene_id is None:
        # no gene selected yet - render the empty search page
        return render(request, 'project/gene_quicklook.html', {
            'project': project,
            'gene': None,
            'gene_json': None,
            'rare_variants_json': None,
            'individuals_json': None,
            'knockouts_json': None,
        })

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)
    sys.stderr.write(project_id + " - staring gene search for: %s %s \n" % (gene_id, gene))

    # all rare coding variants: keep only those whose max population AF < 1%
    variant_filter = get_default_variant_filter('all_coding', mall.get_annotator().reference_population_slugs)

    rare_variants = []
    for variant in project_analysis.get_variants_in_gene(project, gene_id, variant_filter=variant_filter):
        max_af = max(variant.annotation['freqs'].values())
        if max_af < .01:
            rare_variants.append(variant)
        #sys.stderr.write("gene_id: %s, variant: %s\n" % (gene_id, variant.toJSON()['annotation']['vep_annotation']))
    add_extra_info_to_variants_project(get_reference(), project, rare_variants)

    # compute knockout individuals and the variants supporting each knockout
    individ_ids_and_variants = []
    knockout_ids, variation = get_knockouts_in_gene(project, gene_id)
    for indiv_id in knockout_ids:
        variants = variation.get_relevant_variants_for_indiv_ids([indiv_id])
        add_extra_info_to_variants_project(get_reference(), project, variants)
        individ_ids_and_variants.append({
            'indiv_id': indiv_id,
            'variants': variants,
        })

    sys.stderr.write("Project-wide gene search retrieved %s rare variants for gene: %s \n" % (len(rare_variants), gene_id))

    download_csv = request.GET.get('download', '')
    if download_csv:
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="{}_{}.csv"'.format(download_csv, gene["transcript_name"])

        if download_csv == 'knockouts':
            # one genotype column per knockout individual
            individuals_to_include = [individ_id_and_variants["indiv_id"] for individ_id_and_variants in individ_ids_and_variants]
            rows = []
            for individ_id_and_variants in individ_ids_and_variants:
                rare_variants = individ_id_and_variants["variants"]
                for variant in rare_variants:
                    rows.append(_gene_quicklook_csv_row(gene, gene_id, variant, individuals_to_include))
        elif download_csv == 'rare_variants':
            # one genotype column per individual carrying at least one alt allele
            individuals_to_include = []
            for variant in rare_variants:
                for indiv_id, genotype in variant.genotypes.items():
                    if genotype.num_alt > 0 and indiv_id not in individuals_to_include:
                        individuals_to_include.append(indiv_id)
            rows = []
            for variant in rare_variants:
                rows.append(_gene_quicklook_csv_row(gene, gene_id, variant, individuals_to_include))

        header = ["gene", "chr", "pos", "ref", "alt", "rsID", "impact",
                  "HGVS.c", "HGVS.p", "sift", "polyphen", "muttaster", "fathmm",
                  "clinvar_id", "clinvar_clinical_sig",
                  "freq_1kg_wgs_phase3", "freq_1kg_wgs_phase3_popmax",
                  "freq_exac_v3", "freq_exac_v3_popmax",
                  "all_genotypes"] + individuals_to_include

        writer = csv.writer(response)
        writer.writerow(header)
        for row in rows:
            writer.writerow(row)
        return response
    else:
        # serialize variants for the template's JSON payload
        for individ_id_and_variants in individ_ids_and_variants:
            variants = individ_id_and_variants["variants"]
            individ_id_and_variants["variants"] = [v.toJSON() for v in variants]

        return render(request, 'project/gene_quicklook.html', {
            'gene': gene,
            'gene_json': json.dumps(gene),
            'project': project,
            'rare_variants_json': json.dumps([v.toJSON() for v in rare_variants]),
            'individuals_json': json.dumps([i.get_json_obj() for i in project.get_individuals()]),
            'knockouts_json': json.dumps(individ_ids_and_variants),
        })
def search_for_genes(self, gene_or_variant_ids, project_id_list, output_filename, max_af=0.01, knockouts=False, in_clinvar_only=False, include_non_coding=False):
    """
    Search for a gene across project(s)

    Writes a tab-delimited report of matching variants (one row per variant,
    with per-family genotype summaries) to *output_filename*.

    Args:
        gene_or_variant_ids (list): 'ENSG..' gene id strings.
            Entries matching "<chrom>-<pos>[-<ref>-<alt>]" are treated as
            single-variant lookups instead of gene searches.
        project_id_list (list): (optional) project ids to narrow down the search
        output_filename (string): output file name
        max_af (float): AF filter
        knockouts (bool): search knockout variants instead of all rare coding variants
        in_clinvar_only (bool): only report variants with a pathogenic-like clinvar significance
        include_non_coding (bool): drop the coding-consequence restriction from the filter
    """
    projects = [Project.objects.get(project_id=project_id) for project_id in project_id_list]

    outfile = open(output_filename, 'w')

    header = [
        "project_id", "gene", "chr", "pos", "ref", "alt", "rsID", "filter", "impact",
        "HGVS.c", "HGVS.p", "sift", "polyphen", "muttaster", "fathmm",
        "clinvar_id", "clinvar_clinical_sig",
        "freq_1kg_wgs_phase3", "freq_1kg_wgs_phase3_popmax",
        "freq_exac_v3", "freq_exac_v3_popmax", "gnomad-exomes", "gnomad-genomes",
        "families", "all_genotypes"]

    writer = csv.writer(outfile, delimiter='\t')
    writer.writerow(header)

    # all rare coding variants (the AF cutoff itself is applied per-variant below)
    if not knockouts:
        variant_filter = get_default_variant_filter('all_coding', mall.get_annotator().reference_population_slugs)
        #variant_filter.set_max_AF(max_af)
        if include_non_coding:
            # empty so_annotations list disables the consequence-type restriction
            variant_filter.so_annotations = []
        print("All Filters: ")
        pprint(variant_filter.toJSON())

    #print("Max AF threshold: %s" % max_af)
    print("Starting search for:\n%s\nin projects:\n%s\n" % (", ".join(gene_or_variant_ids), ", ".join([p.project_id for p in projects])))

    for project in projects:
        project_id = project.project_id
        # skip projects whose variants aren't loaded into the project datastore
        if get_project_datastore(project).project_collection_is_loaded(project):
            print("=====================")
            print("Searching project %s" % project_id)
        else:
            print("Skipping project %s - gene search is not enabled for this project" % project_id)
            continue

        # per-project cache of indiv_id -> Individual (or the sentinel 'deleted')
        indiv_cache = {}
        for gene_or_variant_id in gene_or_variant_ids:
            # decide whether this id is a chrom-pos(-ref-alt) variant locus or a gene id
            chrom_pos_match = re.match("([0-9XY]{1,2})-([0-9]{1,9})", gene_or_variant_id)
            chrom_pos_ref_alt_match = re.match("([0-9XY]{1,2})-([0-9]{1,9})-([ACTG]+)-([ACTG]+)", gene_or_variant_id)
            if chrom_pos_match or chrom_pos_ref_alt_match:
                # NOTE: the chrom-pos pattern is a prefix of the ref-alt pattern,
                # so chrom_pos_match is non-None whenever chrom_pos_ref_alt_match is
                chrom = chrom_pos_match.group(1)
                pos = int(chrom_pos_match.group(2))
                xpos = genomeloc.get_xpos(chrom, pos)
                ref = alt = None
                if chrom_pos_ref_alt_match:
                    ref = chrom_pos_ref_alt_match.group(3)
                    alt = chrom_pos_ref_alt_match.group(4)

                variant = get_project_datastore(project).get_single_variant(project.project_id, None, xpos, ref, alt)
                if variant is None:
                    continue
                variants = [variant]
                print("-- searching %s for variant %s-%s-%s: found %s" % (project_id, xpos, ref, alt, variant))
                # attribute the variant to the gene of its overall-worst VEP annotation
                worst_annotation_idx = variant.annotation['worst_vep_annotation_index']
                print(variant.annotation["vep_annotation"][worst_annotation_idx])
                gene_id = variant.annotation["vep_annotation"][worst_annotation_idx]['gene_id']
                gene = get_reference().get_gene(gene_id)
            else:
                gene_id = get_gene_id_from_str(gene_or_variant_id, get_reference())
                gene = get_reference().get_gene(gene_id)
                print("-- searching %s for gene %s (%s)" % (project_id, gene["symbol"], gene_id))

                if knockouts:
                    knockout_ids, variation = project_analysis.get_knockouts_in_gene(project, gene_id)
                    variants = variation.get_relevant_variants_for_indiv_ids(knockout_ids)
                else:
                    variants = project_analysis.get_variants_in_gene(project, gene_id, variant_filter=variant_filter)

            for variant in variants:
                # apply the AF cutoff only for gene searches; explicit loci are always reported
                if not chrom_pos_match and not chrom_pos_ref_alt_match and max(variant.annotation['freqs'].values()) >= max_af:
                    continue
                add_extra_info_to_variants_project(get_reference(), project, [variant])
                # worst VEP annotation *for this gene* may be absent; guard all uses below
                worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"].get(gene_id)
                if worst_annotation_idx is not None:
                    worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]
                else:
                    worst_annotation = None
                all_genotypes_list = []
                pass_filter = "N/A"
                family_ids = set()
                for indiv_id, genotype in variant.genotypes.items():
                    if indiv_id in indiv_cache:
                        individual = indiv_cache[indiv_id]
                        if individual == 'deleted':
                            continue
                    else:
                        try:
                            individual = Individual.objects.get(project=project, indiv_id=indiv_id)
                            indiv_cache[indiv_id] = individual
                        except ObjectDoesNotExist:
                            # this can happen when an individual is deleted from the project - from postgres, but not from mong
                            indiv_cache[indiv_id] = 'deleted'
                            continue
                        except MultipleObjectsReturned:
                            # when several families have an individual with the same id
                            individuals = Individual.objects.filter(project=project, indiv_id=indiv_id)
                            individual = individuals[0]
                            indiv_cache[indiv_id] = individual

                    pass_filter = genotype.filter  # filter value is stored in the genotypes even though it's the same for all individuals
                    if genotype.num_alt > 0:
                        family_ids.add(individual.family.family_id)
                        # family/indiv[affected-status][gt:<alleles> GQ:<gq> AB:<ab>]
                        all_genotypes_list.append("%s/%s%s[gt:%s GQ:%s AB:%0.3f]" % (
                            individual.family.family_id,
                            indiv_id,
                            "[Affected]" if individual.affected == "A" else ("[-]" if individual.affected == "N" else "[?]"),
                            ">".join(genotype.alleles),
                            genotype.gq,
                            genotype.ab if genotype.ab is not None else float('NaN')))

                # skip variants nobody in this project actually carries
                if len(all_genotypes_list) == 0:
                    continue

                measureset_id, clinvar_significance = get_reference().get_clinvar_info(*variant.unique_tuple())
                if in_clinvar_only and (not clinvar_significance or "path" not in clinvar_significance.lower()):
                    continue

                row = map(str, [
                    project_id,
                    gene,
                    variant.chr,
                    variant.pos,
                    variant.ref,
                    variant.alt,
                    variant.vcf_id or "",
                    pass_filter,
                    variant.annotation.get("vep_consequence", ""),
                    worst_annotation.get("hgvsc", "") if worst_annotation else "",
                    (worst_annotation.get("hgvsp", "") or "").replace("%3D", "=") if worst_annotation else "",
                    worst_annotation.get("sift", "") if worst_annotation else "",
                    worst_annotation.get("polyphen", "") if worst_annotation else "",
                    worst_annotation.get("mutationtaster_pred", "") if worst_annotation else "",
                    ";".join(set(worst_annotation.get("fathmm_pred", "").split('%3B'))) if worst_annotation else "",
                    measureset_id,
                    clinvar_significance,
                    variant.annotation["freqs"].get("1kg_wgs_phase3", ""),
                    variant.annotation["freqs"].get("1kg_wgs_phase3_popmax", ""),
                    variant.annotation["freqs"].get("exac_v3", ""),
                    variant.annotation["freqs"].get("exac_v3_popmax", ""),
                    variant.annotation["freqs"].get("gnomad-exomes2", ""),
                    variant.annotation["freqs"].get("gnomad-genomes2", ""),
                    ", ".join(sorted(list(family_ids))),
                    ", ".join(all_genotypes_list),
                ])
                writer.writerow(row)

    outfile.close()
    print("Wrote out %s" % output_filename)
def handle(self, *args, **options):
    """Export a gzipped family_variants_<project_id>.tsv.gz report for one project.

    For each inheritance mode, writes one row per passing variant per family,
    with up to 10 per-individual genotype column groups. The asserts enforce
    that the upstream search already applied the module-level frequency and
    quality thresholds (g1k_*/exac_*/GQ/DP/AB_threshold - defined elsewhere
    in this file; TODO confirm).
    """
    if not args:
        sys.exit("ERROR: please specify project id on the command line")
    if len(args) > 1:
        sys.exit("ERROR: too many args: %s. Only one project id should be provided." % " ".join(args))
    project_id = args[0]

    # create family_variants.tsv
    family_variants_f = gzip.open("family_variants_%s.tsv.gz" % project_id, "w")
    writer = csv.writer(family_variants_f, dialect="excel", delimiter="\t")

    header_fields = [
        "#inheritance_mode", "project_id", "family_id", "gene", "chrom", "pos", "ref", "alt", "rsid",
        "annotation", "1kg_af", "1kg_popmax_af", "exac_af", "exac_popmax_af", "",
    ]
    # 8 genotype sub-columns repeated for up to 10 individuals per family
    genotype_headers = ["sample_id", "str", "num_alt", "allele_balance", "AD", "DP", "GQ", "PL"]
    for i in range(0, 10):
        for h in genotype_headers:
            header_fields.append("genotype%d_%s" % (i, h))
    writer.writerow(header_fields)
    family_variants_f.flush()

    for inheritance_mode in ["dominant", "homozygous_recessive", "compound_het", "de_novo", "x_linked_recessive"]:
        # collect the resources that we'll need here
        # NOTE(review): `annotator` is fetched but never used below
        annotator = mall.get_annotator()
        custom_population_store = mall.get_custom_population_store()

        project = Project.objects.get(project_id=project_id)
        families = project.get_families()

        # get the variants for this inheritance / project combination
        for i, (family, variant_list) in enumerate(get_variants_for_inheritance_for_project(project, inheritance_mode)):
            for variant in variant_list:
                # if variant.annotation['vep_group'] != "missense":
                #     continue
                # NOTE(review): `custom_populations` is fetched but never used below
                custom_populations = custom_population_store.get_frequencies(variant.xpos, variant.ref, variant.alt)
                g1k_freq = variant.annotation["freqs"]["1kg_wgs_phase3"]
                g1k_popmax_freq = variant.annotation["freqs"]["1kg_wgs_phase3_popmax"]
                exac_freq = variant.annotation["freqs"]["exac_v3"]
                exac_popmax_freq = variant.annotation["freqs"]["exac_v3_popmax"]
                # sanity-check that the search respected the frequency thresholds
                assert g1k_freq <= g1k_freq_threshold, "g1k freq %s > %s" % (g1k_freq, g1k_freq_threshold)
                assert g1k_popmax_freq <= g1k_popmax_freq_threshold, "g1k freq %s > %s" % (
                    g1k_popmax_freq,
                    g1k_popmax_freq_threshold,
                )
                assert exac_freq <= exac_freq_threshold, "Exac freq %s > %s" % (exac_freq, exac_freq_threshold)
                assert exac_popmax_freq <= exac_popmax_threshold, "Exac popmax freq %s > %s" % (
                    exac_popmax_freq,
                    exac_popmax_threshold,
                )

                row = [
                    inheritance_mode, project_id, family.family_id,
                    get_gene_symbol(variant), variant.chr, str(variant.pos), variant.ref, variant.alt,
                    variant.vcf_id, variant.annotation["vep_group"],
                    g1k_freq, g1k_popmax_freq, exac_freq, exac_popmax_freq, "",
                ]
                # NOTE(review): this `i` shadows the outer enumerate index (outer i is unused)
                for i, individual in enumerate(family.get_individuals()):
                    if i >= 10:
                        # header only has columns for 10 individuals
                        break
                    genotype = variant.get_genotype(individual.indiv_id)
                    if genotype is None:
                        print("WARNING: %s variant genotype for %s is None" % (variant, individual.indiv_id))
                        continue
                    # genotype-level quality checks; thresholds defined at module level
                    assert genotype.filter == "pass", "%s %s - filter is %s " % (
                        variant.chr,
                        variant.pos,
                        genotype.filter,
                    )
                    assert genotype.gq >= GQ_threshold, "%s %s - GQ is %s " % (
                        variant.chr,
                        variant.pos,
                        genotype.gq,
                    )
                    # NOTE(review): message says "GQ is" but this checks DP
                    assert genotype.extras["dp"] >= DP_threshold, "%s %s - GQ is %s " % (
                        variant.chr,
                        variant.pos,
                        genotype.extras["dp"],
                    )
                    if genotype.num_alt == 1:
                        # AB_threshold is expressed as a percentage, hence / 100.0
                        assert genotype.ab >= AB_threshold / 100.0, "%s %s - AB is %s " % (
                            variant.chr,
                            variant.pos,
                            genotype.ab,
                        )
                    genotype_str = "/".join(genotype.alleles) if genotype.alleles else "./."
                    row.extend(
                        [
                            individual.indiv_id,
                            genotype_str,
                            genotype.num_alt,
                            genotype.ab,
                            genotype.extras["ad"],
                            genotype.extras["dp"],
                            genotype.gq,
                            genotype.extras["pl"],
                        ]
                    )
                writer.writerow(row)
        family_variants_f.flush()
    family_variants_f.close()
def load_project_variants(project_id, force_annotations=False, ignore_csq_in_vcf=False): """ Load any families and cohorts in this project that aren't loaded already """ print "Loading project %s" % project_id print( date.strftime( datetime.now(), "%m/%d/%Y %H:%M:%S -- loading project: " + project_id + " - db.variants cache")) os.system("du /mongo/mongodb") project = Project.objects.get(project_id=project_id) for vcf_obj in project.get_all_vcf_files(): r = vcf.VCFReader(filename=vcf_obj.path()) if not ignore_csq_in_vcf and "CSQ" in r.infos: mall.get_annotator().add_preannotated_vcf_file( vcf_obj.path(), force=force_annotations) else: mall.get_annotator().add_vcf_file_to_annotator( vcf_obj.path(), force_all=force_annotations) # batch load families by VCF file for vcf_file, families in project.families_by_vcf().items(): families = [ f for f in families if get_mall(project.project_id).variant_store.get_family_status( project_id, f.family_id) != 'loaded' ] for i in xrange(0, len(families), settings.FAMILY_LOAD_BATCH_SIZE): print( date.strftime( datetime.now(), "%m/%d/%Y %H:%M:%S -- loading project: " + project_id + " - families batch %d - %d families" % (i, len(families[i:i + settings.FAMILY_LOAD_BATCH_SIZE])))) load_variants_for_family_list( project, families[i:i + settings.FAMILY_LOAD_BATCH_SIZE], vcf_file) # now load cohorts print( date.strftime( datetime.now(), "%m/%d/%Y %H:%M:%S -- loading project: " + project_id + " - cohorts")) # TODO: load cohorts and families together print( date.strftime( datetime.now(), "%m/%d/%Y %H:%M:%S -- loading project: " + project_id + " - cohorts")) os.system("du /mongo/mongodb") for vcf_file, cohorts in project.cohorts_by_vcf().items(): cohorts = [ c for c in cohorts if get_mall(project.project_id).variant_store.get_family_status( project_id, c.cohort_id) != 'loaded' ] for i in xrange(0, len(cohorts), settings.FAMILY_LOAD_BATCH_SIZE): print("Loading project %s - cohorts: %s" % (project_id, cohorts[i:i + 
settings.FAMILY_LOAD_BATCH_SIZE])) load_variants_for_cohort_list( project, cohorts[i:i + settings.FAMILY_LOAD_BATCH_SIZE]) print( date.strftime( datetime.now(), "%m/%d/%Y %H:%M:%S -- finished loading project: " + project_id))