Example #1
0
def get_variants_for_inheritance_for_project(project, inheritance_mode):
    """
    Generate the variants for this project / inheritance combo.

    This is a generator: it yields one (family, variant_list) tuple per
    family returned by project.get_families(). Progress is written to
    stdout; the " got N variants" suffix is printed only when the caller
    resumes the generator after the corresponding yield.
    """
    # Search specification: the default 'moderate_impact' filter extended with
    # the module-level reference frequency thresholds.
    # This could theoretically differ by project, if there are different reference populations
    search_filter = get_default_variant_filter('moderate_impact')
    for population, threshold in (
            ('1kg_wgs_phase3', g1k_freq_threshold),
            ('1kg_wgs_phase3_popmax', g1k_popmax_freq_threshold),
            ('exac_v3', exac_freq_threshold),
            ('exac_v3_popmax', exac_popmax_threshold),
    ):
        search_filter.ref_freqs.append((population, threshold))

    genotype_quality = dict(
        vcf_filter='pass',
        min_gq=GQ_threshold,
        min_ab=AB_threshold,
    )

    # Run the inheritance search family by family and hand each result back
    # as soon as it is available.
    families = project.get_families()
    for index, family in enumerate(families):
        progress = "Processing %s - family %s  (%d / %d) .." % (
            inheritance_mode, family.family_id, index + 1, len(families))
        sys.stdout.write(progress)
        matches = get_variants_with_inheritance_mode(
            get_mall(project.project_id),
            family.xfamily(),
            inheritance_mode,
            variant_filter=search_filter,
            quality_filter=genotype_quality,
        )
        variant_list = list(matches)
        yield family, variant_list
        print(" got %d variants" % len(variant_list))
Example #2
0
def family_group_gene(request, project_id, family_group_slug, gene_id):
    """
    Render the gene page for a family group.

    Looks up the project and the family group (404 if either is missing),
    returns a plain 'unauthorized' response when the user cannot view the
    project, and otherwise renders 'family_group/family_group_gene.html'
    with the gene and the result of family_group_analysis.get_variants_in_gene
    for the 'all_coding' variant filter.
    """
    project = get_object_or_404(Project, project_id=project_id)
    family_group = get_object_or_404(
        FamilyGroup, project=project, slug=family_group_slug)
    if not project.can_view(request.user):
        return HttpResponse('unauthorized')

    # Normalise the gene string from the URL to a gene id, then load the gene.
    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)

    population_slugs = mall.get_annotator().reference_population_slugs
    coding_filter = get_default_variant_filter('all_coding', population_slugs)
    variants_by_family = family_group_analysis.get_variants_in_gene(
        family_group, gene_id, variant_filter=coding_filter)

    template_context = {
        'project': project,
        'family_group': family_group,
        'family_group_json': json.dumps(family_group.toJSON()),
        'gene_json': json.dumps(gene),
        'gene': gene,
        'variants_by_family_json': json.dumps(variants_by_family),
    }
    return render(request, 'family_group/family_group_gene.html', template_context)
Example #3
0
def get_knockouts_in_gene(project, gene_id, quality_filter=None):
    """
    Find the individuals in a project with recessive variation in a gene.

    Args:
        project: project whose individuals and variant datastore are searched.
        gene_id: id of the gene to search.
        quality_filter: optional genotype quality filter forwarded to
            CohortGeneVariation. When omitted, an empty filter is used
            (no genotype is excluded on quality grounds).

    Returns:
        A (knockouts, variation) tuple: the result of
        get_individuals_with_inheritance('recessive', ...) and the
        CohortGeneVariation object it was computed from.
    """
    indiv_id_list = [i.indiv_id for i in project.get_individuals()]

    # Restrict to 'moderate_impact' variants, filtered against every reference
    # population known to the annotator.
    reference_populations = mall.get_annotator().reference_population_slugs
    variant_filter = get_default_variant_filter('moderate_impact',
                                                reference_populations)
    datastore = get_project_datastore(project.project_id)
    variant_list = datastore.get_project_variants_in_gene(
        project.project_id,
        gene_id,
        variant_filter=variant_filter,
    )
    variant_list = search_utils.filter_gene_variants_by_variant_filter(
        variant_list, gene_id, variant_filter)

    # Honour the caller's quality filter; previously this argument was
    # accepted but an empty filter was always used.
    variation = CohortGeneVariation(
        get_reference(),
        gene_id,
        variant_list,
        indiv_id_list,
        quality_filter=quality_filter if quality_filter is not None else {},
    )
    knockouts = get_individuals_with_inheritance('recessive', variation,
                                                 indiv_id_list)
    return knockouts, variation
def get_variants_for_inheritance_for_project(project, inheritance_mode):
    """
    Generate per-family variant lists for one project and inheritance mode.

    Yields (family, variant_list) tuples, one for every family returned by
    project.get_families(). A progress message is written to stdout before
    each search; its " got N variants" suffix is printed after the caller
    resumes the generator.
    """
    # Reference populations and their thresholds added on top of the default
    # "moderate_impact" filter.
    # This could theoretically differ by project, if there are different reference populations
    frequency_cutoffs = [
        ("1kg_wgs_phase3", g1k_freq_threshold),
        ("1kg_wgs_phase3_popmax", g1k_popmax_freq_threshold),
        ("exac_v3", exac_freq_threshold),
        ("exac_v3_popmax", exac_popmax_threshold),
    ]
    variant_filter = get_default_variant_filter("moderate_impact")
    for cutoff in frequency_cutoffs:
        variant_filter.ref_freqs.append(cutoff)

    quality_filter = {
        "vcf_filter": "pass",
        "min_gq": GQ_threshold,
        "min_ab": AB_threshold,
    }

    # One inheritance search per family; results are yielded as they complete.
    families = project.get_families()
    for position, family in enumerate(families, 1):
        sys.stdout.write(
            "Processing %s - family %s  (%d / %d) .."
            % (inheritance_mode, family.family_id, position, len(families))
        )
        search_results = get_variants_with_inheritance_mode(
            get_mall(project.project_id),
            family.xfamily(),
            inheritance_mode,
            variant_filter=variant_filter,
            quality_filter=quality_filter,
        )
        family_variants = list(search_results)
        yield family, family_variants
        print(" got %d variants" % len(family_variants))
Example #5
0
def gene_quicklook(request, project_id, gene_id):
    """
    Summary of a gene in a project.

    Returns a plain "Unauthorized" response when the user cannot view the
    project. Otherwise renders 'project/gene_quicklook.html' with:
      * the 'all_coding' variants in the gene whose alt allele count is at
        most 0.2 * (individuals with variant data) + 5 and whose highest
        annotated frequency is below .01, and
      * the knockout individuals reported by get_knockouts_in_gene, each
        with their relevant variants.
    """
    project = get_object_or_404(Project, project_id=project_id)
    if not project.can_view(request.user):
        return HttpResponse("Unauthorized")

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)
    search_note = " - staring gene search for: %s %s \n" % (gene_id, gene)
    sys.stderr.write(project_id + search_note)

    coding_filter = get_default_variant_filter(
        'all_coding', mall.get_annotator().reference_population_slugs)
    individuals_with_data = [
        indiv for indiv in project.get_individuals() if indiv.has_variant_data()
    ]
    aac_threshold = (.2 * len(individuals_with_data)) + 5

    # Keep only variants that are rare both within the project and in the
    # annotated frequencies.
    rare_variants = []
    candidates = project_analysis.get_variants_in_gene(
        project, gene_id, variant_filter=coding_filter)
    for candidate in candidates:
        aac = get_alt_allele_count(candidate)
        max_af = max(candidate.annotation['freqs'].values())
        if not (aac <= aac_threshold and max_af < .01):
            continue
        rare_variants.append(candidate)

    add_extra_info_to_variants_project(get_reference(), project, rare_variants)

    # Collect the knockout individuals together with their annotated variants.
    knockouts = []
    knockout_ids, variation = get_knockouts_in_gene(project, gene_id)
    for knockout_id in knockout_ids:
        knockout_variants = variation.get_relevant_variants_for_indiv_ids([knockout_id])
        add_extra_info_to_variants_project(get_reference(), project, knockout_variants)
        knockout_entry = {
            'indiv_id': knockout_id,
            'variants': [v.toJSON() for v in knockout_variants],
        }
        knockouts.append(knockout_entry)

    sys.stderr.write("Retrieved %s variants \n" % len(rare_variants))

    template_context = {
        'gene': gene,
        'gene_json': json.dumps(gene),
        'project': project,
        'rare_variants_json': json.dumps([v.toJSON() for v in rare_variants]),
        'individuals_json': json.dumps(
            [indiv.get_json_obj() for indiv in project.get_individuals()]),
        'knockouts_json': json.dumps(knockouts),
    }
    return render(request, 'project/gene_quicklook.html', template_context)
Example #6
0
def inheritance_matrix_for_gene(project, gene_id):
    """
    Run get_family_matrix_for_gene over the active families of this project.

    Uses the default 'moderate_impact' variant filter and 'high_quality'
    quality filter, and returns whatever get_family_matrix_for_gene returns.
    """
    variant_filter = get_default_variant_filter(
        'moderate_impact',
        mall.get_annotator().reference_population_slugs,
    )
    quality_filter = get_default_quality_filter(
        'high_quality',
        mall.get_annotator().reference_population_slugs,
    )
    datastores = get_mall()
    active_xfamilies = [family.xfamily() for family in project.get_active_families()]
    return get_family_matrix_for_gene(
        datastores, active_xfamilies, gene_id, variant_filter, quality_filter)
Example #7
0
def get_variants_for_inheritance_for_project(project, inheritance_mode):
    """
    Generate the variants for this project / inheritance combo.

    Yields one (family, variant_list) tuple per family. With the
    "all_variants" mode the family's variants are fetched through
    get_variants; any other mode goes through
    get_variants_with_inheritance_mode. A family whose search raises
    ValueError is reported on stdout and skipped.
    """
    # Search specification: default 'moderate_impact' filter plus the
    # module-level reference frequency thresholds.
    # This could theoretically differ by project, if there are different reference populations
    variant_filter = get_default_variant_filter('moderate_impact')
    for population, threshold in (
            ('1kg_wgs_phase3', g1k_freq_threshold),
            ('1kg_wgs_phase3_popmax', g1k_popmax_freq_threshold),
            ('exac_v3', exac_freq_threshold),
            ('exac_v3_popmax', exac_popmax_threshold),
            ('merck-wgs-3793', merck_wgs_3793_threshold),
    ):
        variant_filter.ref_freqs.append((population, threshold))

    # Only genotype quality and allele balance are constrained here; no
    # 'vcf_filter' entry is set.
    quality_filter = dict(min_gq=GQ_threshold, min_ab=AB_threshold)
    shared_filters = dict(variant_filter=variant_filter, quality_filter=quality_filter)

    families = project.get_families()
    for position, family in enumerate(families, 1):
        print("Processing %s - family %s  (%d / %d)" %
              (inheritance_mode, family.family_id, position, len(families)))
        try:
            if inheritance_mode == "all_variants":
                found = get_variants(
                    get_datastore(project.project_id),
                    family.xfamily(),
                    indivs_to_consider=family.indiv_id_list(),
                    **shared_filters)
            else:
                found = get_variants_with_inheritance_mode(
                    get_mall(project.project_id),
                    family.xfamily(),
                    inheritance_mode,
                    **shared_filters)
            yield family, list(found)
        except ValueError as e:
            print("Error: %s. Skipping family %s" % (str(e), str(family)))
Example #8
0
def inheritance_matrix_for_gene(project, gene_id):
    """
    Run get_family_matrix_for_gene for the active families in this project.

    The search uses the default 'moderate_impact' variant filter, the default
    'high_quality' quality filter and the mall obtained for this project's id.
    """
    variant_filter = get_default_variant_filter(
        'moderate_impact', mall.get_annotator().reference_population_slugs)
    quality_filter = get_default_quality_filter(
        'high_quality', mall.get_annotator().reference_population_slugs)

    project_mall = get_mall(project.project_id)
    active_xfamilies = [family.xfamily() for family in project.get_active_families()]
    return get_family_matrix_for_gene(
        project_mall,
        active_xfamilies,
        gene_id,
        variant_filter,
        quality_filter,
    )
def get_variants_for_inheritance_for_project(project, inheritance_mode):
    """
    Generate per-family variant lists for one project and inheritance mode.

    Yields (family, variant_list) tuples. The "all_variants" mode lists the
    family's variants via get_variants; every other mode is delegated to
    get_variants_with_inheritance_mode. When a search raises ValueError the
    error is printed and that family is skipped.
    """
    # Reference populations and thresholds layered on the default
    # 'moderate_impact' filter.
    # This could theoretically differ by project, if there are different reference populations
    frequency_cutoffs = [
        ('1kg_wgs_phase3', g1k_freq_threshold),
        ('1kg_wgs_phase3_popmax', g1k_popmax_freq_threshold),
        ('exac_v3', exac_freq_threshold),
        ('exac_v3_popmax', exac_popmax_threshold),
        ('merck-wgs-3793', merck_wgs_3793_threshold),
    ]
    variant_filter = get_default_variant_filter('moderate_impact')
    for cutoff in frequency_cutoffs:
        variant_filter.ref_freqs.append(cutoff)

    # No 'vcf_filter' entry is set: only GQ and allele balance are constrained.
    quality_filter = {
        'min_gq': GQ_threshold,
        'min_ab': AB_threshold,
    }

    families = project.get_families()
    for index, family in enumerate(families):
        progress = "Processing %s - family %s  (%d / %d)" % (
            inheritance_mode, family.family_id, index + 1, len(families))
        print(progress)
        try:
            if inheritance_mode == "all_variants":
                search_results = get_variants(
                    get_datastore(project.project_id),
                    family.xfamily(),
                    variant_filter=variant_filter,
                    quality_filter=quality_filter,
                    indivs_to_consider=family.indiv_id_list(),
                )
            else:
                search_results = get_variants_with_inheritance_mode(
                    get_mall(project.project_id),
                    family.xfamily(),
                    inheritance_mode,
                    variant_filter=variant_filter,
                    quality_filter=quality_filter,
                )
            yield family, list(search_results)
        except ValueError as e:
            print("Error: %s. Skipping family %s" % (str(e), str(family)))
Example #10
0
def get_knockouts_in_gene(project, gene_id, gene_variants):
    """
    Find the individuals in a project with recessive variation in a gene.

    gene_variants is the already-fetched list of the project's variants in
    the gene; it is narrowed with the default 'moderate_impact' variant
    filter before the cohort analysis. No genotype quality filter is applied
    (an empty filter is passed to CohortGeneVariation).

    Returns a (knockouts, variation) tuple: the result of
    get_individuals_with_inheritance('recessive', ...) and the
    CohortGeneVariation it was computed from.
    """
    individual_ids = [individual.indiv_id for individual in project.get_individuals()]

    # Filter against every reference population known to the annotator.
    population_slugs = mall.get_annotator().reference_population_slugs
    impact_filter = get_default_variant_filter('moderate_impact', population_slugs)
    filtered_variants = search_utils.filter_gene_variants_by_variant_filter(
        gene_variants, gene_id, impact_filter)

    variation = CohortGeneVariation(
        get_reference(),
        gene_id,
        filtered_variants,
        individual_ids,
        quality_filter={},
    )
    return (
        get_individuals_with_inheritance('recessive', variation, individual_ids),
        variation,
    )
Example #11
0
def gene_quicklook(request, project_id, gene_id):
    """
    Summary of a gene in a project.

    Responds with a plain "Unauthorized" body if the user may not view the
    project. Otherwise renders 'project/gene_quicklook.html' with the rare
    'all_coding' variants in the gene (alt allele count no greater than
    0.2 * individuals-with-variant-data + 5, and highest annotated frequency
    below .01) and with the knockout individuals from get_knockouts_in_gene.
    """
    project = get_object_or_404(Project, project_id=project_id)
    if not project.can_view(request.user):
        return HttpResponse("Unauthorized")

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)
    sys.stderr.write(
        project_id + " - staring gene search for: %s %s \n" % (gene_id, gene)
    )

    variant_filter = get_default_variant_filter(
        'all_coding',
        mall.get_annotator().reference_population_slugs,
    )
    num_indivs = len([
        individual
        for individual in project.get_individuals()
        if individual.has_variant_data()
    ])
    aac_threshold = (.2 * num_indivs) + 5

    # Select the variants that are rare in the project and in the annotations.
    rare_variants = []
    gene_variants = project_analysis.get_variants_in_gene(
        project, gene_id, variant_filter=variant_filter)
    for gene_variant in gene_variants:
        alt_allele_count = get_alt_allele_count(gene_variant)
        highest_freq = max(gene_variant.annotation['freqs'].values())
        is_rare = alt_allele_count <= aac_threshold and highest_freq < .01
        if is_rare:
            rare_variants.append(gene_variant)

    add_extra_info_to_variants_project(get_reference(), project, rare_variants)

    # One entry per knockout individual, with that individual's variants.
    knockout_ids, variation = get_knockouts_in_gene(project, gene_id)
    knockouts = []
    for indiv_id in knockout_ids:
        indiv_variants = variation.get_relevant_variants_for_indiv_ids([indiv_id])
        add_extra_info_to_variants_project(get_reference(), project, indiv_variants)
        knockouts.append({
            'indiv_id': indiv_id,
            'variants': [v.toJSON() for v in indiv_variants],
        })

    sys.stderr.write("Retrieved %s variants \n" % len(rare_variants))

    page_data = {
        'gene': gene,
        'gene_json': json.dumps(gene),
        'project': project,
        'rare_variants_json': json.dumps([v.toJSON() for v in rare_variants]),
        'individuals_json': json.dumps(
            [individual.get_json_obj() for individual in project.get_individuals()]
        ),
        'knockouts_json': json.dumps(knockouts),
    }
    return render(request, 'project/gene_quicklook.html', page_data)
Example #12
0
def family_group_gene(request, project_id, family_group_slug, gene_id):
    """
    Show the variants in one gene for a family group.

    The project and family group are looked up with get_object_or_404. A
    user who cannot view the project gets a plain 'unauthorized' response;
    everyone else gets 'family_group/family_group_gene.html' rendered with
    the gene and the 'all_coding' variants returned by
    family_group_analysis.get_variants_in_gene.
    """
    project = get_object_or_404(Project, project_id=project_id)
    family_group = get_object_or_404(
        FamilyGroup,
        project=project,
        slug=family_group_slug,
    )
    if not project.can_view(request.user):
        return HttpResponse('unauthorized')

    # Resolve the gene string from the URL, then load the gene record.
    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)

    varfilter = get_default_variant_filter(
        'all_coding',
        mall.get_annotator().reference_population_slugs,
    )
    variants_by_family = family_group_analysis.get_variants_in_gene(
        family_group,
        gene_id,
        variant_filter=varfilter,
    )

    page_data = {
        'project': project,
        'family_group': family_group,
        'family_group_json': json.dumps(family_group.toJSON()),
        'gene_json': json.dumps(gene),
        'gene': gene,
        'variants_by_family_json': json.dumps(variants_by_family),
    }
    return render(request, 'family_group/family_group_gene.html', page_data)
Example #13
0
def get_knockouts_in_gene(project, gene_id, quality_filter=None):
    """
    Find the individuals in a project with recessive variation in a gene.

    Args:
        project: project whose individuals and variants are searched.
        gene_id: id of the gene to search.
        quality_filter: optional genotype quality filter forwarded to
            CohortGeneVariation. When omitted, an empty filter is used
            (no genotype is excluded on quality grounds).

    Returns:
        A (knockouts, variation) tuple: the result of
        get_individuals_with_inheritance('recessive', ...) and the
        CohortGeneVariation object it was computed from.
    """
    indiv_id_list = [i.indiv_id for i in project.get_individuals()]

    # Only 'high_impact' variants are considered.
    variant_filter = get_default_variant_filter('high_impact')
    variant_list = get_project_datastore().get_variants_in_gene(
        project.project_id,
        gene_id,
        variant_filter=variant_filter,
    )
    variant_list = search_utils.filter_gene_variants_by_variant_filter(variant_list, gene_id, variant_filter)

    # Honour the caller's quality filter; previously this argument was
    # accepted but an empty filter was always used.
    variation = CohortGeneVariation(
        get_reference(),
        gene_id,
        variant_list,
        indiv_id_list,
        quality_filter=quality_filter if quality_filter is not None else {},
    )
    knockouts = get_individuals_with_inheritance('recessive', variation, indiv_id_list)
    return knockouts, variation
Example #14
0
def get_knockouts_in_gene(project, gene_id, quality_filter=None):
    """
    Find the individuals in a project with recessive variation in a gene.

    Args:
        project: project whose individuals and variant datastore are searched.
        gene_id: id of the gene to search.
        quality_filter: optional genotype quality filter forwarded to
            CohortGeneVariation. When omitted, an empty filter is used
            (no genotype is excluded on quality grounds).

    Returns:
        A (knockouts, variation) tuple: the result of
        get_individuals_with_inheritance('recessive', ...) and the
        CohortGeneVariation object it was computed from.
    """
    indiv_id_list = [i.indiv_id for i in project.get_individuals()]

    # Only 'high_impact' variants are considered.
    variant_filter = get_default_variant_filter('high_impact')
    datastore = get_project_datastore(project.project_id)
    variant_list = datastore.get_project_variants_in_gene(
        project.project_id,
        gene_id,
        variant_filter=variant_filter,
    )
    variant_list = search_utils.filter_gene_variants_by_variant_filter(
        variant_list, gene_id, variant_filter)

    # Honour the caller's quality filter; previously this argument was
    # accepted but an empty filter was always used.
    variation = CohortGeneVariation(
        get_reference(),
        gene_id,
        variant_list,
        indiv_id_list,
        quality_filter=quality_filter if quality_filter is not None else {},
    )
    knockouts = get_individuals_with_inheritance('recessive', variation,
                                                 indiv_id_list)
    return knockouts, variation
Example #15
0
def gene_quicklook(request, project_id, gene_id):
    """
    Summary of a gene in a project, optionally searched across several projects.

    Renders 'project/gene_quicklook.html' with the rare coding variants and
    the knockout individuals for the gene, or returns the same data as a CSV
    attachment when a download is requested.

    Args:
        request: the HTTP request. GET parameters read:
            selected_projects - comma-separated project ids to search instead
                of only the main project. Every id must be among the loaded
                projects returned by get_loaded_projects_for_user, otherwise
                "Unauthorized" is returned.
            download - 'knockouts' or 'rare_variants' to get a CSV. Any other
                non-empty value is rejected with HTTP 400.
        project_id: id of the main project (404 if it does not exist).
        gene_id: gene identifier string, or None to render the empty page.

    Returns:
        An HttpResponse: the rendered page, the CSV attachment, a plain
        "Unauthorized" body, or a 400 for an unknown download type.
    """
    main_project = get_object_or_404(Project, project_id=project_id)
    if not main_project.can_view(request.user):
        return HttpResponse("Unauthorized")

    seqr_project = main_project.seqr_project
    if seqr_project and seqr_project.has_new_search:
        new_page_url = '/variant_search/project/{}'.format(seqr_project.guid)
    else:
        new_page_url = None

    # other projects this user can view
    other_projects = get_loaded_projects_for_user(request.user, fields=['project_id', 'project_name'])

    if other_projects:
        other_projects_json = json.dumps([
            {'project_id': p.project_id, 'project_name': p.project_name}
            for p in sorted(other_projects, key=lambda p: p.project_id.lower())
        ])
    else:
        other_projects_json = None

    if gene_id is None:
        return render(request, 'project/gene_quicklook.html', {
            'project': main_project,
            'gene': None,
            'gene_json': None,
            'rare_variants_json': None,
            'individuals_json': None,
            'knockouts_json': None,
            'other_projects_json': other_projects_json,
            'new_page_url': new_page_url,
        })

    projects_to_search_param = request.GET.get('selected_projects')
    if projects_to_search_param:
        project_ids = projects_to_search_param.split(",")
        projects_to_search = [project for project in other_projects if project.project_id in project_ids]
        if len(projects_to_search) < len(project_ids):
            # If not all the specified project ids are in the other projects list then they are not authorized
            return HttpResponse("Unauthorized")
    else:
        project_ids = [main_project.project_id]
        projects_to_search = [main_project]

    # Reject unknown download types up front. Previously they crashed with a
    # NameError after the whole search had run. This also guarantees that only
    # the two known values ever reach the Content-Disposition header.
    download_csv = request.GET.get('download', '')
    if download_csv and download_csv not in ('knockouts', 'rare_variants'):
        return HttpResponse("Unknown download type", status=400)

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)

    # all rare coding variants
    variant_filter = get_default_variant_filter('all_coding', mall.get_annotator().reference_population_slugs)

    indiv_id_to_project_id = {}
    rare_variant_dict = {}
    rare_variants = []
    individ_ids_and_variants = []
    for project in projects_to_search:
        all_project_variants = project_analysis.get_variants_in_gene(project, gene_id, variant_filter=variant_filter)

        # compute knockout individuals
        knockout_ids, variation = get_knockouts_in_gene(project, gene_id, all_project_variants)
        for indiv_id in knockout_ids:
            # Make sure every knockout individual has a project label for the
            # CSV header, even if none of the retained rare variants lists it.
            indiv_id_to_project_id.setdefault(indiv_id, project.project_id)
            individ_ids_and_variants.append({
                'indiv_id': indiv_id,
                'variants': variation.get_relevant_variants_for_indiv_ids([indiv_id]),
            })

        # compute rare variants
        project_variants = []
        for variant in all_project_variants:
            # skip variants that no individual actually carries
            if not any(genotype.num_alt > 0 for genotype in variant.genotypes.values()):
                continue

            # don't filter on within-cohort AF
            reference_freqs = [freq for label, freq in variant.annotation['freqs'].items() if label != "AF"]
            # a variant without any reference frequency cannot be shown to be common
            max_af = max(reference_freqs) if reference_freqs else 0
            if max_af >= .01:
                continue

            # add project id to genotypes
            for indiv_id in variant.genotypes:
                indiv_id_to_project_id[indiv_id] = project.project_id

            # save this variant (or just its genotypes if the variant has already been seen in another project)
            variant_id = "%s-%s-%s-%s" % (variant.chr, variant.pos, variant.ref, variant.alt)
            if variant_id not in rare_variant_dict:
                rare_variant_dict[variant_id] = variant
                project_variants.append(variant)
            else:
                known_genotypes = rare_variant_dict[variant_id].genotypes
                for indiv_id, genotype in variant.genotypes.items():
                    existing_genotype = known_genotypes.get(indiv_id)
                    if not existing_genotype or existing_genotype.num_alt == -1:
                        known_genotypes[indiv_id] = genotype
        if project != main_project:
            add_extra_info_to_variants_project(get_reference(), project, project_variants)
        rare_variants.extend(project_variants)

    knockout_variants = [v for entry in individ_ids_and_variants for v in entry['variants']]
    all_variants = rare_variants + knockout_variants
    add_extra_info_to_variants_project(get_reference(), main_project, all_variants, add_family_tags=True)

    if download_csv:
        if download_csv == 'knockouts':
            individuals_to_include = [entry["indiv_id"] for entry in individ_ids_and_variants]
            csv_variants = knockout_variants
        else:  # 'rare_variants' (validated above)
            # individuals carrying at least one rare variant, in first-seen order
            individuals_to_include = []
            seen_indiv_ids = set()
            for variant in rare_variants:
                for indiv_id, genotype in variant.genotypes.items():
                    if genotype.num_alt > 0 and indiv_id not in seen_indiv_ids:
                        seen_indiv_ids.add(indiv_id)
                        individuals_to_include.append(indiv_id)
            csv_variants = rare_variants

        header = ["gene", "chr", "pos", "ref", "alt", "rsID", "impact",
                  "HGVS.c", "HGVS.p", "sift", "polyphen", "muttaster", "fathmm", "clinvar_id", "clinvar_clinical_sig",
                  "freq_1kg_wgs_phase3", "freq_1kg_wgs_phase3_popmax",
                  "freq_exac_v3", "freq_exac_v3_popmax",
                  "freq_gnomad_exomes", "freq_gnomad_exomes_popmax",
                  "freq_gnomad_genomes", "freq_gnomad_genomes_popmax",
                  "all_genotypes"]
        header += [i + " (from %s)" % indiv_id_to_project_id[i] for i in individuals_to_include]

        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="{}_{}.csv"'.format(
            download_csv, gene.get("symbol") or gene.get("transcript_name"))

        writer = csv.writer(response)
        writer.writerow(header)
        for variant in csv_variants:
            row = _gene_quicklook_csv_row(gene, gene_id, variant, individuals_to_include)
            writer.writerow([str(value) for value in row])
        return response

    # HTML page: knockout variants are serialised in place
    for individ_id_and_variants in individ_ids_and_variants:
        variants = individ_id_and_variants["variants"]
        individ_id_and_variants["variants"] = [v.toJSON() for v in variants]

    individ_ids = {i['indiv_id'] for i in individ_ids_and_variants}
    for var in rare_variants:
        individ_ids.update(var.genotypes.keys())
    individuals = Individual.objects.filter(
        indiv_id__in=individ_ids, project__project_id__in=project_ids
    ).select_related('project').select_related('family').only('project__project_id', 'family__family_id', *Individual.INDIVIDUAL_JSON_FIELDS_NO_IDS)

    return render(request, 'project/gene_quicklook.html', {
        'gene': gene,
        'gene_json': json.dumps(gene),
        'project': main_project,
        'rare_variants_json': json.dumps([v.toJSON() for v in rare_variants]),
        'individuals_json': json.dumps([i.get_json_obj(skip_has_variant_data=True) for i in individuals]),
        'knockouts_json': json.dumps(individ_ids_and_variants),
        'other_projects_json': other_projects_json,
        'new_page_url': new_page_url,
    })


def _gene_quicklook_csv_row(gene, gene_id, variant, individuals_to_include):
    """
    Build one gene-quicklook CSV row (a list of raw values) for a variant.

    The row holds the fixed annotation columns followed by one genotype cell
    per id in individuals_to_include. A cell is empty when that individual
    does not carry an alt allele of the variant.
    """
    annotation = variant.annotation
    worst_annotation_idx = annotation["worst_vep_index_per_gene"][gene_id]
    worst_annotation = annotation["vep_annotation"][worst_annotation_idx]
    freqs = annotation["freqs"]

    # Prefer ClinVar info already attached to the variant; otherwise look it up.
    if 'clinvar_allele_id' in variant.extras:
        measureset_id = variant.extras['clinvar_allele_id']
        clinvar_significance = variant.extras['clinvar_clinsig']
    else:
        measureset_id, clinvar_significance = get_reference().get_clinvar_info(*variant.unique_tuple())

    genotypes = []
    carrier_summaries = []
    for indiv_id in individuals_to_include:
        if indiv_id in variant.genotypes and variant.genotypes[indiv_id].num_alt > 0:
            genotype = variant.genotypes[indiv_id]
            allele_string = ">".join(genotype.alleles)
            carrier_summaries.append(indiv_id + ":" + allele_string + "  ")
            genotypes.append(allele_string + "   (" + str(genotype.gq) + ")")
        else:
            genotypes.append("")
    all_genotypes_string = "".join(carrier_summaries)

    return [
        gene["symbol"],
        variant.chr,
        variant.pos,
        variant.ref,
        variant.alt,
        variant.vcf_id or annotation.get("rsid") or "",
        annotation.get("vep_consequence") or "",
        worst_annotation.get("hgvsc") or "",
        (worst_annotation.get("hgvsp") or "").replace("%3D", "="),
        annotation.get("sift") or "",
        annotation.get("polyphen") or "",
        annotation.get("mutationtaster_pred") or annotation.get("muttaster") or "",
        (";".join(set((worst_annotation.get("fathmm_pred") or "").split('%3B')))) or annotation.get("fathmm") or "",

        measureset_id or "",
        clinvar_significance or "",

        freqs.get("1kg_wgs_phase3") or freqs.get("1kg_wgs_AF") or "",
        freqs.get("1kg_wgs_phase3_popmax") or freqs.get("1kg_wgs_popmax_AF") or "",
        freqs.get("exac_v3") or freqs.get("exac_v3_AF") or "",
        freqs.get("exac_v3_popmax") or freqs.get("exac_v3_popmax_AF") or "",
        freqs.get("gnomad_exomes_AF") or "",
        freqs.get("gnomad_exomes_popmax_AF") or "",
        freqs.get("gnomad_genomes_AF") or "",
        freqs.get("gnomad_genomes_popmax_AF") or "",
        all_genotypes_string,
    ] + genotypes
Example #16
0
def gene_quicklook(request, project_id, gene_id):
    """
    Summary of a gene in a project

    Renders 'project/gene_quicklook.html' for the gene, or - when the
    'download' GET parameter is 'knockouts' or 'rare_variants' - returns that
    table as a CSV attachment. Any other non-empty 'download' value gets a
    400 response.

    The optional 'selected_projects' GET parameter is a comma-separated list
    of project ids to search instead of just project_id; the user must be
    allowed to view each of them.
    """
    # Kept under its own name so that the per-project loops further down
    # cannot rebind it before it is handed to the template.
    main_project = get_object_or_404(Project, project_id=project_id)
    if not main_project.can_view(request.user):
        return HttpResponse("Unauthorized")

    if main_project.project_status == Project.NEEDS_MORE_PHENOTYPES and not request.user.is_staff:
        return render(request, 'analysis_unavailable.html',
                      {'reason': 'Awaiting phenotype data.'})

    # other projects this user can view
    if request.user.is_staff:
        viewable_projects = list(Project.objects.all())
    else:
        viewable_projects = [
            c.project
            for c in ProjectCollaborator.objects.filter(user=request.user)
        ]

    # Built as a real list: a lazy filter() object is always truthy on
    # Python 3, which would make the emptiness check below meaningless.
    other_projects = [
        p for p in viewable_projects
        if get_project_datastore(p.project_id).project_collection_is_loaded(
            p.project_id)
    ]

    if other_projects:
        other_projects_json = json.dumps([{
            'project_id': p.project_id,
            'project_name': p.project_name
        } for p in sorted(other_projects, key=lambda p: p.project_id)])
    else:
        other_projects_json = None

    if gene_id is None:
        return render(
            request, 'project/gene_quicklook.html', {
                'project': main_project,
                'gene': None,
                'gene_json': None,
                'rare_variants_json': None,
                'individuals_json': None,
                'knockouts_json': None,
                'other_projects_json': other_projects_json,
            })

    projects_to_search_param = request.GET.get('selected_projects')
    if projects_to_search_param:
        projects_to_search = []
        for selected_project_id in projects_to_search_param.split(","):
            selected_project = get_object_or_404(
                Project, project_id=selected_project_id)
            if not selected_project.can_view(request.user):
                return HttpResponse("Unauthorized")
            projects_to_search.append(selected_project)
    else:
        projects_to_search = [main_project]

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)
    sys.stderr.write(
        project_id + " - staring gene search for: %s in projects: %s\n" %
        (gene_id, ",".join([p.project_id for p in projects_to_search]) + "\n"))

    # all rare coding variants
    variant_filter = get_default_variant_filter(
        'all_coding',
        mall.get_annotator().reference_population_slugs)

    indiv_id_to_project_id = {}
    rare_variant_dict = {}
    rare_variants = []
    for searched_project in projects_to_search:
        project_variants = []
        for variant in project_analysis.get_variants_in_gene(
                searched_project, gene_id, variant_filter=variant_filter):
            max_af = max(variant.annotation['freqs'].values())
            # skip variants that nobody in this project actually carries
            if not any(genotype.num_alt > 0
                       for genotype in variant.genotypes.values()):
                continue
            if max_af >= .01:
                continue

            # remember which project each genotyped individual came from
            for indiv_id in variant.genotypes:
                indiv_id_to_project_id[indiv_id] = searched_project.project_id

            # save this variant, or - if it has already been seen in another
            # project - just merge its genotypes into the saved one
            variant_id = "%s-%s-%s-%s" % (variant.chr, variant.pos,
                                          variant.ref, variant.alt)
            if variant_id not in rare_variant_dict:
                rare_variant_dict[variant_id] = variant
                project_variants.append(variant)
            else:
                rare_variant_dict[variant_id].genotypes.update(
                    variant.genotypes)

        add_extra_info_to_variants_project(get_reference(), searched_project,
                                           project_variants)
        rare_variants.extend(project_variants)
    sys.stderr.write("Retreived %s rare variants\n" % len(rare_variants))

    # compute knockout individuals
    individ_ids_and_variants = []
    for searched_project in projects_to_search:
        knockout_ids, variation = get_knockouts_in_gene(searched_project,
                                                        gene_id)
        for indiv_id in knockout_ids:
            variants = variation.get_relevant_variants_for_indiv_ids(
                [indiv_id])
            add_extra_info_to_variants_project(get_reference(),
                                               searched_project, variants)
            individ_ids_and_variants.append({
                'indiv_id': indiv_id,
                'variants': variants,
            })
            # The CSV header looks every included individual up in this map.
            # NOTE(review): a knockout individual may not have been seen in
            # the rare-variant pass above - confirm; this avoids a KeyError.
            indiv_id_to_project_id.setdefault(indiv_id,
                                              searched_project.project_id)

    def build_csv_row(variant, individuals_to_include):
        """One CSV row for variant, with one genotype column per individual."""
        worst_annotation_idx = variant.annotation[
            "worst_vep_index_per_gene"][gene_id]
        worst_annotation = variant.annotation["vep_annotation"][
            worst_annotation_idx]
        freqs = variant.annotation["freqs"]

        genotype_columns = []
        all_genotypes_parts = []
        for indiv_id in individuals_to_include:
            if indiv_id in variant.genotypes and variant.genotypes[
                    indiv_id].num_alt > 0:
                genotype = variant.genotypes[indiv_id]
                allele_string = ">".join(genotype.alleles)
                all_genotypes_parts.append(
                    indiv_id + ":" + allele_string + "  ")
                genotype_columns.append(allele_string + "   (" +
                                        str(genotype.gq) + ")")
            else:
                # non-carriers get an empty cell so columns stay aligned
                genotype_columns.append("")

        measureset_id, clinvar_significance = settings.CLINVAR_VARIANTS.get(
            variant.unique_tuple(), ("", ""))

        return [str(value) for value in [
            gene["symbol"],
            variant.chr,
            variant.pos,
            variant.ref,
            variant.alt,
            variant.vcf_id or "",
            variant.annotation.get("vep_consequence", ""),
            worst_annotation.get("hgvsc", ""),
            worst_annotation.get("hgvsp", "").replace("%3D", "="),
            worst_annotation.get("sift", ""),
            worst_annotation.get("polyphen", ""),
            worst_annotation.get("mutationtaster_pred", ""),
            ";".join(set(
                worst_annotation.get("fathmm_pred", "").split('%3B'))),
            measureset_id,
            clinvar_significance,
            freqs.get("1kg_wgs_phase3", ""),
            freqs.get("1kg_wgs_phase3_popmax", ""),
            freqs.get("exac_v3", ""),
            freqs.get("exac_v3_popmax", ""),
            "".join(all_genotypes_parts),
        ] + genotype_columns]

    download_csv = request.GET.get('download', '')
    if download_csv:
        if download_csv == 'knockouts':
            individuals_to_include = [
                entry["indiv_id"] for entry in individ_ids_and_variants
            ]
            rows = [
                build_csv_row(variant, individuals_to_include)
                for entry in individ_ids_and_variants
                for variant in entry["variants"]
            ]
        elif download_csv == 'rare_variants':
            # carriers of any rare variant, in first-seen order
            individuals_to_include = []
            seen_indiv_ids = set()
            for variant in rare_variants:
                for indiv_id, genotype in variant.genotypes.items():
                    if genotype.num_alt > 0 and indiv_id not in seen_indiv_ids:
                        seen_indiv_ids.add(indiv_id)
                        individuals_to_include.append(indiv_id)
            rows = [
                build_csv_row(variant, individuals_to_include)
                for variant in rare_variants
            ]
        else:
            # Previously fell through to a NameError. The submitted value is
            # deliberately not echoed back into the response body.
            return HttpResponse("Unknown download type", status=400)

        response = HttpResponse(content_type='text/csv')
        response[
            'Content-Disposition'] = 'attachment; filename="{}_{}.csv"'.format(
                download_csv, gene["transcript_name"])

        header = [
            "gene", "chr", "pos", "ref", "alt", "rsID", "impact", "HGVS.c",
            "HGVS.p", "sift", "polyphen", "muttaster", "fathmm", "clinvar_id",
            "clinvar_clinical_sig", "freq_1kg_wgs_phase3",
            "freq_1kg_wgs_phase3_popmax", "freq_exac_v3",
            "freq_exac_v3_popmax", "all_genotypes"
        ] + [
            indiv_id + " (from %s)" % indiv_id_to_project_id[indiv_id]
            for indiv_id in individuals_to_include
        ]

        writer = csv.writer(response)
        writer.writerow(header)
        for row in rows:
            writer.writerow(row)
        return response

    # HTML page: variant objects have to be converted before json.dumps
    for entry in individ_ids_and_variants:
        entry["variants"] = [v.toJSON() for v in entry["variants"]]

    return render(
        request, 'project/gene_quicklook.html', {
            'gene': gene,
            'gene_json': json.dumps(gene),
            'project': main_project,
            'rare_variants_json':
            json.dumps([v.toJSON() for v in rare_variants]),
            'individuals_json':
            json.dumps([
                i.get_json_obj() for p in projects_to_search
                for i in p.get_individuals()
            ]),
            'knockouts_json': json.dumps(individ_ids_and_variants),
            'other_projects_json': other_projects_json,
        })
Example #17
0
    def search_for_genes(self,
                         gene_ids,
                         project_id_list,
                         output_filename,
                         max_af=0.01):
        """
        Search for genes across project(s) and write the rare coding variants
        found to a tab-delimited file.

        Args:
            gene_ids (list): 'ENSG..' gene id strings.
            project_id_list (list): (optional) project ids to narrow down the
                search; when empty or None, all projects are searched
            output_filename (string): output file name
            max_af (float): AF filter - variants whose highest
                reference-population frequency is >= this value are skipped
        """
        header = [
            "project_id", "gene", "chr", "pos", "ref", "alt", "rsID", "filter",
            "impact", "HGVS.c", "HGVS.p", "sift", "polyphen", "muttaster",
            "fathmm", "clinvar_id", "clinvar_clinical_sig",
            "freq_1kg_wgs_phase3", "freq_1kg_wgs_phase3_popmax",
            "freq_exac_v3", "freq_exac_v3_popmax", "all_genotypes"
        ]

        # all rare coding variants
        variant_filter = get_default_variant_filter(
            'all_coding',
            mall.get_annotator().reference_population_slugs)
        print("All Filters: ")
        pprint(variant_filter.toJSON())

        if project_id_list:
            projects = [
                Project.objects.get(project_id=project_id)
                for project_id in project_id_list
            ]
        else:
            projects = Project.objects.all()

        print("Max AF threshold: %s" % max_af)
        print("Staring gene search for:\n%s\nin projects:\n%s\n" %
              (", ".join(gene_ids), ", ".join([p.project_id
                                               for p in projects])))

        # Individual records already fetched, keyed by (project_id, indiv_id):
        # the cache outlives a single project, so the project must be part of
        # the key.
        individual_cache = {}

        # 'with' guarantees the file is closed even if a lookup below raises
        with open(output_filename, 'w') as outfile:
            writer = csv.writer(outfile, delimiter='\t')
            writer.writerow(header)

            for project in projects:
                project_id = project.project_id
                if not get_project_datastore(
                        project_id).project_collection_is_loaded(project_id):
                    print(
                        "Skipping project %s - gene search is not enabled for this project"
                        % project_id)
                    continue

                print("=====================")
                print("Searching project %s" % project_id)

                for gene_id in gene_ids:
                    gene_id = get_gene_id_from_str(gene_id, get_reference())

                    gene = get_reference().get_gene(gene_id)
                    print("-- searching %s for gene %s (%s)" %
                          (project_id, gene["symbol"], gene_id))

                    for variant in project_analysis.get_variants_in_gene(
                            project, gene_id, variant_filter=variant_filter):
                        if max(variant.annotation['freqs'].values()) >= max_af:
                            continue

                        add_extra_info_to_variants_project(
                            get_reference(), project, [variant])

                        worst_annotation_idx = variant.annotation[
                            "worst_vep_index_per_gene"][gene_id]
                        worst_annotation = variant.annotation[
                            "vep_annotation"][worst_annotation_idx]
                        freqs = variant.annotation["freqs"]

                        all_genotypes_list = []
                        pass_filter = "N/A"
                        for indiv_id, genotype in variant.genotypes.items():
                            cache_key = (project_id, indiv_id)
                            individual = individual_cache.get(cache_key)
                            if individual is None:
                                individual = Individual.objects.get(
                                    project=project, indiv_id=indiv_id)
                                individual_cache[cache_key] = individual

                            # filter value is stored in the genotypes even
                            # though it's the same for all individuals
                            pass_filter = genotype.filter
                            if genotype.num_alt > 0:
                                if individual.affected == "A":
                                    affected_label = "[Affected]"
                                elif individual.affected == "N":
                                    affected_label = "[-]"
                                else:
                                    affected_label = "[?]"
                                if genotype.ab is not None:
                                    allele_balance = genotype.ab
                                else:
                                    allele_balance = float('NaN')
                                all_genotypes_list.append(
                                    "%s%s[gt:%s GQ:%s AB:%0.3f]" %
                                    (indiv_id, affected_label,
                                     ">".join(genotype.alleles), genotype.gq,
                                     allele_balance))

                        measureset_id, clinvar_significance = get_clinvar_variants(
                        ).get(variant.unique_tuple(), ("", ""))
                        row = [str(value) for value in [
                            project_id,
                            gene["symbol"],
                            variant.chr,
                            variant.pos,
                            variant.ref,
                            variant.alt,
                            variant.vcf_id or "",
                            pass_filter,
                            variant.annotation.get("vep_consequence", ""),
                            worst_annotation.get("hgvsc", ""),
                            worst_annotation.get("hgvsp", "").replace(
                                "%3D", "="),
                            worst_annotation.get("sift", ""),
                            worst_annotation.get("polyphen", ""),
                            worst_annotation.get("mutationtaster_pred", ""),
                            ";".join(set(
                                worst_annotation.get("fathmm_pred",
                                                     "").split('%3B'))),
                            measureset_id,
                            clinvar_significance,
                            freqs.get("1kg_wgs_phase3", ""),
                            freqs.get("1kg_wgs_phase3_popmax", ""),
                            freqs.get("exac_v3", ""),
                            freqs.get("exac_v3_popmax", ""),
                            ", ".join(all_genotypes_list),
                        ]]
                        writer.writerow(row)

        print("Wrote out %s" % output_filename)
    def search_for_gene(self, search_gene_id, project_id_list, max_af=0.01):
        '''
        Search for a gene across project(s) and write the rare coding variants
        found to 'results_<search_gene_id>.tsv' (tab-delimited).

        Args:
          1. search_gene_id: Gene ID to search for
          2. project_id_list: An optional list of project ids to narrow down
             the search to; when empty or None, all projects are searched
          3. max_af: variants whose highest reference-population frequency is
             >= this value are skipped

        Raises:
          IndexError: if a project id in project_id_list matches no project
        '''
        gene_id = get_gene_id_from_str(search_gene_id, get_reference())
        gene = get_reference().get_gene(gene_id)

        # 'or []' keeps this log line from raising TypeError when the optional
        # project list is passed as None
        print("Staring gene search for: %s %s in projects: %s\n" % (search_gene_id, gene['gene_id'], ", ".join(project_id_list or [])))
        print("Max AF threshold: %s" % max_af)

        # all rare coding variants
        variant_filter = get_default_variant_filter('all_coding', mall.get_annotator().reference_population_slugs)
        print("All Filters: ")
        pprint(variant_filter.toJSON())

        # Resolve every project once, up front: an unknown id fails here (via
        # [0]) before the output file is created, and the loop below does not
        # have to query each project a second time.
        if project_id_list:
            projects = [Project.objects.filter(project_id=project_id)[0] for project_id in project_id_list]
        else:
            projects = list(Project.objects.all())

        output_filename = 'results_' + search_gene_id + '.tsv'

        header = ["project_id", "gene", "chr", "pos", "ref", "alt", "rsID", "filter", "impact",
                  "HGVS.c", "HGVS.p", "sift", "polyphen", "muttaster", "fathmm", "clinvar_id", "clinvar_clinical_sig",
                  "freq_1kg_wgs_phase3", "freq_1kg_wgs_phase3_popmax",
                  "freq_exac_v3", "freq_exac_v3_popmax",
                  "all_genotypes"]

        # 'with' guarantees the file is closed even if the search raises
        with open(output_filename, 'w') as outfile:
            writer = csv.writer(outfile, delimiter='\t')
            writer.writerow(header)

            for project in projects:
                project_id = project.project_id
                if not get_project_datastore(project_id).project_collection_is_loaded(project_id):
                    print("Skipping project %s - gene search is not enabled for this project" % project_id)
                    continue
                print("Running on project %s" % project_id)

                for variant in project_analysis.get_variants_in_gene(project, gene_id, variant_filter=variant_filter):
                    if max(variant.annotation['freqs'].values()) >= max_af:
                        continue
                    add_extra_info_to_variants_project(get_reference(), project, [variant])

                    worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"][gene_id]
                    worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]
                    freqs = variant.annotation["freqs"]

                    all_genotypes_list = []
                    pass_filter = "N/A"
                    for indiv_id, genotype in variant.genotypes.items():
                        # filter value is stored in the genotypes even though
                        # it's the same for all individuals
                        pass_filter = genotype.filter
                        if genotype.num_alt > 0:
                            allele_balance = genotype.ab if genotype.ab is not None else float('NaN')
                            all_genotypes_list.append("%s[gt:%s GQ:%s AB:%0.3f]" % (
                                indiv_id, ">".join(genotype.alleles), genotype.gq, allele_balance))

                    measureset_id, clinvar_significance = settings.CLINVAR_VARIANTS.get(variant.unique_tuple(), ("", ""))
                    row = [str(value) for value in [
                        project_id,
                        gene["symbol"],
                        variant.chr,
                        variant.pos,
                        variant.ref,
                        variant.alt,
                        variant.vcf_id or "",
                        pass_filter,
                        variant.annotation.get("vep_consequence", ""),
                        worst_annotation.get("hgvsc", ""),
                        worst_annotation.get("hgvsp", "").replace("%3D", "="),
                        worst_annotation.get("sift", ""),
                        worst_annotation.get("polyphen", ""),
                        worst_annotation.get("mutationtaster_pred", ""),
                        ";".join(set(worst_annotation.get("fathmm_pred", "").split('%3B'))),
                        measureset_id,
                        clinvar_significance,
                        freqs.get("1kg_wgs_phase3", ""),
                        freqs.get("1kg_wgs_phase3_popmax", ""),
                        freqs.get("exac_v3", ""),
                        freqs.get("exac_v3_popmax", ""),
                        ", ".join(all_genotypes_list),
                    ]]
                    writer.writerow(row)

        print("Wrote out %s" % output_filename)
Example #19
0
def gene_quicklook(request, project_id, gene_id):
    """
    Summary of a gene in a project

    Renders 'project/gene_quicklook.html' for the gene, or - when the
    'download' GET parameter is 'knockouts' or 'rare_variants' - returns that
    table as a CSV attachment. Any other non-empty 'download' value gets a
    400 response.
    """
    project = get_object_or_404(Project, project_id=project_id)
    if not project.can_view(request.user):
        return HttpResponse("Unauthorized")

    if gene_id is None:
        return render(request, 'project/gene_quicklook.html', {
            'project': project,
            'gene': None,
            'gene_json': None,
            'rare_variants_json': None,
            'individuals_json': None,
            'knockouts_json': None,
        })

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)
    sys.stderr.write(project_id + " - staring gene search for: %s %s \n" % (gene_id, gene))

    # all rare coding variants
    variant_filter = get_default_variant_filter('all_coding', mall.get_annotator().reference_population_slugs)

    rare_variants = [
        variant
        for variant in project_analysis.get_variants_in_gene(project, gene_id, variant_filter=variant_filter)
        if max(variant.annotation['freqs'].values()) < .01
    ]
    add_extra_info_to_variants_project(get_reference(), project, rare_variants)

    # compute knockout individuals
    individ_ids_and_variants = []
    knockout_ids, variation = get_knockouts_in_gene(project, gene_id)
    for indiv_id in knockout_ids:
        variants = variation.get_relevant_variants_for_indiv_ids([indiv_id])
        add_extra_info_to_variants_project(get_reference(), project, variants)
        individ_ids_and_variants.append({
            'indiv_id': indiv_id,
            'variants': variants,
        })

    sys.stderr.write("Project-wide gene search retrieved %s rare variants for gene: %s \n" % (len(rare_variants), gene_id))

    def build_csv_row(variant, individuals_to_include):
        """One CSV row for variant, with one genotype column per individual."""
        worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"][gene_id]
        worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]
        freqs = variant.annotation["freqs"]

        genotype_columns = []
        all_genotypes_parts = []
        for indiv_id in individuals_to_include:
            # NOTE(review): presumably every variant carries a genotype for
            # every individual in the project - confirm. The guard turns a
            # missing one into an empty cell instead of a KeyError.
            if indiv_id not in variant.genotypes:
                genotype_columns.append("")
                continue
            genotype = variant.genotypes[indiv_id]
            allele_string = ">".join(genotype.alleles)
            # the summary column lists every genotyped individual, while the
            # per-individual columns are filled in for carriers only
            all_genotypes_parts.append(indiv_id + ":" + allele_string + "  ")
            if genotype.num_alt > 0:
                genotype_columns.append(allele_string + "   (" + str(genotype.gq) + ")")
            else:
                genotype_columns.append("")

        measureset_id, clinvar_significance = settings.CLINVAR_VARIANTS.get(variant.unique_tuple(), ("", ""))

        return [str(value) for value in [
            gene["symbol"],
            variant.chr,
            variant.pos,
            variant.ref,
            variant.alt,
            variant.vcf_id or "",
            variant.annotation.get("vep_consequence", ""),
            worst_annotation.get("hgvsc", ""),
            worst_annotation.get("hgvsp", "").replace("%3D", "="),
            worst_annotation.get("sift", ""),
            worst_annotation.get("polyphen", ""),
            worst_annotation.get("mutationtaster_pred", ""),
            ";".join(set(worst_annotation.get("fathmm_pred", "").split('%3B'))),
            measureset_id,
            clinvar_significance,
            freqs.get("1kg_wgs_phase3", ""),
            freqs.get("1kg_wgs_phase3_popmax", ""),
            freqs.get("exac_v3", ""),
            freqs.get("exac_v3_popmax", ""),
            "".join(all_genotypes_parts),
        ] + genotype_columns]

    download_csv = request.GET.get('download', '')
    if download_csv:
        if download_csv == 'knockouts':
            individuals_to_include = [entry["indiv_id"] for entry in individ_ids_and_variants]
            rows = [
                build_csv_row(variant, individuals_to_include)
                for entry in individ_ids_and_variants
                for variant in entry["variants"]
            ]
        elif download_csv == 'rare_variants':
            # carriers of any rare variant, in first-seen order
            individuals_to_include = []
            seen_indiv_ids = set()
            for variant in rare_variants:
                for indiv_id, genotype in variant.genotypes.items():
                    if genotype.num_alt > 0 and indiv_id not in seen_indiv_ids:
                        seen_indiv_ids.add(indiv_id)
                        individuals_to_include.append(indiv_id)
            rows = [build_csv_row(variant, individuals_to_include) for variant in rare_variants]
        else:
            # Previously fell through to a NameError. The submitted value is
            # deliberately not echoed back into the response body.
            return HttpResponse("Unknown download type", status=400)

        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="{}_{}.csv"'.format(download_csv, gene["transcript_name"])

        header = ["gene", "chr", "pos", "ref", "alt", "rsID", "impact",
                  "HGVS.c", "HGVS.p", "sift", "polyphen", "muttaster", "fathmm", "clinvar_id", "clinvar_clinical_sig",
                  "freq_1kg_wgs_phase3", "freq_1kg_wgs_phase3_popmax",
                  "freq_exac_v3", "freq_exac_v3_popmax",
                  "all_genotypes"] + individuals_to_include

        writer = csv.writer(response)
        writer.writerow(header)
        for row in rows:
            writer.writerow(row)
        return response

    # HTML page: variant objects have to be converted before json.dumps
    for entry in individ_ids_and_variants:
        entry["variants"] = [v.toJSON() for v in entry["variants"]]

    return render(request, 'project/gene_quicklook.html', {
        'gene': gene,
        'gene_json': json.dumps(gene),
        'project': project,
        'rare_variants_json': json.dumps([v.toJSON() for v in rare_variants]),
        'individuals_json': json.dumps([i.get_json_obj() for i in project.get_individuals()]),
        'knockouts_json': json.dumps(individ_ids_and_variants),
    })
Example #20
0
    def handle(self, *args, **options):
        """Export filtered variant calls for selected individuals of one project.

        Expects exactly two positional command-line args:

            1. project_id
            2. path to a text file of individual ids, one per line.  Only the
               first tab-separated column of each line is read; blank lines
               and lines starting with "#" are skipped.

        Writes ``individuals_in_<project_id>.tsv.gz`` (tab-delimited, one row
        per individual/variant pair) into the current working directory.  A
        genotype is written only if it has alleles, at least one alt allele
        and DP >= DP_threshold.

        Exits through ``sys.exit`` when the arguments are wrong or when the
        ids file contains an id that is not an individual of the project.
        Raises AssertionError if a variant returned by the datastore violates
        one of the frequency / quality thresholds that were requested.
        """
        if len(args) != 2:
            sys.exit(
                "ERROR: please specify the project_id and file of individual ids as command line args."
            )

        project_id, individuals_file = args

        # init objects
        project = Project.objects.get(project_id=project_id)
        all_individual_ids_in_project = set(
            i.indiv_id for i in project.get_individuals())

        # Split the ids from the file into those known to the project and
        # those that are not (file order is preserved for the messages below).
        individuals_of_interest = []
        invalid_individual_ids = []
        with open(individuals_file) as f:
            for line in f:
                line = line.strip('\n')
                if not line or line.startswith("#"):
                    continue
                individual_id = line.split("\t")[0]
                if individual_id in all_individual_ids_in_project:
                    individuals_of_interest.append(individual_id)
                else:
                    invalid_individual_ids.append(individual_id)

        print("Processing %s: %d individuals " %
              (project_id, len(individuals_of_interest)))
        if invalid_individual_ids:
            # NOTE(review): "total_ids" is the number of individuals in the
            # project, not the number of ids read from the file - confirm
            # that this is the denominator the message is meant to show.
            sys.exit((
                "ERROR: %(individuals_file)s: %(num_invalid)s out of %(total_ids)s ids are invalid. \nThe invalid ids are: "
                "%(invalid_individual_ids)s.\nValid ids are: %(individuals_of_interest)s"
            ) % {
                'individuals_file': individuals_file,
                'num_invalid': len(invalid_individual_ids),
                'total_ids': len(all_individual_ids_in_project),
                'invalid_individual_ids': invalid_individual_ids,
                'individuals_of_interest': individuals_of_interest,
            })

        # Set copy for O(1) membership tests inside the per-individual loop.
        ids_of_interest = set(individuals_of_interest)

        # filter
        variant_filter = get_default_variant_filter('moderate_impact')
        variant_filter.ref_freqs.extend([
            ('1kg_wgs_phase3', g1k_freq_threshold),
            ('1kg_wgs_phase3_popmax', g1k_popmax_freq_threshold),
            ('exac_v3', exac_freq_threshold),
            ('exac_v3_popmax', exac_popmax_threshold),
            ('merck-wgs-3793', merck_wgs_3793_threshold),
        ])
        quality_filter = {
            'vcf_filter': 'pass',
            'min_gq': GQ_threshold,
            'min_ab': AB_threshold,
        }

        header_fields = [
            'project_id',
            'family_id',
            'individual_id',
            'gene',
            'chrom',
            'pos',
            'ref',
            'alt',
            'rsid',
            'annotation',
            '1kg_af',
            '1kg_popmax_af',
            'exac_af',
            'exac_popmax_af',
            'merck_wgs_3793_af',
            'genotype_str',
            'genotype_num_alt',
            'genotype_allele_balance',
            'genotype_AD',
            'genotype_DP',
            'genotype_GQ',
            'genotype_PL',
            'genotype_filter',
        ]

        # create individuals_variants.tsv
        individual_variants_f = gzip.open(
            'individuals_in_%s.tsv.gz' % project_id, 'w')
        # try/finally so the gzip stream is closed (and its trailer written)
        # even when an assertion or a datastore call fails part-way through.
        try:
            writer = csv.writer(individual_variants_f,
                                dialect='excel',
                                delimiter='\t')
            writer.writerow(header_fields)

            # collect the resources that we'll need here
            # (annotator is not referenced below; the call is kept in case it
            # initialises shared state - TODO confirm and drop if not)
            annotator = get_annotator()
            custom_population_store = get_custom_population_store()

            individual_counter = 0
            for family in project.get_families():
                for individual in family.get_individuals():
                    if individual.indiv_id not in ids_of_interest:
                        continue
                    individual_counter += 1
                    print("%s: %s, individual %s" %
                          (individual_counter, family.family_id,
                           individual.indiv_id))
                    for variant in get_variants(
                            get_datastore(project.project_id),
                            family.xfamily(),
                            variant_filter=variant_filter,
                            quality_filter=quality_filter,
                            indivs_to_consider=[individual.indiv_id]):
                        genotype = variant.get_genotype(individual.indiv_id)
                        # Skip no-calls, low-depth calls and hom-ref calls.
                        if (len(genotype.alleles) == 0
                                or genotype.extras["dp"] < DP_threshold
                                or genotype.num_alt == 0):
                            continue

                        custom_populations = custom_population_store.get_frequencies(
                            variant.xpos, variant.ref, variant.alt)

                        # alleles is known to be non-empty at this point
                        genotype_str = "/".join(genotype.alleles)

                        freqs = variant.annotation['freqs']
                        g1k_freq = freqs['1kg_wgs_phase3']
                        g1k_popmax_freq = freqs['1kg_wgs_phase3_popmax']
                        exac_freq = freqs['exac_v3']
                        exac_popmax_freq = freqs['exac_v3_popmax']
                        merck_wgs_3793_freq = custom_populations.get(
                            'merck-wgs-3793', 0.0)

                        # Sanity checks: everything the datastore returned
                        # must already satisfy the filters requested above.
                        assert g1k_freq <= g1k_freq_threshold, "g1k freq %s > %s" % (
                            g1k_freq, g1k_freq_threshold)
                        assert g1k_popmax_freq <= g1k_popmax_freq_threshold, "g1k popmax freq %s > %s" % (
                            g1k_popmax_freq, g1k_popmax_freq_threshold)
                        assert exac_freq <= exac_freq_threshold, "Exac freq %s > %s" % (
                            exac_freq, exac_freq_threshold)
                        assert exac_popmax_freq <= exac_popmax_threshold, "Exac popmax freq %s > %s" % (
                            exac_popmax_freq, exac_popmax_threshold)
                        assert merck_wgs_3793_freq <= merck_wgs_3793_threshold

                        assert genotype.gq >= GQ_threshold, "%s %s - GQ is %s " % (
                            variant.chr, variant.pos, genotype.gq)
                        # Message previously mislabelled this value as GQ.
                        assert genotype.extras[
                            "dp"] >= DP_threshold, "%s %s - DP is %s " % (
                                variant.chr, variant.pos, genotype.extras["dp"])
                        if genotype.num_alt == 1:
                            # AB_threshold is divided by 100 before comparing
                            # with genotype.ab
                            assert genotype.ab >= AB_threshold / 100., "%s %s - AB is %s " % (
                                variant.chr, variant.pos, genotype.ab)
                        assert genotype.filter == "pass", "%s %s - filter is %s " % (
                            variant.chr, variant.pos, genotype.filter)

                        writer.writerow([
                            str(value) for value in (
                                project_id,
                                family.family_id,
                                individual.indiv_id,
                                get_gene_symbol(variant),
                                variant.chr,
                                variant.pos,
                                variant.ref,
                                variant.alt,
                                variant.vcf_id,
                                variant.annotation['vep_group'],
                                g1k_freq,
                                g1k_popmax_freq,
                                exac_freq,
                                exac_popmax_freq,
                                merck_wgs_3793_freq,
                                genotype_str,
                                genotype.num_alt,
                                genotype.ab,
                                genotype.extras["ad"],
                                genotype.extras["dp"],
                                genotype.gq,
                                genotype.extras["pl"],
                                genotype.filter,
                            )
                        ])
                        individual_variants_f.flush()
        finally:
            individual_variants_f.close()
    def search_for_genes(self,
                         gene_or_variant_ids,
                         project_id_list,
                         output_filename,
                         max_af=0.01,
                         knockouts=False,
                         in_clinvar_only=False,
                         include_non_coding=False):
        """
        Search for genes (or individual variants) across project(s) and write
        every variant that has at least one alt-carrying individual to a
        tab-delimited file.

        Args:
            gene_or_variant_ids (list): strings that are either gene ids /
                names accepted by get_gene_id_from_str (e.g. 'ENSG..'), or
                variant ids starting with 'chrom-pos' or 'chrom-pos-ref-alt'.
            project_id_list (list): ids of the projects to search. Every id
                must exist; an empty list searches nothing.
            output_filename (string): output file name
            max_af (float): for gene searches, variants whose highest
                annotated frequency is >= this value are skipped. Not applied
                to explicit variant ids.
            knockouts (bool): if True, gene searches only report the variants
                returned for the gene's knockout individuals (no variant
                filter is built in that case).
            in_clinvar_only (bool): if True, only keep variants whose ClinVar
                significance contains "path".
            include_non_coding (bool): if True, clear the variant filter's
                so_annotations list. Has no effect when knockouts is True.
        """

        projects = [
            Project.objects.get(project_id=project_id)
            for project_id in project_id_list
        ]

        header = [
            "project_id", "gene", "chr", "pos", "ref", "alt", "rsID", "filter",
            "impact", "HGVS.c", "HGVS.p", "sift", "polyphen", "muttaster",
            "fathmm", "clinvar_id", "clinvar_clinical_sig",
            "freq_1kg_wgs_phase3", "freq_1kg_wgs_phase3_popmax",
            "freq_exac_v3", "freq_exac_v3_popmax", "gnomad-exomes",
            "gnomad-genomes", "families", "all_genotypes"
        ]

        # Compiled once instead of per id. Used with .match(), so they are
        # anchored at the start of the id only.
        chrom_pos_re = re.compile("([0-9XY]{1,2})-([0-9]{1,9})")
        chrom_pos_ref_alt_re = re.compile(
            "([0-9XY]{1,2})-([0-9]{1,9})-([ACTG]+)-([ACTG]+)")

        # `with` guarantees the output file is closed even if a lookup fails
        # part-way through the search.
        with open(output_filename, 'w') as outfile:
            writer = csv.writer(outfile, delimiter='\t')
            writer.writerow(header)

            # all rare coding variants
            if not knockouts:
                variant_filter = get_default_variant_filter(
                    'all_coding',
                    mall.get_annotator().reference_population_slugs)
                if include_non_coding:
                    variant_filter.so_annotations = []
                print("All Filters: ")
                pprint(variant_filter.toJSON())

            print("Starting search for:\n%s\nin projects:\n%s\n" %
                  (", ".join(gene_or_variant_ids), ", ".join(
                      [p.project_id for p in projects])))

            for project in projects:
                project_id = project.project_id
                if get_project_datastore(project).project_collection_is_loaded(
                        project):
                    print("=====================")
                    print("Searching project %s" % project_id)
                else:
                    print(
                        "Skipping project %s - gene search is not enabled for this project"
                        % project_id)
                    continue

                # indiv_id -> Individual, or the string 'deleted' for ids
                # that have genotypes but no Individual row in this project.
                indiv_cache = {}
                for gene_or_variant_id in gene_or_variant_ids:
                    chrom_pos_match = chrom_pos_re.match(gene_or_variant_id)
                    chrom_pos_ref_alt_match = chrom_pos_ref_alt_re.match(
                        gene_or_variant_id)
                    is_variant_lookup = bool(chrom_pos_match
                                             or chrom_pos_ref_alt_match)

                    if is_variant_lookup:
                        # The ref-alt pattern extends the chrom-pos pattern,
                        # so chrom_pos_match is set whenever either matched.
                        chrom = chrom_pos_match.group(1)
                        pos = int(chrom_pos_match.group(2))
                        xpos = genomeloc.get_xpos(chrom, pos)
                        ref = alt = None
                        if chrom_pos_ref_alt_match:
                            ref = chrom_pos_ref_alt_match.group(3)
                            alt = chrom_pos_ref_alt_match.group(4)

                        variant = get_project_datastore(
                            project).get_single_variant(
                                project.project_id, None, xpos, ref, alt)
                        if variant is None:
                            continue
                        variants = [variant]
                        print(
                            "-- searching %s for variant %s-%s-%s: found %s" %
                            (project_id, xpos, ref, alt, variant))
                        worst_annotation_idx = variant.annotation[
                            'worst_vep_annotation_index']
                        print(variant.annotation["vep_annotation"]
                              [worst_annotation_idx])
                        gene_id = variant.annotation["vep_annotation"][
                            worst_annotation_idx]['gene_id']
                        gene = get_reference().get_gene(gene_id)
                    else:
                        gene_id = get_gene_id_from_str(gene_or_variant_id,
                                                       get_reference())
                        gene = get_reference().get_gene(gene_id)
                        print("-- searching %s for gene %s (%s)" %
                              (project_id, gene["symbol"], gene_id))

                        if knockouts:
                            knockout_ids, variation = project_analysis.get_knockouts_in_gene(
                                project, gene_id)
                            variants = variation.get_relevant_variants_for_indiv_ids(
                                knockout_ids)
                        else:
                            variants = project_analysis.get_variants_in_gene(
                                project,
                                gene_id,
                                variant_filter=variant_filter)

                    # Value for the "gene" column. Writing str(gene) would
                    # dump the whole gene record into the cell, so use the
                    # symbol and fall back to the gene id.
                    if gene:
                        gene_label = gene.get("symbol") or gene_id
                    else:
                        gene_label = gene_id

                    for variant in variants:
                        # The AF cut-off only applies to gene searches.
                        if not is_variant_lookup and max(
                                variant.annotation['freqs'].values()
                        ) >= max_af:
                            continue

                        add_extra_info_to_variants_project(
                            get_reference(), project, [variant])
                        worst_annotation_idx = variant.annotation[
                            "worst_vep_index_per_gene"].get(gene_id)

                        # An empty dict yields "" for every annotation column
                        # below, same as having no annotation for this gene.
                        if worst_annotation_idx is not None:
                            worst_annotation = variant.annotation[
                                "vep_annotation"][worst_annotation_idx] or {}
                        else:
                            worst_annotation = {}

                        all_genotypes_list = []
                        pass_filter = "N/A"
                        family_ids = set()
                        for indiv_id, genotype in variant.genotypes.items():
                            if indiv_id in indiv_cache:
                                individual = indiv_cache[indiv_id]
                                if individual == 'deleted':
                                    continue
                            else:
                                try:
                                    individual = Individual.objects.get(
                                        project=project, indiv_id=indiv_id)
                                except ObjectDoesNotExist:
                                    # this can happen when an individual is deleted from the project - from postgres, but not from mongo
                                    indiv_cache[indiv_id] = 'deleted'
                                    continue
                                except MultipleObjectsReturned:
                                    # when several families have an individual with the same id
                                    individual = Individual.objects.filter(
                                        project=project,
                                        indiv_id=indiv_id)[0]
                                indiv_cache[indiv_id] = individual

                            # filter value is stored in the genotypes even though it's the same for all individuals
                            pass_filter = genotype.filter
                            if genotype.num_alt > 0:
                                family_id = individual.family.family_id
                                family_ids.add(family_id)
                                affected_label = {
                                    "A": "[Affected]",
                                    "N": "[-]",
                                }.get(individual.affected, "[?]")
                                allele_balance = (genotype.ab
                                                  if genotype.ab is not None
                                                  else float('NaN'))
                                all_genotypes_list.append(
                                    "%s/%s%s[gt:%s GQ:%s AB:%0.3f]" %
                                    (family_id, indiv_id, affected_label,
                                     ">".join(genotype.alleles), genotype.gq,
                                     allele_balance))

                        # Nothing to report if no known individual carries
                        # an alt allele.
                        if not all_genotypes_list:
                            continue

                        measureset_id, clinvar_significance = get_reference(
                        ).get_clinvar_info(*variant.unique_tuple())
                        if in_clinvar_only and (
                                not clinvar_significance
                                or "path" not in clinvar_significance.lower()):
                            continue

                        freqs = variant.annotation["freqs"]
                        row = [
                            str(value) for value in (
                                project_id,
                                gene_label,
                                variant.chr,
                                variant.pos,
                                variant.ref,
                                variant.alt,
                                variant.vcf_id or "",
                                pass_filter,
                                variant.annotation.get("vep_consequence", ""),
                                worst_annotation.get("hgvsc", ""),
                                # "%3D" is the URL-encoded "="
                                (worst_annotation.get("hgvsp", "")
                                 or "").replace("%3D", "="),
                                worst_annotation.get("sift", ""),
                                worst_annotation.get("polyphen", ""),
                                worst_annotation.get("mutationtaster_pred",
                                                     ""),
                                # "%3B" is the URL-encoded ";" separator
                                ";".join(
                                    set(
                                        worst_annotation.get(
                                            "fathmm_pred", "").split('%3B'))),
                                measureset_id,
                                clinvar_significance,
                                freqs.get("1kg_wgs_phase3", ""),
                                freqs.get("1kg_wgs_phase3_popmax", ""),
                                freqs.get("exac_v3", ""),
                                freqs.get("exac_v3_popmax", ""),
                                freqs.get("gnomad-exomes2", ""),
                                freqs.get("gnomad-genomes2", ""),
                                ", ".join(sorted(family_ids)),
                                ", ".join(all_genotypes_list),
                            )
                        ]

                        writer.writerow(row)

        print("Wrote out %s" % output_filename)
Example #22
0
def _gene_quicklook_csv_row(gene, gene_id, variant, individuals_to_include):
    """Build one CSV row (list of str) describing `variant` for the gene_quicklook download.

    The fixed annotation columns are followed by one genotype cell per id in
    `individuals_to_include`; the cell is empty unless that individual has a
    genotype with at least one alt allele at this variant.
    """
    worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"].get(
        gene_id)
    if worst_annotation_idx is not None:
        worst_annotation = variant.annotation["vep_annotation"][
            worst_annotation_idx] or {}
    else:
        # No annotation recorded for this gene: leave those columns blank.
        worst_annotation = {}

    genotype_cells = []
    carrier_summaries = []
    for indiv_id in individuals_to_include:
        if indiv_id in variant.genotypes and variant.genotypes[
                indiv_id].num_alt > 0:
            genotype = variant.genotypes[indiv_id]
            allele_string = ">".join(genotype.alleles)
            carrier_summaries.append(indiv_id + ":" + allele_string + "  ")
            genotype_cells.append(allele_string + "   (" + str(genotype.gq) +
                                  ")")
        else:
            genotype_cells.append("")
    all_genotypes_string = "".join(carrier_summaries)

    measureset_id, clinvar_significance = get_reference().get_clinvar_info(
        *variant.unique_tuple())
    freqs = variant.annotation["freqs"]
    return [
        str(value) for value in [
            gene["symbol"],
            variant.chr,
            variant.pos,
            variant.ref,
            variant.alt,
            variant.vcf_id or "",
            variant.annotation.get("vep_consequence", ""),
            worst_annotation.get("hgvsc", ""),
            # "or ''" guards against an explicit None; "%3D" is an encoded "="
            (worst_annotation.get("hgvsp", "") or "").replace("%3D", "="),
            worst_annotation.get("sift", ""),
            worst_annotation.get("polyphen", ""),
            worst_annotation.get("mutationtaster_pred", ""),
            # "%3B" is an encoded ";" separator; set() drops duplicates
            ";".join(set(worst_annotation.get("fathmm_pred",
                                              "").split('%3B'))),
            measureset_id,
            clinvar_significance,
            freqs.get("1kg_wgs_phase3", ""),
            freqs.get("1kg_wgs_phase3_popmax", ""),
            freqs.get("exac_v3", ""),
            freqs.get("exac_v3_popmax", ""),
            all_genotypes_string,
        ] + genotype_cells
    ]


def gene_quicklook(request, project_id, gene_id):
    """
    Summary of a gene in a project

    Query-string parameters read from request.GET:
        selected_projects: optional comma-separated project ids to search
            instead of the main project. All of them must be among the
            loaded projects the user can view, otherwise "Unauthorized" is
            returned.
        download: optional; 'knockouts' or 'rare_variants' returns the
            corresponding table as a CSV attachment. Any other non-empty
            value is answered with HTTP 400.

    Without `download` the project/gene_quicklook.html template is rendered.
    If gene_id is None the template is rendered with empty data.
    """
    main_project = get_object_or_404(Project, project_id=project_id)
    if not main_project.can_view(request.user):
        return HttpResponse("Unauthorized")

    # other projects this user can view
    other_projects = get_loaded_projects_for_user(
        request.user, fields=['project_id', 'project_name'])

    if other_projects:
        other_projects_json = json.dumps([{
            'project_id': p.project_id,
            'project_name': p.project_name
        } for p in sorted(other_projects, key=lambda p: p.project_id.lower())])
    else:
        other_projects_json = None

    if gene_id is None:
        return render(
            request, 'project/gene_quicklook.html', {
                'project': main_project,
                'gene': None,
                'gene_json': None,
                'rare_variants_json': None,
                'individuals_json': None,
                'knockouts_json': None,
                'other_projects_json': other_projects_json,
            })

    projects_to_search_param = request.GET.get('selected_projects')
    if projects_to_search_param:
        project_ids = projects_to_search_param.split(",")
        projects_to_search = [
            project for project in other_projects
            if project.project_id in project_ids
        ]
        if len(projects_to_search) < len(project_ids):
            # If not all the specified project ids are in the other projects list then they are not authorized
            return HttpResponse("Unauthorized")
    else:
        project_ids = [main_project.project_id]
        projects_to_search = [main_project]

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)
    sys.stderr.write(
        project_id + " - staring gene search for: %s in projects: %s\n" %
        (gene_id, ",".join([p.project_id for p in projects_to_search]) + "\n"))

    # all rare coding variants
    variant_filter = get_default_variant_filter(
        'all_coding',
        mall.get_annotator().reference_population_slugs)

    indiv_id_to_project_id = {}
    rare_variant_dict = {}  # "chr-pos-ref-alt" -> first variant object seen
    rare_variants = []
    individ_ids_and_variants = []
    for project in projects_to_search:
        all_project_variants = project_analysis.get_variants_in_gene(
            project, gene_id, variant_filter=variant_filter)

        # compute knockout individuals
        knockout_ids, variation = get_knockouts_in_gene(
            project, gene_id, all_project_variants)
        for indiv_id in knockout_ids:
            individ_ids_and_variants.append({
                'indiv_id': indiv_id,
                'variants': variation.get_relevant_variants_for_indiv_ids(
                    [indiv_id]),
            })
            # Make sure every knockout individual has a project recorded for
            # the CSV header even if none of the rare variants below carries
            # a genotype for them. setdefault only fills gaps.
            indiv_id_to_project_id.setdefault(indiv_id, project.project_id)

        # compute rare variants
        project_variants = []
        for variant in all_project_variants:
            max_af = max([
                freq for label, freq in variant.annotation['freqs'].items()
                if label != "AF"
            ])  # don't filter on within-cohort AF

            # skip variants that nobody carries
            if not any(genotype.num_alt > 0
                       for _, genotype in variant.genotypes.items()):
                continue
            if max_af >= .01:
                continue

            # add project id to genotypes
            for indiv_id in variant.genotypes:
                indiv_id_to_project_id[indiv_id] = project.project_id

            # save this variant (or just the genotypes from this variant if it's been seen already in another project)
            variant_id = "%s-%s-%s-%s" % (variant.chr, variant.pos,
                                          variant.ref, variant.alt)
            if variant_id not in rare_variant_dict:
                rare_variant_dict[variant_id] = variant
                project_variants.append(variant)
            else:
                rare_variant_dict[variant_id].genotypes.update(
                    variant.genotypes)

        rare_variants.extend(project_variants)

    all_variants = list(rare_variants)
    for entry in individ_ids_and_variants:
        all_variants.extend(entry['variants'])
    # NOTE(review): `project` here is whatever the loop above ended on (the
    # last searched project), even though all_variants can span several
    # projects - confirm whether that is intended.
    add_extra_info_to_variants_project(get_reference(), project, all_variants)

    download_csv = request.GET.get('download', '')
    if download_csv:
        # Reject unknown download types up front. Without this check they
        # reached the header/rows code with nothing defined (NameError).
        if download_csv not in ('knockouts', 'rare_variants'):
            return HttpResponse("Invalid download type", status=400)

        response = HttpResponse(content_type='text/csv')
        response[
            'Content-Disposition'] = 'attachment; filename="{}_{}.csv"'.format(
                download_csv,
                gene.get("symbol") or gene.get("transcript_name"))

        if download_csv == 'knockouts':
            # one genotype column per knockout individual, one row per
            # (knockout individual, relevant variant) pair
            individuals_to_include = [
                entry["indiv_id"] for entry in individ_ids_and_variants
            ]
            variants_for_rows = [
                variant for entry in individ_ids_and_variants
                for variant in entry["variants"]
            ]
        else:
            # one genotype column per individual carrying any rare variant,
            # in order of first appearance
            individuals_to_include = []
            seen_indiv_ids = set()
            for variant in rare_variants:
                for indiv_id, genotype in variant.genotypes.items():
                    if genotype.num_alt > 0 and indiv_id not in seen_indiv_ids:
                        seen_indiv_ids.add(indiv_id)
                        individuals_to_include.append(indiv_id)
            variants_for_rows = rare_variants

        rows = [
            _gene_quicklook_csv_row(gene, gene_id, variant,
                                    individuals_to_include)
            for variant in variants_for_rows
        ]

        header = [
            "gene", "chr", "pos", "ref", "alt", "rsID", "impact", "HGVS.c",
            "HGVS.p", "sift", "polyphen", "muttaster", "fathmm", "clinvar_id",
            "clinvar_clinical_sig", "freq_1kg_wgs_phase3",
            "freq_1kg_wgs_phase3_popmax", "freq_exac_v3",
            "freq_exac_v3_popmax", "all_genotypes"
        ] + [
            indiv_id + " (from %s)" % indiv_id_to_project_id[indiv_id]
            for indiv_id in individuals_to_include
        ]

        writer = csv.writer(response)
        writer.writerow(header)
        for row in rows:
            writer.writerow(row)
        return response
    else:
        # Replace the variant objects with their JSON form for the template.
        for individ_id_and_variants in individ_ids_and_variants:
            variants = individ_id_and_variants["variants"]
            individ_id_and_variants["variants"] = [
                v.toJSON() for v in variants
            ]

        # Only load the individuals that appear in the results.
        individ_ids = {i['indiv_id'] for i in individ_ids_and_variants}
        for var in rare_variants:
            individ_ids.update(var.genotypes.keys())
        individuals = Individual.objects.filter(
            indiv_id__in=individ_ids,
            project__project_id__in=project_ids).select_related(
                'project').select_related('family').only(
                    'project__project_id', 'family__family_id',
                    *Individual.INDIVIDUAL_JSON_FIELDS_NO_IDS)

        return render(
            request, 'project/gene_quicklook.html', {
                'gene':
                gene,
                'gene_json':
                json.dumps(gene),
                'project':
                main_project,
                'rare_variants_json':
                json.dumps([v.toJSON() for v in rare_variants]),
                'individuals_json':
                json.dumps([
                    i.get_json_obj(skip_has_variant_data=True)
                    for i in individuals
                ]),
                'knockouts_json':
                json.dumps(individ_ids_and_variants),
                'other_projects_json':
                other_projects_json,
            })
Example #23
0
    def search_for_gene(self, search_gene_id, project_id_list, max_af=0.01):
        '''
        Search for a gene across project(s) and write the matching variants
        to a tab-separated file named 'results_<search_gene_id>.tsv' in the
        current working directory.

        Args:
          1. search_gene_id: Gene ID to search for (resolved with
             get_gene_id_from_str)
          2. project_id_list: An optional list of project ids to narrow down
             the search to; when empty or None every Project is searched
          3. max_af: variants whose highest annotated frequency is greater
             than or equal to this value are skipped

        Raises:
          IndexError: if a project id in project_id_list matches no Project
        '''
        gene_id = get_gene_id_from_str(search_gene_id, get_reference())
        gene = get_reference().get_gene(gene_id)

        # project_id_list is optional: guard the join so that None does not
        # raise a TypeError before the "all projects" fallback is reached.
        print("Staring gene search for: %s %s in projects: %s\n" %
              (search_gene_id, gene['gene_id'],
               ", ".join(project_id_list or [])))
        print("Max AF threshold: %s" % max_af)

        # all rare coding variants
        variant_filter = get_default_variant_filter(
            'all_coding',
            mall.get_annotator().reference_population_slugs)
        print("All Filters: ")
        pprint(variant_filter.toJSON())

        # Resolve every project before the output file is created, so that an
        # unknown project id fails (IndexError) without leaving a header-only
        # results file behind. Each project is fetched exactly once.
        if project_id_list:
            projects = [
                (project_id, Project.objects.filter(project_id=project_id)[0])
                for project_id in project_id_list
            ]
        else:
            projects = [(p.project_id, p) for p in Project.objects.all()]

        output_filename = 'results_' + search_gene_id + '.tsv'
        header = [
            "project_id", "gene", "chr", "pos", "ref", "alt", "rsID", "filter",
            "impact", "HGVS.c", "HGVS.p", "sift", "polyphen", "muttaster",
            "fathmm", "clinvar_id", "clinvar_clinical_sig",
            "freq_1kg_wgs_phase3", "freq_1kg_wgs_phase3_popmax",
            "freq_exac_v3", "freq_exac_v3_popmax", "all_genotypes"
        ]

        # `with` guarantees the file is closed even when a datastore or
        # annotation lookup raises part-way through the export.
        with open(output_filename, 'w') as outfile:
            writer = csv.writer(outfile, delimiter='\t')
            writer.writerow(header)

            for project_id, project in projects:
                datastore = get_project_datastore(project_id)
                if not datastore.project_collection_is_loaded(project_id):
                    print(
                        "Skipping project %s - gene search is not enabled for this project"
                        % project_id)
                    continue
                print("Running on project %s" % project_id)

                for variant in project_analysis.get_variants_in_gene(
                        project, gene_id, variant_filter=variant_filter):
                    if max(variant.annotation['freqs'].values()) >= max_af:
                        continue
                    add_extra_info_to_variants_project(
                        get_reference(), project, [variant])

                    # re-read the annotation only after the extra info has
                    # been added to the variant
                    annotation = variant.annotation
                    worst_annotation = annotation["vep_annotation"][
                        annotation["worst_vep_index_per_gene"][gene_id]]
                    freqs = annotation["freqs"]

                    all_genotypes_list = []
                    pass_filter = "N/A"
                    for indiv_id, genotype in variant.genotypes.items():
                        # filter value is stored in the genotypes even though
                        # it's the same for all individuals
                        pass_filter = genotype.filter
                        if genotype.num_alt > 0:
                            allele_balance = (genotype.ab
                                              if genotype.ab is not None
                                              else float('NaN'))
                            all_genotypes_list.append(
                                "%s[gt:%s GQ:%s AB:%0.3f]" %
                                (indiv_id, ">".join(genotype.alleles),
                                 genotype.gq, allele_balance))

                    measureset_id, clinvar_significance = settings.CLINVAR_VARIANTS.get(
                        variant.unique_tuple(), ("", ""))
                    row = [str(value) for value in [
                        project_id,
                        gene["symbol"],
                        variant.chr,
                        variant.pos,
                        variant.ref,
                        variant.alt,
                        variant.vcf_id or "",
                        pass_filter,
                        annotation.get("vep_consequence", ""),
                        worst_annotation.get("hgvsc", ""),
                        worst_annotation.get("hgvsp", "").replace("%3D", "="),
                        worst_annotation.get("sift", ""),
                        worst_annotation.get("polyphen", ""),
                        worst_annotation.get("mutationtaster_pred", ""),
                        # '%3B' is a URL-encoded ';' separating predictions
                        ";".join(
                            set(
                                worst_annotation.get("fathmm_pred",
                                                     "").split('%3B'))),
                        measureset_id,
                        clinvar_significance,
                        freqs.get("1kg_wgs_phase3", ""),
                        freqs.get("1kg_wgs_phase3_popmax", ""),
                        freqs.get("exac_v3", ""),
                        freqs.get("exac_v3_popmax", ""),
                        ", ".join(all_genotypes_list),
                    ]]
                    writer.writerow(row)

        print("Wrote out %s" % output_filename)
Example #24
0
def _gene_quicklook_carrier_ids(variants):
    """Return ids of individuals with an alt allele in any of `variants`, in first-seen order."""
    seen = set()
    carrier_ids = []
    for variant in variants:
        for indiv_id, genotype in variant.genotypes.items():
            if genotype.num_alt > 0 and indiv_id not in seen:
                seen.add(indiv_id)
                carrier_ids.append(indiv_id)
    return carrier_ids


def _gene_quicklook_csv_row(gene, gene_id, variant, individuals_to_include):
    """Build one CSV row (list of str) for `variant`, with one genotype column per included individual."""
    annotation = variant.annotation
    worst_annotation = annotation["vep_annotation"][
        annotation["worst_vep_index_per_gene"][gene_id]]
    freqs = annotation["freqs"]

    genotype_columns = []
    all_genotypes = []
    for indiv_id in individuals_to_include:
        genotype = variant.genotypes[indiv_id]
        allele_string = ">".join(genotype.alleles)
        all_genotypes.append(indiv_id + ":" + allele_string + "  ")
        if genotype.num_alt > 0:
            genotype_columns.append(allele_string + "   (" +
                                    str(genotype.gq) + ")")
        else:
            # non-carriers get an empty cell in their own column
            genotype_columns.append("")

    measureset_id, clinvar_significance = settings.CLINVAR_VARIANTS.get(
        variant.unique_tuple(), ("", ""))

    return [str(value) for value in [
        gene["symbol"],
        variant.chr,
        variant.pos,
        variant.ref,
        variant.alt,
        variant.vcf_id or "",
        annotation.get("vep_consequence", ""),
        worst_annotation.get("hgvsc", ""),
        worst_annotation.get("hgvsp", "").replace("%3D", "="),
        worst_annotation.get("sift", ""),
        worst_annotation.get("polyphen", ""),
        worst_annotation.get("mutationtaster_pred", ""),
        # '%3B' is a URL-encoded ';' separating predictions
        ";".join(set(worst_annotation.get("fathmm_pred", "").split('%3B'))),
        measureset_id,
        clinvar_significance,
        freqs.get("1kg_wgs_phase3", ""),
        freqs.get("1kg_wgs_phase3_popmax", ""),
        freqs.get("exac_v3", ""),
        freqs.get("exac_v3_popmax", ""),
        "".join(all_genotypes),
    ] + genotype_columns]


def gene_quicklook(request, project_id, gene_id):
    """
    Summary of a gene in a project

    Renders 'project/gene_quicklook.html' with the gene's rare variants and
    knockout individuals. When the `download` query parameter is
    'knockouts' or 'rare_variants', the corresponding table is returned as
    a CSV attachment instead; any other non-empty value gets a 400 response.
    """
    project = get_object_or_404(Project, project_id=project_id)
    if not project.can_view(request.user):
        return HttpResponse("Unauthorized")

    if project.project_status == Project.NEEDS_MORE_PHENOTYPES and not request.user.is_staff:
        return render(request, 'analysis_unavailable.html',
                      {'reason': 'Awaiting phenotype data.'})

    if gene_id is None:
        return render(
            request, 'project/gene_quicklook.html', {
                'project': project,
                'gene': None,
                'gene_json': None,
                'rare_variants_json': None,
                'individuals_json': None,
                'knockouts_json': None,
            })

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)
    sys.stderr.write(project_id + " - staring gene search for: %s %s \n" %
                     (gene_id, gene))

    # all rare coding variants
    variant_filter = get_default_variant_filter(
        'all_coding',
        mall.get_annotator().reference_population_slugs)

    rare_variants = []
    for variant in project_analysis.get_variants_in_gene(
            project, gene_id, variant_filter=variant_filter):
        # skip variants that nobody in the project actually carries; test
        # the carrier condition itself rather than the truthiness of the ids
        if not any(genotype.num_alt > 0
                   for genotype in variant.genotypes.values()):
            continue
        if max(variant.annotation['freqs'].values()) < .01:
            rare_variants.append(variant)
    add_extra_info_to_variants_project(get_reference(), project, rare_variants)

    # compute knockout individuals
    individ_ids_and_variants = []
    knockout_ids, variation = get_knockouts_in_gene(project, gene_id)
    for indiv_id in knockout_ids:
        variants = variation.get_relevant_variants_for_indiv_ids([indiv_id])
        add_extra_info_to_variants_project(get_reference(), project, variants)
        individ_ids_and_variants.append({
            'indiv_id': indiv_id,
            'variants': variants,
        })

    sys.stderr.write(
        "Project-wide gene search retrieved %s rare variants for gene: %s \n" %
        (len(rare_variants), gene_id))

    download_csv = request.GET.get('download', '')
    if not download_csv:
        # HTML view: variants must be JSON-serializable for the template
        for individ_id_and_variants in individ_ids_and_variants:
            individ_id_and_variants["variants"] = [
                v.toJSON() for v in individ_id_and_variants["variants"]
            ]

        return render(
            request, 'project/gene_quicklook.html', {
                'gene':
                gene,
                'gene_json':
                json.dumps(gene),
                'project':
                project,
                'rare_variants_json':
                json.dumps([v.toJSON() for v in rare_variants]),
                'individuals_json':
                json.dumps(
                    [i.get_json_obj() for i in project.get_individuals()]),
                'knockouts_json':
                json.dumps(individ_ids_and_variants),
            })

    # `download` comes straight from the query string and is echoed into the
    # Content-Disposition header, so only the two known values are accepted.
    # Previously any other value crashed with an UnboundLocalError.
    if download_csv == 'knockouts':
        individuals_to_include = [
            individ_id_and_variants["indiv_id"]
            for individ_id_and_variants in individ_ids_and_variants
        ]
        variants_to_export = [
            variant
            for individ_id_and_variants in individ_ids_and_variants
            for variant in individ_id_and_variants["variants"]
        ]
    elif download_csv == 'rare_variants':
        individuals_to_include = _gene_quicklook_carrier_ids(rare_variants)
        variants_to_export = rare_variants
    else:
        return HttpResponse("Unknown download type", status=400)

    response = HttpResponse(content_type='text/csv')
    response[
        'Content-Disposition'] = 'attachment; filename="{}_{}.csv"'.format(
            download_csv, gene["transcript_name"])

    header = [
        "gene", "chr", "pos", "ref", "alt", "rsID", "impact", "HGVS.c",
        "HGVS.p", "sift", "polyphen", "muttaster", "fathmm", "clinvar_id",
        "clinvar_clinical_sig", "freq_1kg_wgs_phase3",
        "freq_1kg_wgs_phase3_popmax", "freq_exac_v3",
        "freq_exac_v3_popmax", "all_genotypes"
    ] + individuals_to_include

    writer = csv.writer(response)
    writer.writerow(header)
    for variant in variants_to_export:
        writer.writerow(
            _gene_quicklook_csv_row(gene, gene_id, variant,
                                    individuals_to_include))
    return response
Example #25
0
    def handle(self, *args, **options):
        """
        Export the filtered variants carried by a list of individuals to
        'individuals_in_<project_id>.tsv.gz' in the current directory.

        Expects two positional args: the project_id and the path of a file
        with one individual id per line (first tab-separated column; blank
        lines and lines starting with '#' are ignored). Exits with an error
        message if the arg count is wrong or if any id in the file is not an
        individual of the project.
        """
        if len(args) != 2:
            sys.exit("ERROR: please specify the project_id and file of individual ids as command line args.")

        project_id = args[0]
        individuals_file = args[1]

        # init objects
        project = Project.objects.get(project_id=project_id)
        all_individual_ids_in_project = {i.indiv_id for i in project.get_individuals()}

        individuals_of_interest = []
        invalid_individual_ids = []
        with open(individuals_file) as f:
            for line in f:
                line = line.strip('\n')
                if not line or line.startswith("#"):
                    continue
                individual_id = line.split("\t")[0]
                if individual_id in all_individual_ids_in_project:
                    individuals_of_interest.append(individual_id)
                else:
                    invalid_individual_ids.append(individual_id)

        print("Processing %s: %d individuals " % (project_id, len(individuals_of_interest)))
        if invalid_individual_ids:
            num_invalid = len(invalid_individual_ids)
            # "N out of M ids" refers to the ids read from the file, not to
            # the number of individuals in the project
            total_ids = len(individuals_of_interest) + num_invalid
            sys.exit(("ERROR: %(individuals_file)s: %(num_invalid)s out of %(total_ids)s ids are invalid. \nThe invalid ids are: "
                      "%(invalid_individual_ids)s.\nValid ids are: %(individuals_of_interest)s") % {
                          'individuals_file': individuals_file,
                          'num_invalid': num_invalid,
                          'total_ids': total_ids,
                          'invalid_individual_ids': invalid_individual_ids,
                          'individuals_of_interest': individuals_of_interest,
                      })

        # set for O(1) membership tests inside the per-individual loop
        ids_of_interest = set(individuals_of_interest)

        # filter
        variant_filter = get_default_variant_filter('moderate_impact')
        variant_filter.ref_freqs.append(('1kg_wgs_phase3', g1k_freq_threshold))
        variant_filter.ref_freqs.append(('1kg_wgs_phase3_popmax', g1k_popmax_freq_threshold))
        variant_filter.ref_freqs.append(('exac_v3', exac_freq_threshold))
        variant_filter.ref_freqs.append(('exac_v3_popmax', exac_popmax_threshold))
        variant_filter.ref_freqs.append(('merck-wgs-3793', merck_wgs_3793_threshold))
        quality_filter = {
            'vcf_filter': 'pass',
            'min_gq': GQ_threshold,
            'min_ab': AB_threshold,
        }

        header_fields = [
            'project_id',
            'family_id',
            'individual_id',
            'gene',
            'chrom',
            'pos',
            'ref',
            'alt',
            'rsid',
            'annotation',
            '1kg_af',
            '1kg_popmax_af',
            'exac_af',
            'exac_popmax_af',
            'merck_wgs_3793_af',
            'genotype_str',
            'genotype_num_alt',
            'genotype_allele_balance',
            'genotype_AD',
            'genotype_DP',
            'genotype_GQ',
            'genotype_PL',
            'genotype_filter',
            ]

        # collect the resources that we'll need here
        # NOTE(review): annotator is not used below; the call is kept in case
        # get_annotator() has initialization side effects - confirm.
        annotator = get_annotator()
        custom_population_store = get_custom_population_store()

        # create individuals_variants.tsv
        # `with` closes the gzip stream even when one of the sanity asserts
        # below aborts the export.
        # NOTE(review): mode 'w' is binary under Python 3, where csv.writer
        # needs text mode ('wt') - confirm the target Python version.
        with gzip.open('individuals_in_%s.tsv.gz' % project_id, 'w') as individual_variants_f:
            writer = csv.writer(individual_variants_f, dialect='excel', delimiter='\t')
            writer.writerow(header_fields)

            individual_counter = 0
            for family in project.get_families():
                for individual in family.get_individuals():
                    if individual.indiv_id not in ids_of_interest:
                        continue
                    individual_counter += 1
                    print("%s: %s, individual %s" % (individual_counter, family.family_id, individual.indiv_id))
                    for variant in get_variants(get_datastore(project.project_id),
                                                family.xfamily(),
                                                variant_filter = variant_filter,
                                                quality_filter = quality_filter,
                                                indivs_to_consider = [individual.indiv_id]
                                                ):
                        genotype = variant.get_genotype(individual.indiv_id)
                        # skip no-calls, low-depth calls and non-carriers
                        if len(genotype.alleles) == 0 or genotype.extras["dp"] < DP_threshold or genotype.num_alt == 0:
                            continue

                        custom_populations = custom_population_store.get_frequencies(variant.xpos, variant.ref, variant.alt)

                        # alleles is known to be non-empty at this point
                        genotype_str = "/".join(genotype.alleles)

                        freqs = variant.annotation['freqs']
                        g1k_freq = freqs['1kg_wgs_phase3']
                        g1k_popmax_freq = freqs['1kg_wgs_phase3_popmax']
                        exac_freq = freqs['exac_v3']
                        exac_popmax_freq = freqs['exac_v3_popmax']
                        merck_wgs_3793_freq = custom_populations.get('merck-wgs-3793', 0.0)

                        # sanity checks: every variant returned by get_variants
                        # is expected to already satisfy the filters above
                        assert g1k_freq <= g1k_freq_threshold, "g1k freq %s > %s" % (g1k_freq, g1k_freq_threshold)
                        assert g1k_popmax_freq <= g1k_popmax_freq_threshold, "g1k popmax freq %s > %s" % (g1k_popmax_freq, g1k_popmax_freq_threshold)
                        assert exac_freq <= exac_freq_threshold, "Exac freq %s > %s" % (exac_freq, exac_freq_threshold)
                        assert exac_popmax_freq <= exac_popmax_threshold, "Exac popmax freq %s > %s" % (exac_popmax_freq, exac_popmax_threshold)
                        assert merck_wgs_3793_freq <= merck_wgs_3793_threshold

                        assert genotype.gq >= GQ_threshold, "%s %s - GQ is %s " % (variant.chr, variant.pos, genotype.gq)
                        # message previously said "GQ" although DP is checked
                        assert genotype.extras["dp"] >= DP_threshold, "%s %s - DP is %s " % (variant.chr, variant.pos, genotype.extras["dp"])
                        if genotype.num_alt == 1:
                            # AB_threshold is divided by 100 before comparing
                            # with genotype.ab
                            assert genotype.ab >= AB_threshold/100., "%s %s - AB is %s " % (variant.chr, variant.pos, genotype.ab)
                        assert genotype.filter == "pass", "%s %s - filter is %s " % (variant.chr, variant.pos, genotype.filter)

                        writer.writerow([str(value) for value in [
                            project_id,
                            family.family_id,
                            individual.indiv_id,
                            get_gene_symbol(variant),
                            variant.chr,
                            variant.pos,
                            variant.ref,
                            variant.alt,
                            variant.vcf_id,
                            variant.annotation['vep_group'],
                            g1k_freq,
                            g1k_popmax_freq,
                            exac_freq,
                            exac_popmax_freq,
                            merck_wgs_3793_freq,
                            genotype_str,
                            genotype.num_alt,
                            genotype.ab,
                            genotype.extras["ad"],
                            genotype.extras["dp"],
                            genotype.gq,
                            genotype.extras["pl"],
                            genotype.filter,
                        ]])
                        # per-row flush kept from the original; presumably so
                        # partial output can be inspected while the run is
                        # still in progress
                        individual_variants_f.flush()