Example #1
0
def add_custom_populations_to_variants(variants, population_slug_list):
    """Add custom population data to ``variants`` on a best-effort basis.

    Delegates to ``add_populations_to_variants`` on the store returned by
    ``mall.get_custom_population_store()``. Nothing happens when
    ``population_slug_list`` is empty or None.

    Any exception raised along the way is caught and reported as a warning on
    stdout instead of being propagated, so this function never fails the
    caller.
    """
    if not population_slug_list:
        return

    # "except ... as e" is valid on Python 2.6+ and Python 3; the older
    # "except Exception, e" spelling is a syntax error on Python 3.
    try:
        mall.get_custom_population_store().add_populations_to_variants(
            variants, population_slug_list)
    except Exception as e:
        print(
            "WARNING: got unexpected error in add_custom_populations_to_variants: %s"
            % e)
    def handle(self, *args, **options):
        """List reference population slugs, or load one custom population.

        With no positional argument, prints the slugs found in
        ``settings.ANNOTATOR_REFERENCE_POPULATION_SLUGS`` ("Global") and the
        slugs of all ``ReferencePopulation`` rows ("Private").

        Otherwise ``args[0]`` is the slug of the population to load. Its
        definition is looked up in ``settings.ANNOTATOR_REFERENCE_POPULATIONS``
        and in ``ReferencePopulation``; exactly one match is required. The
        options must supply either ``AF_key`` or both ``AC_key`` and
        ``AN_key``, otherwise the command exits with an error message.
        """
        from xbrowse_server import mall

        if not args:
            global_slugs = list(settings.ANNOTATOR_REFERENCE_POPULATION_SLUGS)
            print("Global: " + str(global_slugs))
            private_slugs = [pop.slug for pop in ReferencePopulation.objects.all()]
            print("Private: " + str(private_slugs))
            return

        store = mall.get_custom_population_store()
        store._ensure_indices()

        population_id = args[0]
        print("Loading population: " + population_id)

        # Candidates from the settings first, then from the database.
        matches = [
            pop for pop in settings.ANNOTATOR_REFERENCE_POPULATIONS
            if pop["slug"] == population_id
        ]
        matches.extend(
            pop.to_dict() for pop in ReferencePopulation.objects.all()
            if pop.slug == population_id
        )

        assert len(matches) == 1
        population_dict = matches[0]
        print(options)

        # An allele-frequency key takes precedence over the AC/AN key pair.
        if options["AF_key"]:
            population_dict["vcf_info_key"] = options["AF_key"]
        elif not (options["AC_key"] and options["AN_key"]):
            sys.exit("Must specify either --AF-key or both --AC-key and --AN-key")
        else:
            population_dict["ac_info_key"] = options["AC_key"]
            population_dict["an_info_key"] = options["AN_key"]

        store.load_population(population_dict)
Example #3
0
    def handle(self, *args, **options):
        """List reference population slugs, or load one custom population.

        With no positional argument, prints the slugs found in
        ``settings.ANNOTATOR_REFERENCE_POPULATION_SLUGS`` ("Global") and the
        slugs of all ``ReferencePopulation`` rows ("Private").

        Otherwise ``args[0]`` is the slug of the population to load. Its
        definition is looked up in ``settings.ANNOTATOR_REFERENCE_POPULATIONS``
        and in ``ReferencePopulation``; an AssertionError is raised unless
        exactly one definition matches. The options must supply either
        ``AF_key`` or both ``AC_key`` and ``AN_key``; otherwise the command
        exits via ``sys.exit`` before anything is loaded.
        """
        from xbrowse_server import mall

        if not args:
            print("Global: " + str(list(settings.ANNOTATOR_REFERENCE_POPULATION_SLUGS)))
            print("Private: " +
                  str([p.slug for p in ReferencePopulation.objects.all()]))
            return

        pop_store = mall.get_custom_population_store()
        pop_store._ensure_indices()

        population_id = args[0]
        populations = [p for p in settings.ANNOTATOR_REFERENCE_POPULATIONS if p["slug"] == population_id] + \
                   [p.to_dict() for p in ReferencePopulation.objects.all() if p.slug == population_id]

        # Say which slug failed and how many definitions matched, instead of
        # failing with a bare AssertionError.
        assert len(populations) == 1, (
            "expected exactly one population with slug %r, found %d"
            % (population_id, len(populations)))
        population_dict = populations[0]

        # An allele-frequency key takes precedence over the AC/AN key pair.
        if options["AF_key"]:
            population_dict["vcf_info_key"] = options["AF_key"]
        elif options["AC_key"] and options["AN_key"]:
            population_dict["ac_info_key"] = options["AC_key"]
            population_dict["an_info_key"] = options["AN_key"]
        else:
            sys.exit(
                "Must specify either --AF-key or both --AC-key and --AN-key"
            )

        print("Loading population: " + population_id)
        pop_store.load_population(population_dict)
    def handle(self, *args, **options):
        """Write a gzipped TSV report of family variants for one project.

        Expects exactly one positional argument, the project id, and exits via
        ``sys.exit`` otherwise. The report goes to
        ``family_variants_<project_id>.tsv.gz`` in the current directory: one
        row per (inheritance mode, family, variant), followed by one group of
        genotype columns for each of up to ten individuals of the family.

        Frequency and genotype-quality thresholds are checked with assertions,
        so a variant that violates one aborts the run with AssertionError. The
        output file is closed even when that happens.
        """
        if not args:
            sys.exit("ERROR: please specify project id on the command line")
        if len(args) > 1:
            sys.exit("ERROR: too many args: %s. Only one project id should be provided." % " ".join(args))

        project_id = args[0]

        header_fields = [
            "#inheritance_mode",
            "project_id",
            "family_id",
            "gene",
            "chrom",
            "pos",
            "ref",
            "alt",
            "rsid",
            "annotation",
            "1kg_af",
            "1kg_popmax_af",
            "exac_af",
            "exac_popmax_af",
            "",
        ]

        genotype_headers = ["sample_id", "str", "num_alt", "allele_balance", "AD", "DP", "GQ", "PL"]

        # The report has room for at most this many individuals per family.
        max_individuals = 10
        for column_group in range(max_individuals):
            for h in genotype_headers:
                header_fields.append("genotype%d_%s" % (column_group, h))

        # Blank cells that follow sample_id and the "./." call for an
        # individual that has no genotype at a variant.
        missing_genotype_padding = [""] * (len(genotype_headers) - 2)

        # create family_variants.tsv
        # NOTE(review): on Python 3, gzip mode "w" is binary while csv.writer
        # needs a text-mode file ("wt") - confirm the target interpreter.
        family_variants_f = gzip.open("family_variants_%s.tsv.gz" % project_id, "w")
        try:
            writer = csv.writer(family_variants_f, dialect="excel", delimiter="\t")
            writer.writerow(header_fields)
            family_variants_f.flush()

            for inheritance_mode in ["dominant", "homozygous_recessive", "compound_het", "de_novo", "x_linked_recessive"]:
                # collect the resources that we'll need here
                # NOTE(review): annotator and families are never read below;
                # the calls are kept in case they have initialisation side
                # effects - confirm and remove.
                annotator = mall.get_annotator()
                custom_population_store = mall.get_custom_population_store()

                project = Project.objects.get(project_id=project_id)
                families = project.get_families()

                # get the variants for this inheritance / project combination
                for family, variant_list in get_variants_for_inheritance_for_project(project, inheritance_mode):
                    for variant in variant_list:
                        # NOTE(review): the looked-up frequencies are not used
                        # in this report - confirm whether the lookup is needed.
                        custom_populations = custom_population_store.get_frequencies(variant.xpos, variant.ref, variant.alt)
                        freqs = variant.annotation["freqs"]
                        g1k_freq = freqs["1kg_wgs_phase3"]
                        g1k_popmax_freq = freqs["1kg_wgs_phase3_popmax"]
                        exac_freq = freqs["exac_v3"]
                        exac_popmax_freq = freqs["exac_v3_popmax"]

                        assert g1k_freq <= g1k_freq_threshold, "g1k freq %s > %s" % (g1k_freq, g1k_freq_threshold)
                        assert g1k_popmax_freq <= g1k_popmax_freq_threshold, "g1k freq %s > %s" % (
                            g1k_popmax_freq,
                            g1k_popmax_freq_threshold,
                        )
                        assert exac_freq <= exac_freq_threshold, "Exac freq %s > %s" % (exac_freq, exac_freq_threshold)
                        assert exac_popmax_freq <= exac_popmax_threshold, "Exac popmax freq %s > %s" % (
                            exac_popmax_freq,
                            exac_popmax_threshold,
                        )

                        row = [
                            inheritance_mode,
                            project_id,
                            family.family_id,
                            get_gene_symbol(variant),
                            variant.chr,
                            str(variant.pos),
                            variant.ref,
                            variant.alt,
                            variant.vcf_id,
                            variant.annotation["vep_group"],
                            g1k_freq,
                            g1k_popmax_freq,
                            exac_freq,
                            exac_popmax_freq,
                            "",
                        ]

                        for indiv_index, individual in enumerate(family.get_individuals()):
                            if indiv_index >= max_individuals:
                                break

                            genotype = variant.get_genotype(individual.indiv_id)
                            if genotype is None:
                                print("WARNING: %s variant genotype for %s is None" % (variant, individual.indiv_id))
                                # Still emit a full column group so that the
                                # following individuals stay aligned with the
                                # genotypeN_* header columns.
                                row.extend([individual.indiv_id, "./."] + missing_genotype_padding)
                                continue

                            assert genotype.filter == "pass", "%s %s - filter is %s " % (
                                variant.chr,
                                variant.pos,
                                genotype.filter,
                            )
                            assert genotype.gq >= GQ_threshold, "%s %s - GQ is %s " % (
                                variant.chr,
                                variant.pos,
                                genotype.gq,
                            )
                            assert genotype.extras["dp"] >= DP_threshold, "%s %s - DP is %s " % (
                                variant.chr,
                                variant.pos,
                                genotype.extras["dp"],
                            )
                            if genotype.num_alt == 1:
                                # AB_threshold is divided by 100 before being
                                # compared with genotype.ab.
                                assert genotype.ab >= AB_threshold / 100.0, "%s %s - AB is %s " % (
                                    variant.chr,
                                    variant.pos,
                                    genotype.ab,
                                )

                            genotype_str = "/".join(genotype.alleles) if genotype.alleles else "./."

                            row.extend(
                                [
                                    individual.indiv_id,
                                    genotype_str,
                                    genotype.num_alt,
                                    genotype.ab,
                                    genotype.extras["ad"],
                                    genotype.extras["dp"],
                                    genotype.gq,
                                    genotype.extras["pl"],
                                ]
                            )

                        writer.writerow(row)
                        family_variants_f.flush()
        finally:
            # Close the file even when an assertion or lookup above raises.
            family_variants_f.close()
Example #5
0
def add_custom_populations_to_variants(variants, population_slug_list):
    """Add custom population data to ``variants`` on a best-effort basis.

    Calls ``add_populations_to_variants`` on the store returned by
    ``mall.get_custom_population_store()``. Does nothing when
    ``population_slug_list`` is empty or None.

    Exceptions are not propagated: they are caught and reported as a warning
    on stdout.
    """
    if not population_slug_list:
        return

    # The "as" form works on Python 2.6+ and Python 3, whereas
    # "except Exception, e" does not parse on Python 3.
    try:
        mall.get_custom_population_store().add_populations_to_variants(variants, population_slug_list)
    except Exception as e:
        print("WARNING: got unexpected error in add_custom_populations_to_variants: %s" % e)
Example #6
0
def add_custom_populations_to_variants(variants, population_slug_list):
    """Add custom population data to ``variants``.

    Hands ``variants`` and ``population_slug_list`` to
    ``add_populations_to_variants`` on the store returned by
    ``mall.get_custom_population_store()``. Does nothing when the slug list
    is empty or None. Errors from the store propagate to the caller.
    """
    if not population_slug_list:
        return
    store = mall.get_custom_population_store()
    store.add_populations_to_variants(variants, population_slug_list)
Example #7
0
def handle_project(project_id):
    """Write a gzipped TSV report of family variants for ``project_id``.

    The report goes to ``family_variants_<project_id>.tsv.gz`` in the current
    directory: one row per (inheritance mode, family, variant), followed by
    one group of genotype columns for each of up to ten individuals of the
    family. Progress messages are printed before and after the report is
    written.

    Frequency and genotype-quality thresholds are checked with assertions
    whose failures are only logged (traceback printed); the row is written
    regardless. The output file is closed even if an unexpected error
    interrupts the report.
    """
    import traceback

    filename = 'family_variants_%s.tsv.gz' % project_id
    print("Generating report: " + filename)

    header_fields = [
        '#inheritance_mode',
        'project_id',
        'family_id',
        'gene',
        'chrom',
        'pos',
        'ref',
        'alt',
        'rsid',
        'filter',
        'clinvar_status',
        'annotation',
        '1kg_af',
        '1kg_popmax_af',
        'exac_af',
        'exac_popmax_af',
        'merck_wgs_3793_af',
        'merck_wgs_144_af',
        'multiallelic_site_alt_alleles (* = spanning deletion)',
        '',
    ]

    genotype_headers = [
        'sample_id',
        'str',
        'num_alt',
        'allele_balance',
        'AD',
        'DP',
        'GQ',
        'PL',
    ]

    # The report has room for at most this many individuals per family.
    max_individuals = 10
    for column_group in range(max_individuals):
        for h in genotype_headers:
            header_fields.append("genotype%d_%s" % (column_group, h))

    # create family_variants.tsv
    # NOTE(review): on Python 3, gzip mode 'w' is binary while csv.writer
    # needs a text-mode file ('wt') - confirm the target interpreter.
    family_variants_f = gzip.open(filename, 'w')
    try:
        writer = csv.writer(family_variants_f, dialect='excel', delimiter='\t')
        writer.writerow(header_fields)

        for inheritance_mode in [
                'homozygous_recessive', 'dominant', 'compound_het', 'de_novo',
                'x_linked_recessive', 'all_variants'
        ]:
            # collect the resources that we'll need here
            # NOTE(review): annotator is never read below; the call is kept
            # in case it has initialisation side effects - confirm and remove.
            annotator = mall.get_annotator()
            custom_population_store = mall.get_custom_population_store()

            project = Project.objects.get(project_id=project_id)

            # get the variants for this inheritance / project combination
            for family, family_results in get_variants_for_inheritance_for_project(
                    project, inheritance_mode):
                for variant in family_results:
                    custom_populations = custom_population_store.get_frequencies(
                        variant.xpos, variant.ref, variant.alt)
                    freqs = variant.annotation['freqs']
                    g1k_freq = freqs['1kg_wgs_phase3']
                    g1k_popmax_freq = freqs['1kg_wgs_phase3_popmax']
                    exac_freq = freqs['exac_v3']
                    exac_popmax_freq = freqs['exac_v3_popmax']
                    merck_wgs_3793_freq = custom_populations.get(
                        'merck-wgs-3793', 0.0)
                    merck_wgs_144_freq = custom_populations.get(
                        'merck-pcr-free-wgs-144', 0.0)

                    # Soft checks: a violated threshold is logged, not fatal.
                    try:
                        assert g1k_freq <= g1k_freq_threshold, "g1k freq %s > %s" % (
                            g1k_freq, g1k_freq_threshold)
                        assert g1k_popmax_freq <= g1k_popmax_freq_threshold, "g1k freq %s > %s" % (
                            g1k_popmax_freq, g1k_popmax_freq_threshold)
                        assert exac_freq <= exac_freq_threshold, "Exac freq %s > %s" % (
                            exac_freq, exac_freq_threshold)
                        assert exac_popmax_freq <= exac_popmax_threshold, "Exac popmax freq %s > %s" % (
                            exac_popmax_freq, exac_popmax_threshold)
                    except AssertionError:
                        traceback.print_exc()

                    individuals = family.get_individuals()
                    if len(individuals) == 0:
                        print("Family has 0 individuals: %s - skipping..." %
                              str(family))
                        continue

                    # filter value is stored in the genotypes
                    first_genotype = variant.get_genotype(individuals[0].indiv_id)
                    if first_genotype is not None:
                        filter_value = first_genotype.filter
                    else:
                        filter_value = 'unknown'

                    multiallelic_site_other_alleles = []
                    if len(variant.extras['orig_alt_alleles']) > 1:
                        multiallelic_site_other_alleles = variant.extras[
                            'orig_alt_alleles']

                    clinvar_significance = get_clinvar_variants().get(
                        variant.unique_tuple(), [""])[-1]
                    row = [
                        inheritance_mode,
                        project_id,
                        family.family_id,
                        get_gene_symbol(variant),
                        variant.chr,
                        str(variant.pos),
                        variant.ref,
                        variant.alt,
                        variant.vcf_id,
                        filter_value,
                        clinvar_significance,
                        variant.annotation['vep_group'],
                        g1k_freq,
                        g1k_popmax_freq,
                        exac_freq,
                        exac_popmax_freq,
                        merck_wgs_3793_freq,
                        merck_wgs_144_freq,
                        ", ".join(multiallelic_site_other_alleles),
                        '',
                    ]

                    for indiv_index, individual in enumerate(individuals):
                        if indiv_index >= max_individuals:
                            break

                        genotype = variant.get_genotype(individual.indiv_id)

                        if genotype is None:
                            # Placeholder column group keeps later individuals
                            # aligned with the genotypeN_* header columns.
                            row.extend([
                                individual.indiv_id, "./.", "", "", "", "", "", ""
                            ])
                            continue

                        # Soft checks: a violated threshold is logged, not fatal.
                        try:
                            assert genotype.gq >= GQ_threshold, "%s %s - GQ is %s " % (
                                variant.chr, variant.pos, genotype.gq)
                            assert genotype.extras["dp"] >= DP_threshold, "%s %s - DP is %s " % (
                                variant.chr, variant.pos, genotype.extras["dp"])
                            if genotype.num_alt == 1:
                                assert genotype.ab is None or genotype.ab >= AB_threshold / 100., "%s %s - AB is %s " % (
                                    variant.chr, variant.pos, genotype.ab)
                        except AssertionError:
                            traceback.print_exc()

                        genotype_str = "/".join(
                            genotype.alleles) if genotype.alleles else "./."

                        row.extend([
                            individual.indiv_id,
                            genotype_str,
                            genotype.num_alt,
                            genotype.ab if genotype.ab is not None else '',
                            genotype.extras["ad"],
                            genotype.extras["dp"],
                            genotype.gq,
                            genotype.extras["pl"],
                        ])

                    writer.writerow(row)
                    family_variants_f.flush()
    finally:
        # Close the file even when something above raises.
        family_variants_f.close()

    print("Done with " + filename)
Example #8
0
def add_custom_populations_to_variants(variants, population_slug_list):
    """Add custom population data to ``variants``.

    No-op when ``population_slug_list`` is empty or None; otherwise forwards
    both arguments to ``add_populations_to_variants`` on the store returned
    by ``mall.get_custom_population_store()``. Exceptions are not caught here.
    """
    if not population_slug_list:
        return
    population_store = mall.get_custom_population_store()
    population_store.add_populations_to_variants(
        variants, population_slug_list)
Example #9
0
    def handle(self, *args, **options):
        """Write a gzipped TSV report of family variants for one project.

        Expects exactly one positional argument, the project id, and exits via
        ``sys.exit`` otherwise. The report goes to
        ``family_variants_<project_id>.tsv.gz`` in the current directory: one
        row per (inheritance mode, family, variant), followed by one group of
        genotype columns for each of up to ten individuals of the family.

        Frequency and genotype-quality thresholds are checked with assertions,
        so a variant that violates one aborts the run with AssertionError. The
        output file is closed even when that happens.
        """
        if not args:
            sys.exit("ERROR: please specify project id on the command line")
        if len(args) > 1:
            sys.exit("ERROR: too many args: %s. Only one project id should be provided." % " ".join(args) )

        project_id = args[0]

        header_fields = [
            '#inheritance_mode',
            'project_id',
            'family_id',
            'gene',
            'chrom',
            'pos',
            'ref',
            'alt',
            'rsid',
            'annotation',
            '1kg_af',
            '1kg_popmax_af',
            'exac_af',
            'exac_popmax_af',
            '',
        ]

        genotype_headers = [
            'sample_id',
            'str',
            'num_alt',
            'allele_balance',
            'AD',
            'DP',
            'GQ',
            'PL',
        ]

        # The report has room for at most this many individuals per family.
        max_individuals = 10
        for column_group in range(max_individuals):
            for h in genotype_headers:
                header_fields.append("genotype%d_%s" % (column_group, h))

        # Blank cells that follow sample_id and the "./." call for an
        # individual that has no genotype at a variant.
        missing_genotype_padding = [''] * (len(genotype_headers) - 2)

        # create family_variants.tsv
        # NOTE(review): on Python 3, gzip mode 'w' is binary while csv.writer
        # needs a text-mode file ('wt') - confirm the target interpreter.
        family_variants_f = gzip.open('family_variants_%s.tsv.gz' % project_id, 'w')
        try:
            writer = csv.writer(family_variants_f, dialect='excel', delimiter='\t')
            writer.writerow(header_fields)
            family_variants_f.flush()

            for inheritance_mode in ['dominant', 'homozygous_recessive', 'compound_het', 'de_novo', 'x_linked_recessive']:
                # collect the resources that we'll need here
                # NOTE(review): annotator and families are never read below;
                # the calls are kept in case they have initialisation side
                # effects - confirm and remove.
                annotator = mall.get_annotator()
                custom_population_store = mall.get_custom_population_store()

                project = Project.objects.get(project_id=project_id)
                families = project.get_families()

                # get the variants for this inheritance / project combination
                for family, variant_list in get_variants_for_inheritance_for_project(project, inheritance_mode):
                    for variant in variant_list:
                        # NOTE(review): the looked-up frequencies are not used
                        # in this report - confirm whether the lookup is needed.
                        custom_populations = custom_population_store.get_frequencies(variant.xpos, variant.ref, variant.alt)
                        freqs = variant.annotation['freqs']
                        g1k_freq = freqs['1kg_wgs_phase3']
                        g1k_popmax_freq = freqs['1kg_wgs_phase3_popmax']
                        exac_freq = freqs['exac_v3']
                        exac_popmax_freq = freqs['exac_v3_popmax']

                        assert g1k_freq <= g1k_freq_threshold, "g1k freq %s > %s" % (g1k_freq, g1k_freq_threshold)
                        assert g1k_popmax_freq <= g1k_popmax_freq_threshold, "g1k freq %s > %s" % (g1k_popmax_freq, g1k_popmax_freq_threshold)
                        assert exac_freq <= exac_freq_threshold, "Exac freq %s > %s" % (exac_freq, exac_freq_threshold)
                        assert exac_popmax_freq <= exac_popmax_threshold, "Exac popmax freq %s > %s" % (exac_popmax_freq, exac_popmax_threshold)

                        row = [
                            inheritance_mode,
                            project_id,
                            family.family_id,
                            get_gene_symbol(variant),
                            variant.chr,
                            str(variant.pos),
                            variant.ref,
                            variant.alt,
                            variant.vcf_id,
                            variant.annotation['vep_group'],
                            g1k_freq,
                            g1k_popmax_freq,
                            exac_freq,
                            exac_popmax_freq,
                            '',
                        ]

                        for indiv_index, individual in enumerate(family.get_individuals()):
                            if indiv_index >= max_individuals:
                                break

                            genotype = variant.get_genotype(individual.indiv_id)
                            if genotype is None:
                                print("WARNING: %s variant genotype for %s is None" % (variant, individual.indiv_id))
                                # Still emit a full column group so that the
                                # following individuals stay aligned with the
                                # genotypeN_* header columns.
                                row.extend([individual.indiv_id, "./."] + missing_genotype_padding)
                                continue

                            assert genotype.filter == "pass", "%s %s - filter is %s " % (variant.chr, variant.pos, genotype.filter)
                            assert genotype.gq >= GQ_threshold, "%s %s - GQ is %s " % (variant.chr, variant.pos, genotype.gq)
                            assert genotype.extras["dp"] >= DP_threshold, "%s %s - DP is %s " % (variant.chr, variant.pos, genotype.extras["dp"])
                            if genotype.num_alt == 1:
                                # AB_threshold is divided by 100 before being
                                # compared with genotype.ab.
                                assert genotype.ab >= AB_threshold/100., "%s %s - AB is %s " % (variant.chr, variant.pos, genotype.ab)

                            genotype_str = "/".join(genotype.alleles) if genotype.alleles else "./."

                            row.extend([
                                individual.indiv_id,
                                genotype_str,
                                genotype.num_alt,
                                genotype.ab,
                                genotype.extras["ad"],
                                genotype.extras["dp"],
                                genotype.gq,
                                genotype.extras["pl"],
                            ])

                        writer.writerow(row)
                        family_variants_f.flush()
        finally:
            # Close the file even when an assertion or lookup above raises.
            family_variants_f.close()
def handle_project(project_id):
    """Write a gzipped TSV report of family variants for ``project_id``.

    The report goes to ``family_variants_<project_id>.tsv.gz`` in the current
    directory: one row per (inheritance mode, family, variant), followed by
    one group of genotype columns for each of up to ten individuals of the
    family. Progress messages are printed before and after the report is
    written.

    Frequency and genotype-quality thresholds are checked with assertions
    whose failures are only logged (traceback printed); the row is written
    regardless. The output file is closed even if an unexpected error
    interrupts the report.
    """
    import traceback

    filename = 'family_variants_%s.tsv.gz' % project_id
    print("Generating report: " + filename)

    header_fields = [
        '#inheritance_mode',
        'project_id',
        'family_id',
        'gene',
        'chrom',
        'pos',
        'ref',
        'alt',
        'rsid',
        'filter',
        'clinvar_status',
        'annotation',
        '1kg_af',
        '1kg_popmax_af',
        'exac_af',
        'exac_popmax_af',
        'merck_wgs_3793_af',
        'merck_wgs_144_af',
        'multiallelic_site_alt_alleles (* = spanning deletion)',
        '',
    ]

    genotype_headers = [
        'sample_id',
        'str',
        'num_alt',
        'allele_balance',
        'AD',
        'DP',
        'GQ',
        'PL',
    ]

    # The report has room for at most this many individuals per family.
    max_individuals = 10
    for column_group in range(max_individuals):
        for h in genotype_headers:
            header_fields.append("genotype%d_%s" % (column_group, h))

    # create family_variants.tsv
    # NOTE(review): on Python 3, gzip mode 'w' is binary while csv.writer
    # needs a text-mode file ('wt') - confirm the target interpreter.
    family_variants_f = gzip.open(filename, 'w')
    try:
        writer = csv.writer(family_variants_f, dialect='excel', delimiter='\t')
        writer.writerow(header_fields)

        for inheritance_mode in ['homozygous_recessive', 'dominant', 'compound_het', 'de_novo', 'x_linked_recessive', 'all_variants']:
            # collect the resources that we'll need here
            # NOTE(review): annotator is never read below; the call is kept
            # in case it has initialisation side effects - confirm and remove.
            annotator = mall.get_annotator()
            custom_population_store = mall.get_custom_population_store()

            project = Project.objects.get(project_id=project_id)

            # get the variants for this inheritance / project combination
            for family, family_results in get_variants_for_inheritance_for_project(project, inheritance_mode):
                for variant in family_results:
                    custom_populations = custom_population_store.get_frequencies(variant.xpos, variant.ref, variant.alt)
                    freqs = variant.annotation['freqs']
                    g1k_freq = freqs['1kg_wgs_phase3']
                    g1k_popmax_freq = freqs['1kg_wgs_phase3_popmax']
                    exac_freq = freqs['exac_v3']
                    exac_popmax_freq = freqs['exac_v3_popmax']
                    merck_wgs_3793_freq = custom_populations.get('merck-wgs-3793', 0.0)
                    merck_wgs_144_freq = custom_populations.get('merck-pcr-free-wgs-144', 0.0)

                    # Soft checks: a violated threshold is logged, not fatal.
                    try:
                        assert g1k_freq <= g1k_freq_threshold, "g1k freq %s > %s" % (g1k_freq, g1k_freq_threshold)
                        assert g1k_popmax_freq <= g1k_popmax_freq_threshold, "g1k freq %s > %s" % (g1k_popmax_freq, g1k_popmax_freq_threshold)
                        assert exac_freq <= exac_freq_threshold, "Exac freq %s > %s" % (exac_freq, exac_freq_threshold)
                        assert exac_popmax_freq <= exac_popmax_threshold, "Exac popmax freq %s > %s" % (exac_popmax_freq, exac_popmax_threshold)
                    except AssertionError:
                        traceback.print_exc()

                    individuals = family.get_individuals()
                    if len(individuals) == 0:
                        print("Family has 0 individuals: %s - skipping..." % str(family))
                        continue

                    # filter value is stored in the genotypes; the first
                    # individual may have no genotype at this variant, in
                    # which case there is no filter value to report.
                    first_genotype = variant.get_genotype(individuals[0].indiv_id)
                    if first_genotype is not None:
                        filter_value = first_genotype.filter
                    else:
                        filter_value = 'unknown'

                    multiallelic_site_other_alleles = []
                    if len(variant.extras['orig_alt_alleles']) > 1:
                        multiallelic_site_other_alleles = variant.extras['orig_alt_alleles']

                    clinvar_significance = CLINVAR_VARIANTS.get(variant.unique_tuple(), [""])[-1]
                    row = [
                        inheritance_mode,
                        project_id,
                        family.family_id,
                        get_gene_symbol(variant),
                        variant.chr,
                        str(variant.pos),
                        variant.ref,
                        variant.alt,
                        variant.vcf_id,
                        filter_value,
                        clinvar_significance,
                        variant.annotation['vep_group'],
                        g1k_freq,
                        g1k_popmax_freq,
                        exac_freq,
                        exac_popmax_freq,
                        merck_wgs_3793_freq,
                        merck_wgs_144_freq,
                        ", ".join(multiallelic_site_other_alleles),
                        '',
                    ]

                    for indiv_index, individual in enumerate(individuals):
                        if indiv_index >= max_individuals:
                            break

                        genotype = variant.get_genotype(individual.indiv_id)

                        if genotype is None:
                            # Placeholder column group keeps later individuals
                            # aligned with the genotypeN_* header columns.
                            row.extend([individual.indiv_id, "./.", "", "", "", "", "", ""])
                            continue

                        # Soft checks: a violated threshold is logged, not fatal.
                        try:
                            assert genotype.gq >= GQ_threshold, "%s %s - GQ is %s " % (variant.chr, variant.pos, genotype.gq)
                            assert genotype.extras["dp"] >= DP_threshold, "%s %s - DP is %s " % (variant.chr, variant.pos, genotype.extras["dp"])
                            if genotype.num_alt == 1:
                                assert genotype.ab is None or genotype.ab >= AB_threshold/100., "%s %s - AB is %s " % (variant.chr, variant.pos, genotype.ab)
                        except AssertionError:
                            traceback.print_exc()

                        genotype_str = "/".join(genotype.alleles) if genotype.alleles else "./."

                        row.extend([
                            individual.indiv_id,
                            genotype_str,
                            genotype.num_alt,
                            genotype.ab if genotype.ab is not None else '',
                            genotype.extras["ad"],
                            genotype.extras["dp"],
                            genotype.gq,
                            genotype.extras["pl"],
                        ])

                    writer.writerow(row)
                    family_variants_f.flush()
    finally:
        # Close the file even when something above raises.
        family_variants_f.close()

    print("Done with " + filename)
Example #11
0
    def handle(self, *args, **options):
        if len(args) != 2:
            sys.exit("ERROR: please specify the project_id and file of individual ids as command line args.")

        project_id = args[0]
        individuals_file = args[1]

        # init objects
        project = Project.objects.get(project_id=project_id)
        all_individual_ids_in_project = set([i.indiv_id for i in project.get_individuals()])

        individuals_of_interest = []
        invalid_individual_ids = []
        with open(individuals_file) as f:
            for line in f:
                line = line.strip('\n')
                if not line or line.startswith("#"):
                    continue
                individual_id = line.split("\t")[0]
                if individual_id in all_individual_ids_in_project:
                    individuals_of_interest.append(individual_id)
                else:
                    invalid_individual_ids.append(individual_id)

        print("Processing %s: %d individuals " % (project_id, len(individuals_of_interest)))
        if invalid_individual_ids:
            num_invalid = len(invalid_individual_ids)
            total_ids = len(all_individual_ids_in_project)
            sys.exit(("ERROR: %(individuals_file)s: %(num_invalid)s out of %(total_ids)s ids are invalid. \nThe invalid ids are: "
                      "%(invalid_individual_ids)s.\nValid ids are: %(individuals_of_interest)s") % locals())

        # filter
        variant_filter = get_default_variant_filter('moderate_impact')
        variant_filter.ref_freqs.append(('1kg_wgs_phase3', g1k_freq_threshold))
        variant_filter.ref_freqs.append(('1kg_wgs_phase3_popmax', g1k_popmax_freq_threshold))
        variant_filter.ref_freqs.append(('exac_v3', exac_freq_threshold))
        variant_filter.ref_freqs.append(('exac_v3_popmax', exac_popmax_threshold))
        variant_filter.ref_freqs.append(('merck-wgs-3793', merck_wgs_3793_threshold))
        quality_filter = {
            'vcf_filter': 'pass',
            'min_gq': GQ_threshold,
            'min_ab': AB_threshold,
        }

        # create individuals_variants.tsv
        individual_variants_f = gzip.open('individuals_in_%s.tsv.gz' % project_id, 'w')
        writer = csv.writer(individual_variants_f, dialect='excel', delimiter='\t')

        header_fields = [
            'project_id',
            'family_id',
            'individual_id',
            'gene',
            'chrom',
            'pos',
            'ref',
            'alt',
            'rsid',
            'annotation',
            '1kg_af',
            '1kg_popmax_af',
            'exac_af',
            'exac_popmax_af',
            'merck_wgs_3793_af',
            'genotype_str',
            'genotype_num_alt',
            'genotype_allele_balance',
            'genotype_AD',
            'genotype_DP',
            'genotype_GQ',
            'genotype_PL',
            'genotype_filter', 
            ]

        writer.writerow(header_fields)
        # collect the resources that we'll need here
        annotator = get_annotator()
        custom_population_store = get_custom_population_store()

        individual_counter = 0
        for i, family in enumerate(project.get_families()):
            for individual in family.get_individuals():
                if individual.indiv_id not in individuals_of_interest:
                    continue
                individual_counter += 1
                print("%s: %s, individual %s" % (individual_counter, family.family_id, individual.indiv_id))
                for variant in get_variants(get_datastore(project.project_id),
                                            family.xfamily(),
                                            variant_filter = variant_filter,
                                            quality_filter = quality_filter,
                                            indivs_to_consider = [individual.indiv_id]
                                            ):
                    genotype = variant.get_genotype(individual.indiv_id)
                    if len(genotype.alleles) == 0 or genotype.extras["dp"] < DP_threshold or genotype.num_alt == 0:
                        continue

                    custom_populations = custom_population_store.get_frequencies(variant.xpos, variant.ref, variant.alt)

                    genotype_str = "/".join(genotype.alleles) if genotype.alleles else "./."

                    g1k_freq = variant.annotation['freqs']['1kg_wgs_phase3']
                    g1k_popmax_freq = variant.annotation['freqs']['1kg_wgs_phase3_popmax']
                    exac_freq = variant.annotation['freqs']['exac_v3']
                    exac_popmax_freq = variant.annotation['freqs']['exac_v3_popmax']
                    merck_wgs_3793_freq = custom_populations.get('merck-wgs-3793', 0.0)

                    assert g1k_freq <= g1k_freq_threshold, "g1k freq %s > %s" % (g1k_freq, g1k_freq_threshold)
                    assert g1k_popmax_freq <= g1k_popmax_freq_threshold, "g1k popmax freq %s > %s" % (g1k_popmax_freq, g1k_popmax_freq_threshold)
                    assert exac_freq <= exac_freq_threshold, "Exac freq %s > %s" % (exac_freq, exac_freq_threshold)
                    assert exac_popmax_freq <= exac_popmax_threshold, "Exac popmax freq %s > %s" % (exac_popmax_freq, exac_popmax_threshold)
                    assert merck_wgs_3793_freq <= merck_wgs_3793_threshold


                    assert genotype.gq >= GQ_threshold, "%s %s - GQ is %s " % (variant.chr, variant.pos, genotype.gq)
                    assert genotype.extras["dp"] >= DP_threshold, "%s %s - GQ is %s " % (variant.chr, variant.pos, genotype.extras["dp"])
                    if genotype.num_alt == 1:
                        assert genotype.ab >= AB_threshold/100., "%s %s - AB is %s " % (variant.chr, variant.pos, genotype.ab)
                    assert genotype.filter == "pass", "%s %s - filter is %s " % (variant.chr, variant.pos, genotype.filter)

                    writer.writerow(map(str, [
                        project_id,
                        family.family_id,
                        individual.indiv_id,
                        get_gene_symbol(variant),
                        variant.chr,
                        variant.pos,
                        variant.ref,
                        variant.alt,
                        variant.vcf_id,
                        variant.annotation['vep_group'],
                        g1k_freq,
                        g1k_popmax_freq,
                        exac_freq,
                        exac_popmax_freq,
                        merck_wgs_3793_freq,
                        genotype_str,
                        genotype.num_alt,
                        genotype.ab,
                        genotype.extras["ad"],
                        genotype.extras["dp"],
                        genotype.gq,
                        genotype.extras["pl"],
                        genotype.filter,
                    ]))
                    individual_variants_f.flush()
        individual_variants_f.close()
Example #12
0
    def handle(self, *args, **options):
        """Export filtered variants of selected individuals to a gzipped TSV.

        Positional args (exactly two are required):
            args[0]: project_id of the Project to export from.
            args[1]: path to a text file of individual ids. Only the first
                tab-separated column of each line is used; empty lines and
                lines starting with '#' are skipped.

        For every listed individual, variants are fetched with the
        'moderate_impact' default filter extended by the module-level
        frequency thresholds, plus a quality filter built from GQ_threshold
        and AB_threshold. Genotypes with no alleles, with depth below
        DP_threshold, or with no alt allele are skipped. One row per remaining
        (individual, variant) pair is written to
        'individuals_in_<project_id>.tsv.gz' in the current directory.

        Exits through sys.exit() when the argument count is wrong or when the
        ids file contains ids that are not in the project.

        Raises:
            AssertionError: if a variant returned by get_variants violates one
                of the frequency / quality thresholds it was filtered on.
        """
        if len(args) != 2:
            sys.exit(
                "ERROR: please specify the project_id and file of individual ids as command line args."
            )

        project_id, individuals_file = args

        # init objects
        project = Project.objects.get(project_id=project_id)
        all_individual_ids_in_project = set(
            i.indiv_id for i in project.get_individuals())

        # Split the ids from the file into those present in the project and
        # those that are not.
        individuals_of_interest = []
        invalid_individual_ids = []
        with open(individuals_file) as f:
            for line in f:
                line = line.strip('\n')
                if not line or line.startswith("#"):
                    continue
                individual_id = line.split("\t")[0]
                if individual_id in all_individual_ids_in_project:
                    individuals_of_interest.append(individual_id)
                else:
                    invalid_individual_ids.append(individual_id)

        print("Processing %s: %d individuals " %
              (project_id, len(individuals_of_interest)))
        if invalid_individual_ids:
            # NOTE(review): total_ids counts the individuals in the project,
            # not the ids listed in the file - confirm that this is the
            # intended denominator for the message.
            sys.exit((
                "ERROR: %(individuals_file)s: %(num_invalid)s out of %(total_ids)s ids are invalid. \nThe invalid ids are: "
                "%(invalid_individual_ids)s.\nValid ids are: %(individuals_of_interest)s"
            ) % {
                'individuals_file': individuals_file,
                'num_invalid': len(invalid_individual_ids),
                'total_ids': len(all_individual_ids_in_project),
                'invalid_individual_ids': invalid_individual_ids,
                'individuals_of_interest': individuals_of_interest,
            })

        # Constant-time membership test for the per-individual check below.
        ids_of_interest = frozenset(individuals_of_interest)

        # filter
        variant_filter = get_default_variant_filter('moderate_impact')
        variant_filter.ref_freqs.append(('1kg_wgs_phase3', g1k_freq_threshold))
        variant_filter.ref_freqs.append(
            ('1kg_wgs_phase3_popmax', g1k_popmax_freq_threshold))
        variant_filter.ref_freqs.append(('exac_v3', exac_freq_threshold))
        variant_filter.ref_freqs.append(
            ('exac_v3_popmax', exac_popmax_threshold))
        variant_filter.ref_freqs.append(
            ('merck-wgs-3793', merck_wgs_3793_threshold))
        quality_filter = {
            'vcf_filter': 'pass',
            'min_gq': GQ_threshold,
            'min_ab': AB_threshold,
        }

        header_fields = [
            'project_id',
            'family_id',
            'individual_id',
            'gene',
            'chrom',
            'pos',
            'ref',
            'alt',
            'rsid',
            'annotation',
            '1kg_af',
            '1kg_popmax_af',
            'exac_af',
            'exac_popmax_af',
            'merck_wgs_3793_af',
            'genotype_str',
            'genotype_num_alt',
            'genotype_allele_balance',
            'genotype_AD',
            'genotype_DP',
            'genotype_GQ',
            'genotype_PL',
            'genotype_filter',
        ]

        # create individuals_variants.tsv
        # The with-block guarantees the gzip stream is finalized and closed
        # even when one of the sanity assertions below fails.
        # NOTE(review): mode 'w' is binary for gzip.open on Python 3; csv
        # would need text mode ('wt') there - confirm the target interpreter.
        with gzip.open('individuals_in_%s.tsv.gz' % project_id,
                       'w') as individual_variants_f:
            writer = csv.writer(individual_variants_f,
                                dialect='excel',
                                delimiter='\t')
            writer.writerow(header_fields)

            # collect the resources that we'll need here
            # The annotator itself is not used below; the call is kept in
            # case it has initialization side effects (TODO confirm).
            get_annotator()
            custom_population_store = get_custom_population_store()

            individual_counter = 0
            for family in project.get_families():
                for individual in family.get_individuals():
                    if individual.indiv_id not in ids_of_interest:
                        continue
                    individual_counter += 1
                    print("%s: %s, individual %s" %
                          (individual_counter, family.family_id,
                           individual.indiv_id))
                    for variant in get_variants(
                            get_datastore(project.project_id),
                            family.xfamily(),
                            variant_filter=variant_filter,
                            quality_filter=quality_filter,
                            indivs_to_consider=[individual.indiv_id]):
                        genotype = variant.get_genotype(individual.indiv_id)
                        depth = genotype.extras["dp"]
                        # Skip no-calls, low-depth calls and hom-ref calls.
                        if (not genotype.alleles or depth < DP_threshold
                                or genotype.num_alt == 0):
                            continue

                        custom_populations = custom_population_store.get_frequencies(
                            variant.xpos, variant.ref, variant.alt)

                        # alleles is non-empty here (guarded above).
                        genotype_str = "/".join(genotype.alleles)

                        freqs = variant.annotation['freqs']
                        g1k_freq = freqs['1kg_wgs_phase3']
                        g1k_popmax_freq = freqs['1kg_wgs_phase3_popmax']
                        exac_freq = freqs['exac_v3']
                        exac_popmax_freq = freqs['exac_v3_popmax']
                        merck_wgs_3793_freq = custom_populations.get(
                            'merck-wgs-3793', 0.0)

                        # Sanity checks: everything returned by get_variants
                        # should already satisfy the filters built above.
                        assert g1k_freq <= g1k_freq_threshold, "g1k freq %s > %s" % (
                            g1k_freq, g1k_freq_threshold)
                        assert g1k_popmax_freq <= g1k_popmax_freq_threshold, "g1k popmax freq %s > %s" % (
                            g1k_popmax_freq, g1k_popmax_freq_threshold)
                        assert exac_freq <= exac_freq_threshold, "Exac freq %s > %s" % (
                            exac_freq, exac_freq_threshold)
                        assert exac_popmax_freq <= exac_popmax_threshold, "Exac popmax freq %s > %s" % (
                            exac_popmax_freq, exac_popmax_threshold)
                        assert merck_wgs_3793_freq <= merck_wgs_3793_threshold, "merck-wgs-3793 freq %s > %s" % (
                            merck_wgs_3793_freq, merck_wgs_3793_threshold)

                        assert genotype.gq >= GQ_threshold, "%s %s - GQ is %s " % (
                            variant.chr, variant.pos, genotype.gq)
                        assert depth >= DP_threshold, "%s %s - DP is %s " % (
                            variant.chr, variant.pos, depth)
                        if genotype.num_alt == 1:
                            # AB_threshold is divided by 100 before being
                            # compared with genotype.ab.
                            assert genotype.ab >= AB_threshold / 100., "%s %s - AB is %s " % (
                                variant.chr, variant.pos, genotype.ab)
                        assert genotype.filter == "pass", "%s %s - filter is %s " % (
                            variant.chr, variant.pos, genotype.filter)

                        # Column order must match header_fields.
                        row = [
                            project_id,
                            family.family_id,
                            individual.indiv_id,
                            get_gene_symbol(variant),
                            variant.chr,
                            variant.pos,
                            variant.ref,
                            variant.alt,
                            variant.vcf_id,
                            variant.annotation['vep_group'],
                            g1k_freq,
                            g1k_popmax_freq,
                            exac_freq,
                            exac_popmax_freq,
                            merck_wgs_3793_freq,
                            genotype_str,
                            genotype.num_alt,
                            genotype.ab,
                            genotype.extras["ad"],
                            depth,
                            genotype.gq,
                            genotype.extras["pl"],
                            genotype.filter,
                        ]
                        writer.writerow([str(value) for value in row])

                    # Flush once per individual rather than once per row:
                    # progress still reaches disk regularly without forcing
                    # a compressor sync for every row.
                    individual_variants_f.flush()