Example #1
0
def get_variants_for_inheritance_for_project(project, inheritance_mode):
    """
    Get the variants for this project / inheritance combo
    Return dict of family -> list of variants
    """

    # create search specification
    # this could theoretically differ by project, if there are different reference populations
    #variant_filter = VariantFilter(so_annotations=SO_SEVERITY_ORDER, ref_freqs=[])
    variant_filter = get_default_variant_filter('moderate_impact')
    variant_filter.ref_freqs.append(('1kg_wgs_phase3', g1k_freq_threshold))
    variant_filter.ref_freqs.append(
        ('1kg_wgs_phase3_popmax', g1k_popmax_freq_threshold))
    variant_filter.ref_freqs.append(('exac_v3', exac_freq_threshold))
    variant_filter.ref_freqs.append(('exac_v3_popmax', exac_popmax_threshold))
    variant_filter.ref_freqs.append(
        ('merck-wgs-3793', merck_wgs_3793_threshold))
    #variant_filter.ref_freqs.append(('merck-pcr-free-wgs-144', merck_wgs_144_threshold))
    quality_filter = {
        #        'vcf_filter': 'pass',
        'min_gq': GQ_threshold,
        'min_ab': AB_threshold,
    }

    # run MendelianVariantSearch for each family, collect results

    families = project.get_families()

    for i, family in enumerate(families):
        print("Processing %s - family %s  (%d / %d)" %
              (inheritance_mode, family.family_id, i + 1, len(families)))
        try:
            if inheritance_mode == "all_variants":
                yield family, list(
                    get_variants(get_datastore(project.project_id),
                                 family.xfamily(),
                                 variant_filter=variant_filter,
                                 quality_filter=quality_filter,
                                 indivs_to_consider=family.indiv_id_list()))
            else:
                yield family, list(
                    get_variants_with_inheritance_mode(
                        get_mall(project.project_id),
                        family.xfamily(),
                        inheritance_mode,
                        variant_filter=variant_filter,
                        quality_filter=quality_filter,
                    ))
        except ValueError as e:
            print("Error: %s. Skipping family %s" % (str(e), str(family)))
def get_variants_for_inheritance_for_project(project, inheritance_mode):
    """
    Get the variants for this project / inheritance combo
    Return dict of family -> list of variants
    """

    # create search specification
    # this could theoretically differ by project, if there are different reference populations
    #variant_filter = VariantFilter(so_annotations=SO_SEVERITY_ORDER, ref_freqs=[])
    variant_filter = get_default_variant_filter('moderate_impact')
    variant_filter.ref_freqs.append(('1kg_wgs_phase3', g1k_freq_threshold))
    variant_filter.ref_freqs.append(('1kg_wgs_phase3_popmax', g1k_popmax_freq_threshold))
    variant_filter.ref_freqs.append(('exac_v3', exac_freq_threshold))
    variant_filter.ref_freqs.append(('exac_v3_popmax', exac_popmax_threshold))
    variant_filter.ref_freqs.append(('merck-wgs-3793', merck_wgs_3793_threshold))
    #variant_filter.ref_freqs.append(('merck-pcr-free-wgs-144', merck_wgs_144_threshold))
    quality_filter = {
#        'vcf_filter': 'pass',
        'min_gq': GQ_threshold,
        'min_ab': AB_threshold,
    }

    # run MendelianVariantSearch for each family, collect results

    families = project.get_families()

    for i, family in enumerate(families):
        print("Processing %s - family %s  (%d / %d)" % (inheritance_mode, family.family_id, i+1, len(families)))
        try:
            if inheritance_mode == "all_variants":
                yield family, list(get_variants(
                        get_datastore(project.project_id),
                        family.xfamily(),
                        variant_filter=variant_filter,
                        quality_filter=quality_filter,
                        indivs_to_consider=family.indiv_id_list()
                        ))
            else:
                yield family, list(get_variants_with_inheritance_mode(
                        get_mall(project.project_id),
                        family.xfamily(),
                        inheritance_mode,
                        variant_filter=variant_filter,
                        quality_filter=quality_filter,
                        ))
        except ValueError as e:
            print("Error: %s. Skipping family %s" % (str(e), str(family)))
    def handle(self, *args, **options):
        if len(args) != 2:
            sys.exit("ERROR: please specify the project_id and file of individual ids as command line args.")

        project_id = args[0]
        individuals_file = args[1]

        # init objects
        project = Project.objects.get(project_id=project_id)
        all_individual_ids_in_project = set([i.indiv_id for i in project.get_individuals()])

        individuals_of_interest = []
        invalid_individual_ids = []
        with open(individuals_file) as f:
            for line in f:
                line = line.strip('\n')
                if not line or line.startswith("#"):
                    continue
                individual_id = line.split("\t")[0]
                if individual_id in all_individual_ids_in_project:
                    individuals_of_interest.append(individual_id)
                else:
                    invalid_individual_ids.append(individual_id)

        print("Processing %s: %d individuals " % (project_id, len(individuals_of_interest)))
        if invalid_individual_ids:
            num_invalid = len(invalid_individual_ids)
            total_ids = len(all_individual_ids_in_project)
            sys.exit(("ERROR: %(individuals_file)s: %(num_invalid)s out of %(total_ids)s ids are invalid. \nThe invalid ids are: "
                      "%(invalid_individual_ids)s.\nValid ids are: %(individuals_of_interest)s") % locals())

        # filter
        variant_filter = get_default_variant_filter('moderate_impact')
        variant_filter.ref_freqs.append(('1kg_wgs_phase3', g1k_freq_threshold))
        variant_filter.ref_freqs.append(('1kg_wgs_phase3_popmax', g1k_popmax_freq_threshold))
        variant_filter.ref_freqs.append(('exac_v3', exac_freq_threshold))
        variant_filter.ref_freqs.append(('exac_v3_popmax', exac_popmax_threshold))
        variant_filter.ref_freqs.append(('merck-wgs-3793', merck_wgs_3793_threshold))
        quality_filter = {
            'vcf_filter': 'pass',
            'min_gq': GQ_threshold,
            'min_ab': AB_threshold,
        }

        # create individuals_variants.tsv
        individual_variants_f = gzip.open('individuals_in_%s.tsv.gz' % project_id, 'w')
        writer = csv.writer(individual_variants_f, dialect='excel', delimiter='\t')

        header_fields = [
            'project_id',
            'family_id',
            'individual_id',
            'gene',
            'chrom',
            'pos',
            'ref',
            'alt',
            'rsid',
            'annotation',
            '1kg_af',
            '1kg_popmax_af',
            'exac_af',
            'exac_popmax_af',
            'merck_wgs_3793_af',
            'genotype_str',
            'genotype_num_alt',
            'genotype_allele_balance',
            'genotype_AD',
            'genotype_DP',
            'genotype_GQ',
            'genotype_PL',
            'genotype_filter', 
            ]

        writer.writerow(header_fields)
        # collect the resources that we'll need here
        annotator = get_annotator()
        custom_population_store = get_custom_population_store()

        individual_counter = 0
        for i, family in enumerate(project.get_families()):
            for individual in family.get_individuals():
                if individual.indiv_id not in individuals_of_interest:
                    continue
                individual_counter += 1
                print("%s: %s, individual %s" % (individual_counter, family.family_id, individual.indiv_id))
                for variant in get_variants(get_datastore(project.project_id),
                                            family.xfamily(),
                                            variant_filter = variant_filter,
                                            quality_filter = quality_filter,
                                            indivs_to_consider = [individual.indiv_id]
                                            ):
                    genotype = variant.get_genotype(individual.indiv_id)
                    if len(genotype.alleles) == 0 or genotype.extras["dp"] < DP_threshold or genotype.num_alt == 0:
                        continue

                    custom_populations = custom_population_store.get_frequencies(variant.xpos, variant.ref, variant.alt)

                    genotype_str = "/".join(genotype.alleles) if genotype.alleles else "./."

                    g1k_freq = variant.annotation['freqs']['1kg_wgs_phase3']
                    g1k_popmax_freq = variant.annotation['freqs']['1kg_wgs_phase3_popmax']
                    exac_freq = variant.annotation['freqs']['exac_v3']
                    exac_popmax_freq = variant.annotation['freqs']['exac_v3_popmax']
                    merck_wgs_3793_freq = custom_populations.get('merck-wgs-3793', 0.0)

                    assert g1k_freq <= g1k_freq_threshold, "g1k freq %s > %s" % (g1k_freq, g1k_freq_threshold)
                    assert g1k_popmax_freq <= g1k_popmax_freq_threshold, "g1k popmax freq %s > %s" % (g1k_popmax_freq, g1k_popmax_freq_threshold)
                    assert exac_freq <= exac_freq_threshold, "Exac freq %s > %s" % (exac_freq, exac_freq_threshold)
                    assert exac_popmax_freq <= exac_popmax_threshold, "Exac popmax freq %s > %s" % (exac_popmax_freq, exac_popmax_threshold)
                    assert merck_wgs_3793_freq <= merck_wgs_3793_threshold


                    assert genotype.gq >= GQ_threshold, "%s %s - GQ is %s " % (variant.chr, variant.pos, genotype.gq)
                    assert genotype.extras["dp"] >= DP_threshold, "%s %s - GQ is %s " % (variant.chr, variant.pos, genotype.extras["dp"])
                    if genotype.num_alt == 1:
                        assert genotype.ab >= AB_threshold/100., "%s %s - AB is %s " % (variant.chr, variant.pos, genotype.ab)
                    assert genotype.filter == "pass", "%s %s - filter is %s " % (variant.chr, variant.pos, genotype.filter)

                    writer.writerow(map(str, [
                        project_id,
                        family.family_id,
                        individual.indiv_id,
                        get_gene_symbol(variant),
                        variant.chr,
                        variant.pos,
                        variant.ref,
                        variant.alt,
                        variant.vcf_id,
                        variant.annotation['vep_group'],
                        g1k_freq,
                        g1k_popmax_freq,
                        exac_freq,
                        exac_popmax_freq,
                        merck_wgs_3793_freq,
                        genotype_str,
                        genotype.num_alt,
                        genotype.ab,
                        genotype.extras["ad"],
                        genotype.extras["dp"],
                        genotype.gq,
                        genotype.extras["pl"],
                        genotype.filter,
                    ]))
                    individual_variants_f.flush()
        individual_variants_f.close()
Example #4
0
    def handle(self, *args, **options):
        if len(args) != 2:
            sys.exit(
                "ERROR: please specify the project_id and file of individual ids as command line args."
            )

        project_id = args[0]
        individuals_file = args[1]

        # init objects
        project = Project.objects.get(project_id=project_id)
        all_individual_ids_in_project = set(
            [i.indiv_id for i in project.get_individuals()])

        individuals_of_interest = []
        invalid_individual_ids = []
        with open(individuals_file) as f:
            for line in f:
                line = line.strip('\n')
                if not line or line.startswith("#"):
                    continue
                individual_id = line.split("\t")[0]
                if individual_id in all_individual_ids_in_project:
                    individuals_of_interest.append(individual_id)
                else:
                    invalid_individual_ids.append(individual_id)

        print("Processing %s: %d individuals " %
              (project_id, len(individuals_of_interest)))
        if invalid_individual_ids:
            num_invalid = len(invalid_individual_ids)
            total_ids = len(all_individual_ids_in_project)
            sys.exit((
                "ERROR: %(individuals_file)s: %(num_invalid)s out of %(total_ids)s ids are invalid. \nThe invalid ids are: "
                "%(invalid_individual_ids)s.\nValid ids are: %(individuals_of_interest)s"
            ) % locals())

        # filter
        variant_filter = get_default_variant_filter('moderate_impact')
        variant_filter.ref_freqs.append(('1kg_wgs_phase3', g1k_freq_threshold))
        variant_filter.ref_freqs.append(
            ('1kg_wgs_phase3_popmax', g1k_popmax_freq_threshold))
        variant_filter.ref_freqs.append(('exac_v3', exac_freq_threshold))
        variant_filter.ref_freqs.append(
            ('exac_v3_popmax', exac_popmax_threshold))
        variant_filter.ref_freqs.append(
            ('merck-wgs-3793', merck_wgs_3793_threshold))
        quality_filter = {
            'vcf_filter': 'pass',
            'min_gq': GQ_threshold,
            'min_ab': AB_threshold,
        }

        # create individuals_variants.tsv
        individual_variants_f = gzip.open(
            'individuals_in_%s.tsv.gz' % project_id, 'w')
        writer = csv.writer(individual_variants_f,
                            dialect='excel',
                            delimiter='\t')

        header_fields = [
            'project_id',
            'family_id',
            'individual_id',
            'gene',
            'chrom',
            'pos',
            'ref',
            'alt',
            'rsid',
            'annotation',
            '1kg_af',
            '1kg_popmax_af',
            'exac_af',
            'exac_popmax_af',
            'merck_wgs_3793_af',
            'genotype_str',
            'genotype_num_alt',
            'genotype_allele_balance',
            'genotype_AD',
            'genotype_DP',
            'genotype_GQ',
            'genotype_PL',
            'genotype_filter',
        ]

        writer.writerow(header_fields)
        # collect the resources that we'll need here
        annotator = get_annotator()
        custom_population_store = get_custom_population_store()

        individual_counter = 0
        for i, family in enumerate(project.get_families()):
            for individual in family.get_individuals():
                if individual.indiv_id not in individuals_of_interest:
                    continue
                individual_counter += 1
                print("%s: %s, individual %s" %
                      (individual_counter, family.family_id,
                       individual.indiv_id))
                for variant in get_variants(
                        get_datastore(project.project_id),
                        family.xfamily(),
                        variant_filter=variant_filter,
                        quality_filter=quality_filter,
                        indivs_to_consider=[individual.indiv_id]):
                    genotype = variant.get_genotype(individual.indiv_id)
                    if len(genotype.alleles) == 0 or genotype.extras[
                            "dp"] < DP_threshold or genotype.num_alt == 0:
                        continue

                    custom_populations = custom_population_store.get_frequencies(
                        variant.xpos, variant.ref, variant.alt)

                    genotype_str = "/".join(
                        genotype.alleles) if genotype.alleles else "./."

                    g1k_freq = variant.annotation['freqs']['1kg_wgs_phase3']
                    g1k_popmax_freq = variant.annotation['freqs'][
                        '1kg_wgs_phase3_popmax']
                    exac_freq = variant.annotation['freqs']['exac_v3']
                    exac_popmax_freq = variant.annotation['freqs'][
                        'exac_v3_popmax']
                    merck_wgs_3793_freq = custom_populations.get(
                        'merck-wgs-3793', 0.0)

                    assert g1k_freq <= g1k_freq_threshold, "g1k freq %s > %s" % (
                        g1k_freq, g1k_freq_threshold)
                    assert g1k_popmax_freq <= g1k_popmax_freq_threshold, "g1k popmax freq %s > %s" % (
                        g1k_popmax_freq, g1k_popmax_freq_threshold)
                    assert exac_freq <= exac_freq_threshold, "Exac freq %s > %s" % (
                        exac_freq, exac_freq_threshold)
                    assert exac_popmax_freq <= exac_popmax_threshold, "Exac popmax freq %s > %s" % (
                        exac_popmax_freq, exac_popmax_threshold)
                    assert merck_wgs_3793_freq <= merck_wgs_3793_threshold

                    assert genotype.gq >= GQ_threshold, "%s %s - GQ is %s " % (
                        variant.chr, variant.pos, genotype.gq)
                    assert genotype.extras[
                        "dp"] >= DP_threshold, "%s %s - GQ is %s " % (
                            variant.chr, variant.pos, genotype.extras["dp"])
                    if genotype.num_alt == 1:
                        assert genotype.ab >= AB_threshold / 100., "%s %s - AB is %s " % (
                            variant.chr, variant.pos, genotype.ab)
                    assert genotype.filter == "pass", "%s %s - filter is %s " % (
                        variant.chr, variant.pos, genotype.filter)

                    writer.writerow(
                        map(str, [
                            project_id,
                            family.family_id,
                            individual.indiv_id,
                            get_gene_symbol(variant),
                            variant.chr,
                            variant.pos,
                            variant.ref,
                            variant.alt,
                            variant.vcf_id,
                            variant.annotation['vep_group'],
                            g1k_freq,
                            g1k_popmax_freq,
                            exac_freq,
                            exac_popmax_freq,
                            merck_wgs_3793_freq,
                            genotype_str,
                            genotype.num_alt,
                            genotype.ab,
                            genotype.extras["ad"],
                            genotype.extras["dp"],
                            genotype.gq,
                            genotype.extras["pl"],
                            genotype.filter,
                        ]))
                    individual_variants_f.flush()
        individual_variants_f.close()