Esempio n. 1
0
def sv(args):
    """Write structural variants seen in a single sample, tagged with its metadata.

    Reads the VCF at ``args.i`` and the sample map at ``args.m``. A record is
    written (to ``args.o``, or ``/dev/stdout`` when ``args.o`` is empty) for a
    sample whose first ``SU`` FORMAT value is at least 5 and whose first ``AB``
    FORMAT value is at least 0.15, provided no other sample has a first ``AB``
    value above 0. The record then carries the INFO tags UNIQ, UAB, TISSUE,
    CASE, EXPT and ANIMAL taken from that sample's entry in the sample map.
    """
    # gts012 sets the uncalled GTs to 3
    vcf_in = cyvcf2.VCF(args.i, gts012=True)

    # sample names, in the order used to index the FORMAT arrays below
    sample_names = vcf_in.samples

    # per-sample metadata; the animal map is returned too but not used here
    sample_map, animal_map = load_sample_map(args.m)

    # declare the INFO tags this function writes
    new_tags = (
        ("UNIQ", "String", "Sample(s) with unique somatic variant"),
        ("UAB", "Float", "Allele Balance in UNIQ sample"),
        ("TISSUE", "String", "Source Tissue Type"),
        ("CASE", "String", "Control or SCNT?"),
        ("EXPT", "String", "Experiment"),
        ("ANIMAL", "String", "ID of origin animal"),
    )
    for tag_id, tag_type, tag_desc in new_tags:
        vcf_in.update(tag_id, tag_type, 1, tag_desc)

    destination = args.o if args.o else "/dev/stdout"
    vcf_out = cyvcf2.Writer(destination, vcf_in)

    min_support = 5
    n_samples = len(sample_names)
    for record in vcf_in:
        support = record.format('SU')
        balance = record.format('AB')

        for idx, name in enumerate(sample_names):
            # candidate sample: enough support and a high enough allele balance
            if not (support[idx][0] >= min_support and balance[idx][0] >= 0.15):
                continue

            # any allele balance above zero elsewhere disqualifies the candidate
            seen_elsewhere = any(
                balance[other][0] > 0.0
                for other in range(n_samples)
                if other != idx
            )
            if seen_elsewhere:
                continue

            # annotate with the metadata of the one supporting sample
            meta = sample_map[name]
            record.INFO['TISSUE'] = meta['Source']
            record.INFO['CASE'] = meta['Case']
            record.INFO['EXPT'] = meta['Experiment']
            record.INFO['UNIQ'] = name
            record.INFO['UAB'] = str(numpy.around(balance[idx][0], 3))
            record.INFO['ANIMAL'] = meta['Animal']

            vcf_out.write_record(record)
    vcf_out.close()
Esempio n. 2
0
def filter_to_pass_and_reject(in_file, paired, out_dir=None):
    """Filter VCF to only those with a strict PASS/REJECT: somatic + germline.

    Removes low quality calls filtered but also labeled with REJECT.

    The result is written next to ``in_file`` (or into ``out_dir`` when given)
    as ``<base>-prfilter.vcf.gz`` and that path is returned. Nothing is
    regenerated when ``utils.file_uptodate`` reports the output as current.

    Per record:
    * records carrying any filter other than PASS, ``.`` or REJECT are dropped
      unless ``bubbletree.is_info_germline`` accepts them;
    * records labelled REJECT, or accepted as germline, are written only when
      ``bubbletree._is_possible_loh`` returns a truthy value, in which case
      FILTER is reset to PASS and the ``DB`` INFO flag is set;
    * every other remaining record is written unchanged.
    """
    from bcbio.heterogeneity import bubbletree

    out_file = "{}-prfilter.vcf.gz".format(utils.splitext_plus(in_file)[0])
    if out_dir:
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    if utils.file_uptodate(out_file, in_file):
        return out_file

    with file_transaction(paired.tumor_data, out_file) as tx_out_file:
        max_depth = bubbletree.max_normal_germline_depth(in_file, bubbletree.PARAMS, paired)
        # records are written uncompressed first, then bgzipped and indexed below
        tx_out_plain = tx_out_file.replace(".vcf.gz", ".vcf")
        with contextlib.closing(cyvcf2.VCF(in_file)) as reader:
            reader = _add_db_to_header(reader)
            with contextlib.closing(cyvcf2.Writer(tx_out_plain, reader)) as writer:
                for rec in reader:
                    labels = rec.FILTER.split(";") if rec.FILTER else []
                    hard_filters = [lab for lab in labels if lab not in ("PASS", ".", "REJECT")]
                    # Low-quality call that is not germline either: skip it
                    if hard_filters and not bubbletree.is_info_germline(rec):
                        continue
                    if "REJECT" in labels or bubbletree.is_info_germline(rec):
                        # Germline, check if we should include based on frequencies
                        stats = bubbletree._is_possible_loh(rec, reader, bubbletree.PARAMS, paired,
                                                            use_status=True, max_normal_depth=max_depth)
                        if stats:
                            rec.FILTER = "PASS"
                            rec.INFO["DB"] = True
                            writer.write_record(rec)
                    else:
                        # Somatic, always include
                        writer.write_record(rec)
        vcfutils.bgzip_and_index(tx_out_plain, paired.tumor_data["config"])
    return out_file
Esempio n. 3
0
def unphase(inVcf, outVcf):
    """Randomly swap the two alleles of each genotype and write the result.

    The input is first loaded with scikit-allel only to count its records and
    to check that every record has exactly one ALT allele. It is then streamed
    with cyvcf2; for every record each individual's two alleles are swapped
    with probability 1/2 (one pre-drawn random bit per individual and record)
    and the record is written to ``outVcf``. The phased flag stored in each
    genotype is left as it was.

    Args:
        inVcf: path of the VCF to read.
        outVcf: path of the VCF to write.

    Raises:
        AssertionError: if any record has a second or third ALT allele, or an
            empty first ALT allele.
    """
    # read the vcf with scikit-allel, just to get number of snps
    print("[GET_NR_SNPS]")
    print(f"Reading: {inVcf}")
    startTime = time.perf_counter()
    callset = allel.read_vcf(inVcf)
    print(f"Took {(time.perf_counter() - startTime):.2f} seconds.")

    # no tri-allelic?
    assert (sum(callset["variants/ALT"][:, 2] != '') == 0)
    assert (sum(callset["variants/ALT"][:, 1] != '') == 0)
    assert (sum(callset["variants/ALT"][:, 0] == '') == 0)

    snpsInFile = callset["calldata/GT"].shape[0]
    print(snpsInFile)

    print("[DONE]")

    print("[UNPHASE]")
    print(f"File to unphase: {inVcf}")
    print(f"Unphased output written to: {outVcf}")

    # go through the vcf
    vcfIFS = cyvcf2.VCF(inVcf)
    try:
        # get some randomness: one bit per (individual, record)
        numIndividuals = len(vcfIFS.samples)
        randomness = numpy.random.randint(2, size=(numIndividuals, snpsInFile))

        # create a new vcf Writer using the input vcf as a template.
        vcfOFS = cyvcf2.Writer(outVcf, vcfIFS)
        try:
            count = 0
            allIdxs = numpy.arange(numIndividuals)

            for v in vcfIFS:
                # Fetch the genotype list ONCE. Every access to `v.genotypes`
                # builds a fresh list, so swapping through repeated
                # `v.genotypes[idx]` lookups only changes temporaries and the
                # record would be written unmodified.
                genotypes = v.genotypes

                # what are the indices to be flipped?
                toFlip = allIdxs[randomness[:, count] == 1]
                for idx in toFlip:
                    # flip it
                    gt = genotypes[idx]
                    gt[0], gt[1] = gt[1], gt[0]

                # assign the edited list back so the record is updated
                v.genotypes = genotypes
                # and write it
                vcfOFS.write_record(v)

                # increase count
                count += 1
                if (count % 100000 == 0):
                    print(count)
        finally:
            vcfOFS.close()
    finally:
        vcfIFS.close()

    print("[DONE]")
Esempio n. 4
0
def main():
    """Annotate every variant of a VCF with panel-of-normals statistics.

    The VCF path, normal BAMs, output directory and reference come from
    ``argument_parser()``. For each record, ``calculate_vaf`` is called on the
    single-base region ``CHROM:POS-POS`` and its three results are stored in
    the INFO tags PON_VAF, PON_DEPTH and PON_VC. The annotated records are
    written to ``<output_dir>/<input name with .vcf replaced by .pon.vcf>``.
    """
    vcf, normal_bams, output_dir, reference = argument_parser()

    vcf_handle = cyvcf2.VCF(vcf)
    try:
        pon_tags = (
            ('PON_VAF', 'VAF in Panel of Normals'),
            ('PON_DEPTH', 'Total depth in Panel of Normals'),
            ('PON_VC', 'Total variant read count in Panel of Normals'),
        )
        for tag_id, description in pon_tags:
            vcf_handle.add_info_to_header({
                'ID': tag_id,
                'Description': description,
                'Type': 'Float',
                'Number': '1'
            })

        # Escape the dot so only a literal ".vcf" suffix is rewritten.
        output_vcf = os.path.join(
            output_dir, re.sub(r'\.vcf$', '.pon.vcf', os.path.basename(vcf)))

        output_handle = cyvcf2.Writer(output_vcf, vcf_handle)
        try:
            for variant in vcf_handle:
                variant_position = f'{variant.CHROM}:{variant.POS}-{variant.POS}'
                pon_vafs, total_depths, mismatches = calculate_vaf(
                    normal_bams, variant_position, reference)
                # NOTE(review): values are stored as strings although the tags
                # are declared Float; kept as in the original because the
                # return types of calculate_vaf are not visible here.
                variant.INFO['PON_VAF'] = str(pon_vafs)
                variant.INFO['PON_DEPTH'] = str(total_depths)
                variant.INFO['PON_VC'] = str(mismatches)
                output_handle.write_record(variant)
        finally:
            # Close explicitly so the output is flushed even if a record fails.
            output_handle.close()
    finally:
        vcf_handle.close()
Esempio n. 5
0
def _remove_prioritization(in_file, data, out_dir=None):
    """Remove tumor-only prioritization and return non-filtered calls.

    Every record of ``in_file`` is passed through
    ``_update_prioritization_filters`` and written to
    ``<base>-germline.vcf`` (placed in ``out_dir`` when given). A ``Somatic``
    FILTER definition is added to the header first. The work is skipped when
    either the output or its ``.gz`` sibling is already up to date.

    Returns:
        The path of the (uncompressed) output file.
    """
    out_file = "%s-germline.vcf" % utils.splitext_plus(in_file)[0]
    if out_dir:
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    if not utils.file_uptodate(out_file, in_file) and not utils.file_uptodate(
            out_file + ".gz", in_file):
        with file_transaction(data, out_file) as tx_out_file:
            # Close the reader as well as the writer, even if a record fails.
            with contextlib.closing(cyvcf2.VCF(str(in_file))) as reader:
                reader.add_filter_to_header({
                    'ID': 'Somatic',
                    'Description': 'Variant called as Somatic'
                })
                with contextlib.closing(cyvcf2.Writer(tx_out_file,
                                                      reader)) as writer:
                    for rec in reader:
                        writer.write_record(_update_prioritization_filters(rec))
    return out_file
Esempio n. 6
0
def _flip_allele(allele):
    """Swap a biallelic allele index (0 <-> 1); negative values pass through.

    Negative indices are how missing alleles are reported, so they must not be
    turned into a real allele when the site is polarized.
    """
    return allele if allele < 0 else int(not allele)


def variants(args):
    """subroutine for variants subcommand

    Streams ``args.vcf`` to stdout, keeping only biallelic SNPs for which
    ``ancestor.mutation_type`` returns both an ancestral and a derived k-mer.
    Each kept record gets a ``mutation_type`` INFO value of the form
    ``<ancestral kmer>><derived kmer>``. When the ALT allele is the ancestral
    one, AC/AF and the genotypes are flipped so that REF is ancestral, and
    REF/ALT are then rewritten from the k-mers at ``ancestor.target``.

    Raises:
        ValueError: if the ancestral allele matches neither REF nor ALT, or if
            a record to be polarized has a ploidy other than 1 or 2.
    """
    ancestor = setup_ancestor(args)

    vcf = cyvcf2.VCF(args.vcf)
    vcf.add_info_to_header({
        'ID': 'mutation_type',
        'Description': f'ancestral {args.k}-mer mutation '
        'type',
        'Type': 'Character',
        'Number': '1'
    })
    # Required to exit on a closed pipe, e.g. from head. Installed once, before
    # any output, instead of being re-installed after every record.
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)
    vcf_writer = cyvcf2.Writer('-', vcf)
    try:
        vcf_writer.write_header()
        for variant in vcf:
            # biallelic snps only
            if not (variant.is_snp and len(variant.ALT) == 1):
                continue
            # mutation type as ancestral kmer and derived kmer
            anc_kmer, der_kmer = ancestor.mutation_type(variant.CHROM,
                                                        variant.start,
                                                        variant.REF,
                                                        variant.ALT[0])
            if anc_kmer is None or der_kmer is None:
                continue
            mutation_type = f'{anc_kmer}>{der_kmer}'
            variant.INFO['mutation_type'] = mutation_type
            # ancestral allele
            AA = ancestor[variant.CHROM][variant.start].seq
            # polarize genotypes (and associated INFO) if alternative allele
            # is ancestral
            if variant.ALT[0] == AA:
                variant.INFO['AC'] = variant.INFO['AN'] - variant.INFO['AC']
                variant.INFO['AF'] = variant.INFO['AC'] / variant.INFO['AN']
                # cyvcf2 docs say we need to reassign genotypes like this for
                # the change to propagate (can't just update indexwise)
                if variant.ploidy == 2:
                    # diploid: [allele, allele, phased]
                    variant.genotypes = [[_flip_allele(gt[0]),
                                          _flip_allele(gt[1]), gt[2]]
                                         for gt in variant.genotypes]
                elif variant.ploidy == 1:
                    # haploid: [allele, phased]
                    variant.genotypes = [[_flip_allele(gt[0]), gt[1]]
                                         for gt in variant.genotypes]
                else:
                    raise ValueError(f"invalid ploidy {variant.ploidy}")

            elif variant.REF != AA:
                raise ValueError(f'ancestral allele {AA} is not equal to '
                                 f'reference {variant.REF} or alternative '
                                 f'{variant.ALT[0]}')
            # set REF to ancestral allele and ALT to derived allele
            variant.REF = anc_kmer[ancestor.target]
            variant.ALT = der_kmer[ancestor.target]
            vcf_writer.write_record(variant)
    finally:
        # Flush and release both handles explicitly.
        vcf_writer.close()
        vcf.close()
Esempio n. 7
0
def main():
    """Keep only the variants whose PON_VAF INFO value is below a threshold.

    The input path, threshold and output directory come from
    ``argument_parser()``. Matching records are written to
    ``<output_dir>/<input name with .vcf replaced by .filtered.vcf>``.

    NOTE(review): a record without a PON_VAF INFO entry presumably raises
    KeyError from the INFO lookup, as in the original - confirm that every
    input has been annotated first.
    """
    # `input_vcf` rather than `input`, which would shadow the builtin.
    input_vcf, vaf_threshold, output_dir = argument_parser()
    # Escape the dot so only a literal ".vcf" suffix is rewritten.
    outputfile = os.path.join(
        output_dir,
        re.sub(r'\.vcf$', '.filtered.vcf', os.path.basename(input_vcf)))

    vcf_handle = cyvcf2.VCF(input_vcf)
    try:
        # Kept from the original: prints the reader object to stdout.
        print(vcf_handle)
        writer = cyvcf2.Writer(outputfile, vcf_handle)
        try:
            # Iterate the reader that was already opened instead of opening
            # the same file a second time and never closing that handle.
            for variant in vcf_handle:
                if variant.INFO['PON_VAF'] < vaf_threshold:
                    writer.write_record(variant)
        finally:
            writer.close()
    finally:
        vcf_handle.close()
Esempio n. 8
0
def main():
    """Write the variants whose trinucleotide is TTT, TTA or CTT with alt base G.

    The input VCF, reference and output directory come from
    ``argument_parser()``. For each record ``get_trinucleotide`` is called with
    the record's position, REF and first ALT allele; the record is copied to
    ``<output_dir>/<input name with .vcf replaced by .sig9.vcf>`` when the
    returned trinucleotide is one of ``TTT``, ``TTA``, ``CTT`` and the
    returned alt base is ``G``.
    """
    input_vcf, reference, output_dir = argument_parser()

    # Escape the dot so only a literal ".vcf" suffix is rewritten.
    output_vcf = os.path.join(
        output_dir, re.sub(r'\.vcf$', '.sig9.vcf', os.path.basename(input_vcf)))
    vcf_handle = cyvcf2.VCF(input_vcf)
    try:
        output_vcf_handle = cyvcf2.Writer(output_vcf, vcf_handle)
        try:
            # Iterate the reader that was already opened instead of opening
            # the same file a second time and never closing that handle.
            for variant in vcf_handle:
                var_position = Position(variant.CHROM, variant.POS, variant.POS)
                refbase, altbase, var_trinucleotide = get_trinucleotide(
                    var_position, variant.REF, variant.ALT[0], reference)

                if var_trinucleotide in ['TTT', 'TTA', 'CTT'] and altbase == 'G':
                    output_vcf_handle.write_record(variant)
        finally:
            # Close explicitly so the output is flushed even if a record fails.
            output_vcf_handle.close()
    finally:
        vcf_handle.close()
Esempio n. 9
0
def _extract_germline(in_file, data):
    """Extract germline calls non-somatic, non-filtered calls.

    Every record of ``in_file`` is passed through ``_update_germline_filters``
    and written to ``<base>-germline.vcf``. A ``Somatic`` FILTER definition is
    added to the header first. The work is skipped when either the output or
    its ``.gz`` sibling is already up to date.

    Returns:
        The path of the (uncompressed) output file.
    """
    out_file = "%s-germline.vcf" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file) and not utils.file_uptodate(
            out_file + ".gz", in_file):
        with file_transaction(data, out_file) as tx_out_file:
            # Close the reader as well as the writer, even if a record fails.
            with contextlib.closing(cyvcf2.VCF(in_file)) as reader:
                reader.add_filter_to_header({
                    'ID': 'Somatic',
                    'Description': 'Variant called as Somatic'
                })
                with contextlib.closing(cyvcf2.Writer(tx_out_file,
                                                      reader)) as writer:
                    for rec in reader:
                        writer.write_record(_update_germline_filters(rec))
    return out_file
Esempio n. 10
0
def filter_to_pass_and_reject(in_file, data, out_dir=None):
    """Filter VCF to only those with a strict PASS/REJECT: somatic + germline.

    Removes low quality calls filtered but also labeled with REJECT.

    A record is kept when every entry of its FILTER field is one of PASS,
    ``.`` or REJECT (an empty FILTER also qualifies). The result is bgzipped
    and indexed as ``<base>-prfilter.vcf.gz`` (inside ``out_dir`` when given)
    and that path is returned; nothing is regenerated when the output is
    already up to date.
    """
    out_file = "{}-prfilter.vcf.gz".format(utils.splitext_plus(in_file)[0])
    if out_dir:
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    if utils.file_uptodate(out_file, in_file):
        return out_file

    tolerated = ("PASS", ".", "REJECT")
    with file_transaction(data, out_file) as tx_out_file:
        # records are written uncompressed first, then bgzipped and indexed below
        tx_out_plain = tx_out_file.replace(".vcf.gz", ".vcf")
        with contextlib.closing(cyvcf2.VCF(in_file)) as reader, \
                contextlib.closing(cyvcf2.Writer(tx_out_plain, reader)) as writer:
            for rec in reader:
                labels = rec.FILTER.split(";") if rec.FILTER else []
                if all(label in tolerated for label in labels):
                    writer.write_record(rec)
        vcfutils.bgzip_and_index(tx_out_plain, data["config"])
    return out_file
Esempio n. 11
0
    for email in scored_variants[key]['email']:
        #latest_timestamp = datetime.datetime.min
        latest_timestamp = 0
        answer = ''
        #find latest answer for each user
        for entry in scored_variants[key]['email'][email]:
            if entry[1] > latest_timestamp:
                answer = entry[0]
                latest_timestamp = entry[1]
        scored_variants[key]['score_fields'][answer] += 1
        scored_variants[key]['score_fields']['scorer_count'] += 1

vcf = cyvcf2.VCF(os.path.expanduser(args.vcf))
vcf.add_info_to_header({"ID": "SVPD", "Description": "Details of SV-plaudit scorer count and scores in the format COUNT|SCORE1,SCORE2,SCOREN. Answers the question: `" + question + "` Available answers were as follows: `" + "`; `".join(answers) + "`", "Type":'Character', 'Number':'1'})
vcf.add_info_to_header({"ID": "SVP", "Description": "SV-plaudit curation score, the " + args.operation + " of scores for that entry where the values of the following curation answers: `" +  "`; `".join(answers) + "` are " + ",".join(args.number_map), "Type":'Float',  'Number':'1'})
writer = cyvcf2.Writer(args.annotated_outfile, vcf)

for variant in vcf:
    if variant.INFO.get('END'):
        key = variant.INFO.get('SVTYPE') + '_' + \
                variant.CHROM + '_' + \
                str(variant.POS) + '-' + \
                str(variant.INFO.get('END'))
        if key in scored_variants:
            vcf_annotation = str(scored_variants[key]['score_fields']['scorer_count']) + "|"
            for answer in answers:
                vcf_annotation += str(scored_variants[key]['score_fields'][answer]) + ","
            vcf_annotation = vcf_annotation[:-1]
            if args.operation:
                score_counts = vcf_annotation.split("|")[1].split(",")
                score_values = []
Esempio n. 12
0
    # Load Data
    if args.sample is not None:
        vcf = cyvcf2.VCF(args.vcf, samples=args.sample)
    else:
        vcf = cyvcf2.VCF(args.vcf)

    # Sample name
    if len(vcf.samples) > 1:
        sys.stderr.write("Error: " + str(len(vcf.samples)) +
                         " sample detected. This version is designed for a single sample !")
        sys.exit(-1)

    # Outputs
    if args.export:
        wx = cyvcf2.Writer(re.sub(r'\.vcf$|\.vcf.gz$|\.bcf',
                                  '_export.vcf', os.path.basename(args.vcf)), vcf)
    if args.debug:
        vcf.add_info_to_header({'ID': 'TMB_FILTERS', 'Description': 'Detected filters for TMB calculation',
                                'Type': 'Character', 'Number': '1'})
        wd = cyvcf2.Writer(re.sub(r'\.vcf$|\.vcf.gz$|\.bcf',
                                  '_debug.vcf', os.path.basename(args.vcf)), vcf)

    # Load config
    dbFlags = loadConfig(args.dbConfig)
    callerFlags = loadConfig(args.varConfig)

    # Genome size
    if args.effGenomeSize is None:
        if args.bed is not None:
            effGS = getEffGenomeSizeFromBed(args.bed)
        else:
Esempio n. 13
0
    return (rec)


if __name__ == "__main__":
    # Copy the input VCF to args.out, adding a RUNREC INFO value to every
    # record whose "CHROM:start+1-end" key is present in the recurrence table.

    args = argsParse()

    # Inputs: the VCF to annotate and the table returned by loadRec
    vcf = cyvcf2.VCF(args.vcf)
    rec = loadRec(args.rec)

    # Declare the new INFO tag on the reader, then build the writer from it
    vcf.add_info_to_header(
        dict(ID='RUNREC', Description='Run recurrence', Type='Character', Number='1'))
    w = cyvcf2.Writer(args.out, vcf)

    for variant in vcf:
        # lookup key: chromosome, 1-based start, end
        k = "{}:{}-{}".format(variant.CHROM, variant.start + 1, variant.end)
        if k in rec:
            variant.INFO["RUNREC"] = rec[k]

        # every record is written, annotated or not
        w.write_record(variant)

    w.close()
    vcf.close()
Esempio n. 14
0
def snp(args):
    """Write SNPs/indels that look private to a single sample, with annotations.

    Inputs read from ``args``: ``r`` (reference FASTA opened with pyfaidx),
    ``i`` (input VCF), ``m`` (sample map), ``o`` (output path; ``/dev/stdout``
    when empty), and the optional thresholds ``vaf`` / ``mvaf``.

    For each SNP or indel record, a sample is a candidate when its depth is
    strictly between 10 and 250, its AAG/RR likelihood ratio is at least 1e10
    and its genotype type equals the expected one (1, or 2 for samples whose
    map entry has Sex "M" on chromosome X or Y). The candidate is accepted
    when every other sample has an allele balance not above ``MAX_VAF`` and
    an RR/AAG ratio not below the applicable minimum. Accepted records receive
    the INFO tags declared below, optionally the LowVAF and MGP filters, and
    are written out.

    NOTE(review): ``unphred`` is defined elsewhere; it presumably converts
    phred-scaled likelihoods to linear scale - confirm.
    """

    genome = pyfaidx.Fasta(args.r)

    #should look specifically at X chrom VAFs in males
    # defaults, overridable from the command line
    VAF=0.30
    MVAF=0.95
    if args.vaf:    VAF=float(args.vaf)
    if args.mvaf:   MVAF=float(args.mvaf)

    # animal_map is returned as well but not used in this function
    sample_map, animal_map = load_sample_map(args.m)

    #gts012 sets the uncalled GTs to 3
    reader = cyvcf2.VCF(args.i, gts012=True)


    #get list of samples
    samples = reader.samples

    #add the new INFO tags
    reader.update("UNIQ", "String", 1, "Sample(s) with unique somatic variant")
    reader.update("UAB", "Float", 1, "Allele Balance in UNIQ sample")
    reader.update("TYPE", "String", 1, "Varant Type (SNPS: TS/TV) (INDELS: INS/DEL)")
    reader.update("TISSUE", "String", 1, "Source Tissue Type")
    reader.update("CASE", "String", 1, "Control or SCNT?")
    reader.update("EXPT", "String", 1, "Experiment")
    reader.update("CONTEXT", "String", 1, "Trinucleotide Context")
    reader.update("ANIMAL", "String", 1, "ID of origin animal")
    reader.update("UDP", "Integer", 1, "Depth at uniq site")
    reader.update("AAGR", "Float", 1, "AAG/RR Ratio")
    reader.update("UGQ", "Float", 1, "Uniq Genotype Quality")

    #add the two FILTER definitions this function may append
    reader.add_filter_to_header({"ID":"LowVAF", "Description":"Somatic VAF below threshold"})
    reader.add_filter_to_header({"ID":"MGP", "Description":"Variant present in MGP"})

    if not args.o:
        writer = cyvcf2.Writer("/dev/stdout", reader)
    else:
        writer = cyvcf2.Writer(args.o, reader)

    # minimum likelihood ratios: 1e10 for AAG/RR, 1e5 for RR/AAG
    AAG_RR_MIN = numpy.power(10.,10.)
    RR_AAG_MIN = numpy.power(10.,5.)

    # depth bounds; both comparisons below are strict
    min_depth = 10
    max_depth = 250

    allosomes = set(["X", "Y"])

    #iterate over vars
    for var in reader:
        

        # per-variant flag: once it turns False no further sample is examined
        unique = True

        # set max alt VAF for snp or indel
        if var.is_snp:
            MAX_VAF = 0.05
        elif var.is_indel:
            MAX_VAF = 0.00
        else:
            # NOTE(review): the message has no trailing newline
            sys.stderr.write("Skipping Variant: Not SNP/Indel")
            continue


        #get RR and AAG genotype likelihoods
        RR_PLs = unphred(var.gt_phred_ll_homref)
        AAG_PLs = unphred(var.gt_phred_ll_het)

        #get AAG/RR and RR/AAG likelihood ratios
        AAG_RR_ratios = numpy.true_divide(AAG_PLs, RR_PLs)
        RR_AAG_ratios = numpy.true_divide(RR_PLs, AAG_PLs)

        #get genotypes, depths, and alt allele depths
        GTs = var.gt_types
        DEPTHS = var.gt_depths
        ALT_DEPTHS = var.gt_alt_depths
        QUALS = var.gt_quals

        #get allele balances
        ABs = numpy.true_divide(ALT_DEPTHS, DEPTHS)

        for i in range(len(samples)):
            # dont waste time in uneeded loops
            if not unique:
                break

            # expected genotype type and minimum VAF for this sample
            AAG = 1
            MIN_VAF = VAF

            #change AAG to 1/1 and min vaf to male allosome min [0.95]
            # NOTE(review): AAG_RR_ratios is still built from the het
            # likelihoods in this case - confirm that is intended
            if sample_map[samples[i]]['Sex']=="M" and var.CHROM in allosomes:
                AAG = 2
                MIN_VAF = MVAF

            VAF_FILT = False
            

            #criteria for presence in given sample
            if (max_depth > DEPTHS[i] > min_depth 
                and AAG_RR_ratios[i] >= AAG_RR_MIN 
                and GTs[i] == AAG):

                # a low allele balance does not reject the call; it only adds
                # the LowVAF filter further down
                if ABs[i] < MIN_VAF:
                    VAF_FILT = True
                
                for j in range(len(samples)):
                    if i == j:
                        continue

                    #default RR/AAG min is 1.
                    ratio_min = 1
                    #if same animal
                    if same_animal(sample_map, samples, i, j):
                        # a same-animal sample outside the depth window
                        # disqualifies the variant
                        if not max_depth > DEPTHS[j] > min_depth:
                            unique = False
                            break

                        #if different case, set ratio_min to control RR_AAG min
                        #thus, SCNT lines for the same animal are treated as the control for controls
                        if sample_map[samples[i]]['Case'] != sample_map[samples[j]]['Case']:
                            ratio_min = RR_AAG_MIN

                    #criteria for failing or presence in other samples 
                    if (ABs[j] > MAX_VAF or
                        RR_AAG_ratios[j] < ratio_min):
                        unique = False
                        break

                if unique:
                    #get trinucleotide context (VCF coords are 1-based)
                    if var.is_snp:
                        # slice [POS-2, POS+1): three bases, the middle one at POS
                        context = genome[str(var.CHROM)][var.POS-2:var.POS+1]
                        #ts or tv?
                        tstv = 'Tv'
                        if var.is_transition:
                            tstv = 'Tr'

                        var.INFO['CONTEXT'] = context.seq
                        var.INFO['TYPE'] = tstv

                    # NOTE(review): UAB, UDP, AAGR and UGQ are assigned as
                    # strings although declared Float/Integer in the header
                    var.INFO['TISSUE'] = sample_map[samples[i]]['Source']
                    var.INFO['CASE'] = sample_map[samples[i]]['Case']
                    var.INFO['EXPT'] = sample_map[samples[i]]['Experiment']
                    var.INFO['UNIQ'] = samples[i]
                    var.INFO['UAB'] = str(numpy.around(ABs[i], 3))
                    var.INFO['UDP'] = str(DEPTHS[i])
                    var.INFO['AAGR'] = str(AAG_RR_ratios[i])
                    var.INFO['UGQ'] = str(QUALS[i])
                    var.INFO['ANIMAL'] = sample_map[samples[i]]['Animal']

                    # start from the filters already on the record, if any
                    filters = []
                    f = var.FILTER
                    if f:
                        filters = f.split(";")

                    if VAF_FILT:
                        filters.append("LowVAF")

                    # the INFO lookup is used purely as a presence test;
                    # a missing key raises KeyError and leaves filters as-is
                    try:
                        var.INFO["MGP"]
                        filters.append("MGP")
                    except KeyError:
                        pass

                    # FILTER is only reassigned when there is something to set
                    if filters:
                        var.FILTER = filters

                    #write record
                    writer.write_record(var)

    writer.close()
Esempio n. 15
0
def snp_fnr(args): 
    """Count per-sample detections of shared variants and report FNR per animal.

    Inputs read from ``args``: ``i`` (input VCF), ``o`` (output VCF path;
    ``/dev/stdout`` when empty), ``c`` (path of the counts report) and ``m``
    (sample map).

    For every SNP/indel record and every animal group, the variant counts as
    present in the animal when at least one of its samples (control or SCNT)
    has genotype type 1. The control's counter is then incremented, and each
    SCNT sample that passes the depth, likelihood-ratio, allele-balance and
    genotype checks has its own counter incremented. Records present in at
    least one animal are written with the ANIMAL and PRESENT INFO tags.

    The counts report lists one ``#COUNT`` line per counted sample and one
    ``#FNR`` line per animal, where the rate of an SCNT sample is
    ``1 - called / present`` and the animal's value is the mean over its SCNT
    samples. The "HQ" columns only count records whose FILTER is falsy.

    NOTE(review): the VCF writer is never closed (the call at the end is
    commented out) - confirm the output is flushed reliably.
    NOTE(review): an animal whose control counter is 0 makes the rate
    computation divide by zero.
    """

    #only concerned with autosome VAF cutoff
    VAF=0.30


    #gts012 sets the uncalled GTs to 3
    reader = cyvcf2.VCF(args.i, gts012=True)

    #declare the two INFO tags written below
    reader.update("ANIMAL", "String", ".", "Animals with this GSS var")
    reader.update("PRESENT", "String", ".", "SCNT lines detecting this GSS var")

    #open writer
    if not args.o:
        writer = cyvcf2.Writer("/dev/stdout", reader)
    else:
        writer = cyvcf2.Writer(args.o, reader)

    counts_out = open(args.c, 'w')

    #get list of samples
    samples = reader.samples

    #sample to index map
    stoi = {s: samples.index(s) for s in samples}

    #load sample map and get animal sample groups
    sample_map, animal_map = load_sample_map(args.m)


    # minimum AAG/RR likelihood ratio: 1e10
    AAG_RR_MIN = numpy.power(10.,10.)

    # depth bounds; both ends are inclusive in the checks below
    min_depth = 10
    max_depth = 250

    # counter: all records; HIcounter: only records with a falsy FILTER
    counter = Counter()
    HIcounter = Counter()

    #iterate over vars
    for var in reader:

        #false by default
        gss = False
        PASS = False
        animals = []
        present = []

        #only SNPs and indels are considered
        if not (var.is_snp or var.is_indel):
            # NOTE(review): the message has no trailing newline
            sys.stderr.write("Skipping Variant: Not SNP/Indel")
            continue

        #get RR and AAG genotype likelihoods
        RR_PLs = unphred(var.gt_phred_ll_homref)
        AAG_PLs = unphred(var.gt_phred_ll_het)

        #get AAG/RR ratios
        AAG_RR_ratios = numpy.true_divide(AAG_PLs, RR_PLs)

        #get genotypes, depths, and alt allele depths
        GTs = var.gt_types
        DEPTHS = var.gt_depths
        ALT_DEPTHS = var.gt_alt_depths

        #get allele balances
        ABs = numpy.true_divide(ALT_DEPTHS, DEPTHS)

        #a falsy FILTER marks the record as high quality
        if not var.FILTER: PASS = True

        for animal, group in animal_map.items():

            #get sample name of control
            control = group['Control'][0]
            SCNTs = group['SCNT']

            #if var not called in the control, continue:
            # if GTs[stoi[control]] == [0,3]:
            #     continue

            ALL = [control] + SCNTs

            #if at least one sample was called het, 
            #   var is present in mouse
            if 1 in [GTs[stoi[x]] for x in ALL]:
                #the control's counter holds the number of variants present
                #in the animal, whichever sample carried the het call
                counter[control] += 1
                if PASS: 
                    HIcounter[control] += 1
                gss = True
                control_depth = False
                if (DEPTHS[stoi[control]] >= min_depth and
                    DEPTHS[stoi[control]] <= max_depth):
                    control_depth = True

                animals.append(animal)

                for sample in SCNTs:
                    i = stoi[sample]

                    #an SCNT sample detects the variant only if the control
                    #depth is also within bounds
                    if (DEPTHS[i] >= min_depth and
                        DEPTHS[i] <= max_depth and
                        control_depth and
                        AAG_RR_ratios[i] >= AAG_RR_MIN and
                        ABs[i] >= VAF and
                        GTs[i] == 1):


                        counter[sample] += 1
                        present.append(sample)
                        if PASS:
                            HIcounter[sample] += 1

        if gss:
            var.INFO['ANIMAL'] = ",".join(animals)
            var.INFO['PRESENT'] = ",".join(present)
            writer.write_record(var)

    #per-sample counts; only samples that were counted at least once appear
    counts_out.write("#COUNTS\tSample\tCase\tCount\tHQCount\n")
    for sample in sorted(counter.keys()):
        outstr = "\t".join(["#COUNT", sample, sample_map[sample]['Case'], str(counter[sample]), str(HIcounter[sample])])
        counts_out.write(outstr+"\n")

    counts_out.write("#FNR\tAnimal\tFNR\tHQFNR\n")

    for animal, group in sorted(animal_map.items()):

        #get sample name of control
        control = group['Control'][0]
        SCNTs = group['SCNT']

        #number of variants counted as present in this animal
        present = counter[control]

        rates = []
        hrates = []

        for sample in SCNTs:
            called = counter[sample]
            hcalled = HIcounter[sample]
            #NOTE(review): raises ZeroDivisionError when present == 0; the
            #HQ rate also divides by the overall (not HQ) present count
            rate = 1.0-(called/float(present))
            hrate = 1.0-(hcalled/float(present))
            rates.append(rate)
            hrates.append(hrate)

        #mean over the animal's SCNT samples
        a_rate = numpy.mean(rates)
        a_hrate = numpy.mean(hrates)

        counts_out.write("\t".join(["#FNR", animal, str(a_rate), str(a_hrate)])+"\n")


    # writer.close()
    counts_out.close()
Esempio n. 16
0
def mei(args):
    """Write mobile-element insertions called in exactly one sample.

    Reads the VCF at ``args.i`` and the sample map at ``args.m``; output goes
    to ``args.o`` or ``/dev/stdout`` when ``args.o`` is empty. Only records
    with a falsy FILTER and with both ``LP`` and ``RP`` INFO values above 3
    are examined. A sample is accepted when its genotype type equals the
    expected one (1, or 2 for samples with Sex "M" on chromosome X or Y) and
    every other sample has genotype type 0 and a hom-ref value of at least
    0.60. Accepted records get the UNIQ, TISSUE, CASE, EXPT and ANIMAL INFO
    tags from that sample's map entry.
    """
    # gts012 sets the uncalled GTs to 3
    vcf_in = cyvcf2.VCF(args.i, gts012=True)

    # sample names, in the order used to index the per-sample arrays
    sample_names = vcf_in.samples

    # per-sample metadata; the animal map is returned too but not used here
    sample_map, animal_map = load_sample_map(args.m)

    # declare the INFO tags this function writes (all single String values)
    string_tags = (
        ("UNIQ", "Sample(s) with unique somatic variant"),
        ("TISSUE", "Source Tissue Type"),
        ("CASE", "Control or SCNT?"),
        ("EXPT", "Experiment"),
        ("ANIMAL", "ID of origin animal"),
    )
    for tag_id, tag_desc in string_tags:
        vcf_in.update(tag_id, "String", 1, tag_desc)

    vcf_out = cyvcf2.Writer(args.o if args.o else "/dev/stdout", vcf_in)

    allosomes = set(["X", "Y"])
    min_support = 3
    n_samples = len(sample_names)

    for record in vcf_in:
        left_support = record.INFO['LP']
        right_support = record.INFO['RP']
        # skip filtered records and those lacking support on either side
        if record.FILTER or not (left_support > min_support
                                 and right_support > min_support):
            continue

        call_types = record.gt_types

        # strange behaviour... MELT PLs read as the input value *10.
        # divide by 10 to correct. should have MELT return positive integers
        # rather than negative floats.
        hom_ref = unphred(numpy.divide(record.gt_phred_ll_homref, 10.))
        het = unphred(numpy.divide(record.gt_phred_ll_het, 10.))

        # AAG/RR and RR/AAG likelihood ratios (computed as before; not
        # consulted by the checks below)
        het_over_ref = numpy.true_divide(het, hom_ref)
        ref_over_het = numpy.true_divide(hom_ref, het)

        for idx, name in enumerate(sample_names):
            # expected genotype type: 1, or 2 for male samples on X/Y
            expected = 1
            if sample_map[name]['Sex'] == "M" and record.CHROM in allosomes:
                expected = 2

            if call_types[idx] != expected:
                continue

            # any other sample that is not confidently hom-ref ends the search
            seen_elsewhere = any(
                call_types[other] != 0 or hom_ref[other] < 0.60
                for other in range(n_samples)
                if other != idx
            )
            if seen_elsewhere:
                break

            # annotate with the metadata of the one carrying sample
            meta = sample_map[name]
            record.INFO['TISSUE'] = meta['Source']
            record.INFO['CASE'] = meta['Case']
            record.INFO['EXPT'] = meta['Experiment']
            record.INFO['UNIQ'] = name
            record.INFO['ANIMAL'] = meta['Animal']
            vcf_out.write_record(record)

    vcf_out.close()
Esempio n. 17
0
def main(ARGS=None):
    """Screen an annotated VCF and write the retained records to a new VCF.

    The function parses the supplied arguments, registers any new INFO tags
    in the output header, iterates over the requested intervals (or the whole
    file), optionally annotates each record with max-impact / max / min
    consequence values, applies the optional variant-conditions file, and
    writes every surviving record to ``args.out_vcf``.

    Args:
        ARGS: list of command-line style argument strings. When ``None``,
            ``sys.argv[1:]`` is used.

    Returns:
        None. Summary counts are printed to stdout.
    """
    if ARGS is None:
        ARGS = sys.argv[1:]
    args = parse_args(ARGS)

    # convert certain comma delim str args to lists
    args.qual_impacts = misc.str_none_split(args.qual_impacts, ",")
    args.max_impact_csqs = misc.str_none_split(args.max_impact_csqs, ",")
    args.max_csq_scores = misc.str_none_split(args.max_csq_scores, ",")
    args.min_csq_scores = misc.str_none_split(args.min_csq_scores, ",")

    # read cnds file; var_cnds stays None when no conditions file was given,
    # in which case the conditions filter below is skipped entirely
    var_cnds = None
    if args.variant_cnds is not None:
        var_cnds = VcfCnds(args.variant_cnds)

    # init cyvcf2 VCF obj, get info subfields, header for output
    vcf = cyvcf2.VCF(args.in_vcf, strict_gt=True)
    cyvcf2_vcf = Cyvcf2Vcf(vcf)
    cyvcf2_vcf.get_info_subfields()
    if args.annotation_subfield == "ANN":
        cyvcf2_vcf.get_csq_keys(spliton="Functional annotations: ",
                                delim="|",
                                chars_del=[" ", "'", '"'],
                                ann_id=args.annotation_subfield)
    else:
        cyvcf2_vcf.get_csq_keys(spliton="Format: ",
                                delim="|",
                                ann_id=args.annotation_subfield)
    # NOTE(review): the returned header list is not used in this function;
    # the call is retained in case it has side effects on cyvcf2_vcf.
    cyvcf2_vcf.header_to_list(
        gt_varnames=GT_VARNAMES,
        max_impact=args.max_impact,
        max_impact_csqs=args.max_impact_csqs,
        max_csq_scores=args.max_csq_scores,
        min_csq_scores=args.min_csq_scores,
        delim="\t")

    # since we're writing to a VCF, any new INFO items written need to be
    # added to the header to reflect this.
    if args.max_impact_csqs is not None:
        for csq_name in args.max_impact_csqs:
            csq_name_ext = csq_name + "_maximpact"
            vcf.add_info_to_header({'ID': csq_name_ext,
                                    'Description':'max '+csq_name+' to go along '+\
                                                  'with transcripts with max IMPACT',
                                    'Type':'Character',
                                    'Number':'1'})
    if args.max_csq_scores is not None:
        for csq_name in args.max_csq_scores:
            csq_name_ext = csq_name + "_max"
            vcf.add_info_to_header({'ID': csq_name_ext,
                                    'Description':'max value for '+csq_name + \
                                                  'along assessed transcripts '+\
                                                  'in CSQ field.',
                                    'Type':'Float',
                                    'Number':'1'})
    if args.min_csq_scores is not None:
        for csq_name in args.min_csq_scores:
            csq_name_ext = csq_name + "_min"
            vcf.add_info_to_header({'ID': csq_name_ext,
                                    'Description':'min value for '+csq_name + \
                                                  'along assessed transcripts '+\
                                                  'in CSQ field.',
                                    'Type':'Float',
                                    'Number':'1'})

    # init VCF writer object (must happen after the header additions above)
    w = cyvcf2.Writer(args.out_vcf, vcf)

    prev_chrom = None
    n_var = 0
    n_var_keep = 0

    # if intervals provided, make sure to parse over those, else whole vcf.
    # args.intervals is either a path to a file with one interval per line
    # or a single interval string.
    if args.intervals is not None:
        if os.path.isfile(args.intervals):
            with open(args.intervals, "r") as intervals_fh:
                intervals = [line.rstrip() for line in intervals_fh]
        else:
            intervals = [args.intervals]
    else:
        intervals = [""]

    # the impact subfield name depends only on the annotation flavour, so it
    # is resolved once instead of once per record
    if args.annotation_subfield == "ANN":
        impact_subfield = "Annotation_Impact"
    else:
        impact_subfield = "IMPACT"

    # parse VCF file, screening each variant in turn
    for vcf_variant in cyvcf2_vcf.iterator(intervals):
        n_var += 1

        # create new Cyvcf2Variant instance
        cyvcf2_variant = Cyvcf2Variant(vcf_variant)

        if vcf_variant.CHROM != prev_chrom:
            print("Extracting variants from chrom " + vcf_variant.CHROM)
            prev_chrom = vcf_variant.CHROM

        # assume single allele per site, exclude sites with call as '*'
        alt = vcf_variant.ALT[0]
        if alt == '*':
            continue

        ## if no qualifying impact str found in CSQ, skip
        if args.qual_impacts is not None:
            res = cyvcf2_variant.qual_impacts_screen(
                args.qual_impacts, csq_subfield=args.annotation_subfield)
            if res == False:
                continue

        ## if desired, derive max impact annots from var, along with other
        ## user defined max or min scores in CSQ for variant
        if args.max_impact == True:
            cyvcf2_variant.get_annot_txs(cyvcf2_vcf.csq_keys,
                                         csq_subfield=args.annotation_subfield)
            res = cyvcf2_variant.maxmin_csqs(
                csq_subfield=args.annotation_subfield,
                impact_subfield=impact_subfield,
                max_impact_csqs=args.max_impact_csqs,
                max_csq_scores=args.max_csq_scores,
                min_csq_scores=args.min_csq_scores)
            (csqs_maximpact_list, max_csq_scores, min_csq_scores) = res

            # if corresponding values defined, add to vcf record
            if args.max_impact_csqs is not None:
                for i, csq_name in enumerate(args.max_impact_csqs):
                    vcf_variant.INFO[csq_name + "_maximpact"] = \
                        csqs_maximpact_list[i]
            if args.min_csq_scores is not None:
                for i, csq_name in enumerate(args.min_csq_scores):
                    vcf_variant.INFO[csq_name + "_min"] = \
                        float(min_csq_scores[i])
            if args.max_csq_scores is not None:
                for i, csq_name in enumerate(args.max_csq_scores):
                    vcf_variant.INFO[csq_name + "_max"] = \
                        float(max_csq_scores[i])

        ## filter on variant cnds file, only when one was provided
        if var_cnds is not None:
            if var_cnds.test_variant(vcf_variant) == False:
                continue

        ## if variant survives filters, retain record
        w.write_record(vcf_variant)
        n_var_keep += 1

    w.close()
    vcf.close()

    ## print basic stats on number of input variants, number of
    ## variants to keep
    print("Number of variants in parent VCF : " + str(n_var))
    print("Number of variants retained post-filtration : " + str(n_var_keep))

    return
Esempio n. 18
0
    def __call__(self, predictions, records, line_ids=None):
        """Annotate a batch of VCF records with predictions and write them out.

        On the first non-empty call the INFO header tags are registered on
        ``self.vcf_reader`` and the output writer is created; on later calls
        the batch is checked for consistency with the first one.

        Args:
            predictions: mapping of prediction-method name to a table whose
                rows correspond to ``records`` (accessed via ``.columns``,
                ``.shape`` and ``.iloc``).
            records: sequence of VCF records, one per prediction row.
            line_ids: optional array of ids (accessed via ``.shape[0]`` and
                integer indexing), one per record, written to the ``:rID``
                INFO tag. An empty string is written when omitted.

        Returns:
            None.

        Raises:
            Exception: if prediction columns differ between methods or
                batches, if the set of methods changes between batches, or
                if the number of rows / line ids does not match ``records``.
        """
        # First itertation: the output file has to be created and the headers defined
        import cyvcf2

        if len(predictions) == 0:
            return None

        metdata_id_infotag = self.info_tag_prefix + ":rID"

        if self.prediction_labels is None:
            # setup the header
            self.prediction_labels = list(predictions.keys())
            for k in predictions:
                col_labels_here = predictions[k].columns.tolist()
                # Make sure that the column are consistent across different prediction methods
                if self.column_labels is None:
                    self.column_labels = col_labels_here
                elif list(self.column_labels) != list(col_labels_here):
                    # dict views are not subscriptable on Python 3, so the
                    # first method name is taken from the list stored above.
                    raise Exception(
                        "Prediction columns are not identical for methods %s and %s"
                        % (self.prediction_labels[0], k))
                # Add the tag to the vcf file
                # "##INFO=<ID={ID},Number={Number},Type={Type},Description=\"{Description}\">".format(**adict)
                # NOTE(review): "Number" is passed as None here; confirm the
                # header line this produces is what downstream tools expect.
                info_tag = {
                    "ID":
                    self.info_tag_prefix + ":%s" % k.upper(),
                    "Number":
                    None,
                    "Type":
                    "String",
                    "Description":
                    "%s SNV effect prediction. Prediction from model outputs: %s"
                    % (k.upper(), "|".join(self.column_labels))
                }
                self.vcf_reader.add_info_to_header(info_tag)
            # Add a tag in which the line_id = ranges_id will be written
            info_tag = {
                "ID":
                metdata_id_infotag,
                "Number":
                None,
                "Type":
                "String",
                "Description":
                "Range or region id taken from metadata, generated by the DataLoader."
            }
            self.vcf_reader.add_info_to_header(info_tag)
            # Now we can also create the vcf writer
            self.vcf_writer = cyvcf2.Writer(self.out_vcf_fpath,
                                            self.vcf_reader)
        else:
            # Later batches must carry exactly the methods seen first.
            if (len(predictions) != len(self.prediction_labels)) or not all(
                    k in predictions for k in self.prediction_labels):
                raise Exception(
                    "Predictions are not consistent across batches")
            for k in predictions:
                col_labels_here = predictions[k].columns.tolist()
                if list(self.column_labels) != list(col_labels_here):
                    raise Exception(
                        "Prediction columns are not identical for methods %s and %s"
                        % (self.prediction_labels[0], k))

        # sanity check that the number of records matches the prediction rows:
        for k in predictions:
            if predictions[k].shape[0] != len(records):
                raise Exception(
                    "number of records does not match number the prediction rows for prediction %s."
                    % str(k))

        if line_ids is not None:
            if line_ids.shape[0] != len(records):
                raise Exception(
                    "number of line_ids does not match number of VCF records")

        # Actually write the vcf entries.
        for pred_line, record in enumerate(records):
            if self.standardise_var_id and self.vcf_id_generator is not None:
                record.ID = self.vcf_id_generator(record)
            for k in predictions:
                # In case there is a pediction for this line, annotate the vcf...
                preds = predictions[k].iloc[pred_line, :]
                info_tag = self.info_tag_prefix + ":{0}".format(k.upper())
                record.INFO[info_tag] = "|".join([str(pred) for pred in preds])
            line_id = ""
            if line_ids is not None:
                line_id = line_ids[pred_line]
            record.INFO[metdata_id_infotag] = line_id
            self.vcf_writer.write_record(record)
Esempio n. 19
0
def extend_vcf_annotations(query_vcf, pcgr_db_dir, logger, pon_annotation, regulatory_annotation, cpsr, debug):
    """
    Read a VEP/vcfanno-annotated VCF and write a copy whose INFO column is extended with

    1. elements of the primary (picked) VEP transcript consequence, e.g. SYMBOL, Feature, Gene, Consequence
    2. cancer-relevant gene cross-references (PCGR_ONCO_XREF)
    3. protein-relevant annotations
    4. variant effect predictions (from the DBNSFP tag, when present)
    5. panel-of-normals and regulatory annotations, when requested

    The INFO tags to declare come from the 'infotags' files in pcgr_db_dir. Records
    without a CSQ tag are skipped and reported. The output is written next to the
    query as '<name>.annotated.vcf', then bgzip-compressed and tabix-indexed.
    """

    ## tags to append to the VCF header: PCGR set by default, CPSR set when cpsr is True
    infotags_path = os.path.join(pcgr_db_dir, 'pcgr_infotags.tsv')
    vcf_infotags_meta = annoutils.read_infotag_file(infotags_path)
    if cpsr is True:
        vcf_infotags_meta = annoutils.read_infotag_file(os.path.join(pcgr_db_dir, 'cpsr_infotags.tsv'))
    pcgr_onco_xref_map = annoutils.read_genexref_namemap(os.path.join(pcgr_db_dir, 'pcgr_onco_xref', 'pcgr_onco_xref_namemap.tsv'))


    out_vcf = re.sub(r'\.vcf(\.gz){0,}$','.annotated.vcf',query_vcf)

    meta_vep_dbnsfp_info = annoutils.vep_dbnsfp_meta_vcf(query_vcf, vcf_infotags_meta)
    dbnsfp_prediction_algorithms = meta_vep_dbnsfp_info['dbnsfp_prediction_algorithms']
    vep_csq_fields_map = meta_vep_dbnsfp_info['vep_csq_fieldmap']
    vcf = cyvcf2.VCF(query_vcf)

    ## decide once which tag prefixes are left out of the header; any flag
    ## combination other than the three below declares every tag
    if pon_annotation == 0 and regulatory_annotation == 0:
        skipped_prefixes = ('PANEL_OF_NORMALS', 'REGULATORY_')
    elif pon_annotation == 1 and regulatory_annotation == 0:
        skipped_prefixes = ('REGULATORY_',)
    elif pon_annotation == 0 and regulatory_annotation == 1:
        skipped_prefixes = ('PANEL_OF_NORMALS',)
    else:
        skipped_prefixes = ()

    for tag in sorted(vcf_infotags_meta):
        if tag.startswith(skipped_prefixes):
            continue
        tag_meta = vcf_infotags_meta[tag]
        vcf.add_info_to_header({'ID': tag, 'Description': str(tag_meta['description']),'Type':str(tag_meta['type']), 'Number': str(tag_meta['number'])})

    w = cyvcf2.Writer(out_vcf, vcf)
    current_chrom = None
    num_chromosome_records_processed = 0

    ## map every typed header entry to its declared type
    vcf_info_element_types = {}
    for header_line in vcf.header_iter():
        header_element = header_line.info()
        if not ('ID' in header_element and 'HeaderType' in header_element and 'Type' in header_element):
            continue
        vcf_info_element_types[str(header_element['ID'])] = str(header_element['Type'])

    vars_no_csq = list()
    for rec in vcf:
        ## on a chromosome change, report the finished one and reset the counter
        rec_chrom = str(rec.CHROM)
        if rec_chrom != current_chrom:
            if current_chrom is not None:
                logger.info(f"Completed summary of functional annotations for {num_chromosome_records_processed} variants on chr{current_chrom}")
            current_chrom = rec_chrom
            num_chromosome_records_processed = 0

        ## records without VEP consequences are collected for the warning below and dropped
        if rec.INFO.get('CSQ') is None:
            alt_allele = ','.join(rec.ALT)
            pos = rec.start + 1
            variant_id = f"g.{rec.CHROM}:{pos}{rec.REF}>{alt_allele}"
            vars_no_csq.append(variant_id)
            continue

        num_chromosome_records_processed += 1
        pcgr_onco_xref = annoutils.make_transcript_xref_map(rec, pcgr_onco_xref_map, xref_tag = "PCGR_ONCO_XREF")

        if regulatory_annotation == 1:
            csq_record_results_all = annoutils.parse_vep_csq(rec, pcgr_onco_xref, vep_csq_fields_map, logger, pick_only = False, csq_identifier = 'CSQ')
            if 'vep_block' in csq_record_results_all:
                rec.INFO['REGULATORY_ANNOTATION'] = annoutils.map_regulatory_variant_annotations(csq_record_results_all['vep_block'])

        csq_record_results_pick = annoutils.parse_vep_csq(rec, pcgr_onco_xref, vep_csq_fields_map, logger, pick_only = True, csq_identifier = 'CSQ')
        if 'vep_all_csq' in csq_record_results_pick:
            rec.INFO['VEP_ALL_CSQ'] = ','.join(csq_record_results_pick['vep_all_csq'])
        if 'vep_block' in csq_record_results_pick:
            vep_csq_records = csq_record_results_pick['vep_block']
            block_idx = annoutils.get_correct_cpg_transcript(vep_csq_records) if cpsr is True else 0
            record = vep_csq_records[block_idx]
            ## copy every consequence field that has a declared header type into INFO
            for field in record:
                if field not in vcf_info_element_types:
                    continue
                field_value = record[field]
                if vcf_info_element_types[field] == "Flag" and field_value == "1":
                    rec.INFO[field] = True
                elif field_value is not None:
                    rec.INFO[field] = field_value
        if rec.INFO.get('DBNSFP') is not None:
            annoutils.map_variant_effect_predictors(rec, dbnsfp_prediction_algorithms)

        w.write_record(rec)

    if vars_no_csq:
        logger.warning(f"There were {len(vars_no_csq)} records with no CSQ tag from VEP (was --vep_no_intergenic flag set?). Skipping them and showing (up to) the first 100:")
        print('----')
        print(', '.join(vars_no_csq[:100]))
        print('----')
    w.close()
    if current_chrom is not None:
        logger.info(f"Completed summary of functional annotations for {num_chromosome_records_processed} variants on chr{current_chrom}")
    vcf.close()

    ## compress and index the output, provided something was actually written
    if os.path.exists(out_vcf) and os.path.getsize(out_vcf) > 0:
        check_subprocess(logger, f'bgzip -f {out_vcf}', debug=False)
        check_subprocess(logger, f'tabix -f -p vcf {out_vcf}.gz', debug=False)
        annotated_vcf = f'{out_vcf}.gz'
        annoutils.write_pass_vcf(annotated_vcf, logger)
    else:
        error_message('No remaining PASS variants found in query VCF - exiting and skipping STEP 4', logger)
    pdp3_vaf = calculate_vaf(variant.gt_alt_depths[2], variant.gt_depths[2])
    pdp4_vaf = calculate_vaf(variant.gt_alt_depths[3], variant.gt_depths[3])
    ssc1_vaf = calculate_vaf(variant.gt_alt_depths[4], variant.gt_depths[4])
    ssc2_vaf = calculate_vaf(variant.gt_alt_depths[5], variant.gt_depths[5])
    ssc3_vaf = calculate_vaf(variant.gt_alt_depths[6], variant.gt_depths[6])
    ssc4_vaf = calculate_vaf(variant.gt_alt_depths[7], variant.gt_depths[7])
    ssc5_vaf = calculate_vaf(variant.gt_alt_depths[8], variant.gt_depths[8])
    pf1_vaf = calculate_vaf(variant.gt_alt_depths[9], variant.gt_depths[9])
    return pdp1_vaf, pdp2_vaf, pdp3_vaf, pdp4_vaf, ssc1_vaf, ssc2_vaf, ssc3_vaf, ssc4_vaf, ssc5_vaf, pf1_vaf


# Open the input VCF (hard-coded absolute path) for reading.
vcf_handle = cyvcf2.VCF(
    '/home/users/cjyoon/Projects/rheum/data_processing/01c_freebayes/everyone.freebayes.decomposed.norm.vep.centelexcl.vcf.gz'
)
# Create the output VCF writer, passing the input handle as its second
# argument so the writer is built from the opened input.
writer = cyvcf2.Writer(
    '/home/users/cjyoon/Projects/rheum/data_processing/01c_freebayes/everyone.freebayes.decomposed.norm.vep.centelexcl.denovo_v2.vcf',
    vcf_handle)
for variant in vcf_handle:
    pdp1, pdp2, pdp3, pdp4, ssc1, ssc2, ssc3, ssc4, ssc5, pf1 = variant.genotypes
    pdp1_geno = variant_type(pdp1[0:2])
    pdp2_geno = variant_type(pdp2[0:2])
    pdp3_geno = variant_type(pdp3[0:2])
    pdp4_geno = variant_type(pdp4[0:2])
    ssc1_geno = variant_type(ssc1[0:2])
    ssc2_geno = variant_type(ssc2[0:2])
    ssc3_geno = variant_type(ssc3[0:2])
    ssc4_geno = variant_type(ssc4[0:2])
    ssc5_geno = variant_type(ssc5[0:2])
    pf1_geno = variant_type(pf1[0:2])

    pdp1_vaf, pdp2_vaf, pdp3_vaf, pdp4_vaf, ssc1_vaf, ssc2_vaf, ssc3_vaf, ssc4_vaf, ssc5_vaf, pf1_vaf = sample_vafs(
Esempio n. 21
0
def main():
	"""Filter a VCF by site quality, depth, variant type and per-sample support.

	A record is written to ``args.output_vcf`` only if it passes every check:

	* site QUAL is at least ``args.QUAL``
	* INFO/DP is at least ``args.min_samples * args.sample_depth``
	* INFO/type matches ``args.var_type`` ("ALL" accepts everything,
	  "INDEL" accepts "ins" and "del")
	* at least ``args.min_samples`` samples have FORMAT/DP >= ``args.sample_depth``
	* at least ``args.min_samples`` samples have FORMAT/GQ >= ``args.genotype_quality``
	* when ``args.min_support`` > 0, at least ``args.min_samples`` samples
	  have enough reads supporting their called genotype

	Records in which QUAL, INFO/DP, FORMAT/DP or FORMAT/GQ is missing cannot
	be shown to meet the thresholds and are dropped instead of raising.
	"""
	args = parse_args()

	vcf = cyvcf2.VCF(args.vcf)

	# Record the exact filtering command in the output header.
	vcf.add_to_header(
		"##Filter_vcf_CMD=python Filter_vcf.py "
		"--vcf {} "
		"--output_vcf {} "
		"--variant_caller {} "
		"--min_samples {} "
		"--QUAL {} "
		"--sample_depth {} "
		"--min_support {} "
		"--genotype_quality {} "
		"--type {}".format(
			str(args.vcf),
			str(args.output_vcf),
			str(args.variant_caller),
			str(args.min_samples),
			str(args.QUAL),
			str(args.sample_depth),
			str(args.min_support),
			str(args.genotype_quality),
			str(args.var_type)))

	out_vcf = cyvcf2.Writer(args.output_vcf, vcf)

	for variant in vcf:
		# A missing QUAL is None; comparing None with a number raises on
		# Python 3, so it is treated as failing the threshold.
		qual = variant.QUAL
		if qual is None or qual < args.QUAL:
			continue

		# Same reasoning for a missing INFO/DP.
		site_depth = variant.INFO.get("DP")
		if site_depth is None or site_depth < args.min_samples * args.sample_depth:
			continue

		if args.var_type != "ALL":
			var_type = variant.INFO.get("type")
			if args.var_type == "INDEL":
				if var_type not in ("ins", "del"):
					continue
			elif var_type != args.var_type.lower():
				continue

		# Count samples meeting the per-sample depth threshold; guard
		# against the FORMAT field being absent for this record.
		dp = variant.format('DP')
		if dp is None or len(dp[dp >= args.sample_depth]) < args.min_samples:
			continue

		# Count samples meeting the genotype-quality threshold.
		gq = variant.format('GQ')
		if gq is None or len(gq[gq >= args.genotype_quality]) < args.min_samples:
			continue

		if args.min_support > 0:
			gt = variant.gt_types
			gt_ref = variant.gt_ref_depths
			gt_alt = variant.gt_alt_depths
			passing = 0
			for idx, gt_type in enumerate(gt):
				# Genotype codes 0/1/2 require enough reference reads, both
				# reference and alternate reads, or alternate reads
				# respectively; any other code never counts as passing.
				if gt_type == 0:
					supported = gt_ref[idx] >= args.min_support
				elif gt_type == 1:
					supported = (gt_ref[idx] >= args.min_support
						and gt_alt[idx] >= args.min_support)
				elif gt_type == 2:
					supported = gt_alt[idx] >= args.min_support
				else:
					supported = False
				if supported:
					passing += 1
			if passing < args.min_samples:
				continue

		out_vcf.write_record(variant)

	out_vcf.close()
	# Release the input handle as well as the writer.
	vcf.close()