def main(debug=None):
    """Stream a VCF to stdout with the sample names on its #CHROM line rewritten.

    Sample columns (index 9 onward of the ``#CHROM`` header line) are renamed
    in three optional steps, applied in this order:

    1. ``--subst``: each item is split on ``:``, ``=`` or ``,`` into an
       original name and a replacement; samples equal to the original are
       replaced.  Items are applied one after another, so a later item sees
       the result of an earlier one.
    2. ``--prefix`` is prepended to every sample name.
    3. ``--suffix`` is appended to every sample name.

    Every other line is printed with surrounding whitespace stripped.

    Args:
        debug: Optional argv list handed to docopt instead of ``sys.argv``.

    Raises:
        ValueError: if a ``--subst`` item does not split into exactly two parts.
    """
    args = docopt(__doc__, argv=debug, version=__version__)
    if args["<vcf>"] == "":
        # NOTE(review): execution continues after printing the usage text, so
        # vcf("") is still attempted -- confirm whether that is intended.
        print(__doc__)
    v = vcf(args["<vcf>"])

    for line in v.output_raw():
        if not line.startswith("#CHROM"):
            print(line.strip())
            continue

        # Remove the line terminator before splitting.  Left in place it stays
        # glued to the last sample name, which then never matches a --subst
        # entry and receives --suffix *after* the newline.
        fields = line.rstrip("\r\n").split("\t")

        if args["--subst"]:
            for pair in args["--subst"]:
                orig, replacement = re.split("[:=,]", pair)
                fields[9:] = [replacement if name == orig else name
                              for name in fields[9:]]
        if args["--prefix"]:
            fields[9:] = [args["--prefix"] + name for name in fields[9:]]
        if args["--suffix"]:
            fields[9:] = [name + args["--suffix"] for name in fields[9:]]

        # Function-call form is valid on both Python 2 and Python 3.
        print('\t'.join(fields))
def main(debug=None):
    """Stream a VCF to stdout with the sample names on its #CHROM line rewritten.

    Sample columns (index 9 onward of the ``#CHROM`` header line) are renamed
    in three optional steps, applied in this order:

    1. ``--subst``: each item is split on ``:``, ``=`` or ``,`` into an
       original name and a replacement; samples equal to the original are
       replaced.  Items are applied one after another, so a later item sees
       the result of an earlier one.
    2. ``--prefix`` is prepended to every sample name.
    3. ``--suffix`` is appended to every sample name.

    Every other line is printed with surrounding whitespace stripped.

    Args:
        debug: Optional argv list handed to docopt instead of ``sys.argv``.

    Raises:
        ValueError: if a ``--subst`` item does not split into exactly two parts.
    """
    args = docopt(__doc__, argv=debug, version=__version__)
    if args["<vcf>"] == "":
        # NOTE(review): execution continues after printing the usage text, so
        # vcf("") is still attempted -- confirm whether that is intended.
        print(__doc__)
    v = vcf(args["<vcf>"])

    for line in v.output_raw():
        if not line.startswith("#CHROM"):
            print(line.strip())
            continue

        # Remove the line terminator before splitting.  Left in place it stays
        # glued to the last sample name, which then never matches a --subst
        # entry and receives --suffix *after* the newline.
        fields = line.rstrip("\r\n").split("\t")

        if args["--subst"]:
            for pair in args["--subst"]:
                orig, replacement = re.split("[:=,]", pair)
                fields[9:] = [replacement if name == orig else name
                              for name in fields[9:]]
        if args["--prefix"]:
            fields[9:] = [args["--prefix"] + name for name in fields[9:]]
        if args["--suffix"]:
            fields[9:] = [name + args["--suffix"] for name in fields[9:]]

        # Function-call form is valid on both Python 2 and Python 3.
        print('\t'.join(fields))
def main(debug=None):
    """Print an aligned FASTA, or a muscle-built tree, from the SNPs of a VCF.

    For every record flagged ``is_snp`` the first character of each sample's
    ``gt_bases`` entry is taken (``.`` is written as ``N``), giving one
    sequence per sample.

    * ``fasta``: print the sequences as FASTA records.
    * ``tree``: pipe the FASTA into ``muscle -maketree`` (``upgma`` clustering
      unless ``nj`` is set, then ``neighborjoining``) and print the result.
      With ``--plot`` the tree is also rendered through ``static/tree.html``
      into a temporary HTML file and opened in the web browser.

    Args:
        debug: Optional argv list handed to docopt instead of ``sys.argv``.

    Raises:
        SystemExit: with status 1 when the VCF has fewer than two samples or
            no SNP genotypes were collected.
    """
    args = docopt(__doc__,
                  argv=debug,
                  options_first=False,
                  version=__version__)

    def first(s):
        # First character of a genotype string; "." is rewritten to "N".
        return s[0].replace(".", "N")

    firstv = np.vectorize(first)

    v = vcf(args["<vcf>"])

    if len(v.samples) <= 1:
        puts_err(colored.red("\n\tVCF must have at least two samples.\n"))
        # exit(puts_err(...)) passed puts_err's return value as the status
        # (presumably None, i.e. success); report the failure explicitly.
        raise SystemExit(1)

    if args["<region>"]:
        variant_set = v(args["<region>"])
    else:
        variant_set = v

    if args["fasta"] or args["tree"]:
        # One row of per-sample bases for every SNP record.
        gt_set = []
        for line in variant_set:
            if line.is_snp:
                gt_set.append(firstv(line.gt_bases))
        if len(gt_set) == 0:
            puts_err("No genotypes")
            raise SystemExit(1)
        # Stack to (n_snps, n_samples); transposing yields one row per sample.
        gt_set = np.vstack(gt_set)
        seqs = zip(v.samples, np.transpose(gt_set))

        if args["fasta"]:
            for sample, seq in seqs:
                print(">" + sample)
                print(''.join(seq))

        elif args["tree"]:
            # Check for muscle dependency
            check_program_exists("muscle")
            with indent(4):
                puts_err(colored.blue("\nGenerating Fasta\n"))
            fasta_records = []
            for sample, seq in seqs:
                fasta_records.append(
                    ">" + sample + "\n" + ''.join(seq) + "\n")
            fasta = "".join(fasta_records)

            tree_type = "upgma"  # default is upgma
            if args["nj"]:
                tree_type = "neighborjoining"
            with indent(4):
                puts_err(colored.blue("\nGenerating " + tree_type +
                                      " Tree\n"))
            comm = ["muscle", "-maketree", "-in", "-", "-cluster", tree_type]
            # Text-mode pipes: on Python 3 the str FASTA can be sent and a str
            # tree comes back; on Python 2 this only normalises newlines.
            tree, err = Popen(comm,
                              stdin=PIPE,
                              stdout=PIPE,
                              universal_newlines=True).communicate(input=fasta)

            # output tree
            print(tree)

            if args["--plot"]:
                # Imported lazily so they are only needed when plotting.
                from jinja2 import Template
                import webbrowser
                import tempfile
                prefix = os.path.dirname(
                    os.path.abspath(
                        sys.modules['vcfkit'].__file__)) + "/static"
                with open(prefix + "/tree.html", 'r') as template_file:
                    template = template_file.read()
                tree_template = Template(template)
                # mode="w" so the rendered text can be written on Python 3.
                html_out = tempfile.NamedTemporaryFile(mode="w",
                                                       suffix=".html",
                                                       delete=False)
                with html_out as f:
                    tree = tree.replace("\n", "")
                    sample_len = len(v.samples)
                    # The template receives every local by name, so the local
                    # variable names in this function are part of its input.
                    f.write(tree_template.render(**locals()))
                # Open only after the file has been flushed and closed.
                webbrowser.open("file://" + html_out.name)
""" Converts Genotype likelyhoods to phred scaled (PL) genotype likelyhoods. """ return -int(gl * 10) debug = None if len(sys.argv) == 1: debug = ['primer', "--ref=WBcel235", "test.vcf.gz"] if __name__ == '__main__': # print debug args = docopt(__doc__, version='VCF-Toolbox v0.1', argv=debug, options_first=False) # Locate Reference v = vcf(args["<vcf>"]) format_added = False if args["transfer-filter"]: for line in v.output_raw(): line = line.strip() if line.startswith("#CHROM"): # Get Sample information and count samples = line.strip().split("\t")[9:] elif line.startswith("#"): # Add Info line for het polarization flag if line.startswith("##FORMAT") and format_added is False: format_added = True line = line + "\n##FORMAT=<ID=FT,Number=1,Type=String,Description=\"Genotype-level filter\">" else: line = line.split("\t") FILTER = line[6]
def main(debug=None):
    """BLAST each sequence record against a reference and print the variants.

    Every record parsed from ``<seq>`` is passed to ``blast.blast_call``; the
    resulting variants are printed one per line under the
    ``blast_variant.output_order`` header.

    When ``<vcf>`` is given, the VCF records in the region of the record's
    first usable BLAST variant are loaded for the resolved sample, and each
    BLAST variant found at the same (CHROM, POS) is classified by comparing
    the reference allele, the sequenced genotype and the VCF genotype:

    ========================  ==================  ==============
    seq_gt equals REF         seq_gt equals VCF   classification
    ========================  ==================  ==============
    yes                       yes                 TN
    no                        yes                 TP
    yes                       no                  FP
    no                        no                  FN
    ========================  ==================  ==============

    Output selection:

    * with ``<vcf>``: a line is printed when ``--vcf-sites`` is set and the
      variant was classified, or when ``--all-sites`` is set;
    * without ``<vcf>``: a line is printed when ``--all-sites`` is set or the
      variant reports ``is_variant``.

    Args:
        debug: Optional argv list handed to docopt instead of ``sys.argv``.

    Raises:
        SystemExit: with status 1 when ``--vcf-sites`` is used without
            ``<vcf>``.
    """
    args = docopt(__doc__,
                  version='VCF-Toolbox v0.1',
                  argv=debug,
                  options_first=False)

    # Validate the option combination before touching any file.
    if args["--vcf-sites"] and args["<vcf>"] is None:
        with indent(4):
            puts_err(colored.red("\nMust specify <vcf> with --vcf-sites\n"))
        # exit(puts_err(...)) passed puts_err's return value as the status
        # (presumably None, i.e. success); report the failure explicitly.
        raise SystemExit(1)

    reference = resolve_reference_genome(args["--ref"])

    # Without a VCF there are no sample names to resolve against; previously
    # `samples` was left unbound and the first record raised NameError.
    # NOTE(review): assumes resolve_sample_from_line accepts an empty list.
    samples = []
    if args["<vcf>"]:
        v = vcf(args["<vcf>"])
        samples = v.samples

    # Setup reference for blast call
    b = blast(reference)

    # Set file type:
    sequence_file_type = seq_type(args["<seq>"])

    # The context manager closes the input even if parsing fails part-way.
    with open(args["<seq>"], "rb") as handle:
        # Output header
        print("\t".join(blast_variant.output_order))

        for record in SeqIO.parse(handle, sequence_file_type):
            # Resolve the sample from the file name first, then from the
            # record name.
            sample = resolve_sample_from_line(samples, handle.name)
            if not sample:
                sample = resolve_sample_from_line(samples, record.name)

            blast_results = b.blast_call(record)

            # Loaded lazily from the first non-None variant of this record.
            # Keying on position 0 left the list unbound (or stale from the
            # previous record) whenever the first result was None.
            vcf_variants = None

            for variant in blast_results:
                if variant is None:
                    # `sample` may be None or empty when nothing matched.
                    puts_err(colored.red("No Results for " + (sample or "") +
                                         " " + record.description))
                    continue

                output_line = False

                if args["<vcf>"]:
                    if vcf_variants is None:
                        vcf_variants = []
                        for vcf_variant in v(variant.region()):
                            if sample:
                                gt = format_gt(vcf_variant.gt_bases[
                                    v.samples.index(sample)])
                                vcf_variants.append([vcf_variant.CHROM,
                                                     vcf_variant.POS,
                                                     gt,
                                                     vcf_variant.REF,
                                                     vcf_variant.ALT])

                    chrom_pos = variant.chrom_pos_allele()[0:2]
                    vcf_variant_match = [x for x in vcf_variants
                                         if x[0:2] == chrom_pos]

                    if vcf_variant_match:
                        vcf_variant_match = vcf_variant_match[0]
                        variant.vcf_gt = vcf_variant_match[2]
                        variant.REF = vcf_variant_match[3]
                        variant.ALT = ','.join(vcf_variant_match[4])
                        variant.fetch_variant_type()

                        seq_is_ref = variant.REF == variant.seq_gt
                        seq_is_vcf = variant.seq_gt == variant.vcf_gt
                        if seq_is_ref and seq_is_vcf:
                            variant.classification = "TN"
                        elif seq_is_vcf:
                            variant.classification = "TP"
                        elif seq_is_ref:
                            variant.classification = "FP"
                        else:
                            variant.classification = "FN"
                    else:
                        variant.REF = ""
                        variant.ALT = ""
                        variant.fetch_variant_type()
                        variant.classification = ""

                    if args["--vcf-sites"] and variant.classification != "":
                        output_line = True
                    elif args["--all-sites"] is True:
                        output_line = True
                else:
                    if args["--all-sites"]:
                        output_line = True
                    elif variant.is_variant:
                        output_line = True

                if output_line:
                    variant.sample = sample
                    if record.description:
                        variant.description = record.description
                    else:
                        variant.description = os.path.split(handle.name)[1]
                    # Function-call form is valid on Python 2 and Python 3.
                    print(str(variant))
tgt = '/'.join([gt_dict[int(x)] for x in re.split("[\|/]", val["GT"])]) return tgt debug = None if len(sys.argv) == 1: debug = ['vcf2sql', "test.vcf.gz"] if __name__ == '__main__': args = docopt(__doc__, argv=debug, options_first=False) timestamp = datetime.datetime.now() module_path = os.path.split(os.path.realpath(__file__))[0] v = vcf(args["<vcf>"]) vcf_safe = v.filename.replace(".", "_") tsv_out = v.filename.replace("vcf", "tsv").replace( "bcf", "tsv").replace(".gz", "") + ".gz" info_cols = [map(autoconvert, list(x)) + ["INFO"] for x in r_info.findall(v.raw_header)] format_cols = [map(autoconvert, list(x)) + ["FORMAT"] for x in r_format.findall(v.raw_header)] if args["--simple"]: info_cols = [x for x in info_cols if x[0] in simple_fields] format_cols = [x for x in format_cols if x[0] in simple_fields] if args["sqlite"]: db = SqliteDatabase(args["--db"])
def main(debug = None):
    """Filter, or soft-filter, VCF records on genotype counts or frequencies.

    One genotype class (``REF``, ``HET``, ``ALT`` or ``MISSING``) is taken
    from the parsed arguments.  ``--min`` and ``--max`` give the thresholds:
    a value with a fractional part is compared against the per-record
    frequency of that class, a whole number against its count.  A record is
    "filtered" when it falls below ``--min`` or above ``--max``.

    * Without ``--soft-filter`` filtered records are dropped.
    * With ``--soft-filter NAME`` every record is written and the FILTER
      column (index 6) is edited according to ``--mode``:
      ``+`` appends NAME to the existing filters of a filtered record;
      ``x`` sets FILTER to NAME for filtered records and ``PASS`` otherwise.
      A matching ``##FILTER`` header line is added.

    Args:
        debug: Optional argv list handed to docopt instead of ``sys.argv``.
    """
    args = docopt(__doc__,
                  version='VCF-Toolbox v0.1',
                  argv = debug,
                  options_first=False)

    if args["--soft-filter"] and not args["--mode"]:
        exit(message("Must Specify --mode with soft-filter"))

    v = vcf(args["<vcf>"])
    # Float so the frequency divisions below are true divisions on Python 2.
    n_samples = len(v.samples) * 1.0

    # The genotype class is whichever argument value is one of these literals.
    filter_s = [x for x in args.values()
                if x in ["REF", "HET", "ALT", "MISSING"]][0]

    def parse_threshold(raw):
        # Fractional value -> frequency threshold; whole number -> count.
        as_float = float(raw)
        if int(as_float) != as_float:
            return "r_" + filter_s, as_float, "FREQUENCY"
        return filter_s, int(as_float), "COUNT"

    # Human-readable conditions for the ##FILTER description.
    conditions = []
    if args["--min"]:
        filter_key_min, filter_val_min, filter_type = parse_threshold(
            args["--min"])
        conditions.append("{0}({1}) < {2}".format(
            filter_type, filter_s, filter_val_min))
    if args["--max"]:
        filter_key_max, filter_val_max, filter_type = parse_threshold(
            args["--max"])
        conditions.append("{0}({1}) > {2}".format(
            filter_type, filter_s, filter_val_max))

    # Output header
    header = v.raw_header.splitlines()
    if args["--soft-filter"]:
        # Describe every active bound.  Interpolating locals() raised KeyError
        # when neither bound was given and hid --min when both were.
        description = " or ".join(conditions) or "never (no --min/--max given)"
        filter_line = '##FILTER=<ID={0},Description="Apply filter if {1}">'.format(
            args["--soft-filter"], description)
        filter_rows = [n for n, row in enumerate(header)
                       if row.startswith("##FILTER")]
        if filter_rows:
            insert_at = filter_rows[0] + 1
        else:
            # No existing ##FILTER line: the definition used to be dropped
            # silently.  Place it just before the column header instead.
            chrom_rows = [n for n, row in enumerate(header)
                          if row.startswith("#CHROM")]
            insert_at = chrom_rows[0] if chrom_rows else len(header)
        header.insert(insert_at, filter_line)
    sys.stdout.write('\n'.join(header) + "\n")

    for record in v:
        f = {
            "ALT": record.num_hom_alt,
            "HET": record.num_het,
            "REF": record.num_hom_ref,
            "MISSING": int(n_samples - record.num_called),
        }
        for key in ("ALT", "HET", "REF", "MISSING"):
            f["r_" + key] = f[key] / n_samples

        filtered = False
        if args["--min"] and f[filter_key_min] < filter_val_min:
            filtered = True
        if args["--max"] and f[filter_key_max] > filter_val_max:
            filtered = True

        if args["--soft-filter"]:
            fields = str(record).split("\t")
            if args["--mode"] == "x":
                # Replace mode discards any previous FILTER value.
                fields[6] = "PASS"
            if filtered:
                if args["--mode"] == "+":
                    # "PASS" and "." both mean "no filter recorded"; joining
                    # onto "." would give the invalid value ".;NAME".
                    if fields[6] in ("PASS", "."):
                        fields[6] = ""
                    fields[6] = ';'.join(
                        [fields[6]] + [args["--soft-filter"]]).strip(";")
                elif args["--mode"] == "x":
                    fields[6] = args["--soft-filter"]
            sys.stdout.write('\t'.join(fields))
        elif not filtered:
            sys.stdout.write(str(record))
def main(debug=None):
    """Print an aligned FASTA, or a muscle-built tree, from the SNPs of a VCF.

    For every record flagged ``is_snp`` the first character of each sample's
    ``gt_bases`` entry is taken (``.`` is written as ``N``), giving one
    sequence per sample.

    * ``fasta``: print the sequences as FASTA records.
    * ``tree``: pipe the FASTA into ``muscle -maketree`` (``upgma`` clustering
      unless ``nj`` is set, then ``neighborjoining``) and print the result.
      With ``--plot`` the tree is also rendered through ``static/tree.html``
      into a temporary HTML file and opened in the web browser.

    Args:
        debug: Optional argv list handed to docopt instead of ``sys.argv``.

    Raises:
        SystemExit: with status 1 when the VCF has fewer than two samples or
            no SNP genotypes were collected.
    """
    args = docopt(__doc__,
                  argv=debug,
                  options_first=False,
                  version=__version__)

    def first(s):
        # First character of a genotype string; "." is rewritten to "N".
        return s[0].replace(".", "N")

    firstv = np.vectorize(first)

    v = vcf(args["<vcf>"])

    if len(v.samples) <= 1:
        puts_err(colored.red("\n\tVCF must have at least two samples.\n"))
        # exit(puts_err(...)) passed puts_err's return value as the status
        # (presumably None, i.e. success); report the failure explicitly.
        raise SystemExit(1)

    if args["<region>"]:
        variant_set = v(args["<region>"])
    else:
        variant_set = v

    if args["fasta"] or args["tree"]:
        # One row of per-sample bases for every SNP record.
        gt_set = []
        for line in variant_set:
            if line.is_snp:
                gt_set.append(firstv(line.gt_bases))
        if len(gt_set) == 0:
            puts_err("No genotypes")
            raise SystemExit(1)
        # Stack to (n_snps, n_samples); transposing yields one row per sample.
        gt_set = np.vstack(gt_set)
        seqs = zip(v.samples, np.transpose(gt_set))

        if args["fasta"]:
            for sample, seq in seqs:
                print(">" + sample)
                print(''.join(seq))

        elif args["tree"]:
            # Check for muscle dependency
            check_program_exists("muscle")
            with indent(4):
                puts_err(colored.blue("\nGenerating Fasta\n"))
            fasta_records = []
            for sample, seq in seqs:
                fasta_records.append(
                    ">" + sample + "\n" + ''.join(seq) + "\n")
            fasta = "".join(fasta_records)

            tree_type = "upgma"  # default is upgma
            if args["nj"]:
                tree_type = "neighborjoining"
            with indent(4):
                puts_err(colored.blue("\nGenerating " + tree_type +
                                      " Tree\n"))
            comm = ["muscle", "-maketree", "-in", "-", "-cluster", tree_type]
            # Text-mode pipes: on Python 3 the str FASTA can be sent and a str
            # tree comes back; on Python 2 this only normalises newlines.
            tree, err = Popen(comm,
                              stdin=PIPE,
                              stdout=PIPE,
                              universal_newlines=True).communicate(input=fasta)

            # output tree
            print(tree)

            if args["--plot"]:
                # Imported lazily so they are only needed when plotting.
                from jinja2 import Template
                import webbrowser
                import tempfile
                prefix = os.path.dirname(
                    os.path.abspath(
                        sys.modules['vcfkit'].__file__)) + "/static"
                with open(prefix + "/tree.html", 'r') as template_file:
                    template = template_file.read()
                tree_template = Template(template)
                # mode="w" so the rendered text can be written on Python 3.
                html_out = tempfile.NamedTemporaryFile(mode="w",
                                                       suffix=".html",
                                                       delete=False)
                with html_out as f:
                    tree = tree.replace("\n", "")
                    sample_len = len(v.samples)
                    # The template receives every local by name, so the local
                    # variable names in this function are part of its input.
                    f.write(tree_template.render(**locals()))
                # Open only after the file has been flushed and closed.
                webbrowser.open("file://" + html_out.name)
def main(debug=None):
    """Print an aligned FASTA, or a muscle-built tree, from the SNPs of a VCF.

    For every record flagged ``is_snp`` the last character of each sample's
    ``gt_bases`` entry is appended to that sample's sequence, with ``.``
    written as ``-``.

    * ``fasta`` (without ``tree``): print the sequences as FASTA records.
    * ``tree``: pipe the FASTA into ``muscle -maketree`` (``upgma`` clustering
      unless ``nj`` is set, then ``neighborjoining``) and print the result.
      With ``--plot`` the tree is also rendered through ``static/tree.html``
      into a temporary HTML file and opened in the web browser.

    Sequences are emitted in the sample order of the VCF.

    Args:
        debug: Optional argv list handed to docopt instead of ``sys.argv``.
    """
    args = docopt(__doc__,
                  argv=debug,
                  options_first=False,
                  version=__version__)

    # Not used below, but kept: every local is passed to the plot template.
    module_path = os.path.split(os.path.realpath(__file__))[0]
    v = vcf(args["<vcf>"])
    samples = v.samples
    _ROOT = os.path.split(os.path.dirname(vk.__file__))[0]

    if args["fasta"] or args["tree"]:
        # One growing list of bases per sample.
        seqs = {}
        for sample in samples:
            seqs[sample] = []
        for line in v:
            if line.is_snp:
                non_missing = [x.replace(".", "-") for x in line.gt_bases]
                sample_gt = zip(samples, [x[-1] for x in non_missing])
                for sample, gt in sample_gt:
                    seqs[sample].append(gt)

        if not args["tree"]:
            # Walk `samples`, not the dict, so the output follows VCF sample
            # order even where dict iteration order is arbitrary.
            for sample in samples:
                seq = seqs[sample]
                print(">" + sample)
                print(''.join(seq))

        elif args["tree"]:
            # Check for muscle dependency
            check_program_exists("muscle")
            with indent(4):
                puts_err(colored.blue("\nGenerating Fasta\n"))
            fasta_records = []
            for sample in samples:
                seq = seqs[sample]
                fasta_records.append(
                    ">" + sample + "\n" + ''.join(seq) + "\n")
            fasta = "".join(fasta_records)

            tree_type = "upgma"  # default is upgma
            if args["nj"]:
                tree_type = "neighborjoining"
            with indent(4):
                puts_err(colored.blue("\nGenerating " + tree_type +
                                      " Tree\n"))
            comm = ["muscle", "-maketree", "-in", "-", "-cluster", tree_type]
            # Text-mode pipes: on Python 3 the str FASTA can be sent and a str
            # tree comes back; on Python 2 this only normalises newlines.
            tree, err = Popen(comm,
                              stdin=PIPE,
                              stdout=PIPE,
                              universal_newlines=True).communicate(input=fasta)
            print(tree)

            if args["--plot"]:
                # Imported lazily so they are only needed when plotting.
                from jinja2 import Template
                import webbrowser
                import tempfile
                prefix = _ROOT + "/static"
                with open(_ROOT + "/static/tree.html", 'r') as template_file:
                    tree_template = Template(template_file.read())
                # mode="w" so the rendered text can be written on Python 3.
                html_out = tempfile.NamedTemporaryFile(mode="w",
                                                       suffix=".html",
                                                       delete=False)
                with html_out as f:
                    tree = tree.replace("\n", "")
                    sample_len = len(samples)
                    # The template receives every local by name, so the local
                    # variable names in this function are part of its input.
                    f.write(tree_template.render(**locals()))
                # Open only after the file has been flushed and closed.
                webbrowser.open("file://" + html_out.name)
def main(debug=None):
    """BLAST each sequence record against a reference and print the variants.

    Every record parsed from ``<seq>`` is passed to ``blast.blast_call``; the
    resulting variants are printed one per line under the
    ``blast_variant.output_order`` header.

    When ``<vcf>`` is given, the VCF records in the region of the record's
    first usable BLAST variant are loaded for the resolved sample, and each
    BLAST variant found at the same (CHROM, POS) is classified by comparing
    the reference allele, the sequenced genotype and the VCF genotype:

    ========================  ==================  ==============
    seq_gt equals REF         seq_gt equals VCF   classification
    ========================  ==================  ==============
    yes                       yes                 TN
    no                        yes                 TP
    yes                       no                  FP
    no                        no                  FN
    ========================  ==================  ==============

    Output selection:

    * with ``<vcf>``: a line is printed when ``--vcf-sites`` is set and the
      variant was classified, or when ``--all-sites`` is set;
    * without ``<vcf>``: a line is printed when ``--all-sites`` is set or the
      variant reports ``is_variant``.

    Args:
        debug: Optional argv list handed to docopt instead of ``sys.argv``.

    Raises:
        SystemExit: with status 1 when ``--vcf-sites`` is used without
            ``<vcf>``.
    """
    args = docopt(__doc__,
                  version='VCF-Toolbox v0.1',
                  argv=debug,
                  options_first=False)

    # Validate the option combination before touching any file.
    if args["--vcf-sites"] and args["<vcf>"] is None:
        with indent(4):
            puts_err(colored.red("\nMust specify <vcf> with --vcf-sites\n"))
        # exit(puts_err(...)) passed puts_err's return value as the status
        # (presumably None, i.e. success); report the failure explicitly.
        raise SystemExit(1)

    reference = resolve_reference_genome(args["--ref"])

    # Without a VCF there are no sample names to resolve against; previously
    # `samples` was left unbound and the first record raised NameError.
    # NOTE(review): assumes resolve_sample_from_line accepts an empty list.
    samples = []
    if args["<vcf>"]:
        v = vcf(args["<vcf>"])
        samples = v.samples

    # Setup reference for blast call
    b = blast(reference)

    # Set file type:
    sequence_file_type = seq_type(args["<seq>"])

    # The context manager closes the input even if parsing fails part-way.
    with open(args["<seq>"], "rb") as handle:
        # Output header
        print("\t".join(blast_variant.output_order))

        for record in SeqIO.parse(handle, sequence_file_type):
            # Resolve the sample from the file name first, then from the
            # record name.
            sample = resolve_sample_from_line(samples, handle.name)
            if not sample:
                sample = resolve_sample_from_line(samples, record.name)

            blast_results = b.blast_call(record)

            # Loaded lazily from the first non-None variant of this record.
            # Keying on position 0 left the list unbound (or stale from the
            # previous record) whenever the first result was None.
            vcf_variants = None

            for variant in blast_results:
                if variant is None:
                    # `sample` may be None or empty when nothing matched.
                    puts_err(colored.red("No Results for " + (sample or "") +
                                         " " + record.description))
                    continue

                output_line = False

                if args["<vcf>"]:
                    if vcf_variants is None:
                        vcf_variants = []
                        for vcf_variant in v(variant.region()):
                            if sample:
                                gt = format_gt(vcf_variant.gt_bases[
                                    v.samples.index(sample)])
                                vcf_variants.append([vcf_variant.CHROM,
                                                     vcf_variant.POS,
                                                     gt,
                                                     vcf_variant.REF,
                                                     vcf_variant.ALT])

                    chrom_pos = variant.chrom_pos_allele()[0:2]
                    vcf_variant_match = [x for x in vcf_variants
                                         if x[0:2] == chrom_pos]

                    if vcf_variant_match:
                        vcf_variant_match = vcf_variant_match[0]
                        variant.vcf_gt = vcf_variant_match[2]
                        variant.REF = vcf_variant_match[3]
                        variant.ALT = ','.join(vcf_variant_match[4])
                        variant.fetch_variant_type()

                        seq_is_ref = variant.REF == variant.seq_gt
                        seq_is_vcf = variant.seq_gt == variant.vcf_gt
                        if seq_is_ref and seq_is_vcf:
                            variant.classification = "TN"
                        elif seq_is_vcf:
                            variant.classification = "TP"
                        elif seq_is_ref:
                            variant.classification = "FP"
                        else:
                            variant.classification = "FN"
                    else:
                        variant.REF = ""
                        variant.ALT = ""
                        variant.fetch_variant_type()
                        variant.classification = ""

                    if args["--vcf-sites"] and variant.classification != "":
                        output_line = True
                    elif args["--all-sites"] is True:
                        output_line = True
                else:
                    if args["--all-sites"]:
                        output_line = True
                    elif variant.is_variant:
                        output_line = True

                if output_line:
                    variant.sample = sample
                    if record.description:
                        variant.description = record.description
                    else:
                        variant.description = os.path.split(handle.name)[1]
                    # Function-call form is valid on Python 2 and Python 3.
                    print(str(variant))