def split(args):
    """Plan the chunks for variant calling and their resource requests.

    The variant-calling mode is resolved through ``tk_io.get_vc_mode`` from
    ``args.vc_precalled`` and ``args.variant_mode``.

    Args:
        args: stage arguments; this function reads ``vc_precalled``,
            ``variant_mode``, ``reference_path``, ``input`` and
            ``restrict_locus``.

    Returns:
        dict: ``{'chunks': [...]}`` where each entry is a chunk definition.

    Raises:
        NotSupportedException: if a caller has to be run and it is neither
            ``'freebayes'`` nor ``'gatk'``.
    """
    vc_mode, variant_caller, precalled_filename, gatk_path = tk_io.get_vc_mode(
        args.vc_precalled, args.variant_mode)

    precalled_file = None
    if vc_mode in ("precalled", "precalled_plus"):
        mem_gb = 8
        threads = 1
        # Copy the pre-called VCF into the stage directory and index the copy.
        precalled_file = martian.make_path("precalled_vcf.vcf")
        tenkit.log_subprocess.check_call(
            ['cp', precalled_filename, precalled_file])
        tk_tabix.index_vcf(precalled_file)
        # Chunks are handed the ".gz" name (presumably index_vcf leaves a
        # compressed file next to the index -- confirm against tk_tabix).
        precalled_file = precalled_file + ".gz"

    # Any mode other than pure "precalled" runs a caller; its resource
    # request overrides the one chosen above (relevant for "precalled_plus").
    if vc_mode != "precalled":
        if variant_caller == 'freebayes':
            mem_gb = 5
            threads = 1
        elif variant_caller == "gatk":
            mem_gb = 8
            threads = 2
            # make sure the gatk jar file exists
            if gatk_path is None:
                martian.throw(
                    "variant_caller 'gatk' selected, must supply path to gatk jar file -- e.g. \"gatk:/path/to/GenomeAnalysisTK.jar\""
                )
            gatk_loc = gatk_path
            if not os.path.exists(gatk_loc):
                martian.throw(
                    "variant_caller 'gatk' selected, gatk jar file does not exist: %s"
                    % gatk_loc)
        else:
            # Report the caller that was rejected (the message used to print
            # vc_mode, which is not the unsupported value).
            raise NotSupportedException(
                'Variant caller not supported: {}'.format(variant_caller))

    primary_contigs = tk_reference.load_primary_contigs(args.reference_path)
    bam_chunk_size_gb = 3.0

    if args.restrict_locus is None:
        loci = tk_chunks.get_sized_bam_chunks(
            args.input,
            bam_chunk_size_gb,
            contig_whitelist=primary_contigs,
            extra_args={
                '__mem_gb': mem_gb,
                '__threads': threads,
                'split_input': precalled_file,
            })
    else:
        # NOTE(review): the restricted-locus chunk carries neither the
        # resource request nor 'split_input' -- confirm this is intended.
        loci = [{'locus': args.restrict_locus}]

    return {'chunks': loci}
def combine_vcfs(output_filename, input_vcf_filenames):
    """Concatenate VCF files into one file, then sort and index it.

    The first input is copied in full (header included). From every later
    input only the lines that do not start with ``#`` are appended, so the
    combined file keeps a single header. The concatenation is written to
    ``output_filename + ".tmp"``, passed to ``tk_tabix.sort_vcf`` to produce
    ``output_filename``, which is then indexed with ``tk_tabix.index_vcf``.

    Args:
        output_filename: path of the sorted VCF to produce.
        input_vcf_filenames: sequence of VCF paths to combine, in order.

    Raises:
        subprocess.CalledProcessError: if copying the first file fails.
        Exception: if ``grep`` reports an error (exit status 2) on a later
            file.
    """
    tmp_filename = output_filename + ".tmp"

    try:
        for i, fn in enumerate(input_vcf_filenames):
            if i == 0:
                # Argument lists plus Python-side redirection: file names are
                # never interpreted by a shell, so spaces and metacharacters
                # in paths are safe.
                with open(tmp_filename, 'w') as tmp_file:
                    subprocess.check_call(['cat', '--', fn], stdout=tmp_file)
            else:
                with open(tmp_filename, 'a') as tmp_file:
                    ret = subprocess.call(
                        ['grep', '-v', '^#', '--', fn], stdout=tmp_file)
                # grep exits 1 when no line is selected (a header-only VCF),
                # which is fine; only status 2 signals a real error.
                if ret == 2:
                    raise Exception(
                        "grep call failed: " + 'grep -v "^#" ' + fn)

        # Sort and index the files
        tk_tabix.sort_vcf(tmp_filename, output_filename)
        tk_tabix.index_vcf(output_filename)
    finally:
        # Do not leave the intermediate file behind when a step fails.
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)
def join(args, outs, chunk_defs, chunk_outs):
    """Merge the per-chunk VCF subsets into one sorted, indexed VCF.

    Chunk outputs are grouped by the ``cluster_id`` of their chunk
    definition. Each group is combined with ``tk_io.combine_vcfs``; the
    per-cluster results are then merged with the external ``vcf-merge``
    command, and the merged file is sorted and indexed at ``outs.variants``
    with a trailing ``.gz`` removed.

    Args:
        args: stage arguments (not read here).
        outs: stage outputs; ``outs.variants`` is the destination path.
        chunk_defs: chunk definitions, each exposing ``cluster_id``.
        chunk_outs: chunk outputs, each exposing ``variant_subset``; paired
            positionally with ``chunk_defs``.

    Raises:
        subprocess.CalledProcessError: if ``vcf-merge`` exits non-zero.
    """
    # mapping of cluster ID -> VCFs
    to_merge = collections.defaultdict(list)
    for chunk_out, chunk_def in zip(chunk_outs, chunk_defs):
        to_merge[chunk_def.cluster_id].append(chunk_out.variant_subset)

    # merge each VCF subset for a cluster
    merged_vcfs = []
    # .items() instead of the Python 2-only .iteritems(): same iteration
    # there, and it also runs on Python 3.
    for cluster_id, vcf_list in to_merge.items():
        merged_vcf = martian.make_path('{}.vcf'.format(cluster_id))
        tk_io.combine_vcfs(merged_vcf, vcf_list)
        # The merge step below is given the ".gz" name of each combined file.
        merged_vcfs.append(merged_vcf + '.gz')

    # final merge to make one combined VCF
    tmp = martian.make_path('tmp.vcf')
    cmd = ['vcf-merge'] + merged_vcfs
    with open(tmp, 'w') as outf:
        subprocess.check_call(cmd, stdout=outf)

    # Strip only a trailing ".gz"; str.replace would also remove ".gz" found
    # anywhere else in the path (e.g. in a directory name).
    sorted_path = outs.variants
    if sorted_path.endswith('.gz'):
        sorted_path = sorted_path[:-len('.gz')]

    # Sort and index the files
    tk_tabix.sort_vcf(tmp, sorted_path)
    tk_tabix.index_vcf(sorted_path)
    os.remove(tmp)
def main_sort_variants(args, outs):
    """Sort and index the input VCF(s), or report that there is no output.

    ``outs.default`` is set to ``None`` when there is nothing to process:
    the input is ``None`` or ``[None]``, no file in an input list exists, or
    a single input path does not exist. Otherwise the output is written to
    ``outs.default`` minus its last three characters (presumably a ``.gz``
    suffix -- confirm against the stage definition).

    Args:
        args: stage arguments; ``args.input`` is a path, a list of paths, or
            ``None``.
        outs: stage outputs; ``outs.default`` is read as the destination and
            cleared when no output is produced.
    """
    if args.input is None or args.input == [None]:
        outs.default = None
        return

    outs.coerce_strings()
    sort_filename = outs.default[:-3]

    # List of inputs
    if isinstance(args.input, list):
        # Skip None entries before touching the filesystem:
        # os.path.isfile(None) raises instead of returning False.
        files = [
            f for f in args.input if f is not None and os.path.isfile(f)
        ]
        if not files:
            # Was misspelled "outs.defaut", which left outs.default pointing
            # at a file that is never written.
            outs.default = None
            return
        # Combine only the files that exist; the unfiltered list used to be
        # passed, which fails as soon as one input is missing.
        tk_io.combine_vcfs(sort_filename, files)
        return

    # Single input
    if not os.path.exists(args.input):
        outs.default = None
        return
    tk_tabix.sort_vcf(args.input, sort_filename)
    tk_tabix.index_vcf(sort_filename)