def varsim_somatic_main():
    """Run the VarSim somatic workflow from the command line.

    Steps, in order:

    1. Parse the command-line options and set up logging to
       ``<log_dir>/varsim.log``.
    2. Unless ``--disable_rand_vcf`` is given, sample somatic variants from the
       COSMIC VCF with ``run_randvcf`` into ``<out_dir>/random.cosmic.vcf``.
    3. Copy every somatic VCF into ``<out_dir>/somatic_vcfs/<index>.vcf``,
       tagging the ID column of each record with ``VARSIMSOMATIC<n>``.
    4. Launch the main VarSim script (``VARSIM_PY``) on the somatic + normal
       VCFs, ordered according to ``--merge_priority``.
    5. Split ``<out_dir>/<id>.truth.vcf`` into ``<id>_norm.vcf`` and
       ``<id>_somatic.vcf`` depending on whether a record carries the
       ``VARSIMSOMATIC`` tag.

    Exits with ``os.EX_USAGE`` when an option required by an enabled stage is
    missing.
    """
    main_parser = argparse.ArgumentParser(
        description="VarSim: somatic workflow",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    main_parser.add_argument("--out_dir", metavar="Out directory",
                             help="Output directory", default="somatic_out")
    main_parser.add_argument("--work_dir", metavar="Work directory",
                             help="Work directory", default="somatic_work")
    main_parser.add_argument("--log_dir", metavar="Log directory",
                             help="Directory to log to", default="somatic_log")
    main_parser.add_argument("--reference", metavar="FASTA",
                             help="Reference genome", required=True, type=file)
    main_parser.add_argument("--seed", metavar="INT",
                             help="Random number seed", type=int, default=0)
    main_parser.add_argument("--sex", metavar="Sex",
                             help="Sex of the person (MALE/FEMALE)",
                             required=False, type=str,
                             choices=["MALE", "FEMALE"], default="MALE")
    main_parser.add_argument("--id", metavar="id", help="Sample ID",
                             required=True)
    main_parser.add_argument("--simulator", metavar="simulator",
                             help="Read simulator to use", required=False,
                             type=str, choices=["art", "dwgsim"],
                             default="art")
    main_parser.add_argument(
        "--simulator_executable", metavar="PATH",
        help="Path to the executable of the read simulator chosen",
        required=True, type=file)
    main_parser.add_argument("--varsim_jar", metavar="PATH",
                             help="Path to VarSim.jar (deprecated)",
                             type=file, default=None, required=False)
    main_parser.add_argument("--read_length", metavar="INT",
                             help="Length of read to simulate", default=100,
                             type=int)
    main_parser.add_argument(
        "--nlanes", metavar="INT",
        help="Number of lanes to generate, coverage will be divided evenly over the lanes. Simulation is parallized over lanes. Each lane will have its own pair of files",
        default=3, type=int)
    main_parser.add_argument("--total_coverage", metavar="FLOAT",
                             help="Total coverage to simulate", default=1.0,
                             type=float)
    main_parser.add_argument("--mean_fragment_size", metavar="INT",
                             help="Mean fragment size", default=350, type=int)
    main_parser.add_argument("--sd_fragment_size", metavar="INT",
                             help="Standard deviation of fragment size",
                             default=50, type=int)
    main_parser.add_argument("--force_five_base_encoding",
                             action="store_true",
                             help="Force bases to be ACTGN")
    main_parser.add_argument("--filter", action="store_true",
                             help="Only use PASS variants")
    main_parser.add_argument("--keep_temp", action="store_true",
                             help="Keep temporary files")
    main_parser.add_argument("--java_max_mem", metavar="XMX",
                             help="max java memory", default="10g", type=str)
    main_parser.add_argument("--java", metavar="PATH", help="path to java",
                             default="java", type=str)
    main_parser.add_argument("--python", metavar="PATH",
                             help="path to python", default="python",
                             type=str)
    main_parser.add_argument('--version', action='version',
                             version=get_version())

    input_vcf_group = main_parser.add_argument_group("Input VCFs options")
    input_vcf_group.add_argument(
        "--cosmic_vcf", metavar="VCF",
        help="COSMIC database VCF. Need to specify when random COSMIC sampling is enabled.")
    input_vcf_group.add_argument("--normal_vcf", metavar="VCF",
                                 help="Normal VCF from previous VarSim run",
                                 required=True)
    input_vcf_group.add_argument("--somatic_vcfs", metavar="VCF", nargs="+",
                                 help="Somatic VCF", default=[])
    input_vcf_group.add_argument(
        "--merge_priority", choices=["sn", "ns"],
        help="Priority of merging (lowest first) somatic (s) and normal truth (n).",
        default="sn")

    pipeline_control_group = main_parser.add_argument_group(
        "Pipeline control options. Disable parts of the pipeline.")
    pipeline_control_group.add_argument("--disable_rand_vcf",
                                        action="store_true",
                                        help="Disable RandVCF2VCF somatic")
    pipeline_control_group.add_argument("--disable_vcf2diploid",
                                        action="store_true",
                                        help="Disable vcf2diploid")
    pipeline_control_group.add_argument("--disable_sim", action="store_true",
                                        help="Disable read simulation")

    # RandVCF2VCF seed num_SNP num_INS num_DEL num_MNP num_COMPLEX percent_novel min_length_lim max_length_lim reference_file file.vcf
    rand_vcf_group = main_parser.add_argument_group(
        "RandVCF2VCF somatic options")
    rand_vcf_group.add_argument("--som_num_snp", metavar="INT",
                                help="Number of somatic SNPs", default=9000,
                                type=int)
    rand_vcf_group.add_argument("--som_num_ins", metavar="INT",
                                help="Number of somatic insertions",
                                default=1000, type=int)
    rand_vcf_group.add_argument("--som_num_del", metavar="INT",
                                help="Number of somatic deletions",
                                default=1000, type=int)
    rand_vcf_group.add_argument("--som_num_mnp", metavar="INT",
                                help="Number of somatic MNPs", default=100,
                                type=int)
    rand_vcf_group.add_argument("--som_num_complex", metavar="INT",
                                help="Number of somatic complex variants",
                                default=100, type=int)
    # rand_vcf_group.add_argument("--som_percent_novel", metavar="percent_novel", help="Percent novel", default=0, type=float)
    rand_vcf_group.add_argument("--som_min_length_lim", metavar="INT",
                                help="Min length lim", default=0, type=int)
    rand_vcf_group.add_argument("--som_max_length_lim", metavar="INT",
                                help="Max length lim", default=49, type=int)
    # rand_vcf_group.add_argument("--som_vcf", metavar="in_vcf", help="Input somatic variant database VCF", type=file, required=False)
    rand_vcf_group.add_argument(
        "--som_prop_het", metavar="FLOAT",
        help="Proportion of somatic heterozygous variants", default=1.0,
        type=float)
    rand_vcf_group.add_argument(
        "--sv_insert_seq", metavar="FILE",
        help="Path to file containing concatenation of real insertion sequences",
        type=file, required=True)

    dwgsim_group = main_parser.add_argument_group("DWGSIM options")
    dwgsim_group.add_argument("--dwgsim_start_e",
                              metavar="first_base_error_rate",
                              help="Error rate on the first base",
                              default=0.0001, type=float)
    dwgsim_group.add_argument("--dwgsim_end_e",
                              metavar="last_base_error_rate",
                              help="Error rate on the last base",
                              default=0.0015, type=float)
    dwgsim_group.add_argument("--dwgsim_options",
                              help="DWGSIM command-line options", default="",
                              required=False)

    art_group = main_parser.add_argument_group("ART options")
    art_group.add_argument("--profile_1", metavar="profile_file1",
                           help="Profile for first end", default=None,
                           type=file)
    art_group.add_argument("--profile_2", metavar="profile_file2",
                           help="Profile for second end", default=None,
                           type=file)
    art_group.add_argument("--art_options", help="ART command-line options",
                           default="", required=False)

    args = main_parser.parse_args()

    args.java = utils.get_java(args.java)
    check_java(args.java)

    utils.JAVA_XMX = utils.JAVA_XMX + args.java_max_mem
    makedirs([args.log_dir, args.out_dir])

    FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
    logging.basicConfig(filename=os.path.join(args.log_dir, "varsim.log"),
                        filemode="w", level=logging.DEBUG, format=FORMAT)
    logger = logging.getLogger(varsim_somatic_main.__name__)

    if not args.disable_sim:
        if not args.simulator_executable:
            logger.error(
                "Please specify %s binary with --simulator_executable option"
                % args.simulator)
            sys.exit(os.EX_USAGE)
        check_executable(args.simulator_executable.name)

    t_s = time.time()

    cosmic_sampled_vcfs = []
    if not args.disable_rand_vcf:
        if not args.cosmic_vcf:
            logger.error(
                "COSMIC database VCF not specified using --cosmic_vcf")
            sys.exit(os.EX_USAGE)

        # Not able to support novel yet for COSMIC variants
        randvcf_options = RandVCFOptions(
            args.som_num_snp, args.som_num_ins, args.som_num_del,
            args.som_num_mnp, args.som_num_complex, 0,
            args.som_min_length_lim, args.som_max_length_lim,
            args.som_prop_het)
        # The context manager guarantees both handles are closed before the
        # sampled VCF is re-opened for reading by the tagging loop below.
        with open(os.path.join(args.out_dir, "random.cosmic.vcf"), "w") as rand_vcf_stdout, \
                open(os.path.join(args.log_dir, "random.cosmic.err"), "w") as rand_vcf_stderr:
            cosmic_sampled_vcfs = [rand_vcf_stdout.name]
            run_randvcf(os.path.realpath(args.cosmic_vcf), rand_vcf_stdout,
                        rand_vcf_stderr, args.seed, args.sex,
                        randvcf_options, args.reference.name, args.java)

    normal_vcfs = [args.normal_vcf]
    somatic_vcfs = cosmic_sampled_vcfs + args.somatic_vcfs
    fixed_somatic_vcfs = []
    if somatic_vcfs:
        vcfs_dir = os.path.join(args.out_dir, "somatic_vcfs")
        makedirs([vcfs_dir])
        # `count` runs across all somatic VCFs so every tag is unique.
        count = 0
        for index, vcf in enumerate(somatic_vcfs):
            copied_vcf = os.path.join(vcfs_dir, "%d.vcf" % index)
            logger.info(
                "Copying somatic VCF %s to %s and adding VARSIMSOMATIC id to entries if missing"
                % (vcf, copied_vcf))
            with open(vcf, "r") as vcf_fd, open(copied_vcf, "w") as copied_vcf_fd:
                for line in vcf_fd:
                    if line.startswith("#"):
                        copied_vcf_fd.write(line)
                        continue
                    line_fields = line.split("\t")
                    if len(line_fields) < 3:
                        # Blank or truncated line: there is no ID column to
                        # tag, so copy it through instead of raising
                        # IndexError.
                        copied_vcf_fd.write(line)
                        continue
                    if line_fields[2] == ".":
                        line_fields[2] = "VARSIMSOMATIC%d" % count
                    else:
                        line_fields[2] = "%s,VARSIMSOMATIC%d" % (
                            line_fields[2], count)
                    copied_vcf_fd.write("\t".join(line_fields))
                    count += 1
            fixed_somatic_vcfs.append(copied_vcf)

    if args.merge_priority == "sn":
        vcf_files = fixed_somatic_vcfs + normal_vcfs
    else:
        vcf_files = normal_vcfs + fixed_somatic_vcfs
    # A real list (not a lazy map object) is required for the concatenation
    # with ["--vcfs"] below.
    vcf_files = [os.path.realpath(vcf) for vcf in vcf_files if vcf]

    processes = run_vcfstats(vcf_files, args.out_dir, args.log_dir, args.java)

    # Run VarSim
    vcf_arg_list = ["--vcfs"] + vcf_files
    # need to fix the store true ones
    filter_arg_list = ["--filter"] if args.filter else []
    disable_sim_arg_list = ["--disable_sim"] if args.disable_sim else []
    force_five_base_encoding_arg_list = (
        ["--force_five_base_encoding"] if args.force_five_base_encoding else [])
    keep_temp_arg_list = ["--keep_temp"] if args.keep_temp else []
    profile_1_arg_list = (["--profile_1", args.profile_1.name]
                          if args.profile_1 is not None else [])
    profile_2_arg_list = (["--profile_2", args.profile_2.name]
                          if args.profile_2 is not None else [])

    other_varsim_opts = []
    if args.simulator == "dwgsim":
        other_varsim_opts = ["--dwgsim_start_e", str(args.dwgsim_start_e),
                             "--dwgsim_end_e", str(args.dwgsim_end_e)]
        if args.dwgsim_options:
            other_varsim_opts += ["--dwgsim_options",
                                  str(args.dwgsim_options)]
    elif args.simulator == "art" and args.art_options:
        other_varsim_opts += ["--art_options", args.art_options]

    args.python = utils.get_python(args.python)
    varsim_command = [
        args.python, os.path.realpath(VARSIM_PY),
        "--out_dir", str(os.path.realpath(args.out_dir)),
        "--work_dir", str(os.path.realpath(args.work_dir)),
        "--log_dir", str(os.path.realpath(os.path.join(args.log_dir, "varsim"))),
        "--reference", str(os.path.realpath(args.reference.name)),
        "--seed", str(args.seed),
        "--sex", str(args.sex),
        "--id", str(args.id),
        "--simulator", str(args.simulator),
        "--simulator_executable", str(args.simulator_executable.name),
        "--read_length", str(args.read_length),
        "--nlanes", str(args.nlanes),
        "--total_coverage", str(args.total_coverage),
        "--mean_fragment_size", str(args.mean_fragment_size),
        "--sd_fragment_size", str(args.sd_fragment_size),
        "--disable_rand_vcf",
        "--disable_rand_dgv",
        "--sv_insert_seq", args.sv_insert_seq.name,
    ] + other_varsim_opts + vcf_arg_list + filter_arg_list \
        + disable_sim_arg_list + force_five_base_encoding_arg_list \
        + keep_temp_arg_list + profile_1_arg_list + profile_2_arg_list

    # The argument vector is handed to Popen as a list (no shell), so paths
    # and simulator option strings containing spaces or shell metacharacters
    # reach the child as single, unmodified arguments.
    with open(os.path.join(args.log_dir, "som_varsim.out"), "w") as varsim_stdout, \
            open(os.path.join(args.log_dir, "som_varsim.log"), "w") as varsim_stderr:
        p_varsim = subprocess.Popen(varsim_command, stdout=varsim_stdout,
                                    stderr=varsim_stderr)
        logger.info("Executing command " + " ".join(varsim_command) +
                    " with pid " + str(p_varsim.pid))
        processes.append(p_varsim)

        processes = monitor_processes(processes)

    # Split the tumor truth VCF into normal variants and somatic variants
    tumor_vcf = os.path.realpath(
        os.path.join(args.out_dir, "%s.truth.vcf" % args.id))
    normal_vcf = os.path.join(args.out_dir, "%s_norm.vcf" % args.id)
    somatic_vcf = os.path.join(args.out_dir, "%s_somatic.vcf" % args.id)
    logger.info("Splitting the truth VCF %s into normal and somatic VCFs"
                % tumor_vcf)
    with open(tumor_vcf, "r") as tumor_truth_fd, \
            open(normal_vcf, "w") as normal_vcf_fd, \
            open(somatic_vcf, "w") as somatic_vcf_fd:
        for line in tumor_truth_fd:
            if line.startswith("#"):
                # Header lines go to both outputs.
                somatic_vcf_fd.write(line)
                normal_vcf_fd.write(line)
            elif "VARSIMSOMATIC" in line:
                somatic_vcf_fd.write(line)
            else:
                normal_vcf_fd.write(line)

    run_vcfstats([normal_vcf, somatic_vcf], args.out_dir, args.log_dir,
                 args.java)

    logger.info("Done! (%g hours)" % ((time.time() - t_s) / 3600.0))
def varsim_main(reference,
                simulator,  # use None to disable simulation
                simulator_exe,
                total_coverage,
                variant_vcfs=[],
                sampling_vcf=None,
                dgv_file=None,
                randvcf_options=None,  # use None to disable RandVCF
                randdgv_options=None,  # use None to disable RandDGV
                nlanes=1,
                simulator_options="",
                sample_id="VarSim_Sample",
                log_dir="log",
                out_dir="out",
                sv_insert_seq=None,
                seed=0,
                sex="MALE",
                remove_filtered=False,
                keep_temp=False,
                force_five_base_encoding=False,
                lift_ref=False,
                disable_vcf2diploid=False,
                java="java"):
    """Build a simulated genome from VCFs and optionally simulate reads.

    Stages, in order:

    1. Optionally fill in missing SV sequences of every input VCF
       (``fill_missing_sequences``) when ``sv_insert_seq`` is given.
    2. Optionally sample extra variants with ``run_randvcf`` /
       ``run_randdgv`` into ``random.vc.vcf`` / ``random.sv.vcf``.
    3. Unless ``disable_vcf2diploid`` is set, run ``vcf2diploid`` and merge its
       per-contig FASTA and VCF outputs into ``<sample_id>.fa`` and
       ``<sample_id>.truth.vcf``.
    4. When ``lift_ref`` is set, lift the truth VCF and map into
       ``<out_dir>/lifted``.
    5. When ``simulator`` is set, run the read simulator per lane and lift
       the simulated reads (or, for longislnd, the read maps) over.

    :param reference: path to the reference FASTA
    :param simulator: "dwgsim", "art", "longislnd", or a false value to skip
        read simulation
    :param simulator_exe: path to the simulator executable
    :param total_coverage: total coverage; half of it is used per haplotype
        command (``total_coverage * 0.5``), divided over ``nlanes``
    :param variant_vcfs: input VCF paths; the list itself is never mutated
    :param sampling_vcf: VCF sampled from when ``randvcf_options`` is given
    :param dgv_file: DGV file sampled from when ``randdgv_options`` is given
    :param java: java executable used for all VarSim jar invocations
    :raises NotImplementedError: for an unsupported ``simulator`` name
    :raises ValueError: when a sampling stage is enabled without its input
    :raises RuntimeError: when vcf2diploid produced an empty merged FASTA
    """
    check_java(java)

    # make the directories we need
    makedirs([log_dir, out_dir])

    logger = logging.getLogger(varsim_main.__name__)

    # Make sure we can actually execute the executable
    if simulator:
        if simulator not in ["dwgsim", "art", "longislnd"]:
            raise NotImplementedError(
                "Simulation method {} not implemented".format(simulator))
        check_executable(simulator_exe)

    processes = []

    t_s = time.time()

    # Real lists (not lazy map objects) are needed because the sampling
    # stages below append to variant_vcfs.
    variant_vcfs = [os.path.realpath(vcf) for vcf in variant_vcfs]

    if sv_insert_seq:
        in_vcfs = []
        for i, vcf in enumerate(variant_vcfs):
            tool_work_dir = os.path.join(out_dir, "filled_in", str(i))
            makedirs([tool_work_dir])
            in_vcfs.append(
                fill_missing_sequences(vcf, sample_id,
                                       os.path.realpath(sv_insert_seq),
                                       reference, tool_work_dir,
                                       tool_work_dir, java))
        variant_vcfs = [os.path.realpath(vcf) for vcf in in_vcfs]
    else:
        logger.warning(
            "Not filling in SV sequences since no insert sequence file provided"
        )

    open_fds = []
    try:
        if randvcf_options:
            if not sampling_vcf:
                logger.error("Need to provide the VCF for random sampling")
                raise ValueError("Sampling VCF missing")

            rand_vcf_out_fd = open(os.path.join(out_dir, "random.vc.vcf"), "w")
            open_fds.append(rand_vcf_out_fd)
            rand_vcf_log_fd = open(os.path.join(log_dir, "RandVCF2VCF.err"), "w")
            open_fds.append(rand_vcf_log_fd)
            variant_vcfs.append(os.path.realpath(rand_vcf_out_fd.name))
            run_randvcf(os.path.realpath(sampling_vcf), rand_vcf_out_fd,
                        rand_vcf_log_fd, seed, sex, randvcf_options,
                        reference, java)

        if randdgv_options:
            if not sv_insert_seq:
                raise ValueError(
                    "Need SV sequence file to fill in SV sequences")
            if not dgv_file:
                logger.error(
                    "Need to provide the DGV file for random sampling")
                raise ValueError("DGV file missing")

            rand_dgv_stdout = open(os.path.join(out_dir, "random.sv.vcf"), "w")
            open_fds.append(rand_dgv_stdout)
            rand_dgv_stderr = open(os.path.join(log_dir, "RandDGV2VCF.err"), "w")
            open_fds.append(rand_dgv_stderr)
            variant_vcfs.append(os.path.realpath(rand_dgv_stdout.name))
            run_randdgv(dgv_file, rand_dgv_stdout, rand_dgv_stderr, seed, sex,
                        randdgv_options, reference, sv_insert_seq, java)

        processes = monitor_processes(processes)
    finally:
        # Close the sampling outputs even when validation or a tool fails.
        for open_fd in open_fds:
            open_fd.close()

    merged_reference = os.path.join(out_dir, "%s.fa" % (sample_id))
    merged_truth_vcf = os.path.join(out_dir, "%s.truth.vcf" % (sample_id))
    merged_map = os.path.join(out_dir, "%s.map" % (sample_id))

    processes = run_vcfstats(variant_vcfs, out_dir, log_dir, java)

    if not disable_vcf2diploid:
        logger.info("vcf2diploid started")

        vcf_arg_list = sum([["-vcf", v] for v in variant_vcfs], [])
        filter_arg_list = ["-pass"] if remove_filtered else []
        vcf2diploid_command = [
            java, utils.JAVA_XMX, "-jar", VARSIMJAR, "vcf2diploid",
            "-t", sex,
            "-id", sample_id,
            "-chr", os.path.realpath(reference)
        ] + filter_arg_list + vcf_arg_list + ["-no_contig_id"]

        logger.info("Executing command " + " ".join(vcf2diploid_command))
        with open(os.path.join(out_dir, "vcf2diploid.out"), "w") as vcf2diploid_stdout, \
                open(os.path.join(log_dir, "vcf2diploid.err"), "w") as vcf2diploid_stderr:
            subprocess.check_call(vcf2diploid_command,
                                  stdout=vcf2diploid_stdout,
                                  stderr=vcf2diploid_stderr, cwd=out_dir)

        processes = monitor_processes(processes)

        # Now concatenate the .fa from vcf2diploid
        contigs = get_contigs_list(reference)
        contig_fastas = [
            os.path.join(out_dir, "%s_%s_%s.fa" % (contig, sample_id, parent))
            for contig, parent in itertools.product(contigs,
                                                    ["maternal", "paternal"])
        ]
        fastas_to_cat = [fa for fa in contig_fastas if os.path.isfile(fa)]
        concatenate_files(fastas_to_cat, merged_reference,
                          remove_original=True)

        if os.path.getsize(merged_reference) == 0:
            logger.error(
                "Merged FASTA is empty. Something bad happened. Exiting")
            raise RuntimeError("Empty FASTA generated by vcf2diploid")

        # concatenate the vcfs
        contig_vcfs = [os.path.join(out_dir, "%s_%s.vcf" % (contig, sample_id))
                       for contig in contigs]
        vcfs_to_cat = [vcf for vcf in contig_vcfs if os.path.isfile(vcf)]
        concatenate_files(vcfs_to_cat, merged_truth_vcf, header_str="#",
                          simple_cat=False, remove_original=True)

        run_vcfstats([merged_truth_vcf], out_dir, log_dir, java)

        logger.info("vcf2diploid done")

    if lift_ref:
        lifted_dir = os.path.join(out_dir, "lifted")
        makedirs([lifted_dir])
        # quick fix for issue of CN
        convertCN([merged_truth_vcf], "two2one")
        merged_truth_vcf = lift_vcfs([merged_truth_vcf],
                                     os.path.join(lifted_dir, "truth.vcf"),
                                     None, tabix_index=False)
        # quick fix for issue of CN
        convertCN([merged_truth_vcf], "one2two")
        pysam.tabix_index(merged_truth_vcf, force=True, preset='vcf')
        merged_map = lift_maps([merged_map],
                               os.path.join(lifted_dir, "truth.map"))

    if processes:
        processes = monitor_processes(processes)

    # Now generate the reads using art/pbsim/dwgsim
    tmp_files = []
    if simulator:
        fifos = []
        fastqs = []
        sim_ts = time.time()
        coverage_per_lane = total_coverage * 0.5 / nlanes
        processes = []

        # (fifo the simulator writes to, gzipped file the fifo is drained to)
        fifo_src_dst = []
        if simulator == "dwgsim":
            for i, end in itertools.product(range(nlanes), [1, 2]):
                fifo_src_dst.append(
                    ("simulated.lane%d.read%d.fastq" % (i, end),
                     "simulated.lane%d.read%d.fq.gz" % (i, end)))
        elif simulator == "art":
            for i, end, suffix in itertools.product(range(nlanes), [1, 2],
                                                    ["fq", "aln"]):
                fifo_src_dst.append(
                    ("simulated.lane%d.read%d.%s" % (i, end, suffix),
                     "simulated.lane%d.read%d.%s.gz" % (i, end, suffix)))
        # simulator == "longislnd": no fifos are used

        for fifo_name, dst in fifo_src_dst:
            fifo_path = os.path.join(out_dir, fifo_name)
            gz_path = os.path.join(out_dir, dst)
            fifos.append(fifo_path)
            if os.path.exists(fifo_path):
                os.remove(fifo_path)
            os.mkfifo(fifo_path)

            gzip_command = "cat %s | gzip -2 > %s" % (fifo_path, gz_path)
            logger.info("Executing command %s" % (gzip_command))
            # The child keeps its own duplicate of the stderr descriptor, so
            # the parent's handle can be closed as soon as Popen returns.
            with open(os.path.join(log_dir, "gzip.%s" % (fifo_name)), "w") as gzip_stderr:
                gzip_p = subprocess.Popen(gzip_command, stdout=None,
                                          stderr=gzip_stderr, shell=True)
            logger.info(" with pid " + str(gzip_p.pid))
            processes.append(gzip_p)
            tmp_files.append(gz_path)

        # (command, stdout log path, stderr log path)
        simulator_commands_files = []
        if simulator == "dwgsim":
            for i in range(nlanes):
                simulator_command = "{} {} -C {} -z {} {} {}".format(
                    os.path.realpath(simulator_exe), simulator_options,
                    coverage_per_lane, seed + i, merged_reference,
                    os.path.join(out_dir, "simulated.lane%d" % (i)))
                simulator_commands_files.append(
                    (simulator_command,
                     os.path.join(log_dir, "dwgsim.lane%d.out" % (i)),
                     os.path.join(log_dir, "dwgsim.lane%d.err" % (i))))
        elif simulator == "art":
            for i in range(nlanes):
                simulator_command = "{} {} -i {} -f {} -rs {} -o {}".format(
                    simulator_exe, simulator_options, merged_reference,
                    coverage_per_lane, seed + i,
                    os.path.join(out_dir, "simulated.lane%d.read" % (i)))
                simulator_commands_files.append(
                    (simulator_command,
                     os.path.join(log_dir, "art.lane%d.out" % (i)),
                     os.path.join(log_dir, "art.lane%d.err" % (i))))
        else:  # simulator == "longislnd":
            simulator_command = "{} {} --coverage {} --out {} --fasta {}".format(
                simulator_exe, simulator_options, total_coverage * 0.5,
                os.path.join(out_dir, "longislnd_sim"), merged_reference)
            simulator_commands_files.append(
                (simulator_command,
                 os.path.join(log_dir, "longislnd.out"),
                 os.path.join(log_dir, "longislnd.err")))

        simulator_fds = []
        try:
            for command, stdout, stderr in simulator_commands_files:
                stdout_fd = open(stdout, "w")
                simulator_fds.append(stdout_fd)
                stderr_fd = open(stderr, "w")
                simulator_fds.append(stderr_fd)
                process = subprocess.Popen(command, stdout=stdout_fd,
                                           stderr=stderr_fd, shell=True,
                                           close_fds=True)
                logger.info("Executing command {} with pid {}".format(
                    command, process.pid))
                processes.append(process)

            monitor_processes(processes)
        finally:
            for fd in simulator_fds:
                fd.close()

        processes = []

        logger.info("Read generation took %g seconds" %
                    (time.time() - sim_ts))

        sim_t_liftover = time.time()

        # Now start lifting over the gzipped files
        if simulator != "longislnd":
            for i in range(nlanes):
                fastq_liftover_command = "%s -server %s -jar %s fastq_liftover -map %s -id %d " \
                                         "-fastq <(gunzip -c %s/simulated.lane%d.read1.fq.gz) " \
                                         "-fastq <(gunzip -c %s/simulated.lane%d.read2.fq.gz) " \
                                         "-out >(gzip -1 > %s/lane%d.read1.fq.gz) " \
                                         "-out >(gzip -1 > %s/lane%d.read2.fq.gz)" % (
                                             java, utils.JAVA_XMX, VARSIMJAR,
                                             merged_map, i,
                                             out_dir, i, out_dir, i,
                                             out_dir, i, out_dir, i)
                if force_five_base_encoding:
                    fastq_liftover_command += " -force_five_base_encoding "
                if simulator == "art":
                    fastq_liftover_command += " -type art " \
                                              "-aln <(gunzip -c %s/simulated.lane%d.read1.aln.gz) " \
                                              "-aln <(gunzip -c %s/simulated.lane%d.read2.aln.gz)" % (
                                                  out_dir, i, out_dir, i)
                elif simulator == "pbsim":
                    fastq_liftover_command += " -type pbsim " \
                                              "-maf <(gunzip -c %s/simulated.lane%d.read1.maf.gz) " \
                                              "-ref %s/simulated.lane%d.ref " % (
                                                  out_dir, i, out_dir, i)
                # bash is required for the <(...) / >(...) process substitution
                fastq_liftover_command = "bash -c \"%s\"" % (
                    fastq_liftover_command)
                logger.info("Executing command " + fastq_liftover_command)
                with open(os.path.join(log_dir, "lane%d.out" % (i)), "w") as liftover_stdout, \
                        open(os.path.join(log_dir, "liftover%d.log" % (i)), "w") as liftover_stderr:
                    subprocess.check_call(fastq_liftover_command,
                                          stdout=liftover_stdout,
                                          stderr=liftover_stderr, shell=True)
                # Record both mates of the lane. The previous code reused a
                # stale `end` loop variable, so only read2 was accounted for.
                for end in (1, 2):
                    fastqs.append(
                        os.path.join(out_dir,
                                     "lane%d.read%d.fq.gz" % (i, end)))
        else:
            # liftover the read map files
            read_map_files = list(
                glob.glob(os.path.join(out_dir, "longislnd_sim", "*.bed")))
            merged_raw_readmap = os.path.join(out_dir, "longislnd_sim",
                                              "merged_readmap.bed")
            concatenate_files(read_map_files, merged_raw_readmap)
            read_maps = "-longislnd %s" % merged_raw_readmap
            read_map_liftover_command = "%s %s -server -jar %s longislnd_liftover " % (
                java, utils.JAVA_XMX, VARSIMJAR
            ) + read_maps + " -map %s " % merged_map + " -out %s" % (
                os.path.join(out_dir, sample_id + ".truth.map"))
            logger.info("Executing command " + read_map_liftover_command)
            with open(os.path.join(log_dir, "longislnd_liftover.err"), "w") as read_map_liftover_stderr:
                subprocess.check_call(read_map_liftover_command, stdout=None,
                                      stderr=read_map_liftover_stderr,
                                      shell=True)

        monitor_processes(processes)

        logger.info("Liftover took %g seconds" %
                    (time.time() - sim_t_liftover))

        # Clamp the elapsed time to at least one second so the throughput
        # division below can never divide by zero.
        sim_te = max(sim_ts + 1, time.time())
        bytes_written = sum([os.path.getsize(fastq) for fastq in fastqs])
        logger.info("Took %g seconds, %ld Mbytes written, %g MB/s" %
                    (sim_te - sim_ts, bytes_written / 1024.0 / 1024.0,
                     bytes_written / 1024.0 / 1024.0 / (sim_te - sim_ts)))

        for fifo in fifos:
            os.remove(fifo)

    if not keep_temp:
        logger.info("Cleaning up intermediate files")
        for f in tmp_files:
            os.remove(f)
    logger.info("Done! (%g hours)" % ((time.time() - t_s) / 3600.0))
bytes_written = sum([os.path.getsize(fastq) for fastq in fastqs]) logger.info("Took %g seconds, %ld Mbytes written, %g MB/s" % ( sim_te - sim_ts, bytes_written / 1024.0 / 1024.0, bytes_written / 1024.0 / 1024.0 / (sim_te - sim_ts))) for fifo in fifos: os.remove(fifo) if not keep_temp: logger.info("Cleaning up intermediate files") for f in tmp_files: os.remove(f) logger.info("Done! (%g hours)" % ((time.time() - t_s) / 3600.0)) if __name__ == "__main__": check_java() main_parser = argparse.ArgumentParser(description="VarSim: A high-fidelity simulation validation framework", formatter_class=argparse.ArgumentDefaultsHelpFormatter) main_parser.add_argument("--out_dir", metavar="DIR", help="Output directory for the simulated genome, reads and variants", required=False, default="out") main_parser.add_argument("--work_dir", metavar="DIR", help="Work directory, currently not used", required=False, default="work") main_parser.add_argument("--log_dir", metavar="DIR", help="Log files of all steps are kept here", required=False, default="log") main_parser.add_argument("--reference", metavar="FASTA", help="Reference genome that variants will be inserted into", required=True) main_parser.add_argument("--seed", metavar="seed", help="Random number seed for reproducibility", type=int, default=0) main_parser.add_argument("--sex", metavar="Sex", help="Sex of the person (MALE/FEMALE)", required=False, type=str, choices=["MALE", "FEMALE"], default="MALE")
def varsim_main(reference,
                simulator,  # use None to disable simulation
                simulator_exe,
                total_coverage,
                variant_vcfs=[],
                sampling_vcf=None,
                dgv_file=None,
                randvcf_options=None,  # use None to disable RandVCF
                randdgv_options=None,  # use None to disable RandDGV
                nlanes=1,
                simulator_options="",
                sample_id="VarSim_Sample",
                log_dir="log",
                out_dir="out",
                sv_insert_seq=None,
                seed=0,
                sex="MALE",
                remove_filtered=False,
                keep_temp=False,
                force_five_base_encoding=False,
                lift_ref=False,
                disable_vcf2diploid=False):
    """Build a simulated genome from VCFs and optionally simulate reads.

    Stages, in order:

    1. Optionally fill in missing SV sequences of every input VCF
       (``fill_missing_sequences``) when ``sv_insert_seq`` is given.
    2. Optionally sample extra variants with ``run_randvcf`` /
       ``run_randdgv`` into ``random.vc.vcf`` / ``random.sv.vcf``.
    3. Unless ``disable_vcf2diploid`` is set, run ``vcf2diploid`` and merge its
       per-contig FASTA and VCF outputs into ``<sample_id>.fa`` and
       ``<sample_id>.truth.vcf``.
    4. When ``lift_ref`` is set, lift the truth VCF and map into
       ``<out_dir>/lifted``.
    5. When ``simulator`` is set, run the read simulator per lane and lift
       the simulated reads (or, for longislnd, the read maps) over.

    All Java tools are invoked through the literal ``java`` found on PATH.

    :param reference: path to the reference FASTA
    :param simulator: "dwgsim", "art", "longislnd", or a false value to skip
        read simulation
    :param simulator_exe: path to the simulator executable
    :param total_coverage: total coverage; half of it is used per haplotype
        command (``total_coverage * 0.5``), divided over ``nlanes``
    :param variant_vcfs: input VCF paths; the list itself is never mutated
    :param sampling_vcf: VCF sampled from when ``randvcf_options`` is given
    :param dgv_file: DGV file sampled from when ``randdgv_options`` is given
    :raises NotImplementedError: for an unsupported ``simulator`` name
    :raises ValueError: when a sampling stage is enabled without its input
    :raises RuntimeError: when vcf2diploid produced an empty merged FASTA
    """
    check_java()

    # make the directories we need
    makedirs([log_dir, out_dir])

    logger = logging.getLogger(varsim_main.__name__)

    # Make sure we can actually execute the executable
    if simulator:
        if simulator not in ["dwgsim", "art", "longislnd"]:
            raise NotImplementedError(
                "Simulation method {} not implemented".format(simulator))
        check_executable(simulator_exe)

    processes = []

    t_s = time.time()

    # Real lists (not lazy map objects) are needed because the sampling
    # stages below append to variant_vcfs.
    variant_vcfs = [os.path.realpath(vcf) for vcf in variant_vcfs]

    if sv_insert_seq:
        in_vcfs = []
        for i, vcf in enumerate(variant_vcfs):
            tool_work_dir = os.path.join(out_dir, "filled_in", str(i))
            makedirs([tool_work_dir])
            in_vcfs.append(
                fill_missing_sequences(vcf, sample_id,
                                       os.path.realpath(sv_insert_seq),
                                       reference, tool_work_dir,
                                       tool_work_dir))
        variant_vcfs = [os.path.realpath(vcf) for vcf in in_vcfs]
    else:
        logger.warning(
            "Not filling in SV sequences since no insert sequence file provided")

    open_fds = []
    try:
        if randvcf_options:
            if not sampling_vcf:
                logger.error("Need to provide the VCF for random sampling")
                raise ValueError("Sampling VCF missing")

            rand_vcf_out_fd = open(os.path.join(out_dir, "random.vc.vcf"), "w")
            open_fds.append(rand_vcf_out_fd)
            rand_vcf_log_fd = open(os.path.join(log_dir, "RandVCF2VCF.err"), "w")
            open_fds.append(rand_vcf_log_fd)
            variant_vcfs.append(os.path.realpath(rand_vcf_out_fd.name))
            run_randvcf(os.path.realpath(sampling_vcf), rand_vcf_out_fd,
                        rand_vcf_log_fd, seed, sex, randvcf_options,
                        reference)

        if randdgv_options:
            if not sv_insert_seq:
                raise ValueError(
                    "Need SV sequence file to fill in SV sequences")
            if not dgv_file:
                logger.error(
                    "Need to provide the DGV file for random sampling")
                raise ValueError("DGV file missing")

            rand_dgv_stdout = open(os.path.join(out_dir, "random.sv.vcf"), "w")
            open_fds.append(rand_dgv_stdout)
            rand_dgv_stderr = open(os.path.join(log_dir, "RandDGV2VCF.err"), "w")
            open_fds.append(rand_dgv_stderr)
            variant_vcfs.append(os.path.realpath(rand_dgv_stdout.name))
            run_randdgv(dgv_file, rand_dgv_stdout, rand_dgv_stderr, seed, sex,
                        randdgv_options, reference, sv_insert_seq)

        processes = monitor_processes(processes)
    finally:
        # Close the sampling outputs even when validation or a tool fails.
        for open_fd in open_fds:
            open_fd.close()

    merged_reference = os.path.join(out_dir, "%s.fa" % (sample_id))
    merged_truth_vcf = os.path.join(out_dir, "%s.truth.vcf" % (sample_id))
    merged_map = os.path.join(out_dir, "%s.map" % (sample_id))

    processes = run_vcfstats(variant_vcfs, out_dir, log_dir)

    if not disable_vcf2diploid:
        logger.info("vcf2diploid started")

        vcf_arg_list = sum([["-vcf", v] for v in variant_vcfs], [])
        filter_arg_list = ["-pass"] if remove_filtered else []
        vcf2diploid_command = [
            "java", utils.JAVA_XMX, "-jar", VARSIMJAR, "vcf2diploid",
            "-t", sex,
            "-id", sample_id,
            "-chr", os.path.realpath(reference)
        ] + filter_arg_list + vcf_arg_list + ["-no_contig_id"]

        logger.info("Executing command " + " ".join(vcf2diploid_command))
        with open(os.path.join(out_dir, "vcf2diploid.out"), "w") as vcf2diploid_stdout, \
                open(os.path.join(log_dir, "vcf2diploid.err"), "w") as vcf2diploid_stderr:
            subprocess.check_call(vcf2diploid_command,
                                  stdout=vcf2diploid_stdout,
                                  stderr=vcf2diploid_stderr, cwd=out_dir)

        processes = monitor_processes(processes)

        # Now concatenate the .fa from vcf2diploid
        contigs = get_contigs_list(reference)
        contig_fastas = [
            os.path.join(out_dir, "%s_%s_%s.fa" % (contig, sample_id, parent))
            for contig, parent in itertools.product(contigs,
                                                    ["maternal", "paternal"])
        ]
        fastas_to_cat = [fa for fa in contig_fastas if os.path.isfile(fa)]
        concatenate_files(fastas_to_cat, merged_reference,
                          remove_original=True)

        if os.path.getsize(merged_reference) == 0:
            logger.error(
                "Merged FASTA is empty. Something bad happened. Exiting")
            raise RuntimeError("Empty FASTA generated by vcf2diploid")

        # concatenate the vcfs
        contig_vcfs = [os.path.join(out_dir, "%s_%s.vcf" % (contig, sample_id))
                       for contig in contigs]
        vcfs_to_cat = [vcf for vcf in contig_vcfs if os.path.isfile(vcf)]
        concatenate_files(vcfs_to_cat, merged_truth_vcf, header_str="#",
                          simple_cat=False, remove_original=True)

        monitor_processes(run_vcfstats([merged_truth_vcf], out_dir, log_dir))

        logger.info("vcf2diploid done")

    if lift_ref:
        lifted_dir = os.path.join(out_dir, "lifted")
        makedirs([lifted_dir])
        # quick fix for issue of CN
        convertCN([merged_truth_vcf], "two2one")
        merged_truth_vcf = lift_vcfs([merged_truth_vcf],
                                     os.path.join(lifted_dir, "truth.vcf"),
                                     None, tabix_index=False)
        # quick fix for issue of CN
        convertCN([merged_truth_vcf], "one2two")
        pysam.tabix_index(merged_truth_vcf, force=True, preset='vcf')
        merged_map = lift_maps([merged_map],
                               os.path.join(lifted_dir, "truth.map"))

    if processes:
        processes = monitor_processes(processes)

    # Now generate the reads using art/pbsim/dwgsim
    tmp_files = []
    if simulator:
        fifos = []
        fastqs = []
        sim_ts = time.time()
        coverage_per_lane = total_coverage * 0.5 / nlanes
        processes = []

        # (fifo the simulator writes to, gzipped file the fifo is drained to)
        fifo_src_dst = []
        if simulator == "dwgsim":
            for i, end in itertools.product(range(nlanes), [1, 2]):
                fifo_src_dst.append(
                    ("simulated.lane%d.read%d.fastq" % (i, end),
                     "simulated.lane%d.read%d.fq.gz" % (i, end)))
        elif simulator == "art":
            for i, end, suffix in itertools.product(range(nlanes), [1, 2],
                                                    ["fq", "aln"]):
                fifo_src_dst.append(
                    ("simulated.lane%d.read%d.%s" % (i, end, suffix),
                     "simulated.lane%d.read%d.%s.gz" % (i, end, suffix)))
        # simulator == "longislnd": no fifos are used

        for fifo_name, dst in fifo_src_dst:
            fifo_path = os.path.join(out_dir, fifo_name)
            gz_path = os.path.join(out_dir, dst)
            fifos.append(fifo_path)
            if os.path.exists(fifo_path):
                os.remove(fifo_path)
            os.mkfifo(fifo_path)

            gzip_command = "cat %s | gzip -2 > %s" % (fifo_path, gz_path)
            logger.info("Executing command %s" % (gzip_command))
            # The child keeps its own duplicate of the stderr descriptor, so
            # the parent's handle can be closed as soon as Popen returns.
            with open(os.path.join(log_dir, "gzip.%s" % (fifo_name)), "w") as gzip_stderr:
                gzip_p = subprocess.Popen(gzip_command, stdout=None,
                                          stderr=gzip_stderr, shell=True)
            logger.info(" with pid " + str(gzip_p.pid))
            processes.append(gzip_p)
            tmp_files.append(gz_path)

        # (command, stdout log path, stderr log path)
        simulator_commands_files = []
        if simulator == "dwgsim":
            for i in range(nlanes):
                simulator_command = "{} {} -C {} -z {} {} {}".format(
                    os.path.realpath(simulator_exe), simulator_options,
                    coverage_per_lane, seed + i, merged_reference,
                    os.path.join(out_dir, "simulated.lane%d" % (i)))
                simulator_commands_files.append(
                    (simulator_command,
                     os.path.join(log_dir, "dwgsim.lane%d.out" % (i)),
                     os.path.join(log_dir, "dwgsim.lane%d.err" % (i))))
        elif simulator == "art":
            for i in range(nlanes):
                simulator_command = "{} {} -i {} -f {} -rs {} -o {}".format(
                    simulator_exe, simulator_options, merged_reference,
                    coverage_per_lane, seed + i,
                    os.path.join(out_dir, "simulated.lane%d.read" % (i)))
                simulator_commands_files.append(
                    (simulator_command,
                     os.path.join(log_dir, "art.lane%d.out" % (i)),
                     os.path.join(log_dir, "art.lane%d.err" % (i))))
        else:  # simulator == "longislnd":
            simulator_command = "{} {} --coverage {} --out {} --fasta {}".format(
                simulator_exe, simulator_options, total_coverage * 0.5,
                os.path.join(out_dir, "longislnd_sim"), merged_reference)
            simulator_commands_files.append(
                (simulator_command,
                 os.path.join(log_dir, "longislnd.out"),
                 os.path.join(log_dir, "longislnd.err")))

        simulator_fds = []
        try:
            for command, stdout, stderr in simulator_commands_files:
                stdout_fd = open(stdout, "w")
                simulator_fds.append(stdout_fd)
                stderr_fd = open(stderr, "w")
                simulator_fds.append(stderr_fd)
                process = subprocess.Popen(command, stdout=stdout_fd,
                                           stderr=stderr_fd, shell=True,
                                           close_fds=True)
                logger.info("Executing command {} with pid {}".format(
                    command, process.pid))
                processes.append(process)

            monitor_processes(processes)
        finally:
            for fd in simulator_fds:
                fd.close()

        processes = []

        logger.info("Read generation took %g seconds" %
                    (time.time() - sim_ts))

        sim_t_liftover = time.time()

        # Now start lifting over the gzipped files
        if simulator != "longislnd":
            for i in range(nlanes):
                fastq_liftover_command = "java -server %s -jar %s fastq_liftover -map %s -id %d " \
                                         "-fastq <(gunzip -c %s/simulated.lane%d.read1.fq.gz) " \
                                         "-fastq <(gunzip -c %s/simulated.lane%d.read2.fq.gz) " \
                                         "-out >(gzip -1 > %s/lane%d.read1.fq.gz) " \
                                         "-out >(gzip -1 > %s/lane%d.read2.fq.gz)" % (
                                             utils.JAVA_XMX, VARSIMJAR,
                                             merged_map, i,
                                             out_dir, i, out_dir, i,
                                             out_dir, i, out_dir, i)
                if force_five_base_encoding:
                    fastq_liftover_command += " -force_five_base_encoding "
                if simulator == "art":
                    fastq_liftover_command += " -type art " \
                                              "-aln <(gunzip -c %s/simulated.lane%d.read1.aln.gz) " \
                                              "-aln <(gunzip -c %s/simulated.lane%d.read2.aln.gz)" % (
                                                  out_dir, i, out_dir, i)
                elif simulator == "pbsim":
                    fastq_liftover_command += " -type pbsim " \
                                              "-maf <(gunzip -c %s/simulated.lane%d.read1.maf.gz) " \
                                              "-ref %s/simulated.lane%d.ref " % (
                                                  out_dir, i, out_dir, i)
                # bash is required for the <(...) / >(...) process substitution
                fastq_liftover_command = "bash -c \"%s\"" % (
                    fastq_liftover_command)
                logger.info("Executing command " + fastq_liftover_command)
                with open(os.path.join(log_dir, "lane%d.out" % (i)), "w") as liftover_stdout, \
                        open(os.path.join(log_dir, "liftover%d.log" % (i)), "w") as liftover_stderr:
                    subprocess.check_call(fastq_liftover_command,
                                          stdout=liftover_stdout,
                                          stderr=liftover_stderr, shell=True)
                # Record both mates of the lane. The previous code reused a
                # stale `end` loop variable, so only read2 was accounted for.
                for end in (1, 2):
                    fastqs.append(
                        os.path.join(out_dir,
                                     "lane%d.read%d.fq.gz" % (i, end)))
        else:
            # liftover the read map files
            read_map_files = list(
                glob.glob(os.path.join(out_dir, "longislnd_sim", "*.bed")))
            merged_raw_readmap = os.path.join(out_dir, "longislnd_sim",
                                              "merged_readmap.bed")
            concatenate_files(read_map_files, merged_raw_readmap)
            read_maps = "-longislnd %s" % merged_raw_readmap
            read_map_liftover_command = "java %s -server -jar %s longislnd_liftover " % (
                utils.JAVA_XMX, VARSIMJAR
            ) + read_maps + " -map %s " % merged_map + " -out %s" % (
                os.path.join(out_dir, sample_id + ".truth.map"))
            logger.info("Executing command " + read_map_liftover_command)
            with open(os.path.join(log_dir, "longislnd_liftover.err"), "w") as read_map_liftover_stderr:
                subprocess.check_call(read_map_liftover_command, stdout=None,
                                      stderr=read_map_liftover_stderr,
                                      shell=True)

        monitor_processes(processes)

        logger.info("Liftover took %g seconds" %
                    (time.time() - sim_t_liftover))

        # Clamp the elapsed time to at least one second so the throughput
        # division below can never divide by zero.
        sim_te = max(sim_ts + 1, time.time())
        bytes_written = sum([os.path.getsize(fastq) for fastq in fastqs])
        logger.info("Took %g seconds, %ld Mbytes written, %g MB/s" % (
            sim_te - sim_ts, bytes_written / 1024.0 / 1024.0,
            bytes_written / 1024.0 / 1024.0 / (sim_te - sim_ts)))

        for fifo in fifos:
            os.remove(fifo)

    if not keep_temp:
        logger.info("Cleaning up intermediate files")
        for f in tmp_files:
            os.remove(f)
    logger.info("Done! (%g hours)" % ((time.time() - t_s) / 3600.0))
parse_jsons(jsonfile, stats) print("Non-SV stats") print_stats(stats) sv_stats = {k: {ii: 0 for ii in metrics} for k in var_types} parse_jsons(jsonfile, sv_stats, count_sv=True) print("SV stats") print_stats(sv_stats) all_stats = {k: {ii: 0 for ii in metrics} for k in var_types} parse_jsons(jsonfile, all_stats, count_all=True) print("Overall stats") print_stats(all_stats) return tp, fn, fp, t if __name__ == "__main__": utils.check_java() main_parser = argparse.ArgumentParser( description="VarSim: A high-fidelity simulation validation framework", formatter_class=argparse.ArgumentDefaultsHelpFormatter) main_parser.add_argument("--reference", metavar="FASTA", help="reference filename", required=True, type=str) main_parser.add_argument("--sdf", metavar="SDF", help="SDF formatted reference folder", required=False, type=str, default='')
def process(args):
    '''
    Run the VarSim comparison followed by an RTG vcfeval pass and report
    the combined true-positive / false-negative / false-positive results.

    Steps, in order:
      1. resolve and check the java executable, configure logging;
      2. reject more than one prediction VCF;
      3. run VarSimVCFComparator and sort/compress its TP, FN and FP outputs;
      4. run RTGVCFComparator with the VarSim FN as truth and the VarSim FP
         as calls, generating an SDF reference first if none was supplied;
      5. combine both comparisons with merge_results and summarize_results;
      6. if both master_vcf and call_vcf are given, run match_false on the
         augmented FP and FN outputs.

    :param args: parsed command-line namespace. Attributes read here: java,
        loglevel, log_to_file, vcfs, java_max_mem, out_dir, reference,
        true_vcf, sample, exclude_filtered, disallow_partial_fp, match_geno,
        vcfcompare_options, sdf, vcfeval_options, var_types, sv_length,
        regions, bed_either, master_vcf, call_vcf. The attributes java,
        out_dir and reference are overwritten in place.
    :return: None
    :raises NotImplementedError: if more than one VCF is passed in args.vcfs.

    Side effects: rebinds the module-level LOGGER, appends args.java_max_mem
    to utils.JAVA_XMX, creates args.out_dir, and deletes any existing
    'vcfeval_compare_results' directory under it.
    '''
    global LOGGER

    args.java = utils.get_java(args.java)
    utils.check_java(args.java)

    # Setup logging
    FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
    loglevel = utils.get_loglevel(args.loglevel)
    if args.log_to_file:
        logging.basicConfig(filename=args.log_to_file, filemode="w",
                            level=loglevel, format=FORMAT)
    else:
        logging.basicConfig(level=loglevel, format=FORMAT)

    if len(args.vcfs) > 1:
        raise NotImplementedError(
            'right now only support one prediction VCF. Quick workaround: src/sort_vcf.sh vcf1 vcf2 > merged.vcf'
        )

    LOGGER = logging.getLogger(__name__)
    LOGGER.info('working hard ...')

    # NOTE(review): this appends to the shared utils.JAVA_XMX value, so calling
    # process() more than once in the same interpreter would presumably keep
    # appending to it -- confirm whether that can happen.
    utils.JAVA_XMX = utils.JAVA_XMX + args.java_max_mem

    args.out_dir = os.path.abspath(args.out_dir)
    args.reference = os.path.abspath(args.reference)
    utils.makedirs([args.out_dir])

    # First pass: VarSim's own VCF comparison.
    varsim_prefix = os.path.join(args.out_dir, 'varsim_compare_results')
    varsim_comparator = VarSimVCFComparator(
        prefix=varsim_prefix,
        true_vcf=args.true_vcf,
        reference=args.reference,
        regions=None,
        sample=args.sample,
        vcfs=args.vcfs,
        exclude_filtered=args.exclude_filtered,
        disallow_partial_fp=args.disallow_partial_fp,
        match_geno=args.match_geno,
        log_to_file=args.log_to_file,
        opts=args.vcfcompare_options,
        java=args.java)
    varsim_tp = varsim_comparator.get_tp()
    varsim_fn = varsim_comparator.get_fn()
    varsim_fp = varsim_comparator.get_fp()
    varsim_tp = utils.sort_and_compress(varsim_tp)
    varsim_fn = utils.sort_and_compress(varsim_fn)
    varsim_fp = utils.sort_and_compress(varsim_fp)

    # run vcfeval
    sdf = args.sdf
    if not sdf:
        LOGGER.info(
            "user did not supply SDF-formatted reference, trying to generate one..."
        )
        sdf = generate_sdf(args.reference, args.log_to_file, java=args.java)

    # for vcfeval
    # sample column must be present, and not empty
    # if single-sample vcf, vcfeval doesn't check if samples match in truth and call
    # in multi-sample vcf, sample name must be specified
    # right now
    vcfeval_prefix = os.path.join(args.out_dir, 'vcfeval_compare_results')
    if os.path.exists(vcfeval_prefix):
        # Logger.warn is a deprecated alias (removed in Python 3.13);
        # Logger.warning emits the same record on every Python version.
        LOGGER.warning('{0} exists, removing ...'.format(vcfeval_prefix))
        shutil.rmtree(vcfeval_prefix)

    # Second pass: re-compare what VarSim called FN (as truth) against what it
    # called FP (as calls) using vcfeval.
    vcfeval_comparator = RTGVCFComparator(
        prefix=vcfeval_prefix,
        true_vcf=varsim_fn,
        reference=sdf,
        regions=None,
        sample=args.sample,
        vcfs=[varsim_fp],
        exclude_filtered=args.exclude_filtered,
        match_geno=args.match_geno,
        log_to_file=args.log_to_file,
        opts=args.vcfeval_options,
        java=args.java)
    vcfeval_tp = vcfeval_comparator.get_tp()
    vcfeval_tp_predict = vcfeval_comparator.get_tp_predict()

    augmented_tp, augmented_fn, augmented_fp, augmented_t = merge_results(
        outdir=args.out_dir,
        varsim_tp=varsim_tp,
        varsim_fn=varsim_fn,
        vcfeval_tp=vcfeval_tp,
        varsim_fp=varsim_fp,
        vcfeval_tp_predict=vcfeval_tp_predict)
    augmented_tp, augmented_fn, augmented_fp, augmented_t = summarize_results(
        os.path.join(args.out_dir, "augmented"),
        augmented_tp,
        augmented_fn,
        augmented_fp,
        augmented_t,
        var_types=args.var_types,
        sv_length=args.sv_length,
        regions=args.regions,
        bed_either=args.bed_either,
        java=args.java)

    # Optional follow-up, only when both extra VCFs were supplied.
    if args.master_vcf and args.call_vcf:
        match_false(augmented_fp,
                    [args.call_vcf, args.master_vcf, augmented_fn],
                    args.out_dir, args.sample, args.log_to_file,
                    args.vcfeval_options, sdf, args.java)
        match_false(augmented_fn, [args.call_vcf], args.out_dir, args.sample,
                    args.log_to_file, args.vcfeval_options, sdf, args.java)

    LOGGER.info(
        "Variant comparison done.\nTrue positive: {0}\nFalse negative: {1}\nFalse positive: {2}\n"
        .format(augmented_tp, augmented_fn, augmented_fp))
"Path to file containing concatenation of real insertion sequences", required=False) rand_dgv_group.add_argument("--sv_dgv", metavar="DGV_FILE", help="DGV file containing structural variants", required=False) rand_dgv_group.add_argument( "--sv_prop_het", metavar="FLOAT", help="Proportion of heterozygous structural variants", default=0.6, type=float) args = main_parser.parse_args() args.java = utils.get_java(args.java) check_java(args.java) utils.JAVA_XMX = utils.JAVA_XMX + args.java_max_mem makedirs([args.out_dir]) # Setup logging FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s' loglevel = get_loglevel(args.loglevel) if not args.log_to_stderr: logging.basicConfig(filename=os.path.join(args.out_dir, "varsim.log"), filemode="w", level=loglevel, format=FORMAT) else: logging.basicConfig(level=loglevel, format=FORMAT)
parse_jsons(jsonfile, stats) print("Non-SV stats") print_stats(stats) sv_stats = {k: {ii: 0 for ii in metrics} for k in var_types} parse_jsons(jsonfile, sv_stats, count_sv=True) print("SV stats") print_stats(sv_stats) all_stats = {k: {ii: 0 for ii in metrics} for k in var_types} parse_jsons(jsonfile, all_stats, count_all=True) print("Overall stats") print_stats(all_stats) return tp, fn, fp, t if __name__ == "__main__": utils.check_java() main_parser = argparse.ArgumentParser(description="VarSim: A high-fidelity simulation validation framework", formatter_class=argparse.ArgumentDefaultsHelpFormatter) main_parser.add_argument("--reference", metavar="FASTA", help="reference filename", required=True, type=str) main_parser.add_argument("--sdf", metavar="SDF", help="SDF formatted reference folder", required=False, type=str, default='') main_parser.add_argument("--out_dir", metavar="OUTDIR", help="output folder", required=True, type=str) main_parser.add_argument("--vcfs", metavar="VCF", help="variant calls to be evaluated", nargs="+", default=[], required = True) main_parser.add_argument("--var_types", metavar="VARTYPE", help="variant types", nargs="+", default=['SNP','Insertion','Complex','Deletion'], choices = ['SNP', 'Deletion', 'Insertion', 'Inversion', 'TandemDup', 'Complex', 'TransDup', 'TansDel', 'InterDup', 'Translocation'], required = False) main_parser.add_argument("--true_vcf", metavar="VCF", help="Input small variant sampling VCF, usually dbSNP", required = True) main_parser.add_argument("--regions", help="BED file to restrict analysis [Optional]", required = False, type=str) main_parser.add_argument("--sample", metavar = "SAMPLE", help="sample name", required = False, type=str) main_parser.add_argument("--exclude_filtered", action = 'store_true', help="only consider variants with PASS or . in FILTER column", required = False)