def _build_somatic_arg_parser():
    """Build the command-line parser for the VarSim somatic workflow.

    Returns:
        An ``argparse.ArgumentParser`` with all general, input-VCF,
        pipeline-control, RandVCF2VCF, DWGSIM and ART options registered.
    """
    main_parser = argparse.ArgumentParser(
        description="VarSim: somatic workflow",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    main_parser.add_argument("--out_dir",
                             metavar="Out directory",
                             help="Output directory",
                             default="somatic_out")
    main_parser.add_argument("--work_dir",
                             metavar="Work directory",
                             help="Work directory",
                             default="somatic_work")
    main_parser.add_argument("--log_dir",
                             metavar="Log directory",
                             help="Directory to log to",
                             default="somatic_log")
    # NOTE: ``type=file`` (Python 2 builtin) makes argparse open the path, so
    # a missing file is rejected at parse time; only ``.name`` is used later.
    main_parser.add_argument("--reference",
                             metavar="FASTA",
                             help="Reference genome",
                             required=True,
                             type=file)
    main_parser.add_argument("--seed",
                             metavar="INT",
                             help="Random number seed",
                             type=int,
                             default=0)
    main_parser.add_argument("--sex",
                             metavar="Sex",
                             help="Sex of the person (MALE/FEMALE)",
                             required=False,
                             type=str,
                             choices=["MALE", "FEMALE"],
                             default="MALE")
    main_parser.add_argument("--id",
                             metavar="id",
                             help="Sample ID",
                             required=True)
    main_parser.add_argument("--simulator",
                             metavar="simulator",
                             help="Read simulator to use",
                             required=False,
                             type=str,
                             choices=["art", "dwgsim"],
                             default="art")
    main_parser.add_argument(
        "--simulator_executable",
        metavar="PATH",
        help="Path to the executable of the read simulator chosen",
        required=True,
        type=file)
    main_parser.add_argument("--varsim_jar",
                             metavar="PATH",
                             help="Path to VarSim.jar (deprecated)",
                             type=file,
                             default=None,
                             required=False)
    main_parser.add_argument("--read_length",
                             metavar="INT",
                             help="Length of read to simulate",
                             default=100,
                             type=int)
    main_parser.add_argument(
        "--nlanes",
        metavar="INT",
        help=
        "Number of lanes to generate, coverage will be divided evenly over the lanes. Simulation is parallized over lanes. Each lane will have its own pair of files",
        default=3,
        type=int)
    main_parser.add_argument("--total_coverage",
                             metavar="FLOAT",
                             help="Total coverage to simulate",
                             default=1.0,
                             type=float)
    main_parser.add_argument("--mean_fragment_size",
                             metavar="INT",
                             help="Mean fragment size",
                             default=350,
                             type=int)
    main_parser.add_argument("--sd_fragment_size",
                             metavar="INT",
                             help="Standard deviation of fragment size",
                             default=50,
                             type=int)

    main_parser.add_argument("--force_five_base_encoding",
                             action="store_true",
                             help="Force bases to be ACTGN")
    main_parser.add_argument("--filter",
                             action="store_true",
                             help="Only use PASS variants")
    main_parser.add_argument("--keep_temp",
                             action="store_true",
                             help="Keep temporary files")
    main_parser.add_argument("--java_max_mem",
                             metavar="XMX",
                             help="max java memory",
                             default="10g",
                             type=str)
    main_parser.add_argument("--java",
                             metavar="PATH",
                             help="path to java",
                             default="java",
                             type=str)
    main_parser.add_argument("--python",
                             metavar="PATH",
                             help="path to python",
                             default="python",
                             type=str)
    main_parser.add_argument('--version',
                             action='version',
                             version=get_version())

    input_vcf_group = main_parser.add_argument_group("Input VCFs options")
    input_vcf_group.add_argument(
        "--cosmic_vcf",
        metavar="VCF",
        help=
        "COSMIC database VCF. Need to specify when random COSMIC sampling is enabled."
    )
    input_vcf_group.add_argument("--normal_vcf",
                                 metavar="VCF",
                                 help="Normal VCF from previous VarSim run",
                                 required=True)
    input_vcf_group.add_argument("--somatic_vcfs",
                                 metavar="VCF",
                                 nargs="+",
                                 help="Somatic VCF",
                                 default=[])
    input_vcf_group.add_argument(
        "--merge_priority",
        choices=["sn", "ns"],
        help=
        "Priority of merging (lowest first) somatic (s) and normal truth (n).",
        default="sn")

    pipeline_control_group = main_parser.add_argument_group(
        "Pipeline control options. Disable parts of the pipeline.")
    pipeline_control_group.add_argument("--disable_rand_vcf",
                                        action="store_true",
                                        help="Disable RandVCF2VCF somatic")
    pipeline_control_group.add_argument("--disable_vcf2diploid",
                                        action="store_true",
                                        help="Disable vcf2diploid")
    pipeline_control_group.add_argument("--disable_sim",
                                        action="store_true",
                                        help="Disable read simulation")

    # RandVCF2VCF seed num_SNP num_INS num_DEL num_MNP num_COMPLEX percent_novel min_length_lim max_length_lim reference_file file.vcf
    rand_vcf_group = main_parser.add_argument_group(
        "RandVCF2VCF somatic options")
    rand_vcf_group.add_argument("--som_num_snp",
                                metavar="INT",
                                help="Number of somatic SNPs",
                                default=9000,
                                type=int)
    rand_vcf_group.add_argument("--som_num_ins",
                                metavar="INT",
                                help="Number of somatic insertions",
                                default=1000,
                                type=int)
    rand_vcf_group.add_argument("--som_num_del",
                                metavar="INT",
                                help="Number of somatic deletions",
                                default=1000,
                                type=int)
    rand_vcf_group.add_argument("--som_num_mnp",
                                metavar="INT",
                                help="Number of somatic MNPs",
                                default=100,
                                type=int)
    rand_vcf_group.add_argument("--som_num_complex",
                                metavar="INT",
                                help="Number of somatic complex variants",
                                default=100,
                                type=int)
    rand_vcf_group.add_argument("--som_min_length_lim",
                                metavar="INT",
                                help="Min length lim",
                                default=0,
                                type=int)
    rand_vcf_group.add_argument("--som_max_length_lim",
                                metavar="INT",
                                help="Max length lim",
                                default=49,
                                type=int)
    rand_vcf_group.add_argument(
        "--som_prop_het",
        metavar="FLOAT",
        help="Proportion of somatic heterozygous variants",
        default=1.0,
        type=float)
    rand_vcf_group.add_argument(
        "--sv_insert_seq",
        metavar="FILE",
        help=
        "Path to file containing concatenation of real insertion sequences",
        type=file,
        required=True)

    dwgsim_group = main_parser.add_argument_group("DWGSIM options")
    dwgsim_group.add_argument("--dwgsim_start_e",
                              metavar="first_base_error_rate",
                              help="Error rate on the first base",
                              default=0.0001,
                              type=float)
    dwgsim_group.add_argument("--dwgsim_end_e",
                              metavar="last_base_error_rate",
                              help="Error rate on the last base",
                              default=0.0015,
                              type=float)
    dwgsim_group.add_argument("--dwgsim_options",
                              help="DWGSIM command-line options",
                              default="",
                              required=False)

    art_group = main_parser.add_argument_group("ART options")
    art_group.add_argument("--profile_1",
                           metavar="profile_file1",
                           help="Profile for first end",
                           default=None,
                           type=file)
    art_group.add_argument("--profile_2",
                           metavar="profile_file2",
                           help="Profile for second end",
                           default=None,
                           type=file)
    art_group.add_argument("--art_options",
                           help="ART command-line options",
                           default="",
                           required=False)
    return main_parser


def _tag_somatic_vcf(src_vcf, dst_vcf, count):
    """Copy ``src_vcf`` to ``dst_vcf``, tagging every record's ID column.

    Header lines (starting with ``#``) are copied verbatim.  For every record
    the third tab-separated field is replaced by ``VARSIMSOMATIC<count>`` when
    it is ``.``, otherwise ``,VARSIMSOMATIC<count>`` is appended to it.

    Args:
        src_vcf: Path of the VCF to read.
        dst_vcf: Path of the tagged copy to write.
        count: Running record counter to start numbering from.

    Returns:
        The counter value after the last record of this file.
    """
    with open(src_vcf, "r") as vcf_fd, open(dst_vcf, "w") as copied_vcf_fd:
        for line in vcf_fd:
            # Blank lines (e.g. a trailing newline) have no ID column; pass
            # them through instead of failing with an IndexError.
            if line.startswith("#") or not line.strip():
                copied_vcf_fd.write(line)
                continue
            line_fields = line.split("\t")
            if line_fields[2] == ".":
                line_fields[2] = "VARSIMSOMATIC%d" % count
            else:
                line_fields[2] = "%s,VARSIMSOMATIC%d" % (line_fields[2],
                                                         count)
            copied_vcf_fd.write("\t".join(line_fields))
            count += 1
    return count


def _split_truth_vcf(tumor_vcf, normal_vcf, somatic_vcf):
    """Split a truth VCF into normal and somatic VCFs.

    Header lines go to both outputs; a record goes to ``somatic_vcf`` when it
    contains the text ``VARSIMSOMATIC`` anywhere, otherwise to ``normal_vcf``.
    """
    with open(tumor_vcf, "r") as tumor_truth_fd, \
            open(normal_vcf, "w") as normal_vcf_fd, \
            open(somatic_vcf, "w") as somatic_vcf_fd:
        for line in tumor_truth_fd:
            if line.startswith("#"):
                somatic_vcf_fd.write(line)
                normal_vcf_fd.write(line)
            elif "VARSIMSOMATIC" in line:
                somatic_vcf_fd.write(line)
            else:
                normal_vcf_fd.write(line)


def varsim_somatic_main():
    """Command-line entry point of the VarSim somatic workflow.

    Steps, in order:
      1. Parse the command line and set up Java, directories and logging.
      2. Unless ``--disable_rand_vcf`` is given, sample variants from the
         COSMIC VCF into ``<out_dir>/random.cosmic.vcf``.
      3. Copy every somatic VCF into ``<out_dir>/somatic_vcfs`` while tagging
         each record's ID with ``VARSIMSOMATIC<n>``.
      4. Launch VarSim (as a subprocess) on the normal and tagged somatic
         VCFs, together with VCF statistics, and wait for them.
      5. Split ``<out_dir>/<id>.truth.vcf`` into ``<id>_norm.vcf`` and
         ``<id>_somatic.vcf`` and compute statistics for both.

    Exits with ``os.EX_USAGE`` when a required executable or the COSMIC VCF
    is missing.
    """
    args = _build_somatic_arg_parser().parse_args()

    args.java = utils.get_java(args.java)
    check_java(args.java)
    utils.JAVA_XMX = utils.JAVA_XMX + args.java_max_mem
    makedirs([args.log_dir, args.out_dir])

    FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
    logging.basicConfig(filename=os.path.join(args.log_dir, "varsim.log"),
                        filemode="w",
                        level=logging.DEBUG,
                        format=FORMAT)
    logger = logging.getLogger(varsim_somatic_main.__name__)

    if not args.disable_sim:
        if not args.simulator_executable:
            logger.error(
                "Please specify %s binary with --simulator_executable option",
                args.simulator)
            sys.exit(os.EX_USAGE)
        check_executable(args.simulator_executable.name)

    t_s = time.time()

    cosmic_sampled_vcfs = []
    if not args.disable_rand_vcf:
        if not args.cosmic_vcf:
            logger.error(
                "COSMIC database VCF not specified using --cosmic_vcf")
            sys.exit(os.EX_USAGE)
        random_cosmic_vcf = os.path.join(args.out_dir, "random.cosmic.vcf")
        random_cosmic_err = os.path.join(args.log_dir, "random.cosmic.err")
        cosmic_sampled_vcfs = [random_cosmic_vcf]

        # Not able to support novel yet for COSMIC variants
        randvcf_options = RandVCFOptions(args.som_num_snp, args.som_num_ins,
                                         args.som_num_del, args.som_num_mnp,
                                         args.som_num_complex, 0,
                                         args.som_min_length_lim,
                                         args.som_max_length_lim,
                                         args.som_prop_het)
        # Close both handles before the sampled VCF is read back below.
        with open(random_cosmic_vcf, "w") as rand_vcf_stdout, \
                open(random_cosmic_err, "w") as rand_vcf_stderr:
            run_randvcf(os.path.realpath(args.cosmic_vcf), rand_vcf_stdout,
                        rand_vcf_stderr, args.seed, args.sex, randvcf_options,
                        args.reference.name, args.java)

    normal_vcfs = [args.normal_vcf]
    somatic_vcfs = cosmic_sampled_vcfs + args.somatic_vcfs
    fixed_somatic_vcfs = []
    if somatic_vcfs:
        vcfs_dir = os.path.join(args.out_dir, "somatic_vcfs")
        makedirs([vcfs_dir])
        # The counter runs across all files so that every tag is unique.
        count = 0
        for index, vcf in enumerate(somatic_vcfs):
            copied_vcf = os.path.join(vcfs_dir, "%d.vcf" % index)
            logger.info(
                "Copying somatic VCF %s to %s and adding VARSIMSOMATIC id to entries if missing",
                vcf, copied_vcf)
            count = _tag_somatic_vcf(vcf, copied_vcf, count)
            fixed_somatic_vcfs.append(copied_vcf)

    if args.merge_priority == "sn":
        vcf_files = fixed_somatic_vcfs + normal_vcfs
    else:
        vcf_files = normal_vcfs + fixed_somatic_vcfs
    # A real list (not a lazy ``map``) is required for the concatenation
    # below and because the paths are consumed more than once.
    vcf_files = [os.path.realpath(vcf) for vcf in vcf_files if vcf]

    processes = run_vcfstats(vcf_files, args.out_dir, args.log_dir, args.java)

    vcf_arg_list = ["--vcfs"] + vcf_files

    # need to fix the store true ones
    filter_arg_list = ["--filter"] if args.filter else []
    disable_sim_arg_list = ["--disable_sim"] if args.disable_sim else []
    force_five_base_encoding_arg_list = [
        "--force_five_base_encoding"
    ] if args.force_five_base_encoding else []
    keep_temp_arg_list = ["--keep_temp"] if args.keep_temp else []
    profile_1_arg_list = ["--profile_1", args.profile_1.name
                          ] if args.profile_1 is not None else []
    profile_2_arg_list = ["--profile_2", args.profile_2.name
                          ] if args.profile_2 is not None else []
    other_varsim_opts = []
    if args.simulator == "dwgsim":
        other_varsim_opts = [
            "--dwgsim_start_e",
            str(args.dwgsim_start_e), "--dwgsim_end_e",
            str(args.dwgsim_end_e)
        ]
        if args.dwgsim_options:
            other_varsim_opts += ["--dwgsim_options", str(args.dwgsim_options)]
    elif args.simulator == "art" and args.art_options:
        other_varsim_opts += ["--art_options", args.art_options]

    args.python = utils.get_python(args.python)
    varsim_command = [
        args.python, os.path.realpath(VARSIM_PY),
        "--out_dir", str(os.path.realpath(args.out_dir)),
        "--work_dir", str(os.path.realpath(args.work_dir)),
        "--log_dir", str(os.path.realpath(os.path.join(args.log_dir, "varsim"))),
        "--reference", str(os.path.realpath(args.reference.name)),
        "--seed", str(args.seed),
        "--sex", str(args.sex),
        "--id", str(args.id),
        "--simulator", str(args.simulator),
        "--simulator_executable", str(args.simulator_executable.name),
        "--read_length", str(args.read_length),
        "--nlanes", str(args.nlanes),
        "--total_coverage", str(args.total_coverage),
        "--mean_fragment_size", str(args.mean_fragment_size),
        "--sd_fragment_size", str(args.sd_fragment_size),
        "--disable_rand_vcf",
        "--disable_rand_dgv",
        "--sv_insert_seq", args.sv_insert_seq.name,
    ]
    varsim_command += (other_varsim_opts + vcf_arg_list + filter_arg_list +
                       disable_sim_arg_list +
                       force_five_base_encoding_arg_list +
                       keep_temp_arg_list + profile_1_arg_list +
                       profile_2_arg_list)
    # NOTE(review): the command is joined and run through the shell, so paths
    # or option strings containing spaces/shell metacharacters are re-split
    # by the shell.  Kept as is because the simulator option strings may
    # rely on that splitting -- confirm before switching to an argv list.
    varsim_command = " ".join(varsim_command)

    # Run VarSim; the log handles are closed once all processes finished.
    with open(os.path.join(args.log_dir, "som_varsim.out"), "w") as varsim_stdout, \
            open(os.path.join(args.log_dir, "som_varsim.log"), "w") as varsim_stderr:
        p_varsim = subprocess.Popen(varsim_command,
                                    stdout=varsim_stdout,
                                    stderr=varsim_stderr,
                                    shell=True)
        logger.info("Executing command %s with pid %s", varsim_command,
                    p_varsim.pid)
        processes.append(p_varsim)

        processes = monitor_processes(processes)

    # Split the tumor truth VCF into normal variants and somatic variants
    tumor_vcf = os.path.realpath(
        os.path.join(args.out_dir, "%s.truth.vcf" % args.id))
    normal_vcf = os.path.join(args.out_dir, "%s_norm.vcf" % args.id)
    somatic_vcf = os.path.join(args.out_dir, "%s_somatic.vcf" % args.id)
    logger.info("Splitting the truth VCF %s into normal and somatic VCFs",
                tumor_vcf)
    _split_truth_vcf(tumor_vcf, normal_vcf, somatic_vcf)

    # Wait for the statistics jobs so "Done!" is only reported once they
    # have actually finished.
    monitor_processes(
        run_vcfstats([normal_vcf, somatic_vcf], args.out_dir, args.log_dir,
                     args.java))

    logger.info("Done! (%g hours)", (time.time() - t_s) / 3600.0)
Esempio n. 2
0
def varsim_multi(reference,
                 simulator,
                 simulator_exe,
                 total_coverage,
                 variant_vcfs=None,
                 sampling_vcf=None,
                 dgv_file=None,
                 regions=None,
                 randvcf_options=None,
                 randdgv_options=None,
                 nlanes=1,
                 simulator_options="",
                 samples=None,
                 out_dir="out",
                 sv_insert_seq=None,
                 seed=0,
                 sex="MALE",
                 remove_filtered=False,
                 keep_temp=False,
                 force_five_base_encoding=False,
                 lift_ref=False,
                 disable_vcf2diploid=False,
                 samples_random=0):
    """Run the VarSim simulation once per sample.

    The reference and ``variant_vcfs`` are first restricted via
    ``gen_restricted_ref_and_vcfs``.  Then, for every sample (the given
    ``samples`` followed by ``samples_random`` generated names
    ``VarSim0``, ``VarSim1``, ...), variants are optionally sampled with
    RandVCF and ``varsim_main`` is invoked in ``<out_dir>/<sample>``.
    Finally the sample names are written to ``<out_dir>/samples.txt``.

    Args:
        total_coverage: Iterable of per-sample coverages, paired positionally
            with the samples; pairing stops at the shorter of the two.
        variant_vcfs: Input VCFs; ``None`` (default) means no VCFs.
        samples: Named samples; ``None`` (default) means none.
        seed: Base seed; sample ``i`` uses ``seed + 1000 * i``.
        (Remaining arguments are forwarded to the helpers called below.)
    """
    # ``None`` defaults avoid sharing one mutable list between calls.
    variant_vcfs = [] if variant_vcfs is None else variant_vcfs
    samples = [] if samples is None else samples

    logger = logging.getLogger(varsim_multi.__name__)

    makedirs([out_dir])

    restricted_dir = os.path.join(out_dir, "restricted")

    restricted_reference, restricted_vcfs = gen_restricted_ref_and_vcfs(
        reference,
        variant_vcfs,
        regions,
        samples,
        restricted_dir,
        flank=0,
        short_contig_names=False)

    if regions and sampling_vcf:
        region_restricted_dir = os.path.join(out_dir, "region_restricted")
        merged_bed = os.path.join(out_dir, "merged.bed")
        pybedtools.BedTool(regions).merge().saveas(merged_bed)
        _, [restricted_sampling_vcf] = gen_restricted_ref_and_vcfs(
            reference, [sampling_vcf],
            merged_bed, [],
            region_restricted_dir,
            flank=0)
        # Now lift over the restricted_sampling_vcf to get the region-limited VCF
        sampling_vcf = lift_vcfs([restricted_sampling_vcf],
                                 os.path.join(
                                     region_restricted_dir,
                                     "region-restricted-sampling.vcf"),
                                 reference)

    all_samples = samples + ["VarSim%d" % i for i in xrange(samples_random)]

    for index, (sample, coverage) in enumerate(zip(all_samples,
                                                   total_coverage)):
        sample_dir = os.path.join(out_dir, sample)
        sample_seed = seed + 1000 * index
        makedirs([sample_dir])
        logger.info("Simulating sample {} in {}".format(sample, sample_dir))

        # Run RandVCF first to get the sampled variants for the sample
        if randvcf_options:
            sampled_vcf = os.path.join(sample_dir, "randvcf.vcf")
            with open(sampled_vcf, "w") as randvcf_out, open(
                    os.path.join(sample_dir, "randvcf.err"),
                    "w") as randvcf_log:
                run_randvcf(sampling_vcf, randvcf_out, randvcf_log,
                            sample_seed, sex, randvcf_options,
                            reference).wait()
            pysam.tabix_index(sampled_vcf, force=True, preset='vcf')
            sampled_vcf = "{}.gz".format(sampled_vcf)
            # Now generate the restricted sampled VCF for the sample
            _, [restricted_sampled_vcf] = gen_restricted_ref_and_vcfs(
                reference, [sampled_vcf],
                regions, [],
                os.path.join(sample_dir, "restricted_randvcf"),
                flank=0)
            # NOTE(review): only the randomly named samples
            # (index >= len(samples)) receive restricted_vcfs here -- confirm
            # this is intended and not an inverted condition.
            sample_variant_vcfs = (restricted_vcfs if index >= len(samples)
                                   else []) + [restricted_sampled_vcf]
        else:
            sample_variant_vcfs = restricted_vcfs

        varsim_main(restricted_reference, simulator, simulator_exe, coverage,
                    sample_variant_vcfs, None, dgv_file, None, randdgv_options,
                    nlanes, simulator_options, sample,
                    os.path.join(sample_dir, "log"),
                    os.path.join(sample_dir, "out"), sv_insert_seq,
                    sample_seed, sex, remove_filtered, keep_temp,
                    force_five_base_encoding, lift_ref, disable_vcf2diploid)

    with open(os.path.join(out_dir, "samples.txt"), "w") as samples_fd:
        samples_fd.write("\n".join(all_samples))
Esempio n. 3
0
def varsim_multi(reference,
                 simulator,
                 simulator_exe,
                 total_coverage,
                 variant_vcfs=None,
                 sampling_vcf=None,
                 dgv_file=None,
                 regions=None,
                 randvcf_options=None,
                 randdgv_options=None,
                 nlanes=1,
                 simulator_options="",
                 samples=None,
                 out_dir="out",
                 sv_insert_seq=None,
                 seed=0,
                 sex="MALE",
                 remove_filtered=False,
                 keep_temp=False,
                 force_five_base_encoding=False,
                 lift_ref=False,
                 disable_vcf2diploid=False,
                 samples_random=0,
                 java="java"):
    """Run the VarSim simulation once per sample, with optional DGV sampling.

    The reference and ``variant_vcfs`` are first restricted via
    ``gen_restricted_ref_and_vcfs``.  When ``dgv_file`` is given it is
    converted to ``<out_dir>/tmp/dgv.vcf`` with ``run_randdgv``.  When
    ``regions`` is given, the sampling VCF and the DGV VCF are additionally
    restricted to the merged regions and lifted over.  Then, for every sample
    (the given ``samples`` followed by ``samples_random`` generated names
    ``VarSim0``, ``VarSim1``, ...), variants are optionally sampled with
    ``run_randvcf`` and ``varsim_main`` is invoked in ``<out_dir>/<sample>``.
    Finally the sample names are written to ``<out_dir>/samples.txt``.

    Args:
        total_coverage: Iterable of per-sample coverages, paired positionally
            with the samples; pairing stops at the shorter of the two.
        variant_vcfs: Input VCFs; ``None`` (default) means no VCFs.
        samples: Named samples; ``None`` (default) means none.  Only these
            samples receive the restricted input VCFs.
        seed: Base seed; sample ``i`` uses ``seed + 1000 * i``.
        java: Java executable forwarded to the helpers.
        (Remaining arguments are forwarded to the helpers called below.)

    Raises:
        AssertionError: If ``dgv_file`` is given without ``sv_insert_seq`` or
            without ``randdgv_options``.
    """
    # ``None`` defaults avoid sharing one mutable list between calls.
    variant_vcfs = [] if variant_vcfs is None else variant_vcfs
    samples = [] if samples is None else samples

    logger = logging.getLogger(varsim_multi.__name__)

    makedirs([out_dir])

    restricted_dir = os.path.join(out_dir, "restricted")

    restricted_reference, restricted_vcfs = gen_restricted_ref_and_vcfs(
        reference,
        variant_vcfs,
        regions,
        samples,
        restricted_dir,
        flank=0,
        short_contig_names=False)
    dgv_vcf = None

    if dgv_file:
        assert sv_insert_seq, "SV insertion sequence file is required."
        # The options object is copied and modified below, so it must exist;
        # fail with a clear message rather than an AttributeError on None.
        assert randdgv_options is not None, \
            "randdgv_options is required when dgv_file is given."
        dgv_vcf_dir = os.path.join(out_dir, "tmp")
        makedirs([dgv_vcf_dir])
        dgv_vcf = os.path.join(dgv_vcf_dir, "dgv.vcf")
        dgv_log_dir = os.path.join(out_dir, "log")
        makedirs([dgv_log_dir])
        dgv_err_file = os.path.join(dgv_log_dir, "dgv2vcf.err")
        # Work on a copy so the caller's options are left untouched.
        randdgv_options2vcf = copy.copy(randdgv_options)
        randdgv_options2vcf.output_all = "-all"
        with open(dgv_vcf, "w") as dgv2vcf_out, open(dgv_err_file,
                                                     "w") as dgv2vcf_log:
            run_randdgv(dgv_file, dgv2vcf_out, dgv2vcf_log, seed, sex,
                        randdgv_options2vcf, reference, sv_insert_seq, java)

    if regions:
        merged_bed = os.path.join(out_dir, "merged.bed")
        pybedtools.BedTool(regions).merge().saveas(merged_bed)
        restricted_dir = os.path.join(out_dir, "region_restricted")
        if sampling_vcf:
            _, [restricted_sampling_vcf
                ] = gen_restricted_ref_and_vcfs(reference, [sampling_vcf],
                                                merged_bed, [],
                                                restricted_dir,
                                                flank=0)
            # Now lift over the restricted_sampling_vcf to get the region-limited VCF
            sampling_vcf = lift_vcfs([restricted_sampling_vcf],
                                     os.path.join(
                                         restricted_dir,
                                         "region-restricted-sampling.vcf"),
                                     reference)
        if dgv_vcf:
            convertCN([dgv_vcf], "two2one")
            dgv_vcf = sort_and_compress(dgv_vcf)
            _, [restricted_dgv_vcf
                ] = gen_restricted_ref_and_vcfs(reference, [dgv_vcf],
                                                merged_bed, [],
                                                restricted_dir,
                                                flank=0)
            # Now lift over the restricted_dgv_vcf to get the region-limited VCF
            dgv_vcf = lift_vcfs([restricted_dgv_vcf],
                                os.path.join(restricted_dir,
                                             "region-restricted-dgv.vcf"),
                                reference)

    all_samples = samples + ["VarSim%d" % i for i in xrange(samples_random)]

    for index, (sample, coverage) in enumerate(zip(all_samples,
                                                   total_coverage)):
        sample_dir = os.path.join(out_dir, sample)
        sample_seed = seed + 1000 * index
        makedirs([sample_dir])
        logger.info("Simulating sample {} in {}".format(sample, sample_dir))
        # Fresh list per sample: named samples start from the restricted
        # input VCFs, randomly generated samples start empty.
        sample_variant_vcfs = list(
            restricted_vcfs if index < len(samples) else [])

        # Run RandVCF first to get the sampled variants for the sample
        if randvcf_options and sampling_vcf:
            sampled_vcf = os.path.join(sample_dir, "randvcf.vcf")
            with open(sampled_vcf, "w") as randvcf_out, open(
                    os.path.join(sample_dir, "randvcf.err"),
                    "w") as randvcf_log:
                run_randvcf(sampling_vcf, randvcf_out, randvcf_log,
                            sample_seed, sex, randvcf_options, reference, java)
            sampled_vcf = sort_and_compress(sampled_vcf)
            # Now generate the restricted sampled VCF for the sample
            _, [restricted_sampled_vcf] = gen_restricted_ref_and_vcfs(
                reference, [sampled_vcf],
                regions, [],
                os.path.join(sample_dir, "restricted_randvcf"),
                flank=0)
            sample_variant_vcfs.append(restricted_sampled_vcf)

        if randdgv_options and dgv_vcf:
            sampled_dgv_vcf = os.path.join(sample_dir, "randdgvvcf.vcf")
            randdgvvcf_options = randdgv_options2randvcf_options(
                randdgv_options)
            with open(sampled_dgv_vcf, "w") as randdgvvcf_out, open(
                    os.path.join(sample_dir, "randdgvvcf.err"),
                    "w") as randdgvvcf_log:
                run_randvcf(dgv_vcf, randdgvvcf_out, randdgvvcf_log,
                            sample_seed, sex, randdgvvcf_options, reference,
                            java)
            sampled_dgv_vcf = sort_and_compress(sampled_dgv_vcf)
            # Now generate the restricted sampled dgv VCF for the sample
            _, [restricted_sampled_dgv_vcf] = gen_restricted_ref_and_vcfs(
                reference, [sampled_dgv_vcf],
                regions, [],
                os.path.join(sample_dir, "restricted_randdgvvcf"),
                flank=0)
            convertCN([restricted_sampled_dgv_vcf], "one2two")
            sample_variant_vcfs.append(restricted_sampled_dgv_vcf)

        varsim_main(restricted_reference,
                    simulator,
                    simulator_exe,
                    coverage,
                    sample_variant_vcfs,
                    None,
                    dgv_file,
                    None,
                    randdgv_options,
                    nlanes,
                    simulator_options,
                    sample,
                    os.path.join(sample_dir, "log"),
                    os.path.join(sample_dir, "out"),
                    sv_insert_seq,
                    sample_seed,
                    sex,
                    remove_filtered,
                    keep_temp,
                    force_five_base_encoding,
                    lift_ref,
                    disable_vcf2diploid,
                    java=java)

    with open(os.path.join(out_dir, "samples.txt"), "w") as samples_fd:
        samples_fd.write("\n".join(all_samples))
Esempio n. 4
0
        if not args.cosmic_vcf:
            logger.error(
                "COSMIC database VCF not specified using --cosmic_vcf")
            sys.exit(os.EX_USAGE)
        rand_vcf_stdout = open(os.path.join(args.out_dir, "random.cosmic.vcf"),
                               "w")
        rand_vcf_stderr = open(os.path.join(args.log_dir, "random.cosmic.err"),
                               "w")
        cosmic_sampled_vcfs = [rand_vcf_stdout.name]

        # Not able to support novel yet for COSMIC variants
        monitor_processes([
            run_randvcf(os.path.realpath(args.cosmic_vcf), rand_vcf_stdout,
                        rand_vcf_stderr, args.seed, args.sex, args.som_num_snp,
                        args.som_num_ins, args.som_num_del, args.som_num_mnp,
                        args.som_num_complex, 0, args.som_min_length_lim,
                        args.som_max_length_lim,
                        os.path.realpath(args.reference.name),
                        args.som_prop_het)
        ])

    normal_vcfs = [args.normal_vcf]
    somatic_vcfs = cosmic_sampled_vcfs + args.somatic_vcfs
    fixed_somatic_vcfs = []
    if somatic_vcfs:
        vcfs_dir = os.path.join(args.out_dir, "somatic_vcfs")
        makedirs([vcfs_dir])
        count = 0
        for index, vcf in enumerate(somatic_vcfs):
            copied_vcf = os.path.join(vcfs_dir, "%d.vcf" % index)
            logger.info(
Esempio n. 5
0
def _tag_somatic_vcfs(somatic_vcfs, vcfs_dir, logger):
    """Copy each somatic VCF into ``vcfs_dir`` and tag every record as somatic.

    The i-th input is written to ``<vcfs_dir>/<i>.vcf``. Header lines (starting
    with ``#``) are copied verbatim. For every other record the ID column
    (third tab-separated field) becomes ``VARSIMSOMATIC<n>`` when it was ``.``,
    otherwise ``<old id>,VARSIMSOMATIC<n>``. ``n`` is a single counter shared
    by all inputs, so tags are unique across files.

    :param somatic_vcfs: paths of the somatic VCFs to copy
    :param vcfs_dir: existing directory that receives the tagged copies
    :param logger: logger used to report each copy
    :return: list of paths of the tagged copies, in input order
    """
    tagged_vcfs = []
    count = 0
    for index, vcf in enumerate(somatic_vcfs):
        copied_vcf = os.path.join(vcfs_dir, "%d.vcf" % index)
        logger.info("Copying somatic VCF %s to %s and adding VARSIMSOMATIC id to entries if missing" % (vcf, copied_vcf))
        with open(vcf, "r") as vcf_fd, open(copied_vcf, "w") as copied_vcf_fd:
            for line in vcf_fd:
                # Blank lines (e.g. a trailing newline) have no ID column; pass
                # them through instead of failing with IndexError below.
                if line.startswith("#") or not line.strip():
                    copied_vcf_fd.write(line)
                    continue
                line_fields = line.split("\t")
                tag = "VARSIMSOMATIC%d" % count
                # NOTE(review): the VCF spec separates multiple IDs with ';'.
                # The ',' is kept as-is because downstream tools may rely on
                # it -- confirm before changing.
                line_fields[2] = tag if line_fields[2] == "." else "%s,%s" % (line_fields[2], tag)
                copied_vcf_fd.write("\t".join(line_fields))
                count += 1
        tagged_vcfs.append(copied_vcf)
    return tagged_vcfs


def _split_truth_vcf(tumor_vcf, normal_vcf, somatic_vcf):
    """Split ``tumor_vcf`` into a normal VCF and a somatic VCF.

    Header lines go to both outputs. A record goes to ``somatic_vcf`` when the
    text ``VARSIMSOMATIC`` occurs anywhere in the line, otherwise to
    ``normal_vcf``.
    """
    with open(tumor_vcf, "r") as tumor_truth_fd, \
            open(normal_vcf, "w") as normal_vcf_fd, \
            open(somatic_vcf, "w") as somatic_vcf_fd:
        for line in tumor_truth_fd:
            if line.startswith("#"):
                somatic_vcf_fd.write(line)
                normal_vcf_fd.write(line)
            elif "VARSIMSOMATIC" in line:
                somatic_vcf_fd.write(line)
            else:
                normal_vcf_fd.write(line)


def varsim_somatic_main():
    """Command-line entry point of the VarSim somatic workflow.

    Steps, in order:

    1. Parse the command line and set up logging to ``<log_dir>/varsim.log``.
    2. Unless ``--disable_rand_vcf`` is given, sample variants from the COSMIC
       VCF with ``run_randvcf`` into ``<out_dir>/random.cosmic.vcf``.
    3. Copy all somatic VCFs into ``<out_dir>/somatic_vcfs`` and tag each
       record with a ``VARSIMSOMATIC`` id.
    4. Run ``run_vcfstats`` on the input VCFs and launch ``VARSIM_PY`` on the
       somatic + normal VCFs (ordered according to ``--merge_priority``).
    5. Split ``<out_dir>/<id>.truth.vcf`` into ``<id>_norm.vcf`` and
       ``<id>_somatic.vcf`` and run ``run_vcfstats`` on both.

    Exits with ``os.EX_USAGE`` when a required option combination is missing.
    """
    check_java()

    main_parser = argparse.ArgumentParser(description="VarSim: somatic workflow",
                                          formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    main_parser.add_argument("--out_dir", metavar="Out directory", help="Output directory",
                             default="somatic_out")
    main_parser.add_argument("--work_dir", metavar="Work directory", help="Work directory",
                             default="somatic_work")
    main_parser.add_argument("--log_dir", metavar="Log directory", help="Directory to log to",
                             default="somatic_log")
    main_parser.add_argument("--reference", metavar="FASTA", help="Reference genome", required=True, type=file)
    main_parser.add_argument("--seed", metavar="INT", help="Random number seed", type=int, default=0)
    main_parser.add_argument("--sex", metavar="Sex", help="Sex of the person (MALE/FEMALE)", required=False, type=str,
                             choices=["MALE", "FEMALE"], default="MALE")
    main_parser.add_argument("--id", metavar="id", help="Sample ID", required=True)
    main_parser.add_argument("--simulator", metavar="simulator", help="Read simulator to use", required=False, type=str,
                             choices=["art", "dwgsim"], default="art")
    main_parser.add_argument("--simulator_executable", metavar="PATH",
                             help="Path to the executable of the read simulator chosen",
                             required=True, type=file)
    main_parser.add_argument("--varsim_jar", metavar="PATH", help="Path to VarSim.jar (deprecated)", type=file,
                             default=DEFAULT_VARSIMJAR,
                             required=False)
    main_parser.add_argument("--read_length", metavar="INT", help="Length of read to simulate", default=100, type=int)
    main_parser.add_argument("--nlanes", metavar="INT",
                             help="Number of lanes to generate, coverage will be divided evenly over the lanes. Simulation is parallized over lanes. Each lane will have its own pair of files",
                             default=3, type=int)
    main_parser.add_argument("--total_coverage", metavar="FLOAT", help="Total coverage to simulate", default=1.0,
                             type=float)
    main_parser.add_argument("--mean_fragment_size", metavar="INT", help="Mean fragment size", default=350,
                             type=int)
    main_parser.add_argument("--sd_fragment_size", metavar="INT", help="Standard deviation of fragment size",
                             default=50, type=int)

    main_parser.add_argument("--force_five_base_encoding", action="store_true", help="Force bases to be ACTGN")
    main_parser.add_argument("--filter", action="store_true", help="Only use PASS variants")
    main_parser.add_argument("--keep_temp", action="store_true", help="Keep temporary files")
    main_parser.add_argument('--version', action='version', version=get_version())

    input_vcf_group = main_parser.add_argument_group("Input VCFs options")
    input_vcf_group.add_argument("--cosmic_vcf", metavar="VCF", help="COSMIC database VCF. Need to specify when random COSMIC sampling is enabled.")
    input_vcf_group.add_argument("--normal_vcf", metavar="VCF", help="Normal VCF from previous VarSim run", required=True)
    input_vcf_group.add_argument("--somatic_vcfs", metavar="VCF", nargs="+", help="Somatic VCF", default=[])
    input_vcf_group.add_argument("--merge_priority", choices=["sn", "ns"], help="Priority of merging (lowest first) somatic (s) and normal truth (n).", default="sn")

    pipeline_control_group = main_parser.add_argument_group("Pipeline control options. Disable parts of the pipeline.")
    pipeline_control_group.add_argument("--disable_rand_vcf", action="store_true", help="Disable RandVCF2VCF somatic")
    pipeline_control_group.add_argument("--disable_vcf2diploid", action="store_true", help="Disable vcf2diploid")
    pipeline_control_group.add_argument("--disable_sim", action="store_true", help="Disable read simulation")

    # RandVCF2VCF seed num_SNP num_INS num_DEL num_MNP num_COMPLEX percent_novel min_length_lim max_length_lim reference_file file.vcf
    rand_vcf_group = main_parser.add_argument_group("RandVCF2VCF somatic options")
    rand_vcf_group.add_argument("--som_num_snp", metavar="INT", help="Number of somatic SNPs", default=9000, type=int)
    rand_vcf_group.add_argument("--som_num_ins", metavar="INT", help="Number of somatic insertions", default=1000,
                                type=int)
    rand_vcf_group.add_argument("--som_num_del", metavar="INT", help="Number of somatic deletions", default=1000,
                                type=int)
    rand_vcf_group.add_argument("--som_num_mnp", metavar="INT", help="Number of somatic MNPs", default=100, type=int)
    rand_vcf_group.add_argument("--som_num_complex", metavar="INT", help="Number of somatic complex variants",
                                default=100, type=int)
    rand_vcf_group.add_argument("--som_min_length_lim", metavar="INT", help="Min length lim", default=0,
                                type=int)
    rand_vcf_group.add_argument("--som_max_length_lim", metavar="INT", help="Max length lim", default=49,
                                type=int)
    rand_vcf_group.add_argument("--som_prop_het", metavar="FLOAT", help="Proportion of somatic heterozygous variants",
                                default=1.0, type=float)
    rand_vcf_group.add_argument("--sv_insert_seq", metavar="FILE",
                                help="Path to file containing concatenation of real insertion sequences", type=file,
                                required=True)

    dwgsim_group = main_parser.add_argument_group("DWGSIM options")
    dwgsim_group.add_argument("--dwgsim_start_e", metavar="first_base_error_rate", help="Error rate on the first base",
                              default=0.0001, type=float)
    dwgsim_group.add_argument("--dwgsim_end_e", metavar="last_base_error_rate", help="Error rate on the last base",
                              default=0.0015, type=float)
    dwgsim_group.add_argument("--dwgsim_options", help="DWGSIM command-line options", default="", required=False)

    art_group = main_parser.add_argument_group("ART options")
    art_group.add_argument("--profile_1", metavar="profile_file1", help="Profile for first end", default=None, type=file)
    art_group.add_argument("--profile_2", metavar="profile_file2", help="Profile for second end", default=None, type=file)
    art_group.add_argument("--art_options", help="ART command-line options", default="", required=False)

    args = main_parser.parse_args()

    makedirs([args.log_dir, args.out_dir])

    log_format = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
    logging.basicConfig(filename=os.path.join(args.log_dir, "varsim.log"), filemode="w", level=logging.DEBUG,
                        format=log_format)
    logger = logging.getLogger(varsim_somatic_main.__name__)

    if not args.disable_sim:
        if not args.simulator_executable:
            logger.error("Please specify %s binary with --simulator_executable option" % args.simulator)
            sys.exit(os.EX_USAGE)
        check_executable(args.simulator_executable.name)

    t_s = time.time()

    cosmic_sampled_vcfs = []
    if not args.disable_rand_vcf:
        if not args.cosmic_vcf:
            logger.error("COSMIC database VCF not specified using --cosmic_vcf")
            sys.exit(os.EX_USAGE)
        rand_vcf_path = os.path.join(args.out_dir, "random.cosmic.vcf")
        rand_err_path = os.path.join(args.log_dir, "random.cosmic.err")
        cosmic_sampled_vcfs = [rand_vcf_path]

        # Not able to support novel yet for COSMIC variants
        randvcf_options = RandVCFOptions(args.som_num_snp, args.som_num_ins, args.som_num_del, args.som_num_mnp,
                                         args.som_num_complex, 0, args.som_min_length_lim, args.som_max_length_lim,
                                         args.som_prop_het)
        # The context manager closes our copies of the output handles once the
        # sampler has been monitored, instead of leaking them for the rest of
        # the run.
        with open(rand_vcf_path, "w") as rand_vcf_stdout, open(rand_err_path, "w") as rand_vcf_stderr:
            monitor_processes([run_randvcf(os.path.realpath(args.cosmic_vcf), rand_vcf_stdout, rand_vcf_stderr,
                                           args.seed, args.sex, randvcf_options, args.reference.name)])

    normal_vcfs = [args.normal_vcf]
    somatic_vcfs = cosmic_sampled_vcfs + args.somatic_vcfs
    fixed_somatic_vcfs = []
    if somatic_vcfs:
        vcfs_dir = os.path.join(args.out_dir, "somatic_vcfs")
        makedirs([vcfs_dir])
        fixed_somatic_vcfs = _tag_somatic_vcfs(somatic_vcfs, vcfs_dir, logger)

    if args.merge_priority == "sn":
        vcf_files = fixed_somatic_vcfs + normal_vcfs
    else:
        vcf_files = normal_vcfs + fixed_somatic_vcfs
    # Build a real list (not a lazy map object) because it is concatenated to
    # another list below; falsy entries are dropped as before.
    vcf_files = [os.path.realpath(vcf) for vcf in vcf_files if vcf]

    processes = run_vcfstats(vcf_files, args.out_dir, args.log_dir)

    # Run VarSim
    vcf_arg_list = ["--vcfs"] + vcf_files

    # need to fix the store true ones
    filter_arg_list = ["--filter"] if args.filter else []
    disable_sim_arg_list = ["--disable_sim"] if args.disable_sim else []
    force_five_base_encoding_arg_list = ["--force_five_base_encoding"] if args.force_five_base_encoding else []
    keep_temp_arg_list = ["--keep_temp"] if args.keep_temp else []
    profile_1_arg_list = ["--profile_1", args.profile_1.name] if args.profile_1 is not None else []
    profile_2_arg_list = ["--profile_2", args.profile_2.name] if args.profile_2 is not None else []
    other_varsim_opts = []
    if args.simulator == "dwgsim":
        other_varsim_opts = ["--dwgsim_start_e", str(args.dwgsim_start_e), "--dwgsim_end_e", str(args.dwgsim_end_e)]
        if args.dwgsim_options:
            other_varsim_opts += ["--dwgsim_options", str(args.dwgsim_options)]
    elif args.simulator == "art" and args.art_options:
        other_varsim_opts += ["--art_options", args.art_options]

    varsim_command = ["python", os.path.realpath(VARSIM_PY),
                      "--out_dir", str(os.path.realpath(args.out_dir)),
                      "--work_dir", str(os.path.realpath(args.work_dir)),
                      "--log_dir", str(os.path.realpath(os.path.join(args.log_dir, "varsim"))),
                      "--reference", str(os.path.realpath(args.reference.name)),
                      "--seed", str(args.seed),
                      "--sex", str(args.sex),
                      "--id", str(args.id),
                      "--simulator", str(args.simulator),
                      "--simulator_executable", str(args.simulator_executable.name),
                      "--read_length", str(args.read_length),
                      "--nlanes", str(args.nlanes),
                      "--total_coverage", str(args.total_coverage),
                      "--mean_fragment_size", str(args.mean_fragment_size),
                      "--sd_fragment_size", str(args.sd_fragment_size),
                      "--disable_rand_vcf",
                      "--disable_rand_dgv",
                      "--sv_insert_seq", args.sv_insert_seq.name]
    varsim_command += other_varsim_opts + vcf_arg_list + filter_arg_list + disable_sim_arg_list
    varsim_command += force_five_base_encoding_arg_list + keep_temp_arg_list + profile_1_arg_list + profile_2_arg_list

    # NOTE(review): the command is joined and run through the shell, so paths
    # or option values containing spaces/shell metacharacters are re-parsed by
    # the shell. Kept as-is because the quoting of --art_options and
    # --dwgsim_options currently depends on that behaviour.
    varsim_command = " ".join(varsim_command)
    with open(os.path.join(args.log_dir, "som_varsim.out"), "w") as varsim_stdout, \
            open(os.path.join(args.log_dir, "som_varsim.log"), "w") as varsim_stderr:
        p_varsim = subprocess.Popen(varsim_command, stdout=varsim_stdout, stderr=varsim_stderr, shell=True)
        logger.info("Executing command " + varsim_command + " with pid " + str(p_varsim.pid))
        processes.append(p_varsim)
        monitor_processes(processes)

    # Split the tumor truth VCF into normal variants and somatic variants
    tumor_vcf = os.path.realpath(os.path.join(args.out_dir, "%s.truth.vcf" % args.id))
    normal_vcf = os.path.join(args.out_dir, "%s_norm.vcf" % args.id)
    somatic_vcf = os.path.join(args.out_dir, "%s_somatic.vcf" % args.id)
    logger.info("Splitting the truth VCF %s into normal and somatic VCFs" % tumor_vcf)
    _split_truth_vcf(tumor_vcf, normal_vcf, somatic_vcf)

    monitor_processes(run_vcfstats([normal_vcf, somatic_vcf], args.out_dir, args.log_dir))

    logger.info("Done! (%g hours)" % ((time.time() - t_s) / 3600.0))