Ejemplo n.º 1
0
                             type=str)

    longislnd_group = main_parser.add_argument_group("LongISLND options")
    longislnd_group.add_argument("--longislnd_options",
                                 help="LongISLND options",
                                 default="")

    args = main_parser.parse_args()

    args.java = utils.get_java(args.java)
    utils.JAVA_XMX = utils.JAVA_XMX + args.java_max_mem
    makedirs([args.log_dir, args.out_dir])

    # Setup logging
    FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
    loglevel = get_loglevel(args.loglevel)
    if not args.log_to_stderr:
        logging.basicConfig(filename=os.path.join(args.log_dir, "varsim.log"),
                            filemode="w",
                            level=loglevel,
                            format=FORMAT)
    else:
        logging.basicConfig(level=loglevel, format=FORMAT)

    simulator = None if args.disable_sim else args.simulator
    simulator_opts = ""
    if args.simulator == "dwgsim":
        # str.format positional fields are 0-based: six arguments give valid
        # indices 0-5. The previous 1-6 numbering raised IndexError on {6}
        # and never used args.dwgsim_start_e.
        simulator_opts = "-e {0},{1} -E {0},{1} -d {2} -s {3} -1 {4} -2 {4} {5}".format(
            args.dwgsim_start_e, args.dwgsim_end_e, args.mean_fragment_size,
            args.sd_fragment_size, args.read_length, args.dwgsim_options)
    elif args.simulator == "art":
Ejemplo n.º 2
0
    art_group.add_argument("--art_options", help="ART command-line options", default="")

    pbsim_group = main_parser.add_argument_group("PBSIM options")
    pbsim_group.add_argument("--model_qc", metavar="model_qc", help="PBSIM QC model", default=None, type=str)

    longislnd_group = main_parser.add_argument_group("LongISLND options")
    longislnd_group.add_argument("--longislnd_options", help="LongISLND options", default="")

    args = main_parser.parse_args()

    utils.JAVA_XMX = utils.JAVA_XMX + args.java_max_mem
    makedirs([args.log_dir, args.out_dir])

    # Setup logging
    FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
    loglevel = get_loglevel(args.loglevel)
    if not args.log_to_stderr:
        logging.basicConfig(filename=os.path.join(args.log_dir, "varsim.log"), filemode="w", level=loglevel, format=FORMAT)
    else:
        logging.basicConfig(level=loglevel, format=FORMAT)

    simulator = None if args.disable_sim else args.simulator
    simulator_opts = ""
    if args.simulator == "dwgsim":
        # str.format positional fields are 0-based: six arguments give valid
        # indices 0-5. The previous 1-6 numbering raised IndexError on {6}
        # and never used args.dwgsim_start_e.
        simulator_opts = "-e {0},{1} -E {0},{1} -d {2} -s {3} -1 {4} -2 {4} {5}".format(
            args.dwgsim_start_e,
            args.dwgsim_end_e,
            args.mean_fragment_size,
            args.sd_fragment_size,
            args.read_length,
            args.dwgsim_options)
    elif args.simulator == "art":
        profile_opts = "-1 {} -2 {}".format(args.profile_1, args.profile_2) if (args.profile_1 and args.profile_2) else ""
        simulator_opts = "-p -l {} -m {} -s {} {} {}".format(args.read_length, args.mean_fragment_size, args.sd_fragment_size, profile_opts, args.art_options)
    elif args.simulator == "longislnd":
        simulator_opts = args.longislnd_options
    elif args.simulator == "pbsim":
Ejemplo n.º 3
0
def process(args):
    '''
    Prepare the input VCFs and write a single combined output VCF.

    Every input is brought into one of the supported states before combining:
      * plain ``.vcf``       -> passed through ``utils.sort_and_compress``
      * ``.vcf.gz`` w/o index -> indexed with ``utils.index_vcf_gz``
      * ``.vcf.gz`` + ``.tbi`` -> used as is
    With exactly one input, the prepared file and its index are copied to
    ``<output_prefix>.vcf.gz`` / ``.vcf.gz.tbi``; otherwise the inputs are
    handed to ``utils.combine_vcf``.

    :param args: parsed command-line namespace. Reads ``loglevel``, ``mode``,
        ``vcfs``, ``output_prefix`` and ``overwrite``. Note that ``args.vcfs``
        is updated in place with the prepared file paths.
    :return: None
    :raises ValueError: if ``args.mode`` is not one of ``first_duplicate``,
        ``all_duplicate`` or ``no_duplicate``.
    '''
    # Setup logging
    FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
    loglevel = utils.get_loglevel(args.loglevel)
    logging.basicConfig(level=loglevel, format=FORMAT)

    global LOGGER
    LOGGER = logging.getLogger(__name__)
    LOGGER.info('running {}'.format(' '.join(sys.argv)))

    # Translate the command-line mode into the constant combine_vcf expects.
    if args.mode == 'first_duplicate':
        dup_mode = utils.COMBINE_KEEP_FIRST_DUPLICATE
    elif args.mode == 'all_duplicate':
        dup_mode = utils.COMBINE_KEEP_ALL_DUPLICATE
    elif args.mode == 'no_duplicate':
        dup_mode = utils.COMBINE_KEEP_NO_DUPLICATE
    else:
        raise ValueError(
            'unknown duplicate handling mode: {!r}'.format(args.mode))

    # Same list object as args.vcfs: entries are replaced in place (never
    # added or removed), so callers see the prepared paths afterwards.
    input_vcfs = args.vcfs
    for i, current_vcf in enumerate(input_vcfs):
        if not current_vcf.endswith(".gz"):
            LOGGER.info('sort and index {}'.format(current_vcf))
            input_vcfs[i] = utils.sort_and_compress(current_vcf,
                                                    mode=2,
                                                    overwrite=args.overwrite)
        elif not os.path.isfile(current_vcf + ".tbi"):
            LOGGER.info('indexing {}'.format(current_vcf))
            utils.index_vcf_gz(current_vcf)
        # else: compressed and already indexed, nothing to do

    output_vcf = args.output_prefix + '.vcf'
    if len(input_vcfs) == 1:
        # Nothing to merge: the single prepared input becomes the output.
        output_vcf = output_vcf + '.gz'
        output_vcf_idx = output_vcf + '.tbi'
        already_there = os.path.isfile(output_vcf) or \
            os.path.isfile(output_vcf_idx)
        if already_there and not args.overwrite:
            LOGGER.warning(
                '{} or {} exists, use --overwrite otherwise do nothing.'.
                format(output_vcf, output_vcf_idx))
        else:
            source_vcf = input_vcfs[0]
            source_idx = source_vcf + '.tbi'
            shutil.copyfile(source_vcf, output_vcf)
            # The index must come from the input's .tbi, not from the VCF
            # itself (the previous code copied the VCF onto the .tbi path).
            if os.path.isfile(source_idx):
                shutil.copyfile(source_idx, output_vcf_idx)
            else:
                # NOTE(review): assumes index_vcf_gz writes <file>.tbi next to
                # the given .gz, as its use on the inputs above suggests.
                utils.index_vcf_gz(output_vcf)
    else:
        output_vcf = utils.combine_vcf(output_vcf,
                                       input_vcfs,
                                       duplicate_handling_mode=dup_mode)
    LOGGER.info('{} done'.format(output_vcf))
    return
Ejemplo n.º 4
0
def process(args):
    '''
    Compare one prediction VCF against a truth VCF in two passes.

    First ``VarSimVCFComparator`` produces TP/FN/FP VCFs. Then
    ``RTGVCFComparator`` (vcfeval) is run with VarSim's FN file as truth and
    VarSim's FP file as the call set. Both results are combined by
    ``merge_results`` and reported by ``summarize_results``.

    :param args: parsed command-line namespace. Reads ``loglevel``,
        ``log_to_file``, ``vcfs``, ``java_max_mem``, ``out_dir``,
        ``reference``, ``true_vcf``, ``sample``, ``exclude_filtered``,
        ``disallow_partial_fp``, ``match_geno``, ``vcfcompare_options``,
        ``sdf``, ``vcfeval_options``, ``var_types``, ``sv_length``,
        ``regions`` and ``bed_either``. ``out_dir`` and ``reference`` are
        rewritten to absolute paths.
    :return: None
    :raises NotImplementedError: if more than one prediction VCF is given.
    '''

    # Setup logging
    FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
    loglevel = utils.get_loglevel(args.loglevel)
    if args.log_to_file:
        logging.basicConfig(filename=args.log_to_file,
                            filemode="w",
                            level=loglevel,
                            format=FORMAT)
    else:
        logging.basicConfig(level=loglevel, format=FORMAT)

    if len(args.vcfs) > 1:
        raise NotImplementedError(
            'right now only support one prediction VCF. Quick workaround: src/sort_vcf.sh vcf1 vcf2 > merged.vcf'
        )

    global LOGGER
    LOGGER = logging.getLogger(__name__)
    LOGGER.info('working hard ...')

    # NOTE(review): this appends to module-level state on every call, so
    # calling process() twice in one interpreter appends java_max_mem twice.
    utils.JAVA_XMX = utils.JAVA_XMX + args.java_max_mem
    args.out_dir = os.path.abspath(args.out_dir)
    args.reference = os.path.abspath(args.reference)
    utils.makedirs([args.out_dir])

    # Pass 1: VarSim's own comparator.
    varsim_prefix = os.path.join(args.out_dir, 'varsim_compare_results')
    varsim_comparator = VarSimVCFComparator(
        prefix=varsim_prefix,
        true_vcf=args.true_vcf,
        reference=args.reference,
        regions=None,
        sample=args.sample,
        vcfs=args.vcfs,
        exclude_filtered=args.exclude_filtered,
        disallow_partial_fp=args.disallow_partial_fp,
        match_geno=args.match_geno,
        log_to_file=args.log_to_file,
        opts=args.vcfcompare_options)
    varsim_tp = utils.sort_and_compress(varsim_comparator.get_tp())
    varsim_fn = utils.sort_and_compress(varsim_comparator.get_fn())
    varsim_fp = utils.sort_and_compress(varsim_comparator.get_fp())

    # Pass 2: vcfeval needs an SDF-formatted reference.
    sdf = args.sdf
    if not sdf:
        LOGGER.info(
            "user did not supply SDF-formatted reference, trying to generate one..."
        )
        sdf = generate_sdf(args.reference, args.log_to_file)

    # Notes for vcfeval:
    #  - the sample column must be present and not empty
    #  - for a single-sample VCF, vcfeval doesn't check whether the samples
    #    match in truth and call
    #  - for a multi-sample VCF, the sample name must be specified
    vcfeval_prefix = os.path.join(args.out_dir, 'vcfeval_compare_results')
    if os.path.exists(vcfeval_prefix):
        LOGGER.warning('{0} exists, removing ...'.format(vcfeval_prefix))
        # rmtree rejects plain files and symlinks; remove those directly.
        if os.path.isdir(vcfeval_prefix) and not os.path.islink(vcfeval_prefix):
            shutil.rmtree(vcfeval_prefix)
        else:
            os.remove(vcfeval_prefix)
    # VarSim's false negatives act as truth and its false positives as calls.
    vcfeval_comparator = RTGVCFComparator(
        prefix=vcfeval_prefix,
        true_vcf=varsim_fn,
        reference=sdf,
        regions=None,
        sample=args.sample,
        vcfs=[varsim_fp],
        exclude_filtered=args.exclude_filtered,
        match_geno=args.match_geno,
        log_to_file=args.log_to_file,
        opts=args.vcfeval_options)
    vcfeval_tp = vcfeval_comparator.get_tp()
    vcfeval_tp_predict = vcfeval_comparator.get_tp_predict()

    augmented_tp, augmented_fn, augmented_fp, augmented_t = merge_results(
        outdir=args.out_dir,
        varsim_tp=varsim_tp,
        varsim_fn=varsim_fn,
        vcfeval_tp=vcfeval_tp,
        varsim_fp=varsim_fp,
        vcfeval_tp_predict=vcfeval_tp_predict)
    augmented_tp, augmented_fn, augmented_fp, augmented_t = summarize_results(
        os.path.join(args.out_dir, "augmented"),
        augmented_tp,
        augmented_fn,
        augmented_fp,
        augmented_t,
        var_types=args.var_types,
        sv_length=args.sv_length,
        regions=args.regions,
        bed_either=args.bed_either)

    LOGGER.info(
        "Variant comparison done.\nTrue positive: {0}\nFalse negative: {1}\nFalse positive: {2}\n"
        .format(augmented_tp, augmented_fn, augmented_fp))
Ejemplo n.º 5
0
def process(args):
    '''
    Compare one prediction VCF against a truth VCF in two passes.

    First ``VarSimVCFComparator`` produces TP/FN/FP VCFs. Then
    ``RTGVCFComparator`` (vcfeval) is run with VarSim's FN file as truth and
    VarSim's FP file as the call set. Both results are combined by
    ``merge_results`` and reported by ``summarize_results``.

    :param args: parsed command-line namespace. Reads ``loglevel``,
        ``log_to_file``, ``vcfs``, ``java_max_mem``, ``out_dir``,
        ``reference``, ``true_vcf``, ``sample``, ``exclude_filtered``,
        ``disallow_partial_fp``, ``match_geno``, ``vcfcompare_options``,
        ``sdf``, ``vcfeval_options``, ``var_types``, ``sv_length``,
        ``regions`` and ``bed_either``. ``out_dir`` and ``reference`` are
        rewritten to absolute paths.
    :return: None
    :raises NotImplementedError: if more than one prediction VCF is given.
    '''

    # Setup logging
    FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
    loglevel = utils.get_loglevel(args.loglevel)
    if args.log_to_file:
        logging.basicConfig(filename=args.log_to_file, filemode="w", level=loglevel, format=FORMAT)
    else:
        logging.basicConfig(level=loglevel, format=FORMAT)

    if len(args.vcfs) > 1:
        raise NotImplementedError('right now only support one prediction VCF. Quick workaround: src/sort_vcf.sh vcf1 vcf2 > merged.vcf')

    global LOGGER
    LOGGER = logging.getLogger(__name__)
    LOGGER.info('working hard ...')

    # NOTE(review): this appends to module-level state on every call, so
    # calling process() twice in one interpreter appends java_max_mem twice.
    utils.JAVA_XMX = utils.JAVA_XMX + args.java_max_mem
    args.out_dir = os.path.abspath(args.out_dir)
    args.reference = os.path.abspath(args.reference)
    utils.makedirs([args.out_dir])

    # Pass 1: VarSim's own comparator.
    varsim_prefix = os.path.join(args.out_dir, 'varsim_compare_results')
    varsim_comparator = VarSimVCFComparator(prefix=varsim_prefix,
                                            true_vcf=args.true_vcf,
                                            reference=args.reference,
                                            regions=None,
                                            sample=args.sample,
                                            vcfs=args.vcfs,
                                            exclude_filtered=args.exclude_filtered,
                                            disallow_partial_fp=args.disallow_partial_fp,
                                            match_geno=args.match_geno,
                                            log_to_file=args.log_to_file,
                                            opts=args.vcfcompare_options)
    varsim_tp = utils.sort_and_compress(varsim_comparator.get_tp())
    varsim_fn = utils.sort_and_compress(varsim_comparator.get_fn())
    varsim_fp = utils.sort_and_compress(varsim_comparator.get_fp())

    # Pass 2: vcfeval needs an SDF-formatted reference.
    sdf = args.sdf
    if not sdf:
        LOGGER.info("user did not supply SDF-formatted reference, trying to generate one...")
        sdf = generate_sdf(args.reference, args.log_to_file)

    # Notes for vcfeval:
    #  - the sample column must be present and not empty
    #  - for a single-sample VCF, vcfeval doesn't check whether the samples
    #    match in truth and call
    #  - for a multi-sample VCF, the sample name must be specified
    vcfeval_prefix = os.path.join(args.out_dir, 'vcfeval_compare_results')
    if os.path.exists(vcfeval_prefix):
        LOGGER.warning('{0} exists, removing ...'.format(vcfeval_prefix))
        # rmtree rejects plain files and symlinks; remove those directly.
        if os.path.isdir(vcfeval_prefix) and not os.path.islink(vcfeval_prefix):
            shutil.rmtree(vcfeval_prefix)
        else:
            os.remove(vcfeval_prefix)
    # VarSim's false negatives act as truth and its false positives as calls.
    vcfeval_comparator = RTGVCFComparator(prefix=vcfeval_prefix,
                                          true_vcf=varsim_fn,
                                          reference=sdf,
                                          regions=None,
                                          sample=args.sample,
                                          vcfs=[varsim_fp],
                                          exclude_filtered=args.exclude_filtered,
                                          match_geno=args.match_geno,
                                          log_to_file=args.log_to_file,
                                          opts=args.vcfeval_options)
    vcfeval_tp = vcfeval_comparator.get_tp()
    vcfeval_tp_predict = vcfeval_comparator.get_tp_predict()

    augmented_tp, augmented_fn, augmented_fp, augmented_t = merge_results(
        outdir=args.out_dir,
        varsim_tp=varsim_tp,
        varsim_fn=varsim_fn,
        vcfeval_tp=vcfeval_tp,
        varsim_fp=varsim_fp,
        vcfeval_tp_predict=vcfeval_tp_predict)
    augmented_tp, augmented_fn, augmented_fp, augmented_t = summarize_results(
        os.path.join(args.out_dir, "augmented"),
        augmented_tp, augmented_fn, augmented_fp, augmented_t,
        var_types=args.var_types,
        sv_length=args.sv_length,
        regions=args.regions,
        bed_either=args.bed_either)

    LOGGER.info("Variant comparison done.\nTrue positive: {0}\nFalse negative: {1}\nFalse positive: {2}\n".
                format(augmented_tp, augmented_fn, augmented_fp))