Example #1
def summarize_results(prefix, tp, fn, fp, t, var_types, sv_length=100, regions=None, bed_either=False):
    '''
    Count variants by type and tabulate non-SV, SV, and overall stats
    (a small tabulation sketch follows this example).
    :param prefix: output prefix for the comparison files
    :param tp: true-positive VCF
    :param fn: false-negative VCF
    :param fp: false-positive VCF
    :param t: truth VCF
    :param var_types: variant types to count
    :param sv_length: SV length threshold passed to the comparator
    :param regions: optional BED file passed via -bed
    :param bed_either: pass -bed_either to the comparator
    :return: paths to the sorted and compressed tp, fn, fp, t VCFs
    '''
    cmd = ['java', utils.JAVA_XMX, '-jar', utils.VARSIMJAR, 'vcfcompareresultsparser',
           '-prefix', prefix, '-tp', tp,
           '-fn', fn, '-fp', fp,
           '-t', t,
           '-sv_length', str(sv_length),
           ]
    if regions:
        cmd = cmd + ['-bed', regions]
    if bed_either:
        cmd = cmd + ['-bed_either']
    utils.run_shell_command(cmd, cmd_stdout=sys.stdout, cmd_stderr=sys.stderr)

    tp = prefix + "_tp.vcf"
    fn = prefix + "_fn.vcf"
    fp = prefix + "_fp.vcf"
    t = prefix + "_t.vcf"

    tp = utils.sort_and_compress(tp)
    fn = utils.sort_and_compress(fn)
    fp = utils.sort_and_compress(fp)
    t = utils.sort_and_compress(t)

    jsonfile = "{0}_report.json".format(prefix)
    metrics = ['tp', 'fp', 't', 'fn']
    stats = {k: {ii: 0 for ii in metrics} for k in var_types}
    parse_jsons(jsonfile, stats)
    print("Non-SV stats")
    print_stats(stats)
    sv_stats = {k: {ii: 0 for ii in metrics} for k in var_types}
    parse_jsons(jsonfile, sv_stats, count_sv=True)
    print("SV stats")
    print_stats(sv_stats)
    all_stats = {k: {ii: 0 for ii in metrics} for k in var_types}
    parse_jsons(jsonfile, all_stats, count_all=True)
    print("Overall stats")
    print_stats(all_stats)
    return tp, fn, fp, t
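
The docstring above promises a per-type tabulation, but print_stats itself does not appear in these examples. The sketch below shows one plausible way to tabulate the counter dict that summarize_results builds ({type: {'tp': .., 'fp': .., 't': .., 'fn': ..}}); the function name print_stats_sketch and the recall/precision columns are illustrative assumptions, not VarSim's actual print_stats.

def print_stats_sketch(stats):
    """Print TP/FN/FP/T counts plus recall and precision per variant type."""
    print("\t".join(["type", "tp", "fn", "fp", "t", "recall", "precision"]))
    for var_type, counts in stats.items():
        tp, fn, fp, t = counts["tp"], counts["fn"], counts["fp"], counts["t"]
        recall = float(tp) / (tp + fn) if (tp + fn) else 0.0
        precision = float(tp) / (tp + fp) if (tp + fp) else 0.0
        row = [var_type, tp, fn, fp, t, round(recall, 4), round(precision, 4)]
        print("\t".join(str(x) for x in row))

# e.g. print_stats_sketch({"SNP": {"tp": 90, "fn": 10, "fp": 5, "t": 100}})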
Example #2
def summarize_results(prefix,
                      tp,
                      fn,
                      fp,
                      t,
                      var_types,
                      sv_length=100,
                      regions=None,
                      bed_either=False):
    '''
    Count variants by type and tabulate non-SV, SV, and overall stats.
    :param prefix: output prefix for the comparison files
    :param tp: true-positive VCF
    :param fn: false-negative VCF
    :param fp: false-positive VCF
    :param t: truth VCF
    :param var_types: variant types to count
    :param sv_length: SV length threshold passed to the comparator
    :param regions: optional BED file passed via -bed
    :param bed_either: pass -bed_either to the comparator
    :return: paths to the sorted and compressed tp, fn, fp, t VCFs
    '''
    cmd = [
        'java',
        utils.JAVA_XMX,
        '-jar',
        utils.VARSIMJAR,
        'vcfcompareresultsparser',
        '-prefix',
        prefix,
        '-tp',
        tp,
        '-fn',
        fn,
        '-fp',
        fp,
        '-t',
        t,
        '-sv_length',
        str(sv_length),
    ]
    if regions:
        cmd = cmd + ['-bed', regions]
    if bed_either:
        cmd = cmd + ['-bed_either']
    utils.run_shell_command(cmd, cmd_stdout=sys.stdout, cmd_stderr=sys.stderr)

    tp = prefix + "_tp.vcf"
    fn = prefix + "_fn.vcf"
    fp = prefix + "_fp.vcf"
    t = prefix + "_t.vcf"

    tp = utils.sort_and_compress(tp)
    fn = utils.sort_and_compress(fn)
    fp = utils.sort_and_compress(fp)
    t = utils.sort_and_compress(t)

    jsonfile = "{0}_report.json".format(prefix)
    metrics = ['tp', 'fp', 't', 'fn']
    stats = {k: {ii: 0 for ii in metrics} for k in var_types}
    parse_jsons(jsonfile, stats)
    print("Non-SV stats")
    print_stats(stats)
    sv_stats = {k: {ii: 0 for ii in metrics} for k in var_types}
    parse_jsons(jsonfile, sv_stats, count_sv=True)
    print("SV stats")
    print_stats(sv_stats)
    all_stats = {k: {ii: 0 for ii in metrics} for k in var_types}
    parse_jsons(jsonfile, all_stats, count_all=True)
    print("Overall stats")
    print_stats(all_stats)
    return tp, fn, fp, t
Example #3
def process(args):
    '''
    Compare the prediction VCF(s) against the truth VCF with VarSim and vcfeval,
    merge the two comparisons, and summarize the results.
    :param args: parsed command-line arguments
    :return:
    '''

    # Setup logging
    FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
    loglevel = utils.get_loglevel(args.loglevel)
    if args.log_to_file:
        logging.basicConfig(filename=args.log_to_file,
                            filemode="w",
                            level=loglevel,
                            format=FORMAT)
    else:
        logging.basicConfig(level=loglevel, format=FORMAT)

    if len(args.vcfs) > 1:
        raise NotImplementedError(
            'right now only support one prediction VCF. Quick workaround: src/sort_vcf.sh vcf1 vcf2 > merged.vcf'
        )

    global LOGGER
    LOGGER = logging.getLogger(__name__)
    LOGGER.info('working hard ...')

    utils.JAVA_XMX = utils.JAVA_XMX + args.java_max_mem
    args.out_dir = os.path.abspath(args.out_dir)
    args.reference = os.path.abspath(args.reference)
    utils.makedirs([args.out_dir])

    varsim_prefix = os.path.join(args.out_dir, 'varsim_compare_results')
    varsim_comparator = VarSimVCFComparator(
        prefix=varsim_prefix,
        true_vcf=args.true_vcf,
        reference=args.reference,
        regions=None,
        sample=args.sample,
        vcfs=args.vcfs,
        exclude_filtered=args.exclude_filtered,
        disallow_partial_fp=args.disallow_partial_fp,
        match_geno=args.match_geno,
        log_to_file=args.log_to_file,
        opts=args.vcfcompare_options)
    varsim_tp, varsim_fn, varsim_fp = (varsim_comparator.get_tp(),
                                       varsim_comparator.get_fn(),
                                       varsim_comparator.get_fp())
    varsim_tp = utils.sort_and_compress(varsim_tp)
    varsim_fn = utils.sort_and_compress(varsim_fn)
    varsim_fp = utils.sort_and_compress(varsim_fp)
    #run vcfeval
    sdf = args.sdf
    if not sdf:
        LOGGER.info(
            "user did not supply SDF-formatted reference, trying to generate one..."
        )
        sdf = generate_sdf(args.reference, args.log_to_file)
    '''Notes for vcfeval:
    the sample column must be present and non-empty;
    for a single-sample VCF, vcfeval does not check that sample names match between truth and call;
    for a multi-sample VCF, the sample name must be specified.
    '''
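    # Illustration (an assumption for readability, not quoted from the RTG docs): the
    # call VCF needs a genotype/sample column, e.g.
    #   #CHROM  POS  ID  REF  ALT  QUAL  FILTER  INFO  FORMAT  SAMPLE1
    #   chr1    100  .   A    G    50    PASS    .     GT      0/1
    # and when a VCF carries several samples, the sample of interest must be named
    # explicitly (here via the `sample` argument handed to the comparators).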
    vcfeval_prefix = os.path.join(args.out_dir, 'vcfeval_compare_results')
    if os.path.exists(vcfeval_prefix):
        LOGGER.warn('{0} exists, removing ...'.format(vcfeval_prefix))
        shutil.rmtree(vcfeval_prefix)
    vcfeval_comparator = RTGVCFComparator(
        prefix=vcfeval_prefix,
        true_vcf=varsim_fn,
        reference=sdf,
        regions=None,
        sample=args.sample,
        vcfs=[varsim_fp],
        exclude_filtered=args.exclude_filtered,
        match_geno=args.match_geno,
        log_to_file=args.log_to_file,
        opts=args.vcfeval_options)
    vcfeval_tp, vcfeval_tp_predict = (vcfeval_comparator.get_tp(),
                                      vcfeval_comparator.get_tp_predict())
    augmented_tp, augmented_fn, augmented_fp, augmented_t = merge_results(
        outdir=args.out_dir,
        varsim_tp=varsim_tp,
        varsim_fn=varsim_fn,
        vcfeval_tp=vcfeval_tp,
        varsim_fp=varsim_fp,
        vcfeval_tp_predict=vcfeval_tp_predict)
    augmented_tp, augmented_fn, augmented_fp, augmented_t = summarize_results(
        os.path.join(args.out_dir, "augmented"),
        augmented_tp,
        augmented_fn,
        augmented_fp,
        augmented_t,
        var_types=args.var_types,
        sv_length=args.sv_length,
        regions=args.regions,
        bed_either=args.bed_either)

    LOGGER.info(
        "Variant comparison done.\nTrue positive: {0}\nFalse negative: {1}\nFalse positive: {2}\n"
        .format(augmented_tp, augmented_fn, augmented_fp))
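
generate_sdf is called above but not shown in these examples. Below is a minimal sketch, assuming RTG Tools' `rtg format` command (which builds an SDF directory from a FASTA) is on the PATH; the function name and paths are illustrative only, not VarSim's generate_sdf.

import subprocess

def generate_sdf_sketch(reference_fasta, sdf_dir="reference.sdf"):
    """Build an RTG SDF directory from a FASTA reference and return its path."""
    # rtg format writes the sequence data in RTG's SDF format for use by vcfeval
    subprocess.check_call(["rtg", "format", "-o", sdf_dir, reference_fasta])
    return sdf_dir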
Example #4
def process(args):
    '''
    Sort, compress, and index the input VCFs as needed, then combine them into a
    single output VCF using the requested duplicate-handling mode.
    :param args: parsed command-line arguments
    :return:
    '''
    # Setup logging
    FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
    loglevel = utils.get_loglevel(args.loglevel)
    logging.basicConfig(level=loglevel, format=FORMAT)

    global LOGGER
    LOGGER = logging.getLogger(__name__)
    LOGGER.info('running {}'.format(' '.join(sys.argv)))

    dup_mode = None
    if args.mode == 'first_duplicate':
        dup_mode = utils.COMBINE_KEEP_FIRST_DUPLICATE
    elif args.mode == 'all_duplicate':
        dup_mode = utils.COMBINE_KEEP_ALL_DUPLICATE
    elif args.mode == 'no_duplicate':
        dup_mode = utils.COMBINE_KEEP_NO_DUPLICATE
    else:
        raise ValueError('unknown duplicate handling mode: {}'.format(args.mode))
    """
    scenarios:
    vcf
    vcf.gz
    vcf.gz + vcf.gz.tbi
    """
    input_vcfs = args.vcfs
    for i in range(len(args.vcfs)):
        current_vcf = args.vcfs[i]
        if current_vcf.endswith(".gz") and os.path.isfile(current_vcf +
                                                          ".tbi"):
            input_vcfs[i] = current_vcf
        elif current_vcf.endswith(".gz"):
            LOGGER.info('indexing {}'.format(current_vcf))
            utils.index_vcf_gz(current_vcf)
            input_vcfs[i] = current_vcf
        else:
            LOGGER.info('sort and index {}'.format(current_vcf))
            input_vcfs[i] = utils.sort_and_compress(current_vcf,
                                                    mode=2,
                                                    overwrite=args.overwrite)
    output_vcf = args.output_prefix + '.vcf'
    if input_vcfs and len(input_vcfs) == 1:
        output_vcf = output_vcf + '.gz'
        output_vcf_idx = output_vcf + '.tbi'
        if (not args.overwrite) and \
           (os.path.isfile(output_vcf) or os.path.isfile(output_vcf_idx)):
            LOGGER.warn(
                '{} or {} exists, use --overwrite otherwise do nothing.'.
                format(output_vcf, output_vcf_idx))
        else:
            shutil.copyfile(input_vcfs[0], output_vcf)
            # copy the matching tabix index alongside the VCF
            shutil.copyfile(input_vcfs[0] + '.tbi', output_vcf_idx)
    else:
        output_vcf = utils.combine_vcf(output_vcf,
                                       input_vcfs,
                                       duplicate_handling_mode=dup_mode)
    LOGGER.info('{} done'.format(output_vcf))
    return
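
The loop above implements the three input scenarios listed in the docstring. Here is a standalone sketch of just the decision logic, under the assumption that the caller then applies utils.index_vcf_gz or utils.sort_and_compress (not shown in these examples) to the non-ready cases; the function name classify_vcf_input is hypothetical.

import os

def classify_vcf_input(path):
    """Return 'ready', 'needs_index', or 'needs_sort_and_compress' for a VCF path."""
    if path.endswith(".gz") and os.path.isfile(path + ".tbi"):
        return "ready"                       # vcf.gz + vcf.gz.tbi
    if path.endswith(".gz"):
        return "needs_index"                 # vcf.gz without a tabix index
    return "needs_sort_and_compress"         # plain vcf

# e.g. classify_vcf_input("calls.vcf") -> "needs_sort_and_compress"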
Example #5
def match_false(augmented_file,
                files_to_pair_with,
                out_dir,
                sample,
                log_to_file,
                vcfeval_options,
                sdf,
                java="java"):
    """Try to pair up each false call in a file (augmented_file) with a variant in the other files provided in a list (files_to_pair_with) to create an annotated version of the first file.
    By default the the first variant in the list is provided to get an AF, the 2nd to determine the simulated variant (for false positives) and the 3rd to determine if a false positive is
    a pure false positive (not simulated) or not (wrong genotype)"""

    files_to_pair_with_clean = []
    for item in files_to_pair_with:
        files_to_pair_with_clean.append(utils.make_clean_vcf(item, out_dir))

    content = []
    annotated_content = []

    with utils.versatile_open(augmented_file, "rt") as augmented_file_handle:
        for line in augmented_file_handle.readlines():
            line_strip = line.strip()
            line_split = line_strip.split()

            if line_strip[0] == "#":
                annotated_content.append(line_strip)
                content.append(line_strip)

            else:
                if content[-1][0] != "#":
                    del content[-1]
                content.append(line_strip)

                single_var_file = utils.write_vcf(
                    content, os.path.join(out_dir, "single.vcf"))
                single_var_file = utils.sort_and_compress(single_var_file)

                info = ''  # ensure info is defined even if no pairing files are given
                for i, item in enumerate(files_to_pair_with_clean):

                    equivalent_variant = None

                    if item:
                        vcfeval_prefix = os.path.join(
                            out_dir, 'vcfeval_compare_results_annotate')

                        vcfeval_comparator = RTGVCFComparator(
                            prefix=vcfeval_prefix,
                            true_vcf=item,
                            reference=sdf,
                            regions=None,
                            sample=sample,
                            vcfs=[single_var_file],
                            exclude_filtered=False,
                            match_geno=False,
                            log_to_file=log_to_file,
                            opts=vcfeval_options,
                            java=java)

                        equivalent_variant = utils.get_equivalent_variant(
                            line_split, vcfeval_comparator.get_tp())

                        #clean up
                        if os.path.exists(vcfeval_prefix):
                            LOGGER.warn('{0} exists, removing ...'.format(
                                vcfeval_prefix))
                            shutil.rmtree(vcfeval_prefix)

                    if i == 0:
                        if equivalent_variant:
                            try:
                                AO = int(equivalent_variant[-1].split(':')
                                         [4].split(',')[0])
                                RO = int(equivalent_variant[-1].split(':')
                                         [2].split(',')[0])
                            except:
                                info = "N/A;"
                            else:
                                info = str(float(AO) / (AO + RO)) + ';'
                        else:
                            info = "N/A;"

                    elif i == 1:
                        if equivalent_variant:
                            info += '_'.join([equivalent_variant[0],
                                              equivalent_variant[1],
                                              equivalent_variant[3],
                                              equivalent_variant[4],
                                              equivalent_variant[-1]]) + ';'
                        else:
                            info += "N/A;"

                    elif i == 2:
                        info += "pure;" if not equivalent_variant else "not;"

                line_split[6] = info
                annotated_content.append('\t'.join(line_split))

                #clean up
                if os.path.isfile(single_var_file):
                    os.remove(single_var_file)
                    os.remove(single_var_file + ".tbi")

    # strip .vcf.gz / .vcf extensions from the input name when building the output name
    base_name = os.path.splitext(
        os.path.splitext(os.path.basename(augmented_file))[0])[0]
    annotated_file = utils.write_vcf(
        annotated_content,
        os.path.join(out_dir, "{}_annotated.vcf".format(base_name)))
    annotated_file = utils.sort_and_compress(annotated_file)

    #clean up
    for item in files_to_pair_with_clean:
        if item and os.path.isfile(item):
            os.remove(item)
            os.remove(item + ".tbi")
Example #6
def match_false(augmented_file,
                files_to_pair_with,
                out_dir,
                sample,
                log_to_file,
                vcfeval_options,
                sdf,
                java="java"):
    """Try to pair up each false call in a file (augmented_file) with a variant in the other files provided in a list (files_to_pair_with) to create an annotated version of the first file.
    By default the the first variant in the list is provided to get an AF, the 2nd to determine the simulated variant (for false positives) and the 3rd to determine if a false positive is
    a pure false positive (not simulated) or not (wrong genotype)"""
    files_to_pair_with_clean = []
    for item in files_to_pair_with:
        files_to_pair_with_clean.append(utils.make_clean_vcf(item, out_dir))

    content = []
    annotated_content = []

    with utils.versatile_open(augmented_file, "rt") as augmented_file_handle:
        for line in augmented_file_handle.readlines():
            line_strip = line.strip()
            line_split = line_strip.split()

            if line_strip[0] == "#":
                annotated_content.append(line_strip)
                content.append(line_strip)

            else:
                if content[-1][0] != "#":
                    del content[-1]
                content.append(line_strip)

                single_var_file = utils.write_vcf(
                    content, os.path.join(out_dir, "single.vcf"))
                single_var_file = utils.sort_and_compress(single_var_file)

                single_var_chr = line_split[0]
                info = ''

                for i, item in enumerate(files_to_pair_with_clean):

                    nonmatching_gt_variant = None

                    if item:
                        vcfeval_prefix = os.path.join(
                            out_dir, 'vcfeval_compare_results_annotate')

                        #Restrict the comparison to just the chromosome of the single variant by creating a filtered comparison file
                        filtered_true_vcf = utils.write_filtered_vcf(
                            item, single_var_chr,
                            os.path.join(out_dir, "filtered.vcf"))
                        filtered_true_vcf = utils.sort_and_compress(
                            filtered_true_vcf)

                        vcfeval_comparator = RTGVCFComparator(
                            prefix=vcfeval_prefix,
                            true_vcf=filtered_true_vcf,
                            reference=sdf,
                            regions=None,
                            sample=sample,
                            vcfs=[single_var_file],
                            exclude_filtered=False,
                            match_geno=False,
                            log_to_file=log_to_file,
                            opts=vcfeval_options,
                            java=java)

                        nonmatching_gt_variant = utils.get_closest_variant(
                            line_split, vcfeval_comparator.get_tp())

                        #if not nonmatching_gt_variant, check for matching alt and ref at the same position. Example of when this could be applicable is a 0/0 call when vcfeval will not pair up variants at the same locus with the same alt and ref even with match_geno=False
                        if not nonmatching_gt_variant:
                            nonmatching_gt_variant = utils.get_matching_alt_ref(
                                line_split, filtered_true_vcf)

                        #clean up
                        if os.path.exists(vcfeval_prefix):
                            LOGGER.warn('{0} exists, removing ...'.format(
                                vcfeval_prefix))
                            shutil.rmtree(vcfeval_prefix)

                    if i == 0:
                        AO_RO_DP_AD = {
                            "AO": None,
                            "RO": None,
                            "DP": None,
                            "AD": None
                        }
                        if nonmatching_gt_variant:
                            for entry in AO_RO_DP_AD:
                                AO_RO_DP_AD[entry] = utils.get_info(
                                    nonmatching_gt_variant, entry)

                        # gatk4 format
                        if AO_RO_DP_AD["AD"]:
                            AD_split = AO_RO_DP_AD["AD"].split(',')
                            AO = list(map(int, AD_split[1:]))
                            RO = int(AD_split[0])
                            for j, alt_count in enumerate(AO):
                                comma = ',' if j < len(AO) - 1 else ''
                                if alt_count + RO == 0:
                                    info += "0.0" + comma
                                else:
                                    info += str(float(alt_count) /
                                                (alt_count + RO)) + comma
                        #freebayes
                        elif AO_RO_DP_AD["AO"] and AO_RO_DP_AD["RO"]:
                            alt_counts = AO_RO_DP_AD["AO"].split(',')
                            for j, alt_count in enumerate(alt_counts):
                                comma = ',' if j < len(alt_counts) - 1 else ''
                                denominator = int(alt_count) + int(
                                    AO_RO_DP_AD["RO"])
                                if denominator == 0:
                                    info += "0.0" + comma
                                else:
                                    info += str(
                                        float(alt_count) / denominator) + comma
                        else:
                            info += "N/A"

                        info += ';'
                        info += "N/A" if not AO_RO_DP_AD["DP"] else str(
                            AO_RO_DP_AD["DP"])
                        info += ';'
                    elif i == 1:
                        if nonmatching_gt_variant:
                            info += '_'.join([nonmatching_gt_variant[0],
                                              nonmatching_gt_variant[1],
                                              nonmatching_gt_variant[3],
                                              nonmatching_gt_variant[4],
                                              nonmatching_gt_variant[-1]]) + ';'
                        else:
                            info += "N/A;"

                    elif i == 2:
                        info += "pure;" if not nonmatching_gt_variant else "not;"

                line_split[6] = info
                annotated_content.append('\t'.join(line_split))

                #clean up
                for fil in [single_var_file, filtered_true_vcf]:
                    if os.path.isfile(fil):
                        os.remove(fil)
                        os.remove(fil + ".tbi")

    # strip .vcf.gz / .vcf extensions from the input name when building the output name
    base_name = os.path.splitext(
        os.path.splitext(os.path.basename(augmented_file))[0])[0]
    annotated_file = utils.write_vcf(
        annotated_content,
        os.path.join(out_dir, "{}_annotated.vcf".format(base_name)))
    annotated_file = utils.sort_and_compress(annotated_file)

    #clean up
    for item in files_to_pair_with_clean:
        if item and os.path.isfile(item):
            os.remove(item)
            os.remove(item + ".tbi")
Example #7
def varsim_multi(reference,
                 simulator,
                 simulator_exe,
                 total_coverage,
                 variant_vcfs=[],
                 sampling_vcf=None,
                 dgv_file=None,
                 regions=None,
                 randvcf_options=None,
                 randdgv_options=None,
                 nlanes=1,
                 simulator_options="",
                 samples=[],
                 out_dir="out",
                 sv_insert_seq=None,
                 seed=0,
                 sex="MALE",
                 remove_filtered=False,
                 keep_temp=False,
                 force_five_base_encoding=False,
                 lift_ref=False,
                 disable_vcf2diploid=False,
                 samples_random=0,
                 java="java"):
    logger = logging.getLogger(varsim_multi.__name__)

    makedirs([out_dir])

    restricted_dir = os.path.join(out_dir, "restricted")

    restricted_reference, restricted_vcfs = gen_restricted_ref_and_vcfs(
        reference,
        variant_vcfs,
        regions,
        samples,
        restricted_dir,
        flank=0,
        short_contig_names=False)
    dgv_vcf = None

    if dgv_file:
        assert sv_insert_seq, "SV insertion sequence file is required."
        dgv_vcf_dir = os.path.join(out_dir, "tmp")
        makedirs([dgv_vcf_dir])
        dgv_vcf = os.path.join(dgv_vcf_dir, "dgv.vcf")
        makedirs([os.path.join(out_dir, "log")])
        dgv_err_file = os.path.join(out_dir, "log", "dgv2vcf.err")
        randdgv_options2vcf = copy.copy(randdgv_options)
        randdgv_options2vcf.output_all = "-all"
        with open(dgv_vcf, "w") as dgv2vcf_out, open(dgv_err_file,
                                                     "w") as dgv2vcf_log:
            run_randdgv(dgv_file, dgv2vcf_out, dgv2vcf_log, seed, sex,
                        randdgv_options2vcf, reference, sv_insert_seq, java)

    if regions:
        merged_bed = os.path.join(out_dir, "merged.bed")
        pybedtools.BedTool(regions).merge().saveas(merged_bed)
        restricted_dir = os.path.join(out_dir, "region_restricted")
        if sampling_vcf:
            _, [restricted_sampling_vcf
                ] = gen_restricted_ref_and_vcfs(reference, [sampling_vcf],
                                                merged_bed, [],
                                                restricted_dir,
                                                flank=0)
            # Now lift over the restricted_sampling_vcf to get the region-limited VCF
            sampling_vcf = lift_vcfs([restricted_sampling_vcf],
                                     os.path.join(
                                         restricted_dir,
                                         "region-restricted-sampling.vcf"),
                                     reference)
        if dgv_vcf:
            convertCN([dgv_vcf], "two2one")
            dgv_vcf = sort_and_compress(dgv_vcf)
            _, [restricted_dgv_vcf
                ] = gen_restricted_ref_and_vcfs(reference, [dgv_vcf],
                                                merged_bed, [],
                                                restricted_dir,
                                                flank=0)
            # Now lift over the restricted_dgv_vcf to get the region-limited VCF
            dgv_vcf = lift_vcfs([restricted_dgv_vcf],
                                os.path.join(restricted_dir,
                                             "region-restricted-dgv.vcf"),
                                reference)

    all_samples = samples + ["VarSim%d" % i for i in range(samples_random)]

    for index, (sample, coverage) in enumerate(zip(all_samples,
                                                   total_coverage)):
        sample_dir = os.path.join(out_dir, sample)
        sample_seed = seed + 1000 * index
        makedirs([sample_dir])
        logger.info("Simulating sample {} in {}".format(sample, sample_dir))
        sample_variant_vcfs = list(
            restricted_vcfs if index < len(samples) else [])

        # Run RandVCF first to get the sampled variants for the sample
        if randvcf_options and sampling_vcf:
            sampled_vcf = os.path.join(sample_dir, "randvcf.vcf")
            with open(sampled_vcf, "w") as randvcf_out, open(
                    os.path.join(sample_dir, "randvcf.err"),
                    "w") as randvcf_log:
                run_randvcf(sampling_vcf, randvcf_out, randvcf_log,
                            sample_seed, sex, randvcf_options, reference, java)
            sampled_vcf = sort_and_compress(sampled_vcf)
            # Now generate the restricted sampled VCF for the sample
            _, [restricted_sampled_vcf] = gen_restricted_ref_and_vcfs(
                reference, [sampled_vcf],
                regions, [],
                os.path.join(sample_dir, "restricted_randvcf"),
                flank=0)
            sample_variant_vcfs = sample_variant_vcfs + [
                restricted_sampled_vcf
            ]

        if randdgv_options and dgv_vcf:
            sampled_dgv_vcf = os.path.join(sample_dir, "randdgvvcf.vcf")
            randdgvvcf_options = randdgv_options2randvcf_options(
                randdgv_options)
            with open(sampled_dgv_vcf, "w") as randdgvvcf_out, open(
                    os.path.join(sample_dir, "randdgvvcf.err"),
                    "w") as randdgvvcf_log:
                run_randvcf(dgv_vcf, randdgvvcf_out, randdgvvcf_log,
                            sample_seed, sex, randdgvvcf_options, reference,
                            java)
            sampled_dgv_vcf = sort_and_compress(sampled_dgv_vcf)
            # Now generate the restricted sampled dgv VCF for the sample
            _, [restricted_sampled_dgv_vcf] = gen_restricted_ref_and_vcfs(
                reference, [sampled_dgv_vcf],
                regions, [],
                os.path.join(sample_dir, "restricted_randdgvvcf"),
                flank=0)
            convertCN([restricted_sampled_dgv_vcf], "one2two")
            sample_variant_vcfs = sample_variant_vcfs + [
                restricted_sampled_dgv_vcf
            ]

        varsim_main(restricted_reference,
                    simulator,
                    simulator_exe,
                    coverage,
                    sample_variant_vcfs,
                    None,
                    dgv_file,
                    None,
                    randdgv_options,
                    nlanes,
                    simulator_options,
                    sample,
                    os.path.join(sample_dir, "log"),
                    os.path.join(sample_dir, "out"),
                    sv_insert_seq,
                    sample_seed,
                    sex,
                    remove_filtered,
                    keep_temp,
                    force_five_base_encoding,
                    lift_ref,
                    disable_vcf2diploid,
                    java=java)

    with open(os.path.join(out_dir, "samples.txt"), "w") as samples_fd:
        samples_fd.write("\n".join(all_samples))
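
A small sketch of the per-sample bookkeeping in the loop above: random samples are named VarSim<i>, and each sample's seed is offset by 1000 so runs are reproducible per sample. The helper name is hypothetical; it only restates what the loop already does.

def sample_plan_sketch(named_samples, n_random, base_seed=0):
    """Return (sample_name, seed) pairs in the order varsim_multi iterates them."""
    all_samples = list(named_samples) + ["VarSim%d" % i for i in range(n_random)]
    return [(sample, base_seed + 1000 * index)
            for index, sample in enumerate(all_samples)]

# e.g. sample_plan_sketch(["NA12878"], 2, base_seed=7)
#      -> [("NA12878", 7), ("VarSim0", 1007), ("VarSim1", 2007)]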
Example #8
def process(args):
    '''
    Compare the prediction VCF(s) against the truth VCF with VarSim and vcfeval,
    merge the two comparisons, and summarize the results.
    :param args: parsed command-line arguments
    :return:
    '''

    # Setup logging
    FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
    loglevel = utils.get_loglevel(args.loglevel)
    if args.log_to_file:
        logging.basicConfig(filename=args.log_to_file, filemode="w", level=loglevel, format=FORMAT)
    else:
        logging.basicConfig(level=loglevel, format=FORMAT)

    if len(args.vcfs) > 1:
        raise NotImplementedError('right now only support one prediction VCF. Quick workaround: src/sort_vcf.sh vcf1 vcf2 > merged.vcf')

    global LOGGER
    LOGGER = logging.getLogger(__name__)
    LOGGER.info('working hard ...')

    utils.JAVA_XMX = utils.JAVA_XMX + args.java_max_mem
    args.out_dir = os.path.abspath(args.out_dir)
    args.reference = os.path.abspath(args.reference)
    utils.makedirs([args.out_dir])

    varsim_prefix = os.path.join(args.out_dir, 'varsim_compare_results')
    varsim_comparator = VarSimVCFComparator(prefix=varsim_prefix,
                                            true_vcf=args.true_vcf,
                                            reference=args.reference,
                                            regions=None,
                                            sample=args.sample,
                                            vcfs=args.vcfs,
                                            exclude_filtered=args.exclude_filtered,
                                            disallow_partial_fp=args.disallow_partial_fp,
                                            match_geno=args.match_geno,
                                            log_to_file=args.log_to_file,
                                            opts=args.vcfcompare_options)
    varsim_tp, varsim_fn, varsim_fp = varsim_comparator.get_tp(), varsim_comparator.get_fn(), varsim_comparator.get_fp()
    varsim_tp = utils.sort_and_compress(varsim_tp)
    varsim_fn = utils.sort_and_compress(varsim_fn)
    varsim_fp = utils.sort_and_compress(varsim_fp)
    #run vcfeval
    sdf = args.sdf
    if not sdf:
        LOGGER.info("user did not supply SDF-formatted reference, trying to generate one...")
        sdf = generate_sdf(args.reference, args.log_to_file)

    '''Notes for vcfeval:
    the sample column must be present and non-empty;
    for a single-sample VCF, vcfeval does not check that sample names match between truth and call;
    for a multi-sample VCF, the sample name must be specified.
    '''
    vcfeval_prefix = os.path.join(args.out_dir, 'vcfeval_compare_results')
    if os.path.exists(vcfeval_prefix):
        LOGGER.warn('{0} exists, removing ...'.format(vcfeval_prefix))
        shutil.rmtree(vcfeval_prefix)
    vcfeval_comparator = RTGVCFComparator(prefix=vcfeval_prefix,
                                          true_vcf=varsim_fn,
                                          reference=sdf,
                                          regions=None,
                                          sample=args.sample,
                                          vcfs=[varsim_fp],
                                          exclude_filtered=args.exclude_filtered,
                                          match_geno=args.match_geno,
                                          log_to_file=args.log_to_file,
                                          opts=args.vcfeval_options)
    vcfeval_tp, vcfeval_tp_predict = vcfeval_comparator.get_tp(), vcfeval_comparator.get_tp_predict()
    augmented_tp, augmented_fn, augmented_fp, augmented_t = merge_results(
        outdir=args.out_dir,
        varsim_tp=varsim_tp,
        varsim_fn=varsim_fn,
        vcfeval_tp=vcfeval_tp,
        varsim_fp=varsim_fp,
        vcfeval_tp_predict=vcfeval_tp_predict)
    augmented_tp, augmented_fn, augmented_fp, augmented_t = summarize_results(
        os.path.join(args.out_dir, "augmented"),
        augmented_tp,
        augmented_fn,
        augmented_fp,
        augmented_t,
        var_types=args.var_types,
        sv_length=args.sv_length,
        regions=args.regions,
        bed_either=args.bed_either)

    LOGGER.info("Variant comparison done.\nTrue positive: {0}\nFalse negative: {1}\nFalse positive: {2}\n".
                format(augmented_tp, augmented_fn, augmented_fp))