Exemple #1
0
def snp_count_table(options):
    """ make a table of average snp counts per graph and write it as a tsv.

    For every gam in options.in_gams three counts are taken:
      - count_vcf_snps() on the linear vcf path with ".gz" appended
      - count_vg_paths() on the sample vg
      - count_vg_paths() on the augmented vg
    The counts are summed per graph (keyed on graph_path(gam, options)) and
    divided by the number of gams that mapped to that graph.  One row per
    graph is written to count_tsv_path(options), sorted by graph path so the
    output is the same from run to run.

    there are serious problems with this now:
    1) don't have snp count for baseline (as it's not gam or vcf)
    2) snps counted differently for gam/vcf (multiple alternates at same site
    counted in former but not latter)
    """
    # tsv header
    rows = [
        "#\t{}\t\n".format(options.baseline),
        "#graph\tlinear_snp_count\tsample_snp_count\taugmented_snp_count\n",
    ]

    # graph path -> [linear total, sample total, augmented total]
    sums = defaultdict(lambda: [0, 0, 0])
    # graph path -> number of gams that contributed to the totals
    counts = defaultdict(int)

    for gam in options.in_gams:
        linear_vcf = linear_vcf_path(gam, options) + ".gz"
        vg_sample = sample_vg_path(gam, options)
        vg_augmented = augmented_vg_path(gam, options)
        vcf_snps = count_vcf_snps(linear_vcf, options)
        sample_snps = count_vg_paths(vg_sample, options)
        augmented_snps = count_vg_paths(vg_augmented, options)

        name = graph_path(gam, options)

        totals = sums[name]
        totals[0] += vcf_snps
        totals[1] += sample_snps
        totals[2] += augmented_snps
        counts[name] += 1

    # every key in sums was seen at least once, so counts[name] >= 1 here.
    # sorted() replaces iteration over a set, whose order is arbitrary.
    for name in sorted(sums):
        num_gams = float(counts[name])
        avg_vcf, avg_sam, avg_aug = [float(total) / num_gams
                                     for total in sums[name]]
        rows.append("{}\t{}\t{}\t{}\n".format(
            os.path.splitext(os.path.basename(name))[0],
            avg_vcf,
            avg_sam,
            avg_aug))

    with open(count_tsv_path(options), "w") as ofile:
        ofile.write("".join(rows))
Exemple #2
0
def graph_size_table(options):
    """ make a table of average sequence lengths for the vg call outputs
    and write it as a tsv.

    For every gam in options.in_gams, vg_length() is taken of the sample vg,
    the augmented vg and the original graph (graph_path(gam, options)).  The
    lengths are summed per original graph and divided by the number of gams
    that mapped to that graph.  One row per graph is written to
    size_tsv_path(options), sorted by graph path so the output is the same
    from run to run.
    """
    # tsv header
    rows = [
        "#\t{}\t\n".format(options.baseline),
        "#graph\tsample_snp_length\taugmented_snp_length\toriginal_length\n",
    ]

    # graph path -> [sample total, augmented total, original total]
    sums = defaultdict(lambda: [0, 0, 0])
    # graph path -> number of gams that contributed to the totals
    counts = defaultdict(int)

    for gam in options.in_gams:
        vg_sample = sample_vg_path(gam, options)
        vg_augmented = augmented_vg_path(gam, options)
        sample_length = vg_length(vg_sample, options)
        augmented_length = vg_length(vg_augmented, options)
        # the original graph's path is both the thing measured and the
        # key the results are grouped under
        name = graph_path(gam, options)
        original_length = vg_length(name, options)

        totals = sums[name]
        totals[0] += sample_length
        totals[1] += augmented_length
        totals[2] += original_length
        counts[name] += 1

    # every key in sums was seen at least once, so counts[name] >= 1 here.
    # sorted() replaces iteration over a set, whose order is arbitrary.
    for name in sorted(sums):
        num_gams = float(counts[name])
        avg_sam, avg_aug, avg_ori = [float(total) / num_gams
                                     for total in sums[name]]
        rows.append("{}\t{}\t{}\t{}\n".format(
            os.path.splitext(os.path.basename(name))[0],
            avg_sam,
            avg_aug,
            avg_ori))

    with open(size_tsv_path(options), "w") as ofile:
        ofile.write("".join(rows))