Exemple #1
0
# Build the phaser from the variant calls in `vc` and run it on the SAM
# alignments together with the matching FASTA/FASTQ file.
pp = VariantPhaser.VariantPhaser(vc)
pp.phase_variant(args.sam_filename, args.fastx_filename, args.output_prefix, partial_ok=args.partial_ok)
pp.haplotypes.get_haplotype_vcf_assignment()

# (3) phase isoforms
# Collect the read IDs inside a context manager so the input handle is closed
# as soon as parsing finishes.
with open(args.fastx_filename) as fastx_handle:
    seqids = set(r.id for r in SeqIO.parse(fastx_handle, VariantPhaser.type_fa_or_fq(args.fastx_filename)))
isoform_tally = VariantPhaser.phase_isoforms(args.read_stat, seqids, pp)
if len(isoform_tally) == 0:
    # Nothing was tallied: leave a marker file next to the outputs and stop
    # with a success exit code.
    os.system("touch {out}.NO_HAPS_FOUND".format(out=args.output_prefix))
    # sys.stderr.write emits the same text on Python 2 and Python 3.
    sys.stderr.write("No good haps found. END.\n")
    sys.exit(0)
pp.haplotypes.write_haplotype_to_vcf(args.mapping_filename, isoform_tally, args.output_prefix)

# (4) clean isoforms
hap_count = VariantPhaseCleaner.make_haplotype_counts(isoform_tally)

# (5) error correct haplotypes
#  if diploid, use exhaustive search
#  otherwise, use hap counts (ToDo: make this work with exhaustive search later)
# One list of upper-cased bases per accepted position.
variants = [[base.upper() for base, count in vc.variant[pos]] for pos in pp.accepted_pos]

# The exhaustive search is only taken when ploidy is 2 and every accepted
# position carries exactly two bases; everything else goes through min-diff.
if args.ploidy == 2 and all(len(alleles) == 2 for alleles in variants):
    diff_arr, hap_count_ordered = VariantPhaseCleaner.infer_haplotypes_via_exhaustive_diploid_only(pp.haplotypes, variants)
else:
    diff_arr, hap_count_ordered = VariantPhaseCleaner.infer_haplotypes_via_min_diff(pp.haplotypes.haplotypes, hap_count, args.ploidy, MAX_DIFF_ALLOWED)

if diff_arr is None:
    # Haplotype inference returned no result: leave a ".cleaned" marker file
    # and stop with a success exit code.
    os.system("touch {out}.cleaned.NO_HAPS_FOUND".format(out=args.output_prefix))
    sys.stderr.write("No good haps found. END.\n")
    sys.exit(0)
# (3) phase isoforms
# Collect the read IDs inside a context manager so the input handle is closed
# as soon as parsing finishes.
with open(args.fastx_filename) as fastx_handle:
    seqids = {
        r.id
        for r in SeqIO.parse(fastx_handle,
                             VariantPhaser.type_fa_or_fq(args.fastx_filename))
    }
isoform_tally = VariantPhaser.phase_isoforms(args.read_stat, seqids, pp)
if len(isoform_tally) == 0:
    # Nothing was tallied: leave a marker file next to the outputs and stop
    # with a success exit code.
    os.system("touch {out}.NO_HAPS_FOUND".format(out=args.output_prefix))
    print("No good haps found. END.", file=sys.stderr)
    sys.exit(0)
pp.haplotypes.write_haplotype_to_vcf(args.mapping_filename, isoform_tally,
                                     args.output_prefix)

# (4) clean isoforms
hap_count = VariantPhaseCleaner.make_haplotype_counts(isoform_tally)

# (5) error correct haplotypes
#  if diploid, use exhaustive search
#  otherwise, use hap counts (ToDo: make this work with exhaustive search later)
# One list of upper-cased bases per accepted position; the second element of
# each entry in vc.variant[pos] is not needed here.
variants = [[base.upper() for base, _count in vc.variant[pos]]
            for pos in pp.accepted_pos]

# Ploidy 2 with exactly two bases at every entry of `variants`: use the
# exhaustive diploid-only search. Anything else falls through to the branch
# below.
# NOTE(review): the generator variable `vars` shadows the builtin of the same name.
if args.ploidy == 2 and all(len(vars) == 2 for vars in variants):
    diff_arr, hap_count_ordered = VariantPhaseCleaner.infer_haplotypes_via_exhaustive_diploid_only(
        pp.haplotypes, variants)
else:
    # Use the smaller of MIN_PERC_ALLOWED and 1 / (ploidy + 4) as the threshold.
    min_perc_allowed = min(
        MIN_PERC_ALLOWED, 1 / (args.ploidy + 4)
    )  # this is a heuristic, allowing for some alleles to be much less expressed than others
    diff_arr, hap_count_ordered = VariantPhaseCleaner.infer_haplotypes_via_min_diff(
Exemple #3
0
    os.system("touch {out}.NO_SNPS_FOUND".format(out=args.output_prefix))
    print >> sys.stderr, "No SNPs found. END."
    sys.exit(0)

# (2) for each CCS read, assign a haplotype (or discard if outlier)
pp = VariantPhaser.VariantPhaser(vc)
pp.phase_variant(args.sam_filename, args.fasta_filename, args.output_prefix, partial_ok=args.partial_ok)
pp.haplotypes.get_haplotype_vcf_assignment()

# (3) phase isoforms
# Collect the read IDs inside a context manager so the FASTA handle is closed
# as soon as parsing finishes.
with open(args.fasta_filename) as fasta_handle:
    seqids = set(r.id for r in SeqIO.parse(fasta_handle, 'fasta'))
isoform_tally = VariantPhaser.phase_isoforms(args.read_stat, seqids, pp)
if len(isoform_tally) == 0:
    # Nothing was tallied: leave a marker file next to the outputs and stop
    # with a success exit code.
    os.system("touch {out}.NO_HAPS_FOUND".format(out=args.output_prefix))
    # sys.stderr.write emits the same text on Python 2 and Python 3.
    sys.stderr.write("No good haps found. END.\n")
    sys.exit(0)
pp.haplotypes.write_haplotype_to_vcf(args.mapping_filename, isoform_tally, args.output_prefix)

# (4) clean isoforms
hap_count = VariantPhaseCleaner.make_haplotype_counts(isoform_tally)

# --- old, obsolete ----
#G, partial_haps = VariantPhaseCleaner.make_haplotype_graph_nonpartial_only(pp.haplotypes.haplotypes, ERR_SUB, MAX_DIFF_ALLOWED)
#print "G nodes:", G.nodes(), "G edges:", G.edges()
#m, new_hap, new_isoform_tally = VariantPhaseCleaner.error_correct_haplotypes(G, partial_haps, hap_count, pp.haplotypes, isoform_tally)
# --- current: infer haplotypes by min-diff, then error-correct against them ---
# NOTE(review): `args.ploidity` is kept as written; confirm the attribute name
# matches the option declared on the argument parser.
diff_arr, hap_count_ordered = VariantPhaseCleaner.infer_haplotypes_via_min_diff(pp.haplotypes.haplotypes, hap_count, args.ploidity, MAX_DIFF_ALLOWED)
m, new_hap, new_isoform_tally = VariantPhaseCleaner.error_correct_haplotypes(pp.haplotypes, isoform_tally, diff_arr, hap_count_ordered)

# Write the corrected haplotypes under the ".cleaned" output prefix.
new_hap.get_haplotype_vcf_assignment()
new_hap.write_haplotype_to_vcf(args.mapping_filename, new_isoform_tally, args.output_prefix+'.cleaned')