def _touch(path):
    """Create *path* if it is missing and refresh its mtime, without a shell.

    Replaces ``os.system("touch ...")``: the path is never interpreted by a
    shell, so prefixes containing spaces or metacharacters are safe.
    """
    try:
        with open(path, "a"):
            os.utime(path, None)
    except (IOError, OSError) as err:
        # The shell `touch` this replaces was best-effort (its exit status was
        # ignored), so a failure here is reported but stays non-fatal.
        sys.stderr.write("WARNING: could not create {0}: {1}\n".format(path, err))


# Build the phaser from the variant calls in `vc` and phase the reads found in
# the SAM / FASTA-or-FASTQ inputs.
pp = VariantPhaser.VariantPhaser(vc)
pp.phase_variant(args.sam_filename,
                 args.fastx_filename,
                 args.output_prefix,
                 partial_ok=args.partial_ok)
# NOTE(review): bare attribute access with no visible effect in this block;
# kept in case `haplotypes` is a property with side effects -- confirm and drop.
pp.haplotypes
pp.haplotypes.get_haplotype_vcf_assignment()

# (3) phase isoforms
# Collect the read ids inside a `with` block so the input handle is closed
# deterministically instead of being left to the garbage collector.
with open(args.fastx_filename) as fastx_handle:
    seqids = {
        r.id
        for r in SeqIO.parse(fastx_handle,
                             VariantPhaser.type_fa_or_fq(args.fastx_filename))
    }
isoform_tally = VariantPhaser.phase_isoforms(args.read_stat, seqids, pp)
if len(isoform_tally) == 0:
    # Leave a marker file, report on stderr, and stop with a success status.
    _touch("{out}.NO_HAPS_FOUND".format(out=args.output_prefix))
    # sys.stderr.write works on both Python 2 and 3, unlike `print >> sys.stderr`.
    sys.stderr.write("No good haps found. END.\n")
    sys.exit(0)
pp.haplotypes.write_haplotype_to_vcf(args.mapping_filename, isoform_tally, args.output_prefix)

# (4) clean isoforms
hap_count = VariantPhaseCleaner.make_haplotype_counts(isoform_tally)

# (5) error correct haplotypes
#     if diploid, use exhaustive search
#     otherwise, use hap counts (ToDo: make this work with exhaustive search later)
# One list of upper-cased bases per accepted position; the second element of
# each pair in vc.variant[pos] is not needed here.
variants = [[base.upper() for base, _ in vc.variant[pos]]
            for pos in pp.accepted_pos]

# The exhaustive search is only used when every accepted position has exactly
# two observed bases.
if args.ploidy == 2 and all(len(bases) == 2 for bases in variants):
    diff_arr, hap_count_ordered = \
        VariantPhaseCleaner.infer_haplotypes_via_exhaustive_diploid_only(
            pp.haplotypes, variants)
else:
    diff_arr, hap_count_ordered = \
        VariantPhaseCleaner.infer_haplotypes_via_min_diff(
            pp.haplotypes.haplotypes, hap_count, args.ploidy, MAX_DIFF_ALLOWED)

if diff_arr is None:
    # Inference returned no result: leave the "cleaned" marker and stop.
    _touch("{out}.cleaned.NO_HAPS_FOUND".format(out=args.output_prefix))
    sys.stderr.write("No good haps found. END.\n")
    sys.exit(0)
# (3) phase isoforms seqids = set([ r.id for r in SeqIO.parse(open(args.fastx_filename), VariantPhaser.type_fa_or_fq(args.fastx_filename)) ]) isoform_tally = VariantPhaser.phase_isoforms(args.read_stat, seqids, pp) if len(isoform_tally) == 0: os.system("touch {out}.NO_HAPS_FOUND".format(out=args.output_prefix)) print("No good haps found. END.", file=sys.stderr) sys.exit(0) pp.haplotypes.write_haplotype_to_vcf(args.mapping_filename, isoform_tally, args.output_prefix) # (4) clean isoforms hap_count = VariantPhaseCleaner.make_haplotype_counts(isoform_tally) # (5) error correct haplotypes # if diploid, use exhaustive search # otherwise, use hap counts (ToDo: make this work with exhaustive search later) variants = [[base.upper() for base, count in vc.variant[pos]] for pos in pp.accepted_pos] if args.ploidy == 2 and all(len(vars) == 2 for vars in variants): diff_arr, hap_count_ordered = VariantPhaseCleaner.infer_haplotypes_via_exhaustive_diploid_only( pp.haplotypes, variants) else: min_perc_allowed = min( MIN_PERC_ALLOWED, 1 / (args.ploidy + 4) ) # this is a heuristic, allowing for some alleles to be much less expressde than others diff_arr, hap_count_ordered = VariantPhaseCleaner.infer_haplotypes_via_min_diff(
os.system("touch {out}.NO_SNPS_FOUND".format(out=args.output_prefix)) print >> sys.stderr, "No SNPs found. END." sys.exit(0) # (2) for each CCS read, assign a haplotype (or discard if outlier) pp = VariantPhaser.VariantPhaser(vc) pp.phase_variant(args.sam_filename, args.fasta_filename, args.output_prefix, partial_ok=args.partial_ok) pp.haplotypes pp.haplotypes.get_haplotype_vcf_assignment() # (3) phase isoforms seqids = set([r.id for r in SeqIO.parse(open(args.fasta_filename), 'fasta')]) isoform_tally = VariantPhaser.phase_isoforms(args.read_stat, seqids, pp) if len(isoform_tally) == 0: os.system("touch {out}.NO_HAPS_FOUND".format(out=args.output_prefix)) print >> sys.stderr, "No good haps found. END." sys.exit(0) pp.haplotypes.write_haplotype_to_vcf(args.mapping_filename, isoform_tally, args.output_prefix) # (4) clean isoforms hap_count = VariantPhaseCleaner.make_haplotype_counts(isoform_tally) # --- old, obsolete ---- #G, partial_haps = VariantPhaseCleaner.make_haplotype_graph_nonpartial_only(pp.haplotypes.haplotypes, ERR_SUB, MAX_DIFF_ALLOWED) #print "G nodes:", G.nodes(), "G edges:", G.edges() #m, new_hap, new_isoform_tally = VariantPhaseCleaner.error_correct_haplotypes(G, partial_haps, hap_count, pp.haplotypes, isoform_tally) diff_arr, hap_count_ordered = VariantPhaseCleaner.infer_haplotypes_via_min_diff(pp.haplotypes.haplotypes, hap_count, args.ploidity, MAX_DIFF_ALLOWED) m, new_hap, new_isoform_tally = VariantPhaseCleaner.error_correct_haplotypes(pp.haplotypes, isoform_tally, diff_arr, hap_count_ordered) new_hap.get_haplotype_vcf_assignment() new_hap.write_haplotype_to_vcf(args.mapping_filename, new_isoform_tally, args.output_prefix+'.cleaned')