def get_consensus(read_fn, init_ref, consensus_fn, consens_seq_name,
                  hp_correction=True,
                  min_iteration=4,
                  max_num_reads=150,
                  entropy_th=0.65):
    """Iteratively build a consensus from the reads in read_fn, starting from
    init_ref, and write it to consensus_fn under the name consens_seq_name."""

    # Initial consensus from the alignment graph built against the seed reference.
    g = construct_aln_graph_from_fasta(read_fn, init_ref,
                                       max_num_reads=max_num_reads,
                                       remove_in_del=False)
    s, c = g.generate_consensus()

    with open(consensus_fn, "w") as f:
        print >>f, ">" + consens_seq_name
        print >>f, s.upper()

    # Refine the consensus by realigning the reads to the previous consensus.
    if min_iteration > 1:
        for i in range(min_iteration - 2):
            g = construct_aln_graph_from_fasta(read_fn, consensus_fn,
                                               max_num_reads=max_num_reads,
                                               remove_in_del=False)
            s, c = g.generate_consensus()
            with open(consensus_fn, "w") as f:
                print >>f, ">" + consens_seq_name
                print >>f, s.upper()

    # Optionally patch likely missing homopolymer bases in the consensus.
    if hp_correction:
        g = construct_aln_graph_from_fasta(read_fn, consensus_fn,
                                           max_num_reads=max_num_reads,
                                           remove_in_del=False)
        s = detect_missing(g, entropy_th=entropy_th)
        with open(consensus_fn, "w") as f:
            print >>f, ">" + consens_seq_name
            print >>f, s.upper()

    # Final pass: regenerate the consensus and emit it with low-confidence
    # (high-entropy) bases marked in lower case.
    g = construct_aln_graph_from_fasta(read_fn, consensus_fn,
                                       max_num_reads=max_num_reads,
                                       remove_in_del=False)
    s, c = g.generate_consensus()
    s = mark_lower_case_base(g, entropy_th=entropy_th)
    with open(consensus_fn, "w") as f:
        print >>f, ">" + consens_seq_name
        print >>f, s
def generate_haplotype_consensus(inpute_fasta_name, ref_fasta_name, prefix, consensus_name,
                                 hpFix=True,
                                 min_iteration=4,
                                 max_num_reads=150,
                                 entropy_th=0.65):
    """Cluster the reads into two haplotype groups and generate a consensus
    sequence for each group, writing <prefix>_h1.fa and <prefix>_h2.fa."""

    # Normalize the input reads against the reference and build an overall consensus.
    normalize_fasta(inpute_fasta_name, ref_fasta_name, "%s_input.fa" % prefix)
    get_consensus("%s_input.fa" % prefix, ref_fasta_name, "%s.fa" % prefix, consensus_name,
                  hp_correction=False,
                  min_iteration=min_iteration,
                  max_num_reads=max_num_reads,
                  entropy_th=entropy_th)

    # Realign the reads to the overall consensus and cluster them into two
    # haplotype groups using the high-entropy (variant) columns.
    g = construct_aln_graph_from_fasta("%s_input.fa" % prefix, "%s.fa" % prefix,
                                       ref_group=consensus_name,
                                       max_num_reads=max_num_reads,
                                       remove_in_del=False)
    rv, hen = read_node_vector(g, entropy_th=entropy_th)
    cluster, cluster_vec = clustering_read(rv, hen, k_cluster=2, random_seed=42)

    # Log the clustering result for inspection.
    with open("%s.log" % prefix, "w") as logf:
        print >>logf, len(rv)
        for k in cluster:
            print >>logf, cluster_vec[k], k, len(cluster[k])
        for k in cluster:
            for r in cluster[k]:
                print >>logf, "".join(rv[r]), k, r

    # Build a consensus for each non-empty haplotype group.
    if len(cluster[0]) > 0:
        get_subset_reads("%s_input.fa" % prefix, cluster, 0, "%s_h1_input.fa" % prefix)
        #rid, s = best_template_by_blasr("%s_h1_input.fa" % prefix)
        #print rid, len(s)
        #with open("%s_h1_ref.fa" % prefix, "w") as f:
        #    print >>f, ">%s_h1_ref" % prefix
        #    print >>f, s
        get_consensus("%s_h1_input.fa" % prefix, "%s.fa" % prefix, "%s_h1.fa" % prefix,
                      "%s_h1" % consensus_name,
                      hp_correction=hpFix,
                      min_iteration=min_iteration,
                      max_num_reads=max_num_reads)

    if len(cluster[1]) > 0:
        get_subset_reads("%s_input.fa" % prefix, cluster, 1, "%s_h2_input.fa" % prefix)
        #rid, s = best_template_by_blasr("%s_h2_input.fa" % prefix)
        #print rid, len(s)
        #with open("%s_h2_ref.fa" % prefix, "w") as f:
        #    print >>f, ">%s_h2_ref" % prefix
        #    print >>f, s
        get_consensus("%s_h2_input.fa" % prefix, "%s.fa" % prefix, "%s_h2.fa" % prefix,
                      "%s_h2" % consensus_name,
                      hp_correction=hpFix,
                      min_iteration=min_iteration,
                      max_num_reads=max_num_reads)
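
# A minimal usage sketch, not part of the original module: the FASTA paths and
# the "sample" prefix below are assumptions for illustration only. It first
# builds an overall consensus for a read set, then phases the same reads into
# two haplotype consensi (written as sample_h1.fa and sample_h2.fa next to
# sample.fa).
if __name__ == "__main__":
    # Hypothetical inputs: reads.fa holds the raw reads, seed_ref.fa a rough
    # starting reference for the first alignment graph.
    get_consensus("reads.fa", "seed_ref.fa", "overall_consensus.fa", "overall_consensus")
    generate_haplotype_consensus("reads.fa", "seed_ref.fa", "sample", "sample_consensus")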