def main():
    """Dump every odd-indexed read and its neighborhood as FASTA files.

    For each odd index ``i`` into the reads file, ``<i>_base.fasta`` (the
    header and sequence of read ``i``) is written to the output folder.
    ``<i>_hood.fasta`` is written as well, but only when
    ``get_special_1_deg_nhood`` returns a non-empty result; its items are
    joined with newlines.

    Files are written straight into the output folder. Earlier versions
    wrote them to the working directory and shelled out to ``mv``, which
    also tried to move a hood file that had never been created and ignored
    every failure.
    """
    prior = '/home/yu/max/research/'
    reads_file = prior + 'data/reads.20k.rc.fasta'
    creads_file = prior + 'data/temp_creads.outrx_27_6_rc_v2.out'
    ktmer_headers_file = prior + 'data/temp_ktmer_headersrx_27_6_rc_v2.out'

    hr, rr = rf.read_fasta(reads_file)
    headers = itec4.build_headers_dict(ktmer_headers_file)
    creads = itec4.build_creads_dict(creads_file, hr, rr)

    # Keep only the first whitespace-delimited token of each header
    # (done in place, after the creads dict has been built).
    for idx, full_header in enumerate(hr):
        hr[idx] = full_header.split()[0]

    out_fold = '/home/yu/max/research/6.8.15_nhoods/'
    if not os.path.exists(out_fold):
        os.makedirs(out_fold)

    # Only odd indices are processed.
    for i in range(1, len(hr), 2):
        print(i)
        header = hr[i]
        nhood = get_special_1_deg_nhood(header, creads, headers, hr, rr)

        base_path = os.path.join(out_fold, str(i) + '_base.fasta')
        with open(base_path, 'w') as f:
            f.write(header + '\n' + rr[i])

        # No hood file is produced for a read without neighbors.
        if len(nhood) != 0:
            hood_path = os.path.join(out_fold, str(i) + '_hood.fasta')
            with open(hood_path, 'w') as f:
                f.write('\n'.join(nhood))
    return
def convert_creads_to_nhoods(reads_file, creads_file, ktmer_headers_file):
    """Write the 1-degree neighborhood of every read as a list of read indices.

    The output file is created in the same directory as ``creads_file`` and
    is named ``nhoods_<basename of creads_file>``. It holds one line per
    read: the read's index, a space, then the space-separated indices of the
    headers returned by ``itec4.get_1_deg_nhood`` for that read.

    Args:
        reads_file: Path of the FASTA reads file.
        creads_file: Path of the creads file; also determines the output
            location.
        ktmer_headers_file: Path of the kt-mer headers file.

    Raises:
        ValueError: If a neighbor header is not one of the read headers.
    """
    import os  # local import: the file-level import block is not visible here

    # Build the path with os.path so a relative creads_file stays relative.
    # The old string arithmetic always prefixed '/', sending such output to
    # the filesystem root.
    out_file = os.path.join(os.path.dirname(creads_file),
                            'nhoods_' + os.path.basename(creads_file))

    creads = itec4.build_creads_dict(creads_file, reads_file)
    headers = itec4.build_headers_dict(ktmer_headers_file)
    hr, rr = rf.read_fasta(reads_file)

    # Keep only the first whitespace-delimited token of each header.
    for idx, full_header in enumerate(hr):
        hr[idx] = full_header.split()[0]

    # Header -> first index at which it occurs (what hr.index() returned),
    # built once instead of scanning the list for every neighbor.
    index_of = {}
    for idx, name in enumerate(hr):
        index_of.setdefault(name, idx)

    lines = []
    for i, header in enumerate(hr):
        if i % 500 == 0:
            # Progress report: index and wall-clock time.
            print('%s %s' % (i, datetime.datetime.now()))
        nh = itec4.get_1_deg_nhood(header, creads, headers)
        neighbor_indices = []
        for neighbor_header in nh:
            if neighbor_header not in index_of:
                # Same exception type list.index() raised.
                raise ValueError('%r is not in list' % (neighbor_header,))
            neighbor_indices.append(str(index_of[neighbor_header]))
        lines.append(str(i) + ' ' + ' '.join(neighbor_indices) + '\n')

    with open(out_file, 'w') as f:
        f.write(''.join(lines))
    return
def convert_creads_to_nhoods(reads_file, creads_file, ktmer_headers_file):
    """Write the 1-degree neighborhood of every read as a list of read indices.

    The output file is created in the same directory as ``creads_file`` and
    is named ``nhoods_<basename of creads_file>``. It holds one line per
    read: the read's index, a space, then the space-separated indices of the
    headers returned by ``itec4.get_1_deg_nhood`` for that read.

    Args:
        reads_file: Path of the FASTA reads file.
        creads_file: Path of the creads file; also determines the output
            location.
        ktmer_headers_file: Path of the kt-mer headers file.

    Raises:
        ValueError: If a neighbor header is not one of the read headers.
    """
    import os  # local import: the file-level import block is not visible here

    # Build the path with os.path so a relative creads_file stays relative.
    # The old string arithmetic always prefixed '/', sending such output to
    # the filesystem root.
    out_file = os.path.join(os.path.dirname(creads_file),
                            'nhoods_' + os.path.basename(creads_file))

    creads = itec4.build_creads_dict(creads_file, reads_file)
    headers = itec4.build_headers_dict(ktmer_headers_file)
    hr, rr = rf.read_fasta(reads_file)

    # Keep only the first whitespace-delimited token of each header.
    for idx, full_header in enumerate(hr):
        hr[idx] = full_header.split()[0]

    # Header -> first index at which it occurs (what hr.index() returned),
    # built once instead of scanning the list for every neighbor.
    index_of = {}
    for idx, name in enumerate(hr):
        index_of.setdefault(name, idx)

    lines = []
    for i, header in enumerate(hr):
        if i % 500 == 0:
            # Progress report: index and wall-clock time.
            print('%s %s' % (i, datetime.datetime.now()))
        nh = itec4.get_1_deg_nhood(header, creads, headers)
        neighbor_indices = []
        for neighbor_header in nh:
            if neighbor_header not in index_of:
                # Same exception type list.index() raised.
                raise ValueError('%r is not in list' % (neighbor_header,))
            neighbor_indices.append(str(index_of[neighbor_header]))
        lines.append(str(i) + ' ' + ' '.join(neighbor_indices) + '\n')

    with open(out_file, 'w') as f:
        f.write(''.join(lines))
    return
def main():
    """Error-correct one read and report how it and its neighbors align.

    The read is selected by ``sys.argv[1]`` (its header without the leading
    ``>``). The consensus returned by ``itec4.error_correct`` is written to a
    temporary FASTA file and aligned to the E. coli genome with BLASR. Then
    every read that shares a kt-mer with the selected read is aligned both
    to the consensus and to the genome. Finally the number of neighbors with
    a non-empty alignment to the consensus is printed, followed by each
    neighbor's genome alignment output.

    If error correction returns an empty consensus, a failure message is
    printed and the process exits with status 0.
    """
    header = '>' + sys.argv[1]
    e_coli_genome = '/home/mshen/research/data/e_coli_genome.fasta'
    blasr_exe = '/home/jeyuan/blasr/alignment/bin/blasr'
    blasr_options = '-bestn 1 -m 1'    # Concise output
    # Time-of-day part of the current timestamp, used to name temp files.
    temp_sig = str(datetime.datetime.now()).split()[1]

    # Earlier dataset configuration, kept for reference:
    # ec_tool = '/home/mshen/research/bin/error_correction_1218.sh'
    # reads_file = '/home/mshen/research/data/PacBioCLR/PacBio_10kb_CLR_mapped_removed_homopolymers.fasta'
    # creads_file = '/home/mshen/research/data/22.4_creads.out'
    # ktmer_headers_file = '/home/mshen/research/data/22.4_ktmer_headers.out'

    # New dataset
    ec_tool = '/home/lin/program/error_correction_5X_0204.sh'
    # reads_file = '/home/mchaisso/datasets/pacbio_ecoli/reads.20k.fasta'
    reads_file = '/home/mshen/research/data/reads.20k.rc.fasta'
    # creads_file = '/home/mshen/research/data/22.8_creads_20k.out'
    # ktmer_headers_file = '/home/mshen/research/data/22.8_ktmer_headers_20k.out'
    creads_file = '/home/mshen/research/data/temp_creads.out_28_6_rc.out'
    ktmer_headers_file = '/home/mshen/research/data/temp_ktmer_headers_28_6_rc.out'

    creads = itec4.build_creads_dict(creads_file, reads_file)
    headers = itec4.build_headers_dict(ktmer_headers_file)
    hr, rr = rf.read_fasta(reads_file)

    # Compensate for new dataset: keep only the first token of each header.
    for idx, full_header in enumerate(hr):
        hr[idx] = full_header.split()[0]

    con = itec4.error_correct(ec_tool, header, headers, creads, hr, rr,
                              temp_sig_out=temp_sig)
    if len(con) == 0:
        print('FAILURE IN ERROR CORRECTION')
        sys.exit(0)

    temp_file = 'temp_cfh_' + temp_sig + '.fasta'
    temp2_file = 'temp_cfh2_' + temp_sig + '.fasta'

    # Align the consensus to the genome; only the command output is kept.
    with open(temp_file, 'w') as f:
        f.write(header + '\n' + con)
    status = commands.getstatusoutput(
        blasr_exe + ' ' + temp_file + ' ' + e_coli_genome + ' '
        + blasr_options)[1]
    if len(status) != 0:
        print(status)

    # The kt-mers of a read sit at the odd positions of its creads entry.
    # A header that is missing from creads (or has a one-element entry)
    # contributes no kt-mers. The original guard for this case was a no-op
    # ``pass`` and a missing header raised KeyError right after it.
    ktmers = creads.get(header, [])[1::2]

    # Every read that contains at least one of those kt-mers.
    collected_h = set()
    for kt in ktmers:
        collected_h.update(headers[kt])

    to_con = []
    to_gen = []
    for ch in collected_h:
        with open(temp2_file, 'w') as f:
            f.write(ch + '\n' + rr[hr.index(ch)])
        # Neighbor against the consensus ...
        status = commands.getstatusoutput(
            blasr_exe + ' ' + temp2_file + ' ' + temp_file + ' '
            + blasr_options)[1]
        to_con.append(status)
        # ... and the same neighbor against the genome.
        status = commands.getstatusoutput(
            blasr_exe + ' ' + temp2_file + ' ' + e_coli_genome + ' '
            + blasr_options)[1]
        to_gen.append(status)

    num_aligned = sum([1 for s in to_con if len(s) > 0])
    print('%s used in consensus out of %s' % (num_aligned, len(to_con)))
    for tg in to_gen:
        print(tg)
# Finds data on the number of kt-mers per read
import sys

import itec4
import read_fasta as rf

reads_file = '/home/mshen/research/data/reads.20k.rc.fasta'
creads_file = '/home/mshen/research/data/20k_v2/temp_creads.outrx_27_6_rc_v2.out'

hr, rr = rf.read_fasta(reads_file)
creads_dict = itec4.build_creads_dict(creads_file, hr, rr)

# Keep only the first whitespace-delimited token of each header.
for i in range(len(hr)):
    hr[i] = hr[i].split()[0]

# Preview of the first reads (at most 20): header, kt-mer count, read length.
# The count is (entry length - 1) / 2, matching the computation further down.
# The previous expression `len(...) - 1 / 2` parsed as `len(...) - (1 / 2)`,
# so it printed the raw entry length.
for i in range(min(20, len(hr))):
    print('%s %s %s' % (hr[i], (len(creads_dict[hr[i]]) - 1) // 2, len(rr[i])))

# NOTE(review): this early exit makes everything below unreachable. It looks
# like a debugging stop; it is kept so the script's output does not change.
# Confirm and remove it to get the full statistics.
sys.exit(0)

# kt-mer count of every read, one value per line, followed by the mean.
data = []
for i, k in enumerate(creads_dict.keys()):
    data.append((len(creads_dict[k]) - 1) // 2)
    if i % 5000 == 0:
        print(i)

print('\n'.join([str(s) for s in data]))
print(float(sum(data)) / float(len(data)))
import itec4

# Input locations; every path below is built from `prior`.
prior = '/home/mshen/research/'
reads_fn = prior + 'data/reads.20k.rc.fasta'
creads_fn = prior + 'data/20k_v2/temp_creads.outrx_27_6_rc_v2.out'
ktmer_headers_fn = prior + 'data/20k_v2/temp_ktmer_headersrx_27_6_rc_v2.out'
genome_fn = prior + 'data/ecoli_consensus_mark.fasta'

bt_fraction = 0.01    # If the current best score is less than fraction * best last score, backtrack
forget_cutoff = 50000    # NOTE(review): not used in the visible part of the file - meaning to be confirmed
traversed = []
score_history = ([0], [0])    # First is score history, second is num_candidates history

# The input data is loaded at import time and shared as module-level globals.
# NOTE(review): `ml` is not imported in the visible part of this file -
# confirm it is imported elsewhere.
hr, rr = ml.read_fasta(reads_fn)
headers = itec4.build_headers_dict(ktmer_headers_fn)
creads = itec4.build_creads_dict(creads_fn, hr, rr)

# Keep only the first whitespace-delimited token of each header (in place,
# after the creads dict has been built).
for i in range(len(hr)):
    hr[i] = hr[i].split()[0]


def main():
    """Print the three input file names, then run ``afa`` on them."""
    print 'Reads File:', reads_fn, '\ncreads File:', creads_fn, '\nktmer Headers File:', \
        ktmer_headers_fn
    afa(reads_fn, ktmer_headers_fn, creads_fn)
    return


def afa(reads_fn, ktmer_headers_fn, creads_fn):
    """Load the genome sequence and the kt-mer list.

    NOTE(review): this function appears to be cut off in this view. None of
    the three parameters is used in the visible statements and nothing is
    returned - confirm against the full file.
    """
    # Only the first sequence of the genome FASTA is kept.
    gh, gr = ml.read_fasta(genome_fn)
    gr = gr[0]
    # Keys of the module-level `headers` dict.
    ktmers = headers.keys()
# Input locations; every path below is built from `prior`.
prior = '/home/mshen/research/'
reads_fn = prior + 'data/reads.20k.rc.fasta'
creads_fn = prior + 'data/20k_v2/temp_creads.outrx_27_6_rc_v2.out'
ktmer_headers_fn = prior + 'data/20k_v2/temp_ktmer_headersrx_27_6_rc_v2.out'
genome_fn = prior + 'data/ecoli_consensus_mark.fasta'

bt_fraction = 0.01    # If the current best score is less than fraction * best last score, backtrack
forget_cutoff = 50000    # NOTE(review): not used in the visible part of the file - meaning to be confirmed
traversed = []
score_history = ([0], [0]
                 )    # First is score history, second is num_candidates history

# The input data is loaded at import time and shared as module-level globals.
# NOTE(review): neither `ml` nor `itec4` is imported in the visible part of
# this file - confirm they are imported elsewhere.
hr, rr = ml.read_fasta(reads_fn)
headers = itec4.build_headers_dict(ktmer_headers_fn)
creads = itec4.build_creads_dict(creads_fn, hr, rr)

# Keep only the first whitespace-delimited token of each header (in place,
# after the creads dict has been built).
for i in range(len(hr)):
    hr[i] = hr[i].split()[0]


def main():
    """Print the three input file names, then run ``afa`` on them."""
    print 'Reads File:', reads_fn, '\ncreads File:', creads_fn, '\nktmer Headers File:', \
        ktmer_headers_fn
    afa(reads_fn, ktmer_headers_fn, creads_fn)
    return


def afa(reads_fn, ktmer_headers_fn, creads_fn):
    """Load the genome sequence.

    NOTE(review): this function appears to be cut off in this view. None of
    the three parameters is used in the visible statements and nothing is
    returned - confirm against the full file.
    """
    # Only the first sequence of the genome FASTA is kept.
    gh, gr = ml.read_fasta(genome_fn)
    gr = gr[0]
def main():
    """Error-correct one read and report how it and its neighbors align.

    The read is selected by ``sys.argv[1]`` (its header without the leading
    ``>``). The consensus returned by ``itec4.error_correct`` is written to a
    temporary FASTA file and aligned to the E. coli genome with BLASR. Then
    every read that shares a kt-mer with the selected read is aligned both
    to the consensus and to the genome. Finally the number of neighbors with
    a non-empty alignment to the consensus is printed, followed by each
    neighbor's genome alignment output.

    If error correction returns an empty consensus, a failure message is
    printed and the process exits with status 0.
    """
    header = '>' + sys.argv[1]
    e_coli_genome = '/home/mshen/research/data/e_coli_genome.fasta'
    blasr_exe = '/home/jeyuan/blasr/alignment/bin/blasr'
    blasr_options = '-bestn 1 -m 1'    # Concise output
    # Time-of-day part of the current timestamp, used to name temp files.
    temp_sig = str(datetime.datetime.now()).split()[1]

    # Earlier dataset configuration, kept for reference:
    # ec_tool = '/home/mshen/research/bin/error_correction_1218.sh'
    # reads_file = '/home/mshen/research/data/PacBioCLR/PacBio_10kb_CLR_mapped_removed_homopolymers.fasta'
    # creads_file = '/home/mshen/research/data/22.4_creads.out'
    # ktmer_headers_file = '/home/mshen/research/data/22.4_ktmer_headers.out'

    # New dataset
    ec_tool = '/home/lin/program/error_correction_5X_0204.sh'
    # reads_file = '/home/mchaisso/datasets/pacbio_ecoli/reads.20k.fasta'
    reads_file = '/home/mshen/research/data/reads.20k.rc.fasta'
    # creads_file = '/home/mshen/research/data/22.8_creads_20k.out'
    # ktmer_headers_file = '/home/mshen/research/data/22.8_ktmer_headers_20k.out'
    creads_file = '/home/mshen/research/data/temp_creads.out_28_6_rc.out'
    ktmer_headers_file = '/home/mshen/research/data/temp_ktmer_headers_28_6_rc.out'

    creads = itec4.build_creads_dict(creads_file, reads_file)
    headers = itec4.build_headers_dict(ktmer_headers_file)
    hr, rr = rf.read_fasta(reads_file)

    # Compensate for new dataset: keep only the first token of each header.
    for idx, full_header in enumerate(hr):
        hr[idx] = full_header.split()[0]

    con = itec4.error_correct(ec_tool, header, headers, creads, hr, rr,
                              temp_sig_out=temp_sig)
    if len(con) == 0:
        print('FAILURE IN ERROR CORRECTION')
        sys.exit(0)

    temp_file = 'temp_cfh_' + temp_sig + '.fasta'
    temp2_file = 'temp_cfh2_' + temp_sig + '.fasta'

    # Align the consensus to the genome; only the command output is kept.
    with open(temp_file, 'w') as f:
        f.write(header + '\n' + con)
    status = commands.getstatusoutput(
        blasr_exe + ' ' + temp_file + ' ' + e_coli_genome + ' '
        + blasr_options)[1]
    if len(status) != 0:
        print(status)

    # The kt-mers of a read sit at the odd positions of its creads entry.
    # A header that is missing from creads (or has a one-element entry)
    # contributes no kt-mers. The original guard for this case was a no-op
    # ``pass`` and a missing header raised KeyError right after it.
    ktmers = creads.get(header, [])[1::2]

    # Every read that contains at least one of those kt-mers.
    collected_h = set()
    for kt in ktmers:
        collected_h.update(headers[kt])

    to_con = []
    to_gen = []
    for ch in collected_h:
        with open(temp2_file, 'w') as f:
            f.write(ch + '\n' + rr[hr.index(ch)])
        # Neighbor against the consensus ...
        status = commands.getstatusoutput(
            blasr_exe + ' ' + temp2_file + ' ' + temp_file + ' '
            + blasr_options)[1]
        to_con.append(status)
        # ... and the same neighbor against the genome.
        status = commands.getstatusoutput(
            blasr_exe + ' ' + temp2_file + ' ' + e_coli_genome + ' '
            + blasr_options)[1]
        to_gen.append(status)

    num_aligned = sum([1 for s in to_con if len(s) > 0])
    print('%s used in consensus out of %s' % (num_aligned, len(to_con)))
    for tg in to_gen:
        print(tg)