remaining_time = time_passed / count * (len(paired_end_reads) - count) print 'Approximately {:.3} minutes remaining'.format( remaining_time) return alignments, genome_aligned_reads if __name__ == "__main__": genome_name = 'practice_W_1' input_folder = './{}'.format(genome_name) chr_name = '{}_chr_1'.format(genome_name) reads_fn_end = 'reads_{}.txt'.format(chr_name) reads_fn = join(input_folder, reads_fn_end) ref_fn_end = 'ref_{}.txt'.format(chr_name) ref_fn = join(input_folder, ref_fn_end) key_length = 7 start = time.clock() reads = read_reads(reads_fn) # If you want to speed it up, cut down the number of reads by # changing the line to reads = read_reads(reads_fn)[:<x>] where <x> # is the number of reads you want to work with. genome_hash_table = build_hash_and_pickle(ref_fn, key_length) ref = read_reference(ref_fn) genome_aligned_reads, alignments = hashing_algorithm( reads, genome_hash_table) # print genome_aligned_reads # print alignments output_str = pretty_print_aligned_reads_with_ref(genome_aligned_reads, alignments, ref) print output_str[:5000]
output_read_pair.append(read) # # Note that there are some huge potential problems here. all_read_alignment_locations.append(read_alignment_locations) output_read_pairs.append(output_read_pair) return all_read_alignment_locations, output_read_pairs if __name__ == "__main__": data_folder = 'hw1_W_2' input_folder = join('../data/', data_folder) f_base = '{}_chr_1'.format(data_folder) reads_fn = join(input_folder, 'reads_{}.txt'.format(f_base)) start = time.clock() input_reads = read_reads(reads_fn) # This will take a while; you can use an array slice for example: # # input_reads = reads[:300] # # to generate some data quickly. reference_fn = join(input_folder, 'ref_{}.txt'.format(f_base)) reference = read_reference(reference_fn) alignments, reads = trivial_algorithm(input_reads, reference) print alignments print reads output_str = pretty_print_aligned_reads_with_ref(reads, alignments, reference) output_fn = join(input_folder, 'aligned_{}.txt'.format(f_base)) with(open(output_fn, 'w')) as output_file: output_file.write(output_str)
n_mismatches = sum(mismatches) if n_mismatches < max_mismatches: min_mismatch_location = i - part read = rev_read read_alignment_locations.append(min_mismatch_location) output_read_pair.append(read) all_read_alignment_locations.append(read_alignment_locations) output_read_pairs.append(output_read_pair) return all_read_alignment_locations, output_read_pairs if __name__ == "__main__": data_folder = 'practice_W_1' input_folder = join('./', data_folder) f_base = '{}_chr_1'.format(data_folder) reads_fn = join(input_folder, 'reads_{}.txt'.format(f_base)) start = time.clock() input_reads = read_reads(reads_fn) reference_fn = join(input_folder, 'ref_{}.txt'.format(f_base)) reference = read_reference(reference_fn) alignments, reads = faster_algorithm(input_reads, reference) print alignments print reads output_str = pretty_print_aligned_reads_with_ref(reads, alignments, reference) output_fn = join(input_folder, 'aligned_{}.txt'.format(f_base)) with (open(output_fn, 'w')) as output_file: output_file.write(output_str)
genome_aligned_reads.append(genome_aligned_read) count += 1 if count % 100 == 0: time_passed = (time.clock()-start)/60 print '{} reads aligned'.format(count), 'in {:.3} minutes'.format(time_passed) remaining_time = time_passed/count*(len(paired_end_reads)-count) print 'Approximately {:.3} minutes remaining'.format(remaining_time) return alignments, genome_aligned_reads if __name__ == "__main__": genome_name = 'practice_W_1' input_folder = './{}'.format(genome_name) chr_name = '{}_chr_1'.format(genome_name) reads_fn_end = 'reads_{}.txt'.format(chr_name) reads_fn = join(input_folder, reads_fn_end) ref_fn_end = 'ref_{}.txt'.format(chr_name) ref_fn = join(input_folder, ref_fn_end) key_length = 7 start = time.clock() reads = read_reads(reads_fn) # If you want to speed it up, cut down the number of reads by # changing the line to reads = read_reads(reads_fn)[:<x>] where <x> # is the number of reads you want to work with. genome_hash_table = build_hash_and_pickle(ref_fn, key_length) ref = read_reference(ref_fn) genome_aligned_reads, alignments = hashing_algorithm(reads, genome_hash_table) # print genome_aligned_reads # print alignments output_str = pretty_print_aligned_reads_with_ref(genome_aligned_reads, alignments, ref) print output_str[:5000]