Ejemplo n.º 1
0
            remaining_time = time_passed / count * (len(paired_end_reads) -
                                                    count)
            print 'Approximately {:.3} minutes remaining'.format(
                remaining_time)
    return alignments, genome_aligned_reads


if __name__ == "__main__":
    # Script entry point: align the reads of one practice chromosome against
    # its reference with the hash-based aligner, then print the beginning of
    # the formatted alignment.
    genome_name = 'practice_W_1'
    input_folder = './{}'.format(genome_name)
    chr_name = '{}_chr_1'.format(genome_name)
    reads_fn_end = 'reads_{}.txt'.format(chr_name)
    reads_fn = join(input_folder, reads_fn_end)
    ref_fn_end = 'ref_{}.txt'.format(chr_name)
    ref_fn = join(input_folder, ref_fn_end)
    # Key length handed to build_hash_and_pickle for the genome hash table.
    key_length = 7
    # NOTE(review): `start` is not read anywhere in this block; kept in case
    # code elsewhere in the file relies on it.
    start = time.clock()
    reads = read_reads(reads_fn)
    # To speed things up, work on a subset of the reads, e.g.
    #   reads = read_reads(reads_fn)[:<x>]
    # where <x> is the number of reads you want to work with.
    genome_hash_table = build_hash_and_pickle(ref_fn, key_length)
    ref = read_reference(ref_fn)
    # The function that ends just above this block returns
    # `alignments, genome_aligned_reads`, so unpack in that order.  The
    # previous code unpacked the pair the other way round, which passed the
    # two lists to the pretty-printer in each other's positions.
    # NOTE(review): assumes that function is hashing_algorithm -- confirm.
    alignments, genome_aligned_reads = hashing_algorithm(
        reads, genome_hash_table)
    # For debugging, print genome_aligned_reads / alignments here.
    output_str = pretty_print_aligned_reads_with_ref(genome_aligned_reads,
                                                     alignments, ref)
    # Only the first 5000 characters are printed.  A single parenthesised
    # argument behaves identically under the Python 2 print statement.
    print(output_str[:5000])
            output_read_pair.append(read)
            # # Note that there are some huge potential problems here.

        all_read_alignment_locations.append(read_alignment_locations)
        output_read_pairs.append(output_read_pair)
    return all_read_alignment_locations, output_read_pairs


if __name__ == "__main__":
    # Script entry point: run trivial_algorithm on one chromosome's reads,
    # echo the raw results, and save the formatted alignment next to the
    # input files.
    data_folder = 'hw1_W_2'
    f_base = '{}_chr_1'.format(data_folder)
    input_folder = join('../data/', data_folder)

    # The reads, reference and output files all share the same base name.
    reads_name = 'reads_{}.txt'.format(f_base)
    reference_name = 'ref_{}.txt'.format(f_base)
    output_name = 'aligned_{}.txt'.format(f_base)
    reads_fn = join(input_folder, reads_name)
    reference_fn = join(input_folder, reference_name)
    output_fn = join(input_folder, output_name)

    start = time.clock()
    input_reads = read_reads(reads_fn)
    # Aligning every read takes a while.  To get some data quickly, keep only
    # a slice of the reads before aligning, for example:
    #
    #   input_reads = input_reads[:300]

    reference = read_reference(reference_fn)
    alignments, reads = trivial_algorithm(input_reads, reference)
    # A single parenthesised argument prints exactly like the Python 2
    # statement form.
    print(alignments)
    print(reads)

    output_str = pretty_print_aligned_reads_with_ref(
        reads, alignments, reference)
    with open(output_fn, 'w') as output_file:
        output_file.write(output_str)
Ejemplo n.º 3
0
                        n_mismatches = sum(mismatches)
                        if n_mismatches < max_mismatches:
                            min_mismatch_location = i - part
                            read = rev_read

            read_alignment_locations.append(min_mismatch_location)
            output_read_pair.append(read)
        all_read_alignment_locations.append(read_alignment_locations)
        output_read_pairs.append(output_read_pair)
    return all_read_alignment_locations, output_read_pairs


if __name__ == "__main__":
    # Script entry point: run faster_algorithm on the practice chromosome's
    # reads, echo the raw results, and save the formatted alignment next to
    # the input files.
    data_folder = 'practice_W_1'
    f_base = '{}_chr_1'.format(data_folder)
    input_folder = join('./', data_folder)

    # The reads, reference and output files all share the same base name.
    reads_name = 'reads_{}.txt'.format(f_base)
    reference_name = 'ref_{}.txt'.format(f_base)
    output_name = 'aligned_{}.txt'.format(f_base)
    reads_fn = join(input_folder, reads_name)
    reference_fn = join(input_folder, reference_name)
    output_fn = join(input_folder, output_name)

    start = time.clock()
    input_reads = read_reads(reads_fn)
    reference = read_reference(reference_fn)

    alignments, reads = faster_algorithm(input_reads, reference)
    # A single parenthesised argument prints exactly like the Python 2
    # statement form.
    print(alignments)
    print(reads)

    output_str = pretty_print_aligned_reads_with_ref(
        reads, alignments, reference)
    with open(output_fn, 'w') as output_file:
        output_file.write(output_str)
Ejemplo n.º 4
0
        genome_aligned_reads.append(genome_aligned_read)
        count += 1
        if count % 100 == 0:
            time_passed = (time.clock()-start)/60
            print '{} reads aligned'.format(count), 'in {:.3} minutes'.format(time_passed)
            remaining_time = time_passed/count*(len(paired_end_reads)-count)
            print 'Approximately {:.3} minutes remaining'.format(remaining_time)
    return alignments, genome_aligned_reads

if __name__ == "__main__":
    # Script entry point: align the reads of one practice chromosome against
    # its reference with the hash-based aligner, then print the beginning of
    # the formatted alignment.
    genome_name = 'practice_W_1'
    input_folder = './{}'.format(genome_name)
    chr_name = '{}_chr_1'.format(genome_name)
    reads_fn_end = 'reads_{}.txt'.format(chr_name)
    reads_fn = join(input_folder, reads_fn_end)
    ref_fn_end = 'ref_{}.txt'.format(chr_name)
    ref_fn = join(input_folder, ref_fn_end)
    # Key length handed to build_hash_and_pickle for the genome hash table.
    key_length = 7
    # NOTE(review): `start` is not read anywhere in this block; kept in case
    # code elsewhere in the file relies on it.
    start = time.clock()
    reads = read_reads(reads_fn)
    # To speed things up, work on a subset of the reads, e.g.
    #   reads = read_reads(reads_fn)[:<x>]
    # where <x> is the number of reads you want to work with.
    genome_hash_table = build_hash_and_pickle(ref_fn, key_length)
    ref = read_reference(ref_fn)
    # The function that ends just above this block returns
    # `alignments, genome_aligned_reads`, so unpack in that order.  The
    # previous code unpacked the pair the other way round, which passed the
    # two lists to the pretty-printer in each other's positions.
    # NOTE(review): assumes that function is hashing_algorithm -- confirm.
    alignments, genome_aligned_reads = hashing_algorithm(reads, genome_hash_table)
    # For debugging, print genome_aligned_reads / alignments here.
    output_str = pretty_print_aligned_reads_with_ref(genome_aligned_reads, alignments, ref)
    # Only the first 5000 characters are printed.  A single parenthesised
    # argument behaves identically under the Python 2 print statement.
    print(output_str[:5000])