Esempio n. 1
0
def align_and_pickle_mappings():
    """Align every paired read of the configured dataset and pickle the hits.

    Loads the pickled reference hash, the reference and the reads for
    ``c.DATASET``, calls ``align_paired_read`` on each read, drops falsy
    results and dumps the remaining alignments to
    ``alignments_<DATASET>_<KEY_SIZE>.pkl`` (a path relative to the current
    working directory). Progress and timings are printed to stdout.

    Returns:
        None. The result is only written to disk.
    """
    print('loading ref_hash pickle...')
    pickle_filename = 'ref_hash_{}_{}.pkl'.format(c.DATASET, c.KEY_SIZE)
    ref_hash = load_hash_pickle(pickle_filename)

    # Every input file lives under <DATA_PATH>/<DATASET>/.
    dataset_dir = '{}/{}'.format(c.DATA_PATH, c.DATASET)

    print('loading ref...')
    ref_file = c.REF_FILE
    ref = utils.read_reference('{}/{}'.format(dataset_dir, ref_file))

    print('loading reads...')
    reads_file = c.READS_FILE
    reads = utils.read_reads('{}/{}'.format(dataset_dir, reads_file))
    print('loaded reads')

    print('aligning {}...'.format(c.DATASET))
    start = time.time()
    # Align lazily and keep only truthy results in a single pass, instead of
    # materialising an intermediate list that still contains the misses.
    candidates = (align_paired_read(pr, ref, ref_hash) for pr in reads)
    alignments = [hit for hit in candidates if hit]
    print('alignment complete, elapsed: {}'.format(time.time() - start))

    directory = '{}/{}'.format(dataset_dir, 'alignments/')
    if not os.path.exists(directory):
        print('creating folder {}'.format(directory))
        os.makedirs(directory)

    start = time.time()
    # NOTE(review): the pickle is written to the working directory, not into
    # `directory` created above -- confirm which location is intended.
    alignments_pickle_path = 'alignments_{}_{}.pkl'.format(c.DATASET, c.KEY_SIZE)
    # Context manager guarantees the file is flushed and closed even if
    # pickling raises part-way through.
    with open(alignments_pickle_path, 'wb') as pickle_file:
        pickle.dump(alignments, pickle_file)
    print('{} alignment pickled in {}'.format(reads_file, time.time() - start))
Esempio n. 2
0
            if first_line:
                first_line = False
                continue
            write_buffer += line.strip() + '\n'
            line_count += 1
            if line_count >= max_num_lines:
                with open('data/{}/reads_split/part_{}.txt'.format(c.DATASET, file_count),'w') as w:
                    w.write(write_buffer)
                    print 'wrote part {}'.format(file_count)
                file_count += 1
                write_buffer = ''
                line_count = 0
        if len(write_buffer) > 0:
            with open('data/{}/reads_split/part_{}.txt'.format(c.DATASET, file_count),'w') as w:
                w.write(write_buffer)
                print 'wrote part {}'.format(file_count)


if __name__ == '__main__':
    # Script entry point: make sure the reads_split output folder exists,
    # load the reference, then split the reads file into parts.
    print('splitting reads...')

    directory = '{0}/{1}/reads_split/'.format(c.DATA_PATH, c.DATASET)
    if not os.path.exists(directory):
        print('creating folder {}'.format(directory))
        os.makedirs(directory)

    # Input paths are built relative to the hard-coded 'data' folder.
    ref_path = 'data/{0}/{1}'.format(c.DATASET, c.REF_FILE)
    reads_path = 'data/{0}/{1}'.format(c.DATASET, c.READS_FILE)

    ref = utils.read_reference(ref_path)
    split_reads(reads_path)

    print('reads split')
#
#     l = SortedListWithKey(key=lambda val:val[0])
#     for fn in fns:
#         alignments = pickle.load(open('{}{}'.format(directory,fn), 'rb'))
#         alignments = [item for sublist in alignments for item in sublist]
#         l.update(alignments)
#         print 'done {}'.format(fn)
#     return l



if __name__ == '__main__':
    # Script entry point: compute the non-perfect stretches of the reference,
    # print them and pickle them to stretches_<DATASET>.pkl.
    ref = utils.read_reference()
    stretches = get_nonperfect_stretches(ref)
    print(stretches)

    # open() inside a with-block replaces the bare file(...) call so the
    # handle is flushed and closed deterministically, even if pickling fails.
    with open('stretches_{}.pkl'.format(c.DATASET), 'wb') as stretches_file:
        pickle.dump(stretches, stretches_file)

    print('DONE')

    # start = time.time()
Esempio n. 4
0
    alignments_pickle_path = 'alignments_{}_{}.pkl'.format(c.DATASET, c.KEY_SIZE)
    pickle.dump(alignments, open(alignments_pickle_path, 'wb'))
    print '{} alignment pickled in {}'.format(reads_file, time.time() - start)


if __name__ == '__main__':
    # Script entry point: align a range of split read files.
    #
    # Usage: <script> [BEGIN END]
    #   With exactly two integer arguments, files with index in [BEGIN, END)
    #   are processed. With any other argument count, every entry found in
    #   data/<DATASET>/reads_split/ is processed.
    args = sys.argv[1:]
    if len(args) == 2:
        FILE_INDEX_BEGIN = int(args[0])
        FILE_INDEX_END = int(args[1])
    else:
        FILE_INDEX_BEGIN = 0
        # Assumes the folder contains only the split parts -- every directory
        # entry is counted as one file to process.
        FILE_INDEX_END = len(os.listdir('data/{}/reads_split/'.format(c.DATASET)))
    print('Processing files {} through {}'.format(FILE_INDEX_BEGIN, FILE_INDEX_END))

    print('loading ref_hash pickle...')
    pickle_filename = 'ref_hash_{}_{}.pkl'.format(c.DATASET, c.KEY_SIZE)
    ref_hash = load_hash_pickle(pickle_filename)

    print('loading ref...')
    ref_file = c.REF_FILE
    ref = utils.read_reference('{}/{}/{}'.format(c.DATA_PATH, c.DATASET, ref_file))

    # Loop-invariant: number of files in the requested range.
    total_files = FILE_INDEX_END - FILE_INDEX_BEGIN

    for file_index in xrange(FILE_INDEX_BEGIN, FILE_INDEX_END):
        align_and_pickle_mappings_split(file_index, ref, ref_hash)
        # Progress is reported after the file has been processed, so the file
        # just finished is included in the count (+1). Without it the status
        # starts at 0% after the first file and never reaches 100%.
        files_done = file_index - FILE_INDEX_BEGIN + 1
        progress = files_done * 100.0 / total_files
        print('STATUS: {0:.2f}% complete \n'.format(progress))