def align_and_pickle_mappings():
    """Align every paired read of the configured dataset and pickle the result.

    Steps, in order:

    1. Load the reference hash via ``load_hash_pickle`` from the file
       ``ref_hash_<c.DATASET>_<c.KEY_SIZE>.pkl``.
    2. Read ``c.REF_FILE`` and ``c.READS_FILE`` from
       ``<c.DATA_PATH>/<c.DATASET>/``.
    3. Call ``align_paired_read`` on each paired read and keep only the
       truthy results.
    4. Pickle the kept alignments to
       ``<c.DATA_PATH>/<c.DATASET>/alignments/alignments_<c.DATASET>_<c.KEY_SIZE>.pkl``,
       creating the ``alignments`` folder when it does not exist.

    Progress messages and elapsed times are printed to stdout.
    Returns ``None``.

    NOTE(review): the pickle used to be written to the current working
    directory even though the ``alignments`` folder was created; it now goes
    into that folder, like ``align_and_pickle_mappings_split`` does. Any
    loader that still expects the file in the working directory must be
    pointed at the new location.
    """
    # print(...) with a single argument produces the same output under the
    # Python 2 print statement and the Python 3 print function.
    print('loading ref_hash pickle...')
    pickle_filename = 'ref_hash_{}_{}.pkl'.format(c.DATASET, c.KEY_SIZE)
    ref_hash = load_hash_pickle(pickle_filename)

    print('loading ref...')
    ref = utils.read_reference(
        '{}/{}/{}'.format(c.DATA_PATH, c.DATASET, c.REF_FILE))

    print('loading reads...')
    reads_file = c.READS_FILE
    reads = utils.read_reads(
        '{}/{}/{}'.format(c.DATA_PATH, c.DATASET, reads_file))
    print('loaded reads')

    print('aligning {}...'.format(c.DATASET))
    start = time.time()
    # Filter lazily so the unfiltered results are never held as a second list.
    candidates = (align_paired_read(pr, ref, ref_hash) for pr in reads)
    alignments = [alignment for alignment in candidates if alignment]
    print('alignment complete, elapsed: {}'.format(time.time() - start))

    directory = '{}/{}/{}'.format(c.DATA_PATH, c.DATASET, 'alignments/')
    if not os.path.exists(directory):
        print('creating folder {}'.format(directory))
        os.makedirs(directory)

    start = time.time()
    alignments_pickle_path = os.path.join(
        directory,
        'alignments_{}_{}.pkl'.format(c.DATASET, c.KEY_SIZE))
    # The with-block guarantees the file is flushed and closed even if
    # pickling raises part-way through.
    with open(alignments_pickle_path, 'wb') as pickle_file:
        pickle.dump(alignments, pickle_file)
    print('{} alignment pickled in {}'.format(reads_file, time.time() - start))
def align_and_pickle_mappings_split(split_no, ref, ref_hash):
    """Align one split of the reads file and pickle the resulting alignments.

    Reads ``<c.DATA_PATH>/<c.DATASET>/reads_split/part_<split_no>.txt``,
    calls ``align_paired_read`` on each paired read with the supplied
    reference and reference hash, keeps only the truthy results, and pickles
    them to
    ``<c.DATA_PATH>/<c.DATASET>/alignments/alignments_<c.DATASET>_<c.KEY_SIZE>_part_<split_no>.pkl``,
    creating the ``alignments`` folder when it does not exist.

    Progress messages and elapsed times are printed to stdout.

    Args:
        split_no: Identifier of the split; substituted into the input file
            name, the output file name and the log messages.
        ref: Reference passed through unchanged to ``align_paired_read``.
        ref_hash: Reference hash passed through unchanged to
            ``align_paired_read``.

    Returns:
        None.
    """
    # print(...) with a single argument produces the same output under the
    # Python 2 print statement and the Python 3 print function.
    print('loading reads file {}...'.format(split_no))
    reads = utils.read_reads(
        '{}/{}/reads_split/part_{}.txt'.format(c.DATA_PATH, c.DATASET, split_no))
    print('loaded reads file {}'.format(split_no))

    print('aligning {} part {}'.format(c.DATASET, split_no))
    start = time.time()
    # Filter lazily so the unfiltered results are never held as a second list.
    candidates = (align_paired_read(pr, ref, ref_hash) for pr in reads)
    alignments = [alignment for alignment in candidates if alignment]
    print('alignment complete, elapsed: {}'.format(time.time() - start))

    directory = '{}/{}/{}'.format(c.DATA_PATH, c.DATASET, 'alignments/')
    if not os.path.exists(directory):
        print('creating folder {}'.format(directory))
        os.makedirs(directory)

    start = time.time()
    # os.path.join avoids the doubled slash the old '{}/alignments_...' format
    # produced (directory already ends in '/'); the target file is the same.
    alignments_pickle_path = os.path.join(
        directory,
        'alignments_{}_{}_part_{}.pkl'.format(c.DATASET, c.KEY_SIZE, split_no))
    # The with-block guarantees the file is flushed and closed even if
    # pickling raises part-way through.
    with open(alignments_pickle_path, 'wb') as pickle_file:
        pickle.dump(alignments, pickle_file)
    print('{} part {} alignment pickled in {}'.format(
        c.DATASET, split_no, time.time() - start))