def align_paired_read(paired_read, ref, ref_hash):
    """Align both mates of a paired-end read against the reference.

    Each orientation treats one mate as the "primary" read and the other
    mate, reversed, as the "secondary" read.  A secondary hit is accepted
    only when it lands inside the expected paired-gap window on either
    side of a primary hit AND is the only candidate in that window (more
    than one candidate suggests a repetitive region).

    Returns a list of (reference_position, integer-encoded read) tuples,
    two entries (primary + secondary) per accepted pair.
    """
    pieces_for_primary = 2    # minimum matching pieces for the primary mate
    pieces_for_secondary = 2  # minimum matching pieces for the secondary mate
    first, second = paired_read[0], paired_read[1]
    gap_lo = c.PAIRED_GAP_ESTIMATE_INTERVAL[0]
    gap_hi = c.PAIRED_GAP_ESTIMATE_INTERVAL[1]
    accepted = []
    for primary, secondary in ((first, second[::-1]), (second, first[::-1])):
        primary_hits = align_single_read(primary, ref_hash, c.KEY_SIZE,
                                         pieces_for_primary)
        if not primary_hits:
            continue
        secondary_hits = align_single_read(secondary, ref_hash, c.KEY_SIZE,
                                           pieces_for_secondary)
        for p in primary_hits:
            # keep only secondary hits within the gap window, on either side of p
            in_window = [s for s in secondary_hits
                         if (p + gap_lo < s < p + gap_hi)
                         or (p - gap_hi < s < p - gap_lo)]
            # exactly one candidate required; >1 is probably a repetitive region
            if len(in_window) == 1:
                accepted.append((p, utils.key_to_integer(primary)))
                accepted.append((in_window[0], utils.key_to_integer(secondary)))
    return accepted
def read_file_hash_and_pickle(file_path, key_size, pickle_filename): ref = '' with open(file_path, 'r') as f: first_line = True line_count = 0 for line in f: if first_line: first_line = False continue ref += line.strip() if line_count%100000==0: print 'lines read: {}'.format(line_count) line_count += 1 print 'initializing dict' ref_hash = defaultdict(list) for i in xrange(len(ref)-key_size+1): key = ref[i:i+key_size] intkey = utils.key_to_integer(key) ref_hash[intkey].append(i) if i%100000==0: print 'hashing position {}'.format(i) print 'pickling hashed dict' pickle.dump(ref_hash, open(pickle_filename, 'wb'))
def align_single_read(read, ref_hash, key_size, min_num_pieces = c.MINIMUM_NUMBER_OF_PIECES_TO_MATCH):
    """Find candidate alignment start positions for a single read.

    The read is cut into consecutive non-overlapping pieces of key_size
    characters (trailing bases beyond the last full piece are ignored).
    Each piece is looked up in ref_hash; every hit is translated back to
    the start position it implies for the whole read.  Start positions
    supported by at least min_num_pieces pieces are returned.

    Returns an unordered list of candidate start positions.
    """
    num_pieces = len(read) // key_size  # explicit floor division
    start_position_counter = Counter()
    for i in xrange(num_pieces):
        piece = read[i * key_size:(i + 1) * key_size]
        # Use .get instead of subscripting: on a defaultdict, ref_hash[key]
        # silently inserts an empty list for every piece with no hit,
        # bloating the shared hash (and subscripting would raise KeyError
        # on a plain dict after unpickling). .get is side-effect-free.
        for hit in ref_hash.get(utils.key_to_integer(piece), []):
            start_position_counter[hit - i * key_size] += 1
    return [pos for pos in start_position_counter
            if start_position_counter[pos] >= min_num_pieces]
def ident_bad_regions(some_pr_tuples, ref): nonperfect_pos_read_tuples = [] good_areas = bitarray(len(ref)) good_areas.setall(False) start = time.time() count = 0.0 for t in some_pr_tuples: read_num = t[1] pos = t[0] ref_piece_num = utils.key_to_integer(ref[pos:pos+c.READ_SIZE]) if read_num == ref_piece_num: good_areas[pos:pos+c.READ_SIZE] = True else: pass #good_areas[pos:pos+c.READ_SIZE] = True #else: # nonperfect_pos_read_tuples.append(t) count += 1 if count % 10000 == 0: print '{}'.format(count/len(some_pr_tuples)) print '{}'.format(time.time() - start) print 'piece done in {}'.format((time.time() - start)) return (good_areas, nonperfect_pos_read_tuples)