def pileup2(read_pos_tuple_list):
    """Pile up positioned reads into per-position base counts and return the consensus.

    Args:
        read_pos_tuple_list: sized iterable of indexable pairs. Element ``[0]``
            is the row of the count matrix where the read starts; element
            ``[1]`` is handed to ``utils.integer_to_key`` together with
            ``c.READ_SIZE`` to obtain the read's characters.

    Returns:
        ``''.join(consensus(counts))`` where ``counts`` is a
        ``(c.OUTPUT_SIZE, 4)`` array whose columns hold the number of
        A, C, G and T observations at each row.

    Characters other than A/C/G/T are not counted. Progress is printed after
    every 100000 reads.

    NOTE(review): positions are not bounds-checked. A row index at or beyond
    ``c.OUTPUT_SIZE`` raises ``IndexError`` and a negative one wraps around
    to the end of the array -- confirm callers never pass such positions.
    """
    # Column of the count matrix that belongs to each nucleotide.
    base_column = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    result = np.zeros((c.OUTPUT_SIZE, 4))

    for done, read_pos_tuple in enumerate(read_pos_tuple_list, 1):
        position = read_pos_tuple[0]
        read = utils.integer_to_key(read_pos_tuple[1], c.READ_SIZE)
        for offset, base in enumerate(read):
            column = base_column.get(base)
            if column is not None:
                result[position + offset][column] += 1
        if done % 100000 == 0:
            # float() keeps this a true division under Python 2 as well.
            print('done: {:.2f}'.format(
                float(done) / len(read_pos_tuple_list)))

    donor = consensus(result)
    return ''.join(donor)
def _pad_read_to_donor(read, offset, donor_length):
    """Return *read* as a char list placed at *offset* and '.'-padded to *donor_length*."""
    trailing = donor_length - offset - len(read)
    return ['.'] * offset + list(read) + ['.'] * trailing


def get_donor_for_stretch(stretch, ref, pos_to_read):
    """Assemble a local donor sequence around ``stretch`` from nearby reads.

    The donor starts as a row of '.' placeholders whose first
    ``c.READ_SIZE`` entries are seeded from the reference. Over six passes,
    every still-unused read is slid across the donor and scored with
    ``hamming_ignore_dots_list_of_char``; reads whose lowest score falls
    below the current threshold are piled onto the donor with
    ``pileup_ignore_dots`` and dropped from later passes. The threshold is
    raised by 3 on each pass (-37, -34, ..., -22).

    Args:
        stretch: indexable pair; ``stretch[0]`` and ``stretch[1]`` are the
            start and end positions of the region.
        ref: reference sequence; it is sliced and ``len()`` is taken of it.
        pos_to_read: mapping from position to an encoded read that
            ``utils.integer_to_key`` decodes. Positions that raise
            ``KeyError`` are skipped.

    Returns:
        ``(start, donor)`` with ``start = stretch[0] - c.READ_SIZE`` and
        ``donor`` stripped of leading/trailing '.', or ``None`` when the
        stretch is skipped: longer than 20, window outside ``ref``, fewer
        reads than the stretch length, more than 250 reads, or a chosen
        read that can no longer be removed from the candidate list.

    NOTE(review): the final ``donor.strip('.')`` assumes
    ``pileup_ignore_dots`` returns a string -- confirm against that helper.
    """
    STRETCH_LIMIT = 20
    MAX_READ_TUPLES = 250

    # Checked before touching any project configuration so that oversized
    # stretches are rejected as cheaply as possible.
    stretch_length = stretch[1] - stretch[0]
    if stretch_length > STRETCH_LIMIT:
        print('{} is over stretch limit, skipping.'.format(stretch))
        return None

    MARGIN_LEFT = c.READ_SIZE
    MARGIN_RIGHT = stretch_length + 8
    start = stretch[0] - MARGIN_LEFT
    end = stretch[1] + MARGIN_RIGHT
    if start < 0 or end > len(ref):
        return None

    donor = ['.'] * (MARGIN_RIGHT + MARGIN_LEFT + stretch_length + 1)

    # Reads starting in the last READ_SIZE positions of the window are left
    # out: "we don't want the extras on the right".
    read_tuples = []
    for i in range(start, end - c.READ_SIZE):
        try:
            read_tuples.append(
                (i, utils.integer_to_key(pos_to_read[i], c.READ_SIZE)))
        except KeyError:
            pass

    if len(read_tuples) < stretch_length:
        print('skipping {} low read tuple count'.format(stretch))
        return None
    elif len(read_tuples) > MAX_READ_TUPLES:
        print('skipping {} HIGH read tuple count'.format(stretch))
        return None

    # Seed: the left margin of the donor is taken verbatim from the reference.
    donor[0:0 + c.READ_SIZE] = list(ref[start:start + c.READ_SIZE])

    threshold = -40
    to_be_removed = []
    for _ in range(6):
        threshold += 3

        # Drop the reads consumed by the previous pass.
        for item in to_be_removed:
            try:
                read_tuples.remove(item)
            except ValueError:
                print('Value not in list problem. repetitive region')
                return None
        to_be_removed = []
        chosen_ones = []

        donor_length = len(donor)
        for read_tuple in read_tuples:
            if read_tuple is None:
                continue
            read = read_tuple[1]
            # Score the read at every offset except the right-most one,
            # matching the half-open range used by the original search.
            hams = [
                hamming_ignore_dots_list_of_char(
                    donor, _pad_read_to_donor(read, offset, donor_length))
                for offset in range(0, donor_length - len(read))
            ]
            best = min(hams)
            if best < threshold:
                # list.index returns the first (left-most) best offset.
                chosen_ones.append(
                    _pad_read_to_donor(read, hams.index(best), donor_length))
                to_be_removed.append(read_tuple)

        # The piled-up result becomes the seed for the next pass.
        donor = pileup_ignore_dots(chosen_ones, donor)

    return (start, donor.strip('.'))