def estimate_window_size(readData, parameter):
    """Store an even window size in ``parameter.window_size`` and return it.

    If the user supplied a window size (anything other than -1) it is kept,
    rounded up by one when odd.  Otherwise one window size is estimated per
    chromosome from the pooled ChIP read positions (``get_window_size2``),
    and the median over all chromosomes, rounded up to an even number, is
    used.

    :param readData: object providing ``chr_list``, ``chip_filename_list``,
        ``data_dict[chrom][filename]`` (read positions) and
        ``chr_length_dict[chrom]``.
    :param parameter: object whose ``window_size`` attribute is read and
        updated in place.
    :return: the value stored in ``parameter.window_size``.
    """
    if parameter.window_size != -1:
        # Window size must be even because overlapping half windows are used.
        if parameter.window_size % 2 == 1:
            info("warning: window size should be even number; adding 1 to it.")
            parameter.window_size = parameter.window_size + 1
        return parameter.window_size

    info("begin window_size calculation...")
    window_size_list = []
    for chrom in readData.chr_list:
        chrom_length = readData.chr_length_dict[chrom]
        # Pool the read positions of every ChIP file for this chromosome.
        # NOTE(review): assumes read positions are integer coordinates.
        per_file = [
            numpy.asarray(readData.data_dict[chrom][chip_filename],
                          dtype=numpy.intp)
            for chip_filename in readData.chip_filename_list
        ]
        if per_file:
            positions = numpy.concatenate(per_file)
        else:
            positions = numpy.empty(0, dtype=numpy.intp)

        # Ignore coordinates outside the chromosome.  Negative values are
        # dropped as well; plain indexing would silently wrap them around
        # and count them at the end of the chromosome.
        in_range = positions[(positions >= 0) & (positions < chrom_length)]
        coord_array = numpy.bincount(in_range, minlength=chrom_length)
        coord_array = coord_array.astype(numpy.int64, copy=False)

        # No upper bound is applied to the per-chromosome estimate.
        window = get_window_size2(coord_array)
        window_size_list.append(window)
        window_logger.debug("%-10s %s", chrom, window)

    debug("length of windowsize list is " + str(len(window_size_list)))
    window_median = misc.median(window_size_list)
    # Window size cannot be an odd number since we need to use overlapping
    # half windows.
    if window_median % 2 == 1:
        window_median = window_median + 1
    window_logger.debug("window_size: %+10s", window_median)
    info("finishing window size calculation...")
    parameter.window_size = window_median
    # Return the size on this path too, matching the user-supplied path.
    return parameter.window_size
def estimate_window_size(readData, parameter):
    """Store an even window size in ``parameter.window_size`` and return it.

    If the user supplied a window size (anything other than -1) it is kept,
    rounded up by one when odd.  Otherwise one window size is estimated per
    chromosome from the pooled ChIP read positions (``get_window_size2``),
    and the median over all chromosomes, rounded up to an even number, is
    used.

    :param readData: object providing ``chr_list``, ``chip_filename_list``,
        ``data_dict[chrom][filename]`` (read positions) and
        ``chr_length_dict[chrom]``.
    :param parameter: object whose ``window_size`` attribute is read and
        updated in place.
    :return: the value stored in ``parameter.window_size``.
    """
    if parameter.window_size != -1:
        # Window size must be even because overlapping half windows are used.
        if parameter.window_size % 2 == 1:
            info("warning: window size should be even number; adding 1 to it.")
            parameter.window_size = parameter.window_size + 1
        return parameter.window_size

    info("begin window_size calculation...")
    window_size_list = []
    for chrom in readData.chr_list:
        chrom_length = readData.chr_length_dict[chrom]
        # Pool the read positions of every ChIP file for this chromosome.
        # NOTE(review): assumes read positions are integer coordinates.
        per_file = [
            numpy.asarray(readData.data_dict[chrom][chip_filename],
                          dtype=numpy.intp)
            for chip_filename in readData.chip_filename_list
        ]
        if per_file:
            positions = numpy.concatenate(per_file)
        else:
            positions = numpy.empty(0, dtype=numpy.intp)

        # Ignore coordinates outside the chromosome.  Negative values are
        # dropped as well; plain indexing would silently wrap them around
        # and count them at the end of the chromosome.
        in_range = positions[(positions >= 0) & (positions < chrom_length)]
        coord_array = numpy.bincount(in_range, minlength=chrom_length)
        coord_array = coord_array.astype(numpy.int64, copy=False)

        # No upper bound is applied to the per-chromosome estimate.
        window = get_window_size2(coord_array)
        window_size_list.append(window)
        window_logger.debug("%-10s %s", chrom, window)

    debug("length of windowsize list is " + str(len(window_size_list)))
    window_median = misc.median(window_size_list)
    # Window size cannot be an odd number since we need to use overlapping
    # half windows.
    if window_median % 2 == 1:
        window_median = window_median + 1
    window_logger.debug("window_size: %+10s", window_median)
    info("finishing window size calculation...")
    parameter.window_size = window_median
    # Return the size on this path too, matching the user-supplied path.
    return parameter.window_size
def _strand_coverage(strands_dict, chrom, sample_names, start, end,
                     readLength):
    """Count reads per position of [start, end), per strand, over samples.

    Forward reads are placed at ``read - start``; reverse reads are shifted
    back by ``readLength`` and placed at ``read - start - readLength``.
    Returns ``(forward_counts, reverse_counts)`` as float arrays of length
    ``end - start``.
    """
    width = end - start
    forward_counts = numpy.zeros(width)
    reverse_counts = numpy.zeros(width)
    for name in sample_names:
        forward = strands_dict[chrom][name]['f']
        reverse = strands_dict[chrom][name]['r']
        forward_read = forward[(forward >= start) & (forward < end)]
        reverse_read = reverse[(reverse >= start + readLength)
                               & (reverse < end + readLength)]
        for read in forward_read:
            try:
                forward_counts[read - start] += 1
            except IndexError:
                debug("index error ignored. end-start: %d. read-start: %d.",
                      end - start, read - start)
        for read in reverse_read:
            try:
                reverse_counts[read - start - readLength] += 1
            except IndexError:
                debug("index error ignored. end-start: %d. "
                      "read-start-readLength: %d.",
                      end - start, read - start - readLength)
    return forward_counts, reverse_counts


def post_processing_per_peak(strands_dict, chip_list, input_list, chr,
                             start, end, shiftSize, readLength, narrow_peak,
                             remove_artefacts):
    """Compute artefact statistics for one peak and optionally narrow it.

    :param strands_dict: ``strands_dict[chr][sample]['f' | 'r']`` holds the
        forward / reverse read positions as numpy arrays.
    :param chip_list: names of the ChIP samples.
    :param input_list: names of the input (control) samples.
    :param chr: chromosome key into ``strands_dict``.
    :param start: peak start (inclusive).
    :param end: peak end (exclusive).
    :param shiftSize: dict whose values are shift sizes; only used when
        narrowing a peak that has no reads on one of the strands.
    :param readLength: read length used to shift reverse-strand reads.
    :param narrow_peak: when ``True``, refine ``start``/``end``.
    :param remove_artefacts: when ``True``, compute the overlap statistics;
        otherwise they are set to fixed defaults (0, 0, 1e-5, 0).
    :return: ``(start, end, overlap_chip_input, chip_3_maximum,
        overlap_orig, overlap_roll)``.
    """
    chip_forward, chip_reverse = _strand_coverage(
        strands_dict, chr, chip_list, start, end, readLength)

    if remove_artefacts is True:
        # Input reads are only needed for the artefact statistics.
        input_forward, input_reverse = _strand_coverage(
            strands_dict, chr, input_list, start, end, readLength)

        chip_both = chip_forward + chip_reverse
        input_both = input_forward + input_reverse
        chip_total = chip_both.sum()
        if chip_total != 0:
            chip_both = chip_both / chip_total
        input_total = input_both.sum()
        if input_total != 0:
            input_both = input_both / input_total
        overlap_chip_input = numpy.sum(numpy.minimum(chip_both, input_both))

        # Fraction of the ChIP signal held by the three highest positions.
        chip_both.sort()
        chip_3_maximum = numpy.sum(chip_both[-3:])

        reverse_total = chip_reverse.sum()
        if reverse_total != 0:
            chip_reverse = chip_reverse / reverse_total
        chip_forward_roll = numpy.roll(chip_forward, readLength)
        forward_total = chip_forward.sum()
        if forward_total != 0:
            # Normalise both profiles by the raw total.  Dividing the rolled
            # profile by the sum of the already-normalised forward profile
            # (which is 1) would leave it as raw counts.
            chip_forward = chip_forward / forward_total
            chip_forward_roll = chip_forward_roll / forward_total
        overlap_orig = numpy.sum(numpy.minimum(chip_forward, chip_reverse))
        # Floor at 1e-5 so the value is never zero.
        overlap_orig = numpy.max([overlap_orig, 1e-5])
        overlap_roll = numpy.sum(
            numpy.minimum(chip_forward_roll, chip_reverse))
    else:
        overlap_chip_input = 0
        chip_3_maximum = 0
        overlap_orig = 1e-5
        overlap_roll = 0

    if narrow_peak is True:
        width = end - start
        forward_total = chip_forward.sum()
        reverse_total = chip_reverse.sum()
        if forward_total > 0:
            # New start: first position where the cumulative forward signal
            # (left to right) exceeds 20%.
            chip_forward = chip_forward / forward_total
            first = int(numpy.argmax(numpy.cumsum(chip_forward) > 0.2))
            new_start = start + first
        if reverse_total > 0:
            # New end: first position, scanning right to left, where the
            # cumulative reverse signal exceeds 20%.
            chip_reverse = chip_reverse / reverse_total
            from_right = int(
                numpy.argmax(numpy.cumsum(chip_reverse[::-1]) > 0.2))
            new_end = start + (width - 1 - from_right)
        # There are no reads in the peak.
        if forward_total + reverse_total == 0:
            # Floor division keeps the coordinate an integer on Python 3.
            new_start = new_end = (start + end) // 2
        # NOTE(review): the list is repeated twice before taking the median
        # (this is what ``2*shiftSize.values()`` did on Python 2); twice the
        # median may have been intended -- confirm.
        if forward_total == 0:
            new_start = new_end - misc.median(2 * list(shiftSize.values()))
        if reverse_total == 0:
            new_end = new_start + misc.median(2 * list(shiftSize.values()))
        start = new_start
        end = new_end

    return (start, end, overlap_chip_input, chip_3_maximum, overlap_orig,
            overlap_roll)
def _extend_peak(bin_counts, peak_idx, step, last_idx, threshold):
    """Grow a peak from ``peak_idx`` towards ``last_idx`` and mark used bins.

    ``step`` is -1 (towards index 0) or +1 (towards the last bin).  Bins at
    or above ``threshold`` are absorbed; a single bin below the threshold is
    bridged when the bin right after it is at or above the threshold.
    Absorbed bins are set to -1 in place.  Returns the offset reached, which
    starts at 1 even when nothing is absorbed.
    """
    offset = 1
    if peak_idx == last_idx:
        # The peak already sits on this boundary.
        return offset
    while True:
        # Absorb the run of neighbouring bins at or above the threshold.
        while bin_counts[peak_idx + step * offset] >= threshold:
            bin_counts[peak_idx + step * offset] = -1
            if peak_idx + step * offset == last_idx:
                return offset
            offset += 1
        if peak_idx + step * offset == last_idx:
            # The boundary bin is below the threshold: mark it and stop.
            bin_counts[last_idx] = -1
            return offset
        # Continue across a one-bin gap if the bin after it qualifies.
        if bin_counts[peak_idx + step * (offset + 1)] >= threshold:
            bin_counts[peak_idx + step * offset] = -1
            offset += 1
        else:
            return offset


def get_window_size2(array, bin=20, iter=100):
    """Estimate the window size from per-position read counts.

    The counts are summed into consecutive bins of ``bin`` positions (a
    trailing partial bin is dropped).  Then, ``iter`` times, the highest
    remaining bin is taken as a peak and extended to the left and to the
    right over bins holding at least 10% of the peak value, bridging
    single-bin gaps.  Every bin used is set to -1 so later iterations skip
    it.  Each peak contributes ``(1 + left_offset + right_offset) * bin``
    and the median (``misc.median``) of those lengths is returned.

    :param array: 1-D array of read counts per position.
    :param bin: number of positions per bin.
    :param iter: number of peaks to measure.
    :return: ``misc.median`` of the measured peak lengths.
    :raises ValueError: if ``array`` is shorter than one bin.
    """
    chr_len = array.size
    # Floor division keeps the bin count an integer on Python 2 and 3.
    rowNum = chr_len // bin
    if rowNum == 0:
        raise ValueError(
            "array of size %d is shorter than one bin of %d" % (chr_len, bin))

    # reshape + sum is the same binning as a strided (rowNum, bin) view but
    # does not depend on the input being C-contiguous.  The sum is a fresh
    # array, so marking bins below never modifies ``array``.
    array_window = numpy.sum(array[:rowNum * bin].reshape(rowNum, bin), 1)

    peak_len_list = []
    for _ in range(iter):
        # argmax returns the first occurrence of the maximum.
        peak_idx = int(numpy.argmax(array_window))
        peak = array_window[peak_idx]
        # Set the peak to -1 so that it won't confuse the next iteration.
        array_window[peak_idx] = -1
        threshold = 0.1 * peak
        i_l = _extend_peak(array_window, peak_idx, -1, 0, threshold)
        i_r = _extend_peak(array_window, peak_idx, 1, rowNum - 1, threshold)
        peak_len_list.append((1 + i_l + i_r) * bin)
    return misc.median(peak_len_list)
def _floor_mean(values):
    """Average ``values`` with Python 2 ``/`` semantics on any Python.

    An integral total is floor-divided, so integer shift sizes stay
    integers; anything else uses true division.
    """
    import numbers

    total = sum(values)
    if isinstance(total, numbers.Integral):
        return total // len(values)
    return total / len(values)


def _assign_input_shift(readData, chip_names, input_names):
    """Give every input file the mean shift size of the given ChIP files."""
    mean_shift = _floor_mean([readData.shift_size[chip]
                              for chip in chip_names])
    for name in input_names:
        readData.shift_size[name] = mean_shift


def _estimate_chip_shifts(readData, chip_names):
    """Estimate and store the shift size of each ChIP file in chip_names."""
    for chip_filename in chip_names:
        info("estimating for %-10s", chip_filename)
        shift_list = []
        for count, chrom in enumerate(readData.chr_list):
            # Estimate the shift size for five chromosomes and take the
            # median of these as the estimator.
            if count == 5:
                break
            strands = readData.data_dict_by_strands[chrom][chip_filename]
            shift = shift_size_per_chrom(strands['f'], strands['r'])
            info("%-10s %d", chrom, shift)
            shift_list.append(shift)
        # Pass a copy of the list to misc.median.
        shift_median = misc.median(shift_list[:])
        info("%-10s %d", chip_filename, shift_median)
        readData.shift_size[chip_filename] = shift_median


def estimate_shift_size(readData, parameter):
    """Fill ``readData.shift_size`` with one shift size per file.

    When ``parameter.shift_size`` is not the string ``"-1"`` it is taken as
    a comma-separated list of user-supplied values:

    * one value: applied to every file in ``readData.filename_list``;
    * several values with ``parameter.difftest is True``: assigned in order
      to ``readData.chip_filename_list``; the input files of each group get
      the mean of that group's ChIP values;
    * several values otherwise: assigned in order to
      ``readData.chip1_filename_list``; the group 1 input files get the
      mean of all supplied values.

    Otherwise the shift size of each ChIP file is estimated as the median
    of ``shift_size_per_chrom`` over the first five chromosomes, and the
    input files of each group get the mean of that group's ChIP estimates.
    Group 2 is only processed when ``parameter.difftest is True``.

    :param readData: read container; its ``shift_size`` dict is updated.
    :param parameter: object providing ``shift_size`` and ``difftest``.
    :return: ``None``.
    """
    # If the shift size is provided by the user, skip estimating it.
    if parameter.shift_size != "-1":
        shift_list = parameter.shift_size.split(',')
        if len(shift_list) == 1:
            for name in readData.filename_list:
                readData.shift_size[name] = int(shift_list[0])
                info("%-10s %s", name, shift_list[0])
        elif parameter.difftest is True:
            for idx, name in enumerate(readData.chip_filename_list):
                readData.shift_size[name] = int(shift_list[idx])
                info("%-10s %s", name, shift_list[idx])
            _assign_input_shift(readData, readData.chip1_filename_list,
                                readData.input1_filename_list)
            _assign_input_shift(readData, readData.chip2_filename_list,
                                readData.input2_filename_list)
        else:
            for idx, chip in enumerate(readData.chip1_filename_list):
                readData.shift_size[chip] = int(shift_list[idx])
                info("%-10s %s", chip, shift_list[idx])
            for input_name in readData.input1_filename_list:
                # Mean over every supplied value, not only the assigned ones.
                mean_shift = _floor_mean([int(x) for x in shift_list])
                readData.shift_size[input_name] = mean_shift
                info("%-10s %s", input_name, mean_shift)
        return

    info("begin estimating the shift size...")
    _estimate_chip_shifts(readData, readData.chip1_filename_list)
    _assign_input_shift(readData, readData.chip1_filename_list,
                        readData.input1_filename_list)
    if parameter.difftest is True:
        # If we're calling differential binding.
        _estimate_chip_shifts(readData, readData.chip2_filename_list)
        _assign_input_shift(readData, readData.chip2_filename_list,
                            readData.input2_filename_list)
    return
def _extend_peak(bin_counts, peak_idx, step, last_idx, threshold):
    """Grow a peak from ``peak_idx`` towards ``last_idx`` and mark used bins.

    ``step`` is -1 (towards index 0) or +1 (towards the last bin).  Bins at
    or above ``threshold`` are absorbed; a single bin below the threshold is
    bridged when the bin right after it is at or above the threshold.
    Absorbed bins are set to -1 in place.  Returns the offset reached, which
    starts at 1 even when nothing is absorbed.
    """
    offset = 1
    if peak_idx == last_idx:
        # The peak already sits on this boundary.
        return offset
    while True:
        # Absorb the run of neighbouring bins at or above the threshold.
        while bin_counts[peak_idx + step * offset] >= threshold:
            bin_counts[peak_idx + step * offset] = -1
            if peak_idx + step * offset == last_idx:
                return offset
            offset += 1
        if peak_idx + step * offset == last_idx:
            # The boundary bin is below the threshold: mark it and stop.
            bin_counts[last_idx] = -1
            return offset
        # Continue across a one-bin gap if the bin after it qualifies.
        if bin_counts[peak_idx + step * (offset + 1)] >= threshold:
            bin_counts[peak_idx + step * offset] = -1
            offset += 1
        else:
            return offset


def get_window_size2(array, bin=20, iter=100):
    """Estimate the window size from per-position read counts.

    The counts are summed into consecutive bins of ``bin`` positions (a
    trailing partial bin is dropped).  Then, ``iter`` times, the highest
    remaining bin is taken as a peak and extended to the left and to the
    right over bins holding at least 10% of the peak value, bridging
    single-bin gaps.  Every bin used is set to -1 so later iterations skip
    it.  Each peak contributes ``(1 + left_offset + right_offset) * bin``
    and the median (``misc.median``) of those lengths is returned.

    :param array: 1-D array of read counts per position.
    :param bin: number of positions per bin.
    :param iter: number of peaks to measure.
    :return: ``misc.median`` of the measured peak lengths.
    :raises ValueError: if ``array`` is shorter than one bin.
    """
    chr_len = array.size
    # Floor division: ``chr_len / bin`` is a float on Python 3 and cannot be
    # used as an array shape.
    rowNum = chr_len // bin
    if rowNum == 0:
        raise ValueError(
            "array of size %d is shorter than one bin of %d" % (chr_len, bin))

    # reshape + sum is the same binning as a strided (rowNum, bin) view but
    # does not depend on the input being C-contiguous.  The sum is a fresh
    # array, so marking bins below never modifies ``array``.
    array_window = numpy.sum(array[:rowNum * bin].reshape(rowNum, bin), 1)

    peak_len_list = []
    for _ in range(iter):
        # argmax returns the first occurrence of the maximum.
        peak_idx = int(numpy.argmax(array_window))
        peak = array_window[peak_idx]
        # Set the peak to -1 so that it won't confuse the next iteration.
        array_window[peak_idx] = -1
        threshold = 0.1 * peak
        i_l = _extend_peak(array_window, peak_idx, -1, 0, threshold)
        i_r = _extend_peak(array_window, peak_idx, 1, rowNum - 1, threshold)
        peak_len_list.append((1 + i_l + i_r) * bin)
    return misc.median(peak_len_list)
def check_sadi_consistence(self, fragment):
    """
    Check whether the same-distance (SADI) restraints of a fragment make
    sense.

    For every SADI line with more than two atom pairs, the distance of each
    pair is computed and the standard deviation of all distances is
    inspected.  If the standard deviation is larger than 0.065, the
    distances are tested for outliers (``nalimov_test``).  If it is larger
    than 2.5 times the restraint's own deviation (default 0.02) and nothing
    has been flagged so far, the whole line is reported.  Findings are
    printed.

    :param fragment: frag name
    :return: False if a restraint was reported as suspicious, if a SADI
        line has nothing after the keyword, or if a restraint names an atom
        that is not part of the fragment; True otherwise.
    """
    atoms = self.get_atoms(fragment)
    # Lines are only split into new lists below, so the stored restraints
    # are never modified and no copy is needed.
    restr = self.get_restraints(fragment)
    atnames = self.get_atomnames(fragment, uppercase=True)
    good = True
    for num, raw_line in enumerate(restr):
        prefixes = []
        dev = 0.02
        line = raw_line.split()
        if not line or line[0].upper() != 'SADI':
            continue
        prefixes.append(line[0])
        del line[0]
        try:
            # A first token that does not start with a letter is taken as
            # the standard deviation of the restraint.
            if not str(line[0][0]).isalpha():
                prefixes.append(line[0])
                dev = line[0]
                del line[0]  # delete standard deviation
        except IndexError:
            return False
        if len(line) % 2 == 1:  # test for uneven atoms count
            print('*** Inconsistent SADI restraint line {} of "{}". '
                  'Not all atoms form a pair ***'.format(num, fragment))
        pairs = pairwise(line)
        if len(pairs) <= 2:
            # Too few pairs for a meaningful statistic: skip this line only.
            # Returning here would discard problems found in earlier lines
            # and leave the remaining lines unchecked.
            continue
        cell = self.get_cell(fragment)
        distances = []
        pairlist = []
        for pair in pairs:
            if pair in pairlist or tuple(reversed(pair)) in pairlist:
                print('*** Duplicate atom pair "{}" in SADI restraint line {} of "{}". ***'
                      .format(" ".join(pair), num, fragment))
            pairlist.append(pair)
            try:
                a = atoms[atnames.index(pair[0])][2:5]
                b = atoms[atnames.index(pair[1])][2:5]
            except ValueError:
                # The restraint names an atom the fragment does not have.
                return False
            a = [float(x) for x in a]
            b = [float(y) for y in b]
            distances.append(atomic_distance(a, b, cell))
        stdev = std_dev(distances)
        # Only do the outlier test if the standard deviation is
        # suspiciously large:
        if stdev > 0.065:
            outliers = nalimov_test(distances)
            if outliers:
                print("\nFragment {}:".format(fragment))
                for x in outliers:
                    pair = ' '.join(pairlist[x])
                    print('*** Suspicious deviation of atom pair "{}" ({:4.3f} A, median: {:4.3f}) after '
                          'line {} in {}.txt ***'.format(pair, distances[x], median(distances),
                                                         self.get_startline(fragment) + num + 1,
                                                         self.get_db_name(fragment)))
                    print('*** {} ... ***'.format(restr[num][:60]))
                    good = False
        if (stdev > (2.5 * float(dev))) and good:
            print("\nFragment {}:".format(fragment))
            print(
                '*** Suspicious restraints in SADI line {} with high standard deviation {:4.3f} '
                '(median length: {:4.3f} A) ***'.format(num + 1, stdev, median(distances)))
            print('*** ' + ' '.join(prefixes + line) + ' ***')
            good = False
    return good