Ejemplo n.º 1
0
def estimate_window_size(readData, parameter):
    """Return the window size, estimating it from the ChIP reads if needed.

    If ``parameter.window_size`` is anything other than -1 it is treated as
    already chosen: an odd value is bumped to the next even number (with a
    warning), written back to ``parameter.window_size`` and returned.

    Otherwise, for every chromosome in ``readData.chr_list`` the read
    coordinates of all files in ``readData.chip_filename_list`` are piled up
    into a per-position count array, ``get_window_size2`` is run on that
    array, and the median of the per-chromosome results (plus one if odd)
    is stored in ``parameter.window_size``.

    Returns:
        The value now stored in ``parameter.window_size``.  Both code paths
        return it, so callers can rely on the return value whether or not
        the size had to be estimated.
    """
    if parameter.window_size != -1:
        # The window size must be even because overlapping half windows
        # are used.
        if parameter.window_size % 2 == 1:
            info("warning: window size should be even number; adding 1 to it.")
            parameter.window_size = parameter.window_size + 1
        return parameter.window_size

    info("begin window_size calculation...")
    window_size_list = []
    for chrom in readData.chr_list:
        coord_array = numpy.zeros(readData.chr_length_dict[chrom],
                                  dtype=numpy.int64)
        for chip_filename in readData.chip_filename_list:
            for x in readData.data_dict[chrom][chip_filename]:
                try:
                    coord_array[x] += 1
                except IndexError:
                    # Deliberate best effort: coordinates beyond the
                    # chromosome length are skipped.
                    pass

        window = get_window_size2(coord_array)
        window_size_list.append(window)
        window_logger.debug("%-10s %s", chrom, window)
    debug("length of windowsize list is " + str(len(window_size_list)))
    window_median = misc.median(window_size_list)
    # Window size cannot be an odd number since we need to use overlapping
    # half windows.
    if window_median % 2 == 1:
        window_median = window_median + 1
    window_logger.debug("window_size: %+10s", window_median)
    info("finishing window size calculation...")
    parameter.window_size = window_median
    return parameter.window_size
Ejemplo n.º 2
0
def estimate_window_size(readData, parameter):
    """Return the window size, estimating it from the ChIP reads if needed.

    If ``parameter.window_size`` is anything other than -1 it is treated as
    already chosen: an odd value is bumped to the next even number (with a
    warning), written back to ``parameter.window_size`` and returned.

    Otherwise, for every chromosome in ``readData.chr_list`` the read
    coordinates of all files in ``readData.chip_filename_list`` are piled up
    into a per-position count array, ``get_window_size2`` is run on that
    array, and the median of the per-chromosome results (plus one if odd)
    is stored in ``parameter.window_size``.

    Returns:
        The value now stored in ``parameter.window_size``.  Both code paths
        return it, so callers can rely on the return value whether or not
        the size had to be estimated.
    """
    if parameter.window_size != -1:
        # The window size must be even because overlapping half windows
        # are used.
        if parameter.window_size % 2 == 1:
            info("warning: window size should be even number; adding 1 to it.")
            parameter.window_size = parameter.window_size + 1
        return parameter.window_size

    info("begin window_size calculation...")
    window_size_list = []
    for chrom in readData.chr_list:
        coord_array = numpy.zeros(readData.chr_length_dict[chrom],
                                  dtype=numpy.int64)
        for chip_filename in readData.chip_filename_list:
            for x in readData.data_dict[chrom][chip_filename]:
                try:
                    coord_array[x] += 1
                except IndexError:
                    # Deliberate best effort: coordinates beyond the
                    # chromosome length are skipped.
                    pass

        window = get_window_size2(coord_array)
        window_size_list.append(window)
        window_logger.debug("%-10s %s", chrom, window)
    debug("length of windowsize list is " + str(len(window_size_list)))
    window_median = misc.median(window_size_list)
    # Window size cannot be an odd number since we need to use overlapping
    # half windows.
    if window_median % 2 == 1:
        window_median = window_median + 1
    window_logger.debug("window_size: %+10s", window_median)
    info("finishing window size calculation...")
    parameter.window_size = window_median
    return parameter.window_size
Ejemplo n.º 3
0
def _count_reads_in_window(counts, positions, low, high, offset, label):
    """Add one to ``counts[p - offset]`` for each ``p`` in ``positions``
    satisfying ``low <= p < high``.

    ``label`` only names the index expression in the debug message that is
    emitted if an index is rejected.
    """
    selected = positions[(positions >= low) & (positions < high)]
    for read in selected:
        try:
            counts[read - offset] += 1
        except IndexError:
            debug("index error ignored. end-start: %d. %s: %d.",
                  counts.size, label, read - offset)


def post_processing_per_peak(strands_dict, chip_list, input_list, chr,
                             start, end, shiftSize, readLength, narrow_peak,
                             remove_artefacts):
    """Compute artefact statistics for one peak and optionally narrow it.

    Args:
        strands_dict: ``strands_dict[chr][sample]['f' | 'r']`` must be numpy
            arrays of read positions (they are filtered with array
            comparisons and used as indices).
        chip_list: sample names whose reads build the ChIP profiles.
        input_list: sample names whose reads build the input profiles
            (only read when ``remove_artefacts`` is True).
        chr: chromosome key into ``strands_dict``.
        start, end: peak boundaries; profiles have ``end - start`` slots.
        shiftSize: mapping whose values are passed to ``misc.median`` when
            one strand of the peak has no reads.
        readLength: reverse-strand positions are shifted back by this
            amount before counting; it is also the roll distance used for
            ``overlap_roll``.
        narrow_peak: when ``is True``, recompute ``start``/``end``.
        remove_artefacts: when ``is True``, compute the overlap statistics.

    Returns:
        ``(start, end, overlap_chip_input, chip_3_maximum, overlap_orig,
        overlap_roll)``.  With ``remove_artefacts`` off the four statistics
        are ``0, 0, 1e-5, 0``.
    """
    width = end - start
    reverse_low = start + readLength
    reverse_high = end + readLength

    chip_forward = numpy.zeros(width)
    chip_reverse = numpy.zeros(width)
    for chip in chip_list:
        strands = strands_dict[chr][chip]
        _count_reads_in_window(chip_forward, strands['f'], start, end,
                               start, "read-start")
        _count_reads_in_window(chip_reverse, strands['r'], reverse_low,
                               reverse_high, reverse_low,
                               "read-start-readLength")

    # using input reads to remove artefacts
    if remove_artefacts is True:
        input_forward = numpy.zeros(width)
        input_reverse = numpy.zeros(width)
        for input_name in input_list:
            strands = strands_dict[chr][input_name]
            _count_reads_in_window(input_forward, strands['f'], start, end,
                                   start, "read-start")
            _count_reads_in_window(input_reverse, strands['r'], reverse_low,
                                   reverse_high, reverse_low,
                                   "read-start-readLength")

        chip_both = chip_forward + chip_reverse
        input_both = input_forward + input_reverse
        chip_total = chip_both.sum()
        if chip_total != 0:
            chip_both = chip_both / chip_total
        input_total = input_both.sum()
        if input_total != 0:
            input_both = input_both / input_total
        overlap_chip_input = numpy.sum(numpy.minimum(chip_both, input_both))

        # Share of the (normalised) signal held by the three tallest
        # positions.
        chip_both.sort()
        chip_3_maximum = numpy.sum(chip_both[-3:])

        reverse_total = chip_reverse.sum()
        if reverse_total != 0:
            chip_reverse = chip_reverse / reverse_total
        forward_total = chip_forward.sum()
        if forward_total != 0:
            chip_forward = chip_forward / forward_total
        # Roll the *normalised* forward profile.  Previously the raw counts
        # were rolled and then divided by the sum of the already-normalised
        # profile (i.e. by ~1), so the rolled profile was never normalised
        # and overlap_roll was not on the same scale as overlap_orig.
        chip_forward_roll = numpy.roll(chip_forward, readLength)

        overlap_orig = numpy.sum(numpy.minimum(chip_forward, chip_reverse))
        # Floor at 1e-5 (the same value the no-artefact branch returns).
        overlap_orig = numpy.max([overlap_orig, 1e-5])
        overlap_roll = numpy.sum(numpy.minimum(chip_forward_roll,
                                               chip_reverse))
    else:
        overlap_chip_input = 0
        chip_3_maximum = 0
        overlap_orig = 1e-5
        overlap_roll = 0

    if narrow_peak is True:
        forward_total = chip_forward.sum()
        reverse_total = chip_reverse.sum()

        if forward_total > 0:
            # New start: first position (from the left) where the cumulative
            # forward fraction exceeds 20%.
            chip_forward = chip_forward / forward_total
            sum_forward = 0
            for i in range(width):
                sum_forward += chip_forward[i]
                if sum_forward > 0.2:
                    new_start = start + i
                    break
        if reverse_total > 0:
            # New end: first position (from the right) where the cumulative
            # reverse fraction exceeds 20%.
            chip_reverse = chip_reverse / reverse_total
            sum_reverse = 0
            for i in range(width - 1, -1, -1):
                sum_reverse += chip_reverse[i]
                if sum_reverse > 0.2:
                    new_end = start + i
                    break

        # There are no reads in the peak: collapse onto the midpoint.
        # Floor division keeps the coordinate an integer on Python 3.
        if forward_total + reverse_total == 0:
            new_start = new_end = (start + end) // 2

        # list(...) is required on Python 3, where dict views cannot be
        # multiplied.  The list is repeated twice, exactly as on Python 2.
        # NOTE(review): 2 * median(values) may have been intended - confirm.
        if forward_total == 0:
            new_start = new_end - misc.median(2 * list(shiftSize.values()))
        if reverse_total == 0:
            new_end = new_start + misc.median(2 * list(shiftSize.values()))
        start = new_start
        end = new_end

    return (start, end, overlap_chip_input, chip_3_maximum,
            overlap_orig, overlap_roll)
Ejemplo n.º 4
0
def get_window_size2(array, bin=20, iter=100):
    """Estimate a window size from a per-position count array.

    The array is summed into consecutive bins of ``bin`` elements (a
    trailing remainder shorter than ``bin`` is not covered).  Then, ``iter``
    times, the highest remaining bin is taken as a peak and extended to the
    left and to the right for as long as neighbouring bins hold at least 10%
    of the peak value, bridging a single below-threshold bin when the bin
    after it is above threshold again.  Every bin consumed this way is set
    to -1 so later iterations do not pick it up.

    Each iteration contributes ``(1 + i_l + i_r) * bin`` to a list, and
    ``misc.median`` of that list is returned.

    NOTE(review): ``std`` is presumably ``numpy.lib.stride_tricks``; the
    hand-written strides assume ``array`` is 1-D and contiguous - confirm.
    """
    chr_len = array.size
    rowNum = int(chr_len/bin)
    # View the array as rowNum rows of `bin` consecutive elements, then sum
    # each row to obtain the per-bin totals.
    array_window = std.as_strided(array, (rowNum, bin), 
            (bin*array.itemsize, 1*array.itemsize))
    array_window = numpy.sum(array_window, 1)
    peak_len_list = []
    for x in range(iter):
        peak = numpy.max(array_window)
        # First bin holding the maximum (ties resolve to the lowest index).
        peak_idx = numpy.where(array_window == peak)[0][0]
        # set the peak to -1 so that it won't confuse the next iteration.
        array_window[peak_idx]= -1 
        if peak_idx == 0:  # Check if the window reaches the left boundary.
            left_boundary_not_reached = False
        else:
            left_boundary_not_reached = True    
        # Check if the window reaches of the right boundary.
        if peak_idx == rowNum-1: 
            right_boundary_not_reached = False
        else: 
            right_boundary_not_reached = True
        # Both offsets start at 1, so a peak that is not extended at all
        # still contributes a length of three bins.
        i_l = 1
        i_r = 1 
        # Be careful if anything equals to -1. 
        # Though it's unlikely that it will happen.
        while left_boundary_not_reached:

            # Consume contiguous bins to the left that reach 10% of the peak.
            while (array_window[peak_idx-i_l] >= 0.1*peak): 
                array_window[peak_idx-i_l] = -1
                if peak_idx-i_l==0:
                    left_boundary_not_reached = False
                    break
                i_l += 1
            # Stopped on a below-threshold bin that is the first bin of the
            # array: consume it and finish the left side.
            if left_boundary_not_reached:
                if peak_idx-i_l==0:
                    array_window[peak_idx-i_l] = -1
                    left_boundary_not_reached = False
            
            if left_boundary_not_reached: 
                # Will continue counting if the next window(one window gap) 
                # has above the 10% mode reads. 
                if (array_window[peak_idx-i_l-1] >= 0.1*peak): 
                    array_window[peak_idx-i_l] = -1
                    i_l += 1
                else:
                    break

        # Mirror image of the left-hand extension, walking to the right.
        while right_boundary_not_reached:
            while (array_window[peak_idx+i_r] >= 0.1*peak):
                array_window[peak_idx+i_r] = -1
                if peak_idx+i_r ==rowNum-1:
                    right_boundary_not_reached = False
                    break
                i_r += 1
            if right_boundary_not_reached:
                if peak_idx+i_r==rowNum-1:
                    array_window[peak_idx+i_r] = -1
                    right_boundary_not_reached = False

            if right_boundary_not_reached:
                # Bridge a single-bin gap if the bin after it is above
                # threshold again.
                if (array_window[peak_idx+i_r+1] >=0.1*peak): 
                    array_window[peak_idx+i_r] = -1
                    i_r += 1
                else:
                    break
        peak_len_list.append((1+i_l+i_r)*bin)

    return misc.median(peak_len_list)
Ejemplo n.º 5
0
def _assign_mean_shift_to_inputs(readData, chip_names, input_names):
    """Give every file in ``input_names`` the mean shift size of ``chip_names``."""
    chip_shifts = [readData.shift_size[chip] for chip in chip_names]
    mean_shift = sum(chip_shifts)/len(chip_shifts)
    for input_name in input_names:
        readData.shift_size[input_name] = mean_shift


def _estimate_shift_for_file(readData, filename):
    """Estimate and store the shift size of one ChIP file.

    The per-chromosome estimate is computed for at most the first five
    chromosomes of ``readData.chr_list`` and their median is stored in
    ``readData.shift_size[filename]``.
    """
    info("estimating for %-10s", filename)
    per_chrom_shifts = []
    for position, chrom in enumerate(readData.chr_list):
        if position == 5:
            break
        forward = readData.data_dict_by_strands[chrom][filename]['f']
        reverse = readData.data_dict_by_strands[chrom][filename]['r']
        chrom_shift = shift_size_per_chrom(forward, reverse)
        info("%-10s %d", chrom, chrom_shift)
        per_chrom_shifts.append(chrom_shift)
    # Hand misc.median a copy of the list, as before.
    median_shift = misc.median(per_chrom_shifts[:])
    info("%-10s %d", filename, median_shift)
    readData.shift_size[filename] = median_shift


def _apply_user_shift_sizes(readData, parameter):
    """Fill ``readData.shift_size`` from the comma-separated user setting."""
    user_shifts = parameter.shift_size.split(',')

    if len(user_shifts) == 1:
        # A single value applies to every file.
        for name in readData.filename_list:
            readData.shift_size[name] = int(user_shifts[0])
            info("%-10s %s", name, user_shifts[0])
        return

    if parameter.difftest is True:
        # One value per ChIP file; each input group gets the mean of its
        # own ChIP group.
        for idx, name in enumerate(readData.chip_filename_list):
            readData.shift_size[name] = int(user_shifts[idx])
            info("%-10s %s", name, user_shifts[idx])
        _assign_mean_shift_to_inputs(readData, readData.chip1_filename_list,
                                     readData.input1_filename_list)
        _assign_mean_shift_to_inputs(readData, readData.chip2_filename_list,
                                     readData.input2_filename_list)
        return

    # One value per group-1 ChIP file; inputs get the mean of all values.
    for idx, chip in enumerate(readData.chip1_filename_list):
        readData.shift_size[chip] = int(user_shifts[idx])
        info("%-10s %s", chip, user_shifts[idx])
    for input_name in readData.input1_filename_list:
        mean_shift = sum([int(x) for x in user_shifts])/len(user_shifts)
        readData.shift_size[input_name] = mean_shift
        info("%-10s %s", input_name, mean_shift)


def estimate_shift_size(readData, parameter):
    """Populate ``readData.shift_size`` for every ChIP and input file.

    When ``parameter.shift_size`` is not the string "-1" it is parsed as a
    comma-separated list of user-supplied shift sizes and nothing is
    estimated.  Otherwise each group-1 ChIP file (and, when
    ``parameter.difftest`` is True, each group-2 ChIP file) is estimated
    from its stranded reads, and the input files of each group receive the
    mean of that group's ChIP shift sizes.  Returns None.
    """
    if parameter.shift_size != "-1":
        _apply_user_shift_sizes(readData, parameter)
        return

    info("begin estimating the shift size...")
    for chip_filename in readData.chip1_filename_list:
        _estimate_shift_for_file(readData, chip_filename)
    _assign_mean_shift_to_inputs(readData, readData.chip1_filename_list,
                                 readData.input1_filename_list)

    if parameter.difftest is True:  # differential binding: second group too
        for chip_filename in readData.chip2_filename_list:
            _estimate_shift_for_file(readData, chip_filename)
        _assign_mean_shift_to_inputs(readData, readData.chip2_filename_list,
                                     readData.input2_filename_list)
    return
Ejemplo n.º 6
0
def get_window_size2(array, bin=20, iter=100):
    """Estimate a window size from a per-position count array.

    The array is summed into consecutive bins of ``bin`` elements (a
    trailing remainder shorter than ``bin`` is not covered).  Then, ``iter``
    times, the highest remaining bin is taken as a peak and extended to the
    left and to the right for as long as neighbouring bins hold at least 10%
    of the peak value, bridging a single below-threshold bin when the bin
    after it is above threshold again.  Every bin consumed this way is set
    to -1 so later iterations do not pick it up.

    Each iteration contributes ``(1 + i_l + i_r) * bin`` to a list, and
    ``misc.median`` of that list is returned.

    NOTE(review): ``std`` is presumably ``numpy.lib.stride_tricks``; the
    hand-written strides assume ``array`` is 1-D and contiguous - confirm.
    """
    chr_len = array.size
    # Floor division: the bin count is used as an array shape and must be an
    # integer (plain "/" yields a float on Python 3).
    rowNum = chr_len // bin
    # View the array as rowNum rows of `bin` consecutive elements, then sum
    # each row to obtain the per-bin totals.
    array_window = std.as_strided(array, (rowNum, bin),
                                  (bin * array.itemsize, 1 * array.itemsize))
    array_window = numpy.sum(array_window, 1)
    peak_len_list = []
    for x in range(iter):
        peak = numpy.max(array_window)
        # First bin holding the maximum (ties resolve to the lowest index).
        peak_idx = numpy.where(array_window == peak)[0][0]
        # set the peak to -1 so that it won't confuse the next iteration.
        array_window[peak_idx] = -1
        # A peak sitting on the first/last bin cannot be extended that way.
        left_boundary_not_reached = peak_idx != 0
        right_boundary_not_reached = peak_idx != rowNum - 1
        # Both offsets start at 1, so a peak that is not extended at all
        # still contributes a length of three bins.
        i_l = 1
        i_r = 1
        # Be careful if anything equals to -1.
        # Though it's unlikely that it will happen.
        while left_boundary_not_reached:

            # Consume contiguous bins to the left that reach 10% of the peak.
            while (array_window[peak_idx - i_l] >= 0.1 * peak):
                array_window[peak_idx - i_l] = -1
                if peak_idx - i_l == 0:
                    left_boundary_not_reached = False
                    break
                i_l += 1
            # Stopped on a below-threshold bin that is the first bin of the
            # array: consume it and finish the left side.
            if left_boundary_not_reached:
                if peak_idx - i_l == 0:
                    array_window[peak_idx - i_l] = -1
                    left_boundary_not_reached = False

            if left_boundary_not_reached:
                # Will continue counting if the next window(one window gap)
                # has above the 10% mode reads.
                if (array_window[peak_idx - i_l - 1] >= 0.1 * peak):
                    array_window[peak_idx - i_l] = -1
                    i_l += 1
                else:
                    break

        # Mirror image of the left-hand extension, walking to the right.
        while right_boundary_not_reached:
            while (array_window[peak_idx + i_r] >= 0.1 * peak):
                array_window[peak_idx + i_r] = -1
                if peak_idx + i_r == rowNum - 1:
                    right_boundary_not_reached = False
                    break
                i_r += 1
            if right_boundary_not_reached:
                if peak_idx + i_r == rowNum - 1:
                    array_window[peak_idx + i_r] = -1
                    right_boundary_not_reached = False

            if right_boundary_not_reached:
                # Bridge a single-bin gap if the bin after it is above
                # threshold again.
                if (array_window[peak_idx + i_r + 1] >= 0.1 * peak):
                    array_window[peak_idx + i_r] = -1
                    i_r += 1
                else:
                    break
        peak_len_list.append((1 + i_l + i_r) * bin)

    return misc.median(peak_len_list)
Ejemplo n.º 7
0
 def check_sadi_consistence(self, fragment):
     """
     Check if the same-distance (SADI) restraints of a fragment make sense.

     For every SADI restraint line the distances of all listed atom pairs
     are computed and their standard deviation is compared against the
     deviation given on the line (0.02 if none is given). For a large
     standard deviation (> 0.065) the distances are additionally tested
     for outliers. Problems are reported with print().

     SADI lines with two or fewer atom pairs are skipped; they no longer
     end the whole check, so later restraint lines are still examined and
     a problem found on an earlier line is not discarded.

     :param fragment: frag name
     :return: True if no suspicious SADI restraint was found. False if one
              was found, if a SADI line has no arguments at all, or if an
              atom of a restraint is not among the fragment's atom names.
     """
     atoms = self.get_atoms(fragment)
     restr = self.get_restraints(fragment)
     restraints = deepcopy(restr)
     atnames = self.get_atomnames(fragment, uppercase=True)
     good = True
     for num, line in enumerate(restraints):
         prefixes = []
         dev = 0.02
         line = line.split()
         if not line:
             continue
         if line[0].upper() != 'SADI':
             continue
         prefixes.append(line[0])
         del line[0]
         try:
             # A first token that does not start with a letter is taken as
             # the standard deviation of the restraint.
             if not str(line[0][0]).isalpha():
                 prefixes.append(line[0])
                 dev = line[0]
                 del line[0]  # delete standard deviation
         except IndexError:
             # "SADI" without any argument
             return False
         if len(line) % 2 == 1:  # test for uneven atoms count
             print('*** Inconsistent SADI restraint line {} of "{}". '
                   'Not all atoms form a pair ***'.format(num, fragment))
         pairs = pairwise(line)
         distances = []
         pairlist = []
         if len(pairs) <= 2:
             # Too few distances to judge this line; go on with the next
             # restraint instead of returning from the whole check.
             continue
         for i in pairs:
             if i in pairlist or tuple(reversed(i)) in pairlist:
                 print('*** Duplicate atom pair "{}" in SADI restraint line {} of "{}". ***'.format(" ".join(i),
                                                                                                    num,
                                                                                                    fragment))
             pairlist.append(i)
             try:
                 a = atoms[atnames.index(i[0])][2:5]
                 b = atoms[atnames.index(i[1])][2:5]
             except ValueError:
                 # atom name of the restraint is not part of the fragment
                 return False
             a = [float(x) for x in a]
             b = [float(y) for y in b]
             dist = atomic_distance(a, b, self.get_cell(fragment))
             distances.append(dist)
         stdev = std_dev(distances)
         # only do outlier test if standard deviation is suspiciously large:
         if stdev > 0.065:
             outliers = nalimov_test(distances)
             if outliers:
                 print("\nFragment {}:".format(fragment))
                 for x in outliers:
                     pair = ' '.join(pairlist[x])
                     print('*** Suspicious deviation of atom pair "{}" ({:4.3f} A, median: {:4.3f}) after '
                           'line {} in {}.txt ***'.format(pair, distances[x], median(distances),
                                                          self.get_startline(fragment) + num + 1,
                                                          self.get_db_name(fragment))
                           )
                     print('*** {} ... ***'.format(restr[num][:60]))
                     good = False
         # Report a generally high deviation only while nothing else has
         # been flagged so far.
         if (stdev > (2.5 * float(dev))) and good:
             print("\nFragment {}:".format(fragment))
             print(
                 '*** Suspicious restraints in SADI line {} with high standard deviation {:4.3f} '
                 '(median length: {:4.3f} A) ***'.format(num + 1, stdev, median(distances)))
             print('*** ' + ' '.join(prefixes + line) + ' ***')
             good = False
     return good