Ejemplo n.º 1
0
def load_data():
    """Build the model inputs and target for chromosome 16.

    The command line arguments (see ``parse_arguments``) give the directory
    holding the sequence file and the names of the nucleosome density and
    RNA-seq csv files.

    Returns:
        X_: sliding windows of width 2001 over the one-hot encoded sequence,
            reshaped with a trailing axis of size 1.
        rna_inputs: sliding windows of width 10 over the RNA-seq signal,
            trimmed so that they line up with the sequence windows, with
            shape (number of windows, 10, 1).
        y_true: nucleosome density with the first and last 1000 positions
            removed, divided by the threshold returned by ``nuc_occupancy``.
    """
    window_nuc = 2001
    half_wx = window_nuc // 2
    window_rna = 10
    half_wx_rna = window_rna // 2

    args = parse_arguments()
    path_to_directory = os.path.dirname(os.path.dirname(args.directory))
    # Path leading to the seq_chr_sacCer3 directory.
    path_to_file = os.path.join(path_to_directory, 'seq_chr_sacCer3',
                                args.directory, 'chr16.hdf5')

    # The context manager closes the file even if reading fails.
    with h5py.File(path_to_file, 'r') as f:
        nucleotid = np.array(f['data'])

    # One-hot encoding: value k (starting at 1) sets column k - 1.
    X_one_hot = (np.arange(nucleotid.max()) == nucleotid[..., None] -
                 1).astype(int)
    X_ = X_one_hot.reshape(X_one_hot.shape[0],
                           X_one_hot.shape[1] * X_one_hot.shape[2])

    nuc_directory = os.path.dirname(args.nuc)
    nuc_file = os.path.join(nuc_directory, 'Start_data', args.nuc)

    nuc_density = pd.read_csv(nuc_file)
    y_true = nuc_density[nuc_density.chr == 'chr16'].value.values

    X_slide = rolling_window(X_, window=(window_nuc, 4))
    X_ = X_slide.reshape(X_slide.shape[0], X_slide.shape[2], X_slide.shape[3],
                         1)

    threshold = nuc_occupancy(nuc_file, return_threshold=True)

    rna_directory = os.path.dirname(args.rna_seq)
    rna_file = os.path.join(rna_directory, 'Start_data', args.rna_seq)

    rna_density = pd.read_csv(rna_file)
    rna_density = rna_density[rna_density.chr == 'chr16'].value.values

    if not args.predicted_rnaseq:
        # Signed log transform. Both masks are computed before any value is
        # overwritten: a value in (0, 1) has a negative log and must not be
        # picked up a second time by the "negative" mask.
        positive = rna_density > 0
        negative = rna_density < 0
        rna_density[positive] = np.log(rna_density[positive])
        rna_density[negative] = -np.log(-rna_density[negative])

    rna_inputs = rolling_window(rna_density, window=(window_rna, ))
    # Keep only the RNA windows matching the positions kept in y_true.
    rna_inputs = rna_inputs[half_wx - half_wx_rna:-half_wx + half_wx_rna - 1]
    rna_inputs = rna_inputs.reshape(rna_inputs.shape[0], window_rna, 1)

    y_true = y_true[half_wx:-half_wx]
    # Not done in place: an in-place true division raises on integer arrays.
    y_true = y_true / float(threshold)

    return X_, rna_inputs, y_true
Ejemplo n.º 2
0
def load_data(seq2seq=False, args=None):
    """Load the test chromosome and cut its n-mer encoding into windows.

    Args:
        seq2seq: if True, consecutive windows are spaced by the output
            length of the model named by ``args.model`` (second item of its
            ``model_dictionary()`` entry); otherwise ``rolling_window`` is
            called without ``asteps``.
        args: forwarded to ``parse_arguments``.

    Returns:
        The windows of width 2001 over the n-mer one-hot encoding, reshaped
        to (number of windows, 2001, 1, 4 ** n).
    """
    window = 2001
    args = parse_arguments(args)
    path_to_directory = os.path.dirname(os.path.dirname(args.directory))
    # Path leading to the seq_chr_sacCer3 directory.
    path_to_file = os.path.join(
        path_to_directory,
        "seq_chr_sacCer3",
        args.directory,
        "chr" + args.test + ".hdf5",
    )

    # The context manager closes the file even if reading fails.
    with h5py.File(path_to_file, "r") as f:
        nucleotid = np.array(f["data"])

    if args.reversed_seq:
        # Swap 1 <-> 2 and 3 <-> 4 through the temporary values 5..8 so that
        # a value that was already swapped is not swapped back.
        nucleotid[nucleotid == 1] = 5
        nucleotid[nucleotid == 2] = 6
        nucleotid[nucleotid == 3] = 7
        nucleotid[nucleotid == 4] = 8
        nucleotid[nucleotid == 5] = 2
        nucleotid[nucleotid == 6] = 1
        nucleotid[nucleotid == 7] = 4
        nucleotid[nucleotid == 8] = 3

        nucleotid = nucleotid[::-1]

    n = int(args.k)
    num_classes = pow(4, n)

    X_ = nmer_patch(nucleotid, n)

    if seq2seq:
        _, output_len = model_dictionary()[args.model]
        X_slide = rolling_window(X_,
                                 window=(window, num_classes),
                                 asteps=(output_len, num_classes))
    else:
        X_slide = rolling_window(X_, window=(window, num_classes))

    # The reshape is the same for both modes.
    X_ = X_slide.reshape(X_slide.shape[0], X_slide.shape[2], 1,
                         X_slide.shape[3])

    return X_
Ejemplo n.º 3
0
def _calculate_rolling_mean(x, batch_size, sample_len, output_len,
                            num_classes):
    """Average x over blocks of sample_len positions.

    x is cut with ``rolling_window`` using a window and a step both equal to
    (x.shape[0], sample_len, num_classes), the blocks are reshaped to
    (output_len, batch_size, sample_len, num_classes) and averaged over the
    sample_len axis.

    Returns:
        Array of shape (batch_size, output_len, num_classes).
    """
    block_shape = (x.shape[0], sample_len, num_classes)
    blocks = rolling_window(x, window=block_shape, asteps=block_shape)
    blocks = blocks.reshape((output_len, batch_size, sample_len, num_classes))
    averaged = np.mean(blocks, axis=2)
    # Put the batch axis first.
    return np.swapaxes(averaged, 0, 1)
Ejemplo n.º 4
0
def nmer_patch(nucleotid_, window, n):
    """One-hot encode every n-mer of a sequence, zero-padded at both ends.

    Args:
        nucleotid_: sequence with values starting at 1. Its length must be
            equal to ``window`` for the final assignment to succeed.
        window: number of rows of the returned array.
        n: length of the n-mers.

    Returns:
        Float array of shape (window, 1, 4 ** n). Row i is the one-hot
        encoding of the n-mer placed at position i; the first (n - 1) // 2
        and last n // 2 rows, which have no complete n-mer, stay at zero.
    """
    # NOTE(review): assumes rolling_window returns an array of shape
    # (len(nucleotid_) - n + 1, n) - confirm against its definition.
    nmers = rolling_window(nucleotid_ - 1, window=n)

    # Index of each n-mer in base 4. The weighted sum is computed directly
    # rather than as a weighted average multiplied back by the sum of the
    # weights: that float round trip is not guaranteed to return exact
    # integers, which the equality test below relies on.
    weights = [pow(4, i) for i in range(n)]
    nmer_index = np.tensordot(nmers, weights, axes=([1], [0]))

    X_one_hot = (np.arange(pow(4, n)) == nmer_index[..., None]).astype(int)
    _X_ = np.zeros((window, 1, pow(4, n)))
    # An explicit end index is used because the negative form -(n // 2) is
    # -0 when n == 1, which selects nothing.
    start = (n - 1) // 2
    stop = window - n // 2
    _X_[start:stop, 0] = X_one_hot
    return _X_
Ejemplo n.º 5
0
def nmer_patch(nucleotid, n):
    """One-hot encode every n-mer of a sequence.

    Args:
        nucleotid: 2D array whose first column holds the sequence, with
            values starting at 1.
        n: length of the n-mers.

    Returns:
        For n == 1, the integer one-hot array of the sequence. Otherwise a
        float array of shape (len(nucleotid), 4 ** n) where the first
        (n - 1) // 2 and last n // 2 rows, which have no complete n-mer,
        stay at zero.
    """
    nucleotid = nucleotid[:, 0]
    # NOTE(review): assumes rolling_window returns an array of shape
    # (len(nucleotid) - n + 1, n) - confirm against its definition.
    nmers = rolling_window(nucleotid - 1, window=n)

    # Index of each n-mer in base 4. The weighted sum is computed directly
    # rather than as a weighted average multiplied back by the sum of the
    # weights: that float round trip is not guaranteed to return exact
    # integers, which the equality test below relies on.
    weights = [pow(4, i) for i in range(n)]
    nmer_index = np.tensordot(nmers, weights, axes=([1], [0]))

    X_one_hot = (np.arange(pow(4, n)) == nmer_index[..., None]).astype(int)

    if n == 1:
        return X_one_hot

    length = len(nucleotid)
    _X_ = np.zeros((length, pow(4, n)))
    _X_[(n - 1) // 2:length - n // 2] = X_one_hot
    return _X_
Ejemplo n.º 6
0
    def process(self):
        """
            Load the sequence of a chromosome from its .hdf5 file and process
            it, as well as its reverse strand, with a rolling window so that
            the model can handle it.

            return:
                seq, reverse: the windows of width 299 over the one-hot
                encoded sequence and over the one-hot encoded output of
                _reverse_dna, both reshaped to (number of windows, 299, 4, 1)
        """
        WX = 299

        path_to_directory = os.path.dirname(os.path.dirname(self.directory))
        path_to_directory = os.path.join(path_to_directory, 'seq_chr',
                                         self.directory + '/')
        path_to_file = path_to_directory + 'chr' + str(self.num_chr) + '.hdf5'

        # Explicit read-only mode (the default mode depends on the h5py
        # version); the context manager closes the file even on error.
        with h5py.File(path_to_file, 'r') as f:
            seq = np.array(f['data'])

        def _slide(one_hot):
            # Cut a one-hot encoded strand into windows of width WX.
            slide = rolling_window(one_hot,
                                   window=(WX, 4),
                                   asteps=None,
                                   wsteps=None,
                                   axes=None,
                                   toend=True)
            return slide.reshape(slide.shape[0], WX, 4, 1)

        reverse = self._reverse_dna(seq)

        seq = self._one_hot_encoder(seq)
        reverse = self._one_hot_encoder(reverse)

        return _slide(seq), _slide(reverse)
Ejemplo n.º 7
0
def process(nucleotid):
    """
        Turn a DNA sequence into the windowed input expected by the model.

        The sequence is one-hot encoded with one_hot_encoder, then cut into
        windows of 2001 positions with rolling_window.

        Args:
            nucleotid: array corresponding to the DNA sequence
            (shape = (len, 1) according to the original documentation)
        return:
            array of shape (number of windows, 2001, 4, 1), ready to be
            passed as input of a model to make predictions.
    """
    WX = 2001
    encoded = one_hot_encoder(nucleotid)
    windows = rolling_window(encoded, window=(WX, 4))
    # Trailing axis of size 1 added for the model.
    return windows.reshape(windows.shape[0], WX, 4, 1)
def load_data():
    """Build two interleaved sets of windows over chromosome 16.

    The sequence is one-hot encoded and cut into windows of width 2001
    spaced by half the output length of the model named by ``args.model``
    (rounded up). Windows at even and odd ranks are returned separately.

    Returns:
        X_1: windows at even ranks, with a trailing axis of size 1.
        X_2: windows at odd ranks, with a trailing axis of size 1.
        y_true: values of the 'chr16' rows of the csv file, sliced from
            position 1000 to X_1.shape[0] * output_len + 1000 - half_len.
        half_len: the step between two consecutive windows.
    """
    window = 2001
    half_wx = window // 2

    args = parse_arguments()
    # Path leading to the seq_chr_sacCer3 directory.
    path_to_directory = os.path.dirname(os.path.dirname(args.directory))
    path_to_file = os.path.join(path_to_directory, 'seq_chr_sacCer3',
                                args.directory, 'chr16.hdf5')

    # The context manager closes the file even if reading fails.
    with h5py.File(path_to_file, 'r') as f:
        # keys() is a view in Python 3 and cannot be indexed directly.
        first_key = list(f.keys())[0]
        nucleotid = np.array(f[first_key])

    # One-hot encoding: value k (starting at 1) sets column k - 1.
    X_one_hot = (np.arange(nucleotid.max()) == nucleotid[..., None] -
                 1).astype(int)
    X_ = X_one_hot.reshape(X_one_hot.shape[0],
                           X_one_hot.shape[1] * X_one_hot.shape[2])

    proba_directory = os.path.dirname(args.file)
    proba_file = os.path.join(proba_directory, 'Start_data', args.file)

    proba = pd.read_csv(proba_file)
    y_true = proba[proba.chr == 'chr16'].value.values

    _, output_len = model_dictionary()[args.model]

    # Half of the output length, rounded up.
    half_len = (output_len + 1) // 2

    X_slide = rolling_window(X_, window=(window, 4), asteps=(half_len, 4))
    X_ = X_slide.reshape(X_slide.shape[0], X_slide.shape[2], X_slide.shape[3],
                         1)
    X_1 = X_[::2]
    X_2 = X_[1::2]

    y_true = y_true[half_wx:X_1.shape[0] * output_len + half_wx - half_len]

    return X_1, X_2, y_true, half_len
Ejemplo n.º 9
0
def _convert_array_to_multi(myArray, number_of_lines, number_of_column):
    """
       Convert a numpy array to multi array and if the shape is not correct
       then reshape it (removing starting elements in most of case).
    """
    if len(myArray) > number_of_lines * number_of_column:
        # if the array has not the right shape,
        # then reshape it by removing x starting elements
        resized_array = np.delete(
            myArray,
            range(0,
                  len(myArray) - (number_of_lines * number_of_column)), 0)
        res = np.reshape(resized_array, (number_of_lines, number_of_column))

    elif len(myArray) < number_of_lines * number_of_column:
        myArray = rolling_window(myArray, window=number_of_column)
        np.random.shuffle(myArray)
        res = myArray[:number_of_lines]
    else:
        res = np.reshape(myArray, (number_of_lines, number_of_column))

    return res
 def _process(self, nucleotid):
     """
         Rescale and one-hot encode a sequence, then cut it into windows of
         width Sequence.WX.

         return: array of shape (number of windows, Sequence.WX, 4, 1)
     """
     width = Sequence.WX
     encoded = self._one_hot_encoder(self._rescale(nucleotid))
     windows = rolling_window(encoded, window=(width, 4))
     # Trailing axis of size 1 added for the model.
     return windows.reshape(windows.shape[0], width, 4, 1)
Ejemplo n.º 11
0
def _max_norm(y, wx=3001):
    """Divide each value of y by the maximum of the window around it.

    Args:
        y: 1D signal.
        wx: width of the rolling window.

    Returns:
        Array with the same length as y. The first wx // 2 values, and the
        values located after the last complete window, are left unchanged;
        every other value is divided by the maximum of its window. A window
        whose maximum is 0 leads to a division by zero, which is not handled
        here.
    """
    # NOTE(review): assumes rolling_window returns one row per complete
    # window, i.e. len(y) - wx + 1 rows - confirm against its definition.
    y_roll = rolling_window(y, window=wx)
    max_roll = np.max(y_roll, axis=1).astype(float)
    half = wx // 2
    # Explicit end index instead of -half: the negative form is -0 when
    # wx == 1 (selecting nothing) and is off by one when wx is even.
    stop = half + len(max_roll)
    return np.concatenate((y[:half], y[half:stop] / max_roll, y[stop:]),
                          axis=0)