Example #1
0
    def run(self,
            sample_id,
            chr_id,
            win_size=1000,
            min_r=0.1,
            stride_size=200):
        """
        Walk one chromosome and yield one result per gap segment or window.

        The rows of ``self.ref_gap_obj`` for this chromosome split it into
        alternating gap / non-gap segments. Gap segments are reported as-is.
        Non-gap segments of at least ``win_size`` bp are cut into windows via
        ``seq_slide``; shorter ones are passed on whole.

        Side effects: sets ``self.rb_base_chr`` and
        ``self.rb_mappability_chr`` for the requested chromosome.

        :param sample_id: sample identifier (only used in log messages here)
        :param chr_id: chromosome name without the ``'chr'`` prefix
        :param win_size: window length in bp
        :param min_r: forwarded unchanged to ``self._get_feats_region``
        :param stride_size: forwarded to ``seq_slide`` as the window step
        :return: generator. Gap segments yield
            ``(start, end, length, 0, None)``; every other item is the return
            value of ``self._get_feats_region``.
        """
        chrom = 'chr' + chr_id

        ref_gap_chr = self.ref_gap_obj.loc[
            self.ref_gap_obj['CHROM'] == chrom,
            ['START', 'END']] if not self.ref_gap_obj.empty else None

        # reference base
        logger.info(
            'Loading reference sequence for sample {} chr: {}...'.format(
                sample_id, chr_id))
        self.rb_base_chr = self.ref_fa_obj.fetch(chrom)
        chr_len = len(self.rb_base_chr)

        # reference mappability
        logger.info(
            'Loading reference mappability for sample {} chr: {}...'.format(
                sample_id, chr_id))
        # NOTE(review): the end bound is chr_len - 1; if ``values`` treats the
        # end as exclusive, the last base gets no mappability value - confirm
        # against the bigWig reader in use.
        self.rb_mappability_chr = self.ref_bw_obj.values(
            chrom, 0, chr_len - 1)

        # 1 = usable base, 0 = base inside a reference gap (END is excluded).
        # Slice assignment replaces the former concatenated index array
        # (which relied on the removed ``np.int`` alias and grew
        # quadratically); intervals reaching past the chromosome end are
        # clipped instead of raising.
        rb_base_pos = np.ones(chr_len, dtype=int)
        if ref_gap_chr is not None:
            for gap_start, gap_end in zip(ref_gap_chr['START'],
                                          ref_gap_chr['END']):
                rb_base_pos[gap_start:gap_end] = 0

        seg_values, seg_starts, seg_lengths = find_seg(rb_base_pos)
        assert len(seg_values) == len(seg_starts) == len(seg_lengths)

        for i_val, i_seg_start, i_seg_len in zip(seg_values, seg_starts,
                                                 seg_lengths):
            i_seg_end = i_seg_start + i_seg_len

            if i_val == 0:
                # gap region
                yield i_seg_start, i_seg_end, i_seg_len, 0, None
                continue

            if i_seg_len < win_size:
                # segment shorter than one window: hand it over whole
                yield self._get_feats_region(chr_id, i_seg_start, i_seg_end,
                                             win_size, min_r)
                continue

            i_start_indices, remain_len = seq_slide(i_seg_len, win_size,
                                                    stride_size)
            for i in i_start_indices:
                # seq_slide offsets are relative to the segment start
                i_w_start = i + i_seg_start
                i_w_end = i_w_start + win_size
                yield self._get_feats_region(chr_id, i_w_start, i_w_end,
                                             win_size, min_r)

            if remain_len > 0:
                # Cover the leftover tail with one full window that ends at
                # the segment end. The segment start offset must be included,
                # otherwise the window points at the beginning of the
                # chromosome instead of at this segment.
                i_w_end = i_seg_end
                i_w_start = i_w_end - win_size
                yield self._get_feats_region(chr_id, i_w_start, i_w_end,
                                             win_size, min_r)
Example #2
0
def main(args):
    """
    Merge per-window CNV calls of one chromosome and write them as a TSV file.

    Reads the tab-separated call file ``args.i_root_dir/args.fname``, merges
    every run of rows whose ``indicator`` is 3 in a worker pool
    (``multi_run_wrapper``), appends the remaining rows unchanged, sorts by
    start position and writes the result to ``args.out_root_dir``.

    Side effects: rebinds the module-level names ``win_size``, ``step_size``,
    ``min_seg_len`` and ``cnv_df`` (presumably read by the pool workers -
    confirm against ``multi_run_wrapper``), creates the output directory if
    needed and replaces any existing output file.

    :param args: namespace providing ``win_size``, ``step_size``, ``fname``,
        ``i_root_dir``, ``out_root_dir``, ``sample_id``, ``chr_id``, ``cpus``
        and ``out_type``
    :raises FileNotFoundError: if the input call file does not exist
    """
    global win_size, step_size, min_seg_len, cnv_df

    # input cnv call result
    win_size = args.win_size
    step_size = args.step_size

    in_fname = args.fname
    in_dir = args.i_root_dir
    out_dir = args.out_root_dir

    sample_id = args.sample_id
    chr_id = args.chr_id

    n_cpus = args.cpus

    out_type = args.out_type

    min_seg_len = 5

    in_full_name = os.path.join(in_dir, in_fname)
    if not os.path.exists(in_full_name):
        raise FileNotFoundError('file not found {}'.format(in_full_name))
    # makedirs also creates missing parents and does not fail when another
    # process creates the directory first.
    os.makedirs(out_dir, exist_ok=True)

    cnv_df = pd.read_csv(in_full_name, sep='\t')
    # rows whose p_neu is -1 carry no probabilities: blank all three columns
    cnv_df.loc[cnv_df['p_neu'].values.astype(int) == -1,
               ['p_neu', 'p_del', 'p_dup']] = np.nan

    # find predictive region: consecutive rows with indicator == 3
    cnv_df['pred_ind'] = np.where(cnv_df['indicator'] == 3, 1, 0)
    seg_values, seg_starts, seg_lengths = find_seg(cnv_df['pred_ind'].values)
    predictive_indices = np.where(seg_values == 1)[0]
    seg_start_pd_row_indices = seg_starts[predictive_indices]
    # index start at 0, end point should not include
    seg_end_pd_row_indices = seg_start_pd_row_indices + seg_lengths[
        predictive_indices]

    logger.info('segmenting and merging...')
    pred_seg_ind_lst = list(
        zip(seg_start_pd_row_indices, seg_end_pd_row_indices))
    len_segs = len(pred_seg_ind_lst)

    start_pos = []
    end_pos = []
    merg_p_neu = []
    merg_p_del = []
    merg_p_dup = []
    merg_pre_l = []

    locker = mp.Lock()
    with mp.Pool(n_cpus, initializer=mp_init, initargs=(locker, )) as p:

        # imap keeps the input order, so result i belongs to row range i
        results = p.imap(multi_run_wrapper, pred_seg_ind_lst)

        for i, res in enumerate(results):
            logger.info('finished at {}/{}'.format(i + 1, len_segs))
            if res is None:
                logger.info('{}:{} cannot merge'.format(
                    pred_seg_ind_lst[i][0], pred_seg_ind_lst[i][1]))
                continue
            (re_start_i, re_end_i, re_merg_p_neu_i, re_merg_p_del_i,
             re_merg_p_dup_i, re_merg_pre_l_i) = res

            start_pos.extend(re_start_i)
            end_pos.extend(re_end_i)
            merg_p_neu.extend(re_merg_p_neu_i)
            merg_p_del.extend(re_merg_p_del_i)
            merg_p_dup.extend(re_merg_p_dup_i)
            merg_pre_l.extend(re_merg_pre_l_i)

    out_res = dict()
    out_res['POS_S'] = start_pos
    out_res['POS_E'] = end_pos
    out_res['LEN'] = np.array(end_pos) - np.array(start_pos)
    out_res['P_NEU'] = merg_p_neu
    out_res['P_DEL'] = merg_p_del
    out_res['P_DUP'] = merg_p_dup
    out_res['PRED_L'] = merg_pre_l

    out_cnv_df = pd.DataFrame(data=out_res)

    # Work on an explicit copy so the assignments below modify this frame
    # only and do not go through a slice of cnv_df.
    un_pred_df = cnv_df.loc[
        cnv_df['indicator'] != 3,
        ['seg_s', 'seg_e', 'p_neu', 'p_del', 'p_dup', 'pred_l',
         'indicator']].copy()
    # Mask computed on the copy itself, so no index alignment against the
    # parent frame is needed.
    ind_1_or_2 = (un_pred_df['indicator'] == 1) | (un_pred_df['indicator']
                                                   == 2)
    # NOTE(review): the fixed 200 bp length presumably mirrors the default
    # step size - confirm whether step_size should be used instead.
    un_pred_df.loc[ind_1_or_2,
                   'seg_e'] = un_pred_df.loc[ind_1_or_2, 'seg_s'] + 200
    un_pred_df['seg_len'] = un_pred_df['seg_e'] - un_pred_df['seg_s']
    un_pred_df = un_pred_df[[
        'seg_s', 'seg_e', 'seg_len', 'p_neu', 'p_del', 'p_dup', 'pred_l'
    ]]

    # stack merged and untouched rows, then order everything by start position
    whl_cnv_re = np.concatenate((out_cnv_df.values, un_pred_df.values), axis=0)
    ind = np.argsort(whl_cnv_re[:, 0])
    whl_cnv_re = whl_cnv_re[ind]
    f_out_df = pd.DataFrame(
        data=whl_cnv_re,
        columns=['POS_S', 'POS_E', 'LEN', 'P_NEU', 'P_DEL', 'P_DUP', 'PRED_L'])
    for int_col in ('POS_S', 'POS_E', 'LEN', 'PRED_L'):
        f_out_df[int_col] = f_out_df[int_col].astype(int)

    out_cnv_fn = os.path.join(
        out_dir,
        'M{}_{}_{}_{}_out_cnv_{}-rbf_min5.csv'.format(sample_id, chr_id,
                                                      win_size, step_size,
                                                      out_type))
    if os.path.exists(out_cnv_fn):
        os.remove(out_cnv_fn)

    f_out_df.to_csv(out_cnv_fn, index=False, sep='\t')

    logger.info('Done, the results saved at {}'.format(out_cnv_fn))
Example #3
0
def gen_neu_feats(sample_id,
                  chr_id,
                  rb_base_chr,
                  rb_mappability_chr,
                  bam_obj_whole,
                  gap_regions_chr,
                  cnv_regions_chr,
                  out_chr_fname,
                  min_reg_len=2000,
                  n_regions=4):
    """
    Append feature matrices of neutral (non-gap, non-CNV) regions to a file.

    Positions covered by ``gap_regions_chr`` or ``cnv_regions_chr`` are
    masked out. From the remaining runs of at least ``min_reg_len`` bp, the
    first ``n_regions`` in ascending length order are piled up and their
    feature matrices appended to ``out_chr_fname``.

    :param sample_id: sample identifier (only used in log and return text)
    :param chr_id: chromosome name without the ``'chr'`` prefix
    :param rb_base_chr: reference sequence of the chromosome
    :param rb_mappability_chr: per-position mappability, sliced like
        ``rb_base_chr``
    :param bam_obj_whole: object whose ``pileup`` method is called per region
    :param gap_regions_chr: table with ``START`` / ``END`` columns (END
        excluded), or a falsy value for "no gaps"
    :param cnv_regions_chr: table with ``POS`` / ``END`` columns, or a falsy
        value for "no CNVs"
    :param out_chr_fname: output file, opened in append mode per region
    :param min_reg_len: minimum length of a usable region in bp
    :param n_regions: maximum number of regions to write
    :return: a status message string
    """

    def has_rows(regions):
        # A DataFrame has no truth value (``if df:`` raises ValueError), so
        # test ``.empty`` when available and keep accepting the falsy
        # sentinels (None, [], False) that mean "nothing to exclude".
        if hasattr(regions, 'empty'):
            return not regions.empty
        return bool(regions)

    # 1 = usable base, 0 = excluded base. Slice assignment replaces the
    # former concatenated index array (which relied on the removed ``np.int``
    # alias); intervals reaching past the chromosome end are clipped.
    rb_base_pos = np.ones(len(rb_base_chr), dtype=int)

    # gaps
    if has_rows(gap_regions_chr):
        for gap_start, gap_end in zip(gap_regions_chr['START'],
                                      gap_regions_chr['END']):
            # END is excluded
            rb_base_pos[gap_start:gap_end] = 0

    # cnv region
    # NEU features need to exclude the cnv region
    # but for prediction, cnv regions are not needed to be excluded.
    if has_rows(cnv_regions_chr):
        for cnv_pos, cnv_end in zip(cnv_regions_chr['POS'],
                                    cnv_regions_chr['END']):
            # VCF POS is started with 1, END is also excluded
            rb_base_pos[cnv_pos - 1:cnv_end - 1] = 0

    logger.info(
        'finding the regions to be generated feature matrix, sample {} chr {}...'
        .format(sample_id, chr_id))
    seg_values, seg_starts, seg_lengths = find_seg(rb_base_pos)

    assert len(seg_values) == len(seg_starts) == len(seg_lengths)

    # usable runs that are long enough; one combined mask keeps the indices
    # in ascending order (the former set intersection had no defined order)
    fil_idx = np.where((seg_values == 1) & (seg_lengths >= min_reg_len))[0]

    t_seg_start = seg_starts[fil_idx]
    t_seg_len = seg_lengths[fil_idx]
    # NOTE(review): ascending argsort selects the *shortest* qualifying
    # regions - confirm this is intended rather than the longest.
    frt_idxs = np.argsort(t_seg_len)[:n_regions]

    logger.info(
        'extracting features for neu region len={}, sample {} chr {}...'.
        format(t_seg_len[frt_idxs], sample_id, chr_id))

    for i_idx in frt_idxs:
        i_start = t_seg_start[i_idx]
        i_len = t_seg_len[i_idx]
        i_end = i_start + i_len
        i_rb_base = rb_base_chr[i_start:i_end]
        i_ref_map = rb_mappability_chr[i_start:i_end]
        i_pileup = bam_obj_whole.pileup('chr' + chr_id,
                                        start=i_start,
                                        stop=i_end,
                                        stepper='nofilter',
                                        min_base_quality=0,
                                        truncate=True)
        ref_rel_pos, f_mat = gen_feat_region(i_pileup, i_rb_base, i_ref_map,
                                             i_start, i_len)

        # one '#'-prefixed header line per region, followed by its matrix
        with open(out_chr_fname, 'a') as f:
            f.write('#{},{},{},{},{},{},{}\n'.format(chr_id, i_start, i_end,
                                                     i_len, 'NEU', 10,
                                                     ref_rel_pos))
            np.savetxt(f, f_mat, fmt='%-10.5f')
        # release the matrix before the next region is generated
        del f_mat
        del ref_rel_pos
    return 'Sample {} chr {}: neu features written to file'.format(
        sample_id, chr_id)
Example #4
0
    def cal_feat_segs(self,
                      ref_fasta_fn,
                      online_feat_segs_fn,
                      win_size=1000,
                      n_features=13,
                      min_r=0.1,
                      stride_size=200):
        """
        Segment the chromosome into windows, compute features and save them.

        The gap table from ``load_gap()`` splits chromosome ``self.chr_id``
        into gap / non-gap segments. Gap segments are recorded as
        unpredictable rows ``[start, end, length, 0]``. Non-gap segments are
        cut into ``win_size`` windows via ``seq_slide`` and each window is
        handed to ``self.__get_feats_region``. The three result arrays are
        written with ``np.savez_compressed``.

        :param ref_fasta_fn: path of the reference FASTA file
        :param online_feat_segs_fn: path of the ``.npz`` file to write
        :param win_size: window length in bp
        :param n_features: size of the last axis of the (initially empty)
            feature array
        :param min_r: forwarded unchanged to ``self.__get_feats_region``
        :param stride_size: forwarded to ``seq_slide`` as the window step
        :raises FileNotFoundError: if ``ref_fasta_fn`` does not exist
        """

        if not os.path.exists(ref_fasta_fn):
            raise FileNotFoundError(
                'Reference fasta file does not exist. {}'.format(ref_fasta_fn))

        logger.info('loading Reference fasta file...')
        ref_fa_obj = pysam.FastaFile(ref_fasta_fn)
        try:
            # reference base
            logger.info(
                'Loading reference sequence for sample {} chr: {}...'.format(
                    self.sample_id, self.chr_id))
            # Only the length is needed, so the sequence string is not kept.
            chr_len = len(ref_fa_obj.fetch('chr' + self.chr_id))
        finally:
            # the handle was previously left open for the whole run
            ref_fa_obj.close()

        ref_gap_obj = load_gap()
        ref_gap_chr = ref_gap_obj.loc[
            ref_gap_obj['CHROM'] == 'chr' + self.chr_id,
            ['START', 'END']] if not ref_gap_obj.empty else None

        # 1 = usable base, 0 = base inside a reference gap (END is excluded).
        # Slice assignment replaces the former concatenated index array
        # (which relied on the removed ``np.int`` alias); intervals reaching
        # past the chromosome end are clipped instead of raising.
        rb_base_pos = np.ones(chr_len, dtype=int)
        if ref_gap_chr is not None:
            for gap_start, gap_end in zip(ref_gap_chr['START'],
                                          ref_gap_chr['END']):
                rb_base_pos[gap_start:gap_end] = 0

        seg_values, seg_starts, seg_lengths = find_seg(rb_base_pos)
        assert len(seg_values) == len(seg_starts) == len(seg_lengths)

        logger.info(
            'segmenting {} bp-long {} bp step feature maps sample {}, chr {}...'
            .format(win_size, stride_size, self.sample_id, self.chr_id))
        logger.info(
            '>>>>>>>>>>this processing will take a few minutes (almost 180 minutes for chr 1)...'
        )

        # ``np.int`` was removed from NumPy; the builtin ``int`` is the same
        # dtype.
        self.__chr_segs_unpredictable = np.empty((0, 4), dtype=int)
        self.__chr_segs_predictable = np.empty((0, 4), dtype=int)
        self.__chr_segs_predictable_feats = np.empty((0, win_size, n_features))

        for i_val, i_seg_start, i_seg_len in zip(seg_values, seg_starts,
                                                 seg_lengths):
            if i_val == 0:
                # gap region; appended in place because
                # __get_feats_region may presumably add rows to the same
                # array - TODO confirm
                self.__chr_segs_unpredictable = np.append(
                    self.__chr_segs_unpredictable,
                    np.array([[
                        i_seg_start, i_seg_start + i_seg_len, i_seg_len, 0
                    ]],
                             dtype=int),
                    axis=0)
                continue

            if i_seg_len < win_size:
                # segment shorter than one window: hand it over whole
                self.__get_feats_region(i_seg_start, i_seg_start + i_seg_len,
                                        win_size, min_r)
                continue

            i_start_indices, remain_len = seq_slide(i_seg_len, win_size,
                                                    stride_size)
            for i in i_start_indices:
                # seq_slide offsets are relative to the segment start
                i_w_start = int(i + i_seg_start)
                i_w_end = int(i_w_start + win_size)

                logger.info('processing at {}'.format(i_w_start))

                self.__get_feats_region(i_w_start, i_w_end, win_size, min_r)

            if remain_len > 0:
                # Cover the leftover tail with one full window that ends at
                # the segment end. The segment start offset must be included,
                # otherwise the window points at the beginning of the
                # chromosome instead of at this segment.
                i_w_end = int(i_seg_start + i_seg_len)
                i_w_start = int(i_w_end - win_size)

                self.__get_feats_region(i_w_start, i_w_end, win_size, min_r)

        logger.info(
            'saving segments of {} bp-long {} bp step feature maps... {}'.
            format(win_size, stride_size, online_feat_segs_fn))
        np.savez_compressed(
            online_feat_segs_fn,
            chr_segs_unpredictable=self.__chr_segs_unpredictable,
            chr_segs_predictable=self.__chr_segs_predictable,
            chr_segs_predictable_feats=self.__chr_segs_predictable_feats)

        logger.info(
            'Done, saving the result file at {}'.format(online_feat_segs_fn))
Example #5
0
    def cal_feat_segs(self,
                      ref_fasta_fn,
                      online_feat_segs_fn,
                      n_features=13,
                      min_r=0.1,
                      stride_size=200):
        """
        Segment the chromosome into windows, compute features and save them.

        The gap table from ``load_gap()`` splits chromosome ``self.chr_id``
        into gap / non-gap segments. Gap segments become the unpredictable
        rows ``[start, end, length, 0]``. Non-gap segments shorter than
        ``self.win_size`` are processed whole; longer ones are cut into
        windows via ``seq_slide``, plus one shorter tail window when
        ``seq_slide`` reports a remainder. Each window is handed to
        ``self.__get_feats_region`` and the three result arrays are written
        with ``np.savez_compressed``.

        :param ref_fasta_fn: path of the reference FASTA file
        :param online_feat_segs_fn: path of the ``.npz`` file to write
        :param n_features: size of the last axis of the (initially empty)
            feature array
        :param min_r: forwarded unchanged to ``self.__get_feats_region``
        :param stride_size: forwarded to ``seq_slide`` as the window step
        :return: None
        :raises FileNotFoundError: if ``ref_fasta_fn`` does not exist
        """

        if not os.path.exists(ref_fasta_fn):
            raise FileNotFoundError(
                'Reference fasta file does not exist. {}'.format(ref_fasta_fn))

        logger.info('loading Reference fasta file...')
        ref_fa_obj = pysam.FastaFile(ref_fasta_fn)
        try:
            # reference base
            logger.info(
                'Loading reference sequence for sample {} chr: {}...'.format(
                    self.sample_id, self.chr_id))
            # Only the length is needed, so the sequence string is not kept.
            chr_len = len(ref_fa_obj.fetch('chr' + self.chr_id))
        finally:
            # the handle was previously left open for the whole run
            ref_fa_obj.close()

        ref_gap_obj = load_gap()
        ref_gap_chr = ref_gap_obj.loc[
            ref_gap_obj['CHROM'] == 'chr' + self.chr_id,
            ['START', 'END']] if not ref_gap_obj.empty else None

        # 1 = usable base, 0 = base inside a reference gap (END is excluded).
        # Slice assignment replaces the former concatenated index array
        # (which relied on the removed ``np.int`` alias); intervals reaching
        # past the chromosome end are clipped instead of raising.
        rb_base_pos = np.ones(chr_len, dtype=int)
        if ref_gap_chr is not None:
            for gap_start, gap_end in zip(ref_gap_chr['START'],
                                          ref_gap_chr['END']):
                rb_base_pos[gap_start:gap_end] = 0

        seg_values, seg_starts, seg_lengths = find_seg(rb_base_pos)
        assert len(seg_values) == len(seg_starts) == len(seg_lengths)

        # three disjoint classes: gaps, short usable runs, normal usable runs
        seg_gap_inds = np.where(seg_values == 0)[0]
        seg_val_len_less_inds = np.where((seg_values == 1)
                                         & (seg_lengths < self.win_size))[0]
        seg_val_normal_inds = np.where((seg_values == 1)
                                       & (seg_lengths >= self.win_size))[0]

        assert len(seg_val_len_less_inds) + len(seg_val_normal_inds) + len(
            seg_gap_inds) == len(seg_values)

        # The message has two placeholders; it used to receive four
        # arguments and printed win_size / stride_size as sample and chr.
        logger.info('calculating gap segments for sample {}, chr {}...'.format(
            self.sample_id, self.chr_id))
        # reshape keeps the (n, 4) layout even when there are no gap rows
        self.__chr_segs_unpredictable = np.array(
            [[i_gap_start, i_gap_start + i_gap_len, i_gap_len, 0]
             for i_gap_start, i_gap_len in zip(seg_starts[seg_gap_inds],
                                               seg_lengths[seg_gap_inds])],
            dtype=int).reshape(-1, 4)

        # ``np.int`` was removed from NumPy; the builtin ``int`` is the same
        # dtype.
        self.__chr_segs_predictable = np.empty((0, 4), dtype=int)
        self.__chr_segs_predictable_feats = np.empty(
            (0, self.win_size, n_features))

        logger.info(
            'calculating {} bp-long {} bp step feature maps sample {}, chr {}...'
            .format(self.win_size, stride_size, self.sample_id, self.chr_id))

        # short usable runs are processed whole
        val_seg_len_less_starts = seg_starts[seg_val_len_less_inds]
        val_seg_len_less_end = (val_seg_len_less_starts +
                                seg_lengths[seg_val_len_less_inds])
        val_seg_poss_zip = list(
            zip(val_seg_len_less_starts, val_seg_len_less_end))

        # slice the normal runs into win_size windows
        for i_seg_start, i_seg_len in zip(seg_starts[seg_val_normal_inds],
                                          seg_lengths[seg_val_normal_inds]):
            i_slice_starts, end_start, remain_len = seq_slide(
                i_seg_len, self.win_size, stride_size)

            # seq_slide only receives the segment length, so everything it
            # returns is relative to the segment start
            i_w_starts = i_seg_start + i_slice_starts
            val_seg_poss_zip.extend(zip(i_w_starts,
                                        i_w_starts + self.win_size))

            if remain_len > 0:
                # The tail start needs the same offset as the regular
                # windows; without it the tail pointed at the beginning of
                # the chromosome instead of at this segment.
                i_tail_start = i_seg_start + end_start
                val_seg_poss_zip.append(
                    (i_tail_start, i_tail_start + remain_len))

        for i_w_start, i_w_end in val_seg_poss_zip:
            i_w_len = i_w_end - i_w_start
            self.__get_feats_region(i_w_start, i_w_end, i_w_len, min_r)
            # NOTE(review): looks like a leftover debug trace limited to the
            # first 55 kb - confirm whether it is still wanted.
            if i_w_start < 55000:
                logger.info('process at {}'.format(i_w_start))

        # logged right before the write (it used to appear before the
        # features were even computed)
        logger.info(
            'saving segments of {} bp-long {} bp step feature maps... {}'.
            format(self.win_size, stride_size, online_feat_segs_fn))
        np.savez_compressed(
            online_feat_segs_fn,
            chr_segs_unpredictable=self.__chr_segs_unpredictable,
            chr_segs_predictable=self.__chr_segs_predictable,
            chr_segs_predictable_feats=self.__chr_segs_predictable_feats)

        logger.info(
            'Done, saving the result file at {}'.format(online_feat_segs_fn))
Example #6
0
    def cal_feat_segs(self, sample_id, chr_id, win_size=1000, min_r=0.1, stride_size=200,
                      online_feat_segs_fn=None, n_proc=16):
        """
        calculate feature map segments for given window

        Builds the feature matrix of the whole chromosome once
        (``gen_feat_whole_chr``), then splits the chromosome into gap /
        non-gap segments using ``self.ref_gap_obj``. Gap segments are stored
        as unpredictable rows ``[start, end, length, 0]``; non-gap segments
        are cut into ``win_size`` windows via ``seq_slide`` and each window
        is handed to ``self.__get_feats_region``.

        Side effects: sets ``self.sample_id``, ``self.chr_id`` and the three
        ``self.chr_segs_*`` results; ``self.rb_base_chr``,
        ``self.rb_mappability_chr`` and ``self.chr_feat_mat_whole`` are
        created and deleted again during the call.

        :param sample_id: sample identifier
        :param chr_id: chromosome name without the ``'chr'`` prefix
        :param win_size: window length in bp
        :param min_r: forwarded unchanged to ``self.__get_feats_region``
        :param stride_size: forwarded to ``seq_slide`` as the window step
        :param online_feat_segs_fn: if given, the results are also written to
            this ``.npz`` file
        :param n_proc: number of processor (currently unused in this method)
        """

        self.sample_id = sample_id
        self.chr_id = chr_id

        # assure the feature segmentation list is empty before adding the feature maps
        self.chr_segs_unpredictable = []
        self.chr_segs_predictable = []
        self.chr_segs_predictable_feats = []

        chrom = 'chr' + chr_id
        ref_gap_chr = self.ref_gap_obj.loc[self.ref_gap_obj['CHROM'] == chrom,
                                           ['START', 'END']] if not self.ref_gap_obj.empty else None

        # reference base
        logger.info('Loading reference sequence for sample {} chr: {}...'.format(sample_id, chr_id))
        self.rb_base_chr = self.ref_fa_obj.fetch(chrom)
        chr_len = len(self.rb_base_chr)

        # reference mappability
        logger.info('Loading reference mappability for sample {} chr: {}...'.format(sample_id, chr_id))
        self.rb_mappability_chr = self.ref_bw_obj.values(chrom, 0, chr_len)

        # 1 = usable base, 0 = base inside a reference gap (END is excluded).
        # Slice assignment replaces the former concatenated index array
        # (which relied on the removed ``np.int`` alias); intervals reaching
        # past the chromosome end are clipped instead of raising.
        rb_base_pos = np.ones(chr_len, dtype=int)
        if ref_gap_chr is not None:
            for gap_start, gap_end in zip(ref_gap_chr['START'], ref_gap_chr['END']):
                rb_base_pos[gap_start:gap_end] = 0

        # call pysam pileup() to read the whole chr
        logger.info('loading whole bam pileup for sample {} chr: {}...'.format(sample_id, chr_id))
        assert self.bam_obj_whole is not None
        whole_chr_pileup = self.bam_obj_whole.pileup(chrom, stepper='nofilter', min_base_quality=0)

        # get the feature map for the whole chr
        # take almost 25 minutes for  chr1
        logger.info('calculating feature maps for whole chromosome, sample {}, chr {}...'.format(sample_id, chr_id))
        logger.info('>>>>>>>>>>this processing will take a few minutes (almost 25 minutes for chr 1)...')
        self.chr_feat_mat_whole = gen_feat_whole_chr(whole_chr_pileup,
                                                     self.rb_base_chr,
                                                     chr_len,
                                                     self.rb_mappability_chr)
        # the raw inputs are no longer needed once the matrix exists
        del whole_chr_pileup
        del self.rb_base_chr
        del self.rb_mappability_chr

        # NOTE: an earlier per-column variant submitted gen_feat_single for
        # every pileup column through a ThreadPoolExecutor (its note said a
        # ProcessPoolExecutor did not work there); the commented-out code was
        # dropped in favour of gen_feat_whole_chr above.

        seg_values, seg_starts, seg_lengths = find_seg(rb_base_pos)
        assert len(seg_values) == len(seg_starts) == len(seg_lengths)

        logger.info('segmenting {} bp-long {} bp step feature maps sample {}, chr {}...'.format(
            win_size, stride_size, sample_id, chr_id))

        for i_val, i_seg_start, i_seg_len in zip(seg_values, seg_starts, seg_lengths):
            if i_val == 0:
                # gap region
                self.chr_segs_unpredictable.append(
                    np.array([i_seg_start, i_seg_start + i_seg_len, i_seg_len, 0]))
                continue

            if i_seg_len < win_size:
                # segment shorter than one window: hand it over whole
                self.__get_feats_region(i_seg_start, i_seg_start + i_seg_len, win_size, min_r)
                continue

            i_start_indices, remain_len = seq_slide(i_seg_len, win_size, stride_size)
            for i in i_start_indices:
                # seq_slide offsets are relative to the segment start
                i_w_start = int(i + i_seg_start)
                i_w_end = int(i_w_start + win_size)

                self.__get_feats_region(i_w_start, i_w_end, win_size, min_r)

            if remain_len > 0:
                # Cover the leftover tail with one full window that ends at
                # the segment end. The segment start offset must be included,
                # otherwise the window points at the beginning of the
                # chromosome instead of at this segment.
                i_w_end = int(i_seg_start + i_seg_len)
                i_w_start = int(i_w_end - win_size)
                self.__get_feats_region(i_w_start, i_w_end, win_size, min_r)

        del self.chr_feat_mat_whole
        gc.collect()

        # np.vstack raises on an empty sequence, e.g. for a chromosome
        # without any gap row; fall back to an empty (0, 4) table instead.
        if len(self.chr_segs_unpredictable) > 0:
            self.chr_segs_unpredictable = np.vstack(self.chr_segs_unpredictable)
        else:
            self.chr_segs_unpredictable = np.empty((0, 4), dtype=int)

        if len(self.chr_segs_predictable) > 0:
            self.chr_segs_predictable = np.vstack(self.chr_segs_predictable)
        else:
            # NOTE(review): assumes predictable rows have 4 columns like the
            # gap rows - confirm against __get_feats_region.
            self.chr_segs_predictable = np.empty((0, 4), dtype=int)

        self.chr_segs_predictable_feats = np.array(self.chr_segs_predictable_feats)

        if online_feat_segs_fn:
            logger.info('saving segments of {} bp-long {} bp step feature maps... {}'.format(win_size,
                                                                                             stride_size,
                                                                                             online_feat_segs_fn))
            np.savez_compressed(online_feat_segs_fn,
                                chr_segs_unpredictable=self.chr_segs_unpredictable,
                                chr_segs_predictable=self.chr_segs_predictable,
                                chr_segs_predictable_feats=self.chr_segs_predictable_feats)