Ejemplo n.º 1
0
    def __init__(self, ref_fasta_fn, ref_map_fn):
        """
        Open the reference resources and reset the per-run attributes.

        :param ref_fasta_fn: path of the reference fasta file
        :param ref_map_fn: path of the reference mappability bigWig file
        :raises FileNotFoundError: if either path does not exist
        """

        # Both inputs must exist before anything is opened; the fasta file
        # is checked first.
        required_files = (
            (ref_fasta_fn, 'Reference fasta file does not exist. {}'),
            (ref_map_fn, 'Reference mappability bw file does not exist. {}'),
        )
        for path, template in required_files:
            if not os.path.exists(path):
                raise FileNotFoundError(template.format(path))

        logger.info('loading Reference fasta file...')
        fasta_handle = pysam.FastaFile(ref_fasta_fn)
        logger.info('loading Reference mappability bw file...')
        bigwig_handle = pyBigWig.open(ref_map_fn, 'r')

        # Attributes that are filled in later start out as None.
        self.bam_obj_whole = None
        self.rb_base_chr = None
        self.rb_mappability_chr = None

        self.ref_fa_obj = fasta_handle
        self.ref_bw_obj = bigwig_handle
        self.ref_gap_obj = load_gap()
Ejemplo n.º 2
0
    def load_deps(self, bam_fn, ref_fasta_fn, ref_map_fn, fmt='bam'):
        """
        Load the files this object depends on and keep the open handles.

        Only ``fmt == 'bam'`` is handled by the code visible here; any other
        value leaves ``self.bam_obj_whole`` untouched.

        :param bam_fn: alignment file (bam; per the original note, possibly
            also .cram)
        :param ref_fasta_fn: reference fasta file
        :param ref_map_fn: reference mappability file with specified k-mer
        :param fmt: alignment format selector, 'bam' by default
        :raises FileNotFoundError: if the fasta or mappability file is missing
        """

        # Lazily find the first missing input (fasta is checked first) and
        # stop there, so at most one error is reported.
        first_problem = next(
            (template.format(path)
             for template, path in (
                 ('Reference fasta file does not exist. {}', ref_fasta_fn),
                 ('Reference mappability bw file does not exist. {}',
                  ref_map_fn),
             )
             if not os.path.exists(path)),
            None)
        if first_problem is not None:
            raise FileNotFoundError(first_problem)

        logger.info('loading Reference fasta file...')
        fasta_handle = pysam.FastaFile(ref_fasta_fn)
        self.ref_fa_obj = fasta_handle

        logger.info('loading Reference mappability bw file...')
        bigwig_handle = pyBigWig.open(ref_map_fn, 'r')
        self.ref_bw_obj = bigwig_handle

        logger.info('loading Reference gap...')
        gap_table = load_gap()
        self.ref_gap_obj = gap_table

        logger.info('Loading bam file...')
        wants_bam = (fmt == 'bam')
        if wants_bam:
            self.bam_obj_whole = pysam.AlignmentFile(bam_fn, mode='rb')
Ejemplo n.º 3
0
    def cal_feat_segs(self,
                      ref_fasta_fn,
                      online_feat_segs_fn,
                      win_size=1000,
                      n_features=13,
                      min_r=0.1,
                      stride_size=200):
        """
        Split the chromosome into gap / non-gap segments, compute feature
        windows over the non-gap segments and save everything to one file.

        Gap segments are recorded as rows ``[start, end, length, 0]`` in the
        unpredictable-segment array. Every non-gap window is handed to
        ``self.__get_feats_region``. The three result arrays are written
        with ``np.savez_compressed``.

        :param ref_fasta_fn: reference fasta file; only the length of
            ``'chr' + self.chr_id`` is used here
        :param online_feat_segs_fn: output path given to
            ``np.savez_compressed``
        :param win_size: window length in bp
        :param n_features: size of the last axis of the feature array
        :param min_r: forwarded unchanged to ``self.__get_feats_region``
        :param stride_size: step between windows, forwarded to ``seq_slide``
        :raises FileNotFoundError: if ``ref_fasta_fn`` does not exist
        """

        if not os.path.exists(ref_fasta_fn):
            raise FileNotFoundError(
                'Reference fasta file does not exist. {}'.format(ref_fasta_fn))

        chrom = 'chr' + self.chr_id

        logger.info('loading Reference fasta file...')
        # The handle is only needed to measure the chromosome, so close it
        # as soon as the length is known.
        with pysam.FastaFile(ref_fasta_fn) as ref_fa_obj:
            # reference base
            logger.info(
                'Loading reference sequence for sample {} chr: {}...'.format(
                    self.sample_id, self.chr_id))
            chr_len = len(ref_fa_obj.fetch(chrom))

        # 1 marks a usable base, 0 marks a base inside a reference gap.
        rb_base_pos = np.ones(chr_len, dtype=int)
        ref_gap_obj = load_gap()
        if not ref_gap_obj.empty:
            ref_gap_chr = ref_gap_obj.loc[ref_gap_obj['CHROM'] == chrom,
                                          ['START', 'END']]
            for gap_start, gap_end in zip(ref_gap_chr['START'],
                                          ref_gap_chr['END']):
                # END is excluded; slice assignment avoids materialising
                # every gap position as an index array.
                rb_base_pos[int(gap_start):int(gap_end)] = 0

        seg_values, seg_starts, seg_lengths = find_seg(rb_base_pos)
        assert len(seg_values) == len(seg_starts) == len(seg_lengths)

        logger.info(
            'segmenting {} bp-long {} bp step feature maps sample {}, chr {}...'
            .format(win_size, stride_size, self.sample_id, self.chr_id))
        logger.info(
            '>>>>>>>>>>this processing will take a few minutes (almost 180 minutes for chr 1)...'
        )

        # The builtin int replaces np.int, which NumPy 1.24 removed.
        self.__chr_segs_unpredictable = np.empty((0, 4), dtype=int)
        self.__chr_segs_predictable = np.empty((0, 4), dtype=int)
        self.__chr_segs_predictable_feats = np.empty((0, win_size, n_features))

        for i_val, i_seg_start, i_seg_len in zip(seg_values, seg_starts,
                                                 seg_lengths):
            if i_val == 0:
                # gap region
                self.__chr_segs_unpredictable = np.append(
                    self.__chr_segs_unpredictable,
                    np.array([[
                        i_seg_start, i_seg_start + i_seg_len, i_seg_len, 0
                    ]],
                             dtype=int),
                    axis=0)
                continue

            if i_seg_len < win_size:
                # Segment shorter than one window: process it as a whole.
                self.__get_feats_region(i_seg_start, i_seg_start + i_seg_len,
                                        win_size, min_r)
                continue

            i_start_indices, remain_len = seq_slide(i_seg_len, win_size,
                                                    stride_size)
            for i in i_start_indices:
                # Slice starts are relative to the segment; shift them to
                # chromosome coordinates.
                i_w_start = int(i + i_seg_start)
                i_w_end = int(i_w_start + win_size)

                logger.info('processing at {}'.format(i_w_start))

                self.__get_feats_region(i_w_start, i_w_end, win_size, min_r)

            if remain_len > 0:
                # Final window aligned to the end of the segment. It must be
                # offset by the segment start like every other window.
                i_w_end = int(i_seg_start + i_seg_len)
                i_w_start = i_w_end - int(win_size)

                self.__get_feats_region(i_w_start, i_w_end, win_size, min_r)

        logger.info(
            'saving segments of {} bp-long {} bp step feature maps... {}'.
            format(win_size, stride_size, online_feat_segs_fn))
        np.savez_compressed(
            online_feat_segs_fn,
            chr_segs_unpredictable=self.__chr_segs_unpredictable,
            chr_segs_predictable=self.__chr_segs_predictable,
            chr_segs_predictable_feats=self.__chr_segs_predictable_feats)

        logger.info(
            'Done, saving the result file at {}'.format(online_feat_segs_fn))
Ejemplo n.º 4
0
    def cal_feat_segs(self,
                      ref_fasta_fn,
                      online_feat_segs_fn,
                      n_features=13,
                      min_r=0.1,
                      stride_size=200):
        """
        Split the chromosome into gap / non-gap segments, compute feature
        windows over the non-gap segments and save everything to one file.

        The window length is read from ``self.win_size``. Gap segments are
        stored as rows ``[start, end, length, 0]``. Non-gap segments shorter
        than one window are processed whole; longer ones are sliced with
        ``seq_slide``. Every resulting window is handed to
        ``self.__get_feats_region``.

        :param ref_fasta_fn: reference fasta file; only the length of
            ``'chr' + self.chr_id`` is used here
        :param online_feat_segs_fn: output path given to
            ``np.savez_compressed``
        :param n_features: size of the last axis of the feature array
        :param min_r: forwarded unchanged to ``self.__get_feats_region``
        :param stride_size: step between windows, forwarded to ``seq_slide``
        :return: None
        :raises FileNotFoundError: if ``ref_fasta_fn`` does not exist
        """

        if not os.path.exists(ref_fasta_fn):
            raise FileNotFoundError(
                'Reference fasta file does not exist. {}'.format(ref_fasta_fn))

        chrom = 'chr' + self.chr_id

        logger.info('loading Reference fasta file...')
        # The handle is only needed to measure the chromosome, so close it
        # as soon as the length is known.
        with pysam.FastaFile(ref_fasta_fn) as ref_fa_obj:
            # reference base
            logger.info(
                'Loading reference sequence for sample {} chr: {}...'.format(
                    self.sample_id, self.chr_id))
            chr_len = len(ref_fa_obj.fetch(chrom))

        # 1 marks a usable base, 0 marks a base inside a reference gap.
        rb_base_pos = np.ones(chr_len, dtype=int)
        ref_gap_obj = load_gap()
        if not ref_gap_obj.empty:
            ref_gap_chr = ref_gap_obj.loc[ref_gap_obj['CHROM'] == chrom,
                                          ['START', 'END']]
            for gap_start, gap_end in zip(ref_gap_chr['START'],
                                          ref_gap_chr['END']):
                # END is excluded; slice assignment avoids materialising
                # every gap position as an index array.
                rb_base_pos[int(gap_start):int(gap_end)] = 0

        seg_values, seg_starts, seg_lengths = find_seg(rb_base_pos)
        assert len(seg_values) == len(seg_starts) == len(seg_lengths)

        seg_gap_inds = np.where(seg_values == 0)[0]
        seg_val_len_less_inds = np.where((seg_values == 1)
                                         & (seg_lengths < self.win_size))[0]
        seg_val_normal_inds = np.where((seg_values == 1)
                                       & (seg_lengths >= self.win_size))[0]

        # Every segment must fall into exactly one of the three classes.
        assert len(seg_val_len_less_inds) + len(seg_val_normal_inds) + len(
            seg_gap_inds) == len(seg_values)

        logger.info('calculating gap segments for sample {}, chr {}...'.format(
            self.sample_id, self.chr_id))
        # reshape keeps the (n, 4) layout even when there is no gap segment.
        self.__chr_segs_unpredictable = np.array(
            [[i_gap_start, i_gap_start + i_gap_lens, i_gap_lens, 0]
             for i_gap_start, i_gap_lens in zip(seg_starts[seg_gap_inds],
                                                seg_lengths[seg_gap_inds])],
            dtype=int).reshape(-1, 4)

        # The builtin int replaces np.int, which NumPy 1.24 removed.
        self.__chr_segs_predictable = np.empty((0, 4), dtype=int)
        self.__chr_segs_predictable_feats = np.empty(
            (0, self.win_size, n_features))

        logger.info(
            'calculating {} bp-long {} bp step feature maps sample {}, chr {}...'
            .format(self.win_size, stride_size, self.sample_id, self.chr_id))

        # Segments shorter than one window are processed as a whole.
        val_seg_len_less_starts = seg_starts[seg_val_len_less_inds]
        val_seg_len_less_end = (val_seg_len_less_starts +
                                seg_lengths[seg_val_len_less_inds])
        val_seg_poss_zip = list(
            zip(val_seg_len_less_starts, val_seg_len_less_end))

        # slice the seg into win_size
        for i_seg_start, i_seg_len in zip(seg_starts[seg_val_normal_inds],
                                          seg_lengths[seg_val_normal_inds]):
            i_slice_starts, end_start, remain_len = seq_slide(
                i_seg_len, self.win_size, stride_size)

            # Slice starts are relative to the segment; shift them to
            # chromosome coordinates.
            i_w_starts = i_seg_start + np.asarray(i_slice_starts)
            val_seg_poss_zip.extend(zip(i_w_starts, i_w_starts + self.win_size))

            if remain_len > 0:
                # NOTE(review): seq_slide only receives the segment length,
                # so end_start is taken to be segment-relative like the
                # slice starts and is shifted the same way - confirm
                # against seq_slide.
                tail_start = i_seg_start + end_start
                val_seg_poss_zip.append((tail_start, tail_start + remain_len))

        for i_w_start, i_w_end in val_seg_poss_zip:
            i_w_len = i_w_end - i_w_start
            self.__get_feats_region(i_w_start, i_w_end, i_w_len, min_r)
            # NOTE(review): the 55000 cut-off looks like a debugging
            # leftover; kept so the log output stays the same.
            if i_w_start < 55000:
                logger.info('process at {}'.format(i_w_start))

        logger.info(
            'saving segments of {} bp-long {} bp step feature maps... {}'.
            format(self.win_size, stride_size, online_feat_segs_fn))
        np.savez_compressed(
            online_feat_segs_fn,
            chr_segs_unpredictable=self.__chr_segs_unpredictable,
            chr_segs_predictable=self.__chr_segs_predictable,
            chr_segs_predictable_feats=self.__chr_segs_predictable_feats)

        logger.info(
            'Done, saving the result file at {}'.format(online_feat_segs_fn))