def local_coverage(self, features, *args, **kwargs):
    processes = kwargs.pop('processes', None)
    if not processes:
        return _local_coverage(self.adapter, features, *args, **kwargs)

    if isinstance(features, (list, tuple)):
        raise ValueError(
            "only single features are supported for parallel "
            "local_coverage")

    # we don't want to have self.array do the binning
    bins = kwargs.pop('bins', None)

    # since if we got here processes is not None, then this will trigger
    # a parallel array creation
    features = helpers.tointerval(features)
    x = np.arange(features.start, features.stop)
    features = list(helpers.split_feature(features, processes))
    ys = self.array(
        features, *args, bins=None, processes=processes, ragged=True,
        **kwargs)

    # now we ravel() and re-bin
    y = np.column_stack(ys).ravel()
    if bins:
        xi, yi = rebin(x, y, bins)
        del x, y
        return xi, yi
    return x, y
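# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the parallel branch
# above relies on helpers.split_feature() to cut one interval into `processes`
# contiguous chunks, whose per-bp arrays are then concatenated and re-binned.
# The helper below is a minimal stand-in that shows the assumed chunking
# behavior; the real helpers.split_feature() may differ in details such as
# rounding or returning Interval objects rather than plain tuples.
def _example_split_interval(start, stop, n):
    """Yield (start, stop) pairs covering [start, stop) in n contiguous chunks."""
    import numpy as np  # np is already imported by this module; repeated so the sketch stands alone
    edges = np.linspace(start, stop, n + 1).astype(int)
    for left, right in zip(edges[:-1], edges[1:]):
        if right > left:
            yield (left, right)

# Example: list(_example_split_interval(0, 10, 3)) -> [(0, 3), (3, 6), (6, 10)]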
def _local_coverage(reader, features, read_strand=None, fragment_size=None,
                    shift_width=0, bins=None, use_score=False,
                    accumulate=True, preserve_total=False, method=None,
                    function="mean", zero_inf=True, zero_nan=True,
                    processes=None, stranded=True, verbose=False):
    """
    Returns a binned vector of coverage.

    Computes a 1D vector of coverage at the coordinates for each feature in
    `features`, extending each read by `fragment_size` bp.

    Some arguments cannot be used for bigWig files due to the structure of
    these files.  The parameters docstring below indicates whether or not an
    argument can be used with bigWig files.

    Depending on the arguments provided, this method can return a vector
    containing values from a single feature or from concatenated features.

    An example of the flexibility afforded by the latter case:

    `features` can be a 3-tuple of pybedtools.Intervals representing
    (TSS + 1kb upstream, gene, TTS + 1kb downstream) and `bins` can be
    [100, 1000, 100].  This will return a vector of length 1200 containing
    the three genomic intervals binned into 100, 1000, and 100 bins
    respectively.  Note that it is up to the caller to construct the right
    axes labels in the final plot!

    Parameters
    ----------
    features : str, interval-like object, or list

        Can be a single interval or an iterable yielding intervals.

        Interval-like objects must have chrom, start, and stop attributes,
        and optionally a strand attribute.  One exception to this is that if
        `features` is a single string, it can be of the form
        "chrom:start-stop" or "chrom:start-stop[strand]".

        If `features` is a single interval, then return a 1-D array for that
        interval.

        If `features` is an iterable of intervals, then return a 1-D array
        that is a concatenation of signal for these intervals.

        Available for bigWig.

    bins : None, int, list
        If `bins` is None, then each value in the returned array will
        correspond to one bp in the genome.

        If `features` is a single Interval, then `bins` is an integer or
        None.

        If `features` is an iterable of Intervals, `bins` is an iterable of
        integers of the same length as `features`.

        Available for bigWig.

    fragment_size : None or int
        If not None, then each item from the genomic signal (e.g., reads from
        a BAM file) will be extended `fragment_size` bp in the 3' direction.
        Higher fragment sizes will result in smoother signal.  Not available
        for bigWig.

    shift_width : int
        Each item from the genomic signal (e.g., reads from a BAM file) will
        be shifted `shift_width` bp in the 3' direction.  This can be useful
        for reconstructing a ChIP-seq profile, using the shift width
        determined from the peak-caller (e.g., modeled `d` in MACS).  Not
        available for bigWig.

    read_strand : None or str
        If `read_strand` is one of "+" or "-", then only items from the
        genomic signal (e.g., reads from a BAM file) on that strand will be
        considered and reads on the opposite strand ignored.  Useful for
        plotting genomic signal for stranded libraries.  Not available for
        bigWig.

    stranded : bool
        If True, then the profile will be reversed for features whose strand
        attribute is "-".

    use_score : bool
        If True, then each bin will contain the sum of the *score* attribute
        of genomic features in that bin instead of the *number* of genomic
        features falling within each bin.  Not available for bigWig.

    accumulate : bool
        If False, then only record *that* there was something there, rather
        than accumulating reads.  This is useful for making matrices with
        called peaks.  Available for bigWig.
    preserve_total : bool
        If True, re-scales the returned value so that each binned row's total
        is equal to the sum of the original, un-binned data.  The units of
        the returned array will be in "total per bin".  This is useful for,
        e.g., counting reads in features.  If `preserve_total` is False, then
        the returned array will have units of "density"; this is more
        generally useful and is the default behavior.  Available for bigWig,
        but not when using method="ucsc_summarize".

    method : str; one of [ "summarize" | "get_as_array" | "ucsc_summarize" ]
        Only used for bigWig.  The method specifies how data are extracted
        from the bigWig file.  "summarize" is the default.  It's quite fast,
        but may yield slightly different results when compared to running
        this same function on the BAM file from which the bigWig was created.

        "summarize" uses bx-python.  The values returned will not be exactly
        the same as the values returned when local_coverage is called on
        a BAM, BED, or bigBed file, but they will be close.  This method is
        quite fast, and is the default when bins is not None.

        "get_as_array" uses bx-python, but does a separate binning step.
        This can be slower than the other two methods, but the results are
        exactly the same as those from a BAM, BED, or bigBed file.  This
        method is always used if bins=None.

        "ucsc_summarize" is an alternative version of "summarize".  It uses
        the UCSC program `bigWigSummary`, which must already be installed and
        on your path.

    function : str; one of ['sum' | 'mean' | 'min' | 'max' | 'std']
        Determines the nature of the values returned.  Only valid if `method`
        is "summarize" or "ucsc_summarize", which also implies bigWig.
        Default is "mean".  If `method="ucsc_summarize"`, then there is an
        additional option for function, "coverage", which returns the percent
        of the region that is covered.

    zero_inf, zero_nan : bool
        Only used for bigWig.  If either is True, set any missing or inf
        values to zero before returning.

        If `method="ucsc_summarize"`, missing values are always reported as
        zero.  If `method="get_as_array"`, missing values are always reported
        as nan.

        Values can be -inf, inf, or nan for missing values when
        `method="summarize"` according to the following table:

        ========== ========================
        `function` missing values appear as
        ========== ========================
        "sum"      0
        "mean"     nan
        "min"      inf
        "max"      -inf
        "std"      nan
        ========== ========================

    processes : int or None
        The feature can be split across multiple processes.

    Returns
    -------

    1-d NumPy array

    Notes
    -----
    If a feature has a "-" strand attribute, then the resulting profile will
    be *relative to a minus-strand feature*.  That is, the resulting profile
    will be reversed.

    Returns arrays `x` and `y`.  `x` is in genomic coordinates, and `y` is
    the coverage at each of those coordinates after extending fragments.

    The total number of reads is guaranteed to be the same no matter how it's
    binned.

    (with ideas from
    http://www-huber.embl.de/users/anders/HTSeq/doc/tss.html)
    """
    # bigWig files are handled differently, so we need to know if we're
    # working with one; raise an exception if a kwarg was supplied that's not
    # supported.
if isinstance(reader, filetype_adapters.BigWigAdapter): is_bigwig = True defaults = ( ('read_strand', read_strand, None), ('fragment_size', fragment_size, None), ('shift_width', shift_width, 0), ('use_score', use_score, False), ('preserve_total', preserve_total, False), ) for name, check, default in defaults: if (((default is None) and (check is not default)) or (check != default)): raise ArgumentError("Argument '%s' not supported for bigWig" % name) if method == 'ucsc_summarize': if preserve_total: raise ArgumentError( "preserve_total=True not supported when using " "method='ucsc_summarize'") else: is_bigwig = False if isinstance(reader, filetype_adapters.BamAdapter): if use_score: raise ArgumentError("Argument 'use_score' not supported for " "bam") # e.g., features = "chr1:1-1000" if isinstance(features, basestring): features = helpers.tointerval(features) if not ((isinstance(features, list) or isinstance(features, tuple))): if bins is not None: if not isinstance(bins, int): raise ArgumentError("bins must be an int, got %s" % type(bins)) features = [features] bins = [bins] else: if bins is None: bins = [None for i in features] if not len(bins) == len(features): raise ArgumentError("bins must have same length as feature list") # nomenclature: # "window" is region we're getting data for # "alignment" is one item in that region # profiles = [] xs = [] for window, nbin in zip(features, bins): window = helpers.tointerval(window) chrom = window.chrom start = window.start stop = window.stop strand = window.strand if not is_bigwig: # Extend the window to catch reads that would extend into the # requested window _fs = fragment_size or 0 padded_window = pybedtools.Interval( chrom, max(start - _fs - shift_width, 0), stop + _fs + shift_width, ) window_size = stop - start # start off with an array of zeros to represent the window profile = np.zeros(window_size, dtype=float) for interval in reader[padded_window]: if read_strand: if interval.strand != read_strand: continue # Shift interval by modeled distance, if specified. if shift_width: if interval.strand == '-': interval.start -= shift_width interval.stop -= shift_width else: interval.start += shift_width interval.stop += shift_width # Extend fragment size from 3' if fragment_size: if interval.strand == '-': interval.start = interval.stop - fragment_size else: interval.stop = interval.start + fragment_size # Convert to 0-based coords that can be used as indices into # array start_ind = interval.start - start # If the feature goes out of the window, then only include the # part that's inside the window start_ind = max(start_ind, 0) # Same thing for stop stop_ind = interval.stop - start stop_ind = min(stop_ind, window_size) # Skip if the feature is shifted outside the window. This can # happen with large values of `shift_width`. 
if start_ind >= window_size or stop_ind < 0: continue # Finally, increment profile if use_score: score = float(interval.score) else: score = 1 if accumulate: if preserve_total: profile[start_ind:stop_ind] += (score / float( (stop_ind - start_ind))) else: profile[start_ind:stop_ind] += score else: profile[start_ind:stop_ind] = score else: # it's a bigWig profile = reader.summarize( window, method=method, function=function, bins=(nbin or len(window)), zero_inf=zero_inf, zero_nan=zero_nan, ) # If no bins, return genomic coords if (nbin is None): x = np.arange(start, stop) # Otherwise do the downsampling; resulting x is stll in genomic # coords else: if preserve_total: total = float(profile.sum()) if not is_bigwig or method == 'get_as_array': xi, profile = rebin(x=np.arange(start, stop), y=profile, nbin=nbin) if not accumulate: nonzero = profile != 0 profile[profile != 0] = 1 x = xi else: x = np.linspace(start, stop - 1, nbin) # Minus-strand profiles should be flipped left-to-right. if stranded and strand == '-': profile = profile[::-1] xs.append(x) if preserve_total: scale = profile.sum() / total profile /= scale profiles.append(profile) stacked_xs = np.hstack(xs) stacked_profiles = np.hstack(profiles) del xs del profiles return stacked_xs, stacked_profiles
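# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): a stripped-down
# version of the read-accumulation loop in _local_coverage() above, operating
# on a plain list of (start, stop, strand) tuples instead of a reader
# adapter.  It shows the order of operations applied to each read: shift by
# `shift_width` in the 3' direction, extend to `fragment_size` from the
# 5' end, clip to the window, then either accumulate counts or just mark
# presence.  The function name and its simplified signature are hypothetical.
def _example_pileup(reads, start, stop, shift_width=0, fragment_size=None,
                    accumulate=True):
    import numpy as np  # np is already imported by this module; repeated so the sketch stands alone
    window_size = stop - start
    profile = np.zeros(window_size, dtype=float)
    for r_start, r_stop, r_strand in reads:
        # Shift in the 3' direction (minus-strand reads shift left).
        if shift_width:
            offset = -shift_width if r_strand == '-' else shift_width
            r_start += offset
            r_stop += offset
        # Extend the read to the fragment size in the 3' direction.
        if fragment_size:
            if r_strand == '-':
                r_start = r_stop - fragment_size
            else:
                r_stop = r_start + fragment_size
        # Clip to the window and convert to array indices.
        start_ind = max(r_start - start, 0)
        stop_ind = min(r_stop - start, window_size)
        if start_ind >= window_size or stop_ind <= 0:
            continue
        if accumulate:
            profile[start_ind:stop_ind] += 1
        else:
            profile[start_ind:stop_ind] = 1
    return profile

# Example: _example_pileup([(5, 10, '+'), (8, 12, '+')], 0, 15) is 2 over
# positions 8 and 9, where the two reads overlap, and 1 over the rest of
# their extents.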
def _local_coverage(reader, features, read_strand=None, fragment_size=None, shift_width=0, bins=None, use_score=False, accumulate=True, preserve_total=False, method=None, processes=None, stranded=True, verbose=False): """ Returns a binned vector of coverage. Computes a 1D vector of coverage at the coordinates for each feature in `features`, extending each read by `fragmentsize` bp. Some arguments cannot be used for bigWig files due to the structure of these files. The parameters docstring below indicates whether or not an argument can be used with bigWig files. Depending on the arguments provided, this method can return a vector containing values from a single feature or from concatenated features. An example of the flexibility afforded by the latter case: `features` can be a 3-tuple of pybedtools.Intervals representing (TSS + 1kb upstream, gene, TTS + 1kb downstream) and `bins` can be [100, 1000, 100]. This will return a vector of length 1200 containing the three genomic intervals binned into 100, 1000, and 100 bins respectively. Note that is up to the caller to construct the right axes labels in the final plot! Parameters ---------- features : str, interval-like object, or list Can be a single interval or an iterable yielding intervals. Interval-like objects must have chrom, start, and stop attributes, and optionally a strand attribute. One exception to this that if `features` is a single string, it can be of the form "chrom:start-stop" or "chrom:start-stop[strand]". If `features` is a single interval, then return a 1-D array for that interval. If `features` is an iterable of intervals, then return a 1-D array that is a concatenation of signal for these intervals. Available for bigWig. bins : None, int, list If `bins` is None, then each value in the returned array will correspond to one bp in the genome. If `features` is a single Interval, then `bins` is an integer or None. If `features` is an iterable of Intervals, `bins` is an iterable of integers of the same length as `features`. Available for bigWig. fragment_size : None or int If not None, then each item from the genomic signal (e.g., reads from a BAM file) will be extended `fragment_size` bp in the 3' direction. Higher fragment sizes will result in smoother signal. Not available for bigWig. shift_width : int Each item from the genomic signal (e.g., reads from a BAM file) will be shifted `shift_width` bp in the 3' direction. This can be useful for reconstructing a ChIP-seq profile, using the shift width determined from the peak-caller (e.g., modeled `d` in MACS). Not available for bigWig. read_strand : None or str If `read_strand` is one of "+" or "-", then only items from the genomic signal (e.g., reads from a BAM file) on that strand will be considered and reads on the opposite strand ignored. Useful for plotting genomic signal for stranded libraries. Not available for bigWig. stranded : bool If True, then the profile will be reversed for features whose strand attribute is "-". use_score : bool If True, then each bin will contain the sum of the *score* attribute of genomic features in that bin instead of the *number* of genomic features falling within each bin. Not available for bigWig. accumulate : bool If False, then only record *that* there was something there, rather than acumulating reads. This is useful for making matrices with called peaks. Available for bigWig. preserve_total : bool If True, re-scales the returned value so that each binned row's total is equal to the sum of the original, un-binned data. 
The units of the returned array will be in "total per bin". This is useful for, e.g., counting reads in features. If `preserve_total` is False, then the returned array will have units of "density"; this is more generally useful and is the default behavior. Available for bigWig, but not when using method="ucsc_summarize". method : str; one of [ "summarize" | "get_as_array" | "ucsc_summarize" ] Only used for bigWig. The method specifies how data are extracted from the bigWig file. "summarize" is the default. It's quite fast, but may yield slightly different results when compared to running this same function on the BAM file from which the bigWig was created. "summarize" uses bx-python. The values returned will not be exactly the same as the values returned when local_coverage is called on a BAM, BED, or bigBed file, but they will be close. This method is quite fast, and is the default when bins is not None. "get_as_array" uses bx-python, but does a separate binning step. This can be slower than the other two methods, but the results are exactly the same as those from a BAM, BED, or bigBed file. This method is always used if bins=None. "ucsc_summarize" is an alternative version of "summarize". It uses the UCSC program `bigWigSummary`, which must already installed and on your path. processes : int or None The feature can be split across multiple processes. Returns ------- 1-d NumPy array Notes ----- If a feature has a "-" strand attribute, then the resulting profile will be *relative to a minus-strand feature*. That is, the resulting profile will be reversed. Returns arrays `x` and `y`. `x` is in genomic coordinates, and `y` is the coverage at each of those coordinates after extending fragments. The total number of reads is guaranteed to be the same no matter how it's binned. (with ideas from http://www-huber.embl.de/users/anders/HTSeq/doc/tss.html) """ # bigWig files are handled differently, so we need to know if we're working # with one; raise exeception if a kwarg was supplied that's not supported. 
if isinstance(reader, filetype_adapters.BigWigAdapter): is_bigwig = True defaults = ( ('read_strand', read_strand, None), ('fragment_size', fragment_size, None), ('shift_width', shift_width, 0), ('use_score', use_score, False), ('preserve_total', preserve_total, False), ) for name, check, default in defaults: if ( ((default is None) and (check is not default)) or (check != default) ): raise ArgumentError( "Argument '%s' not supported for bigWig" % name) if method == 'ucsc_summarize': if preserve_total: raise ArgumentError( "preserve_total=True not supported when using " "method='ucsc_summarize'") else: is_bigwig = False if isinstance(reader, filetype_adapters.BamAdapter): if use_score: raise ArgumentError("Argument 'use_score' not supported for " "bam") # e.g., features = "chr1:1-1000" if isinstance(features, basestring): features = helpers.tointerval(features) if not ((isinstance(features, list) or isinstance(features, tuple))): if bins is not None: if not isinstance(bins, int): raise ArgumentError( "bins must be an int, got %s" % type(bins)) features = [features] bins = [bins] else: if bins is None: bins = [None for i in features] if not len(bins) == len(features): raise ArgumentError( "bins must have same length as feature list") # nomenclature: # "window" is region we're getting data for # "alignment" is one item in that region # profiles = [] xs = [] for window, nbin in zip(features, bins): window = helpers.tointerval(window) chrom = window.chrom start = window.start stop = window.stop strand = window.strand if not is_bigwig: # Extend the window to catch reads that would extend into the # requested window _fs = fragment_size or 0 padded_window = pybedtools.Interval( chrom, max(start - _fs - shift_width, 0), stop + _fs + shift_width, ) window_size = stop - start # start off with an array of zeros to represent the window profile = np.zeros(window_size, dtype=float) for interval in reader[padded_window]: if read_strand: if interval.strand != read_strand: continue # Shift interval by modeled distance, if specified. if shift_width: if interval.strand == '-': interval.start -= shift_width interval.stop -= shift_width else: interval.start += shift_width interval.stop += shift_width # Extend fragment size from 3' if fragment_size: if interval.strand == '-': interval.start = interval.stop - fragment_size else: interval.stop = interval.start + fragment_size # Convert to 0-based coords that can be used as indices into # array start_ind = interval.start - start # If the feature goes out of the window, then only include the # part that's inside the window start_ind = max(start_ind, 0) # Same thing for stop stop_ind = interval.stop - start stop_ind = min(stop_ind, window_size) # Skip if the feature is shifted outside the window. This can # happen with large values of `shift_width`. 
if start_ind >= window_size or stop_ind < 0: continue # Finally, increment profile if use_score: score = float(interval.score) else: score = 1 if accumulate: if preserve_total: profile[start_ind:stop_ind] += ( score / float((stop_ind - start_ind))) else: profile[start_ind:stop_ind] += score else: profile[start_ind:stop_ind] = score else: # it's a bigWig profile = reader.summarize( window, method=method, bins=(nbin or len(window))) # If no bins, return genomic coords if (nbin is None): x = np.arange(start, stop) # Otherwise do the downsampling; resulting x is stll in genomic # coords else: if preserve_total: total = float(profile.sum()) if not is_bigwig or method == 'get_as_array': xi, profile = rebin( x=np.arange(start, stop), y=profile, nbin=nbin) if not accumulate: nonzero = profile != 0 profile[profile != 0] = 1 x = xi else: x = np.linspace(start, stop - 1, nbin) # Minus-strand profiles should be flipped left-to-right. if stranded and strand == '-': profile = profile[::-1] xs.append(x) if preserve_total: scale = profile.sum() / total profile /= scale profiles.append(profile) stacked_xs = np.hstack(xs) stacked_profiles = np.hstack(profiles) del xs del profiles return stacked_xs, stacked_profiles
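# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the binned branches
# above delegate to rebin() and then, when preserve_total=True, rescale the
# binned profile so that its sum matches the un-binned total.  The helper
# below is a simple mean-per-bin approximation of that idea; the real rebin()
# used in this module may use a different (e.g., interpolation-based) scheme,
# as noted in the docstrings.  The function name is hypothetical.
def _example_rebin_preserve_total(x, y, nbin, preserve_total=False):
    import numpy as np  # np is already imported by this module; repeated so the sketch stands alone
    total = float(y.sum())
    edges = np.linspace(0, len(y), nbin + 1).astype(int)
    xi = np.array([x[left] for left in edges[:-1]])
    yi = np.array([y[left:right].mean() if right > left else 0.0
                   for left, right in zip(edges[:-1], edges[1:])])
    if preserve_total and yi.sum() != 0:
        # Rescale so the binned total equals the un-binned total
        # ("total per bin" units rather than "density").
        yi *= total / yi.sum()
    return xi, yi

# Example: with y = np.ones(100) and nbin=10, the density version returns ten
# values of 1.0, while preserve_total=True returns ten values of 10.0 that
# sum to the original total of 100.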
def _local_coverage(reader, features, read_strand=None, fragment_size=None, shift_width=0, bins=None, use_score=False, accumulate=True, preserve_total=False, method=None, processes=None, stranded=True, verbose=False): """ Returns a binned vector of coverage. Computes a 1D vector of coverage at the coordinates for each feature in `features`, extending each read by `fragmentsize` bp. Some arguments cannot be used for bigWig files due to the structure of these files. The parameters docstring below indicates whether or not an argument can be used with bigWig files. Depending on the arguments provided, this method can return a vector containing values from a single feature or from concatenated features. An example of the flexibility afforded by the latter case: `features` can be a 3-tuple of pybedtools.Intervals representing (TSS + 1kb upstream, gene, TTS + 1kb downstream) and `bins` can be [100, 1000, 100]. This will return a vector of length 1200 containing the three genomic intervals binned into 100, 1000, and 100 bins respectively. Note that is up to the caller to construct the right axes labels in the final plot! Parameters ---------- features : str, interval-like object, or list Can be a single interval or an iterable yielding intervals. Interval-like objects must have chrom, start, and stop attributes, and optionally a strand attribute. One exception to this that if `features` is a single string, it can be of the form "chrom:start-stop" or "chrom:start-stop[strand]". If `features` is a single interval, then return a 1-D array for that interval. If `features` is an iterable of intervals, then return a 1-D array that is a concatenation of signal for these intervals. Available for bigWig. bins : None, int, list If `bins` is None, then each value in the returned array will correspond to one bp in the genome. If `features` is a single Interval, then `bins` is an integer or None. If `features` is an iterable of Intervals, `bins` is an iterable of integers of the same length as `features`. Available for bigWig. fragment_size : None or int If not None, then each item from the genomic signal (e.g., reads from a BAM file) will be extended `fragment_size` bp in the 3' direction. Higher fragment sizes will result in smoother signal. Not available for bigWig. shift_width : int Each item from the genomic signal (e.g., reads from a BAM file) will be shifted `shift_width` bp in the 3' direction. This can be useful for reconstructing a ChIP-seq profile, using the shift width determined from the peak-caller (e.g., modeled `d` in MACS). Not available for bigWig. read_strand : None or str If `read_strand` is one of "+" or "-", then only items from the genomic signal (e.g., reads from a BAM file) on that strand will be considered and reads on the opposite strand ignored. Useful for plotting genomic signal for stranded libraries. Not available for bigWig. stranded : bool If True, then the profile will be reversed for features whose strand attribute is "-". use_score : bool If True, then each bin will contain the sum of the *score* attribute of genomic features in that bin instead of the *number* of genomic features falling within each bin. Not available for bigWig. accumulate : bool If False, then only record *that* there was something there, rather than acumulating reads. This is useful for making matrices with called peaks. Available for bigWig. preserve_total : bool If True, re-scales the returned value so that each binned row's total is equal to the sum of the original, un-binned data. 
        The units of the returned array will be in "total per bin".  This is
        useful for, e.g., counting reads in features.  If `preserve_total` is
        False, then the returned array will have units of "density"; this is
        more generally useful and is the default behavior.  Available for
        bigWig, but not when using method="ucsc_summarize".

    method : str;
        all types:

        * one of ["mean_offset_coverage", "bin_covered", None]

        bigWig specific:

        * one of [ "summarize" | "get_as_array" | "ucsc_summarize" ]

        The method specifies how data are extracted from the bigWig file.
        "summarize" is the default.  It's quite fast, but may yield slightly
        different results when compared to running this same function on the
        BAM file from which the bigWig was created.

        "summarize" uses bx-python.  The values returned will not be exactly
        the same as the values returned when local_coverage is called on
        a BAM, BED, or bigBed file, but they will be close.  This method is
        quite fast, and is the default when bins is not None.

        "get_as_array" uses bx-python, but does a separate binning step.
        This can be slower than the other two methods, but the results are
        exactly the same as those from a BAM, BED, or bigBed file.  This
        method is always used if bins=None.

        "ucsc_summarize" is an alternative version of "summarize".  It uses
        the UCSC program `bigWigSummary`, which must already be installed and
        on your path.

        "mean_offset_coverage": splits the [start, stop] range into nbin
        bins, where each bin is the average coverage (per bp) around the bin
        center.  The `xs` array contains the bin centers.  If `accumulate` is
        True, this method reports the average coverage around each bin
        center; otherwise it reports the peak density in each bin.

        "bin_covered": splits the [start, stop] range into nbin bins, where
        each bin reports whether it was covered by at least one peak.  Bins
        are defined by the centers in the `xs` array.

        Note: the "mean_offset_coverage" and "bin_covered" methods differ
        from the default interpolation-based method.  The difference is most
        visible when the data are split into large bins covered by short
        peaks (e.g., raw BAM reads).  The default method also produces bins
        that are not symmetric with respect to the +/- strand direction.
        These two methods instead tend to:

        * generate the same bins for + and - strand data
        * give each bin a value that reflects the signal
          coverage/enrichment within that bin

        With the default method:

        * the last bin on the minus strand (the first on the plus strand)
          differs in length from the other bins, so merging + and - signal
          bins introduces a "shift" bias
        * each bin value is interpolated between the closest genomic offsets
          to its left and right, so the bin value does not represent the
          enrichment across the whole bin

        The default method is therefore a good choice when the bin length is
        close to, or smaller than, the length of the signal peaks.

    processes : int or None
        The feature can be split across multiple processes.

    Returns
    -------

    1-d NumPy array

    Notes
    -----
    If a feature has a "-" strand attribute, then the resulting profile will
    be *relative to a minus-strand feature*.  That is, the resulting profile
    will be reversed.

    Returns arrays `x` and `y`.  `x` is in genomic coordinates, and `y` is
    the coverage at each of those coordinates after extending fragments.

    The total number of reads is guaranteed to be the same no matter how it's
    binned.

    (with ideas from
    http://www-huber.embl.de/users/anders/HTSeq/doc/tss.html)
    """
    # bigWig files are handled differently, so we need to know if we're
    # working with one; raise an exception if a kwarg was supplied that's not
    # supported.
if isinstance(reader, filetype_adapters.BigWigAdapter): is_bigwig = True defaults = ( ('read_strand', read_strand, None), ('fragment_size', fragment_size, None), ('shift_width', shift_width, 0), ('use_score', use_score, False), ('preserve_total', preserve_total, False), ) for name, check, default in defaults: if (((default is None) and (check is not default)) or (check != default)): raise ArgumentError("Argument '%s' not supported for bigWig" % name) if method == 'ucsc_summarize': if preserve_total: raise ArgumentError( "preserve_total=True not supported when using " "method='ucsc_summarize'") else: is_bigwig = False if isinstance(reader, filetype_adapters.BamAdapter): if use_score: raise ArgumentError("Argument 'use_score' not supported for " "bam") # e.g., features = "chr1:1-1000" if isinstance(features, basestring): features = helpers.tointerval(features) if not ((isinstance(features, list) or isinstance(features, tuple))): if bins is not None: if not isinstance(bins, int): raise ArgumentError("bins must be an int, got %s" % type(bins)) features = [features] bins = [bins] else: if bins is None: bins = [None for i in features] if not len(bins) == len(features): raise ArgumentError("bins must have same length as feature list") # nomenclature: # "window" is region we're getting data for # "alignment" is one item in that region # profiles = [] xs = [] for window, nbin in zip(features, bins): window = helpers.tointerval(window) chrom = window.chrom start = window.start stop = window.stop strand = window.strand if not is_bigwig: # Extend the window to catch reads that would extend into the # requested window _fs = fragment_size or 0 padded_window = pybedtools.Interval( chrom, max(start - _fs - shift_width, 0), stop + _fs + shift_width, ) window_size = stop - start # start off with an array of zeros to represent the window profile = np.zeros(window_size, dtype=float) for interval in reader[padded_window]: if read_strand: if interval.strand != read_strand: continue # Shift interval by modeled distance, if specified. if shift_width: if interval.strand == '-': interval.start -= shift_width interval.stop -= shift_width else: interval.start += shift_width interval.stop += shift_width # Extend fragment size from 3' if fragment_size: if interval.strand == '-': interval.start = interval.stop - fragment_size else: interval.stop = interval.start + fragment_size # Convert to 0-based coords that can be used as indices into # array start_ind = interval.start - start # If the feature goes out of the window, then only include the # part that's inside the window start_ind = max(start_ind, 0) # Same thing for stop stop_ind = interval.stop - start stop_ind = min(stop_ind, window_size) # Skip if the feature is shifted outside the window. This can # happen with large values of `shift_width`. 
                if start_ind >= window_size or stop_ind < 0:
                    continue

                # Finally, increment profile
                if use_score:
                    score = float(interval.score)
                else:
                    score = 1

                if accumulate:
                    if preserve_total:
                        profile[start_ind:stop_ind] += (score / float(
                            (stop_ind - start_ind)))
                    else:
                        profile[start_ind:stop_ind] += score

                else:
                    profile[start_ind:stop_ind] = score

        else:  # it's a bigWig
            profile = reader.summarize(window, method=method,
                                       bins=(nbin or len(window)))

        # If no bins, return genomic coords
        if (nbin is None):
            x = np.arange(start, stop)

        # Otherwise do the downsampling; resulting x is still in genomic
        # coords
        else:
            if preserve_total:
                total = float(profile.sum())

            if method == 'mean_offset_coverage' or method == 'bin_covered':
                # Split the [start, stop] range into nbin bins, where each
                # bin represents the average peak coverage (per bp) around
                # the bin center.  The minus-strand profile is the reversed
                # plus-strand profile, so the bins need to be symmetric:
                #
                # * the start offset represents the
                #   [start, start + bin_size / 2) bin
                # * the stop offset represents the
                #   [stop - bin_size / 2, stop] bin
                # * the i-th bin is [center_i - bin_size / 2,
                #   center_i + bin_size / 2)
                size = stop - start
                assert size == len(profile)
                assert nbin > 2

                # Split into nbin + (nbin - 1) small bins.  This gives one
                # small bin near the start, one near the stop, and each pair
                # of inner small bins represents one normal bin.  Otherwise
                # we would need to be more careful when calculating indexes.
                bounds = np.linspace(0, size - 1, nbin * 2 - 1)

                # inner bins bounds indexes
                ib_bounds = zip(
                    np.ceil(bounds[1:-3:2]).astype(int),
                    np.ceil(bounds[3:-1:2]).astype(int) - 1)
                ib_centers = np.ceil(bounds[2:-1:2]).astype(int)

                profile = np.fromiter(itertools.chain(
                    (profile[0:max(1, ib_bounds[0][0])].mean(), ),
                    [profile[l:max(l, r) + 1].mean() for l, r in ib_bounds],
                    (profile[min(size - 1, ib_bounds[-1][1] + 1):].mean(), )),
                    float, count=nbin)

                x = np.fromiter(itertools.chain(
                    (start, ),
                    (start + offset for offset in ib_centers),
                    (stop - 1, )),
                    int, count=nbin)

                if method == 'bin_covered':
                    nonzero = profile != 0
                    profile[nonzero] = 1

            elif not is_bigwig or method == 'get_as_array':
                xi, profile = rebin(x=np.arange(start, stop), y=profile,
                                    nbin=nbin)
                if not accumulate:
                    nonzero = profile != 0
                    profile[nonzero] = 1
                x = xi

            else:
                x = np.linspace(start, stop - 1, nbin)

        # Minus-strand profiles should be flipped left-to-right.
        if stranded and strand == '-':
            profile = profile[::-1]
        xs.append(x)

        if preserve_total:
            scale = profile.sum() / total
            profile /= scale

        profiles.append(profile)

    stacked_xs = np.hstack(xs)
    stacked_profiles = np.hstack(profiles)
    del xs
    del profiles
    return stacked_xs, stacked_profiles
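# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): a simplified version
# of the "mean_offset_coverage" / "bin_covered" binning used above.  It
# splits a per-bp profile into nbin bins that are symmetric around their
# centers (half-width bins at the two ends), so reversing the profile for a
# minus-strand feature yields the same bin layout.  Index handling is
# simplified relative to the 2 * nbin - 1 boundary scheme in the real code,
# and the function name is hypothetical.
def _example_center_bins(profile, start, stop, nbin, covered_only=False):
    import numpy as np  # np is already imported by this module; repeated so the sketch stands alone
    size = stop - start
    assert size == len(profile) and nbin > 2
    # Bin centers in window coordinates, and half the spacing between them.
    centers = np.linspace(0, size - 1, nbin)
    half = (size - 1) / float(nbin - 1) / 2.0
    values = []
    for c in centers:
        left = int(max(np.ceil(c - half), 0))
        right = int(min(np.floor(c + half), size - 1))
        # Mean coverage in the window surrounding this bin center.
        values.append(profile[left:right + 1].mean())
    values = np.array(values)
    if covered_only:
        # "bin_covered" behavior: 1 if anything overlapped the bin, else 0.
        values = (values != 0).astype(float)
    x = start + centers.astype(int)
    return x, values

# Example: a single covered bp near the middle of the window contributes only
# to the bin whose center is closest to it, rather than being spread between
# two neighboring bins as in the interpolation-based default described in the
# docstring above.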