def goto(self, gr: GR = None, gr2: GR = None):
    """
    Go to the range on the genome.

    Parameters
    ----------
    gr : {str, GenomeRange}, optional
        The range string, like "chr1:1000000-2000000", or GenomeRange object.
        When given (and truthy) it replaces ``self.current_range``.

    gr2 : {str, GenomeRange}, optional
        Second range, accepted in the same forms as ``gr``.
        When given (and truthy) it replaces ``self.current_range2``.

    Examples
    --------
    >>> frame = Frame()
    >>> frame.goto("chrX:3000000-5000000")
    >>> str(frame.current_range)
    'chrX:3000000-5000000'
    >>> frame.goto(GenomeRange("chr1", 1000, 2000))
    >>> str(frame.current_range)
    'chr1:1000-2000'
    """
    if gr:
        self.current_range = (
            gr if isinstance(gr, GenomeRange) else GenomeRange(gr)
        )
    if gr2:
        # The second range must be stored from gr2 itself, never from gr.
        self.current_range2 = (
            gr2 if isinstance(gr2, GenomeRange) else GenomeRange(gr2)
        )
def fetch_intervals(self, gr: GenomeRange):
    """
    Fetch the annotation records returned by a tabix query of a range.

    Parameters
    ----------
    gr : GenomeRange
        Range to query; its ``chrom``, ``start`` and ``end`` are passed to
        ``tabix_query``. If the first query yields nothing,
        ``gr.change_chrom_names()`` is called on this object and the query
        is repeated once.

    Returns
    -------
    intervals : pandas.core.frame.DataFrame
        Annotation interval table with the nine GTF-style columns plus a
        ``gene_name`` column ("" where no ``gene_name`` attribute matched).
    """
    rows = list(tabix_query(self.bgz_file, gr.chrom, gr.start, gr.end))
    if not rows:
        # Retry once with the alternative chromosome naming.
        gr.change_chrom_names()
        rows = list(tabix_query(self.bgz_file, gr.chrom, gr.start, gr.end))

    columns = [
        'seqname', 'source', 'feature', 'start', 'end',
        'score', 'strand', 'frame', 'attribute',
    ]
    df = pd.DataFrame(rows, columns=columns)
    df['start'] = df['start'].astype(int)
    df['end'] = df['end'].astype(int)

    gene_name = df['attribute'].str.extract(
        ".*gene_name (.*?) ").iloc[:, 0].str.strip('\";')
    # Assign the filled result back instead of calling fillna(inplace=True)
    # on a column selection: the chained in-place form does not update the
    # frame when pandas Copy-on-Write is active.
    df['gene_name'] = gene_name.fillna("")
    return df
def plot(self, ax, chrom_region, region_start, region_end):
    """
    Draw the bands overlapping the requested region as a broken-bar row.

    Parameters
    ----------
    ax
        Axes to draw on; also stored as ``self.ax``.
    chrom_region, region_start, region_end
        Chromosome name and coordinates used to build the GenomeRange.
    """
    self.ax = ax
    grange = GenomeRange(chrom_region, region_start, region_end)
    if grange.chrom not in self.interval_tree:
        # Fall back to the alternative chromosome naming.
        grange.change_chrom_names()

    chrom_tree = self.interval_tree[grange.chrom]
    bands = sorted(chrom_tree[grange.start:grange.end])
    height = self.properties['height']

    spans = []
    face_colors = []
    for band in bands:
        left, right = band.begin, band.end
        name, kind = band.data[:2]
        fill = self.lookup_band_color(kind)
        spans.append((left, right))
        face_colors.append(fill)
        # Band names are only drawn for regions shorter than 80 Mb.
        names_enabled = self.properties['show_band_name'] != 'no'
        if names_enabled and grange.length < 80_000_000:
            self.plot_text(name, left, right, fill)

    bars = BrokenBarHCollection(
        spans, (0, height),
        facecolors=face_colors,
        linewidths=self.properties['border_width'],
        edgecolors=self.properties['border_color'],
    )
    ax.add_collection(bars)
    ax.set_ylim(-0.1, height + 0.1)
    ax.set_xlim(region_start, region_end)
    self.plot_label()
def fetch_data(self, gr: GenomeRange, **kwargs):
    """
    Return the stored intervals around a range as sorted tuples.

    The query window is the range widened by 10000 on each side. Each hit
    is returned as ``(begin, end, data)``.
    """
    gr = to_gr(gr)
    known_chroms = list(self.interval_tree)
    if gr.chrom not in known_chroms:
        # Fall back to the alternative chromosome naming.
        gr.change_chrom_names()

    window = self.interval_tree[gr.chrom][gr.start - 10000:gr.end + 10000]
    hits = []
    for itv in sorted(window):
        hits.append((itv.begin, itv.end, itv.data))
    return hits
def fetch_data(self, gr: GenomeRange, **kwargs):
    """
    Return the bands overlapping a range as a DataFrame.

    Returns
    -------
    pandas.DataFrame
        Columns ``chrom``, ``start``, ``end``, ``band_name``, ``band_type``;
        one row per overlapping interval, in sorted interval order.
    """
    if gr.chrom not in self.interval_tree:
        # Fall back to the alternative chromosome naming.
        gr.change_chrom_names()

    overlapping = sorted(self.interval_tree[gr.chrom][gr.start:gr.end])
    # The first two entries of each interval's data are the band name and type.
    records = [
        [gr.chrom, band.begin, band.end, band.data[:2][0], band.data[:2][1]]
        for band in overlapping
    ]
    return pd.DataFrame(
        records,
        columns=['chrom', 'start', 'end', 'band_name', 'band_type'],
    )
def goto(self, gr1=None, gr2=None):
    """
    Set the pair of current ranges.

    Parameters
    ----------
    gr1 : optional
        First range; passed through ``GenomeRange`` when given. When
        omitted, the first entry of ``self.current_range`` is reused.
    gr2 : optional
        Second range; passed through ``GenomeRange`` when given. When
        omitted, it takes the same value as the first range.

    Raises
    ------
    ValueError
        If no range is available for either slot.
    """
    first = None if gr1 is None else GenomeRange(gr1)
    second = None if gr2 is None else GenomeRange(gr2)

    if first is None:
        # Reuse the previously stored first range.
        first = self.current_range[0]
    if second is None:
        second = first

    if first is None or second is None:
        raise ValueError("No history gr found.")
    self.current_range = [first, second]
def fetch_data(self, gr: GenomeRange):
    """
    Collect the x positions of vertical lines around a range.

    The query window is the range widened by 1 on each side. Each hit
    contributes its ``begin`` and, when different, its ``end``.
    """
    if gr.chrom not in list(self.vlines_intval_tree):
        # Fall back to the alternative chromosome naming.
        gr.change_chrom_names()

    chrom_tree = self.vlines_intval_tree[gr.chrom]
    positions = []
    for hit in sorted(chrom_tree[gr.start - 1:gr.end + 1]):
        positions.append(hit.begin)
        if hit.end != hit.begin:
            positions.append(hit.end)
    return positions
def goto(self, genome_range, who=None):
    """
    Move the browser to a genome range and refresh the widgets.

    Parameters
    ----------
    genome_range : {str, GenomeRange}
        Target range; a string is converted with ``GenomeRange``.
    who : optional
        Forwarded to ``self.widgets.refresh_widgets``.

    A range rejected by ``self.chrom_lengthes.check_range`` is logged as a
    warning and nothing is changed.
    """
    target = genome_range
    if isinstance(target, str):
        target = GenomeRange(target)

    if not self.chrom_lengthes.check_range(target):
        log.warning("The genome range {} is not valid.".format(target))
        return

    self.current_range = target
    # NOTE: frame's start is zero based
    zero_based_start = target.start - 1
    self.frame.goto(GenomeRange(target.chrom, zero_based_start, target.end))
    self.widgets.refresh_widgets(who=who)
def __intervaltree_from_list(self, region_list):
    """
    Build a per-chromosome dict of interval trees from a list of regions.

    Each item may be a range string, a ``(chrom, start, end)`` tuple or a
    GenomeRange. Every range is stored over ``start`` to ``end + 1`` with the
    GenomeRange itself as the interval data.

    Raises
    ------
    ValueError
        If an item is none of the accepted types.
    """
    trees = {}
    for item in region_list:
        if isinstance(item, str):
            region = GenomeRange(item)
        elif isinstance(item, tuple):
            region = GenomeRange(item[0], item[1], item[2])
        elif isinstance(item, GenomeRange):
            region = item
        else:
            raise ValueError("position must be a tuple or string.")
        chrom_tree = trees.setdefault(region.chrom, IntervalTree())
        chrom_tree[region.start:region.end + 1] = region
    return trees
def __init__(self, frame, reference_genome='hg19',
             init_range=None, widgets_box='simple',
             dpi=None, img_format='svg'):
    """
    Parameters
    ----------
    frame : coolbox.core.Frame
        Browser's main frame.

    reference_genome : str, optional
        Reference genome, built-in references: ('hg19', 'hg38', 'mm9', 'mm10').
        To use another genome, pass the path of a "chromosome length file":
        a tab-separated file whose first column is the chromosome name and
        whose second column is the length of that chromosome. ['hg19']

    init_range : str, optional
        Initial browser range.

    widgets_box : {'simple', 'full'}, optional
        WidgetsBox sub class, default SimpleWidgets

    dpi : int, optional
        The dpi of frame's image.

    img_format : str, optional
        Frame image format, default svg.

    Raises
    ------
    IOError
        If the resolved chromosome lengths are empty.
    NotImplementedError
        If ``widgets_box`` is neither 'simple' nor 'full'.
    """
    self.dpi = dpi
    self.img_format = img_format
    self.frame = frame

    if reference_genome in BUILT_IN_GENOMES:
        self.chrom_lengthes = BUILT_IN_GENOMES[reference_genome]
    else:
        # Anything that is not a built-in name is handed to GenomeLength.
        self.chrom_lengthes = GenomeLength(reference_genome)

    if len(self.chrom_lengthes) == 0:
        # The two sentences are separated by a space (the adjacent literals
        # previously ran together as "information.Please").
        raise IOError("chromosome lengthes file is not include any useful information. "
                      "Please check file \"{}\".".format(reference_genome))

    if init_range is not None:
        self.current_range = GenomeRange(init_range)
    else:
        self.current_range = self.get_init_range()

    if widgets_box == 'simple':
        self.widgets = SimpleWidgets(self)
    elif widgets_box == 'full':
        self.widgets = FullWidgets(self)
    else:
        raise NotImplementedError("widgets type {} not support, please use 'simple' or 'full'".format(widgets_box))

    self.goto(self.current_range)

    self.fig = None

    # cache figs in dict, speed up the figure display process.
    #   key: genome range
    #   value: fig image bytes
    self.fig_cache = {}
def __init__(self, hic_track_or_file, genome_position, args_hic=None, **kwargs):
    """
    Set up the track from a HiC track (or a file path) and a genome position.

    Parameters
    ----------
    hic_track_or_file
        A string is opened with ``HiCMat(path, **args_hic)``; anything else
        is used directly as the HiC track.
    genome_position
        Converted with ``GenomeRange`` and stored as ``self.position``.
    args_hic : dict, optional
        Extra keyword arguments for ``HiCMat`` when a path is given.
    **kwargs
        Override the default properties.
    """
    if not isinstance(hic_track_or_file, str):
        hic_track = hic_track_or_file
    else:
        args_hic = args_hic or {}
        hic_track = HiCMat(hic_track_or_file, **args_hic)

    defaults = {
        'hic': hic_track,
        'color': Virtual4C.DEFAULT_COLOR,
        'height': Virtual4C.DEFAULT_HEIGHT,
        'genome_position': genome_position,
        'bin_width': 3,
        'max_value': 'auto',
        'min_value': 'auto',
        'show_data_range': True,
        'data_range_style': 'y-axis',
        'style': 'line:1',
        'title': '',
    }
    defaults.update(kwargs)
    super().__init__(defaults)

    props = self.properties
    self.hic = props['hic']
    self.position = GenomeRange(props['genome_position'])
    self.bin_width = props['bin_width']
    # 'type' mirrors the 'style' property.
    props['type'] = props['style']
def fetch_data(self, gr: GenomeRange, **kwargs):
    """
    Load the rows of a range into a DataFrame with named key columns.

    Columns are named ``col_<i>`` except for the positions given by the
    ``col_chrom``, ``col_pos`` and ``col_pval`` properties, which become
    ``chrom``, ``pos`` and ``score``. An empty result keeps pandas' default
    column labels.
    """
    records = self.load_range(gr)
    if len(records) == 0:
        # Retry once with the alternative chromosome naming.
        gr.change_chrom_names()
        records = self.load_range(gr)

    table = pd.DataFrame(records)
    if table.shape[0] > 0:
        names = [f'col_{i}' for i in range(table.shape[1])]
        names[self.properties['col_chrom']] = "chrom"
        names[self.properties['col_pos']] = "pos"
        names[self.properties['col_pval']] = "score"
        table.columns = names
    return table
def plot(self, ax, chrom_region, start_region, end_region):
    """
    Plot binned scores for the region as a fill or as lines/points.

    The fill style is used when the ``type`` property is absent or equal to
    'fill'. The data range is drawn unless ``show_data_range`` is 'no'.

    Returns
    -------
    The axes stored in ``self.ax``.
    """
    self.ax = ax
    genome_range = GenomeRange(chrom_region, start_region, end_region)
    log.debug("plotting {}".format(self.properties['file']))

    num_bins = self.__get_bins_num()
    self.__check_chrom_name(genome_range)
    scores_per_bin = self.__get_scores_per_bin(genome_range, num_bins)
    x_values = np.linspace(genome_range.start, genome_range.end, num_bins)

    props = self.properties
    use_fill = 'type' not in props or props['type'] == 'fill'
    if use_fill:
        self.__plot_fill(scores_per_bin, x_values)
    else:
        self.__plot_line_or_points(scores_per_bin, x_values)

    ymin, ymax = self.__adjust_plot(genome_range)

    range_hidden = "show_data_range" in props and props["show_data_range"] == 'no'
    if not range_hidden:
        self.genome_range = genome_range
        self.plot_data_range(ymin, ymax, props['data_range_style'])

    self.plot_label()
    return self.ax
def plot(self, ax, chrom_region, start_region, end_region):
    """
    Draw the annotation intervals of a region as graphic features.

    Rows can be narrowed by the ``row_filter`` property (clauses separated
    by ";") and by ``length_ratio_thresh`` (minimum feature length as a
    fraction of the region length). Each feature gets a random colour from
    ``self.colors``.
    """
    self.ax = ax
    genome_range = GenomeRange(chrom_region, start_region, end_region)
    # The name `df` must stay as is: the eval'd filter expression refers to it.
    df = self.fetch_intervals(genome_range)

    if self.has_prop("row_filter"):
        for clause in self.properties["row_filter"].split(";"):
            try:
                # Split the clause at its first comparison character.
                split_at = list(re.finditer("[=><!]", clause))[0].start()
                column = clause[:split_at].strip()
                condition = clause[split_at:]
                # NOTE(review): eval() runs the filter text as Python code;
                # row_filter must come from a trusted source.
                df = eval(f'df[df["{column}"]{condition}]')
            except IndexError:
                log.warning(f"row filter {clause} is not valid.")

    region_length = end_region - start_region
    if self.has_prop("length_ratio_thresh"):
        min_ratio = self.properties["length_ratio_thresh"]
        df = df[(df["end"] - df["start"]) > region_length * min_ratio]

    features = []
    for _, row in df.iterrows():
        strand = 1 if row['strand'] == '+' else -1
        features.append(GraphicFeature(
            start=row['start'],
            end=row['end'],
            strand=strand,
            label=row['gene_name'],
            color=random.choice(self.colors),
        ))

    record = GraphicRecord(
        sequence_length=end_region - start_region,
        features=features,
        first_index=start_region,
    )
    record.plot(ax=ax, with_ruler=False, draw_line=False)
    self.plot_label()
def __intervaltree_from_list(self, vlines_list):
    """
    Build a per-chromosome dict of interval trees from vertical-line specs.

    Each item may be a range string, a tuple or a GenomeRange. For a tuple,
    the second element is used as both start and end of the range. Every
    range is stored over ``start`` to ``end + 1`` with the GenomeRange itself
    as the interval data.

    Raises
    ------
    ValueError
        If an item is none of the accepted types.
    """
    from intervaltree import IntervalTree

    trees = {}
    for spec in vlines_list:
        if isinstance(spec, str):
            line_range = GenomeRange(spec)
        elif isinstance(spec, tuple):
            chrom, position = spec[0], spec[1]
            line_range = GenomeRange(chrom, position, position)
        elif isinstance(spec, GenomeRange):
            line_range = spec
        else:
            raise ValueError("position must be a tuple or string.")
        chrom_tree = trees.setdefault(line_range.chrom, IntervalTree())
        chrom_tree[line_range.start:line_range.end + 1] = line_range
    return trees
def plot(self, ax, chrom_region, start_region, end_region):
    """
    Fetch the matrix for a region, draw it, and add colorbar and label.

    With the window style the fetched region is also stored as
    ``self.fetch_region``. The colorbar is drawn only when the ``color_bar``
    property is 'yes'; it is vertical when a ``y_ax`` attribute exists and
    the style is the window style, horizontal otherwise.
    """
    self.ax = ax
    self._out_of_bound = False

    log.debug("plotting {}".format(self.properties['file']))

    genome_range = GenomeRange(chrom_region, start_region, end_region)

    # fetch matrix and perform transform process
    windowed = self.style == STYLE_WINDOW
    if windowed:
        matrix, self.fetch_region = self.__fetch_window_matrix(genome_range)
    else:
        matrix = self.__fetch_matrix(genome_range)
    self.matrix = matrix

    # plot matrix
    img = self.__plot_matrix(genome_range)
    self.__adjust_figure(genome_range)

    # plot colorbar
    if self.properties['color_bar'] == 'yes':
        vertical = hasattr(self, 'y_ax') and self.style == STYLE_WINDOW
        orientation = 'vertical' if vertical else 'horizontal'
        self.__plot_colorbar(img, orientation=orientation)

    # plot label
    self.plot_label()
def __init__(self, *args, **kwargs):
    """
    Create a frame, optionally with an initial range and layout properties.

    Recognised keyword arguments: ``genome_range`` (GenomeRange or range
    string), ``width``, ``width_ratios``, ``margins`` and ``title``. The
    first three layout properties fall back to the ``Frame.DEFAULT_*``
    constants; ``title`` is only set when supplied.
    """
    super().__init__({}, OrderedDict())

    # init range
    initial = kwargs['genome_range'] if 'genome_range' in kwargs else None
    if 'genome_range' not in kwargs:
        self.current_range = None
    elif isinstance(initial, GenomeRange):
        self.current_range = initial
    else:
        # init from genome range string
        # e.g. `frame = Frame(genome_range="chr1:1000-2000")`
        self.current_range = GenomeRange(initial)

    # set properties
    props = self.properties
    props['width'] = (
        kwargs['width'] if 'width' in kwargs else Frame.DEFAULT_WIDTH)
    props['width_ratios'] = (
        kwargs['width_ratios'] if 'width_ratios' in kwargs
        else Frame.DEFAULT_WIDTH_RATIOS)
    props['margins'] = (
        kwargs['margins'] if 'margins' in kwargs else Frame.DEFAULT_MARGINS)
    if 'title' in kwargs:
        props['title'] = kwargs['title']
def fetch_intervals(self, genome_range: Union[str, GenomeRange]):
    """
    Fetch intervals within input chromosome range.

    If nothing is loaded for the range, the chromosome name is converted
    with ``change_chrom_names`` and the load is retried once.

    Returns
    -------
    pandas.DataFrame
        Columns ``chromsome``, ``start``, ``end``, ``score``.
    """
    chrom, start, end = split_genome_range(genome_range)
    records = self.__load(GenomeRange(chrom, start, end))
    if len(records) == 0:
        renamed = change_chrom_names(chrom)
        records = self.__load(GenomeRange(renamed, start, end))
    return pd.DataFrame(
        records,
        columns=['chromsome', 'start', 'end', 'score'],
    )
def fetch_intervals(self, gr: GenomeRange):
    """
    Fetch annotation records for a range and derive a display name per row.

    Parameters
    ----------
    gr : GenomeRange
        Range to query. If the first query yields nothing,
        ``gr.change_chrom_names()`` is called and the query is repeated.

    Returns
    -------
    intervals : pandas.core.frame.DataFrame
        Annotation interval table with an extra ``feature_name`` column.
        With ``name_attr`` set to "auto" (the default) the name comes from
        ``gene_name``, then ``gene_id``, then a "seqname:start-end" string;
        otherwise it comes from the attribute named by ``name_attr``.
    """
    records = []
    for rec in tabix_query(self.bgz_file, gr.chrom, gr.start, gr.end):
        records.append(rec)
    if not records:
        # Retry once with the alternative chromosome naming.
        gr.change_chrom_names()
        records.extend(
            tabix_query(self.bgz_file, gr.chrom, gr.start, gr.end))

    df = pd.DataFrame(records, columns=[
        'seqname', 'source', 'feature', 'start', 'end',
        'score', 'strand', 'frame', 'attribute',
    ])
    for coord in ('start', 'end'):
        df[coord] = df[coord].astype(int)

    attrs = df['attribute']
    name_attr = self.properties.get("name_attr", "auto")
    if name_attr != "auto":
        df['feature_name'] = attrs.str.extract(
            f".*{name_attr} (.*?) ").iloc[:, 0].str.strip('\";')
        return df

    names = attrs.str.extract(
        ".*gene_name (.*?) ").iloc[:, 0].str.strip('\";')
    if names.hasnans:
        # First fallback: the gene_id attribute.
        ids = attrs.str.extract(
            ".*gene_id (.*?) ").iloc[:, 0].str.strip('\";')
        names.fillna(ids, inplace=True)
    if names.hasnans:
        # Last fallback: the position of the record itself.
        locus = df['seqname'].astype(str) + ":" +\
            df['start'].astype(str) + "-" +\
            df['end'].astype(str)
        names.fillna(locus, inplace=True)
    df['feature_name'] = names
    return df
def chrom_dropdown_val_change(change):
    """
    Widget callback: switch chromosome while keeping the current coordinates.

    The new range is bounded by ``browser.chrom_lengthes.bound_range``
    before the browser navigates to it and refreshes.
    """
    previous = browser.current_range
    # only change chromosome
    candidate = GenomeRange(change['new'], previous.start, previous.end)
    bounded = browser.chrom_lengthes.bound_range(candidate)
    browser.goto(bounded, who='chromosomes_list')
    browser.refresh()
def plot(self, ax, chrom_region, start_region, end_region):
    """
    Plot the region as an alignment or as coverage.

    The ``plot_type`` property selects the mode; it defaults to "alignment"
    and any other value selects coverage.
    """
    gr = GenomeRange(chrom_region, start_region, end_region)
    mode = self.properties.get("plot_type", "alignment")
    self.ax = ax
    draw = self.plot_align if mode == "alignment" else self.plot_coverage
    draw(ax, gr)
def fetch_data(self, gr: GenomeRange, **kwargs) -> pd.DataFrame:
    """
    Load the rows of a range into a four-column DataFrame.

    If nothing is loaded, ``gr.chrom`` is replaced by
    ``change_chrom_names(gr.chrom)`` and the load is retried once.

    Returns
    -------
    pandas.DataFrame
        Columns ``chromsome``, ``start``, ``end``, ``score``.
    """
    records = self.load(gr)
    if len(records) == 0:
        gr.chrom = change_chrom_names(gr.chrom)
        records = self.load(gr)
    header = ['chromsome', 'start', 'end', 'score']
    return pd.DataFrame(records, columns=header)
def go_left(self, step_ratio=0.5, dry_run=False):
    """
    Shift the current window to the left.

    Parameters
    ----------
    step_ratio : float, optional
        Fraction of the window size to move by (truncated to an int).
    dry_run : bool, optional
        When true, return the bounded target range instead of navigating.
    """
    shift = int(self.window_size * step_ratio)
    current = self.current_range
    moved = GenomeRange(current.chrom, current.start - shift, current.end - shift)
    moved = self.chrom_lengthes.bound_range(moved)
    if not dry_run:
        self.goto(moved)
    else:
        return moved
def get_init_range(self, chrom=None):
    """
    Generate an initial range within a chromosome.

    Args:
        chrom (str, optional): initial chromosome; defaults to the first
            key of ``self.chrom_lengthes``.

    Return:
        (:obj:`GenomeRange`): range starting at 1 and ending at 10**7 or at
        the chromosome length, whichever is smaller.
    """
    if chrom is None:
        chrom = list(self.chrom_lengthes.keys())[0]

    default_length = 10**7
    if self.chrom_lengthes[chrom] > default_length:
        range_end = default_length
    else:
        range_end = self.chrom_lengthes[chrom]
    return GenomeRange(chrom, 1, range_end)
def plot(self, ax, chrom_region, start_region, end_region):
    """
    Draw vertical lines for the region across the current y-limits of `ax`.

    Line style, width, colour and alpha come from the track properties.
    """
    region = GenomeRange(chrom_region, start_region, end_region)
    positions = self.fetch_data(region)
    y_low, y_high = ax.get_ylim()
    props = self.properties
    ax.vlines(
        positions, y_low, y_high,
        linestyle=props['line_style'],
        linewidth=props['line_width'],
        color=props['color'],
        alpha=props['alpha'],
    )
def range_slider_val_change(change):
    """
    Widget callback: navigate to the range selected on the slider.

    If the new selection has zero or negative length, the end is reset so
    that the previous selection's length is kept.
    """
    old_start, old_end = change['old']
    previous_length = old_end - old_start
    start, end = change['new']
    chrom = browser.current_range.chrom
    if not end - start > 0:
        end = start + previous_length
    selected = browser.chrom_lengthes.bound_range(
        GenomeRange(chrom, start, end))
    browser.goto(selected, who='range_slider')
    browser.refresh()
def plot(self, ax, chrom_region, start_region, end_region):
    """
    Fetch plot data for the region and draw it as coverage, then the label.

    ``fetch_plot_data`` may return a ``(scores, x_values)`` tuple or the
    scores alone (x values are then ``None``). Nothing is drawn for ``None``.
    """
    self.ax = ax
    genome_range = GenomeRange(chrom_region, start_region, end_region)
    self.genome_range = genome_range

    fetched = self.fetch_plot_data(genome_range)
    if fetched is not None:
        if isinstance(fetched, tuple):
            scores, xs = fetched
        else:
            scores, xs = fetched, None
        self.plot_coverage(ax, genome_range, scores, xs)
    self.plot_label()
def zoom_out(self, zoom_ratio=2, dry_run=False):
    """
    Enlarge the current window around its center.

    Parameters
    ----------
    zoom_ratio : number, optional
        Factor by which the window size is multiplied.
    dry_run : bool, optional
        When true, return the bounded target range without navigating.

    Returns
    -------
    GenomeRange or None
        The target range when ``dry_run`` is true, otherwise ``None``.
    """
    window_size = self.window_size * zoom_ratio
    start = self.center - window_size // 2
    end = start + window_size
    genome_range = GenomeRange(self.current_range.chrom, start, end)
    genome_range = self.chrom_lengthes.bound_range(genome_range)
    # Navigate exactly once, and only when this is not a dry run.
    if dry_run:
        return genome_range
    self.goto(genome_range)
def plot(self, ax, gr: GenomeRange, **kwargs):
    """
    Draw vertical lines for a range across the current y-limits of `ax`.

    ``gr`` is passed through ``GenomeRange`` before the positions are
    fetched. Line style, width, colour and alpha come from the properties.
    """
    gr = GenomeRange(gr)
    positions = self.fetch_data(gr)
    y_low, y_high = ax.get_ylim()
    props = self.properties
    ax.vlines(
        positions, y_low, y_high,
        linestyle=props['line_style'],
        linewidth=props['line_width'],
        color=props['color'],
        alpha=props['alpha'],
    )
def __init__(self, hicmat: Union[str, HicMatBase], genome_position: str, args_hic: dict = None, **kwargs):
    """
    Set up the track from a HiC matrix and a genome position.

    Properties start from a copy of ``Virtual4C.DEFAULT_PROPERTIES``, then
    take ``genome_position``, then any ``kwargs`` (which win on conflict).
    """
    merged = Virtual4C.DEFAULT_PROPERTIES.copy()
    overrides = {"genome_position": genome_position}
    overrides.update(kwargs)
    merged.update(overrides)
    super().__init__(hicmat, args_hic, **merged)

    props = self.properties
    self.position = GenomeRange(props['genome_position'])
    self.bin_width = props['bin_width']