def test_builtin_bin(self, tmpdir):
    """Re-binning must keep the total edge weight and produce unique edges.

    A two-region object is written to disk, reloaded and re-binned with
    ``hic.bin(50)``. The binned object must not contain any
    ``(source, sink)`` pair twice, and the sum of its edge weights must
    equal that of the original object.
    """
    dest_file = os.path.join(str(tmpdir), "hic.h5")

    hic = self.hic_class(file_name=dest_file, mode='w')
    hic.add_region(GenomicRegion(chromosome='chr1', start=1, end=100))
    hic.add_region(GenomicRegion(chromosome='chr1', start=101, end=200))
    hic.flush()
    for edge in ([0, 0, 12], [0, 1, 36], [1, 1, 24]):
        hic.add_edge(edge)
    hic.flush()
    hic.close()

    hic = load(dest_file, mode='r')
    binned = None
    try:
        binned = hic.bin(50)

        original_reads = sum(edge.weight for edge in hic.edges())

        # Search for duplicated edges while summing the binned weights.
        # Each key is recorded after it has been checked; without that the
        # membership assertion could never fail.
        new_reads = 0
        seen_edges = set()
        for edge in binned.edges():
            edge_key = (edge.source, edge.sink)
            assert edge_key not in seen_edges
            seen_edges.add(edge_key)
            new_reads += edge.weight

        # make sure that the total number of reads stays the same
        assert original_reads == new_reads
    finally:
        # release the file handles even when an assertion fails
        hic.close()
        if binned is not None:
            binned.close()
def _loop_regions_from_bedpe(bedpe):
    """
    Convert the records of a BEDPE-like object into pairs of anchor regions.

    :param bedpe: Object whose ``regions`` attribute iterates over records
                  exposing ``chromosome1``/``start1``/``end1`` and
                  ``chromosome2``/``start2``/``end2``
    :return: List of ``(GenomicRegion, GenomicRegion)`` tuples, one per
             record, in iteration order
    """
    return [
        (
            GenomicRegion(chromosome=record.chromosome1,
                          start=record.start1, end=record.end1),
            GenomicRegion(chromosome=record.chromosome2,
                          start=record.start2, end=record.end2),
        )
        for record in bedpe.regions
    ]
def plot(self, regions):
    """
    Plot the given region(s) on this object's axes.

    :param regions: Either a single region (a :class:`~GenomicRegion` or a
                    region string) that is used for both axes, or a tuple
                    ``(x_region, y_region)``
    :return: The return value of ``_plot``; if that is None, the tuple
             ``(self.fig, self.ax)``
    """
    if isinstance(regions, tuple):
        raw_x, raw_y = regions
    else:
        raw_x = raw_y = regions

    # region strings are parsed separately for each axis
    region_pair = tuple(
        GenomicRegion.from_string(raw) if isinstance(raw, string_types) else raw
        for raw in (raw_x, raw_y)
    )
    self._current_chromosome_x = region_pair[0].chromosome
    self._current_chromosome_y = region_pair[1].chromosome

    # fall back to the current matplotlib axes if none was assigned
    if self.ax is None:
        self.ax = plt.gca()

    self._before_plot(region_pair)
    plot_output = self._plot(region_pair)
    self._after_plot(region_pair)

    return (self.fig, self.ax) if plot_output is None else plot_output
def setup_method(self, method):
    """Create the Hi-C fixtures shared by the tests of this class.

    ``self.hic`` holds 12 regions (5 on chr1, 3 on chr2, 4 on chr3) and one
    edge for every region pair ``i <= j`` with weights 1, 2, 3, ... in
    row-major order. ``self.hic_cerevisiae`` is loaded read-only from the
    test data directory next to this file.
    """
    self.dir = os.path.dirname(os.path.realpath(__file__))

    hic = Hic()

    # (chromosome, exclusive upper bound of starts, step between starts);
    # every region is 1000 long regardless of the step
    layout = (("chr1", 5000, 1000), ("chr2", 3000, 1000), ("chr3", 2000, 500))
    nodes = [
        GenomicRegion(chromosome=name, start=start, end=start + 1000 - 1)
        for name, stop, step in layout
        for start in range(1, stop, step)
    ]
    hic.add_regions(nodes)

    # upper triangle including the diagonal, with increasing weights
    n_nodes = len(nodes)
    index_pairs = ((i, j) for i in range(n_nodes) for j in range(i, n_nodes))
    edges = [
        Edge(source=i, sink=j, weight=weight)
        for weight, (i, j) in enumerate(index_pairs, start=1)
    ]
    hic.add_edges(edges)

    self.hic = hic
    self.hic_cerevisiae = load(
        self.dir + "/test_matrix/cerevisiae.chrI.HindIII_upgrade.hic",
        mode='r')
    self.hic_class = Hic
def sample_fa_hic(file_name=None, zero_indices=None, tmpdir=None):
    """
    Build a sample Hi-C object with 120 regions and increasing edge weights.

    Regions: 50 on chr1, 30 on chr2 and 40 on chr3 (chr3 regions start
    every 500 but are 1000 long). One edge is created for every region
    pair ``i <= j`` unless either index is listed in ``zero_indices``.

    :param file_name: Passed through to :class:`~Hic`
    :param zero_indices: Container of region indices that must not take
                         part in any edge. None (default) excludes nothing.
    :param tmpdir: Passed through to :class:`~Hic`
    :return: The populated :class:`~Hic` object, opened with ``mode='w'``
    """
    # None instead of a shared mutable ``set()`` default
    excluded = frozenset() if zero_indices is None else zero_indices

    hic = Hic(file_name=file_name, tmpdir=tmpdir, mode='w')

    # (chromosome, exclusive upper bound of starts, step between starts)
    layout = (("chr1", 50000, 1000), ("chr2", 30000, 1000),
              ("chr3", 20000, 500))
    nodes = [
        GenomicRegion(chromosome=name, start=start, end=start + 1000 - 1)
        for name, stop, step in layout
        for start in range(1, stop, step)
    ]
    hic.add_regions(nodes)

    # add some edges with increasing weight for testing
    edges = []
    weight = 1
    n_nodes = len(nodes)
    for i in range(n_nodes):
        for j in range(i, n_nodes):
            if i not in excluded and j not in excluded:
                edges.append(Edge(source=i, sink=j, weight=weight))
            # NOTE(review): the counter advances for every pair, so the
            # weight of (i, j) does not depend on ``zero_indices`` --
            # confirm this matches the original indentation.
            weight += 1
    hic.add_edges(edges)

    return hic
def test_masked_matrix(self):
    """Rows/columns of regions without any edge must be masked in the matrix.

    Twelve regions are created (5 on chr1, 3 on chr2, 4 on chr3) and edges
    are added for all pairs that involve neither region 1 nor region 5.
    In the matrix returned by ``hic.matrix()`` exactly the entries in
    rows/columns 1 and 5 must be masked.
    """
    hic = Hic()

    # (chromosome, exclusive upper bound of starts, step between starts)
    layout = (("chr1", 5000, 1000), ("chr2", 3000, 1000), ("chr3", 2000, 500))
    nodes = [
        GenomicRegion(chromosome=name, start=start, end=start + 1000 - 1)
        for name, stop, step in layout
        for start in range(1, stop, step)
    ]
    hic.add_regions(nodes)

    masked = {1, 5}
    not_masked = {0, 2, 3, 4, 6, 7, 8, 9, 10, 11}

    # add some edges with increasing weight for testing, leaving out
    # every pair that touches a region that is supposed to end up masked
    edges = []
    weight = 1
    n_nodes = len(nodes)
    for i in range(n_nodes):
        for j in range(i, n_nodes):
            if i not in masked and j not in masked:
                edges.append(Edge(source=i, sink=j, weight=weight))
            weight += 1
    hic.add_edges(edges)

    m = hic.matrix()
    hic.close()

    # check masking
    for i in range(m.shape[0]):
        for masked_ix in sorted(masked):
            assert np.ma.is_masked(m[masked_ix, i])
            assert np.ma.is_masked(m[i, masked_ix])

    # check not masked
    for j in not_masked:
        for i in range(m.shape[0]):
            if i not in masked:
                assert not np.ma.is_masked(m[i, j])
                assert not np.ma.is_masked(m[j, i])
            else:
                assert np.ma.is_masked(m[i, j])
                assert np.ma.is_masked(m[j, i])
def setup_method(self, method):
    """Create two eager and two lazy fragment reads for the tests.

    The lazy reads share one table-like ``row`` dictionary and a minimal
    stand-in for the pairs object that only provides the
    ``_ix_to_chromosome`` mapping.
    """
    self.read1 = FragmentRead(
        GenomicRegion(start=1, end=1000, chromosome='chr1', strand=1, ix=0),
        position=500, strand=1)
    self.read2 = FragmentRead(
        GenomicRegion(start=1001, end=2000, chromosome='chr2', strand=-1,
                      ix=1),
        position=1200, strand=1)

    class DummyPairs(object):
        def __init__(self):
            self._ix_to_chromosome = {0: 'chr1', 1: 'chr2'}

    row = {
        'ix': 0,
        'left_read_position': 500,
        'left_read_strand': 1,
        'left_fragment': 0,
        'left_fragment_start': 1,
        'left_fragment_end': 1000,
        'left_fragment_chromosome': 0,
        'right_read_position': 1200,
        'right_read_strand': -1,
        'right_fragment': 1,
        'right_fragment_start': 1001,
        'right_fragment_end': 2000,
        'right_fragment_chromosome': 1,
    }

    pairs_stub = DummyPairs()
    self.lazy_read1 = LazyFragmentRead(row, pairs_stub, side='left')
    self.lazy_read2 = LazyFragmentRead(row, pairs_stub, side='right')
def loop_strength(hic, loop_regions, pixels=16, **kwargs):
    """
    Compute log2 ratios of loop signal over shifted control regions.

    For every anchor pair two control pairs are generated by shifting the
    anchors away from each other by the distance between the anchor
    centers. The mean of each extracted matrix is computed and the loop
    mean is divided by the mean of the available control(s).

    :param hic: Hi-C object handed to ``_loop_matrix_iterator``
    :param loop_regions: Iterable of ``(region1, region2)`` tuples or a
                         ``Bedpe`` object
    :param pixels: Passed on to ``_loop_matrix_iterator``
    :param kwargs: Passed on to ``_loop_matrix_iterator``. ``log``, ``norm``
                   and ``oe`` default to False, True and True. The key
                   ``keep_invalid`` is consumed here: if true, loops
                   without a usable ratio yield ``nan`` instead of being
                   dropped from the result.
    :return: List of log2 ratios
    """
    kwargs.setdefault('log', False)
    kwargs.setdefault('norm', True)
    kwargs.setdefault('oe', True)
    include_nan = kwargs.pop('keep_invalid', False)

    if isinstance(loop_regions, Bedpe):
        loop_regions = _loop_regions_from_bedpe(loop_regions)

    # every loop contributes three consecutive pairs:
    # the loop itself, the left control and the right control
    region_triplets = []
    for anchor1, anchor2 in loop_regions:
        shift = int(abs(anchor1.center - anchor2.center))
        left_control = GenomicRegion(chromosome=anchor1.chromosome,
                                     start=anchor1.start - shift,
                                     end=anchor1.end - shift)
        right_control = GenomicRegion(chromosome=anchor1.chromosome,
                                      start=anchor2.start + shift,
                                      end=anchor2.end + shift)
        region_triplets.extend([
            (anchor1, anchor2),
            (left_control, anchor1),
            (anchor2, right_control),
        ])

    # mean over the unmasked entries of every matrix, None if unavailable
    means = []
    for _, m in _loop_matrix_iterator(hic, region_triplets, pixels=pixels,
                                      keep_invalid=True, **kwargs):
        if m is None:
            means.append(None)
        else:
            means.append(
                float(np.nansum(m)/np.nansum(np.logical_not(m.mask))))

    original, left, right = means[0::3], means[1::3], means[2::3]

    ratios = []
    for i, loop_mean in enumerate(original):
        if loop_mean is None or (left[i] is None and right[i] is None):
            if include_nan:
                ratios.append(np.nan)
            continue

        left_mean, right_mean = left[i], right[i]
        try:
            if left_mean is None:
                background = right_mean
            elif right_mean is None:
                background = left_mean
            else:
                background = (left_mean + right_mean)/2
            ratios.append(np.log2(loop_mean/background))
        except ZeroDivisionError:
            if include_nan:
                ratios.append(np.nan)

    return ratios
def region_pairs(self, pairs=None):
    """
    Retrieve or set the regions used to generate the aggregate matrix.

    When ``pairs`` is given, any existing ``region_pairs`` table is replaced
    by a new one containing the supplied pairs. In both cases the pairs are
    then read back from the table and returned.

    :param pairs: Iterable of region tuples of the form
                  [(region1, region2), (region3, region4), ...].
                  If None, simply return the region pairs in this object.
    :return: List of region pairs
             [(region1, region2), (region3, region4), ...].
             Chromosome names are returned as text, and a strand of None
             is stored (and hence returned) as 0.
    """
    if pairs is not None:
        # replace a previously stored table, if there is one
        try:
            self.file.remove_node(self._group, 'region_pairs')
        except tables.NoSuchNodeError:
            pass

        pairs_table = self.file.create_table(
            self._group, 'region_pairs',
            description={
                'chromosome1': tables.StringCol(50, pos=0),
                'start1': tables.Int32Col(pos=1),
                'end1': tables.Int32Col(pos=2),
                'strand1': tables.Int32Col(pos=3),
                'chromosome2': tables.StringCol(50, pos=4),
                'start2': tables.Int32Col(pos=5),
                'end2': tables.Int32Col(pos=6),
                'strand2': tables.Int32Col(pos=7)
            })

        row = pairs_table.row
        for r1, r2 in pairs:
            row['chromosome1'] = r1.chromosome
            row['start1'] = r1.start
            row['end1'] = r1.end
            row['strand1'] = r1.strand if r1.strand is not None else 0
            row['chromosome2'] = r2.chromosome
            row['start2'] = r2.start
            row['end2'] = r2.end
            row['strand2'] = r2.strand if r2.strand is not None else 0
            row.append()
        pairs_table.flush()

    def _as_text(value):
        # string columns come back as bytes on Python 3; decode them so
        # chromosome names compare equal to ordinary strings
        return value.decode() if isinstance(value, bytes) else value

    stored_pairs = []
    pairs_table = self.file.get_node(self._group, 'region_pairs')
    for row in pairs_table.iterrows():
        r1 = GenomicRegion(chromosome=_as_text(row['chromosome1']),
                           start=row['start1'], end=row['end1'],
                           strand=row['strand1'])
        r2 = GenomicRegion(chromosome=_as_text(row['chromosome2']),
                           start=row['start2'], end=row['end2'],
                           strand=row['strand2'])
        stored_pairs.append((r1, r2))
    return stored_pairs
def plot(self, region):
    """
    Plot a single region, optionally toggling the 'chr' chromosome prefix.

    :param region: :class:`~GenomicRegion` or region string. If
                   ``self.fix_chromosome`` is set, a leading 'chr' is
                   removed from the chromosome name, or added if it is
                   missing, before the region is handed to ``_plot``.
    """
    if isinstance(region, string_types):
        region = GenomicRegion.from_string(region)

    if self.fix_chromosome:
        name = region.chromosome
        toggled = name[3:] if name.startswith('chr') else 'chr' + name
        # only chromosome, start and end are carried over to the new region
        region = GenomicRegion(chromosome=toggled,
                               start=region.start, end=region.end)

    self._plot(region)
def test_re_dist(self):
    """``re_distance`` of reverse-strand reads on a 1-1000 fragment."""
    # (read position, expected re_distance)
    cases = ((200, 199), (990, 10))
    for position, expected in cases:
        fragment = GenomicRegion(chromosome='chr1', start=1, end=1000)
        read = FragmentRead(fragment, position=position, strand=-1)
        assert read.re_distance() == expected
def sample_hic_matrix1(file_name=None, tmpdir=None):
    """
    Build a 10-region sample Hi-C object with a fixed sparse upper triangle.

    :param file_name: Passed through to :class:`~Hic`
    :param tmpdir: Passed through to :class:`~Hic`
    :return: :class:`~Hic` object holding the matrix sketched below
    """
    #     0 1 2 3 4 5 6 7 8 9
    #   #####################
    # 0 # 0 1 0 2 0 3 0 4 0 5
    # 1 #   6 0 7 0 8 0 9 0 1
    # 2 #     2 3 4 0 5 0 0 6
    # 3 #       7 0 8 9 0 1 0
    # 4 #         0 2 3 0 0 4
    # 5 #           5 6 7 8 9
    # 6 #             1 0 0 0
    # 7 #               2 3 0
    # 8 #                 0 4
    # 9 #                   5

    # ten consecutive 1000-long regions: chr1:1-1000 ... chr1:9001-10000
    nodes = [
        GenomicRegion('chr1', start, start + 999)
        for start in range(1, 10000, 1000)
    ]

    # (source, sink, weight) in the order the edges are added
    edge_table = (
        (0, 1, 1), (3, 5, 8),
        (0, 3, 2), (3, 6, 9),
        (0, 5, 3), (3, 8, 1),
        (0, 7, 4), (4, 5, 2),
        (0, 9, 5), (4, 6, 3),
        (1, 1, 6), (4, 9, 4),
        (1, 3, 7), (5, 5, 5),
        (1, 5, 8), (5, 6, 6),
        (1, 7, 9), (5, 7, 7),
        (1, 9, 1), (5, 8, 8),
        (2, 2, 2), (5, 9, 9),
        (2, 3, 3), (6, 6, 1),
        (2, 4, 4), (7, 7, 2),
        (2, 6, 5), (7, 8, 3),
        (2, 9, 6), (8, 9, 4),
        (3, 3, 7), (9, 9, 5),
    )
    edges = [
        Edge(source=source, sink=sink, weight=weight)
        for source, sink, weight in edge_table
    ]

    hic = Hic(file_name=file_name, tmpdir=tmpdir)
    hic.add_regions(nodes)
    hic.add_edges(edges)
    return hic
def test_convenience_functions(self):
    """Exercise the predicate helpers of ``FragmentReadPair`` on four pairs."""

    def make_pair(left, right):
        # each argument: (fragment start, fragment end, chromosome,
        #                 read position, read strand)
        reads = [
            FragmentRead(
                GenomicRegion(start=start, end=end, chromosome=chromosome),
                position=position, strand=strand)
            for start, end, chromosome, position, strand in (left, right)
        ]
        return FragmentReadPair(reads[0], reads[1])

    # same chromosome, distant fragments, reads facing each other
    pair = make_pair((1, 1000, 'chr1', 500, 1),
                     (10001, 11000, 'chr1', 10500, -1))
    assert pair.is_same_chromosome()
    assert pair.get_gap_size() == 9001
    assert pair.is_inward_pair()
    assert not pair.is_outward_pair()
    assert not pair.is_same_fragment()
    assert not pair.is_same_pair()

    # both reads on the same fragment and strand
    pair = make_pair((1, 1000, 'chr1', 500, 1),
                     (1, 1000, 'chr1', 600, 1))
    assert pair.is_same_chromosome()
    assert pair.get_gap_size() == 0
    assert not pair.is_inward_pair()
    assert not pair.is_outward_pair()
    assert pair.is_same_fragment()
    assert pair.is_same_pair()

    # reads on different chromosomes
    pair = make_pair((1, 1000, 'chr1', 500, -1),
                     (1, 1000, 'chr2', 600, 1))
    assert not pair.is_same_chromosome()
    assert pair.get_gap_size() is None
    assert not pair.is_inward_pair()
    assert not pair.is_outward_pair()
    assert not pair.is_same_fragment()
    assert not pair.is_same_pair()

    # adjacent fragments, reads facing away from each other
    pair = make_pair((1, 1000, 'chr1', 500, -1),
                     (1001, 2000, 'chr1', 1200, 1))
    assert pair.is_same_chromosome()
    assert pair.get_gap_size() == 0
    assert not pair.is_inward_pair()
    assert pair.is_outward_pair()
    assert not pair.is_same_fragment()
    assert not pair.is_same_pair()
def _subset_rows(self, key):
    """
    Iterate over a subset of regions given the specified key.

    :param key: A :class:`~GenomicRegion` object or region string, or a
                list of the former. Also accepts slices, integers and
                lists of integers. Slices may be open-ended
                (``start`` and/or ``stop`` of None); the slice step is
                ignored.
    :return: Iterator over the specified subset of region rows
    """
    if isinstance(key, slice):
        # only constrain the bounds that are actually given, so that
        # open-ended slices do not end up as "ix >= None" in the query
        conditions = []
        if key.start is not None:
            conditions.append("(ix >= {})".format(key.start))
        if key.stop is not None:
            conditions.append("(ix < {})".format(key.stop))

        if conditions:
            rows = self._regions.where(" & ".join(conditions))
        else:
            rows = self._regions
        for row in rows:
            yield row
    elif isinstance(key, int):
        yield self._regions[key]
    elif isinstance(key, list) and len(key) > 0 and isinstance(key[0], int):
        for ix in key:
            yield self._regions[ix]
    else:
        if isinstance(key, string_types):
            key = GenomicRegion.from_string(key)

        keys = [key] if isinstance(key, GenomicRegion) else key

        for k in keys:
            if isinstance(k, string_types):
                k = GenomicRegion.from_string(k)

            # overlap query: same chromosome, row start <= query end
            # and row end >= query start; missing parts are left out
            conditions = []
            if k.chromosome is not None:
                conditions.append("(chromosome == b'%s')" % k.chromosome)
            if k.end is not None:
                conditions.append("(start <= %d)" % k.end)
            if k.start is not None:
                conditions.append("(end >= %d)" % k.start)

            if conditions:
                rows = self._regions.where("(" + " & ".join(conditions) + ")")
            else:
                # completely unconstrained region: return every row
                rows = self._regions
            for row in rows:
                yield row
def region_by_ix(self, ix):
    """
    Construct the region with the given genome-wide (0-based) bin index.

    Chromosomes are walked in the order returned by ``self.chromosomes()``,
    skipping the pseudo-chromosome named 'all' (case-insensitive). Each
    chromosome contributes ``ceil(length / resolution)`` bins.

    :param ix: Genome-wide bin index
    :return: :class:`~GenomicRegion` with 1-based start, an end that is
             clipped to the chromosome length, and ``ix`` set
    :raises IndexError: if ``ix`` lies beyond the last bin of the last
                        chromosome
    """
    chromosome_lengths = self.chromosome_lengths

    offset_ix = 0
    remaining_ix = ix
    for chromosome in self.chromosomes():
        if chromosome.lower() == 'all':
            continue

        chromosome_length = chromosome_lengths[chromosome]
        n_bins = int(np.ceil(chromosome_length / self._resolution))

        # Valid local indices are 0 .. n_bins - 1. A local index equal to
        # n_bins already belongs to the first bin of the next chromosome.
        if remaining_ix < n_bins:
            start = remaining_ix * self._resolution + 1
            end = min(start + self._resolution - 1, chromosome_length)
            return GenomicRegion(chromosome=chromosome, start=start,
                                 end=end, ix=offset_ix + remaining_ix)

        offset_ix += n_bins
        remaining_ix -= n_bins

    raise IndexError("Region index {} is out of range".format(ix))
def _region_subset(self, region, *args, **kwargs):
    """
    Iterate over the bins covering ``region``, with bias and validity set.

    :param region: :class:`~GenomicRegion` with chromosome and end defined
    :return: Iterator over :class:`~GenomicRegion` objects. ``bias`` is the
             reciprocal of the bin's normalisation value; bins whose
             normalisation value is NaN or 0 get ``valid=False`` and
             ``bias=1.0``. Bins without an entry in the normalisation
             vector are treated as valid with ``bias=1.0``.
    """
    subset_ix, subset_start = self._region_start(region)
    cl = self.chromosome_lengths[region.chromosome]
    norm = self.normalisation_vector(region.chromosome)

    bin_starts = range(subset_start, region.end, self._resolution)
    for i, start in enumerate(bin_starts):
        end = min(start + self._resolution - 1, cl, region.end)

        # The normalisation vector is indexed by the bin's position on the
        # chromosome, not by the position within the requested subset.
        bias_ix = int(start / self._resolution)
        try:
            norm_value = norm[bias_ix]
        except IndexError:
            norm_value = None

        if norm_value is None:
            # no normalisation entry for this bin: leave it uncorrected
            valid = True
            bias = 1.0
        elif np.isnan(norm_value) or norm_value == 0:
            valid = False
            bias = 1.0
        else:
            valid = True
            bias = 1 / norm_value
            if np.isnan(bias):
                bias = 1.0

        yield GenomicRegion(chromosome=region.chromosome,
                            start=start, end=end,
                            bias=bias, valid=valid,
                            ix=int(subset_ix + i))
def setup_method(self, method):
    """Create a filled and an empty ``RegionsTable`` for the tests.

    ``self.regions`` holds consecutive 1000-long regions on chr1, chr2 and
    chr3. ``self.empty_regions`` has no regions but two additional fields,
    the integer column 'a' and the string column 'b'.
    """
    # (chromosome name, end); region starts run from 1 up to end - 1000
    chromosome_ends = (('chr1', 10000), ('chr2', 15000), ('chr3', 7000))

    regions = [
        GenomicRegion(start=start, end=start + 999, chromosome=name)
        for name, chromosome_end in chromosome_ends
        for start in range(1, chromosome_end - 1000, 1000)
    ]

    self.regions = RegionsTable()
    self.regions.add_regions(regions)

    extra_fields = {'a': t.Int32Col(), 'b': t.StringCol(10)}
    self.empty_regions = RegionsTable(additional_fields=extra_fields)
def test_add_additional_fields(self):
    """Regions can be added as object, dict or list; extra fields round-trip.

    A list carries no values for the additional fields, so those are
    expected to come back as 0 and the empty string.
    """
    table = self.empty_regions

    def check(ix, start, end, chromosome, a, b):
        region = table[ix]
        assert region.start == start
        assert region.end == end
        assert region.chromosome == chromosome
        assert region.a == a
        assert region.b == b

    # GenomicRegion
    table.add_region(
        GenomicRegion(start=1, end=1000, chromosome='chr1', a=10, b='ten'))
    table.flush()
    check(0, 1, 1000, 'chr1', 10, 'ten')

    # dict
    table.add_region({
        'start': 1001,
        'end': 2000,
        'chromosome': 'chr1',
        'a': 11,
        'b': 'eleven'
    })
    table.flush()
    check(1, 1001, 2000, 'chr1', 11, 'eleven')

    # list
    table.add_region(['chr1', 2001, 3000])
    table.flush()
    check(2, 2001, 3000, 'chr1', 0, '')
def test_add_region(self):
    """Regions can be added as ``GenomicRegion``, dict or list."""
    table = self.empty_regions

    def check(ix, start, end, chromosome):
        region = table[ix]
        assert region.start == start
        assert region.end == end
        assert region.chromosome == chromosome

    # GenomicRegion
    table.add_region(GenomicRegion(start=1, end=1000, chromosome='chr1'))
    table.flush()
    check(0, 1, 1000, 'chr1')

    # dict
    table.add_region({'start': 1001, 'end': 2000, 'chromosome': 'chr1'})
    table.flush()
    check(1, 1001, 2000, 'chr1')

    # list
    table.add_region(['chr1', 2001, 3000])
    table.flush()
    check(2, 2001, 3000, 'chr1')
def _row_to_region(self, row, lazy_region=None):
    """
    Convert a PyTables row to :class:`~GenomicRegion`.

    :param row: PyTables row object
    :param lazy_region: (optional) :class:`~LazyGenomicRegion` that is used
                        for loading attributes. If given, its ``_row`` is
                        pointed at ``row`` and the same object is returned.
    :return: :class:`~GenomicRegion` or :class:`~LazyGenomicRegion`
    """
    if lazy_region is not None:
        lazy_region._row = row
        return lazy_region

    # The standard columns do not change between iterations; look them up
    # once instead of instantiating the description for every column.
    standard_columns = set(RegionsTable.RegionDescription().columns.keys())

    # every non-standard column becomes an extra region attribute
    kwargs = {}
    for name in self._regions.colnames:
        if name in standard_columns:
            continue
        value = row[name]
        kwargs[name] = value.decode() if isinstance(value, bytes) else value

    # rows without a mask column are treated as unmasked (index 0)
    try:
        mask_ix = row['_mask_ix']
    except (KeyError, ValueError):
        mask_ix = 0

    return GenomicRegion(chromosome=row["chromosome"].decode(),
                         start=row["start"], end=row["end"],
                         ix=row["ix"], _mask_ix=mask_ix, **kwargs)
def setup_method(self, method):
    """Create three ``PeakInfo`` objects over the same 100 regions.

    The regions are consecutive 100-long bins on chr1. Each peak's
    coordinate pair is sorted, and its source/sink are the floored
    coordinates. The objects are stored in ``self.peaks`` under 0, 1, 2.
    """
    peak_coordinates = [
        [(10, 12), (1, 4), (15, 8)],
        [(11, 12), (5, 8), (15.2, 7.8)],
        [(10.5, 11.8), (5.5, 8.1)],
    ]

    boundaries = np.arange(0, 10001, 100)
    regions = [
        GenomicRegion('chr1', left + 1, right, ix=ix)
        for ix, (left, right) in enumerate(pairwise(boundaries))
    ]

    self.peaks = {}
    for set_ix, coordinates in enumerate(peak_coordinates):
        peak_info = fanc.peaks.PeakInfo()
        peak_info.add_regions(regions)

        edges = []
        for xy in coordinates:
            x, y = tuple(sorted(xy))
            edges.append(fanc.peaks.Peak(x=x, y=y,
                                         source=math.floor(x),
                                         sink=math.floor(y),
                                         weight=1))
        peak_info.add_edges(edges)
        peak_info.flush()

        self.peaks[set_ix] = peak_info
def _convert_key(self, key, region_trees):
    """
    Translate a region key into a slice of region indices.

    :param key: Region string, :class:`~GenomicRegion`, or any other key.
                Keys that are neither are returned unchanged.
    :param region_trees: Mapping of chromosome name to an interval tree
                         whose intervals carry the region index as ``data``
    :return: ``slice(first, last + 1, 1)`` spanning the indices of all
             intervals found for the region, or the unchanged key
    :raises ValueError: if the chromosome is unknown or no interval is
                        found for the region
    """
    if isinstance(key, string_types):
        key = GenomicRegion.from_string(key)

    if not isinstance(key, GenomicRegion):
        return key

    try:
        # regions are 1-based and inclusive, the tree query is not
        lower = 0 if key.start is None else max(0, key.start - 1)
        upper = key.end
        hits = [interval.data
                for interval in region_trees[key.chromosome][lower:upper]]
    except KeyError:
        raise ValueError("Requested chromosome {} was not "
                         "found in this matrix.".format(key.chromosome))

    if not hits:
        raise ValueError(
            "Requested region {} was not found in this matrix.".format(key))

    return slice(min(hits), max(hits) + 1, 1)
def sample_hic_matrix2(file_name=None, tmpdir=None):
    """
    Build a 10-region sample Hi-C object in which regions 4 and 7 are empty.

    :param file_name: Passed through to :class:`~Hic`
    :param tmpdir: Passed through to :class:`~Hic`
    :return: :class:`~Hic` object holding the matrix sketched below
             ('-' marks entries of the regions without any edge)
    """
    #     0 1 2 3 4 5 6 7 8 9
    #   #####################
    # 0 # 0 1 0 2 - 3 0 - 0 5
    # 1 #   6 0 7 - 8 0 - 0 1
    # 2 #     2 3 - 0 5 - 0 6
    # 3 #       7 - 8 9 - 1 0
    # 4 #         - - - - - -
    # 5 #           5 6 - 8 9
    # 6 #             1 - 0 0
    # 7 #               - - -
    # 8 #                 0 4
    # 9 #                   5

    # ten consecutive 1000-long regions: chr1:1-1000 ... chr1:9001-10000
    nodes = [
        GenomicRegion('chr1', start, start + 999)
        for start in range(1, 10000, 1000)
    ]

    # (source, sink, weight) in the order the edges are added
    edge_table = (
        (0, 1, 1), (3, 5, 8),
        (0, 3, 2), (3, 6, 9),
        (0, 5, 3), (3, 8, 1),
        (0, 9, 5),
        (1, 1, 6),
        (1, 3, 7), (5, 5, 5),
        (1, 5, 8), (5, 6, 6),
        (1, 9, 1), (5, 8, 8),
        (2, 2, 2), (5, 9, 9),
        (2, 3, 3), (6, 6, 1),
        (2, 6, 5),
        (2, 9, 6), (8, 9, 4),
        (3, 3, 7), (9, 9, 5),
    )
    edges = [
        Edge(source=source, sink=sink, weight=weight)
        for source, sink, weight in edge_table
    ]

    hic = Hic(file_name=file_name, tmpdir=tmpdir)
    hic.add_regions(nodes)
    hic.add_edges(edges)
    return hic
def contact_directionality_bias(hic, regions, distance=1000000,
                                region_anchor='center', **kwargs):
    """
    Compare downstream and upstream contacts around region anchors.

    For each region a point region at ``region_anchor`` is paired with the
    same point expanded by ``distance``. Regions are split by
    ``is_forward()``, the first row of every extracted submatrix is
    averaged per group, and for increasing offsets from the profile center
    the ratio of the value at ``+offset`` to the value at ``-offset`` is
    reported.

    :param hic: Hi-C object; must provide ``bin_size``
    :param regions: Iterable of regions
    :param distance: Expansion handed to ``expand(absolute=...)``; also
                     determines the profile length
    :param region_anchor: Name of the region attribute used as anchor
    :param kwargs: Passed on to ``extract_submatrices``
    :return: Tuple ``(distances, forward ratios, reverse ratios)``
    """
    forward_region_pairs = []
    reverse_region_pairs = []
    for region in regions:
        anchor = int(getattr(region, region_anchor))
        point = GenomicRegion(chromosome=region.chromosome,
                              start=anchor, end=anchor,
                              strand=region.strand)
        if region.is_forward():
            target = forward_region_pairs
        else:
            target = reverse_region_pairs
        target.append((point, point.expand(absolute=distance)))

    def average_first_row(region_pairs):
        # mean of the first submatrix row over all pairs; masked entries
        # do not count towards the denominator
        profile_length = int(distance / hic.bin_size) * 2 + 1
        cumulative = np.zeros(profile_length)
        count = np.zeros(profile_length)
        for matrix in extract_submatrices(hic, region_pairs, **kwargs):
            cumulative += matrix[0, :]
            if hasattr(matrix, 'mask'):
                count += (~matrix.mask).astype('int')[0, :]
            else:
                count += np.ones(count.shape)
        return cumulative / count

    avg_forward = average_first_row(forward_region_pairs)
    avg_reverse = average_first_row(reverse_region_pairs)

    bin_size = hic.bin_size
    center = int(len(avg_forward)/2)
    offsets = range(center + 1)

    d = [offset * bin_size for offset in offsets]
    ratio_forward = [
        avg_forward[center + offset] / avg_forward[center - offset]
        for offset in offsets
    ]
    ratio_reverse = [
        avg_reverse[center + offset] / avg_reverse[center - offset]
        for offset in offsets
    ]
    return d, ratio_forward, ratio_reverse
def test_save_and_load(self, tmpdir):
    """Regions and edges written to a file are present after reopening it."""
    dest_file = str(tmpdir) + "/hic.h5"

    writer = self.hic_class(file_name=dest_file, mode='w')
    for chromosome in ('chr1', 'chr2'):
        writer.add_region(GenomicRegion(chromosome, 1, 1000))
    writer.flush()
    writer.add_edge_simple(0, 1, 1)
    writer.flush()
    writer.close()

    reader = self.hic_class(dest_file, mode='r')
    loaded_regions = list(reader.regions())
    loaded_edges = list(reader.edges())
    assert len(loaded_regions) == 2
    assert len(loaded_edges) == 1
    reader.close()
def _mouse_release_event(self, event):
    """
    Refresh the plot if the axis limits changed since the last release.

    :param event: Mouse event; not inspected
    """
    xlim = self.ax.get_xlim()
    ylim = self.ax.get_ylim()

    # nothing to do while both axes still show the remembered limits
    if not (xlim != self._last_xlim or ylim != self._last_ylim):
        return

    self._last_xlim = xlim
    self._last_ylim = ylim

    def ordered(limits):
        # axis limits may be inverted; regions need start before end
        first, second = limits[0], limits[1]
        return (first, second) if first < second else (second, first)

    x_start, x_end = ordered(xlim)
    x_region = GenomicRegion(chromosome=self._current_chromosome_x,
                             start=x_start, end=x_end)

    y_start, y_end = ordered(ylim)
    y_region = GenomicRegion(chromosome=self._current_chromosome_y,
                             start=y_start, end=y_end)

    self.refresh(region=(x_region, y_region))
def test_get_node_x_by_region(self):
    """``regions()`` returns the expected number of regions per query."""
    # (region string, expected number of regions returned)
    expectations = (
        ('chr1', 5),
        ('chr2', 3),
        ('chr3', 4),
        ('chr1:3452-6000', 2),
        ('chr1:1-51000', 5),
    )
    for region_string, expected_count in expectations:
        query = GenomicRegion.from_string(region_string)
        found = list(self.hic.regions(query))
        assert len(found) == expected_count
def _edges_subset(self, key=None, row_regions=None, col_regions=None,
                  lazy=False, *args, **kwargs):
    """
    Iterate over the edges between the given row and column regions.

    :param key: Not used by this implementation
    :param row_regions: List of regions on a single chromosome
    :param col_regions: List of regions on a single chromosome
    :param lazy: If False, yield a new ``Edge`` per entry whose source and
                 sink are the region objects. If True, yield one
                 ``LazyJuicerEdge`` object that is updated in place with
                 the indices and weight of each entry.
    :return: Iterator over edges with source index <= sink index
    :raises ValueError: if the first and last region of the rows (or of
                        the columns) are on different chromosomes
    """
    checks = (
        (row_regions, "Cannot subset rows across multiple chromosomes!"),
        (col_regions, "Cannot subset columns across multiple chromosomes!"),
    )
    for axis_regions, message in checks:
        if axis_regions[0].chromosome != axis_regions[-1].chromosome:
            raise ValueError(message)

    regions_by_ix = {
        region.ix: region for region in row_regions + col_regions
    }

    # one region per axis, spanning from the first to the last region
    row_span = GenomicRegion(chromosome=row_regions[0].chromosome,
                             start=row_regions[0].start,
                             end=row_regions[-1].end)
    col_span = GenomicRegion(chromosome=col_regions[0].chromosome,
                             start=col_regions[0].start,
                             end=col_regions[-1].end)

    if lazy:
        # the same edge object is reused for every entry
        edge = LazyJuicerEdge(source=0, sink=0, weight=1.0, matrix=self)
        for x, y, weight in self._read_matrix(row_span, col_span):
            low, high = (y, x) if x > y else (x, y)
            edge.source, edge.sink, edge.weight = low, high, weight
            yield edge
        return

    for x, y, weight in self._read_matrix(row_span, col_span):
        low, high = (y, x) if x > y else (x, y)
        yield Edge(source=regions_by_ix[low], sink=regions_by_ix[high],
                   weight=weight)
def setup_method(self, method):
    """Create a Hi-C object and a matching ``RegionMatrix`` for the tests.

    Twelve regions are used (5 on chr1, 3 on chr2, 4 on chr3). Every
    region pair ``i <= j`` gets an edge with weights 1, 2, 3, ... in
    row-major order, and the same weights are written symmetrically into
    the 12x12 matrix behind ``self.m``.
    """
    hic = Hic()

    # (chromosome, exclusive upper bound of starts, step between starts)
    layout = (("chr1", 5000, 1000), ("chr2", 3000, 1000), ("chr3", 2000, 500))
    nodes = [
        GenomicRegion(chromosome=name, start=start, end=start + 1000 - 1)
        for name, stop, step in layout
        for start in range(1, stop, step)
    ]
    # rows and columns refer to the same region objects
    row_regions = list(nodes)
    col_regions = list(nodes)
    hic.add_regions(nodes)

    # add some edges with increasing weight for testing
    m = np.zeros((12, 12))
    edges = []
    n_nodes = len(nodes)
    index_pairs = ((i, j) for i in range(n_nodes) for j in range(i, n_nodes))
    for weight, (i, j) in enumerate(index_pairs, start=1):
        edges.append(Edge(source=i, sink=j, weight=weight))
        m[i, j] = weight
        m[j, i] = weight
    hic.add_edges(edges)

    self.hic = hic
    self.m = RegionMatrix(m, row_regions=row_regions,
                          col_regions=col_regions)
def test_merge(self):
    """Merging identical Hi-C objects multiplies every weight accordingly.

    A second object with the same 12 regions and edges as ``self.hic`` is
    built and merged with it two-fold and three-fold. Every non-zero entry
    of the original matrix must equal the merged entry divided by the
    number of merged objects.
    """
    hic = self.hic_class()

    # (chromosome, exclusive upper bound of starts, step between starts)
    layout = (("chr1", 5000, 1000), ("chr2", 3000, 1000), ("chr3", 2000, 500))
    nodes = [
        GenomicRegion(chromosome=name, start=start, end=start + 1000 - 1)
        for name, stop, step in layout
        for start in range(1, stop, step)
    ]
    hic.add_regions(nodes, preserve_attributes=False)

    # add some edges with increasing weight for testing
    n_nodes = len(nodes)
    index_pairs = ((i, j) for i in range(n_nodes) for j in range(i, n_nodes))
    edges = [
        Edge(source=i, sink=j, weight=weight)
        for weight, (i, j) in enumerate(index_pairs, start=1)
    ]
    hic.add_edges(edges)

    merged_hic_2x = Hic.merge([self.hic, hic])
    merged_hic_3x = Hic.merge([self.hic, hic, hic])
    hic.close()

    m = self.hic[:, :]
    m_merged_2x = merged_hic_2x[:, :]
    m_merged_3x = merged_hic_3x[:, :]

    n_rows, n_cols = m.shape[0], m.shape[1]
    for i in range(n_rows):
        for j in range(n_cols):
            reference = m[i, j]
            assert reference == 0 or reference == m_merged_2x[i, j] / 2
            assert reference == 0 or reference == m_merged_3x[i, j] / 3

    merged_hic_2x.close()
    merged_hic_3x.close()