Ejemplo n.º 1
0
    def test_builtin_bin(self, tmpdir):
        """
        Check that re-binning keeps the total edge weight and yields no
        duplicated edges.

        A two-region object with three edges is written to disk, re-opened
        read-only and binned with ``hic.bin(50)``.

        :param tmpdir: Directory (converted with ``str()``) in which the
                       temporary ``hic.h5`` file is created
        """
        dest_file = os.path.join(str(tmpdir), "hic.h5")

        hic = self.hic_class(file_name=dest_file, mode='w')
        hic.add_region(GenomicRegion(chromosome='chr1', start=1, end=100))
        hic.add_region(GenomicRegion(chromosome='chr1', start=101, end=200))
        hic.flush()
        hic.add_edge([0, 0, 12])
        hic.add_edge([0, 1, 36])
        hic.add_edge([1, 1, 24])
        hic.flush()
        hic.close()
        hic = load(dest_file, mode='r')

        binned = hic.bin(50)

        original_reads = sum(edge.weight for edge in hic.edges())

        # Sum the binned weights and search for duplicated edges in one pass.
        # Every (source, sink) key is recorded so that a second occurrence
        # actually trips the assertion.
        new_reads = 0
        seen_edges = set()
        for edge in binned.edges():
            new_reads += edge.weight
            edge_key = (edge.source, edge.sink)
            assert edge_key not in seen_edges
            seen_edges.add(edge_key)

        # make sure that the total number
        # of reads stays the same
        assert original_reads == new_reads
        hic.close()
        binned.close()
Ejemplo n.º 2
0
def _loop_regions_from_bedpe(bedpe):
    """
    Convert the records of a BEDPE-like object into pairs of anchor regions.

    :param bedpe: Object whose ``regions`` attribute iterates over records
                  exposing ``chromosome1``/``start1``/``end1`` and
                  ``chromosome2``/``start2``/``end2``
    :return: List of ``(GenomicRegion, GenomicRegion)`` tuples, one per record
    """
    return [
        (
            GenomicRegion(chromosome=record.chromosome1,
                          start=record.start1, end=record.end1),
            GenomicRegion(chromosome=record.chromosome2,
                          start=record.start2, end=record.end2),
        )
        for record in bedpe.regions
    ]
Ejemplo n.º 3
0
    def plot(self, regions):
        """
        Plot a region, or a pair of regions, on this object's axes.

        :param regions: Either a single region, which is then used for both
                        x and y, or a tuple ``(x_region, y_region)``.
                        String entries are parsed with
                        ``GenomicRegion.from_string``.
        :return: The return value of ``_plot``; if that is None, the tuple
                 ``(self.fig, self.ax)``
        """
        raw_x, raw_y = regions if isinstance(regions, tuple) else (regions, regions)

        # each axis is parsed independently when it is given as a string
        region_pair = tuple(
            GenomicRegion.from_string(entry) if isinstance(entry, string_types) else entry
            for entry in (raw_x, raw_y)
        )

        self._current_chromosome_x = region_pair[0].chromosome
        self._current_chromosome_y = region_pair[1].chromosome

        # fall back to the current matplotlib axes if none were assigned
        if self.ax is None:
            self.ax = plt.gca()

        self._before_plot(region_pair)
        plot_output = self._plot(region_pair)
        self._after_plot(region_pair)

        return (self.fig, self.ax) if plot_output is None else plot_output
Ejemplo n.º 4
0
    def setup_method(self, method):
        """
        Create the Hi-C fixtures used by the tests of this class.

        Sets ``self.hic`` to a ``Hic`` object with 12 regions and one edge
        per region pair ``(i, j)`` with ``i <= j`` (weights 1, 2, 3, ...),
        loads ``self.hic_cerevisiae`` from the ``test_matrix`` directory next
        to this file, and stores the class under test in ``self.hic_class``.

        :param method: Test method about to be run (unused)
        """
        self.dir = os.path.dirname(os.path.realpath(__file__))

        hic = Hic()

        # (chromosome, range stop, step): 5 + 3 + 4 = 12 regions, each 1000 bp
        # long; the chr3 regions overlap because their step is only 500
        layout = (("chr1", 5000, 1000), ("chr2", 3000, 1000), ("chr3", 2000, 500))
        nodes = [
            GenomicRegion(chromosome=name, start=first, end=first + 1000 - 1)
            for name, stop, step in layout
            for first in range(1, stop, step)
        ]
        hic.add_regions(nodes)

        # one edge per upper-triangle pair, weight increasing by one each time
        n_nodes = len(nodes)
        index_pairs = [(i, j) for i in range(n_nodes) for j in range(i, n_nodes)]
        edges = [
            Edge(source=i, sink=j, weight=weight)
            for weight, (i, j) in enumerate(index_pairs, start=1)
        ]
        hic.add_edges(edges)

        self.hic = hic
        self.hic_cerevisiae = load(
            self.dir + "/test_matrix/cerevisiae.chrI.HindIII_upgrade.hic",
            mode='r')
        self.hic_class = Hic
Ejemplo n.º 5
0
def sample_fa_hic(file_name=None, zero_indices=frozenset(), tmpdir=None):
    """
    Build a sample ``Hic`` object with 120 regions and increasing edge weights.

    Regions are 1000 bp long: 50 on chr1, 30 on chr2 and 40 (overlapping,
    step 500) on chr3. One edge is created for every region pair ``(i, j)``
    with ``i <= j``; the weight counter advances for every pair, including
    skipped ones, so weights do not depend on ``zero_indices``.

    :param file_name: Passed on to ``Hic(file_name=...)``
    :param zero_indices: Iterable of region indices that must not take part
                         in any edge. The default is an immutable empty set,
                         so no state is shared between calls.
    :param tmpdir: Passed on to ``Hic(tmpdir=...)``
    :return: The populated ``Hic`` object (opened with ``mode='w'``)
    """
    # normalise once so any iterable is accepted and lookups stay O(1)
    skipped = frozenset(zero_indices)

    hic = Hic(file_name=file_name, tmpdir=tmpdir, mode='w')

    # add some nodes (120 to be exact)
    nodes = []
    for chromosome, stop, step in (("chr1", 50000, 1000),
                                   ("chr2", 30000, 1000),
                                   ("chr3", 20000, 500)):
        for i in range(1, stop, step):
            nodes.append(GenomicRegion(chromosome=chromosome, start=i, end=i + 1000 - 1))
    hic.add_regions(nodes)

    # add some edges with increasing weight for testing
    edges = []
    weight = 1
    for i in range(len(nodes)):
        for j in range(i, len(nodes)):
            if i not in skipped and j not in skipped:
                edges.append(Edge(source=i, sink=j, weight=weight))
            weight += 1

    hic.add_edges(edges)

    return hic
Ejemplo n.º 6
0
    def test_masked_matrix(self):
        """
        Check that regions without edges are masked in ``hic.matrix()``.

        Regions 1 and 5 receive no edges while every other region pair does.
        Rows and columns 1 and 5 of the returned matrix must be masked and
        all remaining entries must not be.
        """
        hic = Hic()

        # add some nodes (12 to be exact: 5 on chr1, 3 on chr2, 4 on chr3)
        nodes = []
        for i in range(1, 5000, 1000):
            nodes.append(GenomicRegion(chromosome="chr1", start=i, end=i + 1000 - 1))
        for i in range(1, 3000, 1000):
            nodes.append(GenomicRegion(chromosome="chr2", start=i, end=i + 1000 - 1))
        for i in range(1, 2000, 500):
            nodes.append(GenomicRegion(chromosome="chr3", start=i, end=i + 1000 - 1))
        hic.add_regions(nodes)

        masked = {1, 5}
        not_masked = {0, 2, 3, 4, 6, 7, 8, 9, 10, 11}

        # add some edges with increasing weight for testing,
        # leaving out every pair that touches a masked region
        edges = []
        weight = 1
        for i in range(0, len(nodes)):
            for j in range(i, len(nodes)):
                if i not in masked and j not in masked:
                    edges.append(Edge(source=i, sink=j, weight=weight))
                weight += 1

        hic.add_edges(edges)

        m = hic.matrix()
        hic.close()

        # check masking
        for i in range(m.shape[0]):
            assert np.ma.is_masked(m[1, i])
            assert np.ma.is_masked(m[i, 1])
            assert np.ma.is_masked(m[5, i])
            assert np.ma.is_masked(m[i, 5])

        # check not masked
        for j in not_masked:
            for i in range(m.shape[0]):
                if i not in masked:
                    assert not np.ma.is_masked(m[i, j])
                    assert not np.ma.is_masked(m[j, i])
                else:
                    assert np.ma.is_masked(m[i, j])
                    assert np.ma.is_masked(m[j, i])
Ejemplo n.º 7
0
    def setup_method(self, method):
        """
        Build two eager and two lazy fragment reads for the tests.

        ``read1``/``read2`` are ``FragmentRead`` objects created from explicit
        fragments. ``lazy_read1``/``lazy_read2`` are ``LazyFragmentRead``
        objects for the left and right side of one shared row dictionary.

        :param method: Test method about to be run (unused)
        """
        left_fragment = GenomicRegion(chromosome='chr1', start=1, end=1000,
                                      strand=1, ix=0)
        right_fragment = GenomicRegion(chromosome='chr2', start=1001, end=2000,
                                       strand=-1, ix=1)
        self.read1 = FragmentRead(left_fragment, position=500, strand=1)
        self.read2 = FragmentRead(right_fragment, position=1200, strand=1)

        class DummyPairs(object):
            # minimal stand-in that only provides the index -> chromosome map
            def __init__(self):
                self._ix_to_chromosome = {0: 'chr1', 1: 'chr2'}

        row = {
            'ix': 0,
            'left_read_position': 500,
            'left_read_strand': 1,
            'left_fragment': 0,
            'left_fragment_start': 1,
            'left_fragment_end': 1000,
            'left_fragment_chromosome': 0,
            'right_read_position': 1200,
            'right_read_strand': -1,
            'right_fragment': 1,
            'right_fragment_start': 1001,
            'right_fragment_end': 2000,
            'right_fragment_chromosome': 1,
        }
        dummy_pairs = DummyPairs()
        self.lazy_read1 = LazyFragmentRead(row, dummy_pairs, side='left')
        self.lazy_read2 = LazyFragmentRead(row, dummy_pairs, side='right')
Ejemplo n.º 8
0
def loop_strength(hic, loop_regions, pixels=16, **kwargs):
    """
    Compute a log2 loop strength for each pair of loop anchors.

    For every anchor pair two control pairs are generated by shifting the
    anchors by the distance between the anchor centers: the first anchor
    shifted left, and the second anchor shifted right. The mean matrix value
    of the original pair is divided by the mean of the available controls.

    :param hic: Matrix object handed to ``_loop_matrix_iterator``
    :param loop_regions: Iterable of ``(region1, region2)`` tuples, or a
                         ``Bedpe`` object which is converted with
                         ``_loop_regions_from_bedpe``
    :param pixels: Passed on to ``_loop_matrix_iterator``
    :param kwargs: Passed on to ``_loop_matrix_iterator``. ``log``, ``norm``
                   and ``oe`` default to False, True and True. The option
                   ``keep_invalid`` is consumed here: if true, loops that
                   cannot be evaluated yield ``nan`` instead of being dropped.
    :return: List of log2 ratios
    """
    for option, default in (('log', False), ('norm', True), ('oe', True)):
        kwargs.setdefault(option, default)

    include_nan = kwargs.pop('keep_invalid', False)

    if isinstance(loop_regions, Bedpe):
        loop_regions = _loop_regions_from_bedpe(loop_regions)

    # generating new regions; every loop contributes three consecutive pairs
    # in the order: original, control left, control right
    new_region_pairs = []
    for region1, region2 in loop_regions:
        d = int(abs(region1.center - region2.center))
        new_left = GenomicRegion(chromosome=region1.chromosome,
                                 start=region1.start - d, end=region1.end - d)
        # NOTE(review): the right control takes its chromosome from region1,
        # not region2 - identical for intra-chromosomal loops; confirm intent
        new_right = GenomicRegion(chromosome=region1.chromosome,
                                  start=region2.start + d, end=region2.end + d)
        new_region_pairs.extend([
            (region1, region2),
            (new_left, region1),
            (region2, new_right),
        ])

    # mean over the unmasked entries of each matrix (None if no matrix)
    values = []
    for _, m in _loop_matrix_iterator(hic, new_region_pairs, pixels=pixels,
                                      keep_invalid=True, **kwargs):
        if m is None:
            values.append(None)
        else:
            values.append(float(np.nansum(m)/np.nansum(np.logical_not(m.mask))))

    original, left, right = values[0::3], values[1::3], values[2::3]

    ratios = []
    for i, loop_value in enumerate(original):
        left_value, right_value = left[i], right[i]
        if loop_value is None or (left_value is None and right_value is None):
            if include_nan:
                ratios.append(np.nan)
            continue

        try:
            if left_value is None:
                control = right_value
            elif right_value is None:
                control = left_value
            else:
                control = (left_value + right_value)/2
            ratios.append(np.log2(loop_value/control))
        except ZeroDivisionError:
            if include_nan:
                ratios.append(np.nan)
    return ratios
Ejemplo n.º 9
0
    def region_pairs(self, pairs=None):
        """
        Retrieve or set the regions used to generate the aggregate matrix.

        When ``pairs`` is given, any existing ``region_pairs`` table in this
        object's group is replaced. The stored pairs are then always read
        back from the table, so the returned regions reflect what was
        written (a strand of None is stored, and returned, as 0).

        :param pairs: Iterable of region tuples of the form
                      [(region1, region2), (region3, region4), ...].
                      If None, simply return the region pairs in this object.
        :return: List of region pairs [(region1, region2), (region3, region4), ...].
        """
        def _text(value):
            # string columns may come back as bytes; hand out text instead
            return value.decode() if isinstance(value, bytes) else value

        if pairs is not None:
            try:
                self.file.remove_node(self._group, 'region_pairs')
            except tables.NoSuchNodeError:
                pass

            pairs_table = self.file.create_table(self._group, 'region_pairs',
                                                 description={
                                                     'chromosome1': tables.StringCol(50, pos=0),
                                                     'start1': tables.Int32Col(pos=1),
                                                     'end1': tables.Int32Col(pos=2),
                                                     'strand1': tables.Int32Col(pos=3),
                                                     'chromosome2': tables.StringCol(50, pos=4),
                                                     'start2': tables.Int32Col(pos=5),
                                                     'end2': tables.Int32Col(pos=6),
                                                     'strand2': tables.Int32Col(pos=7)
                                                 })

            row = pairs_table.row
            for r1, r2 in pairs:
                row['chromosome1'] = r1.chromosome
                row['start1'] = r1.start
                row['end1'] = r1.end
                row['strand1'] = r1.strand if r1.strand is not None else 0

                row['chromosome2'] = r2.chromosome
                row['start2'] = r2.start
                row['end2'] = r2.end
                row['strand2'] = r2.strand if r2.strand is not None else 0

                row.append()

            pairs_table.flush()

        pairs = []
        pairs_table = self.file.get_node(self._group, 'region_pairs')
        for row in pairs_table.iterrows():
            r1 = GenomicRegion(chromosome=_text(row['chromosome1']), start=row['start1'],
                               end=row['end1'], strand=row['strand1'])
            r2 = GenomicRegion(chromosome=_text(row['chromosome2']), start=row['start2'],
                               end=row['end2'], strand=row['strand2'])
            pairs.append((r1, r2))
        return pairs
Ejemplo n.º 10
0
 def plot(self, region):
     """
     Plot the given region via ``_plot``.

     If ``self.fix_chromosome`` is set, the chromosome name is toggled
     between its ``chr``-prefixed and unprefixed form before plotting.

     :param region: Region object or region string (parsed with
                    ``GenomicRegion.from_string``)
     """
     if isinstance(region, string_types):
         region = GenomicRegion.from_string(region)

     if not self.fix_chromosome:
         self._plot(region)
         return

     name = region.chromosome
     toggled_name = name[3:] if name.startswith('chr') else 'chr' + name
     self._plot(GenomicRegion(chromosome=toggled_name,
                              start=region.start,
                              end=region.end))
Ejemplo n.º 11
0
 def test_re_dist(self):
     """
     Check ``re_distance`` for two reverse-strand reads on a fragment
     spanning 1-1000: position 200 gives 199, position 990 gives 10.
     """
     fragment_spec = dict(chromosome='chr1', start=1, end=1000)
     for position, expected in ((200, 199), (990, 10)):
         read = FragmentRead(GenomicRegion(**fragment_spec),
                             position=position,
                             strand=-1)
         assert read.re_distance() == expected
Ejemplo n.º 12
0
def sample_hic_matrix1(file_name=None, tmpdir=None):
    """
    Build a 10-region sample ``Hic`` object on chr1 with fixed edge weights.

    The upper triangle of the resulting matrix is shown below (0 = no edge).

    :param file_name: Passed on to ``Hic(file_name=...)``
    :param tmpdir: Passed on to ``Hic(tmpdir=...)``
    :return: The populated ``Hic`` object
    """
    #     0 1 2 3 4 5 6 7 8 9
    #   #####################
    # 0 # 0 1 0 2 0 3 0 4 0 5
    # 1 #   6 0 7 0 8 0 9 0 1
    # 2 #     2 3 4 0 5 0 0 6
    # 3 #       7 0 8 9 0 1 0
    # 4 #         0 2 3 0 0 4
    # 5 #           5 6 7 8 9
    # 6 #             1 0 0 0
    # 7 #               2 3 0
    # 8 #                 0 4
    # 9 #                   5

    # ten consecutive 1000 bp regions: 1-1000, 1001-2000, ..., 9001-10000
    nodes = [
        GenomicRegion('chr1', first, first + 999)
        for first in range(1, 10001, 1000)
    ]

    # (source, sink, weight), kept in the original insertion order
    edge_values = [
        (0, 1, 1), (3, 5, 8),
        (0, 3, 2), (3, 6, 9),
        (0, 5, 3), (3, 8, 1),
        (0, 7, 4), (4, 5, 2),
        (0, 9, 5), (4, 6, 3),
        (1, 1, 6), (4, 9, 4),
        (1, 3, 7), (5, 5, 5),
        (1, 5, 8), (5, 6, 6),
        (1, 7, 9), (5, 7, 7),
        (1, 9, 1), (5, 8, 8),
        (2, 2, 2), (5, 9, 9),
        (2, 3, 3), (6, 6, 1),
        (2, 4, 4), (7, 7, 2),
        (2, 6, 5), (7, 8, 3),
        (2, 9, 6), (8, 9, 4),
        (3, 3, 7), (9, 9, 5),
    ]
    edges = [
        Edge(source=source, sink=sink, weight=weight)
        for source, sink, weight in edge_values
    ]

    hic = Hic(file_name=file_name, tmpdir=tmpdir)
    hic.add_regions(nodes)
    hic.add_edges(edges)

    return hic
Ejemplo n.º 13
0
    def test_convenience_functions(self):
        """
        Exercise the ``FragmentReadPair`` helper predicates on four pairs:
        an inward pair on distant fragments, a pair on the same fragment,
        a pair across chromosomes, and an outward pair on adjacent fragments.
        """
        def make_pair(left, right):
            # each side is (start, end, chromosome, position, strand)
            reads = [
                FragmentRead(GenomicRegion(start=start, end=end,
                                           chromosome=chromosome),
                             position=position,
                             strand=strand)
                for start, end, chromosome, position, strand in (left, right)
            ]
            return FragmentReadPair(*reads)

        # inward-facing reads on two distant fragments of one chromosome
        pair = make_pair((1, 1000, 'chr1', 500, 1),
                         (10001, 11000, 'chr1', 10500, -1))
        assert pair.is_same_chromosome()
        assert pair.get_gap_size() == 9001
        assert pair.is_inward_pair()
        assert not pair.is_outward_pair()
        assert not pair.is_same_fragment()
        assert not pair.is_same_pair()

        # both reads on the same fragment and strand
        pair = make_pair((1, 1000, 'chr1', 500, 1),
                         (1, 1000, 'chr1', 600, 1))
        assert pair.is_same_chromosome()
        assert pair.get_gap_size() == 0
        assert not pair.is_inward_pair()
        assert not pair.is_outward_pair()
        assert pair.is_same_fragment()
        assert pair.is_same_pair()

        # reads on different chromosomes
        pair = make_pair((1, 1000, 'chr1', 500, -1),
                         (1, 1000, 'chr2', 600, 1))
        assert not pair.is_same_chromosome()
        assert pair.get_gap_size() is None
        assert not pair.is_inward_pair()
        assert not pair.is_outward_pair()
        assert not pair.is_same_fragment()
        assert not pair.is_same_pair()

        # outward-facing reads on directly adjacent fragments
        pair = make_pair((1, 1000, 'chr1', 500, -1),
                         (1001, 2000, 'chr1', 1200, 1))
        assert pair.is_same_chromosome()
        assert pair.get_gap_size() == 0
        assert not pair.is_inward_pair()
        assert pair.is_outward_pair()
        assert not pair.is_same_fragment()
        assert not pair.is_same_pair()
Ejemplo n.º 14
0
    def _subset_rows(self, key):
        """
        Iterate over a subset of regions given the specified key.

        :param key: A :class:`~GenomicRegion` object,
                    or a list of the former. Also accepts slices and integers
        :return: Iterator over the specified subset of regions
        """
        if isinstance(key, slice):
            for row in self._regions.where("(ix >= {}) & (ix < {})".format(
                    key.start, key.stop)):
                yield row
        elif isinstance(key, int):
            yield self._regions[key]
        elif isinstance(key, list) and len(key) > 0 and isinstance(
                key[0], int):
            for ix in key:
                yield self._regions[ix]
        else:
            if isinstance(key, string_types):
                key = GenomicRegion.from_string(key)

            if isinstance(key, GenomicRegion):
                keys = [key]
            else:
                keys = key

            for k in keys:
                if isinstance(k, string_types):
                    k = GenomicRegion.from_string(k)

                query = '('
                if k.chromosome is not None:
                    query += "(chromosome == b'%s') & " % k.chromosome
                if k.end is not None:
                    query += "(start <= %d) & " % k.end
                if k.start is not None:
                    query += "(end >= %d) & " % k.start
                if query.endswith(' & '):
                    query = query[:-3]
                query += ')'

                if len(query) == 2:
                    for row in self._regions:
                        yield row
                else:
                    for row in self._regions.where(query):
                        yield row
Ejemplo n.º 15
0
    def region_by_ix(self, ix):
        """
        Return the region (bin) with the given genome-wide index.

        Chromosomes are walked in the order given by ``self.chromosomes()``,
        skipping any chromosome named 'all' (case-insensitive). Each
        chromosome contributes ``ceil(length / resolution)`` bins.

        :param ix: Zero-based bin index across all chromosomes
        :return: :class:`~GenomicRegion` with 1-based ``start``, an ``end``
                 clipped to the chromosome length, and ``ix`` set to the
                 requested index
        :raises IndexError: If ``ix`` is negative or beyond the last bin
        """
        if ix < 0:
            raise IndexError("Region index {} is out of range".format(ix))

        chromosome_lengths = self.chromosome_lengths

        remaining_ix = ix
        for chromosome in self.chromosomes():
            if chromosome.lower() == 'all':
                continue

            chromosome_length = chromosome_lengths[chromosome]
            n_bins = int(np.ceil(chromosome_length / self._resolution))

            # Valid local indices are 0 .. n_bins - 1, so an index equal to
            # n_bins already belongs to the next chromosome.
            if remaining_ix < n_bins:
                start = remaining_ix * self._resolution + 1
                return GenomicRegion(chromosome=chromosome,
                                     start=start,
                                     end=min(start + self._resolution - 1,
                                             chromosome_length),
                                     ix=ix)
            remaining_ix -= n_bins

        raise IndexError("Region index {} is out of range".format(ix))
Ejemplo n.º 16
0
    def _region_subset(self, region, *args, **kwargs):
        """
        Iterate over the bins covering the given region.

        Bins start at the position returned by ``self._region_start`` and
        advance by ``self._resolution``; each bin end is clipped to the
        chromosome length and to ``region.end``.

        The bias of a bin is the inverse of its entry in the chromosome's
        normalisation vector. Bins whose entry is NaN or 0 are flagged as
        not valid and get a bias of 1.0; bins beyond the end of the vector
        stay valid with a bias of 1.0.

        :param region: Region with ``chromosome`` and ``end`` attributes
        :return: Iterator over :class:`~GenomicRegion` objects carrying
                 ``bias``, ``valid`` and ``ix``
        """
        subset_ix, subset_start = self._region_start(region)

        cl = self.chromosome_lengths[region.chromosome]
        norm = self.normalisation_vector(region.chromosome)
        for i, start in enumerate(
                range(subset_start, region.end, self._resolution)):
            end = min(start + self._resolution - 1, cl, region.end)
            # The normalisation vector is looked up by the bin's position on
            # the chromosome (bias_ix), not by the position within this
            # subset (i) - the two only coincide when the subset starts at
            # the first bin.
            bias_ix = int(start / self._resolution)

            valid = True
            bias = 1.0
            try:
                norm_value = norm[bias_ix]
            except IndexError:
                # no entry for this bin: keep the neutral bias
                pass
            else:
                if np.isnan(norm_value) or norm_value == 0:
                    valid = False
                else:
                    bias = 1 / norm_value

            r = GenomicRegion(chromosome=region.chromosome,
                              start=start,
                              end=end,
                              bias=bias,
                              valid=valid,
                              ix=int(subset_ix + i))
            yield r
Ejemplo n.º 17
0
    def setup_method(self, method):
        """
        Create the region tables used by the tests of this class.

        ``self.regions`` holds consecutive 1000 bp regions on chr1, chr2 and
        chr3; ``self.empty_regions`` is an empty table with the additional
        fields ``a`` (Int32) and ``b`` (string of length 10).

        :param method: Test method about to be run (unused)
        """
        chromosome_ends = (('chr1', 10000), ('chr2', 15000), ('chr3', 7000))

        # region starts run from 1 up to (but excluding) end - 1000
        regions = [
            GenomicRegion(start=start, end=start + 999, chromosome=name)
            for name, end in chromosome_ends
            for start in range(1, end - 1000, 1000)
        ]

        self.regions = RegionsTable()
        self.regions.add_regions(regions)
        self.empty_regions = RegionsTable(
            additional_fields=dict(a=t.Int32Col(), b=t.StringCol(10)))
Ejemplo n.º 18
0
    def test_add_additional_fields(self):
        """
        Add regions as GenomicRegion, dict and list to a table with the
        extra fields ``a`` and ``b`` and check the stored values; the list
        form carries no extra fields and must yield ``0`` and ``''``.
        """
        table = self.empty_regions

        def check(index, **expected):
            # compare every expected attribute of the stored region
            for attribute, value in expected.items():
                assert getattr(table[index], attribute) == value

        # GenomicRegion
        table.add_region(
            GenomicRegion(start=1, end=1000, chromosome='chr1', a=10, b='ten'))
        table.flush()
        check(0, start=1, end=1000, chromosome='chr1', a=10, b='ten')

        # dict
        table.add_region(dict(start=1001, end=2000, chromosome='chr1',
                              a=11, b='eleven'))
        table.flush()
        check(1, start=1001, end=2000, chromosome='chr1', a=11, b='eleven')

        # list
        table.add_region(['chr1', 2001, 3000])
        table.flush()
        check(2, start=2001, end=3000, chromosome='chr1', a=0, b='')
Ejemplo n.º 19
0
    def test_add_region(self):
        """
        Add one region each as GenomicRegion, dict and list and check that
        start, end and chromosome are stored at indices 0, 1 and 2.
        """
        table = self.empty_regions

        # (region in one of the accepted input forms, expected start, expected end)
        cases = (
            (GenomicRegion(start=1, end=1000, chromosome='chr1'), 1, 1000),
            ({'start': 1001, 'end': 2000, 'chromosome': 'chr1'}, 1001, 2000),
            (['chr1', 2001, 3000], 2001, 3000),
        )

        for index, (region, expected_start, expected_end) in enumerate(cases):
            table.add_region(region)
            table.flush()
            assert table[index].start == expected_start
            assert table[index].end == expected_end
            assert table[index].chromosome == 'chr1'
Ejemplo n.º 20
0
    def _row_to_region(self, row, lazy_region=None):
        """
        Convert a PyTables row to :class:`~GenomicRegion`.

        :param row: PyTables row object
        :param lazy_region: (optional) :class:`~LazyGenomicRegion` that is
                            used for loading attributes.
        :return: :class:`~GenomicRegion` or :class:`~LazyGenomicRegion`
        """
        if lazy_region is not None:
            lazy_region._row = row
            return lazy_region

        kwargs = {}
        for name in self._regions.colnames:
            if name not in RegionsTable.RegionDescription().columns.keys():
                value = row[name]
                value = value.decode() if isinstance(value, bytes) else value
                kwargs[name] = value

        try:
            mask_ix = row['_mask_ix']
        except (KeyError, ValueError):
            mask_ix = 0

        return GenomicRegion(chromosome=row["chromosome"].decode(),
                             start=row["start"],
                             end=row["end"],
                             ix=row["ix"],
                             _mask_ix=mask_ix,
                             **kwargs)
Ejemplo n.º 21
0
    def setup_method(self, method):
        """
        Create three ``PeakInfo`` objects sharing the same 100 bp regions.

        Each object receives one ``Peak`` of weight 1 per coordinate pair of
        its peak set; coordinates are sorted so that ``x <= y``, and source
        and sink are the floored coordinates. The objects are stored in
        ``self.peaks`` under the keys 0, 1 and 2.

        :param method: Test method about to be run (unused)
        """
        peak_sets = [
            [(10, 12), (1, 4), (15, 8)],
            [(11, 12), (5, 8), (15.2, 7.8)],
            [(10.5, 11.8), (5.5, 8.1)],
        ]

        # regions 1-100, 101-200, ..., 9901-10000 on chr1
        boundaries = np.arange(0, 10001, 100)
        regions = [
            GenomicRegion('chr1', lower + 1, upper, ix=index)
            for index, (lower, upper) in enumerate(pairwise(boundaries))
        ]

        self.peaks = {}
        for index, coordinates in enumerate(peak_sets):
            peak_info = fanc.peaks.PeakInfo()
            peak_info.add_regions(regions)

            edges = []
            for coordinate in coordinates:
                x, y = sorted(coordinate)
                edges.append(fanc.peaks.Peak(x=x,
                                             y=y,
                                             source=math.floor(x),
                                             sink=math.floor(y),
                                             weight=1))
            peak_info.add_edges(edges)
            peak_info.flush()
            self.peaks[index] = peak_info
Ejemplo n.º 22
0
    def _convert_key(self, key, region_trees):
        """
        Translate a region key into a slice of region indices.

        :param key: Region string, :class:`~GenomicRegion`, or any other
                    key. Other keys are returned unchanged.
        :param region_trees: Mapping from chromosome name to an interval
                             structure that supports ``[start:end]`` lookups
                             and whose intervals carry the region index in
                             ``data``
        :return: ``slice(first_ix, last_ix + 1, 1)`` covering all intervals
                 found for the region, or ``key`` itself for non-region keys
        :raises ValueError: If the chromosome is unknown or no interval
                            is found for the region
        """
        if isinstance(key, string_types):
            key = GenomicRegion.from_string(key)

        if not isinstance(key, GenomicRegion):
            return key

        try:
            # shift the start down by one, clamped at 0; no start means 0
            key_start = 0 if key.start is None else max(0, key.start - 1)
            hits = [
                interval.data
                for interval in region_trees[key.chromosome][key_start:key.end]
            ]
        except KeyError:
            raise ValueError("Requested chromosome {} was not "
                             "found in this matrix.".format(
                                 key.chromosome))

        if not hits:
            raise ValueError(
                "Requested region {} was not found in this matrix.".format(
                    key))

        return slice(min(hits), max(hits) + 1, 1)
Ejemplo n.º 23
0
def sample_hic_matrix2(file_name=None, tmpdir=None):
    """
    Build a 10-region sample ``Hic`` object on chr1 in which regions 4 and 7
    have no edges at all.

    The upper triangle of the resulting matrix is shown below
    (0 = no edge, - = row/column of a region without any edges).

    :param file_name: Passed on to ``Hic(file_name=...)``
    :param tmpdir: Passed on to ``Hic(tmpdir=...)``
    :return: The populated ``Hic`` object
    """
    #     0 1 2 3 4 5 6 7 8 9
    #   #####################
    # 0 # 0 1 0 2 - 3 0 - 0 5
    # 1 #   6 0 7 - 8 0 - 0 1
    # 2 #     2 3 - 0 5 - 0 6
    # 3 #       7 - 8 9 - 1 0
    # 4 #         - - - - - -
    # 5 #           5 6 - 8 9
    # 6 #             1 - 0 0
    # 7 #               - - -
    # 8 #                 0 4
    # 9 #                   5

    # ten consecutive 1000 bp regions: 1-1000, 1001-2000, ..., 9001-10000
    nodes = [
        GenomicRegion('chr1', first, first + 999)
        for first in range(1, 10001, 1000)
    ]

    # (source, sink, weight), kept in the original insertion order
    edge_values = [
        (0, 1, 1), (3, 5, 8),
        (0, 3, 2), (3, 6, 9),
        (0, 5, 3), (3, 8, 1),
        (0, 9, 5),
        (1, 1, 6),
        (1, 3, 7), (5, 5, 5),
        (1, 5, 8), (5, 6, 6),
        (1, 9, 1), (5, 8, 8),
        (2, 2, 2), (5, 9, 9),
        (2, 3, 3), (6, 6, 1),
        (2, 6, 5),
        (2, 9, 6), (8, 9, 4),
        (3, 3, 7), (9, 9, 5),
    ]
    edges = [
        Edge(source=source, sink=sink, weight=weight)
        for source, sink, weight in edge_values
    ]

    hic = Hic(file_name=file_name, tmpdir=tmpdir)
    hic.add_regions(nodes)
    hic.add_edges(edges)

    return hic
Ejemplo n.º 24
0
def contact_directionality_bias(hic, regions, distance=1000000, region_anchor='center', **kwargs):
    """Compare downstream vs. upstream contact strength around region anchors.

    Every region is reduced to a single-position anchor (the attribute named
    by ``region_anchor``) and paired with that anchor expanded by ``distance``.
    Pairs are split by ``region.is_forward()`` and handed to
    ``extract_submatrices``; row 0 of each returned matrix is summed per bin
    and divided by the number of contributing entries (unmasked entries when
    the matrix has a ``mask`` attribute, otherwise every entry).

    :param hic: matrix object providing ``bin_size``; passed on to
                ``extract_submatrices``
    :param regions: iterable of regions exposing ``chromosome``, ``strand``,
                    ``is_forward()`` and the ``region_anchor`` attribute
    :param distance: expansion applied on the anchor, also used to size the
                     profile (``int(distance / bin_size) * 2 + 1`` bins)
    :param region_anchor: name of the region attribute used as anchor position
    :param kwargs: forwarded unchanged to ``extract_submatrices``
    :return: tuple ``(d, ratio_forward, ratio_reverse)`` of equally long lists;
             ``d[k]`` is ``k * bin_size`` and each ratio is the average at
             ``center + k`` divided by the average at ``center - k``
    """
    forward_region_pairs, reverse_region_pairs = [], []
    for region in regions:
        anchor = int(getattr(region, region_anchor))
        point = GenomicRegion(chromosome=region.chromosome, start=anchor, end=anchor, strand=region.strand)
        bucket = forward_region_pairs if region.is_forward() else reverse_region_pairs
        bucket.append((point, point.expand(absolute=distance)))

    def _sum_and_count(region_pairs):
        # Per-bin running sum of row 0 and per-bin number of contributions.
        n_bins = int(distance / hic.bin_size) * 2 + 1
        running_sum = np.zeros(n_bins)
        n_values = np.zeros(n_bins)
        for submatrix in extract_submatrices(hic, region_pairs, **kwargs):
            running_sum += submatrix[0, :]
            if hasattr(submatrix, 'mask'):
                # only unmasked entries count towards the average
                n_values += (~submatrix.mask).astype('int')[0, :]
            else:
                n_values += np.ones(n_values.shape)
        return running_sum, n_values

    forward_sum, forward_n = _sum_and_count(forward_region_pairs)
    reverse_sum, reverse_n = _sum_and_count(reverse_region_pairs)

    avg_forward = forward_sum / forward_n
    avg_reverse = reverse_sum / reverse_n

    step = hic.bin_size
    mid = int(len(avg_forward) / 2)
    offsets = range(mid + 1)

    d = [offset * step for offset in offsets]
    ratio_forward = [avg_forward[mid + offset] / avg_forward[mid - offset] for offset in offsets]
    ratio_reverse = [avg_reverse[mid + offset] / avg_reverse[mid - offset] for offset in offsets]

    return d, ratio_forward, ratio_reverse
Ejemplo n.º 25
0
    def test_save_and_load(self, tmpdir):
        """Write two regions and one edge to a file, reopen it, count both."""
        dest_file = "/".join((str(tmpdir), "hic.h5"))

        # write phase: regions are flushed before the edge referencing them
        written = self.hic_class(file_name=dest_file, mode='w')
        for chromosome in ('chr1', 'chr2'):
            written.add_region(GenomicRegion(chromosome, 1, 1000))
        written.flush()
        written.add_edge_simple(0, 1, 1)
        written.flush()
        written.close()

        # read phase: reopen the same file read-only and count its contents
        reloaded = self.hic_class(dest_file, mode='r')
        region_count = len(list(reloaded.regions()))
        edge_count = len(list(reloaded.edges()))
        assert region_count == 2
        assert edge_count == 1

        reloaded.close()
Ejemplo n.º 26
0
    def _mouse_release_event(self, event):
        """Refresh the plot when the axis limits changed since the last call.

        The current x/y limits of ``self.ax`` are compared with the remembered
        ``_last_xlim`` / ``_last_ylim``. If either differs, the new limits are
        stored, turned into one region per axis (start below end, on the
        currently plotted chromosomes) and passed to ``self.refresh``.

        :param event: the triggering event; not inspected here
        """
        xlim = self.ax.get_xlim()
        ylim = self.ax.get_ylim()

        limits_changed = xlim != self._last_xlim or ylim != self._last_ylim
        if not limits_changed:
            return

        self._last_xlim = xlim
        self._last_ylim = ylim

        def _ascending(limits):
            # an axis may be inverted, so put the smaller limit first
            first, second = limits[0], limits[1]
            return (first, second) if first < second else (second, first)

        x_start, x_end = _ascending(xlim)
        x_region = GenomicRegion(chromosome=self._current_chromosome_x,
                                 start=x_start,
                                 end=x_end)

        y_start, y_end = _ascending(ylim)
        y_region = GenomicRegion(chromosome=self._current_chromosome_y,
                                 start=y_start,
                                 end=y_end)

        self.refresh(region=(x_region, y_region))
Ejemplo n.º 27
0
    def test_get_node_x_by_region(self):
        """Check how many regions ``self.hic.regions`` yields per query."""
        # (region string, number of regions expected from the fixture)
        expected_counts = (
            ('chr1', 5),
            ('chr2', 3),
            ('chr3', 4),
            ('chr1:3452-6000', 2),
            ('chr1:1-51000', 5),
        )

        for region_string, expected in expected_counts:
            query = GenomicRegion.from_string(region_string)
            matches = list(self.hic.regions(query))
            assert len(matches) == expected
Ejemplo n.º 28
0
    def _edges_subset(self,
                      key=None,
                      row_regions=None,
                      col_regions=None,
                      lazy=False,
                      *args,
                      **kwargs):
        """Yield the edges between a run of row regions and column regions.

        The first and last region of each list define one spanning region per
        axis; ``self._read_matrix`` is queried with those two spans and every
        returned ``(x, y, weight)`` entry is reported with the smaller index
        as source.

        :param key: not used by this method
        :param row_regions: sequence of regions (with ``ix``, ``chromosome``,
                            ``start``, ``end``) on a single chromosome
        :param col_regions: same as ``row_regions`` for the column axis
        :param lazy: if falsy, yield a new ``Edge`` per entry whose source and
                     sink are the region objects looked up by index; if
                     truthy, yield one ``LazyJuicerEdge`` that is updated in
                     place with the raw indices and weight on every step
        :param args: not used by this method
        :param kwargs: not used by this method
        :raises ValueError: if the first and last region of an axis are on
                            different chromosomes
        """
        first_row, last_row = row_regions[0], row_regions[-1]
        if first_row.chromosome != last_row.chromosome:
            raise ValueError("Cannot subset rows across multiple chromosomes!")

        first_col, last_col = col_regions[0], col_regions[-1]
        if first_col.chromosome != last_col.chromosome:
            raise ValueError(
                "Cannot subset columns across multiple chromosomes!")

        # index -> region lookup; column regions override rows on equal index
        regions_by_ix = {
            region.ix: region for region in row_regions + col_regions
        }

        row_span = GenomicRegion(chromosome=first_row.chromosome,
                                 start=first_row.start,
                                 end=last_row.end)
        col_span = GenomicRegion(chromosome=first_col.chromosome,
                                 start=first_col.start,
                                 end=last_col.end)

        def _upper_triangle_entries():
            # normalise every entry so that source index <= sink index
            for i, j, value in self._read_matrix(row_span, col_span):
                if i > j:
                    i, j = j, i
                yield i, j, value

        if lazy:
            # a single edge object is recycled; callers must consume or copy
            # its fields before advancing the generator
            edge = LazyJuicerEdge(source=0, sink=0, weight=1.0, matrix=self)
            for source_ix, sink_ix, weight in _upper_triangle_entries():
                edge.source, edge.sink, edge.weight = source_ix, sink_ix, weight
                yield edge
        else:
            for source_ix, sink_ix, weight in _upper_triangle_entries():
                yield Edge(source=regions_by_ix[source_ix],
                           sink=regions_by_ix[sink_ix],
                           weight=weight)
Ejemplo n.º 29
0
    def setup_method(self, method):
        """Create a 12-bin Hic object and the matching dense RegionMatrix.

        Stores the Hic object as ``self.hic`` and a symmetric 12x12
        ``RegionMatrix`` with the same weights as ``self.m``.

        :param method: supplied by the test runner; not used here
        """
        hic = Hic()

        # (chromosome, range stop, range step): 5 bins on chr1, 3 on chr2 and
        # 4 on chr3 -> 12 nodes in total. Every bin is 1000 bp wide; chr3 bins
        # start only 500 bp apart, so neighbouring chr3 bins overlap.
        bin_layout = (
            ("chr1", 5000, 1000),
            ("chr2", 3000, 1000),
            ("chr3", 2000, 500),
        )
        nodes = [
            GenomicRegion(chromosome=chromosome, start=start, end=start + 1000 - 1)
            for chromosome, stop, step in bin_layout
            for start in range(1, stop, step)
        ]
        # rows and columns reference the very same region objects
        row_regions = list(nodes)
        col_regions = list(nodes)
        hic.add_regions(nodes)

        # one edge per upper-triangle bin pair (diagonal included), weights
        # counting up from 1; the dense matrix mirrors them symmetrically
        m = np.zeros((12, 12))
        edges = []
        bin_pairs = ((i, j)
                     for i in range(len(nodes))
                     for j in range(i, len(nodes)))
        for weight, (i, j) in enumerate(bin_pairs, 1):
            edges.append(Edge(source=i, sink=j, weight=weight))
            m[i, j] = weight
            m[j, i] = weight

        hic.add_edges(edges)

        self.hic = hic
        self.m = RegionMatrix(m,
                              row_regions=row_regions,
                              col_regions=col_regions)
Ejemplo n.º 30
0
    def test_merge(self):
        """Merge ``self.hic`` with copies of an identically built object.

        After merging two (three) objects, every non-zero entry of the
        original matrix must equal half (a third) of the merged entry.
        """
        hic = self.hic_class()

        # (chromosome, range stop, range step): 5 + 3 + 4 = 12 nodes
        bin_layout = (
            ("chr1", 5000, 1000),
            ("chr2", 3000, 1000),
            ("chr3", 2000, 500),
        )
        nodes = [
            GenomicRegion(chromosome=chromosome, start=start, end=start + 1000 - 1)
            for chromosome, stop, step in bin_layout
            for start in range(1, stop, step)
        ]
        hic.add_regions(nodes, preserve_attributes=False)

        # one edge per upper-triangle bin pair, weights counting up from 1
        bin_pairs = ((i, j)
                     for i in range(len(nodes))
                     for j in range(i, len(nodes)))
        edges = [
            Edge(source=i, sink=j, weight=weight)
            for weight, (i, j) in enumerate(bin_pairs, 1)
        ]
        hic.add_edges(edges)

        # merge first, then release the temporary object
        merged_hic_2x = Hic.merge([self.hic, hic])
        merged_hic_3x = Hic.merge([self.hic, hic, hic])
        hic.close()

        m = self.hic[:, :]
        m_merged_2x = merged_hic_2x[:, :]
        m_merged_3x = merged_hic_3x[:, :]

        n_rows, n_cols = m.shape[0], m.shape[1]
        for i in range(n_rows):
            for j in range(n_cols):
                original = m[i, j]
                assert original == 0 or original == m_merged_2x[i, j] / 2
                assert original == 0 or original == m_merged_3x[i, j] / 3

        merged_hic_2x.close()
        merged_hic_3x.close()