Example #1
0
    def test_constructors_errors(self):
        """Check that the constructors reject malformed input.

        ``TypeError`` is expected when the input does not consist of
        intervals, ``ValueError`` when interval-like input is inconsistent.
        """
        # a bare scalar
        scalar = 5
        with pytest.raises(TypeError):
            IntervalIndex(scalar)

        # plain integers are not Interval objects
        non_intervals = [0, 1]
        with pytest.raises(TypeError):
            IntervalIndex(non_intervals)

        with pytest.raises(TypeError):
            IntervalIndex.from_intervals(non_intervals)

        # unrecognised value for `closed`
        bad_closed = 'invalid'
        with pytest.raises(ValueError):
            IntervalIndex.from_arrays([0, 1], [1, 2], closed=bad_closed)

        # intervals that disagree on `closed`
        mixed_closed = [Interval(0, 1), Interval(1, 2, closed='left')]
        with pytest.raises(ValueError):
            IntervalIndex.from_intervals(mixed_closed)

        # NOTE(review): presumably rejected because the second left endpoint
        # (10) exceeds its right endpoint (5) -- confirm against from_arrays
        lefts, rights = [0, 10], [3, 5]
        with pytest.raises(ValueError):
            IntervalIndex.from_arrays(lefts, rights)

        mixed_closed = [Interval(0, 1), Interval(2, 3, closed='left')]
        with pytest.raises(ValueError):
            Index(mixed_closed)

        # no point in nesting periods in an IntervalIndex
        periods = pd.period_range('2000-01-01', periods=3)
        with pytest.raises(ValueError):
            IntervalIndex.from_breaks(periods)
Example #2
0
    def test_constructors_errors_string(self, data):
        """Every constructor must raise ``TypeError`` with the same message.

        ``data`` is the sequence of break points supplied by the fixture;
        adjacent elements are paired up as left/right endpoints.
        """
        # GH 19016
        lower, upper = data[:-1], data[1:]
        pairs = lzip(lower, upper)
        # fall back to the raw data when there are no pairs to build from
        intervals = [Interval(lo, hi) for lo, hi in pairs] or data
        expected_msg = ('category, object, and string subtypes are not supported '
                        'for IntervalIndex')

        # (constructor, positional arguments), checked in this order
        attempts = [
            (IntervalIndex, (intervals,)),
            (Index, (intervals,)),
            (IntervalIndex.from_intervals, (intervals,)),
            (IntervalIndex.from_breaks, (data,)),
            (IntervalIndex.from_arrays, (lower, upper)),
            (IntervalIndex.from_tuples, (pairs,)),
        ]
        for constructor, args in attempts:
            with tm.assert_raises_regex(TypeError, expected_msg):
                constructor(*args)
Example #3
0
    def test_constructors(self, data, closed, name):
        """Every construction path must produce the same IntervalIndex.

        ``data`` supplies the break points: adjacent elements become the
        left/right endpoints.  A reference index is built with
        ``_simple_new``, sanity-checked, and then compared against each
        constructor, first from the raw inputs and then from the reference
        index itself.
        """
        lower, upper = data[:-1], data[1:]
        intervals = [Interval(lo, hi, closed=closed)
                     for lo, hi in lzip(lower, upper)]
        expected = IntervalIndex._simple_new(
            left=lower, right=upper, closed=closed, name=name)

        # make sure the reference index is what we think it is
        assert expected.closed == closed
        assert expected.name == name
        assert expected.dtype.subtype == data.dtype
        tm.assert_index_equal(expected.left, lower)
        tm.assert_index_equal(expected.right, upper)

        # constructors fed with the raw inputs
        opts = dict(closed=closed, name=name)

        built = IntervalIndex(intervals, name=name)
        tm.assert_index_equal(built, expected)

        built = IntervalIndex.from_intervals(intervals, name=name)
        tm.assert_index_equal(built, expected)

        built = IntervalIndex.from_breaks(data, **opts)
        tm.assert_index_equal(built, expected)

        built = IntervalIndex.from_arrays(lower, upper, **opts)
        tm.assert_index_equal(built, expected)

        built = IntervalIndex.from_tuples(lzip(lower, upper), **opts)
        tm.assert_index_equal(built, expected)

        # the generic Index constructor must infer an IntervalIndex
        built = Index(intervals, name=name)
        assert isinstance(built, IntervalIndex)
        tm.assert_index_equal(built, expected)

        # round trips: rebuilding from the reference index changes nothing
        tm.assert_index_equal(Index(expected), expected)
        tm.assert_index_equal(IntervalIndex(expected), expected)

        built = IntervalIndex.from_intervals(expected)
        tm.assert_index_equal(built, expected)

        built = IntervalIndex.from_intervals(
            expected.values, name=expected.name)
        tm.assert_index_equal(built, expected)

        opts = dict(closed=expected.closed, name=expected.name)

        built = IntervalIndex.from_arrays(
            expected.left, expected.right, **opts)
        tm.assert_index_equal(built, expected)

        built = IntervalIndex.from_tuples(expected.to_tuples(), **opts)
        tm.assert_index_equal(built, expected)

        # all left endpoints plus the final right endpoint
        break_points = expected.left.tolist() + [expected.right[-1]]
        built = IntervalIndex.from_breaks(break_points, **opts)
        tm.assert_index_equal(built, expected)
Example #4
0
    def test_missing_values(self):
        """NaN entries are treated as missing intervals.

        An index built from objects and one built from endpoint arrays must
        agree, and ``isnull`` must flag only the NaN position.
        """
        from_objects = pd.Index(
            [np.nan, pd.Interval(0, 1), pd.Interval(1, 2)])
        from_bounds = pd.IntervalIndex.from_arrays(
            [np.nan, 0, 1], [np.nan, 1, 2])
        assert from_objects.equals(from_bounds)

        # these endpoints pair a NaN on the left with a number on the right
        # and must be rejected
        lefts = [np.nan, 0, 1]
        rights = np.array([0, 1, 2])
        with pytest.raises(ValueError):
            IntervalIndex.from_arrays(lefts, rights)

        null_mask = isnull(from_objects)
        tm.assert_numpy_array_equal(null_mask,
                                    np.array([True, False, False]))
Example #5
0
    def test_union(self):
        """Union with another index, in either order, and with itself."""
        extra = IntervalIndex.from_arrays([2], [3])
        combined = IntervalIndex.from_arrays(range(3), range(1, 4))

        # the result must not depend on which operand calls union
        forward = self.index.union(extra)
        self.assertTrue(combined.equals(forward))

        backward = extra.union(self.index)
        self.assertTrue(combined.equals(backward))

        # union with itself, or with a leading slice of itself, adds nothing
        tm.assert_index_equal(self.index.union(self.index), self.index)
        leading = self.index[:1]
        tm.assert_index_equal(self.index.union(leading), self.index)
Example #6
0
    def test_missing_values(self, closed):
        """NaN entries are treated as missing intervals for every ``closed``.

        An index built from objects and one built from endpoint arrays must
        agree, and ``isna`` must flag only the NaN position.
        """
        first = Interval(0, 1, closed=closed)
        second = Interval(1, 2, closed=closed)
        from_objects = Index([np.nan, first, second])
        from_bounds = IntervalIndex.from_arrays(
            [np.nan, 0, 1], [np.nan, 1, 2], closed=closed)
        assert from_objects.equals(from_bounds)

        # these endpoints pair a NaN on the left with a number on the right
        # and must be rejected
        lefts = [np.nan, 0, 1]
        rights = np.array([0, 1, 2])
        with pytest.raises(ValueError):
            IntervalIndex.from_arrays(lefts, rights, closed=closed)

        na_mask = isna(from_objects)
        tm.assert_numpy_array_equal(na_mask,
                                    np.array([True, False, False]))
Example #7
0
    def test_union(self, closed):
        """Union with another index, in either order, and with itself."""
        base = self.create_index(closed=closed)
        extra = IntervalIndex.from_arrays([2], [3], closed=closed)
        combined = IntervalIndex.from_arrays(
            range(3), range(1, 4), closed=closed)

        # the result must not depend on which operand calls union
        forward = base.union(extra)
        assert combined.equals(forward)

        backward = extra.union(base)
        assert combined.equals(backward)

        # union with itself, or with a leading slice of itself, adds nothing
        for subset in (base, base[:1]):
            tm.assert_index_equal(base.union(subset), base)
Example #8
0
    def test_sort_index_intervals(self):
        s = Series([np.nan, 1, 2, 3], IntervalIndex.from_arrays(
            [0, 1, 2, 3],
            [1, 2, 3, 4]))

        result = s.sort_index()
        expected = s
        assert_series_equal(result, expected)

        result = s.sort_index(ascending=False)
        expected = Series([3, 2, 1, np.nan], IntervalIndex.from_arrays(
            [3, 2, 1, 0],
            [4, 3, 2, 1]))
        assert_series_equal(result, expected)
Example #9
0
    def test_constructor_errors(self):
        """``from_arrays`` must reject categorical and unequal-length input."""
        # GH 19016: categorical data
        categories = Categorical(list('01234abcde'), ordered=True)
        lower, upper = categories[:-1], categories[1:]
        subtype_msg = ('category, object, and string subtypes are not supported '
                       'for IntervalIndex')
        with tm.assert_raises_regex(TypeError, subtype_msg):
            IntervalIndex.from_arrays(lower, upper)

        # unequal length: three left endpoints, two right endpoints
        lefts, rights = [0, 1, 2], [2, 3]
        length_msg = 'left and right must have the same length'
        with tm.assert_raises_regex(ValueError, length_msg):
            IntervalIndex.from_arrays(lefts, rights)
Example #10
0
    def test_constructors(self, closed, name):
        """Every construction path must produce the same IntervalIndex.

        The reference index covers the four unit intervals between 0 and 4
        and is built with ``_simple_new``; each constructor is then compared
        against it, first from raw inputs and then from the reference itself.
        """
        lower, upper = Index([0, 1, 2, 3]), Index([1, 2, 3, 4])
        intervals = [Interval(lo, hi, closed=closed)
                     for lo, hi in zip(lower, upper)]
        expected = IntervalIndex._simple_new(
            left=lower, right=upper, closed=closed, name=name)

        # constructors fed with the raw inputs
        opts = dict(closed=closed, name=name)

        built = IntervalIndex(intervals, name=name)
        tm.assert_index_equal(built, expected)

        built = IntervalIndex.from_intervals(intervals, name=name)
        tm.assert_index_equal(built, expected)

        built = IntervalIndex.from_breaks(np.arange(5), **opts)
        tm.assert_index_equal(built, expected)

        built = IntervalIndex.from_arrays(lower.values, upper.values, **opts)
        tm.assert_index_equal(built, expected)

        built = IntervalIndex.from_tuples(zip(lower, upper), **opts)
        tm.assert_index_equal(built, expected)

        # the generic Index constructor must infer an IntervalIndex
        built = Index(intervals, name=name)
        assert isinstance(built, IntervalIndex)
        tm.assert_index_equal(built, expected)

        # round trips: rebuilding from the reference index changes nothing
        tm.assert_index_equal(Index(expected), expected)
        tm.assert_index_equal(IntervalIndex(expected), expected)

        built = IntervalIndex.from_intervals(
            expected.values, name=expected.name)
        tm.assert_index_equal(built, expected)

        opts = dict(closed=expected.closed, name=expected.name)

        built = IntervalIndex.from_arrays(
            expected.left, expected.right, **opts)
        tm.assert_index_equal(built, expected)

        built = IntervalIndex.from_tuples(expected.to_tuples(), **opts)
        tm.assert_index_equal(built, expected)

        # all left endpoints plus the final right endpoint
        break_points = expected.left.tolist() + [expected.right[-1]]
        built = IntervalIndex.from_breaks(break_points, **opts)
        tm.assert_index_equal(built, expected)
Example #11
0
    def test_constructors_errors_tz(self, tz_left, tz_right):
        """Array-based constructors must reject mixed-timezone endpoints.

        ``left`` is localized with ``tz_left`` and ``right`` with
        ``tz_right`` (presumably the fixtures supply differing zones --
        confirm against the fixture definitions).  ``from_arrays``,
        ``from_tuples`` and ``from_breaks`` are each expected to raise
        ``ValueError``.
        """
        # GH 18537
        left = date_range('2017-01-01', periods=4, tz=tz_left)
        right = date_range('2017-01-02', periods=4, tz=tz_right)

        # Prepare every input before entering ``pytest.raises`` so that each
        # block wraps only the constructor under test; a ValueError raised
        # while building the inputs would otherwise let the test pass for
        # the wrong reason.
        tuples = lzip(left, right)
        breaks = left.tolist() + [right[-1]]

        # don't need to check IntervalIndex(...) or from_intervals, since
        # mixed tz are disallowed at the Interval level
        with pytest.raises(ValueError):
            IntervalIndex.from_arrays(left, right)

        with pytest.raises(ValueError):
            IntervalIndex.from_tuples(tuples)

        with pytest.raises(ValueError):
            IntervalIndex.from_breaks(breaks)
    def test_astype(self):
        """Cast the index to category, object and interval dtypes."""
        cat_index = self.create_index()

        # casting to 'category' must give back an exactly equal index
        as_category = cat_index.astype('category')
        tm.assert_index_equal(as_category, cat_index, exact=True)

        as_object = cat_index.astype(object)
        tm.assert_index_equal(as_object, Index(np.array(cat_index)))

        # this IS equal, but not the same class
        assert as_object.equals(cat_index)
        assert isinstance(as_object, Index)
        assert not isinstance(as_object, CategoricalIndex)

        # interval: a categorical index whose categories are intervals
        intervals = IntervalIndex.from_arrays(left=[-0.001, 2.0],
                                              right=[2, 4],
                                              closed='right')
        codes = Categorical.from_codes(
            [0, 1, -1], categories=intervals, ordered=True)
        interval_cat_index = CategoricalIndex(codes)

        as_interval = interval_cat_index.astype('interval')
        expected = intervals.take([0, 1, -1])
        tm.assert_index_equal(as_interval, expected)

        # rebuilding from the raw values must give the same index
        rebuilt = IntervalIndex.from_intervals(as_interval.values)
        tm.assert_index_equal(rebuilt, expected)
Example #13
0
    def test_constructors_empty(self, data, closed):
        """All constructors must agree on an empty input.

        The subtype is taken from ``data.dtype`` when ``data`` has one and
        is otherwise expected to be ``np.int64``.
        """
        # GH 18421
        subtype = getattr(data, 'dtype', np.int64)
        no_values = np.array([], dtype=object)
        reference = IntervalIndex(data, closed=closed)

        # validate the reference index
        assert reference.empty
        assert reference.closed == closed
        assert reference.dtype.subtype == subtype
        tm.assert_numpy_array_equal(reference.values, no_values)

        # (constructor, positional arguments), checked in this order
        builders = [
            (IntervalIndex.from_tuples, (data,)),
            (IntervalIndex.from_breaks, (data,)),
            (IntervalIndex.from_arrays, (data, data)),
        ]
        for build, args in builders:
            built = build(*args, closed=closed)
            tm.assert_index_equal(built, reference)
            tm.assert_numpy_array_equal(built.values, no_values)

        if closed == 'right':
            # Can't specify closed for IntervalIndex.from_intervals
            built = IntervalIndex.from_intervals(data)
            tm.assert_index_equal(built, reference)
            tm.assert_numpy_array_equal(built.values, no_values)
Example #14
0
    def test_constructors_nan(self, closed, data):
        """All constructors must agree when the input holds only missing values."""
        # GH 18421
        expected_values = np.array(data, dtype=object)
        expected_idx = IntervalIndex(data, closed=closed)

        # validate the expected index
        assert expected_idx.closed == closed
        tm.assert_numpy_array_equal(expected_idx.values, expected_values)

        def check(built):
            # both the index itself and its raw values have to match
            tm.assert_index_equal(built, expected_idx)
            tm.assert_numpy_array_equal(built.values, expected_values)

        check(IntervalIndex.from_tuples(data, closed=closed))
        # one extra break point, since breaks outnumber intervals by one
        check(IntervalIndex.from_breaks([np.nan] + data, closed=closed))
        check(IntervalIndex.from_arrays(data, data, closed=closed))

        if closed == 'right':
            # Can't specify closed for IntervalIndex.from_intervals
            check(IntervalIndex.from_intervals(data))
Example #15
0
 def test_subtype_conversion(self, index, subtype):
     dtype = IntervalDtype(subtype)
     result = index.astype(dtype)
     expected = IntervalIndex.from_arrays(index.left.astype(subtype),
                                          index.right.astype(subtype),
                                          closed=index.closed)
     tm.assert_index_equal(result, expected)
Example #16
0
    def test_constructors(self):
        """Each constructor must reproduce ``self.index``."""
        reference = self.index
        breaks = np.arange(3)
        unit_intervals = [Interval(0, 1), Interval(1, 2)]

        right_closed = IntervalIndex.from_breaks(breaks, closed='right')
        self.assertTrue(reference.equals(right_closed))

        # same break points but closed on the other side must not compare equal
        left_closed = IntervalIndex.from_breaks(breaks, closed='left')
        self.assertFalse(reference.equals(left_closed))

        from_objects = IntervalIndex.from_intervals(unit_intervals)
        self.assertTrue(reference.equals(from_objects))

        from_objects = IntervalIndex(unit_intervals)
        self.assertTrue(reference.equals(from_objects))

        lefts = np.arange(2)
        rights = np.arange(2) + 1
        from_bounds = IntervalIndex.from_arrays(lefts, rights,
                                                closed='right')
        self.assertTrue(reference.equals(from_bounds))

        # the generic Index constructor must infer an IntervalIndex
        inferred = Index(unit_intervals)
        assert isinstance(inferred, IntervalIndex)
        self.assertTrue(reference.equals(inferred))

        inferred = Index(reference)
        assert isinstance(inferred, IntervalIndex)
        self.assertTrue(reference.equals(inferred))
Example #17
0
    def test_take(self):
        """``take`` with in-order positions and with a repeated position."""
        # positions 0 and 1 in order must reproduce the index
        in_order = self.index.take([0, 1])
        self.assertTrue(self.index.equals(in_order))

        # a repeated position repeats the corresponding interval
        wanted = IntervalIndex.from_arrays([0, 0, 1], [1, 1, 2])
        with_repeat = self.index.take([0, 0, 1])
        self.assertTrue(wanted.equals(with_repeat))
Example #18
0
    def test_itemsize(self):
        """``itemsize`` must count both 8-byte endpoints of an interval."""
        # GH 19209
        lefts = np.arange(0, 4, dtype='i8')
        rights = np.arange(1, 5, dtype='i8')
        index = IntervalIndex.from_arrays(lefts, rights)

        bytes_per_endpoint = 8
        assert index.itemsize == 2 * bytes_per_endpoint
Example #19
0
    def setup(self, N):
        """Build the interval indexes stored on ``self`` for size ``N``.

        Sets ``self.intv`` (``N`` unit-width intervals followed by a repeat
        of 0..1), ``self.left`` (breaks ``0 .. N-1``) and ``self.right``
        (breaks ``N-3 .. 2N-4``).
        """
        starts = np.arange(N)
        stops = np.arange(1, N + 1)
        # append one more interval from 0 to 1, duplicating the first one
        starts = np.append(starts, np.array(0))
        stops = np.append(stops, np.array(1))
        self.intv = IntervalIndex.from_arrays(starts, stops)
        # NOTE(review): bare attribute access, presumably to build and cache
        # the engine here rather than later -- confirm
        self.intv._engine

        left_breaks = np.arange(N)
        right_breaks = np.arange(N - 3, 2 * N - 3)
        self.left = IntervalIndex.from_breaks(left_breaks)
        self.right = IntervalIndex.from_breaks(right_breaks)
Example #20
0
 def test_subtype_integer(self, subtype_start, subtype_end):
     index = IntervalIndex.from_breaks(np.arange(100, dtype=subtype_start))
     dtype = IntervalDtype(subtype_end)
     result = index.astype(dtype)
     expected = IntervalIndex.from_arrays(index.left.astype(subtype_end),
                                          index.right.astype(subtype_end),
                                          closed=index.closed)
     tm.assert_index_equal(result, expected)
Example #21
0
    def test_nbytes(self):
        # GH 19209
        left = np.arange(0, 4, dtype='i8')
        right = np.arange(1, 5, dtype='i8')

        result = IntervalIndex.from_arrays(left, right).nbytes
        expected = 64  # 4 * 8 * 2
        assert result == expected
Example #22
0
    def test_dropna_intervals(self):
        s = Series([np.nan, 1, 2, 3], IntervalIndex.from_arrays(
            [np.nan, 0, 1, 2],
            [np.nan, 1, 2, 3]))

        result = s.dropna()
        expected = s.iloc[1:]
        assert_series_equal(result, expected)
Example #23
0
    def test_take(self, closed):
        """``take`` with in-order positions and with a repeated position."""
        idx = self.create_index(closed=closed)

        # positions 0 and 1 in order must reproduce the index
        in_order = idx.take([0, 1])
        tm.assert_index_equal(in_order, idx)

        # a repeated position repeats the corresponding interval
        wanted = IntervalIndex.from_arrays(
            [0, 0, 1], [1, 1, 2], closed=closed)
        with_repeat = idx.take([0, 0, 1])
        tm.assert_index_equal(with_repeat, wanted)
Example #24
0
    def test_append(self):
        """Append one index, a list of indexes, and an index closed differently."""
        first = IntervalIndex.from_arrays([0, 1], [1, 2])
        second = IntervalIndex.from_arrays([1, 2], [2, 3])

        # appending a single index concatenates the intervals
        joined = first.append(second)
        wanted = IntervalIndex.from_arrays([0, 1, 1, 2], [1, 2, 2, 3])
        tm.assert_index_equal(joined, wanted)

        # appending a list concatenates all of them in order
        joined = first.append([first, second])
        wanted = IntervalIndex.from_arrays([0, 1, 0, 1, 1, 2],
                                           [1, 2, 1, 2, 2, 3])
        tm.assert_index_equal(joined, wanted)

        # an index closed on both sides cannot be appended to ``first``
        both_closed = IntervalIndex.from_arrays([0, 1], [1, 2],
                                                closed='both')
        self.assertRaises(ValueError, first.append, both_closed)
Example #25
0
    def test_take(self, closed):
        """``take`` with positions 0..9 in order and with a repeated position."""
        idx = self.create_index(closed=closed)

        # NOTE(review): presumably create_index yields exactly ten intervals,
        # so taking 0..9 reproduces the index -- confirm
        in_order = idx.take(range(10))
        tm.assert_index_equal(in_order, idx)

        # a repeated position repeats the corresponding interval
        with_repeat = idx.take([0, 0, 1])
        wanted = IntervalIndex.from_arrays(
            [0, 0, 1], [1, 1, 2], closed=closed)
        tm.assert_index_equal(with_repeat, wanted)
Example #26
0
    def test_get_reindexer_datetimelike(self, arrays):
        """``_get_reindexer`` must locate sub-intervals at both ends of the index.

        The target holds the first twelve hours of the first interval and
        the last twelve hours of the last interval.
        """
        # GH 20636
        index = IntervalIndex.from_arrays(*arrays)
        half_day = pd.Timedelta('12H')
        first, last = index[0], index[-1]

        head = (first.left, first.left + half_day)
        tail = (last.right - half_day, last.right)
        target = IntervalIndex.from_tuples([head, tail])

        positions = index._get_reindexer(target)
        # NOTE(review): expecting position 3 for the tail presumably relies
        # on the fixture producing four intervals -- confirm
        tm.assert_numpy_array_equal(positions,
                                    np.array([0, 3], dtype='intp'))
Example #27
0
    def test_itemsize(self):
        """``itemsize`` must warn and still report both 8-byte endpoints."""
        # GH 19209
        lefts = np.arange(0, 4, dtype='i8')
        rights = np.arange(1, 5, dtype='i8')
        bytes_per_endpoint = 8

        # a FutureWarning is expected from within this block
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            observed = IntervalIndex.from_arrays(lefts, rights).itemsize

        assert observed == 2 * bytes_per_endpoint
Example #28
0
    def test_get_item(self, closed):
        i = IntervalIndex.from_arrays((0, 1, np.nan), (1, 2, np.nan),
                                      closed=closed)
        assert i[0] == Interval(0.0, 1.0, closed=closed)
        assert i[1] == Interval(1.0, 2.0, closed=closed)
        assert isna(i[2])

        result = i[0:1]
        expected = IntervalIndex.from_arrays((0.,), (1.,), closed=closed)
        tm.assert_index_equal(result, expected)

        result = i[0:2]
        expected = IntervalIndex.from_arrays((0., 1), (1., 2.), closed=closed)
        tm.assert_index_equal(result, expected)

        result = i[1:3]
        expected = IntervalIndex.from_arrays((1., np.nan), (2., np.nan),
                                             closed=closed)
        tm.assert_index_equal(result, expected)
Example #29
0
    def test_difference(self, closed, sort):
        index = IntervalIndex.from_arrays([1, 0, 3, 2],
                                          [1, 2, 3, 4],
                                          closed=closed)
        result = index.difference(index[:1], sort)
        expected = index[1:]
        if sort:
            expected = expected.sort_values()
        tm.assert_index_equal(result, expected)

        # GH 19101: empty result, same dtype
        result = index.difference(index, sort)
        expected = IntervalIndex(np.array([], dtype='int64'), closed=closed)
        tm.assert_index_equal(result, expected)

        # GH 19101: empty result, different dtypes
        other = IntervalIndex.from_arrays(index.left.astype('float64'),
                                          index.right, closed=closed)
        result = index.difference(other, sort)
        tm.assert_index_equal(result, expected)
Example #30
0
    def test_dropna(self):

        expected = IntervalIndex.from_tuples([(0.0, 1.0), (1.0, 2.0)])

        ii = IntervalIndex.from_tuples([(0, 1), (1, 2), np.nan])
        result = ii.dropna()
        tm.assert_index_equal(result, expected)

        ii = IntervalIndex.from_arrays([0, 1, np.nan], [1, 2, np.nan])
        result = ii.dropna()
        tm.assert_index_equal(result, expected)
Example #31
0
    def test_astype(self):
        """Cast the index to object and interval dtypes."""
        cat_index = self.create_index()
        as_object = cat_index.astype(object)
        tm.assert_index_equal(as_object, Index(np.array(cat_index)))

        # this IS equal, but not the same class
        assert as_object.equals(cat_index)
        assert isinstance(as_object, Index)
        assert not isinstance(as_object, CategoricalIndex)

        # interval: a categorical index whose categories are intervals
        intervals = IntervalIndex.from_arrays(
            left=[-0.001, 2.0], right=[2, 4], closed="right"
        )
        codes = Categorical.from_codes(
            [0, 1, -1], categories=intervals, ordered=True
        )
        interval_cat_index = CategoricalIndex(codes)

        as_interval = interval_cat_index.astype("interval")
        expected = intervals.take([0, 1, -1])
        tm.assert_index_equal(as_interval, expected)

        # rebuilding from the raw values must give the same index
        rebuilt = IntervalIndex(as_interval.values)
        tm.assert_index_equal(rebuilt, expected)
Example #32
0
    def chromosome_coverage_read_counts(self, gene_overlap_dat, chrom_gene_df,
                                        chrom_exon_df, chrom):
        """
        Determine per-chromosome reads coverage and per-gene read counts from an RNA-seq experiment in
        a way that properly considers ambiguous reads - if a (paired) read falls entirely within the
        exonic regions of a *single* gene, only then does read contribute to read count and coverage.
        The cigar scores from single and paired reads are parsed according to cigar_segment_bounds.

        1. Saves compressed coverage array to self.save_dir with file name 'sample_[sample_id]_[chrom].npz' for
         genes with no overlap with any other gene (a.k.a. "isolated genes") with filename
         'chrom_coverage_[sample_id]_[chrom].npz'
        2. Saves a dictionary of {gene_name: 1-d numpy gene coverage arrays (concatenated exonic regions)}
         to a serialized pickle file for all genes that exonic have overlap with other genes (a.k.a. "overlap genes")
         with filename 'overlap_coverage_[sample_id]_[chrom].pkl'
        3. Saves read counts to self.save_dir with filename 'read_counts_[sample_id]_[chrom].csv'

        NOTE: if the required chromosome coverage files and read count file *already* exist prior to any coverage/read count
        calculations, Degnorm will default to using those files. This will only happen if a user either moves
        coverage and read count files from a prior Degnorm pipeline run to the appropriate chromosome directories
        of the target output directory, or if they re-use a Degnorm pipeline run's output directory. This is *NOT*
        the same as using a warm-start directory. A warm-start skips coverage/read count calculations entirely,
        assuming a prior Degnorm run successfully parse all coverage/read counts.

        :param chrom_gene_df: pandas.DataFrame with `chr`, `gene`, `gene_start`, and `gene_end` columns
        that delineate the start and end position of a gene's transcript on a chromosome, must be
        subset to the chromosome in study.
        :param gene_overlap_dat: dictionary with keys 'isolated_genes' and 'overlap_genes' detailing
        groups of genes that do not overlap with others and then groups of genes that share any overlap.
        See gene_processing.get_gene_overlap_structure function.
        :param chrom_exon_df: pandas.DataFrame with `chr`, `gene`, `start`, `end` columns that delineate
        the start and end positions of exons on a gene.
        :param chrom: str chromosome name
        :return: None. Coverage and read count files are written to self.save_dir.
        """
        # First, load this chromosome's reads.
        if self.verbose:
            logging.info(
                'SAMPLE {0}, CHR {1} -- begin loading reads from {2}'.format(
                    self.sample_id, chrom, self.filename))

        # assess how many genes we have.
        n_genes = chrom_gene_df.shape[0]

        # gene_overlap_dat data check: ensure that number isolated genes + number overlapping genes
        # equals number of genes in genes DataFrame.
        n_isolated_genes, n_overlap_genes = 0, 0
        if gene_overlap_dat['isolated_genes']:
            n_isolated_genes = len(gene_overlap_dat['isolated_genes'])

        if gene_overlap_dat['overlap_genes']:
            n_overlap_genes = np.sum(
                [len(x) for x in gene_overlap_dat['overlap_genes']])

        if n_isolated_genes + n_overlap_genes != n_genes:
            raise ValueError(
                'number of genes contained in gene_overlap_dat does not match that of chrom_gene_df.'
            )

        # create filepaths to non-overlapping read coverage, overlapping read coverage, read count files.
        chrom_cov_file = os.path.join(
            self.save_dir,
            'chrom_coverage_' + self.sample_id + '_' + str(chrom) + '.npz')
        ol_cov_file = os.path.join(
            self.save_dir,
            'overlap_coverage_' + self.sample_id + '_' + str(chrom) + '.pkl')
        count_file = os.path.join(
            self.save_dir,
            'read_counts_' + self.sample_id + '_' + str(chrom) + '.csv')

        # if all required coverage, read count files are present, e.g. created from a previous run attempt,
        # then skip all calculations and default to the existing files. Addresses issue #30.
        if ((n_isolated_genes > 0 and os.path.isfile(chrom_cov_file)) or n_isolated_genes == 0) \
            and ((n_overlap_genes > 0 and os.path.isfile(ol_cov_file)) or n_overlap_genes == 0) \
            and (os.path.isfile(count_file)):

            if self.verbose:
                logging.info("""SAMPLE {0}, CHR {1} -- WARNING... All coverage and read count files already present:
                {0}
                {1}
                {2}
                Defaulting to these files; skipping coverage and read count calculations."""\
                             .format(chrom_cov_file, ol_cov_file, count_file))

            return None

        # initialize read counts.
        read_count_dict = {gene: 0 for gene in chrom_gene_df.gene}

        # set pandas.options.mode.chained_assignment = None to avoid SettingWithCopyWarnings
        set_option('mode.chained_assignment', None)

        # ---------------------------------------------------------------------- #
        # Step 1. Load chromosome's reads and index them.
        # ---------------------------------------------------------------------- #
        reads_df = self.load_chromosome_reads(chrom)

        if self.verbose:
            logging.info(
                'SAMPLE {0}, CHR {1} -- reads successfully loaded. shape = {2}'
                .format(self.sample_id, chrom, reads_df.shape))

        # append end position to reads based on cigar score.
        reads_df['end_pos'] = reads_df['pos'] + reads_df['cigar'].apply(
            lambda x: sum([int(k)
                           for k, v in re.findall(r'(\d+)([A-Z]?)', x)]))

        # assign row number to read ID column.
        reads_df['read_id'] = range(reads_df.shape[0])

        # easy win: drop reads whose start position is < minimum start position of a gene,
        # and drop reads whose end position is > maximum start position of a gene
        min_gene_start, max_gene_end = chrom_gene_df.gene_start.min(
        ) - 1, chrom_gene_df.gene_end.max() - 1
        reads_df = reads_df[(reads_df.pos >= (min_gene_start))
                            & (reads_df.end_pos <= (max_gene_end))]

        # If working with paired reads,
        # ensure that we've sequestered paired reads (eliminate any query names only occurring once).
        if self.paired:
            qname_counts = reads_df.qname_unpaired.value_counts()
            paired_occ_reads = qname_counts[qname_counts ==
                                            2].index.values.tolist()
            reads_df = reads_df[reads_df.qname_unpaired.isin(paired_occ_reads)]

        # ---------------------------------------------------------------------- #
        # Step 2. Drop reads that don't fully fall within union of all exons.
        # ---------------------------------------------------------------------- #
        chrom_len = self.header[self.header.chr == chrom].length.iloc[0]
        tscript_vec = np.ones(
            [chrom_len], dtype=int)  # large vector, will delete after using.

        # build binary 0/1 exon/intron indicator vector.
        # Need to account for exon data being 1-indexed, tscript_vec is 0-indexed, but
        # exon end positions are inclusive.
        exon_starts = chrom_exon_df.start.values - 1
        exon_ends = chrom_exon_df.end.values
        for i in range(len(exon_starts)):
            tscript_vec[exon_starts[i]:exon_ends[i]] = 0

        del exon_starts, exon_ends
        gc.collect()

        # store read_ids of reads to drop, and initialize dropped read count.
        drop_reads = list()

        # store read match region bounds, so that we only parse CIGAR strings once.
        read_bounds = list()

        # use values array, faster access.
        dat = reads_df[['cigar', 'pos', 'read_id']].values

        # for paired reads, perform special parsing of CIGAR strings to avoid double-counting of overlap regions.
        if self.paired:
            for ii in np.arange(1, dat.shape[0], 2):

                # obtain read region bounds.
                bounds_1 = cigar_segment_bounds(dat[ii - 1, 0],
                                                start=dat[ii - 1, 1])
                bounds_2 = cigar_segment_bounds(dat[ii, 0], start=dat[ii, 1])

                # leverage nature of alignments of paired reads to find disjoint coverage ranges.
                min_bounds_1, max_bounds_1 = min(bounds_1), max(bounds_1)
                min_bounds_2, max_bounds_2 = min(bounds_2), max(bounds_2)

                if max_bounds_2 >= max_bounds_1:
                    bounds_2 = [
                        max_bounds_1 + 1 if j <= max_bounds_1 else j
                        for j in bounds_2
                    ]
                else:
                    bounds_2 = [
                        min_bounds_1 - 1 if j >= min_bounds_1 else j
                        for j in bounds_2
                    ]
                    bounds_2.sort()

                # aggregate read pair's bounds.
                bounds = bounds_1 + bounds_2

                # iterate over match regions. If a single region is not fully contained
                # within exon regions, drop the pair.
                drop_read = False
                for j in np.arange(1, len(bounds), step=2):

                    # check whether matching regions on tscript_vec are fully contained within exonic regions.
                    # note that right-bounds are inclusive.
                    if np.sum(
                            tscript_vec[(bounds[j - 1]):(bounds[j] + 1)]) > 0:
                        drop_read = True

                # append read id to set of read indices to drop (if appropriate).
                if drop_read:
                    drop_reads.extend([dat[ii - 1, 2], dat[ii, 2]])

                # otherwise, append match region bounds list. Note: endpoints of regions are inclusive.
                else:
                    read_bounds.append(bounds)

        # for single-read RNA-Seq experiments, we do not need such special consideration.
        else:
            for ii in np.arange(dat.shape[0]):
                # obtain read regions bounds.
                bounds = cigar_segment_bounds(dat[ii, 0], start=dat[ii, 1])

                # iterate over match regions. If a single region is not fully contained
                # within exon regions, drop the read.
                drop_read = False
                for j in np.arange(1, len(bounds), step=2):

                    if np.sum(
                            tscript_vec[(bounds[j - 1]):(bounds[j] + 1)]) > 0:
                        drop_read = True

                # append read id to set of read indices to drop (if appropriate).
                if drop_read:
                    drop_reads.append(dat[ii, 2])

                # otherwise, append match region bounds list. Note: endpoints of regions are inclusive.
                else:
                    read_bounds.append(bounds)

        # drop reads that don't fully intersect exonic regions.
        if drop_reads:
            reads_df = reads_df[~reads_df.read_id.isin(drop_reads)]

        if self.paired:
            # if paired reads, don't actually need .1 and .2 constituent reads anymore.
            # So to save time + memory, take every other read.
            reads_df = reads_df.iloc[np.arange(1, reads_df.shape[0], step=2)]

        # add parsed match region bounds to reads!
        reads_df['bounds'] = read_bounds

        # delete objs, attempt to save on memory.
        del tscript_vec, drop_reads, dat, read_bounds
        gc.collect()

        # ---------------------------------------------------------------------- #
        # Step 3. Compute coverage, reads across groups of mutually overlapping genes.
        # (This is costly from a time perspective. Should constitute
        #  coverage, read count calculations for ~ 10-20% of genes.)
        # ---------------------------------------------------------------------- #

        # display summary statistics around rate of gene intersection.
        if self.verbose:
            logging.info(
                'SAMPLE {0}, CHR {1} -- overlap genes = {2} / {3}.'.format(
                    self.sample_id, chrom, n_overlap_genes, n_genes))
            logging.info(
                'SAMPLE {0}, CHR {1} -- begin overlap gene group reads processing.'
                .format(self.sample_id, chrom))

        # for genes in a group of overlapping genes, compute read coverage + count.
        if n_overlap_genes > 0:

            ol_cov_dict = dict()

            # iterate over groups of overlapping genes.
            for ol_genes in gene_overlap_dat['overlap_genes']:

                ol_gene_df = chrom_gene_df[chrom_gene_df.gene.isin(ol_genes)]
                ol_gene_group_start = ol_gene_df.gene_start.min() - 1
                ol_gene_group_end = ol_gene_df.gene_end.max() - 1

                ol_gene_starts = list()
                gene_exon_bounds = list()
                transcript_idx = list(
                )  # list of 1-d np.arrays, each holding one overlapping gene's exon positioning.

                # obtain exon regions for each gene in overlap group.
                # Exon starts/ends are 1-indexed, change them to be 0-indexed.
                for ol_gene in ol_genes:
                    ol_gene_exon_df = chrom_exon_df[chrom_exon_df.gene ==
                                                    ol_gene]

                    # store gene starts for constructing per-gene coverage vectors.
                    # 0-index gene starts/ends.
                    ol_gene_start = ol_gene_exon_df.gene_start.iloc[0] - 1
                    ol_gene_end = ol_gene_exon_df.gene_end.iloc[0] - 1
                    ol_gene_starts.append(ol_gene_start)

                    # initialize gene coverage vector for each gene in overlap group.
                    ol_cov_dict[ol_gene] = np.zeros(
                        [ol_gene_end - ol_gene_start + 1], dtype=int)

                    # save gene exon positioning, for determining which reads captured by which genes.
                    # 0-index exon positions, and include gene end positioning.
                    e_starts, e_ends = np.sort(
                        ol_gene_exon_df.start.values) - 1, np.sort(
                            ol_gene_exon_df.end.values)
                    gene_exon_bounds += [[
                        [e_starts[j], e_ends[j]] for j in range(len(e_starts))
                    ]]  # list of list of lists, includes exon end pos.
                    transcript_idx.append(
                        np.unique(
                            fill_in_bounds(flatten_2d(gene_exon_bounds[-1])))
                    )  # transcript vector is 0-indexed, includes exon end pos.

                # drop things we don't need any more.
                del ol_gene_df, ol_gene_exon_df, e_starts, e_ends

                # storage for reads to drop.
                drop_reads = list()

                # subset reads to those that start and end within scope of this bloc of overlapping genes.
                ol_reads_dat = reads_df[(reads_df.pos >= (ol_gene_group_start))
                                        & (reads_df.end_pos <=
                                           (ol_gene_group_end))][[
                                               'bounds', 'read_id'
                                           ]].values

                # for single-read RNA-Seq experiments, we do not need such special consideration.
                for i in range(ol_reads_dat.shape[0]):

                    # obtain read regions bounds.
                    read_bounds, read_id = ol_reads_dat[i, :]

                    # find genes that fully include this read. Everything is 0-indexed.
                    caught_genes = self.determine_full_inclusion(
                        read_bounds, gene_exon_bounds=gene_exon_bounds)

                    # Ambiguous read determination logic:
                    # - if paired reads lie fully within 0 or 2+ genes, do not use the reads pair and drop them.
                    # - if read lies fully within a single gene:
                    #    - do not drop it.
                    #    - if the caught gene is the current gene being analyzed, use the read. O/w do not.
                    n_caught_genes = len(caught_genes)

                    # if only one gene captures read, use the read and identify capturing gene for
                    # incrementing count, but drop it from consideration later (it's been accounted for).
                    # if the only full intersection is with a single gene, increment coverage and read count
                    # for that gene, and drop read.
                    # Note: need to restart coverage calculations relative to gene's start position.
                    if n_caught_genes == 1:
                        drop_read = True
                        read_gene = ol_genes[caught_genes[0]]
                        read_gene_start = ol_gene_starts[caught_genes[0]]
                        read_idx = fill_in_bounds(
                            read_bounds, endpoint=True) - read_gene_start - 1
                        ol_cov_dict[read_gene][read_idx] += 1
                        read_count_dict[read_gene] += 1

                    # if no gene fully captures the read, do not use read *but do not drop it*,
                    # for the possibility that some isolated gene captures the read later on.
                    elif n_caught_genes == 0:
                        drop_read = False

                    # if > 1 gene fully captures the read,
                    # do not use read and drop it from consideration.
                    else:
                        drop_read = True

                    # if need be, add read to list of reads to be dropped.
                    if drop_read:
                        drop_reads.append(read_id)

                # drop ambiguous reads from larger set of chromosome reads,
                # should speed up gene-read searches in the future.
                if drop_reads:
                    reads_df = reads_df[~reads_df.read_id.isin(drop_reads)]

                    del drop_reads

                # pare down coverage vectors for genes in overlap group to their concatenated exon regions.
                for i in range(len(ol_genes)):
                    ol_gene = ol_genes[i]
                    ol_cov_dict[ol_gene] = ol_cov_dict[ol_gene][
                        transcript_idx[i] - ol_gene_starts[i]]

            # ---------------------------------------------------------------------- #
            # Step 3.5: save overlapping genes' coverage vectors.
            # overlapping gene coverage vector dict ->> pkl file.
            # ---------------------------------------------------------------------- #
            if self.verbose:
                logging.info(
                    'SAMPLE {0}, CHR {1} -- saving overlapping gene coverage vectors.'
                    .format(self.sample_id, chrom))

            # dump overlapping genes' coverage matrices.
            with open(ol_cov_file, 'wb') as f:
                pkl.dump(ol_cov_dict, f)

            # free up some memory -- delete groups of intersecting genes, etc.
            del ol_reads_dat, ol_cov_dict, transcript_idx, gene_exon_bounds
            gc.collect()

            if self.verbose:
                logging.info(
                    'SAMPLE {0}, CHR {1} -- overlapping gene reads processing successful.'
                    .format(self.sample_id, chrom))

        # ---------------------------------------------------------------------- #
        # Step 4. Compute coverage, reads for individual isolated genes.
        # ---------------------------------------------------------------------- #
        if n_isolated_genes > 0:

            if self.verbose:
                logging.info(
                    'SAMPLE {0}, CHR {1} -- begin isolated gene reads processing.'
                    .format(self.sample_id, chrom))

            # reduce chrom_gene_df to remaining genes
            chrom_gene_df = chrom_gene_df[chrom_gene_df.gene.isin(
                gene_overlap_dat['isolated_genes'])]

            # run same inclusion/exclusion transcript test but on the isolated genes.
            tscript_vec = np.ones([chrom_len], dtype=int)

            # identify regions of chromosome covered by isolated genes.
            # change gene starts/ends to 0-indexed to match 0-indexed tscript_vec array, but
            # gene ends are inclusive.
            gene_starts = chrom_gene_df.gene_start.values - 1
            gene_ends = chrom_gene_df.gene_end.values
            for i in range(len(gene_starts)):
                tscript_vec[gene_starts[i]:gene_ends[i]] = 0

            # identify reads that do not fall within an isolated gene's (start, end).
            drop_reads = list()
            dat = reads_df[['pos', 'end_pos', 'read_id']].values
            for i in range(dat.shape[0]):
                read_start, read_end, read_id = dat[i, :]

                # remember to include read end position. reads are 0-indexed.
                if np.sum(tscript_vec[read_start:(read_end + 1)]) > 0:
                    drop_reads.append(read_id)

            # drop memory hogs.
            del dat, gene_starts, gene_ends, tscript_vec

            # drop reads that do not lie completely within area covered by isolated genes.
            if drop_reads:
                reads_df = reads_df[~reads_df.read_id.isin(drop_reads)]

            del drop_reads
            gc.collect()

            # (a precaution) only continue if we have any reads intersecting isolated genes.
            if not reads_df.empty:

                # initialize chromosome coverage array.
                cov_vec = np.zeros([chrom_len], dtype=int)

                # ---------------------------------------------------------------------- #
                # Step 4.5.1: join genes on reads data
                # so that each read is tied to a gene, for read counting purposes.
                # ---------------------------------------------------------------------- #

                # 0-index gene_starts, gene_ends because reads are 0-indexed.
                chrom_gene_df.loc[:, ['gene_start', 'gene_end']] -= 1

                # add IntervalIndex index to chromosome gene data.
                chrom_gene_df.index = IntervalIndex.from_arrays(
                    chrom_gene_df.gene_start,
                    right=chrom_gene_df.gene_end,
                    closed='both')

                try:
                    reads_df['gene'] = chrom_gene_df.loc[
                        reads_df.pos].gene.values

                # if there remains at least one read that doesn't land within a gene span,
                # try another sweep to remove reads not within gene regions.
                except KeyError:

                    # outline valid read start positions along transcript.
                    tscript_vec = np.ones([chrom_len], dtype=int)

                    for i in range(chrom_gene_df.shape[0]):
                        left = chrom_gene_df.index[i].left
                        right = chrom_gene_df.index[i].right + 1
                        tscript_vec[left:right] = 0

                    # iterate over reads, checking whether read start position falls within
                    # a [gene_start, gene_end] region.
                    drop_reads = list()
                    for i in range(reads_df.shape[0]):
                        if tscript_vec[reads_df.pos.iloc[i]] != 0:
                            drop_reads.append(reads_df.read_id.iloc[i])

                    # drop reads that do not start within valid [gene_start, gene_end] regions.
                    if drop_reads:
                        reads_df = reads_df[~reads_df.read_id.isin(drop_reads)]

                    del tscript_vec, drop_reads
                    gc.collect()

                    # subset reads to reads w/ valid read ID, then join with interval index again.
                    reads_df['gene'] = chrom_gene_df.loc[
                        reads_df.pos].gene.values

                # loop over reads for isolated genes, incrementing read count and coverage.
                dat = reads_df[['bounds', 'gene']].values
                for i in range(dat.shape[0]):
                    bounds, gene = dat[i, :]

                    # reads are already 0-indexed.
                    read_idx = fill_in_bounds(bounds, endpoint=True)

                    # increment coverage and read count.
                    cov_vec[read_idx] += 1
                    read_count_dict[gene] += 1

                # ---------------------------------------------------------------------- #
                # Step 4.5.2: save chromosome coverage vector.
                # chromosome coverage vector ->> compressed csr numpy array
                # ---------------------------------------------------------------------- #
                if self.verbose:
                    logging.info(
                        'SAMPLE {0}, CHR {1} -- saving csr-compressed chrom coverage array.'
                        .format(self.sample_id, chrom))

                # save coverage vector as a compressed-sparse row matrix.
                sparse.save_npz(chrom_cov_file,
                                matrix=sparse.csr_matrix(cov_vec))

                # drop large data objects.
                del cov_vec, dat, reads_df

            # drop remaining large data objects.
            del chrom_gene_df, chrom_exon_df
            gc.collect()

            if self.verbose:
                logging.info(
                    'SAMPLE {0}, CHR {1} -- isolated gene reads processing successful.'
                    .format(self.sample_id, chrom))

        # ---------------------------------------------------------------------- #
        # Step 5. Save read counts.
        # chromosome read counts ->> .csv file
        # ---------------------------------------------------------------------- #
        # construct read count DataFrame from read count dictionary.
        read_count_df = DataFrame({
            'gene':
            list(read_count_dict.keys()),
            self.sample_id:
            list(read_count_dict.values())
        })

        del read_count_dict
        gc.collect()

        if self.verbose:
            logging.info(
                'SAMPLE {0}, CHR {1} -- mean per-gene read count: {2:.4}'.
                format(self.sample_id, chrom,
                       read_count_df[self.sample_id].mean()))
            logging.info('SAMPLE {0}, CHR {1} -- saving read counts.'.format(
                self.sample_id, chrom))

        # save sample's chromosome read counts to .csv for joining later.
        read_count_df.to_csv(count_file, index=False)
Example #33
0
def create_series_categorical_intervals(left, right, closed="right"):
    """Build a categorical Series of intervals from paired endpoints.

    ``left``, ``right`` and ``closed`` are handed to
    ``IntervalIndex.from_arrays``; the resulting index is wrapped in a
    ``Categorical`` and then in a ``Series``.
    """
    intervals = IntervalIndex.from_arrays(left, right, closed)
    as_categorical = Categorical(intervals)
    return Series(as_categorical)
Example #34
0
def test_dir():
    """``dir()`` on an IntervalIndex must succeed and must not list ``str``."""
    # GH#27571 dir(interval_index) should not raise
    interval_index = IntervalIndex.from_arrays([0, 1], [1, 2])
    listed_names = dir(interval_index)
    assert "str" not in listed_names
Example #35
0
class TestIntervalIndex:
    index = IntervalIndex.from_arrays([0, 1], [1, 2])

    def create_index(self, closed="right"):
        """Return the ten unit-width intervals built from the breaks 0..10."""
        breaks = range(11)
        return IntervalIndex.from_breaks(breaks, closed=closed)

    def create_index_with_nan(self, closed="right"):
        """Return ten intervals over 0..10 whose second entry is missing."""
        # Only position 1 is masked out; both of its endpoints become NaN.
        keep = [True, False] + [True] * 8
        left_edges = np.where(keep, np.arange(10), np.nan)
        right_edges = np.where(keep, np.arange(1, 11), np.nan)
        return IntervalIndex.from_arrays(left_edges, right_edges, closed=closed)

    def test_properties(self, closed):
        index = self.create_index(closed=closed)
        assert len(index) == 10
        assert index.size == 10
        assert index.shape == (10,)

        tm.assert_index_equal(index.left, Index(np.arange(10)))
        tm.assert_index_equal(index.right, Index(np.arange(1, 11)))
        tm.assert_index_equal(index.mid, Index(np.arange(0.5, 10.5)))

        assert index.closed == closed

        ivs = [Interval(l, r, closed) for l, r in zip(range(10), range(1, 11))]
        expected = np.array(ivs, dtype=object)
        tm.assert_numpy_array_equal(np.asarray(index), expected)

        # with nans
        index = self.create_index_with_nan(closed=closed)
        assert len(index) == 10
        assert index.size == 10
        assert index.shape == (10,)

        expected_left = Index([0, np.nan, 2, 3, 4, 5, 6, 7, 8, 9])
        expected_right = expected_left + 1
        expected_mid = expected_left + 0.5
        tm.assert_index_equal(index.left, expected_left)
        tm.assert_index_equal(index.right, expected_right)
        tm.assert_index_equal(index.mid, expected_mid)

        assert index.closed == closed

        ivs = [
            Interval(l, r, closed) if notna(l) else np.nan
            for l, r in zip(expected_left, expected_right)
        ]
        expected = np.array(ivs, dtype=object)
        tm.assert_numpy_array_equal(np.asarray(index), expected)

    @pytest.mark.parametrize(
        "breaks",
        [
            [1, 1, 2, 5, 15, 53, 217, 1014, 5335, 31240, 201608],
            [-np.inf, -100, -10, 0.5, 1, 1.5, 3.8, 101, 202, np.inf],
            pd.to_datetime(["20170101", "20170202", "20170303", "20170404"]),
            # Units are spelled "min"/"h": a bare "M" is ambiguous with months
            # and upper-case "H" is a deprecated alias in recent pandas.
            pd.to_timedelta(["1ns", "2ms", "3s", "4min", "5h", "6D"]),
        ],
    )
    def test_length(self, closed, breaks):
        """``IntervalIndex.length`` must agree with each ``Interval.length``.

        Runs once on the index built from ``breaks`` and once more after a
        missing value has been inserted at position 1.
        """
        # GH 18789
        index = IntervalIndex.from_breaks(breaks, closed=closed)
        result = index.length
        expected = Index(iv.length for iv in index)
        tm.assert_index_equal(result, expected)

        # with NA: the missing entry is passed through unchanged as its own length
        index = index.insert(1, np.nan)
        result = index.length
        expected = Index(iv.length if notna(iv) else iv for iv in index)
        tm.assert_index_equal(result, expected)

    def test_with_nans(self, closed):
        index = self.create_index(closed=closed)
        assert index.hasnans is False

        result = index.isna()
        expected = np.zeros(len(index), dtype=bool)
        tm.assert_numpy_array_equal(result, expected)

        result = index.notna()
        expected = np.ones(len(index), dtype=bool)
        tm.assert_numpy_array_equal(result, expected)

        index = self.create_index_with_nan(closed=closed)
        assert index.hasnans is True

        result = index.isna()
        expected = np.array([False, True] + [False] * (len(index) - 2))
        tm.assert_numpy_array_equal(result, expected)

        result = index.notna()
        expected = np.array([True, False] + [True] * (len(index) - 2))
        tm.assert_numpy_array_equal(result, expected)

    def test_copy(self, closed):
        expected = self.create_index(closed=closed)

        result = expected.copy()
        assert result.equals(expected)

        result = expected.copy(deep=True)
        assert result.equals(expected)
        assert result.left is not expected.left

    def test_ensure_copied_data(self, closed):
        # exercise the copy flag in the constructor

        # not copying
        index = self.create_index(closed=closed)
        result = IntervalIndex(index, copy=False)
        tm.assert_numpy_array_equal(
            index.left.values, result.left.values, check_same="same"
        )
        tm.assert_numpy_array_equal(
            index.right.values, result.right.values, check_same="same"
        )

        # by-definition make a copy
        result = IntervalIndex(np.array(index), copy=False)
        tm.assert_numpy_array_equal(
            index.left.values, result.left.values, check_same="copy"
        )
        tm.assert_numpy_array_equal(
            index.right.values, result.right.values, check_same="copy"
        )

    def test_delete(self, closed):
        expected = IntervalIndex.from_breaks(np.arange(1, 11), closed=closed)
        result = self.create_index(closed=closed).delete(0)
        tm.assert_index_equal(result, expected)

    @pytest.mark.parametrize(
        "data",
        [
            interval_range(0, periods=10, closed="neither"),
            interval_range(1.7, periods=8, freq=2.5, closed="both"),
            interval_range(Timestamp("20170101"), periods=12, closed="left"),
            interval_range(Timedelta("1 day"), periods=6, closed="right"),
        ],
    )
    def test_insert(self, data):
        """Insert intervals, invalid items and missing values into ``data``.

        Covers insertion at the start, end and middle, rejection of
        non-interval items and of intervals closed on another side, and
        insertion of NA-like scalars.
        """
        first = data[0]
        first_idx = IntervalIndex([first])

        # start
        tm.assert_index_equal(data.insert(0, first), first_idx.append(data))

        # end
        tm.assert_index_equal(data.insert(len(data), first), data.append(first_idx))

        # mid
        tm.assert_index_equal(
            data.insert(3, first), data[:3].append(first_idx).append(data[3:])
        )

        # invalid type
        type_msg = "can only insert Interval objects and NA into an IntervalIndex"
        with pytest.raises(ValueError, match=type_msg):
            data.insert(1, "foo")

        # invalid closed
        closed_msg = "inserted item must be closed on the same side as the index"
        other_sides = {"left", "right", "both", "neither"} - {first.closed}
        for side in other_sides:
            with pytest.raises(ValueError, match=closed_msg):
                mismatched = Interval(first.left, first.right, closed=side)
                data.insert(1, mismatched)

        # GH 18295 (test missing)
        na_idx = IntervalIndex([np.nan], closed=data.closed)
        expected = data[:1].append(na_idx).append(data[1:])
        for na_value in [np.nan, None, pd.NA]:
            tm.assert_index_equal(data.insert(1, na_value), expected)

        if data.left.dtype.kind in ["m", "M"]:
            # datetime-like endpoints: NaT behaves like the other NA values
            tm.assert_index_equal(data.insert(1, pd.NaT), expected)
        else:
            # trying to insert pd.NaT into a numeric-dtyped Index should cast/raise
            nat_msg = "can only insert Interval objects and NA into an IntervalIndex"
            with pytest.raises(ValueError, match=nat_msg):
                data.insert(1, pd.NaT)

    def test_is_unique_interval(self, closed):
        """
        Interval specific tests for is_unique in addition to base class tests
        """
        # unique overlapping - distinct endpoints
        idx = IntervalIndex.from_tuples([(0, 1), (0.5, 1.5)], closed=closed)
        assert idx.is_unique is True

        # unique overlapping - shared endpoints
        idx = pd.IntervalIndex.from_tuples([(1, 2), (1, 3), (2, 3)], closed=closed)
        assert idx.is_unique is True

        # unique nested
        idx = IntervalIndex.from_tuples([(-1, 1), (-2, 2)], closed=closed)
        assert idx.is_unique is True

    def test_monotonic(self, closed):
        # increasing non-overlapping
        idx = IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)], closed=closed)
        assert idx.is_monotonic is True
        assert idx._is_strictly_monotonic_increasing is True
        assert idx.is_monotonic_decreasing is False
        assert idx._is_strictly_monotonic_decreasing is False

        # decreasing non-overlapping
        idx = IntervalIndex.from_tuples([(4, 5), (2, 3), (1, 2)], closed=closed)
        assert idx.is_monotonic is False
        assert idx._is_strictly_monotonic_increasing is False
        assert idx.is_monotonic_decreasing is True
        assert idx._is_strictly_monotonic_decreasing is True

        # unordered non-overlapping
        idx = IntervalIndex.from_tuples([(0, 1), (4, 5), (2, 3)], closed=closed)
        assert idx.is_monotonic is False
        assert idx._is_strictly_monotonic_increasing is False
        assert idx.is_monotonic_decreasing is False
        assert idx._is_strictly_monotonic_decreasing is False

        # increasing overlapping
        idx = IntervalIndex.from_tuples([(0, 2), (0.5, 2.5), (1, 3)], closed=closed)
        assert idx.is_monotonic is True
        assert idx._is_strictly_monotonic_increasing is True
        assert idx.is_monotonic_decreasing is False
        assert idx._is_strictly_monotonic_decreasing is False

        # decreasing overlapping
        idx = IntervalIndex.from_tuples([(1, 3), (0.5, 2.5), (0, 2)], closed=closed)
        assert idx.is_monotonic is False
        assert idx._is_strictly_monotonic_increasing is False
        assert idx.is_monotonic_decreasing is True
        assert idx._is_strictly_monotonic_decreasing is True

        # unordered overlapping
        idx = IntervalIndex.from_tuples([(0.5, 2.5), (0, 2), (1, 3)], closed=closed)
        assert idx.is_monotonic is False
        assert idx._is_strictly_monotonic_increasing is False
        assert idx.is_monotonic_decreasing is False
        assert idx._is_strictly_monotonic_decreasing is False

        # increasing overlapping shared endpoints
        idx = pd.IntervalIndex.from_tuples([(1, 2), (1, 3), (2, 3)], closed=closed)
        assert idx.is_monotonic is True
        assert idx._is_strictly_monotonic_increasing is True
        assert idx.is_monotonic_decreasing is False
        assert idx._is_strictly_monotonic_decreasing is False

        # decreasing overlapping shared endpoints
        idx = pd.IntervalIndex.from_tuples([(2, 3), (1, 3), (1, 2)], closed=closed)
        assert idx.is_monotonic is False
        assert idx._is_strictly_monotonic_increasing is False
        assert idx.is_monotonic_decreasing is True
        assert idx._is_strictly_monotonic_decreasing is True

        # stationary
        idx = IntervalIndex.from_tuples([(0, 1), (0, 1)], closed=closed)
        assert idx.is_monotonic is True
        assert idx._is_strictly_monotonic_increasing is False
        assert idx.is_monotonic_decreasing is True
        assert idx._is_strictly_monotonic_decreasing is False

        # empty
        idx = IntervalIndex([], closed=closed)
        assert idx.is_monotonic is True
        assert idx._is_strictly_monotonic_increasing is True
        assert idx.is_monotonic_decreasing is True
        assert idx._is_strictly_monotonic_decreasing is True

    def test_get_item(self, closed):
        i = IntervalIndex.from_arrays((0, 1, np.nan), (1, 2, np.nan), closed=closed)
        assert i[0] == Interval(0.0, 1.0, closed=closed)
        assert i[1] == Interval(1.0, 2.0, closed=closed)
        assert isna(i[2])

        result = i[0:1]
        expected = IntervalIndex.from_arrays((0.0,), (1.0,), closed=closed)
        tm.assert_index_equal(result, expected)

        result = i[0:2]
        expected = IntervalIndex.from_arrays((0.0, 1), (1.0, 2.0), closed=closed)
        tm.assert_index_equal(result, expected)

        result = i[1:3]
        expected = IntervalIndex.from_arrays(
            (1.0, np.nan), (2.0, np.nan), closed=closed
        )
        tm.assert_index_equal(result, expected)

    @pytest.mark.parametrize(
        "breaks",
        [
            date_range("20180101", periods=4),
            date_range("20180101", periods=4, tz="US/Eastern"),
            timedelta_range("0 days", periods=4),
        ],
        ids=lambda x: str(x.dtype),
    )
    def test_maybe_convert_i8(self, breaks):
        """``_maybe_convert_i8`` converts datetime-like keys to integer form.

        Keys tried: an IntervalIndex, an Interval, the datetime-like index
        itself, a single scalar and a list of scalars.
        """
        # GH 20636
        index = IntervalIndex.from_breaks(breaks)
        first, second = breaks[0], breaks[1]
        i8_values = breaks.asi8

        # intervalindex
        tm.assert_index_equal(
            index._maybe_convert_i8(index), IntervalIndex.from_breaks(i8_values)
        )

        # interval
        converted = index._maybe_convert_i8(Interval(first, second))
        assert converted == Interval(first.value, second.value)

        # datetimelike index
        tm.assert_index_equal(index._maybe_convert_i8(breaks), Index(i8_values))

        # datetimelike scalar
        assert index._maybe_convert_i8(first) == first.value

        # list-like of datetimelike scalars
        tm.assert_index_equal(index._maybe_convert_i8(list(breaks)), Index(i8_values))

    @pytest.mark.parametrize(
        "breaks",
        [date_range("2018-01-01", periods=5), timedelta_range("0 days", periods=5)],
    )
    def test_maybe_convert_i8_nat(self, breaks):
        """``_maybe_convert_i8`` turns ``NaT`` into NaN in a float64 index.

        The expected index is built with ``Index(..., dtype="float64")``
        rather than ``pd.Float64Index``, which newer pandas releases no
        longer provide.
        """
        # GH 20636
        index = IntervalIndex.from_breaks(breaks)

        # all-NaT input
        to_convert = breaks._constructor([pd.NaT] * 3)
        expected = Index([np.nan] * 3, dtype="float64")
        result = index._maybe_convert_i8(to_convert)
        tm.assert_index_equal(result, expected)

        # NaT mixed with one real value: the value appears as a float
        to_convert = to_convert.insert(0, breaks[0])
        expected = expected.insert(0, float(breaks[0].value))
        result = index._maybe_convert_i8(to_convert)
        tm.assert_index_equal(result, expected)

    @pytest.mark.parametrize(
        "breaks",
        [np.arange(5, dtype="int64"), np.arange(5, dtype="float64")],
        ids=lambda x: str(x.dtype),
    )
    @pytest.mark.parametrize(
        "make_key",
        [
            IntervalIndex.from_breaks,
            lambda breaks: Interval(breaks[0], breaks[1]),
            lambda breaks: breaks,
            lambda breaks: breaks[0],
            list,
        ],
        ids=["IntervalIndex", "Interval", "Index", "scalar", "list"],
    )
    def test_maybe_convert_i8_numeric(self, breaks, make_key):
        """Numeric keys pass through ``_maybe_convert_i8`` as the same object."""
        # GH 20636
        numeric_index = IntervalIndex.from_breaks(breaks)
        key = make_key(breaks)

        # no conversion occurs for numeric
        assert numeric_index._maybe_convert_i8(key) is key

    @pytest.mark.parametrize(
        "breaks1, breaks2",
        permutations(
            [
                date_range("20180101", periods=4),
                date_range("20180101", periods=4, tz="US/Eastern"),
                timedelta_range("0 days", periods=4),
            ],
            2,
        ),
        ids=lambda x: str(x.dtype),
    )
    @pytest.mark.parametrize(
        "make_key",
        [
            pytest.param(IntervalIndex.from_breaks, id="IntervalIndex"),
            pytest.param(lambda b: Interval(b[0], b[1]), id="Interval"),
            pytest.param(lambda b: b, id="Index"),
            pytest.param(lambda b: b[0], id="scalar"),
            pytest.param(list, id="list"),
        ],
    )
    def test_maybe_convert_i8_errors(self, breaks1, breaks2, make_key):
        """A key built from breaks of a different dtype must raise ValueError."""
        # GH 20636
        ii = IntervalIndex.from_breaks(breaks1)
        mismatched_key = make_key(breaks2)

        # dtype reprs can contain regex metacharacters, hence the escape
        pattern = re.escape(
            f"Cannot index an IntervalIndex of subtype {breaks1.dtype} with "
            f"values of dtype {breaks2.dtype}"
        )
        with pytest.raises(ValueError, match=pattern):
            ii._maybe_convert_i8(mismatched_key)

    def test_contains_method(self):
        """Check the element-wise ``contains`` method against scalar points."""
        # can select values that are IN the range of a value
        ii = IntervalIndex.from_arrays([0, 1], [1, 2])

        # (point, expected per-interval mask)
        cases = [
            (0, [False, False]),
            (3, [False, False]),
            (0.5, [True, False]),
            (1, [True, False]),
        ]
        for point, mask in cases:
            tm.assert_numpy_array_equal(
                ii.contains(point), np.array(mask, dtype="bool")
            )

        # __contains__ not implemented for "interval in interval", follow
        # that for the contains method for now
        with pytest.raises(
            NotImplementedError, match="contains not implemented for two"
        ):
            ii.contains(Interval(0, 1))

    def test_contains_dunder(self):

        index = IntervalIndex.from_arrays([0, 1], [1, 2], closed="right")

        # __contains__ requires perfect matches to intervals.
        assert 0 not in index
        assert 1 not in index
        assert 2 not in index

        assert Interval(0, 1, closed="right") in index
        assert Interval(0, 2, closed="right") not in index
        assert Interval(0, 0.5, closed="right") not in index
        assert Interval(3, 5, closed="right") not in index
        assert Interval(-1, 0, closed="left") not in index
        assert Interval(0, 1, closed="left") not in index
        assert Interval(0, 1, closed="both") not in index

    def test_dropna(self, closed):

        expected = IntervalIndex.from_tuples([(0.0, 1.0), (1.0, 2.0)], closed=closed)

        ii = IntervalIndex.from_tuples([(0, 1), (1, 2), np.nan], closed=closed)
        result = ii.dropna()
        tm.assert_index_equal(result, expected)

        ii = IntervalIndex.from_arrays([0, 1, np.nan], [1, 2, np.nan], closed=closed)
        result = ii.dropna()
        tm.assert_index_equal(result, expected)

    def test_non_contiguous(self, closed):
        index = IntervalIndex.from_tuples([(0, 1), (2, 3)], closed=closed)
        target = [0.5, 1.5, 2.5]
        actual = index.get_indexer(target)
        expected = np.array([0, -1, 1], dtype="intp")
        tm.assert_numpy_array_equal(actual, expected)

        assert 1.5 not in index

    def test_isin(self, closed):
        """``isin`` gives the same answer for index-like and list-like values."""
        ii = self.create_index(closed=closed)
        size = len(ii)

        # only the first interval is among the values
        only_first = np.array([True] + [False] * (size - 1))
        for values in (ii[:1], [ii[0]]):
            tm.assert_numpy_array_equal(ii.isin(values), only_first)

        # values built from other breaks: every interval but the last is expected
        shifted = IntervalIndex.from_breaks(np.arange(-2, 10), closed=closed)
        all_but_last = np.array([True] * (size - 1) + [False])
        for values in (shifted, shifted.tolist()):
            tm.assert_numpy_array_equal(ii.isin(values), all_but_last)

        # membership is all-or-nothing depending on whether `closed` agrees
        for other_closed in {"right", "left", "both", "neither"}:
            candidate = self.create_index(closed=other_closed)
            mask = np.repeat(closed == other_closed, size)
            for values in (candidate, candidate.tolist()):
                tm.assert_numpy_array_equal(ii.isin(values), mask)

    def test_comparison(self):
        """Exercise comparison operators between ``self.index`` and other operands."""
        idx = self.index
        false_true = np.array([False, True])
        all_true = np.array([True, True])
        all_false = np.array([False, False])

        # scalar Interval against the index
        tm.assert_numpy_array_equal(Interval(0, 1) < idx, false_true)

        offset = Interval(0.5, 1.5)
        tm.assert_numpy_array_equal(offset < idx, false_true)
        tm.assert_numpy_array_equal(idx > offset, false_true)

        # the index against itself
        tm.assert_numpy_array_equal(idx == idx, all_true)
        tm.assert_numpy_array_equal(idx <= idx, all_true)
        tm.assert_numpy_array_equal(idx >= idx, all_true)
        tm.assert_numpy_array_equal(idx < idx, all_false)
        tm.assert_numpy_array_equal(idx > idx, all_false)

        # an index built with "left" as the closed side is expected to be unequal
        left_closed = IntervalIndex.from_breaks([0, 1, 2], "left")
        tm.assert_numpy_array_equal(idx == left_closed, all_false)

        # the index against its own values, in both operand orders
        values = idx.values
        tm.assert_numpy_array_equal(idx == values, np.array([True, True]))
        tm.assert_numpy_array_equal(values == idx, np.array([True, True]))
        tm.assert_numpy_array_equal(idx <= values, np.array([True, True]))
        tm.assert_numpy_array_equal(idx != values, np.array([False, False]))
        tm.assert_numpy_array_equal(idx > values, np.array([False, False]))
        tm.assert_numpy_array_equal(values > idx, np.array([False, False]))

        # invalid comparisons: equality is simply False ...
        tm.assert_numpy_array_equal(idx == 0, np.array([False, False]))
        tm.assert_numpy_array_equal(idx == idx.left, np.array([False, False]))

        # ... while ordering against ints raises
        msg = (
            "not supported between instances of 'int' and "
            "'pandas._libs.interval.Interval'"
        )
        with pytest.raises(TypeError, match=msg):
            idx > 0
        with pytest.raises(TypeError, match=msg):
            idx <= 0
        with pytest.raises(TypeError, match=msg):
            idx > np.arange(2)

        # length mismatch is reported separately
        msg = "Lengths must match to compare"
        with pytest.raises(ValueError, match=msg):
            idx > np.arange(3)

    def test_missing_values(self, closed):
        """NaN entries are accepted when missing on both sides, rejected otherwise."""
        first = Interval(0, 1, closed=closed)
        second = Interval(1, 2, closed=closed)
        from_objects = Index([np.nan, first, second])
        from_arrays = IntervalIndex.from_arrays(
            [np.nan, 0, 1], [np.nan, 1, 2], closed=closed
        )
        assert from_objects.equals(from_arrays)

        # NaN on the left only must be refused
        msg = (
            "missing values must be missing in the same location both left "
            "and right sides"
        )
        with pytest.raises(ValueError, match=msg):
            IntervalIndex.from_arrays(
                [np.nan, 0, 1], np.array([0, 1, 2]), closed=closed
            )

        na_mask = np.array([True, False, False])
        tm.assert_numpy_array_equal(isna(from_objects), na_mask)

    def test_sort_values(self, closed):
        """Sorting in both directions, then with a missing interval present."""
        ordered = self.create_index(closed=closed)

        # the created index is expected to be sorted already
        tm.assert_index_equal(ordered.sort_values(), ordered)
        tm.assert_index_equal(ordered.sort_values(ascending=False), ordered[::-1])

        # with nan
        shuffled = IntervalIndex([Interval(1, 2), np.nan, Interval(0, 1)])

        tm.assert_index_equal(
            shuffled.sort_values(),
            IntervalIndex([Interval(0, 1), Interval(1, 2), np.nan]),
        )
        tm.assert_index_equal(
            shuffled.sort_values(ascending=False, na_position="first"),
            IntervalIndex([np.nan, Interval(1, 2), Interval(0, 1)]),
        )

    @pytest.mark.parametrize("tz", [None, "US/Eastern"])
    def test_datetime(self, tz):
        """IntervalIndex over daily datetime breaks: mid, membership, get_indexer."""

        def ts(stamp):
            # every timestamp in this test carries the parametrized timezone
            return Timestamp(stamp, tz=tz)

        index = IntervalIndex.from_breaks(
            date_range(start=ts("2000-01-01"), periods=10)
        )

        # test mid
        midpoints = date_range(start=ts("2000-01-01T12:00"), periods=9)
        tm.assert_index_equal(index.mid, midpoints)

        # __contains__ doesn't check individual points
        for stamp in ("2000-01-01", "2000-01-01T12", "2000-01-02"):
            assert ts(stamp) not in index
        assert Interval(ts("2000-01-02"), ts("2000-01-03")) in index
        assert Interval(ts("1999-12-31"), ts("2000-01-01")) not in index

        # .contains does check individual points
        assert not index.contains(ts("2000-01-01")).any()
        assert index.contains(ts("2000-01-01T12")).any()
        assert index.contains(ts("2000-01-02")).any()

        # test get_indexer: (first target stamp, step, expected positions)
        lookups = [
            ("1999-12-31T12:00", "12H", [-1, -1, 0, 0, 1, 1, 2]),
            ("2000-01-08T18:00", "6H", [7, 7, 8, 8, 8, 8, -1]),
        ]
        for first, freq, positions in lookups:
            target = date_range(start=ts(first), periods=7, freq=freq)
            tm.assert_numpy_array_equal(
                index.get_indexer(target), np.array(positions, dtype="intp")
            )

    def test_append(self, closed):
        """Appending one index or a list of them; mismatched ``closed`` raises."""
        first = IntervalIndex.from_arrays([0, 1], [1, 2], closed=closed)
        second = IntervalIndex.from_arrays([1, 2], [2, 3], closed=closed)

        # single index
        tm.assert_index_equal(
            first.append(second),
            IntervalIndex.from_arrays([0, 1, 1, 2], [1, 2, 2, 3], closed=closed),
        )

        # list of indexes
        tm.assert_index_equal(
            first.append([first, second]),
            IntervalIndex.from_arrays(
                [0, 1, 0, 1, 1, 2], [1, 2, 1, 2, 2, 3], closed=closed
            ),
        )

        # every other closed side must be refused
        msg = "Intervals must all be closed on the same side"
        for other_closed in {"left", "right", "both", "neither"} - {closed}:
            mismatched = IntervalIndex.from_arrays([0, 1], [1, 2], closed=other_closed)
            with pytest.raises(ValueError, match=msg):
                first.append(mismatched)

    def test_is_non_overlapping_monotonic(self, closed):
        # Should be True in all cases
        tpls = [(0, 1), (2, 3), (4, 5), (6, 7)]
        idx = IntervalIndex.from_tuples(tpls, closed=closed)
        assert idx.is_non_overlapping_monotonic is True

        idx = IntervalIndex.from_tuples(tpls[::-1], closed=closed)
        assert idx.is_non_overlapping_monotonic is True

        # Should be False in all cases (overlapping)
        tpls = [(0, 2), (1, 3), (4, 5), (6, 7)]
        idx = IntervalIndex.from_tuples(tpls, closed=closed)
        assert idx.is_non_overlapping_monotonic is False

        idx = IntervalIndex.from_tuples(tpls[::-1], closed=closed)
        assert idx.is_non_overlapping_monotonic is False

        # Should be False in all cases (non-monotonic)
        tpls = [(0, 1), (2, 3), (6, 7), (4, 5)]
        idx = IntervalIndex.from_tuples(tpls, closed=closed)
        assert idx.is_non_overlapping_monotonic is False

        idx = IntervalIndex.from_tuples(tpls[::-1], closed=closed)
        assert idx.is_non_overlapping_monotonic is False

        # Should be False for closed='both', otherwise True (GH16560)
        if closed == "both":
            idx = IntervalIndex.from_breaks(range(4), closed=closed)
            assert idx.is_non_overlapping_monotonic is False
        else:
            idx = IntervalIndex.from_breaks(range(4), closed=closed)
            assert idx.is_non_overlapping_monotonic is True

    @pytest.mark.parametrize(
        "start, shift, na_value",
        [
            (0, 1, np.nan),
            (Timestamp("2018-01-01"), Timedelta("1 day"), pd.NaT),
            (Timedelta("0 days"), Timedelta("1 day"), pd.NaT),
        ],
    )
    def test_is_overlapping(self, start, shift, na_value, closed):
        """Interface check of ``is_overlapping``, with and without NA padding."""
        # GH 23309
        # see test_interval_tree.py for extensive tests; interface tests here
        na_pad = [(na_value, na_value)]

        # (left offsets, interval width in shifts, expected flag)
        scenarios = [
            ((0, 2, 4), 1, False),  # non-overlapping
            (range(3), 2, True),  # overlapping
            (range(3), 1, closed == "both"),  # common endpoints
        ]
        for offsets, width, expected in scenarios:
            tuples = [
                (start + n * shift, start + (n + width) * shift) for n in offsets
            ]
            # each scenario is checked as-is and again with NA on both ends
            for candidate in (tuples, na_pad + tuples + na_pad):
                index = IntervalIndex.from_tuples(candidate, closed=closed)
                assert index.is_overlapping is expected

    @pytest.mark.parametrize(
        "tuples",
        [
            list(zip(range(10), range(1, 11))),
            list(
                zip(
                    date_range("20170101", periods=10),
                    date_range("20170101", periods=10),
                )
            ),
            list(
                zip(
                    timedelta_range("0 days", periods=10),
                    timedelta_range("1 day", periods=10),
                )
            ),
        ],
    )
    def test_to_tuples(self, tuples):
        """``from_tuples`` followed by ``to_tuples`` gives back an Index of the tuples."""
        # GH 18756
        round_tripped = IntervalIndex.from_tuples(tuples).to_tuples()
        tm.assert_index_equal(round_tripped, Index(com.asarray_tuplesafe(tuples)))

    @pytest.mark.parametrize(
        "tuples",
        [
            list(zip(range(10), range(1, 11))) + [np.nan],
            list(
                zip(
                    date_range("20170101", periods=10),
                    date_range("20170101", periods=10),
                )
            )
            + [np.nan],
            list(
                zip(
                    timedelta_range("0 days", periods=10),
                    timedelta_range("1 day", periods=10),
                )
            )
            + [np.nan],
        ],
    )
    @pytest.mark.parametrize("na_tuple", [True, False])
    def test_to_tuples_na(self, tuples, na_tuple):
        """``to_tuples`` with a trailing NA, for both values of ``na_tuple``."""
        # GH 18756
        converted = IntervalIndex.from_tuples(tuples).to_tuples(na_tuple=na_tuple)

        # check the non-NA portion
        expected_head = Index(com.asarray_tuplesafe(tuples[:-1]))
        tm.assert_index_equal(converted[:-1], expected_head)

        # check the NA portion
        tail = converted[-1]
        if not na_tuple:
            # the NA is expected as a single missing value
            assert isna(tail)
            return

        # otherwise it is expected as a 2-tuple whose members are both missing
        assert isinstance(tail, tuple)
        assert len(tail) == 2
        assert all(isna(x) for x in tail)

    def test_nbytes(self):
        # GH 19209
        left = np.arange(0, 4, dtype="i8")
        right = np.arange(1, 5, dtype="i8")

        result = IntervalIndex.from_arrays(left, right).nbytes
        expected = 64  # 4 * 8 * 2
        assert result == expected

    @pytest.mark.parametrize("new_closed", ["left", "right", "both", "neither"])
    def test_set_closed(self, name, closed, new_closed):
        """``set_closed`` should equal an index built directly with the new side."""
        # GH 21670
        original = interval_range(0, 5, closed=closed, name=name)
        tm.assert_index_equal(
            original.set_closed(new_closed),
            interval_range(0, 5, closed=new_closed, name=name),
        )

    @pytest.mark.parametrize("bad_closed", ["foo", 10, "LEFT", True, False])
    def test_set_closed_errors(self, bad_closed):
        """An unrecognised ``closed`` value must raise ValueError naming the value."""
        # GH 21670
        ii = interval_range(0, 5)
        expected_msg = f"invalid option for 'closed': {bad_closed}"
        with pytest.raises(ValueError, match=expected_msg):
            ii.set_closed(bad_closed)

    def test_is_all_dates(self):
        """An IntervalIndex with Timestamp endpoints does not report ``is_all_dates``."""
        # GH 23576
        start = pd.Timestamp("2017-01-01 00:00:00")
        end = pd.Timestamp("2018-01-01 00:00:00")
        single_year = pd.IntervalIndex([pd.Interval(start, end)])
        assert not single_year.is_all_dates

    @pytest.mark.parametrize("key", [[5], (2, 3)])
    def test_get_value_non_scalar_errors(self, key):
        """``get_value`` with a non-scalar key raises InvalidIndexError naming the key.

        The call is also expected to emit a FutureWarning.
        """
        # GH 31117
        idx = IntervalIndex.from_tuples([(1, 3), (2, 4), (3, 5), (7, 10), (3, 10)])
        s = pd.Series(range(len(idx)), index=idx)

        # `match` is a regex: unescaped, "[5]" is a character class and
        # "(2, 3)" a group, so neither would require the literal repr of the
        # key to appear in the message. Escape it to pin the exact text.
        msg = re.escape(str(key))
        with pytest.raises(InvalidIndexError, match=msg):
            with tm.assert_produces_warning(FutureWarning):
                idx.get_value(s, key)

    @pytest.mark.parametrize("closed", ["left", "right", "both"])
    def test_pickle_round_trip_closed(self, closed):
        """An index must compare equal to itself after a pickle round trip."""
        # https://github.com/pandas-dev/pandas/issues/35658
        original = IntervalIndex.from_tuples([(1, 2), (2, 3)], closed=closed)
        tm.assert_index_equal(tm.round_trip_pickle(original), original)
Example #36
0
class TestFloatSubtype(AstypeTests):
    """``astype`` tests for IntervalIndex objects whose subtype is float."""

    # Indexes served by the ``index`` fixture: one plain float range and one
    # containing a missing interval and a repeated interval.
    indexes = [
        interval_range(-10.0, 10.0, inclusive="neither"),
        IntervalIndex.from_arrays(
            [-1.5, np.nan, 0.0, 0.0, 1.5],
            [-0.5, np.nan, 1.0, 1.0, 3.0],
            inclusive="both",
        ),
    ]

    @pytest.fixture(params=indexes)
    def index(self, request):
        """Yield each entry of ``indexes`` in turn."""
        return request.param

    @pytest.mark.parametrize("subtype", ["int64", "uint64"])
    def test_subtype_integer(self, subtype):
        """Float -> integer subtype converts each side; NA input raises."""
        float_index = interval_range(0.0, 10.0)
        target = IntervalDtype(subtype, "right")

        converted = float_index.astype(target)
        left = float_index.left.astype(subtype)
        right = float_index.right.astype(subtype)
        tm.assert_index_equal(
            converted,
            IntervalIndex.from_arrays(left, right, inclusive=float_index.inclusive),
        )

        # raises with NA
        msg = r"Cannot convert non-finite values \(NA or inf\) to integer"
        with pytest.raises(ValueError, match=msg):
            float_index.insert(0, np.nan).astype(target)

    @pytest.mark.parametrize("subtype", ["int64", "uint64"])
    def test_subtype_integer_with_non_integer_borders(self, subtype):
        """Same conversion as above, but with endpoints stepping by 0.25."""
        float_index = interval_range(0.0, 3.0, freq=0.25)
        target = IntervalDtype(subtype, "right")

        converted = float_index.astype(target)
        left = float_index.left.astype(subtype)
        right = float_index.right.astype(subtype)
        tm.assert_index_equal(
            converted,
            IntervalIndex.from_arrays(left, right, inclusive=float_index.inclusive),
        )

    def test_subtype_integer_errors(self):
        """Negative floats cannot be cast to a uint64 subtype."""
        # float64 -> uint64 fails with negative values
        negative_start = interval_range(-10.0, 10.0, inclusive="right")
        target = IntervalDtype("uint64", "right")
        pattern = re.escape(
            "Cannot convert interval[float64, right] to interval[uint64, right]; "
            "subtypes are incompatible"
        )
        with pytest.raises(TypeError, match=pattern):
            negative_start.astype(target)

    @pytest.mark.parametrize("subtype", ["datetime64[ns]", "timedelta64[ns]"])
    def test_subtype_datetimelike(self, index, subtype):
        """Float -> datetimelike subtype is rejected as incompatible."""
        target = IntervalDtype(subtype, "right")
        pattern = "Cannot convert .* to .*; subtypes are incompatible"
        with pytest.raises(TypeError, match=pattern):
            index.astype(target)
Example #37
0
 def test_get_loc_decreasing(self, values):
     # GH 25860
     index = IntervalIndex.from_arrays(values[1:], values[:-1])
     result = index.get_loc(index[0])
     expected = 0
     assert result == expected
Example #38
0
    def test_constructors_errors(self):
        """Each invalid construction must raise with its specific message."""

        # scalar
        pattern = (r'IntervalIndex\(...\) must be called with a collection of '
                   'some kind, 5 was passed')
        with tm.assert_raises_regex(TypeError, pattern):
            IntervalIndex(5)

        # not an interval
        pattern = ("type <(class|type) 'numpy.int64'> with value 0 "
                   "is not an interval")
        for build in (IntervalIndex, IntervalIndex.from_intervals):
            with tm.assert_raises_regex(TypeError, pattern):
                build([0, 1])

        # invalid closed
        pattern = "invalid options for 'closed': invalid"
        with tm.assert_raises_regex(ValueError, pattern):
            IntervalIndex.from_arrays([0, 1], [1, 2], closed='invalid')

        # mismatched closed within intervals
        pattern = 'intervals must all be closed on the same side'
        with tm.assert_raises_regex(ValueError, pattern):
            IntervalIndex.from_intervals([Interval(0, 1),
                                          Interval(1, 2, closed='left')])

        for build in (IntervalIndex, Index):
            with tm.assert_raises_regex(ValueError, pattern):
                build([Interval(0, 1), Interval(2, 3, closed='left')])

        # mismatched closed inferred from intervals vs constructor.
        pattern = 'conflicting values for closed'
        both_closed = [Interval(0, 1, closed='both'),
                       Interval(1, 2, closed='both')]
        with tm.assert_raises_regex(ValueError, pattern):
            IntervalIndex(both_closed, closed='neither')

        # no point in nesting periods in an IntervalIndex
        pattern = 'Period dtypes are not supported, use a PeriodIndex instead'
        with tm.assert_raises_regex(ValueError, pattern):
            IntervalIndex.from_breaks(
                pd.period_range('2000-01-01', periods=3))

        # decreasing breaks/arrays
        pattern = 'left side of interval must be <= right side'
        with tm.assert_raises_regex(ValueError, pattern):
            IntervalIndex.from_breaks(range(10, -1, -1))

        with tm.assert_raises_regex(ValueError, pattern):
            IntervalIndex.from_arrays(range(10, -1, -1), range(9, -2, -1))

        # GH 19016: categorical data
        categories = Categorical(list('01234abcde'), ordered=True)
        pattern = ('category, object, and string subtypes are not supported '
                   'for IntervalIndex')

        with tm.assert_raises_regex(TypeError, pattern):
            IntervalIndex.from_breaks(categories)

        with tm.assert_raises_regex(TypeError, pattern):
            IntervalIndex.from_arrays(categories[:-1], categories[1:])
Example #39
0
 def f():
     """Append an index closed on both sides to ``index1``.

     ``index1`` is not defined here; it comes from the enclosing scope.
     """
     both_closed = IntervalIndex.from_arrays([0, 1], [1, 2], closed='both')
     index1.append(both_closed)
Example #40
0
 def setup(self, N):
     left = np.append(np.arange(N), np.array(0))
     right = np.append(np.arange(1, N + 1), np.array(1))
     self.intv = IntervalIndex.from_arrays(left, right)
     self.intv._engine
Example #41
0
 def setup_method(self, method):
     """Create the shared fixtures: a two-interval index, one with a NaN, and ``indices``."""
     self.index = IntervalIndex.from_arrays([0, 1], [1, 2])

     # same two intervals with a missing entry in the middle
     entries = [(0, 1), np.nan, (1, 2)]
     self.index_with_nan = IntervalIndex.from_tuples(entries)

     self.indices = {"intervalIndex": tm.makeIntervalIndex(10)}
Example #42
0
 def create_index_with_nan(self, closed='right'):
     """Return ten unit intervals (0..10) with the second one replaced by NaN."""
     # False marks the position whose endpoints become NaN
     keep = [True, False] + [True] * 8
     left = np.where(keep, np.arange(10), np.nan)
     right = np.where(keep, np.arange(1, 11), np.nan)
     return IntervalIndex.from_arrays(left, right, closed=closed)
Example #43
0
    def test_constructors(self, data, closed, name):
        """Check that the IntervalIndex constructors agree with each other.

        ``data`` is used as breaks: ``data[:-1]`` are the left endpoints and
        ``data[1:]`` the right ones. The reference index is built with
        ``_simple_new`` and every other constructor is compared against it.
        """
        left, right = data[:-1], data[1:]
        ivs = [Interval(l, r, closed=closed) for l, r in lzip(left, right)]
        expected = IntervalIndex._simple_new(left=left,
                                             right=right,
                                             closed=closed,
                                             name=name)

        # validate expected: the reference itself must carry the inputs
        assert expected.closed == closed
        assert expected.name == name
        assert expected.dtype.subtype == data.dtype
        tm.assert_index_equal(expected.left, data[:-1])
        tm.assert_index_equal(expected.right, data[1:])

        # validated constructors, each fed the raw inputs
        result = IntervalIndex(ivs, name=name)
        tm.assert_index_equal(result, expected)

        result = IntervalIndex.from_intervals(ivs, name=name)
        tm.assert_index_equal(result, expected)

        result = IntervalIndex.from_breaks(data, closed=closed, name=name)
        tm.assert_index_equal(result, expected)

        result = IntervalIndex.from_arrays(left,
                                           right,
                                           closed=closed,
                                           name=name)
        tm.assert_index_equal(result, expected)

        result = IntervalIndex.from_tuples(lzip(left, right),
                                           closed=closed,
                                           name=name)
        tm.assert_index_equal(result, expected)

        # the generic Index constructor must produce an IntervalIndex
        result = Index(ivs, name=name)
        assert isinstance(result, IntervalIndex)
        tm.assert_index_equal(result, expected)

        # idempotent: rebuilding from the reference gives the reference back
        tm.assert_index_equal(Index(expected), expected)
        tm.assert_index_equal(IntervalIndex(expected), expected)

        result = IntervalIndex.from_intervals(expected)
        tm.assert_index_equal(result, expected)

        # from the underlying values the name has to be passed explicitly
        result = IntervalIndex.from_intervals(expected.values,
                                              name=expected.name)
        tm.assert_index_equal(result, expected)

        # rebuild from pieces taken off the reference index
        left, right = expected.left, expected.right
        result = IntervalIndex.from_arrays(left,
                                           right,
                                           closed=expected.closed,
                                           name=expected.name)
        tm.assert_index_equal(result, expected)

        result = IntervalIndex.from_tuples(expected.to_tuples(),
                                           closed=expected.closed,
                                           name=expected.name)
        tm.assert_index_equal(result, expected)

        # breaks = every left endpoint plus the final right endpoint
        breaks = expected.left.tolist() + [expected.right[-1]]
        result = IntervalIndex.from_breaks(breaks,
                                           closed=expected.closed,
                                           name=expected.name)
        tm.assert_index_equal(result, expected)