Ejemplo n.º 1
0
def test_not_constatnt_splitter():
    """Exercise NotZeroSplitter and NotConstantSplitter around SquareSplitter.

    The input is three 1s followed by four 2s, so the only value change
    happens at index 3.
    """
    sequence = np.array([1, 1, 1, 2, 2, 2, 2])
    change_points = [0, 3]

    def wrap(splitter_class):
        # Each case gets a fresh wrapper around a fresh SquareSplitter.
        return splitter_class(base_splitter=pasio.SquareSplitter())

    # NotZeroSplitter with the plain scorer: split only at the value change.
    result = wrap(pasio.NotZeroSplitter).split(sequence, simple_scorer_factory)
    assert result[1] == change_points

    # NotZeroSplitter with the greedy scorer: every position is a split point.
    result = wrap(pasio.NotZeroSplitter).split(sequence,
                                               simple_greedy_scorer_factory)
    assert result[1] == list(range(len(sequence)))

    # NotConstantSplitter with the same greedy scorer: only the value change.
    result = wrap(pasio.NotConstantSplitter).split(
        sequence, simple_greedy_scorer_factory)
    assert result[1] == change_points

    # Same expectation when explicit split candidates are passed as the
    # third positional argument.
    candidates = np.array(range(len(sequence) - 1))
    result = wrap(pasio.NotConstantSplitter).split(
        sequence, simple_greedy_scorer_factory, candidates)
    assert result[1] == change_points

    # The candidate filter itself must yield [0, 3], both without
    # pre-existing candidates and with [0, 3] supplied.
    not_constant = wrap(pasio.NotConstantSplitter)
    for initial_candidates in (None, np.array([0, 3])):
        found = not_constant.get_non_constant_split_candidates(
            sequence, initial_candidates)
        assert np.allclose(np.array([0, 3]), found)
Ejemplo n.º 2
0
def test_split_with_length_regularization():
    """Check SquareSplitter with a per-segment length regularization term.

    Each segment of length ``n`` is penalized by
    ``1.5 * 1 / log(1 + n)``.

    Scores are floating-point values derived from ``np.log``, so they are
    compared with ``np.isclose`` instead of exact equality.
    """
    # score of split 'AAA|B|AA' = 9+1+4 = 14
    # with regularization:
    #   9+1+4 - 1.5*(1/log(3+1) + 1/log(1+1) + 1/log(2+1)) ~= 9.39
    # alternative split 'AAA|BAA' gives:
    #   9+3 - 1.5*(1/log(3+1) + 1/log(3+1)) ~= 9.84
    sequence = 'AAABAA'

    def length_penalty(x):
        # Regularization function handed to the splitter: 1 / log(1 + length).
        return 1 / np.log(1 + x)

    splitter = pasio.SquareSplitter(
        length_regularization_multiplier=1.5,
        length_regularization_function=length_penalty)
    optimal_split = splitter.split(sequence, SimpleScorer)

    expected_score = 9 + 3 - 1.5 * (1 / np.log(3 + 1) + 1 / np.log(3 + 1))
    assert optimal_split[1] == [0, 3]
    assert np.isclose(optimal_split[0], expected_score)

    # limiting possible splits: with candidates [0, 4, 5] the expected
    # result is 'AAAB|AA'
    splitter = pasio.SquareSplitter(
        length_regularization_multiplier=1.5,
        length_regularization_function=length_penalty)
    optimal_split = splitter.split(sequence,
                                   SimpleScorer,
                                   split_candidates=np.array([0, 4, 5]))

    expected_score = 4 + 4 - 1.5 * (1 / np.log(4 + 1) + 1 / np.log(2 + 1))
    assert optimal_split[1] == [0, 4]
    assert np.isclose(optimal_split[0], expected_score)
Ejemplo n.º 3
0
def test_split_into_segments_candidates():
    """Check SquareSplitter when the allowed split points are restricted.

    Each case lists the sequence, the ``split_candidates`` passed to
    ``split``, the expected split points and the expected total score
    (written as a sum of per-segment scores).
    """
    cases = (
        ('AAABBB', [0, 1, 2, 3, 5, 6], [0, 3], 9 + 9),
        ('AAABBB', [0, 3, 5, 6], [0, 3], 9 + 9),
        # Position 6 is not a candidate, so 'BBBC' stays one segment.
        ('AAABBBC', [0, 3, 7], [0, 3], 9 + 4),
        ('AAABBBC', [0, 3], [0, 3], 9 + 4),
        # A candidate at 3 is offered, but no split is expected.
        ('AAAAAA', [0, 3], [0], 36),
    )

    for sequence, candidates, expected_points, expected_score in cases:
        optimal_split = pasio.SquareSplitter().split(
            sequence, simple_scorer_factory, split_candidates=candidates)
        assert optimal_split[1] == expected_points
        assert optimal_split[0] == expected_score
Ejemplo n.º 4
0
def test_split_into_segments_square():
    """Check SquareSplitter on short sequences with no candidate restriction.

    Each case lists the sequence, the expected split points and the
    expected total score (written as a sum of per-segment scores).
    """
    cases = (
        ('A', [0], 1),
        ('AAA', [0], 9),
        ('AAABBB', [0, 3], 9 + 9),
        ('AAABBBC', [0, 3, 6], 9 + 9 + 1),
        ('ABBBC', [0, 1, 4], 1 + 9 + 1),
    )

    for sequence, expected_points, expected_score in cases:
        optimal_split = pasio.SquareSplitter().split(sequence,
                                                     simple_scorer_factory)
        assert optimal_split[1] == expected_points
        assert optimal_split[0] == expected_score
Ejemplo n.º 5
0
def test_stat_split_into_segments_square():
    """Compare SquareSplitter with a brute-force single-split search.

    Five times, 100 Poisson(15) counts followed by 100 Poisson(20) counts
    are generated (seeded for reproducibility). The splitter's result must
    score at least as well as the best single split, must contain that split
    point, must agree with ``pasio.compute_score_from_splits``, and the best
    single split must lie within 10 positions of the true change at 100.
    """
    def split_on_two_segments_or_not(counts, scorer_factory):
        """Return ``(best_score, split_point)`` for at most one split.

        ``split_point`` stays 0 when no single split scores strictly higher
        than the unsplit sequence; it is never ``None``.
        """
        scorer = scorer_factory(counts)
        best_score = scorer.score(0, len(counts))
        split_point = 0
        for i in range(len(counts)):
            current_score = scorer.score(stop=i) + scorer.score(start=i)
            if current_score > best_score:
                split_point = i
                best_score = current_score
        return best_score, split_point

    def scorer_factory(counts, split_candidates=None):
        # Builds the scorer with the two fixed positional arguments 1 and 1.
        return pasio.LogMarginalLikelyhoodComputer(counts, 1, 1,
                                                   split_candidates)

    np.random.seed(4)
    for _ in range(5):
        counts = np.concatenate(
            [np.random.poisson(15, 100),
             np.random.poisson(20, 100)])

        optimal_split = pasio.SquareSplitter().split(counts, scorer_factory)

        two_split = split_on_two_segments_or_not(counts, scorer_factory)

        assert optimal_split[0] >= two_split[0]
        assert two_split[1] in optimal_split[1]
        assert np.allclose(
            optimal_split[0],
            pasio.compute_score_from_splits(counts, optimal_split[1],
                                            scorer_factory))
        # The helper always returns an integer split point, so the former
        # `is None` branch could never run; only this check ever executed.
        assert abs(two_split[1] - 100) < 10
Ejemplo n.º 6
0
def test_split_into_segments_slidingwindow():
    """Check SlidingWindowSplitter on a run of 'A's followed by a run of 'B's.

    With a window of 10 shifted by 5, the only expected split is at the
    boundary between the runs. The second case adds a split-number
    regularization multiplier of 2 to the base splitter and expects the
    score to drop by 2.
    """
    run_a = 'AAAAAAAAAAAAAAAA'
    run_b = 'BBBBBBBBBBBBBBBBB'
    sequence = run_a + run_b
    expected_points = [0, len(run_a)]
    plain_score = len(run_a)**2 + len(run_b)**2

    cases = (
        ({}, plain_score),
        ({'split_number_regularization_multiplier': 2}, plain_score - 2),
    )
    for base_kwargs, expected_score in cases:
        base = pasio.SquareSplitter(**base_kwargs)
        windowed = pasio.SlidingWindowSplitter(window_size=10,
                                               window_shift=5,
                                               base_splitter=base)
        result = windowed.split(sequence, simple_scorer_factory)
        assert result[1] == expected_points
        assert result[0] == expected_score
Ejemplo n.º 7
0
def test_split_with_split_num_regularization():
    """Check SquareSplitter with a penalty on the number of splits.

    The multiplier is 3 and the regularization function is the identity.
    """
    # 'AAA|B|AA' scores 9+1+4 = 14, and 14 - 3*2 = 8 after regularization.
    # 'AAA|BAA' scores 9+3 = 12, and 12 - 3*1 = 9 after regularization,
    # so the single split at 3 is expected to win.
    def identity(x):
        return x

    regularized = pasio.SquareSplitter(
        split_number_regularization_multiplier=3,
        split_number_regularization_function=identity)
    result = regularized.split('AAABAA', SimpleScorer)

    assert result[1] == [0, 3]
    assert result[0] == 9
Ejemplo n.º 8
0
def segmentation(counts, scorer, candidates=None):
    optimal_split = pasio.SquareSplitter().split(counts,
                                                 scorer,
                                                 split_candidates=candidates)