コード例 #1
0
    def rank_sentences(
        self,
        sentences,
        threshold=.03,
        fast_power_method=True,
    ):
        """Score each sentence via the stationary distribution of a Markov matrix.

        Args:
            sentences: Iterable of sentences; each one is passed to
                ``self.tokenize_sentence``.
            threshold: ``None`` or a ``float`` in ``[0, 1)``. With ``None``
                the matrix comes from ``self._markov_matrix``; otherwise
                from ``self._markov_matrix_discrete`` with this threshold.
            fast_power_method: Forwarded to ``stationary_distribution`` as
                ``increase_power``.

        Returns:
            Whatever ``stationary_distribution`` returns when called with
            ``normalized=False``.

        Raises:
            ValueError: If ``threshold`` is neither ``None`` nor a float
                in ``[0, 1)``.
        """
        if threshold is not None:
            # Only genuine floats are accepted; ints such as 0 are rejected.
            is_valid = isinstance(threshold, float) and 0 <= threshold < 1

            if not is_valid:
                raise ValueError(
                    '\'threshold\' should be a floating-point number '
                    'from the interval [0, 1) or None', )

        # One token-frequency Counter per sentence.
        term_counts = [
            Counter(self.tokenize_sentence(text)) for text in sentences
        ]
        similarities = self._calculate_similarity_matrix(term_counts)

        if threshold is None:
            transition = self._markov_matrix(similarities)
        else:
            transition = self._markov_matrix_discrete(
                similarities,
                threshold=threshold,
            )

        return stationary_distribution(
            transition,
            increase_power=fast_power_method,
            normalized=False,
        )
コード例 #2
0
def degree_centrality_scores(
    similarity_matrix,
    threshold=None,
    increase_power=True,
):
    """Compute scores from a similarity matrix via a Markov matrix.

    Args:
        similarity_matrix: Matrix handed to ``create_markov_matrix`` or
            ``create_markov_matrix_discrete``.
        threshold: ``None`` or a ``float`` in ``[0, 1)``. ``None`` selects
            ``create_markov_matrix``; a float selects
            ``create_markov_matrix_discrete`` with that threshold.
        increase_power: Forwarded unchanged to ``stationary_distribution``.

    Returns:
        The result of ``stationary_distribution`` called with
        ``normalized=False``.

    Raises:
        ValueError: If ``threshold`` is neither ``None`` nor a float in
            ``[0, 1)``.
    """
    if threshold is not None:
        # Only genuine floats are accepted; ints such as 0 are rejected.
        acceptable = isinstance(threshold, float) and 0 <= threshold < 1

        if not acceptable:
            raise ValueError(
                '\'threshold\' should be a floating-point number '
                'from the interval [0, 1) or None', )

    if threshold is None:
        transition = create_markov_matrix(similarity_matrix)
    else:
        transition = create_markov_matrix_discrete(
            similarity_matrix,
            threshold,
        )

    return stationary_distribution(
        transition,
        increase_power=increase_power,
        normalized=False,
    )
コード例 #3
0
    def rank_sentences(
        self,
        sentences,
        threshold=.03,
        discretize=True,
        fast_power_method=True,
        normalize=False,
    ):
        """Score each sentence via the stationary distribution of a Markov matrix.

        Args:
            sentences: Iterable of sentences; each one is passed to
                ``self.tokenize_sentence`` and then ``self._calculate_tf``.
            threshold: A ``float`` in ``[0, 1)``. It is validated on every
                call but only used when ``discretize`` is true.
            discretize: When true the matrix comes from
                ``self._markov_matrix_discrete``; otherwise from
                ``self._markov_matrix``.
            fast_power_method: Forwarded to ``stationary_distribution`` as
                ``increase_power``.
            normalize: When true, every score is divided by the largest
                score and a list is returned.

        Returns:
            The result of ``stationary_distribution``, or a list of scores
            scaled by the maximum when ``normalize`` is true.

        Raises:
            ValueError: If ``threshold`` is not a float in ``[0, 1)``.
        """
        if not isinstance(threshold, float) or not 0 <= threshold < 1:
            raise ValueError(
                '\'threshold\' should be a floating-point number '
                'from the interval [0, 1)',
            )

        tf_scores = [
            self._calculate_tf(self.tokenize_sentence(sentence))
            for sentence in sentences
        ]
        similarity_matrix = self._calculate_similarity_matrix(tf_scores)

        # NOTE: a leftover debug dump of the similarity matrix to 'text.txt'
        # used to live here; ranking must not write files into the caller's
        # working directory, so it was removed.
        if discretize:
            markov_matrix = self._markov_matrix_discrete(
                similarity_matrix,
                threshold=threshold,
            )

        else:
            markov_matrix = self._markov_matrix(similarity_matrix)

        lexrank = stationary_distribution(
            markov_matrix,
            increase_power=fast_power_method,
        )

        if normalize:
            # Rescale so that the highest score becomes 1.
            max_val = max(lexrank)
            lexrank = [val / max_val for val in lexrank]

        return lexrank
コード例 #4
0
ファイル: summarizer.py プロジェクト: crabcamp/lexrank
    def rank_sentences(
        self,
        sentences,
        threshold=.03,
        fast_power_method=True,
    ):
        """Rank sentences by the stationary distribution of a Markov matrix.

        Args:
            sentences: Iterable of sentences; each one is tokenized with
                ``self.tokenize_sentence``.
            threshold: ``None`` or a ``float`` in ``[0, 1)``. ``None``
                selects ``self._markov_matrix``; a float selects
                ``self._markov_matrix_discrete`` with that threshold.
            fast_power_method: Passed to ``stationary_distribution`` as
                ``increase_power``.

        Returns:
            The value returned by ``stationary_distribution`` with
            ``normalized=False``.

        Raises:
            ValueError: If ``threshold`` is neither ``None`` nor a float
                in ``[0, 1)``.
        """
        no_threshold = threshold is None

        # Only genuine floats are accepted; ints such as 0 are rejected.
        if not no_threshold and not (
            isinstance(threshold, float) and 0 <= threshold < 1
        ):
            raise ValueError(
                '\'threshold\' should be a floating-point number '
                'from the interval [0, 1) or None',
            )

        # One token-frequency Counter per sentence.
        token_counts = []

        for sentence in sentences:
            token_counts.append(Counter(self.tokenize_sentence(sentence)))

        similarities = self._calculate_similarity_matrix(token_counts)

        if no_threshold:
            transition = self._markov_matrix(similarities)
        else:
            transition = self._markov_matrix_discrete(
                similarities,
                threshold=threshold,
            )

        return stationary_distribution(
            transition,
            increase_power=fast_power_method,
            normalized=False,
        )
コード例 #5
0
ファイル: test_power_method.py プロジェクト: crabcamp/lexrank
def test_stationary_distribution():
    """Check ``stationary_distribution`` on fixed and random matrices.

    Each hand-written transition matrix is evaluated with
    ``increase_power`` both on and off and compared with a known result.
    The matrices are then tiled into one block-diagonal matrix, and
    finally a random row-normalised 1000x1000 matrix is used.
    """

    def both_modes(matrix):
        # Run with increase_power=True first, then False.
        return (
            stationary_distribution(matrix, increase_power=True),
            stationary_distribution(matrix, increase_power=False),
        )

    def equal_to_4_places(result, expected):
        return np.array_equal(np.round(result, 4), expected)

    chains = []

    # 1x1 matrix, default arguments.
    chain = np.array([[1.]])
    assert np.array_equal(stationary_distribution(chain), [1.])
    chains.append(chain)

    # 3x3 matrix, compared after rounding to 4 places.
    chain = np.array([
        [.6, .1, .3],
        [.1, .7, .2],
        [.2, .2, .6],
    ])
    fast, slow = both_modes(chain)
    assert equal_to_4_places(fast, [.2759, .3448, .3793])
    assert equal_to_4_places(slow, [.2759, .3448, .3793])
    chains.append(chain)

    # 5x5 matrix whose expected distribution is all on the last state.
    chain = np.array([
        [0., 0., .5, 0., .5],
        [0., 0., 1., 0., 0.],
        [.25, .25, 0., .25, .25],
        [0., 0., .5, 0., .5],
        [0., 0., 0., 0., 1.],
    ])
    fast, slow = both_modes(chain)
    assert np.allclose(fast, [0, 0, 0, 0, 1])
    assert np.allclose(slow, [0, 0, 0, 0, 1])
    chains.append(chain)

    # 4x4 matrix with an exact rational expected distribution.
    chain = np.array([
        [0., 1., 0., 0.],
        [0., 1 / 3, 2 / 3, 0.],
        [0., 0., 0., 1.],
        [3 / 5, 0., 0., 2 / 5],
    ])
    target = [6 / 31, 9 / 31, 6 / 31, 10 / 31]
    fast, slow = both_modes(chain)
    assert np.allclose(fast, target)
    assert np.allclose(slow, target)
    chains.append(chain)

    # 7x7 matrix, compared after rounding to 4 places.
    chain = np.array([
        [.5, .5, 0., 0., 0., 0., 0.],
        [.5, .4, .1, 0., 0., 0., 0.],
        [0., .6, .4, 0., 0., 0., 0.],
        [0., 0., .2, .4, .2, .2, 0.],
        [0., 0., 0., .7, 0., 0., .3],
        [0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., .95, .05],
    ])
    target = [0.2465, 0.2465, 0.0411, 0., 0., 0.2269, 0.2389]
    fast, slow = both_modes(chain)
    assert equal_to_4_places(fast, target)
    assert equal_to_4_places(slow, target)
    chains.append(chain)

    # 3x3 matrix with a uniform expected distribution.
    chain = np.array([[1 / 2, 0, 1 / 2], [0, 1, 0], [1 / 2, 0, 1 / 2]])
    fast, slow = both_modes(chain)
    assert np.allclose(fast, [1 / 3] * 3)
    assert np.allclose(slow, [1 / 3] * 3)
    chains.append(chain)

    # crash test: every matrix above, tiled repeat_num times on the diagonal
    repeat_num = 20
    tiled = chains * repeat_num
    big_chain = block_diag(*tiled)
    fast, slow = both_modes(big_chain)

    assert math.isclose(sum(fast), 1)
    assert math.isclose(sum(slow), 1)
    assert np.allclose(fast, slow)

    # Each repetition must yield exactly the same slice of the result.
    segments = np.split(np.array(fast), repeat_num)
    reference = segments[0]
    assert all(np.array_equal(seg, reference) for seg in segments[1:])

    # Random matrix, scaled so that every row sums to one.
    sample_num = 1000
    random_chain = np.random.random([sample_num] * 2)
    random_chain /= random_chain.sum(axis=1, keepdims=True)
    fast, slow = both_modes(random_chain)
    unnormalized = stationary_distribution(random_chain, normalized=False)

    assert math.isclose(sum(fast), 1)
    assert math.isclose(sum(slow), 1)
    assert math.isclose(sum(unnormalized), sample_num)
    assert np.allclose(fast, slow)
コード例 #6
0
def test_stationary_distribution():
    """Check ``stationary_distribution`` on fixed and random matrices.

    Each hand-written transition matrix is evaluated with
    ``increase_power`` both on and off and compared with a known result.
    The matrices are then tiled into one block-diagonal matrix, and
    finally a random row-normalised 1000x1000 matrix is used.
    """

    def both_modes(matrix):
        # Run with increase_power=True first, then False.
        return (
            stationary_distribution(matrix, increase_power=True),
            stationary_distribution(matrix, increase_power=False),
        )

    def equal_to_4_places(result, expected):
        return np.array_equal(np.round(result, 4), expected)

    chains = []

    # 1x1 matrix, default arguments.
    chain = np.array([[1.]])
    assert np.array_equal(stationary_distribution(chain), [1.])
    chains.append(chain)

    # 3x3 matrix, compared after rounding to 4 places.
    chain = np.array([
        [.6, .1, .3],
        [.1, .7, .2],
        [.2, .2, .6],
    ])
    fast, slow = both_modes(chain)
    assert equal_to_4_places(fast, [.2759, .3448, .3793])
    assert equal_to_4_places(slow, [.2759, .3448, .3793])
    chains.append(chain)

    # 5x5 matrix whose expected distribution is all on the last state.
    chain = np.array([
        [0., 0., .5, 0., .5],
        [0., 0., 1., 0., 0.],
        [.25, .25, 0., .25, .25],
        [0., 0., .5, 0., .5],
        [0., 0., 0., 0., 1.],
    ])
    fast, slow = both_modes(chain)
    assert np.allclose(fast, [0, 0, 0, 0, 1])
    assert np.allclose(slow, [0, 0, 0, 0, 1])
    chains.append(chain)

    # 4x4 matrix with an exact rational expected distribution.
    chain = np.array([
        [0., 1., 0., 0.],
        [0., 1 / 3, 2 / 3, 0.],
        [0., 0., 0., 1.],
        [3 / 5, 0., 0., 2 / 5],
    ])
    target = [6 / 31, 9 / 31, 6 / 31, 10 / 31]
    fast, slow = both_modes(chain)
    assert np.allclose(fast, target)
    assert np.allclose(slow, target)
    chains.append(chain)

    # 7x7 matrix, compared after rounding to 4 places.
    chain = np.array([
        [.5, .5, 0., 0., 0., 0., 0.],
        [.5, .4, .1, 0., 0., 0., 0.],
        [0., .6, .4, 0., 0., 0., 0.],
        [0., 0., .2, .4, .2, .2, 0.],
        [0., 0., 0., .7, 0., 0., .3],
        [0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., .95, .05],
    ])
    target = [0.2465, 0.2465, 0.0411, 0., 0., 0.2269, 0.2389]
    fast, slow = both_modes(chain)
    assert equal_to_4_places(fast, target)
    assert equal_to_4_places(slow, target)
    chains.append(chain)

    # 3x3 matrix with a uniform expected distribution.
    chain = np.array([[1 / 2, 0, 1 / 2], [0, 1, 0], [1 / 2, 0, 1 / 2]])
    fast, slow = both_modes(chain)
    assert np.allclose(fast, [1 / 3] * 3)
    assert np.allclose(slow, [1 / 3] * 3)
    chains.append(chain)

    # crash test: every matrix above, tiled repeat_num times on the diagonal
    repeat_num = 20
    tiled = chains * repeat_num
    big_chain = block_diag(*tiled)
    fast, slow = both_modes(big_chain)

    assert math.isclose(sum(fast), 1)
    assert math.isclose(sum(slow), 1)
    assert np.allclose(fast, slow)

    # Each repetition must yield exactly the same slice of the result.
    segments = np.split(np.array(fast), repeat_num)
    reference = segments[0]
    assert all(np.array_equal(seg, reference) for seg in segments[1:])

    # Random matrix, scaled so that every row sums to one.
    sample_num = 1000
    random_chain = np.random.random([sample_num] * 2)
    random_chain /= random_chain.sum(axis=1, keepdims=True)
    fast, slow = both_modes(random_chain)
    unnormalized = stationary_distribution(random_chain, normalized=False)

    assert math.isclose(sum(fast), 1)
    assert math.isclose(sum(slow), 1)
    assert math.isclose(sum(unnormalized), sample_num)
    assert np.allclose(fast, slow)