def rank_sentences(
    self,
    sentences,
    threshold=.03,
    fast_power_method=True,
):
    if not (
        threshold is None
        or isinstance(threshold, float) and 0 <= threshold < 1
    ):
        raise ValueError(
            '\'threshold\' should be a floating-point number '
            'from the interval [0, 1) or None',
        )

    tf_scores = [
        Counter(self.tokenize_sentence(sentence)) for sentence in sentences
    ]

    similarity_matrix = self._calculate_similarity_matrix(tf_scores)

    if threshold is None:
        markov_matrix = self._markov_matrix(similarity_matrix)

    else:
        markov_matrix = self._markov_matrix_discrete(
            similarity_matrix,
            threshold=threshold,
        )

    scores = stationary_distribution(
        markov_matrix,
        increase_power=fast_power_method,
        normalized=False,
    )

    return scores
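# Usage sketch for the method above. The `LexRank` constructor, its import
# path, and the toy corpus are assumptions standing in for however the
# surrounding class is actually instantiated; treat this as an illustration,
# not the library's documented API.
from lexrank import LexRank  # assumed import path

documents = [
    ['Rodin was a French sculptor.', 'He created The Thinker.'],
    ['The Thinker was conceived around 1880.', 'It depicts a man in thought.'],
]

sentences = [
    'Rodin is considered the progenitor of modern sculpture.',
    'His most famous work is The Thinker.',
]

summarizer = LexRank(documents)  # assumed constructor signature

# threshold=None -> continuous LexRank over the weighted similarity graph.
continuous_scores = summarizer.rank_sentences(sentences, threshold=None)

# A float threshold -> classic LexRank on the discretized Markov matrix.
discrete_scores = summarizer.rank_sentences(sentences, threshold=.1)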
def degree_centrality_scores(
    similarity_matrix,
    threshold=None,
    increase_power=True,
):
    if not (
        threshold is None
        or isinstance(threshold, float) and 0 <= threshold < 1
    ):
        raise ValueError(
            '\'threshold\' should be a floating-point number '
            'from the interval [0, 1) or None',
        )

    if threshold is None:
        markov_matrix = create_markov_matrix(similarity_matrix)

    else:
        markov_matrix = create_markov_matrix_discrete(
            similarity_matrix,
            threshold,
        )

    scores = stationary_distribution(
        markov_matrix,
        increase_power=increase_power,
        normalized=False,
    )

    return scores
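# Illustrative call of the function above, assuming it is in scope together
# with its module-level helpers (create_markov_matrix and friends, which are
# not shown in this section). The matrix values are made up for the example;
# the only requirement is a square, symmetric, non-negative similarity matrix.
import numpy as np

similarity_matrix = np.array([
    [1.0, 0.4, 0.1],
    [0.4, 1.0, 0.7],
    [0.1, 0.7, 1.0],
])

# Continuous variant: keep the weighted graph.
scores = degree_centrality_scores(similarity_matrix, threshold=None)

# Discrete variant: binarize edges at the given similarity threshold.
scores_discrete = degree_centrality_scores(similarity_matrix, threshold=.3)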
def rank_sentences(
    self,
    sentences,
    threshold=.03,
    discretize=True,
    fast_power_method=True,
    normalize=False,
):
    if not isinstance(threshold, float) or not 0 <= threshold < 1:
        raise ValueError(
            '\'threshold\' should be a floating-point number '
            'from the interval [0, 1)',
        )

    tf_scores = [
        self._calculate_tf(self.tokenize_sentence(sentence))
        for sentence in sentences
    ]

    similarity_matrix = self._calculate_similarity_matrix(tf_scores)

    if discretize:
        markov_matrix = self._markov_matrix_discrete(
            similarity_matrix,
            threshold=threshold,
        )

    else:
        markov_matrix = self._markov_matrix(similarity_matrix)

    lexrank = stationary_distribution(
        markov_matrix,
        increase_power=fast_power_method,
    )

    if normalize:
        max_val = max(lexrank)
        lexrank = [val / max_val for val in lexrank]

    return lexrank
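# Sketch of the extra switches in this variant, reusing the assumed
# `summarizer` and `sentences` from the sketch after the first variant above.
# `discretize` selects the thresholded Markov matrix (the threshold is
# validated either way, but only used when discretize=True), and `normalize`
# rescales the scores so the top-ranked sentence gets 1.0.
lexrank_scores = summarizer.rank_sentences(
    sentences,
    threshold=.03,
    discretize=False,  # use the weighted (continuous) similarity graph
    normalize=True,    # divide every score by the maximum score
)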
import math

import numpy as np
from scipy.linalg import block_diag

# Assumed import path for the function under test.
from lexrank.algorithms.power_method import stationary_distribution


def test_stationary_distribution():
    transition_matrices = []

    # Trivial single-state chain.
    t_matrix = np.array([[1.]])
    assert np.array_equal(stationary_distribution(t_matrix), [1.])
    transition_matrices.append(t_matrix)

    # Irreducible, aperiodic 3-state chain.
    t_matrix = np.array([
        [.6, .1, .3],
        [.1, .7, .2],
        [.2, .2, .6],
    ])
    expected_result = [.2759, .3448, .3793]
    actual_result_1 = stationary_distribution(t_matrix, increase_power=True)
    actual_result_2 = stationary_distribution(t_matrix, increase_power=False)
    assert np.array_equal(np.round(actual_result_1, 4), expected_result)
    assert np.array_equal(np.round(actual_result_2, 4), expected_result)
    transition_matrices.append(t_matrix)

    # Absorbing chain: all probability mass ends up in state 4.
    t_matrix = np.zeros([5, 5])
    t_matrix[np.ix_([0, 3], [2, 4])] = .5
    t_matrix[np.ix_([2], [0, 1, 3, 4])] = .25
    t_matrix[1, 2], t_matrix[4, 4] = 1, 1
    expected_result = [0, 0, 0, 0, 1]
    actual_result_1 = stationary_distribution(t_matrix, increase_power=True)
    actual_result_2 = stationary_distribution(t_matrix, increase_power=False)
    assert np.allclose(actual_result_1, expected_result)
    assert np.allclose(actual_result_2, expected_result)
    transition_matrices.append(t_matrix)

    # 4-state cycle with self-loops; stationary distribution known in
    # closed form.
    t_matrix = np.zeros([4, 4])
    t_matrix[0, 1] = 1
    t_matrix[1, 1], t_matrix[1, 2] = 1 / 3, 2 / 3
    t_matrix[2, 3] = 1
    t_matrix[3, 0], t_matrix[3, 3] = 3 / 5, 2 / 5
    expected_result = [6 / 31, 9 / 31, 6 / 31, 10 / 31]
    actual_result_1 = stationary_distribution(t_matrix, increase_power=True)
    actual_result_2 = stationary_distribution(t_matrix, increase_power=False)
    assert np.allclose(actual_result_1, expected_result)
    assert np.allclose(actual_result_2, expected_result)
    transition_matrices.append(t_matrix)

    # Reducible 7-state chain; states 3 and 4 are transient.
    t_matrix = np.zeros([7, 7])
    t_matrix[0, 0], t_matrix[0, 1] = .5, .5
    t_matrix[1, 0], t_matrix[1, 1], t_matrix[1, 2] = .5, .4, .1
    t_matrix[2, 1], t_matrix[2, 2] = .6, .4
    t_matrix[3, 2], t_matrix[3, 3] = .2, .4
    t_matrix[3, 4], t_matrix[3, 5] = .2, .2
    t_matrix[4, 3], t_matrix[4, 6] = .7, .3
    t_matrix[5, 6] = 1.
    t_matrix[6, 5], t_matrix[6, 6] = .95, .05
    expected_result = [0.2465, 0.2465, 0.0411, 0., 0., 0.2269, 0.2389]
    actual_result_1 = stationary_distribution(t_matrix, increase_power=True)
    actual_result_2 = stationary_distribution(t_matrix, increase_power=False)
    assert np.array_equal(np.round(actual_result_1, 4), expected_result)
    assert np.array_equal(np.round(actual_result_2, 4), expected_result)
    transition_matrices.append(t_matrix)

    # Reducible chain where the uniform distribution is already stationary.
    t_matrix = np.array([[1 / 2, 0, 1 / 2], [0, 1, 0], [1 / 2, 0, 1 / 2]])
    expected_result = [1 / 3] * 3
    actual_result_1 = stationary_distribution(t_matrix, increase_power=True)
    actual_result_2 = stationary_distribution(t_matrix, increase_power=False)
    assert np.allclose(actual_result_1, expected_result)
    assert np.allclose(actual_result_2, expected_result)
    transition_matrices.append(t_matrix)

    # crash test
    repeat_num = 20
    big_t_mat = block_diag(*transition_matrices * repeat_num)
    distribution_1 = stationary_distribution(big_t_mat, increase_power=True)
    distribution_2 = stationary_distribution(big_t_mat, increase_power=False)

    assert math.isclose(sum(distribution_1), 1)
    assert math.isclose(sum(distribution_2), 1)
    assert np.allclose(distribution_1, distribution_2)

    data = np.split(np.array(distribution_1), repeat_num)
    test_row = data[0]

    for row in data[1:]:
        assert np.array_equal(row, test_row)

    sample_num = 1000
    big_t_mat = np.random.random([sample_num] * 2)
    big_t_mat /= big_t_mat.sum(axis=1, keepdims=True)

    distribution_1 = stationary_distribution(big_t_mat, increase_power=True)
    distribution_2 = stationary_distribution(big_t_mat, increase_power=False)
    distribution_3 = stationary_distribution(big_t_mat, normalized=False)

    assert math.isclose(sum(distribution_1), 1)
    assert math.isclose(sum(distribution_2), 1)
    assert math.isclose(sum(distribution_3), sample_num)
    assert np.allclose(distribution_1, distribution_2)
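# For reference while reading the assertions above, a minimal sketch of the
# kind of power iteration that stationary_distribution is expected to perform:
# repeatedly multiply a distribution vector by the transition matrix until it
# stops changing. This is an illustration of the technique, not the tested
# implementation; the tolerance, iteration cap, and the normalized=False
# scaling (scores summing to the node count, as the last assertions expect)
# are assumptions.
import numpy as np


def _power_iteration_sketch(transition_matrix, normalized=True, tol=1e-10):
    """Approximate the stationary distribution p satisfying p = p @ T."""
    size = len(transition_matrix)
    distribution = np.ones(size) / size  # start from the uniform distribution

    for _ in range(10_000):  # iteration cap chosen arbitrarily
        updated = distribution @ transition_matrix

        if np.allclose(updated, distribution, atol=tol):
            distribution = updated
            break

        distribution = updated

    if not normalized:
        # Unnormalized scores sum to the number of nodes rather than to 1.
        distribution = distribution * size

    return distribution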