def test_DataSelector_selects_correct_num_of_docs_with_float_select_arg(
    corpus: Corpus,
    tokenizer: PreTrainedTokenizerFast,
    keep: float,
    correct_n_docs: int,
):
    """Check how many documents ``fit_transform`` keeps for a float ``keep``.

    A ``DataSelector`` is built with the similarity and diversity metrics
    listed below, run over ``corpus`` via ``fit_transform``, and the length
    of the result is compared with ``correct_n_docs``.

    NOTE(review): ``keep`` and ``correct_n_docs`` are presumably supplied by
    a parametrization that is not visible here -- confirm.
    """
    similarity_metrics = [
        "jensen-shannon",
        "renyi",
        "cosine",
        "euclidean",
        "variational",
        "bhattacharyya",
    ]
    diversity_metrics = [
        "num_token_types",
        "type_token_ratio",
        "entropy",
        "simpsons_index",
        "renyi_entropy",
    ]

    selector = DataSelector(
        keep=keep,
        tokenizer=tokenizer,
        similarity_metrics=similarity_metrics,
        diversity_metrics=diversity_metrics,
    )

    n_selected = len(selector.fit_transform(corpus))
    assert n_selected == correct_n_docs
def test_compute_metrics_adds_composite_score_column(
    data_selector: DataSelector, corpus: Corpus
):
    """Check that ``compute_metrics`` yields one column per metric plus one.

    The expected column count is the number of configured similarity
    metrics plus the number of configured diversity metrics plus one for
    the composite score. The result must also contain ``"composite"``.
    """
    n_similarity = len(data_selector.similarity_metrics)
    n_diversity = len(data_selector.diversity_metrics)
    # The extra column is the composite score.
    n_expected_columns = n_similarity + n_diversity + 1

    data_selector.fit(corpus)
    metric_scores = data_selector.compute_metrics(corpus)

    assert metric_scores.shape[1] == n_expected_columns
    assert "composite" in metric_scores
def data_selector(tokenizer) -> DataSelector:
    """Return a ``DataSelector`` with ``keep=2`` and the metrics listed below.

    NOTE(review): this is presumably registered as a pytest fixture by a
    decorator that is not visible here -- confirm.
    """
    selector_kwargs = {
        "keep": 2,
        "tokenizer": tokenizer,
        "similarity_metrics": [
            "jensen-shannon",
            "renyi",
            "cosine",
            "euclidean",
            "variational",
            "bhattacharyya",
        ],
        "diversity_metrics": [
            "num_token_types",
            "type_token_ratio",
            "entropy",
            "simpsons_index",
            "renyi_entropy",
        ],
    }
    return DataSelector(**selector_kwargs)
def test_to_term_dist_raise_error_with_empty_str(data_selector: DataSelector, text):
    """Expect ``to_term_dist`` to raise ``ValueError`` for the given ``text``.

    NOTE(review): going by the test name, ``text`` is presumably an empty
    string supplied by a parametrization not visible here -- confirm.
    """
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        data_selector.to_term_dist(text)
def test_DataSelector_raise_error_when_both_similarity_and_diversity_metrics_are_not_specified(
    tokenizer,
):
    """Expect ``ValueError`` when no metric list is passed to ``DataSelector``.

    The constructor receives only ``keep=0.5`` and the tokenizer; neither
    ``similarity_metrics`` nor ``diversity_metrics`` is given.
    """
    init_kwargs = {"keep": 0.5, "tokenizer": tokenizer}
    with pytest.raises(ValueError):
        DataSelector(**init_kwargs)
def test_DataSelector_raise_error_with_invalid_diversity_metric(tokenizer):
    """Expect ``ValueError`` when ``diversity_metrics`` is ``["invalid_metric"]``."""
    init_kwargs = {
        "keep": 2,
        "tokenizer": tokenizer,
        "diversity_metrics": ["invalid_metric"],
    }
    with pytest.raises(ValueError):
        DataSelector(**init_kwargs)
def test_DataSelector_raise_error_with_invalid_select_float(keep, tokenizer):
    """Expect ``ValueError`` when ``DataSelector`` is built with the given ``keep``.

    NOTE(review): going by the test name, ``keep`` is presumably an invalid
    float supplied by a parametrization not visible here -- confirm.
    """
    init_kwargs = {
        "keep": keep,
        "tokenizer": tokenizer,
        "similarity_metrics": ["euclidean"],
    }
    with pytest.raises(ValueError):
        DataSelector(**init_kwargs)
def test_DataSelector_raise_error_with_zero_or_negative_select_int(keep, tokenizer):
    """Expect ``ValueError`` when ``DataSelector`` is built with the given ``keep``.

    NOTE(review): going by the test name, ``keep`` is presumably zero or a
    negative integer supplied by a parametrization not visible here --
    confirm.
    """
    init_kwargs = {
        "keep": keep,
        "tokenizer": tokenizer,
        "similarity_metrics": ["euclidean"],
    }
    with pytest.raises(ValueError):
        DataSelector(**init_kwargs)
def test_compute_diversity_return_dataframe_of_correct_shape(
    data_selector: DataSelector, corpus: Corpus
):
    """Check that ``compute_diversities`` yields one column per diversity metric.

    The selector is fitted on ``corpus`` first. The second dimension of the
    result must then equal the number of configured diversity metrics.
    """
    data_selector.fit(corpus)
    diversity_scores = data_selector.compute_diversities(corpus)

    n_columns = diversity_scores.shape[1]
    n_metrics = len(data_selector.diversity_metrics)
    assert n_columns == n_metrics
def test_to_term_dist_correctness(data_selector: DataSelector, text):
    """Check that ``to_term_dist`` has as many non-zero entries as ``text`` has words.

    Words are obtained by splitting ``text`` on single spaces.

    NOTE(review): this equality presumably relies on every word in ``text``
    mapping to its own distinct term; a repeated word would lower the
    non-zero count -- confirm against the inputs used for ``text``.
    """
    distribution = data_selector.to_term_dist(text)
    nonzero_indices = distribution.nonzero()[0]
    words = text.split(" ")
    assert len(nonzero_indices) == len(words)
def test_to_term_dist_return_a_valid_proba_dist(data_selector: DataSelector, text):
    """Check that ``to_term_dist`` returns a valid probability distribution.

    The entries must sum to approximately 1.0 (per ``np.isclose``) and none
    may be negative.
    """
    distribution = data_selector.to_term_dist(text)

    total_mass = distribution.sum()
    assert np.isclose(total_mass, 1.0)

    is_non_negative = distribution >= 0
    assert is_non_negative.all()