Example 1
def test_DataSelector_selects_correct_num_of_docs_with_float_select_arg(
    corpus: Corpus,
    tokenizer: PreTrainedTokenizerFast,
    keep: float,
    correct_n_docs: int,
):
    data_selector = DataSelector(
        keep=keep,
        tokenizer=tokenizer,
        similarity_metrics=[
            "jensen-shannon",
            "renyi",
            "cosine",
            "euclidean",
            "variational",
            "bhattacharyya",
        ],
        diversity_metrics=[
            "num_token_types",
            "type_token_ratio",
            "entropy",
            "simpsons_index",
            "renyi_entropy",
        ],
    )
    selected_corpus = data_selector.fit_transform(corpus)
    assert len(selected_corpus) == correct_n_docs
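The keep and correct_n_docs arguments in Example 1 are supplied by a @pytest.mark.parametrize decorator that these excerpts omit. A minimal sketch of such a parametrization is shown below; the (keep, correct_n_docs) pairs are purely illustrative, since the real values depend on how many documents the corpus fixture holds.

import pytest

# Hypothetical cases: e.g. with a 10-document corpus, keep=0.5 would select 5 docs.
@pytest.mark.parametrize("keep, correct_n_docs", [(0.5, 5), (0.2, 2)])
def test_DataSelector_selects_correct_num_of_docs_with_float_select_arg(
    corpus, tokenizer, keep, correct_n_docs
):
    ...  # body as shown in Example 1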
Example 2
def test_compute_metrics_adds_composite_score_column(
        data_selector: DataSelector, corpus: Corpus):
    expected_features = (
        len(data_selector.similarity_metrics) +
        len(data_selector.diversity_metrics) + 1  # composite score
    )

    data_selector.fit(corpus)
    scores = data_selector.compute_metrics(corpus)
    assert scores.shape[1] == expected_features
    assert "composite" in scores
Example 3
def data_selector(tokenizer) -> DataSelector:
    return DataSelector(
        keep=2,
        tokenizer=tokenizer,
        similarity_metrics=[
            "jensen-shannon",
            "renyi",
            "cosine",
            "euclidean",
            "variational",
            "bhattacharyya",
        ],
        diversity_metrics=[
            "num_token_types",
            "type_token_ratio",
            "entropy",
            "simpsons_index",
            "renyi_entropy",
        ],
    )
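Several examples also depend on tokenizer, corpus, and text fixtures whose definitions are not included in these excerpts. A minimal sketch of what they might look like, assuming a Hugging Face fast tokenizer and treating the corpus as a plain list of document strings (both are assumptions; the project's real Corpus type and fixture contents are not shown here):

import pytest
from transformers import AutoTokenizer

@pytest.fixture
def tokenizer():
    # Any fast tokenizer should do for a sketch; the checkpoint is arbitrary.
    return AutoTokenizer.from_pretrained("bert-base-uncased")

@pytest.fixture
def corpus():
    # Placeholder documents; the real Corpus fixture is project-specific.
    return ["the quick brown fox", "jumps over the lazy dog", "hello world again"]

@pytest.fixture
def text():
    return "a short example sentence"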
Example 4
def test_to_term_dist_raise_error_with_empty_str(data_selector: DataSelector,
                                                 text):
    with pytest.raises(ValueError):
        data_selector.to_term_dist(text)
Example 5
def test_DataSelector_raise_error_when_both_similarity_and_diversity_metrics_are_not_specified(
    tokenizer, ):
    with pytest.raises(ValueError):
        DataSelector(keep=0.5, tokenizer=tokenizer)
Example 6
def test_DataSelector_raise_error_with_invalid_diversity_metric(tokenizer):
    with pytest.raises(ValueError):
        DataSelector(keep=2,
                     tokenizer=tokenizer,
                     diversity_metrics=["invalid_metric"])
Example 7
def test_DataSelector_raise_error_with_invalid_select_float(keep, tokenizer):
    with pytest.raises(ValueError):
        DataSelector(keep=keep,
                     tokenizer=tokenizer,
                     similarity_metrics=["euclidean"])
Example 8
def test_DataSelector_raise_error_with_zero_or_negative_select_int(
        keep, tokenizer):
    with pytest.raises(ValueError):
        DataSelector(keep=keep,
                     tokenizer=tokenizer,
                     similarity_metrics=["euclidean"])
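Examples 7 and 8 likewise rely on omitted parametrizations that feed in invalid keep values. Judging by the test names, a float keep is expected to be a valid fraction and an integer keep must be positive; the values below are guesses chosen only to illustrate the shape of the decorators, not the project's actual cases.

import pytest

# Hypothetical invalid floats inferred from the test name.
@pytest.mark.parametrize("keep", [-0.5, 0.0])
def test_DataSelector_raise_error_with_invalid_select_float(keep, tokenizer):
    ...  # body as shown in Example 7

# Hypothetical zero/negative ints inferred from the test name.
@pytest.mark.parametrize("keep", [0, -1])
def test_DataSelector_raise_error_with_zero_or_negative_select_int(keep, tokenizer):
    ...  # body as shown in Example 8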
Example 9
def test_compute_diversity_return_dataframe_of_correct_shape(
        data_selector: DataSelector, corpus: Corpus):
    data_selector.fit(corpus)
    scores = data_selector.compute_diversities(corpus)
    assert scores.shape[1] == len(data_selector.diversity_metrics)
Example 10
def test_to_term_dist_correctness(data_selector: DataSelector, text):
    term_dist = data_selector.to_term_dist(text)
    assert len(term_dist.nonzero()[0]) == len(text.split(" "))
Example 11
def test_to_term_dist_return_a_valid_proba_dist(data_selector: DataSelector,
                                                text):
    term_dist = data_selector.to_term_dist(text)
    assert np.isclose(term_dist.sum(), 1.0)
    assert (term_dist >= 0).all()