Example #1
0
def test_split_data(sample_posts):
    """Check that splitting the sample docs in half gives one doc per split."""
    docs = transform.make_docs(transform.parse_posts(sample_posts))
    # train_ratio=0.5 is expected to leave one doc in each split here.
    splits = transform.split_data(docs, train_ratio=0.5)
    assert len(splits.train) == 1
    assert len(splits.valid) == 1
Example #2
0
def preprocess(run_id: str, labels: str) -> Path:
    """Turn the raw posts of a run into train/valid doc binaries.

    The raw posts for ``run_id`` are read, parsed, filtered by ``labels``,
    converted to docs and split with ``train_ratio=0.6``. Each split is
    converted with ``transform.convert_to_doc_binary`` and handed to
    ``io.write_docs``.

    Args:
        run_id: Identifier passed to ``io.read_raw_posts`` and
            ``io.write_docs``.
        labels: Label filter forwarded unchanged to
            ``transform.filter_posts``.

    Returns:
        ``paths.DATA_DIRS.processed`` (presumably the directory the docs
        were written to; confirm against ``io.write_docs``).
    """
    posts = transform.filter_posts(
        transform.parse_posts(io.read_raw_posts(run_id)),
        labels,
    )
    splits = transform.split_data(transform.make_docs(posts), train_ratio=0.6)

    # Convert both splits before writing anything, so a conversion failure
    # leaves no partially written output behind.
    train_bin = transform.convert_to_doc_binary(splits.train)
    valid_bin = transform.convert_to_doc_binary(splits.valid)

    io.write_docs(train_bin, transform.Split.TRAIN.value, run_id)
    io.write_docs(valid_bin, transform.Split.VALID.value, run_id)
    return paths.DATA_DIRS.processed
Example #3
0
def sample_doc_bin(sample_posts: list[transform.RawPost]) -> DocBin:
    """Build a doc binary from the sample posts (parse -> docs -> binary)."""
    return transform.convert_to_doc_binary(
        transform.make_docs(transform.parse_posts(sample_posts))
    )
Example #4
0
def test_parse_posts(sample_posts):
    """Check that the sample posts parse into (text, label) tuple pairs."""
    expected = [
        ("AITA sample text", "NTA"),
        ("AITA sample text", "YTA"),
    ]
    assert transform.parse_posts(sample_posts) == expected
Example #5
0
def test_convert_to_doc_binary(sample_posts):
    """Check that converting the sample docs yields a DocBin instance."""
    docs = transform.make_docs(transform.parse_posts(sample_posts))
    result = transform.convert_to_doc_binary(docs)
    assert isinstance(result, DocBin)
Example #6
0
def test_make_docs(sample_posts):
    """Test that docs are created from list of tuples.

    Every item produced by ``transform.make_docs`` must be a ``Doc``, and
    at least one item must be produced.
    """
    parsed_posts = transform.parse_posts(sample_posts)
    # Materialize so the result can be both checked for emptiness and
    # iterated, whatever kind of iterable make_docs returns.
    docs = list(transform.make_docs(parsed_posts))
    # Guard against a vacuous pass: all() is True for an empty iterable.
    # Assumes the sample_posts fixture is non-empty; confirm in conftest.
    assert docs, "make_docs returned no docs for the sample posts"
    assert all(isinstance(doc, Doc) for doc in docs)
Example #7
0
def test_filter_posts_no_filter(sample_posts):
    """Check that filtering without labels keeps the number of posts unchanged."""
    posts = transform.parse_posts(sample_posts)
    unfiltered = transform.filter_posts(posts)
    assert len(unfiltered) == len(posts)
Example #8
0
def test_filter_posts(sample_posts):
    """Test that posts are correctly filtered out by label.

    After filtering with ``labels="NTA"`` the result must be non-empty and
    every remaining (text, label) pair must carry the "NTA" label.
    """
    parsed_posts = transform.parse_posts(sample_posts)
    filtered_posts = transform.filter_posts(parsed_posts, labels="NTA")
    # Guard against a vacuous pass: all() is True for an empty iterable, so
    # a filter that dropped every post would otherwise go unnoticed.
    # Assumes the sample_posts fixture holds at least one "NTA" post;
    # confirm in conftest.
    assert filtered_posts, "filter_posts removed every post, including NTA ones"
    assert all(label == "NTA" for _, label in filtered_posts)
Example #9
0
def test_parse_posts_invalid():
    """Check that parse_posts raises ValueError for a malformed post dict."""
    malformed_post = {"not_valid_text": "value", "label": "NTA"}
    with pytest.raises(ValueError):
        transform.parse_posts([malformed_post])