def test_split_data(sample_posts):
    """Check that splitting the sample docs at a 0.5 ratio gives one doc per split."""
    sample_docs = transform.make_docs(transform.parse_posts(sample_posts))

    splits = transform.split_data(sample_docs, train_ratio=0.5)

    assert len(splits.train) == 1
    assert len(splits.valid) == 1
def preprocess(run_id: str, labels: str) -> Path:
    """Turn the raw posts of a run into train/valid doc binaries for spacy train.

    The raw posts read for ``run_id`` are parsed, filtered with ``labels``,
    turned into docs and split with ``train_ratio=0.6``. Each split is
    converted to a doc binary and written out under ``run_id``.

    Returns:
        ``paths.DATA_DIRS.processed``.
    """
    posts = transform.filter_posts(
        transform.parse_posts(io.read_raw_posts(run_id)),
        labels,
    )
    splits = transform.split_data(transform.make_docs(posts), train_ratio=0.6)

    # Convert both splits before writing anything, so a conversion failure
    # leaves no partially written output.
    train_bin = transform.convert_to_doc_binary(splits.train)
    valid_bin = transform.convert_to_doc_binary(splits.valid)

    io.write_docs(train_bin, transform.Split.TRAIN.value, run_id)
    io.write_docs(valid_bin, transform.Split.VALID.value, run_id)

    return paths.DATA_DIRS.processed
def sample_doc_bin(sample_posts: list[transform.RawPost]) -> DocBin:
    """Build a doc binary from the sample posts (parse -> make docs -> convert)."""
    return transform.convert_to_doc_binary(
        transform.make_docs(transform.parse_posts(sample_posts))
    )
def test_parse_posts(sample_posts):
    """Check that parsing the sample posts yields the expected (text, label) pairs."""
    expected = [("AITA sample text", "NTA"), ("AITA sample text", "YTA")]

    actual = transform.parse_posts(sample_posts)

    assert actual == expected
def test_convert_to_doc_binary(sample_posts):
    """Check that converting the sample docs produces a DocBin instance."""
    sample_docs = transform.make_docs(transform.parse_posts(sample_posts))

    converted = transform.convert_to_doc_binary(sample_docs)

    assert isinstance(converted, DocBin)
def test_make_docs(sample_posts):
    """Check that every item built from the parsed sample posts is a Doc."""
    text_label_pairs = transform.parse_posts(sample_posts)

    for made in transform.make_docs(text_label_pairs):
        assert isinstance(made, Doc)
def test_filter_posts_no_filter(sample_posts):
    """Check that filtering without labels keeps the same number of posts."""
    parsed = transform.parse_posts(sample_posts)

    unfiltered = transform.filter_posts(parsed)

    assert len(unfiltered) == len(parsed)
def test_filter_posts(sample_posts):
    """Check that filtering by "NTA" leaves only posts carrying that label."""
    parsed = transform.parse_posts(sample_posts)

    kept = transform.filter_posts(parsed, labels="NTA")

    for _text, label in kept:
        assert label == "NTA"
def test_parse_posts_invalid():
    """Check that parse_posts raises ValueError for a post dict keyed "not_valid_text"."""
    malformed_post = {"not_valid_text": "value", "label": "NTA"}

    with pytest.raises(ValueError):
        transform.parse_posts([malformed_post])