def test_from_pandas():
    """Check that a small in-memory DataFrame converts into a ``Dataset``.

    Verifies the column names, the values of column ``"a"`` and the
    number of rows of the resulting dataset.
    """
    frame = pd.DataFrame(
        {
            "a": [1, 2, 3],
            "b": [4, 5, 6],
        }
    )

    converted = Dataset.from_pandas(frame)

    # Column order must follow the DataFrame's column order.
    assert converted.dataset.column_names == ["a", "b"]
    assert converted["a"] == [1, 2, 3]
    assert len(converted) == 3
def test_from_parquet_file(resources_data_path):
    """Example of loading a parquet file into a ``Dataset`` via pandas.

    Reads ``test.parquet`` from ``resources_data_path`` with pandas, converts
    the frame and checks that the ``reviewerID`` column is present.
    """
    frame = pd.read_parquet(resources_data_path / "test.parquet")
    converted = Dataset.from_pandas(frame)

    assert "reviewerID" in converted.column_names
def test_from_excel_file(resources_data_path):
    """Example of loading an excel file into a ``Dataset`` via pandas.

    Reads ``test.xlsx`` from ``resources_data_path`` with pandas and converts
    the frame using an explicit feature schema, then checks that the
    resulting dataset is not empty.
    """
    # Explicit schema: two integer columns and two string columns.
    as_string = Value("string")
    as_int = Value("int64")
    schema = Features(
        Notification=as_int,
        Type=as_string,
        Plant=as_int,
        Serial=as_string,
    )

    frame = pd.read_excel(resources_data_path / "test.xlsx")
    converted = Dataset.from_pandas(frame, features=schema)

    assert len(converted) > 0
def training_dataset() -> Dataset:
    """Build a three-row ``Dataset`` with ``text`` and ``entities`` columns.

    Each entry in ``entities`` is a list of span dicts with ``start``,
    ``end`` and ``label`` keys. The rows cover three cases: a span covering
    exactly the word "NER", a span that runs past the end of that word, and
    a text without any entity.
    """
    texts = [
        "This is a simple NER test",
        "This is a simple NER test with misaligned spans",
        "No NER here",
    ]
    spans = [
        [{"start": 17, "end": 20, "label": "NER"}],
        # End offset 22 overshoots the word "NER" (which ends at 20).
        [{"start": 17, "end": 22, "label": "NER"}],
        [],
    ]

    frame = pd.DataFrame({"text": texts, "entities": spans})
    return Dataset.from_pandas(frame)