Ejemplo n.º 1
0
    def test_flatten_json(self):
        """Load a JSON-lines file with ``flatten=True`` and check its columns.

        The computed dataframe must contain both ``persons.*.lastName`` and
        ``persons.*.name`` among its column names.
        """
        source = DataSource(
            format="json",
            flatten=True,
            path=os.path.join(FILES_PATH, "to-be-flattened.jsonl"),
        )
        columns = source.to_dataframe().compute().columns

        for c in ("persons.*.lastName", "persons.*.name"):
            self.assertIn(c, columns, f"Expected {c} as column name")
Ejemplo n.º 2
0
    def test_read_parquet(self):
        """Read a parquet file and check the expected columns are present.

        The computed dataframe must expose a ``reviewerID`` column and a
        ``path`` column.
        """
        file_path = os.path.join(FILES_PATH, "test.parquet")
        ds = DataSource(format="parquet", path=file_path)

        df = ds.to_dataframe().compute()
        # assertIn reports the missing member and the container on failure,
        # unlike assertTrue(x in y), which only says "False is not true".
        self.assertIn("reviewerID", df.columns)
        self.assertIn("path", df.columns)
Ejemplo n.º 3
0
 def test_reader_csv_with_leading_and_trailing_spaces_in_examples(self):
     """Read a ``;``-separated CSV and check that a ``name`` column exists."""
     csv_file = os.path.join(TEST_RESOURCES, "trailing_coma_in_headers.csv")
     reader_options = {"sep": ";"}

     ds = DataSource(format="csv", source=csv_file, attributes=reader_options)
     columns = ds.to_dataframe().compute().columns

     self.assertIn("name", columns)
Ejemplo n.º 4
0
    def test_read_csv(self):
        """Read a CSV file and check it yields rows plus a ``path`` column."""
        file_path = os.path.join(TEST_RESOURCES, "dataset_source.csv")

        datasource = DataSource(format="csv", path=file_path)
        data_frame = datasource.to_dataframe().compute()

        # unittest assertions instead of a bare ``assert``: they still run
        # under ``python -O`` and produce descriptive failure messages.
        self.assertGreater(len(data_frame), 0)
        self.assertIn("path", data_frame.columns)
Ejemplo n.º 5
0
    def test_read_json(self):
        """Read a JSON-lines file and check it yields rows plus a ``path`` column."""
        file_path = os.path.join(FILES_PATH, "dataset_source.jsonl")

        datasource = DataSource(format="json", path=file_path)
        data_frame = datasource.to_dataframe().compute()

        # unittest assertions instead of a bare ``assert``: they still run
        # under ``python -O`` and produce descriptive failure messages.
        self.assertGreater(len(data_frame), 0)
        self.assertIn("path", data_frame.columns)
Ejemplo n.º 6
0
    def test_read_excel(self):
        """Read an ``.xlsx`` file and check it yields rows plus a ``path`` column."""
        file_path = os.path.join(FILES_PATH, "test.xlsx")

        datasource = DataSource(format="xlsx", path=file_path)
        data_frame = datasource.to_dataframe().compute()

        # unittest assertions instead of a bare ``assert``: they still run
        # under ``python -O`` and produce descriptive failure messages.
        self.assertGreater(len(data_frame), 0)
        self.assertIn("path", data_frame.columns)
Ejemplo n.º 7
0
    def test_flatten_nested_list(self):
        """Flatten a JSON-lines file with nested lists and check its columns.

        The computed dataframe must contain ``classification.*.origin.*.key``
        and ``classification.*.origin.*.source`` among its column names.
        """
        expected_columns = (
            "classification.*.origin.*.key",
            "classification.*.origin.*.source",
        )
        nested_path = os.path.join(FILES_PATH, "nested-list.jsonl")

        source = DataSource(format="json", flatten=True, path=nested_path)
        frame = source.to_dataframe().compute()

        for c in expected_columns:
            self.assertIn(c, frame.columns, f"Expected {c} as data column")
Ejemplo n.º 8
0
    def test_add_mock_format(self):
        """Register a custom format parser and use it through ``DataSource``.

        A parser is registered under the name ``new-format``; a ``DataSource``
        is then built once with ``format="new-format"`` and once with
        ``source="new-format"``. In both cases the resulting dataframe must
        expose a non-``None`` ``columns`` attribute.
        """

        def ds_parser(*args, **kwargs):
            """Return a single-partition dask dataframe holding 0..99."""
            from dask import dataframe as ddf
            import pandas as pd

            return ddf.from_pandas(pd.DataFrame(list(range(100))),
                                   npartitions=1)

        DataSource.add_supported_format("new-format", ds_parser)

        for ds in [
                DataSource(format="new-format"),
                DataSource(source="new-format")
        ]:
            # assertIsNotNone gives a clearer failure than assertFalse(x is None).
            self.assertIsNotNone(ds.to_dataframe().columns)
Ejemplo n.º 9
0
 def test_load_multiple_formats(self):
     """Passing a ``.jsonl`` and a ``.csv`` path together must raise ``TypeError``."""
     mixed_sources = [
         os.path.join(FILES_PATH, file_name)
         for file_name in ("dataset_source.jsonl", "dataset_source.csv")
     ]

     with pytest.raises(TypeError):
         DataSource(source=mixed_sources)
Ejemplo n.º 10
0
    def test_to_mapped(self):
        """Check mapped outputs expose the mapping's target field names.

        The same JSON-lines file is loaded twice, once via ``format``/``path``
        and once via ``source``, with an identical mapping. For each data
        source both the mapped dataframe columns and the first record of the
        mapped bag must contain ``label`` and ``tokens``.
        """
        the_mapping = {"label": "overall", "tokens": "summary"}
        expected_fields = ("label", "tokens")

        with_format = DataSource(
            format="json",
            mapping=the_mapping,
            path=os.path.join(FILES_PATH, "dataset_source.jsonl"),
        )
        with_source = DataSource(
            source=os.path.join(FILES_PATH, "dataset_source.jsonl"),
            mapping=the_mapping,
        )

        for ds in (with_format, with_source):
            mapped_frame = ds.to_mapped_dataframe()
            for field in expected_fields:
                self.assertIn(field, mapped_frame.columns)

            first_record = ds.to_mapped_bag().take(1)[0]
            for field in expected_fields:
                self.assertIn(field, first_record)
Ejemplo n.º 11
0
 def test_override_format(self):
     """An unknown explicit ``format`` must raise ``TypeError`` even with a ``.jsonl`` source."""
     with pytest.raises(TypeError):
         DataSource(
             source=os.path.join(FILES_PATH, "*.jsonl"),
             format="not-found",
         )
Ejemplo n.º 12
0
    def test_no_mapping(self):
        """Requesting a mapped dataframe without a mapping must raise ``ValueError``."""
        jsonl_path = os.path.join(FILES_PATH, "dataset_source.jsonl")
        unmapped = DataSource(format="json", path=jsonl_path)

        with pytest.raises(ValueError):
            unmapped.to_mapped_dataframe()
Ejemplo n.º 13
0
 def test_wrong_format(self):
     """An unknown format name must raise ``TypeError``.

     The name is passed first through the ``format`` keyword and then
     through the ``source`` keyword (marked "new format" in the original).
     """
     for bad_kwargs in ({"format": "not-found"}, {"source": "not-found"}):
         with pytest.raises(TypeError):
             DataSource(**bad_kwargs)