Exemple #1
0
    def test_csv_equality(self):
        """Schemas from two CSV samples with different columns must conflict.

        Infers a schema from ``csv_sample.csv`` and ``csv_sample_2.csv``
        (headers read), runs ``find_conflicts`` on the pair and checks the
        dict form of the resulting ``SchemaConflict``.
        """
        local_fs = LocalFileSystem()

        with local_fs.open(from_root('/test/sample_data/csv_sample.csv')) as handle:
            first = from_file(handle, {"read_headers": True})
            assert isinstance(first, TextSchema)

        with local_fs.open(from_root('/test/sample_data/csv_sample_2.csv')) as handle:
            second = from_file(handle, {"read_headers": True})
            assert isinstance(second, TextSchema)

        conflict = find_conflicts([first, second])[0]
        assert isinstance(conflict, SchemaConflict)

        # Columns present in both files, and the ones only the second file has.
        shared_columns = [
            {'Name': 'type', 'Type': 'object'},
            {'Name': 'price', 'Type': 'float64'},
        ]
        extra_columns = [
            {'Name': 'availabile', 'Type': 'bool'},
            {'Name': 'date', 'Type': 'object'},
        ]
        expected = {
            'CountDistinctSchemas': 2,
            'DistinctSchemas': [
                {'SchemaType': 'csv', 'Columns': shared_columns},
                {'SchemaType': 'csv', 'Columns': shared_columns + extra_columns},
            ],
            'NonOverlappingColumns': [
                {'name': 'availabile', 'type': 'bool'},
                {'name': 'date', 'type': 'object'},
            ],
        }
        assert conflict.to_dict() == {'SchemaConflicts': expected}
Exemple #2
0
 def test_jsonl(self):
     """A JSON-lines sample is inferred as a JsonSchema of string fields."""
     field_names = ('field', 'field2', 'field3', 'field4', 'field5', 'field6', 'field7')
     expected = {
         '$schema': 'http://json-schema.org/schema#',
         'properties': {field: {'type': 'string'} for field in field_names},
         'type': 'object',
     }

     local_fs = LocalFileSystem()
     with local_fs.open(from_root('/test/sample_data/json_lines.jsonl')) as handle:
         inferred = from_file(handle)
         assert isinstance(inferred, JsonSchema)
         assert inferred.schema == expected
Exemple #3
0
 def test_complex_json(self):
     """A nested JSON sample is inferred as a JsonSchema with nested objects."""
     expected = {
         '$schema': 'http://json-schema.org/schema#',
         'type': 'object',
         'properties': {
             'data': {
                 'type': 'array',
                 'items': {
                     'type': 'object',
                     'properties': {
                         'field1': {'type': 'string'},
                         'field2': {'type': ['integer', 'string']},
                         'field3': {'type': 'string'},
                         'field4': {'type': 'string'},
                         'field5': {
                             'type': 'object',
                             'properties': {'some_other_stuff': {'type': 'string'}},
                             'required': ['some_other_stuff'],
                         },
                     },
                 },
             },
         },
         'required': ['data'],
     }

     local_fs = LocalFileSystem()
     with local_fs.open(from_root('/test/sample_data/complex_json.json')) as handle:
         inferred = from_file(handle)
         assert isinstance(inferred, JsonSchema)
         assert inferred.schema == expected
Exemple #4
0
 def test_invalid_json(self):
     """A malformed JSON sample yields an InvalidSchema naming the file."""
     local_fs = LocalFileSystem()
     with local_fs.open(from_root('/test/sample_data/bad_json.json')) as handle:
         result = from_file(handle, {})
         assert isinstance(result, InvalidSchema)
         # The reason must mention the full path and the detected file type.
         expected_fragment = f"File type not supported for file {from_root('/test/sample_data/bad_json.json')}.  Type: ASCII text, with no line terminators"
         assert expected_fragment in result.reason
Exemple #5
0
 def test_file_not_supported(self):
     """An unsupported file type yields an InvalidSchema with a fixed prefix.

     Only the start of ``reason`` is checked; the remainder of the message
     is not asserted here.
     """
     logger.set_level("error")
     fs = LocalFileSystem()
     with fs.open(from_root('/test/sample_data/unsupported_file_type.usf')) as f:
         schema = from_file(f)
         assert isinstance(schema, InvalidSchema)
         # startswith() replaces a hard-coded [0:32] slice whose length had to
         # be kept in sync with the literal by hand.
         assert schema.reason.startswith("File type not supported for file")
Exemple #6
0
 def test_csv_no_header(self):
     """Without header options, CSV columns are named by integer position."""
     local_fs = LocalFileSystem()
     with local_fs.open(from_root('/test/sample_data/csv_no_header.csv')) as handle:
         inferred = from_file(handle)
         assert isinstance(inferred, TextSchema)
         column_names = [column.name for column in inferred.columns]
         assert column_names == [0, 1]
         column_types = [column.type for column in inferred.columns]
         assert column_types == ["object", "float64"]
Exemple #7
0
 def test_valid_csv(self):
     """With ``read_headers`` set, CSV columns take their names from the file."""
     local_fs = LocalFileSystem()
     with local_fs.open(from_root('/test/sample_data/csv_sample.csv')) as handle:
         inferred = from_file(handle, {"read_headers": True})
         assert isinstance(inferred, TextSchema)
         column_names = [column.name for column in inferred.columns]
         assert column_names == ["type", "price"]
         column_types = [column.type for column in inferred.columns]
         assert column_types == ["object", "float64"]
Exemple #8
0
 def test_valid_json(self):
     """A flat JSON sample is inferred as a JsonSchema with required fields.

     Also checks the ``to_dict`` and ``to_pd_dict`` forms of the schema.
     """
     string_type = {'type': 'string'}
     expected_schema = {
         '$schema': 'http://json-schema.org/schema#',
         'properties': {
             'field': dict(string_type),
             'field2': dict(string_type),
             'field3': dict(string_type),
         },
         'required': ['field', 'field2', 'field3'],
         'type': 'object',
     }

     local_fs = LocalFileSystem()
     with local_fs.open(from_root('/test/sample_data/json_simple.json')) as handle:
         inferred = from_file(handle)
         assert isinstance(inferred, JsonSchema)
         assert inferred.schema == expected_schema
         assert inferred.to_dict() == {'Columns': [], 'SchemaType': 'json'}
         assert inferred.to_pd_dict() == {}
Exemple #9
0
    def test_check_schemas(self):
        """Exercise ``find_conflicts`` over several combinations of schemas.

        Loads JSON, JSON-lines, CSV and unsupported samples, then checks the
        first element returned by ``find_conflicts`` for: two nested JSON
        schemas, those plus a flat JSON schema, a JSON/CSV mix (rejected),
        and two JSON-lines schemas.
        """
        fs = LocalFileSystem()

        def load_schema(relative_path, expected_type, *from_file_args):
            # Open the sample, infer its schema and check the inferred type
            # while the file is still open.
            with fs.open(from_root(relative_path)) as handle:
                loaded = from_file(handle, *from_file_args)
                assert isinstance(loaded, expected_type)
            return loaded

        complex_a = load_schema('/test/sample_data/complex_json.json', JsonSchema)
        complex_b = load_schema('/test/sample_data/complex_json_2.json', JsonSchema)
        simple = load_schema('/test/sample_data/json_simple.json', JsonSchema)
        # The unsupported sample is only checked for its type; it takes no
        # part in the conflict checks below.
        load_schema('/test/sample_data/unsupported_file_type.usf', InvalidSchema)
        csv_schema = load_schema('/test/sample_data/csv_sample.csv', TextSchema, {"read_headers": True})
        lines_a = load_schema('/test/sample_data/json_lines.jsonl', JsonSchema)
        lines_b = load_schema('/test/sample_data/json_lines2.jsonl', JsonSchema)

        # Sub-schema of the 'data' array expected in both nested-JSON results.
        data_schema = {
            'items': {
                'properties': {
                    'field1': {'type': 'string'},
                    'field2': {'type': ['integer', 'string']},
                    'field3': {'type': 'string'},
                    'field4': {'type': 'string'},
                    'field5': {
                        'properties': {'some_other_stuff': {'type': 'string'}},
                        'required': ['some_other_stuff'],
                        'type': 'object',
                    },
                    'field6': {'type': 'string'},
                },
                'type': 'object',
            },
            'type': 'array',
        }

        # Two nested JSON schemas.
        merged = find_conflicts([complex_a, complex_b])[0]
        assert isinstance(merged, JsonSchema)
        assert merged.schema == {
            '$schema': 'http://json-schema.org/schema#',
            'properties': {'data': data_schema},
            'required': ['data'],
            'type': 'object',
        }

        # Nested plus flat JSON schemas.
        merged = find_conflicts([complex_a, complex_b, simple])[0]
        assert isinstance(merged, JsonSchema)
        assert merged.schema == {
            '$schema': 'http://json-schema.org/schema#',
            'properties': {
                'data': data_schema,
                'field': {'type': 'string'},
                'field2': {'type': 'string'},
                'field3': {'type': 'string'},
            },
            'required': [],
            'type': 'object',
        }

        # Mixing JSON with CSV is rejected.
        mixed = find_conflicts([complex_a, complex_b, simple, csv_schema])[0]
        assert isinstance(mixed, InvalidSchema)
        assert mixed.reason == "Mixed type schemas not supported at this time.  Ensure that files are of one type: ['csv', 'json']"

        # Two JSON-lines schemas.
        merged = find_conflicts([lines_a, lines_b])[0]
        assert isinstance(merged, JsonSchema)
        line_fields = (
            'field', 'field2', 'field3', 'field4', 'field5', 'field6', 'field7',
            'other', 'other2', 'other3',
        )
        assert merged.schema == {
            '$schema': 'http://json-schema.org/schema#',
            'properties': {field: {'type': 'string'} for field in line_fields},
            'required': ['other'],
            'type': 'object',
        }
Exemple #10
0
 def test_snappy_parquet_schema_support(self):
     """The ``sample.snappy.parquet`` file is inferred as a ParquetSchema."""
     logger.set_level("info")
     local_fs = LocalFileSystem()
     sample_path = from_root('/test/sample_data/sample.snappy.parquet')
     with local_fs.open(sample_path) as handle:
         inferred = from_file(handle)
         assert isinstance(inferred, ParquetSchema)
Exemple #11
0
 def test_valid_csv_crlf_lf(self):
     """The ``csv_crlf_sample.csv`` file is inferred as a TextSchema."""
     local_fs = LocalFileSystem()
     sample_path = from_root('/test/sample_data/csv_crlf_sample.csv')
     with local_fs.open(sample_path) as handle:
         inferred = from_file(handle, {"read_headers": True})
         assert isinstance(inferred, TextSchema)
Exemple #12
0
    def infer_table(
        self,
        path: str,
        name: Optional[str],
        options: Optional[dict] = None,
        resp: Optional[Response] = None
    ) -> Tuple[Union[Table, InvalidTables], Response]:
        """Infer a table from the schemas of the keys listed under ``path``.

        Args:
            path: Location to list keys from; it is normalised through
                ``self.get_path(path).full_path()`` before listing.
            name: Table name, resolved via ``self.get_name(name, path)``.
            options: Optional settings. ``"sample_size"`` limits schema
                inference to a random sample of that many keys; a value that
                is not a positive integer falls back to 3 with a warning.
                The whole dict is also forwarded to ``schemas.from_file``.
            resp: Response to continue using; a new ``Response`` is created
                when omitted.

        Returns:
            A ``(result, response)`` tuple. ``result`` is the inferred
            ``Table``, or an ``InvalidTables`` describing why no table could
            be built. When no keys are found the response status is set to
            404.
        """
        opt = options or {}
        logger.info(f"Fetching keys at {path}")
        response: Response = resp or Response()

        path = self.get_path(path).full_path()
        keys, response = self.list_keys(path, response)

        logger.debug(f"{len(keys)} keys at {path}")

        final: Union[Table, InvalidTables]

        sample_size = opt.get("sample_size")
        if sample_size:
            import random
            try:
                ss = int(sample_size)
            except (TypeError, ValueError):
                # int() raises ValueError (not TypeError) for strings such as
                # "abc"; both mean the option is unusable.
                ss = 0
            if ss < 1:
                # Zero or negative sizes would either empty the key list
                # (reported as a false 404 below) or make random.sample raise.
                logger.warning(f"Invalid sample size (int): {sample_size}")
                ss = 3

            logger.warning(
                f"Sampling keys to determine schema. Sample size: {ss}.")
            if ss < len(keys):
                keys = random.sample(keys, ss)

        if len(keys) == 0:
            response.set_status(404)
            final = InvalidTables([TableNotFound(f"No keys at {path}")])
            return final, response

        try:
            inferred = []
            for key in keys:
                # Close each handle as soon as its schema has been read so
                # that large key listings do not leak open files.
                # NOTE(review): assumes from_file reads everything it needs
                # before returning - confirm for every schema type.
                with self.client().open(key.full_path()) as handle:
                    inferred.append(schemas.from_file(handle, opt))

            valid, invalid_schemas = sequence(inferred, Schema, InvalidSchema)
            non_empty = [v for v in valid if not isinstance(v, EmptySchema)]
            validated, paths = CheckSchemas.find_conflicts(non_empty)
            table = CheckSchemas.get_table(self.get_name(name, path),
                                           validated, paths)
            invalid_tables = [
                InvalidTable("Invalid Schema", invalid_schema=invalid)
                for invalid in invalid_schemas
            ]
            if isinstance(table, Table):
                final = table
            else:
                # get_table did not produce a Table; report its result
                # alongside the per-file invalid schemas.
                invalid_tables.append(table)
                final = InvalidTables(invalid_tables)
        except (ClientError, PermissionError) as e:
            final = InvalidTables(
                [InvalidTable(f"Not able to infer table: {message(e)}")])

        return final, response