Example No. 1
    def test_generate_schema(self):
        self.assertEqual(
            generate_schema([{
                'id': '1',
                'first_name': 'Connor'
            }, {
                'id': '2',
                'first_name': '1'
            }]), {
                'id': {
                    'type': ['null', 'integer'],
                    '_conversion_type': 'integer'
                },
                'first_name': {
                    'type': ['null', 'string'],
                    '_conversion_type': 'string'
                }
            })

        self.assertEqual(
            generate_schema([{
                'id': '1',
                'cost': '1'
            }, {
                'id': '2',
                'cost': '1.25'
            }]), {
                'id': {
                    'type': ['null', 'integer'],
                    '_conversion_type': 'integer'
                },
                'cost': {
                    'type': ['null', 'number'],
                    '_conversion_type': 'number'
                }
            })

        self.assertEqual(
            generate_schema([{
                'id': '1',
                'date': '2017-01-01'
            }, {
                'id': '2',
                'date': '2017-01-02'
            }]), {
                'id': {
                    'type': ['null', 'integer'],
                    '_conversion_type': 'integer'
                },
                'date': {
                    'type': ['null', 'string'],
                    '_conversion_type': 'string'
                }
            })
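
A minimal sketch of a generate_schema implementation that would satisfy the assertions above, assuming every sampled value arrives as a string. It is an illustration only; the conversion module used in the later examples also takes a table_spec and handles date overrides and nested values.

def _infer_conversion_type(values):
    # Try integer first, then number (float); anything else falls back to string.
    for caster, json_type in ((int, 'integer'), (float, 'number')):
        try:
            for value in values:
                caster(value)
            return json_type
        except ValueError:
            continue
    return 'string'


def generate_schema(samples):
    # Collect every value observed per column across the sampled rows.
    columns = {}
    for row in samples:
        for key, value in row.items():
            columns.setdefault(key, []).append(value)

    schema = {}
    for key, values in columns.items():
        conversion_type = _infer_conversion_type(values)
        schema[key] = {
            'type': ['null', conversion_type],
            '_conversion_type': conversion_type,
        }
    return schema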
Example No. 2
def get_sampled_schema_for_table(config, table_spec):
    logger.info('Sampling records to determine table schema.')

    s3_files = s3.get_input_files_for_table(config, table_spec)

    samples = s3.sample_files(config, table_spec, s3_files)

    metadata_schema = {
        '_s3_source_bucket': {
            'type': 'string'
        },
        '_s3_source_file': {
            'type': 'string'
        },
        '_s3_source_lineno': {
            'type': 'integer'
        },
    }

    data_schema = conversion.generate_schema(samples)

    return {
        'type': 'object',
        'properties': merge_dicts(data_schema, metadata_schema)
    }
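
The merge_dicts helper is not included in these snippets. A minimal sketch that matches how it is called here, assuming a shallow merge in which the metadata columns win on key collisions, could be:

def merge_dicts(first, second):
    # Shallow, non-mutating merge: keys from `second` override keys from `first`.
    merged = dict(first)
    merged.update(second)
    return merged

The helper in the actual tap may instead merge nested dictionaries recursively; only its call sites are visible in these examples.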
Example No. 3
def get_sampled_schema_for_table(config, table_spec):
    LOGGER.info('Sampling records to determine table schema.')

    s3_files_gen = get_input_files_for_table(config, table_spec)

    samples = [sample for sample in sample_files(
        config, table_spec, s3_files_gen)]

    if skipped_files_count:
        LOGGER.warning(
            "%s files got skipped during the last sampling.", skipped_files_count)

    if not samples:
        # Return empty properties to accept everything from the data if no samples are found
        return {
            'type': 'object',
            'properties': {}
        }

    data_schema = conversion.generate_schema(
        samples, table_spec, config.get('string_max_length', False))

    return {
        'type': 'object',
        'properties': data_schema
    }
Example No. 4
def get_sampled_schema_for_table(config: Dict, table_spec: Dict) -> Dict:
    """
    Detects the JSON schema using a sample of table/stream data.
    :param config: Tap config
    :param table_spec: table specs
    :return: detected schema
    """
    LOGGER.info('Sampling records to determine table schema.')

    modified_since = utils.strptime_with_tz(config['start_date'])
    s3_files_gen = get_input_files_for_table(config, table_spec, modified_since)

    samples = list(sample_files(config, table_spec, s3_files_gen))

    if not samples:
        return {}

    metadata_schema = {
        SDC_SOURCE_BUCKET_COLUMN: {'type': 'string'},
        SDC_SOURCE_FILE_COLUMN: {'type': 'string'},
        SDC_SOURCE_LINENO_COLUMN: {'type': 'integer'},
        SDC_EXTRA_COLUMN: {'type': 'array', 'items': {'type': 'string'}},
    }

    data_schema = conversion.generate_schema(samples, table_spec)

    return {
        'type': 'object',
        'properties': merge_dicts(data_schema, metadata_schema)
    }
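
For context, a call to this variant might look as follows. The values are illustrative only (the real tap also needs S3 credentials and bucket settings), and the table_spec fields are borrowed from the specs used elsewhere in these examples.

config = {
    'start_date': '2017-01-01T00:00:00Z',  # parsed above via utils.strptime_with_tz
    'bucket': 'my-example-bucket',          # hypothetical bucket name
}
table_spec = {
    'table_name': 'products',
    'search_prefix': '',
    'search_pattern': '.*\\.csv',
    'key_properties': ['id'],
    'date_overrides': ['added_at'],
}

schema = get_sampled_schema_for_table(config, table_spec)
# schema['properties'] now holds the sampled data columns plus the metadata
# columns (SDC_SOURCE_BUCKET_COLUMN, SDC_SOURCE_FILE_COLUMN, ...) defined above.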
Example No. 5
def get_sampled_schema_for_table(config, table_spec):
    LOGGER.info('Sampling records to determine table schema.')

    s3_files = get_input_files_for_table(config, table_spec)

    if not s3_files:
        return {}

    samples = sample_files(config, table_spec, s3_files)

    metadata_schema = {
        SDC_SOURCE_BUCKET_COLUMN: {
            'type': 'string'
        },
        SDC_SOURCE_FILE_COLUMN: {
            'type': 'string'
        },
        SDC_SOURCE_LINENO_COLUMN: {
            'type': 'integer'
        },
        csv.SDC_EXTRA_COLUMN: {
            'type': 'array',
            'items': {
                'type': 'string'
            }
        },
    }

    data_schema = conversion.generate_schema(samples, table_spec)

    return {
        'type': 'object',
        'properties': merge_dicts(data_schema, metadata_schema)
    }
Example No. 6
def get_sampled_schema_for_table(config, table_spec):
    LOGGER.info('Sampling records to determine table schema.')

    s3_files_gen = get_input_files_for_table(config, table_spec)

    samples = [
        sample for sample in sample_files(config, table_spec, s3_files_gen)
    ]

    if not samples:
        return {}

    data_schema = conversion.generate_schema(samples, table_spec)
    return {'type': 'object', 'properties': data_schema}
Example No. 7
    def test_generate_schema(self):
        samples = [
            dict(id='1',
                 name='productA',
                 added_at='2017/05/18 10:40:22',
                 price='22.99',
                 sold='true',
                 sold_at='2019-11-29'),
            dict(id='4',
                 name='productB',
                 added_at='2017/05/18 10:40:22',
                 price='18',
                 sold='false'),
            dict(id='6',
                 name='productC',
                 added_at='2017/05/18 10:40:22',
                 price='14.6',
                 sold='true',
                 sold_at='2019-12-11'),
        ]

        table_specs = {'date_overrides': ['added_at']}

        schema = generate_schema(samples, table_specs)

        self.assertDictEqual(
            {
                'id': {
                    'type': ['null', 'integer']
                },
                'name': {
                    'type': ['null', 'string']
                },
                'added_at': {
                    'type': ['null', 'string'],
                    'format': 'date-time'
                },
                'price': {
                    'type': ['null', 'number']
                },
                'sold': {
                    'type': ['null', 'string']
                },
                'sold_at': {
                    'type': ['null', 'string']
                }
            }, schema)
Example No. 8
def get_sampled_schema_for_table(config, table_spec):
    LOGGER.info('Sampling records to determine table schema.')

    s3_files_gen = get_input_files_for_table(config, table_spec)

    samples = [
        sample for sample in sample_files(config, table_spec, s3_files_gen)
    ]

    if skipped_files_count:
        LOGGER.warning("%s files got skipped during the last sampling.",
                       skipped_files_count)

    if not samples:
        # Return empty properties to accept everything from the data if no samples are found
        return {'type': 'object', 'properties': {}}

    metadata_schema = {
        SDC_SOURCE_BUCKET_COLUMN: {
            'type': 'string'
        },
        SDC_SOURCE_FILE_COLUMN: {
            'type': 'string'
        },
        SDC_SOURCE_LINENO_COLUMN: {
            'type': 'integer'
        },
        SDC_EXTRA_COLUMN: {
            'type': 'array',
            'items': {
                'anyOf': [{
                    'type': 'object',
                    'properties': {}
                }, {
                    'type': 'string'
                }]
            }
        }
    }

    data_schema = conversion.generate_schema(samples, table_spec)

    return {
        'type': 'object',
        'properties': merge_dicts(data_schema, metadata_schema)
    }
Example No. 9
def get_sampled_schema_for_table(config: Dict, table_spec: Dict) -> Dict:
    """
    Detects the JSON schema using a sample of table/stream data.
    :param config: Tap config
    :param table_spec: table specs
    :return: detected schema
    """
    LOGGER.info('Sampling records to determine table schema.')

    s3_files_gen = get_input_files_for_table(config, table_spec)
    s3_files_list = [obj for obj in s3_files_gen]

    sorted_s3_files = sorted(s3_files_list,
                             reverse=True,
                             key=lambda k: k['last_modified'])

    samples = list(sample_files(config, table_spec, sorted_s3_files))

    if not samples:
        return {}

    metadata_schema = {
        SDC_SOURCE_BUCKET_COLUMN: {
            'type': 'string'
        },
        SDC_SOURCE_FILE_COLUMN: {
            'type': 'string'
        },
        SDC_SOURCE_LINENO_COLUMN: {
            'type': 'integer'
        },
        SDC_EXTRA_COLUMN: {
            'type': 'array',
            'items': {
                'type': 'string'
            }
        },
    }

    data_schema = conversion.generate_schema(samples, table_spec)

    return {
        'type': 'object',
        'properties': merge_dicts(data_schema, metadata_schema)
    }
Example No. 10
def get_sampled_schema_for_table(config, table_spec):
    LOGGER.info('Sampling records to determine table schema.')

    s3_files_gen = get_input_files_for_table(config, table_spec)

    samples = [
        sample for sample in sample_files(config, table_spec, s3_files_gen)
    ]

    if not samples:
        return {}

    metadata_schema = {
        SDC_SOURCE_BUCKET_COLUMN: {
            'type': 'string'
        },
        SDC_SOURCE_FILE_COLUMN: {
            'type': 'string'
        },
        SDC_SOURCE_LINENO_COLUMN: {
            'type': 'integer'
        },
        csv_singer.SDC_EXTRA_COLUMN: {
            'type': 'array',
            'items': {
                'type': 'string'
            }
        },
        LAST_MODIFIED: {
            'type': 'string',
            'format': 'date-time'
        }
    }

    data_schema = conversion.generate_schema(samples, table_spec)

    return {
        'type': 'object',
        'properties': merge_dicts(data_schema, metadata_schema)
    }
Example No. 11
def get_sampled_schema_for_table(config, table_spec):
    LOGGER.info("Sampling records to determine table schema.")

    s3_files_gen = get_input_files_for_table(config, table_spec)

    samples = [
        sample for sample in sample_files(config, table_spec, s3_files_gen)
    ]

    if not samples:
        return {}

    metadata_schema = {
        SDC_SOURCE_BUCKET_COLUMN: {
            "type": "string"
        },
        SDC_SOURCE_FILE_COLUMN: {
            "type": "string"
        },
        SDC_SOURCE_LINENO_COLUMN: {
            "type": "integer"
        },
        csv.SDC_EXTRA_COLUMN: {
            "type": "array",
            "items": {
                "type": "string"
            }
        },
    }

    data_schema = conversion.generate_schema(samples, table_spec)

    return {
        "type": "object",
        "properties": merge_dicts(data_schema, metadata_schema)
    }
Example No. 12
 def test_generate_schema(self, mock_count_sample):
     samples = [{
         'name': 'test',
         'id': 3,
         'marks': [45.85, 25.38],
         'students': {
             'no': 5,
             'col': 6
         },
         'created_at': '20-05-2021',
         'tota': []
     }]
     table_spec = {
         'search_prefix': '',
         'search_pattern': 'test\\/.*\\.jsonl',
         'table_name': 'jsonl_table',
         'key_properties': ['id'],
         'date_overrides': ['created_at'],
         'delimiter': ','
     }
     res = conversion.generate_schema(samples, table_spec)
     expected_result = {
         'name': {
             'type': ['null', 'string']
         },
         'id': {
             'type': ['null', 'integer', 'string']
         },
         'marks': {
             'anyOf': [{
                 'type': 'array',
                 'items': {
                     'type': ['null', 'number', 'string']
                 }
             }, {
                 'type': ['null', 'string']
             }]
         },
         'students': {
             'anyOf': [{
                 'type': 'object',
                 'properties': {}
             }, {
                 'type': ['null', 'string']
             }]
         },
         'created_at': {
             'anyOf': [{
                 'type': ['null', 'string'],
                 'format': 'date-time'
             }, {
                 'type': ['null', 'string']
             }]
         },
         'tota': {
             'anyOf': [{
                 'type': 'array',
                 'items': ['null', 'string']
             }, {
                 'type': ['null', 'string']
             }]
         }
     }
     self.assertEqual(res, expected_result)
Example No. 13
 def test_generate_schema(self):
     samples = self.load_file("sample.json", "data_test")
     table_input = self.load_file("table_spec_without_key.json", "data_test")
     output = self.load_file("data_schema.json", "data_test")
     result = generate_schema([samples], table_input)
     self.assertEqual(simplejson.dumps(output), simplejson.dumps(result))
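
The load_file helper used in this last test is not shown. A hypothetical sketch, assuming it lives on the test case class and resolves fixtures relative to the test module, could be:

import os
import unittest

import simplejson


class SchemaTestCase(unittest.TestCase):  # hypothetical name for the class hosting the helper
    def load_file(self, file_name, folder):
        # Read a JSON fixture from `folder`, resolved relative to this test module.
        path = os.path.join(os.path.dirname(__file__), folder, file_name)
        with open(path, encoding='utf-8') as fixture:
            return simplejson.load(fixture)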