Example #1
0
def discover(config):
    """Build a Catalog holding one entry per table in ``config['tables']``.

    For each table spec the function:

    1. parses ``start_date`` and asks
       ``file_utils.get_input_files_for_table`` for the input files,
    2. samples those files with ``file_utils.sample_files`` (defaults:
       ``sample_rate`` 10, ``max_sampling_read`` 1000,
       ``max_sampled_files`` 5),
    3. generates a schema from the samples and merges in the
       ``_smart_source_*`` columns,
    4. passes the result through ``override_schema_with_config``.

    Args:
        config: Mapping with a ``'tables'`` iterable of table specs. Each
            spec must provide ``'start_date'`` and ``'name'``; the other
            keys read here are optional.

    Returns:
        A ``Catalog`` wrapping the list of ``CatalogEntry`` objects, in the
        same order as ``config['tables']``.
    """
    entries = []
    for spec in config['tables']:
        since = dateutil.parser.parse(spec['start_date'])
        files = file_utils.get_input_files_for_table(spec, since)

        # Sampling knobs, keyed by the keyword names sample_files() expects.
        sampling_options = {
            'sample_rate': spec.get('sample_rate', 10),
            'max_records': spec.get('max_sampling_read', 1000),
            'max_files': spec.get('max_sampled_files', 5),
        }
        number_over_integer = spec.get('prefer_number_vs_integer', False)
        rows = file_utils.sample_files(spec, files, **sampling_options)

        # Schema generated from the samples, merged with the fixed source
        # columns.
        properties = merge_dicts(
            conversion.generate_schema(
                rows, prefer_number_vs_integer=number_over_integer),
            {
                '_smart_source_bucket': {'type': 'string'},
                '_smart_source_file': {'type': 'string'},
                '_smart_source_lineno': {'type': 'integer'},
            },
        )
        overridden = override_schema_with_config(
            {'type': 'object', 'properties': properties}, spec)

        entry = CatalogEntry(
            tap_stream_id=spec['name'],
            stream=spec['name'],
            schema=Schema.from_dict(overridden),
            key_properties=spec.get('key_properties', []),
            metadata=[],
            replication_key=None,
            is_view=None,
            database=None,
            table=None,
            row_count=None,
            stream_alias=None,
            replication_method=None,
        )
        entries.append(entry)

    return Catalog(entries)
Example #2
0
def generate_schema(table_spec, samples):
    """Return the ``Schema`` for a table, generated from ``samples``.

    The properties produced by ``conversion.generate_schema`` are merged
    with the fixed ``_smart_source_*`` columns, wrapped in an object schema,
    and passed through ``override_schema_with_config`` before being
    converted with ``Schema.from_dict``.

    Args:
        table_spec: Table configuration mapping; ``prefer_number_vs_integer``
            is read from it (default ``False``) and the whole spec is handed
            to ``override_schema_with_config``.
        samples: Sampled records forwarded to ``conversion.generate_schema``.

    Returns:
        The result of ``Schema.from_dict`` on the overridden schema.
    """
    number_over_integer = table_spec.get('prefer_number_vs_integer', False)
    sampled_properties = conversion.generate_schema(
        samples, prefer_number_vs_integer=number_over_integer)

    # Columns describing where each record came from.
    source_columns = {
        '_smart_source_bucket': {'type': 'string'},
        '_smart_source_file': {'type': 'string'},
        '_smart_source_lineno': {'type': 'integer'},
    }
    object_schema = {
        'type': 'object',
        'properties': merge_dicts(sampled_properties, source_columns),
    }

    return Schema.from_dict(
        override_schema_with_config(object_schema, table_spec))
def discover(config):
    """Build a Catalog with one entry per discoverable table.

    Each table spec in ``config['tables']`` is processed independently: its
    matching objects are sampled, a schema is generated from the samples,
    the ``_smart_source_*`` columns are merged in, config overrides are
    applied, and properties with an empty name are dropped. If anything
    raises while processing a table, the error is logged and that table is
    skipped, so the remaining tables are still processed.

    Args:
        config: Mapping with a ``'tables'`` iterable of table specs. Each
            spec needs ``'start_date'`` and ``'name'``; ``sample_rate``
            (default 5), ``max_sampling_read`` (default 1000),
            ``max_sampled_files`` (default 50), ``prefer_number_vs_integer``
            (default ``False``) and ``key_properties`` (default ``[]``) are
            optional.

    Returns:
        A ``Catalog`` wrapping the entries of the tables that were
        processed without error.
    """
    streams = []
    for table_spec in config['tables']:
        try:
            modified_since = dateutil.parser.parse(table_spec['start_date'])
            target_files = file_utils.get_matching_objects(
                table_spec, modified_since)
            sample_rate = table_spec.get('sample_rate', 5)
            max_sampling_read = table_spec.get('max_sampling_read', 1000)
            max_sampled_files = table_spec.get('max_sampled_files', 50)
            prefer_number_vs_integer = table_spec.get(
                'prefer_number_vs_integer', False)
            samples = file_utils.sample_files(table_spec,
                                              target_files,
                                              sample_rate=sample_rate,
                                              max_records=max_sampling_read,
                                              max_files=max_sampled_files)

            # Columns describing where each record came from.
            metadata_schema = {
                '_smart_source_bucket': {
                    'type': 'string'
                },
                '_smart_source_file': {
                    'type': 'string'
                },
                '_smart_source_lineno': {
                    'type': 'integer'
                },
            }
            data_schema = conversion.generate_schema(
                samples, prefer_number_vs_integer=prefer_number_vs_integer)
            inferred_schema = {
                'type': 'object',
                'properties': merge_dicts(data_schema, metadata_schema)
            }

            merged_schema = override_schema_with_config(
                inferred_schema, table_spec)
            # Ignore empty columns
            merged_schema['properties'] = {
                key: value
                for key, value in merged_schema['properties'].items()
                if key != ''
            }

            schema = Schema.from_dict(merged_schema)

            stream_metadata = []
            key_properties = table_spec.get('key_properties', [])
            streams.append(
                CatalogEntry(
                    tap_stream_id=table_spec['name'],
                    stream=table_spec['name'],
                    schema=schema,
                    key_properties=key_properties,
                    metadata=stream_metadata,
                    replication_key=None,
                    is_view=None,
                    database=None,
                    table=None,
                    row_count=None,
                    stream_alias=None,
                    replication_method=None,
                ))
        except Exception as err:
            # Deliberately broad: a failing table is skipped, not fatal.
            # Look the name up defensively. A spec without 'name' is itself
            # one of the errors that lands here, and indexing it again would
            # raise out of the handler and abort discovery for every table.
            table_name = table_spec.get('name', '<unnamed>')
            LOGGER.error(
                f"Unable to write Catalog entry for '{table_name}' - it will be skipped due to error {err}"
            )

    return Catalog(streams)
    def test_generate_schema(self):
        """Check the column types ``generate_schema`` reports for samples.

        Every sample value is a string; the expected schema marks each
        column as nullable with the type listed in the case table below.
        """

        def nullable(type_name):
            # Expected property shape: the column may be null or `type_name`.
            return {'type': ['null', type_name]}

        cases = [
            # Digits mixed with text -> string.
            (
                [
                    {'id': '1', 'first_name': 'Connor'},
                    {'id': '2', 'first_name': '1'},
                ],
                {
                    'id': nullable('integer'),
                    'first_name': nullable('string'),
                },
            ),
            # Whole numbers mixed with decimals -> number.
            (
                [
                    {'id': '1', 'cost': '1'},
                    {'id': '2', 'cost': '1.25'},
                ],
                {
                    'id': nullable('integer'),
                    'cost': nullable('number'),
                },
            ),
            # Signed values ('-3', '+4') are still integers.
            (
                [
                    {'id': '1', 'cost': '1'},
                    {'id': '2', 'cost': '1'},
                    {'id': '-3', 'cost': '25'},
                    {'id': '+4', 'cost': '3.25'},
                ],
                {
                    'id': nullable('integer'),
                    'cost': nullable('number'),
                },
            ),
            # Date-looking values are reported as plain strings.
            (
                [
                    {'id': '1', 'date': '2017-01-01'},
                    {'id': '2', 'date': '2017-01-02'},
                ],
                {
                    'id': nullable('integer'),
                    'date': nullable('string'),
                },
            ),
        ]

        for sample_rows, expected_schema in cases:
            self.assertEqual(generate_schema(sample_rows), expected_schema)