Example #1
    def test_smart_columns(self):
        with patch('sys.stdout', new_callable=StringIO) as fake_out:
            records_streamed = 0
            table_spec = TEST_TABLE_SPEC['tables'][7]
            modified_since = dateutil.parser.parse(table_spec['start_date'])
            target_files = file_utils.get_matching_objects(
                table_spec, modified_since)
            samples = file_utils.sample_files(table_spec,
                                              target_files,
                                              sample_rate=1)
            schema = generate_schema(table_spec, samples)
            for t_file in target_files:
                records_streamed += file_utils.write_file(
                    t_file['key'], table_spec, schema.to_dict())

            raw_records = fake_out.getvalue().split('\n')
            records = [json.loads(raw) for raw in raw_records if raw]
            self.assertEqual(
                records_streamed, len(records),
                "Number records written to the pipe differed from records read from the pipe."
            )
            self.assertEqual(records[0]['type'], "RECORD")
            self.assertEqual(len(records[0]), 3)
            self.assertEqual(len(records[0]['record']), 7)
            self.assertIn("_smart_source_bucket", records[0]['record'])
            self.assertIn("_smart_source_lineno", records[0]['record'])
Example #2
def discover(config):
    streams = []
    for table_spec in config['tables']:
        try:
            modified_since = dateutil.parser.parse(table_spec['start_date'])
            target_files = file_utils.get_matching_objects(table_spec, modified_since)
            sample_rate = table_spec.get('sample_rate', 5)
            max_sampling_read = table_spec.get('max_sampling_read', 1000)
            max_sampled_files = table_spec.get('max_sampled_files', 50)
            samples = file_utils.sample_files(table_spec,
                                              target_files,
                                              sample_rate=sample_rate,
                                              max_records=max_sampling_read,
                                              max_files=max_sampled_files)
            schema = generate_schema(table_spec, samples)
            stream_metadata = []
            key_properties = table_spec.get('key_properties', [])
            streams.append(
                CatalogEntry(
                    tap_stream_id=table_spec['name'],
                    stream=table_spec['name'],
                    schema=schema,
                    key_properties=key_properties,
                    metadata=stream_metadata,
                    replication_key=None,
                    is_view=None,
                    database=None,
                    table=None,
                    row_count=None,
                    stream_alias=None,
                    replication_method=None,
                )
            )
        except Exception as err:
            LOGGER.error(f"Unable to write Catalog entry for '{table_spec['name']}' - it will be skipped due to error {err}")

    return Catalog(streams)
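A minimal sketch of driving this discover function follows. Only 'name', 'start_date', 'key_properties', and the optional sampling settings are read directly in the loop above; the 'path', 'pattern', and 'format' keys are assumptions about what the file_utils helpers expect, so treat them as placeholders rather than the tap's documented config format.

import json

# Hypothetical config: only 'name', 'start_date', 'key_properties', and the
# sampling options are referenced directly in discover(); the other keys are
# assumed inputs for file_utils.
config = {
    "tables": [
        {
            "name": "orders",
            "path": "s3://example-bucket/exports",
            "pattern": "orders-.*\\.csv",
            "format": "csv",
            "start_date": "2021-01-01T00:00:00Z",
            "key_properties": ["order_id"],
        }
    ]
}

catalog = discover(config)
# singer's Catalog object exposes to_dict(), so the discovered streams can be
# printed in the usual --discover output format.
print(json.dumps(catalog.to_dict(), indent=2))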
Example #3
def discover(config):
    streams = []
    for table_spec in config['tables']:
        modified_since = dateutil.parser.parse(table_spec['start_date'])
        target_files = file_utils.get_input_files_for_table(
            table_spec, modified_since)
        sample_rate = table_spec.get('sample_rate', 10)
        max_sampling_read = table_spec.get('max_sampling_read', 1000)
        max_sampled_files = table_spec.get('max_sampled_files', 5)
        prefer_number_vs_integer = table_spec.get('prefer_number_vs_integer',
                                                  False)
        samples = file_utils.sample_files(table_spec,
                                          target_files,
                                          sample_rate=sample_rate,
                                          max_records=max_sampling_read,
                                          max_files=max_sampled_files)

        metadata_schema = {
            '_smart_source_bucket': {
                'type': 'string'
            },
            '_smart_source_file': {
                'type': 'string'
            },
            '_smart_source_lineno': {
                'type': 'integer'
            },
        }
        data_schema = conversion.generate_schema(
            samples, prefer_number_vs_integer=prefer_number_vs_integer)
        inferred_schema = {
            'type': 'object',
            'properties': merge_dicts(data_schema, metadata_schema)
        }

        merged_schema = override_schema_with_config(inferred_schema,
                                                    table_spec)
        schema = Schema.from_dict(merged_schema)

        stream_metadata = []
        key_properties = table_spec.get('key_properties', [])
        streams.append(
            CatalogEntry(
                tap_stream_id=table_spec['name'],
                stream=table_spec['name'],
                schema=schema,
                key_properties=key_properties,
                metadata=stream_metadata,
                replication_key=None,
                is_view=None,
                database=None,
                table=None,
                row_count=None,
                stream_alias=None,
                replication_method=None,
            ))
    return Catalog(streams)
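The discover variants in this example rely on helpers that are not shown: merge_dicts combines the inferred data schema with the metadata columns, and override_schema_with_config lets per-table configuration take precedence over schema inference. The sketches below only illustrate that implied behavior, assuming a shallow merge and a hypothetical 'schema_overrides' key; the tap's real helpers may differ. A second variant of discover, which wraps each table in a try/except and drops empty column names, follows after the sketches.

def merge_dicts(first, second):
    # Sketch of the assumed behavior: shallow merge with keys from `second`
    # winning on conflict. The tap's real helper may merge nested dicts.
    merged = dict(first)
    merged.update(second)
    return merged


def override_schema_with_config(inferred_schema, table_spec):
    # Assumed behavior: property definitions declared in the table spec replace
    # the inferred ones. The 'schema_overrides' key name is a guess.
    overrides = table_spec.get('schema_overrides', {})
    merged = dict(inferred_schema)
    merged['properties'] = {**inferred_schema.get('properties', {}), **overrides}
    return merged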
def discover(config):
    streams = []
    for table_spec in config['tables']:
        try:
            modified_since = dateutil.parser.parse(table_spec['start_date'])
            target_files = file_utils.get_matching_objects(
                table_spec, modified_since)
            sample_rate = table_spec.get('sample_rate', 5)
            max_sampling_read = table_spec.get('max_sampling_read', 1000)
            max_sampled_files = table_spec.get('max_sampled_files', 50)
            prefer_number_vs_integer = table_spec.get(
                'prefer_number_vs_integer', False)
            samples = file_utils.sample_files(table_spec,
                                              target_files,
                                              sample_rate=sample_rate,
                                              max_records=max_sampling_read,
                                              max_files=max_sampled_files)

            metadata_schema = {
                '_smart_source_bucket': {
                    'type': 'string'
                },
                '_smart_source_file': {
                    'type': 'string'
                },
                '_smart_source_lineno': {
                    'type': 'integer'
                },
            }
            data_schema = conversion.generate_schema(
                samples, prefer_number_vs_integer=prefer_number_vs_integer)
            inferred_schema = {
                'type': 'object',
                'properties': merge_dicts(data_schema, metadata_schema)
            }

            merged_schema = override_schema_with_config(
                inferred_schema, table_spec)
            # Ignore empty columns
            merged_schema['properties'] = {
                key: value
                for key, value in merged_schema['properties'].items()
                if key != ''
            }

            schema = Schema.from_dict(merged_schema)

            stream_metadata = []
            key_properties = table_spec.get('key_properties', [])
            streams.append(
                CatalogEntry(
                    tap_stream_id=table_spec['name'],
                    stream=table_spec['name'],
                    schema=schema,
                    key_properties=key_properties,
                    metadata=stream_metadata,
                    replication_key=None,
                    is_view=None,
                    database=None,
                    table=None,
                    row_count=None,
                    stream_alias=None,
                    replication_method=None,
                ))
        except Exception as err:
            LOGGER.error(
                f"Unable to write Catalog entry for '{table_spec['name']}' - it will be skipped due to error {err}"
            )

    return Catalog(streams)
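For completeness, here is a rough sketch of how a Singer tap typically wires a discover function like the ones above into its entry point. parse_args and Catalog.dump come from the singer-python library; the required config key name is inferred from the config['tables'] accesses above, and the sync branch is intentionally left out.

from singer import utils

REQUIRED_CONFIG_KEYS = ["tables"]  # inferred from config['tables'] above


def main():
    args = utils.parse_args(REQUIRED_CONFIG_KEYS)
    if args.discover:
        # Discovery mode: emit the generated catalog to stdout as JSON.
        catalog = discover(args.config)
        catalog.dump()
    else:
        # Sync mode would go here; omitted in this sketch.
        pass


if __name__ == "__main__":
    main()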