Code Example #1
    def test_smart_columns(self):
        with patch('sys.stdout', new_callable=StringIO) as fake_out:
            records_streamed = 0
            table_spec = TEST_TABLE_SPEC['tables'][7]
            modified_since = dateutil.parser.parse(table_spec['start_date'])
            target_files = file_utils.get_matching_objects(
                table_spec, modified_since)
            samples = file_utils.sample_files(table_spec,
                                              target_files,
                                              sample_rate=1)
            schema = generate_schema(table_spec, samples)
            for t_file in target_files:
                records_streamed += file_utils.write_file(
                    t_file['key'], table_spec, schema.to_dict())

            raw_records = fake_out.getvalue().split('\n')
            records = [json.loads(raw) for raw in raw_records if raw]
            self.assertEqual(
                records_streamed, len(records),
                "Number records written to the pipe differed from records read from the pipe."
            )
            self.assertEqual(records[0]['type'], "RECORD")
            self.assertEqual(len(records[0]), 3)
            self.assertEqual(len(records[0]['record']), 7)
            self.assertIn("_smart_source_bucket", records[0]['record'])
            self.assertIn("_smart_source_lineno", records[0]['record'])
Code Example #2
def sync(config, state, catalog):
    # Loop over selected streams in catalog
    LOGGER.info(f"Processing {len(list(catalog.get_selected_streams(state)))} selected streams from Catalog")
    for stream in catalog.get_selected_streams(state):
        LOGGER.info("Syncing stream:" + stream.tap_stream_id)
        catalog_schema = stream.schema.to_dict()
        table_spec = next((x for x in config['tables'] if x['name'] == stream.tap_stream_id), None)
        if table_spec is not None:
            # Allow updates to our tables specification to override any previously extracted schema in the catalog
            merged_schema = override_schema_with_config(catalog_schema, table_spec)
            singer.write_schema(
                stream_name=stream.tap_stream_id,
                schema=merged_schema,
                key_properties=stream.key_properties,
            )
            modified_since = dateutil.parser.parse(
                state.get(stream.tap_stream_id, {}).get('modified_since') or table_spec['start_date'])
            target_files = file_utils.get_matching_objects(table_spec, modified_since)
            max_records_per_run = table_spec.get('max_records_per_run', -1)
            records_streamed = 0
            for t_file in target_files:
                records_streamed += file_utils.write_file(
                    t_file['key'], table_spec, merged_schema,
                    max_records=max_records_per_run - records_streamed)
                if 0 < max_records_per_run <= records_streamed:
                    LOGGER.info(f'Processed the per-run limit of {records_streamed} records for stream "{stream.tap_stream_id}". Stopping sync for this stream.')
                    break
                state[stream.tap_stream_id] = {'modified_since': t_file['last_modified'].isoformat()}
                singer.write_state(state)

            LOGGER.info(f'Wrote {records_streamed} records for stream "{stream.tap_stream_id}".')
        else:
            LOGGER.warning(f'Skipping processing for stream [{stream.tap_stream_id}] without a config block.')
    return
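
Both sync() above and discover() in the following examples are driven by the 'tables' list in the tap configuration. A minimal sketch of one entry is shown below, limited to the keys these examples actually read; every value is illustrative and not taken from the test fixtures.

# Illustrative table specification; all values here are assumptions.
config = {
    "tables": [
        {
            "name": "example_stream",               # becomes the tap_stream_id
            "path": "https://example.com/exports",  # base URI joined with a key or pattern to form target_uri
            "pattern": ".*\\.csv",                  # file-matching pattern
            "start_date": "2017-05-01T00:00:00Z",   # fallback when state has no 'modified_since'
            "key_properties": ["id"],               # optional; defaults to []
            "max_records_per_run": -1,              # optional; -1 disables the per-run cap in sync()
            "sample_rate": 5,                       # optional sampling knobs used by discover()
            "max_sampling_read": 1000,
            "max_sampled_files": 50,
            "prefer_number_vs_integer": False
        }
    ]
}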
Code Example #3
def discover(config):
    streams = []
    for table_spec in config['tables']:
        try:
            modified_since = dateutil.parser.parse(table_spec['start_date'])
            target_files = file_utils.get_matching_objects(table_spec, modified_since)
            sample_rate = table_spec.get('sample_rate', 5)
            max_sampling_read = table_spec.get('max_sampling_read', 1000)
            max_sampled_files = table_spec.get('max_sampled_files', 50)
            samples = file_utils.sample_files(table_spec, target_files,
                                              sample_rate=sample_rate,
                                              max_records=max_sampling_read,
                                              max_files=max_sampled_files)
            schema = generate_schema(table_spec, samples)
            stream_metadata = []
            key_properties = table_spec.get('key_properties', [])
            streams.append(
                CatalogEntry(
                    tap_stream_id=table_spec['name'],
                    stream=table_spec['name'],
                    schema=schema,
                    key_properties=key_properties,
                    metadata=stream_metadata,
                    replication_key=None,
                    is_view=None,
                    database=None,
                    table=None,
                    row_count=None,
                    stream_alias=None,
                    replication_method=None,
                )
            )
        except Exception as err:
            LOGGER.error(f"Unable to write Catalog entry for '{table_spec['name']}' - it will be skipped due to error {err}")

    return Catalog(streams)
Code Example #4
    def test_https_bucket(self):
        table_spec = TEST_TABLE_SPEC['tables'][4]
        modified_since = dateutil.parser.parse(table_spec['start_date'])
        target_files = file_utils.get_matching_objects(table_spec,
                                                       modified_since)
        assert len(target_files) == 1

        target_uri = table_spec['path'] + '/' + target_files[0]["key"]
        iterator = get_row_iterator(TEST_TABLE_SPEC['tables'][4], target_uri)

        row = next(iterator)
        self.assertTrue(int(row['id']) > 0, row['id'] + " was not positive")
Code Example #5
    def test_renamed_https_object(self):
        table_spec = TEST_TABLE_SPEC['tables'][6]
        modified_since = dateutil.parser.parse(table_spec['start_date'])
        target_files = file_utils.get_matching_objects(table_spec,
                                                       modified_since)
        assert len(target_files) == 1

        target_uri = table_spec['path'] + '/' + table_spec['pattern']
        iterator = get_row_iterator(TEST_TABLE_SPEC['tables'][6], target_uri)

        row = next(iterator)
        self.assertTrue(len(row) > 1, "Not able to read a row.")
Code Example #6
    def test_indirect_https_bucket(self):
        table_spec = TEST_TABLE_SPEC['tables'][5]
        modified_since = dateutil.parser.parse(table_spec['start_date'])
        target_files = file_utils.get_matching_objects(table_spec,
                                                       modified_since)
        assert len(target_files) == 1

        target_uri = table_spec['path'] + '/' + table_spec['pattern']
        iterator = get_row_iterator(TEST_TABLE_SPEC['tables'][5], target_uri)

        row = next(iterator)
        self.assertEqual(row['1976'], '1976',
                         "Row did not contain expected data")
Code Example #7
def discover(config):
    streams = []
    for table_spec in config['tables']:
        try:
            modified_since = dateutil.parser.parse(table_spec['start_date'])
            target_files = file_utils.get_matching_objects(
                table_spec, modified_since)
            sample_rate = table_spec.get('sample_rate', 5)
            max_sampling_read = table_spec.get('max_sampling_read', 1000)
            max_sampled_files = table_spec.get('max_sampled_files', 50)
            prefer_number_vs_integer = table_spec.get(
                'prefer_number_vs_integer', False)
            samples = file_utils.sample_files(table_spec,
                                              target_files,
                                              sample_rate=sample_rate,
                                              max_records=max_sampling_read,
                                              max_files=max_sampled_files)

            metadata_schema = {
                '_smart_source_bucket': {
                    'type': 'string'
                },
                '_smart_source_file': {
                    'type': 'string'
                },
                '_smart_source_lineno': {
                    'type': 'integer'
                },
            }
            data_schema = conversion.generate_schema(
                samples, prefer_number_vs_integer=prefer_number_vs_integer)
            inferred_schema = {
                'type': 'object',
                'properties': merge_dicts(data_schema, metadata_schema)
            }

            merged_schema = override_schema_with_config(
                inferred_schema, table_spec)
            schema = Schema.from_dict(merged_schema)

            stream_metadata = []
            key_properties = table_spec.get('key_properties', [])
            streams.append(
                CatalogEntry(
                    tap_stream_id=table_spec['name'],
                    stream=table_spec['name'],
                    schema=schema,
                    key_properties=key_properties,
                    metadata=stream_metadata,
                    replication_key=None,
                    is_view=None,
                    database=None,
                    table=None,
                    row_count=None,
                    stream_alias=None,
                    replication_method=None,
                ))
        except Exception as err:
            LOGGER.error(
                f"Unable to write Catalog entry for '{table_spec['name']}' - it will be skipped due to error {err}"
            )

    return Catalog(streams)
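
For context, a Singer tap typically exposes discover() and sync() through a small command-line entry point. The sketch below is not part of the examples above; it assumes the standard singer-python helpers (utils.parse_args, handle_top_exception, Catalog.dump) and that only the 'tables' key is mandatory in the config.

import singer
from singer import utils

LOGGER = singer.get_logger()
REQUIRED_CONFIG_KEYS = ["tables"]  # assumption: only 'tables' is required


@utils.handle_top_exception(LOGGER)
def main():
    args = utils.parse_args(REQUIRED_CONFIG_KEYS)
    if args.discover:
        # Discovery mode: print the catalog built from sampled files
        discover(args.config).dump()
    else:
        # Sync mode: use the provided catalog, or discover one on the fly
        catalog = args.catalog or discover(args.config)
        sync(args.config, args.state, catalog)


if __name__ == "__main__":
    main()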
Code Example #8
    def test_local_bucket(self):
        table_spec = TEST_TABLE_SPEC['tables'][1]
        modified_since = dateutil.parser.parse(table_spec['start_date'])
        target_files = file_utils.get_matching_objects(table_spec,
                                                       modified_since)
        assert len(target_files) == 1