Example #1
    def test_smart_columns(self):
        with patch('sys.stdout', new_callable=StringIO) as fake_out:
            records_streamed = 0
            table_spec = TEST_TABLE_SPEC['tables'][7]
            modified_since = dateutil.parser.parse(table_spec['start_date'])
            target_files = file_utils.get_matching_objects(
                table_spec, modified_since)
            samples = file_utils.sample_files(table_spec,
                                              target_files,
                                              sample_rate=1)
            schema = generate_schema(table_spec, samples)
            for t_file in target_files:
                records_streamed += file_utils.write_file(
                    t_file['key'], table_spec, schema.to_dict())

            raw_records = fake_out.getvalue().split('\n')
            records = [json.loads(raw) for raw in raw_records if raw]
            self.assertEqual(
                records_streamed, len(records),
                "Number records written to the pipe differed from records read from the pipe."
            )
            # Each emitted line should be a Singer RECORD message with three top-level keys
            self.assertEqual(records[0]['type'], "RECORD")
            self.assertEqual(len(records[0]), 3)
            # The record payload carries seven fields, including the two injected "smart" columns
            self.assertEqual(len(records[0]['record']), 7)
            self.assertIn("_smart_source_bucket", records[0]['record'])
            self.assertIn("_smart_source_lineno", records[0]['record'])
Example #2
def sync(config, state, catalog):
    # Loop over selected streams in catalog
    for stream in catalog.get_selected_streams(state):
        LOGGER.info("Syncing stream:" + stream.tap_stream_id)
        catalog_schema = stream.schema.to_dict()
        table_spec = next(
            (x for x in config['tables'] if x['name'] == stream.tap_stream_id),
            None)
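        # NOTE: assumes every selected stream has a matching entry in
        # config['tables']; table_spec will be None otherwise.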
        # Allow updates to our tables specification to override any previously extracted schema in the catalog
        merged_schema = override_schema_with_config(catalog_schema, table_spec)
        singer.write_schema(
            stream_name=stream.tap_stream_id,
            schema=merged_schema,
            key_properties=stream.key_properties,
        )
        modified_since = dateutil.parser.parse(
            state.get(stream.tap_stream_id, {}).get('modified_since')
            or table_spec['start_date'])
        target_files = file_utils.get_input_files_for_table(
            table_spec, modified_since)
        records_streamed = 0
        for t_file in target_files:
            records_streamed += file_utils.write_file(t_file['key'],
                                                      table_spec,
                                                      merged_schema)
            state[stream.tap_stream_id] = {
                'modified_since': t_file['last_modified'].isoformat()
            }
            singer.write_state(state)

        LOGGER.info(
            f'Wrote {records_streamed} records for table "{stream.tap_stream_id}".'
        )
    return
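
The state emitted after each file is keyed by tap_stream_id and records that file's last_modified timestamp, which the next run feeds back in as modified_since. A hypothetical state message (placeholder stream name and timestamp) would look like:

{
    "example_stream": {
        "modified_since": "2021-06-01T12:34:56+00:00"
    }
}
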
Example #3
def sync(config, state, catalog):
    # Loop over selected streams in catalog
    LOGGER.info(f"Processing {len(list(catalog.get_selected_streams(state)))} selected streams from Catalog")
    for stream in catalog.get_selected_streams(state):
        LOGGER.info("Syncing stream:" + stream.tap_stream_id)
        catalog_schema = stream.schema.to_dict()
        table_spec = next((x for x in config['tables'] if x['name'] == stream.tap_stream_id), None)
        if table_spec is not None:
            # Allow updates to our tables specification to override any previously extracted schema in the catalog
            merged_schema = override_schema_with_config(catalog_schema, table_spec)
            singer.write_schema(
                stream_name=stream.tap_stream_id,
                schema=merged_schema,
                key_properties=stream.key_properties,
            )
            modified_since = dateutil.parser.parse(
                state.get(stream.tap_stream_id, {}).get('modified_since') or table_spec['start_date'])
            target_files = file_utils.get_matching_objects(table_spec, modified_since)
            max_records_per_run = table_spec.get('max_records_per_run', -1)
            records_streamed = 0
            for t_file in target_files:
                records_streamed += file_utils.write_file(
                    t_file['key'], table_spec, merged_schema,
                    max_records=max_records_per_run - records_streamed)
                if 0 < max_records_per_run <= records_streamed:
                    LOGGER.info(f'Processed the per-run limit of {records_streamed} records for stream "{stream.tap_stream_id}". Stopping sync for this stream.')
                    break
                state[stream.tap_stream_id] = {'modified_since': t_file['last_modified'].isoformat()}
                singer.write_state(state)

            LOGGER.info(f'Wrote {records_streamed} records for stream "{stream.tap_stream_id}".')
        else:
            LOGGER.warning(f'Skipping processing for stream [{stream.tap_stream_id}] without a config block.')
    return
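
All three examples read their table definitions from config['tables']. A minimal, hypothetical config entry, assuming only the keys actually referenced above, could look like:

config = {
    "tables": [
        {
            "name": "example_stream",
            "start_date": "2020-01-01T00:00:00Z",
            "max_records_per_run": 50000
        }
    ]
}

Here "name" must match the stream's tap_stream_id, "start_date" is the fallback bookmark used when no state exists for the stream, and "max_records_per_run" is the optional per-run cap honoured only by Example #3.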