#!/usr/bin/env python

import singer
import singer.metrics as metrics
from singer import metadata
from singer import Transformer

logger = singer.get_logger().getChild('tap-bigcommerce')


def sync_stream(state, instance):
    stream = instance.stream

    with metrics.record_counter(stream.tap_stream_id) as counter:
        for (stream, record) in instance.sync(state):
            counter.increment()

            try:
                with Transformer() as transformer:
                    record = transformer.transform(
                        record,
                        stream.schema.to_dict(),
                        metadata.to_map(stream.metadata)
                    )

                singer.write_record(stream.tap_stream_id, record)

                if counter.value % 1000 == 0:
                    singer.write_state(state)

            except Exception as e:
                logger.error('Handled exception: {error}'.format(error=str(e)))
                continue
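# --- Illustrative sketch (everything below is made up, not part of tap-bigcommerce):
# the `instance` passed to sync_stream is expected to expose a `stream` attribute
# (a singer CatalogEntry) and a `sync(state)` generator yielding (stream, record) pairs.
class _FakeOrdersStream:

    def __init__(self, catalog_entry):
        self.stream = catalog_entry

    def sync(self, state):
        # A real implementation would page through the BigCommerce API here.
        yield self.stream, {'id': 1, 'status': 'shipped'}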
from collections import namedtuple

import pkg_resources
import singer

from target_stitch.exceptions import TargetStitchException
from decimal import getcontext

# NB> because the target may validate decimal values, this precision must be at least
# 1 greater than the maximum precision decimal output by the tap.
# tap-postgres, for instance, will allow up to 38 digits of precision
getcontext().prec = 40 * 2

DEFAULT_STITCH_URL = 'https://api.stitchdata.com/v2/import/batch'

MAX_BYTES_PER_FLUSH = 20 * 1024 * 1024
MAX_BYTES_PER_RECORD = MAX_BYTES_PER_FLUSH

# Cannot be higher than 1000000 due to sequence numbers exceeding max long value
MAX_RECORDS_PER_FLUSH = 1000000

LOGGER = singer.get_logger().getChild('target_stitch')

# We use this to store schema and key properties from SCHEMA messages
StreamMeta = namedtuple('StreamMeta', ['schema', 'key_properties', 'bookmark_properties'])


def collect():
    '''Send usage info to Stitch.'''
    try:
        version = pkg_resources.get_distribution('target-stitch').version
        params = {
            'e': 'se',
            'aid': 'singer',
            'se_ca': 'target-stitch',
            'se_ac': 'open',
            'se_la': version,
        }
import singer
from pymysqlreplication import BinLogStreamReader
from pymysqlreplication.constants import FIELD_TYPE
from pymysqlreplication.event import RotateEvent
from pymysqlreplication.row_event import (
    DeleteRowsEvent,
    UpdateRowsEvent,
    WriteRowsEvent,
)
from singer import utils, Schema

import tap_mysql.sync_strategies.common as common

from tap_mysql.stream_utils import write_schema_message
from tap_mysql.discover_utils import discover_catalog, desired_columns
from tap_mysql.connection import connect_with_backoff, make_connection_wrapper

LOGGER = singer.get_logger('tap_mysql')

SDC_DELETED_AT = "_sdc_deleted_at"

UPDATE_BOOKMARK_PERIOD = 1000

BOOKMARK_KEYS = {'log_file', 'log_pos', 'version'}

MYSQL_TIMESTAMP_TYPES = {FIELD_TYPE.TIMESTAMP, FIELD_TYPE.TIMESTAMP2}


def add_automatic_properties(catalog_entry, columns):
    catalog_entry.schema.properties[SDC_DELETED_AT] = Schema(
        type=["null", "string"],
        format="date-time")

    columns.append(SDC_DELETED_AT)
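# --- Illustrative usage (not from tap-mysql itself): attaching the _sdc_deleted_at
# column to a minimal catalog entry. CatalogEntry and Schema come from singer-python.
if __name__ == '__main__':
    from singer.catalog import CatalogEntry

    entry = CatalogEntry(schema=Schema(type='object', properties={}))
    columns = ['id', 'name']
    add_automatic_properties(entry, columns)

    assert SDC_DELETED_AT in entry.schema.properties
    assert columns[-1] == SDC_DELETED_AT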
import unittest
import psycopg2
import tap_postgres
from tap_postgres.discovery_utils import BASE_RECURSIVE_SCHEMAS
import tap_postgres.db as post_db
from singer import get_logger, metadata
from psycopg2.extensions import quote_ident

try:
    from tests.utils import get_test_connection, ensure_test_table, get_test_connection_config
except ImportError:
    from utils import get_test_connection, ensure_test_table, get_test_connection_config

LOGGER = get_logger()


def do_not_dump_catalog(catalog):
    pass


tap_postgres.dump_catalog = do_not_dump_catalog


class TestStringTableWithPK(unittest.TestCase):
    maxDiff = None
    table_name = 'CHICKEN TIMES'

    def setUp(self):
        table_spec = {
            "columns": [{
                "name": "id",
import os
import itertools
import more_itertools
import re
import backoff
import boto3

from typing import Dict, Generator, Optional, Iterator
from botocore.exceptions import ClientError
from singer_encodings.csv import get_row_iterator, SDC_EXTRA_COLUMN  # pylint:disable=no-name-in-module
from singer import get_logger, utils
from tap_s3_csv import conversion

LOGGER = get_logger('tap_s3_csv')

SDC_SOURCE_BUCKET_COLUMN = "_sdc_source_bucket"
SDC_SOURCE_FILE_COLUMN = "_sdc_source_file"
SDC_SOURCE_LINENO_COLUMN = "_sdc_source_lineno"


def retry_pattern():
    """
    Retry decorator to retry failed functions
    :return:
    """
    return backoff.on_exception(backoff.expo,
                                ClientError,
                                max_tries=5,
                                on_backoff=log_backoff_attempt,
"""Postmark tap.""" # -*- coding: utf-8 -*- import logging from argparse import Namespace import pkg_resources from singer import get_logger, utils from singer.catalog import Catalog from tap_postmark.postmark import Postmark from tap_postmark.discover import discover from tap_postmark.sync import sync VERSION: str = pkg_resources.get_distribution('tap-postmark').version LOGGER: logging.RootLogger = get_logger() REQUIRED_CONFIG_KEYS: tuple = ( 'postmark_server_token', 'start_date', ) @utils.handle_top_exception(LOGGER) def main() -> None: """Run tap.""" # Parse command line arguments args: Namespace = utils.parse_args(REQUIRED_CONFIG_KEYS) LOGGER.info(f'>>> Running tap-postmark v{VERSION}') # If discover flag was passed, run discovery mode and dump output to stdout if args.discover:
import time

import singer
import singer.utils as singer_utils
from singer import Transformer, metadata, metrics
from requests.exceptions import RequestException
from tap_salesforce.salesforce.bulk import Bulk

LOGGER = singer.get_logger('tap_salesforce')

BLACKLISTED_FIELDS = set(['attributes'])


def remove_blacklisted_fields(data):
    return {k: v for k, v in data.items() if k not in BLACKLISTED_FIELDS}


# pylint: disable=unused-argument
def transform_bulk_data_hook(data, typ, schema):
    result = data
    if isinstance(data, dict):
        result = remove_blacklisted_fields(data)

    # Salesforce Bulk API returns CSV's with empty strings for text fields.
    # When the text field is nillable and the data value is an empty string,
    # change the data so that it is None.
    if data == "" and "null" in schema['type']:
        result = None

    return result
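# --- Illustrative usage (sample schema/records made up, not from tap-salesforce):
# the hook above is typically passed to singer.Transformer as its pre_hook; empty
# strings become None for nullable fields and the `attributes` key is stripped.
if __name__ == '__main__':
    nullable_text = {'type': ['null', 'string']}

    assert transform_bulk_data_hook('', None, nullable_text) is None
    assert transform_bulk_data_hook('hello', None, nullable_text) == 'hello'
    assert transform_bulk_data_hook({'attributes': {}, 'Id': '001'}, None, {'type': 'object'}) == {'Id': '001'}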
import os

import singer
import pendulum

logger = singer.get_logger()


class ExactsalesStream(object):
    tap = None

    endpoint = ''
    key_properties = []
    state_field = None
    initial_state = None
    earliest_state = None
    stream_start = None
    schema = ''
    schema_path = 'schemas/{}.json'
    schema_cache = None

    start = 1
    limit = 100
    next_start = 100

    payload = []

    def get_schema(self):
        if not self.schema_cache:
            self.schema_cache = self.load_schema()
        return self.schema_cache

    def load_schema(self):
class SQLInterface:
    """
    Generic interface for handling SQL Targets in Singer.

    Provides reasonable defaults for:
    - nested schemas -> traditional SQL Tables and Columns
    - nested records -> traditional SQL Table rows

    Expected usage for use with your given target is to:
    - override all public _non-helper_ functions
    - use all public _helper_ functions inside of your _non-helper_ functions

    Function Syntax:
    - `_...` prefix : Private function
    - `..._helper` suffix : Helper function
    """

    IDENTIFIER_FIELD_LENGTH = NotImplementedError('`IDENTIFIER_FIELD_LENGTH` not implemented.')
    LOGGER = singer.get_logger()

    def _set_timer_tags(self, metric, job_type, path):
        metric.tags['job_type'] = job_type
        metric.tags['path'] = path
        metric.tags.update(self.metrics_tags())

        return metric

    def _set_counter_tags(self, metric, counter_type, path):
        metric.tags['count_type'] = counter_type
        metric.tags['path'] = path
        metric.tags.update(self.metrics_tags())

        return metric

    def _set_metrics_tags__table(self, metric, table_name):
        metric.tags['table'] = table_name

        return metric

    def metrics_tags(self):
        """
        Optional function to overwrite to include more tags into Singer Metrics.
        :return: Dictionary of Tags
        """
        return {}

    def json_schema_to_sql_type(self, schema):
        """
        Given a JSONSchema structure, return a compatible string representing a SQL column type.
        :param schema: JSONSchema
        :return: string
        """
        raise NotImplementedError('`json_schema_to_sql_type` not implemented.')

    def get_table_schema(self, connection, name):
        """
        Fetch the `table_schema` for `name`.
        :param connection: remote connection, type left to be determined by implementing class
        :param name: string
        :return: TABLE_SCHEMA(remote)
        """
        raise NotImplementedError('`get_table_schema` not implemented.')

    def _get_table_schema(self, connection, name):
        """
        get_table_schema, but with checking the version of the schema to ensure latest format.
        :param connection: remote connection, type left to be determined by implementing class
        :param name: string
        :return: TABLE_SCHEMA(remote)
        """
        remote_schema = self.get_table_schema(connection, name)
        if remote_schema and remote_schema.get('schema_version', 0) != CURRENT_SCHEMA_VERSION:
            raise Exception('Schema for `{}` is of version {}. Expected version {}'.format(
                name,
                remote_schema.get('schema_version', 0),
                CURRENT_SCHEMA_VERSION))

        return remote_schema

    def is_table_empty(self, connection, name):
        """
        Returns True when given table name has no rows.
        :param connection: remote connection, type left to be determined by implementing class
        :param name: string
        :return: boolean
        """
        raise NotImplementedError('`is_table_empty` not implemented.')

    def canonicalize_identifier(self, name):
        """
        Given a SQL Identifier `name`, attempt to serialize it to an acceptable name for remote.
        NOTE: DOES NOT handle collision support, nor identifier length/truncation support.
        :param name: string
        :return: string
        """
        raise NotImplementedError('`canonicalize_identifier` not implemented.')

    def fetch_column_from_path(self, path, table_schema):
        """
        Should only be used for paths which have been established, ie, the schema will
        not be changing etc.
        :param path:
        :param table_schema:
        :return:
        """
        for to, m in table_schema.get('mappings', {}).items():
            if tuple(m['from']) == path:
                return to, json_schema.simple_type(m)

        raise Exception('Could not find column for path {} in table schema.'.format(path))

    def _canonicalize_column_identifier(self, path, schema, mappings):
        """Find (or build) a unique, length-limited column name for `path` with type `schema`."""

        from_type__to_name = {}
        existing_paths = set()
        existing_column_names = set()

        for m in mappings:
            from_type__to_name[(m['from'], json_schema.shorthand(m))] = m['to']
            existing_paths.add(m['from'])
            existing_column_names.add(m['to'])

        ## MAPPING EXISTS, NO CANONICALIZATION NECESSARY
        if (path, json_schema.shorthand(schema)) in from_type__to_name:
            return from_type__to_name[(path, json_schema.shorthand(schema))]

        raw_canonicalized_column_name = self.canonicalize_identifier(SEPARATOR.join(path))
        canonicalized_column_name = self.canonicalize_identifier(
            raw_canonicalized_column_name[:self.IDENTIFIER_FIELD_LENGTH])

        raw_suffix = ''

        ## NO TYPE MATCH
        if path in existing_paths:
            raw_suffix = SEPARATOR + json_schema.shorthand(schema)
            canonicalized_column_name = self.canonicalize_identifier(
                raw_canonicalized_column_name[:self.IDENTIFIER_FIELD_LENGTH - len(raw_suffix)] + raw_suffix)

            self.LOGGER.warning(
                'FIELD COLLISION: Field `{}` exists in remote already. No compatible type found. Appending type suffix: `{}`'.format(
                    path, canonicalized_column_name))

        i = 0
        ## NAME COLLISION
        while canonicalized_column_name in existing_column_names:
            self.LOGGER.warning(
                'NAME COLLISION: Field `{}` collided with `{}` in remote. Adding new integer suffix...'.format(
                    path, canonicalized_column_name))

            i += 1
            suffix = raw_suffix + SEPARATOR + str(i)
            canonicalized_column_name = self.canonicalize_identifier(
                raw_canonicalized_column_name[:self.IDENTIFIER_FIELD_LENGTH - len(suffix)] + suffix)

        return canonicalized_column_name

    def add_table(self, connection, path, name, metadata):
        """
        Create the remote table schema.

        :param connection: remote connection, type left to be determined by implementing class
        :param path: (String, ...)
        :param name: String
        :param metadata: additional metadata needed by implementing class
        :return: None
        """
        raise NotImplementedError('`add_table` not implemented.')

    def add_key_properties(self, connection, table_name, key_properties):
        """
        :param connection: remote connection, type left to be determined by implementing class
        :param table_name: string
        :param key_properties: [string, ...]
        :return: None
        """
        raise NotImplementedError('`add_key_properties` not implemented.')

    def add_table_mapping_helper(self, from_path, table_mappings):
        """
        :param from_path:
        :param table_mappings:
        :return: (boolean, string)
        """

        ## MAPPING EXISTS
        if from_path in table_mappings:
            return {'exists': True, 'to': table_mappings[from_path]}

        to_from = dict([(v, k) for k, v in table_mappings.items()])

        name = SEPARATOR.join(from_path)

        raw_canonicalized_name = self.canonicalize_identifier(name)
        canonicalized_name = self.canonicalize_identifier(
            raw_canonicalized_name[:self.IDENTIFIER_FIELD_LENGTH])

        i = 0
        ## NAME COLLISION
        while canonicalized_name in to_from:
            self.LOGGER.warning(
                'NAME COLLISION: Table `{}` collided with `{}` in remote. Adding new integer suffix...'.format(
                    from_path, canonicalized_name))

            i += 1
            suffix = SEPARATOR + str(i)
            canonicalized_name = self.canonicalize_identifier(
                raw_canonicalized_name[:self.IDENTIFIER_FIELD_LENGTH - len(suffix)] + suffix)

        return {'exists': False, 'to': canonicalized_name}

    def add_table_mapping(self, connection, from_path, metadata):
        """
        Given a full path to a table, `from_path`, add a table mapping to the canonicalized name.
        :param connection: remote connection, type left to be determined by implementing class
        :param from_path: (string, ...)
        :param metadata: additional metadata needed by implementing class
        :return: None
        """
        raise NotImplementedError('`add_table_mapping` not implemented.')

    def add_column(self, connection, table_name, name, schema):
        """
        Add column `name` in `table_name` with `schema`.

        :param connection: remote connection, type left to be determined by implementing class
        :param table_name: string
        :param name: string
        :param schema: JSON Object Schema
        :return: None
        """
        raise NotImplementedError('`add_column` not implemented.')

    def drop_column(self, connection, table_name, name):
        """
        Drop column `name` in `table_name`.

        :param connection: remote connection, type left to be determined by implementing class
        :param table_name: string
        :param name: string
        :return: None
        """
        raise NotImplementedError('`drop_column` not implemented.')

    def migrate_column(self, connection, table_name, from_column, to_column):
        """
        Migrate data from `from_column` to `to_column` in `table_name`.

        :param connection: remote connection, type left to be determined by implementing class
        :param table_name: string
        :param from_column: string
        :param to_column: string
        :return: None
        """
        raise NotImplementedError('`migrate_column` not implemented.')

    def make_column_nullable(self, connection, table_name, name):
        """
        Update column `name` in `table_name` to accept `null` values.

        :param connection: remote connection, type left to be determined by implementing class
        :param table_name: string
        :param name: string
        :return: None
        """
        raise NotImplementedError('`make_column_nullable` not implemented.')

    def add_index(self, connection, table_name, column_names):
        """
        Add an index on a group of `column_names` in `table_name`.

        :param connection: remote connection, type left to be determined by implementing class
        :param table_name: string
        :param column_names: (string, ...)
        :return: None
        """
        raise NotImplementedError('`add_index` not implemented.')

    def add_column_mapping(self, connection, table_name, from_path, to_name, schema):
        """
        Given column path `from_path` add a column mapping to `to_name` for `schema`.
        A column mapping is an entry in the TABLE_SCHEMA which reads:

        {...
         'mappings': [...
           `to_name`: {'type': `json_schema.get_type(schema)`,
                       'from': `path`}
         ]
         ...}

        :param connection: remote connection, type left to be determined by implementing class
        :param table_name: string
        :param from_path: (string, ...)
        :param to_name: string
        :param schema: JSON Object Schema
        :return: None
        """
        raise NotImplementedError('`add_column_mapping` not implemented.')

    def drop_column_mapping(self, connection, table_name, name):
        """
        Given column mapping `name`, remove from the TABLE_SCHEMA(remote).

        :param connection: remote connection, type left to be determined by implementing class
        :param table_name: string
        :param name: string
        :return: None
        """
        raise NotImplementedError('`drop_column_mapping` not implemented.')

    def _get_mapping(self, existing_schema, path, schema):
        for to, mapping in existing_schema.get('mappings', {}).items():
            if tuple(mapping['from']) == path \
                    and json_schema.shorthand(mapping) == json_schema.shorthand(schema):
                return to

        return None

    def upsert_table_helper(self, connection, schema, metadata, log_schema_changes=True):
        """
        Upserts the `schema` to remote by:
        - creating table if necessary
        - adding columns
        - adding column mappings
        - migrating data from old columns to new, etc.
        :param connection: remote connection, type left to be determined by implementing class
        :param schema: TABLE_SCHEMA(local)
        :param metadata: additional information necessary for downstream operations
        :param log_schema_changes: defaults to True, set to False to disable logging of table level schema changes
        :return: TABLE_SCHEMA(remote)
        """
        table_path = schema['path']

        with self._set_timer_tags(metrics.job_timer(),
                                  'upsert_table_schema',
                                  table_path) as timer:

            _metadata = deepcopy(metadata)
            _metadata['schema_version'] = CURRENT_SCHEMA_VERSION

            table_name = self.add_table_mapping(connection, table_path, _metadata)

            self._set_metrics_tags__table(timer, table_name)

            existing_schema = self._get_table_schema(connection, table_name)

            existing_table = True
            if existing_schema is None:
                self.add_table(connection, table_path, table_name, _metadata)
                existing_schema = self._get_table_schema(connection, table_name)
                existing_table = False

            self.add_key_properties(connection, table_name, schema.get('key_properties', None))

            ## Build up mappings to compare new columns against existing
            mappings = []

            for to, m in existing_schema.get('mappings', {}).items():
                mapping = json_schema.simple_type(m)
                mapping['from'] = tuple(m['from'])
                mapping['to'] = to
                mappings.append(mapping)

            ## Only process columns which have single, nullable, types
            column_paths_seen = set()
            single_type_columns = []

            for column_path, column_schema in schema['schema']['properties'].items():
                column_paths_seen.add(column_path)
                for sub_schema in column_schema['anyOf']:
                    single_type_columns.append((column_path, deepcopy(sub_schema)))

            ### Add any columns missing from new schema
            for m in mappings:
                if not m['from'] in column_paths_seen:
                    single_type_columns.append((m['from'], json_schema.make_nullable(m)))

            ## Process new columns against existing
            table_empty = self.is_table_empty(connection, table_name)

            for column_path, column_schema in single_type_columns:
                upsert_table_helper__start__column = time.monotonic()

                canonicalized_column_name = self._canonicalize_column_identifier(
                    column_path, column_schema, mappings)
                nullable_column_schema = json_schema.make_nullable(column_schema)

                def log_message(msg):
                    if log_schema_changes:
                        self.LOGGER.info(
                            'Table Schema Change [`{}`.`{}`:`{}`] {} (took {} millis)'.format(
                                table_name,
                                column_path,
                                canonicalized_column_name,
                                msg,
                                _duration_millis(upsert_table_helper__start__column)))

                ## NEW COLUMN
                if not column_path in [m['from'] for m in mappings]:
                    upsert_table_helper__column = "New column"

                    ### NON EMPTY TABLE
                    if not table_empty:
                        upsert_table_helper__column += ", non empty table"
                        self.LOGGER.warning(
                            'NOT EMPTY: Forcing new column `{}` in table `{}` to be nullable due to table not empty.'.format(
                                column_path, table_name))
                        column_schema = nullable_column_schema

                    self.add_column(connection,
                                    table_name,
                                    canonicalized_column_name,
                                    column_schema)
                    self.add_column_mapping(connection,
                                            table_name,
                                            column_path,
                                            canonicalized_column_name,
                                            column_schema)

                    mapping = json_schema.simple_type(column_schema)
                    mapping['from'] = column_path
                    mapping['to'] = canonicalized_column_name
                    mappings.append(mapping)

                    log_message(upsert_table_helper__column)

                    continue

                ## EXISTING COLUMNS
                ### SCHEMAS MATCH
                if [True for m in mappings
                        if m['from'] == column_path
                        and self.json_schema_to_sql_type(m) == self.json_schema_to_sql_type(column_schema)]:
                    continue

                ### NULLABLE SCHEMAS MATCH
                ### New column _is not_ nullable, existing column _is_
                if [True for m in mappings
                        if m['from'] == column_path
                        and self.json_schema_to_sql_type(m) == self.json_schema_to_sql_type(nullable_column_schema)]:
                    continue

                ### NULL COMPATIBILITY
                ### New column _is_ nullable, existing column is _not_
                non_null_original_column = [
                    m for m in mappings
                    if m['from'] == column_path
                    and json_schema.shorthand(m) == json_schema.shorthand(column_schema)]

                if non_null_original_column:
                    ## MAKE NULLABLE
                    self.make_column_nullable(connection,
                                              table_name,
                                              canonicalized_column_name)
                    self.drop_column_mapping(connection,
                                             table_name,
                                             canonicalized_column_name)
                    self.add_column_mapping(connection,
                                            table_name,
                                            column_path,
                                            canonicalized_column_name,
                                            nullable_column_schema)

                    mappings = [
                        m for m in mappings
                        if not (m['from'] == column_path
                                and json_schema.shorthand(m) == json_schema.shorthand(column_schema))]

                    mapping = json_schema.simple_type(nullable_column_schema)
                    mapping['from'] = column_path
                    mapping['to'] = canonicalized_column_name
                    mappings.append(mapping)

                    log_message("Made existing column nullable.")

                    continue

                ### FIRST MULTI TYPE
                ### New column matches existing column path, but the types are incompatible
                duplicate_paths = [m for m in mappings if m['from'] == column_path]

                if 1 == len(duplicate_paths):
                    existing_mapping = duplicate_paths[0]
                    existing_column_name = existing_mapping['to']

                    if existing_column_name:
                        self.drop_column_mapping(connection, table_name, existing_column_name)

                    ## Update existing properties
                    mappings = [m for m in mappings if m['from'] != column_path]

                    mapping = json_schema.simple_type(nullable_column_schema)
                    mapping['from'] = column_path
                    mapping['to'] = canonicalized_column_name
                    mappings.append(mapping)

                    existing_column_new_normalized_name = self._canonicalize_column_identifier(
                        column_path, existing_mapping, mappings)

                    mapping = json_schema.simple_type(json_schema.make_nullable(existing_mapping))
                    mapping['from'] = column_path
                    mapping['to'] = existing_column_new_normalized_name
                    mappings.append(mapping)

                    ## Add new columns
                    ### NOTE: all migrated columns will be nullable and remain that way

                    #### Table Metadata
                    self.add_column_mapping(connection,
                                            table_name,
                                            column_path,
                                            existing_column_new_normalized_name,
                                            json_schema.make_nullable(existing_mapping))
                    self.add_column_mapping(connection,
                                            table_name,
                                            column_path,
                                            canonicalized_column_name,
                                            nullable_column_schema)

                    #### Columns
                    self.add_column(connection,
                                    table_name,
                                    existing_column_new_normalized_name,
                                    json_schema.make_nullable(existing_mapping))
                    self.add_column(connection,
                                    table_name,
                                    canonicalized_column_name,
                                    nullable_column_schema)

                    ## Migrate existing data
                    self.migrate_column(connection,
                                        table_name,
                                        existing_mapping['to'],
                                        existing_column_new_normalized_name)

                    ## Drop existing column
                    self.drop_column(connection,
                                     table_name,
                                     existing_mapping['to'])

                    upsert_table_helper__column = "Splitting `{}` into `{}` and `{}`. New column matches existing column path, but the types are incompatible.".format(
                        existing_column_name,
                        existing_column_new_normalized_name,
                        canonicalized_column_name)

                ## REST MULTI TYPE
                elif 1 < len(duplicate_paths):
                    ## Add new column
                    self.add_column_mapping(connection,
                                            table_name,
                                            column_path,
                                            canonicalized_column_name,
                                            nullable_column_schema)
                    self.add_column(connection,
                                    table_name,
                                    canonicalized_column_name,
                                    nullable_column_schema)

                    mapping = json_schema.simple_type(nullable_column_schema)
                    mapping['from'] = column_path
                    mapping['to'] = canonicalized_column_name
                    mappings.append(mapping)

                    upsert_table_helper__column = "Adding new column to split column `{}`. New column matches existing column's path, but no types were compatible.".format(
                        column_path)

                ## UNKNOWN
                else:
                    raise Exception(
                        'UNKNOWN: Cannot handle merging column `{}` (canonicalized as: `{}`) in table `{}`.'.format(
                            column_path,
                            canonicalized_column_name,
                            table_name))

                log_message(upsert_table_helper__column)

            if not existing_table:
                for column_names in self.new_table_indexes(schema):
                    self.add_index(connection, table_name, column_names)

            return self._get_table_schema(connection, table_name)

    def _serialize_table_record_field_name(self, remote_schema, path, value_json_schema):
        """
        Returns the appropriate remote field (column) name for `path`.

        :param remote_schema: TABLE_SCHEMA(remote)
        :param path: (string, ...)
        :param value_json_schema: dict, JSON Schema
        :return: string
        """
        simple_json_schema = json_schema.simple_type(value_json_schema)

        mapping = self._get_mapping(remote_schema, path, simple_json_schema)

        if mapping is not None:
            return mapping

        ## Numbers are valid as `float` OR `int`
        ## ie, 123.0 and 456 are valid 'number's
        if json_schema.INTEGER in json_schema.get_type(simple_json_schema):
            mapping = self._get_mapping(remote_schema, path, {'type': json_schema.NUMBER})

            if mapping is not None:
                return mapping

        raise Exception(
            "A compatible column for path {} and JSONSchema {} in table {} cannot be found.".format(
                path,
                simple_json_schema,
                remote_schema['path']))

    def serialize_table_record_null_value(self, remote_schema, streamed_schema, field, value):
        """
        Returns the serialized version of `value` which is appropriate for the target's null
        implementation.

        :param remote_schema: TABLE_SCHEMA(remote)
        :param streamed_schema: TABLE_SCHEMA(local)
        :param field: string
        :param value: literal
        :return: literal
        """
        raise NotImplementedError('`serialize_table_record_null_value` not implemented.')

    def serialize_table_record_datetime_value(self, remote_schema, streamed_schema, field, value):
        """
        Returns the serialized version of `value` which is appropriate for the target's datetime
        implementation.

        :param remote_schema: TABLE_SCHEMA(remote)
        :param streamed_schema: TABLE_SCHEMA(local)
        :param field: string
        :param value: literal
        :return: literal
        """
        raise NotImplementedError('`serialize_table_record_datetime_value` not implemented.')

    def _serialize_table_records(self, remote_schema, streamed_schema, records):
        """
        Parse the given table's `records` in preparation for persistence to the remote target.

        Base implementation returns a list of dictionaries, where _every_ dictionary has the
        same keys as `remote_schema`'s properties.

        :param remote_schema: TABLE_SCHEMA(remote)
        :param streamed_schema: TABLE_SCHEMA(local)
        :param records: [{(path_0, path_1, ...): (_json_schema_string_type, value), ...}, ...]
        :return: [{...}, ...]
""" datetime_paths = set() default_paths = {} for column_path, column_schema in streamed_schema['schema'][ 'properties'].items(): for sub_schema in column_schema['anyOf']: if json_schema.is_datetime(sub_schema): datetime_paths.add(column_path) if sub_schema.get('default') is not None: default_paths[column_path] = sub_schema.get('default') ## Get the default NULL value so we can assign row values when value is _not_ NULL NULL_DEFAULT = self.serialize_table_record_null_value( remote_schema, streamed_schema, None, None) serialized_rows = [] remote_fields = set(remote_schema['schema']['properties'].keys()) default_row = dict([(field, NULL_DEFAULT) for field in remote_fields]) paths = streamed_schema['schema']['properties'].keys() for record in records: row = deepcopy(default_row) for path in paths: json_schema_string_type, value = record.get(path, (None, None)) ## Serialize fields which are not present but have default values set if path in default_paths \ and value is None: value = default_paths[path] json_schema_string_type = json_schema.python_type(value) if not json_schema_string_type: continue ## Serialize datetime to compatible format if path in datetime_paths \ and json_schema_string_type == json_schema.STRING \ and value is not None: value = self.serialize_table_record_datetime_value( remote_schema, streamed_schema, path, value) value_json_schema = { 'type': json_schema.STRING, 'format': json_schema.DATE_TIME_FORMAT } else: value_json_schema = {'type': json_schema_string_type} ## Serialize NULL default value value = self.serialize_table_record_null_value( remote_schema, streamed_schema, path, value) field_name = self._serialize_table_record_field_name( remote_schema, path, value_json_schema) ## `field_name` is unset if row[field_name] == NULL_DEFAULT: row[field_name] = value serialized_rows.append(row) return serialized_rows def write_table_batch(self, connection, table_batch, metadata): """ Update the remote for given table's schema, and write records. Returns the number of records persisted. :param connection: remote connection, type left to be determined by implementing class :param table_batch: {'remote_schema': TABLE_SCHEMA(remote), 'records': [{...}, ...]} :param metadata: additional metadata needed by implementing class :return: integer """ raise NotImplementedError('`write_table_batch` not implemented.') def write_batch_helper(self, connection, root_table_name, schema, key_properties, records, metadata): """ Write all `table_batch`s associated with the given `schema` and `records` to remote. :param connection: remote connection, type left to be determined by implementing class :param root_table_name: string :param schema: SingerStreamSchema :param key_properties: [string, ...] :param records: [{...}, ...] 
        :param metadata: additional metadata needed by implementing class
        :return: {'records_persisted': int,
                  'rows_persisted': int}
        """
        with self._set_timer_tags(metrics.job_timer(),
                                  'batch',
                                  (root_table_name,)):
            with self._set_counter_tags(metrics.record_counter(None),
                                        'batch_rows_persisted',
                                        (root_table_name,)) as batch_counter:
                self.LOGGER.info('Writing batch with {} records for `{}` with `key_properties`: `{}`'.format(
                    len(records),
                    root_table_name,
                    key_properties))

                for table_batch in denest.to_table_batches(schema, key_properties, records):
                    table_batch['streamed_schema']['path'] = (root_table_name,) + table_batch['streamed_schema']['path']

                    with self._set_timer_tags(metrics.job_timer(),
                                              'table',
                                              table_batch['streamed_schema']['path']) as table_batch_timer:
                        with self._set_counter_tags(metrics.record_counter(None),
                                                    'table_rows_persisted',
                                                    table_batch['streamed_schema']['path']) as table_batch_counter:
                            self.LOGGER.info('Writing table batch schema for `{}`...'.format(
                                table_batch['streamed_schema']['path']))

                            remote_schema = self.upsert_table_helper(
                                connection,
                                table_batch['streamed_schema'],
                                metadata)

                            self._set_metrics_tags__table(table_batch_timer, remote_schema['name'])
                            self._set_metrics_tags__table(table_batch_counter, remote_schema['name'])

                            self.LOGGER.info('Writing table batch with {} rows for `{}`...'.format(
                                len(table_batch['records']),
                                table_batch['streamed_schema']['path']))

                            batch_rows_persisted = self.write_table_batch(
                                connection,
                                {'remote_schema': remote_schema,
                                 'records': self._serialize_table_records(
                                     remote_schema,
                                     table_batch['streamed_schema'],
                                     table_batch['records'])},
                                metadata)

                            table_batch_counter.increment(batch_rows_persisted)
                            batch_counter.increment(batch_rows_persisted)

                return {
                    'records_persisted': len(records),
                    'rows_persisted': batch_counter.value
                }

    def write_batch(self, stream_buffer):
        """
        Persist `stream_buffer.records` to remote.

        :param stream_buffer: SingerStreamBuffer
        :return: {'records_persisted': int,
                  'rows_persisted': int}
        """
        raise NotImplementedError('`write_batch` not implemented.')

    def activate_version(self, stream_buffer, version):
        """
        Activate the given `stream_buffer`'s remote to `version`

        :param stream_buffer: SingerStreamBuffer
        :param version: integer
        :return: boolean
        """
        raise NotImplementedError('`activate_version` not implemented.')

    def new_table_indexes(self, schema):
        """
        Returns a list of lists of string column names to add indexes for a new table once
        that new table has been fully created.

        For subclasses where indexes don't make any sense, like Redshift, this can safely
        always return an empty list.

        :param schema: TABLE_SCHEMA(local)
        :return: [[column_name: string], [column_name: string, column_name: string], ...]
        """
        return []
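# --- Minimal sketch (assumptions, not part of the interface above): a concrete target
# subclasses SQLInterface, overrides the public non-helper methods, and calls the
# *_helper functions from its own write_batch. Only a few overrides are shown here;
# a real implementation must also provide every method that raises NotImplementedError.
# `json_schema` and the stream buffer's peek_buffer() API are assumed to be available
# from the surrounding package, and get_connection() is a hypothetical helper.
import re


class SketchSQLTarget(SQLInterface):
    IDENTIFIER_FIELD_LENGTH = 63

    def canonicalize_identifier(self, name):
        # Lowercase and replace anything that is not alphanumeric or underscore
        return re.sub(r'[^a-z0-9_]', '_', name.lower())

    def json_schema_to_sql_type(self, schema):
        # Deliberately simplified type mapping for the sketch
        types = json_schema.get_type(schema)
        if 'integer' in types:
            return 'BIGINT'
        if 'number' in types:
            return 'DOUBLE PRECISION'
        return 'TEXT'

    def write_batch(self, stream_buffer):
        with self.get_connection() as connection:  # hypothetical connection helper
            return self.write_batch_helper(
                connection,
                stream_buffer.stream,
                stream_buffer.schema,
                stream_buffer.key_properties,
                stream_buffer.peek_buffer(),
                {})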
import collections
import logging

import singer
import singer.metrics as metrics
import singer.schema
import snowflake.connector

from singer import metadata
from singer import utils
from singer.catalog import Catalog, CatalogEntry
from singer.schema import Schema

import tap_snowflake.sync_strategies.common as common
import tap_snowflake.sync_strategies.full_table as full_table
import tap_snowflake.sync_strategies.incremental as incremental
from tap_snowflake.connection import SnowflakeConnection

LOGGER = singer.get_logger('tap_snowflake')

# Max number of rows that a SHOW SCHEMAS|TABLES|COLUMNS can return.
# If more than this number of rows returned then tap-snowflake will raise TooManyRecordsException
SHOW_COMMAND_MAX_ROWS = 9999

# Tone down snowflake connector logs noise
logging.getLogger('snowflake.connector').setLevel(logging.WARNING)

Column = collections.namedtuple('Column', [
    'table_catalog',
    'table_schema',
    'table_name',
    'column_name',
    'data_type',
    'character_maximum_length',
    'numeric_precision',
    'numeric_scale'
])

REQUIRED_CONFIG_KEYS = [
    'account',
    'dbname',
    'user',
    'password',
    'warehouse',
    'tables'
def __init__(self, connection_config, stream_schema_message=None, table_cache=None):
    """
    connection_config:      Redshift connection details

    stream_schema_message:  An instance of the DbSync class is typically used to load
                            data only from a certain singer tap stream.

                            The stream_schema_message holds the destination schema
                            name and the JSON schema that will be used to validate
                            every RECORD message that comes from the stream.
                            Schema validation happens before creating the CSV and
                            before uploading data into Redshift.

                            If stream_schema_message is not defined then we can use
                            the DbSync instance as a generic purpose connection to
                            Redshift and can run individual queries. For example,
                            collecting catalog information from Redshift for caching
                            purposes.
    """
    self.connection_config = connection_config
    self.stream_schema_message = stream_schema_message
    self.table_cache = table_cache

    # logger to be used across the class's methods
    self.logger = get_logger("target_redshift")

    # Validate connection configuration
    config_errors = validate_config(connection_config)

    # Exit if config has errors
    if len(config_errors) != 0:
        self.logger.error("Invalid configuration:\n * {}".format("\n * ".join(config_errors)))
        sys.exit(1)

    aws_profile = self.connection_config.get("aws_profile") or os.environ.get("AWS_PROFILE")
    aws_access_key_id = self.connection_config.get("aws_access_key_id") or os.environ.get("AWS_ACCESS_KEY_ID")
    aws_secret_access_key = self.connection_config.get("aws_secret_access_key") or os.environ.get("AWS_SECRET_ACCESS_KEY")
    aws_session_token = self.connection_config.get("aws_session_token") or os.environ.get("AWS_SESSION_TOKEN")

    # Init S3 client
    # Conditionally pass keys as this seems to affect whether instance credentials
    # are correctly loaded if the keys are None
    if aws_access_key_id and aws_secret_access_key:
        aws_session = boto3.session.Session(
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            aws_session_token=aws_session_token,
        )
        credentials = aws_session.get_credentials().get_frozen_credentials()

        # Explicitly set credentials to those fetched from Boto so we can re-use them in COPY SQL if necessary
        self.connection_config["aws_access_key_id"] = credentials.access_key
        self.connection_config["aws_secret_access_key"] = credentials.secret_key
        self.connection_config["aws_session_token"] = credentials.token
    else:
        aws_session = boto3.session.Session(profile_name=aws_profile)

    self.s3 = aws_session.client("s3")

    self.skip_updates = self.connection_config.get("skip_updates", False)

    self.schema_name = None
    self.grantees = None

    # Init stream schema
    if self.stream_schema_message is not None:
        # Define target schema name.
        # --------------------------
        # Target schema name can be defined in multiple ways:
        #
        #   1: 'default_target_schema' key : Target schema is the same for every incoming stream if
        #                                    not specified explicitly for a given stream in
        #                                    the `schema_mapping` object
        #   2: 'schema_mapping' key : Target schema defined explicitly for a given stream.
        #      Example config.json:
        #          "schema_mapping": {
        #              "my_tap_stream_id": {
        #                  "target_schema": "my_redshift_schema",
        #                  "target_schema_select_permissions": {
        #                      "users": [ "user_1", "user_2" ],
        #                      "groups": [ "group_1", "group_2" ]
        #                  }
        #              }
        #          }
        config_default_target_schema = self.connection_config.get("default_target_schema", "").strip()
        config_schema_mapping = self.connection_config.get("schema_mapping", {})

        stream_name = stream_schema_message["stream"]
        stream_schema_name = stream_name_to_dict(stream_name)["schema_name"]

        if config_schema_mapping and stream_schema_name in config_schema_mapping:
            self.schema_name = config_schema_mapping[stream_schema_name].get("target_schema")
        elif config_default_target_schema:
            self.schema_name = config_default_target_schema

        if not self.schema_name:
            raise Exception(
                "Target schema name not defined in config. Neither 'default_target_schema' (string) nor "
                "'schema_mapping' (object) defines target schema for {} stream.".format(stream_name))

        # Define grantees
        # ---------------
        # Grantees can be defined in multiple ways:
        #
        #   1: 'default_target_schema_select_permissions' key : USAGE and SELECT privileges will be granted
        #                                                       on every table to a given role for every
        #                                                       incoming stream if not specified explicitly
        #                                                       in the `schema_mapping` object
        #   2: 'target_schema_select_permissions' key : Roles to grant USAGE and SELECT privileges defined
        #                                               explicitly for a given stream.
        #      Example config.json:
        #          "schema_mapping": {
        #              "my_tap_stream_id": {
        #                  "target_schema": "my_redshift_schema",
        #                  "target_schema_select_permissions": {
        #                      "users": [ "user_1", "user_2" ],
        #                      "groups": [ "group_1", "group_2" ]
        #                  }
        #              }
        #          }
        self.grantees = self.connection_config.get("default_target_schema_select_permissions")
        if config_schema_mapping and stream_schema_name in config_schema_mapping:
            self.grantees = config_schema_mapping[stream_schema_name].get(
                "target_schema_select_permissions", self.grantees)

        self.data_flattening_max_level = self.connection_config.get("data_flattening_max_level", 0)
        self.flatten_schema = flatten_schema(
            stream_schema_message["schema"],
            max_level=self.data_flattening_max_level,
        )
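# --- Illustrative sketch (not part of target-redshift): the schema-resolution order
# used above, reduced to a standalone function. `stream_schema_name` stands for the
# schema portion parsed out of the tap stream id by stream_name_to_dict(); the config
# values in the example calls are made up.
def resolve_target_schema(connection_config, stream_schema_name):
    schema_mapping = connection_config.get('schema_mapping', {})
    default_schema = connection_config.get('default_target_schema', '').strip()

    if schema_mapping and stream_schema_name in schema_mapping:
        return schema_mapping[stream_schema_name].get('target_schema')
    if default_schema:
        return default_schema

    raise Exception('Target schema name not defined in config.')

# resolve_target_schema({'default_target_schema': 'analytics'}, 'public') -> 'analytics'
# resolve_target_schema({'schema_mapping': {'public': {'target_schema': 'repl_public'}}}, 'public') -> 'repl_public'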
def __init__(self, connection_config, stream_schema_message=None, table_cache=None):
    """
    connection_config:      Snowflake connection details

    stream_schema_message:  An instance of the DbSync class is typically used to load
                            data only from a certain singer tap stream.

                            The stream_schema_message holds the destination schema
                            name and the JSON schema that will be used to validate
                            every RECORD message that comes from the stream.
                            Schema validation happens before creating the CSV and
                            before uploading data into Snowflake.

                            If stream_schema_message is not defined then we can use
                            the DbSync instance as a generic purpose connection to
                            Snowflake and can run individual queries. For example,
                            collecting catalog information from Snowflake for caching
                            purposes.
    """
    self.connection_config = connection_config
    self.stream_schema_message = stream_schema_message
    self.table_cache = table_cache

    # logger to be used across the class's methods
    self.logger = get_logger('target_snowflake')

    # Validate connection configuration
    config_errors = validate_config(connection_config)

    # Exit if config has errors
    if len(config_errors) > 0:
        self.logger.error("Invalid configuration:\n * {}".format('\n * '.join(config_errors)))
        sys.exit(1)

    stage = stream_name_to_dict(self.connection_config['stage'], separator='.')
    if not stage['schema_name']:
        self.logger.error(
            "The named external stage object in config has to use the <schema>.<stage_name> format.")
        sys.exit(1)

    self.schema_name = None
    self.grantees = None

    # Init stream schema
    if self.stream_schema_message is not None:
        # Define target schema name.
        # --------------------------
        # Target schema name can be defined in multiple ways:
        #
        #   1: 'default_target_schema' key : Target schema is the same for every incoming stream if
        #                                    not specified explicitly for a given stream in
        #                                    the `schema_mapping` object
        #   2: 'schema_mapping' key : Target schema defined explicitly for a given stream.
        #      Example config.json:
        #          "schema_mapping": {
        #              "my_tap_stream_id": {
        #                  "target_schema": "my_snowflake_schema",
        #                  "target_schema_select_permissions": [ "role_with_select_privs" ]
        #              }
        #          }
        config_default_target_schema = self.connection_config.get('default_target_schema', '').strip()
        config_schema_mapping = self.connection_config.get('schema_mapping', {})

        stream_name = stream_schema_message['stream']
        stream_schema_name = stream_name_to_dict(stream_name)['schema_name']

        if config_schema_mapping and stream_schema_name in config_schema_mapping:
            self.schema_name = config_schema_mapping[stream_schema_name].get('target_schema')
        elif config_default_target_schema:
            self.schema_name = config_default_target_schema

        if not self.schema_name:
            raise Exception(
                "Target schema name not defined in config. Neither 'default_target_schema' (string) nor "
                "'schema_mapping' (object) defines target schema for {} stream.".format(stream_name))

        # Define grantees
        # ---------------
        # Grantees can be defined in multiple ways:
        #
        #   1: 'default_target_schema_select_permissions' key : USAGE and SELECT privileges will be granted
        #                                                       on every table to a given role for every
        #                                                       incoming stream if not specified explicitly
        #                                                       in the `schema_mapping` object
        #   2: 'target_schema_select_permissions' key : Roles to grant USAGE and SELECT privileges defined
        #                                               explicitly for a given stream.
        #      Example config.json:
        #          "schema_mapping": {
        #              "my_tap_stream_id": {
        #                  "target_schema": "my_snowflake_schema",
        #                  "target_schema_select_permissions": [ "role_with_select_privs" ]
        #              }
        #          }
        self.grantees = self.connection_config.get('default_target_schema_select_permissions')
        if config_schema_mapping and stream_schema_name in config_schema_mapping:
            self.grantees = config_schema_mapping[stream_schema_name].get(
                'target_schema_select_permissions', self.grantees)

        self.data_flattening_max_level = self.connection_config.get('data_flattening_max_level', 0)
        self.flatten_schema = flatten_schema(
            stream_schema_message['schema'],
            max_level=self.data_flattening_max_level)

    self.s3 = boto3.client(
        's3',
        aws_access_key_id=self.connection_config.get('aws_access_key_id'),
        aws_secret_access_key=self.connection_config.get('aws_secret_access_key'),
        aws_session_token=self.connection_config.get('aws_session_token'))
import sys
import copy
import logging

from datetime import datetime
from decimal import Decimal
from tempfile import mkstemp
from typing import Dict

from dateutil import parser
from dateutil.parser import ParserError
from joblib import Parallel, delayed, parallel_backend
from jsonschema import Draft7Validator, FormatChecker
from singer import get_logger

from target_snowflake.db_sync import DbSync

LOGGER = get_logger('target_snowflake')

# Tone down snowflake.connector log noise by only outputting warnings and higher level messages
logging.getLogger('snowflake.connector').setLevel(logging.WARNING)

DEFAULT_BATCH_SIZE_ROWS = 100000
DEFAULT_PARALLELISM = 0  # 0 The number of threads used to flush tables
DEFAULT_MAX_PARALLELISM = 16  # Don't use more than this number of threads by default when flushing streams in parallel

# max timestamp/datetime supported in SF, used to reset all invalid dates that are beyond this value
MAX_TIMESTAMP = '9999-12-31 23:59:59.999999'

# max time supported in SF, used to reset all invalid times that are beyond this value
MAX_TIME = '23:59:59.999999'
def __init__(self, connection_config):
    self.connection_config = connection_config
    self.logger = get_logger('target_snowflake')
    self.s3 = self.create_s3_client()
"""Sync data.""" # -*- coding: utf-8 -*- import logging from datetime import datetime, timezone from typing import Callable, Optional import singer from singer.catalog import Catalog, CatalogEntry from tap_paypal import tools from tap_paypal.paypal import PayPal from tap_paypal.streams import STREAMS LOGGER: logging.RootLogger = singer.get_logger() def sync( paypal: PayPal, state: dict, catalog: Catalog, start_date: str, ) -> None: """Sync data from tap source. Arguments: paypal {PayPal} -- PayPal client state {dict} -- Tap state catalog {Catalog} -- Stream catalog start_date {str} -- Start date """ # For every stream in the catalog
import singer

LOGGER = singer.get_logger()


# pylint: disable=too-many-return-statements
def infer(key, datum, date_overrides, check_second_call=False):
    """
    Returns the inferred data type
    """
    if datum is None or datum == '':
        return None

    try:
        if isinstance(datum, list):
            data_type = 'string'
            if check_second_call:
                LOGGER.warning(
                    'Unsupported type for "%s", List inside list is not supported hence will be treated as a string',
                    key)
            elif not datum:
                data_type = 'list'
            else:
                data_type = 'list.' + infer(key, datum[0], date_overrides, True)
            return data_type

        if key in date_overrides:
            return 'date-time'

        if isinstance(datum, dict):
import io
import sys
import time

from collections import namedtuple
from decimal import Decimal
from jsonschema import Draft4Validator, FormatChecker

import singer
from singer import utils

from transform_field import transform
from transform_field.timings import Timings

LOGGER = singer.get_logger('transform_field')
TIMINGS = Timings(LOGGER)

DEFAULT_MAX_BATCH_BYTES = 4000000
DEFAULT_MAX_BATCH_RECORDS = 20000
DEFAULT_BATCH_DELAY_SECONDS = 300.0
VALIDATE_RECORDS = False

StreamMeta = namedtuple(
    'StreamMeta',
    ['schema', 'key_properties', 'bookmark_properties'])
TransMeta = namedtuple(
    'TransMeta',
    ['field_id', 'type', 'when', 'nested_field_id'])

REQUIRED_CONFIG_KEYS = [
    "transformations"
]
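# --- Illustrative example (field name and values made up): one entry of the
# "transformations" config roughly maps onto the TransMeta tuple above, e.g. a rule
# that hashes the `email` field of a stream would carry no condition and no nested id.
EXAMPLE_TRANSFORMATION = TransMeta(
    field_id='email',
    type='HASH',
    when=None,
    nested_field_id=None)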
import json

import singer
from dateutil.parser import parse

LOGGER = singer.get_logger()


def get_last_record_value_for_table(state, table):
    last_value = state.get('bookmarks', {}) \
                      .get(table, {}) \
                      .get('last_record')

    if last_value is None:
        return None

    return parse(last_value)


def incorporate(state, table, field, value):
    if value is None:
        return state

    new_state = state.copy()

    parsed = parse(value).strftime("%Y-%m-%dT%H:%M:%SZ")

    if 'bookmarks' not in new_state:
        new_state['bookmarks'] = {}

    if (new_state['bookmarks'].get(table, {}).get('last_record') is None or
import json
import re
import singer
import warnings

import singer.metadata as metadata

from psycopg2 import sql
from singer import utils, get_bookmark
from dateutil.parser import parse, UnknownTimezoneWarning, ParserError
from functools import reduce

import tap_postgres.db as post_db
import tap_postgres.sync_strategies.common as sync_common
from tap_postgres.stream_utils import refresh_streams_schema

LOGGER = singer.get_logger('tap_postgres')

UPDATE_BOOKMARK_PERIOD = 10000
FALLBACK_DATETIME = '9999-12-31T23:59:59.999+00:00'


class ReplicationSlotNotFoundError(Exception):
    """Custom exception when replication slot not found"""


class UnsupportedPayloadKindError(Exception):
    """Custom exception when waljson payload is not insert, update nor delete"""


# pylint: disable=invalid-name,missing-function-docstring,too-many-branches,too-many-statements,too-many-arguments
def get_pg_version(conn_info):