def main_impl():
    """Validate the configured ad account, then run discovery or sync.

    Parses the command-line arguments, initialises the Facebook Ads API
    with the configured access token and looks up the ad account whose
    ``account_id`` matches the configuration.

    Any ``FacebookError`` is re-raised through ``raise_from`` as a
    ``SingerConfigurationError`` (setup), ``SingerDiscoveryError``
    (discovery) or ``SingerSyncError`` (sync).
    """
    global RESULT_RETURN_LIMIT, API

    try:
        args = utils.parse_args(REQUIRED_CONFIG_KEYS)
        wanted_account_id = args.config['account_id']
        token = args.config['access_token']

        CONFIG.update(args.config)
        RESULT_RETURN_LIMIT = CONFIG.get('result_return_limit', RESULT_RETURN_LIMIT)

        API = FacebookAdsApi.init(access_token=token)
        me = fb_user.User(fbid='me')

        ad_accounts = me.get_ad_accounts()
        account = None
        # No early exit: every account is visited and the last match wins.
        for candidate in ad_accounts:
            if candidate['account_id'] == wanted_account_id:
                account = candidate

        if not account:
            raise SingerConfigurationError(
                "Couldn't find account with id {}".format(wanted_account_id))
    except FacebookError as fb_error:
        raise_from(SingerConfigurationError, fb_error)

    if args.discover:
        try:
            do_discover()
        except FacebookError as fb_error:
            raise_from(SingerDiscoveryError, fb_error)
        return

    if not args.properties:
        LOGGER.info("No properties were selected")
        return

    catalog = Catalog.from_dict(args.properties)
    try:
        do_sync(account, catalog, args.state)
    except FacebookError as fb_error:
        raise_from(SingerSyncError, fb_error)
def main_impl():
    """Configure the MySQL session, then run discovery or sync.

    Three session variables are set on a best-effort basis; a
    ``pymysql.err.InternalError`` on any of them is collected and logged
    rather than raised. The function then dispatches on ``--discover``,
    ``--catalog`` or ``--properties``.
    """
    args = utils.parse_args(REQUIRED_CONFIG_KEYS)
    connection = open_connection(args.config)

    # (statement, failure-message template) pairs, applied in this order.
    session_settings = (
        ('SET @@session.time_zone="+0:00"',
         'Could not set session.time_zone. Error: ({}) {}'),
        ('SET @@session.wait_timeout=2700',
         'Could not set session.wait_timeout. Error: ({}) {}'),
        ('SET @@session.innodb_lock_wait_timeout=2700',
         'Could not set session.innodb_lock_wait_timeout. Error: ({}) {}'),
    )

    session_warnings = []
    with connection.cursor() as cur:
        for statement, failure_template in session_settings:
            try:
                cur.execute(statement)
            except pymysql.err.InternalError as e:
                session_warnings.append(failure_template.format(*e.args))

        if session_warnings:
            LOGGER.info(("Encountered non-fatal errors when configuring MySQL session that could "
                         "impact performance:"))
            for message in session_warnings:
                LOGGER.warning(message)

    log_server_params(connection)

    if args.discover:
        do_discover(connection)
        return

    # --catalog takes precedence over the older --properties argument.
    if args.catalog:
        catalog = args.catalog
    elif args.properties:
        catalog = Catalog.from_dict(args.properties)
    else:
        LOGGER.info("No properties were selected")
        return

    state = build_state(args.state, catalog)
    do_sync(connection, catalog, state)
def discover(service):
    """Build a catalog from the OData entities exposed by ``service``.

    Only entities whose name is in the module-level ``selected_tables``
    are included. Each stream is marked as selected at the root
    breadcrumb. A stream is ``INCREMENTAL`` when its schema has a truthy
    ``createdon`` property, otherwise ``FULL_TABLE``.

    Args:
        service: object exposing ``entities``, a mapping of entity name to
            an entity carrying an ``__odata_schema__`` attribute.

    Returns:
        Catalog: one ``CatalogEntry`` per selected entity.
    """
    catalog = Catalog([])

    for entity_name, entity in service.entities.items():
        if entity_name not in selected_tables:
            continue

        schema_dict, metadata, pks = get_schema(entity.__odata_schema__)
        metadata.append({"breadcrumb": [], "metadata": {"selected": True}})
        schema = Schema.from_dict(schema_dict)

        # A schema with no (or a null) "properties" key used to raise
        # AttributeError here; treat it as having no columns instead.
        properties = schema_dict.get("properties") or {}
        replication_method = (
            "INCREMENTAL" if properties.get("createdon") else "FULL_TABLE"
        )

        catalog.streams.append(
            CatalogEntry(
                stream=entity_name,
                tap_stream_id=entity_name,
                key_properties=pks,
                schema=schema,
                metadata=metadata,
                replication_method=replication_method,
            ))

    return catalog
def generate_catalog(streams):
    """Build a catalog with standard metadata for each stream object.

    Args:
        streams: iterable of stream objects providing ``load_schema()``,
            ``name``, ``key_properties``, ``replication_key`` and
            ``replication_method``.

    Returns:
        Catalog: one ``CatalogEntry`` per stream, in input order.
    """
    catalog = Catalog([])

    for stream in streams:
        schema = stream.load_schema()
        # The previous ``metadata.new()`` call was a dead store: its
        # result was overwritten by the line below before being used.
        # NOTE(review): ``stream.replication_key`` is passed straight
        # through as ``valid_replication_keys`` -- confirm it is a list.
        mdata = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream.key_properties,
            valid_replication_keys=stream.replication_key or None,
            replication_method=stream.replication_method or None)

        catalog.streams.append(
            CatalogEntry(stream=stream.name,
                         tap_stream_id=stream.name,
                         key_properties=stream.key_properties,
                         schema=Schema.from_dict(schema),
                         metadata=mdata))

    return catalog
def sync(  # noqa: WPS210, WPS213
    wp: WordPressSupportForums,
    catalog: Catalog,
) -> None:
    """Write a schema message and all records for each selected stream.

    Arguments:
        wp {WordPressSupportForums} -- client; for every stream it must
            expose a callable attribute named after the stream id
        catalog {Catalog} -- Stream catalog
    """
    LOGGER.info('Sync')

    # get_selected_streams is called with an empty state.
    for entry in catalog.get_selected_streams({}):
        stream_id = entry.tap_stream_id
        LOGGER.info(f'Syncing stream: {stream_id}')

        singer.write_schema(
            stream_name=stream_id,
            schema=entry.schema.to_dict(),
            key_properties=entry.key_properties,
        )

        # The stream id doubles as the method name on the client,
        # e.g. stream "mysql" is served by wp.mysql().
        fetch_rows: Callable = getattr(wp, stream_id)

        for row in fetch_rows():
            singer.write_record(
                stream_id,
                row,
                time_extracted=datetime.now(timezone.utc),
            )
def test_should_output_no_records_given_no_records_available(
        self, mock_stdout, requests_mock):
    """An empty users page yields only the SCHEMA message and two log lines."""
    requests_mock.get(
        "https://api.nikabot.com/api/v1/users?limit=1000&page=0",
        json=json.loads(EMPTY_RESPONSE))

    config = {"access_token": "my-access-token", "page_size": 1000}
    state = {}
    selected_root = {"breadcrumb": [], "metadata": {"selected": True}}
    users_entry = CatalogEntry(
        tap_stream_id="users",
        stream="users",
        schema=Schema.from_dict({}),
        key_properties=["id"],
        metadata=[selected_root],
    )
    catalog = Catalog(streams=[users_entry])

    sync(config, state, catalog)

    expected_stdout_calls = [
        call(
            '{"type": "SCHEMA", "stream": "users", "schema": {}, "key_properties": ["id"]}\n'
        )
    ]
    expected_log_calls = [
        call("Syncing stream: %s", "users"),
        call(
            "Making %s request to %s with params %s",
            "GET",
            "https://api.nikabot.com/api/v1/users",
            {"limit": "1000", "page": "0"},
        ),
    ]

    assert mock_stdout.mock_calls == expected_stdout_calls
    assert LOGGER.info.mock_calls == expected_log_calls
def discover(config):
    """Sample the files of each configured table and infer a catalog.

    A table whose processing raises any exception is logged and left
    out of the catalog; the remaining tables are still processed.

    Args:
        config: mapping with a ``'tables'`` list of table specs. Each spec
            is read for ``start_date`` and ``name``, plus the optional
            ``sample_rate``, ``max_sampling_read``, ``max_sampled_files``
            and ``key_properties``.

    Returns:
        Catalog: entries for the tables that were processed successfully.
    """
    entries = []

    for table_spec in config['tables']:
        try:
            modified_since = dateutil.parser.parse(table_spec['start_date'])
            target_files = file_utils.get_matching_objects(
                table_spec, modified_since)

            sampling_options = {
                'sample_rate': table_spec.get('sample_rate', 5),
                'max_records': table_spec.get('max_sampling_read', 1000),
                'max_files': table_spec.get('max_sampled_files', 50),
            }
            samples = file_utils.sample_files(
                table_spec, target_files, **sampling_options)

            schema = generate_schema(table_spec, samples)
            stream_metadata = []
            key_properties = table_spec.get('key_properties', [])
            table_name = table_spec['name']

            entry = CatalogEntry(
                tap_stream_id=table_name,
                stream=table_name,
                schema=schema,
                key_properties=key_properties,
                metadata=stream_metadata,
                replication_key=None,
                is_view=None,
                database=None,
                table=None,
                row_count=None,
                stream_alias=None,
                replication_method=None,
            )
            entries.append(entry)
        except Exception as err:
            # Deliberate best-effort: one bad table must not abort discovery.
            LOGGER.error(
                f"Unable to write Catalog entry for '{table_spec['name']}' - it will be skipped due to error {err}"
            )

    return Catalog(entries)
def test_should_output_records(self, mock_stdout, requests_mock):
    """A teams response is written as one SCHEMA and one RECORD message."""
    requests_mock.get(
        "https://api.nikabot.com/api/v1/teams",
        json=json.loads(TEAMS_RESPONSE),
    )

    config = {"access_token": "my-access-token", "page_size": 1000}
    state = {}
    teams_entry = CatalogEntry(
        tap_stream_id="teams",
        stream="teams",
        schema=Schema.from_dict({}),
        key_properties=["id"],
        metadata=[{"breadcrumb": [], "metadata": {"selected": True}}],
    )
    catalog = Catalog(streams=[teams_entry])

    sync(config, state, catalog)

    # Expected stdout lines, kept as single literals so they can be
    # compared byte-for-byte with what the tap writes.
    schema_line = '{"type": "SCHEMA", "stream": "teams", "schema": {}, "key_properties": ["id"]}\n'
    record_line = '{"type": "RECORD", "stream": "teams", "record": {"id": "5d6ca50762a07c00045125fb", "domain": "pageup", "bot_token": "e31d3b7ae51ff1feec8be578f23eb017e8143f66a7a085342c664544b81618ec41b87810d61a9c1f6133fe0c7d88aa3976232bb2a2665c4f89c38058b51cd20c", "activated_by": "U6K26HMGV", "status": "ACTIVE", "platform_id": "T034F9NPW", "created_at": "2019-09-02T05:13:43.151", "subscription": {"active_until": "2020-07-08T23:59:59", "status": "active", "number_of_users": 69, "subscriber_id": "U93KT77T6"}, "icon": {"image_34": "https://avatars.slack-edge.com/2017-09-15/241678543093_b2ad80be9268cdbd89c3_34.png", "image_44": "https://avatars.slack-edge.com/2017-09-15/241678543093_b2ad80be9268cdbd89c3_44.png", "image_68": "https://avatars.slack-edge.com/2017-09-15/241678543093_b2ad80be9268cdbd89c3_68.png", "image_88": "https://avatars.slack-edge.com/2017-09-15/241678543093_b2ad80be9268cdbd89c3_88.png", "image_102": "https://avatars.slack-edge.com/2017-09-15/241678543093_b2ad80be9268cdbd89c3_102.png", "image_132": "https://avatars.slack-edge.com/2017-09-15/241678543093_b2ad80be9268cdbd89c3_132.png", "image_230": "https://avatars.slack-edge.com/2017-09-15/241678543093_b2ad80be9268cdbd89c3_230.png", "image_original": "https://avatars.slack-edge.com/2017-09-15/241678543093_b2ad80be9268cdbd89c3_original.png"}}, "time_extracted": "2020-01-01T00:00:00.000000Z"}\n'

    expected_calls = [call(schema_line), call(record_line)]
    assert mock_stdout.mock_calls == expected_calls
def expected_subset_catalog_selected_default_col():
    """Return the expected catalog: one selected ``category`` stream with an ``id`` column."""
    id_column_schema = {
        'minimum': -2147483648,
        'type': 'integer',
        'maximum': 2147483647,
        'inclusion': 'available',
    }

    stream_metadata = [
        # Root breadcrumb: the stream itself is selected.
        {'breadcrumb': (), 'metadata': {'selected': True}},
        # Column-level metadata for ``id``.
        {
            'breadcrumb': ('properties', 'id'),
            'metadata': {
                'selected-by-default': True,
                'sql-datatype': 'int2',
            },
        },
    ]

    category_stream = {
        'database_name': 'FakeDB',
        'table_name': 'category',
        'tap_stream_id': 'dev-category',
        'is_view': False,
        'stream': 'category',
        'schema': {
            'type': 'object',
            'properties': {'id': id_column_schema},
        },
        'metadata': stream_metadata,
    }

    return Catalog.from_dict({'streams': [category_stream]})
def test_getting_streams_to_sync(self):
    """Only the stream whose schema is marked selected is returned."""

    def annotated_stream(name, selected):
        # Minimal stream dict with the selection flag carried on the schema.
        return {
            'stream': name,
            'tap_stream_id': name,
            'schema': {'selected': selected},
        }

    annotated_schemas = {
        'streams': [
            annotated_stream('adcreative', True),
            annotated_stream('ads', False),
        ]
    }

    catalog = Catalog.from_dict(annotated_schemas)
    streams_to_sync = tap_facebook.get_streams_to_sync(None, catalog, None)

    names_to_sync = []
    for selected_stream in streams_to_sync:
        names_to_sync.append(selected_stream.name)

    self.assertEqual(['adcreative'], names_to_sync)
def discover():
    """Build the catalog from ``get_schemas()``.

    Key properties come from the ``table-key-properties`` value of the
    metadata entry whose breadcrumb is the empty tuple. If several such
    entries exist the last one wins; if none exists, key properties are
    ``None``.

    Returns:
        Catalog: one ``CatalogEntry`` per schema.
    """
    schemas, field_metadata = get_schemas()
    catalog = Catalog([])

    for stream_name, schema_dict in schemas.items():
        schema = Schema.from_dict(schema_dict)
        mdata = field_metadata[stream_name]

        root_entries = [
            entry.get('metadata', {})
            for entry in mdata
            if entry.get('breadcrumb') == ()
        ]
        table_metadata = root_entries[-1] if root_entries else {}

        entry = CatalogEntry(
            stream=stream_name,
            tap_stream_id=stream_name,
            key_properties=table_metadata.get('table-key-properties'),
            schema=schema,
            metadata=mdata)
        catalog.streams.append(entry)

    return catalog
def discover():
    """Build the catalog, forcing replication keys to ``automatic`` inclusion.

    For streams whose flattened definition has ``replication_method``
    equal to ``'INCREMENTAL'``, every field listed under
    ``replication_keys`` gets ``inclusion: automatic`` in its metadata.

    Returns:
        Catalog: one ``CatalogEntry`` per schema returned by ``get_schemas()``.
    """
    schemas, field_metadata = get_schemas()
    catalog = Catalog([])
    flat_streams = flatten_streams()

    for stream_name, schema_dict in schemas.items():
        schema = Schema.from_dict(schema_dict)
        mdata = metadata.to_map(field_metadata[stream_name])
        stream = flat_streams.get(stream_name, {})

        if stream.get('replication_method') == 'INCREMENTAL':
            # An INCREMENTAL stream without ``replication_keys`` used to
            # raise TypeError (iterating None); treat it as having none.
            for field_name in stream.get('replication_keys') or []:
                mdata = metadata.write(
                    mdata, ('properties', field_name), 'inclusion', 'automatic')

        catalog.streams.append(
            CatalogEntry(stream=stream_name,
                         tap_stream_id=stream_name,
                         key_properties=stream.get('key_properties', None),
                         schema=schema,
                         metadata=metadata.to_list(mdata)))

    return catalog
def discover(reports):
    """Build the catalog for the given reports.

    ``get_schemas(reports)`` supplies, per stream, a schema dict and a
    metadata map keyed by breadcrumb tuple. Key properties are taken from
    ``table-key-properties`` in the root (``()``) entry of that map.

    Args:
        reports: passed through unchanged to ``get_schemas``.

    Returns:
        Catalog: one ``CatalogEntry`` per schema.
    """
    schemas, field_metadata = get_schemas(reports)
    catalog = Catalog([])

    for stream_name, schema_dict in schemas.items():
        schema = Schema.from_dict(schema_dict)
        mdata = field_metadata[stream_name]

        # Look the root entry up directly and default to an empty dict.
        # The earlier scan never reset ``table_metadata`` per stream, so a
        # stream without a root entry either hit an unbound name or
        # inherited the previous stream's key properties.
        table_metadata = mdata.get((), {})
        key_properties = table_metadata.get('table-key-properties')

        catalog.streams.append(
            CatalogEntry(stream=stream_name,
                         tap_stream_id=stream_name,
                         key_properties=key_properties,
                         schema=schema,
                         metadata=metadata.to_list(mdata)))

    return catalog
def discover(config, config_path):
    """Build the catalog from the schemas resolved for this configuration.

    Args:
        config: passed through to ``get_schemas``.
        config_path: passed through to ``get_schemas``.

    Returns:
        Catalog: built via ``Catalog.from_dict`` from one dict per schema.
    """
    schemas, schemas_metadata = get_schemas(config, config_path)

    def as_catalog_entry(name, schema):
        # Key properties and replication settings are derived from the
        # stream's metadata by the module's ``_get_*_from_meta`` helpers.
        meta = schemas_metadata[name]
        return {
            'stream': name,
            'tap_stream_id': name,
            'schema': schema,
            'key_properties': _get_key_properties_from_meta(meta),
            'replication_method': _get_replication_method_from_meta(meta),
            'replication_key': _get_replication_key_from_meta(meta),
            'metadata': meta,
        }

    stream_dicts = [
        as_catalog_entry(schema_name, schema)
        for schema_name, schema in schemas.items()
    ]
    return Catalog.from_dict({'streams': stream_dicts})
def main():
    """Command-line entry point: resolve config, then discover or sync.

    Configuration comes from the ``--config`` file when given, otherwise
    from the ``typeform_config`` entry of ``env`` (parsed as JSON). With
    neither available the run is aborted after a critical log message.
    """
    parser = argparse.ArgumentParser()
    file_options = (
        ("-p", "--properties", "Catalog file with fields selected"),
        ("-c", "--config", "Optional config file"),
        ("-s", "--state", "State file"),
    )
    for short_flag, long_flag, help_text in file_options:
        parser.add_argument(short_flag, long_flag, help=help_text)
    parser.add_argument(
        "-d",
        "--discover",
        help="Build a catalog from the underlying schema",
        action="store_true",
    )
    args = parser.parse_args()

    if args.config:
        LOGGER.info("Config json found")
        config = load_file(args.config)
    elif "typeform_config" in env:
        LOGGER.info("Env var config found")
        config = json.loads(env["typeform_config"])
    else:
        LOGGER.critical("No config found, aborting run")
        return

    # NOTE(review): load_file is called even when the corresponding
    # argument was not supplied -- confirm it tolerates None.
    properties = load_file(args.properties)
    state = load_file(args.state)
    atx = Context(config, state)

    if args.discover:
        # The schema is static (loaded from file), so discovery needs no
        # connection info from the context.
        discovered = discover()
        json.dump(discovered.to_dict(), sys.stdout)
        return

    if args.properties:
        atx.catalog = Catalog.from_dict(properties)
    else:
        atx.catalog = discover()
    sync(atx)
def resolve_catalog(discovered, catalog, state):
    """Match the selected streams of ``catalog`` against ``discovered``.

    When the state names a currently-syncing stream, selected streams
    that come before it are skipped. A selected stream that is missing
    from the discovered catalog is logged and left out.

    Args:
        discovered: catalog from discovery; queried with ``get_stream``.
        catalog: input catalog whose ``streams`` are filtered with
            ``entry_is_selected``.
        state: state object passed to ``singer.get_currently_syncing``.

    Returns:
        Catalog: entries whose schema holds only the desired columns,
        taken from the discovered schema.
    """
    selected_entries = [
        entry for entry in catalog.streams if entry_is_selected(entry)
    ]

    resume_from = singer.get_currently_syncing(state)
    if resume_from:
        selected_entries = dropwhile(
            lambda entry: entry.tap_stream_id != resume_from,
            selected_entries)

    resolved = Catalog(streams=[])

    for catalog_entry in selected_entries:
        discovered_table = discovered.get_stream(catalog_entry.tap_stream_id)
        if not discovered_table:
            LOGGER.warning(
                'Database {} table {} selected but does not exist'.format(
                    catalog_entry.database, catalog_entry.table))
            continue

        selected = get_selected_properties(catalog_entry)
        # Columns that will actually be requested for this table.
        columns = desired_columns(selected, discovered_table.schema)

        column_schemas = {}
        for col in columns:
            column_schemas[col] = discovered_table.schema.properties[col]

        resolved.streams.append(
            CatalogEntry(tap_stream_id=catalog_entry.tap_stream_id,
                         stream=catalog_entry.stream,
                         table=catalog_entry.table,
                         schema=Schema(type='object', properties=column_schemas),
                         metadata=catalog_entry.metadata))

    return resolved
def discover():
    """Build the catalog from the schemas returned by ``load_schemas()``.

    Returns:
        Catalog: one ``CatalogEntry`` per loaded schema. Replication
        settings and the table/database attributes are left as ``None``.
    """

    def entry_for(stream_id, schema):
        # TODO: populate any further metadata and key properties here.
        stream_metadata = property.get_stream_metadata(schema)
        key_properties = property.get_key_properties(stream_id)
        return CatalogEntry(
            tap_stream_id=stream_id,
            stream=stream_id,
            schema=schema,
            key_properties=key_properties,
            metadata=stream_metadata,
            replication_key=None,
            is_view=None,
            database=None,
            table=None,
            row_count=None,
            stream_alias=None,
            replication_method=None,
        )

    raw_schemas = load_schemas()
    return Catalog([
        entry_for(stream_id, schema)
        for stream_id, schema in raw_schemas.items()
    ])
def discover(client, spreadsheet_id):
    """Build the catalog for one spreadsheet.

    Key properties come from ``STREAMS[stream_name]['key_properties']``
    when that is defined. Otherwise they fall back to the last truthy
    ``table-key-properties`` value found in the stream's metadata, or
    ``None`` when there is none.

    Args:
        client: passed through to ``get_schemas``.
        spreadsheet_id: passed through to ``get_schemas``.

    Returns:
        Catalog: one ``CatalogEntry`` per schema.
    """
    schemas, field_metadata = get_schemas(client, spreadsheet_id)
    catalog = Catalog([])

    for stream_name, schema_dict in schemas.items():
        schema = Schema.from_dict(schema_dict)
        mdata = field_metadata[stream_name]

        declared_keys = [
            item.get('metadata', {}).get('table-key-properties')
            for item in mdata
        ]
        truthy_keys = [keys for keys in declared_keys if keys]
        fallback_keys = truthy_keys[-1] if truthy_keys else None

        stream_definition = STREAMS.get(stream_name, {})
        catalog.streams.append(
            CatalogEntry(
                stream=stream_name,
                tap_stream_id=stream_name,
                key_properties=stream_definition.get(
                    'key_properties', fallback_keys),
                schema=schema,
                metadata=mdata))

    return catalog
def discover(config):
    """Build the catalog for every stream in ``STREAM_OBJECTS``.

    Credentials are checked first. The ``account`` stream is left out
    when ``is_account_endpoint_authorized`` reports the client may not
    use that endpoint.

    Args:
        config: tap configuration used to construct the ``Client``.

    Returns:
        Catalog: entries carrying standard Singer metadata.
    """
    client = Client(config)
    ensure_credentials_are_authorized(client)
    include_account_stream = is_account_endpoint_authorized(client)

    entries = []
    for stream in STREAM_OBJECTS.values():
        stream_id = stream.tap_stream_id

        if (not include_account_stream
                and stream_id == STREAM_OBJECTS['account'].tap_stream_id):
            continue

        raw_schema = load_schema(stream_id)
        schema = Schema.from_dict(raw_schema)
        standard_metadata = metadata.get_standard_metadata(
            schema=raw_schema,
            schema_name=stream_id,
            key_properties=stream.pk_fields,
            valid_replication_keys=stream.replication_keys,
            replication_method=stream.replication_method)

        entries.append(
            CatalogEntry(stream=stream_id,
                         tap_stream_id=stream_id,
                         key_properties=stream.pk_fields,
                         schema=schema,
                         metadata=standard_metadata))

    return Catalog(entries)
def discover() -> Catalog:
    """Build the catalog from the schemas returned by ``load_schemas()``.

    Metadata, key properties, replication key and replication method are
    each obtained from the matching module-level ``get_*`` helper.

    Returns:
        Catalog: one ``CatalogEntry`` per loaded schema.
    """
    entries = []

    for stream_id, schema in load_schemas().items():
        stream_metadata = get_stream_metadata(stream_id, schema.to_dict())
        key_properties = get_key_properties(stream_id)
        replication_key = get_replication_key(stream_id)
        replication_method = get_replication_method(stream_id)

        entry = CatalogEntry(
            tap_stream_id=stream_id,
            stream=stream_id,
            schema=schema,
            key_properties=key_properties,
            metadata=stream_metadata,
            replication_key=replication_key,
            is_view=None,
            database=None,
            table=None,
            row_count=None,
            stream_alias=None,
            replication_method=replication_method,
        )
        entries.append(entry)

    return Catalog(entries)
def test_bookmarks(mock_write_state):
    """``write_bookmark`` writes the expected state exactly once."""
    from singer.catalog import Catalog
    from tap_mambu.tap_mambu_refactor.tap_processors.processor import TapProcessor

    stream_name = "loan_accounts"
    catalog = Catalog.load(f"{FIXTURES_PATH}/processor_catalog.json")
    client_mock = MagicMock()
    initial_state = {'currently_syncing': 'loan_accounts'}

    processor = TapProcessor(
        catalog=catalog,
        stream_name=stream_name,
        client=client_mock,
        config=config_json,
        state=initial_state,
        sub_type="self",
        generators=[GeneratorMock([])],
    )
    processor.write_bookmark()

    mock_write_state.assert_called_once_with({
        'currently_syncing': 'loan_accounts',
        'bookmarks': {
            'loan_accounts': '2021-06-01T00:00:00Z'
        },
    })
def main_impl():
    """Parse arguments and run Oracle discovery or sync.

    Connection settings (``user``, ``password``, ``host``, ``port``,
    ``sid``) are read from the config. A truthy ``scn_window_size``
    overrides ``log_miner.SCN_WINDOW_SIZE``.

    In discovery mode, ``filter_schemas`` is split on commas and passed
    to ``do_discovery``. In sync mode, each stream's schema properties
    are re-ordered to follow that stream's ``column_order`` before the
    catalog is built.
    """
    args = utils.parse_args(REQUIRED_CONFIG_KEYS)
    config = args.config

    conn_config = {'user': config['user'],
                   'password': config['password'],
                   'host': config['host'],
                   'port': config['port'],
                   'sid': config['sid']}

    scn_window_size = config.get('scn_window_size')
    if scn_window_size:
        log_miner.SCN_WINDOW_SIZE = int(scn_window_size)

    if args.discover:
        # Look the option up once; it used to be fetched three times,
        # one of them into a local that was never read.
        filter_schemas_prop = config.get('filter_schemas')
        filter_schemas = filter_schemas_prop.split(',') if filter_schemas_prop else []
        do_discovery(conn_config, filter_schemas)
    elif args.properties:
        state = args.state

        # Rebuild each stream's properties dict in ``column_order`` order.
        for stream in args.properties['streams']:
            old_properties = stream['schema']['properties']
            stream['schema']['properties'] = {
                column: old_properties[column]
                for column in stream['column_order']
            }

        args.catalog = Catalog.from_dict(args.properties)
        do_sync(conn_config, args.catalog,
                config.get('default_replication_method'), state)
    else:
        LOGGER.info("No properties were selected")
def discover(ctx):
    """Build the catalog, marking every field as ``automatic`` inclusion.

    The event-log stream is skipped when ``has_access_to_event_log``
    reports no access.

    Args:
        ctx: tap context, passed to the access check and schema loader.

    Returns:
        Catalog: one ``CatalogEntry`` per included stream id.
    """
    LOGGER.info("Running discover")
    use_event_log = has_access_to_event_log(ctx)
    catalog = Catalog([])

    for tap_stream_id in streams_.stream_ids:
        if not use_event_log and tap_stream_id == schemas.IDS.EVENT_LOG:
            continue

        schema_dict = schemas.load_schema(ctx, tap_stream_id)
        schema = Schema.from_dict(schema_dict)

        standard = metadata.get_standard_metadata(
            schema_dict, key_properties=schemas.PK_FIELDS[tap_stream_id])
        mdata = metadata.to_map(standard)

        # Every top-level property is overridden to ``automatic``.
        for field_name in schema_dict['properties'].keys():
            breadcrumb = ('properties', field_name)
            mdata = metadata.write(mdata, breadcrumb, 'inclusion', 'automatic')

        entry = CatalogEntry(
            stream=tap_stream_id,
            tap_stream_id=tap_stream_id,
            key_properties=schemas.PK_FIELDS[tap_stream_id],
            schema=schema,
            metadata=metadata.to_list(mdata))
        catalog.streams.append(entry)

    return catalog
def main():
    """Set up the shared ``Context`` and run discovery or sync.

    The catalog is built from ``args.properties`` when supplied and from
    ``discover()`` otherwise. Whatever happens during the run, the
    client's login timer (if any) is cancelled afterwards.
    """
    args = get_args()
    jira_config = args.config

    # The client is registered on the Context before discover() may run.
    jira_client = Client(jira_config)
    Context.client = jira_client

    if args.properties:
        catalog = Catalog.from_dict(args.properties)
    else:
        catalog = discover()

    Context.config = jira_config
    Context.state = args.state
    Context.catalog = catalog

    try:
        if not args.discover:
            sync()
        else:
            discover().dump()
            print()
    finally:
        active_client = Context.client
        if active_client and active_client.login_timer:
            active_client.login_timer.cancel()
def discover(ctx):
    """Build the catalog: static-schema streams plus the ``contacts`` stream.

    For each static stream, primary-key fields get ``automatic``
    inclusion and all other fields ``available``. Root metadata from
    ``schemas.ROOT_METADATA`` is placed first when the stream has any.

    Args:
        ctx: tap context, used for the authorization check and to fetch
            the contacts schema.

    Returns:
        Catalog: the static streams followed by ``contacts``.
    """
    check_authorization(ctx)
    catalog = Catalog([])

    for tap_stream_id in schemas.STATIC_SCHEMA_STREAM_IDS:
        schema = Schema.from_dict(schemas.load_schema(tap_stream_id))

        stream_mdata = []
        if tap_stream_id in schemas.ROOT_METADATA:
            stream_mdata.append(schemas.ROOT_METADATA[tap_stream_id])

        for field_name in schema.properties.keys():
            is_key = field_name in schemas.PK_FIELDS[tap_stream_id]
            stream_mdata.append({
                'metadata': {
                    'inclusion': 'automatic' if is_key else 'available'
                },
                'breadcrumb': ['properties', field_name]
            })

        static_entry = CatalogEntry(
            stream=tap_stream_id,
            tap_stream_id=tap_stream_id,
            key_properties=schemas.PK_FIELDS[tap_stream_id],
            schema=schema,
            metadata=stream_mdata
        )
        catalog.streams.append(static_entry)

    # The contacts schema is not static; it is obtained through ``ctx``.
    contacts_schema, contact_metadata = schemas.get_contacts_schema(ctx)
    contacts_entry = CatalogEntry(
        stream='contacts',
        tap_stream_id='contacts',
        key_properties=schemas.PK_FIELDS['contacts'],
        schema=contacts_schema,
        metadata=contact_metadata
    )
    catalog.streams.append(contacts_entry)

    return catalog
def resolve_catalog(discovered_catalog, streams_to_sync):
    """Match each stream to sync against the discovered catalog.

    A column is kept when it is selected (per
    ``common.property_is_selected``) or is the stream's replication key.
    Streams missing from the discovered catalog are logged and skipped.

    Args:
        discovered_catalog: catalog from discovery; queried with
            ``get_stream``.
        streams_to_sync: iterable of catalog entries to resolve.

    Returns:
        Catalog: entries whose schema holds only the desired columns,
        taken from the discovered schema.
    """
    resolved = Catalog(streams=[])

    for catalog_entry in streams_to_sync:
        entry_metadata = metadata.to_map(catalog_entry.metadata)
        replication_key = entry_metadata.get((), {}).get('replication-key')

        discovered_table = discovered_catalog.get_stream(catalog_entry.tap_stream_id)
        database_name = common.get_database_name(catalog_entry)

        if not discovered_table:
            LOGGER.warning('Database %s table %s was selected but does not exist',
                           database_name, catalog_entry.table)
            continue

        selected = set()
        for column_name, _ in catalog_entry.schema.properties.items():
            if (common.property_is_selected(catalog_entry, column_name)
                    or column_name == replication_key):
                selected.add(column_name)

        # Columns that will actually be requested for this table.
        columns = desired_columns(selected, discovered_table.schema)

        resolved_schema = Schema(
            type='object',
            properties={col: discovered_table.schema.properties[col]
                        for col in columns}
        )
        resolved.streams.append(CatalogEntry(
            tap_stream_id=catalog_entry.tap_stream_id,
            metadata=catalog_entry.metadata,
            stream=catalog_entry.stream,
            table=catalog_entry.table,
            schema=resolved_schema
        ))

    return resolved
def discover():
    """Build the catalog, logging context when a stream cannot be prepared.

    If converting a schema or looking up its metadata raises, the error,
    the stream name and the type of the schema dict are logged before
    the exception is re-raised.

    Returns:
        Catalog: one ``CatalogEntry`` per schema returned by ``get_schemas()``.
    """
    schemas, field_metadata = get_schemas()
    catalog = Catalog([])
    flat_streams = flatten_streams()

    for stream_name, schema_dict in schemas.items():
        try:
            schema = Schema.from_dict(schema_dict)
            mdata = field_metadata[stream_name]
        except Exception as err:
            LOGGER.error(err)
            LOGGER.error('stream_name: {}'.format(stream_name))
            LOGGER.error('type schema_dict: {}'.format(type(schema_dict)))
            raise err

        stream_definition = flat_streams.get(stream_name, {})
        key_properties = stream_definition.get('key_properties', None)

        entry = CatalogEntry(
            stream=stream_name,
            tap_stream_id=stream_name,
            key_properties=key_properties,
            schema=schema,
            metadata=mdata)
        catalog.streams.append(entry)

    return catalog
def discover(config):
    """Build the catalog, omitting ``conversion_paths`` without a ``model_id``.

    Args:
        config: tap configuration; only ``model_id`` is read here.

    Returns:
        Catalog: one ``CatalogEntry`` per included schema.
    """
    model_id = config.get('model_id')
    schemas, field_metadata = get_schemas()
    catalog = Catalog([])
    flat_streams = flatten_streams()

    for stream_name, schema_dict in schemas.items():
        # The conversion_paths endpoint requires the model_id config param.
        if stream_name == 'conversion_paths' and not model_id:
            continue

        schema = Schema.from_dict(schema_dict)
        mdata = field_metadata[stream_name]
        stream_definition = flat_streams.get(stream_name, {})

        catalog.streams.append(
            CatalogEntry(
                stream=stream_name,
                tap_stream_id=stream_name,
                key_properties=stream_definition.get('key_properties', None),
                schema=schema,
                metadata=mdata))

    return catalog
def main_impl():
    """Locate the configured ad account, then run discovery or sync.

    Raises:
        TapFacebookException: when no ad account of the current user
            matches the configured ``account_id``.
    """
    args = utils.parse_args(REQUIRED_CONFIG_KEYS)
    wanted_account_id = args.config['account_id']
    token = args.config['access_token']

    CONFIG.update(args.config)

    FacebookAdsApi.init(access_token=token)
    me = fb_user.User(fbid='me')
    ad_accounts = me.get_ad_accounts()

    account = None
    # No early exit: every account is visited and the last match wins.
    for candidate in ad_accounts:
        if candidate['account_id'] == wanted_account_id:
            account = candidate

    if not account:
        raise TapFacebookException(
            "Couldn't find account with id {}".format(wanted_account_id))

    if args.discover:
        do_discover()
        return

    if not args.properties:
        LOGGER.info("No properties were selected")
        return

    catalog = Catalog.from_dict(args.properties)
    do_sync(account, catalog, args.state)
def main():
    """Entry point: run discovery or sync against Freshdesk.

    Raises:
        Exception: when ``--properties`` is given without ``--catalog``;
            the properties parameter is deprecated.
    """
    args = singer.parse_args(['start_date'])
    config = args.config
    client = FreshdeskClient(args.config_path, config)

    catalog = args.catalog or Catalog([])
    state = args.state

    if args.properties and not args.catalog:
        raise Exception("DEPRECATED: Use of the 'properties' parameter is not supported. Please use --catalog instead")

    if args.discover:
        LOGGER.info("Starting discovery mode")
        catalog = discover(client)
        write_catalog(catalog)
        return

    LOGGER.info("Starting sync mode")
    # NOTE(review): config and state are re-read here through the module's
    # own parse_args, replacing the values taken from singer.parse_args
    # above -- confirm this is intended.
    config, state = parse_args(REQUIRED_CONFIG_KEYS)
    CONFIG.update(config)
    STATE.update(state)
    sync(client, config, state, catalog)