def do_discover(sf):
    """Describes a Salesforce instance's objects and generates a JSON schema for each field."""
    global_description = sf.describe()

    objects_to_discover = {o['name'] for o in global_description['sobjects']}
    key_properties = ['Id']

    sf_custom_setting_objects = []
    object_to_tag_references = {}

    # For each SF Object describe it, loop its fields and build a schema
    entries = []

    # Check if the user has BULK API enabled
    if sf.api_type == 'BULK' and not Bulk(sf).has_permissions():
        raise TapSalesforceBulkAPIDisabledException(
            'This client does not have Bulk API permissions, received "API_DISABLED_FOR_ORG" error code')

    for sobject_name in objects_to_discover:

        # Skip blacklisted SF objects depending on the api_type in use
        # ChangeEvent objects are not queryable via Bulk or REST (undocumented)
        if sobject_name in sf.get_blacklisted_objects() \
                or sobject_name.endswith("ChangeEvent"):
            continue

        sobject_description = sf.describe(sobject_name)

        # Cache customSetting and Tag objects to check for blacklisting after
        # all objects have been described
        if sobject_description.get("customSetting"):
            sf_custom_setting_objects.append(sobject_name)
        elif sobject_name.endswith("__Tag"):
            relationship_field = next(
                (f for f in sobject_description["fields"]
                 if f.get("relationshipName") == "Item"),
                None)
            if relationship_field:
                # Map {"Object": "Object__Tag"}
                object_to_tag_references[relationship_field["referenceTo"][0]] = sobject_name

        fields = sobject_description['fields']
        replication_key = get_replication_key(sobject_name, fields)

        unsupported_fields = set()
        properties = {}
        mdata = metadata.new()

        found_id_field = False

        # Loop over the object's fields
        for f in fields:
            field_name = f['name']

            if field_name == "Id":
                found_id_field = True

            property_schema, mdata = create_property_schema(f, mdata)

            # Compound Address fields cannot be queried by the Bulk API
            if f['type'] == "address" and sf.api_type == tap_salesforce.salesforce.BULK_API_TYPE:
                unsupported_fields.add(
                    (field_name, 'cannot query compound address fields with bulk API'))

            # we haven't been able to observe any records with a json field, so we
            # are marking it as unavailable until we have an example to work with
            if f['type'] == "json":
                unsupported_fields.add(
                    (field_name, 'do not currently support json fields - please contact support'))

            # Blacklisted fields are dependent on the api_type being used
            field_pair = (sobject_name, field_name)
            if field_pair in sf.get_blacklisted_fields():
                unsupported_fields.add(
                    (field_name, sf.get_blacklisted_fields()[field_pair]))

            inclusion = metadata.get(mdata, ('properties', field_name), 'inclusion')
            if sf.select_fields_by_default and inclusion != 'unsupported':
                mdata = metadata.write(mdata, ('properties', field_name),
                                       'selected-by-default', True)

            properties[field_name] = property_schema

        if replication_key:
            mdata = metadata.write(mdata, ('properties', replication_key),
                                   'inclusion', 'automatic')

        # There are cases where compound fields are referenced by the associated
        # subfields but are not actually present in the field list
        field_name_set = {f['name'] for f in fields}
        filtered_unsupported_fields = [f for f in unsupported_fields
                                       if f[0] in field_name_set]
        missing_unsupported_field_names = [f[0] for f in unsupported_fields
                                           if f[0] not in field_name_set]

        if missing_unsupported_field_names:
            LOGGER.info(
                "Ignoring the following unsupported fields for object %s as they are missing from the field list: %s",
                sobject_name,
                ', '.join(sorted(missing_unsupported_field_names)))

        if filtered_unsupported_fields:
            LOGGER.info(
                "Not syncing the following unsupported fields for object %s: %s",
                sobject_name,
                ', '.join(sorted([k for k, _ in filtered_unsupported_fields])))

        # Salesforce Objects are skipped when they do not have an Id field
        if not found_id_field:
            LOGGER.info("Skipping Salesforce Object %s, as it has no Id field", sobject_name)
            continue

        # Any property added to unsupported_fields has metadata generated and
        # removed
        for prop, description in filtered_unsupported_fields:
            if metadata.get(mdata, ('properties', prop), 'selected-by-default'):
                metadata.delete(mdata, ('properties', prop), 'selected-by-default')

            mdata = metadata.write(mdata, ('properties', prop),
                                   'unsupported-description', description)
            mdata = metadata.write(mdata, ('properties', prop), 'inclusion', 'unsupported')

        if replication_key:
            mdata = metadata.write(mdata, (), 'valid-replication-keys', [replication_key])
        else:
            mdata = metadata.write(
                mdata, (), 'forced-replication-method', {
                    'replication-method': 'FULL_TABLE',
                    'reason': 'No replication keys found from the Salesforce API'})

        mdata = metadata.write(mdata, (), 'table-key-properties', key_properties)

        schema = {
            'type': 'object',
            'additionalProperties': False,
            'properties': properties
        }

        entry = {
            'stream': sobject_name,
            'tap_stream_id': sobject_name,
            'schema': schema,
            'metadata': metadata.to_list(mdata)
        }

        entries.append(entry)

    # For each custom setting field, remove its associated tag from entries
    # See Blacklisting.md for more information
    unsupported_tag_objects = [object_to_tag_references[f]
                               for f in sf_custom_setting_objects
                               if f in object_to_tag_references]
    if unsupported_tag_objects:
        LOGGER.info(  # pylint:disable=logging-not-lazy
            "Skipping the following Tag objects, Tags on Custom Settings Salesforce objects " +
            "are not supported by the Bulk API:")
        LOGGER.info(unsupported_tag_objects)
        entries = [e for e in entries if e['stream'] not in unsupported_tag_objects]

    result = {'streams': entries}
    json.dump(result, sys.stdout, indent=4)
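# A minimal sketch of the shape of one stream entry emitted by do_discover above.
# The object name, field list, and replication key here are illustrative
# assumptions, not output from a real Salesforce instance.
example_entry = {
    'stream': 'Account',
    'tap_stream_id': 'Account',
    'schema': {
        'type': 'object',
        'additionalProperties': False,
        'properties': {
            'Id': {'type': 'string'},
            'SystemModstamp': {'type': ['null', 'string'], 'format': 'date-time'},
        },
    },
    # metadata.to_list() serializes breadcrumb-keyed metadata into this list form
    'metadata': [
        {'breadcrumb': (), 'metadata': {
            'table-key-properties': ['Id'],
            'valid-replication-keys': ['SystemModstamp'],
        }},
        {'breadcrumb': ('properties', 'Id'), 'metadata': {'inclusion': 'automatic'}},
        {'breadcrumb': ('properties', 'SystemModstamp'), 'metadata': {'inclusion': 'automatic'}},
    ],
}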
def discover_columns(connection, table_info):
    entries = []
    for schema_name in table_info.keys():
        for table_name in table_info[schema_name].keys():
            mdata = {}
            columns = table_info[schema_name][table_name]['columns']
            table_pks = [col_name for col_name, col_info in columns.items()
                         if col_info.is_primary_key]

            with connection.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                cur.execute("SELECT current_database()")
                database_name = cur.fetchone()[0]

            metadata.write(mdata, (), 'table-key-properties', table_pks)
            metadata.write(mdata, (), 'schema-name', schema_name)
            metadata.write(mdata, (), 'database-name', database_name)
            metadata.write(mdata, (), 'row-count',
                           table_info[schema_name][table_name]['row_count'])
            metadata.write(mdata, (), 'is-view',
                           table_info[schema_name][table_name].get('is_view'))

            column_schemas = {col_name: schema_for_column(col_info)
                              for col_name, col_info in columns.items()}

            schema = {
                'type': 'object',
                'properties': column_schemas,
                'definitions': {}
            }

            schema = include_array_schemas(columns, schema)

            for c_name in column_schemas.keys():
                mdata = write_sql_data_type_md(mdata, columns[c_name])

                if column_schemas[c_name].get('type') is None:
                    mdata = metadata.write(mdata, ('properties', c_name),
                                           'inclusion', 'unsupported')
                    mdata = metadata.write(mdata, ('properties', c_name),
                                           'selected-by-default', False)
                elif table_info[schema_name][table_name]['columns'][c_name].is_primary_key:
                    mdata = metadata.write(mdata, ('properties', c_name),
                                           'inclusion', 'automatic')
                    mdata = metadata.write(mdata, ('properties', c_name),
                                           'selected-by-default', True)
                else:
                    mdata = metadata.write(mdata, ('properties', c_name),
                                           'inclusion', 'available')
                    mdata = metadata.write(mdata, ('properties', c_name),
                                           'selected-by-default', True)

            entry = {
                'table_name': table_name,
                'stream': table_name,
                'metadata': metadata.to_list(mdata),
                'tap_stream_id': post_db.compute_tap_stream_id(
                    database_name, schema_name, table_name),
                'schema': schema
            }

            entries.append(entry)

    return entries
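# A minimal sketch of the table_info structure discover_columns expects,
# assuming each column value is an object exposing an is_primary_key attribute.
# FakeColumn, the schema/table names, and the row count are all fabricated for
# illustration.
from collections import namedtuple

FakeColumn = namedtuple('FakeColumn', ['sql_data_type', 'is_primary_key'])

example_table_info = {
    'public': {
        'users': {
            'columns': {
                'id': FakeColumn('integer', True),
                'email': FakeColumn('character varying', False),
            },
            'row_count': 42,
            'is_view': False,
        }
    }
}
# entries = discover_columns(connection, example_table_info)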
def perform_field_select(self, conn_id, catalog):
    annotated_stream = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])
    schema = annotated_stream['annotated-schema']

    md = {}
    if catalog['tap_stream_id'] == 'GEO_PERFORMANCE_REPORT':
        md = self.select_all_fields_except(
            {'conversionCategory', 'conversionTrackerId', 'conversionName',
             'conversionSource', 'daysToConversion'},
            schema, md)
    if catalog['tap_stream_id'] == 'KEYWORDS_PERFORMANCE_REPORT':
        md = self.select_all_fields_except(
            {'clickType', 'daysToConversion', 'conversionCategory', 'conversionName',
             'conversionSource', 'conversionTrackerId', 'device', 'network',
             'networkWithSearchPartners', 'topVsOther', 'conversionAdjustment',
             'daysToConversionOrAdjustment'},
            schema, md)
    if catalog['tap_stream_id'] == 'accounts':
        md = self.select_all_fields_except({'customerId'}, schema, md)
    if catalog['tap_stream_id'] == 'ads':
        md = self.select_all_fields_except({'adGroupid'}, schema, md)
    if catalog['tap_stream_id'] == 'ad_groups':
        md = self.select_all_fields_except({'id'}, schema, md)
    if catalog['tap_stream_id'] == 'campaigns':
        md = self.select_all_fields_except({'id'}, schema, md)
    if catalog['tap_stream_id'] == 'AD_PERFORMANCE_REPORT':
        md = self.select_all_fields_except(
            {'conversionCategory', 'conversionTrackerId', 'conversionName',
             'conversionSource', 'clickType', 'daysToConversion', 'device',
             'topVsOther', 'conversionAdjustment', 'daysToConversionOrAdjustment'},
            schema, md)
    if catalog['tap_stream_id'] == 'ADGROUP_PERFORMANCE_REPORT':
        md = self.select_all_fields_except(
            {'conversionCategory', 'conversionTrackerId', 'conversionName',
             'conversionSource', 'clickType', 'daysToConversion', 'device',
             'topVsOther', 'hourOfDay', 'conversionAdjustment',
             'daysToConversionOrAdjustment'},
            schema, md)
    if catalog['tap_stream_id'] == 'SEARCH_QUERY_PERFORMANCE_REPORT':
        md = self.select_all_fields_except(
            {'conversionCategory', 'conversionTrackerId', 'conversionName',
             'conversionSource', 'daysToConversion'},
            schema, md)
    if catalog['tap_stream_id'] == 'KEYWORDLESS_QUERY_REPORT':
        md = self.select_all_fields_except(
            {'conversionCategory', 'conversionTrackerId', 'conversionName',
             'conversionSource'},
            schema, md)
    if catalog['tap_stream_id'] == 'CAMPAIGN_PERFORMANCE_REPORT':
        md = self.select_all_fields_except(
            {'conversionCategory', 'conversionTrackerId', 'conversionName',
             'conversionSource', 'clickType', 'device', 'topVsOther', 'hourOfDay',
             'avgImprFreqPerCookie', 'uniqueCookies', 'daysToConversion',
             'conversionAdjustment', 'daysToConversionOrAdjustment', 'adEventType'},
            schema, md)
    if catalog['tap_stream_id'] == 'CLICK_PERFORMANCE_REPORT':
        md = self.select_all_fields_except(set(), schema, md)
    if catalog['tap_stream_id'] == 'CRITERIA_PERFORMANCE_REPORT':
        md = self.select_all_fields_except(
            {'conversionCategory', 'conversionTrackerId', 'conversionName',
             'conversionSource', 'clickType', 'topVsOther', 'daysToConversion',
             'conversionAdjustment', 'daysToConversionOrAdjustment'},
            schema, md)
    if catalog['tap_stream_id'] == 'GENDER_PERFORMANCE_REPORT':
        md = self.select_all_fields_except(
            {'conversionCategory', 'conversionTrackerId', 'conversionName',
             'conversionSource', 'clickType', 'daysToConversion'},
            schema, md)
    if catalog['tap_stream_id'] == 'AGE_RANGE_PERFORMANCE_REPORT':
        md = self.select_all_fields_except(
            {'conversionCategory', 'conversionTrackerId', 'conversionName',
             'conversionSource', 'clickType', 'daysToConversion'},
            schema, md)
    if catalog['tap_stream_id'] == 'AUDIENCE_PERFORMANCE_REPORT':
        md = self.select_all_fields_except(
            {'topVsOther', 'conversionCategory', 'conversionTrackerId',
             'conversionName', 'conversionSource', 'clickType', 'daysToConversion'},
            schema, md)
    if catalog['tap_stream_id'] == 'FINAL_URL_REPORT':
        md = self.select_all_fields_except(
            {'topVsOther', 'conversionCategory', 'conversionTrackerId',
             'conversionName', 'conversionSource', 'clickType', 'daysToConversion'},
            schema, md)
    # if catalog['tap_stream_id'] == 'CREATIVE_CONVERSION_REPORT':
    #     md = self.select_all_fields_except(
    #         {'topVsOther', 'conversionCategory', 'conversionTrackerId',
    #          'conversionName', 'conversionSource', 'clickType'},
    #         schema, md)

    return {
        'key_properties': catalog.get('key_properties'),
        'schema': schema,
        'tap_stream_id': catalog.get('tap_stream_id'),
        'replication_method': catalog.get('replication_method'),
        'replication_key': catalog.get('replication_key'),
        'metadata': metadata.to_list(md)
    }
def discover_catalog(snowflake_conn, config):
    """Returns a Catalog describing the structure of the database."""
    tables = config.get('tables').split(',')
    sql_columns = get_table_columns(snowflake_conn, tables)

    table_info = {}
    columns = []
    for sql_col in sql_columns:
        catalog = sql_col['TABLE_CATALOG']
        schema = sql_col['TABLE_SCHEMA']
        table_name = sql_col['TABLE_NAME']

        if catalog not in table_info:
            table_info[catalog] = {}
        if schema not in table_info[catalog]:
            table_info[catalog][schema] = {}

        table_info[catalog][schema][table_name] = {
            'row_count': sql_col.get('ROW_COUNT'),
            'is_view': sql_col.get('TABLE_TYPE') == 'VIEW'
        }

        columns.append(Column(
            table_catalog=catalog,
            table_schema=schema,
            table_name=table_name,
            column_name=sql_col['COLUMN_NAME'],
            data_type=sql_col['DATA_TYPE'],
            character_maximum_length=sql_col['CHARACTER_MAXIMUM_LENGTH'],
            numeric_precision=sql_col['NUMERIC_PRECISION'],
            numeric_scale=sql_col['NUMERIC_SCALE']))

    entries = []
    for (k, cols) in itertools.groupby(
            columns, lambda c: (c.table_catalog, c.table_schema, c.table_name)):
        cols = list(cols)
        (table_catalog, table_schema, table_name) = k
        schema = Schema(type='object',
                        properties={c.column_name: schema_for_column(c) for c in cols})

        md = create_column_metadata(cols)
        md_map = metadata.to_map(md)
        md_map = metadata.write(md_map, (), 'database-name', table_catalog)
        md_map = metadata.write(md_map, (), 'schema-name', table_schema)

        if (table_catalog in table_info
                and table_schema in table_info[table_catalog]
                and table_name in table_info[table_catalog][table_schema]):
            # Row count for views comes back NULL - default it to 0 so the
            # catalog always carries a non-null integer
            row_count = table_info[table_catalog][table_schema][table_name].get(
                'row_count', 0) or 0
            is_view = table_info[table_catalog][table_schema][table_name]['is_view']

            md_map = metadata.write(md_map, (), 'row-count', row_count)
            md_map = metadata.write(md_map, (), 'is-view', is_view)

        entry = CatalogEntry(
            table=table_name,
            stream=table_name,
            metadata=metadata.to_list(md_map),
            tap_stream_id=common.generate_tap_stream_id(
                table_catalog, table_schema, table_name),
            schema=schema)
        entries.append(entry)

    return Catalog(entries)
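# A short illustration of the singer-python metadata helpers these discovery
# functions lean on: to_map()/write()/get() operate on a map keyed by breadcrumb
# tuples, and to_list() serializes it back into catalog form. The database name
# 'ANALYTICS' is an invented example value.
from singer import metadata

md_map = metadata.to_map([])  # start from an empty metadata list
md_map = metadata.write(md_map, (), 'database-name', 'ANALYTICS')
md_map = metadata.write(md_map, ('properties', 'ID'), 'inclusion', 'automatic')

assert metadata.get(md_map, ('properties', 'ID'), 'inclusion') == 'automatic'
print(metadata.to_list(md_map))
# [{'breadcrumb': (), 'metadata': {'database-name': 'ANALYTICS'}},
#  {'breadcrumb': ('properties', 'ID'), 'metadata': {'inclusion': 'automatic'}}]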
def discover_catalog(connection):
    cursor = connection.cursor()
    cursor.execute("""
        SELECT table_schema,
               table_name,
               column_name,
               data_type,
               character_maximum_length,
               numeric_precision,
               numeric_scale
          FROM information_schema.columns
         WHERE table_schema != 'INFORMATION_SCHEMA'
         ORDER BY table_schema, table_name
    """)

    columns = []
    rec = cursor.fetchone()
    while rec is not None:
        columns.append(Column(*rec))
        rec = cursor.fetchone()

    entries = []
    for (k, cols) in itertools.groupby(columns, lambda c: (c.table_schema, c.table_name)):
        cols = list(cols)
        (table_schema, table_name) = k

        md = create_column_metadata(cols)
        md_map = metadata.to_map(md)

        if "events" in table_name.lower():
            key_properties = ['UUID']
            valid_replication_keys = ["SERVER_UPLOAD_TIME"]
            replication_key = "SERVER_UPLOAD_TIME"
        elif "merge" in table_name.lower():
            key_properties = []
            valid_replication_keys = ["MERGE_EVENT_TIME"]
            replication_key = "MERGE_EVENT_TIME"
        else:
            replication_key = ""
            key_properties = []
            valid_replication_keys = []

        properties = {}
        for c in cols:
            if c.column_name == replication_key or c.column_name in key_properties:
                properties[c.column_name] = schema_for_column(c, "automatic")
            else:
                properties[c.column_name] = schema_for_column(c, "available")

        schema = Schema(type='object', properties=properties)

        md_map = metadata.write(md_map, (), 'table-key-properties', key_properties)
        md_map = metadata.write(md_map, (), 'valid-replication-keys', valid_replication_keys)
        # Only write property-level metadata when a replication key exists;
        # the empty string would otherwise create a bogus ('properties', '') breadcrumb
        if replication_key:
            md_map = metadata.write(md_map, ('properties', replication_key),
                                    'inclusion', 'automatic')

        entry = CatalogEntry(
            stream=table_name,
            metadata=metadata.to_list(md_map),
            tap_stream_id=table_schema + "-" + table_name,
            replication_key=replication_key,    # This is a non-discoverable key.
            replication_method="INCREMENTAL",   # This is a non-discoverable key.
            schema=schema)

        entries.append(entry)

    return Catalog(entries)
def discover_catalog(mssql_conn, config):
    """Returns a Catalog describing the structure of the database."""
    LOGGER.info("Preparing Catalog")
    mssql_conn = MSSQLConnection(config)
    filter_dbs_config = config.get("filter_dbs")

    if filter_dbs_config:
        filter_dbs_clause = ",".join(
            ["'{}'".format(db) for db in filter_dbs_config.split(",")])
        table_schema_clause = "WHERE c.table_schema IN ({})".format(filter_dbs_clause)
    else:
        table_schema_clause = """
        WHERE c.table_schema NOT IN (
            'information_schema',
            'performance_schema',
            'sys'
        )"""

    with connect_with_backoff(mssql_conn) as open_conn:
        cur = open_conn.cursor()
        LOGGER.info("Fetching tables")
        cur.execute("""SELECT table_schema,
                              table_name,
                              table_type
                         FROM information_schema.tables c
                        {}
                    """.format(table_schema_clause))
        table_info = {}

        for (db, table, table_type) in cur.fetchall():
            if db not in table_info:
                table_info[db] = {}
            table_info[db][table] = {
                "row_count": None,
                "is_view": table_type == "VIEW"
            }

        LOGGER.info("Tables fetched, fetching columns")
        cur.execute("""with constraint_columns as (
                        select c.table_schema, c.table_name, c.column_name
                          from information_schema.constraint_column_usage c
                          join information_schema.table_constraints tc
                            on tc.table_schema = c.table_schema
                           and tc.table_name = c.table_name
                           and tc.constraint_name = c.constraint_name
                           and tc.constraint_type in ('PRIMARY KEY', 'UNIQUE'))
                       SELECT c.table_schema,
                              c.table_name,
                              c.column_name,
                              data_type,
                              character_maximum_length,
                              numeric_precision,
                              numeric_scale,
                              case when cc.column_name is null then 0 else 1 end
                         FROM information_schema.columns c
                         left join constraint_columns cc
                           on cc.table_name = c.table_name
                          and cc.table_schema = c.table_schema
                          and cc.column_name = c.column_name
                        {}
                        ORDER BY c.table_schema, c.table_name
                    """.format(table_schema_clause))

        columns = []
        rec = cur.fetchone()
        while rec is not None:
            columns.append(Column(*rec))
            rec = cur.fetchone()

        LOGGER.info("Columns Fetched")
        entries = []
        for (k, cols) in itertools.groupby(columns, lambda c: (c.table_schema, c.table_name)):
            cols = list(cols)
            (table_schema, table_name) = k
            schema = Schema(
                type="object",
                properties={c.column_name: schema_for_column(c) for c in cols})
            md = create_column_metadata(cols)
            md_map = metadata.to_map(md)

            md_map = metadata.write(md_map, (), "database-name", table_schema)

            # Only look up row-count / is-view once we know the table was seen in
            # the first query, so the lookup cannot raise a KeyError
            if table_schema in table_info and table_name in table_info[table_schema]:
                is_view = table_info[table_schema][table_name]["is_view"]
                row_count = table_info[table_schema][table_name].get("row_count")
                if row_count is not None:
                    md_map = metadata.write(md_map, (), "row-count", row_count)
                md_map = metadata.write(md_map, (), "is-view", is_view)

            key_properties = [c.column_name for c in cols if c.is_primary_key == 1]
            md_map = metadata.write(md_map, (), "table-key-properties", key_properties)

            entry = CatalogEntry(
                table=table_name,
                stream=table_name,
                metadata=metadata.to_list(md_map),
                tap_stream_id=common.generate_tap_stream_id(table_schema, table_name),
                schema=schema,
            )

            entries.append(entry)

    LOGGER.info("Catalog ready")
    return Catalog(entries)
def discover_streams(config):
    streams = []

    LOGGER.info("Discovering custom fields for Accounts")
    custom_account_fields = STREAMS['metadata_accounts'](config).get_fields().get('custom')

    LOGGER.info("Discovering custom fields for Visitors")
    custom_visitor_fields = STREAMS['metadata_visitors'](config).get_fields().get('custom')

    for s in STREAMS.values():
        LOGGER.info("Discovering stream %s", s.name)
        s = s(config)
        schema = s.load_schema()
        mdata = metadata.to_map(s.load_metadata())

        if s.name == 'accounts':
            LOGGER.info("Discovering custom fields for Accounts")
            instance = STREAMS['metadata_accounts'](config)
            custom_account_fields = get_custom_fields(instance)
            for key, value in custom_account_fields.items():
                if 'metadata_custom' not in schema['properties']:
                    schema['properties']['metadata_custom'] = {}
                # Accumulate each custom field under metadata_custom.properties
                # instead of overwriting the sub-schema on every iteration
                schema['properties']['metadata_custom'].setdefault('properties', {})[key] = \
                    get_schema_propery_type(value.get('type'))
                mdata = metadata.write(mdata, ('properties', 'metadata_custom'),
                                       'inclusion', 'available')

        if s.name == 'visitors':
            for key, value in custom_visitor_fields.items():
                if 'metadata_custom' not in schema['properties']:
                    schema['properties']['metadata_custom'] = {}
                schema['properties']['metadata_custom'].setdefault('properties', {})[key] = \
                    get_schema_propery_type(value.get('type'))
                mdata = metadata.write(mdata, ('properties', 'metadata_custom'),
                                       'inclusion', 'available')

        if s.name == 'metadata_accounts':
            for key, value in custom_account_fields.items():
                if 'custom' not in schema['properties']:
                    schema['properties']['custom'] = {}
                    schema['properties']['custom']['properties'] = {}
                schema['properties']['custom']['properties'][key] = {
                    'type': ["null", "object"],
                    'additional_properties': "false",
                    'properties': metadata_fields
                }
                mdata = metadata.write(mdata, ('properties', 'custom'),
                                       'inclusion', 'available')

        if s.name == 'metadata_visitors':
            for key, value in custom_visitor_fields.items():
                if 'custom' not in schema['properties']:
                    schema['properties']['custom'] = {}
                    schema['properties']['custom']['properties'] = {}
                schema['properties']['custom']['properties'][key] = {
                    'type': ["null", "object"],
                    'additional_properties': "false",
                    'properties': metadata_fields
                }
                mdata = metadata.write(mdata, ('properties', 'custom'),
                                       'inclusion', 'available')

        stream = {
            'stream': s.name,
            'tap_stream_id': s.name,
            'schema': schema,
            'metadata': metadata.to_list(mdata)
        }
        streams.append(stream)

    return streams
def do_discover(sf):
    """Describes a Salesforce instance's objects and generates a JSON schema for each field."""
    global_description = sf.describe()

    objects_set = {o["name"] for o in global_description["sobjects"]}
    objects_to_discover = [
        "Account", "Contact", "Lead", "Opportunity", "Campaign",
        "AccountContactRelation", "AccountContactRole", "OpportunityContactRole",
        "CampaignMember", "Task", "Invoice__c", "OpportunityHistory",
        "AccountHistory", "LeadHistory", "User",
    ]
    key_properties = ["Id"]

    sf_custom_setting_objects = []
    object_to_tag_references = {}

    # For each SF Object describe it, loop its fields and build a schema
    entries = []

    # Check if the user has BULK API enabled
    if sf.api_type == "BULK" and not Bulk(sf).has_permissions():
        raise TapSalesforceBulkAPIDisabledException(
            'This client does not have Bulk API permissions, received "API_DISABLED_FOR_ORG" error code')

    for sobject_name in objects_to_discover:

        # Skip blacklisted SF objects depending on the api_type in use
        # ChangeEvent objects are not queryable via Bulk or REST (undocumented)
        if sobject_name in sf.get_blacklisted_objects() or sobject_name.endswith("ChangeEvent"):
            continue

        if sobject_name not in objects_set:
            continue

        sobject_description = sf.describe(sobject_name)

        # Cache customSetting and Tag objects to check for blacklisting after
        # all objects have been described
        if sobject_description.get("customSetting"):
            sf_custom_setting_objects.append(sobject_name)
        elif sobject_name.endswith("__Tag"):
            relationship_field = next(
                (f for f in sobject_description["fields"]
                 if f.get("relationshipName") == "Item"),
                None,
            )
            if relationship_field:
                # Map {"Object": "Object__Tag"}
                object_to_tag_references[relationship_field["referenceTo"][0]] = sobject_name

        fields = sobject_description["fields"]
        replication_key = get_replication_key(sobject_name, fields)

        unsupported_fields = set()
        properties = {}
        mdata = metadata.new()

        found_id_field = False

        # Loop over the object's fields
        for f in fields:
            field_name = f["name"]

            if field_name == "Id":
                found_id_field = True

            property_schema, mdata = create_property_schema(f, mdata)

            # Compound Address fields cannot be queried by the Bulk API
            if (f["type"] == "address"
                    and sf.api_type == tap_salesforce.salesforce.BULK_API_TYPE):
                unsupported_fields.add(
                    (field_name, "cannot query compound address fields with bulk API"))

            # we haven't been able to observe any records with a json field, so we
            # are marking it as unavailable until we have an example to work with
            if f["type"] == "json":
                unsupported_fields.add(
                    (field_name, "do not currently support json fields - please contact support"))

            # Blacklisted fields are dependent on the api_type being used
            field_pair = (sobject_name, field_name)
            if field_pair in sf.get_blacklisted_fields():
                unsupported_fields.add(
                    (field_name, sf.get_blacklisted_fields()[field_pair]))

            inclusion = metadata.get(mdata, ("properties", field_name), "inclusion")
            if sf.select_fields_by_default and inclusion != "unsupported":
                mdata = metadata.write(mdata, ("properties", field_name),
                                       "selected-by-default", True)

            properties[field_name] = property_schema

        if replication_key:
            mdata = metadata.write(mdata, ("properties", replication_key),
                                   "inclusion", "automatic")

        # There are cases where compound fields are referenced by the associated
        # subfields but are not actually present in the field list
        field_name_set = {f["name"] for f in fields}
        filtered_unsupported_fields = [f for f in unsupported_fields
                                       if f[0] in field_name_set]
        missing_unsupported_field_names = [f[0] for f in unsupported_fields
                                           if f[0] not in field_name_set]

        if missing_unsupported_field_names:
            LOGGER.info(
                "Ignoring the following unsupported fields for object %s as they are missing from the field list: %s",
                sobject_name,
                ", ".join(sorted(missing_unsupported_field_names)),
            )

        if filtered_unsupported_fields:
            LOGGER.info(
                "Not syncing the following unsupported fields for object %s: %s",
                sobject_name,
                ", ".join(sorted([k for k, _ in filtered_unsupported_fields])),
            )

        # Salesforce Objects are skipped when they do not have an Id field
        if not found_id_field:
            LOGGER.info("Skipping Salesforce Object %s, as it has no Id field", sobject_name)
            continue

        # Any property added to unsupported_fields has metadata generated and
        # removed
        for prop, description in filtered_unsupported_fields:
            if metadata.get(mdata, ("properties", prop), "selected-by-default"):
                metadata.delete(mdata, ("properties", prop), "selected-by-default")

            mdata = metadata.write(mdata, ("properties", prop),
                                   "unsupported-description", description)
            mdata = metadata.write(mdata, ("properties", prop), "inclusion", "unsupported")

        if replication_key:
            mdata = metadata.write(mdata, (), "valid-replication-keys", [replication_key])
        else:
            mdata = metadata.write(
                mdata, (), "forced-replication-method", {
                    "replication-method": "FULL_TABLE",
                    "reason": "No replication keys found from the Salesforce API",
                })

        mdata = metadata.write(mdata, (), "table-key-properties", key_properties)
        mdata = metadata.write(mdata, (), "selected", True)

        schema = {
            "type": "object",
            "additionalProperties": False,
            "properties": properties,
        }

        entry = {
            "stream": sobject_name,
            "tap_stream_id": sobject_name,
            "schema": schema,
            "metadata": metadata.to_list(mdata),
        }

        entries.append(entry)

    # For each custom setting field, remove its associated tag from entries
    # See Blacklisting.md for more information
    unsupported_tag_objects = [object_to_tag_references[f]
                               for f in sf_custom_setting_objects
                               if f in object_to_tag_references]
    if unsupported_tag_objects:
        LOGGER.info(  # pylint:disable=logging-not-lazy
            "Skipping the following Tag objects, Tags on Custom Settings Salesforce objects " +
            "are not supported by the Bulk API:")
        LOGGER.info(unsupported_tag_objects)
        entries = [e for e in entries if e["stream"] not in unsupported_tag_objects]

    result = {"streams": entries}
    return result
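# A small sketch of how a sync stage might honor the stream-level "selected"
# flag this variant writes; `entry` is assumed to be one element of
# result['streams'] returned by the function above.
from singer import metadata

def is_stream_selected(entry):
    md_map = metadata.to_map(entry['metadata'])
    return metadata.get(md_map, (), 'selected') is True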
def discover_columns(connection, table_info, filter_schemas):
    cur = connection.cursor()

    binds_sql = [":{}".format(b) for b in range(len(filter_schemas))]
    if binds_sql:
        sql = """
        SELECT OWNER, TABLE_NAME, COLUMN_NAME, DATA_TYPE, DATA_LENGTH,
               CHAR_LENGTH, CHAR_USED, DATA_PRECISION, DATA_SCALE
          FROM all_tab_columns
         WHERE OWNER != 'SYS' AND owner IN ({})
         ORDER BY owner, table_name, column_name
        """.format(",".join(binds_sql))
    else:
        sql = """
        SELECT OWNER, TABLE_NAME, COLUMN_NAME, DATA_TYPE, DATA_LENGTH,
               CHAR_LENGTH, CHAR_USED, DATA_PRECISION, DATA_SCALE
          FROM all_tab_columns
         WHERE OWNER != 'SYS'
         ORDER BY owner, table_name, column_name
        """

    LOGGER.info("fetching column info")
    cur.execute(sql, filter_schemas)

    columns = []
    rec = cur.fetchone()
    while rec is not None:
        columns.append(Column(*rec))
        rec = cur.fetchone()

    pk_constraints = produce_pk_constraints(connection, filter_schemas)

    entries = []
    for (k, cols) in itertools.groupby(columns, lambda c: (c.table_schema, c.table_name)):
        cols = list(cols)
        (table_schema, table_name) = k
        pks_for_table = pk_constraints.get(table_schema, {}).get(table_name, [])

        column_schemas = {c.column_name: schema_for_column(c, pks_for_table)
                          for c in cols}
        schema = Schema(type='object', properties=column_schemas)

        md = produce_column_metadata(connection, table_info, table_schema,
                                     table_name, pk_constraints,
                                     column_schemas, cols)

        entry = {
            'table_name': table_name,
            'stream': table_name,
            'metadata': metadata.to_list(md),
            'tap_stream_id': table_schema + '-' + table_name,
            'schema': schema.to_dict(),
            'column_order': [str(column) for column in column_schemas]
        }
        entries.append(entry)

    return {'streams': entries}
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--output', '-o', type=str, required=True)

    if sys.stdin.isatty():
        # No catalog piped in, so require an --input file
        parser.add_argument('--input', '-i', type=str, required=True)
        args = parser.parse_args()
        with open(args.input) as f:
            catalog = json.load(f)
    else:
        args = parser.parse_args()
        catalog = json.loads(sys.stdin.read())
        sys.stdin = sys.stdout

    logger.info("Catalog configuration starting...")

    select_streams = {
        'type': 'checkbox',
        'message': 'Select Streams',
        'name': 'streams',
        'choices': [{'name': stream['stream']} for stream in catalog['streams']]
    }
    selected_streams = prompt(select_streams)

    for i, stream in enumerate(catalog['streams']):
        mdata = metadata.to_map(stream['metadata'])

        if stream['stream'] not in selected_streams['streams']:
            mdata = metadata.write(mdata, (), 'selected', False)
        else:
            mdata = metadata.write(mdata, (), 'selected', True)

            fields = []
            field_reference = {}
            for breadcrumb, field in mdata.items():
                if breadcrumb != ():
                    selected, disabled = False, False
                    if metadata.get(mdata, breadcrumb, 'inclusion') == 'automatic':
                        selected, disabled = True, "automatic"
                    elif metadata.get(mdata, breadcrumb, 'selected-by-default'):
                        selected, disabled = True, False

                    name = breadcrumb_name(breadcrumb)
                    field_reference[name] = breadcrumb
                    fields.append({
                        'name': name,
                        'checked': selected,
                        'disabled': disabled
                    })

            stream_options = {
                'type': 'checkbox',
                'message': 'Select fields from stream: `{}`'.format(stream['stream']),
                'name': 'fields',
                'choices': fields
            }
            selections = prompt(stream_options)
            selections = [field_reference[n] for n in selections['fields']
                          if n != "Select All"]

            for breadcrumb in mdata.keys():
                if breadcrumb != ():
                    if (metadata.get(mdata, breadcrumb, 'inclusion') == "automatic") \
                            or (breadcrumb in selections):
                        mdata = metadata.write(mdata, breadcrumb, 'selected', True)
                    else:
                        mdata = metadata.write(mdata, breadcrumb, 'selected', False)

        catalog['streams'][i]['metadata'] = metadata.to_list(mdata)

    logger.info("Catalog configuration saved.")
    with open(args.output, 'w') as f:
        json.dump(catalog, f, indent=2)
def discover_stream(client, stream_name, force_rest):  # pylint: disable=too-many-branches
    try:
        field_dict = get_field_dict(client, stream_name)
    except ApiException:
        return None

    properties = {}
    mdata = metadata.new()

    # Include entry with no breadcrumbs, required for PPW
    metadata.write(mdata, (), 'table-key-properties', ["Id"])

    for field_name, props in field_dict.items():
        field_properties = {}

        if props.get("joined", False):
            split_field_name = field_name.split(".")
            field_name = field_name.replace(".", "")
            mdata = metadata.write(mdata, ('properties', field_name),
                                   'tap-zuora.joined_object', split_field_name[0])

        if props["type"] in ["date", "datetime"]:
            field_properties["type"] = "string"
            field_properties["format"] = "date-time"
        else:
            field_properties["type"] = props["type"]

        if props["supported"]:
            field_properties["type"] = [field_properties["type"], "null"]

        if field_name in REQUIRED_KEYS:
            mdata = metadata.write(mdata, ('properties', field_name),
                                   'inclusion', 'automatic')
        elif props["supported"]:
            # PPW can not perform its own selection, but relies on the tap default settings.
            # So for now, include all the supported fields instead of leaving the
            # choice to the user:
            # mdata = metadata.write(mdata, ('properties', field_name), 'inclusion', 'available')
            mdata = metadata.write(mdata, ('properties', field_name),
                                   'inclusion', 'automatic')
        else:
            mdata = metadata.write(mdata, ('properties', field_name),
                                   'inclusion', 'unsupported')

        properties[field_name] = field_properties

    # Zuora sends back more entities than are actually available. We need to
    # run a sample export to test if the stream is available. If we are using
    # AQuA, we also need to see if we can use the Deleted property for that
    # stream.
    if force_rest:
        status = apis.Rest.stream_status(client, stream_name)
    else:
        status = apis.Aqua.stream_status(client, stream_name)

    # If the entity is unavailable, we need to return None
    if status == "unavailable":
        return None

    if status == "available_with_deleted":
        properties["Deleted"] = {"type": "boolean"}
        mdata = metadata.write(mdata, ('properties', 'Deleted'),
                               'inclusion', 'available')

    stream = {
        "tap_stream_id": stream_name,
        "stream": stream_name,
        "key_properties": ["Id"],
        "schema": {
            "type": "object",
            "additionalProperties": False,
            "properties": properties,
        },
        "metadata": metadata.to_list(mdata)
    }

    replication_key = get_replication_key(properties)
    if replication_key:
        stream["replication_key"] = replication_key
        stream["replication_method"] = "INCREMENTAL"
    else:
        stream["replication_method"] = "FULL_TABLE"

    return stream
def get_schemas(client):
    schemas = {}
    field_metadata = {}

    for stream_name, stream_metadata in STREAMS.items():
        schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
        with open(schema_path) as file:
            schema = json.load(file)
        schemas[stream_name] = schema
        mdata = metadata.new()

        # Documentation:
        # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
        # Reference:
        # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
        mdata = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_metadata.get('key_properties', None),
            valid_replication_keys=stream_metadata.get('replication_keys', None),
            replication_method=stream_metadata.get('replication_method', None))

        field_metadata[stream_name] = mdata

    # Limit report endpoints to those available to the account
    endpoint = 'report_types'
    report_type_data = client.get(
        url='https://youtubereporting.googleapis.com/v1',
        path='reportTypes',
        endpoint=endpoint)
    report_types = report_type_data.get('reportTypes', [])

    for report_type in report_types:
        # report_name = report id minus the version (last 3 chars)
        report_name = report_type.get('id')[:-3]
        report_metadata = REPORTS.get(report_name, {})

        schema_path = get_abs_path('schemas/reports.json')
        with open(schema_path) as file:
            schema = json.load(file)

        # dimensions, metrics, keys lists
        dimensions = report_metadata.get('dimensions', [])
        metrics = report_metadata.get('metrics', [])
        key_properties = ['dimensions_hash_key']
        report_fields = ['report_id', 'report_type_id', 'report_name', 'create_time']
        combined_list = [*dimensions, *metrics, *key_properties, *report_fields]

        # remove keys not in combined_list
        remove = [key for key in schema['properties'] if key not in combined_list]
        for key in remove:
            del schema['properties'][key]

        schemas[report_name] = schema
        mdata = metadata.new()
        mdata = metadata.get_standard_metadata(
            schema=schema,
            key_properties=key_properties,
            valid_replication_keys=['create_time'],
            replication_method='INCREMENTAL')

        # Set dimensions and create_time (bookmark) as automatic inclusion
        mdata_map = metadata.to_map(mdata)
        for dimension in dimensions:
            mdata_map[('properties', dimension)]['inclusion'] = 'automatic'
        mdata_map[('properties', 'create_time')]['inclusion'] = 'automatic'
        mdata = metadata.to_list(mdata_map)

        field_metadata[report_name] = mdata

    return schemas, field_metadata
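# A compact illustration of singer-python's get_standard_metadata helper used
# above; the schema and key fields here are invented for the example.
from singer import metadata

example_schema = {
    'properties': {
        'id': {'type': 'string'},
        'updated_at': {'type': 'string', 'format': 'date-time'},
    }
}
example_mdata = metadata.get_standard_metadata(
    schema=example_schema,
    key_properties=['id'],
    valid_replication_keys=['updated_at'],
    replication_method='INCREMENTAL')
# Returns a metadata list with a () breadcrumb carrying table-key-properties,
# valid-replication-keys, and forced-replication-method, plus one
# ('properties', <field>) entry per schema property.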
def unselect_column(our_stream, col):
    md = metadata.to_map(our_stream['metadata'])
    md.get(('properties', col))['selected'] = False
    our_stream['metadata'] = metadata.to_list(md)
    return our_stream
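# A tiny usage sketch for unselect_column; the stream dict is fabricated and
# assumes the metadata list already contains a ('properties', 'Name') breadcrumb.
stream = {
    'metadata': [
        {'breadcrumb': ('properties', 'Name'), 'metadata': {'selected': True}},
    ]
}
stream = unselect_column(stream, 'Name')
# stream['metadata'] now marks ('properties', 'Name') as selected: False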
def do_discover(qb):
    """Describes a Quickbooks instance's objects and generates a JSON schema for each field."""
    objects_to_discover = qb.describe()
    key_properties = ['Id']

    # For each Quickbooks object describe it, loop its fields and build a schema
    entries = []
    for sobject_name in objects_to_discover:
        fields = qb.describe(sobject_name)

        replication_key = REPLICATION_KEY
        if sobject_name.endswith('Report'):
            replication_key = None

        properties = {}
        mdata = metadata.new()

        # Loop over the object's fields
        for f in fields:
            field_name = f['name']

            property_schema, mdata = create_property_schema(f, mdata)

            if qb.select_fields_by_default:
                mdata = metadata.write(mdata, ('properties', field_name),
                                       'selected-by-default', True)

            properties[field_name] = property_schema

        if replication_key:
            mdata = metadata.write(mdata, ('properties', replication_key),
                                   'inclusion', 'automatic')
            mdata = metadata.write(mdata, (), 'valid-replication-keys', [replication_key])
        else:
            mdata = metadata.write(
                mdata, (), 'forced-replication-method', {
                    'replication-method': 'FULL_TABLE',
                    'reason': 'No replication keys found from the Quickbooks API'})

        mdata = metadata.write(mdata, (), 'table-key-properties', key_properties)

        schema = {
            'type': 'object',
            'additionalProperties': False,
            'properties': properties
        }

        entry = {
            'stream': sobject_name,
            'tap_stream_id': sobject_name,
            'schema': schema,
            'metadata': metadata.to_list(mdata)
        }
        entries.append(entry)

    result = {'streams': entries}
    json.dump(result, sys.stdout, indent=4)
def discover_catalog(mysql_conn: Dict, dbs: str = None, tables: Optional[str] = None):
    """Returns a Catalog describing the structure of the database."""
    if dbs:
        filter_dbs_clause = ",".join([f"'{db_name}'" for db_name in dbs.split(",")])
        table_schema_clause = f"WHERE table_schema IN ({filter_dbs_clause})"
    else:
        table_schema_clause = """
        WHERE table_schema NOT IN (
            'information_schema',
            'performance_schema',
            'mysql',
            'sys'
        )"""

    tables_clause = ""
    if tables is not None and tables != "":
        filter_tables_clause = ",".join(
            [f"'{table_name}'" for table_name in tables.split(",")])
        tables_clause = f" AND table_name IN ({filter_tables_clause})"

    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            cur.execute(
                f"""
                SELECT table_schema,
                       table_name,
                       table_type,
                       table_rows
                  FROM information_schema.tables
                {table_schema_clause}{tables_clause}
                """
            )

            table_info = {}
            for (db_name, table, table_type, rows) in cur.fetchall():
                if db_name not in table_info:
                    table_info[db_name] = {}
                table_info[db_name][table] = {
                    "row_count": rows,
                    "is_view": table_type == "VIEW",
                }

            cur.execute(
                f"""
                SELECT table_schema,
                       table_name,
                       column_name,
                       data_type,
                       character_maximum_length,
                       numeric_precision,
                       numeric_scale,
                       column_type,
                       column_key
                  FROM information_schema.columns
                {table_schema_clause}{tables_clause}
                 ORDER BY table_schema, table_name
                """
            )

            columns = []
            rec = cur.fetchone()
            while rec is not None:
                columns.append(Column(*rec))
                rec = cur.fetchone()

            entries = []
            for (k, cols) in itertools.groupby(
                    columns, lambda c: (c.table_schema, c.table_name)):
                cols = list(cols)
                (table_schema, table_name) = k
                schema = Schema(
                    type="object",
                    properties={c.column_name: schema_for_column(c) for c in cols},
                )
                mdata = create_column_metadata(cols)
                md_map = metadata.to_map(mdata)

                md_map = metadata.write(md_map, (), "database-name", table_schema)

                is_view = table_info[table_schema][table_name]["is_view"]

                if table_schema in table_info and table_name in table_info[table_schema]:
                    row_count = table_info[table_schema][table_name].get("row_count")
                    if row_count is not None:
                        md_map = metadata.write(md_map, (), "row-count", row_count)
                    md_map = metadata.write(md_map, (), "is-view", is_view)

                column_is_key_prop = lambda c, s: (
                    c.column_key == "PRI"
                    and s.properties[c.column_name].inclusion != "unsupported"
                )

                key_properties = [c.column_name for c in cols
                                  if column_is_key_prop(c, schema)]

                if not is_view:
                    md_map = metadata.write(md_map, (), "table-key-properties", key_properties)

                entry = CatalogEntry(
                    table=table_name,
                    stream=table_name,
                    metadata=metadata.to_list(md_map),
                    tap_stream_id=common.generate_tap_stream_id(table_schema, table_name),
                    schema=schema,
                )

                entries.append(entry)

    return Catalog(entries)
def do_discover_report(sf):
    """Describes a Salesforce instance's reports and generates a JSON schema for each field."""
    sf_custom_setting_objects = []
    object_to_tag_references = {}

    # For each SF Report describe it, loop its fields and build a schema
    entries = []

    report_description = sf.describe()
    report_name = report_description['attributes']['reportName']
    fields = report_description['reportExtendedMetadata']['detailColumnInfo']

    unsupported_fields = set()
    properties = {}
    mdata = metadata.new()

    # Loop over the report's fields
    for field_name, field in fields.items():
        property_schema, mdata = create_report_property_schema(field, mdata, sf.source_type)

        # Compound Address fields and geolocations cannot be queried by the
        # Bulk API, so we ignore them
        if field['dataType'] in ("address", "location") \
                and sf.api_type == tap_salesforce.salesforce.BULK_API_TYPE:
            mdata.pop(('properties', field_name), None)
            continue

        # we haven't been able to observe any records with a json field, so we
        # are marking it as unavailable until we have an example to work with
        if field['dataType'] == "json":
            unsupported_fields.add(
                (field_name, 'do not currently support json fields - please contact support'))

        inclusion = metadata.get(mdata, ('properties', field_name), 'inclusion')
        if sf.select_fields_by_default and inclusion != 'unsupported':
            mdata = metadata.write(mdata, ('properties', field_name),
                                   'selected-by-default', True)

        properties[field_name] = property_schema

    # There are cases where compound fields are referenced by the associated
    # subfields but are not actually present in the field list
    field_name_set = {f for f in fields}
    filtered_unsupported_fields = [f for f in unsupported_fields
                                   if f[0] in field_name_set]
    missing_unsupported_field_names = [f[0] for f in unsupported_fields
                                       if f[0] not in field_name_set]

    if missing_unsupported_field_names:
        LOGGER.info(
            "Ignoring the following unsupported fields for report %s as they are missing from the field list: %s",
            sf.report_id,
            ', '.join(sorted(missing_unsupported_field_names)))

    if filtered_unsupported_fields:
        LOGGER.info(
            "Not syncing the following unsupported fields for report %s: %s",
            sf.report_id,
            ', '.join(sorted([k for k, _ in filtered_unsupported_fields])))

    # Any property added to unsupported_fields has metadata generated and
    # removed
    for prop, description in filtered_unsupported_fields:
        if metadata.get(mdata, ('properties', prop), 'selected-by-default'):
            metadata.delete(mdata, ('properties', prop), 'selected-by-default')

        mdata = metadata.write(mdata, ('properties', prop),
                               'unsupported-description', description)
        mdata = metadata.write(mdata, ('properties', prop), 'inclusion', 'unsupported')

    # This last entry has the empty breadcrumb, which is required; otherwise the
    # stream won't be picked up. table-key-properties is also required.
    mdata = metadata.write(mdata, (), 'table-key-properties', [])

    schema = {
        'type': 'object',
        'additionalProperties': False,
        'properties': properties
    }

    entry = {
        'stream': report_name,
        'tap_stream_id': sf.report_id,
        'schema': schema,
        'metadata': metadata.to_list(mdata),
        'column_order': [str(column) for column in properties]
    }

    entries.append(entry)

    # For each custom setting field, remove its associated tag from entries
    # See Blacklisting.md for more information
    unsupported_tag_objects = [object_to_tag_references[f]
                               for f in sf_custom_setting_objects
                               if f in object_to_tag_references]
    if unsupported_tag_objects:
        LOGGER.info(  # pylint:disable=logging-not-lazy
            "Skipping the following Tag objects, Tags on Custom Settings Salesforce objects " +
            "are not supported by the Bulk API:")
        LOGGER.info(unsupported_tag_objects)
        entries = [e for e in entries if e['stream'] not in unsupported_tag_objects]

    result = {'streams': entries}
    json.dump(result, sys.stdout, indent=4)
def discover_catalog(mysql_conn, config):
    '''Returns a Catalog describing the structure of the database.'''
    filter_dbs_config = config.get('filter_dbs')

    if filter_dbs_config:
        filter_dbs_clause = ",".join(
            ["'{}'".format(db) for db in filter_dbs_config.split(",")])
        table_schema_clause = "WHERE table_schema IN ({})".format(filter_dbs_clause)
    else:
        table_schema_clause = """
        WHERE table_schema NOT IN (
            'information_schema',
            'performance_schema',
            'mysql'
        )"""

    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            cur.execute("""
            SELECT table_schema,
                   table_name,
                   table_type,
                   table_rows
              FROM information_schema.tables
            {}
            """.format(table_schema_clause))

            table_info = {}
            for (db, table, table_type, rows) in cur.fetchall():
                if db not in table_info:
                    table_info[db] = {}
                table_info[db][table] = {
                    'row_count': rows,
                    'is_view': table_type == 'VIEW'
                }

            cur.execute("""
            SELECT table_schema,
                   table_name,
                   column_name,
                   data_type,
                   character_maximum_length,
                   numeric_precision,
                   numeric_scale,
                   column_type,
                   column_key
              FROM information_schema.columns
            {}
             ORDER BY table_schema, table_name
            """.format(table_schema_clause))

            columns = []
            rec = cur.fetchone()
            while rec is not None:
                columns.append(Column(*rec))
                rec = cur.fetchone()

            entries = []
            for (k, cols) in itertools.groupby(
                    columns, lambda c: (c.table_schema, c.table_name)):
                cols = list(cols)
                (table_schema, table_name) = k
                schema = Schema(
                    type='object',
                    properties={c.column_name: schema_for_column(c) for c in cols})
                md = create_column_metadata(cols)
                md_map = metadata.to_map(md)

                md_map = metadata.write(md_map, (), 'database-name', table_schema)

                is_view = table_info[table_schema][table_name]['is_view']

                if table_schema in table_info and table_name in table_info[table_schema]:
                    row_count = table_info[table_schema][table_name].get('row_count')
                    if row_count is not None:
                        md_map = metadata.write(md_map, (), 'row-count', row_count)
                    md_map = metadata.write(md_map, (), 'is-view', is_view)

                column_is_key_prop = lambda c, s: (
                    c.column_key == 'PRI'
                    and s.properties[c.column_name].inclusion != 'unsupported'
                )

                key_properties = [c.column_name for c in cols
                                  if column_is_key_prop(c, schema)]

                if not is_view:
                    md_map = metadata.write(md_map, (), 'table-key-properties', key_properties)

                entry = CatalogEntry(
                    table=table_name,
                    stream=table_name,
                    metadata=metadata.to_list(md_map),
                    tap_stream_id=common.generate_tap_stream_id(table_schema, table_name),
                    schema=schema)

                entries.append(entry)

    return Catalog(entries)
def get_activity_type_stream(activity):
    # Activity streams have 6 attributes:
    # - marketoGUID
    # - leadId
    # - activityDate
    # - activityTypeId
    # - primaryAttribute
    # - attributes
    #
    # marketoGUID, leadId, activityDate, and activityTypeId are simple
    # fields. primaryAttribute has a name and type which define an
    # automatically included field on the record. Attributes is an array
    # of attribute names and types that become available fields.

    # Regarding primaryAttribute fields: on this side of things, Marketo will
    # describe the field in an activity that is considered the primary attribute.
    # On the sync side, we will have to present that information in a flattened record.
    mdata = metadata.new()
    properties = {
        "marketoGUID": {"type": ["null", "string"]},
        "leadId": {"type": ["null", "integer"]},
        "activityDate": {"type": ["null", "string"], "format": "date-time"},
        "activityTypeId": {"type": ["null", "integer"]}
    }

    for prop in properties:
        mdata = metadata.write(mdata, ('properties', prop), 'inclusion', 'automatic')

    if "primaryAttribute" in activity:
        properties["primary_attribute_value"] = {"type": ["null", "string"]}
        properties["primary_attribute_name"] = {"type": ["null", "string"]}
        properties["primary_attribute_value_id"] = {"type": ["null", "string"]}

        mdata = metadata.write(mdata, ('properties', "primary_attribute_value"),
                               'inclusion', 'automatic')
        mdata = metadata.write(mdata, ('properties', "primary_attribute_name"),
                               'inclusion', 'automatic')
        mdata = metadata.write(mdata, ('properties', "primary_attribute_value_id"),
                               'inclusion', 'automatic')

        primary = clean_string(activity["primaryAttribute"]["name"])
        mdata = metadata.write(mdata, (), 'marketo.primary-attribute-name', primary)

    if "attributes" in activity:
        for attr in activity["attributes"]:
            attr_name = clean_string(attr["name"])
            field_schema, mdata = get_schema_for_type(
                attr["dataType"],
                breadcrumb=('properties', attr_name),
                mdata=mdata,
                null=True)
            if field_schema:
                properties[attr_name] = field_schema

    activity_type_camel = clean_string(activity["name"])
    mdata = metadata.write(mdata, (), 'marketo.activity-id', activity["id"])

    tap_stream_id = "activities_{}".format(activity_type_camel)

    return {
        "tap_stream_id": tap_stream_id,
        "stream": tap_stream_id,
        "key_properties": ["marketoGUID"],
        "metadata": metadata.to_list(mdata),
        "schema": {
            "type": "object",
            "additionalProperties": False,
            "properties": properties,
        },
    }
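# Illustrative input for get_activity_type_stream, using a made-up Marketo
# activity description; the exact tap_stream_id depends on what clean_string
# does to the activity name.
example_activity = {
    "id": 1,
    "name": "Visit Webpage",
    "primaryAttribute": {"name": "Webpage ID", "dataType": "integer"},
    "attributes": [{"name": "Query Parameters", "dataType": "string"}],
}
# get_activity_type_stream(example_activity) returns a stream dict whose
# tap_stream_id is "activities_" plus the cleaned activity name, with
# marketo.activity-id and marketo.primary-attribute-name written at the
# empty breadcrumb.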
def discover_catalog(snowflake_conn, config, select_all=False):
    """Returns a Catalog describing the structure of the database."""
    tables = config.get('tables').split(',')
    sql_columns = get_table_columns(snowflake_conn, tables)
    config_meta = config_meta_parser(config)

    table_info = {}
    columns = []
    for sql_col in sql_columns:
        catalog = sql_col['TABLE_CATALOG']
        schema = sql_col['TABLE_SCHEMA']
        table_name = sql_col['TABLE_NAME']

        if catalog not in table_info:
            table_info[catalog] = {}
        if schema not in table_info[catalog]:
            table_info[catalog][schema] = {}

        table_info[catalog][schema][table_name] = {
            'row_count': sql_col.get('ROW_COUNT'),
            'is_view': sql_col.get('TABLE_TYPE') == 'VIEW'
        }

        columns.append(Column(
            table_catalog=catalog,
            table_schema=schema,
            table_name=table_name,
            column_name=sql_col['COLUMN_NAME'],
            data_type=sql_col['DATA_TYPE'],
            character_maximum_length=sql_col['CHARACTER_MAXIMUM_LENGTH'],
            numeric_precision=sql_col['NUMERIC_PRECISION'],
            numeric_scale=sql_col['NUMERIC_SCALE']))

    entries = []
    for (k, cols) in itertools.groupby(
            columns, lambda c: (c.table_catalog, c.table_schema, c.table_name)):
        cols = list(cols)
        (table_catalog, table_schema, table_name) = k
        schema = Schema(type='object',
                        properties={c.column_name: schema_for_column(c) for c in cols})

        md = create_column_metadata(cols, select_all)
        md_map = metadata.to_map(md)
        md_map = metadata.write(md_map, (), 'database-name', table_catalog)
        md_map = metadata.write(md_map, (), 'schema-name', table_schema)

        if (table_catalog in table_info
                and table_schema in table_info[table_catalog]
                and table_name in table_info[table_catalog][table_schema]):
            # Row count for views comes back NULL - default it to 0 so the
            # catalog always carries a non-null integer
            row_count = table_info[table_catalog][table_schema][table_name].get(
                'row_count', 0) or 0
            is_view = table_info[table_catalog][table_schema][table_name]['is_view']

            md_map = metadata.write(md_map, (), 'row-count', row_count)
            md_map = metadata.write(md_map, (), 'is-view', is_view)

        # If select_all is True, default replication-method to FULL_TABLE; it will
        # be overridden if the user defined INCREMENTAL in the config metadata
        if select_all:
            md_map = metadata.write(md_map, (), 'replication-method', 'FULL_TABLE')

        # Check config for an optional rolling-lookback window and inject it
        # into the catalog if present
        full_table_name = f'{table_catalog}-{table_schema}-{table_name}'.upper()
        rolling = config.get('rolling-lookback')
        if rolling and full_table_name in rolling:
            rolling_table_meta = rolling.get(full_table_name)
            md_map = metadata.write(md_map, (), 'rolling-lookback', rolling_table_meta)

        # Check config to see if optional metadata was already defined
        full_table_name = f'{table_catalog}.{table_schema}.{table_name}'.upper()
        if config_meta and full_table_name in config_meta:
            table_meta = config_meta.get(full_table_name)
            for meta_key, meta_value in table_meta.items():
                md_map = metadata.write(md_map, (), meta_key, meta_value)

        entry = CatalogEntry(
            table=table_name,
            stream=table_name,
            metadata=metadata.to_list(md_map),
            tap_stream_id=common.generate_tap_stream_id(
                table_catalog, table_schema, table_name),
            schema=schema)
        entries.append(entry)

    return Catalog(entries)