Example #1
def do_discover(sf):
    """Describes a Salesforce instance's objects and generates a JSON schema for each field."""
    global_description = sf.describe()

    objects_to_discover = {o['name'] for o in global_description['sobjects']}
    key_properties = ['Id']

    sf_custom_setting_objects = []
    object_to_tag_references = {}

    # For each SF Object describe it, loop its fields and build a schema
    entries = []

    # Check if the user has BULK API enabled
    if sf.api_type == 'BULK' and not Bulk(sf).has_permissions():
        raise TapSalesforceBulkAPIDisabledException(
            'This client does not have Bulk API permissions, received "API_DISABLED_FOR_ORG" error code'
        )

    for sobject_name in objects_to_discover:

        # Skip blacklisted SF objects depending on the api_type in use
        # ChangeEvent objects are not queryable via Bulk or REST (undocumented)
        if sobject_name in sf.get_blacklisted_objects() \
           or sobject_name.endswith("ChangeEvent"):
            continue

        sobject_description = sf.describe(sobject_name)

        # Cache customSetting and Tag objects to check for blacklisting after
        # all objects have been described
        if sobject_description.get("customSetting"):
            sf_custom_setting_objects.append(sobject_name)
        elif sobject_name.endswith("__Tag"):
            relationship_field = next((f for f in sobject_description["fields"]
                                       if f.get("relationshipName") == "Item"),
                                      None)
            if relationship_field:
                # Map {"Object":"Object__Tag"}
                object_to_tag_references[relationship_field["referenceTo"]
                                         [0]] = sobject_name

        fields = sobject_description['fields']
        replication_key = get_replication_key(sobject_name, fields)

        unsupported_fields = set()
        properties = {}
        mdata = metadata.new()

        found_id_field = False

        # Loop over the object's fields
        for f in fields:
            field_name = f['name']

            if field_name == "Id":
                found_id_field = True

            property_schema, mdata = create_property_schema(f, mdata)

            # Compound Address fields cannot be queried by the Bulk API
            if f['type'] == "address" and sf.api_type == tap_salesforce.salesforce.BULK_API_TYPE:
                unsupported_fields.add(
                    (field_name,
                     'cannot query compound address fields with bulk API'))

            # we haven't been able to observe any records with a json field, so we
            # are marking it as unavailable until we have an example to work with
            if f['type'] == "json":
                unsupported_fields.add((
                    field_name,
                    'do not currently support json fields - please contact support'
                ))

            # Blacklisted fields are dependent on the api_type being used
            field_pair = (sobject_name, field_name)
            if field_pair in sf.get_blacklisted_fields():
                unsupported_fields.add(
                    (field_name, sf.get_blacklisted_fields()[field_pair]))

            inclusion = metadata.get(mdata, ('properties', field_name),
                                     'inclusion')

            if sf.select_fields_by_default and inclusion != 'unsupported':
                mdata = metadata.write(mdata, ('properties', field_name),
                                       'selected-by-default', True)

            properties[field_name] = property_schema

        if replication_key:
            mdata = metadata.write(mdata, ('properties', replication_key),
                                   'inclusion', 'automatic')

        # There are cases where compound fields are referenced by the associated
        # subfields but are not actually present in the field list
        field_name_set = {f['name'] for f in fields}
        filtered_unsupported_fields = [
            f for f in unsupported_fields if f[0] in field_name_set
        ]
        missing_unsupported_field_names = [
            f[0] for f in unsupported_fields if f[0] not in field_name_set
        ]

        if missing_unsupported_field_names:
            LOGGER.info(
                "Ignoring the following unsupported fields for object %s as they are missing from the field list: %s",
                sobject_name,
                ', '.join(sorted(missing_unsupported_field_names)))

        if filtered_unsupported_fields:
            LOGGER.info(
                "Not syncing the following unsupported fields for object %s: %s",
                sobject_name,
                ', '.join(sorted([k for k, _ in filtered_unsupported_fields])))

        # Salesforce Objects are skipped when they do not have an Id field
        if not found_id_field:
            LOGGER.info("Skipping Salesforce Object %s, as it has no Id field",
                        sobject_name)
            continue

        # Any property in filtered_unsupported_fields has its selected-by-default
        # metadata removed and its inclusion marked unsupported
        for prop, description in filtered_unsupported_fields:
            if metadata.get(mdata, ('properties', prop),
                            'selected-by-default'):
                metadata.delete(mdata, ('properties', prop),
                                'selected-by-default')

            mdata = metadata.write(mdata, ('properties', prop),
                                   'unsupported-description', description)
            mdata = metadata.write(mdata, ('properties', prop), 'inclusion',
                                   'unsupported')

        if replication_key:
            mdata = metadata.write(mdata, (), 'valid-replication-keys',
                                   [replication_key])
        else:
            mdata = metadata.write(
                mdata, (), 'forced-replication-method', {
                    'replication-method': 'FULL_TABLE',
                    'reason':
                    'No replication keys found from the Salesforce API'
                })

        mdata = metadata.write(mdata, (), 'table-key-properties',
                               key_properties)

        schema = {
            'type': 'object',
            'additionalProperties': False,
            'properties': properties
        }

        entry = {
            'stream': sobject_name,
            'tap_stream_id': sobject_name,
            'schema': schema,
            'metadata': metadata.to_list(mdata)
        }

        entries.append(entry)

    # For each custom setting object, remove its associated Tag object from entries
    # See Blacklisting.md for more information
    unsupported_tag_objects = [
        object_to_tag_references[f] for f in sf_custom_setting_objects
        if f in object_to_tag_references
    ]
    if unsupported_tag_objects:
        LOGGER.info(  #pylint:disable=logging-not-lazy
            "Skipping the following Tag objects, Tags on Custom Settings Salesforce objects "
            + "are not supported by the Bulk API:")
        LOGGER.info(unsupported_tag_objects)
        entries = [
            e for e in entries if e['stream'] not in unsupported_tag_objects
        ]

    result = {'streams': entries}
    json.dump(result, sys.stdout, indent=4)
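
Throughout these examples the singer-python metadata helpers keep annotations in a compiled map keyed by breadcrumb and flatten it back to a list for the catalog. A minimal round-trip sketch of those helpers:

from singer import metadata

mdata = metadata.new()  # compiled, breadcrumb-keyed form
mdata = metadata.write(mdata, (), 'table-key-properties', ['Id'])
mdata = metadata.write(mdata, ('properties', 'Name'), 'inclusion', 'available')

# to_list() flattens the map into the list-of-dicts form stored in the catalog,
# roughly: [{'breadcrumb': (), 'metadata': {'table-key-properties': ['Id']}},
#           {'breadcrumb': ('properties', 'Name'), 'metadata': {'inclusion': 'available'}}]
# (breadcrumbs serialize to JSON arrays when the catalog is dumped)
catalog_metadata = metadata.to_list(mdata)

# to_map() rebuilds the compiled form from a catalog's metadata list
assert metadata.to_map(catalog_metadata) == mdata
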
Example #2
def discover_columns(connection, table_info):
    entries = []
    for schema_name in table_info.keys():
        for table_name in table_info[schema_name].keys():
            mdata = {}
            columns = table_info[schema_name][table_name]['columns']
            table_pks = [
                col_name for col_name, col_info in columns.items()
                if col_info.is_primary_key
            ]
            with connection.cursor(
                    cursor_factory=psycopg2.extras.DictCursor) as cur:
                cur.execute(" SELECT current_database()")
                database_name = cur.fetchone()[0]

            metadata.write(mdata, (), 'table-key-properties', table_pks)
            metadata.write(mdata, (), 'schema-name', schema_name)
            metadata.write(mdata, (), 'database-name', database_name)
            metadata.write(mdata, (), 'row-count',
                           table_info[schema_name][table_name]['row_count'])
            metadata.write(mdata, (), 'is-view',
                           table_info[schema_name][table_name].get('is_view'))

            column_schemas = {
                col_name: schema_for_column(col_info)
                for col_name, col_info in columns.items()
            }

            schema = {
                'type': 'object',
                'properties': column_schemas,
                'definitions': {}
            }

            schema = include_array_schemas(columns, schema)

            for c_name in column_schemas.keys():
                mdata = write_sql_data_type_md(mdata, columns[c_name])

                if column_schemas[c_name].get('type') is None:
                    mdata = metadata.write(mdata, ('properties', c_name),
                                           'inclusion', 'unsupported')
                    mdata = metadata.write(mdata, ('properties', c_name),
                                           'selected-by-default', False)
                elif table_info[schema_name][table_name]['columns'][
                        c_name].is_primary_key:
                    mdata = metadata.write(mdata, ('properties', c_name),
                                           'inclusion', 'automatic')
                    mdata = metadata.write(mdata, ('properties', c_name),
                                           'selected-by-default', True)
                else:
                    mdata = metadata.write(mdata, ('properties', c_name),
                                           'inclusion', 'available')
                    mdata = metadata.write(mdata, ('properties', c_name),
                                           'selected-by-default', True)

            entry = {
                'table_name':
                table_name,
                'stream':
                table_name,
                'metadata':
                metadata.to_list(mdata),
                'tap_stream_id':
                post_db.compute_tap_stream_id(database_name, schema_name,
                                              table_name),
                'schema':
                schema
            }

            entries.append(entry)

    return entries
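
The schema_for_column helper these discovery functions call is not shown in this excerpt. A simplified, hypothetical sketch of the kind of type mapping it performs (the sql_data_type attribute is an assumption based on the write_sql_data_type_md call above):

def schema_for_column(col_info):
    # Hypothetical sketch - the real helper covers many more types and
    # nullability/primary-key rules than shown here.
    data_type = col_info.sql_data_type
    if data_type in ('integer', 'bigint', 'smallint'):
        return {'type': ['null', 'integer']}
    if data_type in ('real', 'double precision', 'numeric'):
        return {'type': ['null', 'number']}
    if data_type == 'boolean':
        return {'type': ['null', 'boolean']}
    if data_type in ('date', 'timestamp without time zone', 'timestamp with time zone'):
        return {'type': ['null', 'string'], 'format': 'date-time'}
    return {'type': ['null', 'string']}
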
Example #3
    def perform_field_select(self, conn_id, catalog):
        annotated_stream = menagerie.get_annotated_schema(
            conn_id, catalog['stream_id'])
        schema = annotated_stream['annotated-schema']
        md = {}

        if catalog['tap_stream_id'] == 'GEO_PERFORMANCE_REPORT':
            md = self.select_all_fields_except(
                {
                    'conversionCategory', 'conversionTrackerId',
                    'conversionName', 'conversionSource', 'daysToConversion'
                }, schema, md)

        if catalog['tap_stream_id'] == 'KEYWORDS_PERFORMANCE_REPORT':
            md = self.select_all_fields_except(
                {
                    'clickType',
                    'daysToConversion',
                    'conversionCategory',
                    'conversionName',
                    'conversionSource',
                    'conversionTrackerId',
                    'device',
                    'network',
                    'networkWithSearchPartners',
                    'topVsOther',
                    'conversionAdjustment',
                    'daysToConversionOrAdjustment',
                }, schema, md)

        if catalog['tap_stream_id'] == 'accounts':
            md = self.select_all_fields_except({'customerId'}, schema, md)

        if catalog['tap_stream_id'] == 'ads':
            md = self.select_all_fields_except({'adGroupid'}, schema, md)

        if catalog['tap_stream_id'] == 'ad_groups':
            md = self.select_all_fields_except({'id'}, schema, md)

        if catalog['tap_stream_id'] == 'campaigns':
            md = self.select_all_fields_except({'id'}, schema, md)

        if catalog['tap_stream_id'] == 'AD_PERFORMANCE_REPORT':
            md = self.select_all_fields_except(
                {
                    'conversionCategory', 'conversionTrackerId',
                    'conversionName', 'conversionSource', 'clickType',
                    'daysToConversion', 'device', 'topVsOther',
                    'conversionAdjustment', 'daysToConversionOrAdjustment'
                }, schema, md)

        if catalog['tap_stream_id'] == 'ADGROUP_PERFORMANCE_REPORT':
            md = self.select_all_fields_except(
                {
                    'conversionCategory', 'conversionTrackerId',
                    'conversionName', 'conversionSource', 'clickType',
                    'daysToConversion', 'device', 'topVsOther', 'hourOfDay',
                    'conversionAdjustment', 'daysToConversionOrAdjustment'
                }, schema, md)

        if catalog['tap_stream_id'] == 'SEARCH_QUERY_PERFORMANCE_REPORT':
            md = self.select_all_fields_except(
                {
                    'conversionCategory', 'conversionTrackerId',
                    'conversionName', 'conversionSource', 'daysToConversion'
                }, schema, md)

        if catalog['tap_stream_id'] == 'KEYWORDLESS_QUERY_REPORT':
            md = self.select_all_fields_except(
                {
                    'conversionCategory', 'conversionTrackerId',
                    'conversionName', 'conversionSource'
                }, schema, md)

        if catalog['tap_stream_id'] == 'CAMPAIGN_PERFORMANCE_REPORT':
            md = self.select_all_fields_except(
                {
                    'conversionCategory', 'conversionTrackerId',
                    'conversionName', 'conversionSource', 'clickType',
                    'device', 'topVsOther', 'hourOfDay',
                    'avgImprFreqPerCookie', 'uniqueCookies',
                    'daysToConversion', 'conversionAdjustment',
                    'daysToConversionOrAdjustment', 'adEventType'
                }, schema, md)

        if catalog['tap_stream_id'] == 'CLICK_PERFORMANCE_REPORT':
            md = self.select_all_fields_except({}, schema, md)

        if catalog['tap_stream_id'] == 'CRITERIA_PERFORMANCE_REPORT':
            md = self.select_all_fields_except(
                {
                    'conversionCategory', 'conversionTrackerId',
                    'conversionName', 'conversionSource', 'clickType',
                    'topVsOther', 'daysToConversion', 'conversionAdjustment',
                    'daysToConversionOrAdjustment'
                }, schema, md)

        if catalog['tap_stream_id'] == 'GENDER_PERFORMANCE_REPORT':
            md = self.select_all_fields_except(
                {
                    'conversionCategory', 'conversionTrackerId',
                    'conversionName', 'conversionSource', 'clickType',
                    'daysToConversion'
                }, schema, md)

        if catalog['tap_stream_id'] == 'AGE_RANGE_PERFORMANCE_REPORT':
            md = self.select_all_fields_except(
                {
                    'conversionCategory', 'conversionTrackerId',
                    'conversionName', 'conversionSource', 'clickType',
                    'daysToConversion'
                }, schema, md)

        if catalog['tap_stream_id'] == 'AUDIENCE_PERFORMANCE_REPORT':
            md = self.select_all_fields_except(
                {
                    'topVsOther', 'conversionCategory', 'conversionTrackerId',
                    'conversionName', 'conversionSource', 'clickType',
                    'daysToConversion'
                }, schema, md)

        if catalog['tap_stream_id'] == 'FINAL_URL_REPORT':
            md = self.select_all_fields_except(
                {
                    'topVsOther', 'conversionCategory', 'conversionTrackerId',
                    'conversionName', 'conversionSource', 'clickType',
                    'daysToConversion'
                }, schema, md)

        # if catalog['tap_stream_id'] == 'CREATIVE_CONVERSION_REPORT':
        #     md =  self.select_all_fields_except({'topVsOther',
        #                                          'conversionCategory',
        #                                          'conversionTrackerId',
        #                                          'conversionName',
        #                                          'conversionSource',
        #                                          'clickType'},
        #                                         schema,
        #                                         md)

        return {
            'key_properties': catalog.get('key_properties'),
            'schema': schema,
            'tap_stream_id': catalog.get('tap_stream_id'),
            'replication_method': catalog.get('replication_method'),
            'replication_key': catalog.get('replication_key'),
            'metadata': metadata.to_list(md)
        }
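
The select_all_fields_except helper is not included in this excerpt. A hedged, hypothetical sketch of what such a method might do, given how it is called above (exclusion set and annotated schema in, compiled metadata map out):

    def select_all_fields_except(self, excluded_fields, schema, md):
        # Hypothetical sketch: mark the stream as selected and every property
        # outside the exclusion set as selected; md is the compiled metadata map.
        md[()] = {'selected': True}
        for field_name in schema.get('properties', {}):
            md[('properties', field_name)] = {
                'selected': field_name not in excluded_fields
            }
        return md
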
Example #4
def discover_columns(connection, table_info):
    entries = []
    for schema_name in table_info.keys():
        for table_name in table_info[schema_name].keys():

            mdata = {}
            columns = table_info[schema_name][table_name]["columns"]
            table_pks = [
                col_name for col_name, col_info in columns.items()
                if col_info.is_primary_key
            ]
            with connection.cursor(
                    cursor_factory=psycopg2.extras.DictCursor) as cur:
                cur.execute(" SELECT current_database()")
                database_name = cur.fetchone()[0]

            metadata.write(mdata, (), "table-key-properties", table_pks)
            metadata.write(mdata, (), "schema-name", schema_name)
            metadata.write(mdata, (), "database-name", database_name)
            metadata.write(mdata, (), "row-count",
                           table_info[schema_name][table_name]["row_count"])
            metadata.write(mdata, (), "is-view",
                           table_info[schema_name][table_name].get("is_view"))

            column_schemas = {
                col_name: schema_for_column(col_info)
                for col_name, col_info in columns.items()
            }

            schema = {
                "type": "object",
                "properties": column_schemas,
                "definitions": {}
            }

            schema = include_array_schemas(columns, schema)

            for c_name in column_schemas.keys():
                mdata = write_sql_data_type_md(mdata, columns[c_name])

                if column_schemas[c_name].get("type") is None:
                    mdata = metadata.write(mdata, ("properties", c_name),
                                           "inclusion", "unsupported")
                    mdata = metadata.write(mdata, ("properties", c_name),
                                           "selected-by-default", False)
                elif table_info[schema_name][table_name]["columns"][
                        c_name].is_primary_key:
                    mdata = metadata.write(mdata, ("properties", c_name),
                                           "inclusion", "automatic")
                    mdata = metadata.write(mdata, ("properties", c_name),
                                           "selected-by-default", True)
                else:
                    mdata = metadata.write(mdata, ("properties", c_name),
                                           "inclusion", "available")
                    mdata = metadata.write(mdata, ("properties", c_name),
                                           "selected-by-default", True)

            entry = {
                "table_name":
                table_name,
                "stream":
                table_name,
                "metadata":
                metadata.to_list(mdata),
                "tap_stream_id":
                post_db.compute_tap_stream_id(database_name, schema_name,
                                              table_name),
                "schema":
                schema,
            }

            entries.append(entry)

    return entries
Example #5
def discover_catalog(snowflake_conn, config):
    """Returns a Catalog describing the structure of the database."""
    tables = config.get('tables').split(',')
    sql_columns = get_table_columns(snowflake_conn, tables)

    table_info = {}
    columns = []
    for sql_col in sql_columns:
        catalog = sql_col['TABLE_CATALOG']
        schema = sql_col['TABLE_SCHEMA']
        table_name = sql_col['TABLE_NAME']

        if catalog not in table_info:
            table_info[catalog] = {}

        if schema not in table_info[catalog]:
            table_info[catalog][schema] = {}

        table_info[catalog][schema][table_name] = {
            'row_count': sql_col.get('ROW_COUNT'),
            'is_view': sql_col.get('TABLE_TYPE') == 'VIEW'
        }

        columns.append(
            Column(
                table_catalog=catalog,
                table_schema=schema,
                table_name=table_name,
                column_name=sql_col['COLUMN_NAME'],
                data_type=sql_col['DATA_TYPE'],
                character_maximum_length=sql_col['CHARACTER_MAXIMUM_LENGTH'],
                numeric_precision=sql_col['NUMERIC_PRECISION'],
                numeric_scale=sql_col['NUMERIC_SCALE']))

    entries = []
    for (k, cols) in itertools.groupby(
            columns, lambda c:
        (c.table_catalog, c.table_schema, c.table_name)):
        cols = list(cols)
        (table_catalog, table_schema, table_name) = k
        schema = Schema(
            type='object',
            properties={c.column_name: schema_for_column(c)
                        for c in cols})
        md = create_column_metadata(cols)
        md_map = metadata.to_map(md)

        md_map = metadata.write(md_map, (), 'database-name', table_catalog)
        md_map = metadata.write(md_map, (), 'schema-name', table_schema)

        if (table_catalog in table_info
                and table_schema in table_info[table_catalog]
                and table_name in table_info[table_catalog][table_schema]):
            # Row count of views comes back NULL - default it to 0 so it is always an integer
            row_count = table_info[table_catalog][table_schema][
                table_name].get('row_count', 0) or 0
            is_view = table_info[table_catalog][table_schema][table_name][
                'is_view']

            md_map = metadata.write(md_map, (), 'row-count', row_count)
            md_map = metadata.write(md_map, (), 'is-view', is_view)

            entry = CatalogEntry(table=table_name,
                                 stream=table_name,
                                 metadata=metadata.to_list(md_map),
                                 tap_stream_id=common.generate_tap_stream_id(
                                     table_catalog, table_schema, table_name),
                                 schema=schema)

            entries.append(entry)

    return Catalog(entries)
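
The Column container is not defined in this excerpt; judging from the keyword arguments used above, it is presumably something like the following namedtuple (a sketch, not the verbatim definition):

from collections import namedtuple

# Field order matches the keyword arguments used in discover_catalog() above.
Column = namedtuple('Column', [
    'table_catalog',
    'table_schema',
    'table_name',
    'column_name',
    'data_type',
    'character_maximum_length',
    'numeric_precision',
    'numeric_scale',
])
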
Example #6
def discover_catalog(connection):
    cursor = connection.cursor()
    cursor.execute("""
        SELECT table_schema,
               table_name,
               column_name,
               data_type,
               character_maximum_length,
               numeric_precision,
               numeric_scale
            FROM information_schema.columns
            WHERE table_schema != 'INFORMATION_SCHEMA'
            ORDER BY table_schema, table_name
        """)

    columns = []
    rec = cursor.fetchone()
    while rec is not None:
        columns.append(Column(*rec))
        rec = cursor.fetchone()

    entries = []
    for (k,
         cols) in itertools.groupby(columns, lambda c:
                                    (c.table_schema, c.table_name)):
        cols = list(cols)
        (table_schema, table_name) = k
        md = create_column_metadata(cols)
        md_map = metadata.to_map(md)

        if "events" in table_name.lower():
            key_properties = ['UUID']
            valid_replication_keys = ["SERVER_UPLOAD_TIME"]
            replication_key = "SERVER_UPLOAD_TIME"
        elif "merge" in table_name.lower():
            key_properties = []
            valid_replication_keys = ["MERGE_EVENT_TIME"]
            replication_key = "MERGE_EVENT_TIME"
        else:
            replication_key = ""
            key_properties = []
            valid_replication_keys = []

        properties = {}
        for c in cols:
            if c.column_name == replication_key or c.column_name in key_properties:
                properties[c.column_name] = schema_for_column(c, "automatic")
            else:
                properties[c.column_name] = schema_for_column(c, "available")
        schema = Schema(type='object', properties=properties)

        md_map = metadata.write(md_map, (), 'table-key-properties',
                                key_properties)

        md_map = metadata.write(md_map, (), 'valid-replication-keys',
                                valid_replication_keys)

        # Only mark the replication key's inclusion as automatic when one exists
        if replication_key:
            md_map = metadata.write(md_map, ('properties', replication_key),
                                    'inclusion', 'automatic')

        entry = CatalogEntry(
            stream=table_name,
            metadata=metadata.to_list(md_map),
            tap_stream_id=table_schema + "-" + table_name,
            replication_key=replication_key,  # This is a non-discoverable key.
            replication_method="INCREMENTAL",  # This is a non-discoverable key.
            schema=schema)

        entries.append(entry)

    return Catalog(entries)
Example #7
def discover_catalog(mssql_conn, config):
    """Returns a Catalog describing the structure of the database."""
    LOGGER.info("Preparing Catalog")
    mssql_conn = MSSQLConnection(config)
    filter_dbs_config = config.get("filter_dbs")

    if filter_dbs_config:
        filter_dbs_clause = ",".join(
            ["'{}'".format(db) for db in filter_dbs_config.split(",")])

        table_schema_clause = "WHERE c.table_schema IN ({})".format(
            filter_dbs_clause)
    else:
        table_schema_clause = """
        WHERE c.table_schema NOT IN (
        'information_schema',
        'performance_schema',
        'sys'
        )"""

    with connect_with_backoff(mssql_conn) as open_conn:
        cur = open_conn.cursor()
        LOGGER.info("Fetching tables")
        cur.execute("""SELECT table_schema,
                table_name,
                table_type
            FROM information_schema.tables c
            {}
        """.format(table_schema_clause))
        table_info = {}

        for (db, table, table_type) in cur.fetchall():
            if db not in table_info:
                table_info[db] = {}

            table_info[db][table] = {
                "row_count": None,
                "is_view": table_type == "VIEW"
            }
        LOGGER.info("Tables fetched, fetching columns")
        cur.execute("""with constraint_columns as (
                select c.table_schema
                , c.table_name
                , c.column_name

                from information_schema.constraint_column_usage c

                join information_schema.table_constraints tc
                        on tc.table_schema = c.table_schema
                        and tc.table_name = c.table_name
                        and tc.constraint_name = c.constraint_name
                        and tc.constraint_type in ('PRIMARY KEY', 'UNIQUE'))
                SELECT c.table_schema,
                    c.table_name,
                    c.column_name,
                    data_type,
                    character_maximum_length,
                    numeric_precision,
                    numeric_scale,
                    case when cc.column_name is null then 0 else 1 end
                FROM information_schema.columns c

                left join constraint_columns cc
                    on cc.table_name = c.table_name
                    and cc.table_schema = c.table_schema
                    and cc.column_name = c.column_name

                {}
                ORDER BY c.table_schema, c.table_name
        """.format(table_schema_clause))
        columns = []
        rec = cur.fetchone()
        while rec is not None:
            columns.append(Column(*rec))
            rec = cur.fetchone()
        LOGGER.info("Columns Fetched")
        entries = []
        for (k, cols) in itertools.groupby(
                columns, lambda c: (c.table_schema, c.table_name)):
            cols = list(cols)
            (table_schema, table_name) = k
            schema = Schema(
                type="object",
                properties={c.column_name: schema_for_column(c)
                            for c in cols})
            md = create_column_metadata(cols)
            md_map = metadata.to_map(md)

            md_map = metadata.write(md_map, (), "database-name", table_schema)

            is_view = table_info[table_schema][table_name]["is_view"]

            if table_schema in table_info and table_name in table_info[
                    table_schema]:
                row_count = table_info[table_schema][table_name].get(
                    "row_count")

                if row_count is not None:
                    md_map = metadata.write(md_map, (), "row-count", row_count)

                md_map = metadata.write(md_map, (), "is-view", is_view)

            key_properties = [
                c.column_name for c in cols if c.is_primary_key == 1
            ]

            md_map = metadata.write(md_map, (), "table-key-properties",
                                    key_properties)

            entry = CatalogEntry(
                table=table_name,
                stream=table_name,
                metadata=metadata.to_list(md_map),
                tap_stream_id=common.generate_tap_stream_id(
                    table_schema, table_name),
                schema=schema,
            )

            entries.append(entry)
    LOGGER.info("Catalog ready")
    return Catalog(entries)
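
This variant returns a singer Catalog; in discovery mode it is typically written straight to stdout. A minimal usage sketch (the do_discover wrapper name is an assumption):

def do_discover(mssql_conn, config):
    # Emit the discovered catalog as JSON on stdout for the orchestrator to consume.
    discover_catalog(mssql_conn, config).dump()
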
Example #8
def discover_streams(config):
    streams = []

    LOGGER.info("Discovering custom fields for Accounts")
    custom_account_fields = STREAMS['metadata_accounts'](
        config).get_fields().get('custom')

    LOGGER.info("Discovering custom fields for Visitors")
    custom_visitor_fields = STREAMS['metadata_visitors'](
        config).get_fields().get('custom')

    for s in STREAMS.values():

        LOGGER.info("Disco vering stream %s", s.name)
        s = s(config)

        schema = s.load_schema()
        mdata = metadata.to_map(s.load_metadata())

        if s.name == 'accounts':
            LOGGER.info("Discovering custom fields for Accounts")
            custom_account_fields = {}
            instance = STREAMS['metadata_accounts'](config)

            custom_account_fields = get_custom_fields(instance)
            for key, value in custom_account_fields.items():
                if 'metadata_custom' not in schema['properties']:
                    schema['properties']['metadata_custom'] = {'properties': {}}
                # Accumulate each custom field rather than overwriting the whole dict
                schema['properties']['metadata_custom']['properties'][key] = \
                    get_schema_propery_type(value.get('type'))
                mdata = metadata.write(mdata,
                                       ("properties", 'metadata_custom'),
                                       'inclusion', 'available')

        if s.name == 'visitors':
            for key, value in custom_visitor_fields.items():
                if 'metadata_custom' not in schema['properties']:
                    schema['properties']['metadata_custom'] = {'properties': {}}
                # Accumulate each custom field under 'properties' instead of replacing the dict
                schema['properties']['metadata_custom']['properties'][key] = \
                    get_schema_propery_type(value.get('type'))
                mdata = metadata.write(mdata,
                                       ("properties", 'metadata_custom'),
                                       'inclusion', 'available')

        if s.name == 'metadata_accounts':
            for key, value in custom_account_fields.items():
                if 'custom' not in schema['properties']:
                    schema['properties']['custom'] = {}
                schema['properties']['custom']['properties'] = {}
                schema['properties']['custom']['properties'][key] = {}
                schema['properties']['custom']['properties'][key][
                    'properties'] = {}
                schema['properties']['custom']['properties'][key]['type'] = [
                    "null", "object"
                ]
                schema['properties']['custom']['properties'][key][
                    'additionalProperties'] = False
                schema['properties']['custom']['properties'][key][
                    'properties'] = metadata_fields
                mdata = metadata.write(mdata, ("properties", "custom"),
                                       'inclusion', 'available')
        if s.name == 'metadata_visitors':
            for key, value in custom_visitor_fields.items():
                if 'custom' not in schema['properties']:
                    schema['properties']['custom'] = {}
                schema['properties']['custom']['properties'] = {}
                schema['properties']['custom']['properties'][key] = {}
                schema['properties']['custom']['properties'][key][
                    'properties'] = {}
                schema['properties']['custom']['properties'][key]['type'] = [
                    "null", "object"
                ]
                schema['properties']['custom']['properties'][key][
                    'additionalProperties'] = False
                schema['properties']['custom']['properties'][key][
                    'properties'] = metadata_fields
                mdata = metadata.write(mdata, ("properties", 'custom'),
                                       'inclusion', 'available')

        stream = {
            'stream': s.name,
            'tap_stream_id': s.name,
            'schema': schema,
            'metadata': metadata.to_list(mdata)
        }

        streams.append(stream)

    return streams
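
get_schema_propery_type is referenced above but not shown; a hedged, hypothetical sketch of a plausible mapping from a custom-field type string to a JSON-schema snippet (the type names are assumptions):

def get_schema_propery_type(field_type):
    # Hypothetical sketch - the real mapping may cover more custom-field types.
    mapping = {
        'string': {'type': ['null', 'string']},
        'boolean': {'type': ['null', 'boolean']},
        'number': {'type': ['null', 'number']},
        'time': {'type': ['null', 'string'], 'format': 'date-time'},
    }
    return mapping.get(field_type, {'type': ['null', 'string']})
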
Example #9
def do_discover(sf):
    """Describes a Salesforce instance's objects and generates a JSON schema for each field."""
    global_description = sf.describe()

    objects_set = {o["name"] for o in global_description["sobjects"]}
    objects_to_discover = [
        "Account", "Contact", "Lead", "Opportunity", "Campaign",
        "AccountContactRelation", "AccountContactRole",
        "OpportunityContactRole", "CampaignMember", "Task", "Invoice__c",
        "OpportunityHistory", "AccountHistory", "LeadHistory", "User"
    ]
    key_properties = ["Id"]

    sf_custom_setting_objects = []
    object_to_tag_references = {}

    # For each SF Object describe it, loop its fields and build a schema
    entries = []

    # Check if the user has BULK API enabled
    if sf.api_type == "BULK" and not Bulk(sf).has_permissions():
        raise TapSalesforceBulkAPIDisabledException(
            'This client does not have Bulk API permissions, received "API_DISABLED_FOR_ORG" error code'
        )

    for sobject_name in objects_to_discover:

        # Skip blacklisted SF objects depending on the api_type in use
        # ChangeEvent objects are not queryable via Bulk or REST (undocumented)
        if sobject_name in sf.get_blacklisted_objects(
        ) or sobject_name.endswith("ChangeEvent"):
            continue
        if sobject_name not in objects_set:
            continue

        sobject_description = sf.describe(sobject_name)

        # Cache customSetting and Tag objects to check for blacklisting after
        # all objects have been described
        if sobject_description.get("customSetting"):
            sf_custom_setting_objects.append(sobject_name)
        elif sobject_name.endswith("__Tag"):
            relationship_field = next(
                (f for f in sobject_description["fields"]
                 if f.get("relationshipName") == "Item"),
                None,
            )
            if relationship_field:
                # Map {"Object":"Object__Tag"}
                object_to_tag_references[relationship_field["referenceTo"]
                                         [0]] = sobject_name

        fields = sobject_description["fields"]
        replication_key = get_replication_key(sobject_name, fields)

        unsupported_fields = set()
        properties = {}
        mdata = metadata.new()

        found_id_field = False

        # Loop over the object's fields
        for f in fields:
            field_name = f["name"]

            if field_name == "Id":
                found_id_field = True

            property_schema, mdata = create_property_schema(f, mdata)

            # Compound Address fields cannot be queried by the Bulk API
            if (f["type"] == "address" and sf.api_type
                    == tap_salesforce.salesforce.BULK_API_TYPE):
                unsupported_fields.add(
                    (field_name,
                     "cannot query compound address fields with bulk API"))

            # we haven't been able to observe any records with a json field, so we
            # are marking it as unavailable until we have an example to work with
            if f["type"] == "json":
                unsupported_fields.add((
                    field_name,
                    "do not currently support json fields - please contact support",
                ))

            # Blacklisted fields are dependent on the api_type being used
            field_pair = (sobject_name, field_name)
            if field_pair in sf.get_blacklisted_fields():
                unsupported_fields.add(
                    (field_name, sf.get_blacklisted_fields()[field_pair]))

            inclusion = metadata.get(mdata, ("properties", field_name),
                                     "inclusion")

            if sf.select_fields_by_default and inclusion != "unsupported":
                mdata = metadata.write(mdata, ("properties", field_name),
                                       "selected-by-default", True)

            properties[field_name] = property_schema

        if replication_key:
            mdata = metadata.write(mdata, ("properties", replication_key),
                                   "inclusion", "automatic")

        # There are cases where compound fields are referenced by the associated
        # subfields but are not actually present in the field list
        field_name_set = {f["name"] for f in fields}
        filtered_unsupported_fields = [
            f for f in unsupported_fields if f[0] in field_name_set
        ]
        missing_unsupported_field_names = [
            f[0] for f in unsupported_fields if f[0] not in field_name_set
        ]

        if missing_unsupported_field_names:
            LOGGER.info(
                "Ignoring the following unsupported fields for object %s as they are missing from the field list: %s",
                sobject_name,
                ", ".join(sorted(missing_unsupported_field_names)),
            )

        if filtered_unsupported_fields:
            LOGGER.info(
                "Not syncing the following unsupported fields for object %s: %s",
                sobject_name,
                ", ".join(sorted([k for k, _ in filtered_unsupported_fields])),
            )

        # Salesforce Objects are skipped when they do not have an Id field
        if not found_id_field:
            LOGGER.info("Skipping Salesforce Object %s, as it has no Id field",
                        sobject_name)
            continue

        # Any property in filtered_unsupported_fields has its selected-by-default
        # metadata removed and its inclusion marked unsupported
        for prop, description in filtered_unsupported_fields:
            if metadata.get(mdata, ("properties", prop),
                            "selected-by-default"):
                metadata.delete(mdata, ("properties", prop),
                                "selected-by-default")

            mdata = metadata.write(mdata, ("properties", prop),
                                   "unsupported-description", description)
            mdata = metadata.write(mdata, ("properties", prop), "inclusion",
                                   "unsupported")

        if replication_key:
            mdata = metadata.write(mdata, (), "valid-replication-keys",
                                   [replication_key])
        else:
            mdata = metadata.write(
                mdata,
                (),
                "forced-replication-method",
                {
                    "replication-method": "FULL_TABLE",
                    "reason":
                    "No replication keys found from the Salesforce API",
                },
            )

        mdata = metadata.write(mdata, (), "table-key-properties",
                               key_properties)
        mdata = metadata.write(mdata, (), "selected", True)

        schema = {
            "type": "object",
            "additionalProperties": False,
            "properties": properties,
        }

        entry = {
            "stream": sobject_name,
            "tap_stream_id": sobject_name,
            "schema": schema,
            "metadata": metadata.to_list(mdata),
        }

        entries.append(entry)

    # For each custom setting object, remove its associated Tag object from entries
    # See Blacklisting.md for more information
    unsupported_tag_objects = [
        object_to_tag_references[f] for f in sf_custom_setting_objects
        if f in object_to_tag_references
    ]
    if unsupported_tag_objects:
        LOGGER.info(  # pylint:disable=logging-not-lazy
            "Skipping the following Tag objects, Tags on Custom Settings Salesforce objects "
            + "are not supported by the Bulk API:")
        LOGGER.info(unsupported_tag_objects)
        entries = [
            e for e in entries if e["stream"] not in unsupported_tag_objects
        ]

    result = {"streams": entries}
    return result
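
Unlike Example #1, this variant returns the catalog dict rather than printing it, so the caller decides how to emit it. A minimal usage sketch (assuming sf is the same Salesforce client instance passed above):

import json
import sys

# Emit the discovered catalog on stdout, matching the behaviour of Example #1.
json.dump(do_discover(sf), sys.stdout, indent=4)
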
Example #10
def discover_columns(connection, table_info, filter_schemas):
    cur = connection.cursor()
    binds_sql = [":{}".format(b) for b in range(len(filter_schemas))]
    if binds_sql:
        sql = """
      SELECT OWNER,
             TABLE_NAME, COLUMN_NAME,
             DATA_TYPE, DATA_LENGTH,
             CHAR_LENGTH, CHAR_USED,
             DATA_PRECISION, DATA_SCALE
        FROM all_tab_columns
       WHERE OWNER != 'SYS' AND owner IN ({})
       ORDER BY owner, table_name, column_name
      """.format(",".join(binds_sql))
    else:
        sql = """
      SELECT OWNER,
             TABLE_NAME, COLUMN_NAME,
             DATA_TYPE, DATA_LENGTH,
             CHAR_LENGTH, CHAR_USED,
             DATA_PRECISION, DATA_SCALE
        FROM all_tab_columns
       WHERE OWNER != 'SYS'
       ORDER BY owner, table_name, column_name
      """

    LOGGER.info("fetching column info")
    cur.execute(sql, filter_schemas)

    columns = []
    counter = 0
    rec = cur.fetchone()
    while rec is not None:
        columns.append(Column(*rec))

        rec = cur.fetchone()

    pk_constraints = produce_pk_constraints(connection, filter_schemas)
    entries = []
    for (k,
         cols) in itertools.groupby(columns, lambda c:
                                    (c.table_schema, c.table_name)):
        cols = list(cols)
        (table_schema, table_name) = k
        pks_for_table = pk_constraints.get(table_schema,
                                           {}).get(table_name, [])

        column_schemas = {
            c.column_name: schema_for_column(c, pks_for_table)
            for c in cols
        }
        schema = Schema(type='object', properties=column_schemas)

        md = produce_column_metadata(connection, table_info, table_schema,
                                     table_name, pk_constraints,
                                     column_schemas, cols)

        entry = {
            'table_name': table_name,
            'stream': table_name,
            'metadata': metadata.to_list(md),
            'tap_stream_id': table_schema + '-' + table_name,
            'schema': schema.to_dict(),
            'column_order': [str(column) for column in column_schemas]
        }
        entries.append(entry)

    return {'streams': entries}
Example #11
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--output', '-o', type=str, required=True)

    if sys.stdin.isatty():
        parser.add_argument('--input', '-i', type=str, required=True)

        args = parser.parse_args()

        with open(args.input) as f:
            catalog = json.load(f)

    else:

        args = parser.parse_args()

        catalog = json.loads(sys.stdin.read())

        # The catalog was read from the pipe, so reopen the controlling terminal
        # for the interactive prompts below
        sys.stdin = open('/dev/tty')

    logger.info("Catalog configuration starting...")

    select_streams = {
        'type': 'checkbox',
        'message': 'Select Streams',
        'name': 'streams',
        'choices': [
            {'name': stream['stream']} for stream in catalog['streams']
        ]
    }

    selected_streams = prompt(select_streams)

    for i, stream in enumerate(catalog['streams']):

        mdata = metadata.to_map(stream['metadata'])

        if stream['stream'] not in selected_streams['streams']:
            mdata = metadata.write(
                mdata, (), 'selected', False
            )
        else:
            mdata = metadata.write(
                mdata, (), 'selected', True
            )

            fields = []

            field_reference = {}

            for breadcrumb, field in mdata.items():

                if breadcrumb != ():
                    selected, disabled = False, False
                    if metadata.get(
                            mdata, breadcrumb, 'inclusion') == 'automatic':
                        selected, disabled = True, "automatic"

                    elif metadata.get(
                            mdata, breadcrumb, 'selected-by-default'):
                        selected, disabled = True, False

                    name = breadcrumb_name(breadcrumb)

                    field_reference[name] = breadcrumb

                    fields.append({
                        'name': name,
                        'checked': selected,
                        'disabled': disabled
                    })

            stream_options = {
                'type': 'checkbox',
                'message': 'Select fields from stream: `{}`'.format(
                    stream['stream']),
                'name': 'fields',
                'choices': fields
            }

            selections = prompt(stream_options)

            selections = [
                field_reference[n] for n in selections['fields']
                if n != "Select All"
            ]

            for breadcrumb in mdata.keys():
                if breadcrumb != ():
                    if (
                        metadata.get(
                            mdata, breadcrumb, 'inclusion') == "automatic"
                    ) or (
                        breadcrumb in selections
                    ):
                        mdata = metadata.write(
                            mdata, breadcrumb, 'selected', True)
                    else:
                        mdata = metadata.write(
                            mdata, breadcrumb, 'selected', False)

            catalog['streams'][i]['metadata'] = metadata.to_list(mdata)

    logger.info("Catalog configuration saved.")

    with open(args.output, 'w') as f:
        json.dump(catalog, f, indent=2)
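
prompt (PyInquirer-style) and breadcrumb_name come from outside this excerpt. A hedged, hypothetical sketch of what breadcrumb_name might do, given that breadcrumbs here look like ('properties', 'field_name') and must map back uniquely through field_reference:

def breadcrumb_name(breadcrumb):
    # Hypothetical sketch: render ('properties', 'some_field') as 'some_field' and
    # longer breadcrumbs as a dotted path.
    parts = breadcrumb[1:] if breadcrumb and breadcrumb[0] == 'properties' else breadcrumb
    return '.'.join(parts)
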
Example #12
def discover_stream(client, stream_name, force_rest):  # pylint: disable=too-many-branches
    try:
        field_dict = get_field_dict(client, stream_name)
    except ApiException:
        return None

    properties = {}
    mdata = metadata.new()

    # Include entry with no breadcrumbs, required for PPW
    metadata.write(mdata, (), 'table-key-properties', ["Id"])

    for field_name, props in field_dict.items():
        field_properties = {}

        if props.get("joined", False):
            split_field_name = field_name.split(".")
            field_name = field_name.replace(".", "")
            mdata = metadata.write(mdata, ('properties', field_name),
                                   'tap-zuora.joined_object',
                                   split_field_name[0])

        if props["type"] in ["date", "datetime"]:
            field_properties["type"] = "string"
            field_properties["format"] = "date-time"
        else:
            field_properties["type"] = props["type"]

        if props["supported"]:
            field_properties["type"] = [field_properties["type"], "null"]

        if field_name in REQUIRED_KEYS:
            mdata = metadata.write(mdata, ('properties', field_name),
                                   'inclusion', 'automatic')
        elif props["supported"]:
            # PPW cannot perform its own selection; it relies on the tap's default settings.
            # So for now, include all supported fields instead of leaving the choice to the user.
            # mdata = metadata.write(mdata, ('properties', field_name), 'inclusion', 'available')
            mdata = metadata.write(mdata, ('properties', field_name),
                                   'inclusion', 'automatic')
        else:
            mdata = metadata.write(mdata, ('properties', field_name),
                                   'inclusion', 'unsupported')

        properties[field_name] = field_properties

    # Zuora sends back more entities than are actually available. We need to
    # run a sample export to test if the stream is available. If we are using
    # AQuA, we also need to see if we can use the Deleted property for that
    # stream.
    if force_rest:
        status = apis.Rest.stream_status(client, stream_name)
    else:
        status = apis.Aqua.stream_status(client, stream_name)

    # If the entity is unavailable, we need to return None
    if status == "unavailable":
        return None
    if status == "available_with_deleted":
        properties["Deleted"] = {"type": "boolean"}
        mdata = metadata.write(mdata, ('properties', 'Deleted'), 'inclusion',
                               'available')

    stream = {
        "tap_stream_id": stream_name,
        "stream": stream_name,
        "key_properties": ["Id"],
        "schema": {
            "type": "object",
            "additionalProperties": False,
            "properties": properties,
        },
        'metadata': metadata.to_list(mdata)
    }

    replication_key = get_replication_key(properties)
    if replication_key:
        stream["replication_key"] = replication_key
        stream["replication_method"] = "INCREMENTAL"
    else:
        stream["replication_method"] = "FULL_TABLE"

    return stream
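
A hedged sketch of how discover_stream results might be collected into a catalog; the wrapper name and the way the list of Zuora entity names is obtained are assumptions:

import json
import sys

def do_discover(client, stream_names, force_rest=False):
    # stream_names: iterable of Zuora entity names; how it is obtained is assumed here.
    streams = []
    for name in stream_names:
        stream = discover_stream(client, name, force_rest)
        if stream:  # discover_stream returns None for unavailable entities
            streams.append(stream)
    json.dump({"streams": streams}, sys.stdout, indent=2)
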
Example #13
def get_schemas(client):
    schemas = {}
    field_metadata = {}

    for stream_name, stream_metadata in STREAMS.items():
        schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
        with open(schema_path) as file:
            schema = json.load(file)
        schemas[stream_name] = schema
        mdata = metadata.new()

        # Documentation:
        # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
        # Reference:
        # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
        mdata = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_metadata.get('key_properties', None),
            valid_replication_keys=stream_metadata.get('replication_keys',
                                                       None),
            replication_method=stream_metadata.get('replication_method', None))

        field_metadata[stream_name] = mdata

    # Limit report endpoints to those available to the account
    endpoint = 'report_types'
    report_type_data = client.get(
        url='https://youtubereporting.googleapis.com/v1',
        path='reportTypes',
        endpoint=endpoint)
    report_types = report_type_data.get('reportTypes', [])

    for report_type in report_types:
        # report_name = report id minus the version (last 3 chars)
        report_name = report_type.get('id')[:-3]
        report_metadata = REPORTS.get(report_name, {})

        schema_path = get_abs_path('schemas/reports.json')
        with open(schema_path) as file:
            schema = json.load(file)

        # dimensions, metrics, keys lists
        dimensions = report_metadata.get('dimensions', [])
        metrics = report_metadata.get('metrics', [])
        key_properties = ['dimensions_hash_key']
        report_fields = [
            'report_id', 'report_type_id', 'report_name', 'create_time'
        ]
        combined_list = [
            *dimensions, *metrics, *key_properties, *report_fields
        ]

        # remove keys not in combined_list
        remove = [
            key for key in schema['properties'] if key not in combined_list
        ]
        for key in remove:
            del schema['properties'][key]

        schemas[report_name] = schema
        mdata = metadata.new()

        mdata = metadata.get_standard_metadata(
            schema=schema,
            key_properties=key_properties,
            valid_replication_keys=['create_time'],
            replication_method='INCREMENTAL')

        # Set dimensions and create_time (bookmark) as automatic inclusion
        mdata_map = metadata.to_map(mdata)
        for dimension in dimensions:
            mdata_map[('properties', dimension)]['inclusion'] = 'automatic'
        mdata_map[('properties', 'create_time')]['inclusion'] = 'automatic'
        mdata = metadata.to_list(mdata_map)

        field_metadata[report_name] = mdata

    return schemas, field_metadata
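
get_schemas returns schemas and metadata separately; a hedged sketch of how a discovery entry point might combine them into a catalog (the do_discover wrapper is not shown in this excerpt, so its shape here is an assumption):

import json
import sys

def do_discover(client):
    schemas, field_metadata = get_schemas(client)
    streams = []
    for stream_name, schema in schemas.items():
        streams.append({
            'stream': stream_name,
            'tap_stream_id': stream_name,
            'schema': schema,
            'metadata': field_metadata[stream_name],
        })
    json.dump({'streams': streams}, sys.stdout, indent=2)
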
Example #14
def unselect_column(our_stream, col):
    md = metadata.to_map(our_stream['metadata'])
    md.get(('properties', col))['selected'] = False
    our_stream['metadata'] = metadata.to_list(md)
    return our_stream
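
A short usage sketch for unselect_column, assuming catalog is a discovered catalog dict shaped like the entries built in the other examples, with the column already present in the stream's metadata:

# Deselect a single column on an already-discovered stream.
accounts_stream = next(s for s in catalog['streams'] if s['stream'] == 'accounts')
accounts_stream = unselect_column(accounts_stream, 'customerId')
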
Example #15
def do_discover(qb):
    """Describes a Quickbooks instance's objects and generates a JSON schema for each field."""
    objects_to_discover = qb.describe()
    key_properties = ['Id']

    qb_custom_setting_objects = []
    object_to_tag_references = {}

    # For each Quickbooks object, describe it, loop over its fields and build a schema
    entries = []

    for sobject_name in objects_to_discover:

        fields = qb.describe(sobject_name)

        replication_key = REPLICATION_KEY
        if sobject_name.endswith('Report'):
            replication_key = None

        properties = {}
        mdata = metadata.new()

        # Loop over the object's fields
        for f in fields:
            field_name = f['name']

            property_schema, mdata = create_property_schema(f, mdata)

            inclusion = metadata.get(mdata, ('properties', field_name),
                                     'inclusion')

            if qb.select_fields_by_default:
                mdata = metadata.write(mdata, ('properties', field_name),
                                       'selected-by-default', True)

            properties[field_name] = property_schema

        if replication_key:
            mdata = metadata.write(mdata, ('properties', replication_key),
                                   'inclusion', 'automatic')

        if replication_key:
            mdata = metadata.write(mdata, (), 'valid-replication-keys',
                                   [replication_key])
        else:
            mdata = metadata.write(
                mdata, (), 'forced-replication-method', {
                    'replication-method': 'FULL_TABLE',
                    'reason':
                    'No replication keys found from the Quickbooks API'
                })

        mdata = metadata.write(mdata, (), 'table-key-properties',
                               key_properties)

        schema = {
            'type': 'object',
            'additionalProperties': False,
            'properties': properties
        }

        entry = {
            'stream': sobject_name,
            'tap_stream_id': sobject_name,
            'schema': schema,
            'metadata': metadata.to_list(mdata)
        }

        entries.append(entry)

    result = {'streams': entries}
    json.dump(result, sys.stdout, indent=4)
Example #16
def discover_catalog(mysql_conn: Dict, dbs: str = None, tables: Optional[str] = None):
    """Returns a Catalog describing the structure of the database."""

    if dbs:
        filter_dbs_clause = ",".join([f"'{db_name}'" for db_name in dbs.split(",")])
        table_schema_clause = f"WHERE table_schema IN ({filter_dbs_clause})"
    else:
        table_schema_clause = """
        WHERE table_schema NOT IN (
        'information_schema',
        'performance_schema',
        'mysql',
        'sys'
        )"""

    tables_clause = ""

    if tables is not None and tables != "":
        filter_tables_clause = ",".join(
            [f"'{table_name}'" for table_name in tables.split(",")]
        )
        tables_clause = f" AND table_name IN ({filter_tables_clause})"

    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            cur.execute(
                f"""
            SELECT table_schema,
                   table_name,
                   table_type,
                   table_rows
                FROM information_schema.tables
                {table_schema_clause}{tables_clause}
            """
            )

            table_info = {}

            for (db_name, table, table_type, rows) in cur.fetchall():
                if db_name not in table_info:
                    table_info[db_name] = {}

                table_info[db_name][table] = {
                    "row_count": rows,
                    "is_view": table_type == "VIEW",
                }

            cur.execute(
                f"""
                SELECT table_schema,
                       table_name,
                       column_name,
                       data_type,
                       character_maximum_length,
                       numeric_precision,
                       numeric_scale,
                       column_type,
                       column_key
                    FROM information_schema.columns
                    {table_schema_clause}{tables_clause}
                    ORDER BY table_schema, table_name
            """
            )

            columns = []
            rec = cur.fetchone()
            while rec is not None:
                columns.append(Column(*rec))
                rec = cur.fetchone()

            entries = []
            for (k, cols) in itertools.groupby(
                columns, lambda c: (c.table_schema, c.table_name)
            ):
                cols = list(cols)
                (table_schema, table_name) = k

                schema = Schema(
                    type="object",
                    properties={c.column_name: schema_for_column(c) for c in cols},
                )
                mdata = create_column_metadata(cols)
                md_map = metadata.to_map(mdata)

                md_map = metadata.write(md_map, (), "database-name", table_schema)

                is_view = table_info[table_schema][table_name]["is_view"]

                if (
                    table_schema in table_info
                    and table_name in table_info[table_schema]
                ):
                    row_count = table_info[table_schema][table_name].get("row_count")

                    if row_count is not None:
                        md_map = metadata.write(md_map, (), "row-count", row_count)

                    md_map = metadata.write(md_map, (), "is-view", is_view)

                column_is_key_prop = lambda c, s: (
                    c.column_key == "PRI"
                    and s.properties[c.column_name].inclusion != "unsupported"
                )

                key_properties = [
                    c.column_name for c in cols if column_is_key_prop(c, schema)
                ]

                if not is_view:
                    md_map = metadata.write(
                        md_map, (), "table-key-properties", key_properties
                    )

                entry = CatalogEntry(
                    table=table_name,
                    stream=table_name,
                    metadata=metadata.to_list(md_map),
                    tap_stream_id=common.generate_tap_stream_id(
                        table_schema, table_name
                    ),
                    schema=schema,
                )

                entries.append(entry)

    return Catalog(entries)
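For reference, this is roughly how the dbs and tables arguments above compose into the information_schema filter; the sketch just rebuilds the clauses with made-up names so the resulting SQL fragment is visible.

# Hypothetical inputs; real values come from the tap's configuration.
dbs = "shop,crm"
tables = "orders,customers"

filter_dbs_clause = ",".join(f"'{db_name}'" for db_name in dbs.split(","))
table_schema_clause = f"WHERE table_schema IN ({filter_dbs_clause})"

filter_tables_clause = ",".join(f"'{table_name}'" for table_name in tables.split(","))
tables_clause = f" AND table_name IN ({filter_tables_clause})"

print(table_schema_clause + tables_clause)
# WHERE table_schema IN ('shop','crm') AND table_name IN ('orders','customers')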
Example #17
0
def do_discover_report(sf):
    """Describes a Salesforce instance's reports and generates a JSON schema for each field."""
    sf_custom_setting_objects = []
    object_to_tag_references = {}

    # For each SF Report describe it, loop its fields and build a schema
    entries = []

    report_description = sf.describe()

    report_name = report_description['attributes']['reportName']
    fields = report_description['reportExtendedMetadata']['detailColumnInfo']

    unsupported_fields = set()
    properties = {}
    mdata = metadata.new()

    # Loop over the report's fields
    for field_name, field in fields.items():
        property_schema, mdata = create_report_property_schema(
            field, mdata, sf.source_type)

        # Compound Address fields and geolocations cannot be queried by the Bulk API, so we ignore them
        if field['dataType'] in (
                "address", "location"
        ) and sf.api_type == tap_salesforce.salesforce.BULK_API_TYPE:
            mdata.pop(('properties', field_name), None)
            continue

        # we haven't been able to observe any records with a json field, so we
        # are marking it as unavailable until we have an example to work with
        if field['dataType'] == "json":
            unsupported_fields.add((
                field_name,
                'do not currently support json fields - please contact support'
            ))

        inclusion = metadata.get(mdata, ('properties', field_name),
                                 'inclusion')

        if sf.select_fields_by_default and inclusion != 'unsupported':
            mdata = metadata.write(mdata, ('properties', field_name),
                                   'selected-by-default', True)

        properties[field_name] = property_schema

    # There are cases where compound fields are referenced by the associated
    # subfields but are not actually present in the field list
    field_name_set = {f for f in fields}
    filtered_unsupported_fields = [
        f for f in unsupported_fields if f[0] in field_name_set
    ]
    missing_unsupported_field_names = [
        f[0] for f in unsupported_fields if f[0] not in field_name_set
    ]

    if missing_unsupported_field_names:
        LOGGER.info(
            "Ignoring the following unsupported fields for report %s as they are missing from the field list: %s",
            sf.report_id, ', '.join(sorted(missing_unsupported_field_names)))

    if filtered_unsupported_fields:
        LOGGER.info(
            "Not syncing the following unsupported fields for report %s: %s",
            sf.report_id,
            ', '.join(sorted([k for k, _ in filtered_unsupported_fields])))

    # Any property added to unsupported_fields has metadata generated and
    # removed
    for prop, description in filtered_unsupported_fields:
        if metadata.get(mdata, ('properties', prop), 'selected-by-default'):
            metadata.delete(mdata, ('properties', prop), 'selected-by-default')

        mdata = metadata.write(mdata, ('properties', prop),
                               'unsupported-description', description)
        mdata = metadata.write(mdata, ('properties', prop), 'inclusion',
                               'unsupported')

    # this is the last entry with an empty breadcrumb, which is required otherwise the stream won't be picked up
    # table-key-properties is also required
    mdata = metadata.write(mdata, (), 'table-key-properties', [])

    schema = {
        'type': 'object',
        'additionalProperties': False,
        'properties': properties
    }

    entry = {
        'stream': report_name,
        'tap_stream_id': sf.report_id,
        'schema': schema,
        'metadata': metadata.to_list(mdata),
        'column_order': [str(column) for column in properties]
    }

    entries.append(entry)

    # For each custom setting field, remove its associated tag from entries
    # See Blacklisting.md for more information
    unsupported_tag_objects = [
        object_to_tag_references[f] for f in sf_custom_setting_objects
        if f in object_to_tag_references
    ]
    if unsupported_tag_objects:
        LOGGER.info(  # pylint:disable=logging-not-lazy
            "Skipping the following Tag objects, Tags on Custom Settings Salesforce objects "
            + "are not supported by the Bulk API:")
        LOGGER.info(unsupported_tag_objects)
        entries = [
            e for e in entries if e['stream'] not in unsupported_tag_objects
        ]

    result = {'streams': entries}
    json.dump(result, sys.stdout, indent=4)
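The unsupported-field handling above boils down to two metadata writes per field. A minimal, hypothetical illustration follows; the field name and reason text are invented, and it assumes singer-python.

from singer import metadata

mdata = metadata.new()
# Mark a made-up report column as unsupported and record why.
mdata = metadata.write(mdata, ('properties', 'Payload__c'), 'inclusion', 'unsupported')
mdata = metadata.write(mdata, ('properties', 'Payload__c'), 'unsupported-description',
                       'do not currently support json fields - please contact support')
print(metadata.to_list(mdata))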
Example #18
0
def discover_catalog(mysql_conn, config):
    '''Returns a Catalog describing the structure of the database.'''

    filter_dbs_config = config.get('filter_dbs')

    if filter_dbs_config:
        filter_dbs_clause = ",".join(["'{}'".format(db)
                                         for db in filter_dbs_config.split(",")])

        table_schema_clause = "WHERE table_schema IN ({})".format(filter_dbs_clause)
    else:
        table_schema_clause = """
        WHERE table_schema NOT IN (
        'information_schema',
        'performance_schema',
        'mysql'
        )"""

    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            cur.execute("""
            SELECT table_schema,
                   table_name,
                   table_type,
                   table_rows
                FROM information_schema.tables
                {}
            """.format(table_schema_clause))

            table_info = {}

            for (db, table, table_type, rows) in cur.fetchall():
                if db not in table_info:
                    table_info[db] = {}

                table_info[db][table] = {
                    'row_count': rows,
                    'is_view': table_type == 'VIEW'
                }

            cur.execute("""
                SELECT table_schema,
                       table_name,
                       column_name,
                       data_type,
                       character_maximum_length,
                       numeric_precision,
                       numeric_scale,
                       column_type,
                       column_key
                    FROM information_schema.columns
                    {}
                    ORDER BY table_schema, table_name
            """.format(table_schema_clause))

            columns = []
            rec = cur.fetchone()
            while rec is not None:
                columns.append(Column(*rec))
                rec = cur.fetchone()

            entries = []
            for (k, cols) in itertools.groupby(columns, lambda c: (c.table_schema, c.table_name)):
                cols = list(cols)
                (table_schema, table_name) = k
                schema = Schema(type='object',
                                properties={c.column_name: schema_for_column(c) for c in cols})
                md = create_column_metadata(cols)
                md_map = metadata.to_map(md)

                md_map = metadata.write(md_map,
                                        (),
                                        'database-name',
                                        table_schema)

                is_view = table_info[table_schema][table_name]['is_view']

                if table_schema in table_info and table_name in table_info[table_schema]:
                    row_count = table_info[table_schema][table_name].get('row_count')

                    if row_count is not None:
                        md_map = metadata.write(md_map,
                                                (),
                                                'row-count',
                                                row_count)

                    md_map = metadata.write(md_map,
                                            (),
                                            'is-view',
                                            is_view)

                column_is_key_prop = lambda c, s: (
                    c.column_key == 'PRI' and
                    s.properties[c.column_name].inclusion != 'unsupported'
                )

                key_properties = [c.column_name for c in cols if column_is_key_prop(c, schema)]

                if not is_view:
                    md_map = metadata.write(md_map,
                                            (),
                                            'table-key-properties',
                                            key_properties)

                entry = CatalogEntry(
                    table=table_name,
                    stream=table_name,
                    metadata=metadata.to_list(md_map),
                    tap_stream_id=common.generate_tap_stream_id(table_schema, table_name),
                    schema=schema)

                entries.append(entry)

    return Catalog(entries)
Example #19
0
def unselect_column(our_stream, col):
    md = metadata.to_map(our_stream["metadata"])
    md.get(("properties", col))["selected"] = False
    our_stream["metadata"] = metadata.to_list(md)
    return our_stream
Example #20
0
def get_activity_type_stream(activity):
    # Activity streams have 6 attributes:
    # - marketoGUID
    # - leadId
    # - activityDate
    # - activityTypeId
    # - primaryAttribute
    # - attributes
    #
    # marketoGUID, leadId, activityDate, and activityTypeId are simple
    # fields. primaryAttribute has a name and type which define an
    # automatically included field on the record. Attributes is an array
    # of attribute names and types that become available fields.

    # Regarding primaryAttribute fields: on this side of things, Marketo will
    # describe the field in an activity that is considered the primary attribute.
    # On the sync side, we will have to present that information in a flattened record.
    mdata = metadata.new()

    properties = {
        "marketoGUID": {
            "type": ["null", "string"]
        },
        "leadId": {
            "type": ["null", "integer"]
        },
        "activityDate": {
            "type": ["null", "string"],
            "format": "date-time"
        },
        "activityTypeId": {
            "type": ["null", "integer"]
        }
    }

    for prop in properties:
        mdata = metadata.write(mdata, ('properties', prop), 'inclusion',
                               'automatic')

    if "primaryAttribute" in activity:
        properties["primary_attribute_value"] = {"type": ["null", "string"]}
        properties["primary_attribute_name"] = {"type": ["null", "string"]}
        properties["primary_attribute_value_id"] = {"type": ["null", "string"]}

        mdata = metadata.write(mdata,
                               ('properties', "primary_attribute_value"),
                               'inclusion', 'automatic')
        mdata = metadata.write(mdata, ('properties', "primary_attribute_name"),
                               'inclusion', 'automatic')
        mdata = metadata.write(mdata,
                               ('properties', "primary_attribute_value_id"),
                               'inclusion', 'automatic')

        primary = clean_string(activity["primaryAttribute"]["name"])
        mdata = metadata.write(mdata, (), 'marketo.primary-attribute-name',
                               primary)

    if "attributes" in activity:
        for attr in activity["attributes"]:
            attr_name = clean_string(attr["name"])
            field_schema, mdata = get_schema_for_type(attr["dataType"],
                                                      breadcrumb=('properties',
                                                                  attr_name),
                                                      mdata=mdata,
                                                      null=True)
            if field_schema:
                properties[attr_name] = field_schema

    activity_type_camel = clean_string(activity["name"])
    mdata = metadata.write(mdata, (), 'marketo.activity-id', activity["id"])

    tap_stream_id = "activities_{}".format(activity_type_camel)

    return {
        "tap_stream_id": tap_stream_id,
        "stream": tap_stream_id,
        "key_properties": ["marketoGUID"],
        "metadata": metadata.to_list(mdata),
        "schema": {
            "type": "object",
            "additionalProperties": False,
            "properties": properties,
        },
    }
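For context, the function above expects an activity-type description with this general shape; the values here are invented for illustration and are not a real Marketo describe response.

activity = {
    "id": 1,
    "name": "Visit Webpage",
    "primaryAttribute": {"name": "Webpage ID", "dataType": "integer"},
    "attributes": [
        {"name": "Client IP Address", "dataType": "string"},
        {"name": "Query Parameters", "dataType": "string"},
    ],
}
# stream = get_activity_type_stream(activity)
# stream["tap_stream_id"] would be something like "activities_visit_webpage",
# depending on how clean_string normalizes the name.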
Example #21
0
def discover_catalog(snowflake_conn, config, select_all=False):
    """Returns a Catalog describing the structure of the database."""
    tables = config.get('tables').split(',')
    sql_columns = get_table_columns(snowflake_conn, tables)
    config_meta = config_meta_parser(config)

    table_info = {}
    columns = []
    for sql_col in sql_columns:
        catalog = sql_col['TABLE_CATALOG']
        schema = sql_col['TABLE_SCHEMA']
        table_name = sql_col['TABLE_NAME']

        if catalog not in table_info:
            table_info[catalog] = {}

        if schema not in table_info[catalog]:
            table_info[catalog][schema] = {}

        table_info[catalog][schema][table_name] = {
            'row_count': sql_col.get('ROW_COUNT'),
            'is_view': sql_col.get('TABLE_TYPE') == 'VIEW'
        }

        columns.append(
            Column(
                table_catalog=catalog,
                table_schema=schema,
                table_name=table_name,
                column_name=sql_col['COLUMN_NAME'],
                data_type=sql_col['DATA_TYPE'],
                character_maximum_length=sql_col['CHARACTER_MAXIMUM_LENGTH'],
                numeric_precision=sql_col['NUMERIC_PRECISION'],
                numeric_scale=sql_col['NUMERIC_SCALE']))

    entries = []
    for (k, cols) in itertools.groupby(
            columns, lambda c:
        (c.table_catalog, c.table_schema, c.table_name)):
        cols = list(cols)
        (table_catalog, table_schema, table_name) = k
        schema = Schema(
            type='object',
            properties={c.column_name: schema_for_column(c)
                        for c in cols})
        md = create_column_metadata(cols, select_all)
        md_map = metadata.to_map(md)

        md_map = metadata.write(md_map, (), 'database-name', table_catalog)
        md_map = metadata.write(md_map, (), 'schema-name', table_schema)

        if (table_catalog in table_info
                and table_schema in table_info[table_catalog]
                and table_name in table_info[table_catalog][table_schema]):
            # Row count for views returns NULL - transform it to a non-null integer by defaulting to 0
            row_count = table_info[table_catalog][table_schema][
                table_name].get('row_count', 0) or 0
            is_view = table_info[table_catalog][table_schema][table_name][
                'is_view']
            md_map = metadata.write(md_map, (), 'row-count', row_count)
            md_map = metadata.write(md_map, (), 'is-view', is_view)
            # if select_all is True, default replication-method to FULL_TABLE; it will be overridden if
            # the user defined INCREMENTAL in the config metadata
            if select_all:
                md_map = metadata.write(md_map, (), 'replication-method',
                                        'FULL_TABLE')

            # check config to see if there was optional rolling-lookback defined, inject into catalog if so
            full_table_name = f'{table_catalog}-{table_schema}-{table_name}'.upper()
            rolling = config.get('rolling-lookback')
            if rolling and full_table_name in rolling:
                rolling_table_meta = rolling.get(full_table_name)
                md_map = metadata.write(md_map, (), 'rolling-lookback',
                                        rolling_table_meta)

            # check config to see if there was optional metadata defined already
            full_table_name = f'{table_catalog}.{table_schema}.{table_name}'.upper()
            if config_meta and full_table_name in config_meta:
                table_meta = config_meta.get(full_table_name)
                for meta_key, meta_value in table_meta.items():
                    md_map = metadata.write(md_map, (), meta_key, meta_value)

            entry = CatalogEntry(table=table_name,
                                 stream=table_name,
                                 metadata=metadata.to_list(md_map),
                                 tap_stream_id=common.generate_tap_stream_id(
                                     table_catalog, table_schema, table_name),
                                 schema=schema)

            entries.append(entry)

    return Catalog(entries)
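Finally, a hypothetical sketch of the config shape that this snowflake discover_catalog appears to rely on. Only the 'tables' key is clearly required; the 'rolling-lookback' entry and its inner value are assumptions shown for illustration, and the exact table-name format expected by get_table_columns is not visible here.

config = {
    'tables': 'ANALYTICS.PUBLIC.ORDERS,ANALYTICS.PUBLIC.CUSTOMERS',
    'rolling-lookback': {
        # Keyed as CATALOG-SCHEMA-TABLE, upper-cased, per the lookup above.
        'ANALYTICS-PUBLIC-ORDERS': {'days': 7},  # inner shape is an assumption
    },
}
# catalog = discover_catalog(snowflake_conn, config, select_all=True)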