Example No. 1
def sync(config, state, stream):
    table_name = stream['tap_stream_id']

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, table_name, 'version') is None

    # last run was interrupted if there is a last_evaluated_key bookmark
    was_interrupted = singer.get_bookmark(state, table_name,
                                          'last_evaluated_key') is not None

    # pick a new table version if last run wasn't interrupted
    if was_interrupted:
        stream_version = singer.get_bookmark(state, table_name, 'version')
    else:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, table_name, 'version', stream_version)
    singer.write_state(state)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_version(table_name, stream_version)

    last_evaluated_key = singer.get_bookmark(state, table_name,
                                             'last_evaluated_key')

    md_map = metadata.to_map(stream['metadata'])
    projection = metadata.get(md_map, (), 'tap-mongodb.projection')

    rows_saved = 0

    deserializer = Deserializer()
    for result in scan_table(table_name, projection, last_evaluated_key,
                             config):
        for item in result.get('Items', []):
            rows_saved += 1
            # TODO: Do we actually have to put the item we retrieve from
            # dynamo into a map before we can deserialize?
            record = deserializer.deserialize_item(item)
            record_message = singer.RecordMessage(stream=table_name,
                                                  record=record,
                                                  version=stream_version)

            singer.write_message(record_message)
        if result.get('LastEvaluatedKey'):
            state = singer.write_bookmark(state, table_name,
                                          'last_evaluated_key',
                                          result.get('LastEvaluatedKey'))
            singer.write_state(state)

    state = singer.clear_bookmark(state, table_name, 'last_evaluated_key')

    state = singer.write_bookmark(state, table_name,
                                  'initial_full_table_complete', True)

    singer.write_state(state)

    singer.write_version(table_name, stream_version)

    return rows_saved
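
For orientation, here is a minimal, illustrative sketch (plain dictionaries only, no DynamoDB or singer calls) of the state shape that sync() reads and writes through singer.get_bookmark / singer.write_bookmark; the table name and values are made up.

# Hypothetical state after an interrupted full-table scan of "my_table".
state = {
    'bookmarks': {
        'my_table': {
            'version': 1700000000000,                   # table version picked at sync start
            'last_evaluated_key': {'id': {'S': '42'}},  # only present if the scan was interrupted
        }
    }
}

bookmarks = state['bookmarks'].get('my_table', {})
first_run = bookmarks.get('version') is None                        # False: a version already exists
was_interrupted = bookmarks.get('last_evaluated_key') is not None   # True: resume the scan from here
print(first_run, was_interrupted)
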
Example No. 2
def do_discover(sf):
    """Describes a Salesforce instance's objects and generates a JSON schema for each field."""
    global_description = sf.describe()

    objects_to_discover = {o['name'] for o in global_description['sobjects']}
    key_properties = ['Id']

    sf_custom_setting_objects = []
    object_to_tag_references = {}

    # For each SF object, describe it, loop over its fields and build a schema
    entries = []
    for sobject_name in objects_to_discover:

        # Skip blacklisted SF objects depending on the api_type in use
        if sobject_name in sf.get_blacklisted_objects():
            continue

        sobject_description = sf.describe(sobject_name)

        # Cache customSetting and Tag objects to check for blacklisting after
        # all objects have been described
        if sobject_description.get("customSetting"):
            sf_custom_setting_objects.append(sobject_name)
        elif sobject_name.endswith("__Tag"):
            relationship_field = next((f for f in sobject_description["fields"]
                                       if f.get("relationshipName") == "Item"),
                                      None)
            if relationship_field:
                # Map {"Object":"Object__Tag"}
                object_to_tag_references[relationship_field["referenceTo"]
                                         [0]] = sobject_name

        fields = sobject_description['fields']
        replication_key = get_replication_key(sobject_name, fields)

        unsupported_fields = set()
        properties = {}
        mdata = metadata.new()

        found_id_field = False

        # Loop over the object's fields
        for f in fields:
            field_name = f['name']

            if field_name == "Id":
                found_id_field = True

            property_schema, mdata = create_property_schema(f, mdata)

            # Compound Address fields cannot be queried by the Bulk API
            if f['type'] == "address" and sf.api_type == tap_salesforce.salesforce.BULK_API_TYPE:
                unsupported_fields.add(
                    (field_name,
                     'cannot query compound address fields with bulk API'))

            # Blacklisted fields are dependent on the api_type being used
            field_pair = (sobject_name, field_name)
            if field_pair in sf.get_blacklisted_fields():
                unsupported_fields.add(
                    (field_name, sf.get_blacklisted_fields()[field_pair]))

            inclusion = metadata.get(mdata, ('properties', field_name),
                                     'inclusion')

            if sf.select_fields_by_default and inclusion != 'unsupported':
                mdata = metadata.write(mdata, ('properties', field_name),
                                       'selected-by-default', True)

            properties[field_name] = property_schema

        if replication_key:
            mdata = metadata.write(mdata, ('properties', replication_key),
                                   'inclusion', 'automatic')

        # There are cases where compound fields are referenced by the associated
        # subfields but are not actually present in the field list
        field_name_set = {f['name'] for f in fields}
        filtered_unsupported_fields = [
            f for f in unsupported_fields if f[0] in field_name_set
        ]
        missing_unsupported_field_names = [
            f[0] for f in unsupported_fields if f[0] not in field_name_set
        ]

        if missing_unsupported_field_names:
            LOGGER.info(
                "Ignoring the following unsupported fields for object %s as they are missing from the field list: %s",
                sobject_name,
                ', '.join(sorted(missing_unsupported_field_names)))

        if filtered_unsupported_fields:
            LOGGER.info(
                "Not syncing the following unsupported fields for object %s: %s",
                sobject_name,
                ', '.join(sorted([k for k, _ in filtered_unsupported_fields])))

        # Salesforce Objects are skipped when they do not have an Id field
        if not found_id_field:
            LOGGER.info("Skipping Salesforce Object %s, as it has no Id field",
                        sobject_name)
            continue

        # Any property in unsupported_fields gets unsupported metadata written
        # and its selected-by-default flag removed
        for prop, description in filtered_unsupported_fields:
            if metadata.get(mdata, ('properties', prop),
                            'selected-by-default'):
                metadata.delete(mdata, ('properties', prop),
                                'selected-by-default')

            mdata = metadata.write(mdata, ('properties', prop),
                                   'unsupported-description', description)
            mdata = metadata.write(mdata, ('properties', prop), 'inclusion',
                                   'unsupported')

        if replication_key:
            mdata = metadata.write(mdata, (), 'valid-replication-keys',
                                   [replication_key])
        else:
            mdata = metadata.write(
                mdata, (), 'forced-replication-method', {
                    'replication-method': 'FULL_TABLE',
                    'reason':
                    'No replication keys found from the Salesforce API'
                })

        mdata = metadata.write(mdata, (), 'table-key-properties',
                               key_properties)

        schema = {
            'type': 'object',
            'additionalProperties': False,
            'properties': properties
        }

        entry = {
            'stream': sobject_name,
            'tap_stream_id': sobject_name,
            'schema': schema,
            'metadata': metadata.to_list(mdata)
        }

        entries.append(entry)

    # For each custom setting field, remove its associated tag from entries
    # See Blacklisting.md for more information
    unsupported_tag_objects = [
        object_to_tag_references[f] for f in sf_custom_setting_objects
        if f in object_to_tag_references
    ]
    if unsupported_tag_objects:
        LOGGER.info(  #pylint:disable=logging-not-lazy
            "Skipping the following Tag objects, Tags on Custom Settings Salesforce objects "
            + "are not supported by the Bulk API:")
        LOGGER.info(unsupported_tag_objects)
        entries = [
            e for e in entries if e['stream'] not in unsupported_tag_objects
        ]

    result = {'streams': entries}
    json.dump(result, sys.stdout, indent=4)
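
As a companion to the discovery code above, this is a small sketch of how the singer-python metadata helpers compose a catalog entry; it assumes singer-python is installed, and the object name, fields, and replication key are invented for illustration.

import json
from singer import metadata

# Start with empty compiled metadata, write field-level and top-level entries
# the same way do_discover() does, then serialize back to a list for the catalog.
mdata = metadata.new()
mdata = metadata.write(mdata, ('properties', 'Id'), 'inclusion', 'automatic')
mdata = metadata.write(mdata, ('properties', 'Name'), 'selected-by-default', True)
mdata = metadata.write(mdata, (), 'table-key-properties', ['Id'])
mdata = metadata.write(mdata, (), 'valid-replication-keys', ['SystemModstamp'])

entry = {
    'stream': 'Account',
    'tap_stream_id': 'Account',
    'schema': {
        'type': 'object',
        'additionalProperties': False,
        'properties': {'Id': {'type': 'string'}, 'Name': {'type': 'string'}},
    },
    'metadata': metadata.to_list(mdata),
}
print(json.dumps(entry, indent=4))
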
Example No. 3
def sync_collection(client, stream, state, projection):
    tap_stream_id = stream['tap_stream_id']
    LOGGER.info('Starting full table sync for %s', tap_stream_id)

    md_map = metadata.to_map(stream['metadata'])
    database_name = metadata.get(md_map, (), 'database-name')

    db = client[database_name]
    collection = db[stream['stream']]

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'],
                                    'version') is None

    # last run was interrupted if there is a last_id_fetched bookmark
    was_interrupted = singer.get_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched') is not None

    # pick a new table version if last run wasn't interrupted
    if was_interrupted:
        stream_version = singer.get_bookmark(state, stream['tap_stream_id'],
                                             'version')
    else:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version',
                                  stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    if singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_value'):
        # There is a bookmark
        max_id_value = singer.get_bookmark(state, stream['tap_stream_id'],
                                           'max_id_value')
        max_id_type = singer.get_bookmark(state, stream['tap_stream_id'],
                                          'max_id_type')
        max_id_value = common.string_to_class(max_id_value, max_id_type)
    else:
        max_id_value = get_max_id_value(collection)

    last_id_fetched = singer.get_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched')

    if max_id_value:
        # Write the bookmark if max_id_value is defined
        state = singer.write_bookmark(
            state, stream['tap_stream_id'], 'max_id_value',
            common.class_to_string(max_id_value,
                                   max_id_value.__class__.__name__))
        state = singer.write_bookmark(state, stream['tap_stream_id'],
                                      'max_id_type',
                                      max_id_value.__class__.__name__)

    find_filter = {'$lte': max_id_value}
    if last_id_fetched:
        last_id_fetched_type = singer.get_bookmark(state,
                                                   stream['tap_stream_id'],
                                                   'last_id_fetched_type')
        find_filter['$gte'] = common.string_to_class(last_id_fetched,
                                                     last_id_fetched_type)

    query_message = 'Querying {} with:\n\tFind Parameters: {}'.format(
        stream['tap_stream_id'], find_filter)
    if projection:
        query_message += '\n\tProjection: {}'.format(projection)
    # pylint: disable=logging-format-interpolation
    LOGGER.info(query_message)

    with collection.find({'_id': find_filter},
                         projection,
                         sort=[("_id", pymongo.ASCENDING)]) as cursor:
        rows_saved = 0
        time_extracted = utils.now()
        start_time = time.time()

        schema = {"type": "object", "properties": {}}
        for row in cursor:
            rows_saved += 1

            schema_build_start_time = time.time()
            if common.row_to_schema(schema, row):
                singer.write_message(
                    singer.SchemaMessage(
                        stream=common.calculate_destination_stream_name(
                            stream),
                        schema=schema,
                        key_properties=['_id']))
                common.SCHEMA_COUNT[stream['tap_stream_id']] += 1
            common.SCHEMA_TIMES[stream['tap_stream_id']] += time.time(
            ) - schema_build_start_time

            record_message = common.row_to_singer_record(
                stream, row, stream_version, time_extracted)

            singer.write_message(record_message)

            state = singer.write_bookmark(
                state, stream['tap_stream_id'], 'last_id_fetched',
                common.class_to_string(row['_id'],
                                       row['_id'].__class__.__name__))
            state = singer.write_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched_type',
                                          row['_id'].__class__.__name__)

            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

        common.COUNTS[tap_stream_id] += rows_saved
        common.TIMES[tap_stream_id] += time.time() - start_time

    # clear max pk value and last pk fetched upon successful sync
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_value')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_type')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'last_id_fetched')
    singer.clear_bookmark(state, stream['tap_stream_id'],
                          'last_id_fetched_type')

    state = singer.write_bookmark(state, stream['tap_stream_id'],
                                  'initial_full_table_complete', True)

    singer.write_message(activate_version_message)

    LOGGER.info('Synced {} records for {}'.format(rows_saved, tap_stream_id))
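
To illustrate just the resumable query logic, here is a dictionary-only sketch of the _id range filter built above; the numeric ids stand in for real ObjectIds.

max_id_value = 1000      # stand-in for the max _id captured when the sync began
last_id_fetched = 250    # stand-in for the interrupted-run bookmark (None on a fresh run)

find_filter = {'$lte': max_id_value}
if last_id_fetched is not None:
    find_filter['$gte'] = last_id_fetched

# The cursor above is then opened roughly as:
#   collection.find({'_id': find_filter}, projection,
#                   sort=[('_id', pymongo.ASCENDING)])
print(find_filter)  # {'$lte': 1000, '$gte': 250}
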
Example No. 4
def sync_stream(stream_name):
    """
    Sync each stream, looking for newly created records. Updates are captured by events stream.
    """
    LOGGER.info("Started syncing stream %s", stream_name)

    stream_metadata = metadata.to_map(
        Context.get_catalog_entry(stream_name)['metadata'])
    stream_field_whitelist = json.loads(
        Context.config.get('whitelist_map', '{}')).get(stream_name)

    extraction_time = singer.utils.now()
    replication_key = metadata.get(stream_metadata, (),
                                   'valid-replication-keys')[0]
    # Invoice Items bookmarks on `date`, but queries on `created`
    filter_key = 'created' if stream_name == 'invoice_items' else replication_key
    stream_bookmark = singer.get_bookmark(Context.state, stream_name, replication_key) or \
        int(utils.strptime_to_utc(Context.config["start_date"]).timestamp())
    bookmark = stream_bookmark

    # if this stream has a sub_stream, compare the bookmark
    sub_stream_name = SUB_STREAMS.get(stream_name)

    # If there is a sub-stream and it's selected, get its bookmark (or the start date if there is no bookmark)
    should_sync_sub_stream = sub_stream_name and Context.is_selected(
        sub_stream_name)
    if should_sync_sub_stream:
        sub_stream_bookmark = singer.get_bookmark(Context.state, sub_stream_name, replication_key) \
            or int(utils.strptime_to_utc(Context.config["start_date"]).timestamp())

        # if there is a sub stream, set bookmark to sub stream's bookmark
        # since we know it must be earlier than the stream's bookmark
        if sub_stream_bookmark != stream_bookmark:
            bookmark = sub_stream_bookmark
    else:
        sub_stream_bookmark = None

    with Transformer(
            singer.UNIX_SECONDS_INTEGER_DATETIME_PARSING) as transformer:
        end_time = dt_to_epoch(utils.now())
        window_size = int(
            Context.config.get('date_window_size', DEFAULT_DATE_WINDOW_SIZE))
        if DEFAULT_DATE_WINDOW_SIZE != window_size:
            LOGGER.info('Using non-default date window size of %d',
                        window_size)
        start_window = bookmark

        # NB: Immutable streams are never synced for updates. We've
        # observed a short lag period between when records are created and
        # when they are available via the API, so these streams will need
        # a short lookback window.
        if stream_name in IMMUTABLE_STREAMS:
            # pylint:disable=fixme
            # TODO: This may be an issue for other streams' created_at
            # entries, but to keep the surface small, doing this only for
            # immutable streams at first to confirm the suspicion.
            start_window -= IMMUTABLE_STREAM_LOOKBACK

        # NB: We observed records coming through newest->oldest and so
        # date-windowing was added and the tap only bookmarks after it has
        # gotten through a date window
        while start_window < end_time:
            stop_window = dt_to_epoch(
                epoch_to_dt(start_window) + timedelta(days=window_size))
            # cut off the last window at the end time
            if stop_window > end_time:
                stop_window = end_time

            for stream_obj in paginate(
                    STREAM_SDK_OBJECTS[stream_name]['sdk_object'], filter_key,
                    start_window, stop_window):

                # get the replication key value from the object
                rec = unwrap_data_objects(stream_obj.to_dict_recursive())
                rec = reduce_foreign_keys(rec, stream_name)
                stream_obj_created = rec[replication_key]
                rec['updated'] = stream_obj_created

                # sync stream if object is greater than or equal to the bookmark
                if stream_obj_created >= stream_bookmark:
                    rec = transformer.transform(
                        rec,
                        Context.get_catalog_entry(stream_name)['schema'],
                        stream_metadata)

                    # At this point, the record has been transformed and so
                    # any de-selected fields have been pruned. Now, prune off
                    # any fields that aren't present in the whitelist.
                    if stream_field_whitelist:
                        rec = apply_whitelist(rec, stream_field_whitelist)

                    singer.write_record(stream_name,
                                        rec,
                                        time_extracted=extraction_time)

                    Context.new_counts[stream_name] += 1

                # sync the sub stream if it's selected and the parent object
                # is greater than its bookmark
                if should_sync_sub_stream and stream_obj_created > sub_stream_bookmark:
                    sync_sub_stream(sub_stream_name, stream_obj)

            # Update the stream/sub-stream bookmarks to the stop window
            if stop_window > stream_bookmark:
                stream_bookmark = stop_window
                singer.write_bookmark(Context.state, stream_name,
                                      replication_key, stream_bookmark)

            # the sub stream bookmarks on its parent
            if should_sync_sub_stream and stop_window > sub_stream_bookmark:
                sub_stream_bookmark = stop_window
                singer.write_bookmark(Context.state, sub_stream_name,
                                      replication_key, sub_stream_bookmark)

            singer.write_state(Context.state)

            # update window for next iteration
            start_window = stop_window

    singer.write_state(Context.state)
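
The date-windowing loop is the part that is easiest to get wrong, so here is a self-contained sketch of it; iter_windows and the 30-day default are hypothetical names and values chosen for illustration, not the tap's actual constants.

from datetime import datetime, timedelta, timezone

def iter_windows(bookmark_epoch, window_days=30):
    """Walk from the bookmark to now in fixed-size windows, clamping the last one."""
    end_time = int(datetime.now(timezone.utc).timestamp())
    start = bookmark_epoch
    while start < end_time:
        stop = int((datetime.fromtimestamp(start, timezone.utc)
                    + timedelta(days=window_days)).timestamp())
        stop = min(stop, end_time)
        yield start, stop   # query the API for records created in [start, stop)
        start = stop        # the bookmark only advances once a window completes

for start, stop in iter_windows(int(datetime(2023, 1, 1, tzinfo=timezone.utc).timestamp())):
    pass  # fetch and emit records for this window
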
Example No. 5
def is_selected(cls, stream_name):
    stream = cls.get_catalog_entry(stream_name)
    stream_metadata = metadata.to_map(stream['metadata'])
    return metadata.get(stream_metadata, (), 'selected')
Example No. 6
def sync_log_based(config, state, stream):
    table_name = stream['tap_stream_id']

    client = dynamodb.get_client(config)
    streams_client = dynamodb.get_stream_client(config)

    md_map = metadata.to_map(stream['metadata'])
    projection = metadata.get(md_map, (), 'tap-mongodb.projection')
    if projection is not None:
        projection = [x.strip().split('.') for x in projection.split(',')]

    # Write activate version message
    stream_version = singer.get_bookmark(state, table_name, 'version')
    singer.write_version(table_name, stream_version)

    table = client.describe_table(TableName=table_name)['Table']
    stream_arn = table['LatestStreamArn']
    seq_number_bookmarks = singer.get_bookmark(state, table_name,
                                               'shard_seq_numbers')
    if not seq_number_bookmarks:
        seq_number_bookmarks = {}

    deserializer = deserialize.Deserializer()

    rows_saved = 0

    for shard in get_shards(streams_client, stream_arn):
        # check for bookmark
        seq_number = seq_number_bookmarks.get(shard['ShardId'])
        if seq_number:
            iterator_type = 'AFTER_SEQUENCE_NUMBER'
        else:
            iterator_type = 'TRIM_HORIZON'

        for record in get_shard_records(streams_client, stream_arn, shard,
                                        iterator_type, seq_number):
            if record['eventName'] == 'REMOVE':
                record_message = deserializer.deserialize_item(
                    record['dynamodb']['Keys'])
                record_message[SDC_DELETED_AT] = singer.utils.strftime(
                    record['dynamodb']['ApproximateCreationDateTime'])
            else:
                record_message = deserializer.deserialize_item(
                    record['dynamodb'].get('NewImage'))
                if record_message is None:
                    LOGGER.fatal(
                        'Dynamo stream view type must be either "NEW_IMAGE" or "NEW_AND_OLD_IMAGES"'
                    )
                    raise RuntimeError(
                        'Dynamo stream view type must be either "NEW_IMAGE" or "NEW_AND_OLD_IMAGES"'
                    )
                if projection is not None:
                    try:
                        record_message = deserializer.apply_projection(
                            record_message, projection)
                    except Exception:
                        LOGGER.fatal(
                            "Projection failed to apply: %s",
                            metadata.get(md_map, (), 'tap-mongodb.projection'))
                        raise RuntimeError(
                            'Projection failed to apply: {}'.format(
                                metadata.get(md_map, (),
                                             'tap-mongodb.projection')))

            record_message = singer.RecordMessage(stream=table_name,
                                                  record=record_message,
                                                  version=stream_version)
            singer.write_message(record_message)

            rows_saved += 1

            seq_number_bookmarks[
                shard['ShardId']] = record['dynamodb']['SequenceNumber']
            state = singer.write_bookmark(state, table_name,
                                          'shard_seq_numbers',
                                          seq_number_bookmarks)

            if rows_saved % WRITE_STATE_PERIOD == 0:
                singer.write_state(state)

        # If the shard we just finished syncing is closed (i.e. has an
        # EndingSequenceNumber), pop it off
        if shard['SequenceNumberRange'].get('EndingSequenceNumber'):
            # Must check if the bookmark exists because if a shard has 0
            # records we will never set a bookmark for the shard
            if seq_number_bookmarks.get(shard['ShardId']):
                seq_number_bookmarks.pop(shard['ShardId'])
                state = singer.write_bookmark(state, table_name,
                                              'shard_seq_numbers',
                                              seq_number_bookmarks)

        singer.write_state(state)

    return rows_saved
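
For reference, a plain-dictionary sketch of the shard_seq_numbers bookmark the log-based sync maintains; the table name, shard id, and sequence number are invented.

# Each open shard maps to the last DynamoDB Streams sequence number emitted for it.
# Closed shards are popped from the map once fully drained, as in the code above.
state = {
    'bookmarks': {
        'my_table': {
            'version': 1700000000000,
            'shard_seq_numbers': {
                'shardId-00000001700000000000-example': '495903382714902566085596925',
            },
        }
    }
}
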
Example No. 7
def stream_is_selected(stream):
    md_map = metadata.to_map(stream.metadata)
    selected_md = metadata.get(md_map, (), "selected")

    return selected_md
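
A small usage sketch for this selection check (and the one in Example No. 5), assuming singer-python is installed; the catalog metadata below is invented.

from singer import metadata

catalog_metadata = [
    {'breadcrumb': [], 'metadata': {'selected': True, 'table-key-properties': ['id']}},
    {'breadcrumb': ['properties', 'id'], 'metadata': {'inclusion': 'automatic'}},
]

md_map = metadata.to_map(catalog_metadata)
print(metadata.get(md_map, (), 'selected'))  # True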