Example #1
 def update_bookmark(self, state, value):
     current_bookmark = self.get_bookmark(state)
     if value and utils.strptime_with_tz(value) > current_bookmark:
         singer.write_bookmark(state, self.name, self.replication_key,
                               value)
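
A minimal standalone sketch of the behavior these examples rely on (not taken from any tap): singer's utils.strptime_with_tz parses an ISO-8601 string into a timezone-aware datetime, so bookmark values can be compared chronologically rather than as strings. The timestamps below are illustrative.

from singer import utils

old_bookmark = utils.strptime_with_tz("2021-01-01T00:00:00Z")
new_value = utils.strptime_with_tz("2021-06-01T12:30:00Z")

# Both are timezone-aware datetimes, so ordinary comparison operators
# give a chronological ordering, as in update_bookmark above.
assert new_value > old_bookmark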
Example #2
def sync_records(ns, catalog_entry, state, counter):
    chunked_bookmark = singer_utils.strptime_with_tz(
        ns.get_start_date(state, catalog_entry))
    stream = catalog_entry['stream']
    schema = catalog_entry['schema']
    stream_alias = catalog_entry.get('stream_alias')
    catalog_metadata = metadata.to_map(catalog_entry['metadata'])
    replication_key = catalog_metadata.get((), {}).get('replication-key')
    stream_version = get_stream_version(catalog_entry, state)
    activate_version_message = singer.ActivateVersionMessage(
        stream=(stream_alias or stream), version=stream_version)

    start_time = singer_utils.now()

    LOGGER.info('Syncing NetSuite data for stream %s', stream)

    previous_max_replication_key = None

    query_func = ns.query

    for rec in query_func(ns, catalog_entry, state):

        counter.increment()
        with Transformer(
                pre_hook=transform_data_hook(ns, stream)) as transformer:
            rec = transformer.transform(rec, schema)

        singer.write_message(
            singer.RecordMessage(stream=(stream_alias or stream),
                                 record=rec,
                                 version=stream_version,
                                 time_extracted=start_time))

        if replication_key:
            _rec = rec.get(replication_key, None)
            original_replication_key_value = ""
            replication_key_value = None
            if _rec is not None:
                original_replication_key_value = _rec
                replication_key_value = singer_utils.strptime_with_tz(
                    original_replication_key_value)

            # Before writing a bookmark, make sure NetSuite has not given us a
            # record with one outside our range
            if previous_max_replication_key is None or (
                    replication_key_value
                    and replication_key_value <= start_time
                    and replication_key_value > previous_max_replication_key):
                state = singer.write_bookmark(state,
                                              catalog_entry['tap_stream_id'],
                                              replication_key,
                                              original_replication_key_value)
                previous_max_replication_key = replication_key_value

            # Tables with no replication_key will send an
            # activate_version message for the next sync

    if not replication_key:
        singer.write_message(activate_version_message)
        state = singer.write_bookmark(state, catalog_entry['tap_stream_id'],
                                      'version', None)
Example #3
    def test_run(self):
        # Connect to stitch service.
        runner.run_check_job_and_check_status(self)

        # Get and check streams.
        self.found_catalogs = menagerie.get_catalogs(self.conn_id)
        self.assertEqual(len(self.found_catalogs), 9, msg="unable to locate schemas for connection {}".format(self.conn_id))

        # Match streams.
        our_catalogs = [c for c in self.found_catalogs if c.get('tap_stream_id') in self.expected_sync_streams()]
        for c in our_catalogs:
            c_annotated = menagerie.get_annotated_schema(self.conn_id, c['stream_id'])
            c_metadata = metadata.to_map(c_annotated['metadata'])
            connections.select_catalog_and_fields_via_metadata(self.conn_id, c, c_annotated, [], [])

        # Clear state before our run
        menagerie.set_state(self.conn_id, {})

        # Run a sync job using orchestrator, verify tap and target exit codes
        # and verify actual rows were synced.
        first_sync_record_count = self.run_sync(self.conn_id)
        replicated_row_count = reduce(lambda accum, c: accum + c, first_sync_record_count.values())
        self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(first_sync_record_count))
        print("total replicated row count: {}".format(replicated_row_count))

        # Get incremental vs. non-incremental streams.
        non_incremental_streams = {key for key, value in self.expected_replication_method().items() if value != 'INCREMENTAL'}
        incremental_streams = {key for key, value in self.expected_replication_method().items() if value == 'INCREMENTAL'}

        # Get bookmark and state data for first sync, excluding full table streams.
        first_sync_state = menagerie.get_state(self.conn_id)
        first_sync_records = runner.get_records_from_target_output()

        for v in non_incremental_streams:
            first_sync_records.pop(v, None)

        first_max_bookmarks = self.max_bookmarks_by_stream(first_sync_records)
        first_min_bookmarks = self.min_bookmarks_by_stream(first_sync_records)

        # Run a second sync job using orchestrator.
        second_sync_record_count = self.run_sync(self.conn_id)

        # Get data about rows synced, excluding full table streams.
        second_sync_records = runner.get_records_from_target_output()

        for v in non_incremental_streams:
            second_sync_records.pop(v, None)

        second_min_bookmarks = self.min_bookmarks_by_stream(second_sync_records)

        for stream in incremental_streams:
            if stream in {'tasks', 'groups'}:
                continue
            with self.subTest(stream=stream):
                # get bookmark values from state and target data
                stream_bookmark_key = self.expected_rks().get(stream, set())
                assert len(stream_bookmark_key) == 1  # There shouldn't be a compound replication key
                stream_bookmark_key = stream_bookmark_key.pop()

                if not first_sync_state.get("bookmarks", {}).get(stream, None):
                    # Some streams require more than a free tier plan (tasks)
                    continue

                state_value = first_sync_state.get("bookmarks", {}).get(
                    stream, {None: None}).get(stream_bookmark_key)
                target_value = first_max_bookmarks.get(
                    stream, {None: None}).get(stream_bookmark_key)
                target_min_value = first_min_bookmarks.get(
                    stream, {None: None}).get(stream_bookmark_key)

                # Convert everything to datetime.

                state_value = utils.strptime_with_tz(state_value)
                target_value = utils.strptime_with_tz(target_value)
                target_min_value = utils.strptime_with_tz(target_min_value)

                # verify that there is data with different bookmark values - setup necessary
                self.assertTrue(target_value >= target_min_value, msg="Data isn't set up to be able to test bookmarks")

                # verify state agrees with target data after 1st sync
                self.assertEqual(state_value, target_value, msg="The bookmark value isn't correct based on target data")

                # verify that you get less data the 2nd time around
                self.assertGreater(
                    first_sync_record_count.get(stream, 0),
                    second_sync_record_count.get(stream, 0),
                    msg="second sync didn't have less records, bookmark usage not verified")

                if len(second_sync_records) > 0 and len(second_min_bookmarks) > 0:
                    # verify all data from 2nd sync >= 1st bookmark
                    target_value = second_min_bookmarks.get(stream, {None: None}).get(stream_bookmark_key)
                    target_value = utils.strptime_with_tz(target_value)
                    # verify that the minimum bookmark sent to the target for the second sync
                    # is greater than or equal to the bookmark from the first sync
                    self.assertTrue(target_value >= state_value)
Example #4
def get_attribution_window_bookmark(customer_id, stream_name):
    mid_bk_value = bookmarks.get_bookmark(
        STATE, state_key_name(customer_id, stream_name),
        'last_attribution_window_date')
    return utils.strptime_with_tz(mid_bk_value) if mid_bk_value else None
Example #5
 def is_session_bookmark_old(self, value):
     if self.session_bookmark is None:
         return True
     return utils.strptime_with_tz(value) > utils.strptime_with_tz(
         self.session_bookmark)
Example #6
def sync_deals(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_with_tz(get_start(STATE, "deals", bookmark_key))
    max_bk_value = start
    LOGGER.info("sync_deals from %s", start)
    most_recent_modified_time = start
    params = {'count': 250, 'includeAssociations': False, 'properties': []}

    schema = load_schema("deals")
    singer.write_schema("deals", schema, ["dealId"], [bookmark_key],
                        catalog.get('stream_alias'))

    # Check if we should include associations
    for key in mdata.keys():
        if 'associations' in key:
            assoc_mdata = mdata.get(key)
            if assoc_mdata.get('selected') == True:
                params['includeAssociations'] = True

    v3_fields = None
    has_selected_properties = mdata.get(('properties', 'properties'),
                                        {}).get('selected')
    if has_selected_properties or has_selected_custom_field(mdata):
        # On 2/12/20, hubspot added a lot of additional properties for
        # deals, and appending all of them to requests ended up leading to
        # 414 (url-too-long) errors. Hubspot recommended we use the
        # `includeAllProperties` and `allPropertiesFetchMode` params
        # instead.
        params['includeAllProperties'] = True
        params['allPropertiesFetchMode'] = 'latest_version'

        # Grab selected `hs_date_entered/exited` fields to call the v3 endpoint with
        v3_fields = [
            x[1].replace('property_', '') for x, y in mdata.items()
            if x and (y.get('selected') == True or has_selected_properties) and
            ('hs_date_entered' in x[1] or 'hs_date_exited' in x[1])
        ]

    url = get_url('deals_all')
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE,
                               'deals',
                               url,
                               params,
                               'deals',
                               "hasMore", ["offset"], ["offset"],
                               v3_fields=v3_fields):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key][
                    'timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate'][
                    'timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)
            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = bumble_bee.transform(
                    lift_properties_and_versions(row), schema, mdata)
                singer.write_record("deals",
                                    record,
                                    catalog.get('stream_alias'),
                                    time_extracted=utils.now())

    STATE = singer.write_bookmark(STATE, 'deals', bookmark_key,
                                  utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE
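
The millisecond-to-datetime conversion used above (and repeated in several later examples) can be exercised on its own; the value below is illustrative and corresponds to 2020-02-12 00:00:00 UTC.

import datetime

# HubSpot-style epoch milliseconds (illustrative value)
timestamp_millis = 1581465600000 / 1000.0
modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                datetime.timezone.utc)
print(modified_time.isoformat())  # 2020-02-12T00:00:00+00:00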
Example #7
def sync_deals(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_with_tz(get_start(STATE, "deals", bookmark_key))
    max_bk_value = start
    LOGGER.info("sync_deals from %s", start)
    most_recent_modified_time = start
    params = {'limit': 100}

    schema = load_schema("deals")
    singer.write_schema("deals", schema, ["dealId"], [bookmark_key],
                        catalog.get('stream_alias'))

    # Fetch all the deal properties
    deals_v3_custom_schema = get_v3_schema("deals")
    properties = list(deals_v3_custom_schema.keys())

    # Split the properties into chunks of at most 100 properties
    # to avoid asking for too many properties at once,
    # as property names are passed in the URL.
    # URLs have a safe length limit of 2000 chars, so 100 properties max should do it.
    property_chunks = []
    while len(properties) > 100:
        head, tail = head_100(properties)
        property_chunks.append(head)
        properties = tail

    property_chunks.append(properties)

    # TODO: Make it configurable through the singer catalog metadata selection pattern
    associations = ["contacts"]

    url = get_url('deals_v3_all')
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request_v3(STATE,
                                  "deals",
                                  url,
                                  params,
                                  "results",
                                  custom_properties_chunks=property_chunks,
                                  associations=associations):
            modified_time = None

            if 'updatedAt' in row:
                # Hubspot returns timestamps in ISO 8601
                modified_time = dateutil.parser.isoparse(row['updatedAt'])

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = bumble_bee.transform(
                    lift_properties_and_versions_v3(row), schema, mdata)
                singer.write_record("deals",
                                    record,
                                    catalog.get('stream_alias'),
                                    time_extracted=utils.now())

    STATE = singer.write_bookmark(STATE, 'deals', bookmark_key,
                                  utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE
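
The head_100 helper called in the chunking loop above is not shown in this snippet; a minimal sketch consistent with how it is used (split off the first 100 property names and return the remainder as the tail) might be:

def head_100(items):
    # Return the first 100 elements and the rest of the list.
    return items[:100], items[100:]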
Example #8
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        #run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        #verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))
        print("discovered schemas are kosher")

        # Select all Catalogs
        for catalog in found_catalogs:
            connections.select_catalog_and_fields_via_metadata(
                conn_id, catalog,
                menagerie.get_annotated_schema(conn_id, catalog['stream_id']))

        #clear state
        menagerie.set_state(conn_id, {})

        sync_job_name = runner.run_sync_mode(self, conn_id)

        #verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = reduce(lambda accum, c: accum + c,
                                      record_count_by_stream.values())
        self.assertGreater(replicated_row_count,
                           0,
                           msg="failed to replicate any data: {}".format(
                               record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        max_bookmarks_from_records = runner.get_most_recent_records_from_target(
            self, self.expected_bookmarks(),
            self.get_properties()['start_date'])

        start_of_today = utils.strftime(
            datetime.datetime(datetime.datetime.utcnow().year,
                              datetime.datetime.utcnow().month,
                              datetime.datetime.utcnow().day, 0, 0, 0, 0,
                              datetime.timezone.utc))
        max_bookmarks_from_records['subscription_changes'] = start_of_today
        max_bookmarks_from_records['email_events'] = start_of_today

        #if we didn't replicate data, the bookmark should be the start_date
        for k in self.expected_bookmarks().keys():
            if max_bookmarks_from_records.get(k) is None:
                max_bookmarks_from_records[k] = utils.strftime(
                    datetime.datetime(2017, 5, 1, 0, 0, 0, 0,
                                      datetime.timezone.utc))

        state = menagerie.get_state(conn_id)
        bookmarks = state.get('bookmarks')
        bookmark_streams = set(state.get('bookmarks').keys())

        #verify bookmarks and offsets
        for k, v in sorted(list(self.expected_bookmarks().items())):
            for w in v:
                bk_value = bookmarks.get(k, {}).get(w)
                self.assertEqual(
                    utils.strptime_with_tz(bk_value),
                    utils.strptime_with_tz(max_bookmarks_from_records[k]),
                    "Bookmark {} ({}) for stream {} should have been updated to {}"
                    .format(bk_value, w, k, max_bookmarks_from_records[k]))
                print("bookmark {}({}) updated to {} from max record value {}".
                      format(k, w, bk_value, max_bookmarks_from_records[k]))

        for k, v in self.expected_offsets().items():
            self.assertEqual(
                bookmarks.get(k, {}).get('offset', {}),
                v,
                msg="unexpected offset found for stream {} {}. state: {}".
                format(k, v, state))
            print("offsets {} cleared".format(k))

        diff = bookmark_streams.difference(self.acceptable_bookmarks())
        self.assertEqual(
            len(diff),
            0,
            msg="Unexpected bookmarks: {} Expected: {} Actual: {}".format(
                diff, self.acceptable_bookmarks(), bookmarks))

        self.assertEqual(
            state.get('currently_syncing'), None,
            "Unexpected `currently_syncing` bookmark value: {} Expected: None".
            format(state.get('currently_syncing')))
Example #9
 def _datetime_string_to_epoch(self, datetime_string):
     return utils.strptime_with_tz(datetime_string).timestamp() * 1000
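
A hedged usage sketch of the same conversion outside the class: a timezone-aware datetime's timestamp() is seconds since the Unix epoch, so multiplying by 1000 yields epoch milliseconds. The input string is illustrative.

from singer import utils

epoch_millis = utils.strptime_with_tz("2021-01-01T00:00:00Z").timestamp() * 1000
print(epoch_millis)  # 1609459200000.0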
Example #10
def sync_deals(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_with_tz(get_start(STATE, "deals", bookmark_key))
    max_bk_value = start
    LOGGER.info("sync_deals from %s", start)
    most_recent_modified_time = start
    params = {'count': 250, 'includeAssociations': False, 'properties': []}

    schema = load_schema("deals")
    singer.write_schema("deals", schema, ["dealId"], [bookmark_key],
                        catalog.get('stream_alias'))

    # Check if we should include associations
    for key in mdata.keys():
        if 'associations' in key:
            assoc_mdata = mdata.get(key)
            if assoc_mdata.get('selected') == True:
                params['includeAssociations'] = True

    # Append all the property fields for deals to the request if
    # properties is selected
    if mdata.get(('properties', 'properties'), {}).get('selected'):
        additional_properties = schema.get("properties").get("properties").get(
            "properties")
        for key in additional_properties.keys():
            params['properties'].append(key)

    url = get_url('deals_all')
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, 'deals', url, params, 'deals', "hasMore",
                               ["offset"], ["offset"]):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key][
                    'timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate'][
                    'timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)
            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = bumble_bee.transform(row, schema, mdata)
                singer.write_record("deals",
                                    record,
                                    catalog.get('stream_alias'),
                                    time_extracted=utils.now())

    STATE = singer.write_bookmark(STATE, 'deals', bookmark_key,
                                  utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE
Example #11
    def _query_recur(self,
                     query,
                     catalog_entry,
                     start_date_str,
                     end_date=None,
                     retries=MAX_RETRIES):
        params = {"q": query}
        url = "{}/services/data/v41.0/queryAll".format(self.sf.instance_url)
        headers = self.sf._get_standard_headers()

        if end_date is None:
            end_date = singer_utils.now()

        if retries == 0:
            raise TapSalesforceException(
                "Ran out of retries attempting to query Salesforce Object {}".
                format(catalog_entry['stream']))

        retryable = False
        try:
            while True:
                resp = self.sf._make_request('GET',
                                             url,
                                             headers=headers,
                                             params=params)
                resp_json = resp.json()

                for rec in resp_json.get('records'):
                    yield rec

                next_records_url = resp_json.get('nextRecordsUrl')

                if next_records_url is None:
                    break
                else:
                    url = "{}{}".format(self.sf.instance_url, next_records_url)

        except HTTPError as ex:
            response = ex.response.json()
            if isinstance(
                    response,
                    list) and response[0].get("errorCode") == "QUERY_TIMEOUT":
                start_date = singer_utils.strptime_with_tz(start_date_str)
                day_range = (end_date - start_date).days
                LOGGER.info(
                    "Salesforce returned QUERY_TIMEOUT querying %d days of %s",
                    day_range, catalog_entry['stream'])
                retryable = True
            else:
                raise ex

        if retryable:
            start_date = singer_utils.strptime_with_tz(start_date_str)
            half_day_range = (end_date - start_date) // 2
            end_date = end_date - half_day_range

            if half_day_range.days == 0:
                raise TapSalesforceException(
                    "Attempting to query by 0 day range, this would cause infinite looping."
                )

            query = self.sf._build_query_string(
                catalog_entry, singer_utils.strftime(start_date),
                singer_utils.strftime(end_date))
            for record in self._query_recur(query, catalog_entry,
                                            start_date_str, end_date,
                                            retries - 1):
                yield record
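
The QUERY_TIMEOUT retry path above halves the queried date range before recursing. The arithmetic, shown with illustrative dates:

import datetime

start_date = datetime.datetime(2021, 1, 1, tzinfo=datetime.timezone.utc)
end_date = datetime.datetime(2021, 1, 31, tzinfo=datetime.timezone.utc)

half_day_range = (end_date - start_date) // 2  # timedelta of 15 days
end_date = end_date - half_day_range           # window shrinks to Jan 1 - Jan 16
print(end_date.date())  # 2021-01-16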
Example #12
def get_end_date():
    if CONFIG.get('end_date'):
        return utils.strptime_with_tz(CONFIG.get('end_date'))

    return utils.now()
Example #13
def get_start_for_stream(customer_id, stream_name):
    bk_value = bookmarks.get_bookmark(STATE,
                                      state_key_name(customer_id, stream_name),
                                      'date')
    bk_start_date = utils.strptime_with_tz(bk_value or CONFIG['start_date'])
    return bk_start_date
Example #14
def sync_deals(state: State):
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_with_tz(get_start(state, "deals", bookmark_key))
    max_bk_value = start
    logger.info("sync_deals: start from %s", start)
    params = {
        'limit': 100,
        'includeAssociations': True,
        # On 2/12/20, hubspot added a lot of additional properties for
        # deals, and appending all of them to requests ended up leading to
        # 414 (url-too-long) errors. Hubspot recommended we use the
        # `includeAllProperties` and `allPropertiesFetchMode` params
        # instead.
        'includeAllProperties': True,
        'allPropertiesFetchMode': 'latest_version',
    }

    schema = load_schema("deals")
    singer.write_schema("hubspot_deals", schema, ["dealId"], [bookmark_key])

    # Grab selected `hs_date_entered/exited` fields to call the v3 endpoint with
    v3_fields = []
    for field_name in schema['properties']:
        hs_field_name = field_name.replace('property_', '')
        if any(hs_field_name.startswith(prefix) for prefix in V3_PREFIXES):
            v3_fields.append(hs_field_name)

    url = get_url('deals_all')
    for row in gen_request(state,
                           'deals',
                           url,
                           params,
                           'deals',
                           "hasMore", ["offset"], ["offset"],
                           v3_fields=v3_fields):
        row_properties = row['properties']
        modified_time = None
        if bookmark_key in row_properties:
            # Hubspot returns timestamps in millis
            timestamp_millis = row_properties[bookmark_key][
                'timestamp'] / 1000.0
            modified_time = datetime.datetime.fromtimestamp(
                timestamp_millis, datetime.timezone.utc)
        elif 'createdate' in row_properties:
            # Hubspot returns timestamps in millis
            timestamp_millis = row_properties['createdate'][
                'timestamp'] / 1000.0
            modified_time = datetime.datetime.fromtimestamp(
                timestamp_millis, datetime.timezone.utc)
        if modified_time and modified_time >= max_bk_value:
            max_bk_value = modified_time

        if not modified_time or modified_time >= start:
            record = build_record(row, schema)
            write_record('hubspot_deals', record)

    state = singer.write_bookmark(state, 'hubspot_deals', bookmark_key,
                                  utils.strftime(max_bk_value))
    singer.write_state(state)
    return state
Example #15
    def _query_recur(self,
                     query,
                     catalog_entry,
                     start_date_str,
                     end_date=None,
                     retries=MAX_RETRIES):
        params = {"q": query}
        url = "{}/services/data/v41.0/queryAll".format(self.sf.instance_url)
        headers = self.sf._get_standard_headers()

        sync_start = singer_utils.now()
        if end_date is None:
            end_date = sync_start

        if retries == 0:
            raise TapSalesforceException(
                "Ran out of retries attempting to query Salesforce Object {}".
                format(catalog_entry['stream']))

        retryable = False
        try:
            for rec in self._sync_records(url, headers, params):
                yield rec

            # If the date range was chunked (an end_date was passed), sync
            # from the end_date -> now
            if end_date < sync_start:
                next_start_date_str = singer_utils.strftime(end_date)
                query = self.sf._build_query_string(catalog_entry,
                                                    next_start_date_str)
                for record in self._query_recur(query,
                                                catalog_entry,
                                                next_start_date_str,
                                                retries=retries):
                    yield record

        except HTTPError as ex:
            response = ex.response.json()
            if isinstance(
                    response,
                    list) and response[0].get("errorCode") == "QUERY_TIMEOUT":
                start_date = singer_utils.strptime_with_tz(start_date_str)
                day_range = (end_date - start_date).days
                LOGGER.info(
                    "Salesforce returned QUERY_TIMEOUT querying %d days of %s",
                    day_range, catalog_entry['stream'])
                retryable = True
            else:
                raise ex

        if retryable:
            start_date = singer_utils.strptime_with_tz(start_date_str)
            half_day_range = (end_date - start_date) // 2
            end_date = end_date - half_day_range

            if half_day_range.days == 0:
                raise TapSalesforceException(
                    "Attempting to query by 0 day range, this would cause infinite looping."
                )

            query = self.sf._build_query_string(
                catalog_entry, singer_utils.strftime(start_date),
                singer_utils.strftime(end_date))
            for record in self._query_recur(query, catalog_entry,
                                            start_date_str, end_date,
                                            retries - 1):
                yield record
Example #16
def sync_entity_chunked(state: State, entity_name, key_properties, path):
    schema = load_schema(entity_name)
    bookmark_key = 'startTimestamp'

    singer.write_schema('hubspot_' + entity_name, schema, key_properties,
                        [bookmark_key])

    start = get_start(state, entity_name, bookmark_key)
    logger.info("sync_%s from %s", entity_name, start)

    now = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)
    now_ts = int(now.timestamp() * 1000)

    start_ts = int(utils.strptime_with_tz(start).timestamp() * 1000)
    url = get_url(entity_name)

    if entity_name == 'email_events':
        window_size = int(CONFIG['email_chunk_size'])
    elif entity_name == 'subscription_changes':
        window_size = int(CONFIG['subscription_chunk_size'])

    with metrics.record_counter(entity_name) as counter:
        while start_ts < now_ts:
            end_ts = start_ts + window_size
            params = {
                'startTimestamp': start_ts,
                'endTimestamp': end_ts,
                'limit': 1000,
            }
            while True:
                our_offset = singer.get_offset(state, entity_name)
                if bool(our_offset) and our_offset.get('offset') is not None:
                    params[StateFields.offset] = our_offset.get('offset')

                data = request(url, params).json()

                if data.get(path) is None:
                    raise RuntimeError(
                        "Unexpected API response: {} not in {}".format(
                            path, data.keys()))

                for row in data[path]:
                    counter.increment()
                    record = build_record(row, schema)
                    write_record(entity_name, record)

                if data.get('hasMore'):
                    state = singer.set_offset(state, entity_name, 'offset',
                                              data['offset'])
                    singer.write_state(state)
                else:
                    state = singer.clear_offset(state, entity_name)
                    singer.write_state(state)
                    break
            state = singer.write_bookmark(state, "hubspot_" + entity_name, 'startTimestamp', utils.strftime(
                datetime.datetime.fromtimestamp((start_ts / 1000),
                                                datetime.timezone.utc)))  # pylint: disable=line-too-long
            singer.write_state(state)
            start_ts = end_ts

    state = singer.clear_offset(state, entity_name)
    singer.write_state(state)
    return state
Example #17
def get_start_for_stream(config, state, advertiser_ids, stream_name):
    """Get start date for stream sync."""
    bk_value = bookmarks.get_bookmark(
        state, state_key_name(advertiser_ids, stream_name), "date")
    bk_start_date = utils.strptime_with_tz(bk_value or config["start_date"])
    return bk_start_date
Example #18
def append_times_to_dates(item, date_fields):
    if date_fields:
        for date_field in date_fields:
            if item.get(date_field):
                item[date_field] = utils.strftime(
                    utils.strptime_with_tz(item[date_field]))
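
A small sketch of the round trip this helper performs, assuming singer's utils and a value that is already in UTC: parsing with strptime_with_tz and re-serializing with strftime normalizes the field to singer's canonical datetime format. The record is illustrative.

from singer import utils

item = {"due_date": "2021-03-05T00:00:00Z"}  # illustrative record
item["due_date"] = utils.strftime(utils.strptime_with_tz(item["due_date"]))
print(item["due_date"])  # re-serialized in singer's canonical UTC format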
Example #19
def get_end_date(config):
    """Get end date from config file."""
    if config.get("end_date"):
        return utils.strptime_with_tz(config.get("end_date"))

    return utils.now()
Example #20
 def is_bookmark_old(self, state, value, name=None):
     current_bookmark = self.get_bookmark(state, name)
     return utils.strptime_with_tz(value) > utils.strptime_with_tz(
         current_bookmark)
Example #21
def sync_records(qb, catalog_entry, state, counter, state_passed):
    chunked_bookmark = singer_utils.strptime_with_tz(qb.get_start_date(state, catalog_entry))
    stream = catalog_entry['stream']
    schema = catalog_entry['schema']
    stream_alias = catalog_entry.get('stream_alias')
    catalog_metadata = metadata.to_map(catalog_entry['metadata'])
    replication_key = catalog_metadata.get((), {}).get('replication-key')
    stream_version = get_stream_version(catalog_entry, state)
    activate_version_message = singer.ActivateVersionMessage(stream=(stream_alias or stream),
                                                             version=stream_version)

    start_time = singer_utils.now()

    LOGGER.info('Syncing Quickbooks data for stream %s', stream)

    previous_max_replication_key = None

    query_func = qb.query
    if stream.endswith("Report"):
        query_func = qb.query_report

    for rec in query_func(catalog_entry, state, state_passed):

        counter.increment()
        with Transformer(pre_hook=transform_data_hook) as transformer:
            rec = transformer.transform(rec, schema)

        singer.write_message(
            singer.RecordMessage(
                stream=(
                        stream_alias or stream),
                record=rec,
                version=stream_version,
                time_extracted=start_time))

        if replication_key:
            jsonpath_expression = parse(f"$.{replication_key}")
            _rec = {'MetaData': json.loads(rec.get('MetaData', '{}'))}
            match = jsonpath_expression.find(_rec)
            original_replication_key_value = ""
            if replication_key and len(match) > 0:
                original_replication_key_value = match[0].value
                replication_key_value = singer_utils.strptime_with_tz(original_replication_key_value)

            # Before writing a bookmark, make sure Quickbooks has not given us a
            # record with one outside our range
            if previous_max_replication_key is None or (
                    replication_key_value and replication_key_value <= start_time and replication_key_value > previous_max_replication_key
            ):
                state = singer.write_bookmark(
                    state,
                    catalog_entry['tap_stream_id'],
                    replication_key,
                    original_replication_key_value)
                previous_max_replication_key = replication_key_value

            # Tables with no replication_key will send an
            # activate_version message for the next sync

    if not replication_key:
        singer.write_message(activate_version_message)
        state = singer.write_bookmark(
            state, catalog_entry['tap_stream_id'], 'version', None)
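
The parse/find calls above follow the jsonpath-ng API (assumed here, since the import is not shown): parse compiles a JSONPath expression and find returns a list of matches whose .value holds the matched data. An illustrative lookup against a QuickBooks-style MetaData block:

from jsonpath_ng import parse

record = {'MetaData': {'LastUpdatedTime': '2021-04-01T10:00:00Z'}}  # illustrative
match = parse("$.MetaData.LastUpdatedTime").find(record)
if match:
    print(match[0].value)  # 2021-04-01T10:00:00Z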
Example #22
def sync_entity_chunked(STATE, catalog, entity_name, key_properties, path):
    schema = load_schema(entity_name)
    bookmark_key = 'startTimestamp'

    singer.write_schema(entity_name, schema, key_properties, [bookmark_key],
                        catalog.get('stream_alias'))

    start = get_start(STATE, entity_name, bookmark_key)
    LOGGER.info("sync_%s from %s", entity_name, start)

    now = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)
    now_ts = int(now.timestamp() * 1000)

    start_ts = int(utils.strptime_with_tz(start).timestamp() * 1000)
    url = get_url(entity_name)

    mdata = metadata.to_map(catalog.get('metadata'))

    if entity_name == 'email_events':
        window_size = int(CONFIG['email_chunk_size'])
    elif entity_name == 'subscription_changes':
        window_size = int(CONFIG['subscription_chunk_size'])

    with metrics.record_counter(entity_name) as counter:
        while start_ts < now_ts:
            end_ts = start_ts + window_size
            params = {
                'startTimestamp': start_ts,
                'endTimestamp': end_ts,
                'limit': 1000,
            }
            with Transformer(
                    UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
                while True:
                    our_offset = singer.get_offset(STATE, entity_name)
                    if bool(our_offset) and our_offset.get('offset') is not None:
                        params[StateFields.offset] = our_offset.get('offset')

                    data = request(url, params).json()
                    time_extracted = utils.now()

                    for row in data[path]:
                        counter.increment()
                        record = bumble_bee.transform(
                            lift_properties_and_versions(row), schema, mdata)
                        singer.write_record(entity_name,
                                            record,
                                            catalog.get('stream_alias'),
                                            time_extracted=time_extracted)
                    if data.get('hasMore'):
                        STATE = singer.set_offset(STATE, entity_name, 'offset',
                                                  data['offset'])
                        singer.write_state(STATE)
                    else:
                        STATE = singer.clear_offset(STATE, entity_name)
                        singer.write_state(STATE)
                        break
            STATE = singer.write_bookmark(STATE, entity_name, 'startTimestamp', utils.strftime(datetime.datetime.fromtimestamp((start_ts / 1000), datetime.timezone.utc)))  # pylint: disable=line-too-long
            singer.write_state(STATE)
            start_ts = end_ts

    STATE = singer.clear_offset(STATE, entity_name)
    singer.write_state(STATE)
    return STATE
Example #23
 def get_bookmark(self, state):
     return utils.strptime_with_tz(
         singer.get_bookmark(state, self.name, self.replication_key))
Example #24
def sync_records(sf, catalog_entry, state, counter):
    chunked_bookmark = singer_utils.strptime_with_tz(sf.get_start_date(state, catalog_entry))
    stream = catalog_entry['stream']
    schema = catalog_entry['schema']
    stream_alias = catalog_entry.get('stream_alias')
    catalog_metadata = metadata.to_map(catalog_entry['metadata'])
    replication_key = catalog_metadata.get((), {}).get('replication-key')
    stream_version = get_stream_version(catalog_entry, state)
    activate_version_message = singer.ActivateVersionMessage(stream=(stream_alias or stream),
                                                             version=stream_version)

    start_time = singer_utils.now()

    LOGGER.info('Syncing Salesforce data for stream %s', stream)

    for rec in sf.query(catalog_entry, state):
        counter.increment()
        with Transformer(pre_hook=transform_bulk_data_hook) as transformer:
            rec = transformer.transform(rec, schema)
        rec = fix_record_anytype(rec, schema)
        singer.write_message(
            singer.RecordMessage(
                stream=(
                    stream_alias or stream),
                record=rec,
                version=stream_version,
                time_extracted=start_time))

        replication_key_value = replication_key and singer_utils.strptime_with_tz(rec[replication_key])

        if sf.pk_chunking:
            if replication_key_value and replication_key_value <= start_time and replication_key_value > chunked_bookmark:
                # Replace the highest seen bookmark and save the state in case we need to resume later
                chunked_bookmark = singer_utils.strptime_with_tz(rec[replication_key])
                state = singer.write_bookmark(
                    state,
                    catalog_entry['tap_stream_id'],
                    'JobHighestBookmarkSeen',
                    singer_utils.strftime(chunked_bookmark))
                singer.write_state(state)
        # Before writing a bookmark, make sure Salesforce has not given us a
        # record with one outside our range
        elif replication_key_value and replication_key_value <= start_time:
            state = singer.write_bookmark(
                state,
                catalog_entry['tap_stream_id'],
                replication_key,
                rec[replication_key])
            singer.write_state(state)

        # Tables with no replication_key will send an
        # activate_version message for the next sync
    if not replication_key:
        singer.write_message(activate_version_message)
        state = singer.write_bookmark(
            state, catalog_entry['tap_stream_id'], 'version', None)

    # If pk_chunking is set, only write a bookmark at the end
    if sf.pk_chunking:
        # Write a bookmark with the highest value we've seen
        state = singer.write_bookmark(
            state,
            catalog_entry['tap_stream_id'],
            replication_key,
            singer_utils.strftime(chunked_bookmark))
Example #25
 def _get_end_datetime(self, startDateTime):
     endDateTime = utils.strptime_with_tz(startDateTime) + timedelta(
         self.api_window_in_days)
     return endDateTime.strftime("%Y-%m-%d %H:%M:%S")