Example #1
    def refresh_token(self):
        # http://developers.marketo.com/rest-api/authentication/#creating_an_access_token
        params = {
            "grant_type": "client_credentials",
            "client_id": self.client_id,
            "client_secret": self.client_secret,
        }
        singer.log_info("Refreshing token")

        try:
            url = self.get_url("identity/oauth/token")
            resp = requests.get(url, params=params)
            resp_time = pendulum.utcnow()
        except requests.exceptions.ConnectionError as e:
            raise ApiException(
                "Connection error while refreshing token at {}.".format(
                    url)) from e

        if resp.status_code != 200:
            raise ApiException("Error refreshing token [{}]: {}".format(
                resp.status_code, resp.content))

        data = resp.json()
        if "error" in data:
            if data["error"] == "unauthorized":
                msg = "Authorization failed: "
            else:
                msg = "Marketo API returned an error: "

            msg += data.get("error_description", "No message from api")
            raise ApiException(msg)

        self.access_token = data["access_token"]
        self.token_expires = resp_time.add(seconds=data["expires_in"] - 15)
        singer.log_info("Token valid until %s", self.token_expires)
Example #2
def get_or_create_export_for_leads(client, state, stream, export_start, config):
    export_id = bookmarks.get_bookmark(state, "leads", "export_id")
    # check if export is still valid
    if export_id is not None and not client.export_available("leads", export_id):
        singer.log_info("Export %s no longer available.", export_id)
        export_id = None

    if export_id is None:
        # Corona mode is required to query by "updatedAt", otherwise a full
        # sync is required using "createdAt".
        query_field = "updatedAt" if client.use_corona else "createdAt"
        max_export_days = int(config.get('max_export_days',
                                         MAX_EXPORT_DAYS))
        export_end = get_export_end(export_start,
                                    end_days=max_export_days)
        query = {query_field: {"startAt": export_start.isoformat(),
                               "endAt": export_end.isoformat()}}

        # Create the new export and store the id and end date in state.
        # Does not start the export (must POST to the "enqueue" endpoint).
        fields = []
        for entry in stream['metadata']:
            if len(entry['breadcrumb']) > 0 and (
                    entry['metadata'].get('selected')
                    or entry['metadata'].get('inclusion') == 'automatic'):
                fields.append(entry['breadcrumb'][-1])

        export_id = client.create_export("leads", fields, query)
        state = update_state_with_export_info(
            state, stream, export_id=export_id, export_end=export_end.isoformat())
    else:
        export_end = pendulum.parse(bookmarks.get_bookmark(state, "leads", "export_end"))

    return export_id, export_end
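For reference, a sketch of the state layout the bookmark calls above imply (singer's bookmarks helpers nest per-stream keys under "bookmarks"); the values are placeholders.

example_state = {
    "bookmarks": {
        "leads": {
            "export_id": "placeholder-export-id",
            "export_end": "2021-01-31T00:00:00+00:00",
        }
    }
}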
Example #3
    def wait_for_export(self, stream_type, export_id):
        # Poll the export status until it enters a finalized state or
        # exceeds the job timeout time.
        with metrics.job_timer('Export {} for {}'.format(
                export_id, stream_type)):
            timeout_time = pendulum.utcnow().add(seconds=self.job_timeout)
            while pendulum.utcnow() < timeout_time:
                status = self.poll_export(stream_type, export_id)
                singer.log_info("export %s status is %s", export_id, status)

                if status == "Created":
                    # If the status is created, the export has been made but
                    # not started, so enqueue the export.
                    self.enqueue_export(stream_type, export_id)

                elif status in ["Cancelled", "Failed"]:
                    # Cancelled and failed exports fail the current sync.
                    raise ExportFailed(status)

                elif status == "Completed":
                    return True

                time.sleep(self.poll_interval)

        raise ExportFailed("Export timed out after {} minutes".format(
            self.job_timeout / 60))
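A hedged sketch of how the export methods appear to fit together: create the job, then block until it finishes. Real sync code would also persist state and handle ExportFailed.

def run_bulk_export(client, stream_type, fields, query):
    # Illustrative flow only: create_export and wait_for_export are the
    # client methods shown in Examples #7 and #3.
    export_id = client.create_export(stream_type, fields, query)
    client.wait_for_export(stream_type, export_id)  # raises ExportFailed on timeout/failure
    return export_id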
Example #4
    def update_calls_today(self):
        # http://developers.marketo.com/rest-api/endpoint-reference/lead-database-endpoint-reference/#!/Usage/getDailyUsageUsingGET
        data = self._request("GET", "rest/v1/stats/usage.json").json()
        if "result" not in data:
            raise ApiException(data)

        self.calls_today = int(data["result"][0]["total"])
        singer.log_info("Used %s of %s requests", self.calls_today,
                        self.max_daily_calls)
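A hypothetical guard built on the counter this method maintains; the tap itself would raise its own ApiException rather than RuntimeError.

def assert_quota_available(client):
    # Illustration only: attribute names mirror update_calls_today().
    client.update_calls_today()
    if client.calls_today >= client.max_daily_calls:
        raise RuntimeError("Daily Marketo API quota exhausted")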
Example #5
def get_or_create_export_for_activities(client, state, stream, export_start,
                                        config):
    export_id = bookmarks.get_bookmark(state, stream["tap_stream_id"],
                                       "export_id")
    if export_id is not None and not client.export_available(
            "activities", export_id):
        singer.log_info("Export %s no longer available.", export_id)
        export_id = None

    if export_id is None:
        # The activity id is in the top-most breadcrumb of the metadata.
        # Activity ids correspond to activity type ids in Marketo.
        # We need the activity type id to build the query.
        activity_metadata = metadata.to_map(stream["metadata"])
        activity_type_id = metadata.get(activity_metadata, (),
                                        'marketo.activity-id')

        # Activities must be queried by `createdAt` even though
        # that is not a real field. `createdAt` proxies `activityDate`.
        # The activity type id must also be included in the query. The
        # largest date range that can be used for activities is 30 days.
        max_export_days = int(config.get('max_export_days', MAX_EXPORT_DAYS))
        export_end = get_export_end(export_start, end_days=max_export_days)
        query = {
            "createdAt": {
                "startAt": export_start.isoformat(),
                "endAt": export_end.isoformat()
            },
            "activityTypeIds": [activity_type_id]
        }

        # Create the new export and store the id and end date in state.
        # Does not start the export (must POST to the "enqueue" endpoint).
        try:
            export_id = client.create_export("activities", ACTIVITY_FIELDS,
                                             query)
        except ApiQuotaExceeded as e:
            # The main reason we wrap the ApiQuotaExceeded exception in a
            # new one is to be able to tell the customer what their
            # configured max_export_days is.
            raise ApiQuotaExceeded(
                ("You may wish to consider changing the "
                 "`max_export_days` config value to a lower number if "
                 "you're unable to sync a single {} day window within "
                 "your current API quota.").format(max_export_days)) from e
        state = update_state_with_export_info(
            state,
            stream,
            export_id=export_id,
            export_end=export_end.isoformat())
    else:
        export_end = pendulum.parse(
            bookmarks.get_bookmark(state, stream["tap_stream_id"],
                                   "export_end"))

    return export_id, export_end
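The stream-level metadata entry this function reads might look roughly like the following; the activity id value is a placeholder.

example_stream_metadata = [
    # Empty breadcrumb = stream-level entry, which is where
    # metadata.get(activity_metadata, (), 'marketo.activity-id') looks.
    {"breadcrumb": [], "metadata": {"marketo.activity-id": 12}},
]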
Example #6
def discover(client):
    singer.log_info("Starting discover")
    streams = []
    streams.append(discover_leads(client))
    streams.append(discover_catalog("activity_types", ACTIVITY_TYPES_AUTOMATIC_INCLUSION, unsupported=ACTIVITY_TYPES_UNSUPPORTED, stream_automatic_inclusion=True))
    streams.extend(discover_activities(client))
    streams.append(discover_catalog("campaigns", CAMPAIGNS_AUTOMATIC_INCLUSION))
    streams.append(discover_catalog("lists", LISTS_AUTOMATIC_INCLUSION))
    streams.append(discover_catalog("programs", PROGRAMS_AUTOMATIC_INCLUSION))
    json.dump({"streams": streams}, sys.stdout, indent=2)
    singer.log_info("Finished discover")
Example #7
    def create_export(self, stream_type, fields, query):
        # http://developers.marketo.com/rest-api/bulk-extract/#creating_a_job
        payload = {"format": "CSV", "fields": fields, "filter": query}

        endpoint = self.get_bulk_endpoint(stream_type, "create")
        endpoint_name = "{}_create".format(stream_type)
        singer.log_info('Scheduling export job with query %s', query)
        data = self.request("POST",
                            endpoint,
                            endpoint_name=endpoint_name,
                            json=payload)
        return data["result"][0]["exportId"]
Example #8
def get_selected_streams(remaining_streams):
    selected_streams = []

    for stream in remaining_streams:
        mdata = metadata.to_map(stream.get('metadata'))

        if metadata.get(mdata, (), 'selected') == True:
            selected_streams.append(stream)
        else:
            singer.log_info("%s: not selected", stream["tap_stream_id"])

    return selected_streams
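A minimal usage sketch, assuming singer-python's metadata helpers: a stream is kept when its empty-breadcrumb (stream-level) metadata entry carries selected = true.

candidate_streams = [{
    "tap_stream_id": "lists",
    "metadata": [{"breadcrumb": [], "metadata": {"selected": True}}],
}]
assert get_selected_streams(candidate_streams) == candidate_streams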
Example #9
def stream_rows(client, stream_type, export_id):
    with tempfile.NamedTemporaryFile(mode="w+", encoding="utf8") as csv_file:
        singer.log_info("Download starting.")
        resp = client.stream_export(stream_type, export_id)
        for chunk in resp.iter_content(chunk_size=1024, decode_unicode=True):
            if chunk:
                csv_file.write(chunk)

        singer.log_info("Download completed. Begin streaming rows.")
        csv_file.seek(0)
        reader = csv.reader(csv_file, delimiter=',', quotechar='"')
        headers = next(reader)
        for line in reader:
            yield dict(zip(headers, line))
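A sketch of feeding the yielded rows into singer.write_record; schema transformation and state handling are omitted for brevity.

import singer

def write_export_rows(client, stream_type, export_id, stream_name):
    for row in stream_rows(client, stream_type, export_id):
        singer.write_record(stream_name, row)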
Example #10
    def _request(self,
                 method,
                 url,
                 endpoint_name=None,
                 stream=False,
                 **kwargs):
        endpoint_name = endpoint_name or url
        url = self.get_url(url)
        headers = kwargs.pop("headers", {})
        headers.update(self.headers)
        req = requests.Request(method, url, headers=headers,
                               **kwargs).prepare()
        singer.log_info("%s: %s", method, req.url)
        with singer.metrics.http_request_timer(endpoint_name):
            resp = self._session.send(req, stream=stream)

        resp.raise_for_status()
        return resp
Example #11
    def excute_graph_ql(self, query: str) -> dict:
        try:
            # shopify.GraphQL().execute sometimes prints to stdout, which
            # corrupts the stream consumed by the target, so suppress
            # printing while it runs.
            with HiddenPrints():
                response = json.loads(shopify.GraphQL().execute(query))
        except Exception:
            raise GraphQLGeneralError("Execution failed", code=500)

        if 'data' in response and response['data'] is not None:
            return response['data']

        if "errors" in response:
            errors = response["errors"]
            singer.log_info(errors)
            if errors[0]["extensions"]["code"] == "THROTTLED":
                raise GraphQLThrottledError("THROTTLED", code=429)

        raise GraphQLGeneralError("Failed", code=500)
Example #12
    def get_graph_ql_data(self, replication_obj: Stream):
        LOGGER.info("Getting data with GraphQL")
        updated_at_min = replication_obj.get_bookmark()

        stop_time = singer.utils.now().replace(microsecond=0)
        date_window_size = float(
            Context.config.get("date_window_size", DATE_WINDOW_SIZE))

        # Page through till the end of the result set
        while updated_at_min < stop_time:
            after = None
            updated_at_max = updated_at_min + datetime.timedelta(
                days=date_window_size)

            if updated_at_max > stop_time:
                updated_at_max = stop_time
            singer.log_info("getting from %s - %s", updated_at_min,
                            updated_at_max)
            while True:
                query = self.get_graph_query(updated_at_min,
                                             updated_at_max,
                                             replication_obj.name,
                                             after=after)
                with metrics.http_request_timer(replication_obj.name):
                    data = self.excute_graph_ql(query)
                data = data[replication_obj.name]
                page_info = data['pageInfo']
                edges = data["edges"]
                for edge in edges:
                    after = edge["cursor"]
                    node = edge["node"]
                    yield node
                if not page_info["hasNextPage"]:
                    Context.state.get('bookmarks',
                                      {}).get(replication_obj.name,
                                              {}).pop('since_id', None)
                    replication_obj.update_bookmark(
                        utils.strftime(updated_at_max +
                                       datetime.timedelta(seconds=1)))
                    break

            updated_at_min = updated_at_max + datetime.timedelta(seconds=1)
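For reference, the Relay-style connection shape this loop consumes, with placeholder values; "orders" as the stream name is an assumption.

example_page = {
    "orders": {
        "pageInfo": {"hasNextPage": False},
        "edges": [
            {"cursor": "cursor-1", "node": {"id": "gid://shopify/Order/1"}},
        ],
    }
}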
Example #13
def get_or_create_export_for_leads(client, state, stream, export_start):
    export_id = bookmarks.get_bookmark(state, "leads", "export_id")
    # check if export is still valid
    if export_id is not None and not client.export_available(
            "leads", export_id):
        singer.log_info("Export %s no longer available.", export_id)
        export_id = None

    if export_id is None:
        # Corona mode is required to query by "updatedAt", otherwise a full
        # sync is required using "createdAt".
        query_field = "updatedAt" if client.use_corona else "createdAt"
        export_end = get_export_end(export_start)
        query = {
            query_field: {
                "startAt": export_start.isoformat(),
                "endAt": export_end.isoformat()
            }
        }

        # Create the new export and store the id and end date in state.
        # Does not start the export (must POST to the "enqueue" endpoint).
        fields = [
            f for f, s in stream["schema"]["properties"].items()
            if s.get("selected") or (s.get("inclusion") == "automatic")
        ]
        export_id = client.create_export("leads", fields, query)
        state = update_state_with_export_info(
            state,
            stream,
            export_id=export_id,
            export_end=export_end.isoformat())
    else:
        export_end = pendulum.parse(
            bookmarks.get_bookmark(state, "leads", "export_end"))

    return export_id, export_end
Example #14
    def test_corona(self):
        # http://developers.marketo.com/rest-api/bulk-extract/#limits
        # Corona allows us to do bulk queries for Leads using updatedAt
        # as a filter. Clients without Corona (should only be clients
        # with < 50,000 Leads) must do a full bulk export every sync.
        # We test for Corona by requesting a one-second export of leads
        # using the updatedAt filter.
        singer.log_info("Testing for Corona support")
        start_pen = pendulum.utcnow().subtract(days=1).replace(microsecond=0)
        end_pen = start_pen.add(seconds=1)
        payload = {
            "format": "CSV",
            "fields": ["id"],
            "filter": {
                "updatedAt": {
                    "startAt": start_pen.isoformat(),
                    "endAt": end_pen.isoformat(),
                },
            },
        }
        endpoint = self.get_bulk_endpoint("leads", "create")
        data = self._request("POST",
                             endpoint,
                             endpoint_name="leads_create",
                             json=payload).json()

        # If the error code indicating no Corona support is present,
        # Corona is not supported. If we don't get that error code,
        # Corona is supported and we need to clean up by cancelling the
        # test export we requested.
        err_codes = set(err["code"] for err in data.get("errors", []))
        if NO_CORONA_CODE in err_codes:
            singer.log_info("Corona not supported.")
            return False
        elif API_QUOTA_EXCEEDED in err_codes:
            raise ApiQuotaExceeded(
                API_QUOTA_EXCEEDED_MESSAGE.format(data['errors']))
        else:
            singer.log_info("Corona is supported.")
            singer.log_info(data)
            self.cancel_export("leads", data["result"][0]["exportId"])
            return True
Example #15
    def get_objects(self):
        updated_at_min = self.get_bookmark()

        stop_time = singer.utils.now().replace(microsecond=0)
        # Retrieve at most one year of data; otherwise log that further
        # incremental imports are needed.
        diff_days = (stop_time - updated_at_min).days
        yearly = False
        if diff_days > 365:
            yearly = True
            stop_time = updated_at_min + datetime.timedelta(days=365)
            LOGGER.info("This import will only import the first year of historical data. "
                        "You need to trigger further incremental imports to get the missing rows.")

        date_window_size = float(Context.config.get("date_window_size", DATE_WINDOW_SIZE))
        results_per_page = Context.get_results_per_page(RESULTS_PER_PAGE)

        # Page through till the end of the result set
        while updated_at_min < stop_time:
            # Bookmarking can also occur on the since_id
            since_id = self.get_since_id() or 1

            if since_id != 1:
                LOGGER.info("Resuming sync from since_id %d", since_id)

            # It's important that `updated_at_min` has microseconds
            # truncated. The reason has been lost to the mists of time,
            # but we think it has something to do with how the API treats
            # microseconds on its date windows; rounding errors there
            # might otherwise cause dropped data.
            updated_at_max = updated_at_min + datetime.timedelta(days=date_window_size)
            if updated_at_max > stop_time:
                updated_at_max = stop_time

            singer.log_info("getting from %s - %s", updated_at_min,
                            updated_at_max)

            min_filer_key = self.get_min_replication_key()
            max_filer_key = self.get_max_replication_key()

            while True:
                status_key = self.status_key or "status"
                query_params = {
                    "since_id": since_id,
                    min_filer_key: updated_at_min,
                    max_filer_key: updated_at_max,
                    "limit": results_per_page,
                }

                if self.add_status:
                    query_params[status_key] = "any"

                with metrics.http_request_timer(self.name):
                    objects = self.call_api(query_params)

                for obj in objects:
                    if obj.id < since_id:
                        # This verifies the api behavior expectation we
                        # have that all results actually honor the
                        # since_id parameter.
                        raise OutOfOrderIdsError("obj.id < since_id: {} < {}".format(
                            obj.id, since_id))
                    yield obj

                # You know you're at the end when the current page has
                # fewer results than the page size you requested.
                singer.log_info(f"Got {len(objects)} records")
                if len(objects) < results_per_page:
                    # Save updated_at_max as our bookmark: we've synced all
                    # rows in this window and can move forward. Also remove
                    # the since_id so the next window restarts at 1.
                    Context.state.get('bookmarks', {}).get(self.name, {}).pop('since_id', None)
                    state_val = updated_at_max
                    if self.skip_day:
                        state_val = state_val + datetime.timedelta(days=1)
                    self.update_bookmark(utils.strftime(state_val))
                    break

                if objects[-1].id != max([o.id for o in objects]):
                    # This verifies the api behavior expectation we have
                    # that all pages are internally ordered by the
                    # `since_id`.
                    raise OutOfOrderIdsError("{} is not the max id in objects ({})".format(
                        objects[-1].id, max([o.id for o in objects])))
                since_id = objects[-1].id

                # Put since_id into the state.
                self.update_bookmark(since_id, bookmark_key='since_id')

            updated_at_min = updated_at_max + datetime.timedelta(seconds=1)

            if self.skip_day:
                updated_at_min = updated_at_min + datetime.timedelta(days=1)

        if yearly:
            LOGGER.info("This import only imported one year of historical data. "
                        "Please trigger further incremental data to get the missing rows.")
Example #16
def sync(client, catalog, config, state):
    starting_stream = bookmarks.get_currently_syncing(state)
    if starting_stream:
        singer.log_info("Resuming sync from %s", starting_stream)
    else:
        singer.log_info("Starting sync")

    for stream in catalog["streams"]:
        # Skip unselected streams.
        if not stream["schema"].get("selected"):
            singer.log_info("%s: not selected", stream["tap_stream_id"])
            continue

        # Skip streams that have already been synced when resuming.
        if starting_stream and stream["tap_stream_id"] != starting_stream:
            singer.log_info("%s: already synced", stream["tap_stream_id"])
            continue

        singer.log_info("%s: starting sync", stream["tap_stream_id"])

        # Now that we've started, there's no more "starting stream". Set
        # the current stream to resume on next run.
        starting_stream = None
        state = bookmarks.set_currently_syncing(state, stream["tap_stream_id"])
        singer.write_state(state)

        # Sync stream based on type.
        if stream["tap_stream_id"] == "activity_types":
            state, record_count = sync_activity_types(client, state, stream)
        elif stream["tap_stream_id"] == "leads":
            state, record_count = sync_leads(client, state, stream)
        elif stream["tap_stream_id"].startswith("activities_"):
            state, record_count = sync_activities(client, state, stream,
                                                  config)
        elif stream["tap_stream_id"] in ["campaigns", "lists"]:
            state, record_count = sync_paginated(client, state, stream)
        elif stream["tap_stream_id"] == "programs":
            state, record_count = sync_programs(client, state, stream)
        else:
            raise Exception("Stream %s not implemented" %
                            stream["tap_stream_id"])

        # Emit metric for record count.
        counter = singer.metrics.record_counter(stream["tap_stream_id"])
        counter.value = record_count
        counter._pop()  # pylint: disable=protected-access

        # Unset current stream.
        state = bookmarks.set_currently_syncing(state, None)
        singer.write_state(state)
        singer.log_info("%s: finished sync", stream["tap_stream_id"])

    # If Corona is not supported, log a warning near the end of the tap
    # log with instructions on how to get Corona supported.
    singer.log_info("Finished sync.")
    if not client.use_corona:
        singer.log_warning(NO_CORONA_WARNING)
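For comparison, a sketch using singer-python's context-manager counter, which emits the metric on exit; the sync loop above pops the counter manually so it can assign the final count in one step.

import singer

def emit_record_count(stream_id, record_count):
    with singer.metrics.record_counter(stream_id) as counter:
        counter.increment(record_count)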
Example #17
def sync_report(stream_name, stream_metadata, sdk_client):

    report_window_days = CONFIG.get("MAX_REPORT_TIME_WINDOW", 365)

    is_incremental = False
    if metadata.get(stream_metadata, (),
                    "replication-method") == "INCREMENTAL":
        is_incremental = True

    customer_id = sdk_client.client_customer_id

    stream_schema, _ = create_schema_for_report(stream_name, sdk_client)
    stream_schema = add_synthetic_keys_to_stream_schema(stream_schema)

    xml_attribute_list = get_fields_to_sync(stream_schema, stream_metadata)

    primary_keys = metadata.get(stream_metadata,
                                (), 'tap-adwords.report-key-properties') or []
    LOGGER.info("{} primary keys are {}".format(stream_name, primary_keys))

    write_schema(stream_name,
                 stream_schema,
                 primary_keys,
                 bookmark_properties=['day'])

    field_list = []
    for field in xml_attribute_list:
        field_list.append(stream_metadata[('properties',
                                           field)]['adwords.fieldName'])

    check_selected_fields(stream_name, field_list, sdk_client)
    # If an attribution window sync is interrupted, start where it left off
    start_date = get_attribution_window_bookmark(customer_id, stream_name)
    if start_date is not None:
        start_date = start_date + relativedelta(days=1)

    if start_date is None:
        start_date = apply_conversion_window(
            get_start_for_stream(customer_id, stream_name))

    if stream_name in REPORTS_WITH_90_DAY_MAX:
        cutoff = utils.now() + relativedelta(days=-90)
        if start_date < cutoff:
            LOGGER.warning(
                "report only supports up to 90 days, will start at {}".format(
                    cutoff))
            start_date = cutoff

    start_date = start_date.replace(hour=0, minute=0, second=0, microsecond=0)

    LOGGER.info('Selected fields: %s', field_list)

    max_end_date = utils.now() - relativedelta(days=1)
    required_end_date = get_end_date()

    report_end_date = min(max_end_date, required_end_date)
    report_end_date = report_end_date.replace(hour=23,
                                              minute=59,
                                              second=59,
                                              microsecond=0)

    next_start_date = start_date

    is_single_day_report = stream_name in REPORTS_REQUIRING_DAILY_REPORTS
    start_plus_window = next_start_date
    if not is_single_day_report:
        start_plus_window += relativedelta(days=report_window_days)
    end_date = min(start_plus_window, report_end_date)

    while next_start_date <= report_end_date:
        singer.log_info("syncing %s for %s - %s", stream_name,
                        next_start_date.strftime("%Y-%m-%d"),
                        end_date.strftime("%Y-%m-%d"))
        actual_end_date = min(end_date, report_end_date)
        sync_report_for_day(stream_name, stream_schema, sdk_client,
                            next_start_date, field_list, actual_end_date)
        next_start_date = end_date + relativedelta(days=1)

        start_plus_window = next_start_date
        if not is_single_day_report:
            start_plus_window += relativedelta(days=report_window_days)

        end_date = start_plus_window

        bookmarks.write_bookmark(STATE, state_key_name(customer_id,
                                                       stream_name),
                                 'last_attribution_window_date',
                                 actual_end_date.strftime(utils.DATETIME_FMT))
        singer.write_state(STATE)
    if not is_incremental:
        bookmarks.clear_bookmark(STATE, state_key_name(customer_id,
                                                       stream_name),
                                 'last_attribution_window_date')
    singer.write_state(STATE)
    LOGGER.info("Done syncing the %s report for customer_id %s", stream_name,
                customer_id)