def sync_endpoint(endpoint, path, date_fields=None):
    schema = load_schema(endpoint)
    bookmark_property = 'updated_at'

    singer.write_schema(endpoint,
                        schema,
                        ["id"],
                        bookmark_properties=[bookmark_property])

    start = get_start(endpoint)

    url = get_url(endpoint)
    data = request(url)
    time_extracted = utils.now()

    with Transformer() as transformer:
        for row in data:
            item = row[path]
            item = transformer.transform(item, schema)

            append_times_to_dates(item, date_fields)

            if item[bookmark_property] >= start:
                singer.write_record(endpoint,
                                    item,
                                    time_extracted=time_extracted)

                utils.update_state(STATE, endpoint, item[bookmark_property])

    singer.write_state(STATE)
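# All of the sync functions in this collection lean on a `get_start` helper to
# choose their bookmark. Its implementation is not shown here; a minimal
# sketch, assuming STATE is a flat dict of bookmarks and CONFIG supplies a
# `start_date` fallback (both shapes are assumptions, not confirmed by this
# code):
def get_start_sketch(key):
    """Return the stored bookmark for `key`, else the configured start date."""
    if key in STATE and STATE[key] is not None:
        return STATE[key]
    return CONFIG['start_date']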
def sync_project(pid):
    url = get_url("projects", pid)
    data = request(url).json()
    flatten_id(data, "owner")
    project = transform_row(data, RESOURCES["projects"]["schema"])

    state_key = "project_{}".format(project["id"])

    #pylint: disable=maybe-no-member
    last_activity_at = project.get('last_activity_at',
                                   project.get('created_at'))
    if not last_activity_at:
        raise Exception(
            #pylint: disable=line-too-long
            "There is no last_activity_at or created_at field on project {}. This usually means I don't have access to the project."
            .format(project['id']))

    if project['last_activity_at'] >= get_start(state_key):
        sync_branches(project)
        sync_commits(project)
        sync_issues(project)
        sync_milestones(project)
        sync_users(project)

        singer.write_record("projects", project)
        utils.update_state(STATE, state_key, last_activity_at)
        singer.write_state(STATE)
def sync_transactions():
    schema = load_schema("transactions")
    singer.write_schema("transactions", schema, ["id"])

    now = datetime.datetime.utcnow()
    start = utils.strptime(get_start("transactions"))
    logger.info("transactions: Syncing from {}".format(start))

    while start < now:
        end = start + datetime.timedelta(days=1)
        if end > now:
            end = now

        data = braintree.Transaction.search(
            braintree.TransactionSearch.created_at.between(start, end))
        logger.info("transactions: Fetched {} records from {} - {}".format(
            data.maximum_size, start, end))

        for row in data:
            transformed = transform_row(row, schema)
            singer.write_record("transactions", transformed)

        utils.update_state(STATE, "transactions", utils.strftime(end))
        singer.write_state(STATE)

        start += datetime.timedelta(days=1)
def sync_entity(entity, key_properties):
    start_date = get_start(entity)
    logger.info("{}: Starting sync from {}".format(entity, start_date))

    schema = load_schema(entity)
    singer.write_schema(entity, schema, key_properties)
    logger.info("{}: Sent schema".format(entity))

    logger.info("{}: Requesting export".format(entity))
    export_start = utils.strftime(
        datetime.datetime.utcnow().replace(tzinfo=pytz.utc))
    export_id = request_export(entity)
    logger.info("{}: Export ready".format(entity))

    rows = stream_export(entity, export_id)
    logger.info("{}: Got {} records".format(entity, len(rows)))

    for row in rows:
        transformed_row = transform_row(entity, row)
        singer.write_record(entity, transformed_row)

    utils.update_state(STATE, entity, export_start)
    singer.write_state(STATE)
    logger.info("{}: State synced to {}".format(entity, export_start))
def sync_entity_chunked(entity_name, key_properties, path):
    schema = load_schema(entity_name)
    singer.write_schema(entity_name, schema, key_properties)

    start = get_start(entity_name)

    now_ts = int(datetime.datetime.utcnow().timestamp() * 1000)
    start_ts = int(utils.strptime(start).timestamp() * 1000)

    url = get_url(entity_name)
    while start_ts < now_ts:
        end_ts = start_ts + CHUNK_SIZES[entity_name]
        params = {
            'startTimestamp': start_ts,
            'endTimestamp': end_ts,
            'limit': 1000,
        }
        for row in gen_request(url, params, path, "hasMore", "offset",
                               "offset"):
            record = transform(row, schema)
            singer.write_record(entity_name, record)

        utils.update_state(STATE, entity_name,
                           datetime.datetime.utcfromtimestamp(end_ts / 1000))
        singer.write_state(STATE)
        start_ts = end_ts
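# `gen_request` above is an offset-paginating generator shared by several of
# these syncs. A minimal single-offset sketch, assuming the API echoes an
# offset back under `offset_key` and a "more pages" boolean under `more_key`
# (the real helper also accepts lists of keys, as sync_contacts below shows):
def gen_request_sketch(url, params, path, more_key, offset_key, offset_target):
    params = dict(params)
    while True:
        data = request(url, params).json()
        for row in data[path]:
            yield row
        if not data.get(more_key):
            break
        params[offset_target] = data[offset_key]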
def sync_deals():
    last_sync = utils.strptime(get_start("deals"))
    days_since_sync = (datetime.datetime.utcnow() - last_sync).days
    if days_since_sync > 30:
        endpoint = "deals_all"
    else:
        endpoint = "deals_recent"

    schema = load_schema("deals")
    singer.write_schema("deals", schema, ["portalId", "dealId"])

    url = get_url(endpoint)
    params = {'count': 250}
    for i, row in enumerate(
            gen_request(url, params, "deals", "hasMore", "offset", "offset")):
        record = request(get_url("deals_detail",
                                 deal_id=row['dealId'])).json()
        record = transform(record, schema)

        modified_time = None
        if 'hs_lastmodifieddate' in record:
            modified_time = utils.strptime(
                record['hs_lastmodifieddate']['value'])
        elif 'createdate' in record:
            modified_time = utils.strptime(record['createdate']['value'])

        if not modified_time or modified_time >= last_sync:
            singer.write_record("deals", record)
            utils.update_state(STATE, "deals", modified_time)

        if i % 250 == 0:
            singer.write_state(STATE)
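# The 30-day cutoff above trades cost for completeness: HubSpot's "recent"
# endpoints only reach back roughly 30 days, so an older bookmark forces a
# full scan. Illustrative decision (dates assumed):
#   last_sync = 2021-02-01, today = 2021-03-15 -> 42 days -> "deals_all"
#   last_sync = 2021-03-10, today = 2021-03-15 ->  5 days -> "deals_recent"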
def sync_type(type, endpoint, replicationKey, useValidationWindow):
    schema = load_schema(type)
    singer.write_schema(type, schema, [replicationKey])

    dateFrom = get_start(type)
    if useValidationWindow:
        dateFrom = dateFrom - timedelta(days=CONFIG['validation_window'])

    nextpageuri = get_url(f"{endpoint}.json", dateFrom)
    lastRow = None

    headers = {"Accept": "application/json"}
    auth = HTTPBasicAuth(CONFIG['account_sid'], CONFIG['auth_token'])

    while nextpageuri != "":
        req = requests.Request("GET",
                               url=f"{BASE_HOST}{nextpageuri}",
                               headers=headers,
                               auth=auth).prepare()
        resp = SESSION.send(req)
        resp.raise_for_status()
        # named `body` rather than `json` so the stdlib json module is not
        # shadowed
        body = resp.json()

        for row in body.get(endpoint):
            output = map_types(schema, row)
            lastRow = output
            singer.write_record(type, output)

        nextpageuri = body['@nextpageuri']

    if lastRow is not None:
        utils.update_state(STATE, type, lastRow[replicationKey])
def sync_entity(entity, primary_keys, date_keys=None, transform=None):
    schema = load_schema(entity)
    singer.write_schema(entity, schema, primary_keys)
    start_date = get_start(entity)

    for row in gen_request(entity):
        if transform:
            row = transform(row)

        if date_keys:
            # Rows can have various values for various date keys (see the
            # calls to `sync_entity` in `do_sync`), usually dates of creation
            # and update. But in some cases some keys may not be present.
            #
            # To handle this we:
            #
            # 1. Get _all_ the values for all the keys that are actually
            #    present in the row (not every row has every key), and exclude
            #    missing ones.
            #
            # 2. Take the max of those values as the bookmark for that entity.
            #
            # A KeyError is raised if the row has none of the date keys.
            if not any(date_key in row for date_key in date_keys):
                raise KeyError('None of the date keys was found in the row')

            last_touched = max(row[date_key]
                               for date_key in date_keys
                               if date_key in row)

            utils.update_state(STATE, entity, last_touched)

            if last_touched < start_date:
                continue

        row = transform_row(row, schema)

        singer.write_record(entity, row)

    singer.write_state(STATE)
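# Worked example of the date-key bookmark rule above (illustrative values):
# absent keys are skipped, and for ISO-8601 strings the lexicographic max is
# also the chronological max.
_row = {'created_at': '2021-01-02T00:00:00Z',
        'updated_at': '2021-03-05T00:00:00Z'}
_date_keys = ('created_at', 'updated_at', 'closed_at')
assert max(_row[k] for k in _date_keys if k in _row) == '2021-03-05T00:00:00Z'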
def sync_events():
    schema = load_schema("events")
    singer.write_schema("events", schema, [])

    for export_bundle in request_export_bundles():
        for event in download_events(export_bundle['Id']):
            transform_event(event)
            singer.write_record("events", event)
        stop_timestamp = datetime.datetime.utcfromtimestamp(
            export_bundle['Stop'])
        utils.update_state(STATE, "events", stop_timestamp)
        singer.write_state(STATE)
def sync_reviews(headers, CONFIG, STATE):
    page = 0            # initialize at first page
    rec = 1             # initialize first record
    total_pages = 1     # initial total pages, which gets overwritten
    total_elements = 0  # initial total elements, which gets overwritten
    size = 10           # number of records per request
    to_timestamp = int(headers['X-Revinate-Porter-Timestamp'])
    last_update = to_timestamp  # init
    from_timestamp = 0          # initial value

    # set from_timestamp as NVL(STATE.last_update, CONFIG.start_date,
    # now - 1 week)
    if 'last_update' not in STATE:
        if 'start_date' not in CONFIG:
            from_timestamp = to_timestamp - (60 * 60 * 24 * 7)  # look back 1 week
        else:
            from_timestamp = int(time.mktime(datetime.datetime.strptime(
                CONFIG['start_date'], '%Y-%m-%dT%H:%M:%SZ').timetuple()))
    else:
        from_timestamp = int(STATE['last_update'])

    updated_at_range = str(from_timestamp) + '..' + str(to_timestamp)

    # loop through all pages
    while (page + 1) <= total_pages:
        if (rec + size) <= total_elements:
            rec_to = rec + size - 1
        else:
            rec_to = total_elements
        LOGGER.info('Page {} of {} Total Pages, Record {}-{} of {} Total Records'.format(
            str(page + 1), str(total_pages), str(rec), str(rec_to),
            str(total_elements)))

        params = {
            'updatedAt': updated_at_range,
            'page': page,
            'size': size,
            'sort': 'updatedAt,ASC'
        }

        try:
            reviews_parsed = fetch_reviews(headers, params)
        except Exception as exception:
            LOGGER.exception(exception)
            break

        # loop through all records on the page
        for record in reviews_parsed['content']:
            parsed_review = parse_review(record)
            singer.write_record('reviews', parsed_review)
            last_update = record['updatedAt']
            rec = rec + 1

        page_json = reviews_parsed['page']
        total_pages = int(page_json.get('totalPages', 1))
        total_elements = int(page_json.get('totalElements', 0))
        page = page + 1

    # update STATE last_update
    utils.update_state(STATE, 'last_update', last_update)
    singer.write_state(STATE)
    LOGGER.info("State synced to last_update: {}".format(last_update))
    LOGGER.info("Done syncing reviews.")
def sync_events():
    schema = load_schema("events")
    singer.write_schema("events", schema, [])

    for export_bundle in request_export_bundles():
        with metrics.record_counter("events") as counter:
            for event in download_events(export_bundle['Id']):
                transform_event(event)
                counter.increment()
                singer.write_record("events", event)
        stop_timestamp = datetime.datetime.utcfromtimestamp(
            export_bundle['Stop'])
        utils.update_state(STATE, "events", stop_timestamp)
        singer.write_state(STATE)
def sync_owners():
    schema = load_schema("owners")
    singer.write_schema("owners", schema, ["portalId", "ownerId"])
    start = get_start("owners")

    data = request(get_url("owners")).json()
    for row in data:
        record = transform(row, schema)
        if record['updatedAt'] >= start:
            singer.write_record("owners", record)
            utils.update_state(STATE, "owners", record['updatedAt'])

    singer.write_state(STATE)
def sync_keywords():
    schema = load_schema("keywords")
    singer.write_schema("keywords", schema, ["keyword_guid"])
    start = get_start("keywords")

    data = request(get_url("keywords")).json()
    for row in data['keywords']:
        record = transform(row, schema)
        if record['created_at'] >= start:
            singer.write_record("keywords", record)
            utils.update_state(STATE, "keywords", record['created_at'])

    singer.write_state(STATE)
def sync_workflows():
    schema = load_schema("workflows")
    singer.write_schema("workflows", schema, ["id"])
    start = get_start("workflows")

    data = request(get_url("workflows")).json()
    for row in data['workflows']:
        record = transform(row, schema)
        if record['updatedAt'] >= start:
            singer.write_record("workflows", record)
            utils.update_state(STATE, "workflows", record['updatedAt'])

    singer.write_state(STATE)
def sync_contacts():
    last_sync = utils.strptime(get_start("contacts"))
    days_since_sync = (datetime.datetime.utcnow() - last_sync).days
    if days_since_sync > 30:
        endpoint = "contacts_all"
        offset_keys = ['vid-offset']
        offset_targets = ['vidOffset']
    else:
        endpoint = "contacts_recent"
        offset_keys = ['vid-offset', 'time-offset']
        offset_targets = ['vidOffset', 'timeOffset']

    schema = load_schema("contacts")
    singer.write_schema("contacts", schema, ["canonical-vid"])

    def sync_vid_batch(vids):
        # Fetch the detail records for a batch of vids and emit them.
        data = request(get_url("contacts_detail"),
                       params={'vid': vids}).json()
        for _, record in data.items():
            record = transform(record, schema)
            singer.write_record("contacts", record)

            modified_time = None
            if 'lastmodifieddate' in record['properties']:
                modified_time = record['properties'][
                    'lastmodifieddate']['value']
            utils.update_state(STATE, "contacts", modified_time)

    url = get_url(endpoint)
    params = {
        'showListMemberships': True,
        'count': 100,
    }
    vids = []
    for row in gen_request(url, params, 'contacts', 'has-more', offset_keys,
                           offset_targets):
        modified_time = None
        if 'lastmodifieddate' in row['properties']:
            modified_time = utils.strptime(
                _transform_datetime(
                    row['properties']['lastmodifieddate']['value']))

        if not modified_time or modified_time >= last_sync:
            vids.append(row['vid'])

        if len(vids) == 100:
            sync_vid_batch(vids)
            vids = []

    # Flush any trailing batch of fewer than 100 vids so the final page of
    # contacts is not silently dropped.
    if vids:
        sync_vid_batch(vids)

    singer.write_state(STATE)
def sync_entity(entity):
    logger.info("Syncing {} from {}".format(entity, get_start(entity)))

    schema = load_schema(entity)
    singer.write_schema(entity, schema, ["id"])

    for i, row in enumerate(gen_request(entity)):
        transform_datetimes(row)
        singer.write_record(entity, row)
        utils.update_state(STATE, entity, row["updated_at"])

        # "end_users" is the only one that can be queried by updated_at.
        # As such, the other streams require a full sync before writing
        # bookmarks.
        if i % 50 == 49 and entity == "end_users":
            singer.write_state(STATE)

    singer.write_state(STATE)
def sync_entity(entity, primary_keys, date_keys=None, transform=None):
    schema = load_schema(entity)
    singer.write_schema(entity, schema, primary_keys)
    start_date = get_start(entity)

    for row in gen_request(entity):
        if transform:
            row = transform(row)

        row = transform_row(row, schema)

        if date_keys:
            last_touched = max(row[date_key] for date_key in date_keys)
            utils.update_state(STATE, entity, last_touched)

            if last_touched < start_date:
                continue

        singer.write_record(entity, row)

    singer.write_state(STATE)
def sync_activities(activity_type_id, lead_fields, date_fields, leads_schema,
                    do_leads=False):
    global LEAD_IDS, LEAD_IDS_SYNCED

    state_key = 'activities_{}'.format(activity_type_id)
    start = get_start(state_key)
    data = request("v1/activities/pagingtoken.json", {'sinceDatetime': start})
    params = {
        'activityTypeIds': activity_type_id,
        'nextPageToken': data['nextPageToken'],
        'batchSize': LEADS_BATCH_SIZE,
    }

    for row in gen_request("v1/activities.json", params=params):
        # Stream in the activity and update the state.
        singer.write_record("activities", row)
        utils.update_state(STATE, state_key, row['activityDate'])

        if do_leads:
            # Add the lead id to the set of lead ids that need to be synced,
            # unless it was already synced.
            lead_id = row['leadId']
            if lead_id not in LEAD_IDS_SYNCED:
                LEAD_IDS.add(lead_id)

            # If we have 300 or more lead ids (one page), sync those leads and
            # mark the ids as synced. Once the leads have been synced we can
            # update the state.
            if len(LEAD_IDS) >= LEADS_BATCH_SIZE:
                # Take the first 300 off the set and sync them.
                lead_ids = list(LEAD_IDS)[:LEADS_BATCH_SIZE]
                sync_leads(lead_ids, lead_fields, date_fields, leads_schema)

                # Remove the synced lead ids from the set to be synced and add
                # them to the set of synced ids.
                LEAD_IDS = LEAD_IDS.difference(lead_ids)
                LEAD_IDS_SYNCED = LEAD_IDS_SYNCED.union(lead_ids)

                # Update the state.
                singer.write_state(STATE)
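# Illustration of the lead-id batching above (values are made up): ids
# accumulate until a full page of LEADS_BATCH_SIZE is available, then that
# slice moves to the synced set so later activities for the same lead are not
# fetched twice.
_pending, _synced, _batch_size = {101, 102, 103, 104}, set(), 3
_batch = list(_pending)[:_batch_size]
_pending = _pending.difference(_batch)
_synced = _synced.union(_batch)
assert len(_pending) == 1 and len(_synced) == 3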
def sync_companies():
    last_sync = utils.strptime(get_start("companies"))
    days_since_sync = (datetime.datetime.utcnow() - last_sync).days
    if days_since_sync > 30:
        endpoint = "companies_all"
        path = "companies"
        more_key = "has-more"
        offset_keys = ["offset"]
        offset_targets = ["offset"]
    else:
        endpoint = "companies_recent"
        path = "results"
        more_key = "hasMore"
        offset_keys = ["offset"]
        offset_targets = ["offset"]

    schema = load_schema('companies')
    singer.write_schema("companies", schema, ["companyId"])

    url = get_url(endpoint)
    params = {'count': 250}
    for i, row in enumerate(
            gen_request(url, params, path, more_key, offset_keys,
                        offset_targets)):
        record = request(
            get_url("companies_detail", company_id=row['companyId'])).json()
        record = transform(record, schema)

        modified_time = None
        if 'hs_lastmodifieddate' in record:
            modified_time = utils.strptime(
                record['hs_lastmodifieddate']['value'])
        elif 'createdate' in record:
            modified_time = utils.strptime(record['createdate']['value'])

        if not modified_time or modified_time >= last_sync:
            singer.write_record("companies", record)
            utils.update_state(STATE, "companies", modified_time)

        if i % 250 == 0:
            singer.write_state(STATE)
def sync_type(type, endpoint, replicationKey):
    schema = load_schema(type)
    singer.write_schema(type, schema, [replicationKey])

    url = get_url(endpoint, get_start(type).strftime("%Y-%m-%d"))
    req = requests.Request("GET", url=url).prepare()
    resp = SESSION.send(req)
    resp.raise_for_status()

    finalRow = None
    for row in resp.json():
        finalRow = row
        if row.get("date"):
            row["date"] = dateparser.parse(row["date"]).isoformat() + "Z"
        if row.get("rating"):
            row["rating"] = int(row["rating"])
        singer.write_record(type, row)

    if finalRow is not None:
        utils.update_state(STATE, type, finalRow['date'])
def sync_advertisers(client):
    schema = load_schema("merchants")
    singer.write_schema("merchants", schema, ["Id"])

    lastModified = get_start("merchants", False)
    finalRow = None

    resp = client.service.getMerchantList(sRelationship="joined")
    for x in resp.body.getMerchantListReturn:
        x = helpers.serialize_object(x)
        if x['aCommissionRanges'] is not None:
            x['aCommissionRanges'] = x['aCommissionRanges']
        if x['aSectors'] is not None:
            # fixed: the original referenced an undefined variable `t` here
            x['aSectors'] = x['aSectors']
        if lastModified is None or x['dDetailsModified'] > lastModified:
            singer.write_record("merchants", map_type(x))
            finalRow = x

    if finalRow is not None:
        utils.update_state(STATE, "merchants", finalRow['dDetailsModified'])
def sync_endpoint(catalog_entry, schema, mdata, date_fields=None):
    singer.write_schema(catalog_entry.tap_stream_id,
                        schema,
                        [PRIMARY_KEY],
                        bookmark_properties=[REPLICATION_KEY])

    start = get_start(catalog_entry.tap_stream_id)

    url = get_url(catalog_entry.tap_stream_id)
    data = request(url)[catalog_entry.tap_stream_id]
    time_extracted = utils.now()

    stream_version = get_stream_version(catalog_entry.tap_stream_id)
    activate_version_message = singer.ActivateVersionMessage(
        stream=catalog_entry.stream,
        version=stream_version
    )

    for row in data:
        with Transformer() as transformer:
            rec = transformer.transform(row, schema, mdata)

            append_times_to_dates(rec, date_fields)

            try:
                updated_at = rec[REPLICATION_KEY]
            except KeyError:
                updated_at = start

            if updated_at >= start:
                new_record = singer.RecordMessage(
                    stream=catalog_entry.stream,
                    record=rec,
                    version=stream_version,
                    time_extracted=time_extracted)
                singer.write_message(new_record)
                utils.update_state(STATE, catalog_entry.tap_stream_id,
                                   updated_at)

    singer.write_state(STATE)
    singer.write_message(activate_version_message)
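# The ActivateVersionMessage flow above is Singer's versioned (full-table)
# replication handshake: each record in this run carries `stream_version`, and
# emitting the activate message only after the loop signals downstream targets
# that rows from older versions may be dropped. Message order, per this code:
#   1. write_schema   2. RecordMessage(version=N) per row
#   3. write_state    4. ActivateVersionMessage(version=N)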
def do_sync(guid, company_id, start_date):
    """Use the request function to get data and write the schema and response
    to singer."""
    schema = load_schema()
    LOGGER.info("---------- Writing Schema ----------")
    singer.write_schema("powerinbox_response", schema, "stripe")
    LOGGER.info("---------- Starting sync ----------")

    pickup_date = get_start_date("start_date")
    if pickup_date:
        next_date = pickup_date
    else:
        next_date = start_date

    try:
        while next_date < utils.strftime(utils.now(), DATE_FORMAT):
            ext_url = ("{company_id}/{guid}/all_stripe/{date}.json"
                       .format(company_id=company_id, guid=guid,
                               date=next_date))
            response = request(BASE_URL + ext_url)

            with metrics.record_counter(BASE_URL + ext_url) as counter:
                for record in response:
                    singer.write_records("powerinbox_response", [record])
                    utils.update_state(STATE, "start_date",
                                       next_date.encode("ascii", "ignore"))
                    counter.increment()

            next_date = utils.strftime(
                (utils.strptime_to_utc(next_date) + timedelta(days=1)),
                DATE_FORMAT)
    except Exception as exc:
        LOGGER.critical(exc)
        singer.write_state(STATE)
        raise exc

    singer.write_state(STATE)
    LOGGER.info("---------- Completed sync ----------")
def sync_transactions(client):
    schema = load_schema("transactions")
    singer.write_schema("transactions", schema, ["Id"])

    dateFrom = get_start("transactions") - timedelta(
        days=CONFIG['validation_window'])
    dateTo = datetime.now(timezone.utc)

    start = dateFrom
    offset = 0
    finalRow = None

    # handle batches by number of days and number of rows
    while start < dateTo:
        end = start + timedelta(days=MAX_DAYS)
        if end > dateTo:
            end = dateTo

        resp = client.service.getTransactionList(dStartDate=start,
                                                 dEndDate=end,
                                                 iOffset=offset,
                                                 iLimit=BATCH_SIZE,
                                                 sDateType="transaction")

        if resp.body.getTransactionListCountReturn.iRowsReturned > 0:
            for t in resp.body.getTransactionListReturn:
                t = helpers.serialize_object(t)
                if t['aTransactionParts'] is not None:
                    t['aTransactionParts'] = t['aTransactionParts']
                singer.write_record("transactions", map_type(t))
                finalRow = t

        if (offset + resp.body.getTransactionListCountReturn.iRowsReturned
                ) < resp.body.getTransactionListCountReturn.iRowsAvailable:
            offset += resp.body.getTransactionListCountReturn.iRowsReturned
        else:
            start = end
            offset = 0

    if finalRow is not None:
        utils.update_state(STATE, "transactions",
                           finalRow['dTransactionDate'])
def sync_endpoint(endpoint, schema, mdata, date_fields=None):
    singer.write_schema(endpoint,
                        schema,
                        [PRIMARY_KEY],
                        bookmark_properties=[REPLICATION_KEY])

    start = get_start(endpoint)

    url = get_url(endpoint)
    data = request(url)[endpoint]
    time_extracted = utils.now()

    for row in data:
        with Transformer() as transformer:
            rec = transformer.transform(row, schema, mdata)

            append_times_to_dates(rec, date_fields)

            updated_at = rec[REPLICATION_KEY]

            if updated_at >= start:
                singer.write_record(endpoint, rec,
                                    time_extracted=time_extracted)
                utils.update_state(STATE, endpoint, updated_at)

    singer.write_state(STATE)
def sync_endpoint(schema_name, endpoint=None, path=None,
                  special_field_name=None, special_field_value=None,
                  keys=None, object_to_id=None, parameter_for_updated=None):
    schema = load_schema(schema_name)
    bookmark_property = 'updated_at'
    LOGGER.info('Loading ' + schema_name)

    if keys is None:
        keys = ['id']

    singer.write_schema(schema_name,
                        schema,
                        keys,
                        bookmark_properties=[bookmark_property])

    start = get_start(schema_name)
    start_dt = datetime.datetime.strptime(start, '%Y-%m-%dT%H:%M:%S.%fZ')
    updated_since = start_dt.strftime("%Y%m%dT%H%M%S")
    LOGGER.info('updated_since ' + updated_since)

    with Transformer() as transformer:
        url = get_url(endpoint or schema_name)
        url = endpoint or url
        if parameter_for_updated is not None:
            url = url + '?' + parameter_for_updated + '=' + updated_since
        response = request(url, None)
        LOGGER.info('URL: ' + url)

        # fixed: the original compared with `is`, which tests identity, not
        # string equality
        if schema_name == 'project_financials':
            response = [response]

        time_extracted = utils.now()

        for row in response:
            if special_field_name is not None:
                row[special_field_name] = special_field_value

            if object_to_id is not None:
                for key in object_to_id:
                    if row[key] is not None:
                        row[key + '_id'] = row[key]['id']
                    else:
                        row[key + '_id'] = None

            item = transformer.transform(row, schema)

            if bookmark_property not in item:
                item[bookmark_property] = \
                    datetime.datetime.now().strftime('%Y-%m-%d') \
                    + 'T00:00:00.00Z'

            if datetime.datetime.strptime(item[bookmark_property],
                                          '%Y-%m-%dT%H:%M:%S.%fZ') >= start_dt:
                singer.write_record(schema_name, item,
                                    time_extracted=time_extracted)
                utils.update_state(STATE, schema_name,
                                   item[bookmark_property])

    singer.write_state(STATE)
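# Example of the `updated_since` compaction above (illustrative value): a
# stored bookmark of '2021-03-05T12:30:45.000000Z' becomes '20210305T123045'
# in the query string.
import datetime as _dt
_s = _dt.datetime.strptime('2021-03-05T12:30:45.000000Z',
                           '%Y-%m-%dT%H:%M:%S.%fZ')
assert _s.strftime('%Y%m%dT%H%M%S') == '20210305T123045'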
def sync_rate_cards(  # pylint: disable=too-many-arguments
        schema_name,
        endpoint=None,
        path=None,
        special_field_name=None,
        special_field_value=None,
        date_fields=None,
        with_updated_since=True,
        for_each_handler=None,
        map_handler=None,
        object_to_id=None,
):
    schema = load_schema(schema_name)
    bookmark_property = 'updated_at'
    LOGGER.info('Loading ' + schema_name)
    singer.write_schema(schema_name,
                        schema,
                        ['id'],
                        bookmark_properties=[bookmark_property])

    start = get_start(schema_name)

    with Transformer() as transformer:
        url = get_url(endpoint or schema_name)
        url = endpoint or url
        response = request(url, None)
        time_extracted = utils.now()

        for row in response:
            if map_handler is not None:
                row = map_handler(row)

            if object_to_id is not None:
                for key in object_to_id:
                    if row[key] is not None:
                        row[key + '_id'] = row[key]['id']
                    else:
                        row[key + '_id'] = None

            item = transformer.transform(row, schema)

            if bookmark_property not in item:
                item[bookmark_property] = \
                    datetime.datetime.now().strftime('%Y-%m-%d') \
                    + 'T00:00:00Z'

            # sync the rates nested under this rate card
            sync_endpoint(
                'rate_cards_rates',
                BASE_API_URL + 'rate_cards/' + str(row['id']) + '/rates',
                None,
                'rate_card_id',
                str(row['id']),
                ['rate_card_id', 'role'],
            )

            singer.write_record(schema_name, item,
                                time_extracted=time_extracted)

            # take any additional actions required for the currently loaded
            # endpoint
            utils.update_state(STATE, schema_name, item[bookmark_property])

    singer.write_state(STATE)
def sync_project(  # pylint: disable=too-many-arguments
        schema_name,
        endpoint=None,
        path=None,
        special_field_name=None,
        special_field_value=None,
        date_fields=None,
        with_updated_since=True,
        for_each_handler=None,
        map_handler=None,
        object_to_id=None,
):
    schema = load_schema(schema_name)
    bookmark_property = 'updated_at'
    LOGGER.info('Loading ' + schema_name)
    singer.write_schema(schema_name, schema, ['id'])

    start = get_start(schema_name)

    with Transformer() as transformer:
        url = get_url(endpoint or schema_name)
        url = endpoint or url
        response = request(url, None)

        for row in response:
            for refs in row['external_refs']:
                if refs['name'] == 'harvest_project_id':
                    LOGGER.info('Loading in if ' + str(refs))
                    row['connected_project'] = refs['value']

            item = transformer.transform(row, schema)
            time_extracted = utils.now()

            # find related sub-resources for this project
            sync_endpoint(
                'expense_items',
                BASE_API_URL + 'projects/' + str(row['id']) + '/expense_items',
                None, 'project_id', str(row['id']))
            sync_endpoint(
                'invoices',
                BASE_API_URL + 'projects/' + str(row['id']) + '/invoices',
                None, 'project_id', str(row['id']))
            sync_endpoint(
                'milestones',
                BASE_API_URL + 'projects/' + str(row['id']) + '/milestones',
                None, 'project_id', str(row['id']))
            sync_endpoint(
                'project_team',
                BASE_API_URL + 'projects/' + str(row['id']) + '/team',
                None, 'project_id', str(row['id']),
                ['person_id', 'project_id'],
            )
            sync_endpoint(
                'sprints',
                BASE_API_URL + 'projects/' + str(row['id']) + '/sprints',
                None, 'project_id', str(row['id']))
            sync_endpoint(
                'workflow_columns',
                BASE_API_URL + 'projects/' + str(row['id']) + '/workflow_columns',
                None, 'project_id', str(row['id']))
            sync_endpoint(
                'project_financials',
                BASE_API_URL + 'projects/' + str(row['id']) + '/financials',
                None, None, None,
                ['project_id'],
            )

            if bookmark_property in item and item[bookmark_property] >= start:
                singer.write_record(schema_name, item,
                                    time_extracted=time_extracted)
                utils.update_state(STATE, schema_name,
                                   item[bookmark_property])

    singer.write_state(STATE)
def sync_allocations(
        schema_name,
        endpoint=None,
        path=None,
        special_field_name=None,
        special_field_value=None,
        keys=None,
        object_to_id=None,
):
    schema = load_schema(schema_name)
    bookmark_property = 'updated_at'
    LOGGER.info('Loading ' + schema_name)

    if keys is None:
        keys = ['id']

    singer.write_schema(schema_name,
                        schema,
                        keys,
                        bookmark_properties=[bookmark_property])

    start = get_start(schema_name)

    weekDays = [
        'monday',
        'tuesday',
        'wednesday',
        'thursday',
        'friday',
        'saturday',
        'sunday',
    ]

    with Transformer() as transformer:
        url = get_url(endpoint or schema_name)
        url = endpoint or url
        response = request(url, None)
        time_extracted = utils.now()

        for row in response:
            # Fan the weekly allocation row out into one record per day
            # between start_date and end_date.
            date = datetime.datetime.strptime(row['start_date'], '%Y-%m-%d')
            LOGGER.info("Project " + str(row['project']) + "-"
                        + str(row['person']))
            end_date = datetime.datetime.strptime(row['end_date'], '%Y-%m-%d')
            newRow = {}

            while date <= end_date:
                # Days with no allocated time are skipped entirely.
                newRow['allocation'] = row[weekDays[date.weekday()]]
                if not newRow['allocation'] > 0:
                    date = date + timedelta(days=1)
                    continue

                newRow['project'] = row['project']
                newRow['non_project_time'] = row['non_project_time']
                newRow['connected_project'] = row['connected_project']
                newRow['person'] = row['person']
                newRow['date'] = date.strftime('%Y-%m-%d')
                newRow['notes'] = row['notes']
                newRow['created_by'] = row['created_by']
                newRow['updated_by'] = row['updated_by']
                newRow['created_at'] = row['created_at']
                newRow['updated_at'] = row['updated_at']
                # Daily rows get a synthetic primary key: source id + date.
                newRow['id'] = str(row['id']) \
                    + str(date.strftime('%Y%m%d'))

                date = date + timedelta(days=1)

                item = transformer.transform(newRow, schema)

                if bookmark_property not in item:
                    item[bookmark_property] = \
                        datetime.datetime.now().strftime('%Y-%m-%d') \
                        + 'T00:00:00Z'

                if bookmark_property in item \
                        and item[bookmark_property] >= start:
                    singer.write_record(schema_name, item,
                                        time_extracted=time_extracted)
                    utils.update_state(STATE, schema_name,
                                       item[bookmark_property])
                else:
                    singer.write_record(schema_name, item,
                                        time_extracted=time_extracted)
                    # take any additional actions required for the currently
                    # loaded endpoint
                    utils.update_state(STATE, schema_name,
                                       item[bookmark_property])

    singer.write_state(STATE)
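# Worked example of the per-day fan-out above (illustrative values): a weekly
# allocation row spanning Mon 2021-06-07 to Wed 2021-06-09 with 480 minutes on
# monday, 0 on tuesday and 240 on wednesday yields two daily records; the
# zero-allocation Tuesday is skipped by the `continue` branch, and each daily
# id is the source id suffixed with the date:
#   in  -> {'id': 7, 'start_date': '2021-06-07', 'end_date': '2021-06-09',
#           'monday': 480, 'tuesday': 0, 'wednesday': 240, ...}
#   out -> {'id': '720210607', 'date': '2021-06-07', 'allocation': 480, ...}
#          {'id': '720210609', 'date': '2021-06-09', 'allocation': 240, ...}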
def sync_in_app_events():
    schema = load_schema("raw_data/in_app_events")
    singer.write_schema("in_app_events", schema,
                        ["event_time", "event_name", "appsflyer_id"])

    # This order matters
    fieldnames = (
        "attributed_touch_type", "attributed_touch_time", "install_time",
        "event_time", "event_name", "event_value", "event_revenue",
        "event_revenue_currency", "event_revenue_usd", "event_source",
        "is_receipt_validated", "af_prt", "media_source", "af_channel",
        "af_keywords", "campaign", "af_c_id", "af_adset", "af_adset_id",
        "af_ad", "af_ad_id", "af_ad_type", "af_siteid", "af_sub_siteid",
        "af_sub1", "af_sub2", "af_sub3", "af_sub4", "af_sub5",
        "af_cost_model", "af_cost_value", "af_cost_currency",
        "contributor1_af_prt", "contributor1_media_source",
        "contributor1_campaign", "contributor1_touch_type",
        "contributor1_touch_time", "contributor2_af_prt",
        "contributor2_media_source", "contributor2_campaign",
        "contributor2_touch_type", "contributor2_touch_time",
        "contributor3_af_prt", "contributor3_media_source",
        "contributor3_campaign", "contributor3_touch_type",
        "contributor3_touch_time", "region", "country_code", "state", "city",
        "postal_code", "dma", "ip", "wifi", "operator", "carrier", "language",
        "appsflyer_id", "advertising_id", "idfa", "android_id",
        "customer_user_id", "imei", "idfv", "platform", "device_type",
        "os_version", "app_version", "sdk_version", "app_id", "app_name",
        "bundle_id", "is_retargeting", "retargeting_conversion_type",
        "af_attribution_lookback", "af_reengagement_window",
        "is_primary_attribution", "user_agent", "http_referrer",
        "original_url",
    )

    stop_time = datetime.datetime.now()
    from_datetime = get_start("in_app_events")
    to_datetime = get_stop(from_datetime, stop_time, 10)

    while from_datetime < stop_time:
        LOGGER.info("Syncing data from %s to %s", from_datetime, to_datetime)

        params = dict()
        params["from"] = from_datetime.strftime("%Y-%m-%d %H:%M")
        params["to"] = to_datetime.strftime("%Y-%m-%d %H:%M")
        params["api_token"] = CONFIG["api_token"]

        url = get_url("in_app_events", app_id=CONFIG["app_id"])
        request_data = request(url, params)

        csv_data = RequestToCsvAdapter(request_data)
        reader = csv.DictReader(csv_data, fieldnames)

        next(reader)  # Skip the heading row

        bookmark = from_datetime
        for i, row in enumerate(reader):
            record = xform(row, schema)
            singer.write_record("in_app_events", record)

            # AppsFlyer returns records in order of most recent first.
            if utils.strptime(record["event_time"]) > bookmark:
                bookmark = utils.strptime(record["event_time"])

        # Write out state
        utils.update_state(STATE, "in_app_events", bookmark)
        singer.write_state(STATE)

        # Move the timings forward
        from_datetime = to_datetime
        to_datetime = get_stop(from_datetime, stop_time, 10)
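# `RequestToCsvAdapter` above is not defined in this collection; it presumably
# wraps a streaming HTTP response so csv.DictReader can pull decoded text
# lines from it. A minimal sketch under that assumption (the real adapter may
# differ):
class RequestToCsvAdapterSketch:
    def __init__(self, response):
        # requests' iter_lines() yields bytes, one CSV line at a time
        self._lines = response.iter_lines()

    def __iter__(self):
        return self

    def __next__(self):
        return next(self._lines).decode("utf-8")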
def sync_transactions():
    schema = load_schema("transactions")

    singer.write_schema("transactions", schema, ["id"],
                        bookmark_properties=['created_at'])

    latest_updated_at = to_utc(
        utils.strptime(STATE.get('latest_updated_at', DEFAULT_TIMESTAMP)))
    run_maximum_updated_at = latest_updated_at

    latest_disbursement_date = to_utc(
        utils.strptime(STATE.get('latest_disbursement_date',
                                 DEFAULT_TIMESTAMP)))
    run_maximum_disbursement_date = latest_disbursement_date

    latest_start_date = to_utc(utils.strptime(get_start("transactions")))

    period_start = latest_start_date - TRAILING_DAYS
    period_end = utils.now()

    logger.info("transactions: Syncing from {}".format(period_start))
    logger.info(
        "transactions: latest_updated_at from {}, disbursement_date from {}"
        .format(latest_updated_at, latest_disbursement_date))
    logger.info(
        "transactions: latest_start_date from {}".format(latest_start_date))

    # increment through each day (20k results max from api)
    for start, end in daterange(period_start, period_end):
        end = min(end, period_end)

        data = braintree.Transaction.search(
            braintree.TransactionSearch.created_at.between(start, end))
        time_extracted = utils.now()

        logger.info("transactions: Fetched {} records from {} - {}".format(
            data.maximum_size, start, end))

        row_written_count = 0
        row_skipped_count = 0

        for row in data:
            # Ensure updated_at consistency
            if not getattr(row, 'updated_at'):
                row.updated_at = row.created_at

            transformed = transform_row(row, schema)
            updated_at = to_utc(row.updated_at)

            # if disbursement is successful, get disbursement date
            # set disbursement datetime to min if not found
            if row.disbursement_details is None:
                disbursement_date = datetime.min
            else:
                if row.disbursement_details.disbursement_date is None:
                    row.disbursement_details.disbursement_date = datetime.min

                disbursement_date = to_utc(
                    datetime.combine(
                        row.disbursement_details.disbursement_date,
                        datetime.min.time()))

            # Is this more recent than our past stored value of updated_at?
            # Is this more recent than our past stored value of
            # disbursement_date?
            # Use >= for updated_at due to non monotonic updated_at values
            # Use > for disbursement_date - confirming all transactions
            # disbursed at the same time
            # Update our high water mark for updated_at and disbursement_date
            # in this run
            if (updated_at >= latest_updated_at) or (
                    disbursement_date >= latest_disbursement_date):
                if updated_at > run_maximum_updated_at:
                    run_maximum_updated_at = updated_at

                if disbursement_date > run_maximum_disbursement_date:
                    run_maximum_disbursement_date = disbursement_date

                singer.write_record("transactions", transformed,
                                    time_extracted=time_extracted)
                row_written_count += 1
            else:
                row_skipped_count += 1

        logger.info("transactions: Written {} records from {} - {}".format(
            row_written_count, start, end))
        logger.info("transactions: Skipped {} records from {} - {}".format(
            row_skipped_count, start, end))

    # End day loop
    logger.info("transactions: Complete. Last updated record: {}".format(
        run_maximum_updated_at))
    logger.info("transactions: Complete. Last disbursement date: {}".format(
        run_maximum_disbursement_date))

    latest_updated_at = run_maximum_updated_at
    latest_disbursement_date = run_maximum_disbursement_date

    STATE['latest_updated_at'] = utils.strftime(latest_updated_at)
    STATE['latest_disbursement_date'] = utils.strftime(
        latest_disbursement_date)

    utils.update_state(STATE, "transactions", utils.strftime(end))
    singer.write_state(STATE)
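# `daterange` above is assumed to yield consecutive one-day (start, end)
# windows covering [period_start, period_end], matching the "20k results max
# from api" chunking comment. A minimal sketch under that assumption:
import datetime as _datetime

def daterange_sketch(start, end):
    for n in range((end - start).days + 1):
        window_start = start + _datetime.timedelta(days=n)
        yield window_start, window_start + _datetime.timedelta(days=1)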