def update_changesets():
    """
    Updates OrgChangeset records based on status of the publish job.

    Returns:
        (str, int): http response
    """
    now = datetime.utcnow()
    statuses = {}
    org_changesets = OrgChangeset.query(OrgChangeset.publish_job_running == True).fetch()

    if not org_changesets:
        logging.info("no changesets to update")
        return '', 204

    for org_changeset in org_changesets:
        if org_changeset.publish_job_id not in statuses:
            try:
                statuses[org_changeset.publish_job_id] = get_job(org_changeset.publish_job_id)
            except Exception:
                logging.exception("failed to retrieve job status from dataflow api")
                statuses[org_changeset.publish_job_id] = {'currentState': 'STATUS_API_CALL_FAILED'}

        job_status = statuses[org_changeset.publish_job_id].get('currentState', 'STATUS_API_RESPONSE_ERROR')
        org_changeset.publish_job_status = job_status

        # update the changeset details if the publish job status will not change any more
        if job_status in FINAL_STATES:
            org_changeset.publish_job_finished = True
            org_changeset.publish_job_running = False
            org_changeset.publish_job_failed = job_status != SUCCESS_STATE
            org_changeset.publish_finished_at = now

            if job_status == SUCCESS_STATE:
                publish_changeset_status(org_changeset.org_uid, org_changeset.changeset, CHANGESET_STATUS_SYNCED)
            else:
                publish_changeset_status(org_changeset.org_uid, org_changeset.changeset, CHANGESET_STATUS_ERROR)

        logging.info("updating org changeset ({}, {}) with job status {}".format(
            org_changeset.org_uid, org_changeset.changeset, org_changeset.publish_job_status))

        org_changeset.put()

    return '', 204

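# A minimal sketch of the job-state constants update_changesets() relies on; the real
# definitions live elsewhere in this codebase. The values mirror the terminal states the
# Dataflow API reports in currentState. Whether the synthetic STATUS_API_* values belong
# in FINAL_STATES is a design choice: excluding them (as here) means a failed status
# lookup leaves publish_job_running set, so the changeset is simply retried on the next
# cron run.
SUCCESS_STATE = 'JOB_STATE_DONE'
FINAL_STATES = [SUCCESS_STATE, 'JOB_STATE_FAILED', 'JOB_STATE_CANCELLED', 'JOB_STATE_DRAINED']
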
def get_last_changeset(org):
    """
    Gets the last changeset for an org. For orgs which are being ingested by the adapter service the
    last changeset is always Org.changeset, but some orgs are 'synced' via an external process (the
    'uploader' provider for example). In this case the last changeset needs to be derived from
    OrgChangeset.

    Args:
        org(Org): the Org object

    Returns:
        int: the last changeset for the org (-1 if the org has no changesets at all)
    """
    # org.changeset is the changeset currently being worked on (could be finished also, but it is the last)
    org_uid = org.key.string_id()
    org_changeset = OrgChangeset.query(OrgChangeset.org_uid == org_uid).order(-OrgChangeset.changeset).get()

    return max(org.changeset, org_changeset.changeset if org_changeset else -1)

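# Worked example (values hypothetical): for an adapter-ingested org with Org.changeset == 3
# and no OrgChangeset rows yet, get_last_changeset() returns max(3, -1) == 3; for an
# 'uploader' org whose newest OrgChangeset.changeset is 7 while Org.changeset is still 3,
# it returns max(3, 7) == 7.
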
def publish_changeset_status(org_uid, changeset, status_value):
    """
    Utility function for publishing org changeset status events on pubsub.

    Args:
        org_uid(str): org identifier
        changeset(int): update cycle identifier
        status_value(str): status (eg. syncing, synced, error)
    """
    topic = get_client().topic(STATUS_TOPIC)

    payload = {
        "meta": {
            "version": "2.0.0",
            "data_source_id": org_uid,
            "timestamp": datetime.utcnow().replace(microsecond=0).isoformat()
        },
        "data": [{
            "type": "changeset_sync_status",
            "id": "{}_{}".format(org_uid, changeset),
            "attributes": {
                "status": status_value,
                "changeset": changeset,
                "synced_at": None
            }
        }]
    }

    attributes = payload['data'][0]['attributes']

    if status_value == CHANGESET_STATUS_SYNCED:
        org_changeset = OrgChangeset.query(
            OrgChangeset.org_uid == org_uid,
            OrgChangeset.changeset == changeset).get()
        attributes['synced_at'] = org_changeset.publish_finished_at.replace(microsecond=0).isoformat()

    logging.info("publishing on status pubsub topic: {}".format(payload))
    topic.publish(json.dumps(payload))

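# For reference, the serialised message for a synced changeset looks roughly like this
# (org id, changeset number and timestamps are illustrative):
#
# {
#     "meta": {
#         "version": "2.0.0",
#         "data_source_id": "org123",
#         "timestamp": "2010-01-01T00:00:00"
#     },
#     "data": [{
#         "type": "changeset_sync_status",
#         "id": "org123_4",
#         "attributes": {"status": "synced", "changeset": 4, "synced_at": "2010-01-01T00:00:00"}
#     }]
# }
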
def changeset_list(org_uid):
    """
    Renders a page which shows all changesets and their status (ingestion and publish). Handles one
    org or all.

    Args:
        org_uid(str): org identifier

    Returns:
        (str, int): changeset listing page
    """
    cursor = Cursor(urlsafe=request.args.get('cursor'))
    failed = request.args.get('failed') == '1'

    query = OrgChangeset.query()

    if org_uid:
        query = query.filter(OrgChangeset.org_uid == org_uid)

    if failed:
        query = query.filter(
            ndb.OR(OrgChangeset.publish_job_failed == True,
                   OrgChangeset.publish_changeset_failed == True))
        # an OR query can't sort by a field, so fall back to key order
        query = query.order(-OrgChangeset.key)
    else:
        query = query.order(-OrgChangeset.ingestion_completed_at)

    changesets, next_cursor, more = query.fetch_page(20, start_cursor=cursor)

    return render_template(
        'changeset_list.html',
        org_uid=org_uid,
        changesets=changesets,
        next_cursor=next_cursor,
        more=more,
        url_root=request.url_root,
        failed=request.args.get('failed', '0')), 200

def test_complete_first_changeset(self):
    """
    Verifies that Org and OrgChangeset get updated to indicate that a changeset is complete.
    """
    started_at = datetime.now()
    Org(id='test', changeset=0, changeset_started_at=started_at).put()

    sync_utils.complete_changeset('test')

    # Org flags/timestamps are updated
    org = Org.get_by_id('test')
    self.assertEqual(org.changeset_completed_at, datetime(2010, 1, 1))
    self.assertEqual(org.last_update_cycle_completed_at, datetime(2010, 1, 1))
    self.assertFalse(org.update_cycle_active)

    # OrgChangeset record is added
    org_changeset = OrgChangeset.query().get()
    self.assertEqual(org_changeset.org_uid, 'test')
    self.assertEqual(org_changeset.changeset, 0)
    self.assertEqual(org_changeset.ingestion_started_at, started_at)
    self.assertEqual(org_changeset.ingestion_completed_at, datetime(2010, 1, 1))
    self.assertFalse(org_changeset.publish_job_running)
    self.assertFalse(org_changeset.publish_job_finished)
    self.assertEqual(org_changeset.publish_job_count, 0)

    # Publish task is queued for the first changeset
    tasks = self.taskqueue.get_filtered_tasks()
    self.assertEqual(len(tasks), 1)
    self.assertEqual(
        tasks[0].payload,
        json.dumps({"job_params": {"org_changeset_ids": [org_changeset.key.id()]}}))

def start_publish():
    """
    Kicks off a dataflow template to publish normalised data. The jobs are created via a task queue
    task, passing the IDs of the OrgChangesets which need to be published. This endpoint is invoked
    by a regular cron job or by a request from the admin UI, and takes an additional parameter which
    allows each org to be published by a separate dataflow job (useful for isolating an org which
    causes the whole publish job to fail).

    Returns:
        (str, int): http response
    """
    logging.info("about to kick off a publish dataflow job")

    per_org = request.form.get('per_org') == '1'
    if per_org:
        logging.info("publish job per org requested")

    # we want to publish changesets which:
    # - have newly been ingested (publish not running and not finished)
    # - OR have been attempted to be published but failed
    #   - due to the whole job failing
    #   - OR publish of the individual changeset failing
    org_changesets_query = OrgChangeset.query(
        ndb.OR(
            ndb.AND(OrgChangeset.publish_job_running == False,
                    OrgChangeset.publish_job_finished == False),
            ndb.AND(OrgChangeset.publish_job_running == False,
                    OrgChangeset.publish_job_finished == True,
                    ndb.OR(OrgChangeset.publish_job_failed == True,
                           OrgChangeset.publish_changeset_failed == True)))).order(OrgChangeset.key)

    org_changesets = list(emit_items(org_changesets_query))

    # query any currently running org changesets
    running_org_changesets_query = OrgChangeset.query(OrgChangeset.publish_job_running == True)
    running_org_changesets = list(emit_items(running_org_changesets_query))
    running_orgs = {oc.org_uid for oc in running_org_changesets}

    # filter out any org changesets whose org already has a running publish job
    # (list comprehensions rather than filter() so len() works on Python 3 as well)
    gated_org_changesets = [oc for oc in org_changesets if oc.org_uid not in running_orgs]

    if len(gated_org_changesets) != len(org_changesets):
        filtered_oc_tuples = [(oc.org_uid, oc.changeset)
                              for oc in org_changesets if oc.org_uid in running_orgs]
        logging.info(
            "stopped these changesets from being published as job already running for the org: {}"
            .format(filtered_oc_tuples))

    if not gated_org_changesets:
        logging.info("nothing to publish")
        return '', 204

    # remove changesets for blacklisted orgs
    blacklisted_orgs = {}
    org_changesets_to_publish = []
    for org_changeset in gated_org_changesets:
        org = blacklisted_orgs.get(org_changeset.org_uid, Org.get_by_id(org_changeset.org_uid))
        if org and org.publish_disabled:
            blacklisted_orgs[org.key.string_id()] = org
        else:
            org_changesets_to_publish.append(org_changeset)

    to_publish = []

    if per_org:
        org_changesets_sorted = sorted(org_changesets_to_publish, key=attrgetter('org_uid'))
        for org_uid, changesets in groupby(org_changesets_sorted, key=attrgetter('org_uid')):
            to_publish.append({
                'org_uid': org_uid,
                'org_changeset_ids': [changeset.key.id() for changeset in changesets]
            })
    else:
        to_publish.append({
            'org_changeset_ids': [changeset.key.id() for changeset in org_changesets_to_publish]
        })

    logging.info("have {} publish tasks to create".format(len(to_publish)))

    items_to_tasks(
        items=to_publish,
        queue=Queue('create-publish-job'),
        task_generator=lambda item: Task(
            url='/orchestrator/create_publish_job_task',
            payload=dumps({'job_params': item})))

    return '', 204

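# start_publish() relies on two helpers defined elsewhere in the codebase. Minimal
# sketches of what they might look like follow; the names come from the call sites
# above, but the paging and enqueueing details are assumptions, not the canonical
# implementations.

def emit_items(query, page_size=500):
    """Yields entities from an ndb query one page at a time to keep memory bounded."""
    cursor = None
    more = True
    while more:
        items, cursor, more = query.fetch_page(page_size, start_cursor=cursor)
        for item in items:
            yield item


def items_to_tasks(items, queue, task_generator):
    """Builds one task per item via task_generator and enqueues them on the given queue."""
    tasks = [task_generator(item) for item in items]
    if tasks:
        # taskqueue allows at most 100 tasks per add() call; a real implementation
        # would chunk the list accordingly
        queue.add(tasks)
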
def status(org_uid):
    """
    Retrieve org status.

    Args:
        org_uid(str): org identifier

    Returns:
        (str, int): http response
    """

    def date_str(date):
        """
        Formats a date into a string (handles None values also).

        Args:
            date(date|datetime): date to be formatted

        Returns:
            str: formatted date
        """
        if date is None:
            return None
        return date.isoformat() + 'Z'

    org = Org.get_by_id(org_uid)
    if not org:
        logging.info("org {} not found".format(org_uid))
        return '', 404

    changeset = OrgChangeset.query(
        OrgChangeset.org_uid == org_uid,
        OrgChangeset.publish_job_finished == True,
        OrgChangeset.publish_job_failed == False).order(-OrgChangeset.publish_finished_at).fetch(1)

    # the first publish happens only when all the data is ingested, so if the first publish
    # happened the org is synced
    synced = bool(changeset)

    # synced_at is the ingestion completion time of the last changeset that got published
    synced_at = changeset[0].ingestion_completed_at if changeset else None

    status_payload = {
        'synced': synced,
        'synced_at': date_str(synced_at),
        'connected': org.status == CONNECTED,
        'updating': org.changeset_started_at is not None and org.changeset_completed_at is None,
        'source': org.provider,
        'id': org_uid
    }

    logging.info("org status: {}".format(status_payload))
    return jsonify(status_payload), 200

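# Example response body for a synced, connected org (all values illustrative; 'qbo' is a
# made-up provider name, not necessarily one this service supports):
#
# {
#     "synced": true,
#     "synced_at": "2010-01-01T00:00:00Z",
#     "connected": true,
#     "updating": false,
#     "source": "qbo",
#     "id": "org123"
# }
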
def get_changeset_status_payload(org_uid, changeset):
    """
    Creates response body for changeset status API.

    Args:
        org_uid(str): org identifier
        changeset(int): update cycle identifier

    Returns:
        dict: changeset status response payload
    """
    changeset_id = "{}_{}".format(org_uid, changeset)
    status = "unknown"
    synced_at = None

    org = Org.get_by_id(org_uid)
    if not org:
        payload = {
            "meta": {
                "version": "2.0.0",
            },
            "errors": [{
                "id": "{}_not_found".format(org_uid),
                "status": "404",
                "code": "not_found",
                "title": "Data Source not found",
                "detail": "Data Source {} could not be found.".format(org_uid)
            }]
        }

        logging.info("org {}:{} not found - response {}".format(org_uid, changeset, payload))
        return payload

    if changeset > get_last_changeset(org):
        payload = {
            "meta": {
                "version": "2.0.0",
                "data_source_id": org_uid
            },
            "errors": [{
                "id": "{}_{}_not_found".format(org_uid, changeset),
                "status": "404",
                "code": "not_found",
                "title": "Changeset not found",
                "detail": "Changeset {} could not be found for {}.".format(changeset, org_uid)
            }]
        }

        logging.info("changeset {}:{} not found - response {}".format(org_uid, changeset, payload))
        return payload

    org_changeset = OrgChangeset.query(
        OrgChangeset.org_uid == org_uid,
        OrgChangeset.changeset == changeset).get()

    # if org_changeset exists it means ingestion is done
    if org_changeset:
        # if published successfully it means synced
        finished = org_changeset.publish_job_finished and not org_changeset.publish_job_running
        successful = not org_changeset.publish_job_failed and not org_changeset.publish_changeset_failed

        if finished and successful:
            status = CHANGESET_STATUS_SYNCED
            synced_at = org_changeset.publish_finished_at.replace(microsecond=0).isoformat()
        elif not finished:
            status = CHANGESET_STATUS_SYNCING
        else:
            status = CHANGESET_STATUS_ERROR

    # ingestion is still in progress
    else:
        if org.status == CONNECTED:
            status = CHANGESET_STATUS_SYNCING
        elif org.status == DISCONNECTED:
            status = CHANGESET_STATUS_ERROR

    # just in case we have a gap in the above logic (could indicate inconsistent org state also)
    if status == "unknown":
        logging.error("could not determine changeset status for {}:{}".format(org_uid, changeset))

    payload = {
        "meta": {
            "version": "2.0.0",
            "data_source_id": org_uid
        },
        "data": [{
            "type": "changeset_status",
            "id": changeset_id,
            "relationships": {
                "sync_status": {
                    "data": {
                        "type": "changeset_sync_status",
                        "id": changeset_id
                    }
                }
            }
        }],
        "included": [{
            "type": "changeset_sync_status",
            "id": changeset_id,
            "attributes": {
                "status": status,
                "synced_at": synced_at
            }
        }]
    }

    logging.info("changeset status for {}: {}".format(changeset_id, payload))
    return payload

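# Hypothetical wiring, for illustration only (the route and handler name are assumptions,
# not taken from this module): get_changeset_status_payload() is framework-agnostic, so a
# Flask endpoint only needs to pick the right HTTP status code around it.
@app.route('/api/orgs/<org_uid>/changesets/<int:changeset>/status')
def changeset_status(org_uid, changeset):
    payload = get_changeset_status_payload(org_uid, changeset)
    return jsonify(payload), 404 if 'errors' in payload else 200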