Code Example #1
def update_changesets():
    """
    Updates OrgChangeset records based on status of the publish job.

    Returns:
        (str, int): http response
    """
    now = datetime.utcnow()
    statuses = {}
    org_changesets = OrgChangeset.query(
        OrgChangeset.publish_job_running == True).fetch()

    if not org_changesets:
        logging.info("no changesets to update")
        return '', 204

    for org_changeset in org_changesets:
        if org_changeset.publish_job_id not in statuses:
            try:
                statuses[org_changeset.publish_job_id] = get_job(
                    org_changeset.publish_job_id)
            except Exception:
                logging.exception(
                    "failed to retrieve job status from dataflow api")
                statuses[org_changeset.publish_job_id] = {
                    'currentState': 'STATUS_API_CALL_FAILED'
                }

        job_status = statuses[org_changeset.publish_job_id]
        job_status = job_status.get('currentState',
                                    'STATUS_API_RESPONSE_ERROR')
        org_changeset.publish_job_status = job_status

        # update the changeset details if the publish job status will not change any more
        if job_status in FINAL_STATES:
            org_changeset.publish_job_finished = True
            org_changeset.publish_job_running = False
            org_changeset.publish_job_failed = job_status != SUCCESS_STATE
            org_changeset.publish_finished_at = now

            if job_status == SUCCESS_STATE:
                publish_changeset_status(org_changeset.org_uid,
                                         org_changeset.changeset,
                                         CHANGESET_STATUS_SYNCED)
            else:
                publish_changeset_status(org_changeset.org_uid,
                                         org_changeset.changeset,
                                         CHANGESET_STATUS_ERROR)

        logging.info(
            "updating org changeset ({}, {}) with job status {}".format(
                org_changeset.org_uid, org_changeset.changeset,
                org_changeset.publish_job_status))

        org_changeset.put()

    return '', 204
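
The function above relies on a get_job helper plus FINAL_STATES and SUCCESS_STATE constants defined elsewhere. A minimal sketch of what they might look like against the Dataflow REST API (the helper, the constants, and the project ID are assumptions, not taken from this codebase):

from googleapiclient.discovery import build

SUCCESS_STATE = 'JOB_STATE_DONE'
# Terminal Dataflow job states; the STATUS_API_* sentinels used above are
# presumably excluded so that failed lookups are retried on the next run.
FINAL_STATES = {'JOB_STATE_DONE', 'JOB_STATE_FAILED', 'JOB_STATE_CANCELLED'}


def get_job(job_id):
    """Fetches the Dataflow job resource, which carries 'currentState'."""
    service = build('dataflow', 'v1b3')
    return service.projects().jobs().get(
        projectId='my-project', jobId=job_id).execute()  # project ID assumed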
Code Example #2
def get_last_changeset(org):
    """
    Gets the last changeset for an org.

    For orgs which are being ingested by the adapter service the last changeset is always Org.changeset, but some orgs
    are 'synced' via an external process (the 'uploader' provider for example). In this case the last changeset needs
    to be derived from OrgChangeset.

    Args:
        org(Org): the Org object

    Returns:
        int: the last changeset number for the org
    """
    # org.changeset is the changeset currently being worked on (it may already be finished, but it is the latest)
    org_uid = org.key.string_id()
    org_changeset = OrgChangeset.query(
        OrgChangeset.org_uid == org_uid).order(-OrgChangeset.changeset).get()
    return max(org.changeset, org_changeset.changeset if org_changeset else -1)
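
A quick illustration of the fallback logic (entity values are made up, and a datastore test context is assumed):

org = Org(id='acme', changeset=3)
org.put()
assert get_last_changeset(org) == 3      # no OrgChangeset rows yet

OrgChangeset(org_uid='acme', changeset=5).put()
assert get_last_changeset(org) == 5      # derived from OrgChangeset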
Code Example #3
def publish_changeset_status(org_uid, changeset, status_value):
    """
    Utility function for publishing org changeset status events on pubsub.

    Args:
        org_uid(str): org identifier
        changeset(int): update cycle identifier
        status_value(str): status (eg. syncing, synced, error)
    """
    topic = get_client().topic(STATUS_TOPIC)

    payload = {
        "meta": {
            "version": "2.0.0",
            "data_source_id": org_uid,
            "timestamp": datetime.utcnow().replace(microsecond=0).isoformat()
        },
        "data": [{
            "type": "changeset_sync_status",
            "id": "{}_{}".format(org_uid, changeset),
            "attributes": {
                "status": status_value,
                "changeset": changeset,
                "synced_at": None
            }
        }]
    }

    attributes = payload['data'][0]['attributes']

    if status_value == CHANGESET_STATUS_SYNCED:
        org_changeset = OrgChangeset.query(
            OrgChangeset.org_uid == org_uid,
            OrgChangeset.changeset == changeset).get()
        # guard against a missing record or a not-yet-saved publish_finished_at
        if org_changeset and org_changeset.publish_finished_at:
            attributes['synced_at'] = org_changeset.publish_finished_at.replace(
                microsecond=0).isoformat()

    logging.info("publishing on status pubsub topic: {}".format(payload))

    topic.publish(json.dumps(payload))
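
get_client() and STATUS_TOPIC are defined elsewhere; the .topic(...)/.publish(...) calls match the legacy (pre-1.0) google-cloud-pubsub client, so a plausible sketch is (the topic name is an assumption):

from google.cloud import pubsub

STATUS_TOPIC = 'changeset-status'  # assumed topic name


def get_client():
    # legacy google-cloud-pubsub (pre-1.0) client, whose Topic.publish()
    # signature matches the call above
    return pubsub.Client()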
Code Example #4
def changeset_list(org_uid):
    """
    Renders a page which shows all changesets and their status (ingestion and publish). Handles one org or all.

    Args:
        org_uid(str): org identifier

    Returns:
        (str, int): changeset listing page
    """
    cursor = Cursor(urlsafe=request.args.get('cursor'))
    failed = request.args.get('failed') == '1'

    query = OrgChangeset.query()

    if org_uid:
        query = query.filter(OrgChangeset.org_uid == org_uid)

    if failed:
        query = query.filter(
            ndb.OR(OrgChangeset.publish_job_failed == True,
                   OrgChangeset.publish_changeset_failed == True))

    # a datastore OR query can only be ordered by key, not by an arbitrary property
    if failed:
        query = query.order(-OrgChangeset.key)
    else:
        query = query.order(-OrgChangeset.ingestion_completed_at)

    changesets, next_cursor, more = query.fetch_page(20, start_cursor=cursor)

    return render_template('changeset_list.html',
                           org_uid=org_uid,
                           changesets=changesets,
                           next_cursor=next_cursor,
                           more=more,
                           url_root=request.url_root,
                           failed=request.args.get('failed', '0')), 200
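
For reference, the template is expected to turn next_cursor back into a link; a hypothetical follow-up URL (the 'changesets' path is an assumption, and next_cursor is only meaningful when more is True):

if more:
    next_url = '{}changesets?cursor={}&failed={}'.format(
        request.url_root, next_cursor.urlsafe(), '1' if failed else '0')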
Code Example #5
    def test_complete_first_changeset(self):
        """
        Verifies that Org and OrgChangeset get updated to indicate that a changeset is complete.
        """
        started_at = datetime.now()

        Org(id='test', changeset=0, changeset_started_at=started_at).put()
        sync_utils.complete_changeset('test')

        # Org flags/timestamps are updated
        org = Org.get_by_id('test')
        self.assertEqual(org.changeset_completed_at, datetime(2010, 1, 1))
        self.assertEqual(org.last_update_cycle_completed_at,
                         datetime(2010, 1, 1))
        self.assertFalse(org.update_cycle_active)

        # OrgChangeset record is added
        org_changeset = OrgChangeset.query().get()
        self.assertEqual(org_changeset.org_uid, 'test')
        self.assertEqual(org_changeset.changeset, 0)
        self.assertEqual(org_changeset.ingestion_started_at, started_at)
        self.assertEqual(org_changeset.ingestion_completed_at,
                         datetime(2010, 1, 1))
        self.assertFalse(org_changeset.publish_job_running)
        self.assertFalse(org_changeset.publish_job_finished)
        self.assertEqual(org_changeset.publish_job_count, 0)

        # Publish task is queued for the first changeset
        self.assertEqual(len(self.taskqueue.get_filtered_tasks()), 1)
        self.assertEqual(
            self.taskqueue.get_filtered_tasks()[0].payload,
            json.dumps({
                "job_params": {
                    "org_changeset_ids": [org_changeset.key.id()]
                }
            }))
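
The assertions against datetime(2010, 1, 1) imply the clock is frozen, and self.taskqueue is a GAE testbed stub. A plausible setUp/tearDown for the test class (freezegun, the class name, and the stub wiring are assumptions):

import unittest

from freezegun import freeze_time
from google.appengine.ext import testbed


class SyncUtilsTest(unittest.TestCase):  # hypothetical test class name

    def setUp(self):
        self.testbed = testbed.Testbed()
        self.testbed.activate()
        self.testbed.init_datastore_v3_stub()
        self.testbed.init_memcache_stub()  # ndb caches through memcache
        self.testbed.init_taskqueue_stub(root_path='.')
        self.taskqueue = self.testbed.get_stub(
            testbed.TASKQUEUE_SERVICE_NAME)
        self.freezer = freeze_time('2010-01-01')  # pins utcnow() to 2010-01-01
        self.freezer.start()

    def tearDown(self):
        self.freezer.stop()
        self.testbed.deactivate()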
Code Example #6
def start_publish():
    """
    Kicks off a dataflow template to publish normalised data. The jobs are created via a task queue task, passing the
    IDs of the OrgChangesets which need to be published.

    This endpoint is invoked by a regular cron job or by a request from the admin UI, and takes an additional parameter
    which allows each org to be published by a separate dataflow job (useful for isolating an org which causes the
    whole publish job to fail).

    Returns:
        (str, int): http response
    """
    logging.info("about to kick off a publish dataflow job")

    per_org = request.form.get('per_org') == '1'
    if per_org:
        logging.info("publish job per org requested")

    # we want to publish changesets which:
    # - have newly been ingested (publish not running and not finished)
    # - OR have been attempted to be published but failed
    #   - due to the whole job failing
    #   - OR publish of the individual changeset failing
    org_changesets_query = OrgChangeset.query(
        ndb.OR(
            ndb.AND(OrgChangeset.publish_job_running == False,
                    OrgChangeset.publish_job_finished == False),
            ndb.AND(
                OrgChangeset.publish_job_running == False,
                OrgChangeset.publish_job_finished == True,
                ndb.OR(OrgChangeset.publish_job_failed == True,
                       OrgChangeset.publish_changeset_failed == True)))).order(
                           OrgChangeset.key)

    org_changesets = list(emit_items(org_changesets_query))

    # Query any currently running org changesets
    running_org_changesets_query = OrgChangeset.query(
        OrgChangeset.publish_job_running == True)
    running_org_changesets = list(emit_items(running_org_changesets_query))

    running_orgs = list(
        set([
            running_org_changeset.org_uid
            for running_org_changeset in running_org_changesets
        ]))

    # filter out org changesets whose org already has a publish job running
    # (materialised as a list so len() works on Python 3 as well)
    gated_org_changesets = [
        oc for oc in org_changesets if oc.org_uid not in running_orgs
    ]

    if len(gated_org_changesets) != len(org_changesets):
        filtered_oc_tuples = [(oc.org_uid, oc.changeset)
                              for oc in org_changesets
                              if oc.org_uid in running_orgs]

        logging.info(
            "stopped these changesets from being published as job already running for the org: {}"
            .format(filtered_oc_tuples))

    if not gated_org_changesets:
        logging.info("nothing to publish")
        return '', 204

    # remove changesets for blacklisted orgs
    blacklisted_orgs = {}
    org_changesets_to_publish = []
    for org_changeset in gated_org_changesets:
        # dict.get's default argument is evaluated eagerly, so check the cache
        # first to avoid a redundant datastore get for known blacklisted orgs
        org = blacklisted_orgs.get(org_changeset.org_uid) or Org.get_by_id(
            org_changeset.org_uid)
        if org and org.publish_disabled:
            blacklisted_orgs[org.key.string_id()] = org
        else:
            org_changesets_to_publish.append(org_changeset)

    to_publish = []

    if per_org:
        org_changesets_sorted = sorted(org_changesets_to_publish,
                                       key=attrgetter('org_uid'))
        for org_uid, changesets in groupby(org_changesets_sorted,
                                           key=attrgetter('org_uid')):
            to_publish.append({
                'org_uid': org_uid,
                'org_changeset_ids':
                    [changeset.key.id() for changeset in changesets]
            })
    else:
        to_publish.append({
            'org_changeset_ids':
                [changeset.key.id() for changeset in org_changesets_to_publish]
        })

    logging.info("have {} publish tasks to create".format(len(to_publish)))

    items_to_tasks(items=to_publish,
                   queue=Queue('create-publish-job'),
                   task_generator=lambda item: Task(
                       url='/orchestrator/create_publish_job_task',
                       payload=dumps({'job_params': item})))

    return '', 204
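
emit_items and items_to_tasks are helpers defined elsewhere; minimal sketches consistent with how they are called here (the real implementations may differ, e.g. in page size or batching):

def emit_items(query, page_size=500):
    """Yields every entity matched by an ndb query, one page at a time."""
    cursor = None
    more = True
    while more:
        items, cursor, more = query.fetch_page(page_size, start_cursor=cursor)
        for item in items:
            yield item


def items_to_tasks(items, queue, task_generator):
    """Enqueues one task per item (Queue.add() accepts at most 100 per call)."""
    tasks = [task_generator(item) for item in items]
    for i in range(0, len(tasks), 100):
        queue.add(tasks[i:i + 100])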
Code Example #7
def status(org_uid):
    """
    Retrieve org status.

    Args:
        org_uid(str): org identifier

    Returns:
        (str, int): http response
    """
    def date_str(date):
        """
        Formats a date into a string (handles None values also).

        Args:
            date(date|datetime): date to be formatted

        Returns:
            str: formatted date
        """
        if date is None:
            return None

        return date.isoformat() + 'Z'

    org = Org.get_by_id(org_uid)

    if not org:
        logging.info("org {} not found".format(org_uid))
        return '', 404

    changeset = OrgChangeset.query(
        OrgChangeset.org_uid == org_uid,
        OrgChangeset.publish_job_finished == True,
        OrgChangeset.publish_job_failed == False).order(
            -OrgChangeset.publish_finished_at).fetch(1)

    # first publish happens only when all the data is ingested, so if the first publish happened the org is synced
    synced = bool(changeset)

    # synced_at is the ingestion completion time of the last changeset that got published
    synced_at = changeset[0].ingestion_completed_at if changeset else None

    status_payload = {
        'synced': synced,
        'synced_at': date_str(synced_at),
        'connected': org.status == CONNECTED,
        'updating': (org.changeset_started_at is not None
                     and org.changeset_completed_at is None),
        'source': org.provider,
        'id': org_uid
    }

    logging.info("org status: {}".format(status_payload))

    return jsonify(status_payload), 200
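
An illustrative response body for a synced org (all values are made up):

# {
#     "synced": true,
#     "synced_at": "2010-01-01T00:00:00Z",
#     "connected": true,
#     "updating": false,
#     "source": "uploader",
#     "id": "acme"
# }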
Code Example #8
def get_changeset_status_payload(org_uid, changeset):
    """
    Creates response body for changeset status API.

    Args:
        org_uid(str): org identifier
        changeset(int): update cycle identifier

    Returns:
        dict: changeset status response payload
    """
    changeset_id = "{}_{}".format(org_uid, changeset)
    status = "unknown"
    synced_at = None

    org = Org.get_by_id(org_uid)

    if not org:
        payload = {
            "meta": {
                "version": "2.0.0",
            },
            "errors": [{
                "id": "{}_not_found".format(org_uid),
                "status": "404",
                "code": "not_found",
                "title": "Data Source not found",
                "detail": "Data Source {} could not be found.".format(org_uid)
            }]
        }

        logging.info("org {}:{} not found - response {}".format(
            org_uid, changeset, payload))
        return payload

    if changeset > get_last_changeset(org):
        payload = {
            "meta": {
                "version": "2.0.0",
                "data_source_id": org_uid
            },
            "errors": [{
                "id": "{}_{}_not_found".format(org_uid, changeset),
                "status": "404",
                "code": "not_found",
                "title": "Changeset not found",
                "detail": "Changeset {} could not be found for {}.".format(
                    changeset, org_uid)
            }]
        }

        logging.info("changeset {}:{} not found - response {}".format(
            org_uid, changeset, payload))
        return payload

    org_changeset = OrgChangeset.query(
        OrgChangeset.org_uid == org_uid,
        OrgChangeset.changeset == changeset).get()

    # an existing org_changeset means ingestion is done
    if org_changeset:
        # a finished, successful publish means the changeset is synced
        finished = org_changeset.publish_job_finished and not org_changeset.publish_job_running
        successful = not org_changeset.publish_job_failed and not org_changeset.publish_changeset_failed

        if finished and successful:
            status = CHANGESET_STATUS_SYNCED
            synced_at = org_changeset.publish_finished_at.replace(
                microsecond=0).isoformat()
        elif not finished:
            status = CHANGESET_STATUS_SYNCING
        else:
            status = CHANGESET_STATUS_ERROR

    # ingestion is still in progress
    else:
        if org.status == CONNECTED:
            status = CHANGESET_STATUS_SYNCING
        elif org.status == DISCONNECTED:
            status = CHANGESET_STATUS_ERROR

    # just in case we have a gap in the above logic (it could also indicate an inconsistent org state)
    if status == "unknown":
        logging.error("could not determine changeset status for {}:{}".format(
            org_uid, changeset))

    payload = {
        "meta": {
            "version": "2.0.0",
            "data_source_id": org_uid
        },
        "data": [{
            "type": "changeset_status",
            "id": changeset_id,
            "relationships": {
                "sync_status": {
                    "data": {
                        "type": "changeset_sync_status",
                        "id": changeset_id
                    }
                }
            }
        }],
        "included": [{
            "type": "changeset_sync_status",
            "id": changeset_id,
            "attributes": {
                "status": status,
                "synced_at": synced_at
            }
        }]
    }

    logging.info("changeset status for {}: {}".format(changeset_id, payload))

    return payload
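
A hypothetical Flask route that would serve this payload (the URL shape and the app object are assumptions):

@app.route('/api/<org_uid>/changesets/<int:changeset>/status')
def changeset_status(org_uid, changeset):
    payload = get_changeset_status_payload(org_uid, changeset)
    # the builder returns an "errors" member for unknown orgs/changesets
    return jsonify(payload), 404 if 'errors' in payload else 200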