Beispiel #1
0
def get_assignment(request, crowd_name):
    """Render the task page for an assignment request on the given crowd.

    The crowd's interface extracts the assignment context from the request.
    The task named in that context is loaded, the requesting worker (if any)
    is loaded or created, and the two are linked once the task is accepted.
    The page is rendered from the template named after the task's type.

    Returns the crowd's 'unavailable.html' page when the context is missing
    a required key. Raises ValueError for an unknown task id, or for an
    accepted task that has no worker.
    """
    interface, model_spec = CrowdRegistry.get_registry_entry(crowd_name)

    # Extract and validate the assignment context.
    context = interface.get_assignment_context(request)
    missing_keys_error = ValueError(
        'Task id unavailable in assignment request context.')
    try:
        interface.require_context(
            context, ['task_id', 'is_accepted'], missing_keys_error)
    except ValueError:
        # The task is no longer available (due to a race condition), so
        # show the 'No available tasks' page instead.
        unavailable_page = get_scoped_template(crowd_name, 'unavailable.html')
        return HttpResponse(
            unavailable_page.render(RequestContext(request, {})))

    # Load the task named in the context.
    task_model = model_spec.task_model
    try:
        task = task_model.objects.get(task_id=context['task_id'])
    except task_model.DoesNotExist:
        raise ValueError('Invalid task id: ' + context['task_id'])

    task_payload = json.loads(task.data)
    group_context = json.loads(task.group.group_context)

    # Load the worker, creating a record the first time we see them.
    worker = None
    worker_id = context.get('worker_id')
    if worker_id:
        worker_model = model_spec.worker_model
        try:
            worker = worker_model.objects.get(worker_id=worker_id)
        except worker_model.DoesNotExist:
            worker = worker_model(worker_id=context['worker_id'])

            # Run the pre-save hook, then persist the new worker.
            interface.worker_pre_save(worker)
            worker.save()

    # Once the task is accepted, link it to the worker (at most once).
    if context.get('is_accepted', False):
        if not worker:
            raise ValueError("Accepted tasks must have an associated worker.")
        already_linked = worker.tasks.filter(task_id=task.task_id).exists()
        if not already_linked:
            worker.tasks.add(task)

    # Expose the task data and submit URLs to the template.
    crowd_config = json.loads(task.group.crowd_config)
    context.update(
        group_context=group_context,
        content=task_payload,
        backend_submit_url=interface.get_backend_submit_url(),
        frontend_submit_url=interface.get_frontend_submit_url(crowd_config))

    # Pick the template matching the task type and render it.
    template_name = task.task_type + '.html'
    page = get_scoped_template(crowd_name, template_name, context=context)
    return HttpResponse(page.render(RequestContext(request, context)))
Beispiel #2
0
def purge_tasks(request, crowd_name):
    """Delete every task stored for the given crowd.

    The crowd interface's delete hook receives the full task queryset
    before the rows are removed from the local database. Responds 'ok'.
    """
    registry_entry = CrowdRegistry.get_registry_entry(crowd_name)
    interface, model_spec = registry_entry
    all_tasks = model_spec.task_model.objects.all()

    # Hook first, then the local delete.
    interface.delete_tasks(all_tasks)
    all_tasks.delete()

    return HttpResponse('ok')
Beispiel #3
0
def purge_tasks(request, crowd_name):
    """Remove all of a crowd's tasks and answer with 'ok'.

    The interface's ``delete_tasks`` hook is called with the queryset of
    all tasks, after which the queryset itself is deleted.
    """
    crowd_interface, crowd_models = CrowdRegistry.get_registry_entry(
        crowd_name)
    task_queryset = crowd_models.task_model.objects.all()

    # Let the interface react to the deletion before the rows disappear.
    crowd_interface.delete_tasks(task_queryset)
    task_queryset.delete()

    return HttpResponse('ok')
Beispiel #4
0
def main():
    """Finish every retainer pool whose status is below 6.

    The matching pools are printed before and after they are finished so
    the effect of the run can be checked by eye.
    """
    # The second element of the registry entry is the crowd's model spec.
    model_spec = CrowdRegistry.get_registry_entry('amt')[1]

    pools = RetainerPool.objects.filter(status__lt=6)
    # Single-argument print(...) emits the same text on Python 2 and 3,
    # unlike the print statement, which is a syntax error on Python 3.
    print("Before killing: %s" % (pools,))
    for pool in pools:
        _finish_pool(pool, model_spec)

    # Query again so the output reflects the statuses after finishing.
    pools = RetainerPool.objects.filter(status__lt=6)
    print("After killing: %s" % (pools,))
Beispiel #5
0
def finish_pool(request, crowd_name):
    """Finish the retainer pool named by the POSTed 'pool_id'.

    Responds with a JSON error body when no pool has that external id,
    and with a JSON ok status once the pool has been finished.
    """
    requested_id = request.POST.get('pool_id')
    _, model_spec = CrowdRegistry.get_registry_entry(crowd_name)

    try:
        pool = model_spec.retainer_pool_model.objects.get(
            external_id=requested_id)
    except model_spec.retainer_pool_model.DoesNotExist:
        error_body = json.dumps({'error': 'Invalid pool id'})
        return HttpResponse(error_body)

    _finish_pool(pool, model_spec)
    logger.info("Retainer pool %s finished" % pool)

    ok_body = json.dumps({'status': 'ok'})
    return HttpResponse(ok_body)
Beispiel #6
0
def understands_retainer(request, crowd_name, worker_id):
    """Record that a worker understands the retainer model.

    Sets ``understands_retainer`` on the worker with the given id and
    saves it. Responds with a JSON error body when the worker is unknown,
    and with a JSON ok status otherwise.
    """
    _, model_spec = CrowdRegistry.get_registry_entry(crowd_name)
    worker_model = model_spec.worker_model

    try:
        worker = worker_model.objects.get(worker_id=worker_id)
    except worker_model.DoesNotExist:
        error_body = json.dumps({'error': 'Invalid worker id'})
        return HttpResponse(error_body)

    worker.understands_retainer = True
    worker.save()
    logger.info('%s understands the retainer model.' % worker)

    ok_body = json.dumps({'status': 'ok'})
    return HttpResponse(ok_body)
Beispiel #7
0
def post_response(request, crowd_name):
    """Store a worker's answers on their assignment and close out the task.

    The response context is read from the request by the crowd interface
    and must contain 'assignment_id', 'task_id', 'worker_id' and 'answers'
    (otherwise the ValueError handed to ``require_context`` applies).

    Responds 'Duplicate!' when the assignment is already finished, and
    'ok' otherwise. When the task reaches its required number of finished
    assignments it is marked complete and ``gather_answer`` is queued.
    """
    # Resolve the crowd's interface and models.
    interface, model_spec = CrowdRegistry.get_registry_entry(crowd_name)

    # Read and validate the response context.
    context = interface.get_response_context(request)
    required_keys = ['assignment_id', 'task_id', 'worker_id', 'answers']
    interface.require_context(
        context, required_keys,
        ValueError("Response context missing required keys."))

    # An assignment that already has a finish time is a duplicate post.
    assignment_id = context['assignment_id']
    already_finished = model_spec.assignment_model.objects.filter(
        assignment_id=assignment_id, finished_at__isnull=False).exists()
    if already_finished:
        return HttpResponse('Duplicate!')

    # Load the task and the assignment being answered.
    task = model_spec.task_model.objects.get(task_id=context['task_id'])
    assignment = model_spec.assignment_model.objects.get(
        assignment_id=assignment_id)

    # Persist the answers on the assignment.
    assignment.content = context['answers']
    assignment.finished_at = timezone.now()
    interface.response_pre_save(assignment)
    assignment.save()

    # Nothing more to do if the task was completed earlier; extra
    # responses beyond the required number are ignored.
    if task.is_complete:
        return HttpResponse('ok')

    finished_count = task.assignments.filter(
        finished_at__isnull=False).count()
    if finished_count < task.num_assignments:
        return HttpResponse('ok')

    # Enough answers are in: complete the task and queue aggregation.
    task.is_complete = True
    task.pre_celery = timezone.now()
    task.save()
    gather_answer.delay(task.task_id, model_spec)

    # Terminate this task's still-unfinished assignments, leaving out
    # assignments whose task type is 'retainer'.
    unfinished = model_spec.assignment_model.objects.exclude(
        task__task_type='retainer').filter(
            task=task, finished_at__isnull=True)
    unfinished.update(finished_at=timezone.now(), terminated=True)

    return HttpResponse('ok')  # AJAX call succeeded.
Beispiel #8
0
def index(request):
    """Render the internal crowd's index page of task types.

    A worker id is read from the session, or generated and stored there
    for a first-time user. Every known task type gets a context built by
    ``build_context``: types with tasks the worker is eligible for are
    built from their per-type count row, the rest from the short name.
    """
    interface, _ = CrowdRegistry.get_registry_entry('internal')

    # Get worker id from session, or create one if this is a first-time user.
    worker_id = request.session.get('worker_id')
    if not worker_id:
        worker_id = str(uuid.uuid4())
        request.session['worker_id'] = worker_id

    # Map from task type to the number of tasks of that type in the system.
    total_tasks_by_type = dict(
        CrowdTask.objects.values_list('task_type').annotate(
            num_tasks=Count('task_id')))

    task_type_map = {
        'sa': 'Sentiment Analysis',
        'er': 'Entity Resolution',
        'ft': 'Filtering',
    }

    # Task types the worker is eligible for, with a task count for each.
    eligible_task_ids = list(
        interface.get_eligible_tasks(worker_id).values_list('task_id',
                                                            flat=True))
    incomplete_tasks_by_type = (CrowdTask.objects.filter(
        task_id__in=eligible_task_ids).values('task_type').annotate(
            num_tasks=Count('task_id')))

    task_types = {
        t['task_type']: build_context(task_type_map,
                                      total_tasks_by_type,
                                      worker_id,
                                      task_type_obj=t)
        for t in incomplete_tasks_by_type
    }

    # Fill in the known task types that have no eligible tasks. Only the
    # short name is needed, so iterate the keys directly; this also avoids
    # the Python 2-only iteritems().
    for t_shortname in task_type_map:
        if t_shortname not in task_types:
            task_types[t_shortname] = build_context(task_type_map,
                                                    total_tasks_by_type,
                                                    worker_id,
                                                    task_type=t_shortname)

    # Render index template
    return render(request, 'internal/index.html', {
        'task_types': task_types,
        'task_map': task_type_map
    })
Beispiel #9
0
def get_assignment(request, crowd_name):
    """Serve a task assignment to a non-retainer worker.

    The assignment context is read from the request by the crowd
    interface. When it lacks 'task_id' or 'is_accepted' the crowd's
    'unavailable.html' page is returned; otherwise the work is handed to
    ``_get_assignment``.
    """
    interface, model_spec = CrowdRegistry.get_registry_entry(crowd_name)
    logger.info('Non-retainer worker requested task assignment.')

    # Read and validate the assignment context.
    context = interface.get_assignment_context(request)
    required_keys = ['task_id', 'is_accepted']
    missing_error = ValueError(
        'Task id unavailable in assignment request context.')
    try:
        interface.require_context(context, required_keys, missing_error)
    except ValueError:
        # The task is no longer available (due to a race condition), so
        # answer with the 'No available tasks' page.
        fallback = get_scoped_template(crowd_name, 'unavailable.html')
        empty_context = RequestContext(request, {})
        return HttpResponse(fallback.render(empty_context))

    return _get_assignment(request, crowd_name, interface, model_spec, context)
Beispiel #10
0
def get_retainer_assignment(request, crowd_name, worker_id, task_id):
    """Serve a task assignment to a retainer worker.

    An existing assignment for this worker and task keeps its id (e.g.
    when the user refreshed the browser); otherwise a fresh random id is
    generated. The resulting context, always marked as accepted, is
    handed to ``_get_assignment``.
    """
    interface, model_spec = CrowdRegistry.get_registry_entry(crowd_name)
    logger.info('Retainer worker fetched task assignment.')

    # Reuse the id of an assignment that already exists for this pair.
    assignment_model = model_spec.assignment_model
    try:
        existing = assignment_model.objects.get(
            task_id=task_id, worker_id=worker_id)
    except assignment_model.DoesNotExist:
        assignment_id = str(uuid.uuid4())
    else:
        assignment_id = existing.assignment_id

    context = dict(
        task_id=task_id,
        worker_id=worker_id,
        is_accepted=True,
        assignment_id=assignment_id)

    return _get_assignment(request, crowd_name, interface, model_spec, context)
Beispiel #11
0
def post_response(request, crowd_name):
    """Save a worker's response to a task and complete the task when due.

    The response context is read from the request by the crowd interface
    and must contain 'assignment_id', 'task_id', 'worker_id' and 'answers'
    (otherwise the ValueError handed to ``require_context`` applies).

    Responds 'Duplicate!' when a response already exists for the
    assignment, and 'ok' otherwise. When the task has collected its
    required number of responses it is marked complete and
    ``gather_answer`` is queued.
    """
    # Resolve the crowd's interface and models.
    interface, model_spec = CrowdRegistry.get_registry_entry(crowd_name)

    # Read and validate the response context.
    context = interface.get_response_context(request)
    interface.require_context(
        context,
        ['assignment_id', 'task_id', 'worker_id', 'answers'],
        ValueError("Response context missing required keys."))

    # A stored response for this assignment means a duplicate post.
    assignment_id = context['assignment_id']
    is_duplicate = model_spec.response_model.objects.filter(
        assignment_id=assignment_id).exists()
    if is_duplicate:
        return HttpResponse('Duplicate!')

    # Load the task and the worker named in the context.
    task = model_spec.task_model.objects.get(task_id=context['task_id'])
    worker = model_spec.worker_model.objects.get(
        worker_id=context['worker_id'])

    # Build the response, run the pre-save hook, then persist it.
    response = model_spec.response_model(
        task=task,
        worker=worker,
        content=context['answers'],
        assignment_id=assignment_id)
    interface.response_pre_save(response)
    response.save()

    # Complete the task once enough responses are in; responses arriving
    # after completion are stored but trigger nothing further.
    if not task.is_complete:
        if task.responses.count() >= task.num_assignments:
            task.is_complete = True
            task.save()
            gather_answer.delay(task.task_id, model_spec)

    return HttpResponse('ok')  # AJAX call succeeded.
Beispiel #12
0
def index(request):
    """Render the internal crowd's index page of task types.

    A worker id is read from the session, or generated and stored there
    for a first-time user. Every known task type gets a context built by
    ``build_context``: types with tasks the worker is eligible for are
    built from their per-type count row, the rest from the short name.
    """
    interface, _ = CrowdRegistry.get_registry_entry('internal')

    # Get worker id from session, or create one if this is a first-time user.
    worker_id = request.session.get('worker_id')
    if not worker_id:
        worker_id = str(uuid.uuid4())
        request.session['worker_id'] = worker_id

    # Map from task type to the number of tasks of that type in the system.
    total_tasks_by_type = dict(
        CrowdTask.objects.values_list('task_type')
        .annotate(num_tasks=Count('task_id')))

    task_type_map = {
        'sa': 'Sentiment Analysis',
        'er': 'Entity Resolution',
        'ft': 'Filtering',
    }

    # Task types the worker is eligible for, with a task count for each.
    eligible_task_ids = list(interface.get_eligible_tasks(worker_id)
                             .values_list('task_id', flat=True))
    incomplete_tasks_by_type = (CrowdTask.objects
                                .filter(task_id__in=eligible_task_ids)
                                .values('task_type')
                                .annotate(num_tasks=Count('task_id')))

    task_types = {
        t['task_type']: build_context(task_type_map, total_tasks_by_type,
                                      worker_id, task_type_obj=t)
        for t in incomplete_tasks_by_type
    }

    # Fill in the known task types that have no eligible tasks. Only the
    # short name is needed, so iterate the keys directly; this also avoids
    # the Python 2-only iteritems().
    for t_shortname in task_type_map:
        if t_shortname not in task_types:
            task_types[t_shortname] = build_context(
                task_type_map, total_tasks_by_type, worker_id,
                task_type=t_shortname)

    # Render index template
    return render(request, 'internal/index.html', {'task_types': task_types,
                                                   'task_map': task_type_map})
Beispiel #13
0
def post_response(request, crowd_name):
    """Persist a posted response and finish its task if it is now complete.

    The crowd interface extracts the response context from the request;
    'assignment_id', 'task_id', 'worker_id' and 'answers' are required
    (otherwise the ValueError handed to ``require_context`` applies).

    Responds 'Duplicate!' if the assignment already has a response, else
    'ok'. Reaching the task's required response count marks the task
    complete and queues ``gather_answer``.
    """
    # Look up the interface and models for this crowd.
    interface, model_spec = CrowdRegistry.get_registry_entry(crowd_name)
    responses = model_spec.response_model.objects

    # Pull the context out of the request and validate it.
    context = interface.get_response_context(request)
    required = ['assignment_id', 'task_id', 'worker_id', 'answers']
    missing = ValueError("Response context missing required keys.")
    interface.require_context(context, required, missing)

    # Refuse a second response for the same assignment.
    if responses.filter(assignment_id=context['assignment_id']).exists():
        return HttpResponse('Duplicate!')

    # Fetch the task and worker the response belongs to.
    current_task = model_spec.task_model.objects.get(
        task_id=context['task_id'])
    current_worker = model_spec.worker_model.objects.get(
        worker_id=context['worker_id'])

    # Create the response record, run the hook, and save it.
    new_response = model_spec.response_model(
        task=current_task, worker=current_worker,
        content=context['answers'],
        assignment_id=context['assignment_id'])
    interface.response_pre_save(new_response)
    new_response.save()

    # A task that is already complete ignores surplus responses.
    if current_task.is_complete:
        return HttpResponse('ok')

    # Finish the task once the required number of responses is reached.
    if current_task.responses.count() >= current_task.num_assignments:
        current_task.is_complete = True
        current_task.save()
        gather_answer.delay(current_task.task_id, model_spec)

    return HttpResponse('ok')  # AJAX call succeeded.
Beispiel #14
0
def post_retainer_tasks():
    """Recruit workers for retainer pools and retire stale recruitment tasks.

    For every registered crowd that supports retainer pools, each pool in
    a recruiting, idle, active or refilling state is examined:

    * a recruiting/refilling pool that no longer needs active workers
      becomes idle or active, and its waiting task groups are set running;
    * a pool short of active (or reserve) workers gets a batch of dummy
      recruitment tasks posted to the crowd platform, unless it was
      recruited for within the last RETAINER_TASK_EXPIRATION_SECONDS.

    Afterwards every active RetainerTask is reviewed, and its platform
    task is deleted or expired when the pool is no longer recruiting or
    the retainer task is older than RETAINER_TASK_EXPIRATION_SECONDS.
    """

    # Process each installed crowd.
    registry = CrowdRegistry.get_registry()
    for crowd_name, (crowd_interface, crowd_model_spec) in registry.items():

        # Skip crowds that don't support retainer pools.
        if not crowd_model_spec.retainer_pool_model:
            logger.info("Crowd %s doesn't support retainer pools, not posting "
                        "tasks." % crowd_name)
            continue

        # Find pools that need more workers.
        logger.info("Crowd %s supports retainer pools, looking for pools that "
                    "need more workers." % crowd_name)
        valid_states = (RetainerPoolStatus.RECRUITING, RetainerPoolStatus.IDLE,
                        RetainerPoolStatus.ACTIVE, RetainerPoolStatus.REFILLING)
        for pool in crowd_model_spec.retainer_pool_model.objects.filter(
                status__in=valid_states):

            # The pool's oldest task group supplies its configuration. A
            # pool without any group cannot be configured, so skip it
            # rather than letting an IndexError abort the whole run.
            try:
                group = pool.task_groups.order_by('created_at')[0]
            except IndexError:
                logger.warning("%s has no task groups, skipping." % pool)
                continue

            exp_config = json.loads(group.global_config).get('experimental')
            if exp_config:
                churn_thresh = exp_config.get('churn_threshold')
            else:
                churn_thresh = None

            # Check if the pool needs more workers.
            num_active_workers = pool.active_workers.count()
            need_more_active = num_active_workers < pool.capacity
            logger.info("%s needs workers? %s" % (pool, need_more_active))

            # Reserve workers are only recruited when a churn threshold is
            # configured for the group.
            num_reserve_workers = pool.reserve_workers.count()
            need_more_reserve = (churn_thresh is not None
                                 and num_reserve_workers < settings.CHURN_RESERVE_SIZE)
            logger.info("%s needs more reserve? %s" % (pool, need_more_reserve))

            # if a pool has finished recruiting, start tasks appropriately
            if (pool.status in (RetainerPoolStatus.RECRUITING, RetainerPoolStatus.REFILLING)
                    and not need_more_active):

                logger.info("%s is done recruiting" % pool)
                waiting_task_groups = crowd_model_spec.group_model.objects.filter(
                    retainer_pool=pool,
                    retainer_pool_status__in=(TaskGroupRetainerStatus.WAITING,
                                              TaskGroupRetainerStatus.RUNNING))
                if not waiting_task_groups.exists():
                    logger.info("No waiting task groups, pool is idle")
                    pool.status = RetainerPoolStatus.IDLE
                else:
                    logger.info("Waiting task groups found, starting work.")
                    pool.status = RetainerPoolStatus.ACTIVE
                    for task_group in waiting_task_groups:
                        logger.info("%s now running." % task_group)
                        task_group.retainer_pool_status = (
                            TaskGroupRetainerStatus.RUNNING)
                        task_group.save()
                pool.save()

            # Create new recruitment tasks if necessary
            elif need_more_active or need_more_reserve:
                logger.info("Posting tasks for %s" % pool)

                if (pool.status in (RetainerPoolStatus.ACTIVE, RetainerPoolStatus.IDLE)
                        and need_more_active):
                    pool.status = RetainerPoolStatus.REFILLING

                now = timezone.now()
                if (pool.last_recruited_at > now - timedelta(
                        seconds=settings.RETAINER_TASK_EXPIRATION_SECONDS)):
                    # NOTE(review): a REFILLING status set just above is not
                    # saved on this path -- confirm that is intended.
                    logger.info("Pool was recruited recently... skipping.")
                    continue
                pool.last_recruited_at = now
                pool.save()

                # Create dummy tasks on the crowd platform
                dummy_content = json.dumps({})
                dummy_config = {
                    'num_assignments': 1,
                    'task_type': 'retainer',
                    'task_batch_size': 1,
                    'callback_url': '',
                    crowd_name: json.loads(group.crowd_config),
                }
                for i in range(1, settings.NUM_RETAINER_RECRUITMENT_TASKS + 1):
                    # Copy the config so each task gets its own numbered title.
                    task_config = copy.deepcopy(dummy_config)
                    task_config[crowd_name]['title'] += " [" + str(i) + "]"
                    task_id = crowd_interface.create_task(task_config, dummy_content)

                    # skip interface.task_pre_save because this isn't a real task.
                    # The task is attached to the pool's newest task group.
                    task = crowd_model_spec.task_model.objects.create(
                        task_type=task_config['task_type'],
                        data=dummy_content,
                        create_time=timezone.now(),
                        task_id=task_id,
                        group=pool.task_groups.order_by('-created_at')[0],
                        num_assignments=task_config['num_assignments'],
                    )
                    logger.info("Created Task %s" % task_id)

                    # Create the retainer task to remember it.
                    retainer_task = RetainerTask.objects.create(
                        task=task, crowd_name=crowd_name)
                    logger.info("Created %s" % retainer_task)
            else:
                logger.info("%s has status %s, nothing to do." % (pool, pool.get_status_display()))

    # Delete old retainerTasks to keep the listings fresh
    logger.info('Removing old retainer tasks...')
    for retainer_task in RetainerTask.objects.filter(active=True).select_related(
            'task__group__retainer_pool'):

        # Did we already process this retainer_task?
        session_task = retainer_task.task
        if not session_task:
            continue

        # Make sure we're actually recruiting
        group = session_task.group
        retainer_pool = group.retainer_pool

        # Always kill off recruitment tasks for finished pools
        pool_finished = (retainer_pool.status == RetainerPoolStatus.FINISHED)

        exp_config = json.loads(group.global_config).get('experimental')
        if exp_config:
            churn_thresh = exp_config.get('churn_threshold')
        else:
            churn_thresh = None
        num_reserve_workers = retainer_pool.reserve_workers.count()
        need_more_reserve = (churn_thresh is not None
                             and num_reserve_workers < settings.CHURN_RESERVE_SIZE)

        not_recruiting = (retainer_pool.status not in (RetainerPoolStatus.RECRUITING,
                                                       RetainerPoolStatus.REFILLING)
                          and not need_more_reserve) or pool_finished
        if not_recruiting:
            # reset the last_recruited timestamp in case we start recruiting again.
            retainer_pool.last_recruited_at = timezone.now() - timedelta(
                seconds=settings.RETAINER_TASK_EXPIRATION_SECONDS)
            retainer_pool.save()

        # Kill off tasks that have been around for too long
        old_task_cutoff = (
            timezone.now()
            - timedelta(seconds=settings.RETAINER_TASK_EXPIRATION_SECONDS))
        if not_recruiting or retainer_task.created_at < old_task_cutoff:
            try:
                was_assigned = session_task.assignments.exists()
                interface, _ = CrowdRegistry.get_registry_entry(
                    retainer_task.crowd_name)
                # delete the crowd platform task if no one has accepted it.
                if not was_assigned:
                    interface.delete_tasks([session_task,])
                    logger.info("Deleted platform task %s" % session_task.task_id)
                    session_task.delete()
                    logger.info("Deleted ampcrowd task object %s" % session_task)
                    retainer_task.active = False
                    retainer_task.save()

                # expire the crowd platform task if the pool is finished
                elif pool_finished:
                    try:
                        interface.expire_tasks([session_task,])
                    except Exception:
                        logger.exception("Couldn't expire task %s, ignoring..." % session_task)
                    else:
                        # Only report success when the expire call went through.
                        logger.info("Expired platform task %s" % session_task.task_id)
                    # The retainer task is deactivated either way.
                    retainer_task.active = False
                    retainer_task.save()

                else:
                    logger.info("Not deleting %s, it has a worker." % session_task)

            except Exception as e:
                # Best effort: a failure on one task must not stop the sweep.
                logger.warning('Could not remove task %s: %s' % (session_task, str(e)))
Beispiel #15
0
def create_task_group(request, crowd_name):
    """Create a task group, and its tasks, from the POSTed 'data' payload.

    See README.md for API.

    The payload is validated by the crowd interface, a group row is saved,
    and the content points are split into batches of 'task_batch_size'.
    With a 'retainer_pool' configuration the tasks are only stored locally
    (the `post_retainer_tasks` celery task posts to the platform);
    otherwise each batch is posted through ``interface.create_task``.

    Returns an HttpResponse whose body is JSON: ``{'status': 'ok'}`` on
    success, or ``{'status': 'wrong', 'reason': ...}`` on failure. The
    group saved for a request that later fails is deleted again.
    """

    # get the interface implementation from the crowd name.
    interface, model_spec = CrowdRegistry.get_registry_entry(crowd_name)

    # Response dictionaries
    correct_response = {'status': 'ok'}
    wrong_response = {'status': 'wrong'}

    # Parse information contained in the request body
    json_dict = request.POST.get('data')

    # Validate the format.
    if not interface.validate_create_request(json_dict):
        wrong_response['reason'] = 'Invalid request data.'
        return HttpResponse(json.dumps(wrong_response))

    # Pull out important data fields
    json_dict = json.loads(json_dict)
    configuration = json_dict['configuration']
    group_id = json_dict['group_id']
    group_context = json.dumps(json_dict['group_context'])
    content = json_dict['content']
    # A real list, so the ids can be sliced into batches on Python 2 and 3.
    point_identifiers = list(content)

    # Create a new group for the tasks.
    if model_spec.group_model.objects.filter(group_id=group_id).exists():
        wrong_response['reason'] = 'Group id %s is already in use.' % group_id
        return HttpResponse(json.dumps(wrong_response))
    current_group = model_spec.group_model(
        group_id=group_id,
        tasks_finished=0,
        callback_url=configuration['callback_url'],
        group_context=group_context,
        crowd_config=json.dumps(configuration.get(crowd_name, {})),
        global_config=json.dumps(configuration))

    # Call the group hook function, then save the new group to the database.
    interface.group_pre_save(current_group)
    current_group.save()

    # Build crowd tasks from the group
    if 'retainer_pool' in configuration:  # Retainer pool tasks

        # The specified crowd must support retainer pools
        retainer_pool_model = model_spec.retainer_pool_model
        if not retainer_pool_model:
            # clean up, so the group id can be reused by a corrected request
            current_group.delete()
            wrong_response['reason'] = 'Crowd does not support retainer pools.'
            return HttpResponse(json.dumps(wrong_response))

        # Create or find the retainer pool.
        retainer_config = configuration['retainer_pool']
        create_pool = retainer_config['create_pool']
        pool_id = retainer_config.get('pool_id', '')
        if create_pool:
            retainer_pool, created = retainer_pool_model.objects.get_or_create(
                external_id=pool_id,
                defaults={
                    'capacity': retainer_config['pool_size'],
                    'status': RetainerPoolStatus.RECRUITING,
                })
            if not created:  # pool id already taken
                # clean up
                current_group.delete()
                wrong_response['reason'] = (
                    'Pool id %s already in use' % pool_id)
                return HttpResponse(json.dumps(wrong_response))

        else:
            try:
                retainer_pool = retainer_pool_model.objects.get(
                    external_id=pool_id)

                # TODO: Make sure this pool is compatible with the new task group
            except retainer_pool_model.DoesNotExist:
                # clean up
                current_group.delete()
                wrong_response['reason'] = 'Pool %s does not exist' % pool_id
                return HttpResponse(json.dumps(wrong_response))
        current_group.retainer_pool = retainer_pool

        # Don't call interface.create_task, the `post_retainer_tasks` celery
        # task will do so.
        # Batch and create the tasks.
        batch_size = configuration['task_batch_size']
        for i in range(0, len(point_identifiers), batch_size):
            batch_point_ids = point_identifiers[i:i + batch_size]
            batch_content = {j: content[j] for j in batch_point_ids}
            task_id = str(uuid.uuid4())  # generate a random id for this task
            task = model_spec.task_model(
                task_type=configuration['task_type'],
                data=json.dumps(batch_content),
                create_time=timezone.now(),
                task_id=task_id,
                group=current_group,
                num_assignments=configuration['num_assignments'],
                is_retainer=True,
            )
            interface.task_pre_save(task)
            task.save()

        # start the work right away if the pool is ready
        if retainer_pool.status in (RetainerPoolStatus.IDLE,
                                    RetainerPoolStatus.ACTIVE):
            current_group.retainer_pool_status = TaskGroupRetainerStatus.RUNNING
            retainer_pool.status = RetainerPoolStatus.ACTIVE
            retainer_pool.save()
        else:
            current_group.retainer_pool_status = TaskGroupRetainerStatus.WAITING
        current_group.save()

    else:  # Not retainer, create a task for each batch of points.
        batch_size = configuration['task_batch_size']
        for i in range(0, len(point_identifiers), batch_size):

            # build the batch
            batch_point_ids = point_identifiers[i:i + batch_size]
            current_content = json.dumps(
                {j: content[j] for j in batch_point_ids})

            # Call the create task hook
            current_task_id = interface.create_task(configuration,
                                                    current_content)

            # Build the task object. timezone.now() matches the retainer
            # branch; localizing a naive datetime.now() as UTC would label
            # local wall-clock time as UTC.
            current_task = model_spec.task_model(
                task_type=configuration['task_type'],
                data=current_content,
                create_time=timezone.now(),
                task_id=current_task_id,
                group=current_group,
                num_assignments=configuration['num_assignments'])

            # Call the pre-save hook, then save the task to the database.
            interface.task_pre_save(current_task)
            current_task.save()

    return HttpResponse(json.dumps(correct_response))
Beispiel #16
0
def create_task_group(request, crowd_name):
    """Create a task group, and its tasks, from the POSTed 'data' payload.

    See README.md for API.

    The payload is validated by the crowd interface, a group row is saved,
    and the content points are split into batches of 'task_batch_size';
    each batch is posted through ``interface.create_task`` and stored as a
    task of the new group.

    Returns an HttpResponse whose body is JSON: ``{'status': 'ok'}`` on
    success, or ``{'status': 'wrong'}`` when validation fails.
    """

    # get the interface implementation from the crowd name.
    interface, model_spec = CrowdRegistry.get_registry_entry(crowd_name)

    # Response dictionaries
    correct_response = {'status': 'ok'}
    wrong_response = {'status': 'wrong'}

    # Parse information contained in the request body
    json_dict = request.POST.get('data')

    # Validate the format.
    if not interface.validate_create_request(json_dict):
        return HttpResponse(json.dumps(wrong_response))

    # Pull out important data fields
    json_dict = json.loads(json_dict)
    configuration = json_dict['configuration']
    group_id = json_dict['group_id']
    group_context = json.dumps(json_dict['group_context'])
    content = json_dict['content']
    # A real list, so the ids can be sliced into batches on Python 2 and 3.
    point_identifiers = list(content)

    # Create a new group for the tasks.
    current_group = model_spec.group_model(
        group_id=group_id,
        tasks_finished=0,
        callback_url=configuration['callback_url'],
        group_context=group_context,
        crowd_config=json.dumps(configuration.get(crowd_name, {})))

    # Call the group hook function, then save the new group to the database.
    interface.group_pre_save(current_group)
    current_group.save()

    # Create a task for each batch of points.
    batch_size = configuration['task_batch_size']
    for i in range(0, len(point_identifiers), batch_size):

        # build the batch
        batch_point_ids = point_identifiers[i:i + batch_size]
        current_content = json.dumps(
            {j: content[j] for j in batch_point_ids})

        # Call the create task hook
        current_task_id = interface.create_task(configuration, current_content)

        # Build the task object. datetime.now(pytz.utc) is the current
        # time in UTC; localizing a naive datetime.now() as UTC would
        # label local wall-clock time as UTC.
        current_task = model_spec.task_model(
            task_type=configuration['task_type'],
            data=current_content,
            create_time=datetime.now(pytz.utc),
            task_id=current_task_id,
            group=current_group,
            num_assignments=configuration['num_assignments'])

        # Call the pre-save hook, then save the task to the database.
        interface.task_pre_save(current_task)
        current_task.save()

    return HttpResponse(json.dumps(correct_response))
Beispiel #17
0
def ping(request, crowd_name):
    """Record a heartbeat from a retainer worker and report the pool state.

    The task, worker and assignment named in the request context are loaded,
    the posted ``ping_type`` is handled, and ``last_ping`` is refreshed on
    both the assignment and the worker before the response is built.

    Handling per ``ping_type``:
        * ``'starting'``: calls ``assignment.finish_waiting_session()``.
        * ``'waiting'`` (only while the pool status display is not
          ``'finished'``): adds the seconds elapsed since the assignment's
          previous ping to ``assignment.time_waited_session``.
        * ``'working'``: looks up the worker's assignment for the posted
          ``active_task``; if it is terminated the client is told to stop
          that work. If the active task is missing or unknown, the worker
          is never told to terminate.

    Args:
        request: the incoming HTTP request. ``request.POST`` must contain
            ``ping_type``; ``active_task`` is read for ``'working'`` pings.
        crowd_name: name used to look up the crowd interface and model
            specification in ``CrowdRegistry``.

    Returns:
        An ``HttpResponse`` with a JSON object containing ``ping_type``,
        ``wait_time``, ``tasks_completed``, ``pool_status``,
        ``waiting_rate``, ``per_task_rate``, ``min_required_tasks``,
        ``terminate_work`` and ``terminate_worker``.

    Raises:
        Any exception raised while handling the ping is logged and then
        re-raised unchanged.
    """
    try:
        interface, model_spec = CrowdRegistry.get_registry_entry(crowd_name)
        now = timezone.now()

        # Get and validate the context.
        context = interface.get_response_context(request)
        interface.require_context(
            context, ['task_id', 'worker_id', 'assignment_id'],
            ValueError("ping context missing required keys."))
        task = model_spec.task_model.objects.get(task_id=context['task_id'])
        worker = model_spec.worker_model.objects.get(
            worker_id=context['worker_id'])
        assignment = model_spec.assignment_model.objects.get(
            assignment_id=context['assignment_id'])
        pool_status = task.group.retainer_pool.get_status_display()
        terminate_work = False
        # A worker that has been released from the pool should stop, unless
        # a 'working' ping below shows we cannot tell what they are doing.
        terminate_worker = assignment.worker_released_at is not None

        ping_type = request.POST['ping_type']

        if ping_type == 'starting':
            # The task started waiting.
            assignment.finish_waiting_session()

        elif ping_type == 'waiting' and pool_status != 'finished':
            # The task is waiting: accrue the time since the previous ping.
            last_ping = assignment.last_ping
            time_since_last_ping = (now - last_ping).total_seconds()
            assignment.time_waited_session += time_since_last_ping

        elif ping_type == 'working':
            # The task is working: verify that the active assignment hasn't
            # been terminated.
            active_task_id = request.POST.get('active_task', None)
            if not active_task_id:
                logger.warning('Ping from %s, but no active task id.',
                               assignment)
                # Don't kill them if we don't know what they're working on.
                terminate_worker = False
            else:
                try:
                    active_assignment = (
                        model_spec.assignment_model.objects.filter(
                            worker=worker, task_id=active_task_id)[0])
                except IndexError:
                    # No active assignment: don't kill the worker if we
                    # don't know what they're working on.
                    terminate_worker = False
                else:
                    if active_assignment.terminated:
                        terminate_work = True

        assignment.last_ping = now
        assignment.save()
        worker.last_ping = now
        worker.save()
        logger.info('ping from worker %s, task %s', worker, task)

        retainer_config = json.loads(task.group.global_config)['retainer_pool']
        data = {
            'ping_type': ping_type,
            'wait_time': assignment.time_waited,
            'tasks_completed':
            worker.completed_assignments_for_pool_session(task).count(),
            'pool_status': pool_status,
            'waiting_rate': retainer_config['waiting_rate'],
            'per_task_rate': retainer_config['task_rate'],
            'min_required_tasks': retainer_config['min_tasks_per_worker'],
            'terminate_work': terminate_work,
            'terminate_worker': terminate_worker,
        }
        return HttpResponse(json.dumps(data), content_type='application/json')
    except Exception as e:
        logger.exception(e)
        # Bare ``raise`` re-raises the active exception with its original
        # traceback intact.
        raise
Beispiel #18
0
def assign_retainer_task(request, crowd_name):
    """Choose the next real task for a retainer worker.

    The request context identifies the worker and the dummy retainer task
    they are waiting on. While holding an exclusive file lock (to avoid
    duplicate assignments), a task is chosen in this order:

        1. a task the worker already has an unfinished assignment for;
        2. a random incomplete task in the pool that still has fewer
           non-terminated assignments than ``num_assignments``;
        3. without straggler mitigation: a random incomplete task with fewer
           assignments held by active pool workers than ``num_assignments``;
        4. with straggler mitigation: an incomplete task picked according to
           the configured routing policy (``'random'``, ``'oldest'``,
           ``'young-workers'`` or ``'fair'``; unknown values fall back to
           random).

    Args:
        request: the incoming HTTP request, passed to the crowd interface
            to extract ``task_id`` and ``worker_id``.
        crowd_name: name used to look up the crowd interface and model
            specification in ``CrowdRegistry``; it is also interpolated into
            the raw SQL table names below.

    Returns:
        An ``HttpResponse`` with a JSON object. When a task was chosen it
        contains ``start`` (True), ``task_url``, ``task_id`` and
        ``pool_status``. Otherwise it contains ``start`` (False) and
        ``pool_status`` (``'finished'`` if the task or worker could not be
        loaded).

    Raises:
        Any exception raised outside the model-loading step is logged and
        re-raised unchanged.
    """
    # Bound before the ``try`` so the ``finally`` clause can tell whether the
    # lock file was ever opened: the early 'finished' return and any failure
    # while reading the request both happen before the lock is taken.
    lockf = None
    try:
        # get the interface implementation from the crowd name.
        interface, model_spec = CrowdRegistry.get_registry_entry(crowd_name)

        context = interface.get_response_context(request)
        interface.require_context(
            context, ['task_id', 'worker_id'],
            ValueError("retainer assignment context missing required keys."))

        try:
            task = (model_spec.task_model.objects.select_related(
                'group__retainer_pool').get(task_id=context['task_id']))
            group = task.group
            pool = group.retainer_pool
            worker = model_spec.worker_model.objects.get(
                worker_id=context['worker_id'])
            logger.info('Retainer task %s requested work.', task)
        except Exception:  # Issue loading models from IDs, finish this assignment
            return HttpResponse(
                json.dumps({'start': False, 'pool_status': 'finished'}),
                content_type='application/json')

        # A missing or empty 'experimental' section means every experimental
        # feature is off; the routing policy still gets a defined default.
        exp_config = json.loads(group.global_config).get('experimental') or {}
        straggler_mitigation = exp_config.get('mitigate_stragglers', False)
        straggler_routing_policy = exp_config.get(
            'straggler_routing_policy', 'random')
        churn_threshold = exp_config.get('churn_threshold')

        # Acquire an exclusive lock to avoid duplicate assignments
        lockf = open('/tmp/ASSIGNMENT_LOCK', 'wb')
        logger.debug("Locking assignment lock...")
        locks.lock(lockf, locks.LOCK_EX)

        # Don't assign a task if the worker is on reserve or the pool is inactive.
        on_reserve = (task.assignments.filter(worker=worker,
                                              on_reserve=True).exists()
                      if churn_threshold is not None else False)
        pool_inactive = pool.status not in (RetainerPoolStatus.ACTIVE,
                                            RetainerPoolStatus.REFILLING,
                                            RetainerPoolStatus.IDLE)
        no_work_response = HttpResponse(
            json.dumps({
                'start': False,
                'pool_status': pool.get_status_display()
            }),
            content_type='application/json')
        if on_reserve:
            logger.info("Worker on reserve: not assigning work.")
            return no_work_response

        if pool_inactive:
            logger.info(
                "Pool still recruiting or otherwise inactive: not assigning work."
            )
            return no_work_response

        # Look for a task the worker is already assigned to
        assignment_task = None
        existing_assignments = (worker.assignments.filter(
            finished_at__isnull=True).filter(
                task__group__retainer_pool=pool).exclude(
                    task__task_type='retainer'))

        logger.info('Looking for assignments for retainer worker...')
        if existing_assignments.exists():
            assignment_task = existing_assignments[0].task
            logger.info('Found an existing assignment for this worker')
        else:  # Look for open tasks
            incomplete_tasks = (

                # incomplete tasks
                model_spec.task_model.objects.filter(is_complete=False)

                # in this pool's tasks
                .filter(group__retainer_pool=pool)

                # that aren't dummy retainer tasks
                .exclude(task_type='retainer')

                # that the worker hasn't worked on already
                .exclude(assignments__worker=worker))

            # First check if the open tasks haven't been assigned to enough workers.
            # TODO: avoid gross SQL
            non_terminated_assignments = """
SELECT COUNT(*) FROM %(crowdname)s_%(assignment_model)s
WHERE %(crowdname)s_%(assignment_model)s.terminated = False
      AND %(crowdname)s_%(assignment_model)s.task_id = %(crowdname)s_%(task_model)s.task_id
""" % {
                'crowdname': crowd_name,
                'assignment_model':
                model_spec.assignment_model.__name__.lower(),
                'task_model': model_spec.task_model.__name__.lower(),
            }

            open_tasks = incomplete_tasks.extra(
                where=["num_assignments > (%s)" % non_terminated_assignments])
            if open_tasks.exists():
                logger.info('Found an unassigned but open task')
                assignment_task = open_tasks.order_by('?')[0]

            # Then, check if there are in-progress tasks with enough assignments.
            elif incomplete_tasks.exists():
                if not straggler_mitigation:  # only assign tasks that have been abandoned
                    # Bad performance characteristics! consider rewriting.
                    active_workers = set(pool.active_workers.all())
                    abandoned_tasks = [
                        t for t in incomplete_tasks if len([
                            a for a in t.assignments.select_related(
                                'worker').all() if a.worker in active_workers
                        ]) < t.num_assignments
                    ]
                    if abandoned_tasks:
                        logger.info('Found an assigned but abandoned task.')
                        assignment_task = random.choice(abandoned_tasks)
                    else:
                        logger.info('All tasks are assigned.')

                # Straggler mitigation
                else:
                    logger.info(
                        'Assigning to an active task for straggler mitigation with policy %s.',
                        straggler_routing_policy)
                    if straggler_routing_policy == 'random':
                        assignment_task = incomplete_tasks.order_by('?')[0]
                    elif straggler_routing_policy == 'oldest':
                        # Weight each task by the time since its earliest
                        # assignment, so older tasks are more likely picked.
                        now = timezone.now()
                        annotated = incomplete_tasks.annotate(
                            start=Min('assignments__assigned_at'))
                        weights = [(now - t.start).total_seconds()
                                   for t in annotated]
                        weights = np.array(weights) / sum(weights)
                        assignment_task = np.random.choice(list(annotated),
                                                           size=1,
                                                           p=weights)[0]
                    elif straggler_routing_policy == 'young-workers':
                        # Weight each task by the inverse of the time since
                        # the earliest retainer assignment among its workers.
                        # The loop variable must not be called ``task``: on
                        # Python 2 it would overwrite the retainer task that
                        # is stored on the assignment further down.
                        now = timezone.now()
                        weights = [
                            1 / (now - min([
                                a.worker.assignments.filter(
                                    task__task_type='retainer',
                                    task__group__retainer_pool=pool).order_by(
                                        'assigned_at')[0].assigned_at
                                for a in t.assignments.all()
                            ])).total_seconds() for t in incomplete_tasks
                        ]
                        weights = np.array(weights) / sum(weights)
                        assignment_task = np.random.choice(
                            list(incomplete_tasks), size=1, p=weights)[0]
                    elif straggler_routing_policy == 'fair':
                        # assign to the task with the fewest assignments
                        assignment_task = (incomplete_tasks.extra(
                            select={
                                'n_assignments': non_terminated_assignments
                            },
                            order_by=['n_assignments']))[0]
                    else:
                        logger.info(
                            'Unkown straggler routing policy: %s. Using random instead...',
                            straggler_routing_policy)
                        assignment_task = incomplete_tasks.order_by('?')[0]

        # return a url to the assignment
        if assignment_task:
            # create the assignment if necessary
            try:
                logger.info('Looking up assignment...')
                assignment = worker.assignments.get(task=assignment_task,
                                                    worker=worker)
                if not assignment.retainer_session_task:
                    assignment.retainer_session_task = task
                    assignment.save()
            except model_spec.assignment_model.DoesNotExist:
                logger.info('No assignment found: creating new one.')
                assignment_id = str(uuid.uuid4())
                assignment = model_spec.assignment_model.objects.create(
                    assignment_id=assignment_id,
                    worker=worker,
                    task=assignment_task,
                    retainer_session_task=task)

            # Stamp the group's start time the first time work is handed out.
            if not assignment_task.group.work_start_time:
                assignment_task.group.work_start_time = timezone.now()
                assignment_task.group.save()
            url_args = {
                'crowd_name': crowd_name,
                'worker_id': worker.worker_id,
                'task_id': assignment_task.task_id,
            }
            response_data = json.dumps({
                'start': True,
                'task_url':
                reverse('basecrowd:get_retainer_assignment', kwargs=url_args),
                'task_id': assignment_task.task_id,
                'pool_status': pool.get_status_display()
            })
            logger.info('Linking task to assignment.')
            return HttpResponse(response_data, content_type='application/json')
        else:
            logger.info('No tasks found!')
            return no_work_response

    except Exception as e:
        logger.exception(e)
        # Bare ``raise`` keeps the original traceback.
        raise

    finally:
        # Release the assignment lock--either an assignment has been created
        # in the DB, or an error occurred. Skip this when the lock file was
        # never opened, otherwise the cleanup itself would raise and replace
        # the response (or the real error).
        if lockf is not None:
            logger.debug("Unlocking assignment lock...")
            try:
                locks.unlock(lockf)
            finally:
                lockf.close()
Beispiel #19
0
def create_task_group(request, crowd_name):
    """Create a task group and its batched tasks from a POSTed payload.

    See README.md for the API.

    The JSON string in ``request.POST['data']`` is validated by the crowd
    interface, a group is created and saved, and the entries of ``content``
    are split into batches of ``configuration['task_batch_size']`` points.
    One task is created and saved per batch.

    Args:
        request: the incoming HTTP request carrying the ``data`` POST field.
        crowd_name: name used to look up the crowd interface and model
            specification in ``CrowdRegistry``; also the key under which
            crowd-specific settings are read from ``configuration``.

    Returns:
        An ``HttpResponse`` whose body is the JSON ``{"status": "ok"}`` on
        success, or ``{"status": "wrong"}`` when validation fails.
    """
    # get the interface implementation from the crowd name.
    interface, model_spec = CrowdRegistry.get_registry_entry(crowd_name)

    # Response dictionaries
    correct_response = {'status': 'ok'}
    wrong_response = {'status': 'wrong'}

    # Raw JSON string submitted by the client.
    json_dict = request.POST.get('data')

    # Validate the format.
    if not interface.validate_create_request(json_dict):
        return HttpResponse(json.dumps(wrong_response))

    # Pull out important data fields
    json_dict = json.loads(json_dict)
    configuration = json_dict['configuration']
    group_id = json_dict['group_id']
    group_context = json.dumps(json_dict['group_context'])
    content = json_dict['content']
    # Materialize the keys as a list: on Python 3 ``dict.keys()`` is a view
    # that supports neither indexing nor slicing.
    point_identifiers = list(content)

    # Create a new group for the tasks.
    current_group = model_spec.group_model(
        group_id=group_id,
        tasks_finished=0,
        callback_url=configuration['callback_url'],
        group_context=group_context,
        crowd_config=json.dumps(configuration.get(crowd_name, {})))

    # Call the group hook function, then save the new group to the database.
    interface.group_pre_save(current_group)
    current_group.save()

    # Create a task for each batch of points.
    batch_size = configuration['task_batch_size']
    for start in range(0, len(point_identifiers), batch_size):

        # Build the batch; slicing clips the final, possibly shorter, batch.
        batch_ids = point_identifiers[start:start + batch_size]
        current_content = json.dumps(
            {point_id: content[point_id] for point_id in batch_ids})

        # Call the create task hook
        current_task_id = interface.create_task(configuration, current_content)

        # Build the task object. ``datetime.now(pytz.utc)`` yields the actual
        # current UTC time; localizing a naive ``datetime.now()`` would stamp
        # the server's local wall-clock time as if it were UTC.
        current_task = model_spec.task_model(
            task_type=configuration['task_type'],
            data=current_content,
            create_time=datetime.now(pytz.utc),
            task_id=current_task_id,
            group=current_group,
            num_assignments=configuration['num_assignments'])

        # Call the pre-save hook, then save the task to the database.
        interface.task_pre_save(current_task)
        current_task.save()

    return HttpResponse(json.dumps(correct_response))
Beispiel #20
0
def get_assignment(request, crowd_name):
    """Render the task page for the assignment described by the request.

    The crowd interface extracts the assignment context from the request.
    If the required keys are missing, the crowd's ``unavailable.html``
    template is rendered instead. Otherwise the task is loaded, the worker
    (if any) is looked up or created, the worker is linked to the task once
    the assignment is accepted, and the template named after the task type
    is rendered with the enriched context.

    Args:
        request: the incoming HTTP request.
        crowd_name: name used to look up the crowd interface and model
            specification in ``CrowdRegistry`` and to scope templates.

    Returns:
        An ``HttpResponse`` containing the rendered template.

    Raises:
        ValueError: if no task matches the context's ``task_id``, or if the
            assignment is accepted but no worker is associated with it.
    """
    # Resolve the crowd-specific interface and models.
    interface, model_spec = CrowdRegistry.get_registry_entry(crowd_name)

    # Extract the assignment context and make sure it is usable.
    context = interface.get_assignment_context(request)
    try:
        interface.require_context(
            context, ['task_id', 'is_accepted'],
            ValueError('Task id unavailable in assignment request context.'))
    except ValueError:
        # The task is no longer available (due to a race condition):
        # show the 'No available tasks' page.
        unavailable = get_scoped_template(crowd_name, 'unavailable.html')
        return HttpResponse(unavailable.render(RequestContext(request, {})))

    # Load the task named by the context.
    task_model = model_spec.task_model
    try:
        current_task = task_model.objects.get(task_id=context['task_id'])
    except task_model.DoesNotExist:
        raise ValueError('Invalid task id: ' + context['task_id'])

    content = json.loads(current_task.data)
    group_context = json.loads(current_task.group.group_context)

    # Look up the worker, creating and saving a record on first sight.
    current_worker = None
    worker_id = context.get('worker_id')
    if worker_id:
        worker_model = model_spec.worker_model
        try:
            current_worker = worker_model.objects.get(worker_id=worker_id)
        except worker_model.DoesNotExist:
            current_worker = worker_model(worker_id=context['worker_id'])

            # Run the pre-save hook, then persist the new worker.
            interface.worker_pre_save(current_worker)
            current_worker.save()

    # Link the worker to the task, but only once the task has been accepted.
    if context.get('is_accepted', False):
        if not current_worker:
            raise ValueError("Accepted tasks must have an associated worker.")
        already_linked = current_worker.tasks.filter(
            task_id=current_task.task_id).exists()
        if not already_linked:
            current_worker.tasks.add(current_task)

    # Enrich the context with the task data and the submit URLs.
    crowd_config = json.loads(current_task.group.crowd_config)
    backend_url = interface.get_backend_submit_url()
    frontend_url = interface.get_frontend_submit_url(crowd_config)
    context.update(
        group_context=group_context,
        content=content,
        backend_submit_url=backend_url,
        frontend_submit_url=frontend_url)

    # Render the template that matches the task type.
    template_name = current_task.task_type + '.html'
    template = get_scoped_template(crowd_name, template_name, context=context)
    return HttpResponse(template.render(RequestContext(request, context)))