def test_model_task_redis_no_dupes_data_unassign_assigned_data(
        test_project_labeled_and_tfidf, test_queue_labeled,
        test_irr_queue_labeled, test_admin_queue_labeled, test_redis, tmpdir,
        settings):
    """After two model-training rounds and a batch unassign, the normal
    queue in Redis must contain no duplicate data items.

    Flow: add two extra coders, fill the queues, label a full batch,
    run the model task (which refills the queue), repeat, then unassign
    the creator's assigned data and re-check for duplicates.
    """
    project = test_project_labeled_and_tfidf
    # Extra coder profiles so assignments can spread across multiple users.
    person2 = create_profile('test_profilezzz', 'password', '*****@*****.**')
    person3 = create_profile('test_profile2', 'password', '*****@*****.**')
    ProjectPermissions.objects.create(profile=person2, project=project,
                                      permission='CODER')
    ProjectPermissions.objects.create(profile=person3, project=project,
                                      permission='CODER')
    initial_training_set = project.get_current_training_set().set_number
    # Enlarge both queues so a full refill is possible after each round.
    queue = project.queue_set.get(type="normal")
    queue.length = 40
    queue.save()
    irr_queue = project.queue_set.get(type="irr")
    irr_queue.length = 40
    irr_queue.save()
    # Redirect model pickles into the pytest tmpdir so no real files leak.
    model_path_temp = tmpdir.listdir()[0].mkdir('model_pickles')
    settings.MODEL_PICKLE_PATH = str(model_path_temp)
    batch_size = project.batch_size
    fill_queue(queue, 'random', irr_queue, irr_percent=project.percentage_irr,
               batch_size=batch_size)
    labels = project.labels.all()
    # Round 1: label one full batch as the creator, then train.
    assignments = get_assignments(project.creator, project, batch_size)
    for assignment in assignments:
        label_data(random.choice(labels), assignment, project.creator, 3)
    tasks.send_model_task.delay(project.pk).get()
    assert project.get_current_training_set(
    ).set_number == initial_training_set + 1
    # The refilled Redis queue must hold only unique items.
    redis_items = test_redis.lrange(redis_serialize_queue(queue), 0, -1)
    assert len(redis_items) == len(set(redis_items))
    # Round 2: take more assignments but only label a batch's worth,
    # leaving the rest assigned-but-unlabeled.
    assignments = get_assignments(project.creator, project, 40)
    for assignment in assignments[:batch_size]:
        label_data(random.choice(labels), assignment, project.creator, 3)
    tasks.send_model_task.delay(project.pk).get()
    assert project.get_current_training_set(
    ).set_number == initial_training_set + 2
    redis_items = test_redis.lrange(redis_serialize_queue(queue), 0, -1)
    assert len(redis_items) == len(set(redis_items))
    # Unassigning the leftover assigned data must not push duplicates
    # back onto the Redis queue.
    batch_unassign(project.creator)
    redis_items = test_redis.lrange(redis_serialize_queue(queue), 0, -1)
    assert len(redis_items) == len(set(redis_items))
def test_model_task_redis_no_dupes_data_left_in_queue( test_project_labeled_and_tfidf, test_queue_labeled, test_irr_queue_labeled, test_admin_queue_labeled, test_redis, tmpdir, settings): project = test_project_labeled_and_tfidf initial_training_set = project.get_current_training_set().set_number queue = project.queue_set.get(type="normal") queue.length = 40 queue.save() irr_queue = project.queue_set.get(type="irr") irr_queue.length = 40 irr_queue.save() model_path_temp = tmpdir.listdir()[0].mkdir('model_pickles') settings.MODEL_PICKLE_PATH = str(model_path_temp) batch_size = project.batch_size fill_queue(queue, 'random', irr_queue, irr_percent=project.percentage_irr, batch_size=batch_size) labels = project.labels.all() for i in range(int(batch_size * ((100 - project.percentage_irr) / 100))): datum = assign_datum(project.creator, project) label_data(random.choice(labels), datum, project.creator, 3) tasks.send_model_task.delay(project.pk).get() assert project.get_current_training_set( ).set_number == initial_training_set + 1 redis_items = test_redis.lrange(redis_serialize_queue(queue), 0, -1) assert len(redis_items) == len(set(redis_items))
def test_redis_parse_list_dataids(test_queue, test_redis): fill_queue(test_queue, orderby="random") data_ids = [d.pk for d in test_queue.data.all()] redis_ids = test_redis.lrange(redis_serialize_queue(test_queue), 0, -1) parsed_ids = redis_parse_list_dataids(redis_ids) assert data_ids.sort() == parsed_ids.sort()
def test_redis_parse_data(test_queue, test_redis): fill_queue(test_queue, orderby="random") popped_data_key = test_redis.lpop(redis_serialize_queue(test_queue)) parsed_data = redis_parse_data(popped_data_key) assert_obj_exists(Data, {"pk": parsed_data.pk}) assert_obj_exists(DataQueue, {"data_id": parsed_data.pk})
def unassign_datum(datum, profile): """Remove a profile's assignment to a datum. Re-add the datum to its respective queue in Redis. """ assignment = AssignedData.objects.filter(profile=profile, data=datum).get() queue = assignment.queue assignment.delete() settings.REDIS.lpush(redis_serialize_queue(queue), redis_serialize_data(datum))
def pop_queue(queue): """Remove a datum from the given queue (in redis and the database) and return it. Returns None and does nothing if the queue is empty. Client code should prefer pop_first_nonempty_queue() if the intent is to pop the first nonempty queue, as it avoids concurrency issues. """ # Redis first, since this op is guaranteed to be atomic data_id = settings.REDIS.rpop(redis_serialize_queue(queue)) if data_id is None: return None else: data_id = data_id.decode().split(":")[1] data_obj = Data.objects.filter(pk=data_id).get() return data_obj
def test_redis_serialize_queue(test_queue): queue_key = redis_serialize_queue(test_queue) assert queue_key == "queue:" + str(test_queue.pk)
def pop_first_nonempty_queue(project, profile=None, type="normal"):
    '''
    Determine which queues are eligible to be popped (and in what order) and
    pass them into redis to have the first nonempty one popped.

    Return a (queue, data item) tuple if one was found; return a (None, None)
    tuple if not.
    '''
    # Eligible queues: the profile's own queues (if any) ranked ahead of the
    # project-wide queues.
    if profile is not None:
        # Use priority to ensure we set profile queues above project queues
        # in the resulting list; break ties by pk
        profile_queues = project.queue_set.filter(profile=profile, type=type)
    else:
        profile_queues = Queue.objects.none()
    profile_queues = profile_queues.annotate(priority=Value(1, IntegerField()))
    project_queues = (project.queue_set.filter(
        profile=None, type=type).annotate(priority=Value(2, IntegerField())))
    # Serialized Redis keys, ordered profile-first then by pk.
    eligible_queue_ids = [
        redis_serialize_queue(queue) for queue in (
            profile_queues.union(project_queues).order_by('priority', 'pk'))
    ]
    if type == "irr":
        # IRR queues are handled through the database rather than Redis:
        # every coder must see IRR data, so items are never popped from the
        # Redis list; instead we look for queued data this profile has not
        # yet labeled, been assigned, or skipped.
        for queue_id in eligible_queue_ids:
            queue = redis_parse_queue(queue_id.encode())
            # first get the assigned data that was already labeled, or data already assigned
            labeled_irr_data = DataLabel.objects.filter(
                profile=profile).values_list('data', flat=True)
            assigned_data = AssignedData.objects.filter(
                profile=profile, queue=queue).values_list('data', flat=True)
            # A NULL label in IRRLog marks data this profile skipped.
            skipped_data = IRRLog.objects.filter(
                profile=profile, label__isnull=True).values_list(
                    'data', flat=True)
            assigned_unlabeled = DataQueue.objects.filter(queue=queue).exclude(
                data__in=labeled_irr_data).exclude(
                    data__in=assigned_data).exclude(data__in=skipped_data)
            # if there are no elements, return none
            # NOTE(review): this returns on the FIRST eligible IRR queue
            # instead of continuing to the next one — presumably there is
            # only ever a single IRR queue per project; confirm before
            # relying on multi-queue IRR setups.
            if len(assigned_unlabeled) == 0:
                return (None, None)
            else:
                # else, get the first element off the group and return it
                datum = Data.objects.get(pk=assigned_unlabeled[0].data.pk)
                return (queue, datum)
    if len(eligible_queue_ids) == 0:
        return (None, None)
    # Use a custom Lua script here to find the first nonempty queue atomically
    # and pop its first item. If all queues are empty, return nil.
    script = settings.REDIS.register_script('''
    for _, k in pairs(KEYS) do
        local m = redis.call('LPOP', k)
        if m then
            return {k, m}
        end
    end
    return nil
    ''')
    result = script(keys=eligible_queue_ids)
    if result is None:
        return (None, None)
    else:
        # Script returned {queue_key, data_key}; parse both back into
        # model objects.
        queue_key, data_key = result
        return (redis_parse_queue(queue_key), redis_parse_data(data_key))