def test_annotate_data(seeded_database, client, test_project_data, test_queue,
                       test_labels, test_admin_queue, test_irr_queue):
    """Check that annotating a datum requires project permission.

    The annotate endpoint is first called by a user without a
    ProjectPermissions row: the response must carry the permission error
    and no DataLabel may be stored. After CODER permission is granted the
    same call must succeed, store one DataLabel and leave no DataQueue row
    for the datum.
    """
    project = test_project_data
    fill_queue(test_queue, 'random')
    payload = {"labelID": test_labels[0].pk, "labeling_time": 3}
    denied_message = 'Account disabled by administrator. Please contact project owner for details'

    client.login(username=SEED_USERNAME, password=SEED_PASSWORD)
    coder = Profile.objects.get(user__username=SEED_USERNAME)
    datum = get_assignments(coder, project, 1)[0]
    annotate_url = f"/api/annotate_data/{datum.pk}/"

    # Without permission: the request is rejected and nothing is labeled.
    body = client.post(annotate_url, payload).json()
    assert 'detail' in body
    assert denied_message in body['detail']
    assert DataLabel.objects.filter(data=datum).count() == 0

    # With CODER permission: no error is reported, the label is stored and
    # the datum is no longer in any queue.
    ProjectPermissions.objects.create(profile=coder, project=project,
                                      permission='CODER')
    body = client.post(annotate_url, payload).json()
    assert 'error' not in body
    assert 'detail' not in body
    assert DataLabel.objects.filter(data=datum).count() == 1
    assert DataQueue.objects.filter(data=datum).count() == 0
def test_download_labeled_data(seeded_database, client, admin_client,
                               test_project_labeled, test_queue_labeled,
                               test_irr_queue_labeled, test_admin_queue_labeled):
    """Check the download-data API call: coders are refused, admins get CSV."""
    project = test_project_labeled
    fill_queue(test_queue_labeled, 'random', test_irr_queue_labeled,
               project.percentage_irr, project.batch_size)

    admin_client.login(username=SEED_USERNAME2, password=SEED_PASSWORD2)
    admin_profile = Profile.objects.get(user__username=SEED_USERNAME2)
    ProjectPermissions.objects.create(profile=admin_profile, project=project,
                                      permission='ADMIN')

    client.login(username=SEED_USERNAME, password=SEED_PASSWORD)
    coder_profile = Profile.objects.get(user__username=SEED_USERNAME)
    ProjectPermissions.objects.create(profile=coder_profile, project=project,
                                      permission='CODER')

    download_url = f"/api/download_data/{project.pk}/"

    # Admin privileges are required: a coder gets the permission error.
    coder_body = client.get(download_url).json()
    assert 'detail' in coder_body
    assert 'Invalid permission. Must be an admin' in coder_body['detail']

    # The admin response must be of the correct type.
    admin_response = admin_client.get(download_url)
    assert 'detail' not in admin_response
    assert admin_response.get("Content-Type") == "text/csv"
def test_unassign_after_fillqueue(db, test_profile, test_project_data, test_queue,
                                  test_labels, test_redis):
    """Track the redis list and set sizes through assign, label and refill.

    The redis list ('queue:<pk>') shrinks when data are assigned; the redis
    set ('set:<pk>') shrinks only when data are labeled. A second fill_queue
    call must bring the set back to the queue length while the list stays
    short by the five assigned-but-unlabeled data.
    """
    list_key = f"queue:{test_queue.pk}"
    set_key = f"set:{test_queue.pk}"

    fill_queue(test_queue, 'random')
    assert test_redis.llen(list_key) == test_queue.length
    assert test_redis.scard(set_key) == test_queue.length

    assigned = get_assignments(test_profile, test_project_data, 10)
    assert test_redis.llen(list_key) == (test_queue.length - 10)
    assert test_redis.scard(set_key) == test_queue.length

    # Label only the first five of the ten assigned data.
    label = test_labels[0]
    for position in range(5):
        label_data(label, assigned[position], test_profile, 3)
    assert test_redis.llen(list_key) == (test_queue.length - 10)
    assert test_redis.scard(set_key) == (test_queue.length - 5)

    fill_queue(test_queue, 'random')
    assert test_redis.llen(list_key) == test_queue.length - 5
    assert test_redis.scard(set_key) == test_queue.length
def test_queue_refill(setup_celery, test_project_data, test_all_queues, test_profile,
                      test_labels, test_redis, tmpdir, settings):
    """Check that the queues refill the way they should.

    One person labels everything in a batch. Afterwards the normal queue
    must hold the non-irr share of a batch again, while the irr queue must
    hold twice the irr share (irr% * batch size).
    """
    project = test_project_data
    normal_queue, admin_queue, irr_queue = test_all_queues
    fill_queue(normal_queue, 'random', irr_queue, project.percentage_irr,
               project.batch_size)

    irr_count = math.ceil((project.percentage_irr / 100) * project.batch_size)
    non_irr_count = math.ceil(
        ((100 - project.percentage_irr) / 100) * project.batch_size)

    # Label the whole batch: first the normal data, then the irr data.
    for queue_type, amount in (("normal", non_irr_count), ("irr", irr_count)):
        for _ in range(amount):
            datum = assign_datum(test_profile, project, queue_type)
            assert datum is not None
            label_data(test_labels[0], datum, test_profile, 3)
            check_and_trigger_model(datum, test_profile)

    normal_total = DataQueue.objects.filter(queue=normal_queue).count()
    irr_total = DataQueue.objects.filter(queue=irr_queue).count()
    assert normal_total == non_irr_count
    assert irr_total == irr_count * 2
def upload_data(form_data, project, queue=None, irr_queue=None, batch_size=30):
    """Perform data upload given validated form_data.

    Steps:
      1. Add the data to the database.
      2. Fill the queue if one was passed (only a new project will pass a
         queue object).
      3. Save the uploaded data file.
      4. Create the tf_idf file.
      5. Check and trigger the model.

    Steps 3-5 only run when add_data returned at least one row, and steps
    4-5 only when the project has a classifier.
    """
    new_df = add_data(project, form_data)
    if queue:
        fill_queue(
            queue=queue,
            irr_queue=irr_queue,
            orderby='random',
            irr_percent=project.percentage_irr,
            batch_size=batch_size,
        )

    if len(new_df) == 0:
        return

    save_data_file(new_df, project.pk)
    if project.classifier is None:
        return

    # Users can upload labeled data, which is added to the current training
    # set, so the model has to be checked and possibly triggered. Training
    # needs the tf_idf to exist, hence a chord: it guarantees the tf_idf
    # creation task has completed before the check-and-trigger task runs.
    def _launch_tfidf_then_model_check():
        return chord(
            tasks.send_tfidf_creation_task.s(project.pk),
            tasks.send_check_and_trigger_model_task.si(project.pk),
        ).apply_async()

    transaction.on_commit(_launch_tfidf_then_model_check)
def send_model_task(project_pk):
    """Train and save a model, predict, start a new training set, refill the queue.

    Predictions are skipped when the project's learning method is "random".
    The normal queue's length is recomputed from the batch size and the
    number of coders before the queue is refilled.
    """
    # Project imports are kept local to the task body, as in the original.
    from core.models import Project, TrainingSet
    from core.utils.utils_model import predict_data, train_and_save_model
    from core.utils.utils_queue import fill_queue, find_queue_length

    project = Project.objects.get(pk=project_pk)
    normal_queue = project.queue_set.get(type="normal")
    irr_queue = project.queue_set.get(type="irr")
    learning_method = project.learning_method
    batch_size = project.batch_size

    trained_model = train_and_save_model(project)
    if learning_method != "random":
        predict_data(project, trained_model)

    next_set_number = project.get_current_training_set().set_number + 1
    TrainingSet.objects.create(project=project, set_number=next_set_number)

    # The queue size depends on the number of coders (permission rows plus
    # one); resize the queue if that number has changed.
    coder_total = len(project.projectpermissions_set.all()) + 1
    wanted_length = find_queue_length(batch_size, coder_total)
    if wanted_length != normal_queue.length:
        normal_queue.length = wanted_length
        normal_queue.save()

    fill_queue(
        normal_queue,
        irr_queue=irr_queue,
        orderby=learning_method,
        irr_percent=project.percentage_irr,
        batch_size=batch_size,
    )
def test_fill_half_irr_queues(setup_celery, test_project_half_irr_data,
                              test_half_irr_all_queues, test_profile, test_redis,
                              tmpdir, settings):
    """Check queue filling for a project with equal irr settings (50%, 2).

    The normal and irr queues must together hold one batch, split in the
    expected proportion, with irr_ind matching the queue and no datum
    present in both queues.
    """
    normal_queue, admin_queue, irr_queue = test_half_irr_all_queues
    batch_size = test_project_half_irr_data.batch_size
    percentage_irr = test_project_half_irr_data.percentage_irr

    fill_queue(normal_queue, 'random', irr_queue, percentage_irr, batch_size)

    # The queue is filled with the correct proportion of irr and non-irr data.
    expected_irr = math.ceil((percentage_irr / 100) * batch_size)
    expected_normal = math.ceil(((100 - percentage_irr) / 100) * batch_size)
    normal_total = DataQueue.objects.filter(queue=normal_queue).count()
    irr_total = DataQueue.objects.filter(queue=irr_queue).count()

    assert (normal_total + irr_total) == batch_size
    assert normal_total == expected_normal
    assert irr_total == expected_irr
    assert normal_total == irr_total

    # Everything in the irr queue is flagged irr_ind=True ...
    assert DataQueue.objects.filter(queue=irr_queue,
                                    data__irr_ind=False).count() == 0
    # ... and nothing in the normal queue is.
    assert DataQueue.objects.filter(queue=normal_queue,
                                    data__irr_ind=True).count() == 0

    # No datum appears in both queues.
    irr_hashes = set(DataQueue.objects.filter(queue=irr_queue).values_list(
        'data__hash', flat=True))
    normal_hashes = set(DataQueue.objects.filter(queue=normal_queue).values_list(
        'data__hash', flat=True))
    assert irr_hashes.isdisjoint(normal_hashes)
def test_get_labeled_data(setup_celery, test_profile, test_project_labeled,
                          test_queue_labeled, test_irr_queue_labeled,
                          test_admin_queue_labeled, test_redis, tmpdir, settings):
    """Check that get_labeled_data returns the project's labeled data and labels."""
    project = test_project_labeled
    expected_labels = Label.objects.filter(project=project)
    fill_queue(test_queue_labeled, 'random', test_irr_queue_labeled,
               project.percentage_irr, project.batch_size)

    labeled_data, labels = get_labeled_data(project)
    assert isinstance(labeled_data, pd.DataFrame)
    assert isinstance(labels, pd.DataFrame)

    # Same number of labels and labeled data as stored for the project.
    assert len(labels) == len(expected_labels)
    stored_labels = DataLabel.objects.filter(data__project=project)
    assert len(labeled_data) == len(stored_labels)

    # Every returned ID matches an upload_id recorded in DataLabel.
    stored_ids = set(stored_labels.values_list("data__upload_id", flat=True))
    returned_ids = set(labeled_data["ID"].tolist())
    assert len(stored_ids & returned_ids) == len(labeled_data)
def test_model_task_redis_no_dupes_data_left_in_queue(
        test_project_labeled_and_tfidf, test_queue_labeled, test_irr_queue_labeled,
        test_admin_queue_labeled, test_redis, tmpdir, settings):
    """Check the model task leaves no duplicates in the redis queue.

    The queues are enlarged to 40 so that data remain queued after the
    non-irr share of a batch has been labeled and the model task has run.
    """
    project = test_project_labeled_and_tfidf
    starting_set_number = project.get_current_training_set().set_number

    queue = project.queue_set.get(type="normal")
    queue.length = 40
    queue.save()

    irr_queue = project.queue_set.get(type="irr")
    irr_queue.length = 40
    irr_queue.save()

    pickle_dir = tmpdir.listdir()[0].mkdir('model_pickles')
    settings.MODEL_PICKLE_PATH = str(pickle_dir)

    batch_size = project.batch_size
    fill_queue(queue, 'random', irr_queue,
               irr_percent=project.percentage_irr, batch_size=batch_size)

    labels = project.labels.all()
    non_irr_total = int(batch_size * ((100 - project.percentage_irr) / 100))
    for _ in range(non_irr_total):
        datum = assign_datum(project.creator, project)
        label_data(random.choice(labels), datum, project.creator, 3)

    tasks.send_model_task.delay(project.pk).get()
    current_set_number = project.get_current_training_set().set_number
    assert current_set_number == starting_set_number + 1

    queued_items = test_redis.lrange(redis_serialize_queue(queue), 0, -1)
    assert len(queued_items) == len(set(queued_items))
def test_cohens_kappa_perc_agreement_no_agreement(
    setup_celery,
    test_project_half_irr_data,
    test_half_irr_all_queues,
    test_profile,
    test_profile2,
    test_labels_half_irr,
    test_redis,
    tmpdir,
    settings,
):
    """Check kappa and percent agreement when the two coders never agree."""
    project = test_project_half_irr_data
    labels = test_labels_half_irr
    normal_queue, admin_queue, irr_queue = test_half_irr_all_queues
    fill_queue(
        normal_queue, "random", irr_queue, project.percentage_irr, project.batch_size
    )

    # Label five irr data; the second coder always picks the next label in
    # the cycle, so the two coders disagree on every datum.
    for round_number in range(5):
        datum = assign_datum(test_profile, project, "irr")
        assign_datum(test_profile2, project, "irr")
        first_choice = labels[round_number % 3]
        second_choice = labels[(round_number + 1) % 3]
        label_data(first_choice, datum, test_profile, 3)
        label_data(second_choice, datum, test_profile2, 3)

    kappa, perc = cohens_kappa(project)
    assert round(kappa, 3) == -0.471
    assert perc == 0.0
def test_unassign(db, test_profile, test_project_data, test_queue, test_redis):
    """Check that unassigning a datum puts it back at the front of the queue.

    Assigning removes the datum from the redis list (but not the set) and
    creates an AssignedData row; unassigning must undo both, and the same
    datum must be the next one assigned.
    """
    list_key = f"queue:{test_queue.pk}"
    set_key = f"set:{test_queue.pk}"

    def is_assigned(datum):
        return AssignedData.objects.filter(data=datum, profile=test_profile).exists()

    fill_queue(test_queue, orderby='random')
    assert test_redis.llen(list_key) == test_queue.length
    assert test_redis.scard(set_key) == test_queue.length

    datum = get_assignments(test_profile, test_project_data, 1)[0]
    assert test_redis.llen(list_key) == (test_queue.length - 1)
    assert test_redis.scard(set_key) == test_queue.length
    assert is_assigned(datum)

    unassign_datum(datum, test_profile)
    assert test_redis.llen(list_key) == test_queue.length
    assert test_redis.scard(set_key) == test_queue.length
    assert not is_assigned(datum)

    # The unassigned datum should be the next to be assigned.
    next_datum = get_assignments(test_profile, test_project_data, 1)[0]
    assert next_datum == datum
def test_percent_agree_table(seeded_database, client, admin_client,
                             test_project_all_irr_data, test_all_irr_all_queues,
                             test_labels_all_irr):
    """Check that the percent agree table can be called and returns correctly.

    Note: the exact values of the table are checked in the util tests.
    """
    labels = test_labels_all_irr
    normal_queue, admin_queue, irr_queue = test_all_irr_all_queues
    project = test_project_all_irr_data

    client.login(username=SEED_USERNAME, password=SEED_PASSWORD)
    client_profile = Profile.objects.get(user__username=SEED_USERNAME)
    ProjectPermissions.objects.create(profile=client_profile, project=project,
                                      permission='CODER')

    admin_client.login(username=SEED_USERNAME2, password=SEED_PASSWORD2)
    admin_profile = Profile.objects.get(user__username=SEED_USERNAME2)
    ProjectPermissions.objects.create(profile=admin_profile, project=project,
                                      permission='ADMIN')

    third_profile = Profile.objects.get(user__username="******")
    third_name = str(third_profile)

    fill_queue(normal_queue, 'random', irr_queue, project.percentage_irr,
               project.batch_size)

    table_url = f"/api/perc_agree_table/{project.pk}/"

    # A non-admin should not be able to call the endpoint.
    response = client.get(table_url)
    assert 403 == response.status_code
    assert "Invalid permission. Must be an admin" in str(response.content)

    data = get_assignments(client_profile, project, 15)
    data2 = get_assignments(admin_profile, project, 15)
    for i in range(15):
        annotation = {"labelID": labels[i % 3].pk, "labeling_time": 3}

        reply = admin_client.post(f"/api/annotate_data/{data[i].pk}/", annotation).json()
        assert 'error' not in reply
        assert 'detail' not in reply

        reply = client.post(f"/api/annotate_data/{data2[i].pk}/", annotation).json()
        assert 'error' not in reply
        assert 'detail' not in reply

    # The three user pairs are in the table.
    table = admin_client.get(table_url).json()
    assert 'data' in table
    response_frame = pd.DataFrame(table['data'])

    # Expected combinations: [adm, cl], [adm, u3], [cl, u3].
    first_coders = response_frame['First Coder'].tolist()
    second_coders = response_frame['Second Coder'].tolist()
    assert first_coders == [SEED_USERNAME, SEED_USERNAME, SEED_USERNAME2]
    assert second_coders == [SEED_USERNAME2, third_name, third_name]

    # The table has just those three combinations.
    assert len(response_frame) == 3

    # Combinations involving the third user have "No samples".
    third_user_rows = response_frame.loc[response_frame['Second Coder'] == third_name]
    assert third_user_rows["Percent Agreement"].tolist() == ["No samples", "No samples"]

    # The percent agreement reads n%, with n between 0 and 100.
    perc = response_frame["Percent Agreement"].tolist()[0]
    assert 0 <= float(perc[:-1]) <= 100
def test_redis_parse_list_dataids(test_queue, test_redis):
    """Check that redis_parse_list_dataids recovers the queued data ids.

    After filling the queue, the raw entries of the queue's redis list are
    parsed and must match, ignoring order, the primary keys of the data
    attached to the queue in the database.
    """
    fill_queue(test_queue, orderby="random")

    data_ids = [d.pk for d in test_queue.data.all()]
    redis_ids = test_redis.lrange(redis_serialize_queue(test_queue), 0, -1)
    parsed_ids = redis_parse_list_dataids(redis_ids)

    # list.sort() sorts in place and returns None, so comparing the results
    # of two .sort() calls is always `None == None` and can never fail.
    # Compare sorted copies so the ids themselves are checked.
    assert sorted(data_ids) == sorted(parsed_ids)
def test_redis_parse_data(test_queue, test_redis):
    """Check that a key popped from the redis queue parses to an existing datum."""
    fill_queue(test_queue, orderby="random")

    raw_key = test_redis.lpop(redis_serialize_queue(test_queue))
    datum = redis_parse_data(raw_key)

    # The parsed datum exists and is linked to a queue.
    assert_obj_exists(Data, {"pk": datum.pk})
    assert_obj_exists(DataQueue, {"data_id": datum.pk})
def test_fill_queue_all_remaining_data(db, test_queue):
    """Check that an oversized queue is filled with all the project's data."""
    available = Data.objects.filter(project=test_queue.project).count()

    # Make the queue one slot longer than the amount of data available.
    test_queue.length = available + 1
    test_queue.save()

    fill_queue(test_queue, orderby="random")
    assert test_queue.data.count() == available
def test_init_redis_one_nonempty_queue(db, test_project_data, test_redis):
    """Check that init_redis rebuilds redis from the database for one filled queue."""
    filled_queue = add_queue(test_project_data, 10)
    fill_queue(filled_queue, orderby="random")

    # Wipe redis, then let init_redis repopulate it.
    test_redis.flushdb()
    init_redis()

    assert_redis_matches_db(test_redis)
def test_restore_data(
    seeded_database,
    client,
    admin_client,
    test_project_data,
    test_queue,
    test_irr_queue,
    test_labels,
    test_admin_queue,
):
    """Check that data can be restored after it is discarded."""
    project = test_project_data
    fill_queue(
        test_queue, "random", test_irr_queue, project.percentage_irr, project.batch_size
    )

    admin_client.login(username=SEED_USERNAME2, password=SEED_PASSWORD2)
    admin_profile = Profile.objects.get(user__username=SEED_USERNAME2)
    ProjectPermissions.objects.create(
        profile=admin_profile, project=project, permission="ADMIN"
    )

    client.login(username=SEED_USERNAME, password=SEED_PASSWORD)
    client_profile = Profile.objects.get(user__username=SEED_USERNAME)
    ProjectPermissions.objects.create(
        profile=client_profile, project=project, permission="CODER"
    )

    # The coder gets a batch (should be IRR and non-IRR) and skips all of it.
    data = get_assignments(client_profile, project, 30)
    for position in range(30):
        client.post(f"/api/skip_data/{data[position].pk}/")

    # The admin also gets a batch and skips everything.
    data = get_assignments(admin_profile, project, 30)
    for position in range(30):
        admin_client.post(f"/api/skip_data/{data[position].pk}/")

    admin_data = DataQueue.objects.filter(data__project=project, queue=test_admin_queue)

    # Discard all data.
    for entry in admin_data:
        admin_client.post(f"/api/discard_data/{entry.data.pk}/")

    # Admin privileges are required to restore.
    response = client.post(f"/api/restore_data/{admin_data[0].data.pk}/").json()
    assert "detail" in response
    assert "Invalid permission. Must be an admin" in response["detail"]

    # Restore all data; none of it should remain in the recycle bin.
    for entry in admin_data:
        admin_client.post(f"/api/restore_data/{entry.data.pk}/")
        assert RecycleBin.objects.filter(data=entry.data).count() == 0
        assert not Data.objects.get(pk=entry.data.pk).irr_ind
def test_model_task_redis_no_dupes_data_unassign_assigned_data(
        test_project_labeled_and_tfidf, test_queue_labeled, test_irr_queue_labeled,
        test_admin_queue_labeled, test_redis, tmpdir, settings):
    """Check the redis queue stays duplicate-free across model runs and unassignment.

    The model task is run twice, each time after a batch has been labeled,
    and the still-assigned data are then unassigned in bulk. After every
    step the redis list of the normal queue must contain no duplicates.
    """
    project = test_project_labeled_and_tfidf

    person2 = create_profile('test_profilezzz', 'password', '*****@*****.**')
    person3 = create_profile('test_profile2', 'password', '*****@*****.**')
    ProjectPermissions.objects.create(profile=person2, project=project,
                                      permission='CODER')
    ProjectPermissions.objects.create(profile=person3, project=project,
                                      permission='CODER')

    starting_set_number = project.get_current_training_set().set_number

    queue = project.queue_set.get(type="normal")
    queue.length = 40
    queue.save()

    irr_queue = project.queue_set.get(type="irr")
    irr_queue.length = 40
    irr_queue.save()

    pickle_dir = tmpdir.listdir()[0].mkdir('model_pickles')
    settings.MODEL_PICKLE_PATH = str(pickle_dir)

    def assert_redis_queue_has_no_duplicates():
        queued_items = test_redis.lrange(redis_serialize_queue(queue), 0, -1)
        assert len(queued_items) == len(set(queued_items))

    batch_size = project.batch_size
    fill_queue(queue, 'random', irr_queue,
               irr_percent=project.percentage_irr, batch_size=batch_size)

    labels = project.labels.all()

    # First round: label a full batch, then run the model task.
    assignments = get_assignments(project.creator, project, batch_size)
    for assignment in assignments:
        label_data(random.choice(labels), assignment, project.creator, 3)

    tasks.send_model_task.delay(project.pk).get()
    assert project.get_current_training_set().set_number == starting_set_number + 1
    assert_redis_queue_has_no_duplicates()

    # Second round: assign 40 but label only one batch, leaving the rest assigned.
    assignments = get_assignments(project.creator, project, 40)
    for assignment in assignments[:batch_size]:
        label_data(random.choice(labels), assignment, project.creator, 3)

    tasks.send_model_task.delay(project.pk).get()
    assert project.get_current_training_set().set_number == starting_set_number + 2
    assert_redis_queue_has_no_duplicates()

    # Unassigning the creator's remaining data must not create duplicates either.
    batch_unassign(project.creator)
    assert_redis_queue_has_no_duplicates()
def test_skip_irr(
    setup_celery,
    test_project_half_irr_data,
    test_half_irr_all_queues,
    test_profile,
    test_profile2,
    test_profile3,
    test_labels_half_irr,
    test_redis,
    tmpdir,
    settings,
):
    """Check where an irr datum ends up when coders skip it."""
    project = test_project_half_irr_data
    normal_queue, admin_queue, irr_queue = test_half_irr_all_queues
    fill_queue(normal_queue, "random", irr_queue, project.percentage_irr,
               project.batch_size)

    def queue_rows(datum, queue):
        return DataQueue.objects.filter(data=datum, queue=queue).count()

    # Get an irr datum. One should exist.
    datum = assign_datum(test_profile, project, "irr")
    assert datum is not None

    # One user skips it: not in the admin queue, still in the irr queue,
    # logged in IRRLog for that user, and not in DataLabel.
    skip_data(datum, test_profile)
    assert queue_rows(datum, admin_queue) == 0
    assert queue_rows(datum, irr_queue) == 1
    assert IRRLog.objects.filter(data=datum, profile=test_profile).count() == 1
    assert DataLabel.objects.filter(data=datum, profile=test_profile).count() == 0

    # The other user skips it too: now in the admin queue, in IRRLog twice,
    # and nowhere else.
    datum2 = assign_datum(test_profile2, project, "irr")
    assert datum.pk == datum2.pk
    skip_data(datum2, test_profile2)
    assert queue_rows(datum, admin_queue) == 1
    assert queue_rows(datum, irr_queue) == 0
    assert IRRLog.objects.filter(data=datum).count() == 2
    assert DataLabel.objects.filter(data=datum).count() == 0

    # Two users label an irr datum, then a third user skips it: it is in
    # IRRLog three times and labeled once, but in neither queue.
    second_datum = assign_datum(test_profile, project, "irr")
    second_datum2 = assign_datum(test_profile2, project, "irr")
    assert second_datum.pk != datum.pk
    assert second_datum.pk == second_datum2.pk
    second_datum3 = assign_datum(test_profile3, project, "irr")
    assert second_datum2.pk == second_datum3.pk

    label_data(test_labels_half_irr[0], second_datum, test_profile, 3)
    label_data(test_labels_half_irr[0], second_datum2, test_profile2, 3)
    skip_data(second_datum3, test_profile3)
    assert queue_rows(second_datum3, admin_queue) == 0
    assert queue_rows(second_datum3, irr_queue) == 0
    assert IRRLog.objects.filter(data=second_datum3).count() == 3
    assert DataLabel.objects.filter(data=second_datum3).count() == 1
def test_annotate_irr(setup_celery, test_project_half_irr_data, test_half_irr_all_queues,
                      test_profile, test_profile2, test_profile3, test_labels_half_irr,
                      test_redis, tmpdir, settings):
    """Check the irr labeling workflow and which models the data ends up in."""
    project = test_project_half_irr_data
    normal_queue, admin_queue, irr_queue = test_half_irr_all_queues
    fill_queue(normal_queue, 'random', irr_queue, project.percentage_irr,
               project.batch_size)

    def queue_rows(datum, queue):
        return DataQueue.objects.filter(data=datum, queue=queue).count()

    # Get an irr datum. One should exist.
    datum = assign_datum(test_profile, project, "irr")
    assert datum is not None

    # One user labels it: in DataLabel, not in IRRLog, still in the irr queue.
    label_data(test_labels_half_irr[0], datum, test_profile, 3)
    assert DataLabel.objects.filter(data=datum, profile=test_profile).count() > 0
    assert IRRLog.objects.filter(data=datum, profile=test_profile).count() == 0
    assert queue_rows(datum, irr_queue) > 0

    datum2 = assign_datum(test_profile2, project, "irr")
    assert datum.pk == datum2.pk
    datum3 = assign_datum(test_profile3, project, "irr")
    assert datum.pk == datum3.pk

    # A second user gives the same label: one DataLabel owned by the project
    # creator, two IRRLog rows, and no longer in the irr queue.
    label_data(test_labels_half_irr[0], datum2, test_profile2, 3)
    assert DataLabel.objects.filter(data=datum2).count() == 1
    assert DataLabel.objects.get(data=datum2).profile.pk == project.creator.pk
    assert IRRLog.objects.filter(data=datum2).count() == 2
    assert queue_rows(datum2, irr_queue) == 0

    # A third user labels the same datum: it is added to IRRLog but the
    # existing label is not overwritten.
    label_data(test_labels_half_irr[0], datum3, test_profile3, 3)
    assert IRRLog.objects.filter(data=datum3).count() == 3
    assert DataLabel.objects.filter(data=datum3).count() == 1
    assert DataLabel.objects.get(data=datum3).profile.pk == project.creator.pk

    # Two users disagree on a datum: it goes to the admin queue, leaves the
    # irr queue, has no DataLabel, and is in IRRLog twice.
    second_datum = assign_datum(test_profile, project, "irr")
    # This should be a new datum.
    assert datum.pk != second_datum.pk
    second_datum2 = assign_datum(test_profile2, project, "irr")
    label_data(test_labels_half_irr[0], second_datum, test_profile, 3)
    label_data(test_labels_half_irr[1], second_datum2, test_profile2, 3)
    assert queue_rows(second_datum2, admin_queue) == 1
    assert queue_rows(second_datum2, irr_queue) == 0
    assert DataLabel.objects.filter(data=second_datum2).count() == 0
    assert IRRLog.objects.filter(data=second_datum2).count() == 2
def test_get_irr_metrics(seeded_database, client, admin_client,
                         test_project_half_irr_data, test_half_irr_all_queues,
                         test_labels_half_irr):
    """Check the irr metrics API call.

    Note: the exact values are checked in the util tests.
    """
    # Sign in the users.
    labels = test_labels_half_irr
    normal_queue, admin_queue, irr_queue = test_half_irr_all_queues
    project = test_project_half_irr_data

    client.login(username=SEED_USERNAME, password=SEED_PASSWORD)
    client_profile = Profile.objects.get(user__username=SEED_USERNAME)
    ProjectPermissions.objects.create(profile=client_profile, project=project,
                                      permission='CODER')

    admin_client.login(username=SEED_USERNAME2, password=SEED_PASSWORD2)
    admin_profile = Profile.objects.get(user__username=SEED_USERNAME2)
    ProjectPermissions.objects.create(profile=admin_profile, project=project,
                                      permission='ADMIN')

    fill_queue(normal_queue, 'random', irr_queue, project.percentage_irr,
               project.batch_size)

    metrics_url = f"/api/get_irr_metrics/{project.pk}/"

    # A non-admin should not be able to call the endpoint.
    response = client.get(metrics_url)
    assert 403 == response.status_code
    assert "Invalid permission. Must be an admin" in str(response.content)

    # Initially no irr data has been processed.
    metrics = admin_client.get(metrics_url).json()
    assert 'error' not in metrics
    assert 'detail' not in metrics
    assert 'kappa' in metrics
    assert metrics['kappa'] == "No irr data processed"
    assert 'percent agreement' in metrics
    assert metrics['percent agreement'] == "No irr data processed"

    # Each person labels three irr data, with different labels.
    data = get_assignments(client_profile, project, 3)
    data2 = get_assignments(admin_profile, project, 3)
    for i in range(3):
        reply = client.post(f"/api/annotate_data/{data[i].pk}/", {
            "labelID": labels[i].pk,
            "labeling_time": 3
        }).json()
        assert 'error' not in reply
        assert 'detail' not in reply

        reply = admin_client.post(f"/api/annotate_data/{data2[i].pk}/", {
            "labelID": labels[(i + 1) % 3].pk,
            "labeling_time": 3
        }).json()
        assert 'error' not in reply

    metrics = admin_client.get(metrics_url).json()

    # The percent agreement is a number between 0 and 100 followed by a %.
    assert 'percent agreement' in metrics
    agreement = metrics['percent agreement']
    percent = float(agreement[:-1])
    assert 0 <= percent <= 100
    assert '%' == agreement[-1]

    # Kappa is a value between -1 and 1.
    assert 'kappa' in metrics
    assert -1 <= metrics['kappa'] <= 1
def test_assign_datum_profile_queue_returns_correct_datum(
        db, test_profile_queue, test_profile, test_profile_queue2, test_profile2,
        test_redis):
    """Check that assign_datum returns a Data instance when two profile queues are filled."""
    for profile_queue in (test_profile_queue, test_profile_queue2):
        fill_queue(profile_queue, orderby='random')

    assigned = assign_datum(test_profile, test_profile_queue.project)

    assert isinstance(assigned, Data)
def test_assign_datum_project_queue_returns_datum(db, test_queue, test_profile,
                                                  test_redis):
    """Assign a datum from a project-wide queue (null profile ID)."""
    fill_queue(test_queue, orderby="random")

    assigned = assign_datum(test_profile, test_queue.project)

    # Make sure a datum was actually returned.
    assert isinstance(assigned, Data)
def test_fill_queue_random_predicted_data(test_project_predicted_data, test_queue,
                                          test_redis):
    """Check a random fill of a project with predicted data.

    The queue must be filled to its length, redis must match the database,
    and every queued datum must be unlabeled and have a DataUncertainty row.
    """
    fill_queue(test_queue, 'random')

    assert_redis_matches_db(test_redis)
    assert test_queue.data.count() == test_queue.length

    for queued in test_queue.data.all():
        existing_labels = queued.datalabel_set.all()
        assert len(existing_labels) == 0
        assert_obj_exists(DataUncertainty, {'data': queued})
def test_g_naivebayes_classifier(
    setup_celery,
    test_project_gnb_data_tfidf,
    test_gnb_labels,
    test_gnb_queue_list,
    test_profile,
    test_redis,
    tmpdir,
    settings,
):
    """Check that a Gaussian Naive Bayes project can train a model and predict.

    A full batch is labeled, the model is triggered, and the test then
    checks the saved pickle file and the number of predictions created.
    """
    normal_queue, admin_queue, irr_queue = test_gnb_queue_list
    labels = test_gnb_labels
    project = test_project_gnb_data_tfidf

    learning_method = project.learning_method
    batch_size = project.batch_size
    initial_training_set = project.get_current_training_set()

    pickle_dir = tmpdir.listdir()[0].mkdir("model_pickles")
    settings.MODEL_PICKLE_PATH = str(pickle_dir)

    assert project.classifier == "gnb"
    assert learning_method == "least confident"

    fill_queue(normal_queue, "random")
    assert DataQueue.objects.filter(queue=normal_queue).count() == batch_size

    # Label a full batch, cycling through the three labels.
    for step in range(batch_size):
        datum = assign_datum(test_profile, project)
        label_data(labels[step % 3], datum, test_profile, 3)

    ret_str = check_and_trigger_model(datum)
    assert ret_str == "model running"

    # The model was created and saved.
    assert_obj_exists(Model, {"project": project})
    model = Model.objects.get(project=project)
    assert os.path.isfile(model.pickle_path)
    expected_pickle_name = (
        f"project_{project.pk}_training_{initial_training_set.set_number}.pkl"
    )
    assert model.pickle_path == os.path.join(str(pickle_dir), expected_pickle_name)

    # One prediction per unlabeled datum per label.
    predictions = DataPrediction.objects.filter(data__project=project)
    unlabeled_total = Data.objects.filter(project=project, labelers=None).count()
    assert len(predictions) == unlabeled_total * project.labels.count()
def test_init_redis_multiple_queues(db, test_project_data, test_redis):
    """Check that init_redis rebuilds redis when the project has two queues.

    Only the first queue is filled; the second is created and left empty.
    """
    filled_queue = add_queue(test_project_data, 10)
    fill_queue(filled_queue, orderby="random")
    add_queue(test_project_data, 10)

    # Wipe redis, then let init_redis repopulate it.
    test_redis.flushdb()
    init_redis()

    assert_redis_matches_db(test_redis)
def test_redis_parse_queue(test_queue, test_redis):
    """Check that a redis queue key parses back to the queue that was filled."""
    fill_queue(test_queue, orderby='random')

    # Take the first redis key whose decoded name contains 'queue'.
    queue_keys = [key for key in test_redis.keys() if 'queue' in key.decode()]
    parsed = redis_parse_queue(queue_keys[0])

    assert parsed.pk == test_queue.pk
    assert_obj_exists(DataQueue, {'queue_id': parsed.pk})
    assert_obj_exists(Queue, {'pk': parsed.pk})
def test_redis_parse_queue(test_queue, test_redis):
    """Check that redis_parse_queue maps a redis queue key to its Queue row."""
    fill_queue(test_queue, orderby="random")

    # Use the first redis key whose decoded name mentions "queue".
    candidate_keys = [key for key in test_redis.keys() if "queue" in key.decode()]
    first_key = candidate_keys[0]
    parsed = redis_parse_queue(first_key)

    assert parsed.pk == test_queue.pk
    assert_obj_exists(DataQueue, {"queue_id": parsed.pk})
    assert_obj_exists(Queue, {"pk": parsed.pk})
def test_fill_nonempty_queue(db, test_queue):
    """Check that filling a queue that already holds one datum tops it up to its length."""
    # Manually add one observation so the queue starts out nonempty.
    seeded_datum = Data.objects.create(
        text="test data", project=test_queue.project, upload_id_hash=md5_hash(0)
    )
    DataQueue.objects.create(data=seeded_datum, queue=test_queue)
    assert test_queue.data.count() == 1

    fill_queue(test_queue, orderby="random")

    assert test_queue.data.count() == test_queue.length
def test_pop_first_nonempty_queue_single_queue(db, test_project_data, test_queue,
                                               test_redis):
    """Check that popping from a project with one filled queue returns that queue and a datum."""
    fill_queue(test_queue, orderby='random')

    popped_queue, popped_datum = pop_first_nonempty_queue(test_project_data)

    assert isinstance(popped_queue, Queue)
    assert popped_queue == test_queue
    assert isinstance(popped_datum, Data)