def test_get_current_training_set_one_training_set(test_project):
    training_set = test_project.get_current_training_set()
    assertTrainingSet = TrainingSet.objects.filter(
        project=test_project).order_by('-set_number')[0]

    assert_obj_exists(TrainingSet, {'project': test_project, 'set_number': 0})
    assert training_set == assertTrainingSet
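# Every test in this file leans on an assert_obj_exists helper imported from the
# project's test utilities. That utility is not shown here; the following is only
# a minimal sketch of what it is assumed to do (filter the given Django model by
# an attribute dict and assert that a matching row exists), not the actual
# implementation.
def assert_obj_exists(model_cls, attrs):
    """Assert that at least one instance of `model_cls` matches `attrs`."""
    matches = model_cls.objects.filter(**attrs)
    assert matches.exists(), "No {0} found matching {1}".format(
        model_cls.__name__, attrs)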
def test_redis_parse_data(test_queue, test_redis):
    fill_queue(test_queue, orderby="random")

    popped_data_key = test_redis.lpop(redis_serialize_queue(test_queue))
    parsed_data = redis_parse_data(popped_data_key)

    assert_obj_exists(Data, {"pk": parsed_data.pk})
    assert_obj_exists(DataQueue, {"data_id": parsed_data.pk})
def test_add_queue_no_profile(test_project):
    QUEUE_LEN = 10
    add_queue(test_project, QUEUE_LEN)
    assert_obj_exists(Queue, {
        'project': test_project,
        'length': QUEUE_LEN,
        'profile': None
    })
def test_add_queue_profile(test_project, test_profile):
    QUEUE_LEN = 10
    add_queue(test_project, QUEUE_LEN, profile=test_profile)
    assert_obj_exists(Queue, {
        'project': test_project,
        'length': QUEUE_LEN,
        'profile': test_profile
    })
def test_check_and_trigger_queue_changes_success(
        setup_celery, test_project_labeled_and_tfidf, test_queue_labeled,
        test_irr_queue_labeled, test_redis, tmpdir, settings, test_profile2):
    project = test_project_labeled_and_tfidf
    test_queue = test_queue_labeled
    initial_training_set = project.get_current_training_set()
    model_path_temp = tmpdir.listdir()[0].mkdir('model_pickles')
    settings.MODEL_PICKLE_PATH = str(model_path_temp)

    # Add another user to permissions
    ProjectPermissions.objects.create(profile=test_profile2,
                                      project=project,
                                      permission='CODER')

    datum = DataLabel.objects.filter(data__project=project).first().data
    check = check_and_trigger_model(datum)
    assert check == 'model running'

    # Assert model created and saved
    assert_obj_exists(Model, {'project': project})
    model = Model.objects.get(project=project)
    assert os.path.isfile(model.pickle_path)
    assert model.pickle_path == os.path.join(
        str(model_path_temp), 'project_' + str(project.pk) + '_training_' +
        str(initial_training_set.set_number) + '.pkl')

    # Assert predictions created
    predictions = DataPrediction.objects.filter(data__project=project)
    assert len(predictions) == Data.objects.filter(
        project=project, labelers=None).count() * project.labels.count()

    # Assert queue filled and redis synced
    batch_size = project.batch_size
    q = project.queue_set.get(type="normal")
    q_irr = project.queue_set.get(type="irr")
    assert (q.data.count() + q_irr.data.count()) == batch_size
    assert_redis_matches_db(test_redis)

    num_coders = len(project.projectpermissions_set.all()) + 1
    new_queue_length = find_queue_length(batch_size, num_coders)
    assert q.length == new_queue_length

    # Assert least confident in queue
    data_list = get_ordered_data(test_queue.data.all(), 'least confident')
    previous_lc = data_list[0].datauncertainty_set.get().least_confident
    for datum in data_list:
        assert len(datum.datalabel_set.all()) == 0
        assert_obj_exists(DataUncertainty, {'data': datum})
        assert datum.datauncertainty_set.get().least_confident <= previous_lc
        previous_lc = datum.datauncertainty_set.get().least_confident
    assert (DataQueue.objects.filter(queue=test_queue).count() +
            DataQueue.objects.filter(queue=test_irr_queue_labeled).count()
            ) == batch_size

    # Assert new training set
    assert project.get_current_training_set() != initial_training_set
    assert (project.get_current_training_set().set_number ==
            initial_training_set.set_number + 1)
def test_fill_queue_random_predicted_data(test_project_predicted_data,
                                          test_queue, test_redis):
    fill_queue(test_queue, 'random')

    assert_redis_matches_db(test_redis)
    assert test_queue.data.count() == test_queue.length

    for datum in test_queue.data.all():
        assert len(datum.datalabel_set.all()) == 0
        assert_obj_exists(DataUncertainty, {'data': datum})
def test_g_naivebayes_classifier(
    setup_celery,
    test_project_gnb_data_tfidf,
    test_gnb_labels,
    test_gnb_queue_list,
    test_profile,
    test_redis,
    tmpdir,
    settings,
):
    """This tests that a project with the Gaussian Naive Bayes classifier can
    successfully train and give predictions for a model."""
    normal_queue, admin_queue, irr_queue = test_gnb_queue_list
    labels = test_gnb_labels
    project = test_project_gnb_data_tfidf

    active_l = project.learning_method
    batch_size = project.batch_size
    initial_training_set = project.get_current_training_set()
    model_path_temp = tmpdir.listdir()[0].mkdir("model_pickles")
    settings.MODEL_PICKLE_PATH = str(model_path_temp)

    assert project.classifier == "gnb"
    assert active_l == "least confident"

    fill_queue(normal_queue, "random")

    assert DataQueue.objects.filter(queue=normal_queue).count() == batch_size

    for i in range(batch_size):
        datum = assign_datum(test_profile, project)
        label_data(labels[i % 3], datum, test_profile, 3)

    ret_str = check_and_trigger_model(datum)
    assert ret_str == "model running"

    # Assert model created and saved
    assert_obj_exists(Model, {"project": project})
    model = Model.objects.get(project=project)
    assert os.path.isfile(model.pickle_path)
    assert model.pickle_path == os.path.join(
        str(model_path_temp),
        "project_" + str(project.pk) + "_training_" +
        str(initial_training_set.set_number) + ".pkl",
    )

    # Assert predictions created
    predictions = DataPrediction.objects.filter(data__project=project)
    assert (
        len(predictions)
        == Data.objects.filter(project=project, labelers=None).count()
        * project.labels.count()
    )
def test_redis_parse_queue(test_queue, test_redis):
    fill_queue(test_queue, orderby='random')

    queue_key = [key for key in test_redis.keys() if 'queue' in key.decode()][0]
    parsed_queue = redis_parse_queue(queue_key)

    assert parsed_queue.pk == test_queue.pk
    assert_obj_exists(DataQueue, {'queue_id': parsed_queue.pk})
    assert_obj_exists(Queue, {'pk': parsed_queue.pk})
def test_model_task(
    test_project_labeled_and_tfidf,
    test_queue_labeled,
    test_irr_queue_labeled,
    test_redis,
    tmpdir,
    settings,
):
    project = test_project_labeled_and_tfidf
    test_queue = test_queue_labeled
    initial_training_set = project.get_current_training_set()
    initial_queue_length = test_queue.length
    model_path_temp = tmpdir.listdir()[0].mkdir("model_pickles")
    settings.MODEL_PICKLE_PATH = str(model_path_temp)

    tasks.send_model_task.delay(project.pk).get()

    # Assert model created and saved
    assert_obj_exists(Model, {"project": project})
    model = Model.objects.get(project=project)
    assert os.path.isfile(model.pickle_path)
    assert model.pickle_path == os.path.join(
        str(model_path_temp),
        "project_" + str(project.pk) + "_training_" +
        str(initial_training_set.set_number) + ".pkl",
    )

    # Assert predictions created
    predictions = DataPrediction.objects.filter(data__project=project)
    assert (len(predictions) == Data.objects.filter(
        project=project, labelers=None).count() * project.labels.count())

    # Assert both queues are filled and redis synced
    assert (test_queue.data.count() +
            test_irr_queue_labeled.data.count()) == test_queue.length
    assert_redis_matches_db(test_redis)

    # Assert queue correct size
    assert test_queue.length == initial_queue_length

    # Assert least confident in queue
    data_list = get_ordered_data(test_queue.data.all(), "least confident")
    previous_lc = data_list[0].datauncertainty_set.get().least_confident
    for datum in data_list:
        assert len(datum.datalabel_set.all()) == 0
        assert_obj_exists(DataUncertainty, {"data": datum})
        assert datum.datauncertainty_set.get().least_confident <= previous_lc
        previous_lc = datum.datauncertainty_set.get().least_confident

    # Assert new training set
    assert project.get_current_training_set() != initial_training_set
    assert (project.get_current_training_set().set_number ==
            initial_training_set.set_number + 1)
def test_get_assignments_no_existing_assignment_one_assignment(
        db, test_profile, test_project_data, test_queue, test_redis):
    fill_queue(test_queue, orderby='random')

    assert AssignedData.objects.count() == 0

    data = get_assignments(test_profile, test_project_data, 1)

    assert len(data) == 1
    assert isinstance(data[0], Data)
    assert_obj_exists(AssignedData, {'data': data[0], 'profile': test_profile})
def test_add_data_no_labels(db, test_project):
    test_data = read_test_data_backend(
        file='./core/data/test_files/test_no_labels.csv')
    df = add_data(test_project, test_data)

    for i, row in df.iterrows():
        assert_obj_exists(
            Data, {
                'upload_id_hash': row['id_hash'],
                'hash': row['hash'],
                'project': test_project
            })
def test_check_and_trigger_batched_success(setup_celery,
                                           test_project_labeled_and_tfidf,
                                           test_queue_labeled,
                                           test_irr_queue_labeled, test_redis,
                                           tmpdir, settings):
    project = test_project_labeled_and_tfidf
    test_queue = test_queue_labeled
    initial_training_set = project.get_current_training_set()
    initial_queue_size = test_queue.length
    model_path_temp = tmpdir.listdir()[0].mkdir('model_pickles')
    settings.MODEL_PICKLE_PATH = str(model_path_temp)

    datum = DataLabel.objects.filter(data__project=project).first().data
    check = check_and_trigger_model(datum)
    assert check == 'model running'

    # Assert model created and saved
    assert_obj_exists(Model, {'project': project})
    model = Model.objects.get(project=project)
    assert os.path.isfile(model.pickle_path)
    assert model.pickle_path == os.path.join(
        str(model_path_temp), 'project_' + str(project.pk) + '_training_' +
        str(initial_training_set.set_number) + '.pkl')

    # Assert predictions created
    predictions = DataPrediction.objects.filter(data__project=project)
    assert len(predictions) == Data.objects.filter(
        project=project, labelers=None).count() * project.labels.count()

    # Assert queue filled and redis synced
    assert (test_queue.data.count() +
            test_irr_queue_labeled.data.count()) == test_queue.length
    assert_redis_matches_db(test_redis)
    assert test_queue.length == initial_queue_size

    # Assert least confident in queue
    data_list = get_ordered_data(test_queue.data.all(), 'least confident')
    previous_lc = data_list[0].datauncertainty_set.get().least_confident
    for datum in data_list:
        assert len(datum.datalabel_set.all()) == 0
        assert_obj_exists(DataUncertainty, {'data': datum})
        assert datum.datauncertainty_set.get().least_confident <= previous_lc
        previous_lc = datum.datauncertainty_set.get().least_confident
    assert (DataQueue.objects.filter(queue=test_queue).count() +
            DataQueue.objects.filter(queue=test_irr_queue_labeled).count()
            ) == TEST_QUEUE_LEN

    # Assert new training set
    assert project.get_current_training_set() != initial_training_set
    assert (project.get_current_training_set().set_number ==
            initial_training_set.set_number + 1)
def test_fill_queue_entropy_predicted_data(test_project_predicted_data,
                                           test_queue, test_redis):
    fill_queue(test_queue, 'entropy')

    assert_redis_matches_db(test_redis)
    assert test_queue.data.count() == test_queue.length

    data_list = get_ordered_data(test_queue.data.all(), 'entropy')
    previous_e = data_list[0].datauncertainty_set.get().entropy
    for datum in data_list:
        assert len(datum.datalabel_set.all()) == 0
        assert_obj_exists(DataUncertainty, {'data': datum})
        assert datum.datauncertainty_set.get().entropy <= previous_e
        previous_e = datum.datauncertainty_set.get().entropy
def test_fill_queue_margin_sampling_predicted_data(test_project_predicted_data,
                                                   test_queue, test_redis):
    fill_queue(test_queue, 'margin sampling')

    assert_redis_matches_db(test_redis)
    assert test_queue.data.count() == test_queue.length

    data_list = get_ordered_data(test_queue.data.all(), 'margin sampling')
    previous_ms = data_list[0].datauncertainty_set.get().margin_sampling
    for datum in data_list:
        assert len(datum.datalabel_set.all()) == 0
        assert_obj_exists(DataUncertainty, {'data': datum})
        assert datum.datauncertainty_set.get().margin_sampling >= previous_ms
        previous_ms = datum.datauncertainty_set.get().margin_sampling
def test_fill_queue_least_confident_predicted_data(
    test_project_predicted_data, test_queue, test_redis
):
    fill_queue(test_queue, "least confident")

    assert_redis_matches_db(test_redis)
    assert test_queue.data.count() == test_queue.length

    data_list = get_ordered_data(test_queue.data.all(), "least confident")
    previous_lc = data_list[0].datauncertainty_set.get().least_confident
    for datum in data_list:
        assert len(datum.datalabel_set.all()) == 0
        assert_obj_exists(DataUncertainty, {"data": datum})
        assert datum.datauncertainty_set.get().least_confident <= previous_lc
        previous_lc = datum.datauncertainty_set.get().least_confident
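# The three fill_queue ordering tests above repeat the same loop: walk the queue
# in the order returned by get_ordered_data and check that the relevant
# uncertainty metric changes monotonically. A hypothetical shared helper that
# could replace those loops is sketched below; `metric_attr` and `descending`
# are illustrative parameters, not part of the project's existing utilities.
def assert_queue_ordered_by_uncertainty(queue, orderby, metric_attr,
                                        descending=True):
    """Check each queued datum is unlabeled, has a DataUncertainty row, and that
    `metric_attr` is monotonic in the order returned by get_ordered_data."""
    data_list = get_ordered_data(queue.data.all(), orderby)
    previous = getattr(data_list[0].datauncertainty_set.get(), metric_attr)
    for datum in data_list:
        assert datum.datalabel_set.count() == 0
        assert_obj_exists(DataUncertainty, {'data': datum})
        current = getattr(datum.datauncertainty_set.get(), metric_attr)
        if descending:
            assert current <= previous
        else:
            assert current >= previous
        previous = current
# Example usage mirroring the least-confident test:
#   assert_queue_ordered_by_uncertainty(test_queue, 'least confident',
#                                       'least_confident')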
def test_get_assignments_no_existing_assignment_max_queue_length(
        db, test_profile, test_project_data, test_queue, test_redis):
    fill_queue(test_queue, orderby='random')

    assert AssignedData.objects.count() == 0

    data = get_assignments(test_profile, test_project_data, TEST_QUEUE_LEN)

    assert len(data) == TEST_QUEUE_LEN
    for datum in data:
        assert isinstance(datum, Data)
        assert_obj_exists(AssignedData, {
            'data': datum,
            'profile': test_profile
        })
def test_get_assignments_no_existing_assignment_half_max_queue_length(
        db, test_profile, test_project_data, test_queue, test_redis):
    fill_queue(test_queue, orderby="random")

    assert AssignedData.objects.count() == 0

    data = get_assignments(test_profile, test_project_data, TEST_QUEUE_LEN // 2)

    assert len(data) == TEST_QUEUE_LEN // 2
    for datum in data:
        assert isinstance(datum, Data)
        assert_obj_exists(AssignedData, {
            "data": datum,
            "profile": test_profile
        })
def test_randomforest_classifier(setup_celery,
                                 test_project_randomforest_data_tfidf,
                                 test_randomforest_labels,
                                 test_randomforest_queue_list, test_profile,
                                 test_redis, tmpdir, settings):
    '''
    This tests that a project with the random forest classifier can
    successfully train and give predictions for a model.
    '''
    normal_queue, admin_queue, irr_queue = test_randomforest_queue_list
    labels = test_randomforest_labels
    project = test_project_randomforest_data_tfidf

    active_l = project.learning_method
    batch_size = project.batch_size
    initial_training_set = project.get_current_training_set()
    model_path_temp = tmpdir.listdir()[0].mkdir('model_pickles')
    settings.MODEL_PICKLE_PATH = str(model_path_temp)

    assert project.classifier == "random forest"
    assert active_l == 'least confident'

    fill_queue(normal_queue, 'random')

    assert DataQueue.objects.filter(queue=normal_queue).count() == batch_size

    for i in range(batch_size):
        datum = assign_datum(test_profile, project)
        label_data(labels[i % 3], datum, test_profile, 3)

    ret_str = check_and_trigger_model(datum)
    assert ret_str == 'model running'

    # Assert model created and saved
    assert_obj_exists(Model, {'project': project})
    model = Model.objects.get(project=project)
    assert os.path.isfile(model.pickle_path)
    assert model.pickle_path == os.path.join(
        str(model_path_temp), 'project_' + str(project.pk) + '_training_' +
        str(initial_training_set.set_number) + '.pkl')

    # Assert predictions created
    predictions = DataPrediction.objects.filter(data__project=project)
    assert len(predictions) == Data.objects.filter(
        project=project, labelers=None).count() * project.labels.count()
def test_train_and_save_model(test_project_labeled_and_tfidf, tmpdir, settings):
    project = test_project_labeled_and_tfidf

    model_path_temp = tmpdir.listdir()[0].mkdir('model_pickles')
    settings.MODEL_PICKLE_PATH = str(model_path_temp)

    model = train_and_save_model(project)

    assert isinstance(model, Model)
    assert_obj_exists(Model, {
        'pickle_path': model.pickle_path,
        'project': project
    })
    assert os.path.isfile(model.pickle_path)
    assert model.pickle_path == os.path.join(
        str(model_path_temp), 'project_' + str(project.pk) + '_training_' +
        str(project.get_current_training_set().set_number) + '.pkl')
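# Several tests above rebuild the same expected pickle path by hand with string
# concatenation. A hypothetical helper (not part of the project's utilities)
# that captures the naming convention those assertions rely on could look like
# this; it reuses the module's existing os import.
def expected_pickle_path(model_dir, project, training_set):
    """Return the path the tests expect: <dir>/project_<pk>_training_<set>.pkl."""
    filename = 'project_{0}_training_{1}.pkl'.format(
        project.pk, training_set.set_number)
    return os.path.join(str(model_dir), filename)
# Example usage in place of the inline os.path.join assertions:
#   assert model.pickle_path == expected_pickle_path(
#       model_path_temp, project, initial_training_set)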
def test_create_profile(db):
    username = "******"
    password = "******"
    email = "*****@*****.**"

    create_profile(username, password, email)

    auth_user_attrs = {
        "username": username,
        "password": password,
        "email": email
    }

    assert_obj_exists(get_user_model(), auth_user_attrs)

    auth_user = get_user_model().objects.filter(**auth_user_attrs).first()

    assert_obj_exists(Profile, {"user": auth_user})
def test_predict_data(test_project_with_trained_model, tmpdir):
    project = test_project_with_trained_model

    predictions = predict_data(project, project.model_set.get())

    # Number of unlabeled data * number of labels.
    # Each datum gets a prediction for each label.
    expected_prediction_count = project.data_set.filter(
        datalabel__isnull=True).count() * project.labels.count()
    assert len(predictions) == expected_prediction_count
    for prediction in predictions:
        assert isinstance(prediction, DataPrediction)
        assert_obj_exists(
            DataPrediction, {
                'data': prediction.data,
                'model': prediction.model,
                'label': prediction.label,
                'predicted_probability': prediction.predicted_probability
            })
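# The prediction-count assertions in this file all use the same formula:
# (number of unlabeled data) * (number of labels). A hypothetical helper
# expressing that, based only on the relations used in the test above:
def count_expected_predictions(project):
    """One DataPrediction per label for every datum without a DataLabel."""
    unlabeled = project.data_set.filter(datalabel__isnull=True).count()
    return unlabeled * project.labels.count()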
def test_add_data_with_labels(db, test_project_labels):
    test_data = read_test_data_backend(
        file='./core/data/test_files/test_some_labels.csv')
    df = add_data(test_project_labels, test_data)

    for i, row in df.iterrows():
        assert_obj_exists(
            Data, {
                'upload_id_hash': row['id_hash'],
                'hash': row['hash'],
                'project': test_project_labels
            })
        if not pd.isnull(row['Label']):
            assert_obj_exists(
                DataLabel, {
                    'data__hash': row['hash'],
                    'profile': test_project_labels.creator,
                    'label__name': row['Label']
                })
def test_label_data(db, test_profile, test_queue, test_redis):
    fill_queue(test_queue, orderby='random')

    datum = assign_datum(test_profile, test_queue.project)
    test_label = Label.objects.create(name='test', project=test_queue.project)
    label_data(test_label, datum, test_profile, 3)

    # Make sure the label was properly recorded
    assert datum in test_profile.labeled_data.all()
    assert_obj_exists(
        DataLabel, {
            'data': datum,
            'profile': test_profile,
            'label': test_label,
            'time_to_label': 3
        })

    # Make sure the assignment was removed
    assert not AssignedData.objects.filter(
        profile=test_profile, data=datum, queue=test_queue).exists()
def test_create_project(db, test_profile):
    name = 'test_project'
    create_project(name, test_profile)

    assert_obj_exists(Project, {'name': name})