Example #1
0
def test_get_current_training_set_one_training_set(test_project):
    """With a single training set, the project's current set is the newest one."""
    current = test_project.get_current_training_set()
    newest = TrainingSet.objects.filter(
        project=test_project).order_by('-set_number')[0]

    assert_obj_exists(TrainingSet, {'project': test_project, 'set_number': 0})
    assert current == newest
Example #2
0
def test_get_current_training_set_one_training_set(test_project):
    """The project's current training set is set number 0 when only one exists."""
    expected_set = (
        TrainingSet.objects.filter(project=test_project)
        .order_by("-set_number")[0]
    )

    assert_obj_exists(TrainingSet, {"project": test_project, "set_number": 0})
    assert test_project.get_current_training_set() == expected_set
def test_redis_parse_data(test_queue, test_redis):
    """A data key popped from redis parses back to a queued Data row."""
    fill_queue(test_queue, orderby="random")

    raw_key = test_redis.lpop(redis_serialize_queue(test_queue))
    parsed = redis_parse_data(raw_key)

    assert_obj_exists(Data, {"pk": parsed.pk})
    assert_obj_exists(DataQueue, {"data_id": parsed.pk})
def test_add_queue_no_profile(test_project):
    """add_queue without a profile creates a queue with profile set to None."""
    queue_len = 10
    add_queue(test_project, queue_len)
    expected = {'project': test_project, 'length': queue_len, 'profile': None}
    assert_obj_exists(Queue, expected)
def test_add_queue_profile(test_project, test_profile):
    """add_queue with a profile creates a queue bound to that profile."""
    queue_len = 10
    add_queue(test_project, queue_len, profile=test_profile)
    assert_obj_exists(
        Queue,
        {'project': test_project, 'length': queue_len, 'profile': test_profile},
    )
def test_check_and_trigger_queue_changes_success(
        setup_celery, test_project_labeled_and_tfidf, test_queue_labeled,
        test_irr_queue_labeled, test_redis, tmpdir, settings, test_profile2):
    """Triggering a model run after adding a coder retrains, predicts, refills
    the queues, and resizes the normal queue for the new coder count.

    NOTE(review): integration test driven by celery/redis fixtures; each
    assertion inspects state produced by check_and_trigger_model, so statement
    order is significant.
    """
    project = test_project_labeled_and_tfidf
    test_queue = test_queue_labeled
    initial_training_set = project.get_current_training_set()
    # Write model pickles into the pytest tmpdir so the run is isolated.
    model_path_temp = tmpdir.listdir()[0].mkdir('model_pickles')
    settings.MODEL_PICKLE_PATH = str(model_path_temp)

    # Add another user to permissions so the expected queue length changes
    ProjectPermissions.objects.create(profile=test_profile2,
                                      project=project,
                                      permission='CODER')

    datum = DataLabel.objects.filter(data__project=project).first().data
    check = check_and_trigger_model(datum)
    assert check == 'model running'

    # Assert model created and saved under the expected pickle filename
    assert_obj_exists(Model, {'project': project})
    model = Model.objects.get(project=project)
    assert os.path.isfile(model.pickle_path)
    assert model.pickle_path == os.path.join(
        str(model_path_temp), 'project_' + str(project.pk) + '_training_' +
        str(initial_training_set.set_number) + '.pkl')

    # Assert predictions created: one per unlabeled datum per label
    predictions = DataPrediction.objects.filter(data__project=project)
    assert len(predictions) == Data.objects.filter(
        project=project, labelers=None).count() * project.labels.count()

    # Assert queue filled and redis synced
    batch_size = project.batch_size
    q = project.queue_set.get(type="normal")
    q_irr = project.queue_set.get(type="irr")
    assert (q.data.count() + q_irr.data.count()) == batch_size
    assert_redis_matches_db(test_redis)

    # Queue length is recomputed from the batch size and the coder count
    # (the project creator plus every ProjectPermissions row).
    num_coders = len(project.projectpermissions_set.all()) + 1
    new_queue_length = find_queue_length(batch_size, num_coders)
    assert q.length == new_queue_length

    # Assert least confident in queue: walk the ordered data and check the
    # least_confident scores never increase
    data_list = get_ordered_data(test_queue.data.all(), 'least confident')
    previous_lc = data_list[0].datauncertainty_set.get().least_confident
    for datum in data_list:
        assert len(datum.datalabel_set.all()) == 0
        assert_obj_exists(DataUncertainty, {'data': datum})
        assert datum.datauncertainty_set.get().least_confident <= previous_lc
        previous_lc = datum.datauncertainty_set.get().least_confident
    assert (DataQueue.objects.filter(queue=test_queue).count() +
            DataQueue.objects.filter(queue=test_irr_queue_labeled).count()
            ) == batch_size

    # Assert a new training set was started (set_number incremented by one)
    assert project.get_current_training_set() != initial_training_set
    assert project.get_current_training_set(
    ).set_number == initial_training_set.set_number + 1
def test_fill_queue_random_predicted_data(test_project_predicted_data,
                                          test_queue, test_redis):
    """Random fill loads a full queue of unlabeled data with uncertainty rows."""
    fill_queue(test_queue, 'random')

    assert_redis_matches_db(test_redis)
    assert test_queue.data.count() == test_queue.length
    for item in test_queue.data.all():
        assert len(item.datalabel_set.all()) == 0
        assert_obj_exists(DataUncertainty, {'data': item})
def test_g_naivebayes_classifier(
    setup_celery,
    test_project_gnb_data_tfidf,
    test_gnb_labels,
    test_gnb_queue_list,
    test_profile,
    test_redis,
    tmpdir,
    settings,
):
    """This tests that a project with the Gaussian Naive Bayes classifier can
    successfully train and give predictions for a model.

    NOTE(review): integration test — labels a full batch, triggers the model,
    then checks the pickle on disk and the prediction counts.
    """
    normal_queue, admin_queue, irr_queue = test_gnb_queue_list
    labels = test_gnb_labels
    project = test_project_gnb_data_tfidf

    active_l = project.learning_method
    batch_size = project.batch_size
    initial_training_set = project.get_current_training_set()
    # Write model pickles into the pytest tmpdir so the run is isolated.
    model_path_temp = tmpdir.listdir()[0].mkdir("model_pickles")
    settings.MODEL_PICKLE_PATH = str(model_path_temp)

    # Sanity-check the fixture's classifier and active-learning settings.
    assert project.classifier == "gnb"
    assert active_l == "least confident"

    fill_queue(normal_queue, "random")

    assert DataQueue.objects.filter(queue=normal_queue).count() == batch_size

    # Label an entire batch, cycling through the first three labels.
    for i in range(batch_size):
        datum = assign_datum(test_profile, project)
        label_data(labels[i % 3], datum, test_profile, 3)

    ret_str = check_and_trigger_model(datum)
    assert ret_str == "model running"

    # Assert model created and saved under the expected pickle filename
    assert_obj_exists(Model, {"project": project})
    model = Model.objects.get(project=project)
    assert os.path.isfile(model.pickle_path)
    assert model.pickle_path == os.path.join(
        str(model_path_temp),
        "project_"
        + str(project.pk)
        + "_training_"
        + str(initial_training_set.set_number)
        + ".pkl",
    )

    # Assert predictions created: one per unlabeled datum per label
    predictions = DataPrediction.objects.filter(data__project=project)
    assert (
        len(predictions)
        == Data.objects.filter(project=project, labelers=None).count()
        * project.labels.count()
    )
Example #9
0
def test_redis_parse_queue(test_queue, test_redis):
    """A redis queue key parses back to the Queue object it serializes."""
    fill_queue(test_queue, orderby='random')

    serialized = next(key for key in test_redis.keys()
                      if 'queue' in key.decode())
    parsed = redis_parse_queue(serialized)

    assert parsed.pk == test_queue.pk
    assert_obj_exists(DataQueue, {'queue_id': parsed.pk})
    assert_obj_exists(Queue, {'pk': parsed.pk})
def test_redis_parse_queue(test_queue, test_redis):
    """redis_parse_queue recovers the Queue from its serialized redis key."""
    fill_queue(test_queue, orderby="random")

    queue_keys = [k for k in test_redis.keys() if "queue" in k.decode()]
    parsed = redis_parse_queue(queue_keys[0])

    assert test_queue.pk == parsed.pk
    assert_obj_exists(DataQueue, {"queue_id": parsed.pk})
    assert_obj_exists(Queue, {"pk": parsed.pk})
Example #11
0
def test_model_task(
    test_project_labeled_and_tfidf,
    test_queue_labeled,
    test_irr_queue_labeled,
    test_redis,
    tmpdir,
    settings,
):
    """Running send_model_task trains a model, writes its pickle, creates
    predictions, refills both queues, and starts the next training set.

    NOTE(review): integration test — `.delay(...).get()` blocks until the
    celery task finishes before the assertions run.
    """
    project = test_project_labeled_and_tfidf
    test_queue = test_queue_labeled
    initial_training_set = project.get_current_training_set()
    initial_queue_length = test_queue.length

    # Write model pickles into the pytest tmpdir so the run is isolated.
    model_path_temp = tmpdir.listdir()[0].mkdir("model_pickles")
    settings.MODEL_PICKLE_PATH = str(model_path_temp)

    tasks.send_model_task.delay(project.pk).get()

    # Assert model created and saved under the expected pickle filename
    assert_obj_exists(Model, {"project": project})
    model = Model.objects.get(project=project)
    assert os.path.isfile(model.pickle_path)
    assert model.pickle_path == os.path.join(
        str(model_path_temp),
        "project_" + str(project.pk) + "_training_" +
        str(initial_training_set.set_number) + ".pkl",
    )

    # Assert predictions created: one per unlabeled datum per label
    predictions = DataPrediction.objects.filter(data__project=project)
    assert (len(predictions) == Data.objects.filter(
        project=project, labelers=None).count() * project.labels.count())

    # Assert both queues are filled and redis synced
    assert (test_queue.data.count() +
            test_irr_queue_labeled.data.count()) == test_queue.length
    assert_redis_matches_db(test_redis)

    # Assert queue correct size
    assert test_queue.length == initial_queue_length

    # Assert least confident in queue: walk the ordered data and check the
    # least_confident scores never increase
    data_list = get_ordered_data(test_queue.data.all(), "least confident")
    previous_lc = data_list[0].datauncertainty_set.get().least_confident
    for datum in data_list:
        assert len(datum.datalabel_set.all()) == 0
        assert_obj_exists(DataUncertainty, {"data": datum})
        assert datum.datauncertainty_set.get().least_confident <= previous_lc
        previous_lc = datum.datauncertainty_set.get().least_confident

    # Assert a new training set was started (set_number incremented by one)
    assert project.get_current_training_set() != initial_training_set
    assert (project.get_current_training_set().set_number ==
            initial_training_set.set_number + 1)
Example #12
0
def test_get_assignments_no_existing_assignment_one_assignment(
        db, test_profile, test_project_data, test_queue, test_redis):
    """Requesting one datum with no prior assignments yields exactly one Data."""
    fill_queue(test_queue, orderby='random')

    assert AssignedData.objects.count() == 0

    assigned = get_assignments(test_profile, test_project_data, 1)

    assert len(assigned) == 1
    first = assigned[0]
    assert isinstance(first, Data)
    assert_obj_exists(AssignedData, {'data': first, 'profile': test_profile})
Example #13
0
def test_add_data_no_labels(db, test_project):
    """add_data ingests an unlabeled CSV and stores one Data row per record."""
    raw = read_test_data_backend(
        file='./core/data/test_files/test_no_labels.csv')
    df = add_data(test_project, raw)

    for _, row in df.iterrows():
        expected = {
            'upload_id_hash': row['id_hash'],
            'hash': row['hash'],
            'project': test_project,
        }
        assert_obj_exists(Data, expected)
def test_check_and_trigger_batched_success(setup_celery,
                                           test_project_labeled_and_tfidf,
                                           test_queue_labeled,
                                           test_irr_queue_labeled, test_redis,
                                           tmpdir, settings):
    """Triggering a model on a labeled batch retrains, predicts, refills the
    queues at their original size, and starts the next training set.

    NOTE(review): integration test driven by celery/redis fixtures; statement
    order is significant.
    """
    project = test_project_labeled_and_tfidf
    test_queue = test_queue_labeled
    initial_training_set = project.get_current_training_set()
    initial_queue_size = test_queue.length
    # Write model pickles into the pytest tmpdir so the run is isolated.
    model_path_temp = tmpdir.listdir()[0].mkdir('model_pickles')
    settings.MODEL_PICKLE_PATH = str(model_path_temp)

    datum = DataLabel.objects.filter(data__project=project).first().data
    check = check_and_trigger_model(datum)
    assert check == 'model running'

    # Assert model created and saved under the expected pickle filename
    assert_obj_exists(Model, {'project': project})
    model = Model.objects.get(project=project)
    assert os.path.isfile(model.pickle_path)
    assert model.pickle_path == os.path.join(
        str(model_path_temp), 'project_' + str(project.pk) + '_training_' +
        str(initial_training_set.set_number) + '.pkl')

    # Assert predictions created: one per unlabeled datum per label
    predictions = DataPrediction.objects.filter(data__project=project)
    assert len(predictions) == Data.objects.filter(
        project=project, labelers=None).count() * project.labels.count()

    # Assert queue filled and redis synced; queue size unchanged
    assert (test_queue.data.count() +
            test_irr_queue_labeled.data.count()) == test_queue.length
    assert_redis_matches_db(test_redis)
    assert test_queue.length == initial_queue_size

    # Assert least confident in queue: walk the ordered data and check the
    # least_confident scores never increase
    data_list = get_ordered_data(test_queue.data.all(), 'least confident')
    previous_lc = data_list[0].datauncertainty_set.get().least_confident
    for datum in data_list:
        assert len(datum.datalabel_set.all()) == 0
        assert_obj_exists(DataUncertainty, {'data': datum})
        assert datum.datauncertainty_set.get().least_confident <= previous_lc
        previous_lc = datum.datauncertainty_set.get().least_confident
    assert (DataQueue.objects.filter(queue=test_queue).count() +
            DataQueue.objects.filter(queue=test_irr_queue_labeled).count()
            ) == TEST_QUEUE_LEN

    # Assert a new training set was started (set_number incremented by one)
    assert project.get_current_training_set() != initial_training_set
    assert project.get_current_training_set(
    ).set_number == initial_training_set.set_number + 1
Example #15
0
def test_add_data_no_labels(db, test_project):
    """Ingesting a label-free CSV creates a Data record for every parsed row."""
    test_data = read_test_data_backend(
        file="./core/data/test_files/test_no_labels.csv")
    frame = add_data(test_project, test_data)

    for _, record in frame.iterrows():
        assert_obj_exists(Data, {
            "upload_id_hash": record["id_hash"],
            "hash": record["hash"],
            "project": test_project,
        })
def test_fill_queue_entropy_predicted_data(test_project_predicted_data,
                                           test_queue, test_redis):
    """Entropy fill loads unlabeled data whose entropy scores never increase."""
    fill_queue(test_queue, 'entropy')

    assert_redis_matches_db(test_redis)
    assert test_queue.data.count() == test_queue.length

    ordered = get_ordered_data(test_queue.data.all(), 'entropy')
    last_entropy = ordered[0].datauncertainty_set.get().entropy
    for item in ordered:
        assert len(item.datalabel_set.all()) == 0
        assert_obj_exists(DataUncertainty, {'data': item})
        current = item.datauncertainty_set.get().entropy
        assert current <= last_entropy
        last_entropy = current
def test_fill_queue_margin_sampling_predicted_data(test_project_predicted_data,
                                                   test_queue, test_redis):
    """Margin-sampling fill loads unlabeled data with non-decreasing margins."""
    fill_queue(test_queue, 'margin sampling')

    assert_redis_matches_db(test_redis)
    assert test_queue.data.count() == test_queue.length

    ranked = get_ordered_data(test_queue.data.all(), 'margin sampling')
    prev_margin = ranked[0].datauncertainty_set.get().margin_sampling
    for entry in ranked:
        assert len(entry.datalabel_set.all()) == 0
        assert_obj_exists(DataUncertainty, {'data': entry})
        margin = entry.datauncertainty_set.get().margin_sampling
        assert margin >= prev_margin
        prev_margin = margin
def test_fill_queue_least_confident_predicted_data(
    test_project_predicted_data, test_queue, test_redis
):
    """Least-confident fill loads unlabeled data with non-increasing scores."""
    fill_queue(test_queue, "least confident")

    assert_redis_matches_db(test_redis)
    assert test_queue.data.count() == test_queue.length

    ranked = get_ordered_data(test_queue.data.all(), "least confident")
    ceiling = ranked[0].datauncertainty_set.get().least_confident
    for entry in ranked:
        assert len(entry.datalabel_set.all()) == 0
        assert_obj_exists(DataUncertainty, {"data": entry})
        score = entry.datauncertainty_set.get().least_confident
        assert score <= ceiling
        ceiling = score
Example #19
0
def test_get_assignments_no_existing_assignment_max_queue_length(
        db, test_profile, test_project_data, test_queue, test_redis):
    """Requesting a full queue's worth of data assigns every queued datum."""
    fill_queue(test_queue, orderby='random')

    assert AssignedData.objects.count() == 0

    assigned = get_assignments(test_profile, test_project_data, TEST_QUEUE_LEN)

    assert len(assigned) == TEST_QUEUE_LEN
    for item in assigned:
        assert isinstance(item, Data)
        assert_obj_exists(
            AssignedData, {'data': item, 'profile': test_profile})
def test_get_assignments_no_existing_assignment_half_max_queue_length(
        db, test_profile, test_project_data, test_queue, test_redis):
    """Requesting half the queue length assigns exactly that many data."""
    fill_queue(test_queue, orderby="random")

    assert AssignedData.objects.count() == 0

    half = TEST_QUEUE_LEN // 2
    assigned = get_assignments(test_profile, test_project_data, half)

    assert len(assigned) == half
    for item in assigned:
        assert isinstance(item, Data)
        assert_obj_exists(
            AssignedData, {"data": item, "profile": test_profile})
def test_randomforest_classifier(setup_celery,
                                 test_project_randomforest_data_tfidf,
                                 test_randomforest_labels,
                                 test_randomforest_queue_list, test_profile,
                                 test_redis, tmpdir, settings):
    """This tests that a project with the random forest classifier can
    successfully train and give predictions for a model.

    NOTE(review): integration test — labels a full batch, triggers the model,
    then checks the pickle on disk and the prediction counts.
    """
    normal_queue, admin_queue, irr_queue = test_randomforest_queue_list
    labels = test_randomforest_labels
    project = test_project_randomforest_data_tfidf

    active_l = project.learning_method
    batch_size = project.batch_size
    initial_training_set = project.get_current_training_set()
    # Write model pickles into the pytest tmpdir so the run is isolated.
    model_path_temp = tmpdir.listdir()[0].mkdir('model_pickles')
    settings.MODEL_PICKLE_PATH = str(model_path_temp)

    # Sanity-check the fixture's classifier and active-learning settings.
    assert project.classifier == "random forest"
    assert active_l == 'least confident'

    fill_queue(normal_queue, 'random')

    assert DataQueue.objects.filter(queue=normal_queue).count() == batch_size

    # Label an entire batch, cycling through the first three labels.
    for i in range(batch_size):
        datum = assign_datum(test_profile, project)
        label_data(labels[i % 3], datum, test_profile, 3)

    ret_str = check_and_trigger_model(datum)
    assert ret_str == 'model running'

    # Assert model created and saved under the expected pickle filename
    assert_obj_exists(Model, {'project': project})
    model = Model.objects.get(project=project)
    assert os.path.isfile(model.pickle_path)
    assert model.pickle_path == os.path.join(
        str(model_path_temp), 'project_' + str(project.pk) + '_training_' +
        str(initial_training_set.set_number) + '.pkl')

    # Assert predictions created: one per unlabeled datum per label
    predictions = DataPrediction.objects.filter(data__project=project)
    assert len(predictions) == Data.objects.filter(
        project=project, labelers=None).count() * project.labels.count()
def test_train_and_save_model(test_project_labeled_and_tfidf, tmpdir,
                              settings):
    """train_and_save_model returns a Model whose pickle sits at the expected path."""
    project = test_project_labeled_and_tfidf

    pickle_dir = tmpdir.listdir()[0].mkdir('model_pickles')
    settings.MODEL_PICKLE_PATH = str(pickle_dir)

    model = train_and_save_model(project)

    assert isinstance(model, Model)
    assert_obj_exists(
        Model, {'pickle_path': model.pickle_path, 'project': project})
    assert os.path.isfile(model.pickle_path)
    expected_name = ('project_' + str(project.pk) + '_training_' +
                     str(project.get_current_training_set().set_number) +
                     '.pkl')
    assert model.pickle_path == os.path.join(str(pickle_dir), expected_name)
Example #23
0
def test_create_profile(db):
    """create_profile creates both an auth user and its linked Profile."""
    username = "******"
    password = "******"
    email = "*****@*****.**"

    create_profile(username, password, email)

    user_attrs = {"username": username, "password": password, "email": email}
    assert_obj_exists(get_user_model(), user_attrs)

    user = get_user_model().objects.filter(**user_attrs).first()

    assert_obj_exists(Profile, {"user": user})
Example #24
0
def test_create_profile(db):
    """A new profile registers a matching auth user and a Profile row."""
    username = '******'
    password = '******'
    email = '*****@*****.**'

    create_profile(username, password, email)

    lookup = {
        'username': username,
        'password': password,
        'email': email,
    }

    assert_obj_exists(get_user_model(), lookup)

    matched_user = get_user_model().objects.filter(**lookup).first()

    assert_obj_exists(Profile, {'user': matched_user})
def test_predict_data(test_project_with_trained_model, tmpdir):
    """predict_data returns one DataPrediction per unlabeled datum per label."""
    project = test_project_with_trained_model

    predictions = predict_data(project, project.model_set.get())

    # Each unlabeled datum receives one prediction for each label.
    unlabeled_count = project.data_set.filter(datalabel__isnull=True).count()
    expected_count = unlabeled_count * project.labels.count()
    assert len(predictions) == expected_count

    for pred in predictions:
        assert isinstance(pred, DataPrediction)
        assert_obj_exists(DataPrediction, {
            'data': pred.data,
            'model': pred.model,
            'label': pred.label,
            'predicted_probability': pred.predicted_probability,
        })
Example #26
0
def test_add_data_with_labels(db, test_project_labels):
    """add_data stores every row and a DataLabel for each labeled row."""
    raw = read_test_data_backend(
        file='./core/data/test_files/test_some_labels.csv')
    df = add_data(test_project_labels, raw)

    for _, row in df.iterrows():
        assert_obj_exists(Data, {
            'upload_id_hash': row['id_hash'],
            'hash': row['hash'],
            'project': test_project_labels,
        })
        # Only rows that came in with a label are checked for a DataLabel.
        if not pd.isnull(row['Label']):
            assert_obj_exists(DataLabel, {
                'data__hash': row['hash'],
                'profile': test_project_labels.creator,
                'label__name': row['Label'],
            })
Example #27
0
def test_label_data(db, test_profile, test_queue, test_redis):
    """Labeling an assigned datum records a DataLabel and clears the assignment."""
    fill_queue(test_queue, orderby='random')

    datum = assign_datum(test_profile, test_queue.project)
    new_label = Label.objects.create(name='test', project=test_queue.project)
    label_data(new_label, datum, test_profile, 3)

    # The label is recorded against the profile with its labeling time...
    assert datum in test_profile.labeled_data.all()
    assert_obj_exists(DataLabel, {
        'data': datum,
        'profile': test_profile,
        'label': new_label,
        'time_to_label': 3,
    })

    # ...and the pending assignment is removed.
    assert not AssignedData.objects.filter(
        profile=test_profile, data=datum, queue=test_queue).exists()
def test_label_data(db, test_profile, test_queue, test_redis):
    """label_data stores the label with its timing and removes the assignment."""
    fill_queue(test_queue, orderby="random")

    assigned = assign_datum(test_profile, test_queue.project)
    label = Label.objects.create(name="test", project=test_queue.project)
    label_data(label, assigned, test_profile, 3)

    # Label recorded for the profile, including the time it took to label.
    assert assigned in test_profile.labeled_data.all()
    expected = {
        "data": assigned,
        "profile": test_profile,
        "label": label,
        "time_to_label": 3,
    }
    assert_obj_exists(DataLabel, expected)

    # The datum is no longer assigned to the profile.
    remaining = AssignedData.objects.filter(
        profile=test_profile, data=assigned, queue=test_queue)
    assert not remaining.exists()
Example #29
0
def test_add_data_with_labels(db, test_project_labels):
    """Every ingested row becomes a Data record; labeled rows get a DataLabel."""
    loaded = read_test_data_backend(
        file="./core/data/test_files/test_some_labels.csv")
    result = add_data(test_project_labels, loaded)

    for _, entry in result.iterrows():
        assert_obj_exists(
            Data,
            {
                "upload_id_hash": entry["id_hash"],
                "hash": entry["hash"],
                "project": test_project_labels,
            },
        )
        if pd.isnull(entry["Label"]):
            continue
        assert_obj_exists(
            DataLabel,
            {
                "data__hash": entry["hash"],
                "profile": test_project_labels.creator,
                "label__name": entry["Label"],
            },
        )
Example #30
0
def test_create_project(db, test_profile):
    """create_project persists a Project with the given name."""
    project_name = 'test_project'
    create_project(project_name, test_profile)

    assert_obj_exists(Project, {'name': project_name})