Code example #1
def test_queue_refill(setup_celery, test_project_data, test_all_queues,
                      test_profile, test_labels, test_redis, tmpdir, settings):
    '''
    Check that the queues refill the way they should.

    Have one person label everything in a batch, then check that the normal
    queue refills and that the IRR queue now holds twice irr% * batch_size.
    '''
    project = test_project_data
    normal_queue, admin_queue, irr_queue = test_all_queues
    fill_queue(normal_queue, 'random', irr_queue, project.percentage_irr,
               project.batch_size)

    irr_count = math.ceil((project.percentage_irr / 100) * project.batch_size)
    non_irr_count = math.ceil(
        ((100 - project.percentage_irr) / 100) * project.batch_size)

    for i in range(non_irr_count):
        datum = assign_datum(test_profile, project, "normal")
        assert datum is not None
        label_data(test_labels[0], datum, test_profile, 3)
        check_and_trigger_model(datum, test_profile)
    for i in range(irr_count):
        datum = assign_datum(test_profile, project, "irr")
        assert datum is not None
        label_data(test_labels[0], datum, test_profile, 3)
        check_and_trigger_model(datum, test_profile)
    assert DataQueue.objects.filter(
        queue=normal_queue).count() == non_irr_count
    assert DataQueue.objects.filter(queue=irr_queue).count() == irr_count * 2
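The irr_count / non_irr_count arithmetic above determines how a batch is split between the IRR and normal queues. A minimal standalone sketch of the same split, using hypothetical values (percentage_irr = 50, batch_size = 30) chosen only for illustration:

import math

# Hypothetical project settings, for illustration only.
percentage_irr = 50
batch_size = 30

# The same split the test computes: round each portion up with math.ceil.
irr_count = math.ceil((percentage_irr / 100) * batch_size)              # 15
non_irr_count = math.ceil(((100 - percentage_irr) / 100) * batch_size)  # 15

# After one coder labels the whole batch, the test expects the normal queue
# to refill to non_irr_count and the IRR queue to hold irr_count * 2 (the
# still-pending IRR data plus the refilled IRR portion).
print(irr_count, non_irr_count, irr_count * 2)  # 15 15 30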
Code example #2
def test_check_and_trigger_queue_changes_success(
        setup_celery, test_project_labeled_and_tfidf, test_queue_labeled,
        test_irr_queue_labeled, test_redis, tmpdir, settings, test_profile2):
    project = test_project_labeled_and_tfidf
    test_queue = test_queue_labeled
    initial_training_set = project.get_current_training_set()
    model_path_temp = tmpdir.listdir()[0].mkdir('model_pickles')
    settings.MODEL_PICKLE_PATH = str(model_path_temp)

    # Add another user to permissions
    ProjectPermissions.objects.create(profile=test_profile2,
                                      project=project,
                                      permission='CODER')

    datum = DataLabel.objects.filter(data__project=project).first().data
    check = check_and_trigger_model(datum)
    assert check == 'model running'

    # Assert model created and saved
    assert_obj_exists(Model, {'project': project})
    model = Model.objects.get(project=project)
    assert os.path.isfile(model.pickle_path)
    assert model.pickle_path == os.path.join(
        str(model_path_temp), 'project_' + str(project.pk) + '_training_' +
        str(initial_training_set.set_number) + '.pkl')

    # Assert predictions created
    predictions = DataPrediction.objects.filter(data__project=project)
    assert len(predictions) == Data.objects.filter(
        project=project, labelers=None).count() * project.labels.count()

    # Assert queue filled and redis synced
    batch_size = project.batch_size
    q = project.queue_set.get(type="normal")
    q_irr = project.queue_set.get(type="irr")
    assert (q.data.count() + q_irr.data.count()) == batch_size
    assert_redis_matches_db(test_redis)

    num_coders = len(project.projectpermissions_set.all()) + 1
    new_queue_length = find_queue_length(batch_size, num_coders)
    assert q.length == new_queue_length

    # Assert least confident in queue
    data_list = get_ordered_data(test_queue.data.all(), 'least confident')
    previous_lc = data_list[0].datauncertainty_set.get().least_confident
    for datum in data_list:
        assert len(datum.datalabel_set.all()) == 0
        assert_obj_exists(DataUncertainty, {'data': datum})
        assert datum.datauncertainty_set.get().least_confident <= previous_lc
        previous_lc = datum.datauncertainty_set.get().least_confident
    assert (DataQueue.objects.filter(queue=test_queue).count() +
            DataQueue.objects.filter(queue=test_irr_queue_labeled).count()
            ) == batch_size

    # Assert new training set
    assert project.get_current_training_set() != initial_training_set
    assert (project.get_current_training_set().set_number ==
            initial_training_set.set_number + 1)
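Several of these tests assert the same on-disk naming convention for saved models: a pickle named project_<pk>_training_<set_number>.pkl under MODEL_PICKLE_PATH. A small sketch of that path construction, with placeholder values for the project pk and training set number:

import os

# Placeholder values, for illustration only.
model_pickle_path = '/tmp/model_pickles'
project_pk = 7
set_number = 0

# The naming convention the assertions above check against.
pickle_path = os.path.join(
    model_pickle_path,
    'project_' + str(project_pk) + '_training_' + str(set_number) + '.pkl')
print(pickle_path)  # /tmp/model_pickles/project_7_training_0.pkl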
Code example #3
def label_admin_label(request, data_pk):
    """This is called when an admin manually labels a datum on the admin annotation
    page. It labels a single datum with the given label and profile, with null as the
    time.

    Args:
        request: The POST request
        data_pk: Primary key of the data
    Returns:
        {}
    """
    datum = Data.objects.get(pk=data_pk)
    project = datum.project
    label = Label.objects.get(pk=request.data["labelID"])
    profile = request.user.profile
    response = {}

    current_training_set = project.get_current_training_set()

    with transaction.atomic():
        queue = project.queue_set.get(type="admin")
        DataLabel.objects.create(
            data=datum,
            label=label,
            profile=profile,
            training_set=current_training_set,
            time_to_label=None,
            timestamp=timezone.now(),
        )

        DataQueue.objects.filter(data=datum, queue=queue).delete()

        # update redis
        settings.REDIS.srem(redis_serialize_set(queue),
                            redis_serialize_data(datum))

        # make sure the data is no longer irr
        if datum.irr_ind:
            Data.objects.filter(pk=datum.pk).update(irr_ind=False)

    # NOTE: this checks if the model needs to be triggered, but not if the
    # queues need to be refilled. This is because for something to be in the
    # admin queue, annotate or skip would have already checked for an empty queue
    check_and_trigger_model(datum)
    return Response(response)
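The redis update above leans on two project helpers, redis_serialize_set and redis_serialize_data, whose implementations are not shown here. The sketch below is only a guess at the kind of key/member scheme such helpers might use, to make the srem call concrete; the real SMART implementations may differ.

# Hypothetical sketches of the serializer helpers -- assumed, not confirmed.
def redis_serialize_set(queue):
    # Assumed convention: one redis set per queue, keyed by queue pk.
    return 'set:' + str(queue.pk)

def redis_serialize_data(datum):
    # Assumed convention: set members identified by data pk.
    return 'data:' + str(datum.pk)

# With a scheme like this, the view's srem removes the datum's member from
# the queue's redis set, mirroring the DataQueue row deletion above.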
Code example #4
File: api_annotate.py Project: taggsoft/SMART
def skip_data(request, data_pk):
    """Take a datum that is in the assigneddata queue for that user
    and place it in the admin queue. Remove it from the
    assignedData queue.

    Args:
        request: The POST request
        data_pk: Primary key of the data
    Returns:
        {}
    """
    data = Data.objects.get(pk=data_pk)
    profile = request.user.profile
    project = data.project
    response = {}

    # if the data is IRR or processed IRR, don't add to admin queue yet
    num_history = IRRLog.objects.filter(data=data).count()

    if RecycleBin.objects.filter(data=data).count() > 0:
        assignment = AssignedData.objects.get(data=data, profile=profile)
        assignment.delete()
    elif data.irr_ind or num_history > 0:
        # unassign the skipped item
        assignment = AssignedData.objects.get(data=data, profile=profile)
        assignment.delete()

        # log the data and check IRR but don't put in admin queue yet
        IRRLog.objects.create(data=data,
                              profile=profile,
                              label=None,
                              timestamp=timezone.now())
        # if the IRR history has more than the needed number of labels, it is
        # already processed, so don't do anything else
        if num_history <= project.num_users_irr:
            process_irr_label(data, None)
    else:
        # the data is not IRR so treat it as normal
        move_skipped_to_admin_queue(data, profile, project)

    # for all data, check if we need to refill queue
    check_and_trigger_model(data, profile)

    return Response(response)
Code example #5
File: api_annotate.py Project: taggsoft/SMART
def annotate_data(request, data_pk):
    """Annotate a single datum which is in the assigneddata queue given the user,
       data_id, and label_id.  This will remove it from assigneddata, remove it
       from dataqueue and add it to labeleddata.  Also check if project is ready
       to have model run, if so start that process.

    Args:
        request: The POST request
        data_pk: Primary key of the data
    Returns:
        {}
    """
    data = Data.objects.get(pk=data_pk)
    project = data.project
    profile = request.user.profile
    response = {}
    label = Label.objects.get(pk=request.data['labelID'])
    labeling_time = request.data['labeling_time']

    num_history = IRRLog.objects.filter(data=data).count()

    if RecycleBin.objects.filter(data=data).count() > 0:
        # this datum is no longer in use; just delete the assignment
        assignment = AssignedData.objects.get(data=data, profile=profile)
        assignment.delete()
    elif num_history >= project.num_users_irr:
        # if the IRR history already has the needed number of labels, it is
        # already processed, so just add this label to the history.
        IRRLog.objects.create(data=data,
                              profile=profile,
                              label=label,
                              timestamp=timezone.now())
        assignment = AssignedData.objects.get(data=data, profile=profile)
        assignment.delete()
    else:
        label_data(label, data, profile, labeling_time)
        if data.irr_ind:
            # if it is reliability data, run processing step
            process_irr_label(data, label)

    # for all data, check if we need to refill queue
    check_and_trigger_model(data, profile)

    return Response(response)
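A hedged sketch of exercising this view from a test. The URL route and client fixture are assumptions, not taken from SMART; the labelID and labeling_time keys, however, match what the view reads from request.data:

# Hypothetical test sketch -- the route below is assumed for illustration.
def test_annotate_data_sketch(client, datum, label):
    response = client.post(
        '/api/annotate_data/{}/'.format(datum.pk),  # assumed URL pattern
        {'labelID': label.pk, 'labeling_time': 3},
    )
    assert response.status_code == 200
    assert response.json() == {}  # the view returns an empty dict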
Code example #6
def test_g_naivebayes_classifier(
    setup_celery,
    test_project_gnb_data_tfidf,
    test_gnb_labels,
    test_gnb_queue_list,
    test_profile,
    test_redis,
    tmpdir,
    settings,
):
    """This tests that a project with the Gaussian Naiive Bayes classifier can
    successfully train and give predictions for a model."""
    normal_queue, admin_queue, irr_queue = test_gnb_queue_list
    labels = test_gnb_labels
    project = test_project_gnb_data_tfidf

    active_l = project.learning_method
    batch_size = project.batch_size
    initial_training_set = project.get_current_training_set()
    model_path_temp = tmpdir.listdir()[0].mkdir("model_pickles")
    settings.MODEL_PICKLE_PATH = str(model_path_temp)

    assert project.classifier == "gnb"
    assert active_l == "least confident"

    fill_queue(normal_queue, "random")

    assert DataQueue.objects.filter(queue=normal_queue).count() == batch_size

    for i in range(batch_size):
        datum = assign_datum(test_profile, project)
        label_data(labels[i % 3], datum, test_profile, 3)

    ret_str = check_and_trigger_model(datum)
    assert ret_str == "model running"

    # Assert model created and saved
    assert_obj_exists(Model, {"project": project})
    model = Model.objects.get(project=project)
    assert os.path.isfile(model.pickle_path)
    assert model.pickle_path == os.path.join(
        str(model_path_temp),
        "project_"
        + str(project.pk)
        + "_training_"
        + str(initial_training_set.set_number)
        + ".pkl",
    )

    # Assert predictions created
    predictions = DataPrediction.objects.filter(data__project=project)
    assert (
        len(predictions)
        == Data.objects.filter(project=project, labelers=None).count()
        * project.labels.count()
    )
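The prediction-count assertion encodes a simple invariant: the model produces one DataPrediction per (unlabeled datum, label) pair. With hypothetical counts, say 100 unlabeled data and 3 labels:

# Hypothetical counts, for illustration only.
unlabeled_count = 100  # Data.objects.filter(project=project, labelers=None).count()
label_count = 3        # project.labels.count()

# One DataPrediction row per (datum, label) pair.
expected_predictions = unlabeled_count * label_count
print(expected_predictions)  # 300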
Code example #7
def test_check_and_trigger_batched_success(setup_celery,
                                           test_project_labeled_and_tfidf,
                                           test_queue_labeled,
                                           test_irr_queue_labeled, test_redis,
                                           tmpdir, settings):
    project = test_project_labeled_and_tfidf
    test_queue = test_queue_labeled
    initial_training_set = project.get_current_training_set()
    initial_queue_size = test_queue.length
    model_path_temp = tmpdir.listdir()[0].mkdir('model_pickles')
    settings.MODEL_PICKLE_PATH = str(model_path_temp)

    datum = DataLabel.objects.filter(data__project=project).first().data
    check = check_and_trigger_model(datum)
    assert check == 'model running'

    # Assert model created and saved
    assert_obj_exists(Model, {'project': project})
    model = Model.objects.get(project=project)
    assert os.path.isfile(model.pickle_path)
    assert model.pickle_path == os.path.join(
        str(model_path_temp), 'project_' + str(project.pk) + '_training_' +
        str(initial_training_set.set_number) + '.pkl')

    # Assert predictions created
    predictions = DataPrediction.objects.filter(data__project=project)
    assert len(predictions) == Data.objects.filter(
        project=project, labelers=None).count() * project.labels.count()

    # Assert queue filled and redis synced
    assert (test_queue.data.count() +
            test_irr_queue_labeled.data.count()) == test_queue.length
    assert_redis_matches_db(test_redis)
    assert test_queue.length == initial_queue_size

    # Assert least confident in queue
    data_list = get_ordered_data(test_queue.data.all(), 'least confident')
    previous_lc = data_list[0].datauncertainty_set.get().least_confident
    for datum in data_list:
        assert len(datum.datalabel_set.all()) == 0
        assert_obj_exists(DataUncertainty, {'data': datum})
        assert datum.datauncertainty_set.get().least_confident <= previous_lc
        previous_lc = datum.datauncertainty_set.get().least_confident
    assert (DataQueue.objects.filter(queue=test_queue).count() +
            DataQueue.objects.filter(queue=test_irr_queue_labeled).count()
            ) == TEST_QUEUE_LEN

    # Assert new training set
    assert project.get_current_training_set() != initial_training_set
    assert (project.get_current_training_set().set_number ==
            initial_training_set.set_number + 1)
Code example #8
def test_randomforest_classifier(setup_celery,
                                 test_project_randomforest_data_tfidf,
                                 test_randomforest_labels,
                                 test_randomforest_queue_list, test_profile,
                                 test_redis, tmpdir, settings):
    '''
    This tests that a project with the random forest classifier can
    successfully train and give predictions for a model.
    '''
    normal_queue, admin_queue, irr_queue = test_randomforest_queue_list
    labels = test_randomforest_labels
    project = test_project_randomforest_data_tfidf

    active_l = project.learning_method
    batch_size = project.batch_size
    initial_training_set = project.get_current_training_set()
    model_path_temp = tmpdir.listdir()[0].mkdir('model_pickles')
    settings.MODEL_PICKLE_PATH = str(model_path_temp)

    assert project.classifier == "random forest"
    assert active_l == 'least confident'

    fill_queue(normal_queue, 'random')

    assert DataQueue.objects.filter(queue=normal_queue).count() == batch_size

    for i in range(batch_size):
        datum = assign_datum(test_profile, project)
        label_data(labels[i % 3], datum, test_profile, 3)

    ret_str = check_and_trigger_model(datum)
    assert ret_str == 'model running'

    # Assert model created and saved
    assert_obj_exists(Model, {'project': project})
    model = Model.objects.get(project=project)
    assert os.path.isfile(model.pickle_path)
    assert model.pickle_path == os.path.join(
        str(model_path_temp), 'project_' + str(project.pk) + '_training_' +
        str(initial_training_set.set_number) + '.pkl')

    # Assert predictions created
    predictions = DataPrediction.objects.filter(data__project=project)
    assert len(predictions) == Data.objects.filter(
        project=project, labelers=None).count() * project.labels.count()
Code example #9
def test_check_and_trigger_model_first_labeled(
    setup_celery, test_project_data, test_labels, test_queue, test_profile
):
    initial_training_set = test_project_data.get_current_training_set()

    fill_queue(test_queue, orderby="random")

    datum = assign_datum(test_profile, test_queue.project)
    test_label = test_labels[0]
    label_data(test_label, datum, test_profile, 3)

    check = check_and_trigger_model(datum)
    assert check == "no trigger"

    assert test_project_data.get_current_training_set() == initial_training_set
    assert test_project_data.model_set.count() == 0
    assert DataPrediction.objects.filter(data__project=test_project_data).count() == 0
    assert DataUncertainty.objects.filter(data__project=test_project_data).count() == 0
    assert DataQueue.objects.filter(queue=test_queue).count() == TEST_QUEUE_LEN - 1
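Taken together, these tests pin down check_and_trigger_model's return values: a single new label yields 'no trigger' (code example #9), while a fully labeled batch yields 'model running' (examples #6 and #8). Below is a deliberately simplified sketch of that decision, assuming the trigger condition is just "a full batch of new labels since the last training set"; the real function also starts the training task and likely checks more than this.

# Simplified, assumption-laden sketch of the trigger decision.
def check_and_trigger_model_sketch(project, new_label_count):
    if new_label_count < project.batch_size:
        # Not enough new labels to retrain yet.
        return 'no trigger'
    # A full batch has been labeled; training would be kicked off here.
    return 'model running'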
Code example #10
def send_check_and_trigger_model_task(project_pk):
    from core.models import Data
    from core.utils.utils_model import check_and_trigger_model

    datum = Data.objects.filter(project=project_pk).first()
    check_and_trigger_model(datum)
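The imports here are local to the function, a common pattern for avoiding circular imports between a tasks module and core.models. How the helper is invoked is not shown above; the sketch below assumes it is registered as a Celery task, which the snippet does not confirm.

# Hypothetical invocation -- assumes the helper is decorated as a Celery
# task (e.g. @app.task), which is not shown in the snippet above.
#
#   send_check_and_trigger_model_task.delay(project_pk)  # queue asynchronously
#   send_check_and_trigger_model_task(project_pk)        # call inline in tests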