Ejemplo n.º 1
0
def test_get_labeled_data(setup_celery, test_profile, test_project_labeled,
                          test_queue_labeled, test_irr_queue_labeled,
                          test_admin_queue_labeled, test_redis, tmpdir,
                          settings):
    '''
    Verify that get_labeled_data returns DataFrames whose sizes and IDs
    agree with the Label and DataLabel rows stored for the project.
    '''
    project = test_project_labeled
    fill_queue(test_queue_labeled, 'random', test_irr_queue_labeled,
               project.percentage_irr, project.batch_size)

    # Util call under test: returns (labeled data, labels)
    labeled_data, labels = get_labeled_data(project)
    for frame in (labeled_data, labels):
        assert isinstance(frame, pd.DataFrame)

    # One returned label row per Label defined on the project
    expected_labels = Label.objects.filter(project=project)
    assert len(labels) == len(expected_labels)

    # One returned data row per DataLabel attached to the project's data
    stored_data_labels = DataLabel.objects.filter(data__project=project)
    assert len(labeled_data) == len(stored_data_labels)

    # Every returned row's ID must match an upload_id recorded in DataLabel
    stored_ids = set(
        stored_data_labels.values_list("data__upload_id", flat=True))
    returned_ids = set(labeled_data["ID"].tolist())
    assert len(stored_ids & returned_ids) == len(labeled_data)
Ejemplo n.º 2
0
def download_data(request, project_pk):
    """Serve a project's labeled data as a CSV attachment.

    Args:
        request: The incoming request (not inspected by this view)
        project_pk: Primary key of the project
    Returns:
        an HttpResponse with content type text/csv holding a header row
        (ID, Text, Label) followed by one fully-quoted row per record
    """
    labeled_frame, _ = get_labeled_data(Project.objects.get(pk=project_pk))
    records = labeled_frame.to_dict("records")

    csv_buffer = io.StringIO()
    writer = csv.DictWriter(
        csv_buffer,
        fieldnames=['ID', 'Text', 'Label'],
        quoting=csv.QUOTE_ALL,
    )
    writer.writeheader()
    for record in records:
        writer.writerow(record)

    # Rewind so the response streams the buffer from its first line
    csv_buffer.seek(0)

    response = HttpResponse(csv_buffer, content_type='text/csv')
    response['Content-Disposition'] = 'attachment;'
    return response
Ejemplo n.º 3
0
def _dump_frame_to_temp_csv(frame, directory):
    """Write *frame* (without its index) to a new temp CSV in *directory*.

    Returns the path of the created file; the caller is responsible for
    deleting it.
    """
    handle = tempfile.NamedTemporaryFile(mode='w',
                                         suffix=".csv",
                                         delete=False,
                                         dir=directory)
    # Only the reserved name is needed: to_csv reopens the file by path, so
    # release our own handle first rather than holding two handles at once.
    handle.close()
    try:
        frame.to_csv(handle.name, index=False)
    except Exception:
        # Do not leave a half-written file behind if the export fails
        os.remove(handle.name)
        raise
    return handle.name


def download_model(request, project_pk):
    """Bundle a project's model files and labeled data into a zip download.

    The archive holds, under the folder ``model_project<pk>``: the TF-IDF
    matrix and vectorizer pickles, the README, the model pickle for training
    set ``set_number - 1`` of the current training set, CSV exports of the
    labeled data and the labels, and the Dockerfile, requirements file,
    start script and usage notebook from ``core/data``.

    Args:
        request: The incoming request (not inspected by this view)
        project_pk: Primary key of the project
    Returns:
        an HttpResponse containing the zip archive as an attachment
    """
    project = Project.objects.get(pk=project_pk)

    # https://stackoverflow.com/questions/12881294/django-create-a-zip-of-multiple-files-and-make-it-downloadable
    zip_subdir = 'model_project' + str(project_pk)
    project_prefix = 'project_' + str(project_pk)
    static_dir = os.path.join(settings.BASE_DIR, 'core', 'data')

    tfidf_path = os.path.join(settings.TF_IDF_PATH,
                              project_prefix + '_tfidf_matrix.pkl')
    tfidf_vectorizer_path = os.path.join(settings.TF_IDF_PATH,
                                         project_prefix + '_vectorizer.pkl')
    readme_path = os.path.join(static_dir, 'README.pdf')
    dockerfile_path = os.path.join(static_dir, 'Dockerfile')
    requirements_path = os.path.join(static_dir, 'requirements.txt')
    start_script_path = os.path.join(static_dir, 'start_notebook.sh')
    usage_examples_path = os.path.join(static_dir, 'UsageExamples.ipynb')

    current_training_set = project.get_current_training_set()
    model_path = os.path.join(
        settings.MODEL_PICKLE_PATH, project_prefix + '_training_' +
        str(current_training_set.set_number - 1) + '.pkl')

    data, label_data = get_labeled_data(project)

    archive_buffer = io.BytesIO()
    temp_paths = []
    try:
        labeled_data_csv = _dump_frame_to_temp_csv(data, settings.DATA_DIR)
        temp_paths.append(labeled_data_csv)
        labels_csv = _dump_frame_to_temp_csv(label_data, settings.DATA_DIR)
        temp_paths.append(labels_csv)

        # (path on disk, name inside the archive); None keeps the file's
        # own basename. The temp CSVs get stable, descriptive names.
        members = [
            (tfidf_path, None),
            (tfidf_vectorizer_path, None),
            (readme_path, None),
            (model_path, None),
            (labeled_data_csv, project_prefix + "_labeled_data.csv"),
            (labels_csv, project_prefix + "_labels.csv"),
            (dockerfile_path, None),
            (requirements_path, None),
            (start_script_path, None),
            (usage_examples_path, None),
        ]

        # The context manager finalizes the archive even if a write fails
        with zipfile.ZipFile(archive_buffer, "w") as zip_file:
            for path, archive_name in members:
                if archive_name is None:
                    archive_name = os.path.basename(path)
                zip_file.write(path, os.path.join(zip_subdir, archive_name))
    finally:
        # The temp files were created with delete=False, so remove them
        # explicitly; otherwise every download leaves files in DATA_DIR.
        for temp_path in temp_paths:
            if os.path.exists(temp_path):
                os.remove(temp_path)

    response = HttpResponse(archive_buffer.getvalue(),
                            content_type="application/x-zip-compressed")
    response['Content-Disposition'] = 'attachment;'

    return response