def test_add_and_delete_documents():
    """Add two documents to the trial collection, bulk-delete them, and check the response."""
    client = common.login_with_test_user(common.client())
    collection_id = common.get_collection_id(client, "Trial Collection")
    assert collection_id is not None
    # add two throwaway documents so the bulk delete has multiple targets
    added_doc_id = client.add_document(collection_id=collection_id,
                                       text="This is a test document to be deleted.")
    assert added_doc_id is not None
    added_doc_id_2 = client.add_document(collection_id=collection_id,
                                         text="This is a test document to be deleted.")
    assert added_doc_id_2 is not None
    delete_resp = client.delete_documents([added_doc_id, added_doc_id_2])
    assert delete_resp is not None
    assert isinstance(delete_resp, dict)
    assert delete_resp["success"]
    changed_objs = delete_resp["changed_objs"]
    # the new documents were never annotated, so no annotations should be deleted
    assert len(changed_objs["annotations"]["deleted"]) == 0
    assert added_doc_id in changed_objs["documents"]["deleted"]
    assert added_doc_id_2 in changed_objs["documents"]["deleted"]
def test_get_classifier_status():
    """Fetch the classifier status of every collection and validate its structure."""
    client = common.login_with_test_user(common.client())
    for col in client.list_collections():
        clf = client.get_collection_classifier(col["_id"])
        clf_status = client.get_classifier_status(clf["_id"])
        _check_pipeline_status(clf_status, clf["_id"])
def test_list_collections():
    """List collections with and without archived ones and sanity-check the results."""
    client = common.login_with_test_user(common.client())
    unarchived_collections = client.list_collections(False)
    assert isinstance(unarchived_collections, list)
    assert len(unarchived_collections) > 0
    all_collections = client.list_collections(True)
    assert isinstance(all_collections, list)
    assert len(all_collections) >= len(unarchived_collections)
    # make sure every collection has some expected properties
    for col in all_collections + unarchived_collections:
        for key in ("_id", "annotators", "viewers", "configuration", "metadata"):
            assert key in col
    # compare by ID sets instead of the original O(n^2) nested scans
    all_ids = {col["_id"] for col in all_collections}
    unarchived_ids = {col["_id"] for col in unarchived_collections}
    # make sure all_collections is a superset of unarchived_collections
    assert unarchived_ids <= all_ids
    # anything in "all" that's not in "unarchived" must be archived, and vice versa
    for col in all_collections:
        assert col["archived"] == (col["_id"] not in unarchived_ids)
def test_login_and_logout():
    """Exercise invalid login, login by user ID, login by email, and logout."""
    client = common.client()
    assert not client.is_logged_in()
    # invalid login should raise and leave the client logged out
    with pytest.raises(pine.client.exceptions.PineClientAuthException):
        client.login_eve("asdf", "asdf")
    assert not client.is_logged_in()
    # valid login with ID
    user = common.test_user_data()[0]
    try:
        client.login_eve(user["_id"], user["password"])
        assert client.is_logged_in()
        assert client.get_my_user_id() == user["_id"]
        client_user = client.get_logged_in_user()
        assert client_user is not None
        for key in ("display_name", "is_admin", "username", "id"):
            assert key in client_user
    finally:
        client.logout()
    assert not client.is_logged_in()
    # valid login with email
    try:
        client.login_eve(user["email"], user["password"])
        assert client.is_logged_in()
        assert client.get_my_user_id() == user["_id"]
    finally:
        client.logout()
    assert not client.is_logged_in()
def test_collection_user_permissions():
    """Verify the per-collection permission sets for the test user on two collections."""
    client = common.login_with_test_user(common.client())

    # collection where the test user has limited (annotate-only) rights
    collection_id = _get_collection_id("NER Test Collection", client)
    assert collection_id is not None
    permissions = client.get_collection_permissions(collection_id)
    assert permissions.to_dict() == {
        "view": True,
        "annotate": True,
        "add_documents": True,
        "add_images": True,
        "modify_users": False,
        "modify_labels": False,
        "modify_document_metadata": True,
        "download_data": False,
        "archive": False
    }

    # collection where the test user has full control
    collection_id = _get_collection_id("Trial Collection", client)
    assert collection_id is not None
    permissions = client.get_collection_permissions(collection_id)
    assert permissions.to_dict() == {
        "view": True,
        "annotate": True,
        "add_documents": True,
        "add_images": True,
        "modify_users": True,
        "modify_labels": True,
        "modify_document_metadata": True,
        "download_data": True,
        "archive": True
    }
def test_delete_document_not_allowed():
    """Deleting a nonexistent document should raise an HTTP error."""
    client = common.login_with_test_user(common.client())
    collection_id = common.get_collection_id(client, "Trial Collection")
    assert collection_id is not None
    with pytest.raises(pine.client.exceptions.PineClientHttpException):
        client.delete_document("nonexistent")
def test_get_collection_classifier():
    """Every collection's classifier should point back at its collection and a pipeline."""
    client = common.login_with_test_user(common.client())
    # make sure that a correct classifier is returned for each collection
    for collection in client.list_collections():
        classifier = client.get_collection_classifier(collection["_id"])
        assert isinstance(classifier, dict)
        assert classifier["collection_id"] == collection["_id"]
        assert classifier["pipeline_id"] is not None
def test_download_collection_data_errors():
    """Download should reject a null collection ID and unauthorized access."""
    # find a collection that has annotations that the test user does NOT have access to
    user = "******"
    collection_title = "NER Test Collection"
    client = common.login_with_user(user, common.client())
    with pytest.raises(pine.client.exceptions.PineClientValueException):
        client.download_collection_data(None)
    collection_id = _get_collection_id(collection_title, client)
    assert collection_id is not None
    with pytest.raises(pine.client.exceptions.PineClientHttpException) as excinfo:
        client.download_collection_data(collection_id)
    # server should answer 401 Unauthorized for a collection this user can't download
    assert excinfo.value.status_code == 401
def test_get_pipelines():
    """The server should return exactly the imported test pipelines, unchanged."""
    client = common.login_with_test_user(common.client())
    # index expected pipelines by ID to avoid repeated linear searches
    imported_pipelines = {p["_id"]: p for p in common.test_pipeline_data()}
    pipelines = client.get_pipelines()
    actual_pipeline_ids = {p["_id"] for p in pipelines}
    assert set(imported_pipelines) == actual_pipeline_ids
    # every imported property must round-trip unchanged
    for pipeline in pipelines:
        expected = imported_pipelines[pipeline["_id"]]
        for key, value in expected.items():
            assert pipeline[key] == value
def test_collection_creation_and_get_and_archive():
    """Create a collection, read it back, then toggle its archived flag."""
    client = common.login_with_test_user(common.client())
    pipeline_id = next(p for p in client.get_pipelines()
                       if p["name"].lower() == "spacy")["_id"]
    assert pipeline_id is not None
    my_id = client.get_my_user_id()
    assert my_id is not None
    collection_builder = client.collection_builder() \
        .viewer(my_id) \
        .annotator(my_id) \
        .label("label") \
        .title("Collection to Test Creation") \
        .description("This is a collection for pytest to test creation.") \
        .classifier(pipeline_id, train_every=100)
    collection_id = client.create_collection(collection_builder)
    assert isinstance(collection_id, str)
    try:
        collection = client.get_collection(collection_id)
        assert isinstance(collection, dict)
        assert collection["_id"] == collection_id
        assert collection["creator_id"] == my_id
        assert collection["annotators"] == [my_id]
        assert collection["viewers"] == [my_id]
        assert not collection["archived"]
        assert collection["labels"] == ["label"]
        assert collection["metadata"] == {
            "title": "Collection to Test Creation",
            "description": "This is a collection for pytest to test creation."
        }
        # archive, then un-archive, checking the returned document each time
        updated_collection = client.archive_collection(collection_id, True)
        assert isinstance(updated_collection, dict)
        assert updated_collection["_id"] == collection_id
        assert updated_collection["archived"]
        updated_collection = client.archive_collection(collection_id, False)
        assert isinstance(updated_collection, dict)
        assert updated_collection["_id"] == collection_id
        assert not updated_collection["archived"]
    finally:
        # leave the test collection archived so it doesn't pollute later runs
        client.archive_collection(collection_id, True)
def test_sync_train():
    """Train the 'Small Collection OpenNLP' classifier synchronously and check results."""
    client = common.login_with_test_user(common.client())
    collection = common.get_collection(client, "Small Collection OpenNLP")
    assert collection is not None
    collection_id = collection["_id"]
    assert collection_id is not None
    classifier_id = client.get_collection_classifier(collection_id)["_id"]
    assert classifier_id is not None
    train_job_data = client.classifier_train(classifier_id, do_async=False)
    _assert_job_response(train_job_data, True)
    results = train_job_data["job_response"]
    # isinstance(None, dict) is False and .get() on a missing key returns None,
    # so these checks also cover the missing/None cases from the original
    assert isinstance(results, dict)
    assert isinstance(results.get("average_metrics"), dict)
    assert isinstance(results.get("updated_objects"), dict)
    assert isinstance(results.get("fit"), dict)
    assert isinstance(results.get("model_filename"), str)
def test_download_collection_data():
    """Download collection data, enabling each include-flag incrementally and
    checking exactly which keys appear at every step."""
    # find a collection that has annotations that the test user has access to
    user = "******"
    collection_title = "NER Test Collection"
    client = common.login_with_user(user, common.client())
    col_data = common.test_collection_data(collection_title)
    assert col_data is not None
    collection_id = _get_collection_id(collection_title, client)
    assert collection_id is not None

    # start with nothing but IDs
    kwargs = {
        "collection_id": collection_id,
        "include_collection_metadata": False,
        "include_document_metadata": False,
        "include_document_text": False,
        "include_annotations": False,
        "include_annotation_latest_version_only": True
    }
    data = client.download_collection_data(**kwargs)
    assert set(data.keys()) == {"_id", "documents"}
    assert len(data["documents"]) == col_data["documents"]["num_docs"]
    for doc in data["documents"]:
        assert set(doc.keys()) == {"_id"}
    doc_ids = [doc["_id"] for doc in data["documents"]]

    # turn on collection metadata
    kwargs["include_collection_metadata"] = True
    data = client.download_collection_data(**kwargs)
    assert set(data.keys()) == {"_id", "documents", "annotators", "viewers",
                                "metadata", "configuration", "labels",
                                "archived", "creator_id"}
    assert len(data["documents"]) == len(doc_ids)
    for doc in data["documents"]:
        assert set(doc.keys()) == {"_id"}

    # turn on document metadata
    kwargs["include_document_metadata"] = True
    data = client.download_collection_data(**kwargs)
    assert len(data["documents"]) == len(doc_ids)
    for doc in data["documents"]:
        assert set(doc.keys()) == {"_id", "metadata", "has_annotated",
                                   "creator_id", "overlap"}

    # turn on document text
    kwargs["include_document_text"] = True
    data = client.download_collection_data(**kwargs)
    assert len(data["documents"]) == len(doc_ids)
    for doc in data["documents"]:
        assert set(doc.keys()) == {"_id", "metadata", "has_annotated",
                                   "creator_id", "overlap", "text"}

    # turn on annotations (latest version only)
    kwargs["include_annotations"] = True
    data = client.download_collection_data(**kwargs)
    assert len(data["documents"]) == len(doc_ids)
    for doc in data["documents"]:
        assert set(doc.keys()) == {"_id", "metadata", "has_annotated",
                                   "creator_id", "overlap", "text", "annotations"}
        annotations = doc["annotations"]
        assert isinstance(annotations, list) and len(annotations) > 0
        for annotation in annotations:
            assert set(annotation.keys()) == {"_id", "creator_id", "annotation"}

    # turn on all annotation versions: version fields should now appear
    kwargs["include_annotation_latest_version_only"] = False
    data = client.download_collection_data(**kwargs)
    assert len(data["documents"]) == len(doc_ids)
    for doc in data["documents"]:
        annotations = doc["annotations"]
        assert isinstance(annotations, list) and len(annotations) > 0
        for annotation in annotations:
            assert set(annotation.keys()) == {"_id", "creator_id", "annotation",
                                              "_version", "_latest_version"}
# -*- coding: utf-8 -*-
#
# Hello World client in Python.
# Connects a REQ socket to the server at ADDRESS_SPEC,
# sends "Hello" and expects "World" back.

from common import client
from zeromq_compat import ZeroMQ

# Active endpoint; alternates below are kept for quick manual switching.
ADDRESS_SPEC = "tcp://localhost:6666"
#ADDRESS_SPEC = "tcp://192.168.86.181:5555"
#ADDRESS_SPEC = "tcp://127.0.0.1:5678"

# Named req_socket (not "socket") to avoid shadowing the stdlib socket module.
req_socket = ZeroMQ(ADDRESS_SPEC, ZeroMQ.REQ)
client(req_socket)
def _test_train_and_predict(collection_title):
    """Train the named collection's classifier, then predict by document ID and
    by raw text, asserting both paths agree and are well-formed.

    Returns the prediction dict so callers can make pipeline-specific assertions.
    """
    client = common.login_with_test_user(common.client())
    collection = common.get_collection(client, collection_title)
    assert collection is not None
    collection_id = collection["_id"]
    assert collection_id is not None
    labels = collection["labels"]
    assert labels is not None and len(labels) > 0
    classifier_id = client.get_collection_classifier(collection_id)["_id"]
    assert classifier_id is not None
    first_document = client.get_collection_documents(collection_id, truncate=False)[0]
    document_id = first_document["_id"]
    document_text = first_document["text"]
    assert document_text.startswith("Thousands of demonstrators have ")

    # train async and wait for the job to complete
    train_job_data = client.classifier_train(classifier_id, do_async=True)
    train_job_id = _assert_job_response(train_job_data, False)
    common.wait_for_job_to_finish(client, classifier_id, train_job_id,
                                  max_wait_seconds=120)
    status = client.get_classifier_status(classifier_id)
    _check_pipeline_status(status, classifier_id)
    assert status["job_response"]["has_trained"]
    train_job_results = client.get_classifier_job_results(classifier_id, train_job_id)
    assert train_job_results is not None and isinstance(train_job_results, dict)

    # predict from ID sync (the returned job ID is not needed for a sync job)
    prediction_job_data = client.classifier_predict(classifier_id, [document_id], [],
                                                    do_async=False)
    _assert_job_response(prediction_job_data, True)
    docs_by_id = prediction_job_data["job_response"]["documents_by_id"]
    texts = prediction_job_data["job_response"]["texts"]
    assert docs_by_id.keys() == {document_id}
    prediction_from_id = docs_by_id[document_id]
    assert len(texts) == 0

    # predict from text async
    prediction_job_data = client.classifier_predict(classifier_id, [], [document_text],
                                                    do_async=True)
    prediction_job_id = _assert_job_response(prediction_job_data, False)
    common.wait_for_job_to_finish(client, classifier_id, prediction_job_id,
                                  max_wait_seconds=120)
    prediction_job_data = client.get_classifier_job_results(classifier_id,
                                                            prediction_job_id)
    assert prediction_job_data is not None and isinstance(prediction_job_data, dict)
    docs_by_id = prediction_job_data["documents_by_id"]
    texts = prediction_job_data["texts"]
    assert len(docs_by_id) == 0
    assert len(texts) == 1
    prediction_from_text = texts[0]

    # predicting by ID and by raw text should give identical results
    assert prediction_from_id == prediction_from_text

    # make sure they're in the right format
    assert isinstance(prediction_from_id, dict)
    assert "doc" in prediction_from_id and "ner" in prediction_from_id
    assert isinstance(prediction_from_id["doc"], list)
    for pred in prediction_from_id["doc"]:
        assert isinstance(pred, str)
        assert pred in labels
    assert isinstance(prediction_from_id["ner"], list)
    for pred in prediction_from_id["ner"]:
        # each NER prediction looks like [start_offset, end_offset, label]
        assert isinstance(pred, list)
        assert isinstance(pred[0], int) and isinstance(pred[1], int)
        assert isinstance(pred[2], str)
        assert pred[0] >= 0 and pred[1] > pred[0]
        assert pred[2] in labels
    return prediction_from_id
def test_is_valid():
    """A fresh, unauthenticated client should still report a valid connection."""
    client = common.client()
    assert client.is_valid()
def test_get_pipeline_status():
    """Every imported test pipeline should report a well-formed status."""
    client = common.login_with_test_user(common.client())
    for pipeline_data in common.test_pipeline_data():
        pipeline_status = client.get_pipeline_status(pipeline_data["_id"])
        _check_pipeline_status(pipeline_status, None)
def test_get_and_advance_next_documents(tmp_path):
    """Create an overlap-1 collection and walk the next/advance document queue,
    checking every document comes up exactly once."""
    client = common.login_with_test_user(common.client())
    pipeline_id = next(p for p in client.get_pipelines()
                       if p["name"].lower() == "spacy")["_id"]
    assert pipeline_id is not None
    my_id = client.get_my_user_id()
    assert my_id is not None

    # write documents CSV (one text column, no header)
    documents_file = tmp_path / "documents.csv"
    with open(documents_file, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        for i in range(5):
            writer.writerow(["This is document number {}.".format(i)])

    collection = client.collection_builder() \
        .viewer(my_id) \
        .annotator(my_id) \
        .label("label") \
        .title("Collection to Test Next Document") \
        .description("This is a collection for pytest to test the next/advance documents feature.") \
        .classifier(pipeline_id, train_every=100, overlap=1) \
        .document_csv_file(documents_file, has_header=False, text_column=0)
    collection_id = client.create_collection(collection)
    assert collection_id is not None
    document_ids = [
        d["_id"] for d in client.get_collection_documents(collection_id, True, 1)
    ]
    assert len(document_ids) == 5
    try:
        classifier = client.get_collection_classifier(collection_id)
        assert classifier is not None
        classifier_id = classifier["_id"]
        assert classifier_id is not None

        # add more documents through the API after creation
        document_ids += [
            client.add_document(collection_id=collection_id, overlap=1,
                                text="This is document number {}.".format(i))
            for i in range(5, 10)
        ]
        assert len(document_ids) == 10
        for document_id in document_ids:
            assert isinstance(document_id, str)

        # walk the queue until it is exhausted
        next_ids = []
        next_id = client.get_next_document(classifier_id)
        while next_id is not None:
            assert isinstance(next_id, str)
            assert next_id not in next_ids  # no duplicates
            next_ids.append(next_id)
            # sanity check to prevent an infinite loop
            assert len(next_ids) <= len(document_ids)
            updated_document = client.advance_next_document(classifier_id, next_id)
            assert isinstance(updated_document, dict)
            next_id = client.get_next_document(classifier_id)
        assert set(document_ids) == set(next_ids)
    finally:
        # pass the archive flag explicitly, matching the other call sites
        # (the original one-arg call relied on a default — TODO confirm signature)
        client.archive_collection(collection_id, True)