# NOTE: this listing aggregates several related excerpts from one project, so
# some names (e.g. process_document) appear more than once. The imports below
# are assumptions made to keep the excerpt self-contained; the generated model
# classes (Document, Documents, TextContent, ...), helpers such as
# text_contents_from_document_body, classify, classify_text,
# create_api_http_status, create_db_client and
# calculate_text_content_classifications, and the module-level `db` handle and
# USER_ID constant come from the surrounding project and are not shown here.
import pathlib
from copy import deepcopy
from http import HTTPStatus
from typing import Dict, Tuple, Union

from bson.objectid import ObjectId


def create_document(
        set_id, body) -> Tuple[Union[ApiHttpStatus, Document], int]:  # noqa: E501
    """Add a new document to the document set

    The contents of the document are sent in the body of the request as plain text; the Content-Type header must be set to text/plain. # noqa: E501

    :param set_id: ID of a set
    :type set_id: str
    :param body: plain-text contents of the document
    :type body: bytes

    :rtype: Document
    """

    granularity = "line"

    text_contents = text_contents_from_document_body(body.decode(),
                                                     granularity=granularity)

    document = Document(text_contents=text_contents,
                        text_split_granularity=granularity)

    operation_result = db[set_id].insert_one(document.to_dict())

    if not operation_result.inserted_id:
        error = HTTPStatus.INTERNAL_SERVER_ERROR
        return create_api_http_status(error), error.value

    classify(set_id, operation_result.inserted_id)

    # get_document returns a (document, status) tuple, so pass it straight
    # through instead of re-wrapping it in another status tuple.
    return get_document(set_id, str(operation_result.inserted_id))
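
# Client-side sketch of exercising the handler above over HTTP. The route and
# host are assumptions (the OpenAPI spec that binds this handler is not part
# of this excerpt); `requests` is a third-party HTTP client.
def _example_create_document_request():
    import requests

    response = requests.post(
        "http://localhost:8080/sets/my-set/documents",  # assumed route
        data=b"First line of the document\nSecond line",
        headers={"Content-Type": "text/plain"},
    )
    return response.status_code, response.json()
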
def get_set(
        set_id) -> Tuple[Union[ApiHttpStatus, Documents], int]:  # noqa: E501
    """lists all documents in the set

     # noqa: E501

    :param set_id: ID of a set
    :type set_id: str

    :rtype: Documents
    """

    document_list = []
    for document_dict in db[set_id].find({}, {
            "_id": 1,
            "name": 1,
            "predictedClassification": 1
    }):
        document_dict["documentId"] = str(document_dict["_id"])
        del document_dict["_id"]

        document = Document.from_dict(document_dict)

        document_list.append(document)
    return Documents(documents=document_list), HTTPStatus.OK.value
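
# Direct-call sketch of the handler above, assuming `db` is already bound to a
# PyMongo database as elsewhere in this module; the `documents` attribute on
# the generated Documents model mirrors the keyword used in its constructor.
def _example_list_documents():
    documents, status = get_set("my-set")
    assert status == HTTPStatus.OK.value
    return [document.name for document in documents.documents]
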
def process_document(path, data):

    db = create_db_client()
    # Each worker process gets its own DB client; inject it into classify's
    # module globals so the shared handler code keeps working unchanged.
    classify.__globals__["db"] = db

    collection = pathlib.Path(path).parts[-3]
    name = pathlib.Path(path).parts[-1]
    text_contents = text_contents_from_document_body(data,
                                                     granularity="document")

    document = Document(name=name,
                        text_contents=text_contents,
                        text_split_granularity="document")
    operation_result = db[collection].insert_one(document.to_dict())
    doc_id = operation_result.inserted_id
    # Parallel SVC is not needed for the population script, since the script
    # itself already parallelizes across documents (see the sketch below).
    classify(collection, doc_id)
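
# Minimal sketch of the population script's own parallelism referenced in the
# comment above; the data/<collection>/<subdir>/<file> layout is an assumption
# that matches the Path(...).parts indexing in process_document.
def _populate_in_parallel():
    import multiprocessing

    paths = [str(path) for path in pathlib.Path("data").glob("*/*/*.txt")]
    args = [(path, pathlib.Path(path).read_text()) for path in paths]
    with multiprocessing.Pool() as pool:
        pool.starmap(process_document, args)
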
def process_document(document, collection, trained_model):

    db = create_db_client(db_name=USER_ID)
    # As above, inject the per-process DB client into classify's module globals.
    classify.__globals__["db"] = db

    granularity: str = "paragraph"

    text_contents = text_contents_from_document_body(document["content"],
                                                     granularity=granularity)

    document_object = Document(
        text_contents=text_contents,
        text_split_granularity=granularity,
        name=document["document_number"],
    )
    operation_result = db[collection].insert_one(document_object.to_dict())
    doc_id = operation_result.inserted_id

    classification = classify_text(document["content"],
                                   trained_model=trained_model)

    classified_text_contents = calculate_text_content_classifications(
        document_object,
        explanations=classification.explanations,
        trained_model=trained_model,
    )

    update_result = db[collection].update_one(
        {"_id": ObjectId(doc_id)},
        {
            "$set": {
                # Update the document-wide predicted classification
                "predictedClassification":
                classification.to_dict(),
                # Update the per-paragraph classifications
                "textContents": [
                    text_content.to_dict()
                    for text_content in classified_text_contents
                ],
            }
        },
    )
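
    # Optional sanity check: matched_count is a standard attribute on
    # PyMongo's UpdateResult, so a document that vanished between the insert
    # and the update can be detected here.
    if update_result.matched_count != 1:
        raise RuntimeError(f"document {doc_id} was not matched during update")
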
def delete_document(
        set_id: str,
        doc_id: str) -> Tuple[Union[ApiHttpStatus, Document], int]:
    """
    Delete a document in the set
    :param set_id: ID of a set
    :param doc_id: ID of a document
    """
    result = db[set_id].find_one_and_delete({"_id": ObjectId(doc_id)})

    # find_one_and_delete returns None when no document matched the filter
    if result is None:
        error = HTTPStatus.NOT_FOUND
        return create_api_http_status(error), error.value

    result["documentId"] = str(result["_id"])
    del result["_id"]
    deleted = Document.from_dict(result)
    return deleted, HTTPStatus.OK.value

def get_document(
        set_id: str, doc_id: str
) -> Tuple[Union[ApiHttpStatus, Document], int]:  # noqa: E501
    """get document from set

     # noqa: E501

    :param set_id: ID of a set
    :type set_id: str
    :param doc_id: ID of a document
    :type doc_id: str

    :rtype: Document
    """

    doc = db[set_id].find_one({"_id": ObjectId(doc_id)})
    if not doc:
        error = HTTPStatus.NOT_FOUND
        return create_api_http_status(error), error.value

    doc["documentId"] = str(doc["_id"])
    del doc["_id"]

    document = Document.from_dict(doc)

    # recreate text_content objects
    text_contents = []

    for text_content_dict in doc["textContents"]:
        classification_dict = text_content_dict["predictedClassification"]

        # Build predicted_classification from dictionary
        classification = None
        if classification_dict:
            explanations = []
            for explanation_dict in classification_dict["explanations"]:
                features = []
                for feature_dict in explanation_dict["features"]:
                    feature = Feature.from_dict(feature_dict)
                    features.append(feature)
                explanation = PredictedClassificationExplanation.from_dict(
                    explanation_dict)
                explanation.features = features
                explanations.append(explanation)
            classification = PredictedClassification.from_dict(
                classification_dict)
            classification.explanations = explanations

        # build sensitive_sections from dictionaries
        sensitive_sections_dicts = text_content_dict["sensitiveSections"]
        sensitive_sections = []
        if sensitive_sections_dicts:
            for sensitive_section_dict in sensitive_sections_dicts[
                    "sensitiveSections"]:
                sensitive_sections.append(
                    SensitiveSection.from_dict(sensitive_section_dict))

        # build text_content object
        text_content = TextContent.from_dict(text_content_dict)
        text_content.predicted_classification = classification
        text_content.sensitive_sections = SensitiveSections(sensitive_sections)
        text_contents.append(text_content)

    # add text_contents to document and return it
    document.text_contents = text_contents

    return document, HTTPStatus.OK.value
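
# For orientation, the nested shape get_document expects each stored document
# to have, inferred from the parsing above (field values illustrative):
#
#     {
#         "_id": ObjectId(...),
#         "textContents": [
#             {
#                 "predictedClassification": {
#                     "explanations": [{"features": [{...}, ...], ...}, ...],
#                     ...
#                 },
#                 "sensitiveSections": {"sensitiveSections": [{...}, ...]},
#                 ...
#             },
#             ...
#         ],
#     }
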
def document_from_mongo_dict(doc: Dict) -> Document:
    document_dict = deepcopy(doc)
    document_dict["document_id"] = str(doc["_id"])
    del document_dict["_id"]
    document = Document.from_dict(document_dict)
    return document
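

# Usage sketch for document_from_mongo_dict; the collection name is
# illustrative and `db` is assumed to be the PyMongo database handle used
# elsewhere in this module.
def _example_document_from_mongo_dict(doc_id: str) -> Document:
    raw = db["my-set"].find_one({"_id": ObjectId(doc_id)})
    return document_from_mongo_dict(raw)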