def create_document(set_id, body) -> Tuple[Union[ApiHttpStatus, Document], int]:  # noqa: E501
    """Add a new document to the document set

    The contents of the document are passed in the body of the request as
    plain text; the Content-Type header should be set to text/plain.  # noqa: E501

    :param set_id: ID of a set
    :type set_id: str
    :param body:
    :type body: str

    :rtype: Document
    """
    granularity = "line"
    text_contents = text_contents_from_document_body(body.decode(),
                                                     granularity=granularity)
    document = Document(text_contents=text_contents,
                        text_split_granularity=granularity)

    operation_result = db[set_id].insert_one(document.to_dict())
    if not operation_result.inserted_id:
        error = HTTPStatus.INTERNAL_SERVER_ERROR
        return create_api_http_status(error), error.value

    classify(set_id, operation_result.inserted_id)
    # get_document returns a (document, status) tuple; unpack it so we do
    # not return a nested tuple as the response body
    document, _ = get_document(set_id, operation_result.inserted_id)
    return document, HTTPStatus.OK.value

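# Illustrative usage only: a minimal sketch assuming a reachable MongoDB
# behind the module-level `db` handle and that Connexion delivers the
# text/plain request body as bytes (which `body.decode()` above implies).
# The set name and text are made up.
def _example_create_document():
    body = "First line\nSecond line\n".encode("utf-8")
    document, status = create_document("example-set", body)
    assert status == HTTPStatus.OK.value
    return document
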
def get_set(set_id) -> Tuple[Union[ApiHttpStatus, Documents], int]:  # noqa: E501
    """Lists all documents in the set

     # noqa: E501

    :param set_id: ID of a set
    :type set_id: str

    :rtype: Documents
    """
    document_list = []
    for document_dict in db[set_id].find({}, {
            "_id": 1,
            "name": 1,
            "predictedClassification": 1
    }):
        document_dict["documentId"] = str(document_dict["_id"])
        del document_dict["_id"]
        document = Document.from_dict(document_dict)
        document_list.append(document)
    return Documents(documents=document_list), HTTPStatus.OK.value

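# Sketch of the listing round-trip, assuming a set populated via
# create_document(). Because the Mongo projection above only keeps _id,
# name and predictedClassification, the listed documents carry no
# textContents; attribute names assume swagger-codegen snake_case models.
def _example_list_set():
    documents, status = get_set("example-set")
    assert status == HTTPStatus.OK.value
    for document in documents.documents:
        print(document.document_id, document.name)
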
def process_document(path, data):
    db = create_db_client()
    classify.__globals__["db"] = db
    collection = pathlib.Path(path).parts[-3]
    name = pathlib.Path(path).parts[-1]
    text_contents = text_contents_from_document_body(data,
                                                     granularity="document")
    document = Document(name=name,
                        text_contents=text_contents,
                        text_split_granularity="document")
    operation_result = db[collection].insert_one(document.to_dict())
    doc_id = operation_result.inserted_id
    # disable parallel SVC for population script since already parallelizing below
    classify(collection, doc_id)

def process_document(document, collection, trained_model):
    db = create_db_client(db_name=USER_ID)
    classify.__globals__["db"] = db
    granularity: str = "paragraph"
    text_contents = text_contents_from_document_body(document["content"],
                                                     granularity=granularity)
    document_object = Document(
        text_contents=text_contents,
        text_split_granularity=granularity,
        name=document["document_number"],
    )
    operation_result = db[collection].insert_one(document_object.to_dict())
    doc_id = operation_result.inserted_id
    classification = classify_text(document["content"],
                                   trained_model=trained_model)
    classified_text_contents = calculate_text_content_classifications(
        document_object,
        explanations=classification.explanations,
        trained_model=trained_model,
    )
    db[collection].update_one(
        {"_id": ObjectId(doc_id)},
        {
            "$set": {
                # Update document-wide predicted classification
                "predictedClassification": classification.to_dict(),
                # Update paragraph classifications
                "textContents": [
                    text_content.to_dict()
                    for text_content in classified_text_contents
                ],
            }
        },
    )

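# Hedged variant of the update above: pymongo's update_one() returns an
# UpdateResult whose matched_count / modified_count can verify the write
# instead of discarding it. `db`, `collection`, `doc_id` and
# `classification` mirror the names used in process_document().
def _example_verified_update(db, collection, doc_id, classification):
    update_result = db[collection].update_one(
        {"_id": ObjectId(doc_id)},
        {"$set": {"predictedClassification": classification.to_dict()}},
    )
    if update_result.matched_count == 0:
        raise LookupError(
            f"no document with _id {doc_id} in collection {collection}")
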
def delete_document(set_id: str,
                    doc_id: str) -> Tuple[Union[ApiHttpStatus, Document], int]:
    """Delete a document in the set

    :param set_id: ID of a set
    :param doc_id: ID of a document
    """
    result = db[set_id].find_one_and_delete({"_id": ObjectId(doc_id)})
    # find_one_and_delete returns None when no document matches the filter
    if not result:
        error = HTTPStatus.NOT_FOUND
        return create_api_http_status(error), error.value
    result["documentId"] = str(result["_id"])
    del result["_id"]
    deleted = Document.from_dict(result)
    return deleted, HTTPStatus.OK.value

def get_document(
        set_id: str,
        doc_id: str) -> Tuple[Union[ApiHttpStatus, Document], int]:  # noqa: E501
    """Get document from set

     # noqa: E501

    :param set_id: ID of a set
    :type set_id: str
    :param doc_id: ID of a document
    :type doc_id: str

    :rtype: Document
    """
    doc = db[set_id].find_one({"_id": ObjectId(doc_id)})
    if not doc:
        error = HTTPStatus.NOT_FOUND
        return create_api_http_status(error), error.value

    doc["documentId"] = str(doc["_id"])
    del doc["_id"]
    document = Document.from_dict(doc)

    # recreate text_content objects
    text_contents = []
    for text_content_dict in doc["textContents"]:
        classification_dict = text_content_dict["predictedClassification"]

        # build predicted_classification from dictionary
        classification = None
        if classification_dict:
            explanations = []
            for explanation_dict in classification_dict["explanations"]:
                features = []
                for feature_dict in explanation_dict["features"]:
                    feature = Feature.from_dict(feature_dict)
                    features.append(feature)
                explanation = PredictedClassificationExplanation.from_dict(
                    explanation_dict)
                explanation.features = features
                explanations.append(explanation)
            classification = PredictedClassification.from_dict(
                classification_dict)
            classification.explanations = explanations

        # build sensitive_sections from dictionaries
        sensitive_sections_dicts = text_content_dict["sensitiveSections"]
        sensitive_sections = []
        if sensitive_sections_dicts:
            for sensitive_section_dict in sensitive_sections_dicts[
                    "sensitiveSections"]:
                sensitive_sections.append(
                    SensitiveSection.from_dict(sensitive_section_dict))

        # build text_content object
        text_content = TextContent.from_dict(text_content_dict)
        text_content.predicted_classification = classification
        text_content.sensitive_sections = SensitiveSections(sensitive_sections)
        text_contents.append(text_content)

    # add text_contents to document and return it
    document.text_contents = text_contents
    return document, HTTPStatus.OK.value

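# Sketch of consuming the fully rebuilt document, assuming the ids refer
# to an existing set/document pair. The nested predicted_classification
# objects are real model instances here only because of the manual
# reconstruction loop above.
def _example_fetch_document(set_id, doc_id):
    result, status = get_document(set_id, doc_id)
    if status != HTTPStatus.OK.value:
        return None
    for text_content in result.text_contents:
        if text_content.predicted_classification is not None:
            print(text_content.predicted_classification.to_dict())
    return result
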
def document_from_mongo_dict(doc: Dict) -> Document:
    document_dict = deepcopy(doc)
    document_dict["document_id"] = str(doc["_id"])
    del document_dict["_id"]
    document = Document.from_dict(document_dict)
    return document

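# Minimal sketch of document_from_mongo_dict() on a raw pymongo result;
# the collection name is illustrative and `db` is assumed to be the
# module-level client used by the handlers above.
def _example_document_from_mongo_dict():
    raw = db["example-set"].find_one({})
    return document_from_mongo_dict(raw) if raw else None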