Beispiel #1
0
def fetch_sections_data(request, doc_id):
    """
    Return the sections data of a document.

    :param request: incoming request; the solution id is resolved from it
    :param doc_id: id of the document for which sections are fetched
    :return: dict with ``status``. On success it also carries ``data`` (the
     document data with processed ``elements``, ``need_review_count`` and
     ``attributes_extracted``), ``volume``, ``review_state`` and ``msg``.
     On failure it carries ``msg`` and, when available, ``error``.
    """
    context = tracer.get_context(request_id=str(uuid4()), log_level="INFO")
    context.start_span(component=__name__)
    try:
        if doc_id == "":
            return {"status": "failure",
                    "msg": "Failed to return document sections data"}

        solution_id = common.get_solution_from_session(request)
        document_info = document_data(doc_id, solution_id, False)
        # document_data signals problems through its status field and returns
        # an empty "data" payload; report that instead of tripping over a
        # KeyError further down.
        if document_info["status"] != "success":
            return {"status": "failure",
                    "msg": "Failed to return document sections data",
                    "error": document_info.get("msg", "")}

        data = document_info["data"]["data"]
        review_state = document_info["data"]["review_state"]
        # presumably process_elements updates these counters in place -- verify
        counts = {"extracted": 0, "reviewed": 0}
        data["elements"] = process_elements(data["elements"], counts)
        data['need_review_count'] = counts["reviewed"]
        data['attributes_extracted'] = counts["extracted"]
        return {"status": "success",
                "data": data,
                "volume": MOUNT_PATH,
                "review_state": review_state,
                "msg": "successfully returned document sections data"}
    # TODO raise specific exception
    except Exception as e:
        context.log(message=str(e), obj={"tb": traceback.format_exc()})
        return {'status': 'failure',
                'msg': 'Internal error while fetching section data',
                'error': str(e)}
    finally:
        context.end_span()
Beispiel #2
0
def find_documents(request, collection, query, solution_id, projection_fields=None):
    """
    Query a collection and return summary dictionaries for the matching documents.

    :param request: object forwarded to ``get_pagination_details`` to resolve
     sorting, skip and limit (defaults: sort by ``updated_ts`` with order -1,
     skip 0, limit 0)
    :param collection: name of the collection to query
    :param query: filter passed to ``MongoDbConn.find``
    :param solution_id: solution id forwarded to ``get_confidence_score``
    :param projection_fields: optional projection passed to ``MongoDbConn.find``
    :return: list of document summaries; an empty list when an error occurs
     (the error is logged)
    """
    context = tracer.get_context(request_id=str(uuid4()), log_level="INFO")
    context.start_span(component=__name__)
    try:
        cursor = MongoDbConn.find(collection, query, projection=projection_fields)
        sort_by, order_by_asc, skip, limit = get_pagination_details(request, sort_by='updated_ts', order_by_asc=-1,
                                                                    skip=0, limit=0)
        documents_list = cursor.sort(sort_by, order_by_asc).skip(skip).limit(limit)

        documents = []
        for document in documents_list:
            document.pop("_id", None)
            document = construct_json(document, DOCUMENT_SUMMARY_FIELDS)
            # Everything that is not an image is treated as a digital document.
            document["is_digital"] = get_doc_type(document['extn']) != "image"
            # Only the score is conditional; the remaining fields and the
            # append apply to every document, otherwise documents that already
            # carry a score would vanish from the listing.
            if "confidence_score" not in document:
                document["confidence_score"] = get_confidence_score(document, solution_id, document["is_digital"])
            document["is_failed"] = document["doc_state"] == "failed"
            document["review_text"] = get_review_text(document["doc_state"], document)
            documents.append(document)
        return documents
    # TODO raise specific exception
    except Exception as e:
        context.log(message=str(e), obj={"tb": traceback.format_exc()})
        # Keep the return type stable for callers instead of an implicit None.
        return []
    finally:
        context.end_span()
Beispiel #3
0
def format_entity_data(entity, elements, review_data, enrich_data, rules_reqd=True):
    """
    Decode a JSON-encoded entity and pass it to ``format_enriched_data``.

    :param entity: JSON string holding the entity
    :param elements: forwarded unchanged to ``format_enriched_data``
    :param review_data: forwarded unchanged to ``format_enriched_data``
    :param enrich_data: forwarded unchanged to ``format_enriched_data``
    :param rules_reqd: forwarded unchanged to ``format_enriched_data``
    :return: result of ``format_enriched_data``; ``{}`` when decoding or
     formatting raises (the error is logged)
    """
    context = tracer.get_context(request_id=str(uuid4()), log_level="INFO")
    context.start_span(component=__name__)
    formatted = {}
    try:
        parsed_entity = json.loads(entity)
        formatted = format_enriched_data(parsed_entity, elements, review_data, enrich_data, rules_reqd)
    # TODO raise specific exception
    except Exception as exc:
        context.log(message=str(exc), obj={"tb": traceback.format_exc()})
    finally:
        context.end_span()
    return formatted
Beispiel #4
0
def document_data(doc_id, solution_id, entity_reqd=True, rules_reqd=True):
    """
    Load a single document and enrich it with derived fields.

    :param doc_id: id of the document to load
    :param solution_id: solution id used to resolve the template type
    :param entity_reqd: when True and the document has an ``entity``, the
     entity is formatted and the flattened elements are attached; otherwise
     ``entity`` is removed from the result
    :param rules_reqd: forwarded to ``format_entity_data`` and
     ``get_review_state``; when True ``review_required`` is reported as 0
    :return: dict with ``status``, ``msg`` and ``data``. On success ``data``
     holds ``data`` (the document), ``volume`` and ``review_state``; on
     failure ``data`` is ``{}`` and ``msg`` describes the problem.
    """
    context = tracer.get_context(request_id=str(uuid4()), log_level="INFO")
    context.start_span(component=__name__)
    try:
        filter_query = {"doc_id": doc_id}
        projection = {"metadata": 1, "doc_id": 1, "confidence_score": 1, "elements": 1, "doc_state": 1, "root_id": 1,
                      "entity": 1, "_id": 0}
        document = MongoDbConn.find_one(DOCUMENTS_COLLECTION, filter_query, projection=projection)
        if not document:
            # Report the missing document explicitly instead of failing below
            # with an opaque "NoneType is not subscriptable" message.
            return {"status": "failure", "msg": "Document not found", "data": {}}

        doc_type = get_doc_type(document["metadata"]["properties"]['extension'])
        overall_doc_score = get_doc_confidence_score(document, doc_type)
        document['document_confidence_score'] = overall_doc_score
        template_id = document["metadata"]["template_info"]["id"]
        template_type = get_template_type(template_id, solution_id)
        document["doc_type"] = doc_type
        if doc_type == "email":
            document["attachments"] = add_email_info(document)
        document["template_type"] = template_type
        # "metadata" is known to be present here: it was subscripted above.
        if "searchable_pdf" in document["metadata"]:
            document["searchable_pdf"] = document["metadata"]["searchable_pdf"]

        if "entity" in document and entity_reqd:
            entity_data_orgnl = json.loads(document["entity"])
            enrich_data = list(get_enrichments(entity_data_orgnl, "enrichments"))
            elements = get_all_elements(document["elements"], [])
            review_data = dict(attributes_extracted=0, review_required=0, confidence=0)
            # NOTE(review): "entity_feedback" is not listed in the projection
            # above, so this presumably always falls back to [] -- confirm
            # whether the field should be projected.
            review_data["entity_feedback"] = document["entity_feedback"] if "entity_feedback" in document else []
            document["entity"] = format_entity_data(document["entity"], elements,
                                                    review_data, enrich_data, rules_reqd)
            # The entity-level confidence replaces the overall score set above.
            document['document_confidence_score'] = review_data["confidence"]
            document["attributes_extracted"] = review_data["attributes_extracted"]
            if rules_reqd:
                document["review_required"] = 0
            else:
                document["review_required"] = review_data["review_required"]
            document = remove_items(document, ["entity_feedback"])
            document["elements"] = elements
        else:
            document.pop("entity", None)

        review_state = get_review_state(entity_reqd, rules_reqd, doc_type, template_type)
        data = {"data": document, "volume": MOUNT_PATH, "review_state": review_state}
        return {"status": "success", "msg": "document data", "data": data}
    # TODO raise specific exception
    except Exception as e:
        context.log(message=str(e), obj={"tb": traceback.format_exc()})
        return {"status": "failure", "msg": str(e), "data": {}}
    finally:
        context.end_span()
Beispiel #5
0
def update_queue_extracted_feedback(document, doc_id, state):
    """
    Mark a document's queue entry as "In Progress" after feedback was submitted.

    :param document: document dict; when falsy it is looked up by ``doc_id``
    :param doc_id: id used for the lookup when ``document`` is not supplied
    :param state: state passed to ``check_current_status`` and
     ``update_queue_status``
    :return: dict with ``status`` and ``msg``; ``error`` is added on failure
    """
    context = tracer.get_context(request_id=str(uuid4()), log_level="INFO")
    context.start_span(component=__name__)
    try:
        doc = document or MongoDbConn.find_one(DOCUMENTS_COLLECTION, {"doc_id": doc_id})
        if "life_cycle" in doc:
            current = check_current_status(doc, state)
            # Only touch the queue when a status exists and it is not already
            # the target status.
            if current and not current == "In Progress":
                update_queue_status(doc, state, "In Progress", update_reqd=True)
        return {"status": "success", "msg": "Feedback submitted"}
    except Exception as exc:
        context.log(message=str(exc), obj={"tb": traceback.format_exc()})
        return {"status": "failed", "msg": "Error updating queue status", "error": str(exc)}
    finally:
        context.end_span()
Beispiel #6
0
def construct_table_data(headings, columns, domain_mapping):
    """
    Turn column-oriented table data into a list of row dictionaries.

    :param headings: list of heading dicts; the ``final_column`` and/or
     ``column`` value of each heading is collected into the returned headings
    :param columns: list of column dicts with ``name`` and ``value``; each
     entry of ``value`` must carry a ``line`` used to group cells into rows.
     The column named ``subheaders`` is skipped.
    :param domain_mapping: optional dict whose ``data`` list is indexed by the
     heading position to find a ``map_to`` value; anything else means no
     domain mapping is applied
    :return: tuple ``(rows, headings_list)`` where ``rows`` is ordered by the
     integer value of the line number
    """
    headings_list = []
    column_name = "column"
    for heading in headings:
        if "final_column" in heading:
            headings_list.append(heading["final_column"])
            column_name = "final_column"
        if "column" in heading:
            headings_list.append(heading["column"])

    # Default to "no mapping" so the lookup below never hits an unbound name.
    domain_list = []
    if isinstance(domain_mapping, dict) and "data" in domain_mapping:
        domain_list = domain_mapping["data"]

    line_dict = {}
    for column in columns:
        col_name = column["name"]
        if col_name == "subheaders":
            continue
        domain_mapped = ""
        if domain_list:
            context = tracer.get_context(request_id=str(uuid4()), log_level="INFO")
            context.start_span(component=__name__)
            try:
                # Exactly one heading is expected to match the column name;
                # anything else raises and leaves the mapping empty.
                [col_heading_idx] = [idx for idx, value in enumerate(headings) if
                                     "".join(value[column_name]) == col_name]
                if "map_to" in domain_list[col_heading_idx]:
                    domain_mapped = domain_list[col_heading_idx]["map_to"]
            # TODO raise specific exception
            except Exception as e:
                # Best effort: a failed lookup is logged and the cell keeps an
                # empty domain mapping.
                context.log(message=str(e), obj={"tb": traceback.format_exc()})
            finally:
                context.end_span()
        for val in column["value"]:
            line_num = val["line"]
            value = construct_json(val, ["text", "score", "value_coordinates"])
            value["domain_mapping"] = domain_mapped
            if "score" not in value:
                value["score"] = 100
            # Cells sharing a line number are merged into the same row dict.
            line_dict.setdefault(line_num, {})[col_name] = value

    ordered_rows = sorted(line_dict.items(), key=lambda item: int(item[0]))
    column_list = [row for _, row in ordered_rows]
    return column_list, headings_list
Beispiel #7
0
def download_document_json(request, doc_id):
    """
    Build a downloadable file from a document's data.

    :param request: incoming request; the solution id is resolved from it
    :param doc_id: id of the document to download
    :return: result of ``download_file`` when the document was loaded;
     otherwise the failure dict from ``document_data`` or an internal-error
     dict
    """
    context = tracer.get_context(request_id=str(uuid4()), log_level="INFO")
    context.start_span(component=__name__)
    try:
        solution_id = common.get_solution_from_session(request)
        result = document_data(doc_id, solution_id)
        if result["status"] != "success":
            # Hand the failure from document_data back unchanged.
            return result
        # "elements" and "updated_ts" are stripped before the file is built.
        exportable = remove_items(result["data"]["data"], ["elements", "updated_ts"])
        return download_file(exportable, doc_id)
    # TODO raise specific exception
    except Exception as exc:
        context.log(message=str(exc), obj={"tb": traceback.format_exc()})
        return {"status": "failure", "msg": "Internal Error occurred", "Error": str(exc)}
    finally:
        context.end_span()
Beispiel #8
0
def process_complete_review(request, doc_id):
    """
    Submit pending feedback (if any), trigger the manual review pipeline and
    close the matching queue entry.

    :param request: incoming request; its full path selects the review kind
     and, for text/entity reviews, its body supplies ``doc_id`` and
     ``feedback``
    :param doc_id: document id; replaced by the payload's ``doc_id`` for
     text/entity reviews
    :return: dict with ``status`` and ``msg``; ``error`` is added when an
     exception was caught
    """
    context = tracer.get_context(request_id=str(uuid4()), log_level="ERROR")
    context.start_span(component=__name__)
    try:
        solution_id = common.get_solution_from_session(request)
        path = request.get_full_path()
        is_text_review = "text/" in path
        is_entity_review = "entity/" in path

        if is_text_review or is_entity_review:
            payload = json.loads(request.body.decode())
            doc_id = payload["doc_id"]
            if payload["feedback"]:
                # Text reviews take precedence when both markers are present.
                submit_feedback = process_text_feedback if is_text_review else process_entity_feedback
                feedback_status = submit_feedback(request)
                if feedback_status["status"] != "success":
                    return {"status": "failure", "msg": "Failed to submit feedback"}

        document = MongoDbConn.find_one(DOCUMENTS_COLLECTION, {"doc_id": doc_id, "solution_id": solution_id})
        job_data = {"doc_id": doc_id, "pipeline_name": "manual_review", "root_id": document["root_id"]}
        if 'completeReview/review/' in path:
            job_data["object_type"] = ["document", "domain", "recommendation"]
            job_data["complete_review"] = True

        job_uri = API_GATEWAY_POST_JOB_URI + PIPELINE["MANUAL_TRIGGER"]
        post_status = post(job_uri, {"solution_id": solution_id, "data": job_data})
        if post_status["status"] != "success":
            return {"status": "failure", "msg": "Error while posting review"}

        # The first marker found in the path decides the queue state; the
        # order matters because several markers can match the same path.
        state_by_marker = (
            ("text/", "extracted"),
            ("grouping/", "classified"),
            ("entity/", "processed"),
            ('review/', 'reviewed'),
        )
        state = next((name for marker, name in state_by_marker if marker in path), "")
        update_queue_status(document, state, "Closed", update_reqd=True)
        return {"status": "success", "msg": "Review completion Posted successfully"}
    # TODO raise specific exception
    except Exception as exc:
        context.log(message=str(exc), obj={"tb": traceback.format_exc()})
        return {"status": "failure", "msg": "Internal Error occured while posting review", "error": str(exc)}
    finally:
        context.end_span()
Beispiel #9
0
def get_document_details(request, doc_id, page_no):
    """
    Return the elements of one document page, enriched with domain mappings.

    :param request: incoming request; the solution id is resolved from it
    :param doc_id: id of the document
    :param page_no: page number; converted with ``int`` before querying
    :return: dict with ``status``. On success ``data`` holds ``elements``
     (list of element dicts) and an empty ``entity``; on failure ``msg`` and
     ``error`` describe the problem.
    """
    context = tracer.get_context(request_id=str(uuid4()), log_level="INFO")
    context.start_span(component=__name__)
    try:
        solution_id = common.get_solution_from_session(request)
        query = {"doc_id": doc_id, "page_no": int(page_no)}
        projection = {"solution_id": 0, "updated_ts": 0, "created_ts": 0, "_id": 0, "doc_id": 0}
        elements = MongoDbConn.find(DOC_ELEMENTS_COLLECTION, query, projection=projection)
        document = MongoDbConn.find_one(DOCUMENTS_COLLECTION, {"doc_id": doc_id}, projection={"doc_id": 1, "entity": 1})
        processed_rules = {}
        if "entity" in document:
            processed_rules = get_all_rules_processed(document["entity"])

        mapping_data = get_doc_mapping_from_template(doc_id, solution_id)
        element_list = []
        for element in elements:
            domain_mapping = get_domain_mapping(mapping_data, element_id=element["element_id"],
                                                section_id=element["section_id"])
            if element["type"] == "table":
                table = {}
                # Both keys must be present; the previous
                # `"headings" and "columns" in element` only tested "columns"
                # and crashed on tables that lack "headings".
                if "headings" in element and "columns" in element:
                    table["table"], table["headings"] = construct_table_data(element["headings"], element["columns"],
                                                                             domain_mapping)
                    element["tables"] = table
                    element = remove_items(element, ["headings", "columns"])
            else:
                element["domain_mapping"] = ""
                if domain_mapping and isinstance(domain_mapping, dict) and "domain_mapping" in domain_mapping:
                    element["domain_mapping"] = domain_mapping["domain_mapping"]
                    if processed_rules:
                        element["rules"] = get_rules_info(element["domain_mapping"], processed_rules, solution_id,
                                                          element["text"])
            if "score" not in element:
                element["score"] = 0
            element_list.append(element)
        data = {"elements": element_list, "entity": {}}
        return {"status": "success", "data": data}
    # TODO raise specific exception
    except Exception as e:
        context.log(message=str(e), obj={"tb": traceback.format_exc()})
        return {"status": "failure", "msg": "Error occured while processing", "error": str(e)}
    finally:
        context.end_span()
Beispiel #10
0
def process_entity_feedback(request):
    """
    Post entity feedback from the request body and update the queue status.

    :param request: incoming request; its body is JSON with ``feedback`` and
     ``doc_id``, and its full path decides whether the queue is updated
    :return: result of ``update_queue_extracted_feedback`` when the feedback
     was posted successfully on a path containing "feedback"; otherwise the
     status from ``post_feedback``, a "no changes" success dict, or an
     internal-error dict
    """
    context = tracer.get_context(request_id=str(uuid4()), log_level="INFO")
    context.start_span(component=__name__)
    try:
        solution_id = common.get_solution_from_session(request)
        request_data = json.loads(request.body.decode())
        if not request_data["feedback"]:
            return {"status": "success", "msg": "No changes to be saved"}

        request_data["request_type"] = "extract_entities"
        feedback_status = post_feedback(request_data, solution_id)
        posted = feedback_status['status'] == 'success'
        if posted and "feedback" in request.get_full_path():
            return update_queue_extracted_feedback(None, request_data["doc_id"], "processed")
        return feedback_status
    except Exception as exc:
        context.log(message=str(exc), obj={"tb": traceback.format_exc()})
        return {"status": "failure", "msg": "Internal error occured in processing Feedback", "error": str(exc)}
    finally:
        context.end_span()
Beispiel #11
0
def documents_data(solution_id, filter_obj=None):
    """
    Return the root, non-test documents of a solution with their total count.

    :param solution_id: solution whose documents are listed
    :param filter_obj: optional filter object applied to the query through
     ``apply_filters`` and forwarded to ``find_documents``
    :return: dict with ``status``, ``msg`` and ``data``. On success ``data``
     holds ``config``, ``data`` (the documents) and ``total_count``; on
     failure ``data`` is ``{}``.
    """
    context = tracer.get_context(request_id=str(uuid4()), log_level="INFO")
    context.start_span(component=__name__)
    try:
        # Documents without an "is_test" flag are treated like non-test ones.
        not_test = [{"is_test": False}, {"is_test": {"$exists": False}}]
        filter_query = {"solution_id": solution_id, "is_root": True, "$or": not_test}
        projection_fields = {field: 1 for field in DOCUMENT_SUMMARY_FIELDS}
        apply_filters(filter_obj, filter_query)
        documents = find_documents(filter_obj, DOCUMENTS_COLLECTION, filter_query, solution_id,
                                   projection_fields=projection_fields)
        total = MongoDbConn.count(DOCUMENTS_COLLECTION, filter_query)
        return {
            "status": "success",
            "msg": "documents data",
            "data": {'config': summary_config, 'data': documents, 'total_count': total},
        }
    # TODO raise specific exception
    except Exception as exc:
        context.log(message=str(exc), obj={"tb": traceback.format_exc()})
        return {"status": "failure", "msg": str(exc), "data": {}}
    finally:
        context.end_span()
Beispiel #12
0
def process_text_feedback(request):
    """
    Post text (element) feedback from the request body and update the queue.

    :param request: incoming request; its body is JSON with ``feedback`` and
     ``doc_id``, and its full path decides whether the queue is updated
    :return: result of ``update_queue_extracted_feedback`` when the feedback
     was posted successfully on a path containing "feedback"; otherwise the
     status from ``post_feedback``, a "no changes" success dict, or an
     internal-error dict
    """
    context = tracer.get_context(request_id=str(uuid4()), log_level="INFO")
    context.start_span(component=__name__)
    try:
        solution_id = common.get_solution_from_session(request)
        payload = json.loads(request.body.decode())
        if not payload["feedback"]:
            return {"status": "success", "msg": "No changes to be saved"}

        # The feedback is reprocessed before it is posted.
        payload["feedback"] = reprocess_feedback(payload["feedback"])
        payload["request_type"] = "extract_elements"
        feedback_status = post_feedback(payload, solution_id)
        posted = feedback_status['status'] == "success"
        if posted and "feedback" in request.get_full_path():
            return update_queue_extracted_feedback(None, payload["doc_id"], "extracted")
        return feedback_status
    # TODO raise specific exception
    except Exception as exc:
        context.log(message=str(exc), obj={"tb": traceback.format_exc()})
        return {"status": "failure", "msg": "Internal error occurred while submitting feedback"}
    finally:
        context.end_span()
Beispiel #13
0
def save_threshold_data(solution_id, payload):
    """
    Post updated threshold data for a solution.

    :param solution_id: solution the thresholds belong to
    :param payload: request payload; its ``data`` entry is posted
    :return: dict with ``status`` and ``msg`` describing the outcome
    """
    context = tracer.get_context(request_id=str(uuid4()), log_level="INFO")
    context.start_span(component=__name__)
    try:
        job_body = {"solution_id": solution_id, "data": payload['data']}
        post_status = post(API_GATEWAY_POST_JOB_URI + DOCUMENT_ENDPOINT["thresholds_update"], job_body)
        if post_status['status'] != 'success':
            return {"status": "failure", "msg": "Error while updating threshold data"}
        return {"status": "success", "msg": "Threshold data updated successfully"}
    # TODO raise specific exception
    except Exception as exc:
        context.log(message=str(exc), obj={"tb": traceback.format_exc()})
        return {"status": "failure", "msg": str(exc)}
    finally:
        context.end_span()
Beispiel #14
0
def page_group_review(request, doc_id):
    """
    Serve (GET) or submit (POST) the page-group review of a document.

    :param request: incoming request; ``method`` selects the behaviour and,
     for POST, the body is the JSON review payload
    :param doc_id: id of the document under review
    :return: for GET, a dict with ``status`` and the document under ``data``
     (with ``volume`` added), or a failure dict when it is not found; for
     POST, the result of ``process_complete_review`` once group feedback (if
     any) was posted, or a failure dict; a failure dict for any other method
    """
    context = tracer.get_context(request_id=str(uuid4()), log_level="INFO")
    context.start_span(component=__name__)
    try:
        solution_id = common.get_solution_from_session(request)
        if request.method == "GET":
            query = {"doc_id": doc_id}
            projection = {"doc_id": 1, "solution_id": 1, "pages": 1, "page_groups": 1, "metadata.properties": 1,
                          "_id": 0}
            document = MongoDbConn.find_one(DOCUMENTS_COLLECTION, query, projection=projection)

            if document is not None:
                document["volume"] = MOUNT_PATH
                return {"status": "success", "data": document}
            return {"status": "failure", "msg": "Failed to return document data"}

        if request.method == "POST":
            payload = json.loads(request.body.decode())
            query = {"doc_id": doc_id, "solution_id": solution_id}
            document = MongoDbConn.find_one(DOCUMENTS_COLLECTION, query)
            if document is None:
                # Same explicit not-found handling as the GET branch, instead
                # of a TypeError reported as an internal error.
                return {"status": "failure", "msg": "Failed to return document data"}
            doc_groups = document["page_groups"]
            feedback_list = get_groups_feedback(payload, doc_groups)
            # Nothing to post counts as success so the review can complete.
            if feedback_list:
                feedback_status = post_groups_feedback(feedback_list, doc_id, solution_id, document["root_id"])
            else:
                feedback_status = True

            if feedback_status:
                return process_complete_review(request, doc_id)
            return {'status': 'failure', 'msg': 'Error posting feedback'}

        # Any other HTTP method previously fell through and returned None.
        return {'status': 'failure', 'msg': 'Unsupported request method'}
    # TODO raise specific exception
    except Exception as e:
        context.log(message=str(e), obj={"tb": traceback.format_exc()})
        return {'status': 'failure', 'msg': 'Internal error while submitting review', 'error': str(e)}
    finally:
        context.end_span()