Example #1
def processImage(
    documentId, features, bucketName, objectName, outputTableName, documentsTableName
):

    detectText = "Text" in features
    detectForms = "Forms" in features
    detectTables = "Tables" in features

    response = callTextract(
        bucketName, objectName, detectText, detectForms, detectTables
    )

    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTableName)

    print("Generating output for DocumentId: {}".format(documentId))

    opg = OutputGenerator(
        documentId, response, bucketName, objectName, detectForms, detectTables, ddb
    )
    opg.run()

    print("DocumentId: {}".format(documentId))

    ds = datastore.DocumentStore(documentsTableName, outputTableName)
    ds.markDocumentComplete(documentId)
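
A minimal sketch of how the processImage variant above might be wired up as an AWS Lambda entry point for the synchronous (image) path. The event shape, field names, and the use of SQS-style records are assumptions for illustration, not the project's actual handler.

import json

def lambda_handler(event, context):
    # Hypothetical entry point: each record is assumed to carry the document
    # metadata that processImage needs, serialized as JSON in the record body.
    for record in event.get("Records", []):
        message = json.loads(record["body"])
        processImage(
            message["documentId"],
            message["features"],        # e.g. ["Text", "Forms", "Tables"]
            message["bucketName"],
            message["objectName"],
            message["outputTable"],
            message["documentsTable"],
        )
    return {"statusCode": 200}
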
def processImage(documentId, features, bucketName, outputBucketName,
                 objectName, outputTableName, documentsTableName,
                 elasticsearchDomain):

    detectText = "Text" in features
    detectForms = "Forms" in features
    detectTables = "Tables" in features

    response = callTextract(bucketName, objectName, detectText, detectForms,
                            detectTables)

    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTableName)

    outputPath = '{}{}/{}'.format(PUBLIC_PATH_S3_PREFIX, documentId,
                                  SERVICE_OUTPUT_PATH_S3_PREFIX)
    print("Generating output for DocumentId: {} and storing in {}".format(
        documentId, outputPath))

    opg = OutputGenerator(documentId, response, outputBucketName, objectName,
                          detectForms, detectTables, ddb, outputPath,
                          elasticsearchDomain)
    opg_output = opg.run()

    generatePdf(documentId, bucketName, objectName, outputBucketName,
                outputPath)

    # generate Comprehend and ComprehendMedical entities in S3
    comprehendOutputPath = "{}{}".format(outputPath, COMPREHEND_PATH_S3_PREFIX)
    print("Comprehend output path: " + comprehendOutputPath)
    maxPages = 100
    comprehendClient = ComprehendHelper()
    responseDocumentName = "{}{}response.json".format(outputPath,
                                                      TEXTRACT_PATH_S3_PREFIX)
    comprehendAndMedicalEntities = comprehendClient.processComprehend(
        outputBucketName, responseDocumentName, comprehendOutputPath, maxPages)

    # if Kendra is available, let it index the searchable PDF
    if 'KENDRA_INDEX_ID' in os.environ:
        kendraClient = KendraHelper()
        fileName = os.path.basename(objectName).split(".")[0]
        fileExtension = os.path.basename(objectName).split(".")[1]
        outputDocumentName = "{}{}-searchable.pdf".format(outputPath, fileName)
        kendraClient.indexDocument(os.environ['KENDRA_INDEX_ID'],
                                   os.environ['KENDRA_ROLE_ARN'],
                                   outputBucketName, outputDocumentName,
                                   documentId, fileExtension)

    print("Processed Comprehend data for document: {}".format(documentId))

    for key, val in opg_output[KVPAIRS].items():
        if key not in comprehendAndMedicalEntities:
            comprehendAndMedicalEntities[key] = val
        else:
            comprehendAndMedicalEntities[key].add(val)
    opg.indexDocument(opg_output[DOCTEXT], comprehendAndMedicalEntities)

    ds = datastore.DocumentStore(documentsTableName, outputTableName)
    ds.markDocumentComplete(documentId)
def processRequest(request):

    output = ""

    print(request)

    jobId = request['jobId']
    jobTag = request['jobTag']
    jobStatus = request['jobStatus']
    jobAPI = request['jobAPI']
    bucketName = request['bucketName']
    objectName = request['objectName']
    outputTable = request["outputTable"]
    outputBucket = request["outputBucket"]
    documentsTable = request["documentsTable"]
    qUrl = request["elasticQueueUrl"]

    pages = getJobResults(jobAPI, jobId)

    print("Result pages recieved: {}".format(len(pages)))

    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTable)

    detectForms = False
    detectTables = False
    if (jobAPI == "StartDocumentAnalysis"):
        detectForms = True
        detectTables = True

    opg = OutputGenerator(jobId, jobTag, pages, outputBucket, objectName,
                          detectForms, detectTables, ddb)
    opg.run()

    print("DocumentId: {}".format(jobTag))

    ds = datastore.DocumentStore(documentsTable, outputTable)
    ds.markDocumentComplete(jobTag, jobId)

    jsonMessage = {
        'documentId': jobTag,
        'jobId': jobId,
        'bucketName': outputBucket,
        'objectName': objectName
    }

    client = AwsHelper().getClient('sqs')
    postMessage(client, qUrl, jsonMessage)

    output = "Processed -> Document: {}, Object: {}/{} processed.".format(
        jobTag, bucketName, objectName)

    print(output)

    return {'statusCode': 200, 'body': output}
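
The postMessage helper called above is not part of this listing; a minimal sketch of what it could look like with boto3's SQS client is shown below. The name and signature are taken from the call site, the body is an assumption.

import json

def postMessage(client, qUrl, jsonMessage):
    # Serialize the payload and push it to the target queue.
    # send_message(QueueUrl=..., MessageBody=...) is the standard boto3 SQS call.
    messageBody = json.dumps(jsonMessage)
    client.send_message(QueueUrl=qUrl, MessageBody=messageBody)
    print("Submitted message to queue: {}".format(qUrl))
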
def processRequest(request):

    output = ""

    print(request)

    jobId = request['jobId']
    jobTag = request['jobTag']
    jobStatus = request['jobStatus']
    jobAPI = request['jobAPI']
    bucketName = request['bucketName']
    outputBucketName = request['outputBucketName']
    objectName = request['objectName']
    outputTable = request["outputTable"]
    documentsTable = request["documentsTable"]
    elasticsearchDomain = request["elasticsearchDomain"]

    pages = getJobResults(jobAPI, jobId)

    print("Result pages recieved: {}".format(len(pages)))

    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTable)

    detectForms = False
    detectTables = False
    if (jobAPI == "StartDocumentAnalysis"):
        detectForms = True
        detectTables = True

    opg = OutputGenerator(jobTag, pages, outputBucketName, objectName,
                          detectForms, detectTables, ddb, elasticsearchDomain)
    opg.run()

    generatePdf(jobTag, bucketName, objectName, outputBucketName)

    print("DocumentId: {}".format(jobTag))

    ds = datastore.DocumentStore(documentsTable, outputTable)
    ds.markDocumentComplete(jobTag)

    output = "Processed -> Document: {}, Object: {}/{} processed.".format(
        jobTag, bucketName, objectName)

    print(output)

    return {'statusCode': 200, 'body': output}
Example #5
def processImage(documentId, features, bucketName, outputBucketName,
                 objectName, outputTableName, documentsTableName,
                 elasticsearchDomain):

    detectText = "Text" in features
    detectForms = "Forms" in features
    detectTables = "Tables" in features

    response = callTextract(bucketName, objectName, detectText, detectForms,
                            detectTables)

    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTableName)

    outputPath = '{}{}/{}'.format(PUBLIC_PATH_S3_PREFIX, documentId,
                                  SERVICE_OUTPUT_PATH_S3_PREFIX)
    print("Generating output for DocumentId: {} and storing in {}".format(
        documentId, outputPath))

    opg = OutputGenerator(documentId, response, outputBucketName, objectName,
                          detectForms, detectTables, ddb, outputPath,
                          elasticsearchDomain)
    opg_output = opg.run()

    generatePdf(documentId, bucketName, objectName, outputBucketName,
                outputPath)

    # generate Comprehend and ComprehendMedical entities in S3
    comprehendOutputPath = "{}{}".format(outputPath, COMPREHEND_PATH_S3_PREFIX)
    print("Comprehend output path: " + comprehendOutputPath)
    maxPages = 100
    comprehendClient = ComprehendHelper()
    responseDocumentName = "{}{}response.json".format(outputPath,
                                                      TEXTRACT_PATH_S3_PREFIX)
    comprehendAndMedicalEntities = comprehendClient.processComprehend(
        outputBucketName, responseDocumentName, comprehendOutputPath, maxPages)

    print("DocumentId: {}".format(documentId))
    print("Processed Comprehend data: {}".format(comprehendAndMedicalEntities))

    for key, val in opg_output[KVPAIRS].items():
        if key not in comprehendAndMedicalEntities:
            comprehendAndMedicalEntities[key] = val
        else:
            comprehendAndMedicalEntities[key].add(val)
    opg.indexDocument(opg_output[DOCTEXT], comprehendAndMedicalEntities)

    ds = datastore.DocumentStore(documentsTableName, outputTableName)
    ds.markDocumentComplete(documentId)
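
The loop that merges opg_output[KVPAIRS] into the Comprehend entity dictionary adds new keys as plain values and appends to a set when the key already exists. A self-contained illustration of that merge with made-up data, assuming the pre-existing entity values are sets:

# Illustrative data only: Textract key/value pairs and Comprehend entities.
kvPairs = {"Patient Name": "Jane Doe", "Date": "2020-01-01"}
entities = {"Date": {"2019-12-31"}}        # values already collected as sets

for key, val in kvPairs.items():
    if key not in entities:
        entities[key] = val                # new key: stored as-is
    else:
        entities[key].add(val)             # existing key: added to the set

print(entities)
# e.g. {'Date': {'2019-12-31', '2020-01-01'}, 'Patient Name': 'Jane Doe'}
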
def processRequest(request):

    output = ""

    print(request)

    jobId = request["jobId"]
    jobTag = request["jobTag"]
    jobStatus = request["jobStatus"]
    jobAPI = request["jobAPI"]
    bucketName = request["bucketName"]
    objectName = request["objectName"]
    outputTable = request["outputTable"]
    documentsTable = request["documentsTable"]

    pages = getJobResults(jobAPI, jobId)

    print("Result pages recieved: {}".format(len(pages)))

    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTable)

    detectForms = False
    detectTables = False
    if jobAPI == "StartDocumentAnalysis":
        detectForms = True
        detectTables = True

    opg = OutputGenerator(
        jobTag, pages, bucketName, objectName, detectForms, detectTables, ddb
    )
    opg.run()

    print("DocumentId: {}".format(jobTag))

    ds = datastore.DocumentStore(documentsTable, outputTable)
    ds.markDocumentComplete(jobTag)

    output = "Processed -> Document: {}, Object: {}/{} processed.".format(
        jobTag, bucketName, objectName
    )

    print(output)

    return {"statusCode": 200, "body": output}
def processImage(documentId, features, bucketName, outputBucketName,
                 objectName, outputTableName, documentsTableName,
                 elasticsearchDomain):

    detectText = "Text" in features
    detectForms = "Forms" in features
    detectTables = "Tables" in features

    response = callTextract(bucketName, objectName, detectText, detectForms,
                            detectTables)

    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTableName)

    print("Generating output for DocumentId: {}".format(documentId))

    opg = OutputGenerator(documentId, response, outputBucketName, objectName,
                          detectForms, detectTables, ddb, elasticsearchDomain)
    opg_output = opg.run()

    generatePdf(documentId, bucketName, objectName, outputBucketName)

    # generate Comprehend and ComprehendMedical entities in S3
    path = objectName + "-analysis" + "/" + documentId + "/"
    print("path: " + path)
    maxPages = 100
    comprehendClient = ComprehendHelper()
    comprehendAndMedicalEntities = comprehendClient.processComprehend(
        outputBucketName, 'response.json', path, maxPages)

    print("DocumentId: {}".format(documentId))
    print("Processed Comprehend data: {}".format(comprehendAndMedicalEntities))

    for key, val in opg_output[KVPAIRS].items():
        if key not in comprehendAndMedicalEntities:
            comprehendAndMedicalEntities[key] = val
        else:
            comprehendAndMedicalEntities[key].add(val)
    opg.indexDocument(opg_output[DOCTEXT], comprehendAndMedicalEntities)

    ds = datastore.DocumentStore(documentsTableName, outputTableName)
    ds.markDocumentComplete(documentId)
def processImage(features, bucket_name, object_name):

    detectTables = "detectTables" in features
    detectText = "detectText" in features

    response = callTextract(bucket_name, object_name, detectText, detectTables)

    print("Generating output for DocumentId: {}".format(object_name))
    # print(json.dumps(response))
    # get page metadata
    metadata = get_page_metadata(bucket_name=bucket_name, object_name=object_name)

    opg = OutputGenerator(response, bucketName=bucket_name, objectName=object_name,
                          tables=detectTables, metadata=metadata)

    output = opg.run()

    print("DocumentId: {}".format(object_name))

    return output
    def processDocument(self, ips, i, document):
        print("\nTextracting Document # {}: {}".format(i, document))
        print('=' * (len(document) + 30))

        # Get document textracted
        dp = DocumentProcessor(ips["bucketName"], document, ips["awsRegion"],
                               ips["text"], ips["forms"], ips["tables"])
        response = dp.run()
        print("Recieved Textract response...")

        # FileHelper.writeToFile("temp-response.json", json.dumps(response))

        # Generate output files
        print("Generating output...")
        name, ext = FileHelper.getFileNameAndExtension(document)
        opg = OutputGenerator(
            response, os.path.join(ips["output"], "{}-{}".format(name, ext)),
            ips["forms"], ips["tables"])
        opg.run()

        if (ips["insights"] or ips["medical-insights"] or ips["translate"]):
            opg.generateInsights(ips["insights"], ips["medical-insights"],
                                 ips["translate"], ips["awsRegion"])

        print("{} textracted successfully.".format(document))
def processImage(documentId, bucketName, objectName, callerId):

    response = callTextract(bucketName, objectName)

    print("Generating output for documentId: {}".format(documentId))

    opg = OutputGenerator(documentId=documentId,
                          response=response,
                          bucketName=textractBucketName,
                          objectName=objectName,
                          forms=False,
                          tables=False)
    tagging = "documentId={}".format(documentId)
    opg.writeTextractOutputs(taggingStr=tagging)

    lineage_client.recordLineage({
        "documentId": documentId,
        "callerId": callerId,
        "sourceBucketName": bucketName,
        "targetBucketName": textractBucketName,
        "sourceFileName": objectName,
        "targetFileName": objectName
    })
def generateOutput(filePath, response):
    print("Generating output...")
    name, ext = FileHelper.getFileNameAndExtension(filePath)
    opg = OutputGenerator(response, "{}-v2-{}".format(name, ext), True, True)
    opg.run()
    opg.generateInsights(True, True, 'es', 'us-east-1')
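
A brief usage sketch for generateOutput, assuming the Textract response was saved to disk earlier (for example by the commented-out FileHelper.writeToFile call in the example above); the file and document names are illustrative.

import json

with open("temp-response.json") as f:
    response = json.load(f)

generateOutput("scanned-invoice.pdf", response)
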
Example #12
def processRequest(request):

    output = ""

    print(request)

    jobId = request['jobId']
    jobTag = request['jobTag']
    jobStatus = request['jobStatus']
    jobAPI = request['jobAPI']
    bucketName = request['bucketName']
    outputBucketName = request['outputBucketName']
    objectName = request['objectName']
    outputTable = request["outputTable"]
    documentsTable = request["documentsTable"]
    elasticsearchDomain = request["elasticsearchDomain"]

    pages = getJobResults(jobAPI, jobId)

    print("Result pages recieved: {}".format(len(pages)))

    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTable)

    detectForms = False
    detectTables = False
    if (jobAPI == "StartDocumentAnalysis"):
        detectForms = True
        detectTables = True

    opg = OutputGenerator(jobTag, pages, outputBucketName, objectName,
                          detectForms, detectTables, ddb, elasticsearchDomain)
    opg_output = opg.run()

    generatePdf(jobTag, bucketName, objectName, outputBucketName)

    # generate Comprehend and ComprehendMedical entities
    path = objectName + "-analysis" + "/" + jobTag + "/"
    print("path: " + path)
    maxPages = 100
    comprehendClient = ComprehendHelper()
    comprehendAndMedicalEntities = comprehendClient.processComprehend(
        outputBucketName, 'response.json', path, maxPages)

    print("DocumentId: {}".format(jobTag))

    # index document once the comprehend entities and KVPairs have been extracted
    for key, val in opg_output[KVPAIRS].items():
        if key not in comprehendAndMedicalEntities:
            comprehendAndMedicalEntities[key] = val
        else:
            comprehendAndMedicalEntities[key].add(val)
    opg.indexDocument(opg_output[DOCTEXT], comprehendAndMedicalEntities)

    ds = datastore.DocumentStore(documentsTable, outputTable)
    ds.markDocumentComplete(jobTag)

    output = "Processed -> Document: {}, Object: {}/{} processed.".format(
        jobTag, bucketName, objectName)

    return {'statusCode': 200, 'body': output}
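
getJobResults is called in every asynchronous example but not included in this listing; below is a minimal sketch of a paginated fetch with boto3's Textract client. The helper name and the returned list of response pages match the call sites, the body itself is an assumption.

import boto3

def getJobResults(api, jobId):
    # Collect every result page of an async Textract job by following NextToken.
    client = boto3.client("textract")
    if api == "StartDocumentAnalysis":
        fetch = client.get_document_analysis
    else:
        fetch = client.get_document_text_detection

    pages = []
    kwargs = {"JobId": jobId}
    while True:
        response = fetch(**kwargs)
        pages.append(response)
        nextToken = response.get("NextToken")
        if not nextToken:
            break
        kwargs["NextToken"] = nextToken
    return pages
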
Example #13
def processRequest(request):

    output = ""

    print("Request : {}".format(request))

    jobId = request['jobId']
    documentId = request['jobTag']
    jobStatus = request['jobStatus']
    jobAPI = request['jobAPI']
    bucketName = request['bucketName']
    outputBucketName = request['outputBucketName']
    objectName = request['objectName']
    outputTable = request["outputTable"]
    documentsTable = request["documentsTable"]
    elasticsearchDomain = request["elasticsearchDomain"]

    pages = getJobResults(jobAPI, jobId)

    print("Result pages recieved: {}".format(len(pages)))

    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTable)

    detectForms = False
    detectTables = False
    if (jobAPI == "StartDocumentAnalysis"):
        detectForms = True
        detectTables = True

    outputPath = '{}{}/{}'.format(PUBLIC_PATH_S3_PREFIX, documentId,
                                  SERVICE_OUTPUT_PATH_S3_PREFIX)
    print("Generating output for DocumentId: {} and storing in {}".format(
        documentId, outputPath))

    opg = OutputGenerator(documentId, pages, outputBucketName, objectName,
                          detectForms, detectTables, ddb, outputPath,
                          elasticsearchDomain)
    opg_output = opg.run()

    generatePdf(documentId, bucketName, objectName, outputBucketName,
                outputPath)

    # generate Comprehend and ComprehendMedical entities
    comprehendOutputPath = "{}{}".format(outputPath, COMPREHEND_PATH_S3_PREFIX)
    print("Comprehend output path: " + comprehendOutputPath)
    maxPages = 100
    comprehendClient = ComprehendHelper()
    responseDocumentName = "{}{}response.json".format(outputPath,
                                                      TEXTRACT_PATH_S3_PREFIX)
    comprehendAndMedicalEntities = comprehendClient.processComprehend(
        outputBucketName, responseDocumentName, comprehendOutputPath, maxPages)

    # if Kendra is available then let it index the document
    if 'KENDRA_INDEX_ID' in os.environ:
        kendraClient = KendraHelper()
        fileName = os.path.basename(objectName).split(".")[0]
        fileExtension = os.path.basename(objectName).split(".")[1]
        outputDocumentName = "{}{}-searchable.pdf".format(outputPath, fileName)
        kendraClient.indexDocument(os.environ['KENDRA_INDEX_ID'],
                                   os.environ['KENDRA_ROLE_ARN'], bucketName,
                                   outputDocumentName, documentId,
                                   fileExtension)

    print("DocumentId: {}".format(documentId))
    print("Processed Comprehend data: {}".format(comprehendAndMedicalEntities))

    # index document once the comprehend entities and KVPairs have been extracted
    for key, val in opg_output[KVPAIRS].items():
        if key not in comprehendAndMedicalEntities:
            comprehendAndMedicalEntities[key] = val
        else:
            comprehendAndMedicalEntities[key].add(val)
    opg.indexDocument(opg_output[DOCTEXT], comprehendAndMedicalEntities)

    ds = datastore.DocumentStore(documentsTable, outputTable)
    ds.markDocumentComplete(documentId)

    output = "Processed -> Document: {}, Object: {}/{} processed.".format(
        documentId, bucketName, objectName)

    return {'statusCode': 200, 'body': output}
def runComprehend(bucketName, objectName, callerId):

    comprehend = AwsHelper().getClient('comprehend')
    documentId, documentName = dissectObjectName(objectName)
    assert (
        documentId == S3Helper().getTagsS3(bucketName,
                                           objectName).get('documentId', None)
    ), "File path {} does not match the expected documentId tag of the object triggered.".format(
        objectName)

    textractOutputJson = json.loads(S3Helper().readFromS3(
        bucketName, objectName))
    og = OutputGenerator(response=textractOutputJson,
                         forms=False,
                         tables=False)

    pipeline_client.body = {
        "documentId": documentId,
        "bucketName": bucketName,
        "objectName": objectName,
        "stage": PIPELINE_STAGE
    }
    pipeline_client.stageInProgress()

    document = Document(textractOutputJson)
    originalFileName = "{}/{}".format(documentId, documentName)
    comprehendFileName = originalFileName + "/comprehend-output.json"
    comprehendFileS3Url = "https://{}.s3.amazonaws.com/{}".format(
        comprehendBucket, urllib.parse.quote_plus(comprehendFileName,
                                                  safe="/"))
    tagging = "documentId={}".format(documentId)

    es.connect()
    esPayload = []
    page_num = 1
    for page in document.pages:
        table = og.structurePageTable(page)
        forms = og.structurePageForm(page)
        text = og.structurePageText(page)

        keyPhrases = []
        entitiesDetected = {}

        lenOfEncodedText = len(text)
        print("Comprehend documentId {} processing page {}".format(
            documentId, str(page_num)))
        print("Length of encoded text is " + str(lenOfEncodedText))
        if lenOfEncodedText > COMPREHEND_CHARACTER_LIMIT:
            print(
                "Size was too big to run singularly; breaking up the page text into chunks"
            )
            try:
                chunksOfText = chunkUpTheText(text)
            except Exception as e:
                pipeline_client.stageFailed(
                    "Could not determine how to snip the text on page {} into chunks."
                    .format(page_num))
                raise (e)
            keyPhrases, entitiesDetected = batchSendToComprehend(
                comprehend, chunksOfText, 'en')
        else:
            keyPhrases, entitiesDetected = singularSendToComprehend(
                comprehend, text, 'en')

        esPageLoad = compileESPayload(es, page_num, keyPhrases,
                                      entitiesDetected, text, table, forms,
                                      documentId)
        esPayload.append(esPageLoad)
        page_num = page_num + 1

    try:
        es.post_bulk(index=esIndex, payload=esPayload)
    except Exception as e:
        pipeline_client.stageFailed("Could not post to Elasticsearch")
        raise (e)

    print("Data uploaded to ES")
    try:
        S3Helper().writeToS3(json.dumps(esPayload),
                             comprehendBucket,
                             comprehendFileName,
                             taggingStr=tagging)
    except Exception as e:
        pipeline_client.stageFailed("Failed to write comprehend payload to S3")
        raise (e)

    lineage_client.recordLineage({
        "documentId": documentId,
        "callerId": callerId,
        "sourceBucketName": bucketName,
        "targetBucketName": comprehendBucket,
        "sourceFileName": objectName,
        "targetFileName": comprehendFileName
    })
    pipeline_client.stageSucceeded()
    print("Comprehend data uploaded to S3 at {}".format(comprehendFileName))
def processRequest(request):

    output = ""
    status = request['jobStatus']
    jobId = request['jobId']
    jobTag = request['jobTag']
    jobAPI = request['jobAPI']
    bucketName = request['bucketName']
    objectName = request['objectName']

    pipeline_client.body = {
        "documentId": jobTag,
        "bucketName": bucketName,
        "objectName": objectName,
        "stage": PIPELINE_STAGE
    }
    if status == 'FAILED':
        pipeline_client.stageFailed(
            "Textract Analysis didn't complete successfully")
        raise Exception(
            "Textract job for document ID {} (bucket {}, file {}) failed during Textract analysis. Please double-check the document quality."
            .format(jobTag, bucketName, objectName))

    pipeline_client.stageInProgress()
    try:
        pages = getJobResults(jobAPI, jobId)
    except Exception as e:
        pipeline_client.stageFailed()
        raise (e)

    print("Result pages received: {}".format(len(pages)))

    detectForms = False
    detectTables = False
    if (jobAPI == "StartDocumentAnalysis"):
        detectForms = True
        detectTables = True

    try:
        opg = OutputGenerator(documentId=jobTag,
                              response=pages,
                              bucketName=textractBucketName,
                              objectName=objectName,
                              forms=detectForms,
                              tables=detectTables)
    except Exception as e:
        pipeline_client.stageFailed(
            "Could not convert the Textract results into a processable object. Try uploading again."
        )
        raise (e)

    tagging = "documentId={}".format(jobTag)
    opg.writeTextractOutputs(taggingStr=tagging)

    lineage_client.recordLineage({
        "documentId": jobTag,
        "callerId": request["callerId"],
        "sourceBucketName": bucketName,
        "targetBucketName": textractBucketName,
        "sourceFileName": objectName,
        "targetFileName": objectName
    })

    output = "Processed -> Document: {}, Object: {}/{} processed.".format(
        jobTag, bucketName, objectName)
    pipeline_client.stageSucceeded()
    print(output)
    return {'statusCode': 200, 'body': output}