def processDocument(self, ips, i, document):
        print("\nTextracting Document # {}: {}".format(i, document))
        print('=' * (len(document) + 30))

        # Get document textracted
        dp = DocumentProcessor(ips["bucketName"], document, ips["awsRegion"],
                               ips["text"], ips["forms"], ips["tables"])
        response = dp.run()
        print("Recieved Textract response...")

        #FileHelper.writeToFile("temp-response.json", json.dumps(response))

        #Generate output files
        print("Generating output...")
        name, ext = FileHelper.getFileNameAndExtension(document)
        opg = OutputGenerator(
            response, os.path.join(ips["output"], "{}-{}".format(name, ext)),
            ips["forms"], ips["tables"])
        opg.run()

        if (ips["insights"] or ips["medical-insights"] or ips["translate"]):
            opg.generateInsights(ips["insights"], ips["medical-insights"],
                                 ips["translate"], ips["awsRegion"])

        print("{} textracted successfully.".format(document))
Example #2
0
def processImage(
    documentId, features, bucketName, objectName, outputTableName, documentsTableName
):

    detectText = "Text" in features
    detectForms = "Forms" in features
    detectTables = "Tables" in features

    response = callTextract(
        bucketName, objectName, detectText, detectForms, detectTables
    )

    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTableName)

    print("Generating output for DocumentId: {}".format(documentId))

    opg = OutputGenerator(
        documentId, response, bucketName, objectName, detectForms, detectTables, ddb
    )
    opg.run()

    print("DocumentId: {}".format(documentId))

    ds = datastore.DocumentStore(documentsTableName, outputTableName)
    ds.markDocumentComplete(documentId)
def processRequest(request):

    output = ""

    print(request)

    jobId = request['jobId']
    jobTag = request['jobTag']
    jobStatus = request['jobStatus']
    jobAPI = request['jobAPI']
    bucketName = request['bucketName']
    objectName = request['objectName']
    outputTable = request["outputTable"]
    outputBucket = request["outputBucket"]
    documentsTable = request["documentsTable"]
    qUrl = request["elasticQueueUrl"]

    pages = getJobResults(jobAPI, jobId)

    print("Result pages recieved: {}".format(len(pages)))

    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTable)

    detectForms = False
    detectTables = False
    if (jobAPI == "StartDocumentAnalysis"):
        detectForms = True
        detectTables = True

    opg = OutputGenerator(jobId, jobTag, pages, outputBucket, objectName,
                          detectForms, detectTables, ddb)
    opg.run()

    print("DocumentId: {}".format(jobTag))

    ds = datastore.DocumentStore(documentsTable, outputTable)
    ds.markDocumentComplete(jobTag, jobId)

    jsonMessage = {
        'documentId': jobTag,
        'jobId': jobId,
        'bucketName': outputBucket,
        'objectName': objectName
    }

    client = AwsHelper().getClient('sqs')
    postMessage(client, qUrl, jsonMessage)

    output = "Processed -> Document: {}, Object: {}/{} processed.".format(
        jobTag, bucketName, objectName)

    print(output)

    return {'statusCode': 200, 'body': output}
def processRequest(request):

    output = ""

    print(request)

    jobId = request['jobId']
    jobTag = request['jobTag']
    jobStatus = request['jobStatus']
    jobAPI = request['jobAPI']
    bucketName = request['bucketName']
    outputBucketName = request['outputBucketName']
    objectName = request['objectName']
    outputTable = request["outputTable"]
    documentsTable = request["documentsTable"]
    elasticsearchDomain = request["elasticsearchDomain"]

    pages = getJobResults(jobAPI, jobId)

    print("Result pages recieved: {}".format(len(pages)))

    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTable)

    detectForms = False
    detectTables = False
    if (jobAPI == "StartDocumentAnalysis"):
        detectForms = True
        detectTables = True

    dynamodb = AwsHelper().getResource('dynamodb')
    ddb = dynamodb.Table(outputTable)

    opg = OutputGenerator(jobTag, pages, outputBucketName, objectName,
                          detectForms, detectTables, ddb, elasticsearchDomain)
    opg.run()

    generatePdf(jobTag, bucketName, objectName, outputBucketName)

    print("DocumentId: {}".format(jobTag))

    ds = datastore.DocumentStore(documentsTable, outputTable)
    ds.markDocumentComplete(jobTag)

    output = "Processed -> Document: {}, Object: {}/{} processed.".format(
        jobTag, bucketName, objectName)

    print(output)

    return {'statusCode': 200, 'body': output}
def processRequest(request):

    output = ""

    print(request)

    jobId = request["jobId"]
    jobTag = request["jobTag"]
    jobStatus = request["jobStatus"]
    jobAPI = request["jobAPI"]
    bucketName = request["bucketName"]
    objectName = request["objectName"]
    outputTable = request["outputTable"]
    documentsTable = request["documentsTable"]

    pages = getJobResults(jobAPI, jobId)

    print("Result pages recieved: {}".format(len(pages)))

    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTable)

    detectForms = False
    detectTables = False
    if jobAPI == "StartDocumentAnalysis":
        detectForms = True
        detectTables = True

    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTable)

    opg = OutputGenerator(
        jobTag, pages, bucketName, objectName, detectForms, detectTables, ddb
    )
    opg.run()

    print("DocumentId: {}".format(jobTag))

    ds = datastore.DocumentStore(documentsTable, outputTable)
    ds.markDocumentComplete(jobTag)

    output = "Processed -> Document: {}, Object: {}/{} processed.".format(
        jobTag, bucketName, objectName
    )

    print(output)

    return {"statusCode": 200, "body": output}
def processImage(documentId, features, bucketName, outputBucketName,
                 objectName, outputTableName, documentsTableName,
                 elasticsearchDomain):

    detectText = "Text" in features
    detectForms = "Forms" in features
    detectTables = "Tables" in features

    response = callTextract(bucketName, objectName, detectText, detectForms,
                            detectTables)

    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTableName)

    outputPath = '{}{}/{}'.format(PUBLIC_PATH_S3_PREFIX, documentId,
                                  SERVICE_OUTPUT_PATH_S3_PREFIX)
    print("Generating output for DocumentId: {} and storing in {}".format(
        documentId, outputPath))

    opg = OutputGenerator(documentId, response, outputBucketName, objectName,
                          detectForms, detectTables, ddb, outputPath,
                          elasticsearchDomain)
    opg_output = opg.run()

    generatePdf(documentId, bucketName, objectName, outputBucketName,
                outputPath)

    # generate Comprehend and ComprehendMedical entities in S3
    comprehendOutputPath = "{}{}".format(outputPath, COMPREHEND_PATH_S3_PREFIX)
    print("Comprehend output path: " + comprehendOutputPath)
    maxPages = 100
    comprehendClient = ComprehendHelper()
    responseDocumentName = "{}{}response.json".format(outputPath,
                                                      TEXTRACT_PATH_S3_PREFIX)
    comprehendAndMedicalEntities = comprehendClient.processComprehend(
        outputBucketName, responseDocumentName, comprehendOutputPath, maxPages)

    # if Kendra is available then let it index the document
    # index the searchable pdf in Kendra
    if 'KENDRA_INDEX_ID' in os.environ:
        kendraClient = KendraHelper()
        fileName = os.path.basename(objectName).split(".")[0]
        fileExtension = os.path.basename(objectName).split(".")[1]
        outputDocumentName = "{}{}-searchable.pdf".format(outputPath, fileName)
        kendraClient.indexDocument(os.environ['KENDRA_INDEX_ID'],
                                   os.environ['KENDRA_ROLE_ARN'],
                                   outputBucketName, outputDocumentName,
                                   documentId, fileExtension)

    print("Processed Comprehend data for document: {}".format(documentId))

    for key, val in opg_output[KVPAIRS].items():
        if key not in comprehendAndMedicalEntities:
            comprehendAndMedicalEntities[key] = val
        else:
            comprehendAndMedicalEntities[key].add(val)
    opg.indexDocument(opg_output[DOCTEXT], comprehendAndMedicalEntities)

    ds = datastore.DocumentStore(documentsTableName, outputTableName)
    ds.markDocumentComplete(documentId)
Example #7
0
def processImage(documentId, features, bucketName, outputBucketName,
                 objectName, outputTableName, documentsTableName,
                 elasticsearchDomain):

    detectText = "Text" in features
    detectForms = "Forms" in features
    detectTables = "Tables" in features

    response = callTextract(bucketName, objectName, detectText, detectForms,
                            detectTables)

    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTableName)

    outputPath = '{}{}/{}'.format(PUBLIC_PATH_S3_PREFIX, documentId,
                                  SERVICE_OUTPUT_PATH_S3_PREFIX)
    print("Generating output for DocumentId: {} and storing in {}".format(
        documentId, outputPath))

    opg = OutputGenerator(documentId, response, outputBucketName, objectName,
                          detectForms, detectTables, ddb, outputPath,
                          elasticsearchDomain)
    opg_output = opg.run()

    generatePdf(documentId, bucketName, objectName, outputBucketName,
                outputPath)

    # generate Comprehend and ComprehendMedical entities in S3
    comprehendOutputPath = "{}{}".format(outputPath, COMPREHEND_PATH_S3_PREFIX)
    print("Comprehend output path: " + comprehendOutputPath)
    maxPages = 100
    comprehendClient = ComprehendHelper()
    responseDocumentName = "{}{}response.json".format(outputPath,
                                                      TEXTRACT_PATH_S3_PREFIX)
    comprehendAndMedicalEntities = comprehendClient.processComprehend(
        outputBucketName, responseDocumentName, comprehendOutputPath, maxPages)

    print("DocumentId: {}".format(documentId))
    print("Processed Comprehend data: {}".format(comprehendAndMedicalEntities))

    for key, val in opg_output[KVPAIRS].items():
        if key not in comprehendAndMedicalEntities:
            comprehendAndMedicalEntities[key] = val
        else:
            comprehendAndMedicalEntities[key].add(val)
    opg.indexDocument(opg_output[DOCTEXT], comprehendAndMedicalEntities)

    ds = datastore.DocumentStore(documentsTableName, outputTableName)
    ds.markDocumentComplete(documentId)
def processImage(documentId, features, bucketName, outputBucketName,
                 objectName, outputTableName, documentsTableName,
                 elasticsearchDomain):

    detectText = "Text" in features
    detectForms = "Forms" in features
    detectTables = "Tables" in features

    response = callTextract(bucketName, objectName, detectText, detectForms,
                            detectTables)

    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTableName)

    print("Generating output for DocumentId: {}".format(documentId))

    opg = OutputGenerator(documentId, response, outputBucketName, objectName,
                          detectForms, detectTables, ddb, elasticsearchDomain)
    opg_output = opg.run()

    generatePdf(documentId, bucketName, objectName, outputBucketName)

    # generate Comprehend and ComprehendMedical entities in S3
    path = objectName + "-analysis" + "/" + documentId + "/"
    print("path: " + path)
    maxPages = 100
    comprehendClient = ComprehendHelper()
    comprehendAndMedicalEntities = comprehendClient.processComprehend(
        outputBucketName, 'response.json', path, maxPages)

    print("DocumentId: {}".format(documentId))
    print("Processed Comprehend data: {}".format(comprehendAndMedicalEntities))

    for key, val in opg_output[KVPAIRS].items():
        if key not in comprehendAndMedicalEntities:
            comprehendAndMedicalEntities[key] = val
        else:
            comprehendAndMedicalEntities[key].add(val)
    opg.indexDocument(opg_output[DOCTEXT], comprehendAndMedicalEntities)

    ds = datastore.DocumentStore(documentsTableName, outputTableName)
    ds.markDocumentComplete(documentId)
def processImage(features, bucket_name, object_name):

    detectTables = "detectTables" in features
    detectText = "detectText" in features

    response = callTextract(bucket_name, object_name, detectText, detectTables)

    print("Generating output for DocumentId: {}".format(object_name))
    #print(json.dumps(response))
    #get page metadata
    metadata = get_page_metadata(bucket_name=bucket_name, object_name=object_name)

    opg = OutputGenerator(response, bucketName=bucket_name, objectName=object_name, tables=detectTables,
    metadata=metadata)

    output = opg.run()

    print("DocumentId: {}".format(object_name))

    return output
def generateOutput(filePath, response):
    print("Generating output...")
    name, ext = FileHelper.getFileNameAndExtension(filePath)
    opg = OutputGenerator(response, "{}-v2-{}".format(name, ext), True, True)
    opg.run()
    opg.generateInsights(True, True, 'es', 'us-east-1')
Example #11
0
def processRequest(request):

    output = ""

    print(request)

    jobId = request['jobId']
    jobTag = request['jobTag']
    jobStatus = request['jobStatus']
    jobAPI = request['jobAPI']
    bucketName = request['bucketName']
    outputBucketName = request['outputBucketName']
    objectName = request['objectName']
    outputTable = request["outputTable"]
    documentsTable = request["documentsTable"]
    elasticsearchDomain = request["elasticsearchDomain"]

    pages = getJobResults(jobAPI, jobId)

    print("Result pages recieved: {}".format(len(pages)))

    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTable)

    detectForms = False
    detectTables = False
    if (jobAPI == "StartDocumentAnalysis"):
        detectForms = True
        detectTables = True

    dynamodb = AwsHelper().getResource('dynamodb')
    ddb = dynamodb.Table(outputTable)

    opg = OutputGenerator(jobTag, pages, outputBucketName, objectName,
                          detectForms, detectTables, ddb, elasticsearchDomain)
    opg_output = opg.run()

    generatePdf(jobTag, bucketName, objectName, outputBucketName)

    # generate Comprehend and ComprehendMedical entities
    path = objectName + "-analysis" + "/" + jobTag + "/"
    print("path: " + path)
    maxPages = 100
    comprehendClient = ComprehendHelper()
    comprehendAndMedicalEntities = comprehendClient.processComprehend(
        outputBucketName, 'response.json', path, maxPages)

    print("DocumentId: {}".format(jobTag))

    # index document once the comprehend entities and KVPairs have been extracted
    for key, val in opg_output[KVPAIRS].items():
        if key not in comprehendAndMedicalEntities:
            comprehendAndMedicalEntities[key] = val
        else:
            comprehendAndMedicalEntities[key].add(val)
    opg.indexDocument(opg_output[DOCTEXT], comprehendAndMedicalEntities)

    ds = datastore.DocumentStore(documentsTable, outputTable)
    ds.markDocumentComplete(jobTag)

    output = "Processed -> Document: {}, Object: {}/{} processed.".format(
        jobTag, bucketName, objectName)

    return {'statusCode': 200, 'body': output}
Example #12
0
def processRequest(request):

    output = ""

    print("Request : {}".format(request))

    jobId = request['jobId']
    documentId = request['jobTag']
    jobStatus = request['jobStatus']
    jobAPI = request['jobAPI']
    bucketName = request['bucketName']
    outputBucketName = request['outputBucketName']
    objectName = request['objectName']
    outputTable = request["outputTable"]
    documentsTable = request["documentsTable"]
    elasticsearchDomain = request["elasticsearchDomain"]

    pages = getJobResults(jobAPI, jobId)

    print("Result pages recieved: {}".format(len(pages)))

    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTable)

    detectForms = False
    detectTables = False
    if (jobAPI == "StartDocumentAnalysis"):
        detectForms = True
        detectTables = True

    dynamodb = AwsHelper().getResource('dynamodb')
    ddb = dynamodb.Table(outputTable)

    outputPath = '{}{}/{}'.format(PUBLIC_PATH_S3_PREFIX, documentId,
                                  SERVICE_OUTPUT_PATH_S3_PREFIX)
    print("Generating output for DocumentId: {} and storing in {}".format(
        documentId, outputPath))

    opg = OutputGenerator(documentId, pages, outputBucketName, objectName,
                          detectForms, detectTables, ddb, outputPath,
                          elasticsearchDomain)
    opg_output = opg.run()

    generatePdf(documentId, bucketName, objectName, outputBucketName,
                outputPath)

    # generate Comprehend and ComprehendMedical entities
    comprehendOutputPath = "{}{}".format(outputPath, COMPREHEND_PATH_S3_PREFIX)
    print("Comprehend output path: " + comprehendOutputPath)
    maxPages = 100
    comprehendClient = ComprehendHelper()
    responseDocumentName = "{}{}response.json".format(outputPath,
                                                      TEXTRACT_PATH_S3_PREFIX)
    comprehendAndMedicalEntities = comprehendClient.processComprehend(
        outputBucketName, responseDocumentName, comprehendOutputPath, maxPages)

    # if Kendra is available then let it index the document
    if 'KENDRA_INDEX_ID' in os.environ:
        kendraClient = KendraHelper()
        fileName = os.path.basename(objectName).split(".")[0]
        fileExtension = os.path.basename(objectName).split(".")[1]
        outputDocumentName = "{}{}-searchable.pdf".format(outputPath, fileName)
        kendraClient.indexDocument(os.environ['KENDRA_INDEX_ID'],
                                   os.environ['KENDRA_ROLE_ARN'], bucketName,
                                   outputDocumentName, documentId,
                                   fileExtension)

    print("DocumentId: {}".format(documentId))
    print("Processed Comprehend data: {}".format(comprehendAndMedicalEntities))

    # index document once the comprehend entities and KVPairs have been extracted
    for key, val in opg_output[KVPAIRS].items():
        if key not in comprehendAndMedicalEntities:
            comprehendAndMedicalEntities[key] = val
        else:
            comprehendAndMedicalEntities[key].add(val)
    opg.indexDocument(opg_output[DOCTEXT], comprehendAndMedicalEntities)

    ds = datastore.DocumentStore(documentsTable, outputTable)
    ds.markDocumentComplete(documentId)

    output = "Processed -> Document: {}, Object: {}/{} processed.".format(
        documentId, bucketName, objectName)

    return {'statusCode': 200, 'body': output}