def processImage(documentId, features, bucketName, objectName, outputTableName, documentsTableName):
    """Run synchronous Textract on an S3 object, persist the generated
    output via OutputGenerator, and mark the document complete.

    `features` is checked for the strings "Text", "Forms" and "Tables"
    to decide which Textract capabilities to request.
    """
    # Translate the requested feature names into Textract flags.
    wantText = "Text" in features
    wantForms = "Forms" in features
    wantTables = "Tables" in features

    textractResponse = callTextract(bucketName, objectName, wantText, wantForms, wantTables)

    outputDdbTable = AwsHelper().getResource("dynamodb").Table(outputTableName)

    print("Generating output for DocumentId: {}".format(documentId))
    generator = OutputGenerator(documentId, textractResponse, bucketName, objectName,
                                wantForms, wantTables, outputDdbTable)
    generator.run()
    print("DocumentId: {}".format(documentId))

    # Flag the document as processed in the document store.
    store = datastore.DocumentStore(documentsTableName, outputTableName)
    store.markDocumentComplete(documentId)
def processImage(documentId, features, bucketName, outputBucketName, objectName, outputTableName, documentsTableName, elasticsearchDomain):
    """Synchronously Textract an image, write all derived artifacts
    (raw output, searchable PDF, Comprehend entities, optional Kendra
    index entry), index the text into Elasticsearch, and mark the
    document complete.

    Args:
        documentId: unique id used in output paths and the document store.
        features: iterable checked for "Text"/"Forms"/"Tables".
        bucketName: bucket holding the source object.
        outputBucketName: bucket receiving all generated artifacts.
        objectName: key of the source object.
        outputTableName: DynamoDB table for Textract output metadata.
        documentsTableName: DynamoDB table tracking document state.
        elasticsearchDomain: ES domain used by OutputGenerator for indexing.
    """
    detectText = "Text" in features
    detectForms = "Forms" in features
    detectTables = "Tables" in features

    response = callTextract(bucketName, objectName, detectText, detectForms, detectTables)

    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTableName)

    outputPath = '{}{}/{}'.format(PUBLIC_PATH_S3_PREFIX, documentId, SERVICE_OUTPUT_PATH_S3_PREFIX)
    print("Generating output for DocumentId: {} and storing in {}".format(documentId, outputPath))

    opg = OutputGenerator(documentId, response, outputBucketName, objectName,
                          detectForms, detectTables, ddb, outputPath, elasticsearchDomain)
    opg_output = opg.run()

    generatePdf(documentId, bucketName, objectName, outputBucketName, outputPath)

    # generate Comprehend and ComprehendMedical entities in S3
    comprehendOutputPath = "{}{}".format(outputPath, COMPREHEND_PATH_S3_PREFIX)
    print("Comprehend output path: " + comprehendOutputPath)
    maxPages = 100
    comprehendClient = ComprehendHelper()
    responseDocumentName = "{}{}response.json".format(outputPath, TEXTRACT_PATH_S3_PREFIX)
    comprehendAndMedicalEntities = comprehendClient.processComprehend(
        outputBucketName, responseDocumentName, comprehendOutputPath, maxPages)

    # if Kendra is available then let it index the searchable pdf
    if 'KENDRA_INDEX_ID' in os.environ:
        kendraClient = KendraHelper()
        # BUGFIX: split the basename once and tolerate keys without an
        # extension — the previous `split(".")[1]` raised IndexError on
        # extension-less object names.
        baseNameParts = os.path.basename(objectName).split(".")
        fileName = baseNameParts[0]
        fileExtension = baseNameParts[1] if len(baseNameParts) > 1 else ""
        outputDocumentName = "{}{}-searchable.pdf".format(outputPath, fileName)
        kendraClient.indexDocument(os.environ['KENDRA_INDEX_ID'],
                                   os.environ['KENDRA_ROLE_ARN'],
                                   outputBucketName,
                                   outputDocumentName,
                                   documentId,
                                   fileExtension)

    print("Processed Comprehend data for document: {}".format(documentId))

    # Merge Textract key/value pairs into the Comprehend entity map
    # before indexing the full document text.
    for key, val in opg_output[KVPAIRS].items():
        if key not in comprehendAndMedicalEntities:
            comprehendAndMedicalEntities[key] = val
        else:
            comprehendAndMedicalEntities[key].add(val)
    opg.indexDocument(opg_output[DOCTEXT], comprehendAndMedicalEntities)

    ds = datastore.DocumentStore(documentsTableName, outputTableName)
    ds.markDocumentComplete(documentId)
def processRequest(request):
    """Process a finished async Textract job: persist the output, push a
    message onto the downstream indexing queue, and mark the document
    complete. Returns a dict with 'statusCode' and a summary 'body'."""
    print(request)

    jobId = request['jobId']
    jobTag = request['jobTag']
    jobStatus = request['jobStatus']
    jobAPI = request['jobAPI']
    bucketName = request['bucketName']
    objectName = request['objectName']
    outputTable = request["outputTable"]
    outputBucket = request["outputBucket"]
    documentsTable = request["documentsTable"]
    qUrl = request["elasticQueueUrl"]

    pages = getJobResults(jobAPI, jobId)
    print("Result pages recieved: {}".format(len(pages)))

    ddb = AwsHelper().getResource("dynamodb").Table(outputTable)

    # Forms/tables blocks only exist for the document-analysis API.
    isAnalysisJob = (jobAPI == "StartDocumentAnalysis")
    detectForms = isAnalysisJob
    detectTables = isAnalysisJob

    generator = OutputGenerator(jobId, jobTag, pages, outputBucket, objectName,
                                detectForms, detectTables, ddb)
    generator.run()
    print("DocumentId: {}".format(jobTag))

    store = datastore.DocumentStore(documentsTable, outputTable)
    store.markDocumentComplete(jobTag, jobId)

    # Hand the document off to the indexing pipeline via SQS.
    jsonMessage = {
        'documentId': jobTag,
        'jobId': jobId,
        'bucketName': outputBucket,
        'objectName': objectName
    }
    sqs = AwsHelper().getClient('sqs')
    postMessage(sqs, qUrl, jsonMessage)

    output = "Processed -> Document: {}, Object: {}/{} processed.".format(
        jobTag, bucketName, objectName)
    print(output)
    return {'statusCode': 200, 'body': output}
def processRequest(request):
    """Handle a completed async Textract job: persist outputs, render the
    searchable PDF and mark the document complete.

    Args:
        request: dict carrying job metadata — jobId, jobTag, jobStatus,
            jobAPI, bucketName, outputBucketName, objectName, outputTable,
            documentsTable, elasticsearchDomain.
    Returns:
        dict with 'statusCode' and a human-readable 'body'.
    """
    output = ""
    print(request)

    jobId = request['jobId']
    jobTag = request['jobTag']
    jobStatus = request['jobStatus']
    jobAPI = request['jobAPI']
    bucketName = request['bucketName']
    outputBucketName = request['outputBucketName']
    objectName = request['objectName']
    outputTable = request["outputTable"]
    documentsTable = request["documentsTable"]
    elasticsearchDomain = request["elasticsearchDomain"]

    pages = getJobResults(jobAPI, jobId)
    print("Result pages recieved: {}".format(len(pages)))

    # Forms/tables blocks only exist for the document-analysis API.
    detectForms = False
    detectTables = False
    if (jobAPI == "StartDocumentAnalysis"):
        detectForms = True
        detectTables = True

    # FIX: the DynamoDB resource and Table handle were previously created
    # twice (once before and once after the feature flags); once suffices.
    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTable)

    opg = OutputGenerator(jobTag, pages, outputBucketName, objectName,
                          detectForms, detectTables, ddb, elasticsearchDomain)
    opg.run()

    generatePdf(jobTag, bucketName, objectName, outputBucketName)

    print("DocumentId: {}".format(jobTag))
    ds = datastore.DocumentStore(documentsTable, outputTable)
    ds.markDocumentComplete(jobTag)

    output = "Processed -> Document: {}, Object: {}/{} processed.".format(
        jobTag, bucketName, objectName)
    print(output)
    return {'statusCode': 200, 'body': output}
def processImage(documentId, features, bucketName, outputBucketName, objectName, outputTableName, documentsTableName, elasticsearchDomain):
    """Textract an image synchronously, generate outputs and a searchable
    PDF under the public output path, extract Comprehend entities, index
    the document text, and mark it complete."""
    wantText = "Text" in features
    wantForms = "Forms" in features
    wantTables = "Tables" in features

    response = callTextract(bucketName, objectName, wantText, wantForms, wantTables)

    ddb = AwsHelper().getResource("dynamodb").Table(outputTableName)

    outputPath = '{}{}/{}'.format(PUBLIC_PATH_S3_PREFIX, documentId, SERVICE_OUTPUT_PATH_S3_PREFIX)
    print("Generating output for DocumentId: {} and storing in {}".format(documentId, outputPath))

    generator = OutputGenerator(documentId, response, outputBucketName, objectName,
                                wantForms, wantTables, ddb, outputPath, elasticsearchDomain)
    generatorOutput = generator.run()

    generatePdf(documentId, bucketName, objectName, outputBucketName, outputPath)

    # Derive Comprehend / ComprehendMedical entities from the stored
    # Textract response JSON.
    comprehendOutputPath = "{}{}".format(outputPath, COMPREHEND_PATH_S3_PREFIX)
    print("Comprehend output path: " + comprehendOutputPath)
    responseDocumentName = "{}{}response.json".format(outputPath, TEXTRACT_PATH_S3_PREFIX)
    entities = ComprehendHelper().processComprehend(
        outputBucketName, responseDocumentName, comprehendOutputPath, 100)

    print("DocumentId: {}".format(documentId))
    print("Processed Comprehend data: {}".format(entities))

    # Fold Textract key/value pairs into the entity map, then index.
    for key, val in generatorOutput[KVPAIRS].items():
        if key in entities:
            entities[key].add(val)
        else:
            entities[key] = val
    generator.indexDocument(generatorOutput[DOCTEXT], entities)

    datastore.DocumentStore(documentsTableName, outputTableName).markDocumentComplete(documentId)
def processRequest(request):
    """Handle a completed async Textract job: persist the generated output
    and mark the document complete.

    Args:
        request: dict carrying jobId, jobTag, jobStatus, jobAPI,
            bucketName, objectName, outputTable, documentsTable.
    Returns:
        dict with 'statusCode' and a human-readable 'body'.
    """
    output = ""
    print(request)

    jobId = request["jobId"]
    jobTag = request["jobTag"]
    jobStatus = request["jobStatus"]
    jobAPI = request["jobAPI"]
    bucketName = request["bucketName"]
    objectName = request["objectName"]
    outputTable = request["outputTable"]
    documentsTable = request["documentsTable"]

    pages = getJobResults(jobAPI, jobId)
    print("Result pages recieved: {}".format(len(pages)))

    # Forms/tables blocks only exist for the document-analysis API.
    detectForms = False
    detectTables = False
    if jobAPI == "StartDocumentAnalysis":
        detectForms = True
        detectTables = True

    # FIX: the DynamoDB resource and Table handle were previously created
    # twice (once before and once after the feature flags); once suffices.
    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTable)

    opg = OutputGenerator(
        jobTag, pages, bucketName, objectName, detectForms, detectTables, ddb
    )
    opg.run()

    print("DocumentId: {}".format(jobTag))
    ds = datastore.DocumentStore(documentsTable, outputTable)
    ds.markDocumentComplete(jobTag)

    output = "Processed -> Document: {}, Object: {}/{} processed.".format(
        jobTag, bucketName, objectName
    )
    print(output)
    return {"statusCode": 200, "body": output}
def processImage(documentId, features, bucketName, outputBucketName, objectName, outputTableName, documentsTableName, elasticsearchDomain):
    """Synchronous Textract of one image plus searchable-PDF generation,
    Comprehend entity extraction, Elasticsearch indexing, and completion
    marking in the document store."""
    wantText = "Text" in features
    wantForms = "Forms" in features
    wantTables = "Tables" in features

    response = callTextract(bucketName, objectName, wantText, wantForms, wantTables)

    ddb = AwsHelper().getResource("dynamodb").Table(outputTableName)

    print("Generating output for DocumentId: {}".format(documentId))
    generator = OutputGenerator(documentId, response, outputBucketName, objectName,
                                wantForms, wantTables, ddb, elasticsearchDomain)
    generatorOutput = generator.run()

    generatePdf(documentId, bucketName, objectName, outputBucketName)

    # Comprehend / ComprehendMedical entities are written under this prefix.
    path = objectName + "-analysis" + "/" + documentId + "/"
    print("path: " + path)
    entities = ComprehendHelper().processComprehend(outputBucketName, 'response.json', path, 100)

    print("DocumentId: {}".format(documentId))
    print("Processed Comprehend data: {}".format(entities))

    # Fold Textract key/value pairs into the entity map, then index.
    for key, val in generatorOutput[KVPAIRS].items():
        if key in entities:
            entities[key].add(val)
        else:
            entities[key] = val
    generator.indexDocument(generatorOutput[DOCTEXT], entities)

    datastore.DocumentStore(documentsTableName, outputTableName).markDocumentComplete(documentId)
def processImage(features, bucket_name, object_name):
    """Run synchronous Textract on one S3 object and return the structure
    produced by OutputGenerator."""
    want_tables = "detectTables" in features
    want_text = "detectText" in features

    response = callTextract(bucket_name, object_name, want_text, want_tables)
    print("Generating output for DocumentId: {}".format(object_name))

    # Page metadata feeds the generator alongside the raw Textract response.
    metadata = get_page_metadata(bucket_name=bucket_name, object_name=object_name)

    generator = OutputGenerator(response,
                                bucketName=bucket_name,
                                objectName=object_name,
                                tables=want_tables,
                                metadata=metadata)
    result = generator.run()
    print("DocumentId: {}".format(object_name))
    return result
def processDocument(self, ips, i, document):
    """Textract a single document (per the `ips` settings dict) and write
    the generated outputs — and optional insights — to the output folder."""
    print("\nTextracting Document # {}: {}".format(i, document))
    print('=' * (len(document) + 30))

    # Run Textract through the document processor.
    processor = DocumentProcessor(ips["bucketName"], document, ips["awsRegion"],
                                  ips["text"], ips["forms"], ips["tables"])
    response = processor.run()
    print("Recieved Textract response...")

    # Generate output files named after the original file and extension.
    print("Generating output...")
    name, ext = FileHelper.getFileNameAndExtension(document)
    generator = OutputGenerator(
        response,
        os.path.join(ips["output"], "{}-{}".format(name, ext)),
        ips["forms"], ips["tables"])
    generator.run()

    # Insights / medical insights / translation are optional extras.
    if ips["insights"] or ips["medical-insights"] or ips["translate"]:
        generator.generateInsights(ips["insights"], ips["medical-insights"],
                                   ips["translate"], ips["awsRegion"])
    print("{} textracted successfully.".format(document))
def processImage(documentId, bucketName, objectName, callerId):
    """Textract an image, write the raw outputs (tagged with the document
    id) to the Textract bucket, and record lineage for the transfer."""
    response = callTextract(bucketName, objectName)
    print("Generating output for documentId: {}".format(documentId))

    generator = OutputGenerator(documentId=documentId,
                                response=response,
                                bucketName=textractBucketName,
                                objectName=objectName,
                                forms=False,
                                tables=False)
    # Tag the outputs so they can be traced back to this document.
    generator.writeTextractOutputs(taggingStr="documentId={}".format(documentId))

    lineage_client.recordLineage({
        "documentId": documentId,
        "callerId": callerId,
        "sourceBucketName": bucketName,
        "targetBucketName": textractBucketName,
        "sourceFileName": objectName,
        "targetFileName": objectName
    })
def generateOutput(filePath, response):
    """Write v2-prefixed output files for a Textract response, then
    generate insights (translation target 'es', region us-east-1)."""
    print("Generating output...")
    name, ext = FileHelper.getFileNameAndExtension(filePath)
    generator = OutputGenerator(response, "{}-v2-{}".format(name, ext), True, True)
    generator.run()
    generator.generateInsights(True, True, 'es', 'us-east-1')
def processRequest(request):
    """Handle a completed async Textract job: persist outputs, render the
    searchable PDF, extract Comprehend entities, index into Elasticsearch
    and mark the document complete.

    Args:
        request: dict carrying jobId, jobTag, jobStatus, jobAPI,
            bucketName, outputBucketName, objectName, outputTable,
            documentsTable, elasticsearchDomain.
    Returns:
        dict with 'statusCode' and a human-readable 'body'.
    """
    output = ""
    print(request)

    jobId = request['jobId']
    jobTag = request['jobTag']
    jobStatus = request['jobStatus']
    jobAPI = request['jobAPI']
    bucketName = request['bucketName']
    outputBucketName = request['outputBucketName']
    objectName = request['objectName']
    outputTable = request["outputTable"]
    documentsTable = request["documentsTable"]
    elasticsearchDomain = request["elasticsearchDomain"]

    pages = getJobResults(jobAPI, jobId)
    print("Result pages recieved: {}".format(len(pages)))

    # Forms/tables blocks only exist for the document-analysis API.
    detectForms = False
    detectTables = False
    if (jobAPI == "StartDocumentAnalysis"):
        detectForms = True
        detectTables = True

    # FIX: the DynamoDB resource and Table handle were previously created
    # twice (once before and once after the feature flags); once suffices.
    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTable)

    opg = OutputGenerator(jobTag, pages, outputBucketName, objectName,
                          detectForms, detectTables, ddb, elasticsearchDomain)
    opg_output = opg.run()

    generatePdf(jobTag, bucketName, objectName, outputBucketName)

    # generate Comprehend and ComprehendMedical entities
    path = objectName + "-analysis" + "/" + jobTag + "/"
    print("path: " + path)
    maxPages = 100
    comprehendClient = ComprehendHelper()
    comprehendAndMedicalEntities = comprehendClient.processComprehend(
        outputBucketName, 'response.json', path, maxPages)
    print("DocumentId: {}".format(jobTag))

    # index document once the comprehend entities and KVPairs have been extracted
    for key, val in opg_output[KVPAIRS].items():
        if key not in comprehendAndMedicalEntities:
            comprehendAndMedicalEntities[key] = val
        else:
            comprehendAndMedicalEntities[key].add(val)
    opg.indexDocument(opg_output[DOCTEXT], comprehendAndMedicalEntities)

    ds = datastore.DocumentStore(documentsTable, outputTable)
    ds.markDocumentComplete(jobTag)

    output = "Processed -> Document: {}, Object: {}/{} processed.".format(
        jobTag, bucketName, objectName)
    # FIX: log the summary for parity with the sibling processRequest
    # handlers, which all print it before returning.
    print(output)
    return {'statusCode': 200, 'body': output}
def processRequest(request):
    """Handle a completed async Textract job end-to-end: store outputs,
    render the searchable PDF, extract Comprehend entities, optionally
    index into Kendra, index into Elasticsearch, and mark the document
    complete.

    Args:
        request: dict carrying jobId, jobTag (used as documentId),
            jobStatus, jobAPI, bucketName, outputBucketName, objectName,
            outputTable, documentsTable, elasticsearchDomain.
    Returns:
        dict with 'statusCode' and a human-readable 'body'.
    """
    output = ""
    print("Request : {}".format(request))

    jobId = request['jobId']
    documentId = request['jobTag']
    jobStatus = request['jobStatus']
    jobAPI = request['jobAPI']
    bucketName = request['bucketName']
    outputBucketName = request['outputBucketName']
    objectName = request['objectName']
    outputTable = request["outputTable"]
    documentsTable = request["documentsTable"]
    elasticsearchDomain = request["elasticsearchDomain"]

    pages = getJobResults(jobAPI, jobId)
    print("Result pages recieved: {}".format(len(pages)))

    # Forms/tables blocks only exist for the document-analysis API.
    detectForms = False
    detectTables = False
    if (jobAPI == "StartDocumentAnalysis"):
        detectForms = True
        detectTables = True

    # FIX: the DynamoDB resource and Table handle were previously created
    # twice (once before and once after the feature flags); once suffices.
    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTable)

    outputPath = '{}{}/{}'.format(PUBLIC_PATH_S3_PREFIX, documentId, SERVICE_OUTPUT_PATH_S3_PREFIX)
    print("Generating output for DocumentId: {} and storing in {}".format(documentId, outputPath))

    opg = OutputGenerator(documentId, pages, outputBucketName, objectName,
                          detectForms, detectTables, ddb, outputPath, elasticsearchDomain)
    opg_output = opg.run()

    generatePdf(documentId, bucketName, objectName, outputBucketName, outputPath)

    # generate Comprehend and ComprehendMedical entities
    comprehendOutputPath = "{}{}".format(outputPath, COMPREHEND_PATH_S3_PREFIX)
    print("Comprehend output path: " + comprehendOutputPath)
    maxPages = 100
    comprehendClient = ComprehendHelper()
    responseDocumentName = "{}{}response.json".format(outputPath, TEXTRACT_PATH_S3_PREFIX)
    comprehendAndMedicalEntities = comprehendClient.processComprehend(
        outputBucketName, responseDocumentName, comprehendOutputPath, maxPages)

    # if Kendra is available then let it index the document
    if 'KENDRA_INDEX_ID' in os.environ:
        kendraClient = KendraHelper()
        # BUGFIX: split the basename once and tolerate keys without an
        # extension — the previous `split(".")[1]` raised IndexError on
        # extension-less object names.
        baseNameParts = os.path.basename(objectName).split(".")
        fileName = baseNameParts[0]
        fileExtension = baseNameParts[1] if len(baseNameParts) > 1 else ""
        outputDocumentName = "{}{}-searchable.pdf".format(outputPath, fileName)
        # BUGFIX: the searchable PDF is generated into outputBucketName
        # (see generatePdf above), so Kendra must read from that bucket;
        # the source bucketName does not contain outputDocumentName.
        kendraClient.indexDocument(os.environ['KENDRA_INDEX_ID'],
                                   os.environ['KENDRA_ROLE_ARN'],
                                   outputBucketName,
                                   outputDocumentName,
                                   documentId,
                                   fileExtension)

    print("DocumentId: {}".format(documentId))
    print("Processed Comprehend data: {}".format(comprehendAndMedicalEntities))

    # index document once the comprehend entities and KVPairs have been extracted
    for key, val in opg_output[KVPAIRS].items():
        if key not in comprehendAndMedicalEntities:
            comprehendAndMedicalEntities[key] = val
        else:
            comprehendAndMedicalEntities[key].add(val)
    opg.indexDocument(opg_output[DOCTEXT], comprehendAndMedicalEntities)

    ds = datastore.DocumentStore(documentsTable, outputTable)
    ds.markDocumentComplete(documentId)

    output = "Processed -> Document: {}, Object: {}/{} processed.".format(
        documentId, bucketName, objectName)
    return {'statusCode': 200, 'body': output}
def runComprehend(bucketName, objectName, callerId):
    """Run Comprehend over Textract output stored in S3, post the per-page
    results to Elasticsearch, archive the payload to S3, and record lineage.

    Pipeline-stage transitions (InProgress/Failed/Succeeded) are reported
    through the module-level pipeline_client on every path.
    """
    comprehend = AwsHelper().getClient('comprehend')
    documentId, documentName = dissectObjectName(objectName)
    # Sanity check: the path-derived documentId must match the object's tag.
    # NOTE(review): assert is stripped under `python -O` — consider raising.
    assert (documentId == S3Helper().getTagsS3(bucketName, objectName).get('documentId', None)), "File path {} does not match the expected documentId tag of the object triggered.".format(objectName)
    # The triggering object is the Textract response JSON itself.
    textractOutputJson = json.loads(S3Helper().readFromS3(bucketName, objectName))
    og = OutputGenerator(response=textractOutputJson, forms=False, tables=False)
    # Register this document with the pipeline tracker before any work.
    pipeline_client.body = {
        "documentId": documentId,
        "bucketName": bucketName,
        "objectName": objectName,
        "stage": PIPELINE_STAGE
    }
    pipeline_client.stageInProgress()
    document = Document(textractOutputJson)
    originalFileName = "{}/{}".format(documentId, documentName)
    comprehendFileName = originalFileName + "/comprehend-output.json"
    # NOTE(review): comprehendFileS3Url is computed but never used below.
    comprehendFileS3Url = "https://{}.s3.amazonaws.com/{}".format(comprehendBucket, urllib.parse.quote_plus(comprehendFileName, safe="/"))
    tagging = "documentId={}".format(documentId)
    es.connect()
    esPayload = []
    page_num = 1
    # Analyze each page: structure its tables/forms/text, then run
    # Comprehend on the text (chunked when over the size limit).
    for page in document.pages:
        table = og.structurePageTable(page)
        forms = og.structurePageForm(page)
        text = og.structurePageText(page)
        keyPhrases = []
        entitiesDetected = {}
        lenOfEncodedText = len(text)
        print("Comprehend documentId {} processing page {}".format(documentId, str(page_num)))
        print("Length of encoded text is " + str(lenOfEncodedText))
        if lenOfEncodedText > COMPREHEND_CHARACTER_LIMIT:
            print("Size was too big to run singularly; breaking up the page text into chunks")
            # Oversized pages are split and sent to Comprehend in batches.
            try:
                chunksOfText = chunkUpTheText(text)
            except Exception as e:
                pipeline_client.stageFailed("Could not determine how to snip the text on page {} into chunks.".format(page_num))
                raise (e)
            keyPhrases, entitiesDetected = batchSendToComprehend(comprehend, chunksOfText, 'en')
        else:
            keyPhrases, entitiesDetected = singularSendToComprehend(comprehend, text, 'en')
        esPageLoad = compileESPayload(es, page_num, keyPhrases, entitiesDetected, text, table, forms, documentId)
        esPayload.append(esPageLoad)
        page_num = page_num + 1
    # Bulk-post all page payloads to Elasticsearch in one request.
    try:
        es.post_bulk(index=esIndex, payload=esPayload)
    except Exception as e:
        pipeline_client.stageFailed("Could not post to Elasticsearch")
        raise (e)
    print("Data uploaded to ES")
    # Archive the same payload to S3, tagged with the document id.
    try:
        S3Helper().writeToS3(json.dumps(esPayload), comprehendBucket, comprehendFileName, taggingStr=tagging)
    except Exception as e:
        pipeline_client.stageFailed("Failed to write comprehend payload to S3")
        raise (e)
    lineage_client.recordLineage({
        "documentId": documentId,
        "callerId": callerId,
        "sourceBucketName": bucketName,
        "targetBucketName": comprehendBucket,
        "sourceFileName": objectName,
        "targetFileName": comprehendFileName
    })
    pipeline_client.stageSucceeded()
    print("Comprehend data uploaded to S3 at {}".format(comprehendFileName))
def processRequest(request):
    """Handle an async Textract completion event: abort on FAILED jobs,
    fetch the result pages, write the Textract outputs, and record
    lineage — reporting each stage transition to the pipeline tracker."""
    status = request['jobStatus']
    jobId = request['jobId']
    jobTag = request['jobTag']
    jobAPI = request['jobAPI']
    bucketName = request['bucketName']
    objectName = request['objectName']

    # Register this document with the pipeline tracker up front.
    pipeline_client.body = {
        "documentId": jobTag,
        "bucketName": bucketName,
        "objectName": objectName,
        "stage": PIPELINE_STAGE
    }

    # A failed Textract job aborts the stage immediately.
    if status == 'FAILED':
        pipeline_client.stageFailed(
            "Textract Analysis didn't complete successfully")
        raise Exception(
            "Textract job for document ID {}; bucketName {} fileName {}; failed during Textract analysis. Please double check the document quality"
            .format(jobTag, bucketName, objectName))

    pipeline_client.stageInProgress()

    try:
        pages = getJobResults(jobAPI, jobId)
    except Exception as e:
        pipeline_client.stageFailed()
        raise (e)
    print("Result pages received: {}".format(len(pages)))

    # Forms/tables blocks only exist for the document-analysis API.
    isAnalysisJob = (jobAPI == "StartDocumentAnalysis")
    detectForms = isAnalysisJob
    detectTables = isAnalysisJob

    try:
        opg = OutputGenerator(documentId=jobTag,
                              response=pages,
                              bucketName=textractBucketName,
                              objectName=objectName,
                              forms=detectForms,
                              tables=detectTables)
    except Exception as e:
        pipeline_client.stageFailed(
            "Could not convert results from Textract into processable object. Try uploading again."
        )
        raise (e)

    # Tag the outputs so they can be traced back to this document.
    opg.writeTextractOutputs(taggingStr="documentId={}".format(jobTag))

    lineage_client.recordLineage({
        "documentId": jobTag,
        "callerId": request["callerId"],
        "sourceBucketName": bucketName,
        "targetBucketName": textractBucketName,
        "sourceFileName": objectName,
        "targetFileName": objectName
    })

    output = "Processed -> Document: {}, Object: {}/{} processed.".format(
        jobTag, bucketName, objectName)
    pipeline_client.stageSucceeded()
    print(output)
    return {'statusCode': 200, 'body': output}