def markDocumentComplete(self, documentId):
    """Flag an existing document item as successfully processed.

    Sets ``documentStatus`` to ``SUCCEEDED`` and ``documentCompletedOn`` to
    the current UTC time on the item keyed by ``documentId`` in the
    documents table. The update only applies when the item already exists.

    Args:
        documentId: Key of the document item to update.

    Returns:
        None when the update is applied, or
        ``{'Error': 'Document does not exist.'}`` when the existence
        condition fails.

    Raises:
        ClientError: Any DynamoDB error other than a failed condition.
    """
    documentsTable = AwsHelper().getResource("dynamodb").Table(
        self._documentsTableName)

    try:
        documentsTable.update_item(
            Key={'documentId': documentId},
            UpdateExpression='SET documentStatus= :documentstatusValue, documentCompletedOn = :documentCompletedOnValue',
            ConditionExpression='attribute_exists(documentId)',
            ExpressionAttributeValues={
                ':documentstatusValue': "SUCCEEDED",
                ':documentCompletedOnValue': str(datetime.datetime.utcnow()),
            },
        )
    except ClientError as e:
        errorInfo = e.response['Error']
        # Only the failed existence check is translated into an error dict.
        if errorInfo['Code'] != "ConditionalCheckFailedException":
            raise
        print(errorInfo['Message'])
        return {'Error': 'Document does not exist.'}

    return None
def queryByIndexBothKeys(self, indexPartitionKey, indexSortKey):
    """List the data from database based on index partition key and index sort key.

    Queries the configured index for items whose index partition key equals
    ``indexPartitionKey`` and whose index sort key equals ``indexSortKey``.
    Every result page is fetched by following ``LastEvaluatedKey``, so the
    result is not cut off after the first page.

    Args:
        indexPartitionKey(str): partition key value of index
        indexSortKey(str): sort key value of index

    Returns:
        List of matching items. The list is empty when the configured
        database is not 'dynamodb' or when the query fails (the error is
        printed, not raised).
    """
    if self._databaseName != 'dynamodb':
        return []

    dynamodb = AwsHelper().getResource(self._databaseName, self._awsRegion)
    table = dynamodb.Table(self._tableName)

    items = []
    try:
        queryArgs = {
            'IndexName': self._indexName,
            'KeyConditionExpression': (
                Key(self._indexPartitionKeyName).eq(indexPartitionKey)
                & Key(self._indexSortKeyName).eq(indexSortKey)),
        }
        while True:
            response = table.query(**queryArgs)
            items.extend(response.get('Items', []))
            lastKey = response.get('LastEvaluatedKey')
            if not lastKey:
                break
            # More pages remain: resume the query where this page stopped.
            queryArgs['ExclusiveStartKey'] = lastKey
    except ParamValidationError as e:
        print("Parameter validation error: %s" % e)
        return []
    except ClientError as e:
        print("Unexpected error: %s" % e)
        return []

    return items
def getDocumentCount(self):
    """Return the ``item_count`` attribute of the documents table resource."""
    documentsTable = AwsHelper().getResource("dynamodb").Table(
        self._documentsTableName)
    return documentsTable.item_count
def createDocument(self, documentId, bucketName, objectName):
    """Create the item for a new document in the documents table.

    Writes the bucket name, object name, an ``IN_PROGRESS`` status and the
    current UTC time, but only when no item with ``documentId`` exists yet.

    Args:
        documentId: Key of the document item to create.
        bucketName: Value stored in the ``bucketName`` attribute.
        objectName: Value stored in the ``objectName`` attribute.

    Returns:
        None on success, or ``{"Error": "Document already exist."}`` when
        the item is already present.

    Raises:
        ClientError: Any DynamoDB error other than a failed condition.
    """
    documentsTable = AwsHelper().getResource("dynamodb").Table(
        self._documentsTableName)

    try:
        documentsTable.update_item(
            Key={"documentId": documentId},
            UpdateExpression="SET bucketName = :bucketNameValue, objectName = :objectNameValue, documentStatus = :documentstatusValue, documentCreatedOn = :documentCreatedOnValue",
            ConditionExpression="attribute_not_exists(documentId)",
            ExpressionAttributeValues={
                ":bucketNameValue": bucketName,
                ":objectNameValue": objectName,
                ":documentstatusValue": "IN_PROGRESS",
                ":documentCreatedOnValue": str(datetime.datetime.utcnow()),
            },
        )
    except ClientError as e:
        print(e)
        errorInfo = e.response["Error"]
        if errorInfo["Code"] != "ConditionalCheckFailedException":
            raise
        print(errorInfo["Message"])
        return {"Error": "Document already exist."}

    return None
def markDocumentComplete(self, documentId):
    """Set an existing document's status to ``SUCCEEDED``.

    Also records the current UTC time in ``documentCompletedOn``. The
    update is conditional on the item already existing.

    Args:
        documentId: Key of the document item to update.

    Returns:
        None on success, or ``{"Error": "Document does not exist."}`` when
        the existence condition fails.

    Raises:
        ClientError: Any DynamoDB error other than a failed condition.
    """
    resource = AwsHelper().getResource("dynamodb")
    documentsTable = resource.Table(self._documentsTableName)
    outcome = None

    try:
        documentsTable.update_item(
            Key={"documentId": documentId},
            UpdateExpression="SET documentStatus= :documentstatusValue, documentCompletedOn = :documentCompletedOnValue",
            ConditionExpression="attribute_exists(documentId)",
            ExpressionAttributeValues={
                ":documentstatusValue": "SUCCEEDED",
                ":documentCompletedOnValue": str(datetime.datetime.utcnow()),
            },
        )
    except ClientError as e:
        conditionFailed = (
            e.response["Error"]["Code"] == "ConditionalCheckFailedException")
        if not conditionFailed:
            raise
        print(e.response["Error"]["Message"])
        outcome = {"Error": "Document does not exist."}

    return outcome
def callTextract(bucketName, objectName, detectText, detectForms, detectTables):
    """Call Textract on an S3 object and return the raw response.

    Uses ``detect_document_text`` when neither forms nor tables are
    requested; otherwise uses ``analyze_document`` with the requested
    feature types. ``detectText`` is accepted but not consulted.

    Args:
        bucketName: S3 bucket holding the document.
        objectName: S3 key of the document.
        detectText: Unused by this function.
        detectForms: Truthy to request the ``FORMS`` feature.
        detectTables: Truthy to request the ``TABLES`` feature.

    Returns:
        The response returned by the Textract client call.
    """
    textract = AwsHelper().getClient('textract')
    document = {'S3Object': {'Bucket': bucketName, 'Name': objectName}}

    if detectForms or detectTables:
        # TABLES is listed before FORMS, as in the original request order.
        requested = ((detectTables, "TABLES"), (detectForms, "FORMS"))
        features = [feature for wanted, feature in requested if wanted]
        return textract.analyze_document(Document=document,
                                         FeatureTypes=features)

    return textract.detect_document_text(Document=document)
def save(self, info):
    """Store the data into database.

    Nothing is written unless the configured database is 'dynamodb'. Every
    value in ``info`` must be truthy; the first falsy one aborts the save
    before any AWS resource is created.

    Args:
        info(dict): information to store

    Returns:
        dict: ``{'status': 'OK'}`` on success (or when the database is not
        'dynamodb'), otherwise ``{'status': 'BAD', 'error': <message>}``
        for an empty value or a failed write.
    """
    response = {'status': 'OK'}
    if self._databaseName != 'dynamodb':
        return response

    # Validate before touching AWS so bad input costs no network setup.
    for key in info:
        if not info[key]:
            response['status'] = 'BAD'
            response['error'] = '{} should not be empty.'.format(key)
            print(response['error'])
            return response

    dynamodb = AwsHelper().getResource(self._databaseName, self._awsRegion)
    table = dynamodb.Table(self._tableName)
    try:
        table.put_item(Item=info)
    except ParamValidationError as e:
        # Report the failure to the caller instead of claiming success.
        response['status'] = 'BAD'
        response['error'] = "Parameter validation error: %s" % e
        print(response['error'])
    except ClientError as e:
        response['status'] = 'BAD'
        response['error'] = "Unexpected error: %s" % e
        print(response['error'])

    return response
def processImage(documentId, features, bucketName, outputBucketName,
                 objectName, outputTableName, documentsTableName,
                 elasticsearchDomain):
    """Run the processing pipeline for one document image.

    Steps, in order: call ``callTextract``, run ``OutputGenerator``, call
    ``generatePdf``, run ``ComprehendHelper.processComprehend``, optionally
    index the searchable PDF through ``KendraHelper`` (only when the
    ``KENDRA_INDEX_ID`` environment variable is set), merge the key/value
    pairs into the Comprehend entities, index the document via the output
    generator, and mark the document complete in the document store.

    Args:
        documentId: Identifier of the document being processed.
        features: Container checked for "Text", "Forms" and "Tables".
        bucketName: Bucket holding the input object.
        outputBucketName: Bucket passed to the output-producing helpers.
        objectName: Key of the input object.
        outputTableName: DynamoDB table name passed to the output generator.
        documentsTableName: DynamoDB table name used by the document store.
        elasticsearchDomain: Passed through to ``OutputGenerator``.
    """
    detectText = "Text" in features
    detectForms = "Forms" in features
    detectTables = "Tables" in features

    response = callTextract(bucketName, objectName, detectText, detectForms,
                            detectTables)

    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTableName)

    outputPath = '{}{}/{}'.format(PUBLIC_PATH_S3_PREFIX, documentId,
                                  SERVICE_OUTPUT_PATH_S3_PREFIX)
    print("Generating output for DocumentId: {} and storing in {}".format(
        documentId, outputPath))

    opg = OutputGenerator(documentId, response, outputBucketName, objectName,
                          detectForms, detectTables, ddb, outputPath,
                          elasticsearchDomain)
    opg_output = opg.run()

    generatePdf(documentId, bucketName, objectName, outputBucketName,
                outputPath)

    # generate Comprehend and ComprehendMedical entities in S3
    comprehendOutputPath = "{}{}".format(outputPath, COMPREHEND_PATH_S3_PREFIX)
    print("Comprehend output path: " + comprehendOutputPath)
    maxPages = 100
    comprehendClient = ComprehendHelper()
    responseDocumentName = "{}{}response.json".format(outputPath,
                                                      TEXTRACT_PATH_S3_PREFIX)
    comprehendAndMedicalEntities = comprehendClient.processComprehend(
        outputBucketName, responseDocumentName, comprehendOutputPath, maxPages)

    # if Kendra is available then let it index the document
    # index the searchable pdf in Kendra
    if 'KENDRA_INDEX_ID' in os.environ:
        kendraClient = KendraHelper()
        baseName = os.path.basename(objectName)
        # The name stem is deliberately left as "text before the first dot":
        # the searchable PDF key below is built from it (presumably matching
        # what generatePdf wrote - confirm before changing).
        fileName = baseName.split(".")[0]
        # The extension is the text after the LAST dot; empty when the name
        # has no dot (the old split(".")[1] raised IndexError in that case
        # and returned a middle segment for names such as "a.b.pdf").
        _, dot, suffix = baseName.rpartition(".")
        fileExtension = suffix if dot else ""
        outputDocumentName = "{}{}-searchable.pdf".format(outputPath, fileName)
        kendraClient.indexDocument(os.environ['KENDRA_INDEX_ID'],
                                   os.environ['KENDRA_ROLE_ARN'],
                                   outputBucketName, outputDocumentName,
                                   documentId, fileExtension)

    print("Processed Comprehend data for document: {}".format(documentId))

    # Fold the extracted key/value pairs into the Comprehend entities.
    for key, val in opg_output[KVPAIRS].items():
        if key not in comprehendAndMedicalEntities:
            comprehendAndMedicalEntities[key] = val
        else:
            comprehendAndMedicalEntities[key].add(val)
    opg.indexDocument(opg_output[DOCTEXT], comprehendAndMedicalEntities)

    ds = datastore.DocumentStore(documentsTableName, outputTableName)
    ds.markDocumentComplete(documentId)
def updateDocumentStatus(self, documentId, documentStatus, jobId=None):
    """Write a new status and job id onto an existing document item.

    Args:
        documentId: Key of the document item to update.
        documentStatus: Value stored in ``documentStatus``.
        jobId: Value stored in ``jobId`` (defaults to None).

    Returns:
        None on success, or ``{'Error': 'Document does not exist.'}`` when
        the existence condition fails.

    Raises:
        ClientError: Any DynamoDB error other than a failed condition.
    """
    documentsTable = AwsHelper().getResource("dynamodb").Table(
        self._documentsTableName)
    newValues = {
        ':documentstatusValue': documentStatus,
        ':jobIdValue': jobId,
    }

    try:
        documentsTable.update_item(
            Key={'documentId': documentId},
            UpdateExpression='SET documentStatus= :documentstatusValue, jobId= :jobIdValue',
            ConditionExpression='attribute_exists(documentId)',
            ExpressionAttributeValues=newValues,
        )
    except ClientError as e:
        errorInfo = e.response['Error']
        if errorInfo['Code'] != "ConditionalCheckFailedException":
            raise
        print(errorInfo['Message'])
        return {'Error': 'Document does not exist.'}

    return None
def createDocument(self, documentId, bucketName, objectName):
    """Insert a new ``IN_PROGRESS`` document item if none exists yet.

    Args:
        documentId: Key of the document item to create.
        bucketName: Value stored in the ``bucketName`` attribute.
        objectName: Value stored in the ``objectName`` attribute.

    Returns:
        None on success, or ``{'Error': 'Document already exist.'}`` when
        an item with this id is already present.

    Raises:
        ClientError: Any DynamoDB error other than a failed condition.
    """
    resource = AwsHelper().getResource("dynamodb")
    documentsTable = resource.Table(self._documentsTableName)
    outcome = None

    try:
        documentsTable.update_item(
            Key={"documentId": documentId},
            UpdateExpression='SET bucketName = :bucketNameValue, objectName = :objectNameValue, documentStatus = :documentstatusValue, documentCreatedOn = :documentCreatedOnValue',
            ConditionExpression='attribute_not_exists(documentId)',
            ExpressionAttributeValues={
                ':bucketNameValue': bucketName,
                ':objectNameValue': objectName,
                ':documentstatusValue': 'IN_PROGRESS',
                ':documentCreatedOnValue': str(datetime.datetime.utcnow()),
            },
        )
    except ClientError as e:
        print(e)
        alreadyExists = (
            e.response['Error']['Code'] == "ConditionalCheckFailedException")
        if not alreadyExists:
            raise
        print(e.response['Error']['Message'])
        outcome = {'Error': 'Document already exist.'}

    return outcome
def getDocument(self, documentId):
    """Fetch one document item and flatten its string attributes.

    Args:
        documentId: Key of the document item to read.

    Returns:
        A dict with ``documentId``, ``bucketName``, ``objectName`` and
        ``documentStatus`` taken from the item's ``S`` values, or None when
        the response carries no ``Item``.
    """
    client = AwsHelper().getClient("dynamodb")
    result = client.get_item(
        Key={"documentId": {"S": documentId}},
        TableName=self._documentsTableName)

    if "Item" not in result:
        return None

    item = result["Item"]
    wantedFields = ("documentId", "bucketName", "objectName", "documentStatus")
    return {field: item[field]["S"] for field in wantedFields}
def getDocument(self, documentId):
    """Look up a document item by id using the low-level DynamoDB client.

    Args:
        documentId: Key of the document item to read.

    Returns:
        None when the response has no ``Item``; otherwise a dict holding the
        string values of ``documentId``, ``bucketName``, ``objectName`` and
        ``documentStatus``.
    """
    lookup = AwsHelper().getClient("dynamodb").get_item(
        Key={'documentId': {'S': documentId}},
        TableName=self._documentsTableName)

    record = None
    if 'Item' in lookup:
        stored = lookup['Item']
        record = {}
        for attribute in ('documentId', 'bucketName', 'objectName',
                          'documentStatus'):
            record[attribute] = stored[attribute]['S']
    return record
def _startJob(self):
    """Start an asynchronous Textract job and return its job id.

    Starts text detection when neither forms nor tables are requested in
    ``self.inputParameters``; otherwise starts document analysis with the
    requested feature types.

    Returns:
        The ``JobId`` value from the Textract response.
    """
    params = self.inputParameters
    client = AwsHelper().getClient('textract', params.awsRegion)

    if params.detectForms or params.detectTables:
        features = []
        if params.detectTables:
            features.append("TABLES")
        if params.detectForms:
            features.append("FORMS")
        startCall = client.start_document_analysis
        extraArgs = {'FeatureTypes': features}
    else:
        startCall = client.start_document_text_detection
        extraArgs = {}

    documentLocation = {
        'S3Object': {
            'Bucket': params.bucketName,
            'Name': params.documentPath,
        }
    }
    response = startCall(DocumentLocation=documentLocation, **extraArgs)
    return response["JobId"]
def getDocuments(self, nextToken=None):
    """Scan one page (up to 25 items) of the ops table.

    Args:
        nextToken: ``documentId`` to resume scanning after; falsy starts
            from the beginning.

    Returns:
        ``{"documents": [...]}``, plus a ``"nextToken"`` entry when the scan
        response reports a ``LastEvaluatedKey``.
    """
    opsTable = AwsHelper().getResource("dynamodb").Table(self._opsTableName)

    scanArgs = {"Limit": 25}
    if nextToken:
        scanArgs["ExclusiveStartKey"] = {"documentId": nextToken}
    response = opsTable.scan(**scanArgs)
    print("response: {}".format(response))

    documents = {
        "documents": response['Items'] if 'Items' in response else []
    }

    if 'LastEvaluatedKey' in response:
        nextToken = response['LastEvaluatedKey']['documentId']
        print("nexToken: {}".format(nextToken))
        documents["nextToken"] = nextToken

    return documents
def processImage(documentId, features, bucketName, objectName,
                 outputTableName, documentsTableName):
    """Process one document image and mark it complete.

    Calls ``callTextract`` with flags derived from ``features``, runs
    ``OutputGenerator`` on the response, then marks the document complete
    through ``datastore.DocumentStore``.

    Args:
        documentId: Identifier of the document being processed.
        features: Container checked for "Text", "Forms" and "Tables".
        bucketName: Bucket holding the input object.
        objectName: Key of the input object.
        outputTableName: DynamoDB table name passed to the output generator.
        documentsTableName: DynamoDB table name used by the document store.
    """
    wantsText, wantsForms, wantsTables = (
        "Text" in features, "Forms" in features, "Tables" in features)

    textractResponse = callTextract(bucketName, objectName, wantsText,
                                    wantsForms, wantsTables)

    outputTable = AwsHelper().getResource("dynamodb").Table(outputTableName)

    print("Generating output for DocumentId: {}".format(documentId))
    generator = OutputGenerator(documentId, textractResponse, bucketName,
                                objectName, wantsForms, wantsTables,
                                outputTable)
    generator.run()
    print("DocumentId: {}".format(documentId))

    store = datastore.DocumentStore(documentsTableName, outputTableName)
    store.markDocumentComplete(documentId)
def read_from_s3(aws_env):
    """Read an S3 object and return its content decoded to text if possible.

    The encoding is guessed with ``chardet``; when no encoding is reported,
    "utf-8" is used. If decoding fails, a second attempt is made without the
    last byte, and if that fails too the raw bytes are returned.

    Args:
        aws_env: Mapping providing ``bucketName``, ``objectName`` and
            ``awsRegion``.

    Returns:
        The decoded string, the raw bytes when decoding fails twice, or
        None when the object cannot be read (the error is printed).
    """
    bucket_name = aws_env['bucketName']
    s3_file_name = aws_env['objectName']
    aws_region = aws_env['awsRegion']

    s3 = AwsHelper().getResource('s3', aws_region)
    obj = s3.Object(bucket_name, s3_file_name)

    try:
        content = obj.get()['Body'].read()
    except Exception as e:
        # Best-effort read: the failure is reported and None is returned.
        print(e)
        return None

    # chardet reports None when it cannot guess (e.g. empty content);
    # bytes.decode(None) would raise TypeError, so fall back to utf-8.
    encoding = chardet.detect(content)['encoding'] or "utf-8"

    try:
        print("Trying to decode with {}".format(encoding))
        return content.decode(encoding)
    except UnicodeDecodeError as e:
        print("Failing to decode with encoding {0}: {1}".format(encoding, e))

    try:
        print("Trying by removing the last character")
        return content[:-1].decode(encoding)
    except UnicodeDecodeError as e:
        print("Failing to decode: {}".format(e))

    print("Returning content in bytes")
    return content
def callTextract(bucketName, objectName):
    """Run Textract ``detect_document_text`` on an S3 object.

    Args:
        bucketName: S3 bucket holding the document.
        objectName: S3 key of the document.

    Returns:
        The response returned by the Textract client.
    """
    s3Location = {'Bucket': bucketName, 'Name': objectName}
    client = AwsHelper().getClient('textract')
    return client.detect_document_text(Document={'S3Object': s3Location})
def lambda_handler(event, context):
    """Lambda entry point: extract bounding-box data for a PDF.

    Returns early with an ``errorMessage`` when ``event['status']`` is not
    positive. Otherwise builds ``aws_env`` from the event plus environment
    variables, copies the PDF to a temp folder, and, unless
    ``TEXTRACT_ONLY`` is not "false" or the PDF lacks enough characters,
    extracts line or word bounding boxes and writes them to S3. Finally it
    rewrites several ``aws_env`` fields to point at the text output and
    returns ``update_event(aws_env, event)``.

    NOTE(review): the statement nesting below was reconstructed from a
    whitespace-collapsed source; confirm the trailing ``aws_env[...]``
    assignments really sit at function level.

    Args:
        event: Mapping with at least ``status``, ``objectName`` and
            ``s3Url`` (all read below).
        context: Lambda context object; not used.

    Returns:
        Either the event with an added ``errorMessage`` or the result of
        ``update_event(aws_env, event)``.
    """
    if (event['status'] <= 0):
        return {**event, "errorMessage": "Status isnt positive"}
    # NOTE(review): int(os.environ.get('MIN_CHAR_NEEDED')) raises TypeError
    # when the variable is unset - confirm it is always configured.
    aws_env = {
        **event,
        "bucketName": os.environ.get('DOCUMENTS_BUCKET'),
        "awsRegion": 'eu-west-1',
        "tmpJsonOutput": "/tmp/tmp_result.json",
        "tmpTxtOutput": "/tmp/tmp_result.txt",
        "outputBucket": os.environ.get('DOCUMENTS_BUCKET'),
        "outputNameJson": get_bbox_filename(event['objectName'], ".json"),
        "outputNameTxt": get_bbox_filename(event['objectName'], ".txt"),
        "textractOnly": os.environ.get('TEXTRACT_ONLY'),
        "minCharNeeded": int(os.environ.get('MIN_CHAR_NEEDED')),
        "extract_pdf_lines": os.environ.get('EXTRACT_PDF_LINES'),
    }
    status = {"statusCode": 200, "body": "All right"}
    extract_pdf_lines = aws_env['extract_pdf_lines']
    textract_only = aws_env['textractOnly']
    tmp_folder = "/tmp/pdfToBbox"
    pdf_tmp_path = copy_pdf_to_tmp(tmp_folder, aws_env)
    print("==> aws_env: ", aws_env)
    # The flags are compared as the literal strings "false" / "true".
    if textract_only == "false" and is_pdf_has_enough_characters(
            pdf_tmp_path, aws_env['minCharNeeded']) is True:
        print("=> Extracting bounding box with pdfplumber")
        if extract_pdf_lines == "true":
            print("=> Extracting pdf lines bbox")
            pdf = Pdf(pdf_tmp_path, aws_env['tmpJsonOutput'],
                      aws_env['tmpTxtOutput'])
            pdf.parse_pdf()
            pdf.save_in_json()
            pdf.save_in_txt()
            write_bbox_to_s3(aws_env)
        else:
            print("=> Extracting pdf words bbox")
            # A truthy return value is treated as failure here.
            if execute_pdf_to_bbox(pdf_tmp_path, aws_env['tmpJsonOutput']):
                print("=> Error while trying to get pdf information")
                aws_env["status"] = -1
                aws_env["errorMessage"] = "PDF format not supported."
            else:
                write_bbox_to_s3(aws_env)
    else:
        print("Extracting bounding box with textract")
        #send_to_textract(aws_env)
    aws_env['size'] = S3Helper.getS3FileSize(aws_env['bucketName'],
                                             aws_env['outputNameTxt'],
                                             aws_env['awsRegion'])
    aws_env["s3Url"] = get_new_s3_url(aws_env['s3Url'], "txt")
    # NOTE(review): the dict assigned here is overwritten by the very next
    # line, and both lines also replace the -1 / error message set in the
    # failure branch above - confirm whether that is intended.
    aws_env["status"] = status
    aws_env["status"] = 1
    aws_env["errorMessage"] = None
    aws_env["contentType"] = "text/txt"
    aws_env['objectName'] = aws_env['outputNameTxt']
    aws_env["sourceUrl"] = aws_env["s3Url"]
    AwsHelper.refreshTmpFolder(tmp_folder)
    return update_event(aws_env, event)
def TranslateCaptions(self, translationContext, terminology_names=None):
    """Start an Amazon Translate batch job for the first target language.

    Only ``translationContext["targetLangList"][0]`` is translated. If
    ``terminology_names`` holds an entry whose ``TargetLanguageCodes``
    contains that language, the first such entry's ``Name`` is passed as
    custom terminology.

    Args:
        translationContext: Mapping providing ``sourceLang``,
            ``targetLangList``, ``roleArn``, ``bucket``, ``inputLocation``,
            ``outputlocation`` and ``jobPrefix``.
        terminology_names: Optional iterable of dicts with ``Name`` and
            ``TargetLanguageCodes`` keys. None or empty means none.

    Returns:
        ``{"JobId": ..., "TargetLanguageCode": ...}`` for the started job.

    Raises:
        Exception: Any error from the job setup or start is logged via
            ``self.logger.error`` and re-raised.
    """
    sourceLanguageCode = translationContext["sourceLang"]
    targetLanguageCodes = translationContext["targetLangList"]
    translate_role = translationContext["roleArn"]
    bucket = translationContext["bucket"]
    inputPath = translationContext["inputLocation"]
    outputPath = translationContext["outputlocation"]
    jobPrefix = translationContext["jobPrefix"]

    try:
        translate_client = AwsHelper().getClient('translate')
        targetLanguageCode = targetLanguageCodes[0]
        self.logger.debug(
            "Starting translation to {}".format(targetLanguageCode))
        singletonTargetList = [targetLanguageCode]

        # Millisecond timestamp is appended to the prefix to name the job.
        millis = int(round(time.time() * 1000))
        job_name = jobPrefix + str(millis)
        self.logger.debug("JobName: {}".format(job_name))

        # Use at most one terminology: the first that covers the target.
        terminology_name = []
        for item in terminology_names or []:
            if targetLanguageCode in item['TargetLanguageCodes']:
                terminology_name.append(item['Name'])
                break

        if not terminology_name:
            # Previously wrapped in int(...), which raised ValueError and
            # aborted every translation that had no custom terminology.
            self.logger.debug("No custom terminology specified.")
        else:
            self.logger.debug(
                "Using custom terminology {}".format(terminology_name))

        # Start the batch job that reads from inputPath and writes to outputPath.
        response = translate_client.start_text_translation_job(
            JobName=job_name,
            InputDataConfig={
                'S3Uri': "s3://{}/{}".format(bucket, inputPath),
                'ContentType': "text/html"
            },
            OutputDataConfig={
                'S3Uri': "s3://{}/{}".format(bucket, outputPath)
            },
            DataAccessRoleArn=translate_role,
            SourceLanguageCode=sourceLanguageCode,
            TargetLanguageCodes=singletonTargetList,
            TerminologyNames=terminology_name)

        return {
            "JobId": response["JobId"],
            "TargetLanguageCode": targetLanguageCode
        }
    except Exception as e:
        self.logger.error(e)
        raise
def deleteItem(self, itemId):
    """Delete the item keyed by ``itemId`` from the items table.

    Args:
        itemId: Key of the item to delete.
    """
    itemsTable = AwsHelper().getResource("dynamodb").Table(
        self._itemsTableName)
    itemKey = {'itemId': itemId}
    itemsTable.delete_item(Key=itemKey)
def deleteDocument(self, documentId):
    """Delete the item keyed by ``documentId`` from the ops table.

    Args:
        documentId: Key of the document item to delete.
    """
    opsTable = AwsHelper().getResource("dynamodb").Table(self._opsTableName)
    documentKey = {'documentId': documentId}
    opsTable.delete_item(Key=documentKey)
def processRequest(request):
    """Handle a finished Textract job: store output and notify via SQS.

    Fetches the job result pages with ``getJobResults``, runs
    ``OutputGenerator`` on them, marks the document complete in the
    document store, and posts a message describing the document to the
    queue at ``request["elasticQueueUrl"]``.

    Args:
        request: Mapping providing ``jobId``, ``jobTag``, ``jobStatus``,
            ``jobAPI``, ``bucketName``, ``objectName``, ``outputTable``,
            ``outputBucket``, ``documentsTable`` and ``elasticQueueUrl``.

    Returns:
        ``{'statusCode': 200, 'body': <summary string>}``.
    """
    print(request)

    jobId = request['jobId']
    jobTag = request['jobTag']
    jobStatus = request['jobStatus']  # read but not used further
    jobAPI = request['jobAPI']
    bucketName = request['bucketName']
    objectName = request['objectName']
    outputTable = request["outputTable"]
    outputBucket = request["outputBucket"]
    documentsTable = request["documentsTable"]
    qUrl = request["elasticQueueUrl"]

    pages = getJobResults(jobAPI, jobId)
    print("Result pages recieved: {}".format(len(pages)))

    ddb = AwsHelper().getResource("dynamodb").Table(outputTable)

    # Forms and tables are both enabled only for analysis jobs.
    isAnalysisJob = jobAPI == "StartDocumentAnalysis"
    detectForms = isAnalysisJob
    detectTables = isAnalysisJob

    generator = OutputGenerator(jobId, jobTag, pages, outputBucket,
                                objectName, detectForms, detectTables, ddb)
    generator.run()
    print("DocumentId: {}".format(jobTag))

    store = datastore.DocumentStore(documentsTable, outputTable)
    store.markDocumentComplete(jobTag, jobId)

    sqsClient = AwsHelper().getClient('sqs')
    postMessage(sqsClient, qUrl, {
        'documentId': jobTag,
        'jobId': jobId,
        'bucketName': outputBucket,
        'objectName': objectName,
    })

    output = "Processed -> Document: {}, Object: {}/{} processed.".format(
        jobTag, bucketName, objectName)
    print(output)
    return {'statusCode': 200, 'body': output}
def updateDocumentStatus(self, documentId, status, stage, timestamp, message=None):
    """Update a document's status and stage and extend its timeline.

    Sets ``documentStatus``, ``documentStage`` and ``lastUpdate`` on the
    existing ops-table item and appends a datapoint to ``timeline``. The
    datapoint carries ``message`` only when one is given.

    Args:
        documentId: Key of the document item to update.
        status: New status value.
        stage: New stage value.
        timestamp: Value stored in ``lastUpdate`` and in the datapoint.
        message: Optional text added to the datapoint when truthy.

    Returns:
        ``{'Status': 200}`` on success; on ``ClientError`` a dict with the
        service error message and HTTP status code; on any other exception
        ``{'Error': 'Updating document failed', 'Status': 400}``.
    """
    opsTable = AwsHelper().getResource("dynamodb").Table(self._opsTableName)

    try:
        datapoint = {
            "timestamp": timestamp,
            "stage": stage,
            "status": status,
        }
        if message:
            datapoint["message"] = message

        opsTable.update_item(
            Key={'documentId': documentId},
            UpdateExpression='SET documentStatus = :documentStatus, documentStage = :documentStage, lastUpdate = :lastUpdate, timeline = list_append(timeline, :new_datapoint)',
            ConditionExpression='attribute_exists(documentId)',
            ExpressionAttributeValues={
                ':documentStatus': status,
                ':documentStage': stage,
                ':lastUpdate': timestamp,
                ':new_datapoint': [datapoint],
            },
        )
    except ClientError as e:
        print(e)
        return {
            'Error': e.response['Error']['Message'],
            'Status': e.response['ResponseMetadata']['HTTPStatusCode'],
        }
    except Exception as e:
        print(e)
        return {'Error': 'Updating document failed', 'Status': 400}

    return {'Status': 200}
def processImage(documentId, features, bucketName, outputBucketName,
                 objectName, outputTableName, documentsTableName,
                 elasticsearchDomain):
    """Run the processing pipeline for one document image.

    Steps, in order: call ``callTextract``, run ``OutputGenerator``, call
    ``generatePdf``, run ``ComprehendHelper.processComprehend``, merge the
    key/value pairs into the Comprehend entities, index the document via
    the output generator, and mark the document complete.

    Args:
        documentId: Identifier of the document being processed.
        features: Container checked for "Text", "Forms" and "Tables".
        bucketName: Bucket holding the input object.
        outputBucketName: Bucket passed to the output-producing helpers.
        objectName: Key of the input object.
        outputTableName: DynamoDB table name passed to the output generator.
        documentsTableName: DynamoDB table name used by the document store.
        elasticsearchDomain: Passed through to ``OutputGenerator``.
    """
    wantsText = "Text" in features
    wantsForms = "Forms" in features
    wantsTables = "Tables" in features

    textractResponse = callTextract(bucketName, objectName, wantsText,
                                    wantsForms, wantsTables)

    outputTable = AwsHelper().getResource("dynamodb").Table(outputTableName)

    outputPath = '{}{}/{}'.format(PUBLIC_PATH_S3_PREFIX, documentId,
                                  SERVICE_OUTPUT_PATH_S3_PREFIX)
    print("Generating output for DocumentId: {} and storing in {}".format(
        documentId, outputPath))

    generator = OutputGenerator(documentId, textractResponse,
                                outputBucketName, objectName, wantsForms,
                                wantsTables, outputTable, outputPath,
                                elasticsearchDomain)
    generated = generator.run()

    generatePdf(documentId, bucketName, objectName, outputBucketName,
                outputPath)

    # generate Comprehend and ComprehendMedical entities in S3
    comprehendOutputPath = "{}{}".format(outputPath, COMPREHEND_PATH_S3_PREFIX)
    print("Comprehend output path: " + comprehendOutputPath)
    maxPages = 100
    responseDocumentName = "{}{}response.json".format(outputPath,
                                                      TEXTRACT_PATH_S3_PREFIX)
    entities = ComprehendHelper().processComprehend(
        outputBucketName, responseDocumentName, comprehendOutputPath, maxPages)

    print("DocumentId: {}".format(documentId))
    print("Processed Comprehend data: {}".format(entities))

    # Fold the extracted key/value pairs into the Comprehend entities.
    for key, val in generated[KVPAIRS].items():
        if key in entities:
            entities[key].add(val)
        else:
            entities[key] = val
    generator.indexDocument(generated[DOCTEXT], entities)

    store = datastore.DocumentStore(documentsTableName, outputTableName)
    store.markDocumentComplete(documentId)
def queryDocumentId(self, targetBucketName, targetFileName, versionId=None):
    """Find the document id recorded for a bucket/file(/version) signature.

    Builds the signature ``BUCKET:<bucket>@FILE:<file>`` (plus
    ``@VERSION:<id>`` when ``versionId`` is given), queries the lineage
    index for it, and picks the matching item with the earliest ISO
    ``timestamp``.

    Args:
        targetBucketName: Bucket part of the signature.
        targetFileName: File part of the signature.
        versionId: Optional version part of the signature.

    Returns:
        ``{'Status': 200, 'documentId': ...}`` when found;
        ``{'Status': 404, 'documentId': None}`` when nothing matches;
        otherwise a dict with ``Error`` and ``Status`` describing the
        failure. A failed query now reports its own error instead of
        being replaced by the generic "Could not find" message.
    """
    dynamodb = AwsHelper().getResource("dynamodb")
    table = dynamodb.Table(self._lineageTableName)

    documentSignature = "BUCKET:{}@FILE:{}".format(targetBucketName,
                                                   targetFileName)
    if versionId:
        documentSignature += "@VERSION:{}".format(versionId)

    try:
        res = table.query(
            KeyConditionExpression=Key('documentSignature').eq(documentSignature),
            IndexName=self._lineageIndexName
        )
    except ClientError as e:
        print(e)
        # Return here: there is no result to inspect, and falling through
        # used to overwrite this specific error with a generic one.
        return {
            'Error': e.response['Error']['Message'],
            'Status': e.response['ResponseMetadata']['HTTPStatusCode']
        }
    except Exception as e:
        print(e)
        return {
            'Error': 'Unknown error occurred during querying the document Id',
            'Status': 400
        }

    try:
        items = res['Items']
        print(items)
        if not items:
            return {'Status': 404, 'documentId': None}
        # Earliest timestamp wins; ties keep the first item, as the stable
        # sort followed by items[0] did before.
        earliest = min(
            items, key=lambda item: datetime.fromisoformat(item['timestamp']))
        return {'Status': 200, 'documentId': earliest['documentId']}
    except Exception as e:
        print(e)
        return {
            'Error': 'Could not find the documentId for specified document Signature',
            'Status': 400
        }
def processRequest(request):
    """Route an uploaded object to the sync or async processing queue.

    Objects with a jpg/jpeg/png extension go to ``request['syncQueueUrl']``;
    mov/mp4 objects go to ``request['asyncQueueUrl']``. Objects with any
    other (or no) extension are not routed.

    Args:
        request: Mapping providing ``itemId``, ``bucketName``,
            ``objectName`` and the queue URL for the matching extension.

    Returns:
        The value returned by ``postMessage`` when a message was posted,
        otherwise None. Previously an unsupported extension raised
        UnboundLocalError.
    """
    print("request: {}".format(request))

    itemId = request["itemId"]
    bucketName = request["bucketName"]
    objectName = request["objectName"]

    print("Input Object: {}/{}".format(bucketName, objectName))

    ext = FileHelper.getFileExtenstion(objectName.lower())
    print("Extension: {}".format(ext))

    # Default to "no queue" so unsupported extensions are handled cleanly.
    qUrl = None
    if ext in ("jpg", "jpeg", "png"):
        qUrl = request['syncQueueUrl']
    elif ext in ("mov", "mp4"):
        qUrl = request['asyncQueueUrl']

    response = None
    if qUrl:
        jsonMessage = {
            'itemId': itemId,
            'bucketName': bucketName,
            'objectName': objectName
        }
        client = AwsHelper().getClient('sqs')
        response = postMessage(client, qUrl, jsonMessage)
        output = "Completed routing for itemId: {}, object: {}/{}".format(
            itemId, bucketName, objectName)
    else:
        output = "No queue for itemId: {}, object: {}/{}".format(
            itemId, bucketName, objectName)

    print(output)
    return response
def processItems(qUrl, snsTopic, snsRole):
    """Start a job for each queued message until a service limit is hit.

    Each message is passed to ``processItem`` and then deleted from the
    queue. When a ``LimitExceededException`` or
    ``ProvisionedThroughputExceededException`` occurs, the remaining
    messages only have ``changeVisibility`` applied, and the last such
    exception is re-raised after the loop.

    Args:
        qUrl: URL of the SQS queue to read from.
        snsTopic: Passed through to ``processItem``.
        snsRole: Passed through to ``processItem``.

    Returns:
        Tuple ``(totalMessages, startedJobs)``.

    Raises:
        Exception: The last limit/throughput exception seen, if any.
    """
    sqs = AwsHelper().getClient('sqs')
    messages = getMessagesFromQueue(sqs, qUrl)

    startedJobs = 0
    if not messages:
        return 0, startedJobs

    totalMessages = len(messages)
    print("Total messages: {}".format(totalMessages))

    throttleNames = ('LimitExceededException',
                     "ProvisionedThroughputExceededException")
    limitReached = False
    limitError = None

    for message in messages:
        receiptHandle = message['ReceiptHandle']
        try:
            if limitReached:
                changeVisibility(sqs, qUrl, receiptHandle)
            else:
                print("starting job...")
                processItem(message, snsTopic, snsRole)
                print("started job...")
                print('Deleting item from queue...')
                # Delete received message from queue
                sqs.delete_message(QueueUrl=qUrl, ReceiptHandle=receiptHandle)
                print('Deleted item from queue...')
                startedJobs += 1
        except Exception as e:
            print("Error while starting job or deleting from queue: {}".format(e))
            changeVisibility(sqs, qUrl, receiptHandle)
            if e.__class__.__name__ in throttleNames:
                limitReached = True
                limitError = e

    if limitReached:
        raise limitError

    return totalMessages, startedJobs
def startJob(bucketName, objectName, documentId, snsTopic, snsRole, detectForms, detectTables):
    """Start an asynchronous Textract job with SNS notification.

    Starts text detection when neither forms nor tables are requested;
    otherwise starts document analysis with the requested feature types.
    ``documentId`` is used as both the client request token and job tag.

    Args:
        bucketName: S3 bucket holding the document.
        objectName: S3 key of the document.
        documentId: Identifier used as request token and job tag.
        snsTopic: SNS topic ARN for the notification channel.
        snsRole: Role ARN for the notification channel.
        detectForms: Truthy to request the ``FORMS`` feature.
        detectTables: Truthy to request the ``TABLES`` feature.

    Returns:
        The ``JobId`` value from the Textract response.
    """
    print("Starting job with documentId: {}, bucketName: {}, objectName: {}".
          format(documentId, bucketName, objectName))

    client = AwsHelper().getClient('textract')

    # Arguments shared by both Textract start calls.
    commonArgs = {
        'ClientRequestToken': documentId,
        'DocumentLocation': {
            'S3Object': {'Bucket': bucketName, 'Name': objectName}
        },
        'NotificationChannel': {"RoleArn": snsRole, "SNSTopicArn": snsTopic},
        'JobTag': documentId,
    }

    if detectForms or detectTables:
        features = []
        if detectTables:
            features.append("TABLES")
        if detectForms:
            features.append("FORMS")
        response = client.start_document_analysis(FeatureTypes=features,
                                                  **commonArgs)
    else:
        response = client.start_document_text_detection(**commonArgs)

    return response["JobId"]
def getComprehend(languageCode="en"):
    """Build a small Comprehend facade bound to one language code.

    The client is created for the region reported by a new
    ``boto3.Session``.

    Args:
        languageCode: Language code sent with every request.

    Returns:
        ``{"entities": fn}`` where ``fn(text)`` returns the result of
        ``detect_entities`` for ``text``.
    """
    sessionRegion = boto3.Session().region_name
    comprehend = AwsHelper().getClient('comprehend', sessionRegion)

    def entities(text):
        return comprehend.detect_entities(Text=text,
                                          LanguageCode=languageCode)

    return {"entities": entities}
def startTranslationJob(bucketName, sourceCode, destCode, access_role):
    """Start a Translate batch job from ``xmlin/`` to ``xmlout/``.

    The job id is printed; nothing is returned. A ``ClientError`` is logged
    through ``logger.error`` and not re-raised.

    Args:
        bucketName: Bucket containing the ``xmlin/`` and ``xmlout/`` prefixes.
        sourceCode: Source language code.
        destCode: Single target language code.
        access_role: Data access role ARN passed to the job.
    """
    translate = AwsHelper().getClient('translate')
    try:
        # Millisecond timestamp is embedded in the job name.
        jobStamp = int(round(time.time() * 1000))
        inputConfig = {
            'S3Uri': "s3://{}/xmlin/".format(bucketName),
            'ContentType': 'text/html',
        }
        outputConfig = {'S3Uri': "s3://{}/xmlout/".format(bucketName)}
        job = translate.start_text_translation_job(
            JobName="TranslateJob-json-{}".format(jobStamp),
            InputDataConfig=inputConfig,
            OutputDataConfig=outputConfig,
            DataAccessRoleArn=access_role,
            SourceLanguageCode=sourceCode,
            TargetLanguageCodes=[destCode])
        print(job["JobId"])
    except ClientError as e:
        logger.error("An error occured starting the Translate Batch Job: %s" % e)