Esempio n. 1
0
    def markDocumentComplete(self, documentId):
        """Mark an existing document as SUCCEEDED and record its completion time.

        Runs a conditional ``update_item`` on the documents table; the
        condition requires an item with this ``documentId`` to already exist.

        Args:
            documentId: Value of the ``documentId`` key of the item to update.

        Returns:
            None when the update succeeds, or
            ``{'Error': 'Document does not exist.'}`` when the conditional
            check fails.

        Raises:
            ClientError: Re-raised for every error code other than
                ``ConditionalCheckFailedException``.
        """
        documentsTable = AwsHelper().getResource("dynamodb").Table(
            self._documentsTableName)

        updateExpression = 'SET documentStatus= :documentstatusValue, documentCompletedOn = :documentCompletedOnValue'
        mustExist = 'attribute_exists(documentId)'

        try:
            documentsTable.update_item(
                Key={'documentId': documentId},
                UpdateExpression=updateExpression,
                ConditionExpression=mustExist,
                ExpressionAttributeValues={
                    ':documentstatusValue': "SUCCEEDED",
                    # Naive UTC timestamp stored in its default str() form.
                    ':documentCompletedOnValue': str(datetime.datetime.utcnow()),
                },
            )
        except ClientError as clientError:
            errorInfo = clientError.response['Error']
            if errorInfo['Code'] != "ConditionalCheckFailedException":
                raise
            # The condition failed: no item carries this documentId.
            print(errorInfo['Message'])
            return {'Error': 'Document does not exist.'}

        return None
Esempio n. 2
0
    def queryByIndexBothKeys(self, indexPartitionKey, indexSortKey):
        """Query the configured index for items matching both index keys.

        Args:
           indexPartitionKey(str): partition key value of index
           indexSortKey(str): sort key value of index

        Returns:
            The ``Items`` list of the query response. An empty list is
            returned when the configured database is not ``'dynamodb'`` or
            when the query fails with a ParamValidationError / ClientError
            (the error is printed, not raised).
        """
        if self._databaseName != 'dynamodb':
            # Only the DynamoDB backend is handled; nothing to query otherwise.
            return []

        resource = AwsHelper().getResource(self._databaseName, self._awsRegion)
        table = resource.Table(self._tableName)

        result = {'Items': []}
        try:
            indexName = self._indexName
            partitionMatch = Key(self._indexPartitionKeyName).eq(indexPartitionKey)
            sortMatch = Key(self._indexSortKeyName).eq(indexSortKey)
            result = table.query(
                IndexName=indexName,
                KeyConditionExpression=partitionMatch & sortMatch,
            )
        except ParamValidationError as validationError:
            print("Parameter validation error: %s" % validationError)
        except ClientError as clientError:
            print("Unexpected error: %s" % clientError)

        return result['Items']
    def getDocumentCount(self):
        """Return the ``item_count`` attribute of the documents table resource.

        NOTE(review): per the boto3 documentation ``item_count`` is a
        periodically refreshed approximation, not a live count — confirm this
        is acceptable for callers.
        """
        documentsTable = AwsHelper().getResource("dynamodb").Table(
            self._documentsTableName)
        return documentsTable.item_count
Esempio n. 4
0
    def createDocument(self, documentId, bucketName, objectName):
        """Create a document record with status IN_PROGRESS if none exists yet.

        Uses a conditional ``update_item`` whose condition requires that no
        item with this ``documentId`` exists.

        Args:
            documentId: Value of the ``documentId`` key for the new item.
            bucketName: Stored in the item's ``bucketName`` attribute.
            objectName: Stored in the item's ``objectName`` attribute.

        Returns:
            None on success, or ``{"Error": "Document already exist."}`` when
            the conditional check fails.

        Raises:
            ClientError: Re-raised for every error code other than
                ``ConditionalCheckFailedException``.
        """
        documentsTable = AwsHelper().getResource("dynamodb").Table(
            self._documentsTableName)

        setClause = "SET bucketName = :bucketNameValue, objectName = :objectNameValue, documentStatus = :documentstatusValue, documentCreatedOn = :documentCreatedOnValue"

        try:
            attributeValues = {
                ":bucketNameValue": bucketName,
                ":objectNameValue": objectName,
                ":documentstatusValue": "IN_PROGRESS",
                # Naive UTC timestamp stored in its default str() form.
                ":documentCreatedOnValue": str(datetime.datetime.utcnow()),
            }
            documentsTable.update_item(
                Key={"documentId": documentId},
                UpdateExpression=setClause,
                ConditionExpression="attribute_not_exists(documentId)",
                ExpressionAttributeValues=attributeValues,
            )
        except ClientError as clientError:
            print(clientError)
            errorInfo = clientError.response["Error"]
            if errorInfo["Code"] != "ConditionalCheckFailedException":
                raise
            # The condition failed: an item with this documentId is present.
            print(errorInfo["Message"])
            return {"Error": "Document already exist."}

        return None
Esempio n. 5
0
    def markDocumentComplete(self, documentId):
        """Set an existing document's status to SUCCEEDED with a completion stamp.

        The update is conditional on an item with this ``documentId`` already
        existing in the documents table.

        Args:
            documentId: Value of the ``documentId`` key of the item to update.

        Returns:
            None on success, or ``{"Error": "Document does not exist."}`` when
            the conditional check fails.

        Raises:
            ClientError: Re-raised for every error code other than
                ``ConditionalCheckFailedException``.
        """
        resource = AwsHelper().getResource("dynamodb")
        documentsTable = resource.Table(self._documentsTableName)

        setClause = "SET documentStatus= :documentstatusValue, documentCompletedOn = :documentCompletedOnValue"

        try:
            completedOn = str(datetime.datetime.utcnow())
            documentsTable.update_item(
                Key={"documentId": documentId},
                UpdateExpression=setClause,
                ConditionExpression="attribute_exists(documentId)",
                ExpressionAttributeValues={
                    ":documentstatusValue": "SUCCEEDED",
                    ":documentCompletedOnValue": completedOn,
                },
            )
        except ClientError as clientError:
            failure = clientError.response["Error"]
            if failure["Code"] == "ConditionalCheckFailedException":
                # No item carries this documentId, so nothing was updated.
                print(failure["Message"])
                return {"Error": "Document does not exist."}
            raise
        else:
            return None
Esempio n. 6
0
def callTextract(bucketName, objectName, detectText, detectForms, detectTables):
    """Run synchronous Textract on an S3 object and return the raw response.

    When neither forms nor tables are requested, plain text detection is used;
    otherwise document analysis is run with the requested feature types.

    Args:
        bucketName: S3 bucket holding the document.
        objectName: S3 key of the document.
        detectText: Accepted for interface compatibility; not read here.
        detectForms: Truthy to request the "FORMS" feature.
        detectTables: Truthy to request the "TABLES" feature.

    Returns:
        The response of ``detect_document_text`` or ``analyze_document``.
    """
    textract = AwsHelper().getClient('textract')
    document = {'S3Object': {'Bucket': bucketName, 'Name': objectName}}

    # Feature order matches the order the flags are checked: tables, then forms.
    requested = (("TABLES", detectTables), ("FORMS", detectForms))
    features = [featureName for featureName, wanted in requested if wanted]

    if not features:
        return textract.detect_document_text(Document=document)
    return textract.analyze_document(Document=document, FeatureTypes=features)
Esempio n. 7
0
    def save(self, info):
        """Store the data into database

        Every value in ``info`` must be truthy; the first key with a falsy
        value aborts the save before any AWS call is made.

        Args:
            info(dict): information to store

        Returns:
            dict: ``{'status': 'OK'}`` when the item was written (or when the
            configured database is not ``'dynamodb'``, in which case nothing
            is written), otherwise ``{'status': 'BAD', 'error': <message>}``.
        """
        response = {'status': 'OK'}
        if self._databaseName != 'dynamodb':
            return response

        # Validate first so an invalid payload never creates an AWS resource.
        for key in info:
            if not info[key]:
                response['status'] = 'BAD'
                response['error'] = key + ' should not be empty.'
                print(response['error'])
                return response

        dynamodb = AwsHelper().getResource(self._databaseName,
                                           self._awsRegion)
        table = dynamodb.Table(self._tableName)
        try:
            table.put_item(Item=info)
        except ParamValidationError as e:
            # Report the failure to the caller instead of claiming success.
            response['status'] = 'BAD'
            response['error'] = "Parameter validation error: %s" % e
            print(response['error'])
        except ClientError as e:
            response['status'] = 'BAD'
            response['error'] = "Unexpected error: %s" % e
            print(response['error'])
        return response
def processImage(documentId, features, bucketName, outputBucketName,
                 objectName, outputTableName, documentsTableName,
                 elasticsearchDomain):
    """Run the synchronous image pipeline for one document.

    Steps, in order: call Textract, generate outputs via OutputGenerator,
    generate the PDF, run Comprehend processing, optionally index the PDF in
    Kendra (when ``KENDRA_INDEX_ID`` is set in the environment), merge the
    key/value pairs into the Comprehend entities, index the document, and
    finally mark the document complete in the document store.

    Args:
        documentId: Identifier of the document being processed.
        features: Container checked for "Text", "Forms" and "Tables".
        bucketName: S3 bucket of the input object.
        outputBucketName: S3 bucket receiving generated output.
        objectName: S3 key of the input object.
        outputTableName: DynamoDB table name passed to OutputGenerator.
        documentsTableName: DynamoDB table name of the document store.
        elasticsearchDomain: Passed through to OutputGenerator.
    """
    detectText = "Text" in features
    detectForms = "Forms" in features
    detectTables = "Tables" in features

    response = callTextract(bucketName, objectName, detectText, detectForms,
                            detectTables)

    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTableName)

    outputPath = '{}{}/{}'.format(PUBLIC_PATH_S3_PREFIX, documentId,
                                  SERVICE_OUTPUT_PATH_S3_PREFIX)
    print("Generating output for DocumentId: {} and storing in {}".format(
        documentId, outputPath))

    opg = OutputGenerator(documentId, response, outputBucketName, objectName,
                          detectForms, detectTables, ddb, outputPath,
                          elasticsearchDomain)
    opg_output = opg.run()

    generatePdf(documentId, bucketName, objectName, outputBucketName,
                outputPath)

    # generate Comprehend and ComprehendMedical entities in S3
    comprehendOutputPath = "{}{}".format(outputPath, COMPREHEND_PATH_S3_PREFIX)
    print("Comprehend output path: " + comprehendOutputPath)
    maxPages = 100
    comprehendClient = ComprehendHelper()
    responseDocumentName = "{}{}response.json".format(outputPath,
                                                      TEXTRACT_PATH_S3_PREFIX)
    comprehendAndMedicalEntities = comprehendClient.processComprehend(
        outputBucketName, responseDocumentName, comprehendOutputPath, maxPages)

    # if Kendra is available then let it index the document
    # index the searchable pdf in Kendra
    if 'KENDRA_INDEX_ID' in os.environ:
        kendraClient = KendraHelper()
        baseName = os.path.basename(objectName)
        # NOTE(review): the file stem is kept as "text before the first dot";
        # presumably it must match the name generatePdf gives the searchable
        # PDF — confirm before changing.
        fileName = baseName.split(".")[0]
        # Take the real extension (text after the last dot, without the dot).
        # The previous split(".")[1] raised IndexError for names without a
        # dot and returned a middle segment for names such as "a.b.pdf".
        fileExtension = os.path.splitext(baseName)[1].lstrip(".")
        outputDocumentName = "{}{}-searchable.pdf".format(outputPath, fileName)
        kendraClient.indexDocument(os.environ['KENDRA_INDEX_ID'],
                                   os.environ['KENDRA_ROLE_ARN'],
                                   outputBucketName, outputDocumentName,
                                   documentId, fileExtension)

    print("Processed Comprehend data for document: {}".format(documentId))

    # Merge Textract key/value pairs into the entity map before indexing.
    for key, val in opg_output[KVPAIRS].items():
        if key not in comprehendAndMedicalEntities:
            comprehendAndMedicalEntities[key] = val
        else:
            # presumably the existing value is a set — verify against
            # ComprehendHelper.processComprehend
            comprehendAndMedicalEntities[key].add(val)
    opg.indexDocument(opg_output[DOCTEXT], comprehendAndMedicalEntities)

    ds = datastore.DocumentStore(documentsTableName, outputTableName)
    ds.markDocumentComplete(documentId)
Esempio n. 9
0
    def updateDocumentStatus(self, documentId, documentStatus, jobId=None):
        """Set the status and job id on an existing document record.

        The update is conditional on an item with this ``documentId`` already
        existing in the documents table.

        Args:
            documentId: Value of the ``documentId`` key of the item to update.
            documentStatus: New value for the ``documentStatus`` attribute.
            jobId: New value for the ``jobId`` attribute (None by default).

        Returns:
            None on success, or ``{'Error': 'Document does not exist.'}`` when
            the conditional check fails.

        Raises:
            ClientError: Re-raised for every error code other than
                ``ConditionalCheckFailedException``.
        """
        documentsTable = AwsHelper().getResource("dynamodb").Table(
            self._documentsTableName)

        newValues = {
            ':documentstatusValue': documentStatus,
            ':jobIdValue': jobId,
        }

        try:
            documentsTable.update_item(
                Key={'documentId': documentId},
                UpdateExpression='SET documentStatus= :documentstatusValue, jobId= :jobIdValue',
                ConditionExpression='attribute_exists(documentId)',
                ExpressionAttributeValues=newValues,
            )
        except ClientError as clientError:
            failure = clientError.response['Error']
            if failure['Code'] != "ConditionalCheckFailedException":
                raise
            # No item carries this documentId, so nothing was updated.
            print(failure['Message'])
            return {'Error': 'Document does not exist.'}

        return None
Esempio n. 10
0
    def createDocument(self, documentId, bucketName, objectName):
        """Insert a new IN_PROGRESS document record unless one already exists.

        Implemented as a conditional ``update_item`` that only succeeds when
        no item with this ``documentId`` is present.

        Args:
            documentId: Value of the ``documentId`` key for the new item.
            bucketName: Stored in the item's ``bucketName`` attribute.
            objectName: Stored in the item's ``objectName`` attribute.

        Returns:
            None on success, or ``{'Error': 'Document already exist.'}`` when
            the conditional check fails.

        Raises:
            ClientError: Re-raised for every error code other than
                ``ConditionalCheckFailedException``.
        """
        resource = AwsHelper().getResource("dynamodb")
        documentsTable = resource.Table(self._documentsTableName)

        setClause = 'SET bucketName = :bucketNameValue, objectName = :objectNameValue, documentStatus = :documentstatusValue, documentCreatedOn = :documentCreatedOnValue'
        mustBeNew = 'attribute_not_exists(documentId)'

        try:
            createdOn = str(datetime.datetime.utcnow())
            documentsTable.update_item(
                Key={"documentId": documentId},
                UpdateExpression=setClause,
                ConditionExpression=mustBeNew,
                ExpressionAttributeValues={
                    ':bucketNameValue': bucketName,
                    ':objectNameValue': objectName,
                    ':documentstatusValue': 'IN_PROGRESS',
                    ':documentCreatedOnValue': createdOn,
                },
            )
        except ClientError as clientError:
            print(clientError)
            failure = clientError.response['Error']
            if failure['Code'] == "ConditionalCheckFailedException":
                # An item with this documentId is already present.
                print(failure['Message'])
                return {'Error': 'Document already exist.'}
            raise

        return None
Esempio n. 11
0
    def getDocument(self, documentId):
        """Fetch a document record and flatten its string attributes.

        Args:
            documentId: Value of the ``documentId`` key to look up.

        Returns:
            A dict with ``documentId``, ``bucketName``, ``objectName`` and
            ``documentStatus`` (each read from the attribute's ``"S"`` value),
            or None when the response contains no ``"Item"``.
        """
        client = AwsHelper().getClient("dynamodb")
        lookup = client.get_item(
            Key={"documentId": {"S": documentId}},
            TableName=self._documentsTableName,
        )

        if "Item" not in lookup:
            return None

        # Low-level client responses wrap each value in a type descriptor;
        # unwrap the string ("S") payload of every returned field.
        fieldNames = ("documentId", "bucketName", "objectName",
                      "documentStatus")
        return {name: lookup["Item"][name]["S"] for name in fieldNames}
    def getDocument(self, documentId):
        """Look up one document item and return its core string fields.

        Args:
            documentId: Value of the ``documentId`` key to look up.

        Returns:
            None when the ``get_item`` response has no ``'Item'``; otherwise a
            dict holding the ``'S'`` values of ``documentId``, ``bucketName``,
            ``objectName`` and ``documentStatus``.
        """
        ddbClient = AwsHelper().getClient("dynamodb")
        getItemResponse = ddbClient.get_item(
            Key={'documentId': {'S': documentId}},
            TableName=self._documentsTableName,
        )

        found = 'Item' in getItemResponse
        if not found:
            return None

        document = {}
        # Same field order as the stored record layout this method exposes.
        for attribute in ('documentId', 'bucketName', 'objectName',
                          'documentStatus'):
            document[attribute] = getItemResponse['Item'][attribute]['S']
        return document
Esempio n. 13
0
    def _startJob(self):
        """Start an asynchronous Textract job for the configured document.

        Uses text detection when neither forms nor tables are requested in
        ``self.inputParameters``; otherwise starts document analysis with the
        requested feature types.

        Returns:
            The ``"JobId"`` value of the Textract start response.
        """
        client = AwsHelper().getClient('textract',
                                       self.inputParameters.awsRegion)
        params = self.inputParameters

        # Feature order matches the order the flags are checked: tables, forms.
        features = []
        if params.detectTables:
            features.append("TABLES")
        if params.detectForms:
            features.append("FORMS")

        location = {
            'S3Object': {
                'Bucket': params.bucketName,
                'Name': params.documentPath,
            }
        }

        if features:
            response = client.start_document_analysis(
                DocumentLocation=location, FeatureTypes=features)
        else:
            response = client.start_document_text_detection(
                DocumentLocation=location)

        return response["JobId"]
Esempio n. 14
0
    def getDocuments(self, nextToken=None):
        """Scan one page (up to 25 items) of the ops table.

        Args:
            nextToken: ``documentId`` to resume the scan after; when falsy the
                scan starts from the beginning.

        Returns:
            A dict with ``"documents"`` (the page's items, or an empty list)
            and, when the scan response carries a ``LastEvaluatedKey``, a
            ``"nextToken"`` holding that key's ``documentId``.
        """
        opsTable = AwsHelper().getResource("dynamodb").Table(self._opsTableName)

        scanArgs = {"Limit": 25}
        if nextToken:
            scanArgs["ExclusiveStartKey"] = {"documentId": nextToken}
        response = opsTable.scan(**scanArgs)

        print("response: {}".format(response))

        page = {"documents": response['Items'] if 'Items' in response else []}

        if 'LastEvaluatedKey' in response:
            # More items remain; hand the resume key back to the caller.
            resumeAfter = response['LastEvaluatedKey']['documentId']
            print("nexToken: {}".format(resumeAfter))
            page["nextToken"] = resumeAfter

        return page
Esempio n. 15
0
def processImage(
    documentId, features, bucketName, objectName, outputTableName, documentsTableName
):
    """Run Textract on one image, generate its output and mark it complete.

    Args:
        documentId: Identifier of the document being processed.
        features: Container checked for "Text", "Forms" and "Tables".
        bucketName: S3 bucket of the input object (also passed to
            OutputGenerator).
        objectName: S3 key of the input object.
        outputTableName: DynamoDB table name handed to OutputGenerator.
        documentsTableName: DynamoDB table name of the document store.
    """
    wantsText, wantsForms, wantsTables = (
        flag in features for flag in ("Text", "Forms", "Tables")
    )

    textractResponse = callTextract(
        bucketName, objectName, wantsText, wantsForms, wantsTables
    )

    outputTable = AwsHelper().getResource("dynamodb").Table(outputTableName)

    print("Generating output for DocumentId: {}".format(documentId))

    OutputGenerator(
        documentId,
        textractResponse,
        bucketName,
        objectName,
        wantsForms,
        wantsTables,
        outputTable,
    ).run()

    print("DocumentId: {}".format(documentId))

    # Only reached when every step above completed without raising.
    datastore.DocumentStore(documentsTableName, outputTableName).markDocumentComplete(
        documentId
    )
Esempio n. 16
0
def read_from_s3(aws_env):
    """Read an S3 object and decode it using the detected text encoding.

    Args:
        aws_env: Mapping providing ``bucketName``, ``objectName`` and
            ``awsRegion``.

    Returns:
        The decoded text; the decoded text minus its last byte when only that
        retry succeeds; the raw bytes when both decode attempts fail; or None
        when the object could not be read (the error is printed).
    """
    bucket_name = aws_env['bucketName']
    s3_file_name = aws_env['objectName']
    aws_region = aws_env['awsRegion']
    s3 = AwsHelper().getResource('s3', aws_region)
    obj = s3.Object(bucket_name, s3_file_name)
    default_encoding = "utf-8"
    try:
        content = obj.get()['Body'].read()
    except Exception as e:
        # Best-effort read: report the failure and signal it with None.
        print(e)
        return None

    # chardet reports encoding None when it cannot decide (e.g. empty input);
    # decode(None) would raise an uncaught TypeError, so fall back to UTF-8.
    encoding = chardet.detect(content)['encoding'] or default_encoding
    try:
        print("Trying to decode with {}".format(encoding))
        return content.decode(encoding)
    except UnicodeDecodeError as e:
        print("Failing to decode with encoding {0}: {1}".format(encoding, e))

    try:
        # A truncated trailing byte is a common cause of decode failures.
        print("Trying by removing the last character")
        return content[:-1].decode(encoding)
    except UnicodeDecodeError as e:
        print("Failing to decode: {}".format(e))
        print("Returning content in bytes")
        return content
Esempio n. 17
0
def callTextract(bucketName, objectName):
    """Run synchronous Textract text detection on an S3 object.

    Args:
        bucketName: S3 bucket holding the document.
        objectName: S3 key of the document.

    Returns:
        The ``detect_document_text`` response.
    """
    s3Location = {'Bucket': bucketName, 'Name': objectName}
    client = AwsHelper().getClient('textract')
    return client.detect_document_text(Document={'S3Object': s3Location})
Esempio n. 18
0
def lambda_handler(event, context):
    """Extract bounding boxes from a PDF and return the updated event.

    Events whose ``status`` is not positive are returned immediately with an
    ``errorMessage``. Otherwise the PDF is copied to a temp folder and, unless
    ``TEXTRACT_ONLY`` disables it or the PDF has too few characters, parsed
    locally (line or word level depending on ``EXTRACT_PDF_LINES``) and the
    result written to S3.

    Args:
        event: Mapping with at least ``status``, ``objectName`` and ``s3Url``.
        context: Lambda context; not used.

    Returns:
        The early-exit event copy, or the result of ``update_event``.
    """
    if event['status'] <= 0:
        return {**event, "errorMessage": "Status isnt positive"}

    aws_env = {
        **event,
        "bucketName": os.environ.get('DOCUMENTS_BUCKET'),
        "awsRegion": 'eu-west-1',
        "tmpJsonOutput": "/tmp/tmp_result.json",
        "tmpTxtOutput": "/tmp/tmp_result.txt",
        "outputBucket": os.environ.get('DOCUMENTS_BUCKET'),
        "outputNameJson": get_bbox_filename(event['objectName'], ".json"),
        "outputNameTxt": get_bbox_filename(event['objectName'], ".txt"),
        "textractOnly": os.environ.get('TEXTRACT_ONLY'),
        "minCharNeeded": int(os.environ.get('MIN_CHAR_NEEDED')),
        "extract_pdf_lines": os.environ.get('EXTRACT_PDF_LINES'),
    }
    # Flags are captured before the PDF copy, which also receives aws_env.
    want_lines = aws_env['extract_pdf_lines']
    only_textract = aws_env['textractOnly']
    tmp_folder = "/tmp/pdfToBbox"
    pdf_tmp_path = copy_pdf_to_tmp(tmp_folder, aws_env)

    print("==> aws_env: ", aws_env)
    parse_locally = only_textract == "false" and is_pdf_has_enough_characters(
        pdf_tmp_path, aws_env['minCharNeeded']) is True

    if not parse_locally:
        # NOTE(review): the Textract call itself is disabled in this code;
        # only the message is emitted on this path.
        print("Extracting bounding box with textract")
    else:
        print("=> Extracting bounding box with pdfplumber")
        if want_lines == "true":
            print("=> Extracting pdf lines bbox")
            pdf = Pdf(pdf_tmp_path, aws_env['tmpJsonOutput'],
                      aws_env['tmpTxtOutput'])
            pdf.parse_pdf()
            pdf.save_in_json()
            pdf.save_in_txt()
            write_bbox_to_s3(aws_env)
        else:
            print("=> Extracting pdf words bbox")
            failed = execute_pdf_to_bbox(pdf_tmp_path,
                                         aws_env['tmpJsonOutput'])
            if failed:
                print("=> Error while trying to get pdf information")
                aws_env["status"] = -1
                aws_env["errorMessage"] = "PDF format not supported."
            else:
                write_bbox_to_s3(aws_env)

    aws_env['size'] = S3Helper.getS3FileSize(aws_env['bucketName'],
                                             aws_env['outputNameTxt'],
                                             aws_env['awsRegion'])
    aws_env["s3Url"] = get_new_s3_url(aws_env['s3Url'], "txt")
    # NOTE(review): status/errorMessage are overwritten unconditionally, so
    # the -1 / "PDF format not supported." set above never reaches the
    # caller — confirm whether that is intended.
    aws_env.update({
        "status": 1,
        "errorMessage": None,
        "contentType": "text/txt",
        "objectName": aws_env['outputNameTxt'],
        "sourceUrl": aws_env["s3Url"],
    })
    AwsHelper.refreshTmpFolder(tmp_folder)
    return update_event(aws_env, event)
Esempio n. 19
0
    def TranslateCaptions(self, translationContext, terminology_names=None):
        """Start an Amazon Translate batch job for the first target language.

        Args:
            translationContext: Mapping providing ``sourceLang``,
                ``targetLangList``, ``roleArn``, ``bucket``,
                ``inputLocation``, ``outputlocation`` and ``jobPrefix``.
            terminology_names: Optional iterable of dicts with ``Name`` and
                ``TargetLanguageCodes``; the first entry covering the target
                language is passed to the job. Defaults to no terminology.

        Returns:
            dict: ``{"JobId": <job id>, "TargetLanguageCode": <language>}``.

        Raises:
            Exception: Any error raised while starting the job is logged via
                ``self.logger.error`` and re-raised.
        """
        sourceLanguageCode = translationContext["sourceLang"]
        targetLanguageCodes = translationContext["targetLangList"]
        translate_role = translationContext["roleArn"]
        bucket = translationContext["bucket"]
        inputPath = translationContext["inputLocation"]
        outputPath = translationContext["outputlocation"]
        jobPrefix = translationContext["jobPrefix"]
        try:
            translate_client = AwsHelper().getClient('translate')
            # Only the first requested target language is translated here.
            targetLanguageCode = targetLanguageCodes[0]
            self.logger.debug(
                "Starting translation to {}".format(targetLanguageCode))
            singletonTargetList = [targetLanguageCode]
            # Millisecond timestamp keeps job names distinct across calls.
            millis = int(round(time.time() * 1000))
            job_name = jobPrefix + str(millis)
            self.logger.debug("JobName: {}".format(job_name))

            terminology_name = []
            if terminology_names:
                for item in terminology_names:
                    if targetLanguageCode in item['TargetLanguageCodes']:
                        terminology_name.append(item['Name'])
                        break
                if not terminology_name:
                    # Previously wrapped in int(), which raised ValueError and
                    # aborted the job whenever no terminology matched.
                    self.logger.debug("No custom terminology specified.")
                else:
                    self.logger.debug(
                        "Using custom terminology {}".format(terminology_name))

            # Start the batch translation of the HTML input stored in S3.
            response = translate_client.start_text_translation_job(
                JobName=job_name,
                InputDataConfig={
                    'S3Uri': "s3://{}/{}".format(bucket, inputPath),
                    'ContentType': "text/html"
                },
                OutputDataConfig={
                    'S3Uri': "s3://{}/{}".format(bucket, outputPath)
                },
                DataAccessRoleArn=translate_role,
                SourceLanguageCode=sourceLanguageCode,
                TargetLanguageCodes=singletonTargetList,
                TerminologyNames=terminology_name)
            return {
                "JobId": response["JobId"],
                "TargetLanguageCode": targetLanguageCode
            }

        except Exception as e:
            self.logger.error(e)
            raise
    def deleteItem(self, itemId):
        """Delete the item keyed by ``itemId`` from the items table.

        Args:
            itemId: Value of the ``itemId`` key of the item to delete.
        """
        itemsTable = AwsHelper().getResource("dynamodb").Table(
            self._itemsTableName)
        itemKey = {'itemId': itemId}
        itemsTable.delete_item(Key=itemKey)
Esempio n. 21
0
    def deleteDocument(self, documentId):
        """Delete the item keyed by ``documentId`` from the ops table.

        Args:
            documentId: Value of the ``documentId`` key of the item to delete.
        """
        opsTable = AwsHelper().getResource("dynamodb").Table(self._opsTableName)
        documentKey = {'documentId': documentId}
        opsTable.delete_item(Key=documentKey)
Esempio n. 22
0
def processRequest(request):
    """Handle a finished Textract job: generate output, mark complete, notify.

    Fetches the job's result pages, runs OutputGenerator over them, marks the
    document complete in the document store and posts a message describing the
    document to the queue given by ``elasticQueueUrl``.

    Args:
        request: Mapping providing ``jobId``, ``jobTag``, ``jobStatus``,
            ``jobAPI``, ``bucketName``, ``objectName``, ``outputTable``,
            ``outputBucket``, ``documentsTable`` and ``elasticQueueUrl``.

    Returns:
        dict: ``{'statusCode': 200, 'body': <summary message>}``.
    """
    print(request)

    jobId = request['jobId']
    documentId = request['jobTag']
    # Not used below; the lookup is kept so a missing key still raises.
    request['jobStatus']
    jobAPI = request['jobAPI']
    bucketName = request['bucketName']
    objectName = request['objectName']
    outputTable = request["outputTable"]
    outputBucket = request["outputBucket"]
    documentsTable = request["documentsTable"]
    queueUrl = request["elasticQueueUrl"]

    pages = getJobResults(jobAPI, jobId)
    print("Result pages recieved: {}".format(len(pages)))

    ddb = AwsHelper().getResource("dynamodb").Table(outputTable)

    # Analysis jobs are treated as having both forms and tables enabled.
    isAnalysis = jobAPI == "StartDocumentAnalysis"
    detectForms = True if isAnalysis else False
    detectTables = True if isAnalysis else False

    generator = OutputGenerator(jobId, documentId, pages, outputBucket,
                                objectName, detectForms, detectTables, ddb)
    generator.run()

    print("DocumentId: {}".format(documentId))

    store = datastore.DocumentStore(documentsTable, outputTable)
    store.markDocumentComplete(documentId, jobId)

    notification = {
        'documentId': documentId,
        'jobId': jobId,
        'bucketName': outputBucket,
        'objectName': objectName
    }
    sqsClient = AwsHelper().getClient('sqs')
    postMessage(sqsClient, queueUrl, notification)

    summary = "Processed -> Document: {}, Object: {}/{} processed.".format(
        documentId, bucketName, objectName)
    print(summary)

    return {'statusCode': 200, 'body': summary}
Esempio n. 23
0
    def updateDocumentStatus(self, documentId, status, stage, timestamp, message=None):
        """Record a new status/stage for a document and append it to its timeline.

        The update is conditional on an item with this ``documentId`` already
        existing in the ops table.

        Args:
            documentId: Value of the ``documentId`` key of the item to update.
            status: New ``documentStatus`` value.
            stage: New ``documentStage`` value.
            timestamp: New ``lastUpdate`` value, also stored in the timeline
                entry.
            message: Optional text added to the timeline entry when truthy.

        Returns:
            dict: ``{'Status': 200}`` on success; for a ClientError the
            service's error message and HTTP status code; for any other
            exception ``{'Error': 'Updating document failed', 'Status': 400}``.
        """
        opsTable = AwsHelper().getResource("dynamodb").Table(self._opsTableName)

        setClause = 'SET documentStatus = :documentStatus, documentStage = :documentStage, lastUpdate = :lastUpdate, timeline = list_append(timeline, :new_datapoint)'

        try:
            timelineEntry = {
                "timestamp": timestamp,
                "stage": stage,
                "status": status,
            }
            if message:
                timelineEntry["message"] = message

            opsTable.update_item(
                Key={'documentId': documentId},
                UpdateExpression=setClause,
                ConditionExpression='attribute_exists(documentId)',
                ExpressionAttributeValues={
                    ':documentStatus': status,
                    ':documentStage': stage,
                    ':lastUpdate': timestamp,
                    # list_append needs a list operand, hence the wrapping.
                    ':new_datapoint': [timelineEntry],
                },
            )
        except ClientError as clientError:
            print(clientError)
            return {
                'Error': clientError.response['Error']['Message'],
                'Status': clientError.response['ResponseMetadata']['HTTPStatusCode'],
            }
        except Exception as unexpected:
            print(unexpected)
            return {'Error': 'Updating document failed', 'Status': 400}

        return {'Status': 200}
Esempio n. 24
0
def processImage(documentId, features, bucketName, outputBucketName,
                 objectName, outputTableName, documentsTableName,
                 elasticsearchDomain):
    """Run the synchronous image pipeline for one document.

    Steps, in order: call Textract, generate outputs via OutputGenerator,
    generate the PDF, run Comprehend processing, merge the key/value pairs
    into the Comprehend entities, index the document, and mark the document
    complete in the document store.

    Args:
        documentId: Identifier of the document being processed.
        features: Container checked for "Text", "Forms" and "Tables".
        bucketName: S3 bucket of the input object.
        outputBucketName: S3 bucket receiving generated output.
        objectName: S3 key of the input object.
        outputTableName: DynamoDB table name passed to OutputGenerator.
        documentsTableName: DynamoDB table name of the document store.
        elasticsearchDomain: Passed through to OutputGenerator.
    """
    wantsText, wantsForms, wantsTables = (
        flag in features for flag in ("Text", "Forms", "Tables"))

    textractResponse = callTextract(bucketName, objectName, wantsText,
                                    wantsForms, wantsTables)

    outputTable = AwsHelper().getResource("dynamodb").Table(outputTableName)

    outputPath = '{}{}/{}'.format(PUBLIC_PATH_S3_PREFIX, documentId,
                                  SERVICE_OUTPUT_PATH_S3_PREFIX)
    print("Generating output for DocumentId: {} and storing in {}".format(
        documentId, outputPath))

    generator = OutputGenerator(documentId, textractResponse,
                                outputBucketName, objectName, wantsForms,
                                wantsTables, outputTable, outputPath,
                                elasticsearchDomain)
    generated = generator.run()

    generatePdf(documentId, bucketName, objectName, outputBucketName,
                outputPath)

    # Comprehend / ComprehendMedical entities are produced from the stored
    # Textract response and written under the Comprehend prefix in S3.
    comprehendOutputPath = "{}{}".format(outputPath, COMPREHEND_PATH_S3_PREFIX)
    print("Comprehend output path: " + comprehendOutputPath)
    pageLimit = 100
    comprehendHelper = ComprehendHelper()
    textractResponseKey = "{}{}response.json".format(outputPath,
                                                     TEXTRACT_PATH_S3_PREFIX)
    entities = comprehendHelper.processComprehend(
        outputBucketName, textractResponseKey, comprehendOutputPath, pageLimit)

    print("DocumentId: {}".format(documentId))
    print("Processed Comprehend data: {}".format(entities))

    # Merge Textract key/value pairs into the entity map before indexing.
    for key, val in generated[KVPAIRS].items():
        if key in entities:
            entities[key].add(val)
        else:
            entities[key] = val
    generator.indexDocument(generated[DOCTEXT], entities)

    store = datastore.DocumentStore(documentsTableName, outputTableName)
    store.markDocumentComplete(documentId)
Esempio n. 25
0
 def queryDocumentId(self, targetBucketName, targetFileName, versionId=None):
     """Find the documentId recorded for a bucket/file (and optional version).

     Builds the signature ``BUCKET:<bucket>@FILE:<file>[@VERSION:<version>]``
     and queries the lineage index for it. When several items match, the one
     with the earliest ``timestamp`` is used.

     Args:
         targetBucketName: Bucket part of the document signature.
         targetFileName: File part of the document signature.
         versionId: Optional version appended to the signature when truthy.

     Returns:
         dict: ``{'Status': 200, 'documentId': <id>}`` when found;
         ``{'Status': 404, 'documentId': None}`` when nothing matches;
         otherwise ``{'Error': <message>, 'Status': <code>}`` describing the
         query failure or the failure to read the result.
     """
     dynamodb = AwsHelper().getResource("dynamodb")
     table = dynamodb.Table(self._lineageTableName)
     documentSignature = "BUCKET:{}@FILE:{}".format(targetBucketName, targetFileName)
     if versionId:
         documentSignature += "@VERSION:{}".format(versionId)

     # Return query failures immediately. Previously execution fell through
     # with res = None, and the result handling below replaced the specific
     # error and status with a generic 400.
     try:
         res = table.query(
             KeyConditionExpression = Key('documentSignature').eq(documentSignature),
             IndexName = self._lineageIndexName
         )
     except ClientError as e:
         print(e)
         return {
             'Error': e.response['Error']['Message'],
             'Status': e.response['ResponseMetadata']['HTTPStatusCode']
         }
     except Exception as e:
         print(e)
         return {
             'Error': 'Unknown error occurred during querying the document Id',
             'Status': 400
         }

     try:
         items = res['Items']
         print(items)
         if len(items) == 0:
             return {
                 'Status': 404,
                 'documentId': None
             }
         # Earliest timestamp wins; min() keeps the first item on ties, the
         # same item a stable sort would place first.
         earliest = min(items, key=lambda item: datetime.fromisoformat(item['timestamp']))
         return {
             'Status': 200,
             'documentId': earliest['documentId']
         }
     except Exception as e:
         print(e)
         return {
             'Error': 'Could not find the documentId for specified document Signature',
             'Status': 400
         }
Esempio n. 26
0
def processRequest(request):
    """Route an uploaded object to the sync or async queue by file extension.

    Images (jpg, jpeg, png) go to ``syncQueueUrl``; videos (mov, mp4) go to
    ``asyncQueueUrl``. Objects with any other extension are not routed.

    Args:
        request: Mapping providing ``itemId``, ``bucketName``, ``objectName``
            and the queue URL for the matching extension group.

    Returns:
        The result of ``postMessage`` when the object was routed, otherwise
        None.
    """
    print("request: {}".format(request))

    itemId = request["itemId"]
    bucketName = request["bucketName"]
    objectName = request["objectName"]

    print("Input Object: {}/{}".format(bucketName, objectName))

    ext = FileHelper.getFileExtenstion(objectName.lower())
    print("Extension: {}".format(ext))

    # Both names start as None so unsupported extensions fall through
    # cleanly instead of raising UnboundLocalError below.
    qUrl = None
    response = None

    if ext and ext in ["jpg", "jpeg", "png"]:
        qUrl = request['syncQueueUrl']
    elif ext in ["mov", "mp4"]:
        qUrl = request['asyncQueueUrl']

    if qUrl:
        jsonMessage = {
            'itemId': itemId,
            'bucketName': bucketName,
            'objectName': objectName,
        }

        client = AwsHelper().getClient('sqs')
        response = postMessage(client, qUrl, jsonMessage)

    output = "Completed routing for itemId: {}, object: {}/{}".format(itemId, bucketName, objectName)

    print(output)
    return response
def processItems(qUrl, snsTopic, snsRole):
    """Start a job for each queued message, backing off once a limit is hit.

    Each message is processed and then deleted from the queue. If processing
    or deletion fails, the message's visibility is changed; if the failure is
    a LimitExceededException or ProvisionedThroughputExceededException, all
    remaining messages only have their visibility changed and the last such
    exception is re-raised after the loop.

    Args:
        qUrl: URL of the SQS queue to read from.
        snsTopic: Passed through to ``processItem``.
        snsRole: Passed through to ``processItem``.

    Returns:
        tuple: ``(total messages received, jobs started)``.

    Raises:
        Exception: The last limit-type exception seen, if any.
    """
    sqs = AwsHelper().getClient('sqs')
    messages = getMessagesFromQueue(sqs, qUrl)

    if not messages:
        return 0, 0

    throttlingErrors = ('LimitExceededException',
                        'ProvisionedThroughputExceededException')
    startedJobs = 0
    throttled = None  # last limit-type exception seen, if any

    totalMessages = len(messages)
    print("Total messages: {}".format(totalMessages))

    for message in messages:
        receiptHandle = message['ReceiptHandle']

        try:
            if throttled is not None:
                # A limit was already hit: hand the message back untouched.
                changeVisibility(sqs, qUrl, receiptHandle)
                continue

            print("starting job...")
            processItem(message, snsTopic, snsRole)
            print("started job...")
            print('Deleting item from queue...')
            # Delete received message from queue
            sqs.delete_message(QueueUrl=qUrl, ReceiptHandle=receiptHandle)
            print('Deleted item from queue...')
            startedJobs += 1
        except Exception as error:
            print("Error while starting job or deleting from queue: {}".format(error))
            changeVisibility(sqs, qUrl, receiptHandle)
            if type(error).__name__ in throttlingErrors:
                throttled = error

    if throttled is not None:
        raise throttled

    return totalMessages, startedJobs
def startJob(bucketName, objectName, documentId, snsTopic, snsRole,
             detectForms, detectTables):
    """Start an asynchronous Textract job with SNS completion notification.

    Text detection is used when neither forms nor tables are requested;
    otherwise document analysis is started with the requested feature types.
    ``documentId`` is used both as the client request token and the job tag.

    Args:
        bucketName: S3 bucket holding the document.
        objectName: S3 key of the document.
        documentId: Identifier sent as ``ClientRequestToken`` and ``JobTag``.
        snsTopic: ARN of the SNS topic for the notification channel.
        snsRole: ARN of the role for the notification channel.
        detectForms: Truthy to request the "FORMS" feature.
        detectTables: Truthy to request the "TABLES" feature.

    Returns:
        The ``"JobId"`` value of the Textract start response.
    """
    print("Starting job with documentId: {}, bucketName: {}, objectName: {}".
          format(documentId, bucketName, objectName))

    client = AwsHelper().getClient('textract')

    # Arguments shared by both Textract start calls.
    jobArgs = {
        "ClientRequestToken": documentId,
        "DocumentLocation": {
            'S3Object': {
                'Bucket': bucketName,
                'Name': objectName,
            }
        },
        "NotificationChannel": {
            "RoleArn": snsRole,
            "SNSTopicArn": snsTopic,
        },
        "JobTag": documentId,
    }

    features = []
    if detectTables:
        features.append("TABLES")
    if detectForms:
        features.append("FORMS")

    if features:
        response = client.start_document_analysis(FeatureTypes=features,
                                                  **jobArgs)
    else:
        response = client.start_document_text_detection(**jobArgs)

    return response["JobId"]
def getComprehend(languageCode="en"):
    """Build a small Comprehend facade bound to the session's region.

    Args:
        languageCode: Language code sent with every ``detect_entities`` call.

    Returns:
        dict: ``{"entities": fn}`` where ``fn(text)`` returns the
        ``detect_entities`` response for ``text``.
    """
    sessionRegion = boto3.Session().region_name
    comprehend = AwsHelper().getClient('comprehend', sessionRegion)

    def entities(text):
        # Closes over the client and language chosen when the facade was built.
        return comprehend.detect_entities(Text=text,
                                          LanguageCode=languageCode)

    facade = {}
    facade["entities"] = entities
    return facade
def startTranslationJob(bucketName, sourceCode, destCode, access_role):
    """Start an Amazon Translate batch job over the bucket's ``xmlin/`` prefix.

    Output is written under ``xmlout/`` in the same bucket. The job id is
    printed; a ClientError is logged rather than raised. No value is returned
    by the code visible here.

    Args:
        bucketName: S3 bucket used for both input and output.
        sourceCode: Source language code.
        destCode: Single target language code.
        access_role: ARN of the data access role given to Translate.
    """
    translate = AwsHelper().getClient('translate')
    try:
        # Millisecond timestamp keeps job names distinct across calls.
        millis = int(round(time.time() * 1000))
        inputConfig = {
            'S3Uri': "s3://{}/xmlin/".format(bucketName),
            'ContentType': 'text/html',
        }
        outputConfig = {'S3Uri': "s3://{}/xmlout/".format(bucketName)}
        job = translate.start_text_translation_job(
            JobName="TranslateJob-json-{}".format(millis),
            InputDataConfig=inputConfig,
            OutputDataConfig=outputConfig,
            DataAccessRoleArn=access_role,
            SourceLanguageCode=sourceCode,
            TargetLanguageCodes=[destCode],
        )
        print(job["JobId"])
    except ClientError as clientError:
        logger.error("An error occured starting the Translate Batch Job: %s" %
                     clientError)