def run(self):
        """Write the raw Textract response plus per-page output artifacts.

        Does nothing when the parsed document contains no pages.
        """
        if not self.document.pages:
            return

        FileHelper.writeToFile("{}-response.json".format(self.fileName),
                               json.dumps(self.response))

        print("Total Pages in Document: {}".format(len(self.document.pages)))

        # One pass per page: raw blocks, words, text, then the optional
        # form/table outputs selected at construction time.
        for pageNumber, page in enumerate(self.document.pages, start=1):

            FileHelper.writeToFile(
                "{}-page-{}-response.json".format(self.fileName, pageNumber),
                json.dumps(page.blocks))

            self._outputWords(page, pageNumber)
            self._outputText(page, pageNumber)

            if self.forms:
                self._outputForm(page, pageNumber)
                self._outputFormTranslate(page, pageNumber)

            if self.tables:
                self._outputTable(page, pageNumber)
                self._outputTablePretty(page, pageNumber)
                self._outputTablePrettyTranslate(page, pageNumber)
Beispiel #2
0
    def _outputForm(self, page, p):
        """Dump the page's key/value form fields (with geometry) to a CSV."""
        rows = []
        for field in page.form.fields:
            row = ([field.key.text, field.key.confidence]
                   if field.key else ["", ""])
            row += ([field.value.text, field.value.confidence]
                    if field.value else ["", ""])

            # Bounding box of the whole field, in Top/Height/Width/Left order.
            box = field.boundingBox
            row += [box["Top"], box["Height"], box["Width"], box["Left"]]

            rows.append(row)

        header = [
            'Key', 'KeyConfidence', 'Value', 'ValueConfidence', "Top",
            "Height", "Width", "Left"
        ]
        FileHelper.writeCSV("{}-page-{}-forms.csv".format(self.fileName, p),
                            header, rows)
    def _outputText(self, page, p):
        """Write the page text twice: raw order and reading order."""
        FileHelper.writeToFile("{}-page-{}-text.txt".format(self.fileName, p),
                               page.text)

        FileHelper.writeToFile(
            "{}-page-{}-text-inreadingorder.txt".format(self.fileName, p),
            page.getTextInReadingOrder())
Beispiel #4
0
	def execute(self):
		"""Scaffold a UserApp demo project: download the chosen frontend and
		backend zips, inject the selected profile's app id, and — for the
		nodejs backend — install dependencies and launch the app.
		"""
		arguments=self.arguments

		current_dir_path=os.getcwd()
		profile=self.config.get_selected_profile()

		def inject_app_id(file_path):
			# Swap the placeholder for the selected profile's real app id.
			FileHelper.search_replace_file(file_path, 'YOUR-USERAPP-APP-ID', profile['user']['app_id'])

		# Ensure we have an authenticated profile before scaffolding anything.
		if profile['user']['token'] is None:
			if profile['user']['login'] is None:
				print("(info) Not authenticated. Please login" + ('' if profile['user']['login'] is None else ' login as user' + profile['user']['login']) + '.')
			
			UserAppLoginCommand([profile['user']['login']]).execute()
			profile=self.config.get_selected_profile()

		# Expected positional args: <dir name> <frontend> [<backend>].
		if len(arguments) > 1:
			app_name=arguments.pop(0)

			target_dir_path=current_dir_path+'/'+app_name

			if not os.path.exists(target_dir_path):
				os.makedirs(target_dir_path)

			frontend=arguments.pop(0)

			frontend_zip_url='https://app.userapp.io/partials/docs/quickstart/{name}/frontend/userapp-{name}-demo.zip'
			backend_zip_url='https://app.userapp.io/partials/docs/quickstart/{name}/backend/userapp-{name}-backend.zip'

			frontend_error = FileHelper.unzip_url(frontend_zip_url.format(name=frontend), target_dir_path + '/public')

			if frontend_error == 'invalid_url':
				print("(error) Frontend '"+frontend+"' does not exist.")
				return

			if frontend == 'angularjs':
				inject_app_id(target_dir_path + '/public/js/app.js')

			# Backend argument is optional; only unpack one when present.
			if len(arguments) > 0:
				backend=arguments.pop(0)
				backend_error = FileHelper.unzip_url(backend_zip_url.format(name=backend), target_dir_path)

				if backend_error == 'invalid_url':
					print("(error) Backend '"+backend+"' does not exist.")
					return

				if backend == 'nodejs':
					inject_app_id(target_dir_path + '/app.js')

					# NOTE(review): runs `sudo npm install`, launches the app
					# detached, then opens the browser at the default port.
					ProcessHelper.execute('sudo npm install', cwd=target_dir_path+'/')
					ProcessHelper.execute('nodejs app.js', block=False, wait=False, cwd=target_dir_path+'/')

					WebBrowserHelper.open_url('http://localhost:3000')
		else:
			print("(error) Please specify <dir name> <frontend> <backend>. E.g. 'init myapp angularjs nodejs'.")
 def _outputTablePretty(self, page, p, table_format='github'):
     """Render every table on the page with tabulate and write each to a file."""
     for table_number, table in enumerate(page.tables):
         # Collect the cell text row by row for tabulate.
         rows_list = [[cell.text for cell in row.cells]
                      for row in table.rows]
         pretty_table = tabulate(rows_list, tablefmt=table_format)
         FileHelper.writeToFile(
             "{}-page-{}-table-{}-tables-pretty.txt".format(
                 self.fileName, p, table_number), pretty_table)
 def _outputWords(self, page, p):
     """Write every word on the page (id + text) to a CSV file."""
     # Flatten lines -> words; missing text becomes an empty string.
     csvData = [[word.id, word.text if word.text else ""]
                for line in page.lines
                for word in line.words]
     csvFieldNames = ['Word-Id', 'Word-Text']
     FileHelper.writeCSV("{}-page-{}-words.csv".format(self.fileName, p), csvFieldNames, csvData)
    def processDocument(self, ips, i, document):
        """Run Textract on one document and generate all output files.

        Args:
            ips: validated input-parameter dict (bucket, region, output path,
                and the text/forms/tables/insights/translate flags).
            i: 1-based index of the document, used for progress logging only.
            document: the document path/key to process.
        """
        print("\nTextracting Document # {}: {}".format(i, document))
        print('=' * (len(document) + 30))

        # Get document textracted
        dp = DocumentProcessor(ips["bucketName"], document, ips["awsRegion"],
                               ips["text"], ips["forms"], ips["tables"])
        response = dp.run()
        print("Recieved Textract response...")

        #FileHelper.writeToFile("temp-response.json", json.dumps(response))

        #Generate output files
        print("Generating output...")
        name, ext = FileHelper.getFileNameAndExtension(document)
        # Output artifacts are prefixed "<name>-<ext>" under the output folder.
        opg = OutputGenerator(
            response, os.path.join(ips["output"], "{}-{}".format(name, ext)),
            ips["forms"], ips["tables"])
        opg.run()

        # Insights / medical insights / translation are optional post-steps.
        if (ips["insights"] or ips["medical-insights"] or ips["translate"]):
            opg.generateInsights(ips["insights"], ips["medical-insights"],
                                 ips["translate"], ips["awsRegion"])

        print("{} textracted successfully.".format(document))
def processRequest(request):
    """Register an uploaded document for processing and report the result."""
    output = ""

    print(f"request: {request}")

    bucketName = request["bucketName"]
    objectName = request["objectName"]
    documentsTable = request["documentsTable"]
    outputTable = request["outputTable"]

    print(f"Input Object: {bucketName}/{objectName}")

    ext = FileHelper.getFileExtenstion(objectName.lower())
    print(f"Extension: {ext}")

    # Only image/PDF uploads get a document record; anything else falls
    # through and returns an empty body.
    if ext and ext in ["jpg", "jpeg", "png", "pdf"]:
        documentId = str(uuid.uuid1())
        ds = datastore.DocumentStore(documentsTable, outputTable)
        ds.createDocument(documentId, bucketName, objectName)

        output = f"Saved document {documentId} for {bucketName}/{objectName}"
        print(output)

    return {"statusCode": 200, "body": json.dumps(output)}
Beispiel #9
0
def processRequest(request):
    """Persist a record for the incoming S3 object when its type is supported."""
    output = ""

    print("request: {}".format(request))
    bucketName = request["bucketName"]
    objectName = request["objectName"]
    documentsTable = request["documentsTable"]
    outputTable = request["outputTable"]

    print("Input Object: {}/{}".format(bucketName, objectName))

    ext = FileHelper.getFileExtenstion(objectName.lower())
    print("Extension: {}".format(ext))

    # Only image/PDF uploads get a document record.
    if ext and ext in ["jpg", "jpeg", "png", "pdf"]:
        documentId = str(uuid.uuid1())
        ds = datastore.DocumentStore(documentsTable, outputTable)
        ds.createDocument(documentId, bucketName, objectName)

        output = "Saved document {} for {}/{}".format(
            documentId, bucketName, objectName)
        print(output)

    return {'statusCode': 200, 'body': json.dumps(output)}
Beispiel #10
0
def processRequest(request):
    """Route an item to the sync (image) or async (video) SQS queue.

    Returns:
        The SQS postMessage response, or None when the object's extension
        matched neither queue (previously this raised NameError).
    """
    output = ""

    print("request: {}".format(request))

    itemId = request["itemId"]
    bucketName = request["bucketName"]
    objectName = request["objectName"]

    print("Input Object: {}/{}".format(bucketName, objectName))

    ext = FileHelper.getFileExtenstion(objectName.lower())
    print("Extension: {}".format(ext))

    # Fix: qUrl and response were referenced while unbound whenever the
    # extension matched neither list, crashing with NameError instead of
    # skipping gracefully.
    qUrl = None
    response = None

    if(ext and ext in ["jpg", "jpeg", "png"]):
        qUrl = request['syncQueueUrl']
    elif (ext in ["mov", "mp4"]):
        qUrl = request['asyncQueueUrl']

    if(qUrl):
        jsonMessage = { 'itemId' : itemId,
            'bucketName': bucketName,
            'objectName' : objectName }

        client = AwsHelper().getClient('sqs')
        response = postMessage(client, qUrl, jsonMessage)

    output = "Completed routing for itemId: {}, object: {}/{}".format(itemId, bucketName, objectName)

    print(output)
    return response
def processRequest(request):
    """Register an uploaded media item when its extension is supported."""
    output = ""

    print(f"request: {request}")

    bucketName = request["bucketName"]
    objectName = request["objectName"]
    itemsTable = request["itemsTable"]
    outputBucket = request["outputBucket"]

    print(f"Input Object: {bucketName}/{objectName}")

    ext = FileHelper.getFileExtenstion(objectName.lower())
    print(f"Extension: {ext}")

    # Images and videos are accepted; anything else returns an empty body.
    if ext and ext in ["jpg", "jpeg", "png", "mov", "mp4"]:
        itemId = str(uuid.uuid1())
        ds = datastore.ItemStore(itemsTable)
        ds.createItem(itemId, bucketName, objectName)

        output = f"Saved item {itemId} for {bucketName}/{objectName}"
        print(output)

    return {'statusCode': 200, 'body': json.dumps(output)}
    def _outputTable(self, page, p):
        """Write every table on the page into one raw CSV file."""
        csvData = []
        for table in page.tables:
            # Marker row, then one CSV row per table row.
            csvData.append(["Table"])
            for row in table.rows:
                csvData.append([cell.text for cell in row.cells])
            # Two blank rows separate consecutive tables.
            csvData.extend([[], []])

        FileHelper.writeCSVRaw("{}-page-{}-tables.csv".format(self.fileName, p), csvData)
def run():
    """Load the cached Textract response and process it as a Document."""
    responsePath = "temp-response.json"
    parsed = json.loads(FileHelper.readFile(responsePath))

    doc = Document(parsed)
    processDocument(doc)
Beispiel #14
0
        def validateInput(self, args):
            """Parse command-line args into the input-parameter dict.

            Resolves the document source (local path vs s3:// URI, single
            file vs trailing-slash folder), the AWS region, and the
            feature/translation flags used downstream.

            Raises:
                Exception: when no document source was supplied.
            """
            event = self.getInputParameters(args)

            ips = {}

            if (not 'documents' in event):
                # Fix: corrected the "foler" typo in this user-facing message.
                raise Exception(
                    "Document or path to a folder or S3 bucket containing documents is required."
                )

            inputDocument = event['documents']
            idl = inputDocument.lower()

            bucketName = None
            documents = []
            awsRegion = 'us-east-1'

            if (idl.startswith("s3://")):
                o = urlparse(inputDocument)
                bucketName = o.netloc
                path = o.path[1:]
                # Prefer the bucket's own region when it can be resolved.
                ar = S3Helper.getS3BucketRegion(bucketName)
                if (ar):
                    awsRegion = ar

                if (idl.endswith("/")):
                    # S3 folder: enumerate supported files under the prefix.
                    allowedFileTypes = ["jpg", "jpeg", "png", "pdf"]
                    documents = S3Helper.getFileNames(awsRegion, bucketName,
                                                      path, 1,
                                                      allowedFileTypes)
                else:
                    documents.append(path)
            else:
                if (idl.endswith("/")):
                    # Local folder: PDFs are not supported locally.
                    allowedFileTypes = ["jpg", "jpeg", "png"]
                    documents = FileHelper.getFileNames(
                        inputDocument, allowedFileTypes)
                else:
                    documents.append(inputDocument)

                if ('region' in event):
                    awsRegion = event['region']

            ips["bucketName"] = bucketName
            ips["documents"] = documents
            ips["awsRegion"] = awsRegion
            ips["text"] = ('text' in event)
            ips["forms"] = ('forms' in event)
            ips["tables"] = ('tables' in event)
            ips["insights"] = ('insights' in event)
            ips["medical-insights"] = ('medical-insights' in event)
            if ("translate" in event):
                ips["translate"] = event["translate"]
            else:
                ips["translate"] = ""

            return ips
 def _getProjectPath(self):
     """Prompt for a project path, offering to create a missing folder.

     Recurses until a usable folder path has been stored on
     ``self._projectPath``.
     """
     var = input("Enter your Project Path : ")
     print()
     try:
         if FileHelper.isFolder(var):
             self._projectPath = var
         else:
             opt = input("Folder Not Found! Do you want to Try To create? (y/n)")
             if opt=="y":
                 FileHelper.createFolder(var)
                 self._projectPath = var
                 print("Folder Created!")
             else:
                 # Declining creation is treated like any other failure.
                 raise Exception()
     except Exception:
         # Fix: the original bare `except:` also swallowed
         # KeyboardInterrupt/SystemExit, making the prompt loop
         # impossible to abort with Ctrl-C.
         print("\t\tInvalid Path")
         print()
         self._getProjectPath()
Beispiel #16
0
def run():
    """Load the cached Textract response and generate output files from it."""
    path = "temp-response.json"
    payload = json.loads(FileHelper.readFile(path))

    doc = Document(payload)

    generateOutput(path, payload)
Beispiel #17
0
 def _outputForm(self, page, p):
     """Dump the page's key/value form fields to a CSV file."""
     csvData = []
     for field in page.form.fields:
         record = ([field.key.text, field.key.confidence]
                   if field.key else ["", ""])
         record += ([field.value.text, field.value.confidence]
                    if field.value else ["", ""])
         csvData.append(record)
     csvFieldNames = ['Key', 'KeyConfidence', 'Value', 'ValueConfidence']
     FileHelper.writeCSV("{}-forms.csv".format(self.fileName), csvFieldNames, csvData)
def processRequest(request):
    """Convert Amazon Translate XML job output back into JSON files in S3.

    Expects request keys: s3uri, accountId, jobId, langCode, delete_xmls.
    Processing is best-effort per file: individual failures are logged
    and the remaining files are still handled.
    """
    output = ""
    logger.debug("request: {}".format(request))
    up = urlparse(request["s3uri"], allow_fragments=False)
    accountid = request["accountId"]
    jobid = request["jobId"]
    bucketName = up.netloc
    objectkey = up.path.lstrip('/')
    # choose the base path for iterating within the translated files for the specific job
    basePrefixPath = objectkey + accountid + "-TranslateText-" + jobid + "/"
    languageCode = request["langCode"]
    logger.debug("Base Prefix Path:{}".format(basePrefixPath))
    # Filter only the translated XML files for processing
    objs = S3Helper().getFilteredFileNames(bucketName, basePrefixPath, "xml")
    for obj in objs:
        try:
            content = S3Helper().readFromS3(bucketName, obj)
            logger.debug(content)
            #Convert the XML file to Dictionary object
            data_dict = xmltodict.parse(content)
            #Generate the Json content from the dictionary
            data_dict = data_dict["all"]
            # Collapse the single-key {"item": ...} wrappers xmltodict
            # produces for repeated XML elements back into plain values.
            flatten_dict = {
                k: (data_dict[k]["item"] if
                    (isinstance(v, dict) and len(v.keys()) == 1
                     and "item" in v.keys()) else v)
                for (k, v) in data_dict.items()
            }
            json_data = json.dumps(flatten_dict,
                                   ensure_ascii=False).encode('utf-8')
            logger.debug(json_data)
            newObjectKey = "output/{}.json".format(FileHelper.getFileName(obj))
            #Write the JSON object to the S3 output folder within the bucket
            S3Helper().writeToS3(json_data, bucketName, newObjectKey)
            output = "Output Object: {}/{}".format(bucketName, newObjectKey)
            logger.debug(output)
        except ValueError:
            logger.error("Error occured loading the json file:{}".format(obj))
        except ClientError as e:
            logger.error("An error occured with S3 bucket operations: %s" % e)
        except:
            # NOTE(review): bare except keeps one bad file from aborting the
            # batch, but it also catches KeyboardInterrupt/SystemExit —
            # consider narrowing to `except Exception`.
            e = sys.exc_info()[0]
            logger.error("Error occured processing the xmlfile: %s" % e)
    objs = S3Helper().getFilteredFileNames(bucketName, "xmlin/", "xml")
    # Optionally clean up the intermediate XML inputs after processing.
    if (request["delete_xmls"] and request["delete_xmls"] == "true"):
        for obj in objs:
            try:
                logger.debug("Deleting temp xml files {}".format(obj))
                S3Helper().deleteObject(bucketName, obj)
            except ClientError as e:
                logger.error("An error occured with S3 bucket operations: %s" %
                             e)
            except:
                e = sys.exc_info()[0]
                logger.error("Error occured processing the xmlfile: %s" % e)
    def _outputTablePrettyTranslate(self, page, p, table_format='github'):
        """Render each table on the page with cell text machine-translated to English."""
        translator = TextTranslater('auto', 'en', 'us-east-1')

        for table_number, table in enumerate(page.tables):
            rows_list = []
            for row in table.rows:
                # Empty cells pass through untouched to avoid needless
                # translation calls.
                rows_list.append([
                    translator.getTranslation(cell.text)
                    if cell.text != "" else cell.text
                    for cell in row.cells
                ])
            pretty_table = tabulate(rows_list, tablefmt=table_format)
            FileHelper.writeToFile(
                "{}-page-{}-table-{}-tables-pretty-translated.txt".format(
                    self.fileName, p, table_number), pretty_table)
def lambda_handler(event, context):
    """S3-triggered entry point: run Comprehend on a fullresponse.json object.

    Raises:
        ValueError: when the triggering object is not named
            'fullresponse.json'.
    """
    print("Comprehend Event: {}".format(event))

    bucketName = event['Records'][0]['s3']['bucket']['name']
    objectName = urllib.parse.unquote_plus(
        event['Records'][0]['s3']['object']['key'])
    callerId = context.invoked_function_arn

    # Fix: input validation used `assert`, which is silently stripped when
    # Python runs with -O; raise explicitly so the check always holds.
    if FileHelper().getFileNameAndExtension(objectName.lower()) != (
            'fullresponse', 'json'):
        raise ValueError(
            "File detected does not match expected format: 'fullresponse.json'"
        )

    runComprehend(bucketName, objectName, callerId)
 def _outputFormTranslate(self, page, p):
     """Dump form fields to CSV with key/value text machine-translated to English."""
     translator = TextTranslater('auto', 'en', 'us-east-1')
     csvData = []
     for field in page.form.fields:
         record = ([translator.getTranslation(field.key.text),
                    field.key.confidence]
                   if field.key else ["", ""])
         record += ([translator.getTranslation(field.value.text),
                     field.value.confidence]
                    if field.value else ["", ""])
         csvData.append(record)
     csvFieldNames = ['Key', 'KeyConfidence', 'Value', 'ValueConfidence']
     FileHelper.writeCSV("{}-page-{}-forms-translated.csv".format(self.fileName, p),
                         csvFieldNames, csvData)
Beispiel #22
0
def processRequest(request):
    """Route a document to the sync/async Textract queue with an error-handler timer.

    Unsupported extensions are reported straight to the error-handler queue;
    supported ones are queued for processing plus a delayed error-handler
    message sized to the expected job duration.
    """
    output = ""

    print("request: {}".format(request))

    documentId = request["documentId"]
    bucketName = request["bucketName"]
    objectName = request["objectName"]
    jobErrorHandlerQueueUrl = request['errorHandlerQueueUrl']

    print("Input Object: {}/{}".format(bucketName, objectName))

    ext = FileHelper.getFileExtenstion(objectName.lower())
    print("Extension: {}".format(ext))

    client = AwsHelper().getClient('sqs')
    # If not expected extension, change status to FAILED and exit
    if(ext and ext not in ["jpg", "jpeg", "png", "pdf"]):
        jsonErrorHandlerMessage = {
            'documentId': documentId
        }
        postMessage(client, jobErrorHandlerQueueUrl, jsonErrorHandlerMessage)
        return

    # Fix: qUrl was referenced while unbound (NameError) when ext was
    # empty/None; default the routing values so that case falls through.
    qUrl = None
    errorHandlerTimeoutSeconds = None

    if(ext and ext in ["jpg", "jpeg", "png"]):
        qUrl = request['syncQueueUrl']
        errorHandlerTimeoutSeconds = SYNC_JOB_TIMEOUT_SECONDS
    elif (ext in ["pdf"]):
        qUrl = request['asyncQueueUrl']
        errorHandlerTimeoutSeconds = ASYNC_JOB_TIMEOUT_SECONDS

    if(qUrl):
        features = ["Text", "Forms", "Tables"]
        jsonMessage = {'documentId': documentId,
                       "features": features,
                       'bucketName': bucketName,
                       'objectName': objectName}
        postMessage(client, qUrl, jsonMessage)

        jsonErrorHandlerMessage = {
            'documentId': documentId
        }
        postMessage(client, jobErrorHandlerQueueUrl,
                    jsonErrorHandlerMessage, errorHandlerTimeoutSeconds)

    output = "Completed routing for documentId: {}, object: {}/{}".format(
        documentId, bucketName, objectName)
    print(output)
Beispiel #23
0
def processRequest(documentId, bucketName, objectName, callerId):
    """Copy the source object into the sync (image) or async (PDF) bucket.

    Records pipeline stage progress and document lineage.

    Raises:
        Exception: for unsupported file extensions.
    """
    output = ""
    pipeline_client.body = {
        "documentId": documentId,
        "bucketName": bucketName,
        "objectName": objectName,
        "stage": PIPELINE_STAGE
    }
    pipeline_client.stageInProgress()
    print("Input Object: {}/{}".format(bucketName, objectName))

    ext = FileHelper.getFileExtension(objectName.lower())
    print("Extension: {}".format(ext))

    # Images use the synchronous bucket, PDFs the asynchronous one.
    if (ext and ext in ["jpg", "jpeg", "png"]):
        targetBucketName = syncBucketName
    elif (ext in ["pdf"]):
        targetBucketName = asyncBucketName
    else:
        raise Exception("Incorrect file extension")
    # Namespace the copied object under documentId to avoid key collisions.
    targetFileName = "{}/{}".format(documentId, objectName)
    if (targetBucketName):
        print("Doing S3 Object Copy for documentId: {}, object: {}/{}".format(
            documentId, targetBucketName, targetFileName))
        try:
            S3Helper().copyToS3(bucketName, objectName, targetBucketName,
                                targetFileName)
        except Exception as e:
            # NOTE(review): the stage is marked failed here but execution
            # still falls through to recordLineageOfCopy and
            # stageSucceeded below — confirm this is intended.
            print(e)
            pipeline_client.stageFailed()
    else:
        print("")
        pipeline_client.stageFailed()

    output = "Completed S3 Object Copy for documentId: {}, object: {}/{}".format(
        documentId, targetBucketName, targetFileName)
    lineage_client.recordLineageOfCopy({
        "documentId": documentId,
        "callerId": callerId,
        "sourceBucketName": bucketName,
        "targetBucketName": targetBucketName,
        "sourceFileName": objectName,
        "targetFileName": targetFileName,
    })
    pipeline_client.stageSucceeded()
    print(output)
def processRequest(request):
    """S3 Batch Operations task handler: register a media item for processing.

    Always reports 'Succeeded' back to S3 Batch; unsupported extensions are
    skipped (itemId stays None in the result string).
    """
    output = ""

    print("request: {}".format(request))

    bucketName = request["bucketName"]
    objectName = request["objectName"]
    itemsTable = request["itemsTable"]
    outputBucket = request["outputBucket"]

    jobId = request["jobId"]
    invocationId = request['invocationId']
    invocationSchemaVersion = request['invocationSchemaVersion']
    taskId = request['taskId']

    print("Input Object: {}/{}".format(bucketName, objectName))

    ext = FileHelper.getFileExtenstion(objectName.lower())
    print("Extension: {}".format(ext))

    # Fix: itemId was referenced in the results below even when the
    # extension check failed, which raised NameError; default it so the
    # task still reports a result.
    itemId = None

    if (ext and ext in ["jpg", "jpeg", "png", "mov", "mp4"]):
        itemId = str(uuid.uuid1())
        ds = datastore.ItemStore(itemsTable)
        ds.createItem(itemId, bucketName, objectName)

        output = "Saved item {} for {}/{}".format(itemId, bucketName,
                                                  objectName)

        print(output)

    results = [{
        'taskId': taskId,
        'resultCode': 'Succeeded',
        'resultString': "Item submitted for processing with Id: {}".format(itemId)
    }]

    return {
        'invocationSchemaVersion': invocationSchemaVersion,
        'treatMissingKeysAs': 'PermanentFailure',
        'invocationId': invocationId,
        'results': results
    }
Beispiel #25
0
def processRequest(request):
    """S3 Batch Operations task handler: register a document for processing.

    Always reports 'Succeeded' back to S3 Batch; unsupported extensions are
    skipped (documentId stays None in the result string).
    """
    output = ""

    print("request: {}".format(request))

    bucketName = request["bucketName"]
    objectName = request["objectName"]
    documentsTable = request["documentsTable"]
    outputTable = request["outputTable"]

    jobId = request["jobId"]
    invocationId = request["invocationId"]
    invocationSchemaVersion = request["invocationSchemaVersion"]
    taskId = request["taskId"]

    print("Input Object: {}/{}".format(bucketName, objectName))

    ext = FileHelper.getFileExtenstion(objectName.lower())
    print("Extension: {}".format(ext))

    # Fix: documentId was referenced in the results below even when the
    # extension check failed, which raised NameError; default it up front.
    documentId = None

    if ext and ext in ["jpg", "jpeg", "png", "pdf"]:
        documentId = str(uuid.uuid1())
        ds = datastore.DocumentStore(documentsTable, outputTable)
        ds.createDocument(documentId, bucketName, objectName)

        output = "Saved document {} for {}/{}".format(documentId, bucketName,
                                                      objectName)

        print(output)

    results = [{
        "taskId": taskId,
        "resultCode": "Succeeded",
        "resultString":
        "Document submitted for processing with Id: {}".format(documentId),
    }]

    return {
        "invocationSchemaVersion": invocationSchemaVersion,
        "treatMissingKeysAs": "PermanentFailure",
        "invocationId": invocationId,
        "results": results,
    }
def processRequest(request):
    """Convert input JSON files in S3 to XML and start a Translate batch job.

    Reads every "input/*.json" object in the bucket, converts each to XML
    under "xmlin/", renames processed inputs to avoid reprocessing, then
    starts the batch translation job and removes the trigger file.
    """
    output = ""
    logger.info("request: {}".format(request))

    bucketName = request["bucketName"]
    sourceLanguageCode = request["sourceLanguage"]
    targetLanguageCode = request["targetLanguage"]
    access_role = request["access_role"]
    triggerFile = request["trigger_file"]
    try:
        # Filter only the JSON files for processing
        objs = S3Helper().getFilteredFileNames(bucketName, "input/", "json")
        for obj in objs:
            try:
                content = S3Helper().readFromS3(bucketName, obj)
                logger.debug(content)
                jsonDocument = json.loads(content)
                print(jsonDocument)
                # Convert the JSON document into XML
                outputXML = json2xml.Json2xml(jsonDocument,
                                              attr_type=False).to_xml()
                logger.debug(outputXML)
                newObjectKey = "xmlin/{}.xml".format(
                    FileHelper.getFileName(obj))
                # Store the XML in the S3 location for Translation
                S3Helper().writeToS3(str(outputXML), bucketName, newObjectKey)
                output = "Output Object: {}/{}".format(bucketName,
                                                       newObjectKey)
                logger.debug(output)
                # Rename the JSON files to prevent reprocessing
                S3Helper().renameObject(bucketName, obj,
                                        "{}.processed".format(obj))
            except ValueError:
                # Per-file best-effort: a bad JSON document is logged
                # and the remaining files are still converted.
                logger.error(
                    "Error occured loading the json file:{}".format(obj))
            except ClientError as e:
                logger.error("An error occured with S3 Bucket Operation: %s" %
                             e)
        # Start the translation batch job using Amazon Translate
        startTranslationJob(bucketName, sourceLanguageCode, targetLanguageCode,
                            access_role)
        S3Helper().deleteObject(bucketName, "input/{}".format(triggerFile))
    except ClientError as e:
        logger.error("An error occured with S3 Bucket Operation: %s" % e)
Beispiel #27
0
    def __init__(self, bucketName, documentPath, awsRegion, detectText,
                 detectForms, detectTables):
        """Validate processing options and build the Input parameter object.

        Raises:
            Exception: when no document is given, the extension is not
                jpg/jpeg/png/pdf, a PDF is supplied as a local file
                (PDFs must be in S3), or no extraction option is selected.
        """
        ip = Input()
        # Only override the Input defaults for values actually supplied.
        if (bucketName):
            ip.bucketName = bucketName
        if (documentPath):
            ip.documentPath = documentPath
        if (awsRegion):
            ip.awsRegion = awsRegion
        if (detectText):
            ip.detectText = detectText
        if (detectForms):
            ip.detectForms = detectForms
        if (detectTables):
            ip.detectTables = detectTables

        if (not ip.bucketName and not ip.documentPath):
            raise Exception("Document is required.")

        # A bucket name implies the document lives in S3, not locally.
        if (ip.bucketName):
            ip.isLocalDocument = False
        else:
            ip.isLocalDocument = True

        ext = FileHelper.getFileExtenstion(ip.documentPath).lower()
        print("Extension", ext)
        if (ext == "pdf"):
            ip.documentType = "PDF"
        elif (ext == "jpg" or ext == "jpeg" or ext == "png"):
            ip.documentType = "IMAGE"
        else:
            raise Exception("Document should be jpg/jpeg, png or pdf.")

        # Textract only accepts PDFs via S3, not as local uploads.
        if (ip.documentType == "PDF" and ip.isLocalDocument):
            raise Exception("PDF must be in S3 bucket.")

        if (ip.detectText == False and ip.detectForms == False
                and ip.detectTables == False):
            raise Exception(
                "Select at least one option to extract text, form or table")

        self.inputParameters = ip
def processRequest(request):
    """Route a document to the sync (image) or async (PDF) Textract queue."""
    output = ""

    print("request: {}".format(request))

    documentId = request["documentId"]
    bucketName = request["bucketName"]
    objectName = request["objectName"]

    print("Input Object: {}/{}".format(bucketName, objectName))

    ext = FileHelper.getFileExtenstion(objectName.lower())
    print("Extension: {}".format(ext))

    # Fix: qUrl was referenced while unbound (NameError) when the extension
    # matched neither branch; default to None so unsupported types are
    # skipped gracefully.
    qUrl = None

    if (ext and ext in ["jpg", "jpeg", "png"]):
        qUrl = request['syncQueueUrl']
    elif (ext in ["pdf"]):
        qUrl = request['asyncQueueUrl']

    if (qUrl):
        # To avoid form and tables outputs, this will be added to a configuration file
        #features = ["Text", "Forms", "Tables"]
        features = ["Text"]
        jsonMessage = {
            'documentId': documentId,
            "features": features,
            'bucketName': bucketName,
            'objectName': objectName
        }

        client = AwsHelper().getClient('sqs')
        postMessage(client, qUrl, jsonMessage)

    output = "Completed routing for documentId: {}, object: {}/{}".format(
        documentId, bucketName, objectName)

    print(output)
def processRequest(request):
    """Route a document to the sync (image), async (PDF), or HTML bounding-box queue."""
    output = ""

    print("request: {}".format(request))

    documentId = request["documentId"]
    bucketName = request["bucketName"]
    objectName = request["objectName"]

    print("Input Object: {}/{}".format(bucketName, objectName))

    ext = FileHelper.getFileExtenstion(objectName.lower())
    print("Extension: {}".format(ext))

    # Fix: qUrl was referenced while unbound (NameError) for extensions
    # matching no branch; default to None so those documents are skipped.
    qUrl = None

    if(ext and ext in ["jpg", "jpeg", "png"]):
        qUrl = request['syncQueueUrl']
    elif (ext in ["pdf"]):
        qUrl = request['asyncQueueUrl']
    elif (ext in ["htm", "html"]):
        qUrl = os.environ['HTMLTOBOUNDINGBOX_QUEUE_URL']

    if(qUrl):
        features = ["Text"] # , "Forms", "Tables"]

        jsonMessage = { 'documentId' : documentId,
            "features" : features,
            'bucketName': bucketName,
            'objectName' : objectName }

        client = AwsHelper().getClient('sqs')
        postMessage(client, qUrl, jsonMessage)
        # PDFs additionally go to the pdf-to-image conversion queue.
        if (ext in ["pdf"]):
            postMessage(client, request["pdftoimgQueueUrl"], jsonMessage)

    output = "Completed routing for documentId: {}, object: {}/{}".format(documentId, bucketName, objectName)

    print(output)
def generateOutput(filePath, response):
    """Run the full output generation (forms, tables, insights) on a response."""
    print("Generating output...")
    name, ext = FileHelper.getFileNameAndExtension(filePath)
    generator = OutputGenerator(response, "{}-v2-{}".format(name, ext), True, True)
    generator.run()
    generator.generateInsights(True, True, 'es', 'us-east-1')
Beispiel #31
0
		def inject_app_id(file_path):
			# Replace the placeholder app id in *file_path* with the selected
			# profile's real app id. NOTE(review): relies on `profile` from an
			# enclosing scope — this fragment appears detached from its
			# closure here; verify against the full `execute` method.
			FileHelper.search_replace_file(file_path, 'YOUR-USERAPP-APP-ID', profile['user']['app_id'])