Code example #1
    def run(self):

        if (not self.document.pages):
            return

        opath = "{}response.json".format(self.outputPath)
        S3Helper.writeToS3(json.dumps(self.response), self.bucketName, opath)

        print("Total Pages in Document: {}".format(len(self.document.pages)))

        docText = ""

        p = 1
        for page in self.document.pages:
            opath = "{}page-{}-response.json".format(
                self.outputPath, self.metadata['page_number'])
            S3Helper.writeToS3(json.dumps(page.blocks), self.bucketName, opath)

            text_file = self._outputText(page, p)

            docText = docText + page.text + "\n"

            if (self.tables):
                csv_file = self._outputTable(page, p)

            p = p + 1
        return {"csv": csv_file, "text_file": text_file}
Code example #2
File: og.py  Project: roydeboys/Aws-Textract-Demo
    def run(self):

        if not self.document.pages:
            return

        opath = "{}response.json".format(self.outputPath)
        S3Helper.writeToS3(json.dumps(self.response), self.bucketName, opath)
        self.saveItem(self.documentId, "Response", opath)

        print("Total Pages in Document: {}".format(len(self.document.pages)))

        docText = ""

        p = 1
        for page in self.document.pages:

            opath = "{}page-{}-response.json".format(self.outputPath, p)
            S3Helper.writeToS3(json.dumps(page.blocks), self.bucketName, opath)
            self.saveItem(self.documentId, "page-{}-Response".format(p), opath)

            self._outputText(page, p)

            docText = docText + page.text + "\n"

            if self.forms:
                self._outputForm(page, p)

            if self.tables:
                self._outputTable(page, p)

            p = p + 1
Code example #3
    def _outputText(self, page, p):
        #text = page.text
        text = page.getTextInReadingOrder()
        opath = "{}page-{}-text.txt".format(self.outputPath, p)
        S3Helper.writeToS3(text, self.bucketName, opath)
        self.saveItem(self.documentId, "page-{}-Text".format(p), opath)
        self.indexDocument(self.bucketName, opath, text)
Code example #4
def write_extracted_zip(aws_env: dict, zip_tmp: str):
    output_bucket = aws_env['bucketName']
    output_folder = aws_env['outputName']
    aws_region = aws_env['awsRegion']

    print("Writing s3://{0}/{1} in {2}".format(output_bucket, output_folder,
                                               aws_region))
    for path, folders, files in os.walk(zip_tmp):
        print("=> Path: {0}".format(path))
        for file in files:
            print("=> File: {0}".format(file))
            file_path = os.path.join(path, file)
            s3_output_path = os.path.join(output_folder, file)
            try:
                with open(file_path, "r") as open_file:
                    content = open_file.read()
                    print("=> Writing {0} to s3: {0}".format(
                        file_path, s3_output_path))
                    S3Helper.writeToS3(content, output_bucket, s3_output_path,
                                       aws_region)
            except UnicodeDecodeError:
                with open(file_path, "rb") as open_file:
                    content = open_file.read()
                    print("=> Writing to s3: {0}".format(
                        file_path, s3_output_path))
                    S3Helper.writeToS3(content, output_bucket, s3_output_path,
                                       aws_region)
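The try/except fallback above exists because the file is first opened in text mode, which raises UnicodeDecodeError on binary content. If S3Helper.writeToS3 accepts bytes (an assumption; boto3's Body parameter does), always reading in binary mode removes the need for the second branch. A sketch under that assumption:

import os
from helper import S3Helper  # assumed import path; adjust to the project layout

def write_extracted_zip_binary(aws_env: dict, zip_tmp: str):
    # Variant: binary mode handles text and binary files alike.
    for path, _folders, files in os.walk(zip_tmp):
        for file in files:
            file_path = os.path.join(path, file)
            s3_output_path = os.path.join(aws_env['outputName'], file)
            with open(file_path, "rb") as open_file:
                S3Helper.writeToS3(open_file.read(), aws_env['bucketName'],
                                   s3_output_path, aws_env['awsRegion'])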
Code example #5
    def run(self):

        if not self.document.pages:
            return

        opath = "{}{}response.json".format(self.outputPath,TEXTRACT_PATH_S3_PREFIX)
        S3Helper.writeToS3(json.dumps(round_floats(prune_blocks(
            self.response)), separators=(',', ':')), self.bucketName, opath)
        self.saveItem(self.documentId, '{}Response'.format(TEXTRACT_PATH_S3_PREFIX), opath)

        print("Total Pages in Document: {}".format(len(self.document.pages)))

        docText = ""

        p = 1
        for page in self.document.pages:
            docText = docText + page.text + "\n"

            if self.forms:
                key_val_pairs = self._outputForm(page, p)

            if self.tables:
                self._outputTable(page, p)

            p = p + 1

        return {DOCTEXT: docText, KVPAIRS: key_val_pairs}
Code example #6
    def run(self):

        if not self.document.pages:
            return

        opath = "{}response.json".format(self.outputPath)
        S3Helper.writeToS3(json.dumps(round_floats(prune_blocks(
            self.response)), separators=(',', ':')), self.bucketName, opath)
        self.saveItem(self.documentId, 'Response', opath)

        print("Total Pages in Document: {}".format(len(self.document.pages)))

        docText = ""

        p = 1
        for page in self.document.pages:
            docText = docText + page.text + "\n"

            if self.forms:
                self._outputForm(page, p)

            if self.tables:
                self._outputTable(page, p)

            p = p + 1

        return docText
Code example #7
    def run(self):

        if (not self.document.pages):
            return

        opath = "{}response.json".format(self.outputPath)
        S3Helper.writeToS3(json.dumps(self.response), self.bucketName, opath)
        self.saveItem(self.documentId, 'Response', opath)

        print("Total Pages in Document: {}".format(len(self.document.pages)))

        docText = ""

        p = 1
        for page in self.document.pages:
            # Avoid printing json outputs for every page
            #opath = "{}page-{}-response.json".format(self.outputPath, p)
            #S3Helper.writeToS3(json.dumps(page.blocks), self.bucketName, opath)
            #self.saveItem(self.documentId, "page-{}-Response".format(p), opath)
            # Avoid printing text outputs at a page level
            #self._outputText(page, p)

            docText = docText + page.text + "\n"

            if (self.forms):
                self._outputForm(page, p)

            if (self.tables):
                self._outputTable(page, p)

            p = p + 1
        opath = "{}response.txt".format(self.outputPath)
        S3Helper.writeToS3(docText, self.bucketName, opath)
        self.saveItem(self.documentId, 'Response', opath)
Code example #8
        def validateInput(self, args):

            event = self.getInputParameters(args)

            ips = {}

            if 'documents' not in event:
                raise Exception(
                    "Document or path to a folder or S3 bucket containing documents is required."
                )

            inputDocument = event['documents']
            idl = inputDocument.lower()

            bucketName = None
            documents = []
            awsRegion = 'us-east-1'

            if (idl.startswith("s3://")):
                o = urlparse(inputDocument)
                bucketName = o.netloc
                path = o.path[1:]
                ar = S3Helper.getS3BucketRegion(bucketName)
                if (ar):
                    awsRegion = ar

                if (idl.endswith("/")):
                    allowedFileTypes = ["jpg", "jpeg", "png", "pdf"]
                    documents = S3Helper.getFileNames(awsRegion, bucketName,
                                                      path, 1,
                                                      allowedFileTypes)
                else:
                    documents.append(path)
            else:
                if (idl.endswith("/")):
                    allowedFileTypes = ["jpg", "jpeg", "png"]
                    documents = FileHelper.getFileNames(
                        inputDocument, allowedFileTypes)
                else:
                    documents.append(inputDocument)

                if ('region' in event):
                    awsRegion = event['region']

            ips["bucketName"] = bucketName
            ips["documents"] = documents
            ips["awsRegion"] = awsRegion
            ips["text"] = ('text' in event)
            ips["forms"] = ('forms' in event)
            ips["tables"] = ('tables' in event)
            ips["insights"] = ('insights' in event)
            ips["medical-insights"] = ('medical-insights' in event)
            if ("translate" in event):
                ips["translate"] = event["translate"]
            else:
                ips["translate"] = ""

            return ips
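The s3:// branch relies on urlparse splitting an S3 URI into the bucket (netloc) and the object key (path). A standalone check of that behavior, with a made-up URI:

from urllib.parse import urlparse

o = urlparse("s3://my-bucket/input/scan.pdf")
print(o.netloc)    # my-bucket
print(o.path[1:])  # input/scan.pdf (leading slash stripped)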
Code example #9
def write_bbox_to_s3(aws_env: dict) -> None:
    with open(aws_env['tmpJsonOutput'], "r") as file:
        content = file.read()
        S3Helper.writeToS3(content, aws_env['outputBucket'],
                           aws_env['outputNameJson'], aws_env['awsRegion'])
    with open(aws_env['tmpTxtOutput'], "r") as file:
        content = file.read()
        S3Helper.writeToS3(content, aws_env['outputBucket'],
                           aws_env['outputNameTxt'], aws_env['awsRegion'])
Code example #10
def processRequest(request):
    output = ""
    logger.debug("request: {}".format(request))
    up = urlparse(request["s3uri"], allow_fragments=False)
    accountid = request["accountId"]
    jobid = request["jobId"]
    bucketName = up.netloc
    objectkey = up.path.lstrip('/')
    # choose the base path for iterating within the translated files for the specific job
    basePrefixPath = objectkey + accountid + "-TranslateText-" + jobid + "/"
    languageCode = request["langCode"]
    logger.debug("Base Prefix Path:{}".format(basePrefixPath))
    # Filter only the translated XML files for processing
    objs = S3Helper().getFilteredFileNames(bucketName, basePrefixPath, "xml")
    for obj in objs:
        try:
            content = S3Helper().readFromS3(bucketName, obj)
            logger.debug(content)
            # Convert the XML file to a dictionary
            data_dict = xmltodict.parse(content)
            # Unwrap the root element before flattening
            data_dict = data_dict["all"]
            flatten_dict = {
                k: (data_dict[k]["item"] if
                    (isinstance(v, dict) and len(v.keys()) == 1
                     and "item" in v.keys()) else v)
                for (k, v) in data_dict.items()
            }
            json_data = json.dumps(flatten_dict,
                                   ensure_ascii=False).encode('utf-8')
            logger.debug(json_data)
            newObjectKey = "output/{}.json".format(FileHelper.getFileName(obj))
            #Write the JSON object to the S3 output folder within the bucket
            S3Helper().writeToS3(json_data, bucketName, newObjectKey)
            output = "Output Object: {}/{}".format(bucketName, newObjectKey)
            logger.debug(output)
        except ValueError:
            logger.error("Error occurred loading the json file: {}".format(obj))
        except ClientError as e:
            logger.error("An error occurred with S3 bucket operations: %s" % e)
        except Exception as e:
            logger.error("Error occurred processing the xml file: %s" % e)
    objs = S3Helper().getFilteredFileNames(bucketName, "xmlin/", "xml")
    if (request["delete_xmls"] and request["delete_xmls"] == "true"):
        for obj in objs:
            try:
                logger.debug("Deleting temp xml files {}".format(obj))
                S3Helper().deleteObject(bucketName, obj)
            except ClientError as e:
                logger.error("An error occurred with S3 bucket operations: %s"
                             % e)
            except Exception as e:
                logger.error("Error occurred processing the xml file: %s" % e)
Code example #11
    def _outputText(self, page, p):
        page_number = self.metadata['page_number']
        text = page.text
        opath = "{}page-{}-text.txt".format(self.outputPath, page_number)
        S3Helper.writeToS3(text, self.bucketName, opath)
        textInReadingOrder = page.getTextInReadingOrder()
        opath = "{}page-{}-text-inreadingorder.txt".format(
            self.outputPath, page_number)
        S3Helper.writeToS3(textInReadingOrder, self.bucketName, opath)

        return opath
Code example #12
def processRequest(request):
    output = ""
    logger.info("request: {}".format(request))

    bucketName = request["bucketName"]
    sourceLanguageCode = request["sourceLanguage"]
    targetLanguageCode = request["targetLanguage"]
    access_role = request["access_role"]
    triggerFile = request["trigger_file"]
    try:
        captions = Captions()
        # Filter only the VTT and SRT files in the input folder
        objs = S3Helper().getFilteredFileNames(bucketName, "input/",
                                               ["vtt", "srt"])
        for obj in objs:
            try:
                vttObject = {}
                vttObject["Bucket"] = bucketName
                vttObject["Key"] = obj
                captions_list = []
                # Based on the file type, call the method that converts it
                # into a Python list of captions
                if obj.endswith("vtt"):
                    captions_list = captions.vttToCaptions(vttObject)
                elif obj.endswith("srt"):
                    captions_list = captions.srtToCaptions(vttObject)
                # Convert the text captions in the list to a delimited file
                delimitedFile = captions.ConvertToDemilitedFiles(captions_list)
                fileName = obj.split("/")[-1]
                newObjectKey = "captions-in/{}.delimited".format(fileName)
                S3Helper().writeToS3(str(delimitedFile), bucketName,
                                     newObjectKey)
                output = "Output Object: {}/{}".format(bucketName,
                                                       newObjectKey)
                logger.debug(output)
                S3Helper().renameObject(bucketName, obj,
                                        "{}.processed".format(obj))
            except ClientError as e:
                logger.error(
                    "An error occurred starting the Translate Batch Job: %s" %
                    e)
        translateContext = {}
        translateContext["sourceLang"] = sourceLanguageCode
        translateContext["targetLangList"] = [targetLanguageCode]
        translateContext["roleArn"] = access_role
        translateContext["bucket"] = bucketName
        translateContext["inputLocation"] = "captions-in/"
        translateContext["outputlocation"] = "captions-out/"
        translateContext["jobPrefix"] = "TranslateJob-captions"
        #Call Amazon Translate to translate the delimited files in the captions-in folder
        jobinfo = captions.TranslateCaptions(translateContext)
        S3Helper().deleteObject(bucketName, "input/{}".format(triggerFile))
        logger.debug(jobinfo)
    except ClientError as e:
        logger.error("An error occured with S3 Bucket Operation: %s" % e)
Code example #13
    def _outputText(self, page, p):
        text = page.text
        opath = "{}{}page-{}-text.txt".format(self.outputPath,TEXTRACT_PATH_S3_PREFIX, p)
        S3Helper.writeToS3(text, self.bucketName, opath)
        self.saveItem(self.documentId, "{}page-{}-Text".format(TEXTRACT_PATH_S3_PREFIX, p), opath)

        textInReadingOrder = page.getTextInReadingOrder()
        opath = "{}{}page-{}-text-inreadingorder.txt".format(self.outputPath,TEXTRACT_PATH_S3_PREFIX, p)
        S3Helper.writeToS3(textInReadingOrder, self.bucketName, opath)
        self.saveItem(self.documentId,
                      "{}page-{}-TextInReadingOrder".format(TEXTRACT_PATH_S3_PREFIX, p), opath)
Code example #14
File: og.py  Project: keshava/jarvis-be
    def _outputText(self, page, p, no_write=False):
        text = page.text
        textInReadingOrder = page.getTextInReadingOrder()

        if no_write:
            return (text, textInReadingOrder)
        else:
            opath = "{}/page-{}/text.txt".format(self.outputPath, p)
            opath = "{}/page-{}/text-inreadingorder.txt".format(
                self.outputPath, p)
            S3Helper.writeToS3(textInReadingOrder, self.bucketName, opath)
            S3Helper.writeToS3(text, self.bucketName, opath)
Code example #15
File: esindex.py  Project: LouisLoison/deepblooGit
def getResults(bucketName, outputPath):
    content = {
        "responseByPage": json.loads(
            S3Helper.readFromS3(bucketName,
                                "{}pages.json".format(outputPath))),
        "fullText": S3Helper.readFromS3(
            bucketName, "{}text.txt".format(outputPath)),
        "fullTextReadingOrder": S3Helper.readFromS3(
            bucketName, "{}text-inreadingorder.txt".format(outputPath)),
    }
    return content
Code example #16
    def processComprehendMedicalICD10(self,
                                      comprehendMedicalICD10,
                                      numOfPages,
                                      bucket,
                                      comprehendOutputPath):

        data = {}
        data['results'] = []

        for p in range(0, numOfPages):
            page = {}
            # page numbers start at 1
            page['Page'] = p + 1
            page['Entities'] = []

            # to detect and skip duplicates
            entities = set()

            for e in comprehendMedicalICD10[p]:

                # add this entity if not already present
                if e['Text'].upper() not in entities:
                    # add entity to results list
                    entity = {}
                    entity['Text'] = e['Text']
                    entity['Category'] = e['Category']
                    entity['Type'] = e['Type']

                    entity['ICD10CMConcepts'] = []

                    if 'ICD10CMConcepts' in e:

                        for c in e['ICD10CMConcepts']:
                            concept = {}
                            concept['Description'] = c['Description']
                            concept['Code'] = c['Code']
                            concept['Score'] = c['Score']
                            entity['ICD10CMConcepts'].append(concept)

                    page['Entities'].append(entity)

                    # make a note of this added entity
                    entities.add(e['Text'].upper())

            data['results'].append(page)

        # create results file in S3 under document folder
        S3Helper.writeToS3(json.dumps(data), bucket,
                           comprehendOutputPath + "comprehendMedicalICD10.json")
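This method, like examples #24 and #29 below, skips duplicates by tracking upper-cased entity text in a set. The pattern could be factored into a small standalone helper; a sketch, not part of the original code:

def dedupe_entities(entity_list):
    # Keep the first occurrence of each entity, comparing case-insensitively.
    seen = set()
    unique = []
    for e in entity_list:
        key = e['Text'].upper()
        if key not in seen:
            seen.add(key)
            unique.append(e)
    return unique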
Code example #17
def spacy_sentences_extraction(content: str, aws_env: dict):
    excluded_pipeline = ["tagger", "ner", "textcat", "parser"]
    model_path = "/opt/python/xx_ent_wiki_sm/xx_ent_wiki_sm-2.3.0"
    sentence_content = ""

    if os.path.isdir(model_path) is False:
        model_path = "xx_ent_wiki_sm"
    nlp = spacy.load(model_path, disable=excluded_pipeline)
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    doc = nlp(content)
    print("Pipelines names: ", nlp.pipe_names)
    for sent in doc.sents:
        sentence = sent.text.replace('\n', ' ')
        sentence_content += "{}\n".format(sentence.strip())
    S3Helper.writeToS3(sentence_content, aws_env['outputBucket'],
                       aws_env['outputNameTxt'], aws_env['awsRegion'])
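nlp.create_pipe('sentencizer') is the spaCy 2.x API, consistent with the pinned xx_ent_wiki_sm-2.3.0 model path above. Under spaCy 3.x the same component is added by its registered name instead:

# spaCy 3.x equivalent of the add_pipe call above
nlp.add_pipe("sentencizer")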
Code example #18
def processRequest(bucketName, objectName, callerId):

    output = ""

    documentId = S3Helper().getTagsS3(bucketName,
                                      objectName).get('documentId', None)
    if not documentId:
        raise Exception("Unidentified document. Please check its tags.")

    pipeline_client.body = {
        "documentId": documentId,
        "bucketName": bucketName,
        "objectName": objectName,
        "stage": PIPELINE_STAGE
    }
    pipeline_client.stageInProgress()

    print('Task ID: ' + documentId)

    if (documentId and bucketName and objectName):
        print("DocumentId: {}, Object: {}/{}".format(documentId, bucketName,
                                                     objectName))

        processImage(documentId, bucketName, objectName, callerId)

        output = "Document: {}, Object: {}/{} processed.".format(
            documentId, bucketName, objectName)
        pipeline_client.stageSucceeded()
        print(output)
    else:
        pipeline_client.stageFailed()

    return {'statusCode': 200, 'body': output}
Code example #19
    def srtToCaptions(self, vttObject):

        captions = []
        srt = ""
        # Get metadata
        s3 = boto3.client('s3')
        try:
            self.logger.debug("Getting data from s3://" + vttObject["Bucket"] +
                              "/" + vttObject["Key"])
            srt = S3Helper().readFromS3(vttObject["Bucket"], vttObject["Key"])
            self.logger.debug(srt)
        except Exception as e:
            raise e
        #buffer = StringIO(srt)
        f = NamedTemporaryFile(mode='w+', delete=False)
        f.write(srt)
        f.close()
        for srtcaption in webvtt.from_srt(f.name):
            caption = {}
            self.logger.debug(srtcaption)
            caption["start"] = self.formatTimeVTTtoSeconds(srtcaption.start)
            caption["end"] = self.formatTimeVTTtoSeconds(srtcaption.end)
            caption["caption"] = srtcaption.lines[0]
            self.logger.debug("Caption Object:{}".format(caption))
            captions.append(caption)

        return captions
Code example #20
    def vttToCaptions(self, vttObject):

        captions = []
        vtt = ""
        # Get metadata
        s3 = boto3.client('s3')
        try:
            self.logger.debug("Getting data from s3://" + vttObject["Bucket"] +
                              "/" + vttObject["Key"])
            vtt = S3Helper().readFromS3(vttObject["Bucket"], vttObject["Key"])
            self.logger.debug(vtt)
        except Exception as e:
            # FIXME: swallowing the exception leaves vtt empty
            self.logger.error(e)

        buffer = StringIO(vtt)

        for vttcaption in webvtt.read_buffer(buffer):
            caption = {}
            caption["start"] = self.formatTimeVTTtoSeconds(vttcaption.start)
            caption["end"] = self.formatTimeVTTtoSeconds(vttcaption.end)
            caption["caption"] = vttcaption.text
            captions.append(caption)

        return captions
Code example #21
File: textract_starter.py  Project: keshava/jarvis-be
def processItem(bucketName, objectName, snsTopic, snsRole):
    print('Bucket Name: ' + bucketName)
    print('Object Name: ' + objectName)

    documentId = S3Helper().getTagsS3(bucketName,
                                      objectName).get('documentId', None)
    if not documentId:
        raise Exception("Unidentified document. Please check its tags.")

    print('Task ID: ' + documentId)

    pipeline_client.body = {
        "documentId": documentId,
        "bucketName": bucketName,
        "objectName": objectName,
        "stage": PIPELINE_STAGE
    }
    pipeline_client.stageInProgress()
    jobId = startJob(bucketName, objectName, documentId, snsTopic, snsRole)

    if (jobId):
        pipeline_client.stageSucceeded()
        print("Started Job with Id: {}".format(jobId))
    else:
        pipeline_client.stageFailed()
    return jobId
Code example #22
    def _outputFullTable(self, pages):
        csvData = []
        for page in self.document.pages:
            for table in page.tables:
                csvRow = []
                #csvRow.append("Table")
                csvData.append(csvRow)
                for row in table.rows:
                    csvRow = []
                    for cell in row.cells:
                        csvRow.append(cell.text)
                    csvData.append(csvRow)
                csvData.append([])
                csvData.append([])

        opath = "{}page-{}-tables.csv".format(self.outputPath, 'Full')
        S3Helper.writeCSVRaw(csvData, self.bucketName, opath)
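S3Helper.writeCSVRaw is another project helper; presumably it serializes the list of rows with Python's csv module and uploads the result. A hedged sketch of a compatible implementation, reusing the writeToS3 wrapper sketched under example #1:

import csv
import io

def writeCSVRaw(csvData, bucketName, s3FileName):
    # Hypothetical: render the rows to an in-memory CSV, then upload.
    buf = io.StringIO()
    csv.writer(buf).writerows(csvData)
    S3Helper.writeToS3(buf.getvalue(), bucketName, s3FileName)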
Code example #23
    def _outputForm(self, page, p):
        csvData = []
        for field in page.form.fields:
            csvItem = []
            if field.key:
                csvItem.append(field.key.text)
            else:
                csvItem.append("")
            if field.value:
                csvItem.append(field.value.text)
            else:
                csvItem.append("")
            csvData.append(csvItem)
        csvFieldNames = ['Key', 'Value']
        opath = "{}page-{}-forms.csv".format(self.outputPath, p)
        S3Helper.writeCSV(csvFieldNames, csvData, self.bucketName, opath)
        self.saveItem(self.documentId, "page-{}-Forms".format(p), opath)
Code example #24
    def processAndReturnComprehendMedicalEntities(self,
                                                  comprehendMedicalEntities,
                                                  numOfPages, bucket,
                                                  comprehendOutputPath):

        data = {}
        data['results'] = []
        medical_entities_to_index = {}

        for p in range(0, numOfPages):
            page = {}
            # page numbers start at 1
            page['Page'] = p + 1
            page['Entities'] = []

            # to detect and skip duplicates
            entities = set()

            for e in comprehendMedicalEntities[p]:

                # add this entity if not already present
                if e['Text'].upper() not in entities:
                    # add entity to results list
                    entity = {}
                    entity['Text'] = e['Text']
                    entity['Category'] = e['Category']

                    if 'Score' in e:
                        entity['Score'] = e['Score']

                    page['Entities'].append(entity)

                    if e['Category'] not in medical_entities_to_index:
                        medical_entities_to_index[e['Category']] = []
                    medical_entities_to_index[e['Category']].append(e['Text'])

                    # make a note of this added entity
                    entities.add(e['Text'].upper())

            data['results'].append(page)

        # create results file in S3 under document folder
        S3Helper.writeToS3(
            json.dumps(data), bucket,
            comprehendOutputPath + "comprehendMedicalEntities.json")
        return medical_entities_to_index
Code example #25
def lambda_handler(event, context):
    if event['status'] <= 0:
        return {**event, "errorMessage": "Status isn't positive"}
    aws_env = {
        **event,
        "bucketName": os.environ.get('DOCUMENTS_BUCKET'),
        "awsRegion": 'eu-west-1',
        "tmpJsonOutput": "/tmp/tmp_result.json",
        "tmpTxtOutput": "/tmp/tmp_result.txt",
        "outputBucket": os.environ.get('DOCUMENTS_BUCKET'),
        "outputNameJson": get_bbox_filename(event['objectName'], ".json"),
        "outputNameTxt": get_bbox_filename(event['objectName'], ".txt"),
        "textractOnly": os.environ.get('TEXTRACT_ONLY'),
        "minCharNeeded": int(os.environ.get('MIN_CHAR_NEEDED')),
        "extract_pdf_lines": os.environ.get('EXTRACT_PDF_LINES'),
    }
    status = {"statusCode": 200, "body": "All right"}
    extract_pdf_lines = aws_env['extract_pdf_lines']
    textract_only = aws_env['textractOnly']
    tmp_folder = "/tmp/pdfToBbox"
    pdf_tmp_path = copy_pdf_to_tmp(tmp_folder, aws_env)

    print("==> aws_env: ", aws_env)
    if textract_only == "false" and is_pdf_has_enough_characters(
            pdf_tmp_path, aws_env['minCharNeeded']) is True:
        print("=> Extracting bounding box with pdfplumber")
        if extract_pdf_lines == "true":
            print("=> Extracting pdf lines bbox")
            pdf = Pdf(pdf_tmp_path, aws_env['tmpJsonOutput'],
                      aws_env['tmpTxtOutput'])
            pdf.parse_pdf()
            pdf.save_in_json()
            pdf.save_in_txt()
            write_bbox_to_s3(aws_env)
        else:
            print("=> Extracting pdf words bbox")
            if execute_pdf_to_bbox(pdf_tmp_path, aws_env['tmpJsonOutput']):
                print("=> Error while trying to get pdf information")
                aws_env["status"] = -1
                aws_env["errorMessage"] = "PDF format not supported."
            else:
                write_bbox_to_s3(aws_env)
    else:
        print("Extracting bounding box with textract")
        #send_to_textract(aws_env)
    aws_env['size'] = S3Helper.getS3FileSize(aws_env['bucketName'],
                                             aws_env['outputNameTxt'],
                                             aws_env['awsRegion'])
    aws_env["s3Url"] = get_new_s3_url(aws_env['s3Url'], "txt")
    aws_env["status"] = status
    aws_env["status"] = 1
    aws_env["errorMessage"] = None
    aws_env["contentType"] = "text/txt"
    aws_env['objectName'] = aws_env['outputNameTxt']
    aws_env["sourceUrl"] = aws_env["s3Url"]
    AwsHelper.refreshTmpFolder(tmp_folder)
    return update_event(aws_env, event)
Code example #26
File: og.py  Project: keshava/jarvis-be
    def _outputTable(self, page, p, no_write=False):
        csvData = []
        for table in page.tables:
            csvRow = []
            csvRow.append("Table")
            csvData.append(csvRow)
            for row in table.rows:
                csvRow = []
                for cell in row.cells:
                    csvRow.append(cell.text)
                csvData.append(csvRow)
            csvData.append([])
            csvData.append([])
        if no_write:
            return csvData
        else:
            opath = "{}/page-{}/tables.csv".format(self.outputPath, p)
            S3Helper.writeCSVRaw(csvData, self.bucketName, opath)
Code example #27
    def _outputTable(self, page, p):

        csvData = []
        for table in page.tables:
            csvRow = []
            csvRow.append("Table")
            csvData.append(csvRow)
            for row in table.rows:
                csvRow = []
                for cell in row.cells:
                    csvRow.append(cell.text)
                csvData.append(csvRow)
            csvData.append([])
            csvData.append([])

        opath = "{}page-{}-tables.csv".format(self.outputPath, p)
        S3Helper.writeCSVRaw(csvData, self.bucketName, opath)
        self.saveItem(self.documentId, "page-{}-Tables".format(p), opath)
Code example #28
def processRequest(request):
    output = ""
    logger.info("request: {}".format(request))

    bucketName = request["bucketName"]
    sourceLanguageCode = request["sourceLanguage"]
    targetLanguageCode = request["targetLanguage"]
    access_role = request["access_role"]
    triggerFile = request["trigger_file"]
    try:
        # Filter only the JSON files for processing
        objs = S3Helper().getFilteredFileNames(bucketName, "input/", "json")
        for obj in objs:
            try:
                content = S3Helper().readFromS3(bucketName, obj)
                logger.debug(content)
                jsonDocument = json.loads(content)
                print(jsonDocument)
                # Convert the JSON document into XML
                outputXML = json2xml.Json2xml(jsonDocument,
                                              attr_type=False).to_xml()
                logger.debug(outputXML)
                newObjectKey = "xmlin/{}.xml".format(
                    FileHelper.getFileName(obj))
                # Store the XML in the S3 location for Translation
                S3Helper().writeToS3(str(outputXML), bucketName, newObjectKey)
                output = "Output Object: {}/{}".format(bucketName,
                                                       newObjectKey)
                logger.debug(output)
                # Rename the JSON files to prevent reprocessing
                S3Helper().renameObject(bucketName, obj,
                                        "{}.processed".format(obj))
            except ValueError:
                logger.error(
                    "Error occurred loading the json file: {}".format(obj))
            except ClientError as e:
                logger.error("An error occurred with S3 Bucket Operation: %s" %
                             e)
        # Start the translation batch job using Amazon Translate
        startTranslationJob(bucketName, sourceLanguageCode, targetLanguageCode,
                            access_role)
        S3Helper().deleteObject(bucketName, "input/{}".format(triggerFile))
    except ClientError as e:
        logger.error("An error occurred with S3 Bucket Operation: %s" % e)
Code example #29
    def processComprehendEntities(self,
                                  comprehendEntities,
                                  numOfPages,
                                  bucket,
                                  documentPath):

        data = {}
        data['results'] = []
        entities_to_index = {}

        # process comprehend entities for each page
        for p in range(0, numOfPages):
            page = {}
            # page numbers start at 1 but the list of page data starts at 0
            page['Page'] = p + 1
            page['Entities'] = []

            # to detect and skip duplicates
            entities = set()

            for e in comprehendEntities[p]['Entities']:

                # add this entity if not already present
                if e['Text'].upper() not in entities:
                    # add entity to results list
                    entity = {}
                    entity['Text'] = e['Text']
                    entity['Type'] = e['Type']
                    entity['Score'] = e['Score']
                    page['Entities'].append(entity)

                    if e['Type'] not in entities_to_index:
                        entities_to_index[e['Type']] = []
                    entities_to_index[e['Type']].append(e['Text'])

                    # make a note of this added entity
                    entities.add(e['Text'].upper())

            data['results'].append(page)

        # create results file in S3 under document folder
        S3Helper.writeToS3(json.dumps(data), bucket,
                           documentPath + "comprehendEntities.json")
        return entities_to_index
Code example #30
    def _outputTable(self, page, p):
        page_number = self.metadata['page_number']
        csvData = []
        for table in page.tables:
            csvRow = []
            csvRow.append("Table")
            csvData.append(csvRow)
            for row in table.rows:
                csvRow = []
                for cell in row.cells:
                    csvRow.append(cell.text)
                csvData.append(csvRow)
            csvData.append([])
            csvData.append([])

        opath = "{}page-{}-tables.csv".format(self.outputPath, page_number)
        S3Helper.writeCSVRaw(csvData, self.bucketName, opath)

        return opath