def run(self):
    if not self.document.pages:
        return

    # Write the full Textract response for the document
    opath = "{}response.json".format(self.outputPath)
    S3Helper.writeToS3(json.dumps(self.response), self.bucketName, opath)

    print("Total Pages in Document: {}".format(len(self.document.pages)))

    docText = ""
    # Initialize so the return value is defined even when tables are disabled
    text_file = None
    csv_file = None
    p = 1
    for page in self.document.pages:
        opath = "{}page-{}-response.json".format(
            self.outputPath, self.metadata['page_number'])
        S3Helper.writeToS3(json.dumps(page.blocks), self.bucketName, opath)
        text_file = self._outputText(page, p)
        docText = docText + page.text + "\n"
        if self.tables:
            csv_file = self._outputTable(page, p)
        p = p + 1

    return {"csv": csv_file, "text_file": text_file}
def run(self):
    if not self.document.pages:
        return

    # Write the full Textract response for the document
    opath = "{}response.json".format(self.outputPath)
    S3Helper.writeToS3(json.dumps(self.response), self.bucketName, opath)
    self.saveItem(self.documentId, "Response", opath)

    print("Total Pages in Document: {}".format(len(self.document.pages)))

    docText = ""
    p = 1
    for page in self.document.pages:
        # Write each page's raw blocks alongside the per-page outputs
        opath = "{}page-{}-response.json".format(self.outputPath, p)
        S3Helper.writeToS3(json.dumps(page.blocks), self.bucketName, opath)
        self.saveItem(self.documentId, "page-{}-Response".format(p), opath)
        self._outputText(page, p)
        docText = docText + page.text + "\n"
        if self.forms:
            self._outputForm(page, p)
        if self.tables:
            self._outputTable(page, p)
        p = p + 1
def _outputText(self, page, p):
    #text = page.text
    text = page.getTextInReadingOrder()
    opath = "{}page-{}-text.txt".format(self.outputPath, p)
    S3Helper.writeToS3(text, self.bucketName, opath)
    self.saveItem(self.documentId, "page-{}-Text".format(p), opath)
    self.indexDocument(self.bucketName, opath, text)
def write_extracted_zip(aws_env: dict, zip_tmp: str):
    output_bucket = aws_env['bucketName']
    output_folder = aws_env['outputName']
    aws_region = aws_env['awsRegion']
    print("Writing s3://{0}/{1} in {2}".format(output_bucket, output_folder,
                                               aws_region))
    for path, folders, files in os.walk(zip_tmp):
        print("=> Path: {0}".format(path))
        for file in files:
            print("=> File: {0}".format(file))
            file_path = os.path.join(path, file)
            s3_output_path = os.path.join(output_folder, file)
            try:
                # Try to read as text first; fall back to binary for
                # non-UTF-8 content
                with open(file_path, "r") as open_file:
                    content = open_file.read()
                print("=> Writing {0} to s3: {1}".format(file_path,
                                                         s3_output_path))
                S3Helper.writeToS3(content, output_bucket, s3_output_path,
                                   aws_region)
            except UnicodeDecodeError:
                with open(file_path, "rb") as open_file:
                    content = open_file.read()
                print("=> Writing {0} to s3: {1}".format(file_path,
                                                         s3_output_path))
                S3Helper.writeToS3(content, output_bucket, s3_output_path,
                                   aws_region)
def run(self): if(not self.document.pages): return opath = "{}{}response.json".format(self.outputPath,TEXTRACT_PATH_S3_PREFIX) S3Helper.writeToS3(json.dumps(round_floats(prune_blocks( self.response)), separators=(',', ':')), self.bucketName, opath) self.saveItem(self.documentId, '{}Response'.format(TEXTRACT_PATH_S3_PREFIX), opath) print("Total Pages in Document: {}".format(len(self.document.pages))) docText = "" p = 1 for page in self.document.pages: docText = docText + page.text + "\n" if(self.forms): key_val_pairs = self._outputForm(page, p) if(self.tables): self._outputTable(page, p) p = p + 1 return {DOCTEXT: docText, KVPAIRS: key_val_pairs}
def run(self): if(not self.document.pages): return opath = "{}response.json".format(self.outputPath) S3Helper.writeToS3(json.dumps(round_floats(prune_blocks( self.response)), separators=(',', ':')), self.bucketName, opath) self.saveItem(self.documentId, 'Response', opath) print("Total Pages in Document: {}".format(len(self.document.pages))) docText = "" p = 1 for page in self.document.pages: docText = docText + page.text + "\n" if(self.forms): self._outputForm(page, p) if(self.tables): self._outputTable(page, p) p = p + 1 return docText
def run(self): if (not self.document.pages): return opath = "{}response.json".format(self.outputPath) S3Helper.writeToS3(json.dumps(self.response), self.bucketName, opath) self.saveItem(self.documentId, 'Response', opath) print("Total Pages in Document: {}".format(len(self.document.pages))) docText = "" p = 1 for page in self.document.pages: # Avoid printing json outputs for every page #opath = "{}page-{}-response.json".format(self.outputPath, p) #S3Helper.writeToS3(json.dumps(page.blocks), self.bucketName, opath) #self.saveItem(self.documentId, "page-{}-Response".format(p), opath) # Avoid printing text outputs at a page level #self._outputText(page, p) docText = docText + page.text + "\n" if (self.forms): self._outputForm(page, p) if (self.tables): self._outputTable(page, p) p = p + 1 opath = "{}response.txt".format(self.outputPath) S3Helper.writeToS3(docText, self.bucketName, opath) self.saveItem(self.documentId, 'Response', opath)
def validateInput(self, args):
    event = self.getInputParameters(args)
    ips = {}
    if 'documents' not in event:
        raise Exception(
            "Document or path to a folder or S3 bucket containing documents is required."
        )
    inputDocument = event['documents']
    idl = inputDocument.lower()
    bucketName = None
    documents = []
    awsRegion = 'us-east-1'
    if idl.startswith("s3://"):
        o = urlparse(inputDocument)
        bucketName = o.netloc
        path = o.path[1:]
        ar = S3Helper.getS3BucketRegion(bucketName)
        if ar:
            awsRegion = ar
        if idl.endswith("/"):
            allowedFileTypes = ["jpg", "jpeg", "png", "pdf"]
            documents = S3Helper.getFileNames(awsRegion, bucketName, path, 1,
                                              allowedFileTypes)
        else:
            documents.append(path)
    else:
        if idl.endswith("/"):
            allowedFileTypes = ["jpg", "jpeg", "png"]
            documents = FileHelper.getFileNames(inputDocument,
                                                allowedFileTypes)
        else:
            documents.append(inputDocument)
    if 'region' in event:
        awsRegion = event['region']
    ips["bucketName"] = bucketName
    ips["documents"] = documents
    ips["awsRegion"] = awsRegion
    ips["text"] = ('text' in event)
    ips["forms"] = ('forms' in event)
    ips["tables"] = ('tables' in event)
    ips["insights"] = ('insights' in event)
    ips["medical-insights"] = ('medical-insights' in event)
    if "translate" in event:
        ips["translate"] = event["translate"]
    else:
        ips["translate"] = ""
    return ips
def write_bbox_to_s3(aws_env: dict) -> None:
    # Upload the JSON bounding-box output
    with open(aws_env['tmpJsonOutput'], "r") as file:
        content = file.read()
    S3Helper.writeToS3(content, aws_env['outputBucket'],
                       aws_env['outputNameJson'], aws_env['awsRegion'])
    # Upload the plain-text output
    with open(aws_env['tmpTxtOutput'], "r") as file:
        content = file.read()
    S3Helper.writeToS3(content, aws_env['outputBucket'],
                       aws_env['outputNameTxt'], aws_env['awsRegion'])
def processRequest(request): output = "" logger.debug("request: {}".format(request)) up = urlparse(request["s3uri"], allow_fragments=False) accountid = request["accountId"] jobid = request["jobId"] bucketName = up.netloc objectkey = up.path.lstrip('/') # choose the base path for iterating within the translated files for the specific job basePrefixPath = objectkey + accountid + "-TranslateText-" + jobid + "/" languageCode = request["langCode"] logger.debug("Base Prefix Path:{}".format(basePrefixPath)) # Filter only the translated XML files for processing objs = S3Helper().getFilteredFileNames(bucketName, basePrefixPath, "xml") for obj in objs: try: content = S3Helper().readFromS3(bucketName, obj) logger.debug(content) #Convert the XML file to Dictionary object data_dict = xmltodict.parse(content) #Generate the Json content from the dictionary data_dict = data_dict["all"] flatten_dict = { k: (data_dict[k]["item"] if (isinstance(v, dict) and len(v.keys()) == 1 and "item" in v.keys()) else v) for (k, v) in data_dict.items() } json_data = json.dumps(flatten_dict, ensure_ascii=False).encode('utf-8') logger.debug(json_data) newObjectKey = "output/{}.json".format(FileHelper.getFileName(obj)) #Write the JSON object to the S3 output folder within the bucket S3Helper().writeToS3(json_data, bucketName, newObjectKey) output = "Output Object: {}/{}".format(bucketName, newObjectKey) logger.debug(output) except ValueError: logger.error("Error occured loading the json file:{}".format(obj)) except ClientError as e: logger.error("An error occured with S3 bucket operations: %s" % e) except: e = sys.exc_info()[0] logger.error("Error occured processing the xmlfile: %s" % e) objs = S3Helper().getFilteredFileNames(bucketName, "xmlin/", "xml") if (request["delete_xmls"] and request["delete_xmls"] == "true"): for obj in objs: try: logger.debug("Deleting temp xml files {}".format(obj)) S3Helper().deleteObject(bucketName, obj) except ClientError as e: logger.error("An error occured with S3 bucket operations: %s" % e) except: e = sys.exc_info()[0] logger.error("Error occured processing the xmlfile: %s" % e)
def _outputText(self, page, p):
    page_number = self.metadata['page_number']
    text = page.text
    opath = "{}page-{}-text.txt".format(self.outputPath, page_number)
    S3Helper.writeToS3(text, self.bucketName, opath)
    textInReadingOrder = page.getTextInReadingOrder()
    opath = "{}page-{}-text-inreadingorder.txt".format(
        self.outputPath, page_number)
    S3Helper.writeToS3(textInReadingOrder, self.bucketName, opath)
    return opath
def processRequest(request): output = "" logger.info("request: {}".format(request)) bucketName = request["bucketName"] sourceLanguageCode = request["sourceLanguage"] targetLanguageCode = request["targetLanguage"] access_role = request["access_role"] triggerFile = request["trigger_file"] try: captions = Captions() #filter only the VTT and SRT file for processing in the input folder objs = S3Helper().getFilteredFileNames(bucketName, "input/", ["vtt", "srt"]) for obj in objs: try: vttObject = {} vttObject["Bucket"] = bucketName vttObject["Key"] = obj captions_list = [] #based on the file type call the method that coverts them into python list object if (obj.endswith("vtt")): captions_list = captions.vttToCaptions(vttObject) elif (obj.endswith("srt")): captions_list = captions.srtToCaptions(vttObject) #convert the text captions in the list object to a delimited file delimitedFile = captions.ConvertToDemilitedFiles(captions_list) fileName = obj.split("/")[-1] newObjectKey = "captions-in/{}.delimited".format(fileName) S3Helper().writeToS3(str(delimitedFile), bucketName, newObjectKey) output = "Output Object: {}/{}".format(bucketName, newObjectKey) logger.debug(output) S3Helper().renameObject(bucketName, obj, "{}.processed".format(obj)) except ClientError as e: logger.error( "An error occured starting the Translate Batch Job: %s" % e) translateContext = {} translateContext["sourceLang"] = sourceLanguageCode translateContext["targetLangList"] = [targetLanguageCode] translateContext["roleArn"] = access_role translateContext["bucket"] = bucketName translateContext["inputLocation"] = "captions-in/" translateContext["outputlocation"] = "captions-out/" translateContext["jobPrefix"] = "TranslateJob-captions" #Call Amazon Translate to translate the delimited files in the captions-in folder jobinfo = captions.TranslateCaptions(translateContext) S3Helper().deleteObject(bucketName, "input/{}".format(triggerFile)) logger.debug(jobinfo) except ClientError as e: logger.error("An error occured with S3 Bucket Operation: %s" % e)
def _outputText(self, page, p): text = page.text opath = "{}{}page-{}-text.txt".format(self.outputPath,TEXTRACT_PATH_S3_PREFIX, p) S3Helper.writeToS3(text, self.bucketName, opath) self.saveItem(self.documentId, "{}page-{}-Text".format(TEXTRACT_PATH_S3_PREFIX, p), opath) textInReadingOrder = page.getTextInReadingOrder() opath = "{}{}page-{}-text-inreadingorder.txt".format(self.outputPath,TEXTRACT_PATH_S3_PREFIX, p) S3Helper.writeToS3(textInReadingOrder, self.bucketName, opath) self.saveItem(self.documentId, "{}page-{}-TextInReadingOrder".format(TEXTRACT_PATH_S3_PREFIX, p), opath)
def _outputText(self, page, p, no_write=False):
    text = page.text
    textInReadingOrder = page.getTextInReadingOrder()
    if no_write:
        return (text, textInReadingOrder)
    # Write each text variant to its own S3 key
    opath = "{}/page-{}/text.txt".format(self.outputPath, p)
    S3Helper.writeToS3(text, self.bucketName, opath)
    opath = "{}/page-{}/text-inreadingorder.txt".format(self.outputPath, p)
    S3Helper.writeToS3(textInReadingOrder, self.bucketName, opath)
def getResults(bucketName, outputPath):
    content = {
        "responseByPage":
        json.loads(
            S3Helper.readFromS3(bucketName,
                                "{}pages.json".format(outputPath))),
        "fullText":
        S3Helper.readFromS3(bucketName, "{}text.txt".format(outputPath)),
        "fullTextReadingOrder":
        S3Helper.readFromS3(bucketName,
                            "{}text-inreadingorder.txt".format(outputPath))
    }
    return content
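# A hedged usage sketch for getResults; "my-bucket" and "output/doc-1/" are
# placeholder values, and the call assumes the pipeline already wrote
# pages.json, text.txt, and text-inreadingorder.txt under that prefix.
results = getResults("my-bucket", "output/doc-1/")
print(results["fullText"][:200])        # first 200 characters of the document
print(len(results["responseByPage"]))   # number of per-page entries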
def processComprehendMedicalICD10(self, comprehendMedicalICD10, numOfPages,
                                  bucket, comprehendOutputPath):
    data = {}
    data['results'] = []
    for p in range(0, numOfPages):
        page = {}
        # page numbers start at 1
        page['Page'] = p + 1
        page['Entities'] = []
        # to detect and skip duplicates
        entities = set()
        for e in comprehendMedicalICD10[p]:
            # add this entity if not already present
            if e['Text'].upper() not in entities:
                # add entity to results list
                entity = {}
                entity['Text'] = e['Text']
                entity['Category'] = e['Category']
                entity['Type'] = e['Type']
                entity['ICD10CMConcepts'] = []
                if 'ICD10CMConcepts' in e:
                    for c in e['ICD10CMConcepts']:
                        concept = {}
                        concept['Description'] = c['Description']
                        concept['Code'] = c['Code']
                        concept['Score'] = c['Score']
                        entity['ICD10CMConcepts'].append(concept)
                page['Entities'].append(entity)
                # make a note of this added entity
                entities.add(e['Text'].upper())
        data['results'].append(page)
    # create results file in S3 under document folder
    S3Helper.writeToS3(json.dumps(data), bucket,
                       comprehendOutputPath + "comprehendMedicalICD10.json")
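# The dedup pattern used above and in the other Comprehend processors, shown
# in isolation with hypothetical data: entities are keyed on upper-cased text,
# so "Aspirin" and "ASPIRIN" collapse into one entry while order is preserved.
seen, unique = set(), []
for e in [{"Text": "Aspirin"}, {"Text": "ASPIRIN"}, {"Text": "Ibuprofen"}]:
    if e["Text"].upper() not in seen:
        unique.append(e)
        seen.add(e["Text"].upper())
assert [e["Text"] for e in unique] == ["Aspirin", "Ibuprofen"]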
def spacy_sentences_extraction(content: str, aws_env: dict):
    excluded_pipeline = ["tagger", "ner", "textcat", "parser"]
    model_path = "/opt/python/xx_ent_wiki_sm/xx_ent_wiki_sm-2.3.0"
    sentence_content = ""
    # Fall back to the installed package when the Lambda layer path is absent
    if os.path.isdir(model_path) is False:
        model_path = "xx_ent_wiki_sm"
    nlp = spacy.load(model_path, disable=excluded_pipeline)
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    doc = nlp(content)
    print("Pipelines names: ", nlp.pipe_names)
    for sent in doc.sents:
        sentence = sent.text.replace('\n', ' ')
        sentence_content += "{}\n".format(sentence.strip())
    S3Helper.writeToS3(sentence_content, aws_env['outputBucket'],
                       aws_env['outputNameTxt'], aws_env['awsRegion'])
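# A minimal standalone sketch of the sentencizer usage above, assuming the
# spaCy 2.x API that matches nlp.create_pipe (in spaCy 3.x this would be
# nlp.add_pipe("sentencizer")):
import spacy

nlp = spacy.blank("xx")  # blank multi-language pipeline, no model download
nlp.add_pipe(nlp.create_pipe("sentencizer"))
doc = nlp("First sentence. Second one!")
print([sent.text for sent in doc.sents])  # ['First sentence.', 'Second one!']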
def processRequest(bucketName, objectName, callerId):
    output = ""
    documentId = S3Helper().getTagsS3(bucketName,
                                      objectName).get('documentId', None)
    if not documentId:
        raise Exception("Unidentified document. Please check its tags.")
    pipeline_client.body = {
        "documentId": documentId,
        "bucketName": bucketName,
        "objectName": objectName,
        "stage": PIPELINE_STAGE
    }
    pipeline_client.stageInProgress()
    print('Task ID: ' + documentId)
    if documentId and bucketName and objectName:
        print("DocumentId: {}, Object: {}/{}".format(documentId, bucketName,
                                                     objectName))
        processImage(documentId, bucketName, objectName, callerId)
        output = "Document: {}, Object: {}/{} processed.".format(
            documentId, bucketName, objectName)
        pipeline_client.stageSucceeded()
        print(output)
    else:
        pipeline_client.stageFailed()
    return {'statusCode': 200, 'body': output}
def srtToCaptions(self, vttObject):
    captions = []
    srt = ""
    try:
        self.logger.debug("Getting data from s3://" + vttObject["Bucket"] +
                          "/" + vttObject["Key"])
        srt = S3Helper().readFromS3(vttObject["Bucket"], vttObject["Key"])
        self.logger.debug(srt)
    except Exception as e:
        raise e
    # webvtt.from_srt needs a file path, so stage the content in a temp file
    f = NamedTemporaryFile(mode='w+', delete=False)
    f.write(srt)
    f.close()
    for srtcaption in webvtt.from_srt(f.name):
        caption = {}
        self.logger.debug(srtcaption)
        caption["start"] = self.formatTimeVTTtoSeconds(srtcaption.start)
        caption["end"] = self.formatTimeVTTtoSeconds(srtcaption.end)
        caption["caption"] = srtcaption.lines[0]
        self.logger.debug("Caption Object:{}".format(caption))
        captions.append(caption)
    os.unlink(f.name)  # clean up the staged temp file
    return captions
def vttToCaptions(self, vttObject):
    captions = []
    vtt = ""
    try:
        self.logger.debug("Getting data from s3://" + vttObject["Bucket"] +
                          "/" + vttObject["Key"])
        vtt = S3Helper().readFromS3(vttObject["Bucket"], vttObject["Key"])
        self.logger.debug(vtt)
    except Exception as e:
        # Log and re-raise; continuing with empty content would silently
        # produce no captions
        self.logger.error(e)
        raise
    buffer = StringIO(vtt)
    for vttcaption in webvtt.read_buffer(buffer):
        caption = {}
        caption["start"] = self.formatTimeVTTtoSeconds(vttcaption.start)
        caption["end"] = self.formatTimeVTTtoSeconds(vttcaption.end)
        caption["caption"] = vttcaption.text
        captions.append(caption)
    return captions
def processItem(bucketName, objectName, snsTopic, snsRole):
    print('Bucket Name: ' + bucketName)
    print('Object Name: ' + objectName)
    documentId = S3Helper().getTagsS3(bucketName,
                                      objectName).get('documentId', None)
    if not documentId:
        raise Exception("Unidentified document. Please check its tags.")
    print('Task ID: ' + documentId)
    pipeline_client.body = {
        "documentId": documentId,
        "bucketName": bucketName,
        "objectName": objectName,
        "stage": PIPELINE_STAGE
    }
    pipeline_client.stageInProgress()
    jobId = startJob(bucketName, objectName, documentId, snsTopic, snsRole)
    if jobId:
        pipeline_client.stageSucceeded()
        print("Started Job with Id: {}".format(jobId))
    else:
        pipeline_client.stageFailed()
    return jobId
def _outputFullTable(self, pages):
    csvData = []
    for page in self.document.pages:
        for table in page.tables:
            csvRow = []
            #csvRow.append("Table")
            csvData.append(csvRow)
            for row in table.rows:
                csvRow = []
                for cell in row.cells:
                    csvRow.append(cell.text)
                csvData.append(csvRow)
            # Blank rows separate consecutive tables in the combined CSV
            csvData.append([])
            csvData.append([])
    opath = "{}page-{}-tables.csv".format(self.outputPath, 'Full')
    S3Helper.writeCSVRaw(csvData, self.bucketName, opath)
def _outputForm(self, page, p):
    csvData = []
    for field in page.form.fields:
        csvItem = []
        if field.key:
            csvItem.append(field.key.text)
        else:
            csvItem.append("")
        if field.value:
            csvItem.append(field.value.text)
        else:
            csvItem.append("")
        csvData.append(csvItem)
    csvFieldNames = ['Key', 'Value']
    opath = "{}page-{}-forms.csv".format(self.outputPath, p)
    S3Helper.writeCSV(csvFieldNames, csvData, self.bucketName, opath)
    self.saveItem(self.documentId, "page-{}-Forms".format(p), opath)
def processAndReturnComprehendMedicalEntities(self, comprehendMedicalEntities,
                                              numOfPages, bucket,
                                              comprehendOutputPath):
    data = {}
    data['results'] = []
    medical_entities_to_index = {}
    for p in range(0, numOfPages):
        page = {}
        # page numbers start at 1
        page['Page'] = p + 1
        page['Entities'] = []
        # to detect and skip duplicates
        entities = set()
        for e in comprehendMedicalEntities[p]:
            # add this entity if not already present
            if e['Text'].upper() not in entities:
                # add entity to results list
                entity = {}
                entity['Text'] = e['Text']
                entity['Category'] = e['Category']
                if 'Score' in e:
                    entity['Score'] = e['Score']
                page['Entities'].append(entity)
                if e['Category'] not in medical_entities_to_index:
                    medical_entities_to_index[e['Category']] = []
                medical_entities_to_index[e['Category']].append(e['Text'])
                # make a note of this added entity
                entities.add(e['Text'].upper())
        data['results'].append(page)
    # create results file in S3 under document folder
    S3Helper.writeToS3(
        json.dumps(data), bucket,
        comprehendOutputPath + "comprehendMedicalEntities.json")
    return medical_entities_to_index
def lambda_handler(event, context):
    if event['status'] <= 0:
        return {**event, "errorMessage": "Status isn't positive"}
    aws_env = {
        **event,
        "bucketName": os.environ.get('DOCUMENTS_BUCKET'),
        "awsRegion": 'eu-west-1',
        "tmpJsonOutput": "/tmp/tmp_result.json",
        "tmpTxtOutput": "/tmp/tmp_result.txt",
        "outputBucket": os.environ.get('DOCUMENTS_BUCKET'),
        "outputNameJson": get_bbox_filename(event['objectName'], ".json"),
        "outputNameTxt": get_bbox_filename(event['objectName'], ".txt"),
        "textractOnly": os.environ.get('TEXTRACT_ONLY'),
        "minCharNeeded": int(os.environ.get('MIN_CHAR_NEEDED')),
        "extract_pdf_lines": os.environ.get('EXTRACT_PDF_LINES'),
    }
    extract_pdf_lines = aws_env['extract_pdf_lines']
    textract_only = aws_env['textractOnly']
    tmp_folder = "/tmp/pdfToBbox"
    pdf_tmp_path = copy_pdf_to_tmp(tmp_folder, aws_env)
    print("==> aws_env: ", aws_env)
    if textract_only == "false" and is_pdf_has_enough_characters(
            pdf_tmp_path, aws_env['minCharNeeded']):
        print("=> Extracting bounding box with pdfplumber")
        if extract_pdf_lines == "true":
            print("=> Extracting pdf lines bbox")
            pdf = Pdf(pdf_tmp_path, aws_env['tmpJsonOutput'],
                      aws_env['tmpTxtOutput'])
            pdf.parse_pdf()
            pdf.save_in_json()
            pdf.save_in_txt()
            write_bbox_to_s3(aws_env)
        else:
            print("=> Extracting pdf words bbox")
            if execute_pdf_to_bbox(pdf_tmp_path, aws_env['tmpJsonOutput']):
                print("=> Error while trying to get pdf information")
                aws_env["status"] = -1
                aws_env["errorMessage"] = "PDF format not supported."
            else:
                write_bbox_to_s3(aws_env)
    else:
        print("Extracting bounding box with textract")
        #send_to_textract(aws_env)
    aws_env['size'] = S3Helper.getS3FileSize(aws_env['bucketName'],
                                             aws_env['outputNameTxt'],
                                             aws_env['awsRegion'])
    aws_env["s3Url"] = get_new_s3_url(aws_env['s3Url'], "txt")
    aws_env["status"] = 1
    aws_env["errorMessage"] = None
    aws_env["contentType"] = "text/txt"
    aws_env['objectName'] = aws_env['outputNameTxt']
    aws_env["sourceUrl"] = aws_env["s3Url"]
    AwsHelper.refreshTmpFolder(tmp_folder)
    return update_event(aws_env, event)
def _outputTable(self, page, p, no_write=False):
    csvData = []
    for table in page.tables:
        csvRow = []
        csvRow.append("Table")
        csvData.append(csvRow)
        for row in table.rows:
            csvRow = []
            for cell in row.cells:
                csvRow.append(cell.text)
            csvData.append(csvRow)
        # Blank rows separate consecutive tables
        csvData.append([])
        csvData.append([])
    if no_write:
        return csvData
    opath = "{}/page-{}/tables.csv".format(self.outputPath, p)
    S3Helper.writeCSVRaw(csvData, self.bucketName, opath)
def _outputTable(self, page, p):
    csvData = []
    for table in page.tables:
        csvRow = []
        csvRow.append("Table")
        csvData.append(csvRow)
        for row in table.rows:
            csvRow = []
            for cell in row.cells:
                csvRow.append(cell.text)
            csvData.append(csvRow)
        csvData.append([])
        csvData.append([])
    opath = "{}page-{}-tables.csv".format(self.outputPath, p)
    S3Helper.writeCSVRaw(csvData, self.bucketName, opath)
    self.saveItem(self.documentId, "page-{}-Tables".format(p), opath)
def processRequest(request): output = "" logger.info("request: {}".format(request)) bucketName = request["bucketName"] sourceLanguageCode = request["sourceLanguage"] targetLanguageCode = request["targetLanguage"] access_role = request["access_role"] triggerFile = request["trigger_file"] try: # Filter only the JSON files for processing objs = S3Helper().getFilteredFileNames(bucketName, "input/", "json") for obj in objs: try: content = S3Helper().readFromS3(bucketName, obj) logger.debug(content) jsonDocument = json.loads(content) print(jsonDocument) # Convert the JSON document into XML outputXML = json2xml.Json2xml(jsonDocument, attr_type=False).to_xml() logger.debug(outputXML) newObjectKey = "xmlin/{}.xml".format( FileHelper.getFileName(obj)) # Store the XML in the S3 location for Translation S3Helper().writeToS3(str(outputXML), bucketName, newObjectKey) output = "Output Object: {}/{}".format(bucketName, newObjectKey) logger.debug(output) # Rename the JSON files to prevent reprocessing S3Helper().renameObject(bucketName, obj, "{}.processed".format(obj)) except ValueError: logger.error( "Error occured loading the json file:{}".format(obj)) except ClientError as e: logger.error("An error occured with S3 Bucket Operation: %s" % e) # Start the translation batch job using Amazon Translate startTranslationJob(bucketName, sourceLanguageCode, targetLanguageCode, access_role) S3Helper().deleteObject(bucketName, "input/{}".format(triggerFile)) except ClientError as e: logger.error("An error occured with S3 Bucket Operation: %s" % e)
def processComprehendEntities(self, comprehendEntities, numOfPages, bucket,
                              documentPath):
    data = {}
    data['results'] = []
    entities_to_index = {}
    # process comprehend entities for each page
    for p in range(0, numOfPages):
        page = {}
        # page numbers start at 1 but the list of page data starts at 0
        page['Page'] = p + 1
        page['Entities'] = []
        # to detect and skip duplicates
        entities = set()
        for e in comprehendEntities[p]['Entities']:
            # add this entity if not already present
            if e['Text'].upper() not in entities:
                # add entity to results list
                entity = {}
                entity['Text'] = e['Text']
                entity['Type'] = e['Type']
                entity['Score'] = e['Score']
                page['Entities'].append(entity)
                if e['Type'] not in entities_to_index:
                    entities_to_index[e['Type']] = []
                entities_to_index[e['Type']].append(e['Text'])
                # make a note of this added entity
                entities.add(e['Text'].upper())
        data['results'].append(page)
    # create results file in S3 under document folder
    S3Helper.writeToS3(json.dumps(data), bucket,
                       documentPath + "comprehendEntities.json")
    return entities_to_index
def _outputTable(self, page, p):
    page_number = self.metadata['page_number']
    csvData = []
    for table in page.tables:
        csvRow = []
        csvRow.append("Table")
        csvData.append(csvRow)
        for row in table.rows:
            csvRow = []
            for cell in row.cells:
                csvRow.append(cell.text)
            csvData.append(csvRow)
        csvData.append([])
        csvData.append([])
    opath = "{}page-{}-tables.csv".format(self.outputPath, page_number)
    S3Helper.writeCSVRaw(csvData, self.bucketName, opath)
    return opath