def _outputText(self, page, p):
    """Persist page *p*'s reading-order text to S3, record the S3 key, and index it."""
    # Reading-order extraction is used rather than the raw page.text.
    reading_order_text = page.getTextInReadingOrder()
    key = "{}page-{}-text.txt".format(self.outputPath, p)
    S3Helper.writeToS3(reading_order_text, self.bucketName, key)
    self.saveItem(self.documentId, "page-{}-Text".format(p), key)
    # Make the extracted text searchable.
    self.indexDocument(self.bucketName, key, reading_order_text)
def run(self):
    """Write the full Textract response, per-page blocks, and page text to S3.

    Returns:
        dict with keys "csv" (last page's table output, or None when table
        extraction is disabled) and "text_file" (last page's text output),
        or None when the document has no pages.
    """
    if not self.document.pages:
        return

    opath = "{}response.json".format(self.outputPath)
    S3Helper.writeToS3(json.dumps(self.response), self.bucketName, opath)
    print("Total Pages in Document: {}".format(len(self.document.pages)))

    docText = ""
    # Pre-initialize so the return statement cannot raise NameError when
    # table extraction is disabled (csv_file was previously unbound).
    csv_file = None
    text_file = None
    p = 1
    for page in self.document.pages:
        # Fixed: the original formatted this key with the constant
        # self.metadata['page_number'], so every page in the loop
        # overwrote the same S3 object; use the running counter instead
        # (consistent with the sibling run() implementation).
        opath = "{}page-{}-response.json".format(self.outputPath, p)
        S3Helper.writeToS3(json.dumps(page.blocks), self.bucketName, opath)
        text_file = self._outputText(page, p)
        docText = docText + page.text + "\n"
        if self.tables:
            csv_file = self._outputTable(page, p)
        p = p + 1
    return {"csv": csv_file, "text_file": text_file}
def run(self):
    """Write per-page Textract responses, text, form, and table outputs to S3."""
    if not self.document.pages:
        return

    # Whole raw response first.
    response_key = "{}response.json".format(self.outputPath)
    S3Helper.writeToS3(json.dumps(self.response), self.bucketName, response_key)
    self.saveItem(self.documentId, "Response", response_key)

    print("Total Pages in Document: {}".format(len(self.document.pages)))

    docText = ""
    for page_number, page in enumerate(self.document.pages, start=1):
        page_key = "{}page-{}-response.json".format(self.outputPath, page_number)
        S3Helper.writeToS3(json.dumps(page.blocks), self.bucketName, page_key)
        self.saveItem(self.documentId,
                      "page-{}-Response".format(page_number), page_key)
        self._outputText(page, page_number)
        docText = docText + page.text + "\n"
        if self.forms:
            self._outputForm(page, page_number)
        if self.tables:
            self._outputTable(page, page_number)
def run(self):
    """Upload a compacted Textract response to S3 and return the document text."""
    if not self.document.pages:
        return

    # Prune redundant blocks and round floats to keep the stored JSON small.
    compact_json = json.dumps(
        round_floats(prune_blocks(self.response)), separators=(',', ':'))
    response_key = "{}response.json".format(self.outputPath)
    S3Helper.writeToS3(compact_json, self.bucketName, response_key)
    self.saveItem(self.documentId, 'Response', response_key)

    print("Total Pages in Document: {}".format(len(self.document.pages)))

    docText = ""
    for page_number, page in enumerate(self.document.pages, start=1):
        docText = docText + page.text + "\n"
        if self.forms:
            self._outputForm(page, page_number)
        if self.tables:
            self._outputTable(page, page_number)
    return docText
def run(self):
    """Upload a compacted Textract response under the Textract S3 prefix and
    return the document text plus extracted form key/value pairs.

    Returns:
        dict {DOCTEXT: concatenated page text,
              KVPAIRS: form key/value pairs (None when forms are disabled)},
        or None when the document has no pages.
    """
    if not self.document.pages:
        return

    opath = "{}{}response.json".format(self.outputPath, TEXTRACT_PATH_S3_PREFIX)
    S3Helper.writeToS3(json.dumps(round_floats(prune_blocks(
        self.response)), separators=(',', ':')), self.bucketName, opath)
    self.saveItem(self.documentId,
                  '{}Response'.format(TEXTRACT_PATH_S3_PREFIX), opath)

    print("Total Pages in Document: {}".format(len(self.document.pages)))

    docText = ""
    # Pre-initialize so the return below cannot raise NameError when form
    # extraction is disabled (key_val_pairs was previously unbound).
    key_val_pairs = None
    p = 1
    for page in self.document.pages:
        docText = docText + page.text + "\n"
        if self.forms:
            # NOTE(review): overwritten on every iteration — only the last
            # page's key/value pairs are returned; confirm that is intended.
            key_val_pairs = self._outputForm(page, p)
        if self.tables:
            self._outputTable(page, p)
        p = p + 1
    return {DOCTEXT: docText, KVPAIRS: key_val_pairs}
def write_extracted_zip(aws_env: dict, zip_tmp: str):
    """Upload every file under *zip_tmp* to S3.

    Files that decode as text are uploaded as str; files that raise
    UnicodeDecodeError are re-read and uploaded as raw bytes.

    aws_env must provide 'bucketName', 'outputName', and 'awsRegion'.
    """
    output_bucket = aws_env['bucketName']
    output_folder = aws_env['outputName']
    aws_region = aws_env['awsRegion']
    print("Writing s3://{0}/{1} in {2}".format(output_bucket, output_folder,
                                               aws_region))
    for path, folders, files in os.walk(zip_tmp):
        print("=> Path: {0}".format(path))
        for file_name in files:
            # Fixed: the original printed the whole `files` list here
            # instead of the current file.
            print("=> File: {0}".format(file_name))
            file_path = os.path.join(path, file_name)
            # NOTE(review): os.path.join builds the S3 key, which would
            # produce backslashes on Windows — confirm the deployment
            # target is POSIX (e.g. Lambda).
            s3_output_path = os.path.join(output_folder, file_name)
            try:
                with open(file_path, "r") as open_file:
                    content = open_file.read()
                # Fixed: the original repeated placeholder {0} twice.
                print("=> Writing {0} to s3: {1}".format(file_path,
                                                         s3_output_path))
                S3Helper.writeToS3(content, output_bucket, s3_output_path,
                                   aws_region)
            except UnicodeDecodeError:
                # Binary file: re-open without text decoding.
                with open(file_path, "rb") as open_file:
                    content = open_file.read()
                # Fixed: the original format string had one placeholder
                # but was given two arguments.
                print("=> Writing {0} to s3: {1}".format(file_path,
                                                         s3_output_path))
                S3Helper.writeToS3(content, output_bucket, s3_output_path,
                                   aws_region)
def run(self):
    """Store the raw Textract response JSON and the concatenated document text."""
    if not self.document.pages:
        return

    json_key = "{}response.json".format(self.outputPath)
    S3Helper.writeToS3(json.dumps(self.response), self.bucketName, json_key)
    self.saveItem(self.documentId, 'Response', json_key)

    print("Total Pages in Document: {}".format(len(self.document.pages)))

    # Per-page JSON and per-page text outputs are intentionally skipped to
    # avoid creating one S3 object per page; only the concatenated document
    # text is written below.
    docText = ""
    for page_number, page in enumerate(self.document.pages, start=1):
        docText = docText + page.text + "\n"
        if self.forms:
            self._outputForm(page, page_number)
        if self.tables:
            self._outputTable(page, page_number)

    text_key = "{}response.txt".format(self.outputPath)
    S3Helper.writeToS3(docText, self.bucketName, text_key)
    # NOTE(review): this reuses the 'Response' item key and so replaces the
    # JSON path saved above — confirm the overwrite is intended.
    self.saveItem(self.documentId, 'Response', text_key)
def write_bbox_to_s3(aws_env: dict) -> None:
    """Upload the temporary bounding-box JSON and text outputs to S3."""
    # (local temp file, destination S3 key) pairs, uploaded in order.
    uploads = (
        (aws_env['tmpJsonOutput'], aws_env['outputNameJson']),
        (aws_env['tmpTxtOutput'], aws_env['outputNameTxt']),
    )
    for local_path, s3_key in uploads:
        with open(local_path, "r") as handle:
            S3Helper.writeToS3(handle.read(), aws_env['outputBucket'],
                               s3_key, aws_env['awsRegion'])
def _outputText(self, page, p):
    """Write the page's plain text and reading-order text to S3.

    Returns the S3 key of the reading-order text file.
    """
    # The page number comes from the document metadata; the `p` argument is
    # unused here but kept for interface compatibility with callers.
    page_number = self.metadata['page_number']

    plain_key = "{}page-{}-text.txt".format(self.outputPath, page_number)
    S3Helper.writeToS3(page.text, self.bucketName, plain_key)

    reading_order_key = "{}page-{}-text-inreadingorder.txt".format(
        self.outputPath, page_number)
    S3Helper.writeToS3(page.getTextInReadingOrder(), self.bucketName,
                       reading_order_key)
    return reading_order_key
def _outputText(self, page, p):
    """Write plain and reading-order page text under the Textract S3 prefix,
    recording each output against the document."""
    prefix = "{}{}".format(self.outputPath, TEXTRACT_PATH_S3_PREFIX)

    plain_key = "{}page-{}-text.txt".format(prefix, p)
    S3Helper.writeToS3(page.text, self.bucketName, plain_key)
    self.saveItem(self.documentId,
                  "{}page-{}-Text".format(TEXTRACT_PATH_S3_PREFIX, p),
                  plain_key)

    reading_order_key = "{}page-{}-text-inreadingorder.txt".format(prefix, p)
    S3Helper.writeToS3(page.getTextInReadingOrder(), self.bucketName,
                       reading_order_key)
    self.saveItem(
        self.documentId,
        "{}page-{}-TextInReadingOrder".format(TEXTRACT_PATH_S3_PREFIX, p),
        reading_order_key)
def _outputText(self, page, p, no_write=False):
    """Extract the page's text; optionally write both variants to S3.

    When no_write is True, returns (text, textInReadingOrder) without
    touching S3. Otherwise the plain text and the reading-order text are
    each written to their own per-page S3 key.
    """
    text = page.text
    textInReadingOrder = page.getTextInReadingOrder()
    if no_write:
        return (text, textInReadingOrder)
    # Fixed: the original reassigned `opath` before the first write, so
    # both writes went to the reading-order key — text.txt was never
    # created and the reading-order file was overwritten with plain text.
    text_key = "{}/page-{}/text.txt".format(self.outputPath, p)
    S3Helper.writeToS3(text, self.bucketName, text_key)
    reading_order_key = "{}/page-{}/text-inreadingorder.txt".format(
        self.outputPath, p)
    S3Helper.writeToS3(textInReadingOrder, self.bucketName, reading_order_key)
def processComprehendMedicalICD10(self, comprehendMedicalICD10, numOfPages, bucket, comprehendOutputPath):
    """De-duplicate per-page Comprehend Medical ICD-10 entities and write
    the aggregated results as JSON to S3 under the document folder.

    comprehendMedicalICD10: list indexed by 0-based page number; each item
    is a list of entity dicts as returned by Comprehend Medical.
    """
    data = {}
    data['results'] = []
    for p in range(0, numOfPages):
        page = {}
        # Page numbers start at 1 in the report.
        page['Page'] = p + 1
        page['Entities'] = []
        # Upper-cased entity texts already emitted for this page.
        entities = set()
        for e in comprehendMedicalICD10[p]:
            # Skip duplicates (case-insensitive on entity text).
            if e['Text'].upper() in entities:
                continue
            entity = {}
            entity['Text'] = e['Text']
            entity['Category'] = e['Category']
            entity['Type'] = e['Type']
            entity['ICD10CMConcepts'] = []
            if 'ICD10CMConcepts' in e:
                for c in e['ICD10CMConcepts']:
                    concept = {}
                    concept['Description'] = c['Description']
                    concept['Code'] = c['Code']
                    concept['Score'] = c['Score']
                    entity['ICD10CMConcepts'].append(concept)
            page['Entities'].append(entity)
            # Make a note of this added entity.
            # Fixed: the original executed this add twice in a row.
            entities.add(e['Text'].upper())
        data['results'].append(page)
    # Create the results file in S3 under the document folder.
    S3Helper.writeToS3(json.dumps(data), bucket,
                       comprehendOutputPath + "comprehendMedicalICD10.json")
def spacy_sentences_extraction(content: str, aws_env: dict):
    """Split *content* into sentences with spaCy and upload them to S3,
    one sentence per line."""
    model_path = "/opt/python/xx_ent_wiki_sm/xx_ent_wiki_sm-2.3.0"
    # Fall back to the installed package name when the bundled model
    # directory (e.g. a Lambda layer) is absent.
    if not os.path.isdir(model_path):
        model_path = "xx_ent_wiki_sm"
    # Only sentence segmentation is needed; disable the heavier pipelines.
    nlp = spacy.load(model_path,
                     disable=["tagger", "ner", "textcat", "parser"])
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    doc = nlp(content)
    print("Pipelines names: ", nlp.pipe_names)
    lines = []
    for sent in doc.sents:
        flattened = sent.text.replace('\n', ' ')
        lines.append("{}\n".format(flattened.strip()))
    S3Helper.writeToS3("".join(lines), aws_env['outputBucket'],
                       aws_env['outputNameTxt'], aws_env['awsRegion'])
def processAndReturnComprehendMedicalEntities(self, comprehendMedicalEntities, numOfPages, bucket, comprehendOutputPath):
    """Build a de-duplicated per-page medical entity report, upload it to
    S3 under the document folder, and return a {category: [text, ...]}
    mapping for indexing."""
    data = {'results': []}
    medical_entities_to_index = {}
    for page_index in range(numOfPages):
        # Page numbers are 1-based in the report; input is 0-based.
        page = {'Page': page_index + 1, 'Entities': []}
        # Upper-cased entity texts already seen on this page.
        seen = set()
        for raw in comprehendMedicalEntities[page_index]:
            marker = raw['Text'].upper()
            if marker in seen:
                continue  # skip case-insensitive duplicates
            entity = {'Text': raw['Text'], 'Category': raw['Category']}
            if 'Score' in raw:
                entity['Score'] = raw['Score']
            page['Entities'].append(entity)
            medical_entities_to_index.setdefault(
                raw['Category'], []).append(raw['Text'])
            seen.add(marker)
        data['results'].append(page)
    # Create the results file in S3 under the document folder.
    S3Helper.writeToS3(
        json.dumps(data), bucket,
        comprehendOutputPath + "comprehendMedicalEntities.json")
    return medical_entities_to_index
def processComprehendEntities(self, comprehendEntities, numOfPages, bucket, documentPath):
    """Build per-page de-duplicated Comprehend entity results, upload them
    as JSON to S3, and return a {type: [text, ...]} mapping for indexing."""
    data = {'results': []}
    entities_to_index = {}
    for page_index in range(numOfPages):
        # Page numbers start at 1; the input list starts at 0.
        page = {'Page': page_index + 1, 'Entities': []}
        # Upper-cased entity texts already seen on this page.
        seen = set()
        for raw in comprehendEntities[page_index]['Entities']:
            marker = raw['Text'].upper()
            if marker in seen:
                continue  # skip case-insensitive duplicates
            page['Entities'].append({
                'Text': raw['Text'],
                'Type': raw['Type'],
                'Score': raw['Score'],
            })
            entities_to_index.setdefault(raw['Type'], []).append(raw['Text'])
            seen.add(marker)
        data['results'].append(page)
    # Create the results file in S3 under the document folder.
    S3Helper.writeToS3(json.dumps(data), bucket,
                       documentPath + "comprehendEntities.json")
    return entities_to_index
def writeTextractOutputs(self, taggingStr=None):
    """Write per-page blocks, text, forms, and tables, then the complete
    response, to S3 (each object optionally tagged with *taggingStr*)."""
    if not self.document.pages:
        return

    docText = ""
    for page_number, page in enumerate(self.document.pages, start=1):
        page_key = "{}/page-{}/response.json".format(self.outputPath,
                                                     page_number)
        S3Helper.writeToS3(json.dumps(page.blocks), self.bucketName,
                           page_key, taggingStr)
        self._outputText(page, page_number)
        docText = docText + page.text + "\n"
        if self.forms:
            self._outputForm(page, page_number)
        if self.tables:
            self._outputTable(page, page_number)

    # The whole response is written last so Comprehend can consume it.
    full_key = "{}/fullresponse.json".format(self.outputPath)
    print("Total Pages in Document: {}".format(len(self.document.pages)))
    S3Helper.writeToS3(json.dumps(self.response), self.bucketName,
                       full_key, taggingStr)
def processImage(itemId, bucketName, objectName, outputBucketName, itemsTableName):
    """Run Rekognition on an image, store the raw response in S3, and mark
    the item complete in the items datastore."""
    # The Rekognition API to call is encoded as the first path segment of
    # the object key.
    apiName = objectName.split("/")[0]
    response = callRekognition(bucketName, objectName, apiName)

    print("Generating output for ItemId: {}".format(itemId))
    print(response)

    outputPath = "sync/{}-analysis/{}/".format(objectName, itemId)
    S3Helper.writeToS3(json.dumps(response), outputBucketName,
                       "{}response.json".format(outputPath))

    print("ItemId: {}".format(itemId))
    datastore.ItemStore(itemsTableName).markItemComplete(itemId)
def processRequest(request):
    """Collect an async Textract job's results, persist them to S3, mark
    the item complete, and return an HTTP-style summary response."""
    print(request)

    jobId = request['jobId']
    jobTag = request['jobTag']
    jobStatus = request['jobStatus']
    jobAPI = request['jobAPI']
    bucketName = request['bucketName']
    objectName = request['objectName']
    outputBucket = request["outputBucket"]
    itemsTable = request["itemsTable"]

    pages = getJobResults(jobAPI, jobId)
    print("Result pages received: {}".format(len(pages)))
    print(pages)

    outputPath = "async/{}-analysis/{}/".format(objectName, jobTag)
    S3Helper.writeToS3(json.dumps(pages), outputBucket,
                       "{}response.json".format(outputPath))

    print("ItemId: {}".format(jobTag))
    datastore.ItemStore(itemsTable).markItemComplete(jobTag)

    output = "Processed -> Item: {}, Object: {}/{} processed.".format(
        jobTag, bucketName, objectName)
    print(output)
    return {'statusCode': 200, 'body': output}
def _outputText(self, page, p):
    """Write plain text, reading-order text, and a Comprehend entity
    analysis of page *p* to S3, recording each output key."""
    plain_key = "{}page-{}-text.txt".format(self.outputPath, p)
    S3Helper.writeToS3(page.text, self.bucketName, plain_key)
    self.saveItem(self.documentId, "page-{}-Text".format(p), plain_key)

    reading_order = page.getTextInReadingOrder()
    reading_key = "{}page-{}-text-inreadingorder.txt".format(self.outputPath, p)
    S3Helper.writeToS3(reading_order, self.bucketName, reading_key)
    self.saveItem(self.documentId,
                  "page-{}-TextInReadingOrder".format(p), reading_key)

    # Entity analysis runs on the reading-order text, not the raw text.
    entity_key = "{}page-{}-text-entity.txt".format(self.outputPath, p)
    S3Helper.writeToS3(self.awsComprehend(reading_order),
                       self.bucketName, entity_key)
    self.saveItem(self.documentId, "page-{}-EntityText".format(p), entity_key)
def run(self):
    """Write all pages' blocks as one JSON document, per-page text outputs,
    and whole-document text (plain and reading-order) to S3."""
    if not self.document.pages:
        return

    # Per-document response.json output is intentionally skipped here.
    print("Total Pages in Document: {}".format(len(self.document.pages)))

    # All pages' blocks collected into a single JSON file.
    all_blocks = [page.blocks for page in self.document.pages]
    pages_key = "{}pages.json".format(self.outputPath)
    S3Helper.writeToS3(json.dumps(all_blocks), self.bucketName, pages_key)
    self.saveItem(self.documentId, "All pages", pages_key)

    docText = ""
    docTextInReadingOrder = ""
    for page_number, page in enumerate(self.document.pages, start=1):
        # Per-page response.json output is intentionally skipped here.
        self._outputText(page, page_number)
        docText = docText + page.text + "\n"
        docTextInReadingOrder = (docTextInReadingOrder
                                 + page.getTextInReadingOrder() + "\n")
        if self.forms:
            self._outputForm(page, page_number)
        if self.tables:
            self._outputTable(page, page_number)

    S3Helper.writeToS3(docText, self.bucketName,
                       "{}text.txt".format(self.outputPath))
    S3Helper.writeToS3(docTextInReadingOrder, self.bucketName,
                       "{}text-inreadingorder.txt".format(self.outputPath))
def test_write_to_s3(self):
    """Round-trip a small string through S3Helper.writeToS3 and read it back."""
    payload = "Hello World"
    S3Helper.writeToS3(payload, BUCKET_NAME, S3_FILE_NAME, REGION)
    stored = self.conn.Object(BUCKET_NAME, S3_FILE_NAME).get()
    self.assertEqual(stored['Body'].read().decode('utf-8'), payload)