Python S3Helper.readFromS3 Examples

Programming Language: Python

Namespace/Package Name: helper

Class/Type: S3Helper

Method/Function: readFromS3

Examples at hotexamples.com: 7

Python S3Helper.readFromS3 - 7 examples found. These are the top rated real world Python examples of helper.S3Helper.readFromS3 extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

writeToS3(21)

S3Helper(14)

readFromS3(7)

writeCSVRaw(4)

writeCSV(3)

getS3BucketRegion(2)

getS3FileSize(2)

getFileNames(1)

readBytesFromS3(1)

Example #1

Show file

File: esindex.py Project: LouisLoison/deepblooGit

def getResults(bucketName, outputPath):
    """Load the stored result files found under ``outputPath`` in a bucket.

    Three objects are read through ``S3Helper.readFromS3``; their keys are
    ``outputPath`` followed by ``pages.json``, ``text.txt`` and
    ``text-inreadingorder.txt``.

    Args:
        bucketName: Bucket passed to ``S3Helper.readFromS3``.
        outputPath: Key prefix prepended verbatim to each file name (no
            separator is inserted here).

    Returns:
        dict with keys ``responseByPage`` (the JSON-decoded ``pages.json``),
        ``fullText`` and ``fullTextReadingOrder`` (both as returned by
        ``S3Helper.readFromS3``).
    """
    pagesKey = "{}pages.json".format(outputPath)
    textKey = "{}text.txt".format(outputPath)
    readingOrderKey = "{}text-inreadingorder.txt".format(outputPath)

    # Reads happen in the same order as the dict keys below.
    responseByPage = json.loads(S3Helper.readFromS3(bucketName, pagesKey))
    fullText = S3Helper.readFromS3(bucketName, textKey)
    fullTextReadingOrder = S3Helper.readFromS3(bucketName, readingOrderKey)

    return {
        "responseByPage": responseByPage,
        "fullText": fullText,
        "fullTextReadingOrder": fullTextReadingOrder,
    }

Example #2

Show file

File: redact.py Project: juanlamadrid20/amazon-textract-document-understanding-solution

def getPageResponse(request):
    """Return a document record, enriched with one page's stored JSON response.

    Args:
        request: Mapping that must contain ``documentsTable``,
            ``outputTable``, ``documentId`` and ``page``.

    Returns:
        ``{}`` when the document store yields nothing for ``documentId``.
        Otherwise the document itself; when its ``documentStatus`` is
        ``"SUCCEEDED"`` the JSON-decoded page response read from S3 is
        attached under ``textractResponse``.
    """
    # All four keys are read up front so a malformed request fails early.
    documentsTableName = request["documentsTable"]
    outputTableName = request["outputTable"]
    requestedId = request["documentId"]
    pageNumber = request["page"]

    store = datastore.DocumentStore(documentsTableName, outputTableName)
    document = store.getDocument(requestedId)

    if not document:
        return {}

    if document["documentStatus"] == "SUCCEEDED":
        responseKey = "{}-analysis/{}/page-{}-response.json".format(
            document["objectName"], document["documentId"], pageNumber)
        rawResponse = S3Helper.readFromS3(document["bucketName"], responseKey)
        document["textractResponse"] = json.loads(rawResponse)

    return document

Example #3

Show file

File: redact.py Project: juanlamadrid20/amazon-textract-document-understanding-solution

def getPageForm(request):
    """Return a document record, enriched with one page's parsed forms CSV.

    Args:
        request: Mapping that must contain ``documentsTable``,
            ``outputTable``, ``documentId`` and ``page``.

    Returns:
        ``{}`` when the document store yields nothing for ``documentId``.
        Otherwise the document itself; when its ``documentStatus`` is
        ``"SUCCEEDED"`` the result of ``parsePairs`` on the page's
        ``-forms.csv`` object is attached under ``textractResponse``.
    """
    documentsTable = request["documentsTable"]
    outputTable = request["outputTable"]
    documentId = request["documentId"]
    page = request["page"]

    ds = datastore.DocumentStore(documentsTable, outputTable)
    doc = ds.getDocument(documentId)

    # A leftover debug print that always emitted "{}" used to sit here; it
    # carried no information and has been removed.
    if not doc:
        return {}

    if doc["documentStatus"] == "SUCCEEDED":
        fileName = "{}-analysis/{}/page-{}-forms.csv".format(
            doc["objectName"], doc["documentId"], page)
        formsCsv = S3Helper.readFromS3(doc["bucketName"], fileName)
        doc["textractResponse"] = parsePairs(formsCsv)

    return doc

Example #4

Show file

File: redact.py Project: weisisheng/document-understanding-solution

def getPageTable(request):
    """Return the parsed tables stored for one page of a document.

    Args:
        request: Mapping that must contain ``documentsTable``,
            ``outputTable``, ``documentId`` and ``page``.

    Returns:
        dict of the form ``{"tables": [...]}``. The list is empty when the
        document is not found, its ``documentStatus`` is not
        ``"SUCCEEDED"``, or parsing yields a falsy result.
    """
    documentsTable = request["documentsTable"]
    outputTable = request["outputTable"]
    documentId = request["documentId"]
    page = request["page"]

    ds = datastore.DocumentStore(documentsTable, outputTable)
    doc = ds.getDocument(documentId)

    output = {"tables": []}

    # Previously `tables` was only bound inside the success branch, so a
    # missing or unfinished document raised UnboundLocalError further down.
    # Return the empty result for those cases instead.
    if not (doc and doc["documentStatus"] == "SUCCEEDED"):
        return output

    fileName = "{}-analysis/{}/page-{}-tables.csv".format(
        doc["objectName"], doc["documentId"], page)
    tablesCsv = S3Helper.readFromS3(doc["bucketName"], fileName)
    tables = parseTables(getTableFromString(tablesCsv))
    if tables:
        output["tables"] = tables
    return output

Example #5

Show file

File: comprehendHelper.py Project: weisisheng/document-understanding-solution

    def processComprehend(self,
                          bucket,
                          textractResponseLocation,
                          comprehendOutputPath,
                          maxPages=200):
        """Run the Comprehend / Comprehend Medical workers over a stored response.

        The JSON document at ``textractResponseLocation`` is read from S3,
        split into per-page raw text, and processed in batches of at most
        ``PAGES_PER_BATCH`` pages. Each batch starts one thread for
        ``batchComprehendDetectEntitiesSync`` plus one thread per page for
        each of ``comprehendMedicalDetectEntitiesSync`` and
        ``comprehendMedicalDetectICD10Sync``, then joins them all before
        moving on to the next batch.

        Args:
            bucket: Bucket the response is read from; also forwarded to the
                result-processing helpers.
            textractResponseLocation: Object key of the JSON response.
            comprehendOutputPath: Output path forwarded to the
                result-processing helpers.
            maxPages: Upper bound on the number of pages processed; any
                pages beyond it are ignored.

        Returns:
            ``False`` when the document reports no pages, when text
            extraction fails, or when any page in a batch is left without an
            entities or medical-entities result. Otherwise the value
            returned by ``processAndReturnComprehendEntities`` after being
            ``update``d with the medical entities.
        """
        # get textract results from S3
        textractFile = S3Helper.readFromS3(
            bucket, textractResponseLocation)
        textract = json.loads(textractFile)

        # total number of textracted pages
        numOfPages = self.getNumOfPages(textract)

        # error
        if numOfPages <= 0:
            return False

        # enforce a maximum of pages to be processed
        if numOfPages > maxPages:
            numOfPages = maxPages

        # iterate over results page by page and extract raw text for comprehend
        rawPages = [""] * numOfPages
        if self.extractTextByPages(textract, rawPages, numOfPages) == False:
            return False

        # to store comprehend and medical API calls results.
        comprehendEntities = [None] * numOfPages
        comprehendMedicalEntities = [None] * numOfPages
        comprehendMedicalICD10 = [None] * numOfPages

        # process pages by batch; stepping the start index by PAGES_PER_BATCH
        # yields ceil(numOfPages / PAGES_PER_BATCH) batches without any
        # floating-point division
        for pageStartIndex in range(0, numOfPages, PAGES_PER_BATCH):

            # the last batch may be shorter than PAGES_PER_BATCH
            pagesToProcess = min(PAGES_PER_BATCH, numOfPages - pageStartIndex)

            # keep track of all threads we spawn
            threads = []

            # Comprehend call that handles the whole batch in one thread
            worker = threading.Thread(
                target=self.batchComprehendDetectEntitiesSync,
                args=(rawPages, pagesToProcess, pageStartIndex,
                      comprehendEntities))
            worker.start()
            threads.append(worker)

            # comprehendMedicalEntities is shared among threads
            medicalEntitiesMutex = threading.Lock()

            # ComprehendMedical entities: one thread per page
            for index in range(pagesToProcess):
                worker = threading.Thread(
                    target=self.comprehendMedicalDetectEntitiesSync,
                    args=(rawPages,
                          pageStartIndex + index,
                          comprehendMedicalEntities,
                          medicalEntitiesMutex))
                worker.start()
                threads.append(worker)

            # comprehendMedicalICD10 is shared among threads
            medicalICD10Mutex = threading.Lock()

            # ComprehendMedical ICD10: one thread per page
            for index in range(pagesToProcess):
                worker = threading.Thread(
                    target=self.comprehendMedicalDetectICD10Sync,
                    args=(rawPages,
                          pageStartIndex + index,
                          comprehendMedicalICD10,
                          medicalICD10Mutex))
                worker.start()
                threads.append(worker)

            # wait on all threads to finish their work
            for thread in threads:
                thread.join()

            print("all threads joined...")

            # check success of threads for every page of THIS batch.
            # The earlier loop used range(pageStartIndex, pagesToProcess)
            # and then added pageStartIndex again, so for every batch after
            # the first the range was empty and failures went undetected.
            # NOTE(review): ICD10 results are not checked here, matching the
            # original behaviour - confirm that is intentional.
            for offset in range(pagesToProcess):
                pageIndex = pageStartIndex + offset
                if (comprehendEntities[pageIndex] is None
                        or comprehendMedicalEntities[pageIndex] is None):
                    print("a page failed to process" + str(pageIndex))
                    return False

        # process comprehend data, create the entities result file in S3
        processedComprehendData = self.processAndReturnComprehendEntities(
            comprehendEntities,
            numOfPages,
            bucket,
            comprehendOutputPath)

        # process comprehend medical data, create the entities result file in S3
        comprehendMedicalEntities = self.processAndReturnComprehendMedicalEntities(
            comprehendMedicalEntities,
            numOfPages,
            bucket,
            comprehendOutputPath)

        # final list of comprehend and comprehend medical entities to be indexed
        processedComprehendData.update(comprehendMedicalEntities)

        # process comprehend medical data, create the ICD10 result file in S3
        self.processComprehendMedicalICD10(comprehendMedicalICD10,
                                           numOfPages,
                                           bucket,
                                           comprehendOutputPath)

        return processedComprehendData

Example #6

Show file

File: test_helper.py Project: awslabs/document-understanding-solution

 def test_read_from_s3(self):
     """An object written with body "Test" is read back unchanged."""
     # Seed the object through the test's own connection first.
     seededObject = self.conn.Object(BUCKET_NAME, S3_FILE_NAME)
     seededObject.put(Body="Test")

     readBack = S3Helper.readFromS3(BUCKET_NAME, S3_FILE_NAME, REGION)

     self.assertEqual(readBack, "Test")

Example #7

Show file

def get_file_content(aws_env: dict):
    """Read the ``.txt`` sibling of the object named in ``aws_env``.

    The extension of ``aws_env["objectName"]`` is swapped for ``.txt`` and
    the resulting key is read with ``S3Helper.readFromS3``.

    Args:
        aws_env: Mapping providing ``objectName``, ``bucketName`` and
            ``awsRegion``.

    Returns:
        Whatever ``S3Helper.readFromS3`` returns for that key.
    """
    # splitext returns (root, ext); only the root is needed.
    object_stem = os.path.splitext(aws_env["objectName"])[0]
    text_key = object_stem + ".txt"
    bucket = aws_env['bucketName']
    region = aws_env['awsRegion']
    return S3Helper.readFromS3(bucket, text_key, region)