Example #1
0
def getResults(bucketName, outputPath):
    """Collect the stored analysis artefacts located under ``outputPath``.

    Three objects are read from ``bucketName``. Each key is ``outputPath``
    immediately followed by a fixed file name (no separator is inserted, so
    ``outputPath`` must already end with whatever delimiter is wanted).

    Returns a dict with the keys:
      * ``responseByPage`` - JSON-decoded content of ``pages.json``
      * ``fullText`` - content of ``text.txt`` as returned by S3Helper
      * ``fullTextReadingOrder`` - content of ``text-inreadingorder.txt``
    """

    def readArtefact(fileName):
        # Key is the output prefix with the file name appended verbatim.
        return S3Helper.readFromS3(bucketName,
                                   "{}{}".format(outputPath, fileName))

    # Reads happen in the same order as the keys appear in the result.
    pagesJson = readArtefact("pages.json")
    responseByPage = json.loads(pagesJson)
    plainText = readArtefact("text.txt")
    readingOrderText = readArtefact("text-inreadingorder.txt")

    return {
        "responseByPage": responseByPage,
        "fullText": plainText,
        "fullTextReadingOrder": readingOrderText,
    }
def getPageResponse(request):
    """Return a document record, enriched with one page's stored JSON response.

    ``request`` must provide ``documentsTable``, ``outputTable``,
    ``documentId`` and ``page``; all four are read before any lookup.

    The document is fetched through ``datastore.DocumentStore``. When it
    exists and its ``documentStatus`` is ``"SUCCEEDED"``, the object
    ``<objectName>-analysis/<documentId>/page-<page>-response.json`` is read
    from the document's bucket, JSON-decoded and stored on the record under
    ``textractResponse``.

    Returns the (possibly enriched) document record, or ``{}`` when the
    lookup yields nothing.
    """
    tableName, outputTableName, docId, pageNumber = (
        request[field]
        for field in ("documentsTable", "outputTable", "documentId", "page"))

    store = datastore.DocumentStore(tableName, outputTableName)
    document = store.getDocument(docId)

    # Nothing found: callers get an empty dict rather than a falsy record.
    if not document:
        return {}

    if document["documentStatus"] == "SUCCEEDED":
        responseKey = "{}-analysis/{}/page-{}-response.json".format(
            document["objectName"], document["documentId"], pageNumber)
        rawResponse = S3Helper.readFromS3(document["bucketName"], responseKey)
        document["textractResponse"] = json.loads(rawResponse)

    return document
def getPageForm(request):
    """Return a document record, enriched with one page's parsed forms CSV.

    ``request`` must provide ``documentsTable``, ``outputTable``,
    ``documentId`` and ``page``.

    The document is fetched through ``datastore.DocumentStore``. When it
    exists and its ``documentStatus`` is ``"SUCCEEDED"``, the object
    ``<objectName>-analysis/<documentId>/page-<page>-forms.csv`` is read from
    the document's bucket, passed through ``parsePairs`` and stored on the
    record under ``textractResponse``.

    Returns the (possibly enriched) document record, or ``{}`` when the
    lookup yields nothing.
    """
    documentsTable = request["documentsTable"]
    outputTable = request["outputTable"]
    documentId = request["documentId"]
    page = request["page"]

    ds = datastore.DocumentStore(documentsTable, outputTable)
    doc = ds.getDocument(documentId)

    if doc and doc["documentStatus"] == "SUCCEEDED":
        fileName = "{}-analysis/{}/page-{}-forms.csv".format(
            doc["objectName"], doc["documentId"], page)
        formsCsv = S3Helper.readFromS3(doc["bucketName"], fileName)
        doc["textractResponse"] = parsePairs(formsCsv)

    # The former debug print here always emitted a literal "{}" (it ran right
    # after `output = {}`), so it has been removed to match getPageResponse.
    return doc if doc else {}
def getPageTable(request):
    """Return the tables parsed from one page's stored tables CSV.

    ``request`` must provide ``documentsTable``, ``outputTable``,
    ``documentId`` and ``page``.

    The document is fetched through ``datastore.DocumentStore``. When it
    exists and its ``documentStatus`` is ``"SUCCEEDED"``, the object
    ``<objectName>-analysis/<documentId>/page-<page>-tables.csv`` is read from
    the document's bucket and run through ``getTableFromString`` and then
    ``parseTables``.

    Returns ``{"tables": <parsed tables>}``. The list is empty when the
    document is missing, has not succeeded, or parsing produced nothing.
    """
    documentsTable = request["documentsTable"]
    outputTable = request["outputTable"]
    documentId = request["documentId"]
    page = request["page"]

    ds = datastore.DocumentStore(documentsTable, outputTable)
    doc = ds.getDocument(documentId)

    # Must be bound before the branch: previously a missing or unfinished
    # document left `tables` undefined and the check below raised
    # UnboundLocalError instead of yielding the empty result.
    tables = None
    if doc and doc["documentStatus"] == "SUCCEEDED":
        fileName = "{}-analysis/{}/page-{}-tables.csv".format(
            doc["objectName"], doc["documentId"], page)
        tablesCsv = S3Helper.readFromS3(doc["bucketName"], fileName)
        tables = parseTables(getTableFromString(tablesCsv))

    output = {"tables": []}
    if tables:
        output["tables"] = tables
    return output
    def processComprehend(self,
                          bucket,
                          textractResponseLocation,
                          comprehendOutputPath,
                          maxPages=200):
        """Run the Comprehend / Comprehend Medical helpers over a stored Textract response.

        Steps:
          1. Read and JSON-decode the object at ``textractResponseLocation``
             in ``bucket``.
          2. Determine the page count with ``self.getNumOfPages`` and cap it
             at ``maxPages``.
          3. Extract the raw text per page with ``self.extractTextByPages``.
          4. For every batch of at most ``PAGES_PER_BATCH`` pages, start one
             thread for ``batchComprehendDetectEntitiesSync`` plus one thread
             per page for each of ``comprehendMedicalDetectEntitiesSync`` and
             ``comprehendMedicalDetectICD10Sync``, then join them all.
          5. Hand the collected per-page results to the ``process*`` helpers.

        Args:
            bucket: bucket holding the Textract response; also forwarded to
                the ``process*`` helpers.
            textractResponseLocation: key of the Textract response JSON.
            comprehendOutputPath: forwarded unchanged to the ``process*``
                helpers.
            maxPages: upper bound on the number of pages processed.

        Returns:
            ``False`` when the page count is not positive, when text
            extraction reports ``False``, or when any page of a batch is left
            without a Comprehend or Comprehend Medical entities result.
            Otherwise the value returned by
            ``processAndReturnComprehendEntities`` after ``.update()`` with
            the value returned by ``processAndReturnComprehendMedicalEntities``.
        """
        # get textract results from S3
        textractFile = S3Helper.readFromS3(
            bucket, textractResponseLocation)
        textract = json.loads(textractFile)

        # total number of textracted pages
        numOfPages = self.getNumOfPages(textract)

        # error
        if numOfPages <= 0:
            return False

        # enforce a maximum of pages to be processed
        if numOfPages > maxPages:
            numOfPages = maxPages

        # iterate over results page by page and extract raw text for comprehend
        rawPages = [""] * numOfPages
        # Compared against False explicitly: the helper's success value is not
        # visible from here, so a falsy-but-not-False result is not treated
        # as a failure.
        if self.extractTextByPages(textract, rawPages, numOfPages) == False:  # noqa: E712
            return False

        # to store comprehend and medical API calls results, one slot per page;
        # a slot still holding None after the threads finish marks a failure.
        comprehendEntities = [None] * numOfPages
        comprehendMedicalEntities = [None] * numOfPages
        comprehendMedicalICD10 = [None] * numOfPages

        # process pages by batch; each batch starts at a multiple of
        # PAGES_PER_BATCH and the last one may be shorter
        for pageStartIndex in range(0, numOfPages, PAGES_PER_BATCH):

            pagesToProcess = min(PAGES_PER_BATCH, numOfPages - pageStartIndex)

            # keep track of all threads we spawn
            threads = []

            # Comprehend call that can batch up to PAGES_PER_BATCH pages
            # together synchronously
            worker = threading.Thread(
                target=self.batchComprehendDetectEntitiesSync,
                args=(rawPages, pagesToProcess, pageStartIndex, comprehendEntities))
            worker.start()
            threads.append(worker)

            # comprehendMedicalEntities is shared among threads
            medicalEntitiesMutex = threading.Lock()

            # Comprehend Medical can only handle one page at a time
            # synchronously. The SDK handles throttling by the service.
            for offset in range(pagesToProcess):
                worker = threading.Thread(
                    target=self.comprehendMedicalDetectEntitiesSync,
                    args=(rawPages,
                          pageStartIndex + offset,
                          comprehendMedicalEntities,
                          medicalEntitiesMutex))
                worker.start()
                threads.append(worker)

            # comprehendMedicalICD10 is shared among threads
            medicalICD10Mutex = threading.Lock()

            # Same one-page-per-call constraint for the ICD10 detection.
            for offset in range(pagesToProcess):
                worker = threading.Thread(
                    target=self.comprehendMedicalDetectICD10Sync,
                    args=(rawPages,
                          pageStartIndex + offset,
                          comprehendMedicalICD10,
                          medicalICD10Mutex))
                worker.start()
                threads.append(worker)

            # wait on all threads to finish their work
            for worker in threads:
                worker.join()

            print("all threads joined...")

            # check success of threads for exactly the pages of this batch.
            # The previous loop used range(pageStartIndex, pagesToProcess)
            # with index pageStartIndex + i, which is empty for every batch
            # after the first, so later failures went unnoticed.
            for pageIndex in range(pageStartIndex, pageStartIndex + pagesToProcess):
                if (comprehendEntities[pageIndex] is None
                        or comprehendMedicalEntities[pageIndex] is None):
                    print("a page failed to process" + str(pageIndex))
                    return False

        # process comprehend data, create the entities result file in S3
        processedComprehendData = self.processAndReturnComprehendEntities(
            comprehendEntities,
            numOfPages,
            bucket,
            comprehendOutputPath)

        # process comprehend medical data, create the entities result file in S3
        comprehendMedicalEntities = self.processAndReturnComprehendMedicalEntities(
            comprehendMedicalEntities,
            numOfPages,
            bucket,
            comprehendOutputPath)

        # final list of comprehend and comprehend medical entities to be indexed
        processedComprehendData.update(comprehendMedicalEntities)

        # process comprehend medical data, create the ICD10 result file in S3
        self.processComprehendMedicalICD10(comprehendMedicalICD10,
                                           numOfPages,
                                           bucket,
                                           comprehendOutputPath)

        return processedComprehendData
 def test_read_from_s3(self):
     # Store a small object through the test connection, then confirm that
     # S3Helper.readFromS3 hands back exactly the same content.
     target = self.conn.Object(BUCKET_NAME, S3_FILE_NAME)
     target.put(Body="Test")
     self.assertEqual(
         S3Helper.readFromS3(BUCKET_NAME, S3_FILE_NAME, REGION),
         "Test")
Example #7
0
def get_file_content(aws_env: dict):
    """Read the ``.txt`` object that sits next to ``aws_env["objectName"]``.

    The key is ``objectName`` with its final extension (as split by
    ``os.path.splitext``) replaced by ``.txt``. It is read from
    ``aws_env["bucketName"]`` with ``aws_env["awsRegion"]`` passed as the
    third argument to ``S3Helper.readFromS3``, whose result is returned
    unchanged.
    """
    # splitext yields (root, ext); only the root is needed here.
    stem = os.path.splitext(aws_env["objectName"])[0]
    text_key = stem + ".txt"
    bucket = aws_env['bucketName']
    return S3Helper.readFromS3(bucket, text_key, aws_env['awsRegion'])