def run(self):
        """Write the raw response and all per-page outputs to files.

        Returns immediately, writing nothing, when ``self.document.pages`` is
        empty. Otherwise ``self.response`` is dumped as JSON to
        ``<fileName>-response.json``, the page count is printed, and every
        page (numbered from 1) gets:

        * its ``blocks`` dumped as JSON to ``<fileName>-page-<n>-response.json``;
        * calls to ``_outputWords`` and ``_outputText``;
        * calls to ``_outputForm`` and ``_outputFormTranslate`` when
          ``self.forms`` is truthy;
        * calls to ``_outputTable``, ``_outputTablePretty`` and
          ``_outputTablePrettyTranslate`` when ``self.tables`` is truthy.
        """
        pages = self.document.pages
        if not pages:
            return

        response_path = "{}-response.json".format(self.fileName)
        FileHelper.writeToFile(response_path, json.dumps(self.response))

        print("Total Pages in Document: {}".format(len(pages)))

        # Page numbers in the output file names are 1-based.
        for page_number, page in enumerate(pages, start=1):
            blocks_path = "{}-page-{}-response.json".format(
                self.fileName, page_number)
            FileHelper.writeToFile(blocks_path, json.dumps(page.blocks))

            self._outputWords(page, page_number)
            self._outputText(page, page_number)

            if self.forms:
                self._outputForm(page, page_number)
                self._outputFormTranslate(page, page_number)

            if self.tables:
                self._outputTable(page, page_number)
                self._outputTablePretty(page, page_number)
                self._outputTablePrettyTranslate(page, page_number)
    def _outputText(self, page, p):
        """Write the text of page number ``p`` to two files.

        ``page.text`` is written to ``<fileName>-page-<p>-text.txt`` and the
        value returned by ``page.getTextInReadingOrder()`` is written to
        ``<fileName>-page-<p>-text-inreadingorder.txt``.
        """
        plain_text = page.text
        plain_path = "{}-page-{}-text.txt".format(self.fileName, p)
        FileHelper.writeToFile(plain_path, plain_text)

        ordered_text = page.getTextInReadingOrder()
        ordered_path = "{}-page-{}-text-inreadingorder.txt".format(
            self.fileName, p)
        FileHelper.writeToFile(ordered_path, ordered_text)
 def _outputTablePretty(self, page, p, table_format='github'):
     """Render each table of the page with ``tabulate`` and write it to a file.

     Every table in ``page.tables`` is turned into a list of rows, each row
     being the list of its cells' ``text`` values, formatted with
     ``tabulate`` using ``table_format`` and written to
     ``<fileName>-page-<p>-table-<i>-tables-pretty.txt``, where ``i`` is the
     zero-based position of the table within ``page.tables``.
     """
     for table_index, table in enumerate(page.tables):
         grid = [[cell.text for cell in row.cells] for row in table.rows]
         rendered = tabulate(grid, tablefmt=table_format)
         target = "{}-page-{}-table-{}-tables-pretty.txt".format(
             self.fileName, p, table_index)
         FileHelper.writeToFile(target, rendered)
    def _generateInsightsPerDocument(self, page, p, insights, medicalInsights, translate, ta, tma, tt):
        """Run the enabled text analyses over one page and write the results.

        The page text is processed in consecutive chunks of at most 2000
        characters. Each chunk, together with its start offset within the
        page text, is handed to ``self._insights`` and/or
        ``self._medicalInsights``, and/or translated with
        ``tt.getTranslation``. The accumulated results are then written to
        per-page files named after ``self.fileName`` and ``p``.

        Args:
            page: Object exposing the page text as ``page.text``.
            p: Page number used in the output file names.
            insights: When truthy, collect sentiment, syntax, entity and
                key-phrase rows and write one CSV file for each.
            medicalInsights: When truthy, collect medical-entity rows (CSV)
                and PHI data (JSON).
            translate: When truthy, translate every chunk and write the
                concatenated result to a text file.
            ta: Passed through unchanged to ``self._insights``.
            tma: Passed through unchanged to ``self._medicalInsights``.
            tt: Translator object; only ``tt.getTranslation(text)`` is used.
        """
        maxLen = 2000

        text = page.text
        sl = len(text)

        sentiment = []
        syntax = []
        entities = []
        keyPhrases = []
        medicalEntities = []
        phi = []
        translatedChunks = []

        for start in range(0, sl, maxLen):
            # Slicing clamps at the end of the string, so the final chunk is
            # simply shorter than maxLen.
            subText = text[start:start + maxLen]

            # The helpers receive the current chunk, not the whole page text.
            # Passing the full text (as the previous version did) re-analysed
            # the entire page once per chunk, duplicating every row and
            # defeating the size limit that the chunking exists to enforce.
            # NOTE(review): presumably the helpers add `start` to the offsets
            # they record so they refer to the full page text - confirm.
            if insights:
                self._insights(start, subText, sentiment, syntax, entities, keyPhrases, ta)

            if medicalInsights:
                self._medicalInsights(start, subText, medicalEntities, phi, tma)

            if translate:
                # Each translated chunk is terminated by a newline.
                translatedChunks.append(tt.getTranslation(subText) + "\n")

        if insights:
            FileHelper.writeCSV("{}-page-{}-insights-sentiment.csv".format(self.fileName, p),
                            ["Sentiment"], sentiment)
            FileHelper.writeCSV("{}-page-{}-insights-entities.csv".format(self.fileName, p),
                            ["Type", "Text", "Score", "BeginOffset", "EndOffset"], entities)
            FileHelper.writeCSV("{}-page-{}-insights-syntax.csv".format(self.fileName, p),
                            ["PartOfSpeech-Tag", "PartOfSpeech-Score", "Text", "BeginOffset", "EndOffset"], syntax)
            FileHelper.writeCSV("{}-page-{}-insights-keyPhrases.csv".format(self.fileName, p),
                            ["Text", "Score", "BeginOffset", "EndOffset"], keyPhrases)

        if medicalInsights:
            FileHelper.writeCSV("{}-page-{}-medical-insights-entities.csv".format(self.fileName, p),
                            ["Text", "Type", "Category", "Score", "BeginOffset", "EndOffset"], medicalEntities)

            FileHelper.writeToFile("{}-page-{}-medical-insights-phi.json".format(self.fileName, p), json.dumps(phi))

        if translate:
            FileHelper.writeToFile("{}-page-{}-text-translation.txt".format(self.fileName, p),
                                   "".join(translatedChunks))
    def _outputTablePrettyTranslate(self, page, p, table_format='github'):
        """Write a translated, ``tabulate``-formatted copy of each page table.

        A ``TextTranslater('auto', 'en', 'us-east-1')`` is created once per
        call. For every table in ``page.tables`` each non-empty cell text is
        replaced by ``getTranslation(cell.text)`` while empty cell texts are
        kept as they are; the rows are formatted with ``tabulate`` using
        ``table_format`` and written to
        ``<fileName>-page-<p>-table-<i>-tables-pretty-translated.txt``, where
        ``i`` is the zero-based position of the table within ``page.tables``.
        """
        # NOTE(review): arguments look like (source language, target
        # language, AWS region) - confirm against TextTranslater.
        translator = TextTranslater('auto', 'en', 'us-east-1')

        for table_index, table in enumerate(page.tables):
            translated_grid = [
                [
                    translator.getTranslation(cell.text)
                    if cell.text != "" else cell.text
                    for cell in row.cells
                ]
                for row in table.rows
            ]
            rendered = tabulate(translated_grid, tablefmt=table_format)
            target = "{}-page-{}-table-{}-tables-pretty-translated.txt".format(
                self.fileName, p, table_index)
            FileHelper.writeToFile(target, rendered)
Exemple #6
0
    def _outputText(self, page, p):
        text = page.text
        FileHelper.writeToFile("{}-text.txt".format(self.fileName), text)

        '''textInReadingOrder = page.getTextInReadingOrder()