Example #1
0
def JsonWrite(PagesText, fileName, fileFolder):
    """Write each page's lines to its own text file in a per-document folder.

    Files go to ``fileFolder + str(fileName) + '/'`` and are named
    ``<fileName>_<page number>.txt``, with page numbers starting at 1.

    Args:
        PagesText: Sequence of pages; each page is an iterable of strings,
            written one per line.
        fileName: Document name, used for the folder and as the file prefix.
        fileFolder: Parent folder path. It is concatenated as-is, so it must
            already end with a path separator.
    """
    pdf_folder = fileFolder + str(fileName) + '/'
    # makedirs(exist_ok=True) creates missing parent folders as well and does
    # not fail if the folder appears between a check and the creation.
    os.makedirs(pdf_folder, exist_ok=True)

    for page_number, Page in enumerate(PagesText, start=1):
        txtPath = pdf_folder + str(fileName) + '_' + str(page_number) + '.txt'
        # NOTE(review): the file is opened with the platform default encoding,
        # exactly as before; confirm whether an explicit encoding is wanted.
        with open(txtPath, 'w') as f:
            f.writelines(Line + '\n' for Line in Page)

    logging = Logger(__name__)
    Logger.get_log(logging).info('JsonFile Saved')
    logging.logger.handlers.clear()
    def Segmentation(self):
        """Extract figure notes, then build one image result per page.

        Calls ``self.FNoteExtract()`` first. Then, for every index of
        ``self.PagesLayout``, the page image, page layout and figure notes
        stored at that index are passed to ``ImgExtraction`` and the result
        is appended to ``self.Image``. A log entry is written at the end.
        """
        self.FNoteExtract()

        page_total = len(self.PagesLayout)
        for page_idx in range(page_total):
            segmented = ImgExtraction(self.PagesImage[page_idx],
                                      self.PagesLayout[page_idx],
                                      self.FigureNotes[page_idx])
            self.Image.append(segmented)

        log_wrapper = Logger(__name__)
        Logger.get_log(log_wrapper).info('Image Segmentation Finished')
        log_wrapper.logger.handlers.clear()
Example #3
0
def ImageWrite(ImageList, fileName, fileFolder):
    """Save every page image as a JPEG in a folder named after the document.

    The folder is ``fileFolder + fileName[:-4] + '/'`` and each file is named
    ``<fileName[:-4]>_p<page number>.jpg``, with page numbers starting at 1.

    Args:
        ImageList: Sequence of images handed to ``cv2.imwrite``.
        fileName: Document file name; its last four characters are dropped
            (presumably a ``.pdf`` extension -- verify against the caller).
        fileFolder: Parent folder path. It is concatenated as-is, so it must
            already end with a path separator.
    """
    baseName = fileName[:-4]
    imgFolder = fileFolder + baseName + '/'
    # makedirs(exist_ok=True) creates missing parent folders as well and does
    # not fail if the folder appears between a check and the creation.
    os.makedirs(imgFolder, exist_ok=True)

    for page_number, Image in enumerate(ImageList, start=1):
        imgName = baseName + '_p' + str(page_number) + '.jpg'
        cv2.imwrite(imgFolder + imgName, Image)

    logging = Logger(__name__)
    Logger.get_log(logging).info('Image Saved')
    logging.logger.handlers.clear()
    def Segmentation(self):
        """Run text segmentation over every page layout.

        When ``self.TextLevel`` is 1, the result of ``Leve1Extraction`` for
        each page is appended to ``self.Text``. Level 2 does nothing in this
        method. A log entry is written once all pages have been visited.
        """
        for layout in self.PagesLayout:
            if self.TextLevel == 1:
                self.Text.append(Leve1Extraction(layout))
            elif self.TextLevel == 2:
                # No processing is performed for level 2 here.
                pass

        log_wrapper = Logger(__name__)
        Logger.get_log(log_wrapper).info('Text Segmentation Finished')
        log_wrapper.logger.handlers.clear()
Example #5
0
def pdf2image(fileName):
    """Render a PDF into a list of per-page BGR image arrays.

    Args:
        fileName: Path of the PDF passed to ``convert_from_path``.

    Returns:
        A list with one array per page, produced by converting each rendered
        page with ``np.asarray`` and ``cv2.cvtColor(..., cv2.COLOR_RGB2BGR)``.
    """
    with tempfile.TemporaryDirectory() as path:
        rendered_pages = convert_from_path(fileName, output_folder=path)
        # Convert while the temporary folder still exists: with an explicit
        # output_folder the returned pages are backed by files in that folder,
        # so reading their pixels after the folder has been removed is not
        # reliable (and open files can block the cleanup on some platforms).
        PagesImage = [
            cv2.cvtColor(np.asarray(page), cv2.COLOR_RGB2BGR)
            for page in rendered_pages
        ]

    logging = Logger(__name__)
    Logger.get_log(logging).info('pdf2image Completed')
    logging.logger.handlers.clear()

    return PagesImage
    def Segmentation(self):
        """Detect tables on every page and record them on the instance.

        For each page layout, every item returned by ``detect_table`` is
        wrapped in ``Region`` and collected; the per-page list is appended to
        ``self.Table``. When ``self.TableLevel`` is 2, ``extraction`` is also
        called per table, the leading element of every returned cell (and of
        every header cell's children, read from index 6) is replaced by its
        ``Region`` wrapper, and the per-page results are appended to
        ``self.Column_Header``, ``self.Row_Header`` and ``self.Body``.
        """

        def wrap_leading(entry):
            # Same two list operations as the inline form: insert the wrapped
            # value in front, then remove the value now sitting at index 1.
            entry.insert(0, Region(entry[0]))
            entry.remove(entry[1])

        def wrap_header_cells(cells):
            # Header cells carry their children at index 6; wrap those too.
            for cell in cells:
                wrap_leading(cell)
                if not cell[6] == []:
                    for child in cell[6]:
                        wrap_leading(child)

        for PageLayout in self.PagesLayout:
            detected = detect_table(PageLayout)

            page_tables = []
            page_col_headers = []
            page_row_headers = []
            page_bodies = []

            for tableItem in detected:
                page_tables.append(Region(tableItem))
                if not self.TableLevel == 2:
                    continue

                c_header, r_header, body = extraction(PageLayout, tableItem)

                wrap_header_cells(c_header)
                wrap_header_cells(r_header)
                for cell in body:
                    wrap_leading(cell)

                page_col_headers.append(c_header)
                page_row_headers.append(r_header)
                page_bodies.append(body)

            self.Table.append(page_tables)
            if self.TableLevel == 2:
                self.Column_Header.append(page_col_headers)
                self.Row_Header.append(page_row_headers)
                self.Body.append(page_bodies)

        log_wrapper = Logger(__name__)
        Logger.get_log(log_wrapper).info('Table Segmentation Finished')
        log_wrapper.logger.handlers.clear()
    def Segmentation(self):
        """Run text segmentation over every page at the configured level.

        Level 1 appends the ``Leve1Extraction`` result of each page to
        ``self.Text``. Level 2 extracts page and note parts for each page,
        collects figure and table notes, extracts title and author from the
        first page only (later pages get empty lists), and appends the
        ``Level2Extraction`` result to ``self.Text``. After the loop, level 2
        post-processes the collected notes into ``self.TableNote`` and
        ``self.FigureNote`` and rebuilds ``self.Text`` with ``FigTabNoteOut``.
        """
        figure_notes = []
        table_notes = []

        for page_no, layout in enumerate(self.PagesLayout):
            if self.TextLevel == 1:
                self.Text.append(Leve1Extraction(layout))

            elif self.TextLevel == 2:
                page_part, page_idx = PageExtraction(layout)
                note_part, note_idx = NoteExtraction(layout)

                figure_notes.append(FigureNoteExtraction(layout))
                table_notes.append(TableNoteExtraction(layout))

                if page_no == 0:
                    # Title and author are only looked for on the first page.
                    title, title_idx = TitleExtraction(layout)
                    author, author_idx = AuthorExtraction(layout, title_idx)
                else:
                    title, title_idx = [], -1
                    author, author_idx = [], []
                self.Title.append(title)
                self.Author.append(author)

                body_text = Level2Extraction(layout, page_idx, note_idx,
                                             title_idx, author_idx)

                self.Page.append(page_part)
                self.Note.append(note_part)
                self.Text.append(body_text)

        if self.TextLevel == 2:
            self.TableNote = NotePostProcess(table_notes, "T")
            self.FigureNote = NotePostProcess(figure_notes, "F")
            self.Text = FigTabNoteOut(self.Text, self.TableNote,
                                      self.FigureNote)

        log_wrapper = Logger(__name__)
        Logger.get_log(log_wrapper).info('Text Segmentation Finished')
        log_wrapper.logger.handlers.clear()
def ImageWrite(ImageList, fileName, fileFolder):
    """Save the NumPy-array entries of ``ImageList`` as per-page JPEG files.

    Entries that are not ``np.ndarray`` are skipped but still consume their
    page number. When no entry is an array, nothing is created or logged.

    The folder is ``fileFolder + fileName[:-4] + '/'`` and each file is named
    ``<fileName[:-4]>_p<page number>.jpg``, with page numbers starting at 1.

    Args:
        ImageList: Sequence whose ``np.ndarray`` entries are written with
            ``cv2.imwrite``.
        fileName: Document file name; its last four characters are dropped
            (presumably a ``.pdf`` extension -- verify against the caller).
        fileFolder: Parent folder path. It is concatenated as-is, so it must
            already end with a path separator.
    """
    if not any(isinstance(Image, np.ndarray) for Image in ImageList):
        return

    baseName = fileName[:-4]
    imgFolder = fileFolder + baseName + '/'
    # makedirs(exist_ok=True) creates missing parent folders as well and does
    # not fail if the folder appears between a check and the creation.
    os.makedirs(imgFolder, exist_ok=True)

    for page_number, Image in enumerate(ImageList, start=1):
        if isinstance(Image, np.ndarray):
            imgName = baseName + '_p' + str(page_number) + '.jpg'
            cv2.imwrite(imgFolder + imgName, Image)

    logging = Logger(__name__)
    Logger.get_log(logging).info('Image Saved')
    logging.logger.handlers.clear()
Example #9
0
    def Segmentation(self):
        """Run text segmentation over every page at the configured level.

        Level 1 appends the ``Leve1Extraction`` result of each page to
        ``self.Text``. Level 2 appends the results of ``ZhangExtraction``,
        ``JieExtraction``, ``TiaoExtraction`` and ``TitleExtraction`` to
        ``self.Zhang``, ``self.Jie``, ``self.Tiao`` and ``self.Title``
        (presumably chapter / section / article headings -- TODO confirm).
        """
        for layout in self.PagesLayout:
            if self.TextLevel == 1:
                self.Text.append(Leve1Extraction(layout))

            elif self.TextLevel == 2:
                # Extraction order is kept: Zhang, Jie, Tiao, then Title.
                self.Zhang.append(ZhangExtraction(layout))
                self.Jie.append(JieExtraction(layout))
                self.Tiao.append(TiaoExtraction(layout))
                self.Title.append(TitleExtraction(layout))

        log_wrapper = Logger(__name__)
        Logger.get_log(log_wrapper).info('Text Segmentation Finished')
        log_wrapper.logger.handlers.clear()
Example #10
0
    def configCheck(self):
        """Validate the loaded configuration, repairing it interactively.

        Three checks are made, each logging a critical message on failure:

        * ``self.folder`` must end with ``'/'``; the user may let the system
          append it, otherwise the program exits.
        * ``self.filename`` must be ``'all'`` or end with ``'.pdf'``; the user
          may let the system append the suffix, otherwise the program exits.
        * ``self.text_level`` must be 1 or 2; the user is asked repeatedly
          until ``1`` or ``2`` is entered.
        """
        accepted = ('y', 'yes')

        if not self.folder[-1] == '/':
            Logger.get_log(
                self.logging).critical('Configuration - Folder Format Error')
            print("Configuration - Folder may loss '/' to the end of the path")
            y_n = input(
                "Do you want system add '/' to the end of path ? (Y/N)\n")
            if y_n.lower() in accepted:
                self.folder += '/'
            else:
                sys.exit()

        if not self.filename == 'all' and not self.filename[-4:] == '.pdf':
            Logger.get_log(self.logging).critical(
                'Configuration - FileName Not End With .pdf ')
            print('Configuration - FileName Not End With \'.pdf\'')
            y_n = input(
                "Do you want system add '.pdf' to the end of filename ? (Y/N)\n"
            )
            if y_n.lower() in accepted:
                self.filename += '.pdf'
            else:
                sys.exit()

        if not (self.text_level == 1 or self.text_level == 2):
            Logger.get_log(self.logging).critical(
                'Configuration - text_level Format Error ')
            while True:
                print('Configuration - text_level Format Error ')
                text_level = input(
                    "Please press 1/2 to specify a text_level \n")
                if text_level == '1' or text_level == '2':
                    # input() returns a str; store an int so the value has
                    # the same type this check itself compares against (1/2).
                    self.text_level = int(text_level)
                    break
Example #11
0
 def __init__(self):
     """Create the logger wrapper and process the configuration file.

     Logs a start message, calls ``self.config()``, then logs a completion
     message.
     """
     # Kept on the instance; the log calls below pass it to Logger.get_log.
     self.logging = Logger(__name__)
     Logger.get_log(self.logging).info('Start processing ConfigFile')
     self.config()
     Logger.get_log(self.logging).info('ConfigFile Processed\n')
Example #12
0
def FNTypeCheck(FigNoteList):
    """Return the most frequent figure-note type across all pages.

    Every note's text (``note[1].get_text()`` without its last character,
    lower-cased, spaces removed) is classified with ``FNTypeCalculate``.
    Types containing ``'E'`` are discarded (presumably an error marker --
    TODO confirm).

    Args:
        FigNoteList: Sequence of pages, each a sequence of figure notes.

    Returns:
        The type with the highest count. If several types share the highest
        count, a critical message is logged and the greatest of them by
        string comparison is returned (``'000'`` if none compares greater).
        ``None`` is returned, after a critical log entry, when no type is
        left to count.
    """
    # Classify every note on every page.
    kinds = []
    for page_notes in FigNoteList:
        for note in page_notes:
            note_text = note[1].get_text()[:-1].lower().replace(" ", "")
            kinds.append(FNTypeCalculate(note_text))

    # Keep only the types that do not contain 'E'.
    kinds = [kind for kind in kinds if kind.find('E') < 0]

    if not kinds:
        log_wrapper = Logger(__name__)
        Logger.get_log(log_wrapper).critical('No ImageNote')
        log_wrapper.logger.handlers.clear()
        return None

    # Tally occurrences; dict insertion order keeps first-seen order.
    tally = {}
    for kind in kinds:
        tally[kind] = tally.get(kind, 0) + 1

    top_count = max(tally.values())
    leaders = [[kind, hits] for kind, hits in tally.items()
               if hits == top_count]

    if len(leaders) == 1:
        return leaders[0][0]

    # Tie: choose the greatest type string, starting from the '000' floor.
    winner = '000'
    for kind, _hits in leaders:
        if kind > winner:
            winner = kind

    log_wrapper = Logger(__name__)
    Logger.get_log(log_wrapper).critical(
        'Same Type of ImageNote: {}'.format(leaders))
    log_wrapper.logger.handlers.clear()

    return winner