def JsonWrite(PagesText, fileName, fileFolder):
    """Write the text of every page to its own ``.txt`` file.

    Files are written to ``<fileFolder><fileName>/`` and named
    ``<fileName>_<page number>.txt`` with page numbers starting at 1.
    Despite the function name, the output is plain text: one input line per
    output line, each terminated with a newline.

    Args:
        PagesText: Sequence of pages; each page is an iterable of strings.
        fileName: Base name used for the output folder and the files
            (converted with ``str``).
        fileFolder: Parent folder path; it is concatenated directly, so it
            must already end with a path separator.
    """
    pdf_folder = fileFolder + str(fileName) + '/'
    # makedirs(exist_ok=True) avoids the exists()/mkdir() race and also
    # creates missing parent folders instead of failing.
    os.makedirs(pdf_folder, exist_ok=True)

    for page_number, page_lines in enumerate(PagesText, start=1):
        txt_path = pdf_folder + str(fileName) + '_' + str(page_number) + '.txt'
        # NOTE(review): no explicit encoding is passed, so the platform
        # default is used - confirm that is intended for non-ASCII text.
        with open(txt_path, 'w') as out_file:
            out_file.writelines(line + '\n' for line in page_lines)

    log_owner = Logger(__name__)
    Logger.get_log(log_owner).info('JsonFile Saved')
    # Drop the handlers so repeated calls do not accumulate them.
    log_owner.logger.handlers.clear()
def Segmentation(self):
    """Extract the image content of every page.

    Runs ``self.FNoteExtract()`` first, then calls ``ImgExtraction`` once per
    entry of ``self.PagesLayout`` with the page image, the page layout and
    the figure notes stored at the same index. Each result is appended to
    ``self.Image``, and completion is logged at info level.
    """
    self.FNoteExtract()

    for page_no, page_layout in enumerate(self.PagesLayout):
        extracted = ImgExtraction(
            self.PagesImage[page_no],
            page_layout,
            self.FigureNotes[page_no],
        )
        self.Image.append(extracted)

    log_owner = Logger(__name__)
    Logger.get_log(log_owner).info('Image Segmentation Finished')
    # Drop the handlers so repeated calls do not accumulate them.
    log_owner.logger.handlers.clear()
def ImageWrite(ImageList, fileName, fileFolder):
    """Save every page image as a JPEG file.

    The last four characters of ``fileName`` are stripped to form a stem
    (presumably removing a ``.pdf`` suffix - confirm against callers).
    Images are written to ``<fileFolder><stem>/<stem>_p<page number>.jpg``
    with page numbers starting at 1.

    Args:
        ImageList: Sequence of images accepted by ``cv2.imwrite``.
        fileName: Source file name including its four-character extension.
        fileFolder: Parent folder path; it is concatenated directly, so it
            must already end with a path separator.
    """
    stem = fileName[:-4]
    imgFolder = fileFolder + stem + '/'
    # makedirs(exist_ok=True) avoids the exists()/mkdir() race and also
    # creates missing parent folders instead of failing.
    os.makedirs(imgFolder, exist_ok=True)

    for page_number, page_image in enumerate(ImageList, start=1):
        img_name = stem + '_p' + str(page_number) + '.jpg'
        cv2.imwrite(imgFolder + img_name, page_image)

    log_owner = Logger(__name__)
    Logger.get_log(log_owner).info('Image Saved')
    # Drop the handlers so repeated calls do not accumulate them.
    log_owner.logger.handlers.clear()
def Segmentation(self):
    """Extract level-1 text from every page layout.

    When ``self.TextLevel`` is 1, the result of ``Leve1Extraction`` for each
    entry of ``self.PagesLayout`` is appended to ``self.Text``. Text level 2
    has no handling in this implementation. Completion is logged at info
    level in every case.
    """
    for page_layout in self.PagesLayout:
        if self.TextLevel == 1:
            self.Text.append(Leve1Extraction(page_layout))
        # Level 2 (and any other value) performs no extraction here.

    log_owner = Logger(__name__)
    Logger.get_log(log_owner).info('Text Segmentation Finished')
    # Drop the handlers so repeated calls do not accumulate them.
    log_owner.logger.handlers.clear()
def pdf2image(fileName):
    """Render a PDF file into a list of BGR image arrays.

    ``convert_from_path`` is given a temporary directory as its output
    folder. Each returned page is converted to a NumPy array and its channel
    order switched from RGB to BGR with ``cv2.cvtColor``.

    Args:
        fileName: Path of the PDF file to render.

    Returns:
        The list returned by ``convert_from_path``, with each entry replaced
        in place by its BGR array.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        PagesImage = convert_from_path(fileName, output_folder=scratch_dir)
        # Convert while the temporary directory still exists: the rendered
        # pages may be backed by files inside it.
        for page_idx, rendered_page in enumerate(PagesImage):
            rgb_pixels = np.asarray(rendered_page)
            PagesImage[page_idx] = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)

    log_owner = Logger(__name__)
    Logger.get_log(log_owner).info('pdf2image Completed')
    # Drop the handlers so repeated calls do not accumulate them.
    log_owner.logger.handlers.clear()
    return PagesImage
def Segmentation(self):
    """Detect tables on every page and, at table level 2, their cells.

    For each layout in ``self.PagesLayout`` the items returned by
    ``detect_table`` are wrapped in ``Region`` and appended, as one list per
    page, to ``self.Table``.

    When ``self.TableLevel == 2`` each table is also passed to
    ``extraction``, which yields column-header, row-header and body cells.
    Element 0 of every cell is replaced by its ``Region`` wrapper; for header
    cells the same is done for every child listed in element 6. The per-page
    results are appended to ``self.Column_Header``, ``self.Row_Header`` and
    ``self.Body``. Completion is logged at info level.
    """
    for page_layout in self.PagesLayout:
        detected_tables = detect_table(page_layout)
        page_tables = []
        page_col_headers = []
        page_row_headers = []
        page_bodies = []

        for table_item in detected_tables:
            page_tables.append(Region(table_item))
            if self.TableLevel != 2:
                continue

            c_header, r_header, body = extraction(page_layout, table_item)

            # Assign element 0 directly. The previous insert()+remove() pair
            # searched by equality and could drop the wrong element.
            for header_cells in (c_header, r_header):
                for cell in header_cells:
                    cell[0] = Region(cell[0])
                    for child in cell[6]:
                        child[0] = Region(child[0])
            for cell in body:
                cell[0] = Region(cell[0])

            page_col_headers.append(c_header)
            page_row_headers.append(r_header)
            page_bodies.append(body)

        self.Table.append(page_tables)
        if self.TableLevel == 2:
            self.Column_Header.append(page_col_headers)
            self.Row_Header.append(page_row_headers)
            self.Body.append(page_bodies)

    log_owner = Logger(__name__)
    Logger.get_log(log_owner).info('Table Segmentation Finished')
    # Drop the handlers so repeated calls do not accumulate them.
    log_owner.logger.handlers.clear()
def Segmentation(self):
    """Extract text from every page at the configured text level.

    Level 1: the result of ``Leve1Extraction`` for each page is appended to
    ``self.Text``.

    Level 2: for each page the page part, note part, figure notes and table
    notes are extracted. Title and author are extracted from the first page
    only; later pages record empty lists and use ``-1`` / ``[]`` as the
    title / author indices. ``Level2Extraction`` then produces the page text
    from the layout and those indices. After all pages, the collected table
    and figure notes are post-processed into ``self.TableNote`` and
    ``self.FigureNote``, and ``self.Text`` is replaced by the result of
    ``FigTabNoteOut``.

    Completion is logged at info level.
    """
    figure_notes_by_page = []
    table_notes_by_page = []

    for page_no, page_layout in enumerate(self.PagesLayout):
        if self.TextLevel == 1:
            self.Text.append(Leve1Extraction(page_layout))
            continue
        if not self.TextLevel == 2:
            continue

        page_part, page_idx = PageExtraction(page_layout)
        note_part, note_idx = NoteExtraction(page_layout)
        figure_notes_by_page.append(FigureNoteExtraction(page_layout))
        table_notes_by_page.append(TableNoteExtraction(page_layout))

        if page_no == 0:
            # Only the first page is searched for a title and authors.
            title_part, title_idx = TitleExtraction(page_layout)
            author_part, author_idx = AuthorExtraction(page_layout, title_idx)
        else:
            title_part, title_idx = [], -1
            author_part, author_idx = [], []
        self.Title.append(title_part)
        self.Author.append(author_part)

        page_text = Level2Extraction(
            page_layout, page_idx, note_idx, title_idx, author_idx)
        self.Page.append(page_part)
        self.Note.append(note_part)
        self.Text.append(page_text)

    if self.TextLevel == 2:
        self.TableNote = NotePostProcess(table_notes_by_page, "T")
        self.FigureNote = NotePostProcess(figure_notes_by_page, "F")
        self.Text = FigTabNoteOut(self.Text, self.TableNote, self.FigureNote)

    log_owner = Logger(__name__)
    Logger.get_log(log_owner).info('Text Segmentation Finished')
    # Drop the handlers so repeated calls do not accumulate them.
    log_owner.logger.handlers.clear()
def ImageWrite(ImageList, fileName, fileFolder):
    """Save the pages that carry an image as JPEG files.

    Only entries of ``ImageList`` that are ``numpy.ndarray`` instances are
    written; other entries are skipped but still consume a page number. The
    output folder is created only when at least one entry is an array.

    The last four characters of ``fileName`` are stripped to form a stem
    (presumably removing a ``.pdf`` suffix - confirm against callers).
    Images are written to ``<fileFolder><stem>/<stem>_p<page number>.jpg``
    with page numbers starting at 1.

    Args:
        ImageList: Sequence whose entries are image arrays or placeholders.
        fileName: Source file name including its four-character extension.
        fileFolder: Parent folder path; it is concatenated directly, so it
            must already end with a path separator.
    """
    if any(isinstance(entry, np.ndarray) for entry in ImageList):
        stem = fileName[:-4]
        imgFolder = fileFolder + stem + '/'
        # makedirs(exist_ok=True) avoids the exists()/mkdir() race and also
        # creates missing parent folders instead of failing.
        os.makedirs(imgFolder, exist_ok=True)

        for page_number, entry in enumerate(ImageList, start=1):
            if isinstance(entry, np.ndarray):
                img_name = stem + '_p' + str(page_number) + '.jpg'
                cv2.imwrite(imgFolder + img_name, entry)

    log_owner = Logger(__name__)
    Logger.get_log(log_owner).info('Image Saved')
    # Drop the handlers so repeated calls do not accumulate them.
    log_owner.logger.handlers.clear()
def Segmentation(self):
    """Extract text from every page at the configured text level.

    Level 1: the result of ``Leve1Extraction`` for each page is appended to
    ``self.Text``.

    Level 2: the results of ``ZhangExtraction``, ``JieExtraction``,
    ``TiaoExtraction`` and ``TitleExtraction`` for each page are appended to
    ``self.Zhang``, ``self.Jie``, ``self.Tiao`` and ``self.Title``
    respectively.

    Completion is logged at info level.
    """
    for page_layout in self.PagesLayout:
        if self.TextLevel == 1:
            self.Text.append(Leve1Extraction(page_layout))
            continue
        if self.TextLevel == 2:
            zhang_part = ZhangExtraction(page_layout)
            self.Zhang.append(zhang_part)
            jie_part = JieExtraction(page_layout)
            self.Jie.append(jie_part)
            tiao_part = TiaoExtraction(page_layout)
            self.Tiao.append(tiao_part)
            title_part = TitleExtraction(page_layout)
            self.Title.append(title_part)

    log_owner = Logger(__name__)
    Logger.get_log(log_owner).info('Text Segmentation Finished')
    # Drop the handlers so repeated calls do not accumulate them.
    log_owner.logger.handlers.clear()
def configCheck(self):
    """Validate the configuration and interactively repair it where possible.

    Checks, in order:

    * ``self.folder`` must end with ``'/'``. Otherwise the user is asked
      whether to append it; declining exits via ``sys.exit()``.
    * ``self.filename`` must be ``'all'`` or end with ``'.pdf'``. Otherwise
      the user is asked whether to append ``'.pdf'``; declining exits via
      ``sys.exit()``.
    * ``self.text_level`` must be the integer 1 or 2. Otherwise the user is
      prompted repeatedly until ``1`` or ``2`` is entered.

    Every failed check is logged at critical level through ``self.logging``.
    """
    # endswith() also copes with an empty folder string, where indexing
    # with [-1] would raise IndexError.
    if not self.folder.endswith('/'):
        Logger.get_log(
            self.logging).critical('Configuration - Folder Format Error')
        print("Configuration - Folder may loss '/' to the end of the path")
        answer = input(
            "Do you want system add '/' to the end of path ? (Y/N)\n")
        if answer.lower() in ('y', 'yes'):
            self.folder += '/'
        else:
            sys.exit()

    if self.filename != 'all' and not self.filename.endswith('.pdf'):
        Logger.get_log(self.logging).critical(
            'Configuration - FileName Not End With .pdf ')
        print('Configuration - FileName Not End With \'.pdf\'')
        answer = input(
            "Do you want system add '.pdf' to the end of filename ? (Y/N)\n"
        )
        if answer.lower() in ('y', 'yes'):
            self.filename += '.pdf'
        else:
            sys.exit()

    if self.text_level not in (1, 2):
        Logger.get_log(self.logging).critical(
            'Configuration - text_level Format Error ')
        while True:
            print('Configuration - text_level Format Error ')
            entered = input(
                "Please press 1/2 to specify a text_level \n")
            if entered in ('1', '2'):
                # Store an int: the validation above only accepts the
                # integers 1 and 2, so a string would never match.
                self.text_level = int(entered)
                break
def __init__(self):
    """Create the logger, then process the configuration file.

    Stores a ``Logger`` for this module on ``self.logging``, logs the start
    of processing, runs ``self.config()`` and logs completion.
    """
    self.logging = Logger(__name__)

    start_log = Logger.get_log(self.logging)
    start_log.info('Start processing ConfigFile')

    self.config()

    # Fetch the log object again rather than reusing the first one, keeping
    # one get_log() call per message.
    done_log = Logger.get_log(self.logging)
    done_log.info('ConfigFile Processed\n')
def FNTypeCheck(FigNoteList):
    """Return the most frequent figure-note type over all pages.

    Each note's text is taken from ``note[1].get_text()`` with its last
    character dropped, lower-cased and stripped of spaces, then classified by
    ``FNTypeCalculate``. Types containing an uppercase ``'E'`` are discarded
    (presumably an error marker - confirm against ``FNTypeCalculate``).

    Args:
        FigNoteList: Sequence of pages, each a sequence of notes whose
            element 1 provides ``get_text()``.

    Returns:
        The type with the highest count. If several types share the highest
        count, the tie is logged at critical level and the greatest of them
        by string comparison is returned, but never anything smaller than
        ``'000'``. ``None`` is returned, after a critical log entry, when no
        usable type remains.
    """
    classified = []
    for page_notes in FigNoteList:
        for note in page_notes:
            note_text = note[1].get_text()[:-1].lower().replace(" ", "")
            classified.append(FNTypeCalculate(note_text))

    usable = [kind for kind in classified if kind.find('E') < 0]

    if not usable:
        log_owner = Logger(__name__)
        Logger.get_log(log_owner).critical('No ImageNote')
        log_owner.logger.handlers.clear()
        return None

    # Count occurrences; dict insertion order keeps first-seen order.
    tally = {}
    for kind in usable:
        tally[kind] = tally.get(kind, 0) + 1

    top_count = max(tally.values())
    # [type, count] pairs that reach the top count, in first-seen order.
    leaders = [[kind, hits] for kind, hits in tally.items()
               if hits == top_count]

    if len(leaders) == 1:
        return leaders[0][0]

    # Tie: take the greatest type string, with '000' as the lower bound.
    winner = max(['000'] + [pair[0] for pair in leaders])
    log_owner = Logger(__name__)
    Logger.get_log(log_owner).critical(
        'Same Type of ImageNote: {}'.format(leaders))
    log_owner.logger.handlers.clear()
    return winner