def Segmentation(self):
    """Run the extraction stage(s) selected by ``configList.tit_choice``.

    A falsy choice runs table, image and text extraction and then replaces
    ``self.Text.Text`` with the result of ``ImgTabOut``.  ``1`` runs text
    extraction only, ``2`` image extraction only and ``3`` table extraction
    only.  Any other value runs no extraction at all.
    """
    run_log = Logger(__name__)
    Logger.get_log(run_log).info('Segmentation Start')

    settings = self.configList
    text_level = settings.text_level
    table_level = settings.table_level
    choice = settings.tit_choice

    if not choice:
        # Full pipeline: the text result is post-processed with the image
        # and table results once all three are available.
        self.Table = TableExtraction(table_level, self.PagesLayout)
        self.Image = ImageExtraction(self.PagesImage, self.PagesLayout)
        self.Text = TextExtraction(text_level, self.PagesLayout)
        self.Text.Text = ImgTabOut(
            self.PgHeight,
            self.Text.Text,
            self.Image.Image,
            self.Table.Table,
        )
    elif choice == 1:
        self.Text = TextExtraction(text_level, self.PagesLayout)
    elif choice == 2:
        self.Image = ImageExtraction(self.PagesImage, self.PagesLayout)
    elif choice == 3:
        self.Table = TableExtraction(table_level, self.PagesLayout)

    Logger.get_log(run_log).info('Segmentation Finished')
    run_log.logger.handlers.clear()
# TitleExtraction(PageLayout) -> (title, titleIndex)
# Scans PageLayout._objs for the title text box. Only objects whose y0 is
# above 0.8 * PageLayout.height are inspected; an LTTextBoxHorizontal becomes
# the new candidate when the height of one of its LTTextLineHorizontal lines
# is more than 1.3 times the best height seen so far (the initial best is -1,
# so the first qualifying line is always accepted).
# Returns the chosen box and its index in _objs. When nothing qualifies,
# title stays [] and titleIndex stays 0, and a critical 'No Title Found'
# message is logged.
# NOTE(review): this definition has lost its line breaks and indentation, so
# the nesting of `break` and of `else: break` cannot be recovered from the
# text (the `else` may belong to either `if` or to the height test). Confirm
# against version control before reformatting.
def TitleExtraction(PageLayout): title = [] titleHeight = -1 titleIndex = 0 Height = PageLayout.height for index in range(len(PageLayout._objs)): item = PageLayout._objs[index] if item.y0 > 0.8 * Height: if isinstance(item, LTTextBoxHorizontal): for line in item: if isinstance(line, LTTextLineHorizontal): height = line.height if height > 1.3 * titleHeight: titleHeight = height titleIndex = index title = item break else: break if title == []: logging = Logger(__name__) Logger.get_log(logging).critical('No Title Found') logging.logger.handlers.clear() return title, titleIndex
def JsonWrite(JsonFile, fileName, fileFolder):
    """Dump ``JsonFile`` as JSON next to the other outputs.

    The target path is ``fileFolder`` followed by ``fileName`` without its
    last four characters and a ``.json`` suffix.  ``fileFolder`` is
    concatenated as-is, so it has to carry its own trailing separator.
    """
    stem = fileName[:-4]
    target = fileFolder + stem + '.json'
    with open(target, 'w') as stream:
        json.dump(JsonFile, stream)

    save_log = Logger(__name__)
    Logger.get_log(save_log).info('JsonFile Saved')
    save_log.logger.handlers.clear()
def Segmentation(self):
    """Run text extraction over ``self.PagesLayout`` and store it in ``self.Text``.

    The extraction level is taken from ``self.configList.text_level``.
    """
    run_log = Logger(__name__)
    Logger.get_log(run_log).info('Segmentation Start')

    level = self.configList.text_level
    self.Text = TextExtraction(level, self.PagesLayout)

    Logger.get_log(run_log).info('Segmentation Finished')
    run_log.logger.handlers.clear()
def configCheck(self):
    """Validate the configuration values and repair bad ones interactively.

    Checks, in order:

    * ``self.folder`` must end with ``'/'``; the user may let the system
      append it, otherwise the process exits via ``sys.exit()``.
    * ``self.filename`` must be ``'all'`` or end with ``'.pdf'``; the user may
      let the system append ``'.pdf'``, otherwise the process exits.
    * ``self.tit_choice`` must be 0, 1, 2 or 3.
    * ``self.text_level`` and ``self.table_level`` must be 1 or 2.

    For the three numeric settings the user is prompted until a valid value
    is typed.  The accepted answer is stored as an ``int`` so that it
    compares equal to the integer constants used by the checks here.  A
    repaired ``table_level`` is written to ``self.table_level`` (it used to
    overwrite ``self.text_level``).
    """

    def confirmed(question):
        # Only an explicit 'y' / 'yes' (any letter case) counts as consent.
        return input(question).lower() in ('y', 'yes')

    def ask_number(log_message, screen_message, question, allowed):
        # Log once, then keep prompting until one of the allowed digits is
        # typed; return it as an int.
        Logger.get_log(self.logging).critical(log_message)
        while True:
            print(screen_message)
            answer = input(question)
            if answer in allowed:
                return int(answer)

    # endswith() also copes with an empty folder string, where indexing
    # with [-1] would raise IndexError.
    if not self.folder.endswith('/'):
        Logger.get_log(
            self.logging).critical('Configuration - Folder Format Error')
        print("Configuration - Folder may loss '/' to the end of the path")
        if confirmed(
                "Do you want system add '/' to the end of path ? (Y/N)\n"):
            self.folder += '/'
        else:
            sys.exit()

    if self.filename != 'all' and not self.filename.endswith('.pdf'):
        Logger.get_log(self.logging).critical(
            'Configuration - FileName Not End With .pdf ')
        print('Configuration - FileName Not End With \'.pdf\'')
        if confirmed(
                "Do you want system add '.pdf' to the end of filename ? (Y/N)\n"
        ):
            self.filename += '.pdf'
        else:
            sys.exit()

    if self.tit_choice not in (0, 1, 2, 3):
        self.tit_choice = ask_number(
            'Configuration - tit_choice Format Error ',
            'Configuration - tit_choice Format Error',
            "Please press 0/1/2/3 to specify a tit_choice \n",
            ('0', '1', '2', '3'))

    if self.text_level not in (1, 2):
        self.text_level = ask_number(
            'Configuration - text_level Format Error ',
            'Configuration - text_level Format Error ',
            "Please press 1/2 to specify a text_level \n",
            ('1', '2'))

    if self.table_level not in (1, 2):
        self.table_level = ask_number(
            'Configuration - table_level Format Error ',
            'Configuration - table_level Format Error ',
            "Please press 1/2 to specify a table_level \n",
            ('1', '2'))
def pdf2layout(fileName):
    """Parse ``fileName`` into page layouts.

    Calls ``with_pdf(fileName, '', _parse_pages, '/tmp')`` and returns its
    result.  Any exception raised by the parser is logged as critical and
    turned into a ``None`` return value, so callers must check for ``None``.

    The logger's handlers are cleared on both the success and the failure
    path; previously the failure path left them attached.
    """
    parse_log = Logger(__name__)
    try:
        PagesLayout = with_pdf(fileName, '', _parse_pages, '/tmp')
    except Exception:
        # Best-effort contract kept from the original: report and return None.
        Logger.get_log(parse_log).critical('pdf2xml failed\n')
        return None
    else:
        Logger.get_log(parse_log).info('pdf2xml Completed')
        return PagesLayout
    finally:
        parse_log.logger.handlers.clear()
def Segmentation(self):
    """Extract the images of every page and append them to ``self.Image``.

    Page ``i`` of ``self.PagesLayout`` is paired with entry ``i`` of
    ``self.PagesImage`` and both are handed to ``ImgExtraction``.
    """
    for page_no, page_layout in enumerate(self.PagesLayout):
        page_image = self.PagesImage[page_no]
        self.Image.append(ImgExtraction(page_image, page_layout))

    seg_log = Logger(__name__)
    Logger.get_log(seg_log).info('Image Segmentation Finished')
    seg_log.logger.handlers.clear()
def JsonWrite(PagesText, fileName, fileFolder):
    """Write each page's lines to ``<fileFolder><fileName>/<fileName>_<N>.txt``.

    ``PagesText`` is iterated page by page (``N`` is the 1-based position)
    and every line of a page is written followed by a newline.  The output
    folder is created when missing, including missing parents.  Files are
    written as UTF-8 so the result does not depend on the platform's default
    encoding.

    NOTE(review): despite the function name and the 'JsonFile Saved' log
    message, the files written here are plain ``.txt`` files.
    """
    base_name = str(fileName)
    pdf_folder = fileFolder + base_name + '/'
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(pdf_folder, exist_ok=True)

    for page_no, page_lines in enumerate(PagesText, start=1):
        txt_path = pdf_folder + base_name + '_' + str(page_no) + '.txt'
        with open(txt_path, 'w', encoding='utf-8') as stream:
            stream.writelines(line + '\n' for line in page_lines)

    save_log = Logger(__name__)
    Logger.get_log(save_log).info('JsonFile Saved')
    save_log.logger.handlers.clear()
def ImageWrite(ImageList, fileName, fileFolder):
    """Save every entry of ``ImageList`` as a JPEG via ``cv2.imwrite``.

    Images go to ``<fileFolder><fileName>/<fileName>_<N>.jpg`` where ``N`` is
    the 1-based position in ``ImageList``.  The folder is created with
    ``os.mkdir`` when it does not exist yet.
    """
    base_name = str(fileName)
    out_dir = fileFolder + base_name + '/'
    if not os.path.exists(out_dir[:-1]):
        os.mkdir(out_dir)

    for page_no, page_image in enumerate(ImageList, start=1):
        jpg_name = base_name + '_' + str(page_no) + '.jpg'
        cv2.imwrite(out_dir + jpg_name, page_image)

    save_log = Logger(__name__)
    Logger.get_log(save_log).info('Image Saved')
    save_log.logger.handlers.clear()
def Segmentation(self):
    """Collect level-1 text for every page into ``self.Text``.

    With ``self.TextLevel == 1`` each page layout is passed to
    ``Leve1Extraction`` and the result appended.  ``TextLevel == 2`` is
    accepted but does nothing here.
    """
    for page_layout in self.PagesLayout:
        if self.TextLevel == 1:
            self.Text.append(Leve1Extraction(page_layout))
        # TextLevel 2 is intentionally a no-op in this variant.

    seg_log = Logger(__name__)
    Logger.get_log(seg_log).info('Text Segmentation Finished')
    seg_log.logger.handlers.clear()
def pdf2image(fileName):
    """Render ``fileName`` to a list of BGR image arrays, one per page.

    ``convert_from_path`` writes into a temporary directory that only exists
    for the duration of the ``with`` block, so each page is converted with
    ``np.asarray`` + ``cv2.cvtColor(..., cv2.COLOR_RGB2BGR)`` before the
    directory is removed.  The list returned by ``convert_from_path`` is
    updated in place and returned.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        PagesImage = convert_from_path(fileName, output_folder=scratch_dir)
        for position, rendered_page in enumerate(PagesImage):
            rgb_pixels = np.asarray(rendered_page)
            PagesImage[position] = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)

    convert_log = Logger(__name__)
    Logger.get_log(convert_log).info('pdf2image Completed')
    convert_log.logger.handlers.clear()
    return PagesImage
def Segmentation(self):
    """Detect the tables of every page and, at level 2, their cell structure.

    For each page layout, ``detect_table`` is called and every detected
    table is wrapped in ``Region`` and collected; the per-page list is
    appended to ``self.Table``.

    When ``self.TableLevel == 2`` each table is additionally passed to
    ``extraction`` to obtain column headers, row headers and body cells.  The
    first element of every cell is replaced in place by ``Region`` of that
    element; header cells get the same treatment for every child listed at
    index 6.  The per-page results are appended to ``self.Column_Header``,
    ``self.Row_Header`` and ``self.Body``.

    The replacement is done by index assignment.  The previous
    ``insert(0, ...)`` followed by ``remove(cell[1])`` removed by *value*,
    which only hits the intended slot as long as the new ``Region`` does not
    compare equal to the element it replaces.
    """

    def wrap_first(cell):
        # Swap the raw first element for its Region wrapper, in place.
        cell[0] = Region(cell[0])

    def wrap_header(cells):
        # Header cells carry their children at index 6; wrap those as well.
        for cell in cells:
            wrap_first(cell)
            for child in cell[6]:
                wrap_first(child)

    detailed = self.TableLevel == 2

    for PageLayout in self.PagesLayout:
        page_tables = []
        page_c_headers = []
        page_r_headers = []
        page_bodies = []

        for table_item in detect_table(PageLayout):
            page_tables.append(Region(table_item))
            if not detailed:
                continue

            c_header, r_header, body = extraction(PageLayout, table_item)
            wrap_header(c_header)
            wrap_header(r_header)
            for cell in body:
                wrap_first(cell)

            page_c_headers.append(c_header)
            page_r_headers.append(r_header)
            page_bodies.append(body)

        self.Table.append(page_tables)
        if detailed:
            self.Column_Header.append(page_c_headers)
            self.Row_Header.append(page_r_headers)
            self.Body.append(page_bodies)

    seg_log = Logger(__name__)
    Logger.get_log(seg_log).info('Table Segmentation Finished')
    seg_log.logger.handlers.clear()
def Segmentation(self):
    """Segment the text of every page at the configured ``self.TextLevel``.

    Level 1 appends the result of ``Leve1Extraction`` for each page to
    ``self.Text``.

    Level 2 runs, per page, ``PageExtraction``, ``NoteExtraction``,
    ``FigureNoteExtraction`` and ``TableNoteExtraction``.  Title and author
    are looked for on the first page only; every other page contributes
    empty lists and the placeholder indices ``-1`` / ``[]``.  The indices are
    then passed to ``Level2Extraction``.  After the loop the collected
    figure/table notes are post-processed with ``NotePostProcess`` and
    ``self.Text`` is replaced by the result of ``FigTabNoteOut``.
    """
    figure_notes = []
    table_notes = []

    for page_no, page_layout in enumerate(self.PagesLayout):
        if self.TextLevel == 1:
            self.Text.append(Leve1Extraction(page_layout))
        elif self.TextLevel == 2:
            page_item, pg_index = PageExtraction(page_layout)
            note_item, nt_index = NoteExtraction(page_layout)
            figure_notes.append(FigureNoteExtraction(page_layout))
            table_notes.append(TableNoteExtraction(page_layout))

            if page_no == 0:
                title_item, tt_index = TitleExtraction(page_layout)
                author_item, au_index = AuthorExtraction(page_layout, tt_index)
            else:
                # No title/author search beyond the first page.
                title_item, tt_index = [], -1
                author_item, au_index = [], []
            self.Title.append(title_item)
            self.Author.append(author_item)

            body_text = Level2Extraction(page_layout, pg_index, nt_index,
                                         tt_index, au_index)
            self.Page.append(page_item)
            self.Note.append(note_item)
            self.Text.append(body_text)

    if self.TextLevel == 2:
        self.TableNote = NotePostProcess(table_notes, "T")
        self.FigureNote = NotePostProcess(figure_notes, "F")
        self.Text = FigTabNoteOut(self.Text, self.TableNote, self.FigureNote)

    seg_log = Logger(__name__)
    Logger.get_log(seg_log).info('Text Segmentation Finished')
    seg_log.logger.handlers.clear()
def __init__(self, pages_layout):
    """Run table, text and figure analysis on ``pages_layout``.

    The results are stored as ``self.table``, ``self.text`` and
    ``self.figure``, created in that order; start and end are logged.
    """
    self.pages_layout = pages_layout

    progress = Logger(__name__)
    progress.logger.info('Analysis Start')

    self.table = TableAnalysis(self.pages_layout)
    self.text = TextAnalysis(self.pages_layout)
    self.figure = FigureAnalysis(self.pages_layout)

    progress.logger.info('Analysis Finished')
def __init__(self, pages_layout):
    """Run ``detect_table`` on every page layout.

    ``self.pages_table`` ends up with one entry per page, in page order.
    """
    self.pages_layout = pages_layout
    self.pages_table = []
    # extend() consumes the iterator item by item, like repeated append().
    self.pages_table.extend(
        detect_table(layout) for layout in self.pages_layout)

    progress = Logger(__name__)
    progress.logger.info('Table Analysis Finished')
def __init__(self, pages_layout):
    """Run ``detect_text`` on every page layout.

    ``self.pages_text`` ends up with one entry per page, in page order.
    """
    self.pages_layout = pages_layout
    self.pages_text = []
    # extend() consumes the iterator item by item, like repeated append().
    self.pages_text.extend(
        detect_text(layout) for layout in self.pages_layout)

    progress = Logger(__name__)
    progress.logger.info('Text Analysis Finished')
def Segmentation(self):
    """Segment the text of every page at the configured ``self.TextLevel``.

    Level 1 appends the result of ``Leve1Extraction`` to ``self.Text``.
    Level 2 runs ``ZhangExtraction``, ``JieExtraction``, ``TiaoExtraction``
    and ``TitleExtraction`` on the page, in that order, appending each result
    to ``self.Zhang``, ``self.Jie``, ``self.Tiao`` and ``self.Title``.
    """
    for page_layout in self.PagesLayout:
        if self.TextLevel == 1:
            self.Text.append(Leve1Extraction(page_layout))
        elif self.TextLevel == 2:
            self.Zhang.append(ZhangExtraction(page_layout))
            self.Jie.append(JieExtraction(page_layout))
            self.Tiao.append(TiaoExtraction(page_layout))
            self.Title.append(TitleExtraction(page_layout))

    seg_log = Logger(__name__)
    Logger.get_log(seg_log).info('Text Segmentation Finished')
    seg_log.logger.handlers.clear()
# ImageWrite(ImageList, fileName, fileFolder)
# Saves every np.ndarray entry of ImageList with cv2.imwrite to
# '<fileFolder><fileName[:-4]>/<fileName[:-4]>_p<N>.jpg', where N is the
# 1-based position of the entry in ImageList. Entries that are not ndarrays
# are skipped but still consume a position number. The folder is created
# with os.mkdir only when at least one entry is an ndarray.
# fileFolder is concatenated as-is, so it must carry its trailing separator.
# NOTE(review): this definition has lost its line breaks and indentation, so
# it cannot be determined from the text whether the final 'Image Saved' log
# statements sit inside `if ImgWrite:` or run unconditionally. Confirm
# against version control before reformatting.
def ImageWrite(ImageList, fileName, fileFolder): ImgWrite = False for Image in ImageList: if isinstance(Image, np.ndarray): ImgWrite = True if ImgWrite: imgFolder = fileFolder + fileName[:-4] + '/' if not os.path.exists(imgFolder[:-1]): os.mkdir(imgFolder) for index in range(len(ImageList)): Image = ImageList[index] imgName = fileName[:-4] + '_p' + str(index+1) + '.jpg' if isinstance(Image, np.ndarray): cv2.imwrite(imgFolder + imgName, Image) logging = Logger(__name__) Logger.get_log(logging).info('Image Saved') logging.logger.handlers.clear()
def image_write(folder, filename, page_images, flag):
    """Save each page image as ``<folder>/<filename>/<filename>_<N>.jpg``.

    ``N`` is the 1-based position in ``page_images``.  The output directory
    is created with ``os.mkdir`` when missing.  ``flag`` is only used as the
    prefix of the ``'<flag> Saved'`` log message.
    """
    out_dir = os.path.join(folder, filename)
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    for page_no, page_image in enumerate(page_images, start=1):
        jpg_name = ''.join((filename, '_', str(page_no), '.jpg'))
        cv2.imwrite(os.path.join(out_dir, jpg_name), page_image)

    save_log = Logger(__name__)
    save_log.logger.info(flag + ' Saved')
# AuthorExtraction(PageLayout, TitleIndex) -> (author, auIndex)
# Looks through PageLayout._objs after TitleIndex for the first
# LTTextBoxHorizontal line whose text (spaces removed, lower-cased) contains
# 'abstract', remembering the object's index and the line's y1. A second
# search of the same shape looks for 'introduction', and 'No Abstract Found'
# / 'No Introduction Found' are logged as critical when the respective marker
# is missing.
# With no marker at all, the object right after the title is returned as the
# only author. Otherwise the authors are the LTTextBoxHorizontal objects
# between the title and the marker whose y0 is above
# max(marker y1, 0.6 * page height) and whose x0 + x1 lies strictly between
# Width / 4 and 7 * Width / 4.
# Returns the author boxes and their indices in _objs as two parallel lists.
# NOTE(review): this definition has lost its line breaks and indentation. The
# text does not show whether the 'introduction' search is nested inside the
# first `if not breakSign:`, nor which `if` the trailing `else:` pairs with.
# Confirm against version control before reformatting.
def AuthorExtraction(PageLayout, TitleIndex): author = [] auIndex = [] breakSign = False for index in range(TitleIndex + 1, len(PageLayout._objs)): Box = PageLayout._objs[index] if isinstance(Box, LTTextBoxHorizontal): for line in Box: lineText = line.get_text().replace(' ', '').lower() if lineText.find('abstract') >= 0: abstractIndex = index abstractUpY = line.y1 breakSign = True break if breakSign: break if not breakSign: logging = Logger(__name__) Logger.get_log(logging).critical('No Abstract Found') logging.logger.handlers.clear() for index in range(TitleIndex + 1, len(PageLayout._objs)): Box = PageLayout._objs[index] if isinstance(Box, LTTextBoxHorizontal): for line in Box: lineText = line.get_text().replace(' ', '').lower() if lineText.find('introduction') >= 0: abstractIndex = index abstractUpY = line.y1 breakSign = True break if breakSign: break if not breakSign: logging = Logger(__name__) Logger.get_log(logging).critical('No Introduction Found') logging.logger.handlers.clear() author.append(PageLayout._objs[TitleIndex + 1]) auIndex.append(TitleIndex + 1) else: Width = PageLayout.width Height = PageLayout.height for index in range(TitleIndex + 1, abstractIndex): Box = PageLayout._objs[index] if Box.y0 > max(abstractUpY, 0.6 * Height) and isinstance( Box, LTTextBoxHorizontal): if (Box.x0 + Box.x1) > Width / 4 and (Box.x0 + Box.x1) < 7 * Width / 4: author.append(Box) auIndex.append(index) return author, auIndex
def json_write(folder, filename, content):
    """Write ``content`` as per-page JSON files plus one combined file.

    Inside ``<folder>/<filename>/`` (created with ``os.mkdir`` when missing)
    entry ``N`` of ``content`` (1-based) is dumped to ``<filename>_<N>.json``
    and all entries together are dumped to ``<filename>.json`` under the key
    ``'Pages'``.  Files are UTF-8, non-ASCII characters are kept as-is and
    the output is indented by four spaces.
    """
    out_dir = os.path.join(folder, filename)
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    pages = []
    for page_no, page in enumerate(content, start=1):
        page_name = ''.join((filename, '_', str(page_no), '.json'))
        with open(os.path.join(out_dir, page_name), 'w',
                  encoding='utf-8') as stream:
            json.dump(page, stream, ensure_ascii=False, indent=4)
        pages.append(page)

    combined_path = os.path.join(out_dir, filename + '.json')
    with open(combined_path, 'w', encoding='utf-8') as stream:
        json.dump({'Pages': pages}, stream, ensure_ascii=False, indent=4)

    save_log = Logger(__name__)
    save_log.logger.info('JsonFile Saved')
def __init__(self):
    """Load ``conf.cfg`` and prepare the input/output directories.

    Reads ``pdf_folder``, ``json_output``, ``ori_output`` and ``anno_output``
    from the ``configuration`` section, creates any of those directories
    that do not exist yet, and stores the sorted listing of ``pdf_folder`` in
    ``self.file_list``.
    """
    self.logging = Logger(__name__)
    self.logging.logger.info('Start processing ConfigFile')

    parser = ConfigParser()
    parser.read('conf.cfg')
    section = 'configuration'
    self.pdf_folder = parser.get(section, 'pdf_folder')
    self.json_output = parser.get(section, 'json_output')
    self.ori_output = parser.get(section, 'ori_output')
    self.anno_output = parser.get(section, 'anno_output')

    required_dirs = (self.pdf_folder, self.json_output, self.ori_output,
                     self.anno_output)
    for directory in required_dirs:
        if not os.path.exists(directory):
            os.makedirs(directory)

    self.file_list = sorted(os.listdir(self.pdf_folder))
    self.logging.logger.info('ConfigFile Processed\n')
def __init__(self):
    """Create the instance logger and run ``self.config()``.

    An info message is logged before and after the configuration step.
    """
    self.logging = Logger(__name__)

    Logger.get_log(self.logging).info('Start processing ConfigFile')
    self.config()
    Logger.get_log(self.logging).info('ConfigFile Processed\n')
def FNTypeCheck(FigNoteList):
    """Return the most frequent figure-note type over all pages.

    ``FigNoteList`` holds one list of figure notes per page.  For every note
    the text of element ``[1]`` (last character dropped, lower-cased, spaces
    removed) is classified with ``FNTypeCalculate``.  Types containing
    ``'E'`` are ignored.

    Returns:
        * ``None`` when no usable type remains (logged as critical).
        * The single most frequent type when there is a clear winner.
        * On a tie for the highest count, the largest of the tied type
          strings, but never less than ``'000'``; the tie is logged as
          critical together with the tied ``[type, count]`` pairs in
          first-seen order.

    Counting uses ``collections.Counter`` (which preserves first-seen order)
    in place of the repeated ``list.remove`` passes, and no longer shadows
    the builtin ``type``.
    """
    from collections import Counter

    note_types = []
    for page_notes in FigNoteList:
        for fig_note in page_notes:
            note_text = fig_note[1].get_text()[:-1].lower().replace(" ", "")
            note_types.append(FNTypeCalculate(note_text))

    usable_types = [kind for kind in note_types if 'E' not in kind]
    if not usable_types:
        check_log = Logger(__name__)
        Logger.get_log(check_log).critical('No ImageNote')
        check_log.logger.handlers.clear()
        return None

    counts = Counter(usable_types)
    best_count = max(counts.values())
    # Same shape and order as the original list: [[type, count], ...].
    MaxTypeCount = [[kind, count] for kind, count in counts.items()
                    if count == best_count]

    if len(MaxTypeCount) == 1:
        return MaxTypeCount[0][0]

    # Tie-break: lexicographically largest type, floored at '000'.
    MaxType = max(['000'] + [pair[0] for pair in MaxTypeCount])
    check_log = Logger(__name__)
    Logger.get_log(check_log).critical(
        'Same Type of ImageNote: {}'.format(MaxTypeCount))
    check_log.logger.handlers.clear()
    return MaxType