def get_result_from_file(filename):
    """Collect the bounding box and text labels of every page of a PDF.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        dict: ``{"filename": filename, "pages": [...]}``.  Each page entry
        holds the zero-based page ``index``, the ``bounding_box`` returned by
        ``get_bounding_box(layout)`` and the ``labels`` returned by
        ``get_text_labels(layout)``.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable.
    """
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfpage import PDFTextExtractionNotAllowed
    from pdfminer.pdfinterp import PDFResourceManager
    from pdfminer.pdfinterp import PDFPageInterpreter
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams

    result = {"filename": filename, "pages": []}

    laparams = LAParams()
    laparams.char_margin = 2.0
    laparams.detect_vertical = True
    laparams.line_margin = 1.0

    # The context manager closes the file even when extraction is refused or
    # a page fails to parse.
    with open(filename, "rb") as fp:
        document = PDFDocument(PDFParser(fp))
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page_index, page in enumerate(PDFPage.create_pages(document)):
            interpreter.process_page(page)
            layout = device.get_result()
            bounding_box = get_bounding_box(layout)
            labels = get_text_labels(layout)
            result["pages"].append({
                "index": page_index,
                "bounding_box": bounding_box,
                "labels": labels,
            })

    return result
def extract_text_from_pdf(path_in, path_out, fichier, page_beg, page_end=0):
    """Return the text of a range of pages of a PDF file.

    Args:
        path_in: Directory that contains the PDF.
        path_out: Not used by this function; kept so existing callers keep
            working.
        fichier: File name of the PDF inside ``path_in``.
        page_beg: Number of the first page to extract (1-based).
        page_end: Number of the last page to extract (inclusive).  ``0``
            means "only ``page_beg``".

    Returns:
        str: For every requested page, a ``"EXTRACTION DE LA PAGE <n>"``
        header followed by the text of each text box / text line found on
        the page, each followed by a newline.
    """
    if page_end == 0:
        page_end = page_beg

    laparams = LAParams()
    # 2.0 by default: two characters whose distance is closer than this
    # value are considered contiguous and get grouped into one.
    laparams.char_margin = 4.0
    # 0.1 by default: a space is inserted when the distance between two
    # words is greater than this value.
    laparams.word_margin = 0.3
    # 0.5 by default: two lines closer than this value are grouped together.
    laparams.line_margin = 0.5

    # Collect the pieces in a list and join once instead of growing a string.
    chunks = []

    # The context manager guarantees the PDF is closed, including on errors.
    with open(path_in + '/' + fichier, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        pages = list(doc.get_pages())
        for i in range(page_beg - 1, page_end):
            page = pages[i]
            chunks.append("EXTRACTION DE LA PAGE " + str(i + 1) + "\n\n")
            interpreter.process_page(page)
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    chunks.append(lt_obj.get_text())
                    chunks.append("\n")

    return "".join(chunks)
def to_txt(infile: str, outfile: str) -> str:
    """Convert a PDF file to a UTF-8 text file.

    :param infile: pdf file path;
    :param outfile: txt file path;
    :return: txt file path (the ``outfile`` argument, unchanged);
    """
    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)

    laparams = LAParams()
    laparams.word_margin = float(0)
    laparams.line_margin = float(1)

    # ``with`` / ``finally`` make sure both files and the converter are
    # released even when ``process_pdf`` raises.
    with io.open(outfile, 'wt', encoding='utf-8', errors='ignore') as outfp:
        device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        try:
            with io.open(infile, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, set(), maxpages=0,
                            password='', caching=caching,
                            check_extractable=True)
        finally:
            # Close the device before the output file it writes to.
            device.close()
    return outfile
def get_text_rows(path):
    """Yield the horizontal text boxes of a PDF grouped into rows.

    Text boxes are grouped by page number and by the integer part of their
    ``bbox[1]`` coordinate.  Figures are searched recursively for text boxes.

    Args:
        path: Path of the PDF file to read.

    Yields:
        tuple: ``(page, y, cells)`` where ``page`` is the zero-based page
        number, ``y`` is ``int(bbox[1])`` of the boxes in the row and
        ``cells`` is a sorted list of ``(int(bbox[0]), sanitize(text))``
        tuples.  Rows come out ordered by page, then by decreasing ``y``.
    """
    rows = defaultdict(list)

    def parse_obj(lt_objs, page):
        """Store every horizontal text box found in ``lt_objs``."""
        for obj in lt_objs:
            if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
                # The negated y makes an ascending sort of the keys walk
                # each page from the largest y to the smallest.
                rows[(page, -int(obj.bbox[1]))].append(
                    (int(obj.bbox[0]), sanitize(obj.get_text())))
            elif isinstance(obj, pdfminer.layout.LTFigure):
                # A figure is a container: recurse into its children.
                parse_obj(obj._objs, page)

    # Layout analysis parameters.
    laparams = LAParams()
    laparams.line_overlap = 0.01
    laparams.line_margin = 0.01
    laparams.word_margin = 0.15

    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Every page is read before the first row is yielded, so the file can be
    # closed before handing control back to the consumer.
    with open(path, 'rb') as fp:
        for page_num, page in enumerate(PDFPage.get_pages(fp)):
            interpreter.process_page(page)
            layout = device.get_result()
            parse_obj(layout._objs, page_num)

    for key in sorted(rows):
        page, neg_y = key
        yield (page, -neg_y, sorted(rows[key]))
def parse(self, path):
    """Extract the text of the PDF at ``path`` and wrap it in a ``Sample``.

    Args:
        path: Path of a PDF file.  Directories are not supported.

    Returns:
        ``Sample(path, None, text)`` where ``text`` is everything the text
        converter wrote to its output buffer.

    Raises:
        NotImplementedError: If ``path`` is a directory.
    """
    if os.path.isdir(path):
        raise NotImplementedError()

    laparams = LAParams()
    laparams.char_margin = 2.0
    laparams.line_margin = 2.0
    laparams.word_margin = 0.0

    rsrc = PDFResourceManager()
    out = StringIO.StringIO()
    try:
        device = TextConverter(rsrc, out, codec='utf-8', laparams=laparams)
        try:
            # PDFs are binary: open in 'rb' (the former ``file(path)`` call
            # used text mode and only exists on Python 2).
            with open(path, 'rb') as fp:
                doc = PDFDocument()
                parser = PDFParser(fp)
                parser.set_document(doc)
                doc.set_parser(parser)
                doc.initialize()

                interpreter = PDFPageInterpreter(rsrc, device)
                for page in doc.get_pages():
                    interpreter.process_page(page)
        finally:
            device.close()
        # Read the buffer only after the device has been closed.
        return Sample(path, None, out.getvalue())
    finally:
        out.close()
def extract_layout_by_page(pdf_path, page_number):
    """Run layout analysis on one page of a PDF.

    :param pdf_path: pdf file path
    :param page_number: the specific page that you want to parse (start from 1)
    :return: the layout object returned by the page aggregator for that page
    :raises PDFTextExtractionNotAllowed: if the document is not extractable
    """
    # Layout analysis parameters.
    laparams = LAParams()
    laparams.line_overlap = 0.3
    laparams.char_margin = 3
    laparams.word_margin = 0.3
    laparams.line_margin = 0.01

    # Open in binary read mode; the context manager closes the file even
    # when extraction is refused or the page index is out of range.
    with open(pdf_path, 'rb') as fp:
        # Create the parser and the document, then connect them.
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        # Abort when the document does not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # The resource manager stores shared resources; the aggregator
        # collects the layout of each processed page.
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        pages = list(doc.get_pages())
        interpreter.process_page(pages[page_number - 1])
        # Receive the layout object for the page that was just processed.
        return device.get_result()
def __extract_extra__(request, item_id=None):
    """Fill ``other_search_text`` of items with the text of their PDF.

    Items without a PDF, or that already have search text, are skipped.

    Args:
        request: The incoming request; its user must be authenticated.
        item_id: When given, only the item with this id is considered;
            otherwise all items are.

    Returns:
        HttpResponse: A sign-in prompt for anonymous users, a failure
        message as soon as one PDF raises ``AssertionError`` during
        extraction, a notice when there was no item at all, or a success
        message naming the last item visited.
    """
    if not request.user.is_authenticated():
        return HttpResponse('Please sign in first')

    from pdfminer.layout import LAParams
    from pdfminer.pdfinterp import PDFResourceManager, process_pdf
    from pdfminer.converter import TextConverter
    from cStringIO import StringIO

    laparams = LAParams()
    laparams.char_margin = 1.0
    laparams.line_margin = 0.3
    laparams.word_margin = 0.2
    codec = 'utf-8'
    caching = True

    if item_id:
        all_items = Item.objects.filter(id=item_id)
    else:
        all_items = Item.objects.all()

    # Remember the last item visited so the final message never refers to an
    # unbound loop variable when the queryset is empty.
    last_item = None
    for item in all_items:
        last_item = item
        # Don't extract if no PDF exists; or if we already have search text
        if not item.pdf_file or item.other_search_text:
            continue

        rsrcmgr = PDFResourceManager(caching=caching)
        outfp = StringIO()
        try:
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams)
            fp = item.pdf_file.file
            try:
                process_pdf(rsrcmgr, device, fp, pagenos=set(), maxpages=0,
                            password='', caching=caching,
                            check_extractable=True)
            except AssertionError:
                logger.warning('FAILED in completely PDF index "%s"' % item.title)
                return HttpResponse('FAILED in completely PDF index "%s"'
                                    % item.title)
            else:
                logger.debug('Full PDF index of item "%s"' % item.title)
            finally:
                fp.close()
                device.close()

            outfp.seek(0)
            page_text = outfp.read()
        finally:
            # Also release the buffer on the failure path above.
            outfp.close()

        item.other_search_text = page_text
        item.save()

    if last_item is None:
        return HttpResponse('No items to index')
    return HttpResponse('Full PDF indexed for item "%s"' % last_item.title)
def convert(fname, pages=None, M=1.0, L=0.3, W=0.2, F=0.5):
    """Convert a PDF file into plain text.

    The layout-analysis values are not absolute lengths; each is expressed
    as a proportion of the size of the characters in question.

    In a PDF, text comes in several chunks of various types.  Chunks whose
    distance is smaller than the char_margin (M) are regarded as continuous
    and are grouped into one block.  Two lines are part of the same text if
    they are closer than the line_margin (L).  If the distance between two
    words is greater than the word_margin (W), blank characters (spaces) are
    inserted as necessary to keep the format.  Boxes flow (F) specifies how
    much the horizontal and vertical position of a text matters when
    determining the text flow order; it should be within the range from -1.0
    (only horizontal position matters) to +1.0 (only vertical position
    matters).

    Keyword arguments:
    fname -- PDF file name (string)
    pages -- Set of pages to extract (set); empty or None selects no
             specific page numbers
    M -- char_margin (float)
    L -- line_margin (float)
    W -- word_margin (float)
    F -- boxes_flow (float)

    Return:
    text: the content the converter wrote to an in-memory ``BytesIO``
          buffer (the converter is configured with the utf-8 codec)
    """
    pagenums = set(pages) if pages else set()

    laparams = LAParams()
    laparams.all_texts = True
    laparams.detect_vertical = False
    laparams.char_margin = M
    laparams.line_margin = L
    laparams.word_margin = W
    laparams.boxes_flow = F

    manager = PDFResourceManager()
    # The buffer used to be "closed" with ``output.close`` (no call
    # parentheses), which did nothing; the context manager really closes it.
    with BytesIO() as output:
        converter = TextConverter(manager, output, codec="utf-8",
                                  laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        return output.getvalue()
def readText(self, path, outtype='text', opts={}):
    """Render the PDF at ``path`` as HTML and print the result.

    The layout parameters are printed first, then the converted document.
    Conversion always goes through ``HTMLConverter``; ``outtype`` only
    contributes to an output file name that is computed but not used.

    Args:
        path: Path of the PDF file.
        outtype: Extension appended to ``path[:-3]`` for the (unused)
            output file name.
        opts: Iterable of ``(option, value)`` pairs using pdf2txt-style
            switches (``-p``, ``-m``, ``-P``, ``-n``, ``-A``, ``-V``, ``-M``,
            ``-L``, ``-W``, ``-F``, ``-c`` ...).  Unknown switches are
            ignored.  The default is never mutated.

    Returns:
        None.
    """
    # Input options.  ``password`` and ``debug`` need defaults: they used to
    # be referenced without ever being assigned.
    password = ''
    debug = 0
    pagenos = set()
    maxpages = 0
    # Output options.  Several of these are parsed for command-line
    # compatibility but not consulted afterwards.
    outfile = path[:-3] + outtype
    outdir = None
    codec = 'utf-8'
    scale = 1
    laparams = LAParams()

    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)

    # Parenthesised form prints the same thing on Python 2 and Python 3.
    print(laparams)

    CMapDB.debug = self.debug
    PDFResourceManager.debug = self.debug
    PDFPageInterpreter.debug = self.debug
    PDFDevice.debug = self.debug

    rsrcmgr = PDFResourceManager()
    outfp = StringIO()
    try:
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        try:
            with open(path, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                            password=password, check_extractable=True)
        finally:
            device.close()
        print(outfp.getvalue())
    finally:
        outfp.close()
    return
def convert_to_text_file(filename_in, filename_out, rewrite=False):
    """Convert a PDF file to a text file.

    Args:
        filename_in: Path of the PDF to read.
        filename_out: Path of the text file to write.  When it is an existing
            directory, the output is written inside it under the base name
            of ``filename_in``.
        rewrite: Overwrite an existing output file when true.

    Returns:
        bool: ``False`` when the output already exists and ``rewrite`` is
        false (nothing is read or written), ``True`` after a conversion.
    """
    if os.path.isdir(filename_out):
        filename_out = os.path.join(filename_out, os.path.basename(filename_in))

    if os.path.exists(filename_out) and not rewrite:
        logging.info('Skipping file %s already exists and rewriting is disabled!' % filename_out)
        return False

    # Conversion settings.
    codec = 'utf-8'
    imagewriter = None
    pagenos = set()
    maxpages = 0
    password = ''
    rotation = 0
    caching = True

    # <LAParams: char_margin=2.0, line_margin=0.5, word_margin=0.1 all_texts=False>
    laparams = LAParams()
    laparams.detect_vertical = True
    laparams.all_texts = False
    laparams.char_margin = 2.0
    laparams.line_margin = 0.5
    laparams.word_margin = 0.1

    rsrcmgr = PDFResourceManager(caching=caching)

    # Open the input first so a missing PDF does not leave an empty output
    # file behind; both files are closed even if a page fails to convert.
    with open(filename_in, 'rb') as fp, open(filename_out, 'w') as outfp:
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               imagewriter=imagewriter)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
        finally:
            # Close the device before the output file it writes to.
            device.close()
    return True
def extractrefs(infile, outfile):
    """Run every page of a PDF through a ``RefsExtractor`` device.

    Args:
        infile: Path of the PDF to read.
        outfile: Path of the text file handed to the device as its output.

    Returns:
        None.
    """
    pagenos = set()
    caching = True

    laparams = LAParams()
    laparams.line_margin = 1.4
    rsrcmgr = PDFResourceManager()

    # Both files are closed by the context manager, including on errors.
    with open(infile, 'rb') as infp, open(outfile, 'w') as outfp:
        device = RefsExtractor(rsrcmgr, outfp, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(infp, pagenos, caching=caching,
                                      check_extractable=True):
            interpreter.process_page(page)
def output_pdf_to_table(path):
    """Pass the layout of each page of a PDF to ``getRows``.

    The line margin and the maximum number of pages come from the
    module-level names ``line_margin_threshold`` and ``pages_to_view``.

    Args:
        path: Path of the PDF file to read.

    Returns:
        None.
    """
    laparams = LAParams()
    laparams.line_margin = line_margin_threshold

    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    password = ""
    maxpages = pages_to_view
    caching = True
    pagenos = set()

    # The file used to stay open after the call; ``with`` closes it.
    with open(path, "rb") as fp:
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True):
            interpreter.process_page(page)
            layout = device.get_result()
            getRows(layout)
def extract_text(self):
    """Read the PDF at ``self.path`` into ``self._text`` and ``self.rows``.

    ``self._text`` receives everything the text converter produced and
    ``self.rows`` the same text split on newline characters.
    """
    def undefined_char_as_codepoint(f, c):
        # Render an undefined character as the character with that code.
        return chr(c)

    layout = LAParams()
    layout.char_margin = 1000
    layout.word_margin = 0.01
    layout.line_margin = 0.01

    manager = PDFResourceManager()
    buffer = io.StringIO()
    device = TextConverter(manager, buffer, laparams=layout)
    interpreter = PDFPageInterpreter(manager, device)
    interpreter.device.handle_undefined_char = undefined_char_as_codepoint

    with open(self.path, 'rb') as pdf:
        pages = PDFPage.get_pages(pdf, caching=True, check_extractable=True)
        for pdf_page in pages:
            interpreter.process_page(pdf_page)

    text = buffer.getvalue()
    self._text = text
    self.rows = text.split('\n')

    device.close()
    buffer.close()
def read_pdf_data(filename):
    """Collect the ``group`` produced by the ``My`` device for each page.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        list: One entry per page, holding the value of ``device.group``
        right after that page was processed.
    """
    laparams = LAParams()
    laparams.line_margin = 0.1

    rsrcmgr = PDFResourceManager()
    device = My(rsrcmgr, sys.stdout, laparams=laparams)
    device.reset()
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    result_data = []
    # ``with`` closes the file even when a page fails to parse.
    with open(filename, 'rb') as fp:
        for page in PDFPage.get_pages(fp, set()):
            interpreter.process_page(page)
            result_data.append(device.group)
            # Give the device fresh accumulators for the next page; the list
            # just stored is replaced rather than cleared, so it is kept.
            device.word = ""
            device.group = []
            device.word_pos_info = {}
    return result_data
def readpdf(pdfFile):
    """Load the rows collected from a PDF into a DataFrame.

    Every page is processed by a ``PDFPageDetailedAggregator``; the rows it
    accumulates in ``device.rows`` become the frame.

    Args:
        pdfFile: Path of the PDF file to read.

    Returns:
        pandas.DataFrame: Columns ``Page``, ``x``, ``y``, ``c1``, ``c2`` and
        ``String``.
    """
    laparams = LAParams()
    # Too small and it splits the description, too big and
    # Quantity-Unit-Part number are not separated: 1.1 seems to work.
    laparams.char_margin = 1.1
    laparams.line_margin = 0.8

    rsrcmgr = PDFResourceManager()
    device = PDFPageDetailedAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The file used to stay open after the call; ``with`` closes it.
    with open(pdfFile, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            # Receive the LTPage object for this page.
            device.get_result()

    return pd.DataFrame(device.rows,
                        columns=['Page', 'x', 'y', 'c1', 'c2', 'String'])
def _get_content(fname):
    """Return the text of the PDF ``fname`` decoded as UTF-8.

    ``caching``, ``pagenos``, ``maxpages`` and ``password`` are read from
    the enclosing scope.
    """
    resources = PDFResourceManager(caching=caching)

    layout = LAParams()
    layout.line_margin = 1.0
    layout.boxes_flow = 1.0

    image_writer = None

    with io.BytesIO() as buffer:
        converter = TextConverter(resources, buffer, laparams=layout,
                                  imagewriter=image_writer)
        page_interpreter = PDFPageInterpreter(resources, converter)

        with open(fname, "rb") as pdf:
            pdf_pages = PDFPage.get_pages(
                pdf,
                pagenos,
                maxpages=maxpages,
                password=password,
                caching=caching,
                check_extractable=True,
            )
            for pdf_page in pdf_pages:
                page_interpreter.process_page(pdf_page)

        # Read the buffer while it is still open.
        raw = buffer.getvalue()
        return raw.decode("utf-8")
def pdf_to_string(pdf_file):
    """Print every layout object of every page of a PDF.

    Args:
        pdf_file: Path of the PDF file to read.

    Returns:
        None; the layout objects are only printed.
    """
    # Margin configuration.
    laparams = LAParams()
    laparams.line_margin = 0.3
    laparams.word_margin = 0.3

    # The file used to stay open after the call; ``with`` closes it.
    with open(pdf_file, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                print(lt_obj)
def from_pdf(pdfFile):
    """Extract the text of a PDF, reporting failures instead of raising.

    Args:
        pdfFile: Path of the PDF file to read.

    Returns:
        The text written by the converter, or ``None`` when anything went
        wrong (the exception and its traceback are printed).
    """
    try:
        laparams = LAParams()
        laparams.line_margin = 20
        laparams.boxes_flow = -1

        strfp = StringIO()
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, strfp, codec='utf-8',
                               laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # ``open`` replaces the Python 2-only ``file`` builtin and the
        # context manager closes the PDF when extraction ends or fails.
        with open(pdfFile, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), check_extractable=True):
                interpreter.process_page(page)

        # The extracted text used to be dropped; hand it to the caller.
        return strfp.getvalue()
    except Exception as e:
        # Deliberate best-effort: report the problem and carry on.
        print(e)
        traceback.print_exc()
        return None
def get_result_from_file(filename):
    """Collect the bounding box and text labels of every page of a PDF.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        dict: ``{"filename": filename, "pages": [...]}``.  Each page entry
        holds the zero-based page ``index``, the ``bounding_box`` returned by
        ``get_bounding_box(layout)`` and the ``labels`` returned by
        ``get_text_labels(layout)``.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable.
    """
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfpage import PDFTextExtractionNotAllowed
    from pdfminer.pdfinterp import PDFResourceManager
    from pdfminer.pdfinterp import PDFPageInterpreter
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams

    result = {"filename": filename, "pages": []}

    laparams = LAParams()
    laparams.char_margin = 2.0
    laparams.detect_vertical = True
    laparams.line_margin = 1.0

    # The context manager closes the file even when extraction is refused or
    # a page fails to parse.
    with open(filename, 'rb') as fp:
        document = PDFDocument(PDFParser(fp))
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page_index, page in enumerate(PDFPage.create_pages(document)):
            interpreter.process_page(page)
            layout = device.get_result()
            bounding_box = get_bounding_box(layout)
            labels = get_text_labels(layout)
            result["pages"].append({
                "index": page_index,
                "bounding_box": bounding_box,
                "labels": labels,
            })

    return result
def pdf_to_string(pdf_file):
    """Return the concatenated text of all text boxes and lines of a PDF.

    Args:
        pdf_file: Path of the PDF file to read.

    Returns:
        str: The ``get_text()`` output of every ``LTTextBox`` /
        ``LTTextLine`` found at the top level of each page layout, in page
        order.
    """
    laparams = LAParams()
    laparams.line_margin = 0.3
    laparams.word_margin = 0.3

    # Collect the pieces and join once instead of growing a string.
    pieces = []

    # The file used to stay open after the call; ``with`` closes it.
    with open(pdf_file, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            pieces.extend(
                lt_obj.get_text()
                for lt_obj in layout
                if isinstance(lt_obj, (LTTextBox, LTTextLine))
            )

    return ''.join(pieces)
def main(argv):
    """Command-line entry point: convert PDF files to text, HTML, XML or tags.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name and the
            rest are pdf2txt-style options followed by input file names.

    Returns:
        ``100`` after printing the usage line (bad option, no input file or
        unknown output type); ``None`` after a successful run.
    """
    def usage():
        print(('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
               '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
               '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0]))
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    debug = False
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()

    for (k, v) in opts:
        if k == '-d':
            debug = True
        elif k == '-p':
            # Page numbers are 1-based on the command line.
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)

    if debug:
        set_debug_logging()
    rsrcmgr = PDFResourceManager(caching=caching)

    if not outtype:
        # No explicit type: default to text, or infer from the extension.
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # Reject an unknown type before the output file is created/truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    if outfile:
        outfp = io.open(outfile, 'wt', encoding=codec, errors='ignore')
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False

    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams,
                                  outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   outdir=outdir, debug=debug)
        else:
            device = TagExtractor(rsrcmgr, outfp)

        try:
            for fname in args:
                # Each input is closed as soon as it is processed, also when
                # processing raises.
                with io.open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos,
                                maxpages=maxpages, password=password,
                                caching=caching, check_extractable=True)
        finally:
            device.close()
    finally:
        # Only close what this function opened; never sys.stdout.
        if close_outfp:
            outfp.close()
def ConvertPdf(pdfpath, outfp, opts={}):
    """Convert a PDF file and write the result to ``outfp``.

    Args:
        pdfpath: Path of the PDF file to read.
        outfp: Writable file-like object handed to the converter.  It is
            not closed by this function.
        opts: Iterable of ``(option, value)`` pairs using pdf2txt-style
            switches (``-d``, ``-p``, ``-m``, ``-P``, ``-C``, ``-n``, ``-A``,
            ``-V``, ``-M``, ``-L``, ``-W``, ``-F``, ``-Y``, ``-O``, ``-R``,
            ``-t``, ``-c``, ``-s``).  The default is never mutated.

    Returns:
        bool: ``True`` once every selected page has been processed.

    Raises:
        ValueError: If ``-t`` names an output type other than ``txt``,
            ``xml``, ``html`` or ``tag``.
    """
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfdevice import PDFDevice, TagExtractor
    from pdfminer.pdfpage import PDFPage
    from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
    from pdfminer.cmapdb import CMapDB
    from pdfminer.layout import LAParams
    from pdfminer.image import ImageWriter

    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None  # accepted via -o for compatibility; output goes to outfp
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()

    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            # Page numbers are 1-based in the options.
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)

    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager()
    if not outtype:
        outtype = 'txt'

    if outtype == 'txt':
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                              imagewriter=imagewriter)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        # Fail with a clear message instead of an unbound ``device`` later.
        raise ValueError('unknown output type: %r' % (outtype,))

    try:
        # ``open`` replaces the Python 2-only ``file`` builtin; the context
        # manager closes the PDF even when a page fails.
        with open(pdfpath, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
    finally:
        device.close()
    return True
def main(argv):
    """Command-line entry point of the pdf-to-text converter.

    Builds the argument parser, stores the parsed namespace in the
    module-level ``options`` (and the debug level in ``debuglevel``), then
    converts every file listed in ``options.pdffile`` with the converter
    selected by ``-t`` / the output file extension.

    Args:
        argv: Accepted for interface compatibility.  NOTE(review): the
            parser is invoked without arguments, so it reads the process
            command line rather than ``argv`` -- confirm whether callers
            rely on that before changing it.

    Returns:
        None after a normal run.
    """
    global debuglevel
    global options

    pagenos = set()
    imagewriter = None
    laparams = LAParams()
    using_optparse = False

    parser = ArgumentParser(prog='pdf2txt.py',
                            description='Convert pdf to txt',
                            formatter_class=ArgumentDefaultsHelpFormatter)
    if using_optparse:
        # Never taken as written: ``using_optparse`` is always False here.
        DEBUG(3, 'using optparse')
        parser.add_argument = parser.add_option
        parser.parse_known_args = parser.parse_args
        parser.disable_interspersed_args()

    parser.add_argument('-d', dest='debuglevel', action='count', default=0,
                        help='Debug (repeat for more verbose debugging)')
    parser.add_argument(
        '-p', '--pages', dest='pagenos', action='store', type=str, default='',
        help='Specifies the comma-separated list of the page numbers to be extracted. Page numbers start at one. By default, it extracts text from all the pages.')
    parser.add_argument('-c', '--codec', dest='codec', action='store',
                        type=str, default='utf-8',
                        help='Specifies the output codec.')
    parser.add_argument(
        '-t', '--type', dest='outtype', action='store', type=str,
        default='shape', choices=['text', 'html', 'xml', 'tag', 'shape'],
        help='Specifies the output format, one of: shape, text, html, xml, tag')
    parser.add_argument(
        '-m', dest='maxpages', action='store', type=int, default=0,
        help='Specifies the maximum number of pages to extract. By default (0), it extracts all the pages in a document.')
    parser.add_argument(
        '-P', '--password', dest='password', action='store', type=str,
        default='',
        help='Provides the user password to access PDF contents.')
    parser.add_argument(
        '-o', '--output', dest='outfile', action='store', type=str,
        default=None,
        help='Specifies the output file name. By default, it prints the extracted contents to stdout in text format.')
    parser.add_argument(
        '-C', '--no-caching', dest='caching', action='store_false',
        default=True,
        help='Suppress object caching. This will reduce the memory consumption but also slows down the process.')
    parser.add_argument('-n', '--no-layout', dest='layout',
                        action='store_false', default=True,
                        help='Suppress layout analysis.')
    parser.add_argument('--show-pageno', dest='show_pageno',
                        action='store_true', default=False,
                        help='Show page numbers.')
    parser.add_argument(
        '-A', '--analyze-all', dest='all_texts', action='store_true',
        default=False,
        help='Forces to perform layout analysis for all the text strings, including text contained in figures.')
    parser.add_argument('-V', '--detect-vertical', dest='detect_vertical',
                        action='store_true', default=False,
                        help='Allows vertical writing detection.')
    parser.add_argument(
        '-M', dest='char_margin', action='store', type=float, default=2.0,
        help='Two text chunks whose distance is closer than the char_margin (shown as M) is considered continuous and get grouped into one.')
    parser.add_argument(
        '-L', dest='line_margin', action='store', type=float, default=0.5,
        help='Two lines whose distance is closer than the line_margin (L) is grouped as a text box, which is a rectangular area that contains a "cluster" of text portions.')
    parser.add_argument(
        '-W', dest='word_margin', action='store', type=float, default=0.1,
        help='It may be required to insert blank characters (spaces) as necessary if the distance between two words is greater than the word_margin (W), as a blank between words might not be represented as a space, but indicated by the positioning of each word.')
    parser.add_argument(
        '-F', dest='boxes_flow', action='store', type=float, default=0.5,
        help='Specifies how much a horizontal and vertical position of a text matters when determining a text order. The value should be within the range of -1.0 (only horizontal position matters) to +1.0 (only vertical position matters).')
    parser.add_argument(
        '-Y', '--layout-mode', dest='layoutmode', action='store', type=str,
        default='normal', choices=['exact', 'normal', 'loose'],
        help='Specifies how the page layout should be preserved. (Currently only applies to HTML format.) One of: exact, normal, loose.')
    parser.add_argument('-O', '--image-writer', dest='imagewriter',
                        action='store', type=str, default=None,
                        help='imagewriter')
    parser.add_argument('-R', '--rotation', dest='rotation', action='store',
                        type=int, default=0, help='rotation')
    parser.add_argument('-S', '--strip-control', dest='stripcontrol',
                        action='store_true', default=False,
                        help='stripcontrol')
    parser.add_argument(
        '-s', dest='scale', action='store', type=float, default=1,
        help='Specifies the output scale. Can be used in HTML format only.')
    parser.add_argument(
        '--draw-lines', dest='draw_lines', action='store_true',
        help="Draw crude page representation, coloured TextLines (= short pieces of text). Valid only for the `shape' output.")
    parser.add_argument(
        '--draw-boxes', dest='draw_boxes', action='store_true',
        help="Draw crude page representation, coloured TextBoxes (= grouped text lines). Valid only for the `shape' output.")
    parser.add_argument(
        '--draw-blocks', dest='draw_blocks', action='store_true',
        help="Draw crude page representation, coloured TextBlocks (= grouped TextBoxes). Valid only for the `shape' output.")
    parser.add_argument(
        '--shear-limit', dest='shear_limit', action='store', default=0.1,
        type=float,
        help="If the text is sheared above this limit, reject it. Valid only for the `shape' output.")
    parser.add_argument(
        '--rotation-limit', dest='rotation_limit', action='store', default=2,
        type=float,
        help="If the text is rotated above this angle (in degrees), reject it. Valid only for the `shape' output.")
    parser.add_argument(
        '--line-height-diff', dest='line_height_diff', action='store',
        type=float, default=0.1,
        help='Two lines whose vertical sizes differ more than this ratio are not to be considered of the same paragraph (but e.g. one of them is a heading).')
    parser.add_argument('--heading-before', dest='heading_before',
                        action='store', type=str, default='',
                        help='String to put before each heading, e.g. <h1>')
    parser.add_argument('--heading-after', dest='heading_after',
                        action='store', type=str, default='',
                        help='String to put after each heading, e.g. </h1>')
    parser.add_argument(
        '--box-separator', dest='box_separator', action='store', type=str,
        default=r'\n\n',
        help=r'Separate boxes with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument(
        '--block-separator', dest='block_separator', action='store',
        type=str, default=r'\n\n',
        help=r'Separate blocks with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument(
        '--indent-separator', dest='indent_separator', action='store',
        type=str, default=r'\n\n',
        help=r'Separate indented lines with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument(
        '--indent-string', dest='indent_string', action='store', type=str,
        default=r'\t',
        help=r'Put this string in front of indented lines. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument(
        '--indent-limit', dest='indent_limit', action='store', type=float,
        default=3,
        help='If the line is indented more then this (approximately characters), it will separated by --indent-separator from the previous one.')
    parser.add_argument(
        '--page-separator', dest='page_separator', action='store', type=str,
        default=r'\n\n',
        help=r'Separate pages with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument(
        '--norm-whitespace', dest='norm_whitespace', action='store_true',
        default=False,
        help='Normalize whitespace (remove duplicate spaces, replace end of lines with spaces).')
    parser.add_argument(
        '--print-stats', dest='print_stats', action='store_true',
        default=False,
        help='Instead of the text, output some simple statistics about the file.')
    parser.add_argument(
        '--max-blocks', dest='max_blocks', action='store', default=0,
        type=int,
        help='If there is more than this blocks per page, do not return any text. Use to discriminate abnormal files (run --print-stats first to find out the number of boxes per "normal" file). 0 means no limit. 50 is maybe a good value.')
    parser.add_argument(
        '--max-textlines', dest='max_textlines', action='store', default=0,
        type=int,
        help='If there is more than this textlines per any block, do not return any text. Use to discriminate abnormal files (run --print-stats first to find out the number of boxes per "normal" page). 0 means no limit. 18 is maybe a good value.')
    parser.add_argument(
        '--line-height-method', dest='line_height_method', action='store',
        type=str, default='bbox', choices=['bbox', 'mean', 'median'],
        help='Method to calculate height of line (relevant if there are characters with uneven height). bbox takes the bounding box (rectangle encompassing the line), mean the arithmetic mean of the height of all the characters, median is the median of the height of all the characters. Use mean or median if there are outlier characters, e.g. one big character at the beginning of line.')
    parser.add_argument(dest='pdffile',
                        help='List of PDF files to go through',
                        default=None, nargs='+')

    args, rest = parser.parse_known_args()

    debuglevel = debug = args.debuglevel
    DEBUG(3, 'args:', str(args))
    DEBUG(3, 'rest:', str(rest))
    DEBUG(3, 'optparse:', using_optparse)

    if args.pagenos:
        # Page numbers are 1-based on the command line.
        pagenos.update(int(x) - 1 for x in args.pagenos.split(','))
    maxpages = args.maxpages
    outfile = args.outfile
    password = args.password
    caching = args.caching
    showpageno = args.show_pageno

    if not args.layout:
        laparams = None
    if laparams and args.all_texts:
        laparams.all_texts = True
    if laparams and args.detect_vertical:
        laparams.detect_vertical = True
    if laparams:
        laparams.char_margin = args.char_margin
        laparams.line_margin = args.line_margin
        laparams.word_margin = args.word_margin
        laparams.boxes_flow = args.boxes_flow

    layoutmode = args.layoutmode
    if args.imagewriter:
        imagewriter = ImageWriter(args.imagewriter)
    rotation = args.rotation
    stripcontrol = args.stripcontrol
    outtype = args.outtype
    codec = args.codec
    scale = args.scale

    # Separators arrive with literal backslash escapes; decode them once.
    args.box_separator = unescape_string(args.box_separator)
    args.block_separator = unescape_string(args.block_separator)
    args.indent_separator = unescape_string(args.indent_separator)
    args.indent_string = unescape_string(args.indent_string)
    args.page_separator = unescape_string(args.page_separator)

    options = args

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)

    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    if outfile:
        # ``open`` replaces the Python 2-only ``file`` builtin.
        outfp = open(outfile, 'w')
        close_outfp = True
        DEBUG(2, 'output goes to', outfile)
    else:
        outfp = sys.stdout
        close_outfp = False
        DEBUG(2, 'output goes to stdout')

    try:
        if outtype == 'shape':
            device = ShapeTextConverter(rsrcmgr, outfp, codec=codec,
                                        laparams=laparams,
                                        showpageno=showpageno,
                                        imagewriter=imagewriter)
        elif outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams, imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            # NOTE(review): ``usage`` is not defined in this function; the
            # parser's ``choices`` should make this branch unreachable.
            return usage()

        try:
            for fname in options.pdffile:
                DEBUG(2, 'processing', fname)
                # Each input is closed as soon as it is processed, also
                # when a page fails.
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Close only what was opened here: closing sys.stdout would break
        # any later output of the process.
        if close_outfp:
            outfp.close()
        else:
            outfp.flush()

    DEBUG(2, 'finished.')
    return
def main(argv):
    """Convert PDFs with pdfminer, then load CV details into the database.

    After the conversion the function reads ``cv.txt``, drops every line that
    contains one of the section-heading keywords, writes the remaining lines
    to ``cv_new.txt`` and inserts the first three ``key: value`` values found
    there into the ``entries`` table through the module-level ``mycursor`` /
    ``mydb`` objects.

    Args:
        argv: Argument vector; ``argv[0]`` is the program name and the rest is
            parsed with ``getopt``.

    Returns:
        100 when the arguments are invalid (usage is printed), otherwise None.

    Raises:
        IndexError: If ``cv_new.txt`` holds fewer than three ``key: value``
            lines.
    """
    import getopt

    def usage():
        print(
            f'usage: {argv[0]} [-P password] [-o output] [-t text|html|xml|tag]'
            ' [-O output_dir] [-c encoding] [-s scale] [-R rotation]'
            ' [-Y normal|loose|exact] [-p pagenos] [-m maxpages]'
            ' [-S] [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin]'
            ' [-W word_margin] [-F boxes_flow] [-d] input.pdf ...')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dP:o:t:O:c:s:R:Y:p:m:SCnAVM:W:L:F:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input options
    password = b''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    encoding = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # "-n" is remembered as a flag instead of discarding laparams at once, so
    # a later -A/-V/-M/-W/-L/-F no longer fails with AttributeError on None.
    use_layout = True
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-P':
            password = v.encode('ascii')
        elif k == '-o':
            outfile = v
        elif k == '-t':
            outtype = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-c':
            encoding = v
        elif k == '-s':
            scale = float(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-S':
            stripcontrol = True
        elif k == '-C':
            caching = False
        elif k == '-n':
            use_layout = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
    if not use_layout:
        laparams = None

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # No explicit type: infer it from the output file extension.
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    outfp = open(outfile, 'w', encoding=encoding) if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp)
        else:
            return usage()
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # sys.stdout must stay open: the record-count print below uses it.
        if outfp is not sys.stdout:
            outfp.close()

    # Lines containing any of these section headings are dropped.
    bad_words = [
        'Personal',
        'Information',
        'Projects',
        'Internship',
        'Technologies'
    ]
    with open('cv.txt') as oldfile, open('cv_new.txt', 'w') as newfile:
        for line in oldfile:
            if not any(bad_word in line for bad_word in bad_words):
                newfile.write(line)

    with open("cv_new.txt", "r") as cleaned:
        lines = cleaned.read().split('\n')
    # Discard empty lines, single-space lines and bare form feeds.
    lines = [ln for ln in lines if ln not in ("", " ", "\x0c")]

    # Keep the value part of every "key: value" line.
    details = []
    for ln in lines:
        parts = ln.split(': ')
        if len(parts) > 1:
            details.append(parts[1])
    if len(details) < 3:
        raise IndexError(
            "cv_new.txt must contain at least three 'key: value' lines, "
            "found %d" % len(details))

    sql = "INSERT INTO entries (name, post, exp) VALUES (%s, %s, %s)"
    val = (details[0], details[1], details[2])
    mycursor.execute(sql, val)
    mydb.commit()
    print(mycursor.rowcount, "record inserted.")
    return
def convert_pdf_to_text(self):
    """Convert the PDF at ``self.input_path`` to text in ``self.output_path``.

    Pages laid out in two columns are supported: text boxes are collected
    into a left buffer (``self.text_l``) and a right buffer (``self.text_r``)
    depending on which side of ``self.border`` they fall, and the buffers are
    flushed through ``self.write2text``.
    """
    layout_params = LAParams()  # layout-analysis parameter instance
    # boxes_flow: -1.0 (only horizontal position matters) to +1.0 (only
    # vertical position matters); default 0.5
    layout_params.boxes_flow = None
    layout_params.word_margin = 0.2  # default 0.1
    layout_params.char_margin = 2.0  # default 2.0
    layout_params.line_margin = 0    # default 0.5

    def reading_order(box):
        # Top edge (y1) descending, then left edge (x0) ascending.
        return (-box.y1, box.x0)

    # Open the output file; an existing file is overwritten.
    with open(self.output_path, "w", encoding="utf-8") as out:
        # Initialisation
        self.text_l = ""  # text of the left column
        self.text_r = ""  # text of the right column
        print("Analyzing from {} page to {} page(0:to last)".format(
            self.start_page, self.last_page))

        # Read the pages and extract their text (maxpages=0 means all pages).
        for page in extract_pages(
                self.input_path, maxpages=0, laparams=layout_params):
            # Page selection is done here because extract_pages cannot be
            # given only a starting page.
            if page.pageid < self.start_page:
                continue  # skip pages before the requested start page
            if self.last_page and self.last_page < page.pageid:
                break  # stop after the requested last page

            # Derive the column boundary from the page width (half of it).
            if self.border == 0:
                self.border = int(page.width / 2)
            if page.pageid == self.start_page:
                print("Check on page #{}".format(page.pageid))
                print("Page Info width:{}, heght:{}".format(
                    page.width, page.height))
                print("Calc result border: {}, footer: {}".format(
                    self.border, self.footer))

            # Flatten nested layout elements into LTTextBox items and visit
            # them in reading order.
            boxes = sorted(self.flatten_lttext(page, LTTextBox),
                           key=reading_order)
            for box in boxes:
                # Text in the footer or header area is not extracted.
                if box.y1 < self.footer or box.y0 > self.header:
                    continue
                text = box.get_text()
                if box.x1 < self.border:
                    # The whole box lies in the left column.
                    self.text_l += text
                elif box.x0 >= self.border:
                    # The whole box lies in the right column.
                    self.text_r += text
                else:
                    # The box straddles the boundary: flush first if the
                    # right column already holds text.
                    if self.text_r:
                        self.write2text(out)
                    self.text_l += text

            # Write out once a page has been processed.
            self.write2text(out)
def main(argv):
    """Command-line entry point: convert PDF files to text, HTML, XML or tags.

    Args:
        argv: Argument vector; ``argv[0]`` is the program name and the rest is
            parsed with ``getopt``.

    Returns:
        100 when the arguments are invalid (usage is printed), otherwise None.
    """
    import getopt

    def usage():
        print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] '
               '[-n] [-A] [-D writing_mode] [-M char_margin] [-L line_margin] [-W word_margin] '
               '[-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAD:M:L:W:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    outdir = None
    codec = 'utf-8'
    scale = 1
    laparams = LAParams()
    # "-n" is recorded as a flag so that a layout option given after it does
    # not fail with AttributeError on None.
    use_layout = True
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-n':
            use_layout = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-D':
            laparams.writing_mode = v
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if not use_layout:
        laparams = None

    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrc = PDFResourceManager()
    if not outtype:
        # No explicit type: infer it from the output file extension.
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # open() replaces the Python-2-only file() builtin.
    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams,
                                  outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale,
                                   laparams=laparams, outdir=outdir)
        elif outtype == 'tag':
            device = TagExtractor(rsrc, outfp, codec=codec)
        else:
            return usage()
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages,
                                password=password)
        finally:
            device.close()
    finally:
        # Leave sys.stdout open for the rest of the process.
        if outfp is not sys.stdout:
            outfp.close()
    return
if __name__ == "__main__": doc = open_pdf(sys.argv[1]) Point = Route = False pages = page_count(doc) if pages == 68: Point = True elif pages == 1143: Route = True else: sys.stderr.write("PDF file not of recognised (NRG) format\n") sys.exit(1) rsrcmgr = PDFResourceManager() laparams = LAParams() laparams.line_margin = 0 # Forces every line to be absolutely positioned laparams.word_margin = 0.2 # Prevents space before narrow characters device = PDFPageAggregator(rsrcmgr, laparams=laparams) interpreter = PDFPageInterpreter(rsrcmgr, device) writer = BufferedWriter(sys.stdout) for (pageno, page) in enumerate(doc.get_pages()): interpreter.process_page(page) layout = device.get_result() # returns LTPage (text, other) = fsplit(lambda obj: isinstance(obj, LTText), layout) header_y = 0 if Point: # Locates bottom of header separator (lowest non-text < 10px height)
def pdfminerr(argv):
    """Convert PDF files to text, HTML, XML or tags (pdf2txt-style options).

    Args:
        argv: Argument vector; ``argv[0]`` is skipped and the rest is parsed
            with ``getopt``.

    Returns:
        100 when the arguments are invalid (usage is printed), otherwise None.
    """
    global pdfminerr, install
    import getopt

    def usage():
        print ("usage: just put the path to the pdf file in pdf.txt, and make sure you create a seprate folder and put nothing there except for this repository.")
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    imagewriter = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # "-n" is recorded as a flag so that a layout option given after it does
    # not fail with AttributeError on None.
    use_layout = True
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            use_layout = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if not use_layout:
        laparams = None

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # No explicit type: infer it from the output file extension.
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # open() replaces the Python-2-only file() builtin.
    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams, imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams, imagewriter=imagewriter)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            return usage()
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos,
                                maxpages=maxpages, password=password,
                                caching=caching, check_extractable=True)
        finally:
            device.close()
    finally:
        # Leave sys.stdout open for the rest of the process.
        if outfp is not sys.stdout:
            outfp.close()
    return
def convert_pdf(path, outtype='txt', opts=None):
    """Convert the PDF at *path* and write the result next to it.

    The default output file name is *path* with its last three characters
    replaced by *outtype* (so a three-letter extension is assumed).

    Args:
        path: Path of the PDF file to convert.
        outtype: Output type: ``'txt'``, ``'xml'``, ``'html'`` or ``'tag'``.
        opts: Optional pdf2txt-style options, either an iterable of
            ``(flag, value)`` pairs or a mapping ``{flag: value}``.

    Returns:
        None.

    Raises:
        ValueError: If the resolved output type is not supported.
    """
    outfile = path[:-3] + outtype
    outdir = '/'.join(path.split('/')[:-1])

    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    codec = 'utf-8'
    scale = 1
    laparams = LAParams()
    # "-n" is recorded as a flag so that a layout option given after it does
    # not fail with AttributeError on None.
    use_layout = True

    # Iterating a dict directly yields only its keys, so a mapping is
    # expanded into (flag, value) pairs first.
    if not opts:
        option_pairs = ()
    elif hasattr(opts, 'items'):
        option_pairs = opts.items()
    else:
        option_pairs = opts

    for (k, v) in option_pairs:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-n':
            use_layout = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-D':
            laparams.writing_mode = v
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if not use_layout:
        laparams = None

    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager()
    if not outtype:
        # No explicit type: infer it from the output file extension.
        outtype = 'txt'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # open() replaces the Python-2-only file() builtin.
    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'txt':
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   laparams=laparams, outdir=outdir)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            # The original called usage(), which this function never defines.
            raise ValueError('unsupported output type: %r' % (outtype,))
        try:
            with open(path, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                            password=password)
        finally:
            device.close()
    finally:
        # Leave sys.stdout open for the rest of the process.
        if outfp is not sys.stdout:
            outfp.close()
    return
def pdf2txt(argv):
    """Convert PDF files to text, HTML, XML or tags (pdf2txt-style options).

    Args:
        argv: Options and file names. Unlike a raw ``sys.argv`` the whole
            list is parsed, so it must not start with a program name.

    Returns:
        None.

    Raises:
        getopt.GetoptError: If an option is not recognised.
        ValueError: If the resolved output type is not supported.
    """
    import getopt
    (opts, args) = getopt.getopt(argv[0:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')

    # debug option
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # "-n" is recorded as a flag so that a layout option given after it does
    # not fail with AttributeError on None.
    use_layout = True
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            use_layout = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if not use_layout:
        laparams = None

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # No explicit type: infer it from the output file extension.
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # open() replaces the Python-2-only file() builtin.
    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   outdir=outdir)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            # The original called usage(), which this function never defines.
            raise ValueError('unsupported output type: %r' % (outtype,))
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos,
                                maxpages=maxpages, password=password,
                                caching=caching, check_extractable=True)
        finally:
            device.close()
    finally:
        # Leave sys.stdout open for the rest of the process.
        if outfp is not sys.stdout:
            outfp.close()
    return
import string
from string import punctuation
from io import BytesIO
from openpyxl import load_workbook
from bs4 import BeautifulSoup

# Script: render a PDF to HTML in memory with pdfminer and parse the result
# with Beautiful Soup.

# Input PDF (hard-coded path), opened in binary mode.
# NOTE(review): this handle is never closed in the visible code.
document = open('/Users/anan/Desktop/pdf/Part1_test.pdf', 'rb')
# Create the PDF resource manager and the in-memory buffer that receives the
# converter's output.
rsrcmgr = PDFResourceManager()
retster = BytesIO()
# Set parameters for layout analysis.
laparams = LAParams()
laparams.char_margin = 4  # original tuning note: "5 4best"
laparams.line_margin = 5  # original tuning note: "test:3 part:5"
laparams.word_margin = 4
laparams.boxes_flow = 0.5
# Create the HTMLConverter device that writes into the buffer.
device = HTMLConverter(rsrcmgr, retster, codec='utf-8', laparams=laparams)
# Create the PDF page interpreter.
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.get_pages(document):
    interpreter.process_page(page)
# NOTE(review): the buffer is read without calling device.close() first;
# confirm the converter has nothing left to write at this point.
content = retster.getvalue().decode()
# Parse the HTML with Beautiful Soup.
soup = BeautifulSoup(content, 'html.parser')
def readPDF2HTML(pdfFile, opts=None):
    """Render an open PDF file object to HTML and return the HTML.

    Args:
        pdfFile: File-like object whose ``read()`` returns the PDF data.
        opts: Optional pdf2txt-style options, either an iterable of
            ``(flag, value)`` pairs or a mapping ``{flag: value}``. The flags
            ``-d``, ``-o``, ``-O`` and ``-t`` are accepted but have no effect
            here.

    Returns:
        The content of the output buffer after the converter is closed.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    # Defaults are set before the option loop. The original referenced
    # ``debug`` and ``pagenos`` before assignment and overwrote ``-c``.
    password = ''
    pagenos = set()
    maxpages = 0
    codec = 'utf-8'
    scale = 1
    layoutmode = 'normal'
    laparams = LAParams()
    # "-n" is recorded as a flag so that a layout option given after it does
    # not fail with AttributeError on None.
    use_layout = True

    # Iterating a dict directly yields only its keys, so a mapping is
    # expanded into (flag, value) pairs first.
    if not opts:
        option_pairs = ()
    elif hasattr(opts, 'items'):
        option_pairs = opts.items()
    else:
        option_pairs = opts

    for (k, v) in option_pairs:
        if k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-n':
            use_layout = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if not use_layout:
        laparams = None

    # Copy the PDF into memory and create the output buffer.
    # NOTE(review): StringIO is whatever the file imports; under Python 3 a
    # binary PDF needs io.BytesIO here. Confirm against the import block.
    fp = StringIO(pdfFile.read())
    retstr = StringIO()
    try:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create the PDF document, supplying the password if one was given.
        document = PDFDocument(parser, password)
        # Refuse documents that do not allow text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Create the HTML converter device writing into the buffer.
        device = HTMLConverter(rsrcmgr, retstr, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams)
        try:
            # Create a PDF interpreter object and process each page.
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password):
                interpreter.process_page(page)
        finally:
            # Close the converter before reading the buffer, so anything it
            # writes on close is part of the result.
            device.close()
        content = retstr.getvalue()
    finally:
        fp.close()
        retstr.close()
    return content
def main(argv):
    """Command-line entry point: convert PDF files to text, HTML, XML or tags.

    Args:
        argv: Argument vector; ``argv[0]`` is the program name and the rest is
            parsed with ``getopt``.

    Returns:
        100 when the arguments are invalid (usage is printed), otherwise None.
    """
    import getopt

    def usage():
        print(
            "usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]"
            " [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]"
            " [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation] [-S]"
            " [-t text|html|xml|tag] [-c codec] [-s scale]"
            " file ..." % argv[0]
        )
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], "dp:m:P:o:CnAVM:L:W:F:Y:O:R:St:c:s:")
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # input options
    password = b""
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = "normal"
    codec = "utf-8"
    scale = 1
    caching = True
    laparams = LAParams()
    # "-n" is recorded as a flag so that a layout option given after it does
    # not fail with AttributeError on None.
    use_layout = True
    for (k, v) in opts:
        if k == "-d":
            logging.getLogger().setLevel(logging.DEBUG)
        elif k == "-p":
            pagenos.update(int(x) - 1 for x in v.split(","))
        elif k == "-m":
            maxpages = int(v)
        elif k == "-P":
            password = v
        elif k == "-o":
            outfile = v
        elif k == "-C":
            caching = False
        elif k == "-n":
            use_layout = False
        elif k == "-A":
            laparams.all_texts = True
        elif k == "-V":
            laparams.detect_vertical = True
        elif k == "-M":
            laparams.char_margin = float(v)
        elif k == "-L":
            laparams.line_margin = float(v)
        elif k == "-W":
            laparams.word_margin = float(v)
        elif k == "-F":
            laparams.boxes_flow = float(v)
        elif k == "-Y":
            layoutmode = v
        elif k == "-O":
            imagewriter = ImageWriter(v)
        elif k == "-R":
            rotation = int(v)
        elif k == "-S":
            stripcontrol = True
        elif k == "-t":
            outtype = v
        elif k == "-c":
            codec = v
        elif k == "-s":
            scale = float(v)
    if not use_layout:
        laparams = None

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # No explicit type: infer it from the output file extension.
        outtype = "text"
        if outfile:
            if outfile.endswith((".htm", ".html")):
                outtype = "html"
            elif outfile.endswith(".xml"):
                outtype = "xml"
            elif outfile.endswith(".tag"):
                outtype = "tag"

    if outfile:
        outfp = open(outfile, "wb")
    else:
        outfp = sys.stdout
        # A text-mode stdout encodes by itself, so no codec is passed on.
        if outfp.encoding is not None:
            codec = None
    try:
        if outtype == "text":
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams, imagewriter=imagewriter)
        elif outtype == "xml":
            device = XMLConverter(
                rsrcmgr, outfp, codec=codec, laparams=laparams,
                imagewriter=imagewriter, stripcontrol=stripcontrol
            )
        elif outtype == "html":
            device = HTMLConverter(
                rsrcmgr, outfp, codec=codec, scale=scale,
                layoutmode=layoutmode, laparams=laparams,
                imagewriter=imagewriter
            )
        elif outtype == "tag":
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            return usage()
        try:
            for fname in args:
                with open(fname, "rb") as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(
                        fp, pagenos, maxpages=maxpages, password=password,
                        caching=caching, check_extractable=True
                    ):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Leave sys.stdout open for the rest of the process.
        if outfp is not sys.stdout:
            outfp.close()
    return
##ATTEMPT 1 # Create a PDF parser object associated with the file object. #parser = PDFParser(open_file) # Create a PDF document object that stores the document structure. #doc = PDFDocument(parser) # Connect the parser and document objects. #print parser.nextline() #print parser.nextline() #print parser.nextline() ##ATTEMPT 2 #Code from pdf2txt.py laparams = LAParams() laparams.char_margin = 2.0 laparams.line_margin=0.5 laparams.word_margin=0.1 laparams.all_texts=False rsrcmgr = PDFResourceManager() device = TextConverter(rsrcmgr, fp_out, codec='utf-8', laparams=laparams) interpreter = PDFPageInterpreter(rsrcmgr, device) pdf_pages = PDFPage.get_pages(fp_in, set()) pagenum = 0 pagelim = 3 for page in pdf_pages: pagenum += 1 if pagenum > pagelim: continue print "Transcribing page " + str(pagenum) + " from PDF to text" interpreter.process_page(page)
def main(fname, k, v):
    """Convert one PDF file, applying a single pdf2txt-style option.

    Args:
        fname: Path of the PDF file to convert.
        k: Option flag such as ``'-o'`` or ``'-t'``; an unrecognised flag
            leaves every default in place.
        v: Value belonging to *k* (ignored by flags that take no value).

    Returns:
        None.

    Raises:
        ValueError: If the resolved output type is not supported.
    """
    # debug option
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()

    if k == '-d':
        debug += 1
    elif k == '-p':
        pagenos.update(int(x) - 1 for x in v.split(','))
    elif k == '-m':
        maxpages = int(v)
    elif k == '-P':
        password = v
    elif k == '-o':
        outfile = v
    elif k == '-C':
        caching = False
    elif k == '-n':
        laparams = None
    elif k == '-A':
        laparams.all_texts = True
    elif k == '-V':
        laparams.detect_vertical = True
    elif k == '-M':
        laparams.char_margin = float(v)
    elif k == '-L':
        laparams.line_margin = float(v)
    elif k == '-W':
        laparams.word_margin = float(v)
    elif k == '-F':
        laparams.boxes_flow = float(v)
    elif k == '-Y':
        layoutmode = v
    elif k == '-O':
        imagewriter = ImageWriter(v)
    elif k == '-R':
        rotation = int(v)
    elif k == '-S':
        stripcontrol = True
    elif k == '-t':
        outtype = v
    elif k == '-c':
        codec = v
    elif k == '-s':
        scale = float(v)

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # No explicit type: infer it from the output file extension.
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # open() replaces the Python-2-only file() builtin.
    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams, imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams, imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            # The original called usage(), which this function never defines.
            raise ValueError('unsupported output type: %r' % (outtype,))
        try:
            with open(fname, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Leave sys.stdout open for the rest of the process.
        if outfp is not sys.stdout:
            outfp.close()
    return
def main(argv):
    """Command-line entry point: convert PDF files to text, HTML, XML or tags.

    Args:
        argv: Argument vector; ``argv[0]`` is the program name and the rest is
            parsed with ``getopt``.

    Returns:
        100 when the arguments are invalid (usage is printed), otherwise None.
    """
    import getopt

    def usage():
        print(
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] '
            '[-n] [-A] [-D writing_mode] [-M char_margin] [-L line_margin] [-W word_margin] '
            '[-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAD:M:L:W:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    outdir = None
    codec = 'utf-8'
    scale = 1
    laparams = LAParams()
    # Layout analysis is switched off only after all options are read, so
    # "-n" followed by -A/-D/-M/-L/-W no longer raises AttributeError.
    layout_enabled = True
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-n':
            layout_enabled = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-D':
            laparams.writing_mode = v
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if not layout_enabled:
        laparams = None

    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrc = PDFResourceManager()
    if not outtype:
        # No explicit type: infer it from the output file extension.
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # open() replaces the Python-2-only file() builtin.
    if outfile:
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams,
                                  outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale,
                                   laparams=laparams, outdir=outdir)
        elif outtype == 'tag':
            device = TagExtractor(rsrc, outfp, codec=codec)
        else:
            return usage()
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages,
                                password=password)
        finally:
            device.close()
    finally:
        # Leave sys.stdout open for the rest of the process.
        if outfp is not sys.stdout:
            outfp.close()
    return
from pdfminer.layout import LTTextBox
from string import punctuation
import sys
import io
from string import punctuation
import re

# Script: run pdfminer layout analysis over every page of a PDF.

# Input PDF (hard-coded path), opened in binary mode.
# NOTE(review): this handle is never closed in the visible code.
document = open('C:/Users/Vincent/Dropbox/writeProgram/python/20180509.pdf', 'rb')
# Create the PDF resource manager.
rsrcmgr = PDFResourceManager()
# Set parameters for layout analysis.
laparams = LAParams()
laparams.char_margin = 3.2  # original tuning note: "5 4best"
laparams.line_margin = 8  # original tuning note: "5-8"
laparams.word_margin = 10
laparams.boxes_flow = 0.5
# Create a PDF page aggregator object.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
# Create the PDF page interpreter.
interpreter = PDFPageInterpreter(rsrcmgr , device)
checkPoint = 0  # initial state; used to check each section
# Section-heading markers.
partOne = "General Information"
partTwo = "Deal History"
partThree = "Investors ("
# Translated original note: "layout" is an LTPage object holding the objects
# parsed from the page, typically LTTextBox, LTFigure, LTImage,
# LTTextBoxHorizontal and so on; to obtain text, read the object's text
# attribute. (No "layout" variable is assigned in this excerpt.)
for page in PDFPage.get_pages(document):
    interpreter.process_page(page)
#fp = open(r'C:/Users/ashmaro1/Documents/GitHub/Text-Analysis/PDFnOCR/data/POxca-000078-AN168968.pdf', 'rb') #fp = open(r'C:/Users/ashmaro1/Documents/GitHub/Text-Analysis/PDFnOCR/data/POxca-000510-BX425914.pdf', 'rb') #fp = open(r'C:/Users/ashmaro1/Documents/GitHub/Text-Analysis/PDFnOCR/data/POxca-000078-AN168907.pdf', 'rb') #fp = open(r'C:/Users/ashmaro1/Documents/GitHub/Text-Analysis/PDFnOCR/data/POxca-000078-AN168907.pdf', 'rb') #fp = open(r'S:/Bhavani/Aaron/POxca-000052-R201631.pdf', 'rb') fp = open(r'C:/Users/ashmaro1/Documents/_Projects/Glencore/POxca-000052-R201631.pdf','rb') parser = PDFParser(fp) doc = PDFDocument(parser) #doc.initialize('password') # leave empty for no password rsrcmgr = PDFResourceManager() laparams = LAParams() laparams.char_margin = float('1.1') #too small and it splits the description, too big and Quantity-Unit-Part number are not separated: 1.4 seems to work laparams.line_margin = float('0.8') device = PDFPageDetailedAggregator(rsrcmgr, laparams=laparams) interpreter = PDFPageInterpreter(rsrcmgr, device) for page in PDFPage.create_pages(doc): interpreter.process_page(page) # receive the LTPage object for this page device.get_result() print(device.rows) df = pd.DataFrame(device.rows, columns=['Page', 'x', 'y', 'c1','c2','String']) # create text rows from 'y' coordinate data
def main(argv):
    """Command-line entry point: convert PDF files to text, HTML, XML or tags.

    Args:
        argv: Argument vector; ``argv[0]`` is the program name and the rest is
            parsed with ``getopt``.

    Returns:
        100 when the arguments are invalid (usage is printed), otherwise None.
    """
    import getopt

    def usage():
        print(
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
            ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
            ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
            ' [-t text|html|xml|tag] [-c codec] [-s scale]'
            ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # "-n" is recorded as a flag so that a layout option given after it does
    # not fail with AttributeError on None.
    use_layout = True
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            use_layout = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if not use_layout:
        laparams = None

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # No explicit type: infer it from the output file extension.
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # open() replaces the Python-2-only file() builtin.
    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams, imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams, imagewriter=imagewriter)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            return usage()
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Leave sys.stdout open for the rest of the process.
        if outfp is not sys.stdout:
            outfp.close()
    return
def main(argv):
    """Convert PDF files to text, HTML, XML or tag output (command-line entry).

    Args:
        argv: Full argument vector. ``argv[0]`` is the program name (used in
            the usage text); the rest are getopt-style short options followed
            by one or more input PDF paths.

    Returns:
        100 (after printing the usage text) when option parsing fails, no
        input file is given, or the output type is not one of
        text/html/xml/tag; otherwise None once every input was processed.
    """
    # getopt extracts the options that accompany the command line.
    import getopt

    def usage():
        # Prints a usage example; called when arguments are missing or wrong.
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
              '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
              '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
        return 100

    try:
        # getopt.getopt(argument list, "short options", [long options]).
        # A colon after a short option letter means the option requires a
        # value: p, m, P, o, M, L, W, F, Y, O, t, c and s all take one.
        # Returns the parsed (opts, args) pair.
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input options
    password = ''       # -P
    pagenos = set()     # -p
    maxpages = 0        # -m
    # output options
    outfile = None      # -o, output file
    outtype = None      # -t, output type
    outdir = None       # -O, output directory
    layoutmode = 'normal'   # -Y
    codec = 'utf-8'     # -c
    scale = 1           # -s
    caching = True
    # Layout options (-A, -V, -M, -L, -W, -F) are recorded here and applied
    # after the loop so that -n can be combined with them in any order.
    # Setting them directly on laparams raised AttributeError once -n had
    # replaced it with None.
    layout_enabled = True
    layout_settings = {}
    for (k, v) in opts:
        if k == '-d': debug += 1
        # page numbers from the option are stored shifted down by one
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': layout_enabled = False
        elif k == '-A': layout_settings['all_texts'] = True
        elif k == '-V': layout_settings['detect_vertical'] = True
        elif k == '-M': layout_settings['char_margin'] = float(v)
        elif k == '-L': layout_settings['line_margin'] = float(v)
        elif k == '-W': layout_settings['word_margin'] = float(v)
        elif k == '-F': layout_settings['boxes_flow'] = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)

    laparams = None
    if layout_enabled:
        laparams = LAParams()
        for (attr, value) in layout_settings.items():
            setattr(laparams, attr, value)

    # Propagate the debug level to the pdfminer classes.
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)

    # Determine the output format: text by default, or taken from the output
    # file extension when -t is absent.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outtype not in ('text', 'xml', 'html', 'tag'):
        # Reject unknown types before the output file is created/truncated.
        return usage()

    # open() replaces the Python-2-only file() builtin.
    if outfile:
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    try:
        if outtype == 'text':
            # TextConverter is not given outdir (it apparently cannot take it).
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   outdir=outdir)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                                password=password, caching=caching,
                                check_extractable=True)
        finally:
            device.close()
    finally:
        # Only close what was opened here; sys.stdout stays usable.
        if outfp is sys.stdout:
            outfp.flush()
        else:
            outfp.close()
    return
def main(argv):
    """Convert PDF files to text, HTML, XML or tag output (command-line entry).

    Args:
        argv: Full argument vector. ``argv[0]`` is the program name (used in
            the usage text); the rest are getopt-style short options followed
            by one or more input PDF paths.

    Returns:
        100 (after printing the usage text) when option parsing fails, no
        input file is given, or the output type is not one of
        text/html/xml/tag; otherwise None once every input was processed.
    """
    import getopt

    def usage():
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
              ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
              ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
              ' [-t text|html|xml|tag] [-c codec] [-s scale]'
              ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    # Layout options are recorded here and applied after the loop so that -n
    # can be combined with -A/-V/-M/-L/-W/-F in any order. Setting them
    # directly on laparams raised AttributeError once -n had replaced it
    # with None.
    layout_enabled = True
    layout_settings = {}
    for (k, v) in opts:
        if k == '-d': debug += 1
        # page numbers from the option are stored shifted down by one
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': layout_enabled = False
        elif k == '-A': layout_settings['all_texts'] = True
        elif k == '-V': layout_settings['detect_vertical'] = True
        elif k == '-M': layout_settings['char_margin'] = float(v)
        elif k == '-L': layout_settings['line_margin'] = float(v)
        elif k == '-W': layout_settings['word_margin'] = float(v)
        elif k == '-F': layout_settings['boxes_flow'] = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-R': rotation = int(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)

    laparams = None
    if layout_enabled:
        laparams = LAParams()
        for (attr, value) in layout_settings.items():
            setattr(laparams, attr, value)

    # Propagate the debug level to the pdfminer classes.
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)

    # Without -t the type defaults to text, unless the output file name
    # carries a recognised extension.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outtype not in ('text', 'xml', 'html', 'tag'):
        # Reject unknown types before the output file is created/truncated.
        return usage()

    # open() replaces the Python-2-only file() builtin.
    if outfile:
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  imagewriter=imagewriter)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                                  password=password, caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Only close what was opened here; sys.stdout stays usable.
        if outfp is sys.stdout:
            outfp.flush()
        else:
            outfp.close()
    return
def main(argv=None):
    """Convert PDF input to text, HTML, XML or tag output (argparse entry point).

    Args:
        argv: Argument list handed to ``ArgumentParser.parse_args``; None
            makes argparse read ``sys.argv[1:]``.

    The output type comes from ``-t`` or, when that is absent, from the
    extension of the ``-o`` file name; anything unrecognised falls back to
    plain text.
    """
    parser = argparse.ArgumentParser(description='Convert PDF into text.')
    # With no positional argument argparse hands back the default unchanged,
    # so it must be a list: a bare sys.stdin would be iterated line by line
    # in the processing loop. PDF data is binary, hence the underlying byte
    # stream where the platform provides one.
    parser.add_argument('file', nargs='*', type=argparse.FileType('rb'),
                        default=[getattr(sys.stdin, 'buffer', sys.stdin)],
                        help='file(s) to convert')
    parser.add_argument('-C', '--nocache', dest='cache', action='store_false',
                        help='prevent object caching (slower)')
    parser.add_argument('-l', metavar='level', default='warn',
                        help='logging level (warn, info, debug)')
    parser.add_argument('-p', metavar='page', nargs='+', default=[], type=int,
                        help='page number(s) (space separated)')
    parser.add_argument('-m', metavar='maxpages', default=0, type=int,
                        help='maximum number of pages to extract')
    parser.add_argument('-P', metavar='password', default='', help='pdf password')
    parser.add_argument('-o', metavar='outfile', type=argparse.FileType('w'),
                        default=sys.stdout,
                        help='output file name (default: stdout)')
    parser.add_argument('-O', metavar='directory', type=ImageWriter,
                        help='extract images and save to directory')
    parser.add_argument('-t', metavar='outtype',
                        help='output type (text, html, xml, tag)')
    parser.add_argument('-c', metavar='codec', default='utf-8',
                        help='output text encoding (default: %(default)s)')

    lagroup = parser.add_argument_group(title='layout analysis')
    lagroup.add_argument('-n', action='store_true', help='disable layout analysis')
    lagroup.add_argument('-A', action='store_true',
                         help='force layout analysis on all text')
    lagroup.add_argument('-V', action='store_true', help='detect vertical text')
    lagroup.add_argument('-M', metavar='char_margin', type=float,
                         help='custom character margin')
    lagroup.add_argument('-L', metavar='line_margin', type=float,
                         help='custom line margin')
    lagroup.add_argument('-W', metavar='word_margin', type=float,
                         help='custom word margin')
    lagroup.add_argument('-F', metavar='boxes_flow', type=float,
                         help='custom boxes flow')
    lagroup.add_argument('-Y', metavar='layout_mode', default='normal',
                         help='layout mode for HTML (normal, exact, loose)')
    lagroup.add_argument('-s', metavar='scale', default=1, type=float,
                         help='output scaling for HTML')

    args = parser.parse_args(argv)

    logging.basicConfig()
    logging.getLogger('pdfminer').setLevel(args.l.upper())

    if args.n:
        laparams = None
    else:
        laparams = LAParams()
        laparams.all_texts = args.A
        laparams.detect_vertical = args.V
        # Compare against None rather than truthiness: an explicit 0
        # (e.g. ``-F 0``) is a real value and must not be dropped.
        if args.M is not None:
            laparams.char_margin = args.M
        if args.L is not None:
            laparams.line_margin = args.L
        if args.W is not None:
            laparams.word_margin = args.W
        if args.F is not None:
            laparams.boxes_flow = args.F

    rsrcmgr = PDFResourceManager(caching=args.cache)

    outtype = args.t
    if not outtype and args.o:
        outname = args.o.name
        if outname.endswith('.htm') or outname.endswith('.html'):
            outtype = 'html'
        elif outname.endswith('.xml'):
            outtype = 'xml'
        elif outname.endswith('.tag'):
            outtype = 'tag'

    try:
        if outtype == 'xml':
            device = XMLConverter(rsrcmgr, args.o, codec=args.c, laparams=laparams,
                                  imagewriter=args.O)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, args.o, codec=args.c, scale=args.s,
                                   layoutmode=args.Y, laparams=laparams,
                                   imagewriter=args.O)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, args.o, codec=args.c)
        else:
            device = TextConverter(rsrcmgr, args.o, codec=args.c, laparams=laparams,
                                   imagewriter=args.O)
        try:
            for fp in args.file:
                try:
                    # -p values are shifted down by one before being passed on.
                    process_pdf(rsrcmgr, device, fp, [i - 1 for i in args.p],
                                maxpages=args.m, password=args.P,
                                caching=args.cache, check_extractable=True)
                finally:
                    fp.close()
        finally:
            device.close()
    finally:
        if args.o is not sys.stdout:
            args.o.close()
def pdf2txt(argv):
    """Convert PDF files to text, HTML, XML or tag output.

    Args:
        argv: Option/argument list WITHOUT a leading program name (parsing
            starts at ``argv[0]``): getopt-style short options followed by
            the input PDF paths.

    Returns:
        None after processing every input. An invalid option raises
        ``getopt.GetoptError`` because it is not caught here.
    """
    import getopt
    (opts, args) = getopt.getopt(argv[0:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')

    # debug option
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    # Layout options are recorded here and applied after the loop so that -n
    # can be combined with -A/-V/-M/-L/-W/-F in any order. Setting them
    # directly on laparams raised AttributeError once -n had replaced it
    # with None.
    layout_enabled = True
    layout_settings = {}
    for (k, v) in opts:
        if k == '-d': debug += 1
        # page numbers from the option are stored shifted down by one
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': layout_enabled = False
        elif k == '-A': layout_settings['all_texts'] = True
        elif k == '-V': layout_settings['detect_vertical'] = True
        elif k == '-M': layout_settings['char_margin'] = float(v)
        elif k == '-L': layout_settings['line_margin'] = float(v)
        elif k == '-W': layout_settings['word_margin'] = float(v)
        elif k == '-F': layout_settings['boxes_flow'] = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)

    laparams = None
    if layout_enabled:
        laparams = LAParams()
        for (attr, value) in layout_settings.items():
            setattr(laparams, attr, value)

    # Propagate the debug level to the pdfminer classes.
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)

    # Without -t the type defaults to text, unless the output file name
    # carries a recognised extension.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outtype not in ('text', 'xml', 'html', 'tag'):
        # Checked before the output file is created so nothing is truncated
        # or leaked for an unknown type.
        # NOTE(review): `usage` is not defined inside this function; this
        # call only works if a module-level `usage` exists - confirm, or
        # replace it with an explicit error.
        return usage()

    # open() replaces the Python-2-only file() builtin.
    if outfile:
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   outdir=outdir)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                                password=password, caching=caching,
                                check_extractable=True)
        finally:
            device.close()
    finally:
        # Only close what was opened here; sys.stdout stays usable.
        if outfp is sys.stdout:
            outfp.flush()
        else:
            outfp.close()
    return
def main(files=None):
    """Extract text from PDF files and write a sectioned ``.txt`` next to each.

    For every input the pages are run through a ``TextCon`` device, and the
    rows it collects are sorted by geometric heuristics into body text,
    figure captions, table captions/data, headers, footers and supporting
    info. The result is written to ``fname[:-4] + '.txt'``.

    Args:
        files: Iterable of PDF paths (each is passed through ``str``). When
            None, the list comes from ``get_datafiles()``.

    Returns:
        None.

    NOTE(review): each row is indexed as row[0] = page number,
    row[1]/row[3] = horizontal extent, row[2]/row[4] = vertical extent,
    row[5] = text. That is presumably (page, x0, y0, x1, y1, text) as
    produced by TextCon - confirm against that class.
    """
    if files is None:
        files = get_datafiles()
    # debug option level
    debug = 0
    # input option
    password = ''
    pagenos = set()
    # pagenos.update( int(x)-1 for x in v.split(',') )
    maxpages = 0
    # output option
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)
    showpageno = True
    # Layout analysis parameters
    laparams = LAParams()
    laparams.all_texts = True
    laparams.detect_vertical = True
    laparams.line_overlap = 0.3  # Line overlap
    laparams.char_margin = 2.0  # Letter spacing
    laparams.line_margin = 0.5  # Line spacing
    laparams.word_margin = 0.1  # Word spacing
    laparams.boxes_flow = 0.5  # +-1.0 how much horizontal vs. vertical
    # position matters for line continuation
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    for fname in files:
        fname = str(fname)
        imagedir = os.path.abspath(os.path.join(os.path.dirname(fname), 'img'))
        # print(imagedir)
        imagewriter = None
        imagewriter = ImageWriter(imagedir)  # output folder for images
        name = os.path.splitext(os.path.basename(fname))[0]
        print(name)
        # Output path: the input path minus its last four characters + '.txt'.
        outfile = fname[:-4] + '.txt'
        device = TextCon(rsrcmgr, laparams=laparams, imagewriter=imagewriter,
                         imagename=name)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # NOTE(review): file() is the Python 2 builtin, and fp is never
        # closed in this function.
        fp = file(fname, 'rb')
        try:
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
        # NOTE(review): the bare except skips the whole file on ANY error
        # (including KeyboardInterrupt) without reporting it, and skips
        # device.close() as well.
        except:
            continue
        rows = [list(row) for row in device.rows]
        pages = max([row[0] for row in rows])
        max_y = max([row[4] for row in rows])
        min_y = min([row[2] for row in rows])
        # max_y2 / min_y2: extreme integer y values that occur at least
        # `pages` times; they are only used for the diagnostic prints below.
        list_0 = [int(row[4]) for row in rows]
        list_1 = []
        [
            list_1.append(obj) for obj in list_0
            if obj not in list_1 and list_0.count(obj) > pages - 1
        ]
        max_y2 = max(list_1)
        list_0 = [int(row[2]) for row in rows]
        list_1 = []
        [
            list_1.append(obj) for obj in list_0
            if obj not in list_1 and list_0.count(obj) > pages - 1
        ]
        min_y2 = min(list_1)
        print('max_ys:', max_y - max_y2)
        print('min_ys:', min_y - min_y2)
        # Get max and min the hard way because of headers: only x values
        # that occur more than 10 times are considered, falling back to all
        # rows when none qualifies.
        list_0 = [int(row[3]) for row in rows]
        list_1 = []
        [
            list_1.append(obj) for obj in list_0
            if obj not in list_1 and list_0.count(obj) > 10
        ]
        if list_1:
            max_x = max(list_1)
        else:
            max_x = max([int(row[3]) for row in device.rows])
        list_0 = [int(row[1]) for row in rows]
        list_1 = []
        [
            list_1.append(obj) for obj in list_0
            if obj not in list_1 and list_0.count(obj) > 10
        ]
        if list_1:
            min_x = min(list_1)
        else:
            # NOTE(review): this fallback reads row[3] although the primary
            # path above uses row[1] - looks like a copy/paste slip; confirm.
            min_x = min([int(row[3]) for row in device.rows])
        # Errors if more pics on one side than the other
        # mid_x = (sum([(float(row[1]) + float(row[3]))/2 for row in
        # device.rows])/len(device.rows))
        mid_x = (max_x + min_x) / 2
        # mid_x = 595/2  # center of A4 at 72px/in Letter would be 612/2
        # Average row height, used as the unit for the spacing thresholds.
        l_height = sum([row[4] - row[2] for row in rows]) / len(rows)
        # print('max_x:', max_x)
        # print('min_x:', min_x)
        # print('mid_x:', mid_x)
        print('l_height:', l_height)
        column2 = []
        lines = []
        pagenumber = 0
        table_caps = ['\n']
        table_data = []
        table = False
        # Pass 1: pull out tables, and split the remaining rows into the
        # first column (`lines`) and second column (`column2`) of each page.
        for i, row in enumerate(rows):
            #l_height = row[4]-row[2]
            # Gap to the previous row; for i == 0 the index -1 refers to the
            # LAST row of the document.
            l_space = rows[i - 1][2] - row[4]
            #print(l_height, l_space, rows[i-1][2], rows[i][4], str(row[5]))
            if row[0] == pagenumber + 1:
                # New page: flush the previous page's second column.
                lines += column2
                column2 = []
                pagenumber += 1
            if row[0] == pagenumber:
                if (max_y - min_y) * 0.95 > l_space > 0.8 * l_height:
                    # capture Table (assuming tables will span all columns)
                    if re.match(r"^table", str(row[5]), re.I):
                        table = True
                        table_caps.append(str(row[5]))
                        table_data.append('\n')
                        table_data.append(str(row[5]))
                        table_data.append('\n')
                        continue
                    else:
                        table = False
                # capture table captions spanning multiple lines
                elif (table_caps[-1] == str(rows[i - 1][5]) and
                      -2 * l_height < l_space < 0.5 * l_height):
                    table_caps[-1] += str(row[5])
                    table_data[-2] += str(row[5])
                    continue
                if table:
                    # capture table data; rows with the same integer row[2]
                    # are joined as tab-separated cells
                    if int(rows[i - 1][2]) == int(rows[i][2]):
                        table_data[-1] += '\t' + str(row[5])
                        continue
                    else:
                        table_data.append(str(row[5]))
                        continue
                elif int(row[1]) > mid_x and ((int(rows[i - 1][1]) < mid_x and int(rows[i - 1][3]) < mid_x) or (int(rows[i - 1][1]) > mid_x and int(rows[i - 1][3]) > mid_x) or rows[i - 1][3] > max_x * 0.9 or l_space > 2.5 * l_height):
                    """ r_space > c_space or previous[3] > max_x * 0.9 or l_space > 2 * l_height):"""
                    if len(column2) > 0:
                        if 1 > (row[2] - column2[-1][2]) > -1:
                            # join if on same line
                            if int(row[1]) < int(column2[-1][1]):
                                column2[-1][5] = row[5] + " " + column2[-1][5]
                            else:
                                column2[-1][5] = column2[-1][5] + " " + row[5]
                        else:
                            column2.append(row)
                    else:
                        column2.append(row)
                    # print(2, str(row[5]))
                else:
                    if len(lines) > 0:
                        if 1 > (row[2] - lines[-1][2]) > -1:
                            # join if on same line
                            if int(row[1]) < int(lines[-1][1]):
                                lines[-1][5] = row[5] + " " + lines[-1][5]
                            else:
                                lines[-1][5] = lines[-1][5] + " " + row[5]
                        else:
                            lines.append(row)
                    else:
                        lines.append(row)
                    # print(3, str(row[5]))
        # add final column
        lines += column2
        fig_caps = ['\n']
        headers = ['\n']
        footers = ['\n']
        supp_info = ['\n']
        new_lines = []
        supp_re = re.compile(
            r"Corresponding author|Electronic mail|email"
            "|E-mail|^doi|doi:|^keywords|^pacs|^apc", re.I)
        # Pass 2: classify each remaining line as figure caption, header,
        # supporting info, footer or body text.
        for i, line in enumerate(lines):
            #l_height = lines[i][4]-lines[i][2]
            l_space = lines[i - 1][2] - lines[i][4]
            l_space_below = 0
            l_space_2below = 0
            if i + 1 < len(lines):
                l_space_below = lines[i][2] - lines[i + 1][4]
            if i + 2 < len(lines):
                l_space_2below = lines[i + 1][2] - lines[i + 2][4]
            fig = fig_caps[-1]
            print(l_space, l_space_below, l_space_2below, lines[i][2],
                  lines[i][4], str(line[5]))
            # capture figure captions spanning multiple lines
            if (fig_caps[-1] == str(lines[i - 1][5]) and
                    -2 * l_height < l_space < 0.5 * l_height):
                fig_caps.append(str(line[5]))
                continue
            # capture headers (up to two lines)
            if (lines[i][2] > max_y * 0.95 and
                    (l_space_below > 0.5 * l_height or
                     l_space_2below > 0.5 * l_height)):
                headers.append('\n')
                headers.append(str(line[5]))
                # NOTE(review): a header that also matches supp_re is
                # appended to `headers` a second time here and then falls
                # through to the supporting-info branch - confirm intent.
                if supp_re.search(str(line[5])):
                    headers.append('\n')
                    headers.append(str(line[5]))
                else:
                    continue
            # capture supporting info
            if supp_re.search(str(line[5])):
                print(str(line[5]))
                supp_info.append('\n')
                supp_info.append(str(line[5]))
                continue
            if (max_y - min_y) * 0.95 > l_space > 0.5 * l_height:
                # capture figure captions
                if re.match(r"^fig", str(line[5]), re.I):
                    fig_caps.append('\n')
                    fig_caps.append(str(line[5]))
                    continue
                # capture footers
                elif lines[i][2] < min_y + max_y * 0.015:
                    footers.append('\n')
                    footers.append(str(line[5]))
                    continue
                else:
                    # Start a new paragraph unless the previous line was
                    # captured as a figure caption or header.
                    string = str(lines[i - 1][5])
                    if (any(string in s for s in fig_caps) or
                            any(string in s for s in headers)):  # or
                        #string == footers[-1] or string == supp_info[-1]):
                        pass
                    else:
                        new_lines.append('\n')
            new_lines.append(str(line[5]))
        # Write body text followed by the captured sections.
        with open(outfile, 'w') as f:
            f.write(' '.join(new_lines))
            f.write('\n\nFigures')
            f.write(' '.join(fig_caps))
            f.write('\n\nTables')
            #f.write(' '.join(table_caps))
            f.write('\n'.join(table_data))
            f.write('\n\nHeaders')
            f.write(' '.join(headers))
            f.write('\n\nFooters')
            f.write(' '.join(footers))
            f.write('\n\nSupporting Info')
            f.write(' '.join(supp_info))
        # the histogram of the data
        # n, bins, patches = plt.hist(x_data, 50)
        # plt.show()
        device.close()
    print('Done')
    return
def extTxt(fname, outfile):
    """Write the text extracted from the PDF *fname* to *outfile*.

    Layout analysis runs with ``line_margin`` 30 and ``word_margin`` 0.1;
    every page is processed, with no password and no extra rotation.

    Args:
        fname: Path of the PDF file to read.
        outfile: Path of the text file to write. A falsy value sends the
            output to ``sys.stdout`` instead.

    Returns:
        None.
    """
    # debug option
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    imagewriter = None
    rotation = 0
    codec = 'utf-8'
    caching = True

    laparams = LAParams()
    laparams.line_margin = float(30)
    laparams.word_margin = float(0.1)

    # Propagate the debug level to the pdfminer classes.
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)

    # open() replaces the Python-2-only file() builtin.
    if outfile:
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    try:
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               imagewriter=imagewriter)
        try:
            with open(fname, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                              password=password, caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Only close what was opened here; sys.stdout stays usable.
        if outfp is sys.stdout:
            outfp.flush()
        else:
            outfp.close()
    return
def main(argv):
    """Run ``CourseRegisterParser`` over one or more PDF files (command line).

    Args:
        argv: Full argument vector. ``argv[0]`` is the program name (used in
            the usage text); the rest are getopt-style short options followed
            by one or more input PDF paths.

    Returns:
        100 (after printing the usage text) when option parsing fails or no
        input file is given; otherwise None once every input was processed.
    """
    import getopt

    def usage():
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] '
              '[-n] [-A] [-D writing_mode] [-M char_margin] [-L line_margin] [-W word_margin] '
              '[-c codec] file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAD:M:L:W:c:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    codec = 'utf-8'
    # Layout options are recorded here and applied after the loop so that -n
    # can be combined with -A/-D/-M/-L/-W in any order. Setting them
    # directly on laparams raised AttributeError once -n had replaced it
    # with None.
    layout_enabled = True
    layout_settings = {}
    for (k, v) in opts:
        if k == '-d': debug += 1
        # page numbers from the option are stored shifted down by one
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-n': layout_enabled = False
        elif k == '-A': layout_settings['all_texts'] = True
        elif k == '-D': layout_settings['writing_mode'] = v
        elif k == '-M': layout_settings['char_margin'] = float(v)
        elif k == '-L': layout_settings['line_margin'] = float(v)
        elif k == '-W': layout_settings['word_margin'] = float(v)
        elif k == '-c': codec = v

    laparams = None
    if layout_enabled:
        laparams = LAParams()
        for (attr, value) in layout_settings.items():
            setattr(laparams, attr, value)

    # Propagate the debug level to the pdfminer classes.
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager()

    # open() replaces the Python-2-only file() builtin.
    if outfile:
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    try:
        device = CourseRegisterParser(rsrcmgr, outfp, codec=codec, laparams=laparams)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                                password=password)
        finally:
            device.close()
    finally:
        # Only close what was opened here; sys.stdout stays usable.
        if outfp is sys.stdout:
            outfp.flush()
        else:
            outfp.close()
    return
def main(argv=None):
    """Convert PDF input to text, HTML, XML or tag output (argparse entry point).

    Args:
        argv: Argument list handed to ``ArgumentParser.parse_args``; None
            makes argparse read ``sys.argv[1:]``.

    The output type comes from ``-t`` or, when that is absent, from the
    extension of the ``-o`` file name; anything unrecognised falls back to
    plain text.
    """
    parser = argparse.ArgumentParser(description='Convert PDF into text.')
    # With no positional argument argparse hands back the default unchanged,
    # so it must be a list: a bare sys.stdin would be iterated line by line
    # in the processing loop. PDF data is binary, hence the underlying byte
    # stream where the platform provides one.
    parser.add_argument('file', nargs='*', type=argparse.FileType('rb'),
                        default=[getattr(sys.stdin, 'buffer', sys.stdin)],
                        help='file(s) to convert')
    parser.add_argument('-C', '--nocache', dest='cache', action='store_false',
                        help='prevent object caching (slower)')
    parser.add_argument('-l', metavar='level', default='warn',
                        help='logging level (warn, info, debug)')
    parser.add_argument('-p', metavar='page', nargs='+', default=[], type=int,
                        help='page number(s) (space separated)')
    parser.add_argument('-m', metavar='maxpages', default=0, type=int,
                        help='maximum number of pages to extract')
    parser.add_argument('-P', metavar='password', default='', help='pdf password')
    parser.add_argument('-o', metavar='outfile', type=argparse.FileType('w'),
                        default=sys.stdout,
                        help='output file name (default: stdout)')
    parser.add_argument('-O', metavar='directory', type=ImageWriter,
                        help='extract images and save to directory')
    parser.add_argument('-t', metavar='outtype',
                        help='output type (text, html, xml, tag)')
    parser.add_argument('-c', metavar='codec', default='utf-8',
                        help='output text encoding (default: %(default)s)')

    lagroup = parser.add_argument_group(title='layout analysis')
    lagroup.add_argument('-n', action='store_true', help='disable layout analysis')
    lagroup.add_argument('-A', action='store_true',
                         help='force layout analysis on all text')
    lagroup.add_argument('-V', action='store_true', help='detect vertical text')
    lagroup.add_argument('-M', metavar='char_margin', type=float,
                         help='custom character margin')
    lagroup.add_argument('-L', metavar='line_margin', type=float,
                         help='custom line margin')
    lagroup.add_argument('-W', metavar='word_margin', type=float,
                         help='custom word margin')
    lagroup.add_argument('-F', metavar='boxes_flow', type=float,
                         help='custom boxes flow')
    lagroup.add_argument('-Y', metavar='layout_mode', default='normal',
                         help='layout mode for HTML (normal, exact, loose)')
    lagroup.add_argument('-s', metavar='scale', default=1, type=float,
                         help='output scaling for HTML')

    args = parser.parse_args(argv)

    logging.basicConfig()
    logging.getLogger('pdfminer').setLevel(args.l.upper())

    if args.n:
        laparams = None
    else:
        laparams = LAParams()
        laparams.all_texts = args.A
        laparams.detect_vertical = args.V
        # Compare against None rather than truthiness: an explicit 0
        # (e.g. ``-F 0``) is a real value and must not be dropped.
        if args.M is not None:
            laparams.char_margin = args.M
        if args.L is not None:
            laparams.line_margin = args.L
        if args.W is not None:
            laparams.word_margin = args.W
        if args.F is not None:
            laparams.boxes_flow = args.F

    rsrcmgr = PDFResourceManager(caching=args.cache)

    outtype = args.t
    if not outtype and args.o:
        outname = args.o.name
        if outname.endswith('.htm') or outname.endswith('.html'):
            outtype = 'html'
        elif outname.endswith('.xml'):
            outtype = 'xml'
        elif outname.endswith('.tag'):
            outtype = 'tag'

    try:
        if outtype == 'xml':
            device = XMLConverter(rsrcmgr, args.o, codec=args.c, laparams=laparams,
                                  imagewriter=args.O)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, args.o, codec=args.c, scale=args.s,
                                   layoutmode=args.Y, laparams=laparams,
                                   imagewriter=args.O)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, args.o, codec=args.c)
        else:
            device = TextConverter(rsrcmgr, args.o, codec=args.c, laparams=laparams,
                                   imagewriter=args.O)
        try:
            for fp in args.file:
                try:
                    # -p values are shifted down by one before being passed on.
                    process_pdf(rsrcmgr, device, fp, [i - 1 for i in args.p],
                                maxpages=args.m, password=args.P,
                                caching=args.cache, check_extractable=True)
                finally:
                    fp.close()
        finally:
            device.close()
    finally:
        if args.o is not sys.stdout:
            args.o.close()
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator

# Print the text of every text-bearing layout object in "Lista_samurai_x.pdf".
# The file is opened in a `with` block so the handle is released even when
# parsing fails.
with open("Lista_samurai_x.pdf", "rb") as fp:
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    # NOTE(review): set_document/set_parser/initialize and doc.get_pages()
    # belong to the older pdfminer document API, while the import paths above
    # look like the newer package layout - confirm which pdfminer release
    # this script targets.
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize("")

    rsrcmgr = PDFResourceManager()
    laparamns = LAParams()
    laparamns.line_margin = 0.3
    laparamns.word_margin = 0.3
    # The aggregator's keyword is `laparams`; the previous `laparamns=`
    # spelling is not an accepted parameter and raised TypeError.
    device = PDFPageAggregator(rsrcmgr, laparams=laparamns)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for page in doc.get_pages():
        interpreter.process_page(page)
        layout = device.get_result()
        for ltobject in layout:
            # Not every layout object carries text; skip those without
            # get_text() instead of failing with AttributeError.
            if hasattr(ltobject, "get_text"):
                print(ltobject.get_text())
def main(argv):
    """Batch-convert PDF files matched by wildcard specs (command-line entry).

    Every positional argument is expanded with ``glob.glob``. Each matching
    file is converted to the selected output type and written next to the
    source, with the extension replaced (html -> .htm, tag -> .tag,
    text -> .txt, xml -> .xml). The output type defaults to ``tag``.

    Args:
        argv: Full argument vector. ``argv[0]`` is the program name (used in
            the usage text); the rest are getopt-style short options followed
            by one or more file names or wildcard specs.

    Returns:
        100 (after printing the usage text) when option parsing fails, no
        positional argument is given, or the output type is unknown;
        otherwise None.
    """
    import getopt
    import glob
    import os

    def usage():
        # print() with a single argument behaves the same on Python 2 and 3;
        # the former print statements were a syntax error on Python 3.
        print('Syntax:\npdf2htm.exe SourcePDF\n where the parameter is either a file name or\na wildcard spec like\n*.pdf\nEnclose it with quotes if it contains a space\n\nAdditional options are supported with named command line parameters as follows:')
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
              ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
              ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
              ' [-t text|html|xml|tag] [-c codec] [-s scale]'
              ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = 'tag'
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    # Layout options are recorded here and applied after the loop so that -n
    # can be combined with -A/-V/-M/-L/-W/-F in any order. Setting them
    # directly on laparams raised AttributeError once -n had replaced it
    # with None.
    layout_enabled = True
    layout_settings = {}
    for (k, v) in opts:
        if k == '-d': debug += 1
        # page numbers from the option are stored shifted down by one
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': layout_enabled = False
        elif k == '-A': layout_settings['all_texts'] = True
        elif k == '-V': layout_settings['detect_vertical'] = True
        elif k == '-M': layout_settings['char_margin'] = float(v)
        elif k == '-L': layout_settings['line_margin'] = float(v)
        elif k == '-W': layout_settings['word_margin'] = float(v)
        elif k == '-F': layout_settings['boxes_flow'] = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-R': rotation = int(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)

    laparams = None
    if layout_enabled:
        laparams = LAParams()
        for (attr, value) in layout_settings.items():
            setattr(laparams, attr, value)

    # Propagate the debug level to the pdfminer classes.
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)

    # Only an empty -t value reaches this branch (the default is 'tag'); the
    # -o name is then consulted for the type, as before.
    if not outtype:
        outtype = 'tag'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # The -o file itself is deliberately no longer opened: output always goes
    # to one file per input, so the old handle was truncated, never written
    # and never closed.

    extensions = {'html': 'htm', 'tag': 'tag', 'text': 'txt', 'xml': 'xml'}
    if outtype not in extensions:
        # Previously an unknown type raised KeyError in the extension lookup
        # before the usage branch could be reached.
        return usage()

    for fname in args:
        matches = glob.glob(fname)
        print('Converting ' + str(len(matches)) + ' from ' + fname + ' to ' + outtype + ' format')
        for pdf in matches:
            # splitext replaces the input's real extension instead of
            # blindly cutting the last four characters.
            target = os.path.splitext(pdf)[0] + '.' + extensions[outtype]
            print(target)
            with open(target, 'wb') as outfp:
                if outtype == 'text':
                    device = TextConverter(rsrcmgr, outfp, codec=codec,
                                           laparams=laparams, imagewriter=imagewriter)
                elif outtype == 'xml':
                    device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                          laparams=laparams, imagewriter=imagewriter)
                elif outtype == 'html':
                    device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                           layoutmode=layoutmode, laparams=laparams,
                                           imagewriter=imagewriter)
                else:
                    device = TagExtractor(rsrcmgr, outfp, codec=codec)
                device.showpageno = False
                try:
                    with open(pdf, 'rb') as fp:
                        interpreter = PDFPageInterpreter(rsrcmgr, device)
                        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                                      password=password,
                                                      caching=caching,
                                                      check_extractable=True):
                            page.rotate = (page.rotate + rotation) % 360
                            interpreter.process_page(page)
                finally:
                    device.close()
    print('Done')
    return
def parse_pdf(PDF_path, TEX_Path, is_annotation, is_train_image, is_main):
    """Locate titles, images, lists and tables on every page of a PDF.

    Text lines are matched against the TeX instances returned by
    ``find_tex_istances`` to find titles and list items. Images/figures and
    ruling lines are taken from the layout objects directly.

    Args:
        PDF_path: Path of the PDF file to analyse.
        TEX_Path: Passed to ``find_tex_istances`` when *is_train_image* is
            truthy.
        is_annotation: When equal to the string ``'yes'``, ``annotate_img``
            is called for every non-empty object category.
        is_train_image: When truthy, the TeX instances are loaded; it is also
            forwarded to ``generate_images``.
        is_main: When truthy, ``generate_images`` is called first.

    Returns:
        The title, image, list and table coordinate entries, sorted by their
        first element. An empty list is returned early when no TeX instances
        are found or when the PDF structure is invalid.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    filename = os.path.basename(PDF_path).split('.pdf')[0]
    page_counter = 0
    titles_coordinates = []
    images_coordinates = []
    lists_coordinates = []
    tables_coordinates = []
    text_coordinates = []
    all_train_objects_coordinates = []
    with_annotations = is_annotation

    if is_main:
        generate_images(PDF_path, filename, is_train_image)

    # NOTE(review): tex_instances used to be left unbound when is_train_image
    # was false, which crashed on the first text box. It now defaults to None
    # and TeX-based matching is skipped in that case - confirm this is the
    # intended behaviour for non-training runs.
    tex_instances = None
    if is_train_image:
        tex_instances = find_tex_istances(TEX_Path)
        if not tex_instances:
            return all_train_objects_coordinates

    # The `with` block closes the PDF on every exit path, including the early
    # return and the raise below (the handle used to be leaked).
    with open(PDF_path, 'rb') as fp:
        # Parser tied to the file, and the document that stores its structure.
        parser = PDFParser(fp)
        try:
            document = PDFDocument(parser)
        except PSSyntaxError:
            print('Invalid PDF structure')
            return all_train_objects_coordinates
        # Abort when the document does not allow text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.line_margin = 0.4
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(document):
            page_counter += 1
            page_length = page.mediabox[3]
            interpreter.process_page(page)
            # Layout (LTPage) object for the page just processed.
            layout = device.get_result()
            print(
                '##########################################################################################################')
            print('PAGE NUMBER: ', page_counter)
            print(
                '##########################################################################################################',
                '\n')
            for x in layout:
                # TEXTS (TITLES AND LISTS)
                if isinstance(x, LTTextBoxHorizontal):
                    if tex_instances is None:
                        continue
                    lines = x._objs
                    if '' in lines:
                        lines.remove('')
                    # Titles sit at the top of a text box, so only its first
                    # two lines are compared against the TeX titles.
                    for line in lines[:2]:
                        pdf_line_result = line.get_text().split('\n')[0].lower()
                        # Drop digits; a distinct comprehension variable
                        # avoids clobbering a loop index on Python 2.
                        pdf_line_result = ''.join(
                            [ch for ch in pdf_line_result if not ch.isdigit()])
                        for instance in tex_instances[0]:
                            tex_title = instance[2].lower()
                            if are_similar(tex_title, pdf_line_result) and pdf_line_result != '':
                                titles_coordinates.append(
                                    calculate_object_coordinates(
                                        page_counter, line.bbox, page_length, 'title'))
                    lines_counter = 0
                    for line in lines:
                        pdf_line_result = line.get_text().split('\n')[0].lower()
                        pdf_line_result = ''.join(
                            [ch for ch in pdf_line_result if not ch.isdigit()])
                        for instance in tex_instances[2]:
                            tex_list_item = instance[3]
                            # Only the first 50 characters are compared.
                            if are_similar(tex_list_item[0:50], pdf_line_result[0:50]) and pdf_line_result != '':
                                lists_coordinates.append(
                                    calculate_object_coordinates(
                                        page_counter, line.bbox, page_length, 'list'))
                            elif lines_counter > 2:
                                text_coordinates.append(
                                    calculate_object_coordinates(
                                        page_counter, line.bbox, page_length, 'text'))
                        lines_counter += 1
                # FIGURES
                elif isinstance(x, (LTImage, LTFigure)):
                    # Skip objects with an aspect ratio above 5:1 either way.
                    # Written with products so a zero width or height cannot
                    # raise ZeroDivisionError.
                    if x.width > 5 * x.height or x.height > 5 * x.width:
                        continue
                    images_coordinates.append(
                        calculate_object_coordinates(
                            page_counter, x.bbox, page_length, 'image'))
                # TABLES (collected from ruling lines)
                elif isinstance(x, LTLine):
                    if (x.height == 0 and x.width < 30) or x.width <= 0:
                        continue
                    tables_coordinates.append(
                        calculate_object_coordinates(
                            page_counter, x.bbox, page_length, 'table'))

    extracted_tables_coordinates = extract_tables_coordinates(tables_coordinates)
    extracted_lists_coordinates = extract_lists_coordinates(lists_coordinates)

    if with_annotations == 'yes':
        print('Generating annotations...')
        if len(titles_coordinates) != 0:
            annotate_img(filename, titles_coordinates, titles_coordinates[0][0],
                         (0, 0, 255), 3)
        if len(images_coordinates) != 0:
            annotate_img(filename, images_coordinates, images_coordinates[0][0],
                         (0, 255, 0), 3)
        if len(extracted_lists_coordinates) != 0:
            annotate_img(filename, extracted_lists_coordinates,
                         extracted_lists_coordinates[0][0], (255, 0, 0), 3)
        if len(extracted_tables_coordinates) != 0:
            annotate_img(filename, extracted_tables_coordinates,
                         extracted_tables_coordinates[0][0], (230, 255, 102), 3)

    all_train_objects_coordinates.extend(titles_coordinates)
    all_train_objects_coordinates.extend(images_coordinates)
    all_train_objects_coordinates.extend(extracted_lists_coordinates)
    all_train_objects_coordinates.extend(extracted_tables_coordinates)
    all_train_objects_coordinates = sorted(all_train_objects_coordinates,
                                           key=itemgetter(0))
    return all_train_objects_coordinates
def main(argv):
    """Command-line entry point (argparse variant): convert PDF files.

    Parses the pdf2txt-style options, builds the converter device selected
    by ``-t/--type`` and feeds every page of every input PDF through it.

    Args:
        argv: Full argument vector with the program name first (normally
            ``sys.argv``). When it is empty or ``None`` argparse falls back
            to ``sys.argv[1:]``.

    Returns:
        ``None`` on success, ``100`` when the output type is not recognised.

    Side effects:
        Rebinds the module globals ``debuglevel`` and ``options`` and sets the
        ``debug`` attribute on several pdfminer classes.
    """
    global debuglevel, options

    # Only referenced by the debug trace below; this variant always uses argparse.
    using_optparse = False

    parser = ArgumentParser(prog='pdf2txt.py', description='Convert pdf to txt',
                            formatter_class=ArgumentDefaultsHelpFormatter)
    add = parser.add_argument

    add('-d', dest='debuglevel', action='count', default=0,
        help='Debug (repeat for more verbose debugging)')
    add('-p', '--pages', dest='pagenos', action='store', type=str, default='',
        help='Specifies the comma-separated list of the page numbers to be extracted. '
             'Page numbers start at one. '
             'By default, it extracts text from all the pages.')
    add('-c', '--codec', dest='codec', action='store', type=str, default='utf-8',
        help='Specifies the output codec.')
    add('-t', '--type', dest='outtype', action='store', type=str, default='shape',
        choices=['text', 'html', 'xml', 'tag', 'shape'],
        help='Specifies the output format, one of: shape, text, html, xml, tag')
    add('-m', dest='maxpages', action='store', type=int, default=0,
        help='Specifies the maximum number of pages to extract. '
             'By default (0), it extracts all the pages in a document.')
    add('-P', '--password', dest='password', action='store', type=str, default='',
        help='Provides the user password to access PDF contents.')
    add('-o', '--output', dest='outfile', action='store', type=str, default=None,
        help='Specifies the output file name. '
             'By default, it prints the extracted contents to stdout in text format.')
    add('-C', '--no-caching', dest='caching', action='store_false', default=True,
        help='Suppress object caching. '
             'This will reduce the memory consumption but also slows down the process.')
    add('-n', '--no-layout', dest='layout', action='store_false', default=True,
        help='Suppress layout analysis.')
    add('--show-pageno', dest='show_pageno', action='store_true', default=False,
        help='Show page numbers.')
    add('-A', '--analyze-all', dest='all_texts', action='store_true', default=False,
        help='Forces to perform layout analysis for all the text strings, '
             'including text contained in figures.')
    add('-V', '--detect-vertical', dest='detect_vertical', action='store_true', default=False,
        help='Allows vertical writing detection.')
    add('-M', dest='char_margin', action='store', type=float, default=2.0,
        help='Two text chunks whose distance is closer than the char_margin (shown as M) '
             'is considered continuous and get grouped into one.')
    add('-L', dest='line_margin', action='store', type=float, default=0.5,
        help='Two lines whose distance is closer than the line_margin (L) is grouped '
             'as a text box, which is a rectangular area that contains a "cluster" '
             'of text portions.')
    add('-W', dest='word_margin', action='store', type=float, default=0.1,
        help='It may be required to insert blank characters (spaces) as necessary if '
             'the distance between two words is greater than the word_margin (W), '
             'as a blank between words might not be represented as a space, '
             'but indicated by the positioning of each word.')
    add('-F', dest='boxes_flow', action='store', type=float, default=0.5,
        help='Specifies how much a horizontal and vertical position of a text matters '
             'when determining a text order. '
             'The value should be within the range of -1.0 (only horizontal position '
             'matters) to +1.0 (only vertical position matters).')
    add('-Y', '--layout-mode', dest='layoutmode', action='store', type=str, default='normal',
        choices=['exact', 'normal', 'loose'],
        help='Specifies how the page layout should be preserved. '
             '(Currently only applies to HTML format.) One of: exact, normal, loose.')
    add('-O', '--image-writer', dest='imagewriter', action='store', type=str, default=None,
        help='imagewriter')
    add('-R', '--rotation', dest='rotation', action='store', type=int, default=0,
        help='rotation')
    add('-S', '--strip-control', dest='stripcontrol', action='store_true', default=False,
        help='stripcontrol')
    add('-s', dest='scale', action='store', type=float, default=1,
        help='Specifies the output scale. Can be used in HTML format only.')
    add('--draw-lines', dest='draw_lines', action='store_true',
        help="Draw crude page representation, coloured TextLines (= short pieces of text). "
             "Valid only for the `shape' output.")
    add('--draw-boxes', dest='draw_boxes', action='store_true',
        help="Draw crude page representation, coloured TextBoxes (= grouped text lines). "
             "Valid only for the `shape' output.")
    add('--draw-blocks', dest='draw_blocks', action='store_true',
        help="Draw crude page representation, coloured TextBlocks (= grouped TextBoxes). "
             "Valid only for the `shape' output.")
    add('--shear-limit', dest='shear_limit', action='store', default=0.1, type=float,
        help="If the text is sheared above this limit, reject it. "
             "Valid only for the `shape' output.")
    add('--rotation-limit', dest='rotation_limit', action='store', default=2, type=float,
        help="If the text is rotated above this angle (in degrees), reject it. "
             "Valid only for the `shape' output.")
    add('--line-height-diff', dest='line_height_diff', action='store', type=float, default=0.1,
        help='Two lines whose vertical sizes differ more than this ratio are not to be '
             'considered of the same paragraph (but e.g. one of them is a heading).')
    add('--heading-before', dest='heading_before', action='store', type=str, default='',
        help='String to put before each heading, e.g. <h1>')
    add('--heading-after', dest='heading_after', action='store', type=str, default='',
        help='String to put after each heading, e.g. </h1>')
    add('--box-separator', dest='box_separator', action='store', type=str, default=r'\n\n',
        help=r'Separate boxes with this string. '
             r'Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    add('--block-separator', dest='block_separator', action='store', type=str, default=r'\n\n',
        help=r'Separate blocks with this string. '
             r'Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    add('--indent-separator', dest='indent_separator', action='store', type=str, default=r'\n\n',
        help=r'Separate indented lines with this string. '
             r'Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    add('--indent-string', dest='indent_string', action='store', type=str, default=r'\t',
        help=r'Put this string in front of indented lines. '
             r'Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    add('--indent-limit', dest='indent_limit', action='store', type=float, default=3,
        help='If the line is indented more then this (approximately characters), '
             'it will separated by --indent-separator from the previous one.')
    add('--page-separator', dest='page_separator', action='store', type=str, default=r'\n\n',
        help=r'Separate pages with this string. '
             r'Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    add('--norm-whitespace', dest='norm_whitespace', action='store_true', default=False,
        help='Normalize whitespace (remove duplicate spaces, replace end of lines with spaces).')
    add('--print-stats', dest='print_stats', action='store_true', default=False,
        help='Instead of the text, output some simple statistics about the file.')
    add('--max-blocks', dest='max_blocks', action='store', default=0, type=int,
        help='If there is more than this blocks per page, do not return any text. '
             'Use to discriminate abnormal files (run --print-stats first to find out '
             'the number of boxes per "normal" file). 0 means no limit. '
             '50 is maybe a good value.')
    add('--max-textlines', dest='max_textlines', action='store', default=0, type=int,
        help='If there is more than this textlines per any block, do not return any text. '
             'Use to discriminate abnormal files (run --print-stats first to find out '
             'the number of boxes per "normal" page). 0 means no limit. '
             '18 is maybe a good value.')
    add('--line-height-method', dest='line_height_method', action='store', type=str,
        default='bbox', choices=['bbox', 'mean', 'median'],
        help='Method to calculate height of line (relevant if there are characters with '
             'uneven height). bbox takes the bounding box (rectangle encompassing the line), '
             'mean the arithmetic mean of the height of all the characters, median is the '
             'median of the height of all the characters. Use mean or median if there are '
             'outlier characters, e.g. one big character at the beginning of line.')
    add(dest='pdffile', help='List of PDF files to go through', default=None, nargs='+')

    # Honour the argv we were handed instead of silently re-reading sys.argv.
    args, rest = parser.parse_known_args(argv[1:] if argv else None)

    debuglevel = debug = args.debuglevel
    DEBUG(3, 'args:', str(args))
    DEBUG(3, 'rest:', str(rest))
    DEBUG(3, 'optparse:', using_optparse)

    pagenos = set()
    if args.pagenos:
        # Page numbers are one-based on the command line, zero-based internally.
        pagenos.update(int(x) - 1 for x in args.pagenos.split(','))
    maxpages = args.maxpages
    outfile = args.outfile
    password = args.password
    caching = args.caching
    showpageno = args.show_pageno

    # -n disables layout analysis entirely; the layout tuning options then
    # have nothing to apply to.
    if args.layout:
        laparams = LAParams()
        if args.all_texts:
            laparams.all_texts = True
        if args.detect_vertical:
            laparams.detect_vertical = True
        laparams.char_margin = args.char_margin
        laparams.line_margin = args.line_margin
        laparams.word_margin = args.word_margin
        laparams.boxes_flow = args.boxes_flow
    else:
        laparams = None

    layoutmode = args.layoutmode
    imagewriter = ImageWriter(args.imagewriter) if args.imagewriter else None
    rotation = args.rotation
    stripcontrol = args.stripcontrol
    outtype = args.outtype
    codec = args.codec
    scale = args.scale

    # Separators arrive with literal backslash escapes; expand them once here.
    args.box_separator = unescape_string(args.box_separator)
    args.block_separator = unescape_string(args.block_separator)
    args.indent_separator = unescape_string(args.indent_separator)
    args.indent_string = unescape_string(args.indent_string)
    args.page_separator = unescape_string(args.page_separator)
    options = args

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)

    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # Validate before touching the output file so a bad type cannot truncate it.
    if outtype not in ('shape', 'text', 'xml', 'html', 'tag'):
        parser.print_usage()
        return 100

    if outfile:
        outfp = open(outfile, 'w')
        DEBUG(2, 'output goes to', outfile)
    else:
        outfp = sys.stdout
        DEBUG(2, 'output goes to stdout')

    try:
        if outtype == 'shape':
            device = ShapeTextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                        showpageno=showpageno, imagewriter=imagewriter)
        elif outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  imagewriter=imagewriter, stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        else:  # 'tag'
            device = TagExtractor(rsrcmgr, outfp, codec=codec)

        try:
            for fname in options.pdffile:
                DEBUG(2, 'processing', fname)
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                                  password=password, caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Never close the interpreter's stdout; only files we opened ourselves.
        if outfp is not sys.stdout:
            outfp.close()

    DEBUG(2, 'finished.')
    return
def main(argv):
    """Command-line entry point: convert PDF files to text, HTML, XML or tags.

    Args:
        argv: Full argument vector with the program name first (normally
            ``sys.argv``).

    Returns:
        ``100`` after printing the usage line when the options are invalid,
        no input file is given, or the output type is unknown; ``None``
        otherwise.
    """
    import getopt

    def usage():
        print(
            "usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] "
            "[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] "
            "[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ..." % argv[0]
        )
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], "dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:")
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input option
    password = ""
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = "normal"
    codec = "utf-8"
    scale = 1
    caching = True
    laparams = LAParams()

    # Layout options map straight onto LAParams attributes.
    layout_flags = {"-A": "all_texts", "-V": "detect_vertical"}
    layout_floats = {
        "-M": "char_margin",
        "-L": "line_margin",
        "-W": "word_margin",
        "-F": "boxes_flow",
    }

    for (k, v) in opts:
        if k == "-d":
            debug += 1
        elif k == "-p":
            # Page numbers are one-based on the command line.
            pagenos.update(int(x) - 1 for x in v.split(","))
        elif k == "-m":
            maxpages = int(v)
        elif k == "-P":
            password = v
        elif k == "-o":
            outfile = v
        elif k == "-C":
            caching = False
        elif k == "-n":
            laparams = None
        elif k in layout_flags:
            # After -n there is no LAParams object left to configure.
            if laparams is not None:
                setattr(laparams, layout_flags[k], True)
        elif k in layout_floats:
            value = float(v)
            if laparams is not None:
                setattr(laparams, layout_floats[k], value)
        elif k == "-Y":
            layoutmode = v
        elif k == "-O":
            outdir = v
        elif k == "-t":
            outtype = v
        elif k == "-c":
            codec = v
        elif k == "-s":
            scale = float(v)

    # PDFDocument.debug / PDFParser.debug were already disabled in the original.
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)

    if not outtype:
        outtype = "text"
        if outfile:
            if outfile.endswith(".htm") or outfile.endswith(".html"):
                outtype = "html"
            elif outfile.endswith(".xml"):
                outtype = "xml"
            elif outfile.endswith(".tag"):
                outtype = "tag"

    # Reject unknown types before the output file is created or truncated.
    if outtype not in ("text", "xml", "html", "tag"):
        return usage()

    outfp = open(outfile, "w") if outfile else sys.stdout
    try:
        if outtype == "text":
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == "xml":
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
        elif outtype == "html":
            device = HTMLConverter(
                rsrcmgr, outfp, codec=codec, scale=scale, layoutmode=layoutmode,
                laparams=laparams, outdir=outdir
            )
        else:  # "tag"
            device = TagExtractor(rsrcmgr, outfp, codec=codec)

        try:
            for fname in args:
                with open(fname, "rb") as fp:
                    process_pdf(
                        rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                        password=password, caching=caching, check_extractable=True
                    )
        finally:
            device.close()
    finally:
        # Leave sys.stdout open for the rest of the process.
        if outfp is not sys.stdout:
            outfp.close()
    return
def convert_pdf_To_Txt(path, opts=None):
    """Extract the text of a PDF file and return it as one string.

    The option handling mirrors the pdf2txt command line.

    Args:
        path: Path of the PDF file to read.
        opts: Optional pdf2txt-style options, either an iterable of
            ``(flag, value)`` pairs (as produced by ``getopt``) or a mapping
            ``{flag: value}``. Recognised flags: ``-d -p -m -P -o -C -n -A -V
            -M -L -W -F -Y -O -t -c -s``. ``-o`` is only used to infer the
            output type from the file extension; nothing is written to it.

    Returns:
        The contents of the in-memory buffer the text converter wrote to.

    Raises:
        ValueError: If the selected output type is not ``'text'``, the only
            type this function implements.
    """
    # A mapping iterates over its keys, so normalise to (flag, value) pairs.
    if opts is None:
        pairs = ()
    elif isinstance(opts, dict):
        pairs = opts.items()
    else:
        pairs = opts

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()

    layout_flags = {'-A': 'all_texts', '-V': 'detect_vertical'}
    layout_floats = {'-M': 'char_margin', '-L': 'line_margin',
                     '-W': 'word_margin', '-F': 'boxes_flow'}

    for (k, v) in pairs:
        if k == '-d':
            debug += 1
        elif k == '-p':
            # Page numbers are one-based in the options.
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k in layout_flags:
            # After -n there is no LAParams object left to configure.
            if laparams is not None:
                setattr(laparams, layout_flags[k], True)
        elif k in layout_floats:
            value = float(v)
            if laparams is not None:
                setattr(laparams, layout_floats[k], value)
        elif k == '-Y':
            layoutmode = v  # accepted for option compatibility; unused for text
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)  # accepted for option compatibility; unused for text

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)

    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # Only the text converter is wired up; fail clearly instead of hitting an
    # unbound ``device`` further down.
    if outtype != 'text':
        raise ValueError(
            "unsupported output type %r: only 'text' is implemented" % (outtype,))

    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams,
                           imagewriter=imagewriter)
    try:
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
        txt2Pdf = retstr.getvalue()
    finally:
        device.close()
    return txt2Pdf
def main(argv):
    """Command-line entry point: convert PDF files to text, HTML, XML or tags.

    Args:
        argv: Full argument vector with the program name first (normally
            ``sys.argv``).

    Returns:
        ``100`` after printing the usage line when the options are invalid,
        no input file is given, or the output type is unknown; ``None``
        otherwise.
    """
    import getopt

    def usage():
        print(f'usage: {argv[0]} [-P password] [-o output] [-t text|html|xml|tag]'
              ' [-O output_dir] [-c encoding] [-s scale] [-R rotation]'
              ' [-Y normal|loose|exact] [-p pagenos] [-m maxpages]'
              ' [-S] [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin]'
              ' [-W word_margin] [-F boxes_flow] [-d] input.pdf ...')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dP:o:t:O:c:s:R:Y:p:m:SCnAVM:W:L:F:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input option
    password = b''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    encoding = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()

    # Layout options map straight onto LAParams attributes.
    layout_flags = {'-A': 'all_texts', '-V': 'detect_vertical'}
    layout_floats = {'-M': 'char_margin', '-W': 'word_margin',
                     '-L': 'line_margin', '-F': 'boxes_flow'}

    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-P':
            password = v.encode('ascii')
        elif k == '-o':
            outfile = v
        elif k == '-t':
            outtype = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-c':
            encoding = v
        elif k == '-s':
            scale = float(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-p':
            # Page numbers are one-based on the command line.
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-S':
            stripcontrol = True
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k in layout_flags:
            # After -n there is no LAParams object left to configure.
            if laparams is not None:
                setattr(laparams, layout_flags[k], True)
        elif k in layout_floats:
            value = float(v)
            if laparams is not None:
                setattr(laparams, layout_floats[k], value)

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)

    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # Reject unknown types before the output file is created or truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    # Every branch binds outfp: Windows keeps the original binary mode, all
    # other platforms (not just 'linux') get an encoded text file.
    if not outfile:
        outfp = sys.stdout
    elif sys.platform == 'win32':
        outfp = open(outfile, 'wb')
    else:
        outfp = open(outfile, 'w', encoding=encoding)

    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        else:  # 'tag'
            device = TagExtractor(rsrcmgr, outfp)

        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Leave sys.stdout open for the rest of the process.
        if outfp is not sys.stdout:
            outfp.close()
    return
if m is None: continue if gname is not None: logging.warn('skip %s: 2 groups in one doc: %s [%s] and %s [%s]' % (pdf_file, gname, gtype, m.groups()[2], m.groups()[0])) return gname = m.groups()[2].strip() gtype = m.groups()[0].strip() if u'五十音順' in gname: # HACK: some docs are really borked.. return logging.info(u'%s is a doc for %s' % (pdf_file, gname)) return gname LAPARAMS = LAParams() LAPARAMS.line_margin = 10.0 def extract_pdf_text(fname): rsrcmgr = PDFResourceManager(caching=True) codec = 'utf-8' outfp = StringIO() device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=LAPARAMS, imagewriter=None) fp = file(fname, 'rb') interpreter = PDFPageInterpreter(rsrcmgr, device) for page in PDFPage.get_pages(fp, set(), maxpages=1, caching=True, check_extractable=True): interpreter.process_page(page) fp.close() device.close()
def main(argv):
    """Command-line entry point: convert PDF files to text, HTML, XML or tags.

    Args:
        argv: Full argument vector with the program name first (normally
            ``sys.argv``).

    Returns:
        ``100`` after printing the usage line when the options are invalid,
        no input file is given, or the output type is unknown; ``None``
        otherwise.
    """
    def usage():
        print((
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
            '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
            '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0]))
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    debug = False
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()

    # Layout options map straight onto LAParams attributes.
    layout_flags = {'-A': 'all_texts', '-V': 'detect_vertical'}
    layout_floats = {'-M': 'char_margin', '-L': 'line_margin',
                     '-W': 'word_margin', '-F': 'boxes_flow'}

    for (k, v) in opts:
        if k == '-d':
            debug = True
        elif k == '-p':
            # Page numbers are one-based on the command line.
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k in layout_flags:
            # After -n there is no LAParams object left to configure.
            if laparams is not None:
                setattr(laparams, layout_flags[k], True)
        elif k in layout_floats:
            value = float(v)
            if laparams is not None:
                setattr(laparams, layout_floats[k], value)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)

    if debug:
        set_debug_logging()

    rsrcmgr = PDFResourceManager(caching=caching)

    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # Reject unknown types before the output file is created or truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    if outfile:
        outfp = io.open(outfile, 'wt', encoding=codec, errors='ignore')
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False

    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, scale=scale, layoutmode=layoutmode,
                                   laparams=laparams, outdir=outdir, debug=debug)
        else:  # 'tag'
            device = TagExtractor(rsrcmgr, outfp)

        try:
            for fname in args:
                with io.open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                                password=password, caching=caching,
                                check_extractable=True)
        finally:
            device.close()
    finally:
        if close_outfp:
            outfp.close()
def main(argv):
    """Command-line entry point: convert PDF files, optionally pretty-printed.

    In addition to the usual pdf2txt options this variant accepts ``-r``
    (round coordinates), ``-S`` (simplified output) and ``-f`` (pretty-print
    HTML/XML output with BeautifulSoup when it is importable).

    Args:
        argv: Full argument vector with the program name first (normally
            ``sys.argv``).

    Returns:
        ``100`` after printing the usage line when the options are invalid,
        no input file is given, or the output type is unknown; ``None``
        otherwise.
    """
    import getopt

    def usage():
        print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
               '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
               '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] [-r] '
               '[-S] [-f] file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'fSrdp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    roundCoords = False
    simplifyOutput = False
    formatOutput = False
    laparams = LAParams()

    # Layout options map straight onto LAParams attributes.
    layout_flags = {'-A': 'all_texts', '-V': 'detect_vertical'}
    layout_floats = {'-M': 'char_margin', '-L': 'line_margin',
                     '-W': 'word_margin', '-F': 'boxes_flow'}

    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            # Page numbers are one-based on the command line.
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k in layout_flags:
            # After -n there is no LAParams object left to configure.
            if laparams is not None:
                setattr(laparams, layout_flags[k], True)
        elif k in layout_floats:
            value = float(v)
            if laparams is not None:
                setattr(laparams, layout_floats[k], value)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
        elif k == '-r':
            roundCoords = True
        elif k == '-S':
            simplifyOutput = True
        elif k == '-f':
            formatOutput = True

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)

    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # Reject unknown types before any output is opened.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    # Formatting only makes sense for markup output. Remember the decision so
    # the post-processing step below never calls getvalue() on a real file.
    buffered = formatOutput and outtype.endswith('ml')
    if buffered:
        try:
            from cStringIO import StringIO
        except ImportError:
            try:
                from StringIO import StringIO
            except ImportError:
                from io import StringIO
        outfp = StringIO()
    else:
        outfp = getRealOutput(outfile)

    if outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                              imagewriter=imagewriter, layoutmode=layoutmode,
                              scale=scale, roundCoords=roundCoords,
                              simplifyOutput=simplifyOutput)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=imagewriter)
    else:  # 'tag'
        device = TagExtractor(rsrcmgr, outfp, codec=codec)

    try:
        for fname in args:
            with open(fname, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                            password=password, caching=caching,
                            check_extractable=True)
    finally:
        device.close()

    if buffered:
        root = outfp.getvalue()
        with getRealOutput(outfile) as realOutput:
            try:
                from bs4 import BeautifulSoup as bs
            except ImportError:
                # Best effort: emit the unformatted markup instead of failing.
                sys.stderr.write('Could not import BeautifulSoup, skipping output formatting')
                realOutput.write(root)
            else:
                soup = bs(root)
                prettyHTML = soup.prettify()
                realOutput.write(prettyHTML)
    outfp.close()
    return