Esempio n. 1
0
def parse_pdf_to_txt(pdf_handle, write_file):
    """Convert an open PDF to XML and write the result to ``write_file``.

    Despite the function name, the output is produced by ``XMLConverter``,
    so the file holds XML rather than plain text.

    Args:
        pdf_handle: Open PDF file object; it is handed to ``process_pdf``
            unchanged and is not closed here.
        write_file: Path of the output file. It is opened in text mode as
            UTF-8 with ``errors='ignore'``.
    """
    pagenos = set()
    maxpages = 0
    codec = 'utf-8'
    caching = True
    laparams = LAParams()
    laparams.detect_vertical = True

    # Create the PDF resource manager that holds the shared resources.
    rsrcmgr = PDFResourceManager(caching=caching)

    print("ready to open out file ........")
    with open(write_file, "wt", encoding=codec, errors='ignore') as outfp:
        device = XMLConverter(rsrcmgr, outfp, laparams=laparams)
        try:
            print("ready to converte pdf to xml ........")
            process_pdf(rsrcmgr,
                        device,
                        pdf_handle,
                        pagenos,
                        maxpages=maxpages,
                        password='',
                        caching=caching,
                        check_extractable=True)
        finally:
            # Close the converter even when processing fails, and do it
            # while the output file is still open.
            device.close()
Esempio n. 2
0
def extract_pdf_page(filename, page_number_or_numbers):
    """Render selected pages of a PDF as XML with PDFMiner.

    ``page_number_or_numbers`` is either one page number or an iterable of
    page numbers; a lone number is wrapped in a single-element list before
    being passed to ``PDFPage.get_pages``.

    Returns the contents of the in-memory buffer the XML converter wrote to
    (the converter is configured with ``codec='utf-8'``).
    """
    # Adapted from pdf2txt.py, which ships with PDFMiner. The command-line
    # equivalent of this function is:
    #    pdf2txt.py -p 1 -o expected.xml sample.pdf
    wanted_pages = (page_number_or_numbers
                    if is_iterable(page_number_or_numbers)
                    else [page_number_or_numbers])

    xml_buffer = StringIO.StringIO()
    layout = LAParams()
    resources = PDFResourceManager()
    converter = XMLConverter(resources, xml_buffer, codec='utf-8',
                             laparams=layout)

    with open(filename, 'rb') as pdf:
        runner = PDFPageInterpreter(resources, converter)
        for pdf_page in PDFPage.get_pages(pdf, wanted_pages):
            runner.process_page(pdf_page)

    converter.close()

    result = xml_buffer.getvalue()
    xml_buffer.close()
    return result
Esempio n. 3
0
class Miner:
    """Convert a PDF file to text, HTML or XML with PDFMiner.

    The constructor opens the input and output files and builds the
    converter; ``extract`` runs the conversion and closes everything.
    """

    def __init__(self, pdf_file, txt_file, file_format='txt', layout_analysis=True):
        """Open the files and create the converter for ``file_format``.

        Args:
            pdf_file: Path of the PDF to read.
            txt_file: Path of the output file (created or truncated).
            file_format: One of ``'txt'``, ``'html'`` or ``'xml'``.
            layout_analysis: When true a default ``LAParams`` is passed to
                the converter, otherwise ``None``.

        Raises:
            ValueError: If ``file_format`` is not a supported value.
        """
        # Reject unknown formats before touching the file system; otherwise
        # no converter would be created and extract() would fail later.
        if file_format not in ('txt', 'html', 'xml'):
            raise ValueError("unsupported file_format: %r" % (file_format,))

        laparams = LAParams() if layout_analysis else None
        self.rsrcmgr = PDFResourceManager(caching=True)

        # open() works on both Python 2 and 3; file() exists only on Python 2.
        self.pdf_file = open(pdf_file, 'rb')
        try:
            self.outfp = open(txt_file, 'w')
        except Exception:
            # Do not leave the input handle open if the output cannot be opened.
            self.pdf_file.close()
            raise

        if file_format == 'txt':
            converter_cls = TextConverter
        elif file_format == 'html':
            converter_cls = HTMLConverter
        else:
            converter_cls = XMLConverter
        self.device = converter_cls(self.rsrcmgr, self.outfp, codec='utf-8',
                                    laparams=laparams, imagewriter=None)

    def extract(self):
        """Process every page of the PDF, then close the files and converter."""
        try:
            interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)
            pagenos = set()
            for page in PDFPage.get_pages(self.pdf_file, pagenos, maxpages=0,
                                          password=None, caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        finally:
            # Release the handles even if a page fails to process.
            self.pdf_file.close()
            try:
                self.device.close()
            finally:
                self.outfp.close()
Esempio n. 4
0
def pdf2xml(filename):
    """Convert every page of the PDF at ``filename`` to XML with PDFMiner.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        The contents of the in-memory buffer the XML converter wrote to
        (the converter is configured with ``codec='utf-8'``).
    """
    rsrcmgr = PDFResourceManager(caching=True)
    outfp = StringIO.StringIO()
    device = XMLConverter(rsrcmgr,
                          outfp,
                          codec='utf-8',
                          laparams=LAParams(),
                          imagewriter=None)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # open() instead of the Python-2-only file() builtin; the with-block
    # closes the PDF even when a page fails to process.
    with open(filename, 'rb') as fp:
        for page in PDFPage.get_pages(fp,
                                      None,
                                      maxpages=0,
                                      password='',
                                      caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)
    device.close()

    xml = outfp.getvalue()
    outfp.close()
    return xml
Esempio n. 5
0
def convert(infile, outfile, rotation=0):
    """Convert the PDF at ``infile`` to XML written to ``outfile``.

    Args:
        infile: Path of the PDF to read.
        outfile: Path of the XML file to write; opened in binary mode.
        rotation: Value added to each page's ``rotate`` attribute; the sum
            is reduced modulo 360 before the page is processed.
    """
    debug = 0
    password = ''
    pagenos = set()
    maxpages = 0
    codec = 'utf-8'
    caching = True
    laparams = LAParams()

    # The debug level is set on the classes themselves, so it applies to
    # every instance created afterwards, not just the ones used here.
    for cls in (PDFDocument, PDFParser, CMapDB, PDFResourceManager,
                PDFPageInterpreter, PDFDevice):
        cls.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    with open(outfile, 'wb') as outfp, open(infile, 'rb') as fp:
        device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos,
                                          maxpages=maxpages, password=password,
                                          caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
        finally:
            # Close the converter before the with-block closes its output file.
            device.close()
def parse_pdfs(pdf_filenames):
    """Convert each PDF in ``pdf_filenames`` to an XML file next to it.

    The output path is the input path with its extension replaced by
    ``.xml``. Progress messages are printed for every file.

    Args:
        pdf_filenames: Iterable of PDF file paths.
    """
    # Set parameters
    pagenos = set()
    maxpages = 0
    password = ''
    imagewriter = None
    codec = 'utf-8'
    caching = True
    laparams = LAParams()

    # A single resource manager is reused for all files.
    rsrcmgr = PDFResourceManager(caching=caching)

    # Convert to XML as it retains the most information about text position
    # (compared to text, html, etc).
    for pdf_file in pdf_filenames:

        # A parenthesised single-argument print is valid syntax on Python 3
        # and prints the same text on Python 2.
        print("Converting %s to xml." % pdf_file)

        outfile = os.path.splitext(pdf_file)[0] + '.xml'
        with open(pdf_file, 'rb') as fp, open(outfile, 'w') as outfp:

            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams, imagewriter=imagewriter)
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos,
                                              maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    interpreter.process_page(page)
            finally:
                # Close the converter even if a page fails, while the output
                # file is still open.
                device.close()

        print("Conversion complete.")
Esempio n. 7
0
def pdf2xml(path, codec='utf-8', password = "", maxpages = 0, caching = True):
    """Extract the pages of the PDF at ``path`` and return them as XML text.

    Args:
        path: Path of the PDF file to read.
        codec: Encoding given to the XML converter and used to decode the
            buffer it wrote.
        password: Forwarded to ``PDFPage.get_pages``.
        maxpages: Forwarded to ``PDFPage.get_pages``.
        caching: Forwarded to ``PDFPage.get_pages``.

    Returns:
        The decoded XML document as a string, ending with the closing
        ``</pages>`` tag.
    """
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    device = XMLConverter(rsrcmgr, retstr, codec=codec, laparams=LAParams())

    with open(path, 'rb') as fp:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp, set(), maxpages=maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True):
            interpreter.process_page(page)

    # PDFMiner's XMLConverter emits the document footer from close(), so the
    # buffer has to be read after the converter is closed, not before.
    device.close()
    xml = retstr.getvalue().decode(codec)
    retstr.close()

    # Safety net for a converter that did not write the closing tag.
    if not xml.rstrip().endswith('</pages>'):
        xml += '\n</pages>'

    return xml
Esempio n. 8
0
 def _get_xml_data(self, sourcefile):
     """Return the XML representation of the PDF at ``sourcefile``.

     The XML converter writes into an ``io.BytesIO`` buffer, so the value
     returned is a bytes object.
     """
     rm = PDFResourceManager(caching=True,
                             font_correctors=self.font_correctors)
     outfp = io.BytesIO()
     device = XMLConverter(rm,
                           outfp,
                           codec="UTF-8",
                           laparams=LAParams(),
                           imagewriter=None)
     interpreter = PDFPageInterpreter(rm, device)
     # The with-block closes the PDF even when a page fails to process.
     with open(sourcefile, "rb") as infile:
         for page in PDFPage.get_pages(infile,
                                       set(),
                                       maxpages=0,
                                       password="",
                                       caching=True,
                                       check_extractable=True):
             interpreter.process_page(page)
     device.close()
     retval = outfp.getvalue()
     outfp.close()
     return retval
def lerPDF(arquivo):
    """Run PDFMiner's XML conversion on ``arquivo`` and return the output.

    ``arquivo`` is passed straight to ``process_pdf``. The converter writes
    into an in-memory buffer whose contents are returned after the
    converter has been closed.
    """
    gerenciador = PDFResourceManager()
    saida = StringIO()
    conversor = XMLConverter(gerenciador, saida, laparams=LAParams())

    process_pdf(gerenciador, conversor, arquivo)
    conversor.close()

    try:
        return saida.getvalue()
    finally:
        saida.close()
Esempio n. 10
0
    def to_xml(infile):
        """Convert the PDF read from ``infile`` to XML and return it.

        ``infile`` is passed to ``PDFPage.get_pages``; the returned value is
        the contents of the in-memory buffer the XML converter wrote to.
        """
        output = StringIO()
        manager = PDFResourceManager()
        converter = XMLConverter(manager, output, laparams=LAParams())
        interpreter = PDFPageInterpreter(manager, converter)

        for page in PDFPage.get_pages(infile):
            interpreter.process_page(page)
        converter.close()
        xml = output.getvalue()
        # close must be called; a bare attribute reference does nothing.
        output.close()
        return xml
Esempio n. 11
0
 def getTitle(self, stream):
     """Return the title of the PDF held in ``stream``.

     The title is read from the document information first. When there is
     no document information, no title, an empty title, or the placeholder
     ``'untitled'``, the first page is converted to XML with PDFMiner and
     the result is passed to ``self._getTitleFromXmlStr``.

     Args:
         stream: Seekable binary file object containing the PDF.
     """
     stream.seek(0)
     input1 = PdfFileReader(stream)
     # The reader may return no information object at all.
     info = input1.getDocumentInfo()
     title = info.title if info is not None else None
     # If the title could not be obtained, derive it from the page layout.
     if title is None or title in ['untitled', '']:
         from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
         from pdfminer.pdfpage import PDFPage
         from pdfminer.converter import XMLConverter
         from pdfminer.layout import LAParams
         try:
             from cStringIO import StringIO
         except ImportError:
             from StringIO import StringIO
         # init parameters
         caching = True
         codec = 'utf-8'
         pagenos = {0}  # only the first page is converted
         rsrcmgr = PDFResourceManager(caching=caching)
         outfp = StringIO()
         # convert pdf to xml, using StringIO to store XML
         device = XMLConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               laparams=LAParams(),
                               imagewriter=None,
                               stripcontrol=False)
         interpreter = PDFPageInterpreter(rsrcmgr, device)
         stream.seek(0)
         for page in PDFPage.get_pages(stream,
                                       pagenos,
                                       maxpages=0,
                                       password='',
                                       caching=caching,
                                       check_extractable=True):
             page.rotate = page.rotate % 360
             interpreter.process_page(page)
         device.close()
         xml_data = outfp.getvalue()
         # Encode only text. Encoding an already-encoded byte string forces
         # an implicit ASCII decode on Python 2, which fails on non-ASCII
         # content.
         if not isinstance(xml_data, bytes):
             xml_data = xml_data.encode(codec)
         # parse the xml to get title
         title = self._getTitleFromXmlStr(xml_data)
     return title
Esempio n. 12
0
def extract_pdf_page(filename):
    """Save a PDF's embedded images as PNG files and its pages as XML.

    Creates ``XML_PATH/<stem>/`` and ``XML_PATH/<stem>/images/`` (where
    ``<stem>`` is the file name without extension), writes one PNG per image
    entry of every page into the images folder, converts all pages to XML
    with PDFMiner and writes the XML to ``<stem>-<TIME_NOW>.xml``.

    Args:
        filename: Path of the PDF file.

    Returns:
        The XML document as a string.
    """
    # Paths for creating folder and file
    input_file_name = Path(filename).stem
    output_file_folder = Path(XML_PATH, input_file_name)
    output_file_folder.mkdir(parents=True, exist_ok=True)
    output_file_path = Path(output_file_folder,
                            input_file_name + "-" + TIME_NOW + ".xml")
    output_images_path = Path(XML_PATH, input_file_name, "images")
    output_images_path.mkdir(parents=True, exist_ok=True)

    output_file = io.StringIO()
    laparams = LAParams()
    rsrcmgr = PDFResourceManager()
    device = XMLConverter(rsrcmgr, output_file, laparams=laparams)

    doc = fitz.open(filename)
    try:
        for i in range(len(doc)):
            for img in doc.getPageImageList(i):
                xref = img[0]
                # Both colour-space branches use the same
                # <stem>-<page>-<xref>.png file name.
                png_path = str(output_images_path / (
                    "%s-%s-%s.png" % (input_file_name, i, xref)))
                pix = fitz.Pixmap(doc, xref)
                if pix.n < 5:  # this is GRAY or RGB
                    pix.writePNG(png_path)
                else:  # CMYK: convert to RGB first
                    pix1 = fitz.Pixmap(fitz.csRGB, pix)
                    pix1.writePNG(png_path)
                    pix1 = None
                # Drop the reference so the pixmap can be freed promptly.
                pix = None
    finally:
        doc.close()

    with open(filename, 'rb') as fh:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fh, caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)

    device.close()

    xml = output_file.getvalue()
    with open(output_file_path, 'w', encoding="utf-8") as fd:
        fd.write(xml)

    output_file.close()

    return xml
Esempio n. 13
0
 def __init__(self, src, limit = None):
     """Convert the PDF at ``src`` to XML and keep the cleaned result.

     ``src`` must be at least five characters long and end in ``.pdf``,
     otherwise an ``Exception`` is raised. ``limit`` is forwarded to
     ``process_pdf`` as ``maxpages``. The converter output is passed through
     ``self.cleanText`` and stored in ``self.text``.
     """
     name_is_valid = len(src) >= 5 and src[-4:] == ".pdf"
     if not name_is_valid:
         raise Exception("PDF file has to end in .pdf and has to have a name!")

     pdf = open(src, "rb")
     sink = StringIO()
     resources = PDFResourceManager()
     converter = XMLConverter(resources, sink, codec='UTF-8', laparams=None)
     try:
         process_pdf(resources,
                     converter,
                     pdf,
                     pagenos=None,
                     maxpages=limit,
                     password='',
                     check_extractable=True)
     finally:
         # Both are released whether or not the conversion succeeded.
         converter.close()
         pdf.close()

     raw_xml = sink.getvalue()
     sink.close()
     self.text = self.cleanText(raw_xml)
Esempio n. 14
0
class Miner:
    """Convert a PDF file to text, HTML or XML with PDFMiner.

    The constructor opens the input and output files and builds the
    converter; ``extract`` runs the conversion and closes everything.
    """

    def __init__(self,
                 pdf_file,
                 txt_file,
                 file_format='txt',
                 layout_analysis=True):
        """Open the files and create the converter for ``file_format``.

        Args:
            pdf_file: Path of the PDF to read.
            txt_file: Path of the output file (created or truncated).
            file_format: One of ``'txt'``, ``'html'`` or ``'xml'``.
            layout_analysis: When true a default ``LAParams`` is passed to
                the converter, otherwise ``None``.

        Raises:
            ValueError: If ``file_format`` is not a supported value.
        """
        # Reject unknown formats before touching the file system; otherwise
        # no converter would be created and extract() would fail later.
        if file_format not in ('txt', 'html', 'xml'):
            raise ValueError("unsupported file_format: %r" % (file_format,))

        laparams = LAParams() if layout_analysis else None
        self.rsrcmgr = PDFResourceManager(caching=True)

        # open() works on both Python 2 and 3; file() exists only on Python 2.
        self.pdf_file = open(pdf_file, 'rb')
        try:
            self.outfp = open(txt_file, 'w')
        except Exception:
            # Do not leave the input handle open if the output cannot be opened.
            self.pdf_file.close()
            raise

        if file_format == 'txt':
            converter_cls = TextConverter
        elif file_format == 'html':
            converter_cls = HTMLConverter
        else:
            converter_cls = XMLConverter
        self.device = converter_cls(self.rsrcmgr,
                                    self.outfp,
                                    codec='utf-8',
                                    laparams=laparams,
                                    imagewriter=None)

    def extract(self):
        """Process every page of the PDF, then close the files and converter."""
        try:
            interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)
            pagenos = set()
            for page in PDFPage.get_pages(self.pdf_file,
                                          pagenos,
                                          maxpages=0,
                                          password=None,
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        finally:
            # Release the handles even if a page fails to process.
            self.pdf_file.close()
            try:
                self.device.close()
            finally:
                self.outfp.close()
def pdf2xml(filename):
    """Convert every page of the PDF at ``filename`` to XML with PDFMiner.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        The contents of the in-memory buffer the XML converter wrote to
        (the converter is configured with ``codec='utf-8'``).
    """
    rsrcmgr = PDFResourceManager(caching=True)
    outfp = StringIO.StringIO()
    device = XMLConverter(rsrcmgr, outfp, codec='utf-8', laparams=LAParams(),
                          imagewriter=None)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # open() instead of the Python-2-only file() builtin; the with-block
    # closes the PDF even when a page fails to process.
    with open(filename, 'rb') as fp:
        for page in PDFPage.get_pages(fp, None, maxpages=0, password='',
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
    device.close()

    xml = outfp.getvalue()
    outfp.close()
    return xml
Esempio n. 16
0
 def getTitle(self,stream):
     """Return the title of the PDF held in ``stream``.

     The title is read from the document information first. When there is
     no document information, no title, an empty title, or the placeholder
     ``'untitled'``, the first page is converted to XML with PDFMiner and
     the result is passed to ``self._getTitleFromXmlStr``.

     Args:
         stream: Seekable binary file object containing the PDF.
     """
     stream.seek(0)
     input1 = PdfFileReader(stream)
     # The reader may return no information object at all.
     info = input1.getDocumentInfo()
     title = info.title if info is not None else None
     # If the title could not be obtained, derive it from the page layout.
     if title is None or title in ['untitled','']:
         from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
         from pdfminer.pdfpage import PDFPage
         from pdfminer.converter import XMLConverter
         from pdfminer.layout import LAParams
         try:
             from cStringIO import StringIO
         except ImportError:
             from StringIO import StringIO
         # init parameters
         caching = True
         codec = 'utf-8'
         pagenos = {0}  # only the first page is converted
         rsrcmgr = PDFResourceManager(caching=caching)
         outfp = StringIO()
         # convert pdf to xml, using StringIO to store XML
         device = XMLConverter(rsrcmgr, outfp, codec=codec,
                               laparams=LAParams(),
                               imagewriter=None,
                               stripcontrol=False)
         interpreter = PDFPageInterpreter(rsrcmgr, device)
         stream.seek(0)
         for page in PDFPage.get_pages(stream, pagenos,
                                       maxpages=0, password='',
                                       caching=caching,
                                       check_extractable=True):
             page.rotate = page.rotate % 360
             interpreter.process_page(page)
         device.close()
         xml_data = outfp.getvalue()
         # Encode only text. Encoding an already-encoded byte string forces
         # an implicit ASCII decode on Python 2, which fails on non-ASCII
         # content.
         if not isinstance(xml_data, bytes):
             xml_data = xml_data.encode(codec)
         # parse the xml to get title
         title = self._getTitleFromXmlStr(xml_data)
     return title
Esempio n. 17
0
def pdf_to_xml(pdfpath):
    """Convert every page of the PDF at ``pdfpath`` to XML and return it.

    Args:
        pdfpath: Path of the PDF file to read.

    Returns:
        The contents of the in-memory buffer the XML converter wrote to,
        read after the converter has been closed.
    """
    rsrcmgr = PDFResourceManager()
    sio = StringIO()
    device = XMLConverter(rsrcmgr, sio, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The with-block closes the PDF even when a page fails to process.
    with open(pdfpath, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)

    # PDFMiner's XMLConverter writes the document footer from close(), so
    # the converter is closed before the buffer is read.
    device.close()
    text = sio.getvalue()
    sio.close()

    return text
Esempio n. 18
0
def convert(fname, pages=None):
    """Convert pages of the PDF at ``fname`` to XML and return the bytes.

    Args:
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers, turned into a set and
            passed to ``PDFPage.get_pages``. A falsy value results in an
            empty set being passed.

    Returns:
        The contents of the ``BytesIO`` buffer the XML converter wrote to.
    """
    pagenums = set(pages) if pages else set()

    output = BytesIO()
    manager = PDFResourceManager()
    converter = XMLConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    # The with-block closes the PDF even when a page fails to process.
    with open(fname, "rb") as infile:
        for page in PDFPage.get_pages(infile, pagenums):
            interpreter.process_page(page)
    converter.close()
    text = output.getvalue()
    output.close()
    return text
Esempio n. 19
0
def trasformaPDFinXML(doc, directoryOutfile):  # PDFMiner, only the part needed here (turns a PDF into XML)
    """Convert the PDF at ``doc`` to XML inside ``directoryOutfile``.

    The output file name is built from the input path: spaces are removed,
    ``/`` becomes ``_``, everything from the first ``.pdf`` onwards is
    dropped, and the result is prefixed with ``"outfile "``.

    Args:
        doc: Path of the PDF file.
        directoryOutfile: Directory the output file is written to.

    Returns:
        The generated output file name (without the directory).

    Raises:
        PDFTextExtractionNotAllowed: If the document is not extractable.
    """
    password = ""
    codec = "utf-8"
    laparams = None
    imagewriter = None

    # NOTE(review): spaces are stripped from the path before it is opened,
    # so a path that really contains spaces will not be found -- confirm
    # this is intended.
    doc = doc.replace(" ", "")

    name = doc.replace("/", "_")
    pos = name.find(".pdf")
    # find() returns -1 when there is no ".pdf"; slicing with -1 would chop
    # the last character off the name, so only cut when it was found.
    if pos != -1:
        name = name[:pos]
    name = "outfile " + name

    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()

    # open() instead of the Python-2-only file() builtin; the with-blocks
    # close both files even when extraction is refused or a page fails.
    with open(doc, 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        # Supply the password for initialization.
        document = PDFDocument(parser, password)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        with open(directoryOutfile + "/" + name, "w") as outfp:
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams, imagewriter=imagewriter)
            try:
                # Create a PDF interpreter object.
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # Process each page contained in the document.
                for page in PDFPage.create_pages(document):
                    interpreter.process_page(page)
            finally:
                device.close()

    return name
def parse_pdfs(pdf_filenames):
    """Convert each PDF in ``pdf_filenames`` to an XML file next to it.

    The output path is the input path with its extension replaced by
    ``.xml``. Progress messages are printed for every file.

    Args:
        pdf_filenames: Iterable of PDF file paths.
    """
    # Set parameters
    pagenos = set()
    maxpages = 0
    password = ''
    imagewriter = None
    codec = 'utf-8'
    caching = True
    laparams = LAParams()

    # A single resource manager is reused for all files.
    rsrcmgr = PDFResourceManager(caching=caching)

    # Convert to XML as it retains the most information about text position
    # (compared to text, html, etc).
    for pdf_file in pdf_filenames:

        # A parenthesised single-argument print is valid syntax on Python 3
        # and prints the same text on Python 2.
        print("Converting %s to xml." % pdf_file)

        outfile = os.path.splitext(pdf_file)[0] + '.xml'
        with open(pdf_file, 'rb') as fp, open(outfile, 'w') as outfp:

            device = XMLConverter(rsrcmgr,
                                  outfp,
                                  codec=codec,
                                  laparams=laparams,
                                  imagewriter=imagewriter)
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp,
                                              pagenos,
                                              maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    interpreter.process_page(page)
            finally:
                # Close the converter even if a page fails, while the output
                # file is still open.
                device.close()

        print("Conversion complete.")
Esempio n. 21
0
def convert_pdf_to_xml(path):
    """Convert every page of the PDF at ``path`` to XML and return it.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The contents of the in-memory buffer the XML converter wrote to,
        read after the converter has been closed.
    """
    from pdfminer.converter import XMLConverter
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = XMLConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()

    # open() instead of the Python-2-only file() builtin; the with-block
    # closes the PDF even when a page fails to process.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True):
            interpreter.process_page(page)

    # PDFMiner's XMLConverter writes the document footer from close(), so
    # the converter is closed before the buffer is read.
    device.close()
    text = retstr.getvalue()
    retstr.close()
    return text
Esempio n. 22
0
 def get_xml_data(self):
     """Write the XML representation of ``self.pdffile`` to ``self.xmlfile``.

     Also fills ``self.font_metrics`` with the ``bbox`` and ``descent`` of
     each font held in the resource manager's font cache, keyed by the
     font's ``fontname``.
     """
     rm = PDFResourceManager(caching=True,
                             font_correctors=self.font_correctors)
     rotation = 0
     # The with-block closes both files even when a page fails to process.
     with open(self.xmlfile, "wb") as outfp, \
             open(self.pdffile, "rb") as infile:
         device = XMLConverter(rm,
                               outfp,
                               codec="UTF-8",
                               laparams=LAParams(),
                               imagewriter=None)
         try:
             interpreter = PDFPageInterpreter(rm, device)
             for page in PDFPage.get_pages(infile,
                                           set(),
                                           maxpages=0,
                                           password="",
                                           caching=True,
                                           check_extractable=True):
                 page.rotate = (page.rotate + rotation) % 360
                 interpreter.process_page(page)
             self.font_metrics = {}
             for font in rm._cached_fonts.values():
                 try:
                     self.font_metrics[font.fontname] = {
                         "bbox": font.bbox,
                         "descent": font.descent
                     }
                 except AttributeError:
                     # A font lacking one of these attributes gets no entry;
                     # its attribute list is printed for inspection instead.
                     print((dir(font)))
         finally:
             # Close the converter before its output file is closed.
             device.close()
Esempio n. 23
0
def convert_xml(inf,
                outf,
                page_numbers=None,
                output_type='xml',
                codec='utf-8',
                laparams=None,
                maxpages=0,
                scale=1.0,
                rotation=0,
                output_dir=None,
                strip_control=False,
                debug=False,
                disable_caching=False):
    """Convert the PDF read from ``inf`` to XML written to ``outf``.

    Args:
        inf: Open PDF file object passed to ``PDFPage.get_pages``.
        outf: Output file object the XML converter writes to.
        page_numbers: Page selection passed to ``PDFPage.get_pages``.
        codec: Encoding passed to the XML converter.
        laparams: Layout parameters; a default ``LAParams()`` is used when
            this is ``None``.
        maxpages: Passed to ``PDFPage.get_pages``.
        rotation: Value added to each page's ``rotate`` attribute; the sum
            is reduced modulo 360.
        output_dir: When truthy, an ``ImageWriter`` for this directory is
            given to the converter.
        disable_caching: Inverted and passed as ``caching`` to both the
            resource manager and ``PDFPage.get_pages``.
        output_type, scale, strip_control, debug: Accepted but not used by
            this function.

    Returns:
        The last page processed, or ``None`` when no page was yielded.
    """
    # Honour caller-supplied layout parameters; only fall back to defaults.
    if laparams is None:
        laparams = LAParams()
    imagewriter = ImageWriter(output_dir) if output_dir else None

    rsrcmgr = PDFResourceManager(caching=not disable_caching)

    device = XMLConverter(rsrcmgr,
                          outf,
                          codec=codec,
                          laparams=laparams,
                          imagewriter=imagewriter)

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Pre-bind so the return below is defined when there are no pages.
    page = None
    try:
        for page in PDFPage.get_pages(inf,
                                      page_numbers,
                                      maxpages=maxpages,
                                      caching=not disable_caching,
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
    finally:
        device.close()
    return page
Esempio n. 24
0
def main(argv=None):
    """Command-line entry point: convert PDF file(s) to text, HTML, XML or tags.

    The output type is taken from ``-t``; when that is not given it is
    guessed from the output file's extension and otherwise falls through to
    plain text.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description='Convert PDF into text.')
    # With no positional arguments argparse returns the default unchanged,
    # so it has to be a list already (the loop below iterates it), and it
    # should be the binary view of stdin where one exists.
    parser.add_argument('file',
                        nargs='*',
                        type=argparse.FileType('rb'),
                        default=[getattr(sys.stdin, 'buffer', sys.stdin)],
                        help='file(s) to convert')
    parser.add_argument('-C',
                        '--nocache',
                        dest='cache',
                        action='store_false',
                        help='prevent object caching (slower)')
    parser.add_argument('-l',
                        metavar='level',
                        default='warn',
                        help='logging level (warn, info, debug)')
    parser.add_argument('-p',
                        metavar='page',
                        nargs='+',
                        default=[],
                        type=int,
                        help='page number(s) (space separated)')
    parser.add_argument('-m',
                        metavar='maxpages',
                        default=0,
                        type=int,
                        help='maximum number of pages to extract')
    parser.add_argument('-P',
                        metavar='password',
                        default='',
                        help='pdf password')
    parser.add_argument('-o',
                        metavar='outfile',
                        type=argparse.FileType('w'),
                        default=sys.stdout,
                        help='output file name (default: stdout)')
    parser.add_argument('-O',
                        metavar='directory',
                        type=ImageWriter,
                        help='extract images and save to directory')
    parser.add_argument('-t',
                        metavar='outtype',
                        help='output type (text, html, xml, tag)')
    parser.add_argument('-c',
                        metavar='codec',
                        default='utf-8',
                        help='output text encoding (default: %(default)s)')
    lagroup = parser.add_argument_group(title='layout analysis')
    lagroup.add_argument('-n',
                         action='store_true',
                         help='disable layout analysis')
    lagroup.add_argument('-A',
                         action='store_true',
                         help='force layout analysis on all text')
    lagroup.add_argument('-V',
                         action='store_true',
                         help='detect vertical text')
    lagroup.add_argument('-M',
                         metavar='char_margin',
                         type=float,
                         help='custom character margin')
    lagroup.add_argument('-L',
                         metavar='line_margin',
                         type=float,
                         help='custom line margin')
    lagroup.add_argument('-W',
                         metavar='word_margin',
                         type=float,
                         help='custom word margin')
    lagroup.add_argument('-F',
                         metavar='boxes_flow',
                         type=float,
                         help='custom boxes flow')
    lagroup.add_argument('-Y',
                         metavar='layout_mode',
                         default='normal',
                         help='layout mode for HTML (normal, exact, loose)')
    lagroup.add_argument('-s',
                         metavar='scale',
                         default=1,
                         type=float,
                         help='output scaling for HTML')
    args = parser.parse_args(argv)

    logging.basicConfig()
    logging.getLogger('pdfminer').setLevel(args.l.upper())

    if args.n:
        laparams = None
    else:
        laparams = LAParams()
        laparams.all_texts = args.A
        laparams.detect_vertical = args.V
        # Compare with None rather than testing truthiness, so that an
        # explicit value of 0 on the command line is applied too.
        if args.M is not None:
            laparams.char_margin = args.M
        if args.L is not None:
            laparams.line_margin = args.L
        if args.W is not None:
            laparams.word_margin = args.W
        if args.F is not None:
            laparams.boxes_flow = args.F

    rsrcmgr = PDFResourceManager(caching=args.cache)
    outtype = args.t
    if not outtype and args.o:
        # Guess the output type from the output file's extension.
        outname = args.o.name
        if outname.endswith(('.htm', '.html')):
            outtype = 'html'
        elif outname.endswith('.xml'):
            outtype = 'xml'
        elif outname.endswith('.tag'):
            outtype = 'tag'

    if outtype == 'xml':
        device = XMLConverter(rsrcmgr,
                              args.o,
                              codec=args.c,
                              laparams=laparams,
                              imagewriter=args.O)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr,
                               args.o,
                               codec=args.c,
                               scale=args.s,
                               layoutmode=args.Y,
                               laparams=laparams,
                               imagewriter=args.O)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, args.o, codec=args.c)
    else:
        device = TextConverter(rsrcmgr,
                               args.o,
                               codec=args.c,
                               laparams=laparams,
                               imagewriter=args.O)

    # Page numbers are given 1-based on the command line and shifted down
    # by one before being passed on.
    pagenos = [i - 1 for i in args.p]
    try:
        for fp in args.file:
            try:
                process_pdf(rsrcmgr,
                            device,
                            fp,
                            pagenos,
                            maxpages=args.m,
                            password=args.P,
                            caching=args.cache,
                            check_extractable=True)
            finally:
                fp.close()
    finally:
        # Close the converter and the output even when a file fails.
        device.close()
        if args.o is not sys.stdout:
            args.o.close()
def lambda_handler(event, context):
    """AWS Lambda entry point: convert a newly uploaded PDF to XML and announce it.

    The handler downloads the S3 object named in the first event record to
    ``/tmp``, converts it with pdfminer's ``XMLConverter``, uploads the result
    under the ``xml/`` prefix of the same bucket, and publishes the uploaded
    key to the ``stock_data_extracted`` SNS topic.

    Args:
        event: S3 notification payload; only ``event['Records'][0]`` is read.
        context: Lambda context object (unused).

    Returns:
        None.
    """
    # Grab file that was just uploaded to S3 bucket's "pdf" directory
    s3_record = event['Records'][0]['s3']
    bucket = s3_record['bucket']['name']
    # NOTE(review): urllib.unquote_plus is the Python 2 location of this
    # function; kept as-is because the rest of the module targets it.
    s3_new_arrived_filename = urllib.unquote_plus(
        s3_record['object']['key'].encode('utf8'))
    print('Reading file ' + s3_new_arrived_filename + ' from S3')
    extracted_results_from_pdf = '/tmp/extract.xml'
    downloaded_pdf_file = '/tmp/input.pdf'
    # download file into /tmp
    s3.meta.client.download_file(bucket, s3_new_arrived_filename,
                                 downloaded_pdf_file)
    print('Downloaded file ' + s3_new_arrived_filename + ' from S3')

    # extract pdf into xml and upload xml to S3 bucket's "xml" directory
    resource_mgr = PDFResourceManager()
    retstr = BytesIO()
    device = XMLConverter(resource_mgr, retstr, codec='utf-8',
                          laparams=LAParams())
    interpreter = PDFPageInterpreter(resource_mgr, device)
    try:
        with open(downloaded_pdf_file, 'rb') as infile_pdf_fp:
            for page in PDFPage.get_pages(infile_pdf_fp,
                                          set(),
                                          maxpages=0,
                                          password='',
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        # The converter writes its closing footer when it is closed, so it
        # must be closed BEFORE the buffer is read; reading first is what
        # produced XML without the final </pages> tag.
        device.close()
        data = retstr.getvalue()  # xml data extracted from pdf
    finally:
        retstr.close()

    # Safety net for converter versions that do not emit the footer on close.
    if not data.rstrip().endswith(b'</pages>'):
        data += b'</pages>'

    # write xml (extracted from pdf) to a new file
    print('Opening file ' + extracted_results_from_pdf +
          ' to write extracted xml from ' + s3_new_arrived_filename)
    # Binary mode: the converter was given a codec, so `data` is encoded bytes.
    with open(extracted_results_from_pdf, 'wb') as outfile_xml_fp:
        print('Opened file ' + extracted_results_from_pdf)
        outfile_xml_fp.write(data)

    # Strip any folder prefix and the extension from the uploaded key.
    filename_without_folderprefix_and_ext = re.sub(
        r'.*/', '',
        os.path.splitext(s3_new_arrived_filename)[0])
    extracted_xml_filename_in_s3 = 'xml/' + filename_without_folderprefix_and_ext + '.xml'
    s3.meta.client.upload_file(extracted_results_from_pdf, bucket,
                               extracted_xml_filename_in_s3)

    # Publish to "StockDataExtracted" SNS topic. Send location of newly
    # extracted XML in S3 in the message to SNS topic. This topic triggers the
    # next lambda function - get_recommended_stocks
    message = {"topten_trader_xml_filepath": extracted_xml_filename_in_s3}
    sns_client = boto3.client('sns', region_name='us-east-1')
    sns_client.publish(
        TargetArn='arn:aws:sns:us-east-1:<aws_account_#>:stock_data_extracted',
        Message=json.dumps({'default': json.dumps(message)}),
        Subject='Stock Buy Recommendations ' + str(datetime.date.today()),
        MessageStructure='json')
Esempio n. 26
0
    if fname[-3:] == "pdf":
        # Convert the PDF at `fname` to pdfminer's XML layout dump, written
        # next to the input as "<fname>.txt".

        # Set parameters
        pagenos = set()  # no specific pages selected
        maxpages = 0
        password = ''
        imagewriter = None
        codec = 'utf-8'
        caching = True
        laparams = LAParams()
        outfile = fname + '.txt'

        rsrcmgr = PDFResourceManager(caching=caching)

        # `with` guarantees both handles are released even when a page fails
        # to convert; open() replaces the Python-2-only file() builtin.
        with open(outfile, 'w') as outfp:
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams,
                                  imagewriter=imagewriter)
            try:
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):
                        interpreter.process_page(page)
            finally:
                # Close the converter while the output file is still open so
                # anything it emits on close reaches the file.
                device.close()
Esempio n. 27
0
def get_placeholder_image_info(filename, xmlfile, outputdir):
    """Extract a PDF's images and describe each one found on disk.

    The PDF is converted to XML (written to ``outputdir/xmlfile``) with a
    ``MyImageWriter`` targeting ``outputdir``. Every ``<image>`` element of
    that XML whose ``src`` file exists in ``outputdir`` is then passed to
    ``get_image_tag`` and recorded.

    Args:
        filename: Path of the PDF to read.
        xmlfile: File name (inside ``outputdir``) for the intermediate XML.
        outputdir: Directory for the XML and extracted images; created when
            it does not exist.

    Returns:
        dict with two keys:
            ``image_info``: list of dicts (``id``, ``src``, ``height``,
                ``width``, ``bbox``, ``tag``), one per image file that exists.
                ``id`` is the image's index among all ``<image>`` elements.
            ``placeholder_imgs``: positions within ``image_info`` of the
                entries whose tag is not ``None``.
    """
    if not os.path.isdir(outputdir):
        os.makedirs(outputdir)

    password = ''
    caching = True
    rotation = 0
    maxpages = 0
    pagenos = set()
    codec = 'utf-8'
    outfile = os.path.join(outputdir, xmlfile)

    rsrcmgr = PDFResourceManager(caching=caching)
    # Context managers release both handles even if a page fails to convert.
    with open(outfile, 'w') as outfp:
        device = XMLConverter(rsrcmgr,
                              outfp,
                              codec=codec,
                              laparams=LAParams(),
                              imagewriter=MyImageWriter(outputdir))
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(filename, 'rb') as fp:
                for page in PDFPage.get_pages(fp,
                                              pagenos,
                                              maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        finally:
            # Close the converter before the output file is closed.
            device.close()

    root = lxml.etree.parse(outfile)

    image_info = []
    placeholder_imgs = []
    jpg_count = 0  # counts only the images that exist on disk
    for i, e in enumerate(root.findall('.//image')):
        imgpth = os.path.join(outputdir, e.attrib['src'])
        if not os.path.exists(imgpth):
            print("path doesnt exist - tag is none for " + imgpth)
            continue

        tag = get_image_tag(imgpth)
        image_info.append({
            "id": i,
            "src": imgpth,
            "height": e.attrib['height'],
            "width": e.attrib['width'],
            # Take the bbox from the element that actually encloses this
            # image. Indexing a separate list of figures by image position
            # misaligns (or raises IndexError) when a figure holds more than
            # one image.
            "bbox": e.getparent().attrib['bbox'],
            "tag": tag
        })
        if tag is not None:
            placeholder_imgs.append(jpg_count)
        jpg_count += 1

    return {'image_info': image_info, 'placeholder_imgs': placeholder_imgs}
Esempio n. 28
0
def main(argv=None):
    """Command-line entry point: convert PDF file(s) to text, HTML, XML or tags.

    The output type comes from ``-t``; when that is absent it is inferred
    from the extension of the ``-o`` file name (.htm/.html, .xml, .tag) and
    otherwise falls back to plain text.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description='Convert PDF into text.')
    # The default must be a LIST holding one stream: with a bare stream the
    # loop below would iterate over its lines. PDF data is binary, so use the
    # underlying byte stream where the interpreter exposes one.
    stdin_stream = getattr(sys.stdin, 'buffer', sys.stdin)
    parser.add_argument('file', nargs='*', type=argparse.FileType('rb'), default=[stdin_stream], help='file(s) to convert')
    parser.add_argument('-C', '--nocache', dest='cache', action='store_false', help='prevent object caching (slower)')
    parser.add_argument('-l', metavar='level', default='warn', help='logging level (warn, info, debug)')
    parser.add_argument('-p', metavar='page', nargs='+', default=[], type=int, help='page number(s) (space separated)')
    parser.add_argument('-m', metavar='maxpages', default=0, type=int, help='maximum number of pages to extract')
    parser.add_argument('-P', metavar='password', default='', help='pdf password')
    parser.add_argument('-o', metavar='outfile', type=argparse.FileType('w'), default=sys.stdout,
                        help='output file name (default: stdout)')
    parser.add_argument('-O', metavar='directory', type=ImageWriter, help='extract images and save to directory')
    parser.add_argument('-t', metavar='outtype', help='output type (text, html, xml, tag)')
    parser.add_argument('-c', metavar='codec', default='utf-8', help='output text encoding (default: %(default)s)')
    lagroup = parser.add_argument_group(title='layout analysis')
    lagroup.add_argument('-n', action='store_true', help='disable layout analysis')
    lagroup.add_argument('-A', action='store_true', help='force layout analysis on all text')
    lagroup.add_argument('-V', action='store_true', help='detect vertical text')
    lagroup.add_argument('-M', metavar='char_margin', type=float, help='custom character margin')
    lagroup.add_argument('-L', metavar='line_margin', type=float, help='custom line margin')
    lagroup.add_argument('-W', metavar='word_margin', type=float, help='custom word margin')
    lagroup.add_argument('-F', metavar='boxes_flow', type=float, help='custom boxes flow')
    lagroup.add_argument('-Y', metavar='layout_mode', default='normal', help='layout mode for HTML (normal, exact, loose)')
    lagroup.add_argument('-s', metavar='scale', default=1, type=float, help='output scaling for HTML')
    args = parser.parse_args(argv)

    logging.basicConfig()
    logging.getLogger('pdfminer').setLevel(args.l.upper())

    if args.n:
        laparams = None
    else:
        laparams = LAParams()
        laparams.all_texts = args.A
        laparams.detect_vertical = args.V
        # Compare against None rather than truthiness so that an explicit
        # 0 on the command line (e.g. `-F 0`) is honoured, not dropped.
        overrides = (
            ('char_margin', args.M),
            ('line_margin', args.L),
            ('word_margin', args.W),
            ('boxes_flow', args.F),
        )
        for attr, value in overrides:
            if value is not None:
                setattr(laparams, attr, value)

    rsrcmgr = PDFResourceManager(caching=args.cache)

    outtype = args.t
    if not outtype:
        # Infer the format from the output file's extension.
        outname = args.o.name
        if outname.endswith(('.htm', '.html')):
            outtype = 'html'
        elif outname.endswith('.xml'):
            outtype = 'xml'
        elif outname.endswith('.tag'):
            outtype = 'tag'

    if outtype == 'xml':
        device = XMLConverter(rsrcmgr, args.o, codec=args.c, laparams=laparams, imagewriter=args.O)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, args.o, codec=args.c, scale=args.s, layoutmode=args.Y,
                               laparams=laparams, imagewriter=args.O)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, args.o, codec=args.c)
    else:
        device = TextConverter(rsrcmgr, args.o, codec=args.c, laparams=laparams, imagewriter=args.O)

    # Page numbers given with -p are shifted down by one before being passed on.
    pagenos = [i - 1 for i in args.p]
    try:
        for fp in args.file:
            process_pdf(rsrcmgr, device, fp, pagenos, maxpages=args.m, password=args.P,
                        caching=args.cache, check_extractable=True)
            fp.close()
    finally:
        # Release everything even when a conversion fails part-way; closing
        # an already-closed file is a no-op.
        for fp in args.file:
            fp.close()
        device.close()
        if args.o is not sys.stdout:
            args.o.close()
Esempio n. 29
0
def get_text(url, parse=True, laparams=laparams):
    """Download ``<url>v1.full.pdf`` and return its content as ``(suffix, payload)``.

    The PDF is fetched (retrying with exponential backoff while the server
    answers HTTP 429), converted to pdfminer XML, and, when ``parse`` is
    true, flattened into paragraphs of plain text.

    Args:
        url: URL prefix; ``'v1.full.pdf'`` is appended to it.
        parse: When false, stop after the PDF -> XML step and return the XML.
        laparams: Layout parameters handed to ``XMLConverter``.

    Returns:
        A 2-tuple whose first item names the kind of payload:
            ``('.raw.txt', bytes)``  downloaded body; PDF -> XML failed.
            ``('.xml', bytes)``      XML; ``parse`` was false.
            ``('.txt', str)``        parsed paragraphs joined by a separator.
            ``('.raw.xml', bytes)``  XML; post-processing failed.
    """
    url += 'v1.full.pdf'
    max_attempts = 4
    attempts = 0
    print(url)
    while attempts < max_attempts:
        r = requests.get(url)
        if r.status_code != 429:
            break
        # If rate limited, wait and try again (in seconds)
        time.sleep((2**attempts) + random.random())
        attempts = attempts + 1
    data = r.content

    # Best-effort by design: any conversion failure falls back to the raw
    # download. `Exception` (not a bare except) so Ctrl-C still propagates.
    try:
        pdf_data = _pdf_bytes_to_xml(data, laparams)
    except Exception:
        return ('.raw.txt', data)

    if not parse:
        return ('.xml', pdf_data)

    try:
        root = ET.fromstring(pdf_data)
        lines = _xml_to_marked_text(root).split('<<NEWLINE>>')
        for line in lines[:5]:
            print(line)

        doc = _join_lines(lines)
        print(doc[:50])

        parsed = []
        for _text in doc.split('<<PARAGRAPH>>'):
            _text = _clean_paragraph(_text)
            if len(_text.strip(' \n')) > 0:
                # A single leftover marker is treated as a line break.
                if len(re.findall(r'<<NEWFONT>>', _text)) == 1:
                    _text = re.sub(r'<<NEWFONT>>', '\n', _text)
                parsed.append(_text)

        # Glue a paragraph that starts with a lower-case letter onto the
        # slot before it; its own slot is left empty and dropped below.
        parsed2 = [parsed[0]]
        for p in parsed[1:]:
            if re.search(r'^\s*\n*[a-z]', p) is not None:
                parsed2[-1] += p
                p = ''
            parsed2.append(p)
        parsed2 = '\n===================================\n'.join(
            [p for p in parsed2 if p != ''])

        print(parsed2[:50])
        return ('.txt', parsed2)
    except Exception:
        return ('.raw.xml', pdf_data)


def _pdf_bytes_to_xml(data, laparams):
    """Run pdfminer's XMLConverter over PDF bytes and return the XML bytes."""
    f = io.BytesIO(data)
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    device = XMLConverter(rsrcmgr, retstr, codec='utf-8',
                          laparams=laparams)  # , rect_colors=rect_colors)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(f,
                                  set(),
                                  maxpages=0,  # is for all
                                  password="",
                                  caching=True,
                                  check_extractable=True):
        interpreter.process_page(page)
    device.close()
    pdf_data = retstr.getvalue()
    retstr.close()
    return pdf_data


def _xml_to_marked_text(root):
    """Flatten ``<textline>/<text>`` elements into one marker-annotated string.

    ``<<NEWLINE>>`` is emitted for attribute-less ``<text>`` elements that do
    not start with a space, and ``<<NEWFONT>>`` precedes a character whose
    font or size differs from the current one.
    """
    first = root.find('.//text')
    curr_font = first.get('font')
    curr_size = float(first.get('size'))
    pieces = []

    # x threshold below which characters are treated as margin content; it is
    # lowered whenever alphabetic text is found left of it.
    rmargin = 70

    for textline in root.iterfind('.//textline'):
        for t in textline.findall('./text'):
            if (t.get('font') or t.get('size')) is None:
                # Attribute-less element: either a space or a line end.
                if t.text[0] == ' ':
                    pieces.append(' ')
                else:
                    pieces.append('<<NEWLINE>>')
                continue

            x0, y0, _x1, _y1 = [float(z) for z in t.get('bbox').split(',')]
            char_size = float(t.get('size', 0))
            char_font = t.get('font', '')
            # NOTE(review): presumably the page header/footer bands - confirm.
            if y0 > 750 or y0 < 75:
                continue
            if x0 < rmargin:
                if re.search('[A-Za-z]+', t.text) is not None:
                    print('changing rmargin to ', str(x0 - 1))
                    rmargin = x0 - 1
                    pieces.append(t.text)
                continue

            if (char_size != curr_size) or (char_font != curr_font):
                # Characters of size 8 or less are dropped when they change
                # the font; the current font/size is left untouched.
                if char_size <= 8.:
                    continue
                pieces.append('<<NEWFONT>>' + t.text)
                curr_font = t.get('font')
                curr_size = float(t.get('size'))
            else:
                pieces.append(t.text)
    return ''.join(pieces)


def _join_lines(lines):
    """Merge physical lines into one string, inserting ``<<PARAGRAPH>>`` marks.

    Paragraph heuristics are applied only while no parenthesis opened on an
    earlier line is still unclosed.
    """
    parts = [lines[0]]
    # Keep a break after the first line unless the second line looks like a
    # continuation (starts with a lower-case letter or an opening paren).
    if re.search(r'^\s*[a-z(]', lines[1]) is None:
        parts.append('\n')

    open_parens = False
    for prev, t in zip(lines, lines[1:]):
        if not t:
            if not open_parens:
                parts.append('\n')
            continue

        # A parenthesis still open from earlier lines counts as one more '('.
        opened = t.count('(') + (1 if open_parens else 0)
        open_parens = opened > t.count(')')

        if not open_parens:
            if t.startswith(' '):
                t = re.sub(r'^ +', '<<PARAGRAPH>>', t)
            if t.lstrip(' ').startswith('<<NEWFONT>>') and prev.rstrip(
                    ' ').endswith('.'):
                t = re.sub(r'^<<NEWFONT>>', '<<PARAGRAPH>>', t.lstrip(' '))
            if t.rstrip(' ').endswith('.'):
                t += '<<PARAGRAPH>>'
            if re.match(r'^\d{1,3}\.<<NEWFONT>>', t):
                t = '<<PARAGRAPH>>' + t
        parts.append(t)

    doc = ''.join(parts)
    doc = re.sub(r'(?<=[^.])\n+', '', doc)
    doc = re.sub(r' {3,}', '<<PARAGRAPH>>', doc)
    return doc


def _clean_paragraph(_text):
    """Resolve the font/line markers inside one paragraph; order matters."""
    _text = re.sub('(<<NEWLINE>>)+', '\n', _text)
    _text = re.sub(r'  ', r'\n', _text)
    _text = re.sub(
        r'<<NEWFONT>>(?P<url>http[a-zA-Z0-9./+?_=:-]+)( <<NEWFONT>>)?',
        r'\g<url>', _text)
    _text = re.sub(r'<<NEWFONT>> <<NEWFONT>>', r' ', _text)
    # flags must be passed by keyword: the 4th positional argument of re.sub
    # is `count`, so a positional re.M silently meant "at most 8 replacements".
    _text = re.sub(r'\(<<NEWFONT>>(.+)<<NEWFONT>>\)', r'(\g<1>)', _text,
                   flags=re.M)
    _text = re.sub(
        r'<<NEWFONT>>(((\W|\d)+)|([A-Za-z_-]{1,2}\n?))<<NEWFONT>>',
        r'\g<1>', _text)
    _text = re.sub(r'<<NEWFONT>>([A-Za-z- :]+)<<NEWFONT>>([.:]?)',
                   r'\g<1>\g<2>\n', _text)
    _text = re.sub(r'<<NEWFONT>>([A-Za-z_-]{1,3} *\n?)<<NEWFONT>>',
                   r'\g<1>', _text)
    _text = re.sub(r'<<NEWFONT>>(.+)<<NEWFONT>>([a-z]+)', r'\g<1> \g<2>',
                   _text)
    _text = re.sub(r'<<NEWFONT>>(.+)<<NEWFONT>>(\W*)\.?', r'\g<1> \g<2>',
                   _text)
    _text = re.sub(r'-\n', r'-', _text)
    # The replacement previously ended in '))', adding an unbalanced closing
    # parenthesis to every parenthetical that spanned a line break.
    _text = re.sub(r'\((.+)(?:\n)(.+)\)', r'(\g<1>\g<2>)', _text)
    _text = re.sub(r'\((.+)<<NEWFONT>>(.+)\)', r'(\g<1>\g<2>)', _text,
                   flags=re.M)
    return _text
Esempio n. 30
0
# Script: dump the pdfminer XML layout of the PDF named on the command line
# to standard output.
if len(sys.argv) < 2:
    # Fail with a readable message instead of a bare IndexError traceback.
    sys.exit('usage: %s file.pdf' % sys.argv[0])

laparams = LAParams()
imagewriter = None
codec = 'utf-8'
outfp = sys.stdout
stripcontrol = True
pagenos = set()

fname = sys.argv[1]

rsrcmgr = PDFResourceManager(caching=True)
device = XMLConverter(rsrcmgr,
                      outfp,
                      codec=codec,
                      laparams=laparams,
                      imagewriter=imagewriter,
                      stripcontrol=stripcontrol)

# open() replaces the Python-2-only file() builtin, and the context manager
# closes the input even when a page fails to convert.
with open(fname, 'rb') as fp:
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    interpreter.debug = 1
    for page in PDFPage.get_pages(fp,
                                  pagenos,
                                  maxpages=0,
                                  password='',
                                  caching=True,
                                  check_extractable=True):
        interpreter.process_page(page)
device.close()
outfp.close()