Example #1
0
def convert_pdf_to_txt(path):
    """Extract the text of a PDF and return it as one string.

    Despite its name, ``path`` is used directly as the input stream: it is
    handed to ``PDFPage.get_pages`` and closed before returning. Callers
    must therefore pass an already-open file object (presumably opened in
    binary mode -- confirm at the call sites), not a filesystem path.

    :param path: open file-like object containing the PDF; this function
        closes it.
    :return: the contents of the ``StringIO`` buffer that ``TextConverter``
        wrote into.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    laparams.word_margin = 0.15
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
    fp = path
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
        return retstr.getvalue()
    finally:
        # Release everything even when a page fails to parse; the original
        # only cleaned up on the success path.
        fp.close()
        device.close()
        retstr.close()
Example #2
0
def read_pdf(path):
    """Extract the text of a PDF twice: once with PyPDF2, once with pdfminer.

    :param path: filesystem path of the PDF file.
    :return: ``(extracted_text, pypdf_text)``. ``extracted_text`` is the
        concatenated text of the pdfminer text boxes/lines; ``pypdf_text``
        is the PyPDF2 text with non-breaking spaces replaced and every run
        of whitespace collapsed to a single space.
    """
    # PyPDF2 pass. The handle is closed as soon as all pages are read.
    with open(path, 'rb') as pypdf_fp:
        pdf_reader = PyPDF2.PdfFileReader(pypdf_fp)
        raw_text = ''.join(
            pdf_reader.getPage(page_number).extractText() + "\n"
            for page_number in range(pdf_reader.numPages))
    # split() with no argument already drops leading/trailing whitespace.
    pypdf_text = " ".join(raw_text.replace(u"\xa0", " ").split())

    # pdfminer pass on its own handle, closed when the block exits.
    chunks = []
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')  # empty password
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    chunks.append(lt_obj.get_text())
    return ''.join(chunks), pypdf_text
Example #3
0
def parse_pdf(path, pdf):
    """Extract the text of ``pdf`` and write it to ``path + '/result.txt'``.

    The original reopened (and truncated) the output file once per page,
    never closed the input, and failed with an unbound ``text_file`` when
    the document had no pages. The output is now opened exactly once, after
    extraction, so a zero-page document yields an empty ``result.txt``.

    :param path: directory in which ``result.txt`` is created.
    :param pdf: filesystem path of the PDF to read.
    :return: the (already closed) file object used to write ``result.txt``,
        kept for compatibility with existing callers.
    """
    extracted_text = ''
    with open(pdf, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')  # empty password
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    extracted_text += lt_obj.get_text()
                    # Progress output preserved from the original: prints
                    # everything accumulated so far after each text object.
                    print(extracted_text)

    with open(path + '/result.txt', "w") as text_file:
        text_file.write(extracted_text)
    # The statements that followed the original `return` (assignments to the
    # globals file_one / file_two) were unreachable and have been dropped.
    return text_file
Example #4
0
def get_pdf_text(pdf_file):
    """Return the text of a PDF, falling back to its images when empty.

    The text of every pdfminer text box/line is concatenated. If that
    produces an empty string, the result of ``get_text_from_image`` for each
    file returned by ``get_pdf_images(pdf_file)`` is concatenated instead.

    :param pdf_file: filesystem path of the PDF file.
    :return: the collected text.
    """
    text_parts = []

    with open(pdf_file, 'rb') as file_hdl:
        pdf_parser = PDFParser(file_hdl)
        document = PDFDocument()
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        document.initialize('')

        resource_manager = PDFResourceManager()
        layout_params = LAParams()
        layout_params.char_margin = 1.0
        layout_params.word_margin = 1.0
        aggregator = PDFPageAggregator(resource_manager,
                                       laparams=layout_params)
        page_interpreter = PDFPageInterpreter(resource_manager, aggregator)

        for pdf_page in document.get_pages():
            page_interpreter.process_page(pdf_page)
            for element in aggregator.get_result():
                if isinstance(element, (LTTextBox, LTTextLine)):
                    text_parts.append(element.get_text())

    pdf_text = "".join(text_parts)

    # No text at all: try the images extracted from the same file.
    if not pdf_text:
        for img_file in get_pdf_images(pdf_file):
            pdf_text += get_text_from_image(img_file)

    return pdf_text
Example #5
0
def initialize_pdf_miner(fh, password = None):
    """Build the pdfminer objects needed to run layout analysis on a PDF.

    :param fh: open file object for the PDF.
    :param password: document password; ``None`` is treated as "no
        password" and forwarded to ``PDFDocument`` as an empty string.
    :return: ``(doc, interpreter, device)`` where ``device`` is a
        ``PDFPageAggregator`` configured with ``word_margin = 0.0`` and
        ``interpreter`` is the ``PDFPageInterpreter`` bound to it.
    :raises ValueError: if ``doc.is_extractable`` is false.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fh)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument(parser, '' if password is None else password)

    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        raise ValueError("PDFDocument is_extractable was False.")
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()

    # The original also built a plain PDFDevice/PDFPageInterpreter pair here
    # that was overwritten below without ever being used; it is omitted.

    # Set parameters for analysis.
    laparams = LAParams()
    laparams.word_margin = 0.0
    # Create a PDF page aggregator object and the interpreter that feeds it.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return doc, interpreter, device
Example #6
0
def initialize_pdf_miner(fh):
    """Build the pdfminer objects needed to run layout analysis on a PDF.

    Unlike a stricter variant, this version does not reject documents whose
    ``is_extractable`` flag is false (the original check was a no-op).

    :param fh: open file object for the PDF.
    :return: ``(doc, interpreter, device)`` where ``device`` is a
        ``PDFPageAggregator`` and ``interpreter`` is the
        ``PDFPageInterpreter`` bound to it.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fh)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument(parser)
    # Connect the parser and document objects.
    parser.set_document(doc)

    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()

    # Set parameters for analysis. word_margin is given its final value
    # (0.0) directly; the original passed 0.1 and then overwrote it.
    laparams = LAParams(line_overlap=0.3, char_margin=1.0, line_margin=0.5,
                        word_margin=0.0, boxes_flow=0.1,
                        detect_vertical=False, all_texts=False)

    # Create a PDF page aggregator object and the interpreter that feeds it.
    # (The unused PDFDevice/PDFPageInterpreter pair of the original is gone.)
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return doc, interpreter, device
Example #7
0
def extract_text_from_pdf(path):
    """Extract the text of a PDF file with pdfminer.

    The original docstring notes that this approach (taken from Stack
    Overflow) works for Asian-language documents.

    :param path: ``str`` path to the pdf file, e.g. './folder_name/text.pdf'.
    :return: ``str`` with the text of all text boxes/lines concatenated in
        the order pdfminer reports them.
    """
    pieces = []
    # The context manager closes the file even if parsing fails; the
    # original never closed it.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')  # empty password
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            pieces.extend(
                lt_obj.get_text() for lt_obj in device.get_result()
                if isinstance(lt_obj, (LTTextBox, LTTextLine)))

    return ''.join(pieces)
Example #8
0
 def getTextPdf(self, filename, max_pages=4):
     """Return the text of a short PDF, or '' if it is too long or unreadable.

     The page count is read with PyPDF2 first (and printed); documents with
     more than ``max_pages`` pages are skipped. Otherwise the text of all
     pdfminer text boxes/lines is concatenated.

     :param filename: filesystem path of the PDF file.
     :param max_pages: largest page count that is still extracted
         (default 4, the limit the original hard-coded).
     :return: the extracted text, or '' when the document is too long or
         any error occurs while reading it.
     """
     try:
         with open(filename, 'rb') as pdfFileObj:
             numeroDePaginas = PyPDF2.PdfFileReader(pdfFileObj).numPages
         print(numeroDePaginas)
         if numeroDePaginas > max_pages:
             return ''

         chunks = []
         # Second handle for pdfminer; closed on exit (it used to leak).
         with open(filename, 'rb') as pdf_fp:
             parser = PDFParser(pdf_fp)
             doc = PDFDocument()
             parser.set_document(doc)
             doc.set_parser(parser)
             doc.initialize('')  # empty password
             rsrcmgr = PDFResourceManager()
             laparams = LAParams()
             laparams.char_margin = 1.0
             laparams.word_margin = 1.0
             device = PDFPageAggregator(rsrcmgr, laparams=laparams)
             interpreter = PDFPageInterpreter(rsrcmgr, device)
             for page in doc.get_pages():
                 interpreter.process_page(page)
                 for lt_obj in device.get_result():
                     if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                         chunks.append(lt_obj.get_text())
         return ''.join(chunks)
     except Exception:
         # Deliberately best-effort: any failure yields an empty string.
         # `Exception` (rather than a bare except) lets KeyboardInterrupt
         # and SystemExit propagate.
         return ''
def convert_pdf_to_txt(path):
    """Extract the text of the PDF at ``path`` with pdfminer.

    :param path: filesystem path of the PDF file.
    :return: the text of all text boxes/lines, concatenated in the order
        pdfminer reports them.
    """
    pieces = []
    # `with` closes the file on every exit path; the original only closed
    # it when extraction succeeded.
    with open(path, 'rb') as fp:
        # setup
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')  # empty password
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    pieces.append(lt_obj.get_text())

    return ''.join(pieces)
Example #10
0
def extract_text_from_pdf(path_in, path_out, fichier, page_beg, page_end=0):
    """Extract the text of a 1-based, inclusive page range of a PDF.

    Each page's text is preceded by an "EXTRACTION DE LA PAGE <n>" header
    and every text box/line is followed by a newline.

    :param path_in: directory containing the PDF.
    :param path_out: unused by this function; kept for interface
        compatibility.
    :param fichier: file name of the PDF inside ``path_in``.
    :param page_beg: first page to extract (1-based).
    :param page_end: last page to extract (inclusive); ``0`` means "only
        ``page_beg``".
    :return: the extracted text.
    :raises ValueError: if ``page_beg`` is smaller than 1. Previously such a
        value silently wrapped around to the end of the document through
        negative list indexing.
    :raises IndexError: if the range extends past the last page.
    """
    if page_beg < 1:
        raise ValueError("page_beg must be >= 1 (pages are numbered from 1)")
    if page_end == 0:
        page_end = page_beg

    parts = []
    # The input file is now closed on every exit path (it used to leak).
    with open(path_in + '/' + fichier, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')  # empty password

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        # 2.0 by default: two chars closer than this value are considered
        # contiguous and get grouped into one.
        laparams.char_margin = 4.0
        # 0.1 by default: a space is inserted when the distance between two
        # words is greater than this value.
        laparams.word_margin = 0.3
        # 0.5 by default: lines closer than this value are grouped.
        laparams.line_margin = 0.5

        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        pages = list(doc.get_pages())
        for i in range(page_beg - 1, page_end):
            page = pages[i]
            parts.append("EXTRACTION DE LA PAGE " + str(i + 1) + "\n\n")
            interpreter.process_page(page)
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    parts.append(lt_obj.get_text())
                    parts.append("\n")

    return ''.join(parts)
Example #11
0
def convertPDF2txt(fname, pages=None):
    """Extract the text of a PDF, print it, and return it.

    :param fname: filesystem path of the PDF file.
    :param pages: unused by this function; kept for interface compatibility.
    :return: the text of all text boxes/lines, concatenated in the order
        pdfminer reports them.
    """
    pieces = []
    # The original passed an anonymous open() result to PDFParser, so the
    # handle could never be closed; it is now managed explicitly.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')  # empty password
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        # Per the original author: these two parameters were changed to get
        # rid of white spaces inside words.
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    pieces.append(lt_obj.get_text())

    extracted_text = ''.join(pieces)
    print(extracted_text)
    return extracted_text
Example #12
0
def get_text_rows(path):
    """Yield the horizontal text boxes of a PDF grouped into rows.

    Every ``LTTextBoxHorizontal`` (including those nested inside
    ``LTFigure`` containers) is stored under the key
    ``(page index, -int(bbox[1]))`` as a tuple
    ``(int(bbox[0]), sanitize(text))``. Keys are then emitted in sorted
    order, i.e. by page and, within a page, by decreasing ``bbox[1]``.

    :param path: filesystem path of the PDF file.
    :return: generator of ``(page, y, cells)`` tuples where ``page`` is the
        0-based page index, ``y`` is ``int(bbox[1])`` and ``cells`` is the
        sorted list of ``(int(bbox[0]), text)`` tuples for that row.
    """
    rows = defaultdict(list)

    def parse_obj(lt_objs, page):
        # Walk the layout objects, recording text boxes and recursing into
        # figures.
        for obj in lt_objs:
            if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
                rows[(page, -int(obj.bbox[1]))].append(
                    (int(obj.bbox[0]), sanitize(obj.get_text())))
            elif isinstance(obj, pdfminer.layout.LTFigure):
                parse_obj(obj._objs, page)

    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()

    # Set parameters for layout analysis.
    laparams = LAParams()
    laparams.line_overlap = 0.01
    laparams.line_margin = 0.01
    laparams.word_margin = 0.15

    # Page aggregator plus the interpreter that feeds it. The plain
    # PDFDevice the original created first was never used and is omitted.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # All pages are read before the first row is yielded, so the file can be
    # closed here (the original left it open).
    with open(path, 'rb') as fp:
        for page_num, page in enumerate(PDFPage.get_pages(fp)):
            interpreter.process_page(page)
            parse_obj(device.get_result()._objs, page_num)

    for key in sorted(rows):
        page, neg_y = key
        yield (page, -neg_y, sorted(rows[key]))
Example #13
0
def to_txt(infile: str, outfile: str):
    """
    Convert a pdf file to txt.

    The output file is opened as UTF-8 text with ``errors='ignore'`` and is
    created (or truncated) before the input is opened.

    :param infile: pdf file path;
    :param outfile: txt file path;
    :return: txt file path;
    """
    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)
    laparams = LAParams()
    laparams.word_margin = 0.0
    laparams.line_margin = 1.0

    # Context managers plus try/finally release both files and the device
    # even when process_pdf raises; the original only cleaned up on success.
    with io.open(outfile, 'wt', encoding='utf-8', errors='ignore') as outfp:
        device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        try:
            with io.open(infile, 'rb') as fp:
                process_pdf(rsrcmgr,
                            device,
                            fp,
                            set(),
                            maxpages=0,
                            password='',
                            caching=caching,
                            check_extractable=True)
        finally:
            # Close the device before the stream it writes to.
            device.close()
    return outfile
Example #14
0
def parse_qp(docket_number):
    """Download a "questions presented" PDF and return its text.

    The docket number is normalised first: values containing "-Orig" become
    "<first part> orig"; all others become "<term>-<number zero-padded to 5
    digits>". The PDF is fetched from
    ``https://www.supremecourt.gov/qp/<docket>qp.pdf``.

    Text boxes/lines containing "LOWER COURT CASE NUMBER:" or
    "DECISION BELOW:" are dropped, "(cid:160)" is replaced by a space and
    runs of spaces are collapsed.

    :param docket_number: docket number string, split on "-".
    :return: the cleaned text.
    """
    pieces = docket_number.split("-")
    if "-Orig" in docket_number:
        docket = pieces[0] + ' orig'
    else:
        docket = '{term}-{num:05d}'.format(term=pieces[0],
                                           num=int(pieces[1]))

    url = "https://www.supremecourt.gov/qp/" + docket + "qp.pdf"
    response = requests.get(url)
    stream = io.BytesIO(response.content)

    pdf_parser = PDFParser(stream)
    document = PDFDocument()
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    document.initialize('')
    resource_manager = PDFResourceManager()
    layout_params = LAParams()
    layout_params.char_margin = 1.0
    layout_params.word_margin = 1.0
    aggregator = PDFPageAggregator(resource_manager, laparams=layout_params)
    page_interpreter = PDFPageInterpreter(resource_manager, aggregator)

    kept = []
    for pdf_page in document.get_pages():
        page_interpreter.process_page(pdf_page)
        for element in aggregator.get_result():
            if not isinstance(element, (LTTextBox, LTTextLine)):
                continue
            text = element.get_text().replace("(cid:160)", " ")
            # Skip the boilerplate lines about the lower court.
            if "LOWER COURT CASE NUMBER:" in text or "DECISION BELOW:" in text:
                continue
            kept.append(text)

    return re.sub(' +', ' ', ''.join(kept))
Example #15
0
    def parse(self, path):
        """Extract the text of the PDF at ``path`` into a ``Sample``.

        :param path: filesystem path of a PDF file. Directories are not
            supported.
        :return: ``Sample(path, None, text)`` where ``text`` is what
            ``TextConverter`` wrote into the in-memory buffer.
        :raises NotImplementedError: if ``path`` is a directory.
        """
        # Directory
        if os.path.isdir(path):
            raise NotImplementedError()

        out = StringIO.StringIO()
        rsrc = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 2.0
        laparams.line_margin = 2.0
        laparams.word_margin = 0.0
        device = TextConverter(rsrc, out, codec='utf-8', laparams=laparams)

        # File: opened in binary mode (PDF is a binary format; the original
        # used the text-mode default) and closed when the block exits (the
        # original never closed it).
        with open(path, 'rb') as fp:
            doc = PDFDocument()
            parser = PDFParser(fp)
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize()
            interpreter = PDFPageInterpreter(rsrc, device)
            for page in doc.get_pages():
                interpreter.process_page(page)

        device.close()
        sample = Sample(path, None, out.getvalue())
        out.close()
        return sample
Example #16
0
def extractText(file_name):
    """
    Extract the text of the PDF file ``file_name`` with pdfminer.

    :param file_name: filesystem path of the PDF file.
    :return: the text of all text boxes/lines, concatenated in the order
        pdfminer reports them.
    """
    pieces = []
    # The handle is closed on every exit path; it previously leaked.
    with open(file_name, 'rb') as connection:
        parser = PDFParser(connection)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')  # empty password
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    pieces.append(lt_obj.get_text())
    return ''.join(pieces)
Example #17
0
def extract_layout_by_page(pdf_path, page_number):
    """
    Run pdfminer layout analysis on a single page of a PDF.

    :param pdf_path:  pdf file path
    :param page_number:      the specific page that you want to parse(start from 1)
    :return: the layout object returned by ``PDFPageAggregator.get_result()``
             for that page
    :raises IndexError: if ``page_number`` is smaller than 1 or larger than
             the number of pages. Values below 1 previously wrapped around
             to the end of the document through negative indexing.
    :raises PDFTextExtractionNotAllowed: if the document does not allow
             text extraction
    """
    if page_number < 1:
        raise IndexError("page_number must be >= 1 (pages start from 1)")

    # Open in binary read mode; closed on every exit path (it used to leak).
    with open(pdf_path, 'rb') as fp:
        # Create a parser for the file and connect it to a document object.
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        doc.initialize()
        # Abort if the document does not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Resource manager that stores shared resources.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.line_overlap = 0.3
        laparams.char_margin = 3
        laparams.word_margin = 0.3
        laparams.line_margin = 0.01

        # Aggregator device plus the interpreter that feeds it.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Materialise the page list so the requested page can be indexed.
        pages = list(doc.get_pages())

        interpreter.process_page(pages[page_number - 1])
        # Fetch the layout object produced for that page.
        return device.get_result()
def pdf_to_text_file(file_path):
    """Extract the text of a PDF and return it UTF-8 encoded.

    :param file_path: full path of the PDF including the file name, for
        example C://UserName/Folder1/PdfFile.pdf
    :return: the concatenated text of all text boxes/lines, encoded with
        ``.encode("utf-8")``.
    """
    from pdfminer.pdfparser import PDFParser, PDFDocument
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams, LTTextBox, LTTextLine

    pieces = []
    # The handle is closed on every exit path; it previously leaked.
    with open(file_path, 'rb') as file_content:
        parser = PDFParser(file_content)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')  # empty password
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        # Per the original author: these two parameters are changed to get
        # rid of white spaces inside words.
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Process each page contained in the pdf document.
        for page in doc.get_pages():
            interpreter.process_page(page)
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    pieces.append(lt_obj.get_text())

    return ''.join(pieces).encode("utf-8")
Example #19
0
def initialize_pdf_miner(fh):
    """Build the pdfminer objects needed to run layout analysis on a PDF.

    :param fh: open file object for the PDF.
    :return: ``(doc, interpreter, device)`` where ``device`` is a
        ``PDFPageAggregator`` configured with ``word_margin = 0.0`` and
        ``interpreter`` is the ``PDFPageInterpreter`` bound to it.
    :raises ValueError: if ``doc.is_extractable`` is false.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fh)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument()
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the password for initialization.
    # (If no password is set, give an empty string.)
    doc.initialize("")
    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        raise ValueError("PDFDocument is_extractable was False.")
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()

    # The original also built a plain PDFDevice/PDFPageInterpreter pair here
    # that was overwritten below without ever being used; it is omitted.

    # Set parameters for analysis.
    laparams = LAParams()
    laparams.word_margin = 0.0
    # Create a PDF page aggregator object and the interpreter that feeds it.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return doc, interpreter, device
Example #20
0
def pdfToTextDict(filename):
    """Extract the text of every PDF matching ``filename + '*.pdf'``.

    Files are processed in sorted order. A file whose extraction fails is
    reported on stdout and skipped, so the result may contain fewer entries
    than there are matching files.

    :param filename: path prefix; ``'*.pdf'`` is appended to form the glob
        pattern.
    :return: a list (despite the function name) with one extracted-text
        string per successfully processed file.
    """
    textDict = []
    datafiles = sorted(glob.glob(filename+'*.pdf'))
    for pdf in datafiles:
        # Each file is closed as soon as it has been processed; the original
        # leaked one handle per PDF.
        with open(pdf, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            try:
                doc.set_parser(parser)

                doc.initialize('')  # empty password
                rsrcmgr = PDFResourceManager()
                laparams = LAParams()
                laparams.char_margin = 1.0
                laparams.word_margin = 1.0
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                pieces = []

                for page in doc.get_pages():
                    interpreter.process_page(page)
                    for lt_obj in device.get_result():
                        if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                            pieces.append(lt_obj.get_text())

                textDict.append(''.join(pieces))
            except Exception:
                # Best-effort: skip unreadable files. `Exception` (not a bare
                # except) lets KeyboardInterrupt/SystemExit propagate.
                print("set parser error)")

    return textDict
Example #21
0
def __extract_extra__(request, item_id=None):
    """Fill ``other_search_text`` of items from the text of their PDF.

    Items without a ``pdf_file``, or that already have
    ``other_search_text``, are skipped. For every other item the PDF is run
    through ``process_pdf`` and whatever text was produced is stored and
    saved -- also when the conversion fails, because the save happens in
    the ``finally`` clause.

    :param request: request object; ``request.user.is_authenticated()``
        must be true.
    :param item_id: if given, only the item with this id is processed;
        otherwise all items are.
    :return: an ``HttpResponse`` describing the outcome. Processing stops at
        the first item whose conversion raises ``AssertionError``.
    """
    if not request.user.is_authenticated():
        return HttpResponse('Please sign in first')

    from pdfminer.layout import LAParams
    from pdfminer.pdfinterp import PDFResourceManager, process_pdf
    from pdfminer.converter import TextConverter
    from cStringIO import StringIO

    laparams = LAParams()
    laparams.char_margin = 1.0
    laparams.line_margin = 0.3
    laparams.word_margin = 0.2
    codec = 'utf-8'
    caching = True

    if item_id:
        all_items = Item.objects.filter(id=item_id)
    else:
        all_items = Item.objects.all()

    # Stays None when the queryset is empty; the original then failed with
    # an unbound `item` while building the final response.
    item = None
    for item in all_items:

        # Don't extract if no PDF exists; or if we already have search text
        if not item.pdf_file or item.other_search_text:
            continue

        rsrcmgr = PDFResourceManager(caching=caching)
        outfp = StringIO()
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        fp = item.pdf_file.file
        try:
            process_pdf(rsrcmgr,
                        device,
                        fp,
                        pagenos=set(),
                        maxpages=0,
                        password='',
                        caching=caching,
                        check_extractable=True)
        except AssertionError:
            logger.warning('FAILED in completely PDF index "%s"', item.title)
            return HttpResponse('FAILED in completely PDF index "%s"' \
                                % item.title)
        else:
            logger.debug('Full PDF index of item "%s"', item.title)
        finally:
            # Runs on success and on failure: release the resources and keep
            # whatever text was extracted before any error.
            fp.close()
            device.close()
            outfp.seek(0)
            page_text = outfp.read()
            outfp.close()

            item.other_search_text = page_text
            item.save()

    if item is None:
        return HttpResponse('No items to index')
    # Reports the last item iterated, as the original did.
    return HttpResponse('Full PDF indexed for item "%s"' % item.title)
Example #22
0
    def parse_pdf(self, test_parse=False):
        """
            Parse the PDF at self.source_file and store its text lines in
            self.file_array.

            The text is first written to self.text_file, then read back line
            by line; the last line is dropped. Nothing is returned.

            :param test_parse: when true, self.text_file is kept on disk
                instead of being removed afterwards.
            :raises DTPOFileError: if either file cannot be opened or the
                PDF cannot be parsed.
        """

        dtpo_log("debug", "parsePDF sourceFile -> '%s'", self.source_file)

        caching = True
        laparams = LAParams()
        laparams.char_margin = 8.0
        laparams.word_margin = 2.0

        rsrcmgr = PDFResourceManager(caching=caching)

        try:
            outfp = open(self.text_file, "w")
        except IOError as io_error:
            raise DTPOFileError(self.text_file, 0, str(io_error))

        # The nested context managers close both files on every exit path;
        # the original leaked them whenever an error was raised.
        with outfp:
            try:
                fp = open(self.source_file, "rb")
            except IOError as io_error:
                raise DTPOFileError(self.source_file, 0, str(io_error))

            with fp:
                device = None
                try:
                    device = TextConverter(rsrcmgr, outfp, codec="utf-8",
                                           laparams=laparams)
                    process_pdf(rsrcmgr, device, fp, set(), maxpages=0,
                                caching=caching, check_extractable=True)
                except PDFException as pdf_error:
                    message = "Failed to parse file {0} -> {1}".format(self.source_file, str(pdf_error))
                    raise DTPOFileError(self.source_file, 0, message)
                except Exception as exception:
                    message = "Failed to parse PDF file Unknown exception {0} - > {1}".format(type(exception), str(exception))
                    raise DTPOFileError(self.source_file, 0, message)
                finally:
                    if device is not None:
                        device.close()

        #   Got the PDF converted = now get it into an array
        with open(self.text_file) as text_fp:
            self.file_array = list(text_fp)

        #   Remove the last entry - it's always '\x0c'
        if self.file_array:
            del self.file_array[-1]

        #   Remove the outfile
        if not test_parse:
            os.remove(self.text_file)
def readPDFs(folder):
    """Read every PDF in ``folder`` and return its cleaned, lemmatized words.

    For each file the pdfminer text is cut at the last occurrence of
    'Reg AC' (if present) and split on whitespace. Words are then
    lower-cased, kept only if they consist solely of the letters a-z and are
    longer than three characters, and finally lemmatized.

    :param folder: directory to scan. File names are appended to it
        directly, so it must end with a path separator. The running file
        index is printed as progress output.
    :return: a list with one list of processed words per PDF.
    """
    lem = WordNetLemmatizer()

    # Raw string avoids the invalid-escape warning; the pattern is unchanged.
    filename_pattern = r'.+\.pdf'
    my_corpus = PlaintextCorpusReader(folder, filename_pattern)
    list_of_files = my_corpus.fileids()

    corpus = []

    for i, file_id in enumerate(list_of_files):
        pieces = []
        # Each file is closed once its pages are read (it used to leak).
        with open(folder + file_id, 'rb') as fp:
            print(i)

            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')  # empty password
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            laparams.char_margin = 1.0
            laparams.word_margin = 1.0
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            for page in doc.get_pages():
                interpreter.process_page(page)
                for lt_obj in device.get_result():
                    if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                        pieces.append(lt_obj.get_text())

        text = ''.join(pieces)
        last = text.rfind('Reg AC')
        if last != -1:
            text = text[:last]

        corpus.append(text.split())

    # Alphabetic only, lower-cased, longer than 3 characters, lemmatized.
    # The pattern is compiled once instead of on every word.
    alphabetic = re.compile('^[a-z]+$')
    processed_docs = []
    for sub_doc in corpus:
        lowered = (w.lower() for w in sub_doc)
        processed_docs.append(
            [lem.lemmatize(w) for w in lowered
             if alphabetic.search(w) and len(w) > 3])

    # Return List of Lists
    return processed_docs
Example #24
0
def convert(fname, pages=None, M=1.0, L=0.3, W=0.2, F=0.5):
    """ Converts a pdf filename into plain text.

    Each value is specified not as an actual length, but as a proportion of the length
    to the size of each character in question.

    Parameters define layout analysis. In a PDF text is in several chunks of various types.
    Text extraction needs to recover text chunks which are regarded as continuous if
    elements distance is closer than the char_margin (identified as M) and thus are
    grouped into one block. Two lines are part of the same text if they are closer than
    the line_margin (L). If the distance between two words is greater than the word_margin (W),
    blank characters (spaces) shall be inserted as necessary to keep format.
    Boxes flow (F) specifies how much a horizontal and vertical position of a text matters
    when determining text flow order. The value should be within the range from -1.0
    (only horizontal position matters) to +1.0 (only vertical position matters).

    Keyword arguments:

      fname -- PDF file name (string)
      pages -- Set of pages to extract (set); empty or None is passed on
               as an empty set
      M -- char_margin (float)
      L -- line_margin (float)
      W -- word_margin (float)
      F -- boxes_flow (float)

    Return:
      text: pdf contents, as the bytes accumulated in the BytesIO buffer
            (the converter is given codec "utf-8")

    """
    pagenums = set(pages) if pages else set()

    output = BytesIO()

    manager = PDFResourceManager()
    laparams = LAParams()
    laparams.all_texts = True
    laparams.detect_vertical = False
    laparams.char_margin = M
    laparams.line_margin = L
    laparams.word_margin = W
    laparams.boxes_flow = F
    converter = TextConverter(manager, output, codec="utf-8", laparams=laparams)
    interpreter = PDFPageInterpreter(manager, converter)

    try:
        # The input file is closed even if a page fails to parse.
        with open(fname, 'rb') as infile:
            for page in PDFPage.get_pages(infile, pagenums):
                interpreter.process_page(page)
        converter.close()
        return output.getvalue()
    finally:
        # The original wrote `output.close` without parentheses, which never
        # actually closed the buffer.
        output.close()
Example #25
0
 def readText(self, path, outtype='text', opts=None):
     """Convert a PDF with ``HTMLConverter`` and print the result.

     The layout parameters are printed first, then the converted document.
     Nothing is returned.

     :param path: filesystem path of the PDF file.
     :param outtype: accepted for interface compatibility; the converter is
         always ``HTMLConverter`` regardless of this value.
     :param opts: optional iterable of ``(flag, value)`` pairs (a dict of
         flag -> value is accepted as well). Recognised flags: ``-d``
         (raise the debug level), ``-p`` (comma-separated 1-based page
         numbers), ``-m`` (max pages), ``-P`` (password), ``-n`` (disable
         layout analysis), ``-A``, ``-V``, ``-M``, ``-L``, ``-W``, ``-F``
         (layout parameters) and ``-c`` (codec). Other flags are ignored.
     """
     pagenos = set()
     maxpages = 0
     # The original left `password` and `debug` unbound unless -P / -d were
     # given, which made the call fail; both now have defaults.
     password = ''
     debug = self.debug
     codec = 'utf-8'
     laparams = LAParams()

     if opts is None:
         opts = ()
     elif isinstance(opts, dict):
         opts = opts.items()

     for (k, v) in opts:
         if k == '-d': debug += 1
         elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
         elif k == '-m': maxpages = int(v)
         elif k == '-P': password = v
         elif k == '-n': laparams = None
         elif k == '-A': laparams.all_texts = True
         elif k == '-V': laparams.detect_vertical = True
         elif k == '-M': laparams.char_margin = float(v)
         elif k == '-L': laparams.line_margin = float(v)
         elif k == '-W': laparams.word_margin = float(v)
         elif k == '-F': laparams.boxes_flow = float(v)
         elif k == '-c': codec = v
         # -o, -Y, -O, -t and -s only set locals that were never read
         # afterwards, so they are accepted and ignored.
     print(laparams)

     CMapDB.debug = debug
     PDFResourceManager.debug = debug
     PDFPageInterpreter.debug = debug
     PDFDevice.debug = debug

     rsrcmgr = PDFResourceManager()
     outfp = StringIO()
     device = HTMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
     try:
         # Input file and device are released even if conversion fails.
         with open(path, 'rb') as fp:
             process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                         password=password, check_extractable=True)
     finally:
         device.close()
     print(outfp.getvalue())
     outfp.close()
     return
Example #26
0
def convert_to_text_file(filename_in, filename_out, rewrite=False):
    """Extract the text of a BORME PDF into a text file.

    Args:
        filename_in: path of the PDF to read.
        filename_out: path of the text file to write.  When it names an
            existing directory, the output goes inside it under the base
            name of ``filename_in``.
        rewrite: when False (the default) an existing output file is left
            untouched and the conversion is skipped.

    Returns:
        False when the conversion was skipped because the output already
        exists, True once the file has been written.
    """
    if os.path.isdir(filename_out):
        filename_out = os.path.join(filename_out, os.path.basename(filename_in))

    if os.path.exists(filename_out) and not rewrite:
        logging.info('Skipping file %s already exists and rewriting is disabled!', filename_out)
        return False

    # Layout analysis settings.
    laparams = LAParams()
    laparams.detect_vertical = True
    laparams.all_texts = False
    laparams.char_margin = 2.0
    laparams.line_margin = 0.5
    laparams.word_margin = 0.1

    caching = True
    rotation = 0
    rsrcmgr = PDFResourceManager(caching=caching)

    # The input is opened first so a missing PDF does not truncate an
    # existing output file; both handles are released even on failure.
    with open(filename_in, 'rb') as fp, open(filename_out, 'w') as outfp:
        device = TextConverter(rsrcmgr, outfp, codec='utf-8', laparams=laparams, imagewriter=None)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, set(),
                                          maxpages=0, password='',
                                          caching=caching, check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
        finally:
            device.close()
    return True
Example #27
0
 def to_text(self):
     """Return the text of every page of ``self._doc`` as one string.

     Layout analysis runs with vertical-text detection, ``all_texts``
     enabled and a word margin of 0.4.

     Returns:
         The converter output decoded as UTF-8, with undecodable bytes
         dropped.  NOTE(review): the ``.decode`` call assumes the buffer
         holds byte strings (Python 2 style ``StringIO``) - confirm.
     """
     rsrcmgr = PDFResourceManager()
     output = StringIO()
     laparams = LAParams()
     laparams.detect_vertical = True
     laparams.all_texts = True
     laparams.word_margin = 0.4
     device = TextConverter(rsrcmgr, output, laparams=laparams)
     try:
         interpreter = PDFPageInterpreter(rsrcmgr, device)
         for page in self._doc.get_pages():
             interpreter.process_page(page)
         return output.getvalue().decode('utf-8', 'ignore')
     finally:
         # Release the converter and the buffer even if a page fails.
         device.close()
         output.close()
Example #28
0
def pdf2txt(pdfname, txtname):
    """Write the text of ``pdfname`` to ``txtname``.

    Each text box or text line is stripped and written followed by a single
    space.  Any failure is reported on stdout and does not propagate.

    Args:
        pdfname: path of the PDF to read.
        txtname: path of the text file to create.

    Returns:
        True if at least one non-empty piece of text was written, else
        False (including when an error occurred before any text was found).
    """
    btxt = False
    try:
        # ``with`` guarantees both files are closed; the previous
        # ``fp.closed`` / ``fptxt.closed`` only read an attribute.
        with open(pdfname, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            laparams.char_margin = 1.0
            laparams.word_margin = 1.0
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Report the file being processed on the console.
            print("pdf2txt %s..." %
                  pdfname)

            with open(txtname, 'w') as fptxt:
                # Walk the document page by page.
                for page in doc.get_pages():
                    interpreter.process_page(page)
                    layout = device.get_result()
                    # Walk every layout object on the page.
                    for lt_obj in layout:
                        if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                            # Test emptiness before appending the separator;
                            # otherwise the check can never fail.
                            stripped = lt_obj.get_text().strip()
                            if stripped:
                                spagetxt = stripped + " "
                                btxt = True
                                fptxt.write(spagetxt)
                                print("Palabra", spagetxt)
                        elif isinstance(lt_obj, LTFigure):
                            print("LTFigure, pte implementar!")

        print("end")
    except Exception as e:
        # Deliberate best-effort: report and fall through to the result.
        print("Error: %s" % (e))
    return btxt
Example #29
0
def initialize_pdf_interpreter():
    """Create a page interpreter wired to a layout-aggregating device.

    Returns:
        A ``(interpreter, device)`` tuple.  ``device`` is a
        ``PDFPageAggregator`` configured with ``word_margin = 0.0`` and
        ``interpreter`` is the ``PDFPageInterpreter`` that feeds it; both
        share one ``PDFResourceManager``.
    """
    # Resource manager that stores shared resources.
    rsrcmgr = PDFResourceManager()

    # Layout analysis parameters.
    laparams = LAParams()
    laparams.word_margin = 0.0

    # Only the aggregator and its interpreter are built; the plain
    # PDFDevice/interpreter pair created earlier was discarded unused.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return interpreter, device
Example #30
0
def initialize_pdf_interpreter():
    """Build the pdfminer interpreter/device pair used for layout analysis.

    Returns:
        ``(interpreter, device)`` where ``device`` is a
        ``PDFPageAggregator`` using ``word_margin = 0.0`` and
        ``interpreter`` is a ``PDFPageInterpreter`` bound to that device
        and to the same ``PDFResourceManager``.
    """
    rsrcmgr = PDFResourceManager()

    laparams = LAParams()
    laparams.word_margin = 0.0

    # The intermediate PDFDevice and first interpreter were overwritten
    # before use, so they are no longer constructed.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return interpreter, device
Example #31
0
    def isSearchablePDF(self):
        """Report whether named entities can be extracted from the PDF.

        Reads ``self.filename``, extracts its text boxes and lines, and runs
        ``nlp`` over the text.  Every entity whose label is not one of the
        excluded labels is appended to ``self.listEntities`` (text followed
        by a comma).

        Returns:
            True if at least one such entity was found, otherwise False.
        """
        entityList_ = []
        print("PDF File")
        # ``with`` releases the file handle, which was previously leaked.
        with open(self.filename, 'rb') as fp:
            parser = PDFParser(fp)
            pdf_doc = PDFDocument()
            parser.set_document(pdf_doc)
            pdf_doc.set_parser(parser)
            pdf_doc.initialize('')
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            laparams.char_margin = 1.0
            laparams.word_margin = 1.0
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            extracted_text = ''

            for page in pdf_doc.get_pages():
                interpreter.process_page(page)
                layout = device.get_result()
                for lt_obj in layout:
                    if not isinstance(lt_obj, (LTTextBox, LTTextLine)):
                        continue
                    extracted_text += lt_obj.get_text()

                    n1 = extracted_text.replace("\t", " ")
                    n2 = n1.replace("\r", "")
                    n3 = n2.replace("\n", "")
                    finaltext = n3.replace("\u00a0", "")

                    # NOTE(review): the NLP pass runs on the whole text
                    # accumulated so far, so earlier entities are appended
                    # again for every later text object.  Kept as-is because
                    # callers may rely on ``self.listEntities``; confirm
                    # whether a single pass after the loop was intended.
                    # A separate name keeps the PDF document from being
                    # shadowed by the NLP result.
                    nlp_doc = nlp(finaltext)
                    for X in nlp_doc.ents:
                        if X.text != ('\n') and X.label_ not in (
                                'ORDINAL', 'CARDINAL', 'NORP',
                                'Non-­‐binding'):
                            self.listEntities.append(X.text + ",")
                            entityList_.append((X.text, X.label_))

        return bool(entityList_)
def get_pdf(path):
    """Read a song-book PDF and return one record per page.

    On every page the first layout object is skipped, the next text object
    becomes the title (passed through ``validate_tittle_to_save``), a text
    object of the form ``Page<digits>`` sets the page number, and every
    other text object is appended to the verses.

    Args:
        path: filesystem path of the PDF.

    Returns:
        A list of dicts with the keys ``"tittle"``, ``"nro_page"`` and
        ``"verses"``, in page order.
    """
    print("Begin: get_pdf")
    songs = []
    # Defined up front so a page without a title or a "Page N" marker no
    # longer raises UnboundLocalError.  They are deliberately not reset per
    # page, which keeps the previous carry-over behaviour.
    tittle = None
    nro_page = None
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            nro_element = 0
            verse_parts = []
            for lt_obj in layout:
                if nro_element == 0:
                    # The first object on the page is always skipped.
                    nro_element += 1
                    continue
                if not isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    continue
                text = lt_obj.get_text()
                if nro_element == 1:
                    tittle = validate_tittle_to_save(text.replace('\n', ''))
                    nro_element += 1
                    continue
                page_label = text[4:].replace('\n', '')
                if text[:4] == 'Page' and page_label.isdigit():
                    nro_page = page_label
                else:
                    verse_parts.append(text)
                    verse_parts.append('\n')
            song = {
                "tittle": tittle,
                "nro_page": nro_page,
                "verses": ''.join(verse_parts),
            }
            print(f"Read Page: {nro_page}")
            songs.append(song)
    print("Finish: get_pdf")
    return songs
def parsePDFFile(filePath, everyNPages=13):
    """Extract the text of a random sample of pages from a PDF.

    One residue is drawn at random and every page whose 1-based number has
    that residue modulo ``everyNPages`` is extracted, i.e. one page out of
    every ``everyNPages``.

    Args:
        filePath: filesystem path of the PDF.
        everyNPages: sampling period, at least 1.  With 1 every page is
            extracted.

    Returns:
        A dict mapping page number to that page's text, with hyphenated
        line breaks joined and remaining newlines replaced by spaces.

    Raises:
        ValueError: if ``everyNPages`` is smaller than 1.
    """
    if everyNPages < 1:
        raise ValueError('everyNPages must be at least 1')

    print('')
    print('About to parse file at path: {:s}'.format(filePath))

    with open(filePath, 'rb') as pdfFile:
        pdfParser = PDFParser(pdfFile)
        document = PDFDocument()
        pdfParser.set_document(document)
        document.set_parser(pdfParser)

        document.initialize('')
        pdfResourceManager = PDFResourceManager()
        laParams = LAParams()
        laParams.char_margin = 1.0
        laParams.word_margin = 1.0
        pdfPageAggregator = PDFPageAggregator(pdfResourceManager,
                                              laparams=laParams)
        pdfPageInterpreter = PDFPageInterpreter(pdfResourceManager,
                                                pdfPageAggregator)
        extracted_text = {}

        # randint(1, 0) is invalid, so a period of 1 uses the only residue
        # there is (0), which selects every page.
        if everyNPages == 1:
            chosenStoppingPage = 0
        else:
            chosenStoppingPage = random.randint(1, everyNPages - 1)
        try:
            for pageNumber, page in enumerate(document.get_pages(), 1):
                if pageNumber % everyNPages != chosenStoppingPage:
                    continue
                extracted_text[pageNumber] = ''
                pdfPageInterpreter.process_page(page)
                layout = pdfPageAggregator.get_result()
                for layoutObject in layout:
                    if isinstance(layoutObject, LTTextContainer):
                        text = layoutObject.get_text()
                        text = text.replace('-\n', '')
                        text = text.replace('\n', ' ')
                        extracted_text[pageNumber] += text
        except KeyError:
            # Best-effort: a KeyError raised while walking the document
            # stops the scan and whatever was collected is returned.
            pass

        print('URL parse complete')

        return extracted_text
Example #34
0
def read_pdf_data(filename):
    """Collect the word groups the ``My`` device builds for each page.

    Args:
        filename: filesystem path of the PDF.

    Returns:
        A list with one entry per page, each being the ``group`` attribute
        of the device after that page was processed.
    """
    with open(filename, 'rb') as pdf_file:
        manager = PDFResourceManager()
        layout_params = LAParams()
        layout_params.word_margin = 0.05
        collector = My(manager, sys.stdout, laparams=layout_params)
        collector.reset()
        page_interpreter = PDFPageInterpreter(manager, collector)

        groups_per_page = []
        for pdf_page in PDFPage.get_pages(pdf_file, set()):
            page_interpreter.process_page(pdf_page)
            groups_per_page.append(collector.group)
            # Clear the device's per-page state before the next page.
            collector.word = ""
            collector.group = []
            collector.word_pos_info = {}

    return groups_per_page
Example #35
0
def pdf_woorden(filename):
    """Return the words of a PDF, lower-cased and without Dutch stop words.

    Args:
        filename: filesystem path of the PDF.

    Returns:
        A list of whitespace-separated words in document order, e.g.
        "dit zijn woorden" is split into ['dit', 'zijn', 'woorden'] before
        stop words are removed.
    """
    # Open the file in binary mode; ``with`` closes it once extraction is
    # done (it was previously left open).
    with open(filename, 'rb') as fp:
        # Standard pdfminer setup for reading the text on each page.
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Text of every text box / text line, in reading order.
        pieces = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    pieces.append(lt_obj.get_text())

    # Lower-case everything and split the text into a list of words.
    uit_elkaar = ''.join(pieces).lower().split()
    # A set makes each stop-word lookup constant time.
    stop_words = set(stopwords.words("dutch"))
    keywords = [word for word in uit_elkaar if word not in stop_words]

    return keywords
Example #36
0
    def output_pdf_to_table(self, path, config):
        """Run ``self.getRows`` on the layout of each page of a PDF.

        The line margin, word margin and page limit come from the names
        ``line_margin_threshold``, ``word_margin_threshold`` and
        ``pages_to_view``, which are defined outside this method.

        Args:
            path: filesystem path of the PDF.
            config: passed through unchanged to ``self.getRows``.
        """
        # ``with`` closes the file, which was previously left open.
        with open(path, "rb") as fp:
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            laparams.line_margin = line_margin_threshold
            laparams.word_margin = word_margin_threshold
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            for page in PDFPage.get_pages(fp, set(), maxpages=pages_to_view,
                                          password="", caching=True,
                                          check_extractable=False):
                interpreter.process_page(page)
                layout = device.get_result()
                self.getRows(layout, config)
Example #37
0
def extract_pdf(path="timetable.pdf"):
    """Return the text of a PDF with a comma after every text object.

    Args:
        path: filesystem path of the PDF; defaults to ``"timetable.pdf"``
            in the current directory.

    Returns:
        The text of every text box and text line in document order, each
        one followed by ``","``.
    """
    # ``with`` closes the file, which was previously left open.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        parser.set_document(doc)
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        pieces = []

        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    pieces.append(lt_obj.get_text() + ",")
    return ''.join(pieces)
Example #38
0
 def extract_text(self):
     """Extract the text of ``self.path`` into ``self._text`` and ``self.rows``.

     ``self._text`` receives the whole converter output and ``self.rows``
     that text split on newlines.  Nothing is returned.
     """
     manager = PDFResourceManager()
     text_buffer = io.StringIO()

     layout = LAParams()
     layout.char_margin = 1000
     layout.word_margin = 0.01
     layout.line_margin = 0.01

     converter = TextConverter(manager, text_buffer, laparams=layout)
     interpreter = PDFPageInterpreter(manager, converter)
     # Replace the device's undefined-character hook so the character code
     # is turned into a character via chr().
     interpreter.device.handle_undefined_char = lambda f, c: chr(c)

     with open(self.path, 'rb') as pdf_file:
         pages = PDFPage.get_pages(pdf_file, caching=True, check_extractable=True)
         for pdf_page in pages:
             interpreter.process_page(pdf_page)
         self._text = text_buffer.getvalue()
         self.rows = self._text.split('\n')

     converter.close()
     text_buffer.close()
Example #39
0
    def read_file(self):
        """Read the PDF at ``self.path`` and store its text in ``self.content``.

        The text of every text box and text line is collected in document
        order and joined with single spaces.
        """
        extracted_text = []
        # All parsing happens inside the ``with`` block: the parser reads
        # from the file while the document is initialised and the pages are
        # walked, so the handle must still be open at that point.
        with open(self.path, 'rb') as f:
            parser = PDFParser(f)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            laparams.char_margin = 0.1
            laparams.word_margin = 1.0
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            for page in doc.get_pages():
                interpreter.process_page(page)
                layout = device.get_result()
                for lt_obj in layout:
                    if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                        extracted_text.append(lt_obj.get_text())
        self.content = ' '.join(extracted_text)
Example #40
0
def initialize_pdf_miner(fh):
    """Set up pdfminer objects for layout analysis of an open PDF.

    Args:
        fh: file object of the PDF, handed to ``PDFParser``.

    Returns:
        A ``(doc, interpreter, device)`` tuple: the ``PDFDocument`` built
        from the parser, a ``PDFPageInterpreter`` and the
        ``PDFPageAggregator`` it feeds (``word_margin = 0.0``).
    """
    # Parser bound to the file object, and the document built from it.
    parser = PDFParser(fh)
    doc = PDFDocument(parser)
    # Resource manager that stores shared resources.
    rsrcmgr = PDFResourceManager()

    # Layout analysis parameters.
    laparams = LAParams()
    laparams.word_margin = 0.0

    # Only the aggregator and its interpreter are built; the plain
    # PDFDevice/interpreter pair and the unused ``codec`` were dropped.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return doc, interpreter, device
Example #41
0
def main(argv):
    """Command-line entry point: run ``CourseRegisterParser`` over PDFs.

    Args:
        argv: full argument vector, program name first.  Options follow
            the usage text printed by ``usage()``.

    Returns:
        100 when the arguments are invalid or no input file is given,
        otherwise None.
    """
    import getopt
    def usage():
        print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] '
               '[-n] [-A] [-D writing_mode] [-M char_margin] [-L line_margin] [-W word_margin] '
               '[-c codec] file ...' % argv[0])
        return 100
    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAD:M:L:W:c:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    codec = 'utf-8'
    laparams = LAParams()
    # ``-n`` is remembered as a flag so that a layout option given after it
    # does not fail on a ``None`` laparams.
    use_layout = True
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-n': use_layout = False
        elif k == '-A': laparams.all_texts = True
        elif k == '-D': laparams.writing_mode = v
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-c': codec = v
    if not use_layout:
        laparams = None
    #
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager()

    # ``open`` replaces the Python 2 only ``file`` builtin.
    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        device = CourseRegisterParser(rsrcmgr, outfp, codec=codec, laparams=laparams)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password)
        finally:
            device.close()
    finally:
        # Only close what was opened here; sys.stdout belongs to the caller.
        if outfp is sys.stdout:
            outfp.flush()
        else:
            outfp.close()
    return
Example #42
0
def convert_pdf(path, outtype='txt', opts={}):
    """Convert a PDF to text, XML, HTML or a tag dump.

    Args:
        path: filesystem path of the PDF.  The default output file is
            ``path`` with its last three characters replaced by ``outtype``.
        outtype: one of ``'txt'``, ``'xml'``, ``'html'`` or ``'tag'``.  A
            false value selects the type from the output file extension.
        opts: pdf2txt-style options, either an iterable of ``(flag, value)``
            pairs or a mapping of flag to value.  The default ``{}`` is
            only read, never mutated.

    Returns:
        None.  The result is written to the output file, or to stdout when
        the output file name is empty.

    Raises:
        ValueError: if the output type is not supported.
    """
    outfile = path[:-3] + outtype
    outdir = '/'.join(path.split('/')[:-1])

    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    codec = 'utf-8'
    scale = 1
    laparams = LAParams()
    # ``-n`` is remembered as a flag so that a layout option given after it
    # does not fail on a ``None`` laparams.
    use_layout = True
    pairs = opts.items() if hasattr(opts, 'items') else opts
    for (k, v) in pairs:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-n': use_layout = False
        elif k == '-A': laparams.all_texts = True
        elif k == '-D': laparams.writing_mode = v
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    if not use_layout:
        laparams = None
    #
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager()
    if not outtype:
        outtype = 'txt'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Validate before opening the output so an unsupported type neither
    # creates an empty file nor hits the previously undefined usage().
    if outtype not in ('txt', 'xml', 'html', 'tag'):
        raise ValueError('unsupported output type: %r' % (outtype,))

    # ``open`` replaces the Python 2 only ``file`` builtin.
    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'txt':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale, laparams=laparams, outdir=outdir)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            with open(path, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password)
        finally:
            device.close()
    finally:
        # Only close what was opened here; sys.stdout belongs to the caller.
        if outfp is sys.stdout:
            outfp.flush()
        else:
            outfp.close()
    return
Example #43
0
def main(argv):
    """Command-line entry point: convert PDFs to text, HTML, XML or tags.

    Args:
        argv: full argument vector, program name first.  Options follow
            the usage text printed by ``usage()``.

    Returns:
        100 when the arguments are invalid, no input file is given or the
        output type is unknown; otherwise None.
    """
    import getopt

    def usage():
        print(
            "usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]"
            " [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]"
            " [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation] [-S]"
            " [-t text|html|xml|tag] [-c codec] [-s scale]"
            " file ..." % argv[0]
        )
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], "dp:m:P:o:CnAVM:L:W:F:Y:O:R:St:c:s:")
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # input option
    password = b""
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = "normal"
    codec = "utf-8"
    scale = 1
    caching = True
    laparams = LAParams()
    # ``-n`` is remembered as a flag so that a layout option given after it
    # does not fail on a ``None`` laparams.
    use_layout = True
    for (k, v) in opts:
        if k == "-d":
            logging.getLogger().setLevel(logging.DEBUG)
        elif k == "-p":
            pagenos.update(int(x) - 1 for x in v.split(","))
        elif k == "-m":
            maxpages = int(v)
        elif k == "-P":
            password = v
        elif k == "-o":
            outfile = v
        elif k == "-C":
            caching = False
        elif k == "-n":
            use_layout = False
        elif k == "-A":
            laparams.all_texts = True
        elif k == "-V":
            laparams.detect_vertical = True
        elif k == "-M":
            laparams.char_margin = float(v)
        elif k == "-L":
            laparams.line_margin = float(v)
        elif k == "-W":
            laparams.word_margin = float(v)
        elif k == "-F":
            laparams.boxes_flow = float(v)
        elif k == "-Y":
            layoutmode = v
        elif k == "-O":
            imagewriter = ImageWriter(v)
        elif k == "-R":
            rotation = int(v)
        elif k == "-S":
            stripcontrol = True
        elif k == "-t":
            outtype = v
        elif k == "-c":
            codec = v
        elif k == "-s":
            scale = float(v)
    if not use_layout:
        laparams = None
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = "text"
        if outfile:
            if outfile.endswith((".htm", ".html")):
                outtype = "html"
            elif outfile.endswith(".xml"):
                outtype = "xml"
            elif outfile.endswith(".tag"):
                outtype = "tag"
    if outtype not in ("text", "xml", "html", "tag"):
        # Rejected before the output file is opened so no empty file is
        # left behind.
        return usage()
    if outfile:
        outfp = open(outfile, "wb")
    else:
        outfp = sys.stdout
        # A text stream does its own encoding, so the converter must not.
        if outfp.encoding is not None:
            codec = None
    try:
        if outtype == "text":
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, imagewriter=imagewriter)
        elif outtype == "xml":
            device = XMLConverter(
                rsrcmgr, outfp, codec=codec, laparams=laparams, imagewriter=imagewriter, stripcontrol=stripcontrol
            )
        elif outtype == "html":
            device = HTMLConverter(
                rsrcmgr, outfp, codec=codec, scale=scale, layoutmode=layoutmode, laparams=laparams,
                imagewriter=imagewriter
            )
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, "rb") as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(
                        fp, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True
                    ):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Only close what was opened here; sys.stdout belongs to the caller.
        if outfp is sys.stdout:
            outfp.flush()
        else:
            outfp.close()
    return
def readPDF2HTML(pdfFile, opts={}):
    """Convert an open PDF to HTML and return the markup.

    Args:
        pdfFile: object with a ``read()`` method returning the PDF data.
        opts: pdf2txt-style options, either an iterable of ``(flag, value)``
            pairs or a mapping of flag to value.  Honoured flags are
            ``-p``, ``-m``, ``-P``, ``-c``, ``-n``, ``-A``, ``-V``, ``-M``,
            ``-L``, ``-W`` and ``-F``; other flags are ignored.  The
            default ``{}`` is only read, never mutated.

    Returns:
        The content of the output buffer, read before the converter is
        closed (same order as before, so anything the converter writes on
        close is not included).

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # Defaults are set before the options are read; previously ``-d`` and
    # ``-p`` hit unbound locals and ``-c``/``-p`` were overwritten later.
    password = ''
    pagenos = set()
    maxpages = 0
    codec = 'utf-8'
    laparams = LAParams()
    # ``-n`` is remembered as a flag so that a layout option given after it
    # does not fail on a ``None`` laparams.
    use_layout = True
    pairs = opts.items() if hasattr(opts, 'items') else opts
    for (k, v) in pairs:
        if k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-n':
            use_layout = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-c':
            codec = v
    if not use_layout:
        laparams = None

    # Buffer the PDF data and prepare the output buffer.
    fp = StringIO(pdfFile.read())
    retstr = StringIO()
    try:
        # Parse the document once to check that extraction is allowed.
        parser = PDFParser(fp)
        document = PDFDocument(parser, password=password)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Resource manager that stores shared resources.
        rsrcmgr = PDFResourceManager()
        device = HTMLConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Process each selected page of the document.
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password):
                interpreter.process_page(page)
            content = retstr.getvalue()
        finally:
            device.close()
    finally:
        fp.close()
        retstr.close()
    return content
Example #45
0
def main(argv=None):
    """Command-line entry point: convert PDFs to text, HTML, XML or tags.

    Args:
        argv: argument list without the program name; ``None`` makes
            argparse read ``sys.argv``.  With no input file, standard input
            is read as a single PDF stream.
    """
    # Binary stdin where available; wrapped in a list because ``file`` is
    # iterated as a sequence of streams.
    stdin_stream = getattr(sys.stdin, 'buffer', sys.stdin)

    parser = argparse.ArgumentParser(description='Convert PDF into text.')
    parser.add_argument('file', nargs='*', type=argparse.FileType('rb'), default=[stdin_stream], help='file(s) to convert')
    parser.add_argument('-C', '--nocache', dest='cache', action='store_false', help='prevent object caching (slower)')
    parser.add_argument('-l', metavar='level', default='warn', help='logging level (warn, info, debug)')
    parser.add_argument('-p', metavar='page', nargs='+', default=[], type=int, help='page number(s) (space separated)')
    parser.add_argument('-m', metavar='maxpages', default=0, type=int, help='maximum number of pages to extract')
    parser.add_argument('-P', metavar='password', default='', help='pdf password')
    parser.add_argument('-o', metavar='outfile', type=argparse.FileType('w'), default=sys.stdout,
                        help='output file name (default: stdout)')
    parser.add_argument('-O', metavar='directory', type=ImageWriter, help='extract images and save to directory')
    parser.add_argument('-t', metavar='outtype', help='output type (text, html, xml, tag)')
    parser.add_argument('-c', metavar='codec', default='utf-8', help='output text encoding (default: %(default)s)')
    lagroup = parser.add_argument_group(title='layout analysis')
    lagroup.add_argument('-n', action='store_true', help='disable layout analysis')
    lagroup.add_argument('-A', action='store_true', help='force layout analysis on all text')
    lagroup.add_argument('-V', action='store_true', help='detect vertical text')
    lagroup.add_argument('-M', metavar='char_margin', type=float, help='custom character margin')
    lagroup.add_argument('-L', metavar='line_margin', type=float, help='custom line margin')
    lagroup.add_argument('-W', metavar='word_margin', type=float, help='custom word margin')
    lagroup.add_argument('-F', metavar='boxes_flow', type=float, help='custom boxes flow')
    lagroup.add_argument('-Y', metavar='layout_mode', default='normal', help='layout mode for HTML (normal, exact, loose)')
    lagroup.add_argument('-s', metavar='scale', default=1, type=float, help='output scaling for HTML')
    args = parser.parse_args(argv)

    logging.basicConfig()
    logging.getLogger('pdfminer').setLevel(args.l.upper())

    if args.n:
        laparams = None
    else:
        laparams = LAParams()
        laparams.all_texts = args.A
        laparams.detect_vertical = args.V
        # ``is not None`` so an explicit 0 is applied rather than ignored.
        if args.M is not None:
            laparams.char_margin = args.M
        if args.L is not None:
            laparams.line_margin = args.L
        if args.W is not None:
            laparams.word_margin = args.W
        if args.F is not None:
            laparams.boxes_flow = args.F

    rsrcmgr = PDFResourceManager(caching=args.cache)
    outtype = args.t
    if not outtype:
        # Infer the output type from the output file name.
        out_name = args.o.name
        if out_name.endswith(('.htm', '.html')):
            outtype = 'html'
        elif out_name.endswith('.xml'):
            outtype = 'xml'
        elif out_name.endswith('.tag'):
            outtype = 'tag'
    if outtype == 'xml':
        device = XMLConverter(rsrcmgr, args.o, codec=args.c, laparams=laparams, imagewriter=args.O)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, args.o, codec=args.c, scale=args.s, layoutmode=args.Y,
                               laparams=laparams, imagewriter=args.O)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, args.o, codec=args.c)
    else:
        device = TextConverter(rsrcmgr, args.o, codec=args.c, laparams=laparams, imagewriter=args.O)
    try:
        for fp in args.file:
            try:
                process_pdf(rsrcmgr, device, fp, [i-1 for i in args.p], maxpages=args.m, password=args.P,
                            caching=args.cache, check_extractable=True)
            finally:
                fp.close()
    finally:
        device.close()
        if args.o is not sys.stdout:
            args.o.close()
Example #46
0
def main(argv):
    """Command-line entry point: convert PDF files to text, HTML, XML or tag output.

    argv is the full argument vector; argv[0] is the program name and is only
    used in the usage message.  Options are parsed with getopt and every
    remaining argument is treated as the path of a PDF file to process.

    Returns 100 after printing the usage message when the arguments are
    missing or malformed or the output type is unknown, and None otherwise.
    """
    # getopt retrieves the options that were supplied on the command line.
    import getopt

    def usage():
        # Print a usage example; called whenever arguments are missing or wrong.
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
              '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
              '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
        return 100

    try:
        # getopt.getopt(argument list, short options[, long options]) returns
        # (opts, args).  A colon after a short option letter means the option
        # requires a value: p, m, P, o, M, L, W, F, Y, O, t, c and s all do.
        # (For long options the same is expressed with a trailing '='.)
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input options
    password = ''           # option -P
    pagenos = set()         # option -p (stored zero-based)
    maxpages = 0            # option -m
    # output options
    outfile = None          # option -o, output file
    outtype = None          # option -t, output type
    outdir = None           # option -O, output directory
    layoutmode = 'normal'   # option -Y
    codec = 'utf-8'         # option -c
    scale = 1               # option -s; -M/-L/-W/-F are stored directly on laparams
    caching = True
    laparams = LAParams()
    # -n is applied after the loop so that it may appear before or after the
    # other layout options without dereferencing None.
    no_layout = False
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x)-1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': no_layout = True
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    if no_layout:
        laparams = None
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # Determine the output format from the output file extension.
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject an unknown type before the output file is created or truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()
    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            # TextConverter apparently does not accept an outdir argument.
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams, outdir=outdir)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
                                caching=caching, check_extractable=True)
        finally:
            device.close()
    finally:
        # Never close the interpreter's own stdout.
        if outfp is not sys.stdout:
            outfp.close()
    return
Example #47
0
import StringIO as StringIO

import xlwt

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams

# Layout-analysis settings used for the text extraction below.
laparams = LAParams()
laparams.word_margin = float(1.0)
laparams.char_margin = float(2.0)
#laparams.line_margin = float(0.55)
#laparams.boxes_flow = float(0.7)
#laparams.detect_vertical = True
#laparams.all_texts = True
caching = True
# Raw string keeps the Windows backslashes literal; in a normal string the
# "backslash-U" sequence is an invalid escape on Python 3.
fp = open(r'C:\Users\daniel.betteridge\Downloads\Aeopi 9 Sep to 23 Sept page set up.pdf', 'rb')
#outfp = open(r'C:\Users\daniel.betteridge\Documents\pdfextract\Aeopi.csv', 'wb')
rsrc = PDFResourceManager()
# The extracted text is collected in memory.
restr = StringIO.StringIO()
device = TextConverter(rsrc, restr, laparams=laparams)  # replace restr with outfp for file output

interpreter = PDFPageInterpreter(rsrc, device)
book = xlwt.Workbook(encoding="utf-8")

# NOTE(review): pagenos is [1200], which presumably restricts extraction to
# the single page with index 1200 -- confirm this is intended.
for pageNumber, page in enumerate(PDFPage.get_pages(fp, [1200], password=None, caching=caching, check_extractable=True)):
    # Every third page yielded uses 8 columns; all others use 15.
    if (pageNumber+1)%3 == 0:
        numcolumns = 8
    else:
        numcolumns = 15
Example #48
0
def main(argv):
    """Convert the PDF files named on the command line to text, HTML, XML or tags.

    argv is the full argument vector; argv[0] is only used as the program name
    in the usage message.  Options are parsed with getopt and the remaining
    arguments are the PDF files to process, in order, through one converter.

    Returns 100 after printing the usage message when the arguments are
    missing or malformed or the output type is unknown, and None otherwise.
    """
    import getopt

    def usage():
        print(
            "usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] "
            "[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] "
            "[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ..." % argv[0]
        )
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], "dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:")
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ""
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = "normal"
    codec = "utf-8"
    scale = 1
    caching = True
    laparams = LAParams()
    # "-n" disables layout analysis.  It is recorded here and applied after
    # the loop so that a later "-A"/"-V"/"-M"/... does not hit a None object.
    layout_enabled = True
    for (k, v) in opts:
        if k == "-d":
            debug += 1
        elif k == "-p":
            # Page numbers are given one-based and stored zero-based.
            pagenos.update(int(x) - 1 for x in v.split(","))
        elif k == "-m":
            maxpages = int(v)
        elif k == "-P":
            password = v
        elif k == "-o":
            outfile = v
        elif k == "-C":
            caching = False
        elif k == "-n":
            layout_enabled = False
        elif k == "-A":
            laparams.all_texts = True
        elif k == "-V":
            laparams.detect_vertical = True
        elif k == "-M":
            laparams.char_margin = float(v)
        elif k == "-L":
            laparams.line_margin = float(v)
        elif k == "-W":
            laparams.word_margin = float(v)
        elif k == "-F":
            laparams.boxes_flow = float(v)
        elif k == "-Y":
            layoutmode = v
        elif k == "-O":
            outdir = v
        elif k == "-t":
            outtype = v
        elif k == "-c":
            codec = v
        elif k == "-s":
            scale = float(v)
    if not layout_enabled:
        laparams = None
    #
    # PDFDocument.debug = debug
    # PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # No explicit type: infer it from the output file extension.
        outtype = "text"
        if outfile:
            if outfile.endswith(".htm") or outfile.endswith(".html"):
                outtype = "html"
            elif outfile.endswith(".xml"):
                outtype = "xml"
            elif outfile.endswith(".tag"):
                outtype = "tag"
    # Validate before opening so a bad "-t" does not create/truncate the file.
    if outtype not in ("text", "xml", "html", "tag"):
        return usage()
    if outfile:
        outfp = open(outfile, "w")
    else:
        outfp = sys.stdout
    try:
        if outtype == "text":
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == "xml":
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
        elif outtype == "html":
            device = HTMLConverter(
                rsrcmgr, outfp, codec=codec, scale=scale, layoutmode=layoutmode, laparams=laparams, outdir=outdir
            )
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, "rb") as fp:
                    process_pdf(
                        rsrcmgr,
                        device,
                        fp,
                        pagenos,
                        maxpages=maxpages,
                        password=password,
                        caching=caching,
                        check_extractable=True,
                    )
        finally:
            device.close()
    finally:
        # Leave sys.stdout open for the rest of the process.
        if outfp is not sys.stdout:
            outfp.close()
    return
Example #49
0
def main(argv):
    """Convert the PDF files named on the command line to text, HTML, XML or tags.

    argv is the full argument vector; argv[0] is only used as the program name
    in the usage message.  Output goes to the file given with -o (opened as
    text using the -c codec, ignoring unencodable characters) or to stdout.

    Returns 100 after printing the usage message when the arguments are
    missing or malformed or the output type is unknown, and None otherwise.
    """
    def usage():
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
              '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
              '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    debug = False
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # -n is remembered and applied after the loop, so layout options given
    # after it on the command line do not operate on None.
    suppress_layout = False
    for (k, v) in opts:
        if k == '-d': debug = True
        elif k == '-p': pagenos.update(int(x)-1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': suppress_layout = True
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    if suppress_layout:
        laparams = None

    if debug:
        set_debug_logging()
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # No explicit type: infer it from the output file extension.
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Validate before opening so a bad -t does not create/truncate the file.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()
    if outfile:
        outfp = io.open(outfile, 'wt', encoding=codec, errors='ignore')
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, scale=scale, layoutmode=layoutmode,
                                   laparams=laparams, outdir=outdir, debug=debug)
        else:
            device = TagExtractor(rsrcmgr, outfp)
        try:
            for fname in args:
                with io.open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
                                caching=caching, check_extractable=True)
        finally:
            device.close()
    finally:
        if close_outfp:
            outfp.close()
Example #50
0
    # Create a PDF parser object associated with the file object.
    #parser = PDFParser(open_file)
    # Create a PDF document object that stores the document structure.
    #doc = PDFDocument(parser)
    # Connect the parser and document objects.
    #print parser.nextline()
    #print parser.nextline()
    #print parser.nextline()


    ##ATTEMPT 2
    #Code from pdf2txt.py
    laparams = LAParams()
    laparams.char_margin = 2.0
    laparams.line_margin=0.5
    laparams.word_margin=0.1
    laparams.all_texts=False

    rsrcmgr = PDFResourceManager()
    device = TextConverter(rsrcmgr, fp_out, codec='utf-8', laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pdf_pages = PDFPage.get_pages(fp_in, set())
    pagenum = 0
    pagelim = 3
    for page in pdf_pages:
        pagenum += 1
        if pagenum > pagelim:
            continue
        print "Transcribing page " + str(pagenum) + " from PDF to text"
        interpreter.process_page(page)
    fp_in.close()
Example #51
0
def main(argv):
    """Command-line entry point: convert PDF files to shape/text/HTML/XML/tag output.

    argv is the full argument vector (argv[0] being the program name); the
    options after it are parsed with argparse.  When argv is empty or None,
    argparse falls back to sys.argv[1:].  Unrecognised options are collected
    and logged rather than rejected.

    Side effects: sets the module globals ``debuglevel`` and ``options`` (the
    parsed namespace, with the separator strings unescaped) and the ``debug``
    class attributes of PDFDocument, PDFParser, CMapDB and PDFPageInterpreter.

    Returns None on completion, or 100 after printing the usage line if the
    output type is not one of the supported values.
    """
    global debuglevel
    global options

    pagenos = set()
    imagewriter = None
    laparams = LAParams()
    using_optparse = False

    parser = ArgumentParser(prog='pdf2txt.py',
                            description='Convert pdf to txt',
                            formatter_class=ArgumentDefaultsHelpFormatter)

    if using_optparse:
        DEBUG(3, 'using optparse')
        parser.add_argument = parser.add_option
        parser.parse_known_args = parser.parse_args
        parser.disable_interspersed_args()

    parser.add_argument('-d', dest='debuglevel', action='count',
                        default=0,
                        help='Debug (repeat for more verbose debugging)')

    parser.add_argument('-p', '--pages', dest='pagenos', action='store',
                        type=str,
                        default='',
                        help='Specifies the comma-separated list of the page numbers to be extracted. Page numbers start at one. By default, it extracts text from all the pages.')

    parser.add_argument('-c', '--codec', dest='codec', action='store',
                        type=str,
                        default='utf-8',
                        help='Specifies the output codec.')

    parser.add_argument('-t', '--type', dest='outtype', action='store',
                        type=str,
                        default='shape',
                        choices=['text', 'html', 'xml', 'tag', 'shape'],
                        help='Specifies the output format, one of: shape, text, html, xml, tag')

    parser.add_argument('-m', dest='maxpages', action='store',
                        type=int,
                        default=0,
                        help='Specifies the maximum number of pages to extract. By default (0), it extracts all the pages in a document.')

    parser.add_argument('-P', '--password', dest='password', action='store',
                        type=str,
                        default='',
                        help='Provides the user password to access PDF contents.')

    parser.add_argument('-o', '--output', dest='outfile', action='store',
                        type=str,
                        default=None,
                        help='Specifies the output file name. By default, it prints the extracted contents to stdout in text format.')

    parser.add_argument('-C', '--no-caching', dest='caching', action='store_false',
                        default=True,
                        help='Suppress object caching. This will reduce the memory consumption but also slows down the process.')

    parser.add_argument('-n', '--no-layout', dest='layout', action='store_false',
                        default=True,
                        help='Suppress layout analysis.')

    parser.add_argument('--show-pageno', dest='show_pageno', action='store_true',
                        default=False,
                        help='Show page numbers.')

    parser.add_argument('-A', '--analyze-all', dest='all_texts', action='store_true',
                        default=False,
                        help='Forces to perform layout analysis for all the text strings, including text contained in figures.')

    parser.add_argument('-V', '--detect-vertical', dest='detect_vertical', action='store_true',
                        default=False,
                        help='Allows vertical writing detection.')

    parser.add_argument('-M', dest='char_margin', action='store',
                        type=float,
                        default=2.0,
                        help='Two text chunks whose distance is closer than the char_margin (shown as M) is considered continuous and get grouped into one.')

    parser.add_argument('-L', dest='line_margin', action='store',
                        type=float,
                        default=0.5,
                        help='Two lines whose distance is closer than the line_margin (L) is grouped as a text box, which is a rectangular area that contains a "cluster" of text portions.')

    parser.add_argument('-W', dest='word_margin', action='store',
                        type=float,
                        default=0.1,
                        help='It may be required to insert blank characters (spaces) as necessary if the distance between two words is greater than the word_margin (W), as a blank between words might not be represented as a space, but indicated by the positioning of each word.')

    parser.add_argument('-F', dest='boxes_flow', action='store',
                        type=float,
                        default=0.5,
                        help='Specifies how much a horizontal and vertical position of a text matters when determining a text order. The value should be within the range of -1.0 (only horizontal position matters) to +1.0 (only vertical position matters).')

    parser.add_argument('-Y', '--layout-mode', dest='layoutmode', action='store',
                        type=str,
                        default='normal',
                        choices=['exact', 'normal', 'loose'],
                        help='Specifies how the page layout should be preserved. (Currently only applies to HTML format.) One of: exact, normal, loose.')

    parser.add_argument('-O', '--image-writer', dest='imagewriter', action='store',
                        type=str,
                        default=None,
                        help='imagewriter')

    parser.add_argument('-R', '--rotation', dest='rotation', action='store',
                        type=int,
                        default=0,
                        help='rotation')

    parser.add_argument('-S', '--strip-control', dest='stripcontrol', action='store_true',
                        default=False,
                        help='stripcontrol')

    parser.add_argument('-s', dest='scale', action='store',
                        type=float,
                        default=1,
                        help='Specifies the output scale. Can be used in HTML format only.')

    parser.add_argument('--draw-lines', dest='draw_lines', action='store_true',
                        help="Draw crude page representation, coloured TextLines (= short pieces of text). Valid only for the `shape' output.")

    parser.add_argument('--draw-boxes', dest='draw_boxes', action='store_true',
                        help="Draw crude page representation, coloured TextBoxes (= grouped text lines). Valid only for the `shape' output.")

    parser.add_argument('--draw-blocks', dest='draw_blocks', action='store_true',
                        help="Draw crude page representation, coloured TextBlocks (= grouped TextBoxes). Valid only for the `shape' output.")

    parser.add_argument('--shear-limit', dest='shear_limit', action='store',
                        default=0.1,
                        type=float,
                        help="If the text is sheared above this limit, reject it. Valid only for the `shape' output.")

    parser.add_argument('--rotation-limit', dest='rotation_limit', action='store',
                        default=2,
                        type=float,
                        help="If the text is rotated above this angle (in degrees), reject it. Valid only for the `shape' output.")

    parser.add_argument('--line-height-diff', dest='line_height_diff', action='store',
                        type=float,
                        default=0.1,
                        help='Two lines whose vertical sizes differ more than this ratio are not to be considered of the same paragraph (but e.g. one of them is a heading).')

    parser.add_argument('--heading-before', dest='heading_before', action='store',
                        type=str,
                        default='',
                        help='String to put before each heading, e.g. <h1>')

    parser.add_argument('--heading-after', dest='heading_after', action='store',
                        type=str,
                        default='',
                        help='String to put after each heading, e.g. </h1>')

    parser.add_argument('--box-separator', dest='box_separator', action='store',
                        type=str,
                        default=r'\n\n',
                        help=r'Separate boxes with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.')

    parser.add_argument('--block-separator', dest='block_separator', action='store',
                        type=str,
                        default=r'\n\n',
                        help=r'Separate blocks with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.')

    parser.add_argument('--indent-separator', dest='indent_separator', action='store',
                        type=str,
                        default=r'\n\n',
                        help=r'Separate indented lines with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.')

    parser.add_argument('--indent-string', dest='indent_string', action='store',
                        type=str,
                        default=r'\t',
                        help=r'Put this string in front of indented lines. Use \n for new line, \t for TAB, other escape sequences are not recognized.')

    parser.add_argument('--indent-limit', dest='indent_limit', action='store',
                        type=float,
                        default=3,
                        help='If the line is indented more then this (approximately characters), it will separated by --indent-separator from the previous one.')

    parser.add_argument('--page-separator', dest='page_separator', action='store',
                        type=str,
                        default=r'\n\n',
                        help=r'Separate pages with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.')

    parser.add_argument('--norm-whitespace', dest='norm_whitespace', action='store_true',
                        default=False,
                        help='Normalize whitespace (remove duplicate spaces, replace end of lines with spaces).')

    parser.add_argument('--print-stats', dest='print_stats', action='store_true',
                        default=False,
                        help='Instead of the text, output some simple statistics about the file.')

    parser.add_argument('--max-blocks', dest='max_blocks', action='store',
                        default=0,
                        type=int,
                        help='If there is more than this blocks per page, do not return any text. Use to discriminate abnormal files (run --print-stats first to find out the number of boxes per "normal" file). 0 means no limit. 50 is maybe a good value.')

    parser.add_argument('--max-textlines', dest='max_textlines', action='store',
                        default=0,
                        type=int,
                        help='If there is more than this textlines per any block, do not return any text. Use to discriminate abnormal files (run --print-stats first to find out the number of boxes per "normal" page). 0 means no limit. 18 is maybe a good value.')

    parser.add_argument('--line-height-method', dest='line_height_method', action='store',
                        type=str,
                        default='bbox',
                        choices=['bbox', 'mean', 'median'],
                        help='Method to calculate height of line (relevant if there are characters with uneven height). bbox takes the bounding box (rectangle encompassing the line), mean the arithmetic mean of the height of all the characters, median is the median of the height of all the characters. Use mean or median if there are outlier characters, e.g. one big character at the beginning of line.')

    parser.add_argument(dest='pdffile', help='List of PDF files to go through', default=None, nargs='+')

    # Parse the argv that was passed in (skipping the program name); passing
    # None makes argparse fall back to sys.argv[1:].
    args, rest = parser.parse_known_args(argv[1:] if argv else None)

    debuglevel = debug = args.debuglevel
    DEBUG(3, 'args:', str(args))
    DEBUG(3, 'rest:', str(rest))

    DEBUG(3, 'optparse:', using_optparse)

    if args.pagenos:
        # Page numbers are given one-based and stored zero-based.
        pagenos.update(int(x)-1 for x in args.pagenos.split(','))
    maxpages = args.maxpages
    outfile = args.outfile
    password = args.password
    caching = args.caching
    showpageno = args.show_pageno
    if not args.layout:
        laparams = None
    if laparams is not None:
        if args.all_texts:
            laparams.all_texts = True
        if args.detect_vertical:
            laparams.detect_vertical = True
        laparams.char_margin = args.char_margin
        laparams.line_margin = args.line_margin
        laparams.word_margin = args.word_margin
        laparams.boxes_flow = args.boxes_flow
    layoutmode = args.layoutmode

    if args.imagewriter:
        imagewriter = ImageWriter(args.imagewriter)

    rotation = args.rotation
    stripcontrol = args.stripcontrol
    outtype = args.outtype
    codec = args.codec
    scale = args.scale

    # Separator options arrive with literal backslash sequences; convert them
    # before publishing the namespace as the global options.
    args.box_separator = unescape_string(args.box_separator)
    args.block_separator = unescape_string(args.block_separator)
    args.indent_separator = unescape_string(args.indent_separator)
    args.indent_string = unescape_string(args.indent_string)
    args.page_separator = unescape_string(args.page_separator)

    options = args

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Validate before opening so an unsupported type never truncates the file.
    if outtype not in ('shape', 'text', 'xml', 'html', 'tag'):
        parser.print_usage()
        return 100
    if outfile:
        outfp = open(outfile, 'w')
        DEBUG(2, 'output goes to', outfile)
    else:
        outfp = sys.stdout
        DEBUG(2, 'output goes to stdout')
    try:
        if outtype == 'shape':
            device = ShapeTextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                        showpageno=showpageno, imagewriter=imagewriter)
        elif outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in options.pdffile:
                DEBUG(2, 'processing', fname)
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, pagenos,
                                                  maxpages=maxpages, password=password,
                                                  caching=caching, check_extractable=True):
                        # Apply the extra rotation requested with -R.
                        page.rotate = (page.rotate+rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Leave sys.stdout open for the rest of the process.
        if outfp is not sys.stdout:
            outfp.close()
    DEBUG(2, 'finished.')

    return
def convert_pdf_To_Txt(path, opts=None):
    """Extract the text of the PDF file at *path* and return it as a string.

    opts carries pdf2txt-style options, either as an iterable of
    ``(flag, value)`` pairs (the shape returned by getopt) or as a dict
    mapping flag to value.  Recognised flags: -d, -p, -m, -P, -o, -C, -n, -A,
    -V, -M, -L, -W, -F, -Y, -O, -t, -c, -s.  -Y and -s are accepted but have
    no effect on text output; -o is only used to infer the output type from
    its extension, nothing is written to that file.

    Raises ValueError if the requested or inferred output type is not
    'text', the only type this function can return.
    """
    if opts is None:
        opts = ()
    elif isinstance(opts, dict):
        # Iterating a dict directly would yield keys only, not pairs.
        opts = opts.items()

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # -n is applied after the loop so it is independent of option order.
    no_layout = False
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x)-1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': no_layout = True
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    if no_layout:
        laparams = None
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outtype != 'text':
        # Fail with a clear message instead of an unbound `device` later on.
        raise ValueError('unsupported output type %r: only text can be returned' % (outtype,))

    # The converter writes into an in-memory buffer whose content is returned.
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams,
                           imagewriter=imagewriter)
    try:
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos,
                                          maxpages=maxpages, password=password,
                                          caching=caching, check_extractable=True):
                interpreter.process_page(page)
        # Read the buffer before it is closed below.
        txt2Pdf = retstr.getvalue()
    finally:
        device.close()
        retstr.close()
    return txt2Pdf
Example #53
0
def pdfminerr(argv):
    """Run a pdf2txt-style conversion driven by an argument list.

    ``argv[0]`` is skipped; the rest is parsed with ``getopt`` and every
    positional argument is treated as the path of a PDF to convert.
    Output is written to the ``-o`` file, or to ``sys.stdout`` when ``-o``
    is not given.  When ``-t`` is omitted the output type is ``'text'``
    unless the ``-o`` name ends in ``.htm``/``.html``, ``.xml`` or ``.tag``.

    Returns 100 (after printing the usage text) when the options cannot be
    parsed, when no input file is named, or when the output type is not
    one of text/xml/html/tag.  Otherwise returns None.
    """
    import getopt

    def usage():
        print("usage: just put the path to the pdf file in pdf.txt, and make sure you create a seprate folder and put nothing there except for this repository.")
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )  # 1-based on the command line
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        # NOTE: after -n, laparams is None, so a later -A/-V/-M/-L/-W/-F
        # raises AttributeError.
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject an unknown type before the output file is opened (and truncated).
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  imagewriter=imagewriter)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        for fname in args:
            fp = open(fname, 'rb')
            try:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                            password=password, caching=caching,
                            check_extractable=True)
            finally:
                fp.close()
        device.close()
    finally:
        # Close only the file opened here; sys.stdout belongs to the caller.
        if outfp is not sys.stdout:
            outfp.close()
    return
Example #54
0
def main(argv):
    """Batch-convert PDFs matched by wildcard arguments.

    ``argv[0]`` is the program name; the remaining items are parsed with
    ``getopt``.  Each positional argument is expanded with ``glob.glob``
    and every match is converted into a file next to it whose extension is
    replaced by ``.htm``, ``.tag``, ``.txt`` or ``.xml`` according to the
    output type (default ``'tag'``).

    ``-o`` does not name the output here: it is only consulted to infer the
    output type when ``-t`` is given as an empty string, and no file of
    that name is opened.

    Returns 100 (after printing the usage text) when the options cannot be
    parsed, no argument is given, or the output type is unknown.
    Otherwise returns None.
    """
    import getopt
    import os

    def usage():
        print('Syntax:\npdf2htm.exe SourcePDF\n where the parameter is either a file name or\na wildcard spec like\n*.pdf\nEnclose it with quotes if it contains a space\n\nAdditional options are supported with named command line parameters as follows:')
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
              ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
              ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
              ' [-t text|html|xml|tag] [-c codec] [-s scale]'
              ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = 'tag'
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )  # 1-based on the command line
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        # NOTE: after -n, laparams is None, so a later -A/-V/-M/-L/-W/-F
        # raises AttributeError.
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-R': rotation = int(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # Only reachable with an explicit empty -t value.
        outtype = 'tag'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # Output type -> file extension.  Validating here turns an unknown type
    # into the usage message instead of a KeyError inside the loop.
    extensions = {'html': 'htm', 'tag': 'tag', 'text': 'txt', 'xml': 'xml'}
    if outtype not in extensions:
        return usage()
    ext = '.' + extensions[outtype]

    for fname in args:
        matches = glob.glob(fname)
        print('Converting ' + str(len(matches)) + ' from ' + fname + ' to ' + outtype + ' format')
        for pdf in matches:
            # splitext copes with names whose extension is not 3 letters long.
            outfile = os.path.splitext(pdf)[0] + ext
            print(outfile)
            outfp = open(outfile, 'wb')
            try:
                if outtype == 'text':
                    device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                           imagewriter=imagewriter)
                elif outtype == 'xml':
                    device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                          imagewriter=imagewriter)
                elif outtype == 'html':
                    device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                           layoutmode=layoutmode, laparams=laparams,
                                           imagewriter=imagewriter)
                else:
                    device = TagExtractor(rsrcmgr, outfp, codec=codec)
                device.showpageno = False

                fp = open(pdf, 'rb')
                try:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, pagenos,
                                                  maxpages=maxpages, password=password,
                                                  caching=caching, check_extractable=True):
                        # Apply the extra -R rotation on top of the page's own.
                        page.rotate = (page.rotate+rotation) % 360
                        interpreter.process_page(page)
                finally:
                    fp.close()
                device.close()
            finally:
                outfp.close()

        print('Done')
    return
  doc = open_pdf(sys.argv[1])
  
  Point = Route = False
  pages = page_count(doc)
  if pages == 68:
    Point = True
  elif pages == 1143:
    Route = True
  else:
    sys.stderr.write("PDF file not of recognised (NRG) format\n")
    sys.exit(1)
  
  rsrcmgr = PDFResourceManager()
  laparams = LAParams()
  laparams.line_margin = 0    # Forces every line to be absolutely positioned
  laparams.word_margin = 0.2  # Prevents space before narrow characters
  device = PDFPageAggregator(rsrcmgr, laparams=laparams)
  interpreter = PDFPageInterpreter(rsrcmgr, device)

  writer = BufferedWriter(sys.stdout)

  for (pageno, page) in enumerate(doc.get_pages()):
    interpreter.process_page(page)
    layout = device.get_result()  # returns LTPage

    (text, other) = fsplit(lambda obj: isinstance(obj, LTText), layout)

    header_y = 0
    if Point:
      # Locates bottom of header separator (lowest non-text < 10px height)
      header_y = reduce(lambda x, o: min(x, o.y0),
Example #56
0
def main(argv):
    """Command-line entry point: convert PDFs to text, HTML, XML or tags.

    ``argv[0]`` is the program name; the remaining items are parsed with
    ``getopt`` and every positional argument is a PDF path.  Output goes to
    the ``-o`` file, or to ``sys.stdout`` when ``-o`` is not given.  When
    ``-t`` is omitted the type is ``'text'`` unless the ``-o`` name ends in
    ``.htm``/``.html``, ``.xml`` or ``.tag``.

    Returns 100 (after printing the usage text) when the options cannot be
    parsed, no input file is named, or the output type is unknown.
    Otherwise returns None.
    """
    import getopt

    def usage():
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] '
              '[-n] [-A] [-D writing_mode] [-M char_margin] [-L line_margin] [-W word_margin] '
              '[-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAD:M:L:W:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    codec = 'utf-8'
    scale = 1
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )  # 1-based on the command line
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        # NOTE: after -n, laparams is None, so a later -A/-D/-M/-L/-W
        # raises AttributeError.
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-D': laparams.writing_mode = v
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    #
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrc = PDFResourceManager()
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject an unknown type before the output file is opened (and truncated).
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams, outdir=outdir)
        else:
            device = TagExtractor(rsrc, outfp, codec=codec)
        for fname in args:
            fp = open(fname, 'rb')
            try:
                process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages, password=password)
            finally:
                fp.close()
        device.close()
    finally:
        # Close only the file opened here; sys.stdout belongs to the caller.
        if outfp is not sys.stdout:
            outfp.close()
    return
Example #57
0
def main(fname, k, v):
    """Convert one PDF, applying a single pdf2txt-style option.

    fname -- path of the PDF to read.
    k     -- one option flag (for example ``'-o'`` or ``'-t'``); a flag that
             matches none of the branches below is ignored.
    v     -- the value for that flag (unused by the boolean flags).

    Output goes to the ``-o`` file, or to ``sys.stdout`` otherwise.  The
    output type is ``'text'`` unless ``-t`` is given or the ``-o`` name ends
    in ``.htm``/``.html``, ``.xml`` or ``.tag``.  Returns None on success.
    """
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()

    if k == '-d':
        debug += 1
    elif k == '-p':
        # Page numbers are 1-based in the option value.
        pagenos.update(int(x) - 1 for x in v.split(','))
    elif k == '-m':
        maxpages = int(v)
    elif k == '-P':
        password = v
    elif k == '-o':
        outfile = v
    elif k == '-C':
        caching = False
    elif k == '-n':
        laparams = None
    elif k == '-A':
        laparams.all_texts = True
    elif k == '-V':
        laparams.detect_vertical = True
    elif k == '-M':
        laparams.char_margin = float(v)
    elif k == '-L':
        laparams.line_margin = float(v)
    elif k == '-W':
        laparams.word_margin = float(v)
    elif k == '-F':
        laparams.boxes_flow = float(v)
    elif k == '-Y':
        layoutmode = v
    elif k == '-O':
        imagewriter = ImageWriter(v)
    elif k == '-R':
        rotation = int(v)
    elif k == '-S':
        stripcontrol = True
    elif k == '-t':
        outtype = v
    elif k == '-c':
        codec = v
    elif k == '-s':
        scale = float(v)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject an unknown type before the output file is opened (and truncated).
    if outtype not in ('text', 'xml', 'html', 'tag'):
        # NOTE(review): usage() is not defined in this function; this relies
        # on a module-level usage() existing -- confirm.
        return usage()

    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)

        fp = open(fname, 'rb')
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos,
                                          maxpages=maxpages, password=password,
                                          caching=caching, check_extractable=True):
                # Apply the extra -R rotation on top of the page's own.
                page.rotate = (page.rotate+rotation) % 360
                interpreter.process_page(page)
        finally:
            fp.close()
        device.close()
    finally:
        # Close only the file opened here; sys.stdout belongs to the caller.
        if outfp is not sys.stdout:
            outfp.close()
    return
Example #58
0
def pdf2txt(argv):
    """Convert PDFs according to a pdf2txt-style option list.

    Unlike a ``sys.argv``-style list, *argv* is parsed from its first
    element: there is no program name to skip.  Positional arguments are
    PDF paths.  Output goes to the ``-o`` file, or to ``sys.stdout`` when
    ``-o`` is not given.  When ``-t`` is omitted the type is ``'text'``
    unless the ``-o`` name ends in ``.htm``/``.html``, ``.xml`` or ``.tag``.

    Raises ``getopt.GetoptError`` for an unrecognised option.  Returns None
    on success.
    """
    import getopt
    (opts, args) = getopt.getopt(argv[0:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )  # 1-based on the command line
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        # NOTE: after -n, laparams is None, so a later -A/-V/-M/-L/-W/-F
        # raises AttributeError.
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject an unknown type before the output file is opened (and truncated).
    if outtype not in ('text', 'xml', 'html', 'tag'):
        # NOTE(review): usage() is not defined in this function; this relies
        # on a module-level usage() existing -- confirm.
        return usage()

    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams, outdir=outdir)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        for fname in args:
            fp = open(fname, 'rb')
            try:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
                            caching=caching, check_extractable=True)
            finally:
                fp.close()
        device.close()
    finally:
        # Close only the file opened here; sys.stdout belongs to the caller.
        if outfp is not sys.stdout:
            outfp.close()
    return
Example #59
0
def main(argv):
    """Command-line entry point: convert PDFs to text, HTML, XML or tags.

    ``argv[0]`` is the program name; the remaining items are parsed with
    ``getopt`` and every positional argument is a PDF path.  All inputs are
    rendered through one converter into the ``-o`` file, or ``sys.stdout``
    when ``-o`` is not given.  When ``-t`` is omitted the type is ``'text'``
    unless the ``-o`` name ends in ``.htm``/``.html``, ``.xml`` or ``.tag``.

    Returns 100 (after printing the usage text) when the options cannot be
    parsed, no input file is named, or the output type is unknown.
    Otherwise returns None.
    """
    import getopt

    def usage():
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
              ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
              ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
              ' [-t text|html|xml|tag] [-c codec] [-s scale]'
              ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )  # 1-based on the command line
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        # NOTE: after -n, laparams is None, so a later -A/-V/-M/-L/-W/-F
        # raises AttributeError.
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-R': rotation = int(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject an unknown type before the output file is opened (and truncated).
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  imagewriter=imagewriter)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        for fname in args:
            fp = open(fname, 'rb')
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos,
                                              maxpages=maxpages, password=password,
                                              caching=caching, check_extractable=True):
                    # Apply the extra -R rotation on top of the page's own.
                    page.rotate = (page.rotate+rotation) % 360
                    interpreter.process_page(page)
            finally:
                fp.close()
        device.close()
    finally:
        # Close only the file opened here; sys.stdout belongs to the caller.
        if outfp is not sys.stdout:
            outfp.close()
    return
Example #60
0
def main(argv):
    """Command-line entry point with optional pretty-printed markup output.

    ``argv[0]`` is the program name; the remaining items are parsed with
    ``getopt`` and every positional argument is a PDF path.  Besides the
    usual pdf2txt options this accepts ``-r`` (roundCoords) and ``-S``
    (simplifyOutput), both forwarded to the XML converter, and ``-f``
    (formatOutput).

    With ``-f`` and an output type ending in ``ml`` (html/xml) the converter
    writes into an in-memory buffer, which is then run through
    BeautifulSoup's ``prettify()`` and written to the real output; if
    ``bs4`` cannot be imported the buffer is written unformatted.  In every
    other case the converter writes straight to ``getRealOutput(outfile)``.

    Returns 100 (after printing the usage text) when the options cannot be
    parsed, no input file is named, or the output type is unknown.
    Otherwise returns None.
    """
    import getopt

    def usage():
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
              '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
              '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] [-r] '
              '[-S] [-f] file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'fSrdp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    roundCoords = False
    simplifyOutput = False
    formatOutput = False
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            # Page numbers are 1-based on the command line.
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            # NOTE: after -n, laparams is None, so a later -A/-V/-M/-L/-W/-F
            # raises AttributeError.
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
        elif k == '-r':
            roundCoords = True
        elif k == '-S':
            simplifyOutput = True
        elif k == '-f':
            formatOutput = True

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject an unknown type before any output stream is acquired.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    # Remember whether output is buffered: the post-processing step below
    # must only call getvalue() when outfp really is the in-memory buffer
    # (-f combined with text/tag output writes to the real stream).
    buffered = formatOutput and outtype.endswith('ml')
    if buffered:
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        outfp = StringIO()
    else:
        outfp = getRealOutput(outfile)
    if outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                              imagewriter=imagewriter, layoutmode=layoutmode,
                              scale=scale, roundCoords=roundCoords, simplifyOutput=simplifyOutput)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=imagewriter)
    else:
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    for fname in args:
        fp = open(fname, 'rb')
        try:
            process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
                        caching=caching, check_extractable=True)
        finally:
            fp.close()
    device.close()
    if buffered:
        root = outfp.getvalue()
        with getRealOutput(outfile) as realOutput:
            try:
                from bs4 import BeautifulSoup as bs
            except ImportError:
                sys.stderr.write('Could not import BeautifulSoup, skipping output formatting')
                realOutput.write(root)
            else:
                realOutput.write(bs(root).prettify())

    outfp.close()
    return