Example #1
0
def process_pdf(file):
    """Collect the text of every page of a PDF.

    Parameters:
        file: object handed unchanged to ``PDFPage.get_pages``
            (presumably a PDF opened in binary mode -- confirm with callers).

    Returns:
        A list of strings; the entry at index ``i`` holds the concatenated
        text of all ``LTTextBox`` elements found on page ``i``.
    """
    # Build the pdfminer pipeline: resource manager -> aggregator -> interpreter.
    resource_manager = PDFResourceManager()
    device = PDFPageAggregator(resource_manager, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resource_manager, device)

    def page_text(page):
        # Interpreting a page makes its layout available through the device.
        page_interpreter.process_page(page)
        # Only text boxes contribute text; anything else (e.g. images) is skipped.
        return "".join(
            item.get_text()
            for item in device.get_result()
            if isinstance(item, LTTextBox)
        )

    # One string per page, in document order.
    return [page_text(page) for page in PDFPage.get_pages(file)]
    def __init__(self, pdf, codec='utf-8'):
        """
        Parse a PDF, storing its text lines and handing images to helpers.

        Parameters:
        --------------
        pdf:        object passed directly to PDFParser (presumably an
                    already opened binary file -- TODO confirm with callers)
        codec:      codec forwarded to TextConverter, default utf-8

        Attributes:
        ---------------
        pdf:            the ``pdf`` argument as given
        codec:          the ``codec`` argument as given
        records:        list of text lines extracted from the pdf
        text:           string, initialised to "" (not filled in here)
        didascalies:    list, initialised empty (not filled in here)
        nimages:        int, initialised to 0
        images:         list, initialised empty

        Raises:
        ---------------
        PDFTextExtractionNotAllowed:    the document is not extractable

        """
        self.pdf = pdf
        self.text = ""
        self.records = []
        self.didascalies = []
        self.nimages = 0
        self.images = []

        document = PDFDocument(PDFParser(pdf))
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, text buffer and layout parameters; the manager
        # and the parameters are shared by both passes below.
        rsrcmgr = PDFResourceManager()
        text_buffer = StringIO()
        laparams = LAParams()
        self.codec = codec

        # First pass: render every page as text into the buffer.
        text_device = TextConverter(rsrcmgr, text_buffer, codec=codec, laparams=laparams)
        text_interpreter = PDFPageInterpreter(rsrcmgr, text_device)
        for page in PDFPage.create_pages(document):
            text_interpreter.process_page(page)

        # Second pass: walk each page layout looking for images and figures.
        layout_device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        layout_interpreter = PDFPageInterpreter(rsrcmgr, layout_device)
        for page in PDFPage.create_pages(document):
            layout_interpreter.process_page(page)
            page_layout = layout_device.get_result()
            if page_layout is None:
                continue
            for item in page_layout:
                if isinstance(item, LTImage):
                    self.save_image(item)
                if isinstance(item, LTFigure):
                    self.find_images_in_thing(item)

        # Every line of the rendered text becomes one record.
        self.records.extend(text_buffer.getvalue().splitlines())
Example #3
0
File: pdf.py Project: rrbn/tiltr
def _extract_pdf_scores(stream):
    """Read the result table on the first page of a PDF into a dict.

    The table is located through the text box that contains the header
    word "Reihenfolge". Its text after the header line is split on
    whitespace and consumed in groups of six tokens; for each group the
    token at index 2 becomes a key and the token at index 4 its value
    (presumably question title and score -- see the note further down).

    Parameters:
        stream: object handed unchanged to ``PDFParser``.

    Returns:
        dict mapping token 2 to token 4 of every six-token group.

    Raises:
        IndexError: if no horizontal text box on the page contains the
            header word.
    """
    # these layout parameters seem to work ok with the ILIAS default PDF
    # formatting as well as with UR custom styling.
    # see pdf/tests/default_style.pdf and pdf/tests.ur_style.pdf
    layout_params = LAParams(
        line_overlap=0,
        char_margin=20,
        word_margin=0.1,
        boxes_flow=0,
        detect_vertical=False,
    )

    resources = PDFResourceManager()
    aggregator = PDFPageAggregator(resources, laparams=layout_params)
    page_interpreter = PDFPageInterpreter(resources, aggregator)

    document = PDFDocument(PDFParser(stream))

    # Only the first page is inspected.
    first_page = next(PDFPage.create_pages(document))
    page_interpreter.process_page(first_page)

    order_name = "Reihenfolge"  # FIXME localize

    horizontal_boxes = []
    header_y = None  # y position of the result table header
    for item in aggregator.get_result():
        if not isinstance(item, LTTextBoxHorizontal):
            continue
        horizontal_boxes.append(item)
        if order_name in item.get_text().strip():
            header_y = item.y0

    # Keep the boxes that sit at exactly the header's y position.
    aligned = [box for box in horizontal_boxes if box.y0 == header_y]

    # with suitable layout parameters the first such box should carry the
    # text of the whole results table.
    table_text = aligned[0].get_text().replace('\t', '')
    table_text = table_text[table_text.find(order_name):]

    # note: question titles might lack spaces; this is no problem
    # since question names and scores are only compared through
    # Result.normalize_question_title() later.
    scores = {}
    pending = []
    for row in table_text.split("\n")[1:]:
        pending.extend(re.split(r'\s+', row))
        if len(pending) >= 6:
            scores[pending[2]] = pending[4]
            pending = pending[6:]

    return scores
Example #4
0
 def __init__(self, ofile):
     """Create the pdfminer device/interpreter pair and reset parser state.

     ``ofile`` is only stored (as ``self.ofile``) here; the font and
     rule/header tracking attributes all start in their "off" state.
     """
     resources = PDFResourceManager()
     self.device = PDFPageAggregator(resources, laparams=LAParams())
     self.interpreter = PDFPageInterpreter(resources, self.device)
     self.ofile = ofile
     # Tracking state, initially cleared.
     self.last_font = None
     self.font_print_pending = False
     self.in_rule = False
     self.header_footer_skipping = False
def convert(pdffile):
    """Extract all text from a PDF and write it, UTF-8 encoded, to ``log_file``.

    Parameters:
        pdffile: path of the PDF file to read.

    Raises:
        PDFTextExtractionNotAllowed: if the document is not extractable.

    Note:
        ``password`` and ``log_file`` are not defined in this function; they
        must be provided by the enclosing module.
    """
    text_parts = []

    # The context manager guarantees the PDF is closed even when parsing or
    # the extractability check raises.
    with open(pdffile, "rb") as fp:
        # Parser reads the raw PDF; the document holds the parsed structure.
        parser = PDFParser(fp)
        document = PDFDocument(parser, password)

        # Abort if the document does not allow text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # The resource manager stores shared resources such as fonts or images.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()

        # The page aggregator exposes each interpreted page as LT layout objects.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            # Of the LT objects in the layout only text boxes and text lines
            # carry the text we want.
            text_parts.extend(
                lt_obj.get_text()
                for lt_obj in layout
                if isinstance(lt_obj, (LTTextBox, LTTextLine))
            )

    with open(log_file, "wb") as my_log:
        my_log.write("".join(text_parts).encode("utf-8"))
    print("Done !!")
Example #6
0
def main(args):
    """Parse one PDF file and report pdfminer layout-object statistics.

    Parameters:
        args: command-line arguments; exactly one item, the PDF filename,
            is expected.

    Returns:
        1 when the number of arguments is wrong (after emitting the usage
        text through ``msg``); otherwise None.

    Raises:
        PDFTextExtractionNotAllowed: if the document is not extractable.
    """
    msg(SCRIPT, args)

    if len(args) != 1:
        msg('Parse a PDF file and print some pdfminer-specific stats')
        msg('Usage:', SCRIPT, '<PDF-filename>')
        return 1

    infilename, = args

    lt_types = collections.Counter()
    # Start at zero so the summary below also works for a PDF without pages
    # (the loop variable would otherwise never be bound).
    page_count = 0

    with open(infilename, 'rb') as pdf_file:

        # Create a PDF parser object associated with the file object.
        parser = PDFParser(pdf_file)

        # Create a PDF document object that stores the document structure.
        # Supply the password for initialization.
        password = ''
        document = PDFDocument(parser, password)
        # Check if the document allows text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed(infilename)

        # Make a page iterator
        pages = PDFPage.create_pages(document)

        # Set up for some analysis
        rsrcmgr = PDFResourceManager()
        laparams = LAParams(
            detect_vertical=True,
            all_texts=True,
        )
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Look at all (nested) objects on each page
        for page_count, page in enumerate(pages, 1):
            # The device only holds the layout of the page processed last,
            # so it has to be fetched right after process_page().
            interpreter.process_page(page)
            layout = device.get_result()

            lt_types.update(type(item).__name__ for item in flat_iter(layout))

    msg('page_count', page_count)
    msg('lt_types:', ' '.join('{}:{}'.format(*tc) for tc in lt_types.items()))
Example #7
0
def pdfparser(filename):
    """Convert a PDF into a text file, dropping repeated page headers.

    The text of every page is split into stripped, non-empty lines. When the
    first line of a page equals the first line kept from the previous
    page(s) it is treated as a repeated header and removed. The result is
    written next to the input, with the extension replaced by ``.txt``.

    Parameters:
        filename: path of the PDF file to read.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    paginas = []
    # Context manager so the PDF is closed even if a page fails to parse.
    with open(filename, 'rb') as fp:
        # Process each page contained in the document.
        for page in PDFPage.get_pages(fp, check_extractable=False):
            interpreter.process_page(page)
            layout = device.get_result()
            pagina = []
            for element in layout:
                if not isinstance(element, (LTTextBox, LTText)):
                    continue
                texto = element.get_text()
                if not texto:
                    continue
                if len(texto) < 2 and pagina:
                    # A single character is glued onto the previous line.
                    pagina[-1] += texto
                else:
                    # Also reached by a single character that opens the page:
                    # there is no previous line yet, so it starts a new one.
                    pagina += texto.split("\n")

            # Drop empty lines and surrounding whitespace.
            pagina_limpa = [linha.strip() for linha in pagina if linha.strip()]

            if pagina_limpa:
                paginas.append(pagina_limpa)

    text_filename = os.path.splitext(filename)[0] + '.txt'
    with open(text_filename, 'w') as textfile:
        # Remove the header: a first line identical to the current header
        # candidate is discarded, a different one becomes the new candidate.
        header_candidato = ''
        for pagina in paginas:
            # Lines are already stripped at this point.
            if pagina[0] != header_candidato:
                header_candidato = pagina[0]
            else:
                pagina.pop(0)

            for linha in pagina:
                textfile.write("%s\n" % linha)
Example #8
0
def convert_pdf_to_txt(path):
    """Return the text of the PDF at ``path`` as a single string.

    Only ``LTTextBox`` and ``LTTextLine`` layout objects contribute text;
    they are concatenated in the order the pages and objects are yielded.

    Parameters:
        path: path of the PDF file to read.

    Returns:
        str with the extracted text (empty if nothing was found).
    """
    parts = []
    # Pages are pulled from the document while iterating, so all processing
    # stays inside the ``with`` block; the file is closed afterwards even if
    # parsing raises.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        # Parser and document have to be linked to each other before the
        # document is initialised (here with an empty password).
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            parts.extend(
                lt_obj.get_text()
                for lt_obj in layout
                if isinstance(lt_obj, (LTTextBox, LTTextLine))
            )
    return "".join(parts)
def parse(path):
    """Extract the text of a PDF page by page.

    Parameters:
        path: path of the PDF file to read.

    Returns:
        list of str, one entry per page, each holding the concatenated text
        of the page's ``LTTextBoxHorizontal`` objects.

    Raises:
        PDFTextExtractionNotAllowed: if the document is not extractable.
    """
    fulltext = []
    # Open in binary read mode; the context manager closes the file on
    # success as well as when extraction is refused or parsing fails.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        # Create a PDF document.
        doc = PDFDocument()
        # Connect the parser and the document object to each other.
        parser.set_document(doc)
        doc.set_parser(parser)

        # Initialise the document; no password is supplied.
        doc.initialize()

        # Refuse to continue when the document does not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Create the PDF resource manager that manages shared resources.
        rsrcmgr = PDFResourceManager()
        # Create the PDF device object.
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create the PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Walk through the pages, handling the content of one page at a time.
        for page in doc.get_pages():
            interpreter.process_page(page)
            # Fetch the LTPage object of this page.
            layout = device.get_result()
            fulltext.append("".join(
                x.get_text()
                for x in layout
                if isinstance(x, LTTextBoxHorizontal)
            ))
    return fulltext
Example #10
0
from pdfminer3.layout import LAParams, LTTextBox, LTLine, LTTextLine
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator
# Script: print the text of every page of one hard-coded PDF file.
extracted_text = ''

# The context manager closes the PDF once extraction has finished or failed.
with open('C:\\Users\\Ritvik\\Desktop\\Tekoaly\\PDF\\768686236423.pdf', 'rb') as fp:
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = PDFPage.get_pages(fp)

    for page in pages:
        print('Processing next page...')
        interpreter.process_page(page)
        layout = device.get_result()
        for lt_obj in layout:
            if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                extracted_text += lt_obj.get_text()
        # NOTE(review): extracted_text is cumulative, so this prints the text
        # of all pages seen so far after every page -- confirm that is wanted.
        print(extracted_text)
        # for lobj in layout:
        #     if isinstance(lobj, LTTextBox):
        #         x, y, text = lobj.bbox[0], lobj.bbox[3], lobj
        #         print(' text: %s' % (text) ,end=' ')
Example #11
0
class VhdlSpecParser():
    """Copy grammar-rule lines from PDF pages to a file, tagging font changes.

    A line containing "::=" starts a rule; following lines belong to the rule
    until a non-empty line that does not start with a space is met. The
    characters of rule lines are written to ``ofile``, wrapped in
    ``<tag>``/``</tag>`` markers according to FONT_TRANSLATION. Lines from
    the footer up to and including the next line starting with "Std 1076-"
    are skipped.
    """

    # Maps a PDF font name to the tag written around text in that font;
    # None means the text is written without a tag.
    FONT_TRANSLATION = {"HEFBHG+TimesNewRomanPS-ItalicMT": "it",
                        "HEFBAE+TimesNewRomanPS-BoldMT": "b",
                        "HEFBBF+TimesNewRomanPSMT": None,
                        None:None}
    # A line starting with this text switches header/footer skipping on.
    FOOTER_STR = 'Copyright © 2009 IEEE. All rights reserved.'

    def __init__(self, ofile):
        """Create the pdfminer pipeline and reset the parsing state.

        ``ofile`` must offer ``write(str)``; all output goes through it.
        """
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        self.device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
        self.last_font = None
        self.in_rule = False
        self.font_print_pending = False
        self.header_footer_skipping = False
        self.ofile = ofile

    def parse_page(self, page):
        """Interpret ``page`` and process the objects of its layout."""
        self.interpreter.process_page(page)
        layout = self.device.get_result()
        self.parse_obj(layout._objs)

    def parse_obj(self, objs):
        """Process layout objects: text lines of text boxes, figures recursively.

        Objects that are neither text boxes nor figures are ignored.
        """
        for obj in objs:
            if isinstance(obj, pdfminer3.layout.LTTextBox):
                for o in obj._objs:
                    if isinstance(o, pdfminer3.layout.LTTextLine):
                        self._parse_text_line(o)
            # if it's a container, recurse
            elif isinstance(obj, pdfminer3.layout.LTFigure):
                # Must go through self: parse_obj is a method, not a global.
                self.parse_obj(obj._objs)

    def _parse_text_line(self, line):
        """Update the skipping/rule state for one text line and emit it if due."""
        text = line.get_text()

        if self.header_footer_skipping:
            # The header line that ends the skipping is itself skipped too.
            if text.startswith("Std 1076-"):
                self.header_footer_skipping = False
            return

        if text.startswith(self.FOOTER_STR):
            self.header_footer_skipping = True
            return

        is_rule_header = "::=" in text
        if not (is_rule_header or self.in_rule):
            return

        self.in_rule = True
        # A non-empty continuation line that is not indented ends the rule.
        if not is_rule_header and text and not text.startswith(" "):
            self.in_rule = False
            return

        if text.strip():
            self._write_chars(line._objs)

    def _write_chars(self, chars):
        """Write the characters of a rule line, inserting font tags as needed."""
        font_translation = self.FONT_TRANSLATION

        for c in chars:
            char_text = c.get_text()
            is_space = char_text.isspace()

            if isinstance(c, pdfminer3.layout.LTChar) and self.last_font != c.fontname:
                # this character has different font need to propagate it to output
                self.font_print_pending = True

            if is_space:
                open_tag = font_translation[self.last_font]
                if open_tag is not None:
                    # close the open font tag directly after the word, before
                    # the whitespace; the tag is looked up from last_font so
                    # it is defined even on the first character of a call
                    self.font_print_pending = True
                    self.ofile.write("</%s>" % open_tag)
                    self.last_font = None

            if self.font_print_pending and not is_space:
                self.font_print_pending = False
                f = font_translation[self.last_font]
                if f:
                    self.ofile.write("</%s>" % f)

                f = font_translation[c.fontname]
                if f:
                    self.ofile.write("<%s>" % f)

                self.last_font = c.fontname
            self.ofile.write(char_text)
Example #12
0
        for child in layout_obj:
            boxes.extend(find_textboxes_recursively(child))

        return boxes

    return []  # その他の場合は空リストを返す。


# Set the Layout Analysis parameters. Enable detection of vertical writing.
laparams = LAParams(detect_vertical=True)

# Create the resource manager that manages the shared resources.
resource_manager = PDFResourceManager()

# Create the PageAggregator object that collects the pages.
device = PDFPageAggregator(resource_manager, laparams=laparams)

# Create the Interpreter object.
interpreter = PDFPageInterpreter(resource_manager, device)

# Text file for the output
# output_txt = open('output.txt', 'w')


def print_and_write(txt):
    """Print ``txt`` to standard output.

    Writing the same text to an output file is currently disabled; see the
    commented-out lines below.
    """
    print(txt)
    # output_txt.write(txt)
    # output_txt.write('\n')


with open(sys.argv[1], 'rb') as f:
 def _createDeviceInterpreter(self):
     """Build a new page aggregator and an interpreter bound to it.

     Returns:
         ``(device, interpreter)`` -- a PDFPageAggregator using default
         LAParams and a PDFPageInterpreter sharing its resource manager.
     """
     resources = PDFResourceManager()
     aggregator = PDFPageAggregator(resources, laparams=LAParams())
     return aggregator, PDFPageInterpreter(resources, aggregator)