Example #1
0
def get_text_rows(path):
    """Yield the horizontal text boxes of a PDF grouped into rows.

    Every horizontal text box found on a page (including those nested inside
    figures) is bucketed by ``(page index, int(bbox[1]))``, i.e. by page and
    by the integer part of the box's second bounding-box coordinate.

    This is a generator: the PDF is only read when iteration starts. The
    file is fully parsed and closed before the first row is yielded.

    :param path: path of the PDF file to read
    :return: generator of ``(page, y, cells)`` tuples, where ``page`` is the
        0-based page index, ``y`` is ``int(bbox[1])`` of the boxes in the row
        and ``cells`` is a sorted list of ``(int(bbox[0]), text)`` tuples.
        Rows are ordered by page, then by descending ``y``.
    """
    rows = defaultdict(list)

    # Shared resource manager plus an aggregating device that performs
    # layout analysis with very tight line grouping margins.
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    laparams.line_overlap = 0.01
    laparams.line_margin = 0.01
    laparams.word_margin = 0.15
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    def parse_obj(lt_objs, page):
        """Collect text boxes from ``lt_objs`` into ``rows`` for ``page``."""
        for obj in lt_objs:
            if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
                # The y coordinate is negated so that sorting the keys in
                # ascending order walks each page from high y to low y.
                rows[(page, -int(obj.bbox[1]))].append(
                    (int(obj.bbox[0]), sanitize(obj.get_text())))
            elif isinstance(obj, pdfminer.layout.LTFigure):
                # Figures are containers: recurse into their children.
                parse_obj(obj._objs, page)

    # The context manager guarantees the file handle is released even if
    # parsing fails part-way through the document.
    with open(path, 'rb') as fp:
        for page_num, page in enumerate(PDFPage.get_pages(fp)):
            interpreter.process_page(page)
            parse_obj(device.get_result()._objs, page_num)

    for key in sorted(rows):
        page, neg_y = key
        yield (page, -neg_y, sorted(rows[key]))
Example #2
0
def extract_layout_by_page(pdf_path, page_number):
    """
    Run pdfminer layout analysis on one page of a PDF file.

    :param pdf_path:  pdf file path
    :param page_number:      the specific page that you want to parse(start from 1)
    :return: the layout object the page aggregator produced for that page
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction
    :raises IndexError: if ``page_number`` is outside ``1..number of pages``
    """
    if page_number < 1:
        # Without this guard ``pages[page_number - 1]`` would silently wrap
        # around and return a page counted from the end of the document.
        raise IndexError('page_number must be >= 1, got %r' % (page_number,))

    # Open in binary mode; the context manager closes the file on every path.
    with open(pdf_path, 'rb') as fp:
        # Create the parser and the document, then link them to each other.
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        doc.initialize()
        # Abort if the document does not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager for shared resources and layout parameters.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.line_overlap = 0.3
        laparams.char_margin = 3
        laparams.word_margin = 0.3
        laparams.line_margin = 0.01

        # Aggregating device plus the interpreter that feeds pages into it.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        pages = list(doc.get_pages())

        interpreter.process_page(pages[page_number - 1])
        # Fetch the layout object computed for the processed page.
        return device.get_result()
def _frequent_values(values, min_count):
    """Return the distinct items of ``values`` seen more than ``min_count`` times."""
    from collections import Counter

    return [
        value for value, count in Counter(values).items() if count > min_count
    ]


def main(files=None):
    """Convert PDF files to text files split into body, figures, tables, etc.

    For every input file the pages are run through a ``TextCon`` device and
    the collected ``device.rows`` are sorted into body text, table data,
    figure captions, headers, footers and supporting info. The result is
    written next to the input file with a ``.txt`` extension.

    Rows are indexed positionally; from the way they are used below they are
    presumably ``(page, x0, y0, x1, y1, text)`` -- TODO confirm against
    ``TextCon``.

    Files that fail to parse, or that yield no rows, are reported and skipped.

    :param files: iterable of PDF paths; defaults to ``get_datafiles()``
    """
    if files is None:
        files = get_datafiles()
    # debug option level
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    rotation = 0
    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)

    # Layout analysis parameters
    laparams = LAParams()
    laparams.all_texts = True
    laparams.detect_vertical = True
    laparams.line_overlap = 0.3  # Line overlap
    laparams.char_margin = 2.0  # Letter Spacing
    laparams.line_margin = 0.5  # Line Spacing
    laparams.word_margin = 0.1  # Word spacing
    laparams.boxes_flow = 0.5  # +-1.0  how much hor vs. vertical matters
    # position matters for line continuation

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    # Compiled once: the pattern does not depend on the file being processed.
    supp_re = re.compile(
        r"Corresponding author|Electronic mail|email"
        "|E-mail|^doi|doi:|^keywords|^pacs|^apc", re.I)

    for fname in files:
        fname = str(fname)
        imagedir = os.path.abspath(os.path.join(os.path.dirname(fname), 'img'))
        imagewriter = ImageWriter(imagedir)  # output folder for images
        name = os.path.splitext(os.path.basename(fname))[0]
        print(name)
        # splitext handles extensions of any length, unlike slicing off 4 chars
        outfile = os.path.splitext(fname)[0] + '.txt'
        device = TextCon(rsrcmgr,
                         laparams=laparams,
                         imagewriter=imagewriter,
                         imagename=name)

        interpreter = PDFPageInterpreter(rsrcmgr, device)

        with open(fname, 'rb') as fp:
            try:
                for page in PDFPage.get_pages(fp,
                                              pagenos,
                                              maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
            except Exception as exc:
                # Best effort: report the unreadable file and move on.
                print('Skipping', fname, 'because it could not be parsed:',
                      exc)
                device.close()
                continue

        # Copy the rows before closing the device; each row becomes a
        # mutable list because same-line rows are merged in place below.
        rows = [list(row) for row in device.rows]
        device.close()

        if not rows:
            # Nothing to measure or write; the statistics below need rows.
            print('Skipping', fname, 'because no text rows were found')
            continue

        pages = max(row[0] for row in rows)
        max_y = max(row[4] for row in rows)
        min_y = min(row[2] for row in rows)

        # Extreme y values among those occurring more than ``pages - 1``
        # times; fall back to the overall extremes when no value qualifies.
        frequent_tops = _frequent_values([int(row[4]) for row in rows],
                                         pages - 1)
        max_y2 = max(frequent_tops) if frequent_tops else int(max_y)

        frequent_bottoms = _frequent_values([int(row[2]) for row in rows],
                                            pages - 1)
        min_y2 = min(frequent_bottoms) if frequent_bottoms else int(min_y)

        print('max_ys:', max_y - max_y2)
        print('min_ys:', min_y - min_y2)

        # Get max and min the hard way because of headers: only x values
        # seen more than 10 times count, unless there are none at all.
        frequent_rights = _frequent_values([int(row[3]) for row in rows], 10)
        if frequent_rights:
            max_x = max(frequent_rights)
        else:
            max_x = max(int(row[3]) for row in rows)

        frequent_lefts = _frequent_values([int(row[1]) for row in rows], 10)
        if frequent_lefts:
            min_x = min(frequent_lefts)
        else:
            # The fallback reads the same column (1) as the primary path.
            min_x = min(int(row[1]) for row in rows)

        # Errors if more pics on one side then other
        # mid_x = (sum([(float(row[1]) + float(row[3]))/2 for row in
        #    device.rows])/len(device.rows))
        mid_x = (max_x + min_x) / 2
        # mid_x = 595/2  # center of A4 at 72px/in Letter would be 612/2
        l_height = sum(row[4] - row[2] for row in rows) / len(rows)

        print('l_height:', l_height)

        column2 = []
        lines = []
        pagenumber = 0
        table_caps = ['\n']
        table_data = []
        table = False

        for i, row in enumerate(rows):
            # NOTE(review): for i == 0 this compares against the LAST row
            # (index -1 wraps around); behaviour kept as in the original.
            l_space = rows[i - 1][2] - row[4]

            if row[0] == pagenumber + 1:
                # New page: flush the pending second column first.
                lines += column2
                column2 = []
                pagenumber += 1

            if row[0] == pagenumber:
                if (max_y - min_y) * 0.95 > l_space > 0.8 * l_height:
                    # capture Table (assuming tables will span all columns)
                    if re.match(r"^table", str(row[5]), re.I):
                        table = True
                        table_caps.append(str(row[5]))
                        table_data.append('\n')
                        table_data.append(str(row[5]))
                        table_data.append('\n')
                        continue
                    else:
                        table = False

                # capture table captions multi lines
                elif (table_caps[-1] == str(rows[i - 1][5])
                      and -2 * l_height < l_space < 0.5 * l_height):
                    table_caps[-1] += str(row[5])
                    table_data[-2] += str(row[5])
                    continue

                if table:
                    # capture table data; cells sharing a y value are
                    # joined with tabs into one output line
                    if int(rows[i - 1][2]) == int(rows[i][2]):
                        table_data[-1] += '\t' + str(row[5])
                        continue
                    else:
                        table_data.append(str(row[5]))
                        continue

                elif int(row[1]) > mid_x and ((int(rows[i - 1][1]) < mid_x and
                                               int(rows[i - 1][3]) < mid_x) or
                                              (int(rows[i - 1][1]) > mid_x
                                               and int(rows[i - 1][3]) > mid_x)
                                              or rows[i - 1][3] > max_x * 0.9
                                              or l_space > 2.5 * l_height):
                    # Row starts right of the middle: collect it in column2.
                    if len(column2) > 0:
                        if 1 > (row[2] - column2[-1][2]) > -1:
                            # join if on same line
                            if int(row[1]) < int(column2[-1][1]):
                                column2[-1][5] = row[5] + " " + column2[-1][5]
                            else:
                                column2[-1][5] = column2[-1][5] + " " + row[5]
                        else:
                            column2.append(row)
                    else:
                        column2.append(row)
                else:
                    if len(lines) > 0:
                        if 1 > (row[2] - lines[-1][2]) > -1:
                            # join if on same line
                            if int(row[1]) < int(lines[-1][1]):
                                lines[-1][5] = row[5] + " " + lines[-1][5]
                            else:
                                lines[-1][5] = lines[-1][5] + " " + row[5]
                        else:
                            lines.append(row)
                    else:
                        lines.append(row)
        # add final column
        lines += column2

        fig_caps = ['\n']
        headers = ['\n']
        footers = ['\n']
        supp_info = ['\n']
        new_lines = []

        for i, line in enumerate(lines):
            # Vertical gaps to the previous line and to the next two lines.
            l_space = lines[i - 1][2] - lines[i][4]
            l_space_below = 0
            l_space_2below = 0
            if i + 1 < len(lines):
                l_space_below = lines[i][2] - lines[i + 1][4]
            if i + 2 < len(lines):
                l_space_2below = lines[i + 1][2] - lines[i + 2][4]
            print(l_space, l_space_below, l_space_2below, lines[i][2],
                  lines[i][4], str(line[5]))

            # capture figure captions multi lines
            if (fig_caps[-1] == str(lines[i - 1][5])
                    and -2 * l_height < l_space < 0.5 * l_height):
                fig_caps.append(str(line[5]))
                continue
            # capture headers (up to two lines)
            if (lines[i][2] > max_y * 0.95
                    and (l_space_below > 0.5 * l_height
                         or l_space_2below > 0.5 * l_height)):
                headers.append('\n')
                headers.append(str(line[5]))
                # NOTE(review): a header matching supp_re is appended to
                # headers twice and then also recorded as supporting info;
                # kept as in the original -- confirm this is intended.
                if supp_re.search(str(line[5])):
                    headers.append('\n')
                    headers.append(str(line[5]))
                else:
                    continue
            # capture supporting info
            if supp_re.search(str(line[5])):
                print(str(line[5]))
                supp_info.append('\n')
                supp_info.append(str(line[5]))
                continue
            if (max_y - min_y) * 0.95 > l_space > 0.5 * l_height:
                # capture figure captions
                if re.match(r"^fig", str(line[5]), re.I):
                    fig_caps.append('\n')
                    fig_caps.append(str(line[5]))
                    continue
                # capture footers
                elif lines[i][2] < min_y + max_y * 0.015:
                    footers.append('\n')
                    footers.append(str(line[5]))
                    continue
                else:
                    string = str(lines[i - 1][5])

                    # Start a new paragraph unless the previous line was
                    # filed as a figure caption or a header.
                    if (any(string in s for s in fig_caps)
                            or any(string in s for s in headers)):
                        pass
                    else:
                        new_lines.append('\n')
            new_lines.append(str(line[5]))

        with open(outfile, 'w') as f:
            f.write(' '.join(new_lines))
            f.write('\n\nFigures')
            f.write(' '.join(fig_caps))
            f.write('\n\nTables')
            f.write('\n'.join(table_data))
            f.write('\n\nHeaders')
            f.write(' '.join(headers))
            f.write('\n\nFooters')
            f.write(' '.join(footers))
            f.write('\n\nSupporting Info')
            f.write(' '.join(supp_info))

    print('Done')
Example #4
0
import pandas as pd
from bs4 import BeautifulSoup


def convert(case, pdfpath, targetfilepath, pages=100):
    if not pages: pagenums = set()
    else: pagenums = set(pages)
    manager = PDFResourceManager()
    codec = 'utf-8'
    caching = True
    laparams = LAParams(all_texts=True)
    laparams.boxes_flow = -0.5
    # laparams.paragraph_indent = 0.2
    laparams.detect_vertical = True
    # laparams.heuristic_word_margin = 0.03
    laparams.line_overlap = 0.2
    laparams.word_margin = 0.2
    laparams.line_margin = 0.5
    laparams.char_margin = 1000.0

    if case == 'text':
        output = io.StringIO()
        converter = TextConverter(manager,
                                  output,
                                  codec=codec,
                                  laparams=LAParams())
    if case == 'HTML':
        output = io.BytesIO()
        converter = HTMLConverter(manager,
                                  output,
                                  codec=codec,