Exemple #1
0
def convert_pdf2txt(input_path: str,
                    output_path: str,
                    verbose=params["DEFAULT_VERBOSE"]) -> None:
    """Convert every PDF matched by ``input_path + '*.pdf'`` to a text file.

    For each matched PDF the text is written to
    ``output_path + <last path component of the PDF> + '.txt'`` (so
    ``a.pdf`` becomes ``a.pdf.txt``).  An output file that already exists
    is left untouched.  Any error raised while handling one PDF is printed
    and the loop moves on to the next file.

    Args:
        input_path: Prefix to which ``'*.pdf'`` is appended to build the
            glob pattern.
        output_path: Prefix to which the output file name is appended.
        verbose: Currently unused.

    NOTE(review): ``retstr`` is never attached to the device, so ``data``
    is always the empty string.  The base ``PDFDevice`` also does not seem
    to provide ``get_result()`` (that lives on the layout-analysing
    devices), so this probably always ends up in the ``except`` branch;
    a text converter device was presumably intended -- TODO confirm.
    """
    for file in tqdm(glob.glob(input_path + '*.pdf'),
                     ascii=True,
                     desc='pdf->txt'):
        try:
            # The context manager guarantees the PDF handle is released even
            # when parsing fails half-way through.
            with open(file, 'rb') as fp:
                parser = PDFParser(fp)
                document = PDFDocument(parser)
                if not document.is_extractable:
                    raise PDFTextExtractionNotAllowed

                rsrcmgr = PDFResourceManager()
                device = PDFDevice(rsrcmgr)
                interpreter = PDFPageInterpreter(rsrcmgr, device)

                retstr = StringIO()

                # Process each page contained in the document.
                for page in PDFPage.create_pages(document):
                    interpreter.process_page(page)
                    result = device.get_result()
                data = retstr.getvalue()
            print("RESULT:", result)
            print("DATA:", data)
            txt_file = output_path + file.split("/")[-1] + '.txt'
            # Test the full path directly: comparing it with the bare names
            # returned by os.listdir() never matched.
            if not os.path.exists(txt_file):
                with open(txt_file, "w") as txt_out:
                    txt_out.write(data)
        except Exception as e:
            print(e)
            print("Text document could not be created from %s" % (file))
Exemple #2
0
def get_text_rows(path):
    """Yield the horizontal text boxes of a PDF grouped into rows.

    Text boxes are grouped by ``(page index, int(bbox[1]))``, i.e. by page
    and by the integer part of their lower y coordinate.  Rows are yielded
    page by page and, within a page, from the largest y to the smallest.

    Args:
        path: Path of the PDF file to read.

    Yields:
        ``(page, y, cells)`` tuples where ``cells`` is the sorted list of
        ``(int(bbox[0]), sanitize(text))`` pairs found on that row.
    """
    rows = defaultdict(list)

    # Shared resources (fonts, ...) for the interpreter.
    rsrcmgr = PDFResourceManager()

    # Tight layout parameters so that boxes are only merged when they are
    # practically on the same line.
    laparams = LAParams()
    laparams.line_overlap = 0.01
    laparams.line_margin = 0.01
    laparams.word_margin = 0.15

    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    def parse_obj(lt_objs, page):
        """Collect text boxes from ``lt_objs``, descending into figures."""
        for obj in lt_objs:
            if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
                # The y coordinate is negated so that a plain ascending sort
                # of the keys walks each page from top to bottom.
                rows[(page, -int(obj.bbox[1]))].append(
                    (int(obj.bbox[0]), sanitize(obj.get_text())))
            elif isinstance(obj, pdfminer.layout.LTFigure):
                parse_obj(obj._objs, page)

    # All pages are consumed before the first row is yielded, so the file
    # can be closed here instead of being leaked.
    with open(path, 'rb') as fp:
        for page_num, page in enumerate(PDFPage.get_pages(fp)):
            interpreter.process_page(page)
            layout = device.get_result()
            parse_obj(layout._objs, page_num)

    for key in sorted(rows):
        page, neg_y = key
        yield (page, -neg_y, sorted(rows[key]))
Exemple #3
0
 def __init__(self, rsrc, outfp, codec='utf-8'):
   """Initialise the device on top of ``PDFDevice``.

   ``rsrc`` is forwarded to ``PDFDevice.__init__``; ``outfp`` and
   ``codec`` are stored as given.  The page counter starts at 0 and no
   tag is open initially.
   """
   PDFDevice.__init__(self, rsrc)
   self.outfp, self.codec = outfp, codec
   self.pageno, self.tag = 0, None
    def text_extraction(self):
        """Run layout analysis on ``self.pdf_name`` and collect per-page results.

        Returns:
            A list with one entry per page; each entry is whatever
            ``self.parse_layout_obj_page_wise`` returns for that page's
            layout objects when started with an empty list.

        Raises:
            PDFTextExtractionNotAllowed: If the document forbids extraction.
        """
        complete_report = []

        # Use a context manager so the PDF handle is closed even when
        # parsing raises.
        with open(self.pdf_name, 'rb') as open_pdf_file:
            document = PDFDocument(PDFParser(open_pdf_file))

            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed

            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                layout = device.get_result()
                complete_report.append(
                    self.parse_layout_obj_page_wise(layout._objs, []))

        return complete_report
def handle_files(pdf_file, uTextList):
    """Store an uploaded PDF and redact the given texts from it.

    The upload is saved as a ``Document``, its pages are run through
    pdfminer layout analysis, and for every entry of ``uTextList`` that
    occurs in the extracted text a content filter replacing it with
    ``"XXXXXX"`` is registered.  Finally ``redactor`` is called with the
    collected options and the stored file name.

    Args:
        pdf_file: The uploaded file object; it is passed both to
            ``Document(docfile=...)`` and to ``PDFParser``.
        uTextList: Iterable of strings to redact.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids extraction.

    Side effects: rebinds the module globals ``docfileName``,
    ``text_cord_dict`` and ``coord``.
    """
    print(pdf_file)
    newdoc = Document(docfile=pdf_file)
    newdoc.save()
    global docfileName
    # Keep only the last path component of the stored file name.
    docfileName = newdoc.docfile.name.rsplit('/', 1)[-1]

    # Create a PDF parser object associated with the file object.
    parser = PDFParser(pdf_file)

    # Create a PDF document object that stores the document structure.
    # Password for initialization as 2nd parameter if Password Protected PDF
    document = PDFDocument(parser)

    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()

    # Create a PDF device object.
    # (Immediately replaced by the page aggregator below.)
    device = PDFDevice(rsrcmgr)

    # BEGIN LAYOUT ANALYSIS
    # Set parameters for analysis.
    laparams = LAParams()

    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)

    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # loop over all pages in the document
    for page in PDFPage.create_pages(document):
        # read the page into a layout object
        interpreter.process_page(page)
        layout = device.get_result()

        # extract text from this object
        # NOTE(review): presumably parse_obj fills the module-level
        # cord_list / text_list used below -- verify against its definition.
        parse_obj(layout._objs)
        global text_cord_dict, coord, text_list
        # Rebuilt after every page from the lists as they stand at that point.
        text_cord_dict = dict(zip(cord_list, text_list))

    # NOTE(review): for a document without pages text_cord_dict is not
    # assigned here, so the loop below uses whatever value the module
    # already holds (or fails if there is none).

    # Initializing The ReadctorOptions Class
    options = RedactorOptions()
    options.content_filters = []
    # `coord` is declared global above, so this loop rebinds the module-level
    # name on every iteration.
    for coord, textlist in text_cord_dict.items():
        for i in uTextList:
            for j in textlist:
                if i in j:
                    options.content_filters += [
                        # Replace every match of the user text with a fixed mask.
                        # NOTE(review): `i` is compiled as a regular expression
                        # without escaping although it was matched as a plain
                        # substring above -- confirm that is intended.
                        (re.compile(i), lambda m: "XXXXXX"),
                    ]

    # Hand the collected filters and the stored file name to the redactor.
    redactor(options, docfileName)
Exemple #6
0
def readPDFMinerTexts(fileObj):
    """Return the text of every top-level horizontal text box in a PDF.

    Args:
        fileObj: Path of the PDF file (it is passed to ``open``).

    Returns:
        A dict mapping a running index (0, 1, 2, ... across all pages, in
        layout order) to the text of the corresponding text box.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids extraction.
    """
    text_dict = {}
    # The context manager closes the file even if parsing raises.
    with open(fileObj, 'rb') as file_pointer:
        document = PDFDocument(PDFParser(file_pointer))

        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        resourceManager = PDFResourceManager()
        device = PDFPageAggregator(resourceManager, laparams=LAParams())
        interpreter = PDFPageInterpreter(resourceManager, device)

        box_index = 0  # running key; avoids shadowing the builtin `id`
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            for layout_obj in device.get_result():
                if isinstance(layout_obj, LTTextBoxHorizontal):
                    text_dict[box_index] = layout_obj.get_text()
                    box_index += 1
    return text_dict
	def parse_page(self):
		"""Run layout analysis on ``self.document`` and parse each page.

		Every processed page's layout objects are handed to
		``self.parse_layout_objects`` together with the 1-based page number.
		When ``self.pages`` is truthy, at most that many pages are parsed.
		"""
		# Shared resources for the interpreter.
		rsrcmgr = PDFResourceManager()

		# Page aggregator performing layout analysis with default parameters.
		device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
		interpreter = PDFPageInterpreter(rsrcmgr, device)

		for i, page in enumerate(PDFPage.create_pages(self.document)):
			# Check the limit before interpreting: the page that trips the
			# limit is discarded anyway, so there is no point in analysing it.
			if self.pages and i == self.pages:
				break

			# read the page into a layout object
			interpreter.process_page(page)
			layout = device.get_result()

			self.parse_layout_objects(layout._objs, (i+1))
def main():
    """Run pdfminer layout analysis over the first pages of the sample book.

    The book is looked up at ``<cwd>/../../input/`` and only the first ten
    pages are analysed.  The resulting layouts are not used further.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids extraction.
    """
    max_pages = 10
    path_project = os.path.abspath(
        os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_book = os.path.join(
        path_project, "input", "Contemporary Fixed Prosthodontics, 5ed.pdf")

    # The context manager closes the book even if parsing raises.
    with open(path_book, 'rb') as fp:
        document = PDFDocument(PDFParser(fp))
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        # Layout parameters and get_result() belong to the page aggregator;
        # the base PDFDevice takes neither, so the aggregator is used here.
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page_index, page in enumerate(PDFPage.create_pages(document)):
            if page_index >= max_pages:
                break
            interpreter.process_page(page)
            # The layout is fetched but, as before, not consumed.
            device.get_result()
Exemple #9
0
    def parse_pdf(self, file_name, start_page, end_page, save_folder):
        '''Parse a page range of a PDF into two tables and save them as CSV.

        Pages whose 0-based index lies in ``[start_page, end_page]`` are
        analysed and handed to ``self.parse_obj``.  The module-level
        ``ListOfStrings`` is then sorted top-to-bottom / left-to-right,
        scanned for table rows and cleared again after every page.  Its
        entries are read as ``((x, y), text)``; presumably ``parse_obj``
        fills it -- verify against its definition.

        Rows are recognised by hard-coded x/y positions (17, 482, 70..72),
        presumably specific to the expected document template.

        Non-empty tables are written through ``self.create_csv`` as
        ``<file stem>_A.csv`` and ``<file stem>_B.csv`` in ``save_folder``.

        Rows are addressed by their position in the sorted list rather
        than by searching for an equal entry, so duplicated entries pick
        up their own following lines.
        '''
        first_table = []
        second_table = []
        origen = ''
        # The context manager closes the PDF even if parsing raises.
        with open(file_name, 'rb') as fp:
            document = PDFDocument(PDFParser(fp))
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed
            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for i, page in enumerate(PDFPage.create_pages(document)):
                if start_page <= i <= end_page:
                    interpreter.process_page(page)
                    layout = device.get_result()
                    self.parse_obj(layout._objs, i)
                # Highest y first, then left to right.
                ListOfStrings.sort(key=lambda x: (-x[0][1], x[0][0]))

                required_section = False
                for idx, a in enumerate(ListOfStrings):
                    x, y = a[0][0], a[0][1]
                    text = a[1]
                    if x == 17 and y == 482:
                        origen = text
                    if x == 17 and '----' in text:
                        # NOTE(review): the original pair of `if`s first
                        # cleared and then immediately re-set the flag, so a
                        # dashed line always left it True.  A toggle
                        # (if/elif) was probably intended -- confirm before
                        # changing the captured rows.
                        required_section = True
                    if 70 < x < 72 and '----' not in text and 'Container' not in text:
                        own_fields = text.split()
                        next_fields = ListOfStrings[idx + 1][1].split()
                        # The original compared this six-element row with a
                        # five-element list, which is always unequal, so the
                        # row is appended unconditionally.
                        first_table.append([
                            own_fields[0],
                            own_fields[1],
                            next_fields[0],
                            next_fields[1],
                            ListOfStrings[idx + 2][1],
                            origen,
                        ])
                    if x == 17 and '----' not in text and 'Freight' not in text and required_section:
                        temp_table_B = [
                            text,
                            ListOfStrings[idx + 1][1],
                            ListOfStrings[idx + 2][1].split('.')[0],
                            "",
                            ListOfStrings[idx + 3][1].split('.')[0],
                        ]
                        if temp_table_B != [''] * 5:
                            second_table.append(temp_table_B)
                ListOfStrings.clear()

        file_stem = file_name.split('/')[-1].split('.')[0]
        if first_table != []:
            self.create_csv(self.add_quotes_to_list(first_table), save_folder,
                            file_stem + '_A.csv',
                            'sep=,\ncontainer,seall number,tare,type,packages,ORIGEN\n')
        if second_table != []:
            self.create_csv(self.add_quotes_to_list(second_table), save_folder,
                            file_stem + '_B.csv',
                            'sep=,\nFreight/Charge ,Basis,Rated as,Prepaid,Collect\n')
Exemple #10
0
    def layout_pdf(self):
        """Segment every page of ``self.fp`` by separator lines or headers.

        ``self.headersDict`` is rebuilt first: four successive items are
        taken from the ``self.headers`` iterator and their entries are
        mapped to 'career', 'education', 'skill' and 'interest', in that
        order.  Each page is then segmented by its line separators when it
        has more than three of them, falling back to header segmentation
        when that result is not valid or when there are too few lines.

        Raises:
            PDFTextExtractionNotAllowed: If the document forbids extraction.
        """
        # Build the mapping locally so the attribute is only replaced once
        # all four header groups have been read.
        header_map = {}
        for section in ('career', 'education', 'skill', 'interest'):
            header_map.update(dict.fromkeys(next(self.headers), section))
        self.headersDict = header_map

        document = PDFDocument(PDFParser(self.fp))
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFDevice(rsrcmgr)
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            separators = [item for item in layout if isinstance(item, LTLine)]
            segmented_by_lines = len(separators) > 3
            if segmented_by_lines:
                self.line_segmentation(layout)
            # Header segmentation is the fallback whenever line segmentation
            # was skipped or did not produce a valid result.
            if not (segmented_by_lines and self.is_valid()):
                self.header_segmentation(layout)
def parse_page(document, images_folder):
	# Check if the document allows text extraction. If not, abort.
	if not document.is_extractable:
		raise PDFTextExtractionNotAllowed
	print dir(document)
	# Create a PDF resource manager object that stores shared resources
	rsrcmgr = PDFResourceManager()
	
	# Create a PDF device object.
	device = PDFDevice(rsrcmgr)

	# BEGIN LAYOUT ANALYSIS
	# Set parameters for analysis.
	laparams = LAParams()

	# Create a PDF page aggregator object.
	device = PDFPageAggregator(rsrcmgr, laparams=laparams)

	# Create a PDF interpreter object.
	interpreter = PDFPageInterpreter(rsrcmgr, device)

	text_content = list()
	page_count = 0
	for i, page in enumerate(PDFPage.create_pages(document)):
		# read the page into a layout object
		interpreter.process_page(page)
		layout = device.get_result()
		# extract text from this object
		if page_count == 2:
			break
		return parse_lt_objs(layout._objs, (i+1), images_folder)
		#text_content.append(parse_lt_objs(layout._objs, (i+1), images_folder))
		page_count += 1
Exemple #12
0
    def textExtract(self, pdfFile, excelFile, lstPageNum):
        """Write the text boxes of selected PDF pages to an Excel workbook.

        One worksheet ("Page 1", "Page 2", ... in processing order) is
        created per requested page.  For every horizontal text box the
        integer bounding box and the text (newlines removed) are written
        into one row; the row counter also advances for other layout
        objects, so rows may be left blank.

        Args:
            pdfFile: Path of the PDF to read.
            excelFile: Path of the workbook to create.
            lstPageNum: Non-empty list of 1-based page numbers.

        Returns:
            ``[success, msg]`` where ``success`` is a bool and ``msg`` a
            human-readable status string.

        Raises:
            IndexError: If ``lstPageNum`` is empty.
        """
        success = True

        lstSortedPageNum = sorted(lstPageNum)
        lastPage = lstSortedPageNum[-1]

        # pdfminer expects 0-based page numbers.
        lstSortedPageNum = [x - 1 for x in lstSortedPageNum]

        # Pre-bind so that `finally` does not fail with an unbound name (and
        # hide the real error) when the workbook cannot be created.
        workbook = None
        try:
            workbook = xlsxwriter.Workbook(excelFile)
            with open(pdfFile, "rb") as pdf_file:
                pdf_reader = PdfFileReader(pdf_file)
                totalPDFPages = pdf_reader.numPages

                if lastPage > int(totalPDFPages):
                    success = False
                    msg = "Entered page number doesnot exist"
                    return [success, msg]

                rsrcmgr = PDFResourceManager()
                device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
                interpreter = PDFPageInterpreter(rsrcmgr, device)

                pageNum = 0
                for pagecontent in PDFPage.get_pages(pdf_file,
                                                     lstSortedPageNum):
                    pageNum += 1
                    interpreter.process_page(pagecontent)
                    worksheet = workbook.add_worksheet("Page " + str(pageNum))
                    layout = device.get_result()
                    excelrowNum = 0
                    for obj in layout:
                        excelrowNum += 1
                        if isinstance(obj, LTTextBoxHorizontal):
                            df_extracted_Text = [
                                int(obj.bbox[0]),
                                int(obj.bbox[1]),
                                int(obj.bbox[2]),
                                int(obj.bbox[3]),
                                obj.get_text().replace('\n', '')
                            ]
                            for col_num, data in enumerate(df_extracted_Text):
                                worksheet.write(excelrowNum, col_num, data)
            msg = "Text Extraction successfull and saved to excel"
        except Exception:
            # Best-effort API: report the failure to the caller instead of
            # raising, but let KeyboardInterrupt / SystemExit through.
            msg = "Text Extraction! Throwing error"
            success = False
        finally:
            if workbook is not None:
                workbook.close()
        return [success, msg]
Exemple #13
0
 def __init__(self, rsrcmgr):
     """Initialise the device on top of ``PDFDevice`` with empty block state."""
     PDFDevice.__init__(self, rsrcmgr)
     # Finished blocks; each entry contains (font, font_size, string).
     self.blocks = []
     # Block currently being built: font, font size, glyph y, [chars].
     self.current_block = None
     self.last_state = None
Exemple #14
0
def process_pdf_path(fname, page_num='all'):
    """
    Extract the path, which might be part of the ME, such as fraction line

    :param fname:
    :param page_num:
    :return:
    """
    if page_num == 'all':
        raise Exception("Not support get all at once")

    def print_layout(l):
        """get all the path such as fraction line and line for radical
        """
        for e in l:
            if isinstance(e, LTTextLineHorizontal) or isinstance(e, LTTextBoxHorizontal):  # recursively get the path
                print_layout(e)
            elif isinstance(e, LTRect) or isinstance(e, LTLine):
                # LTLine related with table
                # TODO, store as the candidates for the fraction
                path_list.append(e)
            else:
                # LTCurve might be related to the figure and drawings
                if debug:
                    print e, type(e)

    path_list = []
    fp = open(fname, 'rb')
    parser = PDFParser(fp)
    document = PDFDocument(parser)
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    rsrcmgr = PDFResourceManager()
    device = PDFDevice(rsrcmgr)

    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for i, page in enumerate(PDFPage.create_pages(document)):
        process_mark = (page_num == 'all' or page_num == i)
        if process_mark:
            interpreter.process_page(page)
            layout = device.get_result()
            print_layout(layout)
        if page_num == i:
            break

    crop_bbox = get_pdf_page_bbox_abandon(fname, page_num)

    # adjust the element bbox based on the crop bbox
    for path in path_list:
        adjust_element_bbox(path, crop_bbox)

    return path_list
def parsiraj():
    """Parse the PDF at the module-level ``path`` page by page.

    For every page the text of all horizontal text boxes (including those
    nested in figures) is concatenated and appended to the module-level
    ``pages`` list.  The module-level ``tekst`` is used as the per-page
    accumulator and holds the last page's text afterwards.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids extraction.
    """
    global tekst

    def parse_obj(lt_objs):
        """Append the text of every text box in ``lt_objs`` to ``tekst``."""
        global tekst
        for obj in lt_objs:
            if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
                tekst += obj.get_text()
            # if it's a container, recurse
            elif isinstance(obj, pdfminer.layout.LTFigure):
                parse_obj(obj._objs)

    # The context manager closes the PDF even if parsing raises.
    with open(path, 'rb') as fp:
        document = PDFDocument(PDFParser(fp))

        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        tekst = ""

        for page in PDFPage.create_pages(document):
            # Start every page with an empty accumulator.
            tekst = ""
            interpreter.process_page(page)
            layout = device.get_result()

            parse_obj(layout._objs)
            pages.append(tekst)
Exemple #16
0
    def parse_pdf(self, pdf_file_name_with_path, text_dump_filename):
        """Parse a PDF once and load its contents into this object.

        Every page is run through pdfminer layout analysis and handed to
        ``self._parse_obj``.  Afterwards the sanity check and the data
        structure dump are run and, if requested, the text dump is
        written.  ``self._pdf_parsed`` is only set once all of that
        succeeded.

        Args:
            pdf_file_name_with_path: Path of the PDF file to parse.
            text_dump_filename: Passed to ``self._dump_text``; ``None``
                skips the text dump.

        Raises:
            Exception: If a PDF was already parsed by this object.
            PDFTextExtractionNotAllowed: If the document forbids extraction.
        """
        Logger.getLogger().info("Parsing file " + pdf_file_name_with_path)

        if self._pdf_parsed:
            raise Exception('Error! PDF already parsed and loaded.')

        # The context manager closes the file even if parsing raises.  The
        # post-processing stays inside the block so the file remains open
        # for every step that previously ran with it open.
        with open(pdf_file_name_with_path, 'rb') as fp:
            document = PDFDocument(PDFParser(fp))

            # Check if the document allows text extraction. If not, abort.
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed

            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            for page in PDFPage.create_pages(document):
                # read the page into a layout object
                interpreter.process_page(page)
                layout = device.get_result()

                # extract text from this object
                self._parse_obj(layout._objs)

            self._perform_sanity_check()

            self._dump_data_structures()

            if text_dump_filename is not None:
                self._dump_text(text_dump_filename)

        # If things came till here, successful parse
        self._pdf_parsed = True
Exemple #17
0
def _construct_thumbnail(filename, thumbnail_Width, thumbnail_Height,
                         destination_foldername):
    """Render a thumbnail for each of the first ``PAGE_COUNT`` pages of a PDF.

    For every processed page a ``(width, height, 3)`` integer matrix
    filled with ``BACKGROUND_COLOR`` is created, the page's top-level
    layout objects are drawn into it by the ``_*_processing`` helpers, and
    the result is passed to ``_construct_thumbnail_image``.

    Args:
        filename: Path of the PDF file.
        thumbnail_Width: Forwarded to ``_construct_thumbnail_image``.
        thumbnail_Height: Forwarded to ``_construct_thumbnail_image``.
        destination_foldername: Forwarded to ``_construct_thumbnail_image``.
    """
    # The unused directory / file name locals were dropped: they were read
    # from an undefined name (`l`) and made the function fail before any
    # page was opened.
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The context manager closes the PDF even if rendering raises.
    with open(filename, 'rb') as fp:
        for pageNomber, page in enumerate(PDFPage.get_pages(fp), start=1):
            if pageNomber > PAGE_COUNT:
                break
            # Interpret each page once; a second pass yields the same layout.
            interpreter.process_page(page)
            layout = device.get_result()
            # Page dimensions may be floats; array shapes must be integers.
            size = (int(layout.width), int(layout.height))
            Matrix = ones((size[0], size[1], 3), int)
            Matrix = multiply(Matrix, BACKGROUND_COLOR)

            for hbox in layout:
                # hbox can be: LTTextBox, LTFigure, LTLine, LTRect, LTImage
                if isinstance(hbox, pdfminer.layout.LTTextBoxHorizontal):
                    _text_processing(hbox, Matrix)

                if isinstance(hbox, pdfminer.layout.LTRect):
                    _rect_processing(hbox, Matrix)

                if isinstance(hbox, pdfminer.layout.LTLine):
                    _line_processing(hbox, Matrix)

                if isinstance(hbox, pdfminer.layout.LTImage):
                    _logo_processing(hbox, Matrix)

                if isinstance(hbox, pdfminer.layout.LTFigure):
                    _figure_processing(hbox, Matrix)

            _construct_thumbnail_image(size, Matrix, thumbnail_Width,
                                       thumbnail_Height, destination_foldername)
Exemple #18
0
def process_pdf_internal(fname, page_num='all'):
    """
    Change from orignal name of process_pdf to process_pdf_internal
    get the raw character

    :param fname:
    :param page_num:
    :return:
    """
    tmp_path = get_tmp_path(fname)
    cache_path = "%s.chars.%s.pkl"%(tmp_path, str(page_num))

    if os.path.isfile(cache_path):
        try:
            return pickle.load(open(cache_path))
        except Exception as e:
            print "load failed, get again"

    # global char_list
    char_list = []
    if debug:
        print fname
    # Open a PDF file.
    fp = open(fname, 'rb')
    parser = PDFParser(fp)
    document = PDFDocument(parser)
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    rsrcmgr = PDFResourceManager()
    device = PDFDevice(rsrcmgr)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for i, page in enumerate(PDFPage.create_pages(document)):
        process_mark = (page_num == 'all' or page_num == i)
        if process_mark:
            interpreter.process_page(page)
            layout = device.get_result()
            print_layout(layout, char_list)

        if page_num == i:
            break

    crop_bbox = get_pdf_page_bbox_abandon(fname, page_num)
    for char in char_list:
        if isinstance(char, LTChar):
            adjust_element_bbox(char, crop_bbox)

    with open(cache_path, 'w') as f:
        pickle.dump(char_list, f)
    return char_list
def _construct_thumbnail(filename, thumbnail_Width, thumbnail_Height, destination_foldername):
    """Render a thumbnail for each of the first ``PAGE_COUNT`` pages of a PDF.

    For every processed page a ``(width, height, 3)`` integer matrix
    filled with ``BACKGROUND_COLOR`` is created, the page's top-level
    layout objects are drawn into it by the ``_*_processing`` helpers, and
    the result is passed to ``_construct_thumbnail_image``.

    Args:
        filename: Path of the PDF file.
        thumbnail_Width: Forwarded to ``_construct_thumbnail_image``.
        thumbnail_Height: Forwarded to ``_construct_thumbnail_image``.
        destination_foldername: Forwarded to ``_construct_thumbnail_image``.
    """
    # The unused directory / file name locals were dropped: they were read
    # from an undefined name (`l`) and made the function fail before any
    # page was opened.
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The context manager closes the PDF even if rendering raises.
    with open(filename, 'rb') as fp:
        for pageNomber, page in enumerate(PDFPage.get_pages(fp), start=1):
            if pageNomber > PAGE_COUNT:
                break
            # Interpret each page once; a second pass yields the same layout.
            interpreter.process_page(page)
            layout = device.get_result()
            # Page dimensions may be floats; array shapes must be integers.
            size = (int(layout.width), int(layout.height))
            Matrix = ones((size[0], size[1], 3), int)
            Matrix = multiply(Matrix, BACKGROUND_COLOR)

            for hbox in layout:
                # hbox can be: LTTextBox, LTFigure, LTLine, LTRect, LTImage
                if isinstance(hbox, pdfminer.layout.LTTextBoxHorizontal):
                    _text_processing(hbox, Matrix)

                if isinstance(hbox, pdfminer.layout.LTRect):
                    _rect_processing(hbox, Matrix)

                if isinstance(hbox, pdfminer.layout.LTLine):
                    _line_processing(hbox, Matrix)

                if isinstance(hbox, pdfminer.layout.LTImage):
                    _logo_processing(hbox, Matrix)

                if isinstance(hbox, pdfminer.layout.LTFigure):
                    _figure_processing(hbox, Matrix)

            _construct_thumbnail_image(size, Matrix, thumbnail_Width, thumbnail_Height, destination_foldername)
        def parse_document(pdfname):
            """Feed every horizontal text line of a PDF to ``important``.

            Each text line found on any page (directly, inside a text box
            or inside a figure) is passed to ``important`` with its
            newlines replaced by ``' _'``.

            Args:
                pdfname: Path of the PDF file.

            Raises:
                PDFTextExtractionNotAllowed: If the document forbids
                    extraction.
            """
            def parse_obj(lt_objs):
                """Walk ``lt_objs`` recursively and report its text lines."""
                for obj in lt_objs:
                    if isinstance(obj, pdfminer.layout.LTTextLineHorizontal):
                        important(obj.get_text().replace('\n', ' _'))
                    # if it's a container, recurse
                    elif isinstance(obj, (pdfminer.layout.LTFigure,
                                          pdfminer.layout.LTTextBox)):
                        parse_obj(obj._objs)

            # The context manager closes the PDF even if parsing raises.
            with open(pdfname, 'rb') as fp:
                document = PDFDocument(PDFParser(fp))
                # Check if the document allows text extraction. If not, abort.
                if not document.is_extractable:
                    raise PDFTextExtractionNotAllowed

                rsrcmgr = PDFResourceManager()
                device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
                interpreter = PDFPageInterpreter(rsrcmgr, device)

                # loop over all pages in the document
                for page in PDFPage.create_pages(document):
                    interpreter.process_page(page)
                    layout = device.get_result()
                    parse_obj(layout._objs)
Exemple #21
0
    def parsepdf(self, filename, startpage, endpage):
        """Collect positioned text from a page range of a PDF.

        Pages whose 0-based index lies in ``[startpage, endpage]`` are run
        through layout analysis and handed to ``self.parse_obj``, which is
        expected to append rows to the shared position list.

        Args:
            filename: Path of the PDF file.
            startpage: 0-based index of the first page to parse.
            endpage: 0-based index of the last page to parse (inclusive).

        Returns:
            A ``pandas.DataFrame`` with the columns ``pos_x``, ``pos_y``,
            ``page`` and ``text``; it is empty when no rows were found.

        Raises:
            PDFTextExtractionNotAllowed: If the document forbids extraction.
        """
        position_list = []

        # The context manager closes the PDF even if parsing raises.
        with open(filename, 'rb') as fp:
            document = PDFDocument(PDFParser(fp))

            # Check if the document allows text extraction. If not, abort.
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed

            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            # enumerate() advances the index for every page; the previous
            # counter only moved inside the range check, so any
            # startpage > 0 never matched a page.
            for i, page in enumerate(PDFPage.create_pages(document)):
                if i > endpage:
                    break
                if i < startpage:
                    continue
                # read the page into a layout object
                interpreter.process_page(page)
                layout = device.get_result()

                # extract text from this object
                self.parse_obj(layout._objs, position_list, i)

        columns = ["pos_x", "pos_y", "page", "text"]
        if not position_list:
            # Renaming the columns of a frame without columns would raise.
            return pd.DataFrame(columns=columns)

        position_list = pd.DataFrame(position_list)
        position_list.columns = columns

        return (position_list)
Exemple #22
0
def extract_block_text(filename, pages=None):
    """Run layout analysis on a PDF and collect text via ``parse_obj``.

    Args:
        filename: Path of the PDF file to read.
        pages: Optional iterable of 1-based page numbers. Pages are handled
            in the order given (duplicates are handled again); numbers that
            do not exist in the document are ignored. When empty or ``None``
            every page is processed in document order.

    Returns:
        The ``data`` list that ``parse_obj`` was given for every processed
        page.

    Raises:
        FileNotFoundError: If ``filename`` is not an existing file.
        PDFTextExtractionNotAllowed: If the document forbids extraction.
    """
    if not os.path.isfile(filename):
        raise FileNotFoundError(filename)

    data = []
    with open(filename, "rb") as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        if not pages:
            selected = PDFPage.create_pages(document)
        else:
            # Walk the page tree once and look pages up by number instead of
            # re-enumerating the whole document for every requested page.
            by_number = dict(
                enumerate(PDFPage.create_pages(document), start=1))
            selected = [by_number[n] for n in pages if n in by_number]

        for page in selected:
            interpreter.process_page(page)
            layout = device.get_result()
            parse_obj(layout._objs, data)
    return data
Exemple #23
0
def GetScript(filename):
    """Read a PDF script and feed its text blocks to ``ParseLine``.

    Resets module state through ``ResetGlobals()``, records ``filename`` in
    the global ``scriptName``, then lays out every page except the first and
    passes each non-empty text block to ``ParseLine(text, x)``.

    Prints "---Not translatable---" and returns ``None`` early when the
    document does not allow text extraction. Python 2 code (print statements).

    NOTE(review): the opened file is never closed, and all pages are
    interpreted once with a plain ``PDFDevice`` before the real layout pass;
    that first pass produces nothing this function uses.
    """
    global scriptName
    ResetGlobals()
    scriptName = filename
    # Empty password: the document is opened as if unprotected.
    password = ""
    # Open a PDF file.
    fp = open(filename, 'rb')
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    # Supply the password for initialization.
    document = PDFDocument(parser, password)
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        print "---Not translatable---"
        return
        #raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Create a PDF device object.
    device = PDFDevice(rsrcmgr)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
    
    # Set parameters for analysis.
    laparams = LAParams()
    laparams.boxes_flow = 2
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for pgnum,page in enumerate(PDFPage.create_pages(document)):
        # The first page (index 0) is deliberately skipped.
        if pgnum == 0:
            continue
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()
        text = []
        # NOTE: the loop variable reuses the name ``page``; inside this loop
        # it is a layout item of the page, not the PDFPage itself.
        for page in layout:
            try:
                if page.get_text().strip():
                    text.append(TextBlock(page.x0,page.y1,page.get_text().strip()))
            except:
                # Best effort: any failure on an item is ignored (presumably
                # items without get_text; TODO confirm). The assignment is a
                # placeholder with no effect.
                temp=5  
            # Progress indicator: one dot per layout item.
            print ".",
        # Sort by descending ``y`` of the collected blocks.
        text.sort(key = lambda row:(-row.y))
        # Parse all of the "line" objects in each page
        for line in text:
            ParseLine(line.text, line.x)
def calculate_locations(filename,keywords):
    """Find where each keyword occurs on every page of a PDF.

    For each page, ``get_location(keyword, layout, x, y)`` is called for
    every keyword, where ``x`` and ``y`` are the first two values of the
    page's trim box as read through ``PdfFileReader``. Each location it
    returns becomes a ``LocationKeeper(keyword, location, pagenum)``.

    :param filename: path of the PDF file
    :param keywords: iterable of keywords to look up
    :return: list of ``LocationKeeper`` objects; ``pagenum`` is zero-based
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction

    Python 2 code (print statements, ``file()`` builtin).

    NOTE(review): neither opened file is closed, and every page is
    interpreted once with a plain ``PDFDevice`` before the layout pass;
    that first pass produces nothing this function uses.
    """
    locations = []
    fp = open(filename, 'rb')
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    # Supply the password for initialization.
    document = PDFDocument(parser)
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Create a PDF device object.
    device = PDFDevice(rsrcmgr)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)    
    #Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = PDFPage.create_pages(document)
    pagenum = 0
    # Second reader over the same file, used only for the trim box.
    reader = PdfFileReader(file(filename,"rb"))
    for page in pages:
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()    
        # From here on ``page`` is the PdfFileReader page, not the PDFPage.
        page = reader.getPage(pagenum)
        x = page.trimBox[0].as_numeric()
        y = page.trimBox[1].as_numeric()
        #Handling special case
        if  (x > 0 and y < 0):
                x = 0
#         print "At page = %s  X  = %s , y = %s"%(pagenum,x,y)
        for keyword in keywords:    
            print '********************************'
            co_ordinates = get_location(keyword,layout,x,y)
            print'Keyword %s , location %s'%(keyword,co_ordinates)
            print '********************************'
            # get_location may return None when it has nothing to report.
            if co_ordinates != None :
                for location in co_ordinates:
                    print "PageNum-->%s"%pagenum
                    l = LocationKeeper(keyword,location,pagenum)
                    locations.append(l)
        pagenum+=1
    return locations
    def parsepdf(self, full_filename, dirname, filename, startpage, endpage):
        """Lay out every page of a PDF and hand its sorted items to ``extract_tree``.

        For each page, ``self.parse_obj`` is run on the layout objects; the
        module-level list ``li`` (presumably filled by ``parse_obj`` -- TODO
        confirm) is then sorted by the third field of its entries and, when
        non-empty, passed to ``extract_tree`` together with the zero-based
        page index. ``li`` is emptied after every page.

        Args:
            full_filename: Path of the PDF file to open.
            dirname: Forwarded unchanged to ``extract_tree``.
            filename: Forwarded unchanged to ``extract_tree``.
            startpage: Currently unused (the page-range filter is disabled).
            endpage: Currently unused (the page-range filter is disabled).

        Raises:
            PDFTextExtractionNotAllowed: If the document forbids extraction.
        """
        # The context manager closes the file even if parsing fails part-way.
        with open(full_filename, 'rb') as fp:
            parser = PDFParser(fp)
            document = PDFDocument(parser)
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed

            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            for i, page in enumerate(PDFPage.create_pages(document)):
                interpreter.process_page(page)
                layout = device.get_result()
                self.parse_obj(layout._objs)

                li.sort(key=lambda x: x[2])
                if li:
                    extract_tree(full_filename, dirname, filename, li, i)
                # Clear in place so other holders of ``li`` see it emptied.
                del li[:]
Exemple #26
0
    def parsepdf(self):
        """Lay out every page of ``self.filename`` and return ``self.word_array``.

        Each page's layout objects are passed to ``self.parse_page`` together
        with the page number, counted from 1.

        Returns:
            ``self.word_array`` as it stands after all pages were processed.

        Raises:
            PDFTextExtractionNotAllowed: If the document forbids extraction
                (a message is printed first).
        """
        # The context manager closes the file even if parsing fails part-way.
        with open(self.filename, 'rb') as fp:
            parser = PDFParser(fp)
            document = PDFDocument(parser)
            if not document.is_extractable:
                print('extraction not allowed')
                raise PDFTextExtractionNotAllowed

            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            # Page numbers handed to parse_page start at 1.
            for i, page in enumerate(PDFPage.create_pages(document), start=1):
                interpreter.process_page(page)
                layout = device.get_result()
                self.parse_page(layout._objs, i)
        return self.word_array
    

#test = PDFpos("FinancialAccounting1.pdf")
#test.parsepdf()
Exemple #27
0
    def readPdf(self, filePath):
        """Extract the text of a PDF into ``self.result`` and a sibling .txt file.

        Every layout item that exposes ``get_text`` contributes its text
        followed by a newline. The collected text is then written to a file
        with the same base name as ``filePath`` and a ``.txt`` extension,
        unless that file already exists, in which case nothing is written.

        Args:
            filePath: Path of the PDF file to read.

        Raises:
            PDFTextExtractionNotAllowed: If the document forbids extraction.
        """
        self.result = ''

        chunks = []
        # Read the PDF in binary mode; the file is closed once parsing ends.
        with open(filePath, 'rb') as pdf_file:
            parser = PDFParser(pdf_file)
            doc = PDFDocument(parser)
            # Check whether the document permits text extraction.
            if not doc.is_extractable:
                raise PDFTextExtractionNotAllowed

            resource_manager = PDFResourceManager()
            device = PDFPageAggregator(resource_manager, laparams=LAParams())
            interpreter = PDFPageInterpreter(resource_manager, device)

            # Handle one page at a time.
            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
                for row in device.get_result():
                    if hasattr(row, "get_text"):
                        chunks.append(str(row.get_text()) + '\n')

        # Join once instead of growing the string item by item.
        self.result = ''.join(chunks)

        txt_path = os.path.splitext(filePath)[0] + '.txt'
        if os.path.exists(txt_path):
            # Never overwrite or extend an existing text file.
            return

        with open(txt_path, 'a') as f:
            f.write(self.result)
Exemple #28
0
def get_pdf_page_bbox_abandon(fname, pid=0):
    """
    Return the crop box of one page of a PDF file.

    The matching page is run through the interpreter before its ``cropbox``
    attribute is returned.

    NOTE(review): the earlier docstring described the result as
    ``tuple(left, xx, right, xx)`` with only the last two values being
    meaningful -- confirm against callers.

    :param fname: path of the PDF file to open
    :param pid: zero-based index of the page to look up
    :return: ``page.cropbox`` of the matching page, or ``None`` when the
        document has no page with that index
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction
    """
    # The context manager closes the file on every exit path.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFDevice(rsrcmgr)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for i, page in enumerate(PDFPage.create_pages(document)):
            if i == pid:
                interpreter.process_page(page)
                return page.cropbox
    return None
Exemple #29
0
def initialize_pdf_miner(fh):
    """Build the pdfminer objects needed to lay out pages read from ``fh``.

    Uses the older pdfminer API in which the parser and document are linked
    explicitly and the document is initialized with a password.

    Args:
        fh: Open file object for the PDF, handed to ``PDFParser``.

    Returns:
        A ``(doc, interpreter, device)`` tuple: the initialized document, a
        page interpreter, and the ``PDFPageAggregator`` the interpreter
        renders into (configured with ``word_margin = 0.0``).

    Raises:
        ValueError: If the document does not allow text extraction.
    """
    parser = PDFParser(fh)
    doc = PDFDocument()
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.set_parser(parser)
    # Empty string: no password.
    doc.initialize("")
    if not doc.is_extractable:
        raise ValueError("PDFDocument is_extractable was False.")

    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    laparams.word_margin = 0.0
    # Only the aggregator device is ever used, so no plain PDFDevice is built.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return doc, interpreter, device
Exemple #30
0
def init(filename, verbose=True):
    '''Initiate analysis objs

    Opens ``filename`` and builds the pdfminer objects needed for layout
    analysis with default ``LAParams``.

    Args:
        filename: Path of the PDF file to open.
        verbose: Accepted for interface compatibility; not used here.

    Returns:
        A ``(document, interpreter, device)`` tuple, where ``device`` is the
        ``PDFPageAggregator`` the interpreter renders into.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids extraction.
    '''
    # The file is left open on purpose: the returned document is built on a
    # parser that wraps this handle, and it is handed back to the caller.
    fp = open(filename, 'rb')
    parser = PDFParser(fp)
    document = PDFDocument(parser)
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    rsrcmgr = PDFResourceManager()
    # Only the aggregator device is ever used, so no plain PDFDevice is built.
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    return document, interpreter, device
Exemple #31
0
    def readPdf(self):
        """Collect the horizontal text boxes of the PDF at ``fPath``/``fileName``.

        For every ``LTTextBoxHorizontal`` found, the text with newlines
        removed is appended to ``self.content``, and a row
        ``[stripped text, int(bbox[0]), int(bbox[1]), int(bbox[3]), page]``
        (``page`` counted from 1) is appended to ``self.coordcont``.
        Extraction is best effort: an object or page that raises is skipped.

        Returns:
            ``self.content``, the list of text strings.

        Raises:
            PDFTextExtractionNotAllowed: If the document forbids extraction.
        """
        file1 = os.path.join(self.fPath, self.fileName)
        lt = []
        lt1 = []

        def parse_obj(lt_objs, pageNo):
            for obj in lt_objs:
                try:
                    if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
                        text = obj.get_text().replace('\n', '')
                        lt.append(text)
                        lt1.append([
                            text.strip(),
                            int(obj.bbox[0]),
                            int(obj.bbox[1]),
                            int(obj.bbox[3]),
                            pageNo + 1,
                        ])
                except Exception:
                    # Skip the faulty object, but do not swallow
                    # KeyboardInterrupt/SystemExit as a bare except would.
                    continue

        # The context manager closes the file even if parsing fails part-way.
        with open(file1, 'rb') as fp:
            parser = PDFParser(fp)
            document = PDFDocument(parser)
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed
            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            for pageNumber, page in enumerate(PDFPage.get_pages(fp)):
                try:
                    interpreter.process_page(page)
                    layout = device.get_result()
                    parse_obj(layout._objs, pageNumber)
                except Exception:
                    # Skip pages that cannot be interpreted.
                    continue

        self.coordcont = lt1
        self.content = lt
        return self.content
def with_pdf(path_book, images_folder):
    """Lay out every page of a PDF and collect ``parse_lt_objs`` results.

    Args:
        path_book: Path of the PDF file to read.
        images_folder: Forwarded unchanged to ``parse_lt_objs``.

    Returns:
        A list with one entry per page: the value returned by
        ``parse_lt_objs(layout, page_number, images_folder)``, where
        ``page_number`` counts from 1.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids extraction.
    """
    # Local imports so the block does not depend on names the surrounding
    # file may not import.
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams

    text_content = []
    # The context manager closes the file even if parsing fails part-way.
    with open(path_book, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        # A plain PDFDevice has no get_result(); the layout of a page is only
        # available from a PDFPageAggregator.
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for i, page in enumerate(PDFPage.create_pages(document)):
            interpreter.process_page(page)
            # receive the LTPage object for this page
            layout = device.get_result()
            text_content.append(parse_lt_objs(layout, (i + 1), images_folder))
    return text_content
Exemple #33
0
def parse_pdf(pdf):
    """Return the page sizes and annotation fields of ``pdf``.

    A single page interpreter backed by a plain ``PDFDevice`` is shared by
    both helper calls.

    Returns:
        A dict with the result of ``parse_all_page_sizes`` under ``'pages'``
        and the result of ``parse_all_annotations`` under ``'fields'``.
    """
    resource_manager = PDFResourceManager()
    base_device = PDFDevice(resource_manager)
    page_interpreter = PDFPageInterpreter(resource_manager, base_device)

    # Page sizes are gathered before annotations, as in the dict order below.
    page_sizes = parse_all_page_sizes(pdf, page_interpreter)
    annotation_fields = parse_all_annotations(pdf, page_interpreter)
    return {'pages': page_sizes, 'fields': annotation_fields}
Exemple #34
0
def pdf_to_txt(path):
    """Append the text of every horizontal text box in a PDF to a .txt file.

    The output path is ``path`` with its last four characters replaced by
    ``.txt``. Text is written as UTF-8, one newline after each box, in
    append mode. No file is created when the PDF has no horizontal text box.

    Args:
        path: Path of the PDF file to read.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids extraction.
    """
    # Local imports so the block does not depend on names the surrounding
    # file may not import.
    import io
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams
    from pdfminer.pdfdocument import PDFTextExtractionNotAllowed

    chunks = []
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        # is_extractable is an attribute, not a method.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        # The page layout is only available from a PDFPageAggregator; a
        # plain PDFDevice has no get_result().
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Handle each page.
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Receive the LTPage object for this page.
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    chunks.append(x.get_text())
                    chunks.append(u'\n')

    if chunks:
        # Open the output once and let the file object do the encoding
        # instead of mixing encoded bytes with str.
        with io.open(path[:-4] + '.txt', 'a', encoding='utf-8') as f:
            f.write(u''.join(chunks))
Exemple #35
0
def parsePDF(f):
    """Yield the layout of each page of the PDF read from ``f``.

    Uses the older pdfminer API in which the parser and document are linked
    explicitly and the document is initialized with a password.

    Args:
        f: Open file object for the PDF, handed to ``PDFParser``.

    Yields:
        The result of ``device.get_result()`` for every page, in the order
        ``doc.get_pages()`` produces them.
    """
    from pdfminer.layout import LAParams
    from pdfminer.converter import PDFPageAggregator

    parser = PDFParser(f)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    # Empty string: no password.
    doc.initialize('')

    rsrcmgr = PDFResourceManager()
    # Only the aggregator device is ever used, so no plain PDFDevice is built.
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for pobj in doc.get_pages():
        interpreter.process_page(pobj)
        yield device.get_result()
Exemple #36
0
def main():
    """Lay out the first page(s) of 'Divani_Kebir-1.pdf' and pass them to ``get_text``.

    Pages are processed in order until a layout whose ``pageid`` is 2 or
    more is reached; that layout is not passed on. Prints 'not extraction'
    and returns when the document does not allow text extraction.

    Python 2 code (print statement), written against the older pdfminer API
    in which parser and document are linked explicitly.

    NOTE(review): the opened file is never closed, and the first
    ``PDFDevice``/``PDFPageInterpreter`` pair is replaced before any use.
    """

    # Open a PDF file.
    fp = open('Divani_Kebir-1.pdf', 'rb')
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument()
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the password for initialization.
    # (If no password is set, give an empty string.)
    doc.initialize()
    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        print 'not extraction'
        return

    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Create a PDF device object.
    device = PDFDevice(rsrcmgr)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.

    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in doc.get_pages():
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()
        # Stop at the first layout whose page id reaches 2.
        if layout.pageid >= 2:
            break
        get_text(layout)
Exemple #37
0
def pdf_to_text(pdf):
    """
    Takes pdfminer PDFDocument and converts to plaintext.

    Every ``LTTextBox`` or ``LTTextLine`` found directly in a page layout
    contributes its text, in page order.

    Returns a string.
    """
    # create PDFMiner objects for data extraction; only the aggregator device
    # is ever used, so no plain PDFDevice is built.
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # iterate over all pages, select textbox objects and extract plaintext
    parts = []
    for page in pdf.get_pages():
        interpreter.process_page(page)
        layout = device.get_result()
        parts.extend(
            element.get_text()
            for element in layout
            if isinstance(element, (LTTextBox, LTTextLine))
        )
    # Join once instead of growing the string element by element.
    return "".join(parts)
Exemple #38
0
def getPdfPages(path):
    """Return the layout of every page of the PDF at ``path``.

    Args:
        path: Path of the PDF file to read.

    Returns:
        A list with one dict per page, each holding the page layout under
        the key ``'layout'``.
    """
    # PDF is a binary format: the parser needs bytes, so open with 'rb'
    # rather than in text mode.
    with open(path, 'rb') as fd:
        parser = PDFParser(fd)

        document = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        # Only the aggregator device is ever used, so no plain PDFDevice is
        # built.
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        pages = []
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            pages.append({
                'layout' : device.get_result()
            })
        return pages
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
import pickle

# Script setup: open 'full.pdf', build the pdfminer layout pipeline and
# materialize the list of pages. Python 2 code (print statement).
# NOTE(review): the file stays open for the rest of the script, and the first
# PDFDevice/PDFPageInterpreter pair below is replaced before any use.

# Open a PDF file.
fp = open('full.pdf', 'rb')
# Create a PDF parser object associated with the file object.
parser = PDFParser(fp)
# Create a PDF document object that stores the document structure.
document = PDFDocument(parser)
# Create a PDF resource manager object that stores shared resources.
rsrcmgr = PDFResourceManager()
# Create a PDF device object.
device = PDFDevice(rsrcmgr)
# Create a PDF interpreter object.
interpreter = PDFPageInterpreter(rsrcmgr, device)

# Set parameters for analysis.
laparams = LAParams()
# Create a PDF page aggregator object.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
# List of (zero-based index, page) pairs; built eagerly so its length is known.
pages = list(enumerate(PDFPage.create_pages(document)))
pages_length = len(pages)
print 'Created page list of ' + str(pages_length) + ' pages'

def sort_text(text):
    """Return a sort key built from ``text.y1`` and ``text.x0``.

    The distance of ``text.y1`` from the module-level ``height`` is scaled
    by 1,000,000 and ``text.x0`` is added to it.
    """
    distance_from_height = height - text.y1
    scaled_distance = distance_from_height * 1000000
    return scaled_distance + text.x0