Example #1
0
def extract_citations(url):
    '''
    Download a PDF and write the text that follows its "References" heading
    to ``log_file``.

    Arguments:
        url (string): url of a pdf of a research paper

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.

    ``log_file`` is a module-level name that is not defined in this function.
    '''
    extracted_chunks = []
    start_writing = False  # Don't want to start writing until we hit References

    # The PDF is read entirely from ``url``; no local file handle is opened.
    pdf_bytes = urllib2.urlopen(urllib2.Request(url)).read()
    fp = StringIO(pdf_bytes)
    try:
        parser = PDFParser(fp)
        document = PDFDocument(parser, password="")
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if not isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    continue
                text = lt_obj.get_text()
                if start_writing:
                    extracted_chunks.append(text)
                # The box that contains the heading itself is not recorded;
                # recording starts with the next text object.
                if "References\n" in text:
                    start_writing = True
    finally:
        # Release the in-memory buffer even when parsing fails.
        fp.close()

    with open(log_file, "w") as my_log:
        my_log.write("".join(extracted_chunks).encode("utf-8"))
    print("Done !!")
Example #2
0
    def character_extraction(self, address):
        """Feed every text object of the PDF at ``address`` to ``self.fetch_chars``.

        Pages are processed in document order; ``self.page_num`` is
        incremented once after each page has been handled. The file is closed
        when the method exits, whether or not an error occurred.
        """
        with open(address, 'rb') as pdf_file:
            # Parser -> document (opened with an empty password).
            document = PDFDocument(PDFParser(pdf_file), '')

            # The resource manager is shared by the aggregating device and
            # the interpreter that drives it.
            resource_manager = PDFResourceManager()
            aggregator = PDFPageAggregator(resource_manager, laparams=LAParams())
            page_interpreter = PDFPageInterpreter(resource_manager, aggregator)

            for page in PDFPage.create_pages(document):
                page_interpreter.process_page(page)
                # Only text boxes and text lines of the page layout are of interest.
                for element in aggregator.get_result():
                    if not isinstance(element, (LTTextBox, LTTextLine)):
                        continue
                    self.fetch_chars(element)
                self.page_num += 1
def convert(input_file):
    """Count technology keywords in the PDF at ``input_file``.

    Each page is laid out, the objects returned by ``get_objects`` are sorted
    by descending ``y0`` and appended to one flat list, and the text produced
    by ``extract_text`` is searched for every entry of
    ``technology_jargon.keywords``.

    Returns:
        A JSON string mapping each keyword with a positive count to its count.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    objs = []
    # The context manager guarantees the PDF is closed, including on errors.
    with open(input_file, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            # Extend in place instead of collecting per-page lists and
            # flattening them afterwards.
            objs.extend(
                sorted(get_objects(layout), key=lambda x: x.y0, reverse=True))

        resume_as_text = extract_text(objs)

    counter = Counter()
    for word in technology_jargon.keywords:
        count = count_of_technology_words(word, resume_as_text)
        if count > 0:
            counter[word] += count

    return json.dumps(counter)
Example #4
0
    def __init__(self, filename=''):
        """Parse the PDF at ``filename`` and populate the classification results.

        Every page layout is passed to ``self.process_page`` followed by
        ``self.process_classifications``; ``self.remove_debug`` and
        ``self.seperate`` run once after the last page. The outcome is exposed
        as ``self.results`` and ``self.problems``.

        Raises:
            PDFTextExtractionNotAllowed: if the document forbids text extraction.
        """
        # The context manager closes the PDF even if a page fails to parse.
        with open(filename, 'rb') as fp:
            # Create a PDF parser object associated with the file object.
            parser = PDFParser(fp)
            laparams = LAParams()

            # Create a PDF document object; an empty password is supplied.
            document = PDFDocument(parser, '')
            # Check if the document allows text extraction. If not, abort.
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed
            # The resource manager is shared by the device and the interpreter.
            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            self.all_classifications = {}
            self.problematicClassmarks = []

            # Process each page contained in the document.
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                layout = device.get_result()
                self.process_page(layout)

                self.process_classifications()

            self.remove_debug()

            self.seperate()

        self.results = self.all_classifications
        self.problems = self.problematicClassmarks
Example #5
0
def pdf_to_txt(url):
    """Download the PDF at ``url`` and extract its text.

    Returns:
        A ``(return_code, extracted_text)`` tuple. ``return_code`` is 0 when
        every page was converted and 1 when a page failed; in the failure
        case ``extracted_text`` holds whatever was collected before the error.
    """
    fp = BytesIO(urlopen(url).read())
    parser = PDFParser(fp)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize('')
    rsrcmgr = PDFResourceManager()
    # Default layout parameters are used; this function exposes no overrides.
    laparams = LAParams()

    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    chunks = []
    return_code = 0
    for page in doc.get_pages():
        try:
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    chunks.append(lt_obj.get_text())
        except Exception:
            # Best effort: report the failure and stop, but do not swallow
            # KeyboardInterrupt / SystemExit the way a bare ``except`` would.
            print("PDF converting error: {0}".format(url))
            return_code = 1
            break

    if return_code == 0:
        print("convert ok")

    return return_code, ''.join(chunks)
Example #6
0
def imgminer(pdf_path, OUT_DIR, save=True, file_ext='png'):
    """
    Collect the images embedded in a PDF via PDFMiner
    (https://euske.github.io/pdfminer/programming.html).

    :param str pdf_path: path of the PDF to read
    :param str OUT_DIR: directory that receives the image files when ``save`` is true
    :param bool save: write every converted image to ``OUT_DIR`` as well
    :param str file_ext: extension used for the written files
    :rtype: list
    :return: the image arrays produced by ``ltimage2imgarray``, in processing order
    """
    rsrc = PDFResourceManager()
    aggregator = PDFPageAggregator(rsrc)
    page_interpreter = PDFPageInterpreter(rsrc, aggregator)

    collected = []
    with open(pdf_path, 'rb') as pdf_file:
        # Handle one page at a time.
        for page_index, page in enumerate(PDFPage.get_pages(pdf_file)):
            page_interpreter.process_page(page)
            page_layout = aggregator.get_result()  # LTPage object

            found = find_images_recursively(page_layout)  # list of LTImage
            # Order by descending y1, then ascending x0.
            found.sort(key=lambda b: (-b.y1, b.x0))

            for lt_image in found:
                pixels = ltimage2imgarray(lt_image)
                if pixels is None:
                    # Images that could not be converted are skipped.
                    continue
                collected.append(pixels)
                if not save:
                    continue
                out_name = '{}-page{}-{}.{}'.format(
                    os.path.basename(pdf_path), page_index, lt_image.name,
                    file_ext)
                cv.imwrite(os.path.join(OUT_DIR, out_name), pixels)
    return collected
def get_total(filename):
    """Return the largest integer found at or after "Subtotal for all regions".

    Once a horizontal text box containing that phrase has been seen, the
    first line of that box and of every later box is stripped of spaces and
    parsed as an integer; the maximum of the values that parse is returned.

    Returns:
        The largest parsed integer, or -1 when none was found.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    temp_total = -1

    # The context manager closes the PDF on every exit path.
    with open(filename, 'rb') as pdf_file:
        parser = PDFParser(pdf_file)
        document = PDFDocument(parser)

        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        check_total = False

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            for x in layout:
                if not isinstance(x, LTTextBoxHorizontal):
                    continue
                results = str(x.get_text())
                if "Subtotal for all regions" in results:
                    check_total = True
                if not check_total:
                    continue

                first_line = re.search(r'(.*)\n', results, re.M | re.I)
                if first_line is None:
                    # No newline-terminated line in this box: nothing to parse.
                    continue
                temp_results = first_line.group(1).replace(" ", "").replace(
                    "\\n", "")
                try:
                    temp_num = int(temp_results)
                except ValueError:
                    # Non-numeric first lines are ignored.
                    continue
                if temp_num > temp_total:
                    temp_total = temp_num

    return temp_total
Example #8
0
    def parse_document(self):
        """Parse ``self.pdf`` page by page and collect layout objects and page boxes.

        Resets ``self.res``, ``self.media_boxes`` and ``self.n``, then for each
        page records its crop box and media box under the page index and
        passes the layout objects to ``self.get_objects``.

        Returns:
            ``(self.res, self.media_boxes)`` when the document is extractable,
            otherwise ``None``.
        """
        self.res = []  # result set
        self.media_boxes = dict()  # media coordinate dictionary
        self.n = 0  # page count

        # The context manager closes the PDF on every exit path.
        with open(self.pdf, "rb") as pdf:
            pdf_parser = PDFParser(pdf)
            pdf_document = PDFDocument(pdf_parser)
            if constants.USE_CUSTOM_PDF_PARAMETERS:
                la_params = LAParams(detect_vertical=constants.DEFAULT_DETECT_VERTICAL,
                                     line_overlap=constants.DEFAULT_LINE_OVERLAP,
                                     line_margin=constants.DEFAULT_LINE_MARGIN,
                                     word_margin=constants.DEFAULT_WORD_MARGIN,
                                     char_margin=constants.DEFAULT_CHAR_MARGIN,
                                     boxes_flow=constants.DEFAULT_BOXES_FLOW)
            else:
                la_params = LAParams(detect_vertical=True)

            if not pdf_document.is_extractable:
                # Non-extractable documents yield no result.
                return None

            resource_manager = PDFResourceManager()
            page_aggregator = PDFPageAggregator(resource_manager,
                                                laparams=la_params)
            page_interpreter = PDFPageInterpreter(resource_manager,
                                                  page_aggregator)

            for page in PDFPage.create_pages(pdf_document):
                page_interpreter.process_page(page)
                layout = page_aggregator.get_result()
                crop_box = page.cropbox
                page_box = page.mediabox
                self.media_boxes[self.n] = {"x0": crop_box[0], "y0": crop_box[1],
                                            "x1": crop_box[2], "y1": crop_box[3],
                                            "x0page": page_box[0], "y0page": page_box[1],
                                            "x1page": page_box[2], "y1page": page_box[3]}
                # The box id counter restarts on every page.
                self.box_id = -1
                self.res = self.get_objects(layout._objs, self.res, self.n, self.media_boxes)
                self.n += 1

            return self.res, self.media_boxes
Example #9
0
def parse():
    """Dump the text objects of the PDF at the module-level ``path`` to text files.

    Each layout object is appended (and printed) to the file paired with the
    first class it is an instance of: ``./test1.txt`` for ``LTTextBox``,
    ``./test2.txt`` for ``LTTextBoxHorizontal``, ``./test3.txt`` for
    ``LTTextLine``.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    # (layout class, output file) pairs, checked in order; the first match wins.
    # NOTE(review): in pdfminer LTTextBoxHorizontal appears to subclass
    # LTTextBox, which would make the second entry unreachable - confirm the
    # intended order before changing it.
    targets = (
        (LTTextBox, r'./test1.txt'),
        (LTTextBoxHorizontal, r'./test2.txt'),
        (LTTextLine, r'./test3.txt'),
    )

    # The PDF handle gets its own name so the output files below cannot
    # rebind it, and the context manager closes it on every exit path.
    with codecs.open(path, mode='rb', encoding='ISO-8859-1') as pdf_file:  # binary
        parser = PDFParser(pdf_file)
        doc = PDFDocument()
        # Connect the parser and the document object.
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()  # supply the password here if there is one
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        manager = PDFResourceManager()
        params = LAParams()
        # Page aggregator device and the interpreter that drives it.
        device = PDFPageAggregator(manager, laparams=params)
        interpreter = PDFPageInterpreter(manager, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for x in layout:
                for layout_class, out_path in targets:
                    if not isinstance(x, layout_class):
                        continue
                    with codecs.open(out_path, mode='a', encoding='ISO-8859-1') as out:
                        result = x.get_text()
                        print(result)
                        out.write(result + '\n')
                    break
Example #10
0
def parsePdf(fp):
    """Return the horizontal-text-box text of every page after the tenth.

    ``fp`` is handed to ``PDFParser``. One string is produced per page (the
    concatenated text of its ``LTTextBoxHorizontal`` objects); only pages
    whose 1-based number exceeds 10 are kept. Returns ``None`` when the
    document is not extractable.
    """
    print("parsePdf begin.")
    pdf_parser = PDFParser(fp)
    document = PDFDocument()
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)

    document.initialize()

    if not document.is_extractable:
        print("document is not extractable")
        return

    resource_manager = PDFResourceManager()
    aggregator = PDFPageAggregator(resource_manager, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resource_manager, aggregator)

    kept_pages = []
    for page_number, page in enumerate(document.get_pages(), start=1):
        page_interpreter.process_page(page)
        page_text = "".join(
            element.get_text()
            for element in aggregator.get_result()
            if isinstance(element, LTTextBoxHorizontal))
        # The first ten pages are processed but not kept.
        if page_number > 10:
            kept_pages.append(page_text)

    return kept_pages
Example #11
0
def pdfminer(f):
    """Render the first page of the PDF at path ``f`` through an ``HTMLConverter``.

    The converter is created from the file's base name, receives the first
    page and its layout (if the document has any page), then
    ``add_features()`` is called and the converter is returned.

    The file handle opened here is not closed by this function.
    NOTE(review): the returned converter keeps a reference to the page -
    confirm whether it still needs the open file before adding a close.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    pdf_file = open(f, 'rb')
    document = PDFDocument(PDFParser(pdf_file))
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    resource_manager = PDFResourceManager()
    # all_texts=True is passed to the layout analysis parameters.
    aggregator = PDFPageAggregator(resource_manager,
                                   laparams=LAParams(all_texts=True))
    page_interpreter = PDFPageInterpreter(resource_manager, aggregator)

    html = HTMLConverter(os.path.basename(f))

    # Only the first page is rendered; later pages are never requested.
    first_page = next(iter(PDFPage.create_pages(document)), None)
    if first_page is not None:
        page_interpreter.process_page(first_page)
        page_layout = aggregator.get_result()
        html.current_page = first_page
        html.render(page_layout)

    html.add_features()
    return html
Example #12
0
    def isSearchablePDF(self):
        """Report whether any entity can be extracted from the PDF's text.

        The text of every text box / text line is passed to
        ``self.extractentities``. The result is ``False`` only when all pages
        were processed and no entity was found.

        Returns:
            bool: ``False`` if parsing completed without finding an entity,
            ``True`` otherwise - including when page parsing raised.
            NOTE(review): a file that fails to parse is reported as
            searchable; confirm this is intended.
        """
        searchable = True
        entityList_ = []
        print("PDF File")
        # The context manager closes the PDF on every exit path.
        with open(self.filename, 'rb') as fp:
            print("PDF Filejhhfyf")

            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            laparams.char_margin = 1.0
            laparams.word_margin = 1.0
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            try:
                for page in doc.get_pages():
                    interpreter.process_page(page)
                    layout = device.get_result()
                    for lt_obj in layout:
                        if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                            entityList_.extend(
                                self.extractentities(lt_obj.get_text()))
                if not entityList_:
                    searchable = False

            except Exception:
                # Best effort: a page-level failure is reported, not raised.
                print("File is not parsaing")

        return searchable
Example #13
0
 def noimgpdf_change_word(self, _path):
     """
     Extract the text of a PDF that contains no images.

     :param _path: local path of the PDF, or a URL containing 'http://www'
     :return: concatenated text of all horizontal text boxes (also printed),
              or None if anything goes wrong
     """
     from io import BytesIO  # local import: buffers the download in memory

     try:
         if 'http://www' in _path:
             request = Request(
                 url=_path,
                 headers={'User-Agent': random.choice(self.user_agent)})
             # Read the whole online PDF into a seekable buffer: the parser
             # seeks within its input, which an HTTP response cannot do.
             response = urlopen(request)
             try:
                 fp = BytesIO(response.read())
             finally:
                 response.close()
         else:
             fp = open(_path, 'rb')  # open the local PDF
         try:
             praser_pdf = PDFParser(fp)
             doc = PDFDocument()
             praser_pdf.set_document(doc)
             doc.set_parser(praser_pdf)
             doc.initialize()
             if not doc.is_extractable:
                 raise PDFTextExtractionNotAllowed
             rsrcmgr = PDFResourceManager()
             laparams = LAParams()
             device = PDFPageAggregator(rsrcmgr, laparams=laparams)
             interpreter = PDFPageInterpreter(rsrcmgr, device)
             chunks = []
             for page in doc.get_pages():
                 interpreter.process_page(page)
                 layout = device.get_result()
                 for out in layout:
                     if isinstance(out, LTTextBoxHorizontal):
                         chunks.append(out.get_text())
             all_results = ''.join(chunks)
             print(all_results)
             return all_results
         finally:
             # Release the file / buffer on success and on failure.
             fp.close()
     except Exception:
         # Any failure (download, parsing, extraction not allowed) yields None.
         return None
Example #14
0
def ParsePDF():
    """Copy the text of the first page of ``pdfpath`` into ``a.docx``.

    Every first-page layout object that has a ``get_text`` method contributes
    one paragraph (non-breaking spaces and newlines removed) to the
    module-level ``document``, which is then saved as ``a.docx``.

    Returns:
        1 on completion.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    # Opened in binary read mode; the context manager closes the PDF on
    # every exit path.
    with open(pdfpath, 'rb') as pdf_file:
        # Parser bound to the file object.
        parser = PDFParser(pdf_file)
        # Document object holding the structure; an empty password is supplied.
        doc = PDFDocument(parser, password='')
        # Check whether text extraction is allowed.
        if not doc.is_extractable:
            print("Not Allowd Extractable")
            raise PDFTextExtractionNotAllowed

        # Resource manager for shared resources; caching is disabled.
        rsrcmgr = PDFResourceManager(caching = False)
        laparams = LAParams()
        # Page aggregator device and the interpreter that drives it.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            # layout is the LTPage object holding the objects parsed from this page.
            layout = device.get_result()
            for i in layout:
                if hasattr(i, "get_text"):
                    content = i.get_text().replace(u'\xa0', u'').replace('\n', '')
                    document.add_paragraph(content, style=None)
            break  # only the first page is converted
        document.save("a.docx")
    return 1
Example #15
0
 def parse(self, text_path):
     '''Return the text of the PDF at ``text_path``.

     The text of every horizontal text box is collected in page order, each
     followed by an extra newline.

     Raises:
         PDFTextExtractionNotAllowed: if the document forbids text extraction.
     '''
     # The context manager closes the PDF on every exit path.
     with open(text_path, 'rb') as fp:
         # Parser bound to the file object.
         parser = PDFParser(fp)
         # Create the document and connect it with the parser.
         doc = PDFDocument()
         parser.set_document(doc)
         doc.set_parser(parser)
         # Initialise without a password.
         doc.initialize()
         # Abort when the document does not allow text extraction.
         if not doc.is_extractable:
             raise PDFTextExtractionNotAllowed

         # Resource manager shared by the device and the interpreter.
         rsrcmgr = PDFResourceManager()
         laparams = LAParams()
         device = PDFPageAggregator(rsrcmgr, laparams=laparams)
         interpreter = PDFPageInterpreter(rsrcmgr, device)

         chunks = []
         # doc.get_pages() yields the pages; handle one page per iteration.
         for page in doc.get_pages():
             interpreter.process_page(page)
             # layout is the LTPage object holding the objects parsed from this page.
             layout = device.get_result()
             for x in layout:
                 if isinstance(x, LTTextBoxHorizontal):
                     chunks.append(x.get_text() + "\n")
         return ''.join(chunks)
Example #16
0
    def generateFileContent(self):
        """Build the abbreviation dictionary text from the Real Academia Galega PDF.

        The PDF is downloaded to a temporary file. For every grouped text
        line containing a colon, the part before the first colon (stripped,
        with ".." collapsed to ".") becomes an entry. The entries are
        formatted by ``formatEntriesForDictionary`` and appended to a
        two-line comment header.

        Returns:
            The dictionary file content as a unicode string.
        """
        import tempfile
        import urllib

        abbreviationsPdfUrl = u"http://www.realacademiagalega.org/c/document_library/get_file?uuid=f29e6ce1-9ac5-42e3-8c15-73c4b9b5f48b&groupId=10157"

        entries = set()
        # Both the temporary download and the handle used for parsing are
        # closed on every exit path.
        with tempfile.NamedTemporaryFile() as temporaryFile:
            urllib.urlretrieve(abbreviationsPdfUrl, temporaryFile.name)
            with open(temporaryFile.name, "rb") as fileObject:
                parser = PDFParser(fileObject)
                document = PDFDocument(parser)
                resourceManager = PDFResourceManager()
                device = PDFPageAggregator(resourceManager)
                interpreter = PDFPageInterpreter(resourceManager, device)
                for page in PDFPage.create_pages(document):
                    interpreter.process_page(page)
                    layout = device.get_result()
                    # Rectangles and curves are left out before grouping.
                    objects = [
                        item for item in layout
                        if not isinstance(item, (LTRect, LTCurve))
                    ]
                    params = LAParams()
                    for line in layout.group_objects(params, objects):
                        text = line.get_text()
                        if u":" in text:
                            entry = text.split(u":")[0]
                            entry = entry.strip()
                            entry = entry.replace(u"..", ".")
                            entries.add(entry)

        dictionary = u"# Abreviaturas empregadas no Dicionario da Real Academia Galega\n"
        dictionary += u"# http://www.realacademiagalega.org/abreviaturas\n"
        dictionary += u"\n"
        for entry in formatEntriesForDictionary(entries, u"abreviatura"):
            dictionary += entry
        return dictionary
Example #17
0
def parse(pdffile, txtfile):
    '''Append the text of the PDF at ``pdffile`` to ``txtfile``.

    The text of every horizontal text box is printed and appended to
    ``txtfile``, followed by an extra newline.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    '''
    # The context manager closes the PDF on every exit path.
    with open(pdffile, 'rb') as fp:
        parser = PDFParser(fp)  # create the PDF parser

        # Create the document and connect it with the parser.
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        # Initialise without a password.
        doc.initialize()

        # Abort when the document does not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager shared by the device and the interpreter.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Handle one page per iteration.
        for page in doc.get_pages():
            interpreter.process_page(page)
            # layout is the LTPage object holding the objects parsed from this page.
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    # Appending per box means the output file is only
                    # created once there is text to write.
                    with open(txtfile, 'a') as f:
                        results = x.get_text()
                        print(results)
                        f.write(results + "\n")
Example #18
0
def parse():
    """Append the text of the PDF at the module-level ``path`` to ``./1.txt``.

    The text of every horizontal text box is printed and appended to
    ``./1.txt``, followed by an extra newline.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    # Opened in binary read mode; the context manager closes the PDF on
    # every exit path.
    with open(path, 'rb') as fp:
        # Parser bound to the file object.
        praser = PDFParser(fp)
        # Create the document and connect it with the parser.
        doc = PDFDocument()
        praser.set_document(doc)
        doc.set_parser(praser)

        # Initialise without a password.
        doc.initialize()

        # Abort when the document does not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager shared by the device and the interpreter.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # doc.get_pages() yields the pages; handle one page per iteration.
        for page in doc.get_pages():
            interpreter.process_page(page)
            # layout is the LTPage object holding the objects parsed from this page.
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    with open(r'./1.txt', 'a') as f:
                        results = x.get_text()
                        print(results)
                        f.write(results + '\n')
Example #19
0
def extract_pdf(path, languages=None):
    """
    Pull metadata and per-page text out of a PDF file.

    pdfminer is tried first for every page. When it fails, or yields fewer
    than three characters, the page is handed to ``_extract_image_page``
    instead.

    Returns a dict with a ``'pages'`` list (one text entry per page) plus one
    key per usable entry of the document's last info dictionary.
    """
    with open(path, 'rb') as pdf_file:
        resource_manager = PDFResourceManager()
        aggregator = PDFPageAggregator(resource_manager, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resource_manager, aggregator)
        document = PDFDocument(PDFParser(pdf_file), '')

        extracted = {'pages': []}
        if len(document.info):
            for key, value in document.info[-1].items():
                key = key.lower().strip()
                value = string_value(value)
                # Skip the reserved 'pages' key, empty values and unresolved
                # object references.
                if key == 'pages' or value is None or '<PDFObjRef:' in value:
                    continue
                extracted[key] = string_value(value)

        for page_number, page in enumerate(PDFPage.create_pages(document),
                                           start=1):
            page_text = None
            try:
                page_interpreter.process_page(page)
                page_layout = aggregator.get_result()
                page_text = _convert_page(page_layout, path)
            except Exception as ex:
                log.warning("Failed to parse PDF page: %r", ex)

            if page_text is None or len(page_text) < 3:
                log.info("OCR: %r, pg. %s", path, page_number)
                page_text = _extract_image_page(path, page_number, languages)
            extracted['pages'].append(page_text)
        aggregator.close()
        return extracted
Example #20
0
    def PDF_to_text(PDF):
        """Extract the text of an uploaded PDF and normalise its bullets.

        ``PDF`` is handed to ``PDFParser``. The text of every text box / text
        line is concatenated, then bullet characters (with or without a
        trailing newline) are replaced by "-".

        Returns:
            The extracted, normalised text.
        """
        # https://stackoverflow.com/questions/44024697/how-to-read-pdf-file-using-pdfminer3k
        parser = PDFParser(PDF)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        # One resource manager is shared by the device and the interpreter.
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        chunks = []

        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    chunks.append(lt_obj.get_text())

        extrctd_txt = ''.join(chunks)
        extrctd_txt = extrctd_txt.replace("•\n", "-").replace("•", "-")
        # NOTE(review): if the replacement literal below is itself U+2013,
        # this call changes nothing - confirm the intended target character.
        extrctd_txt = extrctd_txt.replace(u'\u2013', '–')
        return extrctd_txt
Example #21
0
def convert(fname):
    """Return the text of the PDF at ``fname``.

    The text of every text box / text line is concatenated in page order.
    Layout analysis runs with ``char_margin`` and ``word_margin`` set to 1.0.
    """
    chunks = []
    # The context manager closes the PDF on every exit path.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    chunks.append(lt_obj.get_text())
    return ''.join(chunks)
Example #22
0
    def save_content(self, page_index=None, save_path=None):
        """Write the text of the PDF's horizontal text boxes to a file.

        Args:
            page_index: optional iterable of 1-based page numbers; when given,
                only those pages (taken from ``self.get_content_from_pdf()``)
                are written.
            save_path: output path; defaults to ``self.pdfpath`` with its file
                extension replaced by ``.doc``.

        Each text box is written followed by an extra newline, UTF-8 encoded.
        """
        import os

        pages = self.get_content_from_pdf()
        pdfresourcemanager = PDFResourceManager()  # resource manager for shared resources
        laparams = LAParams()
        pdfpageAggregator = PDFPageAggregator(pdfresourcemanager, laparams=laparams)  # PDF device object

        interpreter = PDFPageInterpreter(pdfresourcemanager, pdfpageAggregator)  # PDF interpreter object
        if page_index:  # restrict to the requested pages
            pages = [pages[i - 1] for i in page_index]
        if not save_path:
            # Swap only the extension: a plain str.replace('pdf', 'doc') also
            # rewrites "pdf" inside directory names and leaves a path without
            # "pdf" unchanged, so the source would be opened for writing.
            save_path = os.path.splitext(self.pdfpath)[0] + '.doc'
        with open(save_path, 'w', encoding='utf-8') as f:
            for page in pages:
                interpreter.process_page(page)
                layout = pdfpageAggregator.get_result()  # layout of the page
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        f.write(x.get_text())
                        f.write('\n')
Example #23
0
def get_pdf_contents(file, dirname=GUMMY_DIR):
    """Parse every page of a PDF.

    Args:
        file (data, str) : url or path or data of PDF.
        dirname (str)    : forwarded to ``get_pdf_pages``; if ``file`` is url, it is downloaded and saved there. (default= ``GUMMY_DIR``)

    Returns:
        list : One entry per page, as produced by ``parser_pdf_pages``
        (each element is a list which contains [text, bbox(x0,y0,x1,y1)]).
    """
    resource_manager = PDFResourceManager()
    aggregator = PDFPageAggregator(
        rsrcmgr=resource_manager,
        laparams=LAParams(detect_vertical=True),
    )
    page_interpreter = PDFPageInterpreter(rsrcmgr=resource_manager,
                                          device=aggregator)

    def _parse_page(page):
        # Lay the page out, then convert its layout objects.
        page_interpreter.process_page(page)
        return parser_pdf_pages(layout_objs=aggregator.get_result()._objs)

    with get_pdf_pages(file=file, dirname=dirname) as pages:
        return [_parse_page(page) for page in pages]
Example #24
0
 def __init__(self, stream, pages=None, laparams=None, precision=0.001):
     """Open a PDF from ``stream`` and decode its metadata.

     Args:
         stream: file-like object handed to ``PDFParser``.
         pages: stored as ``self.pages_to_parse``.
         laparams: optional dict of keyword arguments for ``LAParams``;
             ``None`` leaves layout parameters unset.
         precision: stored as ``self.precision``.

     ``self.metadata`` merges all of the document's info dictionaries; values
     are resolved when they expose ``resolve`` and then decoded (lists
     element-wise, literals by name, booleans kept as they are).
     """
     self.laparams = None if laparams is None else LAParams(**laparams)
     self.stream = stream
     self.pages_to_parse = pages
     self.precision = precision
     rsrcmgr = PDFResourceManager()
     self.doc = PDFDocument(PDFParser(stream))
     self.metadata = {}
     for info in self.doc.info:
         self.metadata.update(info)
     # Only existing keys are reassigned, so iterating items() here is safe.
     for k, v in self.metadata.items():
         if hasattr(v, "resolve"):
             v = v.resolve()
         if isinstance(v, list):
             self.metadata[k] = list(map(decode_text, v))
         elif isinstance(v, PSLiteral):
             self.metadata[k] = decode_text(v.name)
         elif isinstance(v, bool):
             self.metadata[k] = v
         else:
             self.metadata[k] = decode_text(v)
     self.device = PDFPageAggregator(rsrcmgr, laparams=self.laparams)
     self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
Example #25
0
    def read_file(self):
        """Read the PDF at ``self.path`` and store its text in ``self.content``.

        The text of every text box / text line is joined with single spaces
        and NUL characters are removed. Layout analysis runs with
        ``char_margin`` 0.1 and ``word_margin`` 1.0.
        """
        extracted_text = []
        # The file must stay open while the document is initialised and its
        # pages are read, so all parsing happens inside the ``with`` block.
        with open(self.path, 'rb') as f:
            parser = PDFParser(f)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            laparams.char_margin = 0.1
            laparams.word_margin = 1.0
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            for page in doc.get_pages():
                interpreter.process_page(page)
                layout = device.get_result()
                for lt_obj in layout:
                    if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                        extracted_text.append(lt_obj.get_text())
        self.content = ' '.join(extracted_text).replace('\x00', '')
Example #26
0
def Pdf2Txt(DataIO, Save_path):
    """Append the text of every horizontal text box in a PDF to a file.

    Args:
        DataIO: File-like object holding the PDF; handed to ``PDFParser``.
        Save_path: Path of the output file. It is opened in append mode and
            written as UTF-8, one text box per write followed by a newline.

    Raises:
        PDFTextExtractionNotAllowed: If the document is not extractable.
    """
    import io  # local import: io.open takes an encoding on Python 2 and 3

    # Create a PDF parser over the input stream.
    parser = PDFParser(DataIO)
    # Create a PDF document object that stores the document structure.
    document = PDFDocument(parser)
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Resource manager that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Layout-analysis parameters (defaults).
    laparams = LAParams()
    # Device that aggregates each page into a layout object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    # Interpreter that feeds page content to the device.
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Process every page.
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        # Fetch the layout object produced for this page.
        layout = device.get_result()
        for x in layout:
            if not isinstance(x, LTTextBoxHorizontal):
                continue
            try:
                # Opened per text box, as before, so the output file is only
                # created when there is text and one failed write does not
                # stop the remaining boxes from being attempted.
                with io.open(Save_path, 'a', encoding='utf-8') as f:
                    f.write(x.get_text() + u'\n')
            except Exception:
                # Best-effort: report and carry on. KeyboardInterrupt and
                # SystemExit are no longer swallowed.
                print("Failed!")
def parse_pdf(path, output_path):
    """Extract the text of the PDF at *path* and write it to *output_path*.

    The text of every ``LTTextBox`` / ``LTTextLine`` layout object is
    concatenated in page order with no separator, then written to
    *output_path* as UTF-8, replacing any existing file.
    """
    with open(path, 'rb') as pdf_file:
        pdf_parser = PDFParser(pdf_file)
        pdf_doc = PDFDocument()
        pdf_parser.set_document(pdf_doc)
        pdf_doc.set_parser(pdf_parser)
        pdf_doc.initialize('')

        resources = PDFResourceManager()
        params = LAParams()
        params.char_margin = 1.0
        params.word_margin = 1.0
        aggregator = PDFPageAggregator(resources, laparams=params)
        interpreter = PDFPageInterpreter(resources, aggregator)

        pieces = []
        for page in pdf_doc.get_pages():
            interpreter.process_page(page)
            pieces.extend(
                item.get_text()
                for item in aggregator.get_result()
                if isinstance(item, (LTTextBox, LTTextLine))
            )

    with open(output_path, "w", encoding="utf-8") as out:
        out.write(''.join(pieces))
    def parse_question_file(question_file_path):
        """Return the non-empty text boxes of a PDF as newline-terminated strings.

        Each ``LTTextBoxHorizontal`` on each page contributes one entry: its
        text, decoded and stripped, with ``'\\n'`` appended. Boxes whose
        stripped text is empty are skipped.

        Raises:
            PDFTextExtractionNotAllowed: If the document is not extractable.
        """
        lines = []
        with open(question_file_path, 'rb') as pdf_stream:
            pdf_doc = PDFDocument(PDFParser(pdf_stream))
            if not pdf_doc.is_extractable:
                raise PDFTextExtractionNotAllowed

            resources = PDFResourceManager()
            aggregator = PDFPageAggregator(resources, laparams=LAParams())
            interpreter = PDFPageInterpreter(resources, aggregator)

            for page in PDFPage.create_pages(pdf_doc):
                interpreter.process_page(page)
                for box in aggregator.get_result():
                    if not isinstance(box, LTTextBoxHorizontal):
                        continue
                    # NOTE(review): .decode() assumes get_text() returns an
                    # object with a decode method - confirm against the
                    # pdfminer version in use.
                    stripped = box.get_text().decode().strip()
                    if stripped:
                        lines.append(stripped + '\n')
        return lines
Example #29
0
def main(fname):
    """Extract the text of the PDF *fname* and print a snippet around a phrase.

    Every page layout is converted to text with ``parse_layout``; the
    combined text is passed to ``find_pattern`` together with the phrase
    "volunteer recycling" and the value 200, and the result is printed.

    Raises:
        Exception: If the document is not extractable.
    """
    with open(fname, 'rb') as fd:
        parser = PDFParser(fd)
        doc = PDFDocument(parser)

        # Abort early when the document is not extractable.
        if not doc.is_extractable:
            raise Exception(
                "text extraction is not allowed for %r" % (fname,))

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        page_texts = []
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            page_texts.append(parse_layout(device.get_result()))
        all_txt = "".join(page_texts)

        snip = find_pattern(all_txt, "volunteer recycling", 200)
        # Parenthesised form prints the same on Python 2 and parses on Python 3.
        print(snip)
Example #30
0
def parse(fo):
    """Return the text of each page of the PDF read from *fo*.

    The result holds one string per page, in page order, formed by
    concatenating the text of that page's ``LTTextBoxHorizontal`` objects.

    Raises:
        PDFTextExtractionNotAllowed: If the document is not extractable.
    """
    pdf_parser = PDFParser(fo)
    pdf_doc = PDFDocument()
    pdf_parser.set_document(pdf_doc)
    pdf_doc.set_parser(pdf_parser)
    pdf_doc.initialize()
    if not pdf_doc.is_extractable:
        raise PDFTextExtractionNotAllowed

    resources = PDFResourceManager()
    aggregator = PDFPageAggregator(resources, laparams=LAParams())
    interpreter = PDFPageInterpreter(resources, aggregator)

    def page_text(page):
        # Run the interpreter on the page, then join its horizontal boxes.
        interpreter.process_page(page)
        return ''.join(
            box.get_text()
            for box in aggregator.get_result()
            if isinstance(box, LTTextBoxHorizontal)
        )

    return [page_text(page) for page in pdf_doc.get_pages()]