Example #1
0
    def __init__(self, filename=''):
        """Parse the PDF at ``filename`` and collect its classifications.

        Each page is laid out with pdfminer and handed to
        ``self.process_page``; ``self.process_classifications`` runs after
        every page.  When all pages are done ``self.remove_debug`` and
        ``self.seperate`` are called, and the outcome is exposed through
        ``self.results`` and ``self.problems``.

        Args:
            filename: Path of the PDF file to read.

        Raises:
            PDFTextExtractionNotAllowed: If the document forbids text
                extraction.
        """
        # The context manager releases the file handle even when parsing
        # fails part-way through (it used to stay open forever).
        with open(filename, 'rb') as fp:
            # Create a PDF parser object associated with the file object.
            parser = PDFParser(fp)
            laparams = LAParams()

            # Create a PDF document object that stores the document
            # structure; the empty string is the password.
            document = PDFDocument(parser, '')
            # Check if the document allows text extraction. If not, abort.
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed
            # Resource manager shared by the device and the interpreter.
            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            self.all_classifications = {}
            self.problematicClassmarks = []

            # Process each page contained in the document.
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                layout = device.get_result()
                self.process_page(layout)

                self.process_classifications()

        self.remove_debug()

        self.seperate()

        self.results = self.all_classifications
        self.problems = self.problematicClassmarks
Example #2
0
    def generateFileContent(self):
        """Build the abbreviation dictionary from the RAG abbreviations PDF.

        The PDF is downloaded to a temporary file and every page is laid
        out with pdfminer.  Layout objects other than rectangles and curves
        are grouped into lines, and for each line containing a colon the
        text before the first colon becomes an entry.

        Returns:
            The dictionary file content: a three-line header followed by
            the entries produced by ``formatEntriesForDictionary``.
        """

        import tempfile
        import urllib

        abbreviationsPdfUrl = u"http://www.realacademiagalega.org/c/document_library/get_file?uuid=f29e6ce1-9ac5-42e3-8c15-73c4b9b5f48b&groupId=10157"

        entries = set()
        # Both the temporary download and the handle used for parsing are
        # closed deterministically instead of being left to the garbage
        # collector.
        with tempfile.NamedTemporaryFile() as temporaryFile:
            urllib.urlretrieve(abbreviationsPdfUrl, temporaryFile.name)

            with open(temporaryFile.name, "rb") as fileObject:
                parser = PDFParser(fileObject)
                document = PDFDocument(parser)
                resourceManager = PDFResourceManager()
                device = PDFPageAggregator(resourceManager)
                interpreter = PDFPageInterpreter(resourceManager, device)
                params = LAParams()
                for page in PDFPage.create_pages(document):
                    interpreter.process_page(page)
                    layout = device.get_result()
                    # Rectangles and curves carry no text; leave them out
                    # of the grouping step.
                    objects = [
                        item for item in layout
                        if not isinstance(item, (LTRect, LTCurve))
                    ]
                    for line in layout.group_objects(params, objects):
                        text = line.get_text()
                        if u":" in text:
                            entry = text.split(u":")[0]
                            entry = entry.strip()
                            entry = entry.replace(u"..", ".")
                            entries.add(entry)

        header = (
            u"# Abreviaturas empregadas no Dicionario da Real Academia Galega\n"
            u"# http://www.realacademiagalega.org/abreviaturas\n"
            u"\n"
        )
        return header + u"".join(
            formatEntriesForDictionary(entries, u"abreviatura"))
Example #3
0
def ParsePDF():
    """Copy the text of the first page of ``pdfpath`` into ``a.docx``.

    Relies on two names defined outside this function: ``pdfpath`` (path of
    the PDF to read) and ``document`` (the object that receives the
    ``add_paragraph`` and ``save`` calls).

    Returns:
        1 once the output document has been saved.

    Raises:
        PDFTextExtractionNotAllowed: If the PDF forbids text extraction.
    """
    # Open in binary read mode; the with-block closes the handle even when
    # an exception is raised below.
    with open(pdfpath, 'rb') as pdf_file:
        # Create a PDF parser from the file object.
        parser = PDFParser(pdf_file)
        # Create the PDF document object that stores the document
        # structure; the password is empty.
        doc = PDFDocument(parser, password='')
        # Check whether the file allows text extraction.
        if not doc.is_extractable:
            print("Not Allowd Extractable")
            raise PDFTextExtractionNotAllowed

        # Resource manager for shared resources; caching is disabled.
        rsrcmgr = PDFResourceManager(caching=False)
        laparams = LAParams()
        # Page aggregator device that collects the layout of each page.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            # layout is the LTPage holding the objects parsed from the page.
            layout = device.get_result()
            for item in layout:
                if hasattr(item, "get_text"):
                    content = item.get_text().replace(u'\xa0', u'').replace('\n', '')
                    document.add_paragraph(content, style=None)
            # Only the first page is converted.
            break
        document.save("a.docx")
    return 1
Example #4
0
def extract_citations(url):
    '''
    Download a PDF and write the text that follows its "References"
    heading to ``log_file``.

    Text is collected from every text box / text line that comes after the
    first one containing "References\\n"; the box holding the heading
    itself is not included.

    Arguments:
            url (string): url of a pdf of a research paper
    '''
    start_writing = False  # Don't want to start writing until we hit References
    collected = []

    # NOTE(review): this local file is only opened and printed, never
    # parsed -- it looks like debugging residue.  It used to be leaked when
    # ``fp`` was rebound below; it is now closed straight away.
    with open(my_file, "rb") as local_fp:
        print(local_fp)

    pdf_bytes = urllib2.urlopen(urllib2.Request(url)).read()
    fp = StringIO(pdf_bytes)
    try:
        parser = PDFParser(fp)
        document = PDFDocument(parser, password="")
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if not isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    continue
                text = lt_obj.get_text()
                # Append before testing for the heading so that the heading
                # box itself is skipped.
                if start_writing:
                    collected.append(text)
                if "References\n" in text:
                    start_writing = True
    finally:
        fp.close()

    extracted_text = "".join(collected)
    with open(log_file, "w") as my_log:
        my_log.write(extracted_text.encode("utf-8"))
    print("Done !!")
Example #5
0
    def get_pdf_metadata(self, pdf):
        """Get PDF metadata with PDF content

        The author, title and year are taken from the first Info dictionary
        of the document ('Author', 'Title' and 'ModDate' entries).  Any
        value that is missing or empty keeps its ``UNKNOWN_*`` placeholder.

        Args:
            pdf: PDF content (in bytes)

        Returns:
            metadata: PDF metadata dictionary with the keys 'author',
                'title' and 'year'

        """
        metadata = {'author': 'UNKNOWN_AUTHOR',
                    'title': 'UNKNOWN_TITLE',
                    'year': 'UNKNOWN_YEAR'}

        # The with-block closes the temporary file even if parsing raises.
        with tempfile.TemporaryFile() as temp_pdf_file:
            temp_pdf_file.write(pdf)

            pdf_parser = PDFParser(temp_pdf_file)
            pdf_doc = PDFDocument(pdf_parser)
            # A document without an Info dictionary has an empty ``info``
            # list; fall back to the placeholders instead of raising
            # IndexError.
            pdf_metadata = pdf_doc.info[0] if pdf_doc.info else {}

            author = make_pdf_metadata_str(pdf_metadata.get('Author', ''))
            if author:
                metadata['author'] = author

            title = make_pdf_metadata_str(pdf_metadata.get('Title', ''))
            if title:
                metadata['title'] = title

            year = pdf_metadata_moddate_to_year(
                make_pdf_metadata_str(pdf_metadata.get('ModDate', '')))
            if year:
                metadata['year'] = year

        return metadata
Example #6
0
def pdfminer(f):
    """Render the first page of the PDF at ``f`` into an ``HTMLConverter``.

    The converter is created from the base name of ``f``, receives the
    first page (``current_page``) and its layout (``render``), and has
    ``add_features`` called on it before being returned.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text
            extraction.
    """
    # NOTE(review): the handle is not closed here, as before; the returned
    # converter keeps a reference to the page, which presumably still needs
    # the open file -- confirm before adding a close.
    pdf_file = open(f, 'rb')
    pdf_document = PDFDocument(PDFParser(pdf_file))
    if not pdf_document.is_extractable:
        raise PDFTextExtractionNotAllowed

    resource_manager = PDFResourceManager()
    aggregator = PDFPageAggregator(
        resource_manager, laparams=LAParams(all_texts=True))
    page_interpreter = PDFPageInterpreter(resource_manager, aggregator)

    html = HTMLConverter(os.path.basename(f))

    for first_page in PDFPage.create_pages(pdf_document):
        page_interpreter.process_page(first_page)
        page_layout = aggregator.get_result()
        html.current_page = first_page
        html.render(page_layout)
        # Only the first page is rendered.
        break

    html.add_features()
    return html
def main():
    """Classify the PDF named by ``sys.argv[1]`` from its first page's text.

    The text of the first page is extracted with pdfminer, vectorised with
    a stored TF-IDF model and classified.  If the first classifier predicts
    'SingleStock' that label is returned; otherwise a second TF-IDF model
    and classifier decide between the remaining classes.

    Returns:
        The predicted label (first element of the classifier's prediction).
    """
    # First: extract text from PDF
    path_to_pdf = sys.argv[1]

    output = io.StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=None)
    interpreter = PDFPageInterpreter(manager, converter)
    # PDFPage.get_pages builds its own parser and document from the file
    # object, so no separate PDFParser/PDFDocument is needed.  The
    # with-block closes the file even if page processing raises.
    with open(path_to_pdf, 'rb') as infile:
        for page in PDFPage.get_pages(infile, caching=False):
            interpreter.process_page(page)
            break  # only the first page is used
    converter.close()
    text = [output.getvalue()]
    output.close()

    # Second: classify between Single Stock or not
    model_tfidf_ss = joblib.load('my_2D_tfidf_full_model.pkl')
    text_tfidf_ss_transformed = model_tfidf_ss.transform(text)

    model_clf_ss = joblib.load('my_2D_linear_reg_model.pkl')
    label_clf_ss_predicted = model_clf_ss.predict(text_tfidf_ss_transformed)

    # Third: if SingleStock, return, otherwise classify between Economy and
    # Industry.  Compare the single predicted label rather than the whole
    # prediction array.
    if label_clf_ss_predicted[0] == 'SingleStock':
        return label_clf_ss_predicted[0]

    model_tfidf_2D3D = joblib.load('my_2D3D_tfidf_model.pkl')
    text_tfidf_2D3D_transformed = model_tfidf_2D3D.transform(text)

    model_clf_2D3D = joblib.load('my_2D3D_linear_SVC_model.pkl')
    label_clf_2D3D_predicted = model_clf_2D3D.predict(
        text_tfidf_2D3D_transformed)

    return label_clf_2D3D_predicted[0]
Example #8
0
    def character_extraction(self, address):
        """Pass every text box and text line of a PDF to ``self.fetch_chars``.

        The file at ``address`` is opened in binary mode and closed again
        when processing ends, whether or not an error occurred.
        ``self.page_num`` is incremented once per processed page.
        """
        with open(address, 'rb') as pdf_file:
            # Parse the file and build the document (empty password).
            pdf_document = PDFDocument(PDFParser(pdf_file), '')

            # The resource manager stores shared resources such as fonts or
            # images and is shared by the device and the interpreter.
            resources = PDFResourceManager()

            # A page aggregator is used as the device so that the LT layout
            # objects of each page can be retrieved.
            aggregator = PDFPageAggregator(resources, laparams=LAParams())
            page_interpreter = PDFPageInterpreter(resources, aggregator)

            for pdf_page in PDFPage.create_pages(pdf_document):
                page_interpreter.process_page(pdf_page)
                # Of the LT objects in the layout only text boxes and text
                # lines are of interest.
                for element in aggregator.get_result():
                    if not isinstance(element, (LTTextBox, LTTextLine)):
                        continue
                    self.fetch_chars(element)
                self.page_num += 1
Example #9
0
    def parse_data(self, path, filetype, **kwargs):
        """Read the Info dictionary of a PDF into ``self.metadata``.

        Only the first cross-reference section of the document is
        consulted.  Indirect references among the Info values are resolved.

        Args:
            path: Path of the file to read; stored as ``self.filename``.
            filetype: File type of ``path``; anything other than
                ``FileTypes.PDF`` is ignored.
            **kwargs: Accepted but not used by this method.

        Returns:
            ``self`` when metadata was found and ``self._parse_data()``
            ran, otherwise ``None``.  Failures are recorded in
            ``self.errors``.
        """
        self.filename = path
        self.metadata = {}

        if not filetype == FileTypes.PDF:
            return None

        # Default result.  It used to be unbound on the 'Cannot parse
        # document' path, which made the final return raise.
        out = None
        with open(self.filename, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument(parser)

            if doc:
                try:
                    for xref in doc.xrefs:
                        info_ref = xref.trailer.get('Info')
                        info = resolve1(info_ref) if info_ref else None
                        # A trailer without an Info entry leaves the
                        # metadata empty (reported below) instead of
                        # failing on ``None.items()``.
                        if info:
                            self.metadata = {
                                k: resolve1(v) if isinstance(v, PDFObjRef) else v
                                for k, v in info.items()
                            }
                        break
                    if not self.metadata:
                        self.errors.append('No metadata found')
                    else:
                        self._parse_data()
                        out = self
                except Exception as e:
                    self.logger.error(str(e))
                    self.errors.append(str(e))
                    out = None
            else:
                self.errors.append('Cannot parse document')

            parser.close()
        return out
Example #10
0
def extractembedded(outfp,
                    fname,
                    objids,
                    pagenos,
                    password='',
                    dumpall=False,
                    codec=None,
                    extractdir=None):
    """Write every file embedded in the PDF ``fname`` into ``extractdir``.

    All objects of all cross-reference sections are scanned; each
    dictionary whose 'Type' is ``LITERAL_FILESPEC`` is extracted.  Only
    ``fname``, ``password`` and ``extractdir`` are used by this function;
    the other parameters are accepted but ignored.

    Raises:
        PDFValueError: If a file specification does not reference an
            embedded-file stream.
        IOError: If the target file already exists.
    """
    def extract1(obj):
        # Extract the single embedded file described by file spec ``obj``.
        # 'UF' is looked up with .get() so that a specification carrying
        # only 'F' no longer fails with KeyError.
        filename = os.path.basename(obj.get('UF') or obj['F'])
        fileref = obj['EF']['F']
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            raise PDFValueError(
                'unable to process PDF: reference for %r is not a PDFStream' %
                (filename))
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                'unable to process PDF: reference for %r is not an EmbeddedFile'
                % (filename))
        path = os.path.join(extractdir, filename)
        if os.path.exists(path):
            raise IOError('file exists: %r' % path)
        sys.stderr.write('extracting: %r\n' % path)
        # Close the output file even if writing fails.
        with open(path, 'wb') as out:
            out.write(fileobj.get_data())
        return

    # The input handle used to be left open; close it when done.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                obj = doc.getobj(objid)
                if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
                    extract1(obj)
    return
Example #11
0
def extract_pdf(path, languages=None):
    """
    Extract content from a PDF file.

    pdfminer is tried first for every page.  When a page fails to parse,
    or yields fewer than three characters of text, the page is sent
    through OCR instead.  Entries of the last Info dictionary are copied
    into the result under their lower-cased, stripped keys.

    Returns a dict with a 'pages' list holding one text per page.
    """
    with open(path, 'rb') as fh:
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        doc = PDFDocument(PDFParser(fh), '')

        result = {'pages': []}
        if len(doc.info):
            for key, value in doc.info[-1].items():
                key = key.lower().strip()
                value = string_value(value)
                # Skip the reserved 'pages' key, empty values and
                # unresolved object references.
                if key == 'pages' or value is None or '<PDFObjRef:' in value:
                    continue
                result[key] = string_value(value)

        for page_no, page in enumerate(PDFPage.create_pages(doc), 1):
            try:
                interpreter.process_page(page)
                text = _convert_page(device.get_result(), path)
            except Exception as ex:
                log.warning("Failed to parse PDF page: %r", ex)
                text = None

            if text is None or len(text) < 3:
                log.info("OCR: %r, pg. %s", path, page_no)
                text = _extract_image_page(path, page_no, languages)
            result['pages'].append(text)
        device.close()
        return result
Example #12
0
    def parse_document(self):
        """Lay out every page of ``self.pdf`` and collect its objects.

        For each page the crop box and media box coordinates are stored in
        ``self.media_boxes`` under the page index, and the layout objects
        are passed to ``self.get_objects``, whose return value becomes the
        new ``self.res``.

        Returns:
            ``(self.res, self.media_boxes)`` when the document allows text
            extraction, otherwise ``None``.
        """
        self.res = []  # result set
        self.media_boxes = dict()  # media coordinate dictionary
        self.n = 0  # page count

        if constants.USE_CUSTOM_PDF_PARAMETERS:
            la_params = LAParams(detect_vertical=constants.DEFAULT_DETECT_VERTICAL,
                                 line_overlap=constants.DEFAULT_LINE_OVERLAP,
                                 line_margin=constants.DEFAULT_LINE_MARGIN,
                                 word_margin=constants.DEFAULT_WORD_MARGIN,
                                 char_margin=constants.DEFAULT_CHAR_MARGIN,
                                 boxes_flow=constants.DEFAULT_BOXES_FLOW)
        else:
            la_params = LAParams(detect_vertical=True)

        # The with-block closes the PDF once all pages are processed; the
        # handle used to stay open.
        with open(self.pdf, "rb") as pdf:
            pdf_document = PDFDocument(PDFParser(pdf))
            if not pdf_document.is_extractable:
                # Unchanged contract: non-extractable documents yield None.
                return None

            resource_manager = PDFResourceManager()
            page_aggregator = PDFPageAggregator(resource_manager,
                                                laparams=la_params)
            page_interpreter = PDFPageInterpreter(resource_manager,
                                                  page_aggregator)

            for page in PDFPage.create_pages(pdf_document):
                page_interpreter.process_page(page)
                layout = page_aggregator.get_result()
                crop_box = page.cropbox
                page_box = page.mediabox
                self.media_boxes[self.n] = {"x0": crop_box[0], "y0": crop_box[1],
                                            "x1": crop_box[2], "y1": crop_box[3],
                                            "x0page": page_box[0], "y0page": page_box[1],
                                            "x1page": page_box[2], "y1page": page_box[3]}
                self.box_id = -1
                self.res = self.get_objects(layout._objs, self.res, self.n, self.media_boxes)
                self.n += 1

        return self.res, self.media_boxes
def convert(input_file):
    """Count technology keywords in a PDF and return the counts as JSON.

    The layout objects of every page are sorted top-to-bottom (descending
    ``y0``), concatenated in page order and turned into text with
    ``extract_text``.  Each keyword in ``technology_jargon.keywords`` that
    occurs at least once is included in the result.

    Args:
        input_file: Path of the PDF file.

    Returns:
        A JSON object string mapping keyword to occurrence count.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text
            extraction.
    """
    objs = []
    # Close the file as soon as all pages have been laid out; it used to
    # stay open.
    with open(input_file, 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        document = PDFDocument(parser)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # receive the LTPage object for the page.
            layout = device.get_result()
            # Extend the flat list directly instead of collecting per-page
            # lists and flattening them with the quadratic sum(objs, []).
            objs.extend(
                sorted(get_objects(layout), key=lambda x: x.y0, reverse=True))

    resume_as_text = extract_text(objs)

    counter = Counter()

    for word in technology_jargon.keywords:
        count = count_of_technology_words(word, resume_as_text)
        if count > 0:
            counter[word] += count

    return json.dumps(counter)
def get_total(filename):
    """Return the largest integer found after "Subtotal for all regions".

    Horizontal text boxes are scanned in document order.  Starting with
    the first box that contains "Subtotal for all regions", the first line
    of each box (with spaces removed) is tried as an integer and the
    maximum is kept.

    Args:
        filename: Path of the PDF file.

    Returns:
        The largest integer found, or -1 if there was none.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text
            extraction.
    """
    temp_total = -1

    # The with-block closes the file; the handle used to be leaked.
    with open(filename, 'rb') as path:
        parser = PDFParser(path)
        document = PDFDocument(parser)

        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        check_total = False

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            for x in layout:
                if not isinstance(x, LTTextBoxHorizontal):
                    continue
                results = str(x.get_text())
                if "Subtotal for all regions" in results:
                    check_total = True
                if not check_total:
                    continue
                first_line = re.search(r'(.*)\n', results, re.M | re.I)
                if first_line is None:
                    # Text without a newline has no complete first line;
                    # skip it instead of failing on ``None.group``.
                    continue
                temp_results = first_line.group(1)
                temp_results = temp_results.replace(" ", "").replace(
                    "\\n", "")
                try:
                    temp_num = int(temp_results)
                except ValueError:
                    continue
                if temp_num > temp_total:
                    temp_total = temp_num

    return temp_total
Example #15
0
def extract_layout_by_page(pdf_path):
    """
    Extracts LTPage objects from a pdf file.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        A list with the layout result of each page, in page order.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text
            extraction.
    """
    laparams = LAParams()
    layouts = []

    # All pages are laid out inside the with-block, so the file can be
    # closed before returning; it used to stay open.
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)

        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layouts.append(device.get_result())

    return layouts
Example #16
0
def pdf_to_string(pdf_file):
    """Print every layout object of every page of a PDF.

    Uses the two-step document set-up (``set_document`` / ``set_parser`` /
    ``initialize``) with an empty password, and line/word margins of 0.3.

    Args:
        pdf_file: Path of the PDF file.
    """
    # The with-block closes the file once all pages are printed; the
    # handle used to be leaked.
    with open(pdf_file, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()

        # Margin configuration
        laparams = LAParams()
        laparams.line_margin = 0.3
        laparams.word_margin = 0.3
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)

        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                print(lt_obj)
Example #17
0
def get_problem_page(problem, pdf_path):
    """
    Returns the pdf object belonging to the page of a problem widget

    Parameters
    ----------
    problem : Problem
        Problem object in the database of the currently selected problem
    pdf_path : str
        Path to the PDF file of the exam for this problem

    Returns
    -------
    page : PDFPage
        PDFPage object with information about the current page
    """
    # NOTE(review): the file is left open, as before; the returned page
    # presumably still reads from it -- confirm before adding a close.
    pdf_file = open(pdf_path, 'rb')
    pdf_document = PDFDocument(PDFParser(pdf_file))

    wanted = problem.widget.page
    all_pages = PDFPage.create_pages(pdf_document)
    # Slice out exactly the one page at index ``wanted``.
    only_wanted = itertools.islice(all_pages, wanted, wanted + 1)
    return next(only_wanted)
    def parse_question_file(question_file_path):
        """Collect the non-empty horizontal text boxes of a PDF.

        The text of each box is decoded and stripped; non-empty results
        are returned in document order, each terminated with a newline.

        Raises PDFTextExtractionNotAllowed if the document forbids text
        extraction.
        """
        collected = []
        with open(question_file_path, 'rb') as pdf_file:
            pdf_document = PDFDocument(PDFParser(pdf_file))
            if not pdf_document.is_extractable:
                raise PDFTextExtractionNotAllowed

            resources = PDFResourceManager()
            aggregator = PDFPageAggregator(resources, laparams=LAParams())
            page_interpreter = PDFPageInterpreter(resources, aggregator)

            for pdf_page in PDFPage.create_pages(pdf_document):
                page_interpreter.process_page(pdf_page)
                for element in aggregator.get_result():
                    if not isinstance(element, LTTextBoxHorizontal):
                        continue
                    stripped = element.get_text().decode().strip()
                    if stripped:
                        collected.append(stripped + '\n')
        return collected
Example #19
0
def extract_text(in_path, out_path):
    """Convert every PDF matched by ``in_path + '*.pdf'`` to a text file.

    For each PDF the extracted text is written to ``out_path`` plus the
    PDF's base name with its extension replaced by ``.txt``.  A progress
    percentage is printed while working.

    Args:
        in_path: Prefix (typically a directory ending in a separator) that
            is concatenated with ``'*.pdf'`` to find the input files.
        out_path: Prefix that is concatenated with each output file name.
    """
    # https://towardsdatascience.com/pdf-text-extraction-in-python-5b6ab9e92dd
    files = glob.glob(in_path + '*.pdf')
    total = len(files)

    for i, file_path in enumerate(files):
        print(str(i / total * 100)[:4] + "%", end="\r")
        output_string = StringIO()

        # glob already returns paths that include ``in_path``; prefixing it
        # a second time produced paths that do not exist.
        with open(file_path, 'rb') as infile:
            parser = PDFParser(infile)
            doc = PDFDocument(parser)
            rsrcmgr = PDFResourceManager()
            device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)

        # Swap only the extension; str.replace("pdf", "txt") also rewrote
        # any "pdf" occurring inside the file name itself.
        stem = os.path.splitext(os.path.basename(file_path))[0]
        out_filename = out_path + stem + ".txt"

        with open(out_filename, 'w') as outfile:
            outfile.write(output_string.getvalue())
Example #20
0
def load_data_from_pdf(pdf):
    """Read the AcroForm fields of a PDF and normalise them to booleans.

    The fields are loaded with ``load_fields_from_pdf`` and reshaped by
    ``split_data``.  Two entries are then converted specially (reader
    limit sets: more than 4; product configuration: equals the PCDA/S-ICR
    option), and every remaining 'Yes' becomes True while 'Off' / 'No'
    become False.
    """
    with open(pdf, 'rb') as pdf_file:
        parser = PDFParser(pdf_file)
        doc = PDFDocument(parser)
        parser.set_document(doc)
        form = resolve1(doc.catalog['AcroForm'])
        outcome = [
            load_fields_from_pdf(resolve1(field)) for field in form['Fields']
        ]

    # format the outcome of data extract from ics pdf
    outcome = split_data(outcome)

    limit_key = 'Max Dynamic Reader Limit sets supported'
    if outcome[limit_key]:
        outcome[limit_key] = int(outcome[limit_key]) > 4

    config_key = 'Product Configuration'
    if outcome[config_key]:
        outcome[config_key] = (
            outcome[config_key] == '(A) PCDA (IRWIN Reader) / S-ICR')

    for key in outcome:
        value = outcome[key]
        if value == 'Yes':
            outcome[key] = True
        elif value in ['Off', 'No']:
            outcome[key] = False
    return outcome
Example #21
0
def extract_pages(fp, start=None, end=None):
    """ extracts LTPage objects from a pdf file

    slightly modified from: https://euske.github.io/pdfminer/programming.html

    Args:
        fp: Binary file object of the PDF.
        start: 0-based index of the first page to yield; ``None`` means
            start at the first page.
        end: 0-based index one past the last page to yield; ``None`` means
            continue to the last page.

    Yields:
        The layout result of each selected page, in page order.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text
            extraction (raised when iteration starts).
    """
    laparams = LAParams()

    parser = PDFParser(fp)
    document = PDFDocument(parser)

    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    manager = PDFResourceManager()
    device = PDFPageAggregator(manager, laparams=laparams)
    interpreter = PDFPageInterpreter(manager, device)

    for i, page in enumerate(PDFPage.create_pages(document)):
        # Each bound is checked on its own.  The old combined condition
        # parsed as ``(... and i < start) or i >= end`` and compared ``i``
        # with ``None`` whenever ``end`` was left at its default.
        if start is not None and i < start:
            continue
        if end is not None and i >= end:
            # No later page can be in range.
            break

        interpreter.process_page(page)
        yield device.get_result()
Example #22
0
def shan_convert(pdf_path):
    """Return the text of all pages of the PDF at ``pdf_path``.

    If the document does not allow text extraction it is first re-saved
    through pikepdf next to the original (``<name>shan_temp.pdf``) and the
    copy is read instead.

    Args:
        pdf_path: Path of the PDF file; the last four characters are
            treated as the extension when the copy's name is built.

    Returns:
        The extracted text.
    """
    fp = open(pdf_path, 'rb')
    try:
        document = PDFDocument(PDFParser(fp))
        if not document.is_extractable:
            temp_path = pdf_path[:-4] + "shan_temp" + ".pdf"
            with pikepdf.open(pdf_path) as temp_file:
                temp_file.save(temp_path)
            # Release the original handle before switching to the copy; it
            # used to be leaked when ``fp`` was rebound.
            fp.close()
            fp = open(temp_path, 'rb')
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        codec = 'utf-8'
        laparams = LAParams()
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # An empty page-number set selects every page.
        pagenos = set()
        for page in PDFPage.get_pages(fp, pagenos):
            interpreter.process_page(page)
        text = retstr.getvalue()
        device.close()
        retstr.close()
        return text
    finally:
        # Runs on success and on failure; closing twice is harmless.
        fp.close()
Example #23
0
def main(fname):
    """Print the snippet around "volunteer recycling" found in a PDF.

    The text of all pages (via ``parse_layout``) is concatenated and
    passed to ``find_pattern`` with a width of 200; the result is printed.
    Raises ``Exception`` when the document forbids text extraction.
    """
    with open(fname, 'rb') as fd:
        doc = PDFDocument(PDFParser(fd))

        # Abort when the document is not extractable.
        if not doc.is_extractable:
            raise Exception

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        all_txt = ""
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            all_txt += parse_layout(device.get_result())

        snip = find_pattern(all_txt, "volunteer recycling", 200)
        print(snip)
Example #24
0
def parse_pdf(path, output_path):
    """Write the text of all text boxes and lines of a PDF to a UTF-8 file.

    ``path`` is the PDF to read and ``output_path`` the text file to
    (over)write.  The document is set up with an empty password.
    """
    pieces = []
    with open(path, 'rb') as pdf_file:
        pdf_parser = PDFParser(pdf_file)
        pdf_doc = PDFDocument()
        pdf_parser.set_document(pdf_doc)
        pdf_doc.set_parser(pdf_parser)
        pdf_doc.initialize('')

        resources = PDFResourceManager()
        layout_params = LAParams(all_texts=True,
                                 boxes_flow=2.0,
                                 heuristic_word_margin=True)
        aggregator = PDFPageAggregator(resources, laparams=layout_params)
        page_interpreter = PDFPageInterpreter(resources, aggregator)

        for pdf_page in pdf_doc.get_pages():
            page_interpreter.process_page(pdf_page)
            for element in aggregator.get_result():
                if isinstance(element, (LTTextBox, LTTextLine)):
                    pieces.append(element.get_text())

    extracted_text = ''.join(pieces)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(extracted_text)
Example #25
0
def with_pdf(pdf_doc, fn, pdf_pwd, *args):
    """Open the pdf document, and apply the function, returning the results

    Args:
        pdf_doc: Path of the PDF file.
        fn: Callable invoked as ``fn(doc, *args)`` with the parsed
            document when it allows text extraction.
        pdf_pwd: Password used to open the document.
        *args: Extra positional arguments forwarded to ``fn``.

    Returns:
        Whatever ``fn`` returns, or ``None`` when the document is not
        extractable or an ``IOError`` occurred (e.g. the file is missing).
    """
    result = None
    try:
        # The with-block closes the file even when parsing or ``fn``
        # raises; previously the handle leaked on any exception.
        with open(pdf_doc, "rb") as fp:
            # create a parser object associated with the file object
            parser = PDFParser(fp)
            # create a PDFDocument object that stores the document structure
            doc = PDFDocument(parser, pdf_pwd)
            # connect the parser and document objects
            parser.set_document(doc)

            if doc.is_extractable:
                # apply the function and return the result
                result = fn(doc, *args)
    except IOError:
        # the file doesn't exist or similar problem
        pass
    return result
Example #26
0
 def __init__(self, stream, pages=None, laparams=None, precision=0.001):
     """Open a PDF from ``stream`` and read its document metadata.

     Args:
         stream: Binary file object of the PDF.
         pages: Stored as ``self.pages_to_parse``.
         laparams: ``None`` or a dict of keyword arguments for ``LAParams``.
         precision: Stored as ``self.precision``.

     The entries of all Info dictionaries are merged into
     ``self.metadata`` and decoded: lists element-wise, literals by name,
     booleans kept as they are, everything else via ``decode_text``.
     """
     self.laparams = None if laparams is None else LAParams(**laparams)
     self.stream = stream
     self.pages_to_parse = pages
     self.precision = precision
     rsrcmgr = PDFResourceManager()
     self.doc = PDFDocument(PDFParser(stream))
     self.metadata = {}
     for info in self.doc.info:
         self.metadata.update(info)
     # Iterate over a snapshot because the values are replaced in place.
     for k, v in list(self.metadata.items()):
         if hasattr(v, "resolve"):
             v = v.resolve()
         if isinstance(v, list):
             self.metadata[k] = list(map(decode_text, v))
         elif isinstance(v, PSLiteral):
             self.metadata[k] = decode_text(v.name)
         elif isinstance(v, bool):
             self.metadata[k] = v
         else:
             self.metadata[k] = decode_text(v)
     self.device = PDFPageAggregator(rsrcmgr, laparams=self.laparams)
     self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
Example #27
0
def Pdf2Txt(DataIO,Save_path):
    """Append the text of every horizontal text box of a PDF to a file.

    Args:
        DataIO: Binary file object holding the PDF.
        Save_path: Path of the text file that is appended to.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text
            extraction.
    """
    # Create a PDF parser, then the document object that stores the
    # document structure.
    parser = PDFParser(DataIO)
    document = PDFDocument(parser)
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Resource manager that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Layout analysis parameters.
    laparams = LAParams()
    # Page aggregator device and the interpreter that drives it.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process every page.
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        # Receive the LTPage object of this page.
        layout = device.get_result()
        for x in layout:
            if not isinstance(x, LTTextBoxHorizontal):
                continue
            try:
                with open('%s' % (Save_path), 'a') as f:
                    f.write(x.get_text().encode('utf-8') + '\n')
            except Exception:
                # Best effort: report the failure and carry on with the
                # next box.  ``Exception`` (not a bare except) lets
                # KeyboardInterrupt and SystemExit propagate.
                print("Failed!")
Example #28
0
def parse(Path):
    """Return (and print) the text of every horizontal text box of a PDF.

    ``Path`` is passed straight to ``PDFParser``.  Raises
    PDFTextExtractionNotAllowed when the document cannot be extracted.
    """
    pdf_document = PDFDocument(PDFParser(Path))
    re_list = []

    # Check whether the PDF can be parsed.
    if not pdf_document.is_extractable:
        raise PDFTextExtractionNotAllowed

    resources = PDFResourceManager()
    aggregator = PDFPageAggregator(resources, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resources, aggregator)

    for pdf_page in PDFPage.create_pages(pdf_document):
        page_interpreter.process_page(pdf_page)
        re_list.extend(
            box.get_text()
            for box in aggregator.get_result()
            if isinstance(box, LTTextBoxHorizontal)
        )
    print(re_list)
    return re_list
Example #29
0
    def parse_pdf(self, source_pdf: str = None) -> None:
        """Parse source PDF into entities which can be
        used for text searches for example.

        Every page of the active file object is run through an
        ``RPAConverter``; the value returned by closing the converter is
        stored as ``self.rpa_pdf_document``.

        :param source_pdf: source; when given, it is made the active
            document first
        """
        if source_pdf is not None:
            self.switch_to_pdf_document(source_pdf)

        document = PDFDocument(PDFParser(self.active_fileobject))
        resources = PDFResourceManager()
        layout_params = LAParams(detect_vertical=True, all_texts=True)
        converter = RPAConverter(resources, laparams=layout_params)
        page_interpreter = PDFPageInterpreter(resources, converter)

        # Look at all (nested) objects on each page
        for pdf_page in PDFPage.create_pages(document):
            page_interpreter.process_page(pdf_page)
        self.rpa_pdf_document = converter.close()
Example #30
0
File: app.py Project: rjeli/docfeed
def get_blurb():
    """Pick a random page of a random PDF under ``/pdfs`` and excerpt it.

    Returns:
        A ``(link, text)`` tuple: the PDF path with 'pdfs' replaced by
        'view' plus a ``#page=<index>`` fragment, and the first 100
        characters of the page text.  ``('', '')`` when ``/pdfs`` holds no
        files.
    """
    pdfs = glob.glob('/pdfs/*')
    if not pdfs:
        sys.stderr.write('NO PDFS\n')
        return '', ''
    pdf = random.choice(pdfs)
    sys.stderr.write('pdf: %s\n' % pdf)
    with open(pdf, 'rb') as f:
        parser = PDFParser(f)
        document = PDFDocument(parser)
        assert document.is_extractable
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        device = TextConverter(rsrcmgr,
                               retstr,
                               codec='utf-8',
                               laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        pages = list(PDFPage.get_pages(f))
        # randrange excludes its upper bound.  randint(0, len(pages)) could
        # return len(pages) and index one past the last page.
        pnum = random.randrange(len(pages))
        interpreter.process_page(pages[pnum])
        txt = retstr.getvalue()
    return pdf.replace('pdfs', 'view') + '#page=' + str(pnum), txt[:100]