def convert(fname):
    """Extract the text of every second page of a PDF file.

    Only pages whose zero-based index is odd (the 2nd, 4th, ... page of the
    document) are run through the pdfminer interpreter; the other pages are
    skipped.

    Args:
        fname: Path of the PDF file to read.

    Returns:
        str: The text pdfminer wrote for the selected pages.
    """
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    try:
        # The context manager closes the file even if a page fails to parse.
        with open(fname, 'rb') as infile:
            # An empty page-number set means no page filter is applied.
            all_pages = PDFPage.get_pages(infile, set())
            for pagenumber, page in enumerate(all_pages):
                # Keep odd zero-based indexes only; use `not pagenumber % 2`
                # instead to keep the other half of the pages.
                if pagenumber % 2:
                    interpreter.process_page(page)
        return output.getvalue()
    finally:
        converter.close()
        output.close()
    def __get_pdf_text__(self):
        """Return the text of the PDF at ``self.file_path`` as one string.

        Every page is rendered with pdfminer's ``TextConverter``; afterwards
        each newline is dropped and each carriage return becomes two spaces.

        Returns:
            text (string): The post-processed text of all pages
        Code from:
        https://stackoverflow.com/questions/56494070/how-to-use-pdfminer-six-with-python-3
        """
        manager = PDFResourceManager()
        buffer = io.StringIO()
        device = TextConverter(manager, buffer, laparams=LAParams())
        interpreter = PDFPageInterpreter(manager, device)

        with open(self.file_path, 'rb') as pdf_file:
            page_iter = PDFPage.get_pages(pdf_file,
                                          caching=True,
                                          check_extractable=True)
            for page in page_iter:
                interpreter.process_page(page)

            raw_text = buffer.getvalue()

        # release the converter and the in-memory buffer
        device.close()
        buffer.close()

        without_newlines = raw_text.replace('\n', '')
        return without_newlines.replace('\r', '  ')
Exemple #3
0
def extract_pdf_content(pdf_path):
    """Return the text of the PDF stored at ``pdf_path``.

    Each page is run through a pdfminer ``TextConverter`` that writes into an
    in-memory buffer; the buffer content is returned once all pages are done.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The text collected from the pages.
    """
    manager = PDFResourceManager()
    buffer = StringIO()  # receives everything the converter writes
    converter = TextConverter(rsrcmgr=manager,
                              outfp=buffer,
                              codec='utf-8',
                              laparams=LAParams())

    with open(pdf_path, 'rb') as pdf_file:  # the PDF is read in binary mode
        interpreter = PDFPageInterpreter(manager, converter)
        page_iter = PDFPage.get_pages(pdf_file,
                                      set(),
                                      maxpages=0,
                                      password="",
                                      caching=True,
                                      check_extractable=True)
        for page in page_iter:
            # Rendering a page appends its text to the buffer.
            interpreter.process_page(page)

    extracted = buffer.getvalue()  # snapshot the text before closing
    converter.close()
    buffer.close()
    return extracted
    def __extracttxt2(self):
        """Helper function to extract text by pdfminer, slower but handles
        formats not recognised by PyMupdf.

        Reads the file at ``self.filepath`` and returns the text of its pages
        as one string.
        """
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        try:
            # `with` guarantees the file is closed even when a page fails.
            with open(str(self.filepath), 'rb') as fp:
                for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                              password="", caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
            return retstr.getvalue()
        finally:
            device.close()
            retstr.close()
Exemple #5
0
    def to_html(self, pdf_path):
        """Convert the PDF at ``pdf_path`` to HTML and keep the result on self.

        Sets ``self.html`` to the decoded converter output and ``self.body``
        to the parsed result of ``self._replace(self.html)``. If reading or
        converting the file raises, a message is printed and whatever the
        converter produced so far is still used.
        """
        manager = PDFResourceManager()
        buffer = io.BytesIO()
        converter = HTMLConverter(manager, buffer, codec='utf-8',
                                  laparams=LAParams())
        interpreter = PDFPageInterpreter(manager, converter)

        try:
            with open(pdf_path, 'rb') as pdf_file:
                page_iter = PDFPage.get_pages(pdf_file,
                                              set(),
                                              maxpages=0,  # 0 is for all
                                              password="",
                                              caching=True,
                                              check_extractable=True)
                for page in page_iter:
                    interpreter.process_page(page)
        except Exception:
            print("[ERR] file path=", pdf_path)

        # The converter is closed before its output is read back.
        converter.close()
        html_bytes = buffer.getvalue()
        buffer.close()

        self.html = html_bytes.decode('utf-8')
        self.body = soup(self._replace(self.html), 'html.parser')
Exemple #6
0
def convert_pdf_to_txt(path):
    """Return the text of the PDF at ``path``.

    Args:
        path: Path of the PDF file to read.

    Returns:
        str: The text pdfminer's ``TextConverter`` wrote for the pages.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    try:
        # `with` closes the file even when pdfminer raises part-way through.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp,
                                          set(),
                                          maxpages=0,
                                          password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()
Exemple #7
0
def pdf_pages_to_list_of_strings(pdf_path):
    """Return the text of a PDF as a list with one string per page.

    All pages are written into one shared buffer; the text of a single page
    is recovered by reading the buffer from the position it had before that
    page was processed.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        list: One text string per page, in page order.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    pages_text = []
    try:
        # The file, converter and buffer are now released on every path.
        with open(pdf_path, 'rb') as pdf:
            for page in PDFPage.get_pages(pdf):
                # Position of the stream before this page is written
                # (zero on the first page).
                read_position = retstr.tell()

                # Render the page; its text is appended to the stream.
                interpreter.process_page(page)

                # Rewind to the stored position and read to the end, which
                # yields exactly this page's text.
                retstr.seek(read_position, 0)
                pages_text.append(retstr.read())
    finally:
        device.close()
        retstr.close()

    return pages_text
def convert(fname, pages=None):
    """Return the text of a PDF, optionally restricted to some pages.

    Args:
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers forwarded to
            ``PDFPage.get_pages``. ``None`` or an empty iterable results in
            an empty set being passed.

    Returns:
        str: The text pdfminer's ``TextConverter`` wrote for the pages.
    """
    pagenums = set(pages) if pages else set()

    manager = PDFResourceManager()
    output = io.StringIO()
    converter = TextConverter(manager,
                              output,
                              codec='utf-8',
                              laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    try:
        # `with` closes the file even when a page cannot be processed.
        with open(fname, 'rb') as infile:
            for page in PDFPage.get_pages(infile,
                                          pagenums,
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        return output.getvalue()
    finally:
        converter.close()
        output.close()
Exemple #9
0
def process_pdf(file):
    """Return the text-box content of each page of a PDF.

    Args:
        file: Object handed unchanged to ``PDFPage.get_pages``.

    Returns:
        list: One string per page, made of the text of every ``LTTextBox``
        found in that page's layout (other layout elements, such as images,
        are ignored).
    """
    manager = PDFResourceManager()
    aggregator = PDFPageAggregator(manager, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, aggregator)

    page_texts = []
    for page in PDFPage.get_pages(file):
        interpreter.process_page(page)
        # Join the text of the text boxes in layout order.
        page_texts.append("".join(
            element.get_text()
            for element in aggregator.get_result()
            if isinstance(element, LTTextBox)
        ))

    return page_texts
def text_extractor(file_name):
    '''
    Extract the text of a PDF earnings transcript.

    input: path of the transcript PDF
    output: text of all pages as a single string
    '''
    manager = PDFResourceManager()
    buffer = io.StringIO()
    device = TextConverter(manager, buffer, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, device)

    with open(file_name, 'rb') as pdf_file:
        page_iter = PDFPage.get_pages(pdf_file, caching=True,
                                      check_extractable=True)
        for page in page_iter:
            interpreter.process_page(page)

        extracted = buffer.getvalue()

    # release the converter and the in-memory buffer
    device.close()
    buffer.close()

    return extracted
 def pdf_to_txt(fp):
     """Split the text of a PDF into pieces at every full stop.

     Args:
         fp: Open binary file object of the PDF. It is closed by this
             function, also when extraction fails.

     Returns:
         list: The document text, with newlines replaced by spaces, split
         on ``'.'``.
     """
     rsrcmgr = PDFResourceManager()
     retstr = StringIO()
     device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                            laparams=LAParams())
     interpreter = PDFPageInterpreter(rsrcmgr, device)

     try:
         for page in PDFPage.get_pages(fp,
                                       set(),
                                       maxpages=0,
                                       password="",
                                       caching=True,
                                       check_extractable=True):
             interpreter.process_page(page)
         # Named `text`, not `str`, so the builtin is not shadowed.
         text = retstr.getvalue().replace('\n', ' ')
     finally:
         fp.close()
         device.close()
         retstr.close()

     return text.split('.')
Exemple #12
0
    def _pdf(self, path: str) -> str:
        """Extract the text of a PDF file page by page.
        Args:
            path: path to the PDF file
        Returns:
            the text of all pages, joined with a newline between pages.
        """
        manager = PDFResourceManager()
        buffer = StringIO()
        converter = TextConverter(manager, buffer, codec="utf-8", laparams=LAParams())
        page_texts = []

        with open(path, "rb") as pdf_file:
            interpreter = PDFPageInterpreter(manager, converter)
            page_iter = PDFPage.get_pages(
                pdf_file,
                set(),
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in page_iter:
                # Remember where this page's text starts in the shared buffer,
                # render the page, then read back from that point to the end.
                start = buffer.tell()
                interpreter.process_page(page)
                buffer.seek(start, 0)
                page_texts.append(buffer.read())

        converter.close()
        buffer.close()
        return "\n".join(page_texts)
    def __init__(self, pdf, codec='utf-8'):
        """
        Parse a PDF: collect its text lines and hand its images to helpers.

        Parameters:
        --------------
        pdf:        passed straight to PDFParser -- presumably an open binary
                    file object rather than a path (TODO confirm with callers)
        codec:      codec given to the TextConverter, default utf-8

        Attributes:
        ---------------
        records:        list of text lines extracted from the pdf
        text:           initialised to "" (not filled in by this constructor)
        didascalies:    initialised to an empty list
        nimages:        initialised to 0
        images:         initialised to an empty list

        Raises:
        ---------------
        PDFTextExtractionNotAllowed: if the document is not extractable
        """
        self.pdf = pdf
        self.text = ""
        self.records = []
        self.didascalies = []
        self.nimages = 0
        self.images = []

        document = PDFDocument(PDFParser(pdf))
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager shared by the text pass and the image pass.
        manager = PDFResourceManager()
        text_buffer = StringIO()
        layout_params = LAParams()
        self.codec = codec

        # First pass: write the text of every page into the buffer.
        text_device = TextConverter(manager, text_buffer, codec=codec,
                                    laparams=layout_params)
        text_interpreter = PDFPageInterpreter(manager, text_device)
        for page in PDFPage.create_pages(document):
            text_interpreter.process_page(page)

        # Second pass: walk the page layouts looking for images and figures.
        layout_device = PDFPageAggregator(manager, laparams=layout_params)
        layout_interpreter = PDFPageInterpreter(manager, layout_device)
        for page in PDFPage.create_pages(document):
            layout_interpreter.process_page(page)
            layout = layout_device.get_result()
            if layout is None:
                continue
            for item in layout:
                if isinstance(item, LTImage):
                    self.save_image(item)
                if isinstance(item, LTFigure):
                    self.find_images_in_thing(item)

        self.records.extend(text_buffer.getvalue().splitlines())
Exemple #14
0
def handlefile(myfile):
    """Extract the text of an uploaded file and build a form from it.

    The file is looked up under ``app/static/upload/`` using ``myfile.name``.
    PDF files are read with pdfminer3; PNG, JPG and WebP images are run
    through pytesseract. The extracted text is printed and passed to
    ``MakeForm``.

    Args:
        myfile: Object with a ``name`` attribute naming the uploaded file.

    Returns:
        Whatever ``MakeForm`` returns for the extracted text.

    Raises:
        ValueError: If the file type cannot be guessed or is not one of the
            supported types. (Previously these cases failed later with an
            AttributeError or NameError.)
    """
    filename = 'app/static/upload/' + myfile.name
    kind = filetype.guess(filename)
    if kind is None:
        print('Cannot guess file type!')
        raise ValueError('Cannot guess file type of %s' % filename)

    print('File extension: %s' % kind.extension)
    print('File MIME type: %s' % kind.mime)

    if kind.extension == "pdf":
        from pdfminer3.layout import LAParams
        from pdfminer3.pdfpage import PDFPage
        from pdfminer3.pdfinterp import PDFResourceManager
        from pdfminer3.pdfinterp import PDFPageInterpreter
        from pdfminer3.converter import TextConverter
        import io

        resource_manager = PDFResourceManager()
        fake_file_handle = io.StringIO()
        converter = TextConverter(resource_manager,
                                  fake_file_handle,
                                  codec='utf-8',
                                  laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resource_manager, converter)

        try:
            with open(filename, 'rb') as fh:
                for page in PDFPage.get_pages(fh,
                                              caching=True,
                                              check_extractable=True):
                    page_interpreter.process_page(page)
            # Read once after the loop so `text` is defined even for a PDF
            # without pages.
            text = fake_file_handle.getvalue()
        finally:
            converter.close()
            fake_file_handle.close()
        print(text)

    elif kind.extension in ("png", "jpg", "webp"):
        from PIL import Image
        from pytesseract import image_to_string

        with Image.open(filename) as image:
            text = image_to_string(image)
        print(text)

    else:
        raise ValueError('Unsupported file type: %s' % kind.extension)

    return MakeForm(text)
Exemple #15
0
def _extract_pdf_scores(stream):
    """Read the score table from the first page of a result PDF.

    The layout parameters below seem to work with the ILIAS default PDF
    formatting as well as with UR custom styling (see
    pdf/tests/default_style.pdf and pdf/tests.ur_style.pdf).

    Args:
        stream: Object handed to ``PDFParser``.

    Returns:
        dict: For every complete group of six whitespace-separated columns
        after the header line, maps the third column to the fifth column.
    """
    layout_params = LAParams(line_overlap=0,
                             char_margin=20,
                             word_margin=0.1,
                             boxes_flow=0,
                             detect_vertical=False)

    manager = PDFResourceManager()
    aggregator = PDFPageAggregator(manager, laparams=layout_params)
    interpreter = PDFPageInterpreter(manager, aggregator)

    document = PDFDocument(PDFParser(stream))
    first_page = next(PDFPage.create_pages(document))
    interpreter.process_page(first_page)

    header_label = "Reihenfolge"  # FIXME localize

    text_boxes = []
    header_y = None  # y position of the result table header
    for element in aggregator.get_result():
        if not isinstance(element, LTTextBoxHorizontal):
            continue
        text_boxes.append(element)
        if header_label in element.get_text().strip():
            header_y = element.y0

    header_row = [box for box in text_boxes if box.y0 == header_y]

    # With suitable LAParams the first box on the header line should hold
    # the text of the whole results table.
    table_text = header_row[0].get_text().replace('\t', '')
    table_text = table_text[table_text.find(header_label):]

    # Question titles might lack spaces; that is fine because names and
    # scores are only compared through Result.normalize_question_title().
    scores = {}
    pending = []
    for line in table_text.split("\n")[1:]:
        pending.extend(re.split(r'\s+', line))
        if len(pending) >= 6:
            scores[pending[2]] = pending[4]
            pending = pending[6:]

    return scores
Exemple #16
0
 def __init__(self, ofile):
     """Set up the pdfminer layout pipeline and clear the parsing state.

     Args:
         ofile: Stored unchanged as ``self.ofile``.
     """
     manager = PDFResourceManager()
     self.device = PDFPageAggregator(manager, laparams=LAParams())
     self.interpreter = PDFPageInterpreter(manager, self.device)
     # State flags, all starting in their cleared state.
     self.last_font = None
     self.in_rule = False
     self.font_print_pending = False
     self.header_footer_skipping = False
     self.ofile = ofile
Exemple #17
0
def coverpageinfo(earnings_call_file):
    """
    Take in a pdf file and return a dictionary describing its cover page.

    Only the first page of the document is read. The rest of the line that
    follows " FactSet CallStreet, LLC" is split around the first
    parenthesised group, which is taken as the ticker symbol.

    :param earnings_call_file: File - representation of the earnings call file
    :return: dictionary with the keys 'symbol', 'quarter_year',
             'company_name' and 'published_date'
    :raises ValueError: if the pdf has no pages or the cover page does not
             contain the expected text (previously these cases failed with
             an UnboundLocalError or AttributeError)
    """
    # Only the cover page is needed, so take just the first page.
    pages = PDFPage.get_pages(earnings_call_file, set(), maxpages=0,
                              password="", caching=True,
                              check_extractable=True)
    cover_page = next(pages, None)
    if cover_page is None:
        raise ValueError("PDF has no pages; cannot read the cover page")

    resource_manager = PDFResourceManager()
    return_string = StringIO()
    device = TextConverter(resource_manager, return_string,
                           codec='utf-8', laparams=LAParams())
    try:
        PDFPageInterpreter(resource_manager, device).process_page(cover_page)
        text = return_string.getvalue()
    finally:
        device.close()
        return_string.close()

    # Keep what follows the marker on its line ('.' does not match newlines).
    marker_match = re.search(r"(?<= FactSet CallStreet, LLC).*", text)
    if marker_match is None:
        raise ValueError("cover page lacks the 'FactSet CallStreet, LLC' marker")
    cleansed_text = marker_match.group(0).strip()

    # First parenthesised group, parentheses included, e.g. "(ABC)".
    ticker_match = re.search(r"\((.*?)\)", cleansed_text)
    if ticker_match is None:
        raise ValueError("cover page lacks a parenthesised ticker symbol")
    ticker = ticker_match.group(0).strip()

    # Text before the ticker holds date and company name, text after it
    # starts with the quarter and year. The fixed slice widths assume the
    # ingested data is structured.
    splits = [part.strip() for part in cleansed_text.split(ticker)]
    before_ticker, after_ticker = splits[0], splits[1]

    return {
        'symbol': ticker[1:-1],  # drop the surrounding parentheses
        'quarter_year': after_ticker[:7].replace(" ", "_"),
        'company_name': before_ticker[12:],
        'published_date': before_ticker[:11],
    }
Exemple #18
0
def convert_pdf_to_txt(filepath):
    """Return the text of the PDF stored at ``filepath``.

    Args:
        filepath: Path of the PDF file to read.

    Returns:
        str: The text pdfminer's ``TextConverter`` wrote for the pages.
    """
    manager = PDFResourceManager()
    buffer = StringIO()
    converter = TextConverter(manager, buffer, codec='utf-8',
                              laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    with open(filepath, 'rb') as pdf_file:
        page_iter = PDFPage.get_pages(fp=pdf_file, pagenos=set(), maxpages=0,
                                      password='', caching=True,
                                      check_extractable=True)
        for page in page_iter:
            interpreter.process_page(page)

    extracted = buffer.getvalue()
    converter.close()
    buffer.close()
    return extracted
def convert(pdffile):
    """Extract the text of a PDF and write it, UTF-8 encoded, to ``log_file``.

    NOTE(review): ``password`` and ``log_file`` are not defined inside this
    function; presumably they are module-level names -- confirm.

    Args:
        pdffile: Path of the PDF file to read.

    Raises:
        PDFTextExtractionNotAllowed: If the document is not extractable.
    """
    text_parts = []

    # `with` closes the PDF also when extraction is refused or parsing fails.
    with open(pdffile, "rb") as fp:
        # The parser reads the file; the document object holds its structure.
        document = PDFDocument(PDFParser(fp), password)

        # Check if document is extractable, if not abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # The resource manager stores shared resources such as fonts or
        # images; the aggregator device turns each page into LT layout
        # objects; the interpreter drives both.
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Of the LT objects in the layout only text boxes and text
            # lines are of interest.
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    text_parts.append(lt_obj.get_text())

    extracted_text = "".join(text_parts)

    with open(log_file, "wb") as my_log:
        my_log.write(extracted_text.encode("utf-8"))
    print("Done !!")
Exemple #20
0
def main(args):
    """Parse a PDF file and report some pdfminer-specific statistics.

    Reports the number of pages and how often each layout object type
    occurs across all pages.

    Args:
        args: Sequence that must contain exactly one item, the PDF filename.

    Returns:
        1 when the arguments are wrong; otherwise None.

    Raises:
        PDFTextExtractionNotAllowed: If the document is not extractable.
    """
    msg(SCRIPT, args)

    if len(args) != 1:
        msg('Parse a PDF file and print some pdfminer-specific stats')
        msg('Usage:', SCRIPT, '<PDF-filename>')
        return 1

    infilename, = args

    lt_types = collections.Counter()
    # Start at zero so the report below also works for a PDF without pages.
    page_count = 0

    with open(infilename, 'rb') as pdf_file:

        # Create a PDF parser object associated with the file object.
        parser = PDFParser(pdf_file)

        # Create a PDF document object that stores the document structure.
        # Supply the password for initialization.
        password = ''
        document = PDFDocument(parser, password)
        # Check if the document allows text extraction.
        if not document.is_extractable:
            # Was `filename`, an undefined name that raised NameError here.
            raise PDFTextExtractionNotAllowed(infilename)

        # Make a page iterator
        pages = PDFPage.create_pages(document)

        # Set up for some analysis
        rsrcmgr = PDFResourceManager()
        laparams = LAParams(
            detect_vertical=True,
            all_texts=True,
            )
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Look at all (nested) objects on each page
        for page_count, page in enumerate(pages, 1):
            # The device holds the layout of the page processed last.
            interpreter.process_page(page)
            layout = device.get_result()

            lt_types.update(type(item).__name__ for item in flat_iter(layout))

    msg('page_count', page_count)
    msg('lt_types:', ' '.join('{}:{}'.format(*tc) for tc in lt_types.items()))
Exemple #21
0
def get_pdf_content(file_path):
    """Return the text of the PDF stored at ``file_path``.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        str: The text pdfminer's ``TextConverter`` wrote for the pages.
    """
    manager = PDFResourceManager()
    buffer = io.StringIO()
    device = TextConverter(manager, buffer, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, device)

    with open(file_path, 'rb') as pdf_file:
        page_iter = PDFPage.get_pages(pdf_file,
                                      caching=True,
                                      check_extractable=True)
        for page in page_iter:
            interpreter.process_page(page)
        extracted = buffer.getvalue()

    # release the converter and the in-memory buffer
    device.close()
    buffer.close()
    return extracted
Exemple #22
0
def get_text_from_pdf(in_file):
    """Return the text of a PDF with all newlines and spaces removed.

    Args:
        in_file: Path of the PDF file to read.

    Returns:
        str: The extracted text without ``'\\n'`` and ``' '`` characters.
    """
    manager = PDFResourceManager()
    buffer = io.StringIO()
    device = TextConverter(manager, buffer, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, device)

    with open(in_file, 'rb') as pdf_file:
        page_iter = PDFPage.get_pages(pdf_file,
                                      caching=True,
                                      check_extractable=True)
        for page in page_iter:
            interpreter.process_page(page)
        raw_text = buffer.getvalue()

    device.close()
    buffer.close()

    without_newlines = raw_text.replace('\n', '')
    return without_newlines.replace(" ", "")
def convert_pdf_to_txt(path_to_file):
    """Extract the text of a PDF and also store a ``.txt`` copy.

    Two extractions happen:

    1. The file at ``os.getcwd() + path_to_file`` is read and its text is
       returned.
    2. ``convert`` is called on ``<BASE_DIR>/media/<name>.pdf`` and its
       result is written to ``<BASE_DIR>/media/textfiles/<name>.txt``,
       where ``<name>`` is the base name of ``path_to_file`` without its
       extension and ``BASE_DIR`` is the parent of this module's directory.

    Args:
        path_to_file: Path appended directly to the current working
            directory (presumably it starts with a path separator -- TODO
            confirm against callers).

    Returns:
        str: The text extracted in step 1.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    # Plain concatenation is kept on purpose: os.path.join would discard
    # the working directory if path_to_file were absolute.
    path_ = os.getcwd() + path_to_file
    print(path_)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    try:
        # `with` closes the PDF even when a page fails to parse.
        with open(path_, 'rb') as fp:
            for page in PDFPage.get_pages(fp,
                                          set(),
                                          maxpages=0,
                                          password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()

    BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    mediadir = os.path.join(BASE_DIR, "media")
    txtdir = os.path.join(mediadir, "textfiles")

    fileNAME = os.path.splitext(os.path.basename(path_to_file))[0]
    filetxtpath = os.path.join(txtdir, fileNAME + '.txt')
    filePDF = os.path.join(mediadir, fileNAME + '.pdf')

    print(filePDF)
    convertedPDF = convert(filePDF, pages=None)
    with open(filetxtpath, 'w+', encoding="utf-8") as fileConverted:
        fileConverted.write(convertedPDF)
    return text
Exemple #24
0
def get_cv_email(cv_path):
    """Return the first e-mail address found in a PDF.

    Args:
        cv_path: Path of the PDF file to read.

    Returns:
        The first substring of the extracted text that matches the e-mail
        pattern, or ``None`` when the text contains no match (previously
        this case raised AttributeError).
    """
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    try:
        # `with` closes the file even when a page fails to parse.
        with open(cv_path, 'rb') as infile:
            for page in PDFPage.get_pages(infile, set()):
                interpreter.process_page(page)
        text = output.getvalue()
    finally:
        converter.close()
        output.close()

    match = re.search(r'[\w\.-]+@[a-z0-9\.-]+', text)
    if match is None:
        return None
    return match.group(0)
Exemple #25
0
def pdfparser(filename):
    """Write the text lines of a PDF to a ``.txt`` file next to it.

    The output file has the same path as ``filename`` with the extension
    replaced by ``.txt``. Empty lines are dropped and every line is
    stripped. When the first line of a page equals the first line
    remembered from an earlier page, it is treated as a repeated header and
    left out.

    Args:
        filename: Path of the PDF file to read.
    """
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # One list of cleaned lines per page that has any text.
    paginas = []
    with open(filename, 'rb') as fp:
        for page in PDFPage.get_pages(fp, check_extractable=False):
            interpreter.process_page(page)
            pagina = []
            for element in device.get_result():
                if not isinstance(element, (LTTextBox, LTText)):
                    continue
                texto = element.get_text()
                if not texto:
                    continue
                # A text of fewer than two characters is glued onto the
                # previous line; the `pagina` check avoids an IndexError
                # when there is no previous line yet.
                if len(texto) < 2 and pagina:
                    pagina[-1] += texto
                else:
                    pagina += texto.split("\n")

            # Drop empty lines.
            pagina_limpa = [linha.strip() for linha in pagina if linha.strip()]
            if pagina_limpa:
                paginas.append(pagina_limpa)

    text_filename = os.path.splitext(filename)[0] + '.txt'

    # Remove repeated headers while writing.
    header_candidato = ''
    with open(text_filename, 'w') as textfile:
        for pagina in paginas:
            if pagina[0].strip() != header_candidato:
                header_candidato = pagina[0].strip()
            else:
                pagina.pop(0)

            for linha in pagina:
                textfile.write("%s\n" % linha)
Exemple #26
0
def check():
    """Check every report PDF against its student worksheet.

    For each ``(key, item)`` pair in ``rpt_filename`` the PDF
    ``schrpt_import_folder + item`` is opened and its page count is compared
    with ``wb[key].max_row - 1``. On a mismatch the process exits with
    status 3. Otherwise each page is written to its own PDF file, converted
    to text with ``convert_pdf_to_txt`` and searched for the student's name;
    the outcome is printed.

    NOTE(review): relies on names defined outside this function
    (``rpt_filename``, ``wb``, ``schrpt_import_folder``,
    ``schrpt_output2_folder``, ``schrptexam``, ``stufile_ind``, ``sep``) --
    presumably module-level configuration; confirm.
    """
    for key, item in rpt_filename.items():
        studb = wb[key]
        schrpt_src_file = schrpt_import_folder + item
        print('Start checking {}'.format(schrpt_src_file))
        SchRptOpen = PdfFileReader(schrpt_src_file)

        # presumably row 1 of the sheet is a header, hence the "- 1" -- TODO confirm
        num_student = studb.max_row - 1
        num_pagesPdf = SchRptOpen.getNumPages()
        print('Checking (no. of student, no. of pages):', num_student, num_pagesPdf)

        if num_student != num_pagesPdf:
            print('PDF pages do not match the number of students.')
            exit(3)

        else:
            # Sheet rows 2..max_row are paired with PDF pages 0..n-1 below.
            StuRange = range(2, studb.max_row + 1)
            # NOTE(review): the four pdfminer objects created here are not
            # used anywhere else in this function.
            resource_manager = PDFResourceManager()
            fake_file_handle = io.StringIO()
            converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())
            page_interpreter = PDFPageInterpreter(resource_manager, converter)

            for row in StuRange:
                pagePdf = SchRptOpen.getPage(row - 2)
                stuname1 = studb.cell(row=row, column=stufile_ind['name1']).value
                stuname2 = studb.cell(row=row, column=stufile_ind['name2']).value
                classcode_no = studb.cell(row=row, column=stufile_ind['key']).value
                password = studb.cell(row=row, column=stufile_ind['searchid']).value
                # Write this student's page to a PDF file of its own.
                pdf_writer = PdfFileWriter()
                pdf_writer.addPage(pagePdf)
                #report_temp = schrpt_temp_folder + sep + 'temp.pdf'
                report_temp = schrpt_output2_folder + sep + key + sep \
                                    + '{}_{}_{}.pdf'.format(schrptexam, classcode_no, stuname2)

                with io.open(report_temp, 'wb') as out:
                    pdf_writer.write(out)

                # NOTE(review): called with two arguments (path, password);
                # make sure the convert_pdf_to_txt in scope accepts both.
                txt = convert_pdf_to_txt(report_temp, password)
                find_name = txt.find(stuname1)

                if find_name == -1:
                    # Name not found on the page: print the name and -1.
                    print(stuname1, find_name)

                else:
                    # NOTE(review): student_name_in_pdf is assigned but not used.
                    student_name_in_pdf = txt[find_name:(len(stuname1) + find_name)]
                    print('Checked: #{} {} - {}'.format(row - 1, stuname1, find_name))
def pdf2txt_page(file_path):
    """Extract the text of a PDF one page at a time and split each page.

    Every page yielded by ``PDFPage.get_pages`` is rendered through its own
    ``PDFResourceManager``/``TextConverter`` pair so that the text of each
    page is captured separately. Each page's text is then passed through
    ``split_text``.

    Args:
        file_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        A list with one entry per page, each entry being whatever
        ``split_text`` returns for that page's text.
    """
    page_texts = []
    with open(file_path, 'rb') as pdf_file:
        pages = PDFPage.get_pages(pdf_file, caching=True, check_extractable=True)
        for page in pages:
            # A new manager and buffer per page keeps the pages' text apart.
            manager = PDFResourceManager()
            buffer = io.StringIO()
            device = TextConverter(manager, buffer, laparams=LAParams())
            PDFPageInterpreter(manager, device).process_page(page)
            page_texts.append(buffer.getvalue())
            device.close()
            buffer.close()

    return [split_text(page_text) for page_text in page_texts]
Exemple #28
0
def append_pdf_pages(earnings_call_file, exclude_page):
    """
    Concatenate the cleaned text of the pages of an earnings-call PDF.

    Pages are read with ``PDFPage.get_pages``. Every page whose zero-based
    position is not listed in ``exclude_page`` is rendered to text with its
    own ``PDFResourceManager``/``TextConverter``. In each page's text the
    U+F0B7 character is replaced by ".", form feeds and backslashes are
    dropped, and only what follows the first "CallStreet, LLC" up to the end
    of that line is kept (header removal). The kept fragments are joined
    with single spaces and everything from the last
    "<whitespace>Disclaimer The information" onwards is cut off.

    :param earnings_call_file: File - representation of the earnings call file
        (handed to ``PDFPage.get_pages`` as-is)
    :param exclude_page: list of numbers indicating pages you want to exclude
        for appending (zero-based positions in page order)
    :return: string with the joined page text, header and disclaimer section
        removed; an empty string when no page was processed (all pages
        excluded or no pages in the document)
    :raises AttributeError: when a processed page does not contain
        "CallStreet, LLC", or the joined text does not contain
        " Disclaimer The information" (``re.search`` returns ``None``)
    """
    page_texts = []
    pages = PDFPage.get_pages(earnings_call_file, set(), maxpages=0, password="",
                              caching=True,
                              check_extractable=True)
    for page_index, page in enumerate(pages):
        # skip pages we do not want to append (e.g. the cover page)
        if page_index in exclude_page:
            continue

        # A fresh resource manager, buffer and converter for every page, so
        # each page's text is extracted on its own.
        resource_manager = PDFResourceManager()
        return_string = StringIO()
        device = TextConverter(
            resource_manager, return_string, codec='utf-8', laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(resource_manager, device)
            interpreter.process_page(page)
            text = return_string.getvalue()
        finally:
            # Close this page's handles even if processing failed; previously
            # only the last page's handles were closed, after the loop.
            device.close()
            return_string.close()

        # U+F0B7 is a private-use code point (commonly a symbol-font bullet).
        text = re.sub(r'\uf0b7', '.', text)
        text = re.sub(r'\x0c', '', text)
        text = text.replace("\\", "")
        # remove header and footer: keep the rest of the line after the marker
        text = re.search("(?<=CallStreet, LLC).*", text).group(0).strip()
        page_texts.append(text)

    if not page_texts:
        # Nothing was processed, so there is nothing to clean up or trim.
        return ""

    # The leading space is kept on purpose: it lets the "\s" in the
    # disclaimer pattern match when the disclaimer opens the text.
    pdf_string = ' ' + ' '.join(page_texts)
    # removing disclaimer section
    pdf_string = re.search(
        r".*(?=\sDisclaimer The information)", pdf_string).group(0)
    return pdf_string.strip()
def convert_pdf_to_txt(path, pages=None):
    """Return the text of a PDF file as rendered by ``TextConverter``.

    Args:
        path: Path of the PDF file; it is opened in binary mode.
        pages: Optional iterable of page numbers, converted to a set and
            passed to ``PDFPage.get_pages``. When falsy, an empty set is
            passed instead (presumably meaning "all pages" -- confirm
            against ``PDFPage.get_pages``).

    Returns:
        The text accumulated in the converter's output buffer.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    try:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # "with" guarantees the file is closed even if a page fails.
            with open(path, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer after the converter is closed and before the
        # buffer itself is closed, as the original did.
        return output.getvalue()
    finally:
        output.close()
Exemple #30
0
def PDF_alimentos(path,
                  keywords=(
                      'arroz', 'aceite', 'azúcar', 'azucar', 'café', 'cafe',
                      'harina', 'atún', 'atun', 'panela', 'pasta', 'fríjol',
                      'frijol', 'lenteja', 'chocolate', 'leche',
                  )):
    """Report whether the text of a PDF mentions any of the given keywords.

    The whole document is converted to text, lower-cased and searched with a
    single regular expression built by joining ``keywords`` with ``|``.
    Keywords are therefore treated as regular-expression alternatives (they
    are not escaped), and because the text is lower-cased first, keywords
    containing upper-case letters will not match. Note that an empty
    ``keywords`` produces an empty pattern, which matches any text.

    Args:
        path: Path of the PDF file to inspect; opened in binary mode.
        keywords: Iterable of strings to look for. Defaults to a collection
            of Spanish food names.

    Returns:
        ``True`` if any keyword is found, ``False`` if none is found, and
        ``np.nan`` if anything goes wrong while building the pattern or
        reading the PDF (best-effort: errors are not propagated).
    """
    # NOTE (original author): PDFminer3 works better

    try:
        pattern = "|".join(keywords)

        resource_manager = PDFResourceManager()
        with io.StringIO() as fake_file_handle:
            converter = TextConverter(resource_manager,
                                      fake_file_handle,
                                      laparams=LAParams())
            try:
                page_interpreter = PDFPageInterpreter(resource_manager,
                                                      converter)
                with open(path, 'rb') as fh:
                    for page in PDFPage.get_pages(fh,
                                                  caching=True,
                                                  check_extractable=True):
                        page_interpreter.process_page(page)
                text = fake_file_handle.getvalue()
            finally:
                # Release the converter even when extraction fails.
                converter.close()

        return re.search(pattern, text.lower()) is not None

    except Exception:
        # Deliberate best-effort: any failure is reported as NaN. Unlike a
        # bare "except:", this lets KeyboardInterrupt/SystemExit through.
        return np.nan