コード例 #1
0
def get_pdf_text(pdf_path):
    """Extract the text of a PDF with PDFBox and strip non-ASCII characters.

    Args:
        pdf_path: Path of the PDF handed to ``PDFBox.extract_text``
            (called with ``sort=True``).

    Returns:
        The extracted text with every run of characters outside the
        7-bit ASCII range removed.
    """
    extractor = pdfbox.PDFBox()
    raw_text = extractor.extract_text(pdf_path, sort=True)
    # Non-ASCII runs are deleted outright, not transliterated.
    ascii_only = re.sub(r'[^\x00-\x7F]+', '', raw_text)
    return ascii_only
コード例 #2
0
    def test_extract_images(self):
        """Check that extracting images from ./test3.pdf creates 'test-1.png'.

        The images are written into a temporary directory using the
        prefix ``<tmpdir>/test``.
        """
        p = pdfbox.PDFBox()

        with TemporaryDirectory() as output_dir:
            output_prefix = (Path(output_dir) / 'test').resolve()
            p.extract_images('./test3.pdf', prefix=output_prefix)
            # assertIn prints the actual directory listing on failure,
            # whereas assertTrue(x in y) only reports "False is not true".
            self.assertIn('test-1.png', os.listdir(output_dir))
コード例 #3
0
 def test_extract_text(self):
     """Check the text extracted from ./test.pdf, including the line ending.

     The expected line terminator depends on ``platform``; on any platform
     other than the ones listed below nothing is asserted.
     """
     extracted = pdfbox.PDFBox().extract_text('./test.pdf')
     if platform in ("linux", "linux2", "darwin"):
         self.assertEqual(extracted, 'this is a test PDF\n')
     elif platform == "win32":
         self.assertEqual(extracted, 'this is a test PDF\r\n')
コード例 #4
0
    def test_pdf_to_images(self):
        """Check that rendering ./test2.pdf creates 'test1.jpg' and 'test2.jpg'.

        The images are written into a temporary directory using the
        output prefix ``<tmpdir>/test``.
        """
        p = pdfbox.PDFBox()

        with TemporaryDirectory() as output_dir:
            output_prefix = (Path(output_dir) / 'test').resolve()
            p.pdf_to_images('./test2.pdf', outputPrefix=output_prefix)
            # List the directory once and assert each file separately so a
            # failure names the missing file and shows what was produced.
            produced = os.listdir(output_dir)
            self.assertIn('test1.jpg', produced)
            self.assertIn('test2.jpg', produced)
コード例 #5
0
def test_python_pdfbox_extract_text_generates_correct_output():
    """extract_text must write exactly one file matching the reference text.

    The input PDF and the expected text both live in
    ``CONFIG.test_files_dir``; the output goes to a temporary directory.
    """
    source_pdf = CONFIG.test_files_dir / 'test.pdf'
    reference_txt = CONFIG.test_files_dir / 'test.txt'
    extractor = pdfbox.PDFBox()

    with TemporaryDirectory() as output_dir:
        workdir = Path(output_dir)
        produced_txt = (workdir / f'{source_pdf.stem}.txt').resolve()
        extractor.extract_text(source_pdf, output_path=produced_txt)

        assert produced_txt.exists()
        # Only the requested output file may appear in the directory.
        assert len(list(workdir.iterdir())) == 1
        assert filecmp.cmp(produced_txt, reference_txt) is True
コード例 #6
0
def extract_pdf(file_path, upload_folder, file_name, *args):
    """Extract a PDF's text, translate it, and save the translation to disk.

    The translation is written to ``<upload_folder>/<stem>_translated.txt``,
    where ``<stem>`` is ``file_name`` without its final extension.

    Args:
        file_path: Path of the PDF to read.
        upload_folder: Directory that receives the translated text file.
        file_name: Original file name; only its stem is used for the output.
        *args: Accepted and ignored.

    Returns:
        None. The result is the file written to ``upload_folder``.
    """
    print('extracting text from: {0}'.format(file_path))
    p = pdfbox.PDFBox()
    text = p.extract_text(file_path)
    # Line breaks travel through translate() as <br/> markers and are
    # turned back into newlines afterwards.
    formatted_text = "<br/>".join(text.splitlines())
    # Only the first 4000 characters are sent for translation.
    translation = translate(formatted_text[0:4000])

    formatted_translation = translation.replace('<br/>', '\n')

    # splitext strips only the last extension, so "my.report.pdf" keeps
    # "my.report" instead of being cut down to "my".
    f_name = os.path.splitext(file_name)[0]
    uploaded_file_path = os.path.join(upload_folder,
                                      f_name + '_translated.txt')

    # Translated text is routinely non-ASCII; an explicit encoding avoids
    # UnicodeEncodeError on platforms whose default encoding is not UTF-8.
    with open(uploaded_file_path, "w", encoding="utf-8") as f:
        f.write(formatted_translation)
コード例 #7
0
ファイル: etl.py プロジェクト: jfan1998/AutoLibrary
def convert_txt(indir, outdir, pdfname):
    """Convert a PDF to a single-paragraph text file using PDFBox.

    PDFBox first writes the raw text to a temporary ``.txt`` file next to
    the input. That file is then re-flowed into
    ``<outdir>/<pdfname>_converted.txt``: a line ending in ``-`` is joined
    directly to the next one (the hyphen is dropped), every other line is
    joined with a single space. The temporary file is deleted afterwards.

    Args:
        indir: Directory containing the PDF.
        outdir: Directory for the converted text; created if missing.
        pdfname: File name of the PDF inside ``indir``. Single quotes are
            removed from it before use.

    Returns:
        None.
    """
    print("\n")
    print(
        ">>>>>>>>>>>>>>>>>>>>>>>> Installing PDFBox... <<<<<<<<<<<<<<<<<<<<<<<<<<<<"
    )
    # NOTE(review): installing a package on every call is slow and mutates
    # the environment; kept only so existing callers behave as before.
    os.system('pip install python-pdfbox')

    print("\n")
    print(
        ">>>>>>>>>>>>>>>>>>>>>>>> Converting File... <<<<<<<<<<<<<<<<<<<<<<<<<<<<"
    )
    print("  => Inputing a document...")
    # remove single quotes from file name
    pdfname = pdfname.replace("'", "")
    # extract text using PDFBox
    input_fp = os.path.join(indir, pdfname)
    temp_txt = input_fp.replace('.pdf', '.txt')
    temp_txt = temp_txt.replace(' ', '_')
    p = pdfbox.PDFBox()
    p.extract_text(input_fp, temp_txt)

    # Create the output directory without going through a shell, so paths
    # containing spaces or shell metacharacters work on every platform.
    # An empty outdir means "current directory" and needs no creation.
    if outdir:
        os.makedirs(outdir, exist_ok=True)

    print("  => Converting pdf to txt...")
    textname = pdfname.replace('.pdf', '_converted.txt')
    textname = textname.replace(' ', '_')
    output_fp = os.path.join(outdir, textname)
    try:
        # The input is decoded as UTF-8 below, so write UTF-8 as well
        # instead of relying on the platform default encoding.
        with open(output_fp, 'w', encoding='utf-8') as output_txt:
            with open(temp_txt, 'rb') as f:
                # concatenate split lines
                for raw_line in f:
                    # Strip the terminator explicitly: this keeps the last
                    # character of a final line that has no newline and
                    # lets CRLF-terminated lines be de-hyphenated too.
                    line = raw_line.decode().rstrip('\r\n')
                    if line.endswith('-'):
                        output_txt.write(line[:-1])
                    else:
                        output_txt.write(line + ' ')
    finally:
        # Remove the intermediate file even when the conversion fails.
        if os.path.exists(temp_txt):
            os.remove(temp_txt)

    print(" => Done! File is saved as '" + output_fp + "'")
    print("\n")
コード例 #8
0
    def extract(self, filename, method="pdfbox"):
        """Extract the raw text of a PDF file using PDFBox or Textract.

        With the default ``"pdfbox"`` method, Textract (pdfminer backend)
        is used as a fallback when PDFBox yields no text.

        Args:
            filename: Path of the PDF to read.
            method: ``"pdfbox"`` (default) or ``"textract"``.

        Returns:
            The extracted text as a string.

        Raises:
            ValueError: If ``method`` is not one of the supported values.
        """
        # Reject unknown methods up front; previously this fell through to
        # ``return text`` with ``text`` unbound (UnboundLocalError).
        if method not in ("pdfbox", "textract"):
            raise ValueError(
                "unsupported extraction method: {!r}".format(method))

        text = ""
        if method == "pdfbox":
            p = pdfbox.PDFBox()
            text = p.extract_text(filename)
            # Empty (or missing) output: fall back to Textract.
            if not text:
                method = "textract"

        if method == "textract":
            byte_text = textract.process(filename,
                                         encoding="utf-8",
                                         method="pdfminer")
            text = byte_text.decode("utf-8")

        return text
コード例 #9
0
class PDFBoxExtractor(ITextExtractor):
    """Text extractor that writes one text file per PDF page via PDFBox.

    The class attributes below are forwarded unchanged to
    ``PDFBox.extract_text`` for every page.
    """

    # PDFBox wrapper created once at class-definition time and shared by
    # all instances.
    p: pdfbox.PDFBox = pdfbox.PDFBox()
    # Options passed through as keyword arguments to ``extract_text``.
    encoding: str = 'utf-8'
    html: bool = False
    sort: bool = False
    ignore_beads: bool = False
    console: bool = False

    def pdf_to_txt(
        self,
        filename: Union[str, os.PathLike],
        output_folder: Union[str, os.PathLike],
        first_page: int = 1,
        last_page: Optional[int] = None,
    ) -> None:
        """Extract a page range of ``filename`` into per-page text files.

        Each page is written to ``<output_folder>/<stem>_<page:04>.txt``.

        Args:
            filename: PDF to read.
            output_folder: Directory receiving the per-page text files.
            first_page: First page to extract (1-based).
            last_page: Last page to extract, inclusive. ``None`` or a value
                beyond the document's page count means "up to the last page".
        """
        basename = Path(filename).stem
        # TODO Remove num_pages
        num_pages = pdf2image.pdfinfo_from_path(filename)['Pages']
        # Clamp the requested range to the pages the document really has.
        if last_page is None or last_page > num_pages:
            last_page = int(num_pages)

        # TODO
        # for page in p.get_pages('filename'): -> sorted list of strings (or list of strings + titles, or markup)
        for page in range(first_page, last_page + 1):
            output_filename = Path(output_folder) / f'{basename}_{page:04}.txt'
            # start_page == end_page: one call per page, one file per page.
            self.p.extract_text(
                filename,
                output_path=output_filename,
                encoding=self.encoding,
                html=self.html,
                sort=self.sort,
                ignore_beads=self.ignore_beads,
                start_page=page,
                end_page=page,
                console=self.console,
            )
        # Note: the message reports the document's total page count, not
        # the number of pages extracted by this call.
        logger.success(f'Extracted: {basename}, pages: {num_pages}')

    def batch_extract(
        self,
        files: List[Path],
        output_folder: Union[str, os.PathLike],
        *,
        first_page: int = 1,
        last_page: Optional[int] = None,
    ) -> None:
        """Run ``pdf_to_txt`` over ``files``, skipping already-completed ones.

        When ``<output_folder>/extract.log`` exists, ``files`` is first
        filtered through ``self._skip_completed``. Returns immediately if
        nothing is left to process.

        Args:
            files: PDFs to process.
            output_folder: Directory receiving the text files and the log.
            first_page: First page to extract from every file (1-based).
            last_page: Last page to extract from every file, inclusive;
                ``None`` means up to each document's last page.
        """
        logfile = Path(output_folder) / 'extract.log'
        if logfile.exists():
            files = self._skip_completed(files, logfile)
        total_files = len(files)
        if total_files == 0:
            return

        # presumably attaches a log sink for ``logfile`` — confirm against
        # the helper's definition, which is not part of this class body.
        file_logger = self._add_logger(logfile)
        try:
            for i, filename in enumerate(files, start=1):
                print(f'Processing {filename.stem}\t{i:03}/{total_files}',
                      end='\r')
                self.pdf_to_txt(filename, output_folder, first_page, last_page)
        finally:
            # Always detach the file logger, even when a PDF fails, so a
            # failed batch does not leave it registered.
            self._remove_logger(file_logger)
コード例 #10
0
ファイル: main.py プロジェクト: jcnaud/snippet
def main():
    """Print a separator line followed by the text of ./simple1.pdf."""
    import pdfbox
    extracted = pdfbox.PDFBox().extract_text('./simple1.pdf')
    print('--------------------', extracted, sep='\n')
コード例 #11
0
ファイル: pdf.py プロジェクト: jomycs/ai
# -*- coding: utf-8 -*-
import pdfbox
from pathlib import Path


# Module-level PDFBox wrapper; extract_abstract() below calls p.extract_text().
p = pdfbox.PDFBox()
# Default value of the `end_tags` parameter of extract_abstract(): heading
# words the function searches for in lines that follow the start tag.
# NOTE(review): 'Introductoin' looks like a misspelling — confirm whether it is
# deliberate (e.g. to match a typo seen in real documents) before changing it.
end_tags = [
	'Introduction', 'introduction', 'Introductoin', 'INTRODUCTION',
	'Motivation', 'Background', 'motivation', 'background'
	]


def extract_abstract(filepath, start_tag='Abstract', end_tags=end_tags):
	"""Extract the abstract from a PDF-formatted scientific or technical article.

	The PDF text is dumped to 'pdf.txt' in the current working directory and
	then scanned line by line: scanning becomes active after a line that
	contains ``start_tag``, and each following non-blank line is checked
	against ``end_tags``.

	NOTE(review): the visible body never appends to ``res`` and never
	returns; this listing appears to end before the function does — confirm
	against the full file.
	"""
	# presumably the second positional argument is the output path — the file
	# 'pdf.txt' is read back just below. `text` is not used in the lines shown.
	text = p.extract_text(filepath, 'pdf.txt')
	res = []
	# Becomes True once a line containing start_tag has been seen.
	flag = False
	with open('pdf.txt', 'r', encoding='utf-8') as f:
		for l in f.readlines():
			if start_tag in l:
				# The start-tag line itself is not processed any further.
				flag = True
			elif flag:
				l = l.strip()
				# Skip blank lines.
				if l == '':
					continue
				for t in end_tags:
					# Match the tag as written or in lower case.
					if t in l or t.lower() in l:
						if res:
							# Trim trailing whitespace from the last collected item.
							res[-1] = res[-1].rstrip()
コード例 #12
0
 def setUpClass(cls):
     """Create one PDFBox instance and store it on the class as ``p``."""
     shared_box = pdfbox.PDFBox()
     cls.p = shared_box
コード例 #13
0
 def test_init_multiple(self):
     """Constructing an additional PDFBox instance must not raise."""
     # A second instance of the class is initialized here.
     second_box = pdfbox.PDFBox()
コード例 #14
0
ファイル: lattice.py プロジェクト: trifacta/camelot
 def _generate_image(self):
     """Render the PDF to images and record the first image's file name.

     Images are written with ``self.rootname`` as the output prefix;
     ``self.imagename`` is set to that prefix followed by ``1.jpg``.
     """
     renderer = pdfbox.PDFBox()
     renderer.pdf_to_images(self.filename, outputPrefix=self.rootname)
     self.imagename = ''.join((str(self.rootname), '1.jpg'))