from tkinter import Tk
from tkinter.filedialog import askopenfilename
from os.path import splitext

import pdftotext
from gtts import gTTS

Tk().withdraw()  # we don't want a full GUI, so keep the root window from appearing
filelocation = askopenfilename()  # open the file-selection dialog

with open(filelocation, "rb") as f:  # open the file in binary read (rb) mode and call it f
    pdf = pdftotext.PDF(f)  # store a text version of the pdf file f in the pdf variable

pdf = ''.join(pdf)  # join the pages' text together
final_file = gTTS(text=pdf, lang='en')  # build the speech object
outname = splitext(filelocation)[0] + '.mp3'
final_file.save(outname)  # save the mp3 to disk
def PDF(filename):
    with open(filename, "rb") as f:
        pdf = pdftotext.PDF(f)
    print("\n\n".join(pdf))
    # write the extracted text to a .txt file next to the source PDF (assumed output path)
    with open(filename + ".txt", "w") as out:
        out.write("\n\n".join(pdf))
def test_locked_with_both_passwords(self):
    with self.assertRaises(pdftotext.Error):
        pdftotext.PDF(get_file("both_passwords.pdf"))
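# Several of the unit-test snippets in this listing (test_locked_with_both_passwords,
# test_read, the password-unlock tests) rely on a get_file() helper that is not shown.
# A minimal sketch of what such a helper might look like; the fixture location
# (same directory as the test module) is an assumption, not part of the original snippets.
import os

def get_file(name):
    """Open a fixture PDF in binary mode, as pdftotext.PDF expects."""
    return open(os.path.join(os.path.dirname(__file__), name), "rb")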
def get_text(self) -> str:
    pdf_pages_text: Iterable[str] = pdftotext.PDF(self.file)
    # TODO: Make the newline replacement smarter to handle dashes etc.
    return "\n".join(page_text.replace("\n", " ") for page_text in pdf_pages_text)
def get_num_pages(self):
    with open(self.file_path, 'rb') as pdf_file:
        pdf = pdftotext.PDF(pdf_file)
        return len(pdf)
url = 'https://api.wimc.ctf.allesctf.net/1.0/admin/createReport'
headers = {'X-API-TOKEN': ''}

with open('stage2.html', 'r') as f:
    template = f.read()
    # r = requests.post(url, headers=headers, data={"html": f.read()})

token = ''
while len(token) < 30:
    payload = template.replace('__TOKEN__', token)
    r = requests.post(url, headers=headers, data={"html": payload})
    if len(r.text) < 100:
        print(r.text, file=sys.stderr)
        exit(1)

    result = open('result.pdf', 'wb')
    result.write(b''.join(r.iter_content()))
    result.close()

    result = open('result.pdf', 'rb')
    pdf = pdftotext.PDF(result)
    result.close()

    token = pdf[0].split('\n')[0]
    os.remove('result.pdf')
    print(f'Got token: {token}')
def extract_data(feed):
    # pdftotext expects the file object opened in binary mode
    with open(feed, "rb") as f:
        pdf = pdftotext.PDF(f)
    pdf_text = "\n\n".join(pdf)
    return pdf_text
def to_text(path):
    with open(path, "rb") as f:
        pdf = pdftotext.PDF(f)
    text = "".join(pdf)
    return text
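# A minimal usage sketch for the to_text() helper above; "sample.pdf" is a
# placeholder filename assumed for illustration, not a file referenced by the snippet.
if __name__ == "__main__":
    extracted = to_text("sample.pdf")
    print(extracted[:500])  # preview the first 500 characters of the extracted text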
def proceed_document(dokumen_id):
    import numpy
    from dlnn.Dlnn import Dlnn
    from dlnn.Dlnn import DLNN_DEFAULT_CONFIG

    dlnn = Dlnn(**DLNN_DEFAULT_CONFIG)

    # Todo: load the Dokumen by id (doc_id) [Dokumen.objects.filter(id=doc_id).first()]
    dokumen = Dokumen.objects.filter(id=dokumen_id).first()
    dokumen.state = "Process"
    dokumen.save()

    # Todo: load the pdf
    # spell = SpellChecker()
    with open(dokumen.filenya.path, "rb") as f:
        pdf = pdftotext.PDF(f)
    text = "".join(pdf)

    # Todo: normalisation - split the sentences into individual words
    text = text.lower()  # converting to lowercase
    cleanr = re.compile('<.*?>')
    sentence = re.sub(cleanr, ' ', text)  # removing HTML tags
    sentence = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    sentence = re.sub(r'[.|,|)|(|\|/]', r' ', sentence)  # removing punctuation
    data_pdf = "".join(sentence)
    token_data_pdf = nltk.word_tokenize(data_pdf, preserve_line=True)

    # Feature 1 - count Indonesian spelling mistakes
    url_dic_indo = settings.STATIC_ROOT + '/admin/db_text/kamus_indonesia.txt'
    kamus_indonesia = open(url_dic_indo, "r")
    katadasar = kamus_indonesia.read().split('\n')
    for i in range(len(katadasar)):
        katadasar[i] = katadasar[i].split("/")[0]
    salah_ketik_indo = 0
    for token in token_data_pdf:
        if token not in katadasar:
            salah_ketik_indo += 1
    f1 = salah_ketik_indo
    dokumen.fitur1 = f1
    dokumen.save()

    # Feature 2 - count English spelling mistakes
    url_dic_en = settings.STATIC_ROOT + '/admin/db_text/kamus_english.txt'
    kamus_inggris = open(url_dic_en, "r")
    katadasar_en = kamus_inggris.read().split('\n')
    for i in range(len(katadasar_en)):
        katadasar_en[i] = katadasar_en[i].split("/")[0]
    salah_ketik_english = 0
    for token in token_data_pdf:
        if token not in katadasar_en:
            salah_ketik_english += 1
    f2 = salah_ketik_english
    dokumen.fitur2 = f2
    dokumen.save()

    f3, f4 = calculate_feature_34(dokumen_id)
    dokumen.fitur3 = f3
    dokumen.fitur4 = f4
    dokumen.save()

    f5, f6 = calculate_feature_56(dokumen_id)
    dokumen.fitur5 = f5
    dokumen.fitur6 = f6
    dokumen.save()

    # Todo: feed the features f[1..6] into the network
    network = dlnn.get_model()
    result = network.predict(numpy.array([[f1, f2, f3, f4, f5, f6]]), batch_size=1)
    class_data = result.argmax(axis=1)[0]
    # print("Class Data {}".format(class_data))

    # Todo: store class_data as the predicted class (map it to the expected class; zero-based indexing)
    dokumen.kualitas = class_data
    dokumen.state = "Done"
    dokumen.save()
def convert_pdf2text(fname):
    with open(settings.BASE_DIR + '/leasingai/ai/temp/' + fname, "rb") as f:
        pdf = pdftotext.PDF(f)
    text = "\n\n".join(pdf)
    return text
def testing_apps(gap_data):
    f1 = [[]]
    cek = Pengujian.objects.all()
    for a in cek:
        a.delete()

    dataset = Data.objects.filter(is_dataset=True)
    x = 0
    for data in dataset:
        x += 1
        print("data ke" + str(x))

        # Todo: load the pdf
        with open(data.url_file.path, "rb") as f:
            pdf = pdftotext.PDF(f)
        text = "".join(pdf)

        # Todo: normalisation - split the sentences into individual words
        text = text.lower()  # converting to lowercase
        cleanr = re.compile('<.*?>')
        sentence = re.sub(cleanr, ' ', text)  # removing HTML tags
        sentence = re.sub(r'[?|!|\'|"|#]', r'', sentence)
        sentence = re.sub(r'[.|,|)|(|\|/]', r' ', sentence)  # removing punctuation
        data_pdf = "".join(sentence)
        token_data_pdf = nltk.word_tokenize(data_pdf, preserve_line=True)

        # Feature 1 - count Indonesian spelling mistakes
        url_dic_indo = settings.STATIC_ROOT + '/admin/db_text/kamus_indonesia.txt'
        kamus_indonesia = open(url_dic_indo, "r")
        katadasar = kamus_indonesia.read().split('\n')
        for i in range(len(katadasar)):
            katadasar[i] = katadasar[i].split("/")[0]
        salah_ketik_indo = 0
        for token in token_data_pdf:
            if token not in katadasar:
                salah_ketik_indo += 1

        # Feature 2 - count English spelling mistakes
        url_dic_en = settings.STATIC_ROOT + '/admin/db_text/kamus_english.txt'
        kamus_inggris = open(url_dic_en, "r")
        katadasar_en = kamus_inggris.read().split('\n')
        for i in range(len(katadasar_en)):
            katadasar_en[i] = katadasar_en[i].split("/")[0]
        salah_ketik_english = 0
        for token in token_data_pdf:
            if token not in katadasar_en:
                salah_ketik_english += 1

        akurasi_indo = int((len(token_data_pdf) - salah_ketik_indo) / len(token_data_pdf) * 100)
        akurasi_en = int((len(token_data_pdf) - salah_ketik_english) / len(token_data_pdf) * 100)
        new_hasil = Pengujian(perbandingan=str(x), fitur1=akurasi_indo, fitur2=akurasi_en)
        new_hasil.save()
def main():
    # Parse the args for the file location
    parser = argparse.ArgumentParser(
        description="Reads in a pdf or txt file, outputs an audio file of it being read")
    parser.add_argument('file', help="The file you would like to input")
    args = parser.parse_args()

    # Saving the file location
    file = args.file

    # TODO might need to make this more robust
    # Check the extension of the file and save the text
    if file[-3:] == 'pdf':
        try:
            with open(file, "rb") as file_in:
                text = pdftotext.PDF(file_in)
                print("Number of pages: ", len(text))
                # print("\n\n".join(pdf))
        except IOError as e:
            print("Could not open the file!")
            print(e)
            exit(1)
    elif file[-3:] == 'txt':
        # f = open(file, 'r')
        with open(file, 'r') as file_in:
            text = file_in.read()
    else:
        print("Input a txt or pdf file")
        exit(1)

    make_sentences(text)

    """
    # Delete sentences.txt if it already exists
    if os.path.exists("/WaveRNN/sentences.txt"):
        os.remove("WaveRNN/sentences.txt")
    """

    # Delete the quick_start dir if it exists; does nothing if it does not
    os.system("rm -rf WaveRNN/quick_start/")

    working_dir = os.getcwd()
    tmp_dir = working_dir + "/WaveRNN"
    os.chdir(tmp_dir)
    assert (os.getcwd() == tmp_dir), "Issues changing into WaveRNN directory"

    # Run WaveRNN
    # os.system("python WaveRNN/quick_start.py")
    try:
        os.system("python quick_start.py")
    except:
        print("Issues with WaveRNN")
        exit(1)

    os.chdir(working_dir)
    assert (os.getcwd() == working_dir), "Issues changing from WaveRNN dir to working dir"

    out_file = file[:-4] + ".wav"

    # Concatenate the wav files and save as the output file
    concat_cmd = "sox WaveRNN/quick_start/*.wav '" + out_file + "'"
    os.system(concat_cmd)

    # TODO output the file to the user
    os.system("ls")
    print("Your generated audio is ", out_file)
def test_read(self):
    pdf = pdftotext.PDF(get_file("abcde.pdf"))
    result = pdf[0]
    self.assertIn("abcde", result)
def test_locked_with_both_passwords_owner_unlock(self):
    pdf = pdftotext.PDF(get_file("both_passwords.pdf"), "owner_password")
    self.assertIn("secret", pdf[0])
def missing_patterns(file):
    with open(file, "rb") as f:
        pdf = pdftotext.PDF(f)

    '''
    skipped pattern 1
    either lacking 年級百分 or 類組百分 for each semester
    (62 matched)
    '''
    try:
        score_sheet = pdf[2]
        output = []
        output.append(file.split('/')[-3])
        output.append(file.split('/')[-2])
        output += re.findall('姓名: (.*?) ', score_sheet)
        output.append(re.findall('^(.*?)\n', score_sheet)[0].replace(' ', ''))
        # so dirty, just can't come up with a better way to handle these consecutive tries
        for i in [2, 3]:
            try:
                tl = re.split(' +', re.findall('學科平均(.*?)\n', pdf[i])[0])
                break
            except:
                try:
                    tl = re.split(' +', re.findall('智育成績(.*?)\n', pdf[i])[0])
                    break
                except:
                    pass
        if len(tl) == 38:
            ip = [4, 10, 16, 22, 28], [6, 12, 18, 24, 30]
        else:
            ip = [4, 12, 20, 28, 36], [7, 15, 23, 31, 39]
        if re.search('班 +班 +年 +年', score_sheet):
            output += [int(tl[i]) / 100 for i in ip[0]]
            output += [np.nan] * 5
            output += [int(tl[i]) / 100 for i in ip[1]]
        elif re.search('班 +班 +類 +類', score_sheet):
            output += [int(tl[i]) / 100 for i in ip[0]]
            output += [int(tl[i]) / 100 for i in ip[1]]
            output += [np.nan] * 5
        else:
            pass
        if len(output) == 19:
            output.append('Missing pattern 1')
            return output
    except:
        pass

    '''
    skipped pattern 2
    only has the means of 班, 組, and 年百分, so fill every semester's value with the mean
    There are 22 cases that specify 班百分 for each semester
    ***so many different styles match this pattern, must separate them carefully
    (30 matched)
    '''
    try:
        score_sheet = pdf[2]
        output = []
        output.append(file.split('/')[-3])
        output.append(file.split('/')[-2])
        output += re.findall('姓名:(.*?) ', score_sheet)
        output += re.findall('^ +(.*?) ', score_sheet)
        # ... so many different styles match this pattern, must separate them carefully
        tl = re.split(' +', re.findall('學業平均(.*?)\n', score_sheet)[0])
        if len(tl) in [18, 23, 24]:
            if len(tl) == 24:
                k = -6
            else:
                k = -5
            if '班級排名百分' in score_sheet:
                ttl = re.findall(' \d\d', re.findall('班級排名百分(.*?)\n', score_sheet)[0])
                output += [float(i) / 100 for i in ttl]
            else:
                output += [float(tl[k]) / 100] * 5
            for p in [-3, -1]:
                output += [float(tl[p]) / 100] * 5
            if len(output) == 19:
                output.append('Missing pattern 2')
                return output
        else:
            pass
    except:
        pass

    '''
    skipped pattern 3
    similar situation to pattern 2
    '''
    try:
        score_sheet = pdf[2]
        output = []
        output.append(file.split('/')[-3])
        output.append(file.split('/')[-2])
        output += re.findall('姓名: (.*?) ', score_sheet)
        output.append(re.findall('^(.*?)\n', score_sheet)[0].replace(' ', ''))
        try:
            tl = re.split(' +', re.findall('學科平均(.*?)\n', score_sheet)[0])
        except:
            tl = re.split(' +', re.findall('智育成績(.*?)\n', score_sheet)[0])
        if len(tl) == 21:
            for p in [-8, -5, -2]:
                output += [float(tl[p]) / 100] * 5
            output.append('Missing pattern 3')
            return output
        else:
            pass
    except:
        pass

    '''
    skipped pattern 4
    10 matched
    '''
    try:
        score_sheet = pdf[2]
        output = []
        output.append(file.split('/')[-3])
        output.append(file.split('/')[-2])
        output += re.findall('姓名:(.*?) ', score_sheet)
        output += re.findall('^ +(.*?) +\w', score_sheet)
        output += [
            float(i) / 100
            for i in re.findall(' (\d\d)', re.findall('班百分比(.*?)\n', score_sheet)[0])
        ]
        output += [np.nan] * 5
        output += [
            float(i) / 100
            for i in re.findall(' (\d\d)', re.findall('年百分比(.*?)\n', score_sheet)[0])
        ]
        if len(output) == 19:
            output.append('Missing pattern 4')
            return output
        else:
            pass
    except:
        pass

    '''
    pattern 5
    '''
    try:
        score_sheet = pdf[2]
        output = []
        output.append(file.split('/')[-3])
        output.append(file.split('/')[-2])
        output += re.findall('姓名:(.*?) ', score_sheet)
        output.append('XXXX')
        try:
            tl = re.split(' +', re.findall('學業平均(.*?)\n', score_sheet)[0])
        except:
            tl = re.split(' +', re.findall('學業平均(.*?)\n', pdf[3])[0])
        if '\x1b' in tl:
            tl.remove('\x1b')
        ip = [8, 13, 18, 23, 28], [10, 15, 20, 25, 30]
        if len(tl) == 31:
            output += [round(float(tl[i]) / 100, 2) for i in ip[0]]
            output += [round(float(tl[i]) / 100, 2) for i in ip[1]]
            output += [np.nan] * 5
            output.append('Missing pattern 5')
            return output
    except:
        pass

    '''
    pattern 6
    9 matched
    '''
    try:
        score_sheet = pdf[2]
        output = []
        output.append(file.split('/')[-3])
        output.append(file.split('/')[-2])
        output += re.findall('姓名:(.*?) ', score_sheet)
        school = re.findall('^ +(.*?)\n', score_sheet)
        if '列印日期' in school[0]:
            raise AssertionError("text")
        output += school
        tl = re.split(' +', re.findall('學業成績(.*?)\n', score_sheet)[0])
        if len(tl) == 13:
            for j in [round(eval(i), 2) for i in tl if '/' in i]:
                output += [j] * 5
            output.append('Missing pattern 6')
            return output
    except:
        return False
    part = part.replace(".", ",")
    return part

# get the file names from the input folder
dirIn = '/home/labcsjt/doi_in/'
dirOut = '/home/labcsjt/doi_out/'
listaPdf = os.listdir(dirIn)
relatorio = ''
for indPdf in range(len(listaPdf)):
    arq = listaPdf[indPdf]
    if arq.find('.pdf') < 0:
        continue

    # open the input file and read the PDF
    entrada = open(dirIn + arq, 'rb')
    pdf = pdftotext.PDF(entrada)
    texto = "\n\n".join(pdf)
    entrada.close()

    # check whether the file is a DOI declaration
    doi = re.findall(r'DOI - Declaração (.*?) Operações Imobiliárias', texto, re.M | re.I | re.S)
    if doi[0] != 'sobre':
        continue

    # create the output file
    saida = open(dirOut + arq[0:arq.find('.')] + '.txt', 'w')

    # find all the DOI sections
    quadros = re.findall(
        r'01 Identificação do Cartório(.*?)02 Identificação da Operação(.*?)03 Identificação do\(s\) Alienante\(s\)(.*?)04 Identificação do\(s\) Adquirente\(s\)(.*?)05 Informações sobre a Alienação(.*?)06 Informações sobre o Imóvel(.*?)Página',
def complete_patterns(file):
    with open(file, "rb") as f:
        pdf = pdftotext.PDF(f)

    '''
    handling ocr-required files
    '''
    if pdf[2] == '':
        # print(f'{file}: This file is protected, no way to parse except ocr!')
        return 'Ocr required'
    if ('\x10' in pdf[2]) or ('\u2e4e' in pdf[2]):
        # print(f'{file}: This file has severe codex issue, no way to parse except ocr!')
        return 'Ocr required'

    '''
    pattern 1
    EX: 惠文高中
    '''
    try:
        output = []
        output.append(file.split('/')[-3])
        output.append(file.split('/')[-2])
        score_sheet = pdf[2]
        output += re.findall('姓名:(\w{2,3})', score_sheet)
        output.append(re.findall('^(.*?)\n', score_sheet)[0].replace(' ', ''))
        for s in ['班排', '群排', '年排']:
            output += [
                round(eval(i), 3)
                for i in re.findall('\d{1,3}/\d{2,3}', re.findall(s + '(.*?\n)', score_sheet)[0])
            ]
        output.append('Complete pattern 1')
        return output
    except:
        pass

    '''
    pattern 2
    ex: 臺北市立和平高級中學
    '''
    try:
        output = []
        output.append(file.split('/')[-3])
        output.append(file.split('/')[-2])
        score_sheet = pdf[2]
        output.append(re.findall('姓 名(.*?)班', score_sheet)[0].replace(' ', ''))
        school = score_sheet[20:80].replace(' ', '')
        if school == '':
            school = re.findall('目前成績(.*?)申請入學', score_sheet)[0].replace(' ', '')
        output.append(school)
        try:
            L = re.split(' +', re.findall('\n學業成績(.*?)\n', score_sheet)[0])[1:]
        except:
            L = re.split(' +', re.findall('\n學業成績(.*?)\n', pdf[3])[0])[1:]
        try:
            PL = re.split(' +', re.findall('\n總人數(.*?)\n', score_sheet)[0])[1:]
        except:
            PL = re.split(' +', re.findall('\n總人數(.*?)\n', pdf[3])[0])[1:]
        RL = [
            L[i] for i in [1, 2, 3, 8, 9, 10, 15, 16, 17, 22, 23, 24, 29, 30, 31]
        ]
        TL = [round(eval(i + '/' + j), 3) for i, j in zip(RL, PL)]
        for i in range(3):
            output += TL[i::3]
        output.append('Complete pattern 2')
        return output
    except:
        pass

    '''
    pattern 3
    ex: 高雄市立中正高級中學
    '''
    try:
        output = []
        output.append(file.split('/')[-3])
        output.append(file.split('/')[-2])
        score_sheet = pdf[2]
        name = re.findall('姓名: (\w*?) ', score_sheet)
        if name:
            output += name
        else:
            output.append('XXX')
        output.append(re.findall('^(.*?)\n', score_sheet)[0].replace(' ', ''))
        output += [
            round(eval(i), 3) for i in re.findall(
                '\d{1,3}/\d{2,3}', re.findall('班級人數 百分比(.*?)符號註記', score_sheet)[0])
        ]
        output += [
            round(eval(i), 3) for i in re.findall(
                '\d{1,3}/\d{2,3}', re.findall('類組排名/類組人數(.*?)為不及格', score_sheet)[0])
        ]
        output += [
            round(eval(i), 3) for i in re.findall(
                '\d{1,3}/\d{2,3}', re.findall('年級排名/年級人數(.*?)M 為重修', score_sheet)[0])
        ]
        output.append('Complete pattern 3')
        return output
    except:
        pass

    '''
    pattern 4
    score sheets whose Chinese text gets scrambled when copied and pasted
    '''
    try:
        score_sheet = pdf[2]
        output = []
        output.append(file.split('/')[-3])
        output.append(file.split('/')[-2])
        output.append(re.findall('姓名:(.*?)\n', pdf[0])[0].replace(' ', ''))
        output += re.findall('學校:(\w*?) ', score_sheet[0:100])
        try:
            for p in ['班', '組', '年級']:
                output += [
                    round(float(i) / 100, 2) for i in re.findall(
                        '\d{1,3}\.\d{2,3}', re.findall(f'{p}百身比(.*){p}百身比', score_sheet)[0])
                ]
            output.append('Complete pattern 4-1')
        except:
            for p in ['班', '組', '年級']:
                if p == '班':
                    output += [
                        round(float(i) / 100, 2) for i in regex.findall(
                            ' (\d.*?) ',
                            re.findall(f'{p}百分比(.*){p}百分比', score_sheet)[0],
                            overlapped=True)[1::2]
                    ]
                else:
                    output += [
                        round(float(i) / 100, 2) for i in re.findall(
                            ' (\d.*?) ', re.findall(f'{p}百分比(.*){p}百分比', score_sheet)[0])
                    ]
            output.append('Complete pattern 4')
        return output
    except:
        pass

    try:
        score_sheet = pdf[2]
        output = []
        output.append(file.split('/')[-3])
        output.append(file.split('/')[-2])
        output.append(re.findall('姓名:(.*?)\n', pdf[0])[0].replace(' ', ''))
        output += ['XXX']
        output += [
            round(float(i) / 100, 2) for i in re.findall(
                '\d{1,3}\.\d{2,3}', re.findall('班百身比(.*)班百身比', score_sheet)[0])[1::2]
        ]
        output += [
            round(float(i) / 100, 2)
            for i in re.findall(' (\d.*?) ', re.findall('組百身比(.*)組百身比', score_sheet)[0])
        ]
        for p in ['學級', '學國', '學揚']:
            try:
                output += [
                    round(float(i) / 100, 2) for i in re.findall(
                        ' (\d.*?) ', re.findall(f'{p}百身比(.*){p}百身比', score_sheet)[0])
                ]
                if len(output) != 19:
                    # print('Very special case...')
                    return 'skipped'
                output.append('Complete pattern 4-2')
                return output
            except:
                pass
    except:
        pass

    '''
    pattern 5
    '''
    try:
        score_sheet = pdf[2]
        output = []
        output.append(file.split('/')[-3])
        output.append(file.split('/')[-2])
        output += re.findall('姓名:(\w*?) ', score_sheet)
        output.append(re.findall('(.*?)學生', score_sheet)[0].replace(' ', ''))
        if '類組(科別)排名' not in score_sheet:
            score_sheet = pdf[3]
        for p in ['班級', '類組\(科別\)', '年級']:
            output += [
                round(eval(i), 2) for i in re.findall(
                    '\d{1,3}/\d{2,3}', re.findall(p + '排名(.*?)\n', score_sheet)[0])
            ]
        output.append('Complete pattern 5')
        return output
    except:
        pass

    '''
    pattern 6
    '''
    try:
        score_sheet = pdf[2]
        output = []
        output.append(file.split('/')[-3])
        output.append(file.split('/')[-2])
        output += re.findall('姓名:(\w.*?) ', score_sheet)
        output += ['XXX']
        temp = []
        for i in range(1, 4):
            tl = re.split(
                ' +', re.findall('學業平均(.*?)\n', score_sheet)[0].split('│')[i])
            if i == 3:
                temp += [tl[j] for j in [4, 8, 6]]
            else:
                temp += [tl[j] for j in [4, 8, 6, 11, 15, 13]]
        for i in range(3):
            output += [round(float(i) / 100, 2) for i in temp[i::3]]
        output.append('Complete pattern 6')
        return output
    except:
        pass

    '''
    pattern 7
    '''
    try:
        score_sheet = pdf[2]
        output = []
        output.append(file.split('/')[-3])
        output.append(file.split('/')[-2])
        output += re.findall('姓名: (\w*?) ', score_sheet)
        output.append(re.findall('^(.*?)\n', score_sheet)[0].replace(' ', ''))
        for p in ['班級', '學程', '年級']:
            output += [
                round(eval(i), 2) for i in re.findall(
                    '\d{1,3}/\d{2,3}', re.findall(p + '排名(.*?)\n', score_sheet)[0])
            ]
        output.append('Complete pattern 7')
        return output
    except:
        pass

    '''
    pattern 8
    a format very similar to the dominant skipped ones; however, it contains one
    additional line that shows all the required info
    '''
    try:
        score_sheet = pdf[2]
        output = []
        output.append(file.split('/')[-3])
        output.append(file.split('/')[-2])
        output.append(re.findall('姓名:(\w*?) ', score_sheet)[0])
        output.append(re.findall('^(.*?)\n', score_sheet)[0].replace(' ', ''))
        for p in ['班', '類', '校']:
            output += [
                int(i) / 100 for i in re.findall(
                    ' (\d\d) ', re.findall('各學期成績校排(.*?)\n', score_sheet)[0])
            ]
        output.append('Complete pattern 8')
        return output
    except:
        pass

    '''
    Skipped files
    '''
    score_sheet = pdf[2]
    for p in [
            '個人成績單暨班級百分比對照表', '個人成績單暨百分比對照表', '個人成績單暨類組百分比對照表',
            '學生個人成績單暨百分比對', '個人成績單暨年級百分比對照表'
    ]:
        if p in score_sheet[0:120]:
            # print('This is a type of score sheet that does not contain enough info')
            return 'skipped'
    if (' 成績證明書\n' in score_sheet[0:120]) or \
            ('學生個人成績證明書\n' in score_sheet[0:120]) \
            or ('學 生 成 績 表' in score_sheet) \
            or ('成績一覽表' in score_sheet) \
            or ('學生個人成績單\n' in score_sheet[0:120]) \
            or ('成 績 報 告 單' in score_sheet[0:300]) \
            or ('歷年成績單-補考、重修後' in score_sheet[:300]) \
            or ('桃園市新興高級中等學校' in score_sheet[:300]):
        # print('This is a type of score sheet that does not contain enough info')
        return 'skipped'

    '''
    Nothing matched at all, return False
    '''
    # print('No pattern matched!')
    return False
#=======================================================================================================================
print('Init:')
#=======================================================================================================================

file_pdf_input = 'resources/pdf-simple.pdf'
file_pdf_output = 'resources/pdf-simple-out.pdf'

# debug = True
debug = False

# line_tag_separate_init = '__________________________________________________'
line_tag_separate_done = '^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^'

pdf_input1 = None
with open(file_pdf_input, "rb") as in_f:
    pdf_input1 = pdftotext.PDF(in_f)

with open(file_pdf_input, "rb") as in_f:
    pdf_input2 = PyPDF2.PdfFileReader(in_f)
    pdf_output = PyPDF2.PdfFileWriter()
    pdf_blank = None
    pdf_blank_page = None

    page_box = 639
    page_box_add = 0
    # page_box_add = 100

    page_num = pdf_input2.getNumPages()
    print("Read document %s ..." % file_pdf_input)
    print("Document has %s pages." % page_num)
def read_pdf(pdf_file):
    """
    Take in Voucher PDFs and parse them as either international or domestic,
    and return a unified response.
    """
    domestic_settings_keys = [
        "VOUCHER_DOMESTIC_EMPLOYEE_KEY",
        "VOUCHER_DOMESTIC_EMPLOYEE_ID_KEY",
        "VOUCHER_DOMESTIC_KEY",
        "VOUCHER_DOMESTIC_COURSE_KEY",
        "VOUCHER_DOMESTIC_DATES_KEY",
    ]
    international_settings_keys = [
        "VOUCHER_INTERNATIONAL_EMPLOYEE_KEY",
        "VOUCHER_INTERNATIONAL_EMPLOYEE_ID_KEY",
        "VOUCHER_INTERNATIONAL_DATES_KEY",
        "VOUCHER_INTERNATIONAL_COURSE_NAME_KEY",
        "VOUCHER_INTERNATIONAL_COURSE_NUMBER_KEY",
    ]

    for key in domestic_settings_keys + international_settings_keys:
        if not getattr(settings, key):
            log.warning("Required setting %s missing for read_pdf", key)
            return

    try:
        pdf = pdftotext.PDF(pdf_file)
        if any("Entity Name:" in page for page in pdf):
            values = read_pdf_international(pdf)
            for key in international_settings_keys:
                if not values.get(getattr(settings, key)):
                    return None
            course_id_input = values.get(settings.VOUCHER_INTERNATIONAL_COURSE_NUMBER_KEY)
            return {
                "pdf": pdf_file,
                "employee_id": values.get(settings.VOUCHER_INTERNATIONAL_EMPLOYEE_ID_KEY),
                "voucher_id": None,
                "course_start_date_input": datetime.strptime(
                    values.get(settings.VOUCHER_INTERNATIONAL_DATES_KEY).split(" ")[0],
                    "%d-%b-%Y",
                ).date(),
                "course_id_input": remove_extra_spaces(course_id_input)
                if len(course_id_input) >= 3
                else "",
                "course_title_input": remove_extra_spaces(
                    values.get(settings.VOUCHER_INTERNATIONAL_COURSE_NAME_KEY)),
                "employee_name": values.get(settings.VOUCHER_INTERNATIONAL_EMPLOYEE_KEY),
            }
        else:
            values = read_pdf_domestic(pdf)
            for key in domestic_settings_keys:
                if not values.get(getattr(settings, key)):
                    return None
            course_id_input = values.get(settings.VOUCHER_DOMESTIC_COURSE_KEY).split(" ")[0]
            return {
                "pdf": pdf_file,
                "employee_id": values.get(settings.VOUCHER_DOMESTIC_EMPLOYEE_ID_KEY),
                "voucher_id": values.get(settings.VOUCHER_DOMESTIC_KEY),
                "course_start_date_input": datetime.strptime(
                    values.get(settings.VOUCHER_DOMESTIC_DATES_KEY).split(" ")[0],
                    "%m/%d/%Y",
                ).date(),
                "course_id_input": remove_extra_spaces(course_id_input)
                if len(course_id_input) >= 3
                else "",
                "course_title_input": remove_extra_spaces(
                    " ".join(values.get(settings.VOUCHER_DOMESTIC_COURSE_KEY).split(" ")[1:])),
                "employee_name": values.get(settings.VOUCHER_DOMESTIC_EMPLOYEE_KEY),
            }
    except Exception:  # pylint: disable=broad-except
        log.exception("Could not parse PDF")
        return None
def findPDFMatchesBruteForce(f, text_patterns, env_matches, og_file=None, raw=False):
    '''
    Processes the environments which weren't already matched by deleting them
    from the file and running pdftotext to see the difference with the original.

    Arguments:
        f: file object in read-bytes mode: the pdf to whiteout
        text_patterns: a list of byte strings to whiteout
        env_matches: a list of ranges to test in f
        og_file: f.readlines()
        raw: whether to use pdftotext raw
    '''

    def searchDiff(og_text, new_text, patterns, brute_results):
        '''
        Returns True if a match was found and collects data about which pattern was matched.

        Arguments:
            og_text: a string of the original text
            new_text: a string of the edited text
            patterns: a list of compiled re patterns from text_patterns (not bytes)
            brute_results: a dictionary like {'c': {e: 0 for e in text_patterns}}
                where text_patterns are strings that were compiled into patterns
        '''
        # check to see if the new text has at least one fewer instance of the
        # search pattern
        for i, pattern in enumerate(patterns):
            if len(pattern.findall(og_text)) > len(pattern.findall(new_text)):
                brute_results['c'][text_patterns[i]] += 1
                return True
        return False

    # initialize search items
    brute_search_matches = []
    brute_search_unmatched = []
    # initialize data collector
    brute_results = {'c': {e: 0 for e in text_patterns}}

    # compile re's as strings (output of pdftotext)
    if raw:
        patterns = [
            re.compile(''.join(e.decode('utf-8').split())) for e in text_patterns
        ]
    else:
        patterns = [re.compile(e.decode('utf-8')) for e in text_patterns]

    # Using the pdftotext python library to read text; all manipulations are done
    # in memory so hopefully this is quick.
    # produce original text
    og_text = pdftotext.PDF(f, raw=raw)

    # remove text in each range once, one by one, checking for diffs in each page
    if not og_file:
        f.seek(0)
        og_file = f.readlines()

    tmp_pdf_file = filenames.fileOut(re.sub('.pdf', '_tmp_whiteout.pdf', f.name))
    exists_text = re.compile(rb'[\(<].*?[\)>] *?Tj')

    for rng in env_matches:
        with open(tmp_pdf_file, 'w+b') as g:
            # if the rng has no text objects, skip it
            if not exists_text.search(b''.join(og_file[rng.start:rng.stop])):
                brute_search_unmatched.append(rng)
                continue
            g.writelines([
                replacePDFTextWithSpace(e) if i in rng else e
                for i, e in enumerate(og_file)
            ])
            g.seek(0)
            try:
                tmp_text = pdftotext.PDF(g, raw=raw)
            except pdftotext.Error as e:
                print(f'Warning: pdftotext.Error: {e}')
                brute_search_unmatched.append(rng)
                continue
            try:
                is_match = False
                for i, page in enumerate(og_text):
                    if searchDiff(page, tmp_text[i], patterns, brute_results):
                        brute_search_matches.append(rng)
                        is_match = True
                        # collect the raw text objects that were whited out in this range
                        new_patterns = []
                        for line in og_file[rng.start:rng.stop]:
                            m = replacePDFTextWithSpace(line, just_match=True)
                            if bool(m):
                                new_patterns.append(m)
                        # Exception case for bad pdfs
                        if b' ' in new_patterns:
                            brute_search_unmatched.append(rng)
                            continue
                        new_ranges, _, new_results = findEnvAndMatchRanges(
                            og_file, new_patterns, ['c'], rb'^\d+ \d+ obj', rb'^endobj')
                        [
                            brute_search_matches.append(env_matches.pop(i))
                            for i, r in enumerate(env_matches) if r in new_ranges
                        ]
                        brute_results['c'].update(new_results['c'])
                        break
                    else:
                        pass
                if not is_match:
                    brute_search_unmatched.append(rng)
            except BaseException as e:
                print(f'Warning: {e}')
                brute_search_unmatched.append(rng)

    os.remove(tmp_pdf_file)
    return (brute_search_matches, brute_search_unmatched, brute_results)
    os.unlink(file_path_del)
except Exception as e:
    print(e)

# path = os.path.join(cwd_path, 'pdf', file)
path = file

from PyPDF2 import PdfFileReader
pdf = PdfFileReader(open(path, 'rb'))
pages = pdf.getNumPages()

import pdftotext
with open(file, 'rb') as f:
    pdf = pdftotext.PDF(f, raw=False)

for p in pdf:
    with open(sys.argv[3] + '/firsttext/ne.txt', 'a') as f:
        f.write(p)

file_name1 = sys.argv[3] + '/firsttext/ne.txt'

###########################


def fo(li):
    line = li.replace("$", "").replace(",", "").replace("(", "-").replace(")", "")
    return line
def main():
    """Ouça e Fale App"""
    st.title("Reader & Voice")
    activities = ["Home", "PDF", "TXT", "About"]
    choice = st.sidebar.radio("Home", activities)

    if choice == 'Home':
        st.write("Only files:")
        st.markdown("### PDF or TXT")
        st.write("After uploading you can convert to 7 languages")
        st.markdown("### English, Spanish, French, Italian, Japanese, Russian and Chinese")
        #st.write("Definitions")
        #st.write("PCA is not a statistical method to infer parameters or test hypotheses. Instead, it provides a method to reduce a complex dataset to lower dimension to reveal sometimes hidden, simplified structure that often underlie it.")
        #st.write("")
        #st.write("PCA is a statistical method routinely used to analyze interrelationships among large numbers of objects.")
        #st.write("")
        #st.write("Principal component analysis (PCA) is a mathematical algorithm that reduces the dimensionality of the data while retaining most of the variation in the data set.")

    if choice == 'PDF':
        file = carregar_texto('pdf')
        pdf = pdftotext.PDF(file)
        #for page in pdf:
        #    st.text(page)
        blob = TextBlob(pdf[0])
        st.text(blob)
        st.write(blob.detect_language())
        #dict_idioma_full = lista_idiomas_full()
        #idioma_original = get_value(blob.detect_language(), dict_idioma_full)
        #original_key = get_key(idioma_original, dict_idioma_full)
        #st.success("Original Language" + ": " + idioma_original + " (" + original_key + ")")
        # Original sound
        #play(raw_text, original_key)
        #dict_idioma = lista_idiomas(idioma_original)
        #options = st.multiselect("Choose a language", tuple(dict_idioma.values()))
        #for i in range(len(options)):
        #    value = options[i]
        #    idioma_final_key = get_key(value, dict_idioma)
        #    try:
        #        if (idioma_original != idioma_final_key):
        #            texto_convertido = str(blob.translate(to=idioma_final_key))
        #            st.success("Language" + ": " + value + " (" + idioma_final_key + ")")
        #            st.write(texto_convertido)
        #            #st.text(idioma_final_key)
        #            play(texto_convertido, idioma_final_key)
        #    except:
        #        st.error("ERROR: some languages will fail to play the sound.")
        #play(blob, original_key)
        #convert(blob)
        #except:
        #    st.warning("PDF please")

    if choice == 'TXT':
        try:
            file = carregar_texto('txt')
            blob = TextBlob(file.getvalue())
            st.markdown(blob)
            #play(file.getvalue(), original_key)
            #st.write(blob.detect_language())
            #st.subheader(blob)
            convert(file, blob)
        except:
            st.warning("TXT please")
def extract_page_text(self, page_index):
    with open(self.file_path, 'rb') as pdf_file:
        pdf = pdftotext.PDF(pdf_file)
        return pdf[page_index]
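# The get_num_pages and extract_page_text methods shown earlier appear to belong to a
# small wrapper class around a PDF path. A self-contained sketch of such a wrapper,
# assuming only a file_path attribute; the class name PdfDocument is hypothetical and
# not taken from the original snippets.
import pdftotext

class PdfDocument:
    def __init__(self, file_path):
        self.file_path = file_path

    def get_num_pages(self):
        # the PDF is parsed eagerly, so the page count is available immediately
        with open(self.file_path, 'rb') as pdf_file:
            return len(pdftotext.PDF(pdf_file))

    def extract_page_text(self, page_index):
        # pages are indexed from 0
        with open(self.file_path, 'rb') as pdf_file:
            return pdftotext.PDF(pdf_file)[page_index]

# Example (placeholder filename):
# doc = PdfDocument("example.pdf")
# print(doc.get_num_pages(), doc.extract_page_text(0))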
from progress.bar import Bar
from progress.bar import PixelBar

from classes import (
    Arquivo,
    Material,
    RegexMaterial,
    RegexArquivo,
)

##### Extracting data from the PDF file
pdf_file = "SICRO/GO 10-2020 Relatório Sintético de Materiais.pdf"
with open(pdf_file, "rb") as f:
    cadastro = pdftotext.PDF(f)
num_pages = len(cadastro)

with PixelBar('Extraindo dados do PDF', max=num_pages,
              suffix='%(index)d/%(max)d - %(percent).1f%% - %(eta)ds') as bar:
    lista_material = list()
    for pagina in cadastro:
        linhas_pagina_atual_pdf = pagina.split('\n')
        linhas_pagina_atual_pdf.pop(-2)
        for linha in linhas_pagina_atual_pdf:
            obj_regex = RegexMaterial(linha)
            if (obj_regex.cabecalho is None) and (obj_regex.principal is not None) and (len(obj_regex.principal.groups()) == 4):
#from tkinter import Tk
#from tkinter.filedialog import askopenfilename
import sys

import pdftotext
from gtts import gTTS

#Tk().withdraw()
#filelocation = askopenfilename()
file = sys.argv[1]

with open(file, "rb") as f:
    pdf = pdftotext.PDF(f)

string_of_text = ''
for text in pdf:
    string_of_text += text

final_file = gTTS(text=string_of_text, lang='en')
final_file.save(file + ".mp3")
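# Hypothetical invocation of the script above, assuming it is saved as pdf_to_mp3.py
# (the script name is an assumption). The output lands next to the input as
# <input>.pdf.mp3:
#
#     python pdf_to_mp3.py input.pdf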
import pdftotext
import itertools
import re

# set the path of the pdf file
file = open("./Whiplash.pdf", 'rb')
fileReader = pdftotext.PDF(file)

text = []

# write the contents of the pdf into a txt file, page by page
with open("test.txt", "w") as f:
    for i in fileReader:
        i.replace('\t', '').replace('\n', '').strip()
        f.write(i)

# split the dialogue by character
character = []

# open the txt file saved above
with open('test.txt') as f:
    for line in f:
        if line == '':
            continue
        text.append(line.strip().replace('.', ''))

talk = []

# the character who speaks the first line is specified manually
text = text[text.index('ANDREW'):]

# process the dialogue
for word in text:
    if word.isupper() and len(talk) < 1:
        # print(f"{word}일때 word.isupper()실행")
        name = word
    if len(talk) > 1:
from progress.bar import Bar
from progress.bar import PixelBar

from classes import (
    RegexEquipamento,
    Equipamento,
    RegexArquivo,
    Arquivo,
)

##### Opening the 'onerado' (with payroll charges) PDF file
pdf_file_onerado = "SICRO/GO 10-2020 Relatório Sintético de Equipamentos.pdf"
with open(pdf_file_onerado, "rb") as f_onerado:
    cadastro_onerado = pdftotext.PDF(f_onerado)
num_pages_onerado = len(cadastro_onerado)

##### Extracting data from the 'onerado' PDF
with PixelBar('Extraindo dados do PDF onerado', max=num_pages_onerado,
              suffix='%(index)d/%(max)d - %(percent).1f%% - %(eta)ds') as bar:
    ###### Populating a list with Equipamento instances
    lista_equipamento = list()
    for pagina in cadastro_onerado:
        linhas_pagina_atual_pdf_file_onerado = pagina.split('\n')
        linhas_pagina_atual_pdf_file_onerado.pop(-2)
def test_locked_with_only_user_password_user_unlock(self):
    pdf = pdftotext.PDF(get_file("user_password.pdf"), "user_password")
    self.assertIn("secret", pdf[0])