from tkinter import Tk
from tkinter.filedialog import askopenfilename
from os.path import splitext

import pdftotext
from gtts import gTTS

Tk().withdraw()  # we don't want a full GUI, so keep the root window from appearing
filelocation = askopenfilename()  # open the file-selection dialog

with open(filelocation, "rb") as f:  # open the file in binary read (rb) mode and call it f
    pdf = pdftotext.PDF(f)  # store a text version of the pdf file f in the pdf variable

pdf = ''.join(pdf)  # join the pages' text together
final_file = gTTS(text=pdf, lang='en')  # build the speech object
outname = splitext(filelocation)[0] + '.mp3'
final_file.save(outname)  # save the mp3 to disk
def PDF(filename):
    with open(filename, "rb") as f:
        pdf = pdftotext.PDF(f)
    print("\n\n".join(pdf))
    # write the extracted text to a .txt file next to the source PDF (assumed output path)
    with open(filename + ".txt", "w") as out:
        out.write("\n\n".join(pdf))
def test_locked_with_both_passwords(self):
    with self.assertRaises(pdftotext.Error):
        pdftotext.PDF(get_file("both_passwords.pdf"))
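# Several of the unit-test snippets in this listing (test_locked_with_both_passwords,
# test_read, the password-unlock tests) rely on a get_file() helper that is not shown.
# A minimal sketch of what such a helper might look like; the fixture location
# (same directory as the test module) is an assumption, not part of the original snippets.
import os

def get_file(name):
    """Open a fixture PDF in binary mode, as pdftotext.PDF expects."""
    return open(os.path.join(os.path.dirname(__file__), name), "rb")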
def get_text(self) -> str:
    pdf_pages_text: Iterable[str] = pdftotext.PDF(self.file)
    # TODO: Make the newline replacement smarter to handle dashes etc.
    return "\n".join(page_text.replace("\n", " ") for page_text in pdf_pages_text)
def get_num_pages(self):
    with open(self.file_path, 'rb') as pdf_file:
        pdf = pdftotext.PDF(pdf_file)
        return len(pdf)
url = 'https://api.wimc.ctf.allesctf.net/1.0/admin/createReport'
headers = {'X-API-TOKEN': ''}

with open('stage2.html', 'r') as f:
    template = f.read()
    # r = requests.post(url, headers=headers, data={"html": f.read()})

token = ''
while len(token) < 30:
    payload = template.replace('__TOKEN__', token)
    r = requests.post(url, headers=headers, data={"html": payload})
    if len(r.text) < 100:
        print(r.text, file=sys.stderr)
        exit(1)

    result = open('result.pdf', 'wb')
    result.write(b''.join(r.iter_content()))
    result.close()

    result = open('result.pdf', 'rb')
    pdf = pdftotext.PDF(result)
    result.close()

    token = pdf[0].split('\n')[0]
    os.remove('result.pdf')
    print(f'Got token: {token}')
def extract_data(feed):
    # pdftotext expects the file object opened in binary mode
    with open(feed, "rb") as f:
        pdf = pdftotext.PDF(f)
    pdf_text = "\n\n".join(pdf)
    return pdf_text
def to_text(path):
    with open(path, "rb") as f:
        pdf = pdftotext.PDF(f)
    text = "".join(pdf)
    return text
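# A minimal usage sketch for the to_text() helper above; "sample.pdf" is a
# placeholder filename assumed for illustration, not a file referenced by the snippet.
if __name__ == "__main__":
    extracted = to_text("sample.pdf")
    print(extracted[:500])  # preview the first 500 characters of the extracted text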
def proceed_document(dokumen_id):
    import numpy
    from dlnn.Dlnn import Dlnn
    from dlnn.Dlnn import DLNN_DEFAULT_CONFIG

    dlnn = Dlnn(**DLNN_DEFAULT_CONFIG)

    # Todo: load the Dokumen by id (doc_id) [Dokumen.objects.filter(id=doc_id).first()]
    dokumen = Dokumen.objects.filter(id=dokumen_id).first()
    dokumen.state = "Process"
    dokumen.save()

    # Todo: load the pdf
    # spell = SpellChecker()
    with open(dokumen.filenya.path, "rb") as f:
        pdf = pdftotext.PDF(f)
    text = "".join(pdf)

    # Todo: normalisation - split the sentences into individual words
    text = text.lower()  # converting to lowercase
    cleanr = re.compile('<.*?>')
    sentence = re.sub(cleanr, ' ', text)  # removing HTML tags
    sentence = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    sentence = re.sub(r'[.|,|)|(|\|/]', r' ', sentence)  # removing punctuation
    data_pdf = "".join(sentence)
    token_data_pdf = nltk.word_tokenize(data_pdf, preserve_line=True)

    # Feature 1 - count Indonesian spelling mistakes
    url_dic_indo = settings.STATIC_ROOT + '/admin/db_text/kamus_indonesia.txt'
    kamus_indonesia = open(url_dic_indo, "r")
    katadasar = kamus_indonesia.read().split('\n')
    for i in range(len(katadasar)):
        katadasar[i] = katadasar[i].split("/")[0]
    salah_ketik_indo = 0
    for token in token_data_pdf:
        if token not in katadasar:
            salah_ketik_indo += 1
    f1 = salah_ketik_indo
    dokumen.fitur1 = f1
    dokumen.save()

    # Feature 2 - count English spelling mistakes
    url_dic_en = settings.STATIC_ROOT + '/admin/db_text/kamus_english.txt'
    kamus_inggris = open(url_dic_en, "r")
    katadasar_en = kamus_inggris.read().split('\n')
    for i in range(len(katadasar_en)):
        katadasar_en[i] = katadasar_en[i].split("/")[0]
    salah_ketik_english = 0
    for token in token_data_pdf:
        if token not in katadasar_en:
            salah_ketik_english += 1
    f2 = salah_ketik_english
    dokumen.fitur2 = f2
    dokumen.save()

    f3, f4 = calculate_feature_34(dokumen_id)
    dokumen.fitur3 = f3
    dokumen.fitur4 = f4
    dokumen.save()

    f5, f6 = calculate_feature_56(dokumen_id)
    dokumen.fitur5 = f5
    dokumen.fitur6 = f6
    dokumen.save()

    # Todo: feed the features f[1..6] into the network
    network = dlnn.get_model()
    result = network.predict(numpy.array([[f1, f2, f3, f4, f5, f6]]), batch_size=1)
    class_data = result.argmax(axis=1)[0]
    # print("Class Data {}".format(class_data))

    # Todo: store class_data as the predicted class (map it to the expected class; zero-based indexing)
    dokumen.kualitas = class_data
    dokumen.state = "Done"
    dokumen.save()
def convert_pdf2text(fname):
    with open(settings.BASE_DIR + '/leasingai/ai/temp/' + fname, "rb") as f:
        pdf = pdftotext.PDF(f)
    text = "\n\n".join(pdf)
    return text
def testing_apps(gap_data):
    f1 = [[]]
    cek = Pengujian.objects.all()
    for a in cek:
        a.delete()

    dataset = Data.objects.filter(is_dataset=True)
    x = 0
    for data in dataset:
        x += 1
        print("data ke" + str(x))

        # Todo: load the pdf
        with open(data.url_file.path, "rb") as f:
            pdf = pdftotext.PDF(f)
        text = "".join(pdf)

        # Todo: normalisation - split the sentences into individual words
        text = text.lower()  # converting to lowercase
        cleanr = re.compile('<.*?>')
        sentence = re.sub(cleanr, ' ', text)  # removing HTML tags
        sentence = re.sub(r'[?|!|\'|"|#]', r'', sentence)
        sentence = re.sub(r'[.|,|)|(|\|/]', r' ', sentence)  # removing punctuation
        data_pdf = "".join(sentence)
        token_data_pdf = nltk.word_tokenize(data_pdf, preserve_line=True)

        # Feature 1 - count Indonesian spelling mistakes
        url_dic_indo = settings.STATIC_ROOT + '/admin/db_text/kamus_indonesia.txt'
        kamus_indonesia = open(url_dic_indo, "r")
        katadasar = kamus_indonesia.read().split('\n')
        for i in range(len(katadasar)):
            katadasar[i] = katadasar[i].split("/")[0]
        salah_ketik_indo = 0
        for token in token_data_pdf:
            if token not in katadasar:
                salah_ketik_indo += 1

        # Feature 2 - count English spelling mistakes
        url_dic_en = settings.STATIC_ROOT + '/admin/db_text/kamus_english.txt'
        kamus_inggris = open(url_dic_en, "r")
        katadasar_en = kamus_inggris.read().split('\n')
        for i in range(len(katadasar_en)):
            katadasar_en[i] = katadasar_en[i].split("/")[0]
        salah_ketik_english = 0
        for token in token_data_pdf:
            if token not in katadasar_en:
                salah_ketik_english += 1

        akurasi_indo = int((len(token_data_pdf) - salah_ketik_indo) / len(token_data_pdf) * 100)
        akurasi_en = int((len(token_data_pdf) - salah_ketik_english) / len(token_data_pdf) * 100)
        new_hasil = Pengujian(perbandingan=str(x), fitur1=akurasi_indo, fitur2=akurasi_en)
        new_hasil.save()
def main():
    # Parse the args for the file location
    parser = argparse.ArgumentParser(
        description="Reads in a pdf or txt file, outputs an audio file of it being read")
    parser.add_argument('file', help="The file you would like to input")
    args = parser.parse_args()

    # Saving the file location
    file = args.file

    # TODO might need to make this more robust
    # Check the extension of the file and save the text
    if file[-3:] == 'pdf':
        try:
            with open(file, "rb") as file_in:
                text = pdftotext.PDF(file_in)
                print("Number of pages: ", len(text))
                # print("\n\n".join(pdf))
        except IOError as e:
            print("Could not open the file!")
            print(e)
            exit(1)
    elif file[-3:] == 'txt':
        # f = open(file, 'r')
        with open(file, 'r') as file_in:
            text = file_in.read()
    else:
        print("Input a txt or pdf file")
        exit(1)

    make_sentences(text)

    """
    # Delete sentences.txt if it already exists
    if os.path.exists("/WaveRNN/sentences.txt"):
        os.remove("WaveRNN/sentences.txt")
    """

    # Delete the quick_start dir if it exists; does nothing if it does not
    os.system("rm -rf WaveRNN/quick_start/")

    working_dir = os.getcwd()
    tmp_dir = working_dir + "/WaveRNN"
    os.chdir(tmp_dir)
    assert (os.getcwd() == tmp_dir), "Issues changing into WaveRNN directory"

    # Run WaveRNN
    # os.system("python WaveRNN/quick_start.py")
    try:
        os.system("python quick_start.py")
    except:
        print("Issues with WaveRNN")
        exit(1)

    os.chdir(working_dir)
    assert (os.getcwd() == working_dir), "Issues changing from WaveRNN dir to working dir"

    out_file = file[:-4] + ".wav"

    # Concatenate the wav files and save as the output file
    concat_cmd = "sox WaveRNN/quick_start/*.wav '" + out_file + "'"
    os.system(concat_cmd)

    # TODO output the file to the user
    os.system("ls")
    print("Your generated audio is ", out_file)
def test_read(self):
    pdf = pdftotext.PDF(get_file("abcde.pdf"))
    result = pdf[0]
    self.assertIn("abcde", result)
def test_locked_with_both_passwords_owner_unlock(self):
    pdf = pdftotext.PDF(get_file("both_passwords.pdf"), "owner_password")
    self.assertIn("secret", pdf[0])
def missing_patterns(file):
    with open(file, "rb") as f:
        pdf = pdftotext.PDF(f)

    '''
    skipped pattern 1
    either lacking 年級百分 or 類組百分 for each semester
    (62 matched)
    '''
    try:
        score_sheet = pdf[2]
        output = []
        output.append(file.split('/')[-3])
        output.append(file.split('/')[-2])
        output += re.findall('姓名: (.*?) ', score_sheet)
        output.append(re.findall('^(.*?)\n', score_sheet)[0].replace(' ', ''))
        # so dirty, just can't come up with a better way to handle these consecutive tries
        for i in [2, 3]:
            try:
                tl = re.split(' +', re.findall('學科平均(.*?)\n', pdf[i])[0])
                break
            except:
                try:
                    tl = re.split(' +', re.findall('智育成績(.*?)\n', pdf[i])[0])
                    break
                except:
                    pass
        if len(tl) == 38:
            ip = [4, 10, 16, 22, 28], [6, 12, 18, 24, 30]
        else:
            ip = [4, 12, 20, 28, 36], [7, 15, 23, 31, 39]
        if re.search('班 +班 +年 +年', score_sheet):
            output += [int(tl[i]) / 100 for i in ip[0]]
            output += [np.nan] * 5
            output += [int(tl[i]) / 100 for i in ip[1]]
        elif re.search('班 +班 +類 +類', score_sheet):
            output += [int(tl[i]) / 100 for i in ip[0]]
            output += [int(tl[i]) / 100 for i in ip[1]]
            output += [np.nan] * 5
        else:
            pass
        if len(output) == 19:
            output.append('Missing pattern 1')
            return output
    except:
        pass

    '''
    skipped pattern 2
    only has the means of 班, 組, and 年百分, so fill every semester's value with the mean
    There are 22 cases that specify 班百分 for each semester
    ***so many different styles match this pattern, must separate them carefully
    (30 matched)
    '''
    try:
        score_sheet = pdf[2]
        output = []
        output.append(file.split('/')[-3])
        output.append(file.split('/')[-2])
        output += re.findall('姓名:(.*?) ', score_sheet)
        output += re.findall('^ +(.*?) ', score_sheet)
        # ... so many different styles match this pattern, must separate them carefully
        tl = re.split(' +', re.findall('學業平均(.*?)\n', score_sheet)[0])
        if len(tl) in [18, 23, 24]:
            if len(tl) == 24:
                k = -6
            else:
                k = -5
            if '班級排名百分' in score_sheet:
                ttl = re.findall(' \d\d', re.findall('班級排名百分(.*?)\n', score_sheet)[0])
                output += [float(i) / 100 for i in ttl]
            else:
                output += [float(tl[k]) / 100] * 5
            for p in [-3, -1]:
                output += [float(tl[p]) / 100] * 5
            if len(output) == 19:
                output.append('Missing pattern 2')
                return output
        else:
            pass
    except:
        pass

    '''
    skipped pattern 3
    similar situation to pattern 2
    '''
    try:
        score_sheet = pdf[2]
        output = []
        output.append(file.split('/')[-3])
        output.append(file.split('/')[-2])
        output += re.findall('姓名: (.*?) ', score_sheet)
        output.append(re.findall('^(.*?)\n', score_sheet)[0].replace(' ', ''))
        try:
            tl = re.split(' +', re.findall('學科平均(.*?)\n', score_sheet)[0])
        except:
            tl = re.split(' +', re.findall('智育成績(.*?)\n', score_sheet)[0])
        if len(tl) == 21:
            for p in [-8, -5, -2]:
                output += [float(tl[p]) / 100] * 5
            output.append('Missing pattern 3')
            return output
        else:
            pass
    except:
        pass

    '''
    skipped pattern 4
    10 matched
    '''
    try:
        score_sheet = pdf[2]
        output = []
        output.append(file.split('/')[-3])
        output.append(file.split('/')[-2])
        output += re.findall('姓名:(.*?) ', score_sheet)
        output += re.findall('^ +(.*?) +\w', score_sheet)
        output += [
            float(i) / 100
            for i in re.findall(' (\d\d)', re.findall('班百分比(.*?)\n', score_sheet)[0])
        ]
        output += [np.nan] * 5
        output += [
            float(i) / 100
            for i in re.findall(' (\d\d)', re.findall('年百分比(.*?)\n', score_sheet)[0])
        ]
        if len(output) == 19:
            output.append('Missing pattern 4')
            return output
        else:
            pass
    except:
        pass

    '''
    pattern 5
    '''
    try:
        score_sheet = pdf[2]
        output = []
        output.append(file.split('/')[-3])
        output.append(file.split('/')[-2])
        output += re.findall('姓名:(.*?) ', score_sheet)
        output.append('XXXX')
        try:
            tl = re.split(' +', re.findall('學業平均(.*?)\n', score_sheet)[0])
        except:
            tl = re.split(' +', re.findall('學業平均(.*?)\n', pdf[3])[0])
        if '\x1b' in tl:
            tl.remove('\x1b')
        ip = [8, 13, 18, 23, 28], [10, 15, 20, 25, 30]
        if len(tl) == 31:
            output += [round(float(tl[i]) / 100, 2) for i in ip[0]]
            output += [round(float(tl[i]) / 100, 2) for i in ip[1]]
            output += [np.nan] * 5
            output.append('Missing pattern 5')
            return output
    except:
        pass

    '''
    pattern 6
    9 matched
    '''
    try:
        score_sheet = pdf[2]
        output = []
        output.append(file.split('/')[-3])
        output.append(file.split('/')[-2])
        output += re.findall('姓名:(.*?) ', score_sheet)
        school = re.findall('^ +(.*?)\n', score_sheet)
        if '列印日期' in school[0]:
            raise AssertionError("text")
        output += school
        tl = re.split(' +', re.findall('學業成績(.*?)\n', score_sheet)[0])
        if len(tl) == 13:
            for j in [round(eval(i), 2) for i in tl if '/' in i]:
                output += [j] * 5
            output.append('Missing pattern 6')
            return output
    except:
        return False
    part = part.replace(".", ",")
    return part

# get the file names from the input folder
dirIn = '/home/labcsjt/doi_in/'
dirOut = '/home/labcsjt/doi_out/'
listaPdf = os.listdir(dirIn)
relatorio = ''
for indPdf in range(len(listaPdf)):
    arq = listaPdf[indPdf]
    if arq.find('.pdf') < 0:
        continue

    # open the input file and read the PDF
    entrada = open(dirIn + arq, 'rb')
    pdf = pdftotext.PDF(entrada)
    texto = "\n\n".join(pdf)
    entrada.close()

    # check whether the file is a DOI declaration
    doi = re.findall(r'DOI - Declaração (.*?) Operações Imobiliárias', texto, re.M | re.I | re.S)
    if doi[0] != 'sobre':
        continue

    # create the output file
    saida = open(dirOut + arq[0:arq.find('.')] + '.txt', 'w')

    # find all the DOI sections
    quadros = re.findall(
        r'01 Identificação do Cartório(.*?)02 Identificação da Operação(.*?)03 Identificação do\(s\) Alienante\(s\)(.*?)04 Identificação do\(s\) Adquirente\(s\)(.*?)05 Informações sobre a Alienação(.*?)06 Informações sobre o Imóvel(.*?)Página',
def complete_patterns(file):
    with open(file, "rb") as f:
        pdf = pdftotext.PDF(f)

    '''
    handling ocr-required files
    '''
    if pdf[2] == '':
        # print(f'{file}: This file is protected, no way to parse except ocr!')
        return 'Ocr required'
    if ('\x10' in pdf[2]) or ('\u2e4e' in pdf[2]):
        # print(f'{file}: This file has severe codex issue, no way to parse except ocr!')
        return 'Ocr required'

    '''
    pattern 1
    EX: 惠文高中
    '''
    try:
        output = []
        output.append(file.split('/')[-3])
        output.append(file.split('/')[-2])
        score_sheet = pdf[2]
        output += re.findall('姓名:(\w{2,3})', score_sheet)
        output.append(re.findall('^(.*?)\n', score_sheet)[0].replace(' ', ''))
        for s in ['班排', '群排', '年排']:
            output += [
                round(eval(i), 3)
                for i in re.findall('\d{1,3}/\d{2,3}', re.findall(s + '(.*?\n)', score_sheet)[0])
            ]
        output.append('Complete pattern 1')
        return output
    except:
        pass

    '''
    pattern 2
    ex: 臺北市立和平高級中學
    '''
    try:
        output = []
        output.append(file.split('/')[-3])
        output.append(file.split('/')[-2])
        score_sheet = pdf[2]
        output.append(re.findall('姓 名(.*?)班', score_sheet)[0].replace(' ', ''))
        school = score_sheet[20:80].replace(' ', '')
        if school == '':
            school = re.findall('目前成績(.*?)申請入學', score_sheet)[0].replace(' ', '')
        output.append(school)
        try:
            L = re.split(' +', re.findall('\n學業成績(.*?)\n', score_sheet)[0])[1:]
        except:
            L = re.split(' +', re.findall('\n學業成績(.*?)\n', pdf[3])[0])[1:]
        try:
            PL = re.split(' +', re.findall('\n總人數(.*?)\n', score_sheet)[0])[1:]
        except:
            PL = re.split(' +', re.findall('\n總人數(.*?)\n', pdf[3])[0])[1:]
        RL = [
            L[i] for i in [1, 2, 3, 8, 9, 10, 15, 16, 17, 22, 23, 24, 29, 30, 31]
        ]
        TL = [round(eval(i + '/' + j), 3) for i, j in zip(RL, PL)]
        for i in range(3):
            output += TL[i::3]
        output.append('Complete pattern 2')
        return output
    except:
        pass

    '''
    pattern 3
    ex: 高雄市立中正高級中學
    '''
    try:
        output = []
        output.append(file.split('/')[-3])
        output.append(file.split('/')[-2])
        score_sheet = pdf[2]
        name = re.findall('姓名: (\w*?) ', score_sheet)
        if name:
            output += name
        else:
            output.append('XXX')
        output.append(re.findall('^(.*?)\n', score_sheet)[0].replace(' ', ''))
        output += [
            round(eval(i), 3) for i in re.findall(
                '\d{1,3}/\d{2,3}', re.findall('班級人數 百分比(.*?)符號註記', score_sheet)[0])
        ]
        output += [
            round(eval(i), 3) for i in re.findall(
                '\d{1,3}/\d{2,3}', re.findall('類組排名/類組人數(.*?)為不及格', score_sheet)[0])
        ]
        output += [
            round(eval(i), 3) for i in re.findall(
                '\d{1,3}/\d{2,3}', re.findall('年級排名/年級人數(.*?)M 為重修', score_sheet)[0])
        ]
        output.append('Complete pattern 3')
        return output
    except:
        pass

    '''
    pattern 4
    score sheets whose Chinese text gets scrambled when copied and pasted
    '''
    try:
        score_sheet = pdf[2]
        output = []
        output.append(file.split('/')[-3])
        output.append(file.split('/')[-2])
        output.append(re.findall('姓名:(.*?)\n', pdf[0])[0].replace(' ', ''))
        output += re.findall('學校:(\w*?) ', score_sheet[0:100])
        try:
            for p in ['班', '組', '年級']:
                output += [
                    round(float(i) / 100, 2) for i in re.findall(
                        '\d{1,3}\.\d{2,3}', re.findall(f'{p}百身比(.*){p}百身比', score_sheet)[0])
                ]
            output.append('Complete pattern 4-1')
        except:
            for p in ['班', '組', '年級']:
                if p == '班':
                    output += [
                        round(float(i) / 100, 2) for i in regex.findall(
                            ' (\d.*?) ',
                            re.findall(f'{p}百分比(.*){p}百分比', score_sheet)[0],
                            overlapped=True)[1::2]
                    ]
                else:
                    output += [
                        round(float(i) / 100, 2) for i in re.findall(
                            ' (\d.*?) ', re.findall(f'{p}百分比(.*){p}百分比', score_sheet)[0])
                    ]
            output.append('Complete pattern 4')
        return output
    except:
        pass

    try:
        score_sheet = pdf[2]
        output = []
        output.append(file.split('/')[-3])
        output.append(file.split('/')[-2])
        output.append(re.findall('姓名:(.*?)\n', pdf[0])[0].replace(' ', ''))
        output += ['XXX']
        output += [
            round(float(i) / 100, 2) for i in re.findall(
                '\d{1,3}\.\d{2,3}', re.findall('班百身比(.*)班百身比', score_sheet)[0])[1::2]
        ]
        output += [
            round(float(i) / 100, 2)
            for i in re.findall(' (\d.*?) ', re.findall('組百身比(.*)組百身比', score_sheet)[0])
        ]
        for p in ['學級', '學國', '學揚']:
            try:
                output += [
                    round(float(i) / 100, 2) for i in re.findall(
                        ' (\d.*?) ', re.findall(f'{p}百身比(.*){p}百身比', score_sheet)[0])
                ]
                if len(output) != 19:
                    # print('Very special case...')
                    return 'skipped'
                output.append('Complete pattern 4-2')
                return output
            except:
                pass
    except:
        pass

    '''
    pattern 5
    '''
    try:
        score_sheet = pdf[2]
        output = []
        output.append(file.split('/')[-3])
        output.append(file.split('/')[-2])
        output += re.findall('姓名:(\w*?) ', score_sheet)
        output.append(re.findall('(.*?)學生', score_sheet)[0].replace(' ', ''))
        if '類組(科別)排名' not in score_sheet:
            score_sheet = pdf[3]
        for p in ['班級', '類組\(科別\)', '年級']:
            output += [
                round(eval(i), 2) for i in re.findall(
                    '\d{1,3}/\d{2,3}', re.findall(p + '排名(.*?)\n', score_sheet)[0])
            ]
        output.append('Complete pattern 5')
        return output
    except:
        pass

    '''
    pattern 6
    '''
    try:
        score_sheet = pdf[2]
        output = []
        output.append(file.split('/')[-3])
        output.append(file.split('/')[-2])
        output += re.findall('姓名:(\w.*?) ', score_sheet)
        output += ['XXX']
        temp = []
        for i in range(1, 4):
            tl = re.split(
                ' +', re.findall('學業平均(.*?)\n', score_sheet)[0].split('│')[i])
            if i == 3:
                temp += [tl[j] for j in [4, 8, 6]]
            else:
                temp += [tl[j] for j in [4, 8, 6, 11, 15, 13]]
        for i in range(3):
            output += [round(float(i) / 100, 2) for i in temp[i::3]]
        output.append('Complete pattern 6')
        return output
    except:
        pass

    '''
    pattern 7
    '''
    try:
        score_sheet = pdf[2]
        output = []
        output.append(file.split('/')[-3])
        output.append(file.split('/')[-2])
        output += re.findall('姓名: (\w*?) ', score_sheet)
        output.append(re.findall('^(.*?)\n', score_sheet)[0].replace(' ', ''))
        for p in ['班級', '學程', '年級']:
            output += [
                round(eval(i), 2) for i in re.findall(
                    '\d{1,3}/\d{2,3}', re.findall(p + '排名(.*?)\n', score_sheet)[0])
            ]
        output.append('Complete pattern 7')
        return output
    except:
        pass

    '''
    pattern 8
    a format very similar to the dominant skipped ones; however, it contains one
    additional line that shows all the required info
    '''
    try:
        score_sheet = pdf[2]
        output = []
        output.append(file.split('/')[-3])
        output.append(file.split('/')[-2])
        output.append(re.findall('姓名:(\w*?) ', score_sheet)[0])
        output.append(re.findall('^(.*?)\n', score_sheet)[0].replace(' ', ''))
        for p in ['班', '類', '校']:
            output += [
                int(i) / 100 for i in re.findall(
                    ' (\d\d) ', re.findall('各學期成績校排(.*?)\n', score_sheet)[0])
            ]
        output.append('Complete pattern 8')
        return output
    except:
        pass

    '''
    Skipped files
    '''
    score_sheet = pdf[2]
    for p in [
            '個人成績單暨班級百分比對照表', '個人成績單暨百分比對照表', '個人成績單暨類組百分比對照表',
            '學生個人成績單暨百分比對', '個人成績單暨年級百分比對照表'
    ]:
        if p in score_sheet[0:120]:
            # print('This is a type of score sheet that does not contain enough info')
            return 'skipped'
    if (' 成績證明書\n' in score_sheet[0:120]) or \
            ('學生個人成績證明書\n' in score_sheet[0:120]) \
            or ('學 生 成 績 表' in score_sheet) \
            or ('成績一覽表' in score_sheet) \
            or ('學生個人成績單\n' in score_sheet[0:120]) \
            or ('成 績 報 告 單' in score_sheet[0:300]) \
            or ('歷年成績單-補考、重修後' in score_sheet[:300]) \
            or ('桃園市新興高級中等學校' in score_sheet[:300]):
        # print('This is a type of score sheet that does not contain enough info')
        return 'skipped'

    '''
    Nothing matched at all, return False
    '''
    # print('No pattern matched!')
    return False
#=======================================================================================================================
print('Init:')
#=======================================================================================================================

file_pdf_input = 'resources/pdf-simple.pdf'
file_pdf_output = 'resources/pdf-simple-out.pdf'

# debug = True
debug = False

# line_tag_separate_init = '__________________________________________________'
line_tag_separate_done = '^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^'

pdf_input1 = None
with open(file_pdf_input, "rb") as in_f:
    pdf_input1 = pdftotext.PDF(in_f)

with open(file_pdf_input, "rb") as in_f:
    pdf_input2 = PyPDF2.PdfFileReader(in_f)
    pdf_output = PyPDF2.PdfFileWriter()
    pdf_blank = None
    pdf_blank_page = None

    page_box = 639
    page_box_add = 0
    # page_box_add = 100

    page_num = pdf_input2.getNumPages()
    print("Read document %s ..." % file_pdf_input)
    print("Document has %s pages." % page_num)
def read_pdf(pdf_file):
    """
    Take in Voucher PDFs and parse them as either international or domestic,
    and return a unified response.
    """
    domestic_settings_keys = [
        "VOUCHER_DOMESTIC_EMPLOYEE_KEY",
        "VOUCHER_DOMESTIC_EMPLOYEE_ID_KEY",
        "VOUCHER_DOMESTIC_KEY",
        "VOUCHER_DOMESTIC_COURSE_KEY",
        "VOUCHER_DOMESTIC_DATES_KEY",
    ]
    international_settings_keys = [
        "VOUCHER_INTERNATIONAL_EMPLOYEE_KEY",
        "VOUCHER_INTERNATIONAL_EMPLOYEE_ID_KEY",
        "VOUCHER_INTERNATIONAL_DATES_KEY",
        "VOUCHER_INTERNATIONAL_COURSE_NAME_KEY",
        "VOUCHER_INTERNATIONAL_COURSE_NUMBER_KEY",
    ]

    for key in domestic_settings_keys + international_settings_keys:
        if not getattr(settings, key):
            log.warning("Required setting %s missing for read_pdf", key)
            return

    try:
        pdf = pdftotext.PDF(pdf_file)
        if any("Entity Name:" in page for page in pdf):
            values = read_pdf_international(pdf)
            for key in international_settings_keys:
                if not values.get(getattr(settings, key)):
                    return None
            course_id_input = values.get(settings.VOUCHER_INTERNATIONAL_COURSE_NUMBER_KEY)
            return {
                "pdf": pdf_file,
                "employee_id": values.get(settings.VOUCHER_INTERNATIONAL_EMPLOYEE_ID_KEY),
                "voucher_id": None,
                "course_start_date_input": datetime.strptime(
                    values.get(settings.VOUCHER_INTERNATIONAL_DATES_KEY).split(" ")[0],
                    "%d-%b-%Y",
                ).date(),
                "course_id_input": remove_extra_spaces(course_id_input)
                if len(course_id_input) >= 3
                else "",
                "course_title_input": remove_extra_spaces(
                    values.get(settings.VOUCHER_INTERNATIONAL_COURSE_NAME_KEY)),
                "employee_name": values.get(settings.VOUCHER_INTERNATIONAL_EMPLOYEE_KEY),
            }
        else:
            values = read_pdf_domestic(pdf)
            for key in domestic_settings_keys:
                if not values.get(getattr(settings, key)):
                    return None
            course_id_input = values.get(settings.VOUCHER_DOMESTIC_COURSE_KEY).split(" ")[0]
            return {
                "pdf": pdf_file,
                "employee_id": values.get(settings.VOUCHER_DOMESTIC_EMPLOYEE_ID_KEY),
                "voucher_id": values.get(settings.VOUCHER_DOMESTIC_KEY),
                "course_start_date_input": datetime.strptime(
                    values.get(settings.VOUCHER_DOMESTIC_DATES_KEY).split(" ")[0],
                    "%m/%d/%Y",
                ).date(),
                "course_id_input": remove_extra_spaces(course_id_input)
                if len(course_id_input) >= 3
                else "",
                "course_title_input": remove_extra_spaces(
                    " ".join(values.get(settings.VOUCHER_DOMESTIC_COURSE_KEY).split(" ")[1:])),
                "employee_name": values.get(settings.VOUCHER_DOMESTIC_EMPLOYEE_KEY),
            }
    except Exception:  # pylint: disable=broad-except
        log.exception("Could not parse PDF")
        return None
def findPDFMatchesBruteForce(f, text_patterns, env_matches, og_file=None, raw=False):
    '''
    Processes the environments which weren't already matched by deleting them
    from the file and running pdftotext to see the difference with the original.

    Arguments:
        f: file object in read-bytes mode: the pdf to whiteout
        text_patterns: a list of byte strings to whiteout
        env_matches: a list of ranges to test in f
        og_file: f.readlines()
        raw: whether to use pdftotext raw
    '''

    def searchDiff(og_text, new_text, patterns, brute_results):
        '''
        Returns True if a match was found and collects data about which pattern was matched.

        Arguments:
            og_text: a string of the original text
            new_text: a string of the edited text
            patterns: a list of compiled re patterns from text_patterns (not bytes)
            brute_results: a dictionary like {'c': {e: 0 for e in text_patterns}}
                where text_patterns are strings that were compiled into patterns
        '''
        # check to see if the new text has at least one fewer instance of the
        # search pattern
        for i, pattern in enumerate(patterns):
            if len(pattern.findall(og_text)) > len(pattern.findall(new_text)):
                brute_results['c'][text_patterns[i]] += 1
                return True
        return False

    # initialize search items
    brute_search_matches = []
    brute_search_unmatched = []
    # initialize data collector
    brute_results = {'c': {e: 0 for e in text_patterns}}

    # compile re's as strings (output of pdftotext)
    if raw:
        patterns = [
            re.compile(''.join(e.decode('utf-8').split())) for e in text_patterns
        ]
    else:
        patterns = [re.compile(e.decode('utf-8')) for e in text_patterns]

    # Using the pdftotext python library to read text; all manipulations are done
    # in memory so hopefully this is quick.
    # produce original text
    og_text = pdftotext.PDF(f, raw=raw)

    # remove text in each range once, one by one, checking for diffs in each page
    if not og_file:
        f.seek(0)
        og_file = f.readlines()

    tmp_pdf_file = filenames.fileOut(re.sub('.pdf', '_tmp_whiteout.pdf', f.name))
    exists_text = re.compile(rb'[\(<].*?[\)>] *?Tj')

    for rng in env_matches:
        with open(tmp_pdf_file, 'w+b') as g:
            # if the rng has no text objects, skip it
            if not exists_text.search(b''.join(og_file[rng.start:rng.stop])):
                brute_search_unmatched.append(rng)
                continue
            g.writelines([
                replacePDFTextWithSpace(e) if i in rng else e
                for i, e in enumerate(og_file)
            ])
            g.seek(0)
            try:
                tmp_text = pdftotext.PDF(g, raw=raw)
            except pdftotext.Error as e:
                print(f'Warning: pdftotext.Error: {e}')
                brute_search_unmatched.append(rng)
                continue
            try:
                is_match = False
                for i, page in enumerate(og_text):
                    if searchDiff(page, tmp_text[i], patterns, brute_results):
                        brute_search_matches.append(rng)
                        is_match = True
                        # collect the raw text objects that were whited out in this range
                        new_patterns = []
                        for line in og_file[rng.start:rng.stop]:
                            m = replacePDFTextWithSpace(line, just_match=True)
                            if bool(m):
                                new_patterns.append(m)
                        # Exception case for bad pdfs
                        if b' ' in new_patterns:
                            brute_search_unmatched.append(rng)
                            continue
                        new_ranges, _, new_results = findEnvAndMatchRanges(
                            og_file, new_patterns, ['c'], rb'^\d+ \d+ obj', rb'^endobj')
                        [
                            brute_search_matches.append(env_matches.pop(i))
                            for i, r in enumerate(env_matches) if r in new_ranges
                        ]
                        brute_results['c'].update(new_results['c'])
                        break
                    else:
                        pass
                if not is_match:
                    brute_search_unmatched.append(rng)
            except BaseException as e:
                print(f'Warning: {e}')
                brute_search_unmatched.append(rng)

    os.remove(tmp_pdf_file)
    return (brute_search_matches, brute_search_unmatched, brute_results)
    os.unlink(file_path_del)
except Exception as e:
    print(e)

# path = os.path.join(cwd_path, 'pdf', file)
path = file

from PyPDF2 import PdfFileReader
pdf = PdfFileReader(open(path, 'rb'))
pages = pdf.getNumPages()

import pdftotext
with open(file, 'rb') as f:
    pdf = pdftotext.PDF(f, raw=False)

for p in pdf:
    with open(sys.argv[3] + '/firsttext/ne.txt', 'a') as f:
        f.write(p)

file_name1 = sys.argv[3] + '/firsttext/ne.txt'

###########################


def fo(li):
    line = li.replace("$", "").replace(",", "").replace("(", "-").replace(")", "")
    return line
def main():
    """Ouça e Fale App"""
    st.title("Reader & Voice")
    activities = ["Home", "PDF", "TXT", "About"]
    choice = st.sidebar.radio("Home", activities)

    if choice == 'Home':
        st.write("Only files:")
        st.markdown("### PDF or TXT")
        st.write("After uploading you can convert to 7 languages")
        st.markdown("### English, Spanish, French, Italian, Japanese, Russian and Chinese")
        #st.write("Definitions")
        #st.write("PCA is not a statistical method to infer parameters or test hypotheses. Instead, it provides a method to reduce a complex dataset to lower dimension to reveal sometimes hidden, simplified structure that often underlie it.")
        #st.write("")
        #st.write("PCA is a statistical method routinely used to analyze interrelationships among large numbers of objects.")
        #st.write("")
        #st.write("Principal component analysis (PCA) is a mathematical algorithm that reduces the dimensionality of the data while retaining most of the variation in the data set.")

    if choice == 'PDF':
        file = carregar_texto('pdf')
        pdf = pdftotext.PDF(file)
        #for page in pdf:
        #    st.text(page)
        blob = TextBlob(pdf[0])
        st.text(blob)
        st.write(blob.detect_language())
        #dict_idioma_full = lista_idiomas_full()
        #idioma_original = get_value(blob.detect_language(), dict_idioma_full)
        #original_key = get_key(idioma_original, dict_idioma_full)
        #st.success("Original Language" + ": " + idioma_original + " (" + original_key + ")")
        # Original sound
        #play(raw_text, original_key)
        #dict_idioma = lista_idiomas(idioma_original)
        #options = st.multiselect("Choose a language", tuple(dict_idioma.values()))
        #for i in range(len(options)):
        #    value = options[i]
        #    idioma_final_key = get_key(value, dict_idioma)
        #    try:
        #        if (idioma_original != idioma_final_key):
        #            texto_convertido = str(blob.translate(to=idioma_final_key))
        #            st.success("Language" + ": " + value + " (" + idioma_final_key + ")")
        #            st.write(texto_convertido)
        #            #st.text(idioma_final_key)
        #            play(texto_convertido, idioma_final_key)
        #    except:
        #        st.error("ERROR: some languages will fail to play the sound.")
        #play(blob, original_key)
        #convert(blob)
        #except:
        #    st.warning("PDF please")

    if choice == 'TXT':
        try:
            file = carregar_texto('txt')
            blob = TextBlob(file.getvalue())
            st.markdown(blob)
            #play(file.getvalue(), original_key)
            #st.write(blob.detect_language())
            #st.subheader(blob)
            convert(file, blob)
        except:
            st.warning("TXT please")
def extract_page_text(self, page_index):
    with open(self.file_path, 'rb') as pdf_file:
        pdf = pdftotext.PDF(pdf_file)
        return pdf[page_index]
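# The get_num_pages and extract_page_text methods shown earlier appear to belong to a
# small wrapper class around a PDF path. A self-contained sketch of such a wrapper,
# assuming only a file_path attribute; the class name PdfDocument is hypothetical and
# not taken from the original snippets.
import pdftotext

class PdfDocument:
    def __init__(self, file_path):
        self.file_path = file_path

    def get_num_pages(self):
        # the PDF is parsed eagerly, so the page count is available immediately
        with open(self.file_path, 'rb') as pdf_file:
            return len(pdftotext.PDF(pdf_file))

    def extract_page_text(self, page_index):
        # pages are indexed from 0
        with open(self.file_path, 'rb') as pdf_file:
            return pdftotext.PDF(pdf_file)[page_index]

# Example (placeholder filename):
# doc = PdfDocument("example.pdf")
# print(doc.get_num_pages(), doc.extract_page_text(0))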
from progress.bar import Bar
from progress.bar import PixelBar

from classes import (
    Arquivo,
    Material,
    RegexMaterial,
    RegexArquivo,
)

##### Extracting data from the PDF file
pdf_file = "SICRO/GO 10-2020 Relatório Sintético de Materiais.pdf"
with open(pdf_file, "rb") as f:
    cadastro = pdftotext.PDF(f)
num_pages = len(cadastro)

with PixelBar('Extraindo dados do PDF', max=num_pages,
              suffix='%(index)d/%(max)d - %(percent).1f%% - %(eta)ds') as bar:
    lista_material = list()
    for pagina in cadastro:
        linhas_pagina_atual_pdf = pagina.split('\n')
        linhas_pagina_atual_pdf.pop(-2)
        for linha in linhas_pagina_atual_pdf:
            obj_regex = RegexMaterial(linha)
            if (obj_regex.cabecalho is None) and (obj_regex.principal is not None) and (len(obj_regex.principal.groups()) == 4):
#from tkinter import Tk
#from tkinter.filedialog import askopenfilename
import sys

import pdftotext
from gtts import gTTS

#Tk().withdraw()
#filelocation = askopenfilename()
file = sys.argv[1]

with open(file, "rb") as f:
    pdf = pdftotext.PDF(f)

string_of_text = ''
for text in pdf:
    string_of_text += text

final_file = gTTS(text=string_of_text, lang='en')
final_file.save(file + ".mp3")
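# Hypothetical invocation of the script above, assuming it is saved as pdf_to_mp3.py
# (the script name is an assumption). The output lands next to the input as
# <input>.pdf.mp3:
#
#     python pdf_to_mp3.py input.pdf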
import pdftotext
import itertools
import re

# set the path of the pdf file
file = open("./Whiplash.pdf", 'rb')
fileReader = pdftotext.PDF(file)

text = []

# write the contents of the pdf into a txt file, page by page
with open("test.txt", "w") as f:
    for i in fileReader:
        i.replace('\t', '').replace('\n', '').strip()
        f.write(i)

# split the dialogue by character
character = []

# open the txt file saved above
with open('test.txt') as f:
    for line in f:
        if line == '':
            continue
        text.append(line.strip().replace('.', ''))

talk = []

# the character who speaks the first line is specified manually
text = text[text.index('ANDREW'):]

# process the dialogue
for word in text:
    if word.isupper() and len(talk) < 1:
        # print(f"{word}일때 word.isupper()실행")
        name = word
    if len(talk) > 1:
from progress.bar import Bar
from progress.bar import PixelBar

from classes import (
    RegexEquipamento,
    Equipamento,
    RegexArquivo,
    Arquivo,
)

##### Opening the 'onerado' (with payroll charges) PDF file
pdf_file_onerado = "SICRO/GO 10-2020 Relatório Sintético de Equipamentos.pdf"
with open(pdf_file_onerado, "rb") as f_onerado:
    cadastro_onerado = pdftotext.PDF(f_onerado)
num_pages_onerado = len(cadastro_onerado)

##### Extracting data from the 'onerado' PDF
with PixelBar('Extraindo dados do PDF onerado', max=num_pages_onerado,
              suffix='%(index)d/%(max)d - %(percent).1f%% - %(eta)ds') as bar:
    ###### Populating a list with Equipamento instances
    lista_equipamento = list()
    for pagina in cadastro_onerado:
        linhas_pagina_atual_pdf_file_onerado = pagina.split('\n')
        linhas_pagina_atual_pdf_file_onerado.pop(-2)
def test_locked_with_only_user_password_user_unlock(self):
    pdf = pdftotext.PDF(get_file("user_password.pdf"), "user_password")
    self.assertIn("secret", pdf[0])