def convert_SRLDC():
    """Convert yesterday's SRLDC daily report PDF to CSV and load its tables.

    Reads ``SRLDC<dd-mm-YYYY>.pdf`` from the working directory with tabula,
    writes ``SRLDC<dd-mm-YYYY>.csv`` beside it, and materialises each parsed
    table as a DataFrame (the assignment order mirrors the report layout).

    NOTE(review): the per-table DataFrames are local and never returned —
    presumably kept as documentation of the table indices; confirm.
    """
    yesterday = datetime.datetime.now() - datetime.timedelta(days=1)
    print(yesterday.strftime("%d%m%y"))
    ystrday = str(yesterday.strftime("%d-%m-%Y"))
    print(ystrday)
    pdf_name = "SRLDC" + ystrday + ".pdf"
    df = tabula.read_pdf(pdf_name, encoding="cp932", pages="all",
                         multiple_tables=True, lattice=True)
    tabula.convert_into(pdf_name, "SRLDC" + ystrday + ".csv",
                        output_format="csv", multiple_tables=True,
                        lattice=True, pages="all")
    REG_AV = pd.DataFrame(df[0])
    State_Demand = pd.DataFrame(df[1])
    S_D_Energy_Forecast = pd.DataFrame(df[2])
    GEN = pd.DataFrame(df[3])
    GEN_2 = pd.DataFrame(df[4])
    GEN_3 = pd.DataFrame(df[5])
    GEN_4 = pd.DataFrame(df[6])
    IRE_HVDC_Physical_Flows = pd.DataFrame(df[7])
    IRE_SCH_WHELNG_UI_TLCHR = pd.DataFrame(df[8])
    FREQ_Profile = pd.DataFrame(df[9])
    VOL_Critical_Sub_Station = pd.DataFrame(df[10])
    Maj_Res_Particulars = pd.DataFrame(df[11])
    Overdrawls_below_49HZ_Constituents = pd.DataFrame(df[12])
    Overdrawls_below_49HZ_Generators = pd.DataFrame(df[13])
def convert_WRLDC():
    """Convert today's WRLDC daily report PDF to CSV and load its tables.

    Reads ``WRLDC<ddmmYYYY>.pdf`` from the working directory with tabula,
    writes ``WRLDC<ddmmYYYY>.csv`` beside it, and materialises each parsed
    table as a DataFrame.

    BUG FIX: the first table used ``pd.Dataframe`` (wrong capitalisation),
    which raises ``AttributeError`` at runtime; it must be ``pd.DataFrame``.
    """
    today = datetime.datetime.now()
    today = str(today.strftime("%d%m%Y"))
    print(today)
    pdf_name = "WRLDC" + today + ".pdf"
    df = tabula.read_pdf(pdf_name, encoding="cp932", pages="all",
                         multiple_tables=True, lattice=True)
    tabula.convert_into(pdf_name, "WRLDC" + today + ".csv",
                        output_format="csv", multiple_tables=True,
                        lattice=True, pages="all")
    REQ_WR = pd.DataFrame(df[0])  # was pd.Dataframe — AttributeError
    FREQ = pd.DataFrame(df[1])
    G_D_S_in_CA = pd.DataFrame(df[2])
    S_D_M = pd.DataFrame(df[3])
    REG = pd.DataFrame(df[4])
    REG_IPP = pd.DataFrame(df[5])
    REG_IPP_2 = pd.DataFrame(df[6])
    IRE = pd.DataFrame(df[7])
    VOL_Profile_765kV = pd.DataFrame(df[8])
    VOL_Profile_400kV = pd.DataFrame(df[9])
    STOA_BI_PX = pd.DataFrame(df[10])
    State_Genrtrs = pd.DataFrame(df[11])
    State_Genrtrs_2 = pd.DataFrame(df[12])
    Z_C_UI_S = pd.DataFrame(df[13])
def convert_pdf_to_csv(path):
    """Converts the pdf to csv and saves the csv.

    Extracts two fixed page-1 areas into temporary CSVs, joins them side by
    side into ``OUTPUT_NAME``, and deletes the temporaries.

    Parameters
    ----------
    path : str
        The file location of the pdf

    FIX: removed the unused local ``filename_wo_ext``.
    """
    # NOTE(review): tabula-py's keyword is `output_format`, not `format`;
    # csv is the default either way, but confirm `format=` is honoured by
    # the installed tabula-py version.
    tabula.convert_into(path, OUTPUT_NAME1, format='csv', stream=True, pages=1,
                        area=(TOP1, LEFT1, TOP_HEIGHT1, LEFT_WIDTH1))
    tabula.convert_into(path, OUTPUT_NAME2, format='csv', stream=True, pages=1,
                        area=(TOP2, LEFT2, TOP_HEIGHT2, LEFT_WIDTH2))
    df1 = pd.read_csv(OUTPUT_NAME1)
    df2 = pd.read_csv(OUTPUT_NAME2, header=0, names=NAMES)
    pd.concat([df1, df2], axis=1).to_csv(OUTPUT_NAME, index=False)
    # Clean up the intermediate per-area CSVs.
    os.remove(OUTPUT_NAME1)
    os.remove(OUTPUT_NAME2)
def convert_NRLDC():
    """Convert yesterday's NRLDC daily report PDF to CSV and load its tables.

    BUG FIX: the original referenced ``ystrday`` without defining it (it is a
    local of convert_SRLDC, not a global), raising ``NameError``; it is now
    computed locally with the same dd-mm-YYYY format used by convert_SRLDC.
    """
    yesterday = datetime.datetime.now() - datetime.timedelta(days=1)
    ystrday = yesterday.strftime("%d-%m-%Y")
    pdf_name = "NRLDC" + ystrday + ".pdf"
    df = tabula.read_pdf(pdf_name, encoding="cp932", pages="all",
                         multiple_tables=True, lattice=True)
    tabula.convert_into(pdf_name, "NRLDC" + ystrday + ".csv",
                        output_format="csv", multiple_tables=True,
                        lattice=True, pages="all")
    # One DataFrame per report table, in report order.
    R_A_D = pd.DataFrame(df[0])
    S_L_D = pd.DataFrame(df[1])
    S_D_M = pd.DataFrame(df[2])
    REG_ENT = pd.DataFrame(df[3])
    State_ENT = pd.DataFrame(df[4])
    State_ENT_2 = pd.DataFrame(df[5])
    T_Hydro_GEN = pd.DataFrame(df[6])
    T_Ren_GEN = pd.DataFrame(df[7])
    IRE = pd.DataFrame(df[8])
    IR_S_ACT_EX = pd.DataFrame(df[9])
    IR_Analysis = pd.DataFrame(df[10])
    IRE_Nepal = pd.DataFrame(df[11])
    FREQ_Profile = pd.DataFrame(df[12])
    FREQ_Profile_3 = pd.DataFrame(df[13])
    VOL_Profile_400kV = pd.DataFrame(df[14])
    VOL_Profile_765kV = pd.DataFrame(df[15])
    Res_parameters = pd.DataFrame(df[16])
    STOA = pd.DataFrame(df[17])
    STOA_2 = pd.DataFrame(df[18])
    Sys_Rel_Indics = pd.DataFrame(df[19])
    Sys_Rel_Indics_2 = pd.DataFrame(df[20])
    Z_C_violations = pd.DataFrame(df[21])
def convert_into_csv(filenames, output_dir, ext='pdf', table=None):
    """Converts `pdf/xls/xlsx` files to `csv`. Also writes a `csv` file
    from a list.

    Parameters
    ----------
    filenames : list of str
        File names, resolved relative to *output_dir*.
    output_dir : str
        Directory containing (and receiving) the files.
    ext : str, default 'pdf'
        Source extension: 'pdf' uses tabula, 'xls'/'xlsx' use pandas.
    table : list of rows, optional
        When given (non-empty), each name in *filenames* is written as a CSV
        of these rows and no conversion happens.

    BUG FIXES:
    - mutable default argument ``table=[]`` replaced with ``None``;
    - ``filename.replace(ext, 'csv')`` corrupted names containing the
      extension substring (e.g. ``pdf_report.pdf``) — now swaps only the
      real extension via ``os.path.splitext``;
    - csv files are opened with ``newline=''`` as the csv module requires.
    """
    if table:
        for filename in filenames:
            filename = os.path.join(output_dir, filename)
            with open(filename, 'w', newline='') as f:
                writer = csv.writer(f)
                writer.writerows(table)
        return
    if ext == 'pdf':
        for filename in filenames:
            filename = os.path.join(output_dir, filename)
            tabula.convert_into(
                filename,
                os.path.splitext(filename)[0] + '.csv',
                lattice=True,
                pages='all'
            )
    elif ext in ['xls', 'xlsx']:
        for filename in filenames:
            filename = os.path.join(output_dir, filename)
            excel_file = pd.read_excel(filename)
            excel_file.to_csv(os.path.splitext(filename)[0] + '.csv',
                              index=None, header=True)
def read_table(file, user):
    """Convert a bank-statement PDF to CSV and save each transaction row.

    Continuation rows (those with an empty last/balance cell) are merged into
    the preceding row before it is persisted as a StatEntry for *user*; the
    header row itself is only echoed, never saved.

    BUG FIX: ``print f`` is Python-2 syntax and a SyntaxError under Python 3;
    replaced with ``print(f)``. The row accumulator was also renamed so it no
    longer shadows the file handle ``f``.
    """
    tabula.convert_into(file, 'file.csv', output_format='csv', pages='all')
    head = [
        'Txn Date', 'Value Date', 'Description', 'Ref No./Cheque No.',
        'Debit', 'Credit', 'Balance'
    ]
    with open('file.csv') as fh:
        data = list(csv.reader(fh))
    m = len(data[0])  # column count taken from the first row
    i = 0
    while i < len(data):
        merged = data[i]
        i += 1
        # Fold continuation rows (empty balance cell) into the current row.
        while i < len(data):
            rowi = data[i]
            if rowi[-1] != '':
                break
            for j in range(m):
                if rowi[j] != '':
                    merged[j] = merged[j] + ' ' + rowi[j]
            i += 1
        if merged == head:
            print(merged)
        else:
            entry = StatEntry(txn_date=merged[0], val_date=merged[1],
                              description=merged[2], ref_no=merged[3],
                              debit=merged[4], credit=merged[5],
                              balance=merged[6], user=user)
            entry.save()
def df2csv(filename, page):
    """Dump the table(s) on *page* of *filename* to a fixed CSV path via tabula."""
    destination = r"E:\scrapy\json\output.csv"
    tabula.convert_into(filename, destination, output_format="csv", pages=page)
    print("Convert Complete!")
def translatepdf():
    """
    Use tabula to convert pdf file into .csv

    Takes the newest file in Downloads, asks the user for a name and school
    code, converts a fixed page area to a Work_Order CSV, and hands it off
    to executeAutomation.

    BUG FIXES:
    - ``validateFile.lower() != None`` was always true (str.lower() never
      returns None); now tests for a non-empty name, so an empty reply exits
      as the else-branch intended;
    - the bare ``except:`` is narrowed to ``except Exception``.
    """
    list_of_files = glob.glob('/Users/shaneshimizu/Downloads/*')
    latest_file = max(list_of_files, key=os.path.getctime)
    validateFile = input("name this file: ")
    schoolCode = input("School Code: ")
    os.system(f"open '{latest_file}'")
    if validateFile:
        dateRange = validateFile
        pathToFile = f'/Users/{username}/Desktop/service_billings/tabula_csv/'
        fileName = f'Work_Order_{dateRange}.csv'
        joinPath = os.path.join(pathToFile, fileName)
        # read recent file in download, read only set area, output into a csv file
        try:
            data = read_pdf(latest_file, pages='all')
            tabula.convert_into(latest_file, joinPath, guess=False, stream=True,
                                area=(18.05, 17.9, 568.49, 756.57),
                                output_format="csv", pages='all')
        # most recent file is not an accepted file for conversion
        except Exception:
            print("not a fleet report, check recent download")
            return
        workBookName = joinPath
        time.sleep(1)
        executeAutomation(workBookName, schoolCode, dateRange)
    else:
        exit()
def translatepdf():
    """
    use tabula to convert pdf file into .csv

    Fuel-report variant: converts the newest Downloads file to a Fuel CSV and
    hands it off to executeAutomation.

    NOTE(review): this redefines ``translatepdf`` (also defined elsewhere in
    this file) — the later definition wins at import time; confirm intent.

    BUG FIXES: always-true ``validateFile.lower() != None`` replaced with a
    non-empty check; bare ``except:`` narrowed to ``except Exception``.
    """
    list_of_files = glob.glob(
        f'/Users/{username}/Downloads/*'
    )  # * means all if need specific format then *.csv
    latest_file = max(list_of_files, key=os.path.getctime)
    validateFile = input("name this file: ")
    schoolCode = input("School Code: ")
    os.system(f"open '{latest_file}'")
    if validateFile:
        dateRange = validateFile
        # read recent file in download, read only set area, output into a csv file
        try:
            pathToFile = f'/Users/{username}/Desktop/service_billings/tabula_csv/'
            fileName = f'Fuel_{dateRange}.csv'
            joinPath = os.path.join(pathToFile, fileName)
            data = read_pdf(latest_file, pages='all')
            tabula.convert_into(latest_file, joinPath,
                                output_format="csv", pages='all')
        # most recent file is not an accepted file for conversion
        except Exception:
            print("not a fleet report, check recent download")
            return
        workBookName = joinPath
        time.sleep(1)
        executeAutomation(workBookName, schoolCode, dateRange)
    else:
        exit()
def getTablepdf():
    """Convert every page of the subway PDF into a single CSV with tabula."""
    source_pdf = 'C:/Users/wenji/Desktop/subway.pdf'
    target_csv = "C:/Users/wenji/Desktop/result/output.csv"
    tabula.convert_into(source_pdf, target_csv, output_format="csv",
                        pages="all", multiple_tables=True)
def extract_info(tiliote):
    """Extract transaction lines from a Säästöpankki statement CSV.

    Converts ``stili.pdf`` to ``sptili.csv`` once (if absent), regex-scans the
    CSV for lines carrying an amount, and prints the first two digits of each
    match's first capture group.

    NOTE(review): *tiliote* is accepted but never used, and the PDF name
    ("stili.pdf") does not match the CSV checked for ("sptili.csv") —
    confirm both against the caller.
    """
    tapahtumat_list = []
    if not path.exists("sptili.csv"):
        tabula.convert_into("stili.pdf", "sptili.csv",
                            output_format="csv", pages='all')
    with open("sptili.csv", "r") as f:
        read_file = f.read()
    # read only lines with an amount
    tapahtumat_object = re.compile(
        r'\d*,(\d{4})(.*),,\d+,,"(\d*\.?\d*\.?\d{1,3},\d{2}[+-])"'
    )
    for match in tapahtumat_object.finditer(read_file):
        print("function", match)
        tapahtumat_list.append(
            [match.group(g) for g in range(1, len(match.groups()) + 1)]
        )
    for i in tapahtumat_list:
        print(i[0][:2])
def get_tables():
    """Export every table of the module-level `file` PDF to tables/output.csv.

    (A camelot-based export was previously considered; tabula is used instead.)
    """
    tabula.convert_into(file, "tables/output.csv",
                        output_format='csv', pages='all')
def with_table(path, out, png_path, page='all'):
    """Extract the table from a PDF, clean it, save CSV + rendered PNG.

    Parameters
    ----------
    path : str      input PDF
    out : str       CSV path (written twice: raw, then cleaned)
    png_path : str  where the rendered table image is saved
    page : str/int  tabula page selector

    Returns the cleaned DataFrame, or an error string when parsing fails or
    no table is detected (note: the 'parasble' typo in that string is kept
    byte-for-byte because callers may compare against it).

    BUG FIX: ``pd.read_csv(..., header=-1)`` is rejected by pandas >= 0.24;
    ``header=None`` is the documented way to say "no header row".
    """
    try:
        tabula.convert_into(path, out, output_format="csv", pages=page)
    except Exception:
        return 'This pdf is not parasble!'
    df = pd.read_csv(out, encoding='utf-8', header=None)
    # Drop junk rows/columns identified by the module helpers, then re-pack.
    df = df.drop(cleanRows(df), axis=0)
    df = df.reset_index()
    del df['index']
    df = df.drop(longColumns(df), axis=1)
    df = df.drop(longRows(df), axis=0)
    df = move_nans(df)
    df.to_csv(out, encoding='utf-8', header=0, index=False)
    row, column = firstCell(df)
    print(row, column)
    # The first data cell must sit within the top-left 2x2 region,
    # otherwise we assume no real table was found.
    if not (3 > row > 0 and 3 > column > 0):
        return 'No table is detected in the pdf.'
    render_mpl_table(df, header=(row, column), col_width=1.8)
    plt.savefig(png_path)
    return df
def pdf_scanner(file_path, scanner="pdfplumber", verbose=False):
    """Scan a PDF into CSV: tabula when requested, otherwise manual scan.

    (Original Danish docstring: if tabula cannot scan the pdf, pdfplumber
    is tried instead.) Returns an error string on failure, None on success.
    """
    if scanner == "tabula":
        try:
            # NOTE(review): `all=True` is not a documented convert_into
            # option — presumably pages='all' already covers the intent;
            # confirm against the installed tabula-py version.
            tabula.convert_into(file_path, file_path + ".csv",
                                all=True, pages='all')
        except Exception as exc:
            if verbose:
                print(
                    'Exception - Fejl i konvertering af pdf til csv-fil (pdfplumber): %s'
                    % (exc))
            return "Fejl i tabula skriving af csv fil"
    else:
        try:
            manuel_skanning(file_path, verbose)
        except Exception as exc:
            if verbose:
                print(
                    'Exception - Fejl i konvertering af pdf til csv-fil (pdfplumber): %s'
                    % (exc))
            return "Fejl i manuel skriving af csv fil"
def it_new_data():
    """Fetch Italy's latest regional COVID table and return it as a DataFrame."""
    from tabula import convert_into
    import pandas as pd

    ## Somehow tabula did not manage to extract,
    ## but it can convert into csv directly
    source_url = it_link_update()
    convert_into(source_url, "./granular_cases_europe/it_tmp.csv",
                 output_format="csv")
    italy_updated = pd.read_csv('./granular_cases_europe/it_tmp.csv',
                                skiprows=2, thousands='.')
    # Everything from the first "TOTALE" marker onward is an aggregate
    # footer — drop those rows.
    region_col = italy_updated.iloc[:, 0]
    totals = [idx for idx in range(len(region_col)) if region_col[idx] == "TOTALE"]
    italy_updated = italy_updated.drop(
        range(totals[0], len(italy_updated.iloc[:, 0])))
    italy = pd.DataFrame({
        "country": "Italy",
        "region": italy_updated.iloc[:, 0],
        "confirmed_infected": italy_updated.iloc[:, 7],
        "dead": italy_updated.iloc[:, 6],
        "recovered": italy_updated.iloc[:, 5]
    })
    return italy
def PDFcheck(mail, _verbose=None):
    """Google-dork PDFs on *mail*'s domain and print e-mail addresses found.

    For up to 5 search hits: downloads the PDF, extracts pages 1-2 to text
    with tabula, regex-scans for addresses, prints each, and removes the
    temporary files.

    FIXES: file handles are now managed with ``with`` (they leaked when an
    exception fired mid-loop); the pointless ``open('data.pdf', 'rb')`` that
    was never read is removed; the empty-result guard (``findPDFs[0]`` inside
    a swallow-all try) is replaced by plainly iterating the (possibly empty)
    match list.
    """
    domain = mail.split("@")[1]
    term = "site:" + domain + " filetype:PDF intext:" + '"' + "email" + '"'
    try:
        data = search(term, num_results=5)
        for i in data:
            r = requests.get(i, stream=True)
            with open('data.pdf', 'wb') as f:
                f.write(r.content)
            for pageNumber in range(1, 3):
                tabula.convert_into("data.pdf", "out.txt",
                                    pages=pageNumber, silent=True)
                with open("out.txt", "r", encoding="utf-8") as txt:
                    read = txt.read()
                findPDFs = re.findall(r'[\w\.-]+@[a-z0-9\.-]+', read)
                for pdfs in findPDFs:
                    print(pdfs)
            if os.path.exists("data.pdf"):
                os.remove("data.pdf")
            if os.path.exists("out.txt"):
                os.remove("out.txt")
    except Exception:
        # boundary catch: any network/tabula failure ends the whole search
        print("PDF Search error!")
def downPDF(request):
    """Download the vessel-schedule PDF, convert it to CSV, record both in the DB.

    Returns the CSV path saved on the SchedFILE record.
    """
    # building a unique name for the schedule file being downloaded
    fname = '{}{}{}'.format("sched/schedule_",
                            str(datetime.now().strftime("%Y%d%m")),
                            randomString().upper())
    # file name definition
    fnameCSV = os.path.join(str(settings.MEDIA_ROOT),
                            '{}{}'.format(fname, '.csv'))
    print('fnameCSV > {}'.format(fnameCSV))
    fnamePDF = os.path.join(str(settings.MEDIA_ROOT),
                            '{}{}'.format(fname, '.pdf'))
    print('fnamePDF > {}'.format(fnamePDF))
    # URL from where the FILE will be downloaded
    url = "http://localhost:9003/"
    # production endpoint:
    # url = "http://webservices.globalterminalscanada.com/sites/default/files/DPVesselSchedule.pdf"
    # 1) save the PDF file
    urlretrieve(url, fnamePDF)
    # 2) convert PDF into CSV file
    tabula.convert_into(fnamePDF, fnameCSV, output_format="csv", pages='all')
    # 3) Saving file names to Database
    dwnld = SchedFILE(uploaded_by=request.user,
                      fnamePDF=fnamePDF,
                      fnameCSV=fnameCSV)
    dwnld.save()
    return dwnld.fnameCSV
def pdf_to_csv(download_path):
    """Return a DataFrame built from the newest PDF matching *download_path*.

    The newest PDF (by ctime) under the glob pattern is converted with
    tabula to a CSV saved alongside it, then read back with pandas.

    Parameters
    ----------
    download_path : str
        Glob pattern for the download folder.

    Returns
    -------
    df : pandas.DataFrame
        DataFrame generated from the PDF file.
    """
    candidates = glob.glob(download_path)
    newest_pdf = max(candidates, key=os.path.getctime)
    newest_csv = newest_pdf.replace('.pdf', '.csv')
    tabula.convert_into(newest_pdf, newest_csv,
                        pages="all", output_format="csv")
    return pd.read_csv(newest_csv)
def translatepdf():
    """
    use tabula to convert pdf file into .csv

    Fleet-invoice variant: converts a fixed area of the newest Downloads
    file into Fleet_Invoice_<name>.csv and hands it to executeAutomation.

    BUG FIXES: always-true ``validatefile.lower() != None`` replaced with a
    non-empty check (so an empty reply actually exits); bare ``except:``
    narrowed to ``except Exception``.
    """
    list_of_files = glob.glob(
        '/Users/shaneshimizu/Downloads/*'
    )  # * means all if need specific format then *.csv
    latest_file = max(list_of_files, key=os.path.getctime)
    validatefile = input("name this file: ")
    print("\n")
    if validatefile:
        dateRange = validatefile
        try:
            data = read_pdf(latest_file, pages='all')
            tabula.convert_into(latest_file, f'Fleet_Invoice_{dateRange}.csv',
                                guess=False, stream=True,
                                area=(54.85, 15.76, 775.07, 595.18),
                                output_format="csv", pages='all')
        except Exception:
            print("not a fleet report, check recent download")
            return
        workbookname = f'Fleet_Invoice_{dateRange}.csv'
        executeAutomation(workbookname)
    else:
        exit()
def pdf2csv(input_file, output_file, verbose=False):
    """Write every table found in a PDF to a CSV file.

    Parameters
    ----------
    input_file : str
        name of input PDF file
    output_file : str
        name of output csv file
    verbose : bool, default=False
        turn command-line output on or off

    Returns
    -------
    nothing
    """
    import tabula

    tabula.convert_into(input_file, output_file,
                        output_format='csv', pages='all')
    if verbose:
        print(f' Wrote table(s) in {input_file} to {output_file}')
    return
def oldPdfToCsv(inp_file):
    """Extract the table on page 11 of *inp_file* into ``out2.csv``.

    The combined "number name" first column is split via ``num_name()`` and
    the remaining columns of interest are carried over before rewriting
    the CSV (no header, no index).

    FIXES: removed eight unused locals (top/left/width/height and the
    y1/x1/y2/x2 crop box derived from them — the box was never passed to
    tabula; re-add an ``area=`` argument if cropping was intended) and the
    thrice-repeated hard-coded filename is now a single variable.
    """
    out_file = 'out2.csv'
    tabula.convert_into(inp_file, out_file, stream=True,
                        output_format='csv', pages="11")
    df = pd.read_csv(out_file, header=None)
    # Split column 0 into its components, then append the kept columns.
    ndf = df[0].apply(lambda x: pd.Series(num_name(x)))
    ndf[[2, 3, 4, 5, 6]] = df[[1, 2, 3, 5, 6]]
    ndf.to_csv(out_file, index=False, header=False)
def pdf2Csv(pdfpath, fileName, output_path):
    """Convert a PDF to CSV, move stray dates into column 0, write ``b.csv``.

    BUG FIX: ``new_row[0] == column`` was a no-op comparison; the date found
    in column 1 is meant to be *assigned* into column 0 (``=``), with the
    original cell blanked.

    NOTE(review): ``readCsv(output_path + fileName)`` omits the '.csv'
    suffix used when writing ``csvName`` — confirm against readCsv().
    """
    pdfName = pdfpath + fileName
    csvName = output_path + fileName + '.csv'
    # Read pdf into DataFrame
    df = tabula.read_pdf(pdfName, pages='all')
    # convert PDF into CSV
    tabula.convert_into(pdfName, csvName, output_format="csv", pages='all')
    data = readCsv(output_path + fileName)
    new_data = []
    for row in data:
        new_row = []
        for j, column in enumerate(row):
            if j == 1 and isDate(column.lower()):
                new_row[0] = column  # was `==` — silently dropped the date
                column = ""
            new_row.append(column)
        new_data.append(new_row)
    with open('b.csv', 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerows(new_data)
def parse_statement_pdf(creditor, path):
    """Parse a credit-card statement PDF into [date, payee, amount] rows.

    The statement area is exported to a temporary CSV via tabula; rows whose
    first non-empty cell starts with an ``mm/dd`` date are kept, split into
    date + payee, and the amount is parsed from the next cell.

    FIXES: CSV handle managed with ``with``; unused ``err`` binding dropped;
    ``type(x) is str`` replaced with ``isinstance``.
    """
    result = []
    temp_file_path = "test.csv"
    pdf = Pdf(path, creditor)
    pgs = pdf.get_page_range()
    relative_area = pdf.get_relative_area_percentages()
    convert_into(path, temp_file_path, output_format="csv", pages=pgs,
                 guess=False, area=relative_area, relative_area=True)
    with open(temp_file_path, 'r') as f:
        unsanitized = list(csv.reader(f))
    for line in unsanitized:
        sanitized_line = []
        filtered_line = list(filter(None, line))  # removes empty elements
        # Check if transaction entry
        date_name_field = filtered_line[0]
        if isinstance(date_name_field, str):
            date_name_list = date_name_field.split(' ', 1)
            try:
                datetime.datetime.strptime(date_name_list[0], "%m/%d")
                sanitized_line.extend(date_name_list)
                price_field = filtered_line[1]
                # SECURITY: eval() on text scraped from an external PDF can
                # execute arbitrary code — prefer
                # float(price_field.replace(',', '')) once callers are
                # confirmed not to rely on int results.
                sanitized_line.append(eval(price_field.replace(',', '')))
                result.append(sanitized_line)
                print(sanitized_line)
            except ValueError:
                continue
    return result
def extract_info(tiliote, user, date, bank):
    """Convert a bank-statement PDF to CSV and regex-extract its transactions.

    The regex depends on *bank* ("nordea" or "sp"); each matched line yields
    a list of its capture groups, and the lists are returned together.
    """
    csv_path = f"accounts/bank_statements/{date}_{user}_tili.csv"
    tabula.convert_into(tiliote, csv_path, output_format="csv", pages='all')
    with open(csv_path, "r") as f:
        read_file = f.read()
    if bank == "nordea":
        # read only lines with an amount
        tapahtumat_object = re.compile(
            r'(\d{2}\.\d{2}\s\d{2}\.\d{2})(.*?)(,,,,,+)"(\d*\.?\d*\.?\d{1,3},\d{2}[+-])"'
        )
    elif bank == "sp":
        tapahtumat_object = re.compile(
            r'\d*,(\d{4})(.*),,\d+,,"(\d*\.?\d*\.?\d{1,3},\d{2}[+-])"')
    tapahtumat_list = []
    for match in tapahtumat_object.finditer(read_file):
        print("function", match)
        tapahtumat_list.append(
            [match.group(g) for g in range(1, len(match.groups()) + 1)]
        )
    return tapahtumat_list
def convert(inputfilepath, inputfilename):
    '''
    function to convert pdf file to csv
    input => full qualified file name; returns the output csv file name
    '''
    logger.debug("going to convert pdf file to csv")
    fileInputPath = inputfilepath + "/" + inputfilename
    validateFile(fileInputPath)
    createFolder(outputPath)
    logger.debug("pdf file validation done")
    # millisecond timestamp keeps temp/output names unique per call
    milliseconds = int(round(time.time() * 1000))
    fileTempPath = outputPath + "temp_" + str(milliseconds) + ".csv"
    outputFile = inputfilename.split(".")[0] + "_" + str(milliseconds) + ".csv"
    fileOutputPath = outputPath + outputFile
    tabula.convert_into(fileInputPath, fileTempPath, stream=True)
    logger.debug("pdf file to csv done")
    try:
        output = pandas.read_csv(fileTempPath, header=None)
        # Column 2 holds two space-separated fields: split them into
        # columns 2 and 3. (The original spelled the index as [[2][0]],
        # which is just 2.)
        arr = output[2].str.split(" ", n=1, expand=True)
        output[2] = arr[0]
        output[3] = arr[1]
        output.to_csv(fileOutputPath, index=False, header=False)
        logger.debug("csv data is cleaned")
    except Exception as e:
        logger.debug("error while data cleaning" + str(e))
        raise incorrectPdfFile
    return outputFile
def ImportMatricula(ruta, usuario):
    """Import a student's enrolment PDF and create AlumAsig records.

    Converts the PDF at *ruta* to CSV, checks it is a valid enrolment file
    (column 2 header 'DATOS DE MATRICULA'), round-trips it through an .xls
    sheet, and enrols *usuario* in each supported subject not already taken.

    Returns 0 on success, 1 on an invalid file or any exception; always
    removes the temporary files it managed to create and clears usuario.doc.

    NOTE(review): reading the CSV twice (safe/file) looks redundant —
    presumably a validation-then-load split; confirm before simplifying.
    """
    # Creating a pdf file object
    try:
        convert_into(ruta, 'media/conversion.csv', pages='all')
        safe = pd.read_csv('media/conversion.csv', encoding='latin-1')
        logging.debug(str(safe.columns[2]))
        if (str(safe.columns[2]) == 'DATOS DE MATRICULA'):
            file = pd.read_csv('media/conversion.csv', encoding='latin-1')
            file.to_excel('media/conversion.xls', index=False, header=False)
            loc = 'media/conversion.xls'
            wb = xlrd.open_workbook(loc)
            sheet = wb.sheet_by_index(0)
            for i in range(sheet.nrows):
                # subject code = first 8 chars of the row's first cell
                code = str(sheet.cell_value(i, 0))[0:8]
                logging.debug(code)
                asig = Asignatura.objects.filter(sid=code)
                if asig.exists():  ## subject supported by the platform
                    asig = Asignatura.objects.get(sid=code)
                    logging.debug("Asignatura OK")
                    AlAs = AlumAsig.objects.filter(uid=usuario,
                                                   sid=asig,
                                                   passed=False)
                    logging.debug(AlAs.exists())
                    if not AlAs.exists():  ## student not yet enrolled
                        logging.debug(code)
                        logging.debug('No existe esta relacion aun')
                        new = AlumAsig.objects.create(
                            uid=usuario,
                            sid=asig,
                            amount=0,
                            grade=0,
                            passed=False,
                        )
                        new.save()
                    else:
                        logging.debug(
                            "Alumno ya matriculado de esta asignatura.")
            # success path: clean up temporaries and the pending document
            os.remove(ruta)
            os.remove('media/conversion.csv')
            os.remove('media/conversion.xls')
            usuario.doc = None
            usuario.save()
            return 0
        else:
            # wrong header: not an enrolment file
            logging.debug("Matrícula no válida")
            os.remove(ruta)
            os.remove('media/conversion.csv')
            usuario.doc = None
            usuario.save()
            return 1
    except Exception as e:
        # boundary catch: conversion/parse/DB failure — log and report invalid
        logging.debug("Matrícula no válida (EXCEPCION)")
        logging.exception(e)
        os.remove(ruta)
        usuario.doc = None
        usuario.save()
        return 1
def extrPdfFromZipTo2TXT(nameSourse="cmegroup",
                         workFilePDF="Section61_Energy_Futures_Products.pdf",
                         info="Futures",
                         col=-1):
    """Extract a PDF from each source zip and convert it to an .xlsx report.

    For each archive in Source/<nameSourse> whose date-stamped output does
    not already exist in Data/<nameSourse>/<info>, extracts *workFilePDF*,
    converts it to text (fitz) and CSV (tabula), then builds the .xlsx via
    txtToXlsxFut/txtToXlsxOpt depending on *info*. col=-1 processes all
    archives, otherwise only the first *col*.
    (Comments below translated from Russian.)
    """
    # Locate the project directory on this machine
    path = os.getcwd()
    # Relative path of the source directory
    pathDirIn = "\Source" + "\\" + nameSourse
    # Relative path of the output directory
    pathDirOut = "\Data" + "\\" + nameSourse + "\\" + info
    # Build the absolute paths
    pathIn = path + pathDirIn
    pathOut = path + pathDirOut
    # List of files in the source directory
    fileNamesIn = os.listdir(pathIn)
    #print(fileNamesIn)
    fileNamesOut = os.listdir(pathOut)
    # Main loop
    # Counter over files in the directory
    i = 0
    if col == -1:
        maxI = len(fileNamesIn)
    else:
        maxI = col
    # Character offset where the date starts in the archive name
    adressDateFromName = 18
    print("Исходный файл:", workFilePDF)
    while i < maxI:
        # Name of the output file (date stamp + .xlsx)
        fileNameOut = fileNamesIn[i][adressDateFromName:(adressDateFromName +
                                                         8)] + ".xlsx"
        #print(fileNameOut)
        if fileNamesOut.count(fileNameOut) == 0:
            # Source Zip archive
            nameFileZip = pathIn + "\\" + fileNamesIn[i]
            #print(workFileZip)
            # Check that it exists and really is a zip archive
            if zipfile.is_zipfile(nameFileZip):
                # Open the archive for processing
                workFileZip = zipfile.ZipFile(nameFileZip, 'r')
                # Select the file to process
                # Extract the files for processing
                workFileZip.extract(workFilePDF)
                print("Идет создание ", fileNameOut, "исходный архив №", i)
                convert_pdf_to_txtfitz(workFilePDF, "PDFtoTXTpymupdf.txt")
                #pathFileNameOut = path + pathDirOut+"\Futures\\"+fileNameOut
                tabula.convert_into(workFilePDF,
                                    output_path="PDFtoTXTTabula.txt",
                                    output_format="csv",
                                    pages="all")
                if info == "Futures":
                    txtToXlsxFut(pathOut=(pathOut + "\\" + fileNameOut))
                elif info == "Options":
                    txtToXlsxOpt(pathOut=(pathOut + "\\" + fileNameOut))
        i += 1
    print("Переформатирование завершено")
    # Example:
    #extrPdfFromZipTo2TXT(workFilePDF = "Section63_Energy_Options_Products.pdf",col=1,info="Options")
def pdf_to_csv(filepath):
    """Convert a PDF file to CSV so it can be analysed afterwards.

    Parameters
    ----------
    filepath : str
        Path to the PDF file; the CSV lands at the module-level RUTA_CSV,
        after which leer_csv() processes it.
    """
    tabula.convert_into(filepath,
                        RUTA_CSV,
                        pages="all",
                        output_format="csv",
                        guess=False,
                        stream=True)
    leer_csv()
def Convert(self):
    '''This function will handle the converting process of the pdf tables
    using the convert_into function of the tabula-py'''
    try:
        tabula.convert_into("%s" % usefile,
                            output_path="C:/OutputFolder/Output.csv",
                            output_format="csv",
                            pages="%s" % page)
        # show when the file is already converted
        self.statusBar().showMessage('Converted')
    except:
        # user hints for error handling (bare except kept: any Qt/tabula
        # failure must surface as this dialog, as before)
        QMessageBox.information(
            self, "WARNING",
            "Hint:Close the output file or Select the page.")
def pdf_to_text(pdf_path, outfile, columns, output_format='tsv', password=None):
    """Stream-extract every page of *pdf_path* into *outfile* via tabula.

    *columns* gives tabula's column positions; *password* is forwarded only
    when supplied.
    """
    logging.debug("Extracting text from pdf - {}".format(pdf_path))
    extra_options = {"password": password} if password is not None else {}
    tabula.convert_into(pdf_path,
                        outfile,
                        pages="all",
                        stream=True,
                        guess=False,
                        output_format=output_format,
                        columns=columns,
                        **extra_options)
def test_convert_into_exception(self):
    """convert_into must reject unsupported formats and empty/None outputs."""
    pdf_path = 'tests/resources/data.pdf'
    bad_calls = (
        lambda: tabula.convert_into(pdf_path, 'test.csv',
                                    output_format='dataframe'),
        lambda: tabula.convert_into(pdf_path, None),
        lambda: tabula.convert_into(pdf_path, ''),
    )
    for bad_call in bad_calls:
        with self.assertRaises(AttributeError):
            bad_call()
def test_conver_from(self):
    """Round-trip data.pdf through each output format and diff the fixtures.

    (Method name typo 'conver' kept: renaming would change test discovery.)
    """
    pdf_path = 'tests/resources/data.pdf'
    expected = {
        'csv': 'tests/resources/data_1.csv',
        'tsv': 'tests/resources/data_1.tsv',
        'json': 'tests/resources/data_1.json',
    }
    temp = tempfile.NamedTemporaryFile()
    for fmt in ('csv', 'tsv', 'json'):
        tabula.convert_into(pdf_path, temp.name, output_format=fmt)
        self.assertTrue(filecmp.cmp(temp.name, expected[fmt]))
def test_convert_remote_file(self):
    """convert_into should accept an HTTP(S) URI as its input path."""
    uri = ("https://github.com/tabulapdf/tabula-java/raw/master/"
           "src/test/resources/technology/tabula/12s0324.pdf")
    temp = tempfile.NamedTemporaryFile()
    tabula.convert_into(uri, temp.name, output_format='csv')
    self.assertTrue(os.path.exists(temp.name))