def test_read_pdf_with_option(self):
    pdf_path = 'tests/resources/data.pdf'
    expected_csv1 = 'tests/resources/data_1.csv'
    expected_csv2 = 'tests/resources/data_2-3.csv'
    self.assertTrue(
        tabula.read_pdf(pdf_path, pages=1).equals(pd.read_csv(expected_csv1)))
    self.assertTrue(
        tabula.read_pdf(pdf_path, pages='2-3', guess=False).equals(
            pd.read_csv(expected_csv2)))
    self.assertTrue(
        tabula.read_pdf(pdf_path, pages=(2, 3), guess=False).equals(
            pd.read_csv(expected_csv2)))
def san_bernardino_sovc():
    sovc_url = 'http://www.sbcountyelections.com/Portals/9/Elections/2015/0317/SOV%20Book.pdf'
    primary = read_pdf(sovc_url, pages="1-6",
                       pandas_options={'error_bad_lines': False})
    first = primary.loc[(primary.index > 2) & (primary.index < 64)][
        ['Unnamed: 0', 'Unnamed: 5']].rename(columns={'Unnamed: 5': 'votes'})
    second = primary.loc[((primary.index > 67) & (primary.index < 129)) |
                         ((primary.index > 132) & (primary.index < 194)) |
                         ((primary.index > 197) & (primary.index < 243))][
        ['Unnamed: 0', 'Unnamed: 4']].rename(columns={'Unnamed: 4': 'votes'})
    primary = read_pdf(sovc_url, pages="3,5",
                       pandas_options={'error_bad_lines': False})
    third = primary.loc[primary.index > 2][
        ['Unnamed: 0', 'Unnamed: 5']].rename(columns={'Unnamed: 5': 'votes'})
    pcand = 'SHARON RUNNER'
    merged = first.append(second).append(third)
    merged['precinct'] = merged['Unnamed: 0'].apply(
        lambda x: x[len(pcand):][:7] if str(x).startswith(pcand) else str(x)[:7])
    return prepare_output(merged[['precinct', 'votes']], 'San Bernardino',
                          21, ['Sharon Runner'])
def test_read_pdf_with_multiple_areas(self):
    # Original files are taken from https://github.com/tabulapdf/tabula-java/pull/213
    pdf_path = 'tests/resources/MultiColumn.pdf'
    expected_csv = 'tests/resources/MultiColumn.csv'
    expected_df = pd.read_csv(expected_csv)
    self.assertTrue(tabula.read_pdf(
        pdf_path, pages=1,
        area=[[0, 0, 100, 50], [0, 50, 100, 100]],
        relative_area=True).equals(expected_df))
    self.assertTrue(tabula.read_pdf(
        pdf_path, pages=1,
        area=[[0, 0, 451, 212], [0, 212, 451, 425]]).equals(expected_df))
def test_read_pdf_for_multiple_tables(self):
    pdf_path = 'tests/resources/data.pdf'
    expected_csv1 = 'tests/resources/data_1.csv'
    self.assertEqual(
        len(tabula.read_pdf(pdf_path, pages=2, multiple_tables=True)), 2)
    self.assertTrue(tabula.read_pdf(pdf_path, pages=1, multiple_tables=True)[0].equals(
        pd.read_csv(expected_csv1, header=None)))
    with self.assertRaises(pd.errors.ParserError):
        tabula.read_pdf(pdf_path, pages=2)
def read_pdf_page_by_page(pdf_file_path, extraction_path):
    error_pages = []
    # pages = 0
    # try:
    #     pages = PdfFileReader(open(pdf_file_path, 'rb')).getNumPages()
    # except:
    #     print("File can not be read. Please provide a correct file.")
    pages = 5
    for page in range(1, pages):
        try:
            # data = tabula.read_pdf(pdf_file_path, pages=page, silent=True)
            data = tabula.read_pdf(
                pdf_file_path, pages=page, nospreadsheet=True,
                area=(210, 20, 691, 573)
            )
        except Exception:
            error_pages.append("|".join([str(page), str(pages)]))
            print("Reading error -", page)
            continue
        if not data.empty:
            out_path = os.path.join(extraction_path, str(page) + ".csv")
            data.to_csv(out_path, index=False)
            print("Page -", page, "(out of", pages, ")", data.shape, out_path)
        else:
            # Record empty pages in the same "page|pages" format so that
            # writelines() below receives strings rather than tuples.
            error_pages.append("|".join([str(page), str(pages)]))
    error_file = os.path.join(extraction_path, "errors.txt")
    with open(error_file, "w") as f:
        f.writelines(error_pages)
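# The hard-coded `pages = 5` above stands in for the commented-out page-count lookup.
# A minimal sketch of that lookup, assuming the `pypdf` package (the successor to
# PyPDF2, whose PdfFileReader/getNumPages API this snippet referenced) is available:
from pypdf import PdfReader

def count_pdf_pages(pdf_file_path):
    # Return the number of pages, or None if the file cannot be parsed.
    try:
        return len(PdfReader(pdf_file_path).pages)
    except Exception:
        print("File can not be read. Please provide a correct file.")
        return None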
def test_read_pdf_into_json(self):
    pdf_path = 'tests/resources/data.pdf'
    expected_json = 'tests/resources/data_1.json'
    json_data = tabula.read_pdf(pdf_path, output_format='json')
    self.assertTrue(isinstance(json_data, list))
    with open(expected_json) as json_file:
        data = json.load(json_file)
        self.assertEqual(json_data, data)
def read_pdf_whole(pdf_path, extraction_path):
    try:
        # data = tabula.read_pdf(pdf_path, pages="all", silent=True)
        data = tabula.read_pdf(pdf_path, pages=2, silent=False)
        out_file = os.path.basename(pdf_path)[:-4]
        out_file_path = os.path.join(extraction_path, out_file + ".csv")
        data.to_csv(out_file_path, index=False)
        print("File-", out_file_path)
        return True
    except Exception:
        print("Error reading the whole file. Switching to page-by-page mode...")
        return False
def test_read_pdf_with_pandas_option(self):
    pdf_path = 'tests/resources/data.pdf'
    expected_csv1 = 'tests/resources/data_1.csv'
    column_name = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    self.assertTrue(tabula.read_pdf(pdf_path, pages=1,
                                    pandas_options={'header': None}
                                    ).equals(pd.read_csv(expected_csv1, header=None)))
    self.assertTrue(tabula.read_pdf(pdf_path, pages=1,
                                    pandas_options={'header': 0}
                                    ).equals(pd.read_csv(expected_csv1, header=0)))
    self.assertTrue(tabula.read_pdf(pdf_path, pages=1,
                                    pandas_options={'header': 'infer'}
                                    ).equals(pd.read_csv(expected_csv1, header='infer')))
    self.assertTrue(
        tabula.read_pdf(
            pdf_path, pages=1,
            pandas_options={'header': 'infer', 'names': column_name}
        ).equals(pd.read_csv(expected_csv1, header='infer', names=column_name))
    )
    self.assertTrue(
        tabula.read_pdf(
            pdf_path, pages=1, multiple_tables=True,
            pandas_options={'header': 'infer', 'names': column_name}
        )[0].equals(pd.read_csv(expected_csv1, header='infer', names=column_name))
    )
    self.assertTrue(
        tabula.read_pdf(
            pdf_path, pages=1, multiple_tables=True,
            pandas_options={'header': 'infer', 'columns': column_name}
        )[0].equals(pd.read_csv(expected_csv1, header='infer', names=column_name))
    )
def parse_pdf(path, cuatri, indice, out):
    cols = ["Materia", "Curso", "Nombre", "Docentes", "Día", "Inicio",
            "Fin", "Sede", "Vacantes"]
    dfr = read_pdf(path, stream=True, guess=False, pages='all',
                   area=[[82, 14.39, 795.38, 581.8],
                         [42.93, 14.39, 796.43, 581.8]],
                   columns=[14.39, 46.94, 68.05, 197.33, 451.53, 475.29,
                            499.52, 524.23, 556.55])
    dfr = dfr.drop([col for col in dfr.columns if "Unnamed" in col], axis=1)
    dfr.columns = cols
    DIAS = {
        'L': 'Lunes', 'Ma': 'Martes', 'Mi': 'Miércoles',
        'J': 'Jueves', 'V': 'Viernes', 'S': 'Sábados'
    }
    dfr['Día'] = dfr['Día'].apply(lambda x: str(x).strip()).apply(DIAS.get)
    dfr = dfr.drop(dfr[~dfr.Materia.str.isdigit()].index)
    cols_horarios = ["Día", "Inicio", "Fin", "Sede"]
    dfr['Horarios'] = dfr[cols_horarios].apply(lambda x: ' '.join(map(str, x)), axis=1)
    dfr = dfr.drop(cols_horarios, axis=1)

    def tjoin(x, s):
        try:
            return s.join(set(x))
        except Exception:
            return 'Sin especificar'

    dfr = dfr.groupby(["Materia", "Curso"]).agg({
        'Nombre': lambda x: list(x)[0],
        'Docentes': partial(tjoin, s='-'),
        'Vacantes': lambda x: list(x)[0],
        'Horarios': partial(tjoin, s="\n"),
    }).reset_index()
    dfr['Cuatri'] = cuatri
    dfr['Indice'] = dfr.index + indice
    dfr = dfr[['Indice', 'Cuatri', 'Nombre', 'Materia', 'Horarios']]
    dfr.to_csv(out, index=False)
def read_pdf(pdf_file_path):
    pdf_dir = pdf_file_path[-10:-7]
    pdf_dir_path = os.path.join(output_dir, pdf_dir)
    try:
        os.mkdir(pdf_dir_path)
    except OSError:
        pass
    # try:
    #     data = tabula.read_pdf(pdf_file_path, pages="all", silent=True)
    #     out_path = os.path.join(pdf_dir_path, pdf_dir + ".csv")
    #     data.to_csv(out_path, index=False)
    #     print("File-", pdf_file_path)
    # except:
    #     print("Error reading the whole file. Switching to page-by-page mode...")
    #     pass
    error_pages = []
    pages = PdfFileReader(open(pdf_file_path, 'rb')).getNumPages()
    for page in range(pages, pages + 1):
        try:
            data = tabula.read_pdf(pdf_file_path, pages=page, silent=True)
        except Exception:
            error_pages.append("%s|%s" % (page, pages))
            print("Reading error -", page)
            continue
        if not (data is None or data.empty):
            out_path = os.path.join(output_dir, pdf_dir, str(page) + ".csv")
            data.to_csv(out_path, index=False)
            print("Page -", page, "(out of", pages, ")", data.shape, out_path)
        else:
            error_pages.append("%s|%s" % (page, pages))
    error_file = os.path.join(pdf_dir_path, "errors.txt")
    with open(error_file, "w") as f:
        f.writelines(error_pages)
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tabula import read_pdf  # after this: pip install tabula-py
# and install homebrew java (NOT default java)

for year in range(2011, 2019):
    for table in range(21, 30):
        try:
            fname = 'pdfs/table_%d_%d.pdf' % (table, year)
            df = read_pdf(fname)
            df.to_csv('csvs/table_%d_%d.csv' % (table, year))
        except Exception:
            # Skip tables that are missing or that tabula cannot parse
            pass
"income", "profit", "income tax", "expenses", "operat", "year ended", "$M" ] #assets, current assets, balance sheet, cashflow statement pdf_document = fitz.open(financial_report) pages_wterm = [] print(len(pdf_document)) iffound = False for current_page in range(len(pdf_document)): page = pdf_document.loadPage(current_page) for index in range(len(search_term)): if page.searchFor(search_term[index]): iffound = True else: iffound = False break if (iffound): pages_wterm.append(current_page + 1) iffound = False print(pages_wterm) # Read pdf int list of DataFrame df = tabula.read_pdf(financial_report, pages=pages_wterm) # convert PDF into CSV file tabula.convert_into(financial_report, "output.csv", output_format="csv", pages=pages_wterm)
# lkoval
# 6-19-19

from tabula import read_pdf
import pandas as pd
import string

# Table 1.5
temp = read_pdf("document_1374394.pdf", pages="22,23", lattice=True,
                pandas_options={'header': None})
temp["raw_chem_name"] = temp.iloc[:, 1]
temp = temp.drop([0, 15])
temp = temp.dropna(subset=["raw_chem_name"])
temp = temp.reset_index()
temp = temp[["raw_chem_name"]]
chemList = []
clean = lambda dirty: ''.join(filter(string.printable.__contains__, dirty))
for j in range(0, len(temp)):
    temp["raw_chem_name"].iloc[j] = str(
        temp["raw_chem_name"].iloc[j]).strip().lower().replace(
        ".", "").replace("α", "alpha")
    temp["raw_chem_name"].iloc[j] = clean(str(temp["raw_chem_name"].iloc[j]))
    for k in range(0, len(temp["raw_chem_name"].iloc[j].split(","))):
        chemList.append(temp["raw_chem_name"].iloc[j].split(",")[k])
table_1_5 = pd.DataFrame()
table_1_5["raw_chem_name"] = ""
table_1_5["raw_chem_name"] = chemList
def test_read_remote_pdf(self):
    uri = "https://github.com/tabulapdf/tabula-java/raw/master/src/test/resources/technology/tabula/12s0324.pdf"
    df = tabula.read_pdf(uri)
    self.assertTrue(isinstance(df, pd.DataFrame))
def test_read_pdf_exception(self):
    invalid_pdf_path = 'notexist.pdf'
    with self.assertRaises(FileNotFoundError):
        tabula.read_pdf(invalid_pdf_path)
def readpdf(file):
    convert_into(file, file.replace(".pdf", ".json"), output_format="json")
    pdf = read_pdf(file)
    write_to_file(pdf)
'''
S = single family, T_C = townhouse and condo
%Chg = year to year % change
Sales = Closed Sales
'''
column_list = ['Area',
               'Single Family Sales', 'Single Family Sales Y2Y %Chg',
               'Single Family Median Price', 'Single Family Median Price Y2Y %Chg',
               r'Townhouse/Condo Sales', r'Townhouse/Condo Sales Y2Y %Chg',
               r'Townhouse/Condo Median Price', r'Townhouse/Condo Median Price Y2Y %Chg']

for index, pdf_filename in enumerate(all_pdf):
    if pdf_filename[1] != 'Q' and pdf_filename.find('MSA') >= 0 and pdf_filename.find('End') < 0:
        # This is monthly MSA stats
        print('%d - %s' % (index, pdf_filename))
        t1 = tabula.read_pdf(os.path.join(download_folder, pdf_filename), lattice=True)
        t2 = t1.iloc[1:24, [1, 3, 4, 6, 7, 9, 10, 12, 13]]
        t2.columns = column_list
        temp = pdf_filename.split('-')
        t2['Month'] = temp[1] + '-' + month_mapping[temp[0]]
        df_list.append(t2)

# Area column may contain noise
monthly_MSA_df = pd.concat(df_list, axis=0)
monthly_MSA_df.iloc[:, 1] = monthly_MSA_df.iloc[:, 1].str.replace(',', '').astype('float')
monthly_MSA_df.iloc[:, 2] = monthly_MSA_df.iloc[:, 2].str.replace('%', '').astype('float')
monthly_MSA_df.iloc[:, 3] = monthly_MSA_df.iloc[:, 3].str.replace(',', '').str.replace('$', '').astype('float')
monthly_MSA_df.iloc[:, 4] = monthly_MSA_df.iloc[:, 4].str.replace('%', '').astype('float')
monthly_MSA_df.iloc[:, 5] = monthly_MSA_df.iloc[:, 5].str.replace(',', '').astype('float')
def test_read_pdf_file_like_obj(self):
    with open(self.pdf_path, "rb") as f:
        df = tabula.read_pdf(f, stream=True)
        self.assertTrue(len(df), 1)
        self.assertTrue(isinstance(df[0], pd.DataFrame))
        self.assertTrue(df[0].equals(pd.read_csv(self.expected_csv1)))
def test_read_remote_pdf_with_custom_user_agent(self):
    df = tabula.read_pdf(self.uri, user_agent="Mozilla/5.0", stream=True)
    self.assertTrue(len(df), 1)
    self.assertTrue(isinstance(df[0], pd.DataFrame))
def test_read_remote_pdf(self):
    df = tabula.read_pdf(self.uri)
    self.assertTrue(len(df), 1)
    self.assertTrue(isinstance(df[0], pd.DataFrame))
def test_read_pdf(self):
    df = tabula.read_pdf(self.pdf_path, stream=True)
    self.assertTrue(len(df), 1)
    self.assertTrue(isinstance(df[0], pd.DataFrame))
    self.assertTrue(df[0].equals(pd.read_csv(self.expected_csv1)))
def test_read_pdf_with_java_option(self):
    self.assertTrue(
        tabula.read_pdf(
            self.pdf_path, pages=1, stream=True, java_options=["-Xmx256m"]
        )[0].equals(pd.read_csv(self.expected_csv1))
    )
# -*- coding: utf-8 -*-
"""
Created on Wed Oct 21 13:17:36 2020

@author: hmusugu
"""
import tabula
import pandas as pd

df1 = tabula.read_pdf('ethylene-prices.pdf', pages=1, lattice=True)
df2 = tabula.read_pdf('ethylene-prices.pdf', pages=2, lattice=True)
df3 = tabula.read_pdf('ethylene-prices.pdf', pages=3, lattice=True)
df4 = tabula.read_pdf('ethylene-prices.pdf', pages=4, lattice=True)
df5 = tabula.read_pdf('ethylene-prices.pdf', pages=5, lattice=True)
df6 = tabula.read_pdf('ethylene-prices.pdf', pages=6, lattice=True)

list1 = (df1[0], df2[0], df3[0], df4[0], df5[0], df6[0])
final = pd.DataFrame()
final = pd.concat(list1)
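# The six per-page calls above can usually be collapsed into a single call, since
# read_pdf accepts a page range; a minimal sketch, assuming a tabula-py version that
# returns a list of DataFrames (which the df1[0] indexing above already implies):
import tabula
import pandas as pd

# One pass over pages 1-6; one DataFrame per lattice table found
tables = tabula.read_pdf('ethylene-prices.pdf', pages='1-6', lattice=True)
final = pd.concat(tables, ignore_index=True)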
def test_read_pdf_exception(self):
    invalid_pdf_path = 'notexist.pdf'
    with self.assertRaises(subprocess.CalledProcessError):
        tabula.read_pdf(invalid_pdf_path)
def lambda_handler(event, context): # Get the secret sm = boto3.client('secretsmanager') secretobj = sm.get_secret_value(SecretId='ni-covid-tweets') secret = json.loads(secretobj['SecretString']) s3 = boto3.client('s3') messages = [] # Download the most recently updated PDF file for change in event: tmp = tempfile.NamedTemporaryFile(suffix='.pdf') with open(tmp.name, 'wb') as fp: s3.download_fileobj(secret['bucketname'],change['keyname'],fp) # Get the date range covered by the report text = textract.process(tmp.name, method='pdfminer').decode('utf-8') regex = re.compile(r'(\d{1,2})(?:st|nd|rd|th)\s+([A-Z][a-z]+)\s+(\d{4})\s+\–+\s+(\d{1,2})(?:st|nd|rd|th)\s+([A-Z][a-z]+)\s+(\d{4})') start_date = None end_date = None for line in text.split('\n'): m = regex.search(line) if m: start_date = pandas.to_datetime('%s %s %s' %(m.group(1),m.group(2),m.group(3)), format='%d %B %Y').date() end_date = pandas.to_datetime('%s %s %s' %(m.group(4),m.group(5),m.group(6)), format='%d %B %Y').date() break if start_date is None: logging.error('Unable to find start date in report') return { "statusCode": 404, "body": 'Unable to find start date in report %s' %change['url'], } # Get the tables from the report - note that it was not possible to get data from 4th April or earlier due to # tables that will not parse properly in the PDF tables = tabula.read_pdf(tmp.name, pages = "all", multiple_tables = True) tablecount = 0 dataset = pandas.DataFrame() for df in tables: if 'Total' not in df.columns: firstrow = df.iloc[0] newcols = [] for i in range(len(firstrow)): if isinstance(firstrow[i], float) and math.isnan(firstrow[i]): newcols.append(df.columns[i]) else: newcols.append(firstrow[i]) df.columns = newcols df = df[1:] df['Setting'] = df['Setting'].str.strip() df.dropna(axis='index',subset=['Total','Open','Closed'],inplace=True) df['Total'] = df['Total'].astype(int) df['Open'] = df['Open'].astype(int) df['Closed'] = df['Closed'].astype(int) df = df[df['Setting']!='Total'] if tablecount==0: df['Type'] = 'Probable Outbreak' elif tablecount==1: df['Type'] = 'Cluster' else: logging.warning('Unexpected table: %s' %df) tablecount += 1 dataset = pandas.concat([dataset, df]) dataset['Start Date'] = pandas.to_datetime(start_date) dataset['End Date'] = pandas.to_datetime(end_date) week = int((end_date - pandas.to_datetime('1 January 2020', format='%d %B %Y').date()).days / 7) dataset['Week'] = week # Create a simple summary and the tweet text summary = dataset.groupby('Type').sum() tweet = 'NI Contact Tracing reports from %s to %s:\n' %(start_date.strftime('%-d %B %Y'), end_date.strftime('%-d %B %Y')) for Type,data in summary.to_dict('index').items(): tweet += '\u2022 %d %ss (%d open, %d closed)\n' %(data['Total'], Type.lower(), data['Open'], data['Closed']) tweet += '\n%s' %change['url'] # Pull current data from s3 try: obj = s3.get_object(Bucket=secret['bucketname'],Key=secret['pha-clusters-datastore'])['Body'] except s3.exceptions.NoSuchKey: print("The object %s does not exist in bucket %s." 
%(secret['pha-clusters-datastore'], secret['bucketname'])) datastore = pandas.DataFrame(columns=['Week']) else: stream = io.BytesIO(obj.read()) datastore = pandas.read_csv(stream) # Clean out any data with matching dates datastore = datastore[datastore['Week'] != week] # Append the new data datastore = pandas.concat([datastore, dataset]) datastore['Start Date'] = pandas.to_datetime(datastore['Start Date']) datastore['End Date'] = pandas.to_datetime(datastore['End Date']) # Replace any known duplicates datastore['Setting'] = datastore['Setting'].replace({ 'Cinema/ Theatre / Entertainment': 'Cinema / Theatre / Entertainment Venue', 'Cinema/ Theatre / Entertainment Venue': 'Cinema / Theatre / Entertainment Venue', 'Funeral / Wakes': 'Funeral / Wake', 'Restaurant / Cafe': 'Restaurant / Café' }) # Push the data to s3 stream = io.BytesIO() datastore.to_csv(stream, index=False) stream.seek(0) s3.upload_fileobj(stream, secret['bucketname'], secret['pha-clusters-datastore']) # Set up chromedriver so we can save altair plots driver = get_chrome_driver() plots = [] if driver is None: logging.error('Failed to start chrome') else: p = altair.vconcat( altair.Chart( dataset ).mark_bar().encode( x = altair.X('Total:Q', axis=altair.Axis(title='Total reported')), y = altair.Y('Setting:O'), color='Type', order=altair.Order( 'Type', sort='ascending' ), ).properties( height=450, width=800, title='NI COVID-19 Contact Tracing reports from %s to %s' %(start_date.strftime('%-d %B %Y'), end_date.strftime('%-d %B %Y')) ), ).properties( title=altair.TitleParams( ['Data from Public Health Agency, does not include education or home settings', 'Covers the preceding four weeks', 'https://twitter.com/ni_covid19_data on %s' %datetime.datetime.now().date().strftime('%A %-d %B %Y')], baseline='bottom', orient='bottom', anchor='end', fontWeight='normal', fontSize=10, dy=10 ), ) plotname = 'pha-outbreaks-week-%s.png'%datetime.datetime.now().date().strftime('%Y-%d-%m') plotstore = io.BytesIO() p.save(fp=plotstore, format='png', method='selenium', webdriver=driver) plotstore.seek(0) plots.append({'name': plotname, 'store': plotstore}) p = altair.vconcat( altair.Chart( datastore.groupby(['End Date','Type'])['Total'].sum().reset_index() ).mark_area().encode( x = altair.X('End Date:T', axis=altair.Axis(title='Date reported (for preceding four weeks)')), y = altair.Y('Total:Q', axis=altair.Axis(title='Total reported', orient="right")), color='Type', order=altair.Order( 'Type', sort='ascending' ), ).properties( height=450, width=800, title='NI COVID-19 Contact Tracing reports from %s to %s' %(datastore['Start Date'].min().strftime('%-d %B %Y'), datastore['End Date'].max().strftime('%-d %B %Y')) ), ).properties( title=altair.TitleParams( ['Data from Public Health Agency, does not include education or home settings', 'Reported weekly for the preceding four weeks', 'https://twitter.com/ni_covid19_data on %s' %datetime.datetime.now().date().strftime('%A %-d %B %Y')], baseline='bottom', orient='bottom', anchor='end', fontWeight='normal', fontSize=10, dy=10 ), ) plotname = 'pha-outbreaks-time-%s.png'%datetime.datetime.now().date().strftime('%Y-%d-%m') plotstore = io.BytesIO() p.save(fp=plotstore, format='png', method='selenium', webdriver=driver) plotstore.seek(0) plots.append({'name': plotname, 'store': plotstore}) p = altair.vconcat( altair.Chart( datastore.groupby(['End Date','Setting','Type'])['Total'].sum().reset_index() ).mark_area().encode( x = altair.X('End Date:T', axis=altair.Axis(title='')), y = altair.Y('Total:Q', 
axis=altair.Axis(title='', orient="right")), color='Type', facet=altair.Facet('Setting:O', columns=5, title=None, spacing=0), order=altair.Order( 'Type', sort='ascending' ), ).properties( height=90, width=160, title=altair.TitleParams( 'NI COVID-19 Contact Tracing reports by setting from %s to %s' %(datastore['Start Date'].min().strftime('%-d %B %Y'), datastore['End Date'].max().strftime('%-d %B %Y')), anchor='middle', ), ), ).properties( title=altair.TitleParams( ['Data from Public Health Agency, does not include education or home settings', 'Reported weekly for the preceding four weeks', 'https://twitter.com/ni_covid19_data on %s' %datetime.datetime.now().date().strftime('%A %-d %B %Y')], baseline='bottom', orient='bottom', anchor='end', fontWeight='normal', fontSize=10, dy=10 ), ) plotname = 'pha-outbreaks-small-%s.png'%datetime.datetime.now().date().strftime('%Y-%d-%m') plotstore = io.BytesIO() p.save(fp=plotstore, format='png', method='selenium', webdriver=driver) plotstore.seek(0) plots.append({'name': plotname, 'store': plotstore}) # Convert to dates to ensure correct output to CSV datastore['Start Date'] = datastore['Start Date'].dt.date datastore['End Date'] = datastore['End Date'].dt.date # Tweet out the text and images if change.get('notweet') is not True: api = TwitterAPI(secret['twitter_apikey'], secret['twitter_apisecretkey'], secret['twitter_accesstoken'], secret['twitter_accesstokensecret']) upload_ids = api.upload_multiple(plots) if change.get('testtweet') is True: if len(upload_ids) > 0: resp = api.dm(secret['twitter_dmaccount'], tweet, upload_ids[0]) if len(upload_ids) > 1: resp = api.dm(secret['twitter_dmaccount'], 'Test 1', upload_ids[1]) if len(upload_ids) > 2: resp = api.dm(secret['twitter_dmaccount'], 'Test 2', upload_ids[2]) else: resp = api.dm(secret['twitter_dmaccount'], tweet) messages.append('Tweeted DM ID %s' %(resp.id)) else: if len(upload_ids) > 0: resp = api.tweet(tweet, media_ids=upload_ids) else: resp = api.tweet(tweet) # Download and update the index status = S3_scraper_index(s3, secret['bucketname'], secret['pha-clusters-index']) index = status.get_dict() for i in range(len(index)): if index[i]['filedate'] == change['filedate']: index[i]['tweet'] = resp.id break status.put_dict(index) messages.append('Tweeted ID %s and updated %s' %(resp.id, secret['pha-clusters-index'])) else: print(tweet) messages.append('Did not tweet') return { "statusCode": 200, "body": json.dumps({ "message": messages, }), }
def test_read_pdf(self):
    pdf_path = 'tests/resources/data.pdf'
    expected_csv1 = 'tests/resources/data_1.csv'
    df = tabula.read_pdf(pdf_path)
    self.assertTrue(isinstance(df, pd.DataFrame))
    self.assertTrue(df.equals(pd.read_csv(expected_csv1)))
def clean_page_single_file(file_path: str) -> pd.DataFrame: state_abbr_dict = { 'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT', 'District_of_Columbia': 'DC', 'Delaware': 'DE', 'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV', 'New_Hampshire': 'NH', 'New_Jersey': 'NJ', 'New_Mexico': 'NM', 'New_York': 'NY', 'North_Carolina': 'NC', 'North_Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode_Island': 'RI', 'South_Carolina': 'SC', 'South_Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA', 'West_Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY' } state_name = file_path.split('/')[-1].split('.')[0] state_code = state_abbr_dict[state_name] df = tabula.read_pdf(file_path, pages='all') p1 = df[0] p2 = df[1] p1.columns = list('abcde') p1 = p1.iloc[2:] p1['a'] = p1['b'] p1 = p1[['a', 'c']] p1 = p1.iloc[:19] p1 = p1.transpose() new_header = p1.iloc[0] p1 = p1.iloc[1:] p1.columns = new_header p1.insert(0, 'State', [state_code]) p1 = p1.reset_index() del p1['index'] p1.columns.name = None p1 = p1.replace('---', 'NA') p2_columns = [ 'Speciality', 'Physicians', 'People per Physicians', 'Total female Physicians', 'Percent Female Physicians', 'Total Physicians > Age 60', 'Percent Physicians > Age 60' ] p2.columns = p2_columns p2 = p2.replace('*', 'NA') p2 = p2[['Speciality', 'Physicians']] p2 = p2.transpose() new_header = p2.iloc[0] p2 = p2.iloc[1:] p2.columns = new_header del p2['SpecialtyPhysiciansPeople Per PhysicianNumberPercentNumberPercent'] p2.insert(0, 'State', [state_code]) p2 = p2.reset_index() del p2['index'] p2.columns.name = None p1 = p1.join(p2, rsuffix='_p2') del p1['State_p2'] return (p1)
def upload(request): import yaml """upload page""" now = timezone.now() output = "" status_output = "" # Query qs = ExpiredProduct.objects.all() product_contains = request.GET.get('product') pick_date_min = request.GET.get('pick_date_min') qs = qs.filter(expired_date__lt=now) if is_valid_queryparam(product_contains): qs = qs.filter(product__icontains=product_contains) if is_valid_queryparam(pick_date_min): qs = qs.filter(expired_date__gte=pick_date_min) expired_prod = set(qs.values_list('product', flat=True)) # Check List if request.method == 'POST' and "document" in request.FILES: if expired_prod: dt_str = now.strftime("%d%m%Y_%H%M%S") # Read PDF myfile = request.FILES['document'] fs = FileSystemStorage() path_file = os.path.join( os.path.dirname(__file__), "static/upload_files/temp_%s.pdf" % dt_str) filename = fs.save(path_file, myfile) # Read PDF Setting setting_file = os.path.join(os.path.dirname(__file__), "setting/pdf_settings.yaml") pdf_setting = open(setting_file) parsed_yaml_file = yaml.load(pdf_setting, Loader=yaml.FullLoader) df = tabula.read_pdf(filename, encoding='utf-8', pages="all", area=parsed_yaml_file["detext_position"]) # Read Column from pdf curr_prod = [] for page in range(len(df)): if "型 式" in df[page]: try: curr_prod = curr_prod + list(df[page]["Unnamed: 0"]) except: curr_prod = curr_prod + list(df[page]["型 式"]) print("Column data (page %d): " % (page + 1), curr_prod) # to check data in column # Combine Rule output = [] expired_prod = ["^" + s + "$" for s in list(expired_prod)] combined = "(" + ")|(".join(expired_prod) + ")" # Matching for prod in set(curr_prod): if re.match(combined, str(prod)): output.append(prod) os.remove(filename) if output: output = " 結果 (%s %d): %s" % (str(myfile), len(output), output) else: output = " 結果 (%s): 問題がありません" % str(myfile) status_output = "pass" else: output = "pass" status_output = "pass" # UPLOAD XLSX TO DB if request.method == 'POST' and "document-db" in request.FILES: dt_str = now.strftime("%d%m%Y_%H%M%S") count = 0 # Read XLSX file myfile = request.FILES['document-db'] fs = FileSystemStorage() path_file = os.path.join(os.path.dirname(__file__), "static/upload_files/temp_%s.pdf" % dt_str) filename = fs.save(path_file, myfile) df_expired = pd.read_excel(filename, index_col=0) expired_prod_file = set(df_expired["Unnamed: 4"]) # Cleaning Data for origin_item in expired_prod_file: if origin_item and origin_item != "品番" and type( origin_item) == str: item = origin_item.replace(" ", "") for sub_item in item.split(","): sub_item = sub_item.replace("**/**", "*") for i in range(6, 1, -1): sub_item = sub_item.replace("*" * i, "*") sub_item = sub_item.replace('*', '.*') if sub_item not in expired_prod: count += 1 expired_date = list( df_expired[df_expired["Unnamed: 4"] == origin_item]["Unnamed: 7"])[0] expired_date = convert_datetime(expired_date) ExpiredProduct.objects.get_or_create( product=sub_item, expired_date=expired_date) else: pass else: pass os.remove(filename) context = { 'all': qs, 'request': request, 'output': output, 'status': status_output } return render(request, 'upload.html', context)
import pandas as pd
import numpy as np
from PIL import Image
import cv2
import pytesseract
from matplotlib import pyplot as plt

pytesseract.pytesseract.tesseract_cmd = 'C:/Program Files/Tesseract-OCR/tesseract'
TESSDATA_PREFIX = 'C:/Program Files/Tesseract-OCR'

statement = pytesseract.image_to_string(
    Image.open('statement2.png'), lang='eng',
    config='-c preserve_interword_spaces=1x1 --psm 1 --oem 3')
df = pytesseract.image_to_data(
    Image.open('statement2.png'), lang='eng', output_type='data.frame')

from tabula import read_pdf
df = read_pdf('statement_sample1.pdf')
print(df)
print(statement)

import re
s = statement
start = s.find('\n\n \n\n \n\n') + 8
def parse(self, response): df = tabula.read_pdf('/home/ait-python/Desktop/583/1.pdf', pages = '1',delimeter=',', encoding='ISO-8859-1', area=(146.498,30.0,749.318,579.27), guess=False, pandas_options={'header': 'infer'}) for _, row in df.iterrows(): self.a=row.tolist() lic_no = str(self.a[0]) print("###################",lic_no) fname = str(self.a[2])+" "+str(self.a[3])+" "+str(self.a[4]) lname = fname.replace("nan","") lic_type = "Funeral Director License" daob = str(self.a[6]) daoe = str(self.a[7]) il = ItemLoader(item=NvFuneralLicensesSpiderItem(),response=response) il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars) il.add_value('ingestion_timestamp', Utils.getingestion_timestamp()) il.add_value('sourceName', 'NV_Funeral_Licenses') il.add_value('url', 'http://funeral.nv.gov/Licensees/Licensees/') il.add_value('permit_lic_no', lic_no) il.add_value('permit_lic_eff_date', daob) il.add_value('permit_subtype', lic_type) il.add_value('person_name', fname) il.add_value('location_address_string', 'NV') il.add_value('company_phone', '') il.add_value('permit_type', 'cemetery_funeral_license') il.add_value('permit_lic_exp_date', daoe) il.add_value('company_name', '') yield il.load_item() df2 = tabula.read_pdf('/home/ait-python/Downloads/pdf/Embalmers.pdf', pages = '2',delimeter=',', encoding='ISO-8859-1', area=(70.763,30.0,535.883,580.035), guess=False, pandas_options={'header': 'infer'}) for _, row in df2.iterrows(): self.b=row.tolist() lic_no = str(self.b[0])+str(self.b[1]).replace('nan','') fname = str(self.b[2])+" "+str(self.b[3])+" "+str(self.b[4]).replace('nan','') lname = fname.replace('nan','') lic_type = "Embalmer License" daob = str(self.b[6]) daoe = str(self.b[7]) il = ItemLoader(item=NvFuneralLicensesSpiderItem(),response=response) il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars) il.add_value('ingestion_timestamp', Utils.getingestion_timestamp()) il.add_value('sourceName', 'NV_Funeral_Licenses') il.add_value('url', 'http://funeral.nv.gov/Licensees/Licensees/') il.add_value('permit_lic_no', lic_no) il.add_value('permit_lic_eff_date', daob) il.add_value('permit_subtype', lic_type) il.add_value('person_name', lname) il.add_value('location_address_string', 'NV') il.add_value('company_phone', '') il.add_value('permit_type', 'cemetery_funeral_license') il.add_value('permit_lic_exp_date', daoe) il.add_value('company_name', '') yield il.load_item() # #----------------------------------------------------------------------------------2 # df3 = tabula.read_pdf('/home/ait-python/Downloads/pdf/FuneralArrangersLicensees.pdf', # pages = '1',delimeter=',', # encoding='ISO-8859-1', # area=(155.678,29.835,752.378,582.93), # guess=False, # pandas_options={'header': 'infer'}) # for _, row in df3.iterrows(): # self.c=row.tolist() # lic_no = str(self.c[0]).replace('nan','') # fname = str(self.c[1])+" "+str(self.c[2])+" "+str(self.c[3]) # lname = fname.replace('nan','') # lic_type = "Funeral Arranger License" # daob = str(self.c[5]).replace('nan','') # daoe = str(self.c[6]).replace('nan','') # il = ItemLoader(item=NvFuneralLicensesSpiderItem(),response=response) # il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars) # il.add_value('ingestion_timestamp', Utils.getingestion_timestamp()) # il.add_value('sourceName', 'NV_Funeral_Licenses') # il.add_value('url', 'http://funeral.nv.gov/Licensees/Licensees/') # il.add_value('permit_lic_no', lic_no) # 
il.add_value('permit_lic_eff_date', daob) # il.add_value('permit_subtype', lic_type) # il.add_value('person_name', lname) # il.add_value('location_address_string', 'NV') # il.add_value('company_phone', '') # il.add_value('permit_type', 'cemetery_funeral_license') # il.add_value('permit_lic_exp_date', daoe) # il.add_value('company_name', '') # yield il.load_item() # df4 =tabula.read_pdf('/home/ait-python/Downloads/pdf/FuneralArrangersLicensees.pdf', # pages = '2',delimeter=',', # encoding='ISO-8859-1', # area=(60.818,30.0,767.678,583.86), # guess=False, # pandas_options={'header': 'infer'}) # for _, row in df4.iterrows(): # self.d=row.tolist() # lic_no = str(self.d[0]).replace('nan','') # fname = str(self.d[1])+" "+ str(self.d[2])+" "+str(self.d[3]) # lname = fname.replace('nan','') # lic_type = "Funeral Arranger License" # daob = str(self.d[5]) # daoe = str(self.d[6]).replace('nan','') # il = ItemLoader(item=NvFuneralLicensesSpiderItem(),response=response) # il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars) # il.add_value('ingestion_timestamp', Utils.getingestion_timestamp()) # il.add_value('sourceName', 'NV_Funeral_Licenses') # il.add_value('url', 'http://funeral.nv.gov/Licensees/Licensees/') # il.add_value('permit_lic_no', lic_no) # il.add_value('permit_lic_eff_date', daob) # il.add_value('permit_subtype', lic_type) # il.add_value('person_name', lname) # il.add_value('location_address_string', 'NV') # il.add_value('company_phone', '') # il.add_value('permit_type', 'cemetery_funeral_license') # il.add_value('permit_lic_exp_date', daoe) # il.add_value('company_name', '') # yield il.load_item() # df5 = tabula.read_pdf('/home/ait-python/Downloads/pdf/FuneralArrangersLicensees.pdf', # pages = '3',delimeter=',', # encoding='ISO-8859-1', # area=(60.818,30.0,393.593,580.035), # guess=False, # pandas_options={'header': 'infer'}) # for _, row in df5.iterrows(): # self.e = row.tolist() # lic_no = str(self.e[0]).replace('nan','') # fname = str(self.e[1])+" "+ str(self.e[2])+" "+str(self.e[3]) # lname = fname.replace('nan','') # lic_type = "Funeral Arranger License" # daob = str(self.e[5]) # daoe = str(self.e[6]) # il = ItemLoader(item=NvFuneralLicensesSpiderItem(),response=response) # il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars) # il.add_value('ingestion_timestamp', Utils.getingestion_timestamp()) # il.add_value('sourceName', 'NV_Funeral_Licenses') # il.add_value('url', 'http://funeral.nv.gov/Licensees/Licensees/') # il.add_value('permit_lic_no', lic_no) # il.add_value('permit_lic_eff_date', daob) # il.add_value('permit_subtype', lic_type) # il.add_value('person_name', lname) # il.add_value('location_address_string', 'NV') # il.add_value('company_phone', '') # il.add_value('permit_type', 'cemetery_funeral_license') # il.add_value('permit_lic_exp_date', daoe) # il.add_value('company_name', '') # yield il.load_item() # # #-----------------------------------------------------------------------------------------------------------------------------------3 # df6 = tabula.read_pdf('/home/ait-python/Downloads/pdf/Funeral-Directors.pdf', # pages = '1',delimeter=',', # encoding='ISO-8859-1', # area=(146.498,30.0,763.853,580.035), # guess=False, # pandas_options={'header': 'infer'}) # for _, row in df6.iterrows(): # self.f = row.tolist() # lic_no = str(self.f[0]).replace('nan','') # fname = str(self.f[1])+" "+ str(self.f[2])+" "+str(self.f[3]) # lname = fname.replace('nan','') # lic_type = "Funeral Director 
License" # daob = str(self.f[5]) # daoe = str(self.f[6]) # il = ItemLoader(item=NvFuneralLicensesSpiderItem(),response=response) # il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars) # il.add_value('ingestion_timestamp', Utils.getingestion_timestamp()) # il.add_value('sourceName', 'NV_Funeral_Licenses') # il.add_value('url', 'http://funeral.nv.gov/Licensees/Licensees/') # il.add_value('permit_lic_no', lic_no) # il.add_value('permit_lic_eff_date', daob) # il.add_value('permit_subtype', lic_type) # il.add_value('person_name', lname) # il.add_value('location_address_string', 'NV') # il.add_value('company_phone', '') # il.add_value('permit_type', 'cemetery_funeral_license') # il.add_value('permit_lic_exp_date', daoe) # il.add_value('company_name', '') # yield il.load_item() # df7 = tabula.read_pdf('/home/ait-python/Downloads/pdf/Funeral-Directors.pdf', # pages = '2,3,4',delimeter=',', # encoding='ISO-8859-1', # area=(60.818,30.0,751.613,580.035), # guess=False, # pandas_options={'header': 'infer'}) # for _, row in df7.iterrows(): # self.g = row.tolist() # lic_no = str(self.g[0]).replace('nan','') # fname = str(self.g[1])+" "+ str(self.g[2])+" "+str(self.g[3]) # lname = fname.replace('nan','') # lic_type = "Funeral Director License" # daob = str(self.g[5]) # daoe = str(self.g[6]) # # print("^^^^^^^^^^^^^^^^^",daoe) # il = ItemLoader(item=NvFuneralLicensesSpiderItem(),response=response) # il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars) # il.add_value('ingestion_timestamp', Utils.getingestion_timestamp()) # il.add_value('sourceName', 'NV_Funeral_Licenses') # il.add_value('url', 'http://funeral.nv.gov/Licensees/Licensees/') # il.add_value('permit_lic_no', lic_no) # il.add_value('permit_lic_eff_date', daob) # il.add_value('permit_subtype', lic_type) # il.add_value('person_name', lname) # il.add_value('location_address_string', 'NV') # il.add_value('company_phone', '') # il.add_value('permit_type', 'cemetery_funeral_license') # il.add_value('permit_lic_exp_date', daoe) # il.add_value('company_name', '') # yield il.load_item() # #----------------------------------------------------- # # df7 = tabula.read_pdf('/home/ait-python/Downloads/pdf/Funeral-Directors.pdf', # # pages = '3', # # encoding='ISO-8859-1', # # area=(60.818,30.0,751.613,580.035), # # guess=False, # # pandas_options={'header': 'infer'}) # # for _, row in df7.iterrows(): # # self.g = row.tolist() # # lic_no = str(self.g[0]).replace('nan','') # # fname = str(self.g[1])+" "+ str(self.g[2])+" "+str(self.g[3]) # # lname = fname.replace('nan','') # # lic_type = "Funeral Director License" # # daob = str(self.g[5]) # # daoe = str(self.g[6]) # # # print("@@@@@@@@@@@@@@@@",daoe) # # df8 = tabula.read_pdf('/home/ait-python/Downloads/pdf/Funeral-Directors.pdf', # # pages = '4', # # encoding='ISO-8859-1', # # area=(60.818,30.0,751.613,580.035), # # guess=False, # # pandas_options={'header': 'infer'}) # # for _, row in df8.iterrows(): # # self.h = row.tolist() # # lic_no = str(self.h[0]).replace('nan','') # # print("!!!!!!!!!!!!!!!!!!1",lic_no) # # # fname = str(self.h[1])+" "+ str(self.h[2])+" "+str(self.h[3]) # # # lname = fname.replace('nan','') # # # lic_type = "Funeral Director License" # # # daob = str(self.h[5]) # # # daoe = str(self.h[6]) # #-----------------------------------------------------------------------------------------4 # df9 = tabula.read_pdf('/home/ait-python/Downloads/pdf/FuneralEstablishmentsAndDirectCremationFacilities.pdf', # pages = 
'1',delimeter=',', # encoding='ISO-8859-1', # area=(163.845,23.76,574.695,768.24), # guess=False, # pandas_options={'header': 'infer'}) # for _, row in df9.iterrows(): # self.i = row.tolist() # lic_no = str(self.i[0]).replace('nan','') # cname = str(self.i[1]) # ad = str(self.i[2])+", "+str(self.i[3])+", "+"NV "+str(self.i[4]) # addr = ad.replace("nan","") # print("#@@#@#@#@#@#@#@#@#@#",addr) # phone = str(self.i[5]).replace("nan","") # lic_type = str(self.i[6]).replace("nan","") # il = ItemLoader(item=NvFuneralLicensesSpiderItem(),response=response) # il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars) # il.add_value('ingestion_timestamp', Utils.getingestion_timestamp()) # il.add_value('sourceName', 'NV_Funeral_Licenses') # il.add_value('url', 'http://funeral.nv.gov/Licensees/Licensees/') # il.add_value('permit_lic_no', lic_no) # il.add_value('permit_lic_eff_date','') # il.add_value('permit_subtype', lic_type) # il.add_value('person_name', '') # il.add_value('location_address_string', addr) # il.add_value('company_phone', phone) # il.add_value('permit_type', 'cemetery_funeral_license') # il.add_value('permit_lic_exp_date', '') # il.add_value('company_name', cname) # yield il.load_item() # df10 = tabula.read_pdf('/home/ait-python/Downloads/pdf/FuneralEstablishmentsAndDirectCremationFacilities.pdf', # pages = '2',delimeter=',', # encoding='ISO-8859-1', # area=(68.805,30.0,576.675,762.6), # guess=False, # pandas_options={'header': 'infer'}) # for _, row in df10.iterrows(): # self.j = row.tolist() # lic_no = str(self.j[0]).replace('nan','') # cname = str(self.j[1]) # ad = str(self.j[2])+", "+str(self.j[3])+", "+"NV "+str(self.j[4]) # addr = ad.replace("nan","") # phone = str(self.j[5]).replace("nan","") # lic_type = str(self.j[6]).replace("nan","") # print("#####################",lic_type) # il = ItemLoader(item=NvFuneralLicensesSpiderItem(),response=response) # il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars) # il.add_value('ingestion_timestamp', Utils.getingestion_timestamp()) # il.add_value('sourceName', 'NV_Funeral_Licenses') # il.add_value('url', 'http://funeral.nv.gov/Licensees/Licensees/') # il.add_value('permit_lic_no', lic_no) # il.add_value('permit_lic_eff_date','') # il.add_value('permit_subtype', lic_type) # il.add_value('person_name', '') # il.add_value('location_address_string', addr) # il.add_value('company_phone', phone) # il.add_value('permit_type', 'cemetery_funeral_license') # il.add_value('permit_lic_exp_date', '') # il.add_value('company_name', cname) # yield il.load_item() # df11 =tabula.read_pdf('/home/ait-python/Downloads/pdf/FuneralEstablishmentsAndDirectCremationFacilities.pdf', # pages = '3',delimeter=',', # encoding='ISO-8859-1', # area=(68.805,30.0,576.675,762.6), # guess=False, # pandas_options={'header': 'infer'}) # for _, row in df11.iterrows(): # self.k = row.tolist() # lic_no = str(self.k[0]).replace('nan','') # cname = str(self.k[1]) # ad = str(self.k[2])+", "+str(self.k[3])+", "+"NV "+str(self.k[4]) # addr = ad.replace("nan","") # phone = str(self.k[5]).replace("nan","") # lic_type = str(self.k[6]).replace("nan","") # print("#####################",lic_type) # il = ItemLoader(item=NvFuneralLicensesSpiderItem(),response=response) # il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars) # il.add_value('ingestion_timestamp', Utils.getingestion_timestamp()) # il.add_value('sourceName', 'NV_Funeral_Licenses') # il.add_value('url', 
'http://funeral.nv.gov/Licensees/Licensees/') # il.add_value('permit_lic_no', lic_no) # il.add_value('permit_lic_eff_date','') # il.add_value('permit_subtype', lic_type) # il.add_value('person_name', '') # il.add_value('location_address_string', addr) # il.add_value('company_phone', phone) # il.add_value('permit_type', 'cemetery_funeral_license') # il.add_value('permit_lic_exp_date', '') # il.add_value('company_name', cname) # yield il.load_item() # self.state['items_count'] = self.state.get('items_count', 0) + 1 # il = ItemLoader(item=NvFuneralLicensesSpiderItem(),response=response) # il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars) # #il.add_value('ingestion_timestamp', Utils.getingestion_timestamp()) # il.add_value('sourceName', 'nv_funeral_licenses') # il.add_value('url', 'http://funeral.nv.gov/Licensees/Licensees/') # il.add_xpath('permit_lic_no', '') # il.add_xpath('permit_lic_eff_date', '') # il.add_xpath('permit_subtype', '') # il.add_xpath('person_name', '') # il.add_xpath('location_address_string', '') # il.add_xpath('company_phone', '') # il.add_xpath('permit_type', '') # il.add_xpath('permit_lic_exp_date', '') # il.add_xpath('company_name', '') # return il.load_item()
def pdf_table_to_df(self):
    df = read_pdf(self.get_file_name(), multiple_tables=True, pages='all')
    print(type(df))
    return df
from tabula import read_pdf

df = read_pdf(r"./credits.pdf", encoding="utf-8", spreadsheet=True, pages="all")
print(df)
# print(df)
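# Newer tabula-py releases dropped the spreadsheet flag in favour of lattice
# (and nospreadsheet in favour of stream) and return a list of DataFrames;
# a minimal equivalent sketch under that assumption:
from tabula import read_pdf

# lattice=True replaces the deprecated spreadsheet=True (ruling-line based extraction)
tables = read_pdf(r"./credits.pdf", encoding="utf-8", lattice=True, pages="all")
for table in tables:
    print(table)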
# response = requests.get(url)
# filename.write_bytes(response.content)
#####

df_list = []
col2str = {'dtype': str}
kwargs = {
    'output_format': 'dataframe',
    'pandas_options': col2str,
    'stream': True
}
for x in os.listdir('data/rki_wochenberichte/'):
    df_list.append(
        tabula.read_pdf('data/rki_wochenberichte/' + x, pages=['2'],
                        multiple_tables=False, **kwargs)[0])
    print(x)

# clean some misread dataframes
df_list[0] = df_list[0].drop(df_list[0].index[[7, 9]])
df_list[0].loc[df_list[0].Bundesland.isna(), 'Bundesland'] = 'Mecklenburg-Vorpommern'
df_list[2] = pd.DataFrame(np.vstack([df_list[2].columns, df_list[2]]))
df_list[2].columns = df_list[1].columns
df_list[2].columns = df_list[0].columns

for x in range(0, 18):
    print(x)
    df_list[x]['date'] = datelist[x]
Python 3.6.8 (default, Jan 14 2019, 11:02:34)
[GCC 8.0.1 20180414 (experimental) [trunk revision 259383]] on linux
Type "help", "copyright", "credits" or "license()" for more information.
>>> from tabula import read_pdf
>>> df = read_pdf("/home/mark/Downloads/451401XXXXXX3663-2019May09-2019Jun10.pdf")
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/tabula/wrapper.py", line 108, in read_pdf
    output = subprocess.check_output(args)
  File "/usr/lib/python3.6/subprocess.py", line 356, in check_output
    **kwargs).stdout
  File "/usr/lib/python3.6/subprocess.py", line 423, in run
    with Popen(*popenargs, **kwargs) as process:
  File "/usr/lib/python3.6/subprocess.py", line 729, in __init__
    restore_signals, start_new_session)
  File "/usr/lib/python3.6/subprocess.py", line 1364, in _execute_child
    raise child_exception_type(errno_num, err_msg, err_filename)
FileNotFoundError: [Errno 2] No such file or directory: 'java': 'java'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "<pyshell#1>", line 1, in <module>
    df = read_pdf("/home/mark/Downloads/451401XXXXXX3663-2019May09-2019Jun10.pdf")
  File "/usr/local/lib/python3.6/dist-packages/tabula/wrapper.py", line 111, in read_pdf
    raise JavaNotFoundError(JAVA_NOT_FOUND_ERROR)
tabula.errors.JavaNotFoundError: `java` command is not found from this Python process. Please ensure Java is installed and PATH is set for `java`
>>> df = read_pdf("/home/mark/Downloads/451401XXXXXX3663-2019May09-2019Jun10.pdf")
>>> df
  MAY 06 MAY 10 ... -$770.00                         Unnamed: 4
0    NaN    NaN ...      NaN  Points adjusted this statement 53
1    NaN    NaN ...      NaN                                NaN
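# tabula-py shells out to the tabula-java JAR, so read_pdf raises JavaNotFoundError
# when no `java` executable is on PATH (as in the transcript above, until Java was
# installed). A minimal pre-flight check using only the standard library:
import shutil

if shutil.which("java") is None:
    raise RuntimeError(
        "tabula-py needs a Java runtime: install a JRE/JDK and make sure the "
        "`java` command is on PATH before calling read_pdf."
    )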
def contra_costa_p14(_): # TODO(charles-difazio): Grab this in a single parse and then split gov_df = tabula.read_pdf( 'http://www.cocovote.us/wp-content/uploads/14Jun03_ContraCostaStatementofVote.pdf', pages=list(range(465, 483))) house_df = tabula.read_pdf( 'http://www.cocovote.us/wp-content/uploads/14Jun03_ContraCostaStatementofVote.pdf', pages=list(range(483, 495))) assembly_df = tabula.read_pdf( 'http://www.cocovote.us/wp-content/uploads/14Jun03_ContraCostaStatementofVote.pdf', pages=list(range(495, 501))) # Rename candidate columns gov_df.columns = ['precinct'] + \ gov_df['Unnamed: 4'][3:6].tolist() + ['dummy'] # Drop NA rows and total row gov_df = gov_df[gov_df.columns[:-1]].dropna(how='any')[:-1] # Tidy data gov_df = pd.melt(gov_df, id_vars='precinct', value_vars=gov_df.columns[1:], var_name='candidate', value_name='votes') gov_df.votes = gov_df.votes.astype(np.int64) gov_df = gov_df[gov_df.votes != 0].assign(county='Contra Costa', office='Governor') gov_df['party'] = gov_df.candidate.apply(lambda x: p[x]) gov_df.precinct = gov_df.precinct.apply(lambda x: x.split(' ')[1]) house_df = house_df.rename(columns={'2014 STATEWIDE PRIMARY': 'precinct', 'Unnamed: 1': 'Virginia Fuller'}) house_df['Virginia Fuller'] = pd.to_numeric( house_df['Virginia Fuller'], errors='coerce') house_df = house_df.dropna(how='any')[:-1] house_df = house_df[house_df.precinct != 'CANDIDATES'] house_df = pd.melt(house_df, id_vars='precinct', value_vars=house_df.columns[1:], var_name='candidate', value_name='votes') house_df.votes = house_df.votes.astype(np.int64) house_df = house_df[house_df.votes != 0].assign(county='Contra Costa', office='U.S. House', district='11') house_df['party'] = house_df.candidate.apply(lambda x: p[x]) house_df.precinct = house_df.precinct.apply( lambda x: x.split(' ')[1]) c = assembly_df['Unnamed: 2'][3:5].tolist() assembly_df[c[0]], assembly_df[c[1]] = assembly_df[ 'Unnamed: 1'].str.split(' ', n=1).str assembly_df = assembly_df.rename( columns={'2014 STATEWIDE PRIMARY': 'precinct'}) assembly_df[c[0]] = pd.to_numeric(assembly_df[c[0]], errors='coerce') assembly_df[c[1]] = pd.to_numeric(assembly_df[c[1]], errors='coerce') assembly_df = assembly_df[['precinct'] + c] assembly_df = assembly_df.dropna(how='any')[:-1] assembly_df = pd.melt(assembly_df, id_vars='precinct', value_vars=assembly_df.columns[1:], var_name='candidate', value_name='votes') assembly_df.votes = assembly_df.votes.astype(np.int64) assembly_df = assembly_df[assembly_df.votes != 0].assign(county='Contra Costa', office='State Assembly', district='14') assembly_df['party'] = assembly_df.candidate.apply(lambda x: p[x]) assembly_df.precinct = assembly_df.precinct.apply( lambda x: x.split(' ')[1]) return gov_df.to_dict(orient='records') + \ house_df.to_dict(orient='records') + \ assembly_df.to_dict(orient='records')
from tabula import read_pdf
## a built-in ssl module of python
import ssl

url = "url.pdf"
try:
    df = read_pdf(
        "http://www.sed1.org/afterschool/tookits/science/pdf/ast_sci_data_table_sample.pdf"
    )
    # print the data frame (pandas)
    print(df)
except Exception as e:
    print("Error {}".format(e))
# https://towardsdatascience.com/scraping-table-data-from-pdf-files-using-a-single-line-in-python-8607880c750
import tabula

infile_name = '/Users/craig/Documents/GEORGIAN-MEGRELIAN-LAZ-SVAN-ENGLISH_DICTIONARY.pdf'
table = tabula.read_pdf(infile_name, pages=1)
table[0]
import tabula
import pandas as pd
# import time
# start_time = time.time()

tb_name = "test"
# read_pdf with multiple_tables=True returns a list of DataFrames,
# so concatenate them instead of wrapping the list in pd.DataFrame()
tables = tabula.read_pdf(tb_name + ".pdf", pages='all', multiple_tables=True)
df = pd.concat(tables, ignore_index=True)
# df.to_excel('test.xlsx', header=False, index=False)

# df = pd.read_csv(tb_name + ".csv", error_bad_lines=False, header=None)
# res = pd.DataFrame()
# n = len(df)
# arr = df.values
# idx = []
# for i in range(0, n):
#     t = str(arr[i][0])[0:2]
#     if (t != "31"):
#         idx.append(i)
# df.drop(df.index[idx], inplace=True)
# df.to_csv(tb_name + '.csv', header=False, index=False)

# Printing Time
# print("--- %s seconds ---" % (time.time() - start_time))
import tabula
import pandas as pd

# Read pdf into DataFrame ("options" is a placeholder for extra read_pdf keyword arguments)
df = tabula.read_pdf("test.pdf", options)

# Read remote pdf into DataFrame
df2 = tabula.read_pdf("https://github.com/tabulapdf/tabula-java/raw/master/src/test/resources/technology/tabula/arabic.pdf")

# convert PDF into CSV
tabula.convert_into("test.pdf", "output.csv", output_format="csv")

# convert all PDFs in a directory
tabula.convert_into_by_batch("input_directory", output_format='csv')
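# For a runnable variant of the first call, the `options` placeholder can be replaced
# by concrete keyword arguments; a minimal sketch, assuming a local test.pdf with a
# table on page 1:
import tabula

df = tabula.read_pdf(
    "test.pdf",
    pages=1,                          # or "all", "1-3", [1, 2]
    stream=True,                      # whitespace-based extraction (lattice=True for ruled tables)
    pandas_options={"header": None},  # passed through to pandas.read_csv
)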
def get_pdf_dict(lsdyna_manual_file_path, page_numbers, only_page_num_dict=False):
    '''
    @description             : returns a dictionary of keywords and corresponding tables and page numbers in lists
    @lsdyna_manual_file_path  : the path to the LS_Dyna Manual
    @page_numbers             : list of page numbers for which keys and tables must be extracted;
                                if "All" is given, all pages are analyzed
    @only_page_num_dict       : (default: False) if this is set to True, only page numbers are written out
                                for given key word manual
    @returns                  : returns a dictionary of keywords and corresponding tables and page numbers in lists
    '''
    ls_key_dict = {}
    ls_file = open(lsdyna_manual_file_path, 'rb')
    ls_file_contents = pdf.PdfFileReader(ls_file)
    if isinstance(page_numbers, int):
        page_numbers = [page_numbers]
    if page_numbers == "All":
        total_pages = ls_file_contents.getNumPages()
        page_numbers = list(range(1, total_pages + 1))
    print("\nLooking for tables in the pages - " + str(page_numbers) + ".\n")
    error_pages = []
    for page_num in page_numbers:
        page_key = get_page_key(page_num, ls_file_contents)
        if page_key == "":
            continue
        if ls_key_dict.get(page_key):
            ls_key_dict[page_key]['page_numbers'].append(page_num)
        else:
            ls_key_dict[page_key] = {'tables': [], 'page_numbers': [page_num]}
        if only_page_num_dict:
            continue
        # page nos. for this function start from 1
        try:
            page_tables = tabula.read_pdf(lsdyna_manual_file_path, pages=[page_num],
                                          multiple_tables=True)
        except Exception:
            error_pages.append(page_num)
            page_tables = None
        if not page_tables:
            ls_key_dict[page_key]['page_numbers'].pop(-1)
            continue
        # collecting only LS Dyna key cards (the first cell in the table has the phrase "Card")
        valid_page_tables = []
        for table in page_tables:
            try:
                if "Card" in table[0][0]:
                    # TODO: this area is meant to cleanup tables; try the keycard
                    # *DEFINE_COORDINATE_SYSTEM and notice table on Page 1560
                    valid_page_tables.append(table)
            except Exception:
                continue
        if valid_page_tables:
            ls_key_dict[page_key]['tables'].append(valid_page_tables)
        else:
            ls_key_dict[page_key]['page_numbers'].pop(-1)
    if error_pages:
        print("There were errors while extracting tables from pages " + str(error_pages))
    return ls_key_dict
import tabula
import camelot
import json
import pymongo
import math

client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client.PreCog_DB_a

# File 1c1edeee-a13e-4b2e-90be-eb1dd03c3384.pdf -------------------------
# Get table
path = "./Rec_Task/1c1edeee-a13e-4b2e-90be-eb1dd03c3384.pdf"
dfs = tabula.read_pdf(path, pages='all')

# conversion to required format
dfs_formatted = dfs[0].to_dict()
for i, v in dfs_formatted.items():
    for j, k in v.items():
        dfs_formatted[i] = k

# insert into collection
tables_1c1edeee = db["tables_1c1edeee"]
tables_1c1edeee.insert(dfs_formatted)
# -------------------------

# File a6b29367-f3b7-4fb1-a2d0-077477eac1d9.pdf -------------------------
# Get table
path = "./Rec_Task/a6b29367-f3b7-4fb1-a2d0-077477eac1d9.pdf"
import os
import tabula

files = os.listdir()
pdf_files = list(filter(lambda x: x.endswith(".pdf"), files))

for file in pdf_files:
    try:
        data = tabula.read_pdf(file, spreadsheet=True, pages='all')
        data.to_csv(file.replace(".pdf", ".csv"), index=False,
                    quoting=1, encoding='utf8')
        print("File done-", file)
        print()
    except Exception:
        print("Reading error -", file)
        print()
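# When the goal is only PDF-to-CSV conversion, tabula-py's convert_into can skip the
# intermediate DataFrame; a minimal sketch of the same loop under that assumption:
import os
import tabula

for file in os.listdir():
    if not file.endswith(".pdf"):
        continue
    try:
        # Writes every table tabula finds in the PDF straight into one CSV file
        tabula.convert_into(file, file.replace(".pdf", ".csv"),
                            output_format="csv", pages="all")
        print("File done-", file)
    except Exception:
        print("Reading error -", file)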
# lkoval
# 6-19-19

from tabula import read_pdf
import pandas as pd
import string

# Table 5
table_5 = read_pdf("document_1374295.pdf", pages="18", lattice=True,
                   pandas_options={'header': None})
table_5["raw_chem_name"] = table_5.iloc[1:, 0]
table_5["raw_cas"] = table_5.iloc[1:, 2]
table_5 = table_5.dropna(subset=["raw_chem_name"])
table_5 = table_5[["raw_chem_name", "raw_cas"]]
clean = lambda dirty: ''.join(filter(string.printable.__contains__, dirty))
for j in range(0, len(table_5)):
    table_5["raw_chem_name"].iloc[j] = str(table_5["raw_chem_name"].iloc[j]).strip().lower().replace(".", "").replace("α", "alpha").replace("β", "beta")
    table_5["raw_chem_name"].iloc[j] = clean(str(table_5["raw_chem_name"].iloc[j]))
    if len(table_5["raw_chem_name"].iloc[j].split()) > 1:
        table_5["raw_chem_name"].iloc[j] = " ".join(table_5["raw_chem_name"].iloc[j].split())
    if len(str(table_5["raw_cas"].iloc[j]).split()) > 1:
        table_5["raw_cas"].iloc[j] = " ".join(str(table_5["raw_cas"].iloc[j]).split())
table_5["data_document_id"] = "1374295"
table_5["data_document_filename"] = "DCPS_83_a.pdf"
table_5["doc_date"] = "2007"
table_5["raw_category"] = ""
table_5["cat_code"] = ""
table_5["description_cpcat"] = ""
table_5["cpcat_code"] = ""
table_5["cpcat_sourcetype"] = "ACToR Assays and Lists"
import numpy as np
import pandas as pd
import geopandas as gpd
import tabula
import matplotlib.pyplot as plt

plt.style.use('seaborn')

yesterday = '4/16/20'
today = '4/17/20'

df = tabula.read_pdf('data/mexico.pdf', pages='all',
                     output_format='dataframe', multiple_tables=False)
df[0].to_csv('data/covidmxraw.csv')

confirmedraw = pd.read_csv('data/covidmxraw.csv', index_col=0)
print(confirmedraw)
confirmedraw = confirmedraw.drop(
    ['N° Caso', 'Identificación de COVID-\r19 por RT-PCR en tiempo\rreal'], axis=1)
confirmedraw = confirmedraw.rename(
    columns={'Fecha de Inicio\rde síntomas': 'Fecha de Inicio de síntomas'})
confirmedraw.to_csv('data/confirmedraw.csv')

confirmed = pd.read_csv('data/time_series_confirmed_MX.csv', index_col=0)
current = confirmedraw.groupby('Estado')['Sexo'].count()
last = confirmed[yesterday]
confirmed[today] = last + (current - last)
confirmed[today][-1] = current.sum()
def test_read_pdf_with_java_option(self):
    pdf_path = 'tests/resources/data.pdf'
    expected_csv1 = 'tests/resources/data_1.csv'
    self.assertTrue(tabula.read_pdf(pdf_path, pages=1,
                                    java_options=['-Xmx256m']
                                    ).equals(pd.read_csv(expected_csv1)))
def fill_t5008_21(ff_driver, box_21):
    div = find_fill_element_with_text(ff_driver, "Proceeds of disposition or settlement amount")
    div.find_element_by_tag_name("input").send_keys(box_21)


if __name__ == '__main__':
    driver = webdriver.Firefox()
    driver.get("https://secure.ufile.ca/T1-2018/Interview?file=e0dfbb3b")
    sleep(5)
    driver.find_element_by_name("Username").send_keys(USERNAME)
    driver.find_element_by_name("Password").send_keys(PASSWORD)
    driver.find_elements_by_class_name("blue-button")[0].click()
    sleep(5)
    driver.find_element_by_class_name("filefieldmain").find_element_by_tag_name("a").click()
    sleep(3)
    df = read_pdf("T5008.pdf", pages=2)
    list_t5008 = []
    for row in df.iterrows():
        if row[1]["Box 13"] == "USD":
            list_t5008.append(row[1])
    for t5008 in list_t5008:
        add_t5008(driver)
        sleep(3)
        fill_t5008(driver, t5008)
        sleep(3)
import tabula import os tables = tabula.read_pdf("1710.05006.pdf", pages="all") # 폴더를 만들어 저장 folder_name = "171005006-tables" if not os.path.isdir(folder_name): os.mkdir(folder_name) # 각각의 테이블을 엑셀 파일로 저장 for i, table in enumerate(tables, start=1): table.to_excel(os.path.join(folder_name, f"table_{i}.xlsx"), index=False) # 저장과 변환을 한 번에 가능 # csv, json, tsv 지원 # tabula.convert_into("1710.05006.pdf", "output.csv", output_format="csv", pages="all") # pdfs 라는 폴더가 필요 # tabula.convert_into_by_batch("pdfs", output_format="csv", pages="all")