def search_dir_only(pattern, filename):
    try:
        fn = os.path.join(os.getcwd(), 'file_search.config')
        last_dir = configsectionmap(fn, 'last path')
        dir_selected = fdd.get_directory(last_dir)
        if not dir_selected:
            return
        df = pd.DataFrame(find_dirs(dir_selected, pattern), columns=['Directory'])
        df['Directory'] = '=HYPERLINK("' + df['Directory'] + '")'
        if df.shape[0] == 0:
            msgbox.show_message('Bummer', 'No directories found using that expression')
            return
        filename = os.path.splitext(filename)[0]
        # if using pandas to save to excel (requires openpyxl):
        # df.to_excel(filename + '.xlsx', index=False)
        # os.startfile(filename + '.xlsx')
        # use xlwings to save to excel
        new_excel_wb = New_Excel_WB()
        new_excel_wb.df_to_excel(df=df, path=filename + '.xlsx')
    except PermissionError:
        msgbox.show_error('Permission Error', filename + '.xlsx already open')
    except Exception as e:
        msgbox.show_error('Error', str(e))
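# configsectionmap is assumed to be the classic ConfigParser recipe that reads
# one section of file_search.config and returns its options; update_setting
# (used in search_dir_topdown below) writes the 'last' option of the
# 'last path' section back. A minimal sketch under those assumptions -- not
# the repo's actual helper:
import configparser

def _configsectionmap_sketch(fn, section):
    config = configparser.ConfigParser()
    config.read(fn)
    # e.g. _configsectionmap_sketch(fn, 'last path') -> {'last': 'C:\\some\\dir'}
    return {option: config.get(section, option) for option in config.options(section)}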
def pws_detail(url_detail, url_summary, detail='Detail', buyers='Buyers',
               purchases='Purchases', wells='Wells', sources='Sources'):
    """
    Called from Excel when the user clicks a hyperlink for a PWS.

    :param url_detail: URL of the PWS detail page
    :param url_summary: URL of the PWS summary page
    :param detail: name of the Detail sheet
    :param buyers: name of the Buyers sheet
    :param purchases: name of the Purchases sheet
    :param wells: name of the Wells sheet
    :param sources: name of the Sources sheet
    :return: None
    """
    get_buyers_and_purchases = True
    sht_detail = xw.Book.caller().sheets[detail]
    sht_sources = xw.Book.caller().sheets[sources]
    sht_wells = xw.Book.caller().sheets[wells]
    sht_buyers = xw.Book.caller().sheets[buyers]
    sht_purchases = xw.Book.caller().sheets[purchases]
    sht_detail.book.app.screen_updating = False
    pwsd = PWS_Detail()
    try:
        pwsd.get_detail(url_detail, url_summary, buyers=True, purchases=True)
    except Exception as e:
        msgbox.show_error('Python Error in get_detail', str(e))
    try:
        # find the last used row on each sheet; an empty sheet reports the
        # bottom of the grid (row 2**20 in .xlsx), so treat that as row 0
        lrow_detail = sht_detail.range('A1').end('down').last_cell.row
        lrow_sources = sht_sources.range('A1').end('down').last_cell.row
        lrow_wells = sht_wells.range('A1').end('down').last_cell.row
        lrow_buyers = sht_buyers.range('A1').end('down').last_cell.row
        lrow_purchases = sht_purchases.range('A1').end('down').last_cell.row
        lrow_detail = lrow_detail if lrow_detail < 2**20 else 0
        lrow_sources = lrow_sources if lrow_sources < 2**20 else 0
        lrow_wells = lrow_wells if lrow_wells < 2**20 else 0
        lrow_buyers = lrow_buyers if lrow_buyers < 2**20 else 0
        lrow_purchases = lrow_purchases if lrow_purchases < 2**20 else 0
        # append each dataframe below the existing rows, writing headers only
        # when the sheet is still empty
        sht_detail.range(lrow_detail + 1, 1).options(index=False, header=not lrow_detail).value = \
            pwsd.get_basic_detail()
        sht_sources.range(lrow_sources + 1, 1).options(index=False, header=not lrow_sources).value = \
            pwsd.get_sources()
        sht_wells.range(lrow_wells + 1, 1).options(index=False, header=not lrow_wells).value = \
            pwsd.get_wells()
        if get_buyers_and_purchases:
            sht_buyers.range(lrow_buyers + 1, 1).options(index=False, header=not lrow_buyers).value = \
                pwsd.get_buyers()
            sht_purchases.range(lrow_purchases + 1, 1).options(index=False, header=not lrow_purchases).value = \
                pwsd.get_purchases()
    except Exception as e:
        msgbox.show_error('Python Error', str(e))
    sht_detail.book.app.screen_updating = True
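# pws_detail expects to be invoked from Excel via RunPython, so xw.Book.caller()
# resolves only in that context. For debugging from a plain Python session,
# xlwings provides set_mock_caller() to stand in for the calling workbook.
# A minimal sketch, assuming a local workbook 'pws.xlsm' (hypothetical name)
# that already contains the five sheets:

def _debug_pws_detail(url_detail, url_summary):
    # hypothetical workbook name; point this at the real .xlsm
    xw.Book('pws.xlsm').set_mock_caller()
    pws_detail(url_detail, url_summary)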
def search_dir_topdown(pattern, filename, dt1=None, dt2=None):
    try:
        fn = os.path.join(os.getcwd(), 'file_search.config')
        last_dir = configsectionmap(fn, 'last path')
        dir_selected = fdd.get_directory(last_dir)
        # be sure the directory is not None and exists before saving the last path
        if dir_selected and os.path.isdir(dir_selected):
            update_setting(fn, 'last path', 'last', dir_selected)
        if not dir_selected:
            return
        # create dataframe from the generator, tweak it, write to excel
        df = pd.DataFrame(find_files(dir_selected, pattern, dt1, dt2),
                          columns=['file', 'directory', 'filename',
                                   'extension', 'file_size', 'lastmod'])
        df['directory'] = '=HYPERLINK("' + df['directory'] + '")'
        df['filename'] = '=HYPERLINK("' + df['file'] + '", "' + df['filename'] + '")'
        df = df[['directory', 'filename', 'extension', 'file_size', 'lastmod']]
        # record the search stats as two empty columns whose headers carry the info
        files_searched_header = 'files searched: {:,.0f}'.format(num_files_searched)
        df.insert(5, files_searched_header, '')
        df.insert(6, pattern, '')
        if df.shape[0] == 0:
            msgbox.show_message('Bummer', 'No files found using that expression')
            return
        filename = os.path.splitext(filename)[0]
        # if using pandas to save to excel:
        # df.to_excel(filename + '.xlsx', index=False)
        # os.startfile(filename + '.xlsx')
        # if using xlwings to save to excel
        new_excel_wb = New_Excel_WB()
        try:
            new_excel_wb.df_to_excel(df=df, path=filename + '.xlsx')
        except Exception:
            # too many rows for one sheet: close the workbook and fall back to csv
            new_excel_wb.close()
            msgbox.show_message('Info',
                                'There are too many rows to write to Excel.\nSwitching to csv.')
            df.to_csv(filename + '.csv', index=False)
            os.startfile(filename + '.csv')
    except PermissionError:
        msgbox.show_error('Permission Error', filename + '.xlsx already open')
    except Exception as e:
        msgbox.show_error('Error', str(e))
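# find_files is assumed to be a generator elsewhere in this repo that walks
# dir_selected top-down and yields one tuple per matching file in the column
# order used above: (file, directory, filename, extension, file_size, lastmod),
# optionally filtered by last-modified dates dt1/dt2. A minimal sketch under
# those assumptions; it treats pattern as a regex, and the real helper also
# maintains the module-level num_files_searched counter:
import datetime

def _find_files_sketch(top, pattern, dt1=None, dt2=None):
    rx = re.compile(pattern, re.IGNORECASE)
    for dirpath, dirnames, filenames in os.walk(top):
        for name in filenames:
            if not rx.search(name):
                continue
            full = os.path.join(dirpath, name)
            try:
                stat = os.stat(full)
            except OSError:
                continue  # unreadable or vanished file
            lastmod = datetime.datetime.fromtimestamp(stat.st_mtime)
            if dt1 and lastmod < dt1:
                continue
            if dt2 and lastmod > dt2:
                continue
            yield full, dirpath, name, os.path.splitext(name)[1], stat.st_size, lastmod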
def okclick(self):
    self.county = self.entered_county.get()
    print(self.county)
    self.buyers = self.chk_val_buyers.get()
    self.purchases = self.chk_val_purchases.get()
    # reset dataframes and dictionaries
    self.df = pd.DataFrame()
    self.df_buyer = pd.DataFrame()
    self.df_source = pd.DataFrame()
    self.df_purchase = pd.DataFrame()
    self.pwsDict = {}
    self.buyerDict = {}
    self.sourceDict = {}
    self.purchaseDict = {}
    # create a Pandas Excel writer using XlsxWriter as the engine
    xlsfile = os.path.join(os.getcwd(), self.county + '.xlsx')
    writer = pd.ExcelWriter(xlsfile, engine='xlsxwriter')
    # get a county pws object, then the urls for the selected county
    # the list can be sliced for testing --> county_pws.get_urls(self.county.upper())[0:3]
    county_pws = CountyPWS()
    pws_list = county_pws.get_urls(self.county.upper())
    max_to_process = len(pws_list)
    self.num_systems = len(pws_list)
    # get the pws detail for each url, reporting progress to the console
    for i, url in enumerate(pws_list[:max_to_process]):
        print(str(int(round((i + 1) / max_to_process * 100, 0))) + '% : ' + url)
        try:
            self.pws_detail(url)
        except Exception as e:
            msgbox.show_error('Error', 'Error processing ' + url + '\n' + str(e))
    # convert each dataframe to an XlsxWriter Excel object
    self.df.to_excel(writer, sheet_name='Detail')
    self.df_source.to_excel(writer, sheet_name='Sources')
    if self.buyers:
        self.df_buyer.to_excel(writer, sheet_name='Buyers')
    if self.purchases:
        self.df_purchase.to_excel(writer, sheet_name='Purchases')
    # close the Pandas Excel writer, output the Excel file, and open it
    writer.save()
    os.startfile(xlsfile)
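# Note: writer.save() is correct for the older pandas this module targets, but
# it was removed in pandas 2.0 in favor of close(). On current pandas the same
# output can be produced with ExcelWriter as a context manager, which saves on
# exit. A minimal sketch (the frames dict shape is an assumption for
# illustration, not part of this class):

def _write_frames_sketch(frames, xlsfile):
    # frames: dict mapping sheet name -> DataFrame
    with pd.ExcelWriter(xlsfile, engine='xlsxwriter') as writer:
        for sheet_name, frame in frames.items():
            frame.to_excel(writer, sheet_name=sheet_name)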
def get_samples():
    try:
        # sht = xw.Book.caller().sheets[0]
        sht = xw.Book.caller().sheets.active
        sht.range('D1').expand().clear_contents()
        url = sht.range('B4').value
        if 'AnalyteListByCode' not in url:
            raise ValueError('url must represent Analyte List by Code')
        headers = {
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/51.0.2704.103 Safari/537.36'
        }
        r = requests.get(url=url, headers=headers)
        doc = lxml.html.fromstring(r.content)
        tables = doc.findall('.//table')
        # get name of PWS in 4th table
        info_table = tables[3]
        tr_info = info_table.findall('.//tr')
        td_info = [_unpack(tr, kind='p') for tr in tr_info[2]]
        sys_name = td_info[1]
        sysnum = sht.range('A3').value
        sht.range('B8').value = sys_name[0] + ' (' + sysnum + ')'
        # links are in the 5th table
        analyte_table = tables[4]
        # each analyte's info is in a separate tr
        tr_tags = analyte_table.findall('.//tr')
        a_tags = [tr.findall('.//a') for tr in tr_tags[2:]]
        # use a list comprehension to get all the urls
        base_url = 'https://dww2.tceq.texas.gov/DWW/JSP/'
        urls = [base_url + re.sub(r'\s', '', a_tag[0].get('href')) for a_tag in a_tags]
        # for each tr, get all the td's
        td_vals = [_unpack(tr) for tr in tr_tags[2:]]
        # get the table values and create the analyte dataframe
        analyte_codes = [el[0] for el in td_vals]
        analyte_names = [el[1] for el in td_vals]
        analyte_type = [el[2] for el in td_vals]
        analyte_num_results = [el[3] for el in td_vals]
        df_analyte = pd.DataFrame({'URL': urls,
                                   'Code': analyte_codes,
                                   'Name': analyte_names,
                                   'Type': analyte_type,
                                   'Number Results': analyte_num_results})
        # get the list of analytes to filter the dataframe
        filter_list = sht.range('B10').options(ndim=1).expand('down').value
        if isinstance(filter_list, str):
            filter_list = [filter_list]
        if filter_list is None:
            raise TypeError('There is no analyte list')
        df_scrape = df_analyte[df_analyte['Name'].isin(filter_list)]
        if df_scrape.empty:
            raise ValueError('None of the analytes in the list exist for this PWS')
        # for each url, get the sampling data and build a pandas dataframe
        df_sampling = pd.DataFrame()
        columns = ['Code', 'Name', 'Facility', 'Sample Point', 'Sample Collection Date',
                   'TCEQ Sample ID', 'Laboratory Sample ID', 'Method', 'Less Than Ind.',
                   'Level Type', 'Reporting Level', 'Concentration', 'Current MCL']
        for index, row in df_scrape.iterrows():
            r = requests.get(url=row['URL'], headers=headers)
            doc = lxml.html.fromstring(r.content)
            tables = doc.findall('.//table')
            # sampling data in the 5th table
            sampling_table = tables[4]
            # get the tr's: headings in the 2nd tr, data starts in 3rd tr
            tr_tags = sampling_table.findall('.//tr')
            # get the headers: html is inconsistent - use xpath function to get all nodes within the th
            # headers = [_unpack2(tr, xpath_arg='node()') for tr in tr_tags[1]]
            # headers = [el[0] for el in headers]
            # get the values
            td_vals = [_unpack(tr) for tr in tr_tags[2:]]
            # append one dict per sample to the sampling dataframe
            # (DataFrame.append was removed in pandas 2.0; this targets the older API)
            for i, item in enumerate(td_vals):
                df_sampling = df_sampling.append(dict(zip(columns, item)), ignore_index=True)
        df_sampling['Sample Collection Date'] = pd.to_datetime(df_sampling['Sample Collection Date'])
        # put columns in correct order
        df_sampling = df_sampling[columns]
        if not df_sampling.empty:
            sht.range('D1').options(index=False).value = df_sampling
        else:
            msgbox.show_message('Message', 'No results')
    except Exception as e:
        msgbox.show_error('Error', str(e))
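# _unpack is assumed to return the text of each cell in a table row; the kind
# argument selects which tag is read ('td' by default; 'th' or 'p' where the
# markup differs). A minimal lxml sketch under those assumptions -- the real
# helper lives elsewhere in this module:

def _unpack_sketch(row, kind='td'):
    # text_content() flattens any markup nested inside the cell
    return [el.text_content() for el in row.findall('.//' + kind)]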
def get_detail(self, url_detail, url_summary, buyers=True, purchases=True):
    headers = {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/51.0.2704.103 Safari/537.36'}
    r = requests.get(url=url_detail, headers=headers)
    doc = lxml.html.fromstring(r.content)
    tables = doc.findall('.//table')
    # sys. no., name, pop in 4th table
    systable = tables[3]
    rows = systable.findall('.//tr')
    sysinfo = self._unpack(rows[1])
    sysinfo = ["".join(x.split()) for x in sysinfo]
    self.pwsDict['Sys Num'] = sysinfo[1].strip()
    self.pwsDict['Sys Type'] = sysinfo[3].strip()
    # name is in the 3rd tr; \xa0 is a non-breaking space left in the scraped text
    sysinfo = self._unpack(rows[2])
    sysinfo = [re.sub(r'(\s+|\xa0)', ' ', val) for val in sysinfo]
    self.pwsDict['Sys Name'] = sysinfo[1].strip()
    self.pwsDict['Primary Source Type'] = sysinfo[3].strip()
    # pop is in the 6th tr
    sysinfo = self._unpack(rows[5])
    sysinfo = ["".join(x.split()) for x in sysinfo]
    self.pwsDict['Population'] = sysinfo[1].strip()
    # contact in the 5th table
    systable = tables[4]
    rows = systable.findall('.//tr')
    if len(rows) >= 3:
        sysinfo = self._unpack(rows[2])
        sysinfo = [re.sub(r'(\s+|\xa0)', ' ', val) for val in sysinfo]
        self.pwsDict['Contact'] = sysinfo[1].strip()
        if len(sysinfo) >= 5:
            self.pwsDict['Business Phone'] = sysinfo[4].strip()
        if len(sysinfo) >= 7:
            self.pwsDict['Mobile Phone'] = sysinfo[6].strip()
    # the number of buyers and sellers dictates how many tables there are, so
    # certain tables are not always in the same location; look at the th
    # values to find them
    for table in tables:
        rows = table.findall('.//tr')
        # a non-empty list of rows is truthy; an empty list is falsy
        if rows:
            header = self._unpack(rows[0], kind='th')
            if header is not None:
                if header[0] == 'Buyers of Water':
                    self.buyertable = table
                elif header[0] == 'Water Purchases':
                    self.purchasetable = table
                elif header[0] == 'Sources of Water':
                    self.sourcetable = table
                elif header[0] == 'WS Flow Rates':
                    self.flowratetable = table
                elif header[0] == 'WS Measures':
                    self.measurestable = table
    # WS Flow Rates table
    if self.flowratetable is not None:
        rows = self.flowratetable.findall('.//tr')
        if len(rows) >= 3:
            flowinfo = self._unpack(rows[2])
            flowinfo = [re.sub(r'(\s+|\xa0)', ' ', val) for val in flowinfo]
            self.pwsDict['Max Daily Demand'] = flowinfo[1].strip() + ' (' + flowinfo[2].strip() + ')'
        if len(rows) >= 4:
            flowinfo = self._unpack(rows[3])
            flowinfo = [re.sub(r'(\s+|\xa0)', ' ', val) for val in flowinfo]
            self.pwsDict['Provided Prod. Capacity'] = flowinfo[1].strip() + ' (' + flowinfo[2].strip() + ')'
        if len(rows) >= 5:
            flowinfo = self._unpack(rows[4])
            flowinfo = [re.sub(r'(\s+|\xa0)', ' ', val) for val in flowinfo]
            self.pwsDict['Provided Service Pump Capacity'] = flowinfo[1].strip() + ' (' + flowinfo[2].strip() + ')'
        if len(rows) >= 6:
            flowinfo = self._unpack(rows[5])
            flowinfo = [re.sub(r'(\s+|\xa0)', ' ', val) for val in flowinfo]
            self.pwsDict['Avg. Daily Usage'] = flowinfo[1].strip() + ' (' + flowinfo[2].strip() + ')'
    # WS Measures table
    if self.measurestable is not None:
        rows = self.measurestable.findall('.//tr')
        if len(rows) >= 3:
            flowinfo = self._unpack(rows[2])
            flowinfo = [re.sub(r'(\s+|\xa0)', ' ', val) for val in flowinfo]
            self.pwsDict['Total Storage Cap.'] = flowinfo[1].strip() + ' (' + flowinfo[2].strip() + ')'
        if len(rows) >= 4:
            flowinfo = self._unpack(rows[3])
            flowinfo = [re.sub(r'(\s+|\xa0)', ' ', val) for val in flowinfo]
            self.pwsDict['Total Pressure Tank Cap.'] = flowinfo[1].strip() + ' (' + flowinfo[2].strip() + ')'
        if len(rows) >= 5:
            flowinfo = self._unpack(rows[4])
            flowinfo = [re.sub(r'(\s+|\xa0)', ' ', val) for val in flowinfo]
            self.pwsDict['Elevated Storage Cap.'] = flowinfo[1].strip() + ' (' + flowinfo[2].strip() + ')'
    # add dictionary to dataframe
    self.df = self.df.append(self.pwsDict, ignore_index=True)
    # put columns in correct order
    self.df = self.df[['Sys Num', 'Sys Name', 'Sys Type', 'Primary Source Type', 'Population',
                       'Contact', 'Business Phone', 'Mobile Phone', 'Max Daily Demand',
                       'Provided Prod. Capacity', 'Provided Service Pump Capacity',
                       'Avg. Daily Usage', 'Total Storage Cap.', 'Total Pressure Tank Cap.',
                       'Elevated Storage Cap.']]

    ###################
    # get the sources #
    ###################
    self.sourceDict['Sys Name'] = self.pwsDict['Sys Name']
    self.sourceDict['Sys Num'] = self.pwsDict['Sys Num']
    if self.sourcetable is not None:
        rows = self.sourcetable.findall('.//tr')
        if len(rows) >= 3:
            # get sources as a list of lists
            sources = [self._unpack(row) for row in rows[2:]]
            for xlr, row in enumerate(sources):
                for xlc, val in enumerate(row):
                    if xlc == 0:
                        self.sourceDict['Source Name'] = val.strip()
                    if xlc == 1:
                        self.sourceDict['Type'] = val.strip()
                    if xlc == 2:
                        self.sourceDict['Activity'] = val.strip()
                    if xlc == 3:
                        self.sourceDict['Availability'] = val.strip()
                self.df_source = self.df_source.append(self.sourceDict, ignore_index=True)
        else:
            # there are no sources
            self.sourceDict['Source Name'] = 'NO SOURCES LISTED'
            self.df_source = self.df_source.append(self.sourceDict, ignore_index=True)
    else:
        self.sourceDict['Source Name'] = 'SOURCE TABLE NOT FOUND'
        self.df_source = self.df_source.append(self.sourceDict, ignore_index=True)
    self.df_source = self.df_source[['Sys Name', 'Sys Num', 'Source Name', 'Type',
                                     'Activity', 'Availability']]

    ##################
    # get the buyers #
    ##################
    try:
        self.buyerDict['Sys Name'] = self.pwsDict['Sys Name']
        self.buyerDict['Sys Num'] = self.pwsDict['Sys Num']
        if self.buyertable is not None and buyers:
            rows = self.buyertable.findall('.//tr')
            if rows and len(rows) >= 3:
                thebuyers = [self._unpack(row) for row in rows[2:]]
                # each buyer holds who is buying, their population, and their
                # status, separated by '/' -- remove the whitespace first
                if thebuyers:
                    thebuyers = [re.sub(r'(\s+|\xa0)', ' ', val) for vals in thebuyers for val in vals]
                    # split on '/', creating a list of lists
                    buyers_split = [x.split('/') for x in thebuyers]
                    if buyers_split:
                        for xlr, row in enumerate(buyers_split):
                            if row:
                                for xlc, val in enumerate(row):
                                    if xlc == 0:
                                        self.buyerDict['Buyer'] = val.strip()
                                    if xlc == 1:
                                        self.buyerDict['Buyer Pop'] = val.strip()
                                    if xlc == 2:
                                        self.buyerDict['Buyer Status'] = val.strip()
                                self.df_buyer = self.df_buyer.append(self.buyerDict, ignore_index=True)
                                if xlr > 25000:
                                    self.buyerDict['Buyer'] = 'BUYER DATA TRUNCATED DUE TO LENGTH. ' \
                                                              'SEE TCEQ FOR MORE INFO.'
                                    self.buyerDict['Buyer Pop'] = ''
                                    self.buyerDict['Buyer Status'] = ''
                                    self.df_buyer = self.df_buyer.append(self.buyerDict, ignore_index=True)
                                    break
        else:
            self.buyerDict['Buyer'] = 'BUYER TABLE NOT FOUND'
            self.df_buyer = self.df_buyer.append(self.buyerDict, ignore_index=True)
    except Exception as e:
        msgbox.show_error('Python Error', str(e))
        self.buyerDict['Buyer'] = 'PROBLEM READING BUYER TABLE IN HTML.'
        self.df_buyer = self.df_buyer.append(self.buyerDict, ignore_index=True)
    self.df_buyer = self.df_buyer[['Sys Name', 'Sys Num', 'Buyer', 'Buyer Pop', 'Buyer Status']]
    self.df_buyer = self.df_buyer.drop_duplicates()

    #####################
    # get the purchases #
    #####################
    try:
        self.purchaseDict['Sys Name'] = self.pwsDict['Sys Name']
        self.purchaseDict['Sys Num'] = self.pwsDict['Sys Num']
        if self.purchasetable is not None and purchases:
            rows = self.purchasetable.findall('.//tr')
            if len(rows) >= 3:
                # use a new name so the purchases parameter is not shadowed
                purchase_rows = [self._unpack(row) for row in rows[2:]]
                # remove the whitespace
                purchase_rows = [re.sub(r'(\s+|\xa0)', ' ', val) for vals in purchase_rows for val in vals]
                for xlr, row in enumerate(purchase_rows):
                    self.purchaseDict['Purchase Info'] = row.strip()
                    self.df_purchase = self.df_purchase.append(self.purchaseDict, ignore_index=True)
                    if xlr > 2000:
                        self.purchaseDict['Purchase Info'] = ('PURCHASE DATA TRUNCATED DUE TO LENGTH: '
                                                              'SEE TCEQ FOR MORE INFO')
                        self.df_purchase = self.df_purchase.append(self.purchaseDict, ignore_index=True)
                        break
        else:
            self.purchaseDict['Purchase Info'] = 'PURCHASE TABLE NOT FOUND'
            self.df_purchase = self.df_purchase.append(self.purchaseDict, ignore_index=True)
    except Exception:
        self.purchaseDict['Purchase Info'] = 'PURCHASE TABLE NOT FOUND'
        self.df_purchase = self.df_purchase.append(self.purchaseDict, ignore_index=True)
    self.df_purchase = self.df_purchase[['Sys Name', 'Sys Num', 'Purchase Info']]
    self.df_purchase = self.df_purchase.drop_duplicates()

    ############################################
    # get the well details from summary sheet  #
    ############################################
    try:
        # reuse the headers defined at the top of this method
        r = requests.get(url=url_summary, headers=headers)
        doc = lxml.html.fromstring(r.content)
        tables = doc.findall('.//table')
        for table in tables:
            rows = table.findall('.//tr')
            if rows:
                stuff = [self._unpack2(row) for row in rows]
                if stuff and len(stuff) >= 3:
                    if stuff[0] and stuff[0][0] == '(Active Sources)':
                        for i, item in enumerate(stuff):
                            if isinstance(item, list):
                                if item[0] == 'SourceNumber':
                                    try:
                                        headings = [re.sub(r'(\s+|\xa0)', ' ', item) for item in stuff[i]]
                                        values = [re.sub(r'(\s+|\xa0)', ' ', item).replace(' GPM', '')
                                                  for item in stuff[i + 1]]
                                        if 'Depth' not in headings:
                                            headings.append('Depth')
                                            values.append('')
                                        if 'Tested GPM' not in headings:
                                            headings.append('Tested GPM')
                                            values.append('')
                                        if 'Rated GPM' not in headings:
                                            headings.append('Rated GPM')
                                            values.append('')
                                    except IndexError:
                                        # skip this source rather than reuse stale
                                        # headings/values from a prior iteration
                                        continue
                                    dict_well = dict(zip(headings, values))
                                    try:
                                        dict_well['Latitude'] = re.sub(r'(\s+|\xa0)', '', stuff[i + 7][0])
                                        dict_well['Longitude'] = re.sub(r'(\s+|\xa0)', '', stuff[i + 7][1])
                                    except IndexError:
                                        dict_well['Latitude'] = ''
                                        dict_well['Longitude'] = ''
                                    try:
                                        dict_well['Drill Date'] = re.sub(r'(\s+|\xa0)', '', stuff[i + 5][0])
                                        dict_well['Source Summary'] = re.sub(r'(\s+|\xa0)*$', '', stuff[i + 5][1])
                                    except IndexError:
                                        dict_well['Drill Date'] = '-'
                                        dict_well['Source Summary'] = '-'
                                    dict_well['Sys Name'] = self.pwsDict['Sys Name']
                                    dict_well['Sys Num'] = self.pwsDict['Sys Num']
                                    self.df_well = self.df_well.append(dict_well, ignore_index=True)
        if self.df_well.empty:
            dict_well = {'Sys Name': self.pwsDict['Sys Name'],
                         'Sys Num': self.pwsDict['Sys Num'],
                         'SourceNumber': 'No Active Sources',
                         'Source Name (Activity Status)': '-',
                         'OperationalStatus': '-',
                         'SourceType': '-',
                         'Depth': '-',
                         'Tested GPM': '-',
                         'Rated GPM': '-',
                         'Drill Date': '-',
                         'Source Summary': '-',
                         'Latitude': '-',
                         'Longitude': '-'}
            self.df_well = self.df_well.append(dict_well, ignore_index=True)
        self.df_well = self.df_well[['Sys Name', 'Sys Num', 'SourceNumber',
                                     'Source Name (Activity Status)', 'OperationalStatus',
                                     'SourceType', 'Depth', 'Tested GPM', 'Rated GPM',
                                     'Drill Date', 'Source Summary', 'Latitude', 'Longitude']]
        self.df_well = self.df_well.drop_duplicates()
    except Exception as e:
        msgbox.show_error('Error in well summary', str(e))
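# A minimal sketch of driving PWS_Detail outside Excel -- the accessor names
# are grounded by pws_detail() above; url_detail and url_summary are
# placeholders for a system's TCEQ DWW detail and summary page URLs:

def _pws_detail_sketch(url_detail, url_summary):
    pwsd = PWS_Detail()
    pwsd.get_detail(url_detail, url_summary, buyers=True, purchases=True)
    # each accessor is assumed to return the corresponding dataframe
    return pwsd.get_basic_detail(), pwsd.get_sources(), pwsd.get_wells()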