def main(url, mode='txt'):
    """Crawl every page of the CBIRC listing site, resuming from a checkpoint.

    Reads the last processed page from baojianhui_log.txt when present,
    then walks the site page by page: main_operation() handles the current
    page, the pager's "next" button advances, and the checkpoint file is
    rewritten after each page turn.

    url  -- listing-page URL prefix; the page number is appended to it.
    mode -- passed straight through to main_operation ('txt' by default).
    Returns True once the final page has been processed.
    """
    if os.path.exists('baojianhui_log.txt'):
        # Resume: first comma-separated field is the page to continue from.
        with open('baojianhui_log.txt', 'r', encoding='utf-8') as f:
            params = f.read().split(',')
            curr_page = params[0]
    else:
        # First run: start from page 1.
        curr_page = 1
    url_link = str(url) + str(curr_page)
    web_init(url_link)
    # The pager widget shows "current/total"; take the total after the '/'.
    total_page = int(
        t.read(element_identifier='//div[@class = "ng-binding"][last()]').
        split('/')[-1])
    while int(curr_page) < int(total_page):
        # Process the current page, then turn to the next one.
        main_operation(url, mode)
        print('click once')
        t.click(element_identifier='//a[@ng-click = "pager.next()"]')  # next page
        t.wait(5)
        # Re-read the pager to learn which page we landed on.
        curr_page = int(
            t.read(element_identifier='//div[@class = "ng-binding"][last()]').
            split('/')[0])
        with open('baojianhui_log.txt', 'w', encoding='utf-8') as f:
            # After a page turn, reset the list/item indices in the checkpoint.
            f.write(str(curr_page) + ',' + str(1) + ',' + str(1))
    if curr_page == total_page:
        # Last page: one final pass is enough.
        main_operation(url, mode)
    t.close()
    return True
def read_content(page_num, url_prefix, i, today):
    """Open listing page `page_num` and extract metadata for item `i`.

    today -- date string (YYYY-MM-DD); lexicographic '<' on these strings
    is a chronological compare, so items published before `today` cause
    an early return of four empty strings (and the browser is closed).

    Returns (flag, pub_date, content_url, file_name) where `flag` is the
    raw href the caller inspects to decide between download and read.
    Raises Exception when the expected title or link element is missing.
    """
    t.url(url_prefix + str(page_num) + '.html')  # page load can be slow
    t.wait(2)
    # Hoist the shared item XPath instead of rebuilding it per call.
    item_xpath = '//td[@colspan = "2"]//table[' + str(i) + ']'
    if t.read(element_identifier=item_xpath +
              '//span[@class = "hui12"]') < today:
        t.close()
        return '', '', '', ''
    title = t.read(element_identifier=item_xpath)
    if '' == title:
        print("no here")
        raise Exception("item title not found: page %s, item %s" %
                        (page_num, i))
    # The title text ends with a 10-char date; splice it into the name.
    file_name = title[:-10] + str("_") + title[-10:] + str('.txt')
    pub_date = file_name[-14:-4]  # renamed from `time` (shadowed the module)
    # Read the href once and reuse it (was read three times before).
    href = t.read(element_identifier=item_xpath + '//a/@href')
    if '' == href:
        print("no here")
        raise Exception("item link not found: page %s, item %s" %
                        (page_num, i))
    prefix = 'http://www.pbc.gov.cn'
    content_url = prefix + href
    flag = href
    return flag, pub_date, content_url, file_name
def rpa_process(to_date, preferred_time, from_date, phone_number, token):
    """Move an existing appointment on the test booking site.

    Looks the customer up by phone number, selects the appointment dated
    from_date, re-books it at to_date (date plus hour/minute), then hands
    the original date objects over to change_appointment_slot().
    """
    t.init()
    t.url("https://sangam-test-website.herokuapp.com/change_input")
    util.wait_for_pageload('//button[@id="btnsubmit"]')
    # Identify the customer by phone number and submit the lookup form.
    t.click('//input[@id="txtHandNo"]')
    t.type('//input[@name="txtHandNo"]', phone_number)
    t.click('//button[@id="btnsubmit"]')
    util.wait_for_pageload('//button[@id="btnsubmit"]')
    # Pick the appointment being moved via its dd/mm/yyyy label text.
    original_date = from_date
    t.click('//label[contains(.,"' +
            str(from_date.strftime("%d/%m/%Y")) + '")]')
    # Target slot: date text box plus the hour and minute dropdowns.
    target_date = to_date
    new_hour = to_date.hour
    new_minute = to_date.minute
    t.click('//input[@name="txtDateTimePicker"]')
    t.type('//input[@name="txtDateTimePicker"]', to_date.strftime("%d/%m/%Y"))
    t.click('//div[@class="filter-option-inner-inner"]')
    t.click('//a[@role= "option"][.=' + str(new_hour) + ']')
    t.click('//select[@id="ddlMin"]')
    t.click('//a[@role= "option"][.=' + str(new_minute) + ']')
    t.click('//button[@id="btnsubmit"]')
    t.close()
    change_appointment_slot(original_date, target_date, token)
def propertydata(project_name):
    """Scrape the first PropertyGuru listings matching `project_name`.

    Collects each listing's detail URL, title, listing id and brochure
    PDF link, then writes everything to 'Property Monitor.xlsx'.
    """
    t.close()
    t.init()
    project_url = f'https://www.propertyguru.com.sg/property-for-sale?market=residential&freetext={project_name}&newProject=all'
    t.url(project_url)
    wait_for_pageload('//div[@class="header-wrapper"]')
    num_result_ad = 3
    # Load the results page and collect the detail-page links.
    url = [''] * num_result_ad
    for n in [x for x in range(1, num_result_ad + 1) if x != 4 and x != 8]:
        # positions 4 and 8 are advertisement slots, skip them
        wait_for_pageload(
            f'(//div[@class="listing-widget-new"]/div[{n}]/div[1]/div[2]/div[1]/div[1]/h3/a/@href)'
        )
        url[n - 1] = read_if_present(
            f'(//div[@class="listing-widget-new"]/div[{n}]/div[1]/div[2]/div[1]/div[1]/h3/a/@href)'
        )
        print(f"{n}. url = " + url[n - 1])
    property_title = [''] * num_result_ad
    # Renamed from `id`, which shadowed the Python builtin.
    listing_id = [''] * num_result_ad
    pdf = [''] * num_result_ad
    pdf_link = [''] * num_result_ad
    for n in [x for x in range(1, num_result_ad + 1) if x != 4 and x != 8]:
        t.url("https://www.propertyguru.com.sg" + url[n - 1])
        wait_for_pageload('//h1[@class="h2"]')
        property_title[n - 1] = read_if_present('//h1[@class="h2"]')
        print(f"{n}. property_title = " + property_title[n - 1])
        listing_id[n - 1] = read_if_present(
            '//*[@id="details"]/div/div[1]/div[2]/div[10]/div/div[2]')
        print(f"{n}. id = " + listing_id[n - 1])
        pdf[n - 1] = read_if_present(
            '//*[@id="sticky-right-col"]/div[3]/a[2]/@href')
        pdf_link[n - 1] = 'https://www.propertyguru.com.sg' + pdf[n - 1]
        print(f"{n}. pdf_link = " + pdf_link[n - 1])
    property_info = {
        'property_title': property_title,
        'url': ['https://www.propertyguru.com.sg' + x for x in url],
        'id': listing_id,
        'pdf_link': pdf_link,
    }
    df = DataFrame(property_info,
                   columns=['property_title', 'id', 'url', 'pdf_link'])
    df.to_excel('Property Monitor.xlsx', encoding='utf8', index=None)
    print('======== Property Monitor.xlsx saved ==========')
    # NOTE(review): `interval` must be a module-level name defined
    # elsewhere in this file — confirm before relying on this message.
    print(f'======== Monitoring every {interval} second ==========')
def rpa_process(lmp_date, doctor_name, preferred_time, phone_number,
                patient_name, symptoms, email, sub_id):
    """Book the standard series of pregnancy checkups on the test site.

    Computes checkup dates as fixed offsets from the last menstrual
    period (lmp_date), submits one booking form per date with the chosen
    doctor and preferred time, then fetches the booked appointments back
    and pushes them to the calendar via book_calendar_slot().

    Note: the `symptoms` parameter is overwritten inside the loop with a
    per-week description.
    """
    hour = preferred_time.hour
    minute = preferred_time.minute
    checkup_dates = []
    # Days after LMP for each routine checkup, and the matching pregnancy
    # week used in the visit description below.
    day_list = [
        45, 75, 105, 135, 165, 195, 210, 225, 240, 255, 262, 269, 275, 280
    ]
    week_list = [6, 10, 14, 18, 22, 26, 28, 30, 32, 34, 36, 37, 38, 39]
    for day in day_list:
        checkup = lmp_date + timedelta(days=day)
        # Format as d/m/yyyy to match the site's date field.
        checkup = str(checkup.day) + "/" + str(checkup.month) + "/" + str(
            checkup.year)
        checkup_dates.append(checkup)
    t.init()
    for index, i in enumerate(checkup_dates):
        # One booking form submission per checkup date.
        t.url("https://sangam-test-website.herokuapp.com/")
        util.wait_for_pageload('//button[@id="btnsubmit"]')
        t.click('//input[@class="form-control"]')
        t.type('//input[@name="name"]', patient_name)
        t.click('//input[@id="email"]')
        t.type('//input[@name="email"]', email)
        symptoms = "Pregnancy checkup after week " + str(week_list[index])
        t.type('//textarea', symptoms)
        t.click('//input[@id="txtHandNo"]')
        t.type('//input[@name="txtHandNo"]', phone_number)
        # Hour and minute dropdowns.
        t.click('//div[@class="filter-option-inner-inner"]')
        t.click('//a[@role= "option"][.=' + str(hour) + ']')
        t.click('//select[@id="ddlMin"]')
        t.click('//a[@role= "option"][.=' + str(minute) + ']')
        # Appointment date.
        t.click('//input[@name="txtDateTimePicker"]')
        t.type('//input[@name="txtDateTimePicker"]', i)
        # Doctor selection.
        t.click('//select[@id="txtSpecificDoc"]')
        t.click('//a[@role= "option"][.="' + str(doctor_name) + '"]')
        t.click('//button[@id="btnsubmit"]')
    t.close()
    # Pull the booked appointments back and sync them to the calendar.
    request_url = "https://sangam-test-website.herokuapp.com/get_future_appointments?email=" + str(
        email)
    future_appointments = requests.get(request_url)
    book_calendar_slot(future_appointments.json()['data'], sub_id)
def run():
    """Scrape the New York Times site entry and refresh its stored news."""
    db_conn = util.create_connection("./db/news.db")
    site_row = util.getSiteByName(db_conn, "New York Times")
    site_id = site_row[0][0]
    site_url = site_row[0][2]
    t.init(visual_automation=True, chrome_browser=True)
    t.url(site_url)
    t.wait(10)
    # Normalise the scraped frame: fixed site logo, then cleaned summaries.
    articles = catchContent()
    articles = util.fixImgLink(
        articles,
        "https://cf-templates-fghyux9ggb7t-ap-southeast-1.s3-ap-southeast-1.amazonaws.com/NewYorkTimes.png"
    )
    articles = util.fixSummary(articles)
    t.wait(20)
    t.close()
    util.updateNews(db_conn, site_id, articles)
def run():
    """Scrape the Today Online site entry and refresh its stored news."""
    db_conn = util.create_connection("./db/news.db")
    site_row = util.getSiteByName(db_conn, "Today Online")
    site_id = site_row[0][0]
    site_url = site_row[0][2]
    t.init(visual_automation=True, chrome_browser=True)
    t.url(site_url)
    t.wait(2)
    # Hover down to the footer so lazily-loaded articles get rendered.
    t.hover('//div[@class="container footer-main"]')
    t.wait(6)
    articles = catchContent()
    t.wait(20)
    t.close()
    util.updateNews(db_conn, site_id, articles)
def getFlightExcel(info,ind): flight_main, time_lst, code_lst, dur_lst, ind = getFlightInfo(info['dates'], ind) #print(flight_main['Details']) print(code_lst) print(dur_lst) print(time_lst) k = len(info['dates']) q = len(info['city']) flight_lst = [] for i in range(k): if i == (k-1) and i > 0 and q == k: flight_lst.append(info['dates'][i]) flight = info['city'][i] + '-' + info['city'][0] flight_lst.append(flight) else: flight_lst.append(info['dates'][i]) flight = info['city'][i] + '-' + info['city'][i + 1] flight_lst.append(flight) print(flight_lst) ###Compare Price with Expedia (Hyperlink/Multi to be added) for j in range(2): t.close() t.init() t.wait(0.5) flight_search(info) t.wait(5) flight_main['Flight Info'][j] = flight_lst price_exp, url_exp = getExpFlightPrice(code_lst[k*j:k*(j+1)], time_lst[k*j:k*(j+1)], dur_lst[k*j:k*(j+1)]) print(price_exp) print(url_exp) print(flight_main['Price']) if price_exp < flight_main['Price'][j]: if price_exp != 0: flight_main['Price'][j] = price_exp flight_main['Hyperlink'][j] = url_exp print(flight_main['Price']) print(flight_main['Hyperlink']) return flight_main
def history_data(url_prefix, start_page=1):
    """Crawl historical documents under url_prefix, resumable on failure.

    Page and item counts come from helper-produced temp files. Each item
    is fetched via read_content(); hrefs that are not .html pages are
    downloaded directly, everything else is read as page text. On any
    error the current (page, item) position is checkpointed to the
    complete_log file and False is returned; True means the crawl ended.
    """
    curr_page = 1
    curr_doc = 1
    # Checkpoint file name, derived from the crawled site section.
    # Hoisted: the same expression was rebuilt at every use before.
    log_name = 'complete_log' + str(url_prefix.split('/')[-2]) + '.txt'
    try:
        t.init()
        page_file = get_max_page(url_prefix)
        with open(page_file, 'r') as f:
            # +1 because range() excludes its upper bound.
            max_page = int(f.read()) + 1
        os.remove(page_file)
        for page_num in range(start_page, max_page):
            curr_page = page_num
            count_values_file = get_count_values(page_num, url_prefix)
            with open(count_values_file, 'r') as f:
                # Item count for this page, again +1 for range().
                count_values = int(f.read().split(':')[-1]) + 1
            os.remove(count_values_file)
            for i in range(1, count_values):
                if os.path.exists(log_name):
                    # Skip items already handled by a previous run.
                    with open(log_name, 'r') as f:
                        start_doc = f.read().split(',')[1]
                    if i < int(start_doc):
                        continue
                curr_doc = i
                # NOTE(review): the read_content defined alongside this
                # function takes a fourth `today` argument — confirm which
                # version is actually in scope here.
                flag, time, content_url, file_name = read_content(
                    page_num, url_prefix, i)
                if '.html' not in flag:
                    # Direct download: url, date suffix and item index.
                    direct_download(content_url, time, i)
                else:
                    # Otherwise open the page and read out its text.
                    read_text_content(content_url, file_name, page_num, i,
                                      time, url_prefix)
            # Page finished: advance the checkpoint and reset the item
            # index — unless this was already the last page.
            if page_num != max_page - 1:
                with open(log_name, 'w') as f:
                    f.write(str(page_num + 1) + ',' + str(1))
        t.close()
        return True
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate. Checkpoint the failure position for next time.
        with open(log_name, 'w') as f:
            f.write(str(curr_page) + ',' + str(curr_doc))
        t.close()
        return False
def rpa_process(from_date, phone_number, token):
    """Cancel the appointment on `from_date` for the given phone number.

    Looks the customer up by phone number, ticks the appointment carrying
    the dd/mm/yyyy date label, submits, then finishes the cancellation
    through cancel_appointment_slot().
    """
    t.init()
    t.url("https://sangam-test-website.herokuapp.com/cancel_input")
    util.wait_for_pageload('//button[@id="btnsubmit"]')
    # Look up appointments by phone number.
    t.click('//input[@id="txtHandNo"]')
    t.type('//input[@name="txtHandNo"]', phone_number)
    t.click('//button[@id="btnsubmit"]')
    util.wait_for_pageload('//button[@id="btnsubmit"]')
    # Select the appointment by its formatted date label and confirm.
    cancel_date = from_date
    t.click('//label[contains(.,"' +
            str(from_date.strftime("%d/%m/%Y")) + '")]')
    t.click('//button[@id="btnsubmit"]')
    t.close()
    cancel_appointment_slot(cancel_date, token)
def flight_search(flight_request):
    """Run a Skyscanner search for `flight_request` and export the deals.

    Scrapes flight options via getFlightExcel, tags them with the request
    id and search timestamp, stores them through dbf.newFlightDeals and
    returns whatever dbf.export_FlightDeals produces.
    """
    searched_at = dt.today()
    req_id = flight_request['Request_ID']
    details = flight_request['Request_Details']
    t.init()
    t.url('https://www.skyscanner.com.sg/')
    tu.wait_for_pageload('//input[@id="fsc-trip-type-selector-return"]')
    fill_search(details)
    start_index = 0
    deals = getFlightExcel(details, start_index)
    t.wait(10.0)
    t.close()
    # Tag the scraped deals with the originating request and timestamp.
    deals.update({
        'Request_ID': req_id,
        'Search_Datetime': searched_at
    })
    dbf.newFlightDeals(deals)
    out_file = dbf.export_FlightDeals(req_id, searched_at)
    return out_file
def get_news_using_crawler():
    """Scrape up to five WHO COVID-19 news articles, summarise and store them.

    Replaces the previously stored rows wholesale, then inserts one row
    per article (link, title, auto-generated summary, date stamp).
    Returns the status of the last insert_db() call, or None when an
    exception occurred (it is printed, not re-raised). The browser is
    always closed by the finally block.
    """
    try:
        t.url(
            'https://www.who.int/emergencies/diseases/novel-coronavirus-2019/media-resources/news'
        )
        wait_for_pageload('//p[@class="heading text-underline"]')
        num_news = t.count('//p[@class="heading text-underline"]')
        if num_news > 5:
            num_news = 5  # cap at the five most recent articles
        delete_news_data_db()  # wipe yesterday's rows before re-inserting
        date_stamp = datetime.datetime.now(
            pytz.timezone('Singapore')).strftime('%Y-%m-%d')
        for n in range(1, num_news + 1):
            data = {}
            data['date_stamp'] = date_stamp
            news_link = t.read(
                f'(//p[@class="heading text-underline"])[{n}]/ancestor-or-self::a/@href'
            )
            data['news_link'] = news_link
            news_title = t.read(
                f'(//p[@class="heading text-underline"])[{n}]/ancestor-or-self::a/@aria-label'
            )
            data['news_title'] = news_title
            print('Article', n, ":", news_title)
            print('')
            news_summaries = SummarizeUrl(news_link)
            data['news_summary'] = str(news_summaries)
            print(news_summaries)
            status = insert_db(data)
        # NOTE(review): `status` is unbound when num_news == 0; the
        # resulting NameError is swallowed by the except below — confirm
        # this is acceptable.
        return status
    except Exception as e:
        print(e)
    finally:
        t.close()
def gethistorylist(input):
    """Crawl bank-product history for the year `input` from bank.jrj.com.cn.

    Filters the listing to products issued between Aug 1 and Dec 31 of
    that year, dumps each visited page's table to
    "<year>_<page>history_data.csv", accumulates serial number / rating /
    detail URL per row, writes the combined result to "<year>.csv", and
    returns the number of the last page visited.

    NOTE(review): the parameter shadows the builtin `input`.
    """
    input = str(input)
    date_start = input + '-08-01'  # first issue date of the crawl window
    date_end = input + '-12-31'    # last issue date of the crawl window
    # Start the browser session.
    t.init()
    # Open the bank-product data listing page.
    t.url("http://bank.jrj.com.cn/bankpro/data.shtml?type=1")
    # Give the page time to load.
    t.wait(5)
    # Hover over the date field and expand the condensed-options panel.
    t.hover(element_identifier='//*[@id="fxr"]')
    t.click(element_identifier='//*[@id="zksq"]')
    # Move to the issue-date field, type the window start date, search.
    t.hover(element_identifier='//*[@id="fxr"]')
    t.click(element_identifier='//*[@id="fxr"]')
    t.type(element_identifier='//*[@id="fxr"]', text_to_type=date_start)
    # Click again so the date popup does not cover the search button below.
    t.click(element_identifier='//*[@id="fxr"]')
    t.hover(element_identifier='//*[@class="ipf01"]')
    t.click(element_identifier='//*[@class="ipf01"]')
    # Show 50 products per page.
    t.hover(element_identifier='//*[@data-pagesize="50"]')
    t.click(element_identifier='//*[@data-pagesize="50"]')
    # Sort ascending by issue date (equivalent to "reading backwards").
    t.hover(element_identifier='//*[@data-sort = "sell_org_date"]')
    t.click(element_identifier='//*[@data-sort = "sell_org_date"]')
    # Pagination state.
    page_curr = 1    # current page index
    value_dict = {}  # scraped data, keyed by column name
    max_page = 1     # highest page number reached
    # Output column names.
    name_list = ['序号', '综合评级', 'url']
    for col_name in name_list:
        value_dict.setdefault(col_name, [])  # initialise empty columns
    # Loop while pages remain (or this is the only page).
    stop_flag = False
    while (t.read(element_identifier=
                  '//div[@id = "pagefoot"]//a[@class = "cur pf-disabled"]')
           == str(page_curr)) or (page_curr == 1):
        if stop_flag == True:  # nothing left inside this year; stop paging
            break
        max_page = page_curr
        # Row count on this page (t.count is 0-based, hence +1).
        count_values = int(
            t.count(element_identifier='//tbody[@id = "content"]//tr')
        ) + 1
        # Dump the whole visible table (titles and hrefs only) to csv.
        filename = str(input) + str("_") + str(page_curr) + "history_data.csv"
        t.wait(1)  # one extra second in case the page loaded slowly
        t.table(
            element_identifier='//div[@class = "table-s1 tab-s2 w100"]//table',
            filename_to_save=filename)
        for i in range(1, count_values):
            # Keep every row whose issue date falls within this year
            # (YYYY-MM-DD strings, so '<=' is a chronological compare).
            if str(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']//td[@class = "px"]')) <= date_end:
                # Product serial number (primary key for later joins).
                value_dict[name_list[0]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']/td[2]'))
                # Composite rating.
                value_dict[name_list[1]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']//td[12]//i/@title'))
                # Detail-page URL.
                value_dict[name_list[2]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']//a/@href'))
            else:
                # Past this year's window: stop both loops.
                stop_flag = True
                break
        # Turn the page by simulating hover + click on the pager link.
        page_curr += 1
        t.hover(element_identifier='//*[@href="' + str(page_curr) + '"]')
        t.click(element_identifier='//*[@href="' + str(page_curr) + '"]')
    # Shut down the TagUI session.
    t.close()
    # Write the combined result as "<year>.csv".
    hist_data = pd.DataFrame(value_dict)
    hist_data.to_csv(input + ".csv", encoding='UTF-8', index=False)
    return max_page
def close(self):
    """Close the underlying TagUI browser session."""
    t.close()
def getdailyincrement(str_to_append):
    """Crawl products issued exactly on `str_to_append` (YYYY-MM-DD).

    Walks the bank.jrj.com.cn product listing filtered by that issue
    date, dumps each relevant page's table to "<n>daily_data.csv",
    collects serial number / rating / detail URL for matching rows,
    writes the combined result to "<date>.csv", and returns the number
    of per-page csv files written.
    """
    # Start the browser session.
    t.init()
    # Open the bank-product data listing page.
    t.url("http://bank.jrj.com.cn/bankpro/data.shtml?type=1")
    # Give the page time to load.
    t.wait(15)
    # Hover over the date field and expand the condensed-options panel.
    t.hover(element_identifier='//*[@id="fxr"]')
    t.click(element_identifier='//*[@id="zksq"]')
    # Move to the issue-date field, type the target date, then search.
    t.hover(element_identifier='//*[@id="fxr"]')
    t.click(element_identifier='//*[@id="fxr"]')
    t.type(element_identifier='//*[@id="fxr"]', text_to_type=str_to_append)
    # Click again so the date popup does not cover the search button below.
    t.click(element_identifier='//*[@id="fxr"]')
    t.hover(element_identifier='//*[@class="ipf01"]')
    t.click(element_identifier='//*[@class="ipf01"]')
    # Show 50 products per page.
    t.hover(element_identifier='//*[@data-pagesize="50"]')
    t.click(element_identifier='//*[@data-pagesize="50"]')
    # Pagination state.
    page_curr = 1    # current page index
    value_dict = {}  # scraped data, keyed by column name
    count = 1        # numeric suffix for the per-page csv files
    # Output column names.
    name_list = ['序号', '综合评级', 'url']
    for col_name in name_list:
        value_dict.setdefault(col_name, [])  # initialise empty columns
    # Loop while pages remain (or this is the only page).
    while (t.read(element_identifier=
                  '//div[@id = "pagefoot"]//a[@class = "cur pf-disabled"]')
           == str(page_curr)) or (page_curr == 1):
        # Row count on this page (t.count is 0-based, hence +1).
        count_values = int(
            t.count(element_identifier='//tbody[@id = "content"]//tr')
        ) + 1
        # If even the last row is newer than the target date, the whole
        # page can be skipped — just turn to the next one.
        if str(
                t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                       str(count_values - 1) +
                       ']//td[@class = "px"]')) > str_to_append:
            page_curr += 1
            t.hover(element_identifier='//*[@href="' + str(page_curr) + '"]')
            t.click(element_identifier='//*[@href="' + str(page_curr) + '"]')
            continue
        filename = str(count) + "daily_data.csv"
        count += 1
        t.wait(1)  # one extra second in case the page loaded slowly
        t.table(
            element_identifier='//div[@class = "table-s1 tab-s2 w100"]//table',
            filename_to_save=filename)
        count_values = int(
            t.count(element_identifier='//tbody[@id = "content"]//tr')
        ) + 1
        for i in range(1, count_values):
            # Keep only rows issued exactly on the target date; if the
            # last row is already newer, the rest can be skipped.
            if str(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(count_values - 1) +
                           ']//td[@class = "px"]')) > str_to_append:
                break
            else:
                if str(
                        t.read(
                            element_identifier='//tbody[@id = "content"]//tr['
                            + str(i) +
                            ']//td[@class = "px"]')) == str_to_append:
                    # Product serial number.
                    value_dict[name_list[0]].append(
                        t.read(
                            element_identifier='//tbody[@id = "content"]//tr['
                            + str(i) + ']/td[2]'))
                    # Composite rating.
                    value_dict[name_list[1]].append(
                        t.read(
                            element_identifier='//tbody[@id = "content"]//tr['
                            + str(i) + ']//td[12]//i/@title'))
                    # Detail-page URL.
                    value_dict[name_list[2]].append(
                        t.read(
                            element_identifier='//tbody[@id = "content"]//tr['
                            + str(i) + ']//a/@href'))
                else:
                    # Not part of today's increment: ignore the row.
                    pass
        # Turn the page by simulating hover + click on the pager link.
        page_curr += 1
        t.hover(element_identifier='//*[@href="' + str(page_curr) + '"]')
        t.click(element_identifier='//*[@href="' + str(page_curr) + '"]')
    # Shut down the TagUI session.
    t.close()
    # Write the combined result as "<date>.csv".
    today_data = pd.DataFrame(value_dict)
    today_data.to_csv(str_to_append + ".csv", index=False, encoding='UTF-8')
    return count - 1
def main_operation(url, mode='txt'):
    """Scrape every regulation item listed on the current CBIRC page.

    For each item, saves the article text ('txt' mode) or downloads and
    renames the attached file ('doc'/'pdf' modes). Progress is written
    to baojianhui_log.txt as "page,list_index,item_index" after every
    item, so an interrupted run can resume where it stopped.

    url  -- listing-page URL prefix; curr_page is appended when
            returning to the listing after each item.
    mode -- 'txt' (default), 'doc' or 'pdf'; anything else raises.
    """
    # Current page number, read from the "current/total" pager widget.
    curr_page = int(
        t.read(element_identifier='//div[@class = "ng-binding"][last()]').
        split('/')[0])
    # Number of regulation lists (right-hand menus) on the page.
    list_count = t.count(
        element_identifier='//div[@class = "list caidan-right-list"]'
    )
    # Resume indices from the checkpoint file when present.
    if os.path.exists('baojianhui_log.txt'):
        with open('baojianhui_log.txt', 'r', encoding='utf-8') as f:
            params = f.read().split(',')
            curr_page = params[0]
            start_i = params[1]
            start_j = params[2]
    else:
        # First run: start from the first list and item.
        start_i = 1
        start_j = 1
    # Regular crawl: outer loop over lists, inner loop over their items.
    for i in range(1, list_count + 1):
        t.wait(3)
        if i < int(start_i):
            continue  # already handled by a previous run
        item_count = t.count(
            element_identifier='//div[@class = "list caidan-right-list"][' +
            str(i) + ']//div[@class = "panel-row ng-scope"]')  # items in list i
        print('当前是list {}, 里面的元素有 {} 个'.format(str(i), str(item_count)))
        t.wait(3)
        for j in range(1, item_count + 1):
            if j < int(start_j):
                continue  # already handled by a previous run
            item_title = t.read(
                element_identifier='//div[@class = "list caidan-right-list"][' +
                str(i) + ']//div[@class = "panel-row ng-scope"][' + str(j) +
                ']//a')
            time_suffix = t.read(
                element_identifier='//div[@class = "list caidan-right-list"][' +
                str(i) + ']//div[@class = "panel-row ng-scope"][' + str(j) +
                ']//span[@class = "date ng-binding"]')
            file_name = item_title + '_' + time_suffix + '.txt'
            if '/' in file_name:
                # '/' is illegal in file names; replace with a space.
                file_name = file_name.replace('/', ' ')
            if mode == 'txt':
                # Open the article page and save its text.
                link = t.read(element_identifier=
                              '//div[@class = "list caidan-right-list"][' +
                              str(i) +
                              ']//div[@class = "panel-row ng-scope"][' +
                              str(j) + ']//a/@ng-href')
                prefix = 'http://www.cbirc.gov.cn/cn/view/pages/'
                final_link = prefix + link
                t.url(final_link)
                t.wait(1)
                while not os.path.exists(file_name):
                    # The article body lives under one of several page
                    # layouts; probe each and save whichever is non-empty.
                    type_1 = t.read(
                        element_identifier='//div[@class = "Section0"]'
                    ) + t.read(element_identifier='//div[@class = "Section1"]')
                    type_2 = t.read(
                        element_identifier='//div[@class = "WordSection1"]')
                    type_3 = t.read(
                        element_identifier=
                        '//div[@class = "wenzhang-content ng-binding"]')
                    if type_1 != '':
                        content = type_1
                        with open(file_name, 'w', encoding='utf-8') as f:
                            f.write(content)
                        break
                    elif type_2 != '':
                        content = type_2
                        with open(file_name, 'w', encoding='utf-8') as f:
                            f.write(content)
                        break
                    elif type_3 != '':
                        content = type_3
                        with open(file_name, 'w', encoding='utf-8') as f:
                            f.write(content)
                        break
                    else:
                        # No recognised layout: write a placeholder so the
                        # file exists and the while loop can end.
                        content = ' '
                        with open(file_name, 'w', encoding='utf-8') as f:
                            f.write(content)
                        break
            elif mode == 'doc':
                # Trigger the .doc download, wait for it, then rename it.
                t.click(element_identifier=
                        '//div[@class = "list caidan-right-list"][' + str(i) +
                        ']//div[@class = "panel-row ng-scope"][' + str(j) +
                        ']//a[@ng-click = "fileDownload(x.docFileUrl)"]')
                # The document id is embedded in the item's ng-href.
                doc_id = t.read(element_identifier=
                                '//div[@class = "list caidan-right-list"][' +
                                str(i) +
                                ']//div[@class = "panel-row ng-scope"][' +
                                str(j) + ']//a/@ng-href').split('=')[1][:-7]
                doc_name = doc_id + '.doc'
                # Poll with a growing delay until the file appears, giving
                # up once the delay exceeds MAX_WAIT.
                curr_clock = 5
                while not os.path.exists(doc_name):
                    t.wait(curr_clock)
                    curr_clock += 5
                    if curr_clock > MAX_WAIT:
                        break
                t.wait(2)
                os.rename(doc_name, item_title + '_' + time_suffix + '.doc')
            elif mode == 'pdf':
                # Same as 'doc', but for the pdf attachment.
                t.click(element_identifier=
                        '//div[@class = "list caidan-right-list"][' + str(i) +
                        ']//div[@class = "panel-row ng-scope"][' + str(j) +
                        ']//a[@ng-click = "fileDownload(x.pdfFileUrl)"]')
                pdf_id = t.read(element_identifier=
                                '//div[@class = "list caidan-right-list"][' +
                                str(i) +
                                ']//div[@class = "panel-row ng-scope"][' +
                                str(j) + ']//a/@ng-href').split('=')[1][:-7]
                pdf_name = pdf_id + '.pdf'
                curr_clock = 5
                while not os.path.exists(pdf_name):
                    t.wait(curr_clock)
                    curr_clock += 5
                    if curr_clock > MAX_WAIT:
                        break
                t.wait(2)
                os.rename(pdf_name, item_title + '_' + time_suffix + '.pdf')
            else:
                print('unknown format..')
                t.close()
                raise Exception("unknown input mode")
            # Back to the listing page, then checkpoint this item.
            t.url(url + str(curr_page))
            t.wait(2)
            with open('baojianhui_log.txt', 'w', encoding='utf-8') as f:
                f.write(str(curr_page) + ',' + str(i) + ',' + str(j))
        # List i finished: reset the item index in the checkpoint.
        with open('baojianhui_log.txt', 'w', encoding='utf-8') as f:
            f.write(str(curr_page) + ',' + str(i) + ',' + str(1))
def compliance_data(url_prefix):
    """Crawl today's compliance documents from the PBC site under url_prefix.

    Walks every listing page, saving plain articles as .txt files and
    downloading attached PDFs (renamed to their link text). Stops as
    soon as an item older than today is encountered.
    """
    t.init()
    # BUG FIX: this assignment was commented out while t.url(init_url)
    # still used it, raising NameError. Restored to match the sibling
    # history_data() crawler.
    init_url = url_prefix + '1.html'
    t.url(init_url)  # first listing page
    # Page count is shown as "current/total"; +1 for Python's open range.
    max_page = int(
        t.read(element_identifier='//td[@class = "Normal"]').split('/')
        [1]) + 1
    for page_num in range(1, max_page):
        t.url(url_prefix + str(page_num) + '.html')
        print("现在所在页面 {}".format(page_num))
        t.wait(5)
        # Item count on this page (t.count is 0-based, hence +1).
        count_values = t.count(
            element_identifier='//td[@colspan = "2"]//table') + 1
        today = datetime.datetime.today()
        today = str(today.date())
        # Dates are YYYY-MM-DD strings, so '<' is a chronological compare.
        if t.read(element_identifier=
                  '//td[@colspan = "2"]//table[1]//span[@class = "hui12"]'
                  ) < today:
            print("今日无增量")
            break
        print("页面有{}个文件".format(count_values - 1))
        t.wait(5)
        for i in range(1, count_values):
            t.url(url_prefix + str(page_num) + '.html')
            if t.read(element_identifier='//td[@colspan = "2"]//table[' +
                      str(i) + ']//span[@class = "hui12"]') < today:
                t.close()
                # NOTE(review): exit(1) terminates the whole process here;
                # confirm a return would not be more appropriate.
                exit(1)
            file_name = t.read(
                element_identifier='//td[@colspan = "2"]//table[' + str(i) +
                ']') + str('.txt')
            prefix = 'http://www.pbc.gov.cn'
            content_url = prefix + t.read(
                element_identifier='//td[@colspan = "2"]//table[' + str(i) +
                ']//a/@href')
            if 'cnhttp' in content_url:
                # The href was already absolute; strip the doubled prefix.
                content_url = content_url[21:]
                t.url(content_url)
                text = t.read(element_identifier='//div[@id = "zoom"]')
                with open(file_name, 'w') as f:
                    f.write(text)
                print("文件{} 是文档。".format(i))
                continue
            t.url(content_url)  # open the detail page
            # Count attachment links to decide between text and PDFs.
            t.wait(2)
            pdf_count = t.count(
                element_identifier='//div[@id = "zoom"]//a/@href')
            if pdf_count == 0:
                # Ordinary article: save its text from whichever
                # container is present on this page layout.
                print("文件{} 是文档。".format(i))
                if t.read(element_identifier='//div[@id = "zoom"]') != '':
                    text = t.read(element_identifier='//div[@id = "zoom"]')
                    with open(file_name, 'w') as f:
                        f.write(text)
                elif t.read(element_identifier='//td[@class = "p1"]') != '':
                    text = t.read(element_identifier='//td[@class = "p1"]')
                    with open(file_name, 'w') as f:
                        f.write(text)
                else:
                    print("write files fails...")
            elif ('pdf' in t.read(
                    element_identifier='//div[@id = "zoom"]//a/@href')):
                print("文件{} 含有 {} 个pdf。".format(i, pdf_count))
                pdf_count += 1  # make range() cover the last pdf
                for j in range(1, pdf_count):
                    # Take the pdf's file name from its href.
                    if t.read(element_identifier='//div[@id = "zoom"]//p[' +
                              str(j) + ']//a/@href') != '':
                        print("当前是第{}个pdf。。".format(j))
                        pdf_name = t.read(
                            element_identifier='//div[@id = "zoom"]//p[' +
                            str(j) + ']//a/@href').split('/')[-1]
                        # Link text is the human-readable target name.
                        pdf_name_to_change = t.read(
                            element_identifier='//div[@id = "zoom"]//p[' +
                            str(j) + ']//a')
                        # Navigate to the pdf url to trigger the download.
                        prefix = 'http://www.pbc.gov.cn'
                        t.url(prefix + t.read(
                            element_identifier='//div[@id = "zoom"]//p[' +
                            str(j) + ']//a/@href'))
                        # Poll up to ~30s for the file to land on disk.
                        wait_seconds = 1
                        total_seconds = 0
                        while os.path.exists(pdf_name) == False:
                            t.wait(wait_seconds)
                            total_seconds += wait_seconds
                            if total_seconds > 30:
                                print('download fails')
                                break
                        os.rename(pdf_name, pdf_name_to_change)  # rename
                        t.url(content_url)  # back to the detail page
                    else:
                        # Non-conforming layout: fall back to text dump.
                        print("不合规,当文档处理!不读了!!!")
                        if t.read(element_identifier='//div[@id = "zoom"]'
                                  ) != '':
                            text = t.read(
                                element_identifier='//div[@id = "zoom"]')
                            with open(file_name, 'w') as f:
                                f.write(text)
                        elif t.read(element_identifier='//td[@class = "p1"]'
                                    ) != '':
                            text = t.read(
                                element_identifier='//td[@class = "p1"]')
                            with open(file_name, 'w') as f:
                                f.write(text)
                        else:
                            print("write files fails...")
                        t.url(url_prefix + str(page_num) + '.html')
                        break
            else:
                # Links exist but none are pdf: treat as a plain article.
                print("文件{} 含有 {} 个pdf。".format(i, pdf_count))
                print("含有其他format的href,当文档处理!不读了!!!")
                if t.read(element_identifier='//div[@id = "zoom"]') != '':
                    text = t.read(element_identifier='//div[@id = "zoom"]')
                    with open(file_name, 'w') as f:
                        f.write(text)
                elif t.read(element_identifier='//td[@class = "p1"]') != '':
                    text = t.read(element_identifier='//td[@class = "p1"]')
                    with open(file_name, 'w') as f:
                        f.write(text)
                else:
                    print("write files fails...")
                t.url(url_prefix + str(page_num) + '.html')
                break
    t.close()
def make_reservation(reservation_date, reservation_time, party_size,
                     restaurant_name, first_name, last_name, email_address,
                     phone_number):
    """Book a table on chope.co and confirm it with the diner's details.

    reservation_date -- 'DD/MM/YYYY' string.
    reservation_time -- 'HHMM' 24-hour string.
    Returns a success/failure message string; never raises.
    """
    try:
        # Convert the user inputs into the formats Chope's widgets expect.
        reservation_day = reservation_date.split('/')[0]
        reservation_month = reservation_date.split('/')[1]
        reservation_month = int(reservation_month) - 1  # datepicker months are 0-based
        reservation_year = reservation_date.split('/')[2]
        reservation_time_int = int(reservation_time)
        start_time_hr = reservation_time[:2]
        if reservation_time_int > 1159:
            # Afternoon/evening: convert 24h to 12h and tag 'pm'.
            if start_time_hr != "12":
                start_time_hr = int(start_time_hr) - 12
            start_time_option = str(start_time_hr) + ":" + reservation_time[2:4] + " pm"
        else:
            start_time_option = str(start_time_hr) + ":" + reservation_time[2:4] + " am"
        # Booking parameters
        chope_url = 'https://www.chope.co/singapore-restaurants/category/restaurant/'
        t.init()
        t.url(chope_url)
        t.wait(10)
        # Date field: page through the datepicker until the day is clickable.
        t.click(f"(//span[contains(@class,'input-group-addon icon-calendar')])[1]")
        t.wait(7)
        boolean_flag = 1
        while boolean_flag:
            if t.present(f"//td[@data-handler='selectDay'and @data-year='{reservation_year}' and @data-month='{reservation_month}']/a[text()='{reservation_day}']"):
                t.click(f"//td[@data-handler='selectDay'and @data-year='{reservation_year}' and @data-month='{reservation_month}']/a[text()='{reservation_day}']")
                boolean_flag = 0
            else:
                t.click('//a[@title="Next"]')
                t.click(f"//td[@data-handler='selectDay'and @data-month='{reservation_month}']/a[text()='{reservation_day}']")
        # Time field
        t.select(f"//select[contains(@id,'time-field')]", start_time_option)
        # Number of diners field
        t.click(f"(//span[contains(@class,'input-group-addon icon-person')])[1]")
        t.select(f"//select[contains(@id,'adults')]", party_size)
        # Restaurant field
        t.type(f"//select[contains(@id,'sb-sel-restaurant')]", restaurant_name)
        t.click('//button[@id="btn-search"]')
        t.wait(5)
        # Secondary page to confirm the timing at the chosen restaurant.
        t.click(f"//a[contains(@rname,'{restaurant_name}') and text()='{start_time_option}']")
        t.wait(5)
        t.click(f"//input[@id='btn_sub' and @value='Book Now']")
        t.wait(5)
        # Booking confirmation popup: fill in the diner's details.
        t.popup('https://book.chope.co/')
        t.type('//input[@id="forename"]', first_name)
        t.type('//input[@id="surname"]', last_name)
        t.type('//input[@id="email"]', email_address)
        t.type('//input[@id="telephone"]', phone_number)
        # Agree terms & conditions when the checkbox is shown.
        if t.present(f"//input[@name='agree_term_conditions']"):
            t.click(f"//input[@name='agree_term_conditions']")
        # Confirm booking
        t.click(f"//button[@id='check_book_now']")
        t.wait(5)
        t.close()
        print('Success')
        # NOTE(review): `sample_restaurant_address` is not defined in this
        # function — a NameError here is caught below and misreported as a
        # failed reservation even though the booking succeeded. Confirm
        # where the address should come from.
        schedule_reservation(reservation_date, reservation_time, party_size,
                             restaurant_name, first_name,
                             sample_restaurant_address)
        return 'Reservation Successful'
    except Exception as e:
        # Narrowed from a bare `except:`; surface the cause for debugging.
        print('Error', e)
        return 'Reservation Unsuccessful. Unfortunately, the restaurant was not able to accommodate your reservation.'
def history_data(url_prefix):
    """Crawl today's documents from every PBC listing page under url_prefix.

    Saves .html articles as .txt text dumps and waits for direct .doc
    links to finish downloading. Stops at the first item dated before
    today.
    """
    t.init()
    init_url = url_prefix + '1.html'
    t.url(init_url)  # first listing page
    # Page count is shown as "current/total"; +1 for Python's open range.
    max_page = int(
        t.read(element_identifier='//td[@class = "Normal"]').split('/')[1]) + 1
    for page_num in range(1, max_page):
        # Main listing page.
        t.url(url_prefix + str(page_num) + '.html')
        print("现在所在页面 {}".format(page_num))
        t.wait(5)
        # Item count on this page (t.count is 0-based, hence +1).
        count_values = t.count(
            element_identifier='//td[@colspan = "2"]//table') + 1
        today = datetime.datetime.today()
        today = str(today.date())
        # Dates are YYYY-MM-DD strings, so '<' is a chronological compare.
        if t.read(element_identifier=
                  '//td[@colspan = "2"]//table[1]//span[@class = "hui12"]'
                  ) < today:
            print("今日无增量")
            break
        print("页面有{}个文件".format(count_values - 1))
        t.wait(5)
        for i in range(1, count_values):
            t.url(url_prefix + str(page_num) + '.html')
            if t.read(element_identifier='//td[@colspan = "2"]//table[' +
                      str(i) + ']//span[@class = "hui12"]') < today:
                break
            if '.html' in t.read(
                    element_identifier='//td[@colspan = "2"]//table[' +
                    str(i) + ']//a/@href'):
                # Article page: read its text into a .txt file.
                print("文件{} 是文档。".format(i))
                file_name = t.read(
                    element_identifier='//td[@colspan = "2"]//table[' +
                    str(i) + ']') + str('.txt')
                prefix = 'http://www.pbc.gov.cn'
                content_url = prefix + t.read(
                    element_identifier='//td[@colspan = "2"]//table[' +
                    str(i) + ']//td//a/@href')
                # Follow the item url.
                if content_url == 'http://www.pbc.gov.cnhttp://www.pbc.gov.cn/goutongjiaoliu/113456/113469/3487563/index.html':
                    # This one href is already absolute; drop the doubled
                    # prefix (cause unknown, observed in the wild).
                    content_url = 'http://www.pbc.gov.cn/goutongjiaoliu/113456/113469/3487563/index.html'
                t.url(content_url)
                # Save the text from whichever container this layout uses.
                if t.read(element_identifier='//div[@id = "zoom"]') != '':
                    text = t.read(element_identifier='//div[@id = "zoom"]')
                    with open(file_name, 'w') as f:
                        f.write(text)
                elif t.read(element_identifier='//td[@class = "p1"]') != '':
                    text = t.read(element_identifier='//td[@class = "p1"]')
                    with open(file_name, 'w') as f:
                        f.write(text)
                else:
                    print("write files fails...")
            elif '.doc' in t.read(
                    element_identifier='//td[@colspan = "2"]//table[' +
                    str(i) + ']//a/@href'):
                # Direct .doc link: navigating to it triggers the download.
                print("文件{} 是下载doc。".format(i))
                file_name = t.read(
                    element_identifier='//td[@colspan = "2"]//table[' +
                    str(i) + ']//a/@href').split('/')[-1]
                prefix = 'http://www.pbc.gov.cn'
                content_url = prefix + t.read(
                    element_identifier='//td[@colspan = "2"]//table[' +
                    str(i) + ']//a/@href')
                t.url(content_url)
                # Poll up to ~30s for the file to land on disk.
                wait_seconds = 1
                total_seconds = 0
                while os.path.exists(file_name) == False:
                    t.wait(wait_seconds)
                    total_seconds += wait_seconds
                    if total_seconds > 30:
                        print('download fails')
                        break
            else:
                print("unknown format..")
        print("爬好一次,返回页面 {}".format(page_num))
    # Shut down the TagUI session.
    t.close()
def main_operation(url, mode='txt'):
    """Scrape today's regulation items from the cbirc.gov.cn list page.

    url:  list-page URL prefix; the current page number is appended when
          navigating back after each item.
    mode: 'txt' saves the article text to '<title>_<date>.txt';
          'doc'/'pdf' download the attachment and rename it;
          anything else closes the browser and raises Exception.

    Progress is checkpointed to 'baojianhui_log.txt' as "page,i,j" so an
    interrupted run can resume.  Returns (True, message) as soon as an item
    not dated today is seen; returns None implicitly when all lists finish.
    """
    # current page number, read from the "cur/total" pager widget
    curr_page = int(
        t.read(element_identifier='//div[@class = "ng-binding"][last()]').
        split('/')[0])
    # number of category lists on the page
    list_count = t.count(
        element_identifier='//div[@class = "list caidan-right-list"]')
    # resume from the checkpoint file when present
    if os.path.exists('baojianhui_log.txt'):
        with open('baojianhui_log.txt', 'r') as f:
            params = f.read().split(',')
            curr_page = params[0]  # note: stays a str here
            start_i = params[1]
            start_j = params[2]
    else:
        # first run: start from the beginning
        start_i = 1
        start_j = 1
    for i in range(1, list_count + 1):
        t.wait(5)
        if i < int(start_i):
            continue  # already handled in a previous run
        # number of regulation rows inside list i
        item_count = t.count(
            element_identifier='//div[@class = "list caidan-right-list"][' +
            str(i) + ']//div[@class = "panel-row ng-scope"]')
        print('当前是list {}, 里面的元素有 {} 个'.format(str(i), str(item_count)))
        t.wait(5)
        for j in range(1, item_count + 1):
            if j < int(start_j):
                continue
            item_title = t.read(
                element_identifier='//div[@class = "list caidan-right-list"]['
                + str(i) + ']//div[@class = "panel-row ng-scope"][' + str(j) +
                ']//a')
            time_suffix = t.read(
                element_identifier='//div[@class = "list caidan-right-list"]['
                + str(i) + ']//div[@class = "panel-row ng-scope"][' + str(j) +
                ']//span[@class = "date ng-binding"]')
            # rows are newest-first: a non-today date means today is done
            if str(time_suffix) != str((datetime.datetime.today()).date()):
                print('今日增量已取完')
                return True, '今日无增量'
            file_name = item_title + '_' + time_suffix + '.txt'
            if '/' in file_name:
                # '/' is illegal in a file name
                file_name = file_name.replace('/', ' ')
            if mode == 'txt':
                # open the item's detail page
                link = t.read(element_identifier=
                              '//div[@class = "list caidan-right-list"][' +
                              str(i) +
                              ']//div[@class = "panel-row ng-scope"][' +
                              str(j) + ']//a/@ng-href')
                prefix = 'http://www.cbirc.gov.cn/cn/view/pages/'
                final_link = prefix + link
                t.url(final_link)
                t.wait(1)
                # retry until the file exists; every branch below breaks out
                while not os.path.exists(file_name):
                    # type_1 = t.read(element_identifier='//div[@class = "Section0"]') + t.read(element_identifier='//div[@class = "Section1"]')
                    # type_2 = t.read(element_identifier='//div[@class = "WordSection1"]')
                    # type_3 = t.read(element_identifier='//div[@class = "wenzhang-content ng-binding"]')
                    if t.read(
                            element_identifier='//div[@class = "Section0"]'
                    ) + t.read(element_identifier='//div[@class = "Section1"]'
                               ) != '':
                        # layout 1: Section0 (+ optional Section1) paragraphs
                        p_counts_section0 = t.count(
                            element_identifier='//div[@class = "Section0"]//p')
                        content_list = []
                        with open(file_name, 'w', encoding='utf-8') as f:
                            f.write(file_name.split("_")[0] + "\n")
                        for p in range(1, p_counts_section0 + 1):
                            content_list.append(
                                t.read(element_identifier=
                                       '//div[@class = "Section0"]//p[' +
                                       str(p) + ']'))
                        with open(file_name, 'a', encoding='utf-8') as f:
                            f.writelines(
                                [content + "\n" for content in content_list])
                        # Section1 paragraphs appended after Section0
                        p_counts_section1 = t.count(
                            element_identifier='//div[@class = "Section1"]//p')
                        content_list = []
                        for p in range(1, p_counts_section1 + 1):
                            content_list.append(
                                t.read(element_identifier=
                                       '//div[@class = "Section1"]//p[' +
                                       str(p) + ']'))
                        with open(file_name, 'a', encoding='utf-8') as f:
                            f.writelines(
                                [content + "\n" for content in content_list])
                        break
                    elif t.read(
                            element_identifier='//div[@class = "WordSection1"]'
                    ) != '':
                        # layout 2: WordSection1
                        p_counts = t.count(element_identifier=
                                           '//div[@class = "WordSection1"]//p')
                        if p_counts <= 1:
                            # single <p>: split on full-width spaces instead
                            content_list = t.read(
                                element_identifier=
                                '//div[@class = "WordSection1"]//p')
                            with open(file_name, 'w', encoding='utf-8') as f:
                                f.write(file_name.split("_")[0] + "\n")
                                f.writelines([
                                    content + "\n"
                                    for content in content_list.split(" ")
                                ])
                        else:
                            content_list = []
                            for p in range(1, p_counts + 1):
                                content_list.append(
                                    t.read(
                                        element_identifier=
                                        '//div[@class = "WordSection1"]//p[' +
                                        str(p) + ']'))
                            with open(file_name, 'w', encoding='utf-8') as f:
                                f.write(file_name.split("_")[0] + "\n")
                                f.writelines([
                                    content + "\n" for content in content_list
                                ])
                        break
                    elif t.read(element_identifier=
                                '//div[@class = "wenzhang-content ng-binding"]'
                                ) != '':
                        # layout 3: wenzhang-content; same single/multi-<p>
                        # handling as WordSection1
                        p_counts = t.count(
                            element_identifier=
                            '//div[@class = "wenzhang-content ng-binding"]//p')
                        if p_counts <= 1:
                            content_list = t.read(
                                element_identifier=
                                '//div[@class = "wenzhang-content ng-binding"]//p'
                            )
                            with open(file_name, 'w', encoding='utf-8') as f:
                                f.write(file_name.split("_")[0] + "\n")
                                f.writelines([
                                    content + "\n"
                                    for content in content_list.split(" ")
                                ])
                        else:
                            content_list = []
                            for p in range(1, p_counts + 1):
                                content_list.append(
                                    t.read(
                                        element_identifier=
                                        '//div[@class = "wenzhang-content ng-binding"]//p['
                                        + str(p) + ']'))
                            with open(file_name, 'w', encoding='utf-8') as f:
                                f.write(file_name.split("_")[0] + "\n")
                                f.writelines([
                                    content + "\n" for content in content_list
                                ])
                        break
                    else:
                        # unknown layout: write a placeholder so the
                        # while-loop terminates
                        content = ' '
                        with open(file_name, 'w') as f:
                            f.write(content)
                        break
            elif mode == 'doc':
                t.click(element_identifier=
                        '//div[@class = "list caidan-right-list"][' + str(i) +
                        ']//div[@class = "panel-row ng-scope"][' + str(j) +
                        ']//a[@ng-click = "fileDownload(x.docFileUrl)"]')
                # the ng-href query string embeds the document id
                doc_id = t.read(element_identifier=
                                '//div[@class = "list caidan-right-list"][' +
                                str(i) +
                                ']//div[@class = "panel-row ng-scope"][' +
                                str(j) + ']//a/@ng-href').split('=')[1][:-7]
                doc_name = doc_id + '.doc'
                curr_clock = 5
                # poll with growing waits until the download lands or the
                # MAX_WAIT module constant is exceeded
                while not os.path.exists(doc_name):
                    t.wait(curr_clock)
                    curr_clock += 5
                    if curr_clock > MAX_WAIT:
                        break
                t.wait(5)
                os.rename(doc_name, item_title + '_' + time_suffix + '.doc')
            elif mode == 'pdf':
                t.click(element_identifier=
                        '//div[@class = "list caidan-right-list"][' + str(i) +
                        ']//div[@class = "panel-row ng-scope"][' + str(j) +
                        ']//a[@ng-click = "fileDownload(x.pdfFileUrl)"]')
                pdf_id = t.read(element_identifier=
                                '//div[@class = "list caidan-right-list"][' +
                                str(i) +
                                ']//div[@class = "panel-row ng-scope"][' +
                                str(j) + ']//a/@ng-href').split('=')[1][:-7]
                pdf_name = pdf_id + '.pdf'
                curr_clock = 5
                while not os.path.exists(pdf_name):
                    t.wait(curr_clock)
                    curr_clock += 5
                    if curr_clock > MAX_WAIT:
                        break
                t.wait(5)
                os.rename(pdf_name, item_title + '_' + time_suffix + '.pdf')
            else:
                print('unknown format..')
                t.close()
                raise Exception("unknown input mode")
            # back to the list page
            t.url(url + str(curr_page))
            t.wait(5)
            # checkpoint: current page, list index, item index
            with open('baojianhui_log.txt', 'w') as f:
                f.write(str(curr_page) + ',' + str(i) + ',' + str(j))
        # list i finished: reset the item index in the checkpoint
        with open('baojianhui_log.txt', 'w') as f:
            f.write(str(curr_page) + ',' + str(i) + ',' + str(1))
def propertydata_update(project_name):
    """Check propertyguru.com.sg for new listings of *project_name*.

    Compares the first three search results against the listing ids already
    stored in 'Property Monitor.xlsx'; for each unseen listing it scrapes the
    detail page, appends a row to the workbook, downloads the brochure PDF
    and mails it to the subscriber.  Returns None.
    """
    df1 = pd.read_excel('Property Monitor.xlsx')
    t.close()
    t.init()
    project_url = f'https://www.propertyguru.com.sg/property-for-sale?market=residential&freetext={project_name}&newProject=all'
    print(project_url)
    t.url(project_url)
    wait_for_pageload('//div[@class="header-wrapper"]')
    num_result_ad = 3  # how many search results to inspect
    # load main page, get detail page url link
    urls = [''] * num_result_ad
    # renamed from `id`, which shadowed the builtin
    listing_ids = [''] * num_result_ad
    # positions 4 and 8 are advertisements; the filter is a no-op while
    # num_result_ad <= 3, kept in case the result count is raised again
    for n in [x for x in range(1, num_result_ad + 1) if x != 4 and x != 8]:
        wait_for_pageload(
            f'(//div[@class="listing-widget-new"]/div[{n}]/div[1]/div[2]/div[1]/div[1]/h3/a/@href)'
        )
        urls[n - 1] = read_if_present(
            f'(//div[@class="listing-widget-new"]/div[{n}]/div[1]/div[2]/div[1]/div[1]/h3/a/@href)'
        )
        print(f"{n}. url = " + urls[n - 1])
        listing_ids[n - 1] = read_if_present(
            f'(//*[@id="wrapper-inner"]/section[1]/div[2]/div[1]/div[2]/div[2]/section/div[2]/div[{n}]/@data-listing-id)'
        )
    print(f'searching: {listing_ids}')  # e.g. ['22036842', '21725956', '20648962']
    # ids already tracked in the workbook, as strings for comparison
    id_str = [str(n) for n in list(df1['id'])]
    print(id_str)
    new_url = list()
    for n in listing_ids:
        if n not in id_str:
            print(f'new property found: {n}')
            u = f"https://www.propertyguru.com.sg/listing/{n}/for-sale-{project_name}"
            new_url.append(u)
    if not new_url:
        # was `return`+print fused on one collapsed line; print first, then exit
        print(f'======== no new property found! ==========')
        return
    print(f'======== new property found==========')
    property_title = [''] * len(new_url)
    pdf = [''] * len(new_url)
    pdf_link = [''] * len(new_url)
    for (n, i) in zip(new_url, range(1, len(new_url) + 1)):
        t.url(n)
        wait_for_pageload('//h1[@class="h2"]')
        property_title[i - 1] = read_if_present('//h1[@class="h2"]')
        print(f"{i}. property_title = " + property_title[i - 1])
        pdf[i - 1] = read_if_present(
            '//*[@id="sticky-right-col"]/div[3]/a[2]/@href')
        pdf_link[i - 1] = 'https://www.propertyguru.com.sg' + pdf[i - 1]
        print(f"{i}. pdf_link = " + pdf_link[i - 1])
    # NOTE(review): urls/listing_ids always hold 3 entries while
    # property_title/pdf_link hold len(new_url) — pandas raises when the
    # lengths differ; confirm intended behavior with the caller
    property_info = {
        'property_title': property_title,
        'url': ['https://www.propertyguru.com.sg' + x for x in urls],
        'id': listing_ids,
        'pdf_link': pdf_link,
    }
    df2 = DataFrame(property_info,
                    columns=['property_title', 'id', 'url', 'pdf_link'])
    new_df = pd.concat([df1, df2])
    new_df.to_excel('Property Monitor.xlsx', encoding='utf8', index=None)
    print('======== Property Monitor.xlsx update ==========')
    pdf_filename = download_pdf(property_title, pdf_link, listing_ids)
    mail_subscription(input_email, input_name, pdf_filename)
# use snap() to save screenshot of page or UI element # page = web page, page.png = computer screen t.snap('page', 'results.png') t.snap('logo', 'logo.png') # another example of interacting with a web page # include http:// or https:// in URL parameter t.url('https://duckduckgo.com') t.type('search_form_input_homepage', 'The search engine that doesn\'t track you.') t.snap('page', 'duckduckgo.png') t.wait(4.4) # use close() to close TagUI process and web browser # if you forget to close, just close() next time t.close() # in above web automation example, web element identifier can be XPath selector, CSS selector or # attributes id, name, class, title, aria-label, text(), href, in decreasing order of priority # if you don't mind using ugly and less robust XPath, it can be copied from Chrome inspector # otherwise recommend googling on writing XPath manually, or simply make use of attributes # also supports visual element identifier using .png or .bmp image snapshot # representing the UI element (can be on desktop applications or web browser) # for eg t.click('start_menu.png'), t.type('username_box.png', 'Sonic') # image transparency (0% opacity) is supported, ie images with empty sections # t.read('image_preview_frame.png'), t.snap('application_window_frame.png') # visual element identifiers can also be x, y coordinates of elements on the screen # for eg t.click(600, 300), t.type(600, 300, 'Mario'), t.select(600, 300, 600, 400)
def url2png(url): t.init() t.url(url) # t.type('q', 'decentralization[enter]') t.snap('page', 'results-' + str(uuid.uuid1()) + '.png') t.close()
def getblanklist():
    """Scrape jrj.com.cn wealth-management products whose issue date is blank.

    Sorts the product table ascending by issue date so blank ('--') rows come
    first, collects their 序号/综合评级/url into 'blank_date.csv' (plus one raw
    table dump per visited page) and returns the number of pages visited.
    """
    # initialize the browser session
    t.init()
    t.url("http://bank.jrj.com.cn/bankpro/data.shtml?type=1")
    # search immediately with no filter conditions
    t.click(element_identifier='//*[@id="fxr"]')
    t.hover(element_identifier='//*[@class="ipf01"]')
    t.click(element_identifier='//*[@class="ipf01"]')
    # show 50 products per page
    t.hover(element_identifier='//*[@data-pagesize="50"]')
    t.click(element_identifier='//*[@data-pagesize="50"]')
    # sort ascending by issue date ("reversed" order) so rows with a blank
    # issue date surface at the top
    t.hover(element_identifier='//*[@data-sort = "sell_org_date"]')
    t.click(element_identifier='//*[@data-sort = "sell_org_date"]')
    page_curr = 1  # current page index
    max_page = 1  # highest page number reached
    # collected rows, keyed by column name
    value_dict = {}
    name_list = ['序号', '综合评级', 'url']
    for col_name in name_list:
        value_dict.setdefault(col_name, [])  # initialize empty columns
    # stop_flag becomes True once a row with a real issue date is seen —
    # every blank row has been collected, so paging can stop
    stop_flag = False
    # loop while the pager marks page_curr as current, or on the first page
    while (t.read(element_identifier=
                  '//div[@id = "pagefoot"]//a[@class = "cur pf-disabled"]')
           == str(page_curr)) or (page_curr == 1):
        if stop_flag == True:  # no blanks left: stop paging
            break
        max_page = page_curr
        # row count on this page (+1 because XPath indices start at 1)
        count_values = int(
            t.count(element_identifier='//tbody[@id = "content"]//tr')) + 1
        # dump the raw table of this page (titles and hrefs only)
        filename = str(page_curr) + "blank_date.csv"
        t.wait(1)  # guard against a slow page load
        t.table(
            element_identifier='//div[@class = "table-s1 tab-s2 w100"]//table',
            filename_to_save=filename)
        for i in range(1, count_values):
            # a '--' issue date marks a blank row we want to keep
            if str(
                    t.read(element_identifier='//tbody[@id = "content"]//tr['
                           + str(i) + ']//td[@class = "px"]')) == '--':
                # print("number {} is running".format(str(i)))
                # 序号 (serial number)
                value_dict[name_list[0]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr['
                           + str(i) + ']/td[2]'))
                # 综合评级 (overall rating, from the icon tooltip)
                value_dict[name_list[1]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr['
                           + str(i) + ']//td[12]//i/@title'))
                # url
                value_dict[name_list[2]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr['
                           + str(i) + ']//a/@href'))
            else:
                # a real date: all blanks are done, end the while loop too
                stop_flag = True
                # print("thread stops here..")
                break
        # turn the page
        page_curr += 1
        # print("turn the page..")
        # simulate mouse move, then click the pager link
        t.hover(element_identifier='//*[@href="' + str(page_curr) + '"]')
        t.click(element_identifier='//*[@href="' + str(page_curr) + '"]')
    # close the tagui flow
    t.close()
    # output file: "blank_date.csv"
    hist_data = pd.DataFrame(value_dict)
    hist_data.to_csv("blank_date.csv", index=False, encoding='UTF-8')
    return max_page
def history_data_daily(url_prefix):
    """Daily incremental crawl of pbc.gov.cn announcements under *url_prefix*.

    Saves today's articles as '<title>_<date>.txt' and downloads today's
    attachments, renaming each to '<name>_<date>.<ext>'.  Calls exit(1) as
    soon as an item older than today is met (hard stop for the process).
    """
    t.init()
    init_url = url_prefix + '1.html'
    t.url(init_url)  # initial list page
    # pager cell reads "current/total"; +1 so range() covers the last page
    max_page = int(t.read(element_identifier='//td[@class = "Normal"]').split('/')[1]) + 1
    for page_num in range(1, max_page):
        t.url(url_prefix + str(page_num) + '.html')
        print("现在所在页面 {}".format(page_num))
        t.wait(5)
        # one <table> per list row; +1 because XPath indices start at 1
        count_values = t.count(element_identifier='//td[@colspan = "2"]//table') + 1
        today = datetime.datetime.today()
        today = str(today.date())  # ISO yyyy-mm-dd compares as a string
        # today = '2018-04-24'
        # rows are date-ordered; if even row 1 is old, nothing new anywhere
        if t.read(element_identifier='//td[@colspan = "2"]//table[1]//span[@class = "hui12"]') < today:
            print("今日无增量")
            break
        print("页面有{}个文件".format(count_values - 1))
        t.wait(5)
        for i in range(1, count_values):
            t.url(url_prefix + str(page_num) + '.html')  # back to the list page
            if t.read(element_identifier='//td[@colspan = "2"]//table['+str(i)+']//span[@class = "hui12"]') < today:
                # older than today: stop the whole process
                t.close()
                exit(1)
            # row text is "<title><10-char date>" -> "<title>_<date>.txt"
            file_name = t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']')
            file_name = file_name[:-10] + str("_") + file_name[-10:] + str('.txt')
            time = file_name[-14:-4]  # the 10-char date (shadows the time module — pre-existing)
            prefix = 'http://www.pbc.gov.cn'
            content_url = prefix + t.read(
                element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']//a/@href')
            if '.html' not in t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']//a/@href'):
                # row links straight to a downloadable file (not an article)
                if 'cnhttp' in content_url:
                    # href was already absolute; drop the doubled prefix
                    content_url = content_url[21:]
                    print("文件{} 是直接下载文件。".format(i))
                    file_name = t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']//a/@href')
                    suffix = file_name.split('.')[-1]
                    file_name = file_name.split('/')[-1]
                    t.url(content_url)  # navigation triggers the download
                    wait_seconds = 1
                    total_seconds = 0
                    # poll until the download lands; give up after 30 seconds
                    while os.path.exists(file_name) == False:
                        t.wait(wait_seconds)
                        total_seconds += wait_seconds
                        if total_seconds > 30:
                            print('download fails')
                            break
                    # append the date before the extension
                    os.rename(file_name, file_name[:-(len(suffix)+1)] + "_" + time +'.'+file_name[-(len(suffix)+1):])
                else:
                    # same download flow for a relative href
                    print("文件{} 是直接下载文件。".format(i))
                    file_name = t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']//a/@href')
                    suffix = file_name.split('.')[-1]
                    file_name = file_name.split('/')[-1]
                    t.url(content_url)
                    wait_seconds = 1
                    total_seconds = 0
                    while os.path.exists(file_name) == False:
                        t.wait(wait_seconds)
                        total_seconds += wait_seconds
                        if total_seconds > 30:
                            print('download fails')
                            break
                    os.rename(file_name, file_name[:-(len(suffix)+1)] + "_" + time +'.'+file_name[-(len(suffix)+1):])
            else:
                # article page
                if 'cnhttp' in content_url:
                    content_url = content_url[21:]  # drop the doubled prefix
                    t.url(content_url)
                else:
                    t.url(content_url)
                # count attachment links inside the article body
                t.wait(2)
                pdf_count = t.count(element_identifier='//div[@id = "zoom"]//a/@href')
                if pdf_count == 0:
                    # plain text article, no attachments
                    print("文件{} 是文档。".format(i))
                    # article body lives in div#zoom or td.p1 by template
                    if t.read(element_identifier='//div[@id = "zoom"]') != '':
                        text = t.read(element_identifier='//div[@id = "zoom"]')
                        # NOTE(review): platform-default encoding; may fail on
                        # Chinese text — confirm runtime locale
                        with open(file_name, 'w') as f:
                            f.write(text)
                    elif t.read(element_identifier='//td[@class = "p1"]') != '':
                        text = t.read(element_identifier='//td[@class = "p1"]')
                        with open(file_name, 'w') as f:
                            f.write(text)
                    else:
                        print("write files fails...")
                else:
                    # article with attachments: save its text first
                    if t.read(element_identifier='//div[@id = "zoom"]') != '':
                        text = t.read(element_identifier='//div[@id = "zoom"]')
                        with open(file_name, 'w') as f:
                            f.write(text)
                    elif t.read(element_identifier='//td[@class = "p1"]') != '':
                        text = t.read(element_identifier='//td[@class = "p1"]')
                        with open(file_name, 'w') as f:
                            f.write(text)
                    else:
                        print("write files fails...")
                    print("文件{} 含有 {} 个文件要下载。".format(i, pdf_count))
                    pdf_count += 1  # make range() reach the last attachment
                    current_count = 0  # trailing <p> elements consumed so far
                    for j in range(1, pdf_count):
                        if '.htm' not in t.read(element_identifier='//div[@id = "zoom"]//p//a/@href'):
                            print("当前是第{}个文件。。".format(j))
                            p_count = t.count(element_identifier='//div[@id = "zoom"]//p')
                            # scan paragraphs from the end until one holds a link
                            while current_count <= p_count:
                                if t.read(element_identifier='//div[@id = "zoom"]//p[last()-' + str(current_count) + ']//a') != '':
                                    # this <p> has an attachment link
                                    print("这个p有!")
                                    pdf_name = t.read(element_identifier='//div[@id = "zoom"]//p[last()-' + str(current_count) + ']//a/@href')
                                    # link display text = the compliant file name
                                    pdf_name_to_change = t.read(element_identifier='//div[@id = "zoom"]//p[last()-' + str(current_count) + ']//a')
                                    # download
                                    suffix = pdf_name.split('.')[-1]
                                    pdf_name = pdf_name.split('/')[-1]
                                    prefix = 'http://www.pbc.gov.cn'
                                    download_link = prefix + t.read(element_identifier='//div[@id = "zoom"]//p[last()-' + str(current_count) + ']//a/@href')
                                    if 'cnhttp' in download_link:
                                        # href already absolute: use it as-is
                                        t.url(t.read(element_identifier='//div[@id = "zoom"]//p[last()-' + str(current_count) + ']//a/@href'))
                                    else:
                                        t.url(download_link)
                                    wait_seconds = 1
                                    total_seconds = 0
                                    while os.path.exists(pdf_name) == False:
                                        t.wait(wait_seconds)
                                        total_seconds += wait_seconds
                                        if total_seconds > 30:
                                            print('download fails')
                                            break
                                    os.rename(pdf_name, pdf_name_to_change)  # rename to display text
                                    # append the date before the extension
                                    os.rename(pdf_name_to_change, pdf_name_to_change[:-(len(suffix)+1)] + '_' + time + pdf_name_to_change[-(len(suffix)+1):])
                                    t.url(content_url)  # back to the article page
                                    current_count += 1
                                    break
                                else:
                                    current_count += 1
                                    print("这个p没有")
                        else:
                            # the "attachment" is itself a web page: treat as text
                            print("是个网页,当文档处理!")
                            prefix = 'http://www.pbc.gov.cn'
                            download_link = prefix + t.read(
                                element_identifier='//div[@id = "zoom"]//p[' + str(j) + ']//a/@href')
                            if 'cnhttp' in download_link:
                                t.url(t.read(element_identifier='//div[@id = "zoom"]//p[' + str(j) + ']//a/@href'))
                            else:
                                t.url(download_link)
                            # save the linked page's text
                            if t.read(element_identifier='//div[@id = "zoom"]') != '':
                                text = t.read(element_identifier='//div[@id = "zoom"]')
                                with open(file_name, 'w') as f:
                                    f.write(text)
                            elif t.read(element_identifier='//td[@class = "p1"]') != '':
                                text = t.read(element_identifier='//td[@class = "p1"]')
                                with open(file_name, 'w') as f:
                                    f.write(text)
                            else:
                                print("write files fails...")
    t.close()
def gethistorylist(inputyear):
    """Scrape jrj.com.cn bank wealth-management products issued in *inputyear*.

    inputyear: four-digit year as a string, e.g. '2003'.
    Filters the product table to issue dates >= Jan 1 of the year, pages
    through the ascending-sorted results until a row past Dec 31 appears,
    and writes '<year>.csv' and '<year>.xlsx'.  Returns None.
    """
    # renamed from `input`, which shadowed the builtin
    year = inputyear
    date_start = year + '-01-01'  # first day of the year
    date_end = year + '-12-31'  # last day of the year
    t.init()
    t.url("http://bank.jrj.com.cn/bankpro/data.shtml?type=1")
    # expand the condensed filter panel
    t.hover(element_identifier='//*[@id="fxr"]')
    t.click(element_identifier='//*[@id="zksq"]')
    # type the issue-date lower bound, then search
    t.hover(element_identifier='//*[@id="fxr"]')
    t.click(element_identifier='//*[@id="fxr"]')
    t.type(element_identifier='//*[@id="fxr"]', text_to_type=date_start)
    # click again so the date picker does not cover the search button
    t.click(element_identifier='//*[@id="fxr"]')
    t.hover(element_identifier='//*[@class="ipf01"]')
    t.click(element_identifier='//*[@class="ipf01"]')
    # show 50 products per page
    t.hover(element_identifier='//*[@data-pagesize="50"]')
    t.click(element_identifier='//*[@data-pagesize="50"]')
    # sort ascending by issue date ("reversed" order)
    t.hover(element_identifier='//*[@data-sort = "sell_org_date"]')
    t.click(element_identifier='//*[@data-sort = "sell_org_date"]')
    page_curr = 1  # current page index
    value_dict = {}  # collected rows, keyed by column name
    name_list = [
        '序号', '产品名称', '发行银行', '委托货币', '发行日', '停售日', '管理期(天)',
        '预期收益率', '到期收益率', '与同期储蓄比', '综合评级', 'url'
    ]
    for col_name in name_list:
        value_dict.setdefault(col_name, [])  # initialize empty columns
    # becomes True once a row dated past date_end is seen
    stop_flag = False
    # loop while the pager marks page_curr as current, or on the first page
    while (t.read(element_identifier=
                  '//div[@id = "pagefoot"]//a[@class = "cur pf-disabled"]')
           == str(page_curr)) or (page_curr == 1):
        if stop_flag:  # nothing from this year left: stop paging
            break
        # row count on this page (+1 because XPath indices start at 1)
        count_values = int(
            t.count(element_identifier='//tbody[@id = "content"]//tr')) + 1
        for i in range(1, count_values):
            # keep every row whose issue date is within the year
            if str(
                    t.read(element_identifier='//tbody[@id = "content"]//tr['
                           + str(i) + ']//td[@class = "px"]')) <= date_end:
                # the first ten columns live in td[2] .. td[11], in the same
                # order as name_list (序号 .. 与同期储蓄比)
                for offset, col_name in enumerate(name_list[:10]):
                    value_dict[col_name].append(
                        t.read(element_identifier=
                               '//tbody[@id = "content"]//tr[' + str(i) +
                               ']/td[' + str(offset + 2) + ']'))
                # 综合评级 comes from the rating icon's tooltip
                value_dict[name_list[10]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr['
                           + str(i) + ']//td[12]//i/@title'))
                # url
                value_dict[name_list[11]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr['
                           + str(i) + ']//a/@href'))
            else:
                # row is past the year: finish this page and the while loop
                stop_flag = True
                print("thread stops here..")
                break
        # turn the page
        page_curr += 1
        print("turn the page..")
        # simulate mouse move, then click the pager link
        t.hover(element_identifier='//*[@href="' + str(page_curr) + '"]')
        t.click(element_identifier='//*[@href="' + str(page_curr) + '"]')
    # close the tagui flow
    t.close()
    # output files: "<year>.csv" and "<year>.xlsx"
    hist_data = pd.DataFrame(value_dict)
    hist_data.to_csv(year + ".csv", index=False, encoding='UTF-8')
    hist_data.to_excel(year + ".xlsx", index=False, encoding='UTF-8')


#gethistorylist('2003')
def _scrape_farfetch_cards(shoe, email):
    """Search farfetch for *shoe* and collect up to 3 product cards.

    Assumes the browser is already on farfetch with the right department
    tab selected.  Returns a list of result dicts; a single "NA" row when
    the search yields nothing.
    """
    details = []
    t.type('//input[@class="js-searchboxABTest force-ltr"]', shoe + " Shoes")
    t.click('//form[@class="ff-search"]/button')
    t.wait(3)
    count = t.count('(//li[@data-test="productCard"])')
    if count != 0:
        # take min(count, 3) cards; the original range(1, min(count, 4))
        # dropped the last card whenever fewer than 4 were found
        for i in range(1, min(count, 3) + 1):
            name = t.read(
                f'(//li[@data-test="productCard"])[{i}]//div[@data-test="information"]/p'
            )
            price = t.read(
                f'(//li[@data-test="productCard"])[{i}]//div[@data-test="information"]/div'
            ).replace('$', '')
            # discounted cards render "...Off<price>"; keep the final price
            if 'Off' in price:
                price = price.split('Off')[1]
            img = t.read(f'(//li[@data-test="productCard"])[{i}]//img/@src')
            link = "https://www.farfetch.com" + t.read(
                f'(//li[@data-test="productCard"])[{i}]/a/@href')
            details.append({
                "email": email,
                "name": name,
                "price": price,
                "img": img,
                "Company": "Farfetch",
                "link": link
            })
            # print(f"name: {name}, price: {price} img_source = {img}")
    else:
        details.append({
            "email": email,
            "name": "NA",
            "price": "NA",
            "img": "NA",
            "Company": "Farfetch",
            "link": "NA"
        })
    return details


def get_shoe(shoe, gender, email):
    """Scrape up to three matching shoe listings from farfetch.com.

    shoe:   search phrase, e.g. "nike air".
    gender: ' men' or ' women' (note the leading space, as produced by the
            caller) selects the department tab; any other value searches
            without switching tabs.
    email:  requester's address, echoed into every result row.

    Returns a list of dicts with keys email/name/price/img/Company/link;
    a single "NA" row when the search yields nothing.
    """
    t.init(visual_automation=True)
    t.url("https://www.farfetch.com/sg/")
    # switch department first when a gender was specified
    if gender == ' men':
        t.click('(//span[@class="tabs__span"])[.="Men"]')
    elif gender == ' women':
        t.click('(//span[@class="tabs__span"])[.="Women"]')
    details = _scrape_farfetch_cards(shoe, email)
    t.close()
    return details