def main(url, mode='txt'):
    """Crawl a paginated listing, resuming from the recorded log position.

    Resumes from ``baojianhui_log.txt`` when present, otherwise starts at
    page 1. Runs ``main_operation()`` on every page, clicking "next" until
    the last page, and persists progress after each page turn.

    :param url: listing URL prefix; the current page number is appended.
    :param mode: passed through to ``main_operation`` (default ``'txt'``).
    :return: ``True`` once all pages have been processed.
    """
    if os.path.exists('baojianhui_log.txt'):
        with open('baojianhui_log.txt', 'r', encoding='utf-8') as f:
            params = f.read().split(',')
        # BUG FIX: the value read back from the log is a string; normalize to
        # int so the final `curr_page == total_page` comparison cannot be
        # silently false ('3' == 3) after resuming from the log.
        curr_page = int(params[0])
    else:
        # First run: start from the initial page.
        curr_page = 1
    url_link = str(url) + str(curr_page)
    web_init(url_link)
    # Pager text looks like "<current>/<total>"; take the total.
    total_page = int(
        t.read(element_identifier='//div[@class = "ng-binding"][last()]').split('/')[-1])
    while int(curr_page) < int(total_page):
        # Process the current page, then turn to the next one.
        main_operation(url, mode)
        print('click once')
        t.click(element_identifier='//a[@ng-click = "pager.next()"]')  # next page
        t.wait(5)
        curr_page = int(
            t.read(element_identifier='//div[@class = "ng-binding"][last()]').split('/')[0])
        with open('baojianhui_log.txt', 'w', encoding='utf-8') as f:
            # After turning the page, reset the per-page indices in the log.
            f.write(str(curr_page) + ',' + str(1) + ',' + str(1))
    if curr_page == total_page:
        # Last page: one final pass is enough.
        main_operation(url, mode)
    t.close()
    return True
def catchContent():
    """Scrape TODAYonline top-story list items into a DataFrame with summaries.

    Returns a DataFrame with columns Sno, Title, URL, Summary, Img_URL.
    Rows with no image URL are dropped, Sno re-indexed, and image links fixed
    via util.fixImgLink; each article page is then visited to build a summary.
    """
    number_to = t.count(
        '(//div[@class="col"]/div[contains(@class, "today")]/ul/li[contains(@class, "col-md-12")])'
    )
    df_to = pd.DataFrame(index=range(0, number_to),
                         columns=['Sno', 'Title', 'URL', 'Summary', 'Img_URL'])
    t.hover('//div[@class="container footer-main"]')
    t.wait(2)
    # BUG FIX: was range(1, number_to) which skipped the last article
    # (xpath indices are 1-based and t.count counted number_to items).
    for n in range(1, number_to + 1):
        title = t.read(
            '//div[@class="col"]/div[contains(@class, "today")]/ul/li[contains(@class, "col-md-12")][{}]//div[contains(@class, "article-listing_content")]//h2'
            .format(n))
        URL_o = t.read(
            '//div[@class="col"]/div[contains(@class, "today")]/ul/li[contains(@class, "col-md-12")][{}]//@href'
            .format(n))
        URL = "https://www.todayonline.com" + str(URL_o)
        Img_link = t.read(
            '//div[@class="col"]/div[contains(@class, "today")]/ul/li[contains(@class, "col-md-12")][{}]//img/@src'
            .format(n))
        df_to.iloc[n - 1, 0] = n
        # BUG FIX: t.read returns str on Python 3; title.decode('utf-8')
        # raised AttributeError.
        df_to.iloc[n - 1, 1] = title
        df_to.iloc[n - 1, 2] = URL
        df_to.iloc[n - 1, 4] = Img_link
    # Blank image URLs become NaN so dropna can remove those rows.
    for i in range(0, df_to.shape[0]):
        if df_to['Img_URL'][i] == "":
            df_to['Img_URL'][i] = np.nan
    df_to.dropna(subset=['Img_URL'], inplace=True, how='any')
    df_to = df_to.reset_index(drop=True)
    df_to['Sno'] = df_to.index
    df_to = util.fixImgLink(
        df_to,
        "https://cf-templates-fghyux9ggb7t-ap-southeast-1.s3-ap-southeast-1.amazonaws.com/todayOnline.png"
    )
    for n in range(0, df_to.shape[0]):
        t.url(df_to.URL[n])
        t.wait(4)
        t.hover('//div[@class="article-detail_subscription"]')
        t.wait(2)
        number_p = t.count('//div/p[not(@class)]')
        Content = ""
        for i in range(1, number_p - 2):
            cont = t.read('//div/p[not(@class)][{}]'.format(i))
            Content = Content + "" + cont
        # BUG FIX: `unicode` does not exist on Python 3; Content is already str.
        summaries = Summarize(df_to.Title[n], Content)
        # BUG FIX: was iloc[n - 1, 3], which wrote the first summary to the
        # last row (index -1) and shifted every other summary by one row.
        df_to.iloc[n, 3] = summaries[0]
    return df_to
def catchContent():
    """Scrape Straits Times "Top Stories" headlines into a DataFrame.

    Returns a DataFrame with columns Sno, Title, URL, Summary, Img_URL,
    one row per story zone (the last two zones are excluded, matching the
    site's trailing non-story slots).
    """
    story_count = t.count(
        '(//div[contains(@data-vr-zone, "Top Stories")]//span[contains(@class, "story-headline")])'
    )
    rows = story_count - 2
    df_bb = pd.DataFrame(index=range(0, rows),
                         columns=['Sno', 'Title', 'URL', 'Summary', 'Img_URL'])
    for idx in range(0, rows):
        headline_xpath = (
            '//div[contains(@data-vr-zone, "Top Stories {}")]'
            '//span[contains(@class, "story-headline")]').format(idx)
        headline = t.read(headline_xpath)
        rel_link = t.read(headline_xpath + '//@href')
        full_url = "https://www.straitstimes.com/" + str(rel_link)
        image_src = t.read(
            headline_xpath + '/ancestor::div[contains(@class, "body")]/..//img/@src')
        summaries = SummarizeUrl(full_url)
        df_bb.iloc[idx, 0] = idx
        df_bb.iloc[idx, 1] = headline
        df_bb.iloc[idx, 2] = full_url
        df_bb.iloc[idx, 3] = summaries
        df_bb.iloc[idx, 4] = image_src
    return df_bb
def direct_download(content_url, time, i):
    """Download the attachment linked from listing row *i* and timestamp it.

    The original duplicated the entire routine in both branches of the URL
    prefix check; the only difference was stripping the bogus prefix, so the
    common logic is now written once.

    :param content_url: target URL (possibly carrying a duplicated prefix).
    :param time: date string spliced into the saved file's name.
    :param i: 1-based index of the row in the listing table.
    :raises Exception: when the row has no downloadable href.
    """
    if 'cnhttp' in content_url:
        # Some URLs come back with a duplicated scheme/host prefix;
        # drop the first 21 characters to recover the real URL.
        content_url = content_url[21:]
    print("文件{} 是直接下载文件。".format(i))
    href_xpath = '//td[@colspan = "2"]//table[' + str(i) + ']//a/@href'
    if '' == t.read(element_identifier=href_xpath):
        print("no here")
        raise Exception("an exception")
    file_name = t.read(element_identifier=href_xpath)
    suffix = file_name.split('.')[-1]
    file_name = file_name.split('/')[-1]
    t.url(content_url)  # slow to start
    # Poll until the browser has finished writing the file, up to MAX_WAIT.
    wait_seconds = 1
    total_seconds = 0
    while not os.path.exists(file_name):
        t.wait(wait_seconds)
        total_seconds += wait_seconds
        if total_seconds > MAX_WAIT:
            print('download fails')
            break
    # BUG FIX: the original inserted an extra '.' before the kept tail
    # ("name_2020-01-01..pdf"); file_name[-(len(suffix) + 1):] already
    # includes the dot.
    os.rename(
        file_name,
        file_name[:-(len(suffix) + 1)] + "_" + time +
        file_name[-(len(suffix) + 1):])
def get_shoe(shoe, g, email):
    """Search nike.com/sg for *shoe*, filter by gender, return up to 3 hits.

    Each hit is a dict with keys email, name, price, img, Company, link;
    a single "NA" placeholder entry is returned when nothing matches.
    """
    gender = g
    t.init(visual_automation=True)
    t.url('https://www.nike.com/sg/')
    t.type('//input[@id = "TypeaheadSearchInput"]', shoe + " shoes")
    t.click('//button[@class = "btn-search z2 bg-transparent"]')
    t.wait(3)
    # Apply the gender facet when one was requested (note leading space).
    if gender == " men":
        t.click('(//span[contains(@class,"filter-item")])[1]')
    elif gender == " women":
        t.click('(//span[contains(@class,"filter-item")])[2]')
    t.wait(1)
    count = t.count('//a[@class ="product-card__link-overlay"]')
    details = []
    if count == 0:
        details.append({
            "email": email,
            "name": "NA",
            "price": "NA",
            "img": "NA",
            "Company": "Nike",
            "link": "NA"
        })
    else:
        # Read at most the first three product cards (1-based xpath index).
        for k in range(1, min(count, 3) + 1):
            name = t.read(f'(//a[@class = "product-card__link-overlay"])[{k}]')
            price = t.read(f'(//div[@data-test="product-price"])[{k}]')
            img = t.read(
                f'(//div[contains(@class, "product-card__hero")]/picture/img)[{k}]/@src'
            )
            link = t.read(f'(//a[contains(@class,"product-card")])[{k}]/@href')
            details.append({
                "email": email,
                "name": name,
                "price": price,
                "img": img,
                "Company": "Nike",
                "link": link
            })
    return details
def number_of_travellers(adult_pax, children_pax, children_age):
    """Set adult/child traveller counts and children's ages on the search form.

    Reads the counts currently shown in the form and clicks the +/- nudger
    buttons until they match the requested values, then picks each child's
    age from its dropdown and closes the traveller popover.

    :param adult_pax: desired number of adults.
    :param children_pax: desired number of children.
    :param children_age: list of ages, one entry per child.
    """
    print(f"Adults: {adult_pax} and Children: {children_pax}")
    form_adult_pax = int(
        t.read('//input[@id="search-controls-adults-nudger"]'))
    form_children_pax = int(
        t.read('//input[@id="search-controls-children-nudger"]'))
    print(
        f"Form Current Adults: {form_adult_pax} and Children: {form_children_pax}"
    )

    def _nudge(current, wanted, inc_xpath, dec_xpath):
        # Click the increase/decrease button once per unit of difference.
        # (The original's extra `else` loop was a no-op — range(current,
        # wanted) is empty when they already match — and has been removed.)
        if wanted > current:
            for _ in range(wanted - current):
                t.click(inc_xpath)
                t.wait(1)
        elif wanted < current:
            for _ in range(current - wanted):
                t.click(dec_xpath)
                t.wait(1)

    # set the number of adult travellers
    _nudge(form_adult_pax, adult_pax,
           '//button[@title="Increase number of adults"]',
           '//button[@title="Decrease number of adults"]')
    # set the number of child travellers
    _nudge(form_children_pax, children_pax,
           '//button[@title="Increase number of children"]',
           '//button[@title="Decrease number of children"]')

    # Set the age for each child traveller.
    for m in range(0, len(children_age)):
        t.click(f'//select[@id="children-age-dropdown-{m}"]')
        t.select(f'//select[@id="children-age-dropdown-{m}"]',
                 str(children_age[m]))
    t.click(
        '//section[@id="cabin-class-travellers-popover"]//button[.="Done"]')
def get_max_page(url_prefix):
    """Read the "current/total" page counter from page 1 of *url_prefix*.

    Persists the total page count to a per-site text file and returns that
    file's name so later stages can pick it up.
    """
    t.url(url_prefix + '1.html')  # initial url
    page_counter = t.read(element_identifier='//td[@class = "Normal"]')
    max_page = int(page_counter.split('/')[1])  # total number of pages
    site_tag = str(url_prefix.split('/')[-2])
    out_name = 'max_page_' + site_tag + '.txt'
    with open(out_name, 'w', encoding='utf-8') as f:
        f.write(str(max_page))
    return out_name
def extract_global(date_stamp):
    """Read worldwide COVID-19 totals from worldometers and store them.

    :param date_stamp: date string saved alongside the extracted figures.
    :return: status value returned by insert_db().
    """
    record = {}
    totals = {}
    record['date_stamp'] = date_stamp
    # World data
    record['country_name'] = 'Global'
    t.url('https://www.worldometers.info/coronavirus/')
    wait_for_pageload('//div[@class="maincounter-number"]')
    cases = t.read('(//div[@class="maincounter-number"])[1]/span')
    deaths = t.read('(//div[@class="maincounter-number"])[2]/span')
    recovered = t.read('(//div[@class="maincounter-number"])[3]/span')
    # Counters are rendered with thousands separators; strip before int().
    totals['total_cases'] = int(cases.replace(',', ''))
    totals['total_deaths'] = int(deaths.replace(',', ''))
    totals['total_recovered'] = int(recovered.replace(',', ''))
    record['conv_info_str'] = json.dumps(totals)
    return insert_db(record)
def read_content(page_num, url_prefix, i):
    """Open listing page *page_num* and pull metadata for its *i*-th row.

    :return: (flag, time, content_url, file_name) — flag is the row's href
        (the caller uses it to decide download vs. text extraction), time is
        the date embedded in the generated file name, content_url is the
        absolute link, file_name is the title with a "_<date>.txt" suffix.
    :raises Exception: when the row or its href is empty.
    """
    t.url(url_prefix + str(page_num) + '.html')  # slow to load
    t.wait(2)
    row_xpath = '//td[@colspan = "2"]//table[' + str(i) + ']'
    if '' == t.read(element_identifier=row_xpath):
        print("no here")
        raise Exception("an exception")
    file_name = t.read(element_identifier=row_xpath)
    # The title ends with a 10-char date; splice "_" in and append ".txt".
    file_name = file_name[:-10] + str("_") + file_name[-10:] + str('.txt')
    time = file_name[-14:-4]
    prefix = 'http://www.pbc.gov.cn'
    content_url = prefix + t.read(element_identifier=row_xpath + '//a/@href')
    if '' == t.read(element_identifier=row_xpath + '//a/@href'):
        print("no here")
        raise Exception("an exception")
    flag = t.read(element_identifier=row_xpath + '//a/@href')  # download or not
    return flag, time, content_url, file_name
def catchContent():
    """Scrape headline list items (css-1iski2w) into a DataFrame with summaries.

    Rows whose summarization produced 'None' are dropped and Sno re-indexed.
    """
    number = t.count('(//li[contains(@class, "css-1iski2w")]/a)')
    df = pd.DataFrame(index=range(0, number),
                      columns=['Sno', 'Title', 'URL', 'Summary', 'Img_URL'])
    for n in range(1, number + 1):
        title = t.read('//li[contains(@class, "css-1iski2w")][{}]/a/div'.format(n))
        URL = t.read('//li[contains(@class, "css-1iski2w")][{}]//@href'.format(n))
        Img_link = t.read('//li[contains(@class, "css-1iski2w")][{}]//img/@src'.format(n))
        summaries = SummarizeUrl(URL)
        df.iloc[n - 1, 0] = n
        # BUG FIX: t.read returns str on Python 3; str has no .decode(), so
        # the original title.decode('utf-8') raised AttributeError.
        df.iloc[n - 1, 1] = title
        df.iloc[n - 1, 2] = URL
        df.iloc[n - 1, 3] = summaries
        df.iloc[n - 1, 4] = Img_link
    # Drop rows where summarization failed and renumber the remainder.
    df['Summary'].replace('None', np.nan, inplace=True)
    df.dropna(subset=['Summary'], inplace=True, how='any')
    df = df.reset_index(drop=True)
    df['Sno'] = df.index
    return df
def get_news_using_crawler():
    """Scrape up to 5 WHO coronavirus news articles, summarize and store them.

    Clears previously stored news (delete_news_data_db) and inserts one DB
    record per article with link, title and auto-generated summary. The tagui
    session is always closed on exit.

    :return: status of the last insert_db() call, or None when an error was
        caught and printed.
    """
    try:
        t.url(
            'https://www.who.int/emergencies/diseases/novel-coronavirus-2019/media-resources/news'
        )
        wait_for_pageload('//p[@class="heading text-underline"]')
        num_news = t.count('//p[@class="heading text-underline"]')
        if num_news > 5:
            num_news = 5  # cap at the five most recent articles
        delete_news_data_db()
        # Stamp every record with today's date in Singapore time.
        date_stamp = datetime.datetime.now(
            pytz.timezone('Singapore')).strftime('%Y-%m-%d')
        for n in range(1, num_news + 1):
            data = {}
            data['date_stamp'] = date_stamp
            news_link = t.read(
                f'(//p[@class="heading text-underline"])[{n}]/ancestor-or-self::a/@href'
            )
            data['news_link'] = news_link
            news_title = t.read(
                f'(//p[@class="heading text-underline"])[{n}]/ancestor-or-self::a/@aria-label'
            )
            data['news_title'] = news_title
            print('Article', n, ":", news_title)
            print('')
            news_summaries = SummarizeUrl(news_link)
            data['news_summary'] = str(news_summaries)
            print(news_summaries)
            status = insert_db(data)
        return status
    except Exception as e:
        print(e)
    finally:
        t.close()
def extract_all_countries(date_stamp, status):
    """Scrape per-country COVID-19 table rows from worldometers into the DB.

    Only the first half of the country links is read — the page repeats the
    table for "yesterday" in the second half.

    :param date_stamp: date string stored with every record.
    :param status: initial status; overwritten by each insert_db() call.
    :return: status of the last insert.
    """
    num_country = int(
        t.count('(//a[@class="mt_a"])') /
        2)  # first half is for today, second half is for yesterday
    # Column keys and their td offsets relative to the country cell.
    columns = (
        ('total_cases', 1),
        ('new_cases', 2),
        ('total_deaths', 3),
        ('new_deaths', 4),
        ('total_recovered', 5),
        ('active_cases', 6),
        ('serious_cases', 7),
    )
    for n in range(1, num_country + 1):
        country_row_xpath = f'(//a[@class="mt_a"])[{n}]'
        region_detail = {}
        for key, off in columns:
            cell_xpath = country_row_xpath + f'/../following-sibling::td[{off}]'
            region_detail[key] = convert_extracted_numbers(t.read(cell_xpath))
        data = {}
        data['date_stamp'] = date_stamp
        data['conv_info_str'] = json.dumps(region_detail)
        data['country_name'] = t.read(country_row_xpath)
        status = insert_db(data)
    return status
def get_count_values(page_num, url_prefix, today):
    """Count the file rows on listing page *page_num* and log the count.

    :return: the log file's name, or the string '今日无增量' when the newest
        row on the page predates *today* (nothing new to fetch).
    """
    t.url(url_prefix + str(page_num) + '.html')
    print("现在所在页面 {}".format(page_num))
    t.wait(5)
    # Number of item tables on the page.
    count_values = t.count(element_identifier='//td[@colspan = "2"]//table')
    # today = '2018-04-24'
    newest_date = t.read(
        element_identifier=
        '//td[@colspan = "2"]//table[1]//span[@class = "hui12"]')
    if newest_date < today:  # lexicographic compare works for YYYY-MM-DD
        return '今日无增量'
    print("页面有{}个文件".format(count_values))
    out_name = ('count_items_' + str(page_num) + '_' +
                str(url_prefix.split('/')[-2]) + '.txt')
    with open(out_name, 'w', encoding='utf-8') as f:
        # Record "page:<num>:<count>", colon-separated.
        f.write('page:' + str(page_num) + ':' + str(count_values))
    return out_name
def return_trip(enquiry):
    """Fill in and submit the round-trip flight search form.

    *enquiry* supplies "dates" (two dd/mm/YYYY strings), "city" (origin and
    destination) and "cabin_class".
    """
    start_date = dt.strptime(enquiry["dates"][0], '%d/%m/%Y')
    end_date = dt.strptime(enquiry["dates"][1], '%d/%m/%Y')
    t.click('//input[@id="flight-type-roundtrip-hp-flight"]')
    t.type('//input[@id="flight-origin-hp-flight"]', enquiry["city"][0])
    t.type('//input[@id="flight-destination-hp-flight"]', enquiry["city"][1])
    # Clear the departure field before typing the new date.
    t.type('//input[@id="flight-departing-hp-flight"]', '[clear]')
    t.type('//input[@id="flight-departing-hp-flight"]',
           start_date.strftime("%d/%m/%Y"))
    t.click('//*[@id="traveler-selector-hp-flight"]/div/ul/li/button')
    t.click('//a[@id="flight-advanced-options-hp-flight"]')
    t.select('//select[@id="flight-advanced-preferred-class-hp-flight"]',
             lookup_cabin_class(enquiry["cabin_class"]))
    t.click('//*[@id="gcw-flights-form-hp-flight"]/div[8]/label/button')
    tu.wait_for_pageload('//button[@id="flights-advanced-options-toggle"]')
    # The results page may pre-fill a wrong return date; correct it if so.
    wanted_end = end_date.strftime("%d/%m/%Y")
    curr_enddate = t.read('//input[@id="return-date-1"]')
    if curr_enddate != wanted_end:
        t.type('//input[@id="return-date-1"]', '[clear]')
        t.type('//input[@id="return-date-1"]', wanted_end)
    t.click('//*[@id="flight-wizard-search-button"]')
def hover_and_read(selector):
    """Hover over *selector* (to trigger any lazy rendering) and return its text.

    :param selector: element identifier understood by tagui.
    :return: text content read from the element.
    """
    t.hover(selector)
    # Renamed the local from `str`, which shadowed the builtin.
    text = t.read(selector)
    return text
def get_email_content(self, item_xpath, is_unread):
    """Open the mail item at *item_xpath*, return its body text, and mark it
    as unread again so the mailbox is left as found.

    Note: *is_unread* is accepted for interface compatibility but not used
    in this implementation.
    """
    click(item_xpath)
    body = t.read('//div[@role="listitem"]')
    # Re-mark as unread via the hover toolbar button.
    t.hover('//div[contains(@title,"Mark as unread")]')
    click('(//div[contains(@data-tooltip,"Mark as unread")])[2]')
    return body
def read(xpath):
    """Block until the element at *xpath* exists, then return its text."""
    wait_element(xpath)
    value = t.read(xpath)
    return value
import tagui as t

# Account/contract numbers to process — one Coelba web session per entry.
login = ['7038157994']
cont = len(login)
aux = 0
while aux != cont:
    t.init()
    # Login page that redirects to the duplicate-bill (2ª via) service.
    t.url(
        'http://servicos.coelba.com.br/servicos-ao-cliente/Pages/login-av.aspx?UrlUc=http://servicos.coelba.com.br/servicos-ao-cliente/Pages/2-via-de-conta-coelba.aspx'
    )
    t.click(
        'ctl00$m$g_2d0a0930_51e9_4b08_addf_fccd4023f2e8$ctl00$txtContaContrato'
    )
    t.type(
        'ctl00$m$g_2d0a0930_51e9_4b08_addf_fccd4023f2e8$ctl00$txtContaContrato',
        login[aux])
    # NOTE(review): reads the captcha text straight off the page and types it
    # back — this only works if the captcha is rendered as plain text; verify.
    captcha = t.read('textCaptcha')
    t.click('ctl00$m$g_2d0a0930_51e9_4b08_addf_fccd4023f2e8$ctl00$txtCaptcha')
    t.type('ctl00$m$g_2d0a0930_51e9_4b08_addf_fccd4023f2e8$ctl00$txtCaptcha',
           captcha)
    t.click(
        'ctl00$m$g_2d0a0930_51e9_4b08_addf_fccd4023f2e8$ctl00$btnAutenticar')
    t.close()
    aux += 1
# 把展示的尺寸设置为50个产品每页: t.hover(element_identifier='//*[@data-pagesize="50"]') t.click(element_identifier='//*[@data-pagesize="50"]') # 当下一页没有被disable的时候,有以下超参数 page_curr = 1 # 当前页面index value_dict = {} # 存放data count = 1 # csv 命名用 # 存放列名 name_list = ['序号', '综合评级', 'url'] for col_name in name_list: value_dict.setdefault(col_name, []) # 初始化空数据集 # 当可以翻页,或数据只有一页的时候,进行循环 while (t.read(element_identifier= '//div[@id = "pagefoot"]//a[@class = "cur pf-disabled"]') == str(page_curr)) or (page_curr == 1): # 每页的数据量大小(row number) count_values = int( t.count(element_identifier='//tbody[@id = "content"]//tr') ) + 1 # python从0开始 # 爬取页面所有一个table里的值 if str( t.read(element_identifier='//tbody[@id = "content"]//tr[' + str(count_values - 1) + ']//td[@class = "px"]')) > str_to_append: # print("direct continue..") # 翻页 page_curr += 1 # 鼠标模拟移动,并点击翻页
def compliance_data(url_prefix):
    """Crawl every listing page under *url_prefix* and archive each item.

    For each row: plain articles are saved to "<title>.txt"; rows linking to
    PDFs are downloaded and renamed to their display text; anything else is
    saved as a document. Stops early when a row predates today (no new
    content). Always drives a fresh tagui session and closes it at the end.
    """
    t.init()
    init_url = url_prefix + '1.html'
    t.url(init_url)  # initial url
    max_page = int(
        t.read(element_identifier='//td[@class = "Normal"]').split('/')
        [1]) + 1  # max page count (+1: exclusive bound for range below)
    for page_num in range(1, max_page):
        t.url(url_prefix + str(page_num) + '.html')
        print("现在所在页面 {}".format(page_num))
        t.wait(5)
        # Number of item tables on the page (+1: 1-based range below).
        count_values = t.count(
            element_identifier='//td[@colspan = "2"]//table') + 1
        today = datetime.datetime.today()
        today = str(today.date())
        # today = '2018-04-24'
        # Newest item predates today -> nothing new here or on later pages.
        if t.read(element_identifier=
                  '//td[@colspan = "2"]//table[1]//span[@class = "hui12"]'
                  ) < today:
            print("今日无增量")
            break
        print("页面有{}个文件".format(count_values - 1))
        t.wait(5)
        for i in range(1, count_values):
            t.url(url_prefix + str(page_num) + '.html')
            # Abort the whole run once a row predates today.
            if t.read(element_identifier='//td[@colspan = "2"]//table[' +
                      str(i) + ']//span[@class = "hui12"]') < today:
                t.close()
                exit(1)
            file_name = t.read(
                element_identifier='//td[@colspan = "2"]//table[' + str(i) +
                ']') + str('.txt')
            prefix = 'http://www.pbc.gov.cn'
            content_url = prefix + t.read(
                element_identifier='//td[@colspan = "2"]//table[' + str(i) +
                ']//a/@href')
            if 'cnhttp' in content_url:
                # Strip the duplicated prefix (cause unknown) and save as text.
                content_url = content_url[21:]
                t.url(content_url)
                text = t.read(element_identifier='//div[@id = "zoom"]')
                with open(file_name, 'w') as f:
                    f.write(text)
                print("文件{} 是文档。".format(i))
                continue
            t.url(content_url)  # enter the item's detail page
            # Count the linked files and derive their names below.
            t.wait(2)
            pdf_count = t.count(
                element_identifier='//div[@id = "zoom"]//a/@href')
            if pdf_count == 0:  # plain text article, no attachments
                print("文件{} 是文档。".format(i))
                # Save the article text from whichever container has it.
                if t.read(element_identifier='//div[@id = "zoom"]') != '':
                    text = t.read(element_identifier='//div[@id = "zoom"]')
                    with open(file_name, 'w') as f:
                        f.write(text)
                elif t.read(element_identifier='//td[@class = "p1"]') != '':
                    text = t.read(element_identifier='//td[@class = "p1"]')
                    with open(file_name, 'w') as f:
                        f.write(text)
                else:
                    print("write files fails...")
            elif ('pdf' in t.read(
                    element_identifier='//div[@id = "zoom"]//a/@href')):
                print("文件{} 含有 {} 个pdf。".format(i, pdf_count))
                pdf_count += 1  # make the range below 1-based over all pdfs
                for j in range(1, pdf_count):
                    # Read the pdf's stored name.
                    if t.read(element_identifier='//div[@id = "zoom"]//p[' +
                              str(j) + ']//a/@href') != '':
                        print("当前是第{}个pdf。。".format(j))
                        pdf_name = t.read(
                            element_identifier='//div[@id = "zoom"]//p[' +
                            str(j) + ']//a/@href').split('/')[-1]
                        # Display text = the compliant name to rename to.
                        pdf_name_to_change = t.read(
                            element_identifier='//div[@id = "zoom"]//p[' +
                            str(j) + ']//a')
                        # Download by navigating to the file url.
                        prefix = 'http://www.pbc.gov.cn'
                        t.url(prefix + t.read(
                            element_identifier='//div[@id = "zoom"]//p[' +
                            str(j) + ']//a/@href'))
                        # Poll for the download, up to 30 seconds.
                        wait_seconds = 1
                        total_seconds = 0
                        while os.path.exists(pdf_name) == False:
                            t.wait(wait_seconds)
                            total_seconds += wait_seconds
                            if total_seconds > 30:
                                print('download fails')
                                break
                        os.rename(pdf_name, pdf_name_to_change)  # rename
                        t.url(content_url)  # back to the detail page
                    else:
                        print("不合规,当文档处理!不读了!!!")
                        # Fall back to saving the article text.
                        if t.read(element_identifier='//div[@id = "zoom"]'
                                  ) != '':
                            text = t.read(
                                element_identifier='//div[@id = "zoom"]')
                            with open(file_name, 'w') as f:
                                f.write(text)
                        elif t.read(element_identifier='//td[@class = "p1"]'
                                    ) != '':
                            text = t.read(
                                element_identifier='//td[@class = "p1"]')
                            with open(file_name, 'w') as f:
                                f.write(text)
                        else:
                            print("write files fails...")
                        t.url(url_prefix + str(page_num) + '.html')
                        break
            else:
                print("文件{} 含有 {} 个pdf。".format(i, pdf_count))
                print("含有其他format的href,当文档处理!不读了!!!")
                # Non-pdf attachments: save the article text instead.
                if t.read(element_identifier='//div[@id = "zoom"]') != '':
                    text = t.read(element_identifier='//div[@id = "zoom"]')
                    with open(file_name, 'w') as f:
                        f.write(text)
                elif t.read(element_identifier='//td[@class = "p1"]') != '':
                    text = t.read(element_identifier='//td[@class = "p1"]')
                    with open(file_name, 'w') as f:
                        f.write(text)
                else:
                    print("write files fails...")
                t.url(url_prefix + str(page_num) + '.html')
                break
    t.close()
def getFlightInfo(date, ind):
    """Read the first two Skyscanner result tickets into structured lists.

    :param date: list of leg departure dates ('dd/mm/YYYY'); its length also
        sets the number of legs scraped per ticket.
    :param ind: running deal index, incremented once per ticket read.
    :return: (main, time_lst, code_lst, dur_ref, ind) — ``main`` holds deal
        numbers, prices, hyperlinks and per-leg details; the other values
        feed later processing steps.
    """
    t.wait(2)
    util.wait_for_pageload('//div[@class="ResultsSummary_summaryContainer__3_ZX_"]//span[@class="BpkText_bpk-text__2NHsO BpkText_bpk-text--sm__345aT SummaryInfo_itineraryCountContainer__30Hjd"]')
    price_lst = []
    href_lst = []
    deal_lst = []
    type = len(date)  # number of flight legs (NOTE(review): shadows builtin `type`)
    time_lst = []
    code_lst = []
    details_lst = []
    dur_ref = []
    for n in range(2):
        leg_lst = []
        bound_lst = []
        time_dep_lst = []
        time_arr_lst = []
        time_arr_day_lst = []
        airline_lst = []
        dur_lst = []
        transfer_lst = []
        transfer_plc_lst = []
        index_lst = []
        # Offset the ticket index by one when a promo badge ticket is present.
        if t.present('//span[@class="BpkBadge_bpk-badge__2mEjm "]'):
            k = n + 1
        else:
            k = n
        ### href and price check
        href = t.read(f'(//a[@class="FlightsTicket_link__kl4DL"])[{n + 1}]//@href')
        if t.present('//span[@class="BpkText_bpk-text__2NHsO BpkText_bpk-text--sm__345aT Price_totalPrice__24xz2"]'):
            price = t.read(f'(//div[@class="BpkTicket_bpk-ticket__Brlno BpkTicket_bpk-ticket--with-notches__2i2HX"])[{k+1}]//div//div[@class="TicketStub_horizontalStubContainer__2aEis"]//div//span[@class="BpkText_bpk-text__2NHsO BpkText_bpk-text--sm__345aT Price_totalPrice__24xz2"]')
            price_lst.append(float(price.replace(',', '').replace(' total', '').replace('$', '')))
            print(price_lst)
        else:
            price = t.read(f'(//div[@class="BpkTicket_bpk-ticket__Brlno BpkTicket_bpk-ticket--with-notches__2i2HX"])[{k+1}]//div//div[@class="TicketStub_horizontalStubContainer__2aEis"]//div//div//span')
            price_lst.append(float(price.replace(',', '').replace('$', '')))
            print(price_lst)
        ind = ind + 1
        print(ind)
        for i in range(type):
            leg = i + 1
            print(leg)
            code = t.read(f'(//div[@class="BpkTicket_bpk-ticket__Brlno BpkTicket_bpk-ticket--with-notches__2i2HX"])[{k+1}]//div//div//div//div[@class="LegDetails_container__11hQT TicketBody_leg__1_ia3"][{i+1}]//div//div//div//img[@class="BpkImage_bpk-image__img__3HwXN"]/@src')
            airline = t.read(f'(//div[@class="BpkTicket_bpk-ticket__Brlno BpkTicket_bpk-ticket--with-notches__2i2HX"])[{k+1}]//div//div//div//div[@class="LegDetails_container__11hQT TicketBody_leg__1_ia3"][{i+1}]//div//div//div//img[@class="BpkImage_bpk-image__img__3HwXN"]/@alt')
            print(airline)
            # Chars 43:45 of the logo image src — presumably the airline
            # code; verify against the live markup.
            code_lst.append(code[43:45])
            print(code_lst)
            time_dep = t.read(f'(//div[@class="BpkTicket_bpk-ticket__Brlno BpkTicket_bpk-ticket--with-notches__2i2HX"])[{k+1}]//div//div//div//div[@class="LegDetails_container__11hQT TicketBody_leg__1_ia3"][{i+1}]//div//div[@class="LegInfo_routePartialDepart__37kr9"]//span//div//span')
            time_arr = t.read(f'(//div[@class="BpkTicket_bpk-ticket__Brlno BpkTicket_bpk-ticket--with-notches__2i2HX"])[{k+1}]//div//div//div//div[@class="LegDetails_container__11hQT TicketBody_leg__1_ia3"][{i+1}]//div//div[@class="LegInfo_routePartialArrive__ZsZxc"]//span//div//span[@class="BpkText_bpk-text__2NHsO BpkText_bpk-text--lg__3vAKN"]')
            dur = t.read(f'(//div[@class="BpkTicket_bpk-ticket__Brlno BpkTicket_bpk-ticket--with-notches__2i2HX"])[{k+1}]//div//div//div//div[@class="LegDetails_container__11hQT TicketBody_leg__1_ia3"][{i+1}]//div//div[@class="LegInfo_stopsContainer__1XNWn"]//span[@class="BpkText_bpk-text__2NHsO BpkText_bpk-text--sm__345aT Duration_duration__1QA_S"]')
            transfer = t.read(f'(//div[@class="BpkTicket_bpk-ticket__Brlno BpkTicket_bpk-ticket--with-notches__2i2HX"])[{k+1}]//div//div//div//div[@class="LegDetails_container__11hQT TicketBody_leg__1_ia3"][{i+1}]//div//div[@class="LegInfo_stopsContainer__1XNWn"]//div//span')
            print(transfer)
            # Collect the stopover place(s) depending on the stop count text.
            if transfer == 'Direct':
                transfer_plc = ''
            elif transfer == '1 stop':
                transfer_plc = t.read(f'(//div[@class="BpkTicket_bpk-ticket__Brlno BpkTicket_bpk-ticket--with-notches__2i2HX"])[{k+1}]//div//div//div//div[@class="LegDetails_container__11hQT TicketBody_leg__1_ia3"][{i+1}]//div//div[@class="LegInfo_stopsContainer__1XNWn"]//div//div//span//span')
            elif transfer == '2 stops':
                transfer_plc1 = t.read(f'(//div[@class="BpkTicket_bpk-ticket__Brlno BpkTicket_bpk-ticket--with-notches__2i2HX"])[{k+1}]//div//div//div//div[@class="LegDetails_container__11hQT TicketBody_leg__1_ia3"][{i+1}]//div[@class="LegInfo_stopsContainer__1XNWn"]//div//div//span[1]//span')
                transfer_plc2 = t.read(f'(//div[@class="BpkTicket_bpk-ticket__Brlno BpkTicket_bpk-ticket--with-notches__2i2HX"])[{k + 1}]//div//div//div//div[@class="LegDetails_container__11hQT TicketBody_leg__1_ia3"][{i + 1}]//div[@class="LegInfo_stopsContainer__1XNWn"]//div//div//span[2]//span')
                transfer_plc = transfer_plc1 + ',' + transfer_plc2
            elif transfer == '3 stops':
                transfer_plc1 = t.read(
                    f'(//div[@class="BpkTicket_bpk-ticket__Brlno BpkTicket_bpk-ticket--with-notches__2i2HX"])[{k + 1}]//div//div//div//div[@class="LegDetails_container__11hQT TicketBody_leg__1_ia3"][{i + 1}]//div[@class="LegInfo_stopsContainer__1XNWn"]//div//div//span[1]//span')
                transfer_plc2 = t.read(
                    f'(//div[@class="BpkTicket_bpk-ticket__Brlno BpkTicket_bpk-ticket--with-notches__2i2HX"])[{k + 1}]//div//div//div//div[@class="LegDetails_container__11hQT TicketBody_leg__1_ia3"][{i + 1}]//div[@class="LegInfo_stopsContainer__1XNWn"]//div//div//span[2]//span')
                transfer_plc3 = t.read(
                    f'(//div[@class="BpkTicket_bpk-ticket__Brlno BpkTicket_bpk-ticket--with-notches__2i2HX"])[{k + 1}]//div//div//div//div[@class="LegDetails_container__11hQT TicketBody_leg__1_ia3"][{i + 1}]//div[@class="LegInfo_stopsContainer__1XNWn"]//div//div//span[3]//span')
                transfer_plc = transfer_plc1 + ',' + transfer_plc2 + ',' + transfer_plc3
            print(transfer_plc)
            ### Arrival Time plus 1 day check
            if t.present(f'(//div[@class="BpkTicket_bpk-ticket__Brlno BpkTicket_bpk-ticket--with-notches__2i2HX"])[{k+1}]//div//div//div//div[@class="LegDetails_container__11hQT TicketBody_leg__1_ia3"][{i+1}]//div//div[@class="LegInfo_routePartialArrive__ZsZxc"]//span//div//span[@class="BpkText_bpk-text__2NHsO BpkText_bpk-text--sm__345aT TimeWithOffsetTooltip_offsetTooltip__24Ffv"]'):
                date_pls = 1
            else:
                date_pls = 0
            ### Bound Check
            dep = t.read(
                f'(//div[@class="BpkTicket_bpk-ticket__Brlno BpkTicket_bpk-ticket--with-notches__2i2HX"])[{k+1}]//div//div//div//div[@class="LegDetails_container__11hQT TicketBody_leg__1_ia3"][{i+1}]//div//div[@class="LegInfo_routePartialDepart__37kr9"]//span//span[@class="BpkText_bpk-text__2NHsO BpkText_bpk-text--base__2vfTl LegInfo_routePartialCityTooltip__ZqOZK"]')
            arr = t.read(
                f'(//div[@class="BpkTicket_bpk-ticket__Brlno BpkTicket_bpk-ticket--with-notches__2i2HX"])[{k+1}]//div//div//div//div[@class="LegDetails_container__11hQT TicketBody_leg__1_ia3"][{i+1}]//div//div[@class="LegInfo_routePartialArrive__ZsZxc"]//span//span[@class="BpkText_bpk-text__2NHsO BpkText_bpk-text--base__2vfTl LegInfo_routePartialCityTooltip__ZqOZK"]')
            bound = dep + ' - ' + arr
            bound_lst.append(bound)
            time_lst.append(time_dep)
            date_time_dep = date[i] + ' ' + time_dep
            datetime_dep = datetime.strptime(date_time_dep, "%d/%m/%Y %H:%M")
            time_dep_lst.append(datetime_dep)
            date_time_arr = date[i] + ' ' + time_arr
            if date_pls == 0:
                datetime_arr = datetime.strptime(date_time_arr, "%d/%m/%Y %H:%M")
            else:
                # Overnight arrival flagged above: shift by one day.
                datetime_arr = datetime.strptime(date_time_arr, "%d/%m/%Y %H:%M") + timedelta(days=1)
            time_arr_lst.append(datetime_arr)
            airline_lst.append(airline)
            dur_lst.append(dur)
            dur_ref.append(dur)
            time_arr_day = ''
            time_arr_day_lst.append(time_arr_day)
            transfer_lst.append(transfer)
            transfer_plc_lst.append(transfer_plc)
            leg_lst.append(leg)
            index_lst.append(ind)
        href_lst.append(t.url()[0:-2] + href)
        deal_lst.append(ind)
        details = {'Deal Index': index_lst, 'Flight Leg': leg_lst, 'Bound': bound_lst,
                   'Departure Time': time_dep_lst, 'Arrival Time': time_arr_lst,
                   'Duration': dur_lst, 'Transfer': transfer_lst,
                   'Transfer Place': transfer_plc_lst, 'Airline': airline_lst}
        details_lst.append(details)
    flight_info = [[] for _ in range(2)]
    main = {'Deal': deal_lst, 'Flight Info': flight_info, 'Price': price_lst,
            'Hyperlink': href_lst, 'Details': details_lst}
    return main, time_lst, code_lst, dur_ref, ind
END_DATE) # type of sale t.click('//label[@for="checkbox1"]') t.click('//label[@for="checkbox2"]') t.click('//label[@for="checkbox3"]') project_total = t.count('//div[@id="projectContainerBox"]/a') # select projects for _ in range(SELECTION_LIMIT): if project_count > project_total - 1: PROCEED = False break selected = t.read(f'//*[@id="addToProject_{project_count}"]') print(f'select {selected}') t.click(f'//*[@id="addToProject_{project_count}"]') logging.info( f'batch: {batch_count}, project: {selected}, id: {project_count}') project_count += 1 # search t.click('//input[@class="btn btn-primary btn-lg"]') t.wait(2) # wait for page load complete t.click('//input[@value="Download into CSV"]') t.wait(2) # wait for download complete print('File downloaded')
def read_if_present(selector):
    """Return the element's text when *selector* is present, else "".

    :param selector: element identifier understood by tagui.
    :return: element text, or the empty string when the element is absent.
    """
    # Renamed the local from `str`, which shadowed the builtin.
    text = ""
    if t.present(selector):
        text = t.read(selector)
    return text
def get_shoe(shoe_name, g, email):
    """
    Get shoe details from jdsports.com.sg
    :param shoe_name: name of the shoe to search for
    :param g: gender of the subscriber (' men', ' women' or anything else)
    :param email: email id of the subscriber
    :return: details, list of shoe details (a single "NA" entry when
        nothing was found).
    """
    details = []
    t.init(visual_automation=True)
    t.url('https://www.jdsports.com.sg/')
    t.wait(5)
    # Type the query and submit with Enter instead of clicking the button.
    final_command = shoe_name + " shoes" + '[enter]'
    t.keyboard('[esc]')
    t.type('//input[@id = "srchInput"]', final_command)
    #t.click('//input[@id ="srchButton"]')
    t.wait(3)
    if g == ' men':
        # NOTE(review): indentation reconstructed — the scrape below is taken
        # to run only when the "Men" filter entry is readable; confirm
        # against the original file.
        if t.read(
                '(//a[@data-e2e="plp-filterMenu-catItem"]/span)[contains(.,"Men")]'
        ):
            t.click('(//a[@data-e2e="plp-filterMenu-catItem"]/span)[1]')
            count = t.count(
                '//ul[@id="productListMain"]//li[@class="productListItem "]')
            t.wait(3)
            if count != 0:
                # Read at most the first three product cards.
                for i in range(1, min(count, 4)):
                    price = t.read(f'(//span[@class="pri"])[{i}]')
                    name = t.read(f'(//span[@class="itemTitle"])[{i}]')
                    img = t.read(
                        f'(//a[@class="itemImage"]/picture/img/@srcset)[{i}]')
                    link = "https://www.jdsports.com.sg" + t.read(
                        f'(//span[@class = "itemTitle"])[{i}]/a/@href')
                    details.append({
                        "email": email,
                        "name": name,
                        "price": price,
                        "img": img,
                        "Company": "JD",
                        "link": link
                    })
            else:
                details.append({
                    "email": email,
                    "name": "NA",
                    "price": "NA",
                    "img": "NA",
                    "Company": "JD",
                    "link": "NA"
                })
    elif g == ' women':
        if t.read(
                '(//a[@data-e2e="plp-filterMenu-catItem"]/span)[contains(.,"Women")]'
        ):
            t.click(
                '(//a[@data-e2e="plp-filterMenu-catItem"]/span)[.="Women"]')
            count = t.count(
                '//ul[@id="productListMain"]//li[@class="productListItem "]')
            t.wait(3)
            if count != 0:
                for i in range(1, min(count, 4)):
                    price = t.read(f'(//span[@class="pri"])[{i}]')
                    name = t.read(f'(//span[@class="itemTitle"])[{i}]')
                    img = t.read(
                        f'(//a[@class="itemImage"]/picture/img/@srcset)[{i}]')
                    link = "https://www.jdsports.com.sg" + t.read(
                        f'(//span[@class = "itemTitle"])[{i}]/a/@href')
                    details.append({
                        "email": email,
                        "name": name,
                        "price": price,
                        "img": img,
                        "Company": "JD",
                        "link": link
                    })
            else:
                details.append({
                    "email": email,
                    "name": "NA",
                    "price": "NA",
                    "img": "NA",
                    "Company": "JD",
                    "link": "NA"
                })
    else:
        # No gender filter: scrape the unfiltered result list.
        count = t.count(
            '//ul[@id="productListMain"]//li[@class="productListItem "]')
        t.wait(3)
        if count != 0:
            for i in range(1, min(count, 4)):
                price = t.read(f'(//span[@class="pri"])[{i}]')
                name = t.read(f'(//span[@class="itemTitle"])[{i}]')
                img = t.read(
                    f'(//a[@class="itemImage"]/picture/img/@srcset)[{i}]')
                link = "https://www.jdsports.com.sg" + t.read(
                    f'(//span[@class = "itemTitle"])[{i}]/a/@href')
                details.append({
                    "email": email,
                    "name": name,
                    "price": price,
                    "img": img,
                    "Company": "JD",
                    "link": link
                })
        else:
            details.append({
                "email": email,
                "name": "NA",
                "price": "NA",
                "img": "NA",
                "Company": "JD",
                "link": "NA"
            })
    #t.close()
    # Safety net: always return at least one entry.
    if len(details) == 0:
        details.append({
            "email": email,
            "name": "NA",
            "price": "NA",
            "img": "NA",
            "Company": "JD",
            "link": "NA"
        })
    # print("JD BOT",details)
    return details
def read_text_content(content_url, file_name, page_num, i, time, url_prefix):
    """Scrape one PBOC (pbc.gov.cn) document page: save its text to
    ``file_name`` and download/rename any attachments linked in the body.

    :param content_url: URL of the document page to scrape
    :param file_name: path of the text file to write the page body to
    :param page_num: listing-page number, used only in failure-log entries
    :param i: document index on the listing page, used in logs/prints
    :param time: date string appended to downloaded file names
                 (NOTE: shadows the stdlib ``time`` module inside this function)
    :param url_prefix: unused in this function body — TODO confirm with callers

    Relies on module-level ``t`` (tagui), ``os`` and ``MAX_WAIT``.
    """
    # Load the web page.
    if 'cnhttp' in content_url:
        # Strip a malformed duplicated prefix; unclear why it appears
        # (original comment: "not sure why this errors").
        content_url = content_url[21:]
        t.url(content_url)  # page load is slow
    else:
        t.url(content_url)  # page load is slow
    # Count attachment links to decide whether this is a plain text page.
    t.wait(2)
    pdf_count = t.count(element_identifier='//div[@id = "zoom"]//a/@href')
    if pdf_count == 0:
        # Plain document with no attachments.
        print("文件{} 是文档。".format(i))
        # Extract the body text from whichever container is present.
        if t.read(element_identifier='//div[@id = "zoom"]') != '':
            text = t.read(element_identifier='//div[@id = "zoom"]')
            try:
                with open(file_name, 'w', encoding='utf-8') as f:
                    f.write(text)
            except:
                # Bare except kept from original: falls back to a fixed
                # file name, presumably when file_name is invalid on this
                # OS (too long / illegal characters) — TODO confirm.
                with open('实施《全国企业兼并破产和职工再就业工作计划》银行呆、坏帐准备金核销办法_1997-10-01.txt',
                          'w', encoding='utf-8') as f:
                    f.write(text)
        elif t.read(element_identifier='//td[@class = "p1"]') != '':
            text = t.read(element_identifier='//td[@class = "p1"]')
            with open(file_name, 'w', encoding='utf-8') as f:
                f.write(text)
        else:
            # Neither container had text: record the failure and move on.
            with open('wrong_log.txt', 'a', encoding='utf-8') as f:
                string = 'page {} doc {} didnt write in '.format(page_num, i)
                f.write(string)
                f.write("\n")
            print("write files fails...")
    else:
        # Page has attachments; still save the body text first.
        if t.read(element_identifier='//div[@id = "zoom"]') != '':
            text = t.read(element_identifier='//div[@id = "zoom"]')
            with open(file_name, 'w', encoding='utf-8') as f:
                f.write(text)
        elif t.read(element_identifier='//td[@class = "p1"]') != '':
            text = t.read(element_identifier='//td[@class = "p1"]')
            with open(file_name, 'w', encoding='utf-8') as f:
                f.write(text)
        else:
            with open('wrong_log.txt', 'a', encoding='utf-8') as f:
                string = 'page {} doc {} didnt write in '.format(page_num, i)
                f.write(string)
                f.write("\n")
            print("write files fails...")
        print("文件{} 含有 {} 个文件要下载。".format(i, pdf_count))
        pdf_count += 1  # range() below is 1-based, so widen by one
        current_count = 0
        for j in range(1, pdf_count):
            # Decide whether the link is a downloadable file or a sub-page.
            if '.htm' not in t.read(
                    element_identifier='//div[@id = "zoom"]//p//a/@href'):
                print("当前是第{}个文件。。".format(j))
                p_count = t.count(element_identifier='//div[@id = "zoom"]//p')
                # Walk <p> elements from the end of the body looking for
                # the next one that contains an <a> link.
                while current_count <= p_count:
                    try:
                        if t.read(element_identifier=
                                  '//div[@id = "zoom"]//p[last()-' +
                                  str(current_count) + ']//a') != '':
                            # Found a paragraph with a link.
                            print("这个p有!")
                            # Raw href (used for the on-disk name).
                            pdf_name = t.read(
                                element_identifier=
                                '//div[@id = "zoom"]//p[last()-' +
                                str(current_count) + ']//a/@href')
                            # Human-readable link text = desired file name.
                            pdf_name_to_change = t.read(
                                element_identifier=
                                '//div[@id = "zoom"]//p[last()-' +
                                str(current_count) + ']//a')
                            # Download the attachment.
                            suffix = pdf_name.split('.')[-1]
                            pdf_name = pdf_name.split('/')[-1]
                            prefix = 'http://www.pbc.gov.cn'
                            download_link = prefix + t.read(
                                element_identifier=
                                '//div[@id = "zoom"]//p[last()-' +
                                str(current_count) + ']//a/@href')
                            if 'cnhttp' in download_link:
                                # href was already absolute — use it as-is.
                                t.url(
                                    t.read(element_identifier=
                                           '//div[@id = "zoom"]//p[last()-' +
                                           str(current_count) +
                                           ']//a/@href'))  # slow to load
                            else:
                                t.url(download_link)  # slow to load
                            # Poll the filesystem until the browser finishes
                            # the download, up to MAX_WAIT seconds.
                            wait_seconds = 1
                            total_seconds = 0
                            while os.path.exists(pdf_name) == False:
                                t.wait(wait_seconds)
                                total_seconds += wait_seconds
                                if os.path.exists(pdf_name_to_change):
                                    break
                                if total_seconds > MAX_WAIT:
                                    print('download fails')
                                    with open('download_log.txt', 'a',
                                              encoding='utf-8') as f:
                                        string = 'page {} doc {} file {} didnt download '.format(
                                            page_num, i, j)
                                        f.write(string)
                                        f.write("\n")
                                    break
                            if os.path.exists(pdf_name_to_change):
                                pass
                            else:
                                os.rename(pdf_name, pdf_name_to_change)  # rename to link text
                            # Insert the date before the extension:
                            # "name.pdf" -> "name_<time>.pdf".
                            os.rename(
                                pdf_name_to_change,
                                pdf_name_to_change[:-(len(suffix) + 1)] +
                                '_' + time +
                                pdf_name_to_change[-(len(suffix) + 1):])
                            t.url(content_url)  # go back to the document page (slow)
                            current_count += 1
                            break
                        else:
                            current_count += 1
                            print("这个p没有")
                    except:
                        # Bare except kept from original: skip transient
                        # read/rename errors and keep scanning.
                        print('some error occurs, nvm')
                        continue
            else:
                # The link is an HTML sub-page: treat it as a document.
                print("是个网页,当文档处理!")
                prefix = 'http://www.pbc.gov.cn'
                download_link = prefix + t.read(
                    element_identifier='//div[@id = "zoom"]//p[' + str(j) +
                    ']//a/@href')
                if 'cnhttp' in download_link:
                    t.url(
                        t.read(element_identifier='//div[@id = "zoom"]//p[' +
                               str(j) + ']//a/@href'))  # slow to load
                else:
                    t.url(download_link)  # slow to load
                # Save the sub-page's text the same way as the main body.
                if t.read(element_identifier='//div[@id = "zoom"]') != '':
                    text = t.read(element_identifier='//div[@id = "zoom"]')
                    with open(file_name, 'w', encoding='utf-8') as f:
                        f.write(text)
                elif t.read(element_identifier='//td[@class = "p1"]') != '':
                    text = t.read(element_identifier='//td[@class = "p1"]')
                    with open(file_name, 'w', encoding='utf-8') as f:
                        f.write(text)
                else:
                    with open('wrong_log.txt', 'a', encoding='utf-8') as f:
                        string = 'page {} doc {} didnt write in '.format(
                            page_num, i)
                        f.write(string)
                        f.write("\n")
                    print("write files fails...")
def getdailyincrement(str_to_append):
    """Scrape bank wealth-management products issued on a given date from
    bank.jrj.com.cn.

    Filters the product table by issue date == ``str_to_append``, saves each
    visited page's table as "<n>daily_data.csv", collects the per-product
    序号 (serial no.), 综合评级 (overall rating) and detail-page url, writes
    them to "<str_to_append>.csv", and returns the number of pages saved.

    :param str_to_append: issue date as a string (page's date format,
                          presumably "YYYY-MM-DD" — TODO confirm)
    :return: number of per-page CSV files written (count - 1)
    """
    # Start the tagui session and open the listing page.
    t.init()
    t.url("http://bank.jrj.com.cn/bankpro/data.shtml?type=1")
    # Give the page time to render.
    t.wait(15)
    # Hover and expand the condensed search options panel.
    t.hover(element_identifier='//*[@id="fxr"]')
    t.click(element_identifier='//*[@id="zksq"]')
    # Focus the issue-date input and type today's date, then search.
    t.hover(element_identifier='//*[@id="fxr"]')
    t.click(element_identifier='//*[@id="fxr"]')
    t.type(element_identifier='//*[@id="fxr"]', text_to_type=str_to_append)
    # Click again so the date picker does not cover the search button.
    t.click(element_identifier='//*[@id="fxr"]')
    t.hover(element_identifier='//*[@class="ipf01"]')
    t.click(element_identifier='//*[@class="ipf01"]')
    # Switch the page size to 50 products per page.
    t.hover(element_identifier='//*[@data-pagesize="50"]')
    t.click(element_identifier='//*[@data-pagesize="50"]')
    # Pagination state.
    page_curr = 1  # current page index
    value_dict = {}  # scraped data, keyed by column name
    count = 1  # sequence number used for per-page CSV file names
    # Column names (kept in Chinese — they become CSV headers).
    name_list = ['序号', '综合评级', 'url']
    for col_name in name_list:
        value_dict.setdefault(col_name, [])  # initialise empty columns
    # Loop while the pager marks the current page as active, or on page 1
    # (single-page result sets never update the pager).
    while (t.read(element_identifier=
                  '//div[@id = "pagefoot"]//a[@class = "cur pf-disabled"]')
           == str(page_curr)) or (page_curr == 1):
        # Row count on this page (+1 because XPath positions are 1-based).
        count_values = int(
            t.count(element_identifier='//tbody[@id = "content"]//tr')
        ) + 1
        # If even the last row's date is after the target date, the whole
        # page is too new — skip straight to the next page.
        if str(
                t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                       str(count_values - 1) +
                       ']//td[@class = "px"]')) > str_to_append:
            # Turn the page via simulated hover + click.
            page_curr += 1
            t.hover(element_identifier='//*[@href="' + str(page_curr) + '"]')
            t.click(element_identifier='//*[@href="' + str(page_curr) + '"]')
            continue
        # Snapshot the whole table for this page.
        filename = str(count) + "daily_data.csv"
        count += 1
        t.wait(1)  # settle time in case the table is still loading
        t.table(
            element_identifier='//div[@class = "table-s1 tab-s2 w100"]//table',
            filename_to_save=filename)
        count_values = int(
            t.count(element_identifier='//tbody[@id = "content"]//tr')
        ) + 1
        for i in range(1, count_values):
            # Stop early if the last row's date is past the target date
            # (rows are date-ordered — TODO confirm).
            if str(
                    t.read(element_identifier='//tbody[@id = "content"]//tr['
                           + str(count_values - 1) +
                           ']//td[@class = "px"]')) > str_to_append:
                break
            else:
                # Collect rows whose issue date equals the target date.
                if str(
                        t.read(
                            element_identifier='//tbody[@id = "content"]//tr['
                            + str(i) +
                            ']//td[@class = "px"]')) == str_to_append:
                    # Serial number.
                    value_dict[name_list[0]].append(
                        t.read(
                            element_identifier='//tbody[@id = "content"]//tr['
                            + str(i) + ']/td[2]'))
                    # Overall rating (title attribute of the star icon).
                    value_dict[name_list[1]].append(
                        t.read(
                            element_identifier='//tbody[@id = "content"]//tr['
                            + str(i) + ']//td[12]//i/@title'))
                    # Detail-page url.
                    value_dict[name_list[2]].append(
                        t.read(
                            element_identifier='//tbody[@id = "content"]//tr['
                            + str(i) + ']//a/@href'))
                else:
                    # Not issued on the target date — ignore.
                    pass
        # Turn the page via simulated hover + click.
        page_curr += 1
        t.hover(element_identifier='//*[@href="' + str(page_curr) + '"]')
        t.click(element_identifier='//*[@href="' + str(page_curr) + '"]')
    # Shut down the tagui session.
    t.close()
    # Output file is named "<date>.csv".
    today_data = pd.DataFrame(value_dict)
    today_data.to_csv(str_to_append + ".csv", index=False, encoding='UTF-8')
    return count - 1
#当下一页没有被disable的时候,有以下超参数 page_curr = 1 #当前页面index value_dict = {} #存放data #存放列名 name_list = ['序号', '产品名称', '发行银行', '委托货币', '发行日', '停售日', '管理期(天)', '预期收益率', '到期收益率', '与同期储蓄比', '综合评级','url'] for col_name in name_list: value_dict.setdefault(col_name, []) #初始化空数据集 #当可以翻页,或数据只有一页的时候,进行循环 stop_flag = False #当当前页面不是最后一页,或只有一页时,都进行如下循环 while (t.read(element_identifier='//div[@id = "pagefoot"]//a[@class = "cur pf-disabled"]') == str(page_curr)) or (page_curr == 1): if stop_flag == True: #如果没有今年的数据,就没必要翻页了 break #每页的数据量大小(row number) count_values = int(t.count(element_identifier='//tbody[@id = "content"]//tr')) + 1 # python从0开始 #爬取当前页面 for i in range(1, count_values): # 判定条件:如果是今年内(小于今年12-31或等于12-31的),全都要 if str(t.read( element_identifier='//tbody[@id = "content"]//tr[' + str(i) + ']//td[@class = "px"]')) <= date_end: # 序号 value_dict[name_list[0]].append( t.read(element_identifier='//tbody[@id = "content"]//tr[' + str(i) + ']/td[2]')) # 产品名称
def getExpFlightPrice(airline, dep_ref, dur_ref):
    """Find the Expedia package price for a flight itinerary.

    Filters the Expedia result list by airline, then matches each leg by
    departure time (and duration for the return/multi-leg cases), selects
    the matching flight(s), and reads the package total from the final page.

    :param airline: list of airline codes, one per leg (used in checkbox ids)
    :param dep_ref: list of reference departure times, one per leg;
                    mutated in place (leading '0' stripped)
    :param dur_ref: list of reference durations, one per leg; mutated in
                    place (normalised to "Xh Ym" form)
    :return: (price, url) on success, (0, '') when no match is found.
             Falls through (returns None implicitly) if the result loop
             exhausts without a decision — kept from the original.
    """
    print(airline)
    print(dep_ref)
    print(dur_ref)
    util.wait_for_pageload('//input[@classes="filter-checkbox"]')
    t.wait(3)
    t.click(f'//a[@data-content-id="airlineToggleContainer"]')
    # Tick the airline filter checkbox for each distinct consecutive airline.
    for i in range(len(dep_ref)):
        if i == 0:
            if t.present(f'//input[@id="airlineRowContainer_{airline[i]}"]'):
                t.wait(3)
                t.click(f'//input[@id="airlineRowContainer_{airline[i]}"]')
            else:
                print('Not match')
                return 0, ''
        elif airline[i] != airline[i - 1]:
            if t.present(f'//input[@id="airlineRowContainer_{airline[i]}"]'):
                t.wait(1)
                t.click(f'//input[@id="airlineRowContainer_{airline[i]}"]')
            else:
                print('Not match')
                return 0, ''
        # Normalise the reference values to Expedia's display format:
        # drop a leading zero from times, and force durations to "Xh Ym".
        if dep_ref[i][0] == '0':
            dep_ref[i] = dep_ref[i][1:]
        if dur_ref[i][-1:] == 'h':
            dur_ref[i] = dur_ref[i] + ' 0m'
        else:
            dur_ref[i] = dur_ref[i] + 'm'
    print(airline)
    print(dep_ref)
    print(dur_ref)
    util.wait_for_pageload('//button[@data-test-id="select-button"]')
    t.wait(5)
    # Walk the filtered result list looking for the matching itinerary.
    for i in range(t.count(f'//ul[@id="flightModuleList"]//li')):
        i = i + 1  # XPath positions are 1-based
        print(i)
        dep = t.read(f'(//span[@class="medium-bold"]//span[@data-test-id="departure-time"])[{i}]')
        if len(dur_ref) == 1:
            # One-way trip: match the single departure time.
            if dep == dep_ref[0]:
                print('dep OK')
                dur = t.read(f'(//span[@data-test-id="duration"])[{i}]')
                t.click(f'(//button[@data-test-id="select-button"])[{i}]')
                t.wait(5)
                if t.present('//a[@id="forcedChoiceNoThanks"]'):
                    # Decline the upsell interstitial.
                    t.click(f'//a[@id="forcedChoiceNoThanks"]')
                    t.wait(5)
                # Poll up to 5 times for the flight-information popup.
                for x in range(5):
                    print(x)
                    if t.popup('Flight-Information?'):
                        break
                    else:
                        t.wait(5)
                price = t.read(f'(//span[@class="packagePriceTotal"])[2]')
                price = float(price.replace(',', '').replace('SG', '').replace('$', '').replace(' ', ''))
                print(price)
                url = t.url()
                return price, url
            else:
                return 0, ''
        elif len(dur_ref) == 2:
            # Round trip: match the outbound leg, then scan return legs.
            print('trip', len(dur_ref))
            if dep == dep_ref[0]:
                print('dep OK')
                dur = t.read(f'(//span[@data-test-id="duration"])[{i}]')
                t.click(f'(//button[@data-test-id="select-button"])[{i}]')
                t.wait(5)
                util.wait_for_pageload('//button[@data-test-id="select-button"]')
                t.click(f'//input[@id="airlineRowContainer_{airline[1]}"]')
                t.wait(2)
                for j in range(t.count(f'//ul[@id="flightModuleList"]//li')):
                    j = j + 1
                    print(j)
                    # NOTE(review): "[{j}+1]" is evaluated as XPath
                    # arithmetic (position j+1), skipping the first entry —
                    # presumably the already-selected outbound; confirm.
                    dep = t.read(f'(//span[@data-test-id="departure-time"])[{j}+1]')
                    if dep == dep_ref[1]:
                        print('return dep ok')
                        dur = t.read(f'(//span[@data-test-id="duration"])[{j}+1]')
                        if dur == dur_ref[1]:
                            t.click(f'(//button[@data-test-id="select-button"])[{j}]')
                            t.wait(5)
                            if t.present('//a[@id="forcedChoiceNoThanks"]'):
                                t.click(f'//a[@id="forcedChoiceNoThanks"]')
                                t.wait(5)
                            for x in range(5):
                                print(x)
                                if t.popup('Flight-Information?'):
                                    break
                                else:
                                    t.wait(5)
                            util.wait_for_pageload('//h1[@class="section-header-main"]')
                            price = t.read(f'(//span[@class="packagePriceTotal"])[2]')
                            price = float(price.replace(',', '').replace('SG', '').replace('$', '').replace(' ', ''))
                            print(price)
                            url = t.url()
                            print(url)
                            return price, url
            else:
                return 0, ''
        elif len(dur_ref) >= 3:
            # Multi-city trip: entry i shows all legs; compare whole lists.
            dep_lst = []
            dur_lst = []
            print('multi-trip ', len(dur_ref))
            for k in range(len(dur_ref)):
                dep_lst.append(t.read(f'(//span[@data-test-id="departure-time"])[{3*i+k+1}]'))
                dur_lst.append(t.read(f'(//span[@data-test-id="duration"])[{3*i+k+1}]'))
            print(dep_lst)
            print(dep_ref)
            if dep_lst == dep_ref:
                print(dur_lst)
                print(dur_ref)
                if dur_lst == dur_ref:
                    # BUGFIX: the original clicked index {j}, but j is only
                    # bound in the round-trip branch above — reaching this
                    # line raised NameError. The intended index is the
                    # current result entry i.
                    t.click(f'(//button[@data-test-id="select-button"])[{i}]')
                    t.wait(5)
                    if t.present('//a[@id="forcedChoiceNoThanks"]'):
                        t.click(f'//a[@id="forcedChoiceNoThanks"]')
                        t.wait(5)
                    for x in range(5):
                        print(x)
                        if t.popup('Flight-Information?'):
                            break
                        else:
                            t.wait(5)
                    price = t.read(f'(//span[@class="packagePriceTotal"])[2]')
                    price = float(price.replace(',', '').replace('SG', '').replace('$', '').replace(' ', ''))
                    print(price)
                    url = t.url()
                    print(url)
                    return price, url
            else:
                return 0, ''
# to use in Jupyter notebook, Python script or interactive shell import tagui as t # use init() to start TagUI, it autoruns setup() to download TagUI # default init(visual_automation = False, chrome_browser = True) t.init() # use url('your_url') to go to web page, url() returns current URL t.url('https://ca.yahoo.com') # use type() to enter text into an UI element or x, y location # '[enter]' = enter key, '[clear]' = clear field t.type('search-box', 'github') # use read() to fetch and return text from UI element search_text = t.read('search-box') # use echo() to print to output, same as Python print() t.echo(search_text) # use click() to click on an UI element or x, y location # rclick() = right-click, dclick() = double-click t.click('search-button') # use wait() to wait for a number of seconds # default wait() is 5 seconds t.wait(6.6) # use snap() to save screenshot of page or UI element # page = web page, page.png = computer screen t.snap('page', 'results.png')
#当下一页没有被disable的时候,有以下超参数 page_curr = 1 #当前页面index value_dict = {} #存放data #存放列名 name_list = [ '序号', '产品名称', '发行银行', '委托货币', '发行日', '停售日', '管理期(天)', '预期收益率', '到期收益率', '与同期储蓄比', '综合评级', 'url' ] for col_name in name_list: value_dict.setdefault(col_name, []) #初始化空数据集 #当可以翻页,或数据只有一页的时候,进行循环 while t.read(element_identifier='//*[@class = "pf-disabled"][last()]' ) != '下一页>>' or (page_curr == 1): #每页的数据量大小(row number) count_values = int( t.count(element_identifier='//tbody[@id = "content"]//tr') ) + 1 # python从0开始 #print输出完成 print("count value for this page finishes..") t.wait(1) for i in range(1, count_values): # 判定条件:如果是今天刚发行的,拿到所有主页面上的数据, if str( t.read(element_identifier='//tbody[@id = "content"]//tr[' + str(i) + ']//td[@class = "px"]')) == str_to_append: #序号 value_dict[name_list[0]].append(