def putian_query(day_putian):
    """Fetch the 'jiakuandahuizhan' report for the given day and return the
    values of the third table column as a list of strings."""
    startTime = day_putian.strftime('%Y-%m-%d')
    endTime = (day_putian + datetime.timedelta(days=1)).strftime('%Y-%m-%d')  # only used by the commented-out day-range query below
    # http://10.221.17.131:9091/report/bizman/common/result.jsp?timename=jiakuandahuizhan
    # http://10.221.17.131:9091/report/bizman/common/report.jsp?timename=jiakuandahuizhan&reportType=&cac=5614146&iam=15614135&timename=jiakuandahuizhan&change=true&sid=null&reportFileName=1552455614217&iam=15614135&page=null&pageSizeCus=null&timetype=day&datefromto=2019-04-03~2019-04-04&bar=true
    # url = 'http://10.221.17.131:9091/report/bizman/common/report.jsp?timename=jiakuandahuizhan&reportType=&cac=56141' \
    #       '46&iam=15614135&timename=jiakuandahuizhan&change=true&sid=null&reportFileName=1552455614217&iam=15614135&' \
    #       'page=null&pageSizeCus=null&timetype=day&datefromto={}~{}&bar=true'.format(startTime, endTime)
    # url = 'http://10.221.17.131:9091/report/bizman/common/report.jsp?timename=jiakuandahuizhan&reportType=&cac=2762197&iam=12675442&timename=jiakuandahuizhan&change=true&sid=null&reportFileName=1554792716199&u=r&page=null&pageSizeCus=null&timetype=customday&datefromto=2019-04-02~2019-04-02'
    url = 'http://10.221.17.131:9091/report/bizman/common/report.jsp?timename=jiakuandahuizhan&reportType=&cac=2762197&iam=12675442&timename=jiakuandahuizhan&change=true&sid=null&reportFileName=1554792716199&u=r&page=null&pageSizeCus=null&timetype=customday&datefromto={}~{}'.format(
        startTime, startTime)
    print(url)
    f = ww.get_web_page(url)
    soup = BeautifulSoup(f, "html.parser")
    # print(soup.prettify())  # this really is beautiful
    # print(soup.find(attrs={'id': 'jiakuandahuizhan'}).prettify())
    res = list()
    # one value per table row: the <span> inside the third <td>
    for table_rows in soup.find(attrs={'id': 'jiakuandahuizhan'}).find_all('tr'):
        res.append(table_rows.find_all('td')[2].find('span').get_text())
    return res
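# --- Illustrative usage sketch (not part of the original script) ---
# Assuming the module-level imports below are in scope, `putian_query` takes a
# datetime.date / datetime.datetime for the report day and returns the
# third-column values of the 'jiakuandahuizhan' table as a list of strings.
# The choice of "yesterday" is only an example:
#
#     yesterday = datetime.datetime.now() - datetime.timedelta(days=1)
#     for value in putian_query(yesterday):
#         print(value)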
will return the HTML; extract the values from it
"""
import web.webCrawler.webcrawler as ww
import json
import xlrd
import xlwt
import datetime
from xlutils.copy import copy
import time
from bs4 import BeautifulSoup

# earlier query URLs kept for reference; only the last assignment is used
url = 'http://10.221.17.131:9091/report/bizman/common/report.jsp?timename=jiakuandahuizhan&reportType=&cac=5614146&iam=15614135&timename=jiakuandahuizhan&change=true&sid=null&reportFileName=1552455614217&iam=15614135&page=null&pageSizeCus=null&timetype=day&datefromto=2019-03-11~2019-03-12&bar=true'
url = 'http://10.221.17.131:9091/report/bizman/common/report.jsp?timename=jiakuandahuizhan&reportType=&cac=5614146&iam=15614135&timename=jiakuandahuizhan&change=true&sid=null&reportFileName=1552455614217&iam=15614135&page=null&pageSizeCus=null&timetype=day&datefromto=2019-03-10~2019-03-11&bar=true'
url = 'http://10.221.17.131:9091/report/bizman/common/report.jsp?timename=jiakuandahuizhan&reportType=&cac=2762197&iam=12675442&timename=jiakuandahuizhan&change=true&sid=null&reportFileName=1554792716199&u=r&page=null&pageSizeCus=null&timetype=customday&datefromto=2019-04-02~2019-04-02'

f = ww.get_web_page(url)
soup = BeautifulSoup(f, "html.parser")
# print(soup.prettify())  # this really is beautiful
# print(soup.find(attrs={'id': 'jiakuandahuizhan'}).prettify())
for item in soup.find(attrs={'id': 'jiakuandahuizhan'}).find_all('tr'):
    print(item.find_all('td')[2].find('span').get_text())
exit()  # execution stops here; the per-field extraction below is not reached

avg_1st_screen_delay_web = soup.find(attrs={'id': 'td_jiakuandahuizhan_2_3'}).find(attrs={'style': 'cursor:text;'}).get_text()
key_local_web_access_delay = soup.find(attrs={'id': 'td_jiakuandahuizhan_3_3'}).find(attrs={'style': 'cursor:text;'}).get_text()
video_buffering_ratio_home_broadband = soup.find(attrs={'id': 'td_jiakuandahuizhan_4_3'}).find(attrs={'style': 'cursor:text;'}).get_text()
game_ping_home_broadband = soup.find(attrs={'id': 'td_jiakuandahuizhan_5_3'}).find(attrs={'style': 'cursor:text;'}).get_text()
game_packet_loss_rate_home_broadband = soup.find(attrs={'id': 'td_jiakuandahuizhan_6_3'}).find(attrs={'style': 'cursor:text;'}).get_text()
top_20 = soup.find(attrs={'id': 'td_jiakuandahuizhan_7_3'}).find(attrs={'style': 'cursor:text;'}).get_text()
migu_video_lag_count = soup.find(attrs={'id': 'td_jiakuandahuizhan_8_3'}).find(attrs={'style': 'cursor:text;'}).get_text()
migu_music_delay = soup.find(attrs={'id': 'td_jiakuandahuizhan_9_3'}).find(attrs={'style': 'cursor:text;'}).get_text()
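# --- Hedged sketch: writing the extracted metrics to an .xls file ---
# The xlwt / xlutils imports above suggest these values end up in an Excel
# report; the original output code is not shown here, so the file name
# 'jiakuandahuizhan.xls', the sheet name and the column order below are
# assumptions, not the script's actual behaviour:
#
#     wb = xlwt.Workbook()
#     ws = wb.add_sheet('report')
#     row = [avg_1st_screen_delay_web, key_local_web_access_delay,
#            video_buffering_ratio_home_broadband, game_ping_home_broadband,
#            game_packet_loss_rate_home_broadband, top_20,
#            migu_video_lag_count, migu_music_delay]
#     for col, value in enumerate(row):
#         ws.write(0, col, value)
#     wb.save('jiakuandahuizhan.xls')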
print('总页数:', page) for p in range(page): print('第{}页:'.format(p + 1)) '''各分页url''' url = 'http://10.221.246.108/eoms35/sheet/commontask/commontask.do?titleStringExpression=like&sendRoleIdStringExp' \ 'ression=in&sheetIdStringExpression=like&main.sendTime=&operateDeptIdStringExpression=in&sendDeptIdStringEx' \ 'pression=in&operateUserIdStringExpression=in&sendUserIdStringExpression=in&queryType=record&main.mainNetSo' \ 'rt1=&main.mainNetSort2=&main.sheetId=&main.mainNetSort3=&statusChoiceExpression=&sendTimeEndDateExpression' \ '=%3C%3D&main.sendRoleId=&d-4025513-p=' + str(p+1) \ + '&sendTimeStartDateExpression=%3E%3D&sendTimeStartDate=' + start_time + '+00%3A00%3A00&main.sendDeptId=&main.send' \ 'UserId=&main.status=&mainNetSort1ChoiceExpression=&sendTimeLogicExpression=and&method=performQuery&link.' \ 'operateRoleId=&main.title=%E5%85%B3%E4%BA%8E%E9%85%8D%E5%90%88%E5%B8%82%E5%9C%BA%E9%83%A8%E5%88%B6&metho' \ 'd.save=%E6%8F%90%E4%BA%A4&link.operateDeptId=&sendTimeEndDate=' + end_time_url + '&link.operateUserId' \ '=&task.taskName=&operateRoleIdStringExpression=in' f = ww.get_web_page(url, cookie) selector = etree.HTML(f) # 获取 当前页全部工单号 content_tasks = selector.xpath( '/html/body/div[1]/div/div/table/tbody/tr/td[2]/a/text()') print(content_tasks) for t, item in enumerate(content_tasks): task_sequence = item # 获取 a标签 content = selector.xpath('/html/body/div[1]/div/div/table/tbody/tr[' + str(t + 1) + ']/td[2]/a/@href') address = content[0] # print(address) # 访问详情页 只是去取excel的名字 url = 'http://10.221.246.108/eoms35/sheet/commontask/' + address f_detail = ww.get_web_page(url, cookie)
# print(num_complaints)
if num_complaints > 0:
    tmp_csv_content = list()
    for i, table_rows in enumerate(soup.find('tbody').find_all('tr')):
        table_colums = table_rows.find_all('td')[0]  # first cell of the row, holds the sheet link
        sheet_number = table_colums.find('a').get_text().strip()
        print(sheet_number)
        address = table_colums.find('a').get('href')
        tmp_csv_content.append([table_colums.find('a').get_text().strip()])
        if len(sheet_number) > 19:
            break
        # example detail URL:
        # 'http://10.221.246.104/eoms35/sheet/complaint/complaint.do?method=showMainDetailPage&sheetKey=8a5d76eb6b09603c016b465b60442261'
        url = 'http://10.221.246.104/eoms35/sheet/complaint/' + address
        f = ww.get_web_page(url, cookie)
        # print(f)
        # NOTE: `soup` is rebound to the detail page here; the outer loop has
        # already captured its row list, so the iteration is unaffected.
        soup = BeautifulSoup(f, "html.parser")
        title = soup.find_all('table')[0].find_all('tr')[1].find_all(
            'td')[1].get_text().strip()
        print(title)
        tmp_csv_content[i].append(title)
        content = soup.find_all('table')[2].find_all('tr')[13].find_all(
            'td')[1].get_text().strip()
        print(content)
        tmp_csv_content[i].append(content)
        note = soup.find_all('table')[2].find_all('tr')[15].find_all(
            'td')[1].get_text().strip()
        print(note)
        tmp_csv_content[i].append(note)
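# --- Hedged sketch: dumping tmp_csv_content to disk ---
# The name tmp_csv_content suggests the collected
# [sheet_number, title, content, note] rows are written to a CSV file later;
# that code is not shown here, so the file name 'complaints.csv' and the
# header row below are assumptions:
#
#     import csv
#     with open('complaints.csv', 'w', newline='', encoding='utf-8-sig') as fp:
#         writer = csv.writer(fp)
#         writer.writerow(['sheet_number', 'title', 'content', 'note'])
#         writer.writerows(tmp_csv_content)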
for p in range(page):
    print('Page {}:'.format(p+1))
    '''URL of each result page'''
    url = 'http://10.221.246.104/eoms35/sheet/complaint/complaint.do?titleStringExpression=like&sendRoleIdStringExpre' \
          'ssion=in&main.parentCorrelation=&sheetIdStringExpression=like&complaintType1ChoiceExpression=&main.sendTim' \
          'e=&operateDeptIdStringExpression=in&main.complaintType=&sendDeptIdStringExpression=in&operateUserIdStringE' \
          'xpression=in&sendUserIdStringExpression=in&parentCorrelationStringExpression=like&main.complaintType1=&que' \
          'ryType=record&main.complaintType2=&main.complaintType4=&main.sheetId=&main.complaintType5=&customPhoneStri' \
          'ngExpression=like&main.complaintType6=&main.complaintType7=&showArea=&statusChoiceExpression=&sendTimeEndD' \
          'ateExpression=%3C%3D&main.sendRoleId=&main.toDeptId=&d-4025513-p=' + str(p+1) + '&sendTimeStartDateExpress' \
          'ion=%3E%3D&sendTimeStartDate=2018-10-01+00%3A00%3A00&main.sendDeptId=&main.sendUserId=&toDeptIdStringExpre' \
          'ssion=in&main.s' \
          'tatus=&main.customPhone=&sendTimeLogicExpression=and&link.operateRoleId=&method=performQuery&main.title=&m' \
          'ethod.save=%E6%8F%90%E4%BA%A4&link.operateDeptId=&sendTimeEndDate=2018-11-13+23%3A32%3A22&link.operateUser' \
          'Id=&task.taskName=&operateRoleIdStringExpression=in'
    f = ww.get_web_page(url, cookie)
    # print(f)
    selector = etree.HTML(f)
    # collect every work-order number on the current page
    content_tasks = selector.xpath('/html/body/div/div/div/table/tbody/tr/td[1]/a/text()')
    print(content_tasks)
    for t, item in enumerate(content_tasks):
        task_sequence = item
        # get the href of the <a> tag for this row
        content = selector.xpath('/html/body/div/div/div/table/tbody/tr[' + str(t+1) + ']/td[1]/a/@href')
        address = content[0]
        # print(address)
        url = 'http://10.221.246.104/eoms35/sheet/complaint/' + address
        f_detail = ww.get_web_page(url, cookie)