def putian_query(day_putian):
    """Fetch the 'jiakuandahuizhan' report for the given day and return the
    values of the third table column as a list of strings."""
    startTime = day_putian.strftime('%Y-%m-%d')
    endTime = (day_putian + datetime.timedelta(days=1)).strftime('%Y-%m-%d')  # only used by the commented-out day-range query below
    # http://10.221.17.131:9091/report/bizman/common/result.jsp?timename=jiakuandahuizhan
    # http://10.221.17.131:9091/report/bizman/common/report.jsp?timename=jiakuandahuizhan&reportType=&cac=5614146&iam=15614135&timename=jiakuandahuizhan&change=true&sid=null&reportFileName=1552455614217&iam=15614135&page=null&pageSizeCus=null&timetype=day&datefromto=2019-04-03~2019-04-04&bar=true
    # url = 'http://10.221.17.131:9091/report/bizman/common/report.jsp?timename=jiakuandahuizhan&reportType=&cac=56141' \
    #       '46&iam=15614135&timename=jiakuandahuizhan&change=true&sid=null&reportFileName=1552455614217&iam=15614135&' \
    #       'page=null&pageSizeCus=null&timetype=day&datefromto={}~{}&bar=true'.format(startTime, endTime)
    # url = 'http://10.221.17.131:9091/report/bizman/common/report.jsp?timename=jiakuandahuizhan&reportType=&cac=2762197&iam=12675442&timename=jiakuandahuizhan&change=true&sid=null&reportFileName=1554792716199&u=r&page=null&pageSizeCus=null&timetype=customday&datefromto=2019-04-02~2019-04-02'
    url = 'http://10.221.17.131:9091/report/bizman/common/report.jsp?timename=jiakuandahuizhan&reportType=&cac=2762197&iam=12675442&timename=jiakuandahuizhan&change=true&sid=null&reportFileName=1554792716199&u=r&page=null&pageSizeCus=null&timetype=customday&datefromto={}~{}'.format(
        startTime, startTime)
    print(url)
    f = ww.get_web_page(url)
    soup = BeautifulSoup(f, "html.parser")
    # print(soup.prettify())  # this really is beautiful
    # print(soup.find(attrs={'id': 'jiakuandahuizhan'}).prettify())
    res = list()
    # one value per table row: the <span> inside the third <td>
    for table_rows in soup.find(attrs={'id': 'jiakuandahuizhan'}).find_all('tr'):
        res.append(table_rows.find_all('td')[2].find('span').get_text())
    return res
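# --- Illustrative usage sketch (not part of the original script) ---
# Assuming the module-level imports below are in scope, `putian_query` takes a
# datetime.date / datetime.datetime for the report day and returns the
# third-column values of the 'jiakuandahuizhan' table as a list of strings.
# The choice of "yesterday" is only an example:
#
#     yesterday = datetime.datetime.now() - datetime.timedelta(days=1)
#     for value in putian_query(yesterday):
#         print(value)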
will return the HTML; extract the values from it
"""
import web.webCrawler.webcrawler as ww
import json
import xlrd
import xlwt
import datetime
from xlutils.copy import copy
import time
from bs4 import BeautifulSoup

# earlier query URLs kept for reference; only the last assignment is used
url = 'http://10.221.17.131:9091/report/bizman/common/report.jsp?timename=jiakuandahuizhan&reportType=&cac=5614146&iam=15614135&timename=jiakuandahuizhan&change=true&sid=null&reportFileName=1552455614217&iam=15614135&page=null&pageSizeCus=null&timetype=day&datefromto=2019-03-11~2019-03-12&bar=true'
url = 'http://10.221.17.131:9091/report/bizman/common/report.jsp?timename=jiakuandahuizhan&reportType=&cac=5614146&iam=15614135&timename=jiakuandahuizhan&change=true&sid=null&reportFileName=1552455614217&iam=15614135&page=null&pageSizeCus=null&timetype=day&datefromto=2019-03-10~2019-03-11&bar=true'
url = 'http://10.221.17.131:9091/report/bizman/common/report.jsp?timename=jiakuandahuizhan&reportType=&cac=2762197&iam=12675442&timename=jiakuandahuizhan&change=true&sid=null&reportFileName=1554792716199&u=r&page=null&pageSizeCus=null&timetype=customday&datefromto=2019-04-02~2019-04-02'

f = ww.get_web_page(url)
soup = BeautifulSoup(f, "html.parser")
# print(soup.prettify())  # this really is beautiful
# print(soup.find(attrs={'id': 'jiakuandahuizhan'}).prettify())
for item in soup.find(attrs={'id': 'jiakuandahuizhan'}).find_all('tr'):
    print(item.find_all('td')[2].find('span').get_text())
exit()  # execution stops here; the per-field extraction below is not reached

avg_1st_screen_delay_web = soup.find(attrs={'id': 'td_jiakuandahuizhan_2_3'}).find(attrs={'style': 'cursor:text;'}).get_text()
key_local_web_access_delay = soup.find(attrs={'id': 'td_jiakuandahuizhan_3_3'}).find(attrs={'style': 'cursor:text;'}).get_text()
video_buffering_ratio_home_broadband = soup.find(attrs={'id': 'td_jiakuandahuizhan_4_3'}).find(attrs={'style': 'cursor:text;'}).get_text()
game_ping_home_broadband = soup.find(attrs={'id': 'td_jiakuandahuizhan_5_3'}).find(attrs={'style': 'cursor:text;'}).get_text()
game_packet_loss_rate_home_broadband = soup.find(attrs={'id': 'td_jiakuandahuizhan_6_3'}).find(attrs={'style': 'cursor:text;'}).get_text()
top_20 = soup.find(attrs={'id': 'td_jiakuandahuizhan_7_3'}).find(attrs={'style': 'cursor:text;'}).get_text()
migu_video_lag_count = soup.find(attrs={'id': 'td_jiakuandahuizhan_8_3'}).find(attrs={'style': 'cursor:text;'}).get_text()
migu_music_delay = soup.find(attrs={'id': 'td_jiakuandahuizhan_9_3'}).find(attrs={'style': 'cursor:text;'}).get_text()
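# --- Hedged sketch: writing the extracted metrics to an .xls file ---
# The xlwt / xlutils imports above suggest these values end up in an Excel
# report; the original output code is not shown here, so the file name
# 'jiakuandahuizhan.xls', the sheet name and the column order below are
# assumptions, not the script's actual behaviour:
#
#     wb = xlwt.Workbook()
#     ws = wb.add_sheet('report')
#     row = [avg_1st_screen_delay_web, key_local_web_access_delay,
#            video_buffering_ratio_home_broadband, game_ping_home_broadband,
#            game_packet_loss_rate_home_broadband, top_20,
#            migu_video_lag_count, migu_music_delay]
#     for col, value in enumerate(row):
#         ws.write(0, col, value)
#     wb.save('jiakuandahuizhan.xls')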
print('总页数:', page) for p in range(page): print('第{}页:'.format(p + 1)) '''各分页url''' url = 'http://10.221.246.108/eoms35/sheet/commontask/commontask.do?titleStringExpression=like&sendRoleIdStringExp' \ 'ression=in&sheetIdStringExpression=like&main.sendTime=&operateDeptIdStringExpression=in&sendDeptIdStringEx' \ 'pression=in&operateUserIdStringExpression=in&sendUserIdStringExpression=in&queryType=record&main.mainNetSo' \ 'rt1=&main.mainNetSort2=&main.sheetId=&main.mainNetSort3=&statusChoiceExpression=&sendTimeEndDateExpression' \ '=%3C%3D&main.sendRoleId=&d-4025513-p=' + str(p+1) \ + '&sendTimeStartDateExpression=%3E%3D&sendTimeStartDate=' + start_time + '+00%3A00%3A00&main.sendDeptId=&main.send' \ 'UserId=&main.status=&mainNetSort1ChoiceExpression=&sendTimeLogicExpression=and&method=performQuery&link.' \ 'operateRoleId=&main.title=%E5%85%B3%E4%BA%8E%E9%85%8D%E5%90%88%E5%B8%82%E5%9C%BA%E9%83%A8%E5%88%B6&metho' \ 'd.save=%E6%8F%90%E4%BA%A4&link.operateDeptId=&sendTimeEndDate=' + end_time_url + '&link.operateUserId' \ '=&task.taskName=&operateRoleIdStringExpression=in' f = ww.get_web_page(url, cookie) selector = etree.HTML(f) # 获取 当前页全部工单号 content_tasks = selector.xpath( '/html/body/div[1]/div/div/table/tbody/tr/td[2]/a/text()') print(content_tasks) for t, item in enumerate(content_tasks): task_sequence = item # 获取 a标签 content = selector.xpath('/html/body/div[1]/div/div/table/tbody/tr[' + str(t + 1) + ']/td[2]/a/@href') address = content[0] # print(address) # 访问详情页 只是去取excel的名字 url = 'http://10.221.246.108/eoms35/sheet/commontask/' + address f_detail = ww.get_web_page(url, cookie)
# print(num_complaints)
if num_complaints > 0:
    tmp_csv_content = list()
    for i, table_rows in enumerate(soup.find('tbody').find_all('tr')):
        table_colums = table_rows.find_all('td')[0]  # first cell of the row, holds the sheet link
        sheet_number = table_colums.find('a').get_text().strip()
        print(sheet_number)
        address = table_colums.find('a').get('href')
        tmp_csv_content.append([table_colums.find('a').get_text().strip()])
        if len(sheet_number) > 19:
            break
        # example detail URL:
        # 'http://10.221.246.104/eoms35/sheet/complaint/complaint.do?method=showMainDetailPage&sheetKey=8a5d76eb6b09603c016b465b60442261'
        url = 'http://10.221.246.104/eoms35/sheet/complaint/' + address
        f = ww.get_web_page(url, cookie)
        # print(f)
        # NOTE: `soup` is rebound to the detail page here; the outer loop has
        # already captured its row list, so the iteration is unaffected.
        soup = BeautifulSoup(f, "html.parser")
        title = soup.find_all('table')[0].find_all('tr')[1].find_all(
            'td')[1].get_text().strip()
        print(title)
        tmp_csv_content[i].append(title)
        content = soup.find_all('table')[2].find_all('tr')[13].find_all(
            'td')[1].get_text().strip()
        print(content)
        tmp_csv_content[i].append(content)
        note = soup.find_all('table')[2].find_all('tr')[15].find_all(
            'td')[1].get_text().strip()
        print(note)
        tmp_csv_content[i].append(note)
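# --- Hedged sketch: dumping tmp_csv_content to disk ---
# The name tmp_csv_content suggests the collected
# [sheet_number, title, content, note] rows are written to a CSV file later;
# that code is not shown here, so the file name 'complaints.csv' and the
# header row below are assumptions:
#
#     import csv
#     with open('complaints.csv', 'w', newline='', encoding='utf-8-sig') as fp:
#         writer = csv.writer(fp)
#         writer.writerow(['sheet_number', 'title', 'content', 'note'])
#         writer.writerows(tmp_csv_content)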
for p in range(page):
    print('Page {}:'.format(p+1))
    '''URL of each result page'''
    url = 'http://10.221.246.104/eoms35/sheet/complaint/complaint.do?titleStringExpression=like&sendRoleIdStringExpre' \
          'ssion=in&main.parentCorrelation=&sheetIdStringExpression=like&complaintType1ChoiceExpression=&main.sendTim' \
          'e=&operateDeptIdStringExpression=in&main.complaintType=&sendDeptIdStringExpression=in&operateUserIdStringE' \
          'xpression=in&sendUserIdStringExpression=in&parentCorrelationStringExpression=like&main.complaintType1=&que' \
          'ryType=record&main.complaintType2=&main.complaintType4=&main.sheetId=&main.complaintType5=&customPhoneStri' \
          'ngExpression=like&main.complaintType6=&main.complaintType7=&showArea=&statusChoiceExpression=&sendTimeEndD' \
          'ateExpression=%3C%3D&main.sendRoleId=&main.toDeptId=&d-4025513-p=' + str(p+1) + '&sendTimeStartDateExpress' \
          'ion=%3E%3D&sendTimeStartDate=2018-10-01+00%3A00%3A00&main.sendDeptId=&main.sendUserId=&toDeptIdStringExpre' \
          'ssion=in&main.s' \
          'tatus=&main.customPhone=&sendTimeLogicExpression=and&link.operateRoleId=&method=performQuery&main.title=&m' \
          'ethod.save=%E6%8F%90%E4%BA%A4&link.operateDeptId=&sendTimeEndDate=2018-11-13+23%3A32%3A22&link.operateUser' \
          'Id=&task.taskName=&operateRoleIdStringExpression=in'
    f = ww.get_web_page(url, cookie)
    # print(f)
    selector = etree.HTML(f)
    # collect every work-order number on the current page
    content_tasks = selector.xpath('/html/body/div/div/div/table/tbody/tr/td[1]/a/text()')
    print(content_tasks)
    for t, item in enumerate(content_tasks):
        task_sequence = item
        # get the href of the <a> tag for this row
        content = selector.xpath('/html/body/div/div/div/table/tbody/tr[' + str(t+1) + ']/td[1]/a/@href')
        address = content[0]
        # print(address)
        url = 'http://10.221.246.104/eoms35/sheet/complaint/' + address
        f_detail = ww.get_web_page(url, cookie)