Exemple #1
0
def sitedate(url):
    bru = 'http://www.link114.cn/get.php?baiduprzz&%s&71842' % url.strip()
    slu = 'http://www.link114.cn/get.php?baidu&%s&71152' % url.strip()

    headers = {
        'Host':
        'www.link114.cn',
        'Connection':
        'keep-alive',
        'Cache-Control':
        'max-age=0',
        'Upgrade-Insecure-Requests':
        '1',
        'User-Agent':
        useragent.pcualist(),
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding':
        'gzip, deflate, sdch',
        'Accept-Language':
        'zh-CN,zh;q=0.8',
        'Cookie':
        'bdshare_firstime=1488939027441 AlexaToolbar-ALX_NS_PH: AlexaToolbar/alx-4.0'
    }
    data = None
    try:
        request = urllib2.Request(bru, data, headers)
        sock1 = urllib2.urlopen(request)
        brt = sock1.read()
    except Exception, e:
        brt = 'broken'
Exemple #2
0
def getua():

    User_Agent = pcualist()

    headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie':
        'freshGuide=1; BIDUPSID=7AD6B1F0076AD0D5E2810FF36119FB56; __cfduid=dbd033b635bd1268fdaefa87e2d34c1261470191700; PSTM=1472281639; BAIDUID=F44B09D83A8074889DE20E167B6F4EBB:FG=1; MCITY=-153%3A268%3A; SIGNIN_UC=70a2711cf1d3d9b1a82d2f87d633bd8a02234882544; H_PS_PSSID=1468_20792_18240_21125_17001_20856_20733_20837_20885',
        'Host': 'baike.baidu.com',
        'Referer': 'http://baike.baidu.com/',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': User_Agent
    }
    return headers
Exemple #3
0
def getHTml(url):

    host = search('^([^/]*?)/', re.sub(r'(https|http)://', '', url))
    headers = {
        "Accept":
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        #"Cookie":"__cfduid=df26a7c536a0301ccf36481a14f53b4a81469608715; BIDUPSID=E9B0B6A35D4ABC6ED4891FCC0FD085BD; PSTM=1474352745; lsv=globalTjs_97273d6-wwwTcss_8eba1c3-routejs_6ede3cf-activityControllerjs_b6f8c66-wwwBcss_eabc62a-framejs_902a6d8-globalBjs_2d41ef9-sugjs_97bfd68-wwwjs_8d1160b; MSA_WH=1433_772; BAIDUID=E9B0B6A35D4ABC6ED4891FCC0FD085BD:FG=1; plus_cv=1::m:2a9fb36a; H_WISE_SIDS=107504_106305_100040_100100_109550_104341_107937_108437_109700_109794_107961_108453_109737_109558_109506_110022_107895_107917_109683_109588_110072_107318_107300_107242_100457; BDUSS=XNNMTJlWEdDdzFPdU1nSzVEZ1REYn4tNWNwZk94NVducXpaaThjWjE4bU1TQXRZQVFBQUFBJCQAAAAAAAAAAAEAAADLTBsKYTYzMTM4MTcwMgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIy741eMu-NXQ; BDRCVFR[ltbVPlNi2ac]=mk3SLVN4HKm; BDRCVFR[C0p6oIjvx-c]=mbxnW11j9Dfmh7GuZR8mvqV; BDRCVFR[uLXjBGr0i56]=mbxnW11j9Dfmh7GuZR8mvqV; rsv_jmp_slow=1474644236473; sug=3; sugstore=1; ORIGIN=0; bdime=21110; H_PS_645EC=60efFRJ1dM8ial205oBcDuRmtLgH3Q6NaRzxDuIkbMkGVXNSHmXBfW0GZL4l5pnj; BD_UPN=123253; BD_CK_SAM=1; BDSVRTM=110; H_PS_PSSID=17947",
        "Host": host,
        "Pragma": "no-cache",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": useragent.pcualist()
    }
    # 代理服务器
    proxyHost = "proxy.abuyun.com"
    proxyPort = "9010"

    # 代理隧道验证信息
    proxyUser = "******"
    proxyPass = "******"

    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": proxyHost,
        "port": proxyPort,
        "user": proxyUser,
        "pass": proxyPass,
    }

    proxies = {
        "http": proxyMeta,
        "https": proxyMeta,
    }

    html = requests.get(url, headers=headers, timeout=30)
    code = html.encoding
    return html.content
Exemple #4
0
from BeautifulSoup import BeautifulSoup
import re
import useragent

headers = {
    "Accept":
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, sdch",
    "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    #"Cookie":"__cfduid=df26a7c536a0301ccf36481a14f53b4a81469608715; BIDUPSID=E9B0B6A35D4ABC6ED4891FCC0FD085BD; PSTM=1474352745; lsv=globalTjs_97273d6-wwwTcss_8eba1c3-routejs_6ede3cf-activityControllerjs_b6f8c66-wwwBcss_eabc62a-framejs_902a6d8-globalBjs_2d41ef9-sugjs_97bfd68-wwwjs_8d1160b; MSA_WH=1433_772; BAIDUID=E9B0B6A35D4ABC6ED4891FCC0FD085BD:FG=1; plus_cv=1::m:2a9fb36a; H_WISE_SIDS=107504_106305_100040_100100_109550_104341_107937_108437_109700_109794_107961_108453_109737_109558_109506_110022_107895_107917_109683_109588_110072_107318_107300_107242_100457; BDUSS=XNNMTJlWEdDdzFPdU1nSzVEZ1REYn4tNWNwZk94NVducXpaaThjWjE4bU1TQXRZQVFBQUFBJCQAAAAAAAAAAAEAAADLTBsKYTYzMTM4MTcwMgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIy741eMu-NXQ; BDRCVFR[ltbVPlNi2ac]=mk3SLVN4HKm; BDRCVFR[C0p6oIjvx-c]=mbxnW11j9Dfmh7GuZR8mvqV; BDRCVFR[uLXjBGr0i56]=mbxnW11j9Dfmh7GuZR8mvqV; rsv_jmp_slow=1474644236473; sug=3; sugstore=1; ORIGIN=0; bdime=21110; H_PS_645EC=60efFRJ1dM8ial205oBcDuRmtLgH3Q6NaRzxDuIkbMkGVXNSHmXBfW0GZL4l5pnj; BD_UPN=123253; BD_CK_SAM=1; BDSVRTM=110; H_PS_PSSID=17947",
    "Host": "www.baidu.com",
    "Pragma": "no-cache",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": useragent.pcualist()
}  # 设置UA模拟用户,还可设置多个UA提高搜索成功率


def baidu_url(word):  # 构建百度搜索URL;因为是查收录,所以只显示了前10个搜索结果,还可以通过rn参数来调整搜索结果的数量
    '''
    get baidu search url
    '''
    return 'http://www.baidu.com/s?wd=%s' % word


def baidu_cont(url):  # 获取百度搜索结果页内容
    r = requests.get(url, headers=headers)
    return r.content