def sitedate(url): bru = 'http://www.link114.cn/get.php?baiduprzz&%s&71842' % url.strip() slu = 'http://www.link114.cn/get.php?baidu&%s&71152' % url.strip() headers = { 'Host': 'www.link114.cn', 'Connection': 'keep-alive', 'Cache-Control': 'max-age=0', 'Upgrade-Insecure-Requests': '1', 'User-Agent': useragent.pcualist(), 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate, sdch', 'Accept-Language': 'zh-CN,zh;q=0.8', 'Cookie': 'bdshare_firstime=1488939027441 AlexaToolbar-ALX_NS_PH: AlexaToolbar/alx-4.0' } data = None try: request = urllib2.Request(bru, data, headers) sock1 = urllib2.urlopen(request) brt = sock1.read() except Exception, e: brt = 'broken'
def getua(): User_Agent = pcualist() headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'zh-CN,zh;q=0.8', 'Cache-Control': 'max-age=0', 'Connection': 'keep-alive', 'Cookie': 'freshGuide=1; BIDUPSID=7AD6B1F0076AD0D5E2810FF36119FB56; __cfduid=dbd033b635bd1268fdaefa87e2d34c1261470191700; PSTM=1472281639; BAIDUID=F44B09D83A8074889DE20E167B6F4EBB:FG=1; MCITY=-153%3A268%3A; SIGNIN_UC=70a2711cf1d3d9b1a82d2f87d633bd8a02234882544; H_PS_PSSID=1468_20792_18240_21125_17001_20856_20733_20837_20885', 'Host': 'baike.baidu.com', 'Referer': 'http://baike.baidu.com/', 'Upgrade-Insecure-Requests': '1', 'User-Agent': User_Agent } return headers
def getHTml(url): host = search('^([^/]*?)/', re.sub(r'(https|http)://', '', url)) headers = { "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", "Accept-Encoding": "gzip, deflate, sdch", "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6", "Cache-Control": "no-cache", "Connection": "keep-alive", #"Cookie":"__cfduid=df26a7c536a0301ccf36481a14f53b4a81469608715; BIDUPSID=E9B0B6A35D4ABC6ED4891FCC0FD085BD; PSTM=1474352745; lsv=globalTjs_97273d6-wwwTcss_8eba1c3-routejs_6ede3cf-activityControllerjs_b6f8c66-wwwBcss_eabc62a-framejs_902a6d8-globalBjs_2d41ef9-sugjs_97bfd68-wwwjs_8d1160b; MSA_WH=1433_772; BAIDUID=E9B0B6A35D4ABC6ED4891FCC0FD085BD:FG=1; plus_cv=1::m:2a9fb36a; H_WISE_SIDS=107504_106305_100040_100100_109550_104341_107937_108437_109700_109794_107961_108453_109737_109558_109506_110022_107895_107917_109683_109588_110072_107318_107300_107242_100457; BDUSS=XNNMTJlWEdDdzFPdU1nSzVEZ1REYn4tNWNwZk94NVducXpaaThjWjE4bU1TQXRZQVFBQUFBJCQAAAAAAAAAAAEAAADLTBsKYTYzMTM4MTcwMgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIy741eMu-NXQ; BDRCVFR[ltbVPlNi2ac]=mk3SLVN4HKm; BDRCVFR[C0p6oIjvx-c]=mbxnW11j9Dfmh7GuZR8mvqV; BDRCVFR[uLXjBGr0i56]=mbxnW11j9Dfmh7GuZR8mvqV; rsv_jmp_slow=1474644236473; sug=3; sugstore=1; ORIGIN=0; bdime=21110; H_PS_645EC=60efFRJ1dM8ial205oBcDuRmtLgH3Q6NaRzxDuIkbMkGVXNSHmXBfW0GZL4l5pnj; BD_UPN=123253; BD_CK_SAM=1; BDSVRTM=110; H_PS_PSSID=17947", "Host": host, "Pragma": "no-cache", "Upgrade-Insecure-Requests": "1", "User-Agent": useragent.pcualist() } # 代理服务器 proxyHost = "proxy.abuyun.com" proxyPort = "9010" # 代理隧道验证信息 proxyUser = "******" proxyPass = "******" proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % { "host": proxyHost, "port": proxyPort, "user": proxyUser, "pass": proxyPass, } proxies = { "http": proxyMeta, "https": proxyMeta, } html = requests.get(url, headers=headers, timeout=30) code = html.encoding return html.content
from BeautifulSoup import BeautifulSoup import re import useragent headers = { "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", "Accept-Encoding": "gzip, deflate, sdch", "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6", "Cache-Control": "no-cache", "Connection": "keep-alive", #"Cookie":"__cfduid=df26a7c536a0301ccf36481a14f53b4a81469608715; BIDUPSID=E9B0B6A35D4ABC6ED4891FCC0FD085BD; PSTM=1474352745; lsv=globalTjs_97273d6-wwwTcss_8eba1c3-routejs_6ede3cf-activityControllerjs_b6f8c66-wwwBcss_eabc62a-framejs_902a6d8-globalBjs_2d41ef9-sugjs_97bfd68-wwwjs_8d1160b; MSA_WH=1433_772; BAIDUID=E9B0B6A35D4ABC6ED4891FCC0FD085BD:FG=1; plus_cv=1::m:2a9fb36a; H_WISE_SIDS=107504_106305_100040_100100_109550_104341_107937_108437_109700_109794_107961_108453_109737_109558_109506_110022_107895_107917_109683_109588_110072_107318_107300_107242_100457; BDUSS=XNNMTJlWEdDdzFPdU1nSzVEZ1REYn4tNWNwZk94NVducXpaaThjWjE4bU1TQXRZQVFBQUFBJCQAAAAAAAAAAAEAAADLTBsKYTYzMTM4MTcwMgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIy741eMu-NXQ; BDRCVFR[ltbVPlNi2ac]=mk3SLVN4HKm; BDRCVFR[C0p6oIjvx-c]=mbxnW11j9Dfmh7GuZR8mvqV; BDRCVFR[uLXjBGr0i56]=mbxnW11j9Dfmh7GuZR8mvqV; rsv_jmp_slow=1474644236473; sug=3; sugstore=1; ORIGIN=0; bdime=21110; H_PS_645EC=60efFRJ1dM8ial205oBcDuRmtLgH3Q6NaRzxDuIkbMkGVXNSHmXBfW0GZL4l5pnj; BD_UPN=123253; BD_CK_SAM=1; BDSVRTM=110; H_PS_PSSID=17947", "Host": "www.baidu.com", "Pragma": "no-cache", "Upgrade-Insecure-Requests": "1", "User-Agent": useragent.pcualist() } # 设置UA模拟用户,还可设置多个UA提高搜索成功率 def baidu_url(word): # 构建百度搜索URL;因为是查收录,所以只显示了前10个搜索结果,还可以通过rn参数来调整搜索结果的数量 ''' get baidu search url ''' return 'http://www.baidu.com/s?wd=%s' % word def baidu_cont(url): # 获取百度搜索结果页内容 r = requests.get(url, headers=headers) return r.content