Beispiel #1
0
 def get_page_urls(self, timeout=10):
     """Return the URLs of the pages that need to be crawled.

     Downloads ``self.url``, parses it with ``etree.HTML`` and looks at every
     ``<li>`` under ``<ul class="textlarge22">`` except the first one. The
     ``href`` of each item's direct ``<a>`` child is appended to ``self.url``.

     Args:
         timeout: Seconds to wait for the HTTP response before giving up.

     Returns:
         A list of URL strings. On any failure the error is printed and an
         empty list is returned, so callers can always iterate the result.
     """
     try:
         _headers = self.headers
         # The return value is ignored here, so this presumably updates the
         # dict in place -- TODO confirm against user_agent.handle_headers.
         user_agent.handle_headers(headers=_headers)
         # Without a timeout a stalled server would block this call forever.
         res = requests.get(self.url, headers=_headers, timeout=timeout)
         # Let requests guess the charset from the body before decoding it.
         res.encoding = res.apparent_encoding
         html_root = etree.HTML(res.text)
         # The first <li> is skipped, exactly as in the original slice.
         items = html_root.xpath("//ul[@class='textlarge22']/li")[1:]
         page_urls = []
         for item in items:
             hrefs = item.xpath("a/@href")
             # An item without a link is skipped instead of aborting the
             # whole page with an IndexError.
             if hrefs:
                 page_urls.append(self.url + hrefs[0])
         return page_urls
     except Exception as e:
         print('get_page_urls:', e)
         return []
Beispiel #2
0
 def get_page_urls(self, timeout=10):
     """Return the paginated listing URLs found on the page at ``self.url``.

     The page HTML is obtained through ``self.get_html``. Every
     ``<li><a class="false" href="...">国内...</a></li>`` entry contributes
     one section URL (``self.url`` plus the href without its first
     character), and each section URL is expanded into pages ``1`` to
     ``page_num`` in the form ``<section><page>/``.

     Args:
         timeout: Accepted for interface compatibility but not used by this
             method; NOTE(review): the signature of ``get_html`` is not
             visible here, so the value is not forwarded.

     Returns:
         A list of URL strings, grouped by section and ordered by page
         number. On any failure the error is printed and an empty list is
         returned.
     """
     page_num = 10  # number of pages generated for every section
     try:
         user_agent.handle_headers(self.headers)
         text = self.get_html(url=self.url)
         # Raw string: "\s" and "\S" are regex classes, not string escapes.
         matches = re.findall(
             r'<li><a class="false" href="([\s\S]*?)">国内([\s\S]*?)</a></li>',
             text)
         # The first character of each href is dropped before it is appended
         # to self.url (presumably a leading "/" -- verify against the site).
         section_urls = [self.url + href[1:] for href, _label in matches]
         return [
             section + str(page) + '/'
             for section in section_urls
             for page in range(1, 1 + page_num)
         ]
     except Exception as e:
         print('------------------>', e)
         return []
Beispiel #3
0
 def get_img_url(self, n, timeout=10):
     """Return the thumbnail URLs of result page ``n`` for ``self.target``.

     Updates ``self.jsonformdata`` in place: ``pn`` becomes ``rn * n`` and
     both ``word`` and ``queryWord`` are set to ``self.target``. The form
     data is then URL-encoded onto the Baidu image ``acjson`` endpoint and
     the JSON response is parsed.

     Args:
         n: Page index; multiplied by the ``rn`` value to compute ``pn``.
         timeout: Seconds to wait for the HTTP response before giving up.

     Returns:
         A list with the ``thumbURL`` of every non-empty entry in the
         response's ``data`` list. On any failure the error is printed and
         an empty list is returned.
     """
     try:
         user_agent.handle_headers(self.headers)
         form = self.jsonformdata
         form['pn'] = str(int(form['rn']) * n)
         form['word'] = self.target
         form['queryWord'] = self.target
         u = 'https://image.baidu.com/search/acjson?' + urllib.parse.urlencode(form)
         # Without a timeout a stalled server would block this call forever.
         r = requests.get(u, headers=self.headers, timeout=timeout)
         r.encoding = 'utf-8'
         j = json.loads(r.text)
         # Falsy entries are skipped as before; entries lacking 'thumbURL'
         # are skipped too, so one odd item no longer discards the others.
         return [i['thumbURL'] for i in j['data'] if i and 'thumbURL' in i]
     except Exception as e:
         print('get_img_url:', e)
         return []
Beispiel #4
0
 def get_headers(self):
     """Build the default request headers and pass them to ``user_agent.handle_headers``.

     Returns:
         Whatever ``user_agent.handle_headers`` returns for the header dict
         built here (presumably the processed headers -- confirm against
         that helper).
     """
     chrome_ua = (
         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
         '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
     )
     accepted_types = (
         'text/html,application/xhtml+xml,application/xml;q=0.9,'
         'image/webp,image/apng,*/*;q=0.8'
     )
     # Only these four headers are sent. Earlier revisions also carried Host,
     # Connection, Content-Length, Cache-Control, Origin,
     # Upgrade-Insecure-Requests, Content-Type and Referer, all disabled.
     base_headers = {
         'User-Agent': chrome_ua,
         'Accept': accepted_types,
         'Accept-Encoding': 'gzip, deflate, br',
         'Accept-Language': 'zh-CN,zh;q=0.9',
     }
     processed = user_agent.handle_headers(base_headers)
     return processed