def get_search_article(self, keyword, offset=0):
    """Fetch Toutiao search results for *keyword*, one 20-item page at a time.

    Each page is parsed via ``self.parse_data``; when the API reports no
    more pages, the accumulated ``self.search_item_list`` is persisted
    with ``toutiaodb.save``.

    :param keyword: raw (un-encoded) search keyword.
    :param offset: result offset to start from (multiple of 20).
    """
    # Percent-encode exactly once, up front.  The previous recursive
    # version re-quoted the already-quoted keyword on every page, which
    # corrupted non-ASCII keywords from page 2 onward; the explicit loop
    # also avoids RecursionError on deep result sets.
    keyword = urllib.request.quote(keyword)
    while True:
        req_url = "https://www.toutiao.com/search_content/?offset={}&format=json&keyword={}&autoload=true&count=20&cur_tab=1&from=search_tab".format(
            offset, keyword)
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
            'Connection': 'keep-alive',
            'authority': 'www.toutiao.com',
            'referer': "https://www.toutiao.com/search/?keyword={}".format(keyword),
            'method': 'GET',
            'path': "/search_content/?offset={}&format=json&keyword={}&autoload=true&count=20&cur_tab=1&from=search_tab"
            .format(offset, keyword),
            'scheme': 'https'
        }
        self.s.headers.update(headers)
        req = self.s.get(req_url, proxies=get_proxy_ip())
        # Randomized delay to throttle the request rate.
        time.sleep(random.random() * 2 + 3)
        data = json.loads(req.text)
        items = data['data']
        self.parse_data(items)
        if data['has_more'] == 1:
            # Advance pagination: offset is derived from the page counter.
            self.page = self.page + 1
            offset = 20 * self.page
            time.sleep(2)
        else:
            break
    toutiaodb.save(self.search_item_list)
def fetch_user_articles(self, user, browser):
    """Fetch every article published by *user*, paging via ``max_behot_time``.

    Each page is parsed via ``self.parse_user_artcle``; when the API
    reports no more pages, the accumulated ``self.user_artcile_list`` is
    persisted with ``toutiaodb.save``.

    :param user: object carrying ``user_id`` and ``media_url`` attributes.
    :param browser: Selenium WebDriver used to compute ``_signature``
        in-page for every page after the first.
    """
    # The paging cursor must survive across pages.  The previous recursive
    # version reset it to "0" on every call, so it never advanced.
    max_behot_time = "0"
    while True:
        honey = json.loads(self.get_js())
        signature = honey['_signature']
        _as = honey['as']
        cp = honey['cp']
        if self.user_page > 0:
            # After the first page the signature has to be recomputed in
            # the browser from the user id and the current cursor.
            signature = browser.execute_script(
                "return window.TAC.sign(" + user.user_id + max_behot_time + ")")
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
            'Connection': 'keep-alive',
            'authority': 'www.toutiao.com',
            'referer': user.media_url,
            'method': 'GET',
            'path': "/c/user/article/?page_type=1&user_id={}&max_behot_time={}&count=20&as={}&cp={}&_signature={}"
            .format(user.user_id, max_behot_time, _as, cp, signature),
            'scheme': 'https'
        }
        self.s.headers.update(headers)
        req_url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id={}&max_behot_time={}&count=20&as={}&cp={}&_signature={}".format(
            user.user_id, max_behot_time, _as, cp, signature)
        req = self.s.get(req_url, proxies=get_proxy_ip())
        # Randomized delay to throttle the request rate.
        time.sleep(random.random() * 2 + 2)
        data = json.loads(req.text)
        # FIX: the cursor lives under the literal key 'max_behot_time'
        # (as in get_channel_data), not under the current cursor value.
        max_behot_time = str(data['next']['max_behot_time'])
        # FIX: pass this user's fields, not toutiaoitem class attributes.
        self.parse_user_artcle(data['data'], user.user_id, user.media_url)
        if data['has_more']:
            self.user_page = self.user_page + 1
            # Extra 2s pause between pages.
            time.sleep(2)
        else:
            break
    toutiaodb.save(self.user_artcile_list)
def get_channel_data(self, page):
    """Scrape *page* pages of the channel feed and persist each batch.

    Drives the PC feed API for ``self.channel``, using a Selenium Chrome
    instance to compute ``_signature`` for every page after the first,
    and saves the accumulated items through ``toutiaodb.save`` after
    each page.

    :param page: number of feed pages to fetch.
    """
    # Warm-up request: establishes session cookies before hitting the API.
    req = self.s.get(url=self.url, verify=False, proxies=get_proxy_ip())
    self.s.headers.update({'referer': self.url})
    max_behot_time = '0'
    item_list = []
    browser = webdriver.Chrome()
    try:
        browser.implicitly_wait(10)
        browser.get(self.url)
        for i in range(0, page):
            honey = json.loads(self.get_js())
            eas = honey['as']
            ecp = honey['cp']
            signature = honey['_signature']
            if i > 0:
                # Pages after the first need a signature computed in-page
                # from the current cursor.
                signature = browser.execute_script(
                    "return window.TAC.sign(" + max_behot_time + ")")
            url = 'https://www.toutiao.com/api/pc/feed/?category={}&utm_source=toutiao&widen=1&max_behot_time={}&max_behot_time_tmp={}&tadrequire=true&as={}&cp={}&_signature={}'.format(
                self.channel, max_behot_time, max_behot_time, eas, ecp, signature)
            req = self.s.get(url=url, verify=False, proxies=get_proxy_ip())
            # Randomized delay to throttle the request rate.
            time.sleep(random.random() * 2 + 2)
            j = json.loads(req.text)
            # Slice instead of range(0, 10): a short page no longer raises
            # IndexError.
            for entry in j['data'][:10]:
                # FIX: the original used `or`, which is always true, so ads
                # were never filtered.  Skip an entry when its tag matches
                # EITHER ad value.
                if entry['tag'] == 'ad' or entry['tag'] == 'ad.platform.site':
                    continue
                now = time.time()
                item = toutiaoitem()
                item.title = entry['title']                      # article title
                item.source = entry['source']                    # author name
                item.source_url = 'https://www.toutiao.com/' + entry['source_url']  # article link
                item.media_url = 'https://www.toutiao.com/' + entry['media_url']    # author homepage
                item.article_genre = entry['article_genre']      # article type
                # Optional fields: fall back to a default instead of a bare
                # except that swallowed every error.
                item.comments_count = entry.get('comments_count', 0)   # comment count
                item.tag = entry['tag']                          # channel name
                item.chinese_tag = entry.get('chinese_tag', '')  # channel name (Chinese)
                item.label = entry.get('label', [])              # labels
                item.abstract = entry.get('abstract', '')        # article abstract
                behot = int(entry['behot_time'])
                item.behot_time = time.strftime(
                    "%Y-%m-%d %H:%M:%S", time.localtime(behot))  # publish time
                item.collect_time = time.strftime(
                    "%Y-%m-%d %H:%M:%S", time.localtime(now))    # crawl time
                item.item_id = entry['item_id']
                item.image_list = entry.get('image_list', [])
                item.image_url = entry['image_url']
                item.middle_image = entry['middle_image']
                item_list.append(item)
            toutiaodb.save(item_list)
            time.sleep(2)
            # Advance the paging cursor for the next iteration.
            max_behot_time = str(j['next']['max_behot_time'])
    finally:
        # FIX: the browser was previously leaked; always shut it down.
        browser.quit()