def run(self):
    """Scrape the Douban group explore page and refresh its hot items in MongoDB.

    Replaces every cached document of cate ``types['douban']`` with the
    currently listed channel items (title, link, description).
    """
    super().run()
    # Douban rejects bare requests; mimic a browser visit to the explore page.
    headers = HEADERS.copy()
    headers['Upgrade-Insecure-Requests'] = '1'
    headers['Referer'] = 'https://www.douban.com/group/explore'
    headers['Host'] = 'www.douban.com'
    res = requests.get(douban_url, headers=headers)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    _list = soup.select('div.channel-item')
    # NOTE: removed leftover debug print(len(_list)).
    for item in _list:
        a_el = item.select('div.bd a')[0]
        title = a_el.text.strip()
        url = a_el.get('href')
        desc = item.select('div.block p')[0].text.strip()
        hot_item = HotItem(title, url, cate=types['douban'], desc=desc)
        self.arr.append(hot_item)
    hot_collection.delete_many({'cate': types['douban']})
    # insert_many raises InvalidOperation on an empty document list; skip
    # the insert when the page yielded nothing (e.g. layout change).
    if self.arr:
        hot_collection.insert_many([vars(item) for item in self.arr])
def run(self):
    """Scrape the 36kr home flow and refresh its hot items in MongoDB.

    Replaces every cached document of cate ``types['36kr']`` with the
    articles currently in the home flow (title, link, description).
    """
    super().run()
    headers = HEADERS.copy()
    headers['Upgrade-Insecure-Requests'] = '1'
    headers['Referer'] = 'https://36kr.com/'
    headers['Host'] = '36kr.com'
    # Bug fix: the customized headers were built but HEADERS was passed
    # to requests.get, so Referer/Host were never sent.
    res = requests.get(url_36kr, headers=headers)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for item in soup.select('div.kr-home-flow-item'):
        a_tag = item.select('a.article-item-title')
        if not a_tag:
            # Skip flow cards that are not articles (ads, widgets, ...).
            continue
        title = a_tag[0].text.strip()
        url = 'https://36kr.com' + a_tag[0].get('href')
        desc = item.select('a.article-item-description')[0].text.strip()
        self.arr.append(HotItem(title, url, cate=types['36kr'], desc=desc))
    hot_collection.delete_many({'cate': types['36kr']})
    # insert_many raises on an empty document list; only insert when we
    # actually collected something.
    if self.arr:
        hot_collection.insert_many([vars(item) for item in self.arr])
def run(self):
    """Fetch the Weibo hot-search board and replace the cached entries.

    Every cached document of cate ``types['weibo']`` is deleted and the
    entries currently on the board are inserted.
    """
    super().run()
    res = requests.get(WEIBOT_URL, headers=HEADERS)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for anchor in soup.select('td.td-02 a'):
        entry = HotItem(
            anchor.text.strip(),
            'https://s.weibo.com{}'.format(anchor.get('href')),
            cate=types['weibo'],
        )
        self.arr.append(entry)
    hot_collection.delete_many({'cate': types['weibo']})
    hot_collection.insert_many([vars(entry) for entry in self.arr])
def run(self):
    """Collect V2EX hot topics and replace the cached set in MongoDB.

    Every cached document of cate ``types['v2ex']`` is deleted and the
    topics currently listed on the page are inserted.
    """
    super().run()
    res = requests.get(V2EX_URL, headers=HEADERS)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for link in soup.select('.box a.topic-link'):
        topic_url = 'https://www.v2ex.com{}'.format(link.get('href'))
        self.arr.append(HotItem(link.text, topic_url, cate=types['v2ex']))
    hot_collection.delete_many({'cate': types['v2ex']})
    hot_collection.insert_many([vars(topic) for topic in self.arr])
def run(self):
    """Pull Tieba trending topics from its JSON API and refresh MongoDB.

    Every cached document of cate ``types['tieba']`` is deleted and the
    topics from the API response are inserted.
    """
    super().run()
    res = requests.get(tieba_url, headers=HEADERS)
    res.encoding = 'utf-8'
    payload = res.json()
    for topic in payload['data']['bang_topic']['topic_list']:
        self.arr.append(
            HotItem(
                title=topic['topic_name'],
                url=topic['topic_url'],
                cate=types['tieba'],
                desc=topic['topic_desc'],
            )
        )
    hot_collection.delete_many({'cate': types['tieba']})
    hot_collection.insert_many([vars(topic_item) for topic_item in self.arr])
def run(self):
    """Collect Hupu BBS hot-pit topics and replace the cached set in MongoDB.

    Every cached document of cate ``types['hupu']`` is deleted and the
    topics currently in the hot pit are inserted.
    """
    super().run()
    res = requests.get(HUPU_URL, headers=HEADERS)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for entry in soup.select('.bbsHotPit li'):
        anchor = entry.select('.textSpan a')[0]
        topic_url = 'https://bbs.hupu.com/{}'.format(anchor.get('href'))
        # Drop the 'zt' marker embedded in the title attribute.
        topic_title = anchor.get('title').replace('zt', '')
        self.arr.append(HotItem(topic_title, topic_url, cate=types['hupu']))
    hot_collection.delete_many({'cate': types['hupu']})
    hot_collection.insert_many([vars(entry_item) for entry_item in self.arr])
def run(self):
    """Scrape the Baidu hot-search board and refresh it in MongoDB.

    Replaces every cached document of cate ``types['baidu']`` with the
    entries currently on the board (title, link).
    """
    super().run()
    headers = HEADERS.copy()
    headers['Upgrade-Insecure-Requests'] = '1'
    headers['Host'] = 'top.baidu.com'
    # Bug fix: the customized headers were built but HEADERS was passed
    # to requests.get, so the Host override was never sent.
    res = requests.get(baidu_url, headers=headers)
    # top.baidu.com serves GBK-encoded pages.
    res.encoding = 'gbk'
    soup = BeautifulSoup(res.text, 'html.parser')
    for item in soup.select('a.list-title'):
        title = item.text.strip()
        url = item.get('href')
        self.arr.append(HotItem(title, url, cate=types['baidu']))
    hot_collection.delete_many({'cate': types['baidu']})
    # insert_many raises on an empty document list; guard against an
    # empty scrape (e.g. page layout change).
    if self.arr:
        hot_collection.insert_many([vars(item) for item in self.arr])
def run(self):
    """Collect GitHub trending repositories and replace the cached set.

    Every cached document of cate ``types['github']`` is deleted and the
    repositories currently trending are inserted (name, link, summary).
    """
    super().run()
    res = requests.get(github_URL, headers=HEADERS)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for row in soup.select('article.Box-row'):
        heading = row.select('h1.h3.lh-condensed')[0]
        # Collapse the multi-line "owner / repo" heading into one token.
        name = heading.text.strip().replace(' ', '').replace('\n', '')
        repo_url = 'https://github.com' + heading.find('a').get('href')
        summary_tags = row.select('p.col-9')
        summary = summary_tags[0].text.strip() if summary_tags else None
        self.arr.append(
            HotItem(name, repo_url, cate=types['github'], desc=summary))
    hot_collection.delete_many({'cate': types['github']})
    hot_collection.insert_many([vars(repo) for repo in self.arr])
def run(self):
    """Scrape the Zhihu hot list and refresh its items in MongoDB.

    Replaces every cached document of cate ``types['zhihu']`` with the
    entries currently on the hot list (title, link, optional description).
    """
    super().run()
    headers = HEADERS.copy()
    # SECURITY/MAINTENANCE NOTE(review): this hard-codes a captured login
    # session (cookies carry auth tokens such as z_c0/capsion_ticket).
    # They appear to be from a specific login and will expire — consider
    # moving them to configuration; confirm whether the scrape still works.
    headers.update(
        referer='https://www.zhihu.com/signin?next=%2Fhot',
        cookie='tgw_l7_route=a37704a413efa26cf3f23813004f1a3b; _zap=4369dfa8-8757-4a0a-9b23-114fe13b449c; _xsrf=8ce5a6f2-6d38-4c4f-8a94-f343511a4dc8; d_c0="AMDmP0jPHRCPTnmL4dcBXTUEm7lWNYvWtO0=|1569658401"; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1569658400; capsion_ticket="2|1:0|10:1569658401|14:capsion_ticket|44:YThjZTVmZmI3NDQwNGRlNGIyZTQ5NmQzYzUzZTY0MGQ=|62c39d95c612658919bf053eb0b13b70ccddcde4d6b4c534127f37c900e2411e"; l_n_c=1; r_cap_id="NTkxYTFkN2M3ODZiNDJmNTlkYTVhNzBiZjEzODIxYTA=|1569658760|3f15e311b6f99af3f1d95ed3a1ea1619ac75eef1"; cap_id="M2E1OWVmNTM5YjMwNDQ4M2JkZGIyN2IyYmUxYWYwYzA=|1569658760|be545d54fcaeacdd6084952d60d42c2a6477e9dc"; l_cap_id="OTg5NTkzMGEzZDcxNGU0ZDhjZmYyYWI0MzZjOWFmYTE=|1569658760|e4477741cdddda2bf782bbefac43bdb6dc414112"; n_c=1; z_c0=Mi4xejFiZEFRQUFBQUFBd09ZX1NNOGRFQmNBQUFCaEFsVk5rbVY4WGdBdS04a1NoY0otV0ZUQk5ydjh0a0RGb2ZjaFBn|1569658770|1b7f9d825f104d60dc719882276ec00880677c27; tshl=; tst=h; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1569659055; unlock_ticket="ABDMJM49ZggXAAAAYQJVTbkfj12-gPv33gH7u9Oq7-u4wMqZ5VMUMw=="'
    )
    res = requests.get(ZHIHU_URL,headers=headers)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    top_list = soup.select('.HotList-list .HotItem-content')
    for item in top_list:
        a_tag = item.find('a')
        url = a_tag.get('href')
        title = a_tag.get('title')
        # Description paragraph is optional on some hot-list cards.
        desc_tag = item.select('p')
        desc = desc_tag[0].text if desc_tag else None
        hot_item = HotItem(title, url,cate=types['zhihu'],desc=desc)
        self.arr.append(hot_item)
    hot_collection.delete_many({'cate':types['zhihu']})
    hot_collection.insert_many([item.__dict__ for item in self.arr])
def run(self):
    """Collect Huxiu channel articles and replace the cached set in MongoDB.

    Every cached document of cate ``types['huxiu']`` is deleted and the
    articles currently listed are inserted (title, link, intro).
    """
    super().run()
    headers = HEADERS.copy()
    headers['Upgrade-Insecure-Requests'] = '1'
    headers['Referer'] = 'https://www.huxiu.com/channel/107.html'
    headers['Host'] = 'www.huxiu.com'
    res = requests.get(huxiu_url, headers=headers)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for article in soup.select('div.article-items'):
        content = article.find('div', class_='article-item__content')
        # The last anchor inside the content block wraps title and intro.
        anchor = content.select('a')[-1]
        article_title = anchor.find(
            'h5', class_='article-item__content__title').text.strip()
        article_url = 'https://www.huxiu.com' + anchor.get('href')
        intro = anchor.find(
            'p', class_='article-item__content__intro').text.strip()
        self.arr.append(
            HotItem(article_title, article_url,
                    cate=types['huxiu'], desc=intro))
    hot_collection.delete_many({'cate': types['huxiu']})
    hot_collection.insert_many([vars(article_item) for article_item in self.arr])
def run(self):
    """Scrape Guokr scientific articles and refresh them in MongoDB.

    Replaces every cached document of cate ``types['guokr']`` with the
    articles currently listed (title, link, summary).
    """
    super().run()
    headers = HEADERS.copy()
    headers['Upgrade-Insecure-Requests'] = '1'
    headers['Referer'] = 'https://www.guokr.com/scientific/'
    headers['Host'] = 'www.guokr.com'
    # Bug fix: the customized headers were built but the request was sent
    # with no headers at all.
    res = requests.get(guokr_url, headers=headers)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for item in soup.select('div.article'):
        a_tag = item.find('a', class_='article-title')
        if not a_tag:
            # Skip non-article blocks that share the div.article class.
            continue
        title = a_tag.text.strip()
        url = a_tag.get('href')
        desc = item.find('p', class_='article-summary').text.strip()
        self.arr.append(HotItem(title, url, cate=types['guokr'], desc=desc))
    hot_collection.delete_many({'cate': types['guokr']})
    # insert_many raises on an empty document list; guard the insert.
    if self.arr:
        hot_collection.insert_many([vars(item) for item in self.arr])
def run(self):
    """Scrape the Tianya funinfo hot list and refresh it in MongoDB.

    Replaces every cached document of cate ``types['tianya']`` with the
    threads currently listed (title, link).
    """
    super().run()
    headers = HEADERS.copy()
    headers['Upgrade-Insecure-Requests'] = '1'
    headers[
        'Referer'] = 'http://bbs.tianya.cn/list.jsp?item=funinfo&grade=3&order=1'
    headers['Host'] = 'bbs.tianya.cn'
    # Bug fix: the customized headers were built but HEADERS was passed
    # to requests.get, so Referer/Host were never sent.
    res = requests.get(tinaya_url, headers=headers)
    # Set the encoding explicitly, consistent with the other scrapers,
    # to avoid mojibake when the server omits a charset.
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for item in soup.select('td.td-title'):
        a_tag = item.find('a')
        title = a_tag.text.strip()
        url = 'http://bbs.tianya.cn' + a_tag.get('href')
        self.arr.append(HotItem(title, url, cate=types['tianya']))
    hot_collection.delete_many({'cate': types['tianya']})
    # insert_many raises on an empty document list; guard the insert.
    if self.arr:
        hot_collection.insert_many([vars(item) for item in self.arr])
def run(self):
    """Scrape Qdaily tag-page cards and replace the cached set in MongoDB.

    Every cached document of cate ``types['qdaily']`` is deleted and the
    articles currently listed are inserted (title, link).
    """
    super().run()
    headers = HEADERS.copy()
    headers['Upgrade-Insecure-Requests'] = '1'
    headers['Referer'] = 'https://www.qdaily.com/tags/30.html'
    headers['Host'] = 'www.qdaily.com'
    res = requests.get(qdaily_url, headers=headers)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for card in soup.select('div.packery-item.article'):
        heading = card.find('h3', class_='title')
        if heading:
            # Banner-style card: the title sits outside the link element.
            title = heading.text.strip()
            link = card.find('a', class_='com-grid-banner-article')
            url = 'https://qdaily.com' + link.get('href')
        else:
            # Regular card: link and title both live inside the grid anchor.
            link = card.find('a', class_='com-grid-article')
            url = 'https://qdaily.com' + link.get('href')
            title = link.find('h3', class_='smart-dotdotdot').text.strip()
        self.arr.append(HotItem(title, url, cate=types['qdaily']))
    hot_collection.delete_many({'cate': types['qdaily']})
    hot_collection.insert_many([vars(card_item) for card_item in self.arr])