def fetch_news():
    news_list = []
    resp = r.get(index_url, headers=headers)
    print("Fetching:", resp.url)
    if resp is not None:
        resp.encoding = 'utf8'
        pq = PyQuery(resp.text)
        data_list = pq('ul#date-list-ul')
        for li in data_list('li').items():
            img = li('a > img')
            print(li('p').text())
            news_list.append(
                News(
                    url=li('a').attr('href'),
                    _id=li('a').attr('href').split('/')[-1].replace('.html', ''),
                    title=img.attr('alt'),
                    image=img.attr('src'),
                    overview=li('div#list-t p#list-abs').text(),
                    publish_time=li('div#list-t > p#list-sm span:first').text(),
                    origin=li('div#list-t > p#list-sm > span:last').text(),
                ).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('xinlvjie')
    client.insert_many(fetch_news())
    print("新旅社 crawl finished!")
    resp = r.post(ajax_url, data=json.dumps(ajax_params), headers=headers)
    if resp is not None:
        res = resp.json()
        for i in res['res']:
            news_list.append(
                News(_id=i['id'],
                     title=i['title'],
                     overview=i['content'],
                     publish_time=i['create_time'],
                     origin=i['src_name'],
                     url=news_detail_url + i['uid']).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('dongmaiwang')
    cur_page = 1
    while True:
        print("Crawling page %d" % cur_page)
        result_list = fetch_news(cur_page)
        client.insert_many(result_list)
        # Keep paging while the oldest item on this page is newer than
        # 12 hours (43200 s).
        if int(round(time.time())) - int(
                time.mktime(
                    time.strptime(result_list[-1]['publish_time'],
                                  "%Y-%m-%d %H:%M:%S"))) < 43200:
            cur_page += 1
            continue
        else:
            break
    print("动脉网 crawl finished!")
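# Several of these crawlers repeat the same freshness check: parse the item's
# publish_time with time.strptime, convert it to an epoch timestamp, and
# compare the gap to the current time against a window in seconds (43200 for
# 12 hours, 86400 for 24 hours). Below is a minimal sketch of a shared helper
# that mirrors those inline checks; the name is_recent and its placement are
# assumptions, not part of the original scripts.
import time


def is_recent(publish_time, window_seconds, fmt="%Y-%m-%d %H:%M:%S"):
    # True if publish_time (a string in `fmt`) lies within `window_seconds`
    # of the current time.
    published = int(time.mktime(time.strptime(publish_time, fmt)))
    return int(round(time.time())) - published < window_seconds


# With such a helper, the paging condition above would read:
#     if is_recent(result_list[-1]['publish_time'], 43200):
#         cur_page += 1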
def fetch_diyicaijing_news():
    news_list = []
    resp = r.get(diyicaijing_url, params={'page': 2}, headers=diyicaijing_headers)
    bs = BeautifulSoup(resp.text, 'lxml')
    articles = bs.findAll('article', attrs={'class': 'article-item clearfix'})
    for article in articles:
        detail_url = diyicaijing_url[:-1] + article.a['href']
        if not detail_url.endswith('subscribe'):
            news_content = article.div.text.replace(' ', '').replace('\n', '')
            # findall() returns a (possibly empty) list, never None, so test
            # for truthiness rather than `is not None`.
            text_result = msg_extract_pattern.findall(news_content)
            if text_result:
                for content in text_result:
                    news_list.append(
                        News(
                            _id=detail_url.split('/')[-1],
                            url=detail_url,
                            image=url_extract_pattern.search(article.a['style']).group(1),
                            origin=content[0],
                            title=content[1],
                            publish_time=content[2],
                        ).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('diyicaijing')
    client.insert_many(fetch_diyicaijing_news())
    data_dict = json.loads(data_json)
    for data in data_dict['data']['list']:
        news_list.append(News(
            _id=data['id'],
            title=data['title'],
            overview=data['brief'],
            image=data['thumb'],
            publish_time=data['time'],
            url=data['url'],
            origin=data['columnName']
        ).to_dict())
        sort_field = data['sort_field']
    return news_list, sort_field


if __name__ == '__main__':
    client = MongodbClient('jueshengwang')
    result = fetch_index_news()
    client.insert_many(result[0])
    count_time -= 1
    min_id = result[1]
    while True:
        result = fetch_more_news(min_id)
        client.insert_many(result[0])
        # Keep paging while the oldest item is newer than 5 days (432000 s).
        if int(round(time.time())) - int(result[0][-1]['publish_time']) < 432000:
            count_time -= 1
            min_id = result[1]
            continue
        else:
            break
    resp = r.get(penpai_ajax_url, params=ajax_params, headers=penpai_headers)
    resp_content = resp.text
    print("Fetching:", resp.url)
    results = news_pattern.findall(resp_content)
    for result in results:
        # Only items labelled "N 小时前" (N hours ago) are kept; stop once an
        # item is more than 12 hours old.
        if '小时前' in result[5]:
            hours_before = hours_pattern.search(result[5])
            if hours_before is not None:
                if int(hours_before.group(1)) > 12:
                    return news_list
                else:
                    news_list.append(
                        News(_id=result[0].split('_')[-1],
                             title=result[2],
                             overview=result[3].replace('\n', '').replace(' ', ''),
                             url=penpai_url + result[0],
                             image='http:' + result[1],
                             publish_time=result[5],
                             origin=result[4]).to_dict())
    pageidx += 1
    time.sleep(random.randint(0, 2))


if __name__ == '__main__':
    client = MongodbClient('penpai')
    data_list = fetch_penpai_news()
    client.insert_many(data_list)
            'pagesize': 20
        },
        headers=iheima_headers)
    print("Fetching:", resp.url)
    if resp is not None:
        resp_json = resp.json()
        contents = resp_json['contents']
        for content in contents:
            # Stop once an item is more than 24 hours (86400 s) old.
            if int(round(time.time())) - int(
                    time.mktime(
                        time.strptime(content['published'],
                                      "%Y-%m-%d %H:%M"))) > 86400:
                return news_list
            else:
                news_list.append(
                    News(_id=content['contentid'],
                         title=content['title'],
                         url=iheima_url[:-1] + content['url'],
                         image=content['thumb'],
                         publish_time=content['published'],
                         origin=content['author'],
                         overview=content['description']).to_dict())
    page += 1


if __name__ == '__main__':
    client = MongodbClient('iheima')
    client.insert_many(fetch_iheima_news())
    print("爱黑马 crawl finished!")
    bs = BeautifulSoup(resp.text, 'lxml')
    data_list = bs.find("ul", attrs={'class': 'gallery l-list-selected l-m'})
    lis = data_list.findAll('li')
    for li in lis:
        l_cbox = li.find('div', attrs={'class': 'l-cbox'})
        spans = l_cbox.find('div', attrs={'class': 'l-foot-par'}).findAll('span')
        news_id_result = xhs_news_id_pattern.search(li.a['href'])
        if news_id_result is not None:
            # Compare the publish time with the current time and only keep
            # news from the last 12 hours (43200 s).
            publish_time = spans[1].text.replace('\n', '').strip()
            if int(round(time.time())) - int(
                    time.mktime(
                        time.strptime(publish_time,
                                      "%Y-%m-%d %H:%M:%S"))) < 43200:
                news_list.append(
                    News(_id=news_id_result.group(1),
                         url=li.a['href'],
                         title=li.a.img['alt'],
                         image=xhs_gd_url + li.a.img['src'],
                         origin=spans[0].text,
                         publish_time=publish_time,
                         overview=l_cbox.p.text).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('xinhuashe')
    client.insert_many(fetch_xh_focus())
    client.insert_many(fetch_gd_news())
import requests as r
from pyquery import PyQuery

from news import News, MongodbClient
from tools import user_agents

base_url = 'https://www.iyiou.com/breaking/'
headers = {
    'User-Agent': user_agents.random_user_agent()
}


def fetch_news(url):
    news_list = []
    resp = r.get(url, headers=headers)
    print("Fetching:", resp.url)
    if resp is not None:
        pq = PyQuery(resp.text)
        a_s = pq('.newsFlashListWrap > div > ul > li > a')
        for item in a_s.items():
            news_list.append(News(
                _id=item.attr('href').split('/')[-1].replace('.html', ''),
                url=item.attr('href'),
                title=item('span.fl').text(),
                publish_time=item('span.fr').text()
            ).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('yiou')
    for i in range(1, 3):
        client.insert_many(fetch_news("{}p{}.html".format(base_url, i)))
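# Every crawler above depends on News(...).to_dict() from the news module and
# MongodbClient(<collection>).insert_many(<docs>), whose implementations are
# not shown in these excerpts. Below is a minimal sketch of what those two
# helpers might look like, assuming pymongo and localhost defaults; the field
# list, database name, and constructor defaults are assumptions, not the
# original implementation.
import pymongo


class News:
    def __init__(self, _id=None, title=None, overview=None, url=None,
                 image=None, publish_time=None, origin=None):
        self._id = _id
        self.title = title
        self.overview = overview
        self.url = url
        self.image = image
        self.publish_time = publish_time
        self.origin = origin

    def to_dict(self):
        # _id doubles as MongoDB's primary key.
        return {
            '_id': self._id,
            'title': self.title,
            'overview': self.overview,
            'url': self.url,
            'image': self.image,
            'publish_time': self.publish_time,
            'origin': self.origin,
        }


class MongodbClient:
    def __init__(self, collection, db='news', host='localhost', port=27017):
        self._collection = pymongo.MongoClient(host, port)[db][collection]

    def insert_many(self, docs):
        # Skip empty result lists so a crawler that found nothing
        # does not raise an error.
        if docs:
            self._collection.insert_many(docs)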