import json
import random
import re
import telnetlib
import time

import requests
from bs4 import BeautifulSoup

import download_page
import ipAgency


def writesth(path, hot_topics):
    # Crawl the hot comments under each of Douban's 24-hour hot topics.
    file = open(path, "a", encoding="utf-8")
    file.write("Scraping Douban content")
    for topic in hot_topics:
        try:
            title = topic.find('a').get_text()
            href = topic.find('a').get('href')
            print(title, href)
            headers2 = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
                'Referer': href
            }
            # The topic id is the first number in the topic URL.
            com_id = re.findall(r"\d+", href)[0]
            nums = ["20"]
            for num in nums:
                url = ("https://m.douban.com/rexxar/api/v2/gallery/topic/" + str(com_id)
                       + "/items?sort=hot&start=0&count=" + num
                       + "&status_full_text=1&guest_only=0&ck=null")
                try:
                    html = download_page.download_html_waitting(url, headers2, 1)
                    res = json.loads(html)
                    for item in res["items"]:
                        file.write(format_str(item["abstract"]) + '\n')
                        print(item["abstract"])
                except Exception as e:
                    print("Except - Douban: failed to crawl hot comments", e)
        except Exception as e:
            print("Except - Douban: failed to crawl 24-hour hot topics", e)
    file.close()
    return 'success'
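# ---------------------------------------------------------------------------
# The crawlers in this file lean on helpers defined elsewhere in the repo:
# download_page.download_html_waitting / download_soup_waitting (throttled
# fetchers whose failure sentinel is the string "none"), format_str (scrubs
# text before it is appended to the corpus file), plus the module-level
# `headers` dict and `urlcol` URL list. The sketch below is an assumption
# about that contract, named with a leading underscore to make clear it is
# illustrative rather than the repo's actual implementation.
# ---------------------------------------------------------------------------
def _download_html_waitting_sketch(url, headers, wait_seconds):
    # Hypothetical stand-in for download_page.download_html_waitting:
    # throttle, fetch, and return "none" so callers can test for failure.
    time.sleep(wait_seconds)
    try:
        res = requests.get(url, headers=headers, timeout=20)
        res.raise_for_status()
        return res.text
    except requests.RequestException:
        return "none"


def _download_soup_waitting_sketch(url, headers, wait_seconds):
    # Hypothetical stand-in for download_page.download_soup_waitting.
    html = _download_html_waitting_sketch(url, headers, wait_seconds)
    return BeautifulSoup(html, "html.parser")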
def newsCrawler(path):
    # Crawl the CCTV news roll for several channels, then pull each article body.
    tags = ['china', 'society', 'world']
    items = []
    for tag in tags:
        url = r'http://news.cctv.com/' + tag + r'/data/index.json'
        try:
            result = download_page.download_html_waitting(url, headers, 1)
            result = json.loads(result, strict=False)
            # Accumulate across channels instead of keeping only the last one.
            items += result['rollData']
        except Exception as e:
            print("Except - news list", e)
    # Write to file
    file = open(path, "a", encoding="utf-8")
    file.write("Scraping news content")
    if items != []:
        for item in items:
            title = item["title"]
            url = item['url']
            try:
                soup = download_page.download_soup_waitting(url, headers, 1)
                content = soup.find('div', {'class': 'cnt_bd'})
                # Drop embedded tags that are not article text.
                [s.extract() for s in content(['div', 'script'])]
                result = title + ":" + content.get_text().strip().replace('\n', '')
                file.write(format_str(result.encode('utf-8', 'ignore')
                                      .decode('utf-8', 'ignore')) + '\n')
                print(result)
            except Exception as e:
                print("Except - news:" + url, e)
    file.close()
    return 'success'
def sportCrawler(path):
    # Write to file
    file = open(path, 'a', encoding="utf-8")
    file.write("Scraping sports content")
    for url in urlcol:
        items = []
        result = download_page.download_html_waitting(url, headers, 1)
        try:
            # The feed is JSONP: rewrap `data_callback([...])` as
            # `{"data_callback": [...]}` so json.loads can parse it.
            result = str(result, encoding="gbk").replace(
                "data_callback(", '{"data_callback":', 1)[:-1] + "}"
            result = json.loads(result, strict=False)
            items = result['data_callback']
        except Exception as e:
            print("Except - sports list", e)
        if items != []:
            for item in items:
                title = item['title']
                docurl = item['docurl']
                file.write(format_str(title))
                print(title, docurl)
                res = requests.get(docurl, headers=headers)
                res.encoding = 'gb2312'
                soup = BeautifulSoup(res.text, "html.parser")
                try:
                    post = soup.find('div', id="endText")
                    if post is None:
                        print("Unexpected page layout")
                    else:
                        text = post.get_text().strip()
                        result = text.replace('\n', '')
                        file.write(format_str(result) + '\n')
                        print(result)
                except Exception:
                    print("Except - sports: skipping to next link")
    file.close()
    return 'success'
def getdata(self):
    # Method of the Toutiao crawler class (class definition elsewhere in the repo).
    req = self.s.get(url=self.url, verify=False)
    headers = {'referer': self.url}
    max_behot_time = '0'
    signature = '.1.hXgAApDNVcKHe5jmqy.9f4U'
    eas = 'A1E56B6786B47FE'
    ecp = '5B7674A7FF2E9E1'
    self.s.headers.update(headers)
    titles = []
    abstracts = []
    for i in range(0, 10):
        # Each page needs fresh anti-crawler tokens from the site's JS.
        Honey = json.loads(self.get_js())
        eas = Honey['as']
        ecp = Honey['cp']
        signature = Honey['_signature']
        url = ('https://www.toutiao.com/api/pc/feed/?category={}&utm_source=toutiao'
               '&widen=1&max_behot_time={}&max_behot_time_tmp={}&tadrequire=true'
               '&as={}&cp={}&_signature={}').format(
                   self.channel, max_behot_time, max_behot_time, eas, ecp, signature)
        req = self.s.get(url=url, verify=False)
        time.sleep(random.random() * 2 + 2)
        print(url)
        j = json.loads(req.text)
        items = j['data']
        # Write to file
        file = open(self.path, 'a', encoding="utf-8")
        file.write("Scraping Toutiao content")
        for item in items:
            try:
                title = item['title']
                abstract = item.get('abstract', '')  # some items carry no abstract
                file.write(format_str(title + ":" + abstract) + '\n')
                print(title + " : " + abstract)
                titles.append(title)        # headline
                abstracts.append(abstract)  # article abstract
            except Exception as e:
                print("Except - Toutiao", e)
        file.close()
        time.sleep(2)
        # Advance pagination to the next page's cursor.
        max_behot_time = str(j['next']['max_behot_time'])
        print('------------' + max_behot_time)
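# getdata() above assumes self.get_js() evaluates Toutiao's anti-crawler
# JavaScript and returns a JSON string carrying the three token fields read
# from `Honey`. The generation logic lives elsewhere in the repo; shape-wise
# (an assumption based only on the keys used above) the return value must
# look like:
#
#     {"as": "A1E56B6786B47FE",
#      "cp": "5B7674A7FF2E9E1",
#      "_signature": ".1.hXgAApDNVcKHe5jmqy.9f4U"}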
def esportsCrawler(path):
    for i in range(1, 10):
        url = ('http://www.dadianjing.cn/index.php?m=Index&a=xhrList&cid=1&page='
               + str(i))
        try:
            result = download_page.download_html_waitting(url, headers, 1)
            result = json.loads(result, strict=False)
            items = result["data"]["list"]
            # Write to file
            file = open(path, "a", encoding="utf-8")
            file.write("Scraping esports content")
            for item in items:
                title = item['title']
                summary = item['summary']
                file.write(format_str(title + ":" + summary) + '\n')
                print(title + "---" + summary)
            file.close()
        except Exception as e:
            print("Except - esports:" + url, e)
    return 'success'
def hotmovieCrawler(path):
    start = 0
    file = open(path, "a", encoding="utf-8")
    file.write("Scraping movie content")
    # Page through Douban's hot-movie listing, 20 subjects per request.
    while start <= 40:
        url = ('https://movie.douban.com/j/search_subjects?type=movie&tag=热门'
               '&sort=recommend&page_limit=20&page_start=' + str(start))
        result = download_page.download_html_waitting(url, headers, 1)
        if result != "none":
            print("------------- not using a proxy ---------------")
            result = json.loads(result)
            movie_items = result['subjects']
            for movie_item in movie_items:
                movie_url = movie_item['url']
                # Extract the movie synopsis; some detail pages have none,
                # so catch and report the failure.
                try:
                    html = requests.get(movie_url).content
                    soup = BeautifulSoup(html, "html.parser")
                    description = soup.find_all(
                        "span", attrs={"property": "v:summary"}
                    )[0].get_text().strip().replace('\n', '')
                    file.write(format_str(
                        description.encode('utf-8', 'ignore')
                        .decode('utf-8', 'ignore')) + '\n')
                    print(description)
                except Exception as e:
                    print("This movie has no synopsis", e)
                time.sleep(0.5)
        else:
            print("------------- using a proxy ---------------")
            # Fetch candidate proxy IPs and probe each before use.
            ip_list = ipAgency.get_ip_list(headers)
            for ip in ip_list:
                hd, port = ip.split(':')
                try:
                    telnetlib.Telnet(hd, port=int(port), timeout=20)
                except Exception:
                    print(str(ip) + ' failed')
                else:
                    try:
                        proxies = get_proxy(ip)
                        requests.adapters.DEFAULT_RETRIES = 5  # more retries
                        s = requests.session()
                        s.keep_alive = False  # drop idle connections
                        s.proxies = proxies
                        s.headers = headers
                        # Retry the listing request through the proxy session.
                        html = s.get(url).content
                        if not html:
                            continue
                        result = json.loads(html)
                        movie_items = result['subjects']
                        for movie_item in movie_items:
                            movie_url = movie_item['url']
                            # Extract the movie synopsis; some detail pages
                            # have none, so catch and report the failure.
                            try:
                                html = s.get(movie_url).content
                                soup = BeautifulSoup(html, "html.parser")
                                description = soup.find_all(
                                    "span", attrs={"property": "v:summary"}
                                )[0].get_text().strip().replace('\n', '')
                                file.write(format_str(
                                    description.encode('utf-8', 'ignore')
                                    .decode('utf-8', 'ignore')) + '\n')
                                print(description)
                            except Exception as e:
                                print("This movie has no synopsis", e)
                            time.sleep(0.5)
                        break  # one working proxy sufficed for this page
                    except Exception as e:
                        print("Except - movie:", e)
        start += 20
    file.close()
    return 'success'
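# hotmovieCrawler() calls get_proxy(ip), defined elsewhere in the repo. A
# minimal sketch of what it presumably returns -- the underscored name is
# hypothetical, and requests expects a scheme-keyed proxy mapping:
def _get_proxy_sketch(ip):
    # e.g. ip == "1.2.3.4:8080", as produced by ipAgency.get_ip_list()
    return {"http": "http://" + ip, "https": "https://" + ip}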
def sinaCrawler(path):
    # a. Weibo hot-search and social-event boards
    hot_tag = []
    hot_tag.append("realtimehot")
    hot_tag.append("socialevent")
    # Write to file
    file = open(path, "a", encoding="utf-8")
    file.write("Scraping Sina Weibo content")
    for tag in hot_tag:
        soup = download_page.download_soup_waitting(
            "https://s.weibo.com/top/summary?cate=" + tag, headers, 1)
        try:
            hot_list = soup.find_all('td', attrs={"class": "td-02"})
            for hot in hot_list:
                title = hot.find("a").get_text()
                href = "https://s.weibo.com" + hot.find("a").get('href')
                file.write(format_str(title))
                print(title + ":" + href)
                # Skip placeholder entries whose href is "javascript:void(0);".
                if href != "https://s.weibo.comjavascript:void(0);":
                    detail_soup = download_page.download_soup_waitting(href, headers, 1)
                    cards = detail_soup.find_all("div", attrs={"class": "card-feed"})
                    for card in cards:
                        content = card.find("div", attrs={"class": "content"})
                        blogger = content.find("p", attrs={"class": "txt"}).get('nick-name')
                        blog = content.find("p", attrs={"class": "txt"}).get_text()
                        file.write(format_str(blogger + ":" + blog))
                        print(blogger + ":" + blog)
        except Exception as e:
            print("Except - Sina: crawl error, skipped", e)
    # b. Weibo feeds (guest API; the signed URLs are used verbatim)
    url_hotfeed = "https://api.weibo.cn/2/guest/cardlist?gsid=_2AkMu5Br-f8NhqwJRmPAcz2PmZYl_yQ3EieKYuOslJRM3HRl-3T9kqnwvtRWwLB-1C2SEmptvAP1Bfy0s7kgEgw..&uid=1008938494835&wm=3333_2001&i=8bb4ee5&b=1&from=1073193010&checktoken=807ca79ae3fa897b262e3b63c3882698&c=iphone&networktype=wifi&v_p=45&skin=default&s=ee9f63c1&v_f=1&did=eb4621d547f0e7cb9eef4a41403ee866&lang=zh_CN&sflag=1&ua=iPhone9,2__weibo__7.3.1__iphone__os10.3.1&aid=01AhjayctpFPjOzJEmy46JLMop9TgsXKgsxZQYIpcPoBa-nn8.&lon=116.2697240292689&count=20&fid=230584&containerid=230584&uicode=10000011&lat=40.04127809492162&offset=1&max_id=4151604225452173&page=1&moduleID=pagecard"
    url_starfeed = "https://api.weibo.cn/2/guest/cardlist?gsid=_2AkMu5WfMf8NhqwJRmPAcz2PmZYl_yQ3EieKYuZYXJRM3HRl-3T9kqnZftRVqWDRdwTGKDWtA7iBOAX-N3elOcA..&uid=1008938494835&wm=3333_2001&i=8bb4ee5&b=1&from=1073193010&checktoken=807ca79ae3fa897b262e3b63c3882698&c=iphone&networktype=wifi&v_p=45&skin=default&s=ee9f63c1&v_f=1&did=eb4621d547f0e7cb9eef4a41403ee866&lang=zh_CN&sflag=1&ua=iPhone9,2__weibo__7.3.1__iphone__os10.3.1&aid=01AhjayctpFPjOzJEmy46JLMop9TgsXKgsxZQYIpcPoBa-nn8.&lon=116.2697240292689&count=20&fid=230781&containerid=230781&uicode=10000011&lat=40.04127809492162&offset=1&max_id=4140648884038081&page=1&moduleID=pagecard"
    urlcol = []
    urlcol.append(url_hotfeed)
    urlcol.append(url_starfeed)
    for url in urlcol:
        print("Fetching Weibo feed...")
        res = download_page.download_html_waitting(url, headers, 1)
        try:
            res = json.loads(res)
            for cards in res["cards"]:
                if cards["card_type"] == 9:
                    if "text" in cards["mblog"]:
                        file.write(cards["mblog"]["text"])
                        print(cards["mblog"]["text"])
        except (KeyError, ValueError) as e:  # ValueError covers bad/empty JSON
            print("Except - Sina: " + str(e))
    file.close()
    return 'success'
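# A hypothetical driver showing how these crawlers could be chained to build
# a single corpus file. The path, the ordering, and the omissions (writesth
# needs a pre-fetched topic list; getdata is a method of the Toutiao crawler
# class) are assumptions, not code from this repo.
if __name__ == "__main__":
    corpus_path = "corpus.txt"  # hypothetical output file
    newsCrawler(corpus_path)
    sportCrawler(corpus_path)
    esportsCrawler(corpus_path)
    hotmovieCrawler(corpus_path)
    sinaCrawler(corpus_path)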