class bjNews(scrapy.Spider):
    """Spider for china.com.cn (中国网) rolling news."""

    name = "chinanetNewsSpider"
    start_url = "http://www.china.com.cn/"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(url="http://news.china.com.cn/node_7247300.htm",
                             callback=self.parse_item_home,
                             headers=self.header)

    def parse_item_home(self, response):
        """Queue a detail request for every article link on the list page."""
        detail_urls = response.xpath(
            "/html/body/div/div/ul/li/a/@href").extract()
        for detail_url in detail_urls:
            # Polite random delay so we do not hammer the site.
            time.sleep(random.uniform(1, 3))
            yield scrapy.Request(url=detail_url, callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        """Parse one article page and emit a NewsAllItem for fresh articles.

        Fields that fail to extract are logged via spiderUtil.log_level and
        stay None, which makes the final guard skip the item.  (The original
        code relied on a swallowed NameError for the same effect.)
        """
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return
        text = response.text
        html_size = sys.getsizeof(text)
        public_time = content = author = title = None
        try:
            public_time = re.search(
                r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                text).group(0)
        except Exception:
            spiderUtil.log_level(8, response.url)
        try:
            contents = response.xpath(
                '//*[@id="articleBody"]/p/text()').extract()
            content = "".join(contents)
        except Exception:
            spiderUtil.log_level(7, response.url)
        source = "http://www.china.com.cn/"
        try:
            author_arr = response.xpath(
                '//*[@id="source_baidu"]//text()').extract()
            author = "".join(author_arr)
            if author == '':
                author = "中国网"
            else:
                author = author.split("来源:")[1].strip()
        except Exception:
            # Also triggered when the "来源:" marker is missing (IndexError);
            # `author` then keeps the raw joined text, as before.
            spiderUtil.log_level(9, response.url)
        try:
            title_arr = response.xpath(
                "/html/body/div/h1/text()").extract()
            title = "".join(title_arr).strip()
        except Exception:
            spiderUtil.log_level(6, response.url)
        try:
            if (content and public_time is not None and author is not None
                    and title is not None
                    and public_time.startswith(spiderUtil.get_first_hour())):
                item = NewsAllItem()
                item["source"] = source
                item["content"] = content
                item["public_time"] = public_time
                item["url"] = response.url
                item["title"] = title
                item["author"] = author
                item["crawl_time"] = spiderUtil.get_time()
                item["html_size"] = html_size
                yield item
        except Exception:
            pass
class chinaNews(scrapy.Spider):
    """Spider for chinanews.com scroll-news pages."""

    name = "chinaSpider"
    start_url = "http://www.chinanews.com/scroll-news/news1.html"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(url=self.start_url,
                             callback=self.parse_item_list_news,
                             headers=self.header)

    def parse_item_list_news(self, response):
        """Follow every scroll-list entry that looks like a full article URL."""
        detail_urls = response.xpath(
            '//*[@id="content_right"]/div/ul/li/div/a/@href').extract()
        for detail_url in detail_urls:
            # Very short hrefs are section links, not articles.
            if len(detail_url) > 40:
                url = "http:" + str(detail_url)
                yield scrapy.Request(url=url, callback=self.parse,
                                     headers=self.header)

    def parse(self, response):
        """Parse one article; failed extractions leave fields None so the
        final guard skips the item (previously a swallowed NameError)."""
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return
        text = response.text
        html_size = sys.getsizeof(text)
        public_time = content = title = author = None
        try:
            # "2019年5月7日 10:30" -> "2019-5-7 10:30:00"
            public_time = re.search(
                r"(\d{4}年\d{1,2}月\d{1,2}日\s\d{1,2}:\d{1,2})",
                text).group(0).replace("年", "-").replace(
                    "月", "-").replace("日", "") + ":00"
        except Exception:
            # Missing timestamps are common here; deliberately not logged.
            pass
        try:
            content_arr = response.xpath(
                "//div[@class='left_zw']/p//text()").extract()
            content = "".join(content_arr).strip()
        except Exception:
            spiderUtil.log_level(7, response.url)
        try:
            title_arr = response.xpath(
                "//div/div/div/div/h1//text()").extract()
            title = "".join(title_arr)
        except Exception:
            spiderUtil.log_level(6, response.url)
        try:
            author_arr = response.xpath(
                "//div[@class='left-t']//text()").extract()
            author = "".join(author_arr)
            if author == "":
                author = "中国新闻网"
        except Exception:
            spiderUtil.log_level(9, response.url)
        source = "http://www.chinanews.com/"
        try:
            if (content and public_time is not None and title is not None
                    and author is not None
                    and public_time.startswith(spiderUtil.get_first_hour())):
                item = NewsAllItem()
                item["source"] = source
                item["content"] = content
                item["public_time"] = public_time
                item["url"] = response.url
                item["title"] = title
                item["author"] = author
                item["crawl_time"] = spiderUtil.get_time()
                item["html_size"] = html_size
                # Hand the item to the pipelines.
                yield item
        except Exception:
            pass
class bjNews(scrapy.Spider):
    """Spider for bjnews.com.cn (新京报)."""

    name = "bjnewsSpider"
    start_url = "http://www.bjnews.com.cn/"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(url=self.start_url,
                             callback=self.parse_item_home,
                             headers=self.header)

    def parse_item_home(self, response):
        """Fan out to every channel in the nav bar and the drop-down menu."""
        list_page_arr1 = response.xpath(
            "//div[@class='nav']/a/@href").extract()
        list_page_arr2 = response.xpath(
            "//div[@class='menu_drop_list']/a/@href").extract()
        for list_page in list_page_arr1:
            # Skip absolute links and the two video channels.
            if (not list_page.startswith("http")
                    and list_page != "wevideo" and list_page != "video"):
                yield scrapy.Request(url=response.url + list_page[1:],
                                     callback=self.parse_item_page_list)
        for list_page in list_page_arr2:
            yield scrapy.Request(url=response.url + list_page[1:],
                                 callback=self.parse_item_page_list)

    def parse_item_page_list(self, response):
        """Request the first two pages of each channel list."""
        for page in range(1, 3):
            yield scrapy.Request(url=response.url + "?page=" + str(page),
                                 callback=self.parse_item_news_list)

    def parse_item_news_list(self, response):
        """Follow every article link, making relative links absolute."""
        news_url_arr1 = response.xpath(
            "//ul[@id='news_ul']/li/a/@href").extract()
        news_url_arr2 = response.xpath(
            "//ul[@id='news_ul']/li/div/a/@href").extract()
        news_url_arr1.extend(news_url_arr2)
        for news_url in news_url_arr1:
            if not news_url.startswith("http"):
                news_url = "http://www.bjnews.com.cn" + news_url
            # Polite random delay between article requests.
            time.sleep(random.uniform(1, 2))
            yield scrapy.Request(url=news_url, callback=self.parse)

    def parse(self, response):
        """Parse one article; failed extractions leave fields None so the
        final guard skips the item (previously a swallowed NameError)."""
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return
        text = response.text
        html_size = sys.getsizeof(text)
        public_time = content = author = title = None
        try:
            public_time = re.search(
                r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                text).group(0)
        except Exception:
            spiderUtil.log_level(8, response.url)
        try:
            content_arr = response.xpath(
                "//div[@class='content']/p/text()").extract()
            content = "".join(content_arr)
        except Exception:
            spiderUtil.log_level(7, response.url)
        source = "http://www.bjnews.com.cn/"
        try:
            author = response.xpath(
                "//span[@class='author']/text()").extract()[0].strip()
        except Exception:
            spiderUtil.log_level(9, response.url)
        try:
            title = response.xpath(
                "//div[@class='title']/h1/text()").extract()[0]
        except Exception:
            spiderUtil.log_level(6, response.url)
        try:
            if (content and public_time is not None and author is not None
                    and title is not None
                    and public_time.startswith(spiderUtil.get_first_hour())):
                item = NewsAllItem()
                item["source"] = source
                item["content"] = content
                item["public_time"] = public_time
                item["url"] = response.url
                item["title"] = title
                item["author"] = author
                item["crawl_time"] = spiderUtil.get_time()
                item["html_size"] = html_size
                # Hand the item to the pipelines.
                yield item
        except Exception:
            pass
class bjNews(scrapy.Spider):
    """Spider for Baidu News channels (follows baijiahao article links)."""

    name = "baiduNewsSpider"
    start_url = "http://news.baidu.com.cn/"
    header = spiderUtil.header_util()

    # Channel slugs.  The original listed "auto" twice; the duplicate
    # request has been removed.
    channels = ("guonei", "guoji", "mil", "finance", "ent", "sports",
                "internet", "tech", "game", "lady", "auto", "house")

    def start_requests(self):
        for channel in self.channels:
            yield scrapy.Request(url="https://news.baidu.com/" + channel,
                                 callback=self.parse_item_home,
                                 headers=self.header)

    def parse_item_home(self, response):
        """Follow only links that point at baijiahao article pages."""
        detail_urls = response.xpath("//a/@href").extract()
        for detail_url in detail_urls:
            if detail_url.startswith("http://baijiahao.baidu.com"):
                # Polite random delay between article requests.
                time.sleep(random.uniform(1, 2))
                yield scrapy.Request(url=detail_url, callback=self.parse)

    def parse(self, response):
        """Parse one baijiahao article; failed extractions leave fields None
        so the final guard skips the item (previously a swallowed NameError)."""
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return
        text = response.text
        html_size = sys.getsizeof(text)
        public_time = content = author = title = None
        try:
            content_time = response.xpath(
                '//*[@id="article"]/div/div/div/span/text()').extract()
            # The page shows date and clock as two spans ("MM-DD", "HH:MM");
            # prepend the current year and append seconds.
            public_time = (str(time.strftime('%Y', time.localtime(time.time())))
                           + "-" + str(content_time[0]) + " "
                           + str(content_time[1]) + ":00")
        except Exception:
            # Missing timestamps are common here; deliberately not logged.
            pass
        try:
            content_arr = response.xpath(
                '//*[@id="article"]/div/p/span/text()').extract()
            content = "".join(content_arr)
        except Exception:
            spiderUtil.log_level(7, response.url)
        source = "http://news.baidu.com/"
        try:
            author_arr = response.xpath(
                '//*[@id="article"]/div/div/p/text()').extract()
            author = "".join(author_arr)
        except Exception:
            spiderUtil.log_level(9, response.url)
        try:
            title_arr = response.xpath(
                '//*[@id="article"]/div/h2/text()').extract()
            title = "".join(title_arr)
        except Exception:
            spiderUtil.log_level(6, response.url)
        try:
            if (content and public_time is not None and author is not None
                    and title is not None
                    and public_time.startswith(spiderUtil.get_first_hour())):
                item = NewsAllItem()
                item["source"] = source
                item["content"] = content
                item["public_time"] = public_time
                item["url"] = response.url
                item["title"] = title
                item["author"] = author
                item["crawl_time"] = spiderUtil.get_time()
                item["html_size"] = html_size
                yield item
        except Exception:
            pass
class wenmingNews(scrapy.Spider):
    """Spider for wenming.cn headline news."""

    name = "wenmingNewsSpider"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(url="http://www.wenming.cn/a/yw/",
                             callback=self.parse_item_home,
                             headers=self.header)

    def parse_item_home(self, response):
        """Queue a detail request for every article link on the list page."""
        detail_urls = response.xpath(
            "/html/body/div/div/ul/li/div/a/@href").extract()
        for detail_url in detail_urls:
            yield scrapy.Request(url=detail_url, callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        """Parse one article; failed extractions leave fields None so the
        final guard skips the item (previously a swallowed NameError)."""
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return
        text = response.text
        html_size = sys.getsizeof(text)
        public_time = content = author = title = None
        try:
            # Pages only carry a date; midnight is used as the time portion.
            # BUG FIX: the original concatenated "00:00:00" without a space,
            # producing timestamps like "2019-05-0700:00:00".
            public_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2})",
                                    text).group(0) + " 00:00:00"
        except Exception:
            spiderUtil.log_level(8, response.url)
        try:
            content_arr = response.xpath(
                "//div[@class='TRS_Editor']/div/p//text()").extract()
            content = "".join(content_arr).strip()
        except Exception:
            spiderUtil.log_level(7, response.url)
        source = "http://www.wenming.cn/"
        try:
            author_arr = response.xpath(
                "//div[@class='box01']/div[@class='fl']/a//text()").extract()
            author = "".join(author_arr).strip()
            if author == "":
                author = "文明网"
        except Exception:
            spiderUtil.log_level(9, response.url)
        try:
            title_arr = response.xpath(
                "//div[@id='title_tex']//text()").extract()
            title = "".join(title_arr).strip()
        except Exception:
            spiderUtil.log_level(6, response.url)
        try:
            # This spider filters on yesterday's date, not the current hour.
            if (content and public_time is not None and author is not None
                    and title is not None
                    and public_time.startswith(spiderUtil.get_yesterday_date())):
                item = NewsAllItem()
                item["source"] = source
                item["content"] = content
                item["public_time"] = public_time
                item["url"] = response.url
                item["title"] = title
                item["author"] = author
                item["crawl_time"] = spiderUtil.get_time()
                item["html_size"] = html_size
                yield item
        except Exception:
            pass
class dazhongNews(scrapy.Spider):
    """Spider for dzwww.com (大众网) domestic/world/society channels."""

    name = "dazhongSpider"
    start_url = [
        "http://www.dzwww.com/xinwen/guoneixinwen/",
        "http://www.dzwww.com/xinwen/guojixinwen/",
        "http://www.dzwww.com/xinwen/shehuixinwen/"
    ]
    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            yield scrapy.Request(url=url,
                                 callback=self.parse_item_list_news,
                                 headers=self.header)

    def parse_item_list_news(self, response):
        """Follow channel-relative ("./...") article links only."""
        detail_urls = response.xpath("//h3/a/@href").extract()
        for detail_url in detail_urls:
            if detail_url.startswith("./"):
                url = response.url + detail_url.replace("./", "")
                yield scrapy.Request(url=url, callback=self.parse,
                                     headers=self.header)

    def parse(self, response):
        """Parse one article; failed extractions leave fields None so the
        final guard skips the item (previously a swallowed NameError)."""
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return
        text = response.text
        html_size = sys.getsizeof(text)
        public_time = content = title = author = None
        try:
            public_time = re.search(
                r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                text).group(0)
        except Exception:
            # Missing timestamps are common here; deliberately not logged.
            pass
        try:
            content_arr = response.xpath(
                "//div/div/div/div/div/p/text()").extract()
            content = "".join(content_arr).strip()
        except Exception:
            spiderUtil.log_level(7, response.url)
        try:
            title_arr = response.xpath("//div/div/h2/text()").extract()
            title = "".join(title_arr)
        except Exception:
            spiderUtil.log_level(6, response.url)
        try:
            author_arr = response.xpath(
                '//*[@id="xl-headline"]/div/div/text()').extract()
            author = "".join(author_arr).strip()
            if author == "":
                author = "大众网"
            else:
                # Header line looks like "来源: <source> 作者: <name>".
                author = author.split("来源: ")[1].split("作者:")[0].strip()
        except Exception:
            # Also triggered when the markers are missing (IndexError);
            # `author` then keeps the raw joined text, as before.
            spiderUtil.log_level(9, response.url)
        source = "http://www.dzwww.com/"
        try:
            if (content and public_time is not None and title is not None
                    and author is not None
                    and public_time.startswith(spiderUtil.get_first_hour())):
                item = NewsAllItem()
                item["source"] = source
                item["content"] = content
                item["public_time"] = public_time
                item["url"] = response.url
                item["title"] = title
                item["author"] = author
                item["crawl_time"] = spiderUtil.get_time()
                item["html_size"] = html_size
                # Hand the item to the pipelines.
                yield item
        except Exception:
            pass
class huanqiuNews(scrapy.Spider):
    """Spider for huanqiu.com world/china article listings."""

    name = "huanqiuSpider"
    start_url = ["http://world.huanqiu.com/article/?agt=15438",
                 "http://china.huanqiu.com/article/?agt=15438"]
    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            yield scrapy.Request(url=url,
                                 callback=self.parse_item_list_news,
                                 headers=self.header)

    def parse_item_list_news(self, response):
        """Follow only hrefs that look like article URLs."""
        detail_urls = response.xpath(
            "/html/body/div/div/div/ul/li/h5/em/a/@href").extract()
        for detail_url in detail_urls:
            if "article" in detail_url:
                yield scrapy.Request(url=detail_url, callback=self.parse,
                                     headers=self.header)

    def parse(self, response):
        """Parse one article; failed extractions leave fields None so the
        final guard skips the item (previously a swallowed NameError)."""
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return
        text = response.text
        html_size = sys.getsizeof(text)
        public_time = content = title = author = None
        try:
            public_time = re.search(
                r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})",
                text).group(0) + ":00"
        except Exception:
            # Missing timestamps are common here; deliberately not logged.
            pass
        try:
            content_arr = response.xpath(
                "/html/body/div/div/div/div/div/p/text()").extract()
            content = "".join(content_arr).strip()
        except Exception:
            spiderUtil.log_level(7, response.url)
        try:
            title_arr = response.xpath(
                "/html/body/div/div/div/div/h1/text()").extract()
            title = "".join(title_arr).strip()
        except Exception:
            spiderUtil.log_level(6, response.url)
        try:
            author_arr = response.xpath(
                "/html/body/div/div/div/div/div/span/a/text()").extract()
            author = "".join(author_arr).strip()
            if author == "":
                author = "环球网"
        except Exception:
            spiderUtil.log_level(9, response.url)
        source = "http://www.huanqiu.com/"
        try:
            if (content and public_time is not None and title is not None
                    and author is not None
                    and public_time.startswith(spiderUtil.get_first_hour())):
                item = NewsAllItem()
                item["source"] = source
                item["content"] = content
                item["public_time"] = public_time
                item["url"] = response.url
                item["title"] = title
                item["author"] = author
                item["crawl_time"] = spiderUtil.get_time()
                item["html_size"] = html_size
                # Hand the item to the pipelines.
                yield item
        except Exception:
            pass
class fenghuangNews(scrapy.Spider):
    # Spider for news.ifeng.com; article data is embedded in the page as a
    # JavaScript "allData" object rather than in the HTML itself.
    name = "fenghuangNewsSpider"
    header = spiderUtil.header_util()

    def start_requests(self):
        url = "http://news.ifeng.com/"
        yield scrapy.Request(url=url, callback=self.parsepage, headers=self.header)

    def parsepage(self, response):
        # Follow every headline (h2) link; hrefs are protocol-relative.
        newslist = response.xpath("//h2/a/@href").extract()
        for url in newslist:
            newsurl = "http://" + url
            yield scrapy.Request(newsurl, callback=self.parsebody)

    def parsebody(self, response):
        # Parse one article by deserialising the embedded allData JSON blob.
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            all_arr = response.xpath("""//script//text()""").extract()
            # Take the JS between "allData = " and "var adData", then drop
            # the trailing ";" so the remainder parses as JSON.
            data = "".join(all_arr).split("allData = ")[1].split("var adData")[0].strip()[:-1]
            data = json.loads(data)
            doc = data['docData']
            try:
                public_time = doc['newsTime']
            except:
                spiderUtil.log_level(8, response.url)
            try:
                # contentList[0]['data'] holds HTML; flatten <p> text via lxml.
                content = doc['contentData']['contentList'][0]['data']
                content = "".join(etree.HTML(content).xpath("//p//text()")).strip()
            except:
                spiderUtil.log_level(7, response.url)
            source = "http://news.ifeng.com/"
            try:
                author = doc['source']
            except:
                spiderUtil.log_level(9, response.url)
            try:
                title = doc['title']
            except:
                spiderUtil.log_level(6, response.url)
            try:
                # NOTE(review): if any field above failed to extract, the
                # resulting NameError is swallowed here and no item is emitted.
                if content != "" and public_time.startswith(spiderUtil.get_first_hour()):
                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["html_size"] = html_size
                    item["crawl_time"] = spiderUtil.get_time()
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
class eastmoneyNews(scrapy.Spider):
    """Spider for finance.eastmoney.com headline news."""

    name = "eastmoneySpider"
    start_url = "http://finance.eastmoney.com/yaowen.html"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(url=self.start_url,
                             callback=self.parse_item_list_news,
                             headers=self.header)

    def parse_item_list_news(self, response):
        """Queue a detail request for every title link on the list page."""
        detail_urls = response.xpath(
            "//p[@class='title']/a/@href").extract()
        for detail_url in detail_urls:
            yield scrapy.Request(url=detail_url, callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        """Parse one article; failed extractions leave fields None so the
        final guard skips the item (previously a swallowed NameError)."""
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return
        text = response.text
        html_size = sys.getsizeof(text)
        public_time = content = title = author = None
        try:
            # "2019年5月7日 10:30" -> "2019-5-7 10:30:00"
            public_time = re.search(
                r"(\d{4}年\d{1,2}月\d{1,2}日\s\d{1,2}:\d{1,2})",
                text).group(0).replace("年", "-").replace(
                    "月", "-").replace("日", "") + ":00"
        except Exception:
            # Missing timestamps are common here; deliberately not logged.
            pass
        try:
            content_arr = response.xpath(
                "//div[@id='ContentBody']/p//text()").extract()
            content = "".join(content_arr).strip()
        except Exception:
            spiderUtil.log_level(7, response.url)
        try:
            title_arr = response.xpath("//h1//text()").extract()
            title = "".join(title_arr)
        except Exception:
            spiderUtil.log_level(6, response.url)
        try:
            author_arr = response.xpath(
                "//div[@class='source data-source']//text()").extract()
            author = "".join(author_arr).replace("来源:", "").strip()
            if author == "":
                author = "东方财富网"
        except Exception:
            spiderUtil.log_level(9, response.url)
        source = "http://finance.eastmoney.com/"
        try:
            if (content and public_time is not None and title is not None
                    and author is not None
                    and public_time.startswith(spiderUtil.get_first_hour())):
                item = NewsAllItem()
                item["source"] = source
                item["content"] = content
                item["public_time"] = public_time
                item["url"] = response.url
                item["title"] = title
                item["author"] = author
                item["crawl_time"] = spiderUtil.get_time()
                item["html_size"] = html_size
                # Hand the item to the pipelines.
                yield item
        except Exception:
            pass
class wangyiNews(scrapy.Spider):
    """Spider for 163.com money-channel ranking news."""

    name = "wangyiNewsSpider"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(
            url="http://money.163.com/special/002526BH/rank.html",
            callback=self.parsepage,
            headers=self.header)

    def parsepage(self, response):
        """Follow each ranked link, carrying the list title in the meta."""
        newslist = response.xpath("//div/table/tr/td/a")
        for news in newslist:
            url = news.xpath("./@href").extract()[0]
            title = news.xpath("./text()").extract()[0].strip()
            yield scrapy.Request(url, callback=self.parsebody,
                                 meta={"title": title})

    def parsebody(self, response):
        """Parse one article; failed extractions leave fields None so the
        final guard skips the item (previously a swallowed NameError)."""
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return
        text = response.text
        html_size = sys.getsizeof(text)
        public_time = content = author = None
        try:
            public_time = re.search(
                r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                text).group(0)
        except Exception:
            spiderUtil.log_level(8, response.url)
        try:
            content_arr = response.xpath(
                "//div[@id='endText']/p/text()").extract()
            content = "".join(content_arr).strip()
        except Exception:
            spiderUtil.log_level(7, response.url)
        try:
            author_arr = response.xpath(
                "//a[@id='ne_article_source']//text()").extract()
            author = "".join(author_arr).strip()
            if author == "":
                author = "网易新闻"
        except Exception:
            spiderUtil.log_level(9, response.url)
        source = "https://news.163.com/"
        try:
            if (content and public_time is not None and author is not None
                    and public_time.startswith(spiderUtil.get_first_hour())):
                item = NewsAllItem()
                item["source"] = source
                item["content"] = content
                item["public_time"] = public_time
                item["url"] = response.url
                # Title was captured on the ranking page.
                item["title"] = response.meta["title"]
                item["author"] = author
                item["html_size"] = html_size
                item["crawl_time"] = spiderUtil.get_time()
                yield item
        except Exception:
            pass
class ceNews(scrapy.Spider):
    # Spider for ce.cn (中国经济网) international channel.
    name = "ceSpider"
    start_url = "http://intl.ce.cn/"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(url=self.start_url, callback=self.parse_item_home, headers=self.header)

    def parse_item_home(self, response):
        # Follow every category in the top navigation, except specials.
        kinds_arr = response.xpath("//div[@class='ceallnava']/ul/li/a/@href").extract()
        for kinds in kinds_arr:
            if kinds != "http://intl.ce.cn/specials/":
                yield scrapy.Request(url=kinds, callback=self.parse_item_kinds)

    def parse_item_kinds(self, response):
        # Re-request the same category URL (dont_filter bypasses the dupe
        # filter) so it flows through the list parser.
        yield scrapy.Request(url=response.url, callback=self.parse_item_page_list, dont_filter=True)

    def parse_item_page_list(self, response):
        # Resolve each article link; relative links ("../../", "../", "./")
        # are rebuilt by hand from segments of the category URL.
        news_url_arr = response.xpath("//span[@class='f1']/a/@href").extract()
        for news_url in news_url_arr:
            if news_url.startswith("http") and "more" not in news_url:
                yield scrapy.Request(url=news_url, callback=self.parse)
            else:
                head = response.url.split("/")
                if news_url.startswith("../../"):
                    # Two levels up -> scheme + host only.
                    news_url = "/".join(head[:3]) + "/" + news_url.replace("../../", "")
                    yield scrapy.Request(url=news_url, callback=self.parse)
                elif news_url.startswith("../"):
                    # One level up -> keep the first path segment.
                    news_url = "/".join(head[:4]) + "/" + news_url.replace("../", "")
                    yield scrapy.Request(url=news_url, callback=self.parse)
                elif news_url.startswith("./"):
                    # Same directory as the list's index page.
                    news_url = response.url.split("index")[0] + news_url.replace("./", "")
                    yield scrapy.Request(url=news_url, callback=self.parse)

    def parse(self, response):
        # Parse one article page into a NewsAllItem.
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                content_arr = response.xpath("//div[@id='articleText']//text()").extract()
                content = "".join(content_arr).strip()
                # content = "".join(content_tmp.split())
            except:
                spiderUtil.log_level(7, response.url)
            try:
                # "2019年5月7日 10:30" -> "2019-5-7 10:30:00"
                public_time = (re.search(r"(\d{4}年\d{1,2}月\d{1,2}日\s\d{1,2}:\d{1,2})", response.text).group(
                    0) + ":00").replace(
                    "年", "-").replace("月", "-").replace("日", "")
            except:
                spiderUtil.log_level(8, response.url)
            source = "http://www.ce.cn/"
            try:
                # Page titles look like "<headline>_<site section>".
                title = response.xpath("//head/title/text()").extract()[0].split("_")[0]
            except:
                spiderUtil.log_level(6, response.url)
            try:
                author = response.xpath("//head/meta[@name='author']/@content").extract()[0]
            except:
                spiderUtil.log_level(9, response.url)
            try:
                # NOTE(review): if an extraction above failed, its variable is
                # unbound and the NameError raised here is swallowed -> no item.
                if public_time.startswith(spiderUtil.get_first_hour()) and content != "":
                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # Hand the item to the pipelines.
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
class bjNews(scrapy.Spider):
    """Spider for cankaoxiaoxi.com channel pages."""

    name = "ckxxNewsSpider"
    start_url = "http://www.cankaoxiaoxi.com/"
    header = spiderUtil.header_util()

    # Channel subdomains.  The original hard-coded six near-identical
    # requests and had a doubled slash in "http://finance.cankaoxiaoxi.com//".
    channels = ("china", "world", "mil", "finance", "culture", "science")

    def start_requests(self):
        for channel in self.channels:
            yield scrapy.Request(url="http://%s.cankaoxiaoxi.com/" % channel,
                                 callback=self.parse_item_home,
                                 headers=self.header)

    def parse_item_home(self, response):
        """Queue a detail request for every article link on the list page."""
        detail_urls = response.xpath(
            '//*[@id="allList"]/div/div/div/div/p/a/@href').extract()
        for detail_url in detail_urls:
            # Polite random delay between article requests.
            time.sleep(random.uniform(1, 2))
            yield scrapy.Request(url=detail_url, callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        """Parse one article; failed extractions leave fields None so the
        final guard skips the item (previously a swallowed NameError)."""
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return
        text = response.text
        html_size = sys.getsizeof(text)
        public_time = content = author = title = None
        try:
            content_time = response.xpath(
                '//*[@id="pubtime_baidu"]/text()').extract()
            public_time = str(content_time[0])
        except Exception:
            spiderUtil.log_level(8, response.url)
        try:
            contents = response.xpath(
                '//*[@id="allList"]/div/div/div/p/text()').extract()
            content = "".join(contents)
        except Exception:
            spiderUtil.log_level(7, response.url)
        source = "http://www.cankaoxiaoxi.com/"
        try:
            author = str(
                response.xpath('//*[@id="source_baidu"]/text()')
                .extract()[0].strip()).replace("来源:", "")
        except Exception:
            spiderUtil.log_level(9, response.url)
        try:
            title = response.xpath("//div/div/h1/text()").extract()[0]
        except Exception:
            spiderUtil.log_level(6, response.url)
        try:
            if (content and public_time is not None and author is not None
                    and title is not None
                    and public_time.startswith(spiderUtil.get_first_hour())):
                item = NewsAllItem()
                item["source"] = source
                item["content"] = content
                item["public_time"] = public_time
                item["url"] = response.url
                item["title"] = title
                item["author"] = author
                item["crawl_time"] = spiderUtil.get_time()
                item["html_size"] = html_size
                yield item
        except Exception:
            pass
class sohuNews(scrapy.Spider):
    """Spider for xinhuanet.com via the qc.wa.news.cn list API."""

    name = "xinhuaNewsSpider"
    header = spiderUtil.header_util()

    def start_requests(self):
        # Page through the list API, up to 200 records per request.
        # BUG FIX: the original reassigned the template variable `url` with
        # its own formatted result, so from the second iteration on the
        # placeholders were gone and every request used the identical
        # first-page URL (which scrapy's dupe filter then dropped).
        template = 'http://qc.wa.news.cn/nodeart/list?nid=11147664&pgnum={}&cnt={}&tp=1&orderby=1'
        num = 30000
        pgnum = 1
        while num > 0:
            cnt = (num - 1) % 200 + 1
            page_url = template.format(pgnum, cnt)
            pgnum += 1
            num -= cnt
            yield scrapy.Request(url=page_url,
                                 callback=self.parse_item_home,
                                 headers=self.header)

    def parse_item_home(self, response):
        """Decode the wrapped list payload and schedule each article.

        Fields that fail to extract are logged and left as None so one bad
        record no longer aborts the whole page (the original raised NameError
        while building the meta dict, killing the remaining records).
        """
        data_str = response.text
        data_str = data_str[1:-1]  # strip the wrapping parentheses
        # WARNING: eval() on remote data is unsafe; kept for compatibility
        # because the payload is not strictly valid JSON (the Dummy globals
        # turn unquoted identifiers into their own names).
        data_str = eval(data_str,
                        type('Dummy', (dict,), dict(__getitem__=lambda s, n: n))())
        data_str = json.dumps(data_str)
        data_str = json.loads(data_str)
        records = data_str['data']['list']
        for r in records:
            public_time = author = title = None
            try:
                public_time = datetime.strptime(r['PubTime'], '%Y-%m-%d %H:%M:%S')
            except Exception:
                spiderUtil.log_level(8, response.url)
            try:
                author = str(r['Author'])
            except Exception:
                spiderUtil.log_level(9, response.url)
            try:
                title = str(r['Title'])
            except Exception:
                spiderUtil.log_level(6, response.url)
            r_url = r['LinkUrl']
            yield scrapy.Request(url=r_url, callback=self.parse,
                                 headers=self.header,
                                 meta={"public_time": public_time,
                                       "title": title,
                                       "author": author})

    def parse(self, response):
        """Parse an article body; metadata rides in via response.meta."""
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return
        text = response.text
        html_size = sys.getsizeof(text)
        content = None
        try:
            content_arr = response.xpath(
                "//div[contains(@id,'detail')]//p/text()").extract()
            content = "".join(content_arr)
        except Exception:
            spiderUtil.log_level(7, response.url)
        source = "http://www.xinhuanet.com/"
        try:
            if content and str(response.meta["public_time"]).startswith(
                    spiderUtil.get_first_hour()):
                item = NewsAllItem()
                item["source"] = source
                item["content"] = content
                item["public_time"] = response.meta["public_time"]
                item["url"] = response.url
                item["title"] = response.meta["title"]
                item["author"] = response.meta["author"]
                item["html_size"] = html_size
                item["crawl_time"] = spiderUtil.get_time()
                yield item
        except Exception:
            pass
class cyolOld(scrapy.Spider):
    # Spider for zqbapp.cyol.com (中国青年报 mobile client API).
    name = "cyolSpider"
    # Channel (tid) identifiers passed to the news-list API.
    ids = {
        767, 548, 777, 885, 895, 886, 785, 797, 803, 804, 811, 812, 814,
        818, 826, 834, 837, 842, 550, 849
    }
    header = spiderUtil.header_util()

    def start_requests(self):
        # One request per channel; only the first page of each is fetched.
        tmp = "https://zqbapp.cyol.com/zqzxapi/api.php?s=/Web/getNewsListCache/version/3.0.8/tid/%s/page/%s"
        for id in self.ids:
            for page in range(1, 2):
                url = tmp % (id, page)
                yield scrapy.Request(url=url, callback=self.parse_item_news_list, headers=self.header)

    def parse_item_news_list(self, response):
        # The API returns JSONP; pull the JSON payload out of the parentheses.
        p = re.compile(r'[(](.*)[)]', re.S)
        r = re.findall(p, response.body.decode('utf-8'))[0]
        json_loads = json.loads(r)
        data = json_loads["data"]
        for i in data:
            newsurl = i['newsurl']
            yield scrapy.Request(url=newsurl, callback=self.parse)

    def parse(self, response):
        # Parse one article page into a NewsAllItem.
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                content_arr = response.xpath(
                    "//div[@class='section-main']/p/text()").extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)
            try:
                # Pages only show "MM-DD HH:MM"; the year is recovered from a
                # URL path segment (presumably ".../YYYYMM/...", with [:-2]
                # stripping the month -- TODO confirm the path layout).
                public_time = re.search(r"(\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})",
                                        response.text).group(0) + ":00"
                public_time = response.url.split(
                    "/")[-3][:-2] + "-" + public_time
            except:
                # spiderUtil.log_level(8, response.url)
                pass
            try:
                title_arr = response.xpath("//head/title//text()").extract()
                title = "".join(title_arr).strip()
            except:
                spiderUtil.log_level(6, response.url)
            try:
                author_arr = response.xpath(
                    "//span[@id='copyfrom']//text()").extract()
                author = "".join(author_arr).strip()
                if author == "":
                    author = "中青在线"
            except:
                spiderUtil.log_level(9, response.url)
            source = "http://www.cyol.com/"
            try:
                # Require a substantial body (>= 100 chars) and a publish
                # time inside the current crawl window.  NOTE(review): unbound
                # fields from failed extractions raise NameError here, which
                # is swallowed -> no item.
                if content != "" and len(
                        content) >= 100 and public_time.startswith(
                        spiderUtil.get_first_hour()):
                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # Hand the item to the pipelines.
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
class sohuNews(scrapy.Spider):
    """Spider for xuexi.cn articles listed in a static JSON feed."""

    name = "xuexiqiangguoNewsSpider"
    header = spiderUtil.header_util()
    # Kept for interface compatibility; parse() builds its own local item.
    item = NewsAllItem()

    def start_requests(self):
        home_url = 'https://www.xuexi.cn/lgdata/1jscb6pu1n2.json?_st=26044379'
        yield scrapy.Request(url=home_url, callback=self.parse_item_home,
                             headers=self.header)

    def parse_item_home(self, response):
        """Schedule one boot-source article request per feed record."""
        data_str = response.text
        data_str = data_str[1:-1]  # strip the wrapping characters
        # WARNING: eval() on remote data is unsafe; kept for compatibility
        # because the feed is not strictly valid JSON (the Dummy globals
        # turn unquoted identifiers into their own names).
        data_str = eval(data_str,
                        type('Dummy', (dict,), dict(__getitem__=lambda s, n: n))())
        data_str = json.dumps(data_str)
        data_str = json.loads(data_str)
        for r in data_str:
            public_time = author = title = None
            try:
                public_time = str(r['publishTime'])
            except Exception:
                spiderUtil.log_level(8, response.url)
            try:
                author = str(r['source'])
            except Exception:
                spiderUtil.log_level(9, response.url)
            try:
                title = str(r['title'])
            except Exception:
                spiderUtil.log_level(6, response.url)
            try:
                # BUG FIX: the original read an undefined variable `r_url`
                # here, so a NameError was silently swallowed and every
                # record was skipped.  The article id is taken from the
                # record's own url field (assumed key 'url' -- TODO confirm
                # against a live feed sample).
                article_id = str(r['url']).split("id=")[-1]
                r_url = ("https://boot-source.xuexi.cn/data/app/"
                         + article_id + ".js?callback=callback")
                yield scrapy.Request(url=r_url, callback=self.parse,
                                     headers=self.header,
                                     meta={"public_time": public_time,
                                           "title": title,
                                           "author": author})
            except Exception:
                pass

    def parse(self, response):
        """Parse one JSONP article payload; metadata rides in via meta."""
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return
        text = response.text
        html_size = sys.getsizeof(text)
        # Strip the "callback(" prefix and the closing ")" of the JSONP body.
        data_str = text[9:-1]
        data_str = eval(data_str,
                        type('Dummy', (dict,), dict(__getitem__=lambda s, n: n))())
        data_str = json.dumps(data_str)
        data_str = json.loads(data_str)
        content = None
        try:
            content = str(data_str['normalized_content'])
        except Exception:
            spiderUtil.log_level(7, response.url)
        source = "http://www.xuexi.cn/"
        try:
            if content and str(response.meta["public_time"]).startswith(
                    spiderUtil.get_first_hour()):
                item = NewsAllItem()
                item["source"] = source
                item["content"] = content
                item["public_time"] = response.meta["public_time"]
                item["url"] = response.url
                item["title"] = response.meta["title"]
                item["author"] = response.meta["author"]
                item["html_size"] = html_size
                item["crawl_time"] = spiderUtil.get_time()
                yield item
        except Exception:
            pass
class qqNews(scrapy.Spider):
    """Spider for qq.com news fetched through the pacaio match API."""

    name = "qqNewsSpider"
    taglist = [
        'ent', 'sports', 'finance', 'tech', 'news', 'sports_nba', 'fashion'
    ]
    header = spiderUtil.header_util()

    def start_requests(self):
        # One API call per channel tag; the key embeds today's date (YYYYMMDD).
        for tag in self.taglist:
            date_part = spiderUtil.get_time()[:-9].replace("-", "")
            api_url = ('https://pacaio.match.qq.com/openapi/json?key='
                       + tag + ':' + date_part + '&num=50')
            yield scrapy.Request(url=api_url, callback=self.parsepage,
                                 headers=self.header)

    def parsepage(self, response):
        """Schedule a body request for every entry in the API response."""
        payload = json.loads(response.text)
        for entry in payload['data']:
            request_meta = {
                'title': entry['title'],
                'public_time': entry['publish_time'],
                'author': entry['source'],
            }
            yield scrapy.Request(entry['url'], callback=self.parsebody,
                                 meta=request_meta)

    def parsebody(self, response):
        """Emit a NewsAllItem when the body is non-empty and the article is
        inside the current crawl window."""
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return
        page_text = response.text
        html_size = sys.getsizeof(page_text)
        try:
            paragraphs = response.xpath(
                "//div[@class='content-article']/p//text()").extract()
            content = "".join(paragraphs).strip()
        except:
            spiderUtil.log_level(7, response.url)
        source = "https://news.qq.com/"
        try:
            is_fresh = str(response.meta["public_time"]).startswith(
                spiderUtil.get_first_hour())
            if content != "" and is_fresh:
                item = NewsAllItem()
                item["source"] = source
                item["content"] = content
                item["public_time"] = response.meta["public_time"]
                item["url"] = response.url
                item["title"] = response.meta["title"]
                item["author"] = response.meta["author"]
                item["html_size"] = html_size
                item["crawl_time"] = spiderUtil.get_time()
                yield item
        except:
            pass
class bjNews(scrapy.Spider):
    """Spider for www.71.cn: crawls several column index pages and parses
    each linked article."""
    name = "71cnNewsSpider"
    start_url = "http://www.71.cn/"
    header = spiderUtil.header_util()

    # Column index pages, in the original crawl order.  Collapsing the 14
    # copy-pasted ``yield scrapy.Request(...)`` statements into data removes
    # the duplication without changing the requests issued.
    COLUMN_URLS = [
        "http://www.71.cn/acastudies/bjyw/",
        "http://www.71.cn/acastudies/expcolumn/economy/",
        "http://www.71.cn/acastudies/expcolumn/politics/",
        "http://www.71.cn/acastudies/expcolumn/culture/",
        "http://www.71.cn/acastudies/expcolumn/community/",
        "http://www.71.cn/acastudies/expcolumn/ecology/",
        "http://www.71.cn/acastudies/expcolumn/dangjian/",
        "http://www.71.cn/acastudies/expcolumn/law/",
        "http://www.71.cn/acastudies/expcolumn/keji/",
        "http://www.71.cn/acastudies/expcolumn/jiaoyu/",
        "http://www.71.cn/acastudies/expcolumn/nationaldefense/",
        "http://www.71.cn/acastudies/expcolumn/international/",
        "http://www.71.cn/acastudies/expcolumn/history/",
        "http://www.71.cn/acastudies/impremarks/",
    ]

    def start_requests(self):
        for url in self.COLUMN_URLS:
            yield scrapy.Request(url=url,
                                 callback=self.parse_item_home,
                                 headers=self.header)

    def parse_item_home(self, response):
        """Collect article links from a column index page."""
        detail_urls = response.xpath(
            """/html/body/div/div/div/div/ul/li/div/a/@href""").extract()
        for detail_url in detail_urls:
            # Polite throttle between detail requests.
            time.sleep(random.uniform(1, 2))
            yield scrapy.Request(url=detail_url, callback=self.parse)

    def parse(self, response):
        """Extract time/content/author/title; emit current-hour items."""
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                content_time = response.xpath(
                    """//*[@id="main"]/div/div/div/div/div/span[1]/text()"""
                ).extract()
                # Page shows minute precision; append seconds.
                public_time = str(str(content_time[0]) + ":00")
            except:
                spiderUtil.log_level(8, response.url)
            try:
                content_arr = response.xpath(
                    """//*[@id="article-content"]/p/text()""").extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)
            source = "http://www.71.cn/"
            try:
                author_arr = response.xpath(
                    """//*[@id="main"]/div/div/div/div/div/span[2]/text()"""
                ).extract()
                author = "".join(author_arr)
            except:
                spiderUtil.log_level(9, response.url)
            try:
                title = response.xpath(
                    """//*[@id="main"]/div/div/div/div/h1/text()"""
                ).extract()[0]
            except:
                spiderUtil.log_level(6, response.url)
            try:
                if content != "" and public_time.startswith(
                        spiderUtil.get_first_hour()):
                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # Debug output kept from the original.
                    print(content, public_time, title, author)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
class cctvNews(scrapy.Spider):
    """Spider for news.cctv.com: scans the home page for dated article
    links and parses each article."""
    name = "cctvNewsSpider"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(
            url="http://news.cctv.com/?spm=C96370.PsikHJQ1ICOX.Eu7sfGTzJUS0.1",
            callback=self.parse_item_home,
            headers=self.header)

    def parse_item_home(self, response):
        """Queue every dated article link found on the home page."""
        detail_urls = response.xpath(
            """//a[starts-with(@href,'http://news.cctv.com/20')]/@href"""
        ).extract()
        for detail_url in detail_urls:
            # Very long URLs are filtered out (presumably media/special
            # pages rather than plain articles — confirm against the site).
            if len(detail_url) < 70:
                time.sleep(random.uniform(1, 2))   # polite crawl delay
                yield scrapy.Request(url=detail_url,
                                     callback=self.parse,
                                     headers=self.header)

    def parse(self, response):
        """Extract the article fields; emit items for current-hour news."""
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                # "YYYY年MM月DD日 HH:MM" -> "YYYY-MM-DD HH:MM:00"
                public_time = re.search(
                    r"(\d{4}年\d{1,2}月\d{1,2}日\s\d{1,2}:\d{1,2})",
                    response.text).group(0).replace("年", "-").replace(
                        "月", "-").replace("日", "") + ":00"
            except:
                spiderUtil.log_level(8, response.url)
            try:
                content_arr = response.xpath(
                    """/html/body/div/div/div/p/text()""").extract()
                content = "".join(content_arr).strip()
            except:
                spiderUtil.log_level(7, response.url)
            # BUG FIX: the original set "http://www.southcn.com/" here — a
            # copy-paste from the Southern Net spider; this spider crawls
            # CCTV news.
            source = "http://news.cctv.com/"
            try:
                author_arr = response.xpath(
                    """/html/body/div/div/div/div/span/i/a/text()"""
                ).extract()
                author = "".join(author_arr).strip()
                if author == "":
                    author = "央视网"
                else:
                    author = author.replace("来源:", "")
            except:
                spiderUtil.log_level(9, response.url)
            try:
                title_arr = response.xpath(
                    """/html/body/div/div/div/h1/text()""").extract()
                title = "".join(title_arr).strip()
            except:
                spiderUtil.log_level(6, response.url)
            try:
                if content != "" and public_time.startswith(
                        spiderUtil.get_first_hour()):
                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
class cniiNews(scrapy.Spider):
    """Spider for www.cnii.com.cn: walks several section list pages and
    parses the linked articles."""
    name = "cniiSpider"
    start_url = ["http://www.cnii.com.cn/node_33989.htm",
                 "http://www.cnii.com.cn/node_34000.htm",
                 "http://www.cnii.com.cn/telecom/node_34020.htm",
                 "http://www.cnii.com.cn/city/node_34051.htm"]
    header = spiderUtil.header_util()

    def start_requests(self):
        for section_url in self.start_url:
            yield scrapy.Request(url=section_url,
                                 callback=self.parse_item_list_news,
                                 headers=self.header)

    def parse_item_list_news(self, response):
        """Schedule a parse request for every article on a list page."""
        for href in response.xpath(
                "//ul[@class='list2']/li/a/@href").extract():
            time.sleep(random.uniform(1, 3))   # polite crawl delay
            # Relative links resolve against the node URL's base path.
            yield scrapy.Request(url=response.url.split("node")[0] + href,
                                 callback=self.parse)

    def parse(self, response):
        """Extract the article fields; emit an item for recent news."""
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return
        text = response.text
        html_size = sys.getsizeof(text)
        try:
            public_time = re.search(
                r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                response.text).group(0)
        except:
            # A missing timestamp is tolerated; the filter below skips it.
            pass
        try:
            body_parts = response.xpath(
                "//div[@class='conzw']//text()").extract()
            content = "".join(body_parts).strip()
        except:
            spiderUtil.log_level(7, response.url)
        try:
            title_parts = response.xpath("//head/title//text()").extract()
            title = "".join(title_parts).split("_")[0]
        except:
            spiderUtil.log_level(6, response.url)
        try:
            author = "".join(
                response.xpath("//div[@class='conzz']//text()").extract())
            if author == "":
                author = "中国信息产业网"
        except:
            spiderUtil.log_level(9, response.url)
        source = "http://www.cnii.com.cn/"
        try:
            if content != "" and public_time.startswith(
                    spiderUtil.get_first_hour()):
                item = NewsAllItem()
                item["source"] = source
                item["content"] = content
                item["public_time"] = public_time
                item["url"] = response.url
                item["title"] = title
                item["author"] = author
                item["crawl_time"] = spiderUtil.get_time()
                item["html_size"] = html_size
                # Debug output kept from the original; the item is then
                # handed to the pipelines.
                print(item)
                yield item
        except:
            pass
class sohuNews(scrapy.Spider):
    """Spider for Sohu news: polls the channel feed API and parses each
    linked article page."""
    name = "sohuNewsSpider"
    header = spiderUtil.header_util()

    def start_requests(self):
        # sceneId numbers of the channel feeds to poll.
        url_list = [
            15, 10, 9, 8, 17, 18, 19, 23, 24, 25, 26, 27, 28, 29, 30, 34,
            38, 39, 40, 41, 42, 43, 44, 45, 46, 47
        ]
        for i in url_list:
            url = 'http://v2.sohu.com/public-api/feed?scene=CHANNEL&sceneId=' + str(
                i) + '&page=1&size=80'
            yield scrapy.Request(url=url,
                                 callback=self.parse_item_home,
                                 headers=self.header)

    def parse_item_home(self, response):
        """Parse the feed and schedule one request per article.

        FIX: the original re-downloaded ``response.url`` synchronously with
        urllib — a second, blocking fetch of a page scrapy had already
        retrieved.  The scrapy response body is parsed directly instead.
        """
        data_str = response.text
        data_str = data_str[1:-1]
        # SECURITY NOTE: eval() on remote data is unsafe; kept only because
        # the feed is not strict JSON.  The Dummy mapping turns bare
        # identifiers into their own names instead of raising NameError.
        data_str = eval(
            data_str,
            type('Dummy', (dict, ), dict(__getitem__=lambda s, n: n))())
        data_str = json.dumps(data_str)
        data_str = json.loads(data_str)
        for r in data_str:
            try:
                # Feed timestamps are in milliseconds since the epoch.
                public_time_rt = datetime.fromtimestamp(r['publicTime'] //
                                                        1000)
                public_time = datetime.strftime(public_time_rt,
                                                '%Y-%m-%d %H:%M:%S')
            except:
                spiderUtil.log_level(8, response.url)
            try:
                author = str(r['authorName'])
            except:
                spiderUtil.log_level(9, response.url)
            try:
                title = str(r['title'])
            except:
                spiderUtil.log_level(6, response.url)
            url = 'http://www.sohu.com/a/' + str(r['id']) + '_' + str(
                r['authorId'])
            yield scrapy.Request(url=url,
                                 callback=self.parse,
                                 headers=self.header,
                                 meta={
                                     'public_time': public_time,
                                     'url': url,
                                     'title': title,
                                     'author': author
                                 })

    def parse(self, response):
        """Extract article text; emit an item for current-hour articles.

        BUG FIX: the original accessed ``response.mete`` (four times) — an
        AttributeError swallowed by the bare except, so no item was ever
        yielded.  The attribute is ``response.meta``.
        """
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                content_arr = response.xpath(
                    """//article//p//text()""").extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)
            source = "http://news.sohu.com/"
            try:
                if content != "" and str(
                        response.meta['public_time']).startswith(
                            spiderUtil.get_first_hour()):
                    item = NewsAllItem()
                    item["public_time"] = response.meta['public_time']
                    item["url"] = response.meta['url']
                    item["title"] = response.meta['title']
                    item["author"] = response.meta['author']
                    item["source"] = source
                    item["content"] = content
                    item["html_size"] = html_size
                    item["crawl_time"] = spiderUtil.get_time()
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
class xuanjiangjiaNews(scrapy.Spider):
    """Spider for www.188cf.net (188 Fortune) stock-news listings."""
    name = "188cfwNewsSpider"
    start_url = "http://www.188cf.net/"
    header = spiderUtil.header_util()

    def start_requests(self):
        # Site-specific request headers.  NOTE(review): the 'Accepy' key
        # looks like a typo for 'Accept' — kept as-is to preserve the exact
        # request the site currently accepts.
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36',
            'Accepy':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'Host': 'www.188cf.net',
            'Upgrade-Insecure-Requests': '1',
            'Referer': 'http://www.188cf.net/'
        }
        # Anti-bot session cookies captured from a browser session.
        Cookie = {
            'security_session_verify': '87456e2f5288fc9e54c5508487c724ae',
            'security_session_mid_verify': '2a568bbe22a10d992a688d95dc309586'
        }
        # Only the individual-stocks section is enabled; the other sections
        # (gupiao/licai/jijin/qihuo/...) were disabled in the original.
        yield scrapy.Request(url="http://www.188cf.net/gegu/",
                             callback=self.parse_item_home,
                             headers=headers,
                             cookies=Cookie)

    def parse_item_home(self, response):
        """Queue a detail-page parse for every article link on the list.

        BUG FIX: the original registered ``self.parse_item_home`` as the
        detail-page callback, so article pages were re-parsed as list pages
        and ``parse`` was never invoked.
        """
        detail_urls = response.xpath("""//div[@class="bt"]//@href""").extract()
        for detail_url in detail_urls:
            time.sleep(random.uniform(1, 2))   # polite crawl delay
            yield scrapy.Request(url=detail_url,
                                 callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        """Extract the article fields and emit current-hour items."""
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                content_time = response.xpath(
                    """//div[@class="info"]//text()""").extract()
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                    content_time[0]).group(0)
            except:
                spiderUtil.log_level(8, response.url)
            try:
                content_arrs = response.xpath(
                    """//td[@class="content"]//p//text()""").extract()
                # BUG FIX: extract() returns a list, which has no .split()
                # (the AttributeError was swallowed by the bare except, so
                # content was never produced).  Join the fragments first,
                # then cut everything after the "推荐信息" marker.
                content = "".join(content_arrs).split('推荐信息')[0]
            except:
                spiderUtil.log_level(7, response.url)
            source = "http://www.188cf.net/"
            try:
                author = "188财富网"
            except:
                spiderUtil.log_level(9, response.url)
            try:
                title = response.xpath("""//h1//text()""").extract()[0]
            except:
                spiderUtil.log_level(6, response.url)
            try:
                if content != "" and str(public_time).startswith(
                        spiderUtil.get_first_hour()):
                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
class xuanjiangjiaNews(scrapy.Spider):
    """Spider for rednet.cn (Jishou channel).

    NOTE: ``yield item`` is commented out in this spider — it only prints
    the assembled item, matching the original's debug state.
    """
    name = "rednetNewsSpider"
    start_url = "http://www.rednet.cn/"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(url="https://jishou.rednet.cn/channel/7250.html",
                             callback=self.parse_item_home,
                             headers=self.header)

    def parse_item_home(self, response):
        """Queue every article link found in the channel news list."""
        for link in response.xpath(
                """//*[@id="div_newsList"]/ul/li/a/@href""").extract():
            time.sleep(random.uniform(1, 2))   # polite crawl delay
            yield scrapy.Request(url=link,
                                 callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        """Scrape one article page and print the assembled item."""
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return
        text = response.text
        html_size = sys.getsizeof(text)
        try:
            stamp = response.xpath(
                "//main/section/section/div/span[4]/text()").extract()
            # Page shows minute precision; append seconds.
            public_time = str(str(stamp[0]) + ":00")
        except:
            spiderUtil.log_level(8, response.url)
        try:
            paragraphs = response.xpath(
                "/html/body/main/section/section/article/section/p/text()"
            ).extract()
            content = "".join(paragraphs)
        except:
            spiderUtil.log_level(7, response.url)
        source = "http://www.rednet.cn/"
        try:
            author = "".join(
                response.xpath(
                    "//main/section/section/div/span[1]/text()").extract()
            ).strip()
            if author == "":
                author = "红网"
            else:
                author = author.split("来源:")[1]
        except:
            spiderUtil.log_level(9, response.url)
        try:
            title = "".join(
                response.xpath(
                    "//main/section/section/h1/text()").extract()).strip()
        except:
            spiderUtil.log_level(6, response.url)
        try:
            # Hour filter is disabled in the original; build and print only.
            item = NewsAllItem()
            item["source"] = source
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
            print(item)
            # yield item
        except:
            pass
class ceNews(scrapy.Spider):
    """Spider for chinadaily.com.cn: pages through several channel lists
    and parses every linked article."""
    name = "chinadailySpider"
    start_url = [
        "http://china.chinadaily.com.cn/5bd5639ca3101a87ca8ff636",
        "http://china.chinadaily.com.cn/5bd5639ca3101a87ca8ff62e",
        "http://world.chinadaily.com.cn/5bd55927a3101a87ca8ff610",
        "http://world.chinadaily.com.cn/5bda6641a3101a87ca904fe6",
        "http://caijing.chinadaily.com.cn/finance/",
        "http://cn.chinadaily.com.cn/lvyou/5b7628c6a310030f813cf48a",
        "http://cn.chinadaily.com.cn/lvyou/5b7628c6a310030f813cf48c",
        "http://cn.chinadaily.com.cn/lvyou/5b7628c6a310030f813cf48b",
        "http://cn.chinadaily.com.cn/lvyou/5b7628c6a310030f813cf493",
        "http://cn.chinadaily.com.cn/lvyou/5bac7d20a3101a87ca8ff52d",
        "http://fashion.chinadaily.com.cn/5b762404a310030f813cf467",
        "http://cn.chinadaily.com.cn/jiankang",
        "http://fashion.chinadaily.com.cn/5b762404a310030f813cf461",
        "http://fashion.chinadaily.com.cn/5b762404a310030f813cf462",
        "http://fashion.chinadaily.com.cn/5b762404a310030f813cf463",
        "http://fashion.chinadaily.com.cn/5b8f77a7a310030f813ed4c8"
    ]
    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            yield scrapy.Request(url=url,
                                 callback=self.parse_item_home,
                                 headers=self.header)

    def parse_item_home(self, response):
        """Build the paged list URL(s) for a channel and schedule them.

        range(1, 2) keeps the original behavior of fetching page_1 only;
        widen the range to crawl more pages.
        """
        for page in range(1, 2):
            # FIX: the original duplicated the whole yield in an if/else
            # that differed only by one "/"; normalise the base URL once.
            base = response.url if response.url.endswith(
                "/") else response.url + "/"
            list_url = base + "page_" + str(page) + ".html"
            yield scrapy.Request(url=list_url,
                                 callback=self.parse_item_list_news)

    def parse_item_list_news(self, response):
        """Queue every article found on a list page."""
        for url in response.xpath("//h3/a/@href").extract():
            # Links are protocol-relative.
            yield scrapy.Request(url="http:" + url, callback=self.parse)

    def parse(self, response):
        """Extract the article fields and emit current-hour items."""
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                content_arr = response.xpath(
                    "//div[@id='Content']//text()").extract()
                content = "".join(content_arr).strip()
            except:
                spiderUtil.log_level(7, response.url)
            try:
                title_arr = response.xpath("//head/title//text()").extract()
                # FIX: the original chained .strip().strip() — the second
                # call is a no-op.  [:-8] cuts a fixed-length page-title
                # suffix (assumed 8 chars; confirm against live pages).
                title = "".join(title_arr).strip()[:-8]
            except:
                spiderUtil.log_level(6, response.url)
            try:
                author = response.xpath(
                    "//head/meta[@name='author']/@content").extract()
                if author == []:
                    author = "中国日报网"
                else:
                    author = author[0]
            except:
                spiderUtil.log_level(9, response.url)
            source = "http://www.chinadaily.com.cn/"
            try:
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})",
                    response.text).group(0) + ":00"
            except:
                # No timestamp found — the filter below will skip the item.
                pass
            try:
                if content != "" and public_time.startswith(
                        spiderUtil.get_first_hour()):
                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
class peopleNews(scrapy.Spider):
    """Spider for people.com.cn: polls the rolling-news JS feed and parses
    each linked article page."""
    name = "peopleNewsSpider"
    header = spiderUtil.header_util()

    def start_requests(self):
        # Cache-busting timestamp query parameter.
        now = int(time.time())
        feed_url = 'http://news.people.com.cn/210801/211150/index.js?_=' + str(
            now)
        yield scrapy.Request(feed_url,
                             callback=self.parse_item_home,
                             headers=self.header)

    def parse_item_home(self, response):
        """Schedule a parse request for every entry in the JSON feed."""
        for entry in json.loads(response.text)['items']:
            post_title = entry['title']  # read but unused, as in original
            yield scrapy.Request(url=str(entry['url']),
                                 callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        """Extract one article's fields; emit an item for recent news."""
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return
        text = response.text
        html_size = sys.getsizeof(text)
        try:
            # "YYYY年MM月DD日HH:MM" -> "YYYY-MM-DD HH:MM:00"
            public_time = re.search(
                r"(\d{4}年\d{1,2}月\d{1,2}日\d{1,2}:\d{1,2})",
                response.text).group(0).replace("年", "-").replace(
                    "月", "-").replace("日", " ") + ":00"
        except:
            spiderUtil.log_level(8, response.url)
        try:
            paragraphs = response.xpath(
                "//div[@id='rwb_zw']/p//text()").extract()
            content = "".join(paragraphs).strip()
        except:
            spiderUtil.log_level(7, response.url)
        source = "http://www.people.com.cn/"
        try:
            author = "".join(
                response.xpath(
                    "//div[@class='box01']/div[@class='fl']/a//text()"
                ).extract()).strip()
            if author == "":
                author = "人民网"
        except:
            spiderUtil.log_level(9, response.url)
        try:
            title = "".join(
                response.xpath("//h1//text()").extract()).strip()
        except:
            spiderUtil.log_level(6, response.url)
        try:
            if content != "" and public_time.startswith(
                    spiderUtil.get_first_hour()):
                item = NewsAllItem()
                item["source"] = source
                item["content"] = content
                item["public_time"] = public_time
                item["url"] = response.url
                item["title"] = title
                item["author"] = author
                item["crawl_time"] = spiderUtil.get_time()
                item["html_size"] = html_size
                yield item
        except:
            pass
class xuanjiangjiaNews(scrapy.Spider):
    """Spider for www.comnews.cn article sections.

    NOTE: ``yield item`` is commented out in this spider — it only prints
    the assembled item, matching the original's debug state.
    """
    name = "zgswxwwNewsSpider"
    start_url = "http://www.comnews.cn/"
    header = spiderUtil.header_util()

    # Section list pages, in the original crawl order.
    _SECTIONS = (
        "http://www.comnews.cn/article/pnews/",
        "http://www.comnews.cn/article/photo/",
        "http://www.comnews.cn/article/international/",
        "http://www.comnews.cn/article/local/",
        "http://www.comnews.cn/article/dzone/",
        "http://www.comnews.cn/article/abing/",
        "http://www.comnews.cn/article/ibdnews/",
        "http://www.comnews.cn/article/home/",
    )

    def start_requests(self):
        for section in self._SECTIONS:
            yield scrapy.Request(url=section,
                                 callback=self.parse_item_home,
                                 headers=self.header)

    def parse_item_home(self, response):
        """Queue every article link from the section list."""
        for href in response.xpath(
                """//ul[@class="col-ls alist"]//@href""").extract():
            # Links are site-relative; prefix the host.
            yield scrapy.Request(url="http://www.comnews.cn" + href,
                                 callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        """Scrape one article page and print the assembled item."""
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return
        text = response.text
        html_size = sys.getsizeof(text)
        try:
            # Publication time comes straight from the PubDate meta tag.
            stamps = response.xpath(
                """//meta[@name="PubDate"]//@content""").extract()
            public_time = str(stamps[0])
        except:
            spiderUtil.log_level(8, response.url)
        try:
            paragraphs = response.xpath(
                """//div[@class="content"]//p//text()""").extract()
            content = "".join(paragraphs)
        except:
            spiderUtil.log_level(7, response.url)
        source = "http://www.comnews.cn/"
        try:
            author = response.xpath(
                """//meta[@name="ContentSource"]//@content""").extract(
                )[0].strip()
        except:
            spiderUtil.log_level(9, response.url)
        try:
            title = "".join(
                response.xpath(
                    """//meta[@name="ArticleTitle"]//@content""").extract())
        except:
            spiderUtil.log_level(6, response.url)
        try:
            # Hour filter is disabled in the original; build and print only.
            item = NewsAllItem()
            item["source"] = source
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
            print(item)
            # yield item
        except:
            pass
class southnetNews(scrapy.Spider):
    """Spider for Southern Net (southcn.com) news channels."""
    name = "southnetNewsSpider"
    header = spiderUtil.header_util()

    # Channel index pages: top news, Guangdong, China, international,
    # society, Southern Express, official announcements.
    CHANNEL_URLS = [
        "http://www.southcn.com/pc2018/yw/node_384370.htm",
        "http://news.southcn.com/gd/",
        "http://news.southcn.com/china/default.htm",
        "http://news.southcn.com/international/default.htm",
        "http://news.southcn.com/community/",
        "http://kb.southcn.com/default.htm",
        "http://news.southcn.com/g/node_74681.htm",
    ]

    def start_requests(self):
        for url in self.CHANNEL_URLS:
            yield scrapy.Request(url=url,
                                 callback=self.parse_item_home,
                                 headers=self.header)

    def parse_item_home(self, response):
        """Queue every headline link from a channel page."""
        detail_urls = response.xpath(
            """//div/div/div/div/div/h3/a/@href""").extract()
        for detail_url in detail_urls:
            time.sleep(random.uniform(1, 2))   # polite crawl delay
            yield scrapy.Request(url=detail_url,
                                 callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        """Extract the article fields and emit current-hour items."""
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                public_time = str(
                    response.xpath("""//*[@id="pubtime_baidu"]/text()"""
                                   ).extract()[0].strip()) + ":00"
            except:
                spiderUtil.log_level(8, response.url)
            try:
                content_arr = response.xpath(
                    """//*[@id="content"]/p/text()""").extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)
            source = "http://www.southcn.com/"
            try:
                author_arr = response.xpath(
                    """//*[@id="source_baidu"]/text()""").extract()
                author = "".join(author_arr).strip()
                if author == "":
                    # BUG FIX: the original fell back to "澎湃新闻" (The
                    # Paper) — a copy-paste from another spider.  This
                    # spider crawls Southern Net.
                    author = "南方网"
                else:
                    author = author.replace("来源:", "")
            except:
                spiderUtil.log_level(9, response.url)
            try:
                title_arr = response.xpath(
                    """//*[@id="article_title"]/text()""").extract()
                title = "".join(title_arr).strip()
            except:
                spiderUtil.log_level(6, response.url)
            try:
                if content != "" and public_time.startswith(
                        spiderUtil.get_first_hour()):
                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
class sinaNews(scrapy.Spider):
    """Spider for Sina rolling news (feed.mix.sina.com.cn)."""
    name = "sinaNewsSpider"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(
            url=
            "https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=2509&k=&num=50&page=1",
            callback=self.parse_item_home,
            headers=self.header)

    def parse_item_home(self, response):
        """Walk the roll feed and schedule one request per article.

        BUG FIX: the original re-downloaded ``response.url`` synchronously
        with ``requests.get`` even though scrapy had already fetched the
        body (a redundant blocking request, and ``requests`` is not among
        this module's visible imports).  Parse the scrapy response text
        directly instead.
        """
        data = json.loads(response.text)
        datas = data['result']['data']
        for dict_data in datas:
            yield scrapy.Request(url=dict_data['url'], callback=self.parse)

    def parse(self, response):
        """Extract the article fields and emit current-hour items.

        Sina serves several page templates, so the timestamp, body and
        title extraction each try multiple selectors in turn.
        """
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                # Timestamps look like "YYYY年MM月DD日 HH:MM"; normalise to
                # "YYYY-MM-DD...:00".  Note the third template replaces the
                # day marker with a space instead of dropping it.
                if response.xpath(
                        """//*[@id="top_bar"]/div/div/span[1]/text()"""
                ).extract():
                    content_time = response.xpath(
                        """//*[@id="top_bar"]/div/div/span[1]/text()"""
                    ).extract()
                    public_datetime = str(content_time[0]).replace(
                        "年", "-").replace("月", "-").replace("日", "")
                    public_time = public_datetime + ":00"
                elif response.xpath(
                        """//*[@id="top_bar"]/div/div[2]/span/text()"""
                ).extract():
                    content_time = response.xpath(
                        """//*[@id="top_bar"]/div/div[2]/span/text()"""
                    ).extract()
                    public_datetime = str(content_time[0]).replace(
                        "年", "-").replace("月", "-").replace("日", "")
                    public_time = public_datetime + ":00"
                else:
                    content_time = response.xpath(
                        """//*[@id="pub_date"]/text()""").extract()
                    public_datetime = str(content_time[0]).replace(
                        "年", "-").replace("月", "-").replace("日", " ")
                    public_time = public_datetime + ":00"
            except:
                spiderUtil.log_level(8, response.url)
            try:
                if response.xpath(
                        """//*[@id="artibody"]/p/text()""").extract():
                    content_arr = response.xpath(
                        """//*[@id="artibody"]/p/text()""").extract()
                else:
                    content_arr = response.xpath(
                        """//*[@id="article"]/p/text()""").extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)
            source = "https://news.sina.com.cn/"
            try:
                # NOTE(review): extract() on the element (not its text())
                # yields the span's outer HTML — presumably intentional;
                # confirm whether .../text() was meant here.
                if response.xpath(
                        """//span[@class="source"]""").extract()[0]:
                    author = response.xpath(
                        """//span[@class="source"]""").extract()[0].strip()
                else:
                    author = "新浪网"
            except:
                spiderUtil.log_level(9, response.url)
            try:
                if response.xpath(
                        """/html/body/div/h1/text()""").extract()[0]:
                    title = response.xpath(
                        """/html/body/div/h1/text()""").extract()[0]
                else:
                    title = response.xpath(
                        """//*[@id="artibodyTitle"]/text()""").extract()[0]
            except:
                spiderUtil.log_level(6, response.url)
            try:
                if content != "" and public_time.startswith(
                        spiderUtil.get_first_hour()):
                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)