def parse(self, response):
    """Parse an article page from 经济参考报 (dz.jjckb.cn)."""
    if response.status == 200:
        text = response.text
        html_size = sys.getsizeof(text)
        try:
            # Note: the replace("年"/"月") calls are no-ops for this
            # hyphen-separated date pattern.
            public_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2})",
                                    response.text).group(0).replace(
                "年", "-").replace("月", "-") + " 00:00:00"
        except:
            spiderUtil.log_level(8, response.url)
        try:
            content_arr = response.xpath(
                "//founder-content/p//text()").extract()
            content = "".join(content_arr).strip()
        except:
            spiderUtil.log_level(7, response.url)
        source = "http://dz.jjckb.cn/www/pages/webpage2009/"
        try:
            author_arr = response.xpath(
                "//td[@class='black12']//text()").extract()
            author = "".join(author_arr).split("来源:")[1].strip()
            if author == "":
                author = "经济参考报"
        except:
            spiderUtil.log_level(9, response.url)
        try:
            title_arr = response.xpath(
                "//tr/td[@class='hei16b']//text()").extract()
            title_arr1 = response.xpath(
                "//tr/td[@class='hui12']//text()").extract()
            title = "".join(title_arr).strip() + "".join(title_arr1).strip()
        except:
            spiderUtil.log_level(6, response.url)
        try:
            # and public_time.startswith(spiderUtil.get_first_hour()):
            if len(content) > 50:
                item = PaperAllItem()
                item["source"] = source
                item["content"] = content
                item["public_time"] = public_time
                item["url"] = response.url
                item["title"] = title
                item["author"] = author
                item["crawl_time"] = spiderUtil.get_time()
                item["html_size"] = html_size
                # print(item)
                yield item
        except:
            pass
    else:
        spiderUtil.log_level(response.status, response.url)
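# The parse methods in this section all populate the same eight fields on
# PaperAllItem. The item class lives elsewhere in the project; the definition
# below is only a sketch inferred from the keys used here and may differ from
# the project's actual items.py.
import scrapy


class PaperAllItem(scrapy.Item):
    source = scrapy.Field()       # e-paper site root URL
    content = scrapy.Field()      # concatenated article body text
    public_time = scrapy.Field()  # publication time, "YYYY-MM-DD HH:MM:SS"
    url = scrapy.Field()          # article page URL
    title = scrapy.Field()        # headline plus any sub-headlines
    author = scrapy.Field()       # byline, falling back to the paper's name
    crawl_time = scrapy.Field()   # time the page was crawled
    html_size = scrapy.Field()    # sys.getsizeof() of the response text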
def parse(self, response):
    """Parse an article page from 当代商报 (szb.ddswcm.com)."""
    if response.status == 200:
        text = response.text
        html_size = sys.getsizeof(text)
        try:
            public_time = re.search(r"(\d{4}年\d{1,2}月\d{1,2})",
                                    response.text).group(0).replace(
                "年", "-").replace("月", "-") + " 00:00:00"
        except:
            spiderUtil.log_level(8, response.url)
        try:
            content_arr = response.xpath(
                "//div[@id='content']/p//text()").extract()
            content = "".join(content_arr).strip()
        except:
            spiderUtil.log_level(7, response.url)
        source = "http://szb.ddswcm.com/"
        try:
            author = response.xpath(
                "//div[@class='property']//text()").extract()[0].split(
                "作者:")[1].split("浏览次数")[0].strip()
            if author == "":
                author = "当代商报"
        except:
            spiderUtil.log_level(9, response.url)
        try:
            title_arr = response.xpath(
                "//div[@class='papertitle']//text()").extract()
            title = "".join(title_arr).strip()
        except:
            spiderUtil.log_level(6, response.url)
        try:
            # and public_time.startswith(spiderUtil.get_first_hour()):
            if len(content) > 50:
                item = PaperAllItem()
                item["source"] = source
                item["content"] = content
                item["public_time"] = public_time
                item["url"] = response.url
                item["title"] = title
                item["author"] = author
                item["crawl_time"] = spiderUtil.get_time()
                item["html_size"] = html_size
                # print(item)
                yield item
        except:
            pass
    else:
        spiderUtil.log_level(response.status, response.url)
def parse(self, response):
    """Parse an article page from 21世纪经济报道 (epaper.21jingji.com)."""
    if response.status == 200:
        text = response.text
        html_size = sys.getsizeof(text)
        try:
            public_time = re.search(
                r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})",
                response.text).group(0) + ":00"
        except:
            spiderUtil.log_level(8, response.url)
        try:
            content_arr = response.xpath(
                "//div[@class='txtContent']//text()").extract()
            content = "".join(content_arr).strip()
        except:
            spiderUtil.log_level(7, response.url)
        source = "http://epaper.21jingji.com/"
        try:
            author_arr = response.xpath(
                "//div[@class='newsInfo']//text()").extract()
            author = "".join(author_arr).split(" ")[1].strip()
            if author == "":
                author = "21世纪经济报道数字报"
        except:
            spiderUtil.log_level(9, response.url)
        try:
            title_arr1 = response.xpath(
                "//div[@class='titleHead']/h1//text()").extract()
            title = "".join(title_arr1).strip()
        except:
            spiderUtil.log_level(6, response.url)
        try:
            # and public_time.startswith(spiderUtil.get_first_hour()):
            if len(content) > 50:
                item = PaperAllItem()
                item["source"] = source
                item["content"] = content
                item["public_time"] = public_time
                item["url"] = response.url
                item["title"] = title
                item["author"] = author
                item["crawl_time"] = spiderUtil.get_time()
                item["html_size"] = html_size
                # print(item)
                yield item
        except:
            pass
    else:
        spiderUtil.log_level(response.status, response.url)
def parse(self, response):
    """Parse an article page from 杭州日报 (hzdaily.hangzhou.com.cn)."""
    if response.status == 200:
        text = response.text
        html_size = sys.getsizeof(text)
        try:
            public_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2})",
                                    response.text).group(0) + " 00:00:00"
        except:
            spiderUtil.log_level(8, response.url)
        try:
            content_arr = response.xpath(
                "//div[@class='content']//text()").extract()
            content = "".join(content_arr).strip()
        except:
            spiderUtil.log_level(7, response.url)
        source = "https://hzdaily.hangzhou.com.cn/hzrb/"
        try:
            author = "杭州日报"
        except:
            spiderUtil.log_level(9, response.url)
        try:
            title_arr1 = response.xpath(
                "//div[@class='head']/h1//text()").extract()
            title_arr2 = response.xpath(
                "//div[@class='head']/h2//text()").extract()
            title_arr3 = response.xpath(
                "//div[@class='head']/h3//text()").extract()
            title = "".join(title_arr1).strip() + "".join(
                title_arr2).strip() + "".join(title_arr3).strip()
        except:
            spiderUtil.log_level(6, response.url)
        try:
            # and public_time.startswith(spiderUtil.get_first_hour()):
            if len(content) > 50:
                item = PaperAllItem()
                item["source"] = source
                item["content"] = content
                item["public_time"] = public_time
                item["url"] = response.url
                item["title"] = title
                item["author"] = author
                item["crawl_time"] = spiderUtil.get_time()
                item["html_size"] = html_size
                # print(item)
                yield item
        except:
            pass
    else:
        spiderUtil.log_level(response.status, response.url)
def parse(self, response):
    """Parse an article page from 半岛都市报 (bddsb.bandao.cn)."""
    if response.status == 200:
        text = response.text
        html_size = sys.getsizeof(text)
        try:
            public_time = re.search(r"(\d{4}年\d{1,2}月\d{1,2})",
                                    response.text).group(0).replace(
                "年", "-").replace("月", "-") + " 00:00:00"
        except:
            spiderUtil.log_level(8, response.url)
        try:
            content_arr = response.xpath(
                "//span[@id='contenttext']//text()").extract()
            content = "".join(content_arr).strip()
        except:
            spiderUtil.log_level(7, response.url)
        source = "http://bddsb.bandao.cn/pc/bddsb/"
        try:
            author = "半岛都市报"
        except:
            spiderUtil.log_level(9, response.url)
        try:
            title_arr = response.xpath(
                "//div[@class='neirong']/h3//text()").extract()
            title_arr1 = response.xpath(
                "//div[@class='neirong']/h2//text()").extract()
            title = "".join(title_arr).strip() + "".join(title_arr1).strip()
        except:
            spiderUtil.log_level(6, response.url)
        try:
            # and public_time.startswith(spiderUtil.get_first_hour()):
            if len(content) > 50:
                item = PaperAllItem()
                item["source"] = source
                item["content"] = content
                item["public_time"] = public_time
                item["url"] = response.url
                item["title"] = title
                item["author"] = author
                item["crawl_time"] = spiderUtil.get_time()
                item["html_size"] = html_size
                # print(item)
                yield item
        except:
            pass
    else:
        spiderUtil.log_level(response.status, response.url)
def parse(self, response):
    """Parse an article page from 期货日报 (www.qhdb.com.cn)."""
    if response.status == 200:
        text = response.text
        html_size = sys.getsizeof(text)
        try:
            public_time = re.search(
                r"(\d{4}/\d{1,2}/\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                response.text).group(0).replace("/", "-")
        except:
            spiderUtil.log_level(8, response.url)
        try:
            content_arr = response.xpath(
                "//div[@class='article_content']//text()").extract()
            content = "".join(content_arr).strip()
        except:
            spiderUtil.log_level(7, response.url)
        source = "http://www.qhdb.com.cn/"
        try:
            author_arr = response.xpath(
                "//span[@id='sZuoZhe']/text()").extract()
            author = "".join(author_arr)
            if author == "":
                author = "期货日报"
        except:
            spiderUtil.log_level(9, response.url)
        try:
            title = response.xpath(
                "//div[@class='article_title']//text()").extract()[0].strip()
        except:
            spiderUtil.log_level(6, response.url)
        try:
            # and public_time.startswith(spiderUtil.get_first_hour()):
            if len(content) > 50:
                item = PaperAllItem()
                item["source"] = source
                item["content"] = content
                item["public_time"] = public_time
                item["url"] = response.url
                item["title"] = title
                item["author"] = author
                item["crawl_time"] = spiderUtil.get_time()
                item["html_size"] = html_size
                # print(item)
                yield item
        except:
            pass
    else:
        spiderUtil.log_level(response.status, response.url)
def parse(self, response):
    """Parse an article page from 青岛财经日报 (epaper.qdcaijing.com)."""
    if response.status == 200:
        text = response.text
        html_size = sys.getsizeof(text)
        try:
            public_time = re.search(r"(\d{4}年\d{1,2}月\d{1,2})",
                                    response.text).group(0).replace(
                "年", "-").replace("月", "-") + " 00:00:00"
        except:
            spiderUtil.log_level(8, response.url)
        try:
            content_arr = response.xpath(
                "//p[@class='info_p']//text()").extract()
            content = "".join(content_arr).strip()
        except:
            spiderUtil.log_level(7, response.url)
        source = "http://epaper.qdcaijing.com/"
        try:
            author = response.xpath(
                "//p[@class='txtc']/span/text()").extract()[0].split(
                "来源:")[0].strip()
            if author == "":
                author = "青岛财经日报"
        except:
            spiderUtil.log_level(9, response.url)
        try:
            title = response.xpath(
                "//h3[@class='txtc']//text()").extract()[0].strip()
        except:
            spiderUtil.log_level(6, response.url)
        try:
            # and public_time.startswith(spiderUtil.get_first_hour()):
            if len(content) > 50:
                item = PaperAllItem()
                item["source"] = source
                item["content"] = content
                item["public_time"] = public_time
                item["url"] = response.url
                item["title"] = title
                item["author"] = author
                item["crawl_time"] = spiderUtil.get_time()
                item["html_size"] = html_size
                # print(item)
                yield item
        except:
            pass
    else:
        spiderUtil.log_level(response.status, response.url)
def parse(self, response):
    """Parse an article page from 劳动新闻 (media.workercn.cn)."""
    if response.status == 200:
        text = response.text
        html_size = sys.getsizeof(text)
        try:
            public_time = re.search(r"(\d{4}年\d{1,2}月\d{1,2})",
                                    response.text).group(0).replace(
                "年", "-").replace("月", "-") + " 00:00:00"
        except:
            spiderUtil.log_level(8, response.url)
        try:
            content_arr = response.xpath(
                "//div[@id='ozoom']/div/span/p//text()").extract()
            content = "".join(content_arr).strip()
        except:
            spiderUtil.log_level(7, response.url)
        source = "http://media.workercn.cn/"
        try:
            author_arr = response.xpath(
                "//div[@class='lai']/span//text()").extract()
            author = "".join(author_arr).strip().split("(")[0]
            if author == "":
                author = "劳动新闻"
        except:
            spiderUtil.log_level(9, response.url)
        try:
            title = response.xpath("//h1//text()").extract()[0].strip()
        except:
            spiderUtil.log_level(6, response.url)
        try:
            # and public_time.startswith(spiderUtil.get_first_hour()):
            if len(content) > 50:
                item = PaperAllItem()
                item["source"] = source
                item["content"] = content
                item["public_time"] = public_time
                item["url"] = response.url
                item["title"] = title
                item["author"] = author
                item["crawl_time"] = spiderUtil.get_time()
                item["html_size"] = html_size
                # print(item)
                yield item
        except:
            pass
    else:
        spiderUtil.log_level(response.status, response.url)
def parse(self, response):
    """Parse an article page from 福建日报 (fjrb.fjsen.com)."""
    if response.status == 200:
        text = response.text
        html_size = sys.getsizeof(text)
        try:
            public_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2})",
                                    response.text).group(0) + " 00:00:00"
        except:
            spiderUtil.log_level(8, response.url)
        try:
            content_arr = response.xpath(
                "//td[@class='xilan_content_tt']/p/text()").extract()
            content = "".join(content_arr).strip()
        except:
            spiderUtil.log_level(7, response.url)
        source = "http://fjrb.fjsen.com/"
        try:
            authors = response.xpath("//td[@class='bt4']/text()").extract()
            author_arr = "".join(authors).split("记者 ")
            if len(author_arr) == 2:
                author = author_arr[1]
            else:
                author = "福建日报"
        except:
            spiderUtil.log_level(9, response.url)
        try:
            title_arr = response.xpath("//td[@class='bt1']/text()").extract()
            title = "".join(title_arr).strip()
        except:
            spiderUtil.log_level(6, response.url)
        try:
            # and public_time.startswith(spiderUtil.get_first_hour()):
            if len(content) > 50:
                item = PaperAllItem()
                item["source"] = source
                item["content"] = content
                item["public_time"] = public_time
                item["url"] = response.url
                item["title"] = title
                item["author"] = author
                item["crawl_time"] = spiderUtil.get_time()
                item["html_size"] = html_size
                # print(item)
                yield item
        except:
            pass
    else:
        spiderUtil.log_level(response.status, response.url)
def parse(self, response):
    """Parse an article page from 石狮日报 (epaper.ssrb.com.cn)."""
    if response.status == 200:
        text = response.text
        html_size = sys.getsizeof(text)
        try:
            public_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2})",
                                    response.text).group(0) + " 00:00:00"
        except:
            spiderUtil.log_level(8, response.url)
        try:
            content_arr = response.xpath(
                "//div[@id='ozoom']/founder-content/text()").extract()
            content = "".join(content_arr).strip()
        except:
            spiderUtil.log_level(7, response.url)
        source = "http://epaper.ssrb.com.cn/"
        try:
            author = "石狮日报"
        except:
            spiderUtil.log_level(9, response.url)
        try:
            title_arr = response.xpath(
                "//td[@class='font01']/founder-title/text()").extract()
            title = "".join(title_arr).strip()
        except:
            spiderUtil.log_level(6, response.url)
        try:
            # and public_time.startswith(spiderUtil.get_first_hour()):
            if len(content) > 50:
                item = PaperAllItem()
                item["source"] = source
                item["content"] = content
                item["public_time"] = public_time
                item["url"] = response.url
                item["title"] = title
                item["author"] = author
                item["crawl_time"] = spiderUtil.get_time()
                item["html_size"] = html_size
                # print(item)
                yield item
        except:
            pass
    else:
        spiderUtil.log_level(response.status, response.url)
def parse(self, response):
    """Parse an article page from 中卫日报 (124.224.204.62:8081/szb/pc/)."""
    if response.status == 200:
        text = response.text
        html_size = sys.getsizeof(text)
        try:
            public_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2})",
                                    response.text).group(0) + " 00:00:00"
        except:
            spiderUtil.log_level(8, response.url)
        try:
            content_arr = response.xpath(
                "//founder-content//text()").extract()
            content = "".join(content_arr).strip()
        except:
            spiderUtil.log_level(7, response.url)
        source = "http://124.224.204.62:8081/szb/pc/"
        try:
            author_arr = response.xpath(
                "//h3[@class='title-author']/text()").extract()
            if author_arr == []:
                author = "中卫日报"
            else:
                author = author_arr[0]
        except:
            spiderUtil.log_level(9, response.url)
        try:
            title = response.xpath("//h2[@id='Title']/text()").extract()[0]
        except:
            spiderUtil.log_level(6, response.url)
        try:
            # and public_time.startswith(spiderUtil.get_first_hour()):
            if len(content) > 50:
                item = PaperAllItem()
                item["source"] = source
                item["content"] = content
                item["public_time"] = public_time
                item["url"] = response.url
                item["title"] = title
                item["author"] = author
                item["crawl_time"] = spiderUtil.get_time()
                item["html_size"] = html_size
                # print(item)
                yield item
        except:
            pass
    else:
        spiderUtil.log_level(response.status, response.url)
def parse(self, response):
    """Parse an article page from 河南工人日报 (www.hngrrb.cn)."""
    if response.status == 200:
        text = response.text
        html_size = sys.getsizeof(text)
        try:
            public_time = re.search(r"(\d{4}年\d{1,2}月\d{1,2})",
                                    response.text).group(0).replace(
                "年", "-").replace("月", "-") + " 00:00:00"
        except:
            spiderUtil.log_level(8, response.url)
        try:
            content_arr = response.xpath(
                "//founder-content//text()").extract()
            content = "".join(content_arr).strip()
        except:
            spiderUtil.log_level(7, response.url)
        source = "http://www.hngrrb.cn/"
        try:
            author = "河南工人日报"
        except:
            spiderUtil.log_level(9, response.url)
        try:
            title_arr = response.xpath(
                "//table[3]/tbody/tr/td//text()").extract()
            title = "".join(title_arr).strip()
        except:
            spiderUtil.log_level(6, response.url)
        try:
            # and public_time.startswith(spiderUtil.get_first_hour()):
            if len(content) > 50:
                item = PaperAllItem()
                item["source"] = source
                item["content"] = content
                item["public_time"] = public_time
                item["url"] = response.url
                item["title"] = title
                item["author"] = author
                item["crawl_time"] = spiderUtil.get_time()
                item["html_size"] = html_size
                # print(item)
                yield item
        except:
            pass
    else:
        spiderUtil.log_level(response.status, response.url)
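# By default Scrapy only delivers 2xx responses to callbacks, so the final
# else-branch logging in these methods fires only when error statuses are let
# through. The spider below is a minimal sketch of how one of these parse
# methods would be hosted; the spider name, start URL, and
# handle_httpstatus_list values are illustrative assumptions, not the
# project's real configuration.
import scrapy


class PaperSketchSpider(scrapy.Spider):
    name = "paper_sketch"  # hypothetical name
    # Real spiders would generate article URLs from each e-paper's daily index.
    start_urls = ["http://epaper.ssrb.com.cn/"]
    # Allow some error statuses through so non-200 responses reach parse().
    handle_httpstatus_list = [403, 404, 500, 502]

    def parse(self, response):
        # Same shape as the methods above: bail out on non-200, otherwise
        # extract fields and yield an item once the body looks substantial.
        if response.status != 200:
            self.logger.warning("status %s for %s", response.status,
                                response.url)
            return
        title = "".join(response.xpath("//h1//text()").extract()).strip()
        self.logger.info("parsed %s: %s", response.url, title)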