def test_count(self):
    self.assertEqual(pytime.count('2015517', '2015519'),
                     datetime.timedelta(-2))
    self.assertEqual(pytime.count('2015517', '2015519 23:23:23'),
                     datetime.timedelta(-3, 2197))
    self.assertEqual(pytime.count('2015517 23:23:23', '2015519 23:23:23'),
                     datetime.timedelta(-2))
    self.assertEqual(pytime.count('2015519 23:23:23', '2015-5-17'),
                     datetime.timedelta(2, 84203))
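# These assertions pin down pytime.count's contract: count(a, b) parses both
# arguments and returns parsed(a) - parsed(b) as a datetime.timedelta. A stdlib
# equivalent of the first two checks, for illustration only (this is not
# pytime's implementation):
import datetime

a = datetime.datetime(2015, 5, 17)  # parsed form of '2015517'
b = datetime.datetime(2015, 5, 19)  # parsed form of '2015519'
assert a - b == datetime.timedelta(-2)
assert a - datetime.datetime(2015, 5, 19, 23, 23, 23) == datetime.timedelta(-3, 2197)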
def parse(self, response):
    # Extract every article URL from the list page and hand it to Scrapy
    # for download and detail parsing.
    while response:
        post_nodes = response.css(
            "#dnn_ctr59828_ArticleList__ctl0_ArtDataList a::attr(href)"
        ).extract()
        news_time = response.css(
            "#dnn_ctr59828_ArticleList__ctl0_ArtDataList__ctl0_Label6::text"
        ).extract_first()
        # Stop once the newest article on this page falls outside the crawl window.
        if pytime.count(pytime.today(), news_time) > datetime.timedelta(TIME_DELTA_DAYS):
            print(news_time + "\n")
            return
        for post_node in post_nodes:
            yield Request(url=parse.urljoin(response.url, post_node),
                          callback=self.parse_detail)
        # Click "next page" in the Selenium browser, then rebuild a Scrapy
        # response from the freshly rendered page source.
        self.browser.find_element_by_css_selector(
            "#dnn_ctr59828_ArticleList__ctl0_lbtnNext").click()
        selector = Selector(text=self.browser.page_source)
        page_num = selector.css(
            "#dnn_ctr59828_ArticleList__ctl0_plPageNum::text"
        ).extract_first()
        print("page is " + page_num)
        response = HtmlResponse(url=self.browser.current_url,
                                body=self.browser.page_source,
                                encoding="utf-8")
    return
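# The cutoff above compares against datetime.timedelta(TIME_DELTA_DAYS), a
# constant that is never defined in these snippets. A plausible project-level
# definition (the name comes from the source; the value 3 is an assumption
# for illustration):
TIME_DELTA_DAYS = 3  # only follow articles published within the last 3 days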
def parse(self, response):
    # Extract every article URL from the list page and hand it to Scrapy
    # for download and detail parsing.
    if response.url == "http://www.jwc.shu.edu.cn/index/tzgg.htm":
        self.tag = "通知公告"  # "Notices"
    elif response.url == "http://www.jwc.shu.edu.cn/index/xw.htm":
        self.tag = "新闻"  # "News"
    post_nodes = response.css(
        "#dnn_ctr43516_ArticleList__ctl0_ArtDataList__ctl1_titleLink1::attr(href)"
    ).extract()
    news_time = response.css(
        "#dnn_ctr43516_ArticleList__ctl0_ArtDataList__ctl1_Label6::text"
    ).extract_first()
    if pytime.count(pytime.today(), news_time) > datetime.timedelta(TIME_DELTA_DAYS):
        print(news_time)
        return
    for post_node in post_nodes:
        yield Request(url=parse.urljoin(response.url, post_node),
                      callback=self.parse_detail)
    # Extract the next-page link and hand it to Scrapy for download.
    next_url = response.css(
        "a.Next:nth-child(3)::attr(href)").extract_first("")
    if next_url:
        yield Request(url=parse.urljoin(response.url, next_url),
                      meta={"tag": self.tag}, callback=self.parse)
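# The URL-to-tag branching above implies the spider crawls both list pages; a
# sketch of the matching spider attribute (the value set is assumed from the
# two URLs checked in parse()):
start_urls = [
    "http://www.jwc.shu.edu.cn/index/tzgg.htm",  # 通知公告 ("Notices")
    "http://www.jwc.shu.edu.cn/index/xw.htm",    # 新闻 ("News")
]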
def parse(self, response):
    # Extract every article URL from the list page and hand it to Scrapy
    # for download and detail parsing.
    post_nodes = response.css(".views-table > tbody:nth-child(1) tr")
    for post_node in post_nodes:
        create_date = post_node.css(
            ".views-field-created::text").extract_first().strip()
        create_date = datetime.datetime.strptime(create_date, "%y-%m-%d")
        post_node_url = post_node.css("a::attr(href)").extract_first()
        if pytime.count(pytime.today(), create_date) < datetime.timedelta(TIME_DELTA_DAYS):
            yield Request(url=parse.urljoin(response.url, post_node_url),
                          meta={"create_date": create_date},
                          callback=self.parse_detail, dont_filter=True)
        else:
            # Rows are listed newest first, so the first row outside the
            # crawl window ends the scan.
            break
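# A sanity check on the strptime format used above: "%y-%m-%d" only matches
# two-digit years such as "18-05-17". If the site prints four-digit years like
# "2018-05-17" (an assumption -- the real markup is not shown here), "%Y-%m-%d"
# is required and "%y-%m-%d" raises ValueError.
import datetime

print(datetime.datetime.strptime("18-05-17", "%y-%m-%d"))    # 2018-05-17 00:00:00
print(datetime.datetime.strptime("2018-05-17", "%Y-%m-%d"))  # 2018-05-17 00:00:00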
def parse(self, response):
    # Extract every article URL from the list page and hand it to Scrapy
    # for download and detail parsing.
    tag = response.css(
        "#dnn_dnnBREADCRUMB_lblBreadCrumb > a:nth-child(2)::text").extract_first()
    post_nodes = response.css(
        "#dnn_ctr1053_ArticleList_ctl00_lstArticles > tbody:nth-child(1) a::attr(href)"
    ).extract()
    news_time = response.css(
        "#dnn_ctr1053_ArticleList_ctl00_lstArticles_ctl00_lblPublishDate::text"
    ).extract_first()
    if pytime.count(pytime.today(), news_time) > datetime.timedelta(TIME_DELTA_DAYS):
        print(news_time)
        return
    for post_node in post_nodes:
        yield Request(url=parse.urljoin(response.url, post_node),
                      meta={"tag": tag}, callback=self.parse_detail)
    # Extract the next-page link and hand it to Scrapy for download.
    next_url = response.css("a.Next:nth-child(3)::attr(href)").extract_first("")
    if next_url:
        yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
def parse(self, response):
    # Extract every article URL from the list page and hand it to Scrapy
    # for download and detail parsing.
    post_nodes = response.css(
        "#dnn_ctr59825_ArticleList__ctl0_ArtDataList a::attr(href)").extract()
    news_time = response.css(
        "#dnn_ctr59825_ArticleList__ctl0_ArtDataList__ctl0_Label6::text"
    ).extract_first()
    if pytime.count(pytime.today(), news_time) > datetime.timedelta(TIME_DELTA_DAYS):
        print(news_time + "\n")
        return
    tag = response.css(
        "#dnn_dnnBREADCRUMB_lblBreadCrumb > a::text").extract_first()
    if "tabid=31641" in response.url:
        tag = '学工新闻'  # "Student-affairs news"
    for post_node in post_nodes:
        yield Request(url=parse.urljoin(response.url, post_node),
                      meta={"tag": tag}, callback=self.parse_detail)
    # Extract the next-page link and hand it to Scrapy for download.
    next_url = response.css(
        "#dnn_ctr59825_ArticleList__ctl0_lbtnNext::attr(href)").extract_first("")
    if next_url:
        yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
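# The category travels to the detail callback through Request.meta. A minimal
# sketch of the receiving side (parse_detail's real body is not shown in these
# snippets, so this is an assumption):
def parse_detail(self, response):
    tag = response.meta.get("tag", "")  # category attached by parse()
    ...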
def test_count(self):
    self.assertEqual(pytime.count('2015517', '2015519'), datetime.timedelta(-2))
def process_item(self, item, spider):
    # Pass the item through only if it was published within the crawl window.
    if pytime.count(pytime.today(),
                    item['create_date']) < datetime.timedelta(TIME_DELTA_DAYS):
        return item
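# For the filter above to take effect, the pipeline class must be enabled in the
# Scrapy project's settings.py; the module path, class name, and priority below
# are assumptions for illustration:
ITEM_PIPELINES = {
    "myproject.pipelines.DateFilterPipeline": 300,
}
# Scrapy expects process_item to return an item or raise an exception; raising
# scrapy.exceptions.DropItem for stale items is the idiomatic way to discard them.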