Beispiel #1
0
 def test_count(self):
     """pytime.count returns the signed timedelta between two date strings."""
     cases = [
         (('2015517', '2015519'), datetime.timedelta(-2)),
         (('2015517', '2015519 23:23:23'), datetime.timedelta(-3, 2197)),
         (('2015517 23:23:23', '2015519 23:23:23'), datetime.timedelta(-2)),
         (('2015519 23:23:23', '2015-5-17'), datetime.timedelta(2, 84203)),
     ]
     for args, expected in cases:
         self.assertEqual(pytime.count(*args), expected)
    def parse(self, response):
        """Walk a Selenium-rendered article list, yielding detail-page Requests.

        Scrapes the current list page, clicks the "next page" element in the
        shared browser, rebuilds an HtmlResponse from the new page source,
        and repeats until an article older than TIME_DELTA_DAYS is seen.
        """
        # Parse all article URLs on the list page and hand them to scrapy
        # for download and detail parsing.

        # NOTE(review): HtmlResponse instances are always truthy, so this
        # loop only ends via the early return below or an exception from
        # the click once paging runs out — confirm that is intended.
        while response:
            post_nodes = response.css(
                "#dnn_ctr59828_ArticleList__ctl0_ArtDataList a::attr(href)"
            ).extract()
            # Publish time of the first (presumably newest) article on the page.
            news_time = response.css(
                "#dnn_ctr59828_ArticleList__ctl0_ArtDataList__ctl0_Label6::text"
            ).extract_first()
            # Stop crawling once the newest article on this page falls
            # outside the TIME_DELTA_DAYS window.
            if pytime.count(pytime.today(),
                            news_time) > datetime.timedelta(TIME_DELTA_DAYS):
                print(news_time + "\n")
                return

            for post_node in post_nodes:
                yield Request(url=parse.urljoin(response.url, post_node),
                              callback=self.parse_detail)

            # Advance to the next page in the real browser, then wrap the
            # freshly rendered HTML in a scrapy response for the next pass.
            self.browser.find_element_by_css_selector(
                "#dnn_ctr59828_ArticleList__ctl0_lbtnNext").click()
            selector = Selector(text=self.browser.page_source)
            page_num = selector.css(
                "#dnn_ctr59828_ArticleList__ctl0_plPageNum::text"
            ).extract_first()
            print("page is " + page_num)
            response = HtmlResponse(url=self.browser.current_url,
                                    body=self.browser.page_source,
                                    encoding="utf-8")

        return
Beispiel #3
0
    def parse(self, response):
        """Parse a notice/news list page; follow article and next-page links.

        Sets self.tag from the list URL, stops once the newest article is
        older than TIME_DELTA_DAYS, otherwise yields one Request per article
        plus one Request for the next list page (carrying the tag in meta).
        """
        # Tag items according to which list page this response came from.
        if response.url == "http://www.jwc.shu.edu.cn/index/tzgg.htm":
            self.tag = "通知公告"
        elif response.url == "http://www.jwc.shu.edu.cn/index/xw.htm":
            self.tag = "新闻"
        post_nodes = response.css(
            "#dnn_ctr43516_ArticleList__ctl0_ArtDataList__ctl1_titleLink1::attr(href)"
        ).extract()
        # Publish time of the newest article. BUG FIX: the selector was
        # missing the leading '#' (it is an element id), so extract_first()
        # always returned None.
        news_time = response.css(
            "#dnn_ctr43516_ArticleList__ctl0_ArtDataList__ctl1_Label6::text"
        ).extract_first()

        # Stop crawling once the newest article falls outside the window.
        if pytime.count(pytime.today(),
                        news_time) > datetime.timedelta(TIME_DELTA_DAYS):
            print(news_time)
            return

        for post_node in post_nodes:
            yield Request(url=parse.urljoin(response.url, post_node),
                          callback=self.parse_detail)

        # Follow the "Next" pagination link, if present.
        next_url = response.css(
            "a.Next:nth-child(3)::attr(href)").extract_first("")
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url),
                          meta={"tag": self.tag},
                          callback=self.parse)
Beispiel #4
0
    def parse(self, response):
        """Parse the article table, yielding a detail Request per recent row.

        Rows are assumed newest-first; iteration stops at the first row
        older than TIME_DELTA_DAYS. (Removed dead locals `url` and `delta`
        that were computed but never used.)
        """
        post_nodes = response.css(".views-table > tbody:nth-child(1) tr")
        for post_node in post_nodes:
            create_date = post_node.css(
                ".views-field-created::text").extract_first().strip()
            # NOTE(review): "%y" parses a two-digit year; if the site shows
            # four-digit years (e.g. "2015-05-17") this should be "%Y" —
            # confirm against the live page before changing.
            create_date = datetime.datetime.strptime(create_date, "%y-%m-%d")
            post_node_url = post_node.css("a::attr(href)").extract_first()

            if pytime.count(pytime.today(),
                            create_date) < datetime.timedelta(TIME_DELTA_DAYS):
                yield Request(url=parse.urljoin(response.url, post_node_url),
                              meta={"create_date": create_date},
                              callback=self.parse_detail,
                              dont_filter=True)
            else:
                # Rows are date-ordered, so every later row is older: stop.
                break
Beispiel #5
0
    def parse(self, response):
        """Parse the article list page.

        Emits one detail Request per article (tagged with the breadcrumb
        text) and a Request for the next page, stopping once the newest
        article falls outside the TIME_DELTA_DAYS window.
        """
        breadcrumb_tag = response.css(
            "#dnn_dnnBREADCRUMB_lblBreadCrumb > a:nth-child(2)::text"
        ).extract_first()
        article_links = response.css(
            "#dnn_ctr1053_ArticleList_ctl00_lstArticles > tbody:nth-child(1) a::attr(href)"
        ).extract()
        newest_time = response.css(
            "#dnn_ctr1053_ArticleList_ctl00_lstArticles_ctl00_lblPublishDate::text"
        ).extract_first()

        # Too old: stop crawling this list entirely.
        age = pytime.count(pytime.today(), newest_time)
        if age > datetime.timedelta(TIME_DELTA_DAYS):
            print(newest_time)
            return

        for link in article_links:
            yield Request(url=parse.urljoin(response.url, link),
                          meta={"tag": breadcrumb_tag},
                          callback=self.parse_detail)

        # Follow the "Next" pagination link, if any.
        next_url = response.css(
            "a.Next:nth-child(3)::attr(href)").extract_first("")
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url),
                          callback=self.parse)
    def parse(self, response):
        """Parse the article list, yielding tagged detail Requests plus the
        next-page Request, stopping outside the TIME_DELTA_DAYS window."""
        post_nodes = response.css(
            "#dnn_ctr59825_ArticleList__ctl0_ArtDataList a::attr(href)"
        ).extract()
        news_time = response.css(
            "#dnn_ctr59825_ArticleList__ctl0_ArtDataList__ctl0_Label6::text"
        ).extract_first()
        if pytime.count(pytime.today(),
                        news_time) > datetime.timedelta(TIME_DELTA_DAYS):
            print(news_time + "\n")
            return

        # BUG FIX: extract the breadcrumb text; the original passed the raw
        # SelectorList downstream in meta.
        tag = response.css(
            "#dnn_dnnBREADCRUMB_lblBreadCrumb > a::text").extract_first()

        if "tabid=31641" in response.url:
            tag = '学工新闻'

        for post_node in post_nodes:
            # BUG FIX: meta was the SET literal {"tag", tag}; Scrapy meta
            # must be a dict mapping "tag" to the tag value.
            yield Request(url=parse.urljoin(response.url, post_node),
                          meta={"tag": tag},
                          callback=self.parse_detail)

        # Follow pagination.
        next_url = response.css(
            "#dnn_ctr59825_ArticleList__ctl0_lbtnNext::attr(href)"
        ).extract_first("")
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url),
                          callback=self.parse)
Beispiel #7
0
 def test_count(self):
     """pytime.count returns the signed timedelta between two date strings."""
     expected = {
         ('2015517', '2015519'): datetime.timedelta(-2),
         ('2015517', '2015519 23:23:23'): datetime.timedelta(-3, 2197),
         ('2015517 23:23:23', '2015519 23:23:23'): datetime.timedelta(-2),
         ('2015519 23:23:23', '2015-5-17'): datetime.timedelta(2, 84203),
     }
     for (start, end), want in expected.items():
         self.assertEqual(pytime.count(start, end), want)
Beispiel #8
0
 def test_count(self):
     """pytime.count('2015517', '2015519') spans minus two days.

     BUG FIX: assertTrue(x, msg) treated the expected timedelta as the
     failure *message* and only checked truthiness of the result;
     assertEqual actually compares the two values.
     """
     self.assertEqual(pytime.count('2015517', '2015519'),
                      datetime.timedelta(-2))
Beispiel #9
0
 def process_item(self, item, spider):
     """Pass an item through the pipeline only if it is recent enough.

     Returns the item when its create_date is within TIME_DELTA_DAYS of
     today; otherwise falls through and implicitly returns None.
     """
     age = pytime.count(pytime.today(), item['create_date'])
     if age < datetime.timedelta(TIME_DELTA_DAYS):
         return item