Example #1
    def test_function(self):
        this1 = pytime.today() == datetime.date.today()
        self.assertTrue(this1)
        this2 = pytime.today(2014) == datetime.date.today().replace(year=2014)
        self.assertTrue(this2)
        this3 = pytime.tomorrow() == datetime.date.today() + datetime.timedelta(days=1)
        self.assertTrue(this3)
        this4 = pytime.tomorrow('2015-5-19') == datetime.date(2015, 5, 20)
        self.assertTrue(this4)
        this5 = pytime.yesterday() == datetime.date.today() - datetime.timedelta(days=1)
        self.assertTrue(this5)
        this6 = pytime.yesterday('2015-5-29') == datetime.date(2015, 5, 28)
        self.assertTrue(this6)
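Read as documentation, the assertions above pin down the basic pytime API: pytime.today(), pytime.tomorrow(), and pytime.yesterday() mirror plain datetime.date arithmetic, pytime.today(2014) swaps in a year, and tomorrow/yesterday also accept a loose date string with single-digit month or day. A minimal standalone check, reusing only the calls the test itself exercises:

import datetime

import pytime

print(pytime.today() == datetime.date.today())  # True
print(pytime.tomorrow('2015-5-19'))             # datetime.date(2015, 5, 20)
print(pytime.yesterday('2015-5-29'))            # datetime.date(2015, 5, 28)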
Example #2

    def parse(self, response):
        # Parse every article URL on the list page and hand it to Scrapy to download and parse

        # HtmlResponse is always truthy, so this loops until the stale-news check returns
        while response:
            post_nodes = response.css(
                "#dnn_ctr59828_ArticleList__ctl0_ArtDataList a::attr(href)"
            ).extract()
            news_time = response.css(
                "#dnn_ctr59828_ArticleList__ctl0_ArtDataList__ctl0_Label6::text"
            ).extract_first()
            if pytime.count(pytime.today(),
                            news_time) > datetime.timedelta(TIME_DELTA_DAYS):
                print(news_time + "\n")
                return

            for post_node in post_nodes:
                yield Request(url=parse.urljoin(response.url, post_node),
                              callback=self.parse_detail)

            # Click the next-page control in the live browser, then re-wrap the new page
            self.browser.find_element_by_css_selector(
                "#dnn_ctr59828_ArticleList__ctl0_lbtnNext").click()
            selector = Selector(text=self.browser.page_source)
            page_num = selector.css(
                "#dnn_ctr59828_ArticleList__ctl0_plPageNum::text"
            ).extract_first()
            print("page is " + page_num)
            response = HtmlResponse(url=self.browser.current_url,
                                    body=self.browser.page_source,
                                    encoding="utf-8")

        return
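The guard near the top of the loop is the idiom every spider in this collection shares: pytime.count returns the datetime.timedelta between two dates (accepting date objects and date strings alike, as used here), and crawling stops once that gap exceeds the cutoff. A standalone sketch of the pattern, assuming TIME_DELTA_DAYS is the integer day count the spiders import from their settings:

import datetime

import pytime

TIME_DELTA_DAYS = 30  # assumed value; the spiders import this constant from settings

def is_stale(news_time):
    # pytime.count(a, b) returns the timedelta between two dates or date strings
    return pytime.count(pytime.today(), news_time) > datetime.timedelta(days=TIME_DELTA_DAYS)

print(is_stale('2015-5-19'))  # True once that date is more than TIME_DELTA_DAYS old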
Example #3
    def parse(self, response):
        # Parse every article URL on the list page and hand it to Scrapy to download and parse
        if response.url == "http://www.jwc.shu.edu.cn/index/tzgg.htm":
            self.tag = "通知公告"  # "notices and announcements"
        elif response.url == "http://www.jwc.shu.edu.cn/index/xw.htm":
            self.tag = "新闻"  # "news"
        post_nodes = response.css(
            "#dnn_ctr43516_ArticleList__ctl0_ArtDataList__ctl1_titleLink1::attr(href)"
        ).extract()
        news_time = response.css(
            "#dnn_ctr43516_ArticleList__ctl0_ArtDataList__ctl1_Label6::text"
        ).extract_first()  # note the leading "#": without it the id selector matches nothing

        if pytime.count(pytime.today(),
                        news_time) > datetime.timedelta(TIME_DELTA_DAYS):
            print(news_time)
            return

        for post_node in post_nodes:
            yield Request(url=parse.urljoin(response.url, post_node),
                          callback=self.parse_detail)

        # Extract the next-page link and hand it to Scrapy to download
        next_url = response.css(
            "a.Next:nth-child(3)::attr(href)").extract_first("")
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url),
                          meta={"tag": self.tag},
                          callback=self.parse)
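The if/elif chain that maps section URLs to self.tag grows linearly with each new section; a module-level dict expresses the same mapping in one lookup. A hypothetical refactor, reusing only the URLs and tags from the spider above:

# Assumed helper, not part of the original spider
SECTION_TAGS = {
    "http://www.jwc.shu.edu.cn/index/tzgg.htm": "通知公告",  # notices and announcements
    "http://www.jwc.shu.edu.cn/index/xw.htm": "新闻",      # news
}

def tag_for(url, default=""):
    return SECTION_TAGS.get(url, default)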
Example #4
def main():

    config = get_config()
    flags = sys.argv

    pull_latest = False

    # scan the command-line flags
    for flag in flags:
        if flag == '--latest':
            pull_latest = True

    query = ''

    # if we are only pulling the latest data, find out when the program last ran
    if pull_latest:
        last_run = get_last_run_time(config['last_run'])
        query = 'after:{}'.format(last_run)

    # pull new Gmail data and record when this run happened
    try:
        pull_gmail_data(query)
        config['last_run'] = str(pytime.today())
        save_config(config)
    except NoMessagesFoundException as e:
        print(e)

    # read the downloaded HTML files and extract events
    html = HtmlReader.HtmlReader()
    events = html.read_all('./htmlFilesv2/*.html')

    write_to_csv(events)
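get_config, save_config, and get_last_run_time are not shown in this example. A plausible minimal version, assuming a JSON config file with a last_run key holding str(pytime.today()) (the file name, layout, and date reformatting are all assumptions, not part of the original):

import json

CONFIG_PATH = "config.json"  # assumed location

def get_config():
    with open(CONFIG_PATH) as f:
        return json.load(f)

def save_config(config):
    with open(CONFIG_PATH, "w") as f:
        json.dump(config, f, indent=2)

def get_last_run_time(last_run):
    # Gmail's "after:" operator accepts YYYY/MM/DD, while str(pytime.today())
    # stores an ISO date (YYYY-MM-DD), so swap the separators.
    return last_run.replace("-", "/")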
Example #5
    def test_function(self):
        this1 = pytime.today() == datetime.date.today()
        self.assertTrue(this1)
        this2 = pytime.today(2014) == datetime.date.today().replace(year=2014)
        self.assertTrue(this2)
        this3 = pytime.tomorrow() == datetime.date.today() + datetime.timedelta(days=1)
        self.assertTrue(this3)
        this4 = pytime.tomorrow('2015-5-19') == datetime.date(2015, 5, 20)
        self.assertTrue(this4)
        this5 = pytime.yesterday() == datetime.date.today() - datetime.timedelta(days=1)
        self.assertTrue(this5)
        this6 = pytime.yesterday('2015-5-29') == datetime.date(2015, 5, 28)
        self.assertTrue(this6)
        this7 = pytime.yesterday(1432310400 + gmt8offset) == datetime.datetime(2015, 5, 22)
        self.assertTrue(this7)
        this8 = pytime.tomorrow(1432310400 + gmt8offset) == datetime.datetime(2015, 5, 24)
        self.assertTrue(this8)
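The last two assertions feed pytime a Unix timestamp rather than a date string; gmt8offset is presumably defined so the expectations hold regardless of the machine's timezone. The constant 1432310400 is midnight of 2015-05-23 in GMT+8, which is why yesterday and tomorrow land on 2015-05-22 and 2015-05-24. A standard-library cross-check:

from datetime import datetime, timedelta, timezone

gmt8 = timezone(timedelta(hours=8))
midnight = datetime.fromtimestamp(1432310400, tz=gmt8)
print(midnight)                      # 2015-05-23 00:00:00+08:00
print(midnight - timedelta(days=1))  # 2015-05-22 00:00:00+08:00
print(midnight + timedelta(days=1))  # 2015-05-24 00:00:00+08:00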
Example #6
    def parse(self, response):
        # Parse every article URL on the list page and hand it to Scrapy to download and parse
        post_nodes = response.css(".views-table > tbody:nth-child(1) tr")
        for post_node in post_nodes:
            create_date = post_node.css(
                ".views-field-created::text").extract_first().strip()
            # "%y" expects a two-digit year; use "%Y" if the site prints four digits
            create_date = datetime.datetime.strptime(create_date, "%y-%m-%d")
            post_node_url = post_node.css("a::attr(href)").extract_first()

            if pytime.count(pytime.today(),
                            create_date) < datetime.timedelta(TIME_DELTA_DAYS):
                yield Request(url=parse.urljoin(response.url, post_node_url),
                              meta={"create_date": create_date},
                              callback=self.parse_detail,
                              dont_filter=True)
            else:
                break
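The else: break assumes the rows are newest-first and stops the scan at the first stale row. One detail worth pinning down is the strptime directive: "%y" matches only a two-digit year, so a four-digit date raises ValueError. A quick illustration (the sample strings are assumptions about the site's date format):

import datetime

print(datetime.datetime.strptime("15-05-21", "%y-%m-%d"))  # 2015-05-21 00:00:00
try:
    datetime.datetime.strptime("2015-05-21", "%y-%m-%d")
except ValueError as e:
    print(e)  # time data '2015-05-21' does not match format '%y-%m-%d'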
Example #7
    def parse(self, response):
        # Parse every article URL on the list page and hand it to Scrapy to download and parse
        # Breadcrumb XPath: //*[@id="dnn_dnnBREADCRUMB_lblBreadCrumb"]/a[2]
        tag = response.css("#dnn_dnnBREADCRUMB_lblBreadCrumb > a:nth-child(2)::text").extract_first()
        post_nodes = response.css(
            "#dnn_ctr1053_ArticleList_ctl00_lstArticles > tbody:nth-child(1) a::attr(href)").extract()
        news_time = response.css(
            "#dnn_ctr1053_ArticleList_ctl00_lstArticles_ctl00_lblPublishDate::text").extract_first()
        if pytime.count(pytime.today(), news_time) > datetime.timedelta(TIME_DELTA_DAYS):
            print(news_time)
            return

        for post_node in post_nodes:
            yield Request(url=parse.urljoin(response.url, post_node),
                          meta={"tag": tag},
                          callback=self.parse_detail)

        # Extract the next-page link and hand it to Scrapy to download
        next_url = response.css("a.Next:nth-child(3)::attr(href)").extract_first("")

        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
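All of the spiders here resolve the (often relative) next-page href against the current page URL before requesting it; urllib.parse.urljoin handles relative and absolute hrefs uniformly. A quick demonstration (the relative paths are illustrative, not taken from the site):

from urllib import parse

base = "http://www.jwc.shu.edu.cn/index/tzgg.htm"
print(parse.urljoin(base, "tzgg/2.htm"))     # http://www.jwc.shu.edu.cn/index/tzgg/2.htm
print(parse.urljoin(base, "/index/xw.htm"))  # http://www.jwc.shu.edu.cn/index/xw.htm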
Example #8

    def parse(self, response):
        # Parse every article URL on the list page and hand it to Scrapy to download and parse
        post_nodes = response.css(
            "#dnn_ctr59825_ArticleList__ctl0_ArtDataList a::attr(href)").extract()
        news_time = response.css(
            "#dnn_ctr59825_ArticleList__ctl0_ArtDataList__ctl0_Label6::text").extract_first()
        if pytime.count(pytime.today(), news_time) > datetime.timedelta(TIME_DELTA_DAYS):
            print(news_time+"\n")
            return

        # extract_first() yields the breadcrumb text itself, not a SelectorList
        tag = response.css("#dnn_dnnBREADCRUMB_lblBreadCrumb > a::text").extract_first()

        if "tabid=31641" in response.url:
            tag = '学工新闻'

        for post_node in post_nodes:
            yield Request(url=parse.urljoin(response.url, post_node),
                          meta={"tag": tag},  # a dict; the original {"tag", tag} built a set
                          callback=self.parse_detail)

        # Extract the next-page link and hand it to Scrapy to download
        next_url = response.css("#dnn_ctr59825_ArticleList__ctl0_lbtnNext::attr(href)").extract_first("")
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
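meta is how these listing pages hand values to their detail callback; the receiving side reads them back from response.meta. A hypothetical parse_detail skeleton to show the round trip (the item fields are assumptions; the original callback is not included in these examples):

    def parse_detail(self, response):
        # Values placed in Request(meta=...) come back on response.meta
        tag = response.meta.get("tag", "")
        title = response.css("title::text").extract_first("")
        yield {"tag": tag, "title": title, "url": response.url}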
Example #9
    def process_item(self, item, spider):
        # Forward only items newer than the cutoff; stale items are simply not
        # returned (the idiomatic alternative, DropItem, is sketched below)
        if pytime.count(
                pytime.today(),
                item['create_date']) < datetime.timedelta(TIME_DELTA_DAYS):
            return item
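A pipeline's process_item is expected to return an item or raise scrapy.exceptions.DropItem; raising makes the drop explicit and logged instead of silently returning None. A sketch along those lines, keeping the same cutoff logic (the class name and TIME_DELTA_DAYS value are assumptions):

import datetime

import pytime
from scrapy.exceptions import DropItem

TIME_DELTA_DAYS = 30  # assumed cutoff, matching the spiders above

class FreshnessFilterPipeline:
    def process_item(self, item, spider):
        if pytime.count(pytime.today(),
                        item['create_date']) < datetime.timedelta(days=TIME_DELTA_DAYS):
            return item
        raise DropItem("stale item: {}".format(item['create_date']))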