Example #1
    def parse_next_page(self, response):
        origin = "中华人民共和国教育部"
        column = response.meta['column']
        result_list = response.xpath("//li")
        if not result_list:  # an XPath query returns an empty SelectorList, never None
            return
        for item in result_list:
            title = item.xpath("./a/text()").extract_first()
            link = item.xpath("./a/@href").extract_first()
            birth = item.xpath("./span/text()").extract_first()
            date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            edu_info_item = CommonItem()
            edu_info_item["origin"] = origin
            edu_info_item["column"] = column
            edu_info_item["title"] = title
            edu_info_item["link"] = link
            edu_info_item["birth"] = birth
            edu_info_item["date"] = date
            yield Request(url=edu_info_item["link"], meta={"item": edu_info_item}, callback=self.parse_content)
        next_page = response.meta['next_page']
        url = response.meta['url']
        page_num = int(next_page)

        # Pagination: next_page arrives via meta and may be an int, so cast before concatenating
        if url is not None:
            yield Request(url=url + str(next_page), meta={"next_page": page_num + 1, "url": url, "column": column},
                          callback=self.parse_next_page)
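
All of these snippets assume the same surrounding imports (import time; from scrapy import Request; from scrapy_splash import SplashRequest) and a shared CommonItem whose definition is not shown. A minimal sketch of that item, reconstructed only from the fields the examples assign (not the project's actual file), could look like:

    # Hypothetical items.py -- fields inferred from the assignments in these examples.
    import scrapy

    class CommonItem(scrapy.Item):
        origin = scrapy.Field()        # human-readable source site name
        column = scrapy.Field()        # section/category of the article
        title = scrapy.Field()
        link = scrapy.Field()          # absolute URL of the detail page
        birth = scrapy.Field()         # publication date scraped from the page
        date = scrapy.Field()          # crawl timestamp, "%Y-%m-%d %H:%M:%S"
        intro = scrapy.Field()
        content = scrapy.Field()       # raw HTML of the article body
        organ = scrapy.Field()         # publishing organisation
        author = scrapy.Field()
        dataOriginId = scrapy.Field()  # internal source id (self.origin_id)
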
Example #2
    def parse(self, response):
        # print("---------------------------------------------------------------------------------------")
        # newsName = response.xpath("//div[@class='title trim']/a").xpath('string(.)').extract()
        # print(newsName)
        # list =[]
        p = "http://www.tech.net.cn"
        str(self).encode("utf-8")

        # Listing-page selectors
        news_urls = response.xpath("//h4/a/@href")
        next_page_url = response.xpath("//div[@class='digg']/a[last()]/@href").extract_first()
        # Detail pages have an <h1> title; listing pages do not
        title = response.xpath("//h1[@class='m-t-10 m-b-5']/text()").extract_first()

        if len(news_urls) != 0 and title is None:
            # Listing page: queue each article link for a second pass through parse()
            for url in news_urls:
                link = base_url + url.extract()
                news_info = CommonItem()
                news_info["link"] = link
                news_info["dataOriginId"] = self.origin_id
                yield scrapy.Request(news_info["link"], meta={"news_info": news_info})

            # Pagination: follow the "next page" link
            if next_page_url is not None:
                yield scrapy.Request(url=base_url + next_page_url)
        else:
            # Detail page: the <h1> title was already found above
            if title is not None and len(title) > 0:
                content = response.xpath("//div[@class='contentabnc']/section").extract_first()
                date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                birth = response.xpath("//div[@class='info font-weight-300']/span[1]/text()").extract_first()
                if birth is not None:
                    birth = birth.split(':')[-1]
                columns = response.xpath("//ol/li[@class='breadcrumb-item']/text()").extract()
                column = columns[-1].split('> ')[-1] if columns else None
                intro = response.xpath("//h4/text()").extract_first()
                source = response.xpath("//div[@class='info font-weight-300']/span[4]/text()").extract_first()
                if source is not None:
                    source = source.split(': ')[-1]
                # print("----------------------", birth)
                news_info = response.meta["news_info"]
                news_info["content"] = content
                news_info["date"] = date
                news_info["birth"] = birth
                news_info["intro"] = intro
                news_info["organ"] = source
                news_info["origin"] = "中国高职高专网"
                news_info["title"] = title
                news_info["column"] = column
                # print("info", news_info)
                yield news_info
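
The parse method above does double duty: the queued detail requests carry no callback, so they come back through parse itself, and the presence of the <h1> title decides which branch runs. The enclosing spider class is not shown; a minimal skeleton it could live in (class name, spider name, and start URL are assumptions, only origin_id is implied by the code) might be:

    # Hypothetical enclosing spider; only origin_id is implied by the code above.
    import scrapy

    class TechNewsSpider(scrapy.Spider):
        name = "tech_news"                       # assumed spider name
        start_urls = ["http://www.tech.net.cn"]  # assumed entry point
        origin_id = "tech_net_cn"                # referenced as self.origin_id

        def parse(self, response):
            ...  # the dual-purpose listing/detail parse shown above
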
Example #3
 def parse_next_page(self, response):
     origin = "中华人民共和国教育部"
     column = "教育要闻"
     result_list = response.xpath("//li")
     for item in result_list:
         title = item.xpath("./a/text()").extract_first()
         link = item.xpath("./a/@href").extract_first()
         birth = item.xpath("./span/text()").extract_first()
         date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
         edu_info_item = CommonItem()
         edu_info_item["origin"] = origin
         edu_info_item["column"] = column
         edu_info_item["title"] = title
         if link is None:
             continue
         # Normalise relative links against the site root
         if not link.startswith("http"):
             link = "http://www.moe.gov.cn" + link
         edu_info_item["link"] = link
         edu_info_item["birth"] = birth
         edu_info_item["date"] = date
         yield Request(url=link, meta={"item": edu_info_item}, callback=self.parse_content)
     next_page = response.meta['next_page']
     url = response.meta['url']
     page_num = int(next_page)
     # Only keep paginating while the current page produced results
     if result_list and url is not None:
         yield Request(url=url + str(next_page), meta={"next_page": str(page_num + 1), "url": url},
                       callback=self.parse_next_page)
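
Prefixing "http://www.moe.gov.cn" works for this site, but Scrapy already ships a generic way to absolutise links. A sketch of the same normalisation using response.urljoin, which resolves relative paths (including ../../ segments) against the page the link came from:

    # Sketch: response.urljoin wraps urllib.parse.urljoin with the response
    # URL as base, so relative and absolute hrefs are both handled.
    link = item.xpath("./a/@href").extract_first()
    if link is not None:
        link = response.urljoin(link)
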
Example #4
 def parse(self, response):
     detail_urls = response.xpath("//div[@class='newsList right']/ul[@class='list']/li/a/@href").extract()
     next_url = response.xpath("//div[@class='page']/a[@class='next']/@href").extract_first()
     for reality_url in detail_urls:
         # One item per request: a single shared CommonItem would be mutated
         # by later iterations before the callbacks get to run
         item = CommonItem()
         yield scrapy.Request(reality_url, meta={'item': item, 'url': reality_url}, callback=self.parse2)
     # The "next" link is missing on the last page, so guard before requesting it
     if next_url is not None:
         yield scrapy.Request(next_url)
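
On Scrapy 1.4+ the same loop can be written with response.follow, which resolves relative hrefs and accepts <a> selectors directly, so extracting and re-querying by index is unnecessary. A sketch under those assumptions:

    # Sketch using response.follow; .get() is the newer alias of extract_first().
    for a in response.xpath("//div[@class='newsList right']/ul[@class='list']/li/a"):
        yield response.follow(a, callback=self.parse2, meta={'item': CommonItem()})
    next_url = response.xpath("//div[@class='page']/a[@class='next']/@href").get()
    if next_url:
        yield response.follow(next_url, callback=self.parse)
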
Example #5
    def parse(self, response):
        origin = "广西八桂职教"
        # Save the downloaded HTML for debugging
        save_to_file("gfjyb_edu_info.html", response.text)

        next_url = response.xpath(
            "//div[@class='d-h d-h-content']/div[@class='container clearfix']/div[@class='search_fl_Content']/div[@id='d_pagination']/a[@class='next']/@href"
        ).extract_first()
        next_page = response.urljoin(next_url)

        items = response.xpath(
            "//div[@class='d-h d-h-content']/div[@class='container clearfix']/div[@class='search_fl_Content']/div[@class='top-10']/div[@class='title trim']"
        )
        for item in items:
            link = item.xpath("./a/@href").extract_first()
            if link is not None:
                # Build a fresh item per request; a single shared item would be
                # mutated by later iterations before the callbacks run
                bgzi_info_item = CommonItem()
                bgzi_info_item['origin'] = origin
                bgzi_info_item['birth'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                bgzi_info_item['link'] = link
                yield SplashRequest(link,
                                    meta={"item": bgzi_info_item},
                                    callback=self.parse_content)


        if next_url is not None:
            # Continue with the remaining pages of this category
            yield SplashRequest(next_page)
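
SplashRequest comes from the scrapy-splash plugin and only works against a running Splash instance. A minimal settings.py sketch, assuming Splash listens on localhost:8050 (middleware names and priorities follow the scrapy-splash README):

    # settings.py sketch for scrapy-splash; the Splash URL is an assumption.
    SPLASH_URL = "http://localhost:8050"

    DOWNLOADER_MIDDLEWARES = {
        "scrapy_splash.SplashCookiesMiddleware": 723,
        "scrapy_splash.SplashMiddleware": 725,
        "scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware": 810,
    }
    SPIDER_MIDDLEWARES = {
        "scrapy_splash.SplashDeduplicateArgsMiddleware": 100,
    }
    DUPEFILTER_CLASS = "scrapy_splash.SplashAwareDupeFilter"
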
Example #6
 def parse(self, response):
     origin = "中华人民共和国教育部"
     # Save the downloaded HTML for debugging
     common.save_to_file("gfjyb_edu_info.html", response.text)
     next_url = response.xpath(
         "//div[@class='scy_tylb_fy-nr']//li[@class='m_page_a m_page_btn'][2]/a/@href").extract_first()
     next_page = response.urljoin(next_url)
     # Selector.xpath() takes a single query; fall back when the second anchor is absent
     column = response.xpath(
         "//div[@id='curpage']/a[@class='CurrChnlCls'][2]/text()").extract_first()
     if column is None:
         column = response.xpath(
             "//div[@id='curpage']/a[@class='CurrChnlCls']/text()").extract_first()
     items = response.xpath("//div[@id='wcmpagehtml']//ul[@id='list']/li")
     for item in items:
         title = item.xpath("./a/text()").extract_first()
         link = item.xpath("./a/@href").extract_first()
         birth = item.xpath("./span/text()").extract_first()
         date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
         edu_info_item = CommonItem()
         edu_info_item["origin"] = origin
         edu_info_item["column"] = column
         edu_info_item["title"] = title
         if link.startswith("http:", 0, len(link) - 1) or link.startswith("https:", 0, len(link) - 1):
             edu_info_item["link"] = link
         else:
             http_link = "http://www.moe.gov.cn/" + link.replace("../../", "")
             edu_info_item["link"] = http_link
         edu_info_item["birth"] = birth
         edu_info_item["date"] = date
         yield Request(url=edu_info_item["link"], meta={"item": edu_info_item}, callback=self.parse_content)
     if next_url is not None:
         # Continue with the remaining pages of this category
         if next_url.startswith("javascript"):
             # JS pagination: recover the query parameters from the onclick
             # handler, which has the shape getWasRecord(<chnlid>,<channelid>,<page>);
             page_info = response.xpath(
                 "//div[@class='scy_tylb_fy-nr']//li[@class='m_page_a m_page_btn'][2]/a/@onclick").extract_first()
             if page_info is None:
                 page_info = response.xpath(
                     "//ul[@id='page']/li[@class='m_page_a m_page_btn'][2]/a/@onclick").extract_first()
             page = page_info.replace("getWasRecord(", "").replace(");", "").split(",")
             url = ("http://www.moe.gov.cn/was5/web/search?channelid=" + page[1]
                    + "&chnlid=" + page[0] + "&page=")
             page_num = int(page[2])
             yield Request(url=url + page[2], meta={"next_page": page_num + 1, "url": url, "column": column},
                           callback=self.parse_next_page)
         else:
             yield SplashRequest(next_page)
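
The onclick parsing is easy to sanity-check in isolation. A standalone sketch, using a made-up handler value of the shape the spider expects:

    # The onclick value below is a fabricated example of the expected shape.
    page_info = "getWasRecord(8213,229,2);"
    chnlid, channelid, page = page_info.replace("getWasRecord(", "").replace(");", "").split(",")
    url = ("http://www.moe.gov.cn/was5/web/search?channelid=" + channelid
           + "&chnlid=" + chnlid + "&page=")
    print(url + page)  # ...search?channelid=229&chnlid=8213&page=2
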
Example #7
 def parse_context(self, response):
     content_select = response.xpath(
         "//div[@class='view TRS_UEDITOR trs_paper_default trs_web'] "
         "| //div[@class='view TRS_UEDITOR trs_paper_default trs_web trs_key4format']"
         "| //div[@class='view TRS_UEDITOR trs_paper_default trs_web trs_word trs_key4format']"
         "| //div[@class='view TRS_UEDITOR trs_paper_default trs_word']"
         "| //div[@class='view TRS_UEDITOR trs_paper_default']"
         "| //div[@class='view TRS_UEDITOR trs_paper_default trs_web trs_word']"
         "| //div[@class='view TRS_UEDITOR trs_paper_default trs_word trs_key4format']"
     )
     content = content_select.extract_first()
     # Use a relative XPath: a bare //img would search the whole document,
     # not just the selected content node
     img_list = content_select.xpath(".//img/@src").extract()
     if len(img_list) > 0:
         for img in img_list:
             # Rewrite relative image paths against this column's base URL
             new_img = "http://jyt.gxzf.gov.cn/jyxw/jyyw/" + img.replace(".", "", 1)
             content = content.replace(img, new_img)
     title = response.xpath(
         "//div[@class='article']/h1/text()").extract_first()
     raw_inf = response.xpath(
         "//div[@class='article-inf-left']/text()").extract_first() or ""
     origin_and_birth = raw_inf.strip().split("\n", 1)
     link = response.xpath("//meta[@name='Url']/@content").extract_first()
     origin = "广西壮族自治区教育厅"
     column = "教育要闻"
     date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
     birth = origin_and_birth[0]
     organ = "广西壮族自治区教育厅"
     if len(origin_and_birth) > 1:
         from_origin = origin_and_birth[1].lstrip().replace("来源:", "")
     gxjyt_info_item = CommonItem()
     gxjyt_info_item["origin"] = origin
     gxjyt_info_item["column"] = column
     gxjyt_info_item["title"] = title
     gxjyt_info_item["link"] = link
     gxjyt_info_item["birth"] = birth
     gxjyt_info_item["date"] = date
     gxjyt_info_item["organ"] = organ
     gxjyt_info_item["content"] = content
     gxjyt_info_item["dataOriginId"] = self.origin_id
     yield gxjyt_info_item
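
The hard-coded http://jyt.gxzf.gov.cn/jyxw/jyyw/ prefix only holds for articles in that one column. An alternative sketch that resolves each src against the page that served it, so other columns would work too (behaviour matches the code above only when the article really lives under /jyxw/jyyw/):

    # Sketch: absolutise image paths with the response URL as base.
    for img in content_select.xpath(".//img/@src").extract():
        content = content.replace(img, response.urljoin(img))
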
Example #8
 def parse(self, response):
     origin = "广东省教育厅"
     # Save the downloaded HTML for debugging
     common.save_to_file("edu_info.html", response.text)
     # On the news front page, collect the category URLs
     urls = response.xpath(
         "//div[@class='indexbox']//div[contains(@class,'ggjy_title')]//tr/td[2]/a/@href"
     )
     if len(urls) != 0:
         for url in urls:
             # Follow each category page
             yield scrapy.Request(url.extract())
     # Otherwise this is a category listing under the news section
     else:
         next_url = response.xpath(
             "//div[@class='page']/a[@class='next']/@href").extract_first()
         column = response.xpath(
             "//div[@class='listright_title']//td[@class='lmbt_td']/span/text()"
         ).extract_first()
         items = response.xpath(
             "//div[@class='main_cen']//div[@class='list_list']/ul/li[@class='list_li']"
         )
         for item in items:
             title = item.xpath("./a/text()").extract_first()
             link = item.xpath("./a/@href").extract_first()
             birth = item.xpath("./span/text()").extract_first()
             date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             edu_info_item = CommonItem()
             edu_info_item["origin"] = origin
             edu_info_item["column"] = column
             edu_info_item["title"] = title
             edu_info_item["link"] = link
             edu_info_item["birth"] = birth
             edu_info_item["date"] = date
             yield Request(url=link,
                           meta={"item": edu_info_item},
                           callback=self.parse_content)
         # Continue with the remaining pages of this category; next_url is only
         # defined on listing pages, so the check stays inside this branch
         if next_url is not None:
             yield scrapy.Request(next_url)
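
common.save_to_file appears in several of these spiders but is never defined here. A plausible helper, assuming it simply dumps the page source to disk for offline selector debugging (the name comes from the calls above; the body is a guess):

    # Hypothetical common.py helper; the real implementation may differ.
    def save_to_file(filename, text, encoding="utf-8"):
        """Dump a crawled page to disk so XPaths can be debugged offline."""
        with open(filename, "w", encoding=encoding) as f:
            f.write(text)
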
Example #9
    def parse_content(self, response):
        item = CommonItem()
        # Link of the detail page itself
        item['link'] = response.url
        # Title
        detail_title = response.xpath(
            "//div[@class='container clearfix']/div[@class='search_fl_Content']/div[@class='fTitle']/text()"
        ).extract_first()
        # Column (data-source section)
        detail_column = response.xpath(
            "//div[@class='container clearfix']/div[@class='search_fl_Content']/div[@class='top-10']/div[@class='fPost']/a[3]/text()"
        ).extract_first()
        # Some pages use a different HTML layout
        other_detail_column = response.xpath(
            "//div[@class='container clearfix']/div[@class='search_fl_Content']/div[@class='top-10']/div[@class='fPost']/a/text()"
        ).extract_first()
        # Publishing organisation
        detail_organ = response.xpath(
            "//div[@class='container clearfix']/div[@class='search_fl_Content']/div[@class='flai']/span[1]/text()"
        ).extract_first()
        # Author
        detail_author = response.xpath(
            "//div[@class='container clearfix']/div[@class='search_fl_Content']/div[@class='flai']/span[2]/text()"
        ).extract_first()
        # Publication date
        detail_birth = response.xpath(
            "//div[@class='container clearfix']/div[@class='search_fl_Content']/div[@class='flai']/span[3]/text()"
        ).extract_first()
        # Article body
        detail_content = response.xpath(
            "//div[@class='container clearfix']/div[@class='search_fl_Content']"
            "/div[@class='fcontent']/div[@class='contentBoxF']/p"
            " | //div[@class='container clearfix']/div[@class='search_fl_Content']"
            "/div[@class='fcontent']/div[@class='contentBoxF']/div"
        )

        item['title'] = detail_title
        if detail_column is not None:
            item['column'] = detail_column
        else:
            item['column'] = other_detail_column
        item['organ'] = detail_organ
        item["dataOriginId"] = self.origin_id
        item['author'] = detail_author
        item['birth'] = detail_birth
        detail_content_str = ""
        # Convert relative image paths in the body to absolute URLs
        for content in detail_content:
            old_src = content.xpath(
                "./img/@src | ./strong/img/@src").extract_first()
            if old_src is not None:
                new_src = response.urljoin(old_src)
                detail_content_str += content.extract().replace(old_src, new_src)
            else:
                detail_content_str += content.extract()

        item['content'] = detail_content_str
        item['date'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        yield item
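
Every example ends by yielding a CommonItem, which Scrapy hands to whatever item pipelines are enabled. A minimal hypothetical pipeline (class name and JSON-lines output are assumptions, not part of the original project) showing where these items end up:

    # Hypothetical pipeline: persists each CommonItem as one JSON line.
    import json

    class JsonLinesPipeline:
        def open_spider(self, spider):
            self.file = open("items.jl", "a", encoding="utf-8")

        def process_item(self, item, spider):
            self.file.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
            return item

        def close_spider(self, spider):
            self.file.close()

It would be enabled in settings.py via ITEM_PIPELINES = {"myproject.pipelines.JsonLinesPipeline": 300}, where the module path is likewise an assumption.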