def parse_next_page(self, response):
    """Parse one MOE list page: emit a detail Request per <li>, then follow pagination.

    Expects response.meta to carry 'column', 'next_page' (page number) and
    'url' (pagination base URL).
    """
    origin = "中华人民共和国教育部"
    column = response.meta['column']
    result_list = response.xpath("//li")
    # xpath() returns a SelectorList, never None — the original `is None`
    # check could not fire; test for emptiness instead (also avoids the
    # NameError on `item` after a zero-iteration loop).
    if not result_list:
        return
    for item in result_list:
        title = item.xpath("./a/text()").extract_first()
        link = item.xpath("./a/@href").extract_first()
        birth = item.xpath("./span/text()").extract_first()
        date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        edu_info_item = CommonItem()
        edu_info_item["origin"] = origin
        edu_info_item["column"] = column
        edu_info_item["title"] = title
        edu_info_item["link"] = link
        edu_info_item["birth"] = birth
        edu_info_item["date"] = date
        yield Request(url=edu_info_item["link"],
                      meta={"item": edu_info_item},
                      callback=self.parse_content)
    next_page = response.meta['next_page']
    url = response.meta['url']
    page_num = int(next_page)
    if url is not None:
        # The recursive meta passes next_page as an int, so coerce to str
        # before concatenating into the URL (the original TypeError'd here
        # on the second hop).
        yield Request(url=url + str(next_page),
                      meta={"next_page": page_num + 1, "url": url, "column": column},
                      callback=self.parse_next_page)
def parse(self, response):
    """Dispatch for www.tech.net.cn: list pages fan out to article pages,
    article pages fill in the CommonItem carried in response.meta.
    """
    base = "http://www.tech.net.cn"
    # List-page selectors.
    news_urls = response.xpath("//h4/a/@href")
    next_page_url = response.xpath("//div[@class='digg']/a[last()]/@href").extract_first()
    title = response.xpath("//h1[@class='m-t-10 m-b-5']/text()").extract_first()
    if len(news_urls) != 0 and title is None:
        # List page: request each article, carrying a pre-seeded item.
        for url in news_urls:
            link = base + url.extract()
            news_info = CommonItem()
            news_info["link"] = link
            news_info["dataOriginId"] = self.origin_id
            yield scrapy.Request(news_info["link"], meta={"news_info": news_info})
        # Pagination.
        if next_page_url is not None:
            yield scrapy.Request(url=base + next_page_url)
    else:
        # Article page: fill in the item created on the list page.
        title = response.xpath("//h1[@class='m-t-10 m-b-5']/text()").extract_first()
        if title is not None and len(title) > 0:
            content = response.xpath("//div[@class='contentabnc']/section").extract_first()
            date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            birth = response.xpath(
                "//div[@class='info font-weight-300']/span[1]/text()").extract_first()
            if birth is not None:
                birth = birth.split(':')[-1]
            # Last breadcrumb entry is the column; guard against an empty
            # breadcrumb (the original indexed [-1] unconditionally).
            crumbs = response.xpath("//ol/li[@class='breadcrumb-item']/text()").extract()
            column = crumbs[-1].split('> ')[-1] if crumbs else None
            intro = response.xpath("//h4/text()").extract_first()
            source = response.xpath(
                "//div[@class='info font-weight-300']/span[4]/text()").extract_first()
            if source is not None:
                source = source.split(': ')[-1]
            news_info = response.meta["news_info"]
            news_info["content"] = content
            news_info["date"] = date
            news_info["birth"] = birth
            news_info["intro"] = intro
            news_info["organ"] = source
            news_info["origin"] = "中国高职高专网"
            news_info["title"] = title
            news_info["column"] = column
            yield news_info
def parse_next_page(self, response):
    """Parse an MOE '教育要闻' list page and follow pagination via meta."""
    origin = "中华人民共和国教育部"
    column = "教育要闻"
    result_list = response.xpath("//li")
    # Empty page: nothing to emit, and the pagination code below referenced
    # the loop variable `item`, which would NameError after a zero-iteration
    # loop — bail out early instead.
    if not result_list:
        return
    for item in result_list:
        title = item.xpath("./a/text()").extract_first()
        link = item.xpath("./a/@href").extract_first()
        if link is None:
            # <li> without an <a href>: skip rather than crash on startswith().
            continue
        birth = item.xpath("./span/text()").extract_first()
        date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        edu_info_item = CommonItem()
        edu_info_item["origin"] = origin
        edu_info_item["column"] = column
        edu_info_item["title"] = title
        if not link.startswith("http"):
            link = "http://www.moe.gov.cn" + link
        edu_info_item["link"] = link
        edu_info_item["birth"] = birth
        edu_info_item["date"] = date
        yield Request(url=link, meta={"item": edu_info_item}, callback=self.parse_content)
    next_page = response.meta['next_page']
    url = response.meta['url']
    next_page_num = str(int(next_page) + 1)
    if url is not None:
        yield Request(url=url + str(next_page),
                      meta={"next_page": next_page_num, "url": url},
                      callback=self.parse_next_page)
def parse(self, response):
    """Follow every detail link on the news list page, then the next page."""
    detail_urls = response.xpath(
        "//div[@class='newsList right']/ul[@class='list']/li/a/@href").extract()
    next_url = response.xpath(
        "//div[@class='page']/a[@class='next']/@href").extract_first()
    # The original rebuilt an indexed XPath per item; the extracted list
    # already holds the same hrefs in document order. It also shared one
    # CommonItem across every request — give each request its own.
    for reality_url in detail_urls:
        item = CommonItem()
        yield scrapy.Request(reality_url,
                             meta={'item': item, 'url': reality_url},
                             callback=self.parse2)
    # On the last page there is no 'next' link; Request(None) would raise.
    if next_url is not None:
        yield scrapy.Request(next_url)
def parse(self, response):
    """Parse a 广西八桂职教 search/list page rendered via Splash."""
    origin = "广西八桂职教"
    print(origin)
    # Save the downloaded HTML for offline debugging.
    save_to_file("gfjyb_edu_info.html", response.text)
    next_url = response.xpath(
        "//div[@class='d-h d-h-content']/div[@class='container clearfix']/div[@class='search_fl_Content']/div[@id='d_pagination']/a[@class='next']/@href"
    ).extract_first()
    next_page = response.urljoin(next_url)
    items = response.xpath(
        "//div[@class='d-h d-h-content']/div[@class='container clearfix']/div[@class='search_fl_Content']/div[@class='top-10']/div[@class='title trim']"
    )
    for item in items:
        link = item.xpath("./a/@href").extract_first()
        if link is not None:
            # Build a fresh item per request: the original mutated a single
            # shared CommonItem, so every downstream callback saw only the
            # last link written before its request was scheduled.
            bgzi_info_item = CommonItem()
            bgzi_info_item['origin'] = origin
            bgzi_info_item['birth'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            bgzi_info_item['link'] = link
            print(link)
            yield SplashRequest(link, meta={"item": bgzi_info_item},
                                callback=self.parse_content)
    if next_url is not None:
        # Follow pagination. (The original wrapped this in a bare
        # try/except-pass, which silently swallowed every error.)
        yield SplashRequest(next_page)
def parse(self, response):
    """Parse an MOE wcm list page; hand WAS-search pagination to parse_next_page."""
    origin = "中华人民共和国教育部"
    # Save the downloaded HTML for offline debugging.
    common.save_to_file("gfjyb_edu_info.html", response.text)
    next_url = response.xpath(
        "//div[@class='scy_tylb_fy-nr']//li[@class='m_page_a m_page_btn'][2]/a/@href").extract_first()
    next_page = response.urljoin(next_url)
    # response.xpath() takes ONE query (its second positional parameter is
    # `namespaces`); the original passed two query strings. Try the primary
    # selector, then fall back to the alternate page layout.
    column = response.xpath(
        "//div[@id='curpage']/a[@class='CurrChnlCls'][2]").extract_first()
    if column is None:
        column = response.xpath(
            "//div[@id='curpage']/a[@class='CurrChnlCls']").extract_first()
    items = response.xpath("//div[@id='wcmpagehtml']//ul[@id='list']/li")
    for item in items:
        title = item.xpath("./a/text()").extract_first()
        link = item.xpath("./a/@href").extract_first()
        if link is None:
            # <li> without an <a href>: skip instead of crashing below.
            continue
        birth = item.xpath("./span/text()").extract_first()
        date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        edu_info_item = CommonItem()
        edu_info_item["origin"] = origin
        edu_info_item["column"] = column
        edu_info_item["title"] = title
        # startswith() takes optional slice bounds, not a length cap; the
        # tuple form checks both schemes in one call.
        if link.startswith(("http:", "https:")):
            edu_info_item["link"] = link
        else:
            edu_info_item["link"] = "http://www.moe.gov.cn/" + link.replace("../../", "")
        edu_info_item["birth"] = birth
        edu_info_item["date"] = date
        yield Request(url=edu_info_item["link"],
                      meta={"item": edu_info_item},
                      callback=self.parse_content)
    if next_url is not None:
        if next_url.startswith("javascript"):
            # JS pagination: recover channel/page ids from the onclick
            # handler, e.g. getWasRecord(chnlid, channelid, page);
            page_info = response.xpath(
                "//div[@class='scy_tylb_fy-nr']//li[@class='m_page_a m_page_btn'][2]/a/@onclick"
            ).extract_first()
            if page_info is None:
                page_info = response.xpath(
                    "//ul[@id='page']/li[@class='m_page_a m_page_btn'][2]/a/@onclick"
                ).extract_first()
            page = page_info.replace("getWasRecord(", "").replace(");", "").split(",")
            url = ("http://www.moe.gov.cn/was5/web/search?channelid=" + page[1]
                   + "&chnlid=" + page[0] + "&page=")
            page_num = int(page[2])
            yield Request(url=(url + page[2]),
                          meta={"next_page": (page_num + 1), "url": url, "column": column},
                          callback=self.parse_next_page)
        else:
            yield SplashRequest(next_page)
def parse_context(self, response):
    """Parse one 广西壮族自治区教育厅 article page into a CommonItem."""
    # The article body appears under several TRS_UEDITOR class variants.
    content_select = response.xpath(
        "//div[@class='view TRS_UEDITOR trs_paper_default trs_web'] "
        "| //div[@class='view TRS_UEDITOR trs_paper_default trs_web trs_key4format']"
        "| //div[@class='view TRS_UEDITOR trs_paper_default trs_web trs_word trs_key4format']"
        "| //div[@class='view TRS_UEDITOR trs_paper_default trs_word']"
        "| //div[@class='view TRS_UEDITOR trs_paper_default']"
        "| //div[@class='view TRS_UEDITOR trs_paper_default trs_web trs_word']"
        "| //div[@class='view TRS_UEDITOR trs_paper_default trs_word trs_key4format']"
    )
    content = content_select.extract_first()
    img_list = content_select.xpath("//img/@src").extract()
    # Rewrite relative image srcs ("./foo.png") to absolute URLs. Guard on
    # content: the original called content.replace() even when no matching
    # body div was found (content is None).
    if content is not None:
        for img in img_list:
            new_img = "http://jyt.gxzf.gov.cn/jyxw/jyyw/" + img.replace(".", "", 1)
            content = content.replace(img, new_img)
    title = response.xpath(
        "//div[@class='article']/h1/text()").extract_first()
    # First line of the info div is the publish date; the original crashed
    # with AttributeError when the div was missing.
    info_text = response.xpath(
        "//div[@class='article-inf-left']/text()").extract_first()
    origin_and_birth = info_text.strip().split("\n", 1) if info_text is not None else [""]
    link = response.xpath("//meta[@name='Url']/@content").extract_first()
    origin = "广西壮族自治区教育厅"
    column = "教育要闻"
    date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    birth = origin_and_birth[0]
    organ = "广西壮族自治区教育厅"
    gxjyt_info_item = CommonItem()
    gxjyt_info_item["origin"] = origin
    gxjyt_info_item["column"] = column
    gxjyt_info_item["title"] = title
    gxjyt_info_item["link"] = link
    gxjyt_info_item["birth"] = birth
    gxjyt_info_item["date"] = date
    gxjyt_info_item["organ"] = organ
    gxjyt_info_item["content"] = content
    gxjyt_info_item["dataOriginId"] = self.origin_id
    yield gxjyt_info_item
def parse(self, response):
    """Dispatch for 广东省教育厅: the index page fans out to category
    listings; a category listing yields article requests plus pagination.
    """
    origin = "广东省教育厅"
    # Persist the raw HTML for offline inspection.
    common.save_to_file("edu_info.html", response.text)
    # Index page: category links live in the ggjy_title tables.
    category_links = response.xpath(
        "//div[@class='indexbox']//div[contains(@class,'ggjy_title')]//tr/td[2]/a/@href"
    )
    if category_links:
        for sel in category_links:
            yield scrapy.Request(sel.extract())
        return
    # Category listing page.
    next_url = response.xpath(
        "//div[@class='page']/a[@class='next']/@href").extract_first()
    column = response.xpath(
        "//div[@class='listright_title']//td[@class='lmbt_td']/span/text()"
    ).extract_first()
    rows = response.xpath(
        "//div[@class='main_cen']//div[@class='list_list']/ul/li[@class='list_li']"
    )
    for row in rows:
        record = CommonItem()
        record["origin"] = origin
        record["column"] = column
        record["title"] = row.xpath("./a/text()").extract_first()
        link = row.xpath("./a/@href").extract_first()
        record["link"] = link
        record["birth"] = row.xpath("./span/text()").extract_first()
        record["date"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        yield Request(url=link, meta={"item": record}, callback=self.parse_content)
    if next_url is not None:
        # Follow the remaining pages of this category.
        yield scrapy.Request(next_url)
def parse_content(self, response):
    """Parse one article detail page into a fully-populated CommonItem."""
    item = CommonItem()
    item['link'] = response.url  # URL of the detail page itself
    # Title
    detail_title = response.xpath(
        "//div[@class='container clearfix']/div[@class='search_fl_Content']/div[@class='fTitle']/text()"
    ).extract_first()
    # Column (data source); some pages use a slightly different layout,
    # hence the fallback selector below.
    detail_column = response.xpath(
        "//div[@class='container clearfix']/div[@class='search_fl_Content']/div[@class='top-10']/div[@class='fPost']/a[3]/text()"
    ).extract_first()
    other_detail_column = response.xpath(
        "//div[@class='container clearfix']/div[@class='search_fl_Content']/div[@class='top-10']/div[@class='fPost']/a/text()"
    ).extract_first()
    # Publishing organisation
    detail_organ = response.xpath(
        "//div[@class='container clearfix']/div[@class='search_fl_Content']/div[@class='flai']/span[1]/text()"
    ).extract_first()
    # Author
    detail_author = response.xpath(
        "//div[@class='container clearfix']/div[@class='search_fl_Content']/div[@class='flai']/span[2]/text()"
    ).extract_first()
    # Publish time
    detail_birth = response.xpath(
        "//div[@class='container clearfix']/div[@class='search_fl_Content']/div[@class='flai']/span[3]/text()"
    ).extract_first()
    # Article body: both <p> and <div> children of the content box
    detail_content = response.xpath(
        "//div[@class='container clearfix']/div[@class='search_fl_Content']/div[@class='fcontent']/div[@class='contentBoxF']/p|//div[@class='container clearfix']/div[@class='search_fl_Content']/div[@class='fcontent']/div[@class='contentBoxF']/div"
    )
    item['title'] = detail_title
    item['column'] = detail_column if detail_column is not None else other_detail_column
    item['organ'] = detail_organ
    item["dataOriginId"] = self.origin_id
    item['author'] = detail_author
    item['birth'] = detail_birth
    # Join the body fragments, rewriting relative image paths to absolute.
    detail_content_str = ""
    for content in detail_content:
        old_src = content.xpath("./img/@src |./strong/img/@src ").extract_first()
        if old_src is not None:
            new_src = response.urljoin(old_src)
            detail_content_str += content.extract().replace(old_src, new_src)
        else:
            detail_content_str += content.extract()
    item['content'] = detail_content_str
    item['date'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    yield item