def parse_detail(self, response):
    """Build a ``NewsItem`` from an article detail page and yield it.

    Args:
        response: Detail-page response. ``response.meta["create_date"]``
            supplies the publish date; an empty string is used when the
            key is absent.

    Yields:
        The item returned by ``NewsItemLoader.load_item()``.
    """
    # NOTE(review): no ``image_url_list`` value is added for this site --
    # confirm that skipping article images is intended.
    item_loader = NewsItemLoader(item=NewsItem(), response=response)

    # Article title.
    item_loader.add_css("title", "#dnn_ctr63596_ArtDetail_lblTitle::text")

    # Article URL, plus the key that get_md5() derives from it.
    item_loader.add_value("url", response.url)
    item_loader.add_value("md5_id", get_md5(response.url))

    # Publish date is handed over by the request that scheduled this page.
    create_date = response.meta.get("create_date", "")
    item_loader.add_value("create_date", create_date)

    # Type tag (fixed literal for this site).
    item_loader.add_value("tag", "通知公告")

    # Top-level label: the source (website name).
    item_loader.add_value("webname", ["本科招生网"])

    # Article content container.
    item_loader.add_css("content", "#vsb_content")

    # Publisher.
    item_loader.add_css("author", "#dnn_ctr63596_ArtDetail_hypFirst::text")

    yield item_loader.load_item()
def parse_detail(self, response):
    """Build a ``NewsItem`` from an article detail page and yield it.

    Args:
        response: Detail-page response. ``response.meta["create_date"]``
            supplies the publish date; an empty string is used when the
            key is absent.

    Yields:
        The item returned by ``NewsItemLoader.load_item()``.
    """
    # NOTE(review): no ``image_url_list`` value is added for this site --
    # confirm that skipping article images is intended.
    item_loader = NewsItemLoader(item=NewsItem(), response=response)

    # Article title.
    item_loader.add_css("title", "#page-title::text")

    # Article URL, plus the key that get_md5() derives from it.
    item_loader.add_value("url", response.url)
    item_loader.add_value("md5_id", [get_md5(response.url)])

    # Publish date is handed over by the request that scheduled this page.
    item_loader.add_value("create_date", response.meta.get("create_date", ""))

    # Type tag: text of the first matching link only (None when the page
    # has no such link).
    tag = response.css(
        "div.field:nth-child(2) >"
        " div:nth-child(1) > div:nth-child(1) > a:nth-child(1)::text"
    ).extract_first()
    item_loader.add_value("tag", tag)

    # Top-level label: the source (website name).
    item_loader.add_value("webname", ["图书馆"])

    # Article content container.
    item_loader.add_css("content", "div.field:nth-child(1)")

    yield item_loader.load_item()
def parse_detail(self, response):
    """Build a ``NewsItem`` from an article detail page and yield it.

    Most fields are matched against two alternative element ids
    (``dnn_ctr1053_...`` and ``dnn_ctr1055_...``) joined with an XPath
    union.

    Args:
        response: Detail-page response. ``response.meta["tag"]`` supplies
            the type tag; an empty string is used when the key is absent.

    Yields:
        The item returned by ``NewsItemLoader.load_item()``.
    """
    # Absolute URLs of the images embedded in the article body.
    image_srcs = response.css("p.vsbcontent_img img::attr(src)").extract()
    image_url_list = [parse.urljoin(response.url, src) for src in image_srcs]

    item_loader = NewsItemLoader(item=NewsItem(), response=response)

    # Article title.
    item_loader.add_xpath(
        "title",
        '//*[@id="dnn_ctr1055_ArticleDetails_ctl00_lblTitle"]/text()'
        '|//*[@id="dnn_ctr1053_ArticleDetails_ctl00_lblTitle"]/text()',
    )

    # Article URL, plus the key that get_md5() derives from it.
    item_loader.add_value("url", response.url)
    item_loader.add_value("md5_id", [get_md5(response.url)])

    # Publish date.
    item_loader.add_xpath(
        "create_date",
        "//*[@id='dnn_ctr1053_ArticleDetails_ctl00_lblDatePosted']/text()"
        "| //*[@id='dnn_ctr1055_ArticleDetails_ctl00_lblDatePosted']/text()",
    )

    # Image URLs.
    item_loader.add_value("image_url_list", image_url_list)

    # Type tag is handed over by the request that scheduled this page.
    item_loader.add_value("tag", response.meta.get("tag", ""))

    # Top-level label: the source (website name).
    item_loader.add_value("webname", ["新闻网"])

    # Article content.
    # NOTE(review): the absolute fallback path contains ``tbody``; verify
    # that it matches the HTML as served.
    item_loader.add_xpath(
        "content",
        "//div[@id='vsb_content_2'] | /html/body/div[1]/div[3]/div/table/tbody/tr/td/div/div[2]/div/div/div/form",
    )

    # Department.
    item_loader.add_xpath(
        "apartment",
        "//*[@id='dnn_ctr1053_ArticleDetails_ctl00_hypDept']/text()"
        "| //*[@id='dnn_ctr1055_ArticleDetails_ctl00_hypDept']/text()",
    )

    # Publisher.
    item_loader.add_xpath(
        "author",
        "//*[@id='dnn_ctr1053_ArticleDetails_ctl00_hypUser']/text()"
        "| //*[@id='dnn_ctr1055_ArticleDetails_ctl00_hypUser']/text()",
    )

    yield item_loader.load_item()
def parse_detail(self, response):
    """Build a ``NewsItem`` from an article detail page and yield it.

    Args:
        response: Detail-page response. ``response.meta["tag"]`` supplies
            the type tag; an empty string is used when the key is absent.

    Yields:
        The item returned by ``NewsItemLoader.load_item()``.
    """
    # Absolute URLs of the images embedded in the article body.
    image_srcs = response.css(".img_vsb_content::attr(src)").extract()
    image_url_list = [parse.urljoin(response.url, src) for src in image_srcs]

    item_loader = NewsItemLoader(item=NewsItem(), response=response)

    # Article title.
    item_loader.add_css("title", "#dnn_ctr43465_ArtDetail_lblTitle::text")

    # Article URL, plus the key that get_md5() derives from it.
    item_loader.add_value("url", response.url)
    item_loader.add_value("md5_id", [get_md5(response.url)])

    # Publish date.
    item_loader.add_css(
        "create_date", "#dnn_ctr43465_ArtDetail_lblDatePosted::text")

    # Image URLs.
    item_loader.add_value("image_url_list", image_url_list)

    # Type tag is handed over by the request that scheduled this page.
    item_loader.add_value("tag", response.meta.get("tag", ""))

    # Top-level label: the source (website name).
    item_loader.add_value("webname", ["教务处"])
    item_loader.add_value("user_id", ["3"])

    # Article content: either of two possible containers.
    item_loader.add_xpath(
        "content",
        "//div[@id='vsb_content'] | //*[@id='dnn_ctr43465_ModuleContent']")

    # Department.
    # NOTE(review): this id uses module ``dnn_ctr1053`` while every other
    # selector in this method uses ``dnn_ctr43465`` -- verify it matches
    # anything on this site.
    item_loader.add_css(
        "apartment", "#dnn_ctr1053_ArticleDetails_ctl00_hypDept::text")

    # Publisher.
    item_loader.add_css("author", "#dnn_ctr43465_ArtDetail_hypFirst::text")

    yield item_loader.load_item()
def parse_detail(self, response):
    """Build a ``NewsItem`` from an article detail page and yield it.

    The author is the concatenation of the ``hypFirst`` and ``hypLast``
    text nodes. A missing node contributes an empty string instead of
    raising ``TypeError``, and the field is skipped when both are missing.

    Args:
        response: Detail-page response to extract the article from.

    Yields:
        The item returned by ``NewsItemLoader.load_item()``.
    """
    # NOTE(review): no ``image_url_list`` value is added for this site --
    # confirm that skipping article images is intended.
    item_loader = NewsItemLoader(item=NewsItem(), response=response)

    # Article title.
    item_loader.add_css("title", "#dnn_ctr59828_ArtDetail_lblTitle::text")

    # Article URL, plus the key that get_md5() derives from it.
    item_loader.add_value("url", response.url)
    item_loader.add_value("md5_id", [get_md5(response.url)])

    # Publish date.
    item_loader.add_css(
        "create_date", "#dnn_ctr59828_ArtDetail_lblDatePosted::text")

    # Type tag (fixed literal for this site).
    item_loader.add_value("tag", ["通知公告"])

    # Top-level label: the source (website name).
    item_loader.add_value("webname", ["学生工作办公室"])

    # Article content container.
    item_loader.add_css("content", "#dnn_ctr59828_ArtDetail_lblArticle")

    # Department.
    # NOTE(review): this id follows the ``ArticleDetails_ctl00_`` pattern
    # while the other selectors here use ``ArtDetail_`` -- verify it
    # matches anything on this site.
    item_loader.add_css(
        "apartment", "#dnn_ctr59828_ArticleDetails_ctl00_hypDept::text")

    # Publisher: default each part to "" so a page lacking either node
    # no longer fails on ``None + str``.
    first_part = response.css(
        "#dnn_ctr59828_ArtDetail_hypFirst::text").extract_first(default="")
    last_part = response.css(
        "#dnn_ctr59828_ArtDetail_hypLast::text").extract_first(default="")
    author = first_part + last_part
    if author:
        item_loader.add_value("author", [author])

    yield item_loader.load_item()