コード例 #1
0
ファイル: enrolnews.py プロジェクト: Actooors/SHUMES_Spider
    def parse_detail(self, response):
        """Build a news item from an article detail page.

        Fields are collected through ``NewsItemLoader``. The publication date
        is not read from the page; it is taken from
        ``response.meta["create_date"]`` and defaults to an empty string when
        that key is absent.

        Image URLs are not stored on the item by this method, so the page's
        image sources are not extracted here.

        Args:
            response: Response for the article page; its ``url``, ``meta`` and
                CSS-selectable body are read.

        Yields:
            The item returned by ``item_loader.load_item()``.
        """
        item_loader = NewsItemLoader(item=NewsItem(), response=response)
        # Article title.
        item_loader.add_css("title", "#dnn_ctr63596_ArtDetail_lblTitle::text")
        # Article URL, plus the key derived from it via get_md5().
        item_loader.add_value("url", response.url)
        item_loader.add_value("md5_id", get_md5(response.url))
        # Publication date handed over through the request meta.
        item_loader.add_value(
            "create_date", response.meta.get("create_date", ""))
        # Category tag (fixed for this spider).
        item_loader.add_value("tag", "通知公告")
        # Top-level label: the source site name.
        item_loader.add_value("webname", ["本科招生网"])
        # Article body: the whole #vsb_content element.
        item_loader.add_css("content", "#vsb_content")
        # Publisher.
        item_loader.add_css("author", "#dnn_ctr63596_ArtDetail_hypFirst::text")

        yield item_loader.load_item()
コード例 #2
0
ファイル: librarynews.py プロジェクト: Actooors/SHUMES_Spider
    def parse_detail(self, response):
        """Build a news item from an article detail page.

        Fields are collected through ``NewsItemLoader``. The publication date
        comes from ``response.meta["create_date"]`` (empty string when the key
        is absent); the category tag is read from the page itself.

        Image URLs are not stored on the item by this method, so the page's
        image sources are not extracted here.

        Args:
            response: Response for the article page; its ``url``, ``meta`` and
                CSS-selectable body are read.

        Yields:
            The item returned by ``item_loader.load_item()``.
        """
        item_loader = NewsItemLoader(item=NewsItem(), response=response)
        # Article title. NOTE: the original author flagged that the raw title
        # text still needs its whitespace handled.
        item_loader.add_css("title", "#page-title::text")
        # Article URL.
        item_loader.add_value("url", response.url)
        # Key: md5_id, derived from the URL via get_md5().
        item_loader.add_value("md5_id", [get_md5(response.url)])
        # Publication date handed over through the request meta.
        item_loader.add_value(
            "create_date", response.meta.get("create_date", ""))
        # Category tag: text of the first link inside the second field block.
        # extract_first() gives None when nothing matches.
        tag_selector = (
            "div.field:nth-child(2) >"
            " div:nth-child(1) > div:nth-child(1) > a:nth-child(1)::text"
        )
        item_loader.add_value(
            "tag", response.css(tag_selector).extract_first())
        # Top-level label: the source site name.
        item_loader.add_value("webname", ["图书馆"])
        # Article body: the first field block.
        item_loader.add_css("content", "div.field:nth-child(1)")

        yield item_loader.load_item()
コード例 #3
0
ファイル: SHUnews.py プロジェクト: Actooors/SHUMES_Spider
    def parse_detail(self, response):
        """Build a news item from an article detail page.

        Most fields are read with XPath unions that accept either the
        ``dnn_ctr1053`` or the ``dnn_ctr1055`` variant of the element ids.
        The category tag comes from ``response.meta["tag"]`` (empty string
        when the key is absent).

        Args:
            response: Response for the article page; its ``url``, ``meta`` and
                selectable body are read.

        Yields:
            The item returned by ``item_loader.load_item()``.
        """
        # Resolve every article image src against the page URL.
        image_url_list = [
            parse.urljoin(response.url, src)
            for src in response.css(
                "p.vsbcontent_img img::attr(src)").extract()
        ]

        item_loader = NewsItemLoader(item=NewsItem(), response=response)
        # Article title.
        item_loader.add_xpath(
            "title",
            '//*[@id="dnn_ctr1055_ArticleDetails_ctl00_lblTitle"]/text()|'
            '//*[@id="dnn_ctr1053_ArticleDetails_ctl00_lblTitle"]/text()')
        # Article URL.
        item_loader.add_value("url", response.url)
        # Key: md5_id, derived from the URL via get_md5().
        item_loader.add_value("md5_id", [get_md5(response.url)])
        # Publication date.
        item_loader.add_xpath(
            "create_date",
            "//*[@id='dnn_ctr1053_ArticleDetails_ctl00_lblDatePosted']/text()| "
            "//*[@id='dnn_ctr1055_ArticleDetails_ctl00_lblDatePosted']/text()")
        # Image URLs.
        item_loader.add_value("image_url_list", image_url_list)
        # Category tag handed over through the request meta.
        item_loader.add_value("tag", response.meta.get("tag", ""))
        # Top-level label: the source site name.
        item_loader.add_value("webname", ["新闻网"])
        # Article body: #vsb_content_2, or the form reached by the absolute
        # path below.
        item_loader.add_xpath(
            "content",
            "//div[@id='vsb_content_2'] | "
            "/html/body/div[1]/div[3]/div/table/tbody/tr/td/div/div[2]"
            "/div/div/div/form")
        # Department.
        item_loader.add_xpath(
            "apartment",
            "//*[@id='dnn_ctr1053_ArticleDetails_ctl00_hypDept']/text()| "
            "//*[@id='dnn_ctr1055_ArticleDetails_ctl00_hypDept']/text()")
        # Publisher.
        item_loader.add_xpath(
            "author",
            "//*[@id='dnn_ctr1053_ArticleDetails_ctl00_hypUser']/text()| "
            "//*[@id='dnn_ctr1055_ArticleDetails_ctl00_hypUser']/text()")

        yield item_loader.load_item()
コード例 #4
0
    def parse_detail(self, response):
        """Build a news item from an article detail page.

        Fields are collected through ``NewsItemLoader``. The category tag
        comes from ``response.meta["tag"]`` (empty string when the key is
        absent); everything else is read from the page or is a fixed value.

        Args:
            response: Response for the article page; its ``url``, ``meta`` and
                selectable body are read.

        Yields:
            The item returned by ``item_loader.load_item()``.
        """
        # Resolve every article image src against the page URL.
        image_url_list = [
            parse.urljoin(response.url, src)
            for src in response.css(".img_vsb_content::attr(src)").extract()
        ]

        item_loader = NewsItemLoader(item=NewsItem(), response=response)
        # Article title.
        item_loader.add_css("title", "#dnn_ctr43465_ArtDetail_lblTitle::text")
        # Article URL.
        item_loader.add_value("url", response.url)
        # Key: md5_id, derived from the URL via get_md5().
        item_loader.add_value("md5_id", [get_md5(response.url)])
        # Publication date.
        item_loader.add_css(
            "create_date", "#dnn_ctr43465_ArtDetail_lblDatePosted::text")
        # Image URLs.
        item_loader.add_value("image_url_list", image_url_list)
        # Category tag handed over through the request meta.
        item_loader.add_value("tag", response.meta.get("tag", ""))
        # Top-level label: the source site name.
        item_loader.add_value("webname", ["教务处"])
        # Fixed user id attached to every item from this spider.
        item_loader.add_value("user_id", ["3"])
        # Article body: #vsb_content, or the module content container.
        item_loader.add_xpath(
            "content",
            "//div[@id='vsb_content'] | //*[@id='dnn_ctr43465_ModuleContent']")
        # Department.
        # NOTE(review): this id uses the dnn_ctr1053_ArticleDetails prefix
        # while every other selector here uses dnn_ctr43465_ArtDetail --
        # confirm it actually matches on this site.
        item_loader.add_css(
            "apartment", "#dnn_ctr1053_ArticleDetails_ctl00_hypDept::text")
        # Publisher.
        item_loader.add_css("author", "#dnn_ctr43465_ArtDetail_hypFirst::text")

        yield item_loader.load_item()
コード例 #5
0
    def parse_detail(self, response):
        """Build a news item from an article detail page.

        Fields are collected through ``NewsItemLoader``. The author is the
        concatenation of the texts of the ``hypFirst`` and ``hypLast``
        elements; a missing element contributes an empty string instead of
        raising ``TypeError`` on ``None + str``, and the author field is left
        unset when both are missing.

        Image URLs are not stored on the item by this method, so the page's
        image sources are not extracted here.

        Args:
            response: Response for the article page; its ``url`` and
                CSS-selectable body are read.

        Yields:
            The item returned by ``item_loader.load_item()``.
        """
        item_loader = NewsItemLoader(item=NewsItem(), response=response)
        # Article title.
        item_loader.add_css("title", "#dnn_ctr59828_ArtDetail_lblTitle::text")
        # Article URL.
        item_loader.add_value("url", response.url)
        # Key: md5_id, derived from the URL via get_md5().
        item_loader.add_value("md5_id", [get_md5(response.url)])
        # Publication date.
        item_loader.add_css(
            "create_date", "#dnn_ctr59828_ArtDetail_lblDatePosted::text")
        # Category tag (fixed for this spider).
        item_loader.add_value("tag", ["通知公告"])
        # Top-level label: the source site name.
        item_loader.add_value("webname", ["学生工作办公室"])
        # Article body.
        item_loader.add_css("content", "#dnn_ctr59828_ArtDetail_lblArticle")
        # Department.
        # NOTE(review): this id uses the ArticleDetails_ctl00 naming while the
        # other selectors here use ArtDetail -- confirm it matches.
        item_loader.add_css(
            "apartment", "#dnn_ctr59828_ArticleDetails_ctl00_hypDept::text")
        # Publisher: first part + last part. default="" keeps the
        # concatenation safe when either element is absent from the page.
        author_first = response.css(
            "#dnn_ctr59828_ArtDetail_hypFirst::text").extract_first(default="")
        author_last = response.css(
            "#dnn_ctr59828_ArtDetail_hypLast::text").extract_first(default="")
        author = author_first + author_last
        if author:
            item_loader.add_value("author", [author])

        yield item_loader.load_item()