Beispiel #1
0
    def parse_article(self, response):
        """Populate an ``ArticleSpiderItem`` from an article page and yield it.

        Title, body paragraphs and the keywords meta tag are collected through
        XPath; the page URL is stored as ``ref`` and the result of
        ``get_md5`` for that URL as ``url_md5``.
        """
        page_url = response.url
        url_digest = get_md5(page_url)
        loader = ItemLoader(item=ArticleSpiderItem(), response=response)

        # Fields extracted from the page markup: (item field, XPath).
        xpath_fields = (
            ('title', '//h1/text()'),
            ('content', "//div[@class='content']/p/text()"),
            ('key_words', "//meta[@name='keywords']/@content"),
        )
        for field_name, xpath in xpath_fields:
            loader.add_xpath(field_name, xpath)

        # Fields derived from the response URL rather than the markup.
        loader.add_value('ref', page_url)
        loader.add_value('url_md5', url_digest)

        yield loader.load_item()
Beispiel #2
0
    def parse_article(self, response):
        """Populate an ``ArmsSpiderItem`` from a detail page and yield it.

        The item receives the picture source, the intro text, the text of the
        ``country`` span link (``ycg``), the first data-info list, the
        remaining module text, the last path segment of the URL (``suffix``)
        and the URL itself (``child_url``).
        """
        child_url = response.url
        # NOTE(review): debug output to stdout, kept so behaviour is unchanged;
        # consider routing it through the project's logging instead.
        print(child_url)

        # Last path segment of the URL; empty when the URL ends with '/'.
        suffix = child_url.split('/')[-1]

        loader = ItemLoader(item=ArmsSpiderItem(), response=response)

        # Fields extracted from the page markup: (item field, XPath).
        xpath_fields = (
            ('src', "//div[@class='maxPic']/img/@src"),
            ('content',
             "//div[@class='intron']/div[@class='module']//text()"),
            ('ycg',
             "//div[@class='maxPic']/span[@class='country']/b/a/text()"),
            ('datainfo',
             "//div[@class='dataInfo']/ul[1]/li/span/text() | //div[@class='dataInfo']/ul[1]/li/text()"),
            ('othercontent',
             "//div[@class='info']/div[@class='module']//text()"),
        )
        for field_name, xpath in xpath_fields:
            loader.add_xpath(field_name, xpath)

        # Fields derived from the response URL rather than the markup.
        loader.add_value('suffix', suffix)
        loader.add_value('child_url', child_url)

        yield loader.load_item()