def parse(self, response):
        """Yield one loaded item per result node that maps to a Sina news URL.

        Every node matched by the configured ``parse_xpath`` is tagged with
        the fixed source/type values, the spider's ``task_id`` and today's
        local date (``catch_date``, formatted ``YYYY-MM-DD``). The first
        link found by the configured ``site_url`` XPath is passed through
        ``urlUtil.getRedirectUrl`` and ``fo.findSinaNewsUrl``; nodes with no
        link, or for which the latter returns a falsy value, are skipped.
        """
        # Selector over the whole response.
        page = Selector(response)

        for node in page.xpath(self.xpathConf.get("parse_xpath")):
            # Build a standalone selector from this node's extracted markup.
            fragment = Selector(text=node.extract())
            loader = WeiboComItemLoader(selector=fragment)
            loader.add_value("site_source", "news.sina.com.cn")
            loader.add_value("site_type", "news")
            loader.add_value("task_id", self.task_id)
            loader.add_value("catch_date",
                             datetime.date.today().strftime('%Y-%m-%d'))

            links = fragment.xpath(self.xpathConf.get("site_url")).extract()
            if not links:
                # Nothing to resolve for this node.
                continue

            # NOTE(review): presumably the extracted link is a search-engine
            # redirect that must be followed to reach the article - confirm.
            resolved = urlUtil.getRedirectUrl(links[0], timeout=10)
            sina_url = fo.findSinaNewsUrl(resolved)
            if not sina_url:
                continue

            loader.add_value('site_url', sina_url)
            yield loader.load_item()
# Example no. 2
# 0
    def parse(self, response):
        """Yield one loaded item per node matched by the configured ``parse_xpath``.

        Each item carries the fixed source/type values for
        ``mp.weixin.qq.com``, the spider's ``task_id``, today's local date as
        ``catch_date`` (``YYYY-MM-DD``) and a ``site_url`` taken from the
        configured ``site_url`` XPath evaluated against the node.
        """
        # Selector over the whole response.
        page = Selector(response)

        for node in page.xpath(self.xpathConf.get("parse_xpath")):
            # Build a standalone selector from this node's extracted markup.
            fragment = Selector(text=node.extract())
            loader = WeiboComItemLoader(selector=fragment)

            loader.add_value("site_source", "mp.weixin.qq.com")
            loader.add_value("site_type", "weixin")
            loader.add_value("task_id", self.task_id)
            loader.add_value("catch_date",
                             datetime.date.today().strftime('%Y-%m-%d'))

            site_url_xpath = self.xpathConf.get("site_url")
            loader.add_xpath('site_url', site_url_xpath)

            yield loader.load_item()
    def parse(self, response):
        """Yield one loaded Weibo item per node matched by ``parse_xpath``.

        Each item is tagged with the spider's ``spider_type`` and ``task_id``,
        the fixed ``weibo.com`` source/type values and today's local date
        (``catch_date``, ``YYYY-MM-DD``). ``site_url`` is the response URL and
        ``user_url`` is that URL cut at its last ``/``. Author, content and
        publish time come from the configured XPaths. The ``attitude``,
        ``comments`` and ``repost`` fields fall back to the string ``"0"``
        when their XPath extracts nothing.
        """
        # Selector over the whole response.
        page = Selector(response)

        for node in page.xpath(self.xpathConf.get("parse_xpath")):
            # Build a standalone selector from this node's extracted markup.
            fragment = Selector(text=node.extract())
            loader = WeiboComItemLoader(selector=fragment)

            loader.add_value("spider_type", self.spider_type)
            loader.add_value("site_source", "weibo.com")
            loader.add_value("site_type", "weibo")
            loader.add_value("task_id", self.task_id)
            loader.add_value("catch_date",
                             datetime.date.today().strftime('%Y-%m-%d'))

            page_url = response.url
            loader.add_xpath('author', self.xpathConf.get("author"))
            # Slice the page URL up to its last '/'.
            loader.add_value('user_url', page_url[:page_url.rfind('/')])
            loader.add_value('site_url', page_url)
            content = fragment.xpath(self.xpathConf.get("content")).extract()
            loader.add_value('content', content)
            loader.add_xpath('publish_time',
                             self.xpathConf.get("publish_time"))

            # The three counters share one rule: use what was extracted,
            # otherwise store the string "0".
            extracted = {}
            for counter in ("attitude", "comments", "repost"):
                found = fragment.xpath(self.xpathConf.get(counter)).extract()
                extracted[counter] = found
                loader.add_value(counter, found if found else "0")

            # Log the raw repost extraction (before the "0" fallback).
            log.msg(extracted["repost"])
            yield loader.load_item()