コード例 #1
0
    def parse(self, response):
        """Yield one loaded item per node matched by the ``parse_xpath`` setting.

        Each item carries the fixed ``site_source``/``site_type`` values for
        weixin, the spider's ``task_id`` and the current local date
        (``catch_date``, formatted ``YYYY-MM-DD``). ``site_url`` is filled
        from the node using the configured ``site_url`` xpath.
        """
        # Build a selector over the whole response.
        page = Selector(response)

        for node in page.xpath(self.xpathConf.get("parse_xpath")):
            # Wrap the node's own markup in a fresh selector and hand that
            # selector to the item loader.
            fragment = Selector(text=node.extract())
            loader = WeiboComItemLoader(selector=fragment)

            loader.add_value("site_source", "mp.weixin.qq.com")
            loader.add_value("site_type", "weixin")
            loader.add_value("task_id", self.task_id)

            # Keep only year, month and day of the local time.
            today = datetime.datetime(*time.localtime()[:3])
            loader.add_value("catch_date", today.strftime('%Y-%m-%d'))

            loader.add_xpath('site_url', self.xpathConf.get("site_url"))

            yield loader.load_item()
コード例 #2
0
    def parse(self, response):
        """Yield one loaded weibo item per node matched by ``parse_xpath``.

        Besides the fixed ``site_source``/``site_type`` values, the spider's
        ``task_id`` and the current local date (``catch_date``), every item
        gets ``author``, ``user_url``, ``site_url``, ``content`` and
        ``publish_time`` from the configured xpaths. The ``attitude``,
        ``comments`` and ``repost`` fields fall back to the string ``"0"``
        when their xpath extracts nothing.
        """
        # Build a selector over the whole response.
        page = Selector(response)

        for node in page.xpath(self.xpathConf.get("parse_xpath")):
            # Wrap the node's own markup in a fresh selector and hand that
            # selector to the item loader.
            fragment = Selector(text=node.extract())
            loader = WeiboComItemLoader(selector=fragment)

            # Constant fields: 'site_source', 'site_type', plus the task id.
            loader.add_value("site_source", "weibo.com")
            loader.add_value("site_type", "weibo")
            loader.add_value("task_id", self.task_id)

            # Keep only year, month and day of the local time.
            today = datetime.datetime(*time.localtime()[:3])
            loader.add_value("catch_date", today.strftime('%Y-%m-%d'))

            loader.add_xpath('author', self.xpathConf.get("author"))
            loader.add_xpath('user_url', self.xpathConf.get("user_url"))
            loader.add_xpath('site_url', self.xpathConf.get("site_url"))
            body = fragment.xpath(self.xpathConf.get("content")).extract()
            loader.add_value('content', body)
            loader.add_xpath('publish_time',
                             self.xpathConf.get("publish_time"))

            # An empty extraction result is replaced by the string "0".
            likes = fragment.xpath(self.xpathConf.get("attitude")).extract()
            loader.add_value("attitude", likes or "0")

            replies = fragment.xpath(self.xpathConf.get("comments")).extract()
            loader.add_value("comments", replies or "0")

            reposts = fragment.xpath(self.xpathConf.get("repost")).extract()
            loader.add_value("repost", reposts or "0")

            log.msg(reposts)
            yield loader.load_item()
コード例 #3
0
    def parse(self, response):
        """Yield one loaded news item per usable node matched by ``parse_xpath``.

        Each item carries the fixed ``site_source``/``site_type`` values for
        Sina news, the spider's ``task_id`` and the current local date
        (``catch_date``). The first URL extracted with the configured
        ``site_url`` xpath is passed through ``urlUtil.getRedirectUrl`` and
        ``fo.findSinaNewsUrl``; the node is skipped when no URL is extracted
        or when ``fo.findSinaNewsUrl`` returns a falsy value.
        """
        # Build a selector over the whole response.
        page = Selector(response)

        for node in page.xpath(self.xpathConf.get("parse_xpath")):
            # Wrap the node's own markup in a fresh selector and hand that
            # selector to the item loader.
            fragment = Selector(text=node.extract())
            loader = WeiboComItemLoader(selector=fragment)

            loader.add_value("site_source", "news.sina.com.cn")
            loader.add_value("site_type", "news")
            loader.add_value("task_id", self.task_id)

            # Keep only year, month and day of the local time.
            today = datetime.datetime(*time.localtime()[:3])
            loader.add_value("catch_date", today.strftime('%Y-%m-%d'))

            candidates = fragment.xpath(
                self.xpathConf.get("site_url")).extract()
            if not candidates:
                # Nothing extracted for this node: no item is produced.
                continue

            redirected = urlUtil.getRedirectUrl(candidates[0], timeout=10)
            sina_url = fo.findSinaNewsUrl(redirected)
            if not sina_url:
                # No Sina news URL found: no item is produced.
                continue

            loader.add_value('site_url', sina_url)

            yield loader.load_item()
コード例 #4
0
    def parse(self, response):
        """Generate a loaded item for each node selected by ``parse_xpath``.

        The loader is given the constant weixin ``site_source`` and
        ``site_type`` values, ``self.task_id`` and the local calendar date as
        ``catch_date`` (``YYYY-MM-DD``); ``site_url`` is taken from the node
        via the ``site_url`` xpath stored in ``self.xpathConf``.
        """
        # Obtain a selector for the response.
        root = Selector(response)
        entry_xpath = self.xpathConf.get("parse_xpath")

        for entry in root.xpath(entry_xpath):
            # The loader works on a selector built from this entry's markup.
            entry_sel = Selector(text=entry.extract())
            item_loader = WeiboComItemLoader(selector=entry_sel)

            item_loader.add_value("site_source", "mp.weixin.qq.com")
            item_loader.add_value("site_type", "weixin")
            item_loader.add_value("task_id", self.task_id)

            # Year, month and day of the local time; the time of day is dropped.
            year, month, day = time.localtime()[:3]
            catch_date = datetime.datetime(year, month, day)
            item_loader.add_value("catch_date",
                                  catch_date.strftime('%Y-%m-%d'))

            item_loader.add_xpath('site_url', self.xpathConf.get("site_url"))

            yield item_loader.load_item()
コード例 #5
0
    def parse(self, response):
        """Generate a loaded weibo item for each node selected by ``parse_xpath``.

        Constant ``site_source``/``site_type`` values, ``self.task_id`` and
        the local calendar date (``catch_date``) are added first, followed by
        ``author``, ``user_url``, ``site_url``, ``content`` and
        ``publish_time`` from the xpaths in ``self.xpathConf``. For
        ``attitude``, ``comments`` and ``repost`` the extracted list is used
        when non-empty, otherwise the string ``"0"``.
        """
        # Obtain a selector for the response.
        root = Selector(response)

        for entry in root.xpath(self.xpathConf.get("parse_xpath")):
            # The loader works on a selector built from this entry's markup.
            entry_sel = Selector(text=entry.extract())
            item_loader = WeiboComItemLoader(selector=entry_sel)

            # 'site_source', 'site_type' and the task id are constants here.
            item_loader.add_value("site_source", "weibo.com")
            item_loader.add_value("site_type", "weibo")
            item_loader.add_value("task_id", self.task_id)

            # Year, month and day of the local time; the time of day is dropped.
            year, month, day = time.localtime()[:3]
            catch_date = datetime.datetime(year, month, day)
            item_loader.add_value("catch_date",
                                  catch_date.strftime('%Y-%m-%d'))

            item_loader.add_xpath('author', self.xpathConf.get("author"))
            item_loader.add_xpath('user_url', self.xpathConf.get("user_url"))
            item_loader.add_xpath('site_url', self.xpathConf.get("site_url"))
            item_loader.add_value(
                'content',
                entry_sel.xpath(self.xpathConf.get("content")).extract())
            item_loader.add_xpath('publish_time',
                                  self.xpathConf.get("publish_time"))

            # Handle the three counter fields in their original order; the
            # repost extraction is kept because it is logged afterwards.
            repost = None
            for field in ("attitude", "comments", "repost"):
                extracted = entry_sel.xpath(
                    self.xpathConf.get(field)).extract()
                if extracted:
                    item_loader.add_value(field, extracted)
                else:
                    item_loader.add_value(field, "0")
                repost = extracted

            log.msg(repost)
            yield item_loader.load_item()
コード例 #6
0
    def parse(self, response):
        """Generate a loaded news item for each usable node selected by ``parse_xpath``.

        Constant Sina news ``site_source``/``site_type`` values,
        ``self.task_id`` and the local calendar date (``catch_date``) are
        added to the loader. The first result of the ``site_url`` xpath is
        passed to ``urlUtil.getRedirectUrl`` (``timeout=10``) and the outcome
        to ``fo.findSinaNewsUrl``. Entries with no extracted URL, or for which
        ``fo.findSinaNewsUrl`` returns a falsy value, yield nothing.
        """
        # Obtain a selector for the response.
        root = Selector(response)

        for entry in root.xpath(self.xpathConf.get("parse_xpath")):
            # The loader works on a selector built from this entry's markup.
            entry_sel = Selector(text=entry.extract())
            item_loader = WeiboComItemLoader(selector=entry_sel)

            item_loader.add_value("site_source", "news.sina.com.cn")
            item_loader.add_value("site_type", "news")
            item_loader.add_value("task_id", self.task_id)

            # Year, month and day of the local time; the time of day is dropped.
            year, month, day = time.localtime()[:3]
            catch_date = datetime.datetime(year, month, day)
            item_loader.add_value("catch_date",
                                  catch_date.strftime('%Y-%m-%d'))

            links = entry_sel.xpath(self.xpathConf.get("site_url")).extract()
            if not links:
                # No link extracted for this entry.
                continue

            target = urlUtil.getRedirectUrl(links[0], timeout=10)
            news_url = fo.findSinaNewsUrl(target)
            if not news_url:
                # No Sina news URL was found for the resolved link.
                continue

            item_loader.add_value('site_url', news_url)

            yield item_loader.load_item()