Ejemplo n.º 1
0
    def __init__(self, **kwargs):
        """Initialise the spider from keyword arguments and load its XPath config.

        ``name`` and ``redis_key`` are read from ``kwargs`` (``None`` when
        absent). The XPath settings are the ``sougou`` -> ``sougou_weixin``
        entry of the object returned by ``confUtil.getJsonStr("sougou.json")``.
        """
        super(SougouWeixinFilterSpider, self).__init__(**kwargs)
        for attr in ("name", "redis_key"):
            setattr(self, attr, kwargs.get(attr))

        # Descend into the config one level at a time.
        sougou_json = confUtil.getJsonStr("sougou.json")
        sougou_section = sougou_json.get("sougou")
        self.xpathConf = sougou_section.get("sougou_weixin")
        # Kept as its own attribute so callers need not dig into xpathConf.
        self.Wait_Element = self.xpathConf.get("wait_element")
Ejemplo n.º 2
0
    def __init__(self, **kwargs):
        """Initialise the spider from keyword arguments and load its XPath config.

        ``name`` and ``redis_key`` are read from ``kwargs`` (``None`` when
        absent). The XPath settings are the ``baidu`` -> ``sina_news`` entry
        of the object returned by ``confUtil.getJsonStr("baidu.json")``.
        """
        super(BaiduSinaNewsFilterSpider, self).__init__(**kwargs)
        self.name = kwargs.get("name")
        self.redis_key = kwargs.get("redis_key")

        # Walk the nested config: file -> "baidu" -> "sina_news".
        section = confUtil.getJsonStr("baidu.json")
        for section_name in ("baidu", "sina_news"):
            section = section.get(section_name)
        self.xpathConf = section
        self.Wait_Element = self.xpathConf.get("wait_element")
Ejemplo n.º 3
0
 def _process_item(self, item, spider):
     """Write one item as a row via ``self.client.mutateRow`` and return it.

     ``self.item_key(item, spider)`` supplies the column names, the matching
     values and the key; the row id passed to ``mutateRow`` is
     ``confUtil.getMd5(key)``.
     """
     columns, values, row_key = self.item_key(item, spider)

     # Debug output, one value per line (single-argument print works
     # identically as a Python 2 statement).
     for debug_value in (columns, values, row_key):
         print(debug_value)

     # One Mutation per (column, value) pair.
     row_mutations = []
     for column, value in zip(columns, values):
         row_mutations.append(Mutation(column=column, value=value))

     self.client.mutateRow(
         self.tableName, confUtil.getMd5(row_key), row_mutations, None)
     return item
Ejemplo n.º 4
0
    def __init__(self, **kwargs):
        """Initialise the spider from keyword arguments and load its XPath config.

        ``name`` and ``redis_key`` are read from ``kwargs`` (``None`` when
        absent). The XPath settings are the ``baidu`` -> ``search_conf`` entry
        of the object returned by ``confUtil.getJsonStr("baidu.json")``.
        """
        super(BaiduSearchBySiteKeywordSpider, self).__init__(**kwargs)
        for attr in ("name", "redis_key"):
            setattr(self, attr, kwargs.get(attr))

        # Descend into the config one level at a time.
        baidu_json = confUtil.getJsonStr("baidu.json")
        baidu_section = baidu_json.get("baidu")
        self.xpathConf = baidu_section.get("search_conf")
        self.Wait_Element = self.xpathConf.get("wait_element")
Ejemplo n.º 5
0
    def __init__(self, **kwargs):
        """Initialise the spider from keyword arguments and load its XPath config.

        ``name`` and ``redis_key`` are read from ``kwargs`` (``None`` when
        absent). The XPath settings are looked up in the ``weibo_com`` entry
        of ``confUtil.getJsonStr("weibo.json")`` under ``self.spider_type``,
        which is read here but not assigned in this method.
        """
        super(WeiboSearchStartSpider, self).__init__(**kwargs)
        self.name = kwargs.get("name")
        self.redis_key = kwargs.get("redis_key")

        # Resolve the section for this spider type step by step.
        weibo_json = confUtil.getJsonStr("weibo.json")
        weibo_com_section = weibo_json.get("weibo_com")
        self.xpathConf = weibo_com_section.get(self.spider_type)
        self.Wait_Element = self.xpathConf.get("wait_element")
Ejemplo n.º 6
0
    def __init__(self, **kwargs):
        """Build the spider and cache its XPath settings.

        Stores ``kwargs.get("name")`` and ``kwargs.get("redis_key")`` on the
        instance, then reads the ``sougou`` -> ``sougou_weixin`` entry from
        ``confUtil.getJsonStr("sougou.json")`` and its ``wait_element`` value.
        """
        super(SougouWeixinFilterSpider, self).__init__(**kwargs)
        self.name = kwargs.get("name")
        self.redis_key = kwargs.get("redis_key")

        # Walk the nested config: file -> "sougou" -> "sougou_weixin".
        node = confUtil.getJsonStr("sougou.json")
        for level in ("sougou", "sougou_weixin"):
            node = node.get(level)
        self.xpathConf = node
        self.Wait_Element = self.xpathConf.get("wait_element")
    def __init__(self, **kwargs):
        """Build the spider and cache its XPath settings.

        Stores ``kwargs.get("name")`` and ``kwargs.get("redis_key")`` on the
        instance, then reads the ``baidu`` -> ``search_conf`` entry from
        ``confUtil.getJsonStr("baidu.json")`` and its ``wait_element`` value.
        """
        super(BaiduSearchBySiteKeywordSpider, self).__init__(**kwargs)
        self.name = kwargs.get("name")
        self.redis_key = kwargs.get("redis_key")

        # Walk the nested config: file -> "baidu" -> "search_conf".
        node = confUtil.getJsonStr("baidu.json")
        for level in ("baidu", "search_conf"):
            node = node.get(level)
        self.xpathConf = node
        self.Wait_Element = self.xpathConf.get("wait_element")
    def __init__(self, **kwargs):
        """Build the spider and cache its XPath settings.

        Stores ``kwargs.get("name")`` and ``kwargs.get("redis_key")`` on the
        instance. The settings are taken from the ``weibo_com`` entry of
        ``confUtil.getJsonStr("weibo.json")``, keyed by ``self.spider_type``
        (read here, not assigned in this method).
        """
        super(WeiboSearchStartSpider, self).__init__(**kwargs)
        for attr in ("name", "redis_key"):
            setattr(self, attr, kwargs.get(attr))

        # Select the section matching this spider's type.
        site_conf = confUtil.getJsonStr("weibo.json").get("weibo_com")
        type_conf = site_conf.get(self.spider_type)
        self.xpathConf = type_conf
        self.Wait_Element = self.xpathConf.get("wait_element")
    def __init__(self, **kwargs):
        """Build the spider and cache its XPath settings.

        Stores ``kwargs.get("name")`` and ``kwargs.get("redis_key")`` on the
        instance, then reads the ``baidu`` -> ``sina_news`` entry from
        ``confUtil.getJsonStr("baidu.json")`` and its ``wait_element`` value.
        """
        super(BaiduSinaNewsFilterSpider, self).__init__(**kwargs)
        for attr in ("name", "redis_key"):
            setattr(self, attr, kwargs.get(attr))

        # Descend into the config one level at a time.
        baidu_json = confUtil.getJsonStr("baidu.json")
        baidu_section = baidu_json.get("baidu")
        self.xpathConf = baidu_section.get("sina_news")
        self.Wait_Element = self.xpathConf.get("wait_element")
Ejemplo n.º 10
0
    def __init__(self, **kwargs):
        """Initialise the spider, load its XPath config and register ``title``.

        ``name`` and ``redis_key`` are read from ``kwargs`` (``None`` when
        absent). The XPath settings come from
        ``confUtil.getJsonStr("weixin.json")``, keyed first by
        ``self.siteName`` and then by ``self.spider_type``; both attributes,
        like ``self.itemKeys``, are read here but not assigned in this method.
        """
        super(WeixinContentSpider, self).__init__(**kwargs)
        for attr in ("name", "redis_key"):
            setattr(self, attr, kwargs.get(attr))

        # Resolve site first, then spider type.
        weixin_json = confUtil.getJsonStr("weixin.json")
        site_section = weixin_json.get(self.siteName)
        self.xpathConf = site_section.get(self.spider_type)
        self.Wait_Element = self.xpathConf.get("wait_element")

        # Add "title" to the existing item key list.
        extra_item_key = "title"
        self.itemKeys.append(extra_item_key)
Ejemplo n.º 11
0
    def hbase_tables(self):
        """Print the client's table names, then write one hard-coded sample row.

        The row id is ``confUtil.getMd5`` of the sample URL (also printed);
        the cells are written to ``self.tableName`` via
        ``self.client.mutateRow``.
        """
        print(self.client.getTableNames())

        sample_url = "http://news.sina.com.cn/c/2015-03-10/023931587440.shtml"

        # (column, value) pairs of the sample row, in write order.
        sample_cells = [
            ('detail:publish_time', '2015-03-10 02:39'),
            ('detail:site_source', 'news.sina.com.cn'),
            ('detail:site_type', 'news'),
            ('detail:site_url', sample_url),
            ('detail:task_id', '-1'),
            ('detail:author', u'\u4eac\u534e\u65f6\u62a5'.encode("utf-8")),
            ('detail:catch_date', '2015-03-27'),
        ]

        print(confUtil.getMd5(sample_url))

        row_mutations = []
        for column, value in sample_cells:
            row_mutations.append(Mutation(column=column, value=value))

        self.client.mutateRow(
            self.tableName, confUtil.getMd5(sample_url), row_mutations, None)
Ejemplo n.º 12
0
    def __init__(self, **kwargs):
        """Initialise the spider, derive its output key and load its XPath config.

        ``name`` and ``redis_key`` are read from ``kwargs``. ``out_key`` is the
        part of ``name`` before its first ``":"``; ``str.index`` raises
        ``ValueError`` when the name has no colon. The XPath settings are the
        ``weibo_cn`` -> ``user_info`` entry of
        ``confUtil.getJsonStr("weibo.json")``.
        """
        super(WeiboComUserInfoContentSpider, self).__init__(**kwargs)
        self.name = kwargs.get("name")
        self.redis_key = kwargs.get("redis_key")

        # Key under which crawl results are output to Redis (per the original
        # author's note): the name prefix up to the first colon.
        spider_name = self.name
        colon_at = self.name.index(":")
        self.out_key = spider_name[0:colon_at]

        # Descend into the config one level at a time.
        weibo_json = confUtil.getJsonStr("weibo.json")
        weibo_cn_section = weibo_json.get("weibo_cn")
        self.xpathConf = weibo_cn_section.get("user_info")
        self.Wait_Element = self.xpathConf.get("wait_element")
Ejemplo n.º 13
0
    def __init__(self, **kwargs):
        """Build the spider, cache its XPath settings and add the ``title`` key.

        Stores ``kwargs.get("name")`` and ``kwargs.get("redis_key")`` on the
        instance. The settings are read from
        ``confUtil.getJsonStr("weixin.json")`` under ``self.siteName`` and then
        ``self.spider_type``; those attributes and ``self.itemKeys`` are used
        here but not assigned in this method.
        """
        super(WeixinContentSpider, self).__init__(**kwargs)
        self.name = kwargs.get("name")
        self.redis_key = kwargs.get("redis_key")

        # Site level first, then the spider-type level beneath it.
        per_site = confUtil.getJsonStr("weixin.json").get(self.siteName)
        per_type = per_site.get(self.spider_type)
        self.xpathConf = per_type
        self.Wait_Element = self.xpathConf.get("wait_element")

        # Add "title" to the existing item key list.
        for item_key in ("title",):
            self.itemKeys.append(item_key)
    def __init__(self, **kwargs):
        """Build the spider, compute ``out_key`` and cache its XPath settings.

        Stores ``kwargs.get("name")`` and ``kwargs.get("redis_key")`` on the
        instance. ``out_key`` is ``name`` truncated at its first ``":"``
        (``str.index`` raises ``ValueError`` if there is none). The settings
        are the ``weibo_cn`` -> ``user_info`` entry of
        ``confUtil.getJsonStr("weibo.json")``.
        """
        super(WeiboComUserInfoContentSpider, self).__init__(**kwargs)
        for attr in ("name", "redis_key"):
            setattr(self, attr, kwargs.get(attr))

        # Key under which crawl results are output to Redis (per the original
        # author's note).
        full_name = self.name
        prefix_end = self.name.index(":")
        self.out_key = full_name[0:prefix_end]

        # Walk the nested config: file -> "weibo_cn" -> "user_info".
        node = confUtil.getJsonStr("weibo.json")
        for level in ("weibo_cn", "user_info"):
            node = node.get(level)
        self.xpathConf = node
        self.Wait_Element = self.xpathConf.get("wait_element")
 def _process_item(self, item, spider):
     """Store ``item`` as a single row through ``self.client`` and return it.

     Column names, values and the key come from
     ``self.item_key(item, spider)``. Each column/value pair becomes a
     ``Mutation``, and ``confUtil.getMd5(key)`` is used as the row id.
     """
     names, data, raw_key = self.item_key(item, spider)

     # Debug output; parenthesised single-argument print behaves the same
     # as the Python 2 print statement.
     print(names)
     print(data)
     print(raw_key)

     pending = []
     for name, datum in zip(names, data):
         pending.append(Mutation(column=name, value=datum))

     self.client.mutateRow(
         self.tableName,
         confUtil.getMd5(raw_key),
         pending,
         None,
     )
     return item
Ejemplo n.º 16
0
    def __init__(self, **kwargs):
        """Initialise the spider, load its XPath config and add extra item keys.

        ``name`` and ``redis_key`` are read from ``kwargs`` (``None`` when
        absent). The XPath settings are the ``weibo_com`` -> ``search_conf``
        entry of ``confUtil.getJsonStr("weibo.json")``. Four keys are appended
        to ``self.itemKeys``, which is read here but not assigned in this
        method.
        """
        super(WeiboComSearchSpider, self).__init__(**kwargs)
        for attr in ("name", "redis_key"):
            setattr(self, attr, kwargs.get(attr))

        # Descend into the config one level at a time.
        weibo_json = confUtil.getJsonStr("weibo.json")
        weibo_com_section = weibo_json.get("weibo_com")
        self.xpathConf = weibo_com_section.get("search_conf")
        self.Wait_Element = self.xpathConf.get("wait_element")

        # Extra item keys, appended in this order.
        for extra_key in ('attitude', 'comments', 'repost', 'user_url'):
            self.itemKeys.append(extra_key)
    def __init__(self, **kwargs):
        """Build the spider, cache its XPath settings and extend ``itemKeys``.

        Stores ``kwargs.get("name")`` and ``kwargs.get("redis_key")`` on the
        instance, reads the ``weibo_com`` -> ``search_conf`` entry from
        ``confUtil.getJsonStr("weibo.json")``, and appends ``attitude``,
        ``comments``, ``repost`` and ``user_url`` to ``self.itemKeys``.
        """
        super(WeiboComSearchSpider, self).__init__(**kwargs)
        self.name = kwargs.get("name")
        self.redis_key = kwargs.get("redis_key")

        # Walk the nested config: file -> "weibo_com" -> "search_conf".
        node = confUtil.getJsonStr("weibo.json")
        for level in ("weibo_com", "search_conf"):
            node = node.get(level)
        self.xpathConf = node
        self.Wait_Element = self.xpathConf.get("wait_element")

        # Extra item keys, appended one at a time in the original order.
        extra_keys = ['attitude', 'comments', 'repost', 'user_url']
        for extra_key in extra_keys:
            self.itemKeys.append(extra_key)
    def hbase_tables(self):
        """Print the table names reported by the client and write a sample row.

        The sample columns and values are hard-coded. The row id is
        ``confUtil.getMd5`` of the sample URL, which is printed before the
        row is sent with ``self.client.mutateRow`` to ``self.tableName``.
        """
        table_names = self.client.getTableNames()
        print(table_names)

        article_url = "http://news.sina.com.cn/c/2015-03-10/023931587440.shtml"

        column_names = [
            'detail:publish_time',
            'detail:site_source',
            'detail:site_type',
            'detail:site_url',
            'detail:task_id',
            'detail:author',
            'detail:catch_date',
        ]
        column_values = [
            '2015-03-10 02:39',
            'news.sina.com.cn',
            'news',
            article_url,
            '-1',
            u'\u4eac\u534e\u65f6\u62a5'.encode("utf-8"),
            '2015-03-27',
        ]

        print(confUtil.getMd5(article_url))

        # Pair every column with its value and wrap each pair in a Mutation.
        pending = []
        for column_name, column_value in zip(column_names, column_values):
            pending.append(Mutation(column=column_name, value=column_value))

        self.client.mutateRow(
            self.tableName,
            confUtil.getMd5(article_url),
            pending,
            None,
        )
Ejemplo n.º 19
0
 def __init__(self):
     """Record the value of ``confUtil.getLocalIp()`` and create a ``RedisConfUtil``."""
     local_address = confUtil.getLocalIp()
     self.local_ip = local_address
     self.redisConfUtil = RedisConfUtil()
Ejemplo n.º 20
0
 def __init__(self):
     """Store ``confUtil.getLocalIp()`` as ``local_ip`` and a new ``RedisConfUtil``.

     The two attributes are set in that order.
     """
     self.local_ip = confUtil.getLocalIp()
     redis_conf_util = RedisConfUtil()
     self.redisConfUtil = redis_conf_util