Example #1
0
 def from_settings(cls):
     """Create an instance bound to a freshly generated dupefilter key.

     The key is one-time: it embeds the current Unix time in whole seconds.
     Generating it here lets this class be used as a standalone dupefilter
     with scrapy's default scheduler; if scrapy passed the spider to the
     open() method, this generated key would not be needed.
     """
     client = CyeRedis.getInstance()
     timestamp = int(time.time())
     return cls(client, "dupefilter:%s" % timestamp)
Example #2
0
 def from_settings(cls):
     """Build the dupefilter with a Redis client and a one-time key.

     The key has the form ``dupefilter:<seconds since the epoch>``. It is
     created here so the class can serve as a standalone dupefilter under
     scrapy's default scheduler; it would be unnecessary if scrapy handed
     the spider to the open() method.
     """
     redis_client = CyeRedis.getInstance()
     one_time_key = "dupefilter:%s" % int(time.time())
     instance = cls(redis_client, one_time_key)
     return instance
Example #3
0
 def initCye(self):
     """Set up the database queries, crawl rules and Redis client.

     Assigns ``session``, ``query_product``, ``query_price``,
     ``start_urls``, ``rules`` and ``redis_cli`` on the instance.
     """
     self.session = scoped_session(MygiftSession)
     self.query_product = self.session.query(ProductObj)
     self.query_price = self.session.query(ProductPriceObj)

     # The START_URLS setting is indexed by this instance's namespace.
     self.start_urls = (settings.get('START_URLS'))[self.namespace]

     # Raw string: keeps the regex escapes (\d, \.) away from Python's own
     # string-escape handling. The pattern text itself is unchanged.
     self.rules = [
         Rule(SgmlLinkExtractor(allow=[r'/\d+\.html']),
              'parse_product',
              process_links=self._process_product_links)
     ]
     # Second rule is derived from the start URLs; its callback is named
     # 'parse_Request'.
     self.rules.append(
         self._getNextRuleByUrl(self.start_urls, 'parse_Request'))

     #self.urls_key = settings.get('REDIS_URLS_KEY', '%s:urls') % self.namespace

     # Redis instance
     self.redis_cli = CyeRedis.getInstance()
Example #4
0
    def initCye(self):
        """Initialise the ORM queries, link-following rules and Redis client.

        Populates ``session``, ``query_product``, ``query_price``,
        ``start_urls``, ``rules`` and ``redis_cli`` on the instance.
        """
        self.session = scoped_session(MygiftSession)
        self.query_product = self.session.query(ProductObj)
        self.query_price = self.session.query(ProductPriceObj)

        # START_URLS is looked up per namespace.
        self.start_urls = (settings.get('START_URLS'))[self.namespace]

        # The pattern is written as a raw string so that \d and \. are
        # regex escapes only, not (invalid) Python string escapes; the
        # resulting pattern text is identical.
        self.rules = [
            Rule(SgmlLinkExtractor(allow=[r'/\d+\.html']),
                 'parse_product',
                 process_links=self._process_product_links)
        ]
        # Add the rule built from the start URLs, handled by 'parse_Request'.
        self.rules.append(
            self._getNextRuleByUrl(self.start_urls, 'parse_Request'))

        #self.urls_key = settings.get('REDIS_URLS_KEY', '%s:urls') % self.namespace

        # Redis instance
        self.redis_cli = CyeRedis.getInstance()
Example #5
0
 def _init_urls(self):
     """Fill ``start_urls`` from the Redis sorted set of URLs to update.

     Reads entries ranked 0..``lite_max_num`` (with scores) from the key
     ``update_urls_key``. A URL is appended to ``start_urls`` only when a
     product row exists whose ``pkey`` is the MD5 hex digest of the URL and
     whose ``last_crawl_time`` is null or older than
     ``crawl_time_interval`` hours. The outcome is logged at INFO level.
     """
     self.session = scoped_session(MygiftSession)
     self.query_product = self.session.query(ProductObj)
     self.query_price = self.session.query(ProductPriceObj)

     self.redis_cli = CyeRedis.getInstance()
     key_template = settings.get('REDIS_UPDATE_URLS_KEY', '%s:update')
     self.update_urls_key = key_template % self.namespace
     candidates = self.redis_cli.zrange(
         self.update_urls_key, 0, lite_max_num, withscores=True)

     if not candidates:
         self.log("Not found link to update.", log.INFO)
         return

     for entry in candidates:
         # Each entry is indexed as (member, score); only the member is used.
         url = entry[0]
         pkey = hashlib.md5(url).hexdigest()
         by_key = self.query_product.filter(ProductObj.pkey == pkey)
         # Keep only products never crawled or crawled too long ago.
         due = by_key.filter(
             or_("last_crawl_time is null",
                 "last_crawl_time<DATE_SUB(NOW(), INTERVAL :time_interval HOUR)"))
         product = due.params(time_interval=crawl_time_interval).first()
         if product:
             self.start_urls.append(url)
     #self.start_urls.extend(results)
     # Note: the count logged is the number of entries read from Redis,
     # not the number appended to start_urls.
     self.log("The number of  links : %d" % len(candidates), log.INFO)
Example #6
0
 def from_settings(cls, settings):
     """Create an instance configured from ``settings``.

     ``SCHEDULER_PERSIST`` and ``SCHEDULER_QUEUE_KEY`` are read from
     ``settings``, falling back to the ``SCHEDULER_PERSIST`` and
     ``QUEUE_KEY`` defaults, and passed to the constructor together with
     the client returned by ``CyeRedis.getInstance()``.
     """
     client = CyeRedis.getInstance()
     return cls(
         client,
         settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST),
         settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY),
     )
Example #7
0
            if match:
                flag = True
        return flag

    @classmethod
    def isJingdongProduct(cls, varobj):
        """Return True if ``varobj`` is a Jingdong (360buy) product URL.

        ``varobj`` must pass ``cls.isString`` and must start with
        ``http://www.360buy.com/product/<digits>.<letters>``. The match is
        anchored at the start only, so trailing text is allowed.

        Every dot in the host name is escaped, so look-alike hosts such as
        ``wwwX360buy.com`` are rejected.
        """
        if not cls.isString(varobj):
            return False
        rule = r'http://www\.360buy\.com/product/\d+\.[a-zA-Z]+'
        return re.match(rule, varobj) is not None


# Module-level Redis client; pushUrl() below uses it for its zadd call.
redis_cli = CyeRedis.getInstance()


def pushUrl(url, score=0):
    """Add a Jingdong product URL to the Redis sorted set of URLs to update.

    URLs rejected by ``Validation.isJingdongProduct`` are ignored. The key
    comes from the ``REDIS_UPDATE_URLS_KEY`` setting (default
    ``'%s:update'``) formatted with the ``'jingdong'`` namespace, and is
    printed before the write.

    Returns True only when the ``zadd`` call returns a truthy value,
    otherwise False.
    """
    if not Validation.isJingdongProduct(url):
        return False

    namespace = 'jingdong'
    key_template = settings.get('REDIS_UPDATE_URLS_KEY', '%s:update')
    update_urls_key = key_template % namespace
    # Single-argument parenthesised form prints the same on Python 2 and 3.
    print(update_urls_key)
    return bool(redis_cli.zadd(update_urls_key, url, score))


if __name__ == "__main__":
Example #8
0
            match = re.match(rule, varobj)
            if match:
                flag = True
        return flag
    
    @classmethod
    def isJingdongProduct(cls, varobj):
        """Tell whether ``varobj`` looks like a Jingdong (360buy) product URL.

        Returns False when ``cls.isString(varobj)`` is false. Otherwise
        returns True only if ``varobj`` begins with
        ``http://www.360buy.com/product/<digits>.<letters>``; anything may
        follow, because the pattern is anchored at the start only.

        The dot after ``www`` is escaped so it matches a literal ``.``
        rather than any character.
        """
        if not cls.isString(varobj):
            return False
        pattern = r'http://www\.360buy\.com/product/\d+\.[a-zA-Z]+'
        return bool(re.match(pattern, varobj))

# Module-level Redis client used by pushUrl() below for its zadd call.
redis_cli = CyeRedis.getInstance()


def pushUrl(url, score=0):
    """Queue ``url`` with ``score`` in the Jingdong update sorted set.

    Nothing is written unless ``Validation.isJingdongProduct(url)`` is
    true. The Redis key is the ``REDIS_UPDATE_URLS_KEY`` setting (default
    ``'%s:update'``) formatted with ``'jingdong'``; it is printed before
    the ``zadd`` call.

    Returns True if ``zadd`` returned a truthy value, False otherwise
    (including when the URL is rejected).
    """
    pushed = False
    if Validation.isJingdongProduct(url):
        template = settings.get('REDIS_UPDATE_URLS_KEY', '%s:update')
        update_urls_key = template % 'jingdong'
        # Parenthesised single argument: same output on Python 2 and 3.
        print(update_urls_key)
        pushed = bool(redis_cli.zadd(update_urls_key, url, score))
    return pushed

if __name__ == "__main__":
    print sys.path
    url= None
Example #9
0
 def from_settings(cls, settings):
     """Build an instance from ``settings`` and the CyeRedis client.

     Reads ``SCHEDULER_PERSIST`` (default: ``SCHEDULER_PERSIST``) and
     ``SCHEDULER_QUEUE_KEY`` (default: ``QUEUE_KEY``) from ``settings`` and
     passes them, after the Redis client, to the constructor.
     """
     redis_client = CyeRedis.getInstance()
     persist_flag = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
     key_for_queue = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
     instance = cls(redis_client, persist_flag, key_for_queue)
     return instance