def from_settings(cls):
    """Alternate constructor: build a dupefilter from global settings.

    A timestamp-based, one-time redis key is generated here so this class
    can be used as a standalone dupefilter with scrapy's default
    scheduler; if scrapy passed the spider into open() this would not be
    needed.
    """
    dupe_key = "dupefilter:%s" % int(time.time())
    client = CyeRedis.getInstance()
    return cls(client, dupe_key)
def initCye(self):
    """Initialize spider state: DB session, product/price queries, start
    URLs for this spider's namespace, crawl rules and the redis client.

    NOTE(review): assumes `self.namespace`, `self._process_product_links`
    and `self._getNextRuleByUrl` are provided by the enclosing spider
    class -- confirm against the class definition.
    """
    # Scoped SQLAlchemy session plus pre-built queries reused by callbacks.
    self.session = scoped_session(MygiftSession)
    self.query_product = self.session.query(ProductObj)
    self.query_price = self.session.query(ProductPriceObj)
    # START_URLS is a per-namespace mapping in the project settings.
    self.start_urls = (settings.get('START_URLS'))[self.namespace]
    # FIX: use a raw string for the regex -- '\d' in a plain string is an
    # invalid escape sequence (deprecated; an error in modern Python).
    # The raw form has the identical runtime value, so behavior is unchanged.
    self.rules = [
        Rule(SgmlLinkExtractor(allow=[r'/\d+\.html']), 'parse_product',
             process_links=self._process_product_links)
    ]
    # Pagination/"next" rule derived from the start URLs.
    self.rules.append(self._getNextRuleByUrl(self.start_urls, 'parse_Request'))
    #self.urls_key = settings.get('REDIS_URLS_KEY', '%s:urls') % self.namespace
    # redis instance
    self.redis_cli = CyeRedis.getInstance()
def initCye(self):
    """Set up this spider: database session and queries, namespaced start
    URLs, link-extraction rules, and the shared redis client.

    NOTE(review): relies on `self.namespace`, `self._process_product_links`
    and `self._getNextRuleByUrl` from the enclosing class -- verify there.
    """
    self.session = scoped_session(MygiftSession)
    self.query_product = self.session.query(ProductObj)
    self.query_price = self.session.query(ProductPriceObj)
    # Start URLs are looked up per-namespace from the settings mapping.
    self.start_urls = (settings.get('START_URLS'))[self.namespace]
    # FIX: regex made a raw string; '\d' in a non-raw literal is an invalid
    # escape (deprecated in Python 3). Runtime value is identical.
    self.rules = [
        Rule(SgmlLinkExtractor(allow=[r'/\d+\.html']), 'parse_product',
             process_links=self._process_product_links)
    ]
    self.rules.append(
        self._getNextRuleByUrl(self.start_urls, 'parse_Request'))
    #self.urls_key = settings.get('REDIS_URLS_KEY', '%s:urls') % self.namespace
    # redis instance
    self.redis_cli = CyeRedis.getInstance()
def _init_urls(self):
    """Populate self.start_urls with URLs from the redis "update" sorted
    set that are due for a re-crawl according to the product table.

    A URL qualifies when its product row (matched by md5 of the URL) has
    never been crawled or was last crawled more than
    `crawl_time_interval` hours ago.

    NOTE(review): `lite_max_num` and `crawl_time_interval` appear to be
    module-level constants defined outside this chunk -- confirm.
    """
    self.session = scoped_session(MygiftSession)
    self.query_product = self.session.query(ProductObj)
    self.query_price = self.session.query(ProductPriceObj)
    self.redis_cli = CyeRedis.getInstance()
    # Per-namespace key of the sorted set holding URLs queued for update.
    self.update_urls_key = settings.get('REDIS_UPDATE_URLS_KEY', '%s:update') % self.namespace
    # withscores=True => each element is a (member, score) pair; one[0] is the URL.
    results = self.redis_cli.zrange(self.update_urls_key, 0, lite_max_num, withscores=True)
    if results:
        for one in results:
            # Products are keyed by the md5 hex digest of their URL.
            pkey = hashlib.md5(one[0]).hexdigest()
            # Textual SQL filter: never crawled, or crawled longer than
            # :time_interval hours ago (evaluated by MySQL DATE_SUB).
            product = self.query_product.filter(ProductObj.pkey == pkey).filter(or_("last_crawl_time is null", "last_crawl_time<DATE_SUB(NOW(), INTERVAL :time_interval HOUR)")).\
                params(time_interval=crawl_time_interval).first()
            if product:
                self.start_urls.append(one[0])
        #self.start_urls.extend(results)
        self.log("The number of links : %d" % len(results), log.INFO)
    else:
        self.log("Not found link to update.", log.INFO)
def from_settings(cls, settings):
    """Alternate constructor: build a scheduler from *settings*.

    Reads the persistence flag and the queue key, falling back to the
    module-level defaults when a setting is absent.
    """
    keep_on_close = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
    q_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
    return cls(CyeRedis.getInstance(), keep_on_close, q_key)
if match: flag = True return flag @classmethod def isJingdongProduct(cls, varobj): flag = False if cls.isString(varobj): rule = r'http://www.360buy\.com/product/\d+\.[a-zA-Z]+' match = re.match(rule, varobj) if match: flag = True return flag redis_cli = CyeRedis.getInstance() def pushUrl(url, score=0): ret = False if Validation.isJingdongProduct(url): namespace = 'jingdong' update_urls_key = settings.get('REDIS_UPDATE_URLS_KEY', '%s:update') % namespace print update_urls_key if redis_cli.zadd(update_urls_key, url, score): ret = True return ret if __name__ == "__main__":
match = re.match(rule, varobj) if match: flag = True return flag @classmethod def isJingdongProduct(cls, varobj): flag = False if cls.isString(varobj): rule = r'http://www.360buy\.com/product/\d+\.[a-zA-Z]+' match = re.match(rule, varobj) if match: flag = True return flag redis_cli = CyeRedis.getInstance() def pushUrl(url, score=0): ret = False if Validation.isJingdongProduct(url): namespace = 'jingdong' update_urls_key = settings.get('REDIS_UPDATE_URLS_KEY', '%s:update') % namespace print update_urls_key if redis_cli.zadd(update_urls_key, url, score): ret = True return ret if __name__ == "__main__": print sys.path url= None