def prepare(self, fromtext=False, start_idx=0, end_idx=100):
    """Load the uids this spider should process.

    Args:
        fromtext: When false (default), read the uids from the Redis set
            named by ``UIDS_SET`` for this spider. When true, read them
            from the local text file ``uidlist_20140103.txt``.
        start_idx: First line number (1-based, inclusive) to read when
            ``fromtext`` is true. Ignored for the Redis source.
        end_idx: Last line number (1-based, inclusive) to read when
            ``fromtext`` is true. Ignored for the Redis source.

    Returns:
        For the Redis source, whatever ``self.r.smembers`` returns
        (presumably a set of raw members -- confirm against the client
        built by ``_default_redis``). For the text source, a list of ints
        taken from the first comma-separated field of each selected line.

    Raises:
        IOError/OSError: If the text file cannot be opened.
        ValueError: If a selected line does not start with an integer.
    """
    if not fromtext:
        host = settings.get("REDIS_HOST", REDIS_HOST)
        port = settings.get("REDIS_PORT", REDIS_PORT)
        self.r = _default_redis(host, port)
        uids_set = UIDS_SET.format(spider=self.name)
        log.msg(format="Load uids from %(uids_set)s",
                level=log.WARNING, uids_set=uids_set)
        uids = self.r.smembers(uids_set)
        # Truthiness check: an empty set never compares equal to [], so
        # the previous `uids == []` test could not report an empty load.
        if not uids:
            log.msg(format="Not load any uids from %(uids_set)s",
                    level=log.WARNING, uids_set=uids_set)
        return uids

    fname = "uidlist_20140103.txt"
    log.msg(format="Load uids from %(uids_set)s",
            level=log.WARNING, uids_set=fname)

    # The file lives one level up when the process runs from a directory
    # whose path ends in "cron4win".
    if os.getcwd().endswith("cron4win"):
        path = "../test/%s" % fname
    else:
        path = "./test/%s" % fname

    uids = []
    # `with` guarantees the handle is closed even if int() raises.
    with open(path, "r") as f:
        for count, line in enumerate(f, 1):
            if count > end_idx:
                break  # past the requested window; stop reading
            if count >= start_idx:
                uids.append(int(line.strip().split(",")[0]))

    if not uids:
        log.msg(format="Not load any uids from %(uids_set)s",
                level=log.WARNING, uids_set=fname)
    return uids
def prepare(self, fromtext=False, start_idx=0, end_idx=100):
    """Load the uids this spider should process.

    Args:
        fromtext: When false (default), read the uids from the Redis set
            named by ``UIDS_SET`` for this spider. When true, read them
            from the local file ``./source/uid_about_marine``.
        start_idx: First line number (1-based, inclusive) to read when
            ``fromtext`` is true. Ignored for the Redis source.
        end_idx: Last line number (1-based, inclusive) to read when
            ``fromtext`` is true. Ignored for the Redis source.

    Returns:
        For the Redis source, whatever ``self.r.smembers`` returns
        (presumably a set of raw members -- confirm against the client
        built by ``_default_redis``). For the file source, a list of ints,
        one per selected line.

    Raises:
        IOError/OSError: If the file cannot be opened.
        ValueError: If a selected line is not an integer.
    """
    if not fromtext:
        host = settings.get('REDIS_HOST', REDIS_HOST)
        port = settings.get('REDIS_PORT', REDIS_PORT)
        self.r = _default_redis(host, port)
        uids_set = UIDS_SET.format(spider=self.name)
        log.msg(format='Load uids from %(uids_set)s',
                level=log.WARNING, uids_set=uids_set)
        uids = self.r.smembers(uids_set)
        # Truthiness check: an empty set never compares equal to [], so
        # the previous `uids == []` test could not report an empty load.
        if not uids:
            log.msg(format='Not load any uids from %(uids_set)s',
                    level=log.WARNING, uids_set=uids_set)
        return uids

    fname = 'uid_about_marine'
    log.msg(format='Load uids from %(uids_set)s',
            level=log.WARNING, uids_set=fname)

    uids = []
    # `with` guarantees the handle is closed even if int() raises.
    with open('./source/%s' % fname) as f:
        for count, line in enumerate(f, 1):
            # The stop condition is stated against end_idx directly; the
            # old `count > start_idx` test only stopped at the right place
            # because it ran after the in-window check had already failed.
            if count > end_idx:
                break
            if count >= start_idx:
                uids.append(int(line.strip()))

    if not uids:
        log.msg(format='Not load any uids from %(uids_set)s',
                level=log.WARNING, uids_set=fname)
    return uids
def prepare(self):
    """Load the mids this spider should process from Redis.

    Builds a client with ``_default_redis`` from the ``REDIS_HOST`` and
    ``REDIS_PORT`` settings (falling back to the module-level defaults),
    stores it on ``self.r`` and reads every member of the set named by
    ``MIDS_SET`` for this spider.

    Returns:
        Whatever ``self.r.smembers`` returns (presumably a set of raw
        members -- confirm against the client built by ``_default_redis``).
    """
    host = settings.get('REDIS_HOST', REDIS_HOST)
    port = settings.get('REDIS_PORT', REDIS_PORT)
    self.r = _default_redis(host, port)

    mids_set = MIDS_SET.format(spider=self.name)
    log.msg(format='Load mids from %(mids_set)s',
            level=log.WARNING, mids_set=mids_set)
    mids = self.r.smembers(mids_set)

    # Truthiness check: an empty set never compares equal to [], so the
    # previous `mids == []` test could not report an empty load.
    if not mids:
        log.msg(format='Not load any mids from %(mids_set)s',
                level=log.WARNING, mids_set=mids_set)
    return mids
def prepare(self):
    """Load the uids this spider should process from Redis.

    Builds a client with ``_default_redis`` from the ``REDIS_HOST`` and
    ``REDIS_PORT`` settings (falling back to the module-level defaults),
    stores it on ``self.r`` and reads every member of the set named by
    ``UIDS_SET`` for this spider.

    Returns:
        Whatever ``self.r.smembers`` returns (presumably a set of raw
        members -- confirm against the client built by ``_default_redis``).
    """
    host = settings.get('REDIS_HOST', REDIS_HOST)
    port = settings.get('REDIS_PORT', REDIS_PORT)
    self.r = _default_redis(host, port)

    uids_set = UIDS_SET.format(spider=self.name)
    log.msg(format='Load uids from %(uids_set)s',
            level=log.INFO, uids_set=uids_set)
    uids = self.r.smembers(uids_set)

    # Truthiness check: an empty set never compares equal to [], so the
    # previous `uids == []` test could not report an empty load.
    if not uids:
        log.msg(format='Not load any uids from %(uids_set)s',
                level=log.INFO, uids_set=uids_set)
    return uids
def prepare(self):
    """Load the weibo ids this spider should process from Redis.

    Builds a client with ``_default_redis`` from the ``REDIS_HOST`` and
    ``REDIS_PORT`` settings (falling back to the module-level defaults),
    stores it on ``self.r`` and reads every member of the set named by
    ``WEIBOIDS_SET`` for this spider.

    Returns:
        Whatever ``self.r.smembers`` returns (presumably a set of raw
        members -- confirm against the client built by ``_default_redis``).
    """
    host = settings.get('REDIS_HOST', REDIS_HOST)
    port = settings.get('REDIS_PORT', REDIS_PORT)
    self.r = _default_redis(host, port)

    weiboids_set = WEIBOIDS_SET.format(spider=self.name)
    log.msg(format='Load weiboids from %(weiboids_set)s',
            level=log.INFO, weiboids_set=weiboids_set)
    weiboids = self.r.smembers(weiboids_set)

    # Truthiness check: an empty set never compares equal to [], so the
    # previous `weiboids == []` test could not report an empty load.
    if not weiboids:
        log.msg(format='Not load any weiboids from %(weiboids_set)s',
                level=log.INFO, weiboids_set=weiboids_set)
    return weiboids
def prepare(self, fromtext=False, start_idx=0, end_idx=100):
    """Load the uids this spider should process.

    Args:
        fromtext: When false (default), read the uids from the Redis set
            named by ``UIDS_SET`` for this spider. When true, read them
            from the local text file ``uidlist_20140103.txt``.
        start_idx: First line number (1-based, inclusive) to read when
            ``fromtext`` is true. Ignored for the Redis source.
        end_idx: Last line number (1-based, inclusive) to read when
            ``fromtext`` is true. Ignored for the Redis source.

    Returns:
        For the Redis source, whatever ``self.r.smembers`` returns
        (presumably a set of raw members -- confirm against the client
        built by ``_default_redis``). For the text source, a list of ints
        taken from the first comma-separated field of each selected line.

    Raises:
        IOError/OSError: If the text file cannot be opened.
        ValueError: If a selected line does not start with an integer.
    """
    if not fromtext:
        host = settings.get('REDIS_HOST', REDIS_HOST)
        port = settings.get('REDIS_PORT', REDIS_PORT)
        self.r = _default_redis(host, port)
        uids_set = UIDS_SET.format(spider=self.name)
        log.msg(format='Load uids from %(uids_set)s',
                level=log.WARNING, uids_set=uids_set)
        uids = self.r.smembers(uids_set)
        # Truthiness check: an empty set never compares equal to [], so
        # the previous `uids == []` test could not report an empty load.
        if not uids:
            log.msg(format='Not load any uids from %(uids_set)s',
                    level=log.WARNING, uids_set=uids_set)
        return uids

    fname = 'uidlist_20140103.txt'
    log.msg(format='Load uids from %(uids_set)s',
            level=log.WARNING, uids_set=fname)

    # The file lives one level up when the process runs from a directory
    # whose path ends in "cron4win".
    if os.getcwd().endswith('cron4win'):
        path = '../test/%s' % fname
    else:
        path = './test/%s' % fname

    uids = []
    # `with` guarantees the handle is closed even if int() raises.
    with open(path, 'r') as f:
        for count, line in enumerate(f, 1):
            if count > end_idx:
                break  # past the requested window; stop reading
            if count >= start_idx:
                uids.append(int(line.strip().split(',')[0]))

    if not uids:
        log.msg(format='Not load any uids from %(uids_set)s',
                level=log.WARNING, uids_set=fname)
    return uids
def prepare(self):
    """Load this spider's uids from Redis and keep those in the priority window.

    Builds a client with ``_default_redis`` from the ``REDIS_HOST`` and
    ``REDIS_PORT`` settings (falling back to the module-level defaults)
    and stores it on ``self.r``. The name of the priority hash is stored
    on ``self.uids_priority_set``. Every uid in the set named by
    ``UIDS_SET`` that has no entry in that hash is given
    ``DEFAULT_SCORE``; the uids are then filtered to those whose stored
    priority lies between ``self.gt`` and ``self.lt``, both inclusive.

    Returns:
        A list of the uids whose priority is within the window.
    """
    host = settings.get('REDIS_HOST', REDIS_HOST)
    port = settings.get('REDIS_PORT', REDIS_PORT)
    self.r = _default_redis(host, port)

    uids_set = UIDS_SET.format(spider=self.name)
    uids_priority_set = UIDS_PRIORITY_SET.format(spider=self.name)
    self.uids_priority_set = uids_priority_set

    log.msg(format='Load uids from %(uids_set)s',
            level=log.INFO, uids_set=uids_set)
    uids = self.r.smembers(uids_set)
    # Truthiness check: an empty set never compares equal to [], so the
    # previous `uids == []` test could not report an empty load.
    if not uids:
        log.msg(format='Not load any uids from %(uids_set)s',
                level=log.INFO, uids_set=uids_set)

    # Initialise the priority of every uid that does not have one yet.
    for uid in uids:
        if not self.r.hexists(self.uids_priority_set, uid):
            self.r.hset(self.uids_priority_set, uid, DEFAULT_SCORE)

    # Filter the uids by priority; both bounds are inclusive.
    uids = [
        uid for uid in uids
        if self.gt <= int(self.r.hget(self.uids_priority_set, uid)) <= self.lt
    ]
    log.msg(format='%(length)s uids between %(gt)s -> %(lt)s will be process',
            level=log.INFO, length=len(uids), gt=self.gt, lt=self.lt,
            uids_set=uids_set)
    return uids
def prepare(self):
    """Load this spider's uids from Redis and keep those in the priority window.

    Builds a client with ``_default_redis`` from the ``REDIS_HOST`` and
    ``REDIS_PORT`` settings (falling back to the module-level defaults)
    and stores it on ``self.r``. The name of the priority hash is stored
    on ``self.uids_priority_set``. Every uid in the set named by
    ``UIDS_SET`` that has no entry in that hash is given
    ``DEFAULT_SCORE``; the uids are then filtered to those whose stored
    priority lies between ``self.gt`` and ``self.lt``, both inclusive.

    Returns:
        A list of the uids whose priority is within the window.
    """
    host = settings.get('REDIS_HOST', REDIS_HOST)
    port = settings.get('REDIS_PORT', REDIS_PORT)
    self.r = _default_redis(host, port)

    uids_set = UIDS_SET.format(spider=self.name)
    uids_priority_set = UIDS_PRIORITY_SET.format(spider=self.name)
    self.uids_priority_set = uids_priority_set

    log.msg(format='Load uids from %(uids_set)s',
            level=log.WARNING, uids_set=uids_set)
    uids = self.r.smembers(uids_set)
    # Truthiness check: an empty set never compares equal to [], so the
    # previous `uids == []` test could not report an empty load.
    if not uids:
        log.msg(format='Not load any uids from %(uids_set)s',
                level=log.WARNING, uids_set=uids_set)

    # Initialise the priority of every uid that does not have one yet.
    for uid in uids:
        if not self.r.hexists(self.uids_priority_set, uid):
            self.r.hset(self.uids_priority_set, uid, DEFAULT_SCORE)

    # Filter the uids by priority; both bounds are inclusive.
    uids = [
        uid for uid in uids
        if self.gt <= int(self.r.hget(self.uids_priority_set, uid)) <= self.lt
    ]
    log.msg(
        format='%(length)s uids between %(gt)s -> %(lt)s will be process',
        level=log.INFO,
        length=len(uids),
        gt=self.gt,
        lt=self.lt,
        uids_set=uids_set)
    return uids