def from_spider(cls, spider):
    """Returns an instance from the given spider.

    Parameters
    ----------
    spider : scrapy.spiders.Spider
        Spider whose settings and name are used to build the dupefilter key.

    Returns
    -------
    RFPDupeFilter
        Instance of RFPDupeFilter.

    """
    settings = spider.settings
    server = get_redis_from_settings(settings)
    dupefilter_key = settings.get("SCHEDULER_DUPEFILTER_KEY",
                                  defaults.SCHEDULER_DUPEFILTER_KEY)
    key = dupefilter_key % {'spider': spider.name}
    debug = settings.getbool('DUPEFILTER_DEBUG', DUPEFILTER_DEBUG)
    bit = settings.getint('BLOOMFILTER_BIT', BLOOMFILTER_BIT)
    hash_number = settings.getint('BLOOMFILTER_HASH_NUMBER',
                                  BLOOMFILTER_HASH_NUMBER)
    return cls(server, key=key, debug=debug, bit=bit,
               hash_number=hash_number)
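# A minimal settings.py sketch for wiring the Bloom-filter dupefilter
# above. The setting names come from the snippet; the values are
# illustrative assumptions (with ScrapyRedisBloomFilter-style semantics,
# BLOOMFILTER_BIT = 30 usually means a 2**30-bit bitmap, and more hash
# functions lower the false-positive rate at the cost of speed).
BLOOMFILTER_BIT = 30
BLOOMFILTER_HASH_NUMBER = 6
DUPEFILTER_DEBUG = False
SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'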
def from_settings(cls, settings):
    """Returns an instance from given settings.

    This uses by default the key ``dupefilter:<timestamp>``. When using the
    ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
    it needs to pass the spider name in the key.

    Parameters
    ----------
    settings : scrapy.settings.Settings

    Returns
    -------
    RFPDupeFilter
        A RFPDupeFilter instance.

    """
    server = get_redis_from_settings(settings)
    # XXX: This creates a one-time key, needed to support using this class
    # as a standalone dupefilter with Scrapy's default scheduler; if Scrapy
    # passed the spider to open() this wouldn't be needed.
    # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
    key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())}
    debug = settings.getbool('DUPEFILTER_DEBUG')
    return cls(server, key=key, debug=debug)
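# For reference, defaults.DUPEFILTER_KEY in scrapy_redis is
# 'dupefilter:%(timestamp)s', so the key substitution above yields a
# fresh fingerprint set per run (the timestamp value is illustrative):
#
#   >>> 'dupefilter:%(timestamp)s' % {'timestamp': 1700000000}
#   'dupefilter:1700000000'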
def from_crawler(cls, crawler):
    settings = crawler.settings
    if settings.getbool('REDIS_IP_PROXY_ENABLED'):
        ipproxy_key = settings.get('REDIS_IP_PROXY_KEY')
        server = get_redis_from_settings(settings)
        return cls(server=server, key=ipproxy_key)
    return cls(settings.get('PROXIES'))
def from_spider(cls, spider):
    settings = spider.settings
    server = get_redis_from_settings(settings)
    dupefilter_key = settings.get("SCHEDULER_DUPEFILTER_KEY",
                                  defaults.SCHEDULER_DUPEFILTER_KEY)
    key = dupefilter_key % {'spider': spider.name}
    debug = settings.getbool('DUPEFILTER_DEBUG')
    return cls(server, key=key, debug=debug)
def from_settings(cls, settings, server=None, key=None, debug=None):
    # Fall back to the settings for any argument not passed explicitly,
    # and actually forward the gathered values to the constructor (the
    # original returned a bare cls() and left the real call commented out).
    if server is None:
        server = get_redis_from_settings(settings)
    if debug is None:
        debug = settings.getbool('DUPEFILTER_DEBUG')
    redis_db = settings.getint('REDIS_DB')
    redis_blockNum = settings.getint('REDIS_BLOCKNUM')
    redis_key = settings['REDIS_KEY']
    return cls(server, key=key, debug=debug, db=redis_db,
               blockNum=redis_blockNum, redis_key=redis_key)
def compete_key(self):
    self.server = get_redis_from_settings(self.settings)
    self.redis_compete = self.settings.get('REDIS_COMPETE') % {'spider': self.name}
    self.redis_wait = self.settings.get('REDIS_WAIT') % {'spider': self.name}
    # SADD returns 0 when the member already exists, so probing
    # 1, 2, 3, ... atomically assigns each process a distinct key.
    self.key = 1
    while self.server.sadd(self.redis_compete, self.key) == 0:
        self.key = self.key + 1
    self.logger.info("got key %s", self.key)
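# A standalone sketch of the claim pattern compete_key() relies on: SADD
# returns 1 only for the first client that adds a given member, so
# concurrent processes probing 1, 2, 3, ... each end up with a distinct
# slot. The set name 'demo:compete' and the local Redis connection are
# illustrative assumptions.
import redis

def claim_slot(server, compete_set='demo:compete'):
    slot = 1
    while server.sadd(compete_set, slot) == 0:  # 0 means already taken
        slot += 1
    return slot

# server = redis.Redis()
# claim_slot(server)  # first caller gets 1, the second gets 2, ...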
def __init__(self, spider_name, spider_num=psutil.cpu_count(logical=True),
             write_asyn=True):
    self.write_asyn = write_asyn
    self.spider_name = spider_name
    self.spider_num = spider_num
    self.start_urls_redis_key = "%(name)s:start_urls" % {"name": self.spider_name}
    self.items_redis_key = "%(name)s:items" % {"name": self.spider_name}
    self.setting = get_project_settings()
    self.logger = self.get_loger()
    # Connect via the project settings; the original immediately clobbered
    # this with a hard-coded redis.Redis(host='192.168.0.117', ...) left
    # over from debugging.
    self.redis = get_redis_from_settings(self.setting)
    self.logger.info(self.redis)
def __init__(self, settings):
    super(CookieMiddleware, self).__init__(settings)
    self.site = settings.get('SITE', None)
    if not self.site:
        raise CrwyScrapyPlugsException('SITE_NOT_SET')
    self.server = get_redis_from_settings(settings)
    self.h = RedisHash(
        'cookie_pool:{}'.format(self.site),
        server=self.server
    )
def __init__(self, crawler):
    self.crawler = crawler
    self.setting = crawler.settings
    self.spider = crawler.spider
    self.spider_name = self.spider.name
    self.http_proxies_queue_redis_key = self.setting.get(
        "HTTP_PROXIES_QUEUE_REDIS_KEY",
        "%(name)s:http_proxies_queue") % {"name": self.spider_name}
    self.logger.info(self.http_proxies_queue_redis_key)
    self.user_agent = UserAgent()
    self.redis = get_redis_from_settings(self.setting)
    self.current_proxy = self.get_new_proxy()
def from_settings(cls, settings):
    key = 'isbnfilter:%(timestamp)s' % {'timestamp': int(time.time())}
    server = get_redis_from_settings(settings)
    mysql = {
        'host': settings.get('MYSQL_HOST'),
        'user': settings.get('MYSQL_USER'),
        'passwd': settings.get('MYSQL_PASSWD'),
        'db': settings.get('MYSQL_DB'),
        'table': settings.get('MYSQL_TABLE'),
    }
    return cls(server=server, key=key, mysql=mysql)
def from_settings(cls, settings):
    server = get_redis_from_settings(settings)
    debug = settings.getbool('DUPEFILTER_DEBUG')
    bot_name = settings.get('BOT_NAME')
    spider_name = settings.get('SPIDER_NAME')
    duperliter_delay_day = settings.getint('DUPEFILTER_DELAY_DAY', 0)
    do_hash = settings.getbool('DUPEFILTER_DO_HASH', True)
    if not spider_name:
        raise NotConfigured('%s - "SPIDER_NAME" is not found.' % cls.__name__)
    return cls(debug=debug, server=server, bot_name=bot_name,
               spider_name=spider_name,
               duperliter_delay_day=duperliter_delay_day,
               do_hash=do_hash)
def __init__(self, redis_key, start_urls_num_redis_key, interval=1,
             bar_name=None):
    threading.Thread.__init__(self)
    self.start_urls_num_redis_key = start_urls_num_redis_key
    self.daemon = True  # setDaemon() is deprecated; use the attribute
    self.setting = get_project_settings()
    self.redis = get_redis_from_settings(self.setting)
    self.redis_key = redis_key
    self.total = int(self.redis.get(self.start_urls_num_redis_key))
    self.interval = interval
    self.bar_name = bar_name or self.redis_key
    self.stop = False
def __init__(self, redis_key, interval=1, bar_name=None):
    threading.Thread.__init__(self)
    self.daemon = True  # setDaemon() is deprecated; use the attribute
    self.setting = get_project_settings()
    self.redis = get_redis_from_settings(self.setting)
    self.redis_key = redis_key
    self.total = self.redis.scard(self.redis_key)
    self.interval = interval
    self.bar_name = bar_name or self.redis_key
    self.stop = False
def __init__(self, redis_key, interval=1, bar_name=None, buffer_size=512):
    threading.Thread.__init__(self)
    self.stop = False
    self.interval = interval
    self.buffer_size = buffer_size
    self.counter = 0
    self.setting = get_project_settings()
    self.redis = get_redis_from_settings(self.setting)
    self.redis_key = redis_key
    self.total = self.redis.llen(self.redis_key)
    self.bar_name = bar_name or self.redis_key
def __init__(self, redis_key, bar_name=None, buffer_size=512, show_pbar=True,
             stop_epoch=12*30, distinct_field=None):
    threading.Thread.__init__(self)
    self.distinct_field = distinct_field
    self.show_pbar = show_pbar
    self.stop = False
    self.stop_epoch = stop_epoch
    self.buffer_size = buffer_size
    self.counter = 0
    self.setting = get_project_settings()
    self.redis = get_redis_from_settings(self.setting)
    self.redis_key = redis_key
    self.total = self.redis.llen(self.redis_key)
    self.bar_name = bar_name or self.redis_key
    self.distinct_set = set()
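# A hedged usage sketch for the monitor threads above: construct one next
# to the crawl, start it, and flip its `stop` flag when the crawl ends.
# The class name ItemsMonitor and the key 'books:items' are illustrative
# assumptions, not names taken from the snippets.
#
#   monitor = ItemsMonitor(redis_key='books:items', buffer_size=512)
#   monitor.start()
#   ...                  # run the crawl
#   monitor.stop = True
#   monitor.join()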
def __init__(self, spider_name, spider_num=psutil.cpu_count(logical=True),
             start_id=0):
    self.spider_name = spider_name
    self.spider_num = spider_num
    self.setting = get_project_settings()
    configure_logging(self.setting)
    self.start_urls_redis_key = self.setting.get(
        "START_URLS_KEY", "%(name)s:start_urls") % {"name": self.spider_name}
    self.items_redis_key = self.setting.get(
        "RESULT_ITEMS_REDIS_KEY", "%(name)s:items") % {"name": self.spider_name}
    self.start_urls_num_redis_key = self.setting.get(
        "START_URLS_NUM_KEY", "%(name)s:start_urls_num") % {"name": self.spider_name}
    self.http_proxies_queue_redis_key = self.setting.get(
        "HTTP_PROXIES_QUEUE_REDIS_KEY", "%(name)s:http_proxies_queue") % {"name": self.spider_name}
    self.dupefilter_redis_key = self.setting.get(
        "SCHEDULER_DUPEFILTER_KEY", "%(spider)s:dupefilter") % {"spider": self.spider_name}
    self.logger = logging.getLogger(__name__)
    self.redis = get_redis_from_settings(self.setting)
    self.logger.info(self.redis)
    self.start_id = start_id
    # Valid range: start_id >= 0 and start_id + spider_num <= 237.
    if not (self.start_id >= 0 and self.start_id + self.spider_num <= 237):
        raise InterruptedError("invalid start_id / spider_num combination")
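# For a spider named 'books' (an illustrative assumption), the key
# templates above resolve to:
#
#   books:start_urls            pending start requests
#   books:items                 scraped items
#   books:start_urls_num        expected total, used for progress reporting
#   books:http_proxies_queue    shared proxy pool
#   books:dupefilter            request-fingerprint set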
def from_settings(cls, settings):
    """Returns an instance from given settings.

    This uses by default the key ``dupefilter:<timestamp>``. When using the
    ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
    it needs to pass the spider name in the key.

    Parameters
    ----------
    settings : scrapy.settings.Settings

    Returns
    -------
    RFPDupeFilter
        A RFPDupeFilter instance.

    """
    server = get_redis_from_settings(settings)
    # XXX: This creates a one-time key, needed to support using this class
    # as a standalone dupefilter with Scrapy's default scheduler; if Scrapy
    # passed the spider to open() this wouldn't be needed.
    # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
    key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())}
    debug = settings.getbool('DUPEFILTER_DEBUG')
    mysql = {
        'host': settings.get('MYSQL_HOST'),
        'user': settings.get('MYSQL_USER'),
        'passwd': settings.get('MYSQL_PASSWD'),
        'db': settings.get('MYSQL_DB'),
        'table': settings.get('MYSQL_TABLE'),  # was `setting.get`, a NameError
    }
    return cls(server, key=key, debug=debug, mysql=mysql)
def spider_opened(self, spider):
    logger.info("opened spider %s; continuous idle limit: %d",
                spider.name, self.idle_number)
    self.redis = get_redis_from_settings(self.settings)
    self.redis_key = self.settings.get(
        'RESULT_ITEMS_REDIS_KEY', '%(name)s:items') % {"name": spider.name}
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

import base64
import random

from scrapy import signals
from scrapy.utils.project import get_project_settings
from scrapy_redis.connection import get_redis_from_settings

redis_cli = get_redis_from_settings(get_project_settings())


class NewsSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None
def from_settings(cls, settings):
    server = get_redis_from_settings(settings)
    key = settings.get('DUPEFILTER_KEY')
    debug = settings.getbool('DUPEFILTER_DEBUG')
    return cls(server, key=key, debug=debug)
def from_spider(cls, spider):
    settings = spider.settings
    server = get_redis_from_settings(settings)
    key = spider.name
    debug = settings.getbool('DUPEFILTER_DEBUG')
    return cls(server, key=key, debug=debug)