@classmethod
def from_settings(cls, settings, spidername):
    server = redis.Redis(host=settings.get('REDIS_HOST'),
                         port=settings.get('REDIS_PORT'))
    persist = settings.get('SCHEDULER_PERSIST', True)
    up_int = settings.get('SCHEDULER_QUEUE_REFRESH', 10)
    hits = settings.get('QUEUE_HITS', 10)
    window = settings.get('QUEUE_WINDOW', 60)
    mod = settings.get('QUEUE_MODERATED', False)
    timeout = settings.get('DUPEFILTER_TIMEOUT', 600)
    ip_refresh = settings.get('SCHEDULER_IP_REFRESH', 60)
    add_type = settings.get('SCHEDULER_TYPE_ENABLED', False)
    add_ip = settings.get('SCHEDULER_IP_ENABLED', False)
    retries = settings.get('SCHEDULER_ITEM_RETRIES', 3)
    ip_regex = settings.get('IP_ADDR_REGEX', '.*')

    my_level = settings.get('SC_LOG_LEVEL', 'DEBUG')
    my_name = "%s_%s" % (spidername, get_raspberrypi_ip_address())
    my_output = settings.get('SC_LOG_STDOUT', False)
    my_json = settings.get('SC_LOG_JSON', True)
    my_dir = settings.get('SC_LOG_DIR', 'logs')
    my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB')
    my_file = "%s_%s.log" % (spidername, get_raspberrypi_ip_address())
    my_backups = settings.get('SC_LOG_BACKUPS', 5)

    logger = CustomLogFactory.get_instance(json=my_json,
                                           name=my_name,
                                           stdout=my_output,
                                           level=my_level,
                                           dir=my_dir,
                                           file=my_file,
                                           bytes=my_bytes,
                                           backups=my_backups)

    return cls(server, persist, up_int, timeout, retries, logger,
               hits, window, mod, ip_refresh, add_type, add_ip, ip_regex)
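# A hedged example of the knobs from_settings() reads, as they might
# appear in a Scrapy settings.py. The keys mirror the lookups above;
# the values are illustrative assumptions, not this project's shipped
# defaults.
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
SCHEDULER_PERSIST = True        # keep queues in Redis between runs
SCHEDULER_QUEUE_REFRESH = 10    # seconds between queue re-checks
QUEUE_HITS = 10                 # requests allowed per QUEUE_WINDOW
QUEUE_WINDOW = 60               # throttle window in seconds
DUPEFILTER_TIMEOUT = 600        # seconds before duplicate keys expire
SCHEDULER_IP_REFRESH = 60       # seconds between public-ip re-checks
SC_LOG_LEVEL = 'INFO'
SC_LOG_DIR = 'logs'
SC_LOG_MAX_BYTES = '10MB'
SC_LOG_BACKUPS = 5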
@classmethod
def from_crawler(cls, crawler):
    settings = crawler.settings
    my_level = settings.get('SC_LOG_LEVEL', 'DEBUG')
    my_name = "%s_%s" % (crawler.spidercls.name, get_raspberrypi_ip_address())
    my_output = settings.get('SC_LOG_STDOUT', False)
    my_json = settings.get('SC_LOG_JSON', True)
    my_dir = settings.get('SC_LOG_DIR', 'logs')
    my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB')
    my_file = "%s_%s.log" % (crawler.spidercls.name, get_raspberrypi_ip_address())
    my_backups = settings.get('SC_LOG_BACKUPS', 5)

    cls.logger = LogFactory.get_instance(json=my_json,
                                         name=my_name,
                                         stdout=my_output,
                                         level=my_level,
                                         dir=my_dir,
                                         file=my_file,
                                         bytes=my_bytes,
                                         backups=my_backups)

    return cls(crawler.settings)
@classmethod
def setup_logger(cls, settings, spidername):
    my_level = settings.get('SC_LOG_LEVEL', 'DEBUG')
    my_name = "%s_%s" % (spidername, get_raspberrypi_ip_address())
    my_output = settings.get('SC_LOG_STDOUT', False)
    my_json = settings.get('SC_LOG_JSON', True)
    my_dir = settings.get('SC_LOG_DIR', 'logs')
    my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB')
    my_file = "%s_%s.log" % (spidername, get_raspberrypi_ip_address())
    my_backups = settings.get('SC_LOG_BACKUPS', 5)

    logger = CustomLogFactory.get_instance(json=my_json,
                                           name=my_name,
                                           stdout=my_output,
                                           level=my_level,
                                           dir=my_dir,
                                           file=my_file,
                                           bytes=my_bytes,
                                           backups=my_backups)

    return logger
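# Hedged illustration of the names built above: with spidername 'link'
# (a made-up example spider) and a Pi at 192.168.1.20, the logger name
# and log file are keyed per-host, so several Pis running the same
# spider can log side by side in the shared SC_LOG_DIR.
spidername, pi_ip = 'link', '192.168.1.20'   # example values only
assert "%s_%s" % (spidername, pi_ip) == 'link_192.168.1.20'
assert "%s_%s.log" % (spidername, pi_ip) == 'link_192.168.1.20.log'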
def setup(self, settings):
    '''
    Does the actual setup of the middleware
    '''
    # set up the default sc logger
    my_level = settings.get('SC_LOG_LEVEL', 'INFO')
    my_name = "%s_%s" % (settings['SPIDER_NAME'], get_raspberrypi_ip_address())
    my_output = settings.get('SC_LOG_STDOUT', False)
    my_json = settings.get('SC_LOG_JSON', True)
    my_dir = settings.get('SC_LOG_DIR', 'logs')
    my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB')
    my_file = "%s_%s.log" % (settings['SPIDER_NAME'], get_raspberrypi_ip_address())
    my_backups = settings.get('SC_LOG_BACKUPS', 5)

    self.logger = CustomLogFactory.get_instance(json=my_json,
                                                name=my_name,
                                                stdout=my_output,
                                                level=my_level,
                                                dir=my_dir,
                                                file=my_file,
                                                bytes=my_bytes,
                                                backups=my_backups)

    self.retry_http_codes = set(int(x) for x in
                                settings.getlist('RETRY_HTTP_CODES'))

    # stats setup
    self.stats_dict = {}
    self.settings = settings
    self.name = self.settings['SPIDER_NAME']
    if self.settings['STATS_STATUS_CODES']:
        self.redis_conn = redis.Redis(host=self.settings.get('REDIS_HOST'),
                                      port=self.settings.get('REDIS_PORT'))
        self._setup_stats_status_codes()
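def _setup_stats_status_codes(self):
    '''
    A minimal sketch of the stats hook invoked in setup() above -- an
    assumption about its behavior, not the project's actual
    implementation. It maps each retryable status code to a Redis key
    so responses can be counted per spider and per status code.
    '''
    self.stats_dict['status_codes'] = {}
    for code in self.retry_http_codes:
        key = 'stats:crawler:{name}:{code}'.format(name=self.name, code=code)
        self.stats_dict['status_codes'][code] = key
        # plain Redis counter, created if absent; incremented elsewhere
        # whenever a response with this status code is seen
        self.redis_conn.setnx(key, 0)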
def __init__(self, settings):
    # timestamp of the last direct (proxy-less) connection
    self.last_no_proxy_time = datetime.now()
    # switch back to a direct connection after this many minutes,
    # since going through a proxy slows the crawl down
    self.recover_interval = 20
    # if a proxy keeps timing out before its use count reaches this
    # number, remove it permanently; 0 means never modify the proxy file
    self.dump_count_threshold = 20
    # file holding the proxy list, one proxy per line as ip:port
    # (no http:// prefix); the middleware rewrites it, so keep a backup
    self.proxy_file = "%s_proxyes.list" % get_raspberrypi_ip_address()
    # whether to invalidate a proxy when it times out
    self.invalid_proxy_flag = True
    # when fewer than this many valid proxies remain (direct connection
    # included), fetch new ones from the web. Set it high enough that each
    # IP gets a long enough rest after being asked for a captcha: with ten
    # usable proxies in rotation, several minutes pass before an IP's next
    # turn, so it can serve some requests captcha-free. Too small a value
    # (say two) means IP A gets banned after a few pages, then B does too,
    # leaving the whole crawler busy-waiting and hurting throughput.
    self.extend_proxy_threshold = 10
    # initialize the proxy list; trusted proxies could be seeded here, e.g.
    # [{"proxy": None, "valid": True, "count": 0},
    #  {"proxy": "http://10.10.2.58:6666", "valid": True, "count": 0}]
    self.proxyes = [{"proxy": None, "valid": True, "count": 0}]
    # start with proxy 0, i.e. no proxy
    self.proxy_index = 0
    # number of trusted proxies (e.g. self-hosted HTTP proxies),
    # plus 1 for the direct connection
    self.fixed_proxy = len(self.proxyes)
    # time of the last proxy fetch
    self.last_fetch_proxy_time = datetime.now()
    # force-fetch new proxies at this fixed interval (minutes)
    self.fetch_proxy_interval = 120
    # a proxy about to be marked invalid is spared if it has already
    # successfully fetched more pages than this threshold
    self.invalid_proxy_threshold = 200

    if not os.path.exists(self.proxy_file):
        open(self.proxy_file, "w").close()

    # load the initial proxies from the file
    with open(self.proxy_file, "r") as fd:
        lines = fd.readlines()
        shuffle(lines)
        for line in lines:
            line = line.strip()
            if not line or self.url_in_proxyes("http://" + line):
                continue
            self.proxyes.append({"proxy": "http://" + line,
                                 "valid": True,
                                 "count": 0})
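def url_in_proxyes(self, url):
    '''
    Helper referenced in __init__ above but not shown in this section.
    A minimal sketch inferred from its call site: it answers whether a
    proxy url is already in self.proxyes, so reloading the proxy file
    does not create duplicate entries.
    '''
    for p in self.proxyes:
        if url == p["proxy"]:
            return True
    return False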
def update_ipaddress(self):
    '''
    Updates the scheduler so it knows its own ip address
    '''
    # fall back to the local ip in case of an exception below
    self.old_ip = self.my_ip
    self.my_ip = get_raspberrypi_ip_address()
    try:
        obj = urllib2.urlopen(settings.get('PUBLIC_IP_URL',
                                           'http://ip.42.pl/raw'))
        results = self.ip_regex.findall(obj.read())
        if len(results) > 0:
            self.my_ip = results[0]
        else:
            raise IOError("Could not get valid IP Address")
        obj.close()
        self.logger.debug("Current public ip: {ip}".format(ip=self.my_ip))
    except IOError:
        self.logger.error("Could not reach out to get public ip")

    if self.old_ip != self.my_ip:
        self.logger.info("Changed Public IP: {old} -> {new}".format(
            old=self.old_ip, new=self.my_ip))
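# get_raspberrypi_ip_address() is used throughout this module but not
# defined in this section. A hedged sketch of one common way to obtain
# the Pi's LAN address -- an assumption, not the project's actual
# helper: opening a UDP socket toward a public resolver selects the
# outbound interface without sending any traffic.
def get_raspberrypi_ip_address():
    import socket
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        # no packets are sent for UDP connect(); it only binds a route
        s.connect(("8.8.8.8", 80))
        return s.getsockname()[0]
    finally:
        s.close()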