Example #1
 def __init__(self, settings, crawler):
     RetryMiddleware.__init__(self, settings)
     self.redis = connection.from_settings(settings)
     # Ensure the connection is working.
     self.redis.ping()
     #self.redis = settings.get("RCONN", redis.Redis(crawler.settings.get('REDIS_HOST', 'localhsot'), crawler.settings.get('REDIS_PORT', 6379)))
     initCookie(self.redis, crawler.spider.name)
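Most of these examples take both settings and crawler, which implies a from_crawler classmethod and a DOWNLOADER_MIDDLEWARES entry that the snippets do not show. A minimal sketch of that wiring, assuming a hypothetical CookieRetryMiddleware subclass and module path:

from scrapy.downloadermiddlewares.retry import RetryMiddleware

class CookieRetryMiddleware(RetryMiddleware):  # hypothetical name
    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this with the crawler; forward its settings plus the crawler
        # itself, matching the (settings, crawler) constructors in these examples.
        return cls(crawler.settings, crawler)

# settings.py: disable the built-in retry middleware and register the subclass instead.
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
    'myproject.middlewares.CookieRetryMiddleware': 550,  # hypothetical module path
}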
Example #2
 def __init__(self, settings, crawler):
     RetryMiddleware.__init__(self, settings)
     self.rconn = settings.get(
         'RCONN',
         redis.Redis(crawler.settings.get('REDIS_HOST', '192.168.195.1'),
                     crawler.settings.get('REDIS_PORT', 6379)))
     initCookie(self.rconn, crawler.spider.name)
Example #3
 def __init__(self, settings, crawler):
     RetryMiddleware.__init__(self, settings)
     self.rconn = settings.get(
         "RCONN",
         redis.Redis(crawler.settings.get('REDIS_HOST', 'localhost'),
                     crawler.settings.get('REDIS_PORT', 6379)))
     initCookie(self.rconn, crawler.spider.name)
Example #4
 def __init__(self, settings, crawler):
     RetryMiddleware.__init__(self, settings)
     #self.rconn = redis.from_url(settings['REDIS_URL'], db=1, decode_responses=True)  ## decode_responses returns values as str
     # Log in once to obtain the initial cookies
     res = QichabaoCookie().init_cookie()
     # Store them in Redis
     PyRedis().get_redis().set("qichabao:Cookies",
                               json.dumps(res, ensure_ascii=False))
Example #5
 def __init__(self, settings, crawler):
     # IPs we obtained ourselves
     self.TIMES = 10
     RetryMiddleware.__init__(self, settings)
     self.rconn = settings.get(
         "RCONN",
         redis.Redis(crawler.settings.get('REDIS_HOST', 'localhost'),
                     crawler.settings.get('REDIS_PORT', 6379)))
Example #6
 def __init__(self, settings, crawler):
     RetryMiddleware.__init__(self, settings)
     self.rconn = settings.get(
         "RCONN",
         redis.Redis(host=crawler.settings.get('REDIS_HOST', 'localhost'),
                     port=crawler.settings.get('REDIS_PORT', 6379),
                     password=crawler.settings.get('REDIS_PASSWORD', '')))
     initCookie(self.rconn, crawler.spider.name)
Example #7
 def __init__(self, settings, crawler):
     # Initialize the parent class
     RetryMiddleware.__init__(self, settings)
     # decode_responses makes Redis return str instead of bytes
     # settings['REDIS_URL'] reads the Scrapy settings
     self.redis_connection = redis.from_url(settings['REDIS_URL'],
                                            db=14,
                                            decode_responses=True)
     # Add the cookies into Redis. The second argument is how the spider name is obtained (it's really just a dict!)
     init_cookie(self.redis_connection, crawler.spider.name)
Example #8
 def __init__(self, settings, crawler):
     RetryMiddleware.__init__(self, settings)
     # Redis connection used for cookies
     self.rconn = settings.get(
         "RCONN",
         redis.Redis(settings.get('COOKIE_REDIS_HOST', 'localhost'),
                     settings.get('COOKIE_REDIS_PORT', 6379),
                     settings.get('COOKIE_REDIS_DB', 2)))
     # Initialize all cookies and store them in Redis
     init_cookie(self.rconn, crawler.spider.name)
Example #9
 def __init__(self, settings, crawler):
     RetryMiddleware.__init__(self, settings)
     # xiaoman
     self.rconn = redis.from_url(settings['REDIS_URL'],
                                 db=1,
                                 decode_responses=True)
     # weixin
     # self.rconn = redis.from_url(settings['REDIS_URL'], db=4, decode_responses=True)
     self.cookie_tool = CookieTool()
     self.cookie_tool.init_cookie(self.rconn, crawler.spider.name)
Example #10
 def __init__(self, settings, crawler):
     RetryMiddleware.__init__(self, settings)
     REDIS_HOST = settings.get('REDIS_HOST')
     REDIS_PORT = settings.get('REDIS_PORT')
     REDIS_DB = settings.get('COOKIES_DB')
     self.r = redis.Redis(
         host=REDIS_HOST,
         port=REDIS_PORT,
         db=REDIS_DB,
         decode_responses=True)  # decode_responses returns values as str
     self.init_cookie(crawler.spider.name)
Example #11
 def __init__(self, settings, crawler):
     RetryMiddleware.__init__(self, settings)
     self.rconn = settings.get(
         "RCONN",
         redis.Redis(crawler.settings.get('REDIS_HOST', 'localhost'),
                     crawler.settings.get('REDIS_PORT', 6379), 2))
     if crawler.spider.name not in ["SoGouSpider", "QxjSpider"]:
         # Weibo cookies are managed per spider.name; for consistency, always use "SinaSpider" here
         name = "SinaSpider"
         # name = crawler.spider.name
         initCookie(self.rconn, name)
Example #12
 def __init__(self, settings, crawler):
     self.logger = logging.getLogger("---Cookies池---")
     RetryMiddleware.__init__(self, settings)
     # Initialize cookies via a simulated login; if this is commented out, cookies must be written to the database manually
     # Write format: username (string), cookies (JSON string)
     # | username | cookies |
     # | "*****@*****.**" | "{"SSOLoginState": "1570157316", "SUB": "_2A25wksNUDeRhGedI7lER9i_Jzj6IHXVQfO0crDV6PUJbktANLW7HkW1NVzqyC0ntAqR8szHeQCefNRM41xZZJ3YI"}" |
     # init_cookies()
     client = pymongo.MongoClient(MONGO_HOST, MONGO_PORT)
     db = client[MONGO_DB_NAME]
     col = db[COOKIES_COLLECTION_NAME]
     self.cookies_pool = []
     for item in col.find():
         self.cookies_pool.append(json.loads(item['cookies']))
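Example #12 only loads self.cookies_pool from MongoDB; how the pool is consumed is not shown. One common pattern is to attach a random cookie set in process_request, as in this sketch (the method is an assumption, not part of the original, and the middleware must be ordered before Scrapy's built-in CookiesMiddleware, priority < 700, for request.cookies to be honored):

import random

def process_request(self, request, spider):
    # Hypothetical: pick a random cookie set from the pool built in __init__.
    if self.cookies_pool:
        request.cookies = random.choice(self.cookies_pool)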
Example #13
    def __init__(self, settings):
        RetryMiddleware.__init__(self, settings)
        self.proxy_list = settings.get('PROXY_LIST')

        self.proxies = {}
        for line in self.proxy_list:
            parts = re.match(r'(\w+://)(\w+:\w+@)?(.+)', line)

            # Cut trailing @
            if parts.group(2):
                user_pass = parts.group(2)[:-1]
            else:
                user_pass = ''

            self.proxies[parts.group(1) + parts.group(3)] = user_pass
Example #14
 def process_exception(self, request, exception, spider):
     to_return = RetryMiddleware.process_exception(self, request, exception,
                                                   spider)
     # customize retry middleware by modifying this
     request.meta['url'] = request.url
     self.record_failed('failed.txt', request, exception, 'url')
     return to_return
Example #15
    def __init__(self, settings):
        RetryMiddleware.__init__(self, settings)
        self.proxy_list = settings.get('PROXY_LIST')
        fin = open(self.proxy_list)

        self.proxies = {}
        for line in fin.readlines():
            parts = re.match(r'(\w+://)(\w+:\w+@)?(.+)', line)

            # Cut trailing @
            if parts.group(2):
                user_pass = parts.group(2)[:-1]
            else:
                user_pass = ''

            self.proxies[parts.group(1) + parts.group(3)] = user_pass

        fin.close()
Example #16
    def __init__(self, settings):
        RetryMiddleware.__init__(self, settings)

        self.mode = settings.get('PROXY_MODE')
        self.proxy_list = settings.get('PROXY_LIST')
        self.chosen_proxy = ''

        if self.mode == Mode.RANDOMIZE_PROXY_EVERY_REQUESTS or self.mode == Mode.RANDOMIZE_PROXY_ONCE:
            if self.proxy_list is None:
                raise KeyError('PROXY_LIST setting is missing')
            self.proxies = {}
            fin = open(self.proxy_list)
            try:
                for line in fin.readlines():
                    parts = re.match(r'(\w+://)(\w+:\w+@)?(.+)', line.strip())
                    if not parts:
                        continue

                    # Cut trailing @
                    if parts.group(2):
                        user_pass = parts.group(2)[:-1]
                    else:
                        user_pass = ''

                    self.proxies[parts.group(1) + parts.group(3)] = user_pass
            finally:
                fin.close()
            if self.mode == Mode.RANDOMIZE_PROXY_ONCE:
                self.chosen_proxy = random.choice(list(self.proxies.keys()))
        elif self.mode == Mode.SET_CUSTOM_PROXY:
            custom_proxy = settings.get('CUSTOM_PROXY')
            self.proxies = {}
            parts = re.match(r'(\w+://)([^:]+?:[^@]+?@)?(.+)', custom_proxy.strip())
            if not parts:
                raise ValueError('CUSTOM_PROXY is not well formatted')

            if parts.group(2):
                user_pass = parts.group(2)[:-1]
            else:
                user_pass = ''

            self.proxies[parts.group(1) + parts.group(3)] = user_pass
            self.chosen_proxy = parts.group(1) + parts.group(3)
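Examples #13, #15 and #16 only parse PROXY_LIST into self.proxies (plus self.chosen_proxy in #16); none of them shows where the proxy is applied. A minimal sketch of the usual companion process_request, following the scrapy-proxies pattern; the method body is an assumption, not taken from the examples:

import base64
import random

def process_request(self, request, spider):
    # Use the proxy fixed at start-up if there is one, otherwise pick one at random.
    proxy_address = self.chosen_proxy or random.choice(list(self.proxies.keys()))
    user_pass = self.proxies[proxy_address]

    request.meta['proxy'] = proxy_address
    if user_pass:
        # Credentials were split off the URL during parsing; re-attach them as Basic auth.
        encoded = base64.b64encode(user_pass.encode()).decode()
        request.headers['Proxy-Authorization'] = 'Basic ' + encoded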
Example #17
 def _retry(self, request, reason, spider):
     log.msg('Changing proxy')
     tn = telnetlib.Telnet('127.0.0.1', 9051)
     tn.read_until("Escape character is '^]'.", 2)
     tn.write('AUTHENTICATE "267765"\r\n')
     tn.read_until("250 OK", 2)
     tn.write("signal NEWNYM\r\n")
     tn.read_until("250 OK", 2)
     tn.write("quit\r\n")
     tn.close()
     time.sleep(3)
     log.msg('Proxy changed')
     return RetryMiddleware._retry(self, request, reason, spider)
Example #18
 def _retry(self, request, reason, spider):
     log.msg('Changing proxy')
     tn = telnetlib.Telnet('127.0.0.1', 9051)
     tn.read_until("Escape character is '^]'.", 2)
     tn.write('AUTHENTICATE "267765"\r\n')
     tn.read_until("250 OK", 2)
     tn.write("signal NEWNYM\r\n")
     tn.read_until("250 OK", 2)
     tn.write("quit\r\n")
     tn.close()
     time.sleep(3)
     log.msg('Proxy changed')
     return RetryMiddleware._retry(self, request, reason, spider)
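Both Tor examples above pass str to telnetlib, which only works on Python 2; on Python 3 the control-port exchange must use bytes. A minimal Python 3 sketch of the same NEWNYM signal, reusing the host, port and password hard-coded above (note that telnetlib itself was removed in Python 3.13, where stem's Controller is the usual replacement):

import telnetlib
import time

def renew_tor_identity(host='127.0.0.1', port=9051, password=b'267765'):
    # Ask the Tor control port for a fresh circuit; telnetlib needs bytes on Python 3.
    tn = telnetlib.Telnet(host, port)
    tn.write(b'AUTHENTICATE "' + password + b'"\r\n')
    tn.read_until(b'250 OK', 2)
    tn.write(b'SIGNAL NEWNYM\r\n')
    tn.read_until(b'250 OK', 2)
    tn.write(b'QUIT\r\n')
    tn.close()
    time.sleep(3)  # give Tor a moment to build the new circuit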
Example #19
 def __init__(self, settings):
     RetryMiddleware.__init__(self, settings)
Example #20
 def __init__(self, settings):
     RetryMiddleware.__init__(self, settings)
Example #21
 def _retry(self, request, reason, spider):
     print('start RetryChangeProxyMiddleware:')
     os.system('/usr/local/bin/nym.sh')
     time.sleep(3)
     #        print 'ret RetryChangeProxyMiddleware:'
     return RetryMiddleware._retry(self, request, reason, spider)
Example #22
 def __init__(self, settings, crawler):
     RetryMiddleware.__init__(self, settings)
     self.rconn = settings.get("RCONN", redis.Redis("localhost", 6379))
     initCookie(self.rconn, crawler.spider.name)
Example #23
 def setUp(self):
     self.crawler = get_crawler(Spider)
     self.spider = self.crawler._create_spider('foo')
     self.mw = RetryMiddleware.from_crawler(self.crawler)
     self.mw.max_retry_times = 2
     self.invalid_url = 'http://www.scrapytest.org/invalid_url'
Example #24
 def __init__(self, settings):
     RetryMiddleware.__init__(self, settings)
     DownloaderBaseMiddleware.__init__(self, settings)
Example #25
 def __init__(self, settings, crawler):
     RetryMiddleware.__init__(self, settings)
     self.rconn = redis.from_url(settings['REDIS_URL'], db=1, decode_responses=True)  # decode_responses returns values as str
     init_cookie(self.rconn, crawler.spider.name)
Example #26
 def setUp(self):
     crawler = get_crawler(Spider)
     self.spider = crawler._create_spider("foo")
     self.mw = RetryMiddleware.from_crawler(crawler)
     self.mw.max_retry_times = 2
Example #27
 def __init__(self, settings):
     RetryMiddleware.__init__(self, settings)
     self.retry_intervals = settings.getint('RETRY_TIME_INTERVAL')
Example #28
 def setUp(self):
     self.crawler = get_crawler(Spider)
     self.spider = self.crawler._create_spider('foo')
     self.mw = RetryMiddleware.from_crawler(self.crawler)
     self.mw.max_retry_times = 2
Example #29
 def setUp(self):
     self.crawler = get_crawler(Spider)
     self.spider = self.crawler._create_spider('foo')
     self.mw = RetryMiddleware.from_crawler(self.crawler)
     self.mw.max_retry_times = 2
     self.invalid_url = 'http://www.scrapytest.org/invalid_url'
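The setUp fixtures here and in Examples #23, #26, #28 and #40 only build the middleware; none of the snippets shows a test that exercises it. A minimal sketch of one, assuming Scrapy's default RETRY_HTTP_CODES (which include 503); the method name and assertions are not from the original examples:

from scrapy.http import Request, Response

def test_503_is_retried(self):
    req = Request(self.invalid_url)
    rsp = Response(self.invalid_url, status=503)

    # The first two 503s yield retry requests with an incremented retry_times meta key.
    retry1 = self.mw.process_response(req, rsp, self.spider)
    assert retry1.meta['retry_times'] == 1
    retry2 = self.mw.process_response(retry1, rsp, self.spider)
    assert retry2.meta['retry_times'] == 2

    # With max_retry_times = 2 the third attempt gives up and returns the response as-is.
    assert self.mw.process_response(retry2, rsp, self.spider) is rsp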
Example #30
 def _retry(self, request, reason, spider):
     print('start RetryChangeProxyMiddleware:')
     os.system('/usr/local/bin/nym.sh')
     time.sleep(3)
     #        print 'ret RetryChangeProxyMiddleware:'
     return RetryMiddleware._retry(self, request, reason, spider)
Example #31
 def __init__(self, settings):
     RetryMiddleware.__init__(self, settings)
     self.rconn = redis.Redis(settings.get('REDIS_HOST', 'localhost'), settings.get('REDIS_PORT', 6379))
     self.cookiemanager = CookiesManager()
     self.cookiemanager.init_all_cookies(self.rconn)
Example #32
 def __init__(self, crawler):
     RetryMiddleware.__init__(self, crawler.settings)
     RandomUserAgentBase.__init__(self, crawler)
Example #33
 def __init__(self, settings):
     RetryMiddleware.__init__(self, settings)
     self.redis_conn = redis.Redis(host=settings.get('REDIS_HOST'),
                                   port=settings.get('REDIS_PORT'))
Example #34
 def __init__(self, settings, crawler):
     RetryMiddleware.__init__(self, settings)
Example #35
 def _retry(self, request, reason, spider):
     log.msg('Changing proxy')
     request.meta['proxy'] = settings.get('HTTP_PROXY')
     return RetryMiddleware._retry(self, request, reason, spider)
Example #36
 def __init__(self, settings, crawler):
     RetryMiddleware.__init__(self, settings)
     self.rconn = settings.get("RCONN", redis.Redis("localhost", 6379))
     initCookie(self.rconn, crawler.spider.name)
Example #37
 def get_spider_and_middleware(self, settings=None):
     crawler = get_crawler(Spider, settings or {})
     spider = crawler._create_spider('foo')
     middleware = RetryMiddleware.from_crawler(crawler)
     return spider, middleware
Example #38
 def _retry(self, request, reason, spider):
     # log.msg('Changing proxy')
     request.meta['proxy'] = settings.get('HTTP_PROXY')
     return RetryMiddleware._retry(self, request, reason, spider)
Example #39
 def __init__(self, settings, crawler):
     RetryMiddleware.__init__(self, settings)
     self.rconn = settings.get("RCONN", redis.Redis(crawler.settings.get('REDIS_HOST', 'localhsot'), crawler.settings.get('REDIS_PORT', 6379)))
     initCookie(self.rconn, crawler.spider.name)
Example #40
 def setUp(self):
     self.crawler = get_crawler(Spider)
     self.spider = self.crawler._create_spider('foo')
     self.mw = RetryMiddleware.from_crawler(self.crawler)
     self.mw.max_retry_times = 2
Example #41
 def __init__(self, settings):
     RetryMiddleware.__init__(self, settings)
     self.rconn = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=3)
Example #42
    def __init__(self, settings):

        RetryMiddleware.__init__(self, settings)
        self.set_logger(self.crawler)
Example #43
 def _retry(self, request, reason, spider):
     log.msg("Changing proxy")
     request.meta["proxy"] = settings.get("HTTP_PROXY")
     return RetryMiddleware._retry(self, request, reason, spider)