def test_override_default_params(self):
    for key, val in defaults.REDIS_PARAMS.items():
        self.expected_params[key] = self.settings['REDIS_PARAMS'][key] = object()

    server = from_settings(self.settings)
    assert server is self.redis_cls.return_value
    self.redis_cls.assert_called_with(**self.expected_params)

def from_settings(cls, settings):
    server = connection.from_settings(settings)
    # Create a one-time key. This is needed to support using this class as a
    # standalone dupefilter with Scrapy's default scheduler; if Scrapy passed
    # the spider on the open() method, this wouldn't be needed.
    key = "dupefilter:%s" % int(time.time())
    return cls(server, key)

def from_settings(cls, settings):
    kwargs = {
        'persist': settings.getbool('SCHEDULER_PERSIST'),
        'flush_on_start': settings.getbool('SCHEDULER_FLUSH_ON_START'),
        'idle_before_close': settings.getint('SCHEDULER_IDLE_BEFORE_CLOSE'),
    }

    # If these values are missing, it means we want to use the defaults.
    optional = {
        # TODO: Use custom prefixes for these settings to note that they are
        # specific to scrapy-redis.
        'queue_key': 'SCHEDULER_QUEUE_KEY',
        'queue_cls': 'SCHEDULER_QUEUE_CLASS',
        'dupefilter_key': 'SCHEDULER_DUPEFILTER_KEY',
        # We use the default setting name to keep compatibility.
        'dupefilter_cls': 'DUPEFILTER_CLASS',
        'serializer': 'SCHEDULER_SERIALIZER',
    }
    for name, setting_name in optional.items():
        val = settings.get(setting_name)
        if val:
            kwargs[name] = val

    # Support serializer as a path to a module.
    if isinstance(kwargs.get('serializer'), six.string_types):
        kwargs['serializer'] = importlib.import_module(kwargs['serializer'])

    # Redis server connection.
    server = connection.from_settings(settings)
    # Ensure the connection is working.
    server.ping()

    return cls(server=server, settings=settings, **kwargs)

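For context, a minimal sketch of the Scrapy settings this factory reads. The setting names come from the snippet above; every value here is an illustrative assumption, not something the snippet mandates:

# Illustrative values only -- from_settings() above reads these names;
# none of the values are required by the snippet.
SCHEDULER_PERSIST = True                 # keep queue and dupefilter across runs
SCHEDULER_FLUSH_ON_START = False         # do not clear redis keys at startup
SCHEDULER_IDLE_BEFORE_CLOSE = 10         # seconds to block waiting for requests
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
SCHEDULER_SERIALIZER = 'json'            # a module path, imported via importlib
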
class GxzfcgFactorySpider(RedisSpider): name = "gxzfcg_spider" name_pre = 'gxzfcg' redis_server = connection.from_settings(settings) def parse(self, response): node_name_pre = settings['NODE_NAME'] website_pre = '广西壮族自治区政府采购网' level_pre = response.xpath('//*[@id="channelBody"]/div[1]/a[2]/text()').extract()[0] typr_pre = response.xpath('//*[@id="channelBody"]/div[1]/a[3]/text()').extract()[0] ul = response.xpath("//*[@id=\"channelBody\"]/div[2]/ul/li") for li in ul: item = TenderItem() item['node_name'] = node_name_pre item['website'] = website_pre item['level'] = level_pre item['type'] = typr_pre item['title'] = li.xpath("a/@title").extract()[0] item['date'] = li.xpath("span[@class=\"date\"]/text()").extract()[0] item['url'] = 'http://www.gxzfcg.gov.cn' + li.xpath("a/@href").extract()[0] article = newspaper.Article( 'http://www.gxzfcg.gov.cn' + li.xpath("a/@href").extract()[0], language='zh', fetch_images=False) article.download() article.parse() item['content'] = article.text # 生成时间 now_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") item['add_time'] = now_time item['update_time'] = now_time # print(item) yield item
def test_override_default_params(self):
    for key, val in DEFAULT_PARAMS.items():
        self.expected_params[key] = self.settings['REDIS_PARAMS'][key] = object()

    server = from_settings(self.settings)
    assert server is self.redis_cls.return_value
    self.redis_cls.assert_called_with(**self.expected_params)

def setup_redis(self):
    """Setup redis connection and idle signal.

    This should be called after the spider has set its crawler object.
    """
    if not self.redis_key:
        self.redis_key = '%s:start_urls' % self.name
    self.server = connection.from_settings(self.crawler.settings)
    # The idle signal is called when the spider has no requests left;
    # that's when we will schedule new requests from the redis queue.
    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
    self.crawler.signals.connect(self.schedule_rest_requests, signal=signals.item_scraped)
    self.crawler.signals.connect(self.schedule_rest_requests, signal=signals.item_dropped)
    self.crawler.signals.connect(self.schedule_rest_requests, signal=signals.request_scheduled)
    self.crawler.signals.connect(self.schedule_rest_requests, signal=signals.response_received)
    self.crawler.signals.connect(self.schedule_rest_requests, signal=signals.response_downloaded)
    self.crawler.signals.connect(self.__start_loop, signal=signals.spider_opened)
    self.crawler.signals.connect(self.__stop_loop, signal=signals.spider_closed)
    log.msg("Reading URLs from redis list '%s'" % self.redis_key, level=log.INFO)
    self.touch()

def __init__(self, settings, crawler):
    RetryMiddleware.__init__(self, settings)
    self.redis = connection.from_settings(settings)
    # Ensure the connection is working.
    self.redis.ping()
    # self.redis = settings.get("RCONN", redis.Redis(
    #     crawler.settings.get('REDIS_HOST', 'localhost'),
    #     crawler.settings.get('REDIS_PORT', 6379)))
    initCookie(self.redis, crawler.spider.name)

def from_settings(cls, settings):
    kwargs = {
        'persist': settings.getbool('SCHEDULER_PERSIST'),
        'flush_on_start': settings.getbool('SCHEDULER_FLUSH_ON_START'),
        'idle_before_close': settings.getint('SCHEDULER_IDLE_BEFORE_CLOSE'),
    }

    # If these values are missing, it means we want to use the defaults.
    optional = {
        # TODO: Use custom prefixes for these settings to note that they are
        # specific to scrapy-redis.
        'queue_key': 'SCHEDULER_QUEUE_KEY',
        'queue_cls': 'SCHEDULER_QUEUE_CLASS',
        'dupefilter_key': 'SCHEDULER_DUPEFILTER_KEY',
        # We use the default setting name to keep compatibility.
        'dupefilter_cls': 'DUPEFILTER_CLASS',
        'serializer': 'SCHEDULER_SERIALIZER',
    }
    for name, setting_name in optional.items():
        val = settings.get(setting_name)
        if val:
            kwargs[name] = val

    # Support serializer as a path to a module.
    if isinstance(kwargs.get('serializer'), six.string_types):
        kwargs['serializer'] = importlib.import_module(kwargs['serializer'])

    server = connection.from_settings(settings)
    # Ensure the connection is working.
    server.ping()

    return cls(server=server, **kwargs)

def from_settings(cls, settings):
    params = {}
    # Here from_settings refers to the redis connection factory.
    params['client'] = connection.from_settings(settings)
    if settings.get("REDIS_SIMHASH_KEY"):
        params['key'] = settings["REDIS_SIMHASH_KEY"]
    return cls(**params)

def __init__(self):
    settings = get_project_settings()
    self.queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
    self.server = connection.from_settings(settings)
    self.headers = {
        "Host": "www.zhihu.com",
        "Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36",
        "Referer": "http://www.zhihu.com/people/raymond-wang",
        "Accept-Encoding": "gzip,deflate,sdch",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4,zh-TW;q=0.2",
    }
    self.cookies = {
        '_za': r'bda5810c-88f0-40a8-8d2b-d9be0e0c58a9',
        'q_c1': r'28f9a453b53a482486644378553c3a10|1447162001000|1447162001000',
        '_xsrf': r'4307a4b2977f25efbdacbd89edf2e789',
        'cap_id': r'"OThkOGIwMDVkMDllNGZmMzkzN2JkY2MzNzhhMmZjZWQ=|1448186640|774a87a7e0bd5ecec150a0d4bed38b570859c822"',
        'z_c0': r'"QUFBQUF1VWdBQUFYQUFBQVlRSlZUUjBnZVZZM0ptcEVROU9YSzZ3bXpUUEJXQm0zSUkxSFl3PT0=|1448186653|1eb9dfd0eff895cab5c818fd97d103a17d557dfe"',
        'unlock_ticket': r'"QUFBQUF1VWdBQUFYQUFBQVlRSlZUU1dhVVZhcmFDck02VUROeVV3c1oyRHQ1aWduQmVLYWdRPT0=|1448186653|c734f11184740390f0b34536e218952aabdcff46"',
        '__utmt': r'1',
        '__utma': r'51854390.16347795.1448186642.1448186642.1448186642.1',
        '__utmb': r'51854390.18.10.1448186642',
        '__utmc': r'51854390',
        '__utmz': r'51854390.1448186642.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)',
        '__utmv': r'51854390.100-1|2=registration_date=20131118=1^3=entry_date=20131118=1'
    }
    super(ZhihuNotGenRequestSpider, self).__init__()

def test_redis_default(self):
    settings = Settings()

    server = connection.from_settings(settings)
    connect_args = server.connection_pool.connection_kwargs
    self.assertEqual(connect_args['host'], 'localhost')
    self.assertEqual(connect_args['port'], 6379)

def from_settings(cls, settings):
    params = {
        'client': connection.from_settings(settings),
    }
    if settings.get('REDIS_SIMHASH_KEY'):
        params['key'] = settings['REDIS_SIMHASH_KEY']
    return cls(**params)

def from_crawler(cls, crawler):
    settings = crawler.settings
    server = connection.from_settings(settings)
    s = cls(server, settings)
    crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
    crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
    return s

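Scrapy calls from_crawler when a component like this is enabled in the project settings; a hedged sketch of the wiring, where the module path and priority are hypothetical placeholders rather than anything named in the snippet:

# settings.py -- 'myproject.extensions.RedisMonitor' is a hypothetical path
# to the class defining from_crawler() above; 500 is an arbitrary order value.
EXTENSIONS = {
    'myproject.extensions.RedisMonitor': 500,
}
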
def from_crawler(cls, crawler):
    settings = crawler.settings
    slave_key = settings.get('REDIS_START_URLS_KEY')
    master_key = settings.get('REDIS_START_URLS_MASTER_KEY')
    judge_key = settings.get('REDIS_JUDGE_KEY')
    scan_page = settings.get('SCAN_PAGE')
    server = connection.from_settings(settings)
    s = cls(server, slave_key, master_key, judge_key, scan_page)
    return s

def test_redis_host_port_fallback(self):
    settings = Settings(
        dict(REDIS_HOST='baz', REDIS_PORT=1337, REDIS_URL=None))

    server = connection.from_settings(settings)
    connect_args = server.connection_pool.connection_kwargs
    self.assertEqual(connect_args['host'], 'baz')
    self.assertEqual(connect_args['port'], 1337)

def from_settings(cls, settings):
    params = {
        'client': connection.from_settings(settings),
    }
    if settings.get('REDIS_START_URLS_KEY'):
        params['start_url_key'] = settings['REDIS_START_URLS_KEY']
    if settings.get('REDIS_START_URLS_AS_SET'):
        params['start_url_as_set'] = settings['REDIS_START_URLS_AS_SET']
    return cls(**params)

def setup_redis(self, crawler=None):
    """Setup redis connection and idle signal.

    This should be called after the spider has set its crawler object.
    """
    if self.server is not None:
        return

    if crawler is None:
        # We allow an optional crawler argument to keep backwards
        # compatibility.
        # XXX: Raise a deprecation warning.
        crawler = getattr(self, 'crawler', None)

    if crawler is None:
        raise ValueError("crawler is required")

    settings = crawler.settings

    if self.redis_key is None:
        self.redis_key = settings.get(
            'REDIS_START_URLS_KEY', defaults.START_URLS_KEY,
        )

    self.redis_key = self.redis_key % {'name': self.name}

    if not self.redis_key.strip():
        raise ValueError("redis_key must not be empty")

    if self.redis_batch_size is None:
        # TODO: Deprecate this setting (REDIS_START_URLS_BATCH_SIZE).
        self.redis_batch_size = settings.getint(
            'REDIS_START_URLS_BATCH_SIZE',
            settings.getint('CONCURRENT_REQUESTS'),
        )

    try:
        self.redis_batch_size = int(self.redis_batch_size)
    except (TypeError, ValueError):
        raise ValueError("redis_batch_size must be an integer")

    if self.redis_encoding is None:
        self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING)

    self.logger.info(
        "Reading start URLs from redis key '%(redis_key)s' "
        "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s)",
        self.__dict__)

    self.server = connection.from_settings(crawler.settings)
    # The idle signal is called when the spider has no requests left;
    # that's when we will schedule new requests from the redis queue.
    crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)

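Once setup_redis has resolved redis_key, the spider blocks until URLs appear under that key. A minimal seeding sketch with redis-py, assuming the default key pattern '%(name)s:start_urls' from defaults.START_URLS_KEY; the spider name and URL are hypothetical:

import redis

# 'myspider' and the URL are assumptions; only the default key pattern
# '%(name)s:start_urls' comes from the snippet above.
r = redis.StrictRedis(host='localhost', port=6379)
r.lpush('myspider:start_urls', 'http://example.com/page-1')
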
def from_settings(cls, settings):
    params = {
        'server': connection.from_settings(settings),
    }
    if settings.get('REDIS_ITEMS_KEY'):
        params['key'] = settings['REDIS_ITEMS_KEY']
    if settings.get('REDIS_ITEMS_SERIALIZER'):
        params['serialize_func'] = load_object(
            settings['REDIS_ITEMS_SERIALIZER'])
    return cls(**params)

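Both settings read above are optional; a hedged sketch of enabling them, where the values are assumptions (only the setting names and the fact that the serializer is resolved as an object path come from the snippet):

# settings.py -- illustrative values; the pipeline falls back to its own
# defaults when these settings are absent.
REDIS_ITEMS_KEY = '%(spider)s:items'    # redis key the pipeline writes to
REDIS_ITEMS_SERIALIZER = 'json.dumps'   # loaded via load_object() above
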
def test_redis_host_port(self):
    settings = Settings({
        'REDIS_HOST': 'localhost',
        'REDIS_PORT': 9001,
    })

    server = connection.from_settings(settings)
    connect_args = server.connection_pool.connection_kwargs
    self.assertEqual(connect_args['host'], 'localhost')
    self.assertEqual(connect_args['port'], 9001)

def test_redis_url(self):
    settings = Settings({
        'REDIS_URL': 'redis://*****:*****@localhost:9001/42',
    })

    server = connection.from_settings(settings)
    connect_args = server.connection_pool.connection_kwargs
    self.assertEqual(connect_args['host'], 'localhost')
    self.assertEqual(connect_args['port'], 9001)
    self.assertEqual(connect_args['password'], 'bar')
    self.assertEqual(connect_args['db'], 42)

def test_redis_host_port_fallback(self):
    settings = Settings(dict(
        REDIS_HOST='baz', REDIS_PORT=1337, REDIS_URL=None))

    server = connection.from_settings(settings)
    connect_args = server.connection_pool.connection_kwargs
    self.assertEqual(connect_args['host'], 'baz')
    self.assertEqual(connect_args['port'], 1337)

def test_redis_url_precedence(self):
    settings = Settings(dict(
        REDIS_HOST='baz', REDIS_PORT=1337,
        REDIS_URL='redis://*****:*****@localhost:9001/42'))

    server = connection.from_settings(settings)
    connect_args = server.connection_pool.connection_kwargs
    self.assertEqual(connect_args['host'], 'localhost')
    self.assertEqual(connect_args['port'], 9001)
    self.assertEqual(connect_args['password'], 'bar')
    self.assertEqual(connect_args['db'], 42)

class IsvServiceInfoFactorySpider(RedisSpider):
    name = "isv_service_info_factory"
    start_urls = [
        'https://fuwu.taobao.com/serv/shop_index.htm?spm=0.0.0.0.CZ3Xrj&page_id=2489&isv_id=45632667&page_rank=2&tab_type=1'
    ]
    redis_server = connection.from_settings(settings)
    count = 0

    def parse(self, response):
        self.count = 0
        self.redis_server.delete('isv_service_info:items')
        self.redis_server.delete('isv_service_info:dupefilter')
        self.redis_server.delete('isv_service_info:start_urls')
        f = codecs.open('service-code-list.csv', 'r', 'utf-8')
        for datas in f.readlines():
            data = datas[:-1].split(',')
            print(data[1])
            self.generate_url(data[0])
        f.close()

    def generate_url(self, service_code):
        # Sleep for a random 0-3 seconds.
        time.sleep(random.random() * 3)
        url = 'https://fuwu.taobao.com/ser/detail.html?service_code='
        url += service_code
        html = requests.get(url).text
        selector = etree.HTML(html)
        company_url = selector.xpath(
            '//*[@id="apc-detail"]/div[1]/div/div/p[1]/a/@href')
        # Some services have no company service list, only a detail page.
        if not company_url:
            self.redis_server.lpush('isv_service_info:start_urls', url)
            return
        company_url = company_url[0]
        isv_id = re.search('isv_id=(.*?)&', company_url + '&').group(1)
        company_url = 'https://fuwu.taobao.com/serv/shop_index.htm?isv_id='
        company_url += isv_id
        html = requests.get(company_url).text
        selector = etree.HTML(html)
        ul = selector.xpath('//*[@id="seller-header"]/div[2]/div[2]/div/ul/li')
        for li in ul:
            tab_type = li.xpath('span/b/a/text()')[0]
            if '服务列表' == tab_type:
                service_urls = li.xpath('span/b/a/@href')[0]
                service_urls = 'https://fuwu.taobao.com/serv/' + service_urls
                self.redis_server.lpush('isv_service_info:start_urls', service_urls)
                print(service_urls)
                self.count += 1
                print(self.count)
                break

def setup_redis(self):
    """Setup redis connection and idle signal.

    This should be called after the spider has set its crawler object.
    """
    if not self.redis_key:
        self.redis_key = '%s:start_urls' % self.name
    self.server = connection.from_settings(self.crawler.settings)
    # The idle signal is called when the spider has no requests left;
    # that's when we will schedule new requests from the redis queue.
    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
    self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
    log.msg("Reading URLs from redis list '%s'" % self.redis_key, level=log.INFO)

def test_redis_url_precedence(self):
    settings = Settings(dict(
        REDIS_HOST='baz',
        REDIS_PORT=1337,
        REDIS_URL='redis://*****:*****@localhost:9001/42'))

    server = connection.from_settings(settings)
    connect_args = server.connection_pool.connection_kwargs
    self.assertEqual(connect_args['host'], 'localhost')
    self.assertEqual(connect_args['port'], 9001)
    self.assertEqual(connect_args['password'], 'bar')
    self.assertEqual(connect_args['db'], 42)

def __init__(self, settings):
    self.request_count = settings.getint('MYEXT_ITEMCOUNT', 1000)
    self.request_num = 0
    # self.scheduler = scheduler
    self.request = None
    self.no_meet = True  # Whether a seed request has been encountered yet.
    self.path_base = settings.get("SEED_FILE_PATH")
    self.server = connection.from_settings(settings)
    use_set = settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET)
    request_set = settings.get("SCHEDULER_QUEUE_CLASS")
    self.fetch_one = self.server.spop if use_set else self.server.lpop
    self.add_one = self.server.sadd if use_set else self.server.lpush
    self.get_num = (self.server.llen
                    if "LifoQueue" in request_set or "FifoQueue" in request_set
                    else self.server.zcard)
    self.get_startnum = self.server.scard if use_set else self.server.llen
    self.split_num = settings.get("SPLIT_NUM")
    self.path_split = None

class IsvServiceInfoFactorySpider(RedisSpider):
    name = "cycle_run"
    start_urls = [
        'https://fuwu.taobao.com/serv/shop_index.htm?spm=0.0.0.0.CZ3Xrj&page_id=2489&isv_id=45632667&page_rank=2&tab_type=1'
    ]
    redis_server = connection.from_settings(settings)

    def parse(self, response):
        self.runTask(self.work, hour=4)

    def work(self):
        self.redis_server.lpush(
            'isv_service_info_factory:start_urls',
            'https://fuwu.taobao.com/serv/shop_index.htm?spm=0.0.0.0.CZ3Xrj&page_id=2489&isv_id=45632667&page_rank=2&tab_type=1'
        )
        print("更新周期到")

    def runTask(self, func, day=0, hour=0, min=0, second=0):
        # Initialize time.
        now = datetime.now()
        strnow = now.strftime('%Y-%m-%d %H')
        # print("now:", strnow)
        # First scheduled run time.
        period = timedelta(days=day, hours=hour, minutes=min, seconds=second)
        next_time = now + period
        strnext_time = next_time.strftime('%Y-%m-%d %H')
        # print("next run:", strnext_time)
        while True:
            # Get the current system time.
            iter_now = datetime.now()
            iter_now_time = iter_now.strftime('%Y-%m-%d %H')
            if str(iter_now_time) == str(strnext_time):
                # Reached a start-of-work time.
                # print("start work: %s" % iter_now_time)
                # Call the task function.
                func()
                # print("task done.")
                # Compute the next iteration time.
                iter_time = iter_now + period
                strnext_time = iter_time.strftime('%Y-%m-%d %H')
                # print("next_iter: %s" % strnext_time)
                # Continue to the next iteration.
                continue
            # Check every 10 minutes.
            time.sleep(600)

def setup_redis(self):
    """Setup redis connection and idle signal.

    This should be called after the spider has set its crawler object.
    """
    if not self.redis_key:
        self.redis_key = '%s:start_urls' % self.name
    self.server = connection.from_settings(self.crawler.settings)
    # The idle signal is called when the spider has no requests left;
    # that's when we will schedule new requests from the redis queue.
    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
    self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
    # self.crawler.signals.connect(self.engine_stopped, signal=signals.engine_stopped)
    # self.crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)
    self.log("Reading URLs from redis list '%s'" % self.redis_key)

class GxzfcgFactorySpider(RedisSpider):
    name = "gxzfcg_factory"
    name_pre = 'gxzfcg'
    # allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.gxzfcg.gov.cn/CmsNewsController/recommendBulletinList/channelCode-cgxx/20/page_1.html"
    ]
    redis_server = connection.from_settings(settings)

    def parse(self, response):
        level_xpaths = [
            '//*[@id="bodyMain"]/div/aside/div/nav/ul/li[1]/ul/li',
            '//*[@id="bodyMain"]/div/aside/div/nav/ul/li[2]/ul/li'
        ]
        level_names = ["区本级采购", "市(县)级采购"]
        i = 0
        for level_xpath in level_xpaths:
            level_name = level_names[i]
            i += 1
            ul = response.xpath(level_xpath)
            for li in ul:
                item = TenderItem()
                item['node_name'] = settings['NODE_NAME']
                item['website'] = '广西壮族自治区政府采购网'
                item['level'] = level_name
                item['type'] = li.xpath("a/text()").extract()[0]
                next_page_url = 'http://www.gxzfcg.gov.cn' + li.xpath("a/@href").extract()[0]
                yield scrapy.Request(next_page_url,
                                     callback=self.parse_news,
                                     meta={'item': item})

    def parse_news(self, response):
        page_nums = re.search(
            u'页次:1/(.*?)页',
            response.xpath('//*[@id="QuotaList_paginate"]/span[1]/text()').extract()[0]
        ).group(1)
        for page_num in range(1, int(page_nums) + 1):
            next_page_url = re.sub('page_(.*?).html',
                                   'page_' + str(page_num) + '.html',
                                   response.url)
            self.redis_server.lpush('%s_spider:start_urls' % self.name_pre,
                                    next_page_url)
            print(next_page_url)

def from_settings(cls, settings):
    if os.environ.get('spider_set_persist'):
        persist = (os.environ.get('spider_set_persist') != 'False')
    else:
        persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
    if os.environ.get('need_dupefilter'):
        need_dupefilter = (os.environ.get('need_dupefilter') != 'False')
    else:
        need_dupefilter = True
    queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
    queue_cls = load_object(settings.get('SCHEDULER_QUEUE_CLASS', QUEUE_CLASS))
    dupefilter_key = settings.get('DUPEFILTER_KEY', DUPEFILTER_KEY)
    idle_before_close = settings.get('SCHEDULER_IDLE_BEFORE_CLOSE', IDLE_BEFORE_CLOSE)
    server = connection.from_settings(settings)
    return cls(server, persist, queue_key, queue_cls,
               dupefilter_key, idle_before_close, need_dupefilter)

def setup_redis(self):
    """Setup redis connection and idle signal.

    This should be called after the spider has set its crawler object.
    """
    if not self.redis_key:
        self.redis_key = "%s:start_urls" % self.name
    self.server = connection.from_settings(self.crawler.settings)
    # The idle signal is called when the spider has no requests left;
    # that's when we will schedule new requests from the redis queue.
    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
    self.crawler.signals.connect(self.schedule_rest_requests, signal=signals.item_scraped)
    self.crawler.signals.connect(self.schedule_rest_requests, signal=signals.item_dropped)
    self.crawler.signals.connect(self.schedule_rest_requests, signal=signals.request_scheduled)
    self.crawler.signals.connect(self.schedule_rest_requests, signal=signals.response_received)
    self.crawler.signals.connect(self.schedule_rest_requests, signal=signals.response_downloaded)
    self.crawler.signals.connect(self.__start_loop, signal=signals.spider_opened)
    self.crawler.signals.connect(self.__stop_loop, signal=signals.spider_closed)
    log.msg("Reading URLs from redis list '%s'" % self.redis_key, level=log.INFO)
    self.touch()

def from_settings(cls, settings):
    params = {
        'redis_conn': connection.from_settings(settings)
    }
    return cls(**params)

def test_default_params(self):
    server = from_settings(self.settings)
    assert server is self.redis_cls.return_value
    self.redis_cls.assert_called_with(
        **dict(defaults.REDIS_PARAMS, **self.expected_params))

def test_redis_cls_custom_path(self):
    self.settings['REDIS_PARAMS']['redis_cls'] = 'mock.Mock'
    server = from_settings(self.settings)
    assert isinstance(server, mock.Mock)

def test_redis_cls_default(self):
    server = from_settings(Settings())
    assert isinstance(server, defaults.REDIS_CLS)

from scrapy_redis import connection
from scrapy.conf import settings
import time

redis_server = connection.from_settings(settings)
redis_server.lpush(
    'isv_service_info_factory:start_urls',
    'https://fuwu.taobao.com/serv/shop_index.htm?spm=0.0.0.0.CZ3Xrj&page_id=2489&isv_id=45632667&page_rank=2&tab_type=1')
print("更新周期到")
time.sleep(2)

def get_server(settings):
    redis_server = connection.from_settings(settings)
    return redis_server

def from_settings(cls, settings):
    server = connection.from_settings(settings)
    return cls(server)

def _set_crawler(self, crawler):
    super(HomepageSpider, self)._set_crawler(crawler)
    self.server = connection.from_settings(self.crawler.settings)

def test_default_params(self):
    server = from_settings(self.settings)
    assert server is self.redis_cls.return_value
    self.redis_cls.assert_called_with(
        **dict(DEFAULT_PARAMS, **self.expected_params))

class IsvServiceInfoSpider(RedisSpider):
    name = "isv_service_info"
    start_urls = [
        'https://fuwu.taobao.com/serv/shop_index.htm?spm=0.0.0.0.CZ3Xrj&page_id=2489&isv_id=45632667&page_rank=2&tab_type=1',
        'https://fuwu.taobao.com/serv/shop_index.htm?spm=0.0.0.0.Oquk72&page_id=678230&isv_id=877021141&page_rank=2&tab_type=1',
        'https://fuwu.taobao.com/serv/shop_index.htm?spm=0.0.0.0.mSxKHl&page_id=25995&isv_id=305442977&page_rank=2&tab_type=1',
        'https://fuwu.taobao.com/serv/shop_index.htm?spm=0.0.0.0.lH8xCC&page_id=172044&isv_id=570102268&page_rank=2&tab_type=1',
        'https://fuwu.taobao.com/serv/shop_index.htm?spm=0.0.0.0.OzhuHM&page_id=690262&isv_id=897211958&page_rank=2&tab_type=1'
    ]
    redis_server = connection.from_settings(settings)

    def parse(self, response):
        print(response.url)
        item = IsvServiceInfoItem()
        isv_id = re.search('isv_id=(.*?)&', response.url + '&').group(1)
        company_name = response.xpath(
            '//*[@id="seller-header"]/div[1]/div/a/text()').extract()[0]
        servers = response.xpath('//*[@id="searchForm"]/div[2]/table/tbody/tr')
        for server in servers:
            user_number = server.xpath('td[4]/text()').extract()[0]
            browser_number = server.xpath('td[5]/text()').extract()[0]
            item['isv_id'] = isv_id
            item['company_name'] = company_name
            item['user_number'] = user_number
            item['browser_number'] = browser_number
            detail_url = re.sub(
                'service/service.htm', 'ser/detail.html',
                'https:' + server.xpath('td[2]/dl/dt/a/@href').extract()[0])
            yield scrapy.Request(detail_url,
                                 callback=self.parse_detail,
                                 meta={'item': item})
            print(detail_url)

    def parse_detail(self, response):
        detail_url = response.url
        service_code = re.search('service_code=(.*?)&', detail_url + '&').group(1)
        content = response.xpath('//*[@id="J_SKUForm"]/div[2]/text()')
        # print(content)
        # Skip services marked "此服务暂不支持在线订购,请您直接联系服务商"
        # (not orderable online; contact the provider directly).
        if not content:
            service_name = response.xpath(
                '//*[@id="J_SKUForm"]/div[1]/h2/text()').extract()[0].replace(
                    '\t', '').replace('\n', '')
            score = response.xpath(
                '//*[@id="apc-detail"]/div[2]/div[1]/div[2]/div/span[2]/text()'
            ).extract()[0]
            usability = response.xpath(
                '//*[@id="apc-detail"]/div[2]/div[1]/div[2]/div/ul[1]/li[1]/span[2]/@class'
            ).extract()[0]
            usability_compare = response.xpath(
                '//*[@id="apc-detail"]/div[2]/div[1]/div[2]/div/ul[1]/li[1]/span[2]/text()'
            ).extract()[0]
            # Determine whether the value is above or below the industry average.
            if usability == 'low per':
                usability_compare = '-' + usability_compare
            attitude = response.xpath(
                '//*[@id="apc-detail"]/div[2]/div[1]/div[2]/div/ul[1]/li[2]/span[2]/@class'
            ).extract()[0]
            attitude_compare = response.xpath(
                '//*[@id="apc-detail"]/div[2]/div[1]/div[2]/div/ul[1]/li[2]/span[2]/text()'
            ).extract()[0]
            if attitude == 'low per':
                attitude_compare = '-' + attitude_compare
            stability = response.xpath(
                '//*[@id="apc-detail"]/div[2]/div[1]/div[2]/div/ul[1]/li[3]/span[2]/@class'
            ).extract()[0]
            stability_compare = response.xpath(
                '//*[@id="apc-detail"]/div[2]/div[1]/div[2]/div/ul[1]/li[3]/span[2]/text()'
            ).extract()[0]
            if stability == 'low per':
                stability_compare = '-' + stability_compare
            secure_score = str(response.xpath(
                '//*[@id="apc-detail"]/div[2]/div[1]/div[2]/div/ul[2]/li[1]/span[2]/text()'
            ).extract()[0]).replace('\t', '').replace('\n', '')
            payer_number = response.xpath(
                '//*[@id="apc-detail"]/div[2]/div[1]/div[2]/div/ul[2]/li[2]/span[2]/text()'
            ).extract()[0]
            nearly_payer_number = response.xpath(
                '//*[@id="apc-detail"]/div[2]/div[1]/div[2]/div/ul[2]/li[2]/span[3]/text()'
            ).extract()[0]
            continue_rate = response.xpath(
                '//*[@id="apc-detail"]/div[2]/div[1]/div[2]/div/ul[2]/li[3]/span[2]/text()'
            ).extract()[0]
            refund_rate = response.xpath(
                '//*[@id="apc-detail"]/div[2]/div[1]/div[2]/div/ul[2]/li[4]/span[2]/text()'
            ).extract()[0]
            open_rate = response.xpath(
                '//*[@id="apc-detail"]/div[2]/div[1]/div[2]/div/ul[2]/li[5]/span[2]/text()'
            )
            # The open rate is special: the node may be missing entirely.
            if open_rate:
                open_rate = open_rate.extract()[0]
            else:
                open_rate = None
            score_times = re.search(
                '(\d+)',
                response.xpath(
                    '//*[@id="reviews"]/div[1]/div/div/div[2]/span/text()'
                ).extract()[0]).group(1)
            five_score_rate = response.xpath(
                '//*[@id="reviews"]/div[1]/div/div/div[3]/ul/li[1]/span[@class="tb-r-pecent"]/text()'
            ).extract()[0].replace('\t', '').replace('\n', '')
            four_score_rate = response.xpath(
                '//*[@id="reviews"]/div[1]/div/div/div[3]/ul/li[2]/span[@class="tb-r-pecent"]/text()'
            ).extract()[0].replace('\t', '').replace('\n', '')
            three_score_rate = response.xpath(
                '//*[@id="reviews"]/div[1]/div/div/div[3]/ul/li[3]/span[@class="tb-r-pecent"]/text()'
            ).extract()[0].replace('\t', '').replace('\n', '')
            two_score_rate = response.xpath(
                '//*[@id="reviews"]/div[1]/div/div/div[3]/ul/li[4]/span[@class="tb-r-pecent"]/text()'
            ).extract()[0].replace('\t', '').replace('\n', '')
            one_score_rate = response.xpath(
                '//*[@id="reviews"]/div[1]/div/div/div[3]/ul/li[5]/span[@class="tb-r-pecent"]/text()'
            ).extract()[0].replace('\t', '').replace('\n', '')
            seller_rank_percent_url = 'https://fuwu.taobao.com' + response.xpath(
                '//*[@id="desc-log"]/div/div[1]/div[1]/h5/a/@href').extract()[0]
            seller_industry_percent_url = 'https://fuwu.taobao.com' + response.xpath(
                '//*[@id="desc-log"]/div/div[1]/div[2]/h5/a/@href').extract()[0]
            # Scrape the Taobao buyer-rank percentage breakdown.
            html = requests.get(seller_rank_percent_url).text
            selector = etree.HTML(html)
            seller_rank_percent_trs = selector.xpath(
                '//*[@id="apc-detail"]/div[2]/table/tbody/tr')
            seller_rank_percent = '['
            for seller_rank_percent_tr in seller_rank_percent_trs:
                seller_rank_percent_tds = seller_rank_percent_tr.xpath('td')
                index = 0
                for seller_rank_percent_td in seller_rank_percent_tds:
                    index += 1
                    img = seller_rank_percent_td.xpath('img/@src')
                    if img:
                        seller_rank = re.search('rank/(.*?).gif', img[0]).group(1)
                    else:
                        seller_rank = seller_rank_percent_td.xpath('text()')
                        if seller_rank:
                            seller_rank = str(seller_rank[0]).replace(
                                '\t', '').replace('\n', '').replace(' ', '')
                    if seller_rank:
                        if index % 2 == 1:
                            seller_rank_percent = (
                                seller_rank_percent + '{"rank":"' +
                                str(seller_rank).replace('\r', '') + '",')
                        else:
                            seller_rank_percent = (
                                seller_rank_percent + '"percent":"' +
                                seller_rank + '"},')
            seller_rank_percent = seller_rank_percent[:-1] + ']'
            # Scrape the seller industry percentage breakdown.
            html = requests.get(seller_industry_percent_url).text
            selector = etree.HTML(html)
            seller_industry_percent_trs = selector.xpath(
                '//*[@id="apc-detail"]/div[2]/table/tbody/tr')
            seller_industry_percent = '['
            for seller_industry_percent_tr in seller_industry_percent_trs:
                seller_industry_percent_tds = seller_industry_percent_tr.xpath('td')
                index = 0
                for seller_industry_percent_td in seller_industry_percent_tds:
                    index += 1
                    img = seller_industry_percent_td.xpath('img/@src')
                    if img:
                        seller_rank = re.search('rank/(.*?).gif', img[0]).group(1)
                    else:
                        seller_rank = seller_industry_percent_td.xpath('text()')
                        if seller_rank:
                            seller_rank = str(seller_rank[0]).replace(
                                '\t', '').replace('\n', '').replace(' ', '')
                    if seller_rank:
                        if index % 2 == 1:
                            seller_industry_percent = (
                                seller_industry_percent + '{"industry":"' +
                                str(seller_rank).replace('\r', '') + '",')
                        else:
                            seller_industry_percent = (
                                seller_industry_percent + '"percent":"' +
                                seller_rank + '"},')
            seller_industry_percent = seller_industry_percent[:-1] + ']'
            # print(company_name)
            print(service_name)
            print(service_code)
            print(score)
            print(usability)
            print(usability_compare)
            print(attitude)
            print(attitude_compare)
            print(stability)
            print(stability_compare)
            print(secure_score)
            print(payer_number)
            print(nearly_payer_number)
            print(continue_rate)
            print(refund_rate)
            print(open_rate)
            print(score_times)
            print(five_score_rate)
            print(four_score_rate)
            print(three_score_rate)
            print(two_score_rate)
            print(one_score_rate)
            # print(seller_rank_percent_url)
            # print(seller_industry_percent_url)
            print(seller_rank_percent)
            print(seller_industry_percent)
            # print(user_number)
            # print(browser_number)
            now_time = datetime.datetime.today()
            item = response.meta['item']
            item['add_time'] = now_time
            item['modify_time'] = now_time
            # item['isv_id'] = isv_id
            # item['company_name'] = company_name
            item['service_name'] = service_name
            item['service_code'] = service_code
            item['score'] = score
            # item['usability'] = usability
            item['usability_compare'] = usability_compare
            # item['attitude'] = attitude
            item['attitude_compare'] = attitude_compare
            # item['stability'] = stability
            item['stability_compare'] = stability_compare
            item['secure_score'] = secure_score
            item['payer_number'] = payer_number
            item['nearly_payer_number'] = nearly_payer_number
            item['continue_rate'] = continue_rate
            item['refund_rate'] = refund_rate
            item['open_rate'] = open_rate
            item['score_times'] = score_times
            item['five_score_rate'] = five_score_rate
            item['four_score_rate'] = four_score_rate
            item['three_score_rate'] = three_score_rate
            item['two_score_rate'] = two_score_rate
            item['one_score_rate'] = one_score_rate
            item['seller_rank_percent'] = seller_rank_percent
            item['seller_industry_percent'] = seller_industry_percent
            # item['user_number'] = user_number
            # item['browser_number'] = browser_number
            yield item

def from_settings(cls, settings):
    params = {
        'server': connection.from_settings(settings),
    }
    return cls(**params)

class ShopBasicInfoSpider(RedisSpider):
    name = "shop_basic_info"
    start_urls = []
    redis_server = connection.from_settings(settings)

    def parse(self, response):
        head = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36',
            'cookie': 'cna=9dmhD/z0ODgCATHd1PgOvAEo; ali_ab=49.221.212.248.1461498199870.2; hng=CN%7Czh-cn%7CCNY; thw=cn; isg=0402F5E80881A216E6813A6676800CB8; v=0; _tb_token_=ku02IolX76m3jJ; uc1=cookie14=UoWxMP74ys3MPA%3D%3D&existShop=false&cookie16=WqG3DMC9UpAPBHGz5QBErFxlCA%3D%3D&cookie21=VFC%2FuZ9aiKCaj7AzN6nc&tag=1&cookie15=V32FPkk%2Fw0dUvg%3D%3D&pas=0; uc3=sg2=UIS5OL%2BOEDgy%2FIeQ7IgTu7dSOvuG0LEay5288ZRYw64%3D&nk2=pbEaPGpOBJk%3D&id2=UoYfobtYhLxhEw%3D%3D&vt3=F8dASmgu7PcOeAskyes%3D&lg2=UtASsssmOIJ0bQ%3D%3D; existShop=MTQ2MTkyMDU3MA%3D%3D; uss=UIIpyK78%2BArm1rQcpUrk%2FRwXHQDc93OpxAQgdlu7DWHJVDuJuqKSxy5hBg%3D%3D; lgc=%5Cu4F01%5Cu513F%5Cu8469%5Cu8469; tracknick=%5Cu4F01%5Cu513F%5Cu8469%5Cu8469; cookie2=1cd8c28594101548f03793d63c556c34; sg=%E8%91%A937; mt=np=&ci=9_1&cyk=-1_-1; cookie1=Vvkh3e1O2MWb%2FWoyF7KMYkHR3r9XP1ItH8ivkdLbbCM%3D; unb=1710468073; skt=6e1a6a704a8e0ddd; t=a74cdfe08cedf1981150320f20e9a793; _cc_=U%2BGCWk%2F7og%3D%3D; tg=0; _l_g_=Ug%3D%3D; _nk_=%5Cu4F01%5Cu513F%5Cu8469%5Cu8469; cookie17=UoYfobtYhLxhEw%3D%3D; l=Av7-AatB1kluKMPP1QNNkk-Bzh5BL8LA'
        }
        seller_nick = urllib.unquote(
            re.search('&q=(.*?)&', response.url + '&').group(1)).decode('utf-8')
        print(seller_nick)
        html = response.body
        content = re.search('g_page_config = (.*?);\n', html, re.S)
        while not content:
            html = requests.get(response.url, headers=head).text
            content = re.search('g_page_config = (.*?);\n', html, re.S)
        content = content.group(1)
        # Parse the JSON holding the full shop-list data.
        data = json.loads(content).get('mods').get('shoplist').get('data')
        # Check whether the search returned any results.
        if data is not None:
            the_shop_data = data.get('shopItems')[0]
            nick = the_shop_data.get('nick')
            # The first shop's nick must equal the given nick.
            if nick == seller_nick:
                shop_type = the_shop_data.get('shopIcon').get('iconClass').strip()
                shop_name = the_shop_data.get('rawTitle').strip()
                shop_id = the_shop_data.get('nid').strip()
                shop_address = the_shop_data.get('provcity').strip()
                total_sold = int(the_shop_data.get('totalsold'))
                goods_number = int(the_shop_data.get('procnt'))
                shop_label = ''
                icons = the_shop_data.get('icons')
                for i in icons:
                    shop_label = shop_label + i.get('title') + ','
                if len(shop_label) > 0:
                    shop_label = shop_label[:-1]
                good_rate_percent = float(
                    self.delete_the_percent(the_shop_data.get('goodratePercent')))
                shop_img_url = the_shop_data.get('picUrl').strip()
                shop_rate_url = the_shop_data.get('userRateUrl').strip()
                dsrStr = json.loads(the_shop_data.get('dsrInfo').get('dsrStr'))
                main_business = dsrStr.get('ind').strip()
                if main_business == '':
                    main_business = None
                describe_score_industry = self.delete_the_percent(dsrStr.get('mg'))
                service_score_industry = self.delete_the_percent(dsrStr.get('sg'))
                logistics_score_industry = self.delete_the_percent(dsrStr.get('cg'))
                # Check whether the shop rank is 0.
                if shop_type != 'rank seller-rank-0':
                    url = 'https:' + shop_rate_url
                    html = requests.get(url, headers=head)
                add_time = datetime.datetime.today()
                modify_time = add_time
                is_exist = True
                deposit = None
                seller_rank = None
                buyer_rank = None
                main_rate = None
                # Split the address into province and city.
                if shop_address != '':
                    shop_address_s = str(shop_address).split(' ')
                    if len(shop_address_s) == 2:
                        shop_address_province = shop_address_s[0]
                        shop_address_city = shop_address_s[1]
                    elif len(shop_address_s) == 1:
                        shop_address_province = shop_address_s[0]
                        shop_address_city = None
                    else:
                        shop_address_province = None
                        shop_address_city = None
                if shop_type != 'icon-service-tianmao-large':
                    if shop_type == 'icon-service-qiye-large':
                        shop_type = '企业店铺'
                    else:
                        shop_type = '普通店铺'
                else:
                    shop_type = '天猫店铺'
                item = ShopbasicinfoItem()
                item['add_time'] = add_time
                item['modify_time'] = modify_time
                item['nick'] = nick
                item['shop_type'] = shop_type
                item['shop_name'] = shop_name
                item['shop_id'] = shop_id
                item['shop_address_province'] = shop_address_province
                item['shop_address_city'] = shop_address_city
                item['total_sold'] = total_sold
                item['goods_number'] = goods_number
                item['good_rate_percent'] = good_rate_percent
                item['shop_img_url'] = shop_img_url
                item['shop_rate_url'] = shop_rate_url
                item['main_business'] = main_business
                item['deposit'] = deposit
                item['seller_rank'] = seller_rank
                item['buyer_rank'] = buyer_rank
                item['main_rate'] = main_rate
                item['is_exist'] = is_exist
                item['shop_label'] = shop_label
                yield item

    def getContent(self, content):
        if content:
            return content[0]
        return None

    def getNumber(self, content):
        if content:
            result = re.search('(\d+)', content[0])
            if result:
                return float(result.group(1))
        return 0

    def getUserId(self, content):
        content = re.search('"userID": "(.*?)"', content)
        if content:
            return content.group(1)
        return None

    def getCharge(self, content):
        content = re.search('¥(.*)', self.getContent(content))
        if content:
            return float(content.group(1).replace(',', ''))
        return 0

    def delete_the_percent(self, content):
        if content:
            content = re.search('(.*)%', content)
            if content:
                print('###' + content.group(1))
                return float(content.group(1))
        return 0

    def delete_the_fen(self, content):
        if content:
            content = re.search('(.*)分', content)
            if content:
                print('fen' + content.group(1))
                return float(content.group(1))
        return 0

    def delete_the_tian(self, content):
        if content:
            content = re.search('(.*)天', content)
            if content:
                print('天' + content.group(1))
                return float(content.group(1))
        return 0