def __init__(self, brand_id, extra_cond=None, max_images=15):
    print str.format('Publishing (brand_id={0}, max_images={1}, extra_cond="{2}")...',
                     brand_id, max_images, extra_cond)

    # Maximum number of images published for a single product
    self.max_images = max_images
    self.brand_id = brand_id
    if not extra_cond:
        extra_cond = ['1']
    elif not iterable(extra_cond):
        extra_cond = [extra_cond]
    self.extra_cond = extra_cond
    self.tot = 0
    self.progress = 0

    # Display order of the regions
    self.region_order = {k: info.region_info()[k]['weight'] for k in info.region_info()}

    self.products_tbl = 'products'
    self.prod_mt_tbl = 'products_mfashion_tags'
    self.mt_tbl = 'mfashion_tags'
    self.prod_ot_tbl = 'products_original_tags'
    self.ot_tbl = 'original_tags'
    self.price_hist = 'products_price_history'
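# --- Illustrative sketch (not part of the original code) ---------------------
# The extra_cond handling above follows a pattern that recurs in several of
# these constructors: None becomes the neutral SQL condition ['1'], a single
# condition string is wrapped into a list, and an existing list is used as-is.
# A minimal standalone version, with a simple stand-in for the project's
# iterable() helper:
def _normalize_cond(cond):
    def _iterable(obj):
        return hasattr(obj, '__iter__') and not isinstance(obj, basestring)
    if not cond:
        return ['1']        # neutral condition: "... WHERE <match> AND 1"
    elif not _iterable(cond):
        return [cond]       # wrap a single condition string
    return list(cond)

# _normalize_cond(None)           -> ['1']
# _normalize_cond('price > 0')    -> ['price > 0']
# _normalize_cond(['a=1', 'b=2']) -> ['a=1', 'b=2']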
def __init__(self, name, region, *a, **kw):
    self.name = str.format('{0}-{1}', name, '-'.join(region) if region else 'all')
    super(MFashionSpider, self).__init__(*a, **kw)

    if not region:
        self.region_list = self.get_supported_regions()
    else:
        self.region_list = list((set(region) if iterable(region) else {region}).intersection(
            set(self.get_supported_regions())))
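# --- Illustrative sketch (not part of the original code) ---------------------
# Region resolution above: the requested regions are intersected with the
# spider's supported set; with an empty/None argument, all supported regions
# are used. The region codes below are examples only.
def _resolve_regions(requested, supported):
    if not requested:
        return list(supported)
    requested = set(requested) if hasattr(requested, '__iter__') else {requested}
    return list(requested.intersection(set(supported)))

# _resolve_regions(['us', 'fr', 'xx'], ['us', 'fr', 'cn']) -> ['us', 'fr'] (order not guaranteed)
# _resolve_regions(None, ['us', 'fr', 'cn'])               -> ['us', 'fr', 'cn']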
def start_requests(self):
    for region in self.region_list:
        metadata = {'region': region, 'brand_id': getattr(self, 'spider_data')['brand_id'],
                    'tags_mapping': {}, 'category': []}
        tmp = getattr(self, 'spider_data')['home_urls'][region]
        start_urls = tmp if iterable(tmp) else [tmp]
        for url in start_urls:
            m = copy.deepcopy(metadata)
            yield Request(url=url, meta={'userdata': m}, callback=self.parse, errback=self.onerr)
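# --- Illustrative sketch (not part of the original code) ---------------------
# Why copy.deepcopy is used above: every Request must carry its own metadata
# dict, otherwise all callbacks would share (and mutate) the same
# 'tags_mapping' and 'category' containers. A minimal demonstration:
import copy

def _metadata_copies(template, n):
    """Return n independent copies of a metadata template dict."""
    return [copy.deepcopy(template) for _ in range(n)]

# copies = _metadata_copies({'region': 'us', 'tags_mapping': {}, 'category': []}, 2)
# copies[0]['category'].append('shoes')
# copies[1]['category'] is still [] -- a shallow alias would have changed as well.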
def query_match(self, selects, table, matches=None, extra=None, tail_str=None, use_result=False,
                distinct=False):
    """
    Query: equivalent to SELECT ... FROM ... WHERE col=val
    :param selects: fields to SELECT
    :param table: name of the table to query
    :param matches: dict of match conditions
    :param extra: additional query conditions
    :param tail_str: string appended to the end of the statement
    :param use_result:
    :return:
    """
    if not extra:
        extra = ['1']
    elif not iterable(extra):
        extra = [extra]
    if not iterable(selects):
        selects = [selects]

    def func(arg):
        k, v = arg
        return unicode.format(u'{0}="{1}"', k, self.sql_escape(v)) if v else unicode.format(u'{0} IS NULL', k)

    match_str = ' AND '.join(map(func, matches.items())) if matches else '1'
    extra_cond = ' AND '.join(extra)
    statement = unicode.format(u'SELECT {5} {0} FROM {1} WHERE {2} AND {3} {4}', ', '.join(selects), table,
                               match_str, extra_cond, tail_str if tail_str else '',
                               'DISTINCT' if distinct else '')
    self.db.query(statement.encode('utf-8'))
    return self.db.use_result() if use_result else self.db.store_result()
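# --- Illustrative sketch (not part of the original code) ---------------------
# The statement query_match assembles, with a trivial escaping stand-in for
# self.sql_escape; the real method additionally supports DISTINCT, extra
# conditions and a tail string through the same format call.
def _build_select(selects, table, matches):
    def _fmt(item):
        k, v = item
        return u'{0}="{1}"'.format(k, v) if v else u'{0} IS NULL'.format(k)
    match_str = u' AND '.join(map(_fmt, matches.items())) if matches else u'1'
    return u'SELECT {0} FROM {1} WHERE {2}'.format(u', '.join(selects), table, match_str)

# _build_select(['idproducts'], 'products', {'region': 'cn'})
# -> u'SELECT idproducts FROM products WHERE region="cn"'
# _build_select(['brand_id', 'region'], 'original_tags', {'mapping_list': None})
# -> u'SELECT brand_id, region FROM original_tags WHERE mapping_list IS NULL'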
def __init__(self, src_spec=getattr(glob, 'DATABASE')['DB_SPEC'],
             dst_spec=getattr(glob, 'DATABASE')['DB_SPEC'], cond=None):
    self.progress = 0
    self.tot = 1
    if cond:
        if iterable(cond):
            self.cond = cond
        else:
            self.cond = [cond]
    else:
        self.cond = ['1']
    self.src_spec = src_spec
    self.dst_spec = dst_spec
def __init__(self, region):
    if iterable(region):
        HogoBossSpider.spider_data['home_urls'] = {
            reg: str.format('http://store-{0}.hugoboss.com', reg) if reg != 'cn'
            else 'http://store.hugoboss.cn'
            for reg in region}
    else:
        k = region
        HogoBossSpider.spider_data['home_urls'] = {
            k: str.format('http://store-{0}.hugoboss.com', k) if k != 'cn'
            else 'http://store.hugoboss.cn'}
    super(HogoBossSpider, self).__init__('Hugo Boss', region)
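# --- Illustrative sketch (not part of the original code) ---------------------
# The per-region home_urls mapping built above: every region maps to its
# store-<region> host, except 'cn', which uses the dedicated Chinese domain.
def _build_hugoboss_home_urls(regions):
    if not hasattr(regions, '__iter__'):
        regions = [regions]
    return {reg: ('http://store.hugoboss.cn' if reg == 'cn'
                  else str.format('http://store-{0}.hugoboss.com', reg))
            for reg in regions}

# _build_hugoboss_home_urls(['us', 'cn'])
# -> {'us': 'http://store-us.hugoboss.com', 'cn': 'http://store.hugoboss.cn'}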
def start_requests(self):
    for region in self.region_list:
        metadata = {'region': region, 'brand_id': self.spider_data['brand_id'], 'tags_mapping': {}}
        tmp = self.spider_data['home_urls']['common']
        cookie = {'DKI_FiftyOneInternationalCookie':
                  str.format('{0}-{1}', region.upper(), self.spider_data['curreny'][region])}
        start_urls = tmp if iterable(tmp) else [tmp]
        for url in start_urls:
            m = copy.deepcopy(metadata)
            yield Request(url=url, meta={'userdata': m}, callback=self.parse, errback=self.onerr,
                          cookies=cookie, dont_filter=True)
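# --- Illustrative sketch (not part of the original code) ---------------------
# The region/currency cookie set above; the value format is taken from the
# code ("<REGION>-<currency>"), and currency_map stands in for the spider's
# spider_data['curreny'] table.
def _build_region_cookie(region, currency_map):
    return {'DKI_FiftyOneInternationalCookie':
                str.format('{0}-{1}', region.upper(), currency_map[region])}

# _build_region_cookie('us', {'us': 'USD'})
# -> {'DKI_FiftyOneInternationalCookie': 'US-USD'}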
def run(self):
    last_update = self.last_update
    extra_cond = self.extra_cond
    if not extra_cond:
        extra_cond = []
    elif not iterable(extra_cond):
        extra_cond = [extra_cond]
    if last_update:
        extra_cond.append(unicode.format(u'update_time > "{0}"', last_update))
    extra_cond.append('mapping_list IS NOT NULL')

    # Cache of MFashion tag ids
    cached_mfashion = {}

    # How tag updates work: original_tags stores the raw tags, and the update_time field tells us which
    # tags were updated recently. Because the tag system is somewhat "contagious", all tags under the
    # corresponding brand/region must be reprocessed.
    rs = self.db.query_match(['brand_id', 'region'], self.original_tags_tbl, {}, extra=extra_cond,
                             distinct=True)

    # Tags that need processing
    tag_dict = {}
    for i in xrange(rs.num_rows()):
        brand_id, region = rs.fetch_row()[0]
        for val in self.db.query_match(['idmappings', 'mapping_list'], self.original_tags_tbl,
                                       {'brand_id': brand_id, 'region': region},
                                       extra='mapping_list IS NOT NULL').fetch_row(maxrows=0):
            tag_dict[val[0]] = json.loads(val[1].replace("'", '"'))

        # Delete old product/tag relations
        self.db.execute(str.format(
            'DELETE FROM p2 USING {0} AS p1, {1} AS p2 WHERE p1.idproducts=p2.idproducts '
            'AND p1.brand_id={2} AND region="{3}"', self.products, self.prod_mt_tbl, brand_id, region))

    self.tot = len(tag_dict)
    self.progress = 0
    for tid, rule in tag_dict.items():
        self.progress += 1
        self.db.start_transaction()
        try:
            # All products related to this original tag
            pid_list = [int(val[0]) for val in
                        self.db.query_match(['idproducts'], self.prod_tag_tbl,
                                            {'id_original_tags': tid}).fetch_row(maxrows=0)]

            # Add the MFashion tags
            for tag in rule:
                if tag not in cached_mfashion:
                    self.db.insert({'tag': tag}, self.mfashion_tags_tbl, ignore=True)
                    mf_tid = int(self.db.query_match(['idmfashion_tags'], self.mfashion_tags_tbl,
                                                     {'tag': tag}).fetch_row()[0][0])
                    cached_mfashion[tag] = mf_tid

                self.db.insert([{'idproducts': pid, 'id_mfashion_tags': cached_mfashion[tag]}
                                for pid in pid_list], self.prod_mt_tbl, ignore=True)
            self.db.commit()
        except ValueError:
            self.db.rollback()
        except:
            self.db.rollback()
            raise
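# --- Illustrative sketch (not part of the original code) ---------------------
# The MFashion tag cache used above, in memory: each tag id is looked up (or
# created) only once and then reused for every product it applies to. The
# fake_ids dict stands in for the mfashion_tags table; all names here are
# examples, not project data.
def _resolve_tag_relations(rules, pid_lists, fake_ids):
    cached = {}
    relations = []
    for tid, tags in rules.items():
        for tag in tags:
            if tag not in cached:
                cached[tag] = fake_ids.setdefault(tag, len(fake_ids) + 1)
            relations.extend((pid, cached[tag]) for pid in pid_lists.get(tid, []))
    return relations

# _resolve_tag_relations({'t1': ['dress', 'red']}, {'t1': [101, 102]}, {})
# -> [(101, 1), (102, 1), (101, 2), (102, 2)]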
def set_up_spider(spider_class, data, spider_type='default'):
    """
    Set up a spider object.
    @param spider_type: spider type: 'update', 'monitor', or a regular spider
    @param spider_class:
    @param data: configuration parameters for the spider
    @return:
    """
    crawler = Crawler(Settings())
    crawler.settings.values['BOT_NAME'] = 'mstore_bot'

    if spider_type == 'update':
        crawler.settings.values['ITEM_PIPELINES'] = {'scrapper.pipelines.UpdatePipeline': 800}
        brand_list = [int(tmp) for tmp in (data['brand'] if 'brand' in data else [])]
        if 'region' in data:
            region_list = data['region']
        elif 'r' in data:
            region_list = data['r']
        else:
            region_list = None
        spider = spider_class(brand_list, region_list, getattr(glob, 'DATABASE')['DB_SPEC'])
        welcome_msg = str.format('Updating started, processing the following brands: {0}',
                                 ', '.join(str(tmp) for tmp in brand_list))
        # TODO An update spider may be given several brands, which makes the user agent hard to handle.
        # This is a questionable design; consider changing it so that one UpdateSpider maps to exactly
        # one brand.
        major_brand = brand_list[0]
    elif spider_type == 'monitor':
        crawler.settings.values['ITEM_PIPELINES'] = {'scrapper.pipelines.MonitorPipeline': 800}
        brand = int(data['brand'][0])
        region = data['region'][0]
        idmonitor = int(data['idmonitor'][0])
        parameter = {'brand_id': brand, 'region': region}
        spider = spider_class(idmonitor, parameter, getattr(glob, 'DATABASE')['DB_SPEC'])
        welcome_msg = str.format('STARTING MONITORING, idmonitor={0}, brand={1}, region={2}',
                                 idmonitor, brand, region)
        major_brand = brand
    else:
        crawler.settings.values['ITEM_PIPELINES'] = \
            {'scrapper.pipelines.ProductImagePipeline': 800,
             'scrapper.pipelines.ProductPipeline': 300} \
            if getattr(glob, 'DATABASE')['WRITE_DATABASE'] else {}

        if 'job' in data:
            job_path = get_job_path(spider_class.spider_data['brand_id']) + '-1'
            if 'rst-job' in data:
                shutil.rmtree(job_path, ignore_errors=True)
            crawler.settings.values['JOBDIR'] = job_path

        # Telnet support
        # crawler.settings.values['TELNETCONSOLE_HOST'] = '127.0.0.1'
        # if 'telnet' in data and data['telnet']:
        #     start_port = int(data['telnet'][0])
        # else:
        #     start_port = spider_class.spider_data['brand_id']
        # crawler.settings.values['TELNETCONSOLE_PORT'] = [start_port, start_port + 8]

        # Image storage
        crawler.settings.values['IMAGES_STORE'] = get_images_store(spider_class.spider_data['brand_id'])
        # crawler.settings.values['IMAGES_THUMBS'] = {'small': (480, 480), 'medium': (1200, 1200)}
        crawler.settings.values['IMAGES_MIN_HEIGHT'] = 128
        crawler.settings.values['IMAGES_MIN_WIDTH'] = 128

        # Determine which regions to crawl
        region_list = data['r']
        if not region_list:
            region_list = spider_class.get_supported_regions()
        elif not iterable(region_list):
            region_list = [region_list]
        region_list = filter(lambda val: info.region_info()[val]['status'], region_list)
        if 'exclude-region' in data:
            for r in data['exclude-region']:
                if r in region_list:
                    region_list.pop(region_list.index(r))

        spider = spider_class(region_list)
        welcome_msg = str.format('Spider started, processing the following regions: {0}',
                                 ', '.join(region_list))
        major_brand = spider_class.spider_data['brand_id']

    crawler.settings.values['AUTOTHROTTLE_ENABLED'] = False

    # Pick the spider's user agent. Priority order:
    # 1. The --user-agent command-line argument
    # 2. The configuration file (see the global_settings['USER_AGENT'] entry)
    # 3. spider_data['USER_AGENT'] in spider_class
    # 4. Default: chrome
    # TODO Item 2 above is not implemented yet
    ua_map = {
        'chrome': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.69 Safari/537.36',
        'iphone': 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5',
        'ipad': 'Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10'}
    if 'user-agent' in data:
        ua = data['user-agent'][0]
    else:
        spider_spec = info.spider_info()[major_brand]['spider_class']
        if 'user_agent' in spider_spec.spider_data:
            ua = spider_spec.spider_data['user_agent']
        else:
            ua = 'chrome'
    crawler.settings.values['USER_AGENT'] = ua_map[ua.lower()] if ua.lower() in ua_map else ua

    # Proxy settings for the spider
    crawler.settings.values['DOWNLOADER_MIDDLEWARES'] = {
        'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 1}
    if 'proxy' in data:
        try:
            crawler.settings.values['PROXY_ENABLED'] = True
        except AttributeError:
            crawler.settings.values['PROXY_ENABLED'] = False
    else:
        crawler.settings.values['PROXY_ENABLED'] = False

    # TODO deal with cookies
    # cookie_flag = getattr(glob, 'COOKIES_ENABLED', False)
    # try:
    #     cookie_flag = (data['cookie'][0].lower() == 'true')
    # except (IndexError, KeyError):
    #     pass
    # crawler.settings.values['COOKIES_ENABLED'] = cookie_flag
    #
    # try:
    #     crawler.settings.values['COOKIES_DEBUG'] = getattr(glob, 'DEBUG')['COOKIES_DEBUG']
    # except (AttributeError, KeyError):
    #     crawler.settings.values['COOKIES_DEBUG'] = False

    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    spider.log(welcome_msg, log.INFO)
    crawler.crawl(spider)
    crawler.start()

    return spider
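# --- Illustrative sketch (not part of the original code) ---------------------
# The user-agent resolution order implemented above, boiled down to a small
# pure function: a --user-agent argument wins, then spider_data['user_agent'],
# then the 'chrome' default; short aliases are expanded through ua_map.
def _resolve_user_agent(data, spider_data, ua_map):
    if 'user-agent' in data:
        ua = data['user-agent'][0]
    elif 'user_agent' in spider_data:
        ua = spider_data['user_agent']
    else:
        ua = 'chrome'
    return ua_map[ua.lower()] if ua.lower() in ua_map else ua

# _resolve_user_agent({}, {}, {'chrome': 'Mozilla/5.0 ...'})             -> 'Mozilla/5.0 ...'
# _resolve_user_agent({'user-agent': ['ipad']}, {}, {'ipad': 'UA-ipad'}) -> 'UA-ipad'
# _resolve_user_agent({}, {'user_agent': 'MyBot/1.0'}, {})               -> 'MyBot/1.0'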