def parse_cat1(self, response):
    """Walk the level-2 category menu and schedule one parse_cat2 request
    per entry.  Each request carries a deep copy of the incoming user
    metadata with the 'category-2' tag filled in from the menu item text.
    """
    metadata = response.meta['userdata']
    sel = Selector(response)
    links = sel.xpath(
        '//nav[@id="categoriesMenu"]/ul[@class="level1"]//ul[@class="level2"]/li/a[@href]'
    )
    for link in links:
        text = self.reformat(unicodify(link._root.text))
        if not text:
            # Skip anchors without usable text.
            continue
        cloned = copy.deepcopy(metadata)
        cloned['tags_mapping']['category-2'] = [{
            'name': text.lower(),
            'title': text
        }]
        yield Request(url=self.process_href(link._root.attrib['href'],
                                            response.url),
                      callback=self.parse_cat2,
                      errback=self.onerr,
                      meta={'userdata': cloned})
def run(self):
    """Synchronise the pre-fetched result set into the products table.

    Walks ``self.rs`` row by row; rows whose idproducts already exists in
    the target table are updated (flag 'U'), unseen ones are inserted
    (flag 'I').  ``self.progress`` tracks how many rows were handled.
    """
    for idx in xrange(self.tot):
        row = self.rs.fetch_row(how=1)[0]
        entry = {key: unicodify(row[key]) for key in row}
        existing = self.db.query(
            str.format('SELECT idproducts FROM products WHERE idproducts={0}',
                       entry['idproducts']))
        if existing.num_rows() == 0:
            # New product: insert.
            entry['update_flag'] = 'I'
            self.db.insert(entry, 'products')
        else:
            # Already present: overwrite.
            entry['update_flag'] = 'U'
            self.db.update(entry, 'products',
                           str.format('idproducts={0}', entry['idproducts']))
        self.progress = idx + 1
def fetch_price(cls, response, spider=None):
    """Scrape the listed price from a product page.

    :param response: scrapy response of the product page.
    :param spider: optional spider instance (unused here).
    :return: ``{'price': <text>}`` when a price node is found, else ``{}``.
    """
    sel = Selector(response)
    ret = {}
    listed_price = None
    try:
        nodes = sel.xpath(
            '//div[@itemprop="offers"]//div[@itemprop="price" and @class="product-price"]'
        )
        if nodes:
            listed_price = unicodify(nodes[0]._root.text)
    except (TypeError, IndexError):
        # Node text may be None; treat as "no price available".
        pass
    if listed_price:
        ret['price'] = listed_price
    # This page layout exposes no discounted price, so only 'price' can
    # ever appear in the result.
    return ret
def fetch_price(cls, response, spider=None):
    """Scrape the listed price from an item-container product page.

    :return: ``{'price': <text>}`` when a price node is found, else ``{}``.
    """
    sel = Selector(response)
    ret = {}
    listed_price = None
    try:
        nodes = sel.xpath(
            '//div[@class="item-container"]/div[@class="iteminfo"]//div[@class="l4" or @class="t8"]'
        )
        if nodes:
            listed_price = unicodify(nodes[0]._root.text)
    except (TypeError, IndexError):
        # Node text may be None; treat as "no price available".
        pass
    if listed_price:
        ret['price'] = listed_price
    # No discounted price is published on this page type.
    return ret
def fetch_price(cls, response, spider=None):
    """Scrape the listed price from the summary section of a product page.

    :return: ``{'price': <text>}`` when a price node is found, else ``{}``.
    """
    sel = Selector(response)
    ret = {}
    listed_price = None
    try:
        nodes = sel.xpath(
            '//section[@class="summary"]/div[@class="price"]/span[@class="value"]'
        )
        if nodes:
            listed_price = unicodify(nodes[0]._root.text)
    except (TypeError, IndexError):
        # Node text may be None; treat as "no price available".
        pass
    if listed_price:
        ret['price'] = listed_price
    # No discounted price is published on this page type.
    return ret
def parse(self, response):
    """Top navigation: emit one category-0 request per menu entry.

    The tag text is assembled from the text of every descendant of the
    anchor, joined with ', ', and becomes both the category-0 tag and the
    'category' value of the copied metadata.
    """
    metadata = response.meta['userdata']
    sel = Selector(response)
    for link in sel.xpath('//ul[@id="nav"]/li/a[@href]'):
        cloned = copy.deepcopy(metadata)
        pieces = [
            self.reformat(unicodify(desc.text))
            for desc in link._root.iterdescendants()
        ]
        tag_text = ', '.join(piece for piece in pieces if piece)
        cloned['tags_mapping']['category-0'] = [{
            'name': tag_text.lower(),
            'title': tag_text
        }]
        cloned['category'] = [tag_text.lower()]
        yield Request(url=self.process_href(link._root.attrib['href'],
                                            response.url),
                      meta={'userdata': cloned},
                      callback=self.parse_cat1,
                      errback=self.onerr,
                      dont_filter=True)
def fetch_details(cls, response, spider=None):
    """Collect the description/dimensions text of a product page.

    For every matched node the prefix, text and tail fragments are
    reformatted, the truthy ones joined with ',' per node, and the nodes
    joined with '\\r'.  Returns None when nothing could be extracted.
    """
    sel = Selector(response)
    details = None
    try:
        nodes = sel.xpath('//div[@id="description"]/*[@class="desc"]')
        for xp in ('//div[@id="description"]/*[@class="desc"]/*',
                   '//div[@id="description"]/*[@class="dimensions"]',
                   '//div[@id="description"]/*[@class="dimensions"]/*'):
            nodes.extend(sel.xpath(xp))
        lines = []
        for node in nodes:
            fragments = [
                cls.reformat(unicodify(raw))
                for raw in (node._root.prefix, node._root.text,
                            node._root.tail)
            ]
            lines.append(','.join(frag for frag in fragments if frag))
        details = '\r'.join(lines)
    except (TypeError, IndexError):
        # Leave details as None on malformed markup.
        pass
    return details
def func(node, level, data):
    """Recursively flatten a nested category menu into leaf records.

    ``data`` accumulates 'category-<level>' entries (and 'href' when the
    anchor has a real link); leaf nodes with a usable href are collected,
    intermediate nodes recurse with a copy of the accumulated record.
    """
    collected = []
    anchors = node.xpath('./a[@href]')
    if anchors:
        anchor = anchors[0]
        data[str.format('category-{0}',
                        level)] = unicodify(anchor._root.text).lower()
        href = anchor._root.attrib['href']
        if 'javascript:void' not in href:
            data['href'] = href
    children = node.xpath(
        str.format('./ul/li[contains(@class,level{0})]', level + 1))
    if not children and 'href' in data:
        # Reached a leaf node: emit the accumulated record.
        collected.append(data)
    else:
        # Intermediate node: recurse into each child with a copied record.
        for child in children:
            collected.extend(func(child, level + 1, data.copy()))
    return collected
def fetch_price(cls, response, spider=None):
    """Scrape the listed price from the product-shop price box.

    :return: ``{'price': <text>}`` when a price node is found, else ``{}``.
    """
    sel = Selector(response)
    ret = {}
    listed_price = None
    nodes = sel.xpath(
        '//div[@class="product-shop"]//div[contains(@class,"price-box")]//span[@class="price"]'
    )
    if nodes:
        try:
            listed_price = cls.reformat(unicodify(nodes[0]._root.text))
        except (TypeError, IndexError):
            # Node text may be None; treat as "no price available".
            pass
    if listed_price:
        ret['price'] = listed_price
    # No discounted price is published on this page type.
    return ret
def price_changed(brand_list=None, start=None, end=None,
                  start_delta=datetime.timedelta(0),
                  end_delta=datetime.timedelta(0)):
    """
    Return product records whose price changed between start and end.
    If either start or end is None, the window defaults to the previous
    whole day: e.g. a call at 2014/02/25 02:00 scans data added between
    2014/02/24 00:00:00 and 2014/02/25 00:00:00.
    @param brand_list: brands to inspect; None means every brand in the DB.
    @param start: datetime.date / datetime.datetime object (or a
        '%Y-%m-%d [%H:%M:%S]' string)
    @param end:
    """

    def price_check(old, new):
        """
        Validity check over two (price, discount) pairs.  The main purpose
        is to surface likely code bugs.  Check strategy -- warn when:
        1. the two records are identical
        2. price is None while price_discount is not
        3. price <= price_discount
        4. the old and new price differ too much
        @param old:
        @param new:
        """
        warnings = {
            -1: 'EQUAL RECORDS',
            -2: 'NULL PRICE',
            -3: 'PRICE IS EQUAL OR LESS THAN PRICE_DISCOUNT',
            -4: 'TOO MUCH GAP BETWEEN THE OLD AND THE NEW'
        }
        price1, discount1 = old
        price2, discount2 = new
        # If the price ratio exceeds threshold, old and new are considered
        # too far apart.
        threshold = 5
        # if price1 == price2 and discount1 == discount2:
        #     err_no = -1
        if (not price1 and discount1) or (not price2 and discount2):
            err_no = -2
        elif (price1 and discount1 and price1 <= discount1) or (
                price2 and discount2 and price2 <= discount2):
            err_no = -3
        elif price1 > 0 and price2 > 0 and (price1 / price2 > threshold or
                                            price2 / price1 > threshold):
            err_no = -4
        else:
            err_no = 0
        if err_no != 0:
            return (err_no, warnings[err_no])
        else:
            return err_no

    # Main-country list.  Only price changes of products in these regions
    # are monitored.
    main_countries = [
        tmp[0] for tmp in filter(lambda val: val[1]['weight'] < 999999,
                                 info.region_info().items())
    ]
    with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db:
        if not brand_list:
            rs = db.query_match(['brand_id'], 'products', distinct=True)
            brand_list = [int(val[0]) for val in rs.fetch_row(maxrows=0)]

        # Work out the default time window.
        if start:
            try:
                start = datetime.datetime.strptime(start, '%Y-%m-%d %H:%M:%S')
            except ValueError:
                start = datetime.datetime.strptime(start, '%Y-%m-%d')
        else:
            start = datetime.datetime.fromordinal(
                (datetime.datetime.now() - datetime.timedelta(1)).toordinal())
        if end:
            try:
                end = datetime.datetime.strptime(end, '%Y-%m-%d %H:%M:%S')
            except ValueError:
                end = datetime.datetime.strptime(end, '%Y-%m-%d')
        else:
            end = datetime.datetime.fromordinal(
                datetime.datetime.now().date().toordinal())
        start += start_delta
        end += end_delta

        results = {
            'warnings': [],
            'price_up': {},
            'discount_up': {},
            'price_down': {},
            'discount_down': {}
        }
        for brand in brand_list:
            pid_list = db.query(
                str.format(
                    '''
                    SELECT p1.model,p1.idproducts,p1.region,p1.fingerprint FROM products AS p1
                    JOIN products_price_history AS p2 ON p1.idproducts=p2.idproducts
                    WHERE p1.offline=0 AND p2.price IS NOT NULL AND brand_id={0} AND p1.region IN ({1})
                    AND (p2.date BETWEEN {2} AND {3})
                    ''', brand,
                    ','.join(
                        str.format('"{0}"', tmp) for tmp in main_countries),
                    *map(lambda val: val.strftime('"%Y-%m-%d %H:%M:%S"'),
                         (start, end)))).fetch_row(maxrows=0)
            if not pid_list:
                continue
            tmp = db.query(
                str.format(
                    '''
                    SELECT p1.idproducts,p1.model,p1.region,p1.fingerprint,p2.price,p2.price_discount,p2.currency,p2.date
                    FROM products AS p1 JOIN products_price_history AS p2 ON p1.idproducts=p2.idproducts
                    WHERE p1.idproducts IN ({0}) ORDER BY p2.date DESC''',
                    ','.join(tmp[1] for tmp in pid_list))).fetch_row(maxrows=0)
            rs = {}
            # Merge by pid, i.e. rs[pid] = [price history for that pid].
            # Initially pid_set holds every pid still to be processed.  For
            # each pid we keep at most the two most recent valid records;
            # once both are taken the pid is removed from pid_set and any
            # further rows for it are ignored.
            pid_set = set([val[0] for val in tmp])
            for pid, model, region, fp, price, discount, currency, date in tmp:
                # pid missing from pid_set => both records already taken.
                # price None => no valid price data in this row; skip it.
                if pid not in pid_set or not price:
                    continue
                if int(pid) in rs and len(rs[int(pid)]) >= 2:
                    # The two most recent records are already filled.
                    pid_set.remove(pid)
                    continue
                pid = int(pid)
                if pid not in rs:
                    rs[pid] = []
                rs[pid].append(
                    [model, region, fp, price, discount, currency, date])

            for pid, price_history in rs.items():
                if len(price_history) < 2:
                    continue

                def func(idx):
                    # Convert record idx to a (price, discount) pair in a
                    # common currency using the configured exchange rate.
                    rate = info.currency_info()[price_history[idx][-2]]['rate']
                    return (float(price_history[idx][-4]) *
                            rate if price_history[idx][-4] else None,
                            float(price_history[idx][-3]) *
                            rate if price_history[idx][-3] else None)

                price1, discount1 = func(0)
                price2, discount2 = func(1)

                # Is there a possible data error?
                ret = price_check((price2, discount2), (price1, discount1))
                if ret != 0:
                    results['warnings'].append({
                        'idproducts': pid,
                        'model': price_history[0][0],
                        'msg': ret[1]
                    })
                    continue

                if price1 and price2 and price1 < price2:
                    key = 'price_down'
                elif price1 and price2 and price1 > price2:
                    key = 'price_up'
                elif discount1 and discount2 and discount1 < discount2:
                    key = 'discount_down'
                elif not discount2 and discount1:
                    key = 'discount_down'
                elif discount1 and discount2 and discount1 > discount2:
                    key = 'discount_up'
                elif not discount1 and discount2:
                    key = 'discount_up'
                else:
                    key = None

                if key:
                    if brand not in results[key]:
                        results[key][brand] = {}
                    fp = price_history[0][2]
                    if fp not in results[key][brand]:
                        results[key][brand][fp] = {
                            'model': price_history[0][0],
                            'brand_id': brand,
                            'fingerprint': fp,
                            'products': []
                        }
                    # The preferred product name is resolved further below.
                    region = price_history[0][1]
                    price_new = float(
                        price_history[0][3]) if price_history[0][3] else None
                    price_old = float(
                        price_history[1][3]) if price_history[1][3] else None
                    discount_new = float(
                        price_history[0][4]) if price_history[0][4] else None
                    discount_old = float(
                        price_history[1][4]) if price_history[1][4] else None
                    currency_new = price_history[0][5]
                    currency_old = price_history[1][5]
                    results[key][brand][fp]['products'].append({
                        'idproducts': int(pid),
                        'region': region,
                        'old_price': {
                            'price': price_old,
                            'price_discount': discount_old,
                            'currency': currency_old
                        },
                        'new_price': {
                            'price': price_new,
                            'price_discount': discount_new,
                            'currency': currency_new
                        }
                    })

            # The records in results still need product names.  First
            # collect every fingerprint in results, then look the names up
            # in the database.
            fp_list = []
            for change_type in [
                    'price_up', 'price_down', 'discount_up', 'discount_down'
            ]:
                if brand in results[change_type]:
                    fp_list.extend(results[change_type][brand].keys())
            fp_list = list(set(fp_list))
            # fingerprint -> name mapping
            fp_name_map = {}
            if fp_list:
                for fp, name, region in db.query_match(
                        ['fingerprint', 'name', 'region'], 'products',
                        extra=str.format(
                            'fingerprint IN ({0})', ','.join(
                                str.format('"{0}"', tmp)
                                for tmp in fp_list))).fetch_row(maxrows=0):
                    if fp not in fp_name_map:
                        fp_name_map[fp] = {
                            'name': unicodify(name),
                            'region': region
                        }
                    elif info.region_info(
                    )[region]['weight'] < info.region_info()[
                            fp_name_map[fp]['region']]['weight']:
                        # Higher-priority country: replace the entry.
                        fp_name_map[fp] = {
                            'name': unicodify(name),
                            'region': region
                        }
            for change_type in [
                    'price_up', 'price_down', 'discount_up', 'discount_down'
            ]:
                if brand not in results[change_type]:
                    continue
                for fp in results[change_type][brand]:
                    results[change_type][brand][fp]['name'] = fp_name_map[
                        fp]['name']
    return results
def merge_prods(self, prods, db):
    """
    Pick the primary record according to the country priority order and
    merge every per-country record of one product into a single release
    entry, which is finally inserted into products_release.
    :param prods: per-country product records (dicts) sharing one
        fingerprint.
    :param db: open database handle used for lookups and the final insert.
    """
    logger = get_logger()

    # Convert prods to unicode.
    for idx in xrange(len(prods)):
        prods[idx] = {k: unicodify(prods[idx][k]) for k in prods[idx]}

    # Pick the primary record.
    sorted_prods = sorted(prods,
                          key=lambda k: self.region_order[k['region']])
    main_entry = sorted_prods[0]
    entry = {
        k: unicodify(main_entry[k])
        for k in ('brand_id', 'model', 'name', 'description', 'details',
                  'gender', 'category', 'color', 'url', 'fingerprint')
    }
    if not entry['name']:
        entry['name'] = u'单品'
    mfashion_tags = [
        unicodify(val[0]) for val in db.query(
            str.format(
                'SELECT DISTINCT p1.tag FROM mfashion_tags AS p1 '
                'JOIN products_mfashion_tags AS p2 ON p1.idmfashion_tags=p2.id_mfashion_tags '
                'WHERE p2.idproducts IN ({0})', ','.join(
                    val['idproducts']
                    for val in prods))).fetch_row(maxrows=0)
    ]
    # # original_tags = [int(val[0]) for val in
    # db.query(str.format('SELECT DISTINCT id_original_tags FROM products_original_tags '
    # 'WHERE idproducts IN ({0})',
    # ','.join(val['idproducts'] for val in prods))).fetch_row(
    # maxrows=0)]
    entry['mfashion_tags'] = json.dumps(mfashion_tags, ensure_ascii=False)
    entry[
        'original_tags'] = ''  #json.dumps(original_tags, ensure_ascii=False)
    entry['region_list'] = json.dumps([val['region'] for val in prods],
                                      ensure_ascii=False)
    entry['brandname_e'] = info.brand_info()[int(
        entry['brand_id'])]['brandname_e']
    entry['brandname_c'] = info.brand_info()[int(
        entry['brand_id'])]['brandname_c']
    # # The earliest time the product was fetched in any country becomes
    # # the release fetch_time.
    # entry['fetch_time'] = \
    #     sorted(datetime.datetime.strptime(tmp['fetch_time'], "%Y-%m-%d %H:%M:%S") for tmp in prods)[
    #         0].strftime("%Y-%m-%d %H:%M:%S")
    url_dict = {int(val['idproducts']): val['url'] for val in prods}
    offline_dict = {
        int(val['idproducts']): int(val['offline'])
        for val in prods
    }
    price_change_dict = {
        int(val['idproducts']): val['price_change']
        for val in prods
    }
    update_time_dict = {
        int(val['idproducts']):
        datetime.datetime.strptime(val['update_time'], "%Y-%m-%d %H:%M:%S")
        for val in prods
    }
    # Map pid -> region.
    region_dict = {int(val['idproducts']): val['region'] for val in prods}

    price_list = {}
    # Merge all price-history records, keyed by pid.
    for item in db.query_match(
            ['price', 'price_discount', 'currency', 'date', 'idproducts'],
            self.price_hist, {},
            str.format('idproducts IN ({0})',
                       ','.join(val['idproducts'] for val in prods)),
            tail_str='ORDER BY idprice_history DESC').fetch_row(maxrows=0,
                                                                how=1):
        pid = int(item['idproducts'])
        region = region_dict[pid]
        offline = offline_dict[pid]
        if pid not in price_list:
            price_list[pid] = []
        price = float(item['price']) if item['price'] else None
        if offline == 0:
            price_discount = float(
                item['price_discount']) if item['price_discount'] else None
        else:
            price_discount = None
        price_list[pid].append({
            'price': price,
            'price_discount': price_discount,
            'currency': item['currency'],
            'date': datetime.datetime.strptime(item['date'],
                                               "%Y-%m-%d %H:%M:%S"),
            'price_change': price_change_dict[pid],
            'url': url_dict[pid],
            'offline': offline,
            'code': region,
            'country': info.region_info()[region]['name_c']
        })

    # Convert a value to a common currency via the configured rate.
    currency_conv = lambda val, currency: info.currency_info()[currency][
        'rate'] * val

    # Reduce price_list.
    # Strategy: if the newest record carries a normal price, use it as-is.
    # If the newest price is None, walk back to the first record whose
    # price is not None and blank its price_discount.
    # If no record has a price at all, drop the pid.
    for pid, pid_data in price_list.items():
        # Records arrive newest-first; only records carrying a price are
        # considered valid.
        # pid_data = sorted(pid_data, key=lambda val: val['date'], reverse=True)
        # Subset of pid_data that has a price.
        valid_pid_data = filter(lambda val: val['price'], pid_data)
        if pid_data[0]['price']:
            # Normal case.
            price_list[pid] = pid_data[0]
            # If there is no current discount, check whether the full price
            # was quietly lowered within the last week.
            currency = valid_pid_data[0]['currency']
            if price_change_dict[pid] == 'D' and len(
                    valid_pid_data
            ) > 1 and currency == valid_pid_data[1]['currency']:
                if not pid_data[0]['price_discount'] and currency_conv(
                        valid_pid_data[1]['price'],
                        currency) > currency_conv(
                            valid_pid_data[0]['price'],
                            currency) and (datetime.datetime.now() -
                                           valid_pid_data[0]['date']
                                           ) < datetime.timedelta(7):
                    price_list[pid]['price_discount'] = price_list[pid][
                        'price']
                    price_list[pid]['price'] = valid_pid_data[1]['price']
        else:
            # Walk back for the first record whose price is not None.
            # tmp = filter(lambda val: val['price'], pid_data)
            if not valid_pid_data:
                # No price information at all: drop this pid.
                price_list.pop(pid)
            else:
                # Take the most recent price, drop the discount, keep the
                # offline state of the newest record.
                tmp = valid_pid_data[0]
                tmp['price_discount'] = None
                price_list[pid] = tmp
        # The time of the first valid price becomes fetch_time.
        # pid_data = filter(lambda val: val['price'], sorted(pid_data, key=lambda val: val['date']))
        # pid_data = filter(lambda val: val['price'], pid_data)
        if valid_pid_data and pid in price_list:
            price_list[pid]['fetch_time'] = valid_pid_data[-1]['date']
            price_list[pid]['idproducts'] = pid

    # Without any price information the product is not released.
    if not price_list:
        return
    entry['price_list'] = sorted(
        price_list.values(),
        key=lambda val: self.region_order[val['code']])
    entry = release_filter(entry, logger)
    if not entry['price_list']:
        return
    entry['offline'] = entry['price_list'][0]['offline']
    # The model's fetch_time: the earliest fetch_time of all its pids.
    entry['fetch_time'] = min(
        tmp['fetch_time']
        for tmp in entry['price_list']).strftime("%Y-%m-%d %H:%M:%S")

    # Price-sorted list.
    alt_prices = []
    for price_item in entry['price_list']:
        # Serialise datetimes so they can be stored in the release table.
        price_item['date'] = price_item['date'].strftime(
            "%Y-%m-%d %H:%M:%S")
        price_item['fetch_time'] = price_item['fetch_time'].strftime(
            "%Y-%m-%d %H:%M:%S")
        if price_item['offline'] == 0:
            if price_item['price_discount']:
                tmp = map(
                    lambda key_name: currency_conv(price_item[key_name],
                                                   price_item['currency']),
                    ('price', 'price_discount'))
                tmp.extend([
                    price_item[key]
                    for key in ('price_change', 'price', 'price_discount',
                                'currency', 'date', 'idproducts')
                ])
                alt_prices.append(tmp)
            else:
                alt_prices.append([
                    currency_conv(price_item['price'],
                                  price_item['currency']), None,
                    price_item['price_change'], price_item['price'],
                    price_item['price_discount'], price_item['currency'],
                    price_item['date'], price_item['idproducts']
                ])
        else:
            alt_prices.append([
                currency_conv(price_item['price'],
                              price_item['currency']), None,
                price_item['price_change'], price_item['price'],
                price_item['price_discount'], price_item['currency'],
                price_item['date'], price_item['idproducts']
            ])

    # Returned price: the discount price when present, otherwise the
    # original price.
    alt_prices = sorted(alt_prices,
                        key=lambda val: val[1] if val[1] else val[0])
    entry['price'], entry['price_discount'] = alt_prices[
        0][:2] if alt_prices else (None, ) * 2
    entry['price_change'] = alt_prices[0][2] if alt_prices else '0'
    entry['o_price'], entry['o_discount'], entry[
        'o_currency'] = alt_prices[0][3:6]
    # Strip idproducts from entry['price_list'] before serialisation.
    for i in xrange(len(entry['price_list'])):
        entry['price_list'][i].pop('idproducts')
    entry['price_list'] = json.dumps(entry['price_list'],
                                     ensure_ascii=False)
    entry['last_price_ts'] = alt_prices[0][6]
    entry['product_update_ts'] = update_time_dict[
        alt_prices[0][7]].strftime("%Y-%m-%d %H:%M:%S")

    # Search fields.
    search_text = u' '.join(entry[tmp] if entry[tmp] else ''
                            for tmp in ('name', 'description', 'details',
                                        'model', 'brandname_e',
                                        'brandname_c'))
    search_color = u' '.join(entry['color']) if entry['color'] else u''
    rs = db.query_match(
        ['description_cn', 'description_en', 'details_cn', 'details_en'],
        'products_translate', {
            'fingerprint': entry['fingerprint']
        }).fetch_row()
    part_translate = u' ' + u' '.join(
        unicodify(tmp)
        for tmp in filter(lambda v: v, rs[0])) if rs else ' '
    search_tags = u' '.join(list(set(mfashion_tags)))
    entry['searchtext'] = unicode.format(u'{0} {1} {2} {3}', search_text,
                                         part_translate, search_tags,
                                         search_color)

    p = prods[0]
    checksums = []
    # Keep the checksum list unique while preserving idproducts_image
    # order.
    for tmp in db.query(
            str.format(
                '''
                SELECT p1.checksum, p3.width, p3.height, p3.path FROM products_image AS p1
                JOIN products AS p2 ON p1.fingerprint=p2.fingerprint
                JOIN images_store AS p3 ON p1.checksum=p3.checksum
                WHERE p2.fingerprint="{0}" ORDER BY p1.idproducts_image
                ''', p['fingerprint'])).fetch_row(maxrows=0, how=1):
        if tmp not in checksums:
            checksums.append(tmp)
    # Without images the product is not added to the release table yet.
    if not checksums:
        return
    image_list = []
    for val in checksums:
        tmp = {
            'path': val['path'],
            'width': int(val['width']),
            'height': int(val['height'])
        }
        if not image_list:
            # First image doubles as the cover image.
            entry['cover_image'] = json.dumps(tmp, ensure_ascii=False)
        image_list.append(tmp)
    entry['image_list'] = json.dumps(image_list[:self.max_images],
                                     ensure_ascii=False)
    db.insert(entry, 'products_release')
else: mod_name, mod_class = '.'.join(tmp[:-1]), tmp[-1] mod = __import__(mod_name, fromlist=[mod_class]) kclass = getattr(mod, mod_class) return kclass if __name__ == "__main__": ret = parse_args(sys.argv) section = ret['cmd'] if not section: section = 'CRON_TASK_DEFAULT' logger = get_logger() logger.info(str.format('TASK {0} STARTED.', section)) for task_name, task_param in getattr(glob, section, {}).items(): try: class_name = task_param['classname'] func = getattr(my_import(class_name), 'run') if 'param' in task_param: func(**task_param['param']) else: func() except (KeyError,): logger = get_logger().exception(unicode.format(u'Invalid task name: {0}', unicodify(task_name)).encode('utf-8')) logger.info(str.format('TASK {0} DONE.', section))
def sendemail(data, recipients):
    """Render the log-analysis results as an HTML table and mail them.

    :param data: mapping of log file name -> list of error dicts with keys
        'line_no', 'error_time', 'error_count', 'error_info', 'Traceback'.
    :param recipients: mapping of recipient display name -> email address.
    """
    colors = [
        '#C0C0C0', '#FFFF00', '#FAEBD7', '#7FFFD4', '#00FF00', '#CC99FF',
        '#FFCC66', '#0099FF'
    ]
    report = ''
    for log_file in data:
        # One random background colour per file so its rows group visually.
        color = random.choice(colors)
        if not data[log_file]:
            continue
        for error in data[log_file]:
            if error['Traceback']:
                report += u'<tr><td style="background-color: %s">%s</td><td>' % (
                    color, log_file) + u'</td><td>'.join([
                        str(error['line_no']), error['error_time'],
                        str(error['error_count']), error['error_info'],
                        error['Traceback'][0][4], error['Traceback'][0][5],
                        error['Traceback'][0][0], error['Traceback'][0][1],
                        error['Traceback'][0][2], error['Traceback'][0][3]
                    ]) + u'</td></tr>'
            else:
                # BUG FIX: the original tested ``x is not ''`` -- an identity
                # comparison whose result is implementation-defined for
                # strings -- instead of equality.  (An unused ``tmp_str``
                # local was also removed.)
                report += u'<tr><td style="background-color: %s">%s</td><td>' % (
                    color, log_file) + u'</td><td>'.join(
                        map(lambda x: unicode(str(x if x != '' else 'none')),
                            [error['line_no'], error['error_time'],
                             error['error_count'], error['error_info']])) + \
                    u'<td></td><td></td><td></td><td></td><td></td><td></td></tr>'
    content = u"""
        <h1>log文件分析报告</h1>
        <table cellpadding="2" cellspacing="0" border="1" bordercolor="#000000">
        <tbody>
        <tr>
        <th>log文件</th>
        <th>错误行号</th>
        <th>错误时间</th>
        <th>错误次数</th>
        <th width="50%">scrapy error</th>
        <th>Traceback ERROR</th>
        <th>Traceback ERROR INFO</th>
        <th>Traceback file</th>
        <th>Traceline</th>
        <th>Traceback function</th>
        <th>Traceback content</th>
        </tr>
        {0}
        </tbody>
        </table>
        """
    msg = MIMEText(unicode.format(content, report),
                   _subtype='html',
                   _charset='utf-8')
    # msg = MIMEMultipart('alternative')
    msg['Subject'] = u'MFashion Logs文件处理报告'
    msg['From'] = 'MStore Admin <*****@*****.**>'
    msg['To'] = ', '.join([
        unicode.format(u'{0} <{1}>', item[0], item[1])
        for item in recipients.items()
    ])
    # SECURITY NOTE: SMTP credentials are hard-coded here; they should be
    # moved to configuration / a secrets store.
    server = smtplib.SMTP_SSL('smtp.exmail.qq.com', 465)
    server.login('*****@*****.**', 'rose123')
    server.sendmail('*****@*****.**', recipients.values(), msg.as_string())
    server.quit()
def run(self):
    """Copy candidate product records from the source database into the
    destination database, normalising the text fields and keeping the
    price-history and image tables in sync.  Each product is handled in
    its own transaction; any failure rolls back and re-raises."""
    db_src = RoseVisionDb()
    db_src.conn(self.src_spec)
    db_dst = RoseVisionDb()
    db_dst.conn(self.dst_spec)

    # Candidate records.
    idproducts_list = [
        int(val[0]) for val in db_src.query(
            unicode.format(u'SELECT idproducts FROM products WHERE {0}',
                           u' AND '.join(self.cond)).encode(
                               'utf-8')).fetch_row(maxrows=0)
    ]
    self.tot = len(idproducts_list)
    self.progress = 0

    db_dst.execute('SET AUTOCOMMIT=0')
    # db_dst.execute('ALTER TABLE products DISABLE KEYS')
    for pid_src in idproducts_list:
        self.progress += 1
        record = db_src.query(
            str.format('SELECT * FROM products WHERE idproducts={0}',
                       pid_src)).fetch_row(how=1)[0]
        db_dst.start_transaction()
        try:
            rs = db_dst.query(
                str.format(
                    'SELECT idproducts FROM products WHERE brand_id={0} AND model="{1}" '
                    'AND region="{2}"', record['brand_id'],
                    record['model'], record['region']))
            pid_dst = int(
                rs.fetch_row()[0][0]) if rs.num_rows() > 0 else None
            entry = {k: record[k] for k in record if k != 'idproducts'}
            price = process_price(record['price'], record['region'])
            if price:
                entry['price_rev'] = price['price']
                entry['currency_rev'] = price['currency']
            # Normalise all free-text fields.
            if entry['details']:
                entry['details'] = self.process_text(
                    unicodify(entry['details']))
            if entry['description']:
                entry['description'] = self.process_text(
                    unicodify(entry['description']))
            if entry['name']:
                entry['name'] = self.process_text(unicodify(entry['name']))
            if entry['category']:
                entry['category'] = self.process_text(
                    unicodify(entry['category']))
            if entry['extra']:
                entry['extra'] = self.process_text(
                    unicodify(entry['extra']))
            if pid_dst:
                db_dst.update(entry, 'products',
                              str.format('idproducts={0}', pid_dst))
            else:
                db_dst.insert(entry, 'products')
                # Re-query to learn the id of the row just inserted.
                pid_dst = int(
                    db_dst.query(
                        str.format(
                            'SELECT idproducts FROM products WHERE brand_id={0} AND model="{1}" '
                            'AND region="{2}"', record['brand_id'],
                            record['model'],
                            record['region'])).fetch_row()[0][0])

            # Does the price information need processing?
            if price:
                record_price = db_dst.query(
                    str.format(
                        'SELECT price,currency FROM products_price_history '
                        'WHERE idproducts={0} ORDER BY date DESC LIMIT 1',
                        pid_dst)).fetch_row(how=1)
                if not record_price or float(record_price[0]['price']) != price['price'] or \
                        record_price[0]['currency'] != price['currency']:
                    db_dst.insert(
                        {
                            'idproducts': pid_dst,
                            'date': record['update_time'],
                            'brand_id': record['brand_id'],
                            'region': record['region'],
                            'model': record['model'],
                            'price': price['price'],
                            'currency': price['currency']
                        }, 'products_price_history')

            # Handle the image information.
            tmp = db_src.query(
                str.format(
                    'SELECT checksum,brand_id,url,path,width,height,format FROM products_image '
                    'WHERE brand_id={0} AND model="{1}"',
                    record['brand_id'],
                    record['model'])).fetch_row(maxrows=0, how=1)
            image_record = {val['checksum']: val for val in tmp}
            checksum_src = set(image_record.keys())

            # Complete images_store: any checksum not yet present there is
            # inserted.
            for checksum in checksum_src:
                if db_dst.query(
                        str.format(
                            'SELECT checksum FROM images_store WHERE checksum="{0}"',
                            checksum)).num_rows() == 0:
                    db_dst.insert(
                        {
                            'checksum': checksum,
                            'brand_id': image_record[checksum]['brand_id'],
                            'url': image_record[checksum]['url'],
                            'path': image_record[checksum]['path'],
                            'width': image_record[checksum]['width'],
                            'height': image_record[checksum]['height'],
                            'format': image_record[checksum]['format']
                        }, 'images_store')

            # Top up the destination products_image table with the missing
            # checksums.
            checksum_dst = set([
                val[0] for val in db_dst.query(
                    str.format(
                        'SELECT checksum FROM products_image WHERE brand_id={0} AND model="{1}"',
                        record['brand_id'], record['model'])).fetch_row(
                            maxrows=0)
            ])
            for checksum in checksum_src - checksum_dst:
                db_dst.insert(
                    {
                        'checksum': checksum,
                        'brand_id': record['brand_id'],
                        'model': record['model']
                    }, 'products_image')

            db_dst.commit()
        except:
            db_dst.rollback()
            raise
def fetch_price(cls, response, spider=None):
    """Dispatch price extraction depending on which Chanel URL layout the
    response matches (watch pages, ?sku= pages, /sku/ pages, fashion
    s.*.html pages).  May return a dict with 'price'/'price_discount', or
    a ProxiedRequest that fetches the price from a secondary endpoint."""
    sel = Selector(response)
    ret = {}
    response.meta['url'] = response.url
    if 'userdata' in response.meta:
        region = response.meta['userdata']['region']
    else:
        region = response.meta['region']
    # Alternation patterns covering every supported region's URL fragment
    # and watch-section term.
    region_code = '|'.join(cls.spider_data['base_url'][reg]
                           for reg in cls.get_supported_regions())
    watch_code = []
    for r in cls.get_supported_regions():
        if r in cls.spider_data['watch_term']:
            watch_code.extend(cls.spider_data['watch_term'][r])
    watch_code = '|'.join(watch_code)
    old_price = None
    new_price = None
    mt = re.search(
        unicode.format(ur'chanel\.com/({0})/({1})/.+', region_code,
                       watch_code), response.url)
    if mt:
        # Corresponds to parse_watch.
        price_url = str.format(
            'http://www-cn.chanel.com/{0}/{1}/collection_product_detail?product_id={2}&maj=price',
            cls.spider_data['base_url'][region],
            cls.spider_data['watch_term'][region][0],
            cls.fetch_model(response))
        return ProxiedRequest(url=price_url,
                              callback=cls.fetch_price_request_watch,
                              errback=spider.onerror,
                              meta=response.meta,
                              proxy_enabled=True,
                              proxy_region=region)
    else:
        mt = re.search(
            str.format(r'chanel\.com/({0})/.+\?sku=\d+$', region_code),
            response.url)
        if mt:
            # Corresponds to parse_sku1.
            # TODO The original price-extraction code for this URL type
            # could not be located.
            pass
        else:
            mt = re.search(
                str.format(r'chanel\.com/({0})/.+/sku/\d+$', region_code),
                response.url)
            if mt:
                # Corresponds to parse_sku2.
                temp = sel.xpath(
                    '//div[contains(@class, "product_detail_container")]')
                if len(temp) > 0:
                    product_name = temp[0]
                    temp = product_name.xpath(
                        './/h3[@class="product_price"]')
                    if len(temp) > 0:
                        old_price = unicodify(temp[0]._root.text)
            else:
                mt = re.search(
                    str.format(r'chanel\.com/({0})/.+(?<=/)s\.[^/]+\.html',
                               region_code), response.url)
                if mt:
                    # Fashion page: the price lives in an embedded JS
                    # 'settings' object; extract the JSON closure.
                    mt = re.search(r'var\s+settings', response.body)
                    content = cm.extract_closure(
                        response.body[mt.start():], '{', '}')[0]
                    try:
                        data = json.loads(content)
                        if 'detailsGridJsonUrl' in data['sectionCache']:
                            temp = data['sectionCache'][
                                'detailsGridJsonUrl']
                            if re.search(r'^http://', temp):
                                url = temp
                            else:
                                url = str.format(
                                    '{0}{1}',
                                    cls.spider_data['hosts'][region], temp)
                            return ProxiedRequest(
                                url=url,
                                meta=response.meta,
                                callback=cls.
                                fetch_price_request_fashion_json,
                                proxy_enabled=True,
                                proxy_region=region,
                                dont_filter=True,
                                errback=spider.onerror)
                        else:
                            return cls.fetch_price_request_fashion(
                                response.meta, data['sectionCache'],
                                spider)
                    except (KeyError, TypeError, IndexError):
                        pass
                else:
                    pass
    if old_price:
        ret['price'] = old_price
    if new_price:
        ret['price_discount'] = new_price
    return ret
def parse_json(self, metadata, json_data):
    """Interpret a per-product JSON blob: fill category tags and the main
    image URL into ``metadata``, then delegate to ``func1`` for each
    product variant, yielding whatever it produces.
    """
    if not json_data:
        self.log(str.format('INVALID JSON: {0}', metadata['url'].url),
                 log.ERROR)
        return
    for url, product_info in json_data.items():
        if url not in metadata['url']:
            # Entry belongs to a different page.
            continue
        # Category breadcrumb from navItems, de-duplicated
        # case-insensitively.
        cat_idx = 0
        seen = []
        for nav in product_info['navItems']:
            if 'title' not in nav:
                continue
            cat = unicodify(nav['title'])
            if not cat or cat.lower() in seen:
                continue
            cat_idx += 1
            seen.append(cat.lower())
            metadata['tags_mapping'][str.format('category-{0}',
                                                cat_idx)] = [{
                'name': cat.lower(),
                'title': cat
            }]
        # Main image: prefer the zoom variant, fall back to the plain one.
        image_data = product_info['data']
        href = None
        try:
            href = image_data['zoom']['imgsrc']
        except KeyError:
            if 'imgsrc' in image_data:
                href = image_data['imgsrc']
        href = self.process_image_url(href, metadata['region'])
        if href:
            metadata['image_urls'].add(href)
        metadata['modules_url'] = None
        information = product_info['data']['details']['information']
        if 'ref' in information:
            # Single-reference product.
            for val in self.func1(metadata, information):
                yield val
        else:
            # A list of groups, each holding several 'datas' variants.
            for group in information:
                group_meta = copy.deepcopy(metadata)
                for variant in group['datas']:
                    variant_meta = copy.deepcopy(group_meta)
                    for val in self.func1(variant_meta, variant):
                        yield val
def parse(self, response): self.log(unicode.format(u'PARSE_HOME: URL={0}', response.url), level=log.DEBUG) metadata = response.meta['userdata'] mt = re.search(r'www\.gucci\.com/([a-z]{2})', response.url) if mt: region = mt.group(1) sel = Selector(response) for node1 in sel.xpath( "//ul[@id='header_main']/li[contains(@class, 'mega_menu')]" ): span = node1.xpath("./span[@class='mega_link']") if len(span) == 0: continue span = span[0] inner = span.xpath('.//cufontext') if len(inner) > 0: cat = unicodify(inner[0]._root.text) else: cat = unicodify(span._root.text) if not cat: continue m = copy.deepcopy(metadata) m['tags_mapping']['category-1'] = [{ 'name': cat.lower(), 'title': cat }] gender = cm.guess_gender(cat) if gender: m['gender'] = [gender] for node2 in node1.xpath( "./div/ul/li[not(@class='mega_promo')]/a[@href]"): href = unicodify(node2._root.attrib['href']) inner = node2.xpath('.//cufontext') if len(inner) > 0: title = unicodify(inner[0]._root.text) else: title = unicodify(node2._root.text) if not title: continue else: title = title.strip() mt = re.search(ur'/([^/]+)/?$', href) if not mt: continue cat = unicodify(mt.group(1)) if not cat: continue else: cat = cat.lower() m2 = copy.deepcopy(m) m2['tags_mapping']['category-2'] = [{ 'name': cat, 'title': title }] m2['category'] = [cat] if href.find('http://') == -1: continue yield Request(url=href, meta={'userdata': m2}, callback=self.parse_category_2)
def parse_products(self, response):
    """Build a ProductItem from a product detail page: collection tag,
    model, name, description, details, price and image URLs."""
    metadata = response.meta['userdata']
    # Drop listing-page bookkeeping keys that must not leak into the item.
    for key in ('post_token', 'page_id'):
        if key in metadata:
            metadata.pop(key)
    sel = Selector(response)
    header = sel.xpath(
        '//div[@class="product-header"]//span[@class="page-product-title"]'
    )
    if header:
        collection = unicodify(header[0]._root.text)
        if collection:
            metadata['tags_mapping']['collection'] = [{
                'name': collection.lower(),
                'title': collection
            }]
    model = self.fetch_model(response)
    if not model:
        # Without a model number the record is unusable.
        return
    metadata['model'] = model
    if not metadata.get('name'):
        name = self.fetch_name(response)
        if name:
            metadata['name'] = name
    description = self.fetch_description(response)
    if description:
        metadata['description'] = description
    details = self.fetch_details(response)
    if details:
        metadata['details'] = details
    price_info = self.fetch_price(response)
    if 'price' in price_info:
        metadata['price'] = price_info['price']
    if 'price_discount' in price_info:
        metadata['price_discount'] = price_info['price_discount']
    image_nodes = sel.xpath(
        '//div[@class="column-images"]//a[@href and contains(@class,"zoom-trigger-link")]'
    )
    image_urls = [
        self.process_href(node._root.attrib['href'], response.url)
        for node in image_nodes
    ]
    metadata['url'] = response.url
    item = ProductItem()
    item['image_urls'] = image_urls
    item['url'] = metadata['url']
    item['model'] = metadata['model']
    item['metadata'] = metadata
    return item
def parse_sku1(self, response):
    """Parse a Chanel SKU page addressed as ...?sku=<digits>.

    Derives the region from the URL path prefix, collects category tags
    and gender, assembles the name from the productName block, extracts
    description/details by matching tab headers, and yields a ProductItem.
    """
    self.log(str.format('PARSE_SKU1: {0}', response.url), level=log.DEBUG)
    mt = re.search(r'chanel\.com/([^/]+)/', response.url)
    region = None
    # Map the URL path prefix back to a region code via base_url.
    for a, b in self.spider_data['base_url'].items():
        if b == mt.group(1):
            region = a
            break
    if not region:
        return
    mt = re.search(r'\?sku=(\d+)$', response.url)
    if not mt:
        return
    model = mt.group(1)
    metadata = {
        'region': region,
        'brand_id': self.spider_data['brand_id'],
        'model': model,
        'url': response.url,
        'tags_mapping': {},
        'category': set([])
    }
    sel = Selector(response)
    cat_idx = 0
    cat_list = []
    # Tracking spans carry the category breadcrumb; deduplicate and number
    # them as category-1, category-2, ...
    for node in sel.xpath(
            '//div[contains(@class,"trackingSettings")]/span[@class]'):
        cat = unicodify(node._root.text)
        if not cat:
            continue
        #if node._root.attrib['class'] == 'WT_cg_s':
        #    if 'category' not in metadata:
        #        metadata['category'] = set([])
        #    metadata['category'].add(cat.lower())
        if cat.lower() in cat_list:
            continue
        cat_idx += 1
        cat_list.append(cat.lower())
        metadata['tags_mapping'][str.format('category-{0}', cat_idx)] = [{
            'name': cat.lower(),
            'title': cat
        }]
        gender = cm.guess_gender(cat)
        if gender:
            if 'gender' not in metadata:
                metadata['gender'] = set([])
            metadata['gender'].add(gender)
    temp = sel.xpath('//div[@class="productName"]')
    name_list = []
    if len(temp) > 0:
        product_name = temp[0]
        # Family line, e.g. the collection family text plus descendants.
        temp = product_name.xpath(
            './h1[@class="family"]/span[@class="familyText"]')
        if len(temp) > 0:
            name = unicodify(temp[0]._root.text)
            if name:
                name_list.append(name)
            name = u', '.join([
                unicodify(val.text)
                for val in temp[0]._root.iterdescendants()
                if val.text and val.text.strip()
            ])
            if name:
                name_list.append(name.strip())
        temp = product_name.xpath('./h2[@class="name"]')
        if len(temp) > 0:
            name = unicodify(temp[0]._root.text)
            if name:
                name_list.append(name)
            name = u', '.join([
                unicodify(val.text)
                for val in temp[0]._root.iterdescendants()
                if val.text and val.text.strip()
            ])
            if name:
                name_list.append(name.strip())
    name = u' - '.join(name_list)
    metadata['name'] = name if name else None
    # Description and details
    temp = sel.xpath('//div[@class="tabHolderFullWidth tabHolder"]')
    if len(temp) > 0:
        content_node = temp[0]
        content_map = {}
        # Match tab headers against the configured description/details
        # header strings; the a/@rel attribute names the content div.
        for node in content_node.xpath('./div[@class="tabs"]//a[@rel]'):
            temp = unicodify(node._root.text)
            if temp and temp in self.spider_data['description_hdr']:
                content_map['description'] = node._root.attrib['rel']
            if temp and temp in self.spider_data['details_hdr']:
                content_map['details'] = node._root.attrib['rel']
        for term in ('description', 'details'):
            if term in content_map:
                temp = content_node.xpath(
                    str.format('./div[@id="{0}"]', content_map[term]))
                if len(temp) > 0:
                    content_list = []
                    content = unicodify(temp[0]._root.text)
                    if content:
                        content_list.append(content)
                    content_list.extend([
                        unicodify(val.text)
                        for val in temp[0]._root.iterdescendants()
                        if val.text and val.text.strip()
                    ])
                    metadata[term] = u', '.join(content_list)
    # Images
    # image_urls = []
    # for node in hxs.select('//div[@class="major productImg"]/img[@src]'):
    #     href = node._root.attrib['src']
    #     if re.search(r'^http://', href):
    #         image_urls.append(href)
    #     else:
    #         image_urls.append(str.format('{0}/{1}', self.spider_data['host'], href))
    # image_urls = list(set([re.sub(r'\.+', '.', val) for val in image_urls]))
    image_urls = list(
        set(
            cm.norm_url(node._root.attrib['src'],
                        self.spider_data['base_url'])
            for node in sel.xpath(
                '//div[@class="major productImg"]/img[@src]')
            if node._root.attrib['src'] and node._root.attrib['src'].strip()))
    if 'color' in metadata:
        metadata['color'] = list(metadata['color'])
    if 'gender' in metadata:
        metadata['gender'] = list(metadata['gender'])
    #metadata['category'] = list(metadata['category'])
    if 'model' in metadata:
        item = ProductItem()
        item['image_urls'] = image_urls
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        item['metadata'] = metadata
        yield item
def parse(self, response):
    """Entry parser: walk the category navigation and yield one request per
    second-level category; a separate navigation path handles the US site.
    """
    metadata = response.meta['userdata']
    sel = Selector(response)
    for node1 in sel.xpath(
            '//nav/ul/li[@class="category-parent"]/a[@href]'):
        tag_text = self.reformat(unicodify(node1._root.text))
        if not tag_text:
            continue
        m1 = copy.deepcopy(metadata)
        m1['tags_mapping']['category-0'] = [{
            'name': tag_text.lower(),
            'title': tag_text
        }]
        m1['category'] = [tag_text]
        for node2 in node1.xpath('../ul/li/a[@href]'):
            tag_text = self.reformat(unicodify(node2._root.text))
            if not tag_text:
                continue
            m2 = copy.deepcopy(m1)
            m2['tags_mapping']['category-1'] = [{
                'name': tag_text.lower(),
                'title': tag_text
            }]
            yield Request(url=self.process_href(node2._root.attrib['href'],
                                                response.url),
                          callback=self.parse_cat,
                          errback=self.onerr,
                          dont_filter=True,
                          meta={
                              'userdata': m2,
                              'cat-level': 0
                          })
    # For the US official site
    nav_nodes = sel.xpath(
        '//div[@id="siloheader"]/div[@id="menusilo"]/div/ul/li/a[@href][text()]'
    )
    for node in nav_nodes:
        try:
            tag_text = node.xpath('./text()').extract()[0]
            tag_text = self.reformat(tag_text)
            tag_name = tag_text.lower()
        except (TypeError, IndexError):
            continue
        if tag_text and tag_name:
            m = copy.deepcopy(metadata)
            m['tags_mapping']['category-0'] = [
                {
                    'name': tag_name,
                    'title': tag_text,
                },
            ]
            gender = cm.guess_gender(tag_name)
            if gender:
                m['gender'] = [gender]
            try:
                href = node.xpath('./@href').extract()[0]
                href = self.process_href(href, response.url)
                href = self.process_href_for_us(href)
            except (TypeError, IndexError):
                continue
            yield Request(
                url=href,
                callback=self.parse_cat,
                errback=self.onerr,
                meta={'userdata': m},
            )
def parse_cat(self, response):
    """Parse a category page.

    Non-US regions: at cat-level 0, descend one more category level;
    otherwise (leaf level) yield one detail request per product.
    US region: walk the catalog navigation and hand off to parse_cat2_us.
    """
    metadata = response.meta['userdata']
    sel = Selector(response)
    region = metadata['region']
    if region != 'us':
        cat_level = response.meta['cat-level']
        node_list = []
        if cat_level == 0:
            node_list = sel.xpath(
                '//ul[@class="product-categories"]/ul/li/a[@href]')
        for node in node_list:
            # There is a further sub-category level.
            tag_text = self.reformat(unicodify(node._root.text))
            if not tag_text:
                continue
            m = copy.deepcopy(metadata)
            m['tags_mapping']['category-2'] = [{
                'name': tag_text.lower(),
                'title': tag_text
            }]
            yield Request(url=self.process_href(node._root.attrib['href'],
                                                response.url),
                          callback=self.parse_cat,
                          errback=self.onerr,
                          dont_filter=True,
                          meta={
                              'userdata': m,
                              'cat-level': 1
                          })
        if not node_list:
            # Leaf category: no sub-categories, yield every product here.
            for node in sel.xpath(
                    '//ul[@id="list-content"]/li[contains(@class,"item")]/a[@href]'
            ):
                m = copy.deepcopy(metadata)
                # tmp = node.xpath('./span[@class="product-name"]')
                # if tmp:
                #     m['name'] = self.reformat(unicodify(tmp[0]._root.text))
                # tmp = node.xpath('.//span[@class="price"]')
                # if tmp:
                #     m['price'] = self.reformat(unicodify(tmp[0]._root.text))
                yield Request(url=self.process_href(
                    node._root.attrib['href'], response.url),
                              dont_filter=True,
                              callback=self.parse_details,
                              errback=self.onerr,
                              meta={'userdata': m})
    else:
        catalognav_nodes = sel.xpath(
            '//div[@id="template"]/div[@class="catalognav"]/ul/li//a[@href][text()]'
        )
        for node in catalognav_nodes:
            try:
                tag_text = node.xpath('./text()').extract()[0]
                tag_text = self.reformat(tag_text)
                tag_name = tag_text.lower()
            except (TypeError, IndexError):
                continue
            if tag_text and tag_name:
                m = copy.deepcopy(metadata)
                # Fix: key was misspelled 'catagory-1'; every other tag key
                # in this spider uses the 'category-<n>' spelling.
                m['tags_mapping']['category-1'] = [
                    {
                        'name': tag_name,
                        'title': tag_text,
                    },
                ]
                gender = cm.guess_gender(tag_name)
                if gender:
                    m['gender'] = [gender]
                try:
                    href = node.xpath('./@href').extract()[0]
                    href = self.process_href(href, response.url)
                    href = self.process_href_for_us(href)
                except (TypeError, IndexError):
                    continue
                yield Request(url=href,
                              callback=self.parse_cat2_us,
                              errback=self.onerr,
                              meta={'userdata': m})
def parse_details(self, response):
    """Parse a single product under a "collection".

    @param response:
    """
    metadata = response.meta['userdata']
    sel = Selector(response)
    try:
        model = sel.xpath(
            '//div[@id="product-detail"]/div[@class="inner-detail"]//*[@class="reference-number"]/'
            'text()').extract()[0]
        if not model:
            return
        metadata['model'] = model
    except IndexError:
        return
    metadata['url'] = unicodify(response.url)
    if 'name' not in metadata or not metadata['name']:
        tmp = sel.xpath(
            '//div[@id="product-detail"]/div[@class="inner-detail"]//*[@class="format"]'
            '/text()').extract()
        if tmp:
            metadata['name'] = self.reformat(unicodify(tmp[0]))
    # Colors: fan out to the other color variants of this product.
    sub_products = sel.xpath(
        '//div[@id="product-detail"]/div[@class="inner-detail"]//ul[@class="color-list"]'
        '/li/a[@href]/@href').extract()
    for href in sub_products:
        # Skip the variant that is the current page itself.
        if href in response.url:
            continue
        yield Request(url=self.process_href(href, response.url),
                      callback=self.parse_details,
                      errback=self.onerr,
                      meta={'userdata': copy.deepcopy(metadata)})
    try:
        metadata['description'] = self.reformat(
            unicodify(
                sel.xpath('//div[@id="tabs-product-detail-overview"]'
                          '/div[@class="product-detail-tab-content"]'
                          '/p[@class="slide-paragraph"]/text()').extract()[0]))
    except IndexError:
        pass
    # Specification rows are joined as "key: value" lines separated by \r.
    details_nodes = sel.xpath(
        '//div[@id="tabs-product-detail-specification"]/'
        'div[@class="product-detail-tab-content"]//li/span[@class="tooltip" or '
        '@class="title"]/..')
    details = self.reformat(
        unicodify('\r'.join(': '.join(node.xpath('*/text()').extract())
                            for node in details_nodes)))
    if details:
        metadata['details'] = details
    image_urls = [
        self.process_href(val, response.url) for val in sel.xpath(
            '//div[@id="product-gallery"]/div[@class="product-gallery-part"]'
            '/div[contains(@class,"positioned-product")]/img[@src]/@src').
        extract()
    ]
    image_urls.extend([
        self.process_href(val, response.url) for val in sel.xpath(
            '//div[@id="product-gallery"]/div[@class="product-gallery-part"]'
            '/img[@src]/@src').extract()
    ])
    item = ProductItem()
    item['image_urls'] = image_urls
    item['url'] = metadata['url']
    item['model'] = metadata['model']
    item['metadata'] = metadata
    yield item
def fetch_product_details(region, url, filter_data, download_image=True, extra=None):
    """Fetch the details of a single product and push the resulting item
    through the product (and optionally image) pipelines.

    @param region: region code, key into hosts['url_host']
    @param url: product path relative to the region host
    @param filter_data: dict with 'post_data' (facet form values) and 'tags'
    @param download_image: when True, also fetch product images
    @param extra: unused -- the original code unconditionally overwrites it
                  below; kept only for interface compatibility
    @return: the ProductItem, or None when the page cannot be parsed
    """
    product_url = hosts['url_host'][region] + url
    response = cm.retry_helper(lambda val: cm.get_data(url=val, client='iPad'),
                               param=product_url,
                               logger=logger,
                               except_class=(URLError, socket.timeout),
                               retry_delay=10)
    if response is None:
        return
    body = response['body']
    if not body:
        return
    # Model number
    model = None
    try:
        temp = pq(body)('div.sku')
    except ParserError:
        return
    if len(temp) > 0:
        mt = re.search(details_pattern['model_pattern'][region],
                       temp[0].text.encode('utf-8'), re.M | re.I)
        if mt:
            model = mt.group(1)
        else:
            # Fallback pattern: one letter followed by five digits.
            mt = re.search(r'[a-zA-Z]\d{5}', temp[0].text.encode('utf-8'))
            if mt:
                model = mt.group()
    if model is None:
        return None
    temp = pq(body)('td.priceValue')
    price = unicodify(temp[0].text) if temp else None
    product_name = ''
    temp = pq(body)('#productName h1')
    if temp:
        product_name = unicodify(temp[0].text)
    description = ''
    temp = pq(body)('#productDescription')
    if temp:
        description = unicodify(temp[0].text)
    details = ''
    temp = pq(body)('#productDescription div.productDescription')
    if temp:
        details = reformat(unicodify(temp[0].text_content()))
    post_data = filter_data['post_data']
    init_data = {}
    temp = unicodify(post_data[
        "/vuitton/ecommerce/commerce/catalog/FindProductsFormHandler.facetValues.color"]
    )
    init_data['color'] = [temp] if temp else []
    # Collect the tag facets from the POST form data.
    extra = {}
    temp = unicodify(post_data[
        "/vuitton/ecommerce/commerce/catalog/FindProductsFormHandler.facetValues.lineik"]
    )
    if temp:
        extra['texture'] = [temp]
    temp = unicodify(post_data[
        "/vuitton/ecommerce/commerce/catalog/FindProductsFormHandler.pageId"])
    if temp:
        extra['category-0'] = [temp]
    temp = unicodify(post_data[
        "/vuitton/ecommerce/commerce/catalog/FindProductsFormHandler.facetValues.functionik"]
    )
    if temp:
        extra['function'] = [temp]
    temp = unicodify(post_data[
        "/vuitton/ecommerce/commerce/catalog/FindProductsFormHandler.facetValues.casematerialik"]
    )
    if temp:
        extra['material'] = [temp]
    temp = unicodify(post_data[
        "/vuitton/ecommerce/commerce/catalog/FindProductsFormHandler.facetValues.collectionik"]
    )
    if temp:
        extra['collection'] = [temp]
    temp = unicodify(post_data[
        "/vuitton/ecommerce/commerce/catalog/FindProductsFormHandler.facetValues.shapeik"]
    )
    if temp:
        extra['shape'] = [temp]
    temp = unicodify(post_data[
        "/vuitton/ecommerce/commerce/catalog/FindProductsFormHandler.facetValues.subcategoryik"]
    )
    if temp:
        extra['category-1'] = [temp]
    temp = unicodify(post_data[
        '/vuitton/ecommerce/commerce/catalog/FindProductsFormHandler.facetValues.subsubcategoryik']
    )
    if temp:
        extra['category-2'] = [temp]
    temp = unicodify(post_data[
        "/vuitton/ecommerce/commerce/catalog/FindProductsFormHandler.facetValues.typeik"]
    )
    if temp:
        extra['typeik'] = [temp]
    init_data['tags_mapping'] = {
        k: [{
            'name': val.lower(),
            'title': val
        } for val in extra[k]]
        for k in extra
    }
    init_data['model'] = model
    init_data['name'] = product_name
    init_data['price'] = price
    init_data['description'] = description
    init_data['details'] = details
    temp = unicodify(filter_data['tags']['category'])
    init_data['category'] = [temp] if temp else []
    init_data['brand_id'] = filter_data['tags']['brand_id']
    temp = filter_data['tags']['gender']
    if temp.lower() in ('women', 'woman', 'femme', 'donna', 'damen', 'mujer',
                        'demes', 'vrouw', 'frauen', 'womenswear'):
        init_data['gender'] = ['female']
    elif temp.lower() in ('man', 'men', 'homme', 'uomo', 'herren', 'hombre',
                          'heren', 'mann', 'signore', 'menswear'):
        init_data['gender'] = ['male']
    region = filter_data['tags']['region']
    init_data['region'] = region
    init_data['url'] = product_url
    # Removed dead leftover debug call process_price(u'2 350,00 \u20ac', 'fr')
    # and the unused 'product = init_data' assignment.
    if download_image:
        results = fetch_image(body, model)
    else:
        results = []
    item = ProductItem()
    item['image_urls'] = []
    item['url'] = init_data['url']
    item['model'] = init_data['model']
    item['metadata'] = init_data
    # NOTE(review): 'spider' is not defined in this function; presumably a
    # module-level global -- confirm.
    product_pipeline.process_item(item, spider)
    image_pipeline.item_completed(results, item, None)
    return item
def parse_cat_0(self, response):
    """Parse a top-level collection page: either jump straight to a
    sliding-background list page (MINI-BAG style) or expand the department
    and category links for the collection matching the current URL.

    Returns a single Request, a list of Requests, or None.
    """
    metadata = response.meta['userdata']
    sel = Selector(response)
    # MINI-BAG
    temp = sel.xpath(
        '//article[contains(@class,"sliding-backgrounds")]//a[@href and contains(@class,"background")]'
    )
    if temp:
        return Request(url=self.process_href(temp[0]._root.attrib['href'],
                                             response.url),
                       callback=self.parse_list,
                       meta={'userdata': metadata},
                       errback=self.onerr)
    node = None
    temp = sel.xpath(
        '//div[@class="menu"]/ul[@class="collections"]/li[contains(@class,"collection")]/'
        'div[contains(@class,"name")]/a[@href]')
    if temp:
        # Find the menu entry whose link resolves to the current page.
        for temp1 in temp:
            if self.process_href(temp1._root.attrib['href'],
                                 response.url) == response.url:
                node = temp1
                break
    if not node:
        return None
    ret = []
    for node1 in node.xpath(
            '../../ul[contains(@class,"departments")]/li[contains(@class,"department")]/div/a[@href]'
    ):
        m1 = copy.deepcopy(metadata)
        href = node1._root.attrib['href']
        # The last URL path segment is used as the tag name.
        mt = re.search('/([^/]+)$', href)
        if mt:
            tag_name = unicodify(mt.group(1)).lower()
            tag_text = unicodify(
                node1._root.text).lower() if node1._root.text else tag_name
            m1['tags_mapping']['category-1'] = [{
                'name': tag_name,
                'title': tag_text
            }]
        # Is there a sub-category level?
        for node2 in node1.xpath(
                '../../ul[contains(@class,"categories")]/li[contains(@class,"category")]//a[@href]'
        ):
            m2 = copy.deepcopy(m1)
            href = node2._root.attrib['href']
            mt = re.search('/([^/]+)$', href)
            if mt:
                tag_name = unicodify(mt.group(1))
                tag_text = unicodify(
                    node2._root.text) if node2._root.text else tag_name
                m2['tags_mapping']['category-2'] = [{
                    'name': tag_name,
                    'title': tag_text
                }]
            ret.append(
                Request(url=self.process_href(href, response.url),
                        meta={'userdata': m2},
                        callback=self.parse_list,
                        errback=self.onerr))
    return ret
def parse_details(self, response):
    """Parse a product detail page: collect title, description and zoom-in
    image URLs, then request the style JSON for the dynamic content."""
    self.log(unicode.format(u'PARSE_DETAILS: URL={0}', response.url),
             level=log.DEBUG)
    metadata = response.meta['userdata']
    sel = Selector(response)
    title = None
    node = sel.xpath(
        '//section[@id="column_description"]//span[@class="container_title"]/h1/span'
    )
    if len(node) > 0:
        node = node[0]
        # Cufon-rendered titles are split across <cufontext> nodes.
        inner = node.xpath('.//cufontext')
        if len(inner) == 0:
            title = unicodify(node._root.text)
        else:
            title = u''.join(val._root.text for val in inner
                             if val._root.text)
    node = sel.xpath(
        '//div[@id="accordion_left"]//div[@id="description"]//ul/li')
    desc = u'\n'.join(
        unicodify(val._root.text) for val in node if val._root.text)
    node = sel.xpath(
        '//div[@id="zoom_in_window"]/div[@class="zoom_in"]/img[@src]')
    if len(node) > 0:
        href = node[0]._root.attrib['src']
        image_base = os.path.split(href)[0]
        node_list = sel.xpath(
            '//div[@id="zoom_tools"]/ul[@id="view_thumbs_list"]/li/img[@src]'
        )
        image_list = set([])
        for node in node_list:
            href = node._root.attrib['src']
            pic_name = os.path.split(href)[1]
            idx = pic_name.find('web_variation')
            if idx == -1:
                continue
            # Thumbnails reference *web_variation* images; swap in the
            # corresponding *web_zoomin* (high-resolution) file name.
            pic_name = pic_name.replace('web_variation.', 'web_zoomin.')
            image_list.add(str.format('{0}/{1}', image_base, pic_name))
        metadata['image_urls'] = image_list
    if title:
        metadata['name'] = title
    if desc:
        metadata['description'] = desc
    metadata['url'] = response.url
    # The style id is the last segment of the product URL.
    style_id = os.path.split(response.url)[1]
    url = str.format(
        '{0}/{1}/styles/{2}/load_style.js',
        self.spider_data['hosts'][metadata['region']],
        'ca-en' if metadata['region'] == 'ca' else metadata['region'],
        style_id)
    metadata['dynamic_url'] = response.url + '/2/populate_dynamic_content'
    return Request(
        url=url,
        meta={'userdata': metadata},
        callback=self.parse_style,
        dont_filter=True,
        headers={'Accept': 'application/json, text/javascript, */*'})
def run(self):
    """Export product / price-history / tag data for each brand into a
    UTF-8 CSV file (with BOM) named extract_<timestamp>.csv."""
    db = RoseVisionDb()
    db.conn(getattr(gs, 'DATABASE')['DB_SPEC'])
    # If no brand_list was specified, default to every brand found in the
    # products table.
    if not self.brand_list:
        rs = db.query_match(['brand_id'], 'products', distinct=True)
        brand_list = [int(val[0]) for val in rs.fetch_row(maxrows=0)]
        self.brand_list = brand_list
    else:
        brand_list = self.brand_list
    self.progress = 0
    self.tot = len(brand_list)
    # Rows of the final spreadsheet.
    tot_results = []
    for brand in brand_list:
        results = {}
        print unicode.format(u'PROCESSING {0} / {1}', brand,
                             info.brand_info()[brand]['brandname_e'])
        brand_name = info.brand_info()[brand]['brandname_e']
        self.progress += 1
        rs = db.query(
            str.format(
                '''SELECT p1.idproducts,p1.brand_id,p1.model,p1.region,p2.price,p2.price_discount,p2.currency,p2.date,p1.name,p4.tag,p1.url FROM products AS p1 JOIN products_price_history AS p2 ON p1.idproducts=p2.idproducts LEFT JOIN products_mfashion_tags AS p3 ON p3.idproducts=p1.idproducts LEFT JOIN mfashion_tags AS p4 ON p3.id_mfashion_tags=p4.idmfashion_tags WHERE p1.brand_id={0} AND p1.offline=0''',
                brand))
        records = rs.fetch_row(maxrows=0, how=1)
        for r in records:
            pid = int(r['idproducts'])
            timestamp = datetime.datetime.strptime(r['date'],
                                                   '%Y-%m-%d %H:%M:%S')
            tag = unicodify(r['tag'])
            if pid in results:
                # A record for this product already exists: merge the tag
                # and keep the most recent price information.
                old_rec = results[pid]
                old_rec['tag'].add(tag)
                old_t = datetime.datetime.strptime(old_rec['date'],
                                                   '%Y-%m-%d %H:%M:%S')
                if timestamp > old_t:
                    old_rec['price'] = unicodify(r['price'])
                    old_rec['price_discount'] = unicodify(
                        r['price_discount'])
                    old_rec['currency'] = unicodify(r['currency'])
                    old_rec['date'] = unicodify(r['date'])
            else:
                # First record seen for this product.
                results[pid] = {k: unicodify(r[k]) for k in r}
                tmp = results[pid]['tag']
                if tmp:
                    results[pid]['tag'] = {tmp}
                else:
                    results[pid]['tag'] = set({})
                results[pid]['brand'] = brand_name
                results[pid].pop('idproducts')
        tot_results.extend(self.random_extract(results.values()))
    db.close()
    # Serialize every tag set as a JSON list.
    data = []
    for r in tot_results:
        r['tag'] = json.dumps(list(r['tag']), ensure_ascii=False)
        data.append(
            {k: r[k].encode('utf-8') if r[k] else 'NULL' for k in r})
    # Write the CSV file (UTF-8 BOM so spreadsheet apps decode it).
    with open(
            str.format('extract_{0}.csv',
                       datetime.datetime.now().strftime('%Y%m%d%H%M%S')),
            'wb') as f:
        f.write(u'\ufeff'.encode('utf8'))
        dict_writer = csv.DictWriter(f,
                                     fieldnames=[
                                         'brand_id', 'brand', 'model',
                                         'region', 'price',
                                         'price_discount', 'currency',
                                         'date', 'name', 'tag', 'url'
                                     ])
        dict_writer.writeheader()
        dict_writer.writerows(data)
def parse_details(self, response):
    """Parse a product detail page; also fan out to the product's other
    color variants before emitting the ProductItem."""
    self.log(unicode.format(u'PARSE_DETAILS: URL={0}',
                            response.url).encode('utf-8'),
             level=log.DEBUG)
    metadata = response.meta['userdata']
    hxs = Selector(response)
    # Visit the other color versions of this product.
    ret = hxs.xpath(
        "//div[contains(@class,'colors')]/ul[contains(@class,'color-set')]"
        "/li[contains(@class,'color') and not(contains(@class,'color-selected'))]"
        "/a[@title and @data-color-link]")
    for node in ret:
        m = copy.deepcopy(metadata)
        m['color'] = [
            self.reformat(unicodify(
                node.xpath('@title').extract()[0])).lower()
        ]
        url = self.process_href(
            node.xpath('@data-color-link').extract()[0], response.url)
        m['url'] = url
        yield Request(url=url,
                      callback=self.parse_details,
                      errback=self.onerr,
                      meta={'userdata': m})
    colors = self.fetch_color(response)
    if colors:
        metadata['color'] = colors
    model = self.fetch_model(response)
    if model:
        metadata['model'] = model
    else:
        # Without a model number the item cannot be identified; skip it.
        return
    ret = self.fetch_price(response)
    if 'price' in ret:
        metadata['price'] = ret['price']
    if 'price_discount' in ret:
        metadata['price_discount'] = ret['price_discount']
    description = self.fetch_description(response)
    if description:
        metadata['description'] = description
    detail = self.fetch_details(response)
    if detail:
        metadata['details'] = detail
    name = self.fetch_name(response)
    if name:
        metadata['name'] = name
    # Only emit items that carry the full name/details/description set.
    if 'name' in metadata and 'details' in metadata and 'description' in metadata:
        ret = hxs.xpath(
            "//div[@class='product_detail_container']/div[@class='product_viewer']"
            "//ul[@class='product-media-set']/li[@class='product-image']/img[@src]/@src"
        ).extract()
        image_urls = [self.process_href(val, response.url) for val in ret]
        item = ProductItem()
        item['image_urls'] = image_urls
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        item['metadata'] = metadata
        yield item
    else:
        self.log(
            unicode.format(u'INVALID ITEM: {0}',
                           metadata['url']).encode('utf-8'), log.ERROR)
def process_price(price, region, decimal=None, currency=None):
    """Parse a raw price string into {'currency': ..., 'price': float}.

    @param price: raw price text (an int/float is also accepted)
    @param region: region code used to fall back to a default currency
    @param decimal: decimal separator, if already known
    @param currency: currency code, if already known
    @return: dict with 'currency' and 'price', or None when unparsable
    """

    def func(val):
        """Strip spaces/apostrophes and one leading/trailing separator
        from a candidate number string.
        :param val:
        """
        # val=unicode.format(u' {0} ',)
        if not re.search(r'\d', val):
            return ''
        val = re.sub(r"[\s']", '', val, flags=re.U)
        if val[0] in ('.', ','):
            val = val[1:]
        if val[-1] in ('.', ','):
            val = val[:-1]
        return val

    # Fix: convert numeric input to text BEFORE any string operation; the
    # original called price.strip()/price.lower() first, which raised
    # AttributeError for int/float arguments.
    if isinstance(price, int) or isinstance(price, float):
        price = unicode(price)
    if not price or not price.strip():
        return None
    # Words such as appel/call mean "call for price" -- no price info.
    for term in ['appel', 'call', 'appelez', 'chiamare']:
        if term in price.lower():
            return None
    val = unicode.format(u' {0} ', unicodify(price))
    if not currency:
        # No explicit currency: try to extract one from the price text.
        currency = guess_currency(price, region=region)
        if not currency:
            # Fall back to the region's default currency.
            currency = info.region_info()[region]['currency']
    # Pick the longest run of digits/separators as the numeric candidate.
    tmp = sorted([
        func(tmp)
        for tmp in re.findall(r"(?<=[^\d])[\d\s,'\.]+(?=[^\d])", val,
                              flags=re.U)
    ],
                 key=lambda tmp: len(tmp),
                 reverse=True)
    if not tmp:
        return None
    # Trim leading/trailing separators from the best candidate.
    # Fix: the original re-ran `tmp = tmp[0].strip()` on every loop pass,
    # which collapsed the string to its first character after any trim.
    tmp = tmp[0].strip()
    while True:
        if not tmp:
            return None
        elif tmp[0] in ('.', ','):
            tmp = tmp[1:]
            continue
        elif tmp[-1] in ('.', ','):
            tmp = tmp[:-1]
            continue
        break
    if re.search(r'^0+', tmp):
        return None
    # Determine the decimal separator.
    # Rule: if both , and . occur, whichever comes last is the decimal
    # point; with only one symbol, check whether it sits on 3-digit
    # grouping positions (then it is a thousands separator).
    if decimal:
        pass
    elif (tmp.count('.') > 0 and tmp.count(',') == 1) or (
            tmp.count(',') > 0 and tmp.count('.') == 1):
        decimal = re.search(r'[\.,]', tmp[::-1]).group()
    elif (tmp.count('.') | tmp.count(',')) and not (tmp.count('.')
                                                    & tmp.count(',')):
        # Only one kind of symbol appears.
        c = re.search(r'[\.,]', tmp).group()
        # Separator positions; gaps of 4 mean thousands grouping.
        pos = [val.start() for val in re.finditer(r'[\.,]', tmp)]
        pos.append(len(tmp))
        is_triple = reduce(lambda ret, val: ret and (val == 4),
                           [pos[i + 1] - pos[i]
                            for i in xrange(len(pos) - 1)], True)
        if is_triple:
            decimal = list({',', '.'} - {c})[0]
        else:
            if tmp.count(c) == 1:
                decimal = c
            else:
                decimal = None
    elif tmp.count('.') == 0 and tmp.count(',') == 0:
        decimal = '.'
    else:
        decimal = None
    if not decimal:
        return None
    part = tmp.split(decimal)
    if len(part) == 1:
        part = part[0], '0'
    try:
        val = int(re.sub(r'[\.,]', '', part[0])) + float(
            '.' + re.sub(r'[\.,]', '', part[1]))
    except (TypeError, ValueError):
        return None
    return {'currency': currency, 'price': val}
def parse_list(self, response):
    """Parse a product-list page (plain HTML on page 0, AJAX JSON payload
    afterwards); yield product-detail requests and, when products were
    found, the next paging request."""
    metadata = response.meta['userdata']
    # self.log(unicode.format(u'PROCESSING {0} -> {1} -> PAGE {2}: {3}', metadata['extra']['category-0'][0],
    #                         metadata['extra']['category-1'][0], metadata['page_id'], response.url).encode('utf-8'),
    #          log.DEBUG)
    if metadata['page_id'] == 0:
        sel = Selector(response)
    else:
        try:
            text = json.loads(response.body)['cartierFoAjaxSearch']['data']
            sel = Selector(text=text)
        except (ValueError, KeyError, TypeError):
            # Parse error: treat the body as plain HTML.
            sel = Selector(response)
            # metadata['page_id'] = 0
    if sel.xpath(
            '//div[@class="product-header"]//span[@class="page-product-title"]'
    ):
        # This is actually a single-product page.
        yield self.parse_products(response)
    else:
        flag = False
        for node in sel.xpath(
                '//div[contains(@class,"hover-info")]/a[@href]/div[@class="model-info"]'
        ):
            m = copy.deepcopy(metadata)
            temp = node.xpath('./div[@class="model-name"]')
            if not temp:
                continue
            m['name'] = unicodify(temp[0]._root.text)
            temp = node.xpath('./div[@class="model-description"]')
            if not temp:
                continue
            m['description'] = unicodify(temp[0]._root.text)
            flag = True
            yield Request(url=self.process_href(
                node.xpath('..')[0]._root.attrib['href'], response.url),
                          meta={'userdata': m},
                          callback=self.parse_products,
                          errback=self.onerr,
                          dont_filter=True)
        if flag:
            # Handle pagination.
            post_token = metadata[
                'post_token'] if 'post_token' in metadata else None
            if not post_token:
                # Derive the paging token from the body's
                # page-navigation-* CSS class.
                temp = sel.xpath(
                    '//body[contains(@class, "html") and contains(@class, "page-navigation")]'
                )
                if temp:
                    temp = filter(
                        lambda val: re.search('^page-navigation-(.+)', val),
                        re.split(r'\s+', temp[0]._root.attrib['class']))
                    if temp:
                        post_token = re.search('^page-navigation-(.+)',
                                               temp[0]).group(1).replace(
                                                   '-', '_')
            if post_token:
                m = copy.deepcopy(metadata)
                m['page_id'] += 1
                # The paging endpoint returns data even with empty
                # facetsajax/limit values (the site logic may have
                # changed), so cap the page count here.
                if m['page_id'] > 5:
                    return
                m['post_token'] = post_token
                body = {
                    'facetsajax': 'true',
                    'limit': m['page_id'],
                    'params': ''
                }
                yield Request(
                    url=self.spider_data['data_urls'][m['region']] +
                    post_token,
                    method='POST',
                    body='&'.join(
                        str.format('{0}={1}', k, body[k]) for k in body),
                    headers={
                        'Content-Type':
                        'application/x-www-form-urlencoded',
                        'X-Requested-With': 'XMLHttpRequest'
                    },
                    callback=self.parse_list,
                    meta={'userdata': m},
                    errback=self.onerr,
                    dont_filter=True)
def parse_sku2(self, response):
    """Parse a Chanel SKU page addressed as .../sku/<digits>.

    Derives the region from the URL path prefix, collects category tags
    and gender, extracts name/price from the product detail container and
    description/details from accordion sections, then yields a ProductItem.
    """
    self.log(str.format('PARSE_SKU2: {0}', response.url), level=log.DEBUG)
    mt = re.search(r'chanel\.com/([^/]+)/', response.url)
    region = None
    # Map the URL path prefix back to a region code via base_url.
    for a, b in self.spider_data['base_url'].items():
        if b == mt.group(1):
            region = a
            break
    if not region:
        return
    mt = re.search(r'/sku/(\d+)$', response.url)
    if not mt:
        return
    model = mt.group(1)
    metadata = {
        'region': region,
        'brand_id': self.spider_data['brand_id'],
        'model': model,
        'url': response.url,
        'tags_mapping': {}
    }
    sel = Selector(response)
    cat_idx = 0
    cat_list = []
    # Tracking spans carry the category breadcrumb; deduplicate and number
    # them as category-1, category-2, ...
    for node in sel.xpath(
            '//div[contains(@class,"trackingSettings")]/span[@class]'):
        cat = unicodify(node._root.text)
        if not cat:
            continue
        #if node._root.attrib['class'] == 'WT_cg_s':
        #    metadata['category'].add(cat.lower())
        if cat.lower() in cat_list:
            continue
        cat_idx += 1
        cat_list.append(cat.lower())
        cat_name = str.format('category-{0}', cat_idx)
        metadata['tags_mapping'][cat_name] = [{
            'name': cat.lower(),
            'title': cat
        }]
        gender = cm.guess_gender(cat)
        if gender:
            if 'gender' not in metadata:
                metadata['gender'] = set([])
            metadata['gender'].add(gender)
    temp = sel.xpath('//div[contains(@class, "product_detail_container")]')
    name_list = []
    if len(temp) > 0:
        product_name = temp[0]
        temp = product_name.xpath('./h1[@class="product_name"]')
        if len(temp) > 0:
            name = unicodify(temp[0]._root.text)
            if name:
                name_list.append(name)
        temp = product_name.xpath('./h2[@class="product_subtitle"]')
        if len(temp) > 0:
            name = unicodify(temp[0]._root.text)
            if name:
                name_list.append(name)
        temp = product_name.xpath('.//h3[@class="product_price"]')
        if len(temp) > 0:
            metadata['price'] = unicodify(temp[0]._root.text)
    name = u' - '.join(name_list)
    metadata['name'] = name if name else None
    # Description and details
    temp = sel.xpath('//div[@class="description_container"]')
    if len(temp) > 0:
        content_node = temp[0]
        content_map = {}
        # Match accordion headings against the configured header strings;
        # the a/@href anchor (minus '#') names the content div.
        for node in content_node.xpath(
                './/div[@class="accordion-heading"]/a[@href]'):
            temp = unicodify(node._root.text)
            if temp and temp in self.spider_data['description_hdr']:
                content_map['description'] = re.sub(
                    r'^#', '', node._root.attrib['href'])
            if temp and temp in self.spider_data['details_hdr']:
                content_map['details'] = re.sub(r'^#', '',
                                                node._root.attrib['href'])
        for term in ('description', 'details'):
            if term in content_map:
                temp = content_node.xpath(
                    str.format('.//div[@id="{0}"]', content_map[term]))
                if len(temp) > 0:
                    content_list = []
                    content = unicodify(temp[0]._root.text)
                    if content:
                        content_list.append(content)
                    content_list.extend([
                        unicodify(val.text)
                        for val in temp[0]._root.iterdescendants()
                        if val.text and val.text.strip()
                    ])
                    metadata[term] = u', '.join(content_list)
    # Images
    image_urls = list(
        set(
            cm.norm_url(node._root.attrib['src'],
                        self.spider_data['base_url'])
            for node in sel.xpath(
                '//section[@class="product_image_container"]/img[@src and @class="product_image"]'
            )
            if node._root.attrib['src'] and node._root.attrib['src'].strip()))
    if 'color' in metadata:
        metadata['color'] = list(metadata['color'])
    if 'gender' in metadata:
        metadata['gender'] = list(metadata['gender'])
    #metadata['category'] = list(metadata['category'])
    if 'model' in metadata:
        item = ProductItem()
        item['image_urls'] = image_urls
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        item['metadata'] = metadata
        yield item