def process_editor_tags(db_spec=getattr(glob, 'DATABASE')['DB_SPEC'], db_spider_spec=getattr(glob, 'SPIDER_SPEC'), table='products', extra_cond=None): """ 给editor库的数据添加tags字段 """ db = RoseVisionDb() db.conn(db_spider_spec) try: extra_cond = ' AND '.join( unicode.format(u'({0})', tuple(unicodify(v))) for v in extra_cond) if extra_cond else '1' rs = db.query( unicode.format( u'SELECT tag_name,mapping_list FROM products_tag_mapping WHERE {1}', extra_cond).encode('utf-8')) temp = rs.fetch_row(maxrows=0) mapping_rules = dict(temp) finally: db.close() db.conn(db_spec) db.start_transaction() try: rs = db.query( unicode.format(u'SELECT * FROM {0} WHERE {1}', table, extra_cond)) for i in xrange(rs.num_rows()): record = rs.fetch_row(how=1)[0] extra = json.loads(record['extra']) tags = [] for k in extra: tags.extend(extra[k]) tags = set(tags) tag_names = [] for v in tags: if v in mapping_rules: tag_names.extend(json.loads(mapping_rules[v])) tag_names = list(set(tag_names)) db.update({'tags': json.dumps(tag_names, ensure_ascii=False)}, str.format('idproducts={0}', record['idproducts'])) db.commit() pass except OperationalError: db.rollback() finally: db.close()
def process_editor_price(db_spec=getattr(glob, 'DATABASE')['DB_SPEC'], table='products', extra_cond=None): """ 处理editor库中的价格信息 :param table: 需要操作的表。默认为products。 :param db_spec: 需要操作的数据库,默认为editor库。 :param extra_cond: 筛选条件。 """ db = RoseVisionDb() db.conn(db_spec) extra_cond = ' AND '.join( unicode.format(u'({0})', tuple(unicodify(v))) for v in extra_cond) if extra_cond else '1' db.lock([table]) db.start_transaction() try: # 根据update_flag字段判断哪些记录是需要更新的 obj = EditorPriceProcessor( db.query( unicode.format( u'SELECT idproducts,price,region FROM {0} WHERE price IS NOT NULL AND {1}', table, extra_cond).encode('utf-8')), db) core.func_carrier(obj, 1) db.commit() except OperationalError: db.rollback() finally: db.unlock() db.close()
def run(self): db = RoseVisionDb() db.conn(getattr(gs, 'DATABASE')['DB_SPEC']) if not self.brand_list: rs = db.query_match(['brand_id'], 'products', distinct=True) brand_list = [int(val[0]) for val in rs.fetch_row(maxrows=0)] self.brand_list = brand_list else: brand_list = self.brand_list if not brand_list: # 如果没有任何品牌,则直接退出 return self.report self.progress = 0 # 获得检查总数 self.tot = int( db.query( str.format( 'SELECT COUNT(*) FROM products WHERE brand_id IN ({0})', ','.join(str(tmp) for tmp in brand_list))).fetch_row()[0][0]) for brand in brand_list: if not self.silent: print unicode.format(u'\nPROCESSING {0} / {1}\n', brand, info.brand_info()[brand]['brandname_e']) db.start_transaction() try: for model, pid, fingerprint in db.query_match( ['model', 'idproducts', 'fingerprint'], 'products', { 'brand_id': brand }).fetch_row(maxrows=0): self.progress += 1 new_fp = gen_fingerprint(brand, model) if fingerprint != new_fp: self.report.append({ 'model': model, 'idproducts': pid, 'fingerprint_db': fingerprint, 'fingerprint_gen': new_fp, 'brand_id': brand }) if not self.silent: print unicode.format( u'\nMismatched fingerprints! model={0}, idproducts={1}, brand_id={2}, ' u'fingerprints: {3} => {4}\n', model, pid, brand, fingerprint, new_fp) if self.update_fingerprint: # 自动更新MD5指纹 db.update({'fingerprint': new_fp}, 'products', str.format('idproducts={0}', pid), timestamps=['update_time']) except: db.rollback() raise finally: db.commit() db.close()
def run(self): db = RoseVisionDb() db.conn(getattr(gs, 'DATABASE')['DB_SPEC']) if not self.brand_list: rs = db.query_match(['brand_id'], 'products', distinct=True) brand_list = [int(val[0]) for val in rs.fetch_row(maxrows=0)] self.brand_list = brand_list else: brand_list = self.brand_list self.progress = 0 self.tot = len(brand_list) for brand in brand_list: print unicode.format(u'PROCESSING {0} / {1}', brand, info.brand_info()[brand]['brandname_e']) self.progress += 1 rs = db.query( str.format( 'SELECT * FROM (SELECT p2.idprice_history,p2.date,p2.price,p2.currency,p1.idproducts,p1.brand_id,' 'p1.region,p1.name,p1.model,p1.offline FROM products AS p1 JOIN products_price_history AS p2 ON ' 'p1.idproducts=p2.idproducts ' 'WHERE p1.brand_id={0} ORDER BY p2.date DESC) AS p3 GROUP BY p3.idproducts', brand)) # 以model为键值,将同一个model下,不同区域的价格放在一起。 records = rs.fetch_row(maxrows=0, how=1) price_data = {} for r in records: model = r['model'] # # 仅有那些price不为None,且offline为0的数据,才加入到price check中。 # if r['price'] and int(r['offline']) == 0: # 这里更改为不管offline,全检查 if r['price']: # 首先检查model是否已存在 if model not in price_data: price_data[model] = [] price_data[model].append(r) # 最大值和最小值之间,如果差别过大,则说明价格可能有问题 for model in price_data: for item in price_data[model]: price = float(item['price']) item['nprice'] = info.currency_info()[ item['currency']]['rate'] * price # 按照nprice大小排序 sorted_data = sorted(price_data[model], key=lambda item: item['nprice']) max_price = sorted_data[-1]['nprice'] min_price = sorted_data[0]['nprice'] if min_price > 0 and max_price / min_price > self.threshold: print unicode.format( u'WARNING: {0}:{6} MODEL={1}, {2} / {3} => {4} / {5}', brand, model, sorted_data[0]['nprice'], sorted_data[0]['region'], sorted_data[-1]['nprice'], sorted_data[-1]['region'], info.brand_info()[brand]['brandname_e']) db.close()
def run(self): db_src = RoseVisionDb() db_src.conn(self.src_spec) db_dst = RoseVisionDb() db_dst.conn(self.dst_spec) # 备选记录 idproducts_list = [ int(val[0]) for val in db_src.query( unicode.format(u'SELECT idproducts FROM products WHERE {0}', u' AND '.join(self.cond)).encode( 'utf-8')).fetch_row(maxrows=0) ] self.tot = len(idproducts_list) self.progress = 0 db_dst.execute('SET AUTOCOMMIT=0') # db_dst.execute('ALTER TABLE products DISABLE KEYS') for pid_src in idproducts_list: self.progress += 1 record = db_src.query( str.format('SELECT * FROM products WHERE idproducts={0}', pid_src)).fetch_row(how=1)[0] db_dst.start_transaction() try: rs = db_dst.query( str.format( 'SELECT idproducts FROM products WHERE brand_id={0} AND model="{1}" ' 'AND region="{2}"', record['brand_id'], record['model'], record['region'])) pid_dst = int( rs.fetch_row()[0][0]) if rs.num_rows() > 0 else None entry = {k: record[k] for k in record if k != 'idproducts'} price = process_price(record['price'], record['region']) if price: entry['price_rev'] = price['price'] entry['currency_rev'] = price['currency'] if entry['details']: entry['details'] = self.process_text( unicodify(entry['details'])) if entry['description']: entry['description'] = self.process_text( unicodify(entry['description'])) if entry['name']: entry['name'] = self.process_text(unicodify(entry['name'])) if entry['category']: entry['category'] = self.process_text( unicodify(entry['category'])) if entry['extra']: entry['extra'] = self.process_text( unicodify(entry['extra'])) if pid_dst: db_dst.update(entry, 'products', str.format('idproducts={0}', pid_dst)) else: db_dst.insert(entry, 'products') pid_dst = int( db_dst.query( str.format( 'SELECT idproducts FROM products WHERE brand_id={0} AND model="{1}" ' 'AND region="{2}"', record['brand_id'], record['model'], record['region'])).fetch_row()[0][0]) # 是否需要处理价格信息 if price: record_price = db_dst.query( str.format( 'SELECT price,currency FROM products_price_history ' 'WHERE idproducts={0} ORDER BY date DESC LIMIT 1', pid_dst)).fetch_row(how=1) if not record_price or float(record_price[0]['price']) != price['price'] or \ record_price[0]['currency'] != price['currency']: db_dst.insert( { 'idproducts': pid_dst, 'date': record['update_time'], 'brand_id': record['brand_id'], 'region': record['region'], 'model': record['model'], 'price': price['price'], 'currency': price['currency'] }, 'products_price_history') # 处理图像信息 tmp = db_src.query( str.format( 'SELECT checksum,brand_id,url,path,width,height,format FROM products_image ' 'WHERE brand_id={0} AND model="{1}"', record['brand_id'], record['model'])).fetch_row(maxrows=0, how=1) image_record = {val['checksum']: val for val in tmp} checksum_src = set(image_record.keys()) # 完善images_store信息。如果checksum没有在images_store中出现,则添加之。 for checksum in checksum_src: if db_dst.query( str.format( 'SELECT checksum FROM images_store WHERE checksum="{0}"', checksum)).num_rows() == 0: db_dst.insert( { 'checksum': checksum, 'brand_id': image_record[checksum]['brand_id'], 'url': image_record[checksum]['url'], 'path': image_record[checksum]['path'], 'width': image_record[checksum]['width'], 'height': image_record[checksum]['height'], 'format': image_record[checksum]['format'] }, 'images_store') # 补充目标数据库的products_image表,添加相应的checksum checksum_dst = set([ val[0] for val in db_dst.query( str.format( 'SELECT checksum FROM products_image WHERE brand_id={0} AND model="{1}"', record['brand_id'], record['model'])).fetch_row( maxrows=0) ]) for checksum in checksum_src - checksum_dst: db_dst.insert( { 'checksum': checksum, 'brand_id': record['brand_id'], 'model': record['model'] }, 'products_image') db_dst.commit() except: db_dst.rollback() raise
class UpdateSpider(MFashionBaseSpider): handle_httpstatus_list = [404] def __init__(self, brand_list, region_list, db_spec, *a, **kw): self.name = str.format( 'update-{0}-{1}', '-'.join(str(tmp) for tmp in brand_list) if brand_list else 'all', '-'.join(region_list) if region_list else 'all') super(UpdateSpider, self).__init__(*a, **kw) self.brand_list = brand_list self.region_list = region_list self.db = RoseVisionDb() self.db.conn(db_spec) # self.python_logger = get_logger() # PythonLoggingObserver(loggerName='rosevision').start() def start_requests(self): # 如果未指定brand_list,则默认对所有的品牌进行更新 # 获得所有的品牌数据 if not self.brand_list: self.brand_list = info.brand_info().keys() # self.log('TEST', log.INFO) # # return # UpdateSpider的可选区域参数 region_cond = str.format( 'region IN ({0})', ','.join( "'" + tmp + "'" for tmp in self.region_list)) if self.region_list else '1' rs = self.db.query( str.format( 'SELECT COUNT(*) FROM products WHERE brand_id IN ({0}) AND {1}', ','.join(str(tmp) for tmp in self.brand_list), region_cond)) tot_num = int(rs.fetch_row()[0][0]) self.log(str.format('Total number of records to update: {0}', tot_num), level=log.INFO) for brand in self.brand_list: # 获得该品牌下所有记录 # 如果未指定region_list,则默认对所有的的确进行更新 if self.region_list: region_list = self.region_list else: rs = self.db.query( str.format( 'SELECT DISTINCT region FROM products WHERE brand_id={0}', brand)) region_list = [ tmp['region'] for tmp in rs.fetch_row(maxrows=0, how=1) ] region_info = info.region_info() region_list = filter(lambda val: int(region_info[val]['status']), region_list) for region in region_list: # 对该brand,该区域的所有商品,无论其下线状态是什么,都进行更新。 rs = self.db.query_match( {'idproducts', 'url', 'region', 'model'}, 'products', { 'brand_id': brand, 'region': region }) products_map = { int(tmp['idproducts']): { 'url': tmp['url'], 'region': tmp['region'], 'model': tmp['model'] } for tmp in rs.fetch_row(maxrows=0, how=1) } for pid, data in products_map.items(): url = data['url'] region = data['region'] model = data['model'] # url = 'http://www.gucci.com/us/styles/3085353G0109060' # region = 'us' # pid = 196907 # # return [Request(url=url, # callback=self.parse, # meta={'brand': brand, 'pid': pid, 'region': region}, # errback=self.onerror, # dont_filter=True)] if url: try: yield Request(url=url, callback=self.parse, meta={ 'brand': brand, 'pid': pid, 'region': region, 'model': model }, errback=self.onerror, dont_filter=True) except TypeError: continue else: continue def parse(self, response): brand = response.meta['brand'] item = UpdateItem() item['idproduct'] = response.meta['pid'] item['brand'] = brand item['region'] = response.meta['region'] sc = info.spider_info()[brand]['spider_class'] metadata = {} item['metadata'] = metadata metadata['url'] = response.url if response.status == 404: item['offline'] = 1 return item elif response.status < 200 or response.status > 300: return else: func = getattr(sc, 'is_offline') item['offline'] = 1 if func(response, self) else 0 if item['offline'] == 1: return item if 'fetch_price' in dir(sc): ret = getattr(sc, 'fetch_price')(response, self) if isinstance(ret, Request): metadata['price'] = ret else: if 'price' in ret: metadata['price'] = ret['price'] if 'price_discount' in ret: metadata['price_discount'] = ret['price_discount'] if 'fetch_name' in dir(sc): name = getattr(sc, 'fetch_name')(response, self) if name: metadata['name'] = name if 'fetch_model' in dir(sc): model = getattr(sc, 'fetch_model')(response, self) if model: metadata['model'] = model if 'fetch_description' in dir(sc): description = getattr(sc, 'fetch_description')(response, self) if description: metadata['description'] = description if 'fetch_details' in dir(sc): details = getattr(sc, 'fetch_details')(response, self) if details: metadata['details'] = details if 'fetch_color' in dir(sc): color = getattr(sc, 'fetch_color')(response, self) if color: metadata['color'] = color return self.resolve_requests(item) def update_callback(self, response): spider_cb = response.meta['spider_cb'] item = response.meta['item'] key = response.meta['key'] ret = spider_cb(response) # key对应的 item['metadata'].pop(key) if ret: if key == 'price': for tmp in ('price', 'price_discount'): if tmp in ret: item['metadata'][tmp] = ret[tmp] else: item['metadata'][key] = ret else: # 如果返回值为None,说明没有相对应的信息。 if key == 'price': for tmp in ('price', 'price_discount'): if tmp in item['metadata']: item['metadata'].pop(tmp) return self.resolve_requests(item) def resolve_requests(self, item): """ 检查metadata里面是否有Request对象。如果有,需要进一步提交这些request。 仅当所有的对象都不是Request时,该过程才结束。 应该注意的是,metadata中的键只会处理一次。 @param metadata: """ metadata = item['metadata'] while True: resolved = True for func_key, value in metadata.items(): if isinstance(value, Request): value.meta['spider_cb'] = value.callback value.meta['item'] = item value.meta['key'] = func_key value.callback = self.update_callback value.dont_filter = True return value if resolved: break return item @staticmethod def onerror(reason): meta = None try: if hasattr(reason.value, 'response'): response = reason.value.response # 这里response可能为None,比如出现 # ERROR: Error downloading <GET xxx>: # [<twisted.python.failure.Failure <class 'twisted.internet.error.ConnectionDone'>>] if response: meta = response.meta except AttributeError: pass try: if not meta: meta = reason.request.meta except AttributeError: pass if meta: try: brand = meta['brand'] item = UpdateItem() item['idproduct'] = meta['pid'] item['brand'] = brand item['region'] = meta['region'] # sc = glob.spider_info()[brand] metadata = {} item['metadata'] = metadata item['offline'] = 1 return item except KeyError: return
def run(self): db = RoseVisionDb() db.conn(getattr(gs, 'DATABASE')['DB_SPEC']) # 如果没有指定brand_list,则默认使用数据库中所有的brand_list if not self.brand_list: rs = db.query_match(['brand_id'], 'products', distinct=True) brand_list = [int(val[0]) for val in rs.fetch_row(maxrows=0)] self.brand_list = brand_list else: brand_list = self.brand_list self.progress = 0 self.tot = len(brand_list) # 最终生成的表格 tot_results = [] for brand in brand_list: results = {} print unicode.format(u'PROCESSING {0} / {1}', brand, info.brand_info()[brand]['brandname_e']) brand_name = info.brand_info()[brand]['brandname_e'] self.progress += 1 rs = db.query( str.format( '''SELECT p1.idproducts,p1.brand_id,p1.model,p1.region,p2.price,p2.price_discount,p2.currency,p2.date,p1.name,p4.tag,p1.url FROM products AS p1 JOIN products_price_history AS p2 ON p1.idproducts=p2.idproducts LEFT JOIN products_mfashion_tags AS p3 ON p3.idproducts=p1.idproducts LEFT JOIN mfashion_tags AS p4 ON p3.id_mfashion_tags=p4.idmfashion_tags WHERE p1.brand_id={0} AND p1.offline=0''', brand)) records = rs.fetch_row(maxrows=0, how=1) for r in records: pid = int(r['idproducts']) timestamp = datetime.datetime.strptime(r['date'], '%Y-%m-%d %H:%M:%S') tag = unicodify(r['tag']) if pid in results: # 如果已经存在相应单品的记录 old_rec = results[pid] old_rec['tag'].add(tag) old_t = datetime.datetime.strptime(old_rec['date'], '%Y-%m-%d %H:%M:%S') if timestamp > old_t: old_rec['price'] = unicodify(r['price']) old_rec['price_discount'] = unicodify( r['price_discount']) old_rec['currency'] = unicodify(r['currency']) old_rec['date'] = unicodify(r['date']) else: # 如果该单品的记录不存在 results[pid] = {k: unicodify(r[k]) for k in r} tmp = results[pid]['tag'] if tmp: results[pid]['tag'] = {tmp} else: results[pid]['tag'] = set({}) results[pid]['brand'] = brand_name results[pid].pop('idproducts') tot_results.extend(self.random_extract(results.values())) db.close() # 将所有的tag转换为[] data = [] for r in tot_results: r['tag'] = json.dumps(list(r['tag']), ensure_ascii=False) data.append( {k: r[k].encode('utf-8') if r[k] else 'NULL' for k in r}) # 写入CSV文件 with open( str.format('extract_{0}.csv', datetime.datetime.now().strftime('%Y%m%d%H%M%S')), 'wb') as f: f.write(u'\ufeff'.encode('utf8')) dict_writer = csv.DictWriter(f, fieldnames=[ 'brand_id', 'brand', 'model', 'region', 'price', 'price_discount', 'currency', 'date', 'name', 'tag', 'url' ]) dict_writer.writeheader() dict_writer.writerows(data)
# coding=utf-8 from utils.db import RoseVisionDb import global_settings as gs __author__ = 'Ryan' db = RoseVisionDb() db.conn(getattr(gs, 'DATABASE')['DB_SPEC']) rs = db.query(str.format('select currency, rate from region_info')) exchange_dic = { val['currency']: val['rate'] for val in rs.fetch_row(maxrows=0, how=1) } def check_price_history_region_diff(price_dic, max_range=3): """ price_dic = { 'CNY': 1500.0 'USD': 500.0 } """ stand_value = None for key, value in price_dic.items(): rate = float(exchange_dic[key]) if stand_value: rate_value = value * rate if rate_value > max_range * stand_value or rate_value < max_range * stand_value: return False