def process_editor_price(db_spec=getattr(glob, 'DATABASE')['DB_SPEC'], table='products', extra_cond=None): """ 处理editor库中的价格信息 :param table: 需要操作的表。默认为products。 :param db_spec: 需要操作的数据库,默认为editor库。 :param extra_cond: 筛选条件。 """ db = RoseVisionDb() db.conn(db_spec) extra_cond = ' AND '.join( unicode.format(u'({0})', tuple(unicodify(v))) for v in extra_cond) if extra_cond else '1' db.lock([table]) db.start_transaction() try: # 根据update_flag字段判断哪些记录是需要更新的 obj = EditorPriceProcessor( db.query( unicode.format( u'SELECT idproducts,price,region FROM {0} WHERE price IS NOT NULL AND {1}', table, extra_cond).encode('utf-8')), db) core.func_carrier(obj, 1) db.commit() except OperationalError: db.rollback() finally: db.unlock() db.close()
def currency_update(param_dict): """ 更新货币的汇率信息 @param param_dict: """ db = RoseVisionDb() db.conn(getattr(gs, 'DATABASE')['DB_SPEC']) rs = db.query_match(['iso_code', 'currency'], 'region_info').fetch_row(maxrows=0) db.start_transaction() try: for code, currency in rs: print str.format('Fetching for currency data for {0}...', currency) data = cm.get_data(url=str.format( 'http://download.finance.yahoo.com/d/quotes.csv?s={0}CNY=X' '&f=sl1d1t1ba&e=.json', currency)) rdr = csv.reader(StringIO(data['body'])) line_data = [val for val in rdr][0] timestamp = datetime.datetime.strptime( str.format('{0} {1}', line_data[2], line_data[3]), '%m/%d/%Y %I:%M%p') db.update( { 'rate': line_data[1], 'update_time': timestamp.strftime('%Y-%m-%d %H:%M:%S') }, 'region_info', str.format('iso_code="{0}"', code)) db.commit() except: db.rollback() raise
def run(self): db = RoseVisionDb() db.conn(getattr(gs, 'DATABASE')['DB_SPEC']) if not self.brand_list: rs = db.query_match(['brand_id'], 'products', distinct=True) brand_list = [int(val[0]) for val in rs.fetch_row(maxrows=0)] self.brand_list = brand_list else: brand_list = self.brand_list if not brand_list: # 如果没有任何品牌,则直接退出 return self.report self.progress = 0 # 获得检查总数 self.tot = int( db.query( str.format( 'SELECT COUNT(*) FROM products WHERE brand_id IN ({0})', ','.join(str(tmp) for tmp in brand_list))).fetch_row()[0][0]) for brand in brand_list: if not self.silent: print unicode.format(u'\nPROCESSING {0} / {1}\n', brand, info.brand_info()[brand]['brandname_e']) db.start_transaction() try: for model, pid, fingerprint in db.query_match( ['model', 'idproducts', 'fingerprint'], 'products', { 'brand_id': brand }).fetch_row(maxrows=0): self.progress += 1 new_fp = gen_fingerprint(brand, model) if fingerprint != new_fp: self.report.append({ 'model': model, 'idproducts': pid, 'fingerprint_db': fingerprint, 'fingerprint_gen': new_fp, 'brand_id': brand }) if not self.silent: print unicode.format( u'\nMismatched fingerprints! model={0}, idproducts={1}, brand_id={2}, ' u'fingerprints: {3} => {4}\n', model, pid, brand, fingerprint, new_fp) if self.update_fingerprint: # 自动更新MD5指纹 db.update({'fingerprint': new_fp}, 'products', str.format('idproducts={0}', pid), timestamps=['update_time']) except: db.rollback() raise finally: db.commit() db.close()
def process_editor_tags(db_spec=getattr(glob, 'DATABASE')['DB_SPEC'], db_spider_spec=getattr(glob, 'SPIDER_SPEC'), table='products', extra_cond=None): """ 给editor库的数据添加tags字段 """ db = RoseVisionDb() db.conn(db_spider_spec) try: extra_cond = ' AND '.join( unicode.format(u'({0})', tuple(unicodify(v))) for v in extra_cond) if extra_cond else '1' rs = db.query( unicode.format( u'SELECT tag_name,mapping_list FROM products_tag_mapping WHERE {1}', extra_cond).encode('utf-8')) temp = rs.fetch_row(maxrows=0) mapping_rules = dict(temp) finally: db.close() db.conn(db_spec) db.start_transaction() try: rs = db.query( unicode.format(u'SELECT * FROM {0} WHERE {1}', table, extra_cond)) for i in xrange(rs.num_rows()): record = rs.fetch_row(how=1)[0] extra = json.loads(record['extra']) tags = [] for k in extra: tags.extend(extra[k]) tags = set(tags) tag_names = [] for v in tags: if v in mapping_rules: tag_names.extend(json.loads(mapping_rules[v])) tag_names = list(set(tag_names)) db.update({'tags': json.dumps(tag_names, ensure_ascii=False)}, str.format('idproducts={0}', record['idproducts'])) db.commit() pass except OperationalError: db.rollback() finally: db.close()
def run(self): db_src = RoseVisionDb() db_src.conn(self.src_spec) db_dst = RoseVisionDb() db_dst.conn(self.dst_spec) # 备选记录 idproducts_list = [ int(val[0]) for val in db_src.query( unicode.format(u'SELECT idproducts FROM products WHERE {0}', u' AND '.join(self.cond)).encode( 'utf-8')).fetch_row(maxrows=0) ] self.tot = len(idproducts_list) self.progress = 0 db_dst.execute('SET AUTOCOMMIT=0') # db_dst.execute('ALTER TABLE products DISABLE KEYS') for pid_src in idproducts_list: self.progress += 1 record = db_src.query( str.format('SELECT * FROM products WHERE idproducts={0}', pid_src)).fetch_row(how=1)[0] db_dst.start_transaction() try: rs = db_dst.query( str.format( 'SELECT idproducts FROM products WHERE brand_id={0} AND model="{1}" ' 'AND region="{2}"', record['brand_id'], record['model'], record['region'])) pid_dst = int( rs.fetch_row()[0][0]) if rs.num_rows() > 0 else None entry = {k: record[k] for k in record if k != 'idproducts'} price = process_price(record['price'], record['region']) if price: entry['price_rev'] = price['price'] entry['currency_rev'] = price['currency'] if entry['details']: entry['details'] = self.process_text( unicodify(entry['details'])) if entry['description']: entry['description'] = self.process_text( unicodify(entry['description'])) if entry['name']: entry['name'] = self.process_text(unicodify(entry['name'])) if entry['category']: entry['category'] = self.process_text( unicodify(entry['category'])) if entry['extra']: entry['extra'] = self.process_text( unicodify(entry['extra'])) if pid_dst: db_dst.update(entry, 'products', str.format('idproducts={0}', pid_dst)) else: db_dst.insert(entry, 'products') pid_dst = int( db_dst.query( str.format( 'SELECT idproducts FROM products WHERE brand_id={0} AND model="{1}" ' 'AND region="{2}"', record['brand_id'], record['model'], record['region'])).fetch_row()[0][0]) # 是否需要处理价格信息 if price: record_price = db_dst.query( str.format( 'SELECT price,currency FROM products_price_history ' 'WHERE idproducts={0} ORDER BY date DESC LIMIT 1', pid_dst)).fetch_row(how=1) if not record_price or float(record_price[0]['price']) != price['price'] or \ record_price[0]['currency'] != price['currency']: db_dst.insert( { 'idproducts': pid_dst, 'date': record['update_time'], 'brand_id': record['brand_id'], 'region': record['region'], 'model': record['model'], 'price': price['price'], 'currency': price['currency'] }, 'products_price_history') # 处理图像信息 tmp = db_src.query( str.format( 'SELECT checksum,brand_id,url,path,width,height,format FROM products_image ' 'WHERE brand_id={0} AND model="{1}"', record['brand_id'], record['model'])).fetch_row(maxrows=0, how=1) image_record = {val['checksum']: val for val in tmp} checksum_src = set(image_record.keys()) # 完善images_store信息。如果checksum没有在images_store中出现,则添加之。 for checksum in checksum_src: if db_dst.query( str.format( 'SELECT checksum FROM images_store WHERE checksum="{0}"', checksum)).num_rows() == 0: db_dst.insert( { 'checksum': checksum, 'brand_id': image_record[checksum]['brand_id'], 'url': image_record[checksum]['url'], 'path': image_record[checksum]['path'], 'width': image_record[checksum]['width'], 'height': image_record[checksum]['height'], 'format': image_record[checksum]['format'] }, 'images_store') # 补充目标数据库的products_image表,添加相应的checksum checksum_dst = set([ val[0] for val in db_dst.query( str.format( 'SELECT checksum FROM products_image WHERE brand_id={0} AND model="{1}"', record['brand_id'], record['model'])).fetch_row( maxrows=0) ]) for checksum in checksum_src - checksum_dst: db_dst.insert( { 'checksum': checksum, 'brand_id': record['brand_id'], 'model': record['model'] }, 'products_image') db_dst.commit() except: db_dst.rollback() raise
class ProcessTags(object): """ 标签的映射规则有变动,需要更新 """ original_tags_tbl = 'original_tags' mfashion_tags_tbl = 'mfashion_tags' prod_tag_tbl = 'products_original_tags' prod_mt_tbl = 'products_mfashion_tags' products = 'products' db_spec = getattr(gs, 'DATABASE')['DB_SPEC'] tot = 1 progress = 0 def __init__(self, last_update=None, extra_cond=None): print str.format( 'Processing tags (last_update="{0}", extra_cond="{1}")...', last_update, extra_cond) self.db = RoseVisionDb() self.db.conn(self.db_spec) self.last_update = last_update self.extra_cond = extra_cond def get_msg(self): return str.format( '{0}/{1}({2:.1%}) PROCESSED', self.progress, self.tot, float(self.progress) / self.tot) if self.tot > 0 else 'IDLE' def run(self): last_update = self.last_update extra_cond = self.extra_cond if not extra_cond: extra_cond = [] elif not iterable(extra_cond): extra_cond = [extra_cond] if last_update: extra_cond.append( unicode.format(u'update_time > "{0}"', last_update)) extra_cond.append('mapping_list IS NOT NULL') # MFashion标签的缓存记录 cached_mfashion = {} # 标签更新原理:original_tags存放原始标签。根据update_time字段可以得到最近更新过的标签。由于标签系统具备一定传染性,所以 # 该标签对应brand/region下的所有标签都必须重做 rs = self.db.query_match(['brand_id', 'region'], self.original_tags_tbl, {}, extra=extra_cond, distinct=True) # 需要处理的标签 tag_dict = {} for i in xrange(rs.num_rows()): brand_id, region = rs.fetch_row()[0] for val in self.db.query_match( ['idmappings', 'mapping_list'], self.original_tags_tbl, { 'brand_id': brand_id, 'region': region }, extra='mapping_list IS NOT NULL').fetch_row(maxrows=0): tag_dict[val[0]] = json.loads(val[1].replace("'", '"')) # 删除旧单品/标签关系 self.db.execute( str.format( 'DELETE FROM p2 USING {0} AS p1, {1} AS p2 WHERE p1.idproducts=p2.idproducts ' 'AND p1.brand_id={2} AND region="{3}"', self.products, self.prod_mt_tbl, brand_id, region)) self.tot = len(tag_dict) self.progress = 0 for tid, rule in tag_dict.items(): self.progress += 1 self.db.start_transaction() try: # 所有相关的单品 pid_list = [ int(val[0]) for val in self.db.query_match( ['idproducts'], self.prod_tag_tbl, { 'id_original_tags': tid }).fetch_row(maxrows=0) ] # 添加MFashion标签 for tag in rule: if tag not in cached_mfashion: self.db.insert({'tag': tag}, self.mfashion_tags_tbl, ignore=True) tid = int( self.db.query_match(['idmfashion_tags'], self.mfashion_tags_tbl, { 'tag': tag }).fetch_row()[0][0]) cached_mfashion[tag] = tid self.db.insert([{ 'idproducts': pid, 'id_mfashion_tags': cached_mfashion[tag] } for pid in pid_list], self.prod_mt_tbl, ignore=True) self.db.commit() except ValueError: self.db.rollback() except: self.db.rollback() raise