def run(self): db = RoseVisionDb() db.conn(getattr(gs, 'DATABASE')['DB_SPEC']) if not self.brand_list: rs = db.query_match(['brand_id'], 'products', distinct=True) brand_list = [int(val[0]) for val in rs.fetch_row(maxrows=0)] self.brand_list = brand_list else: brand_list = self.brand_list self.progress = 0 self.tot = len(brand_list) for brand in brand_list: print unicode.format(u'PROCESSING {0} / {1}', brand, info.brand_info()[brand]['brandname_e']) self.progress += 1 rs = db.query( str.format( 'SELECT * FROM (SELECT p2.idprice_history,p2.date,p2.price,p2.currency,p1.idproducts,p1.brand_id,' 'p1.region,p1.name,p1.model,p1.offline FROM products AS p1 JOIN products_price_history AS p2 ON ' 'p1.idproducts=p2.idproducts ' 'WHERE p1.brand_id={0} ORDER BY p2.date DESC) AS p3 GROUP BY p3.idproducts', brand)) # 以model为键值,将同一个model下,不同区域的价格放在一起。 records = rs.fetch_row(maxrows=0, how=1) price_data = {} for r in records: model = r['model'] # # 仅有那些price不为None,且offline为0的数据,才加入到price check中。 # if r['price'] and int(r['offline']) == 0: # 这里更改为不管offline,全检查 if r['price']: # 首先检查model是否已存在 if model not in price_data: price_data[model] = [] price_data[model].append(r) # 最大值和最小值之间,如果差别过大,则说明价格可能有问题 for model in price_data: for item in price_data[model]: price = float(item['price']) item['nprice'] = info.currency_info()[ item['currency']]['rate'] * price # 按照nprice大小排序 sorted_data = sorted(price_data[model], key=lambda item: item['nprice']) max_price = sorted_data[-1]['nprice'] min_price = sorted_data[0]['nprice'] if min_price > 0 and max_price / min_price > self.threshold: print unicode.format( u'WARNING: {0}:{6} MODEL={1}, {2} / {3} => {4} / {5}', brand, model, sorted_data[0]['nprice'], sorted_data[0]['region'], sorted_data[-1]['nprice'], sorted_data[-1]['region'], info.brand_info()[brand]['brandname_e']) db.close()
def run(self): db = RoseVisionDb() db.conn(getattr(gs, 'DATABASE')['DB_SPEC']) if not self.brand_list: rs = db.query_match(['brand_id'], 'products', distinct=True) brand_list = [int(val[0]) for val in rs.fetch_row(maxrows=0)] self.brand_list = brand_list else: brand_list = self.brand_list if not brand_list: # 如果没有任何品牌,则直接退出 return self.report self.progress = 0 # 获得检查总数 self.tot = int( db.query( str.format( 'SELECT COUNT(*) FROM products WHERE brand_id IN ({0})', ','.join(str(tmp) for tmp in brand_list))).fetch_row()[0][0]) for brand in brand_list: if not self.silent: print unicode.format(u'\nPROCESSING {0} / {1}\n', brand, info.brand_info()[brand]['brandname_e']) db.start_transaction() try: for model, pid, fingerprint in db.query_match( ['model', 'idproducts', 'fingerprint'], 'products', { 'brand_id': brand }).fetch_row(maxrows=0): self.progress += 1 new_fp = gen_fingerprint(brand, model) if fingerprint != new_fp: self.report.append({ 'model': model, 'idproducts': pid, 'fingerprint_db': fingerprint, 'fingerprint_gen': new_fp, 'brand_id': brand }) if not self.silent: print unicode.format( u'\nMismatched fingerprints! model={0}, idproducts={1}, brand_id={2}, ' u'fingerprints: {3} => {4}\n', model, pid, brand, fingerprint, new_fp) if self.update_fingerprint: # 自动更新MD5指纹 db.update({'fingerprint': new_fp}, 'products', str.format('idproducts={0}', pid), timestamps=['update_time']) except: db.rollback() raise finally: db.commit() db.close()
def proc_by_brand(brand):
    """Render one HTML table row for *brand*: the number of distinct models
    plus a per-region product count (cn/us/fr/uk/it), honouring the optional
    time_range_str window from the enclosing scope."""
    # Build the optional update_time window once; None means "no filter".
    time_filter = [
        str.format('update_time>"{0}"', time_range_str[0]),
        str.format('update_time<"{0}"', time_range_str[1])
    ] if time_range_str else None

    if time_filter:
        rs = db.query_match(['COUNT(DISTINCT model)'], 'products',
                            {'brand_id': brand}, extra=time_filter)
    else:
        rs = db.query_match(['COUNT(DISTINCT model)'], 'products',
                            {'brand_id': brand})
    cnt_tot = int(rs.fetch_row()[0][0])

    def count_region(region):
        # Number of products of this brand within one region.
        cond = {'brand_id': brand, 'region': region}
        if time_filter:
            result = db.query_match(['COUNT(*)'], 'products', cond,
                                    extra=time_filter)
        else:
            result = db.query_match(['COUNT(*)'], 'products', cond)
        return result.fetch_row()[0][0]

    cnt_by_region = '/'.join(map(count_region, ['cn', 'us', 'fr', 'uk', 'it']))
    return unicode.format(
        u'<tr><td>{0}</td><td>{1}</td><td>{2}</td><td>{3}</td></tr>', brand,
        info.brand_info()[brand]['brandname_e'], cnt_tot, cnt_by_region)
def merge_prods(self, prods, db):
    """
    Merge the per-region records of one product model into a single release
    entry and insert it into products_release.

    The "primary" record is the one whose region ranks first in
    self.region_order; its descriptive fields seed the release entry. The
    price history of all merged pids is folded to one current price per pid.

    :param prods: list of product rows (dicts) sharing the same model, one
                  per region.
    :param db: open database handle used for all queries and the insert.
    """
    logger = get_logger()

    # Convert every field of every record to unicode up front.
    for idx in xrange(len(prods)):
        prods[idx] = {k: unicodify(prods[idx][k]) for k in prods[idx]}

    # Pick the primary record: the one from the highest-priority region.
    sorted_prods = sorted(prods,
                          key=lambda k: self.region_order[k['region']])
    main_entry = sorted_prods[0]
    entry = {
        k: unicodify(main_entry[k])
        for k in ('brand_id', 'model', 'name', 'description', 'details',
                  'gender', 'category', 'color', 'url', 'fingerprint')
    }
    if not entry['name']:
        entry['name'] = u'单品'

    # All MFashion tags attached to any of the merged pids.
    mfashion_tags = [
        unicodify(val[0]) for val in db.query(
            str.format(
                'SELECT DISTINCT p1.tag FROM mfashion_tags AS p1 '
                'JOIN products_mfashion_tags AS p2 ON p1.idmfashion_tags=p2.id_mfashion_tags '
                'WHERE p2.idproducts IN ({0})', ','.join(
                    val['idproducts'] for val in prods))).fetch_row(maxrows=0)
    ]
    entry['mfashion_tags'] = json.dumps(mfashion_tags, ensure_ascii=False)
    entry[
        'original_tags'] = ''  #json.dumps(original_tags, ensure_ascii=False)
    entry['region_list'] = json.dumps([val['region'] for val in prods],
                                      ensure_ascii=False)
    entry['brandname_e'] = info.brand_info()[int(
        entry['brand_id'])]['brandname_e']
    entry['brandname_c'] = info.brand_info()[int(
        entry['brand_id'])]['brandname_c']

    # Per-pid lookup tables used while folding the price history below.
    url_dict = {int(val['idproducts']): val['url'] for val in prods}
    offline_dict = {
        int(val['idproducts']): int(val['offline'])
        for val in prods
    }
    price_change_dict = {
        int(val['idproducts']): val['price_change']
        for val in prods
    }
    update_time_dict = {
        int(val['idproducts']):
        datetime.datetime.strptime(val['update_time'], "%Y-%m-%d %H:%M:%S")
        for val in prods
    }
    # Mapping pid -> region code.
    region_dict = {int(val['idproducts']): val['region'] for val in prods}

    price_list = {}
    # Gather the full price history of all merged pids, newest first
    # (ORDER BY idprice_history DESC), keyed by pid.
    for item in db.query_match(
            ['price', 'price_discount', 'currency', 'date', 'idproducts'],
            self.price_hist, {},
            str.format('idproducts IN ({0})',
                       ','.join(val['idproducts'] for val in prods)),
            tail_str='ORDER BY idprice_history DESC').fetch_row(maxrows=0,
                                                                how=1):
        pid = int(item['idproducts'])
        region = region_dict[pid]
        offline = offline_dict[pid]
        if pid not in price_list:
            price_list[pid] = []
        price = float(item['price']) if item['price'] else None
        # Offline products never expose a discount price.
        if offline == 0:
            price_discount = float(
                item['price_discount']) if item['price_discount'] else None
        else:
            price_discount = None
        price_list[pid].append({
            'price': price,
            'price_discount': price_discount,
            'currency': item['currency'],
            'date': datetime.datetime.strptime(item['date'],
                                               "%Y-%m-%d %H:%M:%S"),
            'price_change': price_change_dict[pid],
            'url': url_dict[pid],
            'offline': offline,
            'code': region,
            'country': info.region_info()[region]['name_c']
        })

    # Convert a price to the common currency via the configured rate.
    currency_conv = lambda val, currency: info.currency_info()[currency][
        'rate'] * val

    # Collapse price_list to one record per pid.
    # Strategy: if the newest record carries a normal price, keep it.
    # If the newest price is None, fall back to the most recent non-None
    # price and clear price_discount.
    # If no non-None price exists at all, drop the pid.
    for pid, pid_data in price_list.items():
        # pid_data is already newest-first; keep only entries with a price.
        valid_pid_data = filter(lambda val: val['price'], pid_data)
        if pid_data[0]['price']:
            # Normal case: the latest record has a price.
            price_list[pid] = pid_data[0]
            # No recorded discount: detect an original price that quietly
            # dropped within the last week and present it as a discount.
            currency = valid_pid_data[0]['currency']
            if price_change_dict[pid] == 'D' and len(
                    valid_pid_data
            ) > 1 and currency == valid_pid_data[1]['currency']:
                if not pid_data[0]['price_discount'] and currency_conv(
                        valid_pid_data[1]['price'],
                        currency) > currency_conv(
                            valid_pid_data[0]['price'],
                            currency) and (datetime.datetime.now() -
                                           valid_pid_data[0]['date']
                                           ) < datetime.timedelta(7):
                    price_list[pid]['price_discount'] = price_list[pid][
                        'price']
                    price_list[pid]['price'] = valid_pid_data[1]['price']
        else:
            # Latest record has no price: fall back or drop.
            if not valid_pid_data:
                # No price information at all -> remove this pid.
                price_list.pop(pid)
            else:
                # Take the most recent priced record, clear the discount,
                # keeping that record's offline status.
                tmp = valid_pid_data[0]
                tmp['price_discount'] = None
                price_list[pid] = tmp

        # fetch_time: timestamp of the earliest valid price of this pid.
        if valid_pid_data and pid in price_list:
            price_list[pid]['fetch_time'] = valid_pid_data[-1]['date']
            price_list[pid]['idproducts'] = pid

    # Without any price information the model is not released.
    if not price_list:
        return

    entry['price_list'] = sorted(
        price_list.values(),
        key=lambda val: self.region_order[val['code']])
    entry = release_filter(entry, logger)

    if not entry['price_list']:
        return

    entry['offline'] = entry['price_list'][0]['offline']

    # The model's fetch_time is the earliest fetch_time among its pids.
    entry['fetch_time'] = min(
        tmp['fetch_time']
        for tmp in entry['price_list']).strftime("%Y-%m-%d %H:%M:%S")

    # Rows collected for price ranking:
    # [normalized price, normalized discount, price_change, price,
    #  price_discount, currency, date, idproducts]
    alt_prices = []
    for price_item in entry['price_list']:
        # Serialize datetimes so they can be stored in the release table.
        price_item['date'] = price_item['date'].strftime(
            "%Y-%m-%d %H:%M:%S")
        price_item['fetch_time'] = price_item['fetch_time'].strftime(
            "%Y-%m-%d %H:%M:%S")
        if price_item['offline'] == 0:
            if price_item['price_discount']:
                tmp = map(
                    lambda key_name: currency_conv(price_item[key_name],
                                                   price_item['currency']),
                    ('price', 'price_discount'))
                tmp.extend([
                    price_item[key]
                    for key in ('price_change', 'price', 'price_discount',
                                'currency', 'date', 'idproducts')
                ])
                alt_prices.append(tmp)
            else:
                alt_prices.append([
                    currency_conv(price_item['price'],
                                  price_item['currency']), None,
                    price_item['price_change'], price_item['price'],
                    price_item['price_discount'], price_item['currency'],
                    price_item['date'], price_item['idproducts']
                ])
        else:
            alt_prices.append([
                currency_conv(price_item['price'],
                              price_item['currency']), None,
                price_item['price_change'], price_item['price'],
                price_item['price_discount'], price_item['currency'],
                price_item['date'], price_item['idproducts']
            ])

    # Reported price: the discount price when present, else the original.
    alt_prices = sorted(alt_prices,
                        key=lambda val: val[1] if val[1] else val[0])
    entry['price'], entry['price_discount'] = alt_prices[
        0][:2] if alt_prices else (None, ) * 2
    entry['price_change'] = alt_prices[0][2] if alt_prices else '0'
    entry['o_price'], entry['o_discount'], entry[
        'o_currency'] = alt_prices[0][3:6]

    # Strip the helper idproducts field before serializing price_list.
    for i in xrange(len(entry['price_list'])):
        entry['price_list'][i].pop('idproducts')
    entry['price_list'] = json.dumps(entry['price_list'],
                                     ensure_ascii=False)
    entry['last_price_ts'] = alt_prices[0][6]
    entry['product_update_ts'] = update_time_dict[
        alt_prices[0][7]].strftime("%Y-%m-%d %H:%M:%S")

    # Full-text search fields.
    search_text = u' '.join(entry[tmp] if entry[tmp] else ''
                            for tmp in ('name', 'description', 'details',
                                        'model', 'brandname_e',
                                        'brandname_c'))
    search_color = u' '.join(entry['color']) if entry['color'] else u''
    rs = db.query_match(
        ['description_cn', 'description_en', 'details_cn', 'details_en'],
        'products_translate', {
            'fingerprint': entry['fingerprint']
        }).fetch_row()
    part_translate = u' ' + u' '.join(
        unicodify(tmp)
        for tmp in filter(lambda v: v, rs[0])) if rs else ' '
    search_tags = u' '.join(list(set(mfashion_tags)))
    entry['searchtext'] = unicode.format(u'{0} {1} {2} {3}', search_text,
                                         part_translate, search_tags,
                                         search_color)

    p = prods[0]
    checksums = []
    # Keep image checksums unique while preserving idproducts_image order.
    for tmp in db.query(
            str.format(
                '''
                SELECT p1.checksum, p3.width, p3.height, p3.path FROM products_image AS p1
                JOIN products AS p2 ON p1.fingerprint=p2.fingerprint
                JOIN images_store AS p3 ON p1.checksum=p3.checksum
                WHERE p2.fingerprint="{0}" ORDER BY p1.idproducts_image
                ''', p['fingerprint'])).fetch_row(maxrows=0, how=1):
        if tmp not in checksums:
            checksums.append(tmp)

    # While the model has no images it is not added to the release table.
    if not checksums:
        return

    image_list = []
    for val in checksums:
        tmp = {
            'path': val['path'],
            'width': int(val['width']),
            'height': int(val['height'])
        }
        if not image_list:
            # The first image doubles as the cover image.
            entry['cover_image'] = json.dumps(tmp, ensure_ascii=False)
        image_list.append(tmp)
    entry['image_list'] = json.dumps(image_list[:self.max_images],
                                     ensure_ascii=False)

    db.insert(entry, 'products_release')
def run(cls, logger=None, **kwargs): log_path_name = os.path.normpath( os.path.join( getattr(gs, 'STORAGE_PATH'), 'log/check/DataCheck%s.log' % datetime.datetime.now().strftime('%Y%m%d'))) logging.basicConfig(filename=log_path_name, level=logging.DEBUG) logging.info('PRODUCT CHECK STARTED!!!!') threshold = kwargs['threshold'] if 'threshold' in kwargs else 10 if 'brand_list' in kwargs: brand_list = kwargs['brand_list'] else: with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db: brand_list = db.query_match(['brand_id'], 'products', distinct=True).fetch_row(maxrows=0) db.start_transaction() brand_list = [int(val[0]) for val in brand_list] for brand in brand_list: with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db: #=============================product check================================================== logging.info( unicode.format( u'{0} PROCESSING product check {1} / {2}', datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), brand, info.brand_info()[brand]['brandname_e'])) rs = db.query_match([ 'idproducts', 'region', 'name', 'url', 'color', 'description', 'details', 'price_change' ], 'products', { 'brand_id': brand }).fetch_row(maxrows=0) for idproducts, region, name, url, color, desc, details, price_change in rs: name_err = url_err = color_err = desc_err = details_err = price_change_err = False #查找html转义符 # if name and has_escape(name): # print idproducts, name # name_err = True # if desc and has_escape(desc): # print idproducts, desc # desc_err = True # if details and has_escape(details): # print idproducts, details # details_err = True # for c in [name, desc, details]: # if c and has_escape(c): # print idproducts, c # # db.update({'name': lxmlparser()}, # # 'products', str.format('idproducts="{0}"', idproducts)) # pass if name and has_escape(name): print(idproducts, name) # print lxmlparser(unicode(name).encode("utf-8")) # db.update({'name': lxmlparser(name)}, # 'products', str.format('idproducts="{0}"', idproducts)) if desc and 
has_escape(desc): print(idproducts, desc) # print lxmlparser(unicode(desc).encode("utf-8")) # db.update({'desc': lxmlparser(desc)}, # 'products', str.format('idproducts="{0}"', idproducts)) if details and has_escape(details): print(idproducts, details) # print lxmlparser(unicode(details).encode("utf-8")) # db.update({'details': lxmlparser(details)}, # 'products', str.format('idproducts="{0}"', idproducts)) #中英美区域name、description检验,只能包含中英文字符和标点,出现其他文字及符号标识为错误 if region in ['cn', 'us', 'uk']: name_err = not region_pass(name) desc_err = not region_pass(desc) #url不含cjk字符,否则报错,quote生成新url,待用。 url_err = check_url(url) if url_err: url = urllib.quote(url, ":?=/") #color为[]或者可json解析的字符串 if color != '[]' and color is not None: try: t = json.loads(color) color_err = False except: color_err = True print 'color:', color if name_err or url_err or color_err or desc_err or details_err: logging.error( (datetime.datetime.now().strftime( '%Y-%m-%d %H:%M:%S'), 'Detail info--------------idproducts:', idproducts, 'name_err' if name_err else None, 'url_err' if url_err else None, 'color_err' if color_err else None, 'desc_err' if desc_err else None, 'details' if details_err else None)) #=============================price check================================================== logging.info( unicode.format( u'{0} PROCESSING price check {1} / {2}', datetime.datetime.now().strftime( '%Y-%m-%d %H:%M:%S'), brand, info.brand_info()[brand]['brandname_e'])) prs = db.query( str.format( 'SELECT * FROM (SELECT p2.idprice_history,p2.date,p2.price,p2.currency,p1.idproducts,p1.brand_id,' 'p1.region,p1.name,p1.model,p1.offline FROM products AS p1 JOIN products_price_history AS p2 ON ' 'p1.idproducts=p2.idproducts ' 'WHERE p1.brand_id={0} ORDER BY p2.date DESC) AS p3 GROUP BY p3.idproducts', brand)) # 以model为键值,将同一个model下,不同区域的价格放在一起。 records = prs.fetch_row(maxrows=0, how=1) price_data = {} for r in records: model = r['model'] # 仅有那些price不为None,且offline为0的数据,才加入到price check中。 if r['price'] and 
int(r['offline']) == 0: # 首先检查model是否已存在 if model not in price_data: price_data[model] = [] price_data[model].append(r) # 最大值和最小值之间,如果差别过大,则说明价格可能有问题 for model in price_data: for item in price_data[model]: price = float(item['price']) item['nprice'] = info.currency_info()[ item['currency']]['rate'] * price # 按照nprice大小排序 sorted_data = sorted( price_data[model], key=lambda item: item['nprice']) max_price = sorted_data[-1]['nprice'] min_price = sorted_data[0]['nprice'] if min_price > 0 and max_price / min_price > threshold: logging.warning( unicode.format( u'{0} WARNING: {1}:{7} MODEL={2}, {3} / {4} => {5} / {6}', datetime.datetime.now().strftime( '%Y-%m-%d %H:%M:%S'), brand, model, sorted_data[0]['nprice'], sorted_data[0]['region'], sorted_data[-1]['nprice'], sorted_data[-1]['region'], info.brand_info()[brand] ['brandname_e'])) logging.info('PRODUCT CHECK ENDED!!!!')
def start_requests(self):
    """Yield an update Request for every product of the configured brands
    and regions. Falls back to all known brands and to every enabled region
    of each brand when the corresponding list was not supplied."""
    # Default: update all known brands.
    if not self.brand_list:
        self.brand_list = info.brand_info().keys()

    # Optional region restriction for the UpdateSpider ('1' == no filter).
    if self.region_list:
        region_cond = str.format(
            'region IN ({0})',
            ','.join("'" + tmp + "'" for tmp in self.region_list))
    else:
        region_cond = '1'

    count_rs = self.db.query(
        str.format(
            'SELECT COUNT(*) FROM products WHERE brand_id IN ({0}) AND {1}',
            ','.join(str(tmp) for tmp in self.brand_list), region_cond))
    tot_num = int(count_rs.fetch_row()[0][0])
    self.log(str.format('Total number of records to update: {0}', tot_num),
             level=log.INFO)

    for brand in self.brand_list:
        # Candidate regions: the explicit list, or every region that
        # actually occurs for this brand.
        if self.region_list:
            candidate_regions = self.region_list
        else:
            rs = self.db.query(
                str.format(
                    'SELECT DISTINCT region FROM products WHERE brand_id={0}',
                    brand))
            candidate_regions = [
                tmp['region'] for tmp in rs.fetch_row(maxrows=0, how=1)
            ]
        # Keep only regions that are enabled in the configuration.
        region_info = info.region_info()
        active_regions = filter(
            lambda val: int(region_info[val]['status']), candidate_regions)

        for region in active_regions:
            # Update every product of this brand/region, regardless of its
            # offline status.
            rs = self.db.query_match(
                {'idproducts', 'url', 'region', 'model'}, 'products', {
                    'brand_id': brand,
                    'region': region
                })
            products_map = {
                int(tmp['idproducts']): {
                    'url': tmp['url'],
                    'region': tmp['region'],
                    'model': tmp['model']
                }
                for tmp in rs.fetch_row(maxrows=0, how=1)
            }

            for pid, data in products_map.items():
                # Products without a URL cannot be fetched.
                if not data['url']:
                    continue
                try:
                    yield Request(url=data['url'],
                                  callback=self.parse,
                                  meta={
                                      'brand': brand,
                                      'pid': pid,
                                      'region': data['region'],
                                      'model': data['model']
                                  },
                                  errback=self.onerror,
                                  dont_filter=True)
                except TypeError:
                    continue
def get_images_store(brand_id):
    """Return the normalized image-storage directory for *brand_id*."""
    folder = unicode.format(u'{0}_{1}', brand_id,
                            info.brand_info()[brand_id]['brandname_s'])
    root = getattr(glob, 'STORAGE')['STORAGE_PATH']
    return os.path.normpath(os.path.join(root, u'products/images', folder))
def get_log_path(brand_id, region_list=None):
    """Return the normalized log-file path for one crawl of *brand_id*,
    optionally restricted to *region_list* ('all' when not given)."""
    region_part = '_'.join(region_list) if region_list else 'all'
    file_name = unicode.format(u'{0}_{1}_{2}_{3}.log', brand_id,
                               info.brand_info()[brand_id]['brandname_s'],
                               datetime.datetime.now().strftime('%Y%m%d'),
                               region_part)
    root = getattr(glob, 'STORAGE')['STORAGE_PATH']
    return os.path.normpath(os.path.join(root, u'products/log', file_name))
def get_job_path(brand_id):
    """Return the normalized crawl-job directory for *brand_id*."""
    sub_dir = unicode.format(u'products/crawl/{0}_{1}', brand_id,
                             info.brand_info()[brand_id]['brandname_s'])
    root = getattr(glob, 'STORAGE')['STORAGE_PATH']
    return os.path.normpath(os.path.join(root, sub_dir))
def run(self): db = RoseVisionDb() db.conn(getattr(gs, 'DATABASE')['DB_SPEC']) # 如果没有指定brand_list,则默认使用数据库中所有的brand_list if not self.brand_list: rs = db.query_match(['brand_id'], 'products', distinct=True) brand_list = [int(val[0]) for val in rs.fetch_row(maxrows=0)] self.brand_list = brand_list else: brand_list = self.brand_list self.progress = 0 self.tot = len(brand_list) # 最终生成的表格 tot_results = [] for brand in brand_list: results = {} print unicode.format(u'PROCESSING {0} / {1}', brand, info.brand_info()[brand]['brandname_e']) brand_name = info.brand_info()[brand]['brandname_e'] self.progress += 1 rs = db.query( str.format( '''SELECT p1.idproducts,p1.brand_id,p1.model,p1.region,p2.price,p2.price_discount,p2.currency,p2.date,p1.name,p4.tag,p1.url FROM products AS p1 JOIN products_price_history AS p2 ON p1.idproducts=p2.idproducts LEFT JOIN products_mfashion_tags AS p3 ON p3.idproducts=p1.idproducts LEFT JOIN mfashion_tags AS p4 ON p3.id_mfashion_tags=p4.idmfashion_tags WHERE p1.brand_id={0} AND p1.offline=0''', brand)) records = rs.fetch_row(maxrows=0, how=1) for r in records: pid = int(r['idproducts']) timestamp = datetime.datetime.strptime(r['date'], '%Y-%m-%d %H:%M:%S') tag = unicodify(r['tag']) if pid in results: # 如果已经存在相应单品的记录 old_rec = results[pid] old_rec['tag'].add(tag) old_t = datetime.datetime.strptime(old_rec['date'], '%Y-%m-%d %H:%M:%S') if timestamp > old_t: old_rec['price'] = unicodify(r['price']) old_rec['price_discount'] = unicodify( r['price_discount']) old_rec['currency'] = unicodify(r['currency']) old_rec['date'] = unicodify(r['date']) else: # 如果该单品的记录不存在 results[pid] = {k: unicodify(r[k]) for k in r} tmp = results[pid]['tag'] if tmp: results[pid]['tag'] = {tmp} else: results[pid]['tag'] = set({}) results[pid]['brand'] = brand_name results[pid].pop('idproducts') tot_results.extend(self.random_extract(results.values())) db.close() # 将所有的tag转换为[] data = [] for r in tot_results: r['tag'] = json.dumps(list(r['tag']), ensure_ascii=False) 
data.append( {k: r[k].encode('utf-8') if r[k] else 'NULL' for k in r}) # 写入CSV文件 with open( str.format('extract_{0}.csv', datetime.datetime.now().strftime('%Y%m%d%H%M%S')), 'wb') as f: f.write(u'\ufeff'.encode('utf8')) dict_writer = csv.DictWriter(f, fieldnames=[ 'brand_id', 'brand', 'model', 'region', 'price', 'price_discount', 'currency', 'date', 'name', 'tag', 'url' ]) dict_writer.writeheader() dict_writer.writerows(data)