def download_cat_property(cls):
    """Download the properties of every category and persist them.

    For each ``_id`` found in ``cat_coll`` this calls
    ``get_cat_property(cat_id, i)`` with ``i`` = 1 and ``i`` = 2, merges the
    properties and property values found in the responses, then:

    * upserts one ``cat_prop_val_coll`` document per ``(cat_id, pid)`` holding
      the property's ``name``/``must``/``multi`` flags and its value list;
    * stores ``property_list`` on the category's ``cat_coll`` document.
    """
    cat_id_list = [cat['_id'] for cat in cat_coll.find({}, {'_id': 1})]
    for cat_id in cat_id_list:
        property_list = []
        # Two separate maps keyed by pid.  The previous implementation kept
        # both in a single dict (values under ``pid``, metadata under
        # ``str(pid)``) and told them apart by key type, which breaks as soon
        # as a pid is itself a string.
        prop_info_dict = {}   # pid -> {'name', 'must', 'multi'}
        prop_value_dict = {}  # pid -> {vid: value document}

        for i in range(1, 3):
            result = get_cat_property(cat_id, i)
            if not hasattr(result, 'item_props'):
                continue
            item_props = result.item_props
            if not hasattr(item_props, 'item_prop'):
                continue

            # NOTE(review): the list is restarted for every response that
            # carries properties, so the stored ``property_list`` reflects
            # only the last such response, while names and values are merged
            # across both calls.  Behaviour kept as is - confirm it is
            # intended.
            property_list = []
            for prop in item_props.item_prop:
                pid = prop.pid
                property_list.append(pid)
                # The first response that mentions a pid wins for metadata.
                prop_info_dict.setdefault(pid, {
                    'name': prop.name,
                    'must': prop.must,
                    'multi': prop.multi
                })
                value_dict = prop_value_dict.setdefault(pid, {})
                if hasattr(prop, 'prop_values'):
                    for prp_val in prop.prop_values.prop_value:
                        # Later responses overwrite a value with the same vid.
                        value_dict[prp_val.vid] = {
                            'vid': prp_val.vid,
                            'name': prp_val.name,
                            'is_parent': getattr(prp_val, 'is_parent', False)
                        }

        for pid, property_dict in prop_info_dict.items():
            cat_prop_val_coll.update({
                'cat_id': cat_id,
                'pid': pid
            }, {
                '$set': {
                    'name': property_dict['name'],
                    'must': property_dict['must'],
                    'multi': property_dict['multi'],
                    'prop_value_list': list(prop_value_dict[pid].values())
                }
            }, upsert=True)

        cat_coll.update({'_id': cat_id},
                        {'$set': {'property_list': property_list}})
def get_cat_top_words(cls):
    """Fetch the top words under each category.

    The category ids are the members of ``'cat_set'`` read through
    ``WordCat.r_wckeyword``; when that set is missing or holds fewer than
    10000 entries, every ``_id`` in ``cat_coll`` is used instead.  One
    ``get_cat_top.delay`` call is issued per id and the collected results are
    handed to ``cls.monitor_result``.
    """
    cat_ids = WordCat.r_wckeyword.smembers('cat_set')
    if not cat_ids or len(cat_ids) < 10000:
        # Set absent or too small: fall back to the full category collection.
        cat_ids = [doc['_id'] for doc in cat_coll.find({}, {'_id': 1})]

    pending_results = [get_cat_top.delay(one_id) for one_id in cat_ids]
    cls.monitor_result(
        'add cat top words now len = %s and total len = %s',
        pending_results)
def update_all_cat(cls):
    """Script that updates all categories.

    Fetches the top-level categories with ``get_catinfo_new(0)``, then walks
    down level by level with ``get_catinfo_new(2, [ids...])`` until a level
    comes back empty.  The collected categories are synchronised into
    ``cat_coll``: existing ``_id``s are updated, unknown ones are bulk
    inserted, and ``_id``s no longer present upstream are removed (``_id`` 0
    is never removed).  ``Cat.compute_child_list()`` is called at the end.
    """
    field_names = ('cat_name', 'parent_cat_id', 'cat_level',
                   'cat_path_name', 'cat_path_id', 'last_sync_time')

    cat_dict = get_catinfo_new(0)
    all_cat_list = list(cat_dict.values())

    # Descend one level per round, feeding each level's ids into the next
    # request, until no sub-categories are returned.
    parent_id_list = cat_dict.keys()
    while True:
        cat_sub_dict = get_catinfo_new(
            2, [str(cat_id) for cat_id in parent_id_list])
        if not cat_sub_dict:
            break
        all_cat_list.extend(cat_sub_dict.values())
        parent_id_list = cat_sub_dict.keys()

    if not all_cat_list:
        # An empty upstream answer would otherwise make the removal step
        # below delete every stored category; refuse to sync in that case.
        log.error('update_all_cat error, no category fetched, skip sync')
        return

    # Sets give O(1) membership tests and a direct difference below.
    old_cat_id_set = set(
        cat['_id'] for cat in cat_coll.find({}, {'_id': 1}))
    new_cat_id_set = set()
    insert_list = []
    for cat in all_cat_list:
        cat_id = cat['cat_id']
        new_cat_id_set.add(cat_id)
        cat_fields = {name: cat[name] for name in field_names}
        if cat_id in old_cat_id_set:
            cat_coll.update({'_id': cat_id}, {'$set': cat_fields})
        else:
            cat_fields['_id'] = cat_id
            insert_list.append(cat_fields)

    # Skip the call when there is nothing to insert (legacy PyMongo rejects
    # an empty bulk insert) so that real failures are no longer hidden.
    if insert_list:
        try:
            cat_coll.insert(insert_list, continue_on_error=True, safe=True)
        except Exception as e:
            # Still best-effort: a failed insert must not block the cleanup.
            log.error('insert new category error, e=%s' % e)

    remove_id_set = old_cat_id_set - new_cat_id_set
    remove_id_set.discard(0)  # the _id 0 document is always kept
    cat_coll.remove({'_id': {'$in': list(remove_id_set)}})
    Cat.compute_child_list()
def _sum_adgroups_by_cat_path(rows):
    """Accumulate adgroup totals for every prefix of each category path.

    ``rows`` is an iterable of dicts carrying ``total`` and normally
    ``cat_path``; the path is stringified and split on whitespace into
    category ids.  Returns ``(totals, cat_ids)`` where ``totals`` maps each
    ``'>'``-joined path prefix to its summed total and ``cat_ids`` is the set
    of integer ids that end those prefixes.
    """
    totals = {}
    cat_ids = set()
    for row in rows:
        total = row['total']
        if 'cat_path' not in row:
            # In theory this should never happen.
            continue
        category_ids = str(row['cat_path']).split()
        for index, cat_id in enumerate(category_ids):
            cat = '>'.join(category_ids[:index + 1])
            # A missing path stringifies to 'None'; skip it.
            if not cat or cat == 'None':
                continue
            if cat not in totals:
                totals[cat] = 0
                cat_ids.add(int(cat_id))
            totals[cat] += total
    return totals, cat_ids


def get_statictics_category_info():
    """Rebuild the cached per-category adgroup statistics when missing.

    When ``CacheKey.CRM_CAT_ADG_STATISTICS`` is empty in ``crm_cache``, the
    adgroups in ``adg_coll`` are grouped by ``category_ids``, totals are
    rolled up to every ancestor path, category names are resolved through
    ``cat_coll`` and the result is stored zlib-compressed as JSON under that
    key.  ``CacheKey.CRM_CAT_ADG_STATISTICS_LOCK`` guards the rebuild.

    Returns ``{}`` when the lock is already held or the aggregation fails.
    NOTE(review): on a cache hit and after a successful rebuild nothing is
    returned (``None``), exactly as before - confirm against callers.
    """
    key = CacheKey.CRM_CAT_ADG_STATISTICS
    lock_key = CacheKey.CRM_CAT_ADG_STATISTICS_LOCK

    if crm_cache.get(key):
        # Statistics already cached: nothing to rebuild.
        return None

    # The lock used to be deleted right before this check, which made the
    # check unreachable and let concurrent callers all run the aggregation.
    if crm_cache.get(lock_key):
        return {}
    crm_cache.set(lock_key, True, 10 * 60 * 60)  # presumably seconds (10h)

    try:
        aggr_pipeline = [
            {
                '$group': {
                    '_id': {
                        'category_ids': '$category_ids'
                    },
                    'adgroup_total': {
                        '$sum': 1
                    }
                }
            },
            # A '$match' stage keeping only groups with
            # adgroup_total >= 5000 used to follow here; it is currently
            # disabled and may be extended later.
            {
                '$project': {
                    '_id': 0,
                    'cat_path': "$_id.category_ids",
                    'total': '$adgroup_total'
                }
            },
            {
                '$sort': {
                    'cat_path': 1,
                    'total': -1
                }
            }
        ]

        cat_data = {}
        try:
            result = adg_coll.aggregate(aggr_pipeline)['result']
        except Exception as e:
            log.error('aggregate adgroup by category error, e=%s' % e)
            return cat_data

        cat_mapping, cat_id_set = _sum_adgroups_by_cat_path(result)

        cat_name_mapping = {
            cat['_id']: cat['cat_name']
            for cat in cat_coll.find({'_id': {
                "$in": list(cat_id_set)
            }}, {'cat_name': 1})
        }

        for cat_all_path, total in cat_mapping.items():
            cat_id_list = cat_all_path.split('>')
            # Translate the id path into a name path.
            cat_all_name = []
            for cat_id in cat_id_list:
                cat_name = cat_name_mapping.get(int(cat_id), '')
                if cat_name:
                    cat_all_name.append(cat_name)
                else:
                    # In theory this should never happen.
                    log.error(
                        'error : it should not happen, the program is wrong if it appear, cat_id=%s'
                        % (cat_id))
            # ``split`` always yields at least one element, so the last id
            # is always available.
            cat_data[int(cat_id_list[-1])] = {
                'cat_id': cat_id_list[-1],
                'cat_name': '>'.join(cat_all_name),
                'adgroup_total': total
            }

        data = zlib.compress(json.dumps(cat_data), 5)
        crm_cache.set(key, data, 10 * 24 * 60 * 60)
    finally:
        # Release the lock on success, on aggregation failure and on any
        # unexpected error, so a failed run cannot block later rebuilds.
        crm_cache.delete(lock_key)
def load_cat_prop_dict(cls):
    """Call ``cls.load_single_cat_prop`` for every category ``_id`` in ``cat_coll``."""
    id_only_docs = cat_coll.find({}, {'_id': 1})
    # Collect all ids first, before any per-category call is made.
    all_cat_ids = [doc['_id'] for doc in id_only_docs]
    for current_id in all_cat_ids:
        cls.load_single_cat_prop(current_id)