def update_gdata_keyword(key):
    word_list = [
        kw.split(':')[0].decode('utf8')
        for kw in KeywordInfo.r_keyword.lrange(key, 0, -1)
    ]
    KeywordInfo.update_gdata(word_list)
    return len(word_list)
def get_cat_top(cat_id):
    '''Fetch a category's top keywords over the past week.'''
    start_date = datetime.date.today()
    word_list = get_cat_top_word(cat_id,
                                 str(start_date - datetime.timedelta(days=7)),
                                 str(start_date - datetime.timedelta(days=1)))
    KeywordInfo.insert_new_word_list(word_list)
    return len(word_list)
def add_user_keyword(shop_id):
    word_list = [
        kw['word']
        # Project only the 'word' field (the original passed the set
        # literal {'word'}, which dict-style projection expects as {'word': 1}).
        for kw in kw_coll.find({'shop_id': shop_id}, {'word': 1})
    ]
    KeywordInfo.insert_new_word_list(word_list)
    account_coll.update({'_id': shop_id}, {'$set': {'has_push': True}},
                        multi=True)
    return 1
def get_recommand_word(record_list):
    '''Fetch Taobao-recommended keywords for the given item records.'''
    word_list = []
    for record in record_list:
        word_list.extend(
            get_tb_recommend_keywords(record['shop_id'], record['_id']))
        if len(word_list) >= 10000:
            KeywordInfo.insert_new_word_list(word_list)
            word_list = []
    # Flush the remainder that never reached the 10000 batch threshold
    # (the original dropped these words).
    KeywordInfo.insert_new_word_list(word_list)
    return len(record_list)
def update_new_keyword(key):
    '''Refresh data for keywords newly added to the lexicon and merge them
    into the main keyword lists.'''
    insert_list = []
    word_list = [
        kw.decode('utf8')
        for kw in KeywordInfo.r_nkeyword.lrange(key, 0, -1)
    ]
    result_list = KeywordInfo.update_gdata(word_list)
    KeywordInfo.load_redis_cat_new_word(result_list)
    for word in word_list:
        insert_list.append(word + ':' + RedisKeyManager.get_sort_word(word))
    key_keyword_list = RedisKeyManager.get_keyword_list_key(
        KeywordInfo.KEYWORD_ALIAS)
    RedisKeyManager.redis_lpush(KeywordInfo.r_keyword, key_keyword_list,
                                insert_list)
    RedisKeyManager.clear_single_keyword_list(key,
                                              KeywordInfo.NEWKEYWORD_ALIAS,
                                              KeywordInfo.NEW_KW_LIST_PREV_KEY)
    return len(word_list)
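# A note on the list-entry format built above (inferred from the
# split(':') parsing in update_gdata_keyword and
# TaskTools.crawl_by_all_keyword, not from separate documentation):
# each item pushed onto a keyword list is u'<word>:<sort_word>', e.g.
#
#     entry = u'连衣裙' + ':' + RedisKeyManager.get_sort_word(u'连衣裙')
#     word = entry.split(':')[0]  # recovers the raw word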
def get_gdata_word(word_list, request=None):
    """Shared-data API endpoint: look up network-wide data for the words."""
    result_dict = {'result': {}}
    if word_list:
        try:
            log.info('start get gdata by word and word_len = %s' %
                     len(word_list))
            result_dict['result'] = KeywordInfo.get_gdata_by_words(word_list)
            log.info('end get gdata by word and result_dict len = %s' %
                     len(result_dict['result']))
        except Exception as e:
            log.error('get share gdata error, the error is = %s, words = %s' %
                      (e, ','.join(word_list)))
    # Return the (possibly empty) result so callers always get a dict.
    return result_dict
def get_keyword_subdata(word_list, sub_type=0, request=None):
    """
    args:
        word_list: [u'连衣裙', u'红色连衣裙', ...]
        sub_type: 0 mobile, 1 PC, -1 aggregate
    return:
        {
            pv: impressions
            click: clicks
            cost: spend, in fen (0.01 CNY)
            directtransaction: direct transaction amount
            indirecttransaction: indirect transaction amount
            directtransactionshipping: number of direct transactions
            indirecttransactionshipping: number of indirect transactions
            favitemtotal: item favorites
            favshoptotal: shop favorites
            transactionshippingtotal: total number of transactions
            transactiontotal: total transaction amount
            favtotal: total favorites, item plus shop
            competition: competition level
            ctr: click-through rate
            cpc: average cost per click
            roi: return on investment
            coverage: click conversion rate
            mechanism: delivery mechanism: 0 keyword promotion,
                2 targeted promotion, 3 general targeting
        }
    """
    result_dict = {'result': {}}
    try:
        log.info('start get subdata by word and word_len = %s' %
                 len(word_list))
        result_dict['result'] = KeywordInfo.get_subdata(word_list, sub_type)
        log.info('end get subdata by word and result_dict len = %s' %
                 len(result_dict['result']))
    except Exception as e:
        log.error('get subdata gdata error, the error is = %s, words = %s' %
                  (e, ','.join(word_list)))
    # Return the (possibly empty) result so callers always get a dict.
    return result_dict
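# Example call (a sketch only; assumes KeywordInfo is wired to live data
# and that the caller handles an empty result):
#
#     data = get_keyword_subdata([u'连衣裙', u'红色连衣裙'], sub_type=0)
#     dress = data['result'].get(u'连衣裙', {})
#     ctr, cpc = dress.get('ctr'), dress.get('cpc')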
def check_kw_2save_inDB(word_list):
    return KeywordInfo.check_kw_2save_inDB(word_list)
def get_timescope():
    return KeywordInfo.get_gdata_timescope()
def update_cat_forcecats(key):
    '''Predict categories for the keywords under the given Redis key.'''
    return KeywordInfo.update_cat_forcecats(key)
class TaskTools(object):
    @classmethod
    def monitor_result(cls, monitor_str, result_list):
        '''Monitor the progress of dispatched tasks until all have finished.'''
        total_len = len(result_list)
        while result_list:
            # Iterate over a snapshot: removing items from the list being
            # iterated would skip elements.
            for result in result_list[:]:
                if result.ready():
                    result_list.remove(result)
            log.info(monitor_str % (len(result_list), total_len))
            time.sleep(15)

    @classmethod
    def add_user_keyword(cls):
        '''Push the keywords of every not-yet-pushed user account.'''
        result_list = []
        shop_id_list = [
            acct['_id'] for acct in account_coll.find(
                {'has_push': {'$in': [None, False]}}, {'_id': 1})
        ]
        for shop_id in shop_id_list:
            result_list.append(add_user_keyword.delay(shop_id))
        total = len(result_list)
        cls.monitor_result('add user keyword now len = %s and total len = %s',
                           result_list)
        return total

    @classmethod
    def update_new_words(cls):
        '''Refresh data for newly added keywords.'''
        result_list = []
        for key in KeywordInfo.r_nkeyword.lrange(
                RedisKeyManager.get_key_manager(
                    KeywordInfo.NEW_KW_LIST_PREV_KEY), 0, -1):
            result_list.append(update_new_keyword.delay(key))
        total = len(result_list)
        cls.monitor_result(
            'update new keyword now len = %s and total len = %s', result_list)
        KeywordInfo.load_redis_newcat_word_2memcache()
        return total

    @classmethod
    def update_words(cls):
        '''Refresh network-wide data for the whole lexicon.'''
        RedisKeyManager.clear_all_keyword_list(
            KeywordInfo.GKEYWORD_ALIAS, KeywordInfo.GDATA_KW_LIST_PREV_KEY)
        KeywordInfo.clean_gdata_timescope()
        KeywordInfo.get_gdata_timescope()
        result_list = []
        for key in KeywordInfo.r_keyword.lrange(
                RedisKeyManager.get_key_manager(), 0, -1):
            result_list.append(update_gdata_keyword.delay(key))
        total = len(result_list)
        cls.monitor_result(
            'update all keyword now len = %s and total len = %s', result_list)
        return total

    @classmethod
    def update_forcecats(cls):
        '''Run category prediction over the whole lexicon.'''
        WordCat.clean_wordcat_keyword(WordCat.WCKW_ALIAS)
        result_list = []
        for key in KeywordInfo.r_gkeyword.lrange(
                RedisKeyManager.get_key_manager(
                    KeywordInfo.GDATA_KW_LIST_PREV_KEY), 0, -1):
            result_list.append(update_cat_forcecats.delay(key))
        total = len(result_list)
        cls.monitor_result(
            'update cat forcecats now len = %s and total len = %s',
            result_list)
        return total

    @classmethod
    def backup_db3_all_keyword(cls):
        '''Back up all keywords; the previous backup data must be deleted
        before backing up (the copy is written to Redis db 13).'''
        import settings, redis
        conf = settings.REDIS_CONF['gdata']
        r = redis.Redis(host=conf['host'],
                        port=conf['port'],
                        db=13,
                        password=conf['password'])
        RedisKeyManager.clear_all_keyword_by_db(r)
        manager_list = KeywordInfo.r_keyword.lrange(
            RedisKeyManager.get_key_manager(), 0, -1)
        for key in manager_list:
            r.lpush(key, *[
                kw.decode('utf8')
                for kw in KeywordInfo.r_keyword.lrange(key, 0, -1)
            ])
        r.lpush(RedisKeyManager.get_key_manager(), *manager_list)

    @classmethod
    def insert_word_list(cls, word_list):
        word_list = list(set(word_list))
        insert_list = KeywordInfo.get_insert_list(word_list)
        if insert_list:
            # relate_word_list = get_relatewords_new(insert_list)
            # log.info('get relate word from taobao and word_list len = %s ' % len(relate_word_list))
            KeywordInfo.insert_new_word_list(insert_list)

    @classmethod
    def crawl_by_character_or_elemword(cls):
        '''Crawl keywords seeded from single characters, atom words and labels.'''
        result_list = []
        ChSegement.get_word_dict()
        label_list = [
            word.decode('utf8')
            for word in KeywordInfo.r_keyword.smembers('label_set')
        ]
        word_list = (Character.get_all_character() +
                     ChSegement.word_dict.keys() + label_list)
        count, total = 0, len(word_list)
        for word in word_list:
            count += 1
            result_list.extend(KeywordCrawler.crawl_word_by_word(word, True))
            if len(result_list) >= 10000:
                log.info('crawl word index = %s and total len = %s' %
                         (count, total))
                cls.insert_word_list(result_list)
                result_list = []
        cls.insert_word_list(result_list)

    @classmethod
    def crawl_by_elemword(cls):
        result_list = []
        ChSegement.get_word_dict()
        label_list = [
            word.decode('utf8')
            for word in KeywordInfo.r_keyword.smembers('label_set')
        ]
        word_list = list(set(ChSegement.word_dict.keys() + label_list))
        count, total = 0, len(word_list)
        for word in word_list:
            count += 1
            tmp_list = KeywordCrawler.crawl_words_by_word_list([word], 10000)
            log.info(
                'crawl word index = %s and total len = %s and word = %s and crawl_result_len = %s'
                % (count, total, word, len(tmp_list)))
            result_list.extend(tmp_list)
            if len(result_list) >= 10000:
                log.info('craw words more than 10000')
                cls.insert_word_list(result_list)
                result_list = []
        # Flush the remaining crawl results (the original flushed the seed
        # word_list here, re-inserting the seeds and dropping the tail of
        # the crawl output).
        cls.insert_word_list(result_list)

    @classmethod
    def crawl_by_all_keyword(cls):
        '''Crawl using every keyword in the lexicon as a seed.'''
        result_list = []
        manager_list = KeywordInfo.r_keyword.lrange(
            RedisKeyManager.get_key_manager(), 0, -1)
        total_len, count = len(manager_list), 0
        for key in manager_list:
            count += 1
            word_list = [
                kw.split(':')[0].decode('utf8')
                for kw in KeywordInfo.r_keyword.lrange(key, 0, -1)
            ]
            log.info('crawl word = %s count index = %s and total len = %s' %
                     (len(word_list), count, total_len))
            for word in word_list:
                result_list.extend(
                    KeywordCrawler.crawl_word_by_word(word, True))
                if len(result_list) >= 10000:
                    result_list = list(set(result_list))
                    KeywordInfo.insert_new_word_list(result_list)
                    result_list = []
        KeywordInfo.insert_new_word_list(result_list)

    @classmethod
    def import_keyword_2redis(cls, file_path):
        '''Import keywords from a GBK-encoded text file into Redis.'''
        with open(file_path, 'r') as r_file:
            word_list = []
            while True:
                word = r_file.readline()
                if not word:
                    break
                word = word.decode('gbk').strip('\r\n')
                word_list.append(word)
            word_list = list(set(word_list))
            KeywordInfo.insert_new_word_list(word_list)

    @classmethod
    def get_cat_top_words(cls):
        '''Fetch the top keywords under each category.'''
        result_list = []
        cat_id_list = WordCat.r_wckeyword.smembers('cat_set')
        if not cat_id_list or len(cat_id_list) < 10000:
            cat_id_list = [cat['_id'] for cat in cat_coll.find({}, {'_id': 1})]
        for cat_id in cat_id_list:
            result_list.append(get_cat_top.delay(cat_id))
        cls.monitor_result('add cat top words now len = %s and total len = %s',
                           result_list)

    @classmethod
    def update_all_cat(cls):
        '''Synchronise the full category tree into cat_coll.'''
        all_cat_list = []
        cat_dict = get_catinfo_new(0)
        all_cat_list.extend(cat_dict.values())

        def get_sub_cats_new(cat_id_list):
            # Recursively walk down the category tree, collecting every level.
            cat_sub_dict = get_catinfo_new(
                2, [str(cat_id) for cat_id in cat_id_list])
            if cat_sub_dict:
                cat_id_list = cat_sub_dict.keys()
                all_cat_list.extend(cat_sub_dict.values())
                get_sub_cats_new(cat_id_list)

        get_sub_cats_new(cat_dict.keys())
        old_cat_id_list = [cat['_id'] for cat in cat_coll.find({}, {'_id': 1})]
        new_cat_id_list, insert_list = [], []
        for cat in all_cat_list:
            cat_id = cat['cat_id']
            new_cat_id_list.append(cat_id)
            if cat_id in old_cat_id_list:
                cat_coll.update({'_id': cat_id}, {
                    '$set': {
                        'cat_name': cat['cat_name'],
                        'parent_cat_id': cat['parent_cat_id'],
                        'cat_level': cat['cat_level'],
                        'cat_path_name': cat['cat_path_name'],
                        'cat_path_id': cat['cat_path_id'],
                        'last_sync_time': cat['last_sync_time']
                    }
                })
            else:
                insert_list.append({
                    '_id': cat_id,
                    'cat_name': cat['cat_name'],
                    'parent_cat_id': cat['parent_cat_id'],
                    'cat_level': cat['cat_level'],
                    'cat_path_name': cat['cat_path_name'],
                    'cat_path_id': cat['cat_path_id'],
                    'last_sync_time': cat['last_sync_time']
                })
        remove_list = list(set(old_cat_id_list) - set(new_cat_id_list))
        if 0 in remove_list:
            remove_list.remove(0)
        try:
            cat_coll.insert(insert_list, continue_on_error=True, safe=True)
        except Exception:
            # Duplicate-key errors are expected on re-runs; ignore them.
            pass
        cat_coll.remove({'_id': {'$in': remove_list}})
        Cat.compute_child_list()

    @classmethod
    def add_all_recommand_words(cls):
        word_list, count = [], 0
        for adg in adg_coll.find({'has_push': {'$in': [None, False]}},
                                 {'shop_id': 1},
                                 timeout=False):
            adg_coll.update({'_id': adg['_id']}, {'$set': {'has_push': True}})
            try:
                word_list.extend([
                    kw['word'] for kw in get_tb_recommend_keywords(
                        adg['shop_id'], adg['_id'])
                ])
            except Exception as e:
                log.error('can not get tapi and the error is = %s' % e)
            if len(word_list) >= 10000:
                log.info('now get recommand words index is = %s' % count)
                KeywordInfo.insert_new_word_list(word_list)
                word_list = []
                count += 1
        KeywordInfo.insert_new_word_list(word_list)
def save_lable_list(label_list):
    if label_list:
        KeywordInfo.set_label_list(label_list)
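if __name__ == '__main__':
    # Rough sketch of the batch flow (an assumption pieced together from
    # the methods above, not a documented entry point): refresh
    # network-wide data, fold in newly discovered words, then re-run
    # category prediction.
    TaskTools.update_words()
    TaskTools.update_new_words()
    TaskTools.update_forcecats()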