def get_view_profile_rowkeys(begin, end, mode='news'): # topic一个月大约30w,news一个大约80w,不需要快 print 'begin', begin, timestamp2datetime(begin) print 'end', end, timestamp2datetime(end) # 确定solr表 solr = solr_np if mode == 'news' else solr_tp # 调整时间戳为毫秒格式 begin = ensure_m_timestamp(begin) end = ensure_m_timestamp(end) print begin print end # 估算rows days = (end - begin) / 86400 / 1000 rows = days * every_day_topic_cnt if mode == 'topic' else days * every_day_news_cnt print 'rows', rows # 构建索引 solr_query = SolrQuery() q = '*:*' solr_query.set('q', q) solr_query.add('fq', 'event_time:[%s TO %s]' % (begin, end)) solr_query.set('rows', rows) solr_query.set('fl', ['id']) rowkey_list = [ item['id'] for item in solr.search(**solr_query.get_query_dict()) ] print len(rowkey_list) return rowkey_list
def test21(): ''' 尝试连接各种测试solr :return: ''' from add_data_to_solr.cy_solr_local.solr_base import SolrHelper, SolrCloud, ZooKeeper from general_utils.solr_utils import SolrQuery for table in [ "biztest_hospital_search", "biztest_main_doctors", "biztest_personal_doctors", "biztest_robot_news", "biztest_problem", "biztest_dialog", "biztest_full_problem", "biztest_drug", "biztest_topics", "biztest_pedia" ]: print 'tablename', table, '=' * 30 try: solr = SolrCloud(ZooKeeper("rd1:2181,rd2:2181"), table) solr_query = SolrQuery() solr_query.set('q', '*:*') solr_query.set('fl', ['*', 'score']) solr_query.set('rows', 10) for item in solr.search(**solr_query.get_query_dict()): print item.get('id') except Exception, e: print e
def get_query_uids(begin, end, mode='search'): from general_utils.solr_utils import zk_md4, SolrQuery, pysolr from general_utils.time_utils import ensure_m_timestamp # 更改时间戳格式 begin = ensure_m_timestamp(begin) end = ensure_m_timestamp(end) # 构建索引 q = '*:*' solr_query = SolrQuery() solr_query.set('q', q) solr_query.add('fq', 'event_time:[%s TO %s]' % (begin, end)) solr_query.set('fl', ['uid']) solr_query.set('rows', 1000000) if mode == 'search': solr = pysolr.SolrCloud(zk_md4, 'search_event', timeout=25) elif mode == 'news': solr = pysolr.SolrCloud(zk_md4, 'news_profile', timeout=25) elif mode == 'topic': solr = pysolr.SolrCloud(zk_md4, 'topic_profile', timeout=25) else: return set() uid_list = [ int(item['uid']) for item in solr.search(**solr_query.get_query_dict()) ] uid_set = set(uid_list) print 'len uids', len(uid_set) return uid_set
def get_click_data(): ''' (2017.10.31-2018.02.27)120天 分批从solr获取rowkey_list,每天一批(几十万) :return: ''' begin = 1509379200 end = 1519747200 # output file row_key_list_file = open(solr_rowkeylist_filename, 'w') for i in range((end - begin) / 86400): begin_i = begin + i * 86400 end_i = begin_i + 86400 begin_i_m = int(begin_i * 1000) end_i_m = int(end_i * 1000) print 'begin_i_m', begin_i_m, 'end_i_m', end_i_m # 构造query solrquery = SolrQuery() q = '*:*' solrquery.set('q', q) solrquery.add('fq', 'event_time:[%s TO %s]' % (begin_i_m, end_i_m)) solrquery.set('fl', ['id']) solrquery.set('rows', 100000) rowkey_list = [ item['id'] for item in solr_np.search(**solrquery.get_query_dict()) ] row_key_list_file.write('\n'.join(rowkey_list)) row_key_list_file.write('\n') print '%s row keys are saved' % len(rowkey_list) row_key_list_file.close() # output file2 connection = happybase.Connection('hbase_server', compat='0.90', port=19090, timeout=30000) table = connection.table("cy_event") click_action_file = open(click_actions_filename, 'w') row_key_list_file = open(solr_rowkeylist_filename, 'r') data_lines = row_key_list_file.readlines() data_lines = [item.strip('\n') for item in data_lines] row_key_list_file.close() cols = ['info:uid', 'info:news_id'] for batch_data in yield_batch_data(data_lines, 1000): row = table.rows(batch_data, columns=cols) for key, value in row: uid = int(value['info:uid']) newsid = int(value['info:news_id']) a, ts, b = key.split('|') outputstr = '|'.join([str(uid), str(ts), str(newsid)]) + '\n' click_action_file.write(outputstr) click_action_file.close() connection.close()
def ask_solr(sort_field_name, solr):
    '''
    Ask Solr for the largest value of a field.

    Runs a match-all query sorted descending on sort_field_name, limited to
    one row and to that single field.

    :param sort_field_name: name of the field to sort on and to return
    :param solr: client object exposing search(**params)
    :return: {'last_value': <value of sort_field_name in the top document>}
    :raises IndexError: when the query returns no documents
    '''
    query = SolrQuery()
    query_params = (
        ('q', '*:*'),
        ('sort', '%s desc' % sort_field_name),
        ('rows', 1),
        ('fl', [sort_field_name]),
    )
    for param_name, param_value in query_params:
        query.set(param_name, param_value)

    hits = list(solr.search(**query.get_query_dict()))
    top_hit = hits[0]
    return {'last_value': top_hit[sort_field_name]}
def test27(): from add_data_to_solr.cy_solr_local.solr_base import SolrHelper, SolrCloud, ZooKeeper from general_utils.solr_utils import SolrQuery solr = SolrCloud(ZooKeeper("md7:2181,md8:2181,md9:2181"), "main_doctors") for id in ("clinic_web_98f32ad0d4461af8", "clinic_web_63da2e8135fabfb1", "clinic_zhongyike_zhouyuchun", "clinic_web_7ff76f1118d806e6", "4647c810af1ee0850bf2"): sq = SolrQuery() sq.set('q', '*:*') sq.add('fq', 'id:%s' % id) sq.set('rows', 1) sq.set('fl', ['*']) res = [item for item in solr.search(**sq.get_query_dict())][0] if 'name' not in res: continue print '=' * 30 print id, res.get('name2', '') for key in res: if 'score' in key or 'rate' in key or 'star' in key: print key, res[key]
def get_ids():
    '''
    Return the ids of up to 200 documents whose doctor_id is
    clinic_web_c383b3a7e6db1f1d, queried through the module-level `solr`
    client.

    :return: list of the 'id' field of every returned document
    '''
    query = SolrQuery()
    query.set("q", "*:*")
    query.add("fq", "doctor_id:clinic_web_c383b3a7e6db1f1d")
    query.set("fl", ["id"])
    query.set("rows", 200)

    params = query.get_query_dict()
    ids = []
    for doc in solr.search(**params):
        ids.append(doc["id"])
    return ids