def merge_feed_data(cls):
    # Build the feed_id -> pid mapping, broadcast it, then rewrite the feed
    # info records with pids and dump the result to HDFS and the local disk.
    feedid2pid_dict = spark_lib.read_hdfs(cls.sc, FEEDID2PID_PATH)\
        .map(cls.extract_feedid2pid).filter(None)\
        .reduceByKey(add).collectAsMap()
    print "length:", len(feedid2pid_dict)

    global FID2PID_MAP
    FID2PID_MAP = cls.sc.broadcast(feedid2pid_dict)

    feed_info_rdd = spark_lib.read_hdfs(cls.sc, FEEDINFO_PATH)\
        .map(cls.extract_feed_info).filter(None)\
        .map(cls.transfid2pid).filter(None)
    spark_lib.save2hdfs(feed_info_rdd, FEED_HDFS_PATH)
    awesome_hdfs.getmerge(FEED_HDFS_PATH, FEED_LOCAL_PATH)
    return
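# The two helpers below are NOT part of the original code; they are minimal
# sketches, assuming extract_feedid2pid maps a "feed_id \t pid" line to
# (feed_id, [pid]) (so reduceByKey(add) concatenates pids per feed) and
# transfid2pid rewrites a parsed feed record via the FID2PID_MAP broadcast.
# Field layout and record shapes are assumptions.
def extract_feedid2pid(cls, line):
    fields = line.strip().split('\t')
    if len(fields) < 2:
        return None
    return (fields[0], [fields[1]])

def transfid2pid(cls, feed_record):
    # Assumed record shape: (feed_id, info); unknown feed_ids are dropped.
    feed_id, info = feed_record
    pids = FID2PID_MAP.value.get(feed_id)
    if not pids:
        return None
    return (pids[0], info)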
def extract_insell_pid(cls):
    # Collect the pids of products currently on sale (first column of the
    # product core table) and merge them to a local file.
    rdd = spark_lib.read_hdfs(cls.sc, PROD_CORE)\
        .map(lambda line: line.strip().split('\t')[0])
    spark_lib.save2hdfs(rdd, INSELL_PID_PATH, outformat="origin")
    awesome_hdfs.getmerge(INSELL_PID_PATH, '~/base_data/insell_pids.dat')
    return
def test_tps(cls):
    # Count requests per key in one day of recosys logs and print the
    # 50 busiest keys.
    hdfs_path = '/groups/reco/recosys_log/2018-04-20'
    data = spark_lib.read_hdfs(cls.sc, hdfs_path)\
        .map(cls.parse_log).filter(None).reduceByKey(add).collectAsMap()
    print len(data)
    lst = sorted(data.iteritems(), key=lambda x: x[1])
    print lst[-50:]
    return
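# Hypothetical sketch (not in the original) of what cls.parse_log is assumed
# to emit for the TPS check above: one (key, 1) pair per log line, keyed on
# something like the request timestamp at second granularity, so the
# reduceByKey(add) counts approximate transactions per second.
def parse_log(cls, line):
    fields = line.strip().split('\t')
    if not fields or not fields[0]:
        return None
    # Assumed layout: the first field is the per-second timestamp bucket.
    return (fields[0], 1)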
def extract_hot_pid(cls):
    # Aggregate daily click counts per pid, keep the 30,000 most clicked
    # pids and write them to a local file.
    pid_dict = spark_lib.read_hdfs(cls.sc, CLICK_DAILY_PATH)\
        .map(cls.extract_click_pid).filter(None).reduceByKey(add)\
        .filter(cls.filter_pids).collectAsMap()
    print 'length:', len(pid_dict)
    pid_list = sorted(pid_dict.iteritems(), key=lambda x: x[1], reverse=True)[:30000]
    print 'length:', len(pid_list)
    with open('/d1/home/yuanyuan/base_data/hot_pids.dat', 'w') as fp_hot:
        for pid in pid_list:
            print >> fp_hot, pid[0]
    # spark_lib.save2hdfs(rdd, INSELL_PID_PATH)
    # awesome_hdfs.getmerge(INSELL_PID_PATH, '~/base_data/insell_pids.dat')
    return pid_list
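# Hypothetical sketches (not in the original) of the helpers used above:
# extract_click_pid is assumed to emit a (pid, 1) pair per click record, and
# filter_pids to drop pids whose accumulated click count is too low. The
# field layout and the threshold are assumptions.
def extract_click_pid(cls, line):
    fields = line.strip().split('\t')
    if len(fields) < 2:
        return None
    # Assumed layout: uid \t pid \t ...
    return (fields[1], 1)

def filter_pids(cls, item):
    pid, clicks = item
    return clicks >= 10  # threshold is an assumption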
def get_active_custid(cls):
    # Keep users with more than 5 clicks, take the 10,000 most active
    # and write their ids to a local file.
    uid_dict = spark_lib.read_hdfs(cls.sc, CLICK_DAILY_PATH)\
        .map(cls.extract_click_uid).filter(None)\
        .reduceByKey(add).filter(lambda x: x[1] > 5).collectAsMap()
    uid_list = sorted(uid_dict.iteritems(), key=lambda x: x[1], reverse=True)
    print uid_list[1]
    if len(uid_list) > 10000:
        uid_list = uid_list[:10000]
    print 'length:', len(uid_list)
    with open('/d1/home/yuanyuan/base_data/active_uids.dat', 'w') as fp_active:
        for uid in uid_list:
            print >> fp_active, uid[0]
    return
def check_reco_log_out(cls):
    # Accumulate (total_time, count) per module key, and also per the key
    # prefix before '_' (the whole interface), then dump average/sum/count
    # for every key to local comma-separated files.
    data = spark_lib.read_hdfs(cls.sc, RECOSYS_LOG)\
        .map(cls.parse_recosys_log).filter(None)\
        .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))
    time_module = data.collectAsMap()
    time_all = data.map(lambda x: (x[0].split('_')[0], x[1]))\
        .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))\
        .collectAsMap()

    time_module_lst = sorted(time_module.iteritems(), key=lambda x: x[0])
    time_all_lst = sorted(time_all.iteritems(), key=lambda x: x[0])
    with open('time_module.dat', 'w') as fp_m:
        for key, (sum_out, count) in time_module_lst:
            print >> fp_m, '%s,%d,%s,%s' % (key, sum_out * 1.0 / count, sum_out, count)
    with open('time_all.dat', 'w') as fp_a:
        for key, (sum_out, count) in time_all_lst:
            print >> fp_a, '%s,%d,%s,%s' % (key, sum_out * 1.0 / count, sum_out, count)
    return
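# Hypothetical sketch (not in the original) of cls.parse_recosys_log: it is
# assumed to emit (module_key, (elapsed_time, 1)) so the reduceByKey above
# accumulates (total_time, request_count) per module, with module_key shaped
# like "<interface>_<module>" (hence the split('_')[0] for interface totals).
def parse_recosys_log(cls, line):
    fields = line.strip().split('\t')
    if len(fields) < 2:
        return None
    try:
        elapsed = int(fields[1])
    except ValueError:
        return None
    return (fields[0], (elapsed, 1))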
def test_als(cls):
    # Print the per-key counts (via extract_perm) for each ALS result path:
    # the main output, the extended output, and every per-category output.
    paths = [HDFS_PATH, ALS_EXT_PATH]
    paths += [ALS_PATH % cate for cate in
              ('babycloth', 'mancloth', 'motherbaby', 'womancloth', 'outside')]
    for path in paths:
        num = spark_lib.read_hdfs(cls.sc, path).map(
            cls.extract_perm).reduceByKey(add).collectAsMap()
        print num
    return
def check_kpi(cls):
    # Parse the offline KPI data; the collect() only forces evaluation so
    # that bad records surface here, the result itself is unused.
    data = spark_lib.read_hdfs(cls.sc, OFFLINE_KPI_DATA)\
        .map(cls.parse_kpi_data).filter(None).collect()
    return
def search_pid(cls):
    # Collect and print the records matched by parse_reco_and_seach.
    data = spark_lib.read_hdfs(cls.sc, FILTER_PATH)\
        .map(cls.parse_reco_and_seach).filter(None).collect()
    print data
    return
def test_json(cls):
    # Sanity-check that the category info json parses: show one record.
    hdfs_path = '/groups/reco/readdp/category_info.json'
    rdd = spark_lib.read_hdfs(cls.sc, hdfs_path)\
        .map(cls.parse_json)
    print rdd.take(1)
    return
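# Hypothetical sketch (not in the original) of cls.parse_json, assuming each
# line of category_info.json holds one JSON object; malformed lines map to
# None so they could be dropped with filter(None).
def parse_json(cls, line):
    import json  # local import keeps the sketch self-contained
    try:
        return json.loads(line.strip())
    except ValueError:
        return None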