    def merge_feed_data(cls):
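        """Build the feed-id -> product-id map and the merged feed info file.

        Collects the pairs emitted by extract_feedid2pid into a dict,
        broadcasts it as the module-level FID2PID_MAP, rewrites the feed
        info records through transfid2pid, and merges the result from
        FEED_HDFS_PATH down to FEED_LOCAL_PATH.
        """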
        feedid2pid_dict = spark_lib.read_hdfs(cls.sc, FEEDID2PID_PATH)\
            .map(cls.extract_feedid2pid).filter(None)\
            .reduceByKey(add).collectAsMap()
        print "length:", len(feedid2pid_dict)
        global FID2PID_MAP
        FID2PID_MAP = cls.sc.broadcast(feedid2pid_dict)
        feed_info_rdd = spark_lib.read_hdfs(cls.sc, FEEDINFO_PATH)\
            .map(cls.extract_feed_info).filter(None)\
            .map(cls.transfid2pid).filter(None)

        spark_lib.save2hdfs(feed_info_rdd, FEED_HDFS_PATH)
        awesome_hdfs.getmerge(FEED_HDFS_PATH, FEED_LOCAL_PATH)
        return
    def extract_insell_pid(cls):
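        """Take the first tab-separated column of PROD_CORE (the in-sell
        pids), save it to INSELL_PID_PATH, and merge it to a local file."""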
        rdd = spark_lib.read_hdfs(cls.sc, PROD_CORE)\
            .map(lambda line: line.strip().split('\t')[0])

        spark_lib.save2hdfs(rdd, INSELL_PID_PATH, outformat="origin")
        awesome_hdfs.getmerge(INSELL_PID_PATH, '~/base_data/insell_pids.dat')
        return
    def test_tps(cls):
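        """Ad-hoc check on one day of recosys logs: aggregate the pairs
        emitted by parse_log and print the 50 keys with the largest values."""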
        hdfs_path = '/groups/reco/recosys_log/2018-04-20'
        data = spark_lib.read_hdfs(cls.sc, hdfs_path)\
            .map(cls.parse_log).filter(None).reduceByKey(add).collectAsMap()
        print len(data)
        lst = sorted(data.iteritems(), key=lambda x: x[1])
        print lst[-50:]
        return
    def extract_hot_pid(cls):
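        """Sum the per-pid values from the daily click log, keep the top
        30000 pids passing filter_pids, and write them to hot_pids.dat."""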
        pid_dict = spark_lib.read_hdfs(cls.sc, CLICK_DAILY_PATH)\
            .map(cls.extract_click_pid).filter(None).reduceByKey(add)\
            .filter(cls.filter_pids).collectAsMap()
        print 'length:', len(pid_dict)
        pid_list = sorted(pid_dict.iteritems(), key=lambda x: x[1],
                          reverse=True)[:30000]
        print 'length:', len(pid_list)
        with open('/d1/home/yuanyuan/base_data/hot_pids.dat', 'w') as fp_hot:
            for pid in pid_list:
                print >> fp_hot, pid[0]

        return pid_list
    def get_active_custid(cls):
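        """Select active uids: summed click value greater than 5, capped at
        the 10000 most active, written to active_uids.dat."""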
        uid_dict = spark_lib.read_hdfs(cls.sc, CLICK_DAILY_PATH)\
            .map(cls.extract_click_uid).filter(None)\
            .reduceByKey(add).filter(lambda x: x[1] > 5).collectAsMap()
        uid_list = sorted(uid_dict.iteritems(),
                          key=lambda x: x[1],
                          reverse=True)
        print uid_list[1]
        if len(uid_list) > 10000:
            uid_list = uid_list[:10000]
        print 'length:', len(uid_list)
        with open('/d1/home/yuanyuan/base_data/active_uids.dat',
                  'w') as fp_active:
            for uid in uid_list:
                print >> fp_active, uid[0]

        return
    def check_reco_log_out(cls):
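        """Aggregate (sum, count) pairs from the recosys log per module key
        and per key prefix (before '_'), then write the averages along with
        sums and counts to time_module.dat and time_all.dat."""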
        data = spark_lib.read_hdfs(cls.sc, RECOSYS_LOG)\
            .map(cls.parse_recosys_log).filter(None)\
            .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))\
            .cache()  # reused twice below, avoid re-reading the log
        time_module = data.collectAsMap()
        time_all = data.map(lambda x: (x[0].split('_')[0], x[1]))\
            .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))\
            .collectAsMap()

        time_module_lst = sorted(time_module.iteritems(), key=lambda x: x[0])
        time_all_lst = sorted(time_all.iteritems(), key=lambda x: x[0])
        with open('time_module.dat', 'w') as fp_m:
            for item in time_module_lst:
                key, (sum_out, count) = item
                avg = sum_out * 1.0 / count
                print >> fp_m, '%s,%.2f,%s,%s' % (key, avg, sum_out, count)
        with open('time_all.dat', 'w') as fp_a:
            for item in time_all_lst:
                key, (sum_out, count) = item
                avg = sum_out * 1.0 / count
                print >> fp_a, '%s,%.2f,%s,%s' % (key, avg, sum_out, count)

        return
    def test_als(cls):
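        """Print the aggregated extract_perm output for the main ALS path,
        the extension path, and each category-specific ALS path."""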
        paths = [HDFS_PATH, ALS_EXT_PATH]
        for cat in ('babycloth', 'mancloth', 'motherbaby', 'womancloth',
                    'outside'):
            paths.append(ALS_PATH % cat)
        for path in paths:
            num = spark_lib.read_hdfs(cls.sc, path).map(
                cls.extract_perm).reduceByKey(add).collectAsMap()
            print num
        return
    def check_kpi(cls):
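        """Smoke test: parse OFFLINE_KPI_DATA with parse_kpi_data and make
        sure the job runs; the collected result is discarded."""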
        data = spark_lib.read_hdfs(cls.sc, OFFLINE_KPI_DATA)\
            .map(cls.parse_kpi_data).filter(None).collect()
        return
    def search_pid(cls):
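        """Collect and print the records of FILTER_PATH kept by
        parse_reco_and_seach."""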
        data = spark_lib.read_hdfs(cls.sc, FILTER_PATH)\
            .map(cls.parse_reco_and_seach).filter(None).collect()
        print data
        return
    def test_json(cls):
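        """Quick check that parse_json handles category_info.json: print the
        first parsed record."""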
        hdfs_path = '/groups/reco/readdp/category_info.json'
        rdd = spark_lib.read_hdfs(cls.sc, hdfs_path)\
            .map(cls.parse_json)
        print rdd.take(1)
        return