Esempio n. 1
0
def attach_user_act(X, month):
    """Append per-user aggregated activity columns to ``X`` in place.

    Loads the ``user_act_counts_month_1<month - 1>`` table and its
    column-name file from ``paths.my_path``, indexes the table rows by
    ``uid`` and concatenates one column per non-``uid`` table column to
    ``X``.  New columns are named ``<original>_user_all_month``; rows of
    ``X`` whose ``uid`` is not in the table receive zeros.

    Args:
        X: table object exposing ``ndata``, ``get_cell`` and ``cat_col``.
        month: target month; the file suffix is built from ``1`` and
            ``month - 1``.
    """
    names_path = (paths.my_path +
                  'user_act_counts_month_{0}{1}_col_names.csv'.format(
                      1, month - 1))
    data_path = (paths.my_path +
                 'user_act_counts_month_{0}{1}.csv'.format(1, month - 1))

    col_names = np.genfromtxt(names_path, delimiter=',', dtype=str)
    # Every column except the leading one is a feature to attach.
    n_features = col_names.shape[1] - 1

    # NOTE(review): the format list is one shorter than the name list;
    # kept as in the original -- confirm how Matrix treats col_formats.
    full_act = Matrix(
        np.genfromtxt(data_path, delimiter=',', dtype=float),
        col_names[0, :],
        col_formats=["%s"] * n_features)

    # uid -> feature vector (all columns after the first).
    features_by_uid = HashSet()
    for row in xrange(full_act.ndata):
        if row % 100000 == 0:
            print(row)
        features_by_uid.set(full_act.get_cell(row, "uid"),
                            full_act.matrix[row, 1:])

    X_full_act = np.zeros((X.ndata, n_features))
    for row in xrange(X.ndata):
        if row % 100000 == 0:
            print(row)
        X_full_act[row, :] = features_by_uid.get(X.get_cell(row, "uid"),
                                                 np.zeros(n_features))

    new_names = ['{0}_user_all_month'.format(name)
                 for name in col_names[0, 1:]]
    X.cat_col(X_full_act, new_names, ["%s"] * len(new_names))
Esempio n. 2
0
def regen_feature_by_month(func, month, feature_name):
    """Reload the table of ``month``, drop ``feature_name`` and call ``func``.

    The table is read from ``X<month>.csv`` with column names taken from
    the first row of ``col_names<month>.csv`` (both under
    ``paths.my_path``).

    Args:
        func: callable invoked as ``func(X, month, feature_name)`` after
            the column has been dropped.
        month: month used to build both file names.
        feature_name: column name passed to ``X.drop``.
    """
    names_file = paths.my_path + 'col_names{0}.csv'.format(month)
    data_file = paths.my_path + 'X{0}.csv'.format(month)

    col_names = np.genfromtxt(names_file, delimiter=',', dtype=str)
    n_cols = col_names.shape[1]
    raw = np.genfromtxt(data_file, delimiter=',', dtype=str)

    X = Matrix(raw, list(col_names[0, :]), ["%s"] * n_cols)
    X.drop(feature_name)
    func(X, month, feature_name)
Esempio n. 3
0
def attach_full_act_history(X, month):
    """Append per-(uid, mid) activity history columns to ``X`` in place.

    For every earlier month ``k`` (from ``month - 1`` down to ``1``) the
    table ``act_counts_month_<k>`` is loaded from ``paths.my_path``.  Its
    columns from index 3 onward are attached to ``X`` under the names
    ``<name>_month_<month - k>``, matched on ``uid`` and ``mid``; rows of
    ``X`` without a match receive zeros.

    Afterwards ``7 - month`` all-zero column groups are appended with the
    suffixes ``month``, ``month + 1``, ... so that the suffixes continue
    where the loaded history stops instead of repeating earlier ones.

    Args:
        X: table object exposing ``ndata``, ``get_cell`` and ``cat_col``.
        month: target month; history is read for months ``1 .. month - 1``.

    Raises:
        ValueError: if padding columns are required but no history month
            was loaded (``month <= 1``), because the padding column names
            are derived from the last loaded history table.
    """
    col_names_history = None
    for k in xrange(month - 1, 0, -1):
        print(k)
        col_names_history = np.genfromtxt(
            paths.my_path + 'act_counts_month_{0}_col_names.csv'.format(k),
            delimiter=',',
            dtype=str)
        history = Matrix(
            np.genfromtxt(paths.my_path + 'act_counts_month_{0}.csv'.format(k),
                          delimiter=',',
                          dtype=float), col_names_history[0, :],
            ["%s"] * col_names_history.shape[1])
        # NOTE(review): the first three columns are skipped although the
        # lookup key is only (uid, mid) -- confirm the file layout.
        n_features = col_names_history.shape[1] - 3

        # uid -> (mid -> feature vector).  This relies on HashSet.get
        # storing the supplied default on a miss (presumably; the class is
        # not visible here).
        history_hash = HashSet()
        for i in xrange(history.ndata):
            if i % 100000 == 0:
                print(i)
            uid = history.get_cell(i, "uid")
            mid = history.get_cell(i, "mid")
            history_hash.get(uid, HashSet()).set(mid, history.matrix[i, 3:])

        X_history = np.zeros((X.ndata, n_features))
        for i in xrange(X.ndata):
            if i % 100000 == 0:
                print(i)
            uid = X.get_cell(i, "uid")
            mid = X.get_cell(i, "mid")
            X_history[i, :] = history_hash.get(uid, HashSet()).get(
                mid, np.zeros(n_features))

        col_names = ['{0}_month_{1}'.format(name, month - k)
                     for name in col_names_history[0, 3:]]
        X.cat_col(X_history, col_names, ["%s"] * len(col_names))

    n_padding = 7 - month
    if n_padding > 0 and col_names_history is None:
        # Previously this surfaced as a NameError on col_names_history.
        raise ValueError(
            'attach_full_act_history: no history month was loaded for '
            'month={0}; cannot derive padding column names'.format(month))
    for k in xrange(n_padding):
        # The history loop used suffixes 1 .. month - 1, so padding must
        # continue at month, month + 1, ...  The former ``month - k``
        # re-used suffixes and produced duplicate column names.
        col_names = ['{0}_month_{1}'.format(name, month + k)
                     for name in col_names_history[0, 3:]]
        X.cat_col(np.zeros((X.ndata, len(col_names))), col_names,
                  ["%s"] * len(col_names))
Esempio n. 4
0
def append_feature_by_month(func, month):
    """Load the ``X_<month>_1<month - 1>_shop_month`` table and hand it to ``func``.

    Both the data file and its ``_col_names`` companion are read from
    ``paths.my_path``; the first row of the companion supplies the column
    names.

    Args:
        func: callable invoked as ``func(X, month)``.
        month: month used to build both file names.
    """
    names_file = (paths.my_path +
                  'X_{0}_{1}{2}_shop_month_col_names.csv'.format(
                      month, 1, month - 1))
    data_file = (paths.my_path +
                 'X_{0}_{1}{2}_shop_month.csv'.format(month, 1, month - 1))

    col_names = np.genfromtxt(names_file, delimiter=',', dtype=str)
    n_cols = col_names.shape[1]
    raw = np.genfromtxt(data_file, delimiter=',', dtype=str)

    X = Matrix(raw, list(col_names[0, :]), ["%s"] * n_cols)
    func(X, month)
Esempio n. 5
0
def drop_multiple(feature_names, month):
    """Drop several columns from the table of ``month`` and checkpoint it.

    The table is read from ``X<month>.csv`` / ``col_names<month>.csv``
    under ``paths.my_path``; after all drops ``X.check_point(month)`` is
    called.

    Args:
        feature_names: iterable of column names, each passed to ``X.drop``.
        month: month used to build the file names and passed to
            ``check_point``.
    """
    header = np.genfromtxt(paths.my_path + 'col_names{0}.csv'.format(month),
                           delimiter=',',
                           dtype=str)
    width = header.shape[1]
    cells = np.genfromtxt(paths.my_path + 'X{0}.csv'.format(month),
                          delimiter=',',
                          dtype=str)
    X = Matrix(cells, list(header[0, :]), ["%s"] * width)

    for feature in feature_names:
        X.drop(feature)
    X.check_point(month)
Esempio n. 6
0
def drop(feature_name, month, checkpoint=True):
    """Drop one column from the table of ``month``.

    The table is read from ``X<month>.csv`` / ``col_names<month>.csv``
    under ``paths.my_path``.

    Args:
        feature_name: column name passed to ``X.drop``.
        month: month used to build the file names.
        checkpoint: when true, ``X.check_point(month)`` is called after the
            drop; otherwise the modified table is simply discarded.
    """
    names_path = paths.my_path + 'col_names{0}.csv'.format(month)
    table_path = paths.my_path + 'X{0}.csv'.format(month)

    col_names = np.genfromtxt(names_path, delimiter=',', dtype=str)
    formats = ["%s"] * col_names.shape[1]
    table = np.genfromtxt(table_path, delimiter=',', dtype=str)

    X = Matrix(table, list(col_names[0, :]), formats)
    X.drop(feature_name)
    if not checkpoint:
        return
    X.check_point(month)
Esempio n. 7
0
def gen_by_month(target_file, month, since=None):
    """Run the feature-generation pipeline for ``month`` and checkpoint it.

    Reads ``offline_train_test_<month>.csv`` from ``paths.ccf_path``.  For
    ``month < 7`` the file is loaded with a seventh ``date`` column and
    ``gen_label`` is run as the last step; otherwise only the six base
    columns are used and no label is generated.  The step name is printed
    before each step runs.

    Args:
        target_file: name passed to ``X.check_point`` at the end.
        month: month to generate.
        since: accepted but not used by this function.
    """
    with_date = month < 7
    columns = ["uid", "mid", "cid", "dis_rate", "dist", "date_rec"]
    if with_date:
        columns.append("date")

    raw = np.genfromtxt(
        paths.ccf_path + 'offline_train_test_' + str(month) + '.csv',
        delimiter=',',
        dtype=str)
    X = Matrix(raw, columns, ["%s"] * len(columns))

    print("generating month {0}".format(month))

    # (label, action) pairs; the lambdas defer name lookup until the step
    # actually runs, exactly like the original inline calls.
    steps = [
        ("gen_user_shop_features",
         lambda: gen_user_shop_features(X, month)),
        ("gen_no_penalty_user_shop_features",
         lambda: gen_no_penalty_user_shop_features(X, month)),
        ("gen_basic_features",
         lambda: gen_basic_features(X)),
        ("attach_full_act_history",
         lambda: attach_full_act_history(X, month)),
        ("attach_user_act_history",
         lambda: attach_user_act_history(X, month)),
        ("attach_full_act",
         lambda: attach_full_act(X, month)),
        ("attach_user_act",
         lambda: attach_user_act(X, month)),
    ]
    if with_date:
        steps.append(("gen_label", lambda: gen_label(X)))

    for label, run_step in steps:
        print(label)
        run_step()

    X.check_point(target_file)
Esempio n. 8
0
def gen_act_counts_by_month(month):
    """Count activity per (uid, mid) pair for ``month`` and checkpoint it.

    Reads ``offline_1_month<month>.csv`` and ``online_1_month<month>.csv``
    from ``paths.ccf_path``.  Every counted row increments one of three
    counters, both for its (uid, mid) pair and for its uid:

    * slot 0 -- ``date`` is null and ``date_rec`` is not
      (output column ``unused_coupon``);
    * slot 1 -- ``date`` is set and ``cid`` is null
      (``buy_without_coupon``);
    * slot 2 -- ``date`` is set and ``cid`` is not null (``use_coupon``).

    Online rows only reach slots 1 and 2 when ``act == '1'``; other online
    rows with a non-null ``date_rec`` go to slot 0 and must not have
    ``act == '0'``.  One output row per counted pair is written to
    ``act_counts_month_<month>`` with the counts and derived ratios.

    Args:
        month: month used to build the input and output file names.
    """
    def load(file_name, columns):
        raw = np.genfromtxt(paths.ccf_path + file_name,
                            delimiter=',',
                            dtype=str)
        return Matrix(raw, columns, ["%s"] * len(columns))

    def divide(x, y):
        if y == 0:
            return 0
        else:
            return x * 1.0 / y

    offline = load(
        'offline_1_month{0}.csv'.format(month),
        ["uid", "mid", "cid", "dis_rate", "dist", "date_rec", "date"])
    online = load(
        'online_1_month{0}.csv'.format(month),
        ["uid", "mid", "act", "cid", "dis_rate", "date_rec", "date"])

    user_hash_set = HashSet()
    full_hash_set = HashSet()

    def bump(uid, mid, slot):
        """Increment ``slot`` for the pair and its user.

        Returns 1 when the pair's counters were all zero beforehand,
        otherwise 0, so the caller can count distinct pairs.
        """
        pair_counts = full_hash_set.get(uid, HashSet()).get(
            mid, np.zeros(3, dtype=float))
        user_counts = user_hash_set.get(uid, np.zeros(3, dtype=float))
        first_hit = 0 if pair_counts.any() else 1
        pair_counts[slot] += 1
        user_counts[slot] += 1
        return first_hit

    ndata = 0
    for i in xrange(offline.ndata):
        if i % 100000 == 0:
            print(i)
        uid_str = offline.get_cell(i, "uid")
        mid_str = offline.get_cell(i, "mid")
        cid_str = offline.get_cell(i, "cid")
        date_str = offline.get_cell(i, "date")
        date_rec_str = offline.get_cell(i, "date_rec")

        if date_str != 'null':
            slot = 2 if cid_str != 'null' else 1
        elif date_rec_str != 'null':
            slot = 0
        else:
            continue
        ndata += bump(uid_str, mid_str, slot)

    for i in xrange(online.ndata):
        if i % 100000 == 0:
            print(i)
        uid_str = online.get_cell(i, "uid")
        mid_str = online.get_cell(i, "mid")
        cid_str = online.get_cell(i, "cid")
        date_str = online.get_cell(i, "date")
        date_rec_str = online.get_cell(i, "date_rec")
        act_str = online.get_cell(i, "act")

        if date_str != 'null' and act_str == '1':
            slot = 2 if cid_str != 'null' else 1
        elif date_rec_str != 'null':
            assert act_str != '0'
            slot = 0
        else:
            continue
        ndata += bump(uid_str, mid_str, slot)

    col_names = [
        "uid", "mid",
        "unused_coupon", "buy_without_coupon", "use_coupon",
        "total_coupon", "total_buy",
        "act_ratio_0", "act_ratio_1", "act_ratio_2",
        "used_ratio", "unused_ratio",
        "buy_with_coupon_ratio", "buy_without_coupon_ratio",
        "unused_coupon_shop_ratio", "buy_without_coupon_shop_ratio",
        "use_coupon_shop_ratio",
    ]
    full_table = Matrix(np.zeros((ndata, len(col_names))),
                        col_names,
                        col_formats=["%s"] * len(col_names))

    row_index = 0
    for uid in full_hash_set.get_keys():
        user_totals = user_hash_set.get(uid)
        user_total_unused_coupon = user_totals[0]
        user_total_buy_without_coupon = user_totals[1]
        user_total_use_coupon = user_totals[2]

        shops = full_hash_set.get(uid)
        for mid in shops.get_keys():
            if row_index % 100000 == 0:
                print(row_index)
            acts = shops.get(mid)
            unused_coupon = acts[0]
            buy_without_coupon = acts[1]
            use_coupon = acts[2]
            total_acts = use_coupon + unused_coupon + buy_without_coupon
            total_coupon = use_coupon + unused_coupon
            total_buy = use_coupon + buy_without_coupon

            row = [
                uid, mid,
                unused_coupon, buy_without_coupon, use_coupon,
                total_coupon, total_buy,
                # share of each action among all actions of the pair
                divide(unused_coupon, total_acts),
                divide(buy_without_coupon, total_acts),
                divide(use_coupon, total_acts),
                # split of coupons, then split of purchases
                divide(use_coupon, total_coupon),
                divide(unused_coupon, total_coupon),
                divide(use_coupon, total_buy),
                divide(buy_without_coupon, total_buy),
                # this pair's share of the user's totals
                divide(unused_coupon, user_total_unused_coupon),
                divide(buy_without_coupon, user_total_buy_without_coupon),
                divide(use_coupon, user_total_use_coupon),
            ]
            full_table.set_row(row_index, np.array(row))
            row_index += 1

    full_table.check_point("act_counts_month_{0}".format(month))
def calc_give_after_purchase():
    """Attach shop-level coupon counters to the offline training table.

    Reads ``ccf_offline_stage1_train.csv`` from ``paths.ccf_path`` and makes
    three passes over it:

    1. collect, per (uid, mid), the list of non-null ``date`` values;
    2. for every row with a non-null ``cid`` count, per ``mid``: the row
       itself, whether ``date`` is non-null, whether ``date_rec`` appears in
       the pair's date list, and whether both of the latter hold;
    3. write the four per-``mid`` counters onto every row.

    The extended table is checkpointed as
    ``give_after_purchase_statistics``.  Only shop-level counters are
    produced.
    """
    offline = Matrix(
        np.genfromtxt(paths.ccf_path + 'ccf_offline_stage1_train.csv',
                      delimiter=',',
                      dtype=str),
        ["uid", "mid", "cid", "dis_rate", "dist", "date_rec", "date"],
        ["%s"] * 7)

    # Pass 1: uid -> (mid -> list of non-null ``date`` values).
    user_shop_coupon_buy = HashSet()
    for i in xrange(offline.ndata):
        if i % 100000 == 0:
            print(i)
        uid_str = offline.get_cell(i, "uid")
        mid_str = offline.get_cell(i, "mid")
        date_str = offline.get_cell(i, "date")
        if date_str != 'null':
            user_shop_coupon_buy.get(uid_str, HashSet()).get(
                mid_str, []).append(date_str)

    shop_coupon = {}
    shop_give_after_purchase = {}
    shop_give_after_purchase_used = {}
    shop_coupon_used = {}

    # Pass 2: per-shop counters over rows that carry a coupon id.
    for i in xrange(offline.ndata):
        if i % 100000 == 0:
            print(i)
        uid_str = offline.get_cell(i, "uid")
        mid_str = offline.get_cell(i, "mid")
        date_str = offline.get_cell(i, "date")
        cid_str = offline.get_cell(i, "cid")
        date_rec_str = offline.get_cell(i, "date_rec")
        # Looked up for every row, as in the original, so any entry the
        # HashSet creates on a miss is still created.
        date_list = user_shop_coupon_buy.get(uid_str, HashSet()).get(
            mid_str, [])
        if cid_str == 'null':
            continue

        has_date = date_str != 'null'
        shop_coupon[mid_str] = shop_coupon.get(mid_str, 0) + 1
        if date_rec_str in date_list:
            shop_give_after_purchase[mid_str] = (
                shop_give_after_purchase.get(mid_str, 0) + 1)
            if has_date:
                shop_give_after_purchase_used[mid_str] = (
                    shop_give_after_purchase_used.get(mid_str, 0) + 1)
        if has_date:
            shop_coupon_used[mid_str] = shop_coupon_used.get(mid_str, 0) + 1

    give_after_purchase_statistics_names = [
        "shop_coupon", "shop_coupon_used",
        "shop_coupon_give_after_purchase",
        "shop_coupon_give_after_purchase_used",
    ]
    # Same order as the names above.
    counters = (shop_coupon, shop_coupon_used, shop_give_after_purchase,
                shop_give_after_purchase_used)

    # Pass 3: broadcast the shop counters onto every row.
    give_after_purchase_statistics = np.zeros(
        (offline.ndata, len(give_after_purchase_statistics_names)))
    for i in xrange(offline.ndata):
        if i % 100000 == 0:
            print(i)
        mid_str = offline.get_cell(i, "mid")
        give_after_purchase_statistics[i, :] = np.array(
            [counter.get(mid_str, 0) for counter in counters])

    offline.cat_col(give_after_purchase_statistics,
                    give_after_purchase_statistics_names,
                    ["%s"] * len(give_after_purchase_statistics_names))
    offline.check_point("give_after_purchase_statistics")
def aggregate_shops_months(since, to):
    """Combine the per-user monthly tables for months ``since`` .. ``to``.

    For every month ``k`` in the inclusive range the table
    ``user_act_counts_month_<k>`` is read from ``paths.my_path`` and its
    ``unused_coupon``, ``buy_without_coupon`` and ``use_coupon`` values are
    accumulated into a per-user ``(to + 1 - since, 3)`` array.  One output
    row per user holds the summed counts, derived ratios and whatever
    ``calc_trend_and_max_avg_var`` returns for the user's array.  The
    result is checkpointed as ``user_act_counts_month_<since><to>``.

    NOTE(review): ``divide`` and ``calc_trend_and_max_avg_var`` are not
    defined inside this function; presumably module-level helpers --
    confirm they exist.

    Args:
        since: first month (inclusive).
        to: last month (inclusive).
    """
    n_months = to + 1 - since
    count_fields = ("unused_coupon", "buy_without_coupon", "use_coupon")

    user_act_counts = HashSet()
    for k in xrange(since, to + 1):
        col_names = np.genfromtxt(
            paths.my_path +
            'user_act_counts_month_{0}_col_names.csv'.format(k),
            delimiter=',',
            dtype=str)
        monthly = np.genfromtxt(
            paths.my_path + 'user_act_counts_month_{0}.csv'.format(k),
            delimiter=',',
            dtype=float)
        user_table = Matrix(monthly, col_names[0, :], 0.0)

        month_slot = k - since
        for i in xrange(user_table.ndata):
            uid = user_table.get_cell(i, "uid")
            acts = user_act_counts.get(uid, np.zeros((n_months, 3)))
            for slot, field in enumerate(count_fields):
                acts[month_slot, slot] += user_table.get_cell(i, field)

    user_col_names = [
        "uid", "unused_coupon", "buy_without_coupon", "use_coupon",
        "total_coupon", "total_buy",
        "act_ratio_0", "act_ratio_1", "act_ratio_2",
        "used_ratio", "unused_ratio",
        "buy_with_coupon_ratio", "buy_without_coupon_ratio",
        "trend_unused_coupon", "trend_buy_without_coupon",
        "trend_use_coupon", "trend_total_coupon", "trend_total_buy",
        "trend_act_ratio_0", "trend_act_ratio_1", "trend_act_ratio_2",
        "trend_used_ratio", "trend_unused_ratio",
        "trend_buy_with_coupon_ratio", "trend_buy_without_coupon_ratio",
        "max_unused_coupon_month", "max_buy_without_coupon_month",
        "max_use_coupon_month", "max_total_coupon_month",
        "max_total_buy_month",
        "avg_unused_coupon_month", "avg_buy_without_coupon_month",
        "avg_use_coupon_month", "avg_total_coupon_month",
        "avg_total_buy_month",
        "var_unused_coupon_month", "var_buy_without_coupon_month",
        "var_use_coupon_month", "var_total_coupon_month",
        "var_total_buy_month",
    ]
    ndata = len(user_act_counts.get_keys())
    user_act_table = Matrix(np.zeros((ndata, len(user_col_names))),
                            user_col_names,
                            ["%s"] * len(user_col_names))

    for row_index, uid in enumerate(user_act_counts.get_keys()):
        acts = user_act_counts.get(uid)
        unused_coupon = np.sum(acts[:, 0])
        buy_without_coupon = np.sum(acts[:, 1])
        use_coupon = np.sum(acts[:, 2])

        total_coupon = unused_coupon + use_coupon
        total_buy = buy_without_coupon + use_coupon
        total_acts = unused_coupon + buy_without_coupon + use_coupon

        summary = np.array([
            uid, unused_coupon, buy_without_coupon, use_coupon,
            total_coupon, total_buy,
            divide(unused_coupon, total_acts),
            divide(buy_without_coupon, total_acts),
            divide(use_coupon, total_acts),
            divide(use_coupon, total_coupon),
            divide(unused_coupon, total_coupon),
            divide(use_coupon, total_buy),
            divide(buy_without_coupon, total_buy),
        ])
        user_act_table.set_row(
            row_index,
            np.hstack((summary, calc_trend_and_max_avg_var(acts))))

    user_act_table.check_point("user_act_counts_month_{0}{1}".format(
        since, to))
Esempio n. 11
0
def aggregate_shops_by_month(month):
    """Collapse the per-(uid, mid) activity table of ``month`` to one row per user.

    Reads ``act_counts_month_<month>`` and its ``_col_names`` companion from
    ``paths.my_path``, groups the per-shop counters by ``uid`` and writes,
    for each user: summed counts, derived ratios, shop counts, and the
    max / mean / variance across that user's shops, followed by the mean,
    variance, median and min over the non-zero entries only.  The result is
    checkpointed as ``user_act_counts_month_<month>``.

    Args:
        month: month used to build the input and output file names.
    """
    def divide(x, y):
        if y == 0:
            return 0
        else:
            return x * 1.0 / y

    def nonzero_stat(reducer, values, total):
        # Apply ``reducer`` to the non-zero entries only; yield 0 when the
        # series sums to nothing, where the reducer would see an empty array.
        if total > 0:
            return reducer(values[np.nonzero(values)])
        return 0

    col_names = np.genfromtxt(
        paths.my_path + 'act_counts_month_{0}_col_names.csv'.format(month),
        delimiter=',',
        dtype=str)
    full_act_counts = Matrix(
        np.genfromtxt(paths.my_path + 'act_counts_month_{0}.csv'.format(month),
                      delimiter=',',
                      dtype=float),
        col_names[0, :],
        col_formats=["%s"] * col_names.shape[1])

    # (per-user list key, source column) in the order used for every
    # five-wide group of output columns.
    series_sources = [
        ("unused_coupons", "unused_coupon"),
        ("buy_without_coupons", "buy_without_coupon"),
        ("use_coupons", "use_coupon"),
        ("total_coupons", "total_coupon"),
        ("total_buys", "total_buy"),
    ]

    user_act_counts = HashSet()
    for i in xrange(full_act_counts.ndata):
        uid_str = full_act_counts.get_cell(i, "uid")
        user_acts = user_act_counts.get(
            uid_str, dict((list_key, []) for list_key, _ in series_sources))
        for list_key, column in series_sources:
            user_acts[list_key].append(full_act_counts.get_cell(i, column))

    all_users = user_act_counts.get_keys()
    # NOTE(review): "act_ratio2", "avg_total_coupon_nonzero" and
    # "avg_total_buy_nonzero" break the naming pattern of their neighbours;
    # left untouched because downstream readers may rely on these names.
    user_col_names = ["uid", \
        "unused_coupon", "buy_without_coupon", "use_coupon", "total_coupon", "total_buy", \
        "act_ratio_0", "act_ratio_1", "act_ratio2", \
        "used_ratio", "unused_ratio", "buy_with_coupon_ratio", "buy_without_coupon_ratio", \
        "unused_coupon_shop_number", "buy_without_coupon_shop_number", "use_coupon_shop_number", "pure_unused_coupon_shop_number", "pure_no_unused_coupon_shop_number", \
        "max_unused_coupon_shop", "max_buy_without_coupon_shop", "max_use_coupon_shop", "max_total_coupon_shop", "max_total_buy_shop", \
        "avg_unused_coupon_shop", "avg_buy_without_coupon_shop", "avg_use_coupon_shop", "avg_total_coupon_shop", "avg_total_buy_shop", \
        "avg_unused_coupon_shop_nonzero", "avg_buy_without_coupon_shop_nonzero", "avg_use_coupon_shop_nonzero", "avg_total_coupon_nonzero", "avg_total_buy_nonzero", \
        "var_unused_coupon_shop", "var_buy_without_coupon_shop", "var_use_coupon_shop", "var_total_coupon_shop", "var_total_buy_shop", \
        "var_unused_coupon_shop_nonzero", "var_buy_without_coupon_shop_nonzero", "var_use_coupon_shop_nonzero", "var_total_coupon_shop_nonzero", "var_total_buy_shop_nonzero", \
        "mid_unused_coupon_shop_nonzero", "mid_buy_without_coupon_shop_nonzero", "mid_use_coupon_shop_nonzero", "mid_total_coupon_shop_nonzero", "mid_total_buy_shop_nonzero", \
        "min_unused_coupon_shop_nonzero", "min_buy_without_coupon_shop_nonzero", "min_use_coupon_shop_nonzero", "min_total_coupon_shop_nonzero", "min_total_buy_shop_nonzero"]
    user_table = Matrix(
        np.zeros((len(all_users), len(user_col_names))),
        user_col_names,
        col_formats=["%s"] * len(user_col_names))

    row_index = 0
    for uid in all_users:
        if row_index % 10000 == 0:
            print(row_index)

        user_acts = user_act_counts.get(uid)

        # One array per counter (one entry per shop) and its sum, both in
        # ``series_sources`` order.
        series = [np.array(user_acts[list_key])
                  for list_key, _ in series_sources]
        totals = [np.sum(values) for values in series]
        unused_coupons, buy_without_coupons, use_coupons = series[:3]
        (unused_coupon, buy_without_coupon, use_coupon,
         total_coupon, total_buy) = totals
        total_acts = unused_coupon + buy_without_coupon + use_coupon

        n_shops = len(unused_coupons)
        unused_coupon_shop_number = np.sum(unused_coupons > 0)
        buy_without_coupon_shop_number = np.sum(buy_without_coupons > 0)
        use_coupon_shop_number = np.sum(use_coupons > 0)

        row = [uid] + totals
        row += [
            divide(unused_coupon, total_acts),
            divide(buy_without_coupon, total_acts),
            divide(use_coupon, total_acts),
        ]
        row += [
            divide(use_coupon, total_coupon),
            divide(unused_coupon, total_coupon),
            divide(use_coupon, total_buy),
            divide(buy_without_coupon, total_buy),
        ]
        row += [
            unused_coupon_shop_number,
            buy_without_coupon_shop_number,
            use_coupon_shop_number,
            n_shops - use_coupon_shop_number,     # pure_unused_coupon_shop_number
            n_shops - unused_coupon_shop_number,  # pure_no_unused_coupon_shop_number
        ]
        # Max over the per-shop arrays.  max_total_buy_shop used to be
        # np.max(total_buy), i.e. the max of the scalar sum, which simply
        # repeated total_buy instead of the largest per-shop value.
        row += [np.max(values) for values in series]
        row += [np.mean(values) for values in series]
        row += [nonzero_stat(np.mean, values, total)
                for values, total in zip(series, totals)]
        row += [np.var(values) for values in series]
        row += [nonzero_stat(np.var, values, total)
                for values, total in zip(series, totals)]
        row += [nonzero_stat(np.median, values, total)
                for values, total in zip(series, totals)]
        row += [nonzero_stat(np.min, values, total)
                for values, total in zip(series, totals)]

        user_table.set_row(row_index, np.array(row))
        row_index += 1
    assert row_index == len(all_users)
    user_table.check_point("user_act_counts_month_{0}".format(month))
def calc_first_used(month):
    """Attach per-user coupon usage statistics to the offline table.

    Loads ``offline_train_test_<month>.csv`` from ``paths.ccf_path``, makes
    one pass to aggregate per-user counters over the rows that carry a
    coupon (``cid != 'null'``), then a second pass that writes the user's
    aggregates onto every row.  The enlarged table is check-pointed.

    Columns appended (all per user, identical on every row of that user):
      * ``user_coupon`` -- number of coupon rows.
      * ``user_coupon_used`` -- coupon rows whose ``date`` is not 'null'.
      * ``user_first_coupon`` -- number of distinct shops (``mid``) with at
        least one coupon row.
      * ``user_first_coupon_used`` -- of those shops, how many have a
        non-'null' ``date`` on the row with the smallest ``date_rec``.
      * ``user_coupon_used_ratio`` -- used / received, 0 when none received.
      * ``user_first_coupon_used_ratio`` -- first-used / first, 0 when the
        user has no coupon rows.

    Args:
        month: value substituted into the input file name and the
            check-point name.
    """
    offline = Matrix(
        np.genfromtxt(paths.ccf_path +
                      'offline_train_test_{0}.csv'.format(month),
                      delimiter=',',
                      dtype=str),
        ["uid", "mid", "cid", "dis_rate", "dist", "date_rec", "date"],
        ["%s" for i in xrange(7)])
    user_coupon = HashSet()
    user_total_used = HashSet()
    # uid -> HashSet(mid -> (earliest date_rec seen, 1 if that row was used))
    user_first_coupon = HashSet()
    for i in xrange(offline.ndata):
        if i % 10000 == 0:
            print(i)
        uid_str = offline.get_cell(i, "uid")
        mid_str = offline.get_cell(i, "mid")
        date_str = offline.get_cell(i, "date")
        date_rec_str = offline.get_cell(i, "date_rec")
        cid_str = offline.get_cell(i, "cid")
        if cid_str == 'null':
            continue
        user_coupon.add_one(uid_str)
        used = 1 if date_str != 'null' else 0
        if used:
            user_total_used.add_one(uid_str)

        # Store the per-user table explicitly instead of relying on
        # HashSet.get(key, default) to insert the default for us.
        if user_first_coupon.has(uid_str):
            first_by_shop = user_first_coupon.get(uid_str)
        else:
            first_by_shop = HashSet()
            user_first_coupon.set(uid_str, first_by_shop)
        # Keep the row with the smallest date_rec per shop; on ties the
        # first row encountered wins.  The comparison is a plain string
        # comparison -- presumably date_rec is a fixed-width date string,
        # TODO confirm.
        if (not first_by_shop.has(mid_str)
                or date_rec_str < first_by_shop.get(mid_str)[0]):
            first_by_shop.set(mid_str, (date_rec_str, used))

    col_names = [
        "user_coupon", "user_coupon_used", "user_first_coupon",
        "user_first_coupon_used", "user_coupon_used_ratio",
        "user_first_coupon_used_ratio"
    ]
    statistics = np.zeros((offline.ndata, len(col_names)))
    for i in xrange(offline.ndata):
        if i % 10000 == 0:
            print(i)
        uid_str = offline.get_cell(i, "uid")
        first_by_shop = user_first_coupon.get(uid_str, HashSet())
        user_first_counter = 0
        user_first_used_counter = 0
        for mid in first_by_shop.get_keys():
            user_first_counter += 1
            if first_by_shop.get(mid)[1] == 1:
                user_first_used_counter += 1
        user_coupon_counter = user_coupon.get(uid_str, 0)
        user_total_used_counter = user_total_used.get(uid_str, 0)
        if user_coupon_counter > 0:
            ratio = user_total_used_counter * 1.0 / user_coupon_counter
        else:
            ratio = 0
        if user_first_counter > 0:
            first_ratio = user_first_used_counter * 1.0 / user_first_counter
        else:
            # Previously left unassigned here: the first such row raised
            # UnboundLocalError and later ones reused another user's ratio.
            first_ratio = 0
        statistics[i, :] = [
            user_coupon_counter, user_total_used_counter, user_first_counter,
            user_first_used_counter, ratio, first_ratio
        ]
    offline.cat_col(statistics, col_names,
                    ["%s" for i in xrange(len(col_names))])
    # NOTE(review): "fisrt" is misspelled, but the name is kept as-is because
    # other code may load the check-point by this exact name -- confirm
    # before renaming.
    offline.check_point("fisrt_use_{0}".format(month))
Esempio n. 13
0
def calc_user_shop_month():
    """Attach user/shop/month coupon counters to the offline training table.

    Loads ``ccf_offline_stage1_train.csv`` from ``paths.ccf_path`` and makes
    three passes over it:

    1. Count, per (user, shop, month of ``date_rec``), the rows that have a
       ``date_rec``.
    2. Count per shop: all coupon rows (``shop_coupon``), those with a
       non-'null' ``date`` (``shop_coupon_used``), and the same two counts
       restricted to rows whose (user, shop, month) count is greater than
       one (``shop_month_coupon``, ``shop_month_coupon_used``).
    3. Write the counters onto every row and check-point the table.

    The ``user_shop_month`` column holds the (user, shop, month) count for
    the row's own ``date_rec`` month; rows without a ``date_rec`` get 0.
    """
    offline = Matrix(
        np.genfromtxt(paths.ccf_path + 'ccf_offline_stage1_train.csv',
                      delimiter=',',
                      dtype=str),
        ["uid", "mid", "cid", "dis_rate", "dist", "date_rec", "date"],
        ["%s" for i in xrange(7)])

    # uid -> HashSet(mid -> {month: number of coupon rows})
    user_shop_month = HashSet()
    shop_month_coupon = {}
    shop_month_coupon_used = {}
    shop_coupon = {}
    shop_coupon_used = {}

    # Pass 1: per (user, shop, month) coupon counts.
    for i in xrange(offline.ndata):
        if i % 100000 == 0:
            print(i)
        uid_str = offline.get_cell(i, "uid")
        mid_str = offline.get_cell(i, "mid")
        date_rec_str = offline.get_cell(i, "date_rec")
        if date_rec_str == 'null':
            continue
        month = get_month(date_rec_str)
        # Store the nested containers explicitly instead of relying on
        # HashSet.get(key, default) to insert the default for us.
        if user_shop_month.has(uid_str):
            shops = user_shop_month.get(uid_str)
        else:
            shops = HashSet()
            user_shop_month.set(uid_str, shops)
        if shops.has(mid_str):
            month_counts = shops.get(mid_str)
        else:
            month_counts = {}
            shops.set(mid_str, month_counts)
        month_counts[month] = month_counts.get(month, 0) + 1

    # Pass 2: per-shop totals; needs the completed counts from pass 1.
    for i in xrange(offline.ndata):
        if i % 100000 == 0:
            print(i)
        uid_str = offline.get_cell(i, "uid")
        mid_str = offline.get_cell(i, "mid")
        date_rec_str = offline.get_cell(i, "date_rec")
        date_str = offline.get_cell(i, "date")
        cid_str = offline.get_cell(i, "cid")
        if date_rec_str == 'null':
            continue
        # Data invariant: a row with a receive date must carry a coupon id.
        assert cid_str != 'null'
        month = get_month(date_rec_str)
        month_counts = user_shop_month.get(uid_str,
                                           HashSet()).get(mid_str, {})
        used = date_str != 'null'

        shop_coupon[mid_str] = shop_coupon.get(mid_str, 0) + 1
        if used:
            shop_coupon_used[mid_str] = shop_coupon_used.get(mid_str, 0) + 1

        # Only rows where the user got more than one coupon from this shop
        # in the same month feed the "shop_month" counters.
        if month_counts.get(month, 0) > 1:
            shop_month_coupon[mid_str] = \
                shop_month_coupon.get(mid_str, 0) + 1
            if used:
                shop_month_coupon_used[mid_str] = \
                    shop_month_coupon_used.get(mid_str, 0) + 1

    col_names = ["user_shop_month", "shop_coupon", "shop_coupon_used",
                 "shop_month_coupon", "shop_month_coupon_used"]
    statistics = np.zeros((offline.ndata, len(col_names)), dtype=float)

    # Pass 3: write the counters onto every row.
    for i in xrange(offline.ndata):
        if i % 100000 == 0:
            print(i)
        uid_str = offline.get_cell(i, "uid")
        mid_str = offline.get_cell(i, "mid")
        date_rec_str = offline.get_cell(i, "date_rec")
        if date_rec_str == 'null':
            # No coupon on this row, hence no month to look up.
            user_shop_month_count = 0
        else:
            # Use this row's own month.  The earlier code reused ``month``
            # left over from the previous loop, i.e. the month of the last
            # coupon row in the file, for every row.
            month_counts = user_shop_month.get(uid_str,
                                               HashSet()).get(mid_str, {})
            user_shop_month_count = month_counts.get(
                get_month(date_rec_str), 0)
        statistics[i, :] = np.array([
            user_shop_month_count,
            shop_coupon.get(mid_str, 0),
            shop_coupon_used.get(mid_str, 0),
            shop_month_coupon.get(mid_str, 0),
            shop_month_coupon_used.get(mid_str, 0),
        ])
    offline.cat_col(statistics, col_names,
                    ["%s" for i in xrange(len(col_names))])
    offline.check_point("shop_month_coupon")