Esempio n. 1
0
def gen_user_shop_tags(path_to_user_tags_csv, path_to_shop_tags_csv, matrix):
    """Join user-tag and shop-tag columns onto ``matrix``.

    Each CSV is parsed as comma-separated floats and wrapped in a
    ``HashSet``. The user table is joined on ``"uid"`` with columns named
    ``u_tag_<i>``, the shop table on ``"mid"`` with columns named
    ``m_tag_<i>``; ``i`` runs over ``row_dim`` of the respective table and
    every column uses the ``"%s"`` format.
    """
    user_tags = HashSet(
        np.genfromtxt(path_to_user_tags_csv, delimiter=',', dtype=float))
    shop_tags = HashSet(
        np.genfromtxt(path_to_shop_tags_csv, delimiter=',', dtype=float))

    # Same join, once per side: (key column, column-name prefix, table).
    for key, prefix, table in (("uid", "u_tag_", user_tags),
                               ("mid", "m_tag_", shop_tags)):
        col_names = [prefix + str(idx) for idx in xrange(table.row_dim)]
        col_formats = ["%s" for _ in xrange(table.row_dim)]
        matrix.join(key, col_names, table, col_formats)
Esempio n. 2
0
def gen_user_shop_features(path_to_user_features_csv,
                           path_to_shop_features_csv, matrix):
    """Attach user/shop feature vectors and their product score to ``matrix``.

    Both CSVs are parsed as comma-separated floats and wrapped in a
    ``HashSet``. A ``"score2"`` column is produced through
    ``matrix.join_op`` on ``("uid", "mid")`` using the sum of the
    element-wise product of the two vectors. The raw vectors are then
    joined as ``user_cofi_features_auto<i>`` (on ``"uid"``) and
    ``shop_cofi_features_auto<i>`` (on ``"mid"``).
    """

    def elementwise_product_sum(x, y):
        return np.sum(np.array(x) * np.array(y))

    user_vectors = HashSet(
        np.genfromtxt(path_to_user_features_csv, delimiter=',', dtype=float))
    shop_vectors = HashSet(
        np.genfromtxt(path_to_shop_features_csv, delimiter=',', dtype=float))

    matrix.join_op("uid", "mid", "score2", user_vectors, shop_vectors,
                   elementwise_product_sum, "%s")

    user_cols = [
        "user_cofi_features_auto" + str(idx)
        for idx in xrange(user_vectors.row_dim)
    ]
    user_fmts = ["%s" for _ in xrange(user_vectors.row_dim)]
    matrix.join("uid", user_cols, user_vectors, user_fmts)

    shop_cols = [
        "shop_cofi_features_auto" + str(idx)
        for idx in xrange(shop_vectors.row_dim)
    ]
    shop_fmts = ["%s" for _ in xrange(shop_vectors.row_dim)]
    matrix.join("mid", shop_cols, shop_vectors, shop_fmts)
Esempio n. 3
0
def gen_user_buy_shop(since, to, month):
    """Load the offline/online logs and the feature matrix for ``month``.

    NOTE(review): this snippet is truncated -- the loop at the end has no
    body in the visible source, so what is accumulated into
    ``user_buy_shop`` cannot be determined from here.

    ``since`` and ``to`` delimit the history window used in the file names;
    ``to`` must equal ``month - 1``.
    """
    # The history window must end in the month right before the target one.
    assert to == month - 1
    offline = Matrix(np.genfromtxt('offline_{0}_month_{1}{2}.csv'.format(to - since + 1, since, to), delimiter=',', dtype=str), ["uid", "mid", "cid", "dis_rate", "dist", "date_rec", "date"], ["%s" for i in xrange(7)])
    # NOTE(review): this reads the same 'offline_...' file as above but labels
    # the columns with the online schema -- looks like a copy-paste slip;
    # confirm whether an 'online_...' file was intended.
    online = Matrix(np.genfromtxt('offline_{0}_month_{1}{2}.csv'.format(to - since + 1, since, to), delimiter=',', dtype=str), ["uid", "mid", "act", "cid", "dis_rate", "date_rec", "date"], ["%s" for i in xrange(7)])

    # The first row of the col_names file supplies the column names of X.
    col_names = np.genfromtxt(paths.my_path + 'col_names_{0}_{1}{2}.csv'.format(month, since, to), delimiter=',', dtype=str)
    # Not used in the visible part of this function.
    col_number = col_names.shape[1]
    X = Matrix(np.genfromtxt(paths.my_path + 'X_{0}_{1}{2}.csv'.format(month, since, to), delimiter=',', dtype=str), list(col_names[0, :]), ["%s" for i in xrange(col_names.shape[1])])

    user_buy_shop = HashSet()
    for i in xrange(offline.ndata):
        
Esempio n. 4
0
def gen_no_penalty_user_shop_features(X, month):
    """Attach the "no penalty" user/shop feature vectors and score to ``X``.

    Reads ``u_features_no_penalty_10_<month>.csv`` and
    ``i_features_no_penalty_10_<month>.csv`` from ``paths.my_path`` as
    comma-separated floats. A ``"score1"`` column is produced through
    ``X.join_op`` on ``("uid", "mid")`` using the sum of the element-wise
    product of the two vectors; the vectors themselves are then joined as
    ``user_cofi_features_no_penalty<i>`` and
    ``shop_cofi_features_no_penalty<i>``.
    """

    def elementwise_product_sum(x, y):
        return np.sum(np.array(x) * np.array(y))

    user_csv = paths.my_path + 'u_features_no_penalty_10_{0}.csv'.format(month)
    user_vectors = HashSet(np.genfromtxt(user_csv, delimiter=',', dtype=float))
    shop_csv = paths.my_path + 'i_features_no_penalty_10_{0}.csv'.format(month)
    shop_vectors = HashSet(np.genfromtxt(shop_csv, delimiter=',', dtype=float))

    X.join_op("uid", "mid", "score1", user_vectors, shop_vectors,
              elementwise_product_sum, "%s")

    user_cols = [
        "user_cofi_features_no_penalty" + str(idx)
        for idx in xrange(user_vectors.row_dim)
    ]
    user_fmts = ["%s" for _ in xrange(user_vectors.row_dim)]
    X.join("uid", user_cols, user_vectors, user_fmts)

    shop_cols = [
        "shop_cofi_features_no_penalty" + str(idx)
        for idx in xrange(shop_vectors.row_dim)
    ]
    shop_fmts = ["%s" for _ in xrange(shop_vectors.row_dim)]
    X.join("mid", shop_cols, shop_vectors, shop_fmts)
Esempio n. 5
0
def gen_user_get_coupon(offline_source, online_source, X, month):
    """Add ``user_get_coupon`` and ``user_use_coupon_freq`` columns to ``X``.

    The offline and online logs are read from ``paths.ccf_path``. Every row
    whose ``cid`` cell is not ``'null'`` adds one to that user's counter;
    for online rows the ``act`` cell is additionally asserted not to be
    ``'0'``. The counter is joined on ``"uid"`` (default ``0.0``) and then
    ``user_buy_with_coupon / user_get_coupon`` is generated as
    ``user_use_coupon_freq``.

    ``month`` is accepted but not used by this function.
    """
    offline_raw = np.genfromtxt(paths.ccf_path + offline_source,
                                delimiter=',',
                                dtype=str)
    offline = Matrix(
        offline_raw,
        ["uid", "mid", "cid", "dis_rate", "dist", "date_rec", "date"],
        ["%s" for _ in xrange(7)])
    online_raw = np.genfromtxt(paths.ccf_path + online_source,
                               delimiter=',',
                               dtype=str)
    online = Matrix(
        online_raw,
        ["uid", "mid", "act", "cid", "dis_rate", "date_rec", "date"],
        ["%s" for _ in xrange(7)])

    user_get_coupon = HashSet()

    for row in xrange(offline.ndata):
        user = offline.get_cell(row, "uid")
        coupon = offline.get_cell(row, "cid")
        if coupon == 'null':
            continue
        user_get_coupon.add_one(user)

    for row in xrange(online.ndata):
        user = online.get_cell(row, "uid")
        coupon = online.get_cell(row, "cid")
        action = online.get_cell(row, "act")
        if coupon == 'null':
            continue
        assert action != '0'
        user_get_coupon.add_one(user)

    X.join("uid", ["user_get_coupon"], user_get_coupon, ["%s"], dft=0.0)

    def divide(x, y):
        # Neither operand may still carry the -9999 placeholder value.
        numerator = float(x)
        assert numerator != -9999
        denominator = float(y)
        assert denominator != -9999
        assert numerator <= denominator, "{0} {1}".format(x, y)
        if denominator == 0:
            return 0.0
        return numerator * 1.0 / denominator

    X.gen_arith_feature("user_buy_with_coupon",
                        "user_get_coupon",
                        "user_use_coupon_freq",
                        divide,
                        "%s",
                        dft=0.0)
Esempio n. 6
0
def gen_merchant_share(X, month):
    """Add per-merchant distinct-buyer counts to ``X`` and checkpoint it.

    Reads ``offline_train_<month>.csv`` and ``online_train_<month>.csv``
    from ``paths.ccf_path``. An offline row counts as a purchase when its
    ``date`` cell is not ``'null'``; an online row when its ``act`` cell is
    ``'1'``. For every merchant, each user is counted once as a buyer and,
    if the purchase row also has a non-``'null'`` ``cid``, once as a coupon
    user. The two counters are joined on ``"mid"`` as ``merchant_user_buy``
    and ``merchant_user_use_coupon`` (default ``0.0``), then
    ``X.check_point(month)`` is called.
    """
    offline = Matrix(
        np.genfromtxt(paths.ccf_path + 'offline_train_{0}.csv'.format(month),
                      delimiter=',',
                      dtype=str),
        ["uid", "mid", "cid", "dis_rate", "dist", "date_rec", "date"],
        ["%s" for _ in xrange(7)])
    online = Matrix(
        np.genfromtxt(paths.ccf_path + 'online_train_{0}.csv'.format(month),
                      delimiter=',',
                      dtype=str),
        ["uid", "mid", "act", "cid", "dis_rate", "date_rec", "date"],
        ["%s" for _ in xrange(7)])

    buyers_by_shop = HashSet()
    coupon_users_by_shop = HashSet()
    buyer_tally = HashSet()
    coupon_user_tally = HashSet()

    def note_first_sighting(seen_by_shop, tally, shop, user):
        # Bump the shop's tally only the first time this user shows up.
        if not seen_by_shop.has(shop):
            seen_by_shop.set(shop, HashSet())
        seen = seen_by_shop.get(shop)
        if not seen.has(user):
            seen.add_one(user)
            tally.add_one(shop)

    for row in xrange(offline.ndata):
        if row % 100000 == 0:
            print(row)
        shop = offline.get_cell(row, "mid")
        user = offline.get_cell(row, "uid")
        bought_on = offline.get_cell(row, "date")
        coupon = offline.get_cell(row, "cid")
        if bought_on == 'null':
            continue
        note_first_sighting(buyers_by_shop, buyer_tally, shop, user)
        if coupon != 'null':
            note_first_sighting(coupon_users_by_shop, coupon_user_tally,
                                shop, user)

    for row in xrange(online.ndata):
        if row % 100000 == 0:
            print(row)
        shop = online.get_cell(row, "mid")
        user = online.get_cell(row, "uid")
        action = online.get_cell(row, "act")
        coupon = online.get_cell(row, "cid")
        if action != '1':
            continue
        note_first_sighting(buyers_by_shop, buyer_tally, shop, user)
        if coupon != 'null':
            note_first_sighting(coupon_users_by_shop, coupon_user_tally,
                                shop, user)

    X.join("mid", ["merchant_user_buy"], buyer_tally, ["%s"], dft=0.0)
    X.join("mid", ["merchant_user_use_coupon"],
           coupon_user_tally, ["%s"],
           dft=0.0)
    X.check_point(month)
Esempio n. 7
0
def gen_merchant_buy(X, month):
    """Add per-merchant purchase/coupon counters and ratios to ``X``.

    Reads ``offline_train_<month>.csv`` and ``online_train_<month>.csv``
    from ``paths.ccf_path``. Per merchant it counts purchase rows (offline:
    ``date`` not ``'null'``; online: ``act == '1'``), purchase rows that
    also carry a coupon (``cid`` not ``'null'``), and all rows carrying a
    coupon. The counters are joined on ``"mid"``, two ratio columns are
    generated from them, and ``X.check_point(month)`` is called.
    """
    offline = Matrix(
        np.genfromtxt(paths.ccf_path + 'offline_train_{0}.csv'.format(month),
                      delimiter=',',
                      dtype=str),
        ["uid", "mid", "cid", "dis_rate", "dist", "date_rec", "date"],
        ["%s" for _ in xrange(7)])
    online = Matrix(
        np.genfromtxt(paths.ccf_path + 'online_train_{0}.csv'.format(month),
                      delimiter=',',
                      dtype=str),
        ["uid", "mid", "act", "cid", "dis_rate", "date_rec", "date"],
        ["%s" for _ in xrange(7)])

    purchases = HashSet()
    coupon_purchases = HashSet()
    coupons_handed_out = HashSet()

    for row in xrange(offline.ndata):
        if row % 100000 == 0:
            print(row)
        shop = offline.get_cell(row, "mid")
        bought_on = offline.get_cell(row, "date")
        coupon = offline.get_cell(row, "cid")
        did_buy = bought_on != 'null'
        has_coupon = coupon != 'null'
        if did_buy:
            purchases.add_one(shop)
        if did_buy and has_coupon:
            coupon_purchases.add_one(shop)
        if has_coupon:
            coupons_handed_out.add_one(shop)

    for row in xrange(online.ndata):
        if row % 100000 == 0:
            print(row)
        shop = online.get_cell(row, "mid")
        action = online.get_cell(row, "act")
        coupon = online.get_cell(row, "cid")
        did_buy = action == '1'
        has_coupon = coupon != 'null'
        if did_buy:
            purchases.add_one(shop)
        if did_buy and has_coupon:
            coupon_purchases.add_one(shop)
        if has_coupon:
            assert action != '0'
            coupons_handed_out.add_one(shop)

    X.join("mid", ["merchant_buy"], purchases, ["%s"], 0.0)
    X.join("mid", ["merchant_buy_with_coupon"], coupon_purchases,
           ["%s"], 0.0)
    X.join("mid", ["merchant_distribute_coupon"], coupons_handed_out,
           ["%s"], 0.0)

    def divide(x, y):
        numerator = float(x)
        denominator = float(y)
        assert numerator <= denominator
        if denominator == 0:
            return 0.0
        return numerator * 1.0 / denominator

    for total_col, ratio_col in (
            ("merchant_buy", "merchant_buy_with_coupon_ratio"),
            ("merchant_distribute_coupon", "merchant_coupon_ratio")):
        X.gen_arith_feature("merchant_buy_with_coupon",
                            total_col,
                            ratio_col,
                            divide,
                            "%s",
                            dft=0.0)
    X.check_point(month)
Esempio n. 8
0
def gen_user_recent_behavior(X, month):
    """Add twelve "recent behaviour" columns to ``X`` and checkpoint it.

    Reads ``offline_train_<month>.csv`` and ``online_train_<month>.csv``
    from ``paths.ccf_path`` and keeps, per calendar month 1..6 and per
    user, a 3-slot counter:

    * slot 0 -- rows without a ``date`` (month taken from ``date_rec``),
    * slot 1 -- rows with a ``date`` and a ``'null'`` ``cid``,
    * slot 2 -- rows with a ``date`` and a non-``'null'`` ``cid``.

    Online rows with ``act == '0'`` are skipped. For each row of ``X`` the
    counters of the four months preceding the row's ``date_rec`` month are
    concatenated; a month that is out of range (or a ``'null'``
    ``date_rec``) yields ``-9999`` in all three slots, a user without
    records in an in-range month yields zeros.
    """
    offline = Matrix(
        np.genfromtxt(paths.ccf_path + 'offline_train_{0}.csv'.format(month),
                      delimiter=',',
                      dtype=str),
        ["uid", "mid", "cid", "dis_rate", "dist", "date_rec", "date"],
        ["%s" for _ in xrange(7)])
    online = Matrix(
        np.genfromtxt(paths.ccf_path + 'online_train_{0}.csv'.format(month),
                      delimiter=',',
                      dtype=str),
        ["uid", "mid", "act", "cid", "dis_rate", "date_rec", "date"],
        ["%s" for _ in xrange(7)])

    behaviors = {}
    for month_no in xrange(1, 7):
        behaviors[month_no] = HashSet()

    def bump(when, user, slot):
        # Add one to `slot` of the user's counter for the month of `when`.
        monthly = behaviors[get_month(when)]
        if not monthly.has(user):
            monthly.set(user, np.zeros(3))
        monthly.get(user)[slot] += 1

    for row in xrange(offline.ndata):
        if row % 100000 == 0:
            print(row)
        user = offline.get_cell(row, "uid")
        bought_on = offline.get_cell(row, "date")
        coupon = offline.get_cell(row, "cid")
        received_on = offline.get_cell(row, "date_rec")
        if bought_on == 'null':
            bump(received_on, user, 0)
        elif coupon != 'null':
            bump(bought_on, user, 2)
        else:
            bump(bought_on, user, 1)

    for row in xrange(online.ndata):
        if row % 100000 == 0:
            print(row)
        user = online.get_cell(row, "uid")
        bought_on = online.get_cell(row, "date")
        coupon = online.get_cell(row, "cid")
        received_on = online.get_cell(row, "date_rec")
        action = online.get_cell(row, "act")
        if action == '0':
            continue
        if bought_on == 'null':
            assert action == '2'
            bump(received_on, user, 0)
        else:
            assert action == '1'
            bump(bought_on, user, 2 if coupon != 'null' else 1)

    def gen_user_recent_behavior_func(row):
        received_on = row['date_rec']
        user = row['uid']
        row_month = get_month(received_on)
        pieces = [np.zeros(0)]
        for lag in xrange(1, 5):
            past_month = row_month - lag
            if not (past_month > 0 and received_on != 'null'):
                # No usable history for this lag.
                vector = np.array([-9999, -9999, -9999])
            elif behaviors[past_month].has(user):
                vector = behaviors[past_month].get(user)
            else:
                vector = np.array([0, 0, 0])
            pieces.append(vector)
        return np.hstack(pieces)

    names = [
        template.format(lag)
        for lag in xrange(4)
        for template in ("recent_user_unuse{0}", "recent_user_buy{0}",
                         "recent_user_use{0}")
    ]
    X.gen_features(names, gen_user_recent_behavior_func,
                   ["%s" for _ in xrange(12)])
    X.check_point(month)
Esempio n. 9
0
def gen_user_buy_in_shop(matrix_offline, matrix_online, X):
    """Add ``user_buy_in_shop`` and ``user_buy_in_shop_ratio`` to ``X``.

    Counts, per user and per shop, the purchase rows found in the two logs
    (offline: ``date`` not ``'null'``; online: ``act == '1'``). The nested
    counter is joined on ``("uid", "mid")`` with default ``0.0`` and then
    divided by the existing ``user_buy`` column to produce the ratio.
    """
    user_buy_in_shop = HashSet(default=matrix_offline.default)

    def count_purchase(user, shop):
        if not user_buy_in_shop.has(user):
            user_buy_in_shop.set(user, HashSet())
        user_buy_in_shop.get(user).add_one(shop)

    for row in xrange(matrix_offline.ndata):
        bought_on = matrix_offline.get_cell(row, 'date')
        user = matrix_offline.get_cell(row, 'uid')
        shop = matrix_offline.get_cell(row, "mid")
        if bought_on == 'null':
            continue
        count_purchase(user, shop)

    for row in xrange(matrix_online.ndata):
        action = matrix_online.get_cell(row, 'act')
        user = matrix_online.get_cell(row, 'uid')
        shop = matrix_online.get_cell(row, "mid")
        if action != '1':
            continue
        count_purchase(user, shop)

    X.join_by_double_key("uid",
                         "mid",
                         "user_buy_in_shop",
                         user_buy_in_shop,
                         "%s",
                         dft=0.0)

    def divide(x, y):
        numerator = float(x)
        denominator = float(y)
        assert numerator <= denominator
        if denominator == 0:
            return 0
        return numerator * 1.0 / denominator

    X.gen_arith_feature("user_buy_in_shop",
                        "user_buy",
                        "user_buy_in_shop_ratio",
                        divide,
                        fmt="%s",
                        dft=0.0)
Esempio n. 10
0
def gen_user_buy_with_coupon(matrix_offline, matrix_online, X):
    """Add per-user purchase counters and their ratio to ``X``.

    Counts, per user, the purchase rows found in the two logs (offline:
    ``date`` not ``'null'``; online: ``act == '1'``) and how many of those
    rows also carry a coupon (``cid`` not ``'null'``). The two counters and
    their quotient are merged and joined on ``"uid"`` as ``user_buy``,
    ``user_buy_with_coupon`` and ``user_buy_with_coupon_freq`` (default
    ``0.0``).
    """
    user_buy = HashSet(default=matrix_offline.default)
    user_buy_with_coupon = HashSet(default=matrix_offline.default)
    for i in xrange(matrix_offline.ndata):
        cid_str = matrix_offline.get_cell(i, 'cid')
        date_str = matrix_offline.get_cell(i, 'date')
        uid_str = matrix_offline.get_cell(i, 'uid')
        if date_str != 'null':
            user_buy.add_one(uid_str)
            if cid_str != 'null':
                user_buy_with_coupon.add_one(uid_str)
    for i in xrange(matrix_online.ndata):
        act_str = matrix_online.get_cell(i, 'act')
        cid_str = matrix_online.get_cell(i, 'cid')
        uid_str = matrix_online.get_cell(i, 'uid')
        if act_str == '1':
            user_buy.add_one(uid_str)
            if cid_str != 'null':
                user_buy_with_coupon.add_one(uid_str)
    # x is the purchase count, y the coupon-purchase count of the same user.
    user_buy_with_coupon_freq = user_buy.merge_op(
        user_buy_with_coupon, lambda x, y: float(y) * 1.0 / float(x), dft=0.0)
    merged = user_buy.merge(user_buy_with_coupon,
                            dft=0.0).merge(user_buy_with_coupon_freq,
                                           dft=0.0)
    # Pass the formats as a list, like every other join() call in this file:
    # a generator has no len() and is exhausted after a single pass.
    X.join("uid",
           ["user_buy", "user_buy_with_coupon", "user_buy_with_coupon_freq"],
           merged, ["%s" for i in xrange(3)],
           dft=0.0)
Esempio n. 11
0
def gen_user_get_shop_coupon(offline_source, online_source, X):
    """Add ``user_get_shop_coupon`` and ``user_use_shop_coupon_freq`` to ``X``.

    The offline and online logs are read from ``paths.ccf_path``. Per user
    and per shop it counts the rows carrying a coupon (``cid`` not
    ``'null'``; online rows additionally need ``act != '0'``). The nested
    counter is joined on ``("uid", "mid")`` with default ``0.0``; the
    existing ``user_buy_with_coupon_in_shop`` column is then divided by it.
    """
    offline_raw = np.genfromtxt(paths.ccf_path + offline_source,
                                delimiter=',',
                                dtype=str)
    offline = Matrix(
        offline_raw,
        ["uid", "mid", "cid", "dis_rate", "dist", "date_rec", "date"],
        ["%s" for _ in xrange(7)])
    online_raw = np.genfromtxt(paths.ccf_path + online_source,
                               delimiter=',',
                               dtype=str)
    online = Matrix(
        online_raw,
        ["uid", "mid", "act", "cid", "dis_rate", "date_rec", "date"],
        ["%s" for _ in xrange(7)])

    user_get_shop_coupon = HashSet()

    def count_coupon(user, shop):
        if not user_get_shop_coupon.has(user):
            user_get_shop_coupon.set(user, HashSet())
        user_get_shop_coupon.get(user).add_one(shop)

    for row in xrange(offline.ndata):
        user = offline.get_cell(row, "uid")
        coupon = offline.get_cell(row, "cid")
        shop = offline.get_cell(row, "mid")
        if coupon == 'null':
            continue
        count_coupon(user, shop)

    for row in xrange(online.ndata):
        user = online.get_cell(row, "uid")
        coupon = online.get_cell(row, "cid")
        shop = online.get_cell(row, "mid")
        action = online.get_cell(row, "act")
        if coupon == 'null' or action == '0':
            continue
        count_coupon(user, shop)

    X.join_by_double_key("uid",
                         "mid",
                         "user_get_shop_coupon",
                         user_get_shop_coupon,
                         "%s",
                         dft=0.0)

    def divide(x, y):
        numerator = float(x)
        denominator = float(y)
        assert numerator <= denominator
        if denominator == 0:
            return 0.0
        return numerator * 1.0 / denominator

    X.gen_arith_feature("user_buy_with_coupon_in_shop",
                        "user_get_shop_coupon",
                        "user_use_shop_coupon_freq",
                        divide,
                        "%s",
                        dft=0.0)
Esempio n. 12
0
def gen_user_buy_coupon_in_shop(matrix_offline, matrix_online, X):
    """Add per-(user, shop) with/without-coupon purchase features to ``X``.

    Purchase rows (offline: ``date`` not ``'null'``; online:
    ``act == '1'``) are counted per user and per shop, split by whether the
    row's ``cid`` is ``'null'``. Both nested counters are joined on
    ``("uid", "mid")`` with default ``0.0`` and each is divided by the
    existing ``user_buy_in_shop`` column to produce a ratio column.
    """
    with_coupon = HashSet()
    without_coupon = HashSet()

    def count_purchase(user, shop, coupon):
        # Both per-user tables are created together, keyed off the first one.
        if not with_coupon.has(user):
            with_coupon.set(user, HashSet())
            without_coupon.set(user, HashSet())
        target = with_coupon if coupon != 'null' else without_coupon
        target.get(user).add_one(shop)

    for row in xrange(matrix_offline.ndata):
        user = matrix_offline.get_cell(row, "uid")
        coupon = matrix_offline.get_cell(row, "cid")
        bought_on = matrix_offline.get_cell(row, "date")
        shop = matrix_offline.get_cell(row, "mid")
        if bought_on == 'null':
            continue
        count_purchase(user, shop, coupon)

    for row in xrange(matrix_online.ndata):
        user = matrix_online.get_cell(row, "uid")
        coupon = matrix_online.get_cell(row, "cid")
        shop = matrix_online.get_cell(row, "mid")
        action = matrix_online.get_cell(row, "act")
        if action != '1':
            continue
        count_purchase(user, shop, coupon)

    X.join_by_double_key("uid",
                         "mid",
                         "user_buy_with_coupon_in_shop",
                         with_coupon,
                         "%s",
                         dft=0.0)
    X.join_by_double_key("uid",
                         "mid",
                         "user_buy_without_coupon_in_shop",
                         without_coupon,
                         "%s",
                         dft=0.0)

    def divide(x, y):
        numerator = float(x)
        denominator = float(y)
        assert numerator <= denominator
        if denominator == 0:
            return 0
        return numerator * 1.0 / denominator

    for count_col in ("user_buy_with_coupon_in_shop",
                      "user_buy_without_coupon_in_shop"):
        X.gen_arith_feature(count_col,
                            "user_buy_in_shop",
                            count_col + "_ratio",
                            divide,
                            "%s",
                            dft=0.0)
Esempio n. 13
0
def gen_coupon_gap(X, month):
    """Append coupon-receipt "gap" features to ``X`` and checkpoint it.

    Receipt dates (``date_rec`` cells that are not ``'null'``) are collected
    per (user, shop) and per user from ``X`` itself and, when they exist,
    from ``offline_1_month<month-1>.csv`` (if ``month > 1``) and
    ``offline_1_month<month+1>.csv`` (if ``month < 7``) under
    ``paths.ccf_path``. The lists are sorted and, for every row of ``X``,
    seven columns are computed with ``days_dis``:

    * ``prev_gap`` / ``next_gap`` -- distance to the neighbouring receipts
      in the (user, shop) list, located by date order;
    * ``user_prev_gap`` / ``user_next_gap`` -- the same on the per-user list;
    * ``prev_gap_prev`` / ``next_gap_prev`` -- the same on the (user, shop)
      list, located by the first entry equal to the row's date;
    * ``in_between`` -- 1 unless one of the (user, shop) neighbours above
      was missing.

    A missing neighbour yields a gap of 0. The result is concatenated to
    ``X`` and saved via ``X.check_point("X_<month>_1<month-1>_gap")``.
    """
    get_coupon_history = HashSet()
    get_coupon_history_user = HashSet()

    def collect_receipts(source):
        # NOTE(review): relies on HashSet.get(key, default) storing the
        # default when the key is missing (setdefault-like); otherwise the
        # appends below would be lost -- confirm against HashSet.
        for i in xrange(source.ndata):
            if i % 100000 == 0:
                print(i)
            uid_str = source.get_cell(i, "uid")
            mid_str = source.get_cell(i, "mid")
            date_rec_str = source.get_cell(i, "date_rec")
            if date_rec_str != 'null':
                history_list = get_coupon_history.get(
                    uid_str, HashSet()).get(mid_str, [])
                history_list.append(date_rec_str)
                user_history_list = get_coupon_history_user.get(uid_str, [])
                user_history_list.append(date_rec_str)

    def load_offline_month(which):
        return Matrix(
            np.genfromtxt(paths.ccf_path +
                          'offline_1_month{0}.csv'.format(which),
                          delimiter=',',
                          dtype=str),
            ["uid", "mid", "cid", "dis_rate", "dist", "date_rec", "date"],
            ["%s" for i in xrange(7)])

    def scan_by_order(dates, pivot):
        # Count entries passed before reaching the last one, or the one
        # whose successor is later than `pivot`; starts at -1.
        k = -1
        for j in xrange(len(dates)):
            if j == len(dates) - 1 or dates[j + 1] > pivot:
                break
            k += 1
        return k

    def scan_by_equality(dates, pivot):
        # Count entries before the first one equal to `pivot`; starts at -1.
        k = -1
        for j in xrange(len(dates)):
            if dates[j] == pivot:
                break
            k += 1
        return k

    collect_receipts(X)
    if month > 1:
        collect_receipts(load_offline_month(month - 1))
    if month < 7:
        collect_receipts(load_offline_month(month + 1))

    # Replace every list touched by a row of X with its sorted version.
    for i in xrange(X.ndata):
        if i % 100000 == 0:
            print(i)
        uid_str = X.get_cell(i, "uid")
        mid_str = X.get_cell(i, "mid")
        history_list = get_coupon_history.get(uid_str,
                                              HashSet()).get(mid_str, [])
        get_coupon_history.get(uid_str).set(mid_str, np.sort(history_list))
        user_history_list = get_coupon_history_user.get(uid_str, [])
        get_coupon_history_user.set(uid_str, np.sort(user_history_list))

    col_names = [
        "prev_gap", "next_gap", "user_prev_gap", "user_next_gap", "in_between",
        "prev_gap_prev", "next_gap_prev"
    ]
    gaps = np.zeros((X.ndata, len(col_names)))

    # NOTE(review): the `k > 0` tests below skip the case k == 0, where
    # index k is still a valid predecessor; this looks like an off-by-one
    # but is left unchanged because it alters existing feature values.
    for i in xrange(X.ndata):
        if i % 100000 == 0:
            print(i)
        uid_str = X.get_cell(i, "uid")
        mid_str = X.get_cell(i, "mid")
        date_rec_str = X.get_cell(i, "date_rec")
        history_list = get_coupon_history.get(uid_str).get(mid_str)
        user_history_list = get_coupon_history_user.get(uid_str)

        # Neighbours in the (user, shop) list, located by date order.
        k = scan_by_order(history_list, date_rec_str)
        in_between = 1
        if k > 0:
            prev_gap = days_dis(history_list[k], date_rec_str)
        else:
            prev_gap = 0
            in_between = 0
        if k + 2 < len(history_list):
            next_gap = days_dis(date_rec_str, history_list[k + 2])
        else:
            next_gap = 0
            in_between = 0

        # Neighbours in the per-user list. The scan must run on
        # user_history_list itself; the previous code tested history_list
        # here and then used the resulting index on user_history_list.
        k = scan_by_order(user_history_list, date_rec_str)
        if k > 0:
            user_prev_gap = days_dis(user_history_list[k], date_rec_str)
        else:
            user_prev_gap = 0
        if k + 2 < len(user_history_list):
            user_next_gap = days_dis(date_rec_str, user_history_list[k + 2])
        else:
            user_next_gap = 0

        # Neighbours in the (user, shop) list, located by equality.
        k = scan_by_equality(history_list, date_rec_str)
        if k > 0:
            prev_gap_prev = days_dis(history_list[k], date_rec_str)
        else:
            prev_gap_prev = 0
        if k + 2 < len(history_list):
            next_gap_prev = days_dis(date_rec_str, history_list[k + 2])
        else:
            next_gap_prev = 0
            in_between = 0

        gaps[i, :] = np.array([
            prev_gap, next_gap, user_prev_gap, user_next_gap, in_between,
            prev_gap_prev, next_gap_prev
        ])
    X.cat_col(gaps, col_names, ["%s" for i in xrange(len(col_names))])
    X.check_point("X_{0}_{1}{2}_gap".format(month, 1, month - 1))
Esempio n. 14
0
def gen_user_shop_month(X, month):
    """Append per-(user, shop) receipt counts for ``month`` to ``X``.

    Every row of ``X`` whose ``date_rec`` is not ``'null'`` adds one to a
    counter keyed by user, shop and the month returned by
    ``get_month(date_rec)``. Two columns are then appended to ``X``:

    * ``user_shop_month_11_2`` -- the row's (user, shop) count for ``month``;
    * ``is_shop_coupon_11_2`` -- 1 when that count is greater than 1, else 0.

    Finally ``X.check_point("X_<month>_1<month-1>_shop_month")`` is called.
    """
    user_shop_month = HashSet()
    # NOTE(review): relies on HashSet.get(key, default) storing the default
    # when the key is missing (setdefault-like); otherwise the increments
    # below would be lost -- confirm against HashSet.
    for i in xrange(X.ndata):
        if i % 100000 == 0:
            print(i)
        uid_str = X.get_cell(i, "uid")
        mid_str = X.get_cell(i, "mid")
        date_rec_str = X.get_cell(i, "date_rec")
        if date_rec_str == 'null':
            continue
        # Use a local name: rebinding `month` here used to clobber the
        # argument, so the lookup and the checkpoint name further down
        # ended up using the month of the last row processed.
        rec_month = get_month(date_rec_str)
        user_shop_month_set = user_shop_month.get(uid_str,
                                                  HashSet()).get(mid_str, {})
        user_shop_month_set[rec_month] = (
            user_shop_month_set.get(rec_month, 0) + 1)

    col_names = ["user_shop_month_11_2", "is_shop_coupon_11_2"]
    statistics = np.zeros((X.ndata, len(col_names)), dtype=float)
    # NOTE(review): counts are looked up for the `month` argument; if X can
    # hold rows from other months, a per-row month may be what is wanted.
    for i in xrange(X.ndata):
        if i % 100000 == 0:
            print(i)
        uid_str = X.get_cell(i, "uid")
        mid_str = X.get_cell(i, "mid")
        user_shop_month_set = user_shop_month.get(uid_str,
                                                  HashSet()).get(mid_str, {})
        received = user_shop_month_set.get(month, 0)
        is_shop_coupon = 1 if received > 1 else 0
        statistics[i, :] = np.array([received, is_shop_coupon])
    X.cat_col(statistics, col_names, ["%s" for i in xrange(len(col_names))])
    X.check_point("X_{0}_{1}{2}_shop_month".format(month, 1, month - 1))