Esempio n. 1
0
def main():
    """Create the "advance" order-history and action feature sets.

    Loads the orderFuture train/test frames, the cleaned order-history files
    and the raw action logs, then builds and saves each feature set that
    ``data_utils.is_feature_created`` reports as not yet created.
    """
    base_dir = Configure.base_path
    cleaned_dir = Configure.cleaned_path

    # Orders to be predicted (raw train and test sets).
    train = pd.read_csv(base_dir + 'train/orderFuture_train.csv',
                        encoding='utf8')
    test = pd.read_csv(base_dir + 'test/orderFuture_test.csv',
                       encoding='utf8')
    history_train = pd.read_csv(cleaned_dir + 'cleaned_orderHistory_train.csv',
                                encoding='utf8')
    history_test = pd.read_csv(cleaned_dir + 'cleaned_orderHistory_test.csv',
                               encoding='utf8')
    actions_train = pd.read_csv(base_dir + 'train/action_train.csv')
    actions_test = pd.read_csv(base_dir + 'test/action_test.csv')

    actions_train = build_time_features(actions_train)
    actions_test = build_time_features(actions_test)

    # Normalise column types: city as string, orderTime as datetime.
    for history in (history_train, history_test):
        history['city'] = history['city'].astype(str)
    for history in (history_train, history_test):
        history['orderTime'] = pd.to_datetime(history['orderTime'])

    def _create_if_missing(feature_name, builder, train_aux, test_aux):
        """Run ``builder`` on both splits and save, unless already created."""
        if data_utils.is_feature_created(feature_name):
            return
        print('build train ' + feature_name)
        train_features = builder(train, train_aux)
        print('build test ' + feature_name)
        test_features = builder(test, test_aux)
        print('save ', feature_name)
        data_utils.save_features(train_features, test_features, feature_name)

    _create_if_missing('advance_order_history_features',
                       gen_history_features, history_train, history_test)
    _create_if_missing('advance_action_features',
                       gen_action_features, actions_train, actions_test)
    _create_if_missing('advance_action_features1',
                       gen_action_features1, actions_train, actions_test)
    _create_if_missing('advance_action_features2',
                       gen_action_features2, actions_train, actions_test)
Esempio n. 2
0
def main():
    """Clean the order history and build the user order-history feature sets.

    The raw order-history files are passed through
    ``build_time_category_encode`` and always written to the cleaned-data
    directory. After that, each of the five feature sets is built and saved
    only when ``data_utils.is_feature_created`` reports it as missing.
    """
    # Orders to be predicted (raw train and test sets).
    train = pd.read_csv(Configure.base_path + 'train/orderFuture_train.csv', encoding='utf8')
    test = pd.read_csv(Configure.base_path + 'test/orderFuture_test.csv', encoding='utf8')
    orderHistory_train = pd.read_csv(Configure.base_path + 'train/orderHistory_train.csv', encoding='utf8')
    orderHistory_test = pd.read_csv(Configure.base_path + 'test/orderHistory_test.csv', encoding='utf8')
    orderHistory_train = build_time_category_encode(orderHistory_train)
    orderHistory_test = build_time_category_encode(orderHistory_test)
    orderHistory_train.to_csv(Configure.cleaned_path + 'cleaned_orderHistory_train.csv', index=False,
                              columns=orderHistory_train.columns)
    orderHistory_test.to_csv(Configure.cleaned_path + 'cleaned_orderHistory_test.csv', index=False,
                             columns=orderHistory_test.columns)

    feature_name = 'user_order_history_features'
    if not data_utils.is_feature_created(feature_name):
        print('build train user_order_history_features')
        train_features = build_order_history_features(train, orderHistory_train)
        print('build test user_order_history_features')
        test_features = build_order_history_features(test, orderHistory_test)
        print('save ', feature_name)
        data_utils.save_features(train_features, test_features, feature_name)

    feature_name = 'user_order_history_features2'
    if not data_utils.is_feature_created(feature_name):
        print('build train user_order_history_features2')
        train_features = build_order_history_features2(train, orderHistory_train)
        print('build test user_order_history_features2')
        test_features = build_order_history_features2(test, orderHistory_test)
        print('save ', feature_name)
        data_utils.save_features(train_features, test_features, feature_name)

    feature_name = 'user_order_history_features3'
    if not data_utils.is_feature_created(feature_name):
        # This builder also receives the combined train + test history.
        orderHistory = pd.concat([orderHistory_train, orderHistory_test])
        print('build train user_order_history_features3')
        train_features = build_order_history_features3(train, orderHistory, orderHistory_train)
        print('build test user_order_history_features3')
        test_features = build_order_history_features3(test, orderHistory, orderHistory_test)
        print('save ', feature_name)
        data_utils.save_features(train_features, test_features, feature_name)

    feature_name = 'user_order_history_features4'
    if not data_utils.is_feature_created(feature_name):
        print('build train user_order_history_features4')
        train_features = build_order_history_features4(train, orderHistory_train)
        print('build test user_order_history_features4')
        test_features = build_order_history_features4(test, orderHistory_test)
        print('save ', feature_name)
        data_utils.save_features(train_features, test_features, feature_name)

    feature_name = 'user_order_history_features_wxr'
    if not data_utils.is_feature_created(feature_name):
        # This builder also receives the combined train + test history.
        orderHistory = pd.concat([orderHistory_train, orderHistory_test])
        # Progress messages name the feature set actually being built.
        print('build train user_order_history_features_wxr')
        train_features = build_order_history_features_wxr(train, orderHistory, orderHistory_train)
        print('build test user_order_history_features_wxr')
        test_features = build_order_history_features_wxr(test, orderHistory, orderHistory_test)
        print('save ', feature_name)
        data_utils.save_features(train_features, test_features, feature_name)
Esempio n. 3
0
def main():
    """Create the three ``action_order_features*`` feature sets.

    The action logs are first combined with the order history by
    ``generate_new_action`` and grouped per ``userid``; each feature builder
    then receives the orderFuture frame and the per-user action groups.
    Feature sets that already exist are skipped.
    """
    base_dir = Configure.base_path

    train = pd.read_csv(base_dir + 'train/orderFuture_train.csv',
                        encoding='utf8')
    test = pd.read_csv(base_dir + 'test/orderFuture_test.csv',
                       encoding='utf8')

    history_train = pd.read_csv(base_dir + 'train/orderHistory_train.csv',
                                encoding='utf8')
    history_test = pd.read_csv(base_dir + 'test/orderHistory_test.csv',
                               encoding='utf8')

    actions_train = pd.read_csv(base_dir + 'train/action_train.csv')
    actions_test = pd.read_csv(base_dir + 'test/action_test.csv')

    actions_train = generate_new_action(actions_train, history_train)
    actions_test = generate_new_action(actions_test, history_test)

    # Map each userid to the sub-frame holding that user's actions.
    train_by_user = {
        userid: group for userid, group in actions_train.groupby('userid')
    }
    test_by_user = {
        userid: group for userid, group in actions_test.groupby('userid')
    }

    builders = (
        ('action_order_features1', build_action_order_features1),
        ('action_order_features2', build_action_order_features2),
        ('action_order_features3', build_action_order_features3),
    )
    for feature_name, builder in builders:
        if data_utils.is_feature_created(feature_name):
            continue
        print('build train ' + feature_name)
        train_features = builder(train, train_by_user)
        print('build test ' + feature_name)
        test_features = builder(test, test_by_user)
        print('save ', feature_name)
        data_utils.save_features(train_features, test_features, feature_name)
Esempio n. 4
0
def main():
    """Create the ``basic_user_action_features`` feature set.

    Does nothing when the feature set already exists. Otherwise the raw
    action logs get time features from ``build_time_features``, are written
    to the cleaned-data directory, summarised by ``basic_action_info`` and
    saved.
    """
    feature_name = 'basic_user_action_features'
    if data_utils.is_feature_created(feature_name):
        return

    actions_train = pd.read_csv(Configure.base_path + 'train/action_train.csv')
    actions_test = pd.read_csv(Configure.base_path + 'test/action_test.csv')

    actions_train = build_time_features(actions_train)
    actions_test = build_time_features(actions_test)

    print('save cleaned datasets')
    cleaned_train_path = Configure.cleaned_path + 'cleaned_action_train.csv'
    cleaned_test_path = Configure.cleaned_path + 'cleaned_action_test.csv'
    actions_train.to_csv(cleaned_train_path, index=False,
                         columns=actions_train.columns)
    actions_test.to_csv(cleaned_test_path, index=False,
                        columns=actions_test.columns)

    features_train = basic_action_info(actions_train)
    features_test = basic_action_info(actions_test)

    print('save ', feature_name)
    data_utils.save_features(features_train, features_test,
                             features_name=feature_name)
def main():
    """Create the ``basic_user_info`` feature set from the user profiles.

    Does nothing when the feature set already exists. Otherwise gender is
    dummy-coded, province is label-encoded into ``province_code``, the age
    column is rewritten as a bucket label, the cleaned profiles are written to
    the cleaned-data directory, and finally the age bucket is dummy-coded and
    the resulting frames are saved as the feature set.
    """
    feature_name = 'basic_user_info'
    if data_utils.is_feature_created(feature_name):
        return

    def _age_bucket(age):
        """Return 'lg' + the first two characters of *age*, or 'None' for NaN."""
        # NaN is the only value that is not equal to itself.
        return 'lg' + age[:2] if age == age else 'None'

    # Basic personal information of each user.
    train_user = pd.read_csv(Configure.base_path +
                             'train/userProfile_train.csv',
                             encoding='utf8')
    test_user = pd.read_csv(Configure.base_path + 'test/userProfile_test.csv',
                            encoding='utf8')

    # 1. Gender: normalise with gender_convert, then dummy-code.
    # NOTE(review): dummies are derived per split, so a category missing from
    # one split yields a different column set there -- confirm downstream
    # code tolerates that.
    train_user['gender'] = train_user['gender'].map(gender_convert)
    test_user['gender'] = test_user['gender'].map(gender_convert)
    dummies = pd.get_dummies(train_user['gender'], prefix='gender')
    train_user[dummies.columns] = dummies
    dummies = pd.get_dummies(test_user['gender'], prefix='gender')
    test_user[dummies.columns] = dummies

    # (A merge with province_economic.csv on 'province' is disabled here.)

    # 2. Province: normalise with province_convert, then label-encode.
    train_user['province'] = train_user['province'].map(province_convert)
    test_user['province'] = test_user['province'].map(province_convert)
    le = LabelEncoder()
    # Fit on the provinces of both splits: LabelEncoder.transform raises
    # ValueError for labels it was not fitted on, so a province occurring only
    # in the test split would otherwise abort the run. When every test
    # province also occurs in train, the resulting codes are the same as when
    # fitting on train alone.
    le.fit(pd.concat([train_user['province'], test_user['province']]).values)
    train_user['province_code'] = le.transform(train_user['province'])
    test_user['province_code'] = le.transform(test_user['province'])

    # 3. Age bracket: rewrite as a bucket label (dummy-coded further below).
    train_user['age'] = train_user['age'].map(_age_bucket)
    test_user['age'] = test_user['age'].map(_age_bucket)

    # The cleaned files are written before the age dummies are added.
    print('save cleaned datasets')
    train_user.to_csv(Configure.cleaned_path + 'cleaned_userProfile_train.csv',
                      index=False,
                      columns=train_user.columns)
    test_user.to_csv(Configure.cleaned_path + 'cleaned_userProfile_test.csv',
                     index=False,
                     columns=test_user.columns)

    dummies = pd.get_dummies(train_user['age'], prefix='age')
    train_user[dummies.columns] = dummies
    dummies = pd.get_dummies(test_user['age'], prefix='age')
    test_user[dummies.columns] = dummies

    print('save ', feature_name)
    data_utils.save_features(train_user, test_user, features_name=feature_name)
def main():
    """Create the ``user_order_comment_features`` feature set.

    Does nothing when the feature set already exists. Otherwise the user
    comment ratings are converted to integers, the cleaned comment frames are
    written to the cleaned-data directory, and ``built_comment_features`` is
    run on the train and test splits before saving the result.
    """
    feature_name = 'user_order_comment_features'
    if data_utils.is_feature_created(feature_name):
        return

    # Orders to be predicted (raw train and test sets).
    train = pd.read_csv(Configure.base_path + 'train/orderFuture_train.csv',
                        encoding='utf8')
    test = pd.read_csv(Configure.base_path + 'test/orderFuture_test.csv',
                       encoding='utf8')

    userComment_train = pd.read_csv(Configure.base_path +
                                    'train/userComment_train.csv',
                                    encoding='utf8')
    userComment_test = pd.read_csv(Configure.base_path +
                                   'test/userComment_test.csv',
                                   encoding='utf8')

    # Snap the listed fractional ratings to whole numbers before the integer
    # cast below (which truncates whatever fractional values remain).
    userComment_train.loc[userComment_train['rating'] == 4.33, 'rating'] = 4
    userComment_train.loc[userComment_train['rating'] == 3.67, 'rating'] = 4
    # The mask must come from the test frame itself: a mask built from the
    # train frame is aligned on row labels and would select unrelated test
    # rows (or fail when the labels cannot be aligned).
    userComment_test.loc[userComment_test['rating'] == 2.33, 'rating'] = 2
    userComment_train['rating'] = userComment_train['rating'].astype(int)
    userComment_test['rating'] = userComment_test['rating'].astype(int)

    print('save cleaned datasets')
    userComment_train.to_csv(Configure.cleaned_path +
                             'cleaned_userComment_train.csv',
                             index=False,
                             columns=userComment_train.columns,
                             encoding='utf8')
    userComment_test.to_csv(Configure.cleaned_path +
                            'cleaned_userComment_test.csv',
                            index=False,
                            columns=userComment_test.columns,
                            encoding='utf8')

    print('build train features')
    train_features = built_comment_features(train, userComment_train)
    print('build test features')
    test_features = built_comment_features(test, userComment_test)

    print('save ', feature_name)
    data_utils.save_features(train_features, test_features, feature_name)
Esempio n. 7
0
def main():
    """Assemble the ``wxr_features`` feature set from precomputed files.

    Does nothing when the feature set already exists. Otherwise the pickled
    comment features (missing values filled with -1) form the base frames, and
    selected history columns, selected action columns and the "some other"
    CSV features are left-joined onto them by ``userid`` before saving.
    """
    feature_name = 'wxr_features'
    if data_utils.is_feature_created(feature_name):
        return

    def _unpickle(path):
        """Load and return the pickled object stored at *path*."""
        with open(path, "rb") as handle:
            return cPickle.load(handle)

    def _left_join(features, extra):
        """Left-join *extra* onto *features* using the ``userid`` column."""
        return features.merge(extra, on='userid', how='left')

    history_columns = [
        'userid', 'avg_days_between_order', 'days_ratio_since_last_order',
        'city_num', 'country_num', 'continent_num', 'city_rich',
        'city_avg_rich', 'country_rich', 'country_avg_rich',
        'histord_time_last_1_year', 'histord_time_last_1_month',
        'histord_sum_cont1', 'histord_sum_cont2', 'histord_sum_cont3',
        'histord_sum_cont4', 'histord_sum_cont5', 'timespan_lastord_1_2',
        'timespan_lastord_2_3'
    ]
    action_columns = [
        'userid', 'avg_browse_num_after_last_order',
        'browse_num_after_last_order', 'operate_num_after_last_order',
        'avg_operate_num_after_last_order', 'open_num_after_last_order',
        'action_1_num_after_last_order', 'action_2_num_after_last_order',
        'action_3_num_after_last_order', 'action_4_num_after_last_order',
        'action_5_num_after_last_order', 'action_6_num_after_last_order',
        'action_7_num_after_last_order', 'action_8_num_after_last_order',
        'action_9_num_after_last_order'
    ]

    print('add comment score features')
    train_features = _unpickle('wxr_train_comment_features.pkl')
    test_features = _unpickle('wxr_test_comment_features.pkl')
    train_features.fillna(-1, inplace=True)
    test_features.fillna(-1, inplace=True)

    # The user_info feature merge (wxr_*_user_info_features.pkl, minus the
    # gender/province/age columns) is currently disabled.

    print('add history features')
    history_train = _unpickle('wxr_operate_4_train_order_history_features.pkl')
    history_test = _unpickle('wxr_operate_4_test_order_history_features.pkl')
    train_features = _left_join(train_features, history_train[history_columns])
    test_features = _left_join(test_features, history_test[history_columns])

    print('add action features')
    action_train = _unpickle('wxr_operate_3_train_action_features.pkl')
    action_test = _unpickle('wxr_operate_3_test_action_features.pkl')
    train_features = _left_join(train_features, action_train[action_columns])
    test_features = _left_join(test_features, action_test[action_columns])

    print('add someother features')
    other_train = pd.read_csv('some_other_train_features.csv')
    other_test = pd.read_csv('some_other_test_features.csv')
    train_features = _left_join(train_features, other_train)
    test_features = _left_join(test_features, other_test)

    print('save ', feature_name)
    data_utils.save_features(train_features, test_features, feature_name)
Esempio n. 8
0
def main():
    """Build and save the question-pair feature sets.

    A random sample of 1000 rows is drawn from each of the train and test
    files and the ``q1``/``q2`` ids are replaced by the matching word and
    character text from the question file. ``basic_features`` and
    ``basic_features2`` are then each built and saved only when
    ``data_utils.is_feature_created`` reports them as missing.
    """

    def _attach_question_text(pairs, questions):
        """Return *pairs* with q1/q2 ids replaced by their words/chars text."""
        pairs['id'] = np.arange(pairs.shape[0])
        for side in ('q1', 'q2'):
            pairs = pd.merge(pairs,
                             questions,
                             left_on=[side],
                             right_on=['qid'],
                             how='left')
            pairs = pairs.rename(columns={'words': side + '_words',
                                          'chars': side + '_chars'})
            # Drop the join key so the next merge can add it again.
            pairs = pairs.drop(['qid'], axis=1)
        return pairs.drop(['q1', 'q2'], axis=1)

    def _token_weights(frame, first_col, second_col):
        """Map each space-separated token of two columns to its weight.

        Tokens are counted over both columns of *frame* and each count is
        converted with ``get_weight``.
        """
        token_lists = (
            frame[first_col].map(lambda text: text.split(' ')).tolist() +
            frame[second_col].map(lambda text: text.split(' ')).tolist())
        counts = Counter(token for tokens in token_lists for token in tokens)
        return {token: get_weight(count) for token, count in counts.items()}

    print('load datasets')
    questions = pd.read_csv(Configure.question_file)
    train = pd.read_csv(Configure.train_data_file).sample(n=1000)
    test = pd.read_csv(Configure.test_data_file).sample(n=1000)

    train = _attach_question_text(train, questions)
    test = _attach_question_text(test, questions)

    feature_name = 'basic_features'
    if not data_utils.is_feature_created(feature_name):
        # Token weights are derived from the train split only.
        words_weights = _token_weights(train, 'q1_words', 'q2_words')
        chars_weights = _token_weights(train, 'q1_chars', 'q2_chars')

        # For every question text, the set of texts it is paired with across
        # both the train and the test split.
        ques = pd.concat(
            [train[['q1_words', 'q2_words']], test[['q1_words', 'q2_words']]],
            axis=0).reset_index(drop=True)
        q_dict = defaultdict(set)
        for first, second in zip(ques['q1_words'], ques['q2_words']):
            q_dict[first].add(second)
            q_dict[second].add(first)

        print('train build_basic_features')
        train_features = build_features1(train, words_weights, chars_weights,
                                         q_dict)
        print('test build_basic_features')
        test_features = build_features1(test, words_weights, chars_weights,
                                        q_dict)
        data_utils.save_features(train_features, test_features, feature_name)

    feature_name = 'basic_features2'
    # Build only when the feature set does not exist yet, like every other
    # feature set in this function.
    if not data_utils.is_feature_created(feature_name):
        print('create gensim model')
        word_model = gensim.models.KeyedVectors.load_word2vec_format(
            Configure.word_embed_path, binary=False)
        char_model = gensim.models.KeyedVectors.load_word2vec_format(
            Configure.char_embed_path, binary=False)

        print('train build_features2')
        train_features = build_features2(train, word_model, char_model)
        print('test build_features2')
        test_features = build_features2(test, word_model, char_model)
        data_utils.save_features(train_features, test_features, feature_name)
Esempio n. 9
0
def main():
    """Create the user comment feature sets (basic and ``_wxr`` variants).

    Does nothing when ``user_order_comment_features`` already exists.
    Otherwise the comment ratings are converted to integers, each comment is
    joined with the ``orderType`` of its order, scored by
    ``commentKey_score`` and ``tag_score``, and written to the cleaned-data
    directory. ``built_comment_features`` output is then saved, followed by
    the ``user_order_comment_features_wxr`` set if that one is missing.
    """
    feature_name = 'user_order_comment_features'
    # NOTE(review): this early return also skips the
    # 'user_order_comment_features_wxr' section at the end whenever the basic
    # feature set already exists -- confirm that is intended.
    if data_utils.is_feature_created(feature_name):
        return

    # Orders to be predicted (raw train and test sets).
    train = pd.read_csv(Configure.base_path + 'train/orderFuture_train.csv',
                        encoding='utf8')
    test = pd.read_csv(Configure.base_path + 'test/orderFuture_test.csv',
                       encoding='utf8')

    userComment_train = pd.read_csv(Configure.base_path +
                                    'train/userComment_train.csv',
                                    encoding='utf8')
    userComment_test = pd.read_csv(Configure.base_path +
                                   'test/userComment_test.csv',
                                   encoding='utf8')

    # Snap the listed fractional ratings to whole numbers before the integer
    # cast below (which truncates whatever fractional values remain).
    userComment_train.loc[userComment_train['rating'] == 4.33, 'rating'] = 4
    userComment_train.loc[userComment_train['rating'] == 3.67, 'rating'] = 4
    # The mask must come from the test frame itself: a mask built from the
    # train frame is aligned on row labels and would select unrelated test
    # rows (or fail when the labels cannot be aligned).
    userComment_test.loc[userComment_test['rating'] == 2.33, 'rating'] = 2
    userComment_train['rating'] = userComment_train['rating'].astype(int)
    userComment_test['rating'] = userComment_test['rating'].astype(int)

    orderHistory_train = pd.read_csv(Configure.cleaned_path +
                                     'cleaned_orderHistory_train.csv',
                                     encoding='utf8')
    orderHistory_test = pd.read_csv(Configure.cleaned_path +
                                    'cleaned_orderHistory_test.csv',
                                    encoding='utf8')
    # Attach each comment's orderType from the order history.
    userComment_train = pd.merge(userComment_train,
                                 orderHistory_train[['orderid', 'orderType']],
                                 on='orderid',
                                 how='left')
    userComment_test = pd.merge(userComment_test,
                                orderHistory_test[['orderid', 'orderType']],
                                on='orderid',
                                how='left')
    userComment_train = commentKey_score(userComment_train)
    userComment_test = commentKey_score(userComment_test)
    userComment_train = tag_score(userComment_train)
    userComment_test = tag_score(userComment_test)
    print('save cleaned datasets')
    userComment_train.to_csv(Configure.cleaned_path +
                             'cleaned_userComment_train.csv',
                             index=False,
                             columns=userComment_train.columns,
                             encoding='utf8')
    userComment_test.to_csv(Configure.cleaned_path +
                            'cleaned_userComment_test.csv',
                            index=False,
                            columns=userComment_test.columns,
                            encoding='utf8')

    print('build train features')
    train_features = built_comment_features(train, userComment_train)
    print('build test features')
    test_features = built_comment_features(test, userComment_test)
    print('save ', feature_name)
    data_utils.save_features(train_features, test_features, feature_name)

    print('build wxr features')
    feature_name = 'user_order_comment_features_wxr'
    if not data_utils.is_feature_created(feature_name):
        print('build train action history features11')
        train_features = built_comment_features_wxr(train, userComment_train,
                                                    orderHistory_train)
        print('build test action history features11')
        test_features = built_comment_features_wxr(test, userComment_test,
                                                   orderHistory_test)
        print('save ', feature_name)
        data_utils.save_features(train_features, test_features, feature_name)