def Myprediction(df, features, clf, name, item_category_list_unique,
                 test_path='data/test/round1_ijcai_18_test_a_20180301.txt',
                 format_path='data/output/0203.txt',
                 output_dir='data/output'):
    """Fit ``clf`` on ``df``, score the test set and write a submission file.

    Args:
        df: Training frame; must contain every column in ``features`` plus
            ``is_trade``.
        features: Column names fed to the classifier.
        clf: Classifier whose ``fit`` accepts ``eval_set``, ``eval_metric``
            and ``verbose`` and which provides ``predict_proba``.
        name: Stem of the output file, written to ``<output_dir>/<name>.txt``.
        item_category_list_unique: Category strings seen in training; the
            position of each string is its integer code.
        test_path: Space-separated test file to score.
        format_path: Space-separated file whose ``instance_id`` column fixes
            the rows (and their order) of the submission.
        output_dir: Directory the submission is written to.

    Returns:
        The fitted ``clf``.
    """
    testdf = pd.read_csv(test_path, sep=' ')
    # Shift timestamps by 8 hours before deriving time features
    # (presumably UTC -> UTC+8 local time; confirm against convert_time).
    testdf['context_timestamp'] += 8 * 60 * 60
    testdf = convert_time(testdf)

    # Encode categories with the training-time codes. The result is assigned
    # back to the column: an inplace replace on ``testdf.item_category_list``
    # acts on a temporary Series under pandas copy-on-write and would leave
    # the frame untouched. Categories unseen in training become NaN.
    category_codes = {}
    for code, category in enumerate(item_category_list_unique):
        category_codes.setdefault(category, code)
    testdf['item_category_list'] = testdf['item_category_list'].map(
        category_codes)

    prediction_format = pd.read_csv(format_path, sep=' ')

    train, predict = df, testdf
    # NOTE: the evaluation set is the training data itself, so the logged
    # logloss is a training loss, not a validation loss.
    clf.fit(train[features], train.is_trade,
            eval_set=[(train[features], train.is_trade)],
            eval_metric='logloss',
            verbose=True)

    predict['predicted_score'] = clf.predict_proba(predict[features])[:, 1]
    print(predict[['instance_id', 'predicted_score']])

    # Left-join on the template so the submission keeps exactly its rows;
    # ids missing from the test file get an empty predicted_score.
    prediction_file = pd.merge(prediction_format[['instance_id']],
                               predict[['instance_id', 'predicted_score']],
                               on='instance_id', how='left')
    prediction_file.to_csv('{}/{}.txt'.format(output_dir, name),
                           sep=' ', index=False)
    return clf
def _token_lists(column):
    """Split every value of ``column`` on ';' after converting it to ``str``."""
    return column.map(lambda value: str(value).split(';'))


def _encode_token_positions(data, prefix, tokens, positions, encoder):
    """Label-encode the token at each position into ``prefix + str(pos)``.

    Rows that have no token at a position contribute the empty string.
    """
    for pos in positions:
        # ``pos=pos`` binds the current position instead of relying on the
        # loop variable being read before the next iteration.
        nth_token = tokens.map(
            lambda parts, pos=pos: parts[pos] if len(parts) > pos else '')
        data[prefix + str(pos)] = encoder.fit_transform(nth_token)


def _membership_flag(column, members):
    """Return 1 where the value is one of ``members`` and 2 elsewhere."""
    return 2 - column.isin(members).astype('int64')


def base_process(data):
    """Add item, user, context and shop features to ``data``.

    The following columns are written (existing id columns are overwritten
    with their label-encoded codes):

    * ``len_item_category`` / ``len_item_property`` /
      ``len_predict_category_property``: number of ';'-separated tokens.
    * ``item_category_list1..2``, ``item_property_list0..9``,
      ``predict_category_property0..4``: label-encoded token at that
      position ('' when the row has fewer tokens).
    * ``item_id``, ``item_brand_id``, ``item_city_id``, ``user_id``,
      ``shop_id``: label-encoded in place.
    * ``gender0``, ``age0``, ``occupation0``, ``star0``, ``context_page0``:
      1 when the source value is in the listed set, otherwise 2.
    * ``shop_score_delivery0``: 0 when 0.96 <= score <= 0.98, otherwise 1.

    The 1/2 flags previously used ``x == a | x == b``, which Python parses as
    the chained comparison ``x == (a | x) == b`` and therefore almost never
    matched; they are now real membership tests.

    Args:
        data: Frame with the raw columns named above. It is modified in
            place up to the ``convert_time`` call.

    Returns:
        The frame returned by ``convert_time`` with all features added.
    """
    lbl = preprocessing.LabelEncoder()

    print(
        '--------------------------------------------------------------item--------------------------------------------------------------'
    )
    # Split each list column once and reuse the token lists for both the
    # length feature and the positional encodings.
    category_tokens = _token_lists(data['item_category_list'])
    property_tokens = _token_lists(data['item_property_list'])
    data['len_item_category'] = category_tokens.map(len)
    data['len_item_property'] = property_tokens.map(len)
    # Position 0 of item_category_list is the same for every row (original
    # author's note), so only positions 1 and 2 are encoded.
    _encode_token_positions(
        data, 'item_category_list', category_tokens, range(1, 3), lbl)
    _encode_token_positions(
        data, 'item_property_list', property_tokens, range(10), lbl)
    for col in ['item_id', 'item_brand_id', 'item_city_id']:
        data[col] = lbl.fit_transform(data[col])

    print(
        '--------------------------------------------------------------user--------------------------------------------------------------'
    )
    for col in ['user_id']:
        data[col] = lbl.fit_transform(data[col])
    print('user 0,1 feature')
    data['gender0'] = _membership_flag(data['user_gender_id'], [-1])
    data['age0'] = _membership_flag(
        data['user_age_level'], [1004, 1005, 1006, 1007])
    data['occupation0'] = _membership_flag(
        data['user_occupation_id'], [-1, 2003])
    data['star0'] = _membership_flag(
        data['user_star_level'], [-1, 3000, 3001])

    print(
        '--------------------------------------------------------------context--------------------------------------------------------------'
    )
    data = convert_time(data)
    predict_tokens = _token_lists(data['predict_category_property'])
    data['len_predict_category_property'] = predict_tokens.map(len)
    _encode_token_positions(
        data, 'predict_category_property', predict_tokens, range(5), lbl)
    print('context 0,1 feature')
    data['context_page0'] = _membership_flag(
        data['context_page_id'], [4001, 4002, 4003, 4004, 4007])

    print(
        '--------------------------------------------------------------shop--------------------------------------------------------------'
    )
    for col in ['shop_id']:
        data[col] = lbl.fit_transform(data[col])
    # ``between`` is inclusive on both ends, matching 0.96 <= x <= 0.98.
    in_band = data['shop_score_delivery'].between(0.96, 0.98)
    data['shop_score_delivery0'] = (~in_band).astype('int64')
    return data
verbose=True) predict['predicted_score'] = clf.predict_proba(predict[features])[:, 1] print(predict[['instance_id', 'predicted_score']]) prediction_file = pd.merge(prediction_format[['instance_id']], predict[['instance_id', 'predicted_score']], on='instance_id', how='left') prediction_file.to_csv('data/output/{}.txt'.format(name), sep=' ', index=None) return clf df = pd.read_csv('data/train/round1_ijcai_18_train_20180301.txt', sep=' ') df.context_timestamp += 8 * 60 * 60 df = convert_time(df) item_category_list_unique = list(np.unique(df.item_category_list)) df.item_category_list.replace(item_category_list_unique, list(np.arange(len(item_category_list_unique))), inplace=True) label = 'is_trade' start_features = [ 'item_category_list', 'item_city_id', 'item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level', 'user_gender_id', 'user_age_level',