def run(self):
    """Build features and train/predict shop labels, one mall at a time.

    For each selected mall the method:
      1. extracts that mall's rows from ``self.train_data`` and
         ``self.evl_data``,
      2. parses the raw ``wifi_infos`` strings,
      3. builds a joint feature matrix (longitude, latitude, one column
         per known Wi-Fi) over train + evaluation rows and min-max scales it,
      4. hands the train/test split to the RF and xgb models.

    Returns nothing; results are produced by the model objects' ``train`` /
    ``analyse`` calls and a separator line is printed per processed mall.
    """

    def parse_wifi_infos(raw):
        # A raw record is ';'-separated entries, each entry '|'-separated
        # fields; every entry is normalised by self.wifi_info_process.
        return [
            self.wifi_info_process(wifi.split('|'))
            for wifi in raw.split(';')
        ]

    mall_list = self.shop_info.mall_id.unique()
    for mall_id in mall_list:
        # NOTE(review): only mall 'm_6803' is processed; this looks like a
        # leftover debugging filter - confirm before removing.
        if mall_id != 'm_6803':
            continue

        # Extract the training and evaluation rows of this mall.
        # Explicit copies: the frames are mutated below (rename / column
        # assignment), and doing that on a boolean-mask slice of the parent
        # frame triggers pandas' SettingWithCopyWarning and is ambiguous
        # about whether the parent is affected.
        train_mall_df = self.train_data[
            self.train_data.mall_id == mall_id].copy()
        evl_mall_df = self.evl_data[
            self.evl_data.mall_id == mall_id].copy()
        train_mall_df.rename(columns={
            'longitude_x': 'longitude',
            'latitude_x': 'latitude'
        }, inplace=True)

        # Pre-process the wifi_infos field (same parsing for both frames).
        train_mall_df['wifi_infos'] = train_mall_df['wifi_infos'].apply(
            parse_wifi_infos)
        evl_mall_df['wifi_infos'] = evl_mall_df['wifi_infos'].apply(
            parse_wifi_infos)

        # Training labels and evaluation row numbers.
        row_ids = list(evl_mall_df['row_id'])
        shop_ids = list(train_mall_df['shop_id'])

        # Keep only the columns that are needed downstream.
        train_columns = ['longitude', 'latitude', 'wifi_infos', 'shop_id']
        evl_columns = ['longitude', 'latitude', 'wifi_infos', 'row_id']
        train_mall_df = train_mall_df[train_columns]
        evl_mall_df = evl_mall_df[evl_columns]

        # Initialise the per-mall data structures.
        self.mall_init(mall_id, train_mall_df, evl_mall_df)

        # Concatenate train and evaluation rows so both go through the same
        # feature construction and scaling.
        df = pd.concat([train_mall_df, evl_mall_df])
        df = self.get_wifi_vector(df)
        columns = ['longitude', 'latitude'] + [
            'wifi_' + str(i) for i in range(len(self.wifi))
        ]
        df = df[columns]
        df = df.fillna(0)
        X = np.asarray(df, dtype=np.float64)
        # The scaler is fitted on train + evaluation rows together.
        min_max_scaler = MinMaxScaler()
        min_max_scaler.fit(X)
        X = min_max_scaler.transform(X)

        # Split back into training and test parts; training rows come first
        # because train_mall_df was the first frame in the concat.
        n_train = len(shop_ids)
        X_train = X[:n_train]
        X_test = X[n_train:]

        rf = RF()
        rf.train(mall_id, X_train, shop_ids, X_test, row_ids)
        # NOTE(review): `xgb` is not created in this method; presumably a
        # module-level model object - verify.
        xgb.analyse(mall_id, X_train, shop_ids)
        xgb.train(mall_id, X_train, shop_ids, X_test, row_ids)
        print('=' * 120)
def test_seed_diff(self):
    """Forests built with different seeds must not yield identical results."""
    weights_per_seed = []
    for seed in (2001, 2002):
        forest = RF(y, mtry=0.75, n_jobs=20, seed=seed)
        forest.fit(X1)
        # The value under comparison is what the second fit call returns.
        weights_per_seed.append(forest.fit(X2))

    first, second = weights_per_seed
    self.assertFalse(all(first == second))
def test_seed(self):
    """Forests built with the same seed must yield identical results."""
    weights_per_run = []
    for seed in (2001, 2001):
        forest = RF(y, mtry=0.75, n_jobs=20, seed=seed)
        forest.fit(X1)
        # The value under comparison is what the second fit call returns.
        weights_per_run.append(forest.fit(X2))

    first, second = weights_per_run
    assert_almost_equal(first, second)
class RFWrapper:
    """Online Random Forest bundled with a fixed evaluation set.

    Wraps an ``RF`` instance together with the data (``X_t``, ``y_t``) that
    ``get_auc`` scores it against.
    """

    def __init__(self, y, X_t, y_t, n_jobs, mtry, random_state):
        """Create the underlying forest and remember the evaluation data.

        ``y``, ``n_jobs`` and ``mtry`` are forwarded to ``RF`` unchanged;
        ``random_state`` is forwarded as ``seed``. ``X_t`` and ``y_t`` are
        stored for later use by ``get_auc``.
        """
        self.rf = RF(
            y,
            n_jobs=n_jobs,
            mtry=mtry,
            seed=random_state,
        )
        self.X_t = X_t
        self.y_t = y_t

    def fit(self, x):
        """Feed ``x`` to the wrapped forest; returns nothing."""
        self.rf.fit(x)

    def get_auc(self):
        """Return the ROC AUC of the forest's scores on the stored data."""
        scores = self.rf.score(self.X_t)
        roc = metrics.roc_curve(self.y_t, scores, pos_label=1)
        # roc_curve yields (fpr, tpr, thresholds); thresholds are not needed.
        false_pos_rate, true_pos_rate = roc[0], roc[1]
        return metrics.auc(false_pos_rate, true_pos_rate)
def test_anytime(self):
    """Feature weights should stay roughly uniform as features are added.

    Old features are selected with very little restraint, so old
    distributions may get disturbed and nothing corrects for that. The test
    therefore only enforces loose constraints: every weight lies within three
    standard deviations of the mean, and a newly added feature receives the
    average weight of the features that preceded it.
    """
    new_feature_msg = (
        "The new feature should have the weight equivalent to the average "
        "weight of all the previous features"
    )

    forest = RF(y, mtry=0.75, n_jobs=30, seed=2001)
    forest.fit(random.randint(0, 10, (5, 20)))

    # Add a 21st feature (index 20) and inspect the returned weights.
    weights = forest.fit(random.randint(0, 10, (5, 1)))
    centre = mean(weights)
    band = 3 * std(weights)
    self.assertTrue(min(weights) > (centre - band))
    self.assertTrue(max(weights) < (centre + band))
    assert_almost_equal(
        mean(weights[0:20]), weights[20], err_msg=new_feature_msg)

    # Add a 22nd feature (index 21) and repeat the average-weight check.
    weights = forest.fit(random.randint(0, 10, (5, 1)))
    assert_almost_equal(
        mean(weights[0:21]), weights[21], err_msg=new_feature_msg)
def test_incremental_learning(self):
    """Fit and score can be interleaved while features are added.

    Sequence exercised: initialise, fit X0, score, fit X1, score on the
    column-stacked (X0, X1). Both predictions are expected to match ``y``.
    """
    leak_msg = "Feature X0 is a leaking feature - overfit on it!"
    forest = RF(y, mtry=0.8, n_jobs=2, seed=2001)

    forest.fit(X0)
    score_after_x0 = forest.score(X0)

    forest.fit(X1)
    stacked = column_stack((X0, X1))
    score_after_x1 = forest.score(stacked)

    # Both checks run only after all fitting/scoring is done, in order.
    for scored in (score_after_x0, score_after_x1):
        assert_almost_equal(scored, y, err_msg=leak_msg)
def randomForest():
    """Train a random forest and print its accuracy plus sample predictions.

    Builds ``RF`` from the module-level ``X_train`` / ``y_train``, prints the
    test accuracy on ``X_test`` / ``y_test``, then prints the predicted rating
    for three reviews taken from ``w_reviews['text']`` and for one hard-coded
    review string. Returns nothing.

    Output uses single-argument ``print(...)`` calls so the function parses
    and prints the same text on both Python 2 and Python 3 (the original
    ``print`` statements are a SyntaxError on Python 3).
    """
    print("--------------------Random Forest---------------------")
    rf = RF(X_train, y_train)
    test_accuracy = rf.predictRF(X_test, y_test)
    print("test accuracy of Random Forest is {}".format(test_accuracy))

    # (label, row index into w_reviews['text']) for the sample reviews.
    samples = (("Positive", 0), ("Negative", 16), ("Neutral", 1))
    for label, row in samples:
        print(" Sample prediction of the rating by RF for a {} review"
              .format(label))
        review = w_reviews['text'][row]
        review_transformed = feature_matrix.transform([review])
        print(rf.predictRating(review_transformed))

    our_review = "Horrible food "
    # Two spaces after '::' reproduce the original output: the literal ended
    # with a space and the print statement added a separator space.
    print("our test for sample text ::  {}".format(our_review))
    our_review_transformed = feature_matrix.transform([our_review])
    print("Rating of our review {}".format(
        rf.predictRating(our_review_transformed)))
def test_vector(self):
    """Smoke test: fitting the forest on ``y`` itself must not raise.

    There are no assertions; the test passes if construction and ``fit``
    complete. (``y`` is presumably a 1-D vector, per the test name.)
    """
    RF(y, mtry=0.8, n_jobs=2, seed=2001).fit(y)
def __init__(self, y, X_t, y_t, n_jobs, mtry, random_state):
    """Create the wrapped forest and store the evaluation data.

    ``y``, ``n_jobs`` and ``mtry`` are forwarded to ``RF`` unchanged;
    ``random_state`` is forwarded as ``seed``. ``X_t`` and ``y_t`` are kept
    on the instance as given.
    """
    forest = RF(y, n_jobs=n_jobs, mtry=mtry, seed=random_state)
    self.rf = forest
    self.X_t, self.y_t = X_t, y_t