Esempio n. 1
0
    def run(self):
        """Build features and fit the models for each processed mall.

        For every unique ``mall_id`` in ``self.shop_info`` that passes the
        filter below, this method:

        1. selects that mall's rows from ``self.train_data`` / ``self.evl_data``;
        2. parses the raw ``wifi_infos`` strings via ``self.wifi_info_process``;
        3. calls ``self.mall_init`` and ``self.get_wifi_vector`` on the data;
        4. min-max scales the longitude/latitude/``wifi_<i>`` feature matrix;
        5. passes the train/test split to ``RF().train``, ``xgb.analyse`` and
           ``xgb.train``.

        Returns nothing; results are whatever those calls produce as side effects.
        """

        def parse_wifi_infos(raw):
            # A raw value is ';'-separated entries, each with '|'-separated fields.
            return [
                self.wifi_info_process(wifi.split('|'))
                for wifi in raw.split(';')
            ]

        mall_list = self.shop_info.mall_id.unique()
        for mall_id in mall_list:
            # NOTE(review): every mall except 'm_6803' is skipped. This looks like
            # a debugging filter -- confirm before removing it.
            if mall_id != 'm_6803':
                continue

            # Extract the training and evaluation rows for this mall. Work on
            # explicit copies (``rename`` returns a new frame, ``copy`` for the
            # evaluation rows) so the column assignments below do not write
            # into a filtered slice and trigger SettingWithCopyWarning.
            train_mall_df = self.train_data[
                self.train_data.mall_id == mall_id
            ].rename(columns={
                'longitude_x': 'longitude',
                'latitude_x': 'latitude',
            })
            evl_mall_df = self.evl_data[self.evl_data.mall_id == mall_id].copy()

            # Pre-process the wifi_infos field.
            train_mall_df['wifi_infos'] = train_mall_df['wifi_infos'].apply(
                parse_wifi_infos)
            evl_mall_df['wifi_infos'] = evl_mall_df['wifi_infos'].apply(
                parse_wifi_infos)

            # Extract the training labels and the evaluation row ids.
            row_ids = list(evl_mall_df['row_id'])
            shop_ids = list(train_mall_df['shop_id'])

            # Keep only the columns that are needed.
            train_columns = ['longitude', 'latitude', 'wifi_infos', 'shop_id']
            evl_columns = ['longitude', 'latitude', 'wifi_infos', 'row_id']
            train_mall_df = train_mall_df[train_columns]
            evl_mall_df = evl_mall_df[evl_columns]

            # Initialise the per-mall data structures.
            self.mall_init(mall_id, train_mall_df, evl_mall_df)

            # Concatenate train and evaluation rows (train first) so both get
            # the same pre-processing.
            df = pd.concat([train_mall_df, evl_mall_df])
            df = self.get_wifi_vector(df)
            # presumably self.wifi is populated by mall_init / get_wifi_vector;
            # verify against those methods.
            columns = ['longitude', 'latitude'
                       ] + ['wifi_' + str(i) for i in range(len(self.wifi))]
            df = df[columns]
            df = df.fillna(0)
            X = np.asarray(df, dtype=np.float64)

            # The scaler is fitted on train and evaluation rows together.
            min_max_scaler = MinMaxScaler()
            min_max_scaler.fit(X)
            X = min_max_scaler.transform(X)

            # Split back into train and test: the first len(shop_ids) rows came
            # from the training frame.
            X_train = X[:len(shop_ids)]
            X_test = X[len(shop_ids):]

            rf = RF()
            rf.train(mall_id, X_train, shop_ids, X_test, row_ids)
            # ``xgb`` is not defined in this method; assumed to be a
            # module-level object -- TODO confirm.
            xgb.analyse(mall_id, X_train, shop_ids)
            xgb.train(mall_id, X_train, shop_ids, X_test, row_ids)
            print('=' * 120)
Esempio n. 2
0
    def test_seed_diff(self):
        """Different seeds must not produce element-wise identical weights."""

        def weights_for(seed):
            # Fit X1, then X2; the value returned by the second fit is compared.
            forest = RF(y, mtry=0.75, n_jobs=20, seed=seed)
            forest.fit(X1)
            return forest.fit(X2)

        first = weights_for(2001)
        second = weights_for(2002)

        self.assertFalse(all(first == second))
Esempio n. 3
0
    def test_seed(self):
        """Two forests built with the same seed must yield (almost) equal weights."""

        def weights_for(seed):
            # Fit X1, then X2; the value returned by the second fit is compared.
            forest = RF(y, mtry=0.75, n_jobs=20, seed=seed)
            forest.fit(X1)
            return forest.fit(X2)

        first = weights_for(2001)
        second = weights_for(2001)

        assert_almost_equal(first, second)
Esempio n. 4
0
class RFWrapper:
    """
    Online Random Forest.

    Wraps an ``RF`` instance together with the ``X_t`` / ``y_t`` data that
    ``get_auc`` scores against.
    """

    def __init__(self, y, X_t, y_t, n_jobs, mtry, random_state):
        """Create the wrapped ``RF`` (``random_state`` is passed as ``seed``)
        and remember ``X_t`` and ``y_t`` for later scoring."""
        forest = RF(y, n_jobs=n_jobs, mtry=mtry, seed=random_state)
        self.rf = forest
        self.X_t = X_t
        self.y_t = y_t

    def fit(self, x):
        """Forward ``x`` to the wrapped forest's ``fit``; returns ``None``."""
        self.rf.fit(x)

    def get_auc(self):
        """Score ``X_t`` with the forest and return the ROC AUC against ``y_t``
        (label ``1`` is treated as the positive class)."""
        scores = self.rf.score(self.X_t)
        false_pos_rate, true_pos_rate, _ = metrics.roc_curve(
            self.y_t, scores, pos_label=1)
        return metrics.auc(false_pos_rate, true_pos_rate)
Esempio n. 5
0
    def test_anytime(self):
        """Weights stay within three standard deviations of their mean, and a
        newly added feature gets the mean weight of the earlier features."""
        # The feature weights should be as uniformly distributed as possible.
        # Old features are selected with very little restraint, so old
        # distributions can get disturbed and are not corrected afterwards;
        # hence only these loose constraints are checked.
        new_feature_msg = "The new feature should have the weight equivalent to the average weight of all the previous features"

        def check_new_feature(current, n_previous):
            # The feature at index n_previous is the one that was just added.
            assert_almost_equal(
                mean(current[0:n_previous]),
                current[n_previous],
                err_msg=new_feature_msg)

        rf = RF(y, mtry=0.75, n_jobs=30, seed=2001)
        rf.fit(random.randint(0, 10, (5, 20)))
        weights = rf.fit(random.randint(0, 10, (5, 1)))

        lowest = min(weights)
        lower_bound = mean(weights) - 3 * std(weights)
        self.assertTrue(lowest > lower_bound)

        highest = max(weights)
        upper_bound = mean(weights) + 3 * std(weights)
        self.assertTrue(highest < upper_bound)

        check_new_feature(weights, 20)

        weights = rf.fit(random.randint(0, 10, (5, 1)))
        check_new_feature(weights, 21)
Esempio n. 6
0
    def test_incremental_learning(self):
        """Initialise the RF, add a feature, score, add features, score again.

        Both predictions are expected to (almost) equal ``y``.
        """
        leak_msg = "Feature X0 is a leaking feature - overfit on it!"
        forest = RF(y, mtry=0.8, n_jobs=2, seed=2001)

        forest.fit(X0)
        after_first_fit = forest.score(X0)

        forest.fit(X1)
        after_second_fit = forest.score(column_stack((X0, X1)))

        for prediction in (after_first_fit, after_second_fit):
            assert_almost_equal(prediction, y, err_msg=leak_msg)
Esempio n. 7
0
def randomForest():
    """Build an ``RF`` from the training data and print its results.

    Prints the value returned by ``rf.predictRF(X_test, y_test)`` as the test
    accuracy, then the ``rf.predictRating`` output for three reviews taken
    from ``w_reviews['text']`` and for one hard-coded review string.

    Relies on names defined outside this function: ``RF``, ``X_train``,
    ``y_train``, ``X_test``, ``y_test``, ``w_reviews`` and ``feature_matrix``.

    Every ``print`` call takes exactly one argument, so the function parses on
    Python 3 and prints the same text on Python 2 (no tuple output).
    """
    print("--------------------Random Forest---------------------")
    rf = RF(X_train, y_train)
    test_accuracy = rf.predictRF(X_test, y_test)
    print("test accuracy of Random Forest is {}".format(test_accuracy))

    # (label used in the message, index into w_reviews['text']).
    samples = (("Positive", 0), ("Negative", 16), ("Neutral", 1))
    for label, index in samples:
        print(" Sample prediction of the rating by RF for a {} review".format(
            label))
        review = w_reviews['text'][index]
        review_transformed = feature_matrix.transform([review])
        print(rf.predictRating(review_transformed))

    our_review = "Horrible food "
    # Two spaces after '::' reproduce the original output, where the literal
    # ended in a space and the print statement added a separator space.
    print("our test for sample text ::  {}".format(our_review))
    our_review_transformed = feature_matrix.transform([our_review])
    print("Rating of our review {}".format(
        rf.predictRating(our_review_transformed)))
Esempio n. 8
0
 def test_vector(self):
     """Fitting must accept ``y`` itself as the feature input (no assertions;
     the test passes if no exception is raised)."""
     forest = RF(y, mtry=0.8, n_jobs=2, seed=2001)
     forest.fit(y)
Esempio n. 9
0
 def __init__(self, y, X_t, y_t, n_jobs, mtry, random_state):
     """Create the wrapped ``RF`` (``random_state`` is passed as ``seed``)
     and store ``X_t`` and ``y_t`` on the instance."""
     forest = RF(y, n_jobs=n_jobs, mtry=mtry, seed=random_state)
     self.rf = forest
     self.X_t = X_t
     self.y_t = y_t