def test_partial_fit_equivalence():
    """Compare one-shot ``partial_fit`` against batched ``partial_fit``.

    A forest trained with a single ``partial_fit`` call on 100 samples is
    handed to ``check_partial_fit_equivalence`` once per batch size, first
    for the regressor and then for the classifier.
    """
    batch_sizes = (10, 20, 25, 50, 90)

    # Regression forest.
    X_reg, y_reg = make_regression(random_state=0, n_samples=100)
    regressor = MondrianForestRegressor(random_state=0)
    regressor.partial_fit(X_reg, y_reg)
    for size in batch_sizes:
        check_partial_fit_equivalence(size, regressor, 0, X_reg, y_reg)

    # Classification forest.
    X_clf, y_clf = make_classification(random_state=0, n_samples=100)
    classifier = MondrianForestClassifier(random_state=0)
    classifier.partial_fit(X_clf, y_clf)
    for size in batch_sizes:
        check_partial_fit_equivalence(
            size, classifier, 0, X_clf, y_clf, is_clf=True)
def test_forest_attributes():
    """Check which forests expose ``classes_`` and ``n_classes_``.

    The regressor must expose neither attribute after ``fit`` nor after a
    subsequent ``partial_fit``; a freshly built classifier must expose both
    after ``fit`` and after ``partial_fit``.
    """
    X_toy = [[1, 2, 3], [4, 5, 6]]
    y_toy = [1, 2]
    class_attrs = ("classes_", "n_classes_")

    # One regressor instance: fit first, then partial_fit on top of it.
    regressor = MondrianForestRegressor(n_estimators=5, random_state=0)
    for train in (regressor.fit, regressor.partial_fit):
        train(X_toy, y_toy)
        for attr in class_attrs:
            assert_false(hasattr(regressor, attr))

    # A new classifier instance for each training method.
    for method_name in ("fit", "partial_fit"):
        classifier = MondrianForestClassifier(n_estimators=5, random_state=0)
        getattr(classifier, method_name)(X_toy, y_toy)
        for attr in class_attrs:
            assert_true(hasattr(classifier, attr))
def test_min_samples_split():
    """Check internal node sizes against ``min_samples_split``.

    For each tested ``min_samples_split`` a regressor and a classifier are
    trained with two ``partial_fit`` calls (first half, then second half of
    the data). In every tree, the smallest sample count among nodes that have
    a left child, plus one, must be greater than ``min_samples_split``.
    """
    X_c, y_c = load_digits(return_X_y=True)
    X_r, y_r = make_regression(n_samples=10000, random_state=0)

    # Regressor first, classifier second, for every setting.
    cases = (
        (MondrianForestRegressor, X_r, y_r),
        (MondrianForestClassifier, X_c, y_c),
    )

    for mss in [2, 4, 10, 20]:
        for forest_cls, X_all, y_all in cases:
            half = X_all.shape[0] // 2
            forest = forest_cls(random_state=0, min_samples_split=mss)
            forest.partial_fit(X_all[:half], y_all[:half])
            forest.partial_fit(X_all[half:], y_all[half:])

            for est in forest.estimators_:
                tree = est.tree_
                # children_left == -1 marks nodes without a left child.
                has_left_child = tree.children_left != -1
                smallest = np.min(tree.n_node_samples[has_left_child])
                assert_greater(smallest + 1, mss)
def check_partial_fit_equivalence(size_batch, f, random_state, X, y,
                                  is_clf=False):
    """Assert that batched ``partial_fit`` reproduces an already-trained forest.

    A new forest is trained on ``X``/``y`` in consecutive batches of
    ``size_batch`` samples and every tree is compared with the matching tree
    of ``f``.

    :param size_batch: number of samples per ``partial_fit`` call (the last
        batch may be shorter).
    :param f: reference forest, already trained on the whole of ``X``/``y``.
    :param random_state: seed for the forest trained in batches.
    :param X: training samples; must support ``len`` and slicing.
    :param y: training targets aligned with ``X``.
    :param is_clf: build a ``MondrianForestClassifier`` instead of a
        ``MondrianForestRegressor``.
    """
    # Derived from the data instead of a hard-coded 100, so the helper also
    # works for data sets of any other size.
    n_samples = len(X)
    start_ptr = list(range(0, n_samples, size_batch))
    end_ptr = start_ptr[1:] + [n_samples]

    if is_clf:
        p_f = MondrianForestClassifier(random_state=random_state)
    else:
        p_f = MondrianForestRegressor(random_state=random_state)

    for start, end in zip(start_ptr, end_ptr):
        p_f.partial_fit(X[start:end], y[start:end])

    for est, p_est in zip(f.estimators_, p_f.estimators_):
        ref_tree = est.tree_
        batch_tree = p_est.tree_
        assert_array_equal(batch_tree.n_node_samples, ref_tree.n_node_samples)
        assert_array_equal(batch_tree.threshold, ref_tree.threshold)
        assert_array_equal(batch_tree.feature, ref_tree.feature)
        assert_equal(batch_tree.root, ref_tree.root)
        assert_array_equal(batch_tree.value, ref_tree.value)
        # The root of each tree must account for every training sample.
        assert_equal(ref_tree.n_node_samples[ref_tree.root], n_samples)
        assert_equal(batch_tree.n_node_samples[batch_tree.root], n_samples)
def test_boston():
    """Run ``check_boston`` after ``fit`` and again after ``partial_fit``.

    Uses the module-level ``X`` and ``y``; the same forest instance is
    trained with ``fit`` first and then updated with ``partial_fit``.
    """
    forest = MondrianForestRegressor(n_estimators=5, random_state=0)
    for train in (forest.fit, forest.partial_fit):
        train(X, y)
        check_boston(forest)
def test_mean_std_forest_regressor():
    """Run ``check_mean_std_forest_regressor`` after each training method.

    Uses the module-level ``X`` and ``y``; the same forest instance is
    trained with ``fit`` first and then updated with ``partial_fit``.
    """
    forest = MondrianForestRegressor(random_state=0)
    for train in (forest.fit, forest.partial_fit):
        train(X, y)
        check_mean_std_forest_regressor(forest)
class Client:
    """gRPC client for the streaming competition platform.

    Forecasts the target with a Mondrian forest regressor fed by three lagged
    target values, and uses a robust random cut forest (RRCF) to replace
    anomalous targets before they are learned.
    """

    channel = None
    stub = None

    def __init__(self, batch_size):
        """Train the initial models, wait for the start time and connect.

        The models are first trained from ``initial_training_data.csv``; the
        constructor then blocks until 21:00 local time before it opens the
        gRPC channel.

        :param batch_size: Integer value, defined by the competition and
            available at the competition page.
        """
        # Mondrian forest state.
        self.mfr = MondrianForestRegressor(random_state=1,
                                           n_estimators=100,
                                           bootstrap=True)
        # Explicit dtype avoids the pandas warning for an empty Series.
        self.previous_target_3 = pd.Series(dtype=float)
        self.features_for_rowID = Queue()
        self.previous_train_batch = np.array([-1, -1, -1, -1, -1])

        # RRCF state.
        self.num_trees = 40
        self.tree_size = 256
        self.forest = []
        self.avg_codisp = {}
        self.curr_sum = 0
        self.curr_num = 0
        self.idx = 0

        self._init_modeling()
        self._wait_until_start()

        self.batch_size = batch_size
        self.stop_thread = False
        self.predictions_to_send = Queue()
        self.channel = grpc.insecure_channel(
            'app.streaming-challenge.com:50051')
        self.stub = file_pb2_grpc.DataStreamerStub(self.channel)

        # NOTE(security): credentials are hard-coded in source. They should be
        # loaded from configuration or the environment and the token rotated.
        self.user_email = '*****@*****.**'
        self.competition_code = 'jR'  #oj
        self.token = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VyX2lkIjoieXU5OTA1MjRAZ21haWwuY29tIiwiY29tcGV0aXRpb25faWQiOiIxIn0.B7CAjAsEbTjp4l1K4GR1Y0IJZj6_mKEbKBXsXXJmGBg'

        # First prediction only initiates the communication with the server.
        self.predictions_to_send.put(
            file_pb2.Prediction(rowID=1000, target=333))
        self.metadata = self.create_metadata(user_id=self.user_email,
                                             code=self.competition_code,
                                             token=self.token)

    @staticmethod
    def _wait_until_start(hour=21, poll_seconds=1.0):
        """Block until the local clock reaches ``hour``:00 of the current day.

        Sleeps between polls instead of spinning at full CPU speed.
        """
        import time

        while True:
            print("wait")
            now = datetime.datetime.now()
            starttime = now.replace(hour=hour, minute=0, second=0,
                                    microsecond=0)
            if now >= starttime:
                print(now)
                print("시작!")
                return
            time.sleep(poll_seconds)

    @staticmethod
    def create_metadata(user_id, code, token):
        """Build the gRPC call metadata.

        :param user_id: e-mail string sent as ``user_id``.
        :param code: competition code sent as ``competition_id``.
        :param token: token string sent as ``authorization``.
        :return: list of ``(bytes key, bytes value)`` tuples.
        """
        metadata = [(b'authorization', bytes(token, 'utf-8')),
                    (b'user_id', bytes(user_id, 'utf-8')),
                    (b'competition_id', bytes(code, 'utf-8'))]
        return metadata

    @staticmethod
    def create_forest(num_trees):
        """Return a list of ``num_trees`` empty ``rrcf.RCTree`` objects."""
        return [rrcf.RCTree() for _ in range(num_trees)]

    def partial_train(self, X_test, y_test):
        """Predict ``X_test`` first, then update the forest with the targets.

        :return: ``(y_pred, y_std)`` as returned by
            ``predict(..., return_std=True)`` before the update.
        """
        y_pred, y_std = self.mfr.predict(X_test, return_std=True)
        self.mfr.partial_fit(X_test, y_test)
        return y_pred, y_std

    def _score_point(self, point):
        """Insert ``point`` into every tree and return its z-score.

        The collusive displacement of the point is averaged over all trees,
        stored in ``self.avg_codisp[self.idx]`` and standardised against all
        averages seen so far. ``self.idx`` is advanced by one.
        """
        idx = self.idx
        self.avg_codisp.setdefault(idx, 0)
        for tree in self.forest:
            # Keep each tree at a bounded size by dropping the oldest point.
            if len(tree.leaves) > self.tree_size:
                tree.forget_point(idx - self.tree_size)
            tree.insert_point(point, index=idx)
            # avg_codisp is the mean, over trees, of how anomalous each tree
            # considers this point.
            self.avg_codisp[idx] += tree.codisp(idx) / self.num_trees

        scores = np.array(list(self.avg_codisp.values()))
        mean = scores.mean()
        std = scores.std()
        # With zero spread (e.g. the very first point) nothing is anomalous.
        z = (self.avg_codisp[idx] - mean) / std if std > 0 else 0.0
        self.idx += 1
        return z

    def _init_modeling(self):
        """Train both models from ``initial_training_data.csv``.

        Rows from position 80000 onwards are first passed through the RRCF;
        rows whose z-score magnitude exceeds 3 are overwritten with the mean
        of the five preceding rows. The Mondrian forest is then trained row
        by row, using the targets 7, 6 and 5 rows back as features. Finally
        the lag state used by the streaming loop is initialised.
        """
        network = pd.read_csv('initial_training_data.csv',
                              index_col='date',
                              parse_dates=['date'])
        self.forest = self.create_forest(self.num_trees)

        train_len = len(network)
        train_start = 80000
        if train_len < 8:
            raise ValueError(
                'initial_training_data.csv needs at least 8 rows')

        self.idx = 0
        print("start!")
        for index in range(train_start, train_len):
            # Feed the rows one by one.
            point = float(network[index:index + 1].values.item())
            z = self._score_point(point)
            if z > 3.0 or z < -3.0:
                # Replace the anomalous row with the mean of the previous
                # five rows.
                network.iloc[index] = network[index - 5:index].mean()

        print("init_modeling에서 trainign 시작")
        # No row is modified past this point, so one array can be used.
        targets = network['target'].to_numpy(dtype=float)
        for i in range(7 + train_start, train_len):
            # Features are the targets at i-7, i-6 and i-5, oldest first.
            features = targets[i - 7:i - 4].reshape(1, -1)
            self.mfr.partial_fit(features, targets[i:i + 1])
        print("train 완료")

        # Insertion order (prev3, prev2, prev1) keeps the values oldest first,
        # matching the feature order used for training above.
        self.previous_target_3['prev3'] = float(targets[train_len - 8])
        self.previous_target_3['prev2'] = float(targets[train_len - 7])
        self.previous_target_3['prev1'] = float(targets[train_len - 6])
        self.previous_train_batch = targets[train_len - 5:train_len].copy()
        print('endebded')

    def generate_predictions(self):
        """Yield queued predictions to be sent to the server.

        Stops, and sets ``self.stop_thread``, when no prediction arrives
        within 60 seconds.

        :return: generator of Prediction objects.
        """
        while True:
            try:
                prediction = self.predictions_to_send.get(block=True,
                                                          timeout=60)
                print("Prediction: ", prediction)
                yield prediction
            except queue.Empty:
                self.stop_thread = True
                break

    def anomaly_detection(self, data):
        """Check ``data`` for anomaly with the RRCF.

        :param data: newly observed target value.
        :return: the mean of ``self.previous_train_batch`` when the z-score
            magnitude exceeds 3.0, otherwise ``data`` unchanged.
        """
        z = self._score_point(data)
        if z > 3.0 or z < -3.0:
            return self.previous_train_batch.mean()
        return data

    def loop_messages(self):
        """Consume messages (data instances) from the stream.

        ``TEST`` messages produce a prediction that is queued for sending;
        ``TRAIN`` messages update the anomaly detector, the lag features and
        the regressor. Any exception ends the loop after being printed.
        """
        # The prediction generator is consumed by the server call, which
        # sends the predictions one by one.
        messages = self.stub.sendData(self.generate_predictions(),
                                      metadata=self.metadata)
        test_idx = 0
        # NOTE(review): this is an alias, not a copy, so TEST messages shift
        # the same lag state that TRAIN messages shift - confirm intended.
        test_feature = self.previous_target_3
        try:
            for message in messages:
                message = json.loads(json_format.MessageToJson(message))
                print("message:", message)

                if message['tag'] == 'TEST':
                    print('test')
                    test_feature['prev3'] = test_feature['prev2']
                    test_feature['prev2'] = test_feature['prev1']
                    test_feature['prev1'] = float(
                        self.previous_train_batch[test_idx])
                    pred = self.mfr.predict(
                        test_feature.values.reshape(1, -1))
                    # predict() returns an array; the message takes a scalar.
                    prediction = file_pb2.Prediction(rowID=message['rowID'],
                                                     target=float(pred[0]))
                    self.predictions_to_send.put(prediction)
                    print(test_idx)
                    print('test end')

                if message['tag'] == 'TRAIN':
                    print('train')
                    # Training data used to update the model.
                    target = message['target']
                    target = self.anomaly_detection(target)
                    print(self.previous_target_3)
                    # Train on the values at i-5, i-6 and i-7.
                    if self.previous_target_3['prev3'] < 0:
                        self.previous_target_3['prev3'] = target
                    elif self.previous_target_3['prev2'] < 0:
                        self.previous_target_3['prev2'] = target
                    elif self.previous_target_3['prev1'] < 0:
                        self.previous_target_3['prev1'] = target
                    else:
                        print('else')
                        # Drop the oldest lag and shift the others down.
                        self.previous_target_3[
                            'prev3'] = self.previous_target_3['prev2']  # -7
                        self.previous_target_3[
                            'prev2'] = self.previous_target_3['prev1']  # -6
                        self.previous_target_3['prev1'] = float(
                            self.previous_train_batch[0])  # -5
                        # Partial fit with the three lags as features.
                        self.mfr.partial_fit(
                            self.previous_target_3.values.reshape(1, -1),
                            [target])
                        # Store the current target as the newest batch entry.
                        self.previous_train_batch = np.roll(
                            self.previous_train_batch, -1)
                        self.previous_train_batch[4] = target
                        print('else end')
                    print('train end')

                if self.stop_thread:
                    break
        except Exception as e:
            # Best effort: the worker ends, but the failure must be visible.
            import traceback

            print(str(e))
            traceback.print_exc()

    def run(self):
        """Start the message loop in a separate thread."""
        print("Start")
        t1 = Thread(target=self.loop_messages)
        t1.start()