def OLStraining(train, test, t_index, y_true):
    """Fit an OLS model on the training rows whose timestamp is in t_index,
    then predict on the whole test frame and print the r-score vs y_true.

    Relies on module-level names: `excl` (columns excluded from features),
    `sm` (statsmodels.api), `np`, and `r_score`.
    """
    df_train = train.copy()
    df_test = test.copy()
    # Feature columns = every column not explicitly excluded.
    col = [c for c in df_train.columns if c not in excl]

    X = df_train.loc[df_train.timestamp.isin(t_index), col].copy()
    y = df_train.loc[df_train.timestamp.isin(t_index), "y"].copy()
    print("--- selected X shape %s" % (X.shape,))
    print("--- selected y shape %s" % (y.shape,))

    # Impute with the column mean; all-NaN columns still contain NaNs after
    # that, so fall back to zero (same handling as the test features below —
    # the original left this second step commented out for the train set).
    X = X.fillna(X.mean(axis=0))
    X = X.fillna(0.0)
    # Standardize, then prepend the intercept column.
    X = (X - X.mean(axis=0)) / X.std(axis=0)
    X = sm.add_constant(np.array(X))

    ols_model = sm.OLS(y, X)
    res = ols_model.fit()
    print(res.summary())

    # Same preprocessing for the test features.
    # NOTE(review): this standardizes with the *test* set's own mean/std
    # rather than the training statistics — confirm that is intended.
    Xt = df_test.loc[:, col].copy()
    Xt = Xt.fillna(Xt.mean(axis=0))
    Xt = Xt.fillna(0.0)
    Xt = (Xt - Xt.mean(axis=0)) / Xt.std(axis=0)
    # add_constant was called twice in a row in the original; once suffices.
    Xt = sm.add_constant(np.array(Xt))

    y_pred = res.predict(Xt)
    print(len(y_pred), len(y_true))
    print(r_score(y_true, y_pred))
def testAalysis3(train, test, y_test_true):
    """Per-id evaluation: for each id present in the test frame, fit a model
    on that id's last 30 training timestamps (against the cumulative sum of
    y) and report the mean in-sample r-score over all ids.

    Relies on module-level `fitting` and `r_score`.  Assumes the test frame
    covers a single timestamp (both the `%d` formatting of `uniq_timestamp`
    and the `uniq_timestamp - 30` window depend on that) — TODO confirm.
    """
    df_test = test.copy()
    df_train = train.copy()
    print("test shape %s" % (df_test.shape,))
    print("y_test_true shape %s" % (y_test_true.shape,))

    uniq_timestamp = df_test["timestamp"].unique()
    uniq_id = sorted(df_test["id"].unique())
    # Label fixed: the value printed is the timestamp, not an id
    # (matches the wording used by testAalysis4).
    print("-- selected features timestamp: %d" % uniq_timestamp)

    train_uniq_id = sorted(df_train["id"].unique())
    # ids that appear in the test set but were never seen during training
    new_test_uniq_id = np.array(
        [test_id for test_id in uniq_id if test_id not in train_uniq_id])
    print("test timestamp length %d , test id length %d" %
          (len(uniq_timestamp), len(uniq_id)))
    print("train id length %d" % len(train_uniq_id))
    print("length of new test unique id %d" % new_test_uniq_id.shape)

    Xtrain = np.array(df_train)
    print(Xtrain.shape)

    scores = {}
    for cnt, idx in enumerate(uniq_id):
        # Rows for this id, restricted to the last 30 timestamps.
        mask = Xtrain[:, 0] == idx
        select_one_id = Xtrain[mask, :].copy()
        mask = select_one_id[:, 1] > (uniq_timestamp - 30)
        select_one_id = select_one_id[mask, :]

        # Columns 2..109 are features, column 110 is the target y
        # (assumed layout: [id, timestamp, features..., y]) — TODO confirm.
        data = select_one_id[:, 2:110]
        y = select_one_id[:, 110].ravel()
        y = np.cumsum(y)  # fit the cumulative series rather than raw y
        if cnt % 100 == 0:
            print("-- trainig counter:%d test_id:%d" % (cnt, idx))
        y_pred, final_cost = fitting(data, y)
        if not np.isnan(final_cost):
            scores[idx] = r_score(y, y_pred)
        else:
            # Diverged fit: score it as zero instead of propagating NaN.
            scores[idx] = 0.0
    # cnt is zero-based, so the total processed is cnt + 1
    # (consistent with testAalysis4).
    print("-- total counter %d" % (cnt + 1))
    r_score_value = list(scores.values())
    print(np.mean(r_score_value))
def testAalysis4(train, test, y_test_true):
    """Like testAalysis3, but additionally produces an out-of-sample
    prediction for every test id, returned as a dict {id: predicted y}.

    Relies on module-level `fitting`, `dataPCA`, and `r_score`.  Assumes the
    test frame covers a single timestamp (the `%d` formatting and the
    `uniq_timestamp - 30` window depend on that) — TODO confirm.
    """
    df_test = test.copy()
    df_train = train.copy()
    print("test shape %s" % (df_test.shape,))
    print("y_test_true shape %s" % (y_test_true.shape,))

    uniq_timestamp = df_test["timestamp"].unique()
    uniq_id = sorted(df_test["id"].unique())
    print("-- selected features timestamp: %d" % uniq_timestamp)

    train_uniq_id = sorted(df_train["id"].unique())
    # ids that appear in the test set but were never seen during training
    new_test_uniq_id = np.array(
        [test_id for test_id in uniq_id if test_id not in train_uniq_id])
    print("test id length %d" % (len(uniq_id)))
    print("train id length %d" % len(train_uniq_id))
    print("length of new test unique id %d" % new_test_uniq_id.shape)

    Xtrain = np.array(df_train)
    Xtest = np.array(df_test)
    print("* training data shape %s" % (Xtrain.shape,))
    print("* test data shape %s" % (Xtest.shape,))

    scores = {}
    y_test_pred_dict = {}
    for cnt, idx in enumerate(uniq_id):
        # Rows for this id, restricted to the last 30 timestamps.
        mask = Xtrain[:, 0] == idx
        select_one_id = Xtrain[mask, :].copy()
        mask = select_one_id[:, 1] > (uniq_timestamp - 30)
        select_one_id = select_one_id[mask, :]

        # Columns 2..109 are features, column 110 is the target y
        # (assumed layout: [id, timestamp, features..., y]) — TODO confirm.
        data = select_one_id[:, 2:110]
        y = select_one_id[:, 110].ravel()
        y = np.cumsum(y)  # fit the cumulative series rather than raw y
        if cnt % 100 == 0:
            print("-- trainig counter:%d test_id:%d" % (cnt, idx))

        # Model fitted on PCA-reduced data (5 dimensions, per original note).
        y_pred, theta, final_cost = fitting(data, y)
        if not np.isnan(final_cost):
            scores[idx] = r_score(y, y_pred)
            # Stack this id's training features (minus the first row) with
            # its test-frame features, project through the same PCA pipeline,
            # and take the last fitted value as the test prediction.
            mask = Xtest[:, 0] == idx
            select_one_test_id = Xtest[mask, :].copy()
            test_data = select_one_test_id[:, 2:]
            data_stacked = np.vstack((data[1:, :], test_data))
            Xtt = dataPCA(data_stacked)
            y_test_diff = np.dot(Xtt, theta)
            y_test_prediction = y_test_diff[-1]
            y_test_pred_dict[idx] = y_test_prediction
        else:
            # Diverged fit: score and predict zero instead of NaN.
            scores[idx] = 0.0
            y_test_pred_dict[idx] = 0.0

    print("-- total counter %d" % (cnt + 1))
    r_score_value = list(scores.values())
    print(np.mean(r_score_value))
    return y_test_pred_dict
features = observation.features.copy() # y_hat = gmodel_test.predict2(observation.features.copy()) target = observation.target target['y'] = y_hat timestamp = observation.features["timestamp"][0] if timestamp % 100 == 0: print("Timestamp #{}".format(timestamp)) y_true = env.temp_test_y #y_true = np.exp(y_true) score_ = r_score(y_true, y_hat) rewards.append(score_) print("-- score %.5f" % np.mean(rewards)) print("-- reward %.5f" % reward) # We perform a "step" by making our prediction and getting back an updated "observation": observation, reward, done, info = env.step(target) if done: print("Public score: {}".format(info["public_score"])) break
def proc1(log):
    """Fit glm and ElasticNetCV models on the initial kagglegym training
    frame, run one prediction, and return 1.

    Relies on module-level `make`, `glmModel`, `fitModel`, `ElasticNetCV`,
    `np`, and `r_score`.

    NOTE(review): everything after `return 1` below — the triple-quoted
    block and the whole while-loop — is unreachable.  It is kept verbatim
    because the early return looks like a deliberate temporary disable.
    """
    env = make()
    observation_test = env.reset()
    emcv = ElasticNetCV()
    #columns = ['technical_30', 'technical_20', 'fundamental_11', 'technical_19']
    columns = ['technical_30', 'technical_20', 'fundamental_11']
    train_data = observation_test.train.copy()
    gmodel_test = glmModel(train_data, columns)
    y_hat = gmodel_test.BuildModel()
    model_test = fitModel(emcv, train_data, columns)
    prediction_test = model_test.predict(observation_test.features.copy())
    # Same output as the original Py2 `print "...", len(...)` statement.
    print("No elasticnet observation : %d" % len(prediction_test))
    #score_ = r_score(y_true, y_hat)
    #print score_
    return 1

    # ---- unreachable from here on (short-circuited by `return 1` above) ----
    """
    train_data = observation_test.train.copy()
    features_data = observation_test.features.copy()
    feat_colNames = features_data.columns.values.tolist()[2:]
    #train_data = observation_test.features.copy
    kaggleAnalysis = KaggleDataAnalysisClass(train_data,True)
    kaggleAnalysis.corrCheck(feat_colNames)
    #emcv = ElasticNetCV(fit_intercept = True)
    kaggleAnalysis.modelfit(emcv)
    """
    while True:
        prediction_test = model_test.predict(observation_test.features.copy())
        target_test = observation_test.target
        target_test['y'] = prediction_test
        """
        features_data = observation_test.features.copy()
        prediction_test = kaggleAnalysis.predict(features_data)
        target_test = observation_test.target
        target_test['y'] = prediction_test
        timestamp_ = observation_test.features["timestamp"][0]
        log.info("timestamp : %d " % timestamp_)
        """
        timestamp_ = observation_test.features["timestamp"][0]
        # NOTE(review): rewards is re-created every iteration, so the
        # np.mean below only ever reflects the current score.
        rewards = []
        if timestamp_ % 100 == 0:
            print(timestamp_)
            y_true = env.temp_test_y
            score_ = r_score(y_true, prediction_test)
            rewards.append(score_)
            log.info("score %.5f" % np.mean(rewards))
        observation_test, reward_test, done_test, info_test = env.step(
            target_test)
        #log.info("reward_test : %.5f " % reward_test)
        if done_test:
            print('Info-test:', info_test['public_score'])
            break