def test_pipe_transformation():
    # SegmentX transform pipe
    pipe = Pype([('seg', SegmentX()),
                 ('ftr', FeatureRep()),
                 ('scaler', StandardScaler())])
    Xt = [np.random.rand(1000, 10), np.random.rand(100, 10), np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [1, 2, 3]
    transformation_test(pipe, X, y)

    X = pd.DataFrame(Xc)
    X['ts_data'] = Xt
    X = TS_Data.from_df(X)
    transformation_test(pipe, X, y)

    # SegmentXY transform pipe
    pipe = Pype([('seg', SegmentXY()),
                 ('ftr', FeatureRep()),
                 ('scaler', StandardScaler())])
    Xt = [np.random.rand(1000, 10), np.random.rand(100, 10), np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [np.random.rand(1000), np.random.rand(100), np.random.rand(500)]
    transformation_test(pipe, X, y)

    X = pd.DataFrame(Xc)
    X['ts_data'] = Xt
    X = TS_Data.from_df(X)
    transformation_test(pipe, X, y)

    # Forecast transform pipe
    pipe = Pype([('seg', SegmentXYForecast()),
                 ('ftr', FeatureRep()),
                 ('scaler', StandardScaler())])
    Xt = [np.random.rand(1000, 10), np.random.rand(100, 10), np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [np.random.rand(1000), np.random.rand(100), np.random.rand(500)]
    transformation_test(pipe, X, y)

    X = pd.DataFrame(Xc)
    X['ts_data'] = Xt
    X = TS_Data.from_df(X)
    transformation_test(pipe, X, y)

    # PadTrunc transform pipe
    pipe = Pype([('trunc', PadTrunc()),
                 ('ftr', FeatureRep()),
                 ('scaler', StandardScaler())])
    Xt = [np.random.rand(1000, 10), np.random.rand(100, 10), np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [1, 2, 3]
    transformation_test(pipe, X, y)

    X = pd.DataFrame(Xc)
    X['ts_data'] = Xt
    X = TS_Data.from_df(X)
    transformation_test(pipe, X, y)
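# `transformation_test` is not defined in this snippet; a plausible sketch of
# such a helper, inferred from how it is called above (hypothetical):
def transformation_test(pipe, X, y):
    # fit, transform, and fit_transform should all run, and the two
    # transform paths should agree on their output
    pipe.fit(X, y)
    Xt1, yt1 = pipe.transform(X, y)
    Xt2, yt2 = pipe.fit_transform(X, y)
    assert np.allclose(Xt1, Xt2)
    assert np.allclose(yt1, yt2)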
def test_pipe_classification():
    # no context data, single time series
    X = [np.random.rand(1000, 10)]
    y = [5]
    pipe = Pype([('seg', SegmentX()),
                 ('ftr', FeatureRep()),
                 ('rf', RandomForestClassifier(n_estimators=10))])
    classifier_test(pipe, X, y)

    # context data, single time series
    Xt = [np.random.rand(1000, 10)]
    Xc = [np.random.rand(3)]
    X = TS_Data(Xt, Xc)
    y = [5]
    classifier_test(pipe, X, y)

    # multiple time series
    Xt = [np.random.rand(1000, 10), np.random.rand(100, 10), np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [1, 2, 3]
    classifier_test(pipe, X, y)

    # univariate data
    Xt = [np.random.rand(1000), np.random.rand(100), np.random.rand(500)]
    Xc = np.random.rand(3)
    X = TS_Data(Xt, Xc)
    y = [1, 2, 3]
    classifier_test(pipe, X, y)
    return model

# load the data
data = load_watch()
X = data['X']
y = data['y']

# temporal splitting of data
splitter = TemporalKFold(n_splits=3)
Xs, ys, cv = splitter.split(X, y)

# create a segment learning pipeline
width = 100
pipe = Pype([('seg', SegmentX(order='C')),
             ('crnn', KerasClassifier(build_fn=crnn_model, epochs=1,
                                      batch_size=256, verbose=0))])

# create a parameter dictionary using the sklearn API
#
# you can also set a parameter to always be equal to another parameter, by
# setting its value to the name of the parameter to track (this is an
# extension to the sklearn API)
#
# note that if you want to set a parameter to a single value, it still needs
# to be given as a list
par_grid = {'seg__width': [50, 100, 200],
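# (the grid is truncated above; for illustration, a complete grid of the kind
# the comments describe might look like this -- hypothetical values, with
# 'crnn__width' tracking 'seg__width' via the parameter-name extension, and
# the single overlap value still given as a list)
example_grid = {'seg__width': [50, 100, 200],
                'seg__overlap': [0.5],
                'crnn__width': ['seg__width']}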
import sys

import numpy as np

from seglearn.feature_functions import maximum, minimum
from seglearn.transform import SegmentX
from sklearn.preprocessing import normalize, MinMaxScaler

batchSize = 2048
FILE = '../data/gesture/' + sys.argv[1] + '.txt'

with open(FILE, 'r') as file:
    y = np.loadtxt(file, delimiter=',')
print(y.shape)

y = np.reshape(y, (-1, batchSize))
y = np.reshape(y, (1, -1))
print(y.shape)

# cut the signal into non-overlapping segments of one batch each
segment = SegmentX(width=batchSize, step=batchSize, shuffle=False,
                   random_state=None, order='F')
y = segment.transform(y)[0]
print(y.shape)

maxy = maximum(y)
miny = minimum(y)
print(maxy.shape)
print(miny.shape)

# y = np.transpose(y)
# scaler = MinMaxScaler()
# scaler.fit(y)
# y = scaler.transform(y)
# y = np.transpose(y)
print(y)
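# the same max/min features could also be computed with seglearn's FeatureRep
# and a custom feature dictionary -- a minimal sketch, assuming the segmented
# array `y` from above:
from seglearn.transform import FeatureRep

ftr = FeatureRep(features={'max': maximum, 'min': minimum})
feats = ftr.fit_transform(y)  # one row of [max, min] per segment
print(feats.shape)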
    return model

# load the data
data = load_watch()
X = data['X']
y = data['y']

# temporal splitting of data
splitter = TemporalKFold(n_splits=3)
Xs, ys, cv = splitter.split(X, y)

# create a segment learning pipeline
width = 100
pipe = Pype([('seg', SegmentX()),
             ('crnn', KerasClassifier(build_fn=crnn_model, epochs=1,
                                      batch_size=256, verbose=0))])

# create a parameter dictionary using the sklearn API
#
# you can also set a parameter to always be equal to another parameter, by
# setting its value to the name of the parameter to track (this is an
# extension to the sklearn API)
#
# note that if you want to set a parameter to a single value, it still needs
# to be given as a list
par_grid = {'seg__width': [50, 100, 200],
from seglearn.base import TS_Data
from seglearn.datasets import load_watch
from seglearn.pipe import Pype
from seglearn.transform import FeatureRep, SegmentX

# seed RNGESUS
np.random.seed(123124)

# load the data
data = load_watch()
X = data['X']
y = data['y']

# create a feature representation pipeline
clf = Pype([('segment', SegmentX()),
            ('features', FeatureRep()),
            ('scaler', StandardScaler()),
            ('rf', RandomForestClassifier(n_estimators=20))])

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)

print("N series in train: ", len(X_train))
print("N series in test: ", len(X_test))
print("N segments in train: ", clf.N_train)
print("N segments in test: ", clf.N_test)
print("Accuracy score: ", score)
# ``SegPipe`` can be initialized with a scorer callable made with
# sklearn.metrics.make_scorer; this can be used to cross-validate or grid
# search with any single score
scorer = make_scorer(f1_score, average='macro')
pipe = SegPipe(est, scorer=scorer)

cv_scores = cross_validate(pipe, X, y, cv=4, return_train_score=True)
print("CV F1 Scores: ", pd.DataFrame(cv_scores))

##################################################
# SCORING WORKAROUND 2: WORK OUTSIDE THE PIPELINE
##################################################
# If you want to have multiple scores computed, the only way is as follows.
#
# First transform the time series data into segments, and then score the
# ``est`` part of the pipeline.
#
# The disadvantage of this approach is that the parameters of the ``seg``
# part of the pipeline cannot be optimized.

segmenter = SegmentX()
X_seg, y_seg, _ = segmenter.fit_transform(X, y)

scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
cv_scores = cross_validate(est, X_seg, y_seg, cv=4, return_train_score=False,
                           scoring=scoring)
print("CV Scores (workaround): ", pd.DataFrame(cv_scores))
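# the scorer can also drive a grid search, as noted above -- a sketch,
# assuming SegPipe exposes nested parameters in the usual sklearn way (the
# 'segmenter__width' parameter name here is an assumption, not a documented
# API):
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(pipe, param_grid={'segmenter__width': [50, 100]}, cv=4)
grid.fit(X, y)
print("best params: ", grid.best_params_)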
train_df = pd.concat(train_df_list)
train_df['time'] = pd.to_datetime(train_df['time'], format='%m%d %H:%M:%S')
all_df = pd.concat([train_df])

X = []
y = []
id_list = []
# column names are Chinese: '渔船ID' = vessel ID, '速度' = speed, '方向' = heading
for ship_id, group in all_df.groupby('渔船ID'):
    X.append(group[['lat', 'lon', '速度', '方向', 'time']])
    y.append(group['type'].values[0])
    id_list.append(ship_id)
print(len(id_list))

pype = Pype([('segment', SegmentX(width=72, overlap=0.1))])
pype = pype.fit(X, y)

shape_list = []
df_list = []
for ship_id, group in all_df.groupby('渔船ID'):
    sample = group[['lat', 'lon', '速度', '方向', 'time']].values
    transform_result = pype.transform([sample])[0]
    if transform_result.shape[0] == 0:
        # series shorter than the segment width: keep it whole
        seg_df = pd.DataFrame(sample, columns=['lat', 'lon', '速度', '方向', 'time'])
        seg_df['渔船ID'] = len(df_list)
        seg_df['type'] = group['type'].values[0]
        df_list.append(seg_df)
def test_pipe_classification():
    # no context data, single time series
    X = [np.random.rand(1000, 10)]
    y = [5]
    est = Pipeline([('ftr', FeatureRep()),
                    ('rf', RandomForestClassifier())])
    pipe = SegPipe(est, segmenter=SegmentX())
    pipe.fit(X, y)
    pipe.predict(X, y)
    pipe.score(X, y)

    # context data, single time series
    Xt = [np.random.rand(1000, 10)]
    Xc = [np.random.rand(3)]
    X = make_ts_data(Xt, Xc)
    y = [5]
    pipe.fit(X, y)
    pipe.predict(X, y)
    pipe.score(X, y)

    # multiple time series
    Xt = [np.random.rand(1000, 10), np.random.rand(100, 10), np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = make_ts_data(Xt, Xc)
    y = [1, 2, 3]
    pipe.fit(X, y)
    pipe.predict(X, y)
    pipe.score(X, y)

    # univariate data
    Xt = [np.random.rand(1000), np.random.rand(100), np.random.rand(500)]
    Xc = np.random.rand(3)
    X = make_ts_data(Xt, Xc)
    y = [1, 2, 3]
    pipe.fit(X, y)
    pipe.predict(X, y)
    pipe.score(X, y)

    # transform pipe
    est = Pipeline([('ftr', FeatureRep()),
                    ('scaler', StandardScaler())])
    pipe = SegPipe(est, segmenter=SegmentX())
    Xt = [np.random.rand(1000, 10), np.random.rand(100, 10), np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = make_ts_data(Xt, Xc)
    y = [1, 2, 3]
    pipe.fit(X, y)
    pipe.transform(X, y)
    pipe.fit_transform(X, y)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

##############################################
# SETUP
##############################################

# load the data
data = load_watch()
X = data['X']
y = data['y']

# create a feature representation pipeline
steps = [('seg', SegmentX()),
         ('features', FeatureRep()),
         ('scaler', StandardScaler()),
         ('rf', RandomForestClassifier(n_estimators=20))]
pipe = Pype(steps)

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=42)

##############################################
# OPTION 1: Use the Pype score method
##############################################

pipe.fit(X_train, y_train)
score = pipe.score(X_test, y_test)
def test_pipe_classification():
    # no context data, single time series
    X = [np.random.rand(1000, 10)]
    y = [5]
    pipe = Pype([('seg', SegmentX()),
                 ('ftr', FeatureRep()),
                 ('rf', RandomForestClassifier(n_estimators=10))])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.transform_predict(X, y)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)

    # context data, single time series
    Xt = [np.random.rand(1000, 10)]
    Xc = [np.random.rand(3)]
    X = TS_Data(Xt, Xc)
    y = [5]
    pipe.fit(X, y)
    pipe.transform_predict(X, y)
    pipe.predict(X)
    pipe.score(X, y)

    # multiple time series
    Xt = [np.random.rand(1000, 10), np.random.rand(100, 10), np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [1, 2, 3]
    pipe.fit(X, y)
    pipe.transform_predict(X, y)
    pipe.predict(X)
    pipe.score(X, y)

    # univariate data
    Xt = [np.random.rand(1000), np.random.rand(100), np.random.rand(500)]
    Xc = np.random.rand(3)
    X = TS_Data(Xt, Xc)
    y = [1, 2, 3]
    pipe.fit(X, y)
    pipe.transform_predict(X, y)
    pipe.predict(X)
    pipe.score(X, y)

    # transform pipe
    pipe = Pype([('seg', SegmentX()),
                 ('ftr', FeatureRep()),
                 ('scaler', StandardScaler())])
    Xt = [np.random.rand(1000, 10), np.random.rand(100, 10), np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [1, 2, 3]
    pipe.fit(X, y)
    pipe.transform(X, y)
    pipe.fit_transform(X, y)
# data = sio.loadmat('./data/shiftcorr%d' % idx)           # poor results
# data = sio.loadmat('./data/shiftlinear%d' % idx)         # poor results
# data = sio.loadmat('./data/singledimshiftfreq%d' % idx)  # poor results
# data = sio.loadmat('./data/agotsshiftmean%d' % idx)
# data = sio.loadmat('./data/agotsshiftvar%d' % idx)
data = sio.loadmat('./data/extreme%d' % idx)  # good results

ts = data['ts']  # .T  # transpose is needed for shiftfreq
print(ts.shape)
bkps = data['bkps'][0]

scaler = StandardScaler()
ts = scaler.fit_transform(ts)

# segment the series into overlapping windows
width = 10
step = 5
ts = [ts]
segment = SegmentX(width=width, step=step)
x = segment.fit_transform(ts, None)[0]
x = x.reshape([x.shape[0], -1])
x = torch.from_numpy(x).float()

bkss = bkps // 5  # bkss: breakpoints converted to segment (sample) indices

# score each segment by its autoencoder reconstruction error
model = AutoEncoder(input_dim=10, latent_dim=1, output_dim=10)
_, pred = model.fit_predict(x)
err = (pred - x).detach().numpy()
err = np.max(np.power(err, 2), axis=1)

rpt.display(err, true_chg_pts=bkss)
rpt.display(ts[0], true_chg_pts=bkps)
plt.show()
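# one simple way to turn the per-segment reconstruction error into discrete
# change point candidates -- a minimal sketch with an assumed fixed threshold
# (ruptures could also be run on `err` directly):
threshold = err.mean() + 3 * err.std()
candidates = np.where(err > threshold)[0]
print("segments above threshold:", candidates)
print("approximate sample positions:", candidates * step)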
for ship_id, group in all_df.groupby('渔船ID'):
    X.append(group[['lat', 'lon', '方向', '速度']].values)
    y.append(group['type'].values[0])
    id_list.append(int(ship_id))

le = preprocessing.LabelEncoder()
y_train = le.fit_transform(y[:len(train_df_list)])
X_train = X[:len(train_df_list)]
X_test = X[len(train_df_list):]

kf = KFold(n_splits=5, random_state=42, shuffle=True)
model_v1_list = []
score_v1_list = []
for train_index, test_index in kf.split(X_train):
    model_v1 = Pype([('segment', SegmentX(width=10)),
                     ('features', FeatureRep()),
                     ('scaler', StandardScaler()),
                     ('rf', RandomForestClassifier(n_estimators=100,
                                                   random_state=42))])
    model_v1.fit(np.array(X_train)[train_index], y_train[train_index])
    model_v1_list.append(model_v1)

    y_pred = []
    for test_sample in np.array(X_train)[test_index]:
        # average the per-segment probabilities into one prediction per series
        result = model_v1.predict_proba([test_sample])
        pred = np.argmax(np.sum(result, axis=0) / result.shape[0])
        y_pred.append(pred)
    score_v1_list.append(f1_score(y_train[test_index], y_pred, average='macro'))
    np.array(pd.read_csv('LF_BC_9600.csv', sep=",", header=None)),
    np.array(pd.read_csv('RF_BC_9600.csv', sep=",", header=None)),
    np.array(pd.read_csv('LB_BP_9600.csv', sep=",", header=None)),
    np.array(pd.read_csv('RB_BP_9600.csv', sep=",", header=None)),
    np.array(pd.read_csv('LF_BP_9600.csv', sep=",", header=None)),
    np.array(pd.read_csv('RF_BP_9600.csv', sep=",", header=None))
])

# create the label vector and the corresponding semantic vector
y = np.array([0, 1, 2, 3, 4, 5, 6, 7])
labels = ['LB_BC', 'RB_BC', 'LF_BC', 'RF_BC', 'LB_BP', 'RB_BP', 'LF_BP', 'RF_BP']

# segment the data and labels
segmenter = SegmentX(100, 0.5)
X_new, y_new, _ = segmenter.fit_transform(X, y)

###################################################################################################

# create a pipeline for LDA transformation of the feature representation
est = Pipeline([('features', FeatureRep()),
                ('lda', LinearDiscriminantAnalysis(n_components=2))])
pipe = SegPipe(est)

# plot embedding
X2, y2 = pipe.fit_transform(X_new, y_new)
plot_embedding(X2, y2.astype(int), labels)
plt.show()

###################################################################################################
    model.add(LSTM(units=lstm_units, dropout=0.1, recurrent_dropout=0.1))
    model.add(Dense(n_classes, activation="softmax"))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model

# load the data
data = load_watch()
X = data['X']
y = data['y']

# create a segment learning pipeline
pipe = Pype([('seg', SegmentX(width=100, step=100, order='C')),
             ('crnn', KerasClassifier(build_fn=crnn_model, epochs=1,
                                      batch_size=256, verbose=0))])

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=42)

pipe.fit(X_train, y_train)
score = pipe.score(X_test, y_test)

print("N series in train: ", len(X_train))
print("N series in test: ", len(X_test))
print("N segments in train: ", pipe.N_train)
print("N segments in test: ", pipe.N_test)
print("Accuracy score: ", score)
def generate_result():
    train_path = '/tcdata/hy_round2_train_20200225'
    test_path = '/tcdata/hy_round2_testB_20200312'

    train_df_list = []
    for file_name in os.listdir(train_path):
        if file_name.endswith('.csv'):
            df = pd.read_csv(os.path.join(train_path, file_name))
            train_df_list.append(df)
    test_df_list = []
    for file_name in os.listdir(test_path):
        if file_name.endswith('.csv'):
            df = pd.read_csv(os.path.join(test_path, file_name))
            test_df_list.append(df)

    train_df = pd.concat(train_df_list)
    test_df = pd.concat(test_df_list)
    train_df['time'] = pd.to_datetime(train_df['time'], format='%m%d %H:%M:%S')
    test_df['time'] = pd.to_datetime(test_df['time'], format='%m%d %H:%M:%S')
    all_df = pd.concat([train_df, test_df])

    X = []
    y = []
    id_list = []
    # column names are Chinese: '渔船ID' = vessel ID, '速度' = speed, '方向' = heading
    for ship_id, group in all_df.groupby('渔船ID'):
        X.append(group[['lat', 'lon', '速度', '方向', 'time']])
        y.append(group['type'].values[0])
        id_list.append(ship_id)
    print(len(id_list))

    pype = Pype([('segment', SegmentX(width=72, overlap=0.0))])
    pype = pype.fit(X, y)

    shape_list = []
    df_list = []
    for ship_id, group in all_df.groupby('渔船ID'):
        sample = group[['lat', 'lon', '速度', '方向', 'time']].values
        transform_result = pype.transform([sample])[0]
        if transform_result.shape[0] == 0:
            # series shorter than the segment width: keep it whole
            seg_df = pd.DataFrame(sample, columns=['lat', 'lon', '速度', '方向', 'time'])
            seg_df['渔船ID'] = len(df_list)
            seg_df['type'] = group['type'].values[0]
            df_list.append(seg_df)
            shape_list.append(1)
        else:
            for seg in transform_result:
                seg_df = pd.DataFrame(seg, columns=['lat', 'lon', '速度', '方向', 'time'])
                seg_df['渔船ID'] = len(df_list)
                seg_df['type'] = group['type'].values[0]
                df_list.append(seg_df)
            shape_list.append(transform_result.shape[0])

    new_all_df = pd.concat(df_list, sort=False)
    new_all_df.to_csv('help.csv', index=False)
    new_all_df = pd.read_csv('help.csv')

    df = new_all_df.drop(columns=['type'])
    extracted_df = extract_features(df, column_id='渔船ID', column_sort='time',
                                    n_jobs=8,
                                    kind_to_fc_parameters=fc_parameters_v2)

    new_df = new_all_df.groupby('渔船ID').agg(x_min=('lat', 'min'),
                                              x_max=('lat', 'max'),
                                              y_min=('lon', 'min'),
                                              y_max=('lon', 'max'))
    extracted_df['x_max-x_min'] = new_df['x_max'] - new_df['x_min']
    extracted_df['y_max-y_min'] = new_df['y_max'] - new_df['y_min']
    extracted_df['x_max-y_min'] = new_df['x_max'] - new_df['y_min']
    extracted_df['y_max-x_min'] = new_df['y_max'] - new_df['x_min']
    extracted_df['slope'] = extracted_df['y_max-y_min'] / np.where(
        extracted_df['x_max-x_min'] == 0, 0.001, extracted_df['x_max-x_min'])
    extracted_df['area'] = extracted_df['x_max-x_min'] * extracted_df['y_max-y_min']

    def get_feature(arr):
        feature = [np.max(arr), np.quantile(arr, 0.9), np.quantile(arr, 0.1),
                   np.quantile(arr, 0.75), np.quantile(arr, 0.25),
                   np.mean(arr), np.std(arr), np.median(arr),
                   np.std(arr) / np.mean(arr)]
        # lag-1 autocorrelation, skewness, and kurtosis
        feature.append(np.corrcoef(np.array([arr[:-1], arr[1:]]))[0, 1])
        feature.append(skew(arr))
        feature.append(kurtosis(arr))
        return feature

    features = []
    for _, group in new_all_df.groupby('渔船ID'):
        group = group.sort_values(by=['time'])
        lat = group['lat'].values
        lon = group['lon'].values
        time_ = pd.to_datetime(group['time'], format='%Y-%m-%d %H:%M:%S').values
        dire = group['方向'].values

        # speed between consecutive fixes, in km/h
        speed_list = []
        for i in range(lat.shape[0]):
            if i == 0:
                continue
            hour = (time_[i] - time_[i - 1]) / np.timedelta64(1, 'h')
            dist = geodesic((lat[i - 1], lon[i - 1]), (lat[i], lon[i]))
            speed_list.append(dist.km / hour)

        # circular statistics of the heading: mean resultant length r,
        # circular mean theta, and circular standard deviation sqrt(-2 ln r)
        c = np.sum(np.cos(dire / 180 * np.pi)) / group.shape[0]
        s = np.sum(np.sin(dire / 180 * np.pi)) / group.shape[0]
        r = np.sqrt(c ** 2 + s ** 2)
        theta = np.arctan(s / c)
        angle_feature = [r, theta, np.sqrt(-2 * np.log(r))]

        # same circular statistics for the turning between consecutive fixes
        turn_list = []
        for i in range(dire.shape[0]):
            if i == 0:
                continue
            turn = 1 - np.cos(dire[i - 1] / 180 * np.pi - dire[i] / 180 * np.pi)
            turn_list.append(turn * np.pi)
        turn_list = np.array(turn_list)
        c = np.sum(np.cos(turn_list)) / (group.shape[0] - 1)
        s = np.sum(np.sin(turn_list)) / (group.shape[0] - 1)
        r = np.sqrt(c ** 2 + s ** 2)
        theta = np.arctan(s / c)
        turn_feature = [r, theta, np.sqrt(-2 * np.log(r))]

        # only the resultant length r of each circular feature set is kept
        features.append(np.concatenate([get_feature(speed_list),
                                        angle_feature[:1], turn_feature[:1]]))

    extracted_df_ = pd.concat([pd.DataFrame(np.array(features)), extracted_df],
                              axis=1)

    y = []
    for _, group in new_all_df.groupby('渔船ID'):
        y.append(group.iloc[0]['type'])

    train_df = extracted_df_.iloc[:np.sum(shape_list[:len(train_df_list)])]
    test_df = extracted_df_.iloc[np.sum(shape_list[:len(train_df_list)]):]
    y_train = y[:train_df.shape[0]]

    le = preprocessing.LabelEncoder()
    y_train = le.fit_transform(y_train)
    train_df['type'] = le.inverse_transform(y_train)
    train_df.to_csv('./train.csv')
    test_df.to_csv('./test.csv')

    train_df = pd.read_csv('./train.csv', index_col=0)
    X_train = train_df.drop(columns=['type']).values
    y_train = train_df['type'].values
    test_df = pd.read_csv('./test.csv', index_col=0)
    X_test = test_df.values

    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    X_train = imputer.fit_transform(
        pd.DataFrame(X_train).replace([np.inf, -np.inf], np.nan).values)
    X_test = imputer.fit_transform(
        pd.DataFrame(X_test).replace([np.inf, -np.inf], np.nan).values)

    le = preprocessing.LabelEncoder()
    y_train = le.fit_transform(y_train)

    def get_model():
        exported_pipeline = make_pipeline(
            SelectPercentile(score_func=f_classif, percentile=48),
            StackingEstimator(estimator=SGDClassifier(
                alpha=0.01, eta0=0.01, fit_intercept=False, l1_ratio=0.25,
                learning_rate="invscaling", loss="modified_huber",
                penalty="elasticnet", power_t=10.0)),
            ExtraTreesClassifier(bootstrap=False, criterion="entropy",
                                 max_features=0.6, min_samples_leaf=1,
                                 min_samples_split=3, n_estimators=100))
        set_param_recursive(exported_pipeline.steps, 'random_state', 42)
        return exported_pipeline

    def get_model_v2():
        exported_pipeline = make_pipeline(
            make_union(
                make_pipeline(
                    make_union(FunctionTransformer(copy),
                               FunctionTransformer(copy)),
                    SelectPercentile(score_func=f_classif, percentile=18)),
                FunctionTransformer(copy)),
            StackingEstimator(estimator=SGDClassifier(
                alpha=0.01, eta0=0.1, fit_intercept=False, l1_ratio=1.0,
                learning_rate="constant", loss="hinge", penalty="elasticnet",
                power_t=0.1)),
            VarianceThreshold(threshold=0.05),
            ExtraTreesClassifier(bootstrap=False, criterion="entropy",
                                 max_features=0.55, min_samples_leaf=1,
                                 min_samples_split=4, n_estimators=100))
        set_param_recursive(exported_pipeline.steps, 'random_state', 42)
        return exported_pipeline

    def get_data(shape_idx):
        start_idx = int(np.sum(shape_list[:shape_idx]))
        end_idx = start_idx + shape_list[shape_idx]
        if shape_idx < len(train_df_list):
            return X_train[start_idx:end_idx], y_train[start_idx:end_idx]
        else:
            # X_test starts after all training segments, so shift the offset
            test_start = start_idx - int(np.sum(shape_list[:len(train_df_list)]))
            return X_test[test_start:test_start + shape_list[shape_idx]], None

    kf = KFold(n_splits=5, random_state=2019, shuffle=True)
    model_v1_list = []
    score_v1_list = []
    for train_index, test_index in kf.split(shape_list[:len(train_df_list)]):
        train_data = []
        y_data = []
        for idx in train_index:
            data = get_data(idx)
            train_data.append(data[0])
            y_data.append(data[1])
        train_data = np.concatenate(train_data, axis=0)
        y_data = np.concatenate(y_data, axis=0)

        model_v1 = get_model()
        model_v1.fit(train_data, y_data)
        model_v1_list.append(model_v1)

        y_true = []
        y_pred = []
        for idx in test_index:
            data = get_data(idx)
            proba = model_v1.predict_proba(data[0])
            # average segment probabilities to get one prediction per vessel
            pred = np.argmax(np.sum(proba, axis=0) / proba.shape[0])
            y_pred.append(pred)
            y_true.append(data[1][0])
        score = f1_score(y_pred, y_true, average='macro')
        score_v1_list.append(score)
    print(score_v1_list)
    print(np.mean(score_v1_list))

    kf = KFold(n_splits=5, random_state=22, shuffle=True)
    model_v2_list = []
    score_v2_list = []
    for train_index, test_index in kf.split(shape_list[:len(train_df_list)]):
        train_data = []
        y_data = []
        for idx in train_index:
            data = get_data(idx)
            train_data.append(data[0])
            y_data.append(data[1])
        train_data = np.concatenate(train_data, axis=0)
        y_data = np.concatenate(y_data, axis=0)

        model_v2 = get_model_v2()
        model_v2.fit(train_data, y_data)
        model_v2_list.append(model_v2)

        y_true = []
        y_pred = []
        for idx in test_index:
            data = get_data(idx)
            proba = model_v2.predict_proba(data[0])
            pred = np.argmax(np.sum(proba, axis=0) / proba.shape[0])
            y_pred.append(pred)
            y_true.append(data[1][0])
        score = f1_score(y_pred, y_true, average='macro')
        score_v2_list.append(score)
    print(score_v2_list)
    print(np.mean(score_v2_list))

    kf = KFold(n_splits=5, random_state=22, shuffle=True)
    model_v3_list = []
    score_v3_list = []
    for train_index, test_index in kf.split(shape_list[:len(train_df_list)]):
        train_data = []
        y_data = []
        for idx in train_index:
            data = get_data(idx)
            train_data.append(data[0])
            y_data.append(data[1])
        train_data = np.concatenate(train_data, axis=0)
        y_data = np.concatenate(y_data, axis=0)

        model_v3 = RandomForestClassifier(bootstrap=False, criterion="entropy",
                                          max_features=0.1, min_samples_leaf=1,
                                          min_samples_split=2, n_estimators=100)
        model_v3.fit(train_data, y_data)
        model_v3_list.append(model_v3)

        y_true = []
        y_pred = []
        for idx in test_index:
            data = get_data(idx)
            proba = model_v3.predict_proba(data[0])
            pred = np.argmax(np.sum(proba, axis=0) / proba.shape[0])
            y_pred.append(pred)
            y_true.append(data[1][0])
        score = f1_score(y_pred, y_true, average='macro')
        score_v3_list.append(score)
    print(score_v3_list)
    print(np.mean(score_v3_list))

    kf = KFold(n_splits=5, random_state=22, shuffle=True)
    model_v4_list = []
    score_v4_list = []
    for train_index, test_index in kf.split(shape_list[:len(train_df_list)]):
        train_data = []
        y_data = []
        for idx in train_index:
            data = get_data(idx)
            train_data.append(data[0])
            y_data.append(data[1])
        train_data = np.concatenate(train_data, axis=0)
        y_data = np.concatenate(y_data, axis=0)

        model_v4 = ExtraTreesClassifier(bootstrap=False, criterion="entropy",
                                        max_features=0.6, min_samples_leaf=1,
                                        min_samples_split=3, n_estimators=100)
        model_v4.fit(train_data, y_data)
        model_v4_list.append(model_v4)

        y_true = []
        y_pred = []
        for idx in test_index:
            data = get_data(idx)
            proba = model_v4.predict_proba(data[0])
            pred = np.argmax(np.sum(proba, axis=0) / proba.shape[0])
            y_pred.append(pred)
            y_true.append(data[1][0])
        score = f1_score(y_pred, y_true, average='macro')
        score_v4_list.append(score)
    print(score_v4_list)
    print(np.mean(score_v4_list))

    pred = []
    for i in range(len(train_df_list), len(shape_list)):
        start_idx = int(np.sum(shape_list[len(train_df_list):i]))
        sample = X_test[start_idx:start_idx + shape_list[i]]
        result = []
        for model in model_v1_list:
            result.append(np.sum(model.predict_proba(sample), axis=0) / shape_list[i])
        for model in model_v2_list:
            result.append(np.sum(model.predict_proba(sample), axis=0) / shape_list[i])
        for model in model_v3_list:
            result.append(np.sum(model.predict_proba(sample), axis=0) / shape_list[i])
        for model in model_v4_list:
            result.append(np.sum(model.predict_proba(sample), axis=0) / shape_list[i])
        # blend the 20 models (4 model types x 5 folds)
        pred.append(np.sum(result, axis=0) / 20)

    pd.DataFrame(pred, index=id_list[len(train_df_list):]).to_csv(
        './probaresult.csv', header=None)
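    # to turn the blended probabilities into label predictions, something like
    # this could follow (a sketch; `le` is the LabelEncoder fit on the
    # training labels above, and './result.csv' is an assumed output path):
    pred_labels = le.inverse_transform(np.argmax(np.array(pred), axis=1))
    pd.DataFrame(pred_labels, index=id_list[len(train_df_list):]).to_csv(
        './result.csv', header=None)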
# load the data
data = load_watch()
X = data['X']
y = data['y']

# I am adding in a column to represent time (50 Hz sampling), since my data
# doesn't include it; the Interp class assumes time is the first column in
# the series
X = np.array([np.column_stack([np.arange(len(X[i])) / 50., X[i]])
              for i in np.arange(len(X))])

clf = Pype([('interp', Interp(1. / 25., categorical_target=True)),
            ('segment', SegmentX(width=100)),
            ('features', FeatureRep()),
            ('scaler', StandardScaler()),
            ('rf', RandomForestClassifier(n_estimators=20))])

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)

print("N series in train: ", len(X_train))
print("N series in test: ", len(X_test))
print("N segments in train: ", clf.N_train)
print("N segments in test: ", clf.N_test)
print("Accuracy score: ", score)