def test_pipe_transformation():
    # SegmentX transform pipe
    pipe = Pype([('seg', SegmentX()),
                 ('ftr', FeatureRep()),
                 ('scaler', StandardScaler())])
    Xt = [np.random.rand(1000, 10), np.random.rand(100, 10), np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [1, 2, 3]
    transformation_test(pipe, X, y)

    X = pd.DataFrame(Xc)
    X['ts_data'] = Xt
    X = TS_Data.from_df(X)
    transformation_test(pipe, X, y)

    # SegmentXY transform pipe
    pipe = Pype([('seg', SegmentXY()),
                 ('ftr', FeatureRep()),
                 ('scaler', StandardScaler())])
    Xt = [np.random.rand(1000, 10), np.random.rand(100, 10), np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [np.random.rand(1000), np.random.rand(100), np.random.rand(500)]
    transformation_test(pipe, X, y)

    X = pd.DataFrame(Xc)
    X['ts_data'] = Xt
    X = TS_Data.from_df(X)
    transformation_test(pipe, X, y)

    # Forecast transform pipe
    pipe = Pype([('seg', SegmentXYForecast()),
                 ('ftr', FeatureRep()),
                 ('scaler', StandardScaler())])
    Xt = [np.random.rand(1000, 10), np.random.rand(100, 10), np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [np.random.rand(1000), np.random.rand(100), np.random.rand(500)]
    transformation_test(pipe, X, y)

    X = pd.DataFrame(Xc)
    X['ts_data'] = Xt
    X = TS_Data.from_df(X)
    transformation_test(pipe, X, y)

    # PadTrunc transform pipe
    pipe = Pype([('trunc', PadTrunc()),
                 ('ftr', FeatureRep()),
                 ('scaler', StandardScaler())])
    Xt = [np.random.rand(1000, 10), np.random.rand(100, 10), np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [1, 2, 3]
    transformation_test(pipe, X, y)

    X = pd.DataFrame(Xc)
    X['ts_data'] = Xt
    X = TS_Data.from_df(X)
    transformation_test(pipe, X, y)
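# `transformation_test` is not defined in this snippet; a plausible sketch of
# such a helper, inferred from how it is called above (hypothetical):
def transformation_test(pipe, X, y):
    # fit, transform, and fit_transform should all run, and the two
    # transform paths should agree on their output
    pipe.fit(X, y)
    Xt1, yt1 = pipe.transform(X, y)
    Xt2, yt2 = pipe.fit_transform(X, y)
    assert np.allclose(Xt1, Xt2)
    assert np.allclose(yt1, yt2)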
def test_pipe_classification():
    # no context data, single time series
    X = [np.random.rand(1000, 10)]
    y = [5]
    pipe = Pype([('seg', SegmentX()),
                 ('ftr', FeatureRep()),
                 ('rf', RandomForestClassifier(n_estimators=10))])
    classifier_test(pipe, X, y)

    # context data, single time series
    Xt = [np.random.rand(1000, 10)]
    Xc = [np.random.rand(3)]
    X = TS_Data(Xt, Xc)
    y = [5]
    classifier_test(pipe, X, y)

    # multiple time series
    Xt = [np.random.rand(1000, 10), np.random.rand(100, 10), np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [1, 2, 3]
    classifier_test(pipe, X, y)

    # univariate data
    Xt = [np.random.rand(1000), np.random.rand(100), np.random.rand(500)]
    Xc = np.random.rand(3)
    X = TS_Data(Xt, Xc)
    y = [1, 2, 3]
    classifier_test(pipe, X, y)
    return model

# load the data
data = load_watch()
X = data['X']
y = data['y']

# temporal splitting of data
splitter = TemporalKFold(n_splits=3)
Xs, ys, cv = splitter.split(X, y)

# create a segment learning pipeline
width = 100
pipe = Pype([('seg', SegmentX(order='C')),
             ('crnn', KerasClassifier(build_fn=crnn_model, epochs=1,
                                      batch_size=256, verbose=0))])

# create a parameter dictionary using the sklearn API
#
# you can also set a parameter to always be equal to another parameter, by
# setting its value to the name of the parameter to track (this is an
# extension to the sklearn API)
#
# note that if you want to set a parameter to a single value, it still needs
# to be given as a list
par_grid = {'seg__width': [50, 100, 200],
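# (the grid is truncated above; for illustration, a complete grid of the kind
# the comments describe might look like this -- hypothetical values, with
# 'crnn__width' tracking 'seg__width' via the parameter-name extension, and
# the single overlap value still given as a list)
example_grid = {'seg__width': [50, 100, 200],
                'seg__overlap': [0.5],
                'crnn__width': ['seg__width']}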
import sys

import numpy as np

from seglearn.feature_functions import maximum, minimum
from seglearn.transform import SegmentX
from sklearn.preprocessing import normalize, MinMaxScaler

batchSize = 2048
FILE = '../data/gesture/' + sys.argv[1] + '.txt'

with open(FILE, 'r') as file:
    y = np.loadtxt(file, delimiter=',')
print(y.shape)

y = np.reshape(y, (-1, batchSize))
y = np.reshape(y, (1, -1))
print(y.shape)

# cut the signal into non-overlapping segments of one batch each
segment = SegmentX(width=batchSize, step=batchSize, shuffle=False,
                   random_state=None, order='F')
y = segment.transform(y)[0]
print(y.shape)

maxy = maximum(y)
miny = minimum(y)
print(maxy.shape)
print(miny.shape)

# y = np.transpose(y)
# scaler = MinMaxScaler()
# scaler.fit(y)
# y = scaler.transform(y)
# y = np.transpose(y)
print(y)
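# the same max/min features could also be computed with seglearn's FeatureRep
# and a custom feature dictionary -- a minimal sketch, assuming the segmented
# array `y` from above:
from seglearn.transform import FeatureRep

ftr = FeatureRep(features={'max': maximum, 'min': minimum})
feats = ftr.fit_transform(y)  # one row of [max, min] per segment
print(feats.shape)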
    return model

# load the data
data = load_watch()
X = data['X']
y = data['y']

# temporal splitting of data
splitter = TemporalKFold(n_splits=3)
Xs, ys, cv = splitter.split(X, y)

# create a segment learning pipeline
width = 100
pipe = Pype([('seg', SegmentX()),
             ('crnn', KerasClassifier(build_fn=crnn_model, epochs=1,
                                      batch_size=256, verbose=0))])

# create a parameter dictionary using the sklearn API
#
# you can also set a parameter to always be equal to another parameter, by
# setting its value to the name of the parameter to track (this is an
# extension to the sklearn API)
#
# note that if you want to set a parameter to a single value, it still needs
# to be given as a list
par_grid = {'seg__width': [50, 100, 200],
from seglearn.base import TS_Data
from seglearn.datasets import load_watch
from seglearn.pipe import Pype
from seglearn.transform import FeatureRep, SegmentX

# seed RNGESUS
np.random.seed(123124)

# load the data
data = load_watch()
X = data['X']
y = data['y']

# create a feature representation pipeline
clf = Pype([('segment', SegmentX()),
            ('features', FeatureRep()),
            ('scaler', StandardScaler()),
            ('rf', RandomForestClassifier(n_estimators=20))])

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)

print("N series in train: ", len(X_train))
print("N series in test: ", len(X_test))
print("N segments in train: ", clf.N_train)
print("N segments in test: ", clf.N_test)
print("Accuracy score: ", score)
# ``SegPipe`` can be initialized with a scorer callable made with
# sklearn.metrics.make_scorer; this can be used to cross-validate or grid
# search with any single score
scorer = make_scorer(f1_score, average='macro')
pipe = SegPipe(est, scorer=scorer)

cv_scores = cross_validate(pipe, X, y, cv=4, return_train_score=True)
print("CV F1 Scores: ", pd.DataFrame(cv_scores))

##################################################
# SCORING WORKAROUND 2: WORK OUTSIDE THE PIPELINE
##################################################
# If you want to have multiple scores computed, the only way is as follows.
#
# First transform the time series data into segments, and then score the
# ``est`` part of the pipeline.
#
# The disadvantage of this approach is that the parameters of the ``seg``
# part of the pipeline cannot be optimized.

segmenter = SegmentX()
X_seg, y_seg, _ = segmenter.fit_transform(X, y)

scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
cv_scores = cross_validate(est, X_seg, y_seg, cv=4, return_train_score=False,
                           scoring=scoring)
print("CV Scores (workaround): ", pd.DataFrame(cv_scores))
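# the scorer can also drive a grid search, as noted above -- a sketch,
# assuming SegPipe exposes nested parameters in the usual sklearn way (the
# 'segmenter__width' parameter name here is an assumption, not a documented
# API):
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(pipe, param_grid={'segmenter__width': [50, 100]}, cv=4)
grid.fit(X, y)
print("best params: ", grid.best_params_)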
train_df = pd.concat(train_df_list)
train_df['time'] = pd.to_datetime(train_df['time'], format='%m%d %H:%M:%S')
all_df = pd.concat([train_df])

X = []
y = []
id_list = []
# column names are Chinese: '渔船ID' = vessel ID, '速度' = speed, '方向' = heading
for ship_id, group in all_df.groupby('渔船ID'):
    X.append(group[['lat', 'lon', '速度', '方向', 'time']])
    y.append(group['type'].values[0])
    id_list.append(ship_id)
print(len(id_list))

pype = Pype([('segment', SegmentX(width=72, overlap=0.1))])
pype = pype.fit(X, y)

shape_list = []
df_list = []
for ship_id, group in all_df.groupby('渔船ID'):
    sample = group[['lat', 'lon', '速度', '方向', 'time']].values
    transform_result = pype.transform([sample])[0]
    if transform_result.shape[0] == 0:
        # series shorter than the segment width: keep it whole
        seg_df = pd.DataFrame(sample, columns=['lat', 'lon', '速度', '方向', 'time'])
        seg_df['渔船ID'] = len(df_list)
        seg_df['type'] = group['type'].values[0]
        df_list.append(seg_df)
def test_pipe_classification():
    # no context data, single time series
    X = [np.random.rand(1000, 10)]
    y = [5]
    est = Pipeline([('ftr', FeatureRep()),
                    ('rf', RandomForestClassifier())])
    pipe = SegPipe(est, segmenter=SegmentX())
    pipe.fit(X, y)
    pipe.predict(X, y)
    pipe.score(X, y)

    # context data, single time series
    Xt = [np.random.rand(1000, 10)]
    Xc = [np.random.rand(3)]
    X = make_ts_data(Xt, Xc)
    y = [5]
    pipe.fit(X, y)
    pipe.predict(X, y)
    pipe.score(X, y)

    # multiple time series
    Xt = [np.random.rand(1000, 10), np.random.rand(100, 10), np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = make_ts_data(Xt, Xc)
    y = [1, 2, 3]
    pipe.fit(X, y)
    pipe.predict(X, y)
    pipe.score(X, y)

    # univariate data
    Xt = [np.random.rand(1000), np.random.rand(100), np.random.rand(500)]
    Xc = np.random.rand(3)
    X = make_ts_data(Xt, Xc)
    y = [1, 2, 3]
    pipe.fit(X, y)
    pipe.predict(X, y)
    pipe.score(X, y)

    # transform pipe
    est = Pipeline([('ftr', FeatureRep()),
                    ('scaler', StandardScaler())])
    pipe = SegPipe(est, segmenter=SegmentX())
    Xt = [np.random.rand(1000, 10), np.random.rand(100, 10), np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = make_ts_data(Xt, Xc)
    y = [1, 2, 3]
    pipe.fit(X, y)
    pipe.transform(X, y)
    pipe.fit_transform(X, y)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

##############################################
# SETUP
##############################################

# load the data
data = load_watch()
X = data['X']
y = data['y']

# create a feature representation pipeline
steps = [('seg', SegmentX()),
         ('features', FeatureRep()),
         ('scaler', StandardScaler()),
         ('rf', RandomForestClassifier(n_estimators=20))]
pipe = Pype(steps)

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=42)

##############################################
# OPTION 1: Use the Pype score method
##############################################

pipe.fit(X_train, y_train)
score = pipe.score(X_test, y_test)
def test_pipe_classification():
    # no context data, single time series
    X = [np.random.rand(1000, 10)]
    y = [5]
    pipe = Pype([('seg', SegmentX()),
                 ('ftr', FeatureRep()),
                 ('rf', RandomForestClassifier(n_estimators=10))])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.transform_predict(X, y)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)

    # context data, single time series
    Xt = [np.random.rand(1000, 10)]
    Xc = [np.random.rand(3)]
    X = TS_Data(Xt, Xc)
    y = [5]
    pipe.fit(X, y)
    pipe.transform_predict(X, y)
    pipe.predict(X)
    pipe.score(X, y)

    # multiple time series
    Xt = [np.random.rand(1000, 10), np.random.rand(100, 10), np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [1, 2, 3]
    pipe.fit(X, y)
    pipe.transform_predict(X, y)
    pipe.predict(X)
    pipe.score(X, y)

    # univariate data
    Xt = [np.random.rand(1000), np.random.rand(100), np.random.rand(500)]
    Xc = np.random.rand(3)
    X = TS_Data(Xt, Xc)
    y = [1, 2, 3]
    pipe.fit(X, y)
    pipe.transform_predict(X, y)
    pipe.predict(X)
    pipe.score(X, y)

    # transform pipe
    pipe = Pype([('seg', SegmentX()),
                 ('ftr', FeatureRep()),
                 ('scaler', StandardScaler())])
    Xt = [np.random.rand(1000, 10), np.random.rand(100, 10), np.random.rand(500, 10)]
    Xc = np.random.rand(3, 3)
    X = TS_Data(Xt, Xc)
    y = [1, 2, 3]
    pipe.fit(X, y)
    pipe.transform(X, y)
    pipe.fit_transform(X, y)
# data = sio.loadmat('./data/shiftcorr%d' % idx)           # poor results
# data = sio.loadmat('./data/shiftlinear%d' % idx)         # poor results
# data = sio.loadmat('./data/singledimshiftfreq%d' % idx)  # poor results
# data = sio.loadmat('./data/agotsshiftmean%d' % idx)
# data = sio.loadmat('./data/agotsshiftvar%d' % idx)
data = sio.loadmat('./data/extreme%d' % idx)  # good results

ts = data['ts']  # .T  # transpose is needed for shiftfreq
print(ts.shape)
bkps = data['bkps'][0]

scaler = StandardScaler()
ts = scaler.fit_transform(ts)

# segment the series into overlapping windows
width = 10
step = 5
ts = [ts]
segment = SegmentX(width=width, step=step)
x = segment.fit_transform(ts, None)[0]
x = x.reshape([x.shape[0], -1])
x = torch.from_numpy(x).float()

bkss = bkps // 5  # bkss: breakpoints converted to segment (sample) indices

# score each segment by its autoencoder reconstruction error
model = AutoEncoder(input_dim=10, latent_dim=1, output_dim=10)
_, pred = model.fit_predict(x)
err = (pred - x).detach().numpy()
err = np.max(np.power(err, 2), axis=1)

rpt.display(err, true_chg_pts=bkss)
rpt.display(ts[0], true_chg_pts=bkps)
plt.show()
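# one simple way to turn the per-segment reconstruction error into discrete
# change point candidates -- a minimal sketch with an assumed fixed threshold
# (ruptures could also be run on `err` directly):
threshold = err.mean() + 3 * err.std()
candidates = np.where(err > threshold)[0]
print("segments above threshold:", candidates)
print("approximate sample positions:", candidates * step)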
for ship_id, group in all_df.groupby('渔船ID'):
    X.append(group[['lat', 'lon', '方向', '速度']].values)
    y.append(group['type'].values[0])
    id_list.append(int(ship_id))

le = preprocessing.LabelEncoder()
y_train = le.fit_transform(y[:len(train_df_list)])
X_train = X[:len(train_df_list)]
X_test = X[len(train_df_list):]

kf = KFold(n_splits=5, random_state=42, shuffle=True)
model_v1_list = []
score_v1_list = []
for train_index, test_index in kf.split(X_train):
    model_v1 = Pype([('segment', SegmentX(width=10)),
                     ('features', FeatureRep()),
                     ('scaler', StandardScaler()),
                     ('rf', RandomForestClassifier(n_estimators=100,
                                                   random_state=42))])
    model_v1.fit(np.array(X_train)[train_index], y_train[train_index])
    model_v1_list.append(model_v1)

    y_pred = []
    for test_sample in np.array(X_train)[test_index]:
        # average the per-segment probabilities into one prediction per series
        result = model_v1.predict_proba([test_sample])
        pred = np.argmax(np.sum(result, axis=0) / result.shape[0])
        y_pred.append(pred)
    score_v1_list.append(f1_score(y_train[test_index], y_pred, average='macro'))
    np.array(pd.read_csv('LF_BC_9600.csv', sep=",", header=None)),
    np.array(pd.read_csv('RF_BC_9600.csv', sep=",", header=None)),
    np.array(pd.read_csv('LB_BP_9600.csv', sep=",", header=None)),
    np.array(pd.read_csv('RB_BP_9600.csv', sep=",", header=None)),
    np.array(pd.read_csv('LF_BP_9600.csv', sep=",", header=None)),
    np.array(pd.read_csv('RF_BP_9600.csv', sep=",", header=None))
])

# create the label vector and the corresponding semantic vector
y = np.array([0, 1, 2, 3, 4, 5, 6, 7])
labels = ['LB_BC', 'RB_BC', 'LF_BC', 'RF_BC', 'LB_BP', 'RB_BP', 'LF_BP', 'RF_BP']

# segment the data and labels
segmenter = SegmentX(100, 0.5)
X_new, y_new, _ = segmenter.fit_transform(X, y)

###################################################################################################

# create a pipeline for LDA transformation of the feature representation
est = Pipeline([('features', FeatureRep()),
                ('lda', LinearDiscriminantAnalysis(n_components=2))])
pipe = SegPipe(est)

# plot embedding
X2, y2 = pipe.fit_transform(X_new, y_new)
plot_embedding(X2, y2.astype(int), labels)
plt.show()

###################################################################################################
    model.add(LSTM(units=lstm_units, dropout=0.1, recurrent_dropout=0.1))
    model.add(Dense(n_classes, activation="softmax"))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model

# load the data
data = load_watch()
X = data['X']
y = data['y']

# create a segment learning pipeline
pipe = Pype([('seg', SegmentX(width=100, step=100, order='C')),
             ('crnn', KerasClassifier(build_fn=crnn_model, epochs=1,
                                      batch_size=256, verbose=0))])

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=42)

pipe.fit(X_train, y_train)
score = pipe.score(X_test, y_test)

print("N series in train: ", len(X_train))
print("N series in test: ", len(X_test))
print("N segments in train: ", pipe.N_train)
print("N segments in test: ", pipe.N_test)
print("Accuracy score: ", score)
def generate_result():
    train_path = '/tcdata/hy_round2_train_20200225'
    test_path = '/tcdata/hy_round2_testB_20200312'

    train_df_list = []
    for file_name in os.listdir(train_path):
        if file_name.endswith('.csv'):
            df = pd.read_csv(os.path.join(train_path, file_name))
            train_df_list.append(df)
    test_df_list = []
    for file_name in os.listdir(test_path):
        if file_name.endswith('.csv'):
            df = pd.read_csv(os.path.join(test_path, file_name))
            test_df_list.append(df)

    train_df = pd.concat(train_df_list)
    test_df = pd.concat(test_df_list)
    train_df['time'] = pd.to_datetime(train_df['time'], format='%m%d %H:%M:%S')
    test_df['time'] = pd.to_datetime(test_df['time'], format='%m%d %H:%M:%S')
    all_df = pd.concat([train_df, test_df])

    X = []
    y = []
    id_list = []
    # column names are Chinese: '渔船ID' = vessel ID, '速度' = speed, '方向' = heading
    for ship_id, group in all_df.groupby('渔船ID'):
        X.append(group[['lat', 'lon', '速度', '方向', 'time']])
        y.append(group['type'].values[0])
        id_list.append(ship_id)
    print(len(id_list))

    pype = Pype([('segment', SegmentX(width=72, overlap=0.0))])
    pype = pype.fit(X, y)

    shape_list = []
    df_list = []
    for ship_id, group in all_df.groupby('渔船ID'):
        sample = group[['lat', 'lon', '速度', '方向', 'time']].values
        transform_result = pype.transform([sample])[0]
        if transform_result.shape[0] == 0:
            # series shorter than the segment width: keep it whole
            seg_df = pd.DataFrame(sample, columns=['lat', 'lon', '速度', '方向', 'time'])
            seg_df['渔船ID'] = len(df_list)
            seg_df['type'] = group['type'].values[0]
            df_list.append(seg_df)
            shape_list.append(1)
        else:
            for seg in transform_result:
                seg_df = pd.DataFrame(seg, columns=['lat', 'lon', '速度', '方向', 'time'])
                seg_df['渔船ID'] = len(df_list)
                seg_df['type'] = group['type'].values[0]
                df_list.append(seg_df)
            shape_list.append(transform_result.shape[0])

    new_all_df = pd.concat(df_list, sort=False)
    new_all_df.to_csv('help.csv', index=False)
    new_all_df = pd.read_csv('help.csv')

    df = new_all_df.drop(columns=['type'])
    extracted_df = extract_features(df, column_id='渔船ID', column_sort='time',
                                    n_jobs=8,
                                    kind_to_fc_parameters=fc_parameters_v2)

    new_df = new_all_df.groupby('渔船ID').agg(x_min=('lat', 'min'),
                                              x_max=('lat', 'max'),
                                              y_min=('lon', 'min'),
                                              y_max=('lon', 'max'))
    extracted_df['x_max-x_min'] = new_df['x_max'] - new_df['x_min']
    extracted_df['y_max-y_min'] = new_df['y_max'] - new_df['y_min']
    extracted_df['x_max-y_min'] = new_df['x_max'] - new_df['y_min']
    extracted_df['y_max-x_min'] = new_df['y_max'] - new_df['x_min']
    extracted_df['slope'] = extracted_df['y_max-y_min'] / np.where(
        extracted_df['x_max-x_min'] == 0, 0.001, extracted_df['x_max-x_min'])
    extracted_df['area'] = extracted_df['x_max-x_min'] * extracted_df['y_max-y_min']

    def get_feature(arr):
        feature = [np.max(arr), np.quantile(arr, 0.9), np.quantile(arr, 0.1),
                   np.quantile(arr, 0.75), np.quantile(arr, 0.25),
                   np.mean(arr), np.std(arr), np.median(arr),
                   np.std(arr) / np.mean(arr)]
        # lag-1 autocorrelation, skewness, and kurtosis
        feature.append(np.corrcoef(np.array([arr[:-1], arr[1:]]))[0, 1])
        feature.append(skew(arr))
        feature.append(kurtosis(arr))
        return feature

    features = []
    for _, group in new_all_df.groupby('渔船ID'):
        group = group.sort_values(by=['time'])
        lat = group['lat'].values
        lon = group['lon'].values
        time_ = pd.to_datetime(group['time'], format='%Y-%m-%d %H:%M:%S').values
        dire = group['方向'].values

        # speed between consecutive fixes, in km/h
        speed_list = []
        for i in range(lat.shape[0]):
            if i == 0:
                continue
            hour = (time_[i] - time_[i - 1]) / np.timedelta64(1, 'h')
            dist = geodesic((lat[i - 1], lon[i - 1]), (lat[i], lon[i]))
            speed_list.append(dist.km / hour)

        # circular statistics of the heading: mean resultant length r,
        # circular mean theta, and circular standard deviation sqrt(-2 ln r)
        c = np.sum(np.cos(dire / 180 * np.pi)) / group.shape[0]
        s = np.sum(np.sin(dire / 180 * np.pi)) / group.shape[0]
        r = np.sqrt(c ** 2 + s ** 2)
        theta = np.arctan(s / c)
        angle_feature = [r, theta, np.sqrt(-2 * np.log(r))]

        # same circular statistics for the turning between consecutive fixes
        turn_list = []
        for i in range(dire.shape[0]):
            if i == 0:
                continue
            turn = 1 - np.cos(dire[i - 1] / 180 * np.pi - dire[i] / 180 * np.pi)
            turn_list.append(turn * np.pi)
        turn_list = np.array(turn_list)
        c = np.sum(np.cos(turn_list)) / (group.shape[0] - 1)
        s = np.sum(np.sin(turn_list)) / (group.shape[0] - 1)
        r = np.sqrt(c ** 2 + s ** 2)
        theta = np.arctan(s / c)
        turn_feature = [r, theta, np.sqrt(-2 * np.log(r))]

        # only the resultant length r of each circular feature set is kept
        features.append(np.concatenate([get_feature(speed_list),
                                        angle_feature[:1], turn_feature[:1]]))

    extracted_df_ = pd.concat([pd.DataFrame(np.array(features)), extracted_df],
                              axis=1)

    y = []
    for _, group in new_all_df.groupby('渔船ID'):
        y.append(group.iloc[0]['type'])

    train_df = extracted_df_.iloc[:np.sum(shape_list[:len(train_df_list)])]
    test_df = extracted_df_.iloc[np.sum(shape_list[:len(train_df_list)]):]
    y_train = y[:train_df.shape[0]]

    le = preprocessing.LabelEncoder()
    y_train = le.fit_transform(y_train)
    train_df['type'] = le.inverse_transform(y_train)
    train_df.to_csv('./train.csv')
    test_df.to_csv('./test.csv')

    train_df = pd.read_csv('./train.csv', index_col=0)
    X_train = train_df.drop(columns=['type']).values
    y_train = train_df['type'].values
    test_df = pd.read_csv('./test.csv', index_col=0)
    X_test = test_df.values

    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    X_train = imputer.fit_transform(
        pd.DataFrame(X_train).replace([np.inf, -np.inf], np.nan).values)
    X_test = imputer.fit_transform(
        pd.DataFrame(X_test).replace([np.inf, -np.inf], np.nan).values)

    le = preprocessing.LabelEncoder()
    y_train = le.fit_transform(y_train)

    def get_model():
        exported_pipeline = make_pipeline(
            SelectPercentile(score_func=f_classif, percentile=48),
            StackingEstimator(estimator=SGDClassifier(
                alpha=0.01, eta0=0.01, fit_intercept=False, l1_ratio=0.25,
                learning_rate="invscaling", loss="modified_huber",
                penalty="elasticnet", power_t=10.0)),
            ExtraTreesClassifier(bootstrap=False, criterion="entropy",
                                 max_features=0.6, min_samples_leaf=1,
                                 min_samples_split=3, n_estimators=100))
        set_param_recursive(exported_pipeline.steps, 'random_state', 42)
        return exported_pipeline

    def get_model_v2():
        exported_pipeline = make_pipeline(
            make_union(
                make_pipeline(
                    make_union(FunctionTransformer(copy),
                               FunctionTransformer(copy)),
                    SelectPercentile(score_func=f_classif, percentile=18)),
                FunctionTransformer(copy)),
            StackingEstimator(estimator=SGDClassifier(
                alpha=0.01, eta0=0.1, fit_intercept=False, l1_ratio=1.0,
                learning_rate="constant", loss="hinge", penalty="elasticnet",
                power_t=0.1)),
            VarianceThreshold(threshold=0.05),
            ExtraTreesClassifier(bootstrap=False, criterion="entropy",
                                 max_features=0.55, min_samples_leaf=1,
                                 min_samples_split=4, n_estimators=100))
        set_param_recursive(exported_pipeline.steps, 'random_state', 42)
        return exported_pipeline

    def get_data(shape_idx):
        start_idx = int(np.sum(shape_list[:shape_idx]))
        end_idx = start_idx + shape_list[shape_idx]
        if shape_idx < len(train_df_list):
            return X_train[start_idx:end_idx], y_train[start_idx:end_idx]
        else:
            # X_test starts after all training segments, so shift the offset
            test_start = start_idx - int(np.sum(shape_list[:len(train_df_list)]))
            return X_test[test_start:test_start + shape_list[shape_idx]], None

    kf = KFold(n_splits=5, random_state=2019, shuffle=True)
    model_v1_list = []
    score_v1_list = []
    for train_index, test_index in kf.split(shape_list[:len(train_df_list)]):
        train_data = []
        y_data = []
        for idx in train_index:
            data = get_data(idx)
            train_data.append(data[0])
            y_data.append(data[1])
        train_data = np.concatenate(train_data, axis=0)
        y_data = np.concatenate(y_data, axis=0)

        model_v1 = get_model()
        model_v1.fit(train_data, y_data)
        model_v1_list.append(model_v1)

        y_true = []
        y_pred = []
        for idx in test_index:
            data = get_data(idx)
            proba = model_v1.predict_proba(data[0])
            # average segment probabilities to get one prediction per vessel
            pred = np.argmax(np.sum(proba, axis=0) / proba.shape[0])
            y_pred.append(pred)
            y_true.append(data[1][0])
        score = f1_score(y_pred, y_true, average='macro')
        score_v1_list.append(score)
    print(score_v1_list)
    print(np.mean(score_v1_list))

    kf = KFold(n_splits=5, random_state=22, shuffle=True)
    model_v2_list = []
    score_v2_list = []
    for train_index, test_index in kf.split(shape_list[:len(train_df_list)]):
        train_data = []
        y_data = []
        for idx in train_index:
            data = get_data(idx)
            train_data.append(data[0])
            y_data.append(data[1])
        train_data = np.concatenate(train_data, axis=0)
        y_data = np.concatenate(y_data, axis=0)

        model_v2 = get_model_v2()
        model_v2.fit(train_data, y_data)
        model_v2_list.append(model_v2)

        y_true = []
        y_pred = []
        for idx in test_index:
            data = get_data(idx)
            proba = model_v2.predict_proba(data[0])
            pred = np.argmax(np.sum(proba, axis=0) / proba.shape[0])
            y_pred.append(pred)
            y_true.append(data[1][0])
        score = f1_score(y_pred, y_true, average='macro')
        score_v2_list.append(score)
    print(score_v2_list)
    print(np.mean(score_v2_list))

    kf = KFold(n_splits=5, random_state=22, shuffle=True)
    model_v3_list = []
    score_v3_list = []
    for train_index, test_index in kf.split(shape_list[:len(train_df_list)]):
        train_data = []
        y_data = []
        for idx in train_index:
            data = get_data(idx)
            train_data.append(data[0])
            y_data.append(data[1])
        train_data = np.concatenate(train_data, axis=0)
        y_data = np.concatenate(y_data, axis=0)

        model_v3 = RandomForestClassifier(bootstrap=False, criterion="entropy",
                                          max_features=0.1, min_samples_leaf=1,
                                          min_samples_split=2, n_estimators=100)
        model_v3.fit(train_data, y_data)
        model_v3_list.append(model_v3)

        y_true = []
        y_pred = []
        for idx in test_index:
            data = get_data(idx)
            proba = model_v3.predict_proba(data[0])
            pred = np.argmax(np.sum(proba, axis=0) / proba.shape[0])
            y_pred.append(pred)
            y_true.append(data[1][0])
        score = f1_score(y_pred, y_true, average='macro')
        score_v3_list.append(score)
    print(score_v3_list)
    print(np.mean(score_v3_list))

    kf = KFold(n_splits=5, random_state=22, shuffle=True)
    model_v4_list = []
    score_v4_list = []
    for train_index, test_index in kf.split(shape_list[:len(train_df_list)]):
        train_data = []
        y_data = []
        for idx in train_index:
            data = get_data(idx)
            train_data.append(data[0])
            y_data.append(data[1])
        train_data = np.concatenate(train_data, axis=0)
        y_data = np.concatenate(y_data, axis=0)

        model_v4 = ExtraTreesClassifier(bootstrap=False, criterion="entropy",
                                        max_features=0.6, min_samples_leaf=1,
                                        min_samples_split=3, n_estimators=100)
        model_v4.fit(train_data, y_data)
        model_v4_list.append(model_v4)

        y_true = []
        y_pred = []
        for idx in test_index:
            data = get_data(idx)
            proba = model_v4.predict_proba(data[0])
            pred = np.argmax(np.sum(proba, axis=0) / proba.shape[0])
            y_pred.append(pred)
            y_true.append(data[1][0])
        score = f1_score(y_pred, y_true, average='macro')
        score_v4_list.append(score)
    print(score_v4_list)
    print(np.mean(score_v4_list))

    pred = []
    for i in range(len(train_df_list), len(shape_list)):
        start_idx = int(np.sum(shape_list[len(train_df_list):i]))
        sample = X_test[start_idx:start_idx + shape_list[i]]
        result = []
        for model in model_v1_list:
            result.append(np.sum(model.predict_proba(sample), axis=0) / shape_list[i])
        for model in model_v2_list:
            result.append(np.sum(model.predict_proba(sample), axis=0) / shape_list[i])
        for model in model_v3_list:
            result.append(np.sum(model.predict_proba(sample), axis=0) / shape_list[i])
        for model in model_v4_list:
            result.append(np.sum(model.predict_proba(sample), axis=0) / shape_list[i])
        # blend the 20 models (4 model types x 5 folds)
        pred.append(np.sum(result, axis=0) / 20)

    pd.DataFrame(pred, index=id_list[len(train_df_list):]).to_csv(
        './probaresult.csv', header=None)
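    # to turn the blended probabilities into label predictions, something like
    # this could follow (a sketch; `le` is the LabelEncoder fit on the
    # training labels above, and './result.csv' is an assumed output path):
    pred_labels = le.inverse_transform(np.argmax(np.array(pred), axis=1))
    pd.DataFrame(pred_labels, index=id_list[len(train_df_list):]).to_csv(
        './result.csv', header=None)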
# load the data
data = load_watch()
X = data['X']
y = data['y']

# I am adding in a column to represent time (50 Hz sampling), since my data
# doesn't include it; the Interp class assumes time is the first column in
# the series
X = np.array([np.column_stack([np.arange(len(X[i])) / 50., X[i]])
              for i in np.arange(len(X))])

clf = Pype([('interp', Interp(1. / 25., categorical_target=True)),
            ('segment', SegmentX(width=100)),
            ('features', FeatureRep()),
            ('scaler', StandardScaler()),
            ('rf', RandomForestClassifier(n_estimators=20))])

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)

print("N series in train: ", len(X_train))
print("N series in test: ", len(X_test))
print("N segments in train: ", clf.N_train)
print("N segments in test: ", clf.N_test)
print("Accuracy score: ", score)