Code example #1
import numpy as np
from sklearn.preprocessing import StandardScaler
from utilities.data_reader import load_data2, remove_edge
# timestampToTime and autoencoder are defined elsewhere in the repository


def main():
    # 1. load data
    fname2 = "../data/stendalmark_20181122_RAW_export.xyz"
    fname1 = "../data/stendalmark_20181121_RAW_export.xyz"
    fname0 = "../data/stendalmark_20181120_RAW_export.xyz"
    fname = "../data/vildbjerg_20171101_RAW_export.xyz"
    _, dbdt, lbl, timestamp, _ = load_data2(fname1, 8, 24)
    _, dbdt1, lbl1, timestamp1, _ = load_data2(fname0, 8, 24)
    _, dbdt2, lbl2, timestamp2, _ = load_data2(fname, 8, 24)
    dbdt, lbl, timestamp = remove_edge(timestamp, dbdt, lbl, 20)
    dbdt1, lbl1, timestamp1 = remove_edge(timestamp1, dbdt1, lbl1, 20)
    dbdt2, lbl2, timestamp2 = remove_edge(timestamp2, dbdt2, lbl2, 20)

    timestamp = timestampToTime(timestamp)

    # normalize by sign and log
    # dbdt_norm = np.sign(dbdt) * np.log(np.abs(dbdt))
    # split into training and test sets
    r = int(np.ceil(0.8 * dbdt.shape[0]))
    dbdt_train = dbdt[0:r, :]
    lbl_train = lbl[0:r]
    # train on non-coupled soundings only (lbl == 1)
    dbdt_train = dbdt_train[lbl_train == 1, :]
    # dbdt_train = np.concatenate(( dbdt1[lbl1 == 1, :], dbdt2[lbl2 == 1, :]), axis=0)

    dbdt_testOG = dbdt[r:, :]  # keep an unscaled copy of the test set
    dbdt_test = dbdt[r:, :]
    lbl_test = lbl[r:]
    # normalise to zero mean, unit std
    sc = StandardScaler()
    dbdt_train = sc.fit_transform(dbdt_train)
    dbdt_test = sc.transform(dbdt_test)
    # Build input array, where a sounding and its neighbours are stacked
    autoencoder(dbdt_train, dbdt_test, lbl_test)
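The autoencoder helper called above is defined elsewhere in the repository. A minimal sketch of the pattern the call implies (fit a reconstruction model on the non-coupled soundings only, then score test soundings by reconstruction error) might look like the following; the function name, layer sizes, and training settings are illustrative assumptions, not taken from the project.

import numpy as np
from keras.models import Sequential
from keras.layers import Dense

def autoencoder_sketch(X_train, X_test, lbl_test):
    # Hypothetical stand-in for the project's autoencoder(); sizes are guesses.
    n_gates = X_train.shape[1]
    model = Sequential([
        Dense(8, activation='relu', input_shape=(n_gates,)),
        Dense(2, activation='relu'),   # bottleneck
        Dense(8, activation='relu'),
        Dense(n_gates, activation='linear'),
    ])
    model.compile(optimizer='adam', loss='mse')
    model.fit(X_train, X_train, epochs=50, batch_size=32, verbose=0)
    # Large reconstruction error suggests a coupled sounding the model
    # never saw during training (it was fit on lbl == 1 data only).
    err = np.mean((model.predict(X_test) - X_test) ** 2, axis=1)
    return err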
Code example #2
File: AutoEnc2.py Project: TokeF/MasterThesis
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from utilities.data_reader import load_data2, remove_edge
from utilities import data_visualize
# autoencoder2 is defined elsewhere in the repository


def main():
    fname0 = "../data/stendalmark_20181120_RAW_export.xyz"
    fname1 = "../data/stendalmark_20181121_RAW_export.xyz"

    df, dbdt, lbl, timestamp, gtimes = load_data2(fname0, 8, 24)
    dbdt, lbl, timestamp = remove_edge(timestamp, dbdt, lbl, 20)
    # shift to zero and rescale the GPS timestamps
    timestamp = (timestamp - timestamp[0]) * 10**5

    df0, dbdt0, lbl0, timestamp0, gtimes0 = load_data2(fname1, 8, 24)
    dbdt0, lbl0, timestamp0 = remove_edge(timestamp0, dbdt0, lbl0, 20)
    # offset the second flight so it continues where the first ends
    timestamp0 = (timestamp0 - timestamp0[0]) * 10**5 + timestamp[-1] + 0.7

    dbdt = np.concatenate((dbdt, dbdt0))
    lbl = np.concatenate((lbl, lbl0))
    timestamp = np.concatenate((timestamp, timestamp0))

    r = int(np.ceil(0.8 * dbdt.shape[0]))
    dbdt = dbdt[0:r, :]
    lbl = lbl[0:r]
    timestamp = timestamp[0:r]

    # run all 5 folds and keep the last one as the held-out test split
    splits = 5
    i = 0
    kf = KFold(n_splits=splits, shuffle=False)
    for trainidx, testidx in kf.split(dbdt):
        X_train, X_test = dbdt[trainidx], dbdt[testidx]
        y_train, y_test = lbl[trainidx], lbl[testidx]
        time_test = timestamp[testidx]
        X_testOG = X_test
        i += 1
        if i == 5: break

    # train on non-coupled soundings only (lbl == 1)
    X_train = X_train[y_train == 1, :]
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    fig, axROC = plt.subplots()
    ndata = np.linspace(0.01, 1, 50)
    errors = np.zeros((len(ndata), ))

    for i, frac in enumerate(ndata):
        n = int(np.ceil(frac * X_train.shape[0]))
        y_score, _, _, _, CM = autoencoder2(X_train[0:n, :], X_test, y_test,
                                            X_testOG, time_test, True)
        AUC, _ = data_visualize.plot_roc(y_test, y_score, axROC, i, pos=0)
        errors[i] = CM[1, 0] + CM[0, 1]  # total misclassifications (FN + FP)
        plt.close('all')
        print(i)

    plt.figure("Learn data")
    plt.plot(ndata, errors)
    plt.ylabel("Total errors")
    plt.xlabel("Training set in use [Fraction]")
    plt.show()
Code example #3
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from utilities.data_reader import load_data2, remove_edge
# rnd_forest2 is defined elsewhere in the repository


def main():
    ## Data preprocessing
    fname0 = "../data/stendalmark_20181120_RAW_export.xyz"
    fname1 = "../data/stendalmark_20181121_RAW_export.xyz"

    df, dbdt, lbl, timestamp, gtimes = load_data2(fname0, 8, 24)
    dbdt, lbl, timestamp = remove_edge(timestamp, dbdt, lbl, 20)
    timestamp = (timestamp - timestamp[0]) * 10 ** 5

    df0, dbdt0, lbl0, timestamp0, gtimes0 = load_data2(fname1, 8, 24)
    dbdt0, lbl0, timestamp0 = remove_edge(timestamp0, dbdt0, lbl0, 20)
    timestamp0 = (timestamp0 - timestamp0[0]) * 10 ** 5 + timestamp[-1] + 0.7

    dbdt = np.concatenate((dbdt, dbdt0))
    lbl = np.concatenate((lbl, lbl0))
    timestamp = np.concatenate((timestamp, timestamp0))
    sc = StandardScaler()

    r = int(np.ceil(0.8 * dbdt.shape[0]))
    dbdt = dbdt[0:r, :]
    lbl = lbl[0:r]
    timestamp = timestamp[0:r]
    idx = range(0, dbdt.shape[0])
    v = int(np.ceil(0.8 * dbdt.shape[0]))
    X_test = dbdt[v:, :]
    X_train = dbdt[0:v, :]
    y_train = lbl[0:v]
    y_test = lbl[v:]
    idx_train = list(idx[0:v])
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # X_train = build_array_skytem(41, X_train)
    # X_test = build_array_skytem(41, X_test)
    # y_score, _, report, acc = rnd_forest2(timestamp, dbdt, lbl, idx[r:],
    #                                       X_train.T, X_test.T,
    #                                       y_train[20:-20], y_test[20:-20], n_trees=100)

    # trees = range(1,100)
    # sweep the fraction of the training set in use; the forest size itself
    # is fixed at n_trees=100 below
    fracs = np.linspace(0.01, 1.0, 50)
    metric = np.zeros((len(fracs),))
    for i, frac in enumerate(fracs):
        # random.shuffle(idx_train)
        n = int(np.ceil(frac * X_train.shape[0]))
        _, _, report, acc, auc, CM = rnd_forest2(timestamp[v:], dbdt, lbl, X_test,
                                                 X_train[idx_train[0:n], :], X_test,
                                                 y_train[idx_train[0:n]], y_test, n_trees=100)
        metric[i] = CM[1, 0] + CM[0, 1]  # total misclassifications (FN + FP)
        plt.close('all')
        print("Iteration: ", i)
    plt.plot(fracs, metric)
    plt.ylabel("Total errors")
    plt.xlabel("Training set in use [Fraction]")
    plt.show()
Code example #4
File: driver.py Project: TokeF/MasterThesis
import matplotlib.pyplot as plt
from utilities.data_reader import load_data2
from utilities import data_visualize
# difference is a project-local module providing row_ratio


def plotNormalnratio():
    fname = "../data/20171101_RAW_export.xyz"
    _, dbdt, lbl, timestamp = load_data2(fname, 8, 20)

    ratio = difference.row_ratio(timestamp, dbdt)
    data_visualize.plotDat(timestamp, ratio, lbl)
    data_visualize.plotDat(timestamp, dbdt, lbl)
    plt.yscale('log')
    plt.show()
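row_ratio is defined in the project's difference module, which is not part of this listing. Given the name and the log-scaled plot, one plausible reading is the gate-wise ratio between consecutive soundings, sketched below; the helper name and formula are assumptions.

import numpy as np

def row_ratio_sketch(timestamp, dbdt):
    # Hypothetical take on difference.row_ratio: gate-wise ratio between
    # consecutive soundings, which spikes at abrupt changes such as couplings.
    # timestamp mirrors the real signature but is unused in this sketch; the
    # first row is padded with ones so the output aligns with timestamp.
    ratio = dbdt[1:, :] / dbdt[:-1, :]
    return np.vstack((np.ones((1, dbdt.shape[1])), ratio))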
Code example #5
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MinMaxScaler, Normalizer, PolynomialFeatures
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
# tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
# features = tpot_data.drop('target', axis=1).values
# training_features, testing_features, training_target, testing_target = \
#             train_test_split(features, tpot_data['target'].values, random_state=42)
from utilities.data_reader import load_data2
fname = "../data/20171101_RAW_export.xyz"
# fname = "../data/stendalmark_20181120_RAW_export.xyz"
df, dbdt, lbl, timestamp = load_data2(fname, 8, 23)
training_features, testing_features, training_target, testing_target = \
            train_test_split(dbdt, lbl, random_state=42)

# resample the training set: SMOTEENN combines SMOTE over-sampling with
# Edited Nearest Neighbours cleaning
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids, CondensedNearestNeighbour
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTEENN
sampler = SMOTEENN()
training_features, training_target = sampler.fit_sample(training_features, training_target)  # fit_resample in newer imblearn

from SVM.tpot_exported_pipeline_4_bernoulliEXTT import pipe4bernoulliET
from SVM.tpot_exported_pipeline_5_ET import pipe5ET
pipe5ET(training_features, testing_features, training_target, testing_target)
exit()
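pipe5ET wraps a TPOT-exported pipeline; its actual steps live in SVM/tpot_exported_pipeline_5_ET.py, which is not shown in this listing. As a rough sketch of what a TPOT export with an ExtraTrees final estimator typically looks like (the steps and hyperparameters below are assumptions):

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

def pipe5ET_sketch(training_features, testing_features, training_target, testing_target):
    # Hypothetical reconstruction; the real pipeline is in
    # SVM/tpot_exported_pipeline_5_ET.py.
    exported_pipeline = make_pipeline(
        MinMaxScaler(),
        ExtraTreesClassifier(n_estimators=100, random_state=42),
    )
    exported_pipeline.fit(training_features, training_target)
    results = exported_pipeline.predict(testing_features)
    print(classification_report(testing_target, results))
    return results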
Code example #6
File: lstm_bidir.py Project: TokeF/MasterThesis
import numpy as np

# This fragment begins mid-function; the signature below is reconstructed so
# the snippet is self-contained, and the name window_average is an assumption.
def window_average(lblA, stepzise):
    # Length of original data: sequence length, plus step size for each row
    # other than the first, i.e. the total number of soundings, accounting
    # for overlapping windows
    nsounding = lblA.shape[1] + (lblA.shape[0] - 1) * stepzise
    # Create an array where each row holds one window's predictions in the
    # columns that window covers; all other columns stay NaN. A sounding's
    # score is then the average over every window that saw it.
    nanA = np.empty((lblA.shape[0], nsounding))
    nanA[:] = np.nan
    for i in range(0, lblA.shape[0]):
        idx = i * stepzise
        nanA[i, idx:idx + lblA.shape[1]] = lblA[i, :, 0]
    assert np.isnan(nanA[-1, -1]), "last element did not get a score"
    return np.nanmean(nanA, axis=0)
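
# A quick sanity check of the NaN-averaging above (toy values, not from the
# original file): with window length 3 and step 1, two windows overlap on the
# middle soundings, so their scores are averaged.
# lblA_toy = np.array([[[0.2], [0.4], [0.6]],
#                      [[0.6], [0.8], [1.0]]])          # shape (2, 3, 1)
# window_average(lblA_toy, 1)  # -> [0.2, 0.5, 0.7, 1.0]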


fname = "../data/stendalmark_20181120_RAW_export.xyz"
df, dbdt, lbl, timestamp, gtimes = load_data2(fname, 8, 24)
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

r = int(np.ceil(0.8 * dbdt.shape[0]))
X_train = dbdt[0:r, :]
X_test = dbdt[r:, :]
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

lbl_train2 = lbl[0:r]
lbl_test = lbl[r:]

w = 30  # window length (soundings per window)
s = 1   # step between consecutive windows
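The listing stops here; w and s feed the windowing step that follows in lstm_bidir.py. A minimal sketch of that step, under the assumption that the scaled soundings are sliced into overlapping windows of length w with step s for the bidirectional LSTM (the helper name is invented):

def build_windows(X, y, w, s):
    # Hypothetical helper: stack overlapping windows of length w, step s,
    # giving the (n_windows, w, n_features) shape an LSTM expects.
    Xw = np.stack([X[i:i + w] for i in range(0, X.shape[0] - w + 1, s)])
    yw = np.stack([y[i:i + w] for i in range(0, y.shape[0] - w + 1, s)])
    return Xw, yw

# X_train_w, y_train_w = build_windows(X_train, lbl_train2, w, s)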
Code example #7
File: driver.py Project: TokeF/MasterThesis
import numpy as np
import matplotlib.pyplot as plt
from utilities.data_reader import load_data2, remove_edge
from utilities import data_visualize
# from SVM.SVM_1 import SVM_classify

# fig, ax = plt.subplots()
# ax.plot([0, 0, 1], [0, 1, 1], "r-", label="Perfect classifier")
# ax.set_ylim([0,1])
# ax.set_xlim([0,1])
# ax.plot([0,1], [0,1], color="tab:gray", linestyle = "-.", label="50% line")
# ax.set_ylabel('True positive rate'); ax.set_xlabel('False positive rate')
# ax.legend()
# plt.show()
# exit()
fname = "../data/vildbjerg_20171101_modified_RAW_export.xyz"
fname1 = "../data/stendalmark_20181121_RAW_export.xyz"
fname0 = "../data/stendalmark_20181120_RAW_export.xyz"

df, dbdt, lbl, timestamp, gtimes = load_data2(fname0, 8, 24)
dbdt, lbl, timestamp = remove_edge(timestamp, dbdt, lbl, 20)
timestamp = (timestamp - timestamp[0]) * 10 ** 5

df0, dbdt0, lbl0, timestamp0, gtimes0 = load_data2(fname1, 8, 24)
dbdt0, lbl0, timestamp0 = remove_edge(timestamp0, dbdt0, lbl0, 20)
timestamp0 = (timestamp0 - timestamp0[0]) * 10 ** 5 + timestamp[-1] + 0.7

dbdt = np.concatenate((dbdt, dbdt0))
lbl = np.concatenate((lbl, lbl0))
timestamp = np.concatenate((timestamp, timestamp0))

print("coupled ",sum(lbl == 0) / dbdt.shape[0])
print("total coupled ",dbdt[lbl==0,:].shape[0])
print("total", dbdt.shape[0])

data_visualize.plotDat(range(0, len(timestamp)), dbdt, lbl)
Code example #8
File: NNSky.py Project: TokeF/MasterThesis
import numpy as np
from keras.layers import Dense
from sklearn.preprocessing import StandardScaler
from NN.window import build_array_skytem
# from neupy.layers import *
# from neupy import algorithms


## Data preprocessing
from utilities.data_reader import load_data2, remove_edge
from utilities.data_visualize import plot_training, plot_misclassified, dist_score

fname2 = "../data/stendalmark_20181122_RAW_export.xyz"
fname1 = "../data/stendalmark_20181121_RAW_export.xyz"
fname0 = "../data/stendalmark_20181120_RAW_export.xyz"
fname = "../data/vildbjerg_20171101_RAW_export.xyz"
df, dbdt, lbl, timestamp, _ = load_data2(fname1, 8, 24)
dbdt, lbl, timestamp = remove_edge(timestamp, dbdt, lbl, 20)

if 1:
    # normalize by sign and log
    dbdt_norm = np.sign(dbdt) * np.log(np.abs(dbdt))
    # split into training and test sets
    r = int(np.ceil(0.8 * dbdt.shape[0]))
    dbdt_train = dbdt_norm[0:r, :]
    # labels and the unscaled test copy are trimmed by one sounding at each
    # edge: the features are later stacked with their immediate neighbours
    # (build_array_skytem, imported above), which drops the first and last
    # sounding of each split
    lbl_train = lbl[1:r-1]
    dbdt_testOG = dbdt[r+1:-1]
    dbdt_test = dbdt_norm[r:]
    lbl_test = lbl[r+1:-1]
    # normalise to zero mean, unit std
    sc = StandardScaler()
    dbdt_train = sc.fit_transform(dbdt_train)