Example #1
def main():
    for filename in os.listdir(PATH_TRAIN):
        os.remove(os.path.join(PATH_TRAIN, filename))
    for filename in os.listdir(PATH_VAL):
        os.remove(os.path.join(PATH_VAL, filename))

    image_to_matrix.convert()
    split_data.split_data()
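Note: the split_data module called above is not shown. A minimal sketch of what it might contain, assuming the converted samples live in a hypothetical SOURCE_DIR and using an assumed validation ratio (these names and the ratio are assumptions, not the original code):

import os
import random
import shutil

SOURCE_DIR = 'data/matrices'   # hypothetical location of the converted samples
PATH_TRAIN = 'data/train'      # must match the constants used in main()
PATH_VAL = 'data/val'
VAL_RATIO = 0.2                # assumed split ratio

def split_data():
    """Shuffle the source files and copy them into the train/val folders."""
    filenames = sorted(os.listdir(SOURCE_DIR))
    random.shuffle(filenames)
    n_val = int(len(filenames) * VAL_RATIO)
    for i, filename in enumerate(filenames):
        target = PATH_VAL if i < n_val else PATH_TRAIN
        os.makedirs(target, exist_ok=True)
        shutil.copy(os.path.join(SOURCE_DIR, filename),
                    os.path.join(target, filename))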
Example #2
def estructure_data(work_dir):
    # os.path.dirname(os.path.realpath(__file__))
    root = os.path.join(work_dir, 'birds')
    if not os.path.exists(root):
        split_data()
    train_dir = os.path.join(root, 'train')
    valid_dir = os.path.join(root, 'valid')
    test_dir = os.path.join(root, 'test')
    return train_dir, valid_dir, test_dir
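A brief usage sketch for the directories returned above, assuming the downstream consumer is a torchvision-style ImageFolder loader (the loader and transform choices are assumptions, not part of the original example):

from torchvision import datasets, transforms

def load_datasets(work_dir):
    # resolve the train/valid/test folders created by estructure_data()
    train_dir, valid_dir, test_dir = estructure_data(work_dir)
    tfm = transforms.Compose([transforms.Resize((224, 224)),
                              transforms.ToTensor()])
    train_ds = datasets.ImageFolder(train_dir, transform=tfm)
    valid_ds = datasets.ImageFolder(valid_dir, transform=tfm)
    test_ds = datasets.ImageFolder(test_dir, transform=tfm)
    return train_ds, valid_ds, test_ds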
Example #3
def split_and_write_data(data, mask, split_method, cell_dim, proportions,
                         data_path, outfile_prefix):

    for method in split_method:
        if method == 'edge':
            sides = ['n', 's', 'e', 'w']
            for side in sides:
                data_split = split.split_data(data, mask, method, cell_dim,
                                              proportions, side)
                write_data(data_split, method, data_path, outfile_prefix, side)
        else:
            data_split = split.split_data(data, mask, method, cell_dim,
                                          proportions)
            write_data(data_split, method, data_path, outfile_prefix)
Example #4
    def __init__(self, dim, size):
        try:
            self.data = pickle.load(open("data_cluster.pickle", "rb"))
            self.clusters = pickle.load(open("clusters.pickle", "rb"))
        except (OSError, IOError) as e:
            # dim centroids for the generation of the synthetic dataset
            # centroids = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]])
            centroids = np.random.randn(4, dim)
            self.centroids = centroids
            data, clusters = sintetic_dataset(centroids, 20, size)
            data = add_nan(data, 0.001)
            train, validation, test, idx_train = split_data(data, 0.7, 0.2)
            pickle.dump(clusters[idx_train], open("clusters.pickle", "wb"))
            pickle.dump(train, open("data_cluster.pickle", "wb"))
            self.data = pickle.load(open("data_cluster.pickle", "rb"))
            self.clusters = pickle.load(open("clusters.pickle", "rb"))

        # pre-processing
        means, self.data = remove_mean_nan(self.data)
        print(self.data.shape[0])
        self.mean = np.mean(self.data, axis=0)
        self.std = np.std(self.data, axis=0)
        self.norm_2 = np.linalg.norm(self.data, axis=0)
        self.exp_col = exponencial_n_samples(0.4, self.data.shape[0])
        self.exp_col = np.reshape(self.exp_col, (self.data.shape[0], 1))
        self.data = np.append(self.data, self.exp_col, axis=1)
def build_regression(districts):
    figure = pl.figure(1)

    dl = len(districts)
    plot_idx = 1
    for district in districts:
        train_set, test_set = split_data(district=district)

        min_max_scaler = preprocessing.MinMaxScaler()
        X_train, y_train = preprocess_data(train_set, min_max_scaler, True)
        X_test, y_test = preprocess_data(test_set, min_max_scaler, False)

        tuned_parameters = [
            {"kernel": ["rbf"], "C": [0.1, 1, 10, 100, 1000], "gamma": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]},
            {"kernel": ["linear"], "C": [0.1, 1, 10, 100, 1000]},
        ]

        # find optimal C and gamma
        grid = GridSearchCV(SVR(C=1, cache_size=400), tuned_parameters)
        grid.fit(X_train, y_train)

        district = district or "All"

        ax = plot_prediction(X_train, y_train, grid, figure, plot_idx, dl)
        plot_idx += 1
        ax.set_title("Support Vector Regression (train set, %s)" % district)

        ax = plot_prediction(X_test, y_test, grid, figure, plot_idx, dl)
        plot_idx += 1
        ax.set_title("Support Vector Regression (test set, %s)" % district)

    pl.show()
def main():
    train_set, test_set = split_data()

    X = np.matrix((
            np.ones(train_set.shape[0]),
            train_set['number_of_rooms'],
            train_set['living_space'])
        ).T

    Y = np.matrix((train_set['price'])).T

    ne_theta = normal_equation.learn(X, Y)
    gd_theta = gradient_descent.learn(X, Y, 0.00015, 100)

    test_set = test_set[['number_of_rooms', 'living_space', 'price']].values

    for rooms, area, price in test_set:
        ne_price = int(round(predict(rooms, area, ne_theta)))
        gd_price = int(round(predict(rooms, area, gd_theta)))

        print('Number of rooms %s, area %s sqm:' % (rooms, area))
        print('actual price: %s EUR' % price)
        print('ne predict: %s EUR (%s%%)' % (
                ne_price, int(100. * ne_price / price)))
        print('gd predict: %s EUR (%s%%)' % (
                gd_price, int(100. * gd_price / price)))
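The normal_equation.learn and predict helpers are external to this snippet. A sketch consistent with the calls above, using the closed-form least-squares solution theta = (X^T X)^(-1) X^T Y with a pseudo-inverse for numerical safety; the predict helper simply mirrors the call in main() and is an assumption:

import numpy as np

def learn(X, Y):
    """Return theta minimizing ||X*theta - Y||^2 via the normal equation."""
    # pinv instead of inv so a rank-deficient X does not raise
    return np.linalg.pinv(X.T * X) * (X.T * Y)

def predict(rooms, area, theta):
    """Predict a price for a flat with the given rooms and living space."""
    x = np.matrix([1.0, rooms, area])
    return float(x * theta)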
def main(args):
    start_time = time.time()

    img_dir = args.data_dir + '/images'
    images = [os.path.join(img_dir, f) for f in os.listdir(img_dir)]
    images = sorted(images)

    mask_dir = args.data_dir + '/labels'
    masks = [os.path.join(mask_dir, f) for f in os.listdir(mask_dir)]
    masks = sorted(masks)

    loop = tqdm(range(len(images)))
    for idx in loop:
        img = cv2.imread(images[idx])
        mask = cv2.imread(masks[idx])

        h, w, _ = img.shape
        rows = h // args.patch_size
        cols = w // args.patch_size

        for i in range(0, rows):
            for j in range(0, cols):
                ymin = i * h // rows
                ymax = i * h // rows + h // rows
                xmin = j * w // cols
                xmax = j * w // cols + w // cols

                roi_img = img[ymin:ymax, xmin:xmax]
                roi_mask = mask[ymin:ymax, xmin:xmax]

                roi_img = cv2.resize(roi_img,
                                     (args.patch_size, args.patch_size),
                                     interpolation=cv2.INTER_CUBIC)
                roi_mask = cv2.resize(roi_mask,
                                      (args.patch_size, args.patch_size),
                                      interpolation=cv2.INTER_CUBIC)

                # index patches by image and tile position so patches of the
                # same image do not overwrite each other
                cv2.imwrite(f'{args.save_dir}/images/{idx:05d}_{i}_{j}.png', roi_img)
                cv2.imwrite(f'{args.save_dir}/labels/{idx:05d}_{i}_{j}.png', roi_mask)

    if args.split_train_val:
        split_data(f'{args.save_dir}/images', args.save_dir, True)

    end_time = time.time() - start_time
    print(f'Done! It took {end_time:.04f} seconds')
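The trailing split_data(images_dir, save_dir, shuffle) call is defined elsewhere; a plausible sketch that writes train/val file lists next to the patches (the 80/20 ratio and the txt output format are assumptions):

import os
import random

def split_data(images_dir, save_dir, shuffle=True, val_ratio=0.2):
    """Write train.txt / val.txt listing the generated patch images."""
    names = sorted(os.listdir(images_dir))
    if shuffle:
        random.shuffle(names)
    n_val = int(len(names) * val_ratio)
    with open(os.path.join(save_dir, 'val.txt'), 'w') as f:
        f.write('\n'.join(names[:n_val]))
    with open(os.path.join(save_dir, 'train.txt'), 'w') as f:
        f.write('\n'.join(names[n_val:]))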
Example #8
def main():
    train_set, test_set = split_data(district='Steglitz')
    #train_set, test_set = split_data()

    X_train = np.matrix([
            np.ones(train_set.shape[0]),
            train_set['number_of_rooms'],
            train_set['living_space']
        ]).T

    Y_train = np.matrix((train_set['price'])).T

    alpha = 0.00015
    n_iterations = 50
    thetas = gradient_descent.learn(X_train, Y_train, alpha, n_iterations, True)
    theta = thetas[-1]

    X_test = np.matrix([
            np.ones(test_set.shape[0]),
            test_set['number_of_rooms'],
            test_set['living_space']
        ]).T
    Y_test = np.matrix((test_set['price'])).T

    test_predictions = np.dot(X_test, theta)
    costs = calc_cost_functions(X_train, Y_train, thetas)

    print('Train error', calc_error(X_train, Y_train, theta))
    print('Test error', calc_error(X_test, Y_test, theta))

    print('Train R squared', r_squared(X_train, Y_train, theta))
    print('Test R squared', r_squared(X_test, Y_test, theta))

    figure = pl.figure(1)
    ax = figure.add_subplot(211)

    ax.scatter(test_set['price'], test_set['living_space'],
            label='test set')
    ax.scatter(train_set['price'], train_set['living_space'], color='g',
            label='train set')
    ax.plot(test_predictions, test_set['living_space'], color='red',
            label='regression')

    ax.set_xlabel('Living space, sqm')
    ax.set_ylabel('Price, EUR')
    ax.legend()

    ax = figure.add_subplot(212)
    ax.scatter(np.arange(len(costs)), costs)
    ax.set_xlabel('Iteration')
    ax.set_ylabel('Cost function')

    pl.show()
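The error helpers used above are not shown; a sketch of what calc_error (mean squared error) and r_squared could look like for the np.matrix inputs used here (the exact cost definition is an assumption):

import numpy as np

def calc_error(X, Y, theta):
    """Mean squared error of the linear model X*theta against Y."""
    residuals = np.asarray(X * theta - Y)
    return float(np.mean(residuals ** 2))

def r_squared(X, Y, theta):
    """Coefficient of determination: 1 - SS_res / SS_tot."""
    y = np.asarray(Y).ravel()
    pred = np.asarray(X * theta).ravel()
    ss_res = np.sum((y - pred) ** 2)
    ss_tot = np.sum((y - np.mean(y)) ** 2)
    return 1.0 - ss_res / ss_tot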
def main():
    x = []
    valid = []
    test = []
    for i in range(2, 68, 5):
        print("Current training set size: ", i)

        shutil.rmtree(TRAIN_DIR, ignore_errors=True)
        shutil.rmtree(TEST_DIR, ignore_errors=True)
        shutil.rmtree(VALIDATION_DIR, ignore_errors=True)

        split_data.split_data(i)

        theta = linear_regression.train()
        x.append(i)
        valid.append(linear_regression.validation(theta))
        test.append(linear_regression.eval(theta))
    plt.plot(x, valid)
    plt.plot(x, test)
    plt.ylabel('Accuracy')
    plt.xlabel('Training Set Size')
    plt.show()
def main():
    split_data.split_data(base_path, input_path, class_names)
    print("[INFO] completed data splitting ")
    
    train_data, train_labels = image_preprocessing(train_dir)
    test_data, test_labels = image_preprocessing(test_dir, 'test')
    print("[INFO] completed data preprocessing ")

    model = build_cnn_model()
    print("[INFO] completed model building ")

    X_train, Y_train = bottleneck_feature_extractor(train_data, train_labels, model)
    X_test, Y_test = bottleneck_feature_extractor(test_data, test_labels, model)
    print("[INFO] completed bottleneck feature extraction ")
    
    if os.path.exists(output_path):
        shutil.rmtree(output_path)
    
    os.makedirs(output_path, mode=0o777)
    
    save_features_labels('train', X_train, Y_train)
    save_features_labels('test', X_test, Y_test)
    print("[INFO] completed saving bottleneck features ")
def split_orthography(trans, name):    
    '''Split transcription into train and test data'''
    train, test = split_data(trans)
    trainfile = codecs.open("%s/%s_train.transcription"  % (ETC, name), 
                             "w", ENC)
    testfile = codecs.open("%s/%s_test.transcription"  % (ETC, name), 
                            "w", ENC)
    for entry in train:
        trainfile.write(entry)
    trainfile.close()
    
    for entry in test:
        testfile.write(entry)
    testfile.close()
    
    print "Data segmented into training and test data"
Example #12
def train_model(spex, subject, date):
    gpus = tensorflow.config.experimental.list_physical_devices('GPU')
    tensorflow.config.experimental.set_memory_growth(gpus[0], True)
    k_folds = 10
    (define_model, epochs, L, Fs, nchan, modelName) = spex

    path = "C:/Users/Kioskar/Desktop/Testing exjobb/Albin_Damir/AD_crop/" + modelName + "/"
    names = glob.glob(path + '*/*', recursive=True)  #os.listdir(path)
    names = [x for x in names if "subj" + str(subject) not in x]
    #print(names)
    np.random.shuffle(names)

    vals = []
    #map = np.zeros([3,120])
    for i in range(0, 1):
        print("Fold number " + str(i + 1) + "!")
        (gen, genVal, trainlen, vallen) = split_data(["A", "B", "C"],
                                                     k_folds,
                                                     i,
                                                     names,
                                                     path,
                                                     spex,
                                                     class_on_char=74)

        checkpoint_path_fold = checkpoint_path + date + "/fold" + str(
            i + 1) + "/cp-{epoch:04d}.ckpt"
        cp_callback = tensorflow.keras.callbacks.ModelCheckpoint(
            filepath=checkpoint_path_fold, save_weights_only=True, verbose=1)

        model = define_model(nchan, L, Fs)
        history = model.fit(gen,
                            validation_data=genVal,
                            steps_per_epoch=trainlen,
                            validation_steps=vallen,
                            epochs=30,
                            callbacks=[cp_callback],
                            verbose=2)
        #heatmap_mean = generate_gradCAM(model,spex,path,gen,trainlen)
        #plt.imshow(np.repeat(heatmap_mean,50,axis=0))
        #plt.show()
        #vals.append(history.history['accuracy'])
    return vals
Example #13
print('Tokenized review: \n', reviews_ints[:1])

# removing outliers
from pre_process import remove_outliers
reviews_ints, encoded_labels = remove_outliers(reviews_ints, encoded_labels)

# padding the sequences to be of equal length
from pre_process import pad_features
seq_length = 200  # can be modified as needed
features = pad_features(reviews_ints, seq_length)

# creating data sets
from split_data import split_data
split_frac = 0.8

train_x, train_y, val_x, val_y, test_x, test_y = split_data(
    features, encoded_labels, split_frac)
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape),
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

# create data loaders
# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(train_x),
                           torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

# data loaders
batch_size = 50
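# The snippet stops right after choosing the batch size; the usual continuation
# (a sketch, assuming standard shuffled PyTorch DataLoaders for each split):
from torch.utils.data import DataLoader

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)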
            names.append(name)
        if name[0:13] == ('C' + subject):
            names.append(name)

    k_folds = 10
    date = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

    val_accs = []
    for i in range(0, k_folds):
        print("Fold number " + str(i + 1) + "!")

        (data_generator, data_generatorVal, l,
         lv) = split_data(['A', 'B', 'C'],
                          k_folds,
                          i,
                          names,
                          path(),
                          nchan,
                          data_aug,
                          batch_size=batch_size)

        tensorboard_callback = load_tensorboard(who, date, i)
        #es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)
        checkpoint_path_fold = checkpoint_path + date + "/fold" + str(
            i + 1) + "/cp-{epoch:04d}.ckpt"
        check_point_dir = os.path.dirname(checkpoint_path_fold)
        cp_callback = tensorflow.keras.callbacks.ModelCheckpoint(
            filepath=checkpoint_path_fold, save_weights_only=True, verbose=1)

        model = define_model(nchan, L, Fs, batch_size=batch_size)
        # Load weights:
        #model.load_weights("C:/Users/Oskar/Documents/GitHub/Exjobb/logs/model_check_points/20210126-143212/fold1/cp-0005.ckpt")
#     min_size=300, max_size=300
# )

# See the model architecture
print(model)

# use our dataset and defined transformations
dataset = Dataset(DATA_DIR, transforms=get_transform(train=True))
dataset_val = Dataset(DATA_DIR, transforms=get_transform(train=False))
dataset_test = Dataset(DATA_DIR, transforms=get_transform(train=False))

# split the dataset into train and validation sets
torch.manual_seed(1)
# get similarly distributed train, val and test sets
sequences, sequenceStats = get_sequence_stats()
training_seq_indices, validation_seq_indices, testing_seq_indices = split_data(
    sequenceStats)

training_indices = seq_indices_to_frame_indices(
    training_seq_indices)  #dataset.ann = load_labels()
validation_indices = seq_indices_to_frame_indices(
    validation_seq_indices)  #dataset.ann = load_labels()
testing_indices = seq_indices_to_frame_indices(
    testing_seq_indices)  #dataset.ann = load_labels()

# not needed anymore indices = torch.randperm(len(dataset)).tolist()
dataset_sub = torch.utils.data.Subset(dataset, training_indices)
dataset_val_sub = torch.utils.data.Subset(dataset_val, validation_indices)
dataset_test_sub = torch.utils.data.Subset(dataset_test, testing_indices)

# define training and validation data loaders
data_loader = torch.utils.data.DataLoader(dataset_sub,
Example #16
#from crossval import *
from implementations import *
from helpers import *
from split_data import split_data
from classification_accuracy import *
from logreg import *
from create_data_with_jet import *
from build_polynomial import *

print("\n", '********************************************')

#%%Import
DATA_TRAIN_PATH = 'C:/Users/joeld/Desktop/EPFL/machine learning/AIAIaie/data/train.csv'
#DATA_TRAIN_PATH = '/Users/benoithohl/Desktop/epfl/master_epfl/Ma3/Machine_learning/AIAIaie/data/train.csv' # TODO: download train data and supply path here
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)
trainx, trainy, ids_train, validationx, validationy, ids_test = split_data(
    tX, y, ids, 0.8, seed=1)
print('Data loaded')

#%% Preprocessing

data_jetnum = np.array(
    create_data_with_jet(trainx, trainy, ids_train, validationx, validationy,
                         ids_test))
print('Data matrix ready')

#%% parameters setting
#partition of the train set

lambda_ = 0.01
degree = 1
max_iters = 5000
Example #17
  
    #Open files for results
    res1=open('ref_res.txt','w')
    res2=open('alt_res.txt','w')

    #Create lists with splicing events
    ref=read_file(file1)
    alt=read_file(file2)

    #Create numpy arrays from the lists of splicing events
    ref=np.array(ref)
    alt=np.array(alt)

    #Scale arrays (for each value subtract mean and then divide by SD)
    ref_scale,alt_scale=scale(ref,alt)

    #Split data into train and test sets
    X_train,X_test=split_data(ref_scale)

    #Perform SVM with different parameters to choose the best one
    y_pred_X_train,y_pred_X_test=svm_data(ref_scale,alt_scale)

    #Writing results
    for i in y_pred_X_train:
         res1.write(str(i)+"\n")
    for j in y_pred_X_test:
         res2.write(str(j)+"\n")

    res1.close()
    res2.close()
Example #18
def main():

    # Locations for audio and midi source files (train and test are subfolders)
    audio_folder = 'audio_files'
    midi_folder = 'midi_files'

    # Subfolder in audio_folder to get wav files from, i.e. 'clean' or 'noise'
    audio_source = 'noise'

    # Locations for input data to tdnn
    tdnn_feat_train = 'data/tdnn/mfcc_feat_train.pkl'
    tdnn_feat_test = 'data/tdnn/mfcc_feat_test.pkl'
    tdnn_target_train = 'data/tdnn/target_train.pkl'
    tdnn_target_test = 'data/tdnn/target_test.pkl'
    target_corpus = 'data/hmm/target_corpus'

    # Locations for output data from tdnn
    tdnn_probs_train = 'data/hmm/train_probs.pkl'
    tdnn_probs_test = 'data/hmm/test_probs.pkl'
    notes_train = 'data/hmm/train_notes.pkl'
    notes_test = 'data/hmm/test_notes.pkl'

    # Path for saved tdnn model and hmm model
    tdnn_model_name = "models/tdnn.h5"
    hmm_model_name = "models/hmm.pkl"

    # Path for resulting midi files from tdnn and hmm
    output_midi_tdnn = 'output_midi/output_midi_tdnn.mid'
    output_midi_hmm = 'output_midi/output_midi_hmm.mid'

    stage = 1

    # Split data into train and test sets
    if stage <= 1:

        print("\nSplitting data...\n")

        wav_in = join(audio_folder, audio_source)
        wav_out = audio_folder
        mid_in = join(midi_folder, 'all')
        mid_out = midi_folder

        split_data(wav_in, wav_out, mid_in, mid_out)

        print("\nFinished splitting data...\n")

    # Create MFCC features for each test and train audio file
    if stage <= 2:

        print("\nGenerating MFCCs...\n")

        src_dir = 'audio_files/train'
        generate_features(src_dir, tdnn_feat_train)

        src_dir = 'audio_files/test'
        generate_features(src_dir, tdnn_feat_test)

        print("\nFinished generating MFCCs...\n")

    # Generate expected pitches for each MFCC from test, train, and corpus midi files
    if stage <= 3:

        print("\nGenerating target pitches...\n")

        src_dir = 'midi_files/train'
        src_audio_dir = 'audio_files/train'
        generate_outputs(src_dir, src_audio_dir, tdnn_target_train)

        src_dir = 'midi_files/test'
        src_audio_dir = 'audio_files/test'
        generate_outputs(src_dir, src_audio_dir, tdnn_target_test)

        # Generate expected pitches for corpus files (we don't need MFCCs for LM)
        src_dir = 'midi_files/corpus'
        src_audio_dir = 'audio_files/clean'
        generate_outputs(src_dir, src_audio_dir, target_corpus)

        print("\nFinished generating target pitches...\n")

    # Train TDNN on MFCCs and target pitch data
    if stage <= 4:

        print("\nTraining time-delay neural network...\n")

        tdnn_train(tdnn_feat_train, tdnn_target_train, tdnn_model_name)

        print("\nFinished training time-delay neural network...\n")

    # Make predictions based on TDNN
    if stage <= 5:

        print("\nPredicting note probabilities using TDNN...\n")

        tdnn_predict(tdnn_model_name, tdnn_feat_train, tdnn_target_train,
                     tdnn_feat_test, tdnn_target_test, output_midi_tdnn)

        print("\nFinished predicting note probabilities using TDNN...\n")

    # Train HMM on TDNN output probabilities
    if stage <= 6:

        print("\nTraining hidden markov model...\n")

        hmm_train(tdnn_probs_train, notes_train, target_corpus,
                  tdnn_target_train, hmm_model_name)

        print("\nFinished training hidden markov model...\n")

    # Decode the HMM on TDNN output probabilities
    if stage <= 7:

        print("\nDecoding hidden markov model...\n")

        hmm_predict(tdnn_probs_test, notes_test, hmm_model_name,
                    output_midi_hmm)

        print("\nFinished decoding hidden markov model...\n")
Example #19
from normalize_data import rescaleNormalize
from split_data import split_data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# read data and normalize it.
dataDf = read_data("sat.csv", [1, 2, 4])
data = rescaleNormalize(dataDf)
data = data.values

# split data
X = np.ones((data.shape[0], data.shape[1]))
X[:, 1:3] = data[:, :2]
Y = data[:, 2]
X_train, X_test, Y_train, Y_test = split_data(X, Y, 0.66)

# final parameters:
ALPHA = 0.05
ITERATIONS = 200

# call GradientDescent function to train the model
theta = np.zeros(X.shape[1])
theta, costList = GradientDescent(X_train, Y_train, theta, ITERATIONS, ALPHA)

# visualize the convergence curve
plt.plot(range(0, len(costList)), costList)
plt.xlabel('iteration')
plt.ylabel('cost')
plt.title('alpha = {}  theta = {}'.format(ALPHA, theta))
plt.show()
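GradientDescent itself is imported from elsewhere; a sketch consistent with the call signature above, doing batch gradient descent on the mean-squared-error cost (the exact cost bookkeeping is an assumption):

import numpy as np

def GradientDescent(X, y, theta, iterations, alpha):
    """Batch gradient descent for linear regression; returns theta and the cost history."""
    m = len(y)
    cost_list = []
    for _ in range(iterations):
        error = X.dot(theta) - y                    # residuals, shape (m,)
        theta = theta - (alpha / m) * X.T.dot(error)
        cost_list.append(float(np.sum(error ** 2) / (2 * m)))
    return theta, cost_list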
from SQL_connection import insert_val


# use these columns as features
# dropped amount_mean_lag7 to avoid errors
feat_merch = ['description', 'transaction_category_name', 'amount', 'state',
              'city', 'transaction_base_type', 'transaction_origin']

df = df_encoder(rng=14,
                spending_report=False,
                plots=False,
                include_lag_features=False)

X_train, X_train_scaled, X_train_minmax, X_test, X_test_scaled, \
    X_test_minmax, y_train, y_test = split_data(df= df,
                                                features = feat_merch,
                                                test_size=0.2,
                                                label='primary_merchant_name')


# convert train data to ndarray to avoid feature_names mismatch error
X_array = X_train.values
y_array = y_train.values
Xt_array = X_test.values
yt_array = y_test.values
# X_train and y_train used to train pipeline
xgb_clf_object = pipeline_xgb(x=X_array,
                              y=y_array,
                              test_features=Xt_array,
                              test_target=yt_array,
                              verb=False)
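The eight-value split_data used above suggests raw, standardized and min-max-scaled copies of each split; a sketch built on scikit-learn, assuming df_encoder has already turned the categorical feature columns into numeric values (the scaler choices and the random_state are assumptions):

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

def split_data(df, features, test_size, label):
    """Split df and return raw, standardized and min-max scaled copies of each split."""
    X, y = df[features], df[label]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=7)

    std = StandardScaler().fit(X_train)
    mm = MinMaxScaler().fit(X_train)

    return (X_train, std.transform(X_train), mm.transform(X_train),
            X_test, std.transform(X_test), mm.transform(X_test),
            y_train, y_test)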
def main():
    if not RUN_DIRTY:
        clean_up()

    if not RUN_DIRTY and not os.path.exists(TRAINING_PATH) and not os.path.exists(TEST_PATH):
        print("Splicing raw data")
        split_data.split_data(RAW_PATH)

    training_labels_file = TRAINING_PATH + "/_label"
    test_labels_file = TEST_PATH + "/_label"

    print("Reading labels")
    training_labels = file_util.read_line_list(training_labels_file)
    test_labels = file_util.read_line_list(test_labels_file)

    training_tokens_path = tokenizer.get_token_path(TRAINING_PATH)
    test_tokens_path = tokenizer.get_token_path(TEST_PATH)

    print("Tokenizing...")
    if not os.path.exists(training_tokens_path):
        print("Tokenizing training set...")
        tokenizer.tokenize_path(TRAINING_PATH)
        print("Training set tokenization complete")

    if not os.path.exists(test_tokens_path):
        print("Tokenizing test set...")
        tokenizer.tokenize_path(TEST_PATH)
        print("Test set tokenization complete")

    print("Reading tokens")
    training_set_tokens = article_util.load_tokenized_articals(training_tokens_path)
    test_set_tokens = article_util.load_tokenized_articals(test_tokens_path)

    print("Training naive bayes")
    naive_bayes = NaiveBayes(training_set_tokens, training_labels)

    print("Validating with training set")

    training_true_positives = 0
    training_false_positives = 0
    training_false_negative = 0
    for i in range(len(training_set_tokens)):
        predictedClass = naive_bayes.classify(training_set_tokens[i], N_OF_WORDS, N_OF_COMMAS)
        if VERBOSE:
            print("Predicted " + training_labels[i] + " as " + predictedClass)
        if predictedClass == training_labels[i]:
            training_true_positives += 1
        else:
            training_false_positives += 1
            training_false_negative += 1

    training_precisions = training_true_positives / (
        training_true_positives + training_false_positives)

    training__recall = training_true_positives / (
        training_true_positives + training_false_negative)
    training_class_f_score = (2 * training_precisions * training__recall) / (
        training_precisions + training__recall)

    print("Training Precision " + str(training_precisions))
    print("Training Recall " + str(training__recall))
    print("Training F-Score " + str(training_class_f_score))

    print("*" * 50)

    print("Validating with test set")
    test_true_positives = 0
    test_false_positives = 0
    test_false_negative = 0
    for i in range(len(test_set_tokens)):
        predictedClass = naive_bayes.classify(test_set_tokens[i], N_OF_WORDS, N_OF_COMMAS)
        if VERBOSE:
            print("Predicted " + test_labels[i] + " as " + predictedClass)
        if predictedClass == test_labels[i]:
            test_true_positives += 1
        else:
            test_false_positives += 1
            test_false_negative += 1

    test_precisions = test_true_positives / (
        test_true_positives + test_false_positives)

    test__recall = test_true_positives / (
        test_true_positives + test_false_negative)
    test_class_f_score = (2 * test_precisions * test__recall) / (
        test_precisions + test__recall)

    print("Test Precision " + str(test_precisions))
    print("Test Recall " + str(test__recall))
    print("Test F-Score " + str(test_class_f_score))
Example #22
    parser.add_argument("--nnz_per_slot",
                        type=int,
                        help="the number of keys in each slot",
                        required=True)
    parser.add_argument("--vocabulary_size",
                        type=int,
                        required=False,
                        default=1024 * 8)
    parser.add_argument("--iter_num",
                        type=int,
                        help="the number of training iterations",
                        required=True)
    parser.add_argument("--filename",
                        type=str,
                        help="the filename used to save the generated datas.",
                        required=False,
                        default=r"./datas.file")
    parser.add_argument("--split_num",
                        type=int,
                        required=True,
                        help="the number of shards to be splited.")
    parser.add_argument("--save_prefix",
                        type=str,
                        required=True,
                        help="the prefix used to save splits.")

    args = parser.parse_args()

    generate_datas(args)
    split_data(args.filename, args.split_num, args.save_prefix)
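The split_data(filename, split_num, save_prefix) helper shards the generated file; a purely illustrative, line-based sketch (the real generator may well write binary records, and the shard naming scheme is an assumption):

def split_data(filename, split_num, save_prefix):
    """Split a text file into split_num roughly equal shards."""
    with open(filename, 'r') as f:
        lines = f.readlines()
    shard_size = (len(lines) + split_num - 1) // split_num
    for shard in range(split_num):
        chunk = lines[shard * shard_size:(shard + 1) * shard_size]
        with open("{}_{}.file".format(save_prefix, shard), 'w') as out:
            out.writelines(chunk)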
        parent = parent.split('_')
        lab = True if lab == '+' else False
        features = make_vector(parent=parent[1],
                               parent_pos=parent[2][0],
                               child=child[1],
                               child_pos=child[2][0],
                               custom={
                                   'Pid': parent[0],
                                   'Cid': child[0]
                               })
        annot_data.append({**features, **{'result': lab}})

# split annotated data on train/validation/holdout
divided = split_data(relations,
                     train=0.65,
                     validation=0.15,
                     holdout=0.2,
                     random_seed=24)
for item in annot_data:
    parent = item['Pid'] + '_' + item['parent'] + '_' + item['parentPos']
    child = item['Cid'] + '_' + item['child'] + '_' + item['childPos']
    item['data'] = divided[(parent, child)]

# feature analysis/selection
if par.fsmi or par.fsce:
    # basic preprocessing
    dfc = pd.DataFrame(annot_data)
    dfc = dfc.drop(columns=['child', 'parent', 'Pid', 'Cid'], axis=1)

    # calculating mutual information
    if par.fsmi:
Example #24
    cd.convert_discontinuous_variable(data_frame)
    #print((data_frame.iloc[:,21:]).head())
    #normalize otherwise it will overflow
    data_frame = (data_frame - data_frame.mean()) / data_frame.std()
    data_frame.rename(columns={"class": "MPG"}, inplace=True)

    data_frame = data_frame.drop(["weight"], axis=1)
    #extract dependent variable from the data
    y = (data_frame["MPG"].values)
    y = y.reshape(398, 1)
    y_column = "MPG"
    X = (data_frame.loc[:, data_frame.columns != "MPG"])
    #X = (X.iloc[:,0:2]).values
    X = X.values

    train_x, train_y, test_x, test_y = sp.split_data(X, y)

    X_column = (data_frame.loc[:, data_frame.columns != "MPG"])
    x_column = X_column.columns.values
    #add intercept column
    X = gd.add_y_intercept(X)
    theta = np.matrix(np.zeros([train_x.shape[1], 1]))

    #set hyper parameters
    alpha = 0.001
    iters = 10000

    g, cost = gd.gradient_descent(train_x, train_y, theta, alpha, iters)

    x_column = x_column.reshape(24, 1)
Example #25
def train_model(spex, subject, date, pretrain=False):
    batch_size = 1
    k_folds = 10
    (define_model, epochs, L, Fs, nchan, modelName) = spex

    path = "C:/Users/Kioskar/Desktop/Testing exjobb/EmoDecode1/Study/" + modelName + "/subj" + str(
        subject) + "/"
    path2 = "C:/Users/Kioskar/Desktop/Testing exjobb/EmoDecode1/Retrieval/" + modelName + "/subj" + str(
        subject) + "/"
    names = os.listdir(path)
    np.random.shuffle(names)

    vals = []
    confusion_matrix_F = np.zeros([k_folds, 3, 3])
    confusion_matrix_S = np.zeros([k_folds, 3, 3])

    for i in range(0, k_folds):
        gpus = tensorflow.config.experimental.list_physical_devices('GPU')
        print(gpus)
        print(tensorflow.__version__)
        tensorflow.config.experimental.set_memory_growth(gpus[0], True)
        print(tensorflow.config.experimental.get_memory_growth(gpus[0]))

        print("Fold number " + str(i + 1) + "!")

        (gen, genVal, trainlen, vallen,
         val_names) = split_data(["A", "B", "C"],
                                 k_folds,
                                 i,
                                 names,
                                 path,
                                 spex,
                                 batch_size=batch_size)
        checkpoint_path_fold = checkpoint_path + date + "/fold" + str(
            i + 1) + "/cp-{epoch:04d}.ckpt"
        #cp_callback = tensorflow.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path_fold,save_weights_only=True,verbose=1)
        model = define_model(nchan, L, Fs)
        #model.load_weights("C:/Users/Kioskar/Documents/GitHub/Exjobb/logs/model_check_points/20210311-183357/fold1/cp-0042.ckpt") #Subj03
        history = model.fit(
            gen,
            validation_data=genVal,
            steps_per_epoch=int(trainlen / batch_size) + 1,
            validation_steps=int(vallen / batch_size) + 1,
            epochs=epochs,
            #callbacks=[cp_callback],
            verbose=2)
        vals.append(history.history['val_accuracy'])

        #heatmap_mean = generate_gradCAM(model,spex,path)
        #plt.imshow(np.repeat(heatmap_mean,50,aixs=0))
        #plt.show()
        labels = np.zeros([vallen, 3])
        labels_FS = np.zeros(vallen)
        for j in range(0, vallen):
            (_, labels[j, :]) = next(genVal)
            if val_names[j][4] == "F":
                labels_FS[j] = 1
        print(labels_FS)
        val_preds = np.argmax(model.predict(genVal, steps=vallen), axis=1)
        confusion_matrix_F[i, :, :] = tensorflow.math.confusion_matrix(
            np.argmax(labels, axis=1)[labels_FS == 1],
            val_preds[labels_FS == 1])
        print(np.argmax(labels, axis=1)[labels_FS == 0])
        print(val_preds[labels_FS == 0])
        confusion_matrix_S[i, :, :] = tensorflow.math.confusion_matrix(
            np.argmax(labels, axis=1)[labels_FS == 0],
            val_preds[labels_FS == 0])

        print(confusion_matrix_F[i, :, :])
        print(confusion_matrix_S[i, :, :])
        names2 = os.listdir(path2)
        (gen2, _, trainlen2, _, _) = split_data(["A", "B", "C"],
                                                100,
                                                0,
                                                names2,
                                                path2,
                                                spex,
                                                batch_size=batch_size)
        history2 = model.evaluate(gen2, steps=trainlen2)
        print(history2)

        del history
        del model
        tensorflow.keras.backend.clear_session()
        tensorflow.compat.v1.reset_default_graph()

        def limit_mem():
            # tf.config.experimental has no get_session(); use the TF1-compat Keras session instead
            tensorflow.compat.v1.keras.backend.get_session().close()

        #limit_mem()
    print("final conf")
    print(np.mean(confusion_matrix_F, axis=0))
    print(np.mean(confusion_matrix_S, axis=0))
    return vals
Example #26
    y_fn = directory + '/training_solutions_rev1.csv'
    y = np.genfromtxt(y_fn, delimiter=',')
    y = y[1:, :]

    print('Finished loading input after ' + str(time.time() - start) + 'sec')

    return y


if __name__ == '__main__':

    # define input arguments
    directory = '/Users/Karen_Loscocco/Desktop/galaxy-zoo-the-galaxy-challenge'
    size = 424
    trim = 100
    testsize = 0.33
    randnum = 42

    X = get_X_train(directory, size, trim)

    y = get_y_train(directory)

    X_train, X_test, y_train, y_test = split_data.split_data(
        X, y, testsize, randnum)

    np.save('X_train', X_train)
    np.save('X_test', X_test)

    np.save('y_train', y_train)
    np.save('y_test', y_test)
Example #27
        if word in message_words:
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_not_spam)
        else:
            log_prob_if_spam += math.log(1.0 - prob_if_spam)
            log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)

    prob_if_spam = math.exp(log_prob_if_spam)
    prob_if_not_spam = math.exp(log_prob_if_not_spam)
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)


#Use the SpamAssassin dataset

#replace with your path
path = r"C:\spam\*\*"
data = []

for fn in glob.glob(path):
    is_spam = "ham" not in fn
    with open(fn, 'r') as file:
        for line in file:
            if line.startswith("Subject:"):
                subject = re.sub(r"^Subject: ", "", line).strip()
                data.append((subject, is_spam))

random.seed(0)
train_data, test_data = split_data(data, 0.75)
classifier = NaiveBayesClassifier()
classifier.classify(train_data)
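split_data(data, 0.75) here is the usual random fraction split over (subject, is_spam) pairs; a sketch of that helper (implementation details assumed):

import random

def split_data(data, prob):
    """Split data into two lists: roughly a fraction prob for training, the rest for testing."""
    train, test = [], []
    for row in data:
        (train if random.random() < prob else test).append(row)
    return train, test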
Example #28
def main():
    # Parse the training argument
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--training',
        help=
        'Decide whether to train the model or just run testing on previously saved model.'
    )
    args = parser.parse_args()

    is_training = args.training

    if (is_training is None) or (is_training == 'True'):
        is_training = True
    else:
        is_training = False
    print('is_training  mode = ', is_training)

    chamferDist = ChamferDistance()

    # Decide on GPU or CPU
    if torch.cuda.is_available():
        gpu_or_cpu = torch.device('cuda')
    else:
        gpu_or_cpu = torch.device('cpu')

    # Training Configuration


#     image_root = "./../../../datasets/cs253-wi20-public/ShapeNetRendering/"
#     point_cloud_root = "./../../../datasets/cs253-wi20-public/ShapeNet_pointclouds/"
    image_root = "/datasets/cs253-wi20-public/ShapeNetRendering/"
    point_cloud_root = "/datasets/cs253-wi20-public/ShapeNet_pointclouds/"

    num_epochs = 1000
    batch_size = 64
    shuffle = True
    num_workers = 8
    use_2048 = True
    img_size = 227  # I don't know why, but this has to be 227!
    learning_rate = 1e-4
    num_points = 2048
    transform = transforms.Compose([
        transforms.Resize(img_size, interpolation=2),
        transforms.CenterCrop(img_size),
        transforms.ToTensor()
    ])
    # Checkpoint
    use_checkpoint = False

    # Split and Get data. Override the saved files if you change the ratios.
    train_ratio = 0.8
    val_ratio = 0.1
    test_ratio = 0.1

    split_data(train_ratio, val_ratio, test_ratio, overrideFiles=False)

    path_train = 'train_data.txt'
    path_val = 'val_data.txt'
    path_test = 'test_data.txt'

    train_data = read_from_file(path_train)
    val_data = read_from_file(path_val)
    test_data = read_from_file(path_test)

    # Data loader
    train_data_loader = get_loader(image_root, point_cloud_root, train_data,
                                   use_2048, transform, batch_size, shuffle,
                                   num_workers)

    val_data_loader = get_loader(image_root, point_cloud_root, val_data,
                                 use_2048, transform, batch_size, shuffle,
                                 num_workers)
    test_data_loader = get_loader(image_root, point_cloud_root, test_data,
                                  use_2048, transform, batch_size, shuffle,
                                  num_workers)

    print('Len of train loader = ', len(train_data_loader))

    # create model
    print("model building...")
    model = pic2points(num_points=num_points)
    model.to(device=gpu_or_cpu)

    if is_training:
        # Train
        print('Starting training...')
        train_losses, val_loss, best_model = train(
            model,
            train_data_loader,
            val_data_loader,
            chamferDist,
            model_name="Baseline_DL_Vis",
            num_epochs=num_epochs,
            lr=learning_rate,
            use_checkpoint=use_checkpoint)
    else:
        best_model = torch.load('best-Baseline_DL_Vis.pt')
        print('Loaded previously saved model.')

    model = best_model.cuda()
    model.eval()

    # Compute chamfer distance on Pix3D dataset.
    img_path = "/datasets/cs253-wi20-public/pix3d/"
    pc_path = "/datasets/cs253-wi20-public/pix_pointclouds/"

    objects = ['table', 'sofa']

    test_dataset = TestDataset(img_path, pc_path, objects)

    test_data_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                                   batch_size=1,
                                                   shuffle=True,
                                                   num_workers=8)

    print('Starting testing on Pix3D dataset...')
    total_test_loss = 0.
    # Get loss on training data.
    with torch.no_grad():
        for i, (image, point_cloud) in enumerate(test_data_loader):

            image, point_cloud = Variable(image), Variable(point_cloud)

            #         print(image.size())
            if (image.size(1) != 3):
                continue
    #         print('reaching.')

            image, point_cloud = image.float().to(
                device=gpu_or_cpu), point_cloud.float().to(device=gpu_or_cpu)
            pred = model(image)
            dist1, dist2 = chamferDist(pred, point_cloud)
            loss = (torch.mean(dist1)) + (torch.mean(dist2))
            #             emd_cost = torch.sum(dist(pred.cuda().double(), points.cuda().double()))
            total_test_loss += loss.item()

            #         print(total_test_loss)
            #         break

            if i % 100 == 0:
                print('Batch ' + str(i) + ' finished.')

    print('Chamfer distance on Pix3D dataset = ',
          total_test_loss / len(test_data_loader))
Example #29
from split_data import split_data
if __name__ == '__main__':
    ## test on split_data func
    data_name = 'ring'
    data_path = 'data/{}.csv'.format(data_name)
    num_folds = 5
    split_data(data_path, data_name, num_folds)
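split_data(data_path, data_name, num_folds) presumably shuffles the CSV and writes one file per fold; a sketch with pandas (the output location and naming are assumptions):

import pandas as pd

def split_data(data_path, data_name, num_folds):
    """Shuffle the CSV at data_path and write num_folds fold files."""
    df = pd.read_csv(data_path).sample(frac=1.0, random_state=0).reset_index(drop=True)
    fold_size = (len(df) + num_folds - 1) // num_folds
    for fold in range(num_folds):
        chunk = df.iloc[fold * fold_size:(fold + 1) * fold_size]
        chunk.to_csv('data/{}_fold{}.csv'.format(data_name, fold), index=False)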
"""Creates the Ancient_Greek_ML dataset and then prepares the train, dev and test sets for the character-level BERT."""
from clean_data import clean_data
from sentence_tokenization import sentence_tokenize_corpus
from split_data import split_data
import os

os.chdir("../data")
clean_data()
sentence_tokenize_corpus()
split_data()
def train_test(
        data, instance_testing_size, 
        forecast_horizon, feature_or_covariate_set, 
        history_length, model='knn', base_models=None,
        model_type='regression', model_parameters=None, 
        feature_scaler='logarithmic', target_scaler='logarithmic', 
        labels=None, performance_measures=['MAPE'], 
        performance_mode='normal', performance_report=True, 
        save_predictions=True, verbose=0):
    
    
    """
    Parameters:
        data:    Pandas DataFrame
            a preprocessed DataFrame to be used for training the model and making predictions on the test part
        
        instance_testing_size:    int or float
            the size of testing instances
        
        forecast_horizon:    int
            forecast horizon used for gap consideration in the data splitting process; by the gap, we mean the number of
            temporal units excluded from the data to simulate the real prediction situation, in which we do not have access
            to the information of the forecast horizon - 1 units preceding the time point of the target variable.

        feature_or_covariate_set:    list<string>
            a list of covariates or features on which the feature selection process will be based; if historical data is
            provided, the input will be considered as a feature list, otherwise as a covariate list

        history_length:    int
            history length of the input "data", history length is just used for the reports in "train_test"

        model:    string or callable or dict
            string: one of the pre-defined model names 
            function: a user-defined function
            dict: pre-defined model names and corresponding hyper parameters
            pre-defined model names: 'knn', 'nn' , 'gbm', 'glm'

        model_type:    string

        model_parameters:    list<int> or None

        feature_scaler:    string

        target_scaler:    string

        labels:    list<int> or None

        performance_measures:    list<string>
            a list of performance measures that the user wants to calculate the errors on predictions of test dataset 
        
        performance_mode:    string

        performance_report:    bool
            if True, some tables containing a report on models and their corresponding errors (based on performance_measurements) 
            will be saved in the same directory
        
        save_predictions:    bool
            if True, the prediction values of the trained models for the training and validation data produced through the
            train_and_evaluate process will be saved in '.csv' format in the directory your program is running in
        
        verbose:    int
            the level of produced detailed logging information
            available options:
            0: no logging
            1: only important information logging 
            2: all details logging


    Returns:
        model:    string or callable or dict
            exactly same as the 'model' parameter

        model_parameters:    list<int>
    """

    warnings.filterwarnings("once")

    ################################ checking for TypeError and other possible mistakes in the inputs
    if not(isinstance(data, pd.DataFrame)):
        raise TypeError("Expected a pandas DataFrame for data.")

    if not(isinstance(instance_testing_size, int) or isinstance(instance_testing_size, float)):
        raise TypeError("Expected an integer or a float number for instance_testing_size.")
    
    if not(isinstance(forecast_horizon, int)):
        raise TypeError("Expected an integer for forecast_horizon.")
    
    if not(isinstance(feature_or_covariate_set, list)):
        raise TypeError("Expected a list of strings for feature_or_covariate_set.")
    
    if not(isinstance(history_length, int)):
        raise TypeError("Expected an integer for history_length.")
    
    if not(isinstance(model, str) or callable(model) or isinstance(model, dict)):
        raise TypeError("Expected a string or function or a dictionary of model parameters for model.")
    
    if not(isinstance(model_type, str)):
        raise TypeError("Expected a string for model_type.")
    
    if not(isinstance(model_parameters, dict) or model_parameters == None):
        raise TypeError("Expected a dictionary or None value for model_parameters.")
    
    if not(isinstance(feature_scaler, str) or feature_scaler == None):
        raise TypeError("Expected a string or None value for feature_scaler.")
    
    if not(isinstance(target_scaler, str) or target_scaler == None):
        raise TypeError("Expected a string or None value for target_scaler.")

    if not(isinstance(labels, list) or labels == None):
        raise TypeError("Expected a list or None value for labels.")
    
    if not(isinstance(performance_measures, list)):
        raise TypeError("Expected a list for performance_measures.")
    
    if not(isinstance(performance_mode, str)):
        raise TypeError("Expected a string for performance_mode.")
    
    if not(isinstance(performance_report, bool)):
        raise TypeError("Expected a bool variable for performance_report.")
    
    if not(isinstance(save_predictions, bool)):
        raise TypeError("Expected a bool variable for save_predictions.")
    
    if not(isinstance(verbose, int)):
        raise TypeError("Expected an integer (0 or 1 or 2) for verbose.")
    ################################

    # classification checking
    if model_type == 'classification':
        if not set(performance_measures) <= set(configurations.CLASSIFICATION_PERFORMANCE_MEASURES):
            raise Exception("Error: The input 'performance_measures' is not valid according to 'model_type=classification'.")
        if performance_mode != 'normal':
            performance_mode = 'normal'
            print("Warning: The input 'performance_mode' is set to 'normal' according to model_type=classification'.")
        if target_scaler is not None:
            target_scaler = None
            print("Warning: The input 'target_scaler' is set to None according to model_type=classification'.")

    # get some information of the data
    target_mode, target_granularity, granularity, data = get_target_quantities(data=data.copy())
    
    # get the target temporal id from temporal id
    # if target temporal id is already in the data, call is from inside the predict function
    # otherwise backup file must be removed
    if 'target temporal id' in data.columns:
        data = data.rename(columns={'target temporal id':'temporal id'})
    else:
        data, _ = get_target_temporal_ids(temporal_data = data.copy(), forecast_horizon = forecast_horizon,
                                               granularity = granularity)
        if os.path.isfile('test_process_backup.csv'):
            os.remove('test_process_backup.csv')
    
    # check rows related to future prediction are removed and if not then remove them
    temp_data = data.sort_values(by = ['temporal id','spatial id']).copy()
    number_of_spatial_units = len(temp_data['spatial id'].unique())
    if all(temp_data.tail(granularity*forecast_horizon*number_of_spatial_units)['Target'].isna()):
        data = temp_data.iloc[:-(granularity*forecast_horizon*number_of_spatial_units)]
    
    # check if model is a string or function
    model_name = ''
    if isinstance(model, str) == False:
        model_name = model.__name__
        if model_name in ['nn', 'knn', 'glm', 'gbm']:
            raise TypeError("Name of the user defined model matches the name of one of our predefined models.")
    else:
        model_name = model

    # find labels for classification problem
    if labels == None:
        if model_type == 'regression':    # just an empty list
            labels = []
        elif model_type == 'classification':    # unique values in 'Target' column of data
            labels = data.Target.unique()
            labels.sort()

    # select features
    processed_data = select_features(
        data=data.copy(), 
        ordered_covariates_or_features=feature_or_covariate_set
    )

    # splitting data in the way is set for train_test
    training_data, _, testing_data, gap_data = split_data(
        data=processed_data.copy(), 
        splitting_type='instance', 
        instance_testing_size=instance_testing_size, 
        instance_validation_size=None, 
        instance_random_partitioning=False, 
        fold_total_number=0, 
        fold_number=0, 
        forecast_horizon=forecast_horizon,         
        granularity=granularity, 
        verbose=verbose
    )

    # separate some data which are needed later
    base_data = training_data['Target'].values.tolist()
    training_target = training_data[['spatial id', 'temporal id', 'Target', 'Normal target']]
    test_target = testing_data[['spatial id', 'temporal id', 'Target', 'Normal target']]

    # scaling data
    training_data, testing_data = data_scaling(
        train_data=training_data.copy(), 
        test_data=testing_data.copy(), 
        feature_scaler=feature_scaler, 
        target_scaler=target_scaler
    )

    # training model with processed data    
    training_predictions, testing_predictions, trained_model, number_of_parameters = inner_train_evaluate(
        training_data=training_data.copy(), 
        validation_data=testing_data.copy(), 
        model=model, 
        model_type=model_type, 
        model_parameters=model_parameters, 
        labels=labels, 
        base_models = base_models,
        verbose=verbose
    )

    # target descale
    training_predictions = target_descale(
        scaled_data=list(training_predictions), 
        base_data=base_data, 
        scaler=target_scaler
    )

    testing_predictions = target_descale(
        scaled_data=list(testing_predictions), 
        base_data=base_data, 
        scaler=target_scaler
    )

    # checking for the existence of some files which will be used in the next phases
    test_process_backup_file_name = 'test_process_backup.csv'
    if pathlib.Path(test_process_backup_file_name).is_file() == False:
        if model_type == 'regression':
            df = pd.DataFrame(columns=['spatial id', 'temporal id', 'Target', 'Normal target', 'prediction'])
        elif model_type == 'classification':
            df = pd.DataFrame(columns=['spatial id', 'temporal id', 'Target', 'Normal target']+\
                              ['prediction class '+str(class_num) for class_num in range(np.array(testing_predictions).shape[1])])
        df.to_csv(test_process_backup_file_name, index=False)


    # getting back previous points (useful for one-by-one method, also works for one-as-whole method)
    previous_test_points = pd.read_csv(test_process_backup_file_name)

    # append current point to previous points
    test_target = test_target.append(previous_test_points[['spatial id', 'temporal id', 'Target', 'Normal target']], ignore_index=True)
    if model_type == 'regression':
        previous_testing_predictions = previous_test_points['prediction'].tolist()
        testing_predictions = list(testing_predictions) + previous_testing_predictions
    elif model_type == 'classification':
        previous_testing_predictions = previous_test_points.filter(regex='^prediction class ',axis=1)
        testing_predictions = np.concatenate((np.array(testing_predictions),np.array(previous_testing_predictions)))
        testing_predictions_df = pd.DataFrame(testing_predictions)
        testing_predictions_df.columns = ['prediction class '+str(class_num) for class_num in testing_predictions_df.columns]

    # saving test_target+testing_predictions into a backup file to be used in the next point
    df_for_backup = test_target.copy()
    if model_type == 'regression':
        df_for_backup.insert(loc=len(df_for_backup.columns), column='prediction', value=testing_predictions)
    elif model_type == 'classification':
        df_for_backup = pd.concat([df_for_backup,testing_predictions_df],axis = 1)
    df_for_backup.to_csv(test_process_backup_file_name, index=False)

    # get normal data
    training_target, test_target, training_prediction, test_prediction = get_normal_target(
        training_target=training_target.append(gap_data[['spatial id', 'temporal id', 'Target', 'Normal target']], ignore_index=True), 
        test_target=test_target.copy(), 
        training_prediction=list(training_predictions) + gap_data['Target'].tolist(), 
        test_prediction=testing_predictions, 
        target_mode=target_mode, 
        target_granularity=target_granularity
    )

    # make a copy of some data to be stored later
    test_target_normal, test_prediction_normal = test_target.copy(), test_prediction.copy()

    # including performance_mode
    training_target, test_target, training_prediction, test_prediction = apply_performance_mode(
        training_target=training_target.copy(), 
        test_target=test_target.copy(), 
        training_prediction=list(training_prediction), 
        test_prediction=test_prediction, 
        performance_mode=performance_mode
    )

    # computing trivial values for the test set (just when want to calculate MASE)
    if 'MASE' in performance_measures:
        _, _, _, testing_true_values, testing_predicted_values, testing_trivial_values = get_trivial_values(
            train_true_values_df=training_target.copy(), 
            validation_true_values_df=test_target.copy(), 
            train_prediction=list(training_prediction), 
            validation_prediction=test_prediction, 
            forecast_horizon=forecast_horizon, 
            granularity=granularity
        )

        # computing performance on the test dataset
        test_prediction_errors = performance(
            true_values=testing_true_values, 
            predicted_values=testing_predicted_values, 
            performance_measures=performance_measures, 
            trivial_values=testing_trivial_values, 
            model_type=model_type, 
            num_params=number_of_parameters, 
            labels=labels)

    else:
        # computing performance on the test dataset
        test_prediction_errors = performance(
            true_values=test_target['Normal target'], 
            predicted_values=test_prediction, 
            performance_measures=performance_measures, 
            trivial_values=[], 
            model_type=model_type, 
            num_params=number_of_parameters, 
            labels=labels)
    
    # checking for the existence of some directories for logging purposes
    if pathlib.Path('prediction/test process').is_dir() == False:
        pathlib.Path('prediction/test process').mkdir(parents=True, exist_ok=True)
    if pathlib.Path('performance/test process').is_dir() == False:
        pathlib.Path('performance/test process').mkdir(parents=True, exist_ok=True)

    # saving predictions based on model_type
    pred_file_name = 'prediction/test process/test prediction forecast horizon = %s.csv' % (forecast_horizon)
    testing_predictions = np.array(testing_predictions)

    if save_predictions == True:
        if model_type == 'regression':
            df = pd.DataFrame()
            df['real'] = test_target_normal['Normal target'].values.tolist()
            df['prediction'] = list(test_prediction_normal)
            df.insert(0, 'temporal id', test_target_normal['temporal id'].values.tolist(), True)
            df.insert(0, 'spatial id', test_target_normal['spatial id'].values.tolist(), True)
            df.insert(0, 'model name', model_name, True)
            df.to_csv(pred_file_name, index=False)
        elif model_type == 'classification':
            df = pd.DataFrame()
            df['real'] = test_target_normal['Normal target'].values.tolist()
            for i in range(len(labels)):
                col_name = 'class ' + str(labels[i])
                df[col_name] = testing_predictions[:, i]
            df.insert(0, 'temporal id', test_target_normal['temporal id'].values.tolist(), True)
            df.insert(0, 'spatial id', test_target_normal['spatial id'].values.tolist(), True)
            df.insert(0, 'model name', model_name, True)
            df.to_csv(pred_file_name, index=False)
    
    # saving performance (same approach for both regression and classification)
    performance_file_name = 'performance/test process/test performance report forecast horizon = %s.csv' % (forecast_horizon)

    # selecting temporal and futuristic features or covariates from the feature_or_covariate_set list
    check_list = [item for item in feature_or_covariate_set if item.count(' ') != 0]

    # type_flag for detecting feature type (False) or covariate type (True)
    # check if all elements in check_list meet the condition for being covariate type
    type_flag = all(re.search(' t$', element) or re.search(' t[+]$', element) for element in check_list)
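    # e.g. a hypothetical name like 'temperature t' or 'temperature t+' ends with ' t'
    # or ' t+' and is therefore treated as a covariate; names without a space are features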

    processed_feature_or_covariate_set = []    # a list to be saved in performance report file

    if type_flag:
        for item in feature_or_covariate_set:
            if item.count(' ') != 0:
                processed_feature_or_covariate_set.append(item[:-2])
            else:
                processed_feature_or_covariate_set.append(item)
    else:
        processed_feature_or_covariate_set = feature_or_covariate_set.copy()
        
    if performance_report:
        df_data = {
            'model name': list([model_name]), 
            'history length': list([history_length]), 
            'feature or covariate set': ', '.join(processed_feature_or_covariate_set)
        }
        df = pd.DataFrame(df_data, columns=list(df_data.keys()))
        for i in range(len(performance_measures)):
            df[performance_measures[i]] = list([float(test_prediction_errors[i])])
        df.to_csv(performance_file_name, index=False)
    
    return trained_model
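
# Hedged usage sketch (not from the source): reading back the files this routine writes,
# assuming a regression run; the forecast horizon below is a hypothetical example value.
import pandas as pd

horizon = 3
preds = pd.read_csv('prediction/test process/test prediction forecast horizon = %s.csv' % horizon)
report = pd.read_csv('performance/test process/test performance report forecast horizon = %s.csv' % horizon)
print(preds[['spatial id', 'temporal id', 'real', 'prediction']].head())
print(report)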
#%% Import
print('Data is loading')
DATA_TRAIN_PATH = '/Users/benoithohl/Desktop/epfl/master_epfl/Ma3/Machine_learning/AIAIaie/data/train.csv'  # TODO: download train data and supply path here
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)
print('Data loaded')

#%% Preprocessing
Data = remove_features_with_too_many_missing_values(tX, 0.66)
Data = replace_missing_values_with_global_mean(Data)
ZData = Z_score_of_each_feature(Data)
print('Data matrix ready')

#%% parameters setting
#partition of the train set
trainx, trainy, validationx, validationy = split_data(ZData, y, 0.75, seed=1)
initial_w = np.zeros(trainx.shape[1])
max_iters = 100
gamma = 0.1
batch_size = 10
lambdas_vector = np.logspace(-3, 0, num=15)
#lambdas_vector = np.linspace(0, 1, num=15)
print('parameters set', "\n")

#%% Ridge
performance_ridge = []
performance_training = []
for lambda_ in lambdas_vector:
    w = ridge_regression(trainy, trainx, lambda_)
    weights = np.asarray(w)
    y_pred = predict_labels(weights, validationx)
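    # Hypothetical continuation (not in the original snippet): one way the loop body might
    # record accuracy, assuming predict_labels returns labels comparable with validationy.
    performance_ridge.append(np.mean(y_pred == validationy))
    performance_training.append(np.mean(predict_labels(weights, trainx) == trainy))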
Example #33
0
 def on_created(self, event):
     print("[{}] noticed: [{}] on: [{}] ".format(time.asctime(),
                                                 event.event_type,
                                                 event.src_path))
     split_data(event.src_path)
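
# Minimal wiring sketch (not part of the snippet above): attaching a handler with an
# on_created method to a watchdog Observer; NewFileHandler and the watched path are hypothetical.
from watchdog.events import FileSystemEventHandler
from watchdog.observers import Observer

class NewFileHandler(FileSystemEventHandler):
    # the on_created method from the snippet above would live here
    pass

observer = Observer()
observer.schedule(NewFileHandler(), path='./incoming', recursive=False)
observer.start()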
Example #34
0
def train_model(spex, subject, date, pretrain=False):
    batch_size = 1
    k_folds = 10
    (define_model, epochs, L, Fs, nchan, modelName) = spex

    path = "C:/Users/Kioskar/Desktop/Testing exjobb/Albin_Damir/all_subj_crop/" + modelName + "/subj" + str(
        subject) + "/"
    #path2 = "C:/Users/Kioskar/Desktop/Testing exjobb/Albin_Damir/all_subj_crop/" + modelName + "/subj" + str(subject) + "/"
    names = os.listdir(path)

    #	classes = ["A","B","C"]
    k = k_folds
    #	list_names = [[],[],[]]
    #	i = 0
    #	for c in classes:
    #		the_names = [idx for idx in names if idx[0].lower() == c.lower()]
    #		list_names[i] = np.array_split(the_names,k)
    #		i = i + 1
    #	def methodToLoad(files,path,spex,batch_size=1):
    #		(_,_,L,Fs,nchan,modelName) = spex
    #		train_0 = np.zeros([batch_size,Fs,L,nchan])
    #		for i,imID in enumerate(files):
    #			spec = np.loadtxt(path+imID,delimiter=',')
    #			spec = np.reshape(spec,[nchan,L,Fs])
    #			spec = np.transpose(spec,[2,1,0])
    #			train_0[i,:,:,:] = spec
    #		return train_0
    #	files  = [[],[],[]]
    #	for i in range(0,3):
    #		for the_names in list_names[i]:
    #			files[i].append(methodToLoad(the_names,path,spex,batch_size=len(the_names)))

    np.random.shuffle(names)

    vals = []
    #tracker = SummaryTracker()
    #vals_pretrain = []
    confusion_matrix = np.zeros([k_folds, 3, 3])
    for i in range(0, k_folds):
        gpus = tensorflow.config.experimental.list_physical_devices('GPU')
        tensorflow.config.experimental.set_memory_growth(gpus[0], True)

        print("Fold number " + str(i + 1) + "!")
        #fold_files = []
        #fold_files_val = []
        #class_names = []
        #class_names_val = []
        #for j in range(0,3):
        #	class_names.extend(np.hstack(np.delete(list_names[j], i, 0)).transpose())
        #	class_names_val.extend(list_names[j][i])
        #	fold_files.extend(np.delete(files[j], i, 0))
        #	fold_files_val.extend(files[j])
        #fold_files = np.vstack(fold_files)
        #fold_files_val = np.vstack(fold_files_val)
        #genVal = signalLoader(class_names_val,fold_files_val,path,spex,batch_size=batch_size,class_on_char=0)
        #gen = signalLoader(class_names,fold_files,path,spex,batch_size=batch_size,class_on_char=0)
        #print(class_names)
        #print(class_names_val)
        #trainlen = len(class_names)
        #vallen = len(class_names_val)
        #print(vallen)
        (gen, genVal, trainlen, vallen) = split_data(["A", "B", "C"],
                                                     k_folds,
                                                     i,
                                                     names,
                                                     path,
                                                     spex,
                                                     batch_size=batch_size)
        checkpoint_path_fold = checkpoint_path + date + "/fold" + str(
            i + 1) + "/cp-{epoch:04d}.ckpt"
        #cp_callback = tensorflow.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path_fold,save_weights_only=True,verbose=1)
        model = define_model(nchan, L, Fs)
        #model.load_weights("C:/Users/Kioskar/Documents/GitHub/Exjobb/logs/model_check_points/20210311-183357/fold1/cp-0042.ckpt") #Subj03
        #model.load_weights("C:/Users/Kioskar/Documents/GitHub/Exjobb/logs/model_check_points/20210312-095405/fold1/cp-0051.ckpt") #Subj01
        #if pretrain :
        #	model.load_weights("C:/Users/Kioskar/Documents/GitHub/Exjobb/logs/model_check_points/"+date+"/fold1/cp-0030.ckpt") #for transfer algorithm.
        #history_pretrain = model.evaluate(gen,steps=trainlen,verbose=2)
        #print(history_pretrain)
        #vals_pretrain.append(history_pretrain[1])
        history = model.fit(
            gen,
            validation_data=genVal,
            steps_per_epoch=int(trainlen / batch_size) + 1,
            validation_steps=int(vallen / batch_size) + 1,
            epochs=epochs,
            #callbacks=[cp_callback],
            verbose=2)
        vals.append(history.history['val_accuracy'])
        del history
        del model
        tensorflow.keras.backend.clear_session()
        tensorflow.compat.v1.reset_default_graph()

        def limit_mem():
            tensorflow.config.experimental.get_session().close()

        #limit_mem()
        #heatmap_mean = generate_gradCAM(model,spex,path)

        #plt.imshow(np.repeat(heatmap_mean,50,aixs=0))
        #plt.show()
        #labels = np.zeros([vallen,3])
        #for j in range(0,vallen):
        #	(_,labels[j,:]) = next(genVal)
        #val_preds = np.argmax(model.predict(genVal,steps=vallen),axis=1)
        #confusion_matrix[i,:,:] = tensorflow.math.confusion_matrix(np.argmax(labels,axis=1),val_preds)
        #print(confusion_matrix[i,:,:])
        #names2 = os.listdir(path2)
        #(gen2,_,trainlen2,_) = split_data(["A","B","C"],100,0,names2,path2,spex,batch_size=batch_size)
        #history2 = model.evaluate(gen2,steps=trainlen2)
        #print(history2)
    # print(np.mean(confusion_matrix, axis=0))
    # print(vals_pretrain)
    # print(np.mean(vals_pretrain))
    # del fold_files
    # del fold_files_val
    # tracker.print_diff()
    return vals
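
# Hedged usage sketch (not from the source): assuming vals = train_model(...) has been run,
# each entry is one fold's val_accuracy history, so a simple cross-fold summary is:
fold_best = [max(fold_history) for fold_history in vals]
print('best validation accuracy per fold:', fold_best)
print('mean over folds:', np.mean(fold_best))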
def make_bottleneck_dump_subdir(src_dir, shape, ratio):
    """
	Use names of subdirs as a id.
	And then calculate class_index from id.
	"""

    class_id_set = set()
    #bottleneck_data = dict()
    feature_vectors, labels, filenames = [], [], []

    image_size = (shape[0], shape[1])
    listdir = os.listdir(src_dir)

    # 1) find out the number of classes
    for class_id in listdir:

        subdir = src_dir + '/' + class_id
        if not os.path.isdir(subdir): continue

        if len(os.listdir(subdir)) == 0:
            continue
        else:
            try:
                class_id_int = int(class_id)
                class_id_set.add(class_id_int)
            except:
                continue

    # 2) map class_id to class_index
    id_list = list(class_id_set)
    id_list.sort()
    print('Number of classes in the sample: {0}'.format(len(id_list)))
    print('Min class id: {0}'.format(min(id_list)))
    print('Max class id: {0}'.format(max(id_list)))
    map_id_label = {class_id: index for index, class_id in enumerate(id_list)}
    map_label_id = {index: class_id for index, class_id in enumerate(id_list)}
    maps = {'id_label': map_id_label, 'label_id': map_label_id}
    num_classes = len(map_id_label)

    # 3) Calculate bottleneck in TF
    height, width, color = shape
    x = tf.placeholder(tf.float32, [None, height, width, 3],
                       name='Placeholder-x')
    resized_input_tensor = tf.reshape(x, [-1, height, width, 3])
    module = hub.Module("https://tfhub.dev/google/imagenet/resnet_v2_152/classification/1")

    # num_features = 2048, height x width = 224 x 224 pixels
    assert [height, width] == hub.get_expected_image_size(module)
    bottleneck_tensor = module(
        resized_input_tensor)  # Features with shape [batch_size, num_features]
    print('bottleneck_tensor:', bottleneck_tensor)

    with tf.Session() as sess:  # Connect to the TF runtime.
        init = tf.global_variables_initializer()
        sess.run(init)  # Randomly initialize weights.

        for class_id in class_id_set:

            subdir = src_dir + '/' + str(class_id)
            print(subdir)
            files = os.listdir(subdir)
            num_files = len(files)

            for index_file, filename in enumerate(files):

                base = os.path.splitext(filename)[0]
                ext = os.path.splitext(filename)[1]
                if ext not in {'.jpg', '.png'}: continue

                class_index = map_id_label[class_id]
                #print(class_index)

                label = [0] * num_classes
                label[class_index] = 1

                #class_index_set.add(class_index)

                file_path = subdir + '/' + filename
                im = Image.open(file_path)
                im = im.resize(image_size, Image.ANTIALIAS)
                arr = np.array(im, dtype=np.float32) / 256
                feature_vector = bottleneck_tensor.eval(feed_dict={x: [arr]})

                feature_vectors.append(feature_vector)
                labels.append(label)
                filenames.append(filename)  # or file_path

                im.close()

                print("dir={0}, class={1}: {2}/{3}: {4}".format(
                    class_id, class_index, index_file, num_files, filename))

    print('----')
    print('Number of classes: {0}'.format(num_classes))
    print('Number of feature vectors: {0}'.format(len(feature_vectors)))

    data = {
        'images': feature_vectors,
        'labels': labels,
        'filenames': filenames
    }

    # mix data
    if DO_MIX:
        print('start mix data')
        zip3 = list(zip(data['images'], data['labels'], data['filenames']))
        random.shuffle(zip3)
        print('mix ok')
        data['images'] = [x[0] for x in zip3]
        data['labels'] = [x[1] for x in zip3]
        data['filenames'] = [x[2] for x in zip3]

    print('Split data')
    data = split_data.split_data(data, ratio=ratio)
    data['id_label'] = map_id_label
    data['label_id'] = map_label_id

    return data
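
# Hedged usage sketch (not from the source): the directory, ratio and output file are
# hypothetical; the shape matches the 224 x 224 RGB input assumed in the comments above.
import pickle

dump = make_bottleneck_dump_subdir('./images_by_class_id', shape=(224, 224, 3), ratio=0.8)
with open('bottleneck_dump.pickle', 'wb') as f:
    pickle.dump(dump, f)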