import cPickle

import numpy as np
import matplotlib.pyplot as plt

from sklearn.cross_validation import (train_test_split, StratifiedKFold,
                                      StratifiedShuffleSplit)
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

# Project-local helpers (load_train_data, load_test_data, load_grouped_train_data,
# generate_overlapped_data, scale_across_time, scale_across_features, reshape_data,
# predict, split_train_valid_filenames, split_evenly, enhance_data, min_max_scale,
# get_cmap, ConvNet, CNN) are assumed importable from the repository's own modules.


def train_predict_test(subject, clf, X, X_test, enhance_size=0):
    # Regenerate the training set from hour-grouped clips with 10-sample overlap,
    # then scale train and test with the same per-time scalers.
    filenames_grouped_by_hour = cPickle.load(open('filenames.pickle'))
    data_grouped_by_hour = load_grouped_train_data('preprocessed/cnn/', subject,
                                                   filenames_grouped_by_hour)
    X, y = generate_overlapped_data(data_grouped_by_hour, overlap_size=10,
                                    window_size=X.shape[-1],
                                    overlap_interictal=True, overlap_preictal=True)
    X, scalers = scale_across_time(X, x_test=None)
    X_test, _ = scale_across_time(X_test, x_test=None, scalers=scalers)
    print X.shape

    # Flatten (n_clips, n_channels, n_fbins, n_timesteps) to 2-D for the classifier.
    X = X.reshape(X.shape[0], X.shape[1] * X.shape[2] * X.shape[3])
    X_test = X_test.reshape(X_test.shape[0],
                            X_test.shape[1] * X_test.shape[2] * X_test.shape[3])

    # Hold out 25% for validation; `enhance_size` is accepted for interface
    # compatibility with the CNN variants but is unused here.
    X, xt, y, yt = train_test_split(X, y, test_size=0.25)
    print "train size", X.shape
    print "test_size", xt.shape

    clf.fit(X, y)
    preds_proba = clf.predict(X_test)
    validation_preds = clf.predict(xt)
    return preds_proba, list(validation_preds), list(yt)
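
# Hedged usage sketch for train_predict_test. The subject id, data path and
# classifier settings below are illustrative, not taken from the repository.
def example_logreg_run():
    subject = 'Dog_1'  # hypothetical subject id
    X = load_train_data('preprocessed/cnn/', subject)['x']
    X_test = load_test_data('preprocessed/cnn/', subject)['x']
    clf = LogisticRegression(C=1.0)
    test_preds, valid_preds, valid_labels = train_predict_test(subject, clf, X, X_test)
    print 'validation accuracy:', np.mean(np.array(valid_preds) == np.array(valid_labels))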
def cross_validate(subject, data_path, reg_C, random_cv=False):
    # Either a plain 10-fold stratified CV over individual clips, or a
    # leave-one-preictal-hour-out CV that keeps clips of one hour together.
    if random_cv:
        d = load_train_data(data_path, subject)
        x, y = d['x'], d['y']
        skf = StratifiedKFold(y, n_folds=10)
    else:
        filenames_grouped_by_hour = cPickle.load(open('filenames.pickle'))
        data_grouped_by_hour = load_grouped_train_data(data_path, subject,
                                                       filenames_grouped_by_hour)
        n_preictal = len(data_grouped_by_hour['preictal'])
        n_interictal = len(data_grouped_by_hour['interictal'])
        hours_data = data_grouped_by_hour['preictal'] + data_grouped_by_hour['interictal']
        hours_labels = np.concatenate((np.ones(n_preictal), np.zeros(n_interictal)))
        n_folds = n_preictal
        skf = StratifiedKFold(hours_labels, n_folds=n_folds)

    preictal_probs, labels = [], []
    # NB: the loop body assumes the hour-grouped branch; in random_cv mode
    # hours_data/hours_labels are never defined, so that path is incomplete.
    for train_indexes, valid_indexes in skf:
        # Expand hour indexes back into per-clip examples and labels.
        x_train, x_valid = [], []
        y_train, y_valid = [], []
        for i in train_indexes:
            x_train.extend(hours_data[i])
            y_train.extend(hours_labels[i] * np.ones(len(hours_data[i])))
        for i in valid_indexes:
            x_valid.extend(hours_data[i])
            y_valid.extend(hours_labels[i] * np.ones(len(hours_data[i])))

        # Stack clips along a new leading axis: (n_clips, channels, fbins, timesteps).
        x_train = [x[..., np.newaxis] for x in x_train]
        x_train = np.concatenate(x_train, axis=3)
        x_train = np.rollaxis(x_train, axis=3)
        y_train = np.array(y_train)

        x_valid = [x[..., np.newaxis] for x in x_valid]
        x_valid = np.concatenate(x_valid, axis=3)
        x_valid = np.rollaxis(x_valid, axis=3)
        y_valid = np.array(y_valid)

        n_valid_examples = x_valid.shape[0]
        n_timesteps = x_valid.shape[-1]

        x_train, y_train = reshape_data(x_train, y_train)
        data_scaler = StandardScaler()
        x_train = data_scaler.fit_transform(x_train)

        logreg = LogisticRegression(C=reg_C)
        logreg.fit(x_train, y_train)

        x_valid = reshape_data(x_valid)
        x_valid = data_scaler.transform(x_valid)
        # The project-local predict() pools window-level probabilities back to
        # one per clip, judging by its (n_examples, n_timesteps) arguments.
        p_valid = predict(logreg, x_valid, n_valid_examples, n_timesteps)
        preictal_probs.extend(p_valid)
        labels.extend(y_valid)
    return preictal_probs, labels
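
# cross_validate returns pooled preictal probabilities with matching labels,
# so a subject-level ROC AUC is the natural summary. A minimal sketch; the
# subject id and data path are illustrative.
def example_cv_auc():
    probs, labels = cross_validate('Dog_1', 'preprocessed/cnn/', reg_C=1.0)
    print 'CV ROC AUC:', roc_auc_score(labels, probs)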
def train_predict_test_cnn(subject, clf, X, X_test, enhance_size=0):
    # Per-channel ensemble variant: one network per EEG channel, with test
    # predictions averaged using weights derived from validation scores.
    filenames_grouped_by_hour = cPickle.load(open('filenames.pickle'))
    data_grouped_by_hour = load_grouped_train_data('preprocessed/cnn/', subject,
                                                   filenames_grouped_by_hour)
    X, y = generate_overlapped_data(data_grouped_by_hour, overlap_size=10,
                                    window_size=X.shape[-1],
                                    overlap_interictal=True, overlap_preictal=True)
    X, scalers = scale_across_time(X, x_test=None)
    X_test, _ = scale_across_time(X_test, x_test=None, scalers=scalers)

    # Alternative split/augmentation strategies, kept for reference:
    # X, xt, y, yt = split_evenly(X, y, test_size=.5)
    # if enhance_size > 0:
    #     X, y = enhance_data(X, y, enhance_size, cnn=True)
    #     xt, yt = enhance_data(xt, yt, enhance_size / 2, cnn=True)
    X, xt, y, yt = train_test_split(X, y, test_size=0.25, random_state=42)
    print "train size", X.shape
    print "test_size", xt.shape

    preds_proba = np.zeros((X.shape[1], X_test.shape[0]))
    val_proba = np.zeros((xt.shape[1], xt.shape[0]))
    weighting = np.zeros((X.shape[1],))
    # Placeholders so the return signature matches the variants that track losses.
    train_loss = np.array([])
    valid_loss = np.array([])
    for i in range(0, X.shape[1]):
        print "Progress: " + str(100 * i / X.shape[1]) + '%'
        X_train = X[:, i, :, :]
        xt_train = xt[:, i, :, :]
        weighting[i], val_proba[i] = clf.fit(X_train, y, xt_train, yt)
        X_test_subset = X_test[:, i, :, :]
        preds_proba[i] = clf.predict_proba(X_test_subset)

    sc = np.amax(weighting)
    print "Best score:" + str(sc)
    # Shift so the worst channel gets zero weight, then normalize.
    # (Assumes the scores are not all identical, otherwise the sum is zero.)
    weighting -= weighting.min()
    weighting /= weighting.sum()
    # Alternative: keep only the single best channel via np.argmax(weighting).
    preds_proba = np.average(preds_proba, axis=0, weights=weighting)
    preds_scaled = preds_proba  # alternatively: min_max_scale(preds_proba)
    validation_preds = np.average(val_proba, axis=0, weights=weighting)
    return preds_scaled, preds_proba, list(validation_preds), list(yt), train_loss, valid_loss
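
# The loop above weights each channel's test predictions by its shifted,
# normalized validation score. A self-contained sketch of that weighting step,
# with made-up numbers:
def example_channel_weighting():
    scores = np.array([0.62, 0.55, 0.71])       # per-channel validation scores (made up)
    channel_preds = np.array([[0.2, 0.8, 0.5],  # per-channel test probabilities (made up)
                              [0.4, 0.6, 0.5],
                              [0.1, 0.9, 0.5]])
    w = scores - scores.min()                   # worst channel drops to weight 0
    w /= w.sum()                                # normalize (assumes scores differ)
    print np.average(channel_preds, axis=0, weights=w)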
def train(subject, data_path, model_path, model_params, validation_params):
    d = load_train_data(data_path, subject)
    x, y, filename_to_idx = d['x'], d['y'], d['filename_to_idx']
    x_test = load_test_data(data_path, subject)['x'] if model_params['use_test'] else None

    # --------- add data-derived params
    model_params['n_channels'] = x.shape[1]
    model_params['n_fbins'] = x.shape[2]
    model_params['n_timesteps'] = x.shape[3]
    print '============ parameters'
    for key, value in model_params.items():
        print key, ':', value
    print '========================'

    x_train, y_train = None, None
    x_valid, y_valid = None, None
    if model_params['overlap']:
        # No held-out validation when training on overlapped windows: train on
        # everything for a fixed number of iterations and save the model.
        filenames_grouped_by_hour = cPickle.load(open('filenames.pickle'))
        data_grouped_by_hour = load_grouped_train_data(data_path, subject,
                                                       filenames_grouped_by_hour)
        x, y = generate_overlapped_data(data_grouped_by_hour,
                                        overlap_size=model_params['overlap'],
                                        window_size=x.shape[-1],
                                        overlap_interictal=True,
                                        overlap_preictal=True)
        print x.shape
        x, scalers = scale_across_time(x, x_test=None) if model_params['scale_time'] \
            else scale_across_features(x, x_test=None)
        cnn = ConvNet(model_params)
        cnn.train(train_set=(x, y), max_iter=175000)
        state_dict = cnn.get_state()
        state_dict['scalers'] = scalers
        with open(model_path + '/' + subject + '.pickle', 'wb') as f:
            cPickle.dump(state_dict, f, protocol=cPickle.HIGHEST_PROTOCOL)
        return
    else:
        if validation_params['random_split']:
            skf = StratifiedShuffleSplit(y, n_iter=1, test_size=0.25, random_state=0)
            for train_idx, valid_idx in skf:
                x_train, y_train = x[train_idx], y[train_idx]
                x_valid, y_valid = x[valid_idx], y[valid_idx]
        else:
            # Split on whole hours so clips from one sequence never straddle
            # the train/validation boundary.
            filenames_grouped_by_hour = cPickle.load(open('filenames.pickle'))
            d = split_train_valid_filenames(subject, filenames_grouped_by_hour)
            # NB: 'valid_filnames' (sic) matches the key produced by the helper.
            train_filenames, valid_filenames = d['train_filenames'], d['valid_filnames']
            train_idx = [filename_to_idx[i] for i in train_filenames]
            valid_idx = [filename_to_idx[i] for i in valid_filenames]
            x_train, y_train = x[train_idx], y[train_idx]
            x_valid, y_valid = x[valid_idx], y[valid_idx]

    if model_params['scale_time']:
        x_train, scalers_train = scale_across_time(x=x_train, x_test=x_test)
        x_valid, _ = scale_across_time(x=x_valid, x_test=x_test, scalers=scalers_train)
    else:
        x_train, scalers_train = scale_across_features(x=x_train, x_test=x_test)
        x_valid, _ = scale_across_features(x=x_valid, x_test=x_test, scalers=scalers_train)
    del x, x_test

    print '============ dataset'
    print 'train:', x_train.shape
    print 'n_pos:', np.sum(y_train), 'n_neg:', len(y_train) - np.sum(y_train)
    print 'valid:', x_valid.shape
    print 'n_pos:', np.sum(y_valid), 'n_neg:', len(y_valid) - np.sum(y_valid)

    # -------------- validate: find the best iteration on the hold-out set
    cnn = ConvNet(model_params)
    best_iter = cnn.validate(train_set=(x_train, y_train),
                             valid_set=(x_valid, y_valid),
                             valid_freq=validation_params['valid_freq'],
                             max_iter=validation_params['max_iter'],
                             fname_out=model_path + '/' + subject + '.txt')

    # ---------------- retrain on all data up to the best iteration
    d = load_train_data(data_path, subject)
    x, y, filename_to_idx = d['x'], d['y'], d['filename_to_idx']
    x_test = load_test_data(data_path, subject)['x'] if model_params['use_test'] else None
    x, scalers = scale_across_time(x=x, x_test=x_test) if model_params['scale_time'] \
        else scale_across_features(x=x, x_test=x_test)
    del x_test

    cnn = ConvNet(model_params)
    cnn.train(train_set=(x, y), max_iter=best_iter)
    state_dict = cnn.get_state()
    state_dict['scalers'] = scalers
    with open(model_path + '/' + subject + '.pickle', 'wb') as f:
        cPickle.dump(state_dict, f, protocol=cPickle.HIGHEST_PROTOCOL)
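
# train() reads several keys from its two parameter dicts. A hedged sketch of
# their shape: the key names are those the function body reads, the values are
# illustrative, and ConvNet will expect its own architecture keys on top.
def example_train_call():
    model_params = {
        'use_test': False,      # also use test-set statistics when scaling
        'overlap': 0,           # nonzero trains on overlapped windows, no validation
        'scale_time': True,     # scale_across_time vs. scale_across_features
        # 'n_channels', 'n_fbins', 'n_timesteps' are filled in from the data
    }
    validation_params = {
        'random_split': False,  # stratified shuffle split vs. hour-based split
        'valid_freq': 100,      # how often to evaluate on the hold-out set
        'max_iter': 175000,     # iteration budget for the validation run
    }
    train('Dog_1', 'preprocessed/cnn/', 'models', model_params, validation_params)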
def plot_sequences(subject, data_path, test_labels):
    # t-SNE embedding of train and test clips: each preictal hour gets its own
    # colour/marker, interictal hours are red, test clips are blue.
    filenames_grouped_by_hour = cPickle.load(open('filenames.pickle'))
    data_grouped_by_hour = load_grouped_train_data(data_path, subject,
                                                   filenames_grouped_by_hour)
    interictal_hours = data_grouped_by_hour['interictal']
    preictal_hours = data_grouped_by_hour['preictal']

    marker_type = {u'D': u'diamond', u's': u'square', u'^': u'triangle_up',
                   u'd': u'thin_diamond', u'h': u'hexagon1', u'*': u'star',
                   u'o': u'circle', u'.': u'point', u'p': u'pentagon',
                   u'H': u'hexagon2', u'v': u'triangle_down', u'8': u'octagon',
                   u'<': u'triangle_left', u'>': u'triangle_right'}
    marker_list = marker_type.keys() * 50

    x_train, colors, markers = [], [], []
    cmap = get_cmap(len(preictal_hours))
    print len(preictal_hours)
    for i, hour in enumerate(preictal_hours):
        for clip in hour:
            x_train.append(np.reshape(clip, (1, clip.shape[0] * clip.shape[1] * clip.shape[2])))
        colors.extend([cmap(i)] * len(hour))
        markers.extend([marker_list[i]] * len(hour))
    for i, hour in enumerate(interictal_hours):
        for clip in hour:
            x_train.append(np.reshape(clip, (1, clip.shape[0] * clip.shape[1] * clip.shape[2])))
        colors.extend(['r'] * len(hour))
        markers.extend([u' '] * len(hour))
    x_train = np.vstack(x_train)
    print x_train.shape

    d = load_test_data(data_path, subject)
    x_test = d['x']
    x_test = np.reshape(x_test, (x_test.shape[0],
                                 x_test.shape[1] * x_test.shape[2] * x_test.shape[3]))
    # All test clips are drawn in blue regardless of label; cast to object so
    # the colour strings can be assigned into the array.
    color_test = np.array(test_labels['preictal'], dtype=object)
    print np.sum(test_labels['preictal'])
    color_test[np.where(color_test == 0)[0]] = 'b'
    color_test[np.where(color_test == 1)[0]] = 'b'
    colors.extend(list(color_test))
    markers.extend([u' '] * len(x_test))

    # Standardize, reduce to 50 PCA components, then embed with t-SNE.
    x_all = np.vstack((np.float64(x_train), np.float64(x_test)))
    scaler = StandardScaler()
    x_all = scaler.fit_transform(x_all)
    pca = PCA(50)
    x_all = pca.fit_transform(x_all)
    model = TSNE(n_components=2, perplexity=40, learning_rate=100, random_state=42)
    z = model.fit_transform(x_all)

    # Label the first point of each preictal hour so the legend lists the hours.
    prev_c, i = 0, 0
    for a, b, c, d in zip(z[:, 0], z[:, 1], colors, markers):
        if c != prev_c and d != u' ':
            plt.scatter(a, b, c=c, s=70, marker=d, label=str(i))
            i += 1
        else:
            plt.scatter(a, b, c=c, s=70, marker=d)
        prev_c = c

    zz = z[np.where(np.array(markers) != u' ')[0], :]
    ax = plt.subplot(111)
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.05), ncol=2,
              fancybox=True, shadow=True)
    plt.xlim([min(zz[:, 0]) - 0.5, max(zz[:, 0]) + 0.5])
    plt.ylim([min(zz[:, 1]) - 0.5, max(zz[:, 1]) + 0.5])
    for label in (ax.get_xticklabels() + ax.get_yticklabels()):
        label.set_fontsize(20)
    plt.ylabel('Z_2', fontsize=20)
    plt.xlabel('Z_1', fontsize=20)
    plt.show()
def train_predict_test_cnn(subject, clf, X, X_test, enhance_size=0):
    # Single-network variant (shares its name with the per-channel version
    # above): one CNN over all channels, with an even split and optional
    # data augmentation.
    filenames_grouped_by_hour = cPickle.load(open('filenames.pickle'))
    data_grouped_by_hour = load_grouped_train_data('preprocessed/cnn/', subject,
                                                   filenames_grouped_by_hour)
    X, y = generate_overlapped_data(data_grouped_by_hour, overlap_size=10,
                                    window_size=X.shape[-1],
                                    overlap_interictal=True, overlap_preictal=True)
    X, scalers = scale_across_time(X, x_test=None)
    X_test, _ = scale_across_time(X_test, x_test=None, scalers=scalers)

    X, xt, y, yt = split_evenly(X, y, test_size=0.25)
    # X, xt, y, yt = train_test_split(X, y, test_size=.25)
    if enhance_size > 0:
        X, y = enhance_data(X, y, enhance_size, cnn=True, even=True)
        xt, yt = enhance_data(xt, yt, enhance_size, cnn=True, even=True)
    print "train size", X.shape
    print "test_size", xt.shape

    clf.fit(X, y, xt, yt)
    preds_proba = clf.predict_proba(X_test)[:, 1]

    # Disabled self-training experiment: pseudo-label the most confident fifth
    # of the test predictions at each extreme and retrain a second CNN on them.
    # unsup_size = int(X_test.shape[0] / 5)
    # top_ind = np.argpartition(preds_proba, -unsup_size)[-unsup_size:]
    # bot_ind = preds_proba.argsort()[:unsup_size]
    # x_new = np.vstack((X_test[top_ind], X_test[bot_ind]))
    # y_new = np.append(np.ones(unsup_size), np.zeros(unsup_size))
    # if enhance_size > 0:
    #     x_new, y_new = enhance_data(x_new, y_new, enhance_size, cnn=True)
    # clf2 = CNN(subject)
    # clf2.fit(x_new, y_new, xt, yt)
    # preds_proba = clf2.predict_proba(X_test)[:, 1]

    # Loss curves from the wrapped network's training history.
    train_loss = np.array([i["train_loss"] for i in clf.convnet.train_history_])
    valid_loss = np.array([i["valid_loss"] for i in clf.convnet.train_history_])
    preds_scaled = min_max_scale(preds_proba)
    validation_preds = min_max_scale(clf.predict_proba(xt)[:, 1])
    return preds_scaled, preds_proba, list(validation_preds), list(yt), train_loss, valid_loss