# Assumed imports for the utilities below. Project-specific helpers such as
# ConvNet, load_train_data, load_test_data, load_grouped_train_data,
# generate_overlapped_data, split_train_valid_filenames, scale_across_time and
# scale_across_features come from the project's own modules (not shown here).
import os
import pickle

import numpy as np
import matplotlib.pyplot as plt
import theano
from theano import Param  # renamed to theano.In in later Theano releases
from pandas import DataFrame
from sklearn.manifold import TSNE
from sklearn.metrics import roc_curve
from sklearn.cross_validation import StratifiedShuffleSplit  # pre-0.18 sklearn API


def plot_features(subject, data_path, model_path, test_labels, dataset='test'):
    # Restore the trained network and its scalers for this subject.
    with open(model_path + '/' + subject + '.pickle', 'rb') as f:
        state_dict = pickle.load(f)
    cnn = ConvNet(state_dict['params'])
    cnn.set_weights(state_dict['weights'])
    scalers = state_dict['scalers']

    if dataset == 'test':
        d = load_test_data(data_path, subject)
        x = d['x']
        y = test_labels['preictal']
    elif dataset == 'train':
        d = load_train_data(data_path, subject)
        x, y = d['x'], d['y']
    else:
        raise ValueError("dataset must be 'test' or 'train'")

    x, _ = (scale_across_time(x, x_test=None, scalers=scalers)
            if state_dict['params']['scale_time']
            else scale_across_features(x, x_test=None, scalers=scalers))

    cnn.batch_size.set_value(x.shape[0])
    # Compile a function that returns the activations of the feature extractor.
    get_features = theano.function([cnn.x, Param(cnn.training_mode, default=0)],
                                   cnn.feature_extractor.output,
                                   allow_input_downcast=True)
    logits_test = get_features(x)

    # Embed the learned features in 2D with t-SNE and colour points by label.
    model = TSNE(n_components=2, random_state=0)
    z = model.fit_transform(np.float64(logits_test))
    plt.scatter(z[:, 0], z[:, 1], s=60, c=y)
    plt.show()
def predict(subject, data_path, model_path, submission_path):
    # Score every pickled model for this subject and write one submission CSV each.
    patient_filenames = [filename for filename in os.listdir(model_path)
                         if subject in filename and filename.endswith('.pickle')]
    for filename in patient_filenames:
        print(filename)
        d = load_test_data(data_path, subject)
        x, id = d['x'], d['id']

        with open(model_path + '/' + filename, 'rb') as f:
            state_dict = pickle.load(f)
        scalers = state_dict['scalers']
        x, _ = (scale_across_time(x, x_test=None, scalers=scalers)
                if state_dict['params']['scale_time']
                else scale_across_features(x, x_test=None, scalers=scalers))

        cnn = ConvNet(state_dict['params'])
        cnn.set_weights(state_dict['weights'])
        test_proba = cnn.get_test_proba(x)

        ans = list(zip(id, test_proba))
        df = DataFrame(data=ans, columns=['clip', 'preictal'])
        csv_name = '.'.join(filename.split('.')[:-1]) if '.' in filename else filename
        df.to_csv(submission_path + '/' + csv_name + '.csv', index=False, header=True)
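
# Example driver: a minimal sketch, not part of the original code. It assumes the
# Kaggle seizure-prediction subject names (Dog_1..Dog_5, Patient_1, Patient_2) and
# uses placeholder directory names.
if __name__ == '__main__':
    for subj in ['Dog_1', 'Dog_2', 'Dog_3', 'Dog_4', 'Dog_5', 'Patient_1', 'Patient_2']:
        predict(subj, data_path='data', model_path='models', submission_path='submissions')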
def plot_train_probs(subject, data_path, model_path):
    with open(model_path + '/' + subject + '.pickle', 'rb') as f:
        state_dict = pickle.load(f)
    cnn = ConvNet(state_dict['params'])
    cnn.set_weights(state_dict['weights'])
    scalers = state_dict['scalers']

    d = load_train_data(data_path, subject)
    x, y = d['x'], d['y']
    x, _ = (scale_across_time(x, x_test=None, scalers=scalers)
            if state_dict['params']['scale_time']
            else scale_across_features(x, x_test=None, scalers=scalers))

    cnn.batch_size.set_value(x.shape[0])
    probs = cnn.get_test_proba(x)

    # Pick the threshold whose (fpr, tpr) point lies closest to the ideal corner (0, 1).
    fpr, tpr, threshold = roc_curve(y, probs)
    c = np.sqrt((1 - tpr) ** 2 + fpr ** 2)
    opt_threshold = threshold[np.where(c == np.min(c))[0]]
    print(opt_threshold)

    # Jitter the x-coordinates so overlapping probabilities remain visible.
    x_coords = np.zeros(len(y), dtype='float64')
    rng = np.random.RandomState(42)
    x_coords += rng.normal(0.0, 0.08, size=len(x_coords))
    plt.scatter(x_coords, probs, c=y, s=60)
    plt.title(subject)
    plt.show()
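
# The optimal-threshold rule above chooses the ROC point closest to the perfect
# classifier at (fpr, tpr) = (0, 1). Illustrative numbers, not taken from the data:
# a threshold with tpr = 0.9, fpr = 0.1 scores sqrt(0.1**2 + 0.1**2) ~= 0.141,
# while tpr = 0.8, fpr = 0.05 scores sqrt(0.2**2 + 0.05**2) ~= 0.206,
# so the first threshold would be preferred.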
def train(subject, data_path, model_path, model_params, validation_params):
    d = load_train_data(data_path, subject)
    x, y, filename_to_idx = d['x'], d['y'], d['filename_to_idx']
    x_test = load_test_data(data_path, subject)['x'] if model_params['use_test'] else None

    # --------- add data-dependent parameters
    model_params['n_channels'] = x.shape[1]
    model_params['n_fbins'] = x.shape[2]
    model_params['n_timesteps'] = x.shape[3]

    print('============ parameters')
    for key, value in model_params.items():
        print(key, ':', value)
    print('========================')

    x_train, y_train = None, None
    x_valid, y_valid = None, None

    if model_params['overlap']:
        # Overlapped windows: train on everything, no validation split.
        with open('filenames.pickle', 'rb') as f:
            filenames_grouped_by_hour = pickle.load(f)
        data_grouped_by_hour = load_grouped_train_data(data_path, subject,
                                                       filenames_grouped_by_hour)
        x, y = generate_overlapped_data(data_grouped_by_hour,
                                        overlap_size=model_params['overlap'],
                                        window_size=x.shape[-1],
                                        overlap_interictal=True,
                                        overlap_preictal=True)
        print(x.shape)

        x, scalers = (scale_across_time(x, x_test=None) if model_params['scale_time']
                      else scale_across_features(x, x_test=None))

        cnn = ConvNet(model_params)
        cnn.train(train_set=(x, y), max_iter=175000)

        state_dict = cnn.get_state()
        state_dict['scalers'] = scalers
        with open(model_path + '/' + subject + '.pickle', 'wb') as f:
            pickle.dump(state_dict, f, protocol=pickle.HIGHEST_PROTOCOL)
        return
    else:
        if validation_params['random_split']:
            # Pre-0.18 sklearn cross_validation API: labels are passed to the constructor.
            skf = StratifiedShuffleSplit(y, n_iter=1, test_size=0.25, random_state=0)
            for train_idx, valid_idx in skf:
                x_train, y_train = x[train_idx], y[train_idx]
                x_valid, y_valid = x[valid_idx], y[valid_idx]
        else:
            # Split by recording hour so clips from the same hour stay on one side of the split.
            with open('filenames.pickle', 'rb') as f:
                filenames_grouped_by_hour = pickle.load(f)
            d = split_train_valid_filenames(subject, filenames_grouped_by_hour)
            train_filenames, valid_filenames = d['train_filenames'], d['valid_filnames']
            train_idx = [filename_to_idx[i] for i in train_filenames]
            valid_idx = [filename_to_idx[i] for i in valid_filenames]
            x_train, y_train = x[train_idx], y[train_idx]
            x_valid, y_valid = x[valid_idx], y[valid_idx]

        if model_params['scale_time']:
            x_train, scalers_train = scale_across_time(x=x_train, x_test=x_test)
            x_valid, _ = scale_across_time(x=x_valid, x_test=x_test, scalers=scalers_train)
        else:
            x_train, scalers_train = scale_across_features(x=x_train, x_test=x_test)
            x_valid, _ = scale_across_features(x=x_valid, x_test=x_test, scalers=scalers_train)
        del x, x_test

        print('============ dataset')
        print('train:', x_train.shape)
        print('n_pos:', np.sum(y_train), 'n_neg:', len(y_train) - np.sum(y_train))
        print('valid:', x_valid.shape)
        print('n_pos:', np.sum(y_valid), 'n_neg:', len(y_valid) - np.sum(y_valid))

        # -------------- validate: find the best number of training iterations
        cnn = ConvNet(model_params)
        best_iter = cnn.validate(train_set=(x_train, y_train),
                                 valid_set=(x_valid, y_valid),
                                 valid_freq=validation_params['valid_freq'],
                                 max_iter=validation_params['max_iter'],
                                 fname_out=model_path + '/' + subject + '.txt')

        # ---------------- reload, rescale and retrain on the full training set
        d = load_train_data(data_path, subject)
        x, y, filename_to_idx = d['x'], d['y'], d['filename_to_idx']
        x_test = load_test_data(data_path, subject)['x'] if model_params['use_test'] else None
        x, scalers = (scale_across_time(x=x, x_test=x_test) if model_params['scale_time']
                      else scale_across_features(x=x, x_test=x_test))
        del x_test

        cnn = ConvNet(model_params)
        cnn.train(train_set=(x, y), max_iter=best_iter)

        state_dict = cnn.get_state()
        state_dict['scalers'] = scalers
        with open(model_path + '/' + subject + '.pickle', 'wb') as f:
            pickle.dump(state_dict, f, protocol=pickle.HIGHEST_PROTOCOL)
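
# Example configuration: a sketch, not the settings used in the original experiments.
# Only the keys that train() itself reads are listed; ConvNet(model_params) will
# additionally expect its own architecture keys, which are not reproduced here.
example_model_params = {
    'use_test': False,   # if True, test-set statistics are also used when fitting the scalers
    'overlap': 0,        # >0: train on overlapped windows of this overlap size, skipping validation
    'scale_time': True,  # True: scale_across_time, False: scale_across_features
}
example_validation_params = {
    'random_split': True,  # True: StratifiedShuffleSplit, False: split by recording hour
    'valid_freq': 100,     # iterations between validation evaluations
    'max_iter': 20000,     # iteration budget for the validation run
}
# train('Dog_1', 'data', 'models', example_model_params, example_validation_params)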
def initial_cifar():
    # initial CIFAR net
    cnn = ConvNet()
    conv1_params = {'HF': 5, 'WF': 5, 'DF': 3, 'NF': 32, 'stride': 1, 'pad': 2, 'var': 0.01}
    cnn.add_layer('conv', conv1_params)
    pooling1_params = {'HF': 3, 'WF': 3, 'stride': 2, 'pad': [0, 1, 0, 1]}
    cnn.add_layer('max_pooling', pooling1_params)
    cnn.add_layer('relu', {})
    conv2_params = {'HF': 5, 'WF': 5, 'DF': 32, 'NF': 32, 'stride': 1, 'pad': 2, 'var': 0.02}
    cnn.add_layer('conv', conv2_params)
    cnn.add_layer('relu', {})
    pooling2_params = {'HF': 3, 'WF': 3, 'stride': 2, 'pad': [0, 1, 0, 1]}
    cnn.add_layer('max_pooling', pooling2_params)
    conv3_params = {'HF': 5, 'WF': 5, 'DF': 32, 'NF': 64, 'stride': 1, 'pad': 2, 'var': 0.03}
    cnn.add_layer('conv', conv3_params)
    cnn.add_layer('relu', {})
    pooling3_params = {'HF': 3, 'WF': 3, 'stride': 2, 'pad': [0, 1, 0, 1]}
    cnn.add_layer('max_pooling', pooling3_params)
    conv4_params = {'HF': 4, 'WF': 4, 'DF': 64, 'NF': 64, 'stride': 1, 'pad': 0, 'var': 0.04}
    cnn.add_layer('conv', conv4_params)
    cnn.add_layer('relu', {})
    conv5_params = {'HF': 1, 'WF': 1, 'DF': 64, 'NF': 10, 'stride': 1, 'pad': 0, 'var': 0.05}
    cnn.add_layer('conv', conv5_params)
    cnn.add_layer('softmax-loss', {})
    return cnn
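
# Output shapes layer by layer, assuming the standard 3x32x32 CIFAR-10 input and
# that pad = [0, 1, 0, 1] adds one extra row/column on the bottom/right before pooling:
#   conv1 5x5, pad 2, stride 1 -> 32 maps of 32x32
#   max_pool 3x3, stride 2     -> 32 maps of 16x16
#   conv2 5x5, pad 2 (+ relu)  -> 32 maps of 16x16
#   max_pool 3x3, stride 2     -> 32 maps of 8x8
#   conv3 5x5, pad 2 (+ relu)  -> 64 maps of 8x8
#   max_pool 3x3, stride 2     -> 64 maps of 4x4
#   conv4 4x4, pad 0 (+ relu)  -> 64 maps of 1x1
#   conv5 1x1, pad 0           -> 10 maps of 1x1, fed to softmax-loss as class scores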
def initial_LeNet():
    # initial LeNet
    cnn = ConvNet()
    conv1_params = {'HF': 5, 'WF': 5, 'DF': 1, 'NF': 20, 'stride': 1, 'pad': 0, 'var': 0.01}
    cnn.add_layer('conv', conv1_params)
    pooling1_params = {'HF': 2, 'WF': 2, 'stride': 2, 'pad': 0}
    cnn.add_layer('max_pooling', pooling1_params)
    conv2_params = {'HF': 5, 'WF': 5, 'DF': 20, 'NF': 50, 'stride': 1, 'pad': 0, 'var': 0.01}
    cnn.add_layer('conv', conv2_params)
    pooling2_params = {'HF': 2, 'WF': 2, 'stride': 2, 'pad': 0}
    cnn.add_layer('max_pooling', pooling2_params)
    conv3_params = {'HF': 4, 'WF': 4, 'DF': 50, 'NF': 500, 'stride': 1, 'pad': 0, 'var': 0.01}
    cnn.add_layer('conv', conv3_params)
    cnn.add_layer('relu', {})
    conv4_params = {'HF': 1, 'WF': 1, 'DF': 500, 'NF': 10, 'stride': 1, 'pad': 0, 'var': 0.01}
    cnn.add_layer('conv', conv4_params)
    cnn.add_layer('softmax-loss', {})
    return cnn
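
# Output shapes layer by layer, assuming the standard 1x28x28 MNIST input:
#   conv1 5x5, pad 0      -> 20 maps of 24x24
#   max_pool 2x2, stride 2 -> 20 maps of 12x12
#   conv2 5x5, pad 0      -> 50 maps of 8x8
#   max_pool 2x2, stride 2 -> 50 maps of 4x4
#   conv3 4x4, pad 0      -> 500 maps of 1x1 (acts as a fully connected layer)
#   conv4 1x1, pad 0      -> 10 maps of 1x1, fed to softmax-loss as class scores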
print('activation:', activation)
print('====================')

# path = '/mnt/storage/usr/ikorshun/data/data08_npy/'
path = '../data/data' + patient + '_npy/'
files = glob.glob(path + 'X_*.npy')
files = [f.split('/')[-1] for f in files]
p = re.compile(r'\d+')
file_nums = [p.findall(f)[0] for f in files]
file_nums = np.asarray(file_nums, dtype='int32')
test_nums = np.asarray([4], dtype='int32')

out_file = open('out.txt', 'w')
rng = np.random.RandomState(424242)

# Leave-one-file-out evaluation: hold out recording i, tune the number of iterations
# on a train/validation split of the remaining files, then train on train+validation
# for that many iterations and score the held-out file.
for i in file_nums:
    print('test', i)
    test_set = DatasetsLoader.load(path, i)
    sets = DatasetsLoader.get_train_valid_set(path, file_nums[file_nums != i], rng)
    train_set = sets['train']
    valid_set = sets['valid']

    cnn = ConvNet(nkerns, recept_width, pool_width, dropout_prob, batch_size, activation)
    opt_iters = cnn.validate(train_set, valid_set, init_learning_rate, max_iters,
                             validation_frequency, improvement_threshold)

    cnn = ConvNet(nkerns, recept_width, pool_width, dropout_prob, batch_size, activation)
    train_set = (np.concatenate((train_set[0], valid_set[0])),
                 np.concatenate((train_set[1], valid_set[1])))
    cnn.test(train_set, test_set, init_learning_rate, init_learning_rate / max_iters,
             opt_iters, out_file)

out_file.close()
def initial_cifar():
    # initial CIFAR net -- variant of the earlier initial_cifar with 'var' fixed
    # at 0.02 for every conv layer after conv1
    cnn = ConvNet()
    conv1_params = {'HF': 5, 'WF': 5, 'DF': 3, 'NF': 32, 'stride': 1, 'pad': 2, 'var': 0.01}
    cnn.add_layer('conv', conv1_params)
    pooling1_params = {'HF': 3, 'WF': 3, 'stride': 2, 'pad': [0, 1, 0, 1]}
    cnn.add_layer('max_pooling', pooling1_params)
    cnn.add_layer('relu', {})
    conv2_params = {'HF': 5, 'WF': 5, 'DF': 32, 'NF': 32, 'stride': 1, 'pad': 2, 'var': 0.02}
    cnn.add_layer('conv', conv2_params)
    cnn.add_layer('relu', {})
    pooling2_params = {'HF': 3, 'WF': 3, 'stride': 2, 'pad': [0, 1, 0, 1]}
    cnn.add_layer('max_pooling', pooling2_params)
    conv3_params = {'HF': 5, 'WF': 5, 'DF': 32, 'NF': 64, 'stride': 1, 'pad': 2, 'var': 0.02}
    cnn.add_layer('conv', conv3_params)
    cnn.add_layer('relu', {})
    pooling3_params = {'HF': 3, 'WF': 3, 'stride': 2, 'pad': [0, 1, 0, 1]}
    cnn.add_layer('max_pooling', pooling3_params)
    conv4_params = {'HF': 4, 'WF': 4, 'DF': 64, 'NF': 64, 'stride': 1, 'pad': 0, 'var': 0.02}
    cnn.add_layer('conv', conv4_params)
    cnn.add_layer('relu', {})
    conv5_params = {'HF': 1, 'WF': 1, 'DF': 64, 'NF': 10, 'stride': 1, 'pad': 0, 'var': 0.02}
    cnn.add_layer('conv', conv5_params)
    cnn.add_layer('softmax-loss', {})
    return cnn