def train_predict_test(subject, clf, X, X_test, enhance_size=0):
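	"""Train clf on overlapped windows for one subject and predict on X_test.

	Sketch of the flow (helper behaviour inferred from how it is used here):
	grouped hour-long recordings are loaded, sliced into overlapping windows,
	scaled across time with scalers fitted on the training data only,
	flattened, split into train/validation sets, and fed to clf. Returns the
	test predictions plus the validation predictions and labels.
	"""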

	filenames_grouped_by_hour = cPickle.load(open('filenames.pickle'))
	data_grouped_by_hour = load_grouped_train_data('preprocessed/cnn/', subject, filenames_grouped_by_hour)

	
	X, y = generate_overlapped_data(data_grouped_by_hour, overlap_size=10,
	                                window_size=X.shape[-1],
	                                overlap_interictal=True,
	                                overlap_preictal=True)

	X, scalers = scale_across_time(X, x_test=None)

	X_test, _ = scale_across_time(X_test, x_test=None, scalers=scalers)


	print X.shape
	X = X.reshape(X.shape[0], -1)
	X_test = X_test.reshape(X_test.shape[0], -1)
	X, xt, y, yt = train_test_split(X, y, test_size=0.25)
	

	print "train size", X.shape
	print "test_size", xt.shape

	#print "done loading"
	clf.fit(X, y)

	preds_proba = clf.predict(X_test)

	
	#print preds_proba.shape
	validation_preds = clf.predict(xt)

	return preds_proba,list(validation_preds),list(yt)
def cross_validate(subject, data_path, reg_C, random_cv=False):
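    """Cross-validate a per-subject logistic regression on spectrogram clips.

    With random_cv=False the folds are stratified over whole hours, so clips
    from the same hour never end up in both the training and validation sets;
    the helpers (load_grouped_train_data, reshape_data, predict) are assumed
    to follow the conventions used elsewhere in this file. Returns the
    validation preictal probabilities and the matching labels.
    """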
    if random_cv:
        d = load_train_data(data_path, subject)
        x, y = d['x'], d['y']
        # treat every clip as its own one-element "hour" so that the grouped
        # fold loop below also covers the random-split case
        hours_data = [[clip] for clip in x]
        hours_labels = y
        skf = StratifiedKFold(y, n_folds=10)
    else:
        filenames_grouped_by_hour = cPickle.load(open('filenames.pickle'))
        data_grouped_by_hour = load_grouped_train_data(
            data_path, subject, filenames_grouped_by_hour)
        n_preictal, n_interictal = len(data_grouped_by_hour['preictal']), len(
            data_grouped_by_hour['interictal'])
        hours_data = data_grouped_by_hour['preictal'] + data_grouped_by_hour[
            'interictal']
        hours_labels = np.concatenate(
            (np.ones(n_preictal), np.zeros(n_interictal)))
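        # one fold per preictal hour, i.e. roughly leave-one-preictal-hour-out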
        n_folds = n_preictal
        skf = StratifiedKFold(hours_labels, n_folds=n_folds)

    preictal_probs, labels = [], []
    for train_indexes, valid_indexes in skf:
        x_train, x_valid = [], []
        y_train, y_valid = [], []
        for i in train_indexes:
            x_train.extend(hours_data[i])
            y_train.extend(hours_labels[i] * np.ones(len(hours_data[i])))
        for i in valid_indexes:
            x_valid.extend(hours_data[i])
            y_valid.extend(hours_labels[i] * np.ones(len(hours_data[i])))

        x_train = [x[..., np.newaxis] for x in x_train]
        x_train = np.concatenate(x_train, axis=3)
        x_train = np.rollaxis(x_train, axis=3)
        y_train = np.array(y_train)

        x_valid = [x[..., np.newaxis] for x in x_valid]
        x_valid = np.concatenate(x_valid, axis=3)
        x_valid = np.rollaxis(x_valid, axis=3)
        y_valid = np.array(y_valid)

        n_valid_examples = x_valid.shape[0]
        n_timesteps = x_valid.shape[-1]

        x_train, y_train = reshape_data(x_train, y_train)
        data_scaler = StandardScaler()
        x_train = data_scaler.fit_transform(x_train)

        logreg = LogisticRegression(C=reg_C)
        logreg.fit(x_train, y_train)

        x_valid = reshape_data(x_valid)
        x_valid = data_scaler.transform(x_valid)

        p_valid = predict(logreg, x_valid, n_valid_examples, n_timesteps)

        preictal_probs.extend(p_valid)
        labels.extend(y_valid)

    return preictal_probs, labels
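# A minimal usage sketch (hedged: 'Dog_1' and the data path are placeholder
# values, and roc_auc_score would come from sklearn.metrics, which is not
# imported by this file):
#
#     probs, labels = cross_validate('Dog_1', 'preprocessed/cnn/', reg_C=1.0)
#     print 'CV AUC:', roc_auc_score(labels, probs)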
def train_predict_test_cnn(subject, clf, X, X_test, enhance_size=0):
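        """Per-channel variant: fit one model per channel of X and blend predictions.

        clf.fit(X_train, y, xt_train, yt) is assumed to return a validation
        score plus the validation-set probabilities; test predictions from the
        per-channel models are averaged with weights derived from those scores.
        """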
        filenames_grouped_by_hour = cPickle.load(open('filenames.pickle'))
        data_grouped_by_hour = load_grouped_train_data('preprocessed/cnn/', subject, filenames_grouped_by_hour)

        X, y = generate_overlapped_data(data_grouped_by_hour, overlap_size=10,
                                        window_size=X.shape[-1],
                                        overlap_interictal=True,
                                        overlap_preictal=True)

        X, scalers = scale_across_time(X, x_test=None)

        X_test, _ = scale_across_time(X_test, x_test=None, scalers=scalers)
        # X, xt, y, yt = split_evenly(X, y, test_size=.5)
        # if enhance_size > 0:
        #     X, y = enhance_data(X, y, enhance_size, cnn=True)
        #     xt, yt = enhance_data(xt, yt, enhance_size / 2, cnn=True)
        X, xt, y, yt = train_test_split(X, y, test_size=0.25, random_state=42)
        print "train size", X.shape
        print "test_size", xt.shape

        preds_proba = np.zeros((X.shape[1], X_test.shape[0]))
        val_proba = np.zeros((xt.shape[1], xt.shape[0]))
        weighting = np.zeros((X.shape[1],))

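        # fit one model per channel; keep its validation score (used later as
        # a blending weight) and its validation-set probabilities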
        for i in range(0, X.shape[1]):
                print "Progress: " + str(100*i/X.shape[1]) + '%'
                X_train = X[:,i,:,:]
                xt_train = xt[:,i,:,:]
                weighting[i], val_proba[i,] = clf.fit(X_train,y,xt_train,yt)


                # placeholders kept so the return signature matches the other variants
                train_loss = np.array([])
                valid_loss = np.array([])
                X_test_subset = X_test[:,i,:,:]
                preds_proba[i,] = clf.predict_proba(X_test_subset)

        #idx = np.argmax(weighting)
        sc = np.amax(weighting)
        print "Best score: " + str(sc)
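        # shift-and-normalise the per-channel scores into blending weights
        # (this assumes the scores are not all identical, otherwise the sum
        # below would be zero)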
        weighting -= weighting.min()
        weighting /= weighting.sum()
        #preds_proba = preds_proba[idx,]
        preds_proba = np.average(preds_proba, axis=0, weights=weighting)

        preds_scaled = preds_proba
        #preds_scaled = min_max_scale(preds_proba)
        #validation_preds = val_proba[idx,]
        validation_preds = np.average(val_proba, axis=0, weights=weighting)


        return preds_scaled,preds_proba,list(validation_preds),list(yt),train_loss,valid_loss
def train(subject, data_path, model_path, model_params, validation_params):
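    """Train a ConvNet for one subject and pickle its state (weights + scalers).

    Two modes: with model_params['overlap'] set, the net is trained on
    overlapped windows with no validation split; otherwise a validation split
    (random or grouped by hour) is used to pick the best iteration count, and
    the final model is retrained on all data for that many iterations.
    """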
    d = load_train_data(data_path, subject)
    x, y, filename_to_idx = d['x'], d['y'], d['filename_to_idx']
    x_test = load_test_data(data_path,
                            subject)['x'] if model_params['use_test'] else None

    # --------- add params
    model_params['n_channels'] = x.shape[1]
    model_params['n_fbins'] = x.shape[2]
    model_params['n_timesteps'] = x.shape[3]

    print '============ parameters'
    for key, value in model_params.items():
        print key, ':', value
    print '========================'

    x_train, y_train = None, None
    x_valid, y_valid = None, None

    if model_params['overlap']:
        # no validation if overlap
        filenames_grouped_by_hour = cPickle.load(open('filenames.pickle'))
        data_grouped_by_hour = load_grouped_train_data(
            data_path, subject, filenames_grouped_by_hour)
        x, y = generate_overlapped_data(data_grouped_by_hour,
                                        overlap_size=model_params['overlap'],
                                        window_size=x.shape[-1],
                                        overlap_interictal=True,
                                        overlap_preictal=True)
        print x.shape

        x, scalers = scale_across_time(x, x_test=None) if model_params['scale_time'] \
            else scale_across_features(x, x_test=None)

        cnn = ConvNet(model_params)
        cnn.train(train_set=(x, y), max_iter=175000)
        state_dict = cnn.get_state()
        state_dict['scalers'] = scalers
        with open(model_path + '/' + subject + '.pickle', 'wb') as f:
            cPickle.dump(state_dict, f, protocol=cPickle.HIGHEST_PROTOCOL)
        return
    else:
        if validation_params['random_split']:
            skf = StratifiedShuffleSplit(y,
                                         n_iter=1,
                                         test_size=0.25,
                                         random_state=0)
            for train_idx, valid_idx in skf:
                x_train, y_train = x[train_idx], y[train_idx]
                x_valid, y_valid = x[valid_idx], y[valid_idx]
        else:
            filenames_grouped_by_hour = cPickle.load(open('filenames.pickle'))
            d = split_train_valid_filenames(subject, filenames_grouped_by_hour)
            train_filenames, valid_filenames = d['train_filenames'], d[
                'valid_filnames']
            train_idx = [filename_to_idx[i] for i in train_filenames]
            valid_idx = [filename_to_idx[i] for i in valid_filenames]
            x_train, y_train = x[train_idx], y[train_idx]
            x_valid, y_valid = x[valid_idx], y[valid_idx]

    if model_params['scale_time']:
        x_train, scalers_train = scale_across_time(x=x_train, x_test=x_test)
        x_valid, _ = scale_across_time(x=x_valid,
                                       x_test=x_test,
                                       scalers=scalers_train)
    else:
        x_train, scalers_train = scale_across_features(x=x_train,
                                                       x_test=x_test)
        x_valid, _ = scale_across_features(x=x_valid,
                                           x_test=x_test,
                                           scalers=scalers_train)

    del x, x_test

    print '============ dataset'
    print 'train:', x_train.shape
    print 'n_pos:', np.sum(y_train), 'n_neg:', len(y_train) - np.sum(y_train)
    print 'valid:', x_valid.shape
    print 'n_pos:', np.sum(y_valid), 'n_neg:', len(y_valid) - np.sum(y_valid)

    # -------------- validate
    cnn = ConvNet(model_params)
    best_iter = cnn.validate(train_set=(x_train, y_train),
                             valid_set=(x_valid, y_valid),
                             valid_freq=validation_params['valid_freq'],
                             max_iter=validation_params['max_iter'],
                             fname_out=model_path + '/' + subject + '.txt')

    # ---------------- scale
    d = load_train_data(data_path, subject)
    x, y, filename_to_idx = d['x'], d['y'], d['filename_to_idx']
    x_test = load_test_data(data_path,
                            subject)['x'] if model_params['use_test'] else None

    x, scalers = scale_across_time(x=x, x_test=x_test) if model_params['scale_time'] \
        else scale_across_features(x=x, x_test=x_test)
    del x_test

    cnn = ConvNet(model_params)
    cnn.train(train_set=(x, y), max_iter=best_iter)
    state_dict = cnn.get_state()
    state_dict['scalers'] = scalers
    with open(model_path + '/' + subject + '.pickle', 'wb') as f:
        cPickle.dump(state_dict, f, protocol=cPickle.HIGHEST_PROTOCOL)
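# Example invocation (hedged: the parameter dictionaries below are
# illustrative placeholders, not the exact configuration expected by ConvNet):
#
#     train('Dog_1', 'preprocessed/cnn/', 'models',
#           model_params={'use_test': False, 'overlap': 0, 'scale_time': True, ...},
#           validation_params={'random_split': True, 'valid_freq': 100, 'max_iter': 10000})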
def plot_sequences(subject, data_path, test_labels):
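    """t-SNE scatter plot of the training hours and test clips for one subject.

    Each preictal hour gets its own colour and marker, interictal clips are
    red, test clips are blue; the features are standard-scaled and reduced
    with a 50-component PCA before the 2-D t-SNE embedding.
    """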
    # data train
    filenames_grouped_by_hour = cPickle.load(open('filenames.pickle'))
    data_grouped_by_hour = load_grouped_train_data(data_path, subject, filenames_grouped_by_hour)

    interictal_hours = data_grouped_by_hour['interictal']
    preictal_hours = data_grouped_by_hour['preictal']

    marker_type = {u'D': u'diamond', u's': u'square', u'^': u'triangle_up', u'd': u'thin_diamond', u'h': u'hexagon1',
                   u'*': u'star', u'o': u'circle', u'.': u'point', u'p': u'pentagon', u'H': u'hexagon2',
                   u'v': u'triangle_down', u'8': u'octagon', u'<': u'triangle_left', u'>': u'triangle_right'}
    marker_list = marker_type.keys() * 50

    x_train, colors, markers = [], [], []
    cmap = get_cmap(len(preictal_hours))

    print len(preictal_hours)
    for i, hour in enumerate(preictal_hours):
        for clip in hour:
            x_train.append(np.reshape(clip, (1, clip.shape[0] * clip.shape[1] * clip.shape[2])))
        colors.extend([cmap(i)] * len(hour))
        markers.extend([marker_list[i]] * len(hour))

    for i, hour in enumerate(interictal_hours):
        for clip in hour:
            x_train.append(np.reshape(clip, (1, clip.shape[0] * clip.shape[1] * clip.shape[2])))
        colors.extend(['r'] * len(hour))
        markers.extend([u' '] * len(hour))

    x_train = np.vstack(x_train)
    print x_train.shape

    d = load_test_data(data_path, subject)
    x_test, id = d['x'], d['id']
    x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1] * x_test.shape[2] * x_test.shape[3]))
    color_test = test_labels['preictal']
    print np.sum(test_labels['preictal'])
    # all test clips are drawn in blue regardless of label; only the training
    # hours get distinct colours and markers
    color_test[np.where(color_test == 0)[0]] = 'b'
    color_test[np.where(color_test == 1)[0]] = 'b'

    colors.extend(list(color_test))
    markers.extend([u' '] * len(x_test))

    x_all = np.vstack((np.float64(x_train), np.float64(x_test)))
    scaler = StandardScaler()
    x_all = scaler.fit_transform(x_all)

    pca = PCA(50)
    x_all = pca.fit_transform(x_all)

    model = TSNE(n_components=2, perplexity=40, learning_rate=100, random_state=42)
    z = model.fit_transform(x_all)
    prev_c, i = 0, 0
    for a, b, c, d in zip(z[:, 0], z[:, 1], colors, markers):
        if c != prev_c and d != u' ':
            plt.scatter(a, b, c=c, s=70, marker=d, label=str(i))
            i += 1
        else:
            plt.scatter(a, b, c=c, s=70, marker=d)
        prev_c = c

    zz = z[np.where(np.array(markers) != u' ')[0], :]
    ax = plt.subplot(111)
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.05),
              ncol=2, fancybox=True, shadow=True)
    plt.xlim([min(zz[:, 0]) - 0.5, max(zz[:, 0]) + 0.5])
    plt.ylim([min(zz[:, 1]) - 0.5, max(zz[:, 1]) + 0.5])
    for label in (ax.get_xticklabels() + ax.get_yticklabels()):
        label.set_fontsize(20)
    plt.ylabel('Z_2', fontsize=20)
    plt.xlabel('Z_1', fontsize=20)
    plt.show()
def train_predict_test_cnn(subject, clf, X, X_test, enhance_size=0):
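	"""Single-model variant: fit one CNN on all channels at once.

	Returns min-max scaled test probabilities, the raw probabilities, the
	(scaled) validation predictions and labels, and the per-epoch train/valid
	losses recorded during fitting.
	"""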

	filenames_grouped_by_hour = cPickle.load(open('filenames.pickle'))
	data_grouped_by_hour = load_grouped_train_data('preprocessed/cnn/', subject, filenames_grouped_by_hour)

	
	X, y = generate_overlapped_data(data_grouped_by_hour, overlap_size=10,
	                                window_size=X.shape[-1],
	                                overlap_interictal=True,
	                                overlap_preictal=True)

	X, scalers = scale_across_time(X, x_test=None)

	X_test, _ = scale_across_time(X_test, x_test=None, scalers=scalers)


	X, xt, y, yt = split_evenly(X, y, test_size=0.25)
	# X, xt, y, yt = train_test_split(X, y, test_size=0.25)
	if enhance_size > 0:
		X, y = enhance_data(X, y, enhance_size, cnn=True, even=True)
		xt, yt = enhance_data(xt, yt, enhance_size, cnn=True, even=True)

	print "train size", X.shape
	print "test_size", xt.shape

	#print "done loading"
	clf.fit(X,y,xt,yt)

	#train_loss = np.array([])
	#valid_loss = np.array([])
	

	#print "train,valid size",train_loss.shape,valid_loss.shape
	#print "done fitting"
	preds_proba = clf.predict_proba(X_test)[:,1]

	# unsup_size = int(X_test.shape[0]/5)
	# top_ind = np.argpartition(preds_proba,-unsup_size)[-unsup_size:]
	# bot_ind = preds_proba.argsort()[:unsup_size]
	# x_new_p = X_test[top_ind]
	# x_new_i = X_test[bot_ind]
	# y_p = np.ones(x_new_p.shape[0])
	# y_i = np.zeros(x_new_i.shape[0])

	#print y_p.shape,y_i.shape
	#print x_new_p.shape, x_new_i.shape
	# x_new = np.vstack((x_new_p,x_new_i))
	# y_new = np.append(y_p,y_i)
	# #print x_new.shape,y_new.shape
	# #X,xt,y,yt = split_evenly(x_new,y_new,test_size = .25)	
	# if enhance_size > 0:
	# 	x_new,y_new = enhance_data(x_new,y_new,enhance_size,cnn=True)
		

	# print "train size", X.shape
	# print "test_size", xt.shape

	# #print "done loading"
	# clf2 = CNN(subject)
	# clf2.fit(x_new,y_new,xt,yt)

	# preds_proba = clf2.predict_proba(X_test)[:,1]
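	# pull the per-epoch losses recorded by the wrapped network (clf.convnet
	# is assumed to expose a nolearn-style train_history_ list of dicts)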
	train_loss = np.array([i["train_loss"] for i in clf.convnet.train_history_])
	valid_loss = np.array([i["valid_loss"] for i in clf.convnet.train_history_])
	#preds_proba = set_median_to_half(preds_proba)[:,1]
	preds_scaled = min_max_scale(preds_proba)
	#print preds_proba.shape
	validation_preds = min_max_scale(clf.predict_proba(xt)[:,1])

	return preds_scaled,preds_proba,list(validation_preds),list(yt),train_loss,valid_loss