def read_train_data(dataset, k_normal_val, k_normal_train, k_seizure_val,
                    k_seizure_train):
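    """Read the normal and seizure 1-hour segments of one dataset, compute
    per-channel STFT magnitudes, and scatter them into the global train and
    validation arrays according to the mask from get_train_val_split()."""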
    global train_counter_seizure
    global val_counter_seizure
    global train_counter_normal
    global val_counter_normal
    global patients

    print "read data and preprocess (fft and slicing)"
    channels = patients[dataset.user]
    print "read in channels", channels

    path = data_path + '/' + dataset.set_name + '/' + dataset.base_name
    print path

    # read in normal
    is_train_index = get_train_val_split(k_normal_train, k_normal_val)
    no_normal = k_normal_val + k_normal_train
    for i in xrange(no_normal):
        print "normal i", i
        sys.stdout.flush()
        data_1h = read_data_1h(path, '_0.mat', i * 6 + 1)
        ch_arrays = []
        for ch in channels:
            ch_arrays.append(
                calcFFT(data_1h[:, ch], fft_width, overlap)[:, floor:ceil])
        magnitude = np.stack(ch_arrays, axis=0)
        if is_train_index[i]:
            g.magnitudes_normal_train[train_counter_normal] = magnitude
            train_counter_normal += 1
        else:
            g.magnitudes_normal_val[val_counter_normal] = magnitude
            val_counter_normal += 1

    # read in seizure
    is_train_index = get_train_val_split(k_seizure_train, k_seizure_val)
    no_seizure = k_seizure_val + k_seizure_train
    for i in xrange(no_seizure):
        print "seizure i", i
        sys.stdout.flush()
        data_1h = read_data_1h(path, '_1.mat', i * 6 + 1)
        ch_arrays = []
        for ch in channels:
            ch_arrays.append(
                calcFFT(data_1h[:, ch], fft_width, overlap)[:, floor:ceil])
        magnitude = np.stack(ch_arrays, axis=0)
        if is_train_index[i]:
            g.magnitudes_seizure_train[train_counter_seizure] = magnitude
            train_counter_seizure += 1
        else:
            g.magnitudes_seizure_val[val_counter_seizure] = magnitude
            val_counter_seizure += 1

    print "Done reading in", no_normal, "no seizure hours and", no_seizure, "seizure hours"
def preprocess_test_data():
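    """Allocate the global test magnitude array and fill it with the STFT
    magnitudes of every enabled non-training dataset via read_test_data()."""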
    global magnitudes_test
    global test_counter

    print("Loading and preprocessing data...")

    no_files = 0

    for dataset in datasets.all:
        if dataset.enabled and not dataset.trainset:
            no_files += int(dataset.no_files * args.debug_sub_ratio)

    print "no_files", no_files

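    # Read one sample file to determine the STFT output shape before
    # allocating the full test array.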
    test = read_data(data_path + '/test_1/1_', '.mat', 1)
    test_magnitude = calcFFT(test[:, 0], fft_width, overlap)[:, floor:ceil]
    print "test_magnitude.shape", test_magnitude.shape
    stft_steps = test_magnitude.shape[0]

    magnitudes_test = np.zeros(
        (no_files, args.no_channels, stft_steps, ceil - floor),
        dtype=np.float32)
    print magnitudes_test.shape
    test_counter = 0

    for dataset in datasets.all:
        if dataset.enabled and not dataset.trainset:
            print "Read in dataset from %s ..." % (dataset.set_name)
            nf = int(dataset.no_files * args.debug_sub_ratio)
            read_test_data(dataset, 0, nf)

    process = psutil.Process(os.getpid())
    print("Memory usage (GB): " + str(process.memory_info().rss / 1e9))
def read_test_data(dataset, start, stop):
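    """Read the 10-minute test segments start..stop-1 of one dataset, compute
    per-channel STFT magnitudes, and append them to magnitudes_test."""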
    global magnitudes_test
    global test_counter

    print "read data and preprocess (fft and slicing)"
    channels = patients[dataset.user]
    print "read in channels", channels

    path = data_path + '/' + dataset.set_name + '/' + dataset.base_name
    print path

    # read in normal
    for i in xrange(start, stop):
        #print "test i", i
        sys.stdout.flush()
        data = read_data(path, '.mat', i + 1)
        ch_arrays = []
        for ch in channels:
            ch_arrays.append(
                calcFFT(data[:, ch], fft_width, overlap)[:, floor:ceil])
        magnitude = np.stack(ch_arrays, axis=0)
        magnitudes_test[test_counter] = magnitude
        test_counter += 1

    print "Done reading in", stop - start, "test snippets of 10min."
def preprocess():
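    """Load all enabled training datasets, precompute per-channel STFT
    magnitudes into the four global class/split arrays, and assemble the
    xTrain/yTrain and xVal/yVal vectors."""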
    global size
    global xTrain
    global udTrain
    global yTrain
    global aTrain
    global xVal
    global udVal
    global yVal
    global aVal
    global train_counter_seizure
    global val_counter_seizure
    global train_counter_normal
    global val_counter_normal
    global userdata
    global labels
    global analysis_datas

    print("Loading and preprocessing data...")

    no_normal_train = 0
    no_normal_val = 0
    no_seizure_train = 0
    no_seizure_val = 0

    for dataset in datasets.all:
        if dataset.enabled:
            no_normal_val += int(dataset.no_normal * args.debug_sub_ratio *
                                 args.chosen_validation_ratio)
            no_normal_train += int(dataset.no_normal * args.debug_sub_ratio *
                                   (1 - args.chosen_validation_ratio))
            no_seizure_val += int(dataset.no_seizure * args.debug_sub_ratio *
                                  args.chosen_validation_ratio)
            no_seizure_train += int(dataset.no_seizure * args.debug_sub_ratio *
                                    (1 - args.chosen_validation_ratio))

    no_normal = no_normal_val + no_normal_train
    no_seizure = no_seizure_val + no_seizure_train

    print "total"
    print no_normal
    print no_seizure
    print "train"
    print no_normal_train
    print no_seizure_train
    print "validation"
    print no_normal_val
    print no_seizure_val

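    # Read one sample hour to determine the STFT output shape before
    # allocating the magnitude arrays.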
    test = read_data_1h(data_path + '/train_1/1_', '_0.mat', 1)
    test_magnitude = calcFFT(test[:, 0], fft_width, overlap)[:, floor:ceil]
    print "test_magnitude.shape", test_magnitude.shape
    stft_steps = test_magnitude.shape[0]

    print no_seizure_train
    print no_seizure - no_seizure_train
    print no_normal_train
    print no_normal - no_normal_train

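    # Preallocate one 4-D float32 array per class/split:
    # (hours, channels, STFT time steps, frequency bins floor..ceil).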
    g.magnitudes_seizure_train = np.zeros(
        (no_seizure_train, args.no_channels, stft_steps, ceil - floor),
        dtype=np.float32)
    g.magnitudes_seizure_val = np.zeros(
        (no_seizure_val, args.no_channels, stft_steps, ceil - floor),
        dtype=np.float32)
    g.magnitudes_normal_train = np.zeros(
        (no_normal_train, args.no_channels, stft_steps, ceil - floor),
        dtype=np.float32)
    g.magnitudes_normal_val = np.zeros(
        (no_normal_val, args.no_channels, stft_steps, ceil - floor),
        dtype=np.float32)

    # analysis_datas = np.zeros(size, dtype=analysis_data_type)

    train_counter_seizure = 0
    val_counter_seizure = 0
    train_counter_normal = 0
    val_counter_normal = 0

    no_dss = 0
    for dataset in datasets.all:
        if dataset.enabled:
            no_dss += 1

    for dataset in datasets.all:
        if dataset.enabled and dataset.trainset:
            print "Read in dataset from %s ..." % (dataset.set_name)
            print "Processing data ..."
            k_normal_val = int(dataset.no_normal * args.debug_sub_ratio *
                               args.chosen_validation_ratio)
            k_normal_train = int(dataset.no_normal * args.debug_sub_ratio *
                                 (1 - args.chosen_validation_ratio))
            k_seizure_val = int(dataset.no_seizure * args.debug_sub_ratio *
                                args.chosen_validation_ratio)
            k_seizure_train = int(dataset.no_seizure * args.debug_sub_ratio *
                                  (1 - args.chosen_validation_ratio))
            read_train_data(dataset, k_normal_val, k_normal_train,
                            k_seizure_val, k_seizure_train)
            print 'train_counter_seizure', train_counter_seizure, 'val_counter_seizure', val_counter_seizure
            print 'train_counter_normal', train_counter_normal, 'val_counter_normal', val_counter_normal

    process = psutil.Process(os.getpid())
    print("Memory usage (GB): " + str(process.memory_info().rss / 1e9))

    print 'train_counter_seizure', train_counter_seizure, 'val_counter_seizure', val_counter_seizure
    print 'train_counter_normal', train_counter_normal, 'val_counter_normal', val_counter_normal

    print "percentiles:"
    for p in range(0, 101, 10):
        print p, np.percentile(g.magnitudes_normal_train,
                               p), np.percentile(g.magnitudes_normal_val, p)

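    # Class balancing: each normal hour contributes no_seizure samples and
    # each seizure hour contributes no_normal samples, so both classes end
    # up with no_normal * no_seizure samples apiece.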
    multiplier = 1
    no_samples_normal_ph = multiplier * no_seizure
    no_samples_seizure_ph = multiplier * no_normal
    size = no_normal * no_samples_normal_ph + no_seizure * no_samples_seizure_ph

    print "no_normal", no_normal
    print "no_seizure", no_seizure
    print "no_samples_normal_ph", no_samples_normal_ph
    print "no_samples_seizure_ph", no_samples_seizure_ph

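    # magnitudes holds one scalar per sample; here it is filled with random
    # values (apparently a placeholder), paired with a shuffled 50/50 label
    # vector. The actual spectra remain in the g.magnitudes_* arrays above.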
    magnitudes = np.random.rand(size)
    labels = np.hstack((np.zeros(size / 2), np.ones(size / 2)))
    np.random.shuffle(labels)

    print "size", size

    print "no_normal", no_normal
    print "no_seizure", no_seizure
    print "no_samples_normal_ph", no_samples_normal_ph
    print "no_samples_seizure_ph", no_samples_seizure_ph

    labels = labels.astype(np.int32)
    magnitudes = magnitudes.astype(np.float32)

    print("Histogram:")
    print np.bincount(labels)

    print "magnitudes.shape", magnitudes.shape
    print "labels.shape", labels.shape

    no_val = int(math.floor(args.chosen_validation_ratio * size))
    no_train = size - no_val
    assert no_train + no_val == size
    print 'Ratio validation:', no_val / float(size)
    if abs(no_val / float(size) - args.chosen_validation_ratio) > 0.02:
        print "WARNING: validation ratio (%g) differs from expected value (%g)" % (
            no_val / float(size), args.chosen_validation_ratio)

    xTrain = magnitudes[:no_train]
    udTrain = []
    if include_userdata:
        udTrain = userdata[:no_train]
    yTrain = labels[:no_train]

    xVal = magnitudes[no_train:]

    udVal = []
    if include_userdata:
        udVal = userdata[no_train:]
    yVal = labels[no_train:]

    print "xVal.shape", xVal.shape
    print "yVal.shape", yVal.shape
    xVal = np.vstack((xVal, yVal))
    xVal = np.swapaxes(xVal, 0, 1)
    #aVal = analysis_datas[no_train:]

    # print("Shuffling data...")
    # a = np.arange(xTrain.shape[0])
    # np.random.shuffle(a)
    # xTrain = xTrain[a]
    # if include_userdata:
    # 	udTrain = udTrain[a]
    # yTrain = yTrain[a]

    # in order to be able to release the magnitudes array
    # xVal = np.copy(xVal)

    del magnitudes
    gc.collect()

    print 'xTrain.shape', xTrain.shape
    print 'yTrain.shape', yTrain.shape
    print 'xVal.shape', xVal.shape
    print 'yVal.shape', yVal.shape
    assert xTrain.shape[0] == yTrain.shape[0]
    assert xVal.shape[0] == yVal.shape[0]

    if not args.no_save_preprocessed:
        print("Saving preprocessed data...")
        data = {
            'magnitudes_seizure_val': g.magnitudes_seizure_val,
            'magnitudes_seizure_train': g.magnitudes_seizure_train,
            'magnitudes_normal_val': g.magnitudes_normal_val,
            'magnitudes_normal_train': g.magnitudes_normal_train,
            'xTrain': xTrain,
            #'udTrain':udTrain,
            #'aTrain':aTrain,
            'yTrain': yTrain,
            'xVal': xVal,
            #'udVal':udVal,
            'yVal': yVal,
        }
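        # hickle writes the dict to HDF5; "lzf" selects h5py's fast, lightly
        # compressing filter.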
        hkl.dump(data, 'preprocessedData_16.hkl', compression="lzf")
Example 5
def preprocess():
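	"""Load all enabled training datasets, precompute per-channel STFT
	magnitudes, and build class-balanced (index, label) train and validation
	vectors using a gcd-based per-hour sample count."""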
	global size
	global xTrain
	global udTrain
	global yTrain
	global aTrain
	global xVal
	global udVal
	global yVal
	global aVal
	global train_counter_seizure
	global val_counter_seizure
	global train_counter_normal
	global val_counter_normal
	global userdata
	global labels
	global analysis_datas

	print("Loading and preprocessing data...")

	no_normal_train = 0
	no_normal_val = 0
	no_seizure_train = 0
	no_seizure_val = 0

	for dataset in datasets.all:
		if dataset.enabled:
			no_normal_val += int(dataset.no_normal * args.debug_sub_ratio * args.chosen_validation_ratio)
			no_normal_train += int(dataset.no_normal * args.debug_sub_ratio * (1-args.chosen_validation_ratio))
			no_seizure_val += int(dataset.no_seizure * args.debug_sub_ratio * args.chosen_validation_ratio)
			no_seizure_train += int(dataset.no_seizure * args.debug_sub_ratio * (1-args.chosen_validation_ratio))
	

	no_normal = no_normal_val + no_normal_train
	no_seizure = no_seizure_val + no_seizure_train

	print "total"
	print no_normal
	print no_seizure
	print "train"
	print no_normal_train
	print no_seizure_train
	print "validation"
	print no_normal_val
	print no_seizure_val
	
	test = read_data_1h(data_path+'/train_1/1_','_0.mat',1)
	test_magnitude = calcFFT(test[:,0],fft_width,overlap)[:,floor:ceil]
	print "test_magnitude.shape", test_magnitude.shape
	stft_steps = test_magnitude.shape[0]


	print no_seizure_train
	print no_seizure-no_seizure_train
	print no_normal_train
	print no_normal-no_normal_train

	g.magnitudes_seizure_train = np.zeros((no_seizure_train,args.no_channels,stft_steps,ceil-floor), dtype=np.float32)
	g.magnitudes_seizure_val = np.zeros((no_seizure_val,args.no_channels,stft_steps,ceil-floor), dtype=np.float32)
	g.magnitudes_normal_train = np.zeros((no_normal_train,args.no_channels,stft_steps,ceil-floor), dtype=np.float32)
	g.magnitudes_normal_val = np.zeros((no_normal_val,args.no_channels,stft_steps,ceil-floor), dtype=np.float32)
	

	# analysis_datas = np.zeros(size, dtype=analysis_data_type)

	train_counter_seizure = 0
	val_counter_seizure = 0
	train_counter_normal = 0
	val_counter_normal = 0

	no_dss = 0
	for dataset in datasets.all:
		if dataset.enabled:
			no_dss += 1

	for dataset in datasets.all:
		if dataset.enabled and dataset.trainset:
			print "Read in dataset from %s ..."%(dataset.set_name)
			print "Processing data ..."
			k_normal_val = int(dataset.no_normal * args.debug_sub_ratio * args.chosen_validation_ratio)
			k_normal_train = int(dataset.no_normal * args.debug_sub_ratio * (1-args.chosen_validation_ratio))
			k_seizure_val = int(dataset.no_seizure * args.debug_sub_ratio * args.chosen_validation_ratio)
			k_seizure_train = int(dataset.no_seizure * args.debug_sub_ratio * (1-args.chosen_validation_ratio))
			read_train_data(dataset,k_normal_val,k_normal_train,k_seizure_val,k_seizure_train)
			print 'train_counter_seizure', train_counter_seizure, 'val_counter_seizure', val_counter_seizure
			print 'train_counter_normal', train_counter_normal, 'val_counter_normal', val_counter_normal

	process = psutil.Process(os.getpid())
	print("Memory usage (GB): "+str(process.memory_info().rss/1e9))

	print 'train_counter_seizure', train_counter_seizure, 'val_counter_seizure', val_counter_seizure
	print 'train_counter_normal', train_counter_normal, 'val_counter_normal', val_counter_normal

	print "percentiles:"
	for p in range(0,101,10):
		print p, np.percentile(g.magnitudes_normal_train, p), np.percentile(g.magnitudes_normal_val, p)

	#Construct training vector
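	# Balance via gcd (assumed imported elsewhere; fractions.gcd on Python 2):
	# per hour, normal contributes no_seizure/gcd samples and seizure
	# contributes no_normal/gcd samples. E.g. no_normal=12, no_seizure=4 gives
	# gcd=4, so 1 sample per normal hour and 3 per seizure hour: 12*1 = 4*3
	# samples from each class.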
	train_multiplier = 1
	gcd_ = gcd(no_normal,no_seizure)
	samplesph_normal = train_multiplier * no_seizure / gcd_
	samplesph_seizure = train_multiplier * no_normal / gcd_
	size_train = no_normal_train * samplesph_normal + no_seizure_train * samplesph_seizure
	
	magnitudes = np.hstack((np.arange(size_train/2),np.arange(size_train/2)))
	labels = np.hstack((np.zeros(size_train/2),np.ones(size_train/2)))
	np.random.shuffle(labels)

	yTrain = labels.astype(np.int32)
	xTrain = magnitudes.astype(np.float32)

	print("Histogram:")
	print np.bincount(yTrain)

	print "yTrain.shape", yTrain.shape
	print "xTrain.shape", xTrain.shape

	#Construct validation vector
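	# Same gcd balancing with a larger multiplier; the modulo wraps the index
	# vectors so hours repeat until both classes contribute size_val/2 samples.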
	val_mult = 3
	gcd_ = gcd(no_normal_val,no_seizure_val)
	samplesph_normal = val_mult * no_seizure_val / gcd_
	samplesph_seizure = val_mult * no_normal_val / gcd_
	size_val = no_normal_val * samplesph_normal + no_seizure_val * samplesph_seizure
	
	magnitudes = np.hstack((np.arange(size_val/2)%no_normal_val,np.arange(size_val/2)%no_seizure_val))
	labels = np.hstack((np.zeros(size_val/2),np.ones(size_val/2)))	

	yVal = labels.astype(np.int32)
	xVal = magnitudes.astype(np.float32)

	print("Histogram:")
	print np.bincount(yVal)

	print "yTrain.shape", yTrain.shape
	print "xTrain.shape", xVal.shape

	print "xVal.shape", xVal.shape
	print "yVal.shape", yVal.shape
	xVal = np.vstack((xVal,yVal))
	xVal = np.swapaxes(xVal,0,1)

	size = size_val + size_train
	no_val = int(math.floor(args.chosen_validation_ratio * size))
	no_train = size-no_val
	assert no_train + no_val == size
	print 'Ratio validation:', no_val/float(size)
	if abs(no_val/float(size) - args.chosen_validation_ratio) > 0.02:
		print "WARNING: validation ratio (%g) differs from expected value (%g)"%(no_val/float(size), args.chosen_validation_ratio)
	

	del magnitudes
	gc.collect()


	print 'xTrain.shape', xTrain.shape
	print 'yTrain.shape', yTrain.shape
	print 'xVal.shape', xVal.shape
	print 'yVal.shape', yVal.shape
	assert xTrain.shape[0] == yTrain.shape[0]
	assert xVal.shape[0] == yVal.shape[0]

	if args.save_preprocessed:
		print("Saving preprocessed data...")
		data = {
			'magnitudes_seizure_val': g.magnitudes_seizure_val,
			'magnitudes_seizure_train': g.magnitudes_seizure_train,
			'magnitudes_normal_val': g.magnitudes_normal_val,
			'magnitudes_normal_train': g.magnitudes_normal_train,
			'xTrain':xTrain, 
			#'udTrain':udTrain, 
			#'aTrain':aTrain, 
			'yTrain':yTrain, 
			'xVal':xVal,
			#'udVal':udVal, 
			'yVal':yVal,
			}
		hkl.dump(data, 'preprocessedData.hkl',compression="lzf")