import os
import cPickle

import numpy as np
import h5py as h5
from sklearn import random_projection
from sklearn.preprocessing import scale
from sklearn.random_projection import johnson_lindenstrauss_min_dim

# determine_label() and generate_train_val_test_set() are assumed to be helpers
# defined elsewhere in this codebase.


def generate_real_dataset_crafted_int_wo_covar():
    # integer-sampled variant with 7 crafted features per area and no
    # (co)variances; named distinctly so it does not shadow the 13-feature
    # generate_real_dataset_crafted() below
    ################################################ LOADING AND CLEANING THE DATA #########################################
    #samples = open('./samples_int.txt')
    samples = open('/nthome/maugust/thesis/samples_int.txt')
    #labels = open('./labels_int.txt')
    labels = open('/nthome/maugust/thesis/labels_int.txt')
    #annotations = open('./annotations_int.txt')
    annotations = open('/nthome/maugust/thesis/annotations_int.txt')
    bad_samples = []
    real_labels = []
    qpoint_lists = []
    label_list = []
    annotation_list = []
    label_count = np.zeros((1, 13))

    # each file holds one line; samples/labels/annotations are ';'-separated
    for data in samples:
        qpoint_lists = data.split(';')
    for data in labels:
        label_list = data.split(';')
    for data in annotations:
        annotation_list = data.split(';')

    print 'found %i qpoint lists.' % len(qpoint_lists)
    print 'found %i labels.' % len(label_list)
    print 'found %i annotations.' % len(annotation_list)

    for list_ind in np.arange(len(qpoint_lists)):
        bad = False
        ################# PROCESS THE LABELS
        if annotation_list[list_ind][0:2] not in ('vo', 'fl', 'mi', 'ja'):
            real_labels.append(0)
            label_count[0][0] += 1
        else:
            position = label_list[list_ind].split(',')
            # -2000/-1000 mark samples without a valid position
            if float(position[0]) == -2000 or float(position[0]) == -1000:
                real_labels.append(-1)
                bad = True
            else:
                lab = determine_label((float(position[0]), float(position[1]), float(position[2])))
                real_labels.append(lab)
                label_count[0][lab] += 1
        ################# PROCESS THE Q-POINTS
        qpoint_lists[list_ind] = qpoint_lists[list_ind].split(':')
        for point_ind in np.arange(len(qpoint_lists[list_ind])):
            qpoint_lists[list_ind][point_ind] = qpoint_lists[list_ind][point_ind].split(',')
            if len(qpoint_lists[list_ind][point_ind]) != 7:
                bad = True
        if bad:
            bad_samples.append(list_ind)

    print 'need to remove %i bad samples.' % len(bad_samples)

    ################# REMOVE BAD SAMPLES
    ind = 0
    for bad_ind in bad_samples:
        real_ind = bad_ind - ind  # indices shift left after every pop
        qpoint_lists.pop(real_ind)
        real_labels.pop(real_ind)
        annotation_list.pop(real_ind)
        ind += 1

    total = 0
    for qpoint_list in qpoint_lists:
        total += len(qpoint_list)
    print 'average number of qpoints per sample: ' + str(float(total) / len(qpoint_lists))
    print str(len(qpoint_lists)) + ' samples remain after purging.'
    print str(len(real_labels)) + ' labels remain after purging.'
    print str(len(annotation_list)) + ' annotations remain after purging.'
    print 'percentages of the labels are %s' % str(label_count / len(qpoint_lists))
    samples.close()
    labels.close()
    annotations.close()

    ################################################## COMPUTE THE FEATURES ###########################################
    good_samples = []
    good_labels = []
    good_annotations = []
    last_per = -1
    for ind, qpoint_list in enumerate(qpoint_lists):
        vec = np.zeros((84,), dtype=np.float32)  # 12 areas x 7 features
        area_points = [[] for _ in np.arange(12)]
        area_counts = np.zeros(12)
        area_x_means = np.zeros(12)
        area_y_means = np.zeros(12)
        area_z_means = np.zeros(12)
        area_highest = np.zeros(12)
        area_highest_pow = np.zeros(12)
        area_pow_means = np.zeros(12)
        bad = False
        for qpoint in qpoint_list:
            # need to subtract 1 since determine_label() numbers the areas from 1
            label = determine_label((float(qpoint[0]), float(qpoint[1]), float(qpoint[2]))) - 1
            area_points[label].append(qpoint)
            area_counts[label] += 1
            if float(qpoint[2]) > area_highest[label]:
                area_highest[label] = float(qpoint[2])
            if float(qpoint[4]) > area_highest_pow[label]:
                area_highest_pow[label] = float(qpoint[4])
        for area in np.arange(12):
            for point in area_points[area]:
                area_x_means[area] += float(point[0])
                area_y_means[area] += float(point[1])
                area_z_means[area] += float(point[2])
                area_pow_means[area] += float(point[4])
            if area_counts[area] > 0:
                area_x_means[area] /= area_counts[area]
                area_y_means[area] /= area_counts[area]
                area_z_means[area] /= area_counts[area]
                # normalize the power sum by the point count to get the mean
                area_pow_means[area] /= area_counts[area]
        for area in np.arange(12):
            vec[area*7] = area_counts[area]
            vec[area*7+1] = area_x_means[area]
            vec[area*7+2] = area_y_means[area]
            vec[area*7+3] = area_z_means[area]
            vec[area*7+4] = area_highest[area]
            vec[area*7+5] = area_highest_pow[area]
            vec[area*7+6] = area_pow_means[area]
        for index, dim in enumerate(vec):
            if not isinstance(dim, np.float32):
                bad = True
            if np.isinf(dim):
                bad = True
            # NaN never compares equal to itself, so test with np.isnan()
            if np.isnan(dim):
                bad = True
            if index == 8 and (dim < -1000 or dim > 1000):
                print dim
                bad = True
        if not bad:
            good_samples.append(vec)
            good_labels.append(real_labels[ind])
            good_annotations.append(annotation_list[ind])
        curr_percent = int(float(ind) / len(qpoint_lists) * 100)
        if last_per != curr_percent:
            last_per = curr_percent
            print 'have now looked at %i%% of the data.' % curr_percent

    f = h5.File("./crafted_real_int_wo_covar.hdf5", "w")
    f.create_dataset('data_set/data_set', (len(good_samples), 84), dtype='f')
    f.create_dataset('labels/real_labels', (len(good_labels),), dtype='i')
    dt = h5.special_dtype(vlen=unicode)
    f.create_dataset('annotations/annotations', (len(good_annotations),), dtype=dt)
    for ind, vec in enumerate(good_samples):
        f['data_set/data_set'][ind] = vec
        f['labels/real_labels'][ind] = good_labels[ind]
        f['annotations/annotations'][ind] = good_annotations[ind]
    print 'number of samples: ' + str(len(f['data_set/data_set']))
    print 'dimensionality of the samples: ' + str(len(f['data_set/data_set'][0]))
    print 'number of labels: ' + str(len(f['labels/real_labels']))
    print 'number of annotations: ' + str(len(f['annotations/annotations']))
    f['data_set/data_set'][...] = scale(f['data_set/data_set'])
    f.close()

    generate_train_val_test_set("./crafted_real_int_wo_covar.hdf5",
                                "train_val_test_crafted_real_int_wo_covar.hdf5")
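# For reference, a minimal sketch (not part of the original pipeline) of how the
# file written above can be read back. The group names match those created in
# generate_real_dataset_crafted_int_wo_covar(), and the slicing follows the
# area-major layout of 12 areas x 7 statistics; note the stored vectors have
# already been standardized with sklearn's scale().
def inspect_crafted_int_sample(path="./crafted_real_int_wo_covar.hdf5", ind=0):
    stat_names = ['count', 'x_mean', 'y_mean', 'z_mean',
                  'highest_z', 'highest_pow', 'pow_mean']
    f = h5.File(path, 'r')
    sample = f['data_set/data_set'][ind]
    print 'label: %i' % f['labels/real_labels'][ind]
    for area in np.arange(12):
        block = sample[area*7:(area+1)*7]
        print 'area %i: %s' % (area + 1, zip(stat_names, block))
    f.close()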
def generate_real_dataset_binning_cnn(data_path):
    ################################################ LOADING AND CLEANING THE DATA #########################################
    samples = open(os.path.join(data_path, 'samples_int.txt'))
    labels = open(os.path.join(data_path, 'labels_int.txt'))
    annotations = open(os.path.join(data_path, 'annotations_int.txt'))
    bad_samples = []
    real_labels = []
    qpoint_lists = []
    label_list = []
    annotation_list = []
    label_count = np.zeros((1, 13))

    for data in samples:
        qpoint_lists = data.split(';')
    for data in labels:
        label_list = data.split(';')
    for data in annotations:
        annotation_list = data.split(';')

    print 'found %i qpoint lists.' % len(qpoint_lists)
    print 'found %i labels.' % len(label_list)
    print 'found %i annotations.' % len(annotation_list)

    for list_ind in np.arange(len(qpoint_lists)):
        bad = False
        ################# PROCESS THE LABELS
        if annotation_list[list_ind][0:2] not in ('vo', 'fl', 'mi', 'ja'):
            real_labels.append(0)
            label_count[0][0] += 1
        else:
            position = label_list[list_ind].split(',')
            if float(position[0]) == -2000 or float(position[0]) == -1000:
                real_labels.append(-1)
                bad = True
            else:
                lab = determine_label((float(position[0]), float(position[1]), float(position[2])))
                real_labels.append(lab)
                label_count[0][lab] += 1
        ################# PROCESS THE Q-POINTS
        qpoint_lists[list_ind] = qpoint_lists[list_ind].split(':')
        for point_ind in np.arange(len(qpoint_lists[list_ind])):
            qpoint_lists[list_ind][point_ind] = qpoint_lists[list_ind][point_ind].split(',')
            if len(qpoint_lists[list_ind][point_ind]) != 7:
                bad = True
        if bad:
            bad_samples.append(list_ind)

    print 'need to remove %i bad samples.' % len(bad_samples)

    ################# REMOVE BAD SAMPLES
    ind = 0
    for bad_ind in bad_samples:
        real_ind = bad_ind - ind
        qpoint_lists.pop(real_ind)
        real_labels.pop(real_ind)
        annotation_list.pop(real_ind)
        ind += 1

    print str(len(qpoint_lists)) + ' samples remain after purging.'
    print str(len(real_labels)) + ' labels remain after purging.'
    print str(len(annotation_list)) + ' annotations remain after purging.'
    print 'percentages of the labels are %s' % str(label_count / len(qpoint_lists))
    samples.close()
    labels.close()
    annotations.close()

    ################################################## PROJECTING THE DATA INTO A GRID #####################################
    pcol = 0
    ps = 0
    # ASSUMPTION: the relevant area is never closer than 0.7 m or farther than
    # 4.4 m on the x-axis, never more than 2.5 m to either side on the y-axis,
    # and never more than 2 m away from the sensors on the z-axis
    bin_cm = 10
    max_x_cm = 440
    min_x_cm = 70
    max_y_cm = 250
    max_z_cm = 200
    nr_z_intervals = 2
    x_range = max_x_cm/bin_cm - min_x_cm/bin_cm
    y_range = max_y_cm*2/bin_cm
    z_range = nr_z_intervals

    f = h5.File(os.path.join(data_path, "binning_real_cnn_int.hdf5"), "w")
    f.create_dataset('data_set/data_set', (len(qpoint_lists), z_range*x_range*y_range), dtype='f')
    f.create_dataset('labels/real_labels', (len(real_labels),), dtype='i')
    dt = h5.special_dtype(vlen=unicode)
    f.create_dataset('annotations/annotations', (len(annotation_list),), dtype=dt)

    last_per = -1
    for ind, qpoint_list in enumerate(qpoint_lists):
        grid = np.zeros((z_range, x_range, y_range))
        for qpoint in qpoint_list:
            x = int(float(qpoint[0])*100) / bin_cm
            y = (int(float(qpoint[1])*100) + max_y_cm) / bin_cm
            # this actually only works if z_range == 2
            z = int(float(qpoint[2])*100) > (max_z_cm / nr_z_intervals)
            if x < min_x_cm/bin_cm or x > max_x_cm/bin_cm-1 or y > max_y_cm*2/bin_cm-1 or y < 0:
                continue
            power = float(qpoint[4])
            # on a collision, keep the strongest point in the cell
            if grid[z][x-min_x_cm/bin_cm][y] != 0:
                pcol += 1
                if grid[z][x-min_x_cm/bin_cm][y] < power:
                    grid[z][x-min_x_cm/bin_cm][y] = power
            else:
                grid[z][x-min_x_cm/bin_cm][y] = power
            ps += 1
        # unroll the grid into a vector
        f['data_set/data_set'][ind] = grid.flatten()
        f['labels/real_labels'][ind] = real_labels[ind]
        f['annotations/annotations'][ind] = annotation_list[ind]
        curr_percent = int(float(ind) / len(qpoint_lists) * 100)
        if last_per != curr_percent:
            last_per = curr_percent
            print 'have now looked at %i%% of the data.' % curr_percent

    print 'percentage of point collision: ' + str(float(pcol) / ps)
    print 'number of samples: ' + str(len(f['data_set/data_set']))
    print 'dimensionality of the samples: ' + str(len(f['data_set/data_set'][0]))
    print 'number of labels: ' + str(len(f['labels/real_labels']))
    print 'number of annotations: ' + str(len(f['annotations/annotations']))
    f.close()

    generate_train_val_test_set(os.path.join(data_path, "binning_real_cnn_int.hdf5"),
                                os.path.join(data_path, "train_val_test_binning_real_cnn_int.hdf5"))
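# Worked example of the grid arithmetic above (Python 2 integer division, as in
# the function body): with the default constants the grid is 2 x 37 x 50, i.e.
# 3700 values per flattened sample, which a downstream CNN can reshape back
# into a (z, x, y) volume.
def binning_grid_shape(bin_cm=10, min_x_cm=70, max_x_cm=440, max_y_cm=250, nr_z_intervals=2):
    x_range = max_x_cm/bin_cm - min_x_cm/bin_cm   # 44 - 7 = 37 bins of 10 cm
    y_range = max_y_cm*2/bin_cm                   # 500/10 = 50 bins
    z_range = nr_z_intervals                      # below/above the 1 m plane
    print (z_range, x_range, y_range), z_range*x_range*y_range  # (2, 37, 50) 3700
    return z_range, x_range, y_range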
def generate_real_dataset_rp(data_path, sparse=False, eps=0.1):
    ################################################ LOADING AND CLEANING THE DATA #########################################
    samples = open(os.path.join(data_path, 'samples.txt'))
    labels = open(os.path.join(data_path, 'labels.txt'))
    annotations = open(os.path.join(data_path, 'annotations.txt'))
    out_f = open(os.path.join(data_path, 'rp_out'), 'w')
    bad_samples = []
    real_labels = []
    qpoint_lists = []
    label_list = []
    annotation_list = []
    label_count = np.zeros((1, 13))

    for data in samples:
        qpoint_lists = data.split(';')
    for data in labels:
        label_list = data.split(';')
    for data in annotations:
        annotation_list = data.split(';')

    out_s = 'found %i qpoint lists.\n' % len(qpoint_lists) \
        + 'found %i labels.\n' % len(label_list) \
        + 'found %i annotations.\n\n' % len(annotation_list)
    print out_s
    out_f.write(out_s)
    out_f.close()

    for list_ind in np.arange(len(qpoint_lists)):
        bad = False
        ################# PROCESS THE LABELS
        if annotation_list[list_ind][0:2] not in ('vo', 'fl', 'mi', 'ja'):
            real_labels.append(0)
            label_count[0][0] += 1
        else:
            position = label_list[list_ind].split(',')
            if float(position[0]) == -2000 or float(position[0]) == -1000:
                real_labels.append(-1)
                bad = True
            else:
                lab = determine_label((float(position[0]), float(position[1]), float(position[2])))
                real_labels.append(lab)
                label_count[0][lab] += 1
        ################# PROCESS THE Q-POINTS
        qpoint_lists[list_ind] = qpoint_lists[list_ind].split(':')
        for point_ind in np.arange(len(qpoint_lists[list_ind])):
            qpoint_lists[list_ind][point_ind] = qpoint_lists[list_ind][point_ind].split(',')
            if len(qpoint_lists[list_ind][point_ind]) != 7:
                bad = True
        if bad:
            bad_samples.append(list_ind)

    print 'need to remove %i bad samples.' % len(bad_samples)

    ################# REMOVE BAD SAMPLES
    ind = 0
    for bad_ind in bad_samples:
        real_ind = bad_ind - ind
        qpoint_lists.pop(real_ind)
        real_labels.pop(real_ind)
        annotation_list.pop(real_ind)
        ind += 1

    out_f = open(os.path.join(data_path, 'rp_out'), 'a')
    out_s = str(len(qpoint_lists)) + ' samples remain after purging.\n' \
        + str(len(real_labels)) + ' labels remain after purging.\n' \
        + str(len(annotation_list)) + ' annotations remain after purging.\n' \
        + 'percentages of the labels are %s\n\n' % str(label_count / len(qpoint_lists))
    print out_s
    out_f.write(out_s)
    out_f.close()
    samples.close()
    labels.close()
    annotations.close()

    ################################################## PROJECTING THE DATA INTO A GRID #####################################
    pcol = 0
    ps = 0
    # ASSUMPTION: the relevant area is never closer than 0.7 m or farther than
    # 4.4 m on the x-axis, never more than 2.5 m to either side on the y-axis,
    # and never more than 2 m away from the sensors on the z-axis
    bin_cm = 3
    max_x_cm = 440
    min_x_cm = 70
    max_y_cm = 250
    max_z_cm = 200
    x_range = max_x_cm / bin_cm - min_x_cm / bin_cm
    y_range = max_y_cm * 2 / bin_cm
    z_range = max_z_cm / bin_cm

    out_f = open(os.path.join(data_path, 'rp_out'), 'a')
    out_s = 'length of data in original space: %d\n\n' % (x_range*y_range*z_range)
    print out_s
    out_f.write(out_s)
    out_f.close()

    # compute a conservative estimate of the number of latent dimensions
    # required to guarantee the given epsilon
    n_dims = johnson_lindenstrauss_min_dim(len(qpoint_lists), eps)
    out_f = open(os.path.join(data_path, 'rp_out'), 'a')
    out_s = 'number of latent dimensions needed to guarantee %f epsilon is %d\n\n' % (eps, n_dims)
    print out_s
    out_f.write(out_s)
    out_f.close()

    f_path = os.path.join(data_path, 'rp_real_sparse.hdf5') if sparse \
        else os.path.join(data_path, 'rp_real_gauss.hdf5')
    print f_path
    f = h5.File(f_path, "w")
    f.create_dataset('data_set/data_set', (len(qpoint_lists), n_dims), dtype='f')
    f.create_dataset('labels/real_labels', (len(real_labels),), dtype='i')
    dt = h5.special_dtype(vlen=unicode)
    f.create_dataset('annotations/annotations', (len(annotation_list),), dtype=dt)

    transformer = random_projection.SparseRandomProjection(n_components=n_dims) if sparse \
        else random_projection.GaussianRandomProjection(n_components=n_dims)
    if sparse:
        print 'performing projection with sparse matrix'
    else:
        print 'performing projection with gaussian matrix'
    # this is not the way it's supposed to be done, BUT the proper training set
    # doesn't fit into memory, so build the projection matrix directly
    transformer.components_ = transformer._make_random_matrix(n_dims, x_range*y_range*z_range)

    last_per = -1
    for ind, qpoint_list in enumerate(qpoint_lists):
        grid = np.zeros((x_range, y_range, z_range))
        for qpoint in qpoint_list:
            x = int(float(qpoint[0])*100) / bin_cm
            y = (int(float(qpoint[1])*100) + max_y_cm) / bin_cm
            z = int(float(qpoint[2])*100) / bin_cm
            if x - min_x_cm/bin_cm < 0 or x - min_x_cm/bin_cm > x_range-1 \
                    or y > y_range-1 or y < 0 or z > z_range-1 or z < 0:
                continue
            power = float(qpoint[4])
            if grid[x-min_x_cm/bin_cm][y][z] != 0:
                pcol += 1
                if grid[x-min_x_cm/bin_cm][y][z] < power:
                    grid[x-min_x_cm/bin_cm][y][z] = power
            else:
                grid[x-min_x_cm/bin_cm][y][z] = power
            ps += 1
        f['data_set/data_set'][ind] = transformer.transform(np.reshape(grid, (1, -1)))
        f['labels/real_labels'][ind] = real_labels[ind]
        f['annotations/annotations'][ind] = annotation_list[ind]
        curr_percent = int(float(ind) / len(qpoint_lists) * 100)
        if last_per != curr_percent:
            last_per = curr_percent
            out_f = open(os.path.join(data_path, 'rp_out'), 'a')
            out_s = 'have now looked at %i%% of the data.\n' % curr_percent
            print out_s
            out_f.write(out_s)
            out_f.close()

    print 'done with projecting onto the grid (without binning)'
    print 'percentage of point collision: ' + str(float(pcol) / ps)
    print 'number of samples: ' + str(len(f['data_set/data_set']))
    print 'dimensionality of the samples: ' + str(len(f['data_set/data_set'][0]))
    print 'number of labels: ' + str(len(f['labels/real_labels']))
    print 'number of annotations: ' + str(len(f['annotations/annotations']))

    out_f = open(os.path.join(data_path, 'rp_out'), 'a')
    out_s = 'projection done, new dimension is %d\n\n' % len(f['data_set/data_set'][0])
    print out_s
    out_f.write(out_s)
    out_f.close()
    f.close()

    if sparse:
        generate_train_val_test_set(os.path.join(data_path, "rp_real_sparse.hdf5"),
                                    os.path.join(data_path, "train_val_test_rp_real_sparse.hdf5"))
    else:
        generate_train_val_test_set(os.path.join(data_path, "rp_real_gauss.hdf5"),
                                    os.path.join(data_path, "train_val_test_rp_real_gauss.hdf5"))
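# The latent dimensionality above comes from sklearn's conservative
# Johnson-Lindenstrauss bound, which depends only on the number of samples and
# eps, not on the original dimensionality. A quick sketch with hypothetical
# sample counts (the real count is whatever survives purging):
def jl_dim_overview(eps=0.1):
    for n_samples in (10000, 50000, 100000):
        print n_samples, johnson_lindenstrauss_min_dim(n_samples, eps)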
def generate_real_dataset_crafted():
    ################################################ LOADING AND CLEANING THE DATA #########################################
    samples = open('./samples.txt')
    labels = open('./labels.txt')
    annotations = open('./annotations.txt')
    bad_samples = []
    real_labels = []
    qpoint_lists = []
    label_list = []
    annotation_list = []
    label_count = np.zeros((1, 13))

    for data in samples:
        qpoint_lists = data.split(';')
    for data in labels:
        label_list = data.split(';')
    for data in annotations:
        annotation_list = data.split(';')

    print 'found %i qpoint lists.' % len(qpoint_lists)
    print 'found %i labels.' % len(label_list)
    print 'found %i annotations.' % len(annotation_list)

    for list_ind in np.arange(len(qpoint_lists)):
        bad = False
        ################# PROCESS THE LABELS
        if annotation_list[list_ind][0:2] not in ('vo', 'fl', 'mi', 'ja'):
            real_labels.append(0)
            label_count[0][0] += 1
        else:
            position = label_list[list_ind].split(',')
            if float(position[0]) == -2000 or float(position[0]) == -1000:
                real_labels.append(-1)
                bad = True
            else:
                lab = determine_label((float(position[0]), float(position[1]), float(position[2])))
                real_labels.append(lab)
                label_count[0][lab] += 1
        ################# PROCESS THE Q-POINTS
        qpoint_lists[list_ind] = qpoint_lists[list_ind].split(':')
        for point_ind in np.arange(len(qpoint_lists[list_ind])):
            qpoint_lists[list_ind][point_ind] = qpoint_lists[list_ind][point_ind].split(',')
            if len(qpoint_lists[list_ind][point_ind]) != 7:
                bad = True
        if bad:
            bad_samples.append(list_ind)

    print 'need to remove %i bad samples.' % len(bad_samples)

    ################# REMOVE BAD SAMPLES
    ind = 0
    for bad_ind in bad_samples:
        real_ind = bad_ind - ind
        qpoint_lists.pop(real_ind)
        real_labels.pop(real_ind)
        annotation_list.pop(real_ind)
        ind += 1

    total = 0
    for qpoint_list in qpoint_lists:
        total += len(qpoint_list)
    print 'average number of qpoints per sample: ' + str(float(total) / len(qpoint_lists))
    print str(len(qpoint_lists)) + ' samples remain after purging.'
    print str(len(real_labels)) + ' labels remain after purging.'
    print str(len(annotation_list)) + ' annotations remain after purging.'
    print 'percentages of the labels are %s' % str(label_count / len(qpoint_lists))
    samples.close()
    labels.close()
    annotations.close()

    ################################################## COMPUTE THE FEATURES ###########################################
    good_samples = []
    good_labels = []
    good_annotations = []
    last_per = -1
    for ind, qpoint_list in enumerate(qpoint_lists):
        vec = np.zeros((156,), dtype=np.float32)  # 12 areas x 13 features
        area_points = [[] for _ in np.arange(12)]
        area_counts = np.zeros(12)
        area_x_means = np.zeros(12)
        area_y_means = np.zeros(12)
        area_z_means = np.zeros(12)
        area_highest = np.zeros(12)
        area_highest_pow = np.zeros(12)
        area_pow_means = np.zeros(12)
        area_x_vars = np.zeros(12)
        area_y_vars = np.zeros(12)
        area_z_vars = np.zeros(12)
        area_xy_covars = np.zeros(12)
        area_xz_covars = np.zeros(12)
        area_yz_covars = np.zeros(12)
        bad = False
        for qpoint in qpoint_list:
            # need to subtract 1 since determine_label() numbers the areas from 1
            label = determine_label((float(qpoint[0]), float(qpoint[1]), float(qpoint[2]))) - 1
            area_points[label].append(qpoint)
            area_counts[label] += 1
            if float(qpoint[2]) > area_highest[label]:
                area_highest[label] = float(qpoint[2])
            if float(qpoint[4]) > area_highest_pow[label]:
                area_highest_pow[label] = float(qpoint[4])
        for area in np.arange(12):
            for point in area_points[area]:
                area_x_means[area] += float(point[0])
                area_y_means[area] += float(point[1])
                area_z_means[area] += float(point[2])
                area_pow_means[area] += float(point[4])
            if area_counts[area] > 0:
                area_x_means[area] /= area_counts[area]
                area_y_means[area] /= area_counts[area]
                area_z_means[area] /= area_counts[area]
                # normalize the power sum by the point count to get the mean
                area_pow_means[area] /= area_counts[area]
            for point in area_points[area]:
                area_x_vars[area] += (float(point[0]) - area_x_means[area])**2
                area_y_vars[area] += (float(point[1]) - area_y_means[area])**2
                area_z_vars[area] += (float(point[2]) - area_z_means[area])**2
            # if there is only one point, we fall back to the uncorrected
            # estimator and implicitly divide by one
            if area_counts[area] > 1:
                area_x_vars[area] /= area_counts[area] - 1
                area_y_vars[area] /= area_counts[area] - 1
                area_z_vars[area] /= area_counts[area] - 1
            for point in area_points[area]:
                area_xy_covars[area] += (float(point[0]) - area_x_means[area]) * (float(point[1]) - area_y_means[area])
                area_xz_covars[area] += (float(point[0]) - area_x_means[area]) * (float(point[2]) - area_z_means[area])
                area_yz_covars[area] += (float(point[1]) - area_y_means[area]) * (float(point[2]) - area_z_means[area])
            # same fallback for the covariances
            if area_counts[area] > 1:
                area_xy_covars[area] /= area_counts[area] - 1
                area_xz_covars[area] /= area_counts[area] - 1
                area_yz_covars[area] /= area_counts[area] - 1
        for area in np.arange(12):
            # 13 features per area; the stride must be 13 so the per-area
            # blocks fill all 156 dimensions without overlapping
            base = area * 13
            vec[base] = area_counts[area]
            vec[base+1] = area_x_means[area]
            vec[base+2] = area_y_means[area]
            vec[base+3] = area_z_means[area]
            vec[base+4] = area_x_vars[area]
            vec[base+5] = area_y_vars[area]
            vec[base+6] = area_z_vars[area]
            vec[base+7] = area_xy_covars[area]
            vec[base+8] = area_xz_covars[area]
            vec[base+9] = area_yz_covars[area]
            vec[base+10] = area_highest[area]
            vec[base+11] = area_highest_pow[area]
            vec[base+12] = area_pow_means[area]
        for dim in vec:
            if not isinstance(dim, np.float32):
                bad = True
            if np.isinf(dim):
                bad = True
            # NaN never compares equal to itself, so test with np.isnan()
            if np.isnan(dim):
                bad = True
            if dim > 1e+20 or dim < -1e+20:
                print 'BAD'
                bad = True
        if not bad:
            good_samples.append(vec)
            good_labels.append(real_labels[ind])
            good_annotations.append(annotation_list[ind])
        curr_percent = int(float(ind) / len(qpoint_lists) * 100)
        if last_per != curr_percent:
            last_per = curr_percent
            print 'have now looked at %i%% of the data.' % curr_percent

    f = h5.File("./crafted_real.hdf5", "w")
    f.create_dataset('data_set/data_set', (len(good_samples), 156), dtype='f')
    f.create_dataset('labels/real_labels', (len(good_labels),), dtype='i')
    dt = h5.special_dtype(vlen=unicode)
    f.create_dataset('annotations/annotations', (len(good_annotations),), dtype=dt)
    for ind, vec in enumerate(good_samples):
        f['data_set/data_set'][ind] = vec
        f['labels/real_labels'][ind] = good_labels[ind]
        f['annotations/annotations'][ind] = good_annotations[ind]
    print 'number of samples: ' + str(len(f['data_set/data_set']))
    print 'dimensionality of the samples: ' + str(len(f['data_set/data_set'][0]))
    print 'number of labels: ' + str(len(f['labels/real_labels']))
    print 'number of annotations: ' + str(len(f['annotations/annotations']))

    # compute mean and std for scaling in the ROS node
    means = np.mean(f['data_set/data_set'], axis=0)
    stds = np.std(f['data_set/data_set'], axis=0)
    stds[stds == 0.0] = 1.0
    cPickle.dump(means, open('means_crafted.pkl', 'wb'))
    cPickle.dump(stds, open('stds_crafted.pkl', 'wb'))

    f['data_set/data_set'][...] = scale(f['data_set/data_set'])
    f.close()

    generate_train_val_test_set("./crafted_real.hdf5", "train_val_test_crafted_real.hdf5")
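# The second pass above computes Bessel-corrected (co)variances by hand. For a
# single area they can be cross-checked against numpy (hypothetical points;
# np.var/np.cov with ddof=1 match the n-1 normalization used above):
def check_area_moments():
    pts = np.array([[1.0, 2.0, 0.5],
                    [1.2, 1.8, 0.7],
                    [0.9, 2.1, 0.4]])
    x, y = pts[:, 0], pts[:, 1]
    print np.var(x, ddof=1)           # corresponds to area_x_vars
    print np.cov(x, y, ddof=1)[0][1]  # corresponds to area_xy_covars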
def crafted_predict(self):
    # presumably a method of the online predictor class: self.sample,
    # self.means, self.stds, self.model and output_prediction() are assumed to
    # be set up elsewhere. Builds the same 156-dim crafted feature vector as
    # generate_real_dataset_crafted(), standardizes it and feeds the model.
    vec = np.zeros((156,), dtype=np.float32)  # 12 areas x 13 features
    area_points = [[] for _ in np.arange(12)]
    area_counts = np.zeros(12)
    area_x_means = np.zeros(12)
    area_y_means = np.zeros(12)
    area_z_means = np.zeros(12)
    area_highest = np.zeros(12)
    area_highest_pow = np.zeros(12)
    area_pow_means = np.zeros(12)
    area_x_vars = np.zeros(12)
    area_y_vars = np.zeros(12)
    area_z_vars = np.zeros(12)
    area_xy_covars = np.zeros(12)
    area_xz_covars = np.zeros(12)
    area_yz_covars = np.zeros(12)
    for qpoint in self.sample:
        # need to subtract 1 since determine_label() numbers the areas from 1
        label = determine_label((float(qpoint[0]), float(qpoint[1]), float(qpoint[2]))) - 1
        area_points[label].append(qpoint)
        area_counts[label] += 1
        if float(qpoint[2]) > area_highest[label]:
            area_highest[label] = float(qpoint[2])
        if float(qpoint[4]) > area_highest_pow[label]:
            area_highest_pow[label] = float(qpoint[4])
    for area in np.arange(12):
        for point in area_points[area]:
            area_x_means[area] += float(point[0])
            area_y_means[area] += float(point[1])
            area_z_means[area] += float(point[2])
            area_pow_means[area] += float(point[4])
        if area_counts[area] > 0:
            area_x_means[area] /= area_counts[area]
            area_y_means[area] /= area_counts[area]
            area_z_means[area] /= area_counts[area]
            # normalize the power sum by the point count to get the mean
            area_pow_means[area] /= area_counts[area]
        for point in area_points[area]:
            area_x_vars[area] += (float(point[0]) - area_x_means[area])**2
            area_y_vars[area] += (float(point[1]) - area_y_means[area])**2
            area_z_vars[area] += (float(point[2]) - area_z_means[area])**2
        # if there is only one point, we fall back to the uncorrected
        # estimator and implicitly divide by one
        if area_counts[area] > 1:
            area_x_vars[area] /= area_counts[area] - 1
            area_y_vars[area] /= area_counts[area] - 1
            area_z_vars[area] /= area_counts[area] - 1
        for point in area_points[area]:
            area_xy_covars[area] += (float(point[0]) - area_x_means[area]) * (float(point[1]) - area_y_means[area])
            area_xz_covars[area] += (float(point[0]) - area_x_means[area]) * (float(point[2]) - area_z_means[area])
            area_yz_covars[area] += (float(point[1]) - area_y_means[area]) * (float(point[2]) - area_z_means[area])
        # same fallback for the covariances
        if area_counts[area] > 1:
            area_xy_covars[area] /= area_counts[area] - 1
            area_xz_covars[area] /= area_counts[area] - 1
            area_yz_covars[area] /= area_counts[area] - 1
    for area in np.arange(12):
        # 13 features per area, same non-overlapping layout as in training
        base = area * 13
        vec[base] = area_counts[area]
        vec[base+1] = area_x_means[area]
        vec[base+2] = area_y_means[area]
        vec[base+3] = area_z_means[area]
        vec[base+4] = area_x_vars[area]
        vec[base+5] = area_y_vars[area]
        vec[base+6] = area_z_vars[area]
        vec[base+7] = area_xy_covars[area]
        vec[base+8] = area_xz_covars[area]
        vec[base+9] = area_yz_covars[area]
        vec[base+10] = area_highest[area]
        vec[base+11] = area_highest_pow[area]
        vec[base+12] = area_pow_means[area]

    # standardize with the statistics pickled at training time
    vec = np.reshape(vec, (1, 156))
    vec -= self.means
    vec /= self.stds
    self.output_prediction(self.model.predict(vec))
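# At training time the features are standardized with sklearn's scale(), while
# crafted_predict() applies the pickled means/stds by hand. Both use the biased
# (ddof=0) standard deviation, so they agree up to float tolerance; a small
# sanity check with stand-in data:
def check_scaling_parity():
    X = np.random.randn(100, 156).astype(np.float32)
    means = np.mean(X, axis=0)
    stds = np.std(X, axis=0)
    stds[stds == 0.0] = 1.0
    print np.allclose(scale(X), (X - means) / stds, atol=1e-4)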
def compute_markov_table():
    # samples = open('/nthome/maugust/thesis/samples_int_ordered.txt')
    samples = open("./samples_int_ordered.txt")
    # labels = open('/nthome/maugust/thesis/labels_int_ordered.txt')
    labels = open("./labels_int_ordered.txt")
    # annotations = open('/nthome/maugust/thesis/annotations_int_ordered.txt')
    annotations = open("./annotations_int_ordered.txt")
    bad_samples = []
    real_labels = []
    qpoint_lists = []
    label_list = []
    annotation_list = []
    label_count = np.zeros((1, 13))

    for data in samples:
        qpoint_lists = data.split(";")
    for data in labels:
        label_list = data.split(";")
    for data in annotations:
        annotation_list = data.split(";")

    print "found %i qpoint lists." % len(qpoint_lists)
    print "found %i labels." % len(label_list)
    print "found %i annotations." % len(annotation_list)

    for list_ind in np.arange(len(qpoint_lists)):
        bad = False
        ################# PROCESS THE LABELS
        if annotation_list[list_ind][0:2] not in ("vo", "fl", "mi", "ja"):
            real_labels.append(0)
            label_count[0][0] += 1
        else:
            position = label_list[list_ind].split(",")
            if float(position[0]) == -2000 or float(position[0]) == -1000:
                real_labels.append(-1)
                bad = True
            else:
                lab = determine_label((float(position[0]), float(position[1]), float(position[2])))
                real_labels.append(lab)
                label_count[0][lab] += 1
        ################# PROCESS THE Q-POINTS
        qpoint_lists[list_ind] = qpoint_lists[list_ind].split(":")
        for point_ind in np.arange(len(qpoint_lists[list_ind])):
            qpoint_lists[list_ind][point_ind] = qpoint_lists[list_ind][point_ind].split(",")
            if len(qpoint_lists[list_ind][point_ind]) != 7:
                bad = True
        if bad:
            bad_samples.append(list_ind)

    print "need to remove %i bad samples." % len(bad_samples)

    ################# REMOVE BAD SAMPLES
    ind = 0
    for bad_ind in bad_samples:
        real_ind = bad_ind - ind
        qpoint_lists.pop(real_ind)
        real_labels.pop(real_ind)
        annotation_list.pop(real_ind)
        ind += 1

    print str(len(qpoint_lists)) + " samples remain after purging."
    print str(len(real_labels)) + " labels remain after purging."
    print str(len(annotation_list)) + " annotations remain after purging."
    print "percentages of the labels are %s" % str(label_count / len(qpoint_lists))
    samples.close()
    labels.close()
    annotations.close()

    ################################################### COMPUTE THE TABLE ##############################################
    # prior over state transitions: row i lists the states reachable from
    # state i, weighted uniformly (a = 1/11, b = 1/5, c = 1/7, d = 1/9)
    a, b, c, d = 1.0 / 11, 1.0 / 5, 1.0 / 7, 1.0 / 9
    prior_table = np.array(
        [
            [a, a, a, a, a, a, 0, 0, a, a, a, a, a],
            [b, b, b, 0, 0, b, b, 0, 0, 0, 0, 0, 0],
            [c, c, c, c, 0, c, c, c, 0, 0, 0, 0, 0],
            [c, 0, c, c, c, 0, c, c, c, 0, 0, 0, 0],
            [b, 0, 0, b, b, 0, 0, b, b, 0, 0, 0, 0],
            [c, c, c, 0, 0, c, c, 0, 0, c, c, 0, 0],
            [0, d, d, d, 0, d, d, d, 0, d, d, d, 0],
            [0, 0, d, d, d, 0, d, d, d, 0, d, d, d],
            [c, 0, 0, c, c, 0, 0, c, c, 0, 0, c, c],
            [b, 0, 0, 0, 0, b, b, 0, 0, b, b, 0, 0],
            [c, 0, 0, 0, 0, c, c, c, 0, c, c, c, 0],
            [c, 0, 0, 0, 0, 0, c, c, c, 0, c, c, c],
            [b, 0, 0, 0, 0, 0, 0, b, b, 0, 0, b, b],
        ],
        dtype=np.float32,
    )
    stat_table = np.zeros((13, 13), dtype=np.float32)

    last_state = real_labels[0]
    print prior_table
    print ""

    # count empirical transitions between consecutive labels, then normalize
    # each row into a probability distribution
    for label in real_labels[1:]:
        current_state = label
        stat_table[last_state][current_state] += 1
        last_state = current_state
    for row in np.arange(13):
        stat_table[row] /= np.sum(stat_table[row])
    print stat_table
    print ""

    # element-wise product of prior and empirical statistics, renormalized per row
    posterior_table = prior_table * stat_table
    for row in np.arange(13):
        posterior_table[row] /= np.sum(posterior_table[row])
    # state 0 carries no usable statistics, so its row keeps the prior
    posterior_table[0] = prior_table[0]
    print posterior_table
    print ""

    cPickle.dump(posterior_table, open("posterior_table.pkl", "wb"))
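# A sketch (hypothetical helper, not part of the original code) of how the
# pickled table can be used at prediction time: re-weight a classifier's
# per-state probabilities by the transition row of the previous state, then
# renormalize.
def smooth_prediction(class_probs, last_state, table_path="posterior_table.pkl"):
    posterior_table = cPickle.load(open(table_path, 'rb'))
    weighted = class_probs * posterior_table[last_state]
    return weighted / np.sum(weighted)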