def createMiniDatasetMulti(train_size, test_size, t_affordances=(0, 1, 2, 3, 4),
                           positives_file='AffordancesDataset_augmented.h5',
                           negatives_file='AffordancesDataset_negatives.h5',
                           info_file='AffordancesDataset_augmented_names.txt'):
    """Build, save and return a small multi-label train/test dataset.

    For every entry of ``t_affordances`` except the first, up to ``train_size``
    training and ``test_size`` test samples are drawn at random from the
    positives whose label column for that entry is non-zero.  Ids already
    selected for the same split by an earlier entry are not added twice.
    The first ``train_size`` negatives are appended to the training split and
    the following ``test_size`` negatives to the test split; negatives get a 1
    in the first label column and 0 elsewhere.

    Args:
        train_size: samples drawn per affordance for training; also the number
            of negatives appended to the training split.
        test_size: samples drawn per affordance for testing; also the number
            of negatives appended to the test split.
        t_affordances: label-column indices to keep.  The first entry is never
            sampled from (presumably the background/negative column -- confirm
            against the label layout of ``positives_file``).
        positives_file: HDF5 file read with ``load_h5`` (data, multi-hot labels).
        negatives_file: HDF5 file read with ``load_h5`` (negative examples).
        info_file: ``id:name`` text file; the first letters of the names are
            used to build the output file names.

    Returns:
        Tuple ``(train_data, train_labels, test_data, test_labels)``.

    Raises:
        ValueError: if ``t_affordances`` has fewer than two entries, because
            nothing would be sampled.

    Side effects:
        Writes ``mini3_AffordancesDataset_{train,test}_<initials>_<train_size>.h5``
        in the working directory, replacing existing files of the same name.
    """
    the_affordances = np.expand_dims(np.asarray(t_affordances), 0)
    if the_affordances.size < 2:
        raise ValueError(
            't_affordances needs at least two entries; the first one is never sampled')

    names = np.genfromtxt(info_file, dtype='str', skip_header=0, delimiter=':')
    names = names[:, 1]
    aff_initials = sorted({x[0] for x in names})
    actual_initials = []
    positive_data, positive_labels = load_h5(positives_file)
    negative_data, negative_labels = load_h5(negatives_file)

    all_train_ids = None
    all_test_ids = None
    for i in range(1, the_affordances.size):
        id_ = the_affordances[0, i]
        thisIds = np.nonzero(positive_labels[:, id_])[0]
        print(thisIds.size)
        # select train and test
        np.random.shuffle(thisIds)
        train_ids = thisIds[:train_size]
        test_ids = thisIds[train_size:train_size + test_size]
        if all_train_ids is None:
            all_train_ids = train_ids
            all_test_ids = test_ids
        else:
            # A multi-label sample can match several affordances: only add the
            # ids this split does not contain yet.
            # NOTE(review): duplicates are only checked within each split, so a
            # sample may still land in train for one affordance and in test for
            # another.
            new_ = np.setdiff1d(train_ids, all_train_ids)
            all_train_ids = np.concatenate((all_train_ids, new_), axis=0)
            new_ = np.setdiff1d(test_ids, all_test_ids)
            all_test_ids = np.concatenate((all_test_ids, new_), axis=0)
        actual_initials.append(aff_initials[id_ - 1])

    negative_ids_train = np.arange(train_size)
    negative_ids_test = np.arange(train_size, train_size + test_size)
    negative_labels_train = np.zeros((train_size, the_affordances.size))
    negative_labels_train[:, 0] = 1
    negative_labels_test = np.zeros((test_size, the_affordances.size))
    negative_labels_test[:, 0] = 1

    # Keep the id vectors 1-D for row selection (squeezing a single id would
    # yield a scalar index and drop the sample axis); the column-vector view
    # broadcasts against the (1, K) affordance row to pick the label columns.
    train_data = np.concatenate(
        (positive_data[all_train_ids, ...], negative_data[negative_ids_train, ...]),
        axis=0)
    train_labels = np.concatenate(
        (positive_labels[all_train_ids.reshape(-1, 1), the_affordances],
         negative_labels_train),
        axis=0)
    test_data = np.concatenate(
        (positive_data[all_test_ids, ...], negative_data[negative_ids_test, ...]),
        axis=0)
    test_labels = np.concatenate(
        (positive_labels[all_test_ids.reshape(-1, 1), the_affordances],
         negative_labels_test),
        axis=0)

    name = 'mini3_AffordancesDataset_train_' + ''.join(actual_initials) + '_' + str(train_size) + '.h5'
    if os.path.exists(name):
        os.remove(name)
    save_h5(name, train_data, train_labels, 'float32', 'uint8')

    name = 'mini3_AffordancesDataset_test_' + ''.join(actual_initials) + '_' + str(train_size) + '.h5'
    if os.path.exists(name):
        os.remove(name)
    save_h5(name, test_data, test_labels, 'float32', 'uint8')
    return train_data, train_labels, test_data, test_labels
def split_data(list_of_files, train_size=0.8):
    """Merge several HDF5 datasets and write a random train/test split.

    Every file in ``list_of_files`` is read with ``load_h5``; the data and
    label arrays are stacked along the first axis, shuffled, and divided so
    that the first ``int(N * train_size)`` shuffled samples form the training
    split and the rest the test split.

    Args:
        list_of_files: sequence of paths readable by ``load_h5``.
        train_size: fraction of the merged samples assigned to training.

    Raises:
        ValueError: if ``list_of_files`` is empty.

    Side effects:
        Writes ``MultilabelDataSet_splitTrain4.h5`` and
        ``MultilabelDataSet_splitTest4.h5`` through ``save_h5``, and stores the
        indices (into the merged array) of the test samples in
        ``MultilabelDataSet_splitTest4.npy``.
    """
    if len(list_of_files) == 0:
        raise ValueError('list_of_files must contain at least one file')

    # Collect first and concatenate once: growing the array inside the loop
    # would copy everything read so far on every iteration.
    data_parts = []
    label_parts = []
    for file_name in list_of_files:
        tmp_data, tmp_labels = load_h5(file_name)
        data_parts.append(tmp_data)
        label_parts.append(tmp_labels)
        print(tmp_data.shape)
    data = np.concatenate(data_parts, axis=0)
    labels = np.concatenate(label_parts, axis=0)
    print('All data %d' % (data.shape[0]))

    all_ids = np.arange(data.shape[0])
    np.random.shuffle(all_ids)
    train_ids_size = int(all_ids.size * train_size)
    print(train_ids_size)

    train_ids = all_ids[:train_ids_size]
    new_train_data = data[train_ids, ...]
    new_train_labels = labels[train_ids, ...]
    test_ids = all_ids[train_ids_size:]
    new_test_data = data[test_ids, ...]
    new_test_labels = labels[test_ids, ...]
    print('Train data %d' % new_train_labels.shape[0])
    print('Test data %d' % new_test_labels.shape[0])

    save_h5('MultilabelDataSet_splitTrain4.h5', new_train_data,
            new_train_labels, 'float32', 'uint8')
    save_h5('MultilabelDataSet_splitTest4.h5', new_test_data,
            new_test_labels, 'float32', 'uint8')
    # Keep the test indices so the split can be traced back to the merged data.
    np.save('MultilabelDataSet_splitTest4.npy', test_ids)
def extractSubset(dataSet, new_size=0.5):
    """Return a random fraction of the samples stored in an HDF5 dataset.

    Args:
        dataSet: path handed to ``load_h5``.
        new_size: fraction of the samples to keep; the kept count is
            ``int(total * new_size)``.

    Returns:
        Tuple ``(data, labels)`` holding the same randomly chosen rows of both
        arrays.
    """
    full_data, full_labels = load_h5(dataSet)
    total = full_labels.shape[0]
    print(total)

    order = np.arange(total)
    keep_count = int(order.size * new_size)
    print('New %d' % keep_count)

    # Shuffle in place, then keep the leading slice as the random subset.
    np.random.shuffle(order)
    chosen = order[:keep_count]
    subset_data = full_data[chosen, ...]
    subset_labels = full_labels[chosen, ...]
    print('New Data size %d %d' % (subset_data.shape[0], subset_labels.shape[0]))
    return subset_data, subset_labels
def getDataset(file):
    """Shuffle a dataset and split it into equally sized HDF5 files.

    The dataset is read with ``load_h5``, shuffled, and cut into consecutive
    chunks of ``496 * 4`` samples.  Each chunk is written to
    ``AffordancesDataset_file<i>.h5`` in the working directory, replacing any
    existing file of that name.  Samples beyond the last full chunk are not
    written.

    Args:
        file: path of the HDF5 dataset to split.
    """
    samples_per_file = 496 * 4

    all_data, all_labels = load_h5(file)
    # shuffle them to add 'randomness'
    all_ids = np.arange(all_data.shape[0])
    np.random.shuffle(all_ids)
    all_data = all_data[all_ids, ...]
    all_labels = all_labels[all_ids]
    print(all_data.shape)
    print(all_labels.shape)

    # Floor division: true division returns a float on Python 3, which
    # range() rejects.
    n_splits = all_labels.shape[0] // samples_per_file
    print(n_splits)
    for i in range(n_splits):
        name = 'AffordancesDataset_file' + str(i) + '.h5'
        start_id = i * samples_per_file
        end_id = (i + 1) * samples_per_file
        toSaveData = all_data[start_id:end_id, ...]
        toSaveLabels = all_labels[start_id:end_id]
        print('%s %d %d' % (name, start_id, end_id))
        if os.path.exists(name):
            os.remove(name)
        save_h5(name, toSaveData, toSaveLabels, 'float32', 'uint8')
#print(tmp_ids.size) #ids_presented=ids_presented[tmp_ids] #get the original data original_data_points_file = NEW_DATA_DIR + 'dataPoints_' + str( interaction) + '.h5' original_data_clouds_file = NEW_DATA_DIR + 'dataClouds_' + str( interaction) + '.h5' input_clouds_file = INPUT_DATA_DIR + 'binaryOc_AffordancesDataset_test' + str( interaction) + '_' + str(traininig_examples) + '.h5' data_presented_original_ids_file = NEW_DATA_DIR + 'binaryOc_AffordancesDataset_test' + str( interaction) + '_' + str(traininig_examples) + '_shuffledIds.npy' #print(data_presented_original_ids_file) #this goes 0-1023 #_,original_ids=load_h5(data_presented_original_ids_file) original_ids = np.load(data_presented_original_ids_file) input_clouds, _ = load_h5(input_clouds_file) original_points, _ = load_h5(original_data_points_file) #original_clouds,_=load_h5(original_data_clouds_file) #find indices of corresponding clouds #ids=np.nonzero(original_ids>511)[0] #for j in range(ids_presented.shape[0]): #print(original_ids[:10]) #print(ids_presented[:10]) for j in range(tmp_ids.size): anId = tmp_ids[j] #affordance 'positive' examples are the last 512 in the dataset real_id = original_ids[anId] - 512 pointcloud_id = real_id if pointcloud_id != 511: continue one_original_cloud = input_clouds[anId, ...]
def createMiniDatasets(train_size, test_size,
                       positives_file='AffordancesDataset_augmented.h5',
                       negatives_file='AffordancesDataset_negatives.h5',
                       info_file='AffordancesDataset_augmented_names.txt',
                       target_affordance='Filling'):
    """Create binary (affordance vs. negative) train/test datasets.

    The positives file is read in consecutive blocks of ``128 * 8`` samples,
    one block per line of ``info_file``.  For every processed block,
    ``train_size`` training and ``test_size`` test positives are drawn at
    random without overlap and paired with the same number of randomly chosen
    negatives.  Positives are labelled 1 and negatives 0.

    * With a non-empty ``target_affordance`` only the blocks whose name
      contains that string are processed; their samples are accumulated and
      written once at the end as
      ``miniAffordancesDataset_{train,test}_<target>_<train_size>.h5``.
    * With an empty ``target_affordance`` every block is processed and written
      separately as ``binary_AffordancesDataset_{train,test}<j>_<train_size>.h5``.

    Existing output files of the same name are replaced.

    Args:
        train_size: training samples per class (positive and negative).
        test_size: test samples per class (positive and negative).
        positives_file: HDF5 file with the positive examples.
        negatives_file: HDF5 file with the negative examples.
        info_file: ``id:name`` text file, one line per affordance block.
        target_affordance: substring selecting the affordance(s) to process;
            an empty string selects all of them.

    Raises:
        SystemExit: if ``train_size + test_size`` exceeds the number of
            negatives or the number of positives in a processed block.
    """
    positive_data, _ = load_h5(positives_file)
    print(positive_data.shape)
    negative_data, negative_labels = load_h5(negatives_file)
    # Train and test negatives are disjoint slices of one shuffled index
    # vector, so the sum of both sizes has to fit.
    if train_size + test_size > negative_data.shape[0]:
        print('Number of examples exceeded')
        sys.exit(1)

    info = np.genfromtxt(info_file, dtype='str', skip_header=0, delimiter=':')
    real_ids = np.array([int(x) for x in info[:, 0]])
    bar = Bar('Processing', max=real_ids.shape[0])
    # Label multiplier for positives; kept at 1 so the datasets stay binary.
    count = 1
    if target_affordance:
        print('Getting data for %s' % (target_affordance))
    else:
        print('Getting all data ')

    # Empty accumulators shaped like a single sample of the positives file.
    sample_shape = positive_data.shape[1:]
    data_train = np.array([], dtype=np.float32).reshape((0,) + sample_shape)
    data_test = np.array([], dtype=np.float32).reshape((0,) + sample_shape)
    labels_train = np.array([], dtype=np.uint8).reshape(0, 1)
    labels_test = np.array([], dtype=np.uint8).reshape(0, 1)

    # this file is supposed to have 128 examples per affordance x 8 orientations
    block_size = 128 * 8
    for j in range(real_ids.shape[0]):
        current_aff = info[j, 1]
        if target_affordance and target_affordance not in current_aff:
            continue
        start_i = j * block_size
        end_i = (j + 1) * block_size
        thisAffordance_data = positive_data[start_i:end_i, ...]

        n_available = thisAffordance_data.shape[0]
        if train_size + test_size > n_available:
            print('Number of examples exceeded')
            sys.exit(1)
        # One shuffled index vector gives train and test ids that are unique,
        # random and disjoint (drawing with replacement duplicated training
        # samples and left the test ids as the lowest unused indices).
        positive_order = np.arange(n_available)
        np.random.shuffle(positive_order)
        train_ids = positive_order[:train_size]
        test_ids = positive_order[train_size:train_size + test_size]

        sample_negative = np.arange(negative_data.shape[0])
        np.random.shuffle(sample_negative)

        # ---- training split ----
        data = np.concatenate(
            (thisAffordance_data[train_ids, ...],
             negative_data[sample_negative[:train_size], ...]),
            axis=0)
        labels = np.concatenate(
            (np.ones((train_size, 1)), np.zeros((train_size, 1))), axis=0)
        if target_affordance:
            # accumulate over all matching affordances
            data_train = np.concatenate((data, data_train), axis=0)
            labels_train = np.concatenate((count * labels, labels_train), axis=0)
        else:
            data_train = data
            labels_train = labels
        shuffle_ids = np.arange(labels_train.shape[0])
        np.random.shuffle(shuffle_ids)
        data_train = data_train[shuffle_ids, ...]
        labels_train = labels_train[shuffle_ids]
        if not target_affordance:
            name = 'binary_AffordancesDataset_train' + str(j) + '_' + str(train_size) + '.h5'
            if os.path.exists(name):
                os.remove(name)
            save_h5(name, data_train, labels_train, 'float32', 'uint8')

        # ---- test split ----
        data = np.concatenate(
            (thisAffordance_data[test_ids, ...],
             negative_data[sample_negative[train_size:train_size + test_size], ...]),
            axis=0)
        labels = np.concatenate(
            (np.ones((test_size, 1)), np.zeros((test_size, 1))), axis=0)
        if target_affordance:
            data_test = np.concatenate((data, data_test), axis=0)
            labels_test = np.concatenate((count * labels, labels_test), axis=0)
        else:
            data_test = data
            labels_test = labels
        shuffle_ids = np.arange(labels_test.shape[0])
        np.random.shuffle(shuffle_ids)
        data_test = data_test[shuffle_ids, ...]
        labels_test = labels_test[shuffle_ids]
        if not target_affordance:
            name = 'binary_AffordancesDataset_test' + str(j) + '_' + str(train_size) + '.h5'
            if os.path.exists(name):
                os.remove(name)
            save_h5(name, data_test, labels_test, 'float32', 'uint8')
        bar.next()
    bar.finish()

    if target_affordance:
        print('Saving test data for %s ' % (target_affordance))
        print(data_test.shape)
        print(labels_test.shape)
        name = 'miniAffordancesDataset_test_' + target_affordance + '_' + str(train_size) + '.h5'
        if os.path.exists(name):
            os.remove(name)
        save_h5(name, data_test, labels_test, 'float32', 'uint8')

        name = 'miniAffordancesDataset_train_' + target_affordance + '_' + str(train_size) + '.h5'
        print('Saving train data for %s ' % (target_affordance))
        print(data_train.shape)
        print(labels_train.shape)
        if os.path.exists(name):
            os.remove(name)
        save_h5(name, data_train, labels_train, 'float32', 'uint8')
def getMiniDataset(class_ids, train_size, test_size,
                   file='AffordancesDataset_augmented.h5',
                   negatives_file='AffordancesDataset_negatives.h5',
                   return_data=False,
                   info_file='AffordancesDataset_augmented_names.txt'):
    """Build a small multi-class dataset from a subset of the affordance classes.

    For every id in ``class_ids``, ``train_size`` training and ``test_size``
    test samples are drawn at random (without overlap) from the samples whose
    label equals that id.  Classes are relabelled ``1..len(class_ids)`` in the
    order given.  The first ``train_size`` negatives are appended to the
    training split and the next ``test_size`` negatives to the test split,
    both with label 0, and each split is shuffled.

    Args:
        class_ids: 1-D array of original class ids to include.
        train_size: training samples per class.
        test_size: test samples per class; a negative value means "all samples
            of the file minus ``train_size``".
        file: HDF5 file with the positive examples and their class labels.
        negatives_file: HDF5 file with the negative examples.
        return_data: when true nothing is written and the arrays are returned;
            otherwise the splits and a names file are saved.
        info_file: ``id:name`` text file describing the classes.

    Returns:
        ``(train_data, train_labels, test_data, test_labels)`` when
        ``return_data`` is true, otherwise ``(0, 0, 0, 0)``.

    Raises:
        SystemExit: if ``train_size + test_size`` exceeds the number of samples
            in ``file``.

    Side effects (only when ``return_data`` is false):
        Writes ``mini_AffordancesDataset_{train,test}_<initials>_<train_size>.h5``
        and ``mini_AffordancesDataset_names_<initials>_<train_size>.txt``,
        replacing existing ``.h5`` files of the same name.
    """
    names = np.genfromtxt(info_file, dtype='str', skip_header=0, delimiter=':')
    all_data, all_labels = load_h5(file)
    if (test_size + train_size) > all_labels.shape[0]:
        # The placeholder was missing, which made this error path itself fail.
        print('Max data size is %d' % all_labels.shape[0])
        sys.exit()
    if test_size < 0:
        test_size = all_labels.shape[0] - train_size

    n_classes = class_ids.shape[0]
    train_ids = np.zeros((n_classes * train_size, 1), dtype=np.int32)
    test_ids = np.zeros((n_classes * test_size, 1), dtype=np.int32)
    new_labels_train = np.zeros((n_classes * train_size, 1), dtype=np.uint8)
    new_labels_test = np.zeros((n_classes * test_size, 1), dtype=np.uint8)
    aff_initial = []
    for i in range(n_classes):
        ids = np.nonzero(all_labels == class_ids[i])[0]
        # Random order over this class' samples; train and test are taken from
        # disjoint slices of it.
        order = np.arange(ids.shape[0], dtype=np.int32)
        np.random.shuffle(order)

        start_id = i * train_size
        end_id = (i + 1) * train_size
        train_ids[start_id:end_id, 0] = ids[order[:train_size]]
        new_labels_train[start_id:end_id, 0] = i + 1

        start_id = i * test_size
        end_id = (i + 1) * test_size
        test_ids[start_id:end_id, 0] = ids[order[train_size:train_size + test_size]]
        new_labels_test[start_id:end_id, 0] = i + 1

        aff_initial.append(names[class_ids[i], 1][0])
    print(aff_initial)

    # Take the single column explicitly: squeezing a one-element array would
    # produce a 0-d array without a length.
    train_ids = train_ids[:, 0]
    test_ids = test_ids[:, 0]
    print('Training set %d' % train_ids.shape[0])
    print('Testing set %d' % test_ids.shape[0])
    new_data_train = all_data[train_ids, ...]
    new_data_test = all_data[test_ids, ...]

    # concatenate here the negatives
    negative_data, negative_labels = load_h5(negatives_file)
    new_data_train = np.concatenate((new_data_train, negative_data[:train_size]), axis=0)
    new_labels_train = np.concatenate((new_labels_train, np.zeros((train_size, 1))), axis=0)
    train_shuffle = np.arange(new_data_train.shape[0])
    np.random.shuffle(train_shuffle)
    new_data_train = new_data_train[train_shuffle, ...]
    new_labels_train = new_labels_train[train_shuffle]
    name = 'mini_AffordancesDataset_train_' + ''.join(aff_initial) + '_' + str(train_size) + '.h5'
    if not return_data:
        if os.path.exists(name):
            os.remove(name)
        save_h5(name, new_data_train, new_labels_train, 'float32', 'uint8')

    new_data_test = np.concatenate(
        (new_data_test, negative_data[train_size:train_size + test_size]), axis=0)
    new_labels_test = np.concatenate((new_labels_test, np.zeros((test_size, 1))), axis=0)
    test_shuffle = np.arange(new_data_test.shape[0])
    np.random.shuffle(test_shuffle)
    new_data_test = new_data_test[test_shuffle, ...]
    new_labels_test = new_labels_test[test_shuffle]

    print('Training data ')
    print(new_data_train.shape)
    print(new_labels_train)
    print('Test data ')
    print(new_data_test.shape)
    print(new_labels_test.shape)

    name = 'mini_AffordancesDataset_test_' + ''.join(aff_initial) + '_' + str(train_size) + '.h5'
    if not return_data:
        if os.path.exists(name):
            os.remove(name)
        save_h5(name, new_data_test, new_labels_test, 'float32', 'uint8')
        # save the original class ids to keep track of the affordances
        # involved in this dataset
        name = 'mini_AffordancesDataset_names_' + ''.join(aff_initial) + '_' + str(train_size) + '.txt'
        with open(name, "w") as text_file:
            for i in range(n_classes):
                print('%d:%s' % (i + 1, names[class_ids[i], 1]))
                text_file.write("%d:%s\n" % (i + 1, names[class_ids[i], 1]))
    else:
        for i in range(n_classes):
            print('%d:%s' % (i + 1, names[class_ids[i], 1]))

    if return_data:
        return new_data_train, new_labels_train, new_data_test, new_labels_test
    return 0, 0, 0, 0
def extractSingleLabeledData(data_file):
    """Turn a multi-label dataset into single-label train/test datasets.

    For every label column, samples with a non-zero entry in that column are
    drawn at random: 512 for training and 128 for testing, or an 80/20 split of
    whatever is available when a column has fewer than 640 samples.  Each drawn
    sample is labelled with the index of the column it was drawn for.  Both
    splits are shuffled and written through ``save_h5`` as
    ``SinglelabelDataSet_train_<tag>.h5`` and ``SinglelabelDataSet_test_<tag>.h5``,
    where ``<tag>`` is the text after the last underscore of the part of
    ``data_file`` that precedes its first dot.

    NOTE(review): a sample with several non-zero labels can be drawn for more
    than one column, so it may appear in both splits -- confirm this is
    acceptable.

    Args:
        data_file: path of the multi-label HDF5 dataset read with ``load_h5``.
    """
    data, label = load_h5(data_file)
    print(label.shape)
    train_examples = 512
    test_examples = 128
    examples = train_examples + test_examples
    n_labels = label.shape[1]
    n_pts = data.shape[1]
    print(examples * n_labels, n_pts, 3)

    # Buffers sized for the full quota of every label; trimmed after filling.
    new_data_train = np.zeros((train_examples * n_labels, n_pts, 3), dtype=np.float32)
    new_labels_train = np.zeros((train_examples * n_labels, 1), dtype=np.int32)
    new_data_test = np.zeros((test_examples * n_labels, n_pts, 3), dtype=np.float32)
    new_labels_test = np.zeros((test_examples * n_labels, 1), dtype=np.int32)

    train_start = 0
    test_start = 0
    for aff in range(n_labels):
        # samples that carry this label
        members = np.nonzero(label[:, aff])[0]
        order = np.arange(members.size)
        np.random.shuffle(order)

        if order.size >= (train_examples + test_examples):
            n_train = train_examples
            n_test = test_examples
        else:
            # not enough samples: fall back to an 80/20 split of what exists
            n_train = int(order.size * .8 // 1)
            n_test = order.size - n_train
            print('Less data from %d,%d' % (n_train, n_test))

        train_end = train_start + n_train
        test_end = test_start + n_test
        picked_train = members[order[:n_train]]
        picked_test = members[order[n_train:n_train + n_test]]

        new_data_train[train_start:train_end, ...] = data[picked_train, ...]
        new_labels_train[train_start:train_end, ...] = aff
        new_data_test[test_start:test_end, ...] = data[picked_test, ...]
        new_labels_test[test_start:test_end, ...] = aff

        train_start = train_end
        test_start = test_end

    # drop the unused tail left by labels that had fewer samples than the quota
    new_data_train = new_data_train[:train_end, ...]
    new_labels_train = new_labels_train[:train_end, ...]
    new_data_test = new_data_test[:test_end, ...]
    new_labels_test = new_labels_test[:test_end, ...]

    # shuffle both splits
    perm = np.arange(new_labels_train.shape[0])
    np.random.shuffle(perm)
    new_data_train = new_data_train[perm, ...]
    new_labels_train = new_labels_train[perm, ...]
    perm = np.arange(new_labels_test.shape[0])
    np.random.shuffle(perm)
    new_data_test = new_data_test[perm, ...]
    new_labels_test = new_labels_test[perm, ...]
    print('New binary train data %d' % new_labels_train.shape[0])
    print('New binary test data %d' % new_labels_test.shape[0])

    tag = data_file.split('.')[0].split('_')[-1]
    name = 'SinglelabelDataSet_train_' + tag + '.h5'
    print(name)
    save_h5(name, new_data_train, new_labels_train, 'float32', 'uint8')
    name = 'SinglelabelDataSet_test_' + tag + '.h5'
    print(name)
    save_h5(name, new_data_test, new_labels_test, 'float32', 'uint8')