from numpy import append

def generate_large_batch_parallel(args):
    # Worker: concatenate small batches into large batches and save them.
    # Assumes get_bag_of_words_matrix and save_large_batch from this module.
    batches, large_batch_size = args
    large_batches_lst = []
    x = None
    for batch in batches:
        # Append the input data for this batch.
        x_tmp = get_bag_of_words_matrix(batch)
        x = x_tmp if x is None else append(x, x_tmp, axis=0)
        if len(x) == large_batch_size and batches[-1] - batch >= large_batch_size:
            # A full large batch, with at least one more large batch still to come.
            large_batches_lst.append(int(batch))
            save_large_batch(int(batch), x)
            x = None
        elif batch == batches[-1]:
            # Final batch: flush whatever has accumulated.
            large_batches_lst.append(int(batch))
            save_large_batch(int(batch), x)
            x = None
    return large_batches_lst  # Report the saved large-batch ids to the caller.
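# Driver sketch (an assumption, not part of the original source): the
# *_parallel suffix suggests the worker above is mapped over a multiprocessing
# pool, each worker receiving a contiguous slice of the batch list. The helper
# name generate_large_batches is hypothetical.
from math import ceil
from multiprocessing import Pool

def generate_large_batches(batches, large_batch_size, processes=4):
    # Split the batch list into contiguous chunks, one per worker process.
    chunk_size = int(ceil(len(batches) / float(processes)))
    chunks = [batches[i:i + chunk_size] for i in range(0, len(batches), chunk_size)]
    pool = Pool(processes)
    results = pool.map(generate_large_batch_parallel,
                       [(chunk, large_batch_size) for chunk in chunks])
    pool.close()
    pool.join()
    return [b for lst in results for b in lst]  # Flatten the per-worker lists.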
def __init__(self, trainingdata=False):
    self.batches = data_processing.get_batch_list(trainingdata)
    # Data produced by running the documents through the neural network.
    self.lower_dimension_data = []      # Output data from the DBN.
    self.higher_dimensional_data = []   # Input data to the DBN.
    self.path = 'output'
    if not os.path.exists(self.path):
        os.makedirs(self.path)
    weights = rsm.get_weights()
    visible_biases = rsm.get_visible_biases()
    hidden_biases = rsm.get_hidden_biases()
    # Generate class indices and class names.
    if trainingdata:
        path = 'pickle/train/bag_of_words'
    else:
        path = 'pickle/test/bag_of_words'
    # Class indices for all documents.
    self.class_indices = self.__generate_class_indices__(path, self.batches)
    # Run through the batches and build the high- and low-dimensional data lists.
    for batch in range(len(self.batches)):
        print 'Batch ', batch + 1, ' of ', len(self.batches)
        d = data_processing.get_bag_of_words_matrix(self.batches[batch], trainingdata)
        self.higher_dimensional_data += list(d)
        self.lower_dimension_data += list(rsm.generate_output_data(d, weights, visible_biases, hidden_biases))
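# Sketch of rsm.generate_output_data (an assumption that mirrors the positive
# phase of rsm_learn further down): the RSM maps a bag-of-words matrix to
# hidden-unit probabilities, scaling the hidden biases by each document's
# length. visible_biases is only needed for reconstruction, not for this
# upward pass. The name rsm_output is hypothetical.
from numpy import dot, exp, outer, sum

def rsm_output(data, weights, visible_biases, hidden_biases):
    D = sum(data, axis=1)  # Total word count of each document.
    return 1.0 / (1.0 + exp(-(dot(data, weights) + outer(D, hidden_biases))))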
from numpy import append

def compare_real_data_to_reconstructed_data():
    weights = s.load(open(env_paths.get_dbn_weight_path(), "rb"))
    batches = s.load(open(env_paths.get_batches_path(train=False), "rb"))
    class_indices = s.load(open(env_paths.get_class_indices_path(False, batches[0]).replace(".0", ""), "rb"))
    batch = batches[0]
    data = data_processing.get_bag_of_words_matrix(batch, training=False)
    # Pick the first document seen for each of up to 10 distinct classes.
    docs_by_class = {}
    for i in range(len(class_indices)):
        idx = class_indices[i]
        if idx in docs_by_class:
            continue
        docs_by_class[idx] = data[i]
        if len(docs_by_class) >= 10:
            break
    print docs_by_class.keys()
    data_points = docs_by_class.values()
    output_data_points = []
    for d in data_points:
        d = append(d, 1.)  # Append the bias unit.
        out = generate_output_data(d, weights)
        output_data_points.append(out)
    visualise_data_points(data_points, output_data_points)
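# Sketch of the forward pass assumed by generate_output_data above (an
# assumption; the real function lives in the DBN module). Each weight matrix
# carries an extra bias row, so a bias unit of 1 is appended to the
# activations before every layer. The name forward_pass is hypothetical.
from numpy import append, dot, exp

def forward_pass(x, weight_matrices_added_biases):
    out = x  # x arrives with the bias unit already appended.
    for w in weight_matrices_added_biases:
        out = 1.0 / (1.0 + exp(-dot(out, w)))  # Sigmoid layer.
        out = append(out, 1.)  # Re-append the bias unit for the next layer.
    return out[:-1]  # Drop the trailing bias unit.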
def rsm_learn(self, epochs):
    '''
    Learning method for the replicated softmax. A higher number of epochs
    results in more training.

    Parameters
    ----------
    epochs: The number of epochs.
    '''
    # Assumes numpy (array, dot, exp, outer, sum), numpy.random as rand,
    # and a module-level sigmoid (see the helper sketch after this method).
    for epoch in range(epochs):
        errsum = 0
        batch_index = 0
        for _ in self.batches:
            # Positive phase - generate data from visible to hidden units.
            pos_vis = data_processing.get_bag_of_words_matrix(self.batches[batch_index])
            D = sum(pos_vis, axis=1)  # Total word count of each document.
            batch_size = len(pos_vis)
            pos_hid_prob = sigmoid(dot(pos_vis, self.weights) + outer(D, self.hidden_biases))
            # A hidden state is 1 if its probability exceeds a uniform random draw.
            randoms = rand.rand(batch_size, self.num_hid)
            pos_hid = array(randoms < pos_hid_prob, dtype=int)
            # Negative phase - generate data from hidden to visible units and
            # then back to hidden units.
            neg_vis = dot(pos_hid, self.weights.T) + self.visible_biases
            tmp = exp(neg_vis)
            s = tmp.sum(axis=1).reshape((batch_size, 1))
            neg_vis_pdf = tmp / s  # Softmax over the vocabulary for each document.
            neg_vis *= 0
            for i in xrange(batch_size):
                # Reconstruct each document by drawing D[i] words from its softmax.
                neg_vis[i] = rand.multinomial(D[i], neg_vis_pdf[i], size=1)
            neg_hid_prob = sigmoid(dot(neg_vis, self.weights) + outer(D, self.hidden_biases))
            # Accumulate the squared reconstruction error.
            errsum += sum((pos_vis - neg_vis) ** 2)
            # Contrastive divergence updates with momentum.
            self.delta_weights = self.delta_weights * self.momentum + dot(pos_vis.T, pos_hid_prob) - dot(neg_vis.T, neg_hid_prob)
            self.delta_visible_biases = self.delta_visible_biases * self.momentum + pos_vis.sum(axis=0) - neg_vis.sum(axis=0)
            self.delta_hidden_biases = self.delta_hidden_biases * self.momentum + pos_hid_prob.sum(axis=0) - neg_hid_prob.sum(axis=0)
            self.weights += self.delta_weights * (self.epsilon_weights / batch_size)
            self.visible_biases += self.delta_visible_biases * (self.epsilon_visibleBiases / batch_size)
            self.hidden_biases += self.delta_hidden_biases * (self.epsilon_hiddenBiases / batch_size)
            batch_index += 1
        print 'Epoch ', epoch + 1, ' Error ', errsum / batch_size
        self.__save_rsm__()
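# Helper sketch for the sigmoid used above (an assumption; the project defines
# it elsewhere). The tanh form is algebraically identical, since
# sigmoid(x) = (1 + tanh(x / 2)) / 2.
from numpy import exp

def sigmoid(x):
    # Element-wise logistic function.
    return 1.0 / (1.0 + exp(-x))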
from numpy import append, ones, sum

def error(args):
    """
    Compute the training or testing error on the unfolded network.
    """
    weights, epoch, training, batches, queue = args
    err = 0
    for batch in range(len(batches)):
        x = get_bag_of_words_matrix(batches[batch]) if training else get_bag_of_words_matrix(batches[batch], training=False)
        x = append(x, ones((len(x), 1)), axis=1)  # Append the bias unit.
        xout, _ = generate_output_data(x, weights)
        # Squared error between the input (without the bias unit) and its reconstruction.
        err += sum((x[:, :-1] - xout) ** 2)
    prefix = 'Train' if training else 'Test'
    out = prefix + ' error before epoch[' + str(epoch + 1) + ']: ' + str(err / len(batches))
    queue.put([training, out, err / len(batches)])
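# Usage sketch (an assumption): error takes a single args tuple and reports
# through a queue, which matches being launched in worker processes, e.g. to
# evaluate train and test error concurrently. The driver name report_errors
# is hypothetical; weights and the batch lists are assumed to be in scope.
from multiprocessing import Process, Queue

def report_errors(weights, epoch, train_batches, test_batches):
    queue = Queue()
    jobs = []
    for training, batch_list in [(True, train_batches), (False, test_batches)]:
        p = Process(target=error, args=((weights, epoch, training, batch_list, queue),))
        p.start()
        jobs.append(p)
    results = [queue.get() for _ in jobs]  # One [training, message, err] per worker.
    for p in jobs:
        p.join()
    for _, message, _ in results:
        print message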
from numpy import append, log, ones, sum

def error(args):
    """
    Compute the training or testing error on the unfolded network.
    """
    weights, epoch, training, batches, queue, binary_output = args
    err = 0
    for batch in range(len(batches)):
        x = get_bag_of_words_matrix(batches[batch]) if training else get_bag_of_words_matrix(batches[batch], training=False)
        x = append(x, ones((len(x), 1)), axis=1)  # Append the bias unit.
        xout, _ = generate_output_data(x, weights, binary_output=binary_output)
        x[:, :-1] = get_norm_x(x[:, :-1])  # Normalise the word counts.
        # Cross-entropy between the normalised input and its reconstruction.
        err -= sum(x[:, :-1] * log(xout))
    prefix = 'Train' if training else 'Test'
    out = '%s error before epoch[%i]: %.2f' % (prefix, epoch + 1, err / len(batches))
    queue.put([training, out, err / len(batches)])
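# Sketch of get_norm_x (an assumption based on its use above): the
# cross-entropy only makes sense if each row of word counts is turned into a
# probability distribution over the vocabulary.
from numpy import array, float64

def get_norm_x(x):
    x = array(x, dtype=float64)
    row_sums = x.sum(axis=1).reshape((len(x), 1))
    return x / row_sums  # Each row now sums to 1.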
def generate_input_data_list(training=True):
    """
    Generate a list of all input data.

    @param training: If True, generate the input for the training data;
    otherwise for the test data.
    """
    batches = data_processing.get_batch_list(training=training)
    input_data = []
    for batch in range(len(batches)):
        print 'Batch ', batch + 1, ' of ', len(batches)
        d = data_processing.get_bag_of_words_matrix(batches[batch], training=training)
        d = get_norm_x(d)  # Normalise the word counts.
        input_data += list(d)
    return input_data
from numpy import append
from numpy.random import choice, randint

def compare_real_data_to_reconstructed_data_random():
    weights = s.load(open(env_paths.get_dbn_weight_path(), "rb"))
    batches = s.load(open(env_paths.get_batches_path(train=False), "rb"))
    batch = choice(batches)  # Make sure to pick a batch at random.
    data = data_processing.get_bag_of_words_matrix(batch, training=False)
    # Choose 10 data points at random.
    data_points = []
    indices = randint(0, len(data), 10)
    for idx in indices:
        data_points.append(data[idx])
    output_data_points = []
    for d in data_points:
        d = append(d, 1.)  # Append the bias unit.
        out = generate_output_data(d, weights)
        output_data_points.append(out)
    visualise_data_points(data_points, output_data_points)
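# Sketch of visualise_data_points (an assumption; the real plotting helper is
# defined elsewhere in the project). One way to compare the pairs is to
# overlay each real bag-of-words vector with its reconstruction. The name
# plot_real_vs_reconstructed is hypothetical.
import matplotlib.pyplot as plt

def plot_real_vs_reconstructed(data_points, output_data_points):
    fig, axes = plt.subplots(len(data_points), 1, sharex=True)
    for ax, real, recon in zip(axes, data_points, output_data_points):
        ax.plot(real, label='real')
        ax.plot(recon, label='reconstructed')
    axes[0].legend()
    plt.show()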
def __generate_output_for_train_data_par(args):
    # Worker: run one batch of training data through the unfolded network.
    batch, weight_matrices_added_biases, binary_output = args
    d = data_processing.get_bag_of_words_matrix(batch, training=True)
    return list(generate_output_data(d, weight_matrices_added_biases, binary_output=binary_output))
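# Usage sketch (an assumption): the *_par suffix and the single args tuple
# suggest this worker is driven by Pool.map, one batch per task, with the
# per-batch outputs concatenated in batch order. The driver name
# generate_output_for_train_data is hypothetical.
from multiprocessing import Pool

def generate_output_for_train_data(batches, weight_matrices_added_biases, binary_output, processes=4):
    pool = Pool(processes)
    results = pool.map(__generate_output_for_train_data_par,
                       [(b, weight_matrices_added_biases, binary_output) for b in batches])
    pool.close()
    pool.join()
    output = []
    for r in results:
        output += r  # Keep the documents in batch order.
    return output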