def handle_sys_args(sys_argv):
    #test = ["man","-V 2000", "-H 500,250", "-O 2","-E 1","-T 0.7"]
    #sys.argv[1:] = test

    # Command line options; every short option takes a value, so each needs a trailing ':'.
    shortoptions = "V:H:O:E:B:"
    longoptions = ["visible=", "hidden=", "output=", "epochs=", "binary="]

    # Default network configuration.
    visible = 2000
    hiddentxt = "500,500"
    hidden = [500, 500]
    output = 128
    epochs = 50
    train = 0.7
    binary = False
    path = 'input'

    if len(sys_argv) < 2:
        usage(visible, hiddentxt, output, epochs, train)
    try:
        # argv[1] holds the mode ('aut'/'man'); the options start at argv[2].
        opts, _ = getopt.getopt(sys_argv[2:], shortoptions, longoptions)
    except getopt.GetoptError:
        usage(visible, hiddentxt, output, epochs, train)

    # Parse arguments
    for o, a in opts:
        if o in ('-V', '--visible'):
            visible = int(a)
        elif o in ('-H', '--hidden'):
            hidden = []
            hiddens = a.split(',')
            for h in hiddens:
                hidden.append(int(h))
        elif o in ('-O', '--output'):
            output = int(a)
        elif o in ('-E', '--epochs'):
            epochs = int(a)
        elif o in ('-B', '--binary'):
            binary = a.lower() in ('y', 'yes', 'true', '1')
        else:
            assert False, "unknown option"

    if 'y' in raw_input("Data preparation? [y/n]"):
        check_path(path)
        training_path = raw_input("Enter training path as a relative path (e.g. input/train):")
        test_path = raw_input("Enter testing path as a relative path (e.g. input/test):")
        stem = raw_input("Stemming the documents is needed for the data processing to complete. Stem documents? [y/n]")
        run_data_processing(visible, training_path, test_path, stem=True if stem == 'y' else False)

    if 'aut' in sys_argv[1]:
        # Automatic mode: run pretraining and finetuning in one go.
        deepbelief = dbn.DBN(visible, data_processing.get_batch_list(), hidden, output, epochs,
                             binary_output=binary)
        deepbelief.run_dbn()
    elif 'man' in sys_argv[1]:
        # Manual mode: ask before each training stage.
        deepbelief = dbn.DBN(visible, data_processing.get_batch_list(), hidden, output, epochs,
                             binary_output=binary)
        if 'y' in raw_input("Pre-training? [y/n]"):
            deepbelief.run_pretraining()
        if 'y' in raw_input("Fine-tuning? [y/n]"):
            deepbelief.run_finetuning(load_from_serialization=True)
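# --- Usage sketch (assumption, not taken from the source): handle_sys_args expects the full
# sys.argv, where argv[1] selects the mode ('aut' for automatic, 'man' for manual) and the
# options start at argv[2], e.g.
#     python main.py man -V 2000 -H 500,250 -O 128 -E 50
# A hypothetical entry point for the script could look like this:
if __name__ == '__main__':
    handle_sys_args(sys.argv)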
def run_finetuning(self, epochs):
    """
    Run the train and test error evaluation and the backpropagation using conjugate gradient
    to optimize the weights in order to make the DBN perform better.
    @param epochs: The number of epochs to run the finetuning for.
    """
    self.train_error = {}
    self.test_error = {}
    dbn.save_dbn(self.weight_matrices_added_biases, self.train_error, self.test_error, self.fout())
    for epoch in range(epochs):
        self.fout('Backprop: Epoch ' + str(epoch + 1))
        result_queue = Manager().Queue()
        w_queue = Manager().Queue()
        # Start backprop process
        proc = Process(target=self.backpropagation,
                       args=(epoch, self.weight_matrices_added_biases, w_queue,))
        proc.start()
        # Start error eval processes
        evaluations = []
        evaluations.append((self.weight_matrices_added_biases, epoch, True,
                            data_processing.get_batch_list(training=True), result_queue, self.binary_output))
        evaluations.append((self.weight_matrices_added_biases, epoch, False,
                            data_processing.get_batch_list(training=False), result_queue, self.binary_output))
        p = Pool(cpu_count())
        p.map_async(error, evaluations)
        p.close()
        # Join multiple processes
        p.join()
        proc.join()
        self.weight_matrices_added_biases = w_queue.get()
        # Print and save error estimates
        for e in range(2):
            out = result_queue.get()
            if out[0]:
                self.train_error[epoch] = out[2]
                self.fout(out[1])
            else:
                self.test_error[epoch] = out[2]
                self.fout(out[1])
        # Save DBN
        dbn.save_dbn(self.weight_matrices_added_biases, self.train_error, self.test_error, self.fout())
def __init__(self, trainingdata=False):
    self.batches = data_processing.get_batch_list(trainingdata)  # Run data through neural network
    self.lower_dimension_data = []       # Output data from the dbn
    self.higher_dimensional_data = []    # Input data to the dbn
    self.path = 'output'
    if not os.path.exists(self.path):
        os.makedirs(self.path)
    weights = rsm.get_weights()
    visible_biases = rsm.get_visible_biases()
    hidden_biases = rsm.get_hidden_biases()
    # Generate class indices and class names
    if trainingdata:
        path = 'pickle/train/bag_of_words'
    else:
        path = 'pickle/test/bag_of_words'
    self.class_indices = self.__generate_class_indices__(path, self.batches)  # Class indices for all documents
    # Run through batches and generate high and low dimensional data lists
    for batch in range(len(self.batches)):
        print 'Batch ', batch + 1, ' of ', len(self.batches)
        d = data_processing.get_bag_of_words_matrix(self.batches[batch], trainingdata)
        self.higher_dimensional_data += list(d)
        self.lower_dimension_data += list(rsm.generate_output_data(d, weights, visible_biases, hidden_biases))
def generate_input_data_list(training=True):
    """
    Generate a list of all input data.
    @param training: If training is True, the input should be generated for training data and vice versa.
    """
    batches = data_processing.get_batch_list(training=training)
    input_data = []
    for batch in range(len(batches)):
        print 'Batch ', batch + 1, ' of ', len(batches)
        d = data_processing.get_bag_of_words_matrix(batches[batch], training=training)
        d = get_norm_x(d)
        input_data += list(d)
    return input_data
def generate_output_for_train_data(binary_output=False):
    """
    For all train data, generate the output and add to a list.
    @return: List of all output data.
    """
    weight_matrices_added_biases = get_weights()
    batches = data_processing.get_batch_list(training=True)
    output_data = []
    evaluations = []
    for batch in range(len(batches)):
        evaluations.append((batches[batch], weight_matrices_added_biases, binary_output))
    p = Pool(6)
    results = p.map(__generate_output_for_train_data_par, evaluations)
    p.close()
    p.join()
    for elem in results:
        output_data += elem
    return output_data
def run_simulation(train_path, test_path, epochs=50, attributes=2000,
                   evaluation_points=[1, 3, 7, 15, 31, 63], binary_output=True):
    # Define training and test set paths, skipping hidden files.
    paths = os.listdir(train_path)
    train_paths = []
    for p in paths:
        if p.startswith('.'):
            continue
        train_paths.append(os.path.join(train_path, p))
    print train_paths
    paths = os.listdir(test_path)
    test_paths = []
    for p in paths:
        if p.startswith('.'):
            continue
        test_paths.append(os.path.join(test_path, p))
    print test_paths

    # Stem documents
    #data_processing.stem_docs(train_paths)
    #data_processing.stem_docs(test_paths)

    # Generate bag of words matrices
    dat_proc_train = data_processing.DataProcessing(train_paths, words_count=attributes, trainingset_size=1.0,
                                                    acceptance_lst_path="input/acceptance_lst_stemmed.txt")
    #dat_proc_train.generate_bows()
    dat_proc_test = data_processing.DataProcessing(test_paths, trainingset_size=0.0,
                                                   trainingset_attributes=data_processing.get_attributes())
    dat_proc_test.generate_bows()

    # Train network
    deepbelief = dbn.DBN(attributes, data_processing.get_batch_list(), [500], 500, epochs,
                         binary_output=binary_output)
    deepbelief.run_pretraining()
    deepbelief.run_finetuning(load_from_serialization=True)

    # Evaluate network
    test = dbn_testing.DBNTesting(testing=True, binary_output=False)
    test.generate_accuracy_measurement_parallel(evaluation_points)
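# --- Example invocation (hypothetical paths, not taken from the source): run the simulation on
# prepared 'input/train' and 'input/test' directories and evaluate on fewer neighbor counts, e.g.
#     run_simulation('input/train', 'input/test', epochs=50, attributes=2000,
#                    evaluation_points=[1, 3, 7, 15], binary_output=True)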
def example2():
    '''
    Run simulation on the 20 Newsgroups dataset "20news-18828.tar.gz" from
    http://qwone.com/~jason/20Newsgroups/ through a network structure
    2000-500-250-125-10 (real valued outputs).
    '''
    ### ARCHIVING OUTPUT FILES ###
    # Archive output files so that this new simulation will not use the data already present.
    archive_outputs()

    ### DATA PREPARATION ###
    # Define training and test set paths.
    datapath = os.path.join('input', '20news-18828')
    # Generate a list of all the subfolders in the data path
    paths = os.listdir(datapath)
    datapaths = []
    for p in paths:
        if p.startswith('.'):  # check for hidden files
            continue
        datapaths.append(os.path.join(datapath, p))
    print datapaths
    # Stem documents and compute a .p file (serialized file).
    data_processing.stem_docs_parallel(datapaths)
    # Generate bag of words matrix for the training set, which is 0.7 (70%) of the data in the data paths.
    dat_proc_train = data_processing.DataProcessing(datapaths, words_count=2000, trainingset_size=1.0)
    dat_proc_train.generate_bows()
    # Generate bag of words matrix for the test set, which is 0.3 (30%) of the data in the data paths.
    dat_proc_test = data_processing.DataProcessing(datapaths, trainingset_size=0.0,
                                                   trainingset_attributes=data_processing.get_attributes())
    dat_proc_test.generate_bows()

    ### DBN TRAINING ###
    # Generate network 2000-500-250-125-10 (real valued outputs), training for 50 epochs.
    deepbelief = dbn.DBN(2000, data_processing.get_batch_list(), [500, 250, 125], 10, 50, binary_output=False)
    # Pretrain with a replicated softmax model at the bottom and restricted Boltzmann machines in the remaining layers.
    deepbelief.run_pretraining(learning_rate=0.01, weight_cost=0.0002, momentum=0.9, gibbs_steps=1)
    # Construct a deep autoencoder and finetune using backpropagation with conjugate gradient as optimization.
    deepbelief.run_finetuning(load_from_serialization=True)

    ### EVALUATION ###
    # Evaluate on the test set and output as real output units.
    eval = dbn_testing.DBNTesting(testing=True, binary_output=False)
    # Evaluate the output space on the 1, 3, 7, 15 nearest neighbors.
    eval.generate_accuracy_measurement_parallel([1, 3, 7, 15])

    ### VISUALISATION ###
    # Initialise visualisation. Only plot 6 categories so that the plot will not get too cluttered.
    v = visualise.Visualise(testing=True,
                            classes_to_visualise=["rec.sport.hockey", "comp.graphics", "sci.crypt",
                                                  "soc.religion.christian", "talk.politics.mideast",
                                                  "talk.politics.guns"])
    # Visualise the output data with 4 principal components.
    v.visualise_data_pca_2d(input_data=False, number_of_components=4)
    # Visualise the output data with 2 principal components.
    v.visualise_data_pca_2d_two_components(1, 2, input_data=False)
    # Visualise the output data in 3d with 3 principal components.
    v.visualise_data_pca_3d(1, 2, 3, input_data=False)