def handle_sys_args(sys_argv):
    #test = ["man","-V 2000", "-H 500,250", "-O 2","-E 1","-T 0.7"]
    #sys.argv[1:] = test
    shortoptions = "V:H:O:E:B"
    longoptions = ["visible=","hidden=","output=","epochs=","binary="]

    visible = 2000
    hiddentxt = "500,500"
    hidden = [500,500]
    output = 128
    epochs = 50
    train = 0.7
    path = 'input'

    if len(sys_argv) < 2:
        usage(visible, hiddentxt, output, epochs, train)
    try:
        opts, _ = getopt.getopt(sys_argv[2:], shortoptions, longoptions)
    except getopt.GetoptError:
        usage(visible, hiddentxt, output, epochs, train)


    # parse arguments
    for o, a in opts:
        if o in ('-V', '--visible'):
            visible = int(a)
        elif o in ('-H', '--hidden'):
            hidden = []
            hiddens = a.split(',')
            for h in hiddens:
                hidden.append(int(h))
        elif o in ('-O', '--output'):
            output = int(a)
        elif o in ('-E', '--epochs'):
            epochs = int(a)
        elif o in ('-B', '--binary'):
            # Interpret the option value as a boolean flag for binary output units.
            binary_out = a.lower() in ('1', 'true', 'y', 'yes')
        else:
            assert False, "unknown option"

    if 'y' in raw_input("Datapreparation? [y/n]"):
        check_path(path)
        training_path = raw_input("Enter training path as a relative path (i.e. input/train):")
        test_path = raw_input("Enter testing path as a relative path (i.e. input/test):")
        stem = raw_input("Stemming the documents is needed for the data processing to complete. Stem documents? [y/n]")
        run_data_processing(visible, training_path, test_path, stem='y' in stem)

    if 'aut' in sys_argv[1]:
        deepbelief = dbn.DBN(visible, data_processing.get_batch_list(), hidden, output, epochs, binary_output=binary_out)
        deepbelief.run_dbn()

    elif 'man' in sys_argv[1]:
        deepbelief = dbn.DBN(visible, data_processing.get_batch_list(), hidden, output, epochs, binary_output=binary_out)
        if 'y' in raw_input("Pre-training? [y/n]"):
            deepbelief.run_pretraining()
        if 'y' in raw_input("Fine-tuning? [y/n]"):
            deepbelief.run_finetuning(load_from_serialization=True)
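
# A minimal, hypothetical entry point for handle_sys_args (a sketch, not part of
# the original module). The first positional argument selects 'aut' (automatic)
# or 'man' (manual) mode; the remaining options are parsed by getopt, e.g.
#   python run.py man -V 2000 -H 500,250 -O 2 -E 1
# where 'run.py' is an illustrative file name and the option values mirror the
# test arguments commented out at the top of handle_sys_args.
import sys

if __name__ == '__main__':
    handle_sys_args(sys.argv)
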
    def run_finetuning(self, epochs):
        """
        Run the train and test error evaluation and the backpropagation using conjugate gradient to optimize the
        weights in order to make the DBN perform better.

        @param epochs: The number of epochs to run the finetuning for.
        """
        self.train_error = {}
        self.test_error = {}
        dbn.save_dbn(self.weight_matrices_added_biases, self.train_error,
                     self.test_error, self.fout())
        for epoch in range(epochs):
            self.fout('Backprop: Epoch ' + str(epoch + 1))
            result_queue = Manager().Queue()
            w_queue = Manager().Queue()

            # Start backprop process
            proc = Process(target=self.backpropagation,
                           args=(
                               epoch,
                               self.weight_matrices_added_biases,
                               w_queue,
                           ))
            proc.start()
            # Start error eval processes
            evaluations = []
            evaluations.append((self.weight_matrices_added_biases, epoch, True,
                                data_processing.get_batch_list(training=True),
                                result_queue, self.binary_output))
            evaluations.append(
                (self.weight_matrices_added_biases, epoch, False,
                 data_processing.get_batch_list(training=False), result_queue,
                 self.binary_output))
            p = Pool(cpu_count())
            p.map_async(error, evaluations)
            p.close()

            # Join multiple processes
            p.join()
            proc.join()
            self.weight_matrices_added_biases = w_queue.get()

            # Print and save error estimates
            for e in range(2):
                out = result_queue.get()
                if out[0]:
                    self.train_error[epoch] = out[2]
                    self.fout(out[1])
                else:
                    self.test_error[epoch] = out[2]
                    self.fout(out[1])

            # Save DBN
            dbn.save_dbn(self.weight_matrices_added_biases, self.train_error,
                         self.test_error, self.fout())
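
# The `error` callable passed to Pool.map_async above is not shown in this
# listing. Below is a hedged sketch of the interface it has to satisfy, inferred
# from the tuples appended to `evaluations` and from how `result_queue` is
# drained afterwards (out[0]: training flag, out[1]: log message, out[2]: error
# value); the actual error computation is omitted and marked as a placeholder.
def error(args):
    weights, epoch, training, batch_list, result_queue, binary_output = args
    # Run the given batches through the network defined by `weights` and
    # accumulate a train/test error here (implementation not shown).
    err = 0.0  # placeholder value
    label = 'train' if training else 'test'
    msg = 'Epoch %d %s error: %f' % (epoch + 1, label, err)
    result_queue.put((training, msg, err))
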
    def __init__(self, trainingdata=False):
        self.batches = data_processing.get_batch_list(trainingdata)

        # Run data through neural network
        self.lower_dimension_data = []  # Output data from the dbn
        self.higher_dimensional_data = []  # Input data to the dbn

        self.path = 'output'
        if not os.path.exists(self.path):
            os.makedirs(self.path)

        weights = rsm.get_weights()
        visible_biases = rsm.get_visible_biases()
        hidden_biases = rsm.get_hidden_biases()

        # Generate class indices and class names
        if trainingdata:
            path = 'pickle/train/bag_of_words'
        else:
            path = 'pickle/test/bag_of_words'

        self.class_indices = self.__generate_class_indices__(path, self.batches)  # Class indices for all documents

        # Run through batches and generate high and low dimensional data lists
        for batch in range(len(self.batches)):
            print 'Batch ', batch + 1, ' of ', len(self.batches)
            d = data_processing.get_bag_of_words_matrix(self.batches[batch], trainingdata)
            self.higher_dimensional_data += list(d)
            self.lower_dimension_data += list(rsm.generate_output_data(d, weights, visible_biases, hidden_biases))
def generate_input_data_list(training=True):
    """
    Generate a list of all input data.

    @param training: If training is True, the input should be generated for training data and vice versa.
    """
    batches = data_processing.get_batch_list(training=training)
    input_data = []

    for batch in range(len(batches)):
        print 'Batch ', batch + 1, ' of ', len(batches)
        d = data_processing.get_bag_of_words_matrix(batches[batch],
                                                    training=training)
        d = get_norm_x(d)
        input_data += list(d)

    return input_data
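
# `get_norm_x` is called above but not included in this listing. The sketch
# below shows one plausible implementation, assuming it rescales each
# bag-of-words row into a probability vector; this normalisation is an
# assumption for illustration, not the project's actual code.
import numpy as np

def get_norm_x(x):
    x = np.asarray(x, dtype=float)
    row_sums = x.sum(axis=1, keepdims=True)
    row_sums[row_sums == 0] = 1.0  # avoid division by zero for empty documents
    return x / row_sums
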
def generate_output_for_train_data(binary_output=False):
    """
    For all train data, generate the output and add to a list.

    @return: List of all output data.
    """
    weight_matrices_added_biases = get_weights()
    batches = data_processing.get_batch_list(training=True)
    output_data = []

    evaluations = []
    for batch in range(len(batches)):
        evaluations.append(
            (batches[batch], weight_matrices_added_biases, binary_output))
    p = Pool(6)
    results = p.map(__generate_output_for_train_data_par, evaluations)
    p.close()
    p.join()
    for elem in results:
        output_data += elem
    return output_data
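
# Hedged usage sketch (not part of the original module): combining the two
# generators above to pair the high-dimensional inputs with the codes produced
# by the trained network. It assumes the pickled batches and the serialized
# weight matrices already exist on disk; `example_usage` is a hypothetical name.
def example_usage():
    inputs = generate_input_data_list(training=True)
    codes = generate_output_for_train_data(binary_output=True)
    print len(inputs), 'documents,', len(codes), 'output codes'
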
def run_simulation(train_path, test_path, epochs=50, attributes=2000, evaluation_points=[1, 3, 7, 15, 31, 63], binary_output=True):
    """
    Train a DBN on the documents under train_path and evaluate the output space on the documents under test_path.

    @param epochs: The number of epochs to run the finetuning for.
    @param attributes: The number of word attributes (visible units) in the bag-of-words representation.
    @param evaluation_points: The numbers of nearest neighbours at which the accuracy is measured.
    @param binary_output: Whether the DBN output units are binary rather than real valued.
    """
    # Define training and test set paths.
    paths = os.listdir(train_path)
    train_paths = []
    for p in paths:
        if p.startswith('.'):
            continue
        train_paths.append(os.path.join(train_path, p))

    print train_paths

    paths = os.listdir(test_path)
    test_paths = []
    for p in paths:
        if p.startswith('.'):
            continue
        test_paths.append(os.path.join(test_path, p))
    print test_paths

    # Stem documents
    #data_processing.stem_docs(train_paths)
    #data_processing.stem_docs(test_paths)

    # Generate bag-of-words matrices
    dat_proc_train = data_processing.DataProcessing(train_paths, words_count=attributes, trainingset_size=1.0,
                                                    acceptance_lst_path="input/acceptance_lst_stemmed.txt")
    #dat_proc_train.generate_bows()
    dat_proc_test = data_processing.DataProcessing(test_paths, trainingset_size=0.0,
                                                   trainingset_attributes=data_processing.get_attributes())
    dat_proc_test.generate_bows()

    # Train network
    deepbelief = dbn.DBN(attributes, data_processing.get_batch_list(), [500], 500, epochs, binary_output=binary_output)
    deepbelief.run_pretraining()
    deepbelief.run_finetuning(load_from_serialization=True)

    # Evaluate network
    test = dbn_testing.DBNTesting(testing=True, binary_output=False)
    test.generate_accuracy_measurement_parallel(evaluation_points)
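
# Hedged usage sketch for run_simulation (not from the original source). The
# 'input/train' and 'input/test' directories mirror the example paths prompted
# for in handle_sys_args above; each is expected to contain one sub-directory
# per document category. `run_example_simulation` is a hypothetical name.
def run_example_simulation():
    run_simulation('input/train', 'input/test',
                   epochs=50, attributes=2000,
                   evaluation_points=[1, 3, 7, 15], binary_output=True)
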
def example2():
    '''
    Run simulation on the 20 Newsgroups dataset "20news-18828.tar.gz" from http://qwone.com/~jason/20Newsgroups/
    through a network structure 2000-500-250-125-10 (real valued outputs).
    '''

    ### Archiving output files ###
    # Archive output files so that the new simulation for this example will not use the data already present.
    archive_outputs()

    ### DATA PREPARATION ###

    # Define training and test set paths.
    datapath = os.path.join('input', '20news-18828')

    # Generate list of all the subfolders in the data path
    paths = os.listdir(datapath)
    datapaths = []
    for p in paths:
        if p.startswith('.'):  # check for hidden files
            continue
        datapaths.append(os.path.join(datapath, p))
    print datapaths

    # Stem documents and compute a .p file (serialized file).
    data_processing.stem_docs_parallel(datapaths)

    # Generate the bag-of-words matrices for the training set.
    dat_proc_train = data_processing.DataProcessing(datapaths, words_count=2000, trainingset_size=1.0)
    dat_proc_train.generate_bows()
    # Generate the bag-of-words matrices for the test set, reusing the attributes selected for the training set.
    dat_proc_test = data_processing.DataProcessing(datapaths, trainingset_size=0.0,
                                                   trainingset_attributes=data_processing.get_attributes())
    dat_proc_test.generate_bows()

    ### DBN TRAINING ###

    # Generate network 2000-500-250-125-10 (real valued outputs), training 50 epochs.
    deepbelief = dbn.DBN(2000, data_processing.get_batch_list(), [500, 250, 125], 10, 50, binary_output=False)
    # Pretrain with a replicated softmax model at the bottom and restricted Boltzmann machines in the remaining layers.
    deepbelief.run_pretraining(learning_rate=0.01, weight_cost=0.0002, momentum=0.9, gibbs_steps=1)
    # Construct deep autoencoder and finetune using backpropagation with conjugate gradient as optimization.
    deepbelief.run_finetuning(load_from_serialization=True)

    ### EVALUATION ###

    # Evaluate on the test set and output as real valued output units.
    evaluation = dbn_testing.DBNTesting(testing=True, binary_output=False)
    # Evaluate the output space on the 1, 3, 7, 15 nearest neighbors.
    evaluation.generate_accuracy_measurement_parallel([1, 3, 7, 15])

    ### VISUALISATION ###

    # Initialise the visualisation. Only plot 6 categories so that the plot will not get too cluttered.
    v = visualise.Visualise(testing=True, classes_to_visualise=["rec.sport.hockey", "comp.graphics", "sci.crypt",
                                                                "soc.religion.christian", "talk.politics.mideast",
                                                                "talk.politics.guns"])
    # Visualise the output data with 4 principal components.
    v.visualise_data_pca_2d(input_data=False, number_of_components=4)
    # Visualise the output data with 2 principal components.
    v.visualise_data_pca_2d_two_components(1, 2, input_data=False)
    # Visualise the output data in 3d with 3 principal components.
    v.visualise_data_pca_3d(1, 2, 3, input_data=False)