Example 1
    def load_data(self, dataset):
        ''' Loads the dataset with the path given by the dataset parameter. If no
        such dataset is available locally, loads the MNIST dataset over the 
        network.
        
        The data is assumed to be in pickled form with three elements:
        
        1) Training
        2) Validation
        3) Test
            
        :type dataset:  string
        :param dataset: the path to the dataset
        '''
        ###############
        # LOAD DATA
        ###############
            
        # If dataset is not present locally, download MNIST data from the network. 
        data_dir, data_file = os.path.split(dataset)
        if data_dir == "" and not os.path.isfile(dataset):
            # Check if dataset is in the data directory.
            new_path = os.path.join(
                os.path.split(__file__)[0],
                "..",
                "data",
                dataset
            )
            if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz':
                dataset = new_path
                    
        if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz':
            import urllib
            origin = (
                'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'
            )
            print 'Downloading data from %s' % origin
            urllib.urlretrieve(origin, dataset)
            
        # Load the dataset
        f = gzip.open(dataset, 'rb')
        training_set, validation_set, test_set = cPickle.load(f)
        f.close()
                        
        test_set_x, test_set_y = theanoutil.shared_dataset(test_set)
        validation_set_x, validation_set_y = theanoutil.shared_dataset(validation_set)
        training_set_x, training_set_y = theanoutil.shared_dataset(training_set)

        if self.debug:         
            print "TYPE of test_set_x =", type(test_set_x)        
            print "TYPE of test_set=", type(test_set), "  SIZE of test_set=", len(test_set)
            print "TYPE of test_set[0]=", type(test_set[0]), "  SHAPE of test_set[0]=", test_set[0].shape
            print "TYPE of test_set[1]=", type(test_set[1]), "  SHAPE of test_set[1]=", test_set[1].shape
            print "VALUE of training_set[0,0,0]=", training_set[0][0,0]
            print "VALUE of training_set[1,0]=", training_set[1][0]
            
        rval = [(training_set_x, training_set_y), (validation_set_x, validation_set_y), (test_set_x, test_set_y)]
        return rval
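
All three examples convert the raw numpy arrays into Theano shared variables through theanoutil.shared_dataset, which is not shown here. A minimal sketch of what such a helper typically looks like, assuming it follows the standard Theano tutorial pattern (the exact contents of theanoutil are an assumption):

import numpy
import theano
import theano.tensor as T

def shared_dataset(data_xy, borrow=True):
    """Copy an (inputs, labels) pair into Theano shared variables."""
    data_x, data_y = data_xy
    # Shared variables are stored as floatX so they can live on the GPU;
    # the labels are cast back to int32 afterwards so they can be used as
    # class indices.
    shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX),
                             borrow=borrow)
    shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX),
                             borrow=borrow)
    return shared_x, T.cast(shared_y, 'int32')

Keeping the data in shared variables lets later Theano functions slice minibatches on the device instead of copying arrays into each call.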
Example 2
    def load_data(self, source):
        """Load the data from a directory with a collection of source files,
        one file for each kind of protein. 
        
        Returns an array of pairs in the form:
        
        [(train_set_in, train_set_out), (validation_set_in, validation_set_out), (test_set_in, test_set_out)]

        :type source:   String
        :param source:  The directory where the source files are located.
        """
        src_dir = source  # Directory containing one .faa file per protein class.
        raw_data = list()
        unsupporteds = list()
        for i in range(0, len(self.names)):
            num_in_file = 0
            if self.debug:
                print (src_dir + self.names[i] + ".faa")
            handle = open(src_dir + self.names[i] + ".faa", "rU")  # Open one source file.
            for record in SeqIO.parse(handle, "fasta"):
                num_in_file += 1
                try:
                    # print "      " + record.id
                    feature_vector = self.feature_extractor.extract_features(record)
                    # Now we have to augment the feature vector with the output
                    # vector. So we:
                    #   1) Make a new array a bit longer than the feature vector,
                    #   2) Copy the feature vector into the first cells of the new array,
                    #   3) Find the appropriate cell in the tail of the new array
                    #      and set that one equal to 1.
                    prepared_data_record = numpy.zeros(len(feature_vector) + self.output_width)
                    for col in range(0, len(feature_vector)):  # This surely could be done more efficiently.
                        prepared_data_record[col] = feature_vector[col]  # Doesn't matter for now.
                    prepared_data_record[
                        len(feature_vector) + i
                    ] = 1  # The class of the protein is taken from the order of the files in the list "names"
                    raw_data.append(prepared_data_record)
                except KeyError:
                    if self.debug:
                        print "   Unsupported sequence: " + record.id + "   " + str(record.annotations)
                    unsupporteds.append(record)
            handle.close()
            if self.debug:
                print "Total in file " + self.names[i] + " = " + str(num_in_file)

        # Now we are done reading all of the data in. In debug mode, print some
        # overall summary information.
        if self.debug:
            print "Supported Sequences = " + str(len(raw_data))
            print "Unsupported Sequences = " + str(len(unsupporteds))

        num_examples = len(raw_data)

        # The labeled data is not randomly ordered: it is sorted by class.
        # Because the train/validation/test split below is sequential, we must
        # shuffle first or the training set would contain only the first classes.
        if self.debug:
            print "Shuffling data to randomize for training"
        shuffle = self.rand_perm(num_examples)

        data = numpy.ndarray((num_examples, self.input_width + self.output_width), float)
        for n in range(0, num_examples):
            for w in range(0, self.input_width + self.output_width):
                s = raw_data[shuffle[n]][w]
                data[n, w] = float(s)
        if self.debug:
            print "Finished shuffling data"
            print "Processing data to cull outliers"
        data = self.preprocess(self.cull(data))
        num_examples = len(data)
        print "Data shape = ", data.shape, "   num_examples=", num_examples
        inputs = numpy.array(data)[:, 0 : self.input_width]
        outputs_full = numpy.array(data)[:, self.input_width : self.input_width + self.output_width]
        if self.debug:
            print "Finished culling outliers"
            print inputs.shape
            print outputs_full.shape
        outputs = numpy.ndarray((num_examples,), int)
        for n in range(0, num_examples):
            found_class = False
            for w in range(0, self.output_width):
                if outputs_full[n, w] > 0.5:
                    outputs[n] = w
                    found_class = True
                    break
        num_training_cases = self.num_training(num_examples)
        num_validation_cases = self.num_validation(num_examples)
        num_test_cases = self.num_test(num_examples)

        print num_training_cases, " ", num_validation_cases, " ", num_test_cases
        training_set = (inputs[0:num_training_cases, :], outputs[0:num_training_cases])
        validation_set = (
            inputs[num_training_cases : num_training_cases + num_validation_cases, :],
            outputs[num_training_cases : num_training_cases + num_validation_cases],
        )
        test_set = (
            inputs[num_training_cases + num_validation_cases :, :],
            outputs[num_training_cases + num_validation_cases :],
        )
        training_set_x, training_set_y = theanoutil.shared_dataset(training_set)
        validation_set_x, validation_set_y = theanoutil.shared_dataset(validation_set)
        test_set_x, test_set_y = theanoutil.shared_dataset(test_set)

        if self.debug:
            print "TYPE of test_set_x =", type(test_set_x)
            print "TYPE of test_set=", type(test_set), "  SIZE of test_set=", len(test_set)
            print "TYPE of test_set[0]=", type(test_set[0]), "  SHAPE of test_set[0]=", test_set[0].shape
            print "TYPE of test_set[1]=", type(test_set[1]), "  SHAPE of test_set[1]=", test_set[1].shape
            print "VALUE of training_set[0,0,0]=", training_set[0][0, 0]
            print "VALUE of training_set[1,0]=", training_set[1][0], "   test_set[1,0]=", test_set[1][0]

        rval = [(training_set_x, training_set_y), (validation_set_x, validation_set_y), (test_set_x, test_set_y)]
        return rval
Example 3
    def load_data(self, source):
        f = open(source, "r")
        string_data = [line.split() for line in f]
        f.close()
        
        num_examples = len(string_data)
        if self.debug:
            print len(string_data)
            print len(string_data[1])
            print string_data[0][0]
        
        # The labeled data is not randomly ordered: it is sorted by class.
        # Because the train/validation/test split below is sequential, we must
        # shuffle first or the training set would contain only the first classes.
        shuffle = self.rand_perm(num_examples)

        data = numpy.ndarray((num_examples, self.input_width+self.output_width), float)
        for n in range(0, num_examples):
            for w in range(0,self.input_width+self.output_width):
                s = string_data[shuffle[n]][w]
                data[n,w] = float(s)

        data = self.preprocess(self.cull(data))
        num_examples = len(data)
        print "Data shape = ", data.shape, "   num_examples=", num_examples
        inputs = numpy.array(data)[:, 0:self.input_width]
        outputs_full = numpy.array(data)[:, self.input_width:self.input_width+self.output_width]
        
        if self.debug:
            print inputs.shape
            print outputs_full.shape
        outputs = numpy.ndarray((num_examples,),int)       
        for n in range(0, num_examples):
            found_class = False
            for w in range(0, self.output_width):
                if outputs_full[n,w] > 0.5:
                    outputs[n] = w
                    found_class = True
                    break
        num_training_cases = self.num_training(num_examples)
        num_validation_cases = self.num_validation(num_examples)
        num_test_cases = self.num_test(num_examples)
        
        print num_training_cases, " ", num_validation_cases, " ", num_test_cases
        training_set = (inputs[0:num_training_cases,:], outputs[0:num_training_cases])
        validation_set = (inputs[num_training_cases:num_training_cases+num_validation_cases,:], outputs[num_training_cases:num_training_cases+num_validation_cases])
        test_set = (inputs[num_training_cases+num_validation_cases:,:], outputs[num_training_cases+num_validation_cases:])
        training_set_x, training_set_y = theanoutil.shared_dataset(training_set)
        validation_set_x, validation_set_y = theanoutil.shared_dataset(validation_set)
        test_set_x, test_set_y = theanoutil.shared_dataset(test_set)
        
        if self.debug:
            print "TYPE of test_set_x =", type(test_set_x)        
            print "TYPE of test_set=", type(test_set), "  SIZE of test_set=", len(test_set)
            print "TYPE of test_set[0]=", type(test_set[0]), "  SHAPE of test_set[0]=", test_set[0].shape        
            print "TYPE of test_set[1]=", type(test_set[1]), "  SHAPE of test_set[1]=", test_set[1].shape
            print "VALUE of training_set[0,0,0]=", training_set[0][0,0]
            print "VALUE of training_set[1,0]=", training_set[1][0], "   test_set[1,0]=",test_set[1][0]
        
        rval = [(training_set_x, training_set_y), (validation_set_x, validation_set_y), (test_set_x, test_set_y)]
        return rval
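
Every variant of load_data returns the same structure: a list of three (x, y) pairs of Theano shared variables for the training, validation and test splits. A hypothetical caller (the loader instance, dataset path and batch size below are assumptions) would typically slice minibatches out of those shared variables with the usual Theano givens pattern:

import theano
import theano.tensor as T

datasets = loader.load_data('mnist.pkl.gz')   # loader: instance of a class above; path as in Example 1
(train_x, train_y), (valid_x, valid_y), (test_x, test_y) = datasets

batch_size = 20
index = T.lscalar('index')   # minibatch index
x = T.matrix('x')            # one row of features per example
y = T.ivector('y')           # integer class labels

# The Theano function reads each minibatch directly out of the shared data
# via `givens`, so only the integer index is passed per call.
get_batch_shapes = theano.function(
    [index],
    [x.shape[0], y.shape[0]],
    givens={
        x: train_x[index * batch_size:(index + 1) * batch_size],
        y: train_y[index * batch_size:(index + 1) * batch_size],
    },
)
print get_batch_shapes(0)   # both values equal batch_size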