Exemple #1
0
	def __init__(self, dataset_path, batch_size=500, instance_weights_path=None):
		
		L.info("Initializing dataset from: " + os.path.abspath(dataset_path))
		
		# Reading parameters from the mmap file
		fp = np.memmap(dataset_path, dtype='int32', mode='r')
		self.num_samples = fp[0]
		self.ngram = fp[1]
		fp = fp.reshape((self.num_samples + 3, self.ngram))
		self.vocab_size = fp[1,0]
		self.num_classes = fp[2,0]

		# Setting minibatch size and number of mini batches
		self.batch_size = batch_size
		self.num_batches = int(M.ceil(self.num_samples / self.batch_size))
		
		# Reading the matrix of samples
		x = fp[3:,0:self.ngram - 1]			# Reading the context indices
		y = fp[3:,self.ngram - 1]			# Reading the output word index
		self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
		self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')
		
		self.is_weighted = False
		if instance_weights_path:
			instance_weights = np.loadtxt(instance_weights_path)
			U.xassert(instance_weights.shape == (self.num_samples,), "The number of lines in weights file must be the same as the number of samples.")
			self.shared_w = T.cast(theano.shared(instance_weights, borrow=True), theano.config.floatX)
			self.is_weighted = True
		
		L.info('  #samples: %s, ngram size: %s, vocab size: %s, #classes: %s, batch size: %s, #batches: %s' % (
				U.red(self.num_samples), U.red(self.ngram), U.red(self.vocab_size), U.red(self.num_classes), U.red(self.batch_size), U.red(self.num_batches)
			)
		)
Exemple #2
0
    def __init__(self,
                 rng,
                 input,
                 vocab_size,
                 emb_dim,
                 emb_matrix=None,
                 concat=True,
                 emb_path=None,
                 vocab_path=None,
                 add_weights=False):

        L.info("Lookup Table layer, #words: %s, #dims: %s" %
               (U.red(vocab_size), U.red(emb_dim)))

        self.input = input

        self.emb_matrix = emb_matrix

        if self.emb_matrix is None:
            self.emb_matrix = numpy.asarray(
                rng.uniform(
                    low=-0.01,  #low=-1,
                    high=0.01,  #high=1,
                    size=(vocab_size, emb_dim)),
                dtype=theano.config.floatX)

        if emb_path:
            U.xassert(vocab_path,
                      'When emb_path is given, vocab must be given too.')
            self.initialize(emb_path, vocab_path)

        self.embeddings = theano.shared(value=self.emb_matrix,
                                        name='embeddings',
                                        borrow=True)

        if add_weights:
            weights_vec = numpy.ones(vocab_size, dtype=theano.config.floatX)
            self.weights = theano.shared(value=weights_vec,
                                         name='word_weights',
                                         borrow=True)

            # Check if the speed can be improved
            self.output = (self.weights.dimshuffle(0, 'x') *
                           self.embeddings)[input]
            #self.output = self.weights.dimshuffle(0, 'x')[input] * self.embeddings[input]
            #self.output = self.weights[input].dimshuffle(0, 'x') * self.embeddings[input]

            self.params = [self.embeddings, self.weights]
        else:
            self.output = self.embeddings[input]
            self.params = [self.embeddings]

        if concat:
            self.output = self.output.reshape(
                (input.shape[0], emb_dim * input.shape[1]))
Exemple #3
0
    def __init__(self,
                 rng,
                 input,
                 n_in,
                 n_out,
                 W_values=None,
                 init_method=0,
                 b_values=None,
                 no_bias=False,
                 suffix=None):

        L.info("Linear layer, #inputs: %s, #outputs: %s" %
               (U.red(n_in), U.red(n_out)))

        self.input = input

        if W_values is None:
            if init_method == 0:  # Useful for Relu activation
                high = 0.01
            elif init_method == 1:  # Useful for Tanh activation
                high = numpy.sqrt(6. / (n_in + n_out))
            elif init_method == 2:  # Useful for Sigmoid activation
                high = 4 * numpy.sqrt(6. / (n_in + n_out))
            else:
                L.error('Invalid initialization method')
            W_values = numpy.asarray(rng.uniform(low=-high,
                                                 high=high,
                                                 size=(n_in, n_out)),
                                     dtype=theano.config.floatX)

        if b_values is None and not no_bias:
            b_values = numpy.zeros((n_out, ), dtype=theano.config.floatX)

        W_name = 'W'
        if suffix is not None:
            W_name += '.' + str(suffix)

        W = theano.shared(value=W_values, name=W_name, borrow=True)
        self.W = W

        if no_bias:
            self.output = T.dot(input, self.W)
            self.params = [self.W]
        else:
            b_name = 'b'
            if suffix is not None:
                b_name += '.' + str(suffix)
            b = theano.shared(value=b_values, name=b_name, borrow=True)
            self.b = b
            self.output = T.dot(input, self.W) + self.b
            self.params = [self.W, self.b]
Exemple #4
0
	def __init__(self, dataset_path, batch_size=500, instance_weights_path=None):

		L.info("Initializing dataset from: " + os.path.abspath(dataset_path))
		# Reading parameters from the mmap file
                print K.get_platform()
		fp = np.memmap(dataset_path, dtype='int32', mode='r')
		self.num_samples = fp[0]
		self.ngram = fp[1]
		fp = fp.reshape((self.num_samples + 3, self.ngram))
		self.vocab_size = fp[1,0]
		self.num_classes = fp[2,0]

		# Setting minibatch size and number of mini batches
		self.batch_size = batch_size
		self.num_batches = int(M.ceil(self.num_samples / self.batch_size))

		# Reading the matrix of samples
		x = fp[3:,0:self.ngram - 1]			# Reading the context indices
		y = fp[3:,self.ngram - 1]			# Reading the output word index
		#self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
		#self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')
                # What is T.cast :))
                L.info("Initialize a simple variable")
                val = np.random.random((4, 2))
                tmp = K.variable(val)

                L.info("Initialize a real variable")
                tmp = K.variable(x)
                L.info("Initialize two casted variables")
                self.shared_x = K.cast(K.variable(x), 'int32')
                self.shared_y = K.cast(K.variable(y), 'int32')
                L.info("Create two variable without borrow=True")
		self.is_weighted = False
		if instance_weights_path:
			instance_weights = np.loadtxt(instance_weights_path)
			U.xassert(instance_weights.shape == (self.num_samples,), "The number of lines in weights file must be the same as the number of samples.")
			# what is borrow=True
                        # self.shared_w = T.cast(theano.shared(instance_weights, borrow=True), theano.config.floatX)
                        self.shared_w = K.cast(K.variable(instance_weights), K._FLOATX)

			self.is_weighted = True

		L.info('  #samples: %s, ngram size: %s, vocab size: %s, #classes: %s, batch size: %s, #batches: %s' % (
				U.red(self.num_samples), U.red(self.ngram), U.red(self.vocab_size), U.red(self.num_classes), U.red(self.batch_size), U.red(self.num_batches)
			)
		)
Exemple #5
0
	def __init__(self, rng, input, n_in, n_out, W_values=None, init_method=0, b_values=None, no_bias=False, suffix=None):
		
		L.info("Linear layer, #inputs: %s, #outputs: %s" % (U.red(n_in), U.red(n_out)))

		self.input = input

		if W_values is None:
			if init_method == 0:	# Useful for Relu activation
				high = 0.01
			elif init_method == 1:	# Useful for Tanh activation
				high = numpy.sqrt(6. / (n_in + n_out))
			elif init_method == 2:	# Useful for Sigmoid activation
				high = 4 * numpy.sqrt(6. / (n_in + n_out))
			else:
				L.error('Invalid initialization method')
			W_values = numpy.asarray(
				rng.uniform(
					low=-high,
					high=high,
					size=(n_in, n_out)
				),
				dtype=theano.config.floatX
			)

		if b_values is None and not no_bias:
			b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
		
		W_name = 'W'
		if suffix is not None:
			W_name += '.' + str(suffix)
		
		W = theano.shared(value=W_values, name=W_name, borrow=True)
		self.W = W

		if no_bias:
			self.output = T.dot(input, self.W)
			self.params = [self.W]
		else:
			b_name = 'b'
			if suffix is not None:
				b_name += '.' + str(suffix)
			b = theano.shared(value=b_values, name=b_name, borrow=True)
			self.b = b
			self.output = T.dot(input, self.W) + self.b
			self.params = [self.W, self.b]
Exemple #6
0
	def __init__(self, rng, input, vocab_size, emb_dim, emb_matrix=None, concat=True, emb_path=None, vocab_path=None, add_weights=False, suffix=None, high=0.01):
		
		L.info("Lookup Table layer, #words: %s, #dims: %s" % (U.red(vocab_size), U.red(emb_dim)))

		self.input = input
		
		self.emb_matrix = emb_matrix

		if self.emb_matrix is None:
			self.emb_matrix = numpy.asarray(
				rng.uniform(
					low=-high, #low=-1,
					high=high, #high=1,
					size=(vocab_size, emb_dim)
				),
				dtype=theano.config.floatX
			)
		
		if emb_path:
			U.xassert(vocab_path, 'When emb_path is given, vocab must be given too.')
			self.initialize(emb_path, vocab_path)
		

		embeddings_name = 'embeddings'
		if suffix is not None:
			embeddings_name += '.' + str(suffix)
		
		self.embeddings = theano.shared(value=self.emb_matrix, name=embeddings_name, borrow=True)
		
		if add_weights:
			weights_vec = numpy.ones(vocab_size, dtype=theano.config.floatX)
			self.weights = theano.shared(value=weights_vec, name='word_weights', borrow=True)
			
			# Check if the speed can be improved
			self.output = (self.weights.dimshuffle(0, 'x') * self.embeddings)[input]
			#self.output = self.weights.dimshuffle(0, 'x')[input] * self.embeddings[input]
			#self.output = self.weights[input].dimshuffle(0, 'x') * self.embeddings[input]
			
			self.params = [self.embeddings, self.weights]
		else:
			self.output = self.embeddings[input]
			self.params = [self.embeddings]
		
		if concat:
			self.output = self.output.reshape((input.shape[0], emb_dim * input.shape[1]))
Exemple #7
0
	def __init__(self, rng, input, vocab_size, emb_dim, emb_matrix=None, concat=True, emb_path=None, vocab_path=None, add_weights=False):

		L.info("Lookup Table layer, #words: %s, #dims: %s" % (U.red(vocab_size), U.red(emb_dim)))

		self.input = input
		L.info("Input " + str(input))
                L.info("Add weightes " + str(add_weights))
                self.emb_matrix = emb_matrix

		if self.emb_matrix is None:
			self.emb_matrix = numpy.asarray(
				rng.uniform(
					low=-0.01, #low=-1,
					high=0.01, #high=1,
					size=(vocab_size, emb_dim)
				),
				dtype=K._FLOATX
			)

		if emb_path:
			U.xassert(vocab_path, 'When emb_path is given, vocab must be given too.')
			self.initialize(emb_path, vocab_path)

		#self.embeddings = theano.shared(value=self.emb_matrix, name='embeddings', borrow=True)
		self.embeddings = K.variable(self.emb_matrix, name='embeddings')


		if add_weights:
			weights_vec = numpy.ones(vocab_size, dtype=K._FLOATX)
			#self.weights = theano.shared(value=weights_vec, name='word_weights', borrow=True)
			self.weights = K.variable(weights_vec, name='word_weights')

			# Check if the speed can be improved
			self.output = (self.weights.dimshuffle(0, 'x') * self.embeddings)[input]
			#self.output = self.weights.dimshuffle(0, 'x')[input] * self.embeddings[input]
			#self.output = self.weights[input].dimshuffle(0, 'x') * self.embeddings[input]

			self.params = [self.embeddings, self.weights]
		else:
			self.output = self.embeddings[input]
			self.params = [self.embeddings]

		if concat:
			self.output = self.output.reshape((input.shape[0], emb_dim * input.shape[1]))
Exemple #8
0
    def __init__(self,
                 dataset_path,
                 batch_size=500,
                 instance_weights_path=None):

        L.info("Initializing dataset from: " + os.path.abspath(dataset_path))
        # Reading parameters from the mmap file
        print K.get_platform()
        fp = np.memmap(dataset_path, dtype='int32', mode='r')
        self.num_samples = fp[0]
        self.ngram = fp[1]
        fp = fp.reshape((self.num_samples + 3, self.ngram))
        self.vocab_size = fp[1, 0]
        self.num_classes = fp[2, 0]

        # Setting minibatch size and number of mini batches
        self.batch_size = batch_size
        self.num_batches = int(M.ceil(self.num_samples / self.batch_size))

        # Reading the matrix of samples
        x = fp[3:, 0:self.ngram - 1]  # Reading the context indices
        y = fp[3:, self.ngram - 1]  # Reading the output word index
        #self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
        #self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')
        # What is T.cast :))
        L.info("Initialize a simple variable")
        val = np.random.random((4, 2))
        tmp = K.variable(val)

        L.info("Initialize a real variable")
        tmp = K.variable(x)
        L.info("Initialize two casted variables")
        self.shared_x = K.cast(K.variable(x), 'int32')
        self.shared_y = K.cast(K.variable(y), 'int32')
        L.info("Create two variable without borrow=True")
        self.is_weighted = False
        if instance_weights_path:
            instance_weights = np.loadtxt(instance_weights_path)
            U.xassert(
                instance_weights.shape == (self.num_samples, ),
                "The number of lines in weights file must be the same as the number of samples."
            )
            # what is borrow=True
            # self.shared_w = T.cast(theano.shared(instance_weights, borrow=True), theano.config.floatX)
            self.shared_w = K.cast(K.variable(instance_weights), K._FLOATX)

            self.is_weighted = True

        L.info(
            '  #samples: %s, ngram size: %s, vocab size: %s, #classes: %s, batch size: %s, #batches: %s'
            % (U.red(self.num_samples), U.red(
                self.ngram), U.red(self.vocab_size), U.red(self.num_classes),
               U.red(self.batch_size), U.red(self.num_batches)))
Exemple #9
0
    def __init__(self,
                 dataset_path,
                 batch_size=500,
                 instance_weights_path=None):

        L.info("Initializing dataset from: " + os.path.abspath(dataset_path))

        # Reading parameters from the mmap file
        fp = np.memmap(dataset_path, dtype='int32', mode='r')
        self.num_samples = fp[0]
        self.ngram = fp[1]
        fp = fp.reshape((self.num_samples + 3, self.ngram))
        self.vocab_size = fp[1, 0]
        self.num_classes = fp[2, 0]

        # Setting minibatch size and number of mini batches
        self.batch_size = batch_size
        self.num_batches = int(M.ceil(self.num_samples / self.batch_size))

        # Reading the matrix of samples
        x = fp[3:, 0:self.ngram - 1]  # Reading the context indices
        y = fp[3:, self.ngram - 1]  # Reading the output word index
        self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
        self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')

        self.is_weighted = False
        if instance_weights_path:
            instance_weights = np.loadtxt(instance_weights_path)
            U.xassert(
                instance_weights.shape == (self.num_samples, ),
                "The number of lines in weights file must be the same as the number of samples."
            )
            self.shared_w = T.cast(
                theano.shared(instance_weights, borrow=True),
                theano.config.floatX)
            self.is_weighted = True

        L.info(
            '  #samples: %s, ngram size: %s, vocab size: %s, #classes: %s, batch size: %s, #batches: %s'
            % (U.red(self.num_samples), U.red(
                self.ngram), U.red(self.vocab_size), U.red(self.num_classes),
               U.red(self.batch_size), U.red(self.num_batches)))
	def __init__(self, dataset_path, batch_size=500, instance_weights_path=None):
		
		L.info("Initializing dataset (with features) from: " + os.path.abspath(dataset_path))
		
		# Reading parameters from the mmap file
		fp = np.memmap(dataset_path, dtype='int32', mode='r')
		#print type(fp1)
		#fp = np.empty(fp1.shape, dtype='int32')
		#fp[:] = fp1
		#print type(fp)
		self.num_samples = fp[0]
		self.ngram = fp[1]

		fp = fp.reshape((len(fp)/self.ngram, self.ngram))

		num_header_lines = fp[1,0]

	
		self.features_info = []    # Format (vocab_size, num_of_elements)
		for i in xrange(num_header_lines-1):
			self.features_info.append( (fp[i+2,0], fp[i+2,1]) )


		self.num_classes = fp[(num_header_lines+2)-1,0]


		# Setting minibatch size and number of mini batches
		self.batch_size = batch_size
		self.num_batches = int(M.ceil(self.num_samples / self.batch_size))

		# Reading the matrix of samples
		# x is list
		'''
		self.shared_x_list = []
		last_start_pos = 0
		for i in xrange(len(self.features_info)):
			vocab_size, num_elems = self.features_info[i]
			x = fp[num_header_lines+2:,last_start_pos:last_start_pos + num_elems]			# Reading the context indices
			last_start_pos += num_elems
			shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
			self.shared_x_list.append(shared_x)
		'''
		x = fp[num_header_lines+2:,0:self.ngram - 1]			# Reading the context indices
		self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
		y = fp[num_header_lines+2:,self.ngram - 1]			# Reading the output word index
		self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')
		

		## Untested instance weighting
		self.is_weighted = False
		if instance_weights_path:
			instance_weights = np.loadtxt(instance_weights_path)
			U.xassert(instance_weights.shape == (self.num_samples,), "The number of lines in weights file must be the same as the number of samples.")
			self.shared_w = T.cast(theano.shared(instance_weights, borrow=True), theano.config.floatX)
			self.is_weighted = True
		
		L.info('  #samples: %s,  #classes: %s, batch size: %s, #batches: %s' % (
				U.red(self.num_samples),   U.red(self.num_classes), U.red(self.batch_size), U.red(self.num_batches)
			))
		for feature in enumerate(self.features_info):
			L.info("Feature %s: #ngrams= %s vocab_size= %s" %( U.red(feature[0]), U.red(feature[1][1]), U.red(feature[1][0])))
Exemple #11
0
def train(classifier, criterion, args, trainset, devset, testset=None):
	if args.algorithm == "sgd":
		from dlm.algorithms.sgd import SGD as Trainer
	else:
		L.error("Invalid training algorithm: " + args.algorithm)

	# Get number of minibatches from the training file
	num_train_batches = trainset.get_num_batches()

	# Initialize the trainer object
	trainer = Trainer(classifier, criterion, args.learning_rate, trainset, clip_threshold=args.clip_threshold)

	# Initialize the Learning Rate tuner, which adjusts learning rate based on the development/validation file
	lr_tuner = LRTuner(low=0.01*args.learning_rate, high=10*args.learning_rate, inc=0.01*args.learning_rate)
	validation_frequency = 5000 # minibatches

	# Logging and statistics
	total_num_iter = args.num_epochs * num_train_batches
	hook = Hook(classifier, devset, testset, total_num_iter, args.out_dir)
	L.info('Training')
	start_time = time.time()
	verbose_freq = 1000 # minibatches
	epoch = 0

	hook.evaluate(0)

	a = time.time()
	classifier.save_model(args.out_dir + '/model.epoch_0.gz', zipped=True)

	while (epoch < args.num_epochs):
		epoch = epoch + 1
		L.info("Epoch: " + U.red(epoch))

		minibatch_avg_cost_sum = 0
		for minibatch_index in xrange(num_train_batches):
			# Makes an update of the paramters after processing the minibatch
			minibatch_avg_cost, gparams = trainer.step(minibatch_index)
			minibatch_avg_cost_sum += minibatch_avg_cost

			if minibatch_index % verbose_freq == 0:
				grad_norms = [np.linalg.norm(gparam) for gparam in gparams]
				L.info(U.blue("[" + time.ctime() + "] ") + '%i/%i, cost=%.2f, lr=%f'
					% (minibatch_index, num_train_batches, minibatch_avg_cost_sum/(minibatch_index+1), trainer.get_learning_rate()))
				L.info('Grad Norms: [' + ', '.join(['%.6f' % gnorm for gnorm in grad_norms]) + ']')
			curr_iter = (epoch - 1) * num_train_batches + minibatch_index
			if curr_iter > 0 and curr_iter % validation_frequency == 0:
				hook.evaluate(curr_iter)

		L.info(U.blue("[" + time.ctime() + "] ") + '%i/%i, cost=%.2f, lr=%f'
			% (num_train_batches, num_train_batches, minibatch_avg_cost_sum/num_train_batches, trainer.get_learning_rate()))
		dev_ppl = hook.evaluate(curr_iter)
		lr = trainer.get_learning_rate()
		if args.enable_lr_adjust:
			lr = lr_tuner.adapt_lr(dev_ppl, lr)
		trainer.set_learning_rate(lr)
		classifier.save_model(args.out_dir + '/model.epoch_' + str(epoch) + '.gz', zipped=True)

	end_time = time.time()
	hook.evaluate(total_num_iter)
	L.info('Optimization complete')
	L.info('Ran for %.2fm' % ((end_time - start_time) / 60.))
Exemple #12
0
def train(classifier, criterion, args, trainset, devset, testset=None):
	if args.algorithm == "sgd":
		from dlm.algorithms.sgd import SGD as Trainer
	else:
		L.error("Invalid training algorithm: " + args.algorithm)

	# Get number of minibatches from the training file
	num_train_batches = trainset.get_num_batches()
	
	# Initialize the trainer object
	trainer = Trainer(classifier, criterion, args.learning_rate, trainset, clip_threshold=args.clip_threshold)

	# Initialize the Learning Rate tuner, which adjusts learning rate based on the development/validation file
	lr_tuner = LRTuner(low=0.01*args.learning_rate, high=10*args.learning_rate, inc=0.01*args.learning_rate)
	validation_frequency = 5000 # minibatches

	# Logging and statistics
	total_num_iter = args.num_epochs * num_train_batches
	hook = Hook(classifier, devset, testset, total_num_iter, args.out_dir)
	L.info('Training')
	start_time = time.time()
	verbose_freq = 1000 # minibatches
	epoch = 0
	
	hook.evaluate(0)
	
	a = time.time()
	classifier.save_model(args.out_dir + '/model.epoch_0.gz', zipped=True)
	
	while (epoch < args.num_epochs):
		epoch = epoch + 1
		L.info("Epoch: " + U.red(epoch))

		minibatch_avg_cost_sum = 0
		for minibatch_index in xrange(num_train_batches):
			# Makes an update of the paramters after processing the minibatch
			minibatch_avg_cost, gparams = trainer.step(minibatch_index)
			minibatch_avg_cost_sum += minibatch_avg_cost
			
			if minibatch_index % verbose_freq == 0:
				grad_norms = [np.linalg.norm(gparam) for gparam in gparams]
				L.info(U.blue("[" + time.ctime() + "] ") + '%i/%i, cost=%.2f, lr=%f'
					% (minibatch_index, num_train_batches, minibatch_avg_cost_sum/(minibatch_index+1), trainer.get_learning_rate()))
				L.info('Grad Norms: [' + ', '.join(['%.6f' % gnorm for gnorm in grad_norms]) + ']')
			curr_iter = (epoch - 1) * num_train_batches + minibatch_index
			if curr_iter > 0 and curr_iter % validation_frequency == 0:
				hook.evaluate(curr_iter)

		L.info(U.blue("[" + time.ctime() + "] ") + '%i/%i, cost=%.2f, lr=%f'
			% (num_train_batches, num_train_batches, minibatch_avg_cost_sum/num_train_batches, trainer.get_learning_rate()))
		dev_ppl = hook.evaluate(curr_iter)
		lr = trainer.get_learning_rate()
		if args.enable_lr_adjust:
			lr = lr_tuner.adapt_lr(dev_ppl, lr)
		trainer.set_learning_rate(lr)
		classifier.save_model(args.out_dir + '/model.epoch_' + str(epoch) + '.gz', zipped=True)

	end_time = time.time()
	hook.evaluate(total_num_iter)
	L.info('Optimization complete')
	L.info('Ran for %.2fm' % ((end_time - start_time) / 60.))
Exemple #13
0
    def __init__(self,
                 dataset_path,
                 batch_size=500,
                 instance_weights_path=None):

        L.info("Initializing dataset (with features) from: " +
               os.path.abspath(dataset_path))

        # Reading parameters from the mmap file
        fp = np.memmap(dataset_path, dtype='int32', mode='r')
        #print type(fp1)
        #fp = np.empty(fp1.shape, dtype='int32')
        #fp[:] = fp1
        #print type(fp)
        self.num_samples = fp[0]
        self.ngram = fp[1]

        fp = fp.reshape((len(fp) / self.ngram, self.ngram))

        num_header_lines = fp[1, 0]

        self.features_info = []  # Format (vocab_size, num_of_elements)
        for i in xrange(num_header_lines - 1):
            self.features_info.append((fp[i + 2, 0], fp[i + 2, 1]))

        self.num_classes = fp[(num_header_lines + 2) - 1, 0]

        # Setting minibatch size and number of mini batches
        self.batch_size = batch_size
        self.num_batches = int(M.ceil(self.num_samples / self.batch_size))

        # Reading the matrix of samples
        # x is list
        '''
		self.shared_x_list = []
		last_start_pos = 0
		for i in xrange(len(self.features_info)):
			vocab_size, num_elems = self.features_info[i]
			x = fp[num_header_lines+2:,last_start_pos:last_start_pos + num_elems]			# Reading the context indices
			last_start_pos += num_elems
			shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
			self.shared_x_list.append(shared_x)
		'''
        x = fp[num_header_lines + 2:,
               0:self.ngram - 1]  # Reading the context indices
        self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
        y = fp[num_header_lines + 2:,
               self.ngram - 1]  # Reading the output word index
        self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')

        ## Untested instance weighting
        self.is_weighted = False
        if instance_weights_path:
            instance_weights = np.loadtxt(instance_weights_path)
            U.xassert(
                instance_weights.shape == (self.num_samples, ),
                "The number of lines in weights file must be the same as the number of samples."
            )
            self.shared_w = T.cast(
                theano.shared(instance_weights, borrow=True),
                theano.config.floatX)
            self.is_weighted = True

        L.info('  #samples: %s,  #classes: %s, batch size: %s, #batches: %s' %
               (U.red(self.num_samples), U.red(self.num_classes),
                U.red(self.batch_size), U.red(self.num_batches)))
        for feature in enumerate(self.features_info):
            L.info("Feature %s: #ngrams= %s vocab_size= %s" % (U.red(
                feature[0]), U.red(feature[1][1]), U.red(feature[1][0])))
Exemple #14
0
	def __init__(self, input, func_name):
		L.info("Activation layer, function: " + U.red(func_name))
		self.input = input
		self.func = self.get_function(func_name)
		self.output = self.func(input)