def __init__(self, dataset_path, batch_size=500, instance_weights_path=None):
    L.info("Initializing dataset from: " + os.path.abspath(dataset_path))

    # Reading parameters from the mmap file
    fp = np.memmap(dataset_path, dtype='int32', mode='r')
    self.num_samples = fp[0]
    self.ngram = fp[1]
    fp = fp.reshape((self.num_samples + 3, self.ngram))
    self.vocab_size = fp[1, 0]
    self.num_classes = fp[2, 0]

    # Setting minibatch size and number of minibatches
    # (float() forces true division so a final partial batch is still counted)
    self.batch_size = batch_size
    self.num_batches = int(M.ceil(float(self.num_samples) / self.batch_size))

    # Reading the matrix of samples
    x = fp[3:, 0:self.ngram - 1]  # Context indices
    y = fp[3:, self.ngram - 1]    # Output word index
    self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
    self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')

    self.is_weighted = False
    if instance_weights_path:
        instance_weights = np.loadtxt(instance_weights_path)
        U.xassert(
            instance_weights.shape == (self.num_samples,),
            "The number of lines in the weights file must be the same as the number of samples."
        )
        self.shared_w = T.cast(theano.shared(instance_weights, borrow=True), theano.config.floatX)
        self.is_weighted = True

    L.info(' #samples: %s, ngram size: %s, vocab size: %s, #classes: %s, batch size: %s, #batches: %s' % (
        U.red(self.num_samples), U.red(self.ngram), U.red(self.vocab_size),
        U.red(self.num_classes), U.red(self.batch_size), U.red(self.num_batches)
    ))
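# A minimal sketch of the on-disk layout the reader above assumes: an int32 file
# whose first row carries num_samples and the n-gram order, second row the vocab
# size, third row the number of classes, and remaining rows one n-gram per row
# (context indices followed by the target word index). The helper name is
# hypothetical; it is only here to illustrate the format.
import numpy as np

def write_mmap_dataset(path, ngrams, vocab_size, num_classes):
    # ngrams: int32 array of shape (num_samples, ngram_order)
    num_samples, ngram_order = ngrams.shape
    header = np.zeros((3, ngram_order), dtype='int32')
    header[0, 0] = num_samples
    header[0, 1] = ngram_order
    header[1, 0] = vocab_size
    header[2, 0] = num_classes
    np.vstack([header, ngrams.astype('int32')]).tofile(path)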
def __init__(self, rng, input, vocab_size, emb_dim, emb_matrix=None, concat=True,
             emb_path=None, vocab_path=None, add_weights=False):
    L.info("Lookup Table layer, #words: %s, #dims: %s" % (U.red(vocab_size), U.red(emb_dim)))

    self.input = input
    self.emb_matrix = emb_matrix

    if self.emb_matrix is None:
        self.emb_matrix = numpy.asarray(
            rng.uniform(
                low=-0.01,   #low=-1,
                high=0.01,   #high=1,
                size=(vocab_size, emb_dim)
            ),
            dtype=theano.config.floatX
        )

    if emb_path:
        U.xassert(vocab_path, 'When emb_path is given, vocab must be given too.')
        self.initialize(emb_path, vocab_path)

    self.embeddings = theano.shared(value=self.emb_matrix, name='embeddings', borrow=True)

    if add_weights:
        weights_vec = numpy.ones(vocab_size, dtype=theano.config.floatX)
        self.weights = theano.shared(value=weights_vec, name='word_weights', borrow=True)
        # Check if the speed can be improved
        self.output = (self.weights.dimshuffle(0, 'x') * self.embeddings)[input]
        #self.output = self.weights.dimshuffle(0, 'x')[input] * self.embeddings[input]
        #self.output = self.weights[input].dimshuffle(0, 'x') * self.embeddings[input]
        self.params = [self.embeddings, self.weights]
    else:
        self.output = self.embeddings[input]
        self.params = [self.embeddings]

    if concat:
        self.output = self.output.reshape((input.shape[0], emb_dim * input.shape[1]))
def __init__(self, rng, input, n_in, n_out, W_values=None, init_method=0,
             b_values=None, no_bias=False, suffix=None):
    L.info("Linear layer, #inputs: %s, #outputs: %s" % (U.red(n_in), U.red(n_out)))

    self.input = input

    if W_values is None:
        if init_method == 0:    # Useful for ReLU activation
            high = 0.01
        elif init_method == 1:  # Useful for Tanh activation
            high = numpy.sqrt(6. / (n_in + n_out))
        elif init_method == 2:  # Useful for Sigmoid activation
            high = 4 * numpy.sqrt(6. / (n_in + n_out))
        else:
            L.error('Invalid initialization method')
        W_values = numpy.asarray(
            rng.uniform(low=-high, high=high, size=(n_in, n_out)),
            dtype=theano.config.floatX
        )

    if b_values is None and not no_bias:
        b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)

    W_name = 'W'
    if suffix is not None:
        W_name += '.' + str(suffix)

    W = theano.shared(value=W_values, name=W_name, borrow=True)
    self.W = W

    if no_bias:
        self.output = T.dot(input, self.W)
        self.params = [self.W]
    else:
        b_name = 'b'
        if suffix is not None:
            b_name += '.' + str(suffix)
        b = theano.shared(value=b_values, name=b_name, borrow=True)
        self.b = b
        self.output = T.dot(input, self.W) + self.b
        self.params = [self.W, self.b]
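# Illustration of the three init_method bounds above for a hypothetical layer
# with n_in = n_out = 512: ReLU uses a fixed 0.01, tanh uses the Glorot/Xavier
# bound sqrt(6 / (n_in + n_out)), and sigmoid scales that bound by 4.
import numpy
n_in, n_out = 512, 512
print 0.01, numpy.sqrt(6. / (n_in + n_out)), 4 * numpy.sqrt(6. / (n_in + n_out))
# -> 0.01, ~0.0765, ~0.3062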
def __init__(self, dataset_path, batch_size=500, instance_weights_path=None):
    L.info("Initializing dataset from: " + os.path.abspath(dataset_path))

    # Reading parameters from the mmap file
    print K.get_platform()
    fp = np.memmap(dataset_path, dtype='int32', mode='r')
    self.num_samples = fp[0]
    self.ngram = fp[1]
    fp = fp.reshape((self.num_samples + 3, self.ngram))
    self.vocab_size = fp[1, 0]
    self.num_classes = fp[2, 0]

    # Setting minibatch size and number of minibatches
    self.batch_size = batch_size
    self.num_batches = int(M.ceil(float(self.num_samples) / self.batch_size))

    # Reading the matrix of samples
    x = fp[3:, 0:self.ngram - 1]  # Context indices
    y = fp[3:, self.ngram - 1]    # Output word index

    #self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
    #self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')
    # What is T.cast :))
    L.info("Initialize a simple variable")
    val = np.random.random((4, 2))
    tmp = K.variable(val)
    L.info("Initialize a real variable")
    tmp = K.variable(x)
    L.info("Initialize two casted variables")
    self.shared_x = K.cast(K.variable(x), 'int32')
    self.shared_y = K.cast(K.variable(y), 'int32')
    L.info("Create two variables without borrow=True")

    self.is_weighted = False
    if instance_weights_path:
        instance_weights = np.loadtxt(instance_weights_path)
        U.xassert(
            instance_weights.shape == (self.num_samples,),
            "The number of lines in the weights file must be the same as the number of samples."
        )
        # what is borrow=True
        # self.shared_w = T.cast(theano.shared(instance_weights, borrow=True), theano.config.floatX)
        self.shared_w = K.cast(K.variable(instance_weights), K._FLOATX)
        self.is_weighted = True

    L.info(' #samples: %s, ngram size: %s, vocab size: %s, #classes: %s, batch size: %s, #batches: %s' % (
        U.red(self.num_samples), U.red(self.ngram), U.red(self.vocab_size),
        U.red(self.num_classes), U.red(self.batch_size), U.red(self.num_batches)
    ))
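# A hedged note on the questions in the comments above, assuming an old
# Theano-backed Keras: K.variable(value) roughly corresponds to
# theano.shared(np.asarray(value)), i.e. it always copies the host array,
# while borrow=True lets theano.shared reuse the numpy buffer (on CPU) instead
# of copying it. T.cast / K.cast add a symbolic cast op; they do not convert
# the stored data in place.
import numpy as np
import theano
import theano.tensor as T

val = np.random.random((4, 2))
shared_borrowed = theano.shared(val, borrow=True)                           # may reuse val's memory (CPU only)
shared_copied = theano.shared(np.asarray(val, dtype=theano.config.floatX))  # always copies
as_int = T.cast(shared_copied, 'int32')                                     # symbolic cast; shared_copied stays floatX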
def __init__(self, rng, input, n_in, n_out, W_values=None, init_method=0,
             b_values=None, no_bias=False, suffix=None):
    L.info("Linear layer, #inputs: %s, #outputs: %s" % (U.red(n_in), U.red(n_out)))

    self.input = input

    if W_values is None:
        if init_method == 0:    # Useful for ReLU activation
            high = 0.01
        elif init_method == 1:  # Useful for Tanh activation
            high = numpy.sqrt(6. / (n_in + n_out))
        elif init_method == 2:  # Useful for Sigmoid activation
            high = 4 * numpy.sqrt(6. / (n_in + n_out))
        else:
            L.error('Invalid initialization method')
        W_values = numpy.asarray(
            rng.uniform(low=-high, high=high, size=(n_in, n_out)),
            dtype=theano.config.floatX
        )

    if b_values is None and not no_bias:
        b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)

    W_name = 'W'
    if suffix is not None:
        W_name += '.' + str(suffix)

    W = theano.shared(value=W_values, name=W_name, borrow=True)
    self.W = W

    if no_bias:
        self.output = T.dot(input, self.W)
        self.params = [self.W]
    else:
        b_name = 'b'
        if suffix is not None:
            b_name += '.' + str(suffix)
        b = theano.shared(value=b_values, name=b_name, borrow=True)
        self.b = b
        self.output = T.dot(input, self.W) + self.b
        self.params = [self.W, self.b]
def __init__(self, rng, input, vocab_size, emb_dim, emb_matrix=None, concat=True,
             emb_path=None, vocab_path=None, add_weights=False, suffix=None, high=0.01):
    L.info("Lookup Table layer, #words: %s, #dims: %s" % (U.red(vocab_size), U.red(emb_dim)))

    self.input = input
    self.emb_matrix = emb_matrix

    if self.emb_matrix is None:
        self.emb_matrix = numpy.asarray(
            rng.uniform(
                low=-high,   #low=-1,
                high=high,   #high=1,
                size=(vocab_size, emb_dim)
            ),
            dtype=theano.config.floatX
        )

    if emb_path:
        U.xassert(vocab_path, 'When emb_path is given, vocab must be given too.')
        self.initialize(emb_path, vocab_path)

    embeddings_name = 'embeddings'
    if suffix is not None:
        embeddings_name += '.' + str(suffix)
    self.embeddings = theano.shared(value=self.emb_matrix, name=embeddings_name, borrow=True)

    if add_weights:
        weights_vec = numpy.ones(vocab_size, dtype=theano.config.floatX)
        self.weights = theano.shared(value=weights_vec, name='word_weights', borrow=True)
        # Check if the speed can be improved
        self.output = (self.weights.dimshuffle(0, 'x') * self.embeddings)[input]
        #self.output = self.weights.dimshuffle(0, 'x')[input] * self.embeddings[input]
        #self.output = self.weights[input].dimshuffle(0, 'x') * self.embeddings[input]
        self.params = [self.embeddings, self.weights]
    else:
        self.output = self.embeddings[input]
        self.params = [self.embeddings]

    if concat:
        self.output = self.output.reshape((input.shape[0], emb_dim * input.shape[1]))
def __init__(self, rng, input, vocab_size, emb_dim, emb_matrix=None, concat=True,
             emb_path=None, vocab_path=None, add_weights=False):
    L.info("Lookup Table layer, #words: %s, #dims: %s" % (U.red(vocab_size), U.red(emb_dim)))

    self.input = input
    L.info("Input " + str(input))
    L.info("Add weights " + str(add_weights))

    self.emb_matrix = emb_matrix

    if self.emb_matrix is None:
        self.emb_matrix = numpy.asarray(
            rng.uniform(
                low=-0.01,   #low=-1,
                high=0.01,   #high=1,
                size=(vocab_size, emb_dim)
            ),
            dtype=K._FLOATX
        )

    if emb_path:
        U.xassert(vocab_path, 'When emb_path is given, vocab must be given too.')
        self.initialize(emb_path, vocab_path)

    #self.embeddings = theano.shared(value=self.emb_matrix, name='embeddings', borrow=True)
    self.embeddings = K.variable(self.emb_matrix, name='embeddings')

    if add_weights:
        weights_vec = numpy.ones(vocab_size, dtype=K._FLOATX)
        #self.weights = theano.shared(value=weights_vec, name='word_weights', borrow=True)
        self.weights = K.variable(weights_vec, name='word_weights')
        # Check if the speed can be improved
        self.output = (self.weights.dimshuffle(0, 'x') * self.embeddings)[input]
        #self.output = self.weights.dimshuffle(0, 'x')[input] * self.embeddings[input]
        #self.output = self.weights[input].dimshuffle(0, 'x') * self.embeddings[input]
        self.params = [self.embeddings, self.weights]
    else:
        self.output = self.embeddings[input]
        self.params = [self.embeddings]

    if concat:
        self.output = self.output.reshape((input.shape[0], emb_dim * input.shape[1]))
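# A small numpy sketch (illustrative values) of what the weighted lookup above
# computes: scaling every embedding row by its word weight and then indexing
# gives the same result as indexing first and scaling the selected rows.
import numpy as np

embeddings = np.arange(12, dtype='float32').reshape(4, 3)  # vocab_size=4, emb_dim=3
weights = np.array([1.0, 0.5, 2.0, 1.0], dtype='float32')  # one weight per word
idx = np.array([[0, 2], [1, 3]])                           # a batch of two 2-word contexts

scaled_then_indexed = (weights[:, None] * embeddings)[idx]
indexed_then_scaled = weights[idx][:, :, None] * embeddings[idx]
assert np.allclose(scaled_then_indexed, indexed_then_scaled)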
def __init__(self, dataset_path, batch_size=500, instance_weights_path=None):
    L.info("Initializing dataset from: " + os.path.abspath(dataset_path))

    # Reading parameters from the mmap file
    print K.get_platform()
    fp = np.memmap(dataset_path, dtype='int32', mode='r')
    self.num_samples = fp[0]
    self.ngram = fp[1]
    fp = fp.reshape((self.num_samples + 3, self.ngram))
    self.vocab_size = fp[1, 0]
    self.num_classes = fp[2, 0]

    # Setting minibatch size and number of minibatches
    self.batch_size = batch_size
    self.num_batches = int(M.ceil(float(self.num_samples) / self.batch_size))

    # Reading the matrix of samples
    x = fp[3:, 0:self.ngram - 1]  # Context indices
    y = fp[3:, self.ngram - 1]    # Output word index

    #self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
    #self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')
    # What is T.cast :))
    L.info("Initialize a simple variable")
    val = np.random.random((4, 2))
    tmp = K.variable(val)
    L.info("Initialize a real variable")
    tmp = K.variable(x)
    L.info("Initialize two casted variables")
    self.shared_x = K.cast(K.variable(x), 'int32')
    self.shared_y = K.cast(K.variable(y), 'int32')
    L.info("Create two variables without borrow=True")

    self.is_weighted = False
    if instance_weights_path:
        instance_weights = np.loadtxt(instance_weights_path)
        U.xassert(
            instance_weights.shape == (self.num_samples,),
            "The number of lines in the weights file must be the same as the number of samples."
        )
        # what is borrow=True
        # self.shared_w = T.cast(theano.shared(instance_weights, borrow=True), theano.config.floatX)
        self.shared_w = K.cast(K.variable(instance_weights), K._FLOATX)
        self.is_weighted = True

    L.info(' #samples: %s, ngram size: %s, vocab size: %s, #classes: %s, batch size: %s, #batches: %s' % (
        U.red(self.num_samples), U.red(self.ngram), U.red(self.vocab_size),
        U.red(self.num_classes), U.red(self.batch_size), U.red(self.num_batches)
    ))
def __init__(self, dataset_path, batch_size=500, instance_weights_path=None):
    L.info("Initializing dataset from: " + os.path.abspath(dataset_path))

    # Reading parameters from the mmap file
    fp = np.memmap(dataset_path, dtype='int32', mode='r')
    self.num_samples = fp[0]
    self.ngram = fp[1]
    fp = fp.reshape((self.num_samples + 3, self.ngram))
    self.vocab_size = fp[1, 0]
    self.num_classes = fp[2, 0]

    # Setting minibatch size and number of minibatches
    self.batch_size = batch_size
    self.num_batches = int(M.ceil(float(self.num_samples) / self.batch_size))

    # Reading the matrix of samples
    x = fp[3:, 0:self.ngram - 1]  # Context indices
    y = fp[3:, self.ngram - 1]    # Output word index
    self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
    self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')

    self.is_weighted = False
    if instance_weights_path:
        instance_weights = np.loadtxt(instance_weights_path)
        U.xassert(
            instance_weights.shape == (self.num_samples,),
            "The number of lines in the weights file must be the same as the number of samples."
        )
        self.shared_w = T.cast(theano.shared(instance_weights, borrow=True), theano.config.floatX)
        self.is_weighted = True

    L.info(' #samples: %s, ngram size: %s, vocab size: %s, #classes: %s, batch size: %s, #batches: %s' % (
        U.red(self.num_samples), U.red(self.ngram), U.red(self.vocab_size),
        U.red(self.num_classes), U.red(self.batch_size), U.red(self.num_batches)
    ))
def __init__(self, dataset_path, batch_size=500, instance_weights_path=None):
    L.info("Initializing dataset (with features) from: " + os.path.abspath(dataset_path))

    # Reading parameters from the mmap file
    fp = np.memmap(dataset_path, dtype='int32', mode='r')
    #print type(fp1)
    #fp = np.empty(fp1.shape, dtype='int32')
    #fp[:] = fp1
    #print type(fp)
    self.num_samples = fp[0]
    self.ngram = fp[1]
    fp = fp.reshape((len(fp) // self.ngram, self.ngram))

    num_header_lines = fp[1, 0]
    self.features_info = []  # Format: (vocab_size, num_of_elements)
    for i in xrange(num_header_lines - 1):
        self.features_info.append((fp[i + 2, 0], fp[i + 2, 1]))
    self.num_classes = fp[(num_header_lines + 2) - 1, 0]

    # Setting minibatch size and number of minibatches
    # (float() forces true division so a final partial batch is still counted)
    self.batch_size = batch_size
    self.num_batches = int(M.ceil(float(self.num_samples) / self.batch_size))

    # Reading the matrix of samples
    # x is a list (one block of columns per feature) in the commented-out variant below
    '''
    self.shared_x_list = []
    last_start_pos = 0
    for i in xrange(len(self.features_info)):
        vocab_size, num_elems = self.features_info[i]
        x = fp[num_header_lines+2:, last_start_pos:last_start_pos + num_elems]  # Context indices
        last_start_pos += num_elems
        shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
        self.shared_x_list.append(shared_x)
    '''
    x = fp[num_header_lines + 2:, 0:self.ngram - 1]  # Context indices
    self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
    y = fp[num_header_lines + 2:, self.ngram - 1]    # Output word index
    self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')

    ## Untested instance weighting
    self.is_weighted = False
    if instance_weights_path:
        instance_weights = np.loadtxt(instance_weights_path)
        U.xassert(
            instance_weights.shape == (self.num_samples,),
            "The number of lines in the weights file must be the same as the number of samples."
        )
        self.shared_w = T.cast(theano.shared(instance_weights, borrow=True), theano.config.floatX)
        self.is_weighted = True

    L.info(' #samples: %s, #classes: %s, batch size: %s, #batches: %s' % (
        U.red(self.num_samples), U.red(self.num_classes),
        U.red(self.batch_size), U.red(self.num_batches)
    ))
    for i, (feat_vocab_size, feat_num_elems) in enumerate(self.features_info):
        L.info("Feature %s: #ngrams= %s vocab_size= %s" % (
            U.red(i), U.red(feat_num_elems), U.red(feat_vocab_size)
        ))
def train(classifier, criterion, args, trainset, devset, testset=None):
    if args.algorithm == "sgd":
        from dlm.algorithms.sgd import SGD as Trainer
    else:
        L.error("Invalid training algorithm: " + args.algorithm)

    # Get the number of minibatches from the training file
    num_train_batches = trainset.get_num_batches()

    # Initialize the trainer object
    trainer = Trainer(classifier, criterion, args.learning_rate, trainset, clip_threshold=args.clip_threshold)

    # Initialize the learning-rate tuner, which adjusts the learning rate based on the development/validation file
    lr_tuner = LRTuner(low=0.01*args.learning_rate, high=10*args.learning_rate, inc=0.01*args.learning_rate)
    validation_frequency = 5000  # minibatches

    # Logging and statistics
    total_num_iter = args.num_epochs * num_train_batches
    hook = Hook(classifier, devset, testset, total_num_iter, args.out_dir)

    L.info('Training')
    start_time = time.time()
    verbose_freq = 1000  # minibatches
    epoch = 0

    hook.evaluate(0)
    a = time.time()
    classifier.save_model(args.out_dir + '/model.epoch_0.gz', zipped=True)

    while (epoch < args.num_epochs):
        epoch = epoch + 1
        L.info("Epoch: " + U.red(epoch))

        minibatch_avg_cost_sum = 0
        for minibatch_index in xrange(num_train_batches):
            # Make an update of the parameters after processing the minibatch
            minibatch_avg_cost, gparams = trainer.step(minibatch_index)
            minibatch_avg_cost_sum += minibatch_avg_cost

            if minibatch_index % verbose_freq == 0:
                grad_norms = [np.linalg.norm(gparam) for gparam in gparams]
                L.info(U.blue("[" + time.ctime() + "] ") + '%i/%i, cost=%.2f, lr=%f' % (
                    minibatch_index, num_train_batches,
                    minibatch_avg_cost_sum/(minibatch_index+1), trainer.get_learning_rate()))
                L.info('Grad Norms: [' + ', '.join(['%.6f' % gnorm for gnorm in grad_norms]) + ']')

            curr_iter = (epoch - 1) * num_train_batches + minibatch_index
            if curr_iter > 0 and curr_iter % validation_frequency == 0:
                hook.evaluate(curr_iter)

        L.info(U.blue("[" + time.ctime() + "] ") + '%i/%i, cost=%.2f, lr=%f' % (
            num_train_batches, num_train_batches,
            minibatch_avg_cost_sum/num_train_batches, trainer.get_learning_rate()))

        dev_ppl = hook.evaluate(curr_iter)
        lr = trainer.get_learning_rate()
        if args.enable_lr_adjust:
            lr = lr_tuner.adapt_lr(dev_ppl, lr)
        trainer.set_learning_rate(lr)

        classifier.save_model(args.out_dir + '/model.epoch_' + str(epoch) + '.gz', zipped=True)

    end_time = time.time()
    hook.evaluate(total_num_iter)
    L.info('Optimization complete')
    L.info('Ran for %.2fm' % ((end_time - start_time) / 60.))
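# LRTuner's internals are not shown in this snippet; the sketch below is a
# hypothetical stand-in consistent with the constructor arguments used above
# (low/high bounds and an increment), not the actual dlm implementation: it
# nudges the rate up while dev perplexity improves, down otherwise, and clamps
# it to [low, high].
class SimpleLRTuner(object):
    def __init__(self, low, high, inc):
        self.low, self.high, self.inc = low, high, inc
        self.best_ppl = float('inf')

    def adapt_lr(self, dev_ppl, lr):
        lr = lr + self.inc if dev_ppl < self.best_ppl else lr - self.inc
        self.best_ppl = min(self.best_ppl, dev_ppl)
        return max(self.low, min(self.high, lr))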
def __init__(self, dataset_path, batch_size=500, instance_weights_path=None):
    L.info("Initializing dataset (with features) from: " + os.path.abspath(dataset_path))

    # Reading parameters from the mmap file
    fp = np.memmap(dataset_path, dtype='int32', mode='r')
    #print type(fp1)
    #fp = np.empty(fp1.shape, dtype='int32')
    #fp[:] = fp1
    #print type(fp)
    self.num_samples = fp[0]
    self.ngram = fp[1]
    fp = fp.reshape((len(fp) // self.ngram, self.ngram))

    num_header_lines = fp[1, 0]
    self.features_info = []  # Format: (vocab_size, num_of_elements)
    for i in xrange(num_header_lines - 1):
        self.features_info.append((fp[i + 2, 0], fp[i + 2, 1]))
    self.num_classes = fp[(num_header_lines + 2) - 1, 0]

    # Setting minibatch size and number of minibatches
    # (float() forces true division so a final partial batch is still counted)
    self.batch_size = batch_size
    self.num_batches = int(M.ceil(float(self.num_samples) / self.batch_size))

    # Reading the matrix of samples
    # x is a list (one block of columns per feature) in the commented-out variant below
    '''
    self.shared_x_list = []
    last_start_pos = 0
    for i in xrange(len(self.features_info)):
        vocab_size, num_elems = self.features_info[i]
        x = fp[num_header_lines+2:, last_start_pos:last_start_pos + num_elems]  # Context indices
        last_start_pos += num_elems
        shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
        self.shared_x_list.append(shared_x)
    '''
    x = fp[num_header_lines + 2:, 0:self.ngram - 1]  # Context indices
    self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
    y = fp[num_header_lines + 2:, self.ngram - 1]    # Output word index
    self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')

    ## Untested instance weighting
    self.is_weighted = False
    if instance_weights_path:
        instance_weights = np.loadtxt(instance_weights_path)
        U.xassert(
            instance_weights.shape == (self.num_samples,),
            "The number of lines in the weights file must be the same as the number of samples."
        )
        self.shared_w = T.cast(theano.shared(instance_weights, borrow=True), theano.config.floatX)
        self.is_weighted = True

    L.info(' #samples: %s, #classes: %s, batch size: %s, #batches: %s' % (
        U.red(self.num_samples), U.red(self.num_classes),
        U.red(self.batch_size), U.red(self.num_batches)
    ))
    for i, (feat_vocab_size, feat_num_elems) in enumerate(self.features_info):
        L.info("Feature %s: #ngrams= %s vocab_size= %s" % (
            U.red(i), U.red(feat_num_elems), U.red(feat_vocab_size)
        ))
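# Row layout implied by the featured reader above (every row has `ngram` int32
# columns; only the leading columns of the header rows are meaningful):
#   row 0:                       num_samples, ngram
#   row 1:                       num_header_lines
#   rows 2 .. num_header_lines:  one (vocab_size, num_of_elements) pair per feature
#   row num_header_lines + 1:    num_classes
#   rows num_header_lines + 2..: one sample per row, ngram-1 context indices
#                                followed by the target word index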
def __init__(self, input, func_name):
    L.info("Activation layer, function: " + U.red(func_name))
    self.input = input
    self.func = self.get_function(func_name)
    self.output = self.func(input)
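# get_function is defined elsewhere; a minimal Theano-based sketch of the kind
# of mapping it presumably performs (the accepted names here are assumptions):
import theano.tensor as T

def get_function_sketch(func_name):
    mapping = {
        'tanh': T.tanh,
        'sigmoid': T.nnet.sigmoid,
        'relu': lambda x: T.maximum(0., x),
        'softmax': T.nnet.softmax,
    }
    return mapping[func_name]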