Example #1
0
	def __init__(self, dataset_path, ngram_size, vocab_path):
		"""Load a text file of n-grams into shared variables.

		Every line of the dataset is split on whitespace and mapped to word ids
		through the vocabulary. The last id of each line becomes the target
		(``shared_y``) and the preceding ids the context (``shared_x``).

		Args:
			dataset_path: Path of the UTF-8 text file to read.
			ngram_size: Kept for interface compatibility; not used by this reader.
			vocab_path: Path of the vocabulary file handed to VocabManager.
		"""
		L.info("Initializing dataset from: " + dataset_path)

		vocab = VocabManager(vocab_path)

		self.num_sentences = 0

		# The context manager closes the file even when the id lookup raises.
		with codecs.open(dataset_path, 'r', encoding="UTF-8") as dataset:
			ngrams_list = [vocab.get_ids_given_word_list(line.split()) for line in dataset]

		data = np.asarray(ngrams_list)

		# NOTE(review): the 2-D slicing presumably needs every line to yield the same number of ids -- confirm.
		x = data[:,0:-1]
		y = data[:,-1]
		self.num_samples = y.shape[0]

		self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
		self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')
Example #2
0
	def evaluate(self, curr_iter):
		"""Evaluate on the dev (and optional test) set and log the results.

		Tracks the best dev perplexity seen so far together with the iteration
		at which it occurred, and logs an estimate of the remaining minutes.

		Args:
			curr_iter: Index of the current training iteration.

		Returns:
			The perplexity measured on the development set.
		"""
		denom = self.dev_eval.get_denominator()
		dev_err = self.dev_eval.classification_error()
		dev_ppl = self.dev_eval.perplexity()
		if self.test_eval:
			test_err = self.test_eval.classification_error()
			test_ppl = self.test_eval.perplexity()

		# A new best dev perplexity also pins the test perplexity reported as "best".
		if dev_ppl < self.best_dev_perplexity:
			self.best_dev_perplexity = dev_ppl
			self.best_iter = curr_iter
			if self.test_eval:
				self.best_test_perplexity = test_ppl

		# Remaining time is extrapolated from the average time per iteration so far.
		rem_time = ""
		if curr_iter > 0:
			now = time.time()
			minutes_left = int((self.total_num_iter - curr_iter) * (now - self.t0) / (curr_iter * 60))
			rem_time = str(minutes_left) + "m"

		dev_template = 'DEV  => Error=%.2f%%, PPL=' + U.b_yellow('%.2f @ %i') + ' (' + U.b_red('%.2f @ %i') + '), Denom=%.3f, %s'
		dev_values = (dev_err * 100., dev_ppl, curr_iter, self.best_dev_perplexity, self.best_iter, denom, rem_time)
		L.info(dev_template % dev_values)
		if self.test_eval:
			test_template = 'TEST => Error=%.2f%%, PPL=' + U.b_yellow('%.2f @ %i') + ' (' + U.b_red('%.2f @ %i') + ')'
			test_values = (test_err * 100., test_ppl, curr_iter, self.best_test_perplexity, self.best_iter)
			L.info(test_template % test_values)

		return dev_ppl
Example #3
0
	def __init__(self, dataset_path, batch_size=500, instance_weights_path=None):
		"""Memory-map a binary n-gram dataset and expose it as shared variables.

		The file is read as a flat int32 array whose first two values are the
		number of samples and the n-gram size. After reshaping into rows of
		``ngram`` values, the first column of rows 1 and 2 holds the vocabulary
		size and the number of classes, and the samples start at row 3.

		Args:
			dataset_path: Path of the memory-mapped dataset file.
			batch_size: Number of samples per minibatch.
			instance_weights_path: Optional text file with one weight per sample.
		"""
		L.info("Initializing dataset from: " + os.path.abspath(dataset_path))

		# Reading parameters from the mmap file
		fp = np.memmap(dataset_path, dtype='int32', mode='r')
		self.num_samples = fp[0]
		self.ngram = fp[1]
		fp = fp.reshape((self.num_samples + 3, self.ngram))
		self.vocab_size = fp[1,0]
		self.num_classes = fp[2,0]

		# Setting minibatch size and number of mini batches.
		# The float conversion forces true division: with Python 2 integer
		# division the ceil was a no-op and the final partial batch was dropped.
		self.batch_size = batch_size
		self.num_batches = int(M.ceil(float(self.num_samples) / self.batch_size))

		# Reading the matrix of samples
		x = fp[3:,0:self.ngram - 1]			# Reading the context indices
		y = fp[3:,self.ngram - 1]			# Reading the output word index
		self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
		self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')

		self.is_weighted = False
		if instance_weights_path:
			instance_weights = np.loadtxt(instance_weights_path)
			U.xassert(instance_weights.shape == (self.num_samples,), "The number of lines in weights file must be the same as the number of samples.")
			self.shared_w = T.cast(theano.shared(instance_weights, borrow=True), theano.config.floatX)
			self.is_weighted = True

		L.info('  #samples: %s, ngram size: %s, vocab size: %s, #classes: %s, batch size: %s, #batches: %s' % (
				U.red(self.num_samples), U.red(self.ngram), U.red(self.vocab_size), U.red(self.num_classes), U.red(self.batch_size), U.red(self.num_batches)
			)
		)
Example #4
0
	def initialize(self, emb_path, vocab_path):
		"""Overwrite rows of the embedding matrix with vectors read from a file.

		Row ``i`` is replaced by the vector the embedding reader returns for the
		vocabulary word with id ``i``. Rows whose lookup yields a falsy value
		are left unchanged.

		Args:
			emb_path: Path handed to W2VEmbReader.
			vocab_path: Path handed to VocabManager.
		"""
		L.info('Initializing lookup table')
		vocab = VocabManager(vocab_path)
		reader = W2VEmbReader(emb_path)
		dims_match = reader.get_emb_dim() == self.emb_matrix.shape[1]
		U.xassert(dims_match, 'The embeddings dimension does not match with the given word embeddings')
		for word_id in range(self.emb_matrix.shape[0]):
			word = vocab.get_word_given_id(word_id)
			vector = reader.get_emb_given_word(word)
			if not vector:
				continue
			self.emb_matrix[word_id] = vector
Example #5
0
def print_args(args):
	"""Log every attribute of ``args``, sorted case-insensitively by name.

	Args:
		args: Any object accepted by ``vars()``.
	"""
	import dlm.io.logging as L
	L.info("Arguments:")
	items = vars(args)
	for key in sorted(items.keys(), key=lambda s: s.lower()):
		# The original built a "None" placeholder for falsy values but never
		# used it; the raw value has always been what gets logged, so the
		# dead assignment is removed and the output is unchanged.
		L.info("  " + key + ": " + BColors.MAGENTA + str(items[key]) + BColors.ENDC)
Example #6
0
	def load_model(self, model_path):
		"""Unpickle an ``(args, params)`` pair from a model file.

		Paths ending in ``.gz`` are read through gzip, anything else as a
		plain file.

		Args:
			model_path: Path of the model file to read.

		Returns:
			The ``(args, params)`` tuple stored in the file.
		"""
		L.info('Loading model from ' + model_path)
		t0 = time.time()
		opener = gzip.open if model_path.endswith('.gz') else open
		# Pickle streams are binary: text mode ('r') corrupts them on platforms
		# that translate newlines and fails outright on Python 3.
		# NOTE: pickle.load can execute arbitrary code -- only load trusted model files.
		with opener(model_path, 'rb') as model_file:
			args, params = pickle.load(model_file)
		L.info('  |-> took %.2f seconds' % (time.time() - t0))
		return args, params
Example #7
0
def set_theano_device(device, threads):
	"""Configure the compute device through environment variables and load the backend wrapper.

	Must be called before theano is imported, because the settings are
	passed through ``THEANO_FLAGS``. After the environment is prepared the
	backend wrapper is imported, switched to the 'tensorflow' platform and
	exercised once by creating a small variable.

	Args:
		device: 'cpu', 'gpu' or 'gpu<id>'.
		threads: Value written to ``OMP_NUM_THREADS``.

	Raises:
		TypeError: If selecting the platform or creating the test variable fails.
	"""
	import sys
	import dlm.io.logging as L
	xassert(device == "cpu" or device.startswith("gpu"), "The device can only be 'cpu', 'gpu' or 'gpu<id>'")
	if device.startswith("gpu") and len(device) > 3:
		try:
			gpu_id = int(device[3:])
			if not is_gpu_free(gpu_id):
				L.warning('The selected GPU (GPU' + str(gpu_id) + ') is apparently busy.')
		except ValueError:
			L.error("Unknown GPU device format: " + device)
	if device.startswith("gpu"):
		L.warning('Running on GPU yields non-deterministic results.')
	# `in` replaces dict.has_key(), which no longer exists in Python 3.
	xassert('theano' not in sys.modules, "dlm.utils.set_theano_device() function cannot be called after importing theano")
	os.environ['OMP_NUM_THREADS'] = str(threads)
	theano_flags = [
		'device=' + device,
		'force_device=True',
		'floatX=float32',
		'warn_float64=warn',
		'cast_policy=numpy+floatX',
		#'allow_gc=True',
		'print_active_device=False',
		'exception_verbosity=high',		# Highly verbose debugging
		'mode=FAST_RUN',
		'nvcc.fastmath=False', 			# True: makes div and sqrt faster at the cost of precision, and possible bugs
		#'optimizer_including=cudnn', 	# Comment out if CUDNN is not available
	]
	os.environ['THEANO_FLAGS'] = ','.join(theano_flags)

	# The backend wrapper is imported here in place of theano itself.
	try:
		import backend.nn_wrapper as K
	except EnvironmentError:
		L.exception()

	try:
		K.set_platform('tensorflow') # theano is working
		L.info("Creating a variable inside utils")
		import numpy as np
		val = np.random.random((4, 2))
		# Smoke test: creating a variable checks that the selected platform is usable.
		K.variable(val)
	except Exception:
		# `except Exception` (not a bare except) lets KeyboardInterrupt and
		# SystemExit propagate instead of being turned into a TypeError.
		sys.stderr.write("Unexpected error: %s\n" % (sys.exc_info(),))
		raise TypeError("Cannot set the platform")
Example #8
0
	def save_model(self, model_path, zipped=True, compress_level=5):
		"""Pickle ``(self.args, parameter values)`` to a model file.

		Args:
			model_path: Destination path.
			zipped: When true the file is written through gzip.
			compress_level: gzip compression level, used only when ``zipped``.
		"""
		L.info('Saving model to ' + model_path)
		t0 = time.time()
		# Pickle streams are binary, so the plain file is opened with 'wb' as
		# well; text mode ('w') breaks on Python 3 and on newline-translating platforms.
		if zipped:
			model_file = gzip.open(model_path, 'wb', compresslevel=compress_level)
		else:
			model_file = open(model_path, 'wb')
		with model_file:
			values = [param.get_value() for param in self.get_params()]
			pickle.dump((self.args, values), model_file)
		L.info('  |-> took %.2f seconds' % (time.time() - t0))
Example #9
0
	def __init__(self, dataset_path, is_nbest, ngram_size, vocab_path):
		"""Read sentences, expand them into n-grams and store them as shared variables.

		Each sentence is left-padded with ``ngram_size - 1`` '<s>' tokens (and
		terminated with '</s>' when the vocabulary has that token), mapped to
		ids and cut into windows of ``ngram_size`` ids. ``shared_starts`` holds
		the offset of every sentence's first n-gram plus one final end offset.

		Args:
			dataset_path: Path of an n-best list or of a UTF-8 text file.
			is_nbest: ``True`` to read ``dataset_path`` as an n-best list.
			ngram_size: Number of ids per n-gram window.
			vocab_path: Path of the vocabulary file handed to VocabManager.
		"""
		L.info("Initializing dataset from: " + dataset_path)

		vocab = VocabManager(vocab_path)

		def get_ngrams(tokens):
			for i in range(ngram_size - 1):
				tokens.insert(0, '<s>')
			if vocab.has_end_padding:
				tokens.append('</s>')
			indices = vocab.get_ids_given_word_list(tokens)
			return U.get_all_windows(indices, ngram_size)

		starts_list = []
		ngrams_list = []
		self.num_sentences = 0

		def add_sentence(tokens):
			# The running start offset always equals the number of n-grams collected so far.
			starts_list.append(len(ngrams_list))
			ngrams_list.extend(get_ngrams(tokens))

		# Kept as an explicit comparison so that only True (or a value equal
		# to it) selects the n-best reader, exactly as before.
		if is_nbest == True:
			nbest = NBestList(dataset_path)
			for group in nbest:
				for item in group:
					add_sentence(item.hyp.split())
		else:
			# The context manager closes the file even when processing a line raises.
			with codecs.open(dataset_path, 'r', encoding="UTF-8") as dataset:
				for line in dataset:
					add_sentence(line.split())

		self.num_sentences = len(starts_list)

		data = np.asarray(ngrams_list)
		# Final sentinel: the end offset of the last sentence.
		starts_list.append(len(ngrams_list))
		starts_array = np.asarray(starts_list)

		x = data[:,0:-1]
		y = data[:,-1]

		self.num_samples = y.shape[0]

		self.shared_starts = T.cast(theano.shared(starts_array, borrow=True), 'int64')
		self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
		self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')
Example #10
0
def augment(model_path, input_nbest_path, vocab_path, output_nbest_path):
	"""Append a language-model score to every hypothesis of an n-best list.

	For each hypothesis the log-probabilities of all its n-grams are summed
	and appended as a new feature. N-gram scores are cached across the whole
	run, so every distinct n-gram is evaluated only once.

	Args:
		model_path: Path of the model loaded into the MLP classifier.
		input_nbest_path: Path of the n-best list to read.
		vocab_path: Path of the vocabulary file handed to VocabManager.
		output_nbest_path: Path of the augmented n-best list to write.
	"""
	classifier = MLP(model_path=model_path)
	evaluator = eval.Evaluator(None, classifier)

	vocab = VocabManager(vocab_path)

	ngram_size = classifier.ngram_size

	def get_ngrams(tokens):
		for i in range(ngram_size - 1):
			tokens.insert(0, '<s>')
		if vocab.has_end_padding:
			tokens.append('</s>')
		indices = vocab.get_ids_given_word_list(tokens)
		return U.get_all_windows(indices, ngram_size)

	input_nbest = NBestList(input_nbest_path, mode='r')
	output_nbest = NBestList(output_nbest_path, mode='w')

	L.info('Augmenting: ' + input_nbest_path)

	start_time = time.time()

	cache = dict()		# str(ngram) -> log-probability
	try:
		for group in input_nbest:
			new_ngrams = []
			new_keys = []
			keyed_items = []
			# First pass: extract the n-grams of each hypothesis once and collect the unseen ones.
			for item in group:
				ngrams = get_ngrams(item.hyp.split())
				keys = [str(ngram) for ngram in ngrams]
				keyed_items.append((item, keys))
				for ngram, key in zip(ngrams, keys):
					if key not in cache:
						new_ngrams.append(ngram)
						new_keys.append(key)
						cache[key] = None		# placeholder, overwritten right after scoring
			# Score all unseen n-grams of the group in one batch.
			if len(new_ngrams) > 0:
				ngram_array = np.asarray(new_ngrams, dtype='int32')
				ngram_log_prob_list = evaluator.get_ngram_log_prob(ngram_array[:,0:-1], ngram_array[:,-1])
				for i, key in enumerate(new_keys):
					cache[key] = ngram_log_prob_list[i]
			# Second pass: every n-gram is now cached, so just sum and write.
			for item, keys in keyed_items:
				item.append_feature(sum(cache[key] for key in keys))
				output_nbest.write(item)
	finally:
		# Close the output even when scoring fails part-way through.
		output_nbest.close()

	L.info("Ran for %.2fs" % (time.time() - start_time))
0
	def __init__(self, input_path):
		"""Build word<->id mappings from a vocabulary file with one word per line.

		The id of a word is its zero-based line number. The ids of '<unk>' and
		'<s>' are stored in ``unk_id`` and ``padding_id``; ``has_end_padding``
		records whether '</s>' is part of the vocabulary.

		Args:
			input_path: Path of the UTF-8 vocabulary file.
		"""
		L.info("Initializing vocabulary from: " + input_path)
		self.word_to_id_dict = dict()
		self.id_to_word_dict = dict()
		with codecs.open(input_path, 'r', encoding='UTF-8') as input_file:
			for curr_id, line in enumerate(input_file):
				word = line.strip()
				self.word_to_id_dict[word] = curr_id
				self.id_to_word_dict[curr_id] = word
		try:
			self.unk_id = self.word_to_id_dict['<unk>']
			self.padding_id = self.word_to_id_dict['<s>']
		except KeyError:
			L.error("Given vocab file does not include <unk> or <s>")
		# `in` replaces dict.has_key(), which was removed in Python 3.
		self.has_end_padding = '</s>' in self.word_to_id_dict
Example #12
0
	def __init__(self, rng, input, n_in, n_out, W_values=None, init_method=0, b_values=None, no_bias=False, suffix=None):
		"""Build a linear layer whose output is ``dot(input, W)``, plus ``b`` unless disabled.

		Args:
			rng: Random generator providing ``uniform(low, high, size)``.
			input: Symbolic input of the layer.
			n_in: Number of inputs.
			n_out: Number of outputs.
			W_values: Initial weights; drawn uniformly at random when None.
			init_method: Range of the random weights: 0 (relu), 1 (tanh) or 2 (sigmoid).
			b_values: Initial bias; zeros when None and a bias is used.
			no_bias: When true the layer has no bias term.
			suffix: Optional suffix appended to the parameter names after a dot.
		"""
		L.info("Linear layer, #inputs: %s, #outputs: %s" % (U.red(n_in), U.red(n_out)))

		self.input = input

		if W_values is None:
			# `bound` is the half-width of the symmetric sampling interval.
			if init_method == 0:	# Useful for Relu activation
				bound = 0.01
			elif init_method == 1:	# Useful for Tanh activation
				bound = numpy.sqrt(6. / (n_in + n_out))
			elif init_method == 2:	# Useful for Sigmoid activation
				bound = 4 * numpy.sqrt(6. / (n_in + n_out))
			else:
				L.error('Invalid initialization method')
			samples = rng.uniform(low=-bound, high=bound, size=(n_in, n_out))
			W_values = numpy.asarray(samples, dtype=theano.config.floatX)

		if not no_bias and b_values is None:
			b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)

		# Shared by the names of both parameters.
		name_suffix = '' if suffix is None else '.' + str(suffix)

		self.W = theano.shared(value=W_values, name='W' + name_suffix, borrow=True)

		if no_bias:
			self.output = T.dot(input, self.W)
			self.params = [self.W]
			return

		self.b = theano.shared(value=b_values, name='b' + name_suffix, borrow=True)
		self.output = T.dot(input, self.W) + self.b
		self.params = [self.W, self.b]
Example #13
0
	def __init__(self, rng, input, vocab_size, emb_dim, emb_matrix=None, concat=True, emb_path=None, vocab_path=None, add_weights=False, suffix=None, high=0.01):
		"""Build a lookup-table layer that indexes an embedding matrix with ``input``.

		Args:
			rng: Random generator providing ``uniform(low, high, size)``.
			input: Symbolic index input of the layer.
			vocab_size: Number of rows of the embedding matrix.
			emb_dim: Number of columns of the embedding matrix.
			emb_matrix: Initial embeddings; drawn uniformly from [-high, high] when None.
			concat: When true the output is reshaped to ``(input.shape[0], emb_dim * input.shape[1])``.
			emb_path: Optional embeddings file used to overwrite rows of the matrix.
			vocab_path: Vocabulary file; required together with ``emb_path``.
			add_weights: When true each row is scaled by a per-word weight initialised to one.
			suffix: Optional suffix appended to the embeddings' name after a dot.
			high: Half-width of the random initialisation range.
		"""
		L.info("Lookup Table layer, #words: %s, #dims: %s" % (U.red(vocab_size), U.red(emb_dim)))

		self.input = input
		self.emb_matrix = emb_matrix

		if self.emb_matrix is None:
			random_init = rng.uniform(low=-high, high=high, size=(vocab_size, emb_dim))
			self.emb_matrix = numpy.asarray(random_init, dtype=theano.config.floatX)

		if emb_path:
			U.xassert(vocab_path, 'When emb_path is given, vocab must be given too.')
			self.initialize(emb_path, vocab_path)

		embeddings_name = 'embeddings' if suffix is None else 'embeddings' + '.' + str(suffix)
		self.embeddings = theano.shared(value=self.emb_matrix, name=embeddings_name, borrow=True)

		if not add_weights:
			self.output = self.embeddings[input]
			self.params = [self.embeddings]
		else:
			ones = numpy.ones(vocab_size, dtype=theano.config.floatX)
			self.weights = theano.shared(value=ones, name='word_weights', borrow=True)

			# Check if the speed can be improved: the whole matrix is scaled before indexing.
			weighted_table = self.weights.dimshuffle(0, 'x') * self.embeddings
			self.output = weighted_table[input]

			self.params = [self.embeddings, self.weights]

		if concat:
			flat_shape = (input.shape[0], emb_dim * input.shape[1])
			self.output = self.output.reshape(flat_shape)
Example #14
0
	def __init__(self, emb_path):
		"""Read word vectors from a word2vec-style text file.

		The file may start with a header line holding exactly two integers
		(vocabulary size and embedding dimension). With a header, every vector
		line and the total line count are checked against it. Without one, the
		dimension is taken from the first line and the vocabulary size is the
		number of lines. Vectors are stored as lists of the string tokens found
		in the file.

		Args:
			emb_path: Path of the UTF-8 embeddings file.
		"""
		L.info('Loading embeddings from: ' + emb_path)

		# Peek at the first line to detect a header. The builtin next() works on
		# Python 2.6+ and 3, and its default makes an empty file yield no
		# tokens instead of raising a bare StopIteration.
		with codecs.open(emb_path, 'r', encoding='utf8') as emb_file:
			first_tokens = next(emb_file, '').split()
		has_header = False
		if len(first_tokens) == 2:
			try:
				int(first_tokens[0])
				int(first_tokens[1])
				has_header = True
			except ValueError:
				pass

		self.embeddings = {}
		with codecs.open(emb_path, 'r', encoding='utf8') as emb_file:
			if has_header:
				next(emb_file)		# skip the header line parsed above
				self.vocab_size = int(first_tokens[0])
				self.emb_dim = int(first_tokens[1])
			else:
				self.vocab_size = 0
				self.emb_dim = -1
			counter = 0
			for line in emb_file:
				tokens = line.split()
				if not has_header and self.emb_dim == -1:
					# Without a header the first vector line defines the dimension.
					self.emb_dim = len(tokens) - 1
				else:
					U.xassert(len(tokens) == self.emb_dim + 1, 'The number of dimensions does not match the header info')
				self.embeddings[tokens[0]] = tokens[1:]
				counter += 1
			if has_header:
				U.xassert(counter == self.vocab_size, 'Vocab size does not match the header info')
			else:
				self.vocab_size = counter

		L.info('  #vectors: %i, #dimensions: %i' % (self.vocab_size, self.emb_dim))
Example #15
0
def set_theano_device(device, threads):
	"""Configure theano's device through environment variables, then import theano.

	Must be called before theano is imported, because the settings are
	passed through ``THEANO_FLAGS``. The active device is logged afterwards.

	Args:
		device: 'cpu', 'gpu' or 'gpu<id>'.
		threads: Value written to ``OMP_NUM_THREADS``.
	"""
	import sys
	import dlm.io.logging as L
	xassert(device == "cpu" or device.startswith("gpu"), "The device can only be 'cpu', 'gpu' or 'gpu<id>'")
	if device.startswith("gpu") and len(device) > 3:
		try:
			gpu_id = int(device[3:])
			if not is_gpu_free(gpu_id):
				L.warning('The selected GPU (GPU' + str(gpu_id) + ') is apparently busy.')
		except ValueError:
			L.error("Unknown GPU device format: " + device)
	if device.startswith("gpu"):
		L.warning('Running on GPU yields non-deterministic results.')
	# `in` replaces dict.has_key(), which no longer exists in Python 3.
	xassert('theano' not in sys.modules, "dlm.utils.set_theano_device() function cannot be called after importing theano")
	os.environ['OMP_NUM_THREADS'] = str(threads)
	theano_flags = [
		'device=' + device,
		'force_device=True',
		'floatX=float32',
		'warn_float64=warn',
		'cast_policy=numpy+floatX',
		#'cuda.root=/usr/local/cuda',
		#'allow_gc=True',
		'print_active_device=False',
		'exception_verbosity=high',		# Highly verbose debugging
		'mode=FAST_RUN',
		'nvcc.fastmath=False', 			# True: makes div and sqrt faster at the cost of precision, and possible bugs
		#'optimizer_including=cudnn', 	# Comment out if CUDNN is not available
	]
	os.environ['THEANO_FLAGS'] = ','.join(theano_flags)
	try:
		import theano
	except EnvironmentError:
		L.exception()
	if theano.config.device == "gpu":
		L.info(
			"Device: " + theano.config.device.upper() + " "
			+ str(theano.sandbox.cuda.active_device_number())
			+ " (" + str(theano.sandbox.cuda.active_device_name()) + ")"
		)
	else:
		L.info("Device: " + theano.config.device.upper())
Example #16
0
	def __init__(self, rng, input, vocab_size, emb_dim, emb_matrix=None, concat=True, emb_path=None, vocab_path=None, add_weights=False):
		"""Build a lookup-table layer on top of the backend wrapper ``K``.

		Indentation is tabs throughout: the original mixed tabs and spaces,
		which Python 3 rejects.

		Args:
			rng: Random generator providing ``uniform(low, high, size)``.
			input: Symbolic index input of the layer.
			vocab_size: Number of rows of the embedding matrix.
			emb_dim: Number of columns of the embedding matrix.
			emb_matrix: Initial embeddings; drawn uniformly from [-0.01, 0.01] when None.
			concat: When true the output is reshaped to ``(input.shape[0], emb_dim * input.shape[1])``.
			emb_path: Optional embeddings file used to overwrite rows of the matrix.
			vocab_path: Vocabulary file; required together with ``emb_path``.
			add_weights: When true each row is scaled by a per-word weight initialised to one.
		"""
		L.info("Lookup Table layer, #words: %s, #dims: %s" % (U.red(vocab_size), U.red(emb_dim)))

		self.input = input
		L.info("Input " + str(input))
		L.info("Add weightes " + str(add_weights))
		self.emb_matrix = emb_matrix

		if self.emb_matrix is None:
			self.emb_matrix = numpy.asarray(
				rng.uniform(
					low=-0.01, #low=-1,
					high=0.01, #high=1,
					size=(vocab_size, emb_dim)
				),
				dtype=K._FLOATX
			)

		if emb_path:
			U.xassert(vocab_path, 'When emb_path is given, vocab must be given too.')
			self.initialize(emb_path, vocab_path)

		# Backend variable in place of theano.shared(..., borrow=True).
		self.embeddings = K.variable(self.emb_matrix, name='embeddings')

		if add_weights:
			weights_vec = numpy.ones(vocab_size, dtype=K._FLOATX)
			self.weights = K.variable(weights_vec, name='word_weights')

			# Check if the speed can be improved
			self.output = (self.weights.dimshuffle(0, 'x') * self.embeddings)[input]
			#self.output = self.weights.dimshuffle(0, 'x')[input] * self.embeddings[input]
			#self.output = self.weights[input].dimshuffle(0, 'x') * self.embeddings[input]

			self.params = [self.embeddings, self.weights]
		else:
			self.output = self.embeddings[input]
			self.params = [self.embeddings]

		if concat:
			self.output = self.output.reshape((input.shape[0], emb_dim * input.shape[1]))
Example #17
0
def train(classifier, criterion, args, trainset, devset, testset=None):
	"""Train ``classifier`` for ``args.num_epochs`` epochs.

	The model is evaluated and saved before training, evaluated every
	``validation_frequency`` minibatches and at the end of each epoch, and
	saved after each epoch. When ``args.enable_lr_adjust`` is set, the
	learning rate is adapted from the dev perplexity after each epoch.

	Args:
		classifier: Model to train; must provide ``save_model``.
		criterion: Training criterion handed to the trainer.
		args: Options; reads ``algorithm``, ``learning_rate``, ``clip_threshold``,
			``num_epochs``, ``out_dir`` and ``enable_lr_adjust``.
		trainset: Training data; must provide ``get_num_batches``.
		devset: Development data handed to the evaluation hook.
		testset: Optional test data handed to the evaluation hook.
	"""
	if args.algorithm == "sgd":
		from dlm.algorithms.sgd import SGD as Trainer
	else:
		L.error("Invalid training algorithm: " + args.algorithm)

	# Get number of minibatches from the training file
	num_train_batches = trainset.get_num_batches()

	# Initialize the trainer object
	trainer = Trainer(classifier, criterion, args.learning_rate, trainset, clip_threshold=args.clip_threshold)

	# Initialize the Learning Rate tuner, which adjusts learning rate based on the development/validation file
	lr_tuner = LRTuner(low=0.01*args.learning_rate, high=10*args.learning_rate, inc=0.01*args.learning_rate)
	validation_frequency = 5000 # minibatches

	# Logging and statistics
	total_num_iter = args.num_epochs * num_train_batches
	hook = Hook(classifier, devset, testset, total_num_iter, args.out_dir)
	L.info('Training')
	start_time = time.time()
	verbose_freq = 1000 # minibatches
	epoch = 0

	hook.evaluate(0)

	# The unused timestamp that used to be taken here was removed.
	classifier.save_model(args.out_dir + '/model.epoch_0.gz', zipped=True)

	# NOTE(review): with zero training batches the end-of-epoch summary divides by zero -- confirm callers never pass an empty training set.
	while (epoch < args.num_epochs):
		epoch = epoch + 1
		L.info("Epoch: " + U.red(epoch))

		minibatch_avg_cost_sum = 0
		for minibatch_index in xrange(num_train_batches):
			# Makes an update of the parameters after processing the minibatch
			minibatch_avg_cost, gparams = trainer.step(minibatch_index)
			minibatch_avg_cost_sum += minibatch_avg_cost

			if minibatch_index % verbose_freq == 0:
				grad_norms = [np.linalg.norm(gparam) for gparam in gparams]
				L.info(U.blue("[" + time.ctime() + "] ") + '%i/%i, cost=%.2f, lr=%f'
					% (minibatch_index, num_train_batches, minibatch_avg_cost_sum/(minibatch_index+1), trainer.get_learning_rate()))
				L.info('Grad Norms: [' + ', '.join(['%.6f' % gnorm for gnorm in grad_norms]) + ']')
			curr_iter = (epoch - 1) * num_train_batches + minibatch_index
			if curr_iter > 0 and curr_iter % validation_frequency == 0:
				hook.evaluate(curr_iter)

		# End-of-epoch summary, evaluation, learning-rate update and checkpoint
		L.info(U.blue("[" + time.ctime() + "] ") + '%i/%i, cost=%.2f, lr=%f'
			% (num_train_batches, num_train_batches, minibatch_avg_cost_sum/num_train_batches, trainer.get_learning_rate()))
		dev_ppl = hook.evaluate(curr_iter)
		lr = trainer.get_learning_rate()
		if args.enable_lr_adjust:
			lr = lr_tuner.adapt_lr(dev_ppl, lr)
		trainer.set_learning_rate(lr)
		classifier.save_model(args.out_dir + '/model.epoch_' + str(epoch) + '.gz', zipped=True)

	end_time = time.time()
	hook.evaluate(total_num_iter)
	L.info('Optimization complete')
	L.info('Ran for %.2fm' % ((end_time - start_time) / 60.))
Example #18
0
	testset = TextReader(dataset_path=args.test_path, is_nbest=is_nbest, ngram_size=classifier.ngram_size, vocab_path=args.vocab_path)

#########################
## Compiling theano function
#

evaluator = eval.Evaluator(testset, classifier)

#########################
## Testing
#

start_time = time.time()

# Perplexity report (the unnormalized variant is only logged alongside the normalized one)
if args.perplexity:
	perplexity_msg = "Perplexity: %f" % (evaluator.perplexity())
	L.info(perplexity_msg)
	if args.unnormalized:
		unnormalized_msg = "Unnormalized Perplexity: %f" % (evaluator.unnormalized_perplexity())
		L.info(unnormalized_msg)

# One normalized sequence log-probability per line
if args.nlp_path:
	with open(args.nlp_path, 'w') as output:
		for i in xrange(testset.get_num_sentences()):
			log_prob = evaluator.get_sequence_log_prob(i)
			output.write(str(log_prob) + '\n')

# One unnormalized sequence log-probability per line
if args.ulp_path:
	with open(args.ulp_path, 'w') as output:
		for i in xrange(testset.get_num_sentences()):
			log_prob = evaluator.get_unnormalized_sequence_log_prob(i)
			output.write(str(log_prob) + '\n')

elapsed = time.time() - start_time
L.info("Ran for %.2fs" % (elapsed))
Example #19
0
for group in input_nbest:
	if mode == 0:
		for i in range(min(N, group.size())):
			output_nbest.write(group[i])
	elif mode == 1:
		output_1best.write(group[0].hyp + "\n")
	elif mode == 2:
		for i in range(group.size()):
			features = group[i].features.split()
			output.write(features[N] + "\n")
	elif mode == 3:
		for i in range(group.size()):
			features.append(float(group[i].features.split()[N]))
	counter += 1
	if counter % 100 == 0:
		L.info("%i groups processed" % (counter))
L.info("Finished processing %i groups" % (counter))

if mode == 0:
	output_nbest.close()
elif mode == 1:
	output_1best.close()
elif mode == 2:
	output.close()
elif mode == 3:
	import scipy.stats as S
	print 'PEARSON: ', S.pearsonr(features, oracles)
	print 'SPEARMAN:', S.spearmanr(features, oracles)


Example #20
0
	def __init__(self, dataset_path, batch_size=500, instance_weights_path=None):
		"""Memory-map a binary n-gram dataset and expose it as backend variables.

		The file is read as a flat int32 array whose first two values are the
		number of samples and the n-gram size. After reshaping into rows of
		``ngram`` values, the first column of rows 1 and 2 holds the vocabulary
		size and the number of classes, and the samples start at row 3.

		Compared with the previous version, indentation is tabs only, the
		Python-2 ``print`` statement goes through the logger, and the throwaway
		debug variables (one of which copied the whole dataset) are gone.

		Args:
			dataset_path: Path of the memory-mapped dataset file.
			batch_size: Number of samples per minibatch.
			instance_weights_path: Optional text file with one weight per sample.
		"""
		L.info("Initializing dataset from: " + os.path.abspath(dataset_path))
		L.info("Backend platform: " + str(K.get_platform()))

		# Reading parameters from the mmap file
		fp = np.memmap(dataset_path, dtype='int32', mode='r')
		self.num_samples = fp[0]
		self.ngram = fp[1]
		fp = fp.reshape((self.num_samples + 3, self.ngram))
		self.vocab_size = fp[1,0]
		self.num_classes = fp[2,0]

		# Setting minibatch size and number of mini batches.
		# The float conversion forces true division: with Python 2 integer
		# division the ceil was a no-op and the final partial batch was dropped.
		self.batch_size = batch_size
		self.num_batches = int(M.ceil(float(self.num_samples) / self.batch_size))

		# Reading the matrix of samples
		x = fp[3:,0:self.ngram - 1]			# Reading the context indices
		y = fp[3:,self.ngram - 1]			# Reading the output word index
		# Backend variables in place of T.cast(theano.shared(..., borrow=True), ...)
		self.shared_x = K.cast(K.variable(x), 'int32')
		self.shared_y = K.cast(K.variable(y), 'int32')

		self.is_weighted = False
		if instance_weights_path:
			instance_weights = np.loadtxt(instance_weights_path)
			U.xassert(instance_weights.shape == (self.num_samples,), "The number of lines in weights file must be the same as the number of samples.")
			self.shared_w = K.cast(K.variable(instance_weights), K._FLOATX)
			self.is_weighted = True

		L.info('  #samples: %s, ngram size: %s, vocab size: %s, #classes: %s, batch size: %s, #batches: %s' % (
				U.red(self.num_samples), U.red(self.ngram), U.red(self.vocab_size), U.red(self.num_classes), U.red(self.batch_size), U.red(self.num_batches)
			)
		)
Example #21
0
	def __init__(self, input, func_name):
		"""Build an activation layer that applies the named function to ``input``.

		Args:
			input: Input of the layer.
			func_name: Name resolved to a callable through ``self.get_function``.
		"""
		L.info("Activation layer, function: " + U.red(func_name))
		self.input = input
		activation = self.get_function(func_name)
		self.func = activation
		self.output = activation(input)
Example #22
0
# Command-line interface
parser = argparse.ArgumentParser()
parser.add_argument(
	"-m", "--corelm-model",
	dest="corelm_model",
	required=True,
	help="The input NPLM model file"
)
parser.add_argument(
	"-v", "--vocab-file",
	dest="vocab_path",
	required=True,
	help="The input vocabulary"
)
parser.add_argument(
	"-dir", "--directory",
	dest="out_dir",
	help="The output directory for log file, model, etc."
)

args = parser.parse_args()

# The device has to be configured before the model module is imported.
U.set_theano_device('cpu',1)
from dlm.models.mlp import MLP

if args.out_dir is None:
	args.out_dir = 'corelm_convert-' + U.curr_time()
U.mkdir_p(args.out_dir)

# Loading CoreLM model and creating classifier class
L.info("Loading CoreLM model")
classifier = MLP(model_path=args.corelm_model)
args_nn = classifier.args
params_nn = classifier.params
U.xassert(len(params_nn)==7, "CoreLM model is not compatible with NPLM architecture. 2 hidden layers and an output linear layer is required.")

# Parameter order: embeddings, then (weights, bias) for each of the three layers.
# Weight matrices are transposed after reading.
embeddings = params_nn[0].get_value()
W1 = np.transpose(params_nn[1].get_value())
b1 = params_nn[2].get_value()
W2 = np.transpose(params_nn[3].get_value())
b2 = params_nn[4].get_value()
W3 = np.transpose(params_nn[5].get_value())
b3 = params_nn[6].get_value()
Example #23
0
	def __init__(self, dataset_path, batch_size=500, instance_weights_path=None):
		"""Memory-map a binary dataset with several input features and expose it as shared variables.

		The file is read as a flat int32 array whose first two values are the
		number of samples and the row width (``ngram``). After reshaping into
		rows of ``ngram`` values, row 1 holds the number of header lines, the
		following ``num_header_lines - 1`` rows hold one
		``(vocab_size, num_of_elements)`` pair per feature, and the next row
		holds the number of classes. Samples start at row ``num_header_lines + 2``.

		Args:
			dataset_path: Path of the memory-mapped dataset file.
			batch_size: Number of samples per minibatch.
			instance_weights_path: Optional text file with one weight per sample (untested).
		"""
		L.info("Initializing dataset (with features) from: " + os.path.abspath(dataset_path))

		# Reading parameters from the mmap file
		fp = np.memmap(dataset_path, dtype='int32', mode='r')
		self.num_samples = fp[0]
		self.ngram = fp[1]

		# Floor division keeps the row count an integer on Python 3 as well.
		fp = fp.reshape((len(fp) // self.ngram, self.ngram))

		num_header_lines = fp[1,0]

		self.features_info = []    # Format (vocab_size, num_of_elements)
		for i in range(num_header_lines-1):
			self.features_info.append( (fp[i+2,0], fp[i+2,1]) )

		self.num_classes = fp[(num_header_lines+2)-1,0]

		# Setting minibatch size and number of mini batches.
		# The float conversion forces true division: with Python 2 integer
		# division the ceil was a no-op and the final partial batch was dropped.
		self.batch_size = batch_size
		self.num_batches = int(M.ceil(float(self.num_samples) / self.batch_size))

		# Reading the matrix of samples (all features are kept in one context matrix)
		x = fp[num_header_lines+2:,0:self.ngram - 1]			# Reading the context indices
		self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
		y = fp[num_header_lines+2:,self.ngram - 1]			# Reading the output word index
		self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')

		## Untested instance weighting
		self.is_weighted = False
		if instance_weights_path:
			instance_weights = np.loadtxt(instance_weights_path)
			U.xassert(instance_weights.shape == (self.num_samples,), "The number of lines in weights file must be the same as the number of samples.")
			self.shared_w = T.cast(theano.shared(instance_weights, borrow=True), theano.config.floatX)
			self.is_weighted = True

		L.info('  #samples: %s,  #classes: %s, batch size: %s, #batches: %s' % (
				U.red(self.num_samples),   U.red(self.num_classes), U.red(self.batch_size), U.red(self.num_batches)
			))
		for index, (vocab_size, num_elems) in enumerate(self.features_info):
			L.info("Feature %s: #ngrams= %s vocab_size= %s" %( U.red(index), U.red(num_elems), U.red(vocab_size)))
Example #24
0
# Setting the args for the classifier
args_nn.emb_dim = int(config_dict['input_embedding_dimension'])
args_nn.num_hidden = config_dict['num_hidden'] + ',' + config_dict['output_embedding_dimension']
args_nn.vocab_size = int(config_dict['input_vocab_size'])
args_nn.ngram_size = int(config_dict['ngram_size'])
args_nn.num_classes = int(config_dict['output_vocab_size'])

act_func = config_dict['activation_function']
if act_func == 'rectifier':
	act_func = 'relu'

args_nn.activation_name = act_func

# Creating the classifier with the arguments read
L.info("Creating PrimeLM model")
classifier = MLP(args_nn)


# Loading matrices
# The keys start with a literal backslash. Raw strings spell that out
# explicitly; '\i', '\h' and '\o' are invalid escape sequences that only
# worked by accident and are deprecated in recent Python versions.
# Weight matrices are transposed after loading.
embeddings = np.loadtxt(model_dict[r'\input_embeddings'])
W1 = np.loadtxt(model_dict[r'\hidden_weights 1'])
W1 = np.transpose(W1)
b1 = np.loadtxt(model_dict[r'\hidden_biases 1'])
W2 = np.loadtxt(model_dict[r'\hidden_weights 2'])
W2 = np.transpose(W2)
b2 = np.loadtxt(model_dict[r'\hidden_biases 2'])
W3 = np.loadtxt(model_dict[r'\output_weights'])
W3 = np.transpose(W3)
b3 = np.loadtxt(model_dict[r'\output_biases'])
params_nn =[embeddings, W1, b1, W2, b2, W3, b3]
Example #25
0
	L.error("Set MOSES_ROOT variable to your moses root directory")

U.mkdir_p(args.out_dir)

# Feature weights are parsed from the config file instead of being queried from moses:
#cmd = moses_root + '/bin/moses -show-weights -f ' + args.input_config + ' 2> /dev/null'
#features = U.capture(cmd).strip().split('\n')
features = iniReader.parseIni(args.input_config)

output_nbest_path = args.out_dir + '/augmented.nbest'

# Either augment the n-best list with the model score or copy it unchanged.
if not args.no_aug:
	augmenter.augment(args.model_path, args.input_nbest, args.vocab_path, output_nbest_path)
else:
	shutil.copy(args.input_nbest, output_nbest_path)

L.info('Extracting stats and features')
#L.warning('The optional arguments of extractor are not used yet')
cmd = ''.join([
	moses_root, '/bin/extractor -r ', args.ref_paths,
	' -n ', output_nbest_path,
	' --scfile ', args.out_dir, '/statscore.data --ffile ', args.out_dir, '/features.data'
])
U.capture(cmd)

with open(args.out_dir + '/init.opt', 'w') as init_opt:
	init_list = []
	for line in features:
		tokens = line.split(" ")
		try:
			float(tokens[1])
			init_list += tokens[1:]
		except ValueError:
			pass
	if not args.no_aug:
		init_list.append(args.init_value)
Example #26
0
# Setting the args for the classifier
args_nn.emb_dim = int(config_dict['input_embedding_dimension'])
args_nn.num_hidden = config_dict['num_hidden'] + ',' + config_dict['output_embedding_dimension']
args_nn.vocab_size = int(config_dict['input_vocab_size'])
args_nn.ngram_size = int(config_dict['ngram_size'])
args_nn.num_classes = int(config_dict['output_vocab_size'])

act_func = config_dict['activation_function']
if act_func == 'rectifier':
	act_func = 'relu'

args_nn.activation_name = act_func

# Creating the classifier with the arguments read
L.info("Creating CoreLM model")
classifier = MLP(args_nn)


# Loading matrices
# The keys start with a literal backslash. Raw strings spell that out
# explicitly; '\i', '\h' and '\o' are invalid escape sequences that only
# worked by accident and are deprecated in recent Python versions.
# Weight matrices are transposed after loading.
embeddings = np.loadtxt(model_dict[r'\input_embeddings'])
W1 = np.loadtxt(model_dict[r'\hidden_weights 1'])
W1 = np.transpose(W1)
b1 = np.loadtxt(model_dict[r'\hidden_biases 1'])
W2 = np.loadtxt(model_dict[r'\hidden_weights 2'])
W2 = np.transpose(W2)
b2 = np.loadtxt(model_dict[r'\hidden_biases 2'])
W3 = np.loadtxt(model_dict[r'\output_weights'])
W3 = np.transpose(W3)
b3 = np.loadtxt(model_dict[r'\output_biases'])
params_nn =[embeddings, W1, b1, W2, b2, W3, b3]
Example #27
0
parser.add_argument(
	"--adjust-learning-rate",
	dest="enable_lr_adjust",
	action='store_true',
	help="Enable learning rate adjustment"
)

#parser.add_argument("-m","--model-file", dest="model_path",  help="The file path to load the model from")

args = parser.parse_args()

# Remember the directory the script was started from
args.cwd = os.getcwd()

# Fall back to a time-stamped output directory
args.out_dir = ('corelm-' + U.curr_time()) if args.out_dir is None else args.out_dir
U.mkdir_p(args.out_dir)

# Logger configuration
L.quiet = args.quiet
L.set_file_path(os.path.abspath(args.out_dir) + "/log.txt")

command_line = ' '.join(sys.argv)
L.info('Command: ' + command_line)

curr_version = U.curr_version()
if curr_version:
	L.info("Version: " + curr_version)

# Pre-trained embeddings need a vocabulary
if args.emb_path:
	U.xassert(args.vocab, 'When --emb-path is used, vocab file must be given too (using --vocab).')

# No noise samples are used with the "nll" loss
if args.loss_function == "nll":
	args.num_noise_samples = 0

U.print_args(args)
# The device is configured before dlm.trainer is imported.
U.set_theano_device(args.device, args.threads)

import dlm.trainer
Example #28
0
                #### Add POS tag to the sample ####
            # The label closes the sample: its text goes into `sample`, its id
            # (looked up in label_to_id) into `sample_idx`.
            sample.append(label)
            sample_idx.append(label_to_id[label])

            # With shuffling enabled the samples are buffered in memory;
            # otherwise each sample is written out immediately.
            if args.shuffle:
                samples.append(sample)
                samples_idx.append(sample_idx)
            else:
                # One line per sample: space-separated ids.
                tmp_file.write(" ".join([str(idx) for idx in sample_idx]) + "\n")
                # Optionally also write the sample's tokens as text.
                if args.word_out:
                    f_words.write(" ".join([word for word in sample]) + "\n")

            # Count the sample and report progress every 100000 samples.
            nsamples += 1
            if nsamples % 100000 == 0:
                L.info(str(nsamples) + " samples processed.")

                # print word, feature, label

                # if not input_word_to_id.has_key(word):
                # 	word = "<unk>"
                # indices.append(str(input_word_to_id[word]))
                # f_indices.append(str(feature_to_id[feature]))

# When shuffling is requested, write the buffered index samples to the tmp
# file in a random order, one space-separated sample per line.
if args.shuffle:
    L.info("Shuffling data.")
    # Random ordering of the sample positions 0 .. nsamples-1.
    permutation_arr = np.random.permutation(nsamples)
    with open(tmp_path, "w") as tmp_file:
        for index in permutation_arr:
            shuffled_line = " ".join(map(str, samples_idx[index]))
            tmp_file.write(shuffled_line + "\n")
Example #29
0
src_prune_args.add_argument("--source-vocab-file", dest="src_vocab_path", help="Source vocabulary file path")

# Target vocabulary: exactly one of a pruning size or a vocabulary file is required.
trg_prune_args = parser.add_mutually_exclusive_group(required=True)
trg_prune_args.add_argument("-vt", "--prune-target-vocab", dest="trg_vocab_size", type=int, help="Target vocabulary size")
trg_prune_args.add_argument("--target-vocab-file", dest="trg_vocab_path", help="Target vocabulary file path")

# Output vocabulary: exactly one of a pruning size or a vocabulary file is required.
output_prune_args = parser.add_mutually_exclusive_group(required=True)
output_prune_args.add_argument("-vo", "--prune-output-vocab", dest="output_vocab_size", type=int, help="Output vocabulary size. Defaults to target vocabulary size.")
output_prune_args.add_argument("--output-vocab-file", dest="output_vocab_path", help="Output vocabulary file")

args = parser.parse_args()

# Sizes derived from the context arguments.
src_window_size = args.src_context * 2 + 1
total_sample_size = args.trg_context + src_window_size

# The first row of the memmap file holds parameters for the neural network,
# so very short samples cannot be represented.
# NOTE(review): the exact minimum required by the memmap format is not visible
# here; the check below rejects totals of 3 or less.
U.xassert(total_sample_size > 3, "Total ngram size must be greater than 3. ngrams < 3 are not supported by the current memmap format.")

L.info("Source Window Size: " + str(src_window_size))
L.info("Target Window Size: " + str(args.trg_context - 1))
L.info("Total Sample Size: " + str(total_sample_size))

# Fall back to the target vocabulary size when no output size was given.
if args.output_vocab_size is None:
	args.output_vocab_size = args.trg_vocab_size

# Create the output directory if it does not exist yet.
if not os.path.exists(args.output_dir_path):
	os.makedirs(args.output_dir_path)
L.info("Output directory: " + os.path.abspath(args.output_dir_path))

# Prefix of files: output directory plus the base name of each input file.
src_prefix = "/".join([args.output_dir_path, os.path.basename(args.src_input_path)])
trg_prefix = "/".join([args.output_dir_path, os.path.basename(args.trg_input_path)])
Example #30
0
	'chen'    : B.chen_smoothing
}

# The reference files arrive as a single comma-separated argument.
ref_path_list = args.ref_paths.split(',')

input_nbest = NBestList(args.input_path, mode='r', reference_list=ref_path_list)
# The n-best and score outputs are optional: they are opened only when a path
# was supplied, so output_nbest / output_scores are undefined otherwise.
if args.out_nbest_path:
	output_nbest = NBestList(args.out_nbest_path, mode='w')
if args.out_scores_path:
	output_scores = open(args.out_scores_path, mode='w')
# The 1-best output is always written, as UTF-8 text.
output_1best = codecs.open(args.out_1best_path, mode='w', encoding='UTF-8')

# Membership is tested with `in`: dict.has_key() is deprecated and no longer
# exists in Python 3, while `in` behaves identically on every version.
U.xassert(args.method in methods, "Invalid smoothing method: " + args.method)
scorer = methods[args.method]

L.info('Processing the n-best list')

def process_group(group):
	"""Score every item of *group* with the selected scorer.

	Returns a dict mapping each item's zero-based position within the group
	to ``scorer(item.hyp, group.refs)``.
	"""
	return dict(
		(position, scorer(entry.hyp, group.refs))
		for position, entry in enumerate(group)
	)

# Worker pool sized by args.threads.
pool = Pool(args.threads)

# Initial state for the processing loop that follows.
counter = group_counter = 0
flag = True
while (flag):