def evaluate(self, curr_iter):
    denominator = self.dev_eval.get_denominator()
    dev_error = self.dev_eval.classification_error()
    dev_perplexity = self.dev_eval.perplexity()
    if self.test_eval:
        test_error = self.test_eval.classification_error()
        test_perplexity = self.test_eval.perplexity()
    if dev_perplexity < self.best_dev_perplexity:
        self.best_dev_perplexity = dev_perplexity
        self.best_iter = curr_iter
        if self.test_eval:
            self.best_test_perplexity = test_perplexity
    if curr_iter > 0:
        t1 = time.time()
        rem_time = int((self.total_num_iter - curr_iter) * (t1 - self.t0) / (curr_iter * 60))
        rem_time = str(rem_time) + "m"
    else:
        rem_time = ""
    L.info(('DEV => Error=%.2f%%, PPL=' + U.b_yellow('%.2f @ %i') + ' (' + U.b_red('%.2f @ %i') + '), Denom=%.3f, %s')
           % (dev_error * 100., dev_perplexity, curr_iter, self.best_dev_perplexity, self.best_iter, denominator, rem_time))
    if self.test_eval:
        L.info(('TEST => Error=%.2f%%, PPL=' + U.b_yellow('%.2f @ %i') + ' (' + U.b_red('%.2f @ %i') + ')')
               % (test_error * 100., test_perplexity, curr_iter, self.best_test_perplexity, self.best_iter))
    return dev_perplexity

def __init__(self, dataset_path, batch_size=500, instance_weights_path=None):
    L.info("Initializing dataset from: " + os.path.abspath(dataset_path))

    # Reading parameters from the mmap file
    fp = np.memmap(dataset_path, dtype='int32', mode='r')
    self.num_samples = fp[0]
    self.ngram = fp[1]
    fp = fp.reshape((self.num_samples + 3, self.ngram))
    self.vocab_size = fp[1, 0]
    self.num_classes = fp[2, 0]

    # Setting minibatch size and number of mini batches
    self.batch_size = batch_size
    self.num_batches = int(M.ceil(self.num_samples / self.batch_size))

    # Reading the matrix of samples
    x = fp[3:, 0:self.ngram - 1]   # Reading the context indices
    y = fp[3:, self.ngram - 1]     # Reading the output word index
    self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
    self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')

    self.is_weighted = False
    if instance_weights_path:
        instance_weights = np.loadtxt(instance_weights_path)
        U.xassert(instance_weights.shape == (self.num_samples,),
                  "The number of lines in weights file must be the same as the number of samples.")
        self.shared_w = T.cast(theano.shared(instance_weights, borrow=True), theano.config.floatX)
        self.is_weighted = True

    L.info(' #samples: %s, ngram size: %s, vocab size: %s, #classes: %s, batch size: %s, #batches: %s' % (
        U.red(self.num_samples), U.red(self.ngram), U.red(self.vocab_size),
        U.red(self.num_classes), U.red(self.batch_size), U.red(self.num_batches)
    ))

def __init__(self, dataset_path, ngram_size, vocab_path):
    L.info("Initializing dataset from: " + dataset_path)
    vocab = VocabManager(vocab_path)
    curr_index = 0
    self.num_sentences = 0
    ngrams_list = []
    dataset = codecs.open(dataset_path, 'r', encoding="UTF-8")
    for line in dataset:
        tokens = line.split()
        ngrams = vocab.get_ids_given_word_list(tokens)
        ngrams_list.append(ngrams)
        curr_index += 1
    dataset.close()
    data = np.asarray(ngrams_list)
    x = data[:, 0:-1]
    y = data[:, -1]
    self.num_samples = y.shape[0]
    self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
    self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')

def print_args(args):
    import dlm.io.logging as L
    L.info("Arguments:")
    items = vars(args)
    for key in sorted(items.keys(), key=lambda s: s.lower()):
        value = items[key]
        if not value:
            value = "None"
        # Print the substituted value (the original printed items[key], so the "None" fallback never showed)
        L.info(" " + key + ": " + BColors.MAGENTA + str(value) + BColors.ENDC)

def initialize(self, emb_path, vocab_path):
    L.info('Initializing lookup table')
    vm = VocabManager(vocab_path)
    w2v = W2VEmbReader(emb_path)
    U.xassert(w2v.get_emb_dim() == self.emb_matrix.shape[1],
              'The embeddings dimension does not match with the given word embeddings')
    for i in range(self.emb_matrix.shape[0]):
        vec = w2v.get_emb_given_word(vm.get_word_given_id(i))
        if vec:
            self.emb_matrix[i] = vec

def __init__(self, emb_path):
    L.info('Loading embeddings from: ' + emb_path)
    has_header = False
    with codecs.open(emb_path, 'r', encoding='utf8') as emb_file:
        tokens = emb_file.next().split()
        if len(tokens) == 2:
            try:
                int(tokens[0])
                int(tokens[1])
                has_header = True
            except ValueError:
                pass
    if has_header:
        with codecs.open(emb_path, 'r', encoding='utf8') as emb_file:
            tokens = emb_file.next().split()
            U.xassert(len(tokens) == 2, 'The first line in W2V embeddings must be the pair (vocab_size, emb_dim)')
            self.vocab_size = int(tokens[0])
            self.emb_dim = int(tokens[1])
            self.embeddings = {}
            counter = 0
            for line in emb_file:
                tokens = line.split()
                U.xassert(len(tokens) == self.emb_dim + 1, 'The number of dimensions does not match the header info')
                word = tokens[0]
                vec = tokens[1:]
                self.embeddings[word] = vec
                counter += 1
            U.xassert(counter == self.vocab_size, 'Vocab size does not match the header info')
    else:
        with codecs.open(emb_path, 'r', encoding='utf8') as emb_file:
            self.vocab_size = 0
            self.emb_dim = -1
            self.embeddings = {}
            for line in emb_file:
                tokens = line.split()
                if self.emb_dim == -1:
                    self.emb_dim = len(tokens) - 1
                else:
                    U.xassert(len(tokens) == self.emb_dim + 1, 'The number of dimensions does not match that of the first line')
                word = tokens[0]
                vec = tokens[1:]
                self.embeddings[word] = vec
                self.vocab_size += 1
    L.info(' #vectors: %i, #dimensions: %i' % (self.vocab_size, self.emb_dim))

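# Minimal usage sketch for the reader above (illustrative, not part of the original file).
# It assumes the class also exposes get_emb_dim() and get_emb_given_word(word), which
# LookupTableLayer.initialize relies on but which are not shown in this snippet; the import
# path and the file name 'vectors.txt' are placeholders.
from dlm.io.w2vEmbReader import W2VEmbReader  # hypothetical module path

reader = W2VEmbReader('vectors.txt')
dim = reader.get_emb_dim()                     # dimensionality inferred from the file or its header
vec = reader.get_emb_given_word('house')       # stored vector, or a falsy value for a missing word
if vec:
    assert len(vec) == dim
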
def __init__(self, rng, input, vocab_size, emb_dim, emb_matrix=None, concat=True, emb_path=None, vocab_path=None, add_weights=False):
    L.info("Lookup Table layer, #words: %s, #dims: %s" % (U.red(vocab_size), U.red(emb_dim)))

    self.input = input
    self.emb_matrix = emb_matrix

    if self.emb_matrix is None:
        self.emb_matrix = numpy.asarray(
            rng.uniform(
                low=-0.01,   #low=-1,
                high=0.01,   #high=1,
                size=(vocab_size, emb_dim)
            ),
            dtype=theano.config.floatX
        )

    if emb_path:
        U.xassert(vocab_path, 'When emb_path is given, vocab must be given too.')
        self.initialize(emb_path, vocab_path)

    self.embeddings = theano.shared(value=self.emb_matrix, name='embeddings', borrow=True)

    if add_weights:
        weights_vec = numpy.ones(vocab_size, dtype=theano.config.floatX)
        self.weights = theano.shared(value=weights_vec, name='word_weights', borrow=True)

        # Check if the speed can be improved
        self.output = (self.weights.dimshuffle(0, 'x') * self.embeddings)[input]
        #self.output = self.weights.dimshuffle(0, 'x')[input] * self.embeddings[input]
        #self.output = self.weights[input].dimshuffle(0, 'x') * self.embeddings[input]

        self.params = [self.embeddings, self.weights]
    else:
        self.output = self.embeddings[input]
        self.params = [self.embeddings]

    if concat:
        self.output = self.output.reshape((input.shape[0], emb_dim * input.shape[1]))

def set_theano_device(device, threads):
    import sys
    import dlm.io.logging as L
    xassert(device == "cpu" or device.startswith("gpu"), "The device can only be 'cpu', 'gpu' or 'gpu<id>'")
    if device.startswith("gpu") and len(device) > 3:
        try:
            gpu_id = int(device[3:])
            if not is_gpu_free(gpu_id):
                L.warning('The selected GPU (GPU' + str(gpu_id) + ') is apparently busy.')
        except ValueError:
            L.error("Unknown GPU device format: " + device)
    if device.startswith("gpu"):
        L.warning('Running on GPU yields non-deterministic results.')
    xassert(sys.modules.has_key('theano') == False, "dlm.utils.set_theano_device() function cannot be called after importing theano")
    os.environ['OMP_NUM_THREADS'] = str(threads)
    os.environ['THEANO_FLAGS'] = 'device=' + device
    os.environ['THEANO_FLAGS'] += ',force_device=True'
    os.environ['THEANO_FLAGS'] += ',floatX=float32'
    os.environ['THEANO_FLAGS'] += ',warn_float64=warn'
    os.environ['THEANO_FLAGS'] += ',cast_policy=numpy+floatX'
    #os.environ['THEANO_FLAGS'] += ',allow_gc=True'
    os.environ['THEANO_FLAGS'] += ',print_active_device=False'
    os.environ['THEANO_FLAGS'] += ',exception_verbosity=high'    # Highly verbose debugging
    os.environ['THEANO_FLAGS'] += ',mode=FAST_RUN'
    os.environ['THEANO_FLAGS'] += ',nvcc.fastmath=False'         # True: makes div and sqrt faster at the cost of precision, and possible bugs
    #os.environ['THEANO_FLAGS'] += ',optimizer_including=cudnn'  # Comment out if CUDNN is not available

    # change theano to wrapper
    try:
        #import theano
        import backend.nn_wrapper as K
    except EnvironmentError:
        L.exception()

    global logger
    #if theano.config.device == "gpu":
    #    L.info(
    #        "Device: " + theano.config.device.upper() + " "
    #        + str(theano.sandbox.cuda.active_device_number())
    #        + " (" + str(theano.sandbox.cuda.active_device_name()) + ")"
    #    )
    #else:
    #    L.info("Device: " + theano.config.device.upper())

    #global K
    try:
        K.set_platform('tensorflow')   # theano is working
        L.info("Creating a variable inside utils")
        import numpy as np
        val = np.random.random((4, 2))
        tmp = K.variable(val)
    except:
        print >> sys.stderr, "Unexpected error:", sys.exc_info()
        raise TypeError("Cannot set the platform")

def load_model(self, model_path):
    L.info('Loading model from ' + model_path)
    t0 = time.time()
    if model_path.endswith('.gz'):
        with gzip.open(model_path, 'rb') as model_file:
            args, params = pickle.load(model_file)
    else:
        with open(model_path, 'rb') as model_file:   # binary mode for pickle
            args, params = pickle.load(model_file)
    L.info(' |-> took %.2f seconds' % (time.time() - t0))
    return args, params

def __init__(self, dataset_path, is_nbest, ngram_size, vocab_path):
    L.info("Initializing dataset from: " + dataset_path)
    vocab = VocabManager(vocab_path)

    def get_ngrams(tokens):
        for i in range(ngram_size - 1):
            tokens.insert(0, '<s>')
        if vocab.has_end_padding:
            tokens.append('</s>')
        indices = vocab.get_ids_given_word_list(tokens)
        return U.get_all_windows(indices, ngram_size)

    starts_list = []
    curr_index = 0
    curr_start_index = 0
    self.num_sentences = 0
    ngrams_list = []
    if is_nbest:
        nbest = NBestList(dataset_path)
        for group in nbest:
            for item in group:
                tokens = item.hyp.split()
                starts_list.append(curr_start_index)
                ngrams = get_ngrams(tokens)
                ngrams_list += ngrams
                curr_start_index += len(ngrams)
    else:
        dataset = codecs.open(dataset_path, 'r', encoding="UTF-8")
        for line in dataset:
            tokens = line.split()
            starts_list.append(curr_start_index)
            ngrams = get_ngrams(tokens)
            ngrams_list += ngrams
            curr_start_index += len(ngrams)
        dataset.close()

    self.num_sentences = len(starts_list)
    data = np.asarray(ngrams_list)
    starts_list.append(curr_start_index)
    starts_array = np.asarray(starts_list)
    x = data[:, 0:-1]
    y = data[:, -1]
    self.num_samples = y.shape[0]
    self.shared_starts = T.cast(theano.shared(starts_array, borrow=True), 'int64')
    self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
    self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')

def save_model(self, model_path, zipped=True, compress_level=5):
    L.info('Saving model to ' + model_path)
    t0 = time.time()
    if zipped:
        with gzip.open(model_path, 'wb', compresslevel=compress_level) as model_file:
            params = self.get_params()
            pickle.dump((self.args, [param.get_value() for param in params]), model_file)
    else:
        with open(model_path, 'wb') as model_file:   # binary mode for pickle
            params = self.get_params()
            pickle.dump((self.args, [param.get_value() for param in params]), model_file)
    L.info(' |-> took %.2f seconds' % (time.time() - t0))

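# Illustrative round trip (not part of the original file): save_model writes a gzipped
# pickle of (args, parameter values) and load_model reads it back. This assumes both are
# methods of the same classifier class (MLP), as their self parameter and the converter
# scripts below suggest; the file names are placeholders.
from dlm.models.mlp import MLP

classifier = MLP(model_path='corelm.model.gz')            # load an existing model
classifier.save_model('corelm.copy.gz', zipped=True)      # re-serialize it
args, params = classifier.load_model('corelm.copy.gz')    # (args, list of parameter arrays)
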
def augment(model_path, input_nbest_path, vocab_path, output_nbest_path):
    classifier = MLP(model_path=model_path)
    evaluator = eval.Evaluator(None, classifier)
    vocab = VocabManager(vocab_path)
    ngram_size = classifier.ngram_size

    def get_ngrams(tokens):
        for i in range(ngram_size - 1):
            tokens.insert(0, '<s>')
        if vocab.has_end_padding:
            tokens.append('</s>')
        indices = vocab.get_ids_given_word_list(tokens)
        return U.get_all_windows(indices, ngram_size)

    input_nbest = NBestList(input_nbest_path, mode='r')
    output_nbest = NBestList(output_nbest_path, mode='w')
    L.info('Augmenting: ' + input_nbest_path)

    start_time = time.time()
    counter = 0
    cache = dict()
    for group in input_nbest:
        ngram_list = []
        for item in group:
            tokens = item.hyp.split()
            ngrams = get_ngrams(tokens)
            for ngram in ngrams:
                if not cache.has_key(str(ngram)):
                    ngram_list.append(ngram)
                    cache[str(ngram)] = 1000
        if len(ngram_list) > 0:
            ngram_array = np.asarray(ngram_list, dtype='int32')
            ngram_log_prob_list = evaluator.get_ngram_log_prob(ngram_array[:, 0:-1], ngram_array[:, -1])
            for i in range(len(ngram_list)):
                cache[str(ngram_list[i])] = ngram_log_prob_list[i]
        for item in group:
            tokens = item.hyp.split()
            ngrams = get_ngrams(tokens)
            sum_ngram_log_prob = 0
            for ngram in ngrams:
                sum_ngram_log_prob += cache[str(ngram)]
            item.append_feature(sum_ngram_log_prob)
            output_nbest.write(item)
        #print counter
        counter += 1
    output_nbest.close()
    L.info("Ran for %.2fs" % (time.time() - start_time))

def __init__(self, rng, input, n_in, n_out, W_values=None, init_method=0, b_values=None, no_bias=False, suffix=None):
    L.info("Linear layer, #inputs: %s, #outputs: %s" % (U.red(n_in), U.red(n_out)))
    self.input = input

    if W_values is None:
        if init_method == 0:      # Useful for Relu activation
            high = 0.01
        elif init_method == 1:    # Useful for Tanh activation
            high = numpy.sqrt(6. / (n_in + n_out))
        elif init_method == 2:    # Useful for Sigmoid activation
            high = 4 * numpy.sqrt(6. / (n_in + n_out))
        else:
            L.error('Invalid initialization method')
        W_values = numpy.asarray(rng.uniform(low=-high, high=high, size=(n_in, n_out)), dtype=theano.config.floatX)

    if b_values is None and not no_bias:
        b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)

    W_name = 'W'
    if suffix is not None:
        W_name += '.' + str(suffix)
    W = theano.shared(value=W_values, name=W_name, borrow=True)
    self.W = W

    if no_bias:
        self.output = T.dot(input, self.W)
        self.params = [self.W]
    else:
        b_name = 'b'
        if suffix is not None:
            b_name += '.' + str(suffix)
        b = theano.shared(value=b_values, name=b_name, borrow=True)
        self.b = b
        self.output = T.dot(input, self.W) + self.b
        self.params = [self.W, self.b]

def __init__(self, input_path):
    L.info("Initializing vocabulary from: " + input_path)
    self.word_to_id_dict = dict()
    self.id_to_word_dict = dict()
    curr_id = 0
    with codecs.open(input_path, 'r', encoding='UTF-8') as input_file:
        for line in input_file:
            word = line.strip()
            self.word_to_id_dict[word] = curr_id
            self.id_to_word_dict[curr_id] = word
            curr_id += 1
    try:
        self.unk_id = self.word_to_id_dict['<unk>']
        self.padding_id = self.word_to_id_dict['<s>']
    except KeyError:
        L.error("Given vocab file does not include <unk> or <s>")
    self.has_end_padding = self.word_to_id_dict.has_key('</s>')

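# Minimal usage sketch (illustrative, not part of the original file). VocabManager is built
# from a plain word-per-line vocabulary file that must contain <unk> and <s>. The accessors
# get_ids_given_word_list() and get_word_given_id() are assumed from their use in the dataset
# readers and in LookupTableLayer.initialize above; the import path and file name are placeholders.
from dlm.io.vocabReader import VocabManager  # hypothetical module path

vocab = VocabManager('train.vocab')
ids = vocab.get_ids_given_word_list(['the', 'house'])  # unknown words presumably map to vocab.unk_id
print vocab.get_word_given_id(ids[0])
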
def __init__(self, rng, input, vocab_size, emb_dim, emb_matrix=None, concat=True, emb_path=None, vocab_path=None, add_weights=False, suffix=None, high=0.01):
    L.info("Lookup Table layer, #words: %s, #dims: %s" % (U.red(vocab_size), U.red(emb_dim)))

    self.input = input
    self.emb_matrix = emb_matrix

    if self.emb_matrix is None:
        self.emb_matrix = numpy.asarray(
            rng.uniform(
                low=-high,   #low=-1,
                high=high,   #high=1,
                size=(vocab_size, emb_dim)
            ),
            dtype=theano.config.floatX
        )

    if emb_path:
        U.xassert(vocab_path, 'When emb_path is given, vocab must be given too.')
        self.initialize(emb_path, vocab_path)

    embeddings_name = 'embeddings'
    if suffix is not None:
        embeddings_name += '.' + str(suffix)
    self.embeddings = theano.shared(value=self.emb_matrix, name=embeddings_name, borrow=True)

    if add_weights:
        weights_vec = numpy.ones(vocab_size, dtype=theano.config.floatX)
        self.weights = theano.shared(value=weights_vec, name='word_weights', borrow=True)

        # Check if the speed can be improved
        self.output = (self.weights.dimshuffle(0, 'x') * self.embeddings)[input]
        #self.output = self.weights.dimshuffle(0, 'x')[input] * self.embeddings[input]
        #self.output = self.weights[input].dimshuffle(0, 'x') * self.embeddings[input]

        self.params = [self.embeddings, self.weights]
    else:
        self.output = self.embeddings[input]
        self.params = [self.embeddings]

    if concat:
        self.output = self.output.reshape((input.shape[0], emb_dim * input.shape[1]))

def set_theano_device(device, threads):
    import sys
    import dlm.io.logging as L
    xassert(device == "cpu" or device.startswith("gpu"), "The device can only be 'cpu', 'gpu' or 'gpu<id>'")
    if device.startswith("gpu") and len(device) > 3:
        try:
            gpu_id = int(device[3:])
            if not is_gpu_free(gpu_id):
                L.warning('The selected GPU (GPU' + str(gpu_id) + ') is apparently busy.')
        except ValueError:
            L.error("Unknown GPU device format: " + device)
    if device.startswith("gpu"):
        L.warning('Running on GPU yields non-deterministic results.')
    xassert(sys.modules.has_key('theano') == False, "dlm.utils.set_theano_device() function cannot be called after importing theano")
    os.environ['OMP_NUM_THREADS'] = str(threads)
    os.environ['THEANO_FLAGS'] = 'device=' + device
    os.environ['THEANO_FLAGS'] += ',force_device=True'
    os.environ['THEANO_FLAGS'] += ',floatX=float32'
    os.environ['THEANO_FLAGS'] += ',warn_float64=warn'
    os.environ['THEANO_FLAGS'] += ',cast_policy=numpy+floatX'
    #os.environ['THEANO_FLAGS'] += ',allow_gc=True'
    os.environ['THEANO_FLAGS'] += ',print_active_device=False'
    os.environ['THEANO_FLAGS'] += ',exception_verbosity=high'    # Highly verbose debugging
    os.environ['THEANO_FLAGS'] += ',mode=FAST_RUN'
    os.environ['THEANO_FLAGS'] += ',nvcc.fastmath=False'         # True: makes div and sqrt faster at the cost of precision, and possible bugs
    #os.environ['THEANO_FLAGS'] += ',optimizer_including=cudnn'  # Comment out if CUDNN is not available
    try:
        import theano
    except EnvironmentError:
        L.exception()
    global logger
    if theano.config.device == "gpu":
        L.info(
            "Device: " + theano.config.device.upper() + " "
            + str(theano.sandbox.cuda.active_device_number())
            + " (" + str(theano.sandbox.cuda.active_device_name()) + ")"
        )
    else:
        L.info("Device: " + theano.config.device.upper())

def __init__(self, rng, input, vocab_size, emb_dim, emb_matrix=None, concat=True, emb_path=None, vocab_path=None, add_weights=False):
    L.info("Lookup Table layer, #words: %s, #dims: %s" % (U.red(vocab_size), U.red(emb_dim)))

    self.input = input
    L.info("Input " + str(input))
    L.info("Add weights " + str(add_weights))
    self.emb_matrix = emb_matrix

    if self.emb_matrix is None:
        self.emb_matrix = numpy.asarray(
            rng.uniform(
                low=-0.01,   #low=-1,
                high=0.01,   #high=1,
                size=(vocab_size, emb_dim)
            ),
            dtype=K._FLOATX
        )

    if emb_path:
        U.xassert(vocab_path, 'When emb_path is given, vocab must be given too.')
        self.initialize(emb_path, vocab_path)

    #self.embeddings = theano.shared(value=self.emb_matrix, name='embeddings', borrow=True)
    self.embeddings = K.variable(self.emb_matrix, name='embeddings')

    if add_weights:
        weights_vec = numpy.ones(vocab_size, dtype=K._FLOATX)
        #self.weights = theano.shared(value=weights_vec, name='word_weights', borrow=True)
        self.weights = K.variable(weights_vec, name='word_weights')

        # Check if the speed can be improved
        self.output = (self.weights.dimshuffle(0, 'x') * self.embeddings)[input]
        #self.output = self.weights.dimshuffle(0, 'x')[input] * self.embeddings[input]
        #self.output = self.weights[input].dimshuffle(0, 'x') * self.embeddings[input]

        self.params = [self.embeddings, self.weights]
    else:
        self.output = self.embeddings[input]
        self.params = [self.embeddings]

    if concat:
        self.output = self.output.reshape((input.shape[0], emb_dim * input.shape[1]))

def __init__(self, dataset_path, batch_size=500, instance_weights_path=None):
    L.info("Initializing dataset from: " + os.path.abspath(dataset_path))

    # Reading parameters from the mmap file
    print K.get_platform()
    fp = np.memmap(dataset_path, dtype='int32', mode='r')
    self.num_samples = fp[0]
    self.ngram = fp[1]
    fp = fp.reshape((self.num_samples + 3, self.ngram))
    self.vocab_size = fp[1, 0]
    self.num_classes = fp[2, 0]

    # Setting minibatch size and number of mini batches
    self.batch_size = batch_size
    self.num_batches = int(M.ceil(self.num_samples / self.batch_size))

    # Reading the matrix of samples
    x = fp[3:, 0:self.ngram - 1]   # Reading the context indices
    y = fp[3:, self.ngram - 1]     # Reading the output word index
    #self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
    #self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')
    # What is T.cast :))
    L.info("Initialize a simple variable")
    val = np.random.random((4, 2))
    tmp = K.variable(val)
    L.info("Initialize a real variable")
    tmp = K.variable(x)
    L.info("Initialize two casted variables")
    self.shared_x = K.cast(K.variable(x), 'int32')
    self.shared_y = K.cast(K.variable(y), 'int32')
    L.info("Create two variable without borrow=True")

    self.is_weighted = False
    if instance_weights_path:
        instance_weights = np.loadtxt(instance_weights_path)
        U.xassert(instance_weights.shape == (self.num_samples,),
                  "The number of lines in weights file must be the same as the number of samples.")
        # what is borrow=True
        # self.shared_w = T.cast(theano.shared(instance_weights, borrow=True), theano.config.floatX)
        self.shared_w = K.cast(K.variable(instance_weights), K._FLOATX)
        self.is_weighted = True

    L.info(' #samples: %s, ngram size: %s, vocab size: %s, #classes: %s, batch size: %s, #batches: %s' % (
        U.red(self.num_samples), U.red(self.ngram), U.red(self.vocab_size),
        U.red(self.num_classes), U.red(self.batch_size), U.red(self.num_batches)
    ))

for group in input_nbest:
    if mode == 0:
        for i in range(min(N, group.size())):
            output_nbest.write(group[i])
    elif mode == 1:
        output_1best.write(group[0].hyp + "\n")
    elif mode == 2:
        for i in range(group.size()):
            features = group[i].features.split()
            output.write(features[N] + "\n")
    elif mode == 3:
        for i in range(group.size()):
            features.append(float(group[i].features.split()[N]))
    counter += 1
    if counter % 100 == 0:
        L.info("%i groups processed" % (counter))
L.info("Finished processing %i groups" % (counter))

if mode == 0:
    output_nbest.close()
elif mode == 1:
    output_1best.close()
elif mode == 2:
    output.close()
elif mode == 3:
    import scipy.stats as S
    print 'PEARSON: ', S.pearsonr(features, oracles)
    print 'SPEARMAN:', S.spearmanr(features, oracles)

L.error("Set MOSES_ROOT variable to your moses root directory") U.mkdir_p(args.out_dir) #cmd = moses_root + '/bin/moses -show-weights -f ' + args.input_config + ' 2> /dev/null' #features = U.capture(cmd).strip().split('\n') features = iniReader.parseIni(args.input_config) output_nbest_path = args.out_dir + '/augmented.nbest' if args.no_aug: shutil.copy(args.input_nbest, output_nbest_path) else: augmenter.augment(args.model_path, args.input_nbest, args.vocab_path, output_nbest_path) L.info('Extracting stats and features') #L.warning('The optional arguments of extractor are not used yet') cmd = moses_root + '/bin/extractor -r ' + args.ref_paths + ' -n ' + output_nbest_path + ' --scfile ' + args.out_dir + '/statscore.data --ffile ' + args.out_dir + '/features.data' U.capture(cmd) with open(args.out_dir + '/init.opt', 'w') as init_opt: init_list = [] for line in features: tokens = line.split(" ") try: float(tokens[1]) init_list += tokens[1:] except ValueError: pass if not args.no_aug: init_list.append(args.init_value)
# Setting the args for the classifier
args_nn.emb_dim = int(config_dict['input_embedding_dimension'])
args_nn.num_hidden = config_dict['num_hidden'] + ',' + config_dict['output_embedding_dimension']
args_nn.vocab_size = int(config_dict['input_vocab_size'])
args_nn.ngram_size = int(config_dict['ngram_size'])
args_nn.num_classes = int(config_dict['output_vocab_size'])
act_func = config_dict['activation_function']
if act_func == 'rectifier':
    act_func = 'relu'
args_nn.activation_name = act_func

# Creating the classifier with the arguments read
L.info("Creating CoreLM model")
classifier = MLP(args_nn)

# Loading matrices
embeddings = np.loadtxt(model_dict['\input_embeddings'])
W1 = np.loadtxt(model_dict['\hidden_weights 1'])
W1 = np.transpose(W1)
b1 = np.loadtxt(model_dict['\hidden_biases 1'])
W2 = np.loadtxt(model_dict['\hidden_weights 2'])
W2 = np.transpose(W2)
b2 = np.loadtxt(model_dict['\hidden_biases 2'])
W3 = np.loadtxt(model_dict['\output_weights'])
W3 = np.transpose(W3)
b3 = np.loadtxt(model_dict['\output_biases'])

params_nn = [embeddings, W1, b1, W2, b2, W3, b3]

def __init__(self, dataset_path, batch_size=500, instance_weights_path=None):
    L.info("Initializing dataset (with features) from: " + os.path.abspath(dataset_path))

    # Reading parameters from the mmap file
    fp = np.memmap(dataset_path, dtype='int32', mode='r')
    #print type(fp1)
    #fp = np.empty(fp1.shape, dtype='int32')
    #fp[:] = fp1
    #print type(fp)
    self.num_samples = fp[0]
    self.ngram = fp[1]
    fp = fp.reshape((len(fp) / self.ngram, self.ngram))
    num_header_lines = fp[1, 0]
    self.features_info = []   # Format (vocab_size, num_of_elements)
    for i in xrange(num_header_lines - 1):
        self.features_info.append((fp[i + 2, 0], fp[i + 2, 1]))
    self.num_classes = fp[(num_header_lines + 2) - 1, 0]

    # Setting minibatch size and number of mini batches
    self.batch_size = batch_size
    self.num_batches = int(M.ceil(self.num_samples / self.batch_size))

    # Reading the matrix of samples
    # x is list
    '''
    self.shared_x_list = []
    last_start_pos = 0
    for i in xrange(len(self.features_info)):
        vocab_size, num_elems = self.features_info[i]
        x = fp[num_header_lines+2:,last_start_pos:last_start_pos + num_elems]   # Reading the context indices
        last_start_pos += num_elems
        shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
        self.shared_x_list.append(shared_x)
    '''
    x = fp[num_header_lines + 2:, 0:self.ngram - 1]   # Reading the context indices
    self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
    y = fp[num_header_lines + 2:, self.ngram - 1]     # Reading the output word index
    self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')

    ## Untested instance weighting
    self.is_weighted = False
    if instance_weights_path:
        instance_weights = np.loadtxt(instance_weights_path)
        U.xassert(instance_weights.shape == (self.num_samples,),
                  "The number of lines in weights file must be the same as the number of samples.")
        self.shared_w = T.cast(theano.shared(instance_weights, borrow=True), theano.config.floatX)
        self.is_weighted = True

    L.info(' #samples: %s, #classes: %s, batch size: %s, #batches: %s' % (
        U.red(self.num_samples), U.red(self.num_classes), U.red(self.batch_size), U.red(self.num_batches)
    ))
    for feature in enumerate(self.features_info):
        L.info("Feature %s: #ngrams= %s vocab_size= %s" % (U.red(feature[0]), U.red(feature[1][1]), U.red(feature[1][0])))

src_prune_args.add_argument("--source-vocab-file", dest="src_vocab_path", help="Source vocabulary file path")

trg_prune_args = parser.add_mutually_exclusive_group(required=True)
trg_prune_args.add_argument("-vt", "--prune-target-vocab", dest="trg_vocab_size", type=int, help="Target vocabulary size")
trg_prune_args.add_argument("--target-vocab-file", dest="trg_vocab_path", help="Target vocabulary file path")

output_prune_args = parser.add_mutually_exclusive_group(required=True)
output_prune_args.add_argument("-vo", "--prune-output-vocab", dest="output_vocab_size", type=int,
                               help="Output vocabulary size. Defaults to target vocabulary size.")
output_prune_args.add_argument("--output-vocab-file", dest="output_vocab_path", help="Output vocabulary file")

args = parser.parse_args()

# Format of the memmap file does not support less than 5 because the first row consists of parameters for the neural network
U.xassert(args.trg_context + args.src_context * 2 + 1 > 3,
          "Total ngram size must be greater than 3. ngrams < 3 are not supported by the current memmap format.")

L.info("Source Window Size: " + str(args.src_context * 2 + 1))
L.info("Target Window Size: " + str(args.trg_context - 1))
L.info("Total Sample Size: " + str(args.trg_context + args.src_context * 2 + 1))

if args.output_vocab_size is None:
    args.output_vocab_size = args.trg_vocab_size

# Creating the output directory if it does not exist
if not os.path.exists(args.output_dir_path):
    os.makedirs(args.output_dir_path)
L.info("Output directory: " + os.path.abspath(args.output_dir_path))

# Prefix of files
src_prefix = args.output_dir_path + "/" + os.path.basename(args.src_input_path)
trg_prefix = args.output_dir_path + "/" + os.path.basename(args.trg_input_path)

#### Add POS tag to the sample ####
sample.append(label)
sample_idx.append(label_to_id[label])

if args.shuffle:
    samples.append(sample)
    samples_idx.append(sample_idx)
else:
    tmp_file.write(" ".join([str(idx) for idx in sample_idx]) + "\n")
    if args.word_out:
        f_words.write(" ".join([word for word in sample]) + "\n")

nsamples += 1
if nsamples % 100000 == 0:
    L.info(str(nsamples) + " samples processed.")

#print word, feature, label
#if not input_word_to_id.has_key(word):
#    word = "<unk>"
#indices.append(str(input_word_to_id[word]))
#f_indices.append(str(feature_to_id[feature]))

# Shuffling the data and writing to tmp file
if args.shuffle:
    L.info("Shuffling data.")
    permutation_arr = np.random.permutation(nsamples)
    with open(tmp_path, "w") as tmp_file:
        for index in permutation_arr:
            tmp_file.write(" ".join([str(idx) for idx in samples_idx[index]]) + "\n")

dest="output_vocab_size", type=int, help="Output vocabulary size. Defaults to target vocabulary size.") output_prune_args.add_argument("--output-vocab-file", dest="output_vocab_path", help="Output vocabulary file") args = parser.parse_args() # Format of the memmap file does not support less than 5 because the first row consists of parameters for the neural network U.xassert( args.trg_context + args.src_context * 2 + 1 > 3, "Total ngram size must be greater than 3. ngrams < 3 are not supported by the current memmap format." ) L.info("Source Window Size: " + str(args.src_context * 2 + 1)) L.info("Target Window Size: " + str(args.trg_context - 1)) L.info("Total Sample Size: " + str(args.trg_context + args.src_context * 2 + 1)) if (args.output_vocab_size is None): args.output_vocab_size = args.trg_vocab_size # The output directory is if (not os.path.exists(args.output_dir_path)): os.makedirs(args.output_dir_path) L.info("Output directory: " + os.path.abspath(args.output_dir_path)) # Prefix of files src_prefix = args.output_dir_path + "/" + os.path.basename(args.src_input_path) trg_prefix = args.output_dir_path + "/" + os.path.basename(args.trg_input_path)
help="Base model used for adaptation") #parser.add_argument("-m","--model-file", dest="model_path", help="The file path to load the model from") args = parser.parse_args() args.cwd = os.getcwd() if args.out_dir is None: args.out_dir = 'primelm-' + U.curr_time() U.mkdir_p(args.out_dir) L.quiet = args.quiet L.set_file_path(os.path.abspath(args.out_dir) + "/log.txt") L.info('Command: ' + ' '.join(sys.argv)) curr_version = U.curr_version() if curr_version: L.info("Version: " + curr_version) if args.emb_path: U.xassert( args.vocab, 'When --emb-path is used, vocab file must be given too (using --vocab).' ) if args.loss_function == "nll": args.num_noise_samples = 0 U.print_args(args)
counter = 0
for group in input_aug_nbest:
    index = 0
    scores = dict()
    for item in group:
        features = np.asarray([x for x in item.features.split() if is_number(x)], dtype=float)
        try:
            scores[index] = np.dot(features, weights)
        except ValueError:
            L.error('Number of features in the nbest and the weights file are not the same')
        index += 1
    sorted_indices = sorted(scores, key=scores.get, reverse=True)
    for idx in sorted_indices:
        output_nbest.write(group[idx])
    output_1best.write(group[sorted_indices[0]].hyp + "\n")
    counter += 1
    if counter % 100 == 0:
        L.info("%i groups processed" % (counter))
L.info("Finished processing %i groups" % (counter))

output_nbest.close()
output_1best.close()

if args.clean_up:
    os.remove(output_nbest_path)

# Setting the args for the classifier
args_nn.emb_dim = int(config_dict['input_embedding_dimension'])
args_nn.num_hidden = config_dict['num_hidden'] + ',' + config_dict['output_embedding_dimension']
args_nn.vocab_size = int(config_dict['input_vocab_size'])
args_nn.ngram_size = int(config_dict['ngram_size'])
args_nn.num_classes = int(config_dict['output_vocab_size'])
act_func = config_dict['activation_function']
if act_func == 'rectifier':
    act_func = 'relu'
args_nn.activation_name = act_func

# Creating the classifier with the arguments read
L.info("Creating PrimeLM model")
classifier = MLP(args_nn)

# Loading matrices
embeddings = np.loadtxt(model_dict['\input_embeddings'])
W1 = np.loadtxt(model_dict['\hidden_weights 1'])
W1 = np.transpose(W1)
b1 = np.loadtxt(model_dict['\hidden_biases 1'])
W2 = np.loadtxt(model_dict['\hidden_weights 2'])
W2 = np.transpose(W2)
b2 = np.loadtxt(model_dict['\hidden_biases 2'])
W3 = np.loadtxt(model_dict['\output_weights'])
W3 = np.transpose(W3)
b3 = np.loadtxt(model_dict['\output_biases'])

params_nn = [embeddings, W1, b1, W2, b2, W3, b3]

parser = argparse.ArgumentParser()
parser.add_argument("-m", "--corelm-model", dest="corelm_model", required=True, help="The input CoreLM model file")
parser.add_argument("-v", "--vocab-file", dest="vocab_path", required=True, help="The input vocabulary")
parser.add_argument("-dir", "--directory", dest="out_dir", help="The output directory for log file, model, etc.")
args = parser.parse_args()

U.set_theano_device('cpu', 1)
from dlm.models.mlp import MLP

if args.out_dir is None:
    args.out_dir = 'corelm_convert-' + U.curr_time()
U.mkdir_p(args.out_dir)

# Loading CoreLM model and creating classifier class
L.info("Loading CoreLM model")
classifier = MLP(model_path=args.corelm_model)
args_nn = classifier.args
params_nn = classifier.params
U.xassert(len(params_nn) == 7, "CoreLM model is not compatible with NPLM architecture. 2 hidden layers and an output linear layer are required.")

embeddings = params_nn[0].get_value()
W1 = params_nn[1].get_value()
W1 = np.transpose(W1)
b1 = params_nn[2].get_value()
W2 = params_nn[3].get_value()
W2 = np.transpose(W2)
b2 = params_nn[4].get_value()
W3 = params_nn[5].get_value()
W3 = np.transpose(W3)
b3 = params_nn[6].get_value()

parser.add_argument("--adjust-learning-rate", dest="enable_lr_adjust", action='store_true', help="Enable learning rate adjustment")
#parser.add_argument("-m","--model-file", dest="model_path", help="The file path to load the model from")

args = parser.parse_args()

args.cwd = os.getcwd()

if args.out_dir is None:
    args.out_dir = 'corelm-' + U.curr_time()
U.mkdir_p(args.out_dir)

L.quiet = args.quiet
L.set_file_path(os.path.abspath(args.out_dir) + "/log.txt")

L.info('Command: ' + ' '.join(sys.argv))

curr_version = U.curr_version()
if curr_version:
    L.info("Version: " + curr_version)

if args.emb_path:
    U.xassert(args.vocab, 'When --emb-path is used, vocab file must be given too (using --vocab).')

if args.loss_function == "nll":
    args.num_noise_samples = 0

U.print_args(args)

U.set_theano_device(args.device, args.threads)

import dlm.trainer

testset = TextReader(dataset_path=args.test_path, is_nbest=is_nbest, ngram_size=classifier.ngram_size, vocab_path=args.vocab_path)

#########################
## Compiling theano function
#
evaluator = eval.Evaluator(testset, classifier)

#########################
## Testing
#
start_time = time.time()

if args.perplexity:
    L.info("Perplexity: %f" % (evaluator.perplexity()))

if args.unnormalized:
    L.info("Unnormalized Perplexity: %f" % (evaluator.unnormalized_perplexity()))

if args.nlp_path:
    with open(args.nlp_path, 'w') as output:
        for i in xrange(testset.get_num_sentences()):
            output.write(str(evaluator.get_sequence_log_prob(i)) + '\n')

if args.ulp_path:
    with open(args.ulp_path, 'w') as output:
        for i in xrange(testset.get_num_sentences()):
            output.write(str(evaluator.get_unnormalized_sequence_log_prob(i)) + '\n')

L.info("Ran for %.2fs" % (time.time() - start_time))

def train(classifier, criterion, args, trainset, devset, testset=None):
    if args.algorithm == "sgd":
        from dlm.algorithms.sgd import SGD as Trainer
    else:
        L.error("Invalid training algorithm: " + args.algorithm)

    # Get number of minibatches from the training file
    num_train_batches = trainset.get_num_batches()

    # Initialize the trainer object
    trainer = Trainer(classifier, criterion, args.learning_rate, trainset, clip_threshold=args.clip_threshold)

    # Initialize the Learning Rate tuner, which adjusts learning rate based on the development/validation file
    lr_tuner = LRTuner(low=0.01 * args.learning_rate, high=10 * args.learning_rate, inc=0.01 * args.learning_rate)
    validation_frequency = 5000   # minibatches

    # Logging and statistics
    total_num_iter = args.num_epochs * num_train_batches
    hook = Hook(classifier, devset, testset, total_num_iter, args.out_dir)

    L.info('Training')
    start_time = time.time()
    verbose_freq = 1000   # minibatches
    epoch = 0

    hook.evaluate(0)
    a = time.time()
    classifier.save_model(args.out_dir + '/model.epoch_0.gz', zipped=True)

    while epoch < args.num_epochs:
        epoch = epoch + 1
        L.info("Epoch: " + U.red(epoch))
        minibatch_avg_cost_sum = 0
        for minibatch_index in xrange(num_train_batches):
            # Makes an update of the parameters after processing the minibatch
            minibatch_avg_cost, gparams = trainer.step(minibatch_index)
            minibatch_avg_cost_sum += minibatch_avg_cost

            if minibatch_index % verbose_freq == 0:
                grad_norms = [np.linalg.norm(gparam) for gparam in gparams]
                L.info(U.blue("[" + time.ctime() + "] ") + '%i/%i, cost=%.2f, lr=%f'
                       % (minibatch_index, num_train_batches, minibatch_avg_cost_sum / (minibatch_index + 1), trainer.get_learning_rate()))
                L.info('Grad Norms: [' + ', '.join(['%.6f' % gnorm for gnorm in grad_norms]) + ']')

            curr_iter = (epoch - 1) * num_train_batches + minibatch_index
            if curr_iter > 0 and curr_iter % validation_frequency == 0:
                hook.evaluate(curr_iter)

        L.info(U.blue("[" + time.ctime() + "] ") + '%i/%i, cost=%.2f, lr=%f'
               % (num_train_batches, num_train_batches, minibatch_avg_cost_sum / num_train_batches, trainer.get_learning_rate()))
        dev_ppl = hook.evaluate(curr_iter)
        lr = trainer.get_learning_rate()
        if args.enable_lr_adjust:
            lr = lr_tuner.adapt_lr(dev_ppl, lr)
        trainer.set_learning_rate(lr)
        classifier.save_model(args.out_dir + '/model.epoch_' + str(epoch) + '.gz', zipped=True)

    end_time = time.time()
    hook.evaluate(total_num_iter)
    L.info('Optimization complete')
    L.info('Ran for %.2fm' % ((end_time - start_time) / 60.))

    'chen': B.chen_smoothing
}

ref_path_list = args.ref_paths.split(',')

input_nbest = NBestList(args.input_path, mode='r', reference_list=ref_path_list)
if args.out_nbest_path:
    output_nbest = NBestList(args.out_nbest_path, mode='w')
if args.out_scores_path:
    output_scores = open(args.out_scores_path, mode='w')
output_1best = codecs.open(args.out_1best_path, mode='w', encoding='UTF-8')

U.xassert(methods.has_key(args.method), "Invalid smoothing method: " + args.method)
scorer = methods[args.method]

L.info('Processing the n-best list')

def process_group(group):
    index = 0
    scores = dict()
    for item in group:
        scores[index] = scorer(item.hyp, group.refs)
        index += 1
    return scores

pool = Pool(args.threads)

counter = 0
group_counter = 0
flag = True
while flag:

def __init__(self, input, func_name):
    L.info("Activation layer, function: " + U.red(func_name))
    self.input = input
    self.func = self.get_function(func_name)
    self.output = self.func(input)

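# Illustrative composition sketch (not part of the original file): how the three layer types
# defined above are typically chained into one feed-forward n-gram scorer for a 5-gram model
# (4 context words). The class names and import paths below are assumptions inferred from the
# constructors shown; only the constructor arguments and the 'relu' name come from the code.
import numpy
import theano.tensor as T

from dlm.models.components.lookuptable import LookupTable  # hypothetical paths
from dlm.models.components.linear import Linear
from dlm.models.components.activation import Activation

rng = numpy.random.RandomState(1234)
x = T.imatrix('x')   # (batch_size, ngram_size - 1) matrix of context word ids

lookup = LookupTable(rng, x, 16000, 150, concat=True)            # concatenated context embeddings
hidden = Linear(rng, lookup.output, 150 * 4, 512, init_method=0) # init_method=0 pairs with relu
active = Activation(hidden.output, 'relu')
output = Linear(rng, active.output, 512, 16000)                  # one score per output word
params = lookup.params + hidden.params + output.params
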
parser.add_argument("-dir", "--directory", dest="out_dir", help="The output directory for log file, model, etc.") args = parser.parse_args() U.set_theano_device('cpu', 1) from dlm.models.mlp import MLP if args.out_dir is None: args.out_dir = 'primelm_convert-' + U.curr_time() U.mkdir_p(args.out_dir) # Loading PrimeLM model and creating classifier class L.info("Loading PrimeLM model") classifier = MLP(model_path=args.primelm_model) args_nn = classifier.args params_nn = classifier.params U.xassert( len(params_nn) == 7, "PrimeLM model is not compatible with NPLM architecture. 2 hidden layers and an output linear layer is required." ) embeddings = params_nn[0].get_value() W1 = params_nn[1].get_value() W1 = np.transpose(W1) b1 = params_nn[2].get_value() W2 = params_nn[3].get_value() W2 = np.transpose(W2) b2 = params_nn[4].get_value()