def get_function(self, func_name):
    """Look up an activation / normalisation function by name.

    Recognised names: tanh, hardtanh, xtanh, sigmoid, fastsigmoid,
    hardsigmoid, xsigmoid, softplus, relu, leakyrelu, cappedrelu, softmax,
    norm1 and norm2.

    Args:
        func_name: Name of the requested function.

    Returns:
        A callable taking one argument ``x``. 'fastsigmoid' and unknown
        names are reported through ``L.error`` and yield no callable.
    """
    def hard_tanh(x):
        # Identity where |x| <= 1, sign(x) elsewhere.
        return ((abs(x) <= 1) * x) + ((1 < abs(x)) * T.sgn(x))

    def x_tanh(x):
        return T.tanh(x) + 0.1 * x

    def x_sigmoid(x):
        return T.nnet.sigmoid(x) + 0.1 * x

    def relu(x):
        # Alternatives considered upstream: T.maximum(x, 0), or T.nnet.relu
        # once theano is updated.
        return x * (x > 0)

    def leaky_relu(x):
        return T.maximum(x, 0.01 * x)

    def capped_relu(x):
        return T.minimum(x * (x > 0), 6)

    def norm1(x):
        return x / T.nlinalg.norm(x, 1)

    def norm2(x):
        # Written with a dot product rather than T.nlinalg.norm(x, 2).
        return x / T.dot(x, x)**0.5

    if func_name == 'tanh':
        return T.tanh
    if func_name == 'hardtanh':
        L.warning('Current hardTanh implementation is slow!')
        return hard_tanh
    if func_name == 'xtanh':
        return x_tanh
    if func_name == 'sigmoid':
        return T.nnet.sigmoid
    if func_name == 'fastsigmoid':
        L.error('T.nnet.ultra_fast_sigmoid function has some problems')
        return None
    if func_name == 'hardsigmoid':
        return T.nnet.hard_sigmoid
    if func_name == 'xsigmoid':
        return x_sigmoid
    if func_name == 'softplus':
        return T.nnet.softplus
    if func_name == 'relu':
        return relu
    if func_name == 'leakyrelu':
        return leaky_relu
    if func_name == 'cappedrelu':
        return capped_relu
    if func_name == 'softmax':
        return T.nnet.softmax
    if func_name == 'norm1':
        return norm1
    if func_name == 'norm2':
        return norm2
    L.error('Invalid function name given: ' + func_name)
def set_theano_device(device, threads=None):
    """Export the theano configuration and initialise the backend wrapper.

    Has to be called before theano is imported, because the settings are
    passed through the THEANO_FLAGS environment variable.

    Args:
        device: "cpu", "gpu" or "gpu<id>".
        threads: Value exported as OMP_NUM_THREADS. When None the variable
            is left as inherited, so callers that pass only ``device``
            keep working.

    Raises:
        TypeError: If the backend wrapper cannot be switched to the
            'tensorflow' platform or cannot create a test variable.
    """
    import sys
    import dlm.io.logging as L

    xassert(device == "cpu" or device.startswith("gpu"),
            "The device can only be 'cpu', 'gpu' or 'gpu<id>'")
    if device.startswith("gpu") and len(device) > 3:
        try:
            gpu_id = int(device[3:])
            if not is_gpu_free(gpu_id):
                L.warning('The selected GPU (GPU' + str(gpu_id) + ') is apparently busy.')
        except ValueError:
            L.error("Unknown GPU device format: " + device)
    if device.startswith("gpu"):
        L.warning('Running on GPU yields non-deterministic results.')
    xassert('theano' not in sys.modules,
            "dlm.utils.set_theano_device() function cannot be called after importing theano")

    if threads is not None:
        os.environ['OMP_NUM_THREADS'] = str(threads)
    theano_flags = [
        'device=' + device,
        'force_device=True',
        'floatX=float32',
        'warn_float64=warn',
        'cast_policy=numpy+floatX',
        # 'allow_gc=True',
        'print_active_device=False',
        'exception_verbosity=high',  # Highly verbose debugging
        'mode=FAST_RUN',
        # True: makes div and sqrt faster at the cost of precision, and possible bugs
        'nvcc.fastmath=False',
        # 'optimizer_including=cudnn',  # Enable only if CUDNN is available
    ]
    os.environ['THEANO_FLAGS'] = ','.join(theano_flags)

    # The backend wrapper is imported in place of theano itself.
    try:
        import backend.nn_wrapper as K
    except EnvironmentError:
        L.exception()

    try:
        K.set_platform('tensorflow')
        L.info("Creating a variable inside utils")
        import numpy as np
        # Smoke test: the platform must be able to wrap a small array.
        K.variable(np.random.random((4, 2)))
    except Exception:
        # Callers still see TypeError, but KeyboardInterrupt/SystemExit are
        # no longer converted into it.
        sys.stderr.write("Unexpected error: %s\n" % (sys.exc_info(),))
        raise TypeError("Cannot set the platform")
def __init__(self, rng, input, n_in, n_out, W_values=None, init_method=0, b_values=None, no_bias=False, suffix=None):
    """Build a linear layer computing ``dot(input, W)`` plus an optional bias.

    Args:
        rng: Random generator providing ``uniform(low, high, size)``.
        input: Symbolic input of the layer.
        n_in: Number of inputs.
        n_out: Number of outputs.
        W_values: Initial weights; sampled uniformly when None.
        init_method: Sampling range selector (0: Relu, 1: Tanh, 2: Sigmoid).
        b_values: Initial bias; zeros when None and a bias is used.
        no_bias: When True the layer has no bias parameter.
        suffix: Optional suffix appended to the parameter names.
    """
    L.info("Linear layer, #inputs: %s, #outputs: %s" % (U.red(n_in), U.red(n_out)))
    self.input = input

    if W_values is None:
        # Half-width of the symmetric uniform interval the weights come from.
        if init_method == 0:    # Useful for Relu activation
            high = 0.01
        elif init_method == 1:  # Useful for Tanh activation
            high = numpy.sqrt(6. / (n_in + n_out))
        elif init_method == 2:  # Useful for Sigmoid activation
            high = 4 * numpy.sqrt(6. / (n_in + n_out))
        else:
            L.error('Invalid initialization method')
        sampled = rng.uniform(low=-high, high=high, size=(n_in, n_out))
        W_values = numpy.asarray(sampled, dtype=theano.config.floatX)

    uses_bias = not no_bias
    if uses_bias and b_values is None:
        b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)

    # Both parameters share the same optional '.<suffix>' name tail.
    name_tail = '' if suffix is None else '.' + str(suffix)

    self.W = theano.shared(value=W_values, name='W' + name_tail, borrow=True)
    if not uses_bias:
        self.output = T.dot(input, self.W)
        self.params = [self.W]
        return

    self.b = theano.shared(value=b_values, name='b' + name_tail, borrow=True)
    self.output = T.dot(input, self.W) + self.b
    self.params = [self.W, self.b]
def __init__(self, input_path):
    """Load a vocabulary from a UTF-8 file holding one word per line.

    Words get consecutive ids starting at 0, in file order. If a word is
    repeated, ``word_to_id_dict`` keeps its last id while
    ``id_to_word_dict`` keeps an entry for every line.

    Args:
        input_path: Path of the vocabulary file.

    The file must contain '<unk>' and '<s>'; otherwise the problem is
    reported through ``L.error``.
    """
    L.info("Initializing vocabulary from: " + input_path)
    self.word_to_id_dict = dict()
    self.id_to_word_dict = dict()
    with codecs.open(input_path, 'r', encoding='UTF-8') as input_file:
        for curr_id, line in enumerate(input_file):
            word = line.strip()
            self.word_to_id_dict[word] = curr_id
            self.id_to_word_dict[curr_id] = word
    try:
        self.unk_id = self.word_to_id_dict['<unk>']
        self.padding_id = self.word_to_id_dict['<s>']
    except KeyError:
        L.error("Given vocab file does not include <unk> or <s>")
    # `in` replaces dict.has_key(), which no longer exists in Python 3.
    self.has_end_padding = '</s>' in self.word_to_id_dict
def __init__(self, rng, input, n_in, n_out, W_values=None, init_method=0, b_values=None, no_bias=False, suffix=None):
    """Create the weight (and optional bias) parameters of a linear layer.

    The layer output is ``dot(input, W)``, plus ``b`` unless ``no_bias``.

    Args:
        rng: Random generator providing ``uniform(low, high, size)``.
        input: Symbolic input of the layer.
        n_in: Number of inputs.
        n_out: Number of outputs.
        W_values: Initial weights; sampled uniformly when None.
        init_method: Sampling range selector (0: Relu, 1: Tanh, 2: Sigmoid).
        b_values: Initial bias; zeros when None and a bias is used.
        no_bias: When True the layer has no bias parameter.
        suffix: Optional suffix appended to the parameter names.
    """
    L.info("Linear layer, #inputs: %s, #outputs: %s" % (U.red(n_in), U.red(n_out)))
    self.input = input

    if W_values is None:
        # Bound of the symmetric uniform interval used for sampling.
        if init_method == 0:    # Useful for Relu activation
            high = 0.01
        elif init_method == 1:  # Useful for Tanh activation
            high = numpy.sqrt(6. / (n_in + n_out))
        elif init_method == 2:  # Useful for Sigmoid activation
            high = 4 * numpy.sqrt(6. / (n_in + n_out))
        else:
            L.error('Invalid initialization method')
        draws = rng.uniform(low=-high, high=high, size=(n_in, n_out))
        W_values = numpy.asarray(draws, dtype=theano.config.floatX)

    with_bias = not no_bias
    if with_bias and b_values is None:
        b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)

    # Optional '.<suffix>' tail shared by the parameter names.
    tail = '' if suffix is None else '.' + str(suffix)

    self.W = theano.shared(value=W_values, name='W' + tail, borrow=True)
    if not with_bias:
        self.output = T.dot(input, self.W)
        self.params = [self.W]
        return

    self.b = theano.shared(value=b_values, name='b' + tail, borrow=True)
    self.output = T.dot(input, self.W) + self.b
    self.params = [self.W, self.b]
def set_theano_device(device, threads=None):
    """Export the theano configuration, import theano and log the device.

    Has to be called before theano is imported, because the settings are
    passed through the THEANO_FLAGS environment variable.

    Args:
        device: "cpu", "gpu" or "gpu<id>".
        threads: Value exported as OMP_NUM_THREADS. When None the variable
            is left as inherited, so callers that pass only ``device``
            keep working.
    """
    import sys
    import dlm.io.logging as L

    xassert(device == "cpu" or device.startswith("gpu"),
            "The device can only be 'cpu', 'gpu' or 'gpu<id>'")
    if device.startswith("gpu") and len(device) > 3:
        try:
            gpu_id = int(device[3:])
            if not is_gpu_free(gpu_id):
                L.warning('The selected GPU (GPU' + str(gpu_id) + ') is apparently busy.')
        except ValueError:
            L.error("Unknown GPU device format: " + device)
    if device.startswith("gpu"):
        L.warning('Running on GPU yields non-deterministic results.')
    xassert('theano' not in sys.modules,
            "dlm.utils.set_theano_device() function cannot be called after importing theano")

    if threads is not None:
        os.environ['OMP_NUM_THREADS'] = str(threads)
    theano_flags = [
        'device=' + device,
        'force_device=True',
        'floatX=float32',
        'warn_float64=warn',
        'cast_policy=numpy+floatX',
        # 'allow_gc=True',
        'print_active_device=False',
        'exception_verbosity=high',  # Highly verbose debugging
        'mode=FAST_RUN',
        # True: makes div and sqrt faster at the cost of precision, and possible bugs
        'nvcc.fastmath=False',
        # 'optimizer_including=cudnn',  # Enable only if CUDNN is available
    ]
    os.environ['THEANO_FLAGS'] = ','.join(theano_flags)

    try:
        import theano
    except EnvironmentError:
        L.exception()
    else:
        # Report the device only when the import succeeded; otherwise the
        # name `theano` would be unbound here.
        # NOTE(review): this is an exact match on "gpu"; if theano reports
        # "gpu<id>" the plain branch is taken -- confirm that is intended.
        if theano.config.device == "gpu":
            L.info(
                "Device: " + theano.config.device.upper() + " "
                + str(theano.sandbox.cuda.active_device_number())
                + " (" + str(theano.sandbox.cuda.active_device_name()) + ")"
            )
        else:
            L.info("Device: " + theano.config.device.upper())
def next_item(self):
    """Read the next line of the n-best file and parse it into an NBestItem.

    A line consists of fields separated by "|||": index, hypothesis,
    features, score and, optionally, phrase alignments and word alignments.

    Returns:
        NBestItem built from the parsed fields; the two alignment fields
        are None when absent.

    Raises:
        StopIteration: When the file is exhausted (the reader is closed
            first).
        IndexError: When the line has fewer than four fields.
    """
    U.xassert(self.mode == 'r', "next() method can only be used in 'r' mode")
    try:
        # The next() builtin works for both Python 2 and Python 3 iterators.
        line = next(self.nbest_file)
    except StopIteration:
        self.close()
        raise
    segments = line.split("|||")
    try:
        index = int(segments[0])
    except ValueError:
        L.error("The first segment in an n-best list must be an integer")
    hyp = segments[1].strip()
    features = segments[2].strip()
    score = segments[3].strip()
    phrase_alignments = segments[4].strip() if len(segments) > 4 else None
    word_alignments = segments[5].strip() if len(segments) > 5 else None
    return NBestItem(index, hyp, features, score, phrase_alignments, word_alignments)
def set_theano_device(device, threads=None):
    """Export the theano configuration, import theano and log the device.

    Has to be called before theano is imported, because the settings are
    passed through the THEANO_FLAGS environment variable.

    Args:
        device: "cpu", "gpu" or "gpu<id>".
        threads: Value exported as OMP_NUM_THREADS. When None the variable
            is left as inherited, so callers that pass only ``device``
            keep working.
    """
    import sys
    import dlm.io.logging as L

    xassert(device == "cpu" or device.startswith("gpu"),
            "The device can only be 'cpu', 'gpu' or 'gpu<id>'")
    if device.startswith("gpu") and len(device) > 3:
        try:
            gpu_id = int(device[3:])
            if not is_gpu_free(gpu_id):
                L.warning('The selected GPU (GPU' + str(gpu_id) + ') is apparently busy.')
        except ValueError:
            L.error("Unknown GPU device format: " + device)
    if device.startswith("gpu"):
        L.warning('Running on GPU yields non-deterministic results.')
    xassert('theano' not in sys.modules,
            "dlm.utils.set_theano_device() function cannot be called after importing theano")

    if threads is not None:
        os.environ['OMP_NUM_THREADS'] = str(threads)
    theano_flags = [
        'device=' + device,
        'force_device=True',
        'floatX=float32',
        'warn_float64=warn',
        'cast_policy=numpy+floatX',
        # 'cuda.root=/usr/local/cuda',
        # 'allow_gc=True',
        'print_active_device=False',
        'exception_verbosity=high',  # Highly verbose debugging
        'mode=FAST_RUN',
        # True: makes div and sqrt faster at the cost of precision, and possible bugs
        'nvcc.fastmath=False',
        # 'optimizer_including=cudnn',  # Enable only if CUDNN is available
    ]
    os.environ['THEANO_FLAGS'] = ','.join(theano_flags)

    try:
        import theano
    except EnvironmentError:
        L.exception()
    else:
        # Report the device only when the import succeeded; otherwise the
        # name `theano` would be unbound here.
        # NOTE(review): this is an exact match on "gpu"; if theano reports
        # "gpu<id>" the plain branch is taken -- confirm that is intended.
        if theano.config.device == "gpu":
            L.info(
                "Device: " + theano.config.device.upper() + " "
                + str(theano.sandbox.cuda.active_device_number())
                + " (" + str(theano.sandbox.cuda.active_device_name()) + ")"
            )
        else:
            L.info("Device: " + theano.config.device.upper())
## Creating model # L.info('Building the model') args.vocab_size = trainset.get_vocab_size() args.ngram_size = trainset.get_ngram_size() args.num_classes = trainset.get_num_classes() classifier = MLP(args) L.info('Parameters: ' + str(classifier.params)) ######################### ## Training criterion # if args.loss_function == "nll": from dlm.criterions.weighted_nll import NegLogLikelihood criterion = NegLogLikelihood(classifier, args) elif args.loss_function == "nce": from dlm.criterions.nce import NCELikelihood noise_dist = trainset.get_unigram_model() criterion = NCELikelihood(classifier, args, noise_dist) else: L.error('Invalid loss function \'' + args.loss_function + '\'') ######################### ## Training # dlm.trainer.train(classifier, criterion, args, trainset, devset, testset)
mode = 2 N = int(args.command[7:]) # Nth feature output = open(args.output_path, mode='w') elif args.command.startswith('correl'): mode = 3 N = int(args.command[6:]) # Nth feature U.xassert(args.oracle, "correlN command needs a file (-s) containing oracle scores") with open(args.oracle, mode='r') as oracles_file: oracles = map(float, oracles_file.read().splitlines()) #output = open(args.output_path, mode='w') elif args.command.startswith('augment'): U.set_theano_device(args.device) from dlm.reranker import augmenter augmenter.augment(args.model_path, args.input_path, args.vocab_path, args.output_path) else: L.error('Invalid command: ' + args.command) counter = 0 features = [] for group in input_nbest: if mode == 0: for i in range(min(N, group.size())): output_nbest.write(group[i]) elif mode == 1: output_1best.write(group[0].hyp + "\n") elif mode == 2: for i in range(group.size()): features = group[i].features.split() output.write(features[N] + "\n") elif mode == 3: for i in range(group.size()):
# The model reads the feature description from `args`, so it is copied over
# from the training set before the MLP is constructed.
args.features_info = trainset.get_features_info()
classifier = MLP(args)

L.info('Parameters: ' + str(classifier.params))

# Optional warm start: copy parameter values from a previously saved model.
# Parameters are paired by position, so zip() stops at the shorter list.
# NOTE(review): presumably both models share the same architecture -- confirm.
if args.base_model_path is not None:
    initialization_classifier = MLP(model_path=args.base_model_path)
    for param, aparam in zip(classifier.params, initialization_classifier.params):
        param.set_value(aparam.get_value())

#########################
## Training criterion
#

# The criterion class is imported lazily, only for the selected loss.
if args.loss_function == "nll":
    from dlm.criterions.nll import NegLogLikelihood
    criterion = NegLogLikelihood(classifier, args)
elif args.loss_function == "nce":
    from dlm.criterions.nce import NCELikelihood
    # NCE additionally takes the training set's unigram model as its noise
    # distribution.
    noise_dist = trainset.get_unigram_model()
    criterion = NCELikelihood(classifier, args, noise_dist)
else:
    L.error('Invalid loss function \'' + args.loss_function + '\'')

#########################
## Training
#

dlm.trainer.train(classifier, criterion, args, trainset, devset, testset)
def train(classifier, criterion, args, trainset, devset, testset=None):
    """Run the epoch/minibatch training loop with periodic evaluation.

    The model is evaluated before training, every 5000 minibatches, at the
    end of every epoch and once more after the last epoch. A snapshot is
    saved to ``args.out_dir`` before training and after every epoch.

    Args:
        classifier: Model to train; must provide ``save_model``.
        criterion: Training criterion handed to the trainer.
        args: Options (algorithm, learning_rate, clip_threshold, num_epochs,
            out_dir, enable_lr_adjust).
        trainset: Training data; provides ``get_num_batches``.
        devset: Development data handed to the evaluation hook.
        testset: Optional test data handed to the evaluation hook.
    """
    if args.algorithm == "sgd":
        from dlm.algorithms.sgd import SGD as Trainer
    else:
        L.error("Invalid training algorithm: " + args.algorithm)

    validation_frequency = 5000  # minibatches
    verbose_freq = 1000  # minibatches

    # Number of minibatches in the training file
    batch_count = trainset.get_num_batches()

    trainer = Trainer(classifier, criterion, args.learning_rate, trainset,
                      clip_threshold=args.clip_threshold)

    # Adjusts the learning rate based on the development/validation results
    base_lr = args.learning_rate
    lr_tuner = LRTuner(low=0.01*base_lr, high=10*base_lr, inc=0.01*base_lr)

    # Logging and statistics
    total_num_iter = args.num_epochs * batch_count
    hook = Hook(classifier, devset, testset, total_num_iter, args.out_dir)

    def log_progress(done, cost_total, divisor):
        # One progress line: timestamp, position, running mean cost, learning rate.
        stamp = U.blue("[" + time.ctime() + "] ")
        L.info(stamp + '%i/%i, cost=%.2f, lr=%f' % (
            done, batch_count, cost_total/divisor, trainer.get_learning_rate()))

    L.info('Training')
    start_time = time.time()

    hook.evaluate(0)
    classifier.save_model(args.out_dir + '/model.epoch_0.gz', zipped=True)

    epoch = 0
    while epoch < args.num_epochs:
        epoch += 1
        L.info("Epoch: " + U.red(epoch))

        cost_total = 0
        for batch_idx in xrange(batch_count):
            # Update the parameters with this minibatch
            batch_cost, gparams = trainer.step(batch_idx)
            cost_total += batch_cost

            if batch_idx % verbose_freq == 0:
                norm_strings = ['%.6f' % np.linalg.norm(gparam) for gparam in gparams]
                log_progress(batch_idx, cost_total, batch_idx + 1)
                L.info('Grad Norms: [' + ', '.join(norm_strings) + ']')

            curr_iter = (epoch - 1) * batch_count + batch_idx
            if curr_iter > 0 and curr_iter % validation_frequency == 0:
                hook.evaluate(curr_iter)

        # End of epoch: report, evaluate, adapt the learning rate, snapshot.
        log_progress(batch_count, cost_total, batch_count)
        dev_ppl = hook.evaluate(curr_iter)
        lr = trainer.get_learning_rate()
        if args.enable_lr_adjust:
            lr = lr_tuner.adapt_lr(dev_ppl, lr)
        trainer.set_learning_rate(lr)
        classifier.save_model(args.out_dir + '/model.epoch_' + str(epoch) + '.gz', zipped=True)

    end_time = time.time()
    hook.evaluate(total_num_iter)
    L.info('Optimization complete')
    L.info('Ran for %.2fm' % ((end_time - start_time) / 60.))
output = open(args.output_path, mode='w') elif args.command.startswith('correl'): mode = 3 N = int(args.command[6:]) # Nth feature U.xassert(args.oracle, "correlN command needs a file (-s) containing oracle scores") with open(args.oracle, mode='r') as oracles_file: oracles = map(float, oracles_file.read().splitlines()) #output = open(args.output_path, mode='w') elif args.command.startswith('augment'): U.set_theano_device(args.device) from dlm.reranker import augmenter augmenter.augment(args.model_path, args.input_path, args.vocab_path, args.output_path) else: L.error('Invalid command: ' + args.command) counter = 0 features = [] for group in input_nbest: if mode == 0: for i in range(min(N, group.size())): output_nbest.write(group[i]) elif mode == 1: output_1best.write(group[0].hyp + "\n") elif mode == 2: for i in range(group.size()): features = group[i].features.split() output.write(features[N] + "\n") elif mode == 3: for i in range(group.size()):
for tindex, sindex_list in enumerate(trg_aligns): if sindex_list == []: # No Alignment for the target token, look at nearby tokens, giving preference to right r_tindex = tindex + 1 l_tindex = tindex - 1 while r_tindex < len(ttokens) or l_tindex >= 0: if r_tindex < len(ttokens) and trg_aligns[r_tindex]: sindex_list = trg_aligns[r_tindex] break if l_tindex >= 0 and trg_aligns[l_tindex]: sindex_list = trg_aligns[l_tindex] break r_tindex = r_tindex + 1 l_tindex = l_tindex - 1 if sindex_list == []: L.error("No alignments in line " + sentence_count) mid = (len(sindex_list) - 1) / 2 # Middle of the source alignments sindex_align = sorted(sindex_list)[mid] src_ngrams = [] trg_ngrams = [] ngram_idx = [] # Get source context for i in range(max(0, args.src_context - sindex_align)): src_ngrams.append("<s>") src_ngrams = src_ngrams + stokens[ max(0, sindex_align - args.src_context):sindex_align + args.src_context + 1]
parser.add_argument("-iv", "--init-value", dest="init_value", default='0.05',
                    help="The initial value of the feature")
parser.add_argument("-n", "--no-aug", dest="no_aug", action='store_true',
                    help="Augmentation will be skipped, if this flag is set")
parser.add_argument("-a", "--tuning-algorithm", dest="alg", default='mert',
                    help="Tuning Algorithm (mert|pro|wpro)")
parser.add_argument("-w", "--instance-weights", dest="instance_weights_path",
                    help="Instance weights for wpro algorithm")
parser.add_argument("-s", "--predictable-seed", dest="pred_seed", action='store_true',
                    help="Tune with predictable seed to avoid randomness")
args = parser.parse_args()

# NOTE(review): the utility is declared as set_theano_device(device, threads)
# elsewhere; confirm this one-argument call matches its current signature.
U.set_theano_device(args.device)

# Imported only after the device has been configured.
# NOTE(review): presumably these modules import theano -- confirm.
from dlm.reranker import augmenter
from dlm.reranker import mosesIniReader as iniReader

# `in` replaces dict.has_key(), which no longer exists in Python 3.
if 'MOSES_ROOT' in os.environ:
    moses_root = os.environ['MOSES_ROOT']
else:
    L.error("Set MOSES_ROOT variable to your moses root directory")

U.mkdir_p(args.out_dir)

#cmd = moses_root + '/bin/moses -show-weights -f ' + args.input_config + ' 2> /dev/null'
#features = U.capture(cmd).strip().split('\n')
features = iniReader.parseIni(args.input_config)

output_nbest_path = args.out_dir + '/augmented.nbest'

# With --no-aug the input n-best list is copied unchanged; otherwise it is
# passed through the augmenter together with the model and vocabulary.
if args.no_aug:
    shutil.copy(args.input_nbest, output_nbest_path)
else:
    augmenter.augment(args.model_path, args.input_nbest, args.vocab_path, output_nbest_path)

L.info('Extracting stats and features')
for tindex, sindex_list in enumerate(trg_aligns): if sindex_list == []: # No Alignment for the target token, look at nearby tokens, giving preference to right r_tindex = tindex + 1 l_tindex = tindex - 1 while r_tindex < len(ttokens) or l_tindex >=0: if r_tindex < len(ttokens) and trg_aligns[r_tindex]: sindex_list = trg_aligns[r_tindex] break if l_tindex >= 0 and trg_aligns[l_tindex]: sindex_list = trg_aligns[l_tindex] break r_tindex = r_tindex + 1 l_tindex = l_tindex - 1 if sindex_list == []: L.error("No alignments in line " + sentence_count) mid = (len(sindex_list)-1)/2 # Middle of the source alignments sindex_align = sorted(sindex_list)[mid] src_ngrams = [] trg_ngrams = [] ngram_idx = [] # Get source context for i in range(max(0, args.src_context - sindex_align)): src_ngrams.append("<s>") src_ngrams = src_ngrams + stokens[max(0, sindex_align - args.src_context): sindex_align + args.src_context + 1] for i in range(max(0, sindex_align + args.src_context + 1 - len(stokens))): src_ngrams.append("</s>")
def xassert(condition, message):
    """Report ``message`` through the project logger when ``condition`` is falsy.

    Args:
        condition: Value tested for truthiness.
        message: Text passed to ``L.error`` when the check fails.
    """
    if condition:
        return
    # The logger is imported only on failure, as in the original.
    import dlm.io.logging as L
    L.error(message)
except ValueError: return False counter = 0 for group in input_aug_nbest: index = 0 scores = dict() for item in group: features = np.asarray( [x for x in item.features.split() if is_number(x)], dtype=float) try: scores[index] = np.dot(features, weights) except ValueError: L.error( 'Number of features in the nbest and the weights file are not the same' ) index += 1 sorted_indices = sorted(scores, key=scores.get, reverse=True) for idx in sorted_indices: output_nbest.write(group[idx]) output_1best.write(group[sorted_indices[0]].hyp + "\n") counter += 1 if counter % 100 == 0: L.info("%i groups processed" % (counter)) L.info("Finished processing %i groups" % (counter)) output_nbest.close() output_1best.close() if args.clean_up:
def set_theano_device(device, threads=None):
    """Export the theano configuration and initialise the backend wrapper.

    Has to be called before theano is imported, because the settings are
    passed through the THEANO_FLAGS environment variable.

    Args:
        device: "cpu", "gpu" or "gpu<id>".
        threads: Value exported as OMP_NUM_THREADS. When None the variable
            is left as inherited, so callers that pass only ``device``
            keep working.

    Raises:
        TypeError: If the backend wrapper cannot be switched to the
            'tensorflow' platform or cannot create a test variable.
    """
    import sys
    import dlm.io.logging as L

    xassert(device == "cpu" or device.startswith("gpu"),
            "The device can only be 'cpu', 'gpu' or 'gpu<id>'")
    if device.startswith("gpu") and len(device) > 3:
        try:
            gpu_id = int(device[3:])
            if not is_gpu_free(gpu_id):
                L.warning('The selected GPU (GPU' + str(gpu_id) + ') is apparently busy.')
        except ValueError:
            L.error("Unknown GPU device format: " + device)
    if device.startswith("gpu"):
        L.warning('Running on GPU yields non-deterministic results.')
    xassert('theano' not in sys.modules,
            "dlm.utils.set_theano_device() function cannot be called after importing theano")

    if threads is not None:
        os.environ['OMP_NUM_THREADS'] = str(threads)
    theano_flags = [
        'device=' + device,
        'force_device=True',
        'floatX=float32',
        'warn_float64=warn',
        'cast_policy=numpy+floatX',
        # 'allow_gc=True',
        'print_active_device=False',
        'exception_verbosity=high',  # Highly verbose debugging
        'mode=FAST_RUN',
        # True: makes div and sqrt faster at the cost of precision, and possible bugs
        'nvcc.fastmath=False',
        # 'optimizer_including=cudnn',  # Enable only if CUDNN is available
    ]
    os.environ['THEANO_FLAGS'] = ','.join(theano_flags)

    # The backend wrapper is imported in place of theano itself.
    try:
        import backend.nn_wrapper as K
    except EnvironmentError:
        L.exception()

    try:
        K.set_platform('tensorflow')
        L.info("Creating a variable inside utils")
        import numpy as np
        # Smoke test: the platform must be able to wrap a small array.
        K.variable(np.random.random((4, 2)))
    except Exception:
        # Callers still see TypeError, but KeyboardInterrupt/SystemExit are
        # no longer converted into it.
        sys.stderr.write("Unexpected error: %s\n" % (sys.exc_info(),))
        raise TypeError("Cannot set the platform")
parser.add_argument("-s", "--predictable-seed", dest="pred_seed", action='store_true',
                    help="Tune with predictable seed to avoid randomness")
args = parser.parse_args()

# NOTE(review): the utility is declared as set_theano_device(device, threads)
# elsewhere; confirm this one-argument call matches its current signature.
U.set_theano_device(args.device)

# Imported only after the device has been configured.
# NOTE(review): presumably these modules import theano -- confirm.
from dlm.reranker import augmenter
from dlm.reranker import mosesIniReader as iniReader

# `in` replaces dict.has_key(), which no longer exists in Python 3.
if 'MOSES_ROOT' in os.environ:
    moses_root = os.environ['MOSES_ROOT']
else:
    L.error("Set MOSES_ROOT variable to your moses root directory")

U.mkdir_p(args.out_dir)

#cmd = moses_root + '/bin/moses -show-weights -f ' + args.input_config + ' 2> /dev/null'
#features = U.capture(cmd).strip().split('\n')
features = iniReader.parseIni(args.input_config)

output_nbest_path = args.out_dir + '/augmented.nbest'

# With --no-aug the input n-best list is copied unchanged; otherwise it is
# passed through the augmenter together with the model and vocabulary.
if args.no_aug:
    shutil.copy(args.input_nbest, output_nbest_path)
else:
    augmenter.augment(args.model_path, args.input_nbest, args.vocab_path, output_nbest_path)
try: float(s) return True except ValueError: return False counter = 0 for group in input_aug_nbest: index = 0 scores = dict() for item in group: features = np.asarray([x for x in item.features.split() if is_number(x)], dtype=float) try: scores[index] = np.dot(features, weights) except ValueError: L.error('Number of features in the nbest and the weights file are not the same') index += 1 sorted_indices = sorted(scores, key=scores.get, reverse=True) for idx in sorted_indices: output_nbest.write(group[idx]) output_1best.write(group[sorted_indices[0]].hyp + "\n") counter += 1 if counter % 100 == 0: L.info("%i groups processed" % (counter)) L.info("Finished processing %i groups" % (counter)) output_nbest.close() output_1best.close() if args.clean_up: os.remove(output_nbest_path)