def main():
    """Evaluate Recall@k: the fraction of inputs whose gold translation
    appears among the model's top-k candidates (after UNK replacement).

    Reads three parallel files: the symbolized input (`args.input`), its
    de-UNKed counterpart (`args.input + '.nounk'`), and the de-UNKed gold
    translations (`args.gold + '.nounk'`).
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    args = setup_args()
    logging.info(args)

    tm = TranslationModel(args.model)
    input_lines_symbols = codecs.open(args.input, 'r', 'utf-8')
    input_lines = codecs.open(args.input + '.nounk', 'r', 'utf-8')
    gold_lines = codecs.open(args.gold + '.nounk', 'r', 'utf-8')

    index = 0
    found = 0
    for input_line, input_line_symbols, gold_line in zip(
            input_lines, input_lines_symbols, gold_lines):
        # Map UNK placeholder symbols back to the original surface words.
        unk_map = build_unk_map(input_line_symbols, input_line)
        translations_with_scores = tm.translate(input_line_symbols, k=args.k)
        translations = [data[1] for data in translations_with_scores]
        translations_replaced = [replace_symbols(translation, unk_map)
                                 for translation in translations]
        match_index = find_match(gold_line, translations_replaced)
        logging.info('Index: %d Match: %d' % (index, match_index))
        if match_index != -1:
            found += 1
        index += 1

    # Bug fix: the original divided by `index` unconditionally and crashed
    # with ZeroDivisionError when the input files were empty.
    if index == 0:
        logging.info('No input lines; Recall@%d is undefined.' % args.k)
        return
    recall_k = found / float(index)
    logging.info('Recall@%d: %f (%d/%d)' % (args.k, recall_k, found, index))
def _create_tm_from_counts(self):
    """Build a TranslationModel from the accumulated harmony->melody counts.

    For each harmony note observed with more than two distinct melody
    notes, adds every (melody, harmony) pair with its relative frequency
    as the translation probability.
    """
    model = TranslationModel(harmony_part=self._harmony_part,
                             melody_part=self._melody_part)
    for harmony_note in self._tm_counts:
        melody_counts = self._tm_counts[harmony_note]
        total = sum(melody_counts.values())
        # Skip sparsely observed harmony notes (two or fewer distinct
        # melody notes harmonized) — not enough evidence for estimates.
        if len(melody_counts) <= 2:
            continue
        for melody_note, count in melody_counts.items():
            model.add_to_model(melody_note, harmony_note,
                               count / float(total), model._tm_phrases)
    return model
def __init__(self, translation_model, production_model):
    """Construct a translator from model objects, or from raw text.

    Args:
        translation_model: a TranslationModel instance, or (convenience
            path) the source-language training text as a string.
        production_model: an EnglishModel instance, or (convenience path)
            the target-language training text as a string.
    """
    self.translation_model = translation_model
    self.production_model = production_model
    self.filter_max = 16
    # FIXME: problem with words being elided too much
    self.null_prior = 0.00007
    self.phi2_prior = 1.0
    # FIXME: Refactor this constructor
    # Fixed: use isinstance() instead of the `type(x) == type(str())`
    # anti-pattern (also accepts str subclasses).
    if isinstance(translation_model, str) and isinstance(production_model, str):
        # Both arguments are raw corpora: build the models here.
        self.production_model = EnglishModel(production_model)
        tm = TranslationModel()
        tm.learn_from_text(translation_model, production_model)
        self.translation_model = tm
def score_fr_en_europarl():
    """Benchmark FR->EN translation on a small Europarl sample.

    Splits the first `num_lines` sentence pairs 75/25 into learn/eval
    sets (every 4th pair is held out), trains a TranslationModel on the
    learn set, then prints the average TranslationScore over the eval
    set and the elapsed wall time.

    NOTE: Python 2 code (print statements; `zip()` returns a list, so
    the slice below is valid).
    """
    print "\nFR->EN Europarl:"
    # max lines is 300 at the moment
    num_lines = 300
    num_chars = 50  # each sentence is truncated to this many characters
    tstart = time.time()
    en_lines = get_europarl_en_lines()
    fr_lines = get_europarl_fr_lines()
    en_learn_set = []
    fr_learn_set = []
    en_eval_set = []
    fr_eval_set = []
    for index, pair in enumerate(zip(en_lines, fr_lines)[:num_lines]):
        pair0 = pair[0][:num_chars]
        pair1 = pair[1][:num_chars]
        if index % 4 == 0:
            # every 4th pair goes to the held-out evaluation set
            en_eval_set.append(pair0)
            fr_eval_set.append(pair1)
        else:
            en_learn_set.append(pair0)
            fr_learn_set.append(pair1)
    fr_text = '\n'.join(fr_learn_set)
    en_text = '\n'.join(en_learn_set)
    trx_model = TranslationModel()
    # English language model trained on Austen's Emma (NLTK corpus file)
    english = EnglishModel(['austen-emma.txt'])
    trx_model.learn_from_text(fr_text, en_text)
    translator = Translator(trx_model, english)
    scorer = TranslationScore()
    n = 0
    score = 0
    for xfr, xen in zip(fr_eval_set, en_eval_set):
        trx_en = translator.translate(xfr)
        n += 1
        score += scorer.of(trx_en, xen)
    avg_score = float(score) / n
    print "lines: ", num_lines, ", chars: ", num_chars
    print "Translation score: {:0.2f}".format(avg_score) + " (of " + str(
        n) + " comparisons)"
    tend = time.time()
    print tdiff(tstart, tend) + " seconds elapsed."
def main():
    """Generate SVM-rank training data from k-best translations.

    For each source sentence, translates it, scores every candidate
    against the gold line with BLEU (after UNK replacement), and writes
    the ranked candidates as one SVM-rank training group. Sentences
    whose candidates all score 0 BLEU are skipped (counted).
    """
    args = setup_args()
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(args)

    src_lines = codecs.open(args.source, 'r', 'utf-8').readlines()
    src_lines_nounk = codecs.open(args.source + args.suffix, 'r',
                                  'utf-8').readlines()
    gold_lines = codecs.open(args.gold + args.suffix, 'r', 'utf-8').readlines()

    fw = codecs.open(args.model + SVM_RANK_DATA, 'w', 'utf-8')
    tm = TranslationModel(args.model)

    num_all_zeros = 0
    train_id = 0
    for sentence_idx, (src_line, src_line_nounk, gold_line) in enumerate(
            zip(src_lines, src_lines_nounk, gold_lines)):
        translations = tm.translate(src_line, k=args.num)
        logging.info('Source_line: %s' % src_line_nounk)
        logging.info('Gold_line: %s' % gold_line)

        # Map UNK placeholder symbols back to the original surface words.
        unk_map = build_unk_map(src_line, src_line_nounk)
        logging.info('UNK_map: %s' % str(unk_map))

        scores = []
        translations_nounk = []
        for idx, translation in enumerate(translations):
            translation_nounk = replace_symbols(translation[1], unk_map)
            translations_nounk.append(translation_nounk)
            bleu_nounk = get_bleu_score(gold_line, translation_nounk)
            scores.append(bleu_nounk)

        # No candidate earned any BLEU credit: nothing to rank.
        if sum(scores) == 0.0:
            num_all_zeros += 1
            continue

        # Candidate indices sorted by BLEU, best first.
        scores_index = sorted(range(len(scores)), key=lambda k: scores[k],
                              reverse=True)
        write_train_data(fw, sentence_idx, train_id, translations_nounk,
                         scores, scores_index, src_line_nounk)
        # Bug fix: was `train_id += 0`, which left every training group
        # with the same id (0) in the SVM-rank file.
        train_id += 1

        for index in scores_index:
            logging.info('Tr: %d Text:%s Pr:%f BLEU:%f' % (
                index, translations[index][1], translations[index][0],
                scores[index]))
    logging.info('Num all zeros: %d' % num_all_zeros)
    fw.close()  # fix: the output file was never closed/flushed
def main():
    """Translate each input line and write one chosen candidate per line.

    With --all, picks the candidate with the best BLEU against the input
    (oracle selection); otherwise takes the model's top-ranked candidate.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    args = setup_args()
    logging.info(args)

    tm = TranslationModel(args.model)
    fw_out = codecs.open(args.out, 'w', 'utf-8')
    for line_num, input_line in enumerate(codecs.open(args.input, 'r', 'utf-8')):
        candidates = tm.translate(input_line.strip(), k=20)
        if args.all:
            # Oracle mode: choose the best-scoring candidate.
            index, best_bleu_score = find_best_translation(input_line, candidates)
        else:
            # Default: take the model's top-ranked candidate.
            index, best_bleu_score = 0, -1.0
        logging.info('Line:%d best_index:%d best_bleu:%f' % (
            line_num, index, best_bleu_score))
        fw_out.write(candidates[index][1] + '\n')
    fw_out.close()
def of(self, actual_text, expected_text):
    """Score how well `actual_text` matches `expected_text` in [0, 1].

    The score is the product of a precision-like term (fraction of
    actual tokens present in the expected tokens) and a recall-like term
    (1 minus the fraction of expected tokens missing from the actual
    tokens, normalized by the longer token list). Tokenization is
    delegated to TranslationModel.preprocess.

    Returns 1.0 when the expected text has no tokens, and 0.0 when only
    the actual text has no tokens.
    """
    tm = TranslationModel()
    actual_parts = tm.preprocess(actual_text)
    expected_parts = tm.preprocess(expected_text)
    if len(expected_parts) == 0:
        return 1.0
    # Bug fix: the original divided by len(actual_parts) unconditionally
    # and raised ZeroDivisionError for an empty translation. An empty
    # candidate matches nothing, so score it 0.
    if len(actual_parts) == 0:
        return 0.0
    actual_in_expected = 0
    for actual in actual_parts:
        if actual in expected_parts:
            actual_in_expected += 1
    true_positive = actual_in_expected / float(len(actual_parts))
    expected_not_in_actual = 0
    for expected in expected_parts:
        if expected not in actual_parts:
            expected_not_in_actual += 1
    maxlen = max(len(expected_parts), len(actual_parts))
    true_negative = 1.0 - expected_not_in_actual / float(maxlen)
    return true_positive * true_negative
def main():
    """Translate input lines and dump (source, translation, gold) rows.

    Writes a CSV of source / top translation / gold standard, plus a
    plain-text file containing only the top translations.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    args = setup_args()
    logging.info(args)

    tm = TranslationModel(args.model)
    f = codecs.open('%s-%s.csv' % (args.out, args.suffix), 'w')
    # NOTE(review): the stdlib csv.writer takes no `encoding` kwarg — this
    # presumably relies on a drop-in replacement such as unicodecsv being
    # imported as `csv`; confirm against the file's imports.
    csv_f = csv.writer(f, delimiter=',', encoding='utf-8')
    csv_f.writerow(['Src', 'Target', 'Gold Standard'])

    input_lines = codecs.open(args.input, 'r', 'utf-8').readlines()
    gold_lines = codecs.open(args.gold, 'r', 'utf-8').readlines()
    # Bug fix: the format arguments were never applied, so the sentences
    # file was literally named '%s-%s-sents.out'.
    fw_sents = codecs.open('%s-%s-sents.out' % (args.out, args.suffix),
                           'w', 'utf-8')

    for input_line, gold_line in zip(input_lines, gold_lines):
        results = tm.translate(input_line.strip())
        # results[0] is the top-scoring candidate; [1] is its text.
        csv_f.writerow([input_line.strip(), results[0][1], gold_line.strip()])
        fw_sents.write(results[0][1] + '\n')

    # Fix: both output files were left unclosed.
    fw_sents.close()
    f.close()
def main():
    """Measure Recall@k: how often the gold translation shows up among
    the model's top-k candidates after UNK symbols are replaced."""
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    args = setup_args()
    logging.info(args)

    tm = TranslationModel(args.model)
    symbol_lines = codecs.open(args.input, 'r', 'utf-8')
    plain_lines = codecs.open(args.input + '.nounk', 'r', 'utf-8')
    gold_file = codecs.open(args.gold + '.nounk', 'r', 'utf-8')

    total = 0
    hits = 0
    for plain, symbols, gold in zip(plain_lines, symbol_lines, gold_file):
        # Recover the surface words hidden behind UNK placeholders.
        unk_map = build_unk_map(symbols, plain)
        candidates = tm.translate(symbols, k=args.k)
        replaced = [replace_symbols(cand[1], unk_map) for cand in candidates]
        match_index = find_match(gold, replaced)
        logging.info('Index: %d Match: %d' % (total, match_index))
        if match_index != -1:
            hits += 1
        total += 1

    recall_k = hits / float(total)
    logging.info('Recall@%d: %f (%d/%d)' % (args.k, recall_k, hits, total))
def main():
    """Translate input lines and dump (source, translation, gold) rows.

    Writes a CSV of source / top translation / gold standard, plus a
    plain-text file containing only the top translations.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    args = setup_args()
    logging.info(args)

    tm = TranslationModel(args.model)
    f = codecs.open('%s-%s.csv' % (args.out, args.suffix), 'w')
    # NOTE(review): the stdlib csv.writer takes no `encoding` kwarg — this
    # presumably relies on a drop-in replacement such as unicodecsv being
    # imported as `csv`; confirm against the file's imports.
    csv_f = csv.writer(f, delimiter=',', encoding='utf-8')
    csv_f.writerow(['Src', 'Target', 'Gold Standard'])

    input_lines = codecs.open(args.input, 'r', 'utf-8').readlines()
    gold_lines = codecs.open(args.gold, 'r', 'utf-8').readlines()
    # Bug fix: the format arguments were never applied, so the sentences
    # file was literally named '%s-%s-sents.out'.
    fw_sents = codecs.open('%s-%s-sents.out' % (args.out, args.suffix),
                           'w', 'utf-8')

    for input_line, gold_line in zip(input_lines, gold_lines):
        results = tm.translate(input_line.strip())
        # results[0] is the top-scoring candidate; [1] is its text.
        csv_f.writerow([input_line.strip(), results[0][1], gold_line.strip()])
        fw_sents.write(results[0][1] + '\n')

    # Fix: both output files were left unclosed.
    fw_sents.close()
    f.close()
def load_api_params(tf_config, graph, api_config="../config/api2nl.yaml"): params = {} # read config file and default config with open('../config/default.yaml') as f: default_config = utils.AttrDict(yaml.safe_load(f)) with open(api_config) as f: api_config = utils.AttrDict(yaml.safe_load(f)) # set default values for parameters that are not defined for k, v in default_config.items(): api_config.setdefault(k, v) api_config.checkpoint_dir = os.path.join(api_config.model_dir, 'checkpoints') #tasks = [api_config] #for task in tasks: # for parameter, value in api_config.items(): # task.setdefault(parameter, value) api_config.encoders = [ utils.AttrDict(encoder) for encoder in api_config.encoders ] api_config.decoders = [ utils.AttrDict(decoder) for decoder in api_config.decoders ] for encoder_or_decoder in api_config.encoders + api_config.decoders: for parameter, value in api_config.items(): encoder_or_decoder.setdefault(parameter, value) with tf.Session(config=tf_config, graph=graph) as sess: api_model = TranslationModel(**api_config) ckpt = tf.train.get_checkpoint_state(api_config.checkpoint_dir) saver = tf.train.Saver(tf.global_variables()) print("Reading api model parameters from %s" % ckpt.model_checkpoint_path) saver.restore(sess, ckpt.model_checkpoint_path) for v in tf.trainable_variables(): params[v.name] = sess.run(v.value()) return params
def main(args=None):
    """Entry point: configure, build, and run the translation model.

    Merges the YAML config with command-line overrides, sets up logging,
    seeds, and the TF device/session, builds the TranslationModel, loads
    or averages checkpoints, then dispatches on the requested action
    (save / decode / eval / align / train).

    Args:
        args: optional argv list for parser.parse_args (defaults to
            sys.argv when None).
    """
    args = parser.parse_args(args)
    # read config file and default config
    with open('../model/default.yaml') as f:
        default_config = utils.AttrDict(yaml.safe_load(f))
    with open(args.config) as f:
        config = utils.AttrDict(yaml.safe_load(f))
    if args.learning_rate is not None:
        args.reset_learning_rate = True
    # command-line parameters have higher precedence than config file
    for k, v in vars(args).items():
        if v is not None:
            config[k] = v
    # set default values for parameters that are not defined
    for k, v in default_config.items():
        config.setdefault(k, v)
    # if config.score_function:
    #     config.score_functions = evaluation.name_mapping[config.score_function]
    if args.crash_test:
        # crash-test mode: train on an empty dataset to smoke-test the setup
        config.max_train_size = 0
    if not config.debug:
        os.environ[
            'TF_CPP_MIN_LOG_LEVEL'] = '3'  # disable TensorFlow's debugging logs
    decoding_mode = any(arg is not None
                        for arg in (args.decode, args.eval, args.align))
    # enforce parameter constraints
    assert config.steps_per_eval % config.steps_per_checkpoint == 0, (
        'steps-per-eval should be a multiple of steps-per-checkpoint')
    assert decoding_mode or args.train or args.save, (
        'you need to specify at least one action (decode, eval, align, or train)'
    )
    assert not (args.average and args.ensemble)
    if args.train and args.purge:
        utils.log('deleting previous model')
        shutil.rmtree(config.model_dir, ignore_errors=True)
    os.makedirs(config.model_dir, exist_ok=True)
    # copy config file to model directory
    config_path = os.path.join(config.model_dir, 'config.yaml')
    if args.train and not os.path.exists(config_path):
        with open(args.config) as config_file, open(config_path,
                                                    'w') as dest_file:
            content = config_file.read()
            # rewrite model_dir in the copied config so it is self-contained
            content = re.sub(r'model_dir:.*?\n',
                             'model_dir: {}\n'.format(config.model_dir),
                             content,
                             flags=re.MULTILINE)
            dest_file.write(content)
    # also copy default config
    config_path = os.path.join(config.model_dir, 'default.yaml')
    if args.train and not os.path.exists(config_path):
        shutil.copy('../config/default.yaml', config_path)
    logging_level = logging.DEBUG if args.verbose else logging.INFO
    # always log to stdout in decoding and eval modes (to avoid overwriting
    # precious train logs)
    log_path = os.path.join(config.model_dir, config.log_file)
    logger = utils.create_logger(log_path if args.train else None)
    logger.setLevel(logging_level)
    utils.log('label: {}'.format(config.label))
    utils.log('description:\n {}'.format('\n '.join(
        config.description.strip().split('\n'))))
    utils.log(' '.join(sys.argv))  # print command line
    try:  # print git hash
        commit_hash = subprocess.check_output(['git', 'rev-parse',
                                               'HEAD']).decode().strip()
        utils.log('commit hash {}'.format(commit_hash))
    except:
        # NOTE(review): bare except deliberately swallows any failure here
        # (e.g. not a git repo); the hash is informational only
        pass
    utils.log('tensorflow version: {}'.format(tf.__version__))
    # log parameters
    utils.debug('program arguments')
    for k, v in sorted(config.items(), key=itemgetter(0)):
        utils.debug(' {:<20} {}'.format(k, pformat(v)))
    if isinstance(config.dev_prefix, str):
        config.dev_prefix = [config.dev_prefix]
    config.encoders = [utils.AttrDict(encoder) for encoder in config.encoders]
    config.decoders = [utils.AttrDict(decoder) for decoder in config.decoders]
    # each encoder/decoder inherits every top-level parameter it does not
    # already override
    for encoder_or_decoder in config.encoders + config.decoders:
        for parameter, value in config.items():
            encoder_or_decoder.setdefault(parameter, value)
    if args.max_output_len is not None:  # override decoder's max len
        config.decoders[0].max_len = args.max_output_len
    config.checkpoint_dir = os.path.join(config.model_dir, 'checkpoints')
    # setting random seeds
    if config.seed is None:
        config.seed = random.randrange(sys.maxsize)
    if config.tf_seed is None:
        config.tf_seed = random.randrange(sys.maxsize)
    utils.log('python random seed: {}'.format(config.seed))
    utils.log('tf random seed: {}'.format(config.tf_seed))
    random.seed(config.seed)
    tf.set_random_seed(config.tf_seed)
    device = None
    if config.no_gpu:
        device = '/cpu:0'
        device_id = None
    elif config.gpu_id is not None:
        device = '/gpu:{}'.format(config.gpu_id)
        device_id = config.gpu_id
    else:
        device_id = 0
    # hide other GPUs so that TensorFlow won't use memory on them
    os.environ['CUDA_VISIBLE_DEVICES'] = '' if device_id is None else str(
        device_id)
    tf_config = tf.ConfigProto(log_device_placement=False,
                               allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = config.allow_growth
    tf_config.gpu_options.per_process_gpu_memory_fraction = config.mem_fraction
    config.api_params = None
    api_graph = tf.Graph()
    transfer_graph = tf.Graph()
    if config.use_transfer:
        # utils.log("loading api params")
        # only pull the api model's weights when this model has no
        # checkpoint of its own yet (first transfer-learning run)
        ckpt = tf.train.get_checkpoint_state(config.checkpoint_dir)
        if not ckpt or not ckpt.model_checkpoint_path:
            utils.log("loading api params")
            config.api_params = load_api_params(tf_config=tf_config,
                                                graph=api_graph)

    def average_checkpoints(main_sess, sessions):
        # element-wise mean of every global variable across the sessions,
        # written back into main_sess
        for var in tf.global_variables():
            avg_value = sum(sess.run(var) for sess in sessions) / len(sessions)
            main_sess.run(var.assign(avg_value))

    with tf.Session(config=tf_config, graph=transfer_graph) as sess:
        global global_tf_config, global_transfer_graph
        global_tf_config = tf_config
        global_transfer_graph = transfer_graph
        utils.log('creating model')
        utils.log('using device: {}'.format(device))
        with tf.device(device):
            if config.weight_scale:
                if config.initializer == 'uniform':
                    initializer = tf.random_uniform_initializer(
                        minval=-config.weight_scale,
                        maxval=config.weight_scale)
                else:
                    initializer = tf.random_normal_initializer(
                        stddev=config.weight_scale)
            else:
                initializer = None
            tf.get_variable_scope().set_initializer(initializer)
            # exempt from creating gradient ops
            config.decode_only = decoding_mode
            model = TranslationModel(**config)
        # count parameters
        # not counting parameters created by training algorithm (e.g. Adam)
        variables = [
            var for var in tf.global_variables()
            if not var.name.startswith('gradients')
        ]
        utils.log('model parameters ({})'.format(len(variables)))
        parameter_count = 0
        for var in sorted(variables, key=lambda var: var.name):
            utils.log(' {} {}'.format(var.name, var.get_shape()))
            # product of the variable's dimensions = its parameter count
            v = 1
            for d in var.get_shape():
                v *= d.value
            parameter_count += v
        utils.log('number of parameters: {:.2f}M'.format(parameter_count /
                                                         1e6))
        best_checkpoint = os.path.join(config.checkpoint_dir, 'best')
        params = {
            'variable_mapping': config.variable_mapping,
            'reverse_mapping': config.reverse_mapping
        }
        if config.ensemble and len(config.checkpoints) > 1:
            # ensemble decoding: each checkpoint becomes one ensemble member
            model.initialize(config.checkpoints, **params)
        elif config.average and len(config.checkpoints) > 1:
            # checkpoint averaging: load each checkpoint in its own session,
            # then average the weights into the main session
            model.initialize(reset=True)
            sessions = [
                tf.Session(config=tf_config) for _ in config.checkpoints
            ]
            for sess_, checkpoint in zip(sessions, config.checkpoints):
                model.initialize(sess=sess_,
                                 checkpoints=[checkpoint],
                                 **params)
            average_checkpoints(sess, sessions)
        elif (not config.checkpoints and decoding_mode and
              (os.path.isfile(best_checkpoint + '.index')
               or os.path.isfile(best_checkpoint + '.index'))):
            # NOTE(review): both operands of this `or` test the same
            # '.index' file — one was likely meant to check another suffix
            # (e.g. an old-style checkpoint file); confirm intent.
            # in decoding and evaluation mode, unless specified otherwise
            # (by `checkpoints`), try to load the best checkpoint
            global global_sess
            global_sess = sess
            model.initialize(config.checkpoints, **params)
        else:
            # loads last checkpoint, unless `reset` is true
            model.initialize(sess=sess, **config)
        if args.save:
            model.save()
        elif args.decode is not None:
            global global_config, global_model
            global_config = config
            global_model = model
            utils.log('starting decoding')
            # model.decode(code_string, **config)
            # serve decoding over HTTP instead of a one-shot decode
            app.run(host='0.0.0.0')
        elif args.eval is not None:
            model.evaluate(on_dev=False, **config)
        elif args.align is not None:
            model.align(**config)
        elif args.train:
            try:
                model.train(**config)
            except KeyboardInterrupt:
                sys.exit()