Example #1
import codecs
import logging

def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    args = setup_args()
    logging.info(args)

    tm = TranslationModel(args.model)
    input_lines_symbols = codecs.open(args.input, 'r', 'utf-8')
    input_lines = codecs.open(args.input + '.nounk', 'r', 'utf-8')
    gold_lines = codecs.open(args.gold + '.nounk', 'r', 'utf-8')

    index = 0
    found = 0
    for input_line, input_line_symbols, gold_line in zip(input_lines, input_lines_symbols, gold_lines):
        unk_map = build_unk_map(input_line_symbols, input_line)
        # logging.info(unk_map)

        translations_with_scores = tm.translate(input_line_symbols, k=args.k)
        translations = [data[1] for data in translations_with_scores]

        translations_replaced = [replace_symbols(translation, unk_map) for translation in translations]
        match_index = find_match(gold_line, translations_replaced)
        logging.info('Index: %d Match: %d' % (index, match_index))

        if match_index != -1:
            found += 1

        index += 1

    recall_k = float(found) / index
    logging.info('Recall@%d: %f (%d/%d)' % (args.k, recall_k, found, index))
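Example #1 relies on three helpers that the excerpt does not include (build_unk_map, replace_symbols, find_match). The sketch below is hypothetical, inferred only from how they are called above: the map pairs placeholder tokens with the words they hide, and matching is exact string equality.

def build_unk_map(line_with_symbols, line):
    # Hypothetical: pair each placeholder token (e.g. 'UNK1') with the word it hides.
    return {sym: word
            for sym, word in zip(line_with_symbols.split(), line.split())
            if sym != word}

def replace_symbols(translation, unk_map):
    # Hypothetical: substitute the hidden words back into a translated line.
    return ' '.join(unk_map.get(token, token) for token in translation.split())

def find_match(gold_line, candidates):
    # Hypothetical: index of the first exact match against the gold line, else -1.
    gold = gold_line.strip()
    for i, candidate in enumerate(candidates):
        if candidate.strip() == gold:
            return i
    return -1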
Example #2
    def _create_tm_from_counts(self):
        tm = TranslationModel(harmony_part=self._harmony_part, melody_part=self._melody_part)
        for harmony_note in self._tm_counts:
            total_notes_harmonized = sum(self._tm_counts[harmony_note].values())
            # only keep harmony notes seen with more than two distinct melody notes
            if len(self._tm_counts[harmony_note]) > 2:
                for melody_note, count in self._tm_counts[harmony_note].items():
                    prob = count / float(total_notes_harmonized)
                    tm.add_to_model(melody_note, harmony_note, prob, tm._tm_phrases)
        return tm
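The method above assumes self._tm_counts is a nested count table. A purely illustrative shape:

# Hypothetical: harmony note -> {melody note: co-occurrence count}.
# 'C4' harmonizes five notes in total, so P('E4' | 'C4') = 3/5; 'G3' has only
# two distinct melody notes and is skipped by the `> 2` filter above.
_tm_counts = {
    'C4': {'E4': 3, 'G4': 1, 'C5': 1},
    'G3': {'B3': 2, 'D4': 2},
}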
Example #3
    def __init__(self, translation_model, production_model):
        self.translation_model = translation_model
        self.production_model = production_model
        self.filter_max = 16
        self.null_prior = 0.00007  #FIXME: problem with words being elided too much
        self.phi2_prior = 1.0

        #FIXME: Refactor this constructor
        if isinstance(translation_model, str) and isinstance(production_model, str):
            self.production_model = EnglishModel(production_model)
            tm = TranslationModel()
            tm.learn_from_text(translation_model, production_model)
            self.translation_model = tm
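Per the isinstance branch, the constructor accepts either prebuilt models or two raw text corpora to learn from. Both call patterns below are hypothetical illustrations:

translator = Translator(trx_model, english_model)  # prebuilt model objects
translator = Translator(fr_text, en_text)          # raw strings; the models are built internally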
Example #4
import time

def score_fr_en_europarl():
    print("\nFR->EN Europarl:")
    # max lines is 300 at the moment
    num_lines = 300
    num_chars = 50

    tstart = time.time()
    en_lines = get_europarl_en_lines()
    fr_lines = get_europarl_fr_lines()

    en_learn_set = []
    fr_learn_set = []
    en_eval_set = []
    fr_eval_set = []

    for index, pair in enumerate(list(zip(en_lines, fr_lines))[:num_lines]):
        pair0 = pair[0][:num_chars]
        pair1 = pair[1][:num_chars]
        if index % 4 == 0:
            en_eval_set.append(pair0)
            fr_eval_set.append(pair1)
        else:
            en_learn_set.append(pair0)
            fr_learn_set.append(pair1)

    fr_text = '\n'.join(fr_learn_set)
    en_text = '\n'.join(en_learn_set)

    trx_model = TranslationModel()
    english = EnglishModel(['austen-emma.txt'])
    trx_model.learn_from_text(fr_text, en_text)
    translator = Translator(trx_model, english)

    scorer = TranslationScore()
    n = 0
    score = 0
    for xfr, xen in zip(fr_eval_set, en_eval_set):
        trx_en = translator.translate(xfr)
        n += 1
        score += scorer.of(trx_en, xen)
    avg_score = float(score) / n
    print "lines: ", num_lines, ", chars: ", num_chars
    print "Translation score: {:0.2f}".format(avg_score) + " (of " + str(
        n) + " comparisons)"
    tend = time.time()
    print tdiff(tstart, tend) + " seconds elapsed."
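tdiff is not part of the excerpt; a minimal stand-in consistent with it being concatenated into the final print:

def tdiff(tstart, tend):
    # Hypothetical helper: elapsed wall-clock time, formatted as a string.
    return '{:0.2f}'.format(tend - tstart)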
Example #5
import codecs
import logging

def main():
    args = setup_args()
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    logging.info(args)

    src_lines = codecs.open(args.source, 'r', 'utf-8').readlines()
    src_lines_nounk = codecs.open(args.source + args.suffix, 'r', 'utf-8').readlines()
    gold_lines = codecs.open(args.gold + args.suffix, 'r', 'utf-8').readlines()

    fw = codecs.open(args.model + SVM_RANK_DATA, 'w', 'utf-8')

    tm = TranslationModel(args.model)
    num_all_zeros = 0

    train_id = 0
    for sentence_idx, (src_line, src_line_nounk, gold_line) in enumerate(zip(src_lines, src_lines_nounk, gold_lines)):
        translations = tm.translate(src_line, k=args.num)
        logging.info('Source_line: %s' % src_line_nounk)
        logging.info('Gold_line: %s' % gold_line)

        unk_map = build_unk_map(src_line, src_line_nounk)
        logging.info('UNK_map: %s' % str(unk_map))

        scores = []
        translations_nounk = []
        for idx, translation in enumerate(translations):
            translation_nounk = replace_symbols(translation[1], unk_map)
            translations_nounk.append(translation_nounk)
            bleu_nounk = get_bleu_score(gold_line, translation_nounk)
            scores.append(bleu_nounk)
            #logging.info('Tr:%d ::%s BLEU:%s'%(idx, translation_nounk, bleu_nounk))

        if sum(scores) == 0.0:
            num_all_zeros += 1
            continue

        scores_index = sorted(range(len(scores)), key=lambda k: scores[k], reverse=True)
        write_train_data(fw, sentence_idx, train_id, translations_nounk, scores, scores_index, src_line_nounk)
        train_id += 1

        for index in scores_index:
            logging.info('Tr: %d Text:%s Pr:%f BLEU:%f'%(index, translations[index][1],
                                                              translations[index][0], scores[index]))
    logging.info('Num all zeros: %d' % num_all_zeros)
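get_bleu_score is assumed but not shown. A plausible stand-in using NLTK's sentence-level BLEU (smoothing keeps short lines from scoring exactly zero); the real helper may differ:

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def get_bleu_score(gold_line, translation):
    # Hypothetical stand-in: BLEU of one candidate against one reference.
    smoother = SmoothingFunction().method1
    return sentence_bleu([gold_line.strip().split()],
                         translation.strip().split(),
                         smoothing_function=smoother)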
Example #6
import codecs
import logging

def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    args = setup_args()
    logging.info(args)

    tm = TranslationModel(args.model)
    fw_out = codecs.open(args.out, 'w', 'utf-8')

    line_num = 0
    for input_line in codecs.open(args.input, 'r', 'utf-8'):
        results = tm.translate(input_line.strip(), k=20)
        if args.all:
            index, best_bleu_score = find_best_translation(input_line, results)
        else:
            best_bleu_score = -1.0
            index = 0

        logging.info('Line:%d best_index:%d best_bleu:%f' % (line_num, index, best_bleu_score))
        fw_out.write(results[index][1] + '\n')
        line_num += 1
    fw_out.close()
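find_best_translation is not shown either. A hypothetical version that scores every candidate against the input line and keeps the best, reusing a get_bleu_score helper like the one sketched after Example #5:

def find_best_translation(input_line, results):
    # Hypothetical: results is a list of (score, text) pairs, as the
    # indexing results[index][1] above suggests.
    best_index, best_bleu = 0, -1.0
    for i, (_, text) in enumerate(results):
        bleu = get_bleu_score(input_line, text)
        if bleu > best_bleu:
            best_index, best_bleu = i, bleu
    return best_index, best_bleu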
Example #7
    def of(self, actual_text, expected_text):
        tm = TranslationModel()
        actual_parts = tm.preprocess(actual_text)
        expected_parts = tm.preprocess(expected_text)

        if len(expected_parts) == 0:
            return 1.0
        if len(actual_parts) == 0:
            return 0.0

        actual_in_expected = sum(
            1 for actual in actual_parts if actual in expected_parts)
        true_positive = actual_in_expected / float(len(actual_parts))

        expected_not_in_actual = sum(
            1 for expected in expected_parts if expected not in actual_parts)
        maxlen = max(len(expected_parts), len(actual_parts))
        true_negative = 1.0 - expected_not_in_actual / float(maxlen)

        return true_positive * true_negative
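A quick hypothetical use of the scorer above, assuming preprocess roughly tokenizes into words: identical texts score 1.0, disjoint texts 0.0, and partial overlap lands in between.

scorer = TranslationScore()
scorer.of('the cat sat', 'the cat sat')  # 1.0
scorer.of('dog runs', 'the cat sat')     # 0.0
scorer.of('the cat', 'the cat sat')      # roughly 0.67: full precision, one missed word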
Example #8
import codecs
import csv
import logging

def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    args = setup_args()
    logging.info(args)

    tm = TranslationModel(args.model)
    f = codecs.open('%s-%s.csv' % (args.out, args.suffix), 'w', 'utf-8')
    csv_f = csv.writer(f, delimiter=',')  # csv.writer takes no encoding argument; codecs.open handles it

    data = ['Src', 'Target', 'Gold Standard']
    csv_f.writerow(data)
    input_lines = codecs.open(args.input, 'r', 'utf-8').readlines()
    gold_lines = codecs.open(args.gold, 'r', 'utf-8').readlines()

    fw_sents = codecs.open('%s-%s-sents.out' % (args.out, args.suffix), 'w', 'utf-8')
    for input_line, gold_line in zip(input_lines, gold_lines):
        data = []
        data.append(input_line.strip())
        results = tm.translate(input_line.strip())
        data.append(results[0][1])
        data.append(gold_line.strip())
        csv_f.writerow(data)
        fw_sents.write(results[0][1] + '\n')
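Neither output file above is explicitly closed. A minimal variant of the same dump loop using context managers (file names are hypothetical; tm, input_lines, and gold_lines are as in the example), so both files are flushed deterministically:

import csv
import codecs

with codecs.open('pairs.csv', 'w', 'utf-8') as f, \
        codecs.open('sents.out', 'w', 'utf-8') as fw_sents:
    csv_f = csv.writer(f, delimiter=',')
    csv_f.writerow(['Src', 'Target', 'Gold Standard'])
    for input_line, gold_line in zip(input_lines, gold_lines):
        best = tm.translate(input_line.strip())[0][1]
        csv_f.writerow([input_line.strip(), best, gold_line.strip()])
        fw_sents.write(best + '\n')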
Example #11
def load_api_params(tf_config, graph, api_config="../config/api2nl.yaml"):
    params = {}
    # read config file and default config
    with open('../config/default.yaml') as f:
        default_config = utils.AttrDict(yaml.safe_load(f))

    with open(api_config) as f:
        api_config = utils.AttrDict(yaml.safe_load(f))
        # set default values for parameters that are not defined
        for k, v in default_config.items():
            api_config.setdefault(k, v)
    api_config.checkpoint_dir = os.path.join(api_config.model_dir,
                                             'checkpoints')

    #tasks = [api_config]

    #for task in tasks:
    #   for parameter, value in api_config.items():
    #       task.setdefault(parameter, value)

    api_config.encoders = [
        utils.AttrDict(encoder) for encoder in api_config.encoders
    ]
    api_config.decoders = [
        utils.AttrDict(decoder) for decoder in api_config.decoders
    ]

    for encoder_or_decoder in api_config.encoders + api_config.decoders:
        for parameter, value in api_config.items():
            encoder_or_decoder.setdefault(parameter, value)

    with tf.Session(config=tf_config, graph=graph) as sess:
        api_model = TranslationModel(**api_config)
        ckpt = tf.train.get_checkpoint_state(api_config.checkpoint_dir)
        saver = tf.train.Saver(tf.global_variables())
        print("Reading api model parameters from %s" %
              ckpt.model_checkpoint_path)
        saver.restore(sess, ckpt.model_checkpoint_path)
        for v in tf.trainable_variables():
            params[v.name] = sess.run(v.value())
        return params
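A hypothetical call site for load_api_params; Example #12 below invokes it the same way when transfer learning is enabled:

api_graph = tf.Graph()
tf_config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
api_params = load_api_params(tf_config=tf_config, graph=api_graph)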
Example #12
def main(args=None):
    args = parser.parse_args(args)

    # read config file and default config
    with open('../model/default.yaml') as f:
        default_config = utils.AttrDict(yaml.safe_load(f))

    with open(args.config) as f:
        config = utils.AttrDict(yaml.safe_load(f))

        if args.learning_rate is not None:
            args.reset_learning_rate = True

        # command-line parameters have higher precedence than config file
        for k, v in vars(args).items():
            if v is not None:
                config[k] = v

        # set default values for parameters that are not defined
        for k, v in default_config.items():
            config.setdefault(k, v)


#    if config.score_function:
#        config.score_functions = evaluation.name_mapping[config.score_function]

    if args.crash_test:
        config.max_train_size = 0

    if not config.debug:
        # disable TensorFlow's debugging logs
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    decoding_mode = any(arg is not None
                        for arg in (args.decode, args.eval, args.align))

    # enforce parameter constraints
    assert config.steps_per_eval % config.steps_per_checkpoint == 0, (
        'steps-per-eval should be a multiple of steps-per-checkpoint')
    assert decoding_mode or args.train or args.save, (
        'you need to specify at least one action (decode, eval, align, or train)'
    )
    assert not (args.average and args.ensemble)

    if args.train and args.purge:
        utils.log('deleting previous model')
        shutil.rmtree(config.model_dir, ignore_errors=True)

    os.makedirs(config.model_dir, exist_ok=True)

    # copy config file to model directory
    config_path = os.path.join(config.model_dir, 'config.yaml')
    if args.train and not os.path.exists(config_path):
        with open(args.config) as config_file, open(config_path,
                                                    'w') as dest_file:
            content = config_file.read()
            content = re.sub(r'model_dir:.*?\n',
                             'model_dir: {}\n'.format(config.model_dir),
                             content,
                             flags=re.MULTILINE)
            dest_file.write(content)

    # also copy default config
    config_path = os.path.join(config.model_dir, 'default.yaml')
    if args.train and not os.path.exists(config_path):
        shutil.copy('../config/default.yaml', config_path)

    logging_level = logging.DEBUG if args.verbose else logging.INFO
    # always log to stdout in decoding and eval modes (to avoid overwriting precious train logs)
    log_path = os.path.join(config.model_dir, config.log_file)
    logger = utils.create_logger(log_path if args.train else None)
    logger.setLevel(logging_level)

    utils.log('label: {}'.format(config.label))
    utils.log('description:\n  {}'.format('\n  '.join(
        config.description.strip().split('\n'))))

    utils.log(' '.join(sys.argv))  # print command line
    try:  # print git hash
        commit_hash = subprocess.check_output(['git', 'rev-parse',
                                               'HEAD']).decode().strip()
        utils.log('commit hash {}'.format(commit_hash))
    except Exception:
        pass

    utils.log('tensorflow version: {}'.format(tf.__version__))

    # log parameters
    utils.debug('program arguments')
    for k, v in sorted(config.items(), key=itemgetter(0)):
        utils.debug('  {:<20} {}'.format(k, pformat(v)))

    if isinstance(config.dev_prefix, str):
        config.dev_prefix = [config.dev_prefix]

    config.encoders = [utils.AttrDict(encoder) for encoder in config.encoders]
    config.decoders = [utils.AttrDict(decoder) for decoder in config.decoders]

    for encoder_or_decoder in config.encoders + config.decoders:
        for parameter, value in config.items():
            encoder_or_decoder.setdefault(parameter, value)

    if args.max_output_len is not None:  # override decoder's max len
        config.decoders[0].max_len = args.max_output_len

    config.checkpoint_dir = os.path.join(config.model_dir, 'checkpoints')

    # setting random seeds
    if config.seed is None:
        config.seed = random.randrange(sys.maxsize)
    if config.tf_seed is None:
        config.tf_seed = random.randrange(sys.maxsize)
    utils.log('python random seed: {}'.format(config.seed))
    utils.log('tf random seed:     {}'.format(config.tf_seed))
    random.seed(config.seed)
    tf.set_random_seed(config.tf_seed)

    device = None
    if config.no_gpu:
        device = '/cpu:0'
        device_id = None
    elif config.gpu_id is not None:
        device = '/gpu:{}'.format(config.gpu_id)
        device_id = config.gpu_id
    else:
        device_id = 0

    # hide other GPUs so that TensorFlow won't use memory on them
    os.environ['CUDA_VISIBLE_DEVICES'] = '' if device_id is None else str(
        device_id)

    tf_config = tf.ConfigProto(log_device_placement=False,
                               allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = config.allow_growth
    tf_config.gpu_options.per_process_gpu_memory_fraction = config.mem_fraction

    config.api_params = None
    api_graph = tf.Graph()
    transfer_graph = tf.Graph()

    if config.use_transfer:
        # utils.log("loading api params")
        ckpt = tf.train.get_checkpoint_state(config.checkpoint_dir)

        if not ckpt or not ckpt.model_checkpoint_path:
            utils.log("loading api params")
            config.api_params = load_api_params(tf_config=tf_config,
                                                graph=api_graph)

    def average_checkpoints(main_sess, sessions):
        for var in tf.global_variables():
            avg_value = sum(sess.run(var) for sess in sessions) / len(sessions)
            main_sess.run(var.assign(avg_value))

    with tf.Session(config=tf_config, graph=transfer_graph) as sess:
        global global_tf_config, global_transfer_graph
        global_tf_config = tf_config
        global_transfer_graph = transfer_graph
        utils.log('creating model')
        utils.log('using device: {}'.format(device))
        with tf.device(device):
            if config.weight_scale:
                if config.initializer == 'uniform':
                    initializer = tf.random_uniform_initializer(
                        minval=-config.weight_scale,
                        maxval=config.weight_scale)
                else:
                    initializer = tf.random_normal_initializer(
                        stddev=config.weight_scale)
            else:
                initializer = None

            tf.get_variable_scope().set_initializer(initializer)

            # exempt from creating gradient ops
            config.decode_only = decoding_mode
            model = TranslationModel(**config)

        # count parameters
        # not counting parameters created by training algorithm (e.g. Adam)
        variables = [
            var for var in tf.global_variables()
            if not var.name.startswith('gradients')
        ]
        utils.log('model parameters ({})'.format(len(variables)))
        parameter_count = 0
        for var in sorted(variables, key=lambda var: var.name):
            utils.log('  {} {}'.format(var.name, var.get_shape()))
            v = 1
            for d in var.get_shape():
                v *= d.value
            parameter_count += v
        utils.log('number of parameters: {:.2f}M'.format(parameter_count /
                                                         1e6))
        best_checkpoint = os.path.join(config.checkpoint_dir, 'best')

        params = {
            'variable_mapping': config.variable_mapping,
            'reverse_mapping': config.reverse_mapping
        }
        if config.ensemble and len(config.checkpoints) > 1:
            model.initialize(config.checkpoints, **params)
        elif config.average and len(config.checkpoints) > 1:
            model.initialize(reset=True)
            sessions = [
                tf.Session(config=tf_config) for _ in config.checkpoints
            ]
            for sess_, checkpoint in zip(sessions, config.checkpoints):
                model.initialize(sess=sess_,
                                 checkpoints=[checkpoint],
                                 **params)
            average_checkpoints(sess, sessions)
        elif (not config.checkpoints and decoding_mode
              and os.path.isfile(best_checkpoint + '.index')):
            # in decoding and evaluation mode, unless specified otherwise (by `checkpoints`),
            # try to load the best checkpoint

            global global_sess
            global_sess = sess

            model.initialize(config.checkpoints, **params)
        else:
            # loads last checkpoint, unless `reset` is true
            model.initialize(sess=sess, **config)

        if args.save:
            model.save()
        elif args.decode is not None:
            global global_config, global_model
            global_config = config
            global_model = model
            utils.log('starting decoding')
            # model.decode(code_string, **config)
            app.run(host='0.0.0.0')
        elif args.eval is not None:
            model.evaluate(on_dev=False, **config)
        elif args.align is not None:
            model.align(**config)
        elif args.train:
            try:
                model.train(**config)
            except KeyboardInterrupt:
                sys.exit()
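Hypothetical command lines for this entry point; the flag names mirror the argparse attributes referenced above, but the actual parser definition is outside this excerpt:

#   python main.py my-config.yaml --train
#   python main.py my-config.yaml --decode test.src
#   python main.py my-config.yaml --eval test.src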