Example No. 1
0
def generate_spacy_factor_corpus(text_file,
                                 output_dir,
                                 lang_code,
                                 prefix,
                                 factor_separator=u'|'):
    """
    Tokenize a text file with a Spacy model and write two parallel outputs:
    `<prefix>.<lang_code>.tok` (one tokenized sentence per line) and
    `<prefix>.<lang_code>.factors` (the per-token factor tuples, each tuple
    joined by `factor_separator`, tokens separated by spaces).

    Params:
      text_file: path to the utf8 input corpus, one segment per line
      output_dir: directory for the two output files (created if missing)
      lang_code: Spacy model/language code, also embedded in the filenames
      prefix: filename prefix for both outputs
      factor_separator: string placed between the factors of one token
    """
    mkdir_p(output_dir)
    text_path = os.path.join(
        output_dir, prefix + '.{}.'.format(lang_code) + 'tok')
    factor_path = os.path.join(
        output_dir, prefix + '.{}.'.format(lang_code) + 'factors')

    nlp = spacy.load(lang_code)
    logger.info('Loaded Spacy {} model'.format(lang_code))

    # Context managers guarantee both output handles are closed even if
    # extract_factors raises mid-corpus (the original leaked them on error).
    with codecs.open(text_path, 'w', encoding='utf8') as text_output, \
            codecs.open(factor_path, 'w', encoding='utf8') as factor_output:
        with codecs.open(text_file, encoding='utf8') as inp:
            for count, line in enumerate(inp):
                row = extract_factors(line, nlp)
                # first element of each tuple is the surface token,
                # the rest are its factors
                text, factors = zip(*[(factor_tup[0], factor_tup[1:])
                                      for factor_tup in row])
                text_output.write(u' '.join(text) + '\n')
                factor_output.write(
                    u' '.join([factor_separator.join(f) for f in factors]) + '\n')
                if (count + 1) % 1000 == 0:
                    logger.info('Processed {} rows'.format(count + 1))

        logger.info('Wrote new files: {} and {}'.format(text_output.name,
                                                        factor_output.name))
Example No. 2
0
def extract_ter_alignment(hyps_file, refs_file, output_path, src_lang, trg_lang, tercom_path):
    """
    Run the TERCOM jar over a hypothesis/reference pair and write word-level
    OK/BAD tags (derived from TER edit operations) to
    `<output_path>/<src>-<trg>.tercom.out.tags`.

    Params:
      hyps_file / refs_file: utf8 text files, one segment per line
      output_path: directory for the .tags output (created if missing)
      src_lang / trg_lang: language codes used in the output prefix
      tercom_path: directory containing tercom.7.25.jar
    """
    # stdlib replacement for cgi.escape (deprecated since 3.2, removed in 3.13)
    import html

    tercom_jar = os.path.join(tercom_path, 'tercom.7.25.jar')

    mkdir_p(output_path)
    # NOTE(review): single-arg os.path.join is a no-op, so tercom writes its
    # output relative to the cwd, not output_path; the xml parse below relies
    # on that same cwd-relative prefix -- confirm this is intended.
    output_prefix = os.path.join('{}-{}.tercom.out'.format(src_lang, trg_lang))

    # WORKING: we need to put hyps and refs files in a special format
    hyps_file_iter = codecs.open(hyps_file, encoding='utf8')
    refs_file_iter = codecs.open(refs_file, encoding='utf8')
    hyp_ref_iter = parallel_iterator(hyps_file_iter, refs_file_iter)

    temp_hyps_file = hyps_file + '.ter.temp'
    temp_refs_file = refs_file + '.ter.temp'
    with codecs.open(temp_hyps_file, 'w', encoding='utf8') as f_hyp:
        with codecs.open(temp_refs_file, 'w', encoding='utf8') as f_ref:
            for i, (hyp, ref) in enumerate(hyp_ref_iter):
                # Escape &, <, > for tercom's XML output; quote=False matches
                # the old cgi.escape default (double quotes left untouched).
                f_hyp.write('%s\t(%.12d)\n' % (u' '.join([html.escape(w, quote=False) for w in hyp]), i))
                f_ref.write('%s\t(%.12d)\n' % (u' '.join([html.escape(w, quote=False) for w in ref]), i))

    # Run TERCOM. An argv list avoids shell-quoting issues with odd paths,
    # and check_call surfaces a non-zero exit instead of silently continuing
    # to a parse failure on a missing xml file.
    cmd = ['java', '-jar', tercom_jar,
           '-r', temp_refs_file,
           '-h', temp_hyps_file,
           '-n', output_prefix,
           '-d', '0']
    subprocess.check_call(cmd, stderr=sys.stderr, stdout=sys.stdout)

    os.remove(temp_hyps_file)
    os.remove(temp_refs_file)

    # Parse TERCOM output xml
    mt_tokens, pe_tokens, edits, hters = \
        parse_pra_xml.parse_file('{}.xml'.format(output_prefix))

    # C(orrect) -> OK; S(ubstitution)/I(nsertion)/D(eletion) -> BAD
    tags_map = {'C': 'OK', 'S': 'BAD', 'I': 'BAD', 'D': 'BAD'}
    tags = [parse_pra_xml.get_tags(edit, tags_map, keep_inserts=False) for edit in edits]

    tags_output_file = os.path.join(output_path, output_prefix + '.tags')
    with codecs.open(tags_output_file, 'w', encoding='utf8') as out:
        for row in tags:
            out.write(u' '.join(row) + u'\n')
    logger.info('Wrote tags to: {}'.format(tags_output_file))
Example No. 3
0
# Vocabulary indices from the pretrained AmuNMT APE system
# (src/mt/pe vocabularies are JSON token->id maps)
VOCAB_DIR = "/media/1tb_drive/nematus_ape_experiments/amunmt_ape_pretrained/system/models"
SRC_VOCAB = os.path.join(VOCAB_DIR, 'src-pe/vocab.src.json')
MT_VOCAB = os.path.join(VOCAB_DIR, 'mt-pe/vocab.mt.json')
PE_VOCAB = os.path.join(VOCAB_DIR, 'mt-pe/vocab.pe.json')

# Training data -- path suggests 500K artificial APE triples plus 20x
# oversampled task-internal data (NOTE(review): inferred from dirname)
TRAIN_DATA_DIR = "/media/1tb_drive/Dropbox/data/qe/amunmt_artificial_ape_2016/data/500K_and_20x_task_internal"
SRC_TRAIN = os.path.join(TRAIN_DATA_DIR, 'train.mt.factor_corpus')
TRG_TRAIN = os.path.join(TRAIN_DATA_DIR, 'train.pe.prepped')

# WMT 16 EN-DE QE/APE DEV Data
QE_DATA_DIR = "/media/1tb_drive/Dropbox/data/qe/ape/concat_wmt_2016_2017"
SRC_DEV = os.path.join(QE_DATA_DIR, 'dev.mt.factor_corpus')
TRG_DEV = os.path.join(QE_DATA_DIR, 'dev.pe.prepped')

# checkpoints are written to ./model relative to the current working directory
mkdir_p('model')

# start training from best model from previous experiment
STARTING_MODEL = '/media/1tb_drive/nematus_ape_experiments/ape_qe/en-de/model/model.npz.npz.best_bleu'

if __name__ == '__main__':
    validerr = train(saveto=os.path.join('model/model.npz'),
                     prior_model=STARTING_MODEL,
                     reload_=True,
                     dim_word=256,
                     dim=512,
                     n_words=PE_VOCAB_SIZE,
                     n_words_src=SRC_VOCAB_SIZE,
                     decay_c=0.,
                     clip_c=1.,
                     lrate=0.0001,
        tokens = freq_dict.keys()
        freqs = freq_dict.values()

        sorted_idx = numpy.argsort(freqs)
        sorted_words = [tokens[ii] for ii in sorted_idx[::-1]]

        token_dicts[i]['eos'] = 0
        token_dicts[i]['UNK'] = 1
        for ii, ww in enumerate(sorted_words):
            token_dicts[i][ww] = ii+2

    return token_dicts


if __name__ == '__main__':
    # Build one JSON vocabulary index per factor from a factor-annotated corpus.
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", help="Input file with factor tuples separated by `factor_separator`")
    parser.add_argument("-o", "--output", help="Directory where output files will be written")
    parser.add_argument("-n", "--num_factors", type=int, help="the number of factors")

    args = parser.parse_args()

    factor_iterator = factor_iter(open(args.input, 'r'), args.num_factors)
    factor_dicts = vocab_dictionaries_from_factor_iterator(factor_iterator, num_factors=args.num_factors)
    mkdir_p(args.output)

    # Output files are named factor_1.json ... factor_<num_factors>.json
    for idx, filename in enumerate(['factor_{}'.format(i+1) for i in range(args.num_factors)]):
        output_file = '%s.json' % os.path.join(args.output, filename)
        # Text mode with explicit encoding: json.dump emits unicode text, so
        # the original binary-mode ('wb') handle fails under Python 3; this
        # also matches the codecs.open convention used elsewhere in the file.
        with codecs.open(output_file, 'w', encoding='utf8') as f:
            json.dump(factor_dicts[idx], f, indent=2, ensure_ascii=False)
        # log the real path (the old message omitted the directory and suffix)
        logger.info('Wrote index to: {}'.format(output_file))
Example No. 5
0
    def train(self,
              train_iter_func,
              dev_iter_func,
              restore_from=None,
              auto_log_suffix=True,
              start_iteration=0,
              shuffle=True):
        """
        Training with dev checks for QE sequence models

        Params:
          training_iter_func: function which returns iterable over (source, mt, labels) instances
          dev_iter_func: function which returns iterable over (source, mt, labels) instances
          restore_from: optional checkpoint path; when given, the session is
            restored from it instead of being freshly initialized
          auto_log_suffix: when True, summaries go to a timestamped
            subdirectory of <storage>/logs
          start_iteration: NOTE(review): accepted but never read in this body
          shuffle: when True, wrap the training iterator in a buffered shuffler

        """

        # Directory layout under self.storage:
        #   logs/                -- TF summaries and dev_<step>.out reports
        #   model/               -- saver checkpoints
        #   evaluation_reports/  -- one JSON evaluation log per validation
        logdir = os.path.join(self.storage, 'logs')
        persist_dir = os.path.join(self.storage, 'model')
        mkdir_p(persist_dir)
        evaluation_logdir = os.path.join(self.storage, 'evaluation_reports')
        mkdir_p(evaluation_logdir)

        # cycle() restarts the training data forever so the fixed-length SGD
        # loop below can always draw another batch
        training_iter = train_iter_func()
        training_iter = itertools.cycle(training_iter)

        # wrap the data iter to add functionality
        if shuffle:
            shuffle_factor = self.config.get('shuffle_factor', 5000)
            training_iter = shuffle_instances_iterator(
                training_iter, shuffle_factor=shuffle_factor)

        # load pretrained source word embeddings
        source_embeddings = None
        if self.config.get('source_embeddings') is not None:
            # NOTE(review): the file handle passed to np.load is never closed
            source_embeddings = np.load(open(self.config['source_embeddings']))
        # TODO: support pretrained target and output vocabulary embeddings

        if auto_log_suffix:
            log_suffix = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            output_logdir = os.path.join(logdir, log_suffix)
        else:
            output_logdir = logdir

        # step -> dev f1_product; used below to decide when to persist the
        # best-so-far model
        dev_perfs = OrderedDict()
        mkdir_p(output_logdir)
        train_writer = tf.summary.FileWriter(output_logdir, self.graph)

        logger.info('Running session, logdir is: {}'.format(output_logdir))
        # NOTE(review): `config` here is not self.config -- presumably a
        # module-level tf session config object; confirm where it is defined
        with tf.Session(graph=self.graph, config=config) as session:

            # Initialization ops
            if restore_from is None:
                tf.initialize_all_variables().run()
                logger.info('Initialized')

                # Pretrained Word Embeddings
                if source_embeddings is not None:
                    session.run(
                        tf.assign(self.word_embeddings, source_embeddings))
                    logger.info(
                        'Source word embeddings loaded from: {}'.format(
                            self.config['source_embeddings']))
            else:
                self.saver.restore(session, restore_from)
                logger.info(
                    'restored trained model from: {}'.format(restore_from))

            # running sum of batch losses (accumulated but never reset/logged
            # as an average in this body)
            average_loss = 0

            val_freq = self.config['validation_freq']

            # SGD loop
            for step in range(self.config['num_steps']):

                if step % 10 == 0:
                    logger.info('running step: {}'.format(step))

                data_cols = self.get_batch(
                    training_iter,
                    self.config['batch_size'],
                    sample_prob=self.config['sample_prob'])

                source, source_mask, target, target_mask, output, output_mask = data_cols

                feed_dict = {
                    self.source: source,
                    self.source_mask: source_mask,
                    self.target: target,
                    self.target_mask: target_mask,
                    self.output: output,
                    self.output_mask: output_mask,
                    self.dropout_prob: self.config['dropout_prob']
                }

                # one optimizer step; also fetches the scalar cost and the
                # merged summaries for TensorBoard
                # if step < self.config['training_transition_cutoff']:
                _, l, summary = session.run(
                    [
                        self.full_graph_optimizer,
                        self.cost,
                        #                             # self.accuracy,
                        self.merged
                    ],
                    feed_dict=feed_dict)
                # else:
                #     _, l, summary = session.run([self.entity_representation_optimizer,
                #                                            self.cost,
                #                                            # self.accuracy,
                #                                            self.merged], feed_dict=feed_dict)

                train_writer.add_summary(summary, step)

                average_loss += l

                # Validation
                if step % val_freq == 0:
                    logger.info('Running validation...')
                    logger.info('Training loss on last batch: {}'.format(l))

                    # fresh dev iterator each validation pass
                    dev_iter = dev_iter_func()
                    dev_batch_len = self.config['batch_size']

                    total_correct = 0
                    total_instances = 0
                    source_out = []
                    mt_out = []
                    output_out = []
                    pred_out = []
                    acc_out = []

                    dev_batch = 0
                    # drain the dev iterator; get_batch returns a short (or
                    # empty) batch once the iterator is exhausted
                    while dev_batch_len > 0:
                        data_cols = self.get_batch(dev_iter,
                                                   dev_batch_len,
                                                   sample_prob=1.0)

                        # this will be zero once the iterator has finished
                        dev_batch_len = len(data_cols[0])
                        if dev_batch_len == 0:
                            # loop condition is now false: effectively a break
                            continue

                        source, source_mask, target, target_mask, output, output_mask = data_cols

                        feed_dict = {
                            self.source: source,
                            self.source_mask: source_mask,
                            self.target: target,
                            self.target_mask: target_mask,
                            self.output: output,
                            self.output_mask: output_mask,
                            self.dropout_prob: 1.0
                        }

                        # forward pass only; argmax over the label dimension
                        preds = session.run(self.predictions,
                                            feed_dict=feed_dict)
                        preds = np.argmax(preds, axis=2)

                        for s, t, p, o, m in zip(source, target, preds, output,
                                                 output_mask):
                            dev_source = [self.src_vocab_idict[w] for w in s]

                            # mask is nonzero on real tokens, so its count is
                            # the unpadded sequence length
                            output_len = np.count_nonzero(m)
                            mt_actual = t[:output_len]
                            pred_actual = p[:output_len]
                            output_actual = o[:output_len]
                            dev_mt = [
                                self.trg_vocab_idict[w] for w in mt_actual
                            ]
                            dev_pred = [
                                self.output_vocab_idict[w] for w in pred_actual
                            ]
                            dev_output = [
                                self.output_vocab_idict[w]
                                for w in output_actual
                            ]

                            # token-level accuracy over the unpadded positions
                            num_correct = sum([
                                1 for p, a in zip(pred_actual, output_actual)
                                if p == a
                            ])
                            acc = num_correct / float(output_len)

                            source_out.append(dev_source)
                            mt_out.append(dev_mt)
                            pred_out.append(dev_pred)
                            output_out.append(dev_output)
                            acc_out.append(acc)

                            total_correct += num_correct
                            total_instances += output_len

                        dev_batch += 1

                    # human-readable per-instance report rows
                    dev_reports = []
                    for s, m, p, o, a in zip(source_out, mt_out, pred_out,
                                             output_out, acc_out):
                        dev_report = {
                            'source': u' '.join(s),
                            'mt': u' '.join(m),
                            'pred': u' '.join(p),
                            'output': u' '.join(o),
                            'acc': a
                        }
                        dev_reports.append(dev_report)

                    evaluation_report = qe_output_evaluation(
                        mt_out,
                        pred_out,
                        output_out,
                        expanded_tagset=self.config['expanded_output_tagset'])
                    logger.info(u'Evaluation report at step: {} -- {}'.format(
                        step, evaluation_report))

                    # per-step dump of the dev predictions (goes to the base
                    # logdir, not the timestamped output_logdir)
                    dev_report_file = os.path.join(logdir,
                                                   'dev_{}.out'.format(step))
                    with codecs.open(dev_report_file, 'w',
                                     encoding='utf8') as dev_out:
                        dev_out.write(json.dumps(dev_reports, indent=2))
                    logger.info('Wrote validation report to: {}'.format(
                        dev_report_file))

                    evaluation_logfile = 'f1-product-{}.step-{}.json'.format(
                        evaluation_report['f1_product'], step)
                    evaluation_logfile = os.path.join(evaluation_logdir,
                                                      evaluation_logfile)
                    with codecs.open(evaluation_logfile, 'w',
                                     encoding='utf8') as eval_out:
                        eval_out.write(json.dumps(evaluation_report, indent=2))
                    logger.info('Wrote evaluation log to: {}'.format(
                        evaluation_logfile))

                    # checkpoint whenever this step ties or beats every
                    # previous dev f1_product (step 0 is excluded)
                    dev_perf = evaluation_report['f1_product']
                    dev_perfs[step] = dev_perf

                    if dev_perf == max(
                            v for k, v in dev_perfs.items()) and step > 0:
                        save_path = self.saver.save(
                            session,
                            os.path.join(persist_dir, 'best_model.ckpt'))
                        logger.info(
                            "Step: {} -- {} is the best score so far, model saved in file: {}"
                            .format(step, dev_perf, save_path))
                    # unconditional periodic checkpoint every 10000 steps
                    if step > 0 and step % 10000 == 0:
                        save_path = self.saver.save(
                            session,
                            os.path.join(persist_dir,
                                         'model_{}.ckpt'.format(step)))
                        logger.info(
                            "Step: {} -- checkpoint model saved in file: {}".
                            format(step, save_path))

        # `step` deliberately read after the loop: the last executed step
        logger.info("Step: {} -- Finished Training".format(step))