Example #1
def data_to_token_ids(data_path, target_path, vocab_path, reuse=False):
    if reuse and os.path.exists(target_path):
        return
    deleteFiles([target_path])
    vocab, _ = initialize_vocabulary(vocab_path)
    with gfile.GFile(data_path, mode="r") as data_file:
        with gfile.GFile(target_path, mode="w") as tokens_file:
            for line in data_file:
                token_ids = sentence_to_token_ids(line, vocab)
                tokens_file.write(" ".join([str(tok)
                                            for tok in token_ids]) + "\n")
Example #2
def eval_test():
    tf.reset_default_graph()
    test_out = os.path.join(FLAGS.data_dir, 'test_errors.out')
    deleteFiles([test_out])
    stats = {'R2W': 0, 'W2R': 0, 'W2W_C': 0, 'W2W_NC': 0}
    # change the reuse parameter if you want to build the data again
    _, _, _, _, en_test, fr_test, _, _ = data_utils.prepare_data(
        FLAGS.data_dir, reuse=FLAGS.reuse)
    with tf.Session(config=config_all) as sess:
        model = create_model(sess, True)
        test_set = read_data(en_test, fr_test)
        test_bucket_sizes = [len(test_set[b]) for b in range(len(_buckets))]
        print('Bucket Sizes : {}'.format(test_bucket_sizes))
        total_loss, num_batches = 0, 0

        for bucket_id in range(len(_buckets)):
            # Chunk the bucket into batches of at most FLAGS.batch_size;
            # zip_longest (izip_longest on Python 2) pads the final chunk with
            # None, which is filtered out here.
            all_batches = ([u for u in k if u is not None]
                           for k in itertools.zip_longest(*[
                               test_set[bucket_id][i::FLAGS.batch_size]
                               for i in range(FLAGS.batch_size)
                           ]))
            for batch in all_batches:
                encoder_inputs, decoder_inputs, target_weights = model.prepare_batch(
                    batch, bucket_id)
                # setting the model batch size in case it is smaller (would be for the
                # last batch in the bucket)
                model.batch_size = len(batch)
                _, eval_loss, logits = model.step(sess, encoder_inputs,
                                                  decoder_inputs,
                                                  target_weights, bucket_id,
                                                  True)
                outputs = np.argmax(logits, axis=2).transpose()
                outseq = [
                    out[:list(out).index(data_utils.EOS_ID)] for out in outputs
                    if data_utils.EOS_ID in out
                ]
                stat_updates = update_error_counts(batch, outseq)
                stats = {k: stats[k] + v for k, v in stat_updates.items()}
                total_loss += math.exp(eval_loss)  # exp(cross-entropy) = per-batch perplexity
                num_batches += 1
                # resetting the model batch size
                model.batch_size = FLAGS.batch_size
        print("Loss over the test set : {}".format(total_loss / num_batches))
        print(stats)
        precision = stats['W2R'] / sum(
            [stats['W2R'], stats['R2W'], stats['W2W_C']])
        recall = stats['W2R'] / sum(
            [stats['W2R'], stats['W2W_NC'], stats['W2W_C']])
        f_m = (2 * precision * recall) / (precision + recall)
        print('P: {}\nR: {}\nF: {}'.format(precision, recall, f_m))
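The precision/recall computation above hinges on the four counters returned by update_error_counts, whose exact semantics are not shown in this snippet. The sketch below spells out one plausible reading (an assumption, not confirmed by the source) and reproduces the same formulas on toy counts.

# Assumed (not confirmed) meaning of the counters:
#   W2R    - wrong token normalized to the right form        (true positive)
#   R2W    - right token incorrectly changed                  (false positive)
#   W2W_C  - wrong token changed, but to another wrong form   (hurts both P and R)
#   W2W_NC - wrong token left untouched                       (false negative)
stats = {'R2W': 2, 'W2R': 30, 'W2W_C': 3, 'W2W_NC': 5}  # toy counts

precision = stats['W2R'] / (stats['W2R'] + stats['R2W'] + stats['W2W_C'])    # 30 / 35
recall = stats['W2R'] / (stats['W2R'] + stats['W2W_NC'] + stats['W2W_C'])    # 30 / 38
f_measure = 2 * precision * recall / (precision + recall)
print('P: {:.3f}  R: {:.3f}  F: {:.3f}'.format(precision, recall, f_measure))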
Example #3
def create_vocabulary(vocab_path, data_path, reuse=False):
    if reuse and os.path.exists(vocab_path):
        return
    deleteFiles([vocab_path])
    vocab = {}
    with open(data_path) as f:
        for line in f:
            for token in line.split():
                if token in vocab:
                    vocab[token] += 1
                else:
                    vocab[token] = 1
    vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
    writeToFile(vocab_path, '\n'.join(vocab_list))
Example #4
    def build_vocabulary(self):
        if self.reuse and os.path.exists(self.vocab_path):
            with open(self.vocab_path) as ifi:
                return len(ifi.readlines())
        deleteFiles([self.vocab_path])
        vocab = {}
        vocab_files = [
            self.train_path + '.inp', self.train_path + '.out',
            self.dev_path + '.inp', self.dev_path + '.out'
        ]
        for data_path in [fl for fl in vocab_files if os.path.exists(fl)]:
            with open(data_path) as f:
                for line in f:
                    for token in line.split():
                        vocab[token] = vocab.get(token, 0) + 1
        vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
        writeToFile(self.vocab_path, '\n'.join(vocab_list))
        return len(vocab_list)
Example #5
    # `self` and `vocab` are referenced below but are not parameters of this
    # helper; they are assumed to come from the enclosing scope in the original
    # code (e.g. a surrounding method that has already loaded the vocabulary).
    def generate_binary_data(target_path, iterable):
        if self.reuse and os.path.exists(target_path):
            return
        deleteFiles([target_path])
        writer = tf.python_io.TFRecordWriter(target_path)
        for in_seq, out_seq in iterable:
            in_tokens = self.sentence_to_token_ids(in_seq, vocab)
            out_tokens = self.sentence_to_token_ids(out_seq, vocab)
            # Targets are the output tokens shifted left by one step, padded
            # with a trailing 0.
            targets = out_tokens[1:] + [0]
            example = tf.train.SequenceExample(
                feature_lists=tf.train.FeatureLists(
                    feature_list={
                        'inp_seq': format_sequence(in_tokens),
                        'out_seq': format_sequence(out_tokens),
                        'targets': format_sequence(targets),
                    }))
            writer.write(example.SerializeToString())
        writer.close()
        print('Done writing {}'.format(target_path))
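format_sequence is not defined in this snippet. A minimal sketch of what such a helper could look like, assuming each sequence is a list of integer token ids and using only the standard tf.train protobuf wrappers, is:

import tensorflow as tf

def format_sequence(token_ids):
    # Wrap a list of integer token ids as a tf.train.FeatureList, one
    # int64 Feature per timestep, suitable for a SequenceExample.
    return tf.train.FeatureList(feature=[
        tf.train.Feature(int64_list=tf.train.Int64List(value=[tok]))
        for tok in token_ids
    ])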
Example #6
import os
import json
from utilities import writeToFile, deleteFiles

outf = './data/diff_errors.txt'
deleteFiles([outf])


def compare_errors(o_errors, n_errors):
    o_indices = [e['pos'] for e in o_errors]
    n_indices = [e['pos'] for e in n_errors]
    for o_idx in o_indices:
        if o_idx in n_indices:
            o_err = o_errors[o_indices.index(o_idx)]
            n_err = n_errors[n_indices.index(o_idx)]
            diff = {
                'token': o_err['token'],
                'norm': o_err['norm'],
                'pos': o_err['pos']
            }
            flag = False
            if o_err['out'] != n_err['out']:
                diff['out_old'] = o_err['out']
                diff['out_new'] = n_err['out']
                flag = True
            if o_err['class'] != n_err['class']:
                diff['class_old'] = o_err['class']
                diff['class_new'] = n_err['class']
                flag = True
            if flag:
                yield diff
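compare_errors is a generator that yields one diff per error position shared by both reports. A hedged sketch of how it might be driven (the two input filenames are hypothetical), which would also put the json and writeToFile imports and outf to use:

# Hypothetical driver: load two error reports and append each difference to outf.
with open('./data/old_errors.json') as ofi:
    old_errors = json.load(ofi)
with open('./data/new_errors.json') as nfi:
    new_errors = json.load(nfi)

for diff in compare_errors(old_errors, new_errors):
    writeToFile(outf, json.dumps(diff) + '\n')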
Example #7
def prepare_train_files(data_dir, train_portion=0.9, reuse=False):

    en_train = os.path.join(data_dir, 'train.en')
    fr_train = os.path.join(data_dir, 'train.fr')
    en_dev = os.path.join(data_dir, 'dev.en')
    fr_dev = os.path.join(data_dir, 'dev.fr')
    en_test = os.path.join(data_dir, 'test.en')
    fr_test = os.path.join(data_dir, 'test.fr')

    exist = [
        os.path.exists(f)
        for f in [en_train, fr_train, en_dev, fr_dev, en_test, fr_test]
    ]
    if reuse and all(exist):
        return (os.path.join(data_dir, 'train'), os.path.join(data_dir, 'dev'),
                os.path.join(data_dir, 'test'))

    samples = get_training_data(data_dir)

    deleteFiles([en_train, fr_train, en_dev, fr_dev, en_test, fr_test])

    ngram = 3
    num_samples = len(samples)
    num_train_samples = int(numpy.round(num_samples * train_portion))
    sep = ' {} '.format(_SEP)
    # Training Set
    for sample in samples[:num_train_samples]:
        for in_win, out in zip(context_window(sample['input'], ngram),
                               sample['output']):
            inp, out = in_win[ngram // 2].lower(), out.lower()
            if not valid_token(inp):
                continue
            # if inp in aspell:
            #     continue

            writeToFile(en_train, sep.join(convert_format(in_win)) + '\n')
            writeToFile(fr_train, ' '.join(list(out.replace(' ', '_'))) + '\n')

    # Dev Set
    for sample in samples[num_train_samples:]:
        for in_win, out in zip(context_window(sample['input'], ngram),
                               sample['output']):
            inp, out = in_win[ngram // 2].lower(), out.lower()
            if not valid_token(inp):
                continue
            # if inp in aspell:
            #     continue

            writeToFile(en_dev, sep.join(convert_format(in_win)) + '\n')
            writeToFile(fr_dev, ' '.join(list(out.replace(' ', '_'))) + '\n')

    # Test Set
    with open(os.path.join(data_dir, 'test_truth.json')) as ifi:
        samples = json.load(ifi)

    for sample in samples:
        for in_win, out in zip(context_window(sample['input'], ngram),
                               sample['output']):
            inp, out = in_win[ngram // 2].lower(), out.lower()
            if not valid_token(inp):
                continue
            if inp in aspell:
                continue

            writeToFile(en_test, sep.join(convert_format(in_win)) + '\n')
            writeToFile(fr_test, ' '.join(list(out.replace(' ', '_'))) + '\n')

    return (os.path.join(data_dir, 'train'), os.path.join(data_dir, 'dev'),
            os.path.join(data_dir, 'test'))
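context_window, convert_format, valid_token and aspell are defined elsewhere in the project. As a rough illustration only, context_window behaves like a padded sliding window in which in_win[ngram // 2] is the token currently being normalized; a hypothetical sketch:

def context_window(tokens, ngram=3, pad='_PAD'):
    # Hypothetical sketch: yield a window of `ngram` tokens centred on each
    # position, padding at the edges so in_win[ngram // 2] is always the
    # current token. The real helper may differ.
    half = ngram // 2
    padded = [pad] * half + list(tokens) + [pad] * half
    for i in range(len(tokens)):
        yield padded[i:i + ngram]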
Example #8
    def prepare_samples(self):
        sep = ' {} '.format(_SEP)
        random.seed(self.seed)
        train_samples = self.load_samples(self.train_files) \
            if self.train_files else []
        test_samples = self.load_samples(self.test_files) \
            if self.test_files else []
        num_samples = len(train_samples)
        shuffled_indices = list(range(num_samples))
        random.shuffle(shuffled_indices)

        num_trn_dev_samples = int(np.round(num_samples * self.train_ratio))
        num_dev_samples = int(np.round(num_trn_dev_samples * self.dev_ratio))
        num_train_samples = num_trn_dev_samples - num_dev_samples

        # Filenames
        inp_train = self.train_path + '.inp'
        out_train = self.train_path + '.out'
        inp_dev = self.dev_path + '.inp'
        out_dev = self.dev_path + '.out'
        inp_test = self.test_path + '.inp'
        out_test = self.test_path + '.out'

        # Training Set
        if self.reuse and all(
            [os.path.exists(f) for f in [inp_train, out_train]]):
            pass
        else:
            deleteFiles([inp_train, out_train])
            for idx in shuffled_indices[:num_train_samples]:
                for in_win, out in zip(
                        self.context_window(train_samples[idx]['input'],
                                            self.ngram),
                        train_samples[idx]['output']):
                    inp = in_win[self.ngram // 2].lower()
                    out = out.lower().replace(' ', '_')
                    # If the output is blank, we ignore the sample since we do
                    # not want to consider such cases for training. For example,
                    # if the input was ['b', 'cuz'] the output will be
                    # ['because', ''].  For now, the model is not being trained
                    # to recognize that.
                    if not self.valid_token(inp) or len(out) == 0:
                        continue
                    out = [_GO] + list(out) + [_EOS]
                    if self.filter_vocab_train and inp in self.aspell:
                        continue
                    writeToFile(inp_train,
                                sep.join(self.convert_format(in_win)) + '\n')
                    # The output sequence might contain space implying that the
                    # normalized output was a set of two words. This space is
                    # represented as an underscore (`_`) because space is used
                    # as a delimiter in the input format.  Later while decoding,
                    # the output of the model should be processed to replace
                    # underscores with spaces.
                    writeToFile(out_train, ' '.join(out) + '\n')

        # Dev Set
        if self.reuse and all([os.path.exists(f) for f in [inp_dev, out_dev]]):
            pass
        else:
            deleteFiles([inp_dev, out_dev])
            for idx in shuffled_indices[num_train_samples:num_trn_dev_samples]:
                for in_win, out in zip(
                        self.context_window(train_samples[idx]['input'],
                                            self.ngram),
                        train_samples[idx]['output']):
                    inp = in_win[self.ngram // 2].lower()
                    out = out.lower().replace(' ', '_')
                    if not self.valid_token(inp) or len(out) == 0:
                        continue
                    out = [_GO] + list(out) + [_EOS]
                    if self.filter_vocab_train and inp in self.aspell:
                        continue
                    writeToFile(inp_dev,
                                sep.join(self.convert_format(in_win)) + '\n')
                    writeToFile(out_dev, ' '.join(out) + '\n')

        # Test Set
        if self.reuse and all(
            [os.path.exists(f) for f in [inp_test, out_test]]):
            pass
        else:
            deleteFiles([inp_test, out_test])
            test_samples.extend(train_samples[num_trn_dev_samples:])
            for sample in test_samples:
                for in_win, out in zip(
                        self.context_window(sample['input'], self.ngram),
                        sample['output']):
                    inp = in_win[self.ngram // 2].lower()
                    out = out.lower().replace(' ', '_')
                    if not self.valid_token(inp) or len(out) == 0:
                        continue
                    out = [_GO] + list(out) + [_EOS]
                    if self.filter_vocab_test and inp in self.aspell:
                        continue
                    writeToFile(inp_test,
                                sep.join(self.convert_format(in_win)) + '\n')
                    writeToFile(out_test, ' '.join(out) + '\n')
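As the comments in prepare_samples note, the output side is a character sequence framed by _GO/_EOS in which a space inside a two-word normalization is written as an underscore. A hedged sketch of the corresponding decode-side post-processing (the function name and marker strings are illustrative, not part of the source):

def postprocess_output(char_tokens, markers=('_GO', '_EOS')):
    # Drop the GO/EOS markers, join the predicted characters back into a
    # string, and turn the underscore placeholder back into a space.
    chars = [c for c in char_tokens if c not in markers]
    return ''.join(chars).replace('_', ' ')

# e.g. ['_GO', 'k', 'i', 'n', 'd', '_', 'o', 'f', '_EOS'] -> 'kind of'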