def data_to_token_ids(data_path, target_path, vocab_path, reuse=False):
    if reuse and os.path.exists(target_path):
        return
    deleteFiles([target_path])
    vocab, _ = initialize_vocabulary(vocab_path)
    with gfile.GFile(data_path, mode="r") as data_file:
        with gfile.GFile(target_path, mode="w") as tokens_file:
            for line in data_file:
                token_ids = sentence_to_token_ids(line, vocab)
                tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
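# A minimal sketch of the `sentence_to_token_ids` helper used above, assuming a
# plain whitespace tokenizer and an `UNK_ID` fallback for out-of-vocabulary
# tokens (both are assumptions, not taken from this file):
def sentence_to_token_ids(sentence, vocabulary):
    # Map each whitespace-separated token to its id; unknown tokens become UNK_ID.
    return [vocabulary.get(token, UNK_ID) for token in sentence.split()]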
def eval_test():
    tf.reset_default_graph()
    test_out = os.path.join(FLAGS.data_dir, 'test_errors.out')
    deleteFiles([test_out])
    stats = {'R2W': 0, 'W2R': 0, 'W2W_C': 0, 'W2W_NC': 0}
    # change the reuse parameter if you want to build the data again
    _, _, _, _, en_test, fr_test, _, _ = data_utils.prepare_data(
        FLAGS.data_dir, reuse=FLAGS.reuse)
    with tf.Session(config=config_all) as sess:
        model = create_model(sess, True)
        test_set = read_data(en_test, fr_test)
        test_bucket_sizes = [len(test_set[b]) for b in range(len(_buckets))]
        print('Bucket Sizes : {}'.format(test_bucket_sizes))
        total_loss, num_batches = 0, 0
        for bucket_id in range(len(_buckets)):
            # Split the bucket into batches of FLAGS.batch_size; izip_longest
            # pads the trailing batch with None, which is filtered out here.
            all_batches = ([u for u in k if u is not None]
                           for k in itertools.izip_longest(*[
                               test_set[bucket_id][i::FLAGS.batch_size]
                               for i in range(FLAGS.batch_size)
                           ]))
            for batch in all_batches:
                encoder_inputs, decoder_inputs, target_weights = model.prepare_batch(
                    batch, bucket_id)
                # setting the model batch size in case it is smaller (would be
                # for the last batch in the bucket)
                model.batch_size = len(batch)
                _, eval_loss, logits = model.step(sess, encoder_inputs,
                                                  decoder_inputs, target_weights,
                                                  bucket_id, True)
                outputs = np.argmax(logits, axis=2).transpose()
                # Cut each decoded sequence at the first EOS symbol.
                outseq = [
                    out[:list(out).index(data_utils.EOS_ID)] for out in outputs
                    if data_utils.EOS_ID in out
                ]
                stat_updates = update_error_counts(batch, outseq)
                stats = {k: stats[k] + v for k, v in stat_updates.items()}
                # exp(loss) is the per-batch perplexity.
                total_loss += math.exp(eval_loss)
                num_batches += 1
                # resetting the model batch size
                model.batch_size = FLAGS.batch_size
        print("Perplexity over the test set : {}".format(total_loss / num_batches))
        print(stats)
        precision = float(stats['W2R']) / sum(
            [stats['W2R'], stats['R2W'], stats['W2W_C']])
        recall = float(stats['W2R']) / sum(
            [stats['W2R'], stats['W2W_NC'], stats['W2W_C']])
        f_m = (2 * precision * recall) / (precision + recall)
        print('P: {}\nR: {}\nF: {}'.format(precision, recall, f_m))
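# A hedged sketch of the bookkeeping that the four counters above imply, based
# only on how precision and recall are computed from them. The per-item
# (source token, gold form) layout of `batch` and the `decode_ids` helper are
# assumptions for illustration, not this module's actual API.
def update_error_counts(batch, outseq):
    counts = {'R2W': 0, 'W2R': 0, 'W2W_C': 0, 'W2W_NC': 0}
    for (source_tok, gold), pred_ids in zip(batch, outseq):
        pred = decode_ids(pred_ids)      # assumed id -> string helper
        if source_tok == gold:           # token was already correct
            if pred != gold:
                counts['R2W'] += 1       # model corrupted a correct token
        elif pred == gold:
            counts['W2R'] += 1           # model fixed a genuine error
        elif pred == source_tok:
            counts['W2W_NC'] += 1        # error left unchanged
        else:
            counts['W2W_C'] += 1         # error changed, but still wrong
    return counts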
def create_vocabulary(vocab_path, data_path, reuse=False):
    if reuse and os.path.exists(vocab_path):
        return
    deleteFiles([vocab_path])
    vocab = {}
    with open(data_path) as f:
        for line in f:
            for token in line.split():
                if token in vocab:
                    vocab[token] += 1
                else:
                    vocab[token] = 1
    vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
    writeToFile(vocab_path, '\n'.join(vocab_list))
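# A minimal sketch of the matching `initialize_vocabulary` reader that
# `data_to_token_ids` calls, assuming the one-token-per-line file written above
# and the usual (token -> id dict, reverse list) return pair:
def initialize_vocabulary(vocab_path):
    with gfile.GFile(vocab_path, mode="r") as f:
        rev_vocab = [line.strip() for line in f]
    vocab = dict((token, idx) for idx, token in enumerate(rev_vocab))
    return vocab, rev_vocab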
def build_vocabulary(self):
    if self.reuse and os.path.exists(self.vocab_path):
        with open(self.vocab_path) as ifi:
            return len(ifi.readlines())
    deleteFiles([self.vocab_path])
    vocab = {}
    vocab_files = [
        self.train_path + '.inp', self.train_path + '.out',
        self.dev_path + '.inp', self.dev_path + '.out'
    ]
    for data_path in [fl for fl in vocab_files if os.path.exists(fl)]:
        with open(data_path) as f:
            for line in f:
                for token in line.split():
                    vocab[token] = vocab.get(token, 0) + 1
    vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
    writeToFile(self.vocab_path, '\n'.join(vocab_list))
    return len(vocab_list)
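# `_START_VOCAB` is not defined in these snippets. A common layout (an
# assumption here, consistent with the `_GO`/`_EOS` markers and the 0-padding
# used elsewhere in this code) reserves the first ids for special symbols so
# that padding is always id 0:
_PAD, _GO, _EOS, _UNK = '_PAD', '_GO', '_EOS', '_UNK'
_START_VOCAB = [_PAD, _GO, _EOS, _UNK]
PAD_ID, GO_ID, EOS_ID, UNK_ID = 0, 1, 2, 3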
def generate_binary_data(target_path, iterable):
    # Presumably a nested helper: `self` and `vocab` come from the enclosing scope.
    if self.reuse and os.path.exists(target_path):
        return
    deleteFiles([target_path])
    writer = tf.python_io.TFRecordWriter(target_path)
    for in_seq, out_seq in iterable:
        in_tokens = self.sentence_to_token_ids(in_seq, vocab)
        out_tokens = self.sentence_to_token_ids(out_seq, vocab)
        # Decoder targets are the output sequence shifted left by one step,
        # padded with 0 (PAD) at the end.
        targets = out_tokens[1:] + [0]
        example = tf.train.SequenceExample(
            feature_lists=tf.train.FeatureLists(
                feature_list={
                    'inp_seq': format_sequence(in_tokens),
                    'out_seq': format_sequence(out_tokens),
                    'targets': format_sequence(targets),
                }))
        writer.write(example.SerializeToString())
    writer.close()
    print('Done writing {}'.format(target_path))
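# A minimal sketch of the `format_sequence` helper assumed above: one int64
# Feature per token, wrapped in a FeatureList so that the SequenceExample
# stores a variable-length sequence.
def format_sequence(token_ids):
    return tf.train.FeatureList(feature=[
        tf.train.Feature(int64_list=tf.train.Int64List(value=[tok]))
        for tok in token_ids
    ])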
import os
import json

from utilities import writeToFile, deleteFiles

outf = './data/diff_errors.txt'
deleteFiles([outf])


def compare_errors(o_errors, n_errors):
    # Yield the errors (matched by position) whose model output or error class
    # changed between the old run and the new run.
    o_indices = [e['pos'] for e in o_errors]
    n_indices = [e['pos'] for e in n_errors]
    for o_idx in o_indices:
        if o_idx in n_indices:
            o_err = o_errors[o_indices.index(o_idx)]
            n_err = n_errors[n_indices.index(o_idx)]
            diff = {
                'token': o_err['token'],
                'norm': o_err['norm'],
                'pos': o_err['pos']
            }
            flag = False
            if o_err['out'] != n_err['out']:
                diff['out_old'] = o_err['out']
                diff['out_new'] = n_err['out']
                flag = True
            if o_err['class'] != n_err['class']:
                diff['class_old'] = o_err['class']
                diff['class_new'] = n_err['class']
                flag = True
            if flag:
                yield diff
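# A hedged usage sketch: the error-file names and their layout (a JSON list of
# per-sample error lists for each run) are assumptions, not taken from this
# module.
if __name__ == '__main__':
    with open('./data/old_errors.json') as f_old, \
         open('./data/new_errors.json') as f_new:
        old_runs, new_runs = json.load(f_old), json.load(f_new)
    for o_errors, n_errors in zip(old_runs, new_runs):
        for diff in compare_errors(o_errors, n_errors):
            writeToFile(outf, json.dumps(diff) + '\n')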
def prepare_train_files(data_dir, train_portion=0.9, reuse=False):
    en_train = os.path.join(data_dir, 'train.en')
    fr_train = os.path.join(data_dir, 'train.fr')
    en_dev = os.path.join(data_dir, 'dev.en')
    fr_dev = os.path.join(data_dir, 'dev.fr')
    en_test = os.path.join(data_dir, 'test.en')
    fr_test = os.path.join(data_dir, 'test.fr')
    exist = [
        os.path.exists(f)
        for f in [en_train, fr_train, en_dev, fr_dev, en_test, fr_test]
    ]
    if reuse and all(exist):
        return (os.path.join(data_dir, 'train'), os.path.join(data_dir, 'dev'),
                os.path.join(data_dir, 'test'))
    samples = get_training_data(data_dir)
    deleteFiles([en_train, fr_train, en_dev, fr_dev, en_test, fr_test])
    ngram = 3
    num_samples = len(samples)
    num_train_samples = int(numpy.round(num_samples * train_portion))
    sep = ' {} '.format(_SEP)

    # Training Set
    for sample in samples[:num_train_samples]:
        for in_win, out in zip(context_window(sample['input'], ngram),
                               sample['output']):
            inp, out = in_win[ngram // 2].lower(), out.lower()
            if not valid_token(inp):
                continue
            # if inp in aspell:
            #     continue
            writeToFile(en_train, sep.join(convert_format(in_win)) + '\n')
            writeToFile(fr_train, ' '.join(list(out.replace(' ', '_'))) + '\n')

    # Dev Set
    for sample in samples[num_train_samples:]:
        for in_win, out in zip(context_window(sample['input'], ngram),
                               sample['output']):
            inp, out = in_win[ngram // 2].lower(), out.lower()
            if not valid_token(inp):
                continue
            # if inp in aspell:
            #     continue
            writeToFile(en_dev, sep.join(convert_format(in_win)) + '\n')
            writeToFile(fr_dev, ' '.join(list(out.replace(' ', '_'))) + '\n')

    # Test Set
    with open(os.path.join(data_dir, 'test_truth.json')) as ifi:
        samples = json.load(ifi)
    for sample in samples:
        for in_win, out in zip(context_window(sample['input'], ngram),
                               sample['output']):
            inp, out = in_win[ngram // 2].lower(), out.lower()
            if not valid_token(inp):
                continue
            if inp in aspell:
                continue
            writeToFile(en_test, sep.join(convert_format(in_win)) + '\n')
            writeToFile(fr_test, ' '.join(list(out.replace(' ', '_'))) + '\n')

    return (os.path.join(data_dir, 'train'), os.path.join(data_dir, 'dev'),
            os.path.join(data_dir, 'test'))
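# A minimal sketch of the `context_window` helper assumed above: a sliding
# window of width `ngram` centred on each input token, so `in_win[ngram // 2]`
# is always the token being normalized. The boundary filler symbol is an
# assumption.
def context_window(tokens, ngram, pad='_PAD'):
    half = ngram // 2
    padded = [pad] * half + list(tokens) + [pad] * half
    for i in range(len(tokens)):
        yield padded[i:i + ngram]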
def prepare_samples(self):
    sep = ' {} '.format(_SEP)
    random.seed(self.seed)
    train_samples = self.load_samples(self.train_files) \
        if self.train_files else []
    test_samples = self.load_samples(self.test_files) \
        if self.test_files else []
    num_samples = len(train_samples)
    shuffled_indices = range(num_samples)
    random.shuffle(shuffled_indices)
    num_trn_dev_samples = int(np.round(num_samples * self.train_ratio))
    num_dev_samples = int(np.round(num_trn_dev_samples * self.dev_ratio))
    num_train_samples = num_trn_dev_samples - num_dev_samples

    # Filenames
    inp_train = self.train_path + '.inp'
    out_train = self.train_path + '.out'
    inp_dev = self.dev_path + '.inp'
    out_dev = self.dev_path + '.out'
    inp_test = self.test_path + '.inp'
    out_test = self.test_path + '.out'

    # Training Set
    if self.reuse and all(
            [os.path.exists(f) for f in [inp_train, out_train]]):
        pass
    else:
        deleteFiles([inp_train, out_train])
        for idx in shuffled_indices[:num_train_samples]:
            for in_win, out in zip(
                    self.context_window(train_samples[idx]['input'],
                                        self.ngram),
                    train_samples[idx]['output']):
                inp = in_win[self.ngram // 2].lower()
                out_chars = list(out.lower().replace(' ', '_'))
                # If the output is blank, then we ignore the sample since we
                # do not want to consider such cases for training. For example,
                # if the input was ['b', 'cuz'] the output will be
                # ['because', '']. For now, the model is not being trained to
                # recognize that.
                if not self.valid_token(inp) or len(out_chars) == 0:
                    continue
                if self.filter_vocab_train and inp in self.aspell:
                    continue
                out = [_GO] + out_chars + [_EOS]
                writeToFile(inp_train,
                            sep.join(self.convert_format(in_win)) + '\n')
                # The output sequence might contain a space, implying that the
                # normalized output was a set of two words. This space is
                # represented as an underscore (`_`) because space is used as a
                # delimiter in the input format. Later, while decoding, the
                # output of the model should be processed to replace
                # underscores with spaces.
                writeToFile(out_train, ' '.join(out) + '\n')

    # Dev Set
    if self.reuse and all([os.path.exists(f) for f in [inp_dev, out_dev]]):
        pass
    else:
        deleteFiles([inp_dev, out_dev])
        for idx in shuffled_indices[num_train_samples:num_trn_dev_samples]:
            for in_win, out in zip(
                    self.context_window(train_samples[idx]['input'],
                                        self.ngram),
                    train_samples[idx]['output']):
                inp = in_win[self.ngram // 2].lower()
                out_chars = list(out.lower().replace(' ', '_'))
                if not self.valid_token(inp) or len(out_chars) == 0:
                    continue
                if self.filter_vocab_train and inp in self.aspell:
                    continue
                out = [_GO] + out_chars + [_EOS]
                writeToFile(inp_dev,
                            sep.join(self.convert_format(in_win)) + '\n')
                writeToFile(out_dev, ' '.join(out) + '\n')

    # Test Set
    if self.reuse and all(
            [os.path.exists(f) for f in [inp_test, out_test]]):
        pass
    else:
        deleteFiles([inp_test, out_test])
        test_samples.extend(train_samples[num_trn_dev_samples:])
        for sample in test_samples:
            for in_win, out in zip(
                    self.context_window(sample['input'], self.ngram),
                    sample['output']):
                inp = in_win[self.ngram // 2].lower()
                out_chars = list(out.lower().replace(' ', '_'))
                if not self.valid_token(inp) or len(out_chars) == 0:
                    continue
                if self.filter_vocab_test and inp in self.aspell:
                    continue
                out = [_GO] + out_chars + [_EOS]
                writeToFile(inp_test,
                            sep.join(self.convert_format(in_win)) + '\n')
                writeToFile(out_test, ' '.join(out) + '\n')
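# `valid_token` is referenced above but not defined in these snippets. A
# plausible filter (purely an assumption) skips empty, punctuation-only and
# purely numeric centre tokens:
import string


def valid_token(token):
    if not token:
        return False
    if all(ch in string.punctuation for ch in token):
        return False
    if token.isdigit():
        return False
    return True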