def __init__(self, data_file_name, vocabulary_file_path, batch_size, sequence_length):
    """Prepare a batch reader over a UTF-8 corpus file.

    :param data_file_name: path to the UTF-8 text corpus to read batches from
    :param vocabulary_file_path: path to a serialized Vocabulary to load
    :param batch_size: number of sequences per batch
    :param sequence_length: number of characters per sequence
    """
    # Plain size parameters first; they have no side effects.
    self.batch_size = batch_size
    self.sequence_length = sequence_length
    # Load the character vocabulary used to encode the text.
    self.vocabulary = Vocabulary()
    self.vocabulary.retrieve(vocabulary_file_path)
    # Keep the corpus open for incremental reads; caller is expected to close it.
    self.data_file = codecs.open(data_file_name, 'r', 'utf_8')
def main():
    """Build a character vocabulary from a text file and save it beside the input.

    Command line:
        --input_file  path to the source text (required)

    The vocabulary is written to ``<input stem>.vocab``.
    """
    import os  # local import: only needed for the output-name derivation

    parser = argparse.ArgumentParser()
    parser.add_argument('--input_file', type=str, required=True)
    input_file = parser.parse_args().input_file

    vocabulary = Vocabulary()
    vocabulary.generate(input_file)

    # os.path.splitext strips only the real extension; the previous
    # input_file.index('.') truncated at the FIRST dot, which mangled paths
    # like './data/corpus.txt' and raised ValueError for extensionless names.
    output_file_name = "{}.vocab".format(os.path.splitext(input_file)[0])

    # Context manager guarantees the file is closed even if write() raises.
    with open(output_file_name, 'w') as output_file:
        output_file.write(vocabulary.get_serialized_binary_representation())
    # Single-argument print() is valid in both Python 2 and 3.
    print("Vocabulary saved in {}".format(output_file_name))
class Batch:
    """Streams fixed-size (sequence, next-character) training batches from a
    UTF-8 text corpus, encoding characters via a pre-built Vocabulary."""

    # Class-level default kept for backward compatibility with any external
    # reader of Batch.dataset_full_passes; __init__ shadows it per instance.
    dataset_full_passes = 0

    def __init__(self, data_file_name, vocabulary_file_path, batch_size, sequence_length):
        """Open the corpus file and load the vocabulary.

        :param data_file_name: path to the UTF-8 text corpus
        :param vocabulary_file_path: path to a serialized Vocabulary
        :param batch_size: sequences per batch
        :param sequence_length: characters per sequence
        """
        self.data_file = codecs.open(data_file_name, 'r', 'utf_8')
        self.vocabulary = Vocabulary()
        self.vocabulary.retrieve(vocabulary_file_path)
        self.batch_size = batch_size
        self.sequence_length = sequence_length
        # Count epochs per instance instead of mutating shared class state.
        self.dataset_full_passes = 0

    def get_next_batch(self):
        """Read the next batch from the corpus.

        Returns a tuple ``(x, y)`` where ``x`` has one encoded sequence of
        ``sequence_length`` characters per batch row and ``y`` holds the
        encoded character that follows each sequence.
        """
        # Each row consumes sequence_length chars plus 1 label char.
        string_len = self.batch_size * self.sequence_length + self.batch_size
        current_batch = self.data_file.read(string_len)
        if len(current_batch) < string_len:
            # Hit EOF: pad the remainder with spaces in one step (the old
            # one-char-at-a-time loop was quadratic) and rewind for the
            # next pass over the dataset.
            current_batch += u' ' * (string_len - len(current_batch))
            self.data_file.seek(0)
            self.dataset_full_passes += 1
            print("Pass {} done".format(self.dataset_full_passes))
        batch_vector = []
        label_vector = []
        # range() suffices for an integer stride; np.arange was unnecessary.
        for i in range(0, string_len, self.sequence_length + 1):
            sequence = current_batch[i:i + self.sequence_length]
            # Padding above guarantees a label char exists at this index.
            label = current_batch[i + self.sequence_length]
            batch_vector.append(
                [self.vocabulary.binary_vocabulary[char] for char in sequence])
            label_vector.append(self.vocabulary.binary_vocabulary[label])
        return np.asarray(batch_vector), np.asarray(label_vector)

    def clean(self):
        """Close the underlying corpus file."""
        self.data_file.close()
class Batch:
    """Reads a UTF-8 corpus as a stream of (sequence, next-char) batches,
    with characters encoded through a previously built Vocabulary."""

    # Backward-compatible class default; each instance gets its own counter.
    dataset_full_passes = 0

    def __init__(self, data_file_name, vocabulary_file_path, batch_size, sequence_length):
        """Open the data file and load the encoding vocabulary.

        :param data_file_name: path to the UTF-8 text corpus
        :param vocabulary_file_path: path to a serialized Vocabulary
        :param batch_size: sequences per batch
        :param sequence_length: characters per sequence
        """
        self.data_file = codecs.open(data_file_name, 'r', 'utf_8')
        self.vocabulary = Vocabulary()
        self.vocabulary.retrieve(vocabulary_file_path)
        self.batch_size = batch_size
        self.sequence_length = sequence_length
        # Track epochs on the instance rather than via shared class state.
        self.dataset_full_passes = 0

    def get_next_batch(self):
        """Return the next ``(x, y)`` pair: encoded input sequences and the
        encoded character following each one."""
        # One extra character per row serves as that row's label.
        string_len = self.batch_size * self.sequence_length + self.batch_size
        current_batch = self.data_file.read(string_len)
        if len(current_batch) < string_len:
            # EOF reached: pad with spaces in a single multiply (the previous
            # append-one-space loop was O(n^2)) and rewind for the next epoch.
            current_batch += u' ' * (string_len - len(current_batch))
            self.data_file.seek(0)
            self.dataset_full_passes += 1
            # print() call form works on both Python 2 and 3; the old
            # `print "..."` statement is a SyntaxError on Python 3.
            print("Pass {} done".format(self.dataset_full_passes))
        batch_vector = []
        label_vector = []
        step = self.sequence_length + 1
        # Plain range() — no need for np.arange on an integer stride.
        for start in range(0, string_len, step):
            sequence = current_batch[start:start + self.sequence_length]
            label = current_batch[start + self.sequence_length]
            batch_vector.append(
                [self.vocabulary.binary_vocabulary[ch] for ch in sequence])
            label_vector.append(self.vocabulary.binary_vocabulary[label])
        return np.asarray(batch_vector), np.asarray(label_vector)

    def clean(self):
        """Close the underlying corpus file."""
        self.data_file.close()
def main():
    """Sample text from a trained character model and write it to a file.

    NOTE(review): this is Python 2 code (print statements, str.decode) driving
    a TensorFlow 1.x graph/session API — confirm the intended runtime before
    porting.

    Command line:
        --model_name       checkpoint directory of the trained model (required)
        --vocabulary_file  serialized Vocabulary path (required)
        --output_file      where the generated sample is written (required)
        --seed             priming text for generation
        --sample_length    number of characters to generate
        --log_frequency    progress print interval, in generated chars
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, required=True)
    parser.add_argument('--vocabulary_file', type=str, required=True)
    parser.add_argument('--output_file', type=str, required=True)
    parser.add_argument('--seed', type=str, default="Once upon a time, ")
    parser.add_argument('--sample_length', type=int, default=1500)
    parser.add_argument('--log_frequency', type=int, default=100)
    args = parser.parse_args()
    model_name = args.model_name
    vocabulary_file = args.vocabulary_file
    output_file = args.output_file
    # Python 2 only: decode the byte-string argument to unicode.
    seed = args.seed.decode('utf-8')
    sample_length = args.sample_length
    log_frequency = args.log_frequency

    # Rebuild the graph for the named model and obtain its output op.
    model = Model(model_name)
    model.restore()
    classifier = model.get_classifier()

    vocabulary = Vocabulary()
    vocabulary.retrieve(vocabulary_file)

    sample_file = codecs.open(output_file, 'w', 'utf_8')

    # The deque holds exactly model.sequence_length chars: left-pad with
    # spaces, then append the seed so it fills the window's right end.
    stack = deque([])
    for i in range(0, model.sequence_length - len(seed)):
        stack.append(u' ')
    for char in seed:
        if char not in vocabulary.vocabulary:
            # Unknown seed characters are replaced by a space.
            print char,"is not in vocabulary file"
            char = u' '
        stack.append(char)
        sample_file.write(char)

    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        saver = tf.train.Saver(tf.global_variables())
        # Load the latest checkpoint weights if one exists.
        ckpt = tf.train.get_checkpoint_state(model_name)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        for i in range(0, sample_length):
            # Encode the current window as a single-example batch.
            vector = []
            for char in stack:
                vector.append(vocabulary.binary_vocabulary[char])
            vector = np.array([vector])
            prediction = sess.run(classifier, feed_dict={model.x: vector})
            # Greedy decoding: always take the argmax character.
            predicted_char = vocabulary.char_lookup[np.argmax(prediction)]
            # Slide the window: drop the oldest char, append the prediction.
            stack.popleft()
            stack.append(predicted_char)
            sample_file.write(predicted_char)
            if i % log_frequency == 0:
                # Integer division — intended under Python 2.
                print "Progress: {}%".format((i * 100) / sample_length)

    sample_file.close()
    print "Sample saved in {}".format(output_file)