    def __init__(self, data_file_name, vocabulary_file_path, batch_size, sequence_length):
        # Open the training corpus as UTF-8 text.
        self.data_file = codecs.open(data_file_name, 'r', 'utf_8')

        # Load a previously generated vocabulary (see Example #2).
        self.vocabulary = Vocabulary()
        self.vocabulary.retrieve(vocabulary_file_path)

        self.batch_size = batch_size
        self.sequence_length = sequence_length
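A minimal instantiation sketch, assuming this constructor belongs to the Batch class shown in Example #5; corpus.txt and corpus.vocab are hypothetical corpus and vocabulary file names:

# Hypothetical file names; batch_size and sequence_length are arbitrary choices.
batch = Batch('corpus.txt', 'corpus.vocab', batch_size=50, sequence_length=50)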
Example #2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_file', type=str, required=True)
    input_file = parser.parse_args().input_file

    # Build the character vocabulary from the raw training text.
    vocabulary = Vocabulary()
    vocabulary.generate(input_file)

    # Derive the output name from the input name (os.path.splitext avoids the
    # crash that input_file.index('.') caused for paths without a dot).
    output_file_name = "{}.vocab".format(os.path.splitext(input_file)[0])
    with open(output_file_name, 'w') as output_file:
        output_file.write(vocabulary.get_serialized_binary_representation())

    print("Vocabulary saved in {}".format(output_file_name))
Example #5
class Batch:
    def __init__(self, data_file_name, vocabulary_file_path, batch_size,
                 sequence_length):
        self.data_file = codecs.open(data_file_name, 'r', 'utf_8')

        self.vocabulary = Vocabulary()
        self.vocabulary.retrieve(vocabulary_file_path)

        self.batch_size = batch_size
        self.sequence_length = sequence_length
        # Counts complete passes over the dataset; kept as an instance
        # attribute so each Batch object tracks its own progress.
        self.dataset_full_passes = 0

    def get_next_batch(self):
        # Each example is sequence_length input characters plus one label
        # character, so a batch consumes batch_size * (sequence_length + 1)
        # characters of text.
        string_len = self.batch_size * (self.sequence_length + 1)
        current_batch = self.data_file.read(string_len)
        batch_vector = []
        label_vector = []

        # A short read means the end of the file: pad the tail with spaces,
        # rewind, and record a completed pass over the dataset.
        if len(current_batch) < string_len:
            current_batch = current_batch.ljust(string_len, u' ')
            self.data_file.seek(0)
            self.dataset_full_passes += 1
            print("Pass {} done".format(self.dataset_full_passes))

        for i in range(0, string_len, self.sequence_length + 1):
            sequence = current_batch[i:i + self.sequence_length]
            label = current_batch[i + self.sequence_length]
            sequence_vector = [self.vocabulary.binary_vocabulary[char]
                               for char in sequence]
            batch_vector.append(sequence_vector)
            label_vector.append(self.vocabulary.binary_vocabulary[label])

        return np.asarray(batch_vector), np.asarray(label_vector)

    def clean(self):
        self.data_file.close()
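A minimal consumption sketch for the class above, assuming binary_vocabulary maps each character to a one-hot vector; corpus.txt and corpus.vocab are hypothetical file names and the loop bound is arbitrary:

batch = Batch('corpus.txt', 'corpus.vocab', batch_size=50, sequence_length=50)
for step in range(1000):
    x, y = batch.get_next_batch()
    # x: (batch_size, sequence_length, vocabulary_size) one-hot inputs
    # y: (batch_size, vocabulary_size) one-hot next-character labels
    # ...feed x and y to the model's training step here...
batch.clean()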
Example #7
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, required=True)
    parser.add_argument('--vocabulary_file', type=str, required=True)
    parser.add_argument('--output_file', type=str, required=True)

    parser.add_argument('--seed', type=str, default="Once upon a time, ")
    parser.add_argument('--sample_length', type=int, default=1500)
    parser.add_argument('--log_frequency', type=int, default=100)
    args = parser.parse_args()

    model_name = args.model_name
    vocabulary_file = args.vocabulary_file
    output_file = args.output_file
    seed = args.seed  # argparse already returns unicode text in Python 3
    sample_length = args.sample_length
    log_frequency = args.log_frequency

    model = Model(model_name)
    model.restore()
    classifier = model.get_classifier()

    vocabulary = Vocabulary()
    vocabulary.retrieve(vocabulary_file)

    sample_file = codecs.open(output_file, 'w', 'utf_8')

    # Left-pad the seed with spaces so the sliding window is exactly
    # model.sequence_length characters wide (assumes the seed is no longer
    # than model.sequence_length).
    stack = deque()
    for i in range(model.sequence_length - len(seed)):
        stack.append(u' ')

    # Characters outside the vocabulary are replaced with a space.
    for char in seed:
        if char not in vocabulary.vocabulary:
            print(char, "is not in the vocabulary file")
            char = u' '
        stack.append(char)
        sample_file.write(char)

    with tf.Session() as sess:
        tf.global_variables_initializer().run()

        saver = tf.train.Saver(tf.global_variables())
        ckpt = tf.train.get_checkpoint_state(model_name)

        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)

            for i in range(sample_length):
                # Encode the current window and predict the next character.
                vector = [vocabulary.binary_vocabulary[char] for char in stack]
                vector = np.array([vector])
                prediction = sess.run(classifier, feed_dict={model.x: vector})
                predicted_char = vocabulary.char_lookup[np.argmax(prediction)]

                # Slide the window one character forward.
                stack.popleft()
                stack.append(predicted_char)
                sample_file.write(predicted_char)

                if i % log_frequency == 0:
                    print("Progress: {}%".format(i * 100 // sample_length))

            sample_file.close()
            print("Sample saved in {}".format(output_file))