def main(args):
    if args.filelist is not None:
        paths_file = utils.read_lines(args.filelist)
    else:
        paths_file = args.files
    path_vocab = args.vocab

    utils.build_vocabulary(paths_file=paths_file,
                           path_vocab=path_vocab,
                           prune_at=50000,
                           min_count=-1,
                           special_words=["<root>"])
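The body of `utils.build_vocabulary` is not shown in this listing. As a rough, hypothetical sketch of what a helper with this signature might do (count whitespace-separated tokens over the input files, keep at most `prune_at` of the most frequent words with at least `min_count` occurrences, and write the special words plus word/count lines to `path_vocab`; the real implementation may differ):

from collections import Counter

def build_vocabulary(paths_file, path_vocab, prune_at, min_count, special_words):
    # Hypothetical sketch, not the project's actual utils.build_vocabulary.
    counter = Counter()
    for path in paths_file:
        with open(path) as f:
            for line in f:
                counter.update(line.split())
    with open(path_vocab, "w") as f:
        for word in special_words:
            f.write("%s\t-1\n" % word)  # special tokens first, with a dummy count
        for word, count in counter.most_common(prune_at):
            if min_count > 0 and count < min_count:
                break
            f.write("%s\t%d\n" % (word, count))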
Example #2
def main():
    config = utils.Config()

    utils.mkdir(os.path.join(config.getpath("data"), "scidtb-vocab"))

    relation_mapper = treetk.rstdt.RelationMapper(corpus_name="scidtb")

    filenames = []

    for parts in [("train",),
                  ("dev", "gold"), ("dev", "second_annotate"),
                  ("test", "gold"), ("test", "second_annotate")]:
        split_dir = os.path.join(config.getpath("data"), "scidtb", "preprocessed", *parts)
        for filename in os.listdir(split_dir):
            filenames.append(os.path.join(split_dir, filename))

    filenames = [n for n in filenames if n.endswith(".edus.tokens")]
    filenames.sort()

    tmp_f_path = os.path.join(config.getpath("data"), "scidtb-vocab", "tmp_f.txt")
    tmp_c_path = os.path.join(config.getpath("data"), "scidtb-vocab", "tmp_c.txt")
    with open(tmp_f_path, "w") as ff, open(tmp_c_path, "w") as fc:
        for filename in filenames:
            lines = utils.read_lines(filename.replace(".edus.tokens", ".arcs"), process=lambda line: line.split())
            assert len(lines) == 1
            line = lines[0]
            arcs = treetk.hyphens2arcs(line)
            fine_relations = [l for h,d,l in arcs]
            coarse_relations = [relation_mapper.f2c(l) for l in fine_relations]
            fine_relations = " ".join(fine_relations)
            coarse_relations = " ".join(coarse_relations)
            ff.write("%s\n" % fine_relations)
            fc.write("%s\n" % coarse_relations)

    utils.build_vocabulary(paths_file=[tmp_f_path],
                           path_vocab=os.path.join(config.getpath("data"), "scidtb-vocab", "relations.fine.vocab.txt"),
                           prune_at=50000,
                           min_count=-1,
                           special_words=["<root>"],
                           with_unk=False)
    utils.build_vocabulary(paths_file=[tmp_c_path],
                           path_vocab=os.path.join(config.getpath("data"), "scidtb-vocab", "relations.coarse.vocab.txt"),
                           prune_at=50000,
                           min_count=-1,
                           special_words=["<root>"],
                           with_unk=False)
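For context, each `.arcs` file read above contains a single line of space-separated items that `treetk.hyphens2arcs` turns into `(head, dependent, label)` triples. A hypothetical stand-in (assuming each item looks like "<head>-<dependent>-<label>"; the real treetk function may behave differently) could be:

def hyphens_to_arcs(items):
    # Hypothetical stand-in for treetk.hyphens2arcs.
    arcs = []
    for item in items:
        head, dep, label = item.split("-", 2)  # labels may themselves contain hyphens
        arcs.append((int(head), int(dep), label))
    return arcs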
Example #3
    def make_partitions_quora(self):
        self.shuffle()
        vocab_non_sim = self._non_sim_data[:231027]
        vocab_sim = self._sim_data[:133263]
        vocab_processor, sequence_length = build_vocabulary(
            vocab_sim, vocab_non_sim)
        train_non_sim = [
            self.to_index_data(data, vocab_processor)
            for data in self._non_sim_data[:207026]
        ]
        train_sim = [
            self.to_index_data(data, vocab_processor)
            for data in self._sim_data[:117262]
        ]
        dev_non_sim = [
            self.to_index_data(data, vocab_processor)
            for data in self._non_sim_data[207027:231027]
        ]
        dev_sim = [
            self.to_index_data(data, vocab_processor)
            for data in self._sim_data[117263:133263]
        ]
        test_non_sim = [
            self.to_index_data(data, vocab_processor)
            for data in self._non_sim_data[231027:]
        ]
        test_sim = [
            self.to_index_data(data, vocab_processor)
            for data in self._sim_data[133263:]
        ]

        return train_non_sim, train_sim, dev_non_sim, dev_sim, \
               test_non_sim, test_sim, vocab_processor, sequence_length
Example #4
    def __init__(self, batch_size, sequence_length, data_path='./ptb_data/', seed=123):
        
        np.random.seed(seed)
        self.batch_size = batch_size
        self.seq_len = sequence_length
        train_file = data_path + 'ptb.train_small.txt'
        valid_file = data_path + 'ptb.valid.txt'
        test_file = data_path + 'ptb.test.txt'

        # word -> id mapping based on the word frequencies of the training data
        # e.g. 'the' -> 0, '<unk>' -> 1, ..., 'wachter' -> 9999
        # the whole word -> id dictionary is saved in ptb_data/ptb_word_to_id.txt
        self.word_to_id, self.id_to_word = build_vocabulary(train_file)
        train_data = file_to_word_ids(train_file, self.word_to_id)
        valid_data = file_to_word_ids(valid_file, self.word_to_id)
        test_data = file_to_word_ids(test_file, self.word_to_id)
        
        #make x, y
        n_chunk = int((len(train_data)-1)//self.seq_len)
        self.train_x = np.reshape(train_data[:n_chunk*self.seq_len], [n_chunk, self.seq_len])
        self.train_y = np.reshape(train_data[1:n_chunk*self.seq_len+1], [n_chunk, self.seq_len])

        n_chunk = int((len(valid_data)-1)//self.seq_len)
        self.valid_x = np.reshape(valid_data[:n_chunk*self.seq_len], [n_chunk,self.seq_len])
        self.valid_y = np.reshape(valid_data[1:n_chunk*self.seq_len+1], [n_chunk, self.seq_len])

        n_chunk = int((len(test_data)-1)//self.seq_len)
        self.test_x = np.reshape(test_data[:n_chunk*self.seq_len], [n_chunk, self.seq_len])
        self.test_y = np.reshape(test_data[1:n_chunk*self.seq_len+1], [n_chunk, self.seq_len])

        self.mode = '' #train, valid, or test
        self.counter = 0
        self.n_data = self.train_x.shape[0]
        self.n_batch = int(self.n_data // self.batch_size)
        self.data_idx_perm = np.random.permutation(self.n_data)
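The `build_vocabulary(train_file)` helper used here is not part of the snippet; a minimal sketch that matches the comment above (ids assigned by descending word frequency in the training file, so e.g. 'the' -> 0) might look like this, though the real helper may differ:

from collections import Counter

def build_vocabulary(train_file):
    # Hypothetical sketch: assign ids by descending frequency in the training data.
    with open(train_file) as f:
        words = f.read().split()
    word_to_id = {w: i for i, (w, _) in enumerate(Counter(words).most_common())}
    id_to_word = {i: w for w, i in word_to_id.items()}
    return word_to_id, id_to_word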
Example #5
def originalVocabulary(filename: str) -> dict:
    '''
    Create the original vocabulary.

    Each term is stored using the following format:
    term -> [frequency, # of yes's, # of no's]
    '''
    data = utils.load_data(filename)
    vocabulary = dict()

    for tokens in data:
        tokensList = tokens[1].lower().split()
        sentiment = tokens[2]
        utils.build_vocabulary(vocabulary, tokensList, sentiment)

    return vocabulary
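The `utils.build_vocabulary(vocabulary, tokensList, sentiment)` call mutates the dictionary in place; a sketch consistent with the documented `term -> [frequency, # of yes's, # of no's]` format, assuming the sentiment labels are the strings 'yes' and 'no' (the real helper may differ):

def build_vocabulary(vocabulary, tokens, sentiment):
    # Hypothetical sketch that updates the dict in place.
    for token in tokens:
        entry = vocabulary.setdefault(token, [0, 0, 0])
        entry[0] += 1                  # overall frequency
        if sentiment == 'yes':
            entry[1] += 1              # number of "yes" occurrences
        elif sentiment == 'no':
            entry[2] += 1              # number of "no" occurrences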
Example #6
def load_ibm():
    """ Load the train and dev datasets """
    IBM_PATH = '/home/mgimenez/Dev/corpora/Quora/IBM'
    TRAIN_PATH = join(IBM_PATH, 'train.tsv')
    train = Corpus('ibm', TRAIN_PATH)
    DEV_PATH = join(IBM_PATH, 'dev.tsv')
    dev = Corpus('ibm', DEV_PATH)
    TEST_PATH = join(IBM_PATH, 'test.tsv')
    test = Corpus('ibm', TEST_PATH)

    vocab_processor, seq_len = build_vocabulary(train.sim_data,
                                                train.non_sim_data)
    train.to_index(vocab_processor)
    dev.to_index(vocab_processor)
    test.to_index(vocab_processor)

    return train.non_sim_data, train.sim_data, \
           dev.non_sim_data, dev.sim_data, \
           test.sim_data, test.non_sim_data, \
           vocab_processor, seq_len
Example #7
    def create_vocabularies(self,
                            num_sim_sentences,
                            num_nonsim_sentences,
                            partitions_path=None):
        """ Create and save the vocabularies

        :param partitions_path: path where the binarized files should be saved
            if this is not present the vocabularies won't be saved.
        :return: the vocabulary processor.

        """
        vocab_non_sim = self._non_sim_data[:num_nonsim_sentences]
        vocab_sim = self._sim_data[:num_sim_sentences]
        vocab_processor, sequence_length = build_vocabulary(
            vocab_sim, vocab_non_sim)
        if partitions_path:
            if not isdir(partitions_path):
                makedirs(partitions_path)
            pickle.dump(vocab_processor,
                        open(join(partitions_path, "vocab.train"), "wb"))
            pickle.dump(sequence_length,
                        open(join(partitions_path, "sequence.len"), "wb"))

        return vocab_processor
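In these Quora/IBM examples, `build_vocabulary(sim_data, non_sim_data)` returns a vocabulary processor together with the maximum sequence length. A framework-free sketch, assuming every data item is a pair of already-tokenized sentences (the project's real helper probably wraps a library vocabulary processor instead):

def build_vocabulary(sim_data, non_sim_data):
    # Hypothetical sketch: build a token -> id map and track the longest sentence.
    vocab = {'<PAD>': 0, '<UNK>': 1}
    sequence_length = 0
    for first, second in list(sim_data) + list(non_sim_data):
        for sentence in (first, second):
            sequence_length = max(sequence_length, len(sentence))
            for token in sentence:
                vocab.setdefault(token, len(vocab))
    return vocab, sequence_length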
Example #8
                                                random_seed=0)

num_train, num_dev, fr_train, fr_dev = train_test_split(numbers,
                                                        french_numbers,
                                                        test_size=0.5,
                                                        random_state=0)

num_val, num_test, fr_val, fr_test = train_test_split(num_dev,
                                                      fr_dev,
                                                      test_size=0.5,
                                                      random_state=0)

tokenized_fr_train = [tokenize(s, word_level=True) for s in fr_train]
tokenized_num_train = [tokenize(s, word_level=False) for s in num_train]

fr_vocab, rev_fr_vocab = build_vocabulary(fr_train)
num_vocab, rev_num_vocab = build_vocabulary(num_train, word_level=False)
shared_vocab, rev_shared_vocab = build_vocabulary_token(tokenized_fr_train +
                                                        tokenized_num_train)

np.save('data_npy/fr_train', fr_train)
np.save('data_npy/num_train', num_train)
np.save('data_npy/fr_val', fr_val)
np.save('data_npy/num_val', num_val)
np.save('data_npy/fr_test', fr_test)
np.save('data_npy/num_test', num_test)
# np.save('data_npy/fr_vocab',fr_vocab)
np.save('data_npy/rev_fr_vocab', rev_fr_vocab)
# np.save('data_npy/num_vocab',num_vocab)
np.save('data_npy/rev_num_vocab', rev_num_vocab)
# np.save('data_npy/shared_vocab',shared_vocab)
Example #9
def main():
    """
    Entry point for training and evaluation.
    """
    args = parse_arguments()

    # Summaries
    summaries_dir = '{0}/{1}'.format(
        FLAGS.summaries_dir,
        datetime.datetime.now().strftime('%d_%b_%Y-%H_%M_%S'))
    train_writer = tf.summary.FileWriter(summaries_dir + '/train')
    validation_writer = tf.summary.FileWriter(summaries_dir + '/validation')

    # Model directory
    model_name = str(int(time.time()))
    model_dir = '{0}/{1}'.format(FLAGS.checkpoints_dir, model_name)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # Save configuration
    FLAGS(sys.argv)
    # config = FLAGS.__dict__['__flags']
    config = FLAGS
    with open('{}/config.pkl'.format(model_dir), 'wb') as f:
        pickle.dump(config, f)

    # Generate vocabulary and load compressed word embeddings model
    vocabulary = utils.build_vocabulary()
    ft_model = embeddings.get_fastText_embedding()
    word_embeddings = utils.compress_word_embedding(vocabulary, ft_model)
    # word_embeddings = None

    with tf.Session() as sess:
        model = BiLSTM(hidden_size=[FLAGS.hidden_size],
                       word_embeddings=word_embeddings,
                       embedding_size=300,
                       vocabulary_size=len(vocabulary),
                       max_seq_length=FLAGS.max_seq_length,
                       learning_rate=FLAGS.learning_rate)

        # Saver object
        saver = tf.train.Saver()

        # Initialize variables first so that a restored checkpoint is not overwritten
        sess.run(tf.global_variables_initializer())

        # Restore checkpoint
        if args.checkpoint:
            saver.restore(sess, FLAGS.checkpoints_dir + '155')

        # Train model
        global_step = 0

        # TODO implement tf.Dataset.
        # sess.run(model.dataset_iterator.make_initializer(train_dataset))
        for epoch in range(FLAGS.epochs):
            X_train, y_train, seq_lengths = utils.generate_data_batch(
                batch_size=FLAGS.batch_size,
                max_seq_length=FLAGS.max_seq_length,
                vocabulary=vocabulary,
                embeddings=word_embeddings)
            feeds_train = [
                model.loss, model.train_step, model.merged
                # model.embedding_lookup
            ]
            feed_dict_train = {
                model.input: X_train,
                model.target: y_train,
                model.seq_len: seq_lengths,
                model.keep_prob: FLAGS.keep_prob
                # model.embedding_init
            }

            try:
                train_loss, _, summary = sess.run(feeds_train, feed_dict_train)
            except Exception as e:
                # utils.debug_data(seq_source_ids, seq_ref_ids, seq_source_len, seq_ref_len, id_to_vocab)
                raise e

            train_writer.add_summary(summary, global_step)
            print('{0}/{1} train loss: {2:.4f}'.format(global_step + 1,
                                                       FLAGS.train_steps,
                                                       train_loss))

            # Check validation performance
            if (global_step + 1) % 101 == 0:
                # TODO implement tf.Dataset.
                # validation_init_op = iterator.make_initializer(valid_dataset)

                X_val, y_val, val_seq_len = utils.generate_data_batch(
                    max_seq_length=FLAGS.max_seq_length,
                    vocabulary=vocabulary,
                    train=False)
                feed_val = [
                    model.loss, model.accuracy, model.merged
                    # model.embedding_lookup
                ]
                feed_dict_val = {
                    model.input: X_val,
                    model.target: y_val,
                    model.seq_len: val_seq_len,
                    model.keep_prob: 1
                    # model.embedding_init
                }
                try:
                    val_loss, accuracy, summary = sess.run(
                        feed_val, feed_dict_val)
                except Exception as e:
                    # utils.debug_data(seq_source_ids, seq_ref_ids, seq_source_len, seq_ref_len, id_to_vocab)
                    raise e

                validation_writer.add_summary(summary, global_step)
                print('   validation loss: {0:.4f} (accuracy {1:.4f})'.format(
                    val_loss, accuracy))

            global_step += 1
            # End train batch

            save_path = saver.save(sess,
                                   '{}/model.ckpt'.format(model_dir),
                                   global_step=global_step)
Example #10
    def write_partitions_mixed(self, partitions_path, one_hot=False):
        """ Create the partitions and write them in csv """
        # Shuffle the dataset
        # This was commented because the pipeline handles the shuffle.
        # self.shuffle()

        # Create and save the vocabularies
        vocab_non_sim = self._non_sim_data[:231027]
        vocab_sim = self._sim_data[:133263]
        vocab_processor, sequence_length = build_vocabulary(
            vocab_sim, vocab_non_sim)
        if not isdir(partitions_path):
            makedirs(partitions_path)

        pickle.dump(vocab_processor,
                    open(join(partitions_path, "vocab.train"), "wb"))
        pickle.dump(sequence_length,
                    open(join(partitions_path, "sequence.len"), "wb"))

        # Create and save the TRAIN FILE
        writer = tf.python_io.TFRecordWriter(
            join(partitions_path, "train.tfrecords"))
        lines = 0
        for i in range(133263):
            # Write a non similar sentence
            data = self._non_sim_data[i]
            data_idx = self.to_index_data(data, vocab_processor)
            write_tfrecord(writer, data_idx, one_hot)

            # Write a similar sentence
            data = self._sim_data[i]
            data_idx = self.to_index_data(data, vocab_processor)
            write_tfrecord(writer, data_idx, one_hot)
            lines += 2

        for i in range(133263, 231027):
            data = self._non_sim_data[i]
            data_idx = self.to_index_data(data, vocab_processor)
            write_tfrecord(writer, data_idx, one_hot)
            lines += 1
        print("Saved {} data examples for training".format(lines))

        # Create and save the DEV FILE
        writer = tf.python_io.TFRecordWriter(
            join(partitions_path, "dev.tfrecords"))
        lines = 0
        # Mixed part: similar and non similar sentences
        for i, j in zip(range(231027, 239027), range(133263, 141263)):
            data = self._non_sim_data[i]
            data_idx = self.to_index_data(data, vocab_processor)
            write_tfrecord(writer, data_idx, one_hot)

            data = self._sim_data[j]
            data_idx = self.to_index_data(data, vocab_processor)
            write_tfrecord(writer, data_idx, one_hot)
            lines += 2

        for i in range(239027, 243027):
            data = self._non_sim_data[i]
            data_idx = self.to_index_data(data, vocab_processor)
            write_tfrecord(writer, data_idx, one_hot)
            lines += 1
        print("Saved {} data examples for development".format(lines))

        # Create and save the TEST FILE
        writer = tf.python_io.TFRecordWriter(
            join(partitions_path, "test.tfrecords"))
        lines = 0
        # Mixed part: similar and non similar sentences
        for i, j in zip(range(243027, 251027), range(141263, 149263)):
            data = self._non_sim_data[i]
            data_idx = self.to_index_data(data, vocab_processor)
            write_tfrecord(writer, data_idx, one_hot)

            data = self._sim_data[j]
            data_idx = self.to_index_data(data, vocab_processor)
            write_tfrecord(writer, data_idx, one_hot)
            lines += 2

        for i in range(251027, 255027):
            data = self._non_sim_data[i]
            data_idx = self.to_index_data(data, vocab_processor)
            write_tfrecord(writer, data_idx, one_hot)
            lines += 1
        print("Saved {} data examples for testing".format(lines))
Example #11
import datetime
import os
import sys
import zipfile

import numpy as np
import tensorflow as tf

# Assumed imports for the make_sampling_table / skipgrams calls below (Keras preprocessing utilities)
from keras.preprocessing import sequence
from keras.preprocessing.sequence import skipgrams

from utils import build_vocabulary

MODELS_FOLDER = os.path.join(os.path.dirname(__file__), "models")

TIMESTAMP = datetime.datetime.now().strftime("%Y-%m-%d-%H%M%S")

input_filenames = sys.argv[1:]
_, indices, count, dictionary, reverse_dictionary = build_vocabulary(
    input_filenames)
vocab_size = len(dictionary)
print(indices[:7])

window_size = 3
vector_dim = 300
epochs = 200000

valid_size = 16  # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

sampling_table = sequence.make_sampling_table(vocab_size)
couples, labels = skipgrams(indices,
                            vocab_size,
                            window_size=window_size,
Example #12
MODELS_FOLDER = os.path.join(os.path.dirname(__file__), "models")
LSTM_MODEL_PATH = MODELS_FOLDER + "/lstm_model_2018-12-12-005552.json"
LSTM_WEIGHTS_PATH = MODELS_FOLDER + "/lstm_model_2018-12-12-005552.h5"

SEQUENCE_LEN = 20
NUM_PERIODS_UNTIL_STOP = 50

if __name__ == "__main__":
    # Load the LSTM model.
    with open(LSTM_MODEL_PATH, "r") as json_file:
        lstm = model_from_json(json_file.read())
        lstm.load_weights(LSTM_WEIGHTS_PATH)

    # Build out our input vocabulary from the set of 'Data' files.
    input_filenames = glob.glob(DATA_FOLDER + "/*")
    _, _, _, dictionary, reverse_dictionary = build_vocabulary(input_filenames)

    # Capture an initial input sequence of 20 words or less.
    print("\n\n\n\n")
    print("How would you like to start your Sherlock Holmes story?")
    print(
        "Please input up to 20 'words' (note that all punctuation will also be considered a 'word')."
    )
    print("Type your words here and press <ENTER> when you are done:")
    print(">>> ", end="", flush=True)
    input_seq = _preprocess(sys.stdin.readline())
    print("\nProcessing...")

    # Copy the input sequence as the initial output (capping at SEQUENCE_LEN).
    output = input_seq[:SEQUENCE_LEN]
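`_preprocess` is defined elsewhere in this project; a hypothetical version consistent with the prompt above (lower-case the input and treat every punctuation mark as its own 'word') might be:

import re

def _preprocess(text):
    # Hypothetical sketch: words and punctuation marks become separate tokens.
    return re.findall(r"[a-z']+|[.,!?;:]", text.lower())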
Example #13
def main():
    # Load the data and split it into train and test sets
    full = pyconll.load_from_file('./data/postag_sakha.conllu')
    full_train = full[:1700]
    full_test = full[1700:]
    print('Number of training sentences = ', len(full_train))
    print('Number of test sentences = ', len(full_test))

    # Compute the maximum word and sentence lengths
    MAX_SENT_LEN = max(len(sent) for sent in full_train)
    MAX_ORIG_TOKEN_LEN = max(
        len(token.form) for sent in full_train for token in sent)
    print('Maximum sentence length', MAX_SENT_LEN)
    print('Maximum token length', MAX_ORIG_TOKEN_LEN)

    all_train_texts = [
        ' '.join(token.form for token in sent) for sent in full_train
    ]

    # Build the character vocabulary
    train_char_tokenized = tokenize_corpus(all_train_texts,
                                           tokenizer=character_tokenize)
    char_vocab, word_doc_freq = build_vocabulary(train_char_tokenized,
                                                 max_doc_freq=1.0,
                                                 min_count=5,
                                                 pad_word='<PAD>')
    print("Количество уникальных символов", len(char_vocab))
    print(list(char_vocab.items())[:10])

    # Build the tag vocabulary
    UNIQUE_TAGS = ['<NOTAG>'] + sorted(
        {token.upos
         for sent in full_train for token in sent if token.upos})
    label2id = {label: i for i, label in enumerate(UNIQUE_TAGS)}
    print(label2id)

    # Convert the data to numeric tensors
    train_inputs, train_labels = pos_corpus_to_tensor(full_train, char_vocab,
                                                      label2id, MAX_SENT_LEN,
                                                      MAX_ORIG_TOKEN_LEN)
    train_dataset = TensorDataset(train_inputs, train_labels)

    test_inputs, test_labels = pos_corpus_to_tensor(full_test, char_vocab,
                                                    label2id, MAX_SENT_LEN,
                                                    MAX_ORIG_TOKEN_LEN)
    test_dataset = TensorDataset(test_inputs, test_labels)

    sentence_level_model = SentenceLevelPOSTagger(
        len(char_vocab),
        len(label2id),
        embedding_size=64,
        single_backbone_kwargs=dict(layers_n=4, kernel_size=5, dropout=0.3),
        context_backbone_kwargs=dict(layers_n=4, kernel_size=3, dropout=0.3))
    print('Number of parameters',
          sum(np.prod(t.shape) for t in sentence_level_model.parameters()))

    (best_val_loss, best_sentence_level_model) = train_eval_loop(
        sentence_level_model,
        train_dataset,
        test_dataset,
        F.cross_entropy,
        lr=5e-3,
        epoch_n=30,
        batch_size=64,
        device='cuda',
        early_stopping_patience=5,
        max_batches_per_epoch_train=500,
        max_batches_per_epoch_val=100,
        lr_scheduler_ctor=lambda optim: torch.optim.lr_scheduler.
        ReduceLROnPlateau(optim, patience=2, factor=0.5, verbose=True))

    torch.save(best_sentence_level_model, './models/cnn_pos')

    UNIQUE_TAGS1 = [
        'ADJ', 'ADV', 'AUX', 'CONJ', 'INTJ', 'NOUN', 'NUM', 'PART', 'PR',
        'PRON', 'VERB'
    ]

    from sklearn.metrics import classification_report
    train_pred = predict_with_model(sentence_level_model, train_dataset)
    train_loss = F.cross_entropy(torch.tensor(train_pred),
                                 torch.tensor(train_labels))
    print('Mean loss on the training set', float(train_loss))
    print(
        classification_report(train_labels.view(-1),
                              train_pred.argmax(1).reshape(-1),
                              labels=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 13],
                              target_names=UNIQUE_TAGS1))
    print()

    test_pred = predict_with_model(sentence_level_model, test_dataset)
    test_loss = F.cross_entropy(torch.tensor(test_pred),
                                torch.tensor(test_labels))
    print('Mean loss on the validation set', float(test_loss))
    print(
        classification_report(test_labels.view(-1),
                              test_pred.argmax(1).reshape(-1),
                              labels=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 13],
                              target_names=UNIQUE_TAGS1))
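The `build_vocabulary(train_char_tokenized, max_doc_freq=..., min_count=..., pad_word=...)` helper used above comes from the surrounding course code and is not shown here; a rough sketch of what such a function typically does (count document frequencies, drop tokens that are too rare or too frequent, reserve `pad_word` as id 0, and also return per-token document frequencies), under those assumptions:

from collections import Counter
import numpy as np

def build_vocabulary(tokenized_texts, max_doc_freq=1.0, min_count=1, pad_word=None):
    # Hypothetical sketch: count in how many documents each token occurs,
    # filter by document frequency, and assign ids by descending frequency.
    doc_counts = Counter()
    for tokens in tokenized_texts:
        doc_counts.update(set(tokens))
    n_docs = len(tokenized_texts)
    kept = {tok: cnt for tok, cnt in doc_counts.items()
            if cnt >= min_count and cnt / n_docs <= max_doc_freq}
    vocab = {} if pad_word is None else {pad_word: 0}
    for tok, _ in sorted(kept.items(), key=lambda kv: -kv[1]):
        vocab.setdefault(tok, len(vocab))
    doc_freqs = np.zeros(len(vocab), dtype=np.float32)
    for tok, idx in vocab.items():
        doc_freqs[idx] = kept.get(tok, 0) / n_docs
    return vocab, doc_freqs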
Example #14
    path_loss = os.path.join(PLOTS_FOLDER, "lstm_plot_loss_" + TIMESTAMP)

    plt.savefig(path_loss, bbox_inches='tight')


if __name__ == "__main__":
    # Create folders for any generated files.
    os.makedirs(GENTEXT_FOLDER, exist_ok=True)
    os.makedirs(PLOTS_FOLDER, exist_ok=True)
    os.makedirs(MODELS_FOLDER, exist_ok=True)

    # PREPROCESS THE DATA
    # pass in the text file name as the first argument
    # e.g. `$ python lstm_rnn.py sample1.txt`
    input_filenames = sys.argv[1:]
    words, indices, count, word_indices, indices_word = build_vocabulary(input_filenames)

    # SEQUENCE THE TEXT
    sequences = []
    next_words = []
    for i in range(0, len(words) - SEQUENCE_LEN, STEP):
        sequences.append(words[i:i+SEQUENCE_LEN])
        next_words.append(words[i+SEQUENCE_LEN])

    # SPLIT DATA INTO TRAIN AND TEST DATA
    (sequences_train, next_words_train), (sequences_test, next_words_test) = shuffle_and_split_training_set(sequences, next_words)

    # BUILD AND COMPILE THE MODEL
    model = get_model()
    model.compile(loss='sparse_categorical_crossentropy', optimizer="adam", metrics=['accuracy'])