from matplotlib import pyplot as plt from keras.preprocessing.text import text_to_word_sequence from keras.preprocessing.text import Tokenizer import tensorflowjs as tfjs # Params BATCH_SIZE = 512 # Number of examples used in each iteration EPOCHS = 100 # Number of passes through entire dataset EMBEDDING = 40 # Dimension of word embedding vector # importing the data dir_path = 'annotated/corpus' dataProcessor = DataProcessor(dir_path, 'tei') sentences = dataProcessor.getListOfTuples() word2idx = {w: i + 2 for i, w in enumerate(dataProcessor.getWords())} word2idx['unk'] = 1 word2idx['pad'] = 0 idx2word = {i: w for w, i in word2idx.items()} tag2idx = {t: i + 1 for i, t in enumerate(dataProcessor.getTags())} tag2idx['pad'] = 0 idx2tag = {i: w for w, i in tag2idx.items()} # Write dictionary import json with open('model4_js/vocab/word2idx.json', 'w') as fp: json.dump(word2idx, fp) with open('model4_js/vocab/idx2word.json', 'w') as fp: