Example #1
import json

from DataProcessor import DataProcessor
from matplotlib import pyplot as plt
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer
import tensorflowjs as tfjs

# Params
BATCH_SIZE = 512  # Number of examples used in each iteration
EPOCHS = 100  # Number of passes through entire dataset
EMBEDDING = 40  # Dimension of word embedding vector

# Import the annotated TEI corpus
dir_path = 'annotated/corpus'
dataProcessor = DataProcessor(dir_path, 'tei')
sentences = dataProcessor.getListOfTuples()

# Map each word to an integer index; reserve 0 for padding and 1 for unknown words
word2idx = {w: i + 2 for i, w in enumerate(dataProcessor.getWords())}
word2idx['unk'] = 1
word2idx['pad'] = 0

# Reverse lookup: index -> word
idx2word = {i: w for w, i in word2idx.items()}

# Map each tag to an integer index; reserve 0 for padding
tag2idx = {t: i + 1 for i, t in enumerate(dataProcessor.getTags())}
tag2idx['pad'] = 0

# Reverse lookup: index -> tag
idx2tag = {i: t for t, i in tag2idx.items()}
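
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original script: one way the lookup
# tables above could turn `sentences` into padded index sequences. It assumes
# `sentences` is a list of [(word, tag), ...] pairs per sentence; MAX_LEN_DEMO
# and the *_demo variable names are hypothetical.
from keras.preprocessing.sequence import pad_sequences

MAX_LEN_DEMO = 50  # assumed maximum sentence length
X_demo = [[word2idx.get(w, word2idx['unk']) for w, _ in s] for s in sentences]
y_demo = [[tag2idx[t] for _, t in s] for s in sentences]
# Pad (post) with the reserved index 0 so every sequence has the same length
X_demo = pad_sequences(X_demo, maxlen=MAX_LEN_DEMO, padding='post', value=word2idx['pad'])
y_demo = pad_sequences(y_demo, maxlen=MAX_LEN_DEMO, padding='post', value=tag2idx['pad'])
# ---------------------------------------------------------------------------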

# Write dictionary
with open('model4_js/vocab/word2idx.json', 'w') as fp: