def makeVocabulary(filenames, size):
    """Build a word dictionary from space-separated text files.

    Every line of every file is stripped, split on single spaces, and each
    resulting token is added to a fresh ``s2s.Dict`` that is pre-seeded with
    the PAD/UNK/BOS/EOS entries from ``s2s.Constants``.

    Args:
        filenames: Iterable of paths to UTF-8 encoded text files.
        size: Value forwarded to ``vocab.prune`` to limit the dictionary.

    Returns:
        The dictionary object returned by ``vocab.prune(size)``.
    """
    vocab = s2s.Dict(
        [
            s2s.Constants.PAD_WORD,
            s2s.Constants.UNK_WORD,
            s2s.Constants.BOS_WORD,
            s2s.Constants.EOS_WORD,
        ],
        # Taken from the module-level options object.
        lower=opt.lower,
    )

    for filename in filenames:
        with open(filename, encoding='utf-8') as f:
            # Iterate the file object directly so lines are streamed instead
            # of materialising the whole corpus in memory via readlines().
            for sent in f:
                # NOTE(review): split(' ') on a single space means blank lines
                # and runs of consecutive spaces produce '' tokens, which are
                # added like any other word. Kept as-is to preserve the
                # resulting vocabulary; confirm whether this is intended.
                for word in sent.strip().split(' '):
                    vocab.add(word)

    originalSize = vocab.size()
    vocab = vocab.prune(size)
    print('Created dictionary of size %d (pruned from %d)' %
          (vocab.size(), originalSize))

    return vocab
def initVocabulary(name, dataFiles, vocabFile, vocabSize):
    """Return a vocabulary, loaded from ``vocabFile`` or built from data.

    Args:
        name: Human-readable label used only in the log messages.
        dataFiles: Files handed to ``makeVocabulary`` when no vocabulary
            file is supplied.
        vocabFile: Path of an existing dictionary file, or ``None`` to
            generate the dictionary from ``dataFiles``.
        vocabSize: Size limit handed to ``makeVocabulary``; unused when a
            vocabulary file is loaded.

    Returns:
        The loaded ``s2s.Dict`` or the dictionary produced by
        ``makeVocabulary``.
    """
    if vocabFile is None:
        # No saved dictionary was supplied, so derive one from the data.
        building_msg = 'Building ' + name + ' vocabulary...'
        logger.info(building_msg)
        return makeVocabulary(dataFiles, vocabSize)

    # A dictionary file was given: load it rather than rebuilding.
    reading_msg = 'Reading ' + name + ' vocabulary from \'' + vocabFile + '\'...'
    logger.info(reading_msg)

    loaded = s2s.Dict()
    loaded.loadFile(vocabFile)

    loaded_msg = 'Loaded ' + str(loaded.size()) + ' ' + name + ' words'
    logger.info(loaded_msg)
    return loaded
def makeVocabulary(filenames, size):
    """Build a word dictionary from space/tab-separated text files.

    Every line of every file is stripped, tabs are turned into spaces, the
    line is split on single spaces, and each non-empty token is added to a
    fresh ``s2s.Dict`` pre-seeded with the PAD/UNK/BOS/EOS entries from
    ``s2s.Constants``. The empty string is then mapped to index 0 before the
    dictionary is pruned.

    Args:
        filenames: Iterable of paths to UTF-8 encoded text files.
        size: Value forwarded to ``vocab.prune`` to limit the dictionary.

    Returns:
        The dictionary object returned by ``vocab.prune(size)``.
    """
    vocab = s2s.Dict(
        [
            s2s.Constants.PAD_WORD,
            s2s.Constants.UNK_WORD,
            s2s.Constants.BOS_WORD,
            s2s.Constants.EOS_WORD,
        ],
        # ``lower`` is a name defined outside this function.
        lower=lower,
    )

    for filename in filenames:
        with open(filename, encoding='utf-8') as f:
            # Iterate the file object directly so lines are streamed instead
            # of materialising the whole corpus in memory via readlines().
            for sent in f:
                # Treat tabs as additional separators alongside spaces.
                tokens = sent.strip().replace('\t', ' ').split(' ')
                for word in tokens:
                    # Consecutive separators yield '' tokens; skip them.
                    if word:
                        vocab.add(word)

    # Add the null string, pointing it at index 0 (presumably the PAD entry,
    # since PAD_WORD is listed first -- confirm against s2s.Dict).
    # NOTE(review): this is set before prune(); whether the mapping survives
    # depends on how prune() builds its result -- confirm.
    vocab.labelToIdx[''] = 0

    originalSize = vocab.size()
    vocab = vocab.prune(size)
    logger.info('Created dictionary of size %d (pruned from %d)' %
                (vocab.size(), originalSize))

    return vocab