def makeVocabulary(filenames, size):
    """Build a word dictionary from space-separated text files and prune it.

    Args:
        filenames: Iterable of paths to UTF-8 text files; every line is
            stripped and split on single spaces to obtain words.
        size: Target size handed to ``Dict.prune``.

    Returns:
        The pruned ``neusum.Dict``.

    NOTE(review): another definition of ``makeVocabulary`` appears later in
    this file and would shadow this one at import time -- confirm which copy
    is meant to be kept.
    """
    # Seed the dictionary with the special tokens (padding, unknown word,
    # beginning/end of sentence). `lower` is a module-level setting defined
    # outside this function -- presumably it enables lower-casing of entries.
    vocab = neusum.Dict([
        neusum.Constants.PAD_WORD,
        neusum.Constants.UNK_WORD,
        neusum.Constants.BOS_WORD,
        neusum.Constants.EOS_WORD,
    ], lower=lower)

    for filename in filenames:
        with open(filename, encoding='utf-8') as f:
            # Stream the file line by line instead of f.readlines(), so a
            # large corpus is never held in memory as one list of lines.
            for sent in f:
                # NOTE(review): split(' ') is kept on purpose. Blank lines and
                # consecutive spaces yield '' tokens that are also added;
                # confirm downstream tokenisation before switching to split().
                for word in sent.strip().split(' '):
                    vocab.add(word)

    # Remember the size before pruning so the log shows how much was dropped.
    originalSize = vocab.size()
    # presumably prune() keeps the `size` most frequent entries -- verify
    # against neusum.Dict.
    vocab = vocab.prune(size)
    logger.info('Created dictionary of size %d (pruned from %d)' %
                (vocab.size(), originalSize))

    return vocab
def initVocabulary(name, dataFiles, vocabFile, vocabSize):
    """Return a vocabulary, loaded from ``vocabFile`` or built from data.

    Args:
        name: Label used only in the log messages.
        dataFiles: Corpus files passed to ``makeVocabulary`` when no
            ``vocabFile`` is supplied.
        vocabFile: Path of an existing dictionary file, or ``None`` to build
            a new dictionary from ``dataFiles``.
        vocabSize: Size passed to ``makeVocabulary`` when building.

    Returns:
        The loaded or newly built dictionary object.

    NOTE(review): another definition of ``initVocabulary`` appears later in
    this file and would shadow this one at import time -- confirm which copy
    is meant to be kept.
    """
    # No dictionary file supplied: generate the vocabulary from the data.
    if vocabFile is None:
        logger.info('Building ' + name + ' vocabulary...')
        return makeVocabulary(dataFiles, vocabSize)

    # A dictionary file was supplied: load it rather than rebuilding.
    logger.info('Reading ' + name + ' vocabulary from \'' + vocabFile + '\'...')
    loaded = neusum.Dict(lower=lower)
    loaded.loadFile(vocabFile)
    logger.info('Loaded ' + str(loaded.size()) + ' ' + name + ' words')
    return loaded
def makeVocabulary(filenames, size):
    """Collect words from text files into a dictionary, then prune it.

    Args:
        filenames: Iterable of paths to UTF-8 text files. Each line is
            stripped and split on single spaces; every resulting token is
            added to the dictionary.
        size: Target size passed to ``Dict.prune``.

    Returns:
        The pruned ``neusum.Dict``.
    """
    # Start from the special tokens: padding, unknown word, and the
    # beginning/end-of-sentence markers. `lower` comes from module scope.
    specials = [
        neusum.Constants.PAD_WORD,
        neusum.Constants.UNK_WORD,
        neusum.Constants.BOS_WORD,
        neusum.Constants.EOS_WORD,
    ]
    vocab = neusum.Dict(specials, lower=lower)

    for filename in filenames:
        with open(filename, encoding='utf-8') as f:
            # Iterate the file handle directly rather than f.readlines(), so
            # the whole file is not loaded into memory at once.
            for sent in f:
                # NOTE(review): split(' ') (not split()) is preserved, so
                # empty tokens from blank lines or repeated spaces are still
                # added -- confirm this matches how the data is tokenised
                # elsewhere before changing it.
                for word in sent.strip().split(' '):
                    vocab.add(word)

    # Record the unpruned size for the log message below.
    originalSize = vocab.size()
    # presumably keeps the `size` most frequent entries -- verify against
    # neusum.Dict.prune.
    vocab = vocab.prune(size)
    logger.info('Created dictionary of size %d (pruned from %d)' %
                (vocab.size(), originalSize))

    return vocab
def initVocabulary(name, dataFiles, vocabFile, vocabSize):
    """Obtain the vocabulary: read it from a file if given, else build it.

    Args:
        name: Human-readable label interpolated into the log messages.
        dataFiles: Corpus files forwarded to ``makeVocabulary``; only used
            when ``vocabFile`` is ``None``.
        vocabFile: Path of a saved dictionary to load, or ``None``.
        vocabSize: Size forwarded to ``makeVocabulary``; only used when
            ``vocabFile`` is ``None``.

    Returns:
        The dictionary that was loaded from ``vocabFile`` or generated from
        ``dataFiles``.
    """
    if vocabFile is None:
        # Nothing to load, so generate the dictionary from the data files.
        logger.info('Building ' + name + ' vocabulary...')
        return makeVocabulary(dataFiles, vocabSize)

    # An existing dictionary was supplied: read it and log its size.
    logger.info('Reading ' + name + ' vocabulary from \'' + vocabFile + '\'...')
    existing = neusum.Dict(lower=lower)
    existing.loadFile(vocabFile)
    logger.info('Loaded ' + str(existing.size()) + ' ' + name + ' words')
    return existing