Beispiel #1
0
def makeVocabulary(filenames, size):
    """Build a pruned vocabulary from space-tokenised text files.

    The dictionary is seeded with the PAD, UNK, BOS and EOS entries from
    ``neusum.Constants`` and is constructed with the module-level ``lower``
    setting (presumably a lowercasing flag -- confirm against
    ``neusum.Dict``). Every line of every file is stripped and split on
    single spaces, and each resulting token is passed to ``vocab.add``.

    Args:
        filenames: Iterable of paths to text files, read as UTF-8.
        size: Value forwarded to ``Dict.prune`` (presumably the maximum
            number of entries to keep -- confirm against ``neusum.Dict``).

    Returns:
        The dictionary returned by ``vocab.prune(size)``.
    """
    vocab = neusum.Dict(
        [
            neusum.Constants.PAD_WORD,
            neusum.Constants.UNK_WORD,
            neusum.Constants.BOS_WORD,
            neusum.Constants.EOS_WORD,
        ],
        lower=lower,
    )
    for filename in filenames:
        with open(filename, encoding='utf-8') as f:
            # Iterate the file object directly so only one line is held in
            # memory at a time; readlines() would load the whole corpus file.
            for line in f:
                # NOTE(review): split(' ') (not split()) is kept on purpose so
                # the tokens added here are unchanged; blank lines and
                # repeated spaces therefore still contribute an empty-string
                # token.
                for word in line.strip().split(' '):
                    vocab.add(word)

    # Remember the unpruned size so the log line can report both numbers.
    original_size = vocab.size()
    vocab = vocab.prune(size)
    logger.info('Created dictionary of size %d (pruned from %d)' %
                (vocab.size(), original_size))

    return vocab
def initVocabulary(name, dataFiles, vocabFile, vocabSize):
    """Return the ``name`` vocabulary, loading it from disk or building it.

    Args:
        name: Label used only in the log messages.
        dataFiles: Files handed to ``makeVocabulary`` when no saved
            vocabulary is supplied.
        vocabFile: Path of an existing vocabulary file, or ``None`` to
            generate a new vocabulary from ``dataFiles``.
        vocabSize: Size forwarded to ``makeVocabulary``.

    Returns:
        The loaded or freshly generated dictionary.
    """
    # No saved dictionary was supplied: generate one from the data files.
    if vocabFile is None:
        logger.info('Building ' + name + ' vocabulary...')
        return makeVocabulary(dataFiles, vocabSize)

    # A dictionary file was given: load it instead of rebuilding.
    logger.info('Reading ' + name + ' vocabulary from \'' + vocabFile +
                '\'...')
    loaded = neusum.Dict(lower=lower)
    loaded.loadFile(vocabFile)
    logger.info('Loaded ' + str(loaded.size()) + ' ' + name + ' words')
    return loaded
def makeVocabulary(filenames, size):
    """Collect tokens from the given files into a dictionary and prune it.

    A ``neusum.Dict`` is created with the four special entries (PAD, UNK,
    BOS, EOS) from ``neusum.Constants`` and the module-level ``lower``
    setting. Each file is opened as UTF-8; every line is stripped, split on
    a single space, and each piece is added to the dictionary. The
    dictionary is then pruned with ``size`` and both sizes are logged.

    Args:
        filenames: Iterable of text file paths to scan.
        size: Argument passed to ``Dict.prune``.

    Returns:
        The result of ``vocab.prune(size)``.
    """
    special_tokens = [
        neusum.Constants.PAD_WORD,
        neusum.Constants.UNK_WORD,
        neusum.Constants.BOS_WORD,
        neusum.Constants.EOS_WORD,
    ]
    vocab = neusum.Dict(special_tokens, lower=lower)

    for filename in filenames:
        with open(filename, encoding='utf-8') as f:
            # Stream the file line by line rather than materialising every
            # line with readlines(); the tokens produced are the same.
            for sent in f:
                # Splitting on ' ' (not arbitrary whitespace) is preserved so
                # the set of tokens added stays exactly as before.
                for word in sent.strip().split(' '):
                    vocab.add(word)

    # Capture the size before pruning for the log message below.
    size_before_prune = vocab.size()
    vocab = vocab.prune(size)
    logger.info('Created dictionary of size %d (pruned from %d)' %
                (vocab.size(), size_before_prune))

    return vocab
Beispiel #4
0
def initVocabulary(name, dataFiles, vocabFile, vocabSize):
    """Obtain the vocabulary called ``name``.

    When ``vocabFile`` is not ``None`` the dictionary is read from that file;
    otherwise it is generated from ``dataFiles`` via ``makeVocabulary``.
    Progress is reported through ``logger.info`` in both cases.

    Args:
        name: Human-readable label inserted into the log messages.
        dataFiles: Input files used when the vocabulary must be generated.
        vocabFile: Existing vocabulary file to load, or ``None``.
        vocabSize: Size passed on to ``makeVocabulary``.

    Returns:
        The dictionary that was loaded or generated.
    """
    if vocabFile is not None:
        # An existing word dictionary was provided: read it from disk.
        reading_msg = ('Reading ' + name + ' vocabulary from \'' + vocabFile +
                       '\'...')
        logger.info(reading_msg)
        existing = neusum.Dict(lower=lower)
        existing.loadFile(vocabFile)
        word_count = str(existing.size())
        logger.info('Loaded ' + word_count + ' ' + name + ' words')
        return existing

    # Nothing to load, so build the dictionary from the data files.
    logger.info('Building ' + name + ' vocabulary...')
    generated = makeVocabulary(dataFiles, vocabSize)
    return generated