Exemple #1
0
def saveVocab(dataset, size, directory):
    """Count token frequencies from ``dataset`` and write the vocabulary file.

    Tokens are pulled with ``dataset.next()`` until it returns an empty
    value, or until the number of distinct tokens plus
    ``Vocab.getVocabOffset()`` reaches ``size``. The tokens are then written
    one per line, ordered from the highest count to the lowest.

    Args:
        dataset: Object whose ``next()`` method returns the next token and an
            empty value (length 0) once there is nothing left to read.
        size: Target vocabulary size; the comparison includes
            ``Vocab.getVocabOffset()``.
        directory: Either an existing directory (``vocab.txt`` is written
            inside it) or the path of the output file itself. In the latter
            case missing parent directories are created.
    """
    import os
    import time

    # NOTE(review): assumed to be a dict-like mapping of token -> count;
    # confirm against createInitialVocab().
    vocab = createInitialVocab()

    if os.path.isdir(directory):
        outputPath = os.path.join(directory, "vocab.txt")
    else:
        # Not an existing directory: treat the argument as the output file
        # path and make sure its parent exists so open() below cannot fail
        # on a missing folder. (The original makedirs call was unreachable.)
        outputPath = directory
        parentDirectory = os.path.dirname(outputPath)
        if parentDirectory:
            os.makedirs(parentDirectory, exist_ok=True)

    previousVocabSize = 0

    start = time.time()
    totalTokens = 0

    while True:
        string = dataset.next()
        if len(string) == 0:
            break
        if string not in vocab:
            vocab[string] = 0

        totalTokens += 1
        vocab[string] += 1

        currentVocabSize = len(vocab) + Vocab.getVocabOffset()

        # Report progress each time the vocabulary grew by 1% of the target.
        if currentVocabSize >= previousVocabSize + size * 0.01:
            previousVocabSize = currentVocabSize
            logger.debug("Vocab size is %s time so far: %s total tokens: %s",
                         previousVocabSize, time.time() - start, totalTokens)

        if currentVocabSize >= size:
            break

    with open(outputPath, "w", encoding='utf-8') as outputFile:
        # reversed(sorted(...)) is kept instead of sorted(..., reverse=True):
        # the two order equal counts differently, and the file's line order
        # is what consumers of the vocabulary see.
        for token, _ in reversed(sorted(vocab.items(),
                                        key=lambda x: x[1])):
            # endswith() is safe for an empty token, unlike token[-1].
            if not token.endswith('\n'):
                token += '\n'
            outputFile.write(token)
Exemple #2
0
    def maskOffTokens(self, labels):
        """Return a copy of ``labels`` with some positions replaced.

        Position 0 always becomes ``Vocab.getClassLabelToken()``. Every other
        position is selected when ``self.random.binomial(1, 0.15)`` is truthy;
        a selected position is then either replaced by
        ``Vocab.getMaskToken()``, replaced by a value drawn with
        ``self.random.randint``, or left unchanged, depending on two further
        ``binomial`` draws.

        NOTE(review): ``self.random`` is presumably a numpy ``RandomState``-like
        generator, which would make the split 80% mask / 10% random / 10%
        unchanged -- confirm.
        """
        masked = list(labels)

        for position in range(1, len(labels)):
            # Leave the position untouched unless it is selected.
            if not self.random.binomial(1, 0.15):
                continue

            if self.random.binomial(1, 0.8):
                masked[position] = Vocab.getMaskToken()
                continue

            # Remaining selected positions: either a random id or unchanged.
            if not self.random.binomial(1, 0.5):
                continue

            lowest = Vocab.getVocabOffset()
            highest = self.vocab.getSize()
            masked[position] = self.random.randint(lowest, highest)

        masked[0] = Vocab.getClassLabelToken()

        return masked
 def isPredictedToken(self, token):
     """Tell whether ``token`` equals the mask token or the vocab offset."""
     matchesMask = token == Vocab.getMaskToken()
     if matchesMask:
         return matchesMask
     # Only consulted when the mask comparison was falsy.
     return token == Vocab.getVocabOffset()