def saveVocab(dataset, size, directory):
    """Build a token-frequency vocabulary from *dataset* and write it to disk.

    Consumes strings from ``dataset.next()`` until an empty string is
    returned or the vocabulary (plus the reserved-token offset) reaches
    ``size`` entries, then writes the tokens to the output file in
    descending frequency order, one token per line.

    Parameters:
        dataset: object with a ``next()`` method yielding token strings;
            an empty string signals end of data.
        size: target vocabulary size, counted including the
            ``Vocab.getVocabOffset()`` reserved slots.
        directory: an existing directory (output goes to
            ``<directory>/vocab.txt``) or a full output-file path.
    """
    import os
    import time

    vocab = createInitialVocab()

    if os.path.isdir(directory):
        outputPath = os.path.join(directory, "vocab.txt")
        # NOTE(review): the original also guarded an os.makedirs(directory)
        # call here behind `not os.path.exists(directory)`, but that branch
        # was unreachable — os.path.isdir() already implies the path exists —
        # so the dead code has been removed.
    else:
        # Treat `directory` as a full output-file path.
        outputPath = directory

    previousVocabSize = 0
    start = time.time()
    totalTokens = 0
    while True:
        string = dataset.next()
        if not string:
            # Empty string marks the end of the dataset.
            break
        if string not in vocab:
            vocab[string] = 0
        totalTokens += 1
        vocab[string] += 1
        # Log progress roughly every time the vocab grows by 1% of the target.
        if len(vocab) + Vocab.getVocabOffset() >= previousVocabSize + size * 0.01:
            previousVocabSize = len(vocab) + Vocab.getVocabOffset()
            # Lazy %-style args avoid building the message unless DEBUG is on.
            logger.debug("Vocab size is %s time so far: %s total tokens: %s",
                         previousVocabSize, time.time() - start, totalTokens)
        if len(vocab) + Vocab.getVocabOffset() >= size:
            break

    with open(outputPath, "w", encoding='utf-8') as outputFile:
        # reversed(sorted(...)) is kept (rather than reverse=True) to preserve
        # the original's tie ordering exactly; counts are intentionally
        # discarded — only the tokens are written.
        for token, count in reversed(sorted(vocab.items(), key=lambda x: x[1])):
            if token[-1] != '\n':
                token += '\n'
            outputFile.write(token)
def maskOffTokens(self, labels):
    """Apply BERT-style random masking to a sequence of token labels.

    Every position after the first is selected with probability 0.15.
    Of the selected positions, 80% are replaced by the mask token and
    half of the remainder by a uniformly random vocabulary token; the
    rest are left as-is. Position 0 always becomes the class-label
    token. Returns a new list; *labels* is not modified.
    """
    masked = list(labels)
    for position in range(1, len(labels)):
        # Draw order matches the masking scheme: select, then choose action.
        if not self.random.binomial(1, 0.15):
            continue
        if self.random.binomial(1, 0.8):
            masked[position] = Vocab.getMaskToken()
        elif self.random.binomial(1, 0.5):
            # Random replacement drawn from the non-reserved vocab range.
            masked[position] = self.random.randint(
                Vocab.getVocabOffset(), self.vocab.getSize())
    masked[0] = Vocab.getClassLabelToken()
    return masked
def isPredictedToken(self, token):
    """Return True when *token* is one the model should predict.

    NOTE(review): the second membership test compares equality with
    Vocab.getVocabOffset(); this looks like it may have been intended
    as a range check (e.g. ``token >= Vocab.getVocabOffset()``) —
    confirm against the Vocab class before relying on it.
    """
    return token in (Vocab.getMaskToken(), Vocab.getVocabOffset())