import numpy as np

# Tokenizer, RNN, and load are assumed to be importable from the project's
# companion modules (tokenizer, model, and checkpoint-loading code).


class Generator:
    def __init__(self, fileName):
        # Rebuild the vocabulary mappings and load a trained model from disk.
        t = Tokenizer()
        self.wordToInd = t.getWordToInd()
        self.indexToWord = t.getIndToWord()
        self.model = RNN(t.getVocabSize())
        load(fileName, self.model)

    def postParse(self, sentence):
        # Drop stray bracket tokens, capitalise the first word, and re-attach
        # punctuation without a leading space.
        sentence = [v for v in sentence if v not in ("[", "]")]
        out = sentence[0][0].upper() + sentence[0][1:]
        for word in sentence[1:]:
            if word in (",", ".", ":", ";", "?", "!"):
                out += word
            elif word == "i":
                out += " I"
            else:
                out += " " + word
        return out

    def generateSentence(self):
        # Start from SENTENCE_START and sample one word at a time from the
        # model's output distribution until SENTENCE_END is produced.
        newSent = [self.wordToInd["SENTENCE_START"]]
        while not newSent[-1] == self.wordToInd["SENTENCE_END"]:
            nextWordProbs = self.model.forwardPropagation(newSent)[0]
            sampled = self.wordToInd["UNKNOWN_TOKEN"]
            # Resample until something other than the unknown token is drawn.
            while sampled == self.wordToInd["UNKNOWN_TOKEN"]:
                samples = np.random.multinomial(1, nextWordProbs[-1])
                sampled = np.argmax(samples)
            newSent.append(sampled)
        sentence = [self.indexToWord[x] for x in newSent[1:-1]]
        return self.postParse(sentence)

    def curateSentence(self):
        # Regenerate until the sentence is a usable length and bracket-free.
        sentence = self.generateSentence()
        while (not (80 < len(sentence) < 270)
               or "[" in sentence or "]" in sentence):
            sentence = self.generateSentence()
        if len(sentence) > 140:
            # Split at the first space after position 130 so that each half
            # fits within a 140-character limit; retry if either half is too long.
            ind = 0
            for i in range(130, len(sentence)):
                if sentence[i] == " ":
                    ind = i
                    break
            split = [sentence[:ind + 1], sentence[ind + 1:]]
            if len(split[0]) > 140 or len(split[1]) > 140:
                return self.curateSentence()
            else:
                return 2, [split[1], split[0]]
        else:
            return 1, [sentence]
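# A minimal usage sketch, not part of the original module: generate one curated
# post from a saved checkpoint. The default "trainedModel" filename is
# hypothetical and depends on how the checkpoint was written by the project's
# save/load helpers.
def demoGenerate(fileName="trainedModel"):
    gen = Generator(fileName)
    count, parts = gen.curateSentence()
    # For a two-part result curateSentence lists the second half first, so
    # reverse it to print the text in reading order.
    if count == 2:
        parts = list(reversed(parts))
    for part in parts:
        print(part)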
def testSystem():
    # Sanity-check an untrained model: forward propagate one training example,
    # inspect output shapes, and compare the loss on 100 examples against the
    # expected loss of a uniform random predictor, ln(vocab size).
    t = Tokenizer()
    xTrain, yTrain = t.getData()
    np.random.seed(10)
    model = RNN(15000)
    o, s = model.forwardPropagation(xTrain[30])
    predictions = model.predict(xTrain[30])
    print(o.shape)
    print(o)
    print(predictions.shape)
    print(predictions)
    print("Expected Loss: \n" + str(np.log(model.vocab)))
    print("Actual Loss:")
    print(model.calculateLoss(xTrain[:100], yTrain[:100]))
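# Hypothetical entry point, not in the original source: run the forward-pass
# sanity check when this file is executed directly.
if __name__ == "__main__":
    testSystem()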