class CorpusIteratorFuncHead():
    """Wrapper around ``CorpusIterator`` that runs ``reverse_content_head``
    on every sentence before handing it to the caller.

    All corpus handling (loading, shuffling, length) is delegated to the
    wrapped ``CorpusIterator`` stored in ``self.basis``.
    """

    def __init__(self, language, partition="train", storeMorph=False, splitLemmas=False):
        """Build the underlying ``CorpusIterator``; every argument is forwarded unchanged."""
        self.basis = CorpusIterator(
            language,
            partition=partition,
            storeMorph=storeMorph,
            splitLemmas=splitLemmas,
        )

    def permute(self):
        """Delegate to ``permute()`` of the wrapped corpus."""
        self.basis.permute()

    def length(self):
        """Return whatever ``length()`` of the wrapped corpus reports."""
        return self.basis.length()

    def iterator(self, rejectShortSentences=False):
        """Yield the sentences of the wrapped corpus after ``reverse_content_head``.

        ``rejectShortSentences`` is passed through to the wrapped iterator.
        """
        for sentence in self.basis.iterator(rejectShortSentences=rejectShortSentences):
            # The return value is discarded and the original object is
            # yielded -- presumably reverse_content_head edits the sentence
            # in place; confirm against its definition.
            reverse_content_head(sentence)
            yield sentence

    def getSentence(self, index):
        """Return the result of ``reverse_content_head`` for the sentence at ``index``."""
        sentence = self.basis.getSentence(index)
        return reverse_content_head(sentence)
class CorpusIteratorFuncHeadFraction():
    """Like a ``CorpusIterator`` restricted to a leading fraction of its data,
    with ``reverse_content_head`` applied to every sentence handed out.

    The wrapped ``CorpusIterator`` lives in ``self.basis``; its ``data``
    attribute is truncated once, at construction time.
    """

    def __init__(self, language, partition="train", fraction=1.0, storeMorph=False, splitLemmas=False):
        """Create the wrapped corpus and keep only the first ``fraction`` of it.

        Args:
            language, partition, storeMorph, splitLemmas: forwarded unchanged
                to ``CorpusIterator``.
            fraction: share of ``basis.data`` to keep; the cut-off index is
                ``int(fraction * len(data))``, i.e. truncated towards zero.
        """
        # shuffleDataSeed=4 is hard-coded -- presumably so the retained prefix
        # is the same subset on every run; confirm against CorpusIterator.
        self.basis = CorpusIterator(
            language,
            partition=partition,
            storeMorph=storeMorph,
            splitLemmas=splitLemmas,
            shuffleDataSeed=4,
        )
        data = self.basis.data
        self.basis.data = data[:int(fraction * len(data))]
        # Permute only after truncating, so the permutation acts on the subset.
        self.permute()
        self.fraction = fraction

    def permute(self):
        """Delegate to ``permute()`` of the wrapped corpus."""
        self.basis.permute()

    def length(self):
        """Return whatever ``length()`` of the wrapped corpus reports."""
        return self.basis.length()

    def iterator(self, rejectShortSentences=False):
        """Yield the retained sentences after ``reverse_content_head``.

        Prints the current corpus length when iteration starts.
        ``rejectShortSentences`` is passed through to the wrapped iterator.
        """
        sentences = self.basis.iterator(rejectShortSentences=rejectShortSentences)
        print("Actual length", self.length())
        for sentence in sentences:
            # The return value is discarded and the original object is
            # yielded -- presumably reverse_content_head edits the sentence
            # in place; confirm against its definition.
            reverse_content_head(sentence)
            yield sentence

    def getSentence(self, index):
        """Return the result of ``reverse_content_head`` for the sentence at ``index``."""
        return reverse_content_head(self.basis.getSentence(index))
yield input_indices, wordStartIndices+[len(input_indices)], relevant_logprob_sum input_indices = [2] # Start of Segment (makes sure that first word can be predicted from this token) wordStartIndices = [] DEV_PERIOD = 5000 epochCount = 0 corpusBase = CorpusIterator(args.language, storeMorph=True) while failedDevRuns < args.stopAfterFailures: epochCount += 1 print >> sys.stderr, "Epoch "+str(epochCount) print "Starting new epoch, permuting corpus" corpusBase.permute() # corpus = getNextSentence("train") corpus = corpusBase.iterator(rejectShortSentences = False) stream = createStream(corpus) if counter > 5: # if counter % DEV_PERIOD == 0: newDevLoss, _ = computeDevLoss() # devLosses.append( devLosses.append(newDevLoss) # newDevLoss = devLosses[-1]-1 # print("DON'T STOP don't stop")
printHere = (counter % 50 == 0) current = batch[partition * batchSize:(partition + 1) * batchSize] _, _, _, newLoss, newWords, lossWords, lossPOS = doForwardPass( current, train=False) devLoss += newLoss devWords += newWords devLossWords += lossWords devLossPOS += lossPOS return devLoss / devWords, devLossWords / devWords, devLossPOS / devWords while True: # corpus = getNextSentence("train") corpus = CorpusIterator(language) corpus.permute() corpus = corpus.iterator(rejectShortSentences=True) while True: try: batch = map(lambda x: next(corpus), 10 * range(batchSize)) except StopIteration: break batch = sorted(batch, key=len) partitions = range(10) shuffle(partitions) for partition in partitions: counter += 1 printHere = (counter % 100 == 0) current = batch[partition * batchSize:(partition + 1) * batchSize]