class CorpusIteratorFuncHead_V():
    """Wrapper around CorpusIterator_V that applies reverse_content_head to every sentence."""

    def __init__(self, language, partition="train", storeMorph=False,
                 splitLemmas=False, shuffleDataSeed=None):
        self.basis = CorpusIterator_V(language, partition=partition,
                                      storeMorph=storeMorph,
                                      splitLemmas=splitLemmas,
                                      shuffleDataSeed=shuffleDataSeed)

    def permute(self):
        self.basis.permute()

    def length(self):
        return self.basis.length()

    def iterator(self, rejectShortSentences=False):
        iterator = self.basis.iterator(rejectShortSentences=rejectShortSentences)
        for sentence in iterator:
            reverse_content_head(sentence)
            yield sentence

    def getSentence(self, index):
        return reverse_content_head(self.basis.getSentence(index))
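# Hypothetical usage sketch ("English" is illustrative; CorpusIterator_V and
# reverse_content_head are defined elsewhere in this project):
exampleCorpus = CorpusIteratorFuncHead_V("English", partition="train")
print(exampleCorpus.length())
for sentence in exampleCorpus.iterator():
    pass  # each sentence is a list of token dicts, with function heads reversed in place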
def __init__(self, language, partition="train", storeMorph=False, splitLemmas=False, shuffleDataSeed=None): self.basis = CorpusIterator_V(language, partition=partition, storeMorph=storeMorph, splitLemmas=splitLemmas, shuffleDataSeed=shuffleDataSeed)
def __init__(self, language, partition="train", fraction=1.0, storeMorph=False, splitLemmas=False): self.basis = CorpusIterator_V(language, partition=partition, storeMorph=storeMorph, splitLemmas=splitLemmas, shuffleDataSeed=4) self.basis.data = self.basis.data[:int(fraction * len(self.basis.data))] self.permute() self.fraction = fraction
def initializeOrderTable():
    # countsByPartition, vocab_lemmas, morphKeyValuePairs, posFine, posUni, and
    # originalDistanceWeights are module-level globals defined elsewhere.
    orderTable = {}
    keys = set()
    vocab = {}
    distanceSum = {}
    distanceCounts = {}
    depsVocab = set()
    for partition in ["train", "dev"]:
        countsByPartition[partition] = {}
        sentCount = 0
        for sentence in CorpusIterator_V(args.language, partition,
                                         storeMorph=True,
                                         shuffleDataSeed=4).iterator():
            sentCount += 1
            # Only the first 100 dev sentences are counted.
            if sentCount > 100 and partition == "dev":
                break
            for line in sentence:
                vocab[line["word"]] = vocab.get(line["word"], 0) + 1
                vocab_lemmas[line["lemma"]] = vocab_lemmas.get(line["lemma"], 0) + 1
                countsByPartition[partition][line["word"]] = \
                    countsByPartition[partition].get(line["word"], 0) + 1
                depsVocab.add(line["dep"])
                posFine.add(line["posFine"])
                posUni.add(line["posUni"])
                for morph in line["morph"]:
                    morphKeyValuePairs.add(morph)
                if line["dep"] == "root":
                    continue
                posHere = line["posUni"]
                posHead = sentence[line["head"] - 1]["posUni"]
                dep = line["dep"]
                direction = "HD" if line["head"] < line["index"] else "DH"
                key = (posHead, dep, posHere)
                keyWithDir = (posHead, dep, posHere, direction)
                orderTable[keyWithDir] = orderTable.get(keyWithDir, 0) + 1
                keys.add(key)
                distanceCounts[key] = distanceCounts.get(key, 0.0) + 1.0
                distanceSum[key] = distanceSum.get(key, 0.0) + abs(line["index"] - line["head"])
    # print(orderTable)
    # Add-one smoothed log-odds of dependent-first (DH) vs. head-first (HD) order per key.
    dhLogits = {}
    for key in keys:
        hd = orderTable.get((key[0], key[1], key[2], "HD"), 0) + 1.0
        dh = orderTable.get((key[0], key[1], key[2], "DH"), 0) + 1.0
        dhLogits[key] = log(dh) - log(hd)
        originalDistanceWeights[key] = distanceSum[key] / distanceCounts[key]
    return dhLogits, vocab, keys, depsVocab
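# Illustrative worked example (not in the original): the add-one smoothing above
# turns raw direction counts into a log-odds score. For a hypothetical key seen
# 30 times head-first (HD) and 10 times dependent-first (DH):
#     dh = 10 + 1.0, hd = 30 + 1.0
#     dhLogit = log(11.0) - log(31.0) ≈ -1.04
# i.e. negative values indicate a head-first preference for that key.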
class CorpusIteratorFuncHeadFraction_V():
    """Variant of CorpusIteratorFuncHead_V restricted to an initial fraction of the (seed-shuffled) data."""

    def __init__(self, language, partition="train", fraction=1.0,
                 storeMorph=False, splitLemmas=False):
        self.basis = CorpusIterator_V(language, partition=partition,
                                      storeMorph=storeMorph,
                                      splitLemmas=splitLemmas,
                                      shuffleDataSeed=4)
        self.basis.data = self.basis.data[:int(fraction * len(self.basis.data))]
        self.permute()
        self.fraction = fraction

    def permute(self):
        self.basis.permute()

    def length(self):
        return self.basis.length()

    def iterator(self, rejectShortSentences=False):
        iterator = self.basis.iterator(rejectShortSentences=rejectShortSentences)
        counter = 0
        print("Actual length", self.length())
        for sentence in iterator:
            # if counter > self.fraction * self.length():
            #     break
            # counter += 1
            reverse_content_head(sentence)
            yield sentence

    def getSentence(self, index):
        return reverse_content_head(self.basis.getSentence(index))
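# Hedged sketch: a deterministic 10% slice of the training data ("Japanese" and
# the fraction are illustrative). Because shuffleDataSeed is fixed to 4 above,
# the same slice is selected on every run; length() presumably reflects the
# truncated data, since self.basis.data is cut before iteration.
tenPercent = CorpusIteratorFuncHeadFraction_V("Japanese", partition="train", fraction=0.1)
print(tenPercent.length())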
def computeDevLoss():
    devBatchSize = 512
    global printHere
    # global counter
    # global devSurprisalTable
    global horizon
    devLoss = 0.0
    devWords = 0
    # corpusDev = getNextSentence("dev")
    corpusDev = CorpusIterator_V(language, "dev", storeMorph=True).iterator(rejectShortSentences=False)
    stream = createStreamContinuous(corpusDev)
    surprisalTable = [0 for _ in range(horizon)]
    devCounter = 0
    devCounterTimesBatchSize = 0
    while True:
        try:
            # Assemble a dev batch; a partial batch is kept when the stream runs out.
            input_indices_list = []
            wordStartIndices_list = []
            for _ in range(devBatchSize):
                input_indices, wordStartIndices = next(stream)
                input_indices_list.append(input_indices)
                wordStartIndices_list.append(wordStartIndices)
        except StopIteration:
            devBatchSize = len(input_indices_list)
        if devBatchSize == 0:
            break
        devCounter += 1
        # counter += 1
        printHere = (devCounter % 100 == 0)
        _, _, _, newLoss, newWords = doForwardPass(input_indices_list,
                                                   wordStartIndices_list,
                                                   surprisalTable=surprisalTable,
                                                   doDropout=False,
                                                   batchSizeHere=devBatchSize)
        devLoss += newLoss
        devWords += newWords
        if printHere:
            print("Dev examples " + str(devCounter))
        devCounterTimesBatchSize += devBatchSize
    devSurprisalTableHere = [surp / devCounterTimesBatchSize for surp in surprisalTable]
    return devLoss / devWords, devSurprisalTableHere
def runOnCorpus():
    global chart
    chart = [[torch.cuda.FloatTensor([[float("-Inf") for _ in itos_setOfNonterminals]
                                      for _ in range(args.BATCHSIZE)])
              for _ in range(args.MAX_BOUNDARY)]
             for _ in range(args.MAX_BOUNDARY)]
    iterator = iterator_dense(CorpusIterator_V(args.language, "dev", shuffleDataSeed=4).iterator())
    chunk = []
    surprisals = [0 for _ in range(args.MAX_BOUNDARY)]
    while True:
        linearized = []
        try:
            for _ in range(args.BATCHSIZE):
                linearized.append(next(iterator))
        except StopIteration:
            if len(linearized) == 0:
                break
            # Shrink the batch size (and reallocate the chart) for the final partial batch.
            args.BATCHSIZE = len(linearized)
            chart = [[torch.cuda.FloatTensor([[float("-Inf") for _ in itos_setOfNonterminals]
                                              for _ in range(args.BATCHSIZE)])
                      for _ in range(args.MAX_BOUNDARY)]
                     for _ in range(args.MAX_BOUNDARY)]
        print(sentCount, [surprisals[i + 1] - surprisals[i]
                          for i in range(args.MAX_BOUNDARY - 1)])
        # [surprisalTableSums[0]/surprisalTableCounts[-1]] + [(surprisalTableSums[i+1]-surprisalTableSums[i])/surprisalTableCounts[-1] for i in range(MAX_BOUNDARY-1)])
        computeSurprisals(linearized)
        surprisals = [surprisalTableSums[i] / (surprisalTableCounts[i] + 1e-9)
                      for i in range(args.MAX_BOUNDARY)]
        print(sentCount, [surprisals[i + 1] - surprisals[i]
                          for i in range(args.MAX_BOUNDARY - 1)])
    return surprisals
def initializeOrderTable():
    orderTable = {}
    keys = set()
    vocab = {}
    distanceSum = {}
    distanceCounts = {}
    depsVocab = set()
    for partition in ["train", "dev"]:
        for sentence in CorpusIterator_V(language, partition, shuffleDataSeed=40).iterator():
            for line in sentence:
                vocab[line["word"]] = vocab.get(line["word"], 0) + 1
                line["coarse_dep"] = makeCoarse(line["dep"])
                depsVocab.add(line["coarse_dep"])
                posFine.add(line["posFine"])
                posUni.add(line["posUni"])
                if line["coarse_dep"] == "root":
                    continue
                posHere = line["posUni"]
                posHead = sentence[line["head"] - 1]["posUni"]
                dep = line["coarse_dep"]
                direction = "HD" if line["head"] < line["index"] else "DH"
                key = dep
                keyWithDir = (dep, direction)
                orderTable[keyWithDir] = orderTable.get(keyWithDir, 0) + 1
                keys.add(key)
                distanceCounts[key] = distanceCounts.get(key, 0.0) + 1.0
                distanceSum[key] = distanceSum.get(key, 0.0) + abs(line["index"] - line["head"])
    # print(orderTable)
    # Add-one smoothed log-odds of DH vs. HD order, keyed by coarse dependency type.
    dhLogits = {}
    for key in keys:
        hd = orderTable.get((key, "HD"), 0) + 1.0
        dh = orderTable.get((key, "DH"), 0) + 1.0
        dhLogits[key] = log(dh) - log(hd)
        originalDistanceWeights[key] = distanceSum[key] / distanceCounts[key]
    return dhLogits, vocab, keys, depsVocab
logsoftmax = torch.nn.LogSoftmax()

def deepCopy(sentence):
    result = []
    for w in sentence:
        entry = {}
        for key, value in w.items():
            entry[key] = value
        result.append(entry)
    return result

dhWeights_Prior = Normal(Variable(torch.FloatTensor([0.0] * len(itos_deps))),
                         Variable(torch.FloatTensor([1.0] * len(itos_deps))))
distanceWeights_Prior = Normal(Variable(torch.FloatTensor([0.0] * len(itos_deps))),
                               Variable(torch.FloatTensor([1.0] * len(itos_deps))))

counter = 0
corpus = CorpusIterator_V(args.language, "train")

def guide(corpus):
    # Variational posterior: independent Normals over the ordering parameters.
    mu_DH = pyro.param("mu_DH", Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True))
    mu_Dist = pyro.param("mu_Dist", Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True))
    sigma_DH = pyro.param("sigma_DH", Variable(torch.FloatTensor([1.0] * len(itos_deps)), requires_grad=True))
    sigma_Dist = pyro.param("sigma_Dist", Variable(torch.FloatTensor([1.0] * len(itos_deps)), requires_grad=True))
    dhWeights = pyro.sample("dhWeights", dist.Normal(mu_DH, sigma_DH))
    distanceWeights = pyro.sample("distanceWeights", dist.Normal(mu_Dist, sigma_Dist))

def model(corpus):
    global counter
    # Draw the ordering parameters from their priors; the guide approximates the posterior.
    dhWeights = pyro.sample("dhWeights", dhWeights_Prior)
    distanceWeights = pyro.sample("distanceWeights", distanceWeights_Prior)
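# Hedged sketch (not from the original file): with model() and guide() as above,
# the ordering weights could be fit by stochastic variational inference. This
# uses the modern Pyro API (pyro.infer.SVI with Trace_ELBO); the Variable-based
# code above targets an older Pyro/PyTorch, where the loss is specified differently.
import pyro
from pyro.infer import SVI, Trace_ELBO
from pyro.optim import Adam

svi = SVI(model, guide, Adam({"lr": 0.01}), loss=Trace_ELBO())
for step in range(1000):
    elbo_loss = svi.step(corpus)  # one gradient step on the ELBO; both functions receive `corpus`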
    assert totalDepLength == 0
    numberOfWords = wordNum
    return (totalDepLength, numberOfWords, byType)

assert batchSize == 1
depLengths = []
# while True:
outpath = "/u/scr/mhahn/japanese/" + str(myID)
with open(outpath, "w") as outFile:
    print("\t".join(["Sent", "Length"]), file=outFile)
    counter = 0
    corpus = CorpusIterator_V(language, "train", shuffleDataSeed=40)
    corpusIterator = corpus.iterator()
    if corpus.length() == 0:
        quit()
    while True:
        try:
            # Draw 10*batchSize sentences, then process them in 10 length-sorted partitions.
            batch = [next(corpusIterator) for _ in range(10 * batchSize)]
        except StopIteration:
            break
        batch = sorted(batch, key=len)
        partitions = range(10)
        for partition in partitions:
            counter += 1
            printHere = (counter % 100 == 0)
            current = batch[partition * batchSize:(partition + 1) * batchSize]
assert tree["category"] in leftCornerCounts return leftCorner def linearizeTree2String(tree, sent): if tree["children"] is None: sent.append(tree["word"]) else: for x in tree["children"]: linearizeTree2String(x, sent) sentCount = 0 print("Collecting counts from training corpus") for sentence in CorpusIterator_V(args.language,"train", shuffleDataSeed=4).iterator(): sentCount += 1 ordered = orderSentence(sentence, sentCount % 400 == 0) linearized = [] linearizeTree2String(ordered, linearized) # if len(linearized) > 10: # continue # print(ordered) roots[ordered["category"]] = roots.get(ordered["category"], 0) + 1 rootsTotal = rootsTotal + 1 if sentCount % 100 == 0: print(sentCount, ordered["category"])
assert tree["category"] in leftCornerCounts return leftCorner def linearizeTree2String(tree, sent): if tree["children"] is None: sent.append(tree["word"]) else: for x in tree["children"]: linearizeTree2String(x, sent) sentCount = 0 print("Collecting counts from training corpus") for sentence in CorpusIterator_V(args.language,"train", ignoreCorporaWithoutWords=True).iterator(): sentCount += 1 ordered = orderSentence(sentence, sentCount % 400 == 0) linearized = [] linearizeTree2String(ordered, linearized) # if len(linearized) > 10: # continue # print(ordered) roots[ordered["category"]] = roots.get(ordered["category"], 0) + 1 rootsTotal = rootsTotal + 1 if sentCount % 100 == 0: print(sentCount, ordered["category"])
logsoftmax = torch.nn.LogSoftmax()

def deepCopy(sentence):
    result = []
    for w in sentence:
        entry = {}
        for key, value in w.items():
            entry[key] = value
        result.append(entry)
    return result

# dhWeights_Prior = Normal(, Variable(torch.FloatTensor([1.0] * len(itos_deps))))
# distanceWeights_Prior = Normal(Variable(torch.FloatTensor([0.0] * len(itos_deps))), Variable(torch.FloatTensor([1.0] * len(itos_deps))))

counter = 0
corpus = CorpusIterator_V(args.language, "train")

# def guide(corpus):
#     mu_DH = pyro.param("mu_DH", Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True))
#     mu_Dist = pyro.param("mu_Dist", Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True))
#
#     sigma_DH = pyro.param("sigma_DH", Variable(torch.FloatTensor([1.0] * len(itos_deps)), requires_grad=True))
#     sigma_Dist = pyro.param("sigma_Dist", Variable(torch.FloatTensor([1.0] * len(itos_deps)), requires_grad=True))
#
#     dhWeights = pyro.sample("dhWeights", dist.Normal(mu_DH, sigma_DH))
#     distanceWeights = pyro.sample("distanceWeights", dist.Normal(mu_Dist, sigma_Dist))

# dhWeights = pyro.sample("dhWeights", dhWeights_Prior)
# distanceWeights = pyro.sample("distanceWeights", distanceWeights_Prior)

dhWeights = Variable(torch.FloatTensor([0.0] * len(itos_deps) * len(docs)).view(len(docs), len(itos_deps)))
header = ["index", "word", "lemma", "posUni", "posFine", "morph", "head", "dep", "_", "_"]

from corpusIterator_V import CorpusIterator_V

originalDistanceWeights = {}
morphKeyValuePairs = set()
vocab_lemmas = {}

corpusTrain = CorpusIterator_V(args.language, "train", storeMorph=True).iterator(rejectShortSentences=False)
pairs = set()
counter = 0
data = []
for sentence in corpusTrain:
    # print(len(sentence))
    verb = []
    # Walk the sentence from right to left, skipping punctuation and collecting
    # tokens until a verb is reached (the excerpt continues beyond this point).
    for line in sentence[::-1]:
        # print(line)
        if line["posUni"] == "PUNCT":
            continue
        verb.append(line)
        if line["posUni"] == "VERB":
            verb = verb[::-1]
            # print(verb)
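# Side note (illustrative line, not from the original): `header` mirrors the ten
# CoNLL-U columns, so a raw token line maps onto the token dicts used above:
raw = "1\tdogs\tdog\tNOUN\tNNS\tNumber=Plur\t2\tnsubj\t_\t_"
token = dict(zip(header, raw.split("\t")))
token["index"], token["head"] = int(token["index"]), int(token["head"])
print(token["word"], token["dep"])  # dogs nsubj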
assert tree["category"] in leftCornerCounts return leftCorner def linearizeTree2String(tree, sent): if tree["children"] is None: sent.append(tree["word"]) else: for x in tree["children"]: linearizeTree2String(x, sent) sentCount = 0 print("Collecting counts from training corpus") for sentence in CorpusIterator_V(language, "train").iterator(): sentCount += 1 ordered = orderSentence(sentence, sentCount % 50 == 0) linearized = [] linearizeTree2String(ordered, linearized) # if len(linearized) > 10: # continue # print(ordered) roots[ordered["category"]] = roots.get(ordered["category"], 0) + 1 roots["__TOTAL__"] = roots.get("__TOTAL__", 0) + 1 if sentCount % 100 == 0: print(sentCount, ordered["category"]) # update inStackDistribution
DEV_PERIOD = 5000
epochCount = 0
corpusBase = CorpusIterator_V(language, storeMorph=True)
while failedDevRuns == 0:
    epochCount += 1
    print("Starting new epoch, permuting corpus")
    corpusBase.permute()
    # corpus = getNextSentence("train")
    corpus = corpusBase.iterator(rejectShortSentences=False)
    stream = createStream(corpus)
    if counter > 5:  # if counter % DEV_PERIOD == 0:
        newDevLoss, devSurprisalTableHere = computeDevLoss()
        devLosses.append(newDevLoss)
        print("New dev loss " + str(newDevLoss) + ". previous was: " + str(lastDevLoss))