def computeDevLoss():
    devBatchSize = args.batchSize
    global printHere
    devLoss = 0.0
    devWords = 0
    corpusDev = CorpusIterator(args.language, "dev").iterator(rejectShortSentences=False)
    stream = createStream(corpusDev, training=False)
    surprisalTable = [0 for _ in range(2)]
    devCounter = 0
    devCounterTimesBatchSize = 0
    while True:
        try:
            input_indices_list = []
            wordStartIndices_list = []
            for _ in range(devBatchSize):
                input_indices, wordStartIndices, _ = next(stream)
                input_indices_list.append(input_indices)
                wordStartIndices_list.append(wordStartIndices)
        except StopIteration:
            # Stream exhausted: process whatever partial batch remains.
            devBatchSize = len(input_indices_list)
        if devBatchSize == 0:
            break
        devCounter += 1
        printHere = (devCounter % 100 == 0)
        _, _, _, newLoss, newWords = doForwardPass(input_indices_list, wordStartIndices_list, surprisalTable=surprisalTable, doDropout=False, batchSizeHere=devBatchSize, relevant_logprob_sum=None)
        devLoss += newLoss
        devWords += newWords
        if printHere:
            print "Dev examples " + str(devCounter)
        devCounterTimesBatchSize += devBatchSize
    return devLoss / devWords, None  # devSurprisalTableHere
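# Illustration only (not part of the script): computeDevLoss() returns the average
# per-word negative log-likelihood on the dev set. Assuming the underlying loss is a
# summed natural-log cross-entropy, it can be converted for reporting as follows;
# dev_loss_nats is a hypothetical value standing in for the function's return value.
import math
dev_loss_nats = 4.8
perplexity = math.exp(dev_loss_nats)
bits_per_word = dev_loss_nats / math.log(2.0)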
def initializeOrderTable():
    orderTable = {}
    keys = set()
    vocab = {}
    distanceSum = {}
    distanceCounts = {}
    depsVocab = set()
    depsVocab.add("root")
    for partition in ["together"]:
        for sentence in CorpusIterator(args.language, partition).iterator():
            sentenceHash = hash_(" ".join([x["word"] for x in sentence]))
            for line in sentence:
                vocab[line["word"]] = vocab.get(line["word"], 0) + 1
                posFine.add(line["posFine"])
                posUni.add(line["posUni"])
                if line["dep"] == "root":
                    continue
                posHere = line["posUni"]
                posHead = sentence[line["head"] - 1]["posUni"]
                # Make each nsubj/obj dependency unique to its sentence and position,
                # so that ordering statistics are recorded per occurrence.
                if line["dep"] == "nsubj":
                    line["dep"] = "nsubj_" + str(sentenceHash) + "_" + str(line["index"])
                if line["dep"] == "obj":
                    line["dep"] = "obj_" + str(sentenceHash) + "_" + str(line["index"])
                line["fine_dep"] = line["dep"]
                depsVocab.add(line["fine_dep"])
                dep = line["fine_dep"]
                direction = "HD" if line["head"] < line["index"] else "DH"
                key = dep
                keyWithDir = (dep, direction)
                orderTable[keyWithDir] = orderTable.get(keyWithDir, 0) + 1
                keys.add(key)
                distanceCounts[key] = distanceCounts.get(key, 0.0) + 1.0
                distanceSum[key] = distanceSum.get(key, 0.0) + abs(line["index"] - line["head"])
    #print orderTable
    dhLogits = {}
    for key in keys:
        # Add-one smoothed log-odds of dependent-first (DH) vs. head-first (HD) order
        hd = orderTable.get((key, "HD"), 0) + 1.0
        dh = orderTable.get((key, "DH"), 0) + 1.0
        dhLogit = log(dh) - log(hd)
        dhLogits[key] = dhLogit
    return dhLogits, vocab, keys, depsVocab
def initializeOrderTable():
    orderTable = {}
    keys = set()
    vocab = {}
    distanceSum = {}
    distanceCounts = {}
    depsVocab = set()
    for partition in ["train", "dev"]:
        for sentence in CorpusIterator(args.language, partition, storeMorph=True).iterator():
            for line in sentence:
                vocab[line["word"]] = vocab.get(line["word"], 0) + 1
                vocab_lemmas[line["lemma"]] = vocab_lemmas.get(line["lemma"], 0) + 1
                depsVocab.add(line["dep"])
                posFine.add(line["posFine"])
                posUni.add(line["posUni"])
                # Collect all attested morphological key=value pairs
                for morph in line["morph"]:
                    morphKeyValuePairs.add(morph)
                if line["dep"] == "root":
                    continue
                posHere = line["posUni"]
                posHead = sentence[line["head"] - 1]["posUni"]
                dep = line["dep"]
                direction = "HD" if line["head"] < line["index"] else "DH"
                key = (posHead, dep, posHere)
                keyWithDir = (posHead, dep, posHere, direction)
                orderTable[keyWithDir] = orderTable.get(keyWithDir, 0) + 1
                keys.add(key)
                distanceCounts[key] = distanceCounts.get(key, 0.0) + 1.0
                distanceSum[key] = distanceSum.get(key, 0.0) + abs(line["index"] - line["head"])
    dhLogits = {}
    for key in keys:
        hd = orderTable.get((key[0], key[1], key[2], "HD"), 0) + 1.0
        dh = orderTable.get((key[0], key[1], key[2], "DH"), 0) + 1.0
        dhLogit = log(dh) - log(hd)
        dhLogits[key] = dhLogit
        originalDistanceWeights[key] = (distanceSum[key] / distanceCounts[key])
    return dhLogits, vocab, keys, depsVocab
def initializeOrderTable():
    orderTable = {}
    keys = set()
    vocab = {}
    distanceSum = {}
    distanceCounts = {}
    depsVocab = set()
    for partition in ["together"]:
        for sentence, metadata in CorpusIterator(args.language, partition).iterator():
            docs[metadata["newdoc id"]] += 1
            for line in sentence:
                vocab[line["word"]] = vocab.get(line["word"], 0) + 1
                line["fine_dep"] = line["dep"]
                depsVocab.add(line["fine_dep"])
                posFine.add(line["posFine"])
                posUni.add(line["posUni"])
                if line["fine_dep"] == "root":
                    continue
                posHere = line["posUni"]
                posHead = sentence[line["head"] - 1]["posUni"]
                dep = line["fine_dep"]
                direction = "HD" if line["head"] < line["index"] else "DH"
                key = (posHead, dep, posHere)
                # Direction counts are indexed by the dependency label alone
                keyWithDir = (dep, direction)
                orderTable[keyWithDir] = orderTable.get(keyWithDir, 0) + 1
                keys.add(key)
                distanceCounts[key] = distanceCounts.get(key, 0.0) + 1.0
                distanceSum[key] = distanceSum.get(key, 0.0) + abs(line["index"] - line["head"])
    #print orderTable
    dhLogits = {}
    for key in keys:
        # Look up direction counts by the dependency label (key[1]), matching keyWithDir above
        hd = orderTable.get((key[1], "HD"), 0) + 1.0
        dh = orderTable.get((key[1], "DH"), 0) + 1.0
        dhLogit = log(dh) - log(hd)
        dhLogits[key] = dhLogit
    return dhLogits, vocab, keys, depsVocab
def initializeOrderTable():
    orderTable = {}
    keys = set()
    vocab = {}
    distanceSum = {}
    distanceCounts = {}
    depsVocab = set()
    for partition in ["train", "dev"]:
        for sentence in CorpusIterator(args.language, partition).iterator():
            for line in sentence:
                vocab[line["word"]] = vocab.get(line["word"], 0) + 1
                line["coarse_dep"] = makeCoarse(line["dep"])
                depsVocab.add(line["coarse_dep"])
                posFine.add(line["posFine"])
                posUni.add(line["posUni"])
                if line["coarse_dep"] == "root":
                    continue
                posHere = line["posUni"]
                posHead = sentence[line["head"] - 1]["posUni"]
                dep = line["coarse_dep"]
                direction = "HD" if line["head"] < line["index"] else "DH"
                key = dep
                keyWithDir = (dep, direction)
                orderTable[keyWithDir] = orderTable.get(keyWithDir, 0) + 1
                keys.add(key)
                distanceCounts[key] = distanceCounts.get(key, 0.0) + 1.0
                distanceSum[key] = distanceSum.get(key, 0.0) + abs(line["index"] - line["head"])
    #print orderTable
    dhLogits = {}
    for key in keys:
        hd = orderTable.get((key, "HD"), 0) + 1.0
        dh = orderTable.get((key, "DH"), 0) + 1.0
        dhLogit = log(dh) - log(hd)
        dhLogits[key] = dhLogit
        originalDistanceWeights[key] = (distanceSum[key] / distanceCounts[key])
    return dhLogits, vocab, keys, depsVocab
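# Illustration only (toy counts, not from any corpus): the dhLogits built above are
# add-one-smoothed log-odds of dependent-first ("DH") vs. head-first ("HD") order;
# a positive value means the dependent tended to precede its head in the data.
from math import log as _log
toy_dh, toy_hd = 30, 10  # hypothetical attestation counts for one dependency type
toy_logit = _log(toy_dh + 1.0) - _log(toy_hd + 1.0)  # ~1.04 > 0: dependent-before-head preferred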
def computeDevLoss():
    global printHere
    counterDev = 0
    corpusDev = CorpusIterator(language, "dev").iterator(rejectShortSentences=True)
    partitionsDev = getPartitions(corpusDev)
    devLossU = 0
    devLossL = 0
    devAccuracy = 0
    devAccuracyLabeled = 0
    devWords = 0
    for partitionDev in partitionsDev:
        counterDev += 1
        printHere = (counterDev % 500 == 0)
        lossU, lossL, _, accuracy, accuracyLabeled, wordNum = forward(partitionDev, computeAccuracy=True, doDropout=False)
        devLossU += lossU.data.cpu().numpy()
        devLossL += lossL.data.cpu().numpy()
        devAccuracy += accuracy
        devAccuracyLabeled += accuracyLabeled
        devWords += wordNum
        if counterDev % 50 == 0:
            print "Run on dev " + str(counterDev)
            print(devLossU / devWords, devLossL / devWords, float(devAccuracy) / devWords, float(devAccuracyLabeled) / devWords, devWords)
    newDevLossL = devLossL / devWords
    newDevLossU = devLossU / devWords
    newDevAccuracy = float(devAccuracy) / devWords
    newDevAccuracyLabeled = float(devAccuracyLabeled) / devWords
    devLossesL.append(newDevLossL)
    devLossesU.append(newDevLossU)
    devAccuracies.append(newDevAccuracy)
    devAccuraciesLabeled.append(newDevAccuracyLabeled)
batchSize = 1
lr_lm = 0.1
crossEntropy = 10.0

def encodeWord(w):
    return stoi[w] + 3 if stoi[w] < vocab_size else 1

import torch.nn.functional

counter = 0
while True:
    corpus = CorpusIterator(args.language, partition="together")
    corpus.permute()
    corpus = corpus.iterator(rejectShortSentences=False)
    # Each corpus item is a (sentence, metadata) pair
    for current in corpus:
        if counter > 50000000:
            print("Quitting at counter " + str(counter))
            quit()
        counter += 1
        printHere = (counter % 50 == 0)
        current = [current]
        batchOrdered, logits = orderSentence(current[0], dhLogits, printHere)
        metadata = current[0][1]
        maxLength = len(batchOrdered)
dhWeights = Variable(torch.FloatTensor([0.0] * len(itos_pure_deps)), requires_grad=True)
distanceWeights = Variable(torch.FloatTensor([0.0] * len(itos_pure_deps)), requires_grad=True)

for i, key in enumerate(itos_pure_deps):
    dhLogits[key] = 0.0
    if key == "obj":
        # Fix the direction of the "obj" relation at random: a logit of +/-10 pins it
        # to one order, while all other relations start with no ordering preference.
        dhLogits[key] = (10.0 if random() > 0.5 else -10.0)
    dhWeights.data[i] = dhLogits[key]
    originalDistanceWeights[key] = 0.0  # random()
    distanceWeights.data[i] = originalDistanceWeights[key]

data_train = list(CorpusIterator(args.language, "train", storeMorph=True).iterator(rejectShortSentences=False))
data_dev = list(CorpusIterator(args.language, "dev", storeMorph=True).iterator(rejectShortSentences=False))
#print(len(data_train), len(data_dev))
#quit()

words = []
affixFrequency = {}

print(itos_pure_deps)
itos_pure_deps = sorted(list(itos_pure_deps) + ["HEAD"])
stoi_pure_deps = dict(list(zip(itos_pure_deps, range(len(itos_pure_deps)))))
itos_pure_deps_ = itos_pure_deps[::]
    order = order.replace("OO", "O")
    #print(order)
    return order

# quit()

def mean(x):
    # Small constant in the denominator avoids division by zero for empty lists
    return sum(x) / (len(x) + 0.001)

with open(PATH, "w") as outFile:
    corpus = list(CorpusIterator(args.language, partition="together").iterator(rejectShortSentences=True))
    shuffle(corpus)
    best1 = {1: [], -1: []}
    mean1 = {1: [], -1: []}
    real1 = []
    realOrder1 = []
    for sentence in corpus:
        if counter > 200000:
            print "Quitting at counter " + str(counter)
            quit()
        counter += 1
        if counter % 100 == 0:
            print(counter, (counter + 0.0001) / len(corpus))
        printHere = (counter % 500 == 0)
def encodeWord(w):
    return stoi[w] + 3 if stoi[w] < vocab_size else 1

import torch.nn.functional

counter = 0
dependencyLengths = []
if True:
    corpus = CorpusIterator(args.language, partition="together", shuffleData=False).iterator(rejectShortSentences=True)
    while True:
        try:
            # Pull the next ten sentences; 10*range(1) is a Python 2 idiom for a list of ten dummy items.
            # If the iterator runs out mid-batch, the partial batch is discarded.
            batch = map(lambda x: next(corpus), 10 * range(1))
        except StopIteration:
            break
        batch = sorted(batch, key=len)
        partitions = range(10)
        for partition in partitions:
            if counter > 200000:
                print "Quitting at counter " + str(counter)
                quit()
            counter += 1
            printHere = (counter % 50 == 0)
            current = batch[partition * 1:(partition + 1) * 1]
                if targetWord >= vocab_size:
                    # Out-of-vocabulary words are represented by their universal POS tag
                    input_indices.append(stoi_pos_uni[line["posUni"]] + 3)
                else:
                    # In-vocabulary words are offset past the reserved indices (0-2) and the POS-tag inventory
                    input_indices.append(targetWord + 3 + len(itos_pos_uni))
            yield input_indices, wordStartIndices + [len(input_indices)], relevant_logprob_sum
            input_indices = [2]  # Start of Segment (makes sure that first word can be predicted from this token)
            wordStartIndices = []

DEV_PERIOD = 5000
epochCount = 0
corpusBase = CorpusIterator(args.language, storeMorph=True)
while failedDevRuns < args.stopAfterFailures:
    epochCount += 1
    print >> sys.stderr, "Epoch " + str(epochCount)
    print "Starting new epoch, permuting corpus"
    corpusBase.permute()
    # corpus = getNextSentence("train")
    corpus = corpusBase.iterator(rejectShortSentences=False)
    stream = createStream(corpus)
    if counter > 5:  # if counter % DEV_PERIOD == 0:
        newDevLoss, _ = computeDevLoss()
        # devLosses.append(
        devLosses.append(newDevLoss)
    if ":" in x:
        return x[:x.index(":")]
    return x

import hashlib

def hash_(x):
    return hashlib.sha224(x).hexdigest()

# Index every sentence of the corpus by a hash of its surface string
hashToSentence = {}
for partition in ["together"]:
    for sentence in CorpusIterator(args.language, partition).iterator():
        sentenceHash = hash_(" ".join([x["word"] for x in sentence]))
        hashToSentence[sentenceHash] = sentence

TARGET_DIR = "/u/scr/mhahn/deps/DLM_MEMORY_OPTIMIZED/locality_optimized_dlm/manual_output_funchead_fine_depl_funchead_perSent/"

import glob
from collections import defaultdict

orderBySentence = {x: [] for x in hashToSentence}
files = glob.glob(TARGET_DIR + "/" + args.language + "*.tsv")
for path in files:
    print(path)
    with open(path, "r") as inFile:
        # Map column names of the TSV header to their positions
        header = next(inFile).strip().split("\t")
        header = dict(list(zip(header, range(len(header)))))
        # print >> outFile, "\t".join(map(str,["DH_Weight","CoarseDependency","HeadPOS", "DependentPOS", "DistanceWeight", "Language", "FileName"]))
        objDir = None
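# Illustration only (hypothetical row, not read from an actual file): the header dict
# built above maps column names to positions, so later rows of the TSV can be indexed
# by name; the column names come from the commented-out header line above.
example_header = {"DH_Weight": 0, "CoarseDependency": 1, "HeadPOS": 2, "DependentPOS": 3, "DistanceWeight": 4}
example_row = ["-0.73", "obj", "VERB", "NOUN", "1.2"]
example_dh_weight = float(example_row[example_header["DH_Weight"]])  # -0.73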
    newDevLossL = devLossL / devWords
    newDevLossU = devLossU / devWords
    newDevAccuracy = float(devAccuracy) / devWords
    newDevAccuracyLabeled = float(devAccuracyLabeled) / devWords
    devLossesL.append(newDevLossL)
    devLossesU.append(newDevLossU)
    devAccuracies.append(newDevAccuracy)
    devAccuraciesLabeled.append(newDevAccuracyLabeled)

counter = 0
epochs = 0
while True:
    corpus = CorpusIterator(language, "train").iterator(rejectShortSentences=True)
    partitions = getPartitions(corpus)
    epochs += 1
    for partition in partitions:
        if counter > maxNumberOfUpdates:
            print "Ran for a long time, quitting."
            quit()
        counter += 1
        printHere = (counter % 100 == 0)
        _, loss, policyLoss, _, _, wordNum = forward(partition)
        if wordNum == 0:
            assert loss == 0
        else:
            backward(loss, policyLoss)
        if printHere: