def calculateTradeoffForWeights(weights):
    """Order both data partitions according to `weights` and score the
    resulting linearization.

    Returns a triple: (overall objective = AUC + 2 * average dependency
    length per word, the dev surprisal table, the average dependency
    length per word).
    """
    train, dev = [], []
    sumDepLen = 0.0
    sumWords = 0.0
    for corpus, tokens in ((data_train, train), (data_dev, dev)):
        for sentence in corpus:
            linearized, depLenHere, nWordsHere = orderSentence(sentence, weights)
            # Invariant reported by orderSentence: depLen + 1 >= word count
            # (presumably each word beyond the first adds >= 1 dependency
            # length — TODO confirm against orderSentence).
            assert depLenHere + 1 >= nWordsHere, (depLenHere, nWordsHere)
            sumDepLen += depLenHere
            sumWords += nWordsHere
            tokens.extend(w["word"] for w in linearized)
            tokens.append("EOS")
            # Padding between sentences keeps the n-gram estimator from
            # picking up spurious cross-sentence statistics.
            tokens.extend(["PAD"] * (args.cutoff + 2))
            tokens.append("SOS")
    auc, devSurprisalTable = calculateMemorySurprisalTradeoff(train, dev, args)
    avgDepLen = sumDepLen / sumWords
    print("VALUES", auc, avgDepLen)
    overallObjective = auc + 2 * avgDepLen
    return overallObjective, devSurprisalTable, avgDepLen
def calculateTradeoffForWeights(weights):
    """Order the datasets based on the given weights and evaluate the ordering.

    Linearizes every sentence of the train and dev partitions with
    `orderSentence(sentence, weights)`, accumulating the total dependency
    length and word count, then (optionally, when args.aucWeight > 0)
    estimates the memory-surprisal tradeoff on the linearized corpora.

    Returns (auc, devSurprisalTable, average dependency length per word).
    When args.aucWeight <= 0 the tradeoff estimation is skipped and
    (0, [0], avg dep length) is returned.
    """
    train = []
    dev = []
    depLen, words = 0, 0
    for data, processed in [(data_train, train), (data_dev, dev)]:
        for sentence in data:
            linearized, depLenHere = orderSentence(sentence, weights)
            depLen += depLenHere
            words += len(linearized)
            processed.extend(word["word"] for word in linearized)
            processed.append("EOS")
            # Padding between sentences prevents the n-gram models from
            # learning spurious relations across sentence boundaries.
            processed.extend(["PAD"] * (args.cutoff + 2))
            processed.append("SOS")
    # Robustness: avoid ZeroDivisionError when both partitions are empty
    # (e.g. a degenerate language/split); report 0.0 average in that case.
    depLen = depLen / float(words) if words > 0 else 0.0
    if args.aucWeight > 0:
        auc, devSurprisalTable = calculateMemorySurprisalTradeoff(
            train, dev, args)
    else:
        # Tradeoff estimation disabled; return neutral placeholders.
        auc = 0
        devSurprisalTable = [0]
    return auc, devSurprisalTable, depLen
def calculateTradeoffForWeights(weights):
    """Linearize every verb form as root + weight-ordered affixes and
    return (auc, devSurprisalTable) from the memory-surprisal tradeoff."""
    train, dev = [], []
    for corpus, tokens in ((data_train, train), (data_dev, dev)):
        for verb in corpus:
            # Affixes are ordered by their weight; unseen morpheme
            # representations default to weight 0.
            orderedAffixes = sorted(
                verb[1:], key=lambda m: weights.get(getRepresentation(m), 0))
            tokens.extend(
                getSurprisalRepresentation(m) for m in [verb[0]] + orderedAffixes)
            tokens.append("EOS")
            # Padding blocks spurious n-gram statistics across verb forms.
            tokens.extend(["PAD"] * (args.cutoff + 2))
            tokens.append("SOS")
    auc, devSurprisalTable = calculateMemorySurprisalTradeoff(train, dev, args)
    return auc, devSurprisalTable
def calculateTradeoffForWeights(weights):
    """Linearize each verb form as a sequence of underlying morphemes,
    estimate the memory-surprisal tradeoff, write the results to a file,
    and return the AUC.

    The affix order depends on args.model: "REAL" keeps the attested
    order, "REVERSE" mirrors it, anything else sorts by the given weights.
    """
    train, dev = [], []
    # Iterate through the verb forms in the two data partitions, and
    # linearize each as a sequence of underlying morphemes.
    for corpus, tokens in ((data_train, train), (data_dev, dev)):
        for verb in corpus:
            affixes = list(verb[1:])
            if args.model == "REAL":
                # Real (attested) ordering: leave affixes untouched.
                pass
            elif args.model == "REVERSE":
                # Mirror-image affix order.
                affixes.reverse()
            else:
                # Order by weights; unseen representations default to 0.
                affixes.sort(key=lambda m: weights.get(getRepresentation(m), 0))
            # Express as a sequence of underlying morphemes (could instead
            # be phonemes if the Korean input can be phonemized).
            tokens.extend(
                getSurprisalRepresentation(m) for m in [verb[0]] + affixes)
            tokens.append("EOS")  # end-of-sequence marker
            # Interpose padding between successive verb forms: there is no
            # relation between them, and padding prevents the n-gram models
            # from "trying to learn" spurious cross-form relations.
            tokens.extend(["PAD"] * (args.cutoff + 2))
            tokens.append("SOS")  # start-of-sequence for the next form
    # Calculate AUC and the surprisals over distances (see
    # estimateTradeoffHeldout.py for further documentation).
    auc, devSurprisalTable = calculateMemorySurprisalTradeoff(train, dev, args)
    # Write results to a file.
    model = args.model
    if "/" in model:
        model = model[model.rfind("_"):-4] + "-OPTIM"
    fileTag = str(myID) + "-" + model if model in ["RANDOM", "UNIV"] else model
    outpath = (TARGET_DIR + args.language + "_" + __file__ + "_model_"
               + fileTag + ".txt")
    print(outpath)
    with open(outpath, "w") as outFile:
        print(str(args), file=outFile)
        print(" ".join(map(str, devSurprisalTable)), file=outFile)
    return auc
def calculateTradeoffForWeights(weights):
    """Order both datasets with the given weights and return the
    (auc, devSurprisalTable) pair from the memory-surprisal tradeoff."""
    train, dev = [], []
    for corpus, tokens in ((data_train, train), (data_dev, dev)):
        for sentence in corpus:
            for word in orderSentence(sentence, weights):
                tokens.append(word["word"])
            tokens.append("EOS")
            # Padding between sentences keeps the n-gram estimator from
            # linking successive sentences.
            tokens.extend(["PAD"] * (args.cutoff + 2))
            tokens.append("SOS")
    return calculateMemorySurprisalTradeoff(train, dev, args)