Example #1
def calculateTradeoffForWeights(weights):
    # Order the datasets based on the given weights
    train = []
    dev = []
    totalDependencyLength = 0.0
    totalWordCount = 0.0
    for data, processed in [(data_train, train), (data_dev, dev)]:
        for sentence in data:
            linearized, dependencyLength, wordCount = orderSentence(
                sentence, weights)
            # Sanity check: a tree over wordCount words has wordCount - 1
            # dependencies, each of length at least 1.
            assert dependencyLength + 1 >= wordCount, (dependencyLength,
                                                       wordCount)
            totalDependencyLength += dependencyLength
            totalWordCount += wordCount
            for word in linearized:
                processed.append(word["word"])
            processed.append("EOS")
            # Interpose padding so that n-gram windows never span two
            # unrelated sentences.
            for _ in range(args.cutoff + 2):
                processed.append("PAD")
            processed.append("SOS")
    auc, devSurprisalTable = calculateMemorySurprisalTradeoff(train, dev, args)
    print("VALUES", auc, totalDependencyLength / totalWordCount)
    # Combine the memory-surprisal AUC with (twice) the average
    # dependency length into a single objective.
    overallObjective = auc + 2 * totalDependencyLength / totalWordCount
    return overallObjective, devSurprisalTable, totalDependencyLength / totalWordCount
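A quick way to see how the combined objective could be used: below is a minimal hill-climbing sketch (a hypothetical driver, not part of the original script) that perturbs one weight at a time and keeps changes that lower the objective. It assumes weights is a dict mapping the string keys consumed by orderSentence to real numbers, and that lower objective values are better.

import random

def optimizeWeights(weights, steps=100):
    # Hypothetical random-search driver around calculateTradeoffForWeights.
    best, _, _ = calculateTradeoffForWeights(weights)
    for _ in range(steps):
        key = random.choice(list(weights))
        old = weights[key]
        weights[key] = old + random.gauss(0.0, 0.1)  # propose a small change
        objective, _, _ = calculateTradeoffForWeights(weights)
        if objective < best:
            best = objective  # accept: the combined objective improved
        else:
            weights[key] = old  # reject: restore the previous weight
    return weights, best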
Example #2
def calculateTradeoffForWeights(weights):
    # Order the datasets based on the given weights
    train = []
    dev = []
    depLen, words = 0, 0
    for data, processed in [(data_train, train), (data_dev, dev)]:
        for sentence in data:
            linearized, depLenHere = orderSentence(sentence, weights)
            depLen += depLenHere
            for word in linearized:
                processed.append(word["word"])
                words += 1
            processed.append("EOS")
            # Interpose padding so that n-gram windows never span two
            # unrelated sentences.
            for _ in range(args.cutoff + 2):
                processed.append("PAD")
            processed.append("SOS")
    depLen = depLen / float(words)  # average dependency length per word
    # Only estimate the memory-surprisal tradeoff when the AUC term
    # actually matters to the caller.
    if args.aucWeight > 0:
        auc, devSurprisalTable = calculateMemorySurprisalTradeoff(
            train, dev, args)
    else:
        auc = 0
        devSurprisalTable = [0]
    return auc, devSurprisalTable, depLen
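Unlike Example #1, this variant returns the AUC and the average dependency length separately and leaves the mixing to the caller. One plausible combination, mirroring the args.aucWeight gate above (the exact formula used by the surrounding script is an assumption):

auc, devSurprisalTable, depLen = calculateTradeoffForWeights(weights)
objective = args.aucWeight * auc + depLen  # hypothetical mixing of the two terms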
Example #3
def calculateTradeoffForWeights(weights):
    # Order each verb's affixes based on the given weights
    train = []
    dev = []
    for data, processed in [(data_train, train), (data_dev, dev)]:
        for verb in data:
            # verb[0] is the stem; the remaining elements are affixes.
            affixes = verb[1:]
            affixes = sorted(
                affixes, key=lambda x: weights.get(getRepresentation(x), 0))
            for ch in [verb[0]] + affixes:
                processed.append(getSurprisalRepresentation(ch))
            processed.append("EOS")
            # Interpose padding so that n-gram windows never span two
            # unrelated verb forms.
            for _ in range(args.cutoff + 2):
                processed.append("PAD")
            processed.append("SOS")
    auc, devSurprisalTable = calculateMemorySurprisalTradeoff(train, dev, args)
    return auc, devSurprisalTable
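The sort key is easiest to see on concrete data. In this self-contained sketch, plain affix labels stand in for whatever getRepresentation returns: sorted() arranges affixes by ascending weight, and affixes absent from weights default to 0, so they land between negative- and positive-weight ones.

weights = {"PAST": 2.0, "PL": 1.0, "CAUS": -0.5}
affixes = ["PL", "HON", "CAUS", "PAST"]  # "HON" is absent from weights
print(sorted(affixes, key=lambda x: weights.get(x, 0)))
# -> ['CAUS', 'HON', 'PL', 'PAST']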
Example #4
def calculateTradeoffForWeights(weights):
    train = []
    dev = []
    # Iterate through the verb forms in the two data partitions, and
    # linearize each as a sequence of underlying morphemes
    for data, processed in [(data_train, train), (data_dev, dev)]:
        for verb in data:
            affixes = verb[1:]
            if args.model == "REAL":  # Keep the attested ordering
                pass
            elif args.model == "REVERSE":  # Reverse the affixes
                affixes = affixes[::-1]
            else:  # Order based on the given weights
                affixes = sorted(
                    affixes,
                    key=lambda x: weights.get(getRepresentation(x), 0))

            # Express the form as a sequence of underlying morphemes (this
            # could instead be a sequence of phonemes if we can phonemize
            # the Korean input)
            for ch in [verb[0]] + affixes:
                processed.append(getSurprisalRepresentation(ch))
            processed.append("EOS")  # Indicate end-of-sequence
            # Interpose a padding symbol between each pair of successive
            # verb forms. There is no relation between successive verb
            # forms, and the padding prevents the n-gram models from
            # "trying to learn" spurious relations between them.
            for _ in range(args.cutoff + 2):
                processed.append("PAD")
            processed.append("SOS")  # Start-of-sequence for the next form

    # Calculate AUC and the surprisals over distances (see
    # estimateTradeoffHeldout.py for further documentation)
    auc, devSurprisalTable = calculateMemorySurprisalTradeoff(train, dev, args)

    # Write results to a file
    model = args.model
    if "/" in model:
        # args.model is a path: derive a short name marking optimized weights
        model = model[model.rfind("_"):-4] + "-OPTIM"
    # RANDOM and UNIV models get the run ID prefixed to keep repeated
    # runs from overwriting each other.
    outpath = TARGET_DIR + args.language + "_" + __file__ + "_model_" + (
        str(myID) + "-" + model if model in ["RANDOM", "UNIV"] else model
    ) + ".txt"
    print(outpath)
    with open(outpath, "w") as outFile:
        print(str(args), file=outFile)
        print(" ".join(map(str, devSurprisalTable)), file=outFile)
    return auc
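To make the padding comment concrete, here is the token stream two verb forms would contribute, with hypothetical morpheme strings and args.cutoff == 2. Bridging the two forms would require an n-gram window longer than the run of cutoff + 2 padding symbols, so the estimator's statistics for one form never condition on another.

# Illustrative stream for two verb forms (hypothetical morphemes),
# assuming args.cutoff == 2, i.e. cutoff + 2 == 4 PAD symbols per gap:
stream = (["mek", "PAST", "DECL", "EOS"] + ["PAD"] * 4 + ["SOS"] +
          ["ka", "FUT", "DECL", "EOS"] + ["PAD"] * 4 + ["SOS"])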
Example #5
def calculateTradeoffForWeights(weights):
    # Order the datasets based on the given weights
    train = []
    dev = []
    for data, processed in [(data_train, train), (data_dev, dev)]:
        for sentence in data:
            linearized = orderSentence(sentence, weights)
            for word in linearized:
                processed.append(word["word"])
            processed.append("EOS")
            # Interpose padding so that n-gram windows never span two
            # unrelated sentences.
            for _ in range(args.cutoff + 2):
                processed.append("PAD")
            processed.append("SOS")
    auc, devSurprisalTable = calculateMemorySurprisalTradeoff(train, dev, args)
    return auc, devSurprisalTable
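Since this variant also returns the surprisal table, a natural follow-up is to inspect how much each extra symbol of context buys. Assuming devSurprisalTable lists held-out surprisal estimates for increasing context sizes (an assumption about its format), a small inspection sketch:

auc, devSurprisalTable = calculateTradeoffForWeights(weights)
for contextSize in range(1, len(devSurprisalTable)):
    gain = devSurprisalTable[contextSize - 1] - devSurprisalTable[contextSize]
    print(contextSize, gain)  # surprisal reduction from one more context symbol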