def computeDevLoss():
   devBatchSize = args.batchSize
   global printHere
   devLoss = 0.0
   devWords = 0
   corpusDev = CorpusIterator(args.language,"dev").iterator(rejectShortSentences = False)
   stream = createStream(corpusDev, training=False)

   surprisalTable = [0 for _ in range(2)]
   devCounter = 0
   devCounterTimesBatchSize = 0
   while True:
     try:
        input_indices_list = []
        wordStartIndices_list = []
        for _ in range(devBatchSize):
           input_indices, wordStartIndices, _ = next(stream)
           input_indices_list.append(input_indices)
           wordStartIndices_list.append(wordStartIndices)
      except StopIteration:
         # stream exhausted: keep whatever fit into this final, smaller batch
         devBatchSize = len(input_indices_list)
      if devBatchSize == 0:
        break
     devCounter += 1
     printHere = (devCounter % 100 == 0)
      _, _, _, newLoss, newWords = doForwardPass(input_indices_list, wordStartIndices_list,
                                                 surprisalTable=surprisalTable, doDropout=False,
                                                 batchSizeHere=devBatchSize, relevant_logprob_sum=None)
     devLoss += newLoss
     devWords += newWords
     if printHere:
         print "Dev examples "+str(devCounter)
     devCounterTimesBatchSize += devBatchSize
   return devLoss / devWords, None  # surprisal table not returned here
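The value returned is the mean negative log-likelihood per dev word. A minimal sketch, assuming the per-word losses are natural-log cross-entropies (as in torch's loss functions), of converting it to perplexity:

import math

def perplexity(mean_nll):
    # perplexity is the exponential of the mean negative log-likelihood per word
    return math.exp(mean_nll)

devLoss, _ = computeDevLoss()
print("Dev perplexity: " + str(perplexity(devLoss)))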
Example #2
def initializeOrderTable():
    orderTable = {}
    keys = set()
    vocab = {}
    distanceSum = {}
    distanceCounts = {}
    depsVocab = set()
    depsVocab.add("root")
    for partition in ["together"]:
        for sentence in CorpusIterator(args.language, partition).iterator():
            sentenceHash = hash_(" ".join([x["word"] for x in sentence]))
            for line in sentence:
                vocab[line["word"]] = vocab.get(line["word"], 0) + 1
                posFine.add(line["posFine"])
                posUni.add(line["posUni"])

                if line["dep"] == "root":
                    continue
                posHere = line["posUni"]
                posHead = sentence[line["head"] - 1]["posUni"]
                if line["dep"] == "nsubj":
                    line["dep"] = "nsubj_" + str(sentenceHash) + "_" + str(
                        line["index"])
                if line["dep"] == "obj":
                    line["dep"] = "obj_" + str(sentenceHash) + "_" + str(
                        line["index"])

                line["fine_dep"] = line["dep"]
                depsVocab.add(line["fine_dep"])

                dep = line["fine_dep"]
                direction = "HD" if line["head"] < line["index"] else "DH"
                key = dep
                keyWithDir = (dep, direction)
                orderTable[keyWithDir] = orderTable.get(keyWithDir, 0) + 1
                keys.add(key)
                distanceCounts[key] = distanceCounts.get(key, 0.0) + 1.0
                distanceSum[key] = distanceSum.get(
                    key, 0.0) + abs(line["index"] - line["head"])
    #print orderTable
    dhLogits = {}
    for key in keys:
        hd = orderTable.get((key, "HD"), 0) + 1.0
        dh = orderTable.get((key, "DH"), 0) + 1.0
        dhLogit = log(dh) - log(hd)
        dhLogits[key] = dhLogit
    return dhLogits, vocab, keys, depsVocab
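The direction logit is an add-one-smoothed log-odds ratio between dependent-first and head-first counts. A hypothetical worked example with invented counts:

from math import log

# Suppose a dependency type was observed 7 times dependent-first ("DH")
# and 2 times head-first ("HD"); add-one smoothing keeps both logs finite:
dh = 7 + 1.0
hd = 2 + 1.0
dhLogit = log(dh) - log(hd)  # log(8/3) = 0.98..., positive: dependent-before-head preferred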
Example #3
def initializeOrderTable():
    orderTable = {}
    keys = set()
    vocab = {}
    distanceSum = {}
    distanceCounts = {}
    depsVocab = set()
    for partition in ["train", "dev"]:
        for sentence in CorpusIterator(args.language,
                                       partition,
                                       storeMorph=True).iterator():
            for line in sentence:
                vocab[line["word"]] = vocab.get(line["word"], 0) + 1
                vocab_lemmas[line["lemma"]] = vocab_lemmas.get(
                    line["lemma"], 0) + 1

                depsVocab.add(line["dep"])
                posFine.add(line["posFine"])
                posUni.add(line["posUni"])

                for morph in line["morph"]:
                    morphKeyValuePairs.add(morph)
                if line["dep"] == "root":
                    continue

                posHere = line["posUni"]
                posHead = sentence[line["head"] - 1]["posUni"]
                dep = line["dep"]
                direction = "HD" if line["head"] < line["index"] else "DH"
                key = (posHead, dep, posHere)
                keyWithDir = (posHead, dep, posHere, direction)
                orderTable[keyWithDir] = orderTable.get(keyWithDir, 0) + 1
                keys.add(key)
                distanceCounts[key] = distanceCounts.get(key, 0.0) + 1.0
                distanceSum[key] = distanceSum.get(
                    key, 0.0) + abs(line["index"] - line["head"])
    dhLogits = {}
    for key in keys:
        hd = orderTable.get((key[0], key[1], key[2], "HD"), 0) + 1.0
        dh = orderTable.get((key[0], key[1], key[2], "DH"), 0) + 1.0
        dhLogit = log(dh) - log(hd)
        dhLogits[key] = dhLogit
        originalDistanceWeights[key] = (distanceSum[key] / distanceCounts[key])
    return dhLogits, vocab, keys, depsVocab
Example #4
def initializeOrderTable():
    orderTable = {}
    keys = set()
    vocab = {}
    distanceSum = {}
    distanceCounts = {}
    depsVocab = set()
    for partition in ["together"]:
        for sentence, metadata in CorpusIterator(args.language,
                                                 partition).iterator():
            docs[metadata["newdoc id"]] += 1
            for line in sentence:
                vocab[line["word"]] = vocab.get(line["word"], 0) + 1
                line["fine_dep"] = line["dep"]
                depsVocab.add(line["fine_dep"])
                posFine.add(line["posFine"])
                posUni.add(line["posUni"])

                if line["fine_dep"] == "root":
                    continue
                posHere = line["posUni"]
                posHead = sentence[line["head"] - 1]["posUni"]
                dep = line["fine_dep"]
                direction = "HD" if line["head"] < line["index"] else "DH"
                key = (posHead, dep, posHere)
                keyWithDir = (dep, direction)
                orderTable[keyWithDir] = orderTable.get(keyWithDir, 0) + 1
                keys.add(key)
                distanceCounts[key] = distanceCounts.get(key, 0.0) + 1.0
                distanceSum[key] = distanceSum.get(
                    key, 0.0) + abs(line["index"] - line["head"])
    #print orderTable
    dhLogits = {}
    for key in keys:
        # orderTable was keyed by (dep, direction) above, while key is the
        # (posHead, dep, posHere) triple, so look up by its dep component:
        hd = orderTable.get((key[1], "HD"), 0) + 1.0
        dh = orderTable.get((key[1], "DH"), 0) + 1.0
        dhLogit = log(dh) - log(hd)
        dhLogits[key] = dhLogit
    return dhLogits, vocab, keys, depsVocab
Example #5
def initializeOrderTable():
    orderTable = {}
    keys = set()
    vocab = {}
    distanceSum = {}
    distanceCounts = {}
    depsVocab = set()
    for partition in ["train", "dev"]:
        for sentence in CorpusIterator(args.language, partition).iterator():
            for line in sentence:
                vocab[line["word"]] = vocab.get(line["word"], 0) + 1
                line["coarse_dep"] = makeCoarse(line["dep"])
                depsVocab.add(line["coarse_dep"])
                posFine.add(line["posFine"])
                posUni.add(line["posUni"])

                if line["coarse_dep"] == "root":
                    continue
                posHere = line["posUni"]
                posHead = sentence[line["head"] - 1]["posUni"]
                dep = line["coarse_dep"]
                direction = "HD" if line["head"] < line["index"] else "DH"
                key = dep
                keyWithDir = (dep, direction)
                orderTable[keyWithDir] = orderTable.get(keyWithDir, 0) + 1
                keys.add(key)
                distanceCounts[key] = distanceCounts.get(key, 0.0) + 1.0
                distanceSum[key] = distanceSum.get(
                    key, 0.0) + abs(line["index"] - line["head"])
    #print orderTable
    dhLogits = {}
    for key in keys:
        hd = orderTable.get((key, "HD"), 0) + 1.0
        dh = orderTable.get((key, "DH"), 0) + 1.0
        dhLogit = log(dh) - log(hd)
        dhLogits[key] = dhLogit
        originalDistanceWeights[key] = (distanceSum[key] / distanceCounts[key])
    return dhLogits, vocab, keys, depsVocab
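Here originalDistanceWeights additionally records the empirical mean head-dependent distance per coarse dependency type. A worked example with invented counts:

# Suppose "amod" occurred 4 times, at linear distances 1, 1, 2 and 4:
distanceCounts["amod"] = 4.0
distanceSum["amod"] = 1.0 + 1.0 + 2.0 + 4.0
originalDistanceWeights["amod"] = distanceSum["amod"] / distanceCounts["amod"]  # 2.0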
Example #6
def computeDevLoss():
    global printHere
    counterDev = 0
    corpusDev = CorpusIterator(language,
                               "dev").iterator(rejectShortSentences=True)
    partitionsDev = getPartitions(corpusDev)
    devLossU = 0
    devLossL = 0
    devAccuracy = 0
    devAccuracyLabeled = 0
    devWords = 0
    for partitionDev in partitionsDev:
        counterDev += 1
        printHere = (counterDev % 500 == 0)
        lossU, lossL, _, accuracy, accuracyLabeled, wordNum = forward(
            partitionDev, computeAccuracy=True, doDropout=False)
        devLossU += lossU.data.cpu().numpy()
        devLossL += lossL.data.cpu().numpy()

        devAccuracy += accuracy
        devAccuracyLabeled += accuracyLabeled
        devWords += wordNum
        if counterDev % 50 == 0:
            print "Run on dev " + str(counterDev)
            print(devLossU / devWords, devLossL / devWords,
                  float(devAccuracy) / devWords,
                  float(devAccuracyLabeled) / devWords, devWords)

    newDevLossL = devLossL / devWords
    newDevLossU = devLossU / devWords

    newDevAccuracy = float(devAccuracy) / devWords
    newDevAccuracyLabeled = float(devAccuracyLabeled) / devWords
    devLossesL.append(newDevLossL)
    devLossesU.append(newDevLossU)

    devAccuracies.append(newDevAccuracy)
    devAccuraciesLabeled.append(newDevAccuracyLabeled)
Example #7
batchSize = 1

lr_lm = 0.1

crossEntropy = 10.0


def encodeWord(w):
    return stoi[w] + 3 if stoi[w] < vocab_size else 1
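
# A minimal sketch (hypothetical helper) of the inverse mapping, assuming the
# usual companion list itos; indices 0-2 are reserved (1 encodes words outside
# the vocabulary) and in-vocabulary words are shifted up by 3:
def decodeWord(i):
    if i == 1:
        return "<OOV>"
    return itos[i - 3]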


import torch.nn.functional

counter = 0
while True:
    corpus = CorpusIterator(args.language, partition="together")
    corpus.permute()
    corpus = corpus.iterator(rejectShortSentences=False)

    for current in corpus:
        if counter > 50000000:
            print("Quitting at counter " + str(counter))
            quit()
        counter += 1
        printHere = (counter % 50 == 0)
        current = [current]
        batchOrdered, logits = orderSentence(current[0], dhLogits, printHere)

        metadata = current[0][1]

        maxLength = len(batchOrdered)
Example #8
dhWeights = Variable(torch.FloatTensor([0.0] * len(itos_pure_deps)),
                     requires_grad=True)
distanceWeights = Variable(torch.FloatTensor([0.0] * len(itos_pure_deps)),
                           requires_grad=True)
for i, key in enumerate(itos_pure_deps):
    dhLogits[key] = 0.0
    if key == "obj":
        dhLogits[key] = (10.0 if random() > 0.5 else -10.0)

    dhWeights.data[i] = dhLogits[key]

    originalDistanceWeights[key] = 0.0  #random()
    distanceWeights.data[i] = originalDistanceWeights[key]
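
# A minimal sketch (hypothetical helper, not the repository's orderSentence)
# of reading a direction preference back out of dhWeights; i indexes
# itos_pure_deps as in the loop above:
def preferredDirection(i):
    # a positive logit means the dependent is placed before its head ("DH")
    return "DH" if dhWeights.data[i] > 0 else "HD"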

data_train = list(
    CorpusIterator(args.language, "train",
                   storeMorph=True).iterator(rejectShortSentences=False))
data_dev = list(
    CorpusIterator(args.language, "dev",
                   storeMorph=True).iterator(rejectShortSentences=False))
#print(len(data_train), len(data_dev))
#quit()

words = []

affixFrequency = {}

print(itos_pure_deps)
itos_pure_deps = sorted(list(itos_pure_deps) + ["HEAD"])
stoi_pure_deps = dict(list(zip(itos_pure_deps, range(len(itos_pure_deps)))))

itos_pure_deps_ = itos_pure_deps[::]
Example #9
        order = order.replace("OO", "O")
    #print(order)
    return order


#    quit()


def mean(x):
    # +0.001 in the denominator avoids ZeroDivisionError on empty lists
    return sum(x) / (len(x) + 0.001)


with open(PATH, "w") as outFile:
    corpus = list(
        CorpusIterator(
            args.language,
            partition="together").iterator(rejectShortSentences=True))
    shuffle(corpus)

    best1 = {1: [], -1: []}
    mean1 = {1: [], -1: []}
    real1 = []
    realOrder1 = []
    for sentence in corpus:
        if counter > 200000:
            print "Quitting at counter " + str(counter)
            quit()
        counter += 1
        if counter % 100 == 0:
            print(counter, (counter + 0.0001) / len(corpus))
        printHere = (counter % 500 == 0)
Example #10
def encodeWord(w):
   return stoi[w]+3 if stoi[w] < vocab_size else 1




import torch.nn.functional


counter = 0


dependencyLengths = []

if True:
  corpus = CorpusIterator(args.language, partition="together", shuffleData=False).iterator(rejectShortSentences = True)

  while True:
    try:
       # buffer the next 10 sentences; next() raises StopIteration when exhausted
       batch = [next(corpus) for _ in range(10)]
    except StopIteration:
       break
    # sort the buffered sentences by length so partitions hold similar lengths
    batch = sorted(batch, key=len)
    partitions = range(10)
    for partition in partitions:
       if counter > 200000:
           print "Quitting at counter "+str(counter)
           quit()
       counter += 1
       printHere = (counter % 50 == 0)
       current = batch[partition*1:(partition+1)*1]  # *1: batch size of one sentence
Example #11
                if targetWord >= vocab_size:
                    input_indices.append(stoi_pos_uni[line["posUni"]] + 3)
                else:
                    input_indices.append(targetWord + 3 + len(itos_pos_uni))

        yield input_indices, wordStartIndices + [len(input_indices)], relevant_logprob_sum
        input_indices = [
            2
        ]  # Start of Segment (makes sure that first word can be predicted from this token)
        wordStartIndices = []
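
# Assumed index layout of this stream, inferred from the branches above:
#   0..2        reserved codes (2 marks the start of a segment, per the comment above)
#   3..P+2      POS-tag backoff codes for out-of-vocabulary words
#   P+3 onward  in-vocabulary word indices, where P = len(itos_pos_uni)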


DEV_PERIOD = 5000
epochCount = 0
corpusBase = CorpusIterator(args.language, storeMorph=True)
while failedDevRuns < args.stopAfterFailures:
    epochCount += 1
    sys.stderr.write("Epoch " + str(epochCount) + "\n")
    print("Starting new epoch, permuting corpus")
    corpusBase.permute()
    #  corpus = getNextSentence("train")
    corpus = corpusBase.iterator(rejectShortSentences=False)
    stream = createStream(corpus)

    if counter > 5:  # original schedule: if counter % DEV_PERIOD == 0
        newDevLoss, _ = computeDevLoss()
        devLosses.append(newDevLoss)
    if ":" in x:
        return x[:x.index(":")]
    return x


import hashlib


def hash_(x):
    # Python 2: str is bytes here; under Python 3, hash x.encode("utf-8") instead
    return hashlib.sha224(x).hexdigest()


hashToSentence = {}

for partition in ["together"]:
    for sentence in CorpusIterator(args.language, partition).iterator():
        sentenceHash = hash_(" ".join([x["word"] for x in sentence]))
        hashToSentence[sentenceHash] = sentence

TARGET_DIR = "/u/scr/mhahn/deps/DLM_MEMORY_OPTIMIZED/locality_optimized_dlm/manual_output_funchead_fine_depl_funchead_perSent/"
import glob
from collections import defaultdict
orderBySentence = {x: [] for x in hashToSentence}
files = glob.glob(TARGET_DIR + "/" + args.language + "*.tsv")
for path in files:
    print(path)
    with open(path, "r") as inFile:
        header = next(inFile).strip().split("\t")
        header = dict(zip(header, range(len(header))))  # column name -> index
        # columns: DH_Weight, CoarseDependency, HeadPOS, DependentPOS,
        #          DistanceWeight, Language, FileName
        objDir = None
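        # Sketch of consuming the remaining rows via the header map; column
        # names are taken from the column comment above.
        for rowLine in inFile:
            row = rowLine.strip().split("\t")
            if row[header["CoarseDependency"]] == "obj":
                objDir = float(row[header["DH_Weight"]])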


counter = 0
epochs = 0
while True:
    corpus = CorpusIterator(language,
                            "train").iterator(rejectShortSentences=True)
    partitions = getPartitions(corpus)
    epochs += 1
    for partition in partitions:
        if counter > maxNumberOfUpdates:
            print "Ran for a long time, quitting."
            quit()

        counter += 1
        printHere = (counter % 100 == 0)
        _, loss, policyLoss, _, _, wordNum = forward(partition)
        if wordNum == 0:
            assert loss == 0  # compare by value; "is 0" relied on CPython int caching
        else:
            backward(loss, policyLoss)
        if printHere: