def computeDevLoss():
    # Evaluate on the dev set with dropout disabled; returns the per-word
    # dev loss, a per-position surprisal table, and the mean memory term.
    global printHere
    #   global counter
    #   global devSurprisalTable
    devLoss = 0.0
    devWords = 0
    #   corpusDev = getNextSentence("valid")
    corpusDev = corpusIteratorWikiWords.dev(args.language)
    stream = prepareDatasetChunks(corpusDev, train=False)

    surprisalTable = [0 for _ in range(args.horizon)]
    devCounter = 0
    devMemory = 0
    while True:
        devCounter += 1
        printHere = (devCounter % 50 == 0)
        try:
            with torch.no_grad():
                _, _, _, newLoss, newWords, devMemoryHere = forward(
                    next(stream),
                    surprisalTable=surprisalTable,
                    doDropout=False,
                    batchSizeHere=args.batchSize)
        except StopIteration:
            devCounter -= 1  # the last increment saw no batch; keep the normalization below exact
            break

        devMemory += devMemoryHere.data.cpu().numpy()
        devLoss += newLoss
        devWords += newWords
        if printHere:
            print "Dev examples " + str(devCounter)
    devSurprisalTableHere = [
        surp / (devCounter * args.batchSize) for surp in surprisalTable
    ]
    return devLoss / devWords, devSurprisalTableHere, devMemory / devCounter
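
A hypothetical sketch of how computeDevLoss is consumed by the surrounding training loop; the early-stopping rule and the devLosses bookkeeping are assumptions, not part of the snippet above:

# Hypothetical usage sketch (all names except computeDevLoss are assumed).
devLosses = []
newDevLoss, devSurprisalTable, newDevMemory = computeDevLoss()
devLosses.append(newDevLoss)
print("Dev loss", newDevLoss, "dev memory", newDevMemory)
if len(devLosses) > 1 and devLosses[-1] > devLosses[-2]:
    print("Dev loss rising: stop training and keep the previous checkpoint")
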
####################################


def computeDevLoss():
    # Second variant: here batching is done by drawing args.batchSize items
    # from the stream before each forward pass (doForwardPass).
    global printHere
    #   global counter
    #   global devSurprisalTable
    devLoss = 0.0
    devWords = 0
    #   corpusDev = getNextSentence("valid")
    corpusDev = corpusIteratorWikiWords.dev(args.language)
    stream = createStream(corpusDev)

    surprisalTable = [0 for _ in range(args.horizon)]
    devCounter = 0
    devMemory = 0
    while True:
        #     try:
        #        input_indices, wordStartIndices = next(stream)
        try:
            input_indices_list = []
            wordStartIndices_list = []
            for _ in range(args.batchSize):
                input_indices, wordStartIndices = next(stream)
                input_indices_list.append(input_indices)
                wordStartIndices_list.append(wordStartIndices)
        except StopIteration:
            break
        devCounter += 1
        #     counter += 1
        printHere = (devCounter % 50 == 0)
        with torch.no_grad():
            _, _, _, newLoss, newWords, devMemoryHere = doForwardPass(
                input_indices_list,
                wordStartIndices_list,
                surprisalTable=surprisalTable,
                doDropout=False,
                batchSizeHere=args.batchSize)
        devMemory += devMemoryHere.data.cpu().numpy()
        devLoss += newLoss
        devWords += newWords
        if printHere:
            print "Dev examples " + str(devCounter)
    devSurprisalTableHere = [
        surp / (devCounter * args.batchSize) for surp in surprisalTable
    ]
    return devLoss / devWords, devSurprisalTableHere, devMemory / devCounter
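
The loop above drains the stream in groups of args.batchSize and silently drops a final partial batch. The same pattern, isolated as a self-contained sketch (the helper name batched is hypothetical):

# Hypothetical, self-contained sketch of the batching pattern used above:
# pull batch_size items per step; a final partial batch is dropped.
def batched(stream, batch_size):
    while True:
        batch = []
        try:
            for _ in range(batch_size):
                batch.append(next(stream))
        except StopIteration:
            return
        yield batch

# e.g. list(batched(iter(range(7)), 3)) == [[0, 1, 2], [3, 4, 5]]
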
assert False
####################################





from torch.autograd import Variable


#data = AcqdivReaderPartition(acqdivCorpusReader, partition="train").reshuffledIterator(blankBeforeEOS=False)

rnn_drop.train(False)  # evaluation mode: disable dropout in the RNN


data = corpusIteratorWikiWords.dev(args.language)
print("Got data")



numeric_with_blanks = []
count = 0
print("Prepare chunks")
for chunk in data:
    for word in chunk:
        numeric_with_blanks.append(stoi[" "] + 3)
        for char in word:
            # print((char if char != "\n" else "\\n", stoi[char]+3 if char in stoi else 2))
            count += 1
            if char not in stoi:
                print(char)
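
The commented-out line above implies the character encoding convention: a known character maps to stoi[char] + 3, an unknown one to index 2. As a hedged, self-contained sketch (the roles of the reserved indices 0 and 1 are an assumption, e.g. padding or end-of-sequence markers):

# Sketch of the encoding implied by the commented-out print above.
def encode_char(char, stoi, oov_index=2, offset=3):
    return stoi[char] + offset if char in stoi else oov_index
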
Example #4
            if (labels[-1] == 1) and relevantNextWords[-1].startswith(
                    relevantWords[-1]
            ):  # not a hard assertion; this should just be quite unlikely in languages such as English
                print("WARNING",
                      list(zip(boundaries[j][i:], boundariesAll[j][i:])))


#                     if len(relevantWords[-1]) > 1:
#                       assert False
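# A hypothetical concrete instance of the flagged condition above: a
# boundary label (labels[-1] == 1) where the next reference word merely
# extends the current one, e.g. relevantWords[-1] == "in" while
# relevantNextWords[-1] == "into".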

import time

devLosses = []
#for epoch in range(10000):
if True:
    training_data = corpusIteratorWikiWords.dev(args.language,
                                                removeMarkup=False)
    training_data_c = corpusIteratorWiki.dev(args.language, doShuffling=False)

    print("Got data")
    training_chars = prepareDatasetChunks(training_data,
                                          training_data_c,
                                          train=False)

    rnn_drop.train(False)
    startTime = time.time()
    trainChars = 0
    counter = 0
    while True:
        counter += 1
        try:
            numeric = [next(training_chars) for _ in range(args.batchSize)]
Example #5
                "components": [c.state_dict() for c in modules]
            }
            torch.save(
                state, "/u/scr/mhahn/CODEBOOKS/" + args.language1 + "AND" +
                args.language2 + "_" + __file__ + "_code_" + str(args.myID) +
                ".txt")

        if (time.time() - totalStartTime) / 60 > 4000:
            print("Breaking early to get some result within 72 hours")
            totalStartTime = time.time()
            break

#     break
    rnn_drop.train(False)

    dev_data_1 = corpusIteratorWikiWords.dev(args.language1)
    dev_data_2 = corpusIteratorWikiWords.dev(args.language2)

    print("Got data")

    dev_chars = prepareDatasetChunksTwo(dev_data_1, dev_data_2, train=False)

    dev_loss = 0
    dev_char_count = 0
    counter = 0
    hidden, beginning = None, None
    while True:
        counter += 1
        try:
            numeric = next(dev_chars)
        except StopIteration: