Example #1
optim = torch.optim.SGD(parameters(), lr=args.learning_rate, momentum=0.0)  # alternative setting: lr=0.02, momentum=0.9

# Modules (and the optimizer) whose state is saved to / restored from checkpoints
named_modules = {"rnn": rnn, "output": output, "char_embeddings": char_embeddings, "optim": optim}


# Load the model from the checkpoint, if there is one
if args.load_from is not None:
    checkpoint = torch.load(CHECKPOINT_HOME + args.load_from + ".pth.tar")
    for name, module in named_modules.items():
        module.load_state_dict(checkpoint[name])
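
# A minimal sketch of the matching save step, assuming checkpoints are written
# symmetrically elsewhere in the full script (save_checkpoint and its `name`
# argument are hypothetical, not part of the original code):
def save_checkpoint(name):
    checkpoint = {key: module.state_dict() for key, module in named_modules.items()}
    torch.save(checkpoint, CHECKPOINT_HOME + name + ".pth.tar")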

from torch.autograd import Variable  # legacy import; Variable is deprecated since PyTorch 0.4

# Re-read the training data so that it is reshuffled. For evaluation this
# should switch to a held-out set; while developing this code, the training
# partition of the language model is used.
data = AcqdivReaderPartition(acqdivCorpusReadertrain).reshuffledIterator(blankBeforeEOS=False)
#print(list(data))

#print("stoi[char]+3")
#print(stoi[char]+3)

print("stoi")
print(stoi)
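
# For orientation, a toy sketch of how a character-to-index map like stoi is
# typically constructed (an assumption: the real stoi is built earlier in the
# full script from the corpus vocabulary; toy_chunks is hypothetical):
toy_chunks = ["a cat", "a dog"]
itos_toy = sorted(set("".join(toy_chunks)))               # index -> character
stoi_toy = {char: i for i, char in enumerate(itos_toy)}   # character -> index
# Downstream code adds +3 to such indices, reserving the lowest codes for
# special symbols.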
# Read the data into a list of integers
numeric_with_blanks = []
count = 0
print("Prepare chunks")
for chunk in data:
    print("chunk")
    print(chunk)  # each "chunk" is a string representing an utterance
    # Loop body reconstructed from the commented-out hints above (an
    # assumption: +3 shifts character indices past reserved special codes):
    for char in chunk:
        count += 1
        numeric_with_blanks.append(stoi[char] + 3)

# ... (intervening code elided in this excerpt; the fragment below comes from
# deeper inside the evaluation loop) ...
            if (labels[-1] == 1) and relevantNextWords[-1].startswith(
                    relevantWords[-1]
            ):  # not enforced as a hard assertion: this should merely be quite unlikely in languages such as English
                print("WARNING",
                      list(zip(boundaries[j][i:], boundariesAll[j][i:])))


#                     if len(relevantWords[-1]) > 1:
#                       assert False
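
# Toy illustration of the check above (assumptions about the data layout:
# labels marks predicted boundaries, relevantWords/relevantNextWords hold the
# current and following words). A predicted boundary is suspicious when the
# next word begins with the current one, as with "can" before "cannot":
toy_labels = [1]
toy_words = ["can"]
toy_next_words = ["cannot"]
if toy_labels[-1] == 1 and toy_next_words[-1].startswith(toy_words[-1]):
    print("WARNING: ambiguous boundary:", toy_words[-1], "->", toy_next_words[-1])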

import time

devLosses = []
# for epoch in range(10000):
if True:  # the epoch loop above is disabled; run a single pass over the data
    training_data = AcqdivReaderPartition(
        acqdivCorpusReadertrain).reshuffledIterator()

    print("Got data")
    training_chars = prepareDatasetChunksTest(training_data, train=True)

    rnn_drop.train(False)  # rnn_drop is defined in the full script; eval mode disables dropout
    startTime = time.time()
    trainChars = 0
    counter = 0
    while True:
        counter += 1
        try:
            numeric = [next(training_chars) for _ in range(args.batchSize)]
        except StopIteration:
            break
        printHere = (counter % 50 == 0)
        # ... (intervening code elided in this excerpt; the lines below come
        # from deeper inside the per-character bookkeeping loop) ...
            # relevantNextWords.append(nextRelevantWord)
            positionIDs.append(int(uniquePositionID[j][i]))
            charactersAll.append(codeToChar(int(numeric[j][i])))
            assert boundariesAll[j][i + 1] is not None
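
# For context, a plausible shape for codeToChar as used above (an assumption:
# it inverts the stoi mapping, undoing the +3 offset; the special-symbol
# names here are hypothetical, not from the original script):
def codeToChar_sketch(code, itos):
    if code >= 3:
        return itos[code - 3]
    return ["<pad>", "<sos>", "<oov>"][code]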


####################################################################################

import time

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
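
# A minimal sketch of how these imports are presumably used further down:
# fitting a logistic-regression boundary classifier on features extracted
# from the language model (X_toy and y_toy are hypothetical stand-ins):
import numpy as np

X_toy = np.random.rand(100, 5)               # 100 examples, 5 features
y_toy = np.random.randint(0, 2, size=100)    # binary boundary labels
X_tr, X_te, y_tr, y_te = train_test_split(X_toy, y_toy, test_size=0.2, random_state=0)
clf = LogisticRegression().fit(X_tr, y_tr)
print("held-out accuracy:", clf.score(X_te, y_te))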

training_data = list(
    AcqdivReaderPartition(acqdivCorpusReadertrain).reshuffledIterator(
        blankBeforeEOS=True, seed=0))

#############################################

fullDataset_perRound_general = [
    list(prepareDatasetChunksTest(
        training_data,
        train=True,
        offset=ROUND * (args.sequence_length // 3)))
    for ROUND in range(-2, 2)
]
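
# Quick sanity check of the offsets above (an assumption for illustration:
# sequence_length == 9). The four rounds shift the chunking window by a third
# of a sequence each, so chunk boundaries fall at staggered positions:
print([ROUND * (9 // 3) for ROUND in range(-2, 2)])  # -> [-6, -3, 0, 3]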

with open("segmentation-predictions/" + args.language + "-table.txt",
          "w") as outFileTable:
    print("\t".join(