optim = torch.optim.SGD(parameters(), lr=args.learning_rate, momentum=0.0)  # alternative settings tried: lr=0.02, momentum=0.9

named_modules = {"rnn": rnn, "output": output, "char_embeddings": char_embeddings, "optim": optim}

# Load the model from the checkpoint, if there is one.
if args.load_from is not None:
    checkpoint = torch.load(CHECKPOINT_HOME + args.load_from + ".pth.tar")
    for name, module in named_modules.items():
        module.load_state_dict(checkpoint[name])

from torch.autograd import Variable  # legacy import; Variable is deprecated in current PyTorch

# Read the training data so that it is reshuffled. For evaluation this needs
# to switch to a held-out set; while developing this code, the training set
# (i.e., the training set of the language model) is used.
data = AcqdivReaderPartition(acqdivCorpusReadertrain).reshuffledIterator(blankBeforeEOS=False)

# print(list(data))
# print("stoi[char]+3")
# print(stoi[char]+3)
print("stoi")
print(stoi)

# Read the data into a list of integers.
numeric_with_blanks = []
count = 0
print("Prepare chunks")
for chunk in data:
    # Each "chunk" is a string representing an utterance.
    print("chunk")
    print(chunk)
    # print(stoi[" "])
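    # The remainder of this loop (continued below) encodes the utterance as
    # integers. A minimal sketch of that encoding, assuming the stoi[char] + 3
    # offset hinted at in the debug comments above (IDs 0-2 presumably
    # reserved for special symbols such as padding, end-of-sequence, and
    # out-of-vocabulary):
    #
    #     for char in chunk:
    #         numeric_with_blanks.append(stoi[char] + 3)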
            if (labels[-1] == 1) and relevantNextWords[-1].startswith(relevantWords[-1]):
                # Not a hard assertion; this case should just be quite
                # unlikely in languages such as English.
                print("WARNING", list(zip(boundaries[j][i:], boundariesAll[j][i:])))
                # if len(relevantWords[-1]) > 1:
                #     assert False

import time

devLosses = []
# for epoch in range(10000):
if True:
    training_data = AcqdivReaderPartition(acqdivCorpusReadertrain).reshuffledIterator()
    print("Got data")
    training_chars = prepareDatasetChunksTest(training_data, train=True)
    rnn_drop.train(False)  # evaluation mode: no dropout
    startTime = time.time()
    trainChars = 0
    counter = 0
    while True:
        counter += 1
        try:
            numeric = [next(training_chars) for _ in range(args.batchSize)]
        except StopIteration:
            break
        printHere = (counter % 50 == 0)
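        # At this point `numeric` holds args.batchSize integer-encoded
        # sequences, and printHere gates diagnostics to every 50th batch.
        # A minimal sketch of how such a batch is commonly stacked into a
        # (sequence_length, batch_size) LongTensor for the RNN (hedged: the
        # actual batching code follows below and may pad or transpose
        # differently):
        #
        #     input_tensor = Variable(torch.LongTensor(numeric).transpose(0, 1))
        #     embedded = char_embeddings(input_tensor)
        #     out, hidden = rnn_drop(embedded, None)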
            # relevantNextWords.append(nextRelevantWord)
            positionIDs.append(int(uniquePositionID[j][i]))
            charactersAll.append(codeToChar(int(numeric[j][i])))
            assert boundariesAll[j][i + 1] is not None

####################################################################################

import time
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

training_data = list(AcqdivReaderPartition(acqdivCorpusReadertrain).reshuffledIterator(blankBeforeEOS=True, seed=0))

#############################################

# Build four shifted copies of the dataset, offsetting the chunking by a
# third of the sequence length per round, so that characters appear at
# different positions within a sequence.
fullDataset_perRound_general = [
    list(prepareDatasetChunksTest(training_data, train=True,
                                  offset=ROUND * (int(args.sequence_length / 3))))
    for ROUND in range(-2, 2)
]

with open("segmentation-predictions/" + args.language + "-table.txt", "w") as outFileTable:
    print("\t".join(
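        # (the header row of the tab-separated prediction table continues here)
        #
        # A minimal sketch of how the sklearn imports above are typically used
        # downstream, assuming hypothetical per-character `features` (e.g.,
        # RNN-derived scores) and binary boundary `labels`; the actual feature
        # columns are whatever this table accumulates:
        #
        #     X_train, X_test, y_train, y_test = train_test_split(
        #         features, labels, test_size=0.2, random_state=0)
        #     classifier = LogisticRegression().fit(X_train, y_train)
        #     predicted_boundaries = classifier.predict(X_test)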