def main(argv=None):
    """Train the Seq2Seq captioning model and checkpoint the best epoch.

    Hyper-parameters and paths are read from the module-level ``FLAGS``.
    The vocabulary is built from the training corpus, the model is trained
    for ``FLAGS.EPOCH`` epochs with Adam and cross-entropy loss, and the
    model's ``state_dict`` is written to disk every time the summed loss of
    an epoch is lower than the best one seen so far.

    Args:
        argv: Unused by this function.
    """
    import time

    ### parameters
    LEARNING_RATE = FLAGS.LEARNING_RATE
    NUMBER_OF_FRAMES = FLAGS.NUMBER_OF_FRAMES
    BATCH_SIZE = FLAGS.BATCH_SIZE
    EPOCH = FLAGS.EPOCH
    VOCAB_SIZE = FLAGS.VOCAB_SIZE
    NUMBER_OF_WORDS = FLAGS.NUMBER_OF_WORDS
    HIDDEN_SIZE = FLAGS.HIDDEN_SIZE
    INPUT_SIZE = FLAGS.INPUT_SIZE
    NUMBER_OF_LAYERS = FLAGS.NUMBER_OF_LAYERS

    # Frame preprocessing: resize to 224x224, convert to tensor, normalize
    # per channel with the given mean / std.
    tsfm = transforms.Compose([
        transforms.Resize([224, 224]),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])

    train_dir = FLAGS.train_dir
    train_corpus = FLAGS.train_corpus
    print("train_dir is =", train_dir)
    print("train_corpus =", train_corpus)

    ### vocabulary
    utils = Utils()
    all_text = utils.output_text(train_corpus)
    text_processor = TextProcessor(freq_threshold=10)
    dictionary = text_processor.vocab_creator(all_text)

    ### training data preparation
    train_ds = CustomDataset(train_dir,
                             train_corpus,
                             device,
                             dictionary,
                             VOCAB_SIZE,
                             NUMBER_OF_WORDS,
                             INPUT_SIZE,
                             NUMBER_OF_FRAMES,
                             tsfm,
                             model=md.model_vgg)
    train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE)
    num_batches = len(train_dl)

    ### model definition
    encoder = Encoder_LSTM(input_size=INPUT_SIZE,
                           hidden_size=HIDDEN_SIZE,
                           num_layers=NUMBER_OF_LAYERS)
    decoder = Decoder_LSTM(input_size=VOCAB_SIZE,
                           hidden_size=HIDDEN_SIZE,
                           num_layers=NUMBER_OF_LAYERS,
                           number_of_words=NUMBER_OF_WORDS)
    model = Seq2Seq(encoder, decoder).to(device)

    ### load the state_dict of the model if it has been pretrained
    if FLAGS.load_weights:
        print("there are weights to be loaded")
        # map_location lets a checkpoint saved on another device (e.g. a GPU)
        # be restored onto whatever `device` this run uses.
        model.load_state_dict(
            torch.load(FLAGS.load_weights, map_location=device))

    ### optimizer and loss function
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.CrossEntropyLoss()

    # The checkpoint name depends only on the hyper-parameters, so build it
    # once instead of on every improving epoch.
    checkpoint_path = (f'MODEL_SEQ2SEQVOCAB_SIZE_{VOCAB_SIZE}'
                       f'NUMBER_OF_WORDS_{NUMBER_OF_WORDS}'
                       f'HIDDEN_SIZE_{HIDDEN_SIZE}'
                       f'INPUT_SIZE_{INPUT_SIZE}'
                       f'NUMBER_OF_LAYERS_{NUMBER_OF_LAYERS}.pth')

    ### model training
    print_feq = 1
    best_loss = np.inf
    for epoch in range(1, EPOCH + 1):
        model.train()
        epoch_loss = 0
        for step, (img, label) in enumerate(train_dl):
            time_1 = time.time()  # per-step timing

            ### get inputs and move everything to the training device
            X_1, X_2 = img
            X_1 = X_1.to(device)
            X_2 = X_2.to(device)
            label = label.to(device)

            ### zero the parameter gradients
            optimizer.zero_grad()

            ### forward
            prediction = model(X_1, X_2)
            prediction = torch.squeeze(prediction.to(device), 0)
            label = torch.squeeze(label, 0)

            # Turn each row of `label` into a class index by taking the argmax
            # over the flattened row, in a single on-device call.
            # NOTE(review): with BATCH_SIZE == 1 the label is presumably
            # (NUMBER_OF_WORDS, VOCAB_SIZE) one-hot here -- confirm against
            # CustomDataset.
            target = torch.argmax(label.reshape(label.shape[0], -1), dim=1)

            ### backward + optimize
            loss = criterion(prediction, target)
            loss.backward()
            optimizer.step()

            ### print out statistics
            step_loss = loss.item()
            epoch_loss += step_loss
            if step % print_feq == 0:
                print('epoch:', epoch, '\tstep:', step + 1, '/', num_batches,
                      '\ttrain loss:', '{:.4f}'.format(step_loss), '\ttime:',
                      '{:.4f}'.format((time.time() - time_1) * print_feq),
                      's')

        ### save best model (lowest summed epoch loss so far)
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            torch.save(model.state_dict(), checkpoint_path)

        print("The loss for this epoch is = :", epoch_loss / num_batches)
def main(argv=None):
    """Caption one video with a trained Seq2Seq model and print the words.

    Hyper-parameters and paths are read from the module-level ``FLAGS``.
    The vocabulary is rebuilt from ``FLAGS.train_corpus``, the model weights
    are restored from ``FLAGS.load_weights``, the video at
    ``FLAGS.video_file`` is turned into model input, and words are decoded
    greedily one position at a time. The resulting list of words is printed.

    Args:
        argv: Unused by this function.
    """
    ### parameters
    NUMBER_OF_FRAMES = FLAGS.NUMBER_OF_FRAMES
    VOCAB_SIZE = FLAGS.VOCAB_SIZE
    NUMBER_OF_WORDS = FLAGS.NUMBER_OF_WORDS
    HIDDEN_SIZE = FLAGS.HIDDEN_SIZE
    INPUT_SIZE = FLAGS.INPUT_SIZE
    NUMBER_OF_LAYERS = FLAGS.NUMBER_OF_LAYERS

    # Frame preprocessing: resize to 224x224, convert to tensor, normalize
    # per channel with the given mean / std.
    tsfm = transforms.Compose([
        transforms.Resize([224, 224]),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])

    ### vocabulary
    utils = Utils()
    all_text = utils.output_text(FLAGS.train_corpus)
    text_processor = TextProcessor(freq_threshold=10)
    dictionary = text_processor.vocab_creator(all_text)

    # Reverse lookup built once instead of scanning the vocabulary on every
    # decoding step. setdefault keeps the first word seen for an index, which
    # matches a first-match linear scan over dictionary.items().
    # NOTE(review): assumes `dictionary` maps word -> integer index -- confirm
    # against TextProcessor.vocab_creator.
    index_to_word = {}
    for word, index in dictionary.items():
        index_to_word.setdefault(index, word)

    ### model definition
    encoder = Encoder_LSTM(input_size=INPUT_SIZE,
                           hidden_size=HIDDEN_SIZE,
                           num_layers=NUMBER_OF_LAYERS)
    decoder = Decoder_LSTM(input_size=VOCAB_SIZE,
                           hidden_size=HIDDEN_SIZE,
                           num_layers=NUMBER_OF_LAYERS,
                           number_of_words=NUMBER_OF_WORDS)
    model = Seq2Seq(encoder, decoder).to(device)

    ### load the trained weights
    # map_location lets a checkpoint saved on another device (e.g. a GPU) be
    # restored onto whatever `device` this run uses.
    model.load_state_dict(torch.load(FLAGS.load_weights, map_location=device))

    ### model testing
    model.eval()

    # Use the same `device` as the model rather than a hard-coded 'cuda'.
    video_pre_data = utils.video_to_frames(FLAGS.video_file,
                                           frame_number=NUMBER_OF_FRAMES,
                                           device=device,
                                           INPUT_SIZE=INPUT_SIZE,
                                           model=md.model_vgg,
                                           transform=tsfm)
    input_data = video_pre_data.unsqueeze(0).to(device)

    # Initial decoder input: one-hot index 2 at position 0 and one-hot index 1
    # at every later position.
    # NOTE(review): indices 2 and 1 are presumably the start and padding
    # tokens of the vocabulary -- confirm against TextProcessor.
    X_2 = torch.zeros([NUMBER_OF_WORDS, VOCAB_SIZE])
    X_2[:1, 2] = 1
    X_2[1:, 1] = 1
    X_2 = X_2.unsqueeze(0).to(device)

    final_sentence = []
    with torch.no_grad():
        for i in range(NUMBER_OF_WORDS - 1):
            predicted = model(input_data, X_2).squeeze(0)

            # One argmax (and one device-to-host transfer) per position.
            word_index = int(torch.argmax(predicted[i]))
            final_sentence.append(index_to_word.get(word_index))

            # Feed the prediction back as the next position's input. Clear the
            # placeholder bit first so a prediction of index 1 is kept as a
            # valid one-hot row instead of being erased.
            X_2[0, i + 1, 1] = 0
            X_2[0, i + 1, word_index] = 1

    print(final_sentence)