def main(argv=None): print("Running on {}".format(device)) parser = argparse.ArgumentParser( description="Train a transformer for a copy task" ) add_optimizer_arguments(parser) add_transformer_arguments(parser) add_auxiliary_arguments(parser) args = parser.parse_args(argv) print("args:\n-----\n", args) data_points = [] data_points_acc = [] n_of_each_model = 10 n_trials = 8 for model_type in [ 'transformer','lstm','rnn',]: #add back transformers and rnn for max_trained_depth in range(1, 12): for ii in range(n_of_each_model): print(f'dep{max_trained_depth}_ii_{ii}') if model_type == "transformer": d_model = 16 model = SequencePredictorRecurrentTransformer( d_model=d_model, n_classes=5, sequence_length=args.sequence_length, attention_type=args.attention_type, n_layers=args.n_layers, n_heads=args.n_heads, d_query=d_model, # used to be d_query dropout=args.dropout, softmax_temp=None, attention_dropout=args.attention_dropout, ) else: d_model = 8 model = SequencePredictorRNN( d_model=d_model, n_classes=5, n_layers=args.n_layers, dropout=args.dropout, rnn_type=model_type ) print(f"Created model:\n{model}") model.to(device) model.load_state_dict(torch.load(f"models_from_colab/agreement_models/model_{model_type}_depth_{max_trained_depth}_num_{ii}.zip", map_location=device)['model_state']) for test_depth in range(1, 21): # was 1, 32 stack_size = test_depth # Change this value to test longer / shorter sequences n_correct = 0 for i_trial in range(n_trials): x, y, m = SubjectVerbAgreement.get_seq(stack_size) model.eval() yhat = model(x.unsqueeze(1)) loss, acc = loss_fn(y.unsqueeze(1), yhat, m.unsqueeze(1)) n_correct += acc data_points.append({'model_type': model_type, 'max_trained_depth': max_trained_depth, 'test_depth': test_depth, 'accuracy': n_correct / n_trials}) print("data points") print(data_points) with open("data_points_pr_acc_r.txt", "wb") as fp: pickle.dump(data_points, fp) """
def main(argv=None): print("Running on {}".format(device)) parser = argparse.ArgumentParser( description="Train a transformer for a copy task" ) add_optimizer_arguments(parser) add_transformer_arguments(parser) add_auxiliary_arguments(parser) args = parser.parse_args(argv) print("args:\n-----\n", args) if args.model_type == "transformer": model = SequencePredictorRecurrentTransformer( d_model=args.d_model, n_classes=5, sequence_length=args.sequence_length, attention_type=args.attention_type, n_layers=args.n_layers, n_heads=args.n_heads, d_query=args.d_model, # used to be d_query dropout=args.dropout, softmax_temp=None, attention_dropout=args.attention_dropout, ) else: model = SequencePredictorRNN( d_model=args.d_model, n_classes=5, n_layers=args.n_layers, dropout=args.dropout, rnn_type=args.model_type ) print(f"Created model:\n{model}") model.to(device) print("Number of epochs model was trained on: ",torch.load(args.continue_from, map_location=device)['epoch']) model.load_state_dict(torch.load(args.continue_from, map_location=device)['model_state']) def format_preds(x, y, preds, mask): n = len(x) n_dig = math.floor(math.log10(n)) + 1 nums = [] for p_dig in range(n_dig): nums.append( "# |" + "".join([str((i//10**p_dig)%10) for i in range(n)]) + "\n") nums = "".join(nums[::-1]) xs = "x |" + "".join([str(int(v)) for v in x]) + "\n" ys = "y |" + "".join([elt if mask[i] == 1 else '?' for i, elt in enumerate([str(int(v)) for v in y])]) + "\n" yh = "yh|" + "".join([elt if mask[i] == 1 else '?' 
for i, elt in enumerate([str(int(v)) for v in preds])]) + "\n" return nums + xs + ys + yh acc_list = [] max_acc = None for stack_size in range(1, 64): x, y, m = SubjectVerbAgreement.get_seq(stack_size) # print(x.shape, y.shape, m.shape) model.eval() yhat = model(x.unsqueeze(1)) hdn = model.hidden_state # batch x seq x hdn loss, acc = loss_fn(y.unsqueeze(1), yhat, m.unsqueeze(1)) acc_list.append((stack_size, acc)) if acc == 1: max_acc = stack_size print("Highest perfect score at depth:", max_acc) plot_hidden_state_2d(np.array(acc_list), pca=False) stack_size = 7 # Change this value to test longer / shorter sequences x, y, m = SubjectVerbAgreement.get_seq(stack_size) model.eval() yhat = model(x.unsqueeze(1)) hdn = model.hidden_state # batch x seq x hdn loss, acc = loss_fn(y.unsqueeze(1), yhat, m.unsqueeze(1)) print("Model loss: ", loss) print("Model accuracy: ", acc) print(format_preds(x, y, torch.argmax(yhat, dim=2)[0], m)) plot_hidden_state_2d(hdn[0].detach().cpu().numpy(), pca=True) """
def main(argv=None):
    """Sweep-train agreement models.

    Trains 10 runs of each architecture (rnn / lstm / transformer) at every
    max depth 1-11, saving a run to Google Drive once it reaches 95% test
    accuracy.  Runs that already have a saved file are skipped.
    """
    # Choose a device and move everything there
    print("Running on {}".format(device))
    parser = argparse.ArgumentParser(
        description="Train a transformer for an agreement task"
    )
    add_optimizer_arguments(parser)
    add_transformer_arguments(parser)
    add_auxiliary_arguments(parser)
    args = parser.parse_args(argv)
    print("args:\n-----\n", args)

    # Make the dataset and the model
    for model_type in ['rnn', 'lstm', 'transformer']:
        for max_depth in range(1, 12):
            ii = 0
            # NOTE(review): if a run never reaches 95% accuracy, ii is not
            # incremented and the same slot is retried with a fresh model.
            while ii < 10:
                print(max_depth, ii)
                # skip existing models
                if os.path.isfile(
                        "/content/drive/My Drive/final_project_material/agreement_models/model_"
                        + model_type + "_depth_" + str(max_depth) + "_num_" + str(ii)):
                    ii += 1
                    continue

                train_set = SubjectVerbAgreement(2*max_depth+1, max_depth=max_depth)
                test_set = SubjectVerbAgreement(2*max_depth+1, max_depth=max_depth)
                # NOTE(review): batch_size here is a module-level global, not args.batch_size.
                train_loader = DataLoader(
                    train_set, batch_size=batch_size, pin_memory=device=="cuda"
                )
                test_loader = DataLoader(
                    test_set, batch_size=batch_size, pin_memory=device=="cuda"
                )

                if model_type == "transformer":
                    d_model = 16
                    model = SequencePredictorRecurrentTransformer(
                        d_model=d_model,
                        n_classes=5,
                        sequence_length=args.sequence_length,
                        attention_type=args.attention_type,
                        n_layers=args.n_layers,
                        n_heads=args.n_heads,
                        d_query=d_model,  # used to be d_query
                        dropout=args.dropout,
                        softmax_temp=None,
                        attention_dropout=args.attention_dropout,
                    )
                else:
                    d_model = 8
                    model = SequencePredictorRNN(
                        d_model=d_model,
                        n_classes=5,
                        n_layers=args.n_layers,
                        dropout=args.dropout,
                        rnn_type=model_type
                    )
                print(f"Created model:\n{model}")
                model.to(device)

                # Start training
                optimizer = get_optimizer(model.parameters(), args)
                start_epoch = 1
                if args.continue_from:
                    start_epoch = load_model(
                        args.continue_from, model, optimizer, device
                    )
                # Drop the learning rate by 10x at args.reduce_lr_at.
                lr_schedule = torch.optim.lr_scheduler.LambdaLR(
                    optimizer,
                    lambda e: 1. if e < args.reduce_lr_at else 0.1
                )
                for e in range(start_epoch, args.epochs+1):
                    print('Epoch:', e)
                    print('Training...')
                    train(model, optimizer, train_loader, device)
                    print('Evaluating...')
                    acc = evaluate(model, test_loader, device, return_accuracy=True)
                    lr_schedule.step()
                    if e == 100:
                        # Hard cap: give up on this attempt after 100 epochs.
                        break
                    if acc >= 0.95:
                        save_model(
                            "/content/drive/My Drive/final_project_material/agreement_models/model_"
                            + model_type + "_depth_" + str(max_depth) + "_num_" + str(ii),
                            model, optimizer, e)
                        ii += 1
                        break
def main(argv=None):
    """Train a transformer image generator, optionally resuming and yielding."""
    parser = argparse.ArgumentParser(
        description="Train a transformer to generate images")
    add_transformer_arguments(parser)
    add_optimizer_arguments(parser)
    add_dataset_arguments(parser)
    parser.add_argument("--mixtures", type=int, default=10,
                        help="How many logistics to use to model the output")
    parser.add_argument("--iterations", type=int, default=100,
                        help="How many iterations to train for")
    parser.add_argument("--batch_size", type=int, default=4,
                        help="How many samples to use together")
    parser.add_argument("--save_to", default=None,
                        help="Set a file to save the models to.")
    parser.add_argument("--continue_from", default=None,
                        help="Load the model from a file")
    parser.add_argument("--save_frequency", default=3000, type=int,
                        help="Save every that many steps")
    parser.add_argument(
        "--evaluate_frequency", default=3000, type=int,
        help="Evaluate on the test set after that many iterations")
    parser.add_argument(
        "--yield_frequency", default=10**9, type=int,
        help="Stop after that many iterations so that other jobs can run")
    args = parser.parse_args(argv)
    print_transformer_arguments(args)

    # Make the dataset and the model
    train_set, test_set = get_dataset(args)
    model = ImageGenerator(args.d_query * args.n_heads,
                           train_set.sequence_length,
                           args.mixtures,
                           attention_type=args.attention_type,
                           n_layers=args.n_layers,
                           n_heads=args.n_heads,
                           d_query=args.d_query,
                           dropout=args.dropout,
                           softmax_temp=None,
                           attention_dropout=args.attention_dropout,
                           bits=args.bits,
                           rounds=args.rounds,
                           chunk_size=args.chunk_size,
                           masked=True)

    # Choose a device and move everything there
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Running on {}".format(device))
    model.to(device)

    # Start training
    train_loader = DataLoader(train_set, batch_size=args.batch_size,
                              pin_memory=device == "cuda")
    test_loader = DataLoader(test_set, batch_size=args.batch_size,
                             pin_memory=device == "cuda")
    optimizer = get_optimizer(model.parameters(), args)
    iteration = 0
    if args.continue_from:
        iteration = load_model(args.continue_from, model, optimizer, device)
        # NOTE(review): original indentation lost — assumed the lr reset
        # belongs inside the resume branch; confirm against upstream source.
        optimizer.set_lr(args.lr)
    callbacks = callback_chain(
        saver(args.save_frequency, args.save_to, model, optimizer),
        evaluator(args.evaluate_frequency, model, test_loader, device),
        stopper(args.yield_frequency))
    yielded = train(model, optimizer, train_loader, iteration,
                    args.iterations, callbacks, device)

    # Non-zero exit code to notify the process watcher that we yielded
    if yielded:
        sys.exit(1)
def main(argv=None):
    """Train a sequence predictor on the copy task with periodic checkpointing."""
    parser = argparse.ArgumentParser(
        description="Train a transformer for a copy task")
    add_optimizer_arguments(parser)
    add_transformer_arguments(parser)
    parser.add_argument("--sequence_length", type=int, default=128,
                        help="Set the maximum sequence length")
    parser.add_argument("--n_classes", type=int, default=10,
                        help="Set the number of classes")
    parser.add_argument("--epochs", type=int, default=100,
                        help="How many epochs to train for")
    parser.add_argument("--batch_size", type=int, default=64,
                        help="How many samples to use together")
    parser.add_argument("--reduce_lr_at", type=int, default=30,
                        help="At this epoch divide the lr by 10")
    parser.add_argument("--save_to", default=None,
                        help="Set a file to save the models to.")
    parser.add_argument("--continue_from", default=None,
                        help="Load the model from a file")
    parser.add_argument("--save_frequency", default=1, type=int,
                        help="Save every that many epochs")
    args = parser.parse_args(argv)
    print_transformer_arguments(args)

    # Make the dataset and the model
    train_set = CopyTask(args.sequence_length, args.n_classes)
    test_set = CopyTask(args.sequence_length, args.n_classes)
    model = SequencePredictor(args.d_query * args.n_heads,
                              args.sequence_length,
                              args.n_classes,
                              attention_type=args.attention_type,
                              n_layers=args.n_layers,
                              n_heads=args.n_heads,
                              d_query=args.d_query,
                              dropout=args.dropout,
                              softmax_temp=None,
                              attention_dropout=args.attention_dropout,
                              bits=args.bits,
                              rounds=args.rounds,
                              chunk_size=args.chunk_size,
                              masked=args.masked)

    # Choose a device and move everything there
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Running on {}".format(device))
    model.to(device)

    # Start training
    train_loader = DataLoader(train_set, batch_size=args.batch_size,
                              pin_memory=device == "cuda")
    test_loader = DataLoader(test_set, batch_size=args.batch_size,
                             pin_memory=device == "cuda")
    optimizer = get_optimizer(model.parameters(), args)
    start_epoch = 1
    if args.continue_from:
        start_epoch = load_model(args.continue_from, model, optimizer, device)
    # 10x lr drop once args.reduce_lr_at is reached.
    lr_schedule = torch.optim.lr_scheduler.LambdaLR(
        optimizer,
        lambda e: 1. if e < args.reduce_lr_at else 0.1)
    for e in range(start_epoch, args.epochs + 1):
        train(model, optimizer, train_loader, device)
        evaluate(model, test_loader, device)
        if (e % args.save_frequency) == 0 and args.save_to:
            save_model(args.save_to, model, optimizer, e)
        lr_schedule.step()
def main(argv=None): print("Running on {}".format(device)) parser = argparse.ArgumentParser( description="Train a transformer for a copy task") add_optimizer_arguments(parser) add_transformer_arguments(parser) add_auxiliary_arguments(parser) args = parser.parse_args(argv) print("args:\n-----\n", args) data_points = [] for model_type in ['rnn', 'lstm', 'transformer']: for max_trained_depth in range(1, 12): for test_depth in range(1, 21): for ii in range(10): if model_type == "transformer": model = SequencePredictorRecurrentTransformer( d_model=16, n_classes=5, sequence_length=args.sequence_length, attention_type=args.attention_type, n_layers=args.n_layers, n_heads=args.n_heads, d_query=8, # used to be d_query dropout=args.dropout, softmax_temp=None, attention_dropout=args.attention_dropout, ) else: model = SequencePredictorRNN( d_model=8 if model_type == 'lstm' else 8, n_classes=5, n_layers=args.n_layers, dropout=args.dropout, rnn_type=model_type) print(f"Created model:\n{model}") model.to(device) model_name = "models_from_colab/agreement_models/model_" + model_type + "_depth_" + str( max_trained_depth) + "_num_" + str(ii) + ".zip" model.load_state_dict( torch.load(model_name, map_location=device)['model_state']) stack_size = test_depth x, y, m = SubjectVerbAgreement.get_seq(stack_size) model.eval() yhat = model(x.unsqueeze(1)) hdn = model.hidden_state # batch x seq x hdn loss, acc = loss_fn(y.unsqueeze(1), yhat, m.unsqueeze(1)) data_points.append({ 'model_type': model_type, 'max_trained_depth': max_trained_depth, 'test_depth': test_depth, 'accuracy': acc }) print("data points:") print(data_points) with open("data_points_sva.txt", "wb") as fp: pickle.dump(data_points, fp) """
def main(argv=None):
    """Train a model on CountTaskWithEOS and optionally plot hidden states.

    Builds the dataset at a fixed max depth of 12, trains for args.epochs with
    a 10x lr drop at args.reduce_lr_at, checkpoints every args.save_frequency
    epochs, then (with --plot_hidden) runs one sequence and plots a PCA of the
    first 10 hidden states.
    """
    # Choose a device and move everything there
    print("Running on {}".format(device))
    parser = argparse.ArgumentParser(
        description="Train a transformer for a copy task")
    add_optimizer_arguments(parser)
    add_transformer_arguments(parser)
    add_auxiliary_arguments(parser)
    args = parser.parse_args(argv)
    print("args:\n-----\n", args)

    # Make the dataset and the model
    max_depth = 12
    train_set = CountTaskWithEOS(max_depth * 2 + 1, max_depth=max_depth)
    test_set = CountTaskWithEOS(max_depth * 2 + 1, max_depth=max_depth)
    train_loader = DataLoader(train_set,
                              batch_size=args.batch_size,
                              pin_memory=device == "cuda")
    test_loader = DataLoader(test_set,
                             batch_size=args.batch_size,
                             pin_memory=device == "cuda")
    if args.model_type == "transformer":
        model = SequencePredictorRecurrentTransformer(
            d_model=args.d_model,
            n_classes=args.n_classes,
            sequence_length=args.sequence_length,
            attention_type=args.attention_type,
            n_layers=args.n_layers,
            n_heads=args.n_heads,
            d_query=args.d_model,  # used to be d_query
            dropout=args.dropout,
            softmax_temp=None,
            attention_dropout=args.attention_dropout,
        )
    else:
        model = SequencePredictorRNN(d_model=args.d_model,
                                     n_classes=args.n_classes,
                                     n_layers=args.n_layers,
                                     dropout=args.dropout,
                                     rnn_type=args.model_type)
    print(f"Created model:\n{model}")
    model.to(device)

    # Start training
    optimizer = get_optimizer(model.parameters(), args)
    start_epoch = 1
    if args.continue_from:
        start_epoch = load_model(args.continue_from, model, optimizer, device)
    lr_schedule = torch.optim.lr_scheduler.LambdaLR(
        optimizer,
        lambda e: 1. if e < args.reduce_lr_at else 0.1)
    for e in range(start_epoch, args.epochs + 1):
        train(model, optimizer, train_loader, device)
        print('Epoch:', e)
        evaluate(model, test_loader, device)
        if (e % args.save_frequency) == 0 and args.save_to:
            save_model(args.save_to, model, optimizer, e)
        lr_schedule.step()

    if args.plot_hidden:
        x, y, m = next(test_set)  # x is 1d of length sequence_len
        # BUG FIX: Tensor.to() is NOT in-place — the original called
        # x.to(device) etc. and discarded the results.  Bind them.
        x = x.to(device)
        y = y.to(device)
        m = m.to(device)
        model.eval()
        yhat = model(x.unsqueeze(1).to(device))
        hdn = model.hidden_state  # batch x seq x hdn
        max_len = 10
        print("Plotting on: ", x[:max_len])
        plot_hidden_state_2d(hdn[0, :max_len, :].detach().cpu().numpy(),
                             pca=True)
def main(argv):
    """Generate (part of) an image from a pretrained or random model and
    plot/save the prediction next to the ground truth."""
    parser = argparse.ArgumentParser(
        description="Generate an image from a pretrained model")
    add_dataset_arguments(parser)
    add_transformer_arguments(parser)
    parser.add_argument(
        "model",
        help="The path to the model (give '-' for random intialization)")
    parser.add_argument("--mixtures", type=int, default=10,
                        help="How many logistics to use to model the output")
    parser.add_argument("--plot", action="store_true",
                        help="Plot the generated image")
    parser.add_argument("--save_image",
                        help="Path to save an image to")
    parser.add_argument("--image_shape",
                        type=lambda x: tuple(int(xi) for xi in x.split(",")),
                        default=(28, 28),
                        help="Reshape the prediction to plot it")
    parser.add_argument("--index", type=index_type, default=[0],
                        help="Choose the index from the dataset")
    parser.add_argument("--offset", type=int, default=300,
                        help="Choose the offset in the image")
    parser.add_argument("--training_set", action="store_true",
                        help="Predict from the training set")
    parser.add_argument("--load_pytorch", action="store_true",
                        help="Load old pytorch model")
    parser.add_argument("--force_cpu", action="store_true",
                        help="Set the device to cpu")
    parser.add_argument("--recurrent", action="store_true",
                        help="Use a recurrent model for inference")
    args = parser.parse_args(argv)
    print_transformer_arguments(args)

    # Choose a device to run on
    device = ("cuda" if torch.cuda.is_available() and not args.force_cpu
              else "cpu")

    # Get the dataset and build the model (both constructors take the same
    # arguments, so dispatch on the class).
    train_set, test_set = get_dataset(args)
    generator_cls = RecurrentGenerator if args.recurrent else ImageGenerator
    model = generator_cls(args.d_query * args.n_heads,
                          train_set.sequence_length,
                          args.mixtures,
                          attention_type=args.attention_type,
                          n_layers=args.n_layers,
                          n_heads=args.n_heads,
                          d_query=args.d_query,
                          dropout=args.dropout,
                          softmax_temp=None,
                          attention_dropout=args.attention_dropout,
                          bits=args.bits,
                          rounds=args.rounds,
                          chunk_size=args.chunk_size,
                          masked=True)

    # Gather the images
    images = collect_batch(train_set if args.training_set else test_set,
                           args.index, device)

    # Load the model ('-' means keep the random initialization)
    if args.model != "-":
        if args.load_pytorch:
            load_model_pytorch(args.model, model, None, device)
        else:
            load_model(args.model, model, None, device)
    model.to(device)
    model.eval()

    # Do the (timed) predictions
    timer = Timer()
    if args.recurrent:
        pred_images = predict_with_recurrent(model, images, args.offset)
    else:
        pred_images = predict(model, images, args.offset)
    print("Elapsed time:", timer.measure())

    # Plot or save the images
    if args.plot:
        print(pred_images)
        pred_images = pred_images.cpu()
        images = images.cpu()
        plt.figure()
        plt.imshow(pred_images[0].reshape(*args.image_shape))
        plt.figure()
        plt.imshow(np.hstack([images[0], 0]).reshape(*args.image_shape))
        plt.show()
    if args.save_image:
        pred_images = pred_images.cpu()
        images = images.cpu()
        for i in range(len(images)):
            imwrite(args.save_image.format("pred", i),
                    pred_images[i].reshape(*args.image_shape))
            imwrite(args.save_image.format("real", i),
                    np.hstack([images[i], 0]).reshape(*args.image_shape))