# Standard-library and third-party imports used by both training functions.
# Project-local names (get_props, get_loaders, get_feat_dict, save_plot,
# masked_cross_entropy, ResNet, BasicBlock, Bottleneck, the densenet*
# constructors, feat_nums, e_sizes, num_loads, and epochs_per_load) are
# assumed to be defined or imported elsewhere in this module.
import os
import pickle
import random
import statistics
import time

import numpy as np
import psutil
import torch
import torch.nn as nn
from sklearn.metrics import roc_auc_score


def train(working_dir, grid_size, learning_rate, batch_size, num_walks,
          model_type, fn):
    train_props, val_props, test_props = get_props(working_dir, dtype=np.float32)
    means_stds = np.loadtxt(working_dir + "/means_stds.csv", dtype=np.float32,
                            delimiter=',')

    # Filter out redundant qm8 properties.
    if train_props.shape[1] == 16:
        filtered_labels = list(range(0, 8)) + list(range(12, 16))
        train_props = train_props[:, filtered_labels]
        val_props = val_props[:, filtered_labels]
        test_props = test_props[:, filtered_labels]
        means_stds = means_stds[:, filtered_labels]

    if model_type == "resnet18":
        model = ResNet(BasicBlock, [2, 2, 2, 2], grid_size, "regression",
                       feat_nums, e_sizes, num_classes=train_props.shape[1])
    elif model_type == "resnet34":
        model = ResNet(BasicBlock, [3, 4, 6, 3], grid_size, "regression",
                       feat_nums, e_sizes, num_classes=train_props.shape[1])
    elif model_type == "resnet50":
        model = ResNet(Bottleneck, [3, 4, 6, 3], grid_size, "regression",
                       feat_nums, e_sizes, num_classes=train_props.shape[1])
    elif model_type == "densenet121":
        model = densenet121(grid_size, "regression", feat_nums, e_sizes,
                            num_classes=train_props.shape[1])
    elif model_type == "densenet161":
        model = densenet161(grid_size, "regression", feat_nums, e_sizes,
                            num_classes=train_props.shape[1])
    elif model_type == "densenet169":
        model = densenet169(grid_size, "regression", feat_nums, e_sizes,
                            num_classes=train_props.shape[1])
    elif model_type == "densenet201":
        model = densenet201(grid_size, "regression", feat_nums, e_sizes,
                            num_classes=train_props.shape[1])
    else:
        print("specify a valid model")
        return

    model.float()
    model.cuda()

    # Per-item losses (reduction='none'): MSE for training, MAE for reporting.
    loss_function_train = nn.MSELoss(reduction='none')
    loss_function_val = nn.L1Loss(reduction='none')
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # if model_type[0] == "r":
    #     batch_size = 128
    #     optimizer = torch.optim.SGD(model.parameters(), lr=0.1,
    #                                 momentum=0.9, weight_decay=5e-4,
    #                                 nesterov=True)
    # elif model_type[0] == "d":
    #     batch_size = 512
    #     optimizer = torch.optim.SGD(model.parameters(), lr=0.1,
    #                                 momentum=0.9, weight_decay=1e-4,
    #                                 nesterov=True)
    # else:
    #     print("specify a valid model")
    #     return

    # Row 0 of means_stds.csv holds the per-property means, row 1 the stds.
    stds = means_stds[1, :]

    log_file = open(fn + ".txt", "w")
    log_file.write("start\n")
    log_file.flush()

    for file_num in range(num_loads):
        # Checkpoint the model every 20 loads.
        if file_num % 20 == 0:
            model_file = open("../../scratch/" + fn + ".pkl", "wb")
            pickle.dump(model, model_file)
            model_file.close()
        log_file.write("load: " + str(file_num) + "\n")
        print("load: " + str(file_num))

        # Get a new set of random walks.
        if file_num == 0:
            t = time.time()
            train_loader, val_loader, test_loader = get_loaders(
                working_dir, file_num, grid_size, batch_size, train_props,
                val_props=val_props, test_props=test_props)
            print("load time")
            print(time.time() - t)
        else:
            file_num = random.randint(0, num_walks - 1)
            t = time.time()
            train_loader, _, _ = get_loaders(
                working_dir, file_num, grid_size, batch_size, train_props)
            print("load time")
            print(time.time() - t)

        # Train on this set of random walks; multiple epochs are possible.
        for epoch in range(epochs_per_load):
            model.train()
            t = time.time()
            train_loss_list = []
            train_mae_loss_list = []
            for walks_int, walks_float, props in train_loader:
                walks_int = walks_int.cuda().long()
                walks_float = walks_float.cuda().float()
                props = props.cuda()

                outputs = model(walks_int, walks_float)

                # Per-label losses, averaged over the batch.
                loss_mae = torch.mean(loss_function_val(props, outputs), 0)
                train_mae_loss_list.append(loss_mae.cpu().detach().numpy())
                loss = torch.mean(loss_function_train(props, outputs), 0)
                train_loss_list.append(loss.cpu().detach().numpy())

                # Loss converted to a single value for backpropagation.
                loss = torch.sum(loss)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            model.eval()
            val_loss_list = []
            with torch.no_grad():
                for walks_int, walks_float, props in val_loader:
                    walks_int = walks_int.cuda().long()
                    walks_float = walks_float.cuda().float()
                    props = props.cuda()

                    outputs = model(walks_int, walks_float)

                    # Per-item losses for each label.
                    loss = loss_function_val(props, outputs)
                    val_loss_list.append(loss.cpu().detach().numpy())

            # The ith row of these arrays holds the per-label losses of batch i.
            train_loss_arr = np.array(train_loss_list)
            train_mae_arr = np.array(train_mae_loss_list)

            log_file.write("training mse loss\n")
            log_file.write(str(np.mean(train_loss_arr)) + "\n")
            log_file.write("training mae loss\n")
            log_file.write(str(np.mean(train_mae_arr)) + "\n")
            print("training mse loss")
            print(np.mean(train_loss_arr))
            print("training mae loss")
            print(np.mean(train_mae_arr))

            val_loss_arr = np.concatenate(val_loss_list, 0)
            val_loss = np.mean(val_loss_arr, 0)
            log_file.write("val loss\n")
            log_file.write(str(np.mean(val_loss_arr)) + "\n")
            print("val loss")
            print(np.mean(val_loss_arr))

            tnl = np.mean(train_mae_arr, 0)
            log_file.write("train normalized losses\n")
            log_file.write(" ".join(map(str, tnl)) + "\n")
            print("train normalized losses")
            print(" ".join(map(str, tnl)))

            log_file.write("val normalized losses\n")
            log_file.write(" ".join(map(str, val_loss)) + "\n")
            print("val normalized losses")
            print(" ".join(map(str, val_loss)))

            # Unnormalized losses (normalized MAE scaled back by the property
            # stds) are for comparison with published results.
            tunl = stds * tnl
            log_file.write("train unnormalized losses\n")
            log_file.write(" ".join(map(str, tunl)) + "\n")
            print("train unnormalized losses")
            print(" ".join(map(str, tunl)))

            vunl = stds * val_loss
            log_file.write("val unnormalized losses\n")
            log_file.write(" ".join(map(str, vunl)) + "\n")
            log_file.write("\n")
            print("val unnormalized losses")
            print(" ".join(map(str, vunl)))
            print("\n")

            print("time")
            print(time.time() - t)

        log_file.flush()

    log_file.close()
    return model
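# A minimal sketch (an assumption, not from the original code) of how
# means_stds.csv is laid out: train() reads row 1 as the per-property
# standard deviations, so row 0 presumably holds the means, and the property
# columns are z-score normalized, which is why multiplying a normalized MAE
# by the std recovers the error in the property's original units. The helper
# name write_means_stds is hypothetical.
def write_means_stds(train_props, path):
    # Row 0: per-column means; row 1: per-column standard deviations.
    means_stds = np.stack([train_props.mean(axis=0), train_props.std(axis=0)])
    np.savetxt(path, means_stds, delimiter=',')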
# Classification variant; presumably this lives in a separate module from the
# regression train() above, since the two share a name.
def train(working_dir, grid_size, learning_rate, batch_size, num_cores):
    process = psutil.Process(os.getpid())
    print(process.memory_info().rss / 1024 / 1024 / 1024)

    train_feat_dict = get_feat_dict(working_dir + "/train_smiles.csv")
    val_feat_dict = get_feat_dict(working_dir + "/val_smiles.csv")
    test_feat_dict = get_feat_dict(working_dir + "/test_smiles.csv")

    # The feature dictionaries take about 0.08 GB.
    process = psutil.Process(os.getpid())
    print("pre model")
    print(process.memory_info().rss / 1024 / 1024 / 1024)

    torch.set_default_dtype(torch.float64)

    train_props, val_props, test_props = get_props(working_dir, dtype=int)
    print("pre model post props")
    print(process.memory_info().rss / 1024 / 1024 / 1024)

    model = ResNet(BasicBlock, [2, 2, 2, 2], grid_size, "classification",
                   feat_nums, e_sizes, num_classes=train_props.shape[1])
    model.float()
    model.cuda()

    print("model params")
    pytorch_total_params = sum(p.numel() for p in model.parameters())
    print(pytorch_total_params)
    model.cpu()

    print("model")
    print(process.memory_info().rss / 1024 / 1024 / 1024)

    loss_function = masked_cross_entropy
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    tl_list = []
    vl_list = []
    tmra_list = []
    vmra_list = []

    for file_num in range(num_loads):
        # Get a new set of random walks.
        if file_num == 0:
            print("before get_loaders")
            process = psutil.Process(os.getpid())
            print(process.memory_info().rss / 1024 / 1024 / 1024)
            train_loader, val_loader, test_loader = get_loaders(
                num_cores, working_dir, file_num, grid_size, batch_size,
                train_props, train_feat_dict,
                val_props=val_props, val_feat_dict=val_feat_dict,
                test_props=test_props, test_feat_dict=test_feat_dict)
        else:
            print("before get_loaders 2")
            process = psutil.Process(os.getpid())
            print(process.memory_info().rss / 1024 / 1024 / 1024)
            train_loader, _, _ = get_loaders(
                num_cores, working_dir, file_num, grid_size, batch_size,
                train_props, train_feat_dict)

        # Train on a single set of random walks; multiple epochs are possible.
        for epoch in range(epochs_per_load):
            model.train()
            model.cuda()
            t = time.time()
            train_loss_list = []
            props_list = []
            outputs_list = []
            for walks_int, walks_float, props in train_loader:
                walks_int = walks_int.cuda().long()
                walks_float = walks_float.cuda().float()
                props = props.cuda().long()
                props_list.append(props)

                outputs = model(walks_int, walks_float)
                outputs_list.append(outputs)

                loss = loss_function(props, outputs)
                train_loss_list.append(loss.item())

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            props = torch.cat(props_list, 0).cpu().numpy()
            outputs = torch.cat(outputs_list, 0).detach().cpu().numpy()

            # Per-task train ROC AUC; label value 2 marks a missing measurement.
            train_rocaucs = []
            for i in range(props.shape[1]):
                mask = props[:, i] != 2
                train_rocaucs.append(roc_auc_score(props[mask, i],
                                                   outputs[mask, i]))

            model.eval()
            with torch.no_grad():
                # The whole validation set is evaluated in one forward pass.
                ds = val_loader.dataset
                walks_int = ds.int_feat_tensor.cuda().long()
                walks_float = ds.float_feat_tensor.cuda().float()
                props = ds.prop_tensor.cuda()

                outputs = model(walks_int, walks_float)
                loss = loss_function(props, outputs)

                props = props.cpu().numpy()
                outputs = outputs.cpu().numpy()

            val_rocaucs = []
            for i in range(props.shape[1]):
                mask = props[:, i] != 2
                val_rocaucs.append(roc_auc_score(props[mask, i],
                                                 outputs[mask, i]))

            print("load: " + str(file_num) + ", epoch: " + str(epoch))
            print("training loss")
            # Slightly approximate, since the last batch can be smaller.
            tl = statistics.mean(train_loss_list)
            print(tl)
            print("val loss")
            vl = loss.item()
            print(vl)
            print("train mean roc auc")
            tmra = sum(train_rocaucs) / len(train_rocaucs)
            print(tmra)
            print("val mean roc auc")
            vmra = sum(val_rocaucs) / len(val_rocaucs)
            print(vmra)
            print("time")
            print(time.time() - t)

            tl_list.append(tl)
            vl_list.append(vl)
            tmra_list.append(tmra)
            vmra_list.append(vmra)

            model.cpu()

        del train_loader

    save_plot(tl_list, vl_list, tmra_list, vmra_list)
    return model