def pickle_db_entries(
    filename="~/Applications/db_access/mol_builder/database_n200.pkl",
):
    """
    Query 200 entries from the `mol_builder` collection and pickle them.

    Args:
        filename (str): path of the pickle file the queried entries are
            written to.
    """
    # To dump the `smd` collection instead, change `db_collection` to "smd".
    query = {"db_collection": "mol_builder", "num_entries": 200}
    queried = DatabaseOperation.query_db_entries(**query)
    pickle_dump(queried, filename)
def pickle_molecules(outname,
                     num_entries=500,
                     db_collection="mol_builder",
                     db_file=None):
    """
    Query database entries, convert them to molecules, filter, and pickle.

    The queried entries are converted with `DatabaseOperation.to_molecules`,
    filtered with `DatabaseOperation.filter_molecules` (connectivity and
    isomorphism checks enabled), and the surviving molecules are written to
    `outname` with `pickle_dump`.

    Args:
        outname (str): path of the pickle file to write the molecules to.
        num_entries (int): number of entries to request from the database.
        db_collection (str): collection to query. When `db_file` is None, it
            must be "mol_builder" or "task" so that a default credential file
            can be selected.
        db_file (str): path to the database file passed to
            `DatabaseOperation.query_db_entries`. If None, a default chosen by
            `db_collection` is used.

    Raises:
        ValueError: if `db_file` is None and `db_collection` has no default
            database file.
    """
    if db_file is None:
        # NOTE: the defaults are machine-specific absolute paths; pass
        # `db_file` explicitly when running anywhere else.
        if db_collection == "mol_builder":
            db_file = "/Users/mjwen/Applications/db_access/sam_db/sam_db_mol_builder.json"
        elif db_collection == "task":
            db_file = "/Users/mjwen/Applications/db_access/sam_db/sam_db_tasks.json"
        else:
            # ValueError (a subclass of Exception) identifies the problem as a
            # bad argument while still being caught by `except Exception`.
            raise ValueError(
                "Unrecognized db_collection = {}".format(db_collection))

    entries = DatabaseOperation.query_db_entries(
        db_collection=db_collection,
        db_file=db_file,
        num_entries=num_entries,
    )
    mols = DatabaseOperation.to_molecules(entries, db_collection=db_collection)
    mols = DatabaseOperation.filter_molecules(mols,
                                              connectivity=True,
                                              isomorphism=True)
    pickle_dump(mols, outname)
def main_worker(gpu, world_size, args):
    """
    Build the dataset and model, train with early stopping, and report the
    test score, for one process.

    In distributed mode this function runs once per process; `gpu` is used
    both as the device index and as the process rank. Only the main process
    (the single process, or the one with `gpu == 0` when distributed) prints
    progress and writes the dataset state dict, the run arguments and the
    checkpoints.

    Args:
        gpu: device index for this process, stored as `args.gpu`. If None,
            the model is not moved to a device and checkpoints are loaded
            without a `map_location`.
        world_size (int): number of processes, forwarded to
            `dist.init_process_group` when `args.distributed` is set.
        args: namespace of run options. It is read for the input files, the
            data loader and model hyperparameters, the optimizer settings,
            `start_epoch`, `epochs`, `restore`, `distributed` and
            `output_file`. It is also modified: `gpu`, `feature_size` and
            `set2set_ntypes_direct` are set, and `start_epoch` is overwritten
            when a checkpoint is restored.

    Note:
        Uses the module-level global `best` (lowest validation score so far);
        it must be defined before this function is called.
    """
    global best

    args.gpu = gpu

    # True for the single process, or for rank 0 (gpu 0) in distributed mode.
    # Neither `args.distributed` nor `args.gpu` changes below, so evaluating
    # it once is equivalent to re-testing the condition at every use.
    is_main_process = not args.distributed or args.gpu == 0

    if is_main_process:
        print("\n\nStart training at:", datetime.now())

    if args.distributed:
        dist.init_process_group(
            args.dist_backend,
            init_method=args.dist_url,
            world_size=world_size,
            rank=args.gpu,
        )

    # Explicitly setting seed to ensure the same dataset split and models created in
    # two processes (when distributed) start from the same random weights and biases
    seed_torch()

    if args.restore:
        dataset_state_dict_filename = args.dataset_state_dict_filename

        if dataset_state_dict_filename is None:
            warnings.warn(
                "Restore with `args.dataset_state_dict_filename` set to None.")
        elif not Path(dataset_state_dict_filename).exists():
            warnings.warn(f"`{dataset_state_dict_filename}` not found; set "
                          "`args.dataset_state_dict_filename` to None")
            dataset_state_dict_filename = None
    else:
        dataset_state_dict_filename = None

    # convert reactions in csv file to atom mapped label file if necessary
    mols, attrs, labels = read_input_files(args.molecule_file,
                                           args.molecule_attributes_file,
                                           args.reaction_file)
    dataset = ReactionNetworkDataset(
        grapher=get_grapher(),
        molecules=mols,
        labels=labels,
        extra_features=attrs,
        feature_transformer=True,
        label_transformer=True,
        state_dict_filename=dataset_state_dict_filename,
    )

    trainset, valset, testset = train_validation_test_split(dataset,
                                                            validation=0.1,
                                                            test=0.1)

    if is_main_process:
        if args.dataset_state_dict_filename is None:
            # torch.save cannot write to None; skip instead of crashing.
            warnings.warn("`args.dataset_state_dict_filename` is None; "
                          "dataset state dict is not saved.")
        else:
            torch.save(dataset.state_dict(), args.dataset_state_dict_filename)
        print("Trainset size: {}, valset size: {}: testset size: {}.".format(
            len(trainset), len(valset), len(testset)))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            trainset)
    else:
        train_sampler = None

    train_loader = DataLoaderReactionNetwork(
        trainset,
        batch_size=args.batch_size,
        shuffle=(train_sampler is None),
        sampler=train_sampler,
    )
    # larger val and test set batch_size is faster but needs more memory
    # adjust the batch size to fit memory
    bs = max(len(valset) // 10, 1)
    val_loader = DataLoaderReactionNetwork(valset,
                                           batch_size=bs,
                                           shuffle=False)
    bs = max(len(testset) // 10, 1)
    test_loader = DataLoaderReactionNetwork(testset,
                                            batch_size=bs,
                                            shuffle=False)

    ### model

    feature_names = ["atom", "bond", "global"]
    set2set_ntypes_direct = ["global"]
    feature_size = dataset.feature_size

    args.feature_size = feature_size
    args.set2set_ntypes_direct = set2set_ntypes_direct

    # save args
    if is_main_process:
        yaml_dump(args, "train_args.yaml")

    model = GatedGCNReactionNetwork(
        in_feats=args.feature_size,
        embedding_size=args.embedding_size,
        gated_num_layers=args.gated_num_layers,
        gated_hidden_size=args.gated_hidden_size,
        gated_num_fc_layers=args.gated_num_fc_layers,
        gated_graph_norm=args.gated_graph_norm,
        gated_batch_norm=args.gated_batch_norm,
        gated_activation=args.gated_activation,
        gated_residual=args.gated_residual,
        gated_dropout=args.gated_dropout,
        num_lstm_iters=args.num_lstm_iters,
        num_lstm_layers=args.num_lstm_layers,
        set2set_ntypes_direct=args.set2set_ntypes_direct,
        fc_num_layers=args.fc_num_layers,
        fc_hidden_size=args.fc_hidden_size,
        fc_batch_norm=args.fc_batch_norm,
        fc_activation=args.fc_activation,
        fc_dropout=args.fc_dropout,
        outdim=1,
        conv="GatedGCNConv",
    )

    if is_main_process:
        print(model)

    if args.gpu is not None:
        model.to(args.gpu)
    if args.distributed:
        ddp_model = DDP(model, device_ids=[args.gpu])
        # expose the wrapped model's method on the DDP wrapper so callers can
        # keep using `model.feature_before_fc`
        ddp_model.feature_before_fc = model.feature_before_fc
        model = ddp_model

    ### optimizer, loss, and metric
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)

    loss_func = MSELoss(reduction="mean")
    metric = WeightedL1Loss(reduction="sum")

    ### learning rate scheduler and stopper
    scheduler = ReduceLROnPlateau(optimizer,
                                  mode="min",
                                  factor=0.4,
                                  patience=50,
                                  verbose=True)
    stopper = EarlyStopping(patience=150)

    # load checkpoint
    state_dict_objs = {
        "model": model,
        "optimizer": optimizer,
        "scheduler": scheduler
    }
    if args.restore:
        try:
            if args.gpu is None:
                checkpoint = load_checkpoints(state_dict_objs,
                                              filename="checkpoint.pkl")
            else:
                # Map model to be loaded to specified single gpu.
                loc = "cuda:{}".format(args.gpu)
                checkpoint = load_checkpoints(state_dict_objs,
                                              map_location=loc,
                                              filename="checkpoint.pkl")

            args.start_epoch = checkpoint["epoch"]
            best = checkpoint["best"]
            print(
                f"Successfully load checkpoints, best {best}, epoch {args.start_epoch}"
            )

        except FileNotFoundError as e:
            # a missing checkpoint is not fatal: train from the current state
            warnings.warn(str(e) + " Continue without loading checkpoints.")

    # start training
    if is_main_process:
        print(
            "\n\n# Epoch Loss TrainAcc ValAcc Time (s)")
        sys.stdout.flush()

    for epoch in range(args.start_epoch, args.epochs):
        ti = time.time()

        # In distributed mode, calling the set_epoch method is needed to make shuffling
        # work; each process will use the same random seed otherwise.
        if args.distributed:
            train_sampler.set_epoch(epoch)

        # train
        loss, train_acc = train(optimizer, model, feature_names, train_loader,
                                loss_func, metric, args.gpu)

        # bad, we get nan
        if np.isnan(loss):
            print("\n\nBad, we get nan for loss. Exiting")
            sys.stdout.flush()
            sys.exit(1)

        # evaluate
        val_acc = evaluate(model, feature_names, val_loader, metric, args.gpu)

        # NOTE(review): `best` is written to `args.output_file` only when
        # early stopping triggers, not when all epochs complete - confirm
        # this is intended.
        if stopper.step(val_acc):
            pickle_dump(best,
                        args.output_file)  # save results for hyperparam tune
            break

        scheduler.step(val_acc)

        # lower validation score is better
        is_best = val_acc < best
        if is_best:
            best = val_acc

        # save checkpoint
        if is_main_process:
            misc_objs = {"best": best, "epoch": epoch}

            save_checkpoints(
                state_dict_objs,
                misc_objs,
                is_best,
                msg=f"epoch: {epoch}, score {val_acc}",
            )

            tt = time.time() - ti

            print("{:5d} {:12.6e} {:12.6e} {:12.6e} {:.2f}".format(
                epoch, loss, train_acc, val_acc, tt))
            if epoch % 10 == 0:
                sys.stdout.flush()

    # load best to calculate test accuracy
    if args.gpu is None:
        checkpoint = load_checkpoints(state_dict_objs,
                                      filename="best_checkpoint.pkl")
    else:
        # Map model to be loaded to specified single gpu.
        loc = "cuda:{}".format(args.gpu)
        checkpoint = load_checkpoints(state_dict_objs,
                                      map_location=loc,
                                      filename="best_checkpoint.pkl")

    if is_main_process:
        test_acc = evaluate(model, feature_names, test_loader, metric,
                            args.gpu)

        print("\n#TestAcc: {:12.6e} \n".format(test_acc))
        print("\nFinish training at:", datetime.now())
def compare_connectivity_across_graph_builder(
    self,
    union_plot_path,
    babel_plot_path,
    extender_plot_path,
    critic_plot_path,
    only_different=True,
    tex_file="tex_mol_connectivity_comparison.tex",
):
    """
    Write a tex report comparing the connectivity of each molecule as given by
    four graph builders, and pickle the compared molecules per builder.

    For every molecule in `self.molecules`, four snapshots are taken: the
    molecule as stored, after converting to the babel graph without the metal
    edge extender, after converting to the babel graph with the extender, and
    after converting to the critic graph. Molecules are converted in place, so
    `self.molecules` is modified by this call.

    Args:
        union_plot_path (Path): directory holding `<id>.png` plots of the
            molecules as stored (union of all methods).
        babel_plot_path (Path): directory holding plots for the babel graphs.
        extender_plot_path (Path): directory holding plots for the babel
            graphs with the metal edge extender.
        critic_plot_path (Path): directory holding plots for the critic
            graphs.
        only_different (bool): if `True`, report only molecules whose extender
            graph and critic graph are not isomorphic; otherwise report all
            molecules.
        tex_file (Path): path to the output .tex file.
    """
    # same order as the four snapshots of each molecule
    plot_dirs = [
        to_path(union_plot_path),
        to_path(babel_plot_path),
        to_path(extender_plot_path),
        to_path(critic_plot_path),
    ]

    def one_based_bonds(molecule):
        # bond index pairs shifted to start from 1, as used in the report
        return {(a1 + 1, a2 + 1) for (a1, a2), _ in molecule.bonds.items()}

    # snapshots [stored, babel, extender, critic] of each reported molecule
    mols_differ_graph = []
    for mol in self.molecules:
        as_stored = copy.deepcopy(mol)

        mol.convert_to_babel_mol_graph(use_metal_edge_extender=False)
        with_babel = copy.deepcopy(mol)

        mol.convert_to_babel_mol_graph(use_metal_edge_extender=True)
        with_extender = copy.deepcopy(mol)

        mol.convert_to_critic_mol_graph()
        with_critic = copy.deepcopy(mol)

        # skip molecules on which the extender and critic graphs agree
        if only_different and with_extender.mol_graph.isomorphic_to(
                with_critic.mol_graph):
            continue
        mols_differ_graph.append(
            [as_stored, with_babel, with_extender, with_critic])

    # write tex file
    tex_file = to_path(tex_file)
    with open(tex_file, "w") as f:
        f.write(TexWriter.head())
        f.write(
            "On each page, we plot 4 mols (top to bottom) from: the union of metal "
            "extender and critic, babel without extender, babel with extender and the "
            "critic builder.\n")

        for snapshots in mols_differ_graph:
            ref = snapshots[0]

            # molecule info
            f.write(TexWriter.newpage())
            f.write("formula: " + ref.formula + "\n\n")
            f.write("charge: " + str(ref.charge) + "\n\n")
            f.write("spin multiplicity: " + str(ref.spin_multiplicity) +
                    "\n\n")
            f.write("free energy: " + str(ref.free_energy) + "\n\n")
            f.write("id: " + ref.id + "\n\n")

            # distance between every pair of atoms
            f.write("atom pair distances:\n\n")
            for first, second in itertools.combinations(
                    range(ref.num_atoms), 2):
                separation = np.linalg.norm(ref.coords[first] -
                                            ref.coords[second])
                f.write("{} {}: {:.3f}\n\n".format(first + 1, second + 1,
                                                   separation))

            # bond differences between the builders
            babel_bonds = one_based_bonds(snapshots[1])
            extender_bonds = one_based_bonds(snapshots[2])
            critic_bonds = one_based_bonds(snapshots[3])
            shared = extender_bonds.intersection(critic_bonds)

            bond_reports = (
                ("extender added to babel: ", extender_bonds - babel_bonds),
                ("extender bond not in critic: ", extender_bonds - shared),
                ("critic bond not in extender: ", critic_bonds - shared),
            )
            for title, bonds in bond_reports:
                f.write(title)
                for bond in bonds:
                    f.write("{} ".format(bond))
                f.write("\n\n")

            # one figure per builder, each followed by a separator line
            for plot_dir, built in zip(plot_dirs, snapshots):
                fname = plot_dir.joinpath(f"{built.id}.png")
                f.write(TexWriter.single_figure(fname, figure_size=0.2))
                f.write(TexWriter.verbatim("=" * 80))

        f.write(TexWriter.tail())

    # pickle the reported molecules, one file per builder (snapshot order)
    pickle_targets = (
        "~/Applications/db_access/mol_builder/molecules_union_builder.pkl",
        "~/Applications/db_access/mol_builder/molecules_babel_builder.pkl",
        "~/Applications/db_access/mol_builder/molecules_extender_builder.pkl",
        "~/Applications/db_access/mol_builder/molecules_critic_builder.pkl",
    )
    for column, filename in enumerate(pickle_targets):
        mols = [snapshots[column] for snapshots in mols_differ_graph]
        pickle_dump(mols, filename)

    print(
        "### mol graph comparison. number of molecules {}, different mol graphs by "
        "babel extender builder and critic builder: {}".format(
            len(self.molecules), len(mols_differ_graph)))
def to_file(self, filename):
    """
    Pickle the molecules held by this object.

    Args:
        filename: path of the pickle file, passed on to `pickle_dump`.
    """
    molecules = self.molecules
    pickle_dump(molecules, filename)