Example #1
def pickle_db_entries(
        filename="~/Applications/db_access/mol_builder/database_n200.pkl"):
    entries = DatabaseOperation.query_db_entries(db_collection="mol_builder",
                                                 num_entries=200)
    # entries = DatabaseOperation.query_db_entries(db_collection="smd", num_entries=200)

    pickle_dump(entries, filename)
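Both this and the next example rely on a pickle_dump helper that is not shown. A minimal sketch of such a helper, assuming it expands a leading "~", creates missing parent directories, and serializes with the standard pickle module (the project's actual helper may differ):

import pickle
from pathlib import Path

def pickle_dump(obj, filename):
    # Expand "~" and make sure the target directory exists before writing.
    path = Path(filename).expanduser()
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(obj, f)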
Example #2
def pickle_molecules(outname,
                     num_entries=500,
                     db_collection="mol_builder",
                     db_file=None):

    if db_file is None:
        if db_collection == "mol_builder":
            db_file = "/Users/mjwen/Applications/db_access/sam_db/sam_db_mol_builder.json"
        elif db_collection == "task":
            db_file = "/Users/mjwen/Applications/db_access/sam_db/sam_db_tasks.json"
        else:
            raise Exception(
                "Unrecognized db_collection = {}".format(db_collection))

    entries = DatabaseOperation.query_db_entries(
        db_collection=db_collection,
        db_file=db_file,
        num_entries=num_entries,
    )

    mols = DatabaseOperation.to_molecules(entries, db_collection=db_collection)

    # filename = "~/Applications/db_access/mol_builder/molecules_n200_unfiltered.pkl"
    # pickle_dump(mols, filename)

    mols = DatabaseOperation.filter_molecules(mols,
                                              connectivity=True,
                                              isomorphism=True)
    pickle_dump(mols, outname)
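A hypothetical invocation of pickle_molecules; the output path and entry count below are illustrative, not taken from the source:

pickle_molecules(
    outname="~/Applications/db_access/mol_builder/molecules_n500.pkl",
    num_entries=500,
    db_collection="mol_builder",
)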
Example #3
def main_worker(gpu, world_size, args):
    global best
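    # the module-level `best` tracks the lowest validation error seen so far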
    args.gpu = gpu

    if not args.distributed or (args.distributed and args.gpu == 0):
        print("\n\nStart training at:", datetime.now())

    if args.distributed:
        dist.init_process_group(
            args.dist_backend,
            init_method=args.dist_url,
            world_size=world_size,
            rank=args.gpu,
        )

    # Explicitly set the seed so that (in distributed mode) all processes use the
    # same dataset split and initialize models from the same random weights and biases
    seed_torch()

    if args.restore:
        dataset_state_dict_filename = args.dataset_state_dict_filename

        if dataset_state_dict_filename is None:
            warnings.warn(
                "Restore with `args.dataset_state_dict_filename` set to None.")
        elif not Path(dataset_state_dict_filename).exists():
            warnings.warn(f"`{dataset_state_dict_filename} not found; set "
                          f"args.dataset_state_dict_filename` to None")
            dataset_state_dict_filename = None
    else:
        dataset_state_dict_filename = None

    # convert reactions in csv file to atom mapped label file if necessary
    mols, attrs, labels = read_input_files(args.molecule_file,
                                           args.molecule_attributes_file,
                                           args.reaction_file)
    dataset = ReactionNetworkDataset(
        grapher=get_grapher(),
        molecules=mols,
        labels=labels,
        extra_features=attrs,
        feature_transformer=True,
        label_transformer=True,
        state_dict_filename=dataset_state_dict_filename,
    )

    trainset, valset, testset = train_validation_test_split(dataset,
                                                            validation=0.1,
                                                            test=0.1)

    if not args.distributed or (args.distributed and args.gpu == 0):
        torch.save(dataset.state_dict(), args.dataset_state_dict_filename)
        print("Trainset size: {}, valset size: {}: testset size: {}.".format(
            len(trainset), len(valset), len(testset)))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            trainset)
    else:
        train_sampler = None

    train_loader = DataLoaderReactionNetwork(
        trainset,
        batch_size=args.batch_size,
        shuffle=(train_sampler is None),
        sampler=train_sampler,
    )
    # a larger batch size for the val and test sets is faster but needs more memory;
    # adjust the batch size to fit memory
    bs = max(len(valset) // 10, 1)
    val_loader = DataLoaderReactionNetwork(valset,
                                           batch_size=bs,
                                           shuffle=False)
    bs = max(len(testset) // 10, 1)
    test_loader = DataLoaderReactionNetwork(testset,
                                            batch_size=bs,
                                            shuffle=False)

    ### model

    feature_names = ["atom", "bond", "global"]
    set2set_ntypes_direct = ["global"]
    feature_size = dataset.feature_size

    args.feature_size = feature_size
    args.set2set_ntypes_direct = set2set_ntypes_direct

    # save args
    if not args.distributed or (args.distributed and args.gpu == 0):
        yaml_dump(args, "train_args.yaml")

    model = GatedGCNReactionNetwork(
        in_feats=args.feature_size,
        embedding_size=args.embedding_size,
        gated_num_layers=args.gated_num_layers,
        gated_hidden_size=args.gated_hidden_size,
        gated_num_fc_layers=args.gated_num_fc_layers,
        gated_graph_norm=args.gated_graph_norm,
        gated_batch_norm=args.gated_batch_norm,
        gated_activation=args.gated_activation,
        gated_residual=args.gated_residual,
        gated_dropout=args.gated_dropout,
        num_lstm_iters=args.num_lstm_iters,
        num_lstm_layers=args.num_lstm_layers,
        set2set_ntypes_direct=args.set2set_ntypes_direct,
        fc_num_layers=args.fc_num_layers,
        fc_hidden_size=args.fc_hidden_size,
        fc_batch_norm=args.fc_batch_norm,
        fc_activation=args.fc_activation,
        fc_dropout=args.fc_dropout,
        outdim=1,
        conv="GatedGCNConv",
    )

    if not args.distributed or (args.distributed and args.gpu == 0):
        print(model)

    if args.gpu is not None:
        model.to(args.gpu)
    if args.distributed:
        ddp_model = DDP(model, device_ids=[args.gpu])
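        # DDP does not forward custom attributes of the wrapped module (they
        # are normally reached via ddp_model.module), so feature_before_fc is
        # re-attached to the wrapper for downstream use.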
        ddp_model.feature_before_fc = model.feature_before_fc
        model = ddp_model

    ### optimizer, loss, and metric
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)

    loss_func = MSELoss(reduction="mean")
    metric = WeightedL1Loss(reduction="sum")

    ### learning rate scheduler and stopper
    scheduler = ReduceLROnPlateau(optimizer,
                                  mode="min",
                                  factor=0.4,
                                  patience=50,
                                  verbose=True)
    stopper = EarlyStopping(patience=150)

    # load checkpoint
    state_dict_objs = {
        "model": model,
        "optimizer": optimizer,
        "scheduler": scheduler
    }
    if args.restore:
        try:

            if args.gpu is None:
                checkpoint = load_checkpoints(state_dict_objs,
                                              filename="checkpoint.pkl")
            else:
                # Map the checkpoint onto the specified single GPU.
                loc = "cuda:{}".format(args.gpu)
                checkpoint = load_checkpoints(state_dict_objs,
                                              map_location=loc,
                                              filename="checkpoint.pkl")

            args.start_epoch = checkpoint["epoch"]
            best = checkpoint["best"]
            print(
                f"Successfully loaded checkpoint, best {best}, epoch {args.start_epoch}"
            )

        except FileNotFoundError as e:
            warnings.warn(str(e) + " Continuing without loading checkpoints.")

    # start training
    if not args.distributed or (args.distributed and args.gpu == 0):
        print(
            "\n\n# Epoch     Loss         TrainAcc        ValAcc     Time (s)")
        sys.stdout.flush()

    for epoch in range(args.start_epoch, args.epochs):
        ti = time.time()

        # In distributed mode, set_epoch must be called at the start of each epoch to
        # make shuffling work; otherwise the same ordering is used in every epoch.
        if args.distributed:
            train_sampler.set_epoch(epoch)

        # train
        loss, train_acc = train(optimizer, model, feature_names, train_loader,
                                loss_func, metric, args.gpu)

        # abort if the loss becomes nan
        if np.isnan(loss):
            print("\n\nBad, we get nan for loss. Exiting.")
            sys.stdout.flush()
            sys.exit(1)

        # evaluate
        val_acc = evaluate(model, feature_names, val_loader, metric, args.gpu)

        if stopper.step(val_acc):
            pickle_dump(best,
                        args.output_file)  # save results for hyperparam tune
            break

        scheduler.step(val_acc)

        is_best = val_acc < best
        if is_best:
            best = val_acc

        # save checkpoint
        if not args.distributed or (args.distributed and args.gpu == 0):

            misc_objs = {"best": best, "epoch": epoch}

            save_checkpoints(
                state_dict_objs,
                misc_objs,
                is_best,
                msg=f"epoch: {epoch}, score {val_acc}",
            )

            tt = time.time() - ti

            print("{:5d}   {:12.6e}   {:12.6e}   {:12.6e}   {:.2f}".format(
                epoch, loss, train_acc, val_acc, tt))
            if epoch % 10 == 0:
                sys.stdout.flush()

    # load best to calculate test accuracy
    if args.gpu is None:
        checkpoint = load_checkpoints(state_dict_objs,
                                      filename="best_checkpoint.pkl")
    else:
        # Map the checkpoint onto the specified single GPU.
        loc = "cuda:{}".format(args.gpu)
        checkpoint = load_checkpoints(state_dict_objs,
                                      map_location=loc,
                                      filename="best_checkpoint.pkl")

    if not args.distributed or (args.distributed and args.gpu == 0):
        test_acc = evaluate(model, feature_names, test_loader, metric,
                            args.gpu)

        print("\n#TestAcc: {:12.6e} \n".format(test_acc))
        print("\nFinish training at:", datetime.now())
Example #4
    def compare_connectivity_across_graph_builder(
        self,
        union_plot_path,
        babel_plot_path,
        extender_plot_path,
        critic_plot_path,
        only_different=True,
        tex_file="tex_mol_connectivity_comparison.tex",
    ):
        """
        Write the connectivity (with plots) of molecules obtained from different
        methods into a tex file, for easier comparison of the connectivity.

        Args:
            union_plot_path (Path): directory where plots for the molecules with
                connectivity determined using the union of all methods are stored.
            babel_plot_path (Path): directory where plots for the molecules with
                connectivity determined using babel are stored. Similar for
                `extender_plot_path` and `critic_plot_path`.
            only_different (bool): If `True`, write out only the molecules whose
                connectivity differs between the methods.
            tex_file (Path): path to the output .tex file.
        """

        union_plot_path = to_path(union_plot_path)
        babel_plot_path = to_path(babel_plot_path)
        extender_plot_path = to_path(extender_plot_path)
        critic_plot_path = to_path(critic_plot_path)

        # keep a record of molecules whose babel mol graph and critic mol graph
        # differ
        mols_differ_graph = []
        for m in self.molecules:

            # mol builder
            m1 = copy.deepcopy(m)

            # babel builder
            m.convert_to_babel_mol_graph(use_metal_edge_extender=False)
            m2 = copy.deepcopy(m)

            # babel builder with extender
            m.convert_to_babel_mol_graph(use_metal_edge_extender=True)
            m3 = copy.deepcopy(m)

            # critic
            m.convert_to_critic_mol_graph()
            m4 = copy.deepcopy(m)

            if not only_different or not m3.mol_graph.isomorphic_to(
                    m4.mol_graph):
                mols_differ_graph.append([m1, m2, m3, m4])

        # write tex file
        tex_file = to_path(tex_file)
        with open(tex_file, "w") as f:
            f.write(TexWriter.head())
            f.write(
                "On each page, we plot 4 mols (top to bottom) from: the union of metal "
                "extender and critic, babel without extender, babel with extender, and "
                "the critic builder.\n")

            for i, mols in enumerate(mols_differ_graph):
                m = mols[0]

                # molecule info
                f.write(TexWriter.newpage())
                f.write("formula: " + m.formula + "\n\n")
                f.write("charge: " + str(m.charge) + "\n\n")
                f.write("spin multiplicity: " + str(m.spin_multiplicity) +
                        "\n\n")
                f.write("free energy: " + str(m.free_energy) + "\n\n")
                f.write("id: " + m.id + "\n\n")

                # edge distances
                f.write("atom pair distances:\n\n")

                for a1, a2 in itertools.combinations(range(m.num_atoms), 2):
                    dist = np.linalg.norm(m.coords[a1] - m.coords[a2])
                    f.write("{} {}: {:.3f}\n\n".format(a1 + 1, a2 + 1, dist))

                # compare edge differences between builders
                babel_bonds = {(a1 + 1, a2 + 1)
                               for (a1, a2), _ in mols[1].bonds.items()}
                extender_bonds = {(a1 + 1, a2 + 1)
                                  for (a1, a2), _ in mols[2].bonds.items()}
                critic_bonds = {(a1 + 1, a2 + 1)
                                for (a1, a2), _ in mols[3].bonds.items()}

                intersection = extender_bonds.intersection(critic_bonds)
                extender_not_in_critic = extender_bonds - intersection
                critic_not_in_extender = critic_bonds - intersection

                f.write("extender added to babel: ")
                for b in extender_bonds - babel_bonds:
                    f.write("{} ".format(b))
                f.write("\n\n")
                f.write("extender bond not in critic: ")
                for b in extender_not_in_critic:
                    f.write("{} ".format(b))
                f.write("\n\n")
                f.write("critic bond not in extender: ")
                for b in critic_not_in_extender:
                    f.write("{} ".format(b))
                f.write("\n\n")

                # add mol graph png
                for j, m in enumerate(mols):
                    if j == 0:
                        fname = union_plot_path.joinpath(f"{m.id}.png")
                    elif j == 1:
                        fname = babel_plot_path.joinpath(f"{m.id}.png")
                    elif j == 2:
                        fname = extender_plot_path.joinpath(f"{m.id}.png")
                    elif j == 3:
                        fname = critic_plot_path.joinpath(f"{m.id}.png")

                    f.write(TexWriter.single_figure(fname, figure_size=0.2))
                    f.write(TexWriter.verbatim("=" * 80))

            f.write(TexWriter.tail())

        filename = "~/Applications/db_access/mol_builder/molecules_union_builder.pkl"
        mols = [i[0] for i in mols_differ_graph]
        pickle_dump(mols, filename)
        filename = "~/Applications/db_access/mol_builder/molecules_babel_builder.pkl"
        mols = [i[1] for i in mols_differ_graph]
        pickle_dump(mols, filename)
        filename = "~/Applications/db_access/mol_builder/molecules_extender_builder.pkl"
        mols = [i[2] for i in mols_differ_graph]
        pickle_dump(mols, filename)
        filename = "~/Applications/db_access/mol_builder/molecules_critic_builder.pkl"
        mols = [i[3] for i in mols_differ_graph]
        pickle_dump(mols, filename)

        print(
            "### mol graph comparison. Number of molecules: {}; molecules whose graphs "
            "differ between the babel extender builder and the critic builder: {}".format(
                len(self.molecules), len(mols_differ_graph)))
Example #5
    def to_file(self, filename):
        pickle_dump(self.molecules, filename)
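A hypothetical companion loader (not part of the class shown), assuming pickle_dump writes a standard pickle stream:

import pickle
from pathlib import Path

def pickle_load(filename):
    # Read back an object written by pickle_dump.
    with open(Path(filename).expanduser(), "rb") as f:
        return pickle.load(f)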