Beispiel #1
0
    def test_cgcnn_featurizer(self):
        """Exercise CGCNNFeaturizer end to end.

        Covers: training a classifier and a regressor from scratch,
        loading pre-trained models, warm-starting from a checkpoint, and
        the featurize / featurize_many / featurize_dataframe /
        fit_featurize_dataframe entry points.
        """
        # Test regular classification.
        cla_props, cla_atom_features, cla_structs = self._get_cgcnn_data()
        atom_fea_len = 64
        cgcnn_featurizer = \
            CGCNNFeaturizer(atom_init_fea=cla_atom_features,
                            train_size=5, val_size=2, test_size=3,
                            atom_fea_len=atom_fea_len)

        cgcnn_featurizer.fit(X=cla_structs, y=cla_props)
        self.assertEqual(len(cgcnn_featurizer.feature_labels()), atom_fea_len)
        # Sanity-check the shapes of the trained network's layers.
        state_dict = cgcnn_featurizer.model.state_dict()
        self.assertEqual(state_dict['embedding.weight'].size(),
                         torch.Size([64, 92]))
        self.assertEqual(state_dict['embedding.bias'].size(), torch.Size([64]))
        self.assertEqual(state_dict['convs.0.fc_full.weight'].size(),
                         torch.Size([128, 169]))
        self.assertEqual(state_dict['convs.1.bn1.weight'].size(),
                         torch.Size([128]))
        self.assertEqual(state_dict['convs.2.bn2.bias'].size(),
                         torch.Size([64]))
        self.assertEqual(state_dict['conv_to_fc.weight'].size(),
                         torch.Size([128, 64]))
        self.assertEqual(state_dict['fc_out.weight'].size(),
                         torch.Size([2, 128]))

        for struct in cla_structs:
            result = cgcnn_featurizer.featurize(struct)
            self.assertEqual(len(result), atom_fea_len)

        # Test regular regression and default atom_init_fea and featurize_many.
        reg_props, reg_atom_features, reg_structs = \
            self._get_cgcnn_data("regression")
        cgcnn_featurizer = \
            CGCNNFeaturizer(task="regression", atom_fea_len=atom_fea_len,
                            train_size=6, val_size=2, test_size=2)

        cgcnn_featurizer.fit(X=reg_structs, y=reg_props)
        # n_jobs=1 avoids multiprocessing (see the spawn note further down).
        cgcnn_featurizer.set_n_jobs(1)

        result = cgcnn_featurizer.featurize_many(entries=reg_structs)
        self.assertEqual(
            np.array(result).shape, (len(reg_structs), atom_fea_len))

        # Test classification from pre-trained model.
        cgcnn_featurizer = \
            CGCNNFeaturizer(h_fea_len=32, n_conv=4,
                            pretrained_name='semi-metal-classification',
                            atom_init_fea=cla_atom_features, train_size=5,
                            val_size=2, test_size=3, atom_fea_len=atom_fea_len)
        cgcnn_featurizer.fit(X=cla_structs, y=cla_props)
        self.assertEqual(len(cgcnn_featurizer.feature_labels()), atom_fea_len)

        # Expected first feature value per structure from the fixed
        # pre-trained weights (deterministic, hence exact to 4 places).
        validate_features = [
            2.1295, 2.1288, 1.8504, 1.9175, 2.1094, 1.7770, 2.0471, 1.7426,
            1.7288, 1.7770
        ]
        for struct, validate_feature in zip(cla_structs, validate_features):
            result = cgcnn_featurizer.featurize(struct)
            self.assertEqual(len(result), atom_fea_len)
            self.assertAlmostEqual(result[0], validate_feature, 4)

        # Test regression from pre-trained model.
        cgcnn_featurizer = \
            CGCNNFeaturizer(task="regression", h_fea_len=32, n_conv=4,
                            pretrained_name='formation-energy-per-atom',
                            atom_init_fea=reg_atom_features,
                            train_size=5, val_size=2, test_size=3,
                            atom_fea_len=atom_fea_len)
        cgcnn_featurizer.fit(X=reg_structs, y=reg_props)
        self.assertEqual(len(cgcnn_featurizer.feature_labels()), atom_fea_len)

        # Expected last feature value per structure for the regression model.
        validate_features = [
            1.6871, 1.5679, 1.5316, 1.6419, 1.6031, 1.4333, 1.5709, 1.5070,
            1.5038, 1.4333
        ]

        for struct, validate_feature in zip(reg_structs, validate_features):
            result = cgcnn_featurizer.featurize(struct)
            self.assertEqual(len(result), atom_fea_len)
            self.assertAlmostEqual(result[-1], validate_feature, 4)

        # Test warm start regression: resume training from a saved checkpoint
        # and verify the checkpoint's recorded state first.
        warm_start_file = os.path.join(test_dir,
                                       'cgcnn_test_regression_model.pth.tar')
        warm_start_model = torch.load(warm_start_file)
        self.assertEqual(warm_start_model['epoch'], 31)
        self.assertEqual(warm_start_model['best_epoch'], 9)
        self.assertAlmostEqual(warm_start_model['best_mae_error'].numpy(),
                               2.3700, 4)

        cgcnn_featurizer = \
            CGCNNFeaturizer(task="regression", warm_start_file=warm_start_file,
                            epochs=100, atom_fea_len=atom_fea_len,
                            atom_init_fea=reg_atom_features,
                            train_size=6, val_size=2, test_size=2)
        cgcnn_featurizer.fit(X=reg_structs, y=reg_props)

        # If use CGCNN featurize_many(), you should change the multiprocessing
        # start_method to 'spawn', because Gloo (that uses Infiniband) and
        # NCCL2 are not fork safe, pytorch don't support them or just
        # set n_jobs = 1 to avoid multiprocessing as follows.
        set_start_method('spawn', force=True)
        result = cgcnn_featurizer.featurize_many(entries=reg_structs)
        self.assertEqual(
            np.array(result).shape, (len(reg_structs), atom_fea_len))

        # Test featurize_dataframe.
        df = pd.DataFrame.from_dict({"structure": cla_structs})
        cgcnn_featurizer = \
            CGCNNFeaturizer(atom_init_fea=cla_atom_features,
                            train_size=5, val_size=2, test_size=3,
                            atom_fea_len=atom_fea_len)
        cgcnn_featurizer.fit(X=df["structure"], y=cla_props)
        self.assertEqual(len(cgcnn_featurizer.feature_labels()), atom_fea_len)
        cgcnn_featurizer.set_n_jobs(1)
        result = cgcnn_featurizer.featurize_dataframe(df, "structure")
        self.assertTrue("CGCNN_feature_{}".format(atom_fea_len -
                                                  1) in result.columns)
        # The dataframe was built from cla_structs, so compare against its
        # length (previously this mistakenly used len(reg_structs)).
        self.assertEqual(
            np.array(result).shape, (len(cla_structs), atom_fea_len + 1))

        # Test fit_featurize_dataframe.
        df = pd.DataFrame.from_dict({"structure": cla_structs})
        cgcnn_featurizer = \
            CGCNNFeaturizer(atom_init_fea=cla_atom_features,
                            train_size=5, val_size=2, test_size=3,
                            atom_fea_len=atom_fea_len)
        result = cgcnn_featurizer.fit_featurize_dataframe(df,
                                                          "structure",
                                                          fit_args=[cla_props])
        self.assertEqual(len(cgcnn_featurizer.feature_labels()), atom_fea_len)
        self.assertTrue("CGCNN_feature_{}".format(atom_fea_len -
                                                  1) in result.columns)
        # As above: the featurized column came from cla_structs.
        self.assertEqual(
            np.array(result).shape, (len(cla_structs), atom_fea_len + 1))
Beispiel #2
0
    def test_cgcnn_featurizer(self):
        """Exercise CGCNNFeaturizer end to end.

        Covers: training a classifier and a regressor from scratch,
        loading pre-trained models, warm-starting from a checkpoint, and
        the featurize / featurize_many / featurize_dataframe /
        fit_featurize_dataframe entry points.
        """
        # Test regular classification.
        cla_props, cla_atom_features, cla_structs = self._get_cgcnn_data()
        atom_fea_len = 64
        cgcnn_featurizer = \
            CGCNNFeaturizer(atom_init_fea=cla_atom_features,
                            train_size=5, val_size=2, test_size=3,
                            atom_fea_len=atom_fea_len)

        cgcnn_featurizer.fit(X=cla_structs, y=cla_props)
        self.assertEqual(len(cgcnn_featurizer.feature_labels()), atom_fea_len)
        # Sanity-check the shapes of the trained network's layers.
        state_dict = cgcnn_featurizer.model.state_dict()
        self.assertEqual(state_dict['embedding.weight'].size(),
                         torch.Size([64, 92]))
        self.assertEqual(state_dict['embedding.bias'].size(),
                         torch.Size([64]))
        self.assertEqual(state_dict['convs.0.fc_full.weight'].size(),
                         torch.Size([128, 169]))
        self.assertEqual(state_dict['convs.1.bn1.weight'].size(),
                         torch.Size([128]))
        self.assertEqual(state_dict['convs.2.bn2.bias'].size(),
                         torch.Size([64]))
        self.assertEqual(state_dict['conv_to_fc.weight'].size(),
                         torch.Size([128, 64]))
        self.assertEqual(state_dict['fc_out.weight'].size(),
                         torch.Size([2, 128]))

        for struct in cla_structs:
            result = cgcnn_featurizer.featurize(struct)
            self.assertEqual(len(result), atom_fea_len)

        # Test regular regression and default atom_init_fea and featurize_many.
        reg_props, reg_atom_features, reg_structs = \
            self._get_cgcnn_data("regression")
        cgcnn_featurizer = \
            CGCNNFeaturizer(task="regression", atom_fea_len=atom_fea_len,
                            train_size=6, val_size=2, test_size=2)

        cgcnn_featurizer.fit(X=reg_structs, y=reg_props)
        # n_jobs=1 avoids multiprocessing (see the spawn note further down).
        cgcnn_featurizer.set_n_jobs(1)

        result = cgcnn_featurizer.featurize_many(entries=reg_structs)
        self.assertEqual(np.array(result).shape,
                         (len(reg_structs), atom_fea_len))

        # Test classification from pre-trained model.
        cgcnn_featurizer = \
            CGCNNFeaturizer(h_fea_len=32, n_conv=4,
                            pretrained_name='semi-metal-classification',
                            atom_init_fea=cla_atom_features, train_size=5,
                            val_size=2, test_size=3, atom_fea_len=atom_fea_len)
        cgcnn_featurizer.fit(X=cla_structs, y=cla_props)
        self.assertEqual(len(cgcnn_featurizer.feature_labels()), atom_fea_len)

        # Expected first feature value per structure from the fixed
        # pre-trained weights (deterministic, hence exact to 4 places).
        validate_features = [2.1295, 2.1288, 1.8504, 1.9175, 2.1094,
                             1.7770, 2.0471, 1.7426, 1.7288, 1.7770]
        for struct, validate_feature in zip(cla_structs, validate_features):
            result = cgcnn_featurizer.featurize(struct)
            self.assertEqual(len(result), atom_fea_len)
            self.assertAlmostEqual(result[0], validate_feature, 4)

        # Test regression from pre-trained model.
        cgcnn_featurizer = \
            CGCNNFeaturizer(task="regression", h_fea_len=32, n_conv=4,
                            pretrained_name='formation-energy-per-atom',
                            atom_init_fea=reg_atom_features,
                            train_size=5, val_size=2, test_size=3,
                            atom_fea_len=atom_fea_len)
        cgcnn_featurizer.fit(X=reg_structs, y=reg_props)
        self.assertEqual(len(cgcnn_featurizer.feature_labels()), atom_fea_len)

        # Expected last feature value per structure for the regression model.
        validate_features = [1.6871, 1.5679, 1.5316, 1.6419, 1.6031,
                             1.4333, 1.5709, 1.5070, 1.5038, 1.4333]

        for struct, validate_feature in zip(reg_structs, validate_features):
            result = cgcnn_featurizer.featurize(struct)
            self.assertEqual(len(result), atom_fea_len)
            self.assertAlmostEqual(result[-1], validate_feature, 4)

        # Test warm start regression: resume training from a saved checkpoint
        # and verify the checkpoint's recorded state first.
        warm_start_file = os.path.join(test_dir,
                                       'cgcnn_test_regression_model.pth.tar')
        warm_start_model = torch.load(warm_start_file)
        self.assertEqual(warm_start_model['epoch'], 31)
        self.assertEqual(warm_start_model['best_epoch'], 9)
        self.assertAlmostEqual(warm_start_model['best_mae_error'].numpy(),
                               2.3700, 4)

        cgcnn_featurizer = \
            CGCNNFeaturizer(task="regression", warm_start_file=warm_start_file,
                            epochs=100, atom_fea_len=atom_fea_len,
                            atom_init_fea=reg_atom_features,
                            train_size=6, val_size=2, test_size=2)
        cgcnn_featurizer.fit(X=reg_structs, y=reg_props)

        # If use CGCNN featurize_many(), you should change the multiprocessing
        # start_method to 'spawn', because Gloo (that uses Infiniband) and
        # NCCL2 are not fork safe, pytorch don't support them or just
        # set n_jobs = 1 to avoid multiprocessing as follows.
        set_start_method('spawn', force=True)
        result = cgcnn_featurizer.featurize_many(entries=reg_structs)
        self.assertEqual(np.array(result).shape,
                         (len(reg_structs), atom_fea_len))

        # Test featurize_dataframe.
        df = pd.DataFrame.from_dict({"structure": cla_structs})
        cgcnn_featurizer = \
            CGCNNFeaturizer(atom_init_fea=cla_atom_features,
                            train_size=5, val_size=2, test_size=3,
                            atom_fea_len=atom_fea_len)
        cgcnn_featurizer.fit(X=df["structure"], y=cla_props)
        self.assertEqual(len(cgcnn_featurizer.feature_labels()), atom_fea_len)
        cgcnn_featurizer.set_n_jobs(1)
        result = cgcnn_featurizer.featurize_dataframe(df, "structure")
        self.assertTrue("CGCNN_feature_{}".format(atom_fea_len - 1)
                        in result.columns)
        # The dataframe was built from cla_structs, so compare against its
        # length (previously this mistakenly used len(reg_structs)).
        self.assertEqual(np.array(result).shape,
                         (len(cla_structs), atom_fea_len + 1))

        # Test fit_featurize_dataframe.
        df = pd.DataFrame.from_dict({"structure": cla_structs})
        cgcnn_featurizer = \
            CGCNNFeaturizer(atom_init_fea=cla_atom_features,
                            train_size=5, val_size=2, test_size=3,
                            atom_fea_len=atom_fea_len)
        result = cgcnn_featurizer.fit_featurize_dataframe(
            df, "structure", fit_args=[cla_props])
        self.assertEqual(len(cgcnn_featurizer.feature_labels()), atom_fea_len)
        self.assertTrue("CGCNN_feature_{}".format(atom_fea_len - 1)
                        in result.columns)
        # As above: the featurized column came from cla_structs.
        self.assertEqual(np.array(result).shape,
                         (len(cla_structs), atom_fea_len + 1))
Beispiel #3
0
        "C:/Users/sterg/Documents/GitHub/cgcnn/data/cif-K_VRH/*.cif"):
    #structure_path = 'C:/Users/sterg/Documents/GitHub/cgcnn/data/cif-only-K_VRH/'+structure_file
    structure_path = structure_file
    structure = Structure.from_file(structure_path)
    structures.append(structure)
df = pd.DataFrame({"K_VRH": properties[1], "structure": structures})
print(df)  # make sure the dataframe appears like you intended
df.to_pickle("C:/Users/sterg/Documents/GitHub/cgcnn/data/cif-K_VRH.p")

#%%
featurizer = CGCNNFeaturizer(
    task='regression',
    atom_init_fea=None,
    pretrained_name='bulk-moduli',
    warm_start_file=
    'C:\\Users\\sterg\\Documents\\GitHub\\cgcnn\\data\\checkpoint.pth.tar',
    warm_start_latest=False,
    save_model_to_dir=None,
    save_checkpoint_to_dir=None,
    checkpoint_interval=100,
    del_checkpoint=True)

#%%
featurizer.fit(df.structure, df.K_VRH)

#%%
df.structure[0]

features = featurizer.featurize_many(df.structure,
                                     ignore_errors=True,
                                     return_errors=False,
Beispiel #4
0
def train():
    """Train CGCNN models with 5-fold cross-validation on one dataset/property.

    Reads CLI arguments, loads structures and the target property from a
    gzipped pickled dataframe, then fits a ``CGCNNFeaturizer`` for each
    requested fold, writing models, checkpoints and logs to per-fold output
    directories.
    """
    # Parse args
    args = parse_args()
    print(args)

    output_path = args.output_path
    dataset_name = args.dataset
    prop_col = args.property  # name of the target column in the dataframe
    disable_cuda = args.disable_cuda
    distributed = args.distributed
    # Comma-separated fold indices selecting which of the 5 folds to train.
    kf_indices = list(map(int, args.kf_indices.split(",")))

    if not disable_cuda:
        print("Cuda is on? {}".format(torch.cuda.is_available()))
        print("Cuda current device is : {}, it's name is : {}".format(
            torch.cuda.current_device(), torch.cuda.get_device_name(0)))

    if distributed:
        # NOTE(review): requires a PyTorch build with MPI support — confirm.
        dist.init_process_group(backend="mpi")

    # Per-element initial atom feature vectors shipped with the cgcnn package.
    atom_feature_path = os.path.join(os.path.dirname(cgcnn.__file__), "..",
                                     "data", "sample-classification")

    with open(os.path.join(atom_feature_path, "atom_init.json")) as f:
        atom_features = json.load(f)

    tmp_output_path = os.path.join(output_path, dataset_name, prop_col)
    if not os.path.exists(tmp_output_path):
        os.makedirs(tmp_output_path, exist_ok=True)

    # Load the dataframe and drop rows that lack the target property.
    with gzip.open(args.input_file_prefix.format(dataset_name), "rb") as f:
        df = pd.DataFrame(pickle.load(f))[["structure", prop_col]].dropna()
    idx_list = list(range(len(df)))

    # Fixed random_state keeps the fold assignment reproducible across runs.
    kf = KFold(n_splits=5, random_state=18012019, shuffle=True)
    for kf_idx, (remain_index, test_index) in enumerate(kf.split(idx_list)):
        if kf_idx in kf_indices:
            kf_tmp_output_path = os.path.join(tmp_output_path,
                                              "kfold_{}".format(kf_idx))
            if not os.path.exists(kf_tmp_output_path):
                os.makedirs(kf_tmp_output_path, exist_ok=True)
            # 25% of the non-test 80% goes to validation, i.e. an overall
            # 60/20/20 train/val/test split per fold.
            train_index, val_index = train_test_split(remain_index,
                                                      test_size=0.25,
                                                      random_state=18012019,
                                                      shuffle=True)

            # Featurizer configured to train (use_pretrained=False) with
            # explicit per-fold index sets (use_idxes=True) and SGD with a
            # single LR drop at epoch 800.
            cgcnnfz = CGCNNFeaturizer(
                task=args.task,
                distributed=distributed,
                n_works=args.n_works,
                disable_cuda=disable_cuda,
                save_idx=kf_tmp_output_path,
                output_path=kf_tmp_output_path,
                atom_init_fea=atom_features,
                use_batch=False,
                test=args.test,
                dropout_percent=0.5,
                batch_size=args.batch_size,
                warm_start_file=args.warm_start,
                warm_start_latest=True,
                use_pretrained=False,
                save_model_to_dir=os.path.join(kf_tmp_output_path, "model"),
                save_checkpoint_to_dir=os.path.join(kf_tmp_output_path,
                                                    "checkpoint"),
                checkpoint_interval=10,
                num_epochs=args.max_epochs,
                print_freq=10,
                optim="SGD",
                momentum=0.9,
                lr_milestones=[800],
                lr=args.learning_rate,
                weight_decay=0.0,
                h_fea_len=32,
                pin_memory=args.pin_memory,
                n_conv=args.n_conv,
                n_h=1,
                del_checkpoint=False,
                use_idxes=True,
                train_set=train_index,
                val_set=val_index,
                test_set=test_index,
                atom_fea_len=64,
                log_dir=os.path.join(kf_tmp_output_path, "logger.log"),
                simple_log_dir=os.path.join(kf_tmp_output_path,
                                            "simple_logger.log"),
            )

            cgcnnfz.fit(X=df["structure"], y=df[prop_col])