Ejemplo n.º 1
0
    def predict(self, molecule=None, molecule_tensor=None, sigma=False):
        """
        Predict the output for a single molecule.

        Either `molecule` or `molecule_tensor` must be given. If a tensor is
        specified, it overrides the molecule argument; otherwise the tensor
        is built from `molecule` using the predictor's attribute settings and
        is padded to `self.padding_final_size` when `self.padding` is set.

        When both `self.y_mean` and `self.y_std` are set, the model output is
        rescaled as `y * y_std + y_mean` (and the uncertainty as
        `sigma * y_std`).

        Returns the prediction, or a `(prediction, uncertainty)` pair when
        `sigma` is true. For the "Cp(cal/mol/K)" task the first row of the
        model output is returned; for any other task, the first element of
        that row.

        Raises ValueError if neither `molecule` nor `molecule_tensor` is
        given.
        """
        if molecule is None and molecule_tensor is None:
            # ValueError is a subclass of Exception, so callers that catch
            # Exception keep working.
            raise ValueError('No molecule is specified...')

        if molecule_tensor is None:
            molecule_tensor = get_molecule_tensor(
                molecule, self.add_extra_atom_attribute,
                self.add_extra_bond_attribute, self.differentiate_atom_type,
                self.differentiate_bond_type)
            if self.padding:
                molecule_tensor = pad_molecule_tensor(molecule_tensor,
                                                      self.padding_final_size)

        # Wrap the single tensor in a batch of one before calling the model.
        batch = np.array([molecule_tensor])

        # The `sigma` keyword is only passed to the model when requested.
        if sigma:
            y_pred, y_sigma = self.model.predict(batch, sigma=sigma)
        else:
            y_pred, y_sigma = self.model.predict(batch), None

        # Undo the target normalization only if both statistics are known.
        if self.y_mean is not None and self.y_std is not None:
            y_pred = y_pred * self.y_std + self.y_mean
            if sigma:
                # A spread is scaled but not shifted.
                y_sigma = y_sigma * self.y_std

        keep_row = self.prediction_task == "Cp(cal/mol/K)"

        def first_result(values):
            # Unwrap the batch of one; non-Cp tasks return a single element.
            row = values[0]
            return row if keep_row else row[0]

        if sigma:
            return first_result(y_pred), first_result(y_sigma)
        return first_result(y_pred)
    def test_get_molecule_tensor_2(self):
        """
        Check the dimensions of the tensor built for the SMILES string 'CC'
        when no optional arguments are passed to get_molecule_tensor: the
        expected shape is 2 x 2 x attribute-vector size.
        """
        ethane = Molecule().fromSMILES('CC')
        tensor = get_molecule_tensor(ethane)

        # Outer dimension.
        self.assertEqual(len(tensor), 2)

        # Second dimension.
        first_row = tensor[0]
        self.assertEqual(len(first_row), 2)

        # Innermost dimension must match the advertised attribute vector size.
        first_entry = first_row[0]
        self.assertEqual(len(first_entry), get_attribute_vector_size())
Ejemplo n.º 3
0
def prepare_full_train_data_from_multiple_datasets(datasets,
                                                   add_extra_atom_attribute=True,
                                                   add_extra_bond_attribute=True,
                                                   differentiate_atom_type=True,
                                                   differentiate_bond_type=True,
                                                   padding=True,
                                                   padding_final_size=20,
                                                   prediction_task="Hf298(kcal/mol)",
                                                   save_meta=True,
                                                   save_tensors_dir=None,
                                                   meta_dir=None):
    """
    Load several database tables, convert their molecules to tensors, split
    each table into test and train parts, and merge the parts.

    `datasets` is a sequence of `(host, db, table, testing_ratio)` tuples.
    Each table is read with `get_data_from_db` for `prediction_task`; the
    remaining tensor-related arguments are forwarded unchanged to
    `get_molecule_tensor`.

    If `save_tensors_dir` is given, the directory is created when missing,
    every tensor is written there as `<index>.npy` (the index keeps counting
    across datasets) and X holds the absolute file paths instead of tensors.

    If `save_meta` is true, the test and train SMILES of each table are
    written, one per line, to `<db>.<table>_smis_test.txt` and
    `<db>.<table>_smis_train.txt` in `meta_dir` (the current working
    directory when `meta_dir` is None).

    Returns `(X_test, y_test, X_train, y_train)`. The merged results are the
    first dataset's containers, extended in place with the other datasets.
    """
    if save_tensors_dir is not None and not os.path.exists(save_tensors_dir):
        os.makedirs(save_tensors_dir)

    # Options shared by every molecule-to-tensor conversion below.
    tensor_options = dict(add_extra_atom_attribute=add_extra_atom_attribute,
                          add_extra_bond_attribute=add_extra_bond_attribute,
                          differentiate_atom_type=differentiate_atom_type,
                          differentiate_bond_type=differentiate_bond_type,
                          padding=padding,
                          padding_final_size=padding_final_size)

    test_parts = []
    train_parts = []
    next_file_index = 0
    for host, db, table, testing_ratio in datasets:
        molecules, y, smis = get_data_from_db(host, db, table, prediction_task=prediction_task)

        # `molecules` holds molecule objects at this point; turn them into
        # tensors. With save_tensors_dir set, the tensors go to disk and only
        # their file names are kept in X.
        if save_tensors_dir is None:
            X = [get_molecule_tensor(mol, **tensor_options) for mol in molecules]
        else:
            X = []
            for file_index, mol in enumerate(molecules, next_file_index):
                tensor = get_molecule_tensor(mol, **tensor_options)
                tensor_path = os.path.abspath(
                    os.path.join(save_tensors_dir, '{}.npy'.format(file_index)))
                np.save(tensor_path, tensor)
                X.append(tensor_path)
            # Keep file names unique across datasets.
            next_file_index += len(X)

        logging.info('Splitting dataset with testing ratio of {0}...'.format(testing_ratio))
        X_test, y_test, X_train, y_train, smis_test, smis_train = \
            split_test_from_train_and_val(X, y, smis, testing_ratio=testing_ratio)

        test_parts.append((X_test, y_test))
        train_parts.append((X_train, y_train))

        if save_meta:
            test_text = '\n'.join(smis_test)
            train_text = '\n'.join(smis_train)
            if meta_dir is None:
                meta_dir = os.getcwd()
            test_path = os.path.join(meta_dir, '{0}.{1}_smis_test.txt'.format(db, table))
            train_path = os.path.join(meta_dir, '{0}.{1}_smis_train.txt'.format(db, table))
            with open(test_path, 'w') as meta_out:
                meta_out.write(test_text)
            with open(train_path, 'w') as meta_out:
                meta_out.write(train_text)

    # Merge all training parts into the first one.
    logging.info('Merging {} datasets for training...'.format(len(datasets)))
    X_train, y_train = train_parts[0]
    for extra_X, extra_y in train_parts[1:]:
        X_train.extend(extra_X)
        y_train.extend(extra_y)

    # Merge all test parts into the first one.
    X_test, y_test = test_parts[0]
    for extra_X, extra_y in test_parts[1:]:
        X_test.extend(extra_X)
        y_test.extend(extra_y)

    return X_test, y_test, X_train, y_train
Ejemplo n.º 4
0
def prepare_full_train_data_from_file(datafile,
                                      add_extra_atom_attribute=True,
                                      add_extra_bond_attribute=True,
                                      differentiate_atom_type=True,
                                      differentiate_bond_type=True,
                                      padding=True,
                                      padding_final_size=20,
                                      save_meta=True,
                                      save_tensors_dir=None,
                                      testing_ratio=0.0,
                                      meta_dir=None):
    """
    Read molecules and target values from a text file, convert the molecules
    to tensors and split the data into test and train parts.

    Every non-blank line of `datafile` is split on whitespace: the first
    token is the molecule identifier (passed to `str_to_mol`), the remaining
    tokens are parsed as floats and form that molecule's target values. The
    targets are stored as a float32 numpy array.

    The tensor-related arguments are forwarded unchanged to
    `get_molecule_tensor`. If `save_tensors_dir` is given, the directory is
    created when missing, each tensor is saved there as `<index>.npy` and X
    holds the absolute file paths instead of the tensors.

    If `save_meta` is true, the test and train identifiers are written, one
    per line, to `identifiers_test.txt` and `identifiers_train.txt` in
    `meta_dir` (the current working directory when `meta_dir` is None).

    Returns `(X_test, y_test, X_train, y_train)`.
    """
    identifiers = []
    targets = []
    with open(datafile) as data_in:
        for raw_line in data_in:
            tokens = raw_line.strip().split()
            if not tokens:
                # Skip blank lines.
                continue
            values = [float(token) for token in tokens[1:]]
            identifiers.append(tokens[0])
            targets.append(values)
    y = np.array(targets).astype(np.float32)

    logging.info('Loading data from {}...'.format(datafile))
    store_on_disk = save_tensors_dir is not None
    if store_on_disk and not os.path.exists(save_tensors_dir):
        os.makedirs(save_tensors_dir)

    # Options shared by every molecule-to-tensor conversion below.
    tensor_options = dict(add_extra_atom_attribute=add_extra_atom_attribute,
                          add_extra_bond_attribute=add_extra_bond_attribute,
                          differentiate_atom_type=differentiate_atom_type,
                          differentiate_bond_type=differentiate_bond_type,
                          padding=padding,
                          padding_final_size=padding_final_size)

    X = []
    for file_index, identifier in enumerate(identifiers):
        tensor = get_molecule_tensor(str_to_mol(identifier), **tensor_options)
        if store_on_disk:
            # Keep only the file name in X; the tensor itself lives on disk.
            tensor_path = os.path.abspath(
                os.path.join(save_tensors_dir, '{}.npy'.format(file_index)))
            np.save(tensor_path, tensor)
            X.append(tensor_path)
        else:
            X.append(tensor)

    logging.info('Splitting dataset with testing ratio of {}...'.format(testing_ratio))
    X_test, y_test, X_train, y_train, ids_test, ids_train = \
        split_test_from_train_and_val(X, y, extra_data=identifiers, testing_ratio=testing_ratio)

    if save_meta:
        test_text = '\n'.join(ids_test)
        train_text = '\n'.join(ids_train)
        if meta_dir is None:
            meta_dir = os.getcwd()
        test_path = os.path.join(meta_dir, 'identifiers_test.txt')
        train_path = os.path.join(meta_dir, 'identifiers_train.txt')
        with open(test_path, 'w') as meta_out:
            meta_out.write(test_text)
        with open(train_path, 'w') as meta_out:
            meta_out.write(train_text)

    return X_test, y_test, X_train, y_train
Ejemplo n.º 5
0
def prepare_folded_data_from_multiple_datasets(datasets,
                                               folds,
                                               add_extra_atom_attribute=True,
                                               add_extra_bond_attribute=True,
                                               differentiate_atom_type=True,
                                               differentiate_bond_type=True,
                                               padding=True,
                                               padding_final_size=20,
                                               prediction_task="Hf298(kcal/mol)",
                                               save_tensors_dir=None):
    """
    Load several database tables, convert their molecules to tensors, split
    off a test part per table, fold the remainder and merge everything.

    `datasets` is a sequence of `(host, db, table, testing_ratio)` tuples.
    Each table is read with `get_data_from_db` for `prediction_task`; the
    tensor-related arguments are forwarded unchanged to
    `get_molecule_tensor`. The non-test data of each table is passed to
    `prepare_folded_data` together with `folds`.

    If `save_tensors_dir` is given, the directory is created when missing,
    every tensor is written there as `<index>.npy` (the index keeps counting
    across datasets) and X holds the absolute file paths instead of tensors.

    Returns `(X_test, y_test, folded_Xs, folded_ys)`. Folds with the same
    position are merged across datasets by extending the first dataset's
    folds in place; the test data is merged the same way.
    """
    if save_tensors_dir is not None and not os.path.exists(save_tensors_dir):
        os.makedirs(save_tensors_dir)

    # Options shared by every molecule-to-tensor conversion below.
    tensor_options = dict(add_extra_atom_attribute=add_extra_atom_attribute,
                          add_extra_bond_attribute=add_extra_bond_attribute,
                          differentiate_atom_type=differentiate_atom_type,
                          differentiate_bond_type=differentiate_bond_type,
                          padding=padding,
                          padding_final_size=padding_final_size)

    fold_parts = []
    test_parts = []
    next_file_index = 0
    for host, db, table, testing_ratio in datasets:
        molecules, y, _ = get_data_from_db(host, db, table, prediction_task=prediction_task)

        # `molecules` holds molecule objects at this point; turn them into
        # tensors. With save_tensors_dir set, the tensors go to disk and only
        # their file names are kept in X.
        if save_tensors_dir is None:
            X = [get_molecule_tensor(mol, **tensor_options) for mol in molecules]
        else:
            X = []
            for file_index, mol in enumerate(molecules, next_file_index):
                tensor = get_molecule_tensor(mol, **tensor_options)
                tensor_path = os.path.abspath(
                    os.path.join(save_tensors_dir, '{}.npy'.format(file_index)))
                np.save(tensor_path, tensor)
                X.append(tensor_path)
            # Keep file names unique across datasets.
            next_file_index += len(X)

        logging.info('Splitting dataset with testing ratio of {0}...'.format(testing_ratio))
        X_test, y_test, X_train_and_val, y_train_and_val = \
            split_test_from_train_and_val(X, y, testing_ratio=testing_ratio)

        test_parts.append((X_test, y_test))
        fold_parts.append(prepare_folded_data(X_train_and_val, y_train_and_val, folds))

    # Merge the folds of all datasets position by position.
    logging.info('Merging {} datasets for training...'.format(len(datasets)))
    folded_Xs, folded_ys = fold_parts[0]
    for other_Xs, other_ys in fold_parts[1:]:
        merged_Xs = []
        merged_ys = []
        for position, fold_X in enumerate(folded_Xs):
            fold_X.extend(other_Xs[position])
            merged_Xs.append(fold_X)

            fold_y = folded_ys[position]
            fold_y.extend(other_ys[position])
            merged_ys.append(fold_y)

        folded_Xs, folded_ys = merged_Xs, merged_ys

    # Merge all test parts into the first one.
    X_test, y_test = test_parts[0]
    for extra_X, extra_y in test_parts[1:]:
        X_test.extend(extra_X)
        y_test.extend(extra_y)

    return X_test, y_test, folded_Xs, folded_ys