Beispiel #1
0
    def __getitem__(self, index):
        """Load one molecule graph, filter its couplings and add ACSF features.

        The pickle ``<get_path()>data/graphs/<self.graph_dir>/<name>.pickle``
        is read, where ``name`` is ``self.id[index]``. Only the couplings
        whose type index matches one of ``self.coupling_types`` (looked up in
        ``COUPLING_TYPE``) are kept. The output of ``ACSF_GENERATOR.create``
        is appended to ``graph.node``; afterwards ``graph.node`` and
        ``graph.edge`` are each concatenated along their last axis.

        Args:
            index: Position into ``self.id``.

        Returns:
            The loaded graph object, modified in place as described above.

        Raises:
            AssertionError: If the name stored in the pickle differs from the
                requested molecule name.
        """
        molecule_name = self.id[index]
        graph_file = (get_path() + 'data/graphs/' + self.graph_dir +
                      '/%s.pickle' % molecule_name)
        graph = read_pickle_from_file(graph_file)
        assert (graph.molecule_name == molecule_name)

        # ``np.bool`` was removed in NumPy 1.24; the builtin ``bool`` yields
        # the same boolean dtype on every NumPy version.
        mask = np.zeros(len(graph.coupling.type), bool)
        for t in self.coupling_types:
            # OR the per-type matches together (``+=`` on boolean arrays did
            # the same thing, ``|=`` states the intent explicitly).
            mask |= (graph.coupling.type == COUPLING_TYPE.index(t))

        # Apply the same row selection to every per-coupling array.
        # NOTE: ``graph.coupling.contribution`` is deliberately left
        # unfiltered, as in the original code where that line is disabled.
        graph.coupling.id = graph.coupling.id[mask]
        graph.coupling.index = graph.coupling.index[mask]
        graph.coupling.type = graph.coupling.type[mask]
        graph.coupling.value = graph.coupling.value[mask]

        # graph.axyz[0] is passed as the symbols, graph.axyz[1] as positions.
        atom = System(symbols=graph.axyz[0], positions=graph.axyz[1])
        acsf = ACSF_GENERATOR.create(atom)

        graph.node += [acsf]

        # Merge the per-feature arrays into one array each (last axis).
        graph.node = np.concatenate(graph.node, -1)
        graph.edge = np.concatenate(graph.edge, -1)
        return graph
Beispiel #2
0
    def __getitem__(self, index):
        """Return the pickled graph for ``self.id[index]`` with ACSF appended.

        The graph is read from ``DATA_DIR/structure/graph1/<name>.pickle``.
        The output of ``ACSF_GENERATOR.create`` is added to ``graph.node``,
        then ``graph.node`` and ``graph.edge`` are each concatenated along
        their last axis. Couplings are not filtered by type here.

        Args:
            index: Position into ``self.id``.

        Returns:
            The loaded graph object with concatenated ``node`` and ``edge``.

        Raises:
            AssertionError: If the pickle's ``molecule_name`` does not match
                the requested name.
        """
        name = self.id[index]
        pickle_path = DATA_DIR + '/structure/graph1/%s.pickle' % name
        graph = read_pickle_from_file(pickle_path)
        assert graph.molecule_name == name

        # graph.axyz[0] -> symbols, graph.axyz[1] -> positions.
        system = System(symbols=graph.axyz[0], positions=graph.axyz[1])
        descriptor = ACSF_GENERATOR.create(system)
        graph.node += [descriptor]

        # Collapse the feature lists into single arrays (last axis).
        graph.node = np.concatenate(graph.node, -1)
        graph.edge = np.concatenate(graph.edge, -1)
        return graph
Beispiel #3
0
def func_acsf(params):
    """Create the ``gen`` descriptor output for one molecule.

    Args:
        params: A two-element ``(index, molecule)`` pair. The index is
            unpacked but not used; ``molecule`` is the key looked up in the
            module-level ``st_dict``.

    Returns:
        Whatever ``gen.create`` returns for the ``System`` built from the
        molecule's ``atom`` column (symbols) and ``x``/``y``/``z`` columns
        (positions).
    """
    _index, molecule_key = params
    structure = st_dict[molecule_key]
    symbols = structure["atom"].values
    positions = structure[["x", "y", "z"]].values
    return gen.create(System(symbols=symbols, positions=positions))
Beispiel #4
0
    def __getitem__(self, index):
        """Load a ``graph_v8`` pickle as a list and build its feature arrays.

        Steps, in order:
          1. Read ``../data/graph_v8/<name>.pickle`` and turn it into a list.
          2. Append the ``ACSF_GENERATOR.create`` output to ``graph[3]``.
          3. Reshape entry 7 of ``graph[g_node_idx]`` into a column.
          4. Concatenate ``graph[3]`` along the last axis.
          5. Take column 4 of the concatenated ``graph[4]`` as ``dist`` and
             append ``1/dist``, ``1/dist**2``, ``1/dist**3`` and
             ``1/dist**6`` to ``graph[4]`` before concatenating it as well.
          6. Replace NaN entries in both arrays with 0.

        NOTE(review): step 3 indexes with ``g_node_idx`` while the other
        steps use the literal ``3`` -- presumably the same slot; confirm.
        NOTE(review): column 4 is presumably an inter-atomic distance --
        verify against the code that writes the graph_v8 pickles.

        Args:
            index: Position into ``self.id``.

        Returns:
            The graph as a list; items 3 and 4 are the concatenated arrays.

        Raises:
            AssertionError: If ``graph[0]`` differs from the requested name.
        """
        name = self.id[index]
        pickle_path = '../data/graph_v8/%s.pickle' % name
        graph = list(read_pickle_from_file(pickle_path))
        assert graph[0] == name

        # add ACSF: graph[2][0] -> symbols, graph[2][1] -> positions
        system = System(symbols=graph[2][0], positions=graph[2][1])
        descriptor = ACSF_GENERATOR.create(system)
        graph[3] += [descriptor]

        # Must happen before graph[3] is concatenated below.
        graph[g_node_idx][7] = graph[g_node_idx][7].reshape([-1, 1])

        graph[3] = np.concatenate(graph[3], -1)

        dist = np.concatenate(graph[4], -1)[:, 4].reshape(-1, 1)
        inverse_terms = (1 / dist, 1 / dist**2, 1 / dist**3, 1 / dist**6)
        for term in inverse_terms:
            graph[4].append(term)
        graph[4] = np.concatenate(graph[4], -1)

        # Zero out NaNs in place (infinities are left untouched).
        for features in (graph[3], graph[4]):
            features[np.isnan(features)] = 0
        return graph
Beispiel #5
0
def get_scsf(data):
    """Build one long table of per-atom ACSF columns for several molecules.

    For each name in ``data["mol_names"]`` the molecule's rows are taken
    from the module-level ``gb_structure`` group-by, sorted by
    ``atom_index``, and passed to ``ACSF_GENERATOR.create``. The result is
    placed next to the ``molecule_name`` / ``atom_index`` columns with the
    descriptor columns named ``acsf_0``, ``acsf_1``, ...

    Args:
        data: Mapping with a ``"mol_names"`` entry holding the molecule names.

    Returns:
        The per-molecule frames concatenated row-wise with ``pd.concat``.

    Raises:
        ValueError: From ``pd.concat`` when ``data["mol_names"]`` is empty.
    """
    frames = []
    for molecule_name in data["mol_names"]:
        structure = gb_structure.get_group(molecule_name).sort_values(
            ['atom_index'], ascending=True)

        symbols = structure.atom.values.tolist()
        positions = structure[['x', 'y', 'z']].values
        descriptor = ACSF_GENERATOR.create(
            System(symbols=symbols, positions=positions))

        # Integer column labels 0..k-1 become "acsf_0".."acsf_{k-1}".
        features = pd.DataFrame(descriptor).add_prefix("acsf_")
        keys = structure[["molecule_name", "atom_index"]]

        # Drop both indexes so rows are aligned by position, not by label.
        frames.append(
            pd.concat(
                [keys.reset_index(drop=True),
                 features.reset_index(drop=True)],
                axis=1))
    return pd.concat(frames, axis=0)
Beispiel #6
0
def structure_to_graph(structure_file):
    """Convert one structure file into dense edge and atom feature arrays.

    Every ordered pair of distinct atoms ``(i, j)`` becomes an edge, so a
    molecule with ``n`` atoms yields ``n * (n - 1)`` edges.

    Args:
        structure_file: Path to the structure file. It is parsed twice: by
            ``MolFromXYZ`` and by ``pd.read_csv`` (space separated, first
            line skipped, columns ``atom``, ``x``, ``y``, ``z``).

    Returns:
        A 5-tuple ``(edge_array, edge_features, atom_features, smile, xyz)``:

        * ``edge_array``: the ``(i, j)`` pairs, transposed so that row 0
          holds ``i`` and row 1 holds ``j``.
        * ``edge_features``: per edge, concatenated column-wise: bond-type
          one-hot, ``distance / 4 - 1``, ``rel_distance / 4 - 1``, the
          angle term and the 3-component difference vector ``xyz[i] - xyz[j]``.
        * ``atom_features``: per atom, concatenated column-wise: symbol
          one-hot, aromatic flag, hybridization one-hot, total H count,
          atomic number, ACSF output, acceptor flag, donor flag.
        * ``smile``: second value returned by ``MolFromXYZ``.
        * ``xyz``: the coordinate columns as an array.
    """
    mol, smile = MolFromXYZ(structure_file)
    factory = ChemicalFeatures.BuildFeatureFactory(os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef'))
    feature = factory.GetFeaturesForMol(mol)

    structure = pd.read_csv(structure_file, skiprows=1, header=None, sep=" ",
                            names=["atom", "x", "y", "z"])
    # Symbols missing from this table map to NaN.
    structure["radius"] = structure["atom"].map({'H': 0.38, 'C': 0.77, 'N': 0.75, 'O': 0.73, 'F': 0.71})
    xyz = structure[["x", "y", "z"]]
    # Rows scaled to unit L2 norm; their dot product below is the cosine of
    # the angle between the two position vectors as seen from the origin.
    norm_xyz = preprocessing.normalize(xyz, norm='l2')

    # Plain arrays for the O(n^2) pair loop: row access through
    # ``DataFrame.iloc`` on every iteration is needlessly slow.
    coords = xyz.values
    radii = structure["radius"].values

    n_atoms = mol.GetNumAtoms()
    edge_array = []
    bond_features = []
    distance = []
    rel_distance = []
    angle = []
    bond_vector = []

    for i, j in itertools.product(range(n_atoms), repeat=2):
        if i == j:
            continue
        edge_array.append((i, j))

        bond = mol.GetBondBetweenAtoms(i, j)
        # Unbonded pairs are encoded with ``None`` as the bond type.
        bond_type = bond.GetBondType() if bond else None
        bond_features.append(one_hot_encoding(bond_type, BONDS))

        delta = coords[i] - coords[j]
        r = (delta**2).sum()**0.5
        rel_dist = r / (radii[i] + radii[j])  # distance over sum of radii
        theta = (norm_xyz[i] * norm_xyz[j]).sum()

        distance.append([r])
        rel_distance.append([rel_dist])
        angle.append([theta])
        bond_vector.append(delta.tolist())

    edge_array = np.array(edge_array).T
    edge_features = np.concatenate([
        np.array(bond_features),
        np.array(distance) / 4 - 1,
        np.array(rel_distance) / 4 - 1,
        np.array(angle),
        np.array(bond_vector)  # difference between coords of atoms i and j
    ], axis=1)

    atom_features = defaultdict(list)

    for i in range(n_atoms):
        atom = mol.GetAtomWithIdx(i)
        atom_features["symbol"].append(one_hot_encoding(atom.GetSymbol(), SYMBOLS))
        atom_features["aromatic"].append([atom.GetIsAromatic()])
        atom_features["hybridization"].append(one_hot_encoding(atom.GetHybridization(), HYBRIDIZATIONS))

        atom_features["num_h"].append([atom.GetTotalNumHs(includeNeighbors=True)])
        atom_features["atomic"].append([atom.GetAtomicNum()])

    atom = System(symbols=structure["atom"].values, positions=xyz.values)
    acsf = ACSF_GENERATOR.create(atom)
    atom_features["acsf"] = acsf

    acceptor = np.zeros((n_atoms, 1), np.uint8)
    donor = np.zeros((n_atoms, 1), np.uint8)

    # Flag every atom that takes part in a Donor / Acceptor feature.
    for feat in feature:
        if feat.GetFamily() == 'Donor':
            for i in feat.GetAtomIds():
                donor[i] = 1
        elif feat.GetFamily() == 'Acceptor':
            for i in feat.GetAtomIds():
                acceptor[i] = 1

    atom_features = np.concatenate([atom_features["symbol"], atom_features["aromatic"],
                                    atom_features["hybridization"], atom_features["num_h"],
                                    atom_features["atomic"], atom_features["acsf"],
                                    acceptor, donor], axis=1)

    return edge_array, edge_features, atom_features, smile, xyz.values
Beispiel #7
0
def make_graph(name, gb_structure, gb_scalar_coupling):
    """Build the ``Graph`` record (node, edge and coupling data) of a molecule.

    Args:
        name: Molecule name; used as the group key in both group-by objects
            and to locate ``../input/champs-scalar-coupling/structures/<name>.xyz``.
        gb_structure: Group-by over the structure table; each group has the
            columns ``atom_index``, ``atom``, ``x``, ``y``, ``z``.
        gb_scalar_coupling: Group-by over the coupling table; each group has
            the columns ``id``, ``atom_index_0``, ``atom_index_1``, ``type``,
            ``scalar_coupling_constant``, ``fc``, ``sd``, ``pso``, ``dso``.

    Returns:
        A ``Graph`` built positionally from: the name, the SMILES string,
        ``[symbols, xyz]`` (with the scaled coordinates), the list of node
        feature arrays, the list of edge feature arrays
        ``[bond_type, distance, angle]``, ``edge_index`` and a ``Coupling``.
        Edges are all ordered pairs ``(i, j)`` with ``i != j``.

    Raises:
        ValueError: If a coupling refers to an atom pair that is not an edge.
    """
    # ['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type','scalar_coupling_constant']
    coupling_df = gb_scalar_coupling.get_group(name)

    # [molecule_name,atom_index,atom,x,y,z]
    df = gb_structure.get_group(name)
    df = df.sort_values(['atom_index'], ascending=True)
    a = df.atom.values.tolist()
    xyz = df[['x', 'y', 'z']].values

    mol = mol_from_axyz(a, xyz)
    # Second representation of the molecule, used only for partial charges.
    mol_op = openbabel.OBMol()
    obConversion.ReadFile(mol_op, f'../input/champs-scalar-coupling/structures/{name}.xyz')

    factory = ChemicalFeatures.BuildFeatureFactory(os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef'))
    feature = factory.GetFeaturesForMol(mol)

    num_atom = mol.GetNumAtoms()
    symbol = np.zeros((num_atom, len(SYMBOL)), np.uint8)  # category
    acceptor = np.zeros((num_atom, 1), np.uint8)
    donor = np.zeros((num_atom, 1), np.uint8)
    aromatic = np.zeros((num_atom, 1), np.uint8)
    hybridization = np.zeros((num_atom, len(HYBRIDIZATION)), np.uint8)
    num_h = np.zeros((num_atom, 1), np.float32)  # real
    atomic = np.zeros((num_atom, 1), np.float32)

    # new features
    degree = np.zeros((num_atom, 1), np.uint8)
    formalCharge = np.zeros((num_atom, 1), np.float32)
    chiral_tag = np.zeros((num_atom, 1), np.uint8)
    crippen_contribs = np.zeros((num_atom, 2), np.float32)
    tpsa = np.zeros((num_atom, 1), np.float32)
    labute_asac = np.zeros((num_atom, 1), np.float32)
    gasteiger_charges = np.zeros((num_atom, 1), np.float32)
    esataindices = np.zeros((num_atom, 1), np.float32)
    atomic_radiuss = np.zeros((num_atom, 1), np.float32)
    electronegate = np.zeros((num_atom, 1), np.float32)
    electronegate_sqre = np.zeros((num_atom, 1), np.float32)
    mass = np.zeros((num_atom, 1), np.float32)
    van = np.zeros((num_atom, 1), np.float32)
    cov = np.zeros((num_atom, 1), np.float32)
    ion = np.zeros((num_atom, 1), np.float32)

    # These calls return values for the whole molecule, so evaluate them once
    # instead of once per atom inside the loop below.
    crippen_all = rdMolDescriptors._CalcCrippenContribs(mol)
    tpsa_all = rdMolDescriptors._CalcTPSAContribs(mol)
    labute_all = rdMolDescriptors._CalcLabuteASAContribs(mol)[0]
    estate_all = EState.EStateIndices(mol)

    for i in range(num_atom):
        atom = mol.GetAtomWithIdx(i)
        atom_op = mol_op.GetAtomById(i)
        atom_symbol = atom.GetSymbol()

        symbol[i] = one_hot_encoding(atom_symbol, SYMBOL)
        aromatic[i] = atom.GetIsAromatic()
        hybridization[i] = one_hot_encoding(atom.GetHybridization(), HYBRIDIZATION)
        num_h[i] = atom.GetTotalNumHs(includeNeighbors=True)
        atomic[i] = atom.GetAtomicNum()

        degree[i] = atom.GetTotalDegree()
        formalCharge[i] = atom.GetFormalCharge()
        chiral_tag[i] = int(atom.GetChiralTag())

        crippen_contribs[i] = crippen_all[i]
        tpsa[i] = tpsa_all[i]
        labute_asac[i] = labute_all[i]
        gasteiger_charges[i] = atom_op.GetPartialCharge()
        esataindices[i] = estate_all[i]

        # Per-element constants from the module-level lookup tables.
        atomic_radiuss[i] = atomic_radius[atom_symbol]
        electronegate[i] = electronegativity[atom_symbol]
        electronegate_sqre[i] = electronegativity_square[atom_symbol]
        mass[i] = atomic_mass[atom_symbol]
        van[i] = vanderwaalsradius[atom_symbol]
        cov[i] = covalenzradius[atom_symbol]
        ion[i] = ionization_energy[atom_symbol]

    # Flag every atom that takes part in a Donor / Acceptor feature.
    for feat in feature:
        if feat.GetFamily() == 'Donor':
            for i in feat.GetAtomIds():
                donor[i] = 1
        elif feat.GetFamily() == 'Acceptor':
            for i in feat.GetAtomIds():
                acceptor[i] = 1

    num_edge = num_atom * num_atom - num_atom
    edge_index = np.zeros((num_edge, 2), np.uint32)
    bond_type = np.zeros((num_edge, len(BOND_TYPE)), np.uint32)
    distance = np.zeros((num_edge, 1), np.float32)
    angle = np.zeros((num_edge, 1), np.float32)

    norm_xyz = preprocessing.normalize(xyz, norm='l2')

    # (i, j) -> row in edge_index, so couplings can be mapped to their edge
    # without scanning the whole edge list for each of them.
    edge_row = {}

    ij = 0
    for i in range(num_atom):
        for j in range(num_atom):
            if i == j:
                continue
            edge_index[ij] = [i, j]
            edge_row[(i, j)] = ij

            # Unbonded pairs keep an all-zero bond_type row.
            bond = mol.GetBondBetweenAtoms(i, j)
            if bond is not None:
                bond_type[ij] = one_hot_encoding(bond.GetBondType(), BOND_TYPE)

            distance[ij] = np.linalg.norm(xyz[i] - xyz[j])
            angle[ij] = (norm_xyz[i] * norm_xyz[j]).sum()

            ij += 1

    # Scaled only after the distance/angle features above were computed from
    # the unscaled coordinates.
    # NOTE(review): the factor is presumably Angstrom -> Bohr; confirm.
    xyz = xyz * 1.889726133921252

    atom = System(symbols=a, positions=xyz)
    acsf = ACSF_GENERATOR.create(atom)

    coupling_pairs = coupling_df[['atom_index_0', 'atom_index_1']].values
    try:
        coupling_rows = np.array(
            [edge_row[tuple(pair)] for pair in coupling_pairs.tolist()])
    except KeyError as err:
        # Same exception type the former ``list.index`` lookup raised.
        raise ValueError(
            f'{list(err.args[0])} is not an edge of molecule {name}') from err

    # Columns: atom_index_0, atom_index_1, row of that pair in edge_index.
    coupling_edge_index = np.concatenate(
        [coupling_pairs, coupling_rows.reshape(len(coupling_rows), 1)],
        axis=1)

    coupling = Coupling(coupling_df['id'].values,
                        coupling_df[['fc', 'sd', 'pso', 'dso']].values,
                        coupling_edge_index,
                        np.array([COUPLING_TYPE.index(t) for t in coupling_df.type.values], np.int32),
                        coupling_df['scalar_coupling_constant'].values,
                        )

    graph = Graph(
        name,
        Chem.MolToSmiles(mol),
        [a, xyz],
        [acsf, symbol, acceptor, donor, aromatic, hybridization, num_h, atomic, degree, formalCharge, chiral_tag,
         crippen_contribs, tpsa, labute_asac, gasteiger_charges, esataindices, atomic_radiuss, electronegate,
         electronegate_sqre, mass, van, cov, ion],
        [bond_type, distance, angle, ],
        edge_index,
        coupling,
    )

    return graph
Beispiel #8
0
    def __getitem__(self, idx):
        self.global_features = h5py.File(script_dir + '/../processed_data/global_116.h5', mode = 'r')
        self.atom_features = h5py.File(script_dir + '/../processed_data/atom_116.h5', mode = 'r')
        self.bond_features = h5py.File(script_dir + '/../processed_data/bond_116.h5', mode = 'r')
        
        molecule_id = self.molecules_ids[idx]
        molecule = self.molecules[idx]
        
        atom_descriptor = self.atom_descriptors.loc[molecule_id]
        bond_descriptor = self.bond_descriptors.loc[molecule_id]
        #bond_descriptor = bond_descriptor.loc[bond_descriptor['bond_distance'] <= 3]
        
        # Cycles
        if molecule_id in self.cycles.index:
            cycles = self.cycles.loc[molecule_id]
        else:
            cycles = pd.DataFrame(columns = self.cycles.columns)
            
        cycles_edge_index = cycles['edge_index'].values.astype(np.int64)
        cycles_id = cycles['cycle_id'].values.astype(np.int64)
        
        # Edge connectivity
        edges_connectivity = self.edges_connectivity.loc[molecule_id]
        edges_connectivity_ids = np.copy(edges_connectivity[['edge_index_0', 'edge_index_1']].values.astype(np.int64).T)
        edges_connectivity_vectors_0 = edges_connectivity[['vx_0', 'vy_0', 'vz_0']].values
        edges_connectivity_vectors_1 = edges_connectivity[['vx_1', 'vy_1', 'vz_1']].values
        
        edges_connectivity_feature_1 = np.sqrt(np.square(edges_connectivity_vectors_0).sum(axis = 1)).reshape(-1, 1)
        edges_connectivity_feature_2 = np.sqrt(np.square(edges_connectivity_vectors_1).sum(axis = 1)).reshape(-1, 1)
        edges_connectivity_feature_3 = np.sqrt(np.square(edges_connectivity_vectors_0 + edges_connectivity_vectors_1).sum(axis = 1)).reshape(-1, 1)
        edges_connectivity_feature_0 = (edges_connectivity_vectors_0 * edges_connectivity_vectors_1).sum(axis = 1).reshape(-1, 1) / edges_connectivity_feature_1 / edges_connectivity_feature_2
        edges_connectivity_features = np.concatenate([edges_connectivity_feature_0, edges_connectivity_feature_1, edges_connectivity_feature_2, edges_connectivity_feature_3], axis = 1)
        
        

        atom = list(atom_descriptor['atom'])
        xyz = atom_descriptor[['x', 'y', 'z']].values
        xyz = apply_random_rotation(xyz)

        connectivity = bond_descriptor[['atom_index_0', 'atom_index_1']].values

        global_feature_numeric = np.copy(self.global_features['numeric'][molecule_id].reshape(1, -1))
        global_feature_embeddings = np.copy(self.global_features['embeddings'][molecule_id].reshape(1, -1))
        
        atom_indexes = atom_descriptor['index'].values
        atom_index_min = atom_indexes.min()
        atom_index_max = atom_indexes.max()
        
        atom_feature_numeric = np.copy(self.atom_features['numeric'][atom_index_min : atom_index_max + 1][atom_indexes - atom_index_min])
        atom_feature_embeddings = np.copy(self.atom_features['embeddings'][atom_index_min : atom_index_max + 1][atom_indexes - atom_index_min])
        
        bond_indexes = bond_descriptor['index'].values
        bond_index_min = bond_indexes.min()
        bond_index_max = bond_indexes.max()
        
        bond_feature_numeric = np.copy(self.bond_features['numeric'][bond_index_min : bond_index_max + 1][bond_indexes - bond_index_min])
        bond_feature_embeddings = np.copy(self.bond_features['embeddings'][bond_index_min : bond_index_max + 1][bond_indexes - bond_index_min])
        
        self.global_features.close()
        self.atom_features.close()
        self.bond_features.close()
        
        # chemical descriptors
        atom = System(symbols = atom, positions=xyz)
        acsf = ACSF_GENERATOR.create(atom)
        
        atom_feature_numeric = np.concatenate([atom_feature_numeric, xyz, acsf], axis = 1)
        
        bond_vectors = build_bond_vector(connectivity, xyz)
        
        bond_feature_numeric = np.concatenate([bond_feature_numeric, bond_vectors], axis = 1)
        
        if self.name == 'train':
            # Target
            target = bond_descriptor['scalar_coupling_constant'].values.reshape(-1, 1)
            target_mask = (bond_descriptor['type'] != 'VOID').values.reshape(-1, 1)
            target_types = bond_descriptor['type_id'].values.reshape(-1, 1)
            target_idx = bond_descriptor['edge_index'].values.reshape(-1, 1)


            # data
            data = Data(
                x_numeric = torch.tensor(atom_feature_numeric, dtype = torch.float32),
                x_embeddings = torch.tensor(atom_feature_embeddings, dtype = torch.int64),
                
                edge_attr_numeric = torch.tensor(bond_feature_numeric, dtype = torch.float32),
                edge_attr_embeddings = torch.tensor(bond_feature_embeddings, dtype = torch.int64),
                
                u_numeric = torch.tensor(global_feature_numeric, dtype = torch.float32),
                u_embeddings = torch.tensor(global_feature_embeddings, dtype = torch.int64),
                
                edge_index = torch.tensor(connectivity.T),
                
                num_nodes = atom_feature_numeric.shape[0],
                
                molecule_ids = torch.tensor([molecule_id], dtype = torch.int64),
                
                y = torch.tensor(target, dtype = torch.float32),
                y_mask = torch.tensor(target_mask, dtype = torch.float32),
                y_types = torch.tensor(target_types, dtype = torch.int64),
                y_idx = torch.tensor(target_idx, dtype = torch.int32),
                
                cycles_edge_index = torch.tensor(cycles_edge_index),
                cycles_id = torch.tensor(cycles_id),
                
                edges_connectivity_ids = torch.tensor(edges_connectivity_ids),
                edges_connectivity_features = torch.tensor(edges_connectivity_features, dtype = torch.float32),
            )
            
            
            inputs = [
                data.u_embeddings, data.x_embeddings, data.edge_attr_embeddings,
                data.u_numeric, data.x_numeric, data.edge_attr_numeric,
            ]