Ejemplo n.º 1
0
    def calculate_unhashed_fps(self,draw_substructures=False,image_directory='./images_substructures'): 
        # get the dictionary for the substructures
        idxs = []
        substr_ids = []
        counts=[]
        substructure_dictionaries = []    
        for mol_index,mol in enumerate(self.mols):
            info={}
            fp = _GetMorganFingerprint(mol,radius=self.max_radius,bitInfo=info)
            substructure_dictionary = {k:mol_index for k,v in info.iteritems() if v[0][1] in self.radii}
            substructure_dictionaries.append({k:mol_index for k,v in info.iteritems() if v[0][1] in self.radii})
            substr_ids.append(substructure_dictionary.keys())
            idxs.append([mol_index]*len(substructure_dictionary.keys()))
            counts.append([ len(info.values()[x]) for x in _arange(0,len(info)) if info.values()[x][0][1] in self.radii])
            
            # get the smiles for the substructures
            amap = {}
            substructures_smiles = {k:[_MolToSmiles(_PathToSubmol(mol,_FindAtomEnvironmentOfRadiusN(mol,v[0][1],v[0][0]),atomMap=amap))] for k,v in info.iteritems() if v[0][1] in self.radii}
            self.substructures_smiles.update(substructures_smiles)
            
            # generate the images for the substructures if required..
            if draw_substructures:
                if not _exists(image_directory):
                    _makedirs(image_directory)
                for k,v in info.iteritems():
                    if k not in self.substructure_dictionary.keys() and v[0][1] in self.radii:
                        image_name="%s/Molecule_%d_substr_%d.pdf"%(image_directory,mol_index,k)
                        env=_FindAtomEnvironmentOfRadiusN(mol,v[0][1],v[0][0])
                        amap={}
                        submol=_PathToSubmol(mol,env,atomMap=amap)
                        _MolToFile(mol,image_name,size=(300,300),wedgeBonds=True,kekulize=True,highlightAtoms=amap.keys())
            
        #self.substructure_dictionary = self._combine_dicts(substructure_dictionary,self.substructure_dictionary)
        for d in substructure_dictionaries:
             for k, v in d.iteritems():
               l=self.substructure_dictionary.setdefault(k,[])
               if v not in l:
                 l.append(v)
            
        idxs = _array([val for sublist in idxs for val in sublist])
        counts = _array([val for sublist in counts for val in sublist])
        substr_ids_flattened = [val for sublist in substr_ids for val in sublist]
        substr_ids = _array(substr_ids_flattened)
        self.substructure_ids = substr_ids
        if len(self.reference_substructure_keys)==0:
            print "No input set of keys for the substructures. \nThus, the substructures present in the input molecules will be considered for the calculation of unhashed fingerprints."
            columns = _array(list(set(self.substructure_dictionary.keys())))
            columns = _sort(columns)
            self.columns_unhashed = columns
            dimensionality_unhashed = len(columns)
        else:
            columns = _array(self.reference_substructure_keys)
            columns = _sort(columns)
            self.columns_unhashed = columns
            dimensionality_unhashed = len(columns)
        
        fps_unhashed_binary = _zeros((len(self.mols),dimensionality_unhashed), dtype=int)
        fps_unhashed_counts = _zeros((len(self.mols),dimensionality_unhashed), dtype=int)

            
        mapping = _array([(substr_ids[x]==columns).nonzero() for x in _arange(0,len(substr_ids))])
        mapping = mapping.flatten()
        idxs = _array([idxs[x] for x in _arange(0,len(mapping)) if mapping[x].size != 0])
        counts = _array([counts[x] for x in _arange(0,len(mapping)) if mapping[x].size != 0])
        mapping = _array([mapping[x] for x in _arange(0,len(mapping)) if mapping[x].size != 0])
        if len(mapping) == 0:
            print "There is no intersection between the substructures \n(i)provided in the reference key set, and\n(ii) the substructures found in the input molecules."
            return
        
        fps_unhashed_binary[idxs,mapping] = _ones(len(mapping))
        fps_unhashed_counts[idxs,mapping] = counts
        self.fps_unhashed_binary = fps_unhashed_binary
        self.fps_unhashed_counts = fps_unhashed_counts
Ejemplo n.º 2
0
def ones(*shp, dtype='float64'):
    return _ones(shp, dtype=dtype)
    def calculate_unhashed_fps(self,
                               draw_substructures=False,
                               image_directory='./images_substructures'):
        # get the dictionary for the substructures
        idxs = []
        substr_ids = []
        counts = []

        for mol_index, mol in enumerate(self.mols):
            info = {}
            fp = _GetMorganFingerprint(mol,
                                       radius=self.max_radius,
                                       bitInfo=info)
            substructure_dictionary = {
                k: [mol_index]
                for k, v in info.iteritems() if v[0][1] in self.radii
            }
            substr_ids.append(substructure_dictionary.keys())
            idxs.append([mol_index] * len(substructure_dictionary.keys()))
            counts.append([
                len(info.values()[x]) for x in _arange(0, len(info))
                if info.values()[x][0][1] in self.radii
            ])

            # get the smiles for the substructures
            amap = {}
            substructures_smiles = {
                k: [
                    _MolToSmiles(
                        _PathToSubmol(mol,
                                      _FindAtomEnvironmentOfRadiusN(
                                          mol, v[0][1], v[0][0]),
                                      atomMap=amap))
                ]
                for k, v in info.iteritems() if v[0][1] in self.radii
            }
            self.substructures_smiles.update(substructures_smiles)

            # generate the images for the substructures if required..
            if draw_substructures:
                if not _exists(image_directory):
                    _makedirs(image_directory)
                for k, v in info.iteritems():
                    if k not in self.substructure_dictionary.keys(
                    ) and v[0][1] in self.radii:
                        image_name = "%s/Molecule_%d_substr_%d.pdf" % (
                            image_directory, mol_index, k)
                        env = _FindAtomEnvironmentOfRadiusN(
                            mol, v[0][1], v[0][0])
                        amap = {}
                        submol = _PathToSubmol(mol, env, atomMap=amap)
                        _MolToFile(mol,
                                   image_name,
                                   size=(300, 300),
                                   wedgeBonds=True,
                                   kekulize=True,
                                   highlightAtoms=amap.keys())

            self.substructure_dictionary = self._combine_dicts(
                substructure_dictionary, self.substructure_dictionary)

        idxs = _array([val for sublist in idxs for val in sublist])
        counts = _array([val for sublist in counts for val in sublist])
        substr_ids_flattened = [
            val for sublist in substr_ids for val in sublist
        ]
        substr_ids = _array(substr_ids_flattened)
        self.substructure_ids = substr_ids
        if len(self.reference_substructure_keys) == 0:
            print "No input set of keys for the substructures. \nThus, the substructures present in the input molecules will be considered for the calculation of unhashed fingerprints."
            columns = _array(list(set(self.substructure_dictionary.keys())))
            columns = _sort(columns)
            self.columns_unhashed = columns
            dimensionality_unhashed = len(columns)
        else:
            columns = _array(self.reference_substructure_keys)
            columns = _sort(columns)
            self.columns_unhashed = columns
            dimensionality_unhashed = len(columns)

        fps_unhashed_binary = _zeros((len(self.mols), dimensionality_unhashed),
                                     dtype=int)
        fps_unhashed_counts = _zeros((len(self.mols), dimensionality_unhashed),
                                     dtype=int)

        mapping = _array([(substr_ids[x] == columns).nonzero()
                          for x in _arange(0, len(substr_ids))])
        mapping = mapping.flatten()
        idxs = _array([
            idxs[x] for x in _arange(0, len(mapping)) if mapping[x].size != 0
        ])
        counts = _array([
            counts[x] for x in _arange(0, len(mapping)) if mapping[x].size != 0
        ])
        mapping = _array([
            mapping[x] for x in _arange(0, len(mapping))
            if mapping[x].size != 0
        ])
        if len(mapping) == 0:
            print "There is no intersection between the substructures \n(i)provided in the reference key set, and\n(ii) the substructures found in the input molecules."
            return

        fps_unhashed_binary[idxs, mapping] = _ones(len(mapping))
        fps_unhashed_counts[idxs, mapping] = counts
        self.fps_unhashed_binary = fps_unhashed_binary
        self.fps_unhashed_counts = fps_unhashed_counts
Ejemplo n.º 4
0
def ones(*shp, dtype="float64"):
    return _ones(shp, dtype=dtype)