def __init__(self, antigen, antibody):
        """Set up an empty 60-slot fingerprint for an antigen/antibody pair.

        Each key of this mapping gets, on first access, a fresh list of 60
        zero counts. Besides storing the two structures, the constructor
        prepares the neighbour tables, the caches, and a residue-code ->
        property-id lookup derived from ``fp_rule``.
        """
        # default value for a missing key: a new list of 60 zeros
        OrderedDefaultDict.__init__(self, lambda: [0] * 60)

        self.antigen = antigen
        self.antibody = antibody

        # caches
        self.nearby_relation = defaultdict(dict)
        self.dist_group_cache = defaultdict(dict)

        # per-residue neighbour tables: residue -> {group index: [residues]}
        self.nearby_reses_in_antigen = OrderedDefaultDict(lambda: defaultdict(list))
        self.nearby_reses_in_antibody = OrderedDefaultDict(lambda: defaultdict(list))

        self.atom_dist_cutoff = 4.0

        # property id -> residue codes carrying that property
        # NOTE(review): GLN is listed in no group, while GLU shows up in
        # groups 0, 2 and 3 -- possibly a typo for GLN somewhere; confirm.
        self.fp_rule = {
            0: ['TYR', 'ASN', 'GLU', 'SER', 'CYS', 'THR', 'GLY'],          # polar
            1: ['PHE', 'LEU', 'ILE', 'TRP', 'VAL', 'MET', 'PRO', 'ALA'],   # hydrop
            2: ['ARG', 'ASP', 'GLU', 'LYS', 'HIS'],                        # charged
            3: ['ALA', 'VAL', 'LEU', 'ILE', 'MET', 'ASN', 'GLU', 'LYS',
                'ARG', 'GLY', 'SER', 'THR', 'CYS', 'ASP', 'PHE'],          # lipids
            4: ['PHE', 'TYR', 'TRP'],                                      # aromatic
            5: ['PRO', 'HIS'],                                             # heterocyclic
        }

        print("initializing FingerPrint_60 object")

        # invert fp_rule once so lookups by residue code are direct
        prop_ids_by_code = defaultdict(list)
        for prop_id, codes in self.fp_rule.items():
            for code in codes:
                prop_ids_by_code[code].append(prop_id)
        self.res_prop_ids = prop_ids_by_code
Exemple #2
0
 def __init__(self, res, bitlength, values=None):
     """Create a fingerprint of ``bitlength`` slots for residue ``res``.

     Missing keys default to ``float()``. The index window is recorded as
     ``min_idx`` (always 0) and ``max_idx`` (``min_idx + bitlength``).

     ``values`` is optional initial content handed to ``set_val``:
     a dict is passed through unchanged, a list is first turned into an
     ``OrderedDict`` keyed by position. Any other non-None type raises
     ``ValueError``.
     """
     OrderedDefaultDict.__init__(self, float)
     self.bitlength = bitlength
     self.min_idx = 0
     self.max_idx = self.min_idx + bitlength
     self.res = res

     # nothing to preload
     if values is None:
         return

     if isinstance(values, list):
         # positions become the keys
         values = OrderedDict(enumerate(values))
     elif not isinstance(values, dict):
         raise ValueError("invalid values type, either dict or list")

     self.set_val(values)
def get_manual_groups(group_id = "157" ):
    """Load a manual classification and return its groups of complex names.

    Reads two line-aligned text files under
    ``<data_root>/manual_classification_result``:
    ``<group_id>_pdbname.txt`` (one complex name per line) and
    ``<group_id>_type.txt`` (the class label for the same line).

    A name is kept only if, after joining its whitespace-separated tokens
    with ``_``, it matches the base name (text before the first ``.``) of a
    file matched by the module-level ``pdb_src`` glob pattern. Lines where
    either the name or the label is empty are skipped.

    :param group_id: prefix selecting which pair of result files to read.
    :returns: ``class_d.values()`` -- one list of complex names per class
        label (presumably in first-seen label order; ``OrderedDefaultDict``
        is defined elsewhere).
    """
    print("generating manual classification and grouping")

    # A set gives constant-time membership tests in the loop below.
    pdb_names = set(
        os.path.basename(fname).split('.')[0].strip()
        for fname in glob.glob(pdb_src)
    )

    pdb_fp = os.path.join(data_root , 'manual_classification_result/%s_pdbname.txt' %group_id)
    type_fp = os.path.join(data_root , 'manual_classification_result/%s_type.txt' %group_id)

    class_d = OrderedDefaultDict(list)
    # Context managers make sure both files are closed, even on error.
    with open(pdb_fp) as pdb_file:
        with open(type_fp) as type_file:
            for name, c_type in zip(pdb_file.readlines(), type_file.readlines()):
                name = '_'.join(name.strip().split())
                c_type = c_type.strip()
                # skip blank lines and complexes without a matching pdb file
                if name and c_type and name in pdb_names:
                    class_d[c_type].append(name)

    return class_d.values()
class FingerPrint_60(OrderedDefaultDict):
    """Per-residue, 60-slot count fingerprint for an antigen/antibody pair.

    The mapping goes from a residue to a list of 60 integer counts.
    Slot ``group * 6 + prop`` is incremented from ``nearby_reses_in_antigen``
    and slot ``30 + group * 6 + prop`` from ``nearby_reses_in_antibody``,
    where ``group`` is a distance-group index and ``prop`` a key of
    ``fp_rule``.

    Residue objects are expected to expose ``resnum``, ``pdbres`` and an
    iterable ``atom`` whose items carry ``xyz``; ``antigen`` and ``antibody``
    are expected to expose an iterable ``residue``.
    """

    def __init__(self , antigen , antibody):
        """Store the two structures and prepare empty tables and caches."""
        # default value for a missing key: a new list of 60 zeros
        OrderedDefaultDict.__init__(self, lambda: [0] * 60)

        self.antigen = antigen
        self.antibody = antibody
        # pairwise "nearby" cache; not consulted by the current distance test
        self.nearby_relation = defaultdict(dict)

        # residue -> {distance group: [neighbouring residues]}
        self.nearby_reses_in_antigen = OrderedDefaultDict(lambda: defaultdict(list))
        self.nearby_reses_in_antibody = OrderedDefaultDict(lambda: defaultdict(list))

        # property id -> residue codes carrying that property
        # NOTE(review): GLN is listed in no group, while GLU shows up in
        # groups 0, 2 and 3 -- possibly a typo for GLN somewhere; confirm.
        self.fp_rule = {
            0: ['TYR', 'ASN', 'GLU', 'SER', 'CYS', 'THR', 'GLY'],          # polar
            1: ['PHE', 'LEU', 'ILE', 'TRP', 'VAL', 'MET', 'PRO', 'ALA'],   # hydrop
            2: ['ARG', 'ASP', 'GLU', 'LYS', 'HIS'],                        # charged
            3: ['ALA', 'VAL', 'LEU', 'ILE', 'MET', 'ASN', 'GLU', 'LYS',
                'ARG', 'GLY', 'SER', 'THR', 'CYS', 'ASP', 'PHE'],          # lipids
            4: ['PHE', 'TYR', 'TRP'],                                      # aromatic
            5: ['PRO', 'HIS'],                                             # heterocyclic
        }

        # Inverse of fp_rule (residue code -> property ids), built once so
        # the fingerprint loops do a direct lookup per residue.
        self.res_prop_ids = defaultdict(list)
        print("initializing FingerPrint_60 object")
        for prop_id, residues in self.fp_rule.items():
            for res_code in residues:
                self.res_prop_ids[res_code].append(prop_id)

        self.atom_dist_cutoff = 4.0

        # resnum -> {resnum: distance group}
        self.dist_group_cache = defaultdict(dict)

    def residue_nearby_enough(self, res1, res2):
        """Return True when the two residues' centroid distance (see
        ``_res_distance``) does not exceed ``atom_dist_cutoff``."""
        return self._res_distance(res1, res2) <= self.atom_dist_cutoff

    def _res_distance(self, res1, res2):
        """Return the Euclidean distance between the residues' centroids.

        A centroid is the average of the ``xyz`` of every atom in the
        residue. Assumes each ``xyz`` is a flat coordinate sequence, so the
        centroid difference is one-dimensional -- TODO confirm.
        """
        centroid1 = np.average([atom.xyz for atom in res1.atom], axis=0)
        centroid2 = np.average([atom.xyz for atom in res2.atom], axis=0)
        diff = centroid1 - centroid2
        return np.sqrt(np.dot(diff, diff))

    def _get_dist_group(self, dist, bound_list = (4. , 8. , 12. , 16. , 20.)):
        """Return the index of the first upper bound that ``dist`` fits under.

        :param dist: distance to classify.
        :param bound_list: ascending upper bounds, one per group.
        :returns: the group index, or -1 when ``dist`` exceeds every bound.
        """
        for level, upper_bound in enumerate(bound_list):
            if dist <= upper_bound:
                return level
        # not in the surrounding
        return -1

    def _init_workers(self, w_count):
        """Create and start ``w_count`` GroupingWorker objects sharing one
        task queue (stored on ``self.workers`` / ``self.task_queue``)."""
        self.workers = []
        self.task_queue = Queue()
        for _ in range(w_count):
            worker = GroupingWorker(self.task_queue)
            self.workers.append(worker)
            worker.start()

    def _is_dist_group_cached(self, res1, res2):
        """Return True if a distance group is cached for this resnum pair."""
        # .get avoids inserting an empty entry into the defaultdict on a miss
        return res2.resnum in self.dist_group_cache.get(res1.resnum, ())

    def _get_dist_group_from_cache(self, res1, res2):
        """Return the cached distance group for this resnum pair."""
        return self.dist_group_cache[res1.resnum][res2.resnum]

    def _cache_dist_group(self, res1, res2, dist_group):
        """Cache ``dist_group`` for the pair, in both directions."""
        self.dist_group_cache[res1.resnum][res2.resnum] = dist_group
        self.dist_group_cache[res2.resnum][res1.resnum] = dist_group

    def grouping_residue_by_distance(self):
        """Group, for every antigen residue, its nearby residues by distance.

        Every antigen residue is compared with every residue of the antigen
        and the antibody (pairs with equal ``resnum`` are skipped). Pairs
        whose centroid distance is within ``atom_dist_cutoff`` are appended
        to ``nearby_reses_in_antigen[res1][group]`` and their group cached.

        NOTE(review): neighbours coming from the antibody are also stored in
        ``nearby_reses_in_antigen``; ``nearby_reses_in_antibody`` is never
        filled here -- confirm whether that is intended.
        NOTE(review): the cache is keyed by ``resnum`` only, so an antigen
        and an antibody residue sharing a number share a cache entry.
        NOTE(review): with the default cutoff (4.0) equal to the first
        default bound, every stored pair lands in group 0.
        """
        print("grouping antigen side,total count: %d"     %(len(self.antigen.residue)))

        hit_count = 0
        miss_count = 0
        for count, res1 in enumerate(self.antigen.residue, 1):
            for res2 in chain(self.antigen.residue, self.antibody.residue):
                # compare by value: identity of int objects is not reliable
                if res1.resnum == res2.resnum:
                    continue
                if self._is_dist_group_cached(res1, res2):
                    # already computed for this pair: reuse it
                    print("hit")
                    dist_group = self._get_dist_group_from_cache(res1, res2)
                    print("%s %s" % (res1.resnum, res2.resnum))
                    self.nearby_reses_in_antigen[res1][dist_group].append(res2)
                    hit_count += 1
                else:
                    # compute the distance once and use it both for the
                    # cutoff test and for the group lookup
                    dist = self._res_distance(res1, res2)
                    if dist <= self.atom_dist_cutoff:
                        dist_group = self._get_dist_group(dist)
                        self.nearby_reses_in_antigen[res1][dist_group].append(res2)
                        self._cache_dist_group(res1, res2, dist_group)
                        miss_count += 1
            print(count)
        print("%s %s" % (hit_count, miss_count))

    def _count_properties(self, nearby_reses, offset):
        """Add property counts of the grouped neighbours into the fingerprint,
        starting at slot ``offset``."""
        for res, groups in nearby_reses.items():
            for group_index, residues in groups.items():
                for residue in residues:
                    for prop_id in self.res_prop_ids[residue.pdbres.strip().upper()]:
                        # increment the count of the property at its slot
                        self[res][offset + group_index * 6 + prop_id] += 1

    def get_fingerprint(self):
        """Compute the fingerprint if this mapping is still empty and return it.

        :returns: ``self`` on every call, whether freshly computed or not.
        """
        if not self:  # not computed yet
            print("grouping by distance")
            self.grouping_residue_by_distance()
            print("fisrt 30 bits started")
            self._count_properties(self.nearby_reses_in_antigen, 0)
            print("second 30 bits started")
            self._count_properties(self.nearby_reses_in_antibody, 30)
        return self

    def display_fingerprint(self , start = None , end = None):
        """Print a header row and one row of counts per residue.

        :param start: optional first slot to show; a falsy value means
            "from the beginning".
        :param end: optional slot bound (exclusive); a falsy value means
            "to the end".
        """
        # the header labels 30 columns: 5 groups x 6 properties
        print("%s%s" %(' ' * 11 , " ".join("%dp%d" %(g,p) for g in range(5) for p in range(6))))
        # falsy bounds (None or 0) are treated as "unbounded"
        lo = start or None
        hi = end or None
        for residue, fp in self.items():
            print("%8d : %s" %(residue.resnum , ' '.join("%2d " %count for count in fp[lo:hi])))

    def display_group_info(self):
        """Print, for both neighbour tables, each residue's groups as
        ``group:count(CODE(prop ids) ...)`` on one line per residue."""
        def _format_residue(res):
            # e.g. "TYR(0,4)": residue code followed by its property ids
            code = res.pdbres.strip().upper()
            return "%s(%s)" % (code, ','.join('%d' % prop_id for prop_id in self.res_prop_ids[code]))

        def _display_group_info(nearby_reses):
            for residue, groups in nearby_reses:
                parts = ["%8d" % residue.resnum]
                for group_index, residues in groups.items():
                    parts.append("%d:%d(%s)" % (
                        group_index,
                        len(residues),
                        ' '.join(_format_residue(res) for res in residues),
                    ))
                # one space-separated line per residue
                print(' '.join(parts))

        print("antigen part(first 30 bit)")
        _display_group_info(self.nearby_reses_in_antigen.items())
        print("antibody part(30 ~ 60 bit)")
        _display_group_info(self.nearby_reses_in_antibody.items())