def cull_main(similarities, thresholdPercentage, representativeChains, adjType, representativesReverse=None):
    """Perform the PDB redundancy removal.

    @param similarities: A record of the percentage sequence identity between the chains/entries up for culling.
    @type similarities : dictionary or file name (string)
    @param thresholdPercentage: The maximum permissible percentage sequence identity that any two chains/entries
                                may possess.
    @type thresholdPercentage : float
    @param representativeChains: Forwarded unchanged to the adjacency list builder.
                                 NOTE(review): presumably the names of the chains/entries that make up the
                                 similarity graph - confirm against adjlistcreation.
    @param adjType: 'chain' to build the graph with adjlistcreation.pdb_chain_main, or 'entry' to build it with
                    adjlistcreation.pdb_entry_main.
    @type adjType : string
    @param representativesReverse: Extra argument forwarded to adjlistcreation.pdb_entry_main. Only used when
                                   adjType is 'entry'. An empty dictionary is used when omitted.
    @type representativesReverse : dictionary
    return @type: list
    return @use:  The proteins chosen for removal. An empty list when the graph builder reports no protein names;
                  otherwise the first value returned by Leafcull.main.
    @raise ValueError: If adjType is neither 'chain' nor 'entry'.

    """

    # Reject an unknown culling type up front. Without this check neither branch below would bind
    # adjacent/proteinNames and the function would fail later with an unhelpful UnboundLocalError.
    if adjType not in ('chain', 'entry'):
        raise ValueError("adjType must be 'chain' or 'entry', got %r" % (adjType,))

    # A fresh dictionary per call, so that a default shared between calls can never be mutated downstream.
    if representativesReverse is None:
        representativesReverse = {}

    # Create the sparsematrix of the protein similarity graph.
    if adjType == 'chain':
        adjacent, proteinNames = adjlistcreation.pdb_chain_main(similarities, thresholdPercentage,
                                                                representativeChains)
    else:
        adjacent, proteinNames = adjlistcreation.pdb_entry_main(similarities, thresholdPercentage,
                                                                representativeChains, representativesReverse)

    # Choose which proteins to remove from the similarity graph.
    if proteinNames == []:
        # This is True if there are no similarities greater than the given percentage sequence identity. If there
        # are no chains that are too similar, then there is no need to cull any chains from the network.
        return []

    # Leafcull.main returns a pair; only the first element (the proteins to remove) is needed here.
    proteinsToCull = Leafcull.main(adjacent, proteinNames)[0]
    return proteinsToCull
def cull_main(
    similarities,
    thresholdPercentage,
    representativeChains,
    adjType,
    representativesReverse=None,
    verboseOutput=False,
    startTime=0,
):
    """Perform the PDB redundancy removal.

    @param similarities: A record of the percentage sequence identity between the chains/entries up for culling.
    @type similarities : file name (string)
    @param thresholdPercentage: The maximum permissible percentage sequence identity that any two chains/entries
                                may possess.
    @type thresholdPercentage : float
    @param representativeChains: The names of the chains/entries that will compose the protein similarity graph.
    @type representativeChains : set
    @param adjType: 'chain' or 'entry' indicating the type of culling to be performed.
    @type adjType : string
    @param representativesReverse: A mapping of representative chains to the chains that they represent. Only
                                   used when adjType is 'entry'. An empty dictionary is used when omitted.
    @type representativesReverse : dictionary
    @param verboseOutput: Whether status updates should be printed out to the user.
    @type verboseOutput : boolean
    @param startTime: The time when the culling began, on the same clock as time.time(). Used to output the
                      elapsed time in the status updates.
    @type startTime : number
    return @type: list
    return @use:  The redundant proteins to be removed from the dataset. An empty list when the graph builder
                  reports no protein names; otherwise the first value returned by Leafcull.main.
    @raise ValueError: If adjType is neither 'chain' nor 'entry'.

    """

    def report(message):
        """Print a status message followed by the elapsed time, if verbose output was requested."""
        if verboseOutput:
            # One pre-formatted argument keeps the text identical to the old print statement (note the two
            # spaces after the colon) while also being valid under the print function.
            print("%s Time elapsed:  %s" % (message, time.time() - startTime))

    # Reject an unknown culling type up front. Without this check neither branch below would bind
    # adjacent/proteinNames and the function would fail later with an unhelpful UnboundLocalError.
    if adjType not in ("chain", "entry"):
        raise ValueError("adjType must be 'chain' or 'entry', got %r" % (adjType,))

    # A fresh dictionary per call, so that a default shared between calls can never be mutated downstream.
    if representativesReverse is None:
        representativesReverse = {}

    # Create the sparsematrix of the protein similarity graph.
    report("Creating the adjacency matrix.")
    if adjType == "chain":
        adjacent, proteinNames = adjlistcreation.pdb_chain_main(similarities, thresholdPercentage, representativeChains)
    else:
        adjacent, proteinNames = adjlistcreation.pdb_entry_main(
            similarities, thresholdPercentage, representativeChains, representativesReverse
        )

    # Choose which proteins to remove from the similarity graph.
    if proteinNames == []:
        # This is True if there are no similarities greater than the given percentage sequence identity. If there
        # are no chains that are too similar, then there is no need to cull any chains from the network.
        report("No similarities found. Culling not needed.")
        proteinsToCull = []
    else:
        report("Performing the culling.")
        # Leafcull.main returns a pair; only the first element (the proteins to remove) is needed here.
        proteinsToCull = Leafcull.main(adjacent, proteinNames)[0]

    report("Culling finished.")
    return proteinsToCull