def main(DIMACSFile):
    """Take a file in DIMACS format and turn it into a sparse matrix format.

    Lines starting with 'c' are skipped. The line starting with 'p' supplies the number of nodes as
    its third whitespace-separated field, and every line starting with 'e' supplies one edge as a
    pair of 1-based node numbers. Each edge is handed to the sparse matrix in both directions.

    @param DIMACSFile: The location of a file in DIMACS format.
    @type DIMACSFile : string
    @return: The sparsematrix representation of the DIMACS format graph.
    @rtype: sparsematrix
    @raise IOError: If the file can not be opened.
    @raise ValueError: If a node count or node number is not an integer.
    @raise IndexError: If a 'p' or 'e' line has fewer than three fields.
    """

    fromNodes = []
    toNodes = []
    numNodes = 0

    # Reading inside a with block guarantees the file is closed even when a line fails to parse.
    with open(DIMACSFile, 'r') as readIn:
        for line in readIn:
            if line.startswith('p'):
                numNodes = int(line.split()[2])
            elif line.startswith('e'):
                chunks = line.split()
                # DIMACS numbers the nodes from 1, so shift them down to 0-based indices.
                fromNodes.append(int(chunks[1]) - 1)
                toNodes.append(int(chunks[2]) - 1)
            # Comment lines ('c') and any other kind of line carry nothing that is needed here.

    adjacent = sparsematrix.sparse_matrix(numNodes)
    # Record each edge in both directions.
    adjacent.addlist(fromNodes, toNodes)
    adjacent.addlist(toNodes, fromNodes)

    return adjacent
def main(similarities, cutoffPercent=20, maxEValue=1, minAlignLength=20):
    """Create a sparse matrix from the processed PSI-BLAST output.

    Each line of the input is split on whitespace. Lines with 12 fields hold the hit in field 4 and
    the percentage similarity in field 9; lines with 13 fields hold them in fields 5 and 10. The
    query is always field 0. Lines with any other number of fields are skipped. A pair is recorded
    as too similar when the query and hit differ and the percentage is > cutoffPercent.

    NOTE(review): maxEValue and minAlignLength are accepted but are not applied by this function.
    The fields that hold the E value and the alignment length can not be determined from this
    function alone, so the filtering they describe still needs to be added - TODO confirm columns.

    @param similarities: The location of the processed PSI-BLAST output.
    @type similarities: string
    @param cutoffPercent: A percentage similarity > this parameter is deemed to be too similar.
    @type cutoffPercent: integer
    @param maxEValue: The maximum permissible value for the E value of an alignment (currently not applied).
    @type maxEValue: float
    @param minAlignLength: The number of amino acids aligned in the query and the hit sequence must be >= this value
                           for the percentage similarity to be deemed significant (currently not applied).
    @type minAlignLength: integer
    @return: The sparse matrix of too-similar pairs, and the sorted list of the protein names involved. The position
             of a name in the list is the index used for that protein in the matrix.
    @rtype: tuple (sparsematrix, list of strings)
    """

    # Positions of the (hit, percentage) fields, keyed by the number of fields on the line.
    fieldLayouts = {12: (4, 9), 13: (5, 10)}

    proteinNames = set()  # Store the names of all the proteins found to be too similar to another protein
    similarProteins = set()  # Store the pairs that are too similar

    # Reading inside a with block guarantees the file is closed even when a line fails to parse.
    with open(similarities, 'r') as readSimilarities:
        for line in readSimilarities:
            chunks = line.split()
            layout = fieldLayouts.get(len(chunks))
            if layout is None:
                # Not a recognised record (blank line, header, truncated line). Skipping it avoids reusing the
                # values of the previous line, or failing outright when it is the first line of the file.
                continue
            hitField, percentageField = layout
            query = chunks[0]
            hit = chunks[hitField]

            # Ignore similarities where the query and the hit are the same, or the percentage similarity
            # is <= cutoffPercent.
            if query == hit or float(chunks[percentageField]) <= cutoffPercent:
                continue

            # The similarity is valid, so record the proteins as being too similar. Sorting the pair makes
            # (a, b) and (b, a) collapse to a single entry.
            proteinNames.add(query)
            proteinNames.add(hit)
            similarProteins.add(tuple(sorted((query, hit))))

    proteinNames = sorted(proteinNames)
    similarProteins = list(similarProteins)
    indexDict = dict((name, index) for index, name in enumerate(proteinNames))

    # Create the sparse matrix
    adjacent = sparsematrix.sparse_matrix(len(proteinNames))
    xValues = [indexDict[x] for (x, y) in similarProteins]
    yValues = [indexDict[y] for (x, y) in similarProteins]
    # Record each pair in both directions.
    adjacent.addlist(xValues, yValues)
    adjacent.addlist(yValues, xValues)

    return adjacent, proteinNames
def main(i, locForResults, dirPISCES, duplicates, resolutionData, fastaSequences, alignmentFile, minLength=20, maxLength=10000):
    """Compares the non-redundant dataset produced by Leaf under given parameters, with the one generated by PISCES.

    One tab-separated line is appended to locForResults, holding: i, the number of proteins kept by PISCES, the
    number kept by Leaf, the percentage improvement of Leaf over PISCES, and the number of nodes, maximum degree
    and mean degree of the similarity graph that was handed to Leaf.

    @param i: The gzipped list of non-redundant proteins generated by PISCES for the given parameters. Only the
              name of the file is used; the culling parameters are parsed from it.
    @type i : string (file location)
    @param locForResults: The location where the results of the comparison should be written out.
    @type locForResults : string (file location)
    @param dirPISCES: The location of the directory that contains all of the culled lists generated by PISCES for all
                      of the parameters. Not used by this function.
    @type dirPISCES : string (directory location)
    @param duplicates: The location of the file containing the information about representative PDB chains (as determined by PISCES).
    @type duplicates : string (file location)
    @param resolutionData: The location of the file containing information about resolution and R Value for the chains.
    @type resolutionData : string (file location)
    @param fastaSequences: The file containing all the chains in the PDB in FASTA format.
    @type fastaSequences : string (file location)
    @param alignmentFile: The file containing the alignments for all the representative chains in the PDB (as calculated by PISCES).
    @type alignmentFile : string (file location)
    @param minLength: The minimum length that a protein chain can have (in terms of number of amino acids).
    @type minLength : integer
    @param maxLength: The maximum length that a protein chain can have (in terms of number of amino acids).
    @type maxLength : integer
    @raise ZeroDivisionError: If the file name says that PISCES kept zero proteins.
    """

    import os  # Imported here so that the function does not depend on the imports at the top of the file.

    # The name of the PISCES file gives information such as the number of proteins in the non-redundant dataset,
    # resolution and R Value. Only the final path component is parsed, so that underscores in directory names
    # can not shift the fields. Each field carries a fixed-width prefix that is sliced off
    # (presumably 'pc', 'res', 'R' and 'chains', with a '.gz' suffix on the last field - TODO confirm).
    chunks = os.path.basename(i).split('_')
    numberPISCESProteinsKept = int(chunks[-1][6:-3])  # The number of proteins kept by PISCES.
    seqIdenThreshold = int(chunks[1][2:])  # The percentage cutoff used.
    resolutionLimit = float(chunks[2][3:])  # The maximum resolution permitted.
    RFactorLimit = float(chunks[3][1:])  # The maximum R Value permitted.

    # Determine whether non-XRay structures were allowed, and whether structures containing only alpha carbons
    # were allowed. Seven fields means non-XRay only, eight means both, anything else means neither.
    inclNotXray = len(chunks) in (7, 8)
    inclCAOnly = len(chunks) == 8

    # Extract resolution, alignment, representative and length information for all of the PDB chains.
    resolutionInfo = process_resolution(resolutionData)
    alignments = process_alignments(alignmentFile, seqIdenThreshold)
    redundant = process_redundant(duplicates)
    lengths = process_fasta(fastaSequences)

    # Written so that the output is the same under the Python 2 print statement and the Python 3 print function.
    print('Now working on file:  %s' % i)

    # Go through the chains and select the identifiers of the proteins that meet the structural requirements
    # (resolution, R Value etc.).
    validProteins = []
    for p in lengths:
        info = resolutionInfo[p]
        proteinResolution = info['Resolution']
        proteinRFactor = info['RFactor']
        isXray = info['Experiment'] == 'XRAY'

        if not inclNotXray:
            # Non-Xray structures are not being included, so skip any structure that was not determined by Xray
            # or that has no known resolution.
            if not isXray or proteinResolution == 'NA':
                continue
        elif not isXray:
            # Non-Xray structures are being included, and their resolution and R Value are not held against them.
            proteinResolution = 'NA'
            proteinRFactor = 'NA'

        if not inclCAOnly and info['CAOnly'] == 'yes':
            # The structure is only CA, and that type of structure is not being considered.
            continue
        if proteinResolution != 'NA' and float(proteinResolution) > resolutionLimit:
            continue
        if lengths[p] < minLength or lengths[p] > maxLength:
            continue
        if proteinRFactor == 'NA':
            # An unknown R Value is only acceptable when non-Xray structures are being included.
            if not inclNotXray:
                continue
        elif float(proteinRFactor) > RFactorLimit:
            continue

        validProteins.append(p)

    # Free up memory used by the resolution and length dicts.
    del resolutionInfo
    del lengths

    # Go through the list of proteins that meet the structural requirements and replace every one that has a
    # representative sequence by that representative. Using a set means a representative is only kept once.
    nonRedundantProteins = set(redundant.get(p, p) for p in validProteins)

    # Free up memory used by the redundant dict.
    del redundant

    # Go through the alignments and record all alignments between pairs of proteins that remain.
    proteinNames = set()
    similarProteins = set()
    for p in nonRedundantProteins:
        if p not in alignments:
            continue
        for h in alignments[p]:
            if h in nonRedundantProteins and alignments[p][h] > seqIdenThreshold:
                # The two proteins are too similar, so record this. Sorting the pair makes (p, h) and (h, p)
                # collapse to a single entry.
                proteinNames.add(p)
                proteinNames.add(h)
                similarProteins.add(tuple(sorted([p, h])))

    # Free up memory used by the alignments dict.
    del alignments

    proteinNames = sorted(proteinNames)
    similarProteins = list(similarProteins)
    indexDict = dict((name, index) for index, name in enumerate(proteinNames))

    # Create the sparse matrix, recording each pair in both directions.
    adjacent = sparsematrix.sparse_matrix(len(proteinNames))
    xValues = [indexDict[x] for (x, y) in similarProteins]
    yValues = [indexDict[y] for (x, y) in similarProteins]
    adjacent.addlist(xValues, yValues)
    adjacent.addlist(yValues, xValues)

    # Summarise the similarity graph for the results file. These values were previously written out without ever
    # being assigned.
    # NOTE(review): they are taken to describe the graph handed to Leaf - confirm this is the intended meaning.
    degrees = dict.fromkeys(proteinNames, 0)
    for x, y in similarProteins:
        degrees[x] += 1
        if y != x:
            degrees[y] += 1
    numOfNodes = len(proteinNames)
    maxDegree = max(degrees.values()) if degrees else 0
    meanDegree = float(sum(degrees.values())) / numOfNodes if numOfNodes else 0.0

    # Perform the culling by Leaf. Only the removed proteins are needed here.
    removedLeaf, _proteinsToKeep, _removeNode, _nodesToKeep, _timeTaken = CMLeaf.main(adjacent, proteinNames)

    # Calculate and record the improvement of Leaf over PISCES. Proteins with no recorded similarities never
    # reach Leaf, so they are all kept.
    numberLeafProteinsKept = len(nonRedundantProteins) - len(removedLeaf)
    percentageImprovementLeaf = (float(numberLeafProteinsKept - numberPISCESProteinsKept) / numberPISCESProteinsKept) * 100

    record = [i, numberPISCESProteinsKept, numberLeafProteinsKept, percentageImprovementLeaf,
              numOfNodes, maxDegree, meanDegree]
    # The results file is only opened once there is something to write, and is closed even if the write fails.
    with open(locForResults, 'a') as resultsFile:
        resultsFile.write('\t'.join(str(field) for field in record) + '\n')