def __init__(self, Directory, DerivedoI, PDBoI):
    """
    Builds every figure for one derived node / PDB structure pair.

    Reads Report.xml, ModdedTree.nwk and ScoringMatrix.xml from Directory,
    renders the tree-and-states figure and the alignment figure to PNG with
    cairosvg, writes the coloured structure to Structure.pdb and assembles the
    SVG elements of the combined figure in self.TotalElement_L.

    Arguments:
        Directory (String): directory holding the input files; a trailing "/" is appended when missing
        DerivedoI (String): derived node of interest
        PDBoI (String): PDB structure of interest

    Class attributes:
        Figures_L (List): list of all the figure types that will be created
        FigureSVG_D (Dict): key is the type of figure, value is the list of SVG syntax lines that will draw the figure
        DerivedoInterest (String): Derived node of interest that the figure will be based on
        PDBoInterest (String): PDB structure that the derived node sequence aligned to and achieved a significant hit on
        OutputDirectory (String): "<Directory>Figures/<DerivedoI>-<PDBoI>/", created when it does not exist
    """
    #initial setup of what figures will be created
    self.Figures_L = [
        "TreeAndStates",
        "Alignment",
        "Structurecartoon",
        "Structuresurface",
    ]
    self.FigureSVG_D = {Key: [] for Key in self.Figures_L}

    #every path below is built by plain concatenation, so force a trailing slash
    self.Directory = Directory
    if not self.Directory.endswith("/"):
        self.Directory = self.Directory + "/"
    self.DerivedoInterest = DerivedoI
    self.PDBoInterest = PDBoI
    #a parenthesised single value prints the same text under the Python 2 print statement
    print(self.Directory)
    print(self.DerivedoInterest)
    print(self.PDBoInterest)

    #output directory where files will be written
    self.OutputDirectory = "%sFigures/%s-%s/" % (
        self.Directory, self.DerivedoInterest, self.PDBoInterest)
    if not os.path.exists(self.OutputDirectory):
        #os.makedirs also creates the intermediate "Figures/" directory and
        #does not pass the path through a shell
        os.makedirs(self.OutputDirectory)

    #paths to relevant input files
    self.ReportPATH = self.Directory + "Report.xml"
    self.TreePATH = self.Directory + "ModdedTree.nwk"
    self.MatrixPATH = self.Directory + "ScoringMatrix.xml"

    #parses the report file for sequences and branch relationships
    #(the report is read once and the handle is closed)
    with open(self.ReportPATH, "r") as ReportFile:
        ReportText = ReportFile.read()
    HeaderRegex = re.compile("<H>(.+?)</H>")
    SequenceRegex = re.compile("<S>(.+?)</S>")
    BranchNameRegex = re.compile("<Branch_name>(.+?)</Branch_name>")
    self.NodeToSeq_D = {
        HeaderRegex.search(Seq).group(1): SequenceRegex.search(Seq).group(1)
        for Seq in re.findall("<Seq>.+?</Seq>", ReportText)
    }
    self.BranchToAlgorithm_D = {
        BranchNameRegex.search(Branch).group(1): ScopeAlgorithm(Branch)
        for Branch in re.findall("<Branch>.+?</Branch>", ReportText, re.DOTALL)
    }
    self.RectCount = 0

    #dimensions
    self.TreeFigWIDTH = 750
    self.TreeFigHEIGHT = 500
    self.TreeFigXOffset = 25
    self.TreeFigYOffset = 50

    #loads and parses tree, gets evolutionary distances for proper branch lengths
    self.CogentTree = LoadTree(self.TreePATH)
    self.FastMLTree = FastMLTree(self.TreePATH, False)
    self.FastMLTree.setBranchLengths()
    self.LongestDistance = self.getLongestEvoDistance()
    self.EvoDistance_D = {
        Key: self.getEvoDistance(Key)
        for Key in self.NodeToSeq_D
        if Key != self.FastMLTree.TopKey
    }
    #the top node is given distance zero instead of being computed
    self.EvoDistance_D[self.FastMLTree.TopKey] = 0.0
    self.ModdedEvoDistance_D = self.modEvoDistance()
    self.TreeCoords_D = self.setTreeCoords()

    #finds the leaf whose label reaches furthest right (x coordinate plus 12 per character of its name)
    FurthestPosition = 0.0
    FurthestClade = ""
    for Key in self.FastMLTree.LeafKey_L:
        Val = self.TreeCoords_D[Key][0] + (12 * len(Key))
        if Val > FurthestPosition:
            FurthestPosition = Val
            FurthestClade = Key

    #branch whose part after ">>" is the derived node of interest (last match wins)
    self.BranchoInterest = ""
    for Key in self.FastMLTree.BranchKey_L:
        if Key.split(">>")[1] == self.DerivedoInterest:
            self.BranchoInterest = Key

    #gets all relevant information for the states portion of the figure
    BranchAlgorithm = self.BranchToAlgorithm_D[self.BranchoInterest]
    #the mutation keys are shifted by one to index into the sequences
    self.StateIndices_L = [
        int(X) - 1
        for X in BranchAlgorithm.getAllMutationXMLKeysAccordingToAccession(
            self.PDBoInterest)
    ]
    self.LeafStates_D = {
        Key: [self.NodeToSeq_D[Key][state] for state in self.StateIndices_L]
        for Key in self.FastMLTree.LeafKey_L
    }
    self.StateColour_D = self.getStateToHex()
    self.StateInc = 25.0
    self.StateFigHEIGHT = 500
    self.StateFigWIDTH = self.StateInc * (len(self.StateIndices_L)) + 50
    self.StateFigXOffset = self.TreeFigXOffset + self.TreeFigWIDTH + (
        12 * len(FurthestClade)) + 25
    self.StateFigYOffset = 50

    #creates the states and tree figure
    self.FigureSVG_D["TreeAndStates"].append(
        self.getSVGHeader(
            self.TreeFigHEIGHT + (self.TreeFigYOffset * 2),
            self.StateFigXOffset + self.StateFigWIDTH + self.TreeFigXOffset))
    self.makeTreeFig()
    self.makeStatesFig()
    self.FigureSVG_D["TreeAndStates"].append("</svg>")
    self.TreeAndStatesFOutPATH = self.OutputDirectory + "TreeAndStates.png"
    #PNG data is binary, so the file is opened in "wb" mode
    with open(self.TreeAndStatesFOutPATH, "wb") as TreeStateFOut:
        cairosvg.svg2png(
            bytestring="\n".join(self.FigureSVG_D["TreeAndStates"]),
            write_to=TreeStateFOut)

    #longest leaf name, used to size the alignment figure
    LongestCladeName = ""
    for Key in self.FastMLTree.LeafKey_L:
        if len(Key) > len(LongestCladeName):
            LongestCladeName = Key

    #gets all relevant information for the alignment cartoon portion of the figure
    self.MatrixInfo = self.parseScoringMatrix()
    self.AlnInc = 11.0
    self.AlignmentFigWIDTH = self.AlnInc * len(
        self.MatrixInfo["Sseq"]) + self.AlnInc + (8 * len(LongestCladeName))
    self.AlignmentFigHEIGHT = self.AlnInc * (
        len(self.FastMLTree.LeafKey_L) + 1) + self.AlnInc
    self.AlignmentFigXOffset = self.AlnInc
    self.AlignmentFigYOffset = self.AlnInc
    self.FigureSVG_D["Alignment"].append(
        self.getSVGHeader(self.AlignmentFigHEIGHT, self.AlignmentFigWIDTH))
    self.makeAlignmentFig()
    self.FigureSVG_D["Alignment"].append("</svg>")
    self.AlignmentFOutPATH = self.OutputDirectory + "Alignment.png"
    with open(self.AlignmentFOutPATH, "wb") as AlignmentFOut:
        cairosvg.svg2png(
            bytestring="\n".join(self.FigureSVG_D["Alignment"]),
            write_to=AlignmentFOut)

    #relevant information for the structure file in PDB format
    self.ColouredStructureFile = self.getColoredStructureFile()
    self.StructureFOutPATH = self.OutputDirectory + "Structure.pdb"
    with open(self.StructureFOutPATH, "w") as StructureFOut:
        StructureFOut.write(self.ColouredStructureFile.read())

    #combined figure: the tree/states PNG on top, the alignment PNG below it
    self.TotalFigWIDTH = 1000
    self.TotalFigHEIGHT = 600
    self.TotalElement_L = [
        self.getSVGHeader(self.TotalFigHEIGHT, self.TotalFigWIDTH)
    ]
    self.TotalElement_L.append(
        '''\t<image x="0" y="0" width="1000" height="500" xlink:href="file://%s"/>'''
        % (self.TreeAndStatesFOutPATH))
    self.TotalElement_L.append(
        '''\t<image x="0" y="500" width="1000" height="100" xlink:href="file://%s"/>'''
        % (self.AlignmentFOutPATH))
    self.TotalElement_L.append("</svg>")
def __init__(self , Directory , DataDIR):
    """
    Loads every input for a protein family, builds the random distributions,
    computes the P-Values for all branch segments and writes them out
    (the constructor ends by calling self.output()).

    Arguments:
        Directory (String): Directory to read in report files and output the final PValue file; one trailing "/" is removed
        DataDIR (String): Directory where main program is held; concatenated directly with "misc/...", so it needs its trailing "/"

    Class attributes:
        DataDIR (String): Directory where main program is held
        Directory (String): Directory to read in report files and output the final PValue file
        ProteinFamilyName (String): Protein family descriptor to use in random distribution file generation (last path component of Directory)
        ScopeXMLFile (String): Path to report (mutation mapping) file
        ModdedTreeFile (String): Path to newick syntax tree file with branch names according to first module
        ScoringMatrixXMLFile (String): Path to XML format file of all scoring keys (to be used in random distributions)
        PDBToEvalue_D (Dict): result of self.getPDBToEvalue_D()
        DistributionPath (String): Path to the random distribution directory (not assigned in this constructor)
        HydroPATH (String): Path to hydropathyindex file
        MassPATH (String): Path to sidechainmass file
        Hydro_D (Dict): Key is one letter AA code, Value is its hydropathy index
        Mass_D (Dict): Key is one letter AA code, Value is its side chain mass value
        Tree (FastMLTree obj): Tree object with the renamed branches
        NodeSequenceKey_L (List): List containing the names of all nodes
        NodeToSequence_D (Dict): Key is the node name, Value is the ancestral or extant sequence at that node
        BranchToAlgorithm_D (Dict): Key is the Branch key name, Value is a ScopeAlgorithm instance for that branch segment
        PDBContents_D (Dict): Key is PDB ID, value is dictionaries of all atom and residue information for that PDB file
        PDBXMLContents_D (Dict): Key is PDB ID, value is dictionaries of all atom and residue information for that PDBXML file
        ScoringMatrixCoverageKeys_D (Dict): Key is PDB ID, value is list of chain and position keys that correspond to successfully aligned regions
        ScoringMatrixPDBXMLMatchedKeys_D (Dict): result of self.getScoringMatrixPDBXMLMatchedKeys_D()
        AccsToMutationCount_D (Dict): Key is PDB ID, value is a list of integers for all branch segments with those number of mutations
        AccsToDistanceCount_D (Dict): Key is PDB ID, value is a list of integers for all branch segments with those number of mutations that can be joined by pairwise distances (currently not set, the call is commented out)
        MassChanges_L (List): all mass change calculations that have happened anywhere on the tree
        HydroChanges_L (List): all hydropathy index change calculations that have happened anywhere on the tree
        RandomDistributions_D (Dict): Dictionary structure pointing to various number arrays based on the PDB ID used and the number of items drawn before averaging
        BranchToPValues_D (Dict): Dictionary structure pointing to the four P-Values for an ancestral, derived, PDB alignment triad
    """
    def readValueTable(TablePATH):
        #reads a whitespace separated "key value" table into a dictionary of floats;
        #the file handle is closed and empty lines are skipped
        with open(TablePATH , "r") as TableFile:
            SplitLines = (Line.split() for Line in TableFile)
            return {Fields[0] : float(Fields[1]) for Fields in SplitLines if Fields}

    self.DataDIR = DataDIR

    #gets input/output directory and protein family name
    self.Directory = Directory
    if self.Directory.endswith("/"):
        self.Directory = self.Directory[:-1]
    #last path component; this is the whole string when it contains no "/"
    self.ProteinFamilyName = self.Directory.split("/")[-1]

    #gets all path information for the relevant input files
    self.ScopeXMLFile = self.Directory+"/"+"Report.xml"
    self.ModdedTreeFile = self.Directory+"/"+"ModdedTree.nwk"
    self.ScoringMatrixXMLFile = self.Directory+"/"+"ScoringMatrix.xml"
    self.PDBToEvalue_D = self.getPDBToEvalue_D()

    #paths to more input files
    self.HydroPATH = self.DataDIR+"misc/hydropathyindex"
    self.MassPATH = self.DataDIR+"misc/sidechainmass"

    #makes a dictionary out of hydropathy index and mass input files
    self.Hydro_D = readValueTable(self.HydroPATH)
    self.Mass_D = readValueTable(self.MassPATH)

    #create the FastMLTree object and set branch lengths
    self.Tree = FastMLTree(self.ModdedTreeFile , False)
    self.Tree.setBranchLengths()

    #parse out sequence information
    NodeToSequence_LD = self.getNodeToSequence_LD()
    self.NodeSequenceKey_L = NodeToSequence_LD[0]
    self.NodeToSequence_D = NodeToSequence_LD[1]

    #parse out report XML file and create ScopeAlgorithm instances
    self.BranchToAlgorithm_D = self.getBranchToAlgorithm_D()

    #collects every distinct PDB ID named by any branch, in first-seen order
    PDB_L = []
    SeenPDBs = set()
    PDBsRegex = re.compile("<PDBs>(.+?)</PDBs>")
    for BranchKey in self.BranchToAlgorithm_D.keys():
        AccKeysSearch = PDBsRegex.search(self.BranchToAlgorithm_D[BranchKey].alignmentSet)
        if not AccKeysSearch:
            continue
        for AccKey in AccKeysSearch.group(1).split(";"):
            #only adds the PDB ID key if it has not already been seen
            if AccKey not in SeenPDBs:
                SeenPDBs.add(AccKey)
                PDB_L.append(AccKey)

    #set PDB content dictionary and PDBXML content dictionary
    PDBAndPDBXMLContents_Dicts = getAllPDBFileDicts(PDB_L)
    self.PDBContents_D = PDBAndPDBXMLContents_Dicts[0]
    self.PDBXMLContents_D = PDBAndPDBXMLContents_Dicts[1]
    for Key in self.BranchToAlgorithm_D.keys():
        self.setPDBAndPDBXMLContentDictionaries(self.BranchToAlgorithm_D[Key])

    #parses out ScoringMatrixCoverage file
    self.ScoringMatrixCoverageKeys_D = self.getScoringMatrixCoverageKeys()
    self.ScoringMatrixPDBXMLMatchedKeys_D = self.getScoringMatrixPDBXMLMatchedKeys_D()

    #gets the indices for PDB IDs to be used in SAS and distance random distribution generation
    self.AccsToMutationCount_D = self.getNCoveredMutations_D()
    #self.AccsToDistanceCount_D = self.getNDistances_D()

    #get list of mass and hydropathy index change values for use in random distributions
    BranchSegmentMutations_L = self.getAllBranchSegmentMutations()
    self.MassChanges_L = BranchSegmentMutations_L[0]
    self.HydroChanges_L = BranchSegmentMutations_L[1]

    self.RandomDistributions_D = self.getRandomDistributions_D() #create all random distributions
    self.BranchToPValues_D = self.getAllBranchSegmentPValues() #get all PValues
    self.output() #output to PValue file
def __init__(self , FastaPATH , UserTreePATH , ProjectName):
    """
    Runs the whole-tree analysis for one FASTA file / tree pair and writes
    the result files into the ProjectName directory.

    When the tree cannot be parsed, or the sequences fail validation against
    the tree, the matching error message is stored in self.ExitString and
    printed, self.ExitStatus stays False and no file is written.

    Arguments:
        FastaPATH (String): path to the FASTA file of sequences
        UserTreePATH (String): path to the user supplied tree file
        ProjectName (String): directory the output files are written into (created when missing)

    Class attributes:
        ExitStatus (Bool): True once the WholeTreeOrthologousSubgroup object has been created
        ExitString (String): final XML report on success, otherwise the error message
        OSGString (String): initialised to "" and not modified here
        FastMLTree (FastMLTree obj): tree parsed from UserTreePATH
        FastaKey_L , Fasta_D: first and second items returned by readFasta
        OSG (WholeTreeOrthologousSubgroup obj): only set when validation succeeds

    Raises:
        OSError: if the ProjectName directory cannot be created
    """
    self.FastaPATH = FastaPATH
    self.UserTreePATH = UserTreePATH
    self.ProjectName = ProjectName

    #declaration of output variables
    self.ExitStatus = False
    self.ExitString = ""
    self.OSGString = ""

    #parses the tree file according to FastMLTree methods
    self.FastMLTree = FastMLTree(self.UserTreePATH , True)

    #if the FastMLTree object was not properly parsed
    if not self.FastMLTree.Parsed:
        self.ExitString = self.getTreeErrorMessage()
        #a parenthesised single value prints the same text under the Python 2 print statement
        print(self.ExitString)
        return

    #opens the Fasta file and gets the sequences as a dictionary
    ReadFasta = readFasta(self.FastaPATH)
    self.FastaKey_L = ReadFasta[0]
    self.Fasta_D = ReadFasta[1]

    #validates the sequences with the tree
    ValidSeqsWithTree = self.ValidateSeqsWithTree()

    #if the tree terminal nodes were not matched with the Fasta sequence headers
    if not ValidSeqsWithTree[0]:
        self.ExitString = self.getTreeMatchedToFastaErrorMessage()
        print(self.ExitString)
        return

    #instantiates the WholeTreeOrthologousSubgroup class object
    self.OSG = WholeTreeOrthologousSubgroup(self.FastMLTree , self.FastaPATH , self.FastaKey_L , self.Fasta_D)
    self.ExitStatus = True

    #prepares the final XML output string
    self.ExitString = ''.join([
        "<Group>\n",
        "\t<Group_id>NA</Group_id>\n\t<Number_OSGs>1</Number_OSGs>\n",
        "\t<OSGs>\n",
        self.OSG.ExitString,
        "\t</OSGs>\n",
        "</Group>\n",
    ])

    #creates the project directory without passing the path through a shell,
    #so names with spaces or shell metacharacters are handled literally
    if self.ProjectName and not os.path.isdir(self.ProjectName):
        os.makedirs(self.ProjectName)

    #output files, written in this order:
    #  Report.xml        protein adaptation XML file
    #  ModdedTree.nwk    newick tree with internal node names according to the cogent convention
    #  ScoringMatrix.xml information pertaining to the ReferenceToPDB2DScoringMatrix object
    #  AncestralSeqs.fa  FASTA file of reconstructed sequences
    #  AncestralProb.txt text file of reconstruction probabilities
    Outputs_L = [
        ("Report.xml" , self.ExitString),
        ("ModdedTree.nwk" , self.FastMLTree.CogentTree.getNewick(with_distances=True).replace("'","")),
        ("ScoringMatrix.xml" , self.OSG.MatrixGraphicsString),
        ("AncestralSeqs.fa" , self.OSG.ReconstructedFASTAString),
        ("AncestralProb.txt" , self.OSG.ReconstructedProbabilityString),
    ]
    for FileName , Contents in Outputs_L:
        with open("%s/%s" % (self.ProjectName , FileName) , "w") as w:
            w.write(Contents)

    #Final output message
    DoneMessage = "Done.\n" + \
        "Main output XML file written to %s/Report.xml\n" % (self.ProjectName) + \
        "Modified tree Newick file written to %s/ModdedTree.nwk\n" % (self.ProjectName) + \
        "Scoring Matrix XML file written to %s/ScoringMatrix.xml\n" % (self.ProjectName) + \
        "FASTA format file of reconstructed sequences written to %s/AncestralSeqs.fa\n" % (self.ProjectName) + \
        "Text file of reconstructed sequence probabilities written to %s/AncestralProb.txt" % (self.ProjectName)
    print(DoneMessage)