Esempio n. 1
0
    def __init__(self, Directory, DerivedoI, PDBoI):
        """
        Builds the tree/states figure, the alignment figure and the coloured
        structure file for one derived node / PDB structure pair.

        Everything happens at construction time. The constructor reads
        Report.xml and ModdedTree.nwk from Directory, creates
        <Directory>Figures/<DerivedoI>-<PDBoI>/ when it does not exist, and
        writes TreeAndStates.png, Alignment.png and Structure.pdb into it.
        It also prints the directory, derived node and PDB ID to stdout.

        Arguments:
        Directory (String): directory holding Report.xml, ModdedTree.nwk and ScoringMatrix.xml (a trailing "/" is appended when missing)
        DerivedoI (String): Derived node of interest
        PDBoI (String): PDB structure of interest

        Class attributes:
        Figures_L (List): list of all the figure types that will be created
        FigureSVG_D (Dict): key is the type of figure, value is the list of SVG lines that will draw the figure
        DerivedoInterest (String): Derived node of interest that the figure will be based on
        PDBoInterest (String): PDB structure that the derived node sequence aligned to and achieved a significant hit on
        OutputDirectory (String): directory the PNG and PDB files are written to
        NodeToSeq_D (Dict): key is the <H> text of a <Seq> element in the report, value is its <S> text
        BranchToAlgorithm_D (Dict): key is the <Branch_name> text of a <Branch> element, value is a ScopeAlgorithm built from that element
        BranchoInterest (String): branch key whose part after ">>" equals DerivedoInterest ("" when none matches)
        TotalElement_L (List): SVG lines of a combined figure that embeds the two PNG files by path
        """

        #initial setup of what figures will be created
        self.Figures_L = [
            "TreeAndStates", "Alignment", "Structurecartoon",
            "Structuresurface"
        ]
        self.FigureSVG_D = {Key: [] for Key in self.Figures_L}

        #every path below is built by plain concatenation, so the directory must end with "/"
        self.Directory = Directory
        if not self.Directory.endswith("/"):
            self.Directory = self.Directory + "/"

        self.DerivedoInterest = DerivedoI
        self.PDBoInterest = PDBoI

        print(self.Directory)
        print(self.DerivedoInterest)
        print(self.PDBoInterest)

        #output directory where files will be written
        self.OutputDirectory = "%sFigures/%s-%s/" % (
            self.Directory, self.DerivedoInterest, self.PDBoInterest)

        #os.makedirs also creates the intermediate "Figures/" level and is not
        #confused by spaces or shell metacharacters in the path
        if not os.path.exists(self.OutputDirectory):
            os.makedirs(self.OutputDirectory)

        #paths to relevant input files
        self.ReportPATH = self.Directory + "Report.xml"
        self.TreePATH = self.Directory + "ModdedTree.nwk"
        self.MatrixPATH = self.Directory + "ScoringMatrix.xml"

        #the report is read once and the handle is closed before parsing
        with open(self.ReportPATH, "r") as ReportFIn:
            ReportText = ReportFIn.read()

        HeaderRE = re.compile("<H>(.+?)</H>")
        SequenceRE = re.compile("<S>(.+?)</S>")
        BranchNameRE = re.compile("<Branch_name>(.+?)</Branch_name>")

        #parses the report file for sequences and branch relationships
        #(<Seq> elements are matched without DOTALL, so each must sit on one line)
        self.NodeToSeq_D = {
            HeaderRE.search(Seq).group(1): SequenceRE.search(Seq).group(1)
            for Seq in re.findall("<Seq>.+?</Seq>", ReportText)
        }
        self.BranchToAlgorithm_D = {
            BranchNameRE.search(Branch).group(1): ScopeAlgorithm(Branch)
            for Branch in re.findall("<Branch>.+?</Branch>", ReportText,
                                     re.DOTALL)
        }
        self.RectCount = 0

        #dimensions
        self.TreeFigWIDTH = 750
        self.TreeFigHEIGHT = 500
        self.TreeFigXOffset = 25
        self.TreeFigYOffset = 50

        #loads and parses tree, gets evolutionary distances for proper branch lengths
        self.CogentTree = LoadTree(self.TreePATH)
        self.FastMLTree = FastMLTree(self.TreePATH, False)
        self.FastMLTree.setBranchLengths()
        self.LongestDistance = self.getLongestEvoDistance()
        self.EvoDistance_D = {
            Key: self.getEvoDistance(Key)
            for Key in self.NodeToSeq_D.keys() if Key != self.FastMLTree.TopKey
        }
        #the top node is excluded above and pinned to distance zero
        self.EvoDistance_D[self.FastMLTree.TopKey] = 0.0
        self.ModdedEvoDistance_D = self.modEvoDistance()
        self.TreeCoords_D = self.setTreeCoords()

        FurthestPosition = 0.0
        FurthestClade = ""

        #finds the leaf whose first coordinate plus 12 units per name character
        #is largest (presumably the right-most edge of the leaf labels - TODO confirm)
        for Key in self.FastMLTree.LeafKey_L:
            Val = self.TreeCoords_D[Key][0] + (12 * len(Key))
            if Val > FurthestPosition:
                FurthestPosition = Val
                FurthestClade = Key

        self.BranchoInterest = ""

        #branch keys have the form "<something>>><derived node>"; the last match wins
        for Key in self.FastMLTree.BranchKey_L:
            if Key.split(">>")[1] == self.DerivedoInterest:
                self.BranchoInterest = Key

        #gets all relevant information for the states portion of the figure
        #(mutation keys are shifted down by one before being used as string
        #indices - presumably 1-based positions, TODO confirm)
        self.StateIndices_L = [
            int(X) - 1 for X in self.BranchToAlgorithm_D[self.BranchoInterest].
            getAllMutationXMLKeysAccordingToAccession(self.PDBoInterest)
        ]
        self.LeafStates_D = {
            Key:
            [self.NodeToSeq_D[Key][state] for state in self.StateIndices_L]
            for Key in self.FastMLTree.LeafKey_L
        }
        self.StateColour_D = self.getStateToHex()

        self.StateInc = 25.0

        self.StateFigHEIGHT = 500
        self.StateFigWIDTH = self.StateInc * (len(self.StateIndices_L)) + 50
        self.StateFigXOffset = self.TreeFigXOffset + self.TreeFigWIDTH + (
            12 * len(FurthestClade)) + 25
        self.StateFigYOffset = 50

        #creates the states and tree figure
        self.FigureSVG_D["TreeAndStates"].append(
            self.getSVGHeader(
                self.TreeFigHEIGHT + (self.TreeFigYOffset * 2),
                self.StateFigXOffset + self.StateFigWIDTH +
                self.TreeFigXOffset))
        self.makeTreeFig()
        self.makeStatesFig()
        self.FigureSVG_D["TreeAndStates"].append("</svg>")

        #PNG data is binary, so the file is opened in "wb"; the with-block
        #closes it even when the conversion raises
        self.TreeAndStatesFOutPATH = self.OutputDirectory + "TreeAndStates.png"
        with open(self.TreeAndStatesFOutPATH, "wb") as TreeStateFOut:
            cairosvg.svg2png(bytestring="\n".join(
                self.FigureSVG_D["TreeAndStates"]),
                             write_to=TreeStateFOut)

        #first leaf name of maximal length (used to reserve room for labels)
        LongestCladeName = ""
        for Key in self.FastMLTree.LeafKey_L:
            if len(Key) > len(LongestCladeName):
                LongestCladeName = Key

        #gets all relevant information for the alignment cartoon portion of the figure
        self.MatrixInfo = self.parseScoringMatrix()

        self.AlnInc = 11.0

        self.AlignmentFigWIDTH = self.AlnInc * len(
            self.MatrixInfo["Sseq"]) + self.AlnInc + (8 *
                                                      len(LongestCladeName))

        self.AlignmentFigHEIGHT = self.AlnInc * (
            len(self.FastMLTree.LeafKey_L) + 1) + self.AlnInc
        self.AlignmentFigXOffset = self.AlnInc
        self.AlignmentFigYOffset = self.AlnInc

        self.FigureSVG_D["Alignment"].append(
            self.getSVGHeader(self.AlignmentFigHEIGHT, self.AlignmentFigWIDTH))
        self.makeAlignmentFig()
        self.FigureSVG_D["Alignment"].append("</svg>")

        self.AlignmentFOutPATH = self.OutputDirectory + "Alignment.png"
        with open(self.AlignmentFOutPATH, "wb") as AlignmentFOut:
            cairosvg.svg2png(
                bytestring="\n".join(self.FigureSVG_D["Alignment"]),
                write_to=AlignmentFOut)

        #relevant information for the structure file in PDB format
        self.ColouredStructureFile = self.getColoredStructureFile()
        self.StructureFOutPATH = self.OutputDirectory + "Structure.pdb"
        with open(self.StructureFOutPATH, "w") as StructureFOut:
            StructureFOut.write(self.ColouredStructureFile.read())

        self.TotalFigWIDTH = 1000
        self.TotalFigHEIGHT = 600

        #combined figure: the two PNGs stacked vertically and referenced by file path
        self.TotalElement_L = [
            self.getSVGHeader(self.TotalFigHEIGHT, self.TotalFigWIDTH)
        ]
        self.TotalElement_L.append(
            '''\t<image x="0" y="0" width="1000" height="500" xlink:href="file://%s"/>'''
            % (self.TreeAndStatesFOutPATH))
        self.TotalElement_L.append(
            '''\t<image x="0" y="500" width="1000" height="100" xlink:href="file://%s"/>'''
            % (self.AlignmentFOutPATH))
        self.TotalElement_L.append("</svg>")
 def __init__(self , Directory , DataDIR):
     
     """
     Runs the whole P-value pipeline for one protein family directory.
     
     The constructor loads the report, tree and scoring-matrix files from
     Directory, builds the random distributions, computes the P-values
     and finally calls self.output().
     
     Arguments:
     Directory (String): Directory to read in report files and output the final PValue file (one trailing "/" is stripped)
     DataDIR (String): Directory where main program is held; it is concatenated directly with "misc/...", so it must end with "/"
     
     Class attributes:
     DataDIR (String): Directory where main program is held
     Directory (String): Directory to read in report files and output the final PValue file
     ProteinFamilyName (String): Protein family descriptor to use in random distribution file generation (last path component of Directory)
     
     ScopeXMLFile (String): Path to report (mutation mapping) file
     ModdedTreeFile (String): Path to newick syntax tree file with branch names according to first module
     ScoringMatrixXMLFile (String): Path to XML format file of all scoring keys (to be used in random distributions)
     PDBToEvalue_D: result of self.getPDBToEvalue_D()
     
     HydroPATH (String): Path to hydropathyindex file
     MassPATH (String): Path to sidechainmass file
     
     Hydro_D (Dict): Key is one letter AA code, Value is its hydropathy index
     Mass_D (Dict): Key is one letter AA code, Value is its side chain mass value
     
     Tree (FastMLTree obj): Tree object with the renamed branches
     NodeSequenceKey_L (List): List containing the names of all nodes
     NodeToSequence_D (Dict): Key is the node name, Value is the ancestral or extant sequence at that node
     
     BranchToAlgorithm_D (Dict): Key is the Branch key name, Value is a ScopeAlgorithm instance for that branch segment
     PDBContents_D (Dict): Key is PDB ID, value is dictionaries of all atom and residue information for that PDB file
     PDBXMLContents_D (Dict): Key is PDB ID, value is dictionaries of all atom and residue information for that PDBXML file
     
     ScoringMatrixCoverageKeys_D (Dict): Key is PDB ID, value is list of chain and position keys that correspond to successfully aligned regions
     ScoringMatrixPDBXMLMatchedKeys_D: result of self.getScoringMatrixPDBXMLMatchedKeys_D()
     AccsToMutationCount_D (Dict): Key is PDB ID, value is a list of integers for all branch segments with those number of mutations
     
     MassChanges_L (List): all mass change calculations that have happened anywhere on the tree
     HydroChanges_L (List): all hydropathy index change calculations that have happened anywhere on the tree
     
     RandomDistributions_D (Dict): Dictionary structure pointing to various number arrays based on the PDB ID used and the number of items drawn before averaging
     BranchToPValues_D (Dict): Dictionary structure pointing to the four P-Values for an ancestral, derived, PDB alignment triad
     
     AccsToDistanceCount_D is NOT set: the call to self.getNDistances_D() is disabled.
     """
     
     def readIndexFile(Path):
         #each non-blank line is "<key> <number> ..."; only the first two
         #whitespace-separated fields are used, blank lines are skipped,
         #and the file is closed as soon as it has been read
         with open(Path , "r") as FIn:
             return {Line.split()[0] : float(Line.split()[1]) for Line in FIn if Line.strip()}
     
     self.DataDIR = DataDIR
     
     #gets input/output directory and protein family name
     self.Directory = Directory
     if self.Directory.endswith("/"):
         self.Directory = self.Directory[:-1]
     
     #last path component; split() returns the whole string when there is no "/"
     self.ProteinFamilyName = self.Directory.split("/")[-1]
     
     #gets all path information for the relevant input files
     self.ScopeXMLFile = self.Directory+"/"+"Report.xml"
     self.ModdedTreeFile = self.Directory+"/"+"ModdedTree.nwk"
     self.ScoringMatrixXMLFile = self.Directory+"/"+"ScoringMatrix.xml"
     self.PDBToEvalue_D = self.getPDBToEvalue_D()
     
     #paths to more input files
     self.HydroPATH = self.DataDIR+"misc/hydropathyindex"
     self.MassPATH = self.DataDIR+"misc/sidechainmass"
     
     #makes a dictionary out of hydropathy index and mass input files
     self.Hydro_D = readIndexFile(self.HydroPATH)
     self.Mass_D = readIndexFile(self.MassPATH)
     
     #create the FastMLTree object and set branch lengths
     self.Tree = FastMLTree(self.ModdedTreeFile , False)
     self.Tree.setBranchLengths()
     
     #parse out sequence information
     NodeToSequence_LD = self.getNodeToSequence_LD()
     self.NodeSequenceKey_L = NodeToSequence_LD[0]
     self.NodeToSequence_D = NodeToSequence_LD[1]
     
     #parse out report XML file and create ScopeAlgorithm instances
     self.BranchToAlgorithm_D = self.getBranchToAlgorithm_D()
     
     #collect every distinct PDB ID named in a <PDBs>...</PDBs> element, in
     #first-seen order; the companion set keeps the membership test O(1)
     PDBsRE = re.compile("<PDBs>(.+?)</PDBs>")
     PDB_L = []
     SeenPDB_S = set()
     for Algorithm in self.BranchToAlgorithm_D.values():
         AccKeysSearch = PDBsRE.search(Algorithm.alignmentSet)
         if AccKeysSearch:
             for AccKey in AccKeysSearch.group(1).split(";"):
                 if AccKey not in SeenPDB_S:
                     SeenPDB_S.add(AccKey)
                     PDB_L.append(AccKey)
     
     #set PDB content dictionary and PDBXML content dictionary
     PDBAndPDBXMLContents_Dicts = getAllPDBFileDicts(PDB_L)
     self.PDBContents_D = PDBAndPDBXMLContents_Dicts[0]
     self.PDBXMLContents_D = PDBAndPDBXMLContents_Dicts[1]
     #list() snapshots the values so the loop is safe even if the callee touches the dictionary
     for Algorithm in list(self.BranchToAlgorithm_D.values()):
         self.setPDBAndPDBXMLContentDictionaries(Algorithm)
     
     #parses out ScoringMatrixCoverage file
     self.ScoringMatrixCoverageKeys_D = self.getScoringMatrixCoverageKeys()
     self.ScoringMatrixPDBXMLMatchedKeys_D = self.getScoringMatrixPDBXMLMatchedKeys_D()
     
     #gets the indices for PDB IDs to be used in SAS and distance random distribution generation
     self.AccsToMutationCount_D = self.getNCoveredMutations_D()
     
     #NOTE: the distance counts are currently disabled
     #self.AccsToDistanceCount_D = self.getNDistances_D()
     
     #get list of mass and hydropathy index change values for use in random distributions
     BranchSegmentMutations_L = self.getAllBranchSegmentMutations()
     self.MassChanges_L = BranchSegmentMutations_L[0]
     self.HydroChanges_L = BranchSegmentMutations_L[1]
     
     self.RandomDistributions_D = self.getRandomDistributions_D() #create all random distributions
     self.BranchToPValues_D = self.getAllBranchSegmentPValues() #get all PValues
     
     self.output() #output to PValue file
Esempio n. 3
0
 def __init__(self , FastaPATH , UserTreePATH , ProjectName):
     """
     Runs the whole-tree analysis and writes its result files into ProjectName.
     
     The tree is parsed first, then the FASTA headers are validated against
     it. When either step fails, the matching error message is stored in
     ExitString, printed, and nothing is written. Otherwise the directory
     ProjectName is created if needed and Report.xml, ModdedTree.nwk,
     ScoringMatrix.xml, AncestralSeqs.fa and AncestralProb.txt are written
     into it, followed by a summary printed to stdout.
     
     Arguments:
     FastaPATH (String): path to the FASTA file of sequences
     UserTreePATH (String): path to the user supplied tree file
     ProjectName (String): output directory for all result files
     
     Class attributes:
     ExitStatus (Bool): False until the analysis has been run, True afterwards
     ExitString (String): the report XML on success, otherwise the error message
     OSGString (String): initialised to "" and not otherwise set here
     FastMLTree (FastMLTree obj): tree parsed from UserTreePATH
     FastaKey_L , Fasta_D: first and second item returned by readFasta (only set when the tree parsed)
     OSG (WholeTreeOrthologousSubgroup obj): only set when the sequences matched the tree
     """
     
     def writeOutput(FileName , Contents):
         #writes one result file into the project directory and closes it
         with open("%s/%s" % (self.ProjectName , FileName) , "w") as w:
             w.write(Contents)
     
     self.FastaPATH = FastaPATH
     self.UserTreePATH = UserTreePATH
     self.ProjectName = ProjectName
     
     #declaration of output variables
     self.ExitStatus = False
     self.ExitString = ""
     self.OSGString = ""
     
     #parses the tree file according to FastMLTree methods
     self.FastMLTree = FastMLTree(self.UserTreePATH , True)
     
     #if the FastMLTree object was not properly parsed
     if not self.FastMLTree.Parsed:
         self.ExitString = self.getTreeErrorMessage()
         print(self.ExitString)
         return
     
     #opens the Fasta file and gets the sequences as a dictionary
     ReadFasta = readFasta(self.FastaPATH)
     self.FastaKey_L = ReadFasta[0]
     self.Fasta_D = ReadFasta[1]
     
     #validates the sequences with the tree
     ValidSeqsWithTree = self.ValidateSeqsWithTree()
     
     #if the tree terminal nodes were not matched with the Fasta sequence headers
     if not ValidSeqsWithTree[0]:
         self.ExitString = self.getTreeMatchedToFastaErrorMessage()
         print(self.ExitString)
         return
     
     #instantiates the WholeTreeOrthologousSubgroup class object
     self.OSG = WholeTreeOrthologousSubgroup(self.FastMLTree , self.FastaPATH , self.FastaKey_L , self.Fasta_D)
     
     #reaching this point is what counts as success; the flag was set
     #unconditionally before, so the old "if self.ExitStatus" test was always true
     self.ExitStatus = True
     
     #prepares the final XML output string
     self.ExitString = ''.join([
         "<Group>\n" ,
         "\t<Group_id>NA</Group_id>\n\t<Number_OSGs>1</Number_OSGs>\n" ,
         "\t<OSGs>\n" ,
         self.OSG.ExitString ,
         "\t</OSGs>\n" ,
         "</Group>\n" ,
     ])
     
     #creates the project directory (and any missing parents) without going
     #through a shell, so names with spaces or metacharacters are safe
     if not os.path.isdir(self.ProjectName):
         os.makedirs(self.ProjectName)
     
     #writes the protein adaptation XML file
     writeOutput("Report.xml" , self.ExitString)
     
     #writes a new modified newick tree file with internal node names according to the cogent convention
     writeOutput("ModdedTree.nwk" , self.FastMLTree.CogentTree.getNewick(with_distances=True).replace("'",""))
     
     #writes an XML file containing information pertaining to the ReferenceToPDB2DScoringMatrix object
     writeOutput("ScoringMatrix.xml" , self.OSG.MatrixGraphicsString)
     
     #writes FASTA file of reconstructed sequences
     writeOutput("AncestralSeqs.fa" , self.OSG.ReconstructedFASTAString)
     
     #writes text file of reconstruction probabilities
     writeOutput("AncestralProb.txt" , self.OSG.ReconstructedProbabilityString)
     
     #Final output message
     print("\n".join([
         "Done." ,
         "Main output XML file written to %s/Report.xml" % (self.ProjectName) ,
         "Modified tree Newick file written to %s/ModdedTree.nwk" % (self.ProjectName) ,
         "Scoring Matrix XML file written to %s/ScoringMatrix.xml" % (self.ProjectName) ,
         "FASTA format file of reconstructed sequences written to %s/AncestralSeqs.fa" % (self.ProjectName) ,
         "Text file of reconstructed sequence probabilities written to %s/AncestralProb.txt" % (self.ProjectName) ,
     ]))