class ScopeAlgorithmTreeSet:
    
    "CONSTRUCTOR"
    def __init__(self , Directory , DataDIR):
        
        """
        Parses every input file of one protein family, builds the random
        distributions, computes the P-values and writes them to PValues.txt.
        
        Args:
        Directory (String): Directory holding Report.xml, ModdedTree.nwk and ScoringMatrix.xml; PValues.txt is written here
        DataDIR (String): Directory where main program is held; misc/hydropathyindex and misc/sidechainmass are read from it
        
        Class attributes:
        DataDIR (String): Directory where main program is held
        Directory (String): Directory to read in report files and output the final PValue file (stored without trailing slash)
        ProteinFamilyName (String): Last path component of Directory
        
        ScopeXMLFile (String): Path to report (mutation mapping) file
        ModdedTreeFile (String): Path to newick syntax tree file with branch names according to first module
        ScoringMatrixXMLFile (String): Path to XML format file of all scoring keys (to be used in random distributions)
        PDBToEvalue_D (Dict): Result of getPDBToEvalue_D()
        
        HydroPATH (String): Path to hydropathyindex file
        MassPATH (String): Path to sidechainmass file
        
        Hydro_D (Dict): Key is the first column of the hydropathyindex file, Value is the second column as a float
        Mass_D (Dict): Key is the first column of the sidechainmass file, Value is the second column as a float
        
        Tree (FastMLTree obj): Tree object built from ModdedTreeFile
        NodeSequenceKey_L (List): List containing the names of all nodes
        NodeToSequence_D (Dict): Key is the node name, Value is the sequence object at that node
        
        BranchToAlgorithm_D (Dict): Key is the Branch key name, Value is a ScopeAlgorithm instance for that branch segment
        PDBContents_D (Dict): First item returned by getAllPDBFileDicts
        PDBXMLContents_D (Dict): Second item returned by getAllPDBFileDicts
        
        ScoringMatrixCoverageKeys_D (Dict): Result of getScoringMatrixCoverageKeys()
        ScoringMatrixPDBXMLMatchedKeys_D (Dict): Result of getScoringMatrixPDBXMLMatchedKeys_D()
        AccsToMutationCount_D (Dict): Result of getNCoveredMutations_D()
        
        MassChanges_L (List): all mass change calculations returned by getAllBranchSegmentMutations()
        HydroChanges_L (List): all hydropathy index change calculations returned by getAllBranchSegmentMutations()
        
        RandomDistributions_D (Dict): Result of getRandomDistributions_D()
        BranchToPValues_D (Dict): Result of getAllBranchSegmentPValues()
        
        NOTE: DistributionPath and AccsToDistanceCount_D were listed in earlier
        documentation but are not set by this constructor.
        """
        
        self.DataDIR = DataDIR
        
        #gets input/output directory (without trailing slash) and protein family name
        self.Directory = Directory
        if self.Directory.endswith("/"):
            self.Directory = self.Directory[:-1]
        
        #split() returns the whole string when there is no "/" in it
        self.ProteinFamilyName = self.Directory.split("/")[-1]
        
        #gets all path information for the relevant input files
        self.ScopeXMLFile = self.Directory+"/"+"Report.xml"
        self.ModdedTreeFile = self.Directory+"/"+"ModdedTree.nwk"
        self.ScoringMatrixXMLFile = self.Directory+"/"+"ScoringMatrix.xml"
        self.PDBToEvalue_D = self.getPDBToEvalue_D()
        
        #paths to more input files; a missing trailing slash on DataDIR is tolerated
        DataRoot = self.DataDIR
        if DataRoot and not DataRoot.endswith("/"):
            DataRoot = DataRoot+"/"
        self.HydroPATH = DataRoot+"misc/hydropathyindex"
        self.MassPATH = DataRoot+"misc/sidechainmass"
        
        #makes a dictionary out of hydropathy index and mass input files
        #(files are closed right after reading, blank lines are skipped)
        with open(self.HydroPATH,"r") as HydroFile:
            self.Hydro_D = {Fields[0] : float(Fields[1]) for Fields in (Line.split() for Line in HydroFile) if Fields}
        with open(self.MassPATH,"r") as MassFile:
            self.Mass_D = {Fields[0] : float(Fields[1]) for Fields in (Line.split() for Line in MassFile) if Fields}
        
        #create the FastMLTree object and set branch lengths
        self.Tree = FastMLTree(self.ModdedTreeFile , False)
        self.Tree.setBranchLengths()
        
        #parse out sequence information
        NodeToSequence_LD = self.getNodeToSequence_LD()
        self.NodeSequenceKey_L = NodeToSequence_LD[0]
        self.NodeToSequence_D = NodeToSequence_LD[1]
        
        #parse out report XML file and create ScopeAlgorithm instances
        self.BranchToAlgorithm_D = self.getBranchToAlgorithm_D()
        
        #collect every PDB ID key named by any branch, once each, in first-seen order
        PDBsPattern = re.compile("<PDBs>(.+?)</PDBs>")
        PDB_L = []
        SeenPDB = set()
        for BranchKey in self.BranchToAlgorithm_D.keys():
            AccKeysSearch = PDBsPattern.search(self.BranchToAlgorithm_D[BranchKey].alignmentSet)
            if AccKeysSearch:
                for AccKey in AccKeysSearch.group(1).split(";"):
                    if AccKey not in SeenPDB:
                        SeenPDB.add(AccKey)
                        PDB_L.append(AccKey)
        
        #set PDB content dictionary and PDBXML content dictionary on self and on every ScopeAlgorithm
        PDBAndPDBXMLContents_Dicts = getAllPDBFileDicts(PDB_L)
        self.PDBContents_D = PDBAndPDBXMLContents_Dicts[0]
        self.PDBXMLContents_D = PDBAndPDBXMLContents_Dicts[1]
        for Key in self.BranchToAlgorithm_D.keys():
            self.setPDBAndPDBXMLContentDictionaries(self.BranchToAlgorithm_D[Key])
        
        #parses out ScoringMatrixCoverage file
        self.ScoringMatrixCoverageKeys_D = self.getScoringMatrixCoverageKeys()
        self.ScoringMatrixPDBXMLMatchedKeys_D = self.getScoringMatrixPDBXMLMatchedKeys_D()
        
        #gets the indices for PDB IDs to be used in SAS and distance random distribution generation
        self.AccsToMutationCount_D = self.getNCoveredMutations_D()
        
        #get list of mass and hydropathy index change values for use in random distributions
        BranchSegmentMutations_L = self.getAllBranchSegmentMutations()
        self.MassChanges_L = BranchSegmentMutations_L[0]
        self.HydroChanges_L = BranchSegmentMutations_L[1]
        
        self.RandomDistributions_D = self.getRandomDistributions_D() #create all random distributions
        self.BranchToPValues_D = self.getAllBranchSegmentPValues() #get all PValues
        
        self.output() #output to PValue file
    
    def getPDBToEvalue_D(self):
        Ret = {}
        allAlignments_L = re.findall("<PDB_alignment>.+?</PDB_alignment>", open(self.ScoringMatrixXMLFile,"r").read() , re.DOTALL)
        for aln in allAlignments_L:
            pdbid = re.compile("<PDB_id>(.+?)</PDB_id>").search(aln).group(1)
            e = re.compile("<E_value>(.+?)</E_value>").search(aln).group(1)
            
            acc = pdbid.split("|")[0].lower()
            chain = pdbid.split("|")[1]
            Ret[acc+"|"+chain] = float(e)
        return Ret
        
    
    "gets list and dictionary of node keys to sequences"
    def getNodeToSequence_LD(self):
        """
        Gets list and dictionary of node keys to sequences from the report XML file.
        
        Returns:
        List: [list of <H> header keys in file order, Dict of header key to FASequence object]
        """
        HeaderPattern = re.compile("<H>(.+?)</H>")
        SequencePattern = re.compile("<S>(.+?)</S>")
        
        #the with-block closes the file handle as soon as the text has been read
        with open(self.ScopeXMLFile , "r") as ReportFile:
            Report = ReportFile.read()
        
        ret = {}
        retKey_L = []
        
        #parses all Seq elements (no re.DOTALL, so only entries held on a single line match)
        #and makes a dictionary and list entry for each one
        for Seq in re.findall("<Seq>.+?</Seq>" , Report):
            SeqKey = HeaderPattern.search(Seq).group(1)
            ret[SeqKey] = FASequence(SeqKey , SequencePattern.search(Seq).group(1))
            retKey_L.append(SeqKey)
        return [retKey_L,ret]
    
    "gets dictionary of branch keys to ScopeAlgorithm objects representing those branches"
    def getBranchToAlgorithm_D(self):
        """
        Gets dictionary of branch keys to ScopeAlgorithm objects representing those branches.
        
        Returns:
        Dict: Key is the <Branch_name> text, Value is a ScopeAlgorithm built from the whole <Branch> element
        """
        BranchNamePattern = re.compile("<Branch_name>(.+?)</Branch_name>")
        
        #the with-block closes the file handle as soon as the text has been read
        with open(self.ScopeXMLFile , "r") as ReportFile:
            Report = ReportFile.read()
        
        #finds all branches, parses out the branch name and sets this as the key
        ret = {}
        for Branch in re.findall("<Branch>.+?</Branch>" , Report , re.DOTALL):
            BranchKey = BranchNamePattern.search(Branch).group(1)
            ret[BranchKey] = ScopeAlgorithm(Branch)
        return ret
    
    
    

    
    "sets the ScopeAlgorithm instance's PDBContents_D and PDBXMLContents_D"
    def setPDBAndPDBXMLContentDictionaries(self,SA):
        SA.PDBContents_D = self.PDBContents_D
        SA.PDBXMLContents_D = self.PDBXMLContents_D
    
    "gets dictionary of covered residues in each PDB accession"
    def getScoringMatrixCoverageKeys(self):
        ret = {}
        #finds all PDBIDs and adds their key as a separate entry in the dictionary
        for CoverageKeySet in re.findall("<Coverage>.+?</Coverage>" , re.compile("(<Coverages>.+?</Coverages>)",re.DOTALL).search(open(self.ScoringMatrixXMLFile,"r").read()).group(1)):
            ret[re.compile("<ID>(.+?)</ID>",).search(CoverageKeySet).group(1)] = re.compile("<Keys>(.+?)</Keys>").search(CoverageKeySet).group(1).split(",") #get all covered keys
        return ret
    
    "gets dictionary of residues that are covered by the alignment and also covered by the parse PDB structure file"
    def getScoringMatrixPDBXMLMatchedKeys_D(self):
        Ret = {}
        
        for AccKey in self.PDBXMLContents_D.keys():
            
            Ret[AccKey] = []
            for PosKey in self.ScoringMatrixCoverageKeys_D[AccKey]:
                #checks if the coverage key is also in the PDBXML residue dictionary
                if PosKey in sets.Set(self.PDBXMLContents_D[AccKey]["XMLResidue_D"].keys()):
                    Ret[AccKey].append(PosKey)
        return Ret
    
    "gets dictionary of PDB IDs and the list of integers used to draw SAS random distributions"
    def getNCoveredMutations_D(self):
        
        AccsToMutationCount_D = {}
        for BranchKey in self.BranchToAlgorithm_D.keys():
            SA = self.BranchToAlgorithm_D[BranchKey]
            
            #for each ScopeAlgorithm instance, if mutations are present
            if SA.mutationsPresent:
                AccsToMutationCountForSingleSA_D = {}
                AccsToKey_D = {}
                
                #for each mutated site
                for MutationXMLKey in SA.mutationsXMLkey_L:
                    
                    #checks if that mutation has coverage, then gets the accession and position of that mutation
                    if SA.mutationScore_D[MutationXMLKey]["Coverage"]:
                        AllAccPos = SA.getAccessionPosition_L(SA.mutationsXML_D[MutationXMLKey])
                        for AccPos in AllAccPos:
                            Acc = AccPos[0]
                            Pos = AccPos[1]
                            if Acc in AccsToMutationCount_D.keys():
                                pass
                            else:
                                AccsToMutationCount_D[Acc] = []
                            
                            #adds the PDB ID to the count dictionary for a single ScopeAlgorithm
                            if Acc in AccsToMutationCountForSingleSA_D.keys():
                                pass
                            else:
                                AccsToMutationCountForSingleSA_D[Acc] = 0
                                AccsToKey_D[Acc] = []
                        
                            #adds one count to that PDB ID
                            AccsToMutationCountForSingleSA_D[Acc] += 1
                            AccsToKey_D[Acc].append(Pos)
                
                #sets the ScopeAlgorithm AccsToMutationCount to the SingleSA_D
                SA.AccsToMutationCount = AccsToMutationCountForSingleSA_D
                SA.AccsToKey_D = AccsToKey_D
                
                #adds the index to the overall dictionary if it is not already in there
                for Key in AccsToMutationCountForSingleSA_D.keys():
                    if AccsToMutationCountForSingleSA_D[Key] in sets.Set(AccsToMutationCount_D[Key]):
                        pass
                    else:
                        #only adds the index if it is greater than 2 mutations
                        if AccsToMutationCountForSingleSA_D[Key] >= 2:
                            AccsToMutationCount_D[Key].append(AccsToMutationCountForSingleSA_D[Key])
                            
        return AccsToMutationCount_D
    
    "gets hydropathy index and mass change lists for all mutations that have occurred in the tree"
    def getAllBranchSegmentMutations(self):
        MassRet = []
        HydroRet = []
        
        #gets all mutations from all branches and all states
        M_L = [[self.BranchToAlgorithm_D[B].getMutationType(self.BranchToAlgorithm_D[B].mutationsXML_D[MutationXMLKey]) for MutationXMLKey in self.BranchToAlgorithm_D[B].mutationsXMLkey_L if self.BranchToAlgorithm_D[B].mutationScore_D[MutationXMLKey]["Coverage"]] for B in self.BranchToAlgorithm_D.keys() if self.BranchToAlgorithm_D[B].mutationsPresent]
        Mut_L = []
        for M in M_L:
            for Mut in M:
                Mut_L.append(Mut)
        
        #makes lists of hydropathy index changes and mass changes from the list of state changes
        HydroRet = [self.getHydroDif(Mut[0],Mut[1]) for Mut in Mut_L]
        MassRet = [self.getMassDif(Mut[0],Mut[1]) for Mut in Mut_L]

        return [MassRet,HydroRet]
        
    ####################################################################################################
    
    "gets squared difference in side chain mass between ancestral and derived sequence states"
    def getMassDif(self,StateA,StateB):
        return math.pow(self.Mass_D[StateA] - self.Mass_D[StateB] , 2)
    "gets squared difference in hydropathy index between ancestral and derived sequence states"
    def getHydroDif(self,StateA,StateB):
        return math.pow(self.Hydro_D[StateA] - self.Hydro_D[StateB] , 2)
    
    "gets list of all possible combinations of distances in a list of mutated sites from the same PDB structure"
    def getCombinatorialListOfPairwiseDistances(self,AccKey,AccsA_L):
        
        AccsB_L = AccsA_L[1:]
        count = 0
        Distances_L = []
        SA = self.BranchToAlgorithm_D[self.BranchToAlgorithm_D.keys()[0]]
        
        #combinations of pairwise distances
        for AccA in AccsA_L:
            for AccB in AccsB_L:
                #gets both points
                APDBXMLLine = SA.getPDBXMLLine(AccKey,AccA)
                BPDBXMLLine = SA.getPDBXMLLine(AccKey,AccB)
                
                if APDBXMLLine and BPDBXMLLine:    
                    APoint = SA.getAlphaCarbonPoint(APDBXMLLine)
                    BPoint = SA.getAlphaCarbonPoint(BPDBXMLLine)
                
                    #as long as they are not null, calculate the magnitude of distance between them and add it to a list
                    if APoint and BPoint:
                        Distances_L.append(SA.getDistanceMagnitude(APoint , BPoint))
            AccsB_L = AccsB_L[1:]
        
        return Distances_L
    
    "general method for writing/retrieving a random distribution array"
    def getAnyAverageRandomDist(self,AveragedNumbers_L):
        FinalNumbers_L = [AveragedNumber for AveragedNumber in AveragedNumbers_L if math.isnan(AveragedNumber) == False]
        return gaussian_kde(array(FinalNumbers_L))
    "gets the average of numbers in a list"
    def getAveragedData(self,Numbers_L):
        return numpy.mean([Number for Number in Numbers_L if Number != None])
    "gets a random sample of integers to be used as random indices to draw numbers for the random distributions"
    def getRandomSampleOfIntegers(self,MaxLength,Index):
        return random.sample(range(MaxLength),Index)
    
    "get SAS random distribution for a single PDB ID and index"
    def getRelativeSASRandDistForIndex(self,AccKey,Index):
        Ret = None
        #print AccKey
        #print Index
        #gets 10000 SAS averages of randomly selected indices on the protein structure within the alignment bounds
        try:
            
            Ret = self.getAnyAverageRandomDist(\
                    [self.getAveragedData(\
                        [self.BranchToAlgorithm_D[self.BranchToAlgorithm_D.keys()[0]].getRelativeGlobalSAS(self.PDBXMLContents_D[AccKey]["XMLResidue_D"][self.ScoringMatrixPDBXMLMatchedKeys_D[AccKey][xIndex]]) \
                            for xIndex in self.getRandomSampleOfIntegers(len(self.ScoringMatrixPDBXMLMatchedKeys_D[AccKey]),Index)]) for i in range(0,10000)])
            
        except Exception as e:
            print e
            
        return Ret
    
    "get distance random distribution for a single PDB ID and index"
    def getRelativeDistanceRandDistForIndex(self,AccKey,Index):
        Ret = None
        
        #gets 10000 distance averages of randomly selected indices on the protein structure within the alignment bounds
        try:
            Ret = self.getAnyAverageRandomDist(\
                    [self.getAveragedData(\
                        self.getCombinatorialListOfPairwiseDistances(AccKey,[self.ScoringMatrixPDBXMLMatchedKeys_D[AccKey][xIndex]\
                            for xIndex in self.getRandomSampleOfIntegers(len(self.ScoringMatrixPDBXMLMatchedKeys_D[AccKey]),Index)])) for i in range(0,10000)])
        except Exception as e:
            print e
        return Ret
    
    "gets SAS random distributions for all indices for one PDB ID"
    def getRelativeSASRandDistAllIndices(self,AccKey,Indices):
        return {str(Index) : self.getRelativeSASRandDistForIndex(AccKey,Index) for Index in Indices}
    "gets distance random distributions for all indices for one PDB ID"
    def getDistanceRandDistAllIndices(self,AccKey,Indices):
        return {str(Index) : self.getRelativeDistanceRandDistForIndex(AccKey,Index) for Index in Indices}
    "gets hydropathy index change random distributions for all indices for one PDB ID"
        
    "gets all random distributions for each criterion and each PDB ID key and each index (ie. highest method for random distributions)"
    def getRandomDistributions_D(self):
        return {"SAS":{AccKey : self.getRelativeSASRandDistAllIndices(AccKey,self.AccsToMutationCount_D[AccKey]) for AccKey in self.AccsToMutationCount_D.keys()},\
                "Dist": {AccKey : self.getDistanceRandDistAllIndices(AccKey,self.AccsToMutationCount_D[AccKey]) for AccKey in self.AccsToMutationCount_D.keys()}}
                
    
    ####################################################################################################
    
    "general method for retrieving a P-value"
    def getGeneralPValue(self,Criterion,Acc,Index,Point):
        Ret = None
        
        #does not execute if there are only 0.0's in the Random distribution (faulty distribution)
        if self.RandomDistributions_D[Criterion][Acc][str(Index)].n == 1:
            pass
        #gets CDF function (percentile) using the observed point as the input and distribution as the background
        else:
            neginf = float("inf") * -1.0
            Ret = self.RandomDistributions_D[Criterion][Acc][str(Index)].integrate_box_1d(neginf,Point)
        
        return Ret
    
    "get SAS p-value for a particular ancestral, derived, triple alignment triad"
    def getSASPValue(self,SA,AccKey):
        Avg = None
        PVal = None
        Ret = None
        
        if str(SA.AccsToMutationCount[AccKey]) in self.RandomDistributions_D["SAS"][AccKey].keys():
            try:
                RSAS_L = []
                
                for AccPos in SA.getAllPositionKeysAccordingToAccession(AccKey):
                    pdbxmlLine = SA.getPDBXMLLine(AccPos[0],AccPos[1])
                    if pdbxmlLine:
                        SASToAdd = SA.getRelativeGlobalSAS(pdbxmlLine)
                        RSAS_L.append(SASToAdd)
                
                Avg = self.getAveragedData(RSAS_L)
                
                if numpy.isnan(Avg):
                    pass
                else:
                    PVal = self.getGeneralPValue("SAS",AccKey,SA.AccsToMutationCount[AccKey],Avg)
                    Ret = [Avg,PVal]
                
                
            except Exception as e:
                pass
                
        return Ret
    
    "get distance p-value for a particular ancestral, derived, triple alignment triad"
    def getDistPValue(self,SA,AccKey):
        PVal = None
        Avg = None
        Ret = None
        
        if AccKey in SA.AccsToMutationCount.keys():
            if str(SA.AccsToMutationCount[AccKey]) in self.RandomDistributions_D["Dist"][AccKey].keys():
                try:
                    Pairwise_L = self.getCombinatorialListOfPairwiseDistances(AccKey,SA.AccsToKey_D[AccKey])
                    Avg = self.getAveragedData(Pairwise_L)
                    if numpy.isnan(Avg):
                        pass
                    else:
                        PVal = self.getGeneralPValue("Dist",AccKey,SA.AccsToMutationCount[AccKey],Avg)
                        Ret = [Avg,PVal]
                    
                except Exception as e:
                    print e
        return Ret
    
    
    
    "gets PValues for all four criteria for one ancestral,derived, PDB alignment triad"
    def getAllPValuesForAccession(self,SA,AccKey):
        return {"SAS":self.getSASPValue(SA,AccKey),\
                "Dist":self.getDistPValue(SA,AccKey)}
                
    "gets all PValues for one ancestral, derived alignment pair"
    def getAllPValuesForBranchSegment(self,SA):
        return {Acc:self.getAllPValuesForAccession(SA,Acc) for Acc in self.AccsToMutationCount_D.keys() if Acc in SA.AccsToMutationCount.keys()}
    "gets all PValues for all ancestral, derived alignment pairs"
    def getAllBranchSegmentPValues(self):
        return {BranchKey : self.getAllPValuesForBranchSegment(self.BranchToAlgorithm_D[BranchKey]) for BranchKey in self.BranchToAlgorithm_D.keys() if self.BranchToAlgorithm_D[BranchKey].mutationsPresent}
    
    
    ####################################################################################################
    
    "writes PValue information to the appropriate file"
    def output(self):
        AllOutput_L = ["Branch               PDB   #M   Msas  Psas  Mdis  Pdis"] #header line
        
        #for each ancestral, derived, PDB alignment triad
        for BranchKey in self.BranchToPValues_D.keys():
            
            for AccKey in self.BranchToPValues_D[BranchKey].keys():
                
                #if there is a PValue to this triad, then it will format an appropriate output string
                OutputString = None
                if self.BranchToPValues_D[BranchKey][AccKey]["SAS"] and self.BranchToPValues_D[BranchKey][AccKey]["Dist"]:
                    if self.BranchToPValues_D[BranchKey][AccKey]["Dist"][0] != 0.0:
                        
                        OutputString = "%s%s %s%s %s%s %s%s %s%s %s%s %s" %   (BranchKey, " "*(20-len(BranchKey)),\
                                                                               AccKey," "*(5-len(AccKey)),\
                                                                               str(len(self.BranchToAlgorithm_D[BranchKey].getAllMutationXMLAccordingToAccession(AccKey))), " "*(4-len(str(len(self.BranchToAlgorithm_D[BranchKey].getAllMutationXMLAccordingToAccession(AccKey))))),\
                                                                               str(round(self.BranchToPValues_D[BranchKey][AccKey]["SAS"][0] , 3)), " "*(5-len(str(round(self.BranchToPValues_D[BranchKey][AccKey]["SAS"][0] , 3)))),\
                                                                               '%.2E' % self.BranchToPValues_D[BranchKey][AccKey]["SAS"][1], " "*(5-len('%.2E' % self.BranchToPValues_D[BranchKey][AccKey]["SAS"][1])),\
                                                                               str(round(self.BranchToPValues_D[BranchKey][AccKey]["Dist"][0] , 3)), " "*(5-len(str(round(self.BranchToPValues_D[BranchKey][AccKey]["Dist"][0] , 3)))),\
                                                                               '%.2E' % self.BranchToPValues_D[BranchKey][AccKey]["Dist"][1])
                                                                               
                                                                               
                        AllOutput_L.append(OutputString)
                                                                   
        #writes PValues to output file and displays end prompt.
        AllPath = "%s/PValues.txt" % (self.Directory)
        with open(AllPath,"w") as w:
            w.write("\n".join(AllOutput_L))
            
        print "Done.\nAll P-Values written to %s" % (AllPath)
 def __init__(self , Directory , DataDIR):
     
     """
     Parses every input file of one protein family, builds the random
     distributions, computes the P-values and writes them to PValues.txt.
     
     Args:
     Directory (String): Directory holding Report.xml, ModdedTree.nwk and ScoringMatrix.xml; PValues.txt is written here
     DataDIR (String): Directory where main program is held; misc/hydropathyindex and misc/sidechainmass are read from it
     
     Class attributes:
     DataDIR (String): Directory where main program is held
     Directory (String): Directory to read in report files and output the final PValue file (stored without trailing slash)
     ProteinFamilyName (String): Last path component of Directory
     
     ScopeXMLFile (String): Path to report (mutation mapping) file
     ModdedTreeFile (String): Path to newick syntax tree file with branch names according to first module
     ScoringMatrixXMLFile (String): Path to XML format file of all scoring keys (to be used in random distributions)
     PDBToEvalue_D (Dict): Result of getPDBToEvalue_D()
     
     HydroPATH (String): Path to hydropathyindex file
     MassPATH (String): Path to sidechainmass file
     
     Hydro_D (Dict): Key is the first column of the hydropathyindex file, Value is the second column as a float
     Mass_D (Dict): Key is the first column of the sidechainmass file, Value is the second column as a float
     
     Tree (FastMLTree obj): Tree object built from ModdedTreeFile
     NodeSequenceKey_L (List): List containing the names of all nodes
     NodeToSequence_D (Dict): Key is the node name, Value is the sequence object at that node
     
     BranchToAlgorithm_D (Dict): Key is the Branch key name, Value is a ScopeAlgorithm instance for that branch segment
     PDBContents_D (Dict): First item returned by getAllPDBFileDicts
     PDBXMLContents_D (Dict): Second item returned by getAllPDBFileDicts
     
     ScoringMatrixCoverageKeys_D (Dict): Result of getScoringMatrixCoverageKeys()
     ScoringMatrixPDBXMLMatchedKeys_D (Dict): Result of getScoringMatrixPDBXMLMatchedKeys_D()
     AccsToMutationCount_D (Dict): Result of getNCoveredMutations_D()
     
     MassChanges_L (List): all mass change calculations returned by getAllBranchSegmentMutations()
     HydroChanges_L (List): all hydropathy index change calculations returned by getAllBranchSegmentMutations()
     
     RandomDistributions_D (Dict): Result of getRandomDistributions_D()
     BranchToPValues_D (Dict): Result of getAllBranchSegmentPValues()
     
     NOTE: DistributionPath and AccsToDistanceCount_D were listed in earlier
     documentation but are not set by this constructor.
     """
     
     self.DataDIR = DataDIR
     
     #gets input/output directory (without trailing slash) and protein family name
     self.Directory = Directory
     if self.Directory.endswith("/"):
         self.Directory = self.Directory[:-1]
     
     #split() returns the whole string when there is no "/" in it
     self.ProteinFamilyName = self.Directory.split("/")[-1]
     
     #gets all path information for the relevant input files
     self.ScopeXMLFile = self.Directory+"/"+"Report.xml"
     self.ModdedTreeFile = self.Directory+"/"+"ModdedTree.nwk"
     self.ScoringMatrixXMLFile = self.Directory+"/"+"ScoringMatrix.xml"
     self.PDBToEvalue_D = self.getPDBToEvalue_D()
     
     #paths to more input files; a missing trailing slash on DataDIR is tolerated
     DataRoot = self.DataDIR
     if DataRoot and not DataRoot.endswith("/"):
         DataRoot = DataRoot+"/"
     self.HydroPATH = DataRoot+"misc/hydropathyindex"
     self.MassPATH = DataRoot+"misc/sidechainmass"
     
     #makes a dictionary out of hydropathy index and mass input files
     #(files are closed right after reading, blank lines are skipped)
     with open(self.HydroPATH,"r") as HydroFile:
         self.Hydro_D = {Fields[0] : float(Fields[1]) for Fields in (Line.split() for Line in HydroFile) if Fields}
     with open(self.MassPATH,"r") as MassFile:
         self.Mass_D = {Fields[0] : float(Fields[1]) for Fields in (Line.split() for Line in MassFile) if Fields}
     
     #create the FastMLTree object and set branch lengths
     self.Tree = FastMLTree(self.ModdedTreeFile , False)
     self.Tree.setBranchLengths()
     
     #parse out sequence information
     NodeToSequence_LD = self.getNodeToSequence_LD()
     self.NodeSequenceKey_L = NodeToSequence_LD[0]
     self.NodeToSequence_D = NodeToSequence_LD[1]
     
     #parse out report XML file and create ScopeAlgorithm instances
     self.BranchToAlgorithm_D = self.getBranchToAlgorithm_D()
     
     #collect every PDB ID key named by any branch, once each, in first-seen order
     PDBsPattern = re.compile("<PDBs>(.+?)</PDBs>")
     PDB_L = []
     SeenPDB = set()
     for BranchKey in self.BranchToAlgorithm_D.keys():
         AccKeysSearch = PDBsPattern.search(self.BranchToAlgorithm_D[BranchKey].alignmentSet)
         if AccKeysSearch:
             for AccKey in AccKeysSearch.group(1).split(";"):
                 if AccKey not in SeenPDB:
                     SeenPDB.add(AccKey)
                     PDB_L.append(AccKey)
     
     #set PDB content dictionary and PDBXML content dictionary on self and on every ScopeAlgorithm
     PDBAndPDBXMLContents_Dicts = getAllPDBFileDicts(PDB_L)
     self.PDBContents_D = PDBAndPDBXMLContents_Dicts[0]
     self.PDBXMLContents_D = PDBAndPDBXMLContents_Dicts[1]
     for Key in self.BranchToAlgorithm_D.keys():
         self.setPDBAndPDBXMLContentDictionaries(self.BranchToAlgorithm_D[Key])
     
     #parses out ScoringMatrixCoverage file
     self.ScoringMatrixCoverageKeys_D = self.getScoringMatrixCoverageKeys()
     self.ScoringMatrixPDBXMLMatchedKeys_D = self.getScoringMatrixPDBXMLMatchedKeys_D()
     
     #gets the indices for PDB IDs to be used in SAS and distance random distribution generation
     self.AccsToMutationCount_D = self.getNCoveredMutations_D()
     
     #get list of mass and hydropathy index change values for use in random distributions
     BranchSegmentMutations_L = self.getAllBranchSegmentMutations()
     self.MassChanges_L = BranchSegmentMutations_L[0]
     self.HydroChanges_L = BranchSegmentMutations_L[1]
     
     self.RandomDistributions_D = self.getRandomDistributions_D() #create all random distributions
     self.BranchToPValues_D = self.getAllBranchSegmentPValues() #get all PValues
     
     self.output() #output to PValue file
    def __init__(self, Directory, DerivedoI, PDBoI):
        """
        Reads Report.xml, ModdedTree.nwk and ScoringMatrix.xml from Directory and writes
        TreeAndStates.png, Alignment.png and Structure.pdb into <Directory>/Figures/<DerivedoI>-<PDBoI>/

        Parameters:
        Directory (String): directory holding the input files (a trailing "/" is added when missing)
        DerivedoI (String): derived node of interest
        PDBoI (String): PDB structure of interest

        Class attributes:
        Figures_L (List): list of all the figure types that will be created
        FigureSVG_D (Dict): key is the type of figure, value is the SVG syntax that will draw the figure
        DerivedoInterest (String): Derived node of interest that the figure will be based on
        PDBoInterest (String): PDB structure that the derived node sequence aligned to and achieved a significant hit on
        """

        #initial setup of what figures will be created
        self.Figures_L = [
            "TreeAndStates", "Alignment", "Structurecartoon",
            "Structuresurface"
        ]
        self.FigureSVG_D = {Key: [] for Key in self.Figures_L}

        self.Directory = Directory
        if not self.Directory.endswith("/"):
            self.Directory = self.Directory + "/"

        self.DerivedoInterest = DerivedoI
        self.PDBoInterest = PDBoI

        #single-argument print(...) behaves the same on Python 2 and Python 3
        print(self.Directory)
        print(self.DerivedoInterest)
        print(self.PDBoInterest)

        #output directory where files will be written
        self.OutputDirectory = "%sFigures/%s-%s/" % (
            self.Directory, self.DerivedoInterest, self.PDBoInterest)

        #os.makedirs also creates the intermediate "Figures" directory and, unlike a
        #shell "mkdir" string, is safe for paths containing spaces or shell metacharacters
        if not os.path.exists(self.OutputDirectory):
            os.makedirs(self.OutputDirectory)

        #paths to relevant input files
        self.ReportPATH = self.Directory + "Report.xml"
        self.TreePATH = self.Directory + "ModdedTree.nwk"
        self.MatrixPATH = self.Directory + "ScoringMatrix.xml"

        #parses the report file for sequences and branch relationships
        #(the file is read once and the handle is closed before parsing)
        with open(self.ReportPATH, "r") as ReportFIn:
            ReportText = ReportFIn.read()

        self.NodeToSeq_D = {
            re.search("<H>(.+?)</H>", Seq).group(1):
            re.search("<S>(.+?)</S>", Seq).group(1)
            for Seq in re.findall("<Seq>.+?</Seq>", ReportText)
        }
        self.BranchToAlgorithm_D = {
            re.search("<Branch_name>(.+?)</Branch_name>", Branch).group(1):
            ScopeAlgorithm(Branch)
            for Branch in re.findall("<Branch>.+?</Branch>", ReportText,
                                     re.DOTALL)
        }
        self.RectCount = 0

        #dimensions
        self.TreeFigWIDTH = 750
        self.TreeFigHEIGHT = 500
        self.TreeFigXOffset = 25
        self.TreeFigYOffset = 50

        #loads and parses tree, gets evolutionary distances for proper branch lengths
        self.CogentTree = LoadTree(self.TreePATH)
        self.FastMLTree = FastMLTree(self.TreePATH, False)
        self.FastMLTree.setBranchLengths()
        self.LongestDistance = self.getLongestEvoDistance()
        self.EvoDistance_D = {
            Key: self.getEvoDistance(Key)
            for Key in self.NodeToSeq_D.keys() if Key != self.FastMLTree.TopKey
        }
        self.EvoDistance_D[self.FastMLTree.TopKey] = 0.0
        self.ModdedEvoDistance_D = self.modEvoDistance()
        self.TreeCoords_D = self.setTreeCoords()

        FurthestPosition = 0.0
        FurthestClade = ""

        #gets the leaf whose label reaches furthest to the right (x coordinate plus 12 units per character)
        for Key in self.FastMLTree.LeafKey_L:
            Val = self.TreeCoords_D[Key][0] + (12 * len(Key))
            if Val > FurthestPosition:
                FurthestPosition = Val
                FurthestClade = Key

        self.BranchoInterest = ""

        #the branch of interest is the one ending at the derived node (last match wins)
        for Key in self.FastMLTree.BranchKey_L:
            if Key.split(">>")[1] == self.DerivedoInterest:
                self.BranchoInterest = Key

        #gets all relevant information for the states portion of the figure
        #(keys are converted to zero-based sequence indices)
        self.StateIndices_L = [
            int(X) - 1 for X in self.BranchToAlgorithm_D[self.BranchoInterest].
            getAllMutationXMLKeysAccordingToAccession(self.PDBoInterest)
        ]
        self.LeafStates_D = {
            Key:
            [self.NodeToSeq_D[Key][state] for state in self.StateIndices_L]
            for Key in self.FastMLTree.LeafKey_L
        }
        self.StateColour_D = self.getStateToHex()

        self.StateInc = 25.0

        self.StateFigHEIGHT = 500
        self.StateFigWIDTH = self.StateInc * (len(self.StateIndices_L)) + 50
        self.StateFigXOffset = self.TreeFigXOffset + self.TreeFigWIDTH + (
            12 * len(FurthestClade)) + 25
        self.StateFigYOffset = 50

        #creates the states and tree figure
        self.FigureSVG_D["TreeAndStates"].append(
            self.getSVGHeader(
                self.TreeFigHEIGHT + (self.TreeFigYOffset * 2),
                self.StateFigXOffset + self.StateFigWIDTH +
                self.TreeFigXOffset))
        self.makeTreeFig()
        self.makeStatesFig()
        self.FigureSVG_D["TreeAndStates"].append("</svg>")

        #PNG data is binary, so the file is opened in "wb" mode and always closed
        self.TreeAndStatesFOutPATH = self.OutputDirectory + "TreeAndStates.png"
        with open(self.TreeAndStatesFOutPATH, "wb") as TreeStateFOut:
            cairosvg.svg2png(
                bytestring="\n".join(self.FigureSVG_D["TreeAndStates"]),
                write_to=TreeStateFOut)

        LongestCladeName = ""
        for Key in self.FastMLTree.LeafKey_L:
            if len(Key) > len(LongestCladeName):
                LongestCladeName = Key

        #gets all relevant information for the alignment cartoon portion of the figure
        self.MatrixInfo = self.parseScoringMatrix()

        self.AlnInc = 11.0

        self.AlignmentFigWIDTH = self.AlnInc * len(
            self.MatrixInfo["Sseq"]) + self.AlnInc + (
                8 * len(LongestCladeName))

        self.AlignmentFigHEIGHT = self.AlnInc * (
            len(self.FastMLTree.LeafKey_L) + 1) + self.AlnInc
        self.AlignmentFigXOffset = self.AlnInc
        self.AlignmentFigYOffset = self.AlnInc

        self.FigureSVG_D["Alignment"].append(
            self.getSVGHeader(self.AlignmentFigHEIGHT, self.AlignmentFigWIDTH))
        self.makeAlignmentFig()
        self.FigureSVG_D["Alignment"].append("</svg>")

        self.AlignmentFOutPATH = self.OutputDirectory + "Alignment.png"
        with open(self.AlignmentFOutPATH, "wb") as AlignmentFOut:
            cairosvg.svg2png(
                bytestring="\n".join(self.FigureSVG_D["Alignment"]),
                write_to=AlignmentFOut)

        #relevant information for the structure file in PDB format
        self.ColouredStructureFile = self.getColoredStructureFile()
        self.StructureFOutPATH = self.OutputDirectory + "Structure.pdb"
        with open(self.StructureFOutPATH, "w") as StructureFOut:
            StructureFOut.write(self.ColouredStructureFile.read())

        self.TotalFigWIDTH = 1000
        self.TotalFigHEIGHT = 600

        #SVG that references both PNG files; it is only stored here, not written to disk
        self.TotalElement_L = [
            self.getSVGHeader(self.TotalFigHEIGHT, self.TotalFigWIDTH)
        ]
        self.TotalElement_L.append(
            '''\t<image x="0" y="0" width="1000" height="500" xlink:href="file://%s"/>'''
            % (self.TreeAndStatesFOutPATH))
        self.TotalElement_L.append(
            '''\t<image x="0" y="500" width="1000" height="100" xlink:href="file://%s"/>'''
            % (self.AlignmentFOutPATH))
        self.TotalElement_L.append("</svg>")
class ExplorePrediction:
    """
    Draws the figures that explain one prediction: the tree with the mutated states
    of every leaf, the alignment cartoon against the PDB sequence, and a PDB format
    file coloured at the mutated sites.
    """

    #fill colour used for any character missing from the state colour table
    #(the same colour the table gives to "-" and "X")
    _DefaultStateColour = "FFFFFF"

    def __init__(self, Directory, DerivedoI, PDBoI):
        """
        CONSTRUCTOR

        Reads Report.xml, ModdedTree.nwk and ScoringMatrix.xml from Directory and writes
        TreeAndStates.png, Alignment.png and Structure.pdb into <Directory>/Figures/<DerivedoI>-<PDBoI>/

        Parameters:
        Directory (String): directory holding the input files (a trailing "/" is added when missing)
        DerivedoI (String): derived node of interest
        PDBoI (String): PDB structure of interest

        Class attributes:
        Figures_L (List): list of all the figure types that will be created
        FigureSVG_D (Dict): key is the type of figure, value is the SVG syntax that will draw the figure
        DerivedoInterest (String): Derived node of interest that the figure will be based on
        PDBoInterest (String): PDB structure that the derived node sequence aligned to and achieved a significant hit on
        """

        #initial setup of what figures will be created
        self.Figures_L = [
            "TreeAndStates", "Alignment", "Structurecartoon",
            "Structuresurface"
        ]
        self.FigureSVG_D = {Key: [] for Key in self.Figures_L}

        self.Directory = Directory
        if not self.Directory.endswith("/"):
            self.Directory = self.Directory + "/"

        self.DerivedoInterest = DerivedoI
        self.PDBoInterest = PDBoI

        #single-argument print(...) behaves the same on Python 2 and Python 3
        print(self.Directory)
        print(self.DerivedoInterest)
        print(self.PDBoInterest)

        #output directory where files will be written
        self.OutputDirectory = "%sFigures/%s-%s/" % (
            self.Directory, self.DerivedoInterest, self.PDBoInterest)

        #os.makedirs also creates the intermediate "Figures" directory and, unlike a
        #shell "mkdir" string, is safe for paths containing spaces or shell metacharacters
        if not os.path.exists(self.OutputDirectory):
            os.makedirs(self.OutputDirectory)

        #paths to relevant input files
        self.ReportPATH = self.Directory + "Report.xml"
        self.TreePATH = self.Directory + "ModdedTree.nwk"
        self.MatrixPATH = self.Directory + "ScoringMatrix.xml"

        #parses the report file for sequences and branch relationships
        #(the file is read once and the handle is closed before parsing)
        with open(self.ReportPATH, "r") as ReportFIn:
            ReportText = ReportFIn.read()

        self.NodeToSeq_D = {
            re.search("<H>(.+?)</H>", Seq).group(1):
            re.search("<S>(.+?)</S>", Seq).group(1)
            for Seq in re.findall("<Seq>.+?</Seq>", ReportText)
        }
        self.BranchToAlgorithm_D = {
            re.search("<Branch_name>(.+?)</Branch_name>", Branch).group(1):
            ScopeAlgorithm(Branch)
            for Branch in re.findall("<Branch>.+?</Branch>", ReportText,
                                     re.DOTALL)
        }
        #NOTE(review): RectCount is never incremented in this class, so every
        #rectangle gets the class "r0" - confirm whether that is intended
        self.RectCount = 0

        #dimensions
        self.TreeFigWIDTH = 750
        self.TreeFigHEIGHT = 500
        self.TreeFigXOffset = 25
        self.TreeFigYOffset = 50

        #loads and parses tree, gets evolutionary distances for proper branch lengths
        self.CogentTree = LoadTree(self.TreePATH)
        self.FastMLTree = FastMLTree(self.TreePATH, False)
        self.FastMLTree.setBranchLengths()
        self.LongestDistance = self.getLongestEvoDistance()
        self.EvoDistance_D = {
            Key: self.getEvoDistance(Key)
            for Key in self.NodeToSeq_D.keys() if Key != self.FastMLTree.TopKey
        }
        self.EvoDistance_D[self.FastMLTree.TopKey] = 0.0
        self.ModdedEvoDistance_D = self.modEvoDistance()
        self.TreeCoords_D = self.setTreeCoords()

        FurthestPosition = 0.0
        FurthestClade = ""

        #gets the leaf whose label reaches furthest to the right (x coordinate plus 12 units per character)
        for Key in self.FastMLTree.LeafKey_L:
            Val = self.TreeCoords_D[Key][0] + (12 * len(Key))
            if Val > FurthestPosition:
                FurthestPosition = Val
                FurthestClade = Key

        self.BranchoInterest = ""

        #the branch of interest is the one ending at the derived node (last match wins)
        for Key in self.FastMLTree.BranchKey_L:
            if Key.split(">>")[1] == self.DerivedoInterest:
                self.BranchoInterest = Key

        #gets all relevant information for the states portion of the figure
        #(keys are converted to zero-based sequence indices)
        self.StateIndices_L = [
            int(X) - 1 for X in self.BranchToAlgorithm_D[self.BranchoInterest].
            getAllMutationXMLKeysAccordingToAccession(self.PDBoInterest)
        ]
        self.LeafStates_D = {
            Key:
            [self.NodeToSeq_D[Key][state] for state in self.StateIndices_L]
            for Key in self.FastMLTree.LeafKey_L
        }
        self.StateColour_D = self.getStateToHex()

        self.StateInc = 25.0

        self.StateFigHEIGHT = 500
        self.StateFigWIDTH = self.StateInc * (len(self.StateIndices_L)) + 50
        self.StateFigXOffset = self.TreeFigXOffset + self.TreeFigWIDTH + (
            12 * len(FurthestClade)) + 25
        self.StateFigYOffset = 50

        #creates the states and tree figure
        self.FigureSVG_D["TreeAndStates"].append(
            self.getSVGHeader(
                self.TreeFigHEIGHT + (self.TreeFigYOffset * 2),
                self.StateFigXOffset + self.StateFigWIDTH +
                self.TreeFigXOffset))
        self.makeTreeFig()
        self.makeStatesFig()
        self.FigureSVG_D["TreeAndStates"].append("</svg>")

        self.TreeAndStatesFOutPATH = self.OutputDirectory + "TreeAndStates.png"
        self._writePNG("TreeAndStates", self.TreeAndStatesFOutPATH)

        LongestCladeName = ""
        for Key in self.FastMLTree.LeafKey_L:
            if len(Key) > len(LongestCladeName):
                LongestCladeName = Key

        #gets all relevant information for the alignment cartoon portion of the figure
        self.MatrixInfo = self.parseScoringMatrix()

        self.AlnInc = 11.0

        self.AlignmentFigWIDTH = self.AlnInc * len(
            self.MatrixInfo["Sseq"]) + self.AlnInc + (
                8 * len(LongestCladeName))

        self.AlignmentFigHEIGHT = self.AlnInc * (
            len(self.FastMLTree.LeafKey_L) + 1) + self.AlnInc
        self.AlignmentFigXOffset = self.AlnInc
        self.AlignmentFigYOffset = self.AlnInc

        self.FigureSVG_D["Alignment"].append(
            self.getSVGHeader(self.AlignmentFigHEIGHT, self.AlignmentFigWIDTH))
        self.makeAlignmentFig()
        self.FigureSVG_D["Alignment"].append("</svg>")

        self.AlignmentFOutPATH = self.OutputDirectory + "Alignment.png"
        self._writePNG("Alignment", self.AlignmentFOutPATH)

        #relevant information for the structure file in PDB format
        self.ColouredStructureFile = self.getColoredStructureFile()
        self.StructureFOutPATH = self.OutputDirectory + "Structure.pdb"
        with open(self.StructureFOutPATH, "w") as StructureFOut:
            StructureFOut.write(self.ColouredStructureFile.read())

        self.TotalFigWIDTH = 1000
        self.TotalFigHEIGHT = 600

        #SVG that references both PNG files; it is only stored here, not written to disk
        self.TotalElement_L = [
            self.getSVGHeader(self.TotalFigHEIGHT, self.TotalFigWIDTH)
        ]
        self.TotalElement_L.append(
            '''\t<image x="0" y="0" width="1000" height="500" xlink:href="file://%s"/>'''
            % (self.TreeAndStatesFOutPATH))
        self.TotalElement_L.append(
            '''\t<image x="0" y="500" width="1000" height="100" xlink:href="file://%s"/>'''
            % (self.AlignmentFOutPATH))
        self.TotalElement_L.append("</svg>")

    def _writePNG(self, FigureKey, FOutPATH):
        """renders the SVG lines stored under FigureKey into a PNG file at FOutPATH"""
        #PNG data is binary, so the file is opened in "wb" mode and always closed
        with open(FOutPATH, "wb") as FOut:
            cairosvg.svg2png(
                bytestring="\n".join(self.FigureSVG_D[FigureKey]),
                write_to=FOut)

    def _getStateColour(self, State):
        """returns the fill colour for a state, falling back to white for characters not in the colour table"""
        return self.StateColour_D.get(State, self._DefaultStateColour)

    def _safeRatio(self, Numerator, Denominator):
        """returns Numerator / Denominator as a float, or 0.0 when the denominator is zero"""
        if not Denominator:
            return 0.0
        return float(Numerator) / float(Denominator)

    def getSVGHeader(self, FrameHEIGHT, FrameWIDTH):
        """
        gets the header for any SVG format file

        Note the argument order is (height, width) while the header lists width first.
        """
        return """<?xml version="1.0" standalone="no"?>

<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">

<svg xmlns:xlink="http://www.w3.org/1999/xlink" xmlns='http://www.w3.org/2000/svg' version='1.1'
    width='%s' height='%s'>
""" % (str(FrameWIDTH), str(FrameHEIGHT))

    def getStateToHex(self):
        """Dictionary where the key is the amino acid character and the value is the background colour"""
        return {"A":"80B3E6","C":"E68080","D":"CC4DCC","E":"CC4DCC","F":"80B3E6",\
                "G":"E6994D","H":"1AB3B3","I":"80B3E6","K":"E6331A","L":"80B3E6",\
                "M":"80B3E6","N":"1ACC1A","P":"CCCC00","Q":"1ACC1A","R":"E6331A",\
                "S":"1ACC1A","T":"1ACC1A","V":"80B3E6","W":"80B3E6","Y":"1AB3B3",\
                "-":"FFFFFF","X":"FFFFFF"}

    def getEvoDistance(self, startingToNodeKey):
        """
        returns the total evolutionary distance from the origin to the node of interest

        Walks up the tree from startingToNodeKey, adding the branch length stored for
        every node visited, until the top key of the tree is reached.

        Raises:
        KeyError: a visited node has no entry in BranchLength_D
        ValueError: a node other than the top key has no branch leading to it
        """
        distance = 0.0
        ToNodeKey = startingToNodeKey

        while True:
            distance += self.FastMLTree.BranchLength_D[ToNodeKey]

            #literal suffix comparison: node names may contain regex metacharacters
            #such as "." which must not be treated as wildcards
            Suffix = ">>" + ToNodeKey
            ParentKey = None
            for BranchKey in self.FastMLTree.BranchKey_L:
                if BranchKey.endswith(Suffix):
                    ParentKey = BranchKey.split(">>")[0]
                    break

            if ParentKey is not None:
                ToNodeKey = ParentKey

            if ToNodeKey == self.FastMLTree.TopKey:
                return distance

            if ParentKey is None:
                #without this check the walk would never terminate
                raise ValueError(
                    "No branch leads to node '%s'; cannot reach the top of the tree"
                    % ToNodeKey)

    def getLongestEvoDistance(self):
        """gets the longest evolutionary distance from the origin to any leaf (0.0 when there are no leaves)"""
        longestDistance = 0.0

        for LeafKey in self.FastMLTree.LeafKey_L:
            longestDistance = max(longestDistance, self.getEvoDistance(LeafKey))

        return longestDistance

    def modEvoDistance(self):
        """
        modifies evolutionary distance into a different format

        Currently every distance is passed through unchanged, so this returns a
        copy of EvoDistance_D.
        """
        return dict(self.EvoDistance_D)

    def setTreeCoords(self):
        """
        sets tree node coordinates (horizontal and vertical) for the SVG image

        The vertical position of a node is the row of the ascii art tree its name
        appears on; the horizontal position is its evolutionary distance relative
        to the longest distance. Returns a dictionary of node name to [x, y].
        """
        NamePattern = re.compile(r"[a-zA-Z0-9_\.@]+")
        MaxVert = 0
        VertCoord_D = {}

        for i, Line in enumerate(self.CogentTree.asciiArt().split("\n")):
            for Name in NamePattern.findall(Line):
                VertCoord_D[Name] = i
                MaxVert = i

        TreeCoords_D = {}
        for Key in self.NodeToSeq_D.keys():
            #_safeRatio keeps a degenerate tree (zero longest distance or a single
            #art row) from raising ZeroDivisionError
            XFraction = self._safeRatio(self.ModdedEvoDistance_D[Key],
                                        self.LongestDistance)
            YFraction = self._safeRatio(VertCoord_D[Key], MaxVert)
            TreeCoords_D[Key] = [
                XFraction * self.TreeFigWIDTH + self.TreeFigXOffset,
                YFraction * self.TreeFigHEIGHT + self.TreeFigYOffset
            ]
        return TreeCoords_D

    def addNodeNamesAtNodePoints(self):
        """adds the leaf names at each leaf vertex"""
        for Key in self.FastMLTree.LeafKey_L:
            xStart, yStart = [str(Coord) for Coord in self.TreeCoords_D[Key][:2]]
            self.FigureSVG_D["TreeAndStates"].append(
                '''\t<text x='%s' y='%s' text-anchor='left' font-size='20' font-family='Courier' style="fill: #000000;"  >%s</text>'''
                % (xStart, yStart, Key))

    def _appendBranchLine(self, branchKey, x1, y1, x2, y2):
        """appends one tree line; the branch of interest is drawn in red, every other branch in black"""
        if branchKey == self.BranchoInterest:
            rgb = "255,0,0"
        else:
            rgb = "0,0,0"
        self.FigureSVG_D["TreeAndStates"].append(
            '''\t<line class='axis' x1='%s' y1='%s' x2='%s' y2='%s' style="stroke:rgb(%s);stroke-width:1 " />'''
            % (str(x1), str(y1), str(x2), str(y2), rgb))

    def addVerticalLines(self):
        """adds the vertical lines of the tree image (at the parent's x, from the parent's y to the child's y)"""
        for branchKey in self.FastMLTree.BranchKey_L:
            fro, to = branchKey.split(">>")[:2]
            froXY = self.TreeCoords_D[fro]
            toXY = self.TreeCoords_D[to]
            self._appendBranchLine(branchKey, froXY[0], froXY[1], froXY[0],
                                   toXY[1])

    def addHorizontalLines(self):
        """adds the horizontal lines of the tree image (at the child's y, from the parent's x to the child's x)"""
        for branchKey in self.FastMLTree.BranchKey_L:
            fro, to = branchKey.split(">>")[:2]
            froXY = self.TreeCoords_D[fro]
            toXY = self.TreeCoords_D[to]
            self._appendBranchLine(branchKey, froXY[0], toXY[1], toXY[0],
                                   toXY[1])

    def makeTreeFig(self):
        """does all methods necessary to make the tree image"""
        self.addNodeNamesAtNodePoints()
        self.addVerticalLines()
        self.addHorizontalLines()

    def addStateRows(self):
        """
        adds the rows for the mutated states in each sequence

        First writes the rotated column labels (one-based positions), then one
        coloured rectangle plus its state character per leaf and position.
        """
        inc = self.StateInc
        vertInc = float(self.StateFigHEIGHT / float(len(self.LeafStates_D)))

        #smallest y coordinate of any node; the labels are placed above it
        lowestY = min([float("inf")] +
                      [XY[1] for XY in self.TreeCoords_D.values()])

        stateY = lowestY - (1.5 * vertInc)

        stateX = 0.0 + self.StateFigXOffset
        for i in self.StateIndices_L:

            self.FigureSVG_D["TreeAndStates"].append(
                '''\t<text x='%s' y='%s' text-anchor='middle' font-size='16' font-family='Courier' transform="rotate(90, %s, %s)" style="fill: #000000;"  >%s</text>'''
                % (str(stateX), str(stateY), str(stateX), str(stateY),
                   str(i + 1)))

            stateX += inc

        for Key in self.LeafStates_D.keys():
            X = 0.0 + self.StateFigXOffset
            Y = self.TreeCoords_D[Key][1]

            for State in self.LeafStates_D[Key]:
                RectX = X - (float(inc / 2.0))
                RectY = Y - (float(vertInc / 2.0)) - 5.0

                self.FigureSVG_D["TreeAndStates"].append(
                    '''\t<rect class='r%s' x='%s' y='%s' width='%s' height='%s' style="fill:#%s" />'''
                    % (str(self.RectCount), str(RectX), str(RectY), str(inc),
                       str(vertInc), self._getStateColour(State)))
                self.FigureSVG_D["TreeAndStates"].append(
                    '''\t<text x='%s' y='%s' font-size='20' font-family='Courier' text-anchor='middle' style="fill: #000000;"  >%s</text>'''
                    % (str(X), str(Y), State))

                X += inc

    def makeStatesFig(self):
        """executes the method to make the states figure"""
        self.addStateRows()

    def parseScoringMatrix(self):
        """
        parses the scoring matrix for alignment to the PDB sequence information

        Uses the first <PDB_alignment> whose id (the part of <PDB_id> before "|")
        equals the upper-cased PDB of interest, and stores the lower-cased chain
        (the part after "|") in ChainoInterest.

        Returns a dictionary with the zero-based "Qstart", "Qend", "Sstart" and
        "Send" positions and the aligned subject sequence "Sseq".

        Raises:
        ValueError: no alignment for the PDB of interest is present in the file
        """
        with open(self.MatrixPATH, "r") as MatrixFIn:
            allAlignments_L = re.findall("<PDB_alignment>.+?</PDB_alignment>",
                                         MatrixFIn.read(), re.DOTALL)
        KeyAln = None

        for Alignment in allAlignments_L:
            IDFields_L = re.search("<PDB_id>(.+?)</PDB_id>",
                                   Alignment).group(1).split("|")
            if self.PDBoInterest.upper() == IDFields_L[0]:
                KeyAln = Alignment
                self.ChainoInterest = IDFields_L[1].lower()
                break

        if KeyAln is None:
            raise ValueError("No <PDB_alignment> for PDB '%s' found in %s" %
                             (self.PDBoInterest, self.MatrixPATH))

        def getField(Tag):
            #text held by <Tag>...</Tag> inside the selected alignment
            return re.search("<%s>(.+?)</%s>" % (Tag, Tag), KeyAln).group(1)

        return {"Qstart" : int(getField("Alignment_start_query")) - 1,
                "Qend" : int(getField("Alignment_end_query")) - 1,
                "Sstart" : int(getField("Alignment_start_subject")) - 1,
                "Send" : int(getField("Alignment_end_subject")) - 1,
                "Sseq" : getField("Aligned_subject_sequence")}

    def makeAlignmentFig(self):
        """
        makes the cartoon of all aligned sequences in the protein family

        The first row is the aligned PDB sequence; every following row is the
        matching slice of one leaf sequence. Each row ends with its header.
        """
        QStart = self.MatrixInfo["Qstart"]
        SeqLength = len(self.MatrixInfo["Sseq"])

        AllSeqs_L = [self.MatrixInfo["Sseq"]] + [
            self.NodeToSeq_D[Key][QStart:QStart + SeqLength]
            for Key in self.FastMLTree.LeafKey_L
        ]
        AllHeaders_L = [self.PDBoInterest] + self.FastMLTree.LeafKey_L

        xinc = self.AlnInc
        yinc = self.AlnInc

        Y = self.AlignmentFigYOffset

        for Seq, Header in zip(AllSeqs_L, AllHeaders_L):

            X = 0.0 + self.AlignmentFigXOffset

            for State in Seq:

                RectX = X - (float(xinc / 2.0))
                RectY = Y - (float(yinc / 2.0)) - 5.0

                self.FigureSVG_D["Alignment"].append(
                    '''\t<rect class='r%s' x='%s' y='%s' width='%s' height='%s' style="fill:#%s" />'''
                    % (str(self.RectCount), str(RectX), str(RectY), str(xinc),
                       str(yinc), self._getStateColour(State)))
                self.FigureSVG_D["Alignment"].append(
                    '''\t<text x='%s' y='%s' text-anchor='middle' font-size='10' font-family='Courier' style="fill: #000000;"  >%s</text>'''
                    % (str(X), str(Y), State))

                X += xinc

            self.FigureSVG_D["Alignment"].append(
                '''\t<text x='%s' y='%s' text-anchor='left' font-size='10' font-family='Courier' style="fill: #000000;"  >%s</text>'''
                % (str(X + self.AlnInc), str(Y), Header))

            Y += yinc

    def getColoredStructureFile(self):
        """
        gets a PDB format file with the temperature factors coloured to reflect mutated sites

        Returns the temporary file handle that createPDBColoredFile was asked to write to.
        """
        DesiredBranchKey = ""
        #the last branch ending at the derived node of interest wins
        for BranchKey in self.FastMLTree.BranchKey_L:
            if BranchKey.split(">>")[1] == self.DerivedoInterest:
                DesiredBranchKey = BranchKey

        PDBAndPDBXMLContents = getAllPDBFileDicts([self.PDBoInterest])
        SA = self.BranchToAlgorithm_D[DesiredBranchKey]
        SA.PDBContents_D = PDBAndPDBXMLContents[0]
        SA.PDBXMLContents_D = PDBAndPDBXMLContents[1]

        FH = getOutputTempFile()
        SA.createPDBColoredFile(self.PDBoInterest, FH.name)

        return FH
Esempio n. 5
0
    def __init__(self, Directory, DerivedoI, PDBoI):
        """
        Reads Report.xml, ModdedTree.nwk and ScoringMatrix.xml from Directory and writes
        TreeAndStates.png, Alignment.png and Structure.pdb into <Directory>/Figures/<DerivedoI>-<PDBoI>/

        Parameters:
        Directory (String): directory holding the input files (a trailing "/" is added when missing)
        DerivedoI (String): derived node of interest
        PDBoI (String): PDB structure of interest

        Class attributes:
        Figures_L (List): list of all the figure types that will be created
        FigureSVG_D (Dict): key is the type of figure, value is the SVG syntax that will draw the figure
        DerivedoInterest (String): Derived node of interest that the figure will be based on
        PDBoInterest (String): PDB structure that the derived node sequence aligned to and achieved a significant hit on
        """

        #initial setup of what figures will be created
        self.Figures_L = [
            "TreeAndStates", "Alignment", "Structurecartoon",
            "Structuresurface"
        ]
        self.FigureSVG_D = {Key: [] for Key in self.Figures_L}

        self.Directory = Directory
        if not self.Directory.endswith("/"):
            self.Directory = self.Directory + "/"

        self.DerivedoInterest = DerivedoI
        self.PDBoInterest = PDBoI

        #single-argument print(...) behaves the same on Python 2 and Python 3
        print(self.Directory)
        print(self.DerivedoInterest)
        print(self.PDBoInterest)

        #output directory where files will be written
        self.OutputDirectory = "%sFigures/%s-%s/" % (
            self.Directory, self.DerivedoInterest, self.PDBoInterest)

        #os.makedirs also creates the intermediate "Figures" directory and, unlike a
        #shell "mkdir" string, is safe for paths containing spaces or shell metacharacters
        if not os.path.exists(self.OutputDirectory):
            os.makedirs(self.OutputDirectory)

        #paths to relevant input files
        self.ReportPATH = self.Directory + "Report.xml"
        self.TreePATH = self.Directory + "ModdedTree.nwk"
        self.MatrixPATH = self.Directory + "ScoringMatrix.xml"

        #parses the report file for sequences and branch relationships
        #(the file is read once and the handle is closed before parsing)
        with open(self.ReportPATH, "r") as ReportFIn:
            ReportText = ReportFIn.read()

        self.NodeToSeq_D = {
            re.search("<H>(.+?)</H>", Seq).group(1):
            re.search("<S>(.+?)</S>", Seq).group(1)
            for Seq in re.findall("<Seq>.+?</Seq>", ReportText)
        }
        self.BranchToAlgorithm_D = {
            re.search("<Branch_name>(.+?)</Branch_name>", Branch).group(1):
            ScopeAlgorithm(Branch)
            for Branch in re.findall("<Branch>.+?</Branch>", ReportText,
                                     re.DOTALL)
        }
        self.RectCount = 0

        #dimensions
        self.TreeFigWIDTH = 750
        self.TreeFigHEIGHT = 500
        self.TreeFigXOffset = 25
        self.TreeFigYOffset = 50

        #loads and parses tree, gets evolutionary distances for proper branch lengths
        self.CogentTree = LoadTree(self.TreePATH)
        self.FastMLTree = FastMLTree(self.TreePATH, False)
        self.FastMLTree.setBranchLengths()
        self.LongestDistance = self.getLongestEvoDistance()
        self.EvoDistance_D = {
            Key: self.getEvoDistance(Key)
            for Key in self.NodeToSeq_D.keys() if Key != self.FastMLTree.TopKey
        }
        self.EvoDistance_D[self.FastMLTree.TopKey] = 0.0
        self.ModdedEvoDistance_D = self.modEvoDistance()
        self.TreeCoords_D = self.setTreeCoords()

        FurthestPosition = 0.0
        FurthestClade = ""

        #gets the leaf whose label reaches furthest to the right (x coordinate plus 12 units per character)
        for Key in self.FastMLTree.LeafKey_L:
            Val = self.TreeCoords_D[Key][0] + (12 * len(Key))
            if Val > FurthestPosition:
                FurthestPosition = Val
                FurthestClade = Key

        self.BranchoInterest = ""

        #the branch of interest is the one ending at the derived node (last match wins)
        for Key in self.FastMLTree.BranchKey_L:
            if Key.split(">>")[1] == self.DerivedoInterest:
                self.BranchoInterest = Key

        #gets all relevant information for the states portion of the figure
        #(keys are converted to zero-based sequence indices)
        self.StateIndices_L = [
            int(X) - 1 for X in self.BranchToAlgorithm_D[self.BranchoInterest].
            getAllMutationXMLKeysAccordingToAccession(self.PDBoInterest)
        ]
        self.LeafStates_D = {
            Key:
            [self.NodeToSeq_D[Key][state] for state in self.StateIndices_L]
            for Key in self.FastMLTree.LeafKey_L
        }
        self.StateColour_D = self.getStateToHex()

        self.StateInc = 25.0

        self.StateFigHEIGHT = 500
        self.StateFigWIDTH = self.StateInc * (len(self.StateIndices_L)) + 50
        self.StateFigXOffset = self.TreeFigXOffset + self.TreeFigWIDTH + (
            12 * len(FurthestClade)) + 25
        self.StateFigYOffset = 50

        #creates the states and tree figure
        self.FigureSVG_D["TreeAndStates"].append(
            self.getSVGHeader(
                self.TreeFigHEIGHT + (self.TreeFigYOffset * 2),
                self.StateFigXOffset + self.StateFigWIDTH +
                self.TreeFigXOffset))
        self.makeTreeFig()
        self.makeStatesFig()
        self.FigureSVG_D["TreeAndStates"].append("</svg>")

        #PNG data is binary, so the file is opened in "wb" mode and always closed
        self.TreeAndStatesFOutPATH = self.OutputDirectory + "TreeAndStates.png"
        with open(self.TreeAndStatesFOutPATH, "wb") as TreeStateFOut:
            cairosvg.svg2png(bytestring="\n".join(
                self.FigureSVG_D["TreeAndStates"]),
                             write_to=TreeStateFOut)

        LongestCladeName = ""
        for Key in self.FastMLTree.LeafKey_L:
            if len(Key) > len(LongestCladeName):
                LongestCladeName = Key

        #gets all relevant information for the alignment cartoon portion of the figure
        self.MatrixInfo = self.parseScoringMatrix()

        self.AlnInc = 11.0

        self.AlignmentFigWIDTH = self.AlnInc * len(
            self.MatrixInfo["Sseq"]) + self.AlnInc + (8 *
                                                      len(LongestCladeName))

        self.AlignmentFigHEIGHT = self.AlnInc * (
            len(self.FastMLTree.LeafKey_L) + 1) + self.AlnInc
        self.AlignmentFigXOffset = self.AlnInc
        self.AlignmentFigYOffset = self.AlnInc

        self.FigureSVG_D["Alignment"].append(
            self.getSVGHeader(self.AlignmentFigHEIGHT, self.AlignmentFigWIDTH))
        self.makeAlignmentFig()
        self.FigureSVG_D["Alignment"].append("</svg>")

        self.AlignmentFOutPATH = self.OutputDirectory + "Alignment.png"
        with open(self.AlignmentFOutPATH, "wb") as AlignmentFOut:
            cairosvg.svg2png(bytestring="\n".join(self.FigureSVG_D["Alignment"]),
                             write_to=AlignmentFOut)

        #relevant information for the structure file in PDB format
        self.ColouredStructureFile = self.getColoredStructureFile()
        self.StructureFOutPATH = self.OutputDirectory + "Structure.pdb"
        with open(self.StructureFOutPATH, "w") as StructureFOut:
            StructureFOut.write(self.ColouredStructureFile.read())

        self.TotalFigWIDTH = 1000
        self.TotalFigHEIGHT = 600

        #SVG that references both PNG files; it is only stored here, not written to disk
        self.TotalElement_L = [
            self.getSVGHeader(self.TotalFigHEIGHT, self.TotalFigWIDTH)
        ]
        self.TotalElement_L.append(
            '''\t<image x="0" y="0" width="1000" height="500" xlink:href="file://%s"/>'''
            % (self.TreeAndStatesFOutPATH))
        self.TotalElement_L.append(
            '''\t<image x="0" y="500" width="1000" height="100" xlink:href="file://%s"/>'''
            % (self.AlignmentFOutPATH))
        self.TotalElement_L.append("</svg>")
Esempio n. 6
0
class ExplorePrediction:
    """Draws the figures for one derived node / PDB structure pair.

    Constructing an instance reads Report.xml, ModdedTree.nwk and
    ScoringMatrix.xml from the project directory and writes
    TreeAndStates.png, Alignment.png and Structure.pdb into
    <Directory>Figures/<DerivedoI>-<PDBoI>/.
    """

    #background colour (hex, no '#') for sequence symbols that are missing from
    #the table returned by getStateToHex; same white as "-" and "X"
    _UnknownStateColour = "FFFFFF"

    def __init__(self, Directory, DerivedoI, PDBoI):
        """
        Builds every figure and writes the output files.

        Args:
        Directory (String): project directory holding Report.xml, ModdedTree.nwk and ScoringMatrix.xml
        DerivedoI (String): name of the derived node of interest
        PDBoI (String): PDB id of interest

        Class attributes:
        Figures_L (List): list of all the figure types that will be created
        FiguresSVG_D (Dict): key is the type of figure, value is the SVG syntax that will draw the figure
        DerivedoInterest (String): Derived node of interest that the figure will be based on
        PDBoInterest (String): PDB structure that the derived node sequence aligned to and achieved a significant hit on
        """

        #initial setup of what figures will be created
        self.Figures_L = [
            "TreeAndStates", "Alignment", "Structurecartoon",
            "Structuresurface"
        ]
        self.FigureSVG_D = {Key: [] for Key in self.Figures_L}

        self.Directory = Directory
        if not self.Directory.endswith("/"):
            self.Directory = self.Directory + "/"

        self.DerivedoInterest = DerivedoI
        self.PDBoInterest = PDBoI

        #single-argument print(...) gives the same output on Python 2 and 3
        print(self.Directory)
        print(self.DerivedoInterest)
        print(self.PDBoInterest)

        #output directory where files will be written
        self.OutputDirectory = "%sFigures/%s-%s/" % (
            self.Directory, self.DerivedoInterest, self.PDBoInterest)

        #os.makedirs also creates the intermediate "Figures/" directory and,
        #unlike a shell "mkdir <path>" string, is safe for paths with spaces
        if not os.path.exists(self.OutputDirectory):
            os.makedirs(self.OutputDirectory)

        #paths to relevant input files
        self.ReportPATH = self.Directory + "Report.xml"
        self.TreePATH = self.Directory + "ModdedTree.nwk"
        self.MatrixPATH = self.Directory + "ScoringMatrix.xml"

        #parses the report file for sequences and branch relationships
        #(the file is read once and the handle is closed straight away)
        with open(self.ReportPATH, "r") as ReportFH:
            ReportText = ReportFH.read()

        self.NodeToSeq_D = {
            re.search("<H>(.+?)</H>", Seq).group(1):
            re.search("<S>(.+?)</S>", Seq).group(1)
            for Seq in re.findall("<Seq>.+?</Seq>", ReportText)
        }
        self.BranchToAlgorithm_D = {
            re.search("<Branch_name>(.+?)</Branch_name>",
                      Branch).group(1): ScopeAlgorithm(Branch)
            for Branch in re.findall("<Branch>.+?</Branch>", ReportText,
                                     re.DOTALL)
        }
        self.RectCount = 0

        #dimensions
        self.TreeFigWIDTH = 750
        self.TreeFigHEIGHT = 500
        self.TreeFigXOffset = 25
        self.TreeFigYOffset = 50

        #loads and parses tree, gets evolutionary distances for proper branch lengths
        self.CogentTree = LoadTree(self.TreePATH)
        self.FastMLTree = FastMLTree(self.TreePATH, False)
        self.FastMLTree.setBranchLengths()
        self.LongestDistance = self.getLongestEvoDistance()
        self.EvoDistance_D = {
            Key: self.getEvoDistance(Key)
            for Key in self.NodeToSeq_D.keys() if Key != self.FastMLTree.TopKey
        }
        self.EvoDistance_D[self.FastMLTree.TopKey] = 0.0
        self.ModdedEvoDistance_D = self.modEvoDistance()
        self.TreeCoords_D = self.setTreeCoords()

        #finds the leaf whose label (12 px per character) reaches furthest right
        FurthestPosition = 0.0
        FurthestClade = ""
        for Key in self.FastMLTree.LeafKey_L:
            Val = self.TreeCoords_D[Key][0] + (12 * len(Key))
            if Val > FurthestPosition:
                FurthestPosition = Val
                FurthestClade = Key

        self.BranchoInterest = self._getBranchKeyForDerivedNode()

        #gets all relevant information for the states portion of the figure
        #(the keys are converted to 0-based indices into the sequences)
        self.StateIndices_L = [
            int(X) - 1 for X in self.BranchToAlgorithm_D[self.BranchoInterest].
            getAllMutationXMLKeysAccordingToAccession(self.PDBoInterest)
        ]
        self.LeafStates_D = {
            Key:
            [self.NodeToSeq_D[Key][state] for state in self.StateIndices_L]
            for Key in self.FastMLTree.LeafKey_L
        }
        self.StateColour_D = self.getStateToHex()

        self.StateInc = 25.0

        self.StateFigHEIGHT = 500
        self.StateFigWIDTH = self.StateInc * (len(self.StateIndices_L)) + 50
        self.StateFigXOffset = self.TreeFigXOffset + self.TreeFigWIDTH + (
            12 * len(FurthestClade)) + 25
        self.StateFigYOffset = 50

        #creates the states and tree figure
        self.FigureSVG_D["TreeAndStates"].append(
            self.getSVGHeader(
                self.TreeFigHEIGHT + (self.TreeFigYOffset * 2),
                self.StateFigXOffset + self.StateFigWIDTH +
                self.TreeFigXOffset))
        self.makeTreeFig()
        self.makeStatesFig()
        self.FigureSVG_D["TreeAndStates"].append("</svg>")

        self.TreeAndStatesFOutPATH = self.OutputDirectory + "TreeAndStates.png"
        self._writeFigurePNG("TreeAndStates", self.TreeAndStatesFOutPATH)

        LongestCladeName = ""
        for Key in self.FastMLTree.LeafKey_L:
            if len(Key) > len(LongestCladeName):
                LongestCladeName = Key

        #gets all relevant information for the alignment cartoon portion of the figure
        self.MatrixInfo = self.parseScoringMatrix()

        self.AlnInc = 11.0

        self.AlignmentFigWIDTH = self.AlnInc * len(
            self.MatrixInfo["Sseq"]) + self.AlnInc + (8 *
                                                      len(LongestCladeName))

        self.AlignmentFigHEIGHT = self.AlnInc * (
            len(self.FastMLTree.LeafKey_L) + 1) + self.AlnInc
        self.AlignmentFigXOffset = self.AlnInc
        self.AlignmentFigYOffset = self.AlnInc

        self.FigureSVG_D["Alignment"].append(
            self.getSVGHeader(self.AlignmentFigHEIGHT, self.AlignmentFigWIDTH))
        self.makeAlignmentFig()
        self.FigureSVG_D["Alignment"].append("</svg>")

        self.AlignmentFOutPATH = self.OutputDirectory + "Alignment.png"
        self._writeFigurePNG("Alignment", self.AlignmentFOutPATH)

        #relevant information for the structure file in PDB format
        self.ColouredStructureFile = self.getColoredStructureFile()
        self.StructureFOutPATH = self.OutputDirectory + "Structure.pdb"
        with open(self.StructureFOutPATH, "w") as StructureFOut:
            StructureFOut.write(self.ColouredStructureFile.read())

        self.TotalFigWIDTH = 1000
        self.TotalFigHEIGHT = 600

        self.TotalElement_L = [
            self.getSVGHeader(self.TotalFigHEIGHT, self.TotalFigWIDTH)
        ]
        self.TotalElement_L.append(
            '''\t<image x="0" y="0" width="1000" height="500" xlink:href="file://%s"/>'''
            % (self.TreeAndStatesFOutPATH))
        self.TotalElement_L.append(
            '''\t<image x="0" y="500" width="1000" height="100" xlink:href="file://%s"/>'''
            % (self.AlignmentFOutPATH))
        self.TotalElement_L.append("</svg>")

    def _getBranchKeyForDerivedNode(self):
        """
        Returns the branch key ("parent>>child") whose child is the derived
        node of interest, or "" when no branch matches. When several
        branches match, the last one in BranchKey_L is returned.
        """
        FoundKey = ""
        for Key in self.FastMLTree.BranchKey_L:
            if Key.split(">>")[1] == self.DerivedoInterest:
                FoundKey = Key
        return FoundKey

    def _writeFigurePNG(self, FigureName, FOutPATH):
        """
        Renders the SVG lines collected for FigureName to a PNG at FOutPATH.
        """
        #PNG data is binary, so the file has to be opened in "wb" mode
        with open(FOutPATH, "wb") as FOut:
            cairosvg.svg2png(bytestring="\n".join(
                self.FigureSVG_D[FigureName]),
                             write_to=FOut)

    def _getStateRect(self, X, Y, Width, Height, State):
        """
        Returns the SVG <rect> drawn behind one sequence symbol centred at
        (X, Y); symbols missing from StateColour_D are drawn on white.
        """
        RectX = X - (float(Width / 2.0))
        #the extra 5.0 shifts the box up relative to the text position
        RectY = Y - (float(Height / 2.0)) - 5.0
        Colour = self.StateColour_D.get(State, self._UnknownStateColour)

        return '''\t<rect class='r%s' x='%s' y='%s' width='%s' height='%s' style="fill:#%s" />''' % (
            str(self.RectCount), str(RectX), str(RectY), str(Width),
            str(Height), Colour)

    def _addBranchLine(self, branchKey, x1, y1, x2, y2):
        """
        Appends one tree line to the TreeAndStates figure; the branch of
        interest is drawn in red, every other branch in black.
        """
        if branchKey == self.BranchoInterest:
            rgb = "255,0,0"
        else:
            rgb = "0,0,0"

        self.FigureSVG_D["TreeAndStates"].append(
            '''\t<line class='axis' x1='%s' y1='%s' x2='%s' y2='%s' style="stroke:rgb(%s);stroke-width:1 " />'''
            % (str(x1), str(y1), str(x2), str(y2), rgb))

    def getSVGHeader(self, FrameHEIGHT, FrameWIDTH):
        """
        Gets the header for any SVG format file.

        Note the argument order: height first, then width.
        """
        return """<?xml version="1.0" standalone="no"?>

<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">

<svg xmlns:xlink="http://www.w3.org/1999/xlink" xmlns='http://www.w3.org/2000/svg' version='1.1'
    width='%s' height='%s'>
""" % (str(FrameWIDTH), str(FrameHEIGHT))

    def getStateToHex(self):
        """
        Dictionary where the key is the amino acid character and the value
        is the background colour (hex string without the leading '#').
        """
        return {
            "A": "80B3E6", "C": "E68080", "D": "CC4DCC", "E": "CC4DCC",
            "F": "80B3E6", "G": "E6994D", "H": "1AB3B3", "I": "80B3E6",
            "K": "E6331A", "L": "80B3E6", "M": "80B3E6", "N": "1ACC1A",
            "P": "CCCC00", "Q": "1ACC1A", "R": "E6331A", "S": "1ACC1A",
            "T": "1ACC1A", "V": "80B3E6", "W": "80B3E6", "Y": "1AB3B3",
            "-": "FFFFFF", "X": "FFFFFF",
        }

    def getEvoDistance(self, startingToNodeKey):
        """
        Returns the total evolutionary distance from the origin to the node
        of interest, i.e. the sum of the branch lengths met while walking
        from startingToNodeKey up to FastMLTree.TopKey.

        Raises:
        ValueError: a node other than the top node has no parent branch
        """
        distance = 0.0
        ToNodeKey = startingToNodeKey

        while True:
            distance += self.FastMLTree.BranchLength_D[ToNodeKey]

            #exact suffix match: node names may hold regex metacharacters
            #such as ".", so the name must not be used as a pattern
            Suffix = ">>" + ToNodeKey
            ParentKey = None
            for BranchKey in self.FastMLTree.BranchKey_L:
                if BranchKey.endswith(Suffix):
                    ParentKey = BranchKey.split(">>")[0]
                    break

            if ParentKey is None:
                if ToNodeKey == self.FastMLTree.TopKey:
                    break
                #without this check the walk would never terminate
                raise ValueError(
                    "no branch leads to node '%s'; cannot reach the top node"
                    % ToNodeKey)

            ToNodeKey = ParentKey
            if ToNodeKey == self.FastMLTree.TopKey:
                break

        return distance

    def getLongestEvoDistance(self):
        """
        Gets the longest evolutionary distance from the origin over all
        leaves (0.0 when there are no leaves).
        """
        return max([0.0] + [
            self.getEvoDistance(LeafKey)
            for LeafKey in self.FastMLTree.LeafKey_L
        ])

    def modEvoDistance(self):
        """
        Returns a copy of EvoDistance_D (the distances are currently passed
        through unmodified).
        """
        return dict(self.EvoDistance_D)

    def setTreeCoords(self):
        """
        Sets tree node coordinates (horizontal and vertical) for the SVG image.

        The vertical position of a node comes from the index of the line on
        which its name appears in the ascii art of the cogent tree; the
        horizontal position is its evolutionary distance scaled by the
        longest distance. Returns a dict of node name to [x, y].
        """
        NamePattern = re.compile(r"[a-zA-Z0-9_\.@]+")
        Lines_L = self.CogentTree.asciiArt().split("\n")
        MaxVert = 0
        VertCoord_D = {}

        for i, Line in enumerate(Lines_L):
            for Name in NamePattern.findall(Line):
                VertCoord_D[Name] = i
                MaxVert = i

        #a one-line drawing or an all-zero-length tree would otherwise
        #divide by zero; every numerator is 0 in those cases anyway
        VertSpan = float(MaxVert) if MaxVert else 1.0
        HorizSpan = self.LongestDistance if self.LongestDistance else 1.0

        return {
            Key: [(self.ModdedEvoDistance_D[Key] / HorizSpan) *
                  self.TreeFigWIDTH + self.TreeFigXOffset,
                  (float(VertCoord_D[Key]) / VertSpan) * self.TreeFigHEIGHT +
                  self.TreeFigYOffset]
            for Key in self.NodeToSeq_D.keys()
        }

    def addNodeNamesAtNodePoints(self):
        """
        Adds the leaf names at the leaf node vertices.
        """
        for Key in self.FastMLTree.LeafKey_L:
            xStart, yStart = self.TreeCoords_D[Key]
            self.FigureSVG_D["TreeAndStates"].append(
                '''\t<text x='%s' y='%s' text-anchor='left' font-size='20' font-family='Courier' style="fill: #000000;"  >%s</text>'''
                % (str(xStart), str(yStart), Key))

    def addVerticalLines(self):
        """
        Adds the vertical lines of the tree image (at the parent's x, from
        the parent's y to the child's y).
        """
        for branchKey in self.FastMLTree.BranchKey_L:
            fro, to = branchKey.split(">>")[0], branchKey.split(">>")[1]
            froXY = self.TreeCoords_D[fro]
            toXY = self.TreeCoords_D[to]

            self._addBranchLine(branchKey, froXY[0], froXY[1], froXY[0],
                                toXY[1])

    def addHorizontalLines(self):
        """
        Adds the horizontal lines of the tree image (at the child's y, from
        the parent's x to the child's x).
        """
        for branchKey in self.FastMLTree.BranchKey_L:
            fro, to = branchKey.split(">>")[0], branchKey.split(">>")[1]
            froXY = self.TreeCoords_D[fro]
            toXY = self.TreeCoords_D[to]

            self._addBranchLine(branchKey, froXY[0], toXY[1], toXY[0],
                                toXY[1])

    def makeTreeFig(self):
        """
        Does all methods necessary to make the tree image.
        """
        self.addNodeNamesAtNodePoints()
        self.addVerticalLines()
        self.addHorizontalLines()

    def addStateRows(self):
        """
        Adds the rows for the mutated states in each sequence: a rotated
        position label per column, then one coloured cell per leaf and state.
        """
        inc = self.StateInc
        vertInc = float(self.StateFigHEIGHT / float(len(self.LeafStates_D)))

        #the column labels are placed 1.5 rows above the smallest y of the tree
        lowestY = min([xy[1] for xy in self.TreeCoords_D.values()] or
                      [float("inf")])
        stateY = lowestY - (1.5 * vertInc)

        stateX = 0.0 + self.StateFigXOffset
        for i in self.StateIndices_L:
            #labels show the 1-based position
            self.FigureSVG_D["TreeAndStates"].append(
                '''\t<text x='%s' y='%s' text-anchor='middle' font-size='16' font-family='Courier' transform="rotate(90, %s, %s)" style="fill: #000000;"  >%s</text>'''
                % (str(stateX), str(stateY), str(stateX), str(stateY),
                   str(i + 1)))
            stateX += inc

        for Key in self.LeafStates_D.keys():
            X = 0.0 + self.StateFigXOffset
            Y = self.TreeCoords_D[Key][1]

            for State in self.LeafStates_D[Key]:
                self.FigureSVG_D["TreeAndStates"].append(
                    self._getStateRect(X, Y, inc, vertInc, State))
                self.FigureSVG_D["TreeAndStates"].append(
                    '''\t<text x='%s' y='%s' font-size='20' font-family='Courier' text-anchor='middle' style="fill: #000000;"  >%s</text>'''
                    % (str(X), str(Y), State))

                X += inc

    def makeStatesFig(self):
        """
        Executes the method to make the states figure.
        """
        self.addStateRows()

    def parseScoringMatrix(self):
        """
        Parses the scoring matrix for alignment to the PDB sequence information.

        Uses the first <PDB_alignment> whose id (the part of <PDB_id> before
        "|") equals the upper-cased PDB of interest, and stores the
        lower-cased chain (the part after "|") in self.ChainoInterest.

        Returns:
        Dict with "Qstart", "Qend", "Sstart", "Send" (file values minus 1) and "Sseq"

        Raises:
        ValueError: the scoring matrix holds no alignment for the PDB of interest
        """
        with open(self.MatrixPATH, "r") as MatrixFH:
            allAlignments_L = re.findall("<PDB_alignment>.+?</PDB_alignment>",
                                         MatrixFH.read(), re.DOTALL)

        KeyAln = None
        for Alignment in allAlignments_L:
            PDBField_L = re.search("<PDB_id>(.+?)</PDB_id>",
                                   Alignment).group(1).split("|")
            if self.PDBoInterest.upper() == PDBField_L[0]:
                KeyAln = Alignment
                self.ChainoInterest = PDBField_L[1].lower()
                break

        if KeyAln is None:
            raise ValueError("no <PDB_alignment> for PDB '%s' in %s" %
                             (self.PDBoInterest, self.MatrixPATH))

        def getField(Tag):
            return re.search("<%s>(.+?)</%s>" % (Tag, Tag), KeyAln).group(1)

        return {
            "Qstart": int(getField("Alignment_start_query")) - 1,
            "Qend": int(getField("Alignment_end_query")) - 1,
            "Sstart": int(getField("Alignment_start_subject")) - 1,
            "Send": int(getField("Alignment_end_subject")) - 1,
            "Sseq": getField("Aligned_subject_sequence"),
        }

    def makeAlignmentFig(self):
        """
        Makes the cartoon of all aligned sequences in the protein family:
        the aligned PDB sequence on the first row, then one row per leaf
        holding the leaf sequence window that starts at Qstart and has the
        length of the PDB sequence. Each row ends with its header.
        """
        Sseq = self.MatrixInfo["Sseq"]
        Qstart = self.MatrixInfo["Qstart"]

        AllSeqs_L = [Sseq] + [
            self.NodeToSeq_D[Key][Qstart:Qstart + len(Sseq)]
            for Key in self.FastMLTree.LeafKey_L
        ]
        AllHeaders_L = [self.PDBoInterest] + self.FastMLTree.LeafKey_L

        xinc = self.AlnInc
        yinc = self.AlnInc

        Y = self.AlignmentFigYOffset

        for Seq, Header in zip(AllSeqs_L, AllHeaders_L):

            X = 0.0 + self.AlignmentFigXOffset

            for State in Seq:
                self.FigureSVG_D["Alignment"].append(
                    self._getStateRect(X, Y, xinc, yinc, State))
                self.FigureSVG_D["Alignment"].append(
                    '''\t<text x='%s' y='%s' text-anchor='middle' font-size='10' font-family='Courier' style="fill: #000000;"  >%s</text>'''
                    % (str(X), str(Y), State))

                X += xinc

            #the header is written one increment to the right of the row
            self.FigureSVG_D["Alignment"].append(
                '''\t<text x='%s' y='%s' text-anchor='left' font-size='10' font-family='Courier' style="fill: #000000;"  >%s</text>'''
                % (str(X + self.AlnInc), str(Y), Header))

            Y += yinc

    def getColoredStructureFile(self):
        """
        Gets a PDB format file with the temperature factors coloured to
        reflect mutated sites.

        Returns the handle obtained from getOutputTempFile() after the
        ScopeAlgorithm of the branch of interest has written to its path.
        """
        DesiredBranchKey = self._getBranchKeyForDerivedNode()

        PDBAndPDBXMLContents = getAllPDBFileDicts([self.PDBoInterest])
        SA = self.BranchToAlgorithm_D[DesiredBranchKey]
        SA.PDBContents_D = PDBAndPDBXMLContents[0]
        SA.PDBXMLContents_D = PDBAndPDBXMLContents[1]

        FH = getOutputTempFile()
        SA.createPDBColoredFile(self.PDBoInterest, FH.name)

        return FH
# Esempio n. 7 (Example no. 7)
# 0
 def __init__(self , FastaPATH , UserTreePATH , ProjectName):
     """
     Parses the user tree, checks it against the FASTA sequences and, when
     both are usable, runs the WholeTreeOrthologousSubgroup analysis and
     writes its output files into the ProjectName directory.

     Args:
     FastaPATH (String): path to the FASTA file, read with readFasta
     UserTreePATH (String): path to the tree file, parsed with FastMLTree
     ProjectName (String): directory that receives the output files (created when missing)

     Attributes set:
     ExitStatus (Bool): True only after the analysis object has been built
     ExitString (String): the XML written to Report.xml, or an error message
     OSGString (String): initialised to "" and not changed here
     """
     self.FastaPATH = FastaPATH
     self.UserTreePATH = UserTreePATH
     self.ProjectName = ProjectName
     
     #declaration of output variables
     self.ExitStatus = False
     self.ExitString = ""
     self.OSGString = ""
     
     #parses the tree file according to FastMLTree methods
     self.FastMLTree = FastMLTree(self.UserTreePATH , True)
     
     if self.FastMLTree.Parsed:
         #opens the Fasta file and gets the sequences as a dictionary
         ReadFasta = readFasta(self.FastaPATH)
         self.FastaKey_L = ReadFasta[0]
         self.Fasta_D = ReadFasta[1]
         
         #validates the sequences with the tree
         ValidSeqsWithTree = self.ValidateSeqsWithTree()
         
         #if sequence headers match with node headers
         if ValidSeqsWithTree[0]:
             
             #instantiates the WholeTreeOrthologousSubgroup class object
             self.OSG = WholeTreeOrthologousSubgroup(self.FastMLTree , self.FastaPATH , self.FastaKey_L , self.Fasta_D)
             
             #the analysis is treated as successful once the object exists, so
             #everything below runs unconditionally
             self.ExitStatus = True
             
             #prepares the final XML output string
             self.ExitString = ''.join([
                 "<Group>\n",
                 "\t<Group_id>NA</Group_id>\n\t<Number_OSGs>1</Number_OSGs>\n",
                 "\t<OSGs>\n",
                 self.OSG.ExitString,
                 "\t</OSGs>\n",
                 "</Group>\n",
             ])
             
             #creates the project directory (and any missing parents) without
             #passing the user-supplied name through a shell
             if not os.path.isdir(self.ProjectName):
                 os.makedirs(self.ProjectName)
             
             #writes the protein adaptation XML file
             with open("%s/Report.xml" % (self.ProjectName) , "w") as w:
                 w.write(self.ExitString)
             
             #writes a new modified newick tree file with internal node names according to the cogent convention
             with open("%s/ModdedTree.nwk" % (self.ProjectName) , "w") as w:
                 w.write(self.FastMLTree.CogentTree.getNewick(with_distances=True).replace("'",""))
             
             #writes an XML file containing information pertaining to the ReferenceToPDB2DScoringMatrix object
             with open("%s/ScoringMatrix.xml" % (self.ProjectName) , "w") as w:
                 w.write(self.OSG.MatrixGraphicsString)
                 
             #writes FASTA file of reconstructed sequences
             with open("%s/AncestralSeqs.fa" % (self.ProjectName) , "w") as w:
                 w.write(self.OSG.ReconstructedFASTAString)
             
             #writes text file of reconstruction probabilities
             with open("%s/AncestralProb.txt" % (self.ProjectName) , "w") as w:
                 w.write(self.OSG.ReconstructedProbabilityString)
             
             #Final output message (single-argument print(...) behaves the same on Python 2 and 3)
             print("Done.\n" +
                   "Main output XML file written to %s/Report.xml\n" % (self.ProjectName) +
                   "Modified tree Newick file written to %s/ModdedTree.nwk\n" % (self.ProjectName) +
                   "Scoring Matrix XML file written to %s/ScoringMatrix.xml\n"  % (self.ProjectName) +
                   "FASTA format file of reconstructed sequences written to %s/AncestralSeqs.fa\n" % (self.ProjectName) +
                   "Text file of reconstructed sequence probabilities written to %s/AncestralProb.txt" % (self.ProjectName))
             
         #if the tree terminal nodes were not matched with the Fasta sequence headers 
         else:
             self.ExitString = self.getTreeMatchedToFastaErrorMessage()
             print(self.ExitString)
     
     #if the FastMLTree object was not properly parsed
     else:
         self.ExitString = self.getTreeErrorMessage()
         print(self.ExitString)