def DistanceStruct(StructFile, SVMlFile, numberofsruct, constrainte):
    """Build the pairwise dissimilarity matrix of the sampled structures.

    The structures are read with ``GetBasePairsFromStructFile(StructFile)``
    and every pair is compared with ``DistanceTwoStructs``.  A structure ``j``
    at distance 0 from an earlier structure ``i`` is flagged as redundant when
    both map to the same probing condition; the condition index of structure
    ``k`` is taken as ``int(k / numberofsruct)``.

    Side effects:
      * writes the matrix to ``<conf.OutputFolder>/tmp/<SVMlFile>``, one row
        per structure: ``"<i+1>\\t"`` followed by ``"<j+1>:<dist>\\t"`` for
        every ``j != i``;
      * pickles (through ``FF.PickleVariable``) the matrix, the indices of
        the redundant structures, the per-condition redundancy markers and
        the per-condition count of non-redundant structures.

    Args:
        StructFile: path handed to ``GetBasePairsFromStructFile``.
        SVMlFile: name of the matrix file created in the ``tmp`` folder.
        numberofsruct: initial number of structures counted per condition.
        constrainte: sequence of probing-condition names.

    Returns:
        Always 0.
    """
    conf = loadConfig()
    Redondantestructure = defaultdict(aa)
    MatDist = defaultdict(aa)
    Redondantestructure1 = {}
    # Every condition starts with the full sample; redundant hits are
    # subtracted below.
    Dicnumberofsruct = {condition: numberofsruct for condition in constrainte}

    nb, DicStruct = GetBasePairsFromStructFile(StructFile)

    progress.StartTask("Dissimilarity Loop")
    for i in range(nb):
        for j in range(i + 1, nb):
            distance = DistanceTwoStructs(DicStruct[i], DicStruct[j])
            MatDist[i][j] = distance
            MatDist[j][i] = distance
            # Check for redundancy
            if distance != 0 or j in Redondantestructure1:
                continue
            jconstraint = int(j / numberofsruct)
            # To be sure that the redundant structure belongs to the same
            # probing condition
            if int(i / numberofsruct) == jconstraint:
                Dicnumberofsruct[constrainte[jconstraint]] -= 1
                Redondantestructure1[j] = jconstraint
    progress.EndTask()

    progress.StartTask("Export dissimilarity matrix")
    for elem, jconstraint in Redondantestructure1.items():
        StructureNumber = elem - jconstraint * numberofsruct
        # we mark redundant structures by value 1
        Redondantestructure[constrainte[jconstraint]][StructureNumber] = 1

    # store the distance matrix in the SVMLFile
    SVMLFullPath = os.path.join(conf.OutputFolder, "tmp", SVMlFile)
    if os.path.isfile(SVMLFullPath):
        os.remove(SVMLFullPath)  # To clean the previous version
    size = len(MatDist)
    # The context manager closes the file even if a write fails.
    with open(SVMLFullPath, "w") as out:
        for i in range(size):
            row = ["%i\t" % (i + 1)]
            row.extend("%i:%.4f\t" % (j + 1, MatDist[i][j])
                       for j in range(size) if i != j)
            row.append("\n")
            out.write("".join(row))
    progress.EndTask()

    progress.StartTask("Pickle all data")
    FF.PickleVariable(MatDist, "dissmatrix.pkl")
    FF.PickleVariable(list(Redondantestructure1.keys()), "Redondantestructures.pkl")
    FF.PickleVariable(Redondantestructure, "Redondantestructures_Id.pkl")
    FF.PickleVariable(Dicnumberofsruct, "Dicnumberofsruct.pkl")
    progress.EndTask()
    return 0
def drawStructure(Sequence, Structure, Shapefile, OutFile):
    """Draw a secondary structure by invoking the VARNA command-line tool.

    Args:
        Sequence: sequence string passed to VARNA's ``-sequenceDBN``.
        Structure: structure string passed to VARNA's ``-structureDBN``.
        Shapefile: path of a reactivity file; when it exists, its values
            (read with ``FF.parseReactivityfile``) are passed as a
            ``-colorMap`` option followed by ``COLOR_MAP``.
        OutFile: output path passed to VARNA's ``-o`` option.

    Everything VARNA prints (stdout and stderr) is collected in
    ``<conf.OutputFolder>/tmp/varnamsg.txt``.
    """
    conf = CF.loadConfig()
    cmopt = ""
    if os.path.isfile(Shapefile):
        vals = FF.parseReactivityfile(Shapefile)
        cmopt = ' -colorMap "' + ";".join("%.3f" % float(v) for v in vals) + '"' + COLOR_MAP
    dummyout = os.path.join(conf.OutputFolder, "tmp", "varnamsg.txt")
    cmd = 'java -cp VARNAv3-93.jar fr.orsay.lri.varna.applications.VARNAcmd -bpStyle simple -sequenceDBN "%s" -structureDBN "%s" '%(Sequence, Structure) + cmopt + ' -algorithm line -o ' + OutFile
    # NOTE(review): the command stays a shell string because cmopt/COLOR_MAP
    # are pre-quoted shell fragments; OutFile is not quoted, so a path with
    # spaces will be split by the shell.
    # A single handle, closed on exit, receives both streams; the previous
    # code opened the same file twice and never closed either handle.
    with open(dummyout, 'wb') as sink:
        subprocess.call(cmd, stdin=None, stdout=sink, stderr=subprocess.STDOUT, shell=True)
def RunEval(InputFile):
    """Run ``RNAeval`` on *InputFile* and return the reported energies.

    The program reads *InputFile* on stdin and its stdout is stored in
    ``<conf.OutputFolder>/tmp/energyvalues``.  That file is then read with
    ``FF.Parsefile`` and every second line (indices 1, 3, 5, ...) is parsed.

    Args:
        InputFile: path of the RNAeval input file.

    Returns:
        A list of strings, one per parsed line: the text after the first
        space of the line, without its first character and its last two
        characters (presumably the parentheses around the energy and the
        trailing newline -- confirm against ``FF.Parsefile``).

    Raises:
        IOError/OSError: if *InputFile* cannot be opened or the ``RNAeval``
            executable cannot be started.
    """
    # Local import: this block cannot see whether the module imports it.
    import subprocess

    conf = loadConfig()
    energiesFile = os.path.join(conf.OutputFolder, "tmp", "energyvalues")
    # Redirect through file handles instead of a concatenated shell command,
    # so paths containing spaces or shell metacharacters are handled.
    with open(InputFile, 'r') as source, open(energiesFile, 'w') as sink:
        subprocess.call(['RNAeval'], stdin=source, stdout=sink)

    # Parse the RNAeval output to extract energy values
    lines = FF.Parsefile(energiesFile)
    # split(" ", 1) cuts at the first space only, so a space inside the
    # energy field itself is preserved.
    return [lines[i].split(" ", 1)[1][1:-2] for i in range(1, len(lines), 2)]
def StructSampling(Pathconstraints, Conditions, numberStructures, T, m, b, defaultFasta):
    """Sample structures with ``RNAsubopt`` for every probing condition.

    For each name in *Conditions* the sampler is run until at least
    *numberStructures* structure lines have been collected; the file
    ``<conf.OutputFolder>/tmp/OutputSamples<numberStructures>/<name>`` then
    receives the header lines of the last run followed by the first
    *numberStructures* collected lines.

    Input selection, per condition:
      * *defaultFasta*, unless a ``<name>.<IPANEMAP.FASTA_EXTENSION>`` file
        exists in one of *Pathconstraints* (the last match wins);
      * if ``<conf.PathConstraintsFile>/<name>.txt`` exists it becomes the
        input and ``-C --enforceConstraint`` is added;
      * if ``<conf.PathConstraintsFileShape>/<name>.txt`` exists it is
        passed with ``--shape`` and ``--shapeMethod=Dm<m>b<b>``.

    Args:
        Pathconstraints: folders searched for condition-specific FASTA files.
        Conditions: iterable of condition names.
        numberStructures: number of structures to keep per condition.
        T: value passed to the ``-T`` option.
        m, b: values inserted in the ``--shapeMethod`` option.
        defaultFasta: input file used when no alternative is found.

    Returns:
        The path of the folder holding the sample files.

    Raises:
        RuntimeError: if a sampler run yields no structure line (the loop
            could otherwise never terminate).
        OSError: if the ``RNAsubopt`` executable cannot be started.
    """
    conf = loadConfig()
    outDir = os.path.join(conf.OutputFolder, "tmp", 'OutputSamples' + str(numberStructures))
    FF.CreateFold(outDir)
    thermoMsgShown = False
    for filename in Conditions:
        progress.StartTask("Processing %s"%(filename))

        # If alternative sequence file found in constraints folder, use it
        # rather than default
        Input = defaultFasta
        for p in Pathconstraints:
            tmpInput = os.path.join(p, filename + '.' + IPANEMAP.FASTA_EXTENSION)
            if os.path.isfile(tmpInput):
                Input = tmpInput
        output = os.path.join(outDir, filename)

        # Argument list (no shell): paths with spaces need no quoting.
        Command = ['RNAsubopt', '-p', str(numberStructures), '-s', '-T', str(T)]
        hasHardConstraints = False
        hasSoftConstraints = False
        hardConstraintFile = os.path.join(conf.PathConstraintsFile, filename + '.txt')
        if os.path.isfile(hardConstraintFile):
            Command += ['-C', '--enforceConstraint']
            hasHardConstraints = True
            Input = hardConstraintFile
        ShapeFile = os.path.join(conf.PathConstraintsFileShape, filename + '.txt')
        if os.path.isfile(ShapeFile):
            Command += ['--shape', ShapeFile,
                        '--shapeMethod=Dm' + str(m) + 'b' + str(b)]
            hasSoftConstraints = True
        if not (hasHardConstraints or hasSoftConstraints or thermoMsgShown):
            progress.Print("Warning: Did not find suitable constraint file for this condition, using purely thermodynamic sampling")
            thermoMsgShown = True

        lines = []
        header = []
        # `lines` never contains the header, so it is compared directly with
        # the target; subtracting NUM_HEADER_LINES here forced one useless
        # extra sampling run per condition.
        while len(lines) < numberStructures:
            with open(Input, 'r') as source, \
                    open(output, 'wb') as sink, \
                    open(os.devnull, 'w') as devnull:
                subprocess.call(Command, stdin=source, stdout=sink, stderr=devnull)
            with open(output, 'r') as f:
                nlines = f.readlines()
            sampled = nlines[NUM_HEADER_LINES:]
            if not sampled:
                # stderr is discarded, so fail loudly instead of looping
                # forever on a sampler that produces nothing.
                raise RuntimeError(
                    "RNAsubopt produced no structure for condition %s" % filename)
            header = nlines[:NUM_HEADER_LINES]
            lines += sampled

        with open(output, 'w') as f:
            f.writelines(header + lines[:numberStructures])
        progress.EndTask()
    return outDir
def EvalStructuresEnergies(StructFile, rna):
    """Return the energies computed by ``RunEval`` for *StructFile*.

    ``FromStructFiletoRNAEvalInput`` first converts *StructFile* (together
    with *rna*) into ``<conf.OutputFolder>/tmp/InputRNAeval``; that file is
    then handed to ``RunEval`` and its result is returned unchanged.
    """
    settings = loadConfig()
    tmpFolder = os.path.join(settings.OutputFolder, "tmp")
    evalInputPath = os.path.join(tmpFolder, "InputRNAeval")
    # Step 1: generate the RNAeval input file.
    FromStructFiletoRNAEvalInput(StructFile, evalInputPath, rna)
    # Step 2: evaluate it.
    energies = RunEval(evalInputPath)
    return energies
import sys from collections import defaultdict from conf import loadConfig, Logger import FileFunctions as FF import Sampling as SP import StructureFunctions as SF import StructureFunctions as SF import VisualizationTools as VT import ClustersTrait as CT import Optimize_clustering as OC from Progress import progress FASTA_EXTENSION = "fa" if __name__ == "__main__": conf = loadConfig() # Create folders FF.CreateFold(conf.OutputFolder) FF.CreateFold(os.path.join(conf.OutputFolder, "tmp")) FF.CreateFold(os.path.join(conf.OutputFolder, "tmp", conf.PickledData)) # Redirects all the print to the output Log file sys.stdout = Logger(os.path.join(conf.OutputFolder, conf.OutputLogfile)) # ******************************** Generate sample try: rna = os.path.split(conf.RNA)[-1] RNAName = rna[:-(len(FASTA_EXTENSION) + 1)] progress.StartTask("Processing RNA %s" % (RNAName))
def DefineNumberCluster(SVMLMatrix, Redundant, method, DM, BoltzmanFactor, Probingconditions, rna):
    """Search for the number of clusters (2 to 20) at which clustering stops.

    A 1-cluster run of ``CL.MiniBatchKMeans`` initialises the reference
    values.  Then, for k = 2..20, the matrix is clustered again and the
    search stops at the first k for which either

      * at least one value of the ``Intradistance`` returned by
        ``CT.CentroidBycluster`` is <= ``epsilon``; or
      * every centroid of the (k-1)-clustering whose cumulated Boltzmann
        value is >= 0.3 * ``CumulBE[1][0]`` has its closest centroid of the
        k-clustering at a base-pair distance <= ``epsilon`` (this is also
        true when no centroid reaches that value).

    When the search stops, the selected clustering is pickled as
    ``"Clusters" + method + ".pkl"`` via ``FF.PickleVariable``.

    Args:
        SVMLMatrix: input handed to ``CL.MiniBatchKMeans``.
        Redundant: not used by this function.
        method: suffix used in the name of the pickle file.
        DM: not used by this function.
        BoltzmanFactor: forwarded to ``CT.CentroidBycluster``.
        Probingconditions: forwarded to ``CT.CentroidBycluster`` and
            ``CumulatedBoltzmannsbyCluster``.
        rna: forwarded to ``CT.CentroidBycluster``.

    Returns:
        ``(Clust[k], Centroids[k])`` for the last k that was evaluated.
    """
    conf = loadConfig()
    epsilon = 1  # Centroid base pair distance threshold
    sampleSize = int(conf.SampleSize)
    samplesFile = os.path.join(conf.OutputFolder, "tmp",
                               'OutputSamples' + str(conf.SampleSize), 'Samples.txt')

    # All three are keyed by the number of clusters.
    Clust = {}
    CumulBE = {}
    Centroids = {}

    progress.StartTask("Initialization step")
    Clust[1] = CL.MiniBatchKMeans(SVMLMatrix, 1)
    Centroids[1], BZ, _, _, _, _ = CT.CentroidBycluster(
        Clust[1], samplesFile, BoltzmanFactor, sampleSize, Probingconditions, rna)
    CumulBE[1] = CumulatedBoltzmannsbyCluster(Clust[1], BZ, sampleSize, Probingconditions)
    progress.EndTask()

    # A centroid is considered "probable" above this cumulated Boltzmann value.
    Bzmepsilon = 0.3 * CumulBE[1][0]

    for nb in range(2, 21):
        progress.StartTask("Clustering with %s clusters" % nb)
        progress.StartTask("Run MBKM")
        Clust[nb] = CL.MiniBatchKMeans(SVMLMatrix, nb)
        progress.EndTask()
        Centroids[nb], BZ, _, _, _, Intradistance = CT.CentroidBycluster(
            Clust[nb], samplesFile, BoltzmanFactor, sampleSize, Probingconditions, rna)
        CumulBE[nb] = CumulatedBoltzmannsbyCluster(Clust[nb], BZ, sampleSize, Probingconditions)

        # For every previous centroid: distance to its closest new centroid,
        # paired with the cumulated Boltzmann value of the previous cluster.
        lista = []
        for elem1 in Centroids[nb - 1].keys():
            refPairs = SF.BasePairsFromStruct(Centroids[nb - 1][elem1])
            distances = [
                SF.DistanceTwoStructs(refPairs, SF.BasePairsFromStruct(Centroids[nb][elem2]))
                for elem2 in Centroids[nb].keys()
            ]
            lista.append((min(distances), CumulBE[nb - 1][elem1]))

        # The criterion is about the existence of probable clusters
        BP_All_probable_centroids = [
            BPdist for BPdist, cumulated in lista if cumulated >= Bzmepsilon
        ]
        progress.EndTask()

        if (any(elem <= epsilon for elem in Intradistance)
                or all(distance <= epsilon for distance in BP_All_probable_centroids)):
            # Save the clustering that was actually selected.  The previous
            # code pickled an entry of a separate dict that only ever held
            # the 1-cluster run, i.e. an empty placeholder.
            FF.PickleVariable(Clust[nb], "Clusters" + method + ".pkl")
            progress.Print("Choosing %s as the optimal number of clusters" % nb)
            break
    # NOTE(review): if no k in 2..20 meets a criterion, nothing is pickled
    # and the 20-cluster result is returned -- confirm this is intended.

    # for the entire clusters while keeping redundancy
    return Clust[nb], Centroids[nb]