def drawStructure(Sequence, Shapefile, outfile):
    """Draw a structure with VARNA, coloured by the values read from Shapefile.

    Each line returned by ``FF.Parsefile(Shapefile)`` is split on tabs; its
    second field is converted to float and formatted with three decimals.
    The values, joined with ';', are handed to VARNA as ``-colorMap``.

    Sequence -- passed verbatim to VARNA's ``-i`` option.
    Shapefile -- tab-separated file whose second column holds the values.
    outfile -- passed verbatim to VARNA's ``-o`` option.

    The jar is referenced by a relative classpath, so it is resolved against
    the current working directory. VARNA's standard output is discarded and
    the exit status of the command is ignored.
    """
    # NOTE(review): the command is a plain shell string; paths containing
    # spaces or shell metacharacters will not survive -- confirm callers
    # only pass simple paths.
    formatted_values = []
    for record in FF.Parsefile(Shapefile):
        columns = record.strip().split('\t')
        formatted_values.append("%.3f" % float(columns[1]))
    color_map = ";".join(formatted_values)

    command = "".join([
        'java -cp VARNAv3-93.jar fr.orsay.lri.varna.applications.VARNAcmd -i ',
        Sequence,
        ' -colorMap ',
        '"', color_map, '"',
        ' -colorMapStyle ',
        '"-5.00:#4747B6,0.00:#4747FF,0.40:#1CFF47,0.70:#FF4747,2.41:#FFFF00"',
        ' -algorithm line -o ',
        outfile,
        " > /dev/null",
    ])
    os.system(command)
def Adjust_structure(File, directory, IndexRna):
    """Write the structures of File with the gap columns removed.

    The first line of File is scanned for '_' characters; the positions
    where they occur are dropped from every line in ``lines[2:-2]``. The
    result is written to ``<directory>/<IndexRna>MSA`` after a
    ``>IndexRna + 'MSA'`` header line.

    File -- path handed to ``FF.Parsefile``.
    directory -- folder receiving the output file.
    IndexRna -- prefix used for both the output file name and its header.

    No separator is added between the written lines; they are emitted as
    returned by ``FF.Parsefile`` (presumably still carrying their own line
    ending -- confirm). A line shorter than the first one is simply written
    without the columns it does not have.
    """
    # Parse once; the first line supplies the gap positions.
    lines = FF.Parsefile(File)
    gap_columns = {
        position
        for position, symbol in enumerate(lines[0])
        if symbol == '_'
    }

    fileout = os.path.join(directory, IndexRna + 'MSA')
    # 'with' guarantees the handle is closed even if a write fails.
    with open(fileout, "w") as out:
        out.write('>' + IndexRna + 'MSA\n')
        for struct in lines[2:-2]:
            out.write("".join(
                symbol
                for position, symbol in enumerate(struct)
                if position not in gap_columns
            ))
def FromStructFiletoRNAEvalInput(StructFile, InputRNAeval, rna):
    """Generate the intermediate file used as input for RNAeval.

    For every line of StructFile except the first, the string
    ``rna + line + '\\t'`` is written to InputRNAeval, giving the
    "sequence + structure, sequence + structure, ..." layout.

    StructFile -- path handed to ``FF.Parsefile``.
    InputRNAeval -- path of the file to create (overwritten if present).
    rna -- sequence text written in front of each structure line.
    """
    lines = FF.Parsefile(StructFile)
    # 'with' closes the file even when a write raises.
    with open(InputRNAeval, "w") as out:
        # The first line of the structure file is skipped.
        out.writelines("%s%s\t" % (rna, struct) for struct in lines[1:])
def EnergyValuesFromStructure(StructFile, rna):
    """Return the energy strings RNAeval reports for the structures in StructFile.

    StructFile -- structure file handed to ``FromStructFiletoRNAEvalInput``.
    rna -- sequence handed to ``FromStructFiletoRNAEvalInput``.

    Two files are created in the current working directory:
    ``InputRNAeval`` (input of the command) and ``energyvalues`` (its
    output). The exit status of the command is ignored.

    Returns a list of strings, one per odd-indexed line of the output.
    """
    # Generate the RNAeval input file.
    FromStructFiletoRNAEvalInput(StructFile, "InputRNAeval", rna)
    # Launch the RNAeval command.
    os.system('RNAeval <' + "InputRNAeval" + '>' + "energyvalues")
    # Parse the RNAeval output: every second line holds
    # "<structure> (<energy>)".
    lines = FF.Parsefile("energyvalues")
    # Split on the FIRST space only. When the energy is padded inside the
    # parentheses, e.g. "( -3.40)", splitting on every space yields "(" as
    # the second field and the extracted energy is empty. [1:-2] drops the
    # opening parenthesis and the trailing ")" plus line ending; any
    # padding space is kept.
    return [line.split(" ", 1)[1][1:-2] for line in lines[1::2]]
def GetBasePairsFromStructFile(faPath):
    """Load a structure file and list the base pairs of each of its lines.

    faPath -- path handed to ``FF.Parsefile``.

    Returns ``(number_of_lines, DicStruct)`` where ``DicStruct`` maps the
    index of each line to ``ListBasePairsFromStruct`` applied to the first
    space-separated token of that line.
    """
    lines = FF.Parsefile(faPath)
    # The former 'SeqLen = len(lines[1]) - 1' was never used and raised
    # IndexError on files with fewer than two lines; it has been removed.
    DicStruct = {}
    for index, line in enumerate(lines):
        structure = line.strip().split(' ')[0]
        DicStruct[index] = ListBasePairsFromStruct(structure)
    return len(lines), DicStruct
def CentroidBycluster(clusters, StructFile, Boltzmann, numberofsruct, constrainte, rna):
    """Compute one MEA centroid per cluster and the distances between them.

    clusters -- mapping keyed by cluster number. It is MODIFIED in place:
        the clusters reported by ``ClustersDiameter`` as eliminated are
        deleted from it.
    StructFile -- path handed to ``FF.Parsefile``; every line is converted
        with ``SF.BasePairsFromStruct``.
    Boltzmann, numberofsruct, constrainte -- forwarded unchanged to
        ``BasePairsbyCluster`` and ``ClustersDistances``.
    rna -- forwarded to ``MEA``.

    Returns the tuple ``(mycentroid, Boltzmancluster, E,
    MatriceDistanceCentroids, ListDiameters, Intradistance)``.
    """
    progress.StartTask("Computing centroids")
    # Size taken BEFORE clusters are eliminated: the matrices below are
    # indexed directly by cluster number.
    # NOTE(review): this assumes cluster numbers are integers in
    # [0, dim_clustering) -- confirm against the clustering step.
    dim_clustering = len(clusters)
    mycentroid = defaultdict()
    Intradistance = []
    centroids = defaultdict(lambda: defaultdict())
    ListStructures = [
        SF.BasePairsFromStruct(Struct) for Struct in FF.Parsefile(StructFile)
    ]

    progress.StartTask("Gathering base pairs")
    ListBPbystructure, _, Myproba, Boltzmancluster = BasePairsbyCluster(
        clusters, ListStructures, Boltzmann, numberofsruct, constrainte)
    # Eliminate clusters reporting one structure.
    ListDiameters, Listeliminated_clusers = ClustersDiameter(
        clusters, ListBPbystructure)
    for elem in Listeliminated_clusers:
        del clusters[elem]
    progress.EndTask()

    progress.StartTask("Computing cluster distance distribution")
    E = ClustersDistances(
        clusters, Boltzmann, ListBPbystructure, numberofsruct, constrainte)
    progress.EndTask()

    progress.StartTask("Computing MEA centroids")
    for ClusterNumber in clusters:
        mycentroid[ClusterNumber], centroids[ClusterNumber] = MEA(
            Myproba[ClusterNumber], rna)
    progress.EndTask()

    # np.zeros replaces scipy.zeros, which is only a deprecated alias of the
    # NumPy function.
    MatriceDistanceCentroids = np.zeros([dim_clustering, dim_clustering])
    MatriceDistanceClustersEucld = np.zeros([dim_clustering, dim_clustering])
    cluster_ids = list(clusters)
    for ClusterNumber in cluster_ids:
        for ClusterNumber2 in cluster_ids:
            if ClusterNumber2 > ClusterNumber:
                # Distance between the centroids of the two clusters.
                l = SF.DistanceTwoStructs(
                    centroids[ClusterNumber], centroids[ClusterNumber2])
                Intradistance.append(l)
                MatriceDistanceCentroids[ClusterNumber][ClusterNumber2] = l
                MatriceDistanceCentroids[ClusterNumber2][ClusterNumber] = l
                # Distance derived from the per-cluster values in E. This
                # matrix is filled but not returned; it was only consumed
                # by plotting calls that are disabled.
                l = np.sqrt(
                    abs(pow(E[ClusterNumber], 2) - pow(E[ClusterNumber2], 2)))
                MatriceDistanceClustersEucld[ClusterNumber][ClusterNumber2] = l
                MatriceDistanceClustersEucld[ClusterNumber2][ClusterNumber] = l
    # Closes the outer "Computing centroids" task.
    progress.EndTask()
    return mycentroid, Boltzmancluster, E, MatriceDistanceCentroids, ListDiameters, Intradistance
def ENERGY_VALUES_STRUCTURES(StructFile, rna):
    """Return the energy strings RNAeval reports for the structures in StructFile.

    StructFile -- structure file handed to ``FromStructFiletoRNAEvalInput``.
    rna -- sequence handed to ``FromStructFiletoRNAEvalInput``.

    Two files are created in the current working directory:
    ``InputRNAeval`` (input of the command) and ``energyvalues`` (its
    output). The exit status of the command is ignored.

    Returns a list of strings, one per odd-indexed line of the output.
    """
    # Generate the RNAeval input file.
    FromStructFiletoRNAEvalInput(StructFile, "InputRNAeval", rna)
    # Launch the RNAeval command.
    os.system('RNAeval <' + "InputRNAeval" + '>' + "energyvalues")
    # Parse the RNAeval output: the odd-indexed lines hold
    # "<structure> (<energy>)".
    lines = FF.Parsefile("energyvalues")
    # Split on the FIRST space only. When the energy is padded inside the
    # parentheses, e.g. "( -3.40)", splitting on every space yields "(" as
    # the second field and the extracted energy is empty. [1:-2] drops the
    # opening parenthesis and the trailing ")" plus line ending; any
    # padding space is kept.
    return [line.split(" ", 1)[1][1:-2] for line in lines[1::2]]
def Load_Probabilities(mypath):
    """Load every ``*.proba`` file found directly inside mypath.

    mypath -- directory to scan (sub-directories are not visited).

    Returns ``(Dic, B)``:
      Dic -- maps an integer index to the file name cut at ".proba".
      B -- nested defaultdict; ``B[index][i][j]`` is the float stored in the
           third tab-separated column of the line whose first two columns
           are the integers ``i`` and ``j``. Missing entries read as 0.

    Files are indexed in sorted name order so the indices are the same on
    every run. Blank lines are ignored.
    """
    Dic = {}
    B = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0)))
    # listdir() gives no ordering guarantee; sort for reproducible indices.
    proba_files = sorted(
        name for name in listdir(mypath)
        if name.endswith('.proba') and isfile(join(mypath, name))
    )
    for index, name in enumerate(proba_files):
        Dic[index] = name.split(".proba")[0]
        for line in FF.Parsefile(join(mypath, name)):
            if not line.strip():
                # e.g. a trailing empty line at the end of the file
                continue
            fields = line.split("\t")
            B[index][int(fields[0])][int(fields[1])] = float(fields[2])
    return Dic, B
def RunEval(InputFile):
    """Run RNAeval on InputFile and return the energy strings it reports.

    InputFile -- path fed to the command on standard input.

    The output of the command is written to ``<OutputFolder>/tmp/energyvalues``
    where ``OutputFolder`` comes from ``loadConfig()``. The exit status of
    the command is ignored.

    Returns a list of strings, one per odd-indexed line of the output.
    """
    settings = loadConfig()
    energiesFile = os.path.join(settings.OutputFolder, "tmp", "energyvalues")
    # Launch the RNAeval command through the shell, redirecting both ends.
    os.system('RNAeval <' + InputFile + '>' + energiesFile)

    # Parse the RNAeval output: the odd-indexed lines carry the energy.
    output_lines = FF.Parsefile(energiesFile)
    energies = []
    for record in output_lines[1::2]:
        # Cut at the first space only, so the whole "(...)" part stays in
        # one piece; [1:-2] then removes the opening parenthesis and the
        # last two characters.
        after_structure = record.split(" ", 1)[1]
        energies.append(after_structure[1:-2])
    return energies
def GetBasePairsFromStructFile(faPath):
    """Load a structure file and list the base pairs of each of its lines.

    faPath -- path handed to ``FF.Parsefile``.

    Returns ``(number_of_lines, DicStruct)`` where ``DicStruct`` maps the
    index of each line to ``BasePairsFromStruct`` applied to the first
    space-separated token of that line. The total and distinct counts of
    those tokens are reported through ``progress.Print``.
    """
    lines = FF.Parsefile(faPath)
    # The former 'SeqLen = len(lines[1]) - 1' was never used and raised
    # IndexError on files with fewer than two lines; it has been removed.
    rawStructs = [line.strip().split(' ')[0] for line in lines]
    DicStruct = {}
    for index, sec_str in enumerate(rawStructs):
        DicStruct[index] = BasePairsFromStruct(sec_str)
    progress.Print("Loaded %s structures (%s distinct)" %
                   (len(rawStructs), len(set(rawStructs))))
    return len(lines), DicStruct
# Two numeric parameters forwarded to SP.StructSampling
# (presumably slope/intercept of the probing pseudo-energy -- TODO confirm).
m = 2.6 / 2
b = -0.8 / 2

# Folders and extension used to locate the input files.
path_Fasta = 'fasta_files'
Alignementfolder = 'Alignement'
FileExtensionFasta = 'fa'

print("Sampling Process for % s Structures" % (conf.numberofsruct))
OutputSamples = SP.StructSampling(
    [conf.PathConstrainteFile, conf.PathConstrainteFileShape],
    Alignementfolder,
    conf.numberofsruct,
    conf.Temperature,
    conf.Fastaextenstion,
    m,
    b)

for filz in GetListFile(path_Fasta, FileExtensionFasta):
    # Same output as the statement form 'print filz, "Treatement "'.
    print("%s %s" % (filz, "Treatement "))
    startimebig = time.time()
    # The second line of the fasta file is taken as the sequence.
    rna = FF.Parsefile(
        os.path.join(path_Fasta, filz + '.' + FileExtensionFasta))[1]
    Indexe = filz
    SVMlFile = "DissimilarityMatrix" + conf.numberofsruct
    listfiles = [filz + state for state in ["NMIA", "1M7", "MSA"]]
    # From here on OutputSamples is the folder name, not the value
    # returned by the sampling call above.
    OutputSamples = 'OutputSamples' + conf.numberofsruct
    # 1 for the case where no constraint is given
    MFESnbrstruct = len(listfiles)
    FF.MergeFiles(OutputSamples,
                  os.path.join(OutputSamples, 'Samples.txt'),
                  listfiles, 1)
    # Distance matrix calculation follows.
def FromStructFiletoRNAEvalInput(StructFile, InputRNAeval, rna):
    """Build the RNAeval input file from a structure file.

    The lines of StructFile that follow the first ``SP.NUM_HEADER_LINES``
    ones are handed, together with InputRNAeval and rna, to
    ``StructsToRNAEvalInput``.

    StructFile -- path handed to ``FF.Parsefile``.
    InputRNAeval -- forwarded unchanged to ``StructsToRNAEvalInput``.
    rna -- forwarded unchanged to ``StructsToRNAEvalInput``.
    """
    # Drop the header lines before delegating.
    structures = FF.Parsefile(StructFile)[SP.NUM_HEADER_LINES:]
    StructsToRNAEvalInput(structures, InputRNAeval, rna)
FF.CreateFold(os.path.join(conf.OutputFolder, "tmp", conf.PickledData)) # Redirects all the print to the output Log file sys.stdout = Logger(os.path.join(conf.OutputFolder, conf.OutputLogfile)) # ******************************** Generate sample try: rna = os.path.split(conf.RNA)[-1] RNAName = rna[:-(len(FASTA_EXTENSION) + 1)] progress.StartTask("Processing RNA %s" % (RNAName)) if not os.path.isfile(conf.RNA): raise FF.IPANEMAPError("Input file '%s' not found" % (conf.RNA)) # Get the rna sequence RNASequence = FF.Parsefile(conf.RNA)[1].strip() # Get probing conditions for the treated RNA ProbingConditions = [RNAName + state for state in conf.Conditions] # Specify whether to generate new sample or use a previously generated one OutputSamples = os.path.join(conf.OutputFolder, "tmp", 'OutputSamples') + conf.SampleSize if str.lower( conf.Sampling) == "true" or not os.path.isdir(OutputSamples): progress.StartTask("Sampling %s structures for each condition" % (conf.SampleSize)) OutputSamples = SP.StructSampling( [conf.PathConstraintsFile, conf.PathConstraintsFileShape], ProbingConditions, int(conf.SampleSize), conf.Temperature, conf.m, conf.b, conf.RNA)
#!/usr/bin/env python2.7
# @author SAADI AFAF, 2016
import conf
import FileFunctions as FF
import Sampling as SP
import StructureFunctions as SF
import Clustering as CL
import VisualizationTools as VT
import ClustersTrait as CT
import time
import os
import sys
import pickle
from collections import defaultdict

# Redirect everything that is printed to Logfile.txt
sys.stdout = conf.Logger("Logfile.txt")

SVMlFile = "DissimilarityMatrix" + conf.numberofsruct
# The second line of the RNA file is taken as the sequence.
rna = FF.Parsefile(conf.rnafile)[1]
listfiles = conf.constraintes + ["MFES"]

# ---- Global study: comparison of the dot-plot matrices ----
print('**************Calculation of Eucledian distance between different BP dot plot conditions**********')
SF.DotplotRnaFold(conf.Psdotpath,
                  conf.PathConstrainteFile,
                  conf.PathConstrainteFileShape)
SF.Writeproba(conf.Psdotpath, conf.Matrixproba, conf.constraintes, rna)
VT.plotClusteringDistribution(int(conf.numberofsruct),
                              conf.Matrixproba,
                              len(rna))
print("Eucledian distance calculation done")

# ---- Sampling ----
# This step requires as input the fasta_constraints folder and the
# fasta_Shape folder. Using RNAsubopt, it generates a sample of structures
# that is written to "Sample.txt" in the OutputSamples_<number of structures>
# folder.
print('****************Sampling RNA Secondary Structures***********************')
startime = time.time()
print("Sampling Process for % s Structures" % (conf.numberofsruct))
OutputSamples = SP.StructSampling(
    [conf.PathConstrainteFile, conf.PathConstrainteFileShape],
    conf.numberofsruct,
    conf.Temperature,
    conf.Fastaextenstion)