def printClusters(clusters):
    """Print every cluster, labelling each with a 1-based cluster identifier."""
    for clusterId, cluster in enumerate(clusters, start=1):
        progress.Print("ClusterID: %s" % clusterId)
        for structure in cluster:
            progress.Print(structure)
def GetBasePairsFromStructFile(faPath):
    """Parse a structure file and map each line index to its base-pair list.

    faPath: path to a file whose lines start with a dot-bracket structure
            (first space-separated token of each line).

    Returns (number_of_lines, DicStruct) where DicStruct maps the 0-based
    line index to the base-pair list produced by BasePairsFromStruct.
    """
    DicStruct = {}
    lines = FF.Parsefile(faPath)
    # Fix: dropped unused `SeqLen = len(lines[1]) - 1`, which also raised
    # IndexError on files with fewer than two lines.
    rawStructs = []
    for j, line in enumerate(lines):
        # The structure is the first token on the line; anything after the
        # first space (e.g. an energy value) is ignored.
        sec_str = line.strip().split(' ')[0]
        rawStructs.append(sec_str)
        DicStruct[j] = BasePairsFromStruct(sec_str)
    progress.Print("Loaded %s structures (%s distinct)"
                   % (len(rawStructs), len(set(rawStructs))))
    return len(lines), DicStruct
def StructSampling(Pathconstraints, Conditions, numberStructures, T, m, b, defaultFasta):
    """Sample secondary structures with RNAsubopt for every probing condition.

    Pathconstraints: folders searched for a per-condition alternative FASTA file.
    Conditions: condition names; one sample file is written per condition.
    numberStructures: number of structures to keep per condition.
    T: sampling temperature passed to RNAsubopt.
    m, b: slope/intercept parameters for the SHAPE reactivity conversion (Dmb).
    defaultFasta: input used when no per-condition sequence/constraint file exists.

    Returns the folder containing the generated sample files.
    """
    conf = loadConfig()
    # Renamed from `dir` (shadowed the builtin).
    outDir = os.path.join(conf.OutputFolder, "tmp",
                          'OutputSamples' + str(numberStructures))
    FF.CreateFold(outDir)
    thermoMsgShown = False
    for filename in Conditions:
        lines = []
        header = []
        progress.StartTask("Processing %s" % (filename))
        # Keep sampling until enough structures have accumulated (the tool may
        # emit fewer than requested per invocation).
        while len(lines) - NUM_HEADER_LINES < numberStructures:
            # If alternative sequence file found in constraints folder, use it
            # rather than default
            Input = defaultFasta
            for p in Pathconstraints:
                tmpInput = os.path.join(p, filename + '.' + IPANEMAP.FASTA_EXTENSION)
                if os.path.isfile(tmpInput):
                    Input = tmpInput
            output = os.path.join(outDir, filename)
            Command = 'RNAsubopt -p ' + str(numberStructures) + ' -s -T ' + str(T)
            (hasHardConstraints, hasSoftConstraints) = (False, False)
            hardConstraintFile = os.path.join(conf.PathConstraintsFile, filename + '.txt')
            if os.path.isfile(hardConstraintFile):
                Command += ' -C --enforceConstraint '
                hasHardConstraints = True
                Input = hardConstraintFile
            ShapeFile = os.path.join(conf.PathConstraintsFileShape, filename + '.txt')
            if os.path.isfile(ShapeFile):
                Command += (' --shape ' + ShapeFile
                            + ' --shapeMethod="Dm' + str(m) + 'b' + str(b) + '"')
                hasSoftConstraints = True
            if not (hasHardConstraints or hasSoftConstraints or thermoMsgShown):
                progress.Print("Warning: Did not find suitable constraint file for this condition, using purely thermodynamic sampling")
                thermoMsgShown = True
            # NOTE(review): shell=True with a string-concatenated command is an
            # injection risk if condition names/paths are untrusted — consider
            # an argument list with shell=False.
            # Fix: the three handles were previously opened inline and leaked;
            # context managers now close them after the call.
            with open(Input, 'r') as stdinFile, \
                    open(output, 'wb') as stdoutFile, \
                    open(os.devnull, 'w') as stderrFile:
                subprocess.call(Command, stdin=stdinFile, stdout=stdoutFile,
                                stderr=stderrFile, shell=True)
            with open(output, 'r') as f:
                nlines = f.readlines()
            header = nlines[:NUM_HEADER_LINES]
            lines += nlines[NUM_HEADER_LINES:]
        # Rewrite the file with the header plus exactly numberStructures entries.
        with open(output, 'w') as f:
            f.writelines(header + lines[:numberStructures])
        progress.EndTask()
    return outDir
ProbingConditions = [RNAName + state for state in conf.Conditions]

# Decide whether to generate a fresh sample or reuse a previously generated one:
# resample when explicitly requested, or when no earlier sample folder exists.
OutputSamples = os.path.join(conf.OutputFolder, "tmp", 'OutputSamples') + conf.SampleSize
needSampling = str.lower(conf.Sampling) == "true" or not os.path.isdir(OutputSamples)
if needSampling:
    progress.StartTask("Sampling %s structures for each condition" % (conf.SampleSize))
    OutputSamples = SP.StructSampling(
        [conf.PathConstraintsFile, conf.PathConstraintsFileShape],
        ProbingConditions,
        int(conf.SampleSize),
        conf.Temperature,
        conf.m,
        conf.b,
        conf.RNA)
    progress.EndTask()
else:
    progress.Print("Using existing sample")
progress.Print("Probing conditions: %s" % (ProbingConditions))

# Merge the per-condition sample files into a single global structure file.
FF.MergeFiles(OutputSamples, os.path.join(OutputSamples, 'Samples.txt'),
              ProbingConditions, SP.NUM_HEADER_LINES)

# Build the dissimilarity matrix; this also identifies redundant structures
# within the same condition.
progress.StartTask("Computing dissimilarity matrix")
SVMlFile = "DissimilarityMatrix" + conf.SampleSize
SF.DistanceStruct(os.path.join(OutputSamples, 'Samples.txt'), SVMlFile,
                  int(conf.SampleSize), ProbingConditions)
progress.EndTask()
def DefineNumberCluster(SVMLMatrix, Redundant, method, DM, BoltzmanFactor, Probingconditions, rna):
    """Search nb = 2..20 for an optimal number of MiniBatchKMeans clusters.

    For each candidate nb, clusters SVMLMatrix, computes centroids and their
    cumulated Boltzmann weights, then stops when either (a) some cluster's
    intra-distance falls below `epsilon`, or (b) every "probable" centroid of
    the previous solution (cumulated Boltzmann weight >= 30% of the
    single-cluster reference) moved by at most `epsilon` base pairs.

    Returns (Clust[nb], Centroids[nb]) for the chosen nb.
    """
    conf = loadConfig()
    epsilon = 1  # Centroid base-pair distance threshold
    Cluster = defaultdict(lambda: defaultdict(CL.a))
    Clust = defaultdict(lambda: defaultdict(CL.a))
    CumulBE = defaultdict(lambda: defaultdict(CL.a))
    Centroids = defaultdict(lambda: defaultdict(CL.a))
    # Hoisted: the merged samples file used by every CentroidBycluster call.
    samplesPath = os.path.join(conf.OutputFolder, "tmp",
                               'OutputSamples' + str(conf.SampleSize),
                               'Samples.txt')
    progress.StartTask("Initialization step")
    # Initialization step: single-cluster reference solution.
    # X, Y, Z and IntradistanceStop are unused here but returned by the helper.
    Cluster[1] = CL.MiniBatchKMeans(SVMLMatrix, 1)
    Centroids[1], BZ, X, Y, Z, IntradistanceStop = CT.CentroidBycluster(
        Cluster[1], samplesPath, BoltzmanFactor, int(conf.SampleSize),
        Probingconditions, rna)
    CumulBE[1] = CumulatedBoltzmannsbyCluster(Cluster[1], BZ,
                                              int(conf.SampleSize),
                                              Probingconditions)
    progress.EndTask()
    for nb in range(2, 21):
        progress.StartTask("Clustering with %s clusters" % nb)
        progress.StartTask("Run MBKM")
        Clust[nb] = CL.MiniBatchKMeans(SVMLMatrix, nb)
        progress.EndTask()
        Centroids[nb], BZ, X, Y, Z, Intradistance = CT.CentroidBycluster(
            Clust[nb], samplesPath, BoltzmanFactor, int(conf.SampleSize),
            Probingconditions, rna)
        CumulBE[nb] = CumulatedBoltzmannsbyCluster(Clust[nb], BZ,
                                                   int(conf.SampleSize),
                                                   Probingconditions)
        # For each centroid of the previous solution, find its nearest centroid
        # in the current solution (base-pair distance) and record the change in
        # cumulated Boltzmann weight.
        lista = []
        for elem1 in Centroids[nb - 1].keys():
            rep = []
            for elem2 in Centroids[nb].keys():
                rep.append(
                    (elem2,
                     SF.DistanceTwoStructs(
                         SF.BasePairsFromStruct(Centroids[nb - 1][elem1]),
                         SF.BasePairsFromStruct(Centroids[nb][elem2]))))
            minima = np.min([item[1] for item in rep])
            pos = [elem[0] for elem in rep if elem[1] == minima][0]
            l1 = CumulBE[nb - 1][elem1]
            l2 = CumulBE[nb][pos]
            Dist = l1 - l2
            lista.append((minima, (l1, l2, Dist)))
        # Criterion: a centroid is "probable" when its cumulated Boltzmann
        # weight reaches 30% of the single-cluster reference weight.
        Bzmepsilon = 0.3 * CumulBE[1][0]
        BP_All_probable_centroids = [
            BPdist for BPdist, Bzmandist in lista if Bzmandist[0] >= Bzmepsilon
        ]
        progress.EndTask()
        if (len([elem for elem in Intradistance if elem <= epsilon]) != 0
                or len([distance for distance in BP_All_probable_centroids
                        if distance <= epsilon]) == len(BP_All_probable_centroids)):
            # Fix: pickle the clustering actually computed for nb clusters.
            # Previously this pickled Cluster[nb], but `Cluster` is only ever
            # assigned at key 1, so for nb >= 2 an empty defaultdict was saved.
            FF.PickleVariable(Clust[nb], "Clusters" + method + ".pkl")
            progress.Print("Choosing %s as the optimal number of clusters" % nb)
            break
    # For the entire clusters while keeping redundancy (nb = 20 if no criterion fired).
    return Clust[nb], Centroids[nb]