Exemple #1
0
def printClusters(clusters):
    """Print each cluster through ``progress.Print``.

    Every cluster is announced with a ``ClusterID`` line (numbering starts
    at 1), followed by one line per member of that cluster.

    Args:
        clusters: Iterable of clusters; each cluster is itself an iterable
            of printable items.
    """
    for number, members in enumerate(clusters, start=1):
        progress.Print("ClusterID: %s" % number)
        for member in members:
            progress.Print(member)
Exemple #2
0
def GetBasePairsFromStructFile(faPath):
    """Read a structure file and compute the base pairs of every structure.

    Each line returned by ``FF.Parsefile`` is stripped and split on single
    spaces; the first field is taken as the secondary structure and handed
    to ``BasePairsFromStruct``.

    Args:
        faPath: Path of the structure file to parse.

    Returns:
        A ``(count, DicStruct)`` tuple where ``count`` is the number of lines
        read and ``DicStruct`` maps the 0-based line index to the value
        returned by ``BasePairsFromStruct`` for that line's structure.
    """
    lines = FF.Parsefile(faPath)

    # The previous implementation also evaluated len(lines[1]) into an unused
    # variable, which raised IndexError on files with fewer than two lines.
    rawStructs = [line.strip().split(' ')[0] for line in lines]
    DicStruct = {
        index: BasePairsFromStruct(sec_str)
        for index, sec_str in enumerate(rawStructs)
    }

    progress.Print("Loaded %s structures (%s distinct)" %
                   (len(rawStructs), len(set(rawStructs))))
    return len(lines), DicStruct
Exemple #3
0
def StructSampling(Pathconstraints, Conditions, numberStructures, T, m, b, defaultFasta):
    """Sample structures with RNAsubopt for every condition.

    For each condition name, RNAsubopt is run (repeatedly if needed) and its
    output is collected until enough structure lines are available. The final
    file for the condition holds the header lines followed by at most
    ``numberStructures`` sampled lines.

    Args:
        Pathconstraints: Folders searched for a condition-specific sequence
            file named ``<condition>.<IPANEMAP.FASTA_EXTENSION>``; when several
            folders contain one, the last match is used.
        Conditions: Iterable of condition names (used as file base names).
        numberStructures: Number of structures requested per condition.
        T: Value passed to RNAsubopt's ``-T`` option.
        m: Slope used to build the ``--shapeMethod`` argument.
        b: Intercept used to build the ``--shapeMethod`` argument.
        defaultFasta: Sequence file used when no condition-specific file
            exists.

    Returns:
        Path of the folder that contains one sample file per condition.

    Raises:
        RuntimeError: If an RNAsubopt run yields no structure lines, since
            retrying the same command would otherwise loop forever.
        OSError: If the RNAsubopt executable cannot be started.
    """
    conf = loadConfig()
    outputDir = os.path.join(conf.OutputFolder, "tmp", 'OutputSamples' + str(numberStructures))
    FF.CreateFold(outputDir)
    thermoMsgShown = False
    for filename in Conditions:
        progress.StartTask("Processing %s" % (filename))

        # If alternative sequence file found in constraints folder, use it rather than default
        Input = defaultFasta
        for p in Pathconstraints:
            tmpInput = os.path.join(p, filename + '.' + IPANEMAP.FASTA_EXTENSION)
            if os.path.isfile(tmpInput):
                Input = tmpInput

        # The command is an argument list (no shell), so file paths that
        # contain spaces or shell metacharacters are passed through verbatim.
        Command = ['RNAsubopt', '-p', str(numberStructures), '-s', '-T', str(T)]
        hasConstraints = False

        hardConstraintFile = os.path.join(conf.PathConstraintsFile, filename + '.txt')
        if os.path.isfile(hardConstraintFile):
            Command += ['-C', '--enforceConstraint']
            hasConstraints = True
            # The hard-constraint file replaces the sequence file as stdin.
            Input = hardConstraintFile

        ShapeFile = os.path.join(conf.PathConstraintsFileShape, filename + '.txt')
        if os.path.isfile(ShapeFile):
            Command += ['--shape', ShapeFile,
                        '--shapeMethod=Dm' + str(m) + 'b' + str(b)]
            hasConstraints = True

        # The warning is emitted only once for the whole run.
        if not (hasConstraints or thermoMsgShown):
            progress.Print("Warning: Did not find suitable constraint file for this condition, using purely thermodynamic sampling")
            thermoMsgShown = True

        output = os.path.join(outputDir, filename)
        lines = []
        header = []
        # NOTE(review): `lines` already excludes the header, so subtracting
        # NUM_HEADER_LINES again asks for slightly more structures than
        # strictly needed; kept as in the original since the surplus is
        # truncated below.
        while len(lines) - NUM_HEADER_LINES < numberStructures:
            with open(Input, 'r') as stdinFile, \
                    open(output, 'wb') as stdoutFile, \
                    open(os.devnull, 'w') as stderrFile:
                subprocess.call(Command, stdin=stdinFile, stdout=stdoutFile,
                                stderr=stderrFile)
            with open(output, 'r') as f:
                nlines = f.readlines()
            sampled = nlines[NUM_HEADER_LINES:]
            if not sampled:
                # No progress is possible: the same command would be retried
                # endlessly (e.g. RNAsubopt failed or rejected its input).
                raise RuntimeError(
                    "RNAsubopt produced no structures for condition '%s' "
                    "(input: %s)" % (filename, Input))
            header = nlines[:NUM_HEADER_LINES]
            lines += sampled

        with open(output, 'w') as f:
            f.writelines(header + lines[:numberStructures])
        progress.EndTask()
    return outputDir
Exemple #4
0
        ProbingConditions = [RNAName + state for state in conf.Conditions]

        # Specify  whether to generate new sample or use a previously  generated one
        OutputSamples = os.path.join(conf.OutputFolder, "tmp",
                                     'OutputSamples') + conf.SampleSize
        if str.lower(
                conf.Sampling) == "true" or not os.path.isdir(OutputSamples):
            progress.StartTask("Sampling %s structures for each condition" %
                               (conf.SampleSize))
            OutputSamples = SP.StructSampling(
                [conf.PathConstraintsFile, conf.PathConstraintsFileShape],
                ProbingConditions, int(conf.SampleSize), conf.Temperature,
                conf.m, conf.b, conf.RNA)
            progress.EndTask()
        else:
            progress.Print("Using existing sample")

        progress.Print("Probing conditions: %s" % (ProbingConditions))
        # Create a global file that contains structures sampled from the list of Probing conditions
        FF.MergeFiles(OutputSamples, os.path.join(OutputSamples,
                                                  'Samples.txt'),
                      ProbingConditions, SP.NUM_HEADER_LINES)

        # Create a distance matrix file
        progress.StartTask("Computing dissimilarity matrix")
        SVMlFile = "DissimilarityMatrix" + conf.SampleSize
        # Calculate distance and identify redundant structures within the same condition
        SF.DistanceStruct(os.path.join(OutputSamples, 'Samples.txt'), SVMlFile,
                          int(conf.SampleSize), ProbingConditions)
        progress.EndTask()
Exemple #5
0
def DefineNumberCluster(SVMLMatrix, Redundant, method, DM, BoltzmanFactor,
                        Probingconditions, rna):
    """Choose a number of clusters by clustering with increasing counts.

    Starting from a single cluster, the data is re-clustered with 2, 3, ...
    clusters (up to 20). After each run the search stops as soon as either:

    * at least one value of the ``Intradistance`` result returned by
      ``CT.CentroidBycluster`` is <= ``epsilon``, or
    * every "probable" centroid of the previous clustering (cumulated
      Boltzmann value >= 30% of ``CumulBE[1][0]``) has a centroid in the new
      clustering within base-pair distance ``epsilon`` (this also holds when
      there are no probable centroids).

    When the search stops, the selected clustering is pickled to
    ``"Clusters" + method + ".pkl"`` through ``FF.PickleVariable``.

    Args:
        SVMLMatrix: Data passed to ``CL.MiniBatchKMeans``.
        Redundant: Unused by this function.
        method: Suffix used in the pickle file name.
        DM: Unused by this function.
        BoltzmanFactor: Forwarded to ``CT.CentroidBycluster``.
        Probingconditions: Forwarded to ``CT.CentroidBycluster`` and
            ``CumulatedBoltzmannsbyCluster``.
        rna: Forwarded to ``CT.CentroidBycluster``.

    Returns:
        ``(clusters, centroids)`` for the last cluster count that was tried.
    """
    conf = loadConfig()

    epsilon = 1  # Centroid base pair distance threshold
    maxClusters = 20  # Largest number of clusters that is tried
    probableFraction = 0.3  # Share of CumulBE[1][0] that makes a cluster "probable"

    sampleSize = int(conf.SampleSize)
    samplesFile = os.path.join(conf.OutputFolder, "tmp",
                               'OutputSamples' + str(conf.SampleSize),
                               'Samples.txt')

    # Plain dicts: every key that is read below has been assigned before, so
    # a missing entry now fails loudly instead of silently yielding an empty
    # placeholder.
    Clust = {}
    CumulBE = {}
    Centroids = {}

    # Initialization step
    progress.StartTask("Initialization step")
    Clust[1] = CL.MiniBatchKMeans(SVMLMatrix, 1)
    Centroids[1], BZ, _, _, _, _ = CT.CentroidBycluster(
        Clust[1], samplesFile, BoltzmanFactor, sampleSize, Probingconditions,
        rna)
    CumulBE[1] = CumulatedBoltzmannsbyCluster(Clust[1], BZ, sampleSize,
                                              Probingconditions)
    progress.EndTask()

    # Threshold above which a cluster of the previous clustering is
    # considered probable; it only depends on the single-cluster run.
    Bzmepsilon = probableFraction * CumulBE[1][0]

    for nb in range(2, maxClusters + 1):
        progress.StartTask("Clustering with %s clusters" % nb)
        progress.StartTask("Run MBKM")
        Clust[nb] = CL.MiniBatchKMeans(SVMLMatrix, nb)
        progress.EndTask()
        Centroids[nb], BZ, _, _, _, Intradistance = CT.CentroidBycluster(
            Clust[nb], samplesFile, BoltzmanFactor, sampleSize,
            Probingconditions, rna)
        CumulBE[nb] = CumulatedBoltzmannsbyCluster(Clust[nb], BZ, sampleSize,
                                                   Probingconditions)

        # For every probable centroid of the previous clustering, record the
        # base-pair distance to its closest centroid in the new clustering.
        BP_All_probable_centroids = []
        for elem1 in Centroids[nb - 1].keys():
            previousPairs = SF.BasePairsFromStruct(Centroids[nb - 1][elem1])
            minima = min(
                SF.DistanceTwoStructs(
                    previousPairs,
                    SF.BasePairsFromStruct(Centroids[nb][elem2]))
                for elem2 in Centroids[nb].keys())
            if CumulBE[nb - 1][elem1] >= Bzmepsilon:
                BP_All_probable_centroids.append(minima)
        progress.EndTask()

        intradistanceReached = any(elem <= epsilon for elem in Intradistance)
        if intradistanceReached or all(distance <= epsilon for distance in
                                       BP_All_probable_centroids):
            # Store the clustering that is actually selected and returned.
            # The previous code pickled an entry of a separate dict that was
            # only ever filled for a single cluster, i.e. an empty placeholder.
            FF.PickleVariable(Clust[nb], "Clusters" + method + ".pkl")
            progress.Print("Choosing %s as the optimal number of clusters" %
                           nb)
            break

    # NOTE(review): when no criterion is met, the clustering with the largest
    # tried count is returned and no pickle file is written - confirm that
    # this is intended.
    return Clust[nb], Centroids[nb]