Example #1
def DistanceStruct(StructFile, SVMlFile, numberofsruct, constrainte):
    conf = loadConfig()
    Redondantestructure = defaultdict(aa)
    MatDist = defaultdict(aa)
    Redondantestructure1 = {}
    Dicnumberofsruct = {}

    for i in range(len(constrainte)):
        Dicnumberofsruct[constrainte[i]] = numberofsruct

    nb, DicStruct = GetBasePairsFromStructFile(StructFile)

    progress.StartTask("Dissimilarity Loop")
    for i in range(0, nb):
        for j in range(i + 1, nb):
            MatDist[i][j] = DistanceTwoStructs(DicStruct[i], DicStruct[j])
            MatDist[j][i] = MatDist[i][j]

            ####### Check for redundancy
            if MatDist[i][j] == 0:
                jconstraint = int(j / numberofsruct)
                # Ensure the redundant structure belongs to the same probing condition
                if j not in Redondantestructure1 and int(i / numberofsruct) == jconstraint:
                    Dicnumberofsruct[constrainte[jconstraint]] -= 1
                    Redondantestructure1[j] = jconstraint
    progress.EndTask()

    progress.StartTask("Export dissimilarity matrix")
    for elem in Redondantestructure1:
        jconstraint = Redondantestructure1[elem]
        StructureNumber = elem - jconstraint * numberofsruct
        # Mark redundant structures with value 1
        Redondantestructure[constrainte[jconstraint]][StructureNumber] = 1

    # Store the distance matrix in SVMlFile
    SVMLFullPath = os.path.join(conf.OutputFolder, "tmp", SVMlFile)
    if os.path.isfile(SVMLFullPath):
        os.remove(SVMLFullPath)  # Clean up the previous version
    with open(SVMLFullPath, "w") as o:
        for i in range(len(MatDist)):
            o.write("%i\t" % (i + 1))
            for j in range(len(MatDist)):
                if i != j:
                    o.write("%i:%.4f\t" % (j + 1, MatDist[i][j]))
            o.write("\n")
    progress.EndTask()

    progress.StartTask("Pickle all data")
    FF.PickleVariable(MatDist, "dissmatrix.pkl")
    FF.PickleVariable(list(Redondantestructure1.keys()),
                      "Redondantestructures.pkl")
    FF.PickleVariable(Redondantestructure, "Redondantestructures_Id.pkl")
    FF.PickleVariable(Dicnumberofsruct, "Dicnumberofsruct.pkl")
    progress.EndTask()
    return 0
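
The redundancy bookkeeping above assumes that the sampled structures are concatenated block by block, one block of numberofsruct structures per probing condition, so structure index j maps to condition j // numberofsruct. A minimal sketch of that indexing convention, with hypothetical condition names:

# Sketch of the index-to-condition convention assumed by DistanceStruct:
# structures are stored in consecutive blocks of `numberofsruct` per condition.
numberofsruct = 1000                        # toy sample size per condition
constrainte = ["RNAMg", "RNA1M7"]           # hypothetical condition names

def condition_of(structure_index):
    """Return the probing condition that owns a global structure index."""
    return constrainte[structure_index // numberofsruct]

assert condition_of(0) == "RNAMg"           # first structure of the first block
assert condition_of(1999) == "RNA1M7"       # last structure of the second block
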
Example #2
def CentroidBycluster(clusters, StructFile, Boltzmann, numberofsruct, constrainte, rna):
    progress.StartTask("Computing centroids")
    dim_clustering = len(clusters)
    E = defaultdict()
    mycentroid = defaultdict()
    Intradistance = []
    centroids = defaultdict(lambda: defaultdict())
    Myproba = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0)))

    ListStructures = [SF.BasePairsFromStruct(Struct) for Struct in FF.Parsefile(StructFile)]

    progress.StartTask("Gathering base pairs")
    ListBPbystructure, ListBP, Myproba, Boltzmancluster = BasePairsbyCluster(clusters, ListStructures, Boltzmann,
                                                                             numberofsruct,
                                                                             constrainte)
    # Drop clusters that contain only a single structure
    ListDiameters, Listeliminated_clusters = ClustersDiameter(clusters, ListBPbystructure)
    for elem in Listeliminated_clusters:
        del clusters[elem]
    progress.EndTask()

    progress.StartTask("Computing cluster distance distribution")
    E = ClustersDistances(clusters, Boltzmann, ListBPbystructure, numberofsruct, constrainte)
    progress.EndTask()
    progress.StartTask("Computing MEA centroids")
    for ClusterNumber in clusters:
        mycentroid[ClusterNumber], centroids[ClusterNumber] = MEA(Myproba[ClusterNumber], rna)
    progress.EndTask()

    MatriceDistanceCentroids = np.zeros([dim_clustering, dim_clustering])
    MatriceDistanceClustersEucld = np.zeros([dim_clustering, dim_clustering])
    for ClusterNumber in clusters.keys():
        for ClusterNumber2 in clusters.keys():
            if ClusterNumber2 > ClusterNumber:
                l = SF.DistanceTwoStructs(centroids[ClusterNumber], centroids[ClusterNumber2])
                #print "BP_centoid_distances", "\t", ClusterNumber, "\t", ClusterNumber2, "\t", l
                Intradistance.append(l)
                # print "distance between clusters comparing the centroide's distances",l
                MatriceDistanceCentroids[ClusterNumber][ClusterNumber2] = l
                MatriceDistanceCentroids[ClusterNumber2][ClusterNumber] = l
                # print "distance between clusters comparing the means distances", ClusterNumber, ClusterNumber2, np.abs(E[ClusterNumber]-E[ClusterNumber2]),np.sqrt(abs(pow(E[ClusterNumber],2)-pow(E[ClusterNumber2],2)))
                # print E
                l = np.sqrt(abs(pow(E[ClusterNumber], 2) - pow(E[ClusterNumber2], 2)))
                MatriceDistanceClustersEucld[ClusterNumber][ClusterNumber2] = l
                MatriceDistanceClustersEucld[ClusterNumber2][ClusterNumber] = l
            # print "distance between clusters compring the centroide's distances", ClusterNumber, ClusterNumber2, DistanceTwoBPlist(ListBPbystrcut[ClusterNumber][listCentroidStructure[ClusterNumber][0]],ListBPbystrcut[ClusterNumber2][listCentroidStructure[ClusterNumber2][0]])
    # VT.plotDistanceClusters(MatriceDistanceCentroids, clusters, "blue", " Base pair distance between centroids")
    # VT.plotDistanceClusters(MatriceDistanceClustersEucld, clusters, "red", "Eucledian distance between structures")
    #print "BZ_distance_btw_clusters", "\t", E
    progress.EndTask()
    return mycentroid, Boltzmancluster, E, MatriceDistanceCentroids, ListDiameters, Intradistance
Example #3
def ClustersDiameter(clusters, BPStructs):
    progress.StartTask("Computing cluster diameters")
    eliminated_clusters = []
    lista = []

    for ClusterNumber in clusters:
        if len(clusters[ClusterNumber]) > 1:  # cluster contains more than one structure
            d = max([SF.DistanceTwoStructs(BPStructs[ClusterNumber][structure1],
                                           BPStructs[ClusterNumber][structure2])
                     for structure1 in clusters[ClusterNumber] for structure2 in clusters[ClusterNumber]])
        else:
            d = 0
            eliminated_clusters.append(ClusterNumber)
        lista.append(d)
    progress.EndTask()
    return lista, eliminated_clusters
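
As a point of reference, the diameter computed above is simply the largest pairwise distance inside a cluster. A minimal, self-contained sketch of the same computation, assuming SF.DistanceTwoStructs amounts to the size of the symmetric difference of two base-pair sets (bp_distance below is a stand-in helper):

# Toy diameter computation; bp_distance is a stand-in for SF.DistanceTwoStructs,
# assumed here to count base pairs present in one structure but not in the other.
def bp_distance(bps1, bps2):
    return len(set(bps1) ^ set(bps2))

cluster = [[(1, 10), (2, 9)],               # toy structures as base-pair lists
           [(1, 10), (3, 8)],
           [(2, 9), (3, 8), (4, 7)]]
diameter = max(bp_distance(s1, s2) for s1 in cluster for s2 in cluster)
print(diameter)                             # 3 for these toy structures
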
Example #4
def StructSampling(Pathconstraints, Conditions, numberStructures, T, m, b, defaultFasta):
    conf = loadConfig()
    dir = os.path.join(conf.OutputFolder, "tmp", 'OutputSamples' + str(numberStructures))
    FF.CreateFold(dir)
    thermoMsgShown = False
    for filename in Conditions:
        lines = []
        header = []
        progress.StartTask("Processing %s"%(filename))
        while len(lines) - NUM_HEADER_LINES < numberStructures:

            # If an alternative sequence file is found in a constraints folder, use it instead of the default
            Input = defaultFasta
            for p in Pathconstraints:
                tmpInput = os.path.join(p, filename + '.' + IPANEMAP.FASTA_EXTENSION)
                if os.path.isfile(tmpInput):
                    Input = tmpInput

            output = os.path.join(dir, filename)
            Command = 'RNAsubopt -p ' + str(numberStructures) + ' -s -T ' + str(T)

            (hasHardConstraints, hasSoftConstraints) = (False, False)

            hardConstraintFile = os.path.join(conf.PathConstraintsFile, filename + '.txt')
            if os.path.isfile(hardConstraintFile):
                Command += ' -C --enforceConstraint '
                hasHardConstraints = True
                Input = hardConstraintFile

            ShapeFile = os.path.join(conf.PathConstraintsFileShape, filename + '.txt')
            if os.path.isfile(ShapeFile):
                Command += ' --shape ' + ShapeFile + ' --shapeMethod="Dm' + str(m) + 'b' + str(b) + '"'
                hasSoftConstraints = True

            if not (hasHardConstraints or hasSoftConstraints or thermoMsgShown):
                progress.Print("Warning: Did not find suitable constraint file for this condition, using purely thermodynamic sampling")
                thermoMsgShown = True
            subprocess.call(Command, stdin=open(Input, 'r'), stdout=open(output, 'wb'),
                            stderr=open(os.devnull, 'w'), shell=True)
            with open(output, 'r') as f:
                nlines = f.readlines()
                header = nlines[:NUM_HEADER_LINES]
                lines += nlines[NUM_HEADER_LINES:]
        with open(output, 'w') as f:
            f.writelines(header+lines[:numberStructures])
        progress.EndTask()
    return dir
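
To make the command construction concrete, the string handed to subprocess.call for a condition that has both a hard-constraint file and a SHAPE reactivity file would look roughly like the sketch below (toy parameter values and hypothetical paths; RNAsubopt is ViennaRNA's Boltzmann sampler):

# Rough sketch of the RNAsubopt command assembled above for one condition
# (toy values; the paths are hypothetical).
numberStructures, T, m, b = 1000, 37.0, 1.9, -0.7
ShapeFile = "constraints/shape/Condition1.txt"
Command = ('RNAsubopt -p ' + str(numberStructures) + ' -s -T ' + str(T)
           + ' -C --enforceConstraint'
           + ' --shape ' + ShapeFile
           + ' --shapeMethod="Dm' + str(m) + 'b' + str(b) + '"')
print(Command)
# RNAsubopt -p 1000 -s -T 37.0 -C --enforceConstraint --shape constraints/shape/Condition1.txt --shapeMethod="Dm1.9b-0.7"
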
Example #5
        # Get probing conditions for the treated RNA
        ProbingConditions = [RNAName + state for state in conf.Conditions]

        # Decide whether to generate a new sample or reuse a previously generated one
        OutputSamples = os.path.join(conf.OutputFolder, "tmp",
                                     'OutputSamples') + conf.SampleSize
        if conf.Sampling.lower() == "true" or not os.path.isdir(OutputSamples):
            progress.StartTask("Sampling %s structures for each condition" %
                               (conf.SampleSize))
            OutputSamples = SP.StructSampling(
                [conf.PathConstraintsFile, conf.PathConstraintsFileShape],
                ProbingConditions, int(conf.SampleSize), conf.Temperature,
                conf.m, conf.b, conf.RNA)
            progress.EndTask()
        else:
            progress.Print("Using existing sample")

        progress.Print("Probing conditions: %s" % (ProbingConditions))
        # Create a global file that contains structures sampled from the list of Probing conditions
        FF.MergeFiles(OutputSamples, os.path.join(OutputSamples,
                                                  'Samples.txt'),
                      ProbingConditions, SP.NUM_HEADER_LINES)

        # Create a distance matrix file
        progress.StartTask("Computing dissimilarity matrix")
        SVMlFile = "DissimilarityMatrix" + conf.SampleSize
        # Compute pairwise distances and identify redundant structures within the same condition
        SF.DistanceStruct(os.path.join(OutputSamples, 'Samples.txt'), SVMlFile,
                          int(conf.SampleSize), ProbingConditions)
Example #6
def DefineNumberCluster(SVMLMatrix, Redundant, method, DM, BoltzmanFactor,
                        Probingconditions, rna):
    conf = loadConfig()

    epsilon = 1  # Centroid base-pair distance threshold

    Cluster = defaultdict(lambda: defaultdict(CL.a))
    Clust = defaultdict(lambda: defaultdict(CL.a))
    CumulBE = defaultdict(lambda: defaultdict(CL.a))
    Centroids = defaultdict(lambda: defaultdict(CL.a))

    progress.StartTask("Initialization step")
    # Initialization step
    Cluster[1] = CL.MiniBatchKMeans(SVMLMatrix, 1)
    Centroids[1], BZ, X, Y, Z, IntradistanceStop = CT.CentroidBycluster(
        Cluster[1],
        os.path.join(conf.OutputFolder, "tmp",
                     'OutputSamples' + str(conf.SampleSize), 'Samples.txt'),
        BoltzmanFactor, int(conf.SampleSize), Probingconditions, rna)
    CumulBE[1] = CumulatedBoltzmannsbyCluster(Cluster[1], BZ,
                                              int(conf.SampleSize),
                                              Probingconditions)
    #print  "***************************************verification bz", "Cluster  \t Centroids  \t CumulBE \t ", Centroids[1], CumulBE[1]
    progress.EndTask()
    for nb in range(2, 21):
        progress.StartTask("Clustering with %s clusters" % nb)
        progress.StartTask("Run MBKM")
        Clust[nb] = CL.MiniBatchKMeans(SVMLMatrix, nb)
        progress.EndTask()
        Centroids[nb], BZ, X, Y, Z, Intradistance = CT.CentroidBycluster(
            Clust[nb],
            os.path.join(conf.OutputFolder, "tmp",
                         'OutputSamples' + str(conf.SampleSize),
                         'Samples.txt'), BoltzmanFactor, int(conf.SampleSize),
            Probingconditions, rna)
        CumulBE[nb] = CumulatedBoltzmannsbyCluster(Clust[nb], BZ,
                                                   int(conf.SampleSize),
                                                   Probingconditions)

        lista = []
        '''
        ####***************************************************First crierion:
        if len([ elem for elem in IntradistanceStop if elem <= epsilon_intradist ] )!=0:
            print "************************************* Clustering done with ", nb ," as the optimal number of clusters using the first criterion  intradistance*********************************************************************"
            break
        # ************************************* second criterion
        '''
        for elem1 in Centroids[nb - 1].keys():
            rep = []
            '''
            print "distance to all elements"
            print "Ref \t i \t i+1 \t BPdist \t CumulatedBz i \t CumulatedBz i+1 \t  CumulatedBzdist"
            '''
            for elem2 in Centroids[nb].keys():
                rep.append(
                    (elem2,
                     SF.DistanceTwoStructs(
                         SF.BasePairsFromStruct(Centroids[nb - 1][elem1]),
                         SF.BasePairsFromStruct(Centroids[nb][elem2]))))

            minima = np.min([item[1] for item in rep])
            pos = [elem[0] for elem in rep if elem[1] == minima][0]

            l1 = CumulBE[nb - 1][elem1]
            l2 = CumulBE[nb][pos]
            # print "what s wrong!", l1,l2
            Dist = l1 - l2

            lista.append((minima, (l1, l2, Dist)))
        ########## The new criterion is about the existence of a probable cluster
        Bzmepsilon = 0.3 * CumulBE[1][0]

        BP_All_probable_centroids = [
            BPdist for BPdist, Bzmandist in lista if Bzmandist[0] >= Bzmepsilon
        ]
        progress.EndTask()

        if (len([elem for elem in Intradistance if elem <= epsilon]) != 0
                or len([
                    distance for distance in BP_All_probable_centroids
                    if distance <= epsilon
                ]) == len(BP_All_probable_centroids)):
            FF.PickleVariable(Clust[nb], "Clusters" + method + ".pkl")
            progress.Print("Choosing %s as the optimal number of clusters" %
                           nb)
            break

        # Clustering over the entire set of structures, redundancy kept
    return Clust[nb], Centroids[nb]
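
The break condition above can be read as a single stopping test: stop at nb clusters as soon as two centroids of the current clustering lie within epsilon base pairs of each other, or every "probable" centroid of the previous clustering (cumulated Boltzmann weight of at least 30% of the one-cluster weight) has a nearest new centroid at most epsilon base pairs away. A compact, hypothetical restatement of that test:

# Compact restatement of the stopping test used above (hypothetical helper).
# `intradistance` holds pairwise centroid distances for the current clustering;
# `lista` holds (BP distance to nearest new centroid, (prev weight, new weight, diff)).
def should_stop(intradistance, lista, one_cluster_weight, epsilon=1):
    bz_epsilon = 0.3 * one_cluster_weight
    probable = [bp for bp, bz in lista if bz[0] >= bz_epsilon]
    centroids_too_close = any(d <= epsilon for d in intradistance)
    probable_centroids_stable = all(bp <= epsilon for bp in probable)
    return centroids_too_close or probable_centroids_stable
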