Exemple #1
0
def DistanceStruct(StructFile, SVMlFile, numberofsruct, MFESnbrstruct,
                   constrainte):
    """Build the pairwise distance matrix and flag redundant structures.

    The structures are read with ``GetBasePairsFromStructFile`` and every
    pair is compared with ``DistanceTwoBPlist``. A structure ``j`` that is at
    distance 0 from an earlier structure ``i`` is recorded as redundant (once)
    and the structure count of its condition is decremented.

    Args:
        StructFile: Path handed to ``GetBasePairsFromStructFile``.
        SVMlFile: Name of the matrix file written under ``output/``.
        numberofsruct: Structure count assigned to every condition except
            the last one; also used to map a global structure index to its
            condition.
        MFESnbrstruct: Structure count assigned to the last condition.
        constrainte: Sequence of condition names; the last entry is treated
            separately (it receives ``MFESnbrstruct``).

    Returns:
        Always 0. The results are written to the matrix file and pickled via
        ``FF.PickleVariable`` into ``conf.PickledData``.
    """
    Redondantestructure = defaultdict(aa)
    MatDist = defaultdict(aa)
    Redondantestructure1 = []  # pickled as a list, in order of discovery
    seen_redundant = set()  # O(1) membership test for the list above

    last_condition = len(constrainte) - 1
    # Global indices above this value belong to the last condition.
    boundary = numberofsruct * last_condition

    Dicnumberofsruct = dict.fromkeys(constrainte[:-1], numberofsruct)
    Dicnumberofsruct[constrainte[last_condition]] = MFESnbrstruct

    nb, DicStruct = GetBasePairsFromStructFile(StructFile)

    for i in range(nb):
        for j in range(i + 1, nb):
            dist = DistanceTwoBPlist(DicStruct[i], DicStruct[j])
            MatDist[i][j] = dist
            MatDist[j][i] = dist

            if dist == 0 and j not in seen_redundant:
                if j > boundary:
                    ConditionNumber = last_condition
                else:
                    ConditionNumber = int(j // numberofsruct)
                Dicnumberofsruct[constrainte[ConditionNumber]] -= 1
                seen_redundant.add(j)
                Redondantestructure1.append(j)
                # Index of the structure inside its own condition.
                StructureNumber = j - ConditionNumber * numberofsruct
                Redondantestructure[
                    constrainte[ConditionNumber]][StructureNumber] = 1

    # Store the distance matrix: one line per structure, 1-based indices,
    # "index:distance" entries separated by tabs, diagonal omitted.
    size = len(MatDist)
    with open(os.path.join("output", SVMlFile), "w") as o:
        for i in range(size):
            o.write("%i\t" % (i + 1))
            for j in range(size):
                if i != j:
                    o.write("%i:%.4f\t" % (j + 1, MatDist[i][j]))
            o.write("\n")

    # The original compared the defaultdict itself with 0, which is always
    # true; warn only when redundant structures were actually found.
    if Redondantestructure1:
        print("Warning! redundant structures")

    FF.PickleVariable(MatDist, os.path.join(conf.PickledData,
                                            "dissmatrix.pkl"))
    FF.PickleVariable(
        Redondantestructure1,
        os.path.join(conf.PickledData, "Redondantestructures.pkl"))
    FF.PickleVariable(
        Redondantestructure,
        os.path.join(conf.PickledData, "Redondantestructures_Id.pkl"))
    FF.PickleVariable(Dicnumberofsruct,
                      os.path.join(conf.PickledData, "Dicnumberofsruct.pkl"))
    return 0
Exemple #2
0
def Boltzmann_Calc(constraintes, StructfileRepos, numberofsruct, MFESnbrstruct,
                   rna, Redondantestructure):
    """Compute Boltzmann factors and per-condition normalised weights.

    For each condition the structure energies are obtained with
    ``ENERGY_VALUES_STRUCTURES`` and converted with ``BoltzmannEnergy``. The
    partition function of a condition is the sum of the factors of its
    non-redundant structures. Conditional weights (factor / partition
    function, 0 for redundant structures) are computed for every condition
    except the last one and pickled together with the partition functions.

    Args:
        constraintes: Sequence of condition names; the last entry is skipped
            when computing conditional weights.
        StructfileRepos: Directory holding one structure file per condition.
        numberofsruct: Number of structures per non-"MFES" condition.
        MFESnbrstruct: Not used by this function.
            NOTE(review): the "MFES" branch hard-codes 2 structures; confirm
            whether this parameter was meant to be used there.
        rna: Passed through to ``ENERGY_VALUES_STRUCTURES``.
        Redondantestructure: Mapping condition -> structure index -> flag,
            where 0 means "not redundant".

    Returns:
        The ``Boltzman`` mapping of condition -> Boltzmann factors.
    """
    Energy = defaultdict(aa)
    Boltzman = defaultdict(aa)
    ConditionalBoltzman = defaultdict(aa)
    ZBolzman = defaultdict(aa)

    # Energy values of the structures present in each condition.
    for Condition in constraintes:
        FileStructure = os.path.join(StructfileRepos, Condition)
        Energy[Condition] = ENERGY_VALUES_STRUCTURES(FileStructure, rna)

    for Condition in constraintes:
        if Condition == "MFES":
            Boltzman[Condition] = [
                BoltzmannEnergy(Energy[Condition][i]) for i in range(2)
            ]
            # Previously this branch left the accumulator untouched, so the
            # MFES partition function was the previous condition's sum (or a
            # NameError when MFES came first). Use the MFES factors instead.
            non_redundant = list(Boltzman[Condition])
        else:
            non_redundant = []
            for i in range(numberofsruct):
                factor = BoltzmannEnergy(Energy[Condition][i])
                Boltzman[Condition][i] = factor
                if Redondantestructure[Condition][i] == 0:
                    non_redundant.append(factor)

        ZBolzman[Condition] = sum(non_redundant)  # Partition function

    for Condition in constraintes[:-1]:  # the last condition is not counted
        # Redundant structures keep a slot with weight 0 so that every
        # condition yields exactly numberofsruct entries.
        ConditionalBoltzman[Condition] = [
            BoltzmannEnergy(Energy[Condition][i]) / ZBolzman[Condition]
            if Redondantestructure[Condition][i] == 0 else 0
            for i in range(numberofsruct)
        ]

    FF.PickleVariable(
        ConditionalBoltzman,
        os.path.join(conf.PickledData, "ConditionalBoltzman.pkl"))
    FF.PickleVariable(ZBolzman, os.path.join(conf.PickledData, "ZBolzman.pkl"))

    return Boltzman
Exemple #3
0
def DistanceStruct(StructFile, SVMlFile, numberofsruct, constrainte):
    """Build the pairwise dissimilarity matrix and flag redundant structures.

    Structures are read with ``GetBasePairsFromStructFile`` and compared
    pairwise with ``DistanceTwoStructs``. A structure ``j`` at distance 0
    from an earlier structure ``i`` of the same condition is recorded as
    redundant (once) and the structure count of that condition is
    decremented.

    Args:
        StructFile: Path handed to ``GetBasePairsFromStructFile``.
        SVMlFile: File name of the matrix written under
            ``<conf.OutputFolder>/tmp``.
        numberofsruct: Number of structures per condition; used to map a
            global structure index to its condition.
        constrainte: Sequence of condition names.

    Returns:
        Always 0. Results are written to the matrix file and pickled via
        ``FF.PickleVariable``.
    """
    conf = loadConfig()
    Redondantestructure = defaultdict(aa)
    MatDist = defaultdict(aa)
    Redondantestructure1 = {}  # redundant global index -> condition index
    Dicnumberofsruct = dict.fromkeys(constrainte, numberofsruct)

    nb, DicStruct = GetBasePairsFromStructFile(StructFile)

    progress.StartTask("Dissimilarity Loop")
    for i in range(nb):
        for j in range(i + 1, nb):
            dist = DistanceTwoStructs(DicStruct[i], DicStruct[j])
            MatDist[i][j] = dist
            MatDist[j][i] = dist

            # Check for redundancy.
            if dist == 0 and j not in Redondantestructure1:
                jconstraint = int(j / numberofsruct)
                # Only count duplicates that belong to the same probing
                # condition as the structure they duplicate.
                if int(i / numberofsruct) == jconstraint:
                    Dicnumberofsruct[constrainte[jconstraint]] -= 1
                    Redondantestructure1[j] = jconstraint
    progress.EndTask()

    progress.StartTask("Export dissimilarity matrix")
    for elem, jconstraint in Redondantestructure1.items():
        StructureNumber = elem - jconstraint * numberofsruct
        # Redundant structures are marked with the value 1.
        Redondantestructure[constrainte[jconstraint]][StructureNumber] = 1

    # Store the distance matrix: one line per structure, 1-based indices,
    # "index:distance" entries separated by tabs, diagonal omitted.
    SVMLFullPath = os.path.join(conf.OutputFolder, "tmp", SVMlFile)
    if os.path.isfile(SVMLFullPath):
        os.remove(SVMLFullPath)  # To clean the previous version
    size = len(MatDist)
    # The context manager closes the file even if a write fails.
    with open(SVMLFullPath, "w") as o:
        for i in range(size):
            o.write("%i\t" % (i + 1))
            for j in range(size):
                if i != j:
                    o.write("%i:%.4f\t" % (j + 1, MatDist[i][j]))
            o.write("\n")
    progress.EndTask()

    progress.StartTask("Pickle all data")
    FF.PickleVariable(MatDist, "dissmatrix.pkl")
    FF.PickleVariable(list(Redondantestructure1.keys()),
                      "Redondantestructures.pkl")
    FF.PickleVariable(Redondantestructure, "Redondantestructures_Id.pkl")
    FF.PickleVariable(Dicnumberofsruct, "Dicnumberofsruct.pkl")
    progress.EndTask()
    return 0
Exemple #4
0
def Boltzmann_Calc(constraintes, StructfileRepository, NumStructures, rna,
                   Redondantestructure):
    """Compute normalised Boltzmann weights for every condition.

    Energies come from ``EvalStructuresEnergies`` and are converted with
    ``BoltzmannFactor``. For each condition the partition function is the sum
    of the factors of the non-redundant structures (flag 0 in
    ``Redondantestructure``); the conditional weight of a structure is its
    factor divided by that sum, or 0. when the structure is redundant.

    The factors, the conditional weights and the partition functions are
    pickled with ``FF.PickleVariable``; the conditional weights are returned.
    """
    Energy = defaultdict(aa)
    Boltzman = defaultdict(aa)
    ConditionalBoltzman = defaultdict(aa)
    ZBolzman = defaultdict(aa)

    # Pass 1: energies of the structures sampled in each condition.
    for cond in constraintes:
        sample_file = os.path.join(StructfileRepository, cond)
        Energy[cond] = EvalStructuresEnergies(sample_file, rna)

    # Pass 2: Boltzmann factors and the partition function, which counts a
    # single copy of every structure.
    for cond in constraintes:
        kept = []
        for idx in range(NumStructures):
            weight = BoltzmannFactor(Energy[cond][idx])
            Boltzman[cond][idx] = weight
            if Redondantestructure[cond][idx] == 0:
                kept.append(weight)
        ZBolzman[cond] = sum(kept)

    # Pass 3: conditional weights; redundant structures get a null weight.
    for cond in constraintes:
        ConditionalBoltzman[cond] = [
            Boltzman[cond][idx] / ZBolzman[cond]
            if Redondantestructure[cond][idx] == 0 else 0.
            for idx in range(NumStructures)
        ]

    FF.PickleVariable(Boltzman, "Boltzman.pkl")
    FF.PickleVariable(ConditionalBoltzman, "ConditionalBoltzman.pkl")
    FF.PickleVariable(ZBolzman, "ZBolzman.pkl")

    return ConditionalBoltzman
Exemple #5
0
def AffinityPropagation(SVMLMatrix, Redundant):
    """Cluster the structures described by an svmlight file and pickle them.

    NOTE(review): despite its name, the function currently fits
    ``cluster.MiniBatchKMeans`` with 6 clusters.

    Args:
        SVMLMatrix: Path passed to ``load_svmlight_file``.
        Redundant: Passed to ``FilterCluster`` to remove redundant members
            from each cluster.

    Returns:
        Always 0. The mapping label -> filtered members (1-based row
        indices) is pickled to ``Clusters_Aff_Prop.pkl`` in
        ``conf.PickledData``.
    """
    # The default factory must be a named module-level function (not a
    # lambda or a nested function), otherwise the defaultdict cannot be
    # pickled.
    clusters = defaultdict(a)

    # load_svmlight_file already returns an (X, y) pair; wrapping it in
    # np.array() was unnecessary. The targets are not used here.
    X, _ = load_svmlight_file(SVMLMatrix)

    algorithm = cluster.MiniBatchKMeans(n_clusters=6)
    algorithm.fit(X)
    if hasattr(algorithm, 'labels_'):
        # np.int was an alias of the builtin int and no longer exists in
        # recent NumPy releases.
        y_pred = algorithm.labels_.astype(int)
    else:
        y_pred = algorithm.predict(X)

    for index, label in enumerate(y_pred, start=1):
        clusters[label].append(index)

    # Eliminate redundancy.
    for label in clusters:
        clusters[label] = FilterCluster(clusters[label], Redundant)

    FF.PickleVariable(
        clusters, os.path.join(conf.PickledData, "Clusters_Aff_Prop.pkl"))
    return 0
Exemple #6
0
def DefineNumberCluster(SVMLMatrix, Redundant, method, DM, BoltzmanFactor,
                        Probingconditions, rna):
    """Search for the number of clusters (2 to 20) to retain.

    A single-cluster run provides the reference cumulated Boltzmann weight.
    Then, for ``nb`` = 2..20, the data is clustered with
    ``CL.MiniBatchKMeans`` and each centroid of the ``nb - 1`` run is matched
    to its closest centroid of the ``nb`` run. The search stops at the first
    ``nb`` for which either

    * some value returned as intra-distance by ``CT.CentroidBycluster`` is
      at most ``epsilon``, or
    * every previous centroid whose cumulated weight is at least 30% of the
      reference weight has a centroid at distance at most ``epsilon`` in the
      new run.

    On such a stop the clustering is pickled as ``"Clusters<method>.pkl"``.
    If no ``nb`` satisfies a criterion, nothing is pickled and the
    20-cluster result is returned.

    Args:
        SVMLMatrix: Dissimilarity matrix input for ``CL.MiniBatchKMeans``.
        Redundant: Not used by this function.
        method: Suffix used in the pickle file name.
        DM: Not used by this function.
        BoltzmanFactor: Passed through to ``CT.CentroidBycluster``.
        Probingconditions: Passed through to the centroid and cumulated
            weight helpers.
        rna: Passed through to ``CT.CentroidBycluster``.

    Returns:
        Tuple ``(clusters, centroids)`` of the last evaluated ``nb``.
    """
    conf = loadConfig()

    epsilon = 1  # Centroid base pair distance threshold

    sample_size = int(conf.SampleSize)
    samples_path = os.path.join(conf.OutputFolder, "tmp",
                                'OutputSamples' + str(conf.SampleSize),
                                'Samples.txt')

    # Every key is assigned before it is read, so plain dicts suffice.
    Clust = {}
    CumulBE = {}
    Centroids = {}

    progress.StartTask("Initialization step")
    Clust[1] = CL.MiniBatchKMeans(SVMLMatrix, 1)
    Centroids[1], BZ, _, _, _, _ = CT.CentroidBycluster(
        Clust[1], samples_path, BoltzmanFactor, sample_size,
        Probingconditions, rna)
    CumulBE[1] = CumulatedBoltzmannsbyCluster(Clust[1], BZ, sample_size,
                                              Probingconditions)
    progress.EndTask()

    # Minimum cumulated weight for a centroid to count as "probable";
    # it only depends on the single-cluster run, so compute it once.
    Bzmepsilon = 0.3 * CumulBE[1][0]

    for nb in range(2, 21):
        progress.StartTask("Clustering with %s clusters" % nb)
        progress.StartTask("Run MBKM")
        Clust[nb] = CL.MiniBatchKMeans(SVMLMatrix, nb)
        progress.EndTask()
        Centroids[nb], BZ, _, _, _, Intradistance = CT.CentroidBycluster(
            Clust[nb], samples_path, BoltzmanFactor, sample_size,
            Probingconditions, rna)
        CumulBE[nb] = CumulatedBoltzmannsbyCluster(Clust[nb], BZ, sample_size,
                                                   Probingconditions)

        # For each previous centroid: distance to the closest new centroid
        # and the cumulated weights of both clusters.
        lista = []
        for elem1 in Centroids[nb - 1].keys():
            rep = [(elem2,
                    SF.DistanceTwoStructs(
                        SF.BasePairsFromStruct(Centroids[nb - 1][elem1]),
                        SF.BasePairsFromStruct(Centroids[nb][elem2])))
                   for elem2 in Centroids[nb].keys()]
            # min() keeps the first candidate on ties, as before.
            pos, minima = min(rep, key=lambda item: item[1])

            l1 = CumulBE[nb - 1][elem1]
            l2 = CumulBE[nb][pos]
            lista.append((minima, (l1, l2, l1 - l2)))

        # Criterion on the existence of probable clusters.
        BP_All_probable_centroids = [
            BPdist for BPdist, Bzmandist in lista if Bzmandist[0] >= Bzmepsilon
        ]
        progress.EndTask()

        intradistance_reached = any(elem <= epsilon for elem in Intradistance)
        centroids_stable = all(distance <= epsilon
                               for distance in BP_All_probable_centroids)
        if intradistance_reached or centroids_stable:
            # The original pickled Cluster[nb], which is only ever filled
            # for nb == 1; the clustering for this nb lives in Clust[nb].
            FF.PickleVariable(Clust[nb], "Clusters" + method + ".pkl")
            progress.Print("Choosing %s as the optimal number of clusters" %
                           nb)
            break

    # For the entire clusters while keeping redundancy.
    return Clust[nb], Centroids[nb]