Esempio n. 1
0
def DistanceStruct(StructFile, SVMlFile, numberofsruct, constrainte):
    """Build, export and pickle the pairwise dissimilarity matrix of structures.

    Every pair of structures read from ``StructFile`` is compared with
    ``DistanceTwoStructs``.  A structure ``j`` lying at distance 0 from an
    earlier structure ``i`` of the same probing condition is recorded as
    redundant (once), and the per-condition structure counter is decremented.
    The matrix is then written as text under ``<OutputFolder>/tmp/<SVMlFile>``
    and the intermediate results are stored through ``FF.PickleVariable``.

    Args:
        StructFile: path handed to ``GetBasePairsFromStructFile``.
        SVMlFile: name of the exported matrix file (created in the ``tmp``
            sub-folder of the configured output folder).
        numberofsruct: number of structures per probing condition; structure
            ``k`` is attributed to condition index ``k // numberofsruct``.
        constrainte: indexable collection of probing-condition names.

    Returns:
        0, always.
    """
    conf = loadConfig()
    Redondantestructure = defaultdict(aa)
    MatDist = defaultdict(aa)
    Redondantestructure1 = {}  # redundant structure index -> condition index
    # Every condition starts with the full number of sampled structures.
    Dicnumberofsruct = {condition: numberofsruct for condition in constrainte}

    nb, DicStruct = GetBasePairsFromStructFile(StructFile)

    progress.StartTask("Dissimilarity Loop")
    for i in range(nb):
        for j in range(i + 1, nb):
            distance = DistanceTwoStructs(DicStruct[i], DicStruct[j])
            # The matrix is symmetric: fill both triangles.
            MatDist[i][j] = distance
            MatDist[j][i] = distance

            # Redundancy check: identical structures are only counted once,
            # and only when both belong to the same probing condition.
            if distance == 0 and j not in Redondantestructure1:
                jconstraint = int(j // numberofsruct)
                if int(i // numberofsruct) == jconstraint:
                    Dicnumberofsruct[constrainte[jconstraint]] -= 1
                    Redondantestructure1[j] = jconstraint
    progress.EndTask()

    progress.StartTask("Export dissimilarity matrix")
    for elem, jconstraint in Redondantestructure1.items():
        # Position of the structure inside its own condition.
        StructureNumber = elem - jconstraint * numberofsruct
        # Redundant structures are marked with the value 1.
        Redondantestructure[constrainte[jconstraint]][StructureNumber] = 1

    # Store the distance matrix in the SVML file.
    SVMLFullPath = os.path.join(conf.OutputFolder, "tmp", SVMlFile)
    if os.path.isfile(SVMLFullPath):
        os.remove(SVMLFullPath)  # clean the previous version
    size = len(MatDist)
    # The context manager guarantees the handle is closed even if a write fails.
    with open(SVMLFullPath, "w") as o:
        for i in range(size):
            # One row per structure: 1-based row id, then "col:value" entries
            # for every other structure (the diagonal is skipped).
            o.write("%i\t" % (i + 1))
            for j in range(size):
                if i != j:
                    o.write("%i:%.4f\t" % (j + 1, MatDist[i][j]))
            o.write("\n")
    progress.EndTask()

    progress.StartTask("Pickle all data")
    FF.PickleVariable(MatDist, "dissmatrix.pkl")
    FF.PickleVariable(list(Redondantestructure1.keys()),
                      "Redondantestructures.pkl")
    FF.PickleVariable(Redondantestructure, "Redondantestructures_Id.pkl")
    FF.PickleVariable(Dicnumberofsruct, "Dicnumberofsruct.pkl")
    progress.EndTask()
    return 0
Esempio n. 2
0
def drawStructure(Sequence, Structure, Shapefile, OutFile):
    """Draw a secondary structure by invoking the VARNA command-line tool.

    Args:
        Sequence: sequence string passed to VARNA as ``-sequenceDBN``.
        Structure: structure string passed to VARNA as ``-structureDBN``.
        Shapefile: path to a reactivity file.  When the file exists, the
            values returned by ``FF.parseReactivityfile`` are passed to VARNA
            as a ``-colorMap`` (followed by the ``COLOR_MAP`` options).
        OutFile: output path passed to VARNA with ``-o``.

    Returns:
        None.  VARNA's stdout and stderr are both written to
        ``<OutputFolder>/tmp/varnamsg.txt``.
    """
    conf = CF.loadConfig()
    cmopt = ""
    if os.path.isfile(Shapefile):
        vals = FF.parseReactivityfile(Shapefile)
        cmopt = ' -colorMap "' + ";".join(["%.3f" % float(v) for v in vals]) + '"' + COLOR_MAP
    dummyout = os.path.join(conf.OutputFolder, "tmp", "varnamsg.txt")
    # NOTE(review): the command is assembled as a shell string and the
    # arguments are not escaped; paths containing spaces or quotes will break.
    cmd = 'java -cp VARNAv3-93.jar fr.orsay.lri.varna.applications.VARNAcmd  -bpStyle simple -sequenceDBN "%s" -structureDBN "%s" '%(Sequence, Structure) + cmopt + ' -algorithm line -o ' + OutFile
    # A single handle receives both streams: opening the same file twice made
    # stdout and stderr overwrite each other and leaked both handles.
    with open(dummyout, 'wb') as sink:
        subprocess.call(cmd, stdin=None, stdout=sink,
                        stderr=subprocess.STDOUT, shell=True)
Esempio n. 3
0
def RunEval(InputFile):
    """Run ``RNAeval`` on ``InputFile`` and return the reported energy values.

    The command output is redirected to ``<OutputFolder>/tmp/energyvalues``
    and read back with ``FF.Parsefile``.  Every second line, starting at index
    1, is taken as a result line.

    Args:
        InputFile: path of the file fed to ``RNAeval`` on standard input.

    Returns:
        A list of strings, one per result line, in file order.
    """
    # Launch the RNAeval command through the shell with I/O redirection.
    conf = loadConfig()
    energiesFile = os.path.join(conf.OutputFolder, "tmp", "energyvalues")
    os.system('RNAeval <' + InputFile + '>' + energiesFile)

    # Parse the RNAeval output to extract the energy values.
    lines = FF.Parsefile(energiesFile)
    # On each result line, everything after the FIRST space is the energy
    # field; [1:-2] drops its first character and its last two characters
    # (presumably the surrounding parentheses and the line terminator --
    # TODO confirm against FF.Parsefile).
    # ``range`` replaces the Python-2-only ``xrange``.
    return [
        lines[i].split(" ", 1)[1][1:-2]
        for i in range(1, len(lines), 2)
    ]
Esempio n. 4
0
def StructSampling(Pathconstraints, Conditions, numberStructures, T, m, b, defaultFasta):
    """Sample structures with ``RNAsubopt`` for every probing condition.

    For each condition name a sample file ``<OutputFolder>/tmp/
    OutputSamples<numberStructures>/<condition>`` is produced.  ``RNAsubopt``
    is re-run until enough structure lines were collected; the file is finally
    rewritten with the header lines followed by the first ``numberStructures``
    structure lines.

    Args:
        Pathconstraints: folders searched for ``<condition>.<FASTA_EXTENSION>``;
            when several contain the file, the last one found is used.
        Conditions: iterable of condition names.
        numberStructures: number of structures requested (``-p``).
        T: value passed to ``RNAsubopt -T``.
        m: slope used to build the ``--shapeMethod="Dm<m>b<b>"`` option.
        b: intercept used to build the ``--shapeMethod="Dm<m>b<b>"`` option.
        defaultFasta: input file used when no condition-specific file exists.

    Returns:
        The path of the folder holding the sample files.

    Raises:
        RuntimeError: if a run of ``RNAsubopt`` yields no structure line
            (the original code looped forever in that situation).
    """
    conf = loadConfig()
    sample_dir = os.path.join(conf.OutputFolder, "tmp", 'OutputSamples' + str(numberStructures))
    FF.CreateFold(sample_dir)
    thermoMsgShown = False
    for filename in Conditions:
        progress.StartTask("Processing %s"%(filename))

        # Everything below up to the sampling loop does not change between
        # runs, so it is computed once per condition.

        # If an alternative sequence file is found in a constraints folder,
        # use it rather than the default.
        Input = defaultFasta
        for p in Pathconstraints:
            tmpInput = os.path.join(p, filename + '.' + IPANEMAP.FASTA_EXTENSION)
            if os.path.isfile(tmpInput):
                Input = tmpInput

        output = os.path.join(sample_dir, filename)
        Command = 'RNAsubopt  -p ' + str(numberStructures) + ' -s -T ' + str(T)

        hardConstraintFile = os.path.join(conf.PathConstraintsFile, filename + '.txt')
        hasHardConstraints = os.path.isfile(hardConstraintFile)
        if hasHardConstraints:
            Command += ' -C --enforceConstraint '
            # The hard-constraint file replaces the sequence file as input.
            Input = hardConstraintFile

        ShapeFile = os.path.join(conf.PathConstraintsFileShape, filename + '.txt')
        hasSoftConstraints = os.path.isfile(ShapeFile)
        if hasSoftConstraints:
            Command += ' --shape ' + ShapeFile + ' --shapeMethod="Dm' + str(m) + 'b' + str(b) + '"'

        # The warning is only shown once for the whole call.
        if not (hasHardConstraints or hasSoftConstraints or thermoMsgShown):
            progress.Print("Warning: Did not find suitable constraint file for this condition, using purely thermodynamic sampling")
            thermoMsgShown = True

        lines = []
        header = []
        # NOTE(review): ``lines`` already excludes the header lines, so
        # subtracting NUM_HEADER_LINES again requires that many extra
        # structure lines before stopping -- confirm whether this is intended.
        while len(lines) - NUM_HEADER_LINES < numberStructures:
            # Context managers close the three handles after every run.
            with open(Input, 'r') as fin, open(output, 'wb') as fout, \
                    open(os.devnull, 'w') as ferr:
                subprocess.call(Command, stdin=fin, stdout=fout,
                                stderr=ferr, shell=True)
            with open(output, 'r') as f:
                nlines = f.readlines()
            header = nlines[:NUM_HEADER_LINES]
            sampled = nlines[NUM_HEADER_LINES:]
            if not sampled:
                # Without new structures the loop condition can never change.
                raise RuntimeError(
                    "RNAsubopt produced no structures for condition %s "
                    "(command: %s)" % (filename, Command))
            lines += sampled
        with open(output, 'w') as f:
            f.writelines(header + lines[:numberStructures])
        progress.EndTask()
    return sample_dir
Esempio n. 5
0
def EvalStructuresEnergies(StructFile, rna):
    """Return the energies computed by ``RunEval`` for ``StructFile``.

    The RNAeval input is first generated by ``FromStructFiletoRNAEvalInput``
    into ``<OutputFolder>/tmp/InputRNAeval``; that file is then handed to
    ``RunEval`` and its result is returned unchanged.

    Args:
        StructFile: structure file forwarded to ``FromStructFiletoRNAEvalInput``.
        rna: forwarded to ``FromStructFiletoRNAEvalInput`` as third argument.
    """
    tmp_folder = os.path.join(loadConfig().OutputFolder, "tmp")
    rnaeval_input = os.path.join(tmp_folder, "InputRNAeval")

    # Step 1: write the RNAeval input file.
    FromStructFiletoRNAEvalInput(StructFile, rnaeval_input, rna)

    # Step 2: evaluate it.
    energies = RunEval(rnaeval_input)
    return energies
Esempio n. 6
0
import sys
from collections import defaultdict
from conf import loadConfig, Logger
import FileFunctions as FF
import Sampling as SP
import StructureFunctions as SF
import StructureFunctions as SF
import VisualizationTools as VT
import ClustersTrait as CT
import Optimize_clustering as OC
from Progress import progress

FASTA_EXTENSION = "fa"

if __name__ == "__main__":
    conf = loadConfig()

    # Create folders
    FF.CreateFold(conf.OutputFolder)
    FF.CreateFold(os.path.join(conf.OutputFolder, "tmp"))
    FF.CreateFold(os.path.join(conf.OutputFolder, "tmp", conf.PickledData))

    # Redirects all the print to the output Log file
    sys.stdout = Logger(os.path.join(conf.OutputFolder, conf.OutputLogfile))

    # ******************************** Generate sample

    try:
        rna = os.path.split(conf.RNA)[-1]
        RNAName = rna[:-(len(FASTA_EXTENSION) + 1)]
        progress.StartTask("Processing RNA %s" % (RNAName))
Esempio n. 7
0
def DefineNumberCluster(SVMLMatrix, Redundant, method, DM, BoltzmanFactor,
                        Probingconditions, rna):
    """Choose the number of clusters by iterating MiniBatchKMeans from 2 to 20.

    A clustering with one cluster is computed first as a reference.  Then, for
    ``nb`` in 2..20, the clustering with ``nb`` clusters is computed and the
    search stops as soon as one of the two criteria holds:

    * at least one value of the sixth result of ``CT.CentroidBycluster``
      (``Intradistance``) is ``<= epsilon``; or
    * every centroid of the ``nb - 1`` clustering whose cumulated Boltzmann
      value is at least ``0.3 * CumulBE[1][0]`` has its closest centroid in
      the ``nb`` clustering at a base-pair distance ``<= epsilon`` (this is
      trivially true when no centroid reaches the threshold).

    When a criterion holds, the selected clustering is pickled as
    ``"Clusters" + method + ".pkl"``.  If neither ever holds, nothing is
    pickled and the results for ``nb == 20`` are returned.

    Args:
        SVMLMatrix: input handed to ``CL.MiniBatchKMeans``.
        Redundant: unused by this function.
        method: suffix used in the name of the pickled cluster file.
        DM: unused by this function.
        BoltzmanFactor: forwarded to ``CT.CentroidBycluster``.
        Probingconditions: forwarded to ``CT.CentroidBycluster`` and
            ``CumulatedBoltzmannsbyCluster``.
        rna: forwarded to ``CT.CentroidBycluster``.

    Returns:
        Tuple ``(clusters, centroids)`` for the last ``nb`` evaluated.
    """
    conf = loadConfig()

    epsilon = 1  # Centroid base pair distance threshold

    # Loop-invariant values, computed once.
    sample_size = int(conf.SampleSize)
    samples_file = os.path.join(conf.OutputFolder, "tmp",
                                'OutputSamples' + str(conf.SampleSize),
                                'Samples.txt')

    Clust = {}      # number of clusters -> clustering
    CumulBE = {}    # number of clusters -> cumulated Boltzmann values
    Centroids = {}  # number of clusters -> centroid structures

    # Initialization step: reference clustering with a single cluster.
    progress.StartTask("Initialization step")
    Clust[1] = CL.MiniBatchKMeans(SVMLMatrix, 1)
    Centroids[1], BZ, _, _, _, _ = CT.CentroidBycluster(
        Clust[1], samples_file, BoltzmanFactor, sample_size,
        Probingconditions, rna)
    CumulBE[1] = CumulatedBoltzmannsbyCluster(Clust[1], BZ, sample_size,
                                              Probingconditions)
    progress.EndTask()

    # A centroid is considered "probable" when its cumulated Boltzmann value
    # reaches 30% of the one of the single reference cluster.
    Bzmepsilon = 0.3 * CumulBE[1][0]

    for nb in range(2, 21):
        progress.StartTask("Clustering with %s clusters" % nb)
        progress.StartTask("Run MBKM")
        Clust[nb] = CL.MiniBatchKMeans(SVMLMatrix, nb)
        progress.EndTask()
        Centroids[nb], BZ, _, _, _, Intradistance = CT.CentroidBycluster(
            Clust[nb], samples_file, BoltzmanFactor, sample_size,
            Probingconditions, rna)
        CumulBE[nb] = CumulatedBoltzmannsbyCluster(Clust[nb], BZ, sample_size,
                                                   Probingconditions)

        # For every probable centroid of the previous clustering, keep the
        # base-pair distance to its closest centroid in the new clustering.
        BP_All_probable_centroids = []
        for elem1 in Centroids[nb - 1]:
            previous_pairs = SF.BasePairsFromStruct(Centroids[nb - 1][elem1])
            minima = min(
                SF.DistanceTwoStructs(
                    previous_pairs,
                    SF.BasePairsFromStruct(Centroids[nb][elem2]))
                for elem2 in Centroids[nb])
            if CumulBE[nb - 1][elem1] >= Bzmepsilon:
                BP_All_probable_centroids.append(minima)
        progress.EndTask()

        if (any(elem <= epsilon for elem in Intradistance)
                or all(distance <= epsilon
                       for distance in BP_All_probable_centroids)):
            # Pickle the clustering that was actually selected.  The previous
            # code pickled a separate container that was only ever filled for
            # one cluster, i.e. an empty default for nb >= 2.
            FF.PickleVariable(Clust[nb], "Clusters" + method + ".pkl")
            progress.Print("Choosing %s as the optimal number of clusters" %
                           nb)
            break

    # For the entire clusters while keeping redundancy.
    return Clust[nb], Centroids[nb]