コード例 #1
0
ファイル: actions.py プロジェクト: kuredatan/taxoclassifier
def userNodeSelectionAct(dataArray):
    print dataArray[1]
    metadata = parseList(raw_input("Input the metadata that will cluster the set of samples among those written above. [ e.g. " + dataArray[1][0] + ";" + dataArray[1][-1] + " ]\n"))
    isInDatabase(metadata,dataArray[1])
    #@dataArray[2] = idSequences is a dictionary of (key=identifier,value=(name,rank of node))
    listofNodes = dataArray[2].values()
    nodesList = parseListNode(raw_input("Choose the group of nodes you want to consider exclusively. [ Read the taxonomic tree to help you: e.g. " + sanitizeNode(listofNodes[-3]) + ";" + sanitizeNode(listofNodes[1]) + ";" + sanitizeNode(listofNodes[-1]) + " ]\n"))
    isInDatabase(nodesList,listofNodes)
    numberofSamples = len(dataArray[0])
    numberStartingSamples = sanitize(raw_input("Knowing there is/are " + str(numberofSamples) + "sample(s), how many samples do you want to create the training set? \n"))
    x = integer.match(numberStartingSamples)
    if not x or (x and (int(numberStartingSamples) > numberofSamples)):
        print "\n/!\ ERROR: You should write down an integer less or equal to",numberofSamples,"."
        raise ValueError
    numberStartingSamples = int(numberStartingSamples)
    #@shape for @assignedClasses is the same than the one for @classes
    assignedClasses,classes,valueSets = classifyIt(dataArray,metadata,nodesList,numberStartingSamples)
    numberClass = classes.lenMDL()
    youdenJ = countYouden(assignedClasses,classes,numberofSamples)
    interpretIt(youdenJ)
    answer = raw_input("Do you want to plot the classes obtained as a pie chart? Y/N\n")
    if answer == "Y":
        labels = [ "Metadata: " + str(metadata) + ", Values for each metadatum: " + str([ valueSet for valueSet in valueSets]) ]
        percentagesAs = assignedClasses.mapMDL(len)
        percentages = classes.mapMDL(len)
        plotPie(labels,percentagesAs,"Assignments depending on " + str(nodesList) + " to class for metadata " + str(metadata))
        plotPie(labels,percentages,"Real classes depending on " + str(nodesList) + " for metadata " + str(metadata))
    elif not (answer == "N"):
        print "\n Answer by Y or N!"
    answer = raw_input("Do you want to save the results? Y/N \n")
    if (answer == "Y"):
        writeFile("Youden's J statistic for this classification is: " + str(youdenJ) + "\n","Assignments depending on " + listNodes(nodesList) + " to classes for metadata " + str(metadata))
    elif not (answer == "N"):
        print "\n Answer by Y or N!"
    return assignedClasses,youdenJ
コード例 #2
0
def userNodeSelectionAct(dataArray):
    print dataArray[1]
    metadatum = sanitize(raw_input("Input the metadatum that will cluster the set of samples among those written above. [ e.g. " + dataArray[1][0] + " ]\n")).split(";")[0]
    isInDatabase([metadatum],dataArray[1])
    listnodes = dataArray[3].values()
    nodesList = parseListNode(raw_input("Choose the group of nodes you want to consider exclusively. [ Read the taxonomic tree to help you: e.g. " + sanitizeNode(listnodes[-3]) + ";" + sanitizeNode(listnodes[1]) + ";" + sanitizeNode(listnodes[-1]) + " ]\n"))
    isInDatabase(nodesList,listnodes)
    numberSamples = len(dataArray[0])
    numberStartingSamples = sanitize(raw_input("Knowing there is/are " + str(numberSamples) + " sample(s), how many samples do you want to create the training set?\n"))
    x = integer.match(numberStartingSamples)
    if not x or (x and int(numberStartingSamples) > numberSamples) :
        print "\n/!\ ERROR: You should write down an integer inferior or equal to ",numberSamples,"."
        raise ValueError
    numberStartingSamples = int(numberStartingSamples)
    assignedClasses,classes,valueSet = classifyIt(dataArray,metadatum,nodesList,numberStartingSamples)
    numberClass = len(classes)
    youdenJ = countYouden(assignedClasses,classes,numberStartingSamples)
    interpretIt(youdenJ)
    answer = raw_input("Do you want to plot the classes obtained as a pie chart? Y/N")
    if answer == "Y":
        labels = [ metadatum + " = " + str(value) for value in valueSet ]
        percentagesAs = [ len(class1) for class1 in assignedClasses ]
        percentages = [ len(class1) for class1 in classes ]
        plotPieChart(labels,percentagesAs,"Assignments depending on " + str(nodesList) + " to class for metadatum " + metadatum)
        plotPieChart(labels,percentages,"Real classes depending on " + str(nodesList) + " for metadatum " + metadatum)
    elif not (answer == "N"):
        print "\n Answer by Y or N!"
    answer = raw_input("Do you want to save the results? Y/N")
    if (answer == "Y"):
        writeFile("Youden's J statistic for this classification is: " + str(youdenJ) + "\n","Assignments depending on " + listNodes(nodesList) + " to classes for metadatum " + metadatum)
    elif not (answer == "N"):
        print "\n Answer by Y or N!"
    return assignedClasses,youdenJ
コード例 #3
0
ファイル: actions.py プロジェクト: cbib/taxotree
def distanceAct(dataArray):
    answer = raw_input("Import matrix? Y/N\n")
    if answer == "Y":
        filename = raw_input("Write down the file name where the matrix is stored [ without the extension .taxotree ].\n")
        matrix = importMatrix(filename)
    else:
        if not (answer == "N"):
            print "/!\ You should answer 'Y' or 'N'!"
        print "/!\ Computing similarity matrix..."
        print "[ You may have to wait for a few minutes... ]"
        matrix = computeSimilarity(dataArray)
        print "[Preview.]"
        print matrix
        answer = raw_input("Save the results? Y/N\n")
        if (answer == "Y"):  
            writeFile(m,"Similarity coefficients between patients using previous calculi on total ratio, pattern ratio and diversity coefficient\n\nNota Bene: 1e+14 stands for +inf\n","array")
        elif not (answer == "N"):
            print "\n/!\ You should answer 'Y' or 'N'!"
    answer = raw_input("Compute the most different groups of samples? Y/N\n")
    if (answer == "Y"):
        answer = raw_input("Do you want to select samples by metadata or to select all samples? metadata/all")
        if (answer == "metadata"):
            print dataArray[1]
            metadatum = parseList(raw_input("Choose the metadatum among those printed above [ e.g. " + dataArray[1][0] + ";" + dataArray[1][-1] + " ]\n"))
            isInDatabase(metadatum,dataArray[1])
            _,valueSampleMetadatum = partitionSampleByMetadatumValue(metadatum[0],dataArray[1],dataArray[0])
            valueSampleMetadatumNameOnly = []
            for sampleGroup in valueSampleMetadatum:
                sampleGroupNameOnly = []
                for sample in sampleGroup:
                    sampleGroupNameOnly.append(sample[0])
                valueSampleMetadatumNameOnly.append(sampleGroupNameOnly)
            pairsList = mostDifferentSamplesGroups(matrix,dataArray[8],valueSampleMetadatumNameOnly)
        if (answer == "all"):
            pairsList = mostDifferentSamplesGroups(matrix,dataArray[8],[[sample] for sample in dataArray[8]])
        else:
            print "\n/!\ ERROR: You should answer 'metadata' or 'all'."
            raise ValueError
        print "[ Preview. ]"
        print "List of the pairs of most different sample groups according to the similarity coefficients computed:"
        for pair in pairsList:
            print pair
        answer2 = raw_input("\nSave the results? Y/N\n")
        if (answer2 == "Y"):
            stringPairs = ""
            for pair in pairsList:
                stringPairs += "*" + str(pair) + "\n"
            if (answer == "metadata"):
                stringSamples = ""
                for group in valueSampleMetadatumNameOnly:
                    stringSamples += "*" + str(group) + "\n"
                data = "Most different groups of samples ****\nsorted by values of metadatum: " + metadatum[0] + "\nGroups were:\n\n" + stringSamples + "\n\nAnd the most different ones are:\n\n" + stringPairs + "\n\nEND OF FILE ****"
            else:
                data = "Most different groups of samples ****\n\nThe most different ones are:\n\n" + stringPairs + "\n\nEND OF FILE ****"
            writeFile(data,"","text")
        elif not (answer2 == "N"):
            print "/!\ You should answer 'Y' or 'N'!"
コード例 #4
0
def userNodeSelectionAct(dataArray):
    print dataArray[1]
    metadatum = sanitize(
        raw_input(
            "Input the metadatum that will cluster the set of samples among those written above. [ e.g. "
            + dataArray[1][0] + " ]\n")).split(";")[0]
    isInDatabase([metadatum], dataArray[1])
    listnodes = dataArray[3].values()
    nodesList = parseListNode(
        raw_input(
            "Choose the group of nodes you want to consider exclusively. [ Read the taxonomic tree to help you: e.g. "
            + sanitizeNode(listnodes[-3]) + ";" + sanitizeNode(listnodes[1]) +
            ";" + sanitizeNode(listnodes[-1]) + " ]\n"))
    isInDatabase(nodesList, listnodes)
    numberSamples = len(dataArray[0])
    numberStartingSamples = sanitize(
        raw_input(
            "Knowing there is/are " + str(numberSamples) +
            " sample(s), how many samples do you want to create the training set?\n"
        ))
    x = integer.match(numberStartingSamples)
    if not x or (x and int(numberStartingSamples) > numberSamples):
        print "\n/!\ ERROR: You should write down an integer inferior or equal to ", numberSamples, "."
        raise ValueError
    numberStartingSamples = int(numberStartingSamples)
    assignedClasses, classes, valueSet = classifyIt(dataArray, metadatum,
                                                    nodesList,
                                                    numberStartingSamples)
    numberClass = len(classes)
    youdenJ = countYouden(assignedClasses, classes, numberStartingSamples)
    interpretIt(youdenJ)
    answer = raw_input(
        "Do you want to plot the classes obtained as a pie chart? Y/N")
    if answer == "Y":
        labels = [metadatum + " = " + str(value) for value in valueSet]
        percentagesAs = [len(class1) for class1 in assignedClasses]
        percentages = [len(class1) for class1 in classes]
        plotPieChart(
            labels, percentagesAs, "Assignments depending on " +
            str(nodesList) + " to class for metadatum " + metadatum)
        plotPieChart(
            labels, percentages, "Real classes depending on " +
            str(nodesList) + " for metadatum " + metadatum)
    elif not (answer == "N"):
        print "\n Answer by Y or N!"
    answer = raw_input("Do you want to save the results? Y/N")
    if (answer == "Y"):
        writeFile(
            "Youden's J statistic for this classification is: " +
            str(youdenJ) + "\n", "Assignments depending on " +
            listNodes(nodesList) + " to classes for metadatum " + metadatum)
    elif not (answer == "N"):
        print "\n Answer by Y or N!"
    return assignedClasses, youdenJ
コード例 #5
0
def randomSubSamplingAct(dataArray):
    print dataArray[1]
    metadatum = sanitize(raw_input("Input the metadatum that will cluster the set of samples among those written above. [ e.g. " + dataArray[1][0] + " ]\n")).split(";")[0]
    isInDatabase([metadatum],dataArray[1])
    s = raw_input("Input the number s of random samplings.\n")
    n = raw_input("Input the number n of nodes to select at each try.\n")
    if not integer.match(s) or not integer.match(n):
        print "\n/!\ ERROR: s and n must both be integers."
        raise ValueError
    numberSamples = len(dataArray[0])
    numberStartingSamples = sanitize(raw_input("Knowing there is/are " + str(numberSamples) + "sample(s), how many samples do you want to create the training set? \n"))
    x = integer.match(numberStartingSamples)
    if not x or (x and int(numberStartingSamples) > numberSamples):
        print "\n/!\ ERROR: You should write down an integer."
        raise ValueError
    numberStartingSamples = int(numberStartingSamples)
    listnodes = dataArray[3].values()
    s,n = int(s),int(n)
    #Here the set of classes is a list of two lists containing the samples in C and not C
    bestClassification = []
    bestClassesList = []
    #Worse value for this coefficient
    currBestYouden = inf
    nodesNumber = len(dataArray[3])
    while s:
        #Randomly draw n distinct nodes among the nodes in the taxonomic tree
        nodesList = randomChoice(listnodes,n)
        assignedClasses,classes,valueSet = classifyIt(dataArray,metadatum,nodesList,numberStartingSamples)
        numberClass = len(classes)
        youdenJ = countYouden(assignedClasses,classes,numberSamples)
        res = numberClass - youdenJ
        if min(res,currBestYouden) == res:
            bestClassification = []
            for i in nodesList:
                bestClassification.append(i)
            currBestYouden = res
            bestClassesList = []
            for i in assignedClasses:
                bestClassesList.append(i)
        s -= 1
    interpretIt(numberClass - currBestYouden)
    if answer == "Y":
        percentagesAs = [ len(class1) for class1 in assignedClasses ]
        labels = [ metadatum + " = " + str(value) for value in valueSet ]
        percentages = [ len(class1) for class1 in classes ]
        plotPieChart(labels,percentagesAs,"Assignments depending on " + listNodes(nodesList) + " to class for metadatum " + metadatum)
        plotPieChart(labels,percentages,"Real classes depending on " + listNodes(nodesList) + " for metadatum " + metadatum)
    answer = raw_input("Do you want to save the results? Y/N")
    if (answer == "Y"):
        writeFile("Best Youden's J statistic for this classification is: " + str(numberClass - currBestYouden) + "\nand most relevant list of nodes for this metadatum is:" + str(bestClassification),"Assignments to classes for metadatum " + metadatum)
    elif not (answer == "N"):
        print "\n Answer by Y or N!"
    return bestClassification,(numberClass - currBestYouden),bestClassesList
コード例 #6
0
ファイル: actions.py プロジェクト: cbib/taxotree
def similarityAct(dataArray,iMatrix):
    print dataArray[1]
    metadataList = parseList(raw_input("Input the list of metadata you want to consider among those written above. [ e.g. " + dataArray[1][0] + ";" + dataArray[1][-1] + " ]\n"))
    isInDatabase(metadataList,dataArray[1])
    print "/!\ Computing similarity matrix..."
    m = similarity(dataArray[0],dataArray[1],metadataList)
    print "[Preview.]"
    print m
    answer = raw_input("Save the results? Y/N\n")
    if (answer == "Y"):  
        writeFile(m,"Similarity coefficients between patients for file meta/" + iMatrix + ".csv:\n" + listNodes(dataArray[8]),"array")
    elif not (answer == "N"):
        print "/!\ You should answer 'Y' or 'N'!"
    return m
コード例 #7
0
ファイル: actions.py プロジェクト: kuredatan/kruskaltree
def runAct(dataArray):
    print "Choosing the list of samples."
    #or use partition by metadatum values
    sampleNameList,metadataList,interval1List,interval2List = createSampleNameList(dataArray)
    n = len(sampleNameList)
    print "\nAVAILABLE COMPARISON FUNCTION(S):"
    fctF = printComparison()
    f = raw_input("\nChoose your comparison function above those printed above.\n")
    isInDatabase([f],fctF)
    completeGraph = Graph(n).constructComplete(sampleNameList,dataArray[7],f)
    superTree,w = kruskal(completeGraph)
    #Constructing distance matrix
    matrix = np.zeros((n,n))
    print "\nAVAILABLE DISTANCE FUNCTION(S):"
    fctD = printDistance()
    d = raw_input("\nChoose your distance function above those printed above.\n")
    isInDatabase([d],fctD)
    valueArray = []
    print "\nSUPERTREE of weight:",w
    print superTree.vertices
    print superTree.edges
    for i in range(n):
        for j in range(i,n):
            #matrix is symmetric (distance)
            s = applyFctD(d,superTree,i,j)
            matrix[i][j] = s
            matrix[j][i] = s
            valueArray.append(s)
    valueArray = sorted(valueArray)
    valueNumber = n*n/2
    quartile3 = valueNumber*3/4
    valueQuartile = valueArray[quartile3]
    mostDifferent = []
    #Distance is symmetric
    for i in range(n):
        for j in range(i+1,n):
            if matrix[i][j] >= valueQuartile:
                mostDifferent.append((sampleNameList[i],sampleNameList[j]))
    print "\nRESULTING MATRIX:"
    print matrix
    print "\n---\nMost different samples groups from:\n"
    for sampleGroup in sampleNameList:
        print sampleGroup
    print "\nare:\n"
    print mostDifferent
    print "\n--- END OF DISPLAY\n"
コード例 #8
0
ファイル: actions.py プロジェクト: kuredatan/kruskaltree
def createSampleNameList(dataArray):
    """Ask the user for one or several lists of samples.

    Each list is either typed sample by sample ('one') or computed from
    bounds on metadata values ('matching') through computeSamplesInGroup.

    dataArray -- indexable container; dataArray[8] is the list of sample
        IDs, dataArray[0] and dataArray[1] are used for the 'matching' mode.
    Returns (sampleNameList, metadataList, interval1List, interval2List);
        the last three hold the values of the latest 'matching' selection
        and stay empty lists if that mode was never used.
    Raises ValueError on any invalid answer.
    """
    metadataList,interval1List,interval2List = [],[],[]
    sampleIDList = dataArray[8]
    typedNumber = raw_input("/!\ How many different lists of samples do you want?\n")
    if not integer.match(typedNumber):
        print "\n/!\ ERROR: You need to enter a integer here!"
        raise ValueError
    numberList = int(typedNumber)
    if numberList < 1:
        print "\n/!\ ERROR: Empty set of lists of samples!"
        raise ValueError
    sampleNameList = []
    for _ in range(numberList):
        mode = raw_input("Do you want to select samples one by one, or to select samples matching requirements on metadata? one/matching \n")
        if mode not in ("one","matching"):
            print "\n/!\ ERROR: You need to answer either 'one' or 'matching' and not: \"",mode,"\"."
            raise ValueError
        if mode == "one":
            #The prompt needs two sample IDs to build its example
            if len(sampleIDList) < 2:
                print "\n/!\ ERROR: List of samples is empty or only of length one!..."
                raise ValueError
            print sampleIDList
            currentList = parseList(raw_input("Input the list of samples using the ID printed above. [e.g. " + sampleIDList[0] + ";"+ sampleIDList[1] + " ]\n"))
        else:
            print dataArray[1]
            metadataList = parseList(raw_input("Input the list of metadata you want to consider among those written above. [ e.g. " + dataArray[1][0] + ";" + dataArray[1][-1] + " ]\n"))
            isInDatabase(metadataList,dataArray[1])
            interval1List = parseIntList(raw_input("Input the list of lower interval bounds corresponding to metadatum/metadata above. [ Please refer to README for more details. e.g. 1;2 ]\n"))
            if len(interval1List) != len(metadataList):
                print "\n/!\ ERROR: You need to enter the same number of lower bounds than of metadata!"
                raise ValueError
            interval2List = parseIntList(raw_input("Input the list of upper interval bounds corresponding to metadatum/metadata above. [ Please refer to README for more details. e.g. 3;2 ]\n"))
            if len(interval2List) != len(metadataList):
                print "\n/!\ ERROR: You need to enter the same number of upper bounds than of metadata!"
                raise ValueError
            #Only the first element returned by computeSamplesInGroup is kept
            currentList = computeSamplesInGroup(dataArray[0],dataArray[1],metadataList,interval1List,interval2List)[0]
        isInDatabase(currentList,sampleIDList)
        sampleNameList.append(currentList)
    return sampleNameList,metadataList,interval1List,interval2List
コード例 #9
0
ファイル: actions.py プロジェクト: cbib/taxotree
def percentageAct(dataArray):
    """Compute, preview and optionally save the percentage of assignments to a group of nodes.

    The user chooses whether the nodes are roots of subtrees or bacterias,
    the group of nodes (checked against dataArray[6]) and the lists of
    samples (through createSampleNameList).

    dataArray -- indexable container; indices 0, 1, 2, 3, 6 and 7 are read here.
    Returns (result, nodesGroup, sampleNameList, metadataList).
    Raises ValueError when the first answer is neither 'subtree' nor 'bacteria'.
    """
    uTree = raw_input("Do you to get percentage of assignments to subtrees or to bacterias themselves? subtree/bacteria \n")
    if uTree not in ("subtree","bacteria"):
        print "\n/!\ ERROR: You need to answer 'bacteria' or 'subtree'."
        raise ValueError
    usingTree = (uTree == "subtree")
    knownNodes = dataArray[6]
    nodeExamples = sanitizeNode(knownNodes[-3]) + ";" + sanitizeNode(knownNodes[1]) + ";" + sanitizeNode(knownNodes[-1])
    nodesGroup = parseListNode(raw_input("Input the list of nodes/roots of subtrees you want to consider. [ Please look at the taxonomic tree file to help you: e.g. " + nodeExamples + ". ]\n"))
    isInDatabase(nodesGroup,knownNodes)
    sampleNameList,metadataList,interval1List,interval2List = createSampleNameList(dataArray,True)
    result = percentageAssign(dataArray[0],dataArray[1],sampleNameList,dataArray[7],nodesGroup,dataArray[2],dataArray[3],usingTree)
    print "\n[Preview.]"
    print result
    #Copy the result into a NumPy vector, which is what gets written to file
    size = len(result)
    data = np.zeros(size)
    for position in range(size):
        data[position] = result[position]
    print ""
    saveChoice = raw_input("Save the results? Y/N\n")
    if saveChoice == "Y":
        header = "Percentage of assignments ****\nin the group of nodes: " + listNodes(nodesGroup) + listSampleInvolved(metadataList,interval1List,interval2List,sampleNameList)
        writeFile(data,header,"array")
    elif saveChoice != "N":
        print "/!\ You should answer 'Y' or 'N'!"
    return result,nodesGroup,sampleNameList,metadataList
コード例 #10
0
ファイル: actions.py プロジェクト: kuredatan/taxocluster
def clusteringAct(dataArray):
    """Cluster the samples with two successive kMeans passes and score the result.

    Steps, in order:
      1. the user picks a metadatum; partitionSampleByMetadatumValue gives
         the reference clusters (one per value of the metadatum);
      2. the first sample of every reference cluster seeds a first kMeans
         pass that uses dataArray[8];
      3. cleanClusters removes samples from the clusters and returns them
         as @untaken;
      4. a second kMeans pass, using dataArray[9], runs on the remaining samples;
      5. the clusters are printed and compared pairwise to the reference
         clusters with compareCluster;
      6. on request, the clusters (optionally with their common nodes) are
         written to file and converted into a graph.

    dataArray -- indexable container; indices 0, 1, 3, 8 and 9 are read
        here and the whole container is passed to kMeans and
        extractCommonNodes.
    Returns None.
    Raises ValueError when one of the consistency checks on cluster counts
        or sizes fails.
    """
    print dataArray[1]
    #Only the first item of a ';'-separated answer is kept
    metadatum = sanitize(
        raw_input(
            "Select the metadatum among those above to cluster the set of samples. [e.g. "
            + dataArray[1][0] + "]\n")).split(";")[0]
    isInDatabase([metadatum], dataArray[1])
    valueSet, clusters1 = partitionSampleByMetadatumValue(
        metadatum, dataArray[1], dataArray[0])
    #Keep only the first field of every sample in the reference clusters
    clusters = [[sample[0] for sample in cluster] for cluster in clusters1]
    #that is, k in K-means Algorithm
    numberClass = len(valueSet)
    print "/!\ Number of classes:", numberClass, "."
    startSet = [cluster[0] for cluster in clusters]
    #Selects the starting samples of each cluster
    kClusters = [[start] for start in startSet]
    if not (len(clusters) == numberClass):
        print "\n/!\ ERROR: Different lengths: numberClass", numberClass, "clusters:", len(
            clusters), "."
        raise ValueError
    trimmedList = trimList(dataArray[3], startSet)
    print "/!\ Clustering with the first distance..."
    #@distanceInClusters is a list of lists of (sample,sum of all distances from this sample to others samples in the same cluster)
    #@dataArray[8] = distMatchedDict
    kClusters, meanSamples, distanceDict, distanceInClusters = kMeans(
        trimmedList, numberClass, kClusters, startSet, dataArray[8], dataArray)
    print "-- End of first clustering --"
    #Counts the samples spread over all the clusters
    number = 0
    for cluster in kClusters:
        for _ in cluster:
            number += 1
    #After the first pass, the count must equal len(dataArray[3])
    if not (number == len(dataArray[3])):
        print "\n/!\ ERROR: A bug occurred during the clustering:", number, "=/=", len(
            dataArray[3]), "."
        raise ValueError
    #Deletes samples in cluster that are too far from the others
    kClusters, untaken = cleanClusters(kClusters, distanceInClusters)
    startSet = [cluster[0] for cluster in clusters]
    #Remove from untaken the starting samples
    untaken2 = []
    for x in untaken:
        if not (x in startSet):
            untaken2.append(x)
    untaken = untaken2
    #Remove the samples in untaken from the total set of samples
    sampleSet = []
    for cluster in kClusters:
        for x in cluster:
            if not (x in sampleSet):
                sampleSet.append(x)
    #The starting samples always stay in the set
    for x in startSet:
        if not (x in sampleSet):
            sampleSet.append(x)
    trimmedList = trimList(sampleSet, startSet)
    print "/!\ Clustering with the second distance..."
    #@distanceDict is the distance dictionary (key=(sample1,sample2),value=distance between sample1 and sample2)
    #@dataArray[9] = distConsensusDict
    kClusters, meanSamples, distanceDict, _ = kMeans(trimmedList, numberClass,
                                                     kClusters, startSet,
                                                     dataArray[9],
                                                     dataArray)  #,meanSamples)
    print "-- End of second clustering --"
    number = 0
    for cluster in kClusters:
        for _ in cluster:
            number += 1
    #After the second pass the count may be lower, but never higher, than len(dataArray[3])
    if not (number <= len(dataArray[3])):
        print "\n/!\ ERROR: An error occurred during the clustering:", number, ">", len(
            dataArray[3]), "."
        raise ValueError
    print "Printing the", numberClass, "clusters:"
    i = 1
    #@kClusters contains the list of the k clusters. Each cluster is a list of sample IDs
    for cluster in kClusters:
        print "\n-- Cluster #", i, "associated to", metadatum, "=", valueSet[
            i - 1], ":"
        print "Size:", len(cluster)
        print sorted(cluster)
        i += 1
    print "\nScore of the clustering (comprised between 0 and 1):"
    print "The more it is close to 1, the more the clustering is relevant."
    #The clustering obtained with the K-Means method
    kClustersCopy = [cluster for cluster in kClusters]
    #The clustering obtained by comparing the values of the metadatum
    clustersCopy = [cluster for cluster in clusters]
    #Score by using first method of comparison
    compareClusterScore = 0
    if not (len(kClustersCopy) == numberClass == len(clustersCopy)):
        print "\n/!\ ERROR: Length error in clustering:", numberClass, len(
            kClustersCopy), len(clustersCopy), "."
        raise ValueError
    #Both copies are consumed from the end, so clusters are compared position by position
    while kClustersCopy and clustersCopy:
        cl1 = kClustersCopy.pop()
        cl2 = clustersCopy.pop()
        #clusters are non-empty
        x = compareCluster(cl1, cl2, untaken)
        #NOTE(review): any falsy result (0 included) aborts the scoring and sets the score to None -- confirm
        if x:
            compareClusterScore += x
        else:
            compareClusterScore = None
            break
    #The score is the mean of the pairwise scores; a falsy total is displayed as "None"
    if compareClusterScore:
        compareClusterScore = compareClusterScore / numberClass
        printClusterScore = compareClusterScore
    else:
        printClusterScore = "None"
    #Score by using second method of comparison
    #compareCentersScore = compareCenters(meanSamples,distanceDict,numberClass)
    print "Compare clusters score is:", printClusterScore, "."
    #print "Compare centers score is:",compareCentersScore,"."
    answer = raw_input("Do you want to save the results? Y/N\n")
    if (answer == "Y"):
        answer2 = raw_input(
            "Do you want to compute the sets of common nodes for each cluster? [It can be considered relevant when the score of comparing clusters is at least over 0.5] Y/N\n"
        )
        #@commonList is only bound (and only read below) when the answer is Y
        if (answer2 == "Y"):
            commonList = extractCommonNodes(kClusters, dataArray)
        elif not (answer2 == "N"):
            print "\n/!\ You should answer Y or N, not:", answer2, "."
        data = "**** CLUSTERS FOR METADATUM " + metadatum + " WITH VALUES: " + str(
            valueSet)
        i = 0
        for cluster in kClusters:
            data += "\n\n-- Cluster #" + str(
                i + 1) + " associated to " + metadatum + " = " + str(
                    valueSet[i])
            data += "\nSize: " + str(len(cluster))
            if (answer2 == "Y"):
                data += "\nSet of common nodes: " + str(commonList[i])
            data += "\n" + str(cluster)
            i += 1
        data += "\n\nCompare clusters score is: " + str(compareClusterScore)
        #data += "\n\nCompare centers score is: " + str(compareCentersScore)
        data += "\n\nEND OF FILE ****"
        print "\n/!\ Saving clusters..."
        writeFile(data)
        answer2 = raw_input(
            "Do you want to compute the graph of the clusters? Y/N\n")
        if (answer2 == "Y"):
            print "\n/!\ Constructing the graph of the clusters..."
            #@dataArray[3] = filenames
            graph = convertClustersIntoGraph(kClusters, distanceDict,
                                             len(dataArray[3]))
            graphNO(graph)
            print "\n/!\ Done. The graph is in DOT format in \"files\" folder."
        elif not (answer2 == "N"):
            print "\n/!\ You should answer Y or N, not:", answer2, "."
    elif not (answer == "N"):
        print "/!\ You should answer by Y or N."
コード例 #11
0
ファイル: actions.py プロジェクト: cbib/taxotree
def creatingArray(dataArray,pearson=False):
    #Available cases in Pearson function
    if pearson:
        typeInput = raw_input("Do you want to compute bacteria/bacteria or bacteria/metadatum? BB/BM [ Please read README for details. ]\n")
        if (typeInput == "BB"):
            valueInput1 = parseListNode(raw_input("Choose the first group of bacterias [ Read the taxonomic tree to help you: e.g. " + sanitizeNode(dataArray[6][-3]) + ";" + sanitizeNode(dataArray[6][1]) + ";" + sanitizeNode(dataArray[6][-1]) + " ]\n"))
            isInDatabase(valueInput1,dataArray[6])
            valueInput2 = parseListNode(raw_input("Choose the second group of bacterias [ Read the taxonomic tree to help you: e.g. " + sanitizeNode(dataArray[6][-3]) + ";" + sanitizeNode(dataArray[6][1]) + ";" + sanitizeNode(dataArray[6][-1]) + " ]\n"))
            isInDatabase(valueInput2,dataArray[6])
            xArray,yArray = getValueBacteriaBacteria(dataArray[2],dataArray[3],dataArray[8],valueInput1,valueInput2)
            return xArray,yArray,typeInput,valueInput1,valueInput2
        elif (typeInput == "BM"):
            valueInput1 = parseListNode(raw_input("Choose the group of bacterias [ Read the taxonomic tree to help you: e.g. " + sanitizeNode(dataArray[6][-3]) + ";" + sanitizeNode(dataArray[6][1]) + ";" + sanitizeNode(dataArray[6][-1]) + " ]\n"))
            isInDatabase(valueInput1,dataArray[6])
            print dataArray[1]
            valueInput2 = [parseList(raw_input("Choose the metadatum among those printed above [ e.g. " + dataArray[1][0] + ";" + dataArray[1][-1] + " ]\n"))[0]]
            isInDatabase(valueInput2,dataArray[1])
            xArray,yArray = getValueBacteriaMetadata(dataArray[0],dataArray[1],valueInput1,dataArray[8],dataArray[2],dataArray[3],valueInput2)
            return xArray,yArray,typeInput,valueInput1,valueInput2
        else:
            print "\nERROR: You need to answer 'BB' or 'BM', and not ",typeInput
            raise ValueError
    #Available cases for only plotting graphs
    else:
        graphTypeInput = raw_input("Do you want to plot a graph or a pie chart? graph/pie [Read README for details. Histograms will be available in later versions]\n")
        if graphTypeInput == "graph":
            typeInput = raw_input("Do you want to plot bacteria/bacteria or bacteria/metadatum? BB/BM [ Please read README for details. ]\n")
            if (typeInput == "BB"):
                valueInput1 = parseListNode(raw_input("Choose the first group of bacterias [ Read the taxonomic tree to help you: e.g. " + sanitizeNode(dataArray[6][-3]) + ";" + sanitizeNode(dataArray[6][1]) + ";" + sanitizeNode(dataArray[6][-1]) + " ]\n"))
                isInDatabase(valueInput1,dataArray[6])
                valueInput2 = parseListNode(raw_input("Choose the second group of bacterias [ Read the taxonomic tree to help you: e.g. " + sanitizeNode(dataArray[6][-3]) + ";" + sanitizeNode(dataArray[6][1]) + ";" + sanitizeNode(dataArray[6][-1]) + " ]\n"))
                isInDatabase(valueInput2,dataArray[6])
                return graphTypeInput,getValueBacteriaBacteria(dataArray[2],dataArray[3],dataArray[8],valueInput1,valueInput2),typeInput,valueInput1,valueInput2
            elif (typeInput == "BM"):
                valueInput1 = parseListNode(raw_input("Choose the group of bacterias [ Read the taxonomic tree to help you: e.g. " + sanitizeNode(dataArray[6][-3]) + ";" + sanitizeNode(dataArray[6][1]) + ";" + sanitizeNode(dataArray[6][-1]) + " ]\n"))
                isInDatabase(valueInput1,dataArray[6])
                print dataArray[1]
                valueInput2 = [parseList(raw_input("Choose the metadatum among those printed above [ e.g. " + dataArray[1][0] + ";" + dataArray[1][-1] + " ]\n"))[0]]
                isInDatabase(valueInput2,dataArray[1])
                return graphTypeInput,getValueBacteriaMetadata(dataArray[0],dataArray[1],valueInput1,dataArray[8],dataArray[2],dataArray[3],valueInput2),typeInput,valueInput1,valueInput2
            else:
                print "\nERROR: You need to answer 'BB' or 'BM', and not ",typeInput
                raise ValueError
        elif graphTypeInput == "pie":
                result,nodesGroup,sampleNameList,metadataList = percentageAct(dataArray)
                return graphTypeInput,result,nodesGroup,sampleNameList,metadataList
        else:
            print "\nERROR: You need to answer 'graph' or 'pie', and not ",graphTypeInput
            raise ValueError            
コード例 #12
0
ファイル: actions.py プロジェクト: kuredatan/taxocluster
def clusteringAct(dataArray):
    print dataArray[1]
    metadatum = sanitize(raw_input("Select the metadatum among those above to cluster the set of samples. [e.g. " + dataArray[1][0] + "]\n")).split(";")[0]
    isInDatabase([metadatum],dataArray[1])
    valueSet,clusters1 = partitionSampleByMetadatumValue(metadatum,dataArray[1],dataArray[0])
    clusters = [[sample[0] for sample in cluster] for cluster in clusters1]
    #that is, k in K-means Algorithm
    numberClass = len(valueSet)
    print "/!\ Number of classes:",numberClass,"."
    startSet = [cluster[0] for cluster in clusters]
    #Selects the starting samples of each cluster
    kClusters = [[start] for start in startSet]
    if not (len(clusters) == numberClass):
        print "\n/!\ ERROR: Different lengths: numberClass",numberClass,"clusters:",len(clusters),"."
        raise ValueError
    trimmedList = trimList(dataArray[3],startSet)
    print "/!\ Clustering with the first distance..."
    #@distanceInClusters is a list of lists of (sample,sum of all distances from this sample to others samples in the same cluster)
    #@dataArray[8] = distMatchedDict
    kClusters,meanSamples,distanceDict,distanceInClusters = kMeans(trimmedList,numberClass,kClusters,startSet,dataArray[8],dataArray)
    print "-- End of first clustering --"
    number = 0
    for cluster in kClusters:
        for _ in cluster:
            number += 1
    if not (number == len(dataArray[3])):
        print "\n/!\ ERROR: A bug occurred during the clustering:",number,"=/=",len(dataArray[3]),"."
        raise ValueError
    #Deletes samples in cluster that are too far from the others
    kClusters,untaken = cleanClusters(kClusters,distanceInClusters)
    startSet = [cluster[0] for cluster in clusters]
    #Remove from untaken the starting samples
    untaken2 = []
    for x in untaken:
        if not (x in startSet):
            untaken2.append(x)
    untaken = untaken2
    #Remove the samples in untaken from the total set of samples
    sampleSet = []
    for cluster in kClusters:
        for x in cluster:
            if not (x in sampleSet):
                sampleSet.append(x)
    for x in startSet:
        if not (x in sampleSet):
            sampleSet.append(x)
    trimmedList = trimList(sampleSet,startSet)
    print "/!\ Clustering with the second distance..."
    #@distanceDict is the distance dictionary (key=(sample1,sample2),value=distance between sample1 and sample2)
    #@dataArray[9] = distConsensusDict
    kClusters,meanSamples,distanceDict,_ = kMeans(trimmedList,numberClass,kClusters,startSet,dataArray[9],dataArray)#,meanSamples)
    print "-- End of second clustering --" 
    number = 0
    for cluster in kClusters:
        for _ in cluster:
            number += 1
    if not (number <= len(dataArray[3])):
        print "\n/!\ ERROR: An error occurred during the clustering:",number,">",len(dataArray[3]),"."
        raise ValueError
    print "Printing the",numberClass,"clusters:"
    i = 1
    #@kClusters contains the list of the k clusters. Each cluster is a list of sample IDs
    for cluster in kClusters:
        print "\n-- Cluster #",i,"associated to",metadatum,"=",valueSet[i-1],":"
        print "Size:",len(cluster)
        print sorted(cluster)
        i += 1
    print "\nScore of the clustering (comprised between 0 and 1):"
    print "The more it is close to 1, the more the clustering is relevant."
    #The clustering obtained with the K-Means method
    kClustersCopy = [cluster for cluster in kClusters]
    #The clustering obtained by comparing the values of the metadatum
    clustersCopy = [cluster for cluster in clusters]
    #Score by using first method of comparison
    compareClusterScore = 0
    if not (len(kClustersCopy) == numberClass == len(clustersCopy)):
        print "\n/!\ ERROR: Length error in clustering:",numberClass,len(kClustersCopy),len(clustersCopy),"."
        raise ValueError
    while kClustersCopy and clustersCopy:
        cl1 = kClustersCopy.pop()
        cl2 = clustersCopy.pop()
        #clusters are non-empty
        x = compareCluster(cl1,cl2,untaken)
        if x:
            compareClusterScore += x
        else:
            compareClusterScore = None
            break
    if compareClusterScore:
        compareClusterScore = compareClusterScore/numberClass
        printClusterScore = compareClusterScore
    else:
        printClusterScore = "None"
    #Score by using second method of comparison
    #compareCentersScore = compareCenters(meanSamples,distanceDict,numberClass)
    print "Compare clusters score is:",printClusterScore,"."
    #print "Compare centers score is:",compareCentersScore,"."
    answer = raw_input("Do you want to save the results? Y/N\n")
    if (answer == "Y"):
        answer2 = raw_input("Do you want to compute the sets of common nodes for each cluster? [It can be considered relevant when the score of comparing clusters is at least over 0.5] Y/N\n")
        if (answer2 == "Y"):
            commonList = extractCommonNodes(kClusters,dataArray)
        elif not (answer2 == "N"):
            print "\n/!\ You should answer Y or N, not:",answer2,"."
        data = "**** CLUSTERS FOR METADATUM " + metadatum + " WITH VALUES: " + str(valueSet)
        i = 0
        for cluster in kClusters:
            data += "\n\n-- Cluster #" + str(i+1) + " associated to " + metadatum + " = " + str(valueSet[i]) 
            data += "\nSize: " + str(len(cluster))
            if (answer2 == "Y"):
                data += "\nSet of common nodes: " + str(commonList[i])
            data += "\n" + str(cluster)
            i += 1
        data += "\n\nCompare clusters score is: " + str(compareClusterScore)
        #data += "\n\nCompare centers score is: " + str(compareCentersScore)
        data += "\n\nEND OF FILE ****"
        print "\n/!\ Saving clusters..."
        writeFile(data)
        answer2 = raw_input("Do you want to compute the graph of the clusters? Y/N\n")
        if (answer2 == "Y"):
            print "\n/!\ Constructing the graph of the clusters..."
            #@dataArray[3] = filenames
            graph = convertClustersIntoGraph(kClusters,distanceDict,len(dataArray[3]))
            graphNO(graph)
            print "\n/!\ Done. The graph is in DOT format in \"files\" folder."
        elif not (answer2 == "N"):
            print "\n/!\ You should answer Y or N, not:",answer2,"."
    elif not (answer == "N"):
        print "/!\ You should answer by Y or N."
コード例 #13
0
def randomSubSamplingAct(dataArray):
    print dataArray[1]
    metadatum = sanitize(
        raw_input(
            "Input the metadatum that will cluster the set of samples among those written above. [ e.g. "
            + dataArray[1][0] + " ]\n")).split(";")[0]
    isInDatabase([metadatum], dataArray[1])
    s = raw_input("Input the number s of random samplings.\n")
    n = raw_input("Input the number n of nodes to select at each try.\n")
    if not integer.match(s) or not integer.match(n):
        print "\n/!\ ERROR: s and n must both be integers."
        raise ValueError
    numberSamples = len(dataArray[0])
    numberStartingSamples = sanitize(
        raw_input(
            "Knowing there is/are " + str(numberSamples) +
            "sample(s), how many samples do you want to create the training set? \n"
        ))
    x = integer.match(numberStartingSamples)
    if not x or (x and int(numberStartingSamples) > numberSamples):
        print "\n/!\ ERROR: You should write down an integer."
        raise ValueError
    numberStartingSamples = int(numberStartingSamples)
    listnodes = dataArray[3].values()
    s, n = int(s), int(n)
    #Here the set of classes is a list of two lists containing the samples in C and not C
    bestClassification = []
    bestClassesList = []
    #Worse value for this coefficient
    currBestYouden = inf
    nodesNumber = len(dataArray[3])
    while s:
        #Randomly draw n distinct nodes among the nodes in the taxonomic tree
        nodesList = randomChoice(listnodes, n)
        assignedClasses, classes, valueSet = classifyIt(
            dataArray, metadatum, nodesList, numberStartingSamples)
        numberClass = len(classes)
        youdenJ = countYouden(assignedClasses, classes, numberSamples)
        res = numberClass - youdenJ
        if min(res, currBestYouden) == res:
            bestClassification = []
            for i in nodesList:
                bestClassification.append(i)
            currBestYouden = res
            bestClassesList = []
            for i in assignedClasses:
                bestClassesList.append(i)
        s -= 1
    interpretIt(numberClass - currBestYouden)
    if answer == "Y":
        percentagesAs = [len(class1) for class1 in assignedClasses]
        labels = [metadatum + " = " + str(value) for value in valueSet]
        percentages = [len(class1) for class1 in classes]
        plotPieChart(
            labels, percentagesAs, "Assignments depending on " +
            listNodes(nodesList) + " to class for metadatum " + metadatum)
        plotPieChart(
            labels, percentages, "Real classes depending on " +
            listNodes(nodesList) + " for metadatum " + metadatum)
    answer = raw_input("Do you want to save the results? Y/N")
    if (answer == "Y"):
        writeFile(
            "Best Youden's J statistic for this classification is: " +
            str(numberClass - currBestYouden) +
            "\nand most relevant list of nodes for this metadatum is:" +
            str(bestClassification),
            "Assignments to classes for metadatum " + metadatum)
    elif not (answer == "N"):
        print "\n Answer by Y or N!"
    return bestClassification, (numberClass - currBestYouden), bestClassesList
コード例 #14
0
ファイル: actions.py プロジェクト: kuredatan/taxoclassifier
def randomSubSamplingAct(dataArray):
    print dataArray[1]
    metadata = parseList(raw_input("Input the metadata that will cluster the set of samples among those written above. [ e.g. " + dataArray[1][0] + ";" + dataArray[1][-1] + " ]\n"))
    isInDatabase(metadata,dataArray[1])
    s = raw_input("Input the number s of random samplings.\n")
    n = raw_input("Input the number n of nodes to select at each try.\n")
    numberofSamples = len(dataArray[0])
    if not integer.match(s) or not integer.match(n):
        print "\n/!\ ERROR: s and n must both be integers."
        raise ValueError
    s,n = int(s),int(n)
    numberStartingSamples = sanitize(raw_input("Knowing there is/are " + str(numberofSamples) + " sample(s), how many samples do you want to create the training set?\n"))
    x = integer.match(numberStartingSamples)
    if not x or (x and (int(numberStartingSamples) > numberofSamples)):
        print "\n/!\ ERROR: You should write down an integer less or equal to",numberofSamples,"."
        raise ValueError
    numberStartingSamples = int(numberStartingSamples)
    #Here the set of classes is a list of two lists containing the samples in C and not C
    bestClassification = []
    bestClassesList = []
    bestShape = []
    bestValuesList = []
    #Worse value for this coefficient
    currBestYouden = inf
    nodesNumber = len(dataArray[3])
    #@dataArray[2] = idSequences, which is a dictionary of (key=identifier,values=(name,rank of node))
    listofNodes = dataArray[2].values()
    while s:
        #Randomly draw n distinct nodes among the nodes in the taxonomic tree
        nodesList = randomChoice(listofNodes,n)
        assignedClasses,classes,valueSets = classifyIt(dataArray,metadata,nodesList,numberStartingSamples)
        numberClass = classes.lenMDL(shape)
        #len(dataArray[0])?
        youdenJ = countYouden(assignedClasses,classes,numberofSamples)
        res = numberClass - youdenJ
        if min(res,currBestYouden) == res:
            bestValuesList = []
            for i in valueSets:
                bestValuesList.append(i)
            bestClassification = []
            for i in nodesList:
                bestClassification.append(i)
            bestShape = []
            for i in shape:
                bestShape.append(i)
            currBestYouden = res
            bestClassesList = []
            for i in assignedClasses:
                bestClassesList.append(i)
        s -= 1
    interpretIt(numberClass - currBestYouden)
    if answer == "Y":
        labels = [ "Metadata: " + str(metadata) + ", Values for each metadatum: " + str([ valueSet for valueSet in valueSets]) ]
        percentagesAs = assignedClasses.mapMDL(len)
        percentages = classes.mapMDL(len)
        plotPie(labels,percentagesAs,"Assignments depending on " + str(nodesList) + " to class for metadata " + str(metadata))
        plotPie(labels,percentages,"Real classes depending on " + str(nodesList) + " for metadata " + str(metadata))
    answer = raw_input("Do you want to save the results? Y/N \n")
    if (answer == "Y"):
        writeFile("Best Youden's J statistic for this classification is: " + str(numberClass - currBestYouden) + "\nand most relevant list of nodes for this set of metadata is:" + str(bestClassification),"Assignments to classes for metadata " + str(metadata))
    elif not (answer == "N"):
        print "\n Answer by Y or N!"
    return bestClassification,(numberClass - currBestYouden),bestClassesList