def userNodeSelectionAct(dataArray):
    """Classify the samples from a metadatum and a group of nodes typed in by the user.

    The user is asked, in this order, for: one metadatum (checked against
    dataArray[1] with isInDatabase), a group of nodes (checked against the
    values of dataArray[3]) and the size of the training set, which may not
    exceed len(dataArray[0]). The classification itself is delegated to
    classifyIt and scored with countYouden. The user may then plot the classes
    as two pie charts and/or save the score with writeFile.

    Returns the pair (assignedClasses, youdenJ).
    Raises ValueError when the training-set size is rejected by the `integer`
    pattern or is greater than the number of samples.
    """
    metadataNames = dataArray[1]
    print(metadataNames)
    typedMetadatum = raw_input("Input the metadatum that will cluster the set of samples among those written above. [ e.g. " + metadataNames[0] + " ]\n")
    #Only the first ";"-separated item of the answer is kept
    metadatum = sanitize(typedMetadatum).split(";")[0]
    isInDatabase([metadatum], metadataNames)
    listnodes = dataArray[3].values()
    #Three existing nodes are displayed as an example of the expected syntax
    exampleNodes = sanitizeNode(listnodes[-3]) + ";" + sanitizeNode(listnodes[1]) + ";" + sanitizeNode(listnodes[-1])
    typedNodes = raw_input("Choose the group of nodes you want to consider exclusively. [ Read the taxonomic tree to help you: e.g. " + exampleNodes + " ]\n")
    nodesList = parseListNode(typedNodes)
    isInDatabase(nodesList, listnodes)
    sampleCount = len(dataArray[0])
    typedSize = sanitize(raw_input("Knowing there is/are " + str(sampleCount) + " sample(s), how many samples do you want to create the training set?\n"))
    #The pattern is tested first so that int() only sees input it accepted
    if not integer.match(typedSize) or int(typedSize) > sampleCount:
        print("\n/!\ ERROR: You should write down an integer inferior or equal to " + " " + str(sampleCount) + " " + ".")
        raise ValueError
    trainingSize = int(typedSize)
    assignedClasses, classes, valueSet = classifyIt(dataArray, metadatum, nodesList, trainingSize)
    #Computed as in the original flow; the count is not used afterwards
    classCount = len(classes)
    #NOTE(review): sibling functions pass the total number of samples here - confirm which one countYouden expects
    youdenJ = countYouden(assignedClasses, classes, trainingSize)
    interpretIt(youdenJ)
    plotAnswer = raw_input("Do you want to plot the classes obtained as a pie chart? Y/N")
    if plotAnswer == "Y":
        sliceLabels = [metadatum + " = " + str(value) for value in valueSet]
        assignedSizes = [len(members) for members in assignedClasses]
        realSizes = [len(members) for members in classes]
        nodesText = str(nodesList)
        plotPieChart(sliceLabels, assignedSizes, "Assignments depending on " + nodesText + " to class for metadatum " + metadatum)
        plotPieChart(sliceLabels, realSizes, "Real classes depending on " + nodesText + " for metadatum " + metadatum)
    elif plotAnswer != "N":
        #Any other answer is reported but does not stop the procedure
        print("\n Answer by Y or N!")
    saveAnswer = raw_input("Do you want to save the results? Y/N")
    if saveAnswer == "Y":
        report = "Youden's J statistic for this classification is: " + str(youdenJ) + "\n"
        reportTitle = "Assignments depending on " + listNodes(nodesList) + " to classes for metadatum " + metadatum
        writeFile(report, reportTitle)
    elif saveAnswer != "N":
        print("\n Answer by Y or N!")
    return assignedClasses, youdenJ
def parseInfo(filename):
    """Parse the metadata table stored in meta/<filename>.csv.

    The first line holds the comma-separated metadatum names. Every following
    line holds the values of one sample, in the same order. Empty fields are
    replaced by "N" and every field goes through sanitize.

    Returns (samplesList, infoList): the list of per-sample value lists and
    the list of metadatum names.
    Raises ValueError when a sample line does not have as many fields as the
    header, and IndexError when the file has no line at all.
    """
    #The context manager closes the file even when reading fails
    with open("meta/" + filename + ".csv", "r") as file_matrix:
        lines = file_matrix.readlines()
    #Data need to be sanitized
    infoList = [sanitize(info.split("\n")[0]) for info in lines[0].split(",")]
    samplesList = []
    for line in lines[1:]:
        #Construction of the list associated to one sample
        #(str.split always yields at least one field, so no emptiness check is needed)
        thisSampleList = []
        for data in line.split(","):
            #unknown values are replaced by "N"
            if data == "":
                data = "N"
            thisSampleList.append(sanitize(data).split("\n")[0])
        if len(thisSampleList) != len(infoList):
            print("\n /!\ ERROR: [BUG] [parsingInfo/parseInfo] Parsing error. (2)")
            raise ValueError
        #samplesList is the list of every sample's list
        samplesList.append(thisSampleList)
    return samplesList, infoList
Exemple #3
0
def parseInfo(filename):
    """Parse the metadata table stored in meta/<filename>.csv.

    The first line holds the comma-separated metadatum names. Every following
    line holds the values of one sample, in the same order. Empty fields are
    replaced by "N" and every field goes through sanitize.

    Returns (samplesList, infoList): the list of per-sample value lists and
    the list of metadatum names.
    Raises ValueError when a sample line does not have as many fields as the
    header, and IndexError when the file has no line at all.
    """
    #The context manager closes the file even when reading fails
    with open("meta/" + filename + ".csv", "r") as file_matrix:
        lines = file_matrix.readlines()
    #Data need to be sanitized
    infoList = [sanitize(info.split("\n")[0]) for info in lines[0].split(",")]
    samplesList = []
    for line in lines[1:]:
        #Construction of the list associated to one sample
        #(str.split always yields at least one field, so no emptiness check is needed)
        thisSampleList = []
        for data in line.split(","):
            #unknown values are replaced by "N"
            if data == "":
                data = "N"
            thisSampleList.append(sanitize(data).split("\n")[0])
        if len(thisSampleList) != len(infoList):
            print("\n /!\ ERROR: [BUG] [parsingInfo/parseInfo] Parsing error. (2)")
            raise ValueError
        #samplesList is the list of every sample's list
        samplesList.append(thisSampleList)
    return samplesList, infoList
Exemple #4
0
def parseMatrix(filename):
    """Parse the occurrence matrix stored in meta/<filename>.csv.

    The first line lists the species as comma-separated "rank:name" strings.
    Every following line is one sample: fields matched by the `integer`
    pattern are converted to int, the others are kept as sanitized strings.

    Returns (samplesList, speciesList) where speciesList is a list of
    (name, rank) pairs. An empty file yields two empty lists.
    Raises ValueError when a sample line does not have as many fields as the
    header.
    """
    #The context manager closes the file even when reading fails
    with open("meta/" + filename + ".csv", "r") as file_matrix:
        lines = file_matrix.readlines()
    speciesList = []
    samplesList = []
    if not lines:
        return samplesList, speciesList
    # First line gives the name and rank of species in the samples
    for string in lines[0].split(","):
        # Turns "rank:name" into (name,rank)
        ls1 = string.split(":")
        rank = sanitize(ls1[0])
        # Sanitizing removes the white space after name
        # Otherwise equality on strings does not work
        name = sanitize(ls1[-1])
        speciesList.append((name, rank))
    n = len(speciesList)
    for line in lines[1:]:
        thisSampleList = []
        for number in line.split(","):
            number = sanitize(number)
            if integer.match(number):
                thisSampleList.append(int(number))
            else:
                thisSampleList.append(number)
        if len(thisSampleList) != n:
            print("\n /!\ ERROR: [BUG] [parsingMatrix/parseMatrix] Parsing error.")
            raise ValueError
        samplesList.append(thisSampleList)
    return samplesList, speciesList
def userNodeSelectionAct(dataArray):
    """Classify the samples from a metadatum and a group of nodes typed in by the user.

    The user is asked, in this order, for: one metadatum (checked against
    dataArray[1] with isInDatabase), a group of nodes (checked against the
    values of dataArray[3]) and the size of the training set, which may not
    exceed len(dataArray[0]). The classification itself is delegated to
    classifyIt and scored with countYouden. The user may then plot the classes
    as two pie charts and/or save the score with writeFile.

    Returns the pair (assignedClasses, youdenJ).
    Raises ValueError when the training-set size is rejected by the `integer`
    pattern or is greater than the number of samples.
    """
    metadataNames = dataArray[1]
    print(metadataNames)
    typedMetadatum = raw_input(
        "Input the metadatum that will cluster the set of samples among those written above. [ e.g. "
        + metadataNames[0] + " ]\n")
    #Only the first ";"-separated item of the answer is kept
    metadatum = sanitize(typedMetadatum).split(";")[0]
    isInDatabase([metadatum], metadataNames)
    listnodes = dataArray[3].values()
    #Three existing nodes are displayed as an example of the expected syntax
    exampleNodes = (sanitizeNode(listnodes[-3]) + ";" +
                    sanitizeNode(listnodes[1]) + ";" +
                    sanitizeNode(listnodes[-1]))
    typedNodes = raw_input(
        "Choose the group of nodes you want to consider exclusively. [ Read the taxonomic tree to help you: e.g. "
        + exampleNodes + " ]\n")
    nodesList = parseListNode(typedNodes)
    isInDatabase(nodesList, listnodes)
    sampleCount = len(dataArray[0])
    typedSize = sanitize(
        raw_input(
            "Knowing there is/are " + str(sampleCount) +
            " sample(s), how many samples do you want to create the training set?\n"
        ))
    #The pattern is tested first so that int() only sees input it accepted
    if not integer.match(typedSize) or int(typedSize) > sampleCount:
        print("\n/!\ ERROR: You should write down an integer inferior or equal to " + " " + str(sampleCount) + " " + ".")
        raise ValueError
    trainingSize = int(typedSize)
    assignedClasses, classes, valueSet = classifyIt(dataArray, metadatum,
                                                    nodesList, trainingSize)
    #Computed as in the original flow; the count is not used afterwards
    classCount = len(classes)
    #NOTE(review): sibling functions pass the total number of samples here - confirm which one countYouden expects
    youdenJ = countYouden(assignedClasses, classes, trainingSize)
    interpretIt(youdenJ)
    plotAnswer = raw_input(
        "Do you want to plot the classes obtained as a pie chart? Y/N")
    if plotAnswer == "Y":
        sliceLabels = [metadatum + " = " + str(value) for value in valueSet]
        assignedSizes = [len(members) for members in assignedClasses]
        realSizes = [len(members) for members in classes]
        nodesText = str(nodesList)
        plotPieChart(
            sliceLabels, assignedSizes, "Assignments depending on " +
            nodesText + " to class for metadatum " + metadatum)
        plotPieChart(
            sliceLabels, realSizes, "Real classes depending on " +
            nodesText + " for metadatum " + metadatum)
    elif plotAnswer != "N":
        #Any other answer is reported but does not stop the procedure
        print("\n Answer by Y or N!")
    saveAnswer = raw_input("Do you want to save the results? Y/N")
    if saveAnswer == "Y":
        report = ("Youden's J statistic for this classification is: " +
                  str(youdenJ) + "\n")
        reportTitle = ("Assignments depending on " + listNodes(nodesList) +
                       " to classes for metadatum " + metadatum)
        writeFile(report, reportTitle)
    elif saveAnswer != "N":
        print("\n Answer by Y or N!")
    return assignedClasses, youdenJ
def randomSubSamplingAct(dataArray):
    """Look for the group of nodes that best classifies the samples, by random sub-sampling.

    The user chooses one metadatum (checked against dataArray[1]), the number
    s of random draws, the number n of nodes per draw and the size of the
    training set. At each draw, n nodes are picked with randomChoice among the
    values of dataArray[3], the samples are classified with classifyIt and the
    result is scored with countYouden. The draw minimising
    (number of classes - Youden's J) is kept; on ties the latest draw wins.
    The user may then plot the best draw as pie charts and/or save its score.

    Returns (bestClassification, bestYouden, bestClassesList): the nodes of
    the best draw, its Youden's J and its assigned classes.
    Raises ValueError when s or n is rejected by the `integer` pattern, when
    s is lower than 1, or when the training-set size is not an accepted
    integer or exceeds the number of samples.
    """
    print(dataArray[1])
    metadatum = sanitize(raw_input("Input the metadatum that will cluster the set of samples among those written above. [ e.g. " + dataArray[1][0] + " ]\n")).split(";")[0]
    isInDatabase([metadatum], dataArray[1])
    s = raw_input("Input the number s of random samplings.\n")
    n = raw_input("Input the number n of nodes to select at each try.\n")
    if not integer.match(s) or not integer.match(n):
        print("\n/!\ ERROR: s and n must both be integers.")
        raise ValueError
    s, n = int(s), int(n)
    #Without at least one draw there is no classification to report
    if s < 1:
        print("\n/!\ ERROR: s must be at least 1.")
        raise ValueError
    numberSamples = len(dataArray[0])
    numberStartingSamples = sanitize(raw_input("Knowing there is/are " + str(numberSamples) + " sample(s), how many samples do you want to create the training set? \n"))
    if not integer.match(numberStartingSamples) or int(numberStartingSamples) > numberSamples:
        print("\n/!\ ERROR: You should write down an integer inferior or equal to " + str(numberSamples) + ".")
        raise ValueError
    numberStartingSamples = int(numberStartingSamples)
    listnodes = dataArray[3].values()
    #State of the best draw seen so far
    bestClassification = []
    bestClassesList = []
    bestClasses = []
    bestValueSet = []
    bestYouden = None
    #Worse value for this coefficient
    currBestYouden = inf
    while s:
        #Randomly draw n distinct nodes among the nodes in the taxonomic tree
        nodesList = randomChoice(listnodes, n)
        assignedClasses, classes, valueSet = classifyIt(dataArray, metadatum, nodesList, numberStartingSamples)
        numberClass = len(classes)
        youdenJ = countYouden(assignedClasses, classes, numberSamples)
        res = numberClass - youdenJ
        if res <= currBestYouden:
            currBestYouden = res
            bestYouden = youdenJ
            #Copies, so that later draws cannot alter the recorded result
            bestClassification = list(nodesList)
            bestClassesList = list(assignedClasses)
            bestClasses = classes
            bestValueSet = valueSet
        s -= 1
    interpretIt(bestYouden)
    answer = raw_input("Do you want to plot the classes obtained as a pie chart? Y/N")
    if answer == "Y":
        #The charts describe the best draw, not the last one
        labels = [metadatum + " = " + str(value) for value in bestValueSet]
        percentagesAs = [len(class1) for class1 in bestClassesList]
        percentages = [len(class1) for class1 in bestClasses]
        plotPieChart(labels, percentagesAs, "Assignments depending on " + listNodes(bestClassification) + " to class for metadatum " + metadatum)
        plotPieChart(labels, percentages, "Real classes depending on " + listNodes(bestClassification) + " for metadatum " + metadatum)
    elif answer != "N":
        print("\n Answer by Y or N!")
    answer = raw_input("Do you want to save the results? Y/N")
    if answer == "Y":
        writeFile("Best Youden's J statistic for this classification is: " + str(bestYouden) + "\nand most relevant list of nodes for this metadatum is:" + str(bestClassification), "Assignments to classes for metadatum " + metadatum)
    elif answer != "N":
        print("\n Answer by Y or N!")
    return bestClassification, bestYouden, bestClassesList
Exemple #7
0
def userNodeSelectionAct(dataArray):
    """Classify the samples from several metadata and a group of nodes typed in by the user.

    The user is asked for a list of metadata (checked against dataArray[1]),
    a group of nodes (checked against the values of dataArray[2]) and the
    size of the training set, which may not exceed len(dataArray[0]). The
    classification is delegated to classifyIt and scored with countYouden.
    The user may then plot the classes with plotPie and/or save the score
    with writeFile.

    Returns the pair (assignedClasses, youdenJ).
    Raises ValueError when the training-set size is rejected by the `integer`
    pattern or is greater than the number of samples.
    """
    print(dataArray[1])
    metadata = parseList(raw_input("Input the metadata that will cluster the set of samples among those written above. [ e.g. " + dataArray[1][0] + ";" + dataArray[1][-1] + " ]\n"))
    isInDatabase(metadata, dataArray[1])
    #@dataArray[2] = idSequences is a dictionary of (key=identifier,value=(name,rank of node))
    listofNodes = dataArray[2].values()
    nodesList = parseListNode(raw_input("Choose the group of nodes you want to consider exclusively. [ Read the taxonomic tree to help you: e.g. " + sanitizeNode(listofNodes[-3]) + ";" + sanitizeNode(listofNodes[1]) + ";" + sanitizeNode(listofNodes[-1]) + " ]\n"))
    isInDatabase(nodesList, listofNodes)
    numberofSamples = len(dataArray[0])
    numberStartingSamples = sanitize(raw_input("Knowing there is/are " + str(numberofSamples) + " sample(s), how many samples do you want to create the training set? \n"))
    #The pattern is tested first so that int() only sees input it accepted
    if not integer.match(numberStartingSamples) or int(numberStartingSamples) > numberofSamples:
        print("\n/!\ ERROR: You should write down an integer less or equal to" + " " + str(numberofSamples) + " " + ".")
        raise ValueError
    numberStartingSamples = int(numberStartingSamples)
    #@shape for @assignedClasses is the same than the one for @classes
    assignedClasses, classes, valueSets = classifyIt(dataArray, metadata, nodesList, numberStartingSamples)
    youdenJ = countYouden(assignedClasses, classes, numberofSamples)
    interpretIt(youdenJ)
    answer = raw_input("Do you want to plot the classes obtained as a pie chart? Y/N\n")
    if answer == "Y":
        #A single label carries the metadata and all their value sets
        labels = ["Metadata: " + str(metadata) + ", Values for each metadatum: " + str(list(valueSets))]
        percentagesAs = assignedClasses.mapMDL(len)
        percentages = classes.mapMDL(len)
        plotPie(labels, percentagesAs, "Assignments depending on " + str(nodesList) + " to class for metadata " + str(metadata))
        plotPie(labels, percentages, "Real classes depending on " + str(nodesList) + " for metadata " + str(metadata))
    elif answer != "N":
        #Any other answer is reported but does not stop the procedure
        print("\n Answer by Y or N!")
    answer = raw_input("Do you want to save the results? Y/N \n")
    if answer == "Y":
        writeFile("Youden's J statistic for this classification is: " + str(youdenJ) + "\n", "Assignments depending on " + listNodes(nodesList) + " to classes for metadata " + str(metadata))
    elif answer != "N":
        print("\n Answer by Y or N!")
    return assignedClasses, youdenJ
Exemple #8
0
def change_name(discord_id, new_name):
    """Set the stored name of the player linked to a Discord account.

    new_name goes through sanitize and the player is looked up with
    get_uuid(discord_id); the matching row of the players table is updated.

    NOTE(review): nothing is committed here - presumably the connection
    autocommits or the caller commits; confirm.
    NOTE(review): the %s placeholders assume a driver using the "format"
    paramstyle (e.g. MySQLdb, psycopg2); confirm against the driver behind db.
    """
    new_name = sanitize(new_name)
    uuid = get_uuid(discord_id)
    # Placeholders stay unquoted: the driver quotes and escapes bound values itself.
    sql = """
        UPDATE players
        SET name=%s
        WHERE uuid=%s
        """
    cur = db.cursor()
    try:
        # DB-API execute() takes the parameters as one sequence, not as extra arguments.
        cur.execute(sql, (new_name, uuid))
    finally:
        cur.close()
Exemple #9
0
def clusteringAct(dataArray):
    """Cluster the samples with two successive kMeans passes and compare the result to a metadatum.

    Steps, all interactive:
    1. the user picks a metadatum (checked against dataArray[1]);
    2. partitionSampleByMetadatumValue yields the reference clusters, one per
       value of the metadatum, and the first sample of each seeds a cluster;
    3. kMeans runs with dataArray[8], cleanClusters discards some samples,
       then kMeans runs again with dataArray[9];
    4. the clusters are printed and compared pairwise to the reference
       clusters with compareCluster;
    5. on request the clusters are saved with writeFile, optionally with the
       common nodes of each cluster and a graph of the clusters.

    Returns nothing.
    Raises ValueError when one of the consistency checks on the number or
    size of clusters fails.
    """
    print dataArray[1]
    metadatum = sanitize(
        raw_input(
            "Select the metadatum among those above to cluster the set of samples. [e.g. "
            + dataArray[1][0] + "]\n")).split(";")[0]
    isInDatabase([metadatum], dataArray[1])
    valueSet, clusters1 = partitionSampleByMetadatumValue(
        metadatum, dataArray[1], dataArray[0])
    #Keeps only the first component of each sample (presumably its identifier - TODO confirm)
    clusters = [[sample[0] for sample in cluster] for cluster in clusters1]
    #that is, k in K-means Algorithm
    numberClass = len(valueSet)
    print "/!\ Number of classes:", numberClass, "."
    startSet = [cluster[0] for cluster in clusters]
    #Selects the starting samples of each cluster
    kClusters = [[start] for start in startSet]
    if not (len(clusters) == numberClass):
        print "\n/!\ ERROR: Different lengths: numberClass", numberClass, "clusters:", len(
            clusters), "."
        raise ValueError
    trimmedList = trimList(dataArray[3], startSet)
    print "/!\ Clustering with the first distance..."
    #@distanceInClusters is a list of lists of (sample,sum of all distances from this sample to others samples in the same cluster)
    #@dataArray[8] = distMatchedDict
    kClusters, meanSamples, distanceDict, distanceInClusters = kMeans(
        trimmedList, numberClass, kClusters, startSet, dataArray[8], dataArray)
    print "-- End of first clustering --"
    #Counts the samples over all clusters: after the first pass every element of dataArray[3] must be assigned
    number = 0
    for cluster in kClusters:
        for _ in cluster:
            number += 1
    if not (number == len(dataArray[3])):
        print "\n/!\ ERROR: A bug occurred during the clustering:", number, "=/=", len(
            dataArray[3]), "."
        raise ValueError
    #Deletes samples in cluster that are too far from the others
    kClusters, untaken = cleanClusters(kClusters, distanceInClusters)
    #Same expression as the first computation of startSet above
    startSet = [cluster[0] for cluster in clusters]
    #Remove from untaken the starting samples
    untaken2 = []
    for x in untaken:
        if not (x in startSet):
            untaken2.append(x)
    untaken = untaken2
    #Remove the samples in untaken from the total set of samples
    #(sampleSet gathers, without duplicates, the samples left in the clusters plus the starting samples)
    sampleSet = []
    for cluster in kClusters:
        for x in cluster:
            if not (x in sampleSet):
                sampleSet.append(x)
    for x in startSet:
        if not (x in sampleSet):
            sampleSet.append(x)
    trimmedList = trimList(sampleSet, startSet)
    print "/!\ Clustering with the second distance..."
    #@distanceDict is the distance dictionary (key=(sample1,sample2),value=distance between sample1 and sample2)
    #@dataArray[9] = distConsensusDict
    kClusters, meanSamples, distanceDict, _ = kMeans(trimmedList, numberClass,
                                                     kClusters, startSet,
                                                     dataArray[9],
                                                     dataArray)  #,meanSamples)
    print "-- End of second clustering --"
    #After cleaning, the clusters may hold fewer samples than dataArray[3], never more
    number = 0
    for cluster in kClusters:
        for _ in cluster:
            number += 1
    if not (number <= len(dataArray[3])):
        print "\n/!\ ERROR: An error occurred during the clustering:", number, ">", len(
            dataArray[3]), "."
        raise ValueError
    print "Printing the", numberClass, "clusters:"
    i = 1
    #@kClusters contains the list of the k clusters. Each cluster is a list of sample IDs
    for cluster in kClusters:
        print "\n-- Cluster #", i, "associated to", metadatum, "=", valueSet[
            i - 1], ":"
        print "Size:", len(cluster)
        print sorted(cluster)
        i += 1
    print "\nScore of the clustering (comprised between 0 and 1):"
    print "The more it is close to 1, the more the clustering is relevant."
    #The clustering obtained with the K-Means method
    #(shallow copy: the loop below pops from it without emptying kClusters)
    kClustersCopy = [cluster for cluster in kClusters]
    #The clustering obtained by comparing the values of the metadatum
    clustersCopy = [cluster for cluster in clusters]
    #Score by using first method of comparison
    compareClusterScore = 0
    if not (len(kClustersCopy) == numberClass == len(clustersCopy)):
        print "\n/!\ ERROR: Length error in clustering:", numberClass, len(
            kClustersCopy), len(clustersCopy), "."
        raise ValueError
    #Clusters are compared pairwise, from the last one to the first one
    while kClustersCopy and clustersCopy:
        cl1 = kClustersCopy.pop()
        cl2 = clustersCopy.pop()
        #clusters are non-empty
        x = compareCluster(cl1, cl2, untaken)
        #NOTE(review): a falsy result (None, but also 0) discards the whole score - confirm this is intended
        if x:
            compareClusterScore += x
        else:
            compareClusterScore = None
            break
    if compareClusterScore:
        #NOTE(review): under Python 2 this is an integer division if both operands are integers - confirm compareCluster returns floats
        compareClusterScore = compareClusterScore / numberClass
        printClusterScore = compareClusterScore
    else:
        printClusterScore = "None"
    #Score by using second method of comparison
    #compareCentersScore = compareCenters(meanSamples,distanceDict,numberClass)
    print "Compare clusters score is:", printClusterScore, "."
    #print "Compare centers score is:",compareCentersScore,"."
    answer = raw_input("Do you want to save the results? Y/N\n")
    if (answer == "Y"):
        answer2 = raw_input(
            "Do you want to compute the sets of common nodes for each cluster? [It can be considered relevant when the score of comparing clusters is at least over 0.5] Y/N\n"
        )
        if (answer2 == "Y"):
            commonList = extractCommonNodes(kClusters, dataArray)
        elif not (answer2 == "N"):
            print "\n/!\ You should answer Y or N, not:", answer2, "."
        #Builds the text of the report, one section per cluster
        data = "**** CLUSTERS FOR METADATUM " + metadatum + " WITH VALUES: " + str(
            valueSet)
        i = 0
        for cluster in kClusters:
            data += "\n\n-- Cluster #" + str(
                i + 1) + " associated to " + metadatum + " = " + str(
                    valueSet[i])
            data += "\nSize: " + str(len(cluster))
            #commonList only exists when the user asked for the common nodes
            if (answer2 == "Y"):
                data += "\nSet of common nodes: " + str(commonList[i])
            data += "\n" + str(cluster)
            i += 1
        data += "\n\nCompare clusters score is: " + str(compareClusterScore)
        #data += "\n\nCompare centers score is: " + str(compareCentersScore)
        data += "\n\nEND OF FILE ****"
        print "\n/!\ Saving clusters..."
        writeFile(data)
        answer2 = raw_input(
            "Do you want to compute the graph of the clusters? Y/N\n")
        if (answer2 == "Y"):
            print "\n/!\ Constructing the graph of the clusters..."
            #@dataArray[3] = filenames
            graph = convertClustersIntoGraph(kClusters, distanceDict,
                                             len(dataArray[3]))
            graphNO(graph)
            print "\n/!\ Done. The graph is in DOT format in \"files\" folder."
        elif not (answer2 == "N"):
            print "\n/!\ You should answer Y or N, not:", answer2, "."
    elif not (answer == "N"):
        print "/!\ You should answer by Y or N."
Exemple #10
0
def clusteringAct(dataArray):
    """Cluster the samples with two successive kMeans passes and compare the result to a metadatum.

    Steps, all interactive:
    1. the user picks a metadatum (checked against dataArray[1]);
    2. partitionSampleByMetadatumValue yields the reference clusters, one per
       value of the metadatum, and the first sample of each seeds a cluster;
    3. kMeans runs with dataArray[8], cleanClusters discards some samples,
       then kMeans runs again with dataArray[9];
    4. the clusters are printed and compared pairwise to the reference
       clusters with compareCluster;
    5. on request the clusters are saved with writeFile, optionally with the
       common nodes of each cluster and a graph of the clusters.

    Returns nothing.
    Raises ValueError when one of the consistency checks on the number or
    size of clusters fails.
    """
    print dataArray[1]
    metadatum = sanitize(raw_input("Select the metadatum among those above to cluster the set of samples. [e.g. " + dataArray[1][0] + "]\n")).split(";")[0]
    isInDatabase([metadatum],dataArray[1])
    valueSet,clusters1 = partitionSampleByMetadatumValue(metadatum,dataArray[1],dataArray[0])
    #Keeps only the first component of each sample (presumably its identifier - TODO confirm)
    clusters = [[sample[0] for sample in cluster] for cluster in clusters1]
    #that is, k in K-means Algorithm
    numberClass = len(valueSet)
    print "/!\ Number of classes:",numberClass,"."
    startSet = [cluster[0] for cluster in clusters]
    #Selects the starting samples of each cluster
    kClusters = [[start] for start in startSet]
    if not (len(clusters) == numberClass):
        print "\n/!\ ERROR: Different lengths: numberClass",numberClass,"clusters:",len(clusters),"."
        raise ValueError
    trimmedList = trimList(dataArray[3],startSet)
    print "/!\ Clustering with the first distance..."
    #@distanceInClusters is a list of lists of (sample,sum of all distances from this sample to others samples in the same cluster)
    #@dataArray[8] = distMatchedDict
    kClusters,meanSamples,distanceDict,distanceInClusters = kMeans(trimmedList,numberClass,kClusters,startSet,dataArray[8],dataArray)
    print "-- End of first clustering --"
    #Counts the samples over all clusters: after the first pass every element of dataArray[3] must be assigned
    number = 0
    for cluster in kClusters:
        for _ in cluster:
            number += 1
    if not (number == len(dataArray[3])):
        print "\n/!\ ERROR: A bug occurred during the clustering:",number,"=/=",len(dataArray[3]),"."
        raise ValueError
    #Deletes samples in cluster that are too far from the others
    kClusters,untaken = cleanClusters(kClusters,distanceInClusters)
    #Same expression as the first computation of startSet above
    startSet = [cluster[0] for cluster in clusters]
    #Remove from untaken the starting samples
    untaken2 = []
    for x in untaken:
        if not (x in startSet):
            untaken2.append(x)
    untaken = untaken2
    #Remove the samples in untaken from the total set of samples
    #(sampleSet gathers, without duplicates, the samples left in the clusters plus the starting samples)
    sampleSet = []
    for cluster in kClusters:
        for x in cluster:
            if not (x in sampleSet):
                sampleSet.append(x)
    for x in startSet:
        if not (x in sampleSet):
            sampleSet.append(x)
    trimmedList = trimList(sampleSet,startSet)
    print "/!\ Clustering with the second distance..."
    #@distanceDict is the distance dictionary (key=(sample1,sample2),value=distance between sample1 and sample2)
    #@dataArray[9] = distConsensusDict
    kClusters,meanSamples,distanceDict,_ = kMeans(trimmedList,numberClass,kClusters,startSet,dataArray[9],dataArray)#,meanSamples)
    print "-- End of second clustering --"
    #After cleaning, the clusters may hold fewer samples than dataArray[3], never more
    number = 0
    for cluster in kClusters:
        for _ in cluster:
            number += 1
    if not (number <= len(dataArray[3])):
        print "\n/!\ ERROR: An error occurred during the clustering:",number,">",len(dataArray[3]),"."
        raise ValueError
    print "Printing the",numberClass,"clusters:"
    i = 1
    #@kClusters contains the list of the k clusters. Each cluster is a list of sample IDs
    for cluster in kClusters:
        print "\n-- Cluster #",i,"associated to",metadatum,"=",valueSet[i-1],":"
        print "Size:",len(cluster)
        print sorted(cluster)
        i += 1
    print "\nScore of the clustering (comprised between 0 and 1):"
    print "The more it is close to 1, the more the clustering is relevant."
    #The clustering obtained with the K-Means method
    #(shallow copy: the loop below pops from it without emptying kClusters)
    kClustersCopy = [cluster for cluster in kClusters]
    #The clustering obtained by comparing the values of the metadatum
    clustersCopy = [cluster for cluster in clusters]
    #Score by using first method of comparison
    compareClusterScore = 0
    if not (len(kClustersCopy) == numberClass == len(clustersCopy)):
        print "\n/!\ ERROR: Length error in clustering:",numberClass,len(kClustersCopy),len(clustersCopy),"."
        raise ValueError
    #Clusters are compared pairwise, from the last one to the first one
    while kClustersCopy and clustersCopy:
        cl1 = kClustersCopy.pop()
        cl2 = clustersCopy.pop()
        #clusters are non-empty
        x = compareCluster(cl1,cl2,untaken)
        #NOTE(review): a falsy result (None, but also 0) discards the whole score - confirm this is intended
        if x:
            compareClusterScore += x
        else:
            compareClusterScore = None
            break
    if compareClusterScore:
        #NOTE(review): under Python 2 this is an integer division if both operands are integers - confirm compareCluster returns floats
        compareClusterScore = compareClusterScore/numberClass
        printClusterScore = compareClusterScore
    else:
        printClusterScore = "None"
    #Score by using second method of comparison
    #compareCentersScore = compareCenters(meanSamples,distanceDict,numberClass)
    print "Compare clusters score is:",printClusterScore,"."
    #print "Compare centers score is:",compareCentersScore,"."
    answer = raw_input("Do you want to save the results? Y/N\n")
    if (answer == "Y"):
        answer2 = raw_input("Do you want to compute the sets of common nodes for each cluster? [It can be considered relevant when the score of comparing clusters is at least over 0.5] Y/N\n")
        if (answer2 == "Y"):
            commonList = extractCommonNodes(kClusters,dataArray)
        elif not (answer2 == "N"):
            print "\n/!\ You should answer Y or N, not:",answer2,"."
        #Builds the text of the report, one section per cluster
        data = "**** CLUSTERS FOR METADATUM " + metadatum + " WITH VALUES: " + str(valueSet)
        i = 0
        for cluster in kClusters:
            data += "\n\n-- Cluster #" + str(i+1) + " associated to " + metadatum + " = " + str(valueSet[i])
            data += "\nSize: " + str(len(cluster))
            #commonList only exists when the user asked for the common nodes
            if (answer2 == "Y"):
                data += "\nSet of common nodes: " + str(commonList[i])
            data += "\n" + str(cluster)
            i += 1
        data += "\n\nCompare clusters score is: " + str(compareClusterScore)
        #data += "\n\nCompare centers score is: " + str(compareCentersScore)
        data += "\n\nEND OF FILE ****"
        print "\n/!\ Saving clusters..."
        writeFile(data)
        answer2 = raw_input("Do you want to compute the graph of the clusters? Y/N\n")
        if (answer2 == "Y"):
            print "\n/!\ Constructing the graph of the clusters..."
            #@dataArray[3] = filenames
            graph = convertClustersIntoGraph(kClusters,distanceDict,len(dataArray[3]))
            graphNO(graph)
            print "\n/!\ Done. The graph is in DOT format in \"files\" folder."
        elif not (answer2 == "N"):
            print "\n/!\ You should answer Y or N, not:",answer2,"."
    elif not (answer == "N"):
        print "/!\ You should answer by Y or N."
Exemple #11
0
def getNameRankList(string):
    """Turn a "rank:name" string into the one-element list [(name, rank)].

    The name is what follows the last colon and the rank what precedes the
    first one; both go through sanitize and are cut at their first newline.
    """
    parts = string.split(":")
    name = sanitize(parts[-1]).split("\n")[0]
    rank = sanitize(parts[0]).split("\n")[0]
    return [(name, rank)]
def randomSubSamplingAct(dataArray):
    """Look for the group of nodes that best classifies the samples, by random sub-sampling.

    The user chooses one metadatum (checked against dataArray[1]), the number
    s of random draws, the number n of nodes per draw and the size of the
    training set. At each draw, n nodes are picked with randomChoice among the
    values of dataArray[3], the samples are classified with classifyIt and the
    result is scored with countYouden. The draw minimising
    (number of classes - Youden's J) is kept; on ties the latest draw wins.
    The user may then plot the best draw as pie charts and/or save its score.

    Returns (bestClassification, bestYouden, bestClassesList): the nodes of
    the best draw, its Youden's J and its assigned classes.
    Raises ValueError when s or n is rejected by the `integer` pattern, when
    s is lower than 1, or when the training-set size is not an accepted
    integer or exceeds the number of samples.
    """
    print(dataArray[1])
    metadatum = sanitize(
        raw_input(
            "Input the metadatum that will cluster the set of samples among those written above. [ e.g. "
            + dataArray[1][0] + " ]\n")).split(";")[0]
    isInDatabase([metadatum], dataArray[1])
    s = raw_input("Input the number s of random samplings.\n")
    n = raw_input("Input the number n of nodes to select at each try.\n")
    if not integer.match(s) or not integer.match(n):
        print("\n/!\ ERROR: s and n must both be integers.")
        raise ValueError
    s, n = int(s), int(n)
    #Without at least one draw there is no classification to report
    if s < 1:
        print("\n/!\ ERROR: s must be at least 1.")
        raise ValueError
    numberSamples = len(dataArray[0])
    numberStartingSamples = sanitize(
        raw_input(
            "Knowing there is/are " + str(numberSamples) +
            " sample(s), how many samples do you want to create the training set? \n"
        ))
    if not integer.match(numberStartingSamples) or int(
            numberStartingSamples) > numberSamples:
        print("\n/!\ ERROR: You should write down an integer inferior or equal to " + str(numberSamples) + ".")
        raise ValueError
    numberStartingSamples = int(numberStartingSamples)
    listnodes = dataArray[3].values()
    #State of the best draw seen so far
    bestClassification = []
    bestClassesList = []
    bestClasses = []
    bestValueSet = []
    bestYouden = None
    #Worse value for this coefficient
    currBestYouden = inf
    while s:
        #Randomly draw n distinct nodes among the nodes in the taxonomic tree
        nodesList = randomChoice(listnodes, n)
        assignedClasses, classes, valueSet = classifyIt(
            dataArray, metadatum, nodesList, numberStartingSamples)
        numberClass = len(classes)
        youdenJ = countYouden(assignedClasses, classes, numberSamples)
        res = numberClass - youdenJ
        if res <= currBestYouden:
            currBestYouden = res
            bestYouden = youdenJ
            #Copies, so that later draws cannot alter the recorded result
            bestClassification = list(nodesList)
            bestClassesList = list(assignedClasses)
            bestClasses = classes
            bestValueSet = valueSet
        s -= 1
    interpretIt(bestYouden)
    answer = raw_input(
        "Do you want to plot the classes obtained as a pie chart? Y/N")
    if answer == "Y":
        #The charts describe the best draw, not the last one
        labels = [metadatum + " = " + str(value) for value in bestValueSet]
        percentagesAs = [len(class1) for class1 in bestClassesList]
        percentages = [len(class1) for class1 in bestClasses]
        plotPieChart(
            labels, percentagesAs, "Assignments depending on " +
            listNodes(bestClassification) + " to class for metadatum " +
            metadatum)
        plotPieChart(
            labels, percentages, "Real classes depending on " +
            listNodes(bestClassification) + " for metadatum " + metadatum)
    elif answer != "N":
        print("\n Answer by Y or N!")
    answer = raw_input("Do you want to save the results? Y/N")
    if answer == "Y":
        writeFile(
            "Best Youden's J statistic for this classification is: " +
            str(bestYouden) +
            "\nand most relevant list of nodes for this metadatum is:" +
            str(bestClassification),
            "Assignments to classes for metadatum " + metadatum)
    elif answer != "N":
        print("\n Answer by Y or N!")
    return bestClassification, bestYouden, bestClassesList
Exemple #13
0
def randomSubSamplingAct(dataArray):
    """Look for the group of nodes that best classifies the samples for several metadata, by random sub-sampling.

    The user chooses a list of metadata (checked against dataArray[1]), the
    number s of random draws, the number n of nodes per draw and the size of
    the training set. At each draw, n nodes are picked with randomChoice
    among the values of dataArray[2], the samples are classified with
    classifyIt and the result is scored with countYouden. The draw minimising
    (number of classes - Youden's J) is kept; on ties the latest draw wins.
    The user may then plot the best draw with plotPie and/or save its score.

    Returns (bestClassification, bestYouden, bestClassesList): the nodes of
    the best draw, its Youden's J and its assigned classes as a list.
    Raises ValueError when s or n is rejected by the `integer` pattern, when
    s is lower than 1, or when the training-set size is not an accepted
    integer or exceeds the number of samples.
    """
    print(dataArray[1])
    metadata = parseList(raw_input("Input the metadata that will cluster the set of samples among those written above. [ e.g. " + dataArray[1][0] + ";" + dataArray[1][-1] + " ]\n"))
    isInDatabase(metadata, dataArray[1])
    s = raw_input("Input the number s of random samplings.\n")
    n = raw_input("Input the number n of nodes to select at each try.\n")
    numberofSamples = len(dataArray[0])
    if not integer.match(s) or not integer.match(n):
        print("\n/!\ ERROR: s and n must both be integers.")
        raise ValueError
    s, n = int(s), int(n)
    #Without at least one draw there is no classification to report
    if s < 1:
        print("\n/!\ ERROR: s must be at least 1.")
        raise ValueError
    numberStartingSamples = sanitize(raw_input("Knowing there is/are " + str(numberofSamples) + " sample(s), how many samples do you want to create the training set?\n"))
    if not integer.match(numberStartingSamples) or int(numberStartingSamples) > numberofSamples:
        print("\n/!\ ERROR: You should write down an integer less or equal to" + " " + str(numberofSamples) + " " + ".")
        raise ValueError
    numberStartingSamples = int(numberStartingSamples)
    #State of the best draw seen so far
    bestClassification = []
    bestClassesList = []
    bestValuesList = []
    bestAssignedClasses = None
    bestClasses = None
    bestYouden = None
    #Worse value for this coefficient
    currBestYouden = inf
    #@dataArray[2] = idSequences, which is a dictionary of (key=identifier,values=(name,rank of node))
    listofNodes = dataArray[2].values()
    while s:
        #Randomly draw n distinct nodes among the nodes in the taxonomic tree
        nodesList = randomChoice(listofNodes, n)
        assignedClasses, classes, valueSets = classifyIt(dataArray, metadata, nodesList, numberStartingSamples)
        #Called without argument, as in userNodeSelectionAct: no shape is available here
        numberClass = classes.lenMDL()
        youdenJ = countYouden(assignedClasses, classes, numberofSamples)
        res = numberClass - youdenJ
        if res <= currBestYouden:
            currBestYouden = res
            bestYouden = youdenJ
            #Copies, so that later draws cannot alter the recorded result
            bestValuesList = list(valueSets)
            bestClassification = list(nodesList)
            bestClassesList = list(assignedClasses)
            #The original objects are kept as well: plotting relies on their mapMDL method
            bestAssignedClasses = assignedClasses
            bestClasses = classes
        s -= 1
    interpretIt(bestYouden)
    answer = raw_input("Do you want to plot the classes obtained as a pie chart? Y/N\n")
    if answer == "Y":
        #The charts describe the best draw, not the last one
        labels = ["Metadata: " + str(metadata) + ", Values for each metadatum: " + str(bestValuesList)]
        percentagesAs = bestAssignedClasses.mapMDL(len)
        percentages = bestClasses.mapMDL(len)
        plotPie(labels, percentagesAs, "Assignments depending on " + str(bestClassification) + " to class for metadata " + str(metadata))
        plotPie(labels, percentages, "Real classes depending on " + str(bestClassification) + " for metadata " + str(metadata))
    elif answer != "N":
        print("\n Answer by Y or N!")
    answer = raw_input("Do you want to save the results? Y/N \n")
    if answer == "Y":
        writeFile("Best Youden's J statistic for this classification is: " + str(bestYouden) + "\nand most relevant list of nodes for this set of metadata is:" + str(bestClassification), "Assignments to classes for metadata " + str(metadata))
    elif answer != "N":
        print("\n Answer by Y or N!")
    return bestClassification, bestYouden, bestClassesList