def totalDiffRatioAct(dataArray): print "First list of samples." sampleNameList1,metadataList1,interval1List1,interval2List1 = createSampleNameList(dataArray) print "Second list of samples." sampleNameList2,metadataList2,interval1List2,interval2List2 = createSampleNameList(dataArray) common,in1,in2,numberA1,numberA2,_,_,_ = compute(dataArray[7],sampleNameList1,sampleNameList2) commonA = countAssignmentsInCommon(common,sampleNameList1,sampleNameList2) tratio = totalRatio(commonA,numberA1,numberA2) ntRatio = totalRatioNormalized(commonA,numberA1,numberA2) dratio = diffRatio(commonA) ndRatio = diffRatioNormalized(commonA,numberA1,numberA2) print "\nTotal Ratio Distance is: " + str(tratio) print "normalized Total Ratio is: " + str(ntRatio) + "\n[The more it is close to 1, the more the two groups are alike]\n" print "Diff Ratio Distance is: " + str(dratio) print "normalized Diff Ratio is: " + str(ndRatio) + "\n[The more it is close to 0, the more the two groups are alike]\n" print "[If you have obtained +inf (resp. -inf), it could mean you have selected no sample.]\n" answer = raw_input("Save the results? Y/N\n") if (answer == "Y"): data = "Total Ratio Results ****\n for " + str(sampleNameList1) + "\n" if metadataList1: data += "selected on metadata: " + str(metadataList1) + " with extremum values: " + str(interval1List1) + " (lower bounds) and " + str(interval2List1) + " (upper bounds) \n" data += " and " + str(sampleNameList2) + "\n" if metadataList2: data += "selected on metadata: " + str(metadataList2) + " with extremum values: " + str(interval1List2) + " (lower bounds) and " + str(interval2List2) + " (upper bounds) \n" data += "\nTotal Ratio Distance is: " + str(tratio) + "\n normalized Total Ratio is: " + str(ntRatio) + "\nDiff Ratio Distance is: " + str(dratio) + "\n normalized Diff Ratio is: " + str(ndRatio) +"\n\nEND OF FILE ****" writeFile(data,"","text") elif not (answer == "N"): print "/!\ You should answer 'Y' or 'N'!"
def userNodeSelectionAct(dataArray): print dataArray[1] metadata = parseList(raw_input("Input the metadata that will cluster the set of samples among those written above. [ e.g. " + dataArray[1][0] + ";" + dataArray[1][-1] + " ]\n")) isInDatabase(metadata,dataArray[1]) #@dataArray[2] = idSequences is a dictionary of (key=identifier,value=(name,rank of node)) listofNodes = dataArray[2].values() nodesList = parseListNode(raw_input("Choose the group of nodes you want to consider exclusively. [ Read the taxonomic tree to help you: e.g. " + sanitizeNode(listofNodes[-3]) + ";" + sanitizeNode(listofNodes[1]) + ";" + sanitizeNode(listofNodes[-1]) + " ]\n")) isInDatabase(nodesList,listofNodes) numberofSamples = len(dataArray[0]) numberStartingSamples = sanitize(raw_input("Knowing there is/are " + str(numberofSamples) + "sample(s), how many samples do you want to create the training set? \n")) x = integer.match(numberStartingSamples) if not x or (x and (int(numberStartingSamples) > numberofSamples)): print "\n/!\ ERROR: You should write down an integer less or equal to",numberofSamples,"." raise ValueError numberStartingSamples = int(numberStartingSamples) #@shape for @assignedClasses is the same than the one for @classes assignedClasses,classes,valueSets = classifyIt(dataArray,metadata,nodesList,numberStartingSamples) numberClass = classes.lenMDL() youdenJ = countYouden(assignedClasses,classes,numberofSamples) interpretIt(youdenJ) answer = raw_input("Do you want to plot the classes obtained as a pie chart? 
Y/N\n") if answer == "Y": labels = [ "Metadata: " + str(metadata) + ", Values for each metadatum: " + str([ valueSet for valueSet in valueSets]) ] percentagesAs = assignedClasses.mapMDL(len) percentages = classes.mapMDL(len) plotPie(labels,percentagesAs,"Assignments depending on " + str(nodesList) + " to class for metadata " + str(metadata)) plotPie(labels,percentages,"Real classes depending on " + str(nodesList) + " for metadata " + str(metadata)) elif not (answer == "N"): print "\n Answer by Y or N!" answer = raw_input("Do you want to save the results? Y/N \n") if (answer == "Y"): writeFile("Youden's J statistic for this classification is: " + str(youdenJ) + "\n","Assignments depending on " + listNodes(nodesList) + " to classes for metadata " + str(metadata)) elif not (answer == "N"): print "\n Answer by Y or N!" return assignedClasses,youdenJ
def userNodeSelectionAct(dataArray): print dataArray[1] metadatum = sanitize(raw_input("Input the metadatum that will cluster the set of samples among those written above. [ e.g. " + dataArray[1][0] + " ]\n")).split(";")[0] isInDatabase([metadatum],dataArray[1]) listnodes = dataArray[3].values() nodesList = parseListNode(raw_input("Choose the group of nodes you want to consider exclusively. [ Read the taxonomic tree to help you: e.g. " + sanitizeNode(listnodes[-3]) + ";" + sanitizeNode(listnodes[1]) + ";" + sanitizeNode(listnodes[-1]) + " ]\n")) isInDatabase(nodesList,listnodes) numberSamples = len(dataArray[0]) numberStartingSamples = sanitize(raw_input("Knowing there is/are " + str(numberSamples) + " sample(s), how many samples do you want to create the training set?\n")) x = integer.match(numberStartingSamples) if not x or (x and int(numberStartingSamples) > numberSamples) : print "\n/!\ ERROR: You should write down an integer inferior or equal to ",numberSamples,"." raise ValueError numberStartingSamples = int(numberStartingSamples) assignedClasses,classes,valueSet = classifyIt(dataArray,metadatum,nodesList,numberStartingSamples) numberClass = len(classes) youdenJ = countYouden(assignedClasses,classes,numberStartingSamples) interpretIt(youdenJ) answer = raw_input("Do you want to plot the classes obtained as a pie chart? Y/N") if answer == "Y": labels = [ metadatum + " = " + str(value) for value in valueSet ] percentagesAs = [ len(class1) for class1 in assignedClasses ] percentages = [ len(class1) for class1 in classes ] plotPieChart(labels,percentagesAs,"Assignments depending on " + str(nodesList) + " to class for metadatum " + metadatum) plotPieChart(labels,percentages,"Real classes depending on " + str(nodesList) + " for metadatum " + metadatum) elif not (answer == "N"): print "\n Answer by Y or N!" answer = raw_input("Do you want to save the results? 
Y/N") if (answer == "Y"): writeFile("Youden's J statistic for this classification is: " + str(youdenJ) + "\n","Assignments depending on " + listNodes(nodesList) + " to classes for metadatum " + metadatum) elif not (answer == "N"): print "\n Answer by Y or N!" return assignedClasses,youdenJ
def getValueBacteriaMetadata(samplesInfoList,infoList,bacterias,sampleIDList,samplesOccList,speciesList,metadatum): xArray = [] #Stores the positions of number of assignments to each bacteria of the group in the occurrences matrix bacteriaPos = getPositionBacteria(bacterias,speciesList) valueSet,valueSampleMetadatum = partitionSampleByMetadatumValue(metadatum,infoList,samplesInfoList) #Integer values of metadatum are sorted yArray = sorted(valueSet,key=lambda x:x[1]) #For every different value of the metadatum for sampleValueList in valueSampleMetadatum: #gets the number of assignments to bacterias which positions are in bacteriaPos depending on the value of the metadatum (any sample in sampleValueList has the same value of the metadatum) res = getValueBacteria(samplesOccList,speciesList,sampleIDList,bacteriaPos,sampleValueList,True) if res: xArray.append(res) else: for sample in sampleValueList: xArray.append((sample,0)) print "\n[Preview.]" print "\n--- Number of assignments to the group of bacterias",bacterias,"in samples depending on the",len(xArray),"value(s) of metadatum",metadatum print xArray print "\n--- Set of values of metadatum",metadatum,"of length",len(yArray) string = "" for x in yArray[:-1]: string += str(x[1]) + ", " print (string + str(yArray[-1][1])) answer = raw_input("\nWrite both Bacteria and Metadatum files? Y/N\n") if (answer == "Y"): print "Saving first file" writeFile(xArray,"**** Values of assignments in samples samples depending on the value of metadatum" + str(metadatum) + "of nodes: " + str(bacterias) + "\n\n","array") print "Saving second file" writeFile(yArray,"**** Values of metadatum: " + str(metadatum) + "\n\n","array") elif not (answer == "N"): print "/!\ You should answer 'Y' or 'N'!" return xArray,yArray
def getValueBacteriaBacteria(samplesOccList,speciesList,sampleIDList,bacterias1,bacterias2): xArray,yArray = [],[] #Stores the positions of number of assignments to each bacteria of the group in the occurrences matrix bacteriaPos1 = getPositionBacteria(bacterias1,speciesList) bacteriaPos2 = getPositionBacteria(bacterias2,speciesList) for sample in sampleIDList: #gets the number of assignments to the group of bacterias in the sample res = getValueBacteria(samplesOccList,speciesList,sampleIDList,bacteriaPos1,[sample]) #Due to the fact some samples are in info matrix and not in occurrence matrix, getValueBacteria may return an empty list if res: xArray += res else: xArray.append((sample,0)) res = getValueBacteria(samplesOccList,speciesList,sampleIDList,bacteriaPos2,[sample]) if res: #Contains only one element, since getValueBacteria was applied to only one element yArray += res else: yArray.append((sample,0)) print "\n[Preview.]" print "\n--- Number of assignments to the group of bacterias",bacterias1,"in all",len(xArray),"samples" print xArray print "\n--- Number of assignments to the group of bacterias",bacterias2,"in all",len(xArray),"samples" print yArray answer = raw_input("\nWrite both Bacteria files? Y/N\n") if (answer == "Y"): print "Saving first file..." writeFile(xArray,"**** Values of assignments in all samples of nodes: " + str(bacterias1) + "\n\n","array") print "Saving second file..." writeFile(yArray,"**** Values of assignments in all samples of nodes: " + str(bacterias2) + "\n\n","array") elif not (answer == "N"): print "/!\ You should answer 'Y' or 'N'!" return xArray,yArray
def distanceAct(dataArray): answer = raw_input("Import matrix? Y/N\n") if answer == "Y": filename = raw_input("Write down the file name where the matrix is stored [ without the extension .taxotree ].\n") matrix = importMatrix(filename) else: if not (answer == "N"): print "/!\ You should answer 'Y' or 'N'!" print "/!\ Computing similarity matrix..." print "[ You may have to wait for a few minutes... ]" matrix = computeSimilarity(dataArray) print "[Preview.]" print matrix answer = raw_input("Save the results? Y/N\n") if (answer == "Y"): writeFile(m,"Similarity coefficients between patients using previous calculi on total ratio, pattern ratio and diversity coefficient\n\nNota Bene: 1e+14 stands for +inf\n","array") elif not (answer == "N"): print "\n/!\ You should answer 'Y' or 'N'!" answer = raw_input("Compute the most different groups of samples? Y/N\n") if (answer == "Y"): answer = raw_input("Do you want to select samples by metadata or to select all samples? metadata/all") if (answer == "metadata"): print dataArray[1] metadatum = parseList(raw_input("Choose the metadatum among those printed above [ e.g. " + dataArray[1][0] + ";" + dataArray[1][-1] + " ]\n")) isInDatabase(metadatum,dataArray[1]) _,valueSampleMetadatum = partitionSampleByMetadatumValue(metadatum[0],dataArray[1],dataArray[0]) valueSampleMetadatumNameOnly = [] for sampleGroup in valueSampleMetadatum: sampleGroupNameOnly = [] for sample in sampleGroup: sampleGroupNameOnly.append(sample[0]) valueSampleMetadatumNameOnly.append(sampleGroupNameOnly) pairsList = mostDifferentSamplesGroups(matrix,dataArray[8],valueSampleMetadatumNameOnly) if (answer == "all"): pairsList = mostDifferentSamplesGroups(matrix,dataArray[8],[[sample] for sample in dataArray[8]]) else: print "\n/!\ ERROR: You should answer 'metadata' or 'all'." raise ValueError print "[ Preview. 
]" print "List of the pairs of most different sample groups according to the similarity coefficients computed:" for pair in pairsList: print pair answer2 = raw_input("\nSave the results? Y/N\n") if (answer2 == "Y"): stringPairs = "" for pair in pairsList: stringPairs += "*" + str(pair) + "\n" if (answer == "metadata"): stringSamples = "" for group in valueSampleMetadatumNameOnly: stringSamples += "*" + str(group) + "\n" data = "Most different groups of samples ****\nsorted by values of metadatum: " + metadatum[0] + "\nGroups were:\n\n" + stringSamples + "\n\nAnd the most different ones are:\n\n" + stringPairs + "\n\nEND OF FILE ****" else: data = "Most different groups of samples ****\n\nThe most different ones are:\n\n" + stringPairs + "\n\nEND OF FILE ****" writeFile(data,"","text") elif not (answer2 == "N"): print "/!\ You should answer 'Y' or 'N'!"
def userNodeSelectionAct(dataArray): print dataArray[1] metadatum = sanitize( raw_input( "Input the metadatum that will cluster the set of samples among those written above. [ e.g. " + dataArray[1][0] + " ]\n")).split(";")[0] isInDatabase([metadatum], dataArray[1]) listnodes = dataArray[3].values() nodesList = parseListNode( raw_input( "Choose the group of nodes you want to consider exclusively. [ Read the taxonomic tree to help you: e.g. " + sanitizeNode(listnodes[-3]) + ";" + sanitizeNode(listnodes[1]) + ";" + sanitizeNode(listnodes[-1]) + " ]\n")) isInDatabase(nodesList, listnodes) numberSamples = len(dataArray[0]) numberStartingSamples = sanitize( raw_input( "Knowing there is/are " + str(numberSamples) + " sample(s), how many samples do you want to create the training set?\n" )) x = integer.match(numberStartingSamples) if not x or (x and int(numberStartingSamples) > numberSamples): print "\n/!\ ERROR: You should write down an integer inferior or equal to ", numberSamples, "." raise ValueError numberStartingSamples = int(numberStartingSamples) assignedClasses, classes, valueSet = classifyIt(dataArray, metadatum, nodesList, numberStartingSamples) numberClass = len(classes) youdenJ = countYouden(assignedClasses, classes, numberStartingSamples) interpretIt(youdenJ) answer = raw_input( "Do you want to plot the classes obtained as a pie chart? Y/N") if answer == "Y": labels = [metadatum + " = " + str(value) for value in valueSet] percentagesAs = [len(class1) for class1 in assignedClasses] percentages = [len(class1) for class1 in classes] plotPieChart( labels, percentagesAs, "Assignments depending on " + str(nodesList) + " to class for metadatum " + metadatum) plotPieChart( labels, percentages, "Real classes depending on " + str(nodesList) + " for metadatum " + metadatum) elif not (answer == "N"): print "\n Answer by Y or N!" answer = raw_input("Do you want to save the results? 
Y/N") if (answer == "Y"): writeFile( "Youden's J statistic for this classification is: " + str(youdenJ) + "\n", "Assignments depending on " + listNodes(nodesList) + " to classes for metadatum " + metadatum) elif not (answer == "N"): print "\n Answer by Y or N!" return assignedClasses, youdenJ
def randomSubSamplingAct(dataArray): print dataArray[1] metadatum = sanitize(raw_input("Input the metadatum that will cluster the set of samples among those written above. [ e.g. " + dataArray[1][0] + " ]\n")).split(";")[0] isInDatabase([metadatum],dataArray[1]) s = raw_input("Input the number s of random samplings.\n") n = raw_input("Input the number n of nodes to select at each try.\n") if not integer.match(s) or not integer.match(n): print "\n/!\ ERROR: s and n must both be integers." raise ValueError numberSamples = len(dataArray[0]) numberStartingSamples = sanitize(raw_input("Knowing there is/are " + str(numberSamples) + "sample(s), how many samples do you want to create the training set? \n")) x = integer.match(numberStartingSamples) if not x or (x and int(numberStartingSamples) > numberSamples): print "\n/!\ ERROR: You should write down an integer." raise ValueError numberStartingSamples = int(numberStartingSamples) listnodes = dataArray[3].values() s,n = int(s),int(n) #Here the set of classes is a list of two lists containing the samples in C and not C bestClassification = [] bestClassesList = [] #Worse value for this coefficient currBestYouden = inf nodesNumber = len(dataArray[3]) while s: #Randomly draw n distinct nodes among the nodes in the taxonomic tree nodesList = randomChoice(listnodes,n) assignedClasses,classes,valueSet = classifyIt(dataArray,metadatum,nodesList,numberStartingSamples) numberClass = len(classes) youdenJ = countYouden(assignedClasses,classes,numberSamples) res = numberClass - youdenJ if min(res,currBestYouden) == res: bestClassification = [] for i in nodesList: bestClassification.append(i) currBestYouden = res bestClassesList = [] for i in assignedClasses: bestClassesList.append(i) s -= 1 interpretIt(numberClass - currBestYouden) if answer == "Y": percentagesAs = [ len(class1) for class1 in assignedClasses ] labels = [ metadatum + " = " + str(value) for value in valueSet ] percentages = [ len(class1) for class1 in classes ] 
plotPieChart(labels,percentagesAs,"Assignments depending on " + listNodes(nodesList) + " to class for metadatum " + metadatum) plotPieChart(labels,percentages,"Real classes depending on " + listNodes(nodesList) + " for metadatum " + metadatum) answer = raw_input("Do you want to save the results? Y/N") if (answer == "Y"): writeFile("Best Youden's J statistic for this classification is: " + str(numberClass - currBestYouden) + "\nand most relevant list of nodes for this metadatum is:" + str(bestClassification),"Assignments to classes for metadatum " + metadatum) elif not (answer == "N"): print "\n Answer by Y or N!" return bestClassification,(numberClass - currBestYouden),bestClassesList
def graphNO(graph):
    """Write the non-oriented graph of the clusters in DOT format.

    graph -- two-dimensional structure (its shape is read with np.shape) whose
             non-empty cells are (namei,namej,delta,distance) tuples.

    Only the cells above the diagonal are read. An edge labelled with the
    distance is emitted for every cell whose delta is true, unless an edge
    between the same two names, in either order, was already emitted. The
    result is handed to writeFile with the "dot" format.
    """
    #Pairs of names already linked, stored in both orders
    seen = set()
    lines = ["graph g { \n"]
    n,m = np.shape(graph)
    for i in range(n):
        for j in range(i+1,m):
            cell = graph[i][j]
            if not cell:
                continue
            namei,namej,delta,distance = cell
            if delta and (namei,namej) not in seen:
                lines.append("%s -- %s [label=%s]; \n"%(sanitizeDot(namei),sanitizeDot(namej),str(distance)))
                seen.add((namei,namej))
                #The graph is non-oriented: the reversed pair is the same edge
                seen.add((namej,namei))
    lines.append(" } \n")
    writeFile("".join(lines),"","dot")
def similarityAct(dataArray,iMatrix): print dataArray[1] metadataList = parseList(raw_input("Input the list of metadata you want to consider among those written above. [ e.g. " + dataArray[1][0] + ";" + dataArray[1][-1] + " ]\n")) isInDatabase(metadataList,dataArray[1]) print "/!\ Computing similarity matrix..." m = similarity(dataArray[0],dataArray[1],metadataList) print "[Preview.]" print m answer = raw_input("Save the results? Y/N\n") if (answer == "Y"): writeFile(m,"Similarity coefficients between patients for file meta/" + iMatrix + ".csv:\n" + listNodes(dataArray[8]),"array") elif not (answer == "N"): print "/!\ You should answer 'Y' or 'N'!" return m
def diversityAct(dataArray): sampleNameList,metadata,interval1,interval2 = createSampleNameList(dataArray) #@dataArray[5] = n is the number of nodes in the taxonomic tree coefficient,sample = computeDiversityCoefficient(dataArray[5],sampleNameList,dataArray) print "\nMicrobial Diversity coefficient is: " + str(coefficient) print "[If you have obtained -inf, it could mean the taxonomic tree is actually empty.]\n" answer = raw_input("Save the results? Y/N\n") if (answer == "Y"): data = "Microbial Diversity Results ****\n for lists " + str(sampleNameList) + "\n" if metadata: data += "selected on metadata: " + str(metadata) + "with extreme values: " + str(interval1) + " (lower bounds) and " + str(interval2) + " (upper bounds) \n" data += "\nMicrobial Diversity coefficient is: " + str(coefficient) +"\n\nEND OF FILE ****" writeFile(data,"","text") elif not (answer == "N"): print "/!\ You should answer 'Y' or 'N'!" answer = raw_input("Do you want to display the pie chart of the assignments to the taxonomic tree in the selected samples? Y/N\n") if (answer == "Y"): plotPieChart([sample1[0] for sample1 in sample],[sample1[1] for sample1 in sample],"Assignments to the taxonomic tree in " + str(sampleNameList[:3])) elif not (answer == "N"): print "/!\ You should answer 'Y' or 'N'!"
def percentageAct(dataArray): uTree = raw_input("Do you to get percentage of assignments to subtrees or to bacterias themselves? subtree/bacteria \n") usingTree = (uTree == "subtree") if not (uTree == "subtree" or uTree == "bacteria"): print "\n/!\ ERROR: You need to answer 'bacteria' or 'subtree'." raise ValueError nodesGroup = parseListNode(raw_input("Input the list of nodes/roots of subtrees you want to consider. [ Please look at the taxonomic tree file to help you: e.g. " + sanitizeNode(dataArray[6][-3]) + ";" + sanitizeNode(dataArray[6][1]) + ";" + sanitizeNode(dataArray[6][-1]) + ". ]\n")) isInDatabase(nodesGroup,dataArray[6]) sampleNameList,metadataList,interval1List,interval2List = createSampleNameList(dataArray,True) result = percentageAssign(dataArray[0],dataArray[1],sampleNameList,dataArray[7],nodesGroup,dataArray[2],dataArray[3],usingTree) print "\n[Preview.]" print result l = len(result) data = np.zeros(l) for i in range(l): data[i] = result[i] print "" answer = raw_input("Save the results? Y/N\n") if (answer == "Y"): writeFile(data,"Percentage of assignments ****\nin the group of nodes: " + listNodes(nodesGroup) + listSampleInvolved(metadataList,interval1List,interval2List,sampleNameList),"array") elif not (answer == "N"): print "/!\ You should answer 'Y' or 'N'!" return result,nodesGroup,sampleNameList,metadataList
def pearsonAct(dataArray): xNArray,yArray,typeInput,valueInput1,valueInput2 = creatingArray(dataArray,True) if typeInput == "BB": xArray = xNArray pearson = samplePearson(xArray,yArray) #typeInput = "BM" else: #xNArray is a list of list of (sampleID,number of assignments in this sample) pairs #We need thus to sum the numbers of assignments for a same value of the metadata xArray = [] for ls in xNArray: s = 0 for pair in ls: s += pair[1] xArray.append((ls[0][0],s)) pearson = samplePearson(xArray,yArray) print "\nPearson coefficient is: " + str(pearson) + "\n" answer = raw_input("Save the results? Y/N\n") if (answer == "Y"): data = "The (" + typeInput + ") Pearson coefficient ****\nfor values: \n\n" + str([ x[1] for x in xArray]) + "\ncorresponding to " + str(valueInput1) + "\n\n and\n\n" + str([ y[1] for y in yArray ]) + "\ncorresponding to " + str(valueInput2) + "\n\n is : " + str(pearson) + "\n\nEND OF FILE ****" writeFile(data,"","text") elif not (answer == "N"): print "/!\ You should answer 'Y' or 'N'!" answer = raw_input("Plot the corresponding graph? Y/N\n") if (answer == "Y"): cleanedxArray = [ x[1] for x in xArray ] cleanedyArray = [ y[1] for y in yArray ] maxix,minix = getMaxMin(cleanedxArray) maxiy,miniy = getMaxMin(cleanedyArray) #It is more interesting to generate a histogram in these cases if len(cleanedxArray) < 4: plotHist(cleanedyArray,str(valueInput2[:3]) + "...",str(valueInput1[:3]) + "...",maxiy+1,miniy-1,maxix-1,minix+1,"Plotting (" + typeInput + ") Pearson coefficient and the graph of both sets of values") else: plotPearsonGraph(cleanedxArray,cleanedyArray,pearson,str(valueInput1[:3]) + "...",str(valueInput2[:3]) + "...",maxix+1,minix-1,maxiy+1,miniy-1,"Plotting (" + typeInput + ") Pearson coefficient and the graph of both sets of values") elif not (answer == "N"): print "/!\ You should answer 'Y' or 'N'!"
def clusteringAct(dataArray):
    """Cluster the samples with two successive K-Means passes and score the result.

    The user chooses a metadatum; the samples are partitioned by value of this
    metadatum, and the first sample of each part is used as the starting
    sample of a cluster. A first clustering is run with dataArray[8], the
    clusters are cleaned (cleanClusters), then a second clustering is run with
    dataArray[9]. The clusters obtained are printed and compared, pair by
    pair, to the partition by metadatum value. The user may then save the
    clusters (optionally with their sets of common nodes) and export the graph
    of the clusters in DOT format.

    dataArray -- project data container; dataArray[0] and dataArray[1] are
                 forwarded to partitionSampleByMetadatumValue, dataArray[3] is
                 the collection the clustering starts from, dataArray[8] and
                 dataArray[9] are forwarded to kMeans.
    Returns nothing.
    Raises ValueError when the numbers of clusters or of clustered samples are
    inconsistent.
    """
    print dataArray[1]
    #Only the first item of a ";"-separated answer is kept
    metadatum = sanitize(
        raw_input(
            "Select the metadatum among those above to cluster the set of samples. [e.g. "
            + dataArray[1][0] + "]\n")).split(";")[0]
    isInDatabase([metadatum], dataArray[1])
    valueSet, clusters1 = partitionSampleByMetadatumValue(
        metadatum, dataArray[1], dataArray[0])
    #Keeps only the first field of every sample in each part
    clusters = [[sample[0] for sample in cluster] for cluster in clusters1]
    #that is, k in K-means Algorithm
    numberClass = len(valueSet)
    print "/!\ Number of classes:", numberClass, "."
    startSet = [cluster[0] for cluster in clusters]
    #Selects the starting samples of each cluster
    kClusters = [[start] for start in startSet]
    if not (len(clusters) == numberClass):
        print "\n/!\ ERROR: Different lengths: numberClass", numberClass, "clusters:", len(
            clusters), "."
        raise ValueError
    trimmedList = trimList(dataArray[3], startSet)
    print "/!\ Clustering with the first distance..."
    #@distanceInClusters is a list of lists of (sample,sum of all distances from this sample to others samples in the same cluster)
    #@dataArray[8] = distMatchedDict
    kClusters, meanSamples, distanceDict, distanceInClusters = kMeans(
        trimmedList, numberClass, kClusters, startSet, dataArray[8], dataArray)
    print "-- End of first clustering --"
    #Counts the samples clustered: all of them are expected at this stage
    number = 0
    for cluster in kClusters:
        for _ in cluster:
            number += 1
    if not (number == len(dataArray[3])):
        print "\n/!\ ERROR: A bug occurred during the clustering:", number, "=/=", len(
            dataArray[3]), "."
        raise ValueError
    #Deletes samples in cluster that are too far from the others
    kClusters, untaken = cleanClusters(kClusters, distanceInClusters)
    startSet = [cluster[0] for cluster in clusters]
    #Remove from untaken the starting samples
    untaken2 = []
    for x in untaken:
        if not (x in startSet):
            untaken2.append(x)
    untaken = untaken2
    #Remove the samples in untaken from the total set of samples
    sampleSet = []
    for cluster in kClusters:
        for x in cluster:
            if not (x in sampleSet):
                sampleSet.append(x)
    #The starting samples are always part of the second clustering
    for x in startSet:
        if not (x in sampleSet):
            sampleSet.append(x)
    trimmedList = trimList(sampleSet, startSet)
    print "/!\ Clustering with the second distance..."
    #@distanceDict is the distance dictionary (key=(sample1,sample2),value=distance between sample1 and sample2)
    #@dataArray[9] = distConsensusDict
    kClusters, meanSamples, distanceDict, _ = kMeans(trimmedList, numberClass,
                                                     kClusters, startSet,
                                                     dataArray[9], dataArray)
    #,meanSamples)
    print "-- End of second clustering --"
    #Some samples may have been discarded: only an upper bound is checked
    number = 0
    for cluster in kClusters:
        for _ in cluster:
            number += 1
    if not (number <= len(dataArray[3])):
        print "\n/!\ ERROR: An error occurred during the clustering:", number, ">", len(
            dataArray[3]), "."
        raise ValueError
    print "Printing the", numberClass, "clusters:"
    i = 1
    #@kClusters contains the list of the k clusters. Each cluster is a list of sample IDs
    for cluster in kClusters:
        print "\n-- Cluster #", i, "associated to", metadatum, "=", valueSet[
            i - 1], ":"
        print "Size:", len(cluster)
        print sorted(cluster)
        i += 1
    print "\nScore of the clustering (comprised between 0 and 1):"
    print "The more it is close to 1, the more the clustering is relevant."
    #The clustering obtained with the K-Means method
    kClustersCopy = [cluster for cluster in kClusters]
    #The clustering obtained by comparing the values of the metadatum
    clustersCopy = [cluster for cluster in clusters]
    #Score by using first method of comparison
    compareClusterScore = 0
    if not (len(kClustersCopy) == numberClass == len(clustersCopy)):
        print "\n/!\ ERROR: Length error in clustering:", numberClass, len(
            kClustersCopy), len(clustersCopy), "."
        raise ValueError
    #Both copies are consumed from their last cluster to their first one
    while kClustersCopy and clustersCopy:
        cl1 = kClustersCopy.pop()
        cl2 = clustersCopy.pop()
        #clusters are non-empty
        x = compareCluster(cl1, cl2, untaken)
        #NOTE(review): any false value, including a score of 0, discards the
        #whole score -- confirm this is what compareCluster expects
        if x:
            compareClusterScore += x
        else:
            compareClusterScore = None
            break
    if compareClusterScore:
        #NOTE(review): under Python 2 this is an integer division when the
        #score is an integer -- presumably compareCluster returns floats
        compareClusterScore = compareClusterScore / numberClass
        printClusterScore = compareClusterScore
    else:
        printClusterScore = "None"
    #Score by using second method of comparison
    #compareCentersScore = compareCenters(meanSamples,distanceDict,numberClass)
    print "Compare clusters score is:", printClusterScore, "."
    #print "Compare centers score is:",compareCentersScore,"."
    answer = raw_input("Do you want to save the results? Y/N\n")
    if (answer == "Y"):
        answer2 = raw_input(
            "Do you want to compute the sets of common nodes for each cluster? [It can be considered relevant when the score of comparing clusters is at least over 0.5] Y/N\n"
        )
        if (answer2 == "Y"):
            commonList = extractCommonNodes(kClusters, dataArray)
        elif not (answer2 == "N"):
            print "\n/!\ You should answer Y or N, not:", answer2, "."
        data = "**** CLUSTERS FOR METADATUM " + metadatum + " WITH VALUES: " + str(
            valueSet)
        i = 0
        for cluster in kClusters:
            data += "\n\n-- Cluster #" + str(
                i + 1) + " associated to " + metadatum + " = " + str(
                    valueSet[i])
            data += "\nSize: " + str(len(cluster))
            #commonList only exists when the common nodes were requested
            if (answer2 == "Y"):
                data += "\nSet of common nodes: " + str(commonList[i])
            data += "\n" + str(cluster)
            i += 1
        data += "\n\nCompare clusters score is: " + str(compareClusterScore)
        #data += "\n\nCompare centers score is: " + str(compareCentersScore)
        data += "\n\nEND OF FILE ****"
        print "\n/!\ Saving clusters..."
        writeFile(data)
        #The graph is only offered once the clusters have been saved
        answer2 = raw_input(
            "Do you want to compute the graph of the clusters? Y/N\n")
        if (answer2 == "Y"):
            print "\n/!\ Constructing the graph of the clusters..."
            #@dataArray[3] = filenames
            graph = convertClustersIntoGraph(kClusters, distanceDict,
                                             len(dataArray[3]))
            graphNO(graph)
            print "\n/!\ Done. The graph is in DOT format in \"files\" folder."
        elif not (answer2 == "N"):
            print "\n/!\ You should answer Y or N, not:", answer2, "."
    elif not (answer == "N"):
        print "/!\ You should answer by Y or N."
def randomSubSamplingAct(dataArray): print dataArray[1] metadata = parseList(raw_input("Input the metadata that will cluster the set of samples among those written above. [ e.g. " + dataArray[1][0] + ";" + dataArray[1][-1] + " ]\n")) isInDatabase(metadata,dataArray[1]) s = raw_input("Input the number s of random samplings.\n") n = raw_input("Input the number n of nodes to select at each try.\n") numberofSamples = len(dataArray[0]) if not integer.match(s) or not integer.match(n): print "\n/!\ ERROR: s and n must both be integers." raise ValueError s,n = int(s),int(n) numberStartingSamples = sanitize(raw_input("Knowing there is/are " + str(numberofSamples) + " sample(s), how many samples do you want to create the training set?\n")) x = integer.match(numberStartingSamples) if not x or (x and (int(numberStartingSamples) > numberofSamples)): print "\n/!\ ERROR: You should write down an integer less or equal to",numberofSamples,"." raise ValueError numberStartingSamples = int(numberStartingSamples) #Here the set of classes is a list of two lists containing the samples in C and not C bestClassification = [] bestClassesList = [] bestShape = [] bestValuesList = [] #Worse value for this coefficient currBestYouden = inf nodesNumber = len(dataArray[3]) #@dataArray[2] = idSequences, which is a dictionary of (key=identifier,values=(name,rank of node)) listofNodes = dataArray[2].values() while s: #Randomly draw n distinct nodes among the nodes in the taxonomic tree nodesList = randomChoice(listofNodes,n) assignedClasses,classes,valueSets = classifyIt(dataArray,metadata,nodesList,numberStartingSamples) numberClass = classes.lenMDL(shape) #len(dataArray[0])? 
youdenJ = countYouden(assignedClasses,classes,numberofSamples) res = numberClass - youdenJ if min(res,currBestYouden) == res: bestValuesList = [] for i in valueSets: bestValuesList.append(i) bestClassification = [] for i in nodesList: bestClassification.append(i) bestShape = [] for i in shape: bestShape.append(i) currBestYouden = res bestClassesList = [] for i in assignedClasses: bestClassesList.append(i) s -= 1 interpretIt(numberClass - currBestYouden) if answer == "Y": labels = [ "Metadata: " + str(metadata) + ", Values for each metadatum: " + str([ valueSet for valueSet in valueSets]) ] percentagesAs = assignedClasses.mapMDL(len) percentages = classes.mapMDL(len) plotPie(labels,percentagesAs,"Assignments depending on " + str(nodesList) + " to class for metadata " + str(metadata)) plotPie(labels,percentages,"Real classes depending on " + str(nodesList) + " for metadata " + str(metadata)) answer = raw_input("Do you want to save the results? Y/N \n") if (answer == "Y"): writeFile("Best Youden's J statistic for this classification is: " + str(numberClass - currBestYouden) + "\nand most relevant list of nodes for this set of metadata is:" + str(bestClassification),"Assignments to classes for metadata " + str(metadata)) elif not (answer == "N"): print "\n Answer by Y or N!" return bestClassification,(numberClass - currBestYouden),bestClassesList
def randomSubSamplingAct(dataArray): print dataArray[1] metadatum = sanitize( raw_input( "Input the metadatum that will cluster the set of samples among those written above. [ e.g. " + dataArray[1][0] + " ]\n")).split(";")[0] isInDatabase([metadatum], dataArray[1]) s = raw_input("Input the number s of random samplings.\n") n = raw_input("Input the number n of nodes to select at each try.\n") if not integer.match(s) or not integer.match(n): print "\n/!\ ERROR: s and n must both be integers." raise ValueError numberSamples = len(dataArray[0]) numberStartingSamples = sanitize( raw_input( "Knowing there is/are " + str(numberSamples) + "sample(s), how many samples do you want to create the training set? \n" )) x = integer.match(numberStartingSamples) if not x or (x and int(numberStartingSamples) > numberSamples): print "\n/!\ ERROR: You should write down an integer." raise ValueError numberStartingSamples = int(numberStartingSamples) listnodes = dataArray[3].values() s, n = int(s), int(n) #Here the set of classes is a list of two lists containing the samples in C and not C bestClassification = [] bestClassesList = [] #Worse value for this coefficient currBestYouden = inf nodesNumber = len(dataArray[3]) while s: #Randomly draw n distinct nodes among the nodes in the taxonomic tree nodesList = randomChoice(listnodes, n) assignedClasses, classes, valueSet = classifyIt( dataArray, metadatum, nodesList, numberStartingSamples) numberClass = len(classes) youdenJ = countYouden(assignedClasses, classes, numberSamples) res = numberClass - youdenJ if min(res, currBestYouden) == res: bestClassification = [] for i in nodesList: bestClassification.append(i) currBestYouden = res bestClassesList = [] for i in assignedClasses: bestClassesList.append(i) s -= 1 interpretIt(numberClass - currBestYouden) if answer == "Y": percentagesAs = [len(class1) for class1 in assignedClasses] labels = [metadatum + " = " + str(value) for value in valueSet] percentages = [len(class1) for class1 in 
classes] plotPieChart( labels, percentagesAs, "Assignments depending on " + listNodes(nodesList) + " to class for metadatum " + metadatum) plotPieChart( labels, percentages, "Real classes depending on " + listNodes(nodesList) + " for metadatum " + metadatum) answer = raw_input("Do you want to save the results? Y/N") if (answer == "Y"): writeFile( "Best Youden's J statistic for this classification is: " + str(numberClass - currBestYouden) + "\nand most relevant list of nodes for this metadatum is:" + str(bestClassification), "Assignments to classes for metadatum " + metadatum) elif not (answer == "N"): print "\n Answer by Y or N!" return bestClassification, (numberClass - currBestYouden), bestClassesList
def clusteringAct(dataArray): print dataArray[1] metadatum = sanitize(raw_input("Select the metadatum among those above to cluster the set of samples. [e.g. " + dataArray[1][0] + "]\n")).split(";")[0] isInDatabase([metadatum],dataArray[1]) valueSet,clusters1 = partitionSampleByMetadatumValue(metadatum,dataArray[1],dataArray[0]) clusters = [[sample[0] for sample in cluster] for cluster in clusters1] #that is, k in K-means Algorithm numberClass = len(valueSet) print "/!\ Number of classes:",numberClass,"." startSet = [cluster[0] for cluster in clusters] #Selects the starting samples of each cluster kClusters = [[start] for start in startSet] if not (len(clusters) == numberClass): print "\n/!\ ERROR: Different lengths: numberClass",numberClass,"clusters:",len(clusters),"." raise ValueError trimmedList = trimList(dataArray[3],startSet) print "/!\ Clustering with the first distance..." #@distanceInClusters is a list of lists of (sample,sum of all distances from this sample to others samples in the same cluster) #@dataArray[8] = distMatchedDict kClusters,meanSamples,distanceDict,distanceInClusters = kMeans(trimmedList,numberClass,kClusters,startSet,dataArray[8],dataArray) print "-- End of first clustering --" number = 0 for cluster in kClusters: for _ in cluster: number += 1 if not (number == len(dataArray[3])): print "\n/!\ ERROR: A bug occurred during the clustering:",number,"=/=",len(dataArray[3]),"." 
raise ValueError #Deletes samples in cluster that are too far from the others kClusters,untaken = cleanClusters(kClusters,distanceInClusters) startSet = [cluster[0] for cluster in clusters] #Remove from untaken the starting samples untaken2 = [] for x in untaken: if not (x in startSet): untaken2.append(x) untaken = untaken2 #Remove the samples in untaken from the total set of samples sampleSet = [] for cluster in kClusters: for x in cluster: if not (x in sampleSet): sampleSet.append(x) for x in startSet: if not (x in sampleSet): sampleSet.append(x) trimmedList = trimList(sampleSet,startSet) print "/!\ Clustering with the second distance..." #@distanceDict is the distance dictionary (key=(sample1,sample2),value=distance between sample1 and sample2) #@dataArray[9] = distConsensusDict kClusters,meanSamples,distanceDict,_ = kMeans(trimmedList,numberClass,kClusters,startSet,dataArray[9],dataArray)#,meanSamples) print "-- End of second clustering --" number = 0 for cluster in kClusters: for _ in cluster: number += 1 if not (number <= len(dataArray[3])): print "\n/!\ ERROR: An error occurred during the clustering:",number,">",len(dataArray[3]),"." raise ValueError print "Printing the",numberClass,"clusters:" i = 1 #@kClusters contains the list of the k clusters. Each cluster is a list of sample IDs for cluster in kClusters: print "\n-- Cluster #",i,"associated to",metadatum,"=",valueSet[i-1],":" print "Size:",len(cluster) print sorted(cluster) i += 1 print "\nScore of the clustering (comprised between 0 and 1):" print "The more it is close to 1, the more the clustering is relevant." 
#The clustering obtained with the K-Means method kClustersCopy = [cluster for cluster in kClusters] #The clustering obtained by comparing the values of the metadatum clustersCopy = [cluster for cluster in clusters] #Score by using first method of comparison compareClusterScore = 0 if not (len(kClustersCopy) == numberClass == len(clustersCopy)): print "\n/!\ ERROR: Length error in clustering:",numberClass,len(kClustersCopy),len(clustersCopy),"." raise ValueError while kClustersCopy and clustersCopy: cl1 = kClustersCopy.pop() cl2 = clustersCopy.pop() #clusters are non-empty x = compareCluster(cl1,cl2,untaken) if x: compareClusterScore += x else: compareClusterScore = None break if compareClusterScore: compareClusterScore = compareClusterScore/numberClass printClusterScore = compareClusterScore else: printClusterScore = "None" #Score by using second method of comparison #compareCentersScore = compareCenters(meanSamples,distanceDict,numberClass) print "Compare clusters score is:",printClusterScore,"." #print "Compare centers score is:",compareCentersScore,"." answer = raw_input("Do you want to save the results? Y/N\n") if (answer == "Y"): answer2 = raw_input("Do you want to compute the sets of common nodes for each cluster? [It can be considered relevant when the score of comparing clusters is at least over 0.5] Y/N\n") if (answer2 == "Y"): commonList = extractCommonNodes(kClusters,dataArray) elif not (answer2 == "N"): print "\n/!\ You should answer Y or N, not:",answer2,"." 
data = "**** CLUSTERS FOR METADATUM " + metadatum + " WITH VALUES: " + str(valueSet) i = 0 for cluster in kClusters: data += "\n\n-- Cluster #" + str(i+1) + " associated to " + metadatum + " = " + str(valueSet[i]) data += "\nSize: " + str(len(cluster)) if (answer2 == "Y"): data += "\nSet of common nodes: " + str(commonList[i]) data += "\n" + str(cluster) i += 1 data += "\n\nCompare clusters score is: " + str(compareClusterScore) #data += "\n\nCompare centers score is: " + str(compareCentersScore) data += "\n\nEND OF FILE ****" print "\n/!\ Saving clusters..." writeFile(data) answer2 = raw_input("Do you want to compute the graph of the clusters? Y/N\n") if (answer2 == "Y"): print "\n/!\ Constructing the graph of the clusters..." #@dataArray[3] = filenames graph = convertClustersIntoGraph(kClusters,distanceDict,len(dataArray[3])) graphNO(graph) print "\n/!\ Done. The graph is in DOT format in \"files\" folder." elif not (answer2 == "N"): print "\n/!\ You should answer Y or N, not:",answer2,"." elif not (answer == "N"): print "/!\ You should answer by Y or N."
def patternRatioAct(dataArray): print "First list of samples." sampleNameList1,metadata1,interval11,interval21 = createSampleNameList(dataArray) print "Second list of samples." sampleNameList2,metadata2,interval12,interval22 = createSampleNameList(dataArray) commonPatternsList = enumerateCommonPatterns(dataArray[7],sampleNameList1,sampleNameList2) specificPatternsList1 = enumerateSpecificPatterns(dataArray[7],sampleNameList1,sampleNameList2) specificPatternsList2 = enumerateSpecificPatterns(dataArray[7],sampleNameList2,sampleNameList1) pRatio = patternRatio(commonPatternsList,specificPatternsList1,specificPatternsList2) #Only printing patterns of length > 1 print "\n--- Total number of common patterns: ",len(commonPatternsList) print "--- Common patterns of length > 1 ---" if commonPatternsList: for x in commonPatternsList: if len(x[0]) > 1: print x[0] else: print "No pattern of length > 1." print "\n--- Total number of specific patterns in",sampleNameList1 if metadata1: print "selected on metadata: ",str(metadata1),"with lower and upper bounds being",str(interval11),"and",str(interval21),":" print len(specificPatternsList1) print "--- Specific patterns of length > 1 in",sampleNameList1,"---" if specificPatternsList1: for x in specificPatternsList1: if len(x[0]) > 1: print x[0] else: print "No pattern of length > 1." print "\n--- Total number of specific patterns in",sampleNameList2 if metadata2: print "selected on metadata: ",str(metadata2),"with lower and upper bounds being",str(interval12),"and",str(interval22),":" print len(specificPatternsList2) print "--- Specific patterns of length > 1 in",sampleNameList2,"---" if specificPatternsList2: for x in specificPatternsList2: if len(x[0]) > 1: print x[0] else: print "No pattern of length > 1." print "\nPattern Ratio is: ",pRatio,"\n" print "[ If pattern ratio is superior to one, it means the two groups of samples are quite alike. 
Please read README ]" print "[ If you obtained +inf, if there are common patterns (of length 1 or superior to 1), it could mean both groups of samples contain exactly the same set of nodes. If there is no common pattern, it could mean there is no sample in both groups ]\n" answer = raw_input("Save the results? Y/N\n") if (answer == "Y"): data = "Pattern Ratio Results ****\nfor lists of samples " + str(sampleNameList1) + "\n" if metadata1: data += "selected on metadata: " + str(metadata1) + " with lower and upper bounds being " + str(interval11) + " and " + str(interval21) + "\n" data += "\nand " + str(sampleNameList2) + "\n" if metadata2: data += "selected on metadata: " + str(metadata2) + " with lower and upper bounds being " + str(interval12) + " and " + str(interval22) + "\n" data += "\n-> Pattern Ratio is: " + str(pRatio) + "\n\nPrinting patterns: first is the list of nodes in the pattern, then the total number of assignations in this pattern and eventually the total number of nodes in the pattern\n\n-> Common Patterns:\n" for x in commonPatternsList: data += str(x) + "\n" data += "\n-> Specific patterns to " + str(sampleNameList1) + ":\n" for x in specificPatternsList1: data += str(x) + "\n" data += "\n-> Specific patterns to " + str(sampleNameList2) + ":\n" for x in specificPatternsList2: data += str(x) + "\n" data += "\nEND OF FILE ****" writeFile(data,"","text") elif not (answer == "N"): print "/!\ You should answer 'Y' or 'N'!"