Example #1
0
def totalDiffRatioAct(dataArray):
    print "First list of samples."
    sampleNameList1,metadataList1,interval1List1,interval2List1 = createSampleNameList(dataArray)
    print "Second list of samples."
    sampleNameList2,metadataList2,interval1List2,interval2List2 = createSampleNameList(dataArray)
    common,in1,in2,numberA1,numberA2,_,_,_ = compute(dataArray[7],sampleNameList1,sampleNameList2)
    commonA = countAssignmentsInCommon(common,sampleNameList1,sampleNameList2)
    tratio = totalRatio(commonA,numberA1,numberA2)
    ntRatio = totalRatioNormalized(commonA,numberA1,numberA2)
    dratio = diffRatio(commonA)
    ndRatio = diffRatioNormalized(commonA,numberA1,numberA2)
    print "\nTotal Ratio Distance is: " + str(tratio)
    print "normalized Total Ratio is: " + str(ntRatio) + "\n[The more it is close to 1, the more the two groups are alike]\n"
    print "Diff Ratio Distance is: " + str(dratio)
    print "normalized Diff Ratio is: " + str(ndRatio) + "\n[The more it is close to 0, the more the two groups are alike]\n"
    print "[If you have obtained +inf (resp. -inf), it could mean you have selected no sample.]\n"
    answer = raw_input("Save the results? Y/N\n")
    if (answer == "Y"):
        data = "Total Ratio Results ****\n for " + str(sampleNameList1) + "\n"
        if metadataList1:
            data += "selected on metadata: " + str(metadataList1) + " with extremum values: " + str(interval1List1) + " (lower bounds) and " + str(interval2List1) + " (upper bounds) \n"
        data += " and " + str(sampleNameList2) + "\n"
        if metadataList2:
            data += "selected on metadata: " + str(metadataList2) + " with extremum values: " + str(interval1List2) + " (lower bounds) and " + str(interval2List2) + " (upper bounds) \n"
        data += "\nTotal Ratio Distance is: " + str(tratio) + "\n normalized Total Ratio is: " + str(ntRatio) + "\nDiff Ratio Distance is: " + str(dratio) + "\n normalized Diff Ratio is: " + str(ndRatio) +"\n\nEND OF FILE ****"  
        writeFile(data,"","text")
    elif not (answer == "N"):
        print "/!\ You should answer 'Y' or 'N'!"
Example #2
0
def userNodeSelectionAct(dataArray):
    print dataArray[1]
    metadata = parseList(raw_input("Input the metadata that will cluster the set of samples among those written above. [ e.g. " + dataArray[1][0] + ";" + dataArray[1][-1] + " ]\n"))
    isInDatabase(metadata,dataArray[1])
    #@dataArray[2] = idSequences is a dictionary of (key=identifier,value=(name,rank of node))
    listofNodes = dataArray[2].values()
    nodesList = parseListNode(raw_input("Choose the group of nodes you want to consider exclusively. [ Read the taxonomic tree to help you: e.g. " + sanitizeNode(listofNodes[-3]) + ";" + sanitizeNode(listofNodes[1]) + ";" + sanitizeNode(listofNodes[-1]) + " ]\n"))
    isInDatabase(nodesList,listofNodes)
    numberofSamples = len(dataArray[0])
    numberStartingSamples = sanitize(raw_input("Knowing there is/are " + str(numberofSamples) + "sample(s), how many samples do you want to create the training set? \n"))
    x = integer.match(numberStartingSamples)
    if not x or (x and (int(numberStartingSamples) > numberofSamples)):
        print "\n/!\ ERROR: You should write down an integer less or equal to",numberofSamples,"."
        raise ValueError
    numberStartingSamples = int(numberStartingSamples)
    #@shape for @assignedClasses is the same than the one for @classes
    assignedClasses,classes,valueSets = classifyIt(dataArray,metadata,nodesList,numberStartingSamples)
    numberClass = classes.lenMDL()
    youdenJ = countYouden(assignedClasses,classes,numberofSamples)
    interpretIt(youdenJ)
    answer = raw_input("Do you want to plot the classes obtained as a pie chart? Y/N\n")
    if answer == "Y":
        labels = [ "Metadata: " + str(metadata) + ", Values for each metadatum: " + str([ valueSet for valueSet in valueSets]) ]
        percentagesAs = assignedClasses.mapMDL(len)
        percentages = classes.mapMDL(len)
        plotPie(labels,percentagesAs,"Assignments depending on " + str(nodesList) + " to class for metadata " + str(metadata))
        plotPie(labels,percentages,"Real classes depending on " + str(nodesList) + " for metadata " + str(metadata))
    elif not (answer == "N"):
        print "\n Answer by Y or N!"
    answer = raw_input("Do you want to save the results? Y/N \n")
    if (answer == "Y"):
        writeFile("Youden's J statistic for this classification is: " + str(youdenJ) + "\n","Assignments depending on " + listNodes(nodesList) + " to classes for metadata " + str(metadata))
    elif not (answer == "N"):
        print "\n Answer by Y or N!"
    return assignedClasses,youdenJ
def userNodeSelectionAct(dataArray):
    print dataArray[1]
    metadatum = sanitize(raw_input("Input the metadatum that will cluster the set of samples among those written above. [ e.g. " + dataArray[1][0] + " ]\n")).split(";")[0]
    isInDatabase([metadatum],dataArray[1])
    listnodes = dataArray[3].values()
    nodesList = parseListNode(raw_input("Choose the group of nodes you want to consider exclusively. [ Read the taxonomic tree to help you: e.g. " + sanitizeNode(listnodes[-3]) + ";" + sanitizeNode(listnodes[1]) + ";" + sanitizeNode(listnodes[-1]) + " ]\n"))
    isInDatabase(nodesList,listnodes)
    numberSamples = len(dataArray[0])
    numberStartingSamples = sanitize(raw_input("Knowing there is/are " + str(numberSamples) + " sample(s), how many samples do you want to create the training set?\n"))
    x = integer.match(numberStartingSamples)
    if not x or (x and int(numberStartingSamples) > numberSamples) :
        print "\n/!\ ERROR: You should write down an integer inferior or equal to ",numberSamples,"."
        raise ValueError
    numberStartingSamples = int(numberStartingSamples)
    assignedClasses,classes,valueSet = classifyIt(dataArray,metadatum,nodesList,numberStartingSamples)
    numberClass = len(classes)
    youdenJ = countYouden(assignedClasses,classes,numberStartingSamples)
    interpretIt(youdenJ)
    answer = raw_input("Do you want to plot the classes obtained as a pie chart? Y/N")
    if answer == "Y":
        labels = [ metadatum + " = " + str(value) for value in valueSet ]
        percentagesAs = [ len(class1) for class1 in assignedClasses ]
        percentages = [ len(class1) for class1 in classes ]
        plotPieChart(labels,percentagesAs,"Assignments depending on " + str(nodesList) + " to class for metadatum " + metadatum)
        plotPieChart(labels,percentages,"Real classes depending on " + str(nodesList) + " for metadatum " + metadatum)
    elif not (answer == "N"):
        print "\n Answer by Y or N!"
    answer = raw_input("Do you want to save the results? Y/N")
    if (answer == "Y"):
        writeFile("Youden's J statistic for this classification is: " + str(youdenJ) + "\n","Assignments depending on " + listNodes(nodesList) + " to classes for metadatum " + metadatum)
    elif not (answer == "N"):
        print "\n Answer by Y or N!"
    return assignedClasses,youdenJ
Example #4
0
def getValueBacteriaMetadata(samplesInfoList,infoList,bacterias,sampleIDList,samplesOccList,speciesList,metadatum):
    xArray = []
    #Stores the positions of number of assignments to each bacteria of the group in the occurrences matrix
    bacteriaPos = getPositionBacteria(bacterias,speciesList)
    valueSet,valueSampleMetadatum = partitionSampleByMetadatumValue(metadatum,infoList,samplesInfoList)
    #Integer values of metadatum are sorted
    yArray = sorted(valueSet,key=lambda x:x[1])
    #For every different value of the metadatum
    for sampleValueList in valueSampleMetadatum:
        #gets the number of assignments to bacterias which positions are in bacteriaPos depending on the value of the metadatum (any sample in sampleValueList has the same value of the metadatum)
        res = getValueBacteria(samplesOccList,speciesList,sampleIDList,bacteriaPos,sampleValueList,True)
        if res:
            xArray.append(res)
        else:
            for sample in sampleValueList:
                xArray.append((sample,0))
    print "\n[Preview.]"
    print "\n--- Number of assignments to the group of bacterias",bacterias,"in samples depending on the",len(xArray),"value(s) of metadatum",metadatum
    print xArray
    print "\n--- Set of values of metadatum",metadatum,"of length",len(yArray)
    string = ""
    for x in yArray[:-1]:
        string += str(x[1]) + ", "
    print (string + str(yArray[-1][1]))
    answer = raw_input("\nWrite both Bacteria and Metadatum files? Y/N\n")
    if (answer == "Y"):
        print "Saving first file"
        writeFile(xArray,"**** Values of assignments in samples samples depending on the value of metadatum" + str(metadatum) + "of nodes: " + str(bacterias) + "\n\n","array")
        print "Saving second file"
        writeFile(yArray,"**** Values of metadatum: " + str(metadatum) + "\n\n","array")
    elif not (answer == "N"):
        print "/!\ You should answer 'Y' or 'N'!"
    return xArray,yArray
Example #5
0
def getValueBacteriaBacteria(samplesOccList,speciesList,sampleIDList,bacterias1,bacterias2):
    xArray,yArray = [],[]
    #Stores the positions of number of assignments to each bacteria of the group in the occurrences matrix
    bacteriaPos1 = getPositionBacteria(bacterias1,speciesList)
    bacteriaPos2 = getPositionBacteria(bacterias2,speciesList)
    for sample in sampleIDList:
        #gets the number of assignments to the group of bacterias in the sample
        res = getValueBacteria(samplesOccList,speciesList,sampleIDList,bacteriaPos1,[sample])
        #Due to the fact some samples are in info matrix and not in occurrence matrix, getValueBacteria may return an empty list
        if res:
            xArray += res
        else:
            xArray.append((sample,0))
        res = getValueBacteria(samplesOccList,speciesList,sampleIDList,bacteriaPos2,[sample])
        if res:
            #Contains only one element, since getValueBacteria was applied to only one element
            yArray += res
        else:
            yArray.append((sample,0))
    print "\n[Preview.]"
    print "\n--- Number of assignments to the group of bacterias",bacterias1,"in all",len(xArray),"samples"
    print xArray
    print "\n--- Number of assignments to the group of bacterias",bacterias2,"in all",len(xArray),"samples"
    print yArray
    answer = raw_input("\nWrite both Bacteria files? Y/N\n")
    if (answer == "Y"):
        print "Saving first file..."
        writeFile(xArray,"**** Values of assignments in all samples of nodes: " + str(bacterias1) + "\n\n","array")
        print "Saving second file..."
        writeFile(yArray,"**** Values of assignments in all samples of nodes: " + str(bacterias2) + "\n\n","array")
    elif not (answer == "N"):
        print "/!\ You should answer 'Y' or 'N'!"
    return xArray,yArray
Example #6
0
def distanceAct(dataArray):
    answer = raw_input("Import matrix? Y/N\n")
    if answer == "Y":
        filename = raw_input("Write down the file name where the matrix is stored [ without the extension .taxotree ].\n")
        matrix = importMatrix(filename)
    else:
        if not (answer == "N"):
            print "/!\ You should answer 'Y' or 'N'!"
        print "/!\ Computing similarity matrix..."
        print "[ You may have to wait for a few minutes... ]"
        matrix = computeSimilarity(dataArray)
        print "[Preview.]"
        print matrix
        answer = raw_input("Save the results? Y/N\n")
        if (answer == "Y"):  
            writeFile(m,"Similarity coefficients between patients using previous calculi on total ratio, pattern ratio and diversity coefficient\n\nNota Bene: 1e+14 stands for +inf\n","array")
        elif not (answer == "N"):
            print "\n/!\ You should answer 'Y' or 'N'!"
    answer = raw_input("Compute the most different groups of samples? Y/N\n")
    if (answer == "Y"):
        answer = raw_input("Do you want to select samples by metadata or to select all samples? metadata/all")
        if (answer == "metadata"):
            print dataArray[1]
            metadatum = parseList(raw_input("Choose the metadatum among those printed above [ e.g. " + dataArray[1][0] + ";" + dataArray[1][-1] + " ]\n"))
            isInDatabase(metadatum,dataArray[1])
            _,valueSampleMetadatum = partitionSampleByMetadatumValue(metadatum[0],dataArray[1],dataArray[0])
            valueSampleMetadatumNameOnly = []
            for sampleGroup in valueSampleMetadatum:
                sampleGroupNameOnly = []
                for sample in sampleGroup:
                    sampleGroupNameOnly.append(sample[0])
                valueSampleMetadatumNameOnly.append(sampleGroupNameOnly)
            pairsList = mostDifferentSamplesGroups(matrix,dataArray[8],valueSampleMetadatumNameOnly)
        if (answer == "all"):
            pairsList = mostDifferentSamplesGroups(matrix,dataArray[8],[[sample] for sample in dataArray[8]])
        else:
            print "\n/!\ ERROR: You should answer 'metadata' or 'all'."
            raise ValueError
        print "[ Preview. ]"
        print "List of the pairs of most different sample groups according to the similarity coefficients computed:"
        for pair in pairsList:
            print pair
        answer2 = raw_input("\nSave the results? Y/N\n")
        if (answer2 == "Y"):
            stringPairs = ""
            for pair in pairsList:
                stringPairs += "*" + str(pair) + "\n"
            if (answer == "metadata"):
                stringSamples = ""
                for group in valueSampleMetadatumNameOnly:
                    stringSamples += "*" + str(group) + "\n"
                data = "Most different groups of samples ****\nsorted by values of metadatum: " + metadatum[0] + "\nGroups were:\n\n" + stringSamples + "\n\nAnd the most different ones are:\n\n" + stringPairs + "\n\nEND OF FILE ****"
            else:
                data = "Most different groups of samples ****\n\nThe most different ones are:\n\n" + stringPairs + "\n\nEND OF FILE ****"
            writeFile(data,"","text")
        elif not (answer2 == "N"):
            print "/!\ You should answer 'Y' or 'N'!"
def userNodeSelectionAct(dataArray):
    print dataArray[1]
    metadatum = sanitize(
        raw_input(
            "Input the metadatum that will cluster the set of samples among those written above. [ e.g. "
            + dataArray[1][0] + " ]\n")).split(";")[0]
    isInDatabase([metadatum], dataArray[1])
    listnodes = dataArray[3].values()
    nodesList = parseListNode(
        raw_input(
            "Choose the group of nodes you want to consider exclusively. [ Read the taxonomic tree to help you: e.g. "
            + sanitizeNode(listnodes[-3]) + ";" + sanitizeNode(listnodes[1]) +
            ";" + sanitizeNode(listnodes[-1]) + " ]\n"))
    isInDatabase(nodesList, listnodes)
    numberSamples = len(dataArray[0])
    numberStartingSamples = sanitize(
        raw_input(
            "Knowing there is/are " + str(numberSamples) +
            " sample(s), how many samples do you want to create the training set?\n"
        ))
    x = integer.match(numberStartingSamples)
    if not x or (x and int(numberStartingSamples) > numberSamples):
        print "\n/!\ ERROR: You should write down an integer inferior or equal to ", numberSamples, "."
        raise ValueError
    numberStartingSamples = int(numberStartingSamples)
    assignedClasses, classes, valueSet = classifyIt(dataArray, metadatum,
                                                    nodesList,
                                                    numberStartingSamples)
    numberClass = len(classes)
    youdenJ = countYouden(assignedClasses, classes, numberStartingSamples)
    interpretIt(youdenJ)
    answer = raw_input(
        "Do you want to plot the classes obtained as a pie chart? Y/N")
    if answer == "Y":
        labels = [metadatum + " = " + str(value) for value in valueSet]
        percentagesAs = [len(class1) for class1 in assignedClasses]
        percentages = [len(class1) for class1 in classes]
        plotPieChart(
            labels, percentagesAs, "Assignments depending on " +
            str(nodesList) + " to class for metadatum " + metadatum)
        plotPieChart(
            labels, percentages, "Real classes depending on " +
            str(nodesList) + " for metadatum " + metadatum)
    elif not (answer == "N"):
        print "\n Answer by Y or N!"
    answer = raw_input("Do you want to save the results? Y/N")
    if (answer == "Y"):
        writeFile(
            "Youden's J statistic for this classification is: " +
            str(youdenJ) + "\n", "Assignments depending on " +
            listNodes(nodesList) + " to classes for metadatum " + metadatum)
    elif not (answer == "N"):
        print "\n Answer by Y or N!"
    return assignedClasses, youdenJ
def randomSubSamplingAct(dataArray):
    print dataArray[1]
    metadatum = sanitize(raw_input("Input the metadatum that will cluster the set of samples among those written above. [ e.g. " + dataArray[1][0] + " ]\n")).split(";")[0]
    isInDatabase([metadatum],dataArray[1])
    s = raw_input("Input the number s of random samplings.\n")
    n = raw_input("Input the number n of nodes to select at each try.\n")
    if not integer.match(s) or not integer.match(n):
        print "\n/!\ ERROR: s and n must both be integers."
        raise ValueError
    numberSamples = len(dataArray[0])
    numberStartingSamples = sanitize(raw_input("Knowing there is/are " + str(numberSamples) + "sample(s), how many samples do you want to create the training set? \n"))
    x = integer.match(numberStartingSamples)
    if not x or (x and int(numberStartingSamples) > numberSamples):
        print "\n/!\ ERROR: You should write down an integer."
        raise ValueError
    numberStartingSamples = int(numberStartingSamples)
    listnodes = dataArray[3].values()
    s,n = int(s),int(n)
    #Here the set of classes is a list of two lists containing the samples in C and not C
    bestClassification = []
    bestClassesList = []
    #Worse value for this coefficient
    currBestYouden = inf
    nodesNumber = len(dataArray[3])
    while s:
        #Randomly draw n distinct nodes among the nodes in the taxonomic tree
        nodesList = randomChoice(listnodes,n)
        assignedClasses,classes,valueSet = classifyIt(dataArray,metadatum,nodesList,numberStartingSamples)
        numberClass = len(classes)
        youdenJ = countYouden(assignedClasses,classes,numberSamples)
        res = numberClass - youdenJ
        if min(res,currBestYouden) == res:
            bestClassification = []
            for i in nodesList:
                bestClassification.append(i)
            currBestYouden = res
            bestClassesList = []
            for i in assignedClasses:
                bestClassesList.append(i)
        s -= 1
    interpretIt(numberClass - currBestYouden)
    if answer == "Y":
        percentagesAs = [ len(class1) for class1 in assignedClasses ]
        labels = [ metadatum + " = " + str(value) for value in valueSet ]
        percentages = [ len(class1) for class1 in classes ]
        plotPieChart(labels,percentagesAs,"Assignments depending on " + listNodes(nodesList) + " to class for metadatum " + metadatum)
        plotPieChart(labels,percentages,"Real classes depending on " + listNodes(nodesList) + " for metadatum " + metadatum)
    answer = raw_input("Do you want to save the results? Y/N")
    if (answer == "Y"):
        writeFile("Best Youden's J statistic for this classification is: " + str(numberClass - currBestYouden) + "\nand most relevant list of nodes for this metadatum is:" + str(bestClassification),"Assignments to classes for metadatum " + metadatum)
    elif not (answer == "N"):
        print "\n Answer by Y or N!"
    return bestClassification,(numberClass - currBestYouden),bestClassesList
Example #9
0
def graphNO(graph):
    """Write an undirected graph, given as a matrix, in DOT format.

    graph -- two-dimensional structure whose non-empty cells unpack as
             (namei, namej, delta, distance). Only the cells strictly above
             the diagonal are read. An edge labelled with the distance is
             written when delta is true and the pair of names has not been
             written yet, in either order.
    The DOT text is handed to writeFile; nothing is returned.
    """
    #Pairs of names already written, stored in both orders
    seen = set()
    lines = ["graph g { \n"]
    n,m = np.shape(graph)
    for i in range(n):
        for j in range(i+1,m):
            cell = graph[i][j]
            if not cell:
                continue
            namei,namej,delta,distance = cell
            if delta and (namei,namej) not in seen:
                lines.append("%s -- %s [label=%s]; \n"%(sanitizeDot(namei),sanitizeDot(namej),str(distance)))
                seen.add((namei,namej))
                #The edge is undirected: the reversed pair must be recorded as well
                seen.add((namej,namei))
    lines.append(" } \n")
    writeFile("".join(lines),"","dot")
Example #10
0
def similarityAct(dataArray,iMatrix):
    print dataArray[1]
    metadataList = parseList(raw_input("Input the list of metadata you want to consider among those written above. [ e.g. " + dataArray[1][0] + ";" + dataArray[1][-1] + " ]\n"))
    isInDatabase(metadataList,dataArray[1])
    print "/!\ Computing similarity matrix..."
    m = similarity(dataArray[0],dataArray[1],metadataList)
    print "[Preview.]"
    print m
    answer = raw_input("Save the results? Y/N\n")
    if (answer == "Y"):  
        writeFile(m,"Similarity coefficients between patients for file meta/" + iMatrix + ".csv:\n" + listNodes(dataArray[8]),"array")
    elif not (answer == "N"):
        print "/!\ You should answer 'Y' or 'N'!"
    return m
Example #11
0
def diversityAct(dataArray):
    sampleNameList,metadata,interval1,interval2 = createSampleNameList(dataArray)
    #@dataArray[5] = n is the number of nodes in the taxonomic tree
    coefficient,sample = computeDiversityCoefficient(dataArray[5],sampleNameList,dataArray)
    print "\nMicrobial Diversity coefficient is: " + str(coefficient)
    print "[If you have obtained -inf, it could mean the taxonomic tree is actually empty.]\n"
    answer = raw_input("Save the results? Y/N\n")
    if (answer == "Y"):
        data = "Microbial Diversity Results ****\n for lists " + str(sampleNameList) + "\n"
        if metadata:
            data += "selected on metadata: " + str(metadata) + "with extreme values: " + str(interval1) + " (lower bounds) and " + str(interval2) + " (upper bounds) \n"
        data += "\nMicrobial Diversity coefficient is: " + str(coefficient) +"\n\nEND OF FILE ****"  
        writeFile(data,"","text")
    elif not (answer == "N"):
        print "/!\ You should answer 'Y' or 'N'!"
    answer = raw_input("Do you want to display the pie chart of the assignments to the taxonomic tree in the selected samples? Y/N\n")
    if (answer == "Y"):
        plotPieChart([sample1[0] for sample1 in sample],[sample1[1] for sample1 in sample],"Assignments to the taxonomic tree in " + str(sampleNameList[:3]))
    elif not (answer == "N"):
        print "/!\ You should answer 'Y' or 'N'!"
Example #12
0
def percentageAct(dataArray):
    """Compute the percentage of assignments to a group of nodes in selected samples.

    Prompts for the mode ('subtree' or 'bacteria'), for the nodes and, via
    createSampleNameList, for the samples. The result of percentageAssign is
    previewed, copied into a NumPy array and optionally saved.

    dataArray -- shared data container; this function reads dataArray[0],
                 [1], [2], [3], [6] and [7].
    Returns (result, nodesGroup, sampleNameList, metadataList).
    Raises ValueError when the mode is neither 'subtree' nor 'bacteria'.
    """
    uTree = raw_input("Do you to get percentage of assignments to subtrees or to bacterias themselves? subtree/bacteria \n")
    if uTree not in ("subtree","bacteria"):
        print "\n/!\ ERROR: You need to answer 'bacteria' or 'subtree'."
        raise ValueError
    usingTree = (uTree == "subtree")
    knownNodes = dataArray[6]
    #Three known nodes are shown as an example of the expected format
    exampleNodes = sanitizeNode(knownNodes[-3]) + ";" + sanitizeNode(knownNodes[1]) + ";" + sanitizeNode(knownNodes[-1])
    nodesGroup = parseListNode(raw_input("Input the list of nodes/roots of subtrees you want to consider. [ Please look at the taxonomic tree file to help you: e.g. " + exampleNodes + ". ]\n"))
    isInDatabase(nodesGroup,knownNodes)
    sampleNameList,metadataList,interval1List,interval2List = createSampleNameList(dataArray,True)
    result = percentageAssign(dataArray[0],dataArray[1],sampleNameList,dataArray[7],nodesGroup,dataArray[2],dataArray[3],usingTree)
    print "\n[Preview.]"
    print result
    #Copy of the result in a NumPy array, which is what gets saved
    size = len(result)
    data = np.zeros(size)
    for position in range(size):
        data[position] = result[position]
    print ""
    answer = raw_input("Save the results? Y/N\n")
    if answer == "Y":
        header = "Percentage of assignments ****\nin the group of nodes: " + listNodes(nodesGroup) + listSampleInvolved(metadataList,interval1List,interval2List,sampleNameList)
        writeFile(data,header,"array")
    elif answer != "N":
        print "/!\ You should answer 'Y' or 'N'!"
    return result,nodesGroup,sampleNameList,metadataList
Example #13
0
def pearsonAct(dataArray):
    xNArray,yArray,typeInput,valueInput1,valueInput2 = creatingArray(dataArray,True)
    if typeInput == "BB":
        xArray = xNArray
        pearson = samplePearson(xArray,yArray)
    #typeInput = "BM"
    else:
        #xNArray is a list of list of (sampleID,number of assignments in this sample) pairs
        #We need thus to sum the numbers of assignments for a same value of the metadata
        xArray = []
        for ls in xNArray:
            s = 0
            for pair in ls:
                s += pair[1]
            xArray.append((ls[0][0],s))
        pearson = samplePearson(xArray,yArray)
    print "\nPearson coefficient is: " + str(pearson) + "\n"
    answer = raw_input("Save the results? Y/N\n")
    if (answer == "Y"):
        data = "The (" + typeInput + ") Pearson coefficient ****\nfor values: \n\n" + str([ x[1] for x in xArray]) + "\ncorresponding to " + str(valueInput1) + "\n\n and\n\n" + str([ y[1] for y in yArray ]) + "\ncorresponding to " + str(valueInput2) + "\n\n is : " + str(pearson) + "\n\nEND OF FILE ****"
        writeFile(data,"","text")
    elif not (answer == "N"):
        print "/!\ You should answer 'Y' or 'N'!"
    answer = raw_input("Plot the corresponding graph? Y/N\n")
    if (answer == "Y"):
        cleanedxArray = [ x[1] for x in xArray ]
        cleanedyArray = [ y[1] for y in yArray ]
        maxix,minix = getMaxMin(cleanedxArray)
        maxiy,miniy = getMaxMin(cleanedyArray)
        #It is more interesting to generate a histogram in these cases
        if len(cleanedxArray) < 4:
            plotHist(cleanedyArray,str(valueInput2[:3]) + "...",str(valueInput1[:3]) + "...",maxiy+1,miniy-1,maxix-1,minix+1,"Plotting (" + typeInput + ") Pearson coefficient and the graph of both sets of values")
        else:
            plotPearsonGraph(cleanedxArray,cleanedyArray,pearson,str(valueInput1[:3]) + "...",str(valueInput2[:3]) + "...",maxix+1,minix-1,maxiy+1,miniy-1,"Plotting (" + typeInput + ") Pearson coefficient and the graph of both sets of values")
    elif not (answer == "N"):
        print "/!\ You should answer 'Y' or 'N'!"
Example #14
0
def clusteringAct(dataArray):
    """Cluster the samples with kMeans and compare the result to a metadatum-based partition.

    The user chooses a metadatum; the samples are partitioned by its values
    and the first sample of each part seeds one cluster. kMeans is run a
    first time with dataArray[8], the clusters are cleaned with
    cleanClusters, then kMeans is run a second time with dataArray[9]. The
    clusters are printed, scored against the metadatum-based partition with
    compareCluster, and may be saved (optionally with the common nodes of
    each cluster and a DOT graph).

    dataArray -- shared data container; this function reads dataArray[0],
                 [1], [3], [8] and [9] and also hands the whole container to
                 kMeans and extractCommonNodes.
    Returns nothing.
    Raises ValueError when one of the consistency checks on the number of
    classes or on the number of clustered samples fails.
    """
    print dataArray[1]
    #Only the first ';'-separated field of the sanitized answer is kept
    metadatum = sanitize(
        raw_input(
            "Select the metadatum among those above to cluster the set of samples. [e.g. "
            + dataArray[1][0] + "]\n")).split(";")[0]
    isInDatabase([metadatum], dataArray[1])
    valueSet, clusters1 = partitionSampleByMetadatumValue(
        metadatum, dataArray[1], dataArray[0])
    #Keeps only the first field of each sample entry
    clusters = [[sample[0] for sample in cluster] for cluster in clusters1]
    #that is, k in K-means Algorithm
    numberClass = len(valueSet)
    print "/!\ Number of classes:", numberClass, "."
    startSet = [cluster[0] for cluster in clusters]
    #Selects the starting samples of each cluster
    kClusters = [[start] for start in startSet]
    if not (len(clusters) == numberClass):
        print "\n/!\ ERROR: Different lengths: numberClass", numberClass, "clusters:", len(
            clusters), "."
        raise ValueError
    trimmedList = trimList(dataArray[3], startSet)
    print "/!\ Clustering with the first distance..."
    #@distanceInClusters is a list of lists of (sample,sum of all distances from this sample to others samples in the same cluster)
    #@dataArray[8] = distMatchedDict
    kClusters, meanSamples, distanceDict, distanceInClusters = kMeans(
        trimmedList, numberClass, kClusters, startSet, dataArray[8], dataArray)
    print "-- End of first clustering --"
    #Counts the samples over all clusters: after the first pass the total must equal len(dataArray[3])
    number = 0
    for cluster in kClusters:
        for _ in cluster:
            number += 1
    if not (number == len(dataArray[3])):
        print "\n/!\ ERROR: A bug occurred during the clustering:", number, "=/=", len(
            dataArray[3]), "."
        raise ValueError
    #Deletes samples in cluster that are too far from the others
    kClusters, untaken = cleanClusters(kClusters, distanceInClusters)
    #Same expression as the earlier startSet: first sample of each metadatum-based cluster
    startSet = [cluster[0] for cluster in clusters]
    #Remove from untaken the starting samples
    untaken2 = []
    for x in untaken:
        if not (x in startSet):
            untaken2.append(x)
    untaken = untaken2
    #Remove the samples in untaken from the total set of samples
    #(the set is rebuilt, without duplicates, from the cleaned clusters plus the starting samples)
    sampleSet = []
    for cluster in kClusters:
        for x in cluster:
            if not (x in sampleSet):
                sampleSet.append(x)
    for x in startSet:
        if not (x in sampleSet):
            sampleSet.append(x)
    trimmedList = trimList(sampleSet, startSet)
    print "/!\ Clustering with the second distance..."
    #@distanceDict is the distance dictionary (key=(sample1,sample2),value=distance between sample1 and sample2)
    #@dataArray[9] = distConsensusDict
    kClusters, meanSamples, distanceDict, _ = kMeans(trimmedList, numberClass,
                                                     kClusters, startSet,
                                                     dataArray[9],
                                                     dataArray)  #,meanSamples)
    print "-- End of second clustering --"
    #After the second pass the total may be lower than len(dataArray[3]), never greater
    number = 0
    for cluster in kClusters:
        for _ in cluster:
            number += 1
    if not (number <= len(dataArray[3])):
        print "\n/!\ ERROR: An error occurred during the clustering:", number, ">", len(
            dataArray[3]), "."
        raise ValueError
    print "Printing the", numberClass, "clusters:"
    i = 1
    #@kClusters contains the list of the k clusters. Each cluster is a list of sample IDs
    for cluster in kClusters:
        print "\n-- Cluster #", i, "associated to", metadatum, "=", valueSet[
            i - 1], ":"
        print "Size:", len(cluster)
        print sorted(cluster)
        i += 1
    print "\nScore of the clustering (comprised between 0 and 1):"
    print "The more it is close to 1, the more the clustering is relevant."
    #The clustering obtained with the K-Means method
    #(shallow copy, so that the pops below leave kClusters untouched)
    kClustersCopy = [cluster for cluster in kClusters]
    #The clustering obtained by comparing the values of the metadatum
    clustersCopy = [cluster for cluster in clusters]
    #Score by using first method of comparison
    compareClusterScore = 0
    if not (len(kClustersCopy) == numberClass == len(clustersCopy)):
        print "\n/!\ ERROR: Length error in clustering:", numberClass, len(
            kClustersCopy), len(clustersCopy), "."
        raise ValueError
    #Clusters are compared pairwise, starting from the last one of each list
    while kClustersCopy and clustersCopy:
        cl1 = kClustersCopy.pop()
        cl2 = clustersCopy.pop()
        #clusters are non-empty
        x = compareCluster(cl1, cl2, untaken)
        #Any false result of the comparison (None, but also 0) discards the whole score
        if x:
            compareClusterScore += x
        else:
            compareClusterScore = None
            break
    #A total equal to 0 is reported as "None", just like a discarded score
    if compareClusterScore:
        #NOTE(review): under Python 2 this division truncates if both operands are integers - confirm what compareCluster returns
        compareClusterScore = compareClusterScore / numberClass
        printClusterScore = compareClusterScore
    else:
        printClusterScore = "None"
    #Score by using second method of comparison
    #compareCentersScore = compareCenters(meanSamples,distanceDict,numberClass)
    print "Compare clusters score is:", printClusterScore, "."
    #print "Compare centers score is:",compareCentersScore,"."
    answer = raw_input("Do you want to save the results? Y/N\n")
    if (answer == "Y"):
        answer2 = raw_input(
            "Do you want to compute the sets of common nodes for each cluster? [It can be considered relevant when the score of comparing clusters is at least over 0.5] Y/N\n"
        )
        #commonList only exists when answer2 is "Y"; it is read below under the same condition
        if (answer2 == "Y"):
            commonList = extractCommonNodes(kClusters, dataArray)
        elif not (answer2 == "N"):
            print "\n/!\ You should answer Y or N, not:", answer2, "."
        data = "**** CLUSTERS FOR METADATUM " + metadatum + " WITH VALUES: " + str(
            valueSet)
        i = 0
        for cluster in kClusters:
            data += "\n\n-- Cluster #" + str(
                i + 1) + " associated to " + metadatum + " = " + str(
                    valueSet[i])
            data += "\nSize: " + str(len(cluster))
            if (answer2 == "Y"):
                data += "\nSet of common nodes: " + str(commonList[i])
            data += "\n" + str(cluster)
            i += 1
        #The saved file holds the raw score (possibly None), not the printed "None" string
        data += "\n\nCompare clusters score is: " + str(compareClusterScore)
        #data += "\n\nCompare centers score is: " + str(compareCentersScore)
        data += "\n\nEND OF FILE ****"
        print "\n/!\ Saving clusters..."
        writeFile(data)
        #answer2 is reused for the graph question
        answer2 = raw_input(
            "Do you want to compute the graph of the clusters? Y/N\n")
        if (answer2 == "Y"):
            print "\n/!\ Constructing the graph of the clusters..."
            #@dataArray[3] = filenames
            graph = convertClustersIntoGraph(kClusters, distanceDict,
                                             len(dataArray[3]))
            graphNO(graph)
            print "\n/!\ Done. The graph is in DOT format in \"files\" folder."
        elif not (answer2 == "N"):
            print "\n/!\ You should answer Y or N, not:", answer2, "."
    elif not (answer == "N"):
        print "/!\ You should answer by Y or N."
Example #15
0
def randomSubSamplingAct(dataArray):
    print dataArray[1]
    metadata = parseList(raw_input("Input the metadata that will cluster the set of samples among those written above. [ e.g. " + dataArray[1][0] + ";" + dataArray[1][-1] + " ]\n"))
    isInDatabase(metadata,dataArray[1])
    s = raw_input("Input the number s of random samplings.\n")
    n = raw_input("Input the number n of nodes to select at each try.\n")
    numberofSamples = len(dataArray[0])
    if not integer.match(s) or not integer.match(n):
        print "\n/!\ ERROR: s and n must both be integers."
        raise ValueError
    s,n = int(s),int(n)
    numberStartingSamples = sanitize(raw_input("Knowing there is/are " + str(numberofSamples) + " sample(s), how many samples do you want to create the training set?\n"))
    x = integer.match(numberStartingSamples)
    if not x or (x and (int(numberStartingSamples) > numberofSamples)):
        print "\n/!\ ERROR: You should write down an integer less or equal to",numberofSamples,"."
        raise ValueError
    numberStartingSamples = int(numberStartingSamples)
    #Here the set of classes is a list of two lists containing the samples in C and not C
    bestClassification = []
    bestClassesList = []
    bestShape = []
    bestValuesList = []
    #Worse value for this coefficient
    currBestYouden = inf
    nodesNumber = len(dataArray[3])
    #@dataArray[2] = idSequences, which is a dictionary of (key=identifier,values=(name,rank of node))
    listofNodes = dataArray[2].values()
    while s:
        #Randomly draw n distinct nodes among the nodes in the taxonomic tree
        nodesList = randomChoice(listofNodes,n)
        assignedClasses,classes,valueSets = classifyIt(dataArray,metadata,nodesList,numberStartingSamples)
        numberClass = classes.lenMDL(shape)
        #len(dataArray[0])?
        youdenJ = countYouden(assignedClasses,classes,numberofSamples)
        res = numberClass - youdenJ
        if min(res,currBestYouden) == res:
            bestValuesList = []
            for i in valueSets:
                bestValuesList.append(i)
            bestClassification = []
            for i in nodesList:
                bestClassification.append(i)
            bestShape = []
            for i in shape:
                bestShape.append(i)
            currBestYouden = res
            bestClassesList = []
            for i in assignedClasses:
                bestClassesList.append(i)
        s -= 1
    interpretIt(numberClass - currBestYouden)
    if answer == "Y":
        labels = [ "Metadata: " + str(metadata) + ", Values for each metadatum: " + str([ valueSet for valueSet in valueSets]) ]
        percentagesAs = assignedClasses.mapMDL(len)
        percentages = classes.mapMDL(len)
        plotPie(labels,percentagesAs,"Assignments depending on " + str(nodesList) + " to class for metadata " + str(metadata))
        plotPie(labels,percentages,"Real classes depending on " + str(nodesList) + " for metadata " + str(metadata))
    answer = raw_input("Do you want to save the results? Y/N \n")
    if (answer == "Y"):
        writeFile("Best Youden's J statistic for this classification is: " + str(numberClass - currBestYouden) + "\nand most relevant list of nodes for this set of metadata is:" + str(bestClassification),"Assignments to classes for metadata " + str(metadata))
    elif not (answer == "N"):
        print "\n Answer by Y or N!"
    return bestClassification,(numberClass - currBestYouden),bestClassesList
Example #16
0
def randomSubSamplingAct(dataArray):
    print dataArray[1]
    metadatum = sanitize(
        raw_input(
            "Input the metadatum that will cluster the set of samples among those written above. [ e.g. "
            + dataArray[1][0] + " ]\n")).split(";")[0]
    isInDatabase([metadatum], dataArray[1])
    s = raw_input("Input the number s of random samplings.\n")
    n = raw_input("Input the number n of nodes to select at each try.\n")
    if not integer.match(s) or not integer.match(n):
        print "\n/!\ ERROR: s and n must both be integers."
        raise ValueError
    numberSamples = len(dataArray[0])
    numberStartingSamples = sanitize(
        raw_input(
            "Knowing there is/are " + str(numberSamples) +
            "sample(s), how many samples do you want to create the training set? \n"
        ))
    x = integer.match(numberStartingSamples)
    if not x or (x and int(numberStartingSamples) > numberSamples):
        print "\n/!\ ERROR: You should write down an integer."
        raise ValueError
    numberStartingSamples = int(numberStartingSamples)
    listnodes = dataArray[3].values()
    s, n = int(s), int(n)
    #Here the set of classes is a list of two lists containing the samples in C and not C
    bestClassification = []
    bestClassesList = []
    #Worse value for this coefficient
    currBestYouden = inf
    nodesNumber = len(dataArray[3])
    while s:
        #Randomly draw n distinct nodes among the nodes in the taxonomic tree
        nodesList = randomChoice(listnodes, n)
        assignedClasses, classes, valueSet = classifyIt(
            dataArray, metadatum, nodesList, numberStartingSamples)
        numberClass = len(classes)
        youdenJ = countYouden(assignedClasses, classes, numberSamples)
        res = numberClass - youdenJ
        if min(res, currBestYouden) == res:
            bestClassification = []
            for i in nodesList:
                bestClassification.append(i)
            currBestYouden = res
            bestClassesList = []
            for i in assignedClasses:
                bestClassesList.append(i)
        s -= 1
    interpretIt(numberClass - currBestYouden)
    if answer == "Y":
        percentagesAs = [len(class1) for class1 in assignedClasses]
        labels = [metadatum + " = " + str(value) for value in valueSet]
        percentages = [len(class1) for class1 in classes]
        plotPieChart(
            labels, percentagesAs, "Assignments depending on " +
            listNodes(nodesList) + " to class for metadatum " + metadatum)
        plotPieChart(
            labels, percentages, "Real classes depending on " +
            listNodes(nodesList) + " for metadatum " + metadatum)
    answer = raw_input("Do you want to save the results? Y/N")
    if (answer == "Y"):
        writeFile(
            "Best Youden's J statistic for this classification is: " +
            str(numberClass - currBestYouden) +
            "\nand most relevant list of nodes for this metadatum is:" +
            str(bestClassification),
            "Assignments to classes for metadatum " + metadatum)
    elif not (answer == "N"):
        print "\n Answer by Y or N!"
    return bestClassification, (numberClass - currBestYouden), bestClassesList
Example #17
0
def clusteringAct(dataArray):
    print dataArray[1]
    metadatum = sanitize(raw_input("Select the metadatum among those above to cluster the set of samples. [e.g. " + dataArray[1][0] + "]\n")).split(";")[0]
    isInDatabase([metadatum],dataArray[1])
    valueSet,clusters1 = partitionSampleByMetadatumValue(metadatum,dataArray[1],dataArray[0])
    clusters = [[sample[0] for sample in cluster] for cluster in clusters1]
    #that is, k in K-means Algorithm
    numberClass = len(valueSet)
    print "/!\ Number of classes:",numberClass,"."
    startSet = [cluster[0] for cluster in clusters]
    #Selects the starting samples of each cluster
    kClusters = [[start] for start in startSet]
    if not (len(clusters) == numberClass):
        print "\n/!\ ERROR: Different lengths: numberClass",numberClass,"clusters:",len(clusters),"."
        raise ValueError
    trimmedList = trimList(dataArray[3],startSet)
    print "/!\ Clustering with the first distance..."
    #@distanceInClusters is a list of lists of (sample,sum of all distances from this sample to others samples in the same cluster)
    #@dataArray[8] = distMatchedDict
    kClusters,meanSamples,distanceDict,distanceInClusters = kMeans(trimmedList,numberClass,kClusters,startSet,dataArray[8],dataArray)
    print "-- End of first clustering --"
    number = 0
    for cluster in kClusters:
        for _ in cluster:
            number += 1
    if not (number == len(dataArray[3])):
        print "\n/!\ ERROR: A bug occurred during the clustering:",number,"=/=",len(dataArray[3]),"."
        raise ValueError
    #Deletes samples in cluster that are too far from the others
    kClusters,untaken = cleanClusters(kClusters,distanceInClusters)
    startSet = [cluster[0] for cluster in clusters]
    #Remove from untaken the starting samples
    untaken2 = []
    for x in untaken:
        if not (x in startSet):
            untaken2.append(x)
    untaken = untaken2
    #Remove the samples in untaken from the total set of samples
    sampleSet = []
    for cluster in kClusters:
        for x in cluster:
            if not (x in sampleSet):
                sampleSet.append(x)
    for x in startSet:
        if not (x in sampleSet):
            sampleSet.append(x)
    trimmedList = trimList(sampleSet,startSet)
    print "/!\ Clustering with the second distance..."
    #@distanceDict is the distance dictionary (key=(sample1,sample2),value=distance between sample1 and sample2)
    #@dataArray[9] = distConsensusDict
    kClusters,meanSamples,distanceDict,_ = kMeans(trimmedList,numberClass,kClusters,startSet,dataArray[9],dataArray)#,meanSamples)
    print "-- End of second clustering --" 
    number = 0
    for cluster in kClusters:
        for _ in cluster:
            number += 1
    if not (number <= len(dataArray[3])):
        print "\n/!\ ERROR: An error occurred during the clustering:",number,">",len(dataArray[3]),"."
        raise ValueError
    print "Printing the",numberClass,"clusters:"
    i = 1
    #@kClusters contains the list of the k clusters. Each cluster is a list of sample IDs
    for cluster in kClusters:
        print "\n-- Cluster #",i,"associated to",metadatum,"=",valueSet[i-1],":"
        print "Size:",len(cluster)
        print sorted(cluster)
        i += 1
    print "\nScore of the clustering (comprised between 0 and 1):"
    print "The more it is close to 1, the more the clustering is relevant."
    #The clustering obtained with the K-Means method
    kClustersCopy = [cluster for cluster in kClusters]
    #The clustering obtained by comparing the values of the metadatum
    clustersCopy = [cluster for cluster in clusters]
    #Score by using first method of comparison
    compareClusterScore = 0
    if not (len(kClustersCopy) == numberClass == len(clustersCopy)):
        print "\n/!\ ERROR: Length error in clustering:",numberClass,len(kClustersCopy),len(clustersCopy),"."
        raise ValueError
    while kClustersCopy and clustersCopy:
        cl1 = kClustersCopy.pop()
        cl2 = clustersCopy.pop()
        #clusters are non-empty
        x = compareCluster(cl1,cl2,untaken)
        if x:
            compareClusterScore += x
        else:
            compareClusterScore = None
            break
    if compareClusterScore:
        compareClusterScore = compareClusterScore/numberClass
        printClusterScore = compareClusterScore
    else:
        printClusterScore = "None"
    #Score by using second method of comparison
    #compareCentersScore = compareCenters(meanSamples,distanceDict,numberClass)
    print "Compare clusters score is:",printClusterScore,"."
    #print "Compare centers score is:",compareCentersScore,"."
    answer = raw_input("Do you want to save the results? Y/N\n")
    if (answer == "Y"):
        answer2 = raw_input("Do you want to compute the sets of common nodes for each cluster? [It can be considered relevant when the score of comparing clusters is at least over 0.5] Y/N\n")
        if (answer2 == "Y"):
            commonList = extractCommonNodes(kClusters,dataArray)
        elif not (answer2 == "N"):
            print "\n/!\ You should answer Y or N, not:",answer2,"."
        data = "**** CLUSTERS FOR METADATUM " + metadatum + " WITH VALUES: " + str(valueSet)
        i = 0
        for cluster in kClusters:
            data += "\n\n-- Cluster #" + str(i+1) + " associated to " + metadatum + " = " + str(valueSet[i]) 
            data += "\nSize: " + str(len(cluster))
            if (answer2 == "Y"):
                data += "\nSet of common nodes: " + str(commonList[i])
            data += "\n" + str(cluster)
            i += 1
        data += "\n\nCompare clusters score is: " + str(compareClusterScore)
        #data += "\n\nCompare centers score is: " + str(compareCentersScore)
        data += "\n\nEND OF FILE ****"
        print "\n/!\ Saving clusters..."
        writeFile(data)
        answer2 = raw_input("Do you want to compute the graph of the clusters? Y/N\n")
        if (answer2 == "Y"):
            print "\n/!\ Constructing the graph of the clusters..."
            #@dataArray[3] = filenames
            graph = convertClustersIntoGraph(kClusters,distanceDict,len(dataArray[3]))
            graphNO(graph)
            print "\n/!\ Done. The graph is in DOT format in \"files\" folder."
        elif not (answer2 == "N"):
            print "\n/!\ You should answer Y or N, not:",answer2,"."
    elif not (answer == "N"):
        print "/!\ You should answer by Y or N."
Example #18
0
def patternRatioAct(dataArray):
    print "First list of samples."
    sampleNameList1,metadata1,interval11,interval21 = createSampleNameList(dataArray)
    print "Second list of samples."
    sampleNameList2,metadata2,interval12,interval22 = createSampleNameList(dataArray)
    commonPatternsList = enumerateCommonPatterns(dataArray[7],sampleNameList1,sampleNameList2)
    specificPatternsList1 = enumerateSpecificPatterns(dataArray[7],sampleNameList1,sampleNameList2)
    specificPatternsList2 = enumerateSpecificPatterns(dataArray[7],sampleNameList2,sampleNameList1)
    pRatio = patternRatio(commonPatternsList,specificPatternsList1,specificPatternsList2)
    #Only printing patterns of length > 1
    print "\n--- Total number of common patterns: ",len(commonPatternsList)
    print "--- Common patterns of length > 1 ---"
    if commonPatternsList:
        for x in commonPatternsList:
            if len(x[0]) > 1:
                print x[0]
    else:
        print "No pattern of length > 1."
    print "\n--- Total number of specific patterns in",sampleNameList1
    if metadata1:
        print "selected on metadata: ",str(metadata1),"with lower and upper bounds being",str(interval11),"and",str(interval21),":"
    print len(specificPatternsList1)
    print "--- Specific patterns of length > 1 in",sampleNameList1,"---"
    if specificPatternsList1:
        for x in specificPatternsList1:
            if len(x[0]) > 1:
                print x[0]
    else:
        print "No pattern of length > 1."
    print "\n--- Total number of specific patterns in",sampleNameList2
    if metadata2:
        print "selected on metadata: ",str(metadata2),"with lower and upper bounds being",str(interval12),"and",str(interval22),":"
    print len(specificPatternsList2)
    print "--- Specific patterns of length > 1 in",sampleNameList2,"---"
    if specificPatternsList2:
        for x in specificPatternsList2:
            if len(x[0]) > 1:
                print x[0]
    else:
        print "No pattern of length > 1."
    print "\nPattern Ratio is: ",pRatio,"\n"
    print "[ If pattern ratio is superior to one, it means the two groups of samples are quite alike. Please read README ]"
    print "[ If you obtained +inf, if there are common patterns (of length 1 or superior to 1), it could mean both groups of samples contain exactly the same set of nodes. If there is no common pattern, it could mean there is no sample in both groups ]\n"
    answer = raw_input("Save the results? Y/N\n")
    if (answer == "Y"):
        data = "Pattern Ratio Results ****\nfor lists of samples " + str(sampleNameList1) + "\n"
        if metadata1:
            data += "selected on metadata: " + str(metadata1) + " with lower and upper bounds being " + str(interval11) + " and " + str(interval21) + "\n"
        data += "\nand " + str(sampleNameList2) + "\n"
        if metadata2:
            data += "selected on metadata: " + str(metadata2) + " with lower and upper bounds being " + str(interval12) + " and " + str(interval22) + "\n"
        data += "\n-> Pattern Ratio is: " + str(pRatio) + "\n\nPrinting patterns: first is the list of nodes in the pattern, then the total number of assignations in this pattern and eventually the total number of nodes in the pattern\n\n-> Common Patterns:\n"
        for x in commonPatternsList:
            data += str(x) + "\n"
        data += "\n-> Specific patterns to " + str(sampleNameList1) + ":\n"
        for x in specificPatternsList1:
            data += str(x) + "\n"
        data += "\n-> Specific patterns to " + str(sampleNameList2) + ":\n"
        for x in specificPatternsList2:
            data += str(x) + "\n"
        data += "\nEND OF FILE ****"
        writeFile(data,"","text")
    elif not (answer == "N"):
        print "/!\ You should answer 'Y' or 'N'!"