Code example #1
File: Ex7_Kmeans_PCA.py Project: AysegulSezen/asezen
import math

import numpy as np
from skimage import io


def displayData(X, example_width=None):

    # Default to square patches when no width is given.
    if example_width is None:
        example_width = round(math.sqrt(X.shape[1]))

    m = X.shape[0]
    n = X.shape[1]
    example_height = int(n / example_width)

    # Compute number of items to display
    display_rows = math.floor(math.sqrt(m))
    display_cols = math.ceil(m / display_rows)

    # Between images padding
    pad = 1

    # Set up a blank display grid; w1 counts rows, h1 counts columns
    w1 = pad + display_rows * (example_height + pad)
    h1 = int(pad + display_cols * (example_width + pad))
    display_array = -np.ones(shape=(w1, h1))

    #display_array[0:32,0:32]=X[0, :].reshape( example_height, example_width,order='F') #/ max_val;
    #display_array[0:32,32:64]=X[1, :].reshape( example_height, example_width,order='F')

    # Copy each example into a patch on the display array
    curr_ex = 0
    for j in range(1, display_rows + 1):
        for i in range(1, display_cols + 1):
            if curr_ex >= m:  # grid may have more cells than examples
                break
            # Normalize each patch by its largest absolute value.
            max_val = np.max(np.abs(X[curr_ex, :]))
            row0 = pad + (j - 1) * (example_height + pad)
            row1 = row0 + example_height
            col0 = pad + (i - 1) * (example_width + pad)
            col1 = col0 + example_width
            display_array[row0:row1, col0:col1] = X[curr_ex, :].reshape(
                example_height, example_width, order='F') / max_val

            curr_ex += 1
        if curr_ex >= m:
            break

    # Display Image
    h = io.imshow(display_array, cmap='gray')
    io.show()

    return h, display_array
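A quick way to exercise displayData is to feed it any matrix of flattened square patches. The sketch below assumes a MATLAB-style file such as the course's ex7faces.mat with an 'X' key; that file name is an assumption, not part of the example above.

import scipy.io

# Hypothetical usage: 'ex7faces.mat' is assumed to hold an (m, n) matrix
# of flattened square image patches under the key 'X'.
mat = scipy.io.loadmat('ex7faces.mat')
X = mat['X'][:100, :]  # display the first 100 examples as a grid
h, display_array = displayData(X)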
Code example #2
import h5py
import numpy as np
import scipy.io
import matplotlib.pyplot as plt
from skimage import io

# filterLatticeTimeSeries, binLatticeOccupancy, and addLatticeBins are
# defined elsewhere in this project (see the sketch after this example).


def main(sc, outputDir, outputIndices, filename, replicates, minTime, maxTime,
         speciesToBin, outputFileNum, skipTime, sparse, interactive):

    global globalSpeciesToBin, globalSkipTime, globalSparse, globalReplicates, globalMinTime, globalMaxTime

    # Broadcast the global variables.
    globalSpeciesToBin = sc.broadcast(speciesToBin)
    globalReplicates = sc.broadcast(replicates)
    globalMinTime = sc.broadcast(minTime)
    globalMaxTime = sc.broadcast(maxTime)
    globalSkipTime = sc.broadcast(skipTime)
    globalSparse = sc.broadcast(sparse)

    # Load the records from the sfile.
    allRecords = sc.newAPIHadoopFile(
        filename,
        "robertslab.hadoop.io.SFileInputFormat",
        "robertslab.hadoop.io.SFileHeader",
        "robertslab.hadoop.io.SFileRecord",
        keyConverter="robertslab.spark.sfile.SFileHeaderToPythonConverter",
        valueConverter="robertslab.spark.sfile.SFileRecordToPythonConverter")

    # Bin the species counts records and sum across all of the bins.
    results = allRecords.filter(filterLatticeTimeSeries).map(
        binLatticeOccupancy).reduceByKey(addLatticeBins).values().collect()
    totalTimePoints = results[0][0]
    bins = results[0][1]
    bins[:, :, :, :, 0] += totalTimePoints
    print "Recovered bins from %d total time points" % (totalTimePoints)
    print bins.shape

    # Get the file:
    inputFile = "ltable/grad_bc/yeast_cell/molar/vol_full/data_rdme_Dkp15/cell_modelII_48reps_gradient_0_c1b_1.0e-6_c2b_2.0e-6_c3b_6.0e-5_c0a_5.0e-4_c4_2.0e-4_c5_2.0e-3_c6_2.0e-6_Dk_5.0e-12_Dkp_5.0e-15_Dr_5.0e-12_Drl_5.0e-15.lm"
    f = h5py.File(inputFile, 'r')
    print("Processing %s file." % (inputFile))

    # Get the membrane sites
    lattice = f["/Model/Diffusion/LatticeSites"][()]  # .value was removed in h5py 3.x
    MembraneSites = (lattice == 1).astype(int)
    l, m, n = MembraneSites.nonzero()
    print("Length of membrane sites = " + str(len(l)))

    # Calculating the mean:
    particle = np.zeros((bins.shape[0], bins.shape[1], bins.shape[2],
                         bins.shape[3], bins.shape[4]))
    data = np.zeros((bins.shape[0], bins.shape[1], bins.shape[2],
                     bins.shape[3], bins.shape[4]))
    for p in range(0, bins.shape[4]):
        particle[:, :, :, :, p] = p

    # NOTE: counts is rewritten on every pass of the species loop, so after
    # the loop it holds values for the last entry in speciesToBin only.
    counts = np.zeros((len(l), 5))
    for i in range(0, len(speciesToBin)):
        for mem in range(0, len(l)):
            data[i, l[mem], m[mem],
                 n[mem], :] = particle[i, l[mem], m[mem],
                                       n[mem], :] * bins[i, l[mem], m[mem],
                                                         n[mem], :]
        mean = np.sum(data, axis=4)
        for mem in range(0, len(l)):
            counts[mem, 0] = speciesToBin[i]
            counts[mem, 1] = l[mem]
            counts[mem, 2] = m[mem]
            counts[mem, 3] = n[mem]
            counts[mem, 4] = float(mean[i, l[mem], m[mem],
                                        n[mem]]) / float(totalTimePoints)

    # Save the counts into a .mat file in the output directory named according to the output indices.
    # cellio.cellsave(outputDir,counts,outputIndices);
    outputFile = 'counts_event_%s.mat' % (outputFileNum)
    scipy.io.savemat(outputDir + outputFile, dict(counts=counts))
    print("Binned species data into %s" % (outputDir))

    # # Save the pdfs into a .mat file in the output directory named according to the output indices.
    # pdfs=np.zeros((len(speciesToBin),),dtype=object)
    # for i in range(0,len(speciesToBin)):
    #     counts = sum(data)
    #     pdf=bins[i,:,:,:,:].astype(float)/float(totalTimePoints)
    #     # /float(np.sum(bins[i,0,0,0,:]))
    #     pdfs[i] = pdf
    # cellio.cellsave(outputDir,pdfs,outputIndices);
    # print("Binned species data into %s"%(outputDir))

    # If interactive, show the pdf.
    if interactive:
        subvolumeCounts = bins.sum(axis=1).sum(axis=1).sum(axis=1)
        for i in range(0, len(speciesToBin)):
            print "Subvolume distribution for species %d" % (speciesToBin[i])
            plt.figure()
            plt.subplot(1, 1, 1)
            plt.bar(np.arange(0, subvolumeCounts.shape[1]),
                    np.log10(subvolumeCounts[i, :]))
            io.show()
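filterLatticeTimeSeries, binLatticeOccupancy, and addLatticeBins live elsewhere in the project. Judging only by how the collected result is unpacked above (a (totalTimePoints, bins) pair per key), a reducer compatible with that shape might look like the sketch below; this is an assumption, not the project's actual code.

# Hypothetical reducer for reduceByKey: pairwise-sums the time-point
# totals and the per-subvolume bin arrays of two partial results.
def addLatticeBins(a, b):
    timePointsA, binsA = a
    timePointsB, binsB = b
    return (timePointsA + timePointsB, binsA + binsB)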
Code example #3
import numpy as np
import scipy.io
from skimage import io
from sklearn.cluster import KMeans

# findClosestCentroids, computeCentroids, runkMeans, and kMeansInitCentroids
# are defined elsewhere in this project (see the sketch after this example).


def ex7_Kmeans():  # main function
    #################-1-
    print('Finding closest centroids.')
    mat = scipy.io.loadmat('ex7data2.mat')
    X = mat['X']

    K = 3
    # 3 Centroids
    initial_centroids = np.array([[3, 3], [6, 2], [8, 5]])

    # Find the closest centroids for the examples using the initial_centroids
    idx = findClosestCentroids(X, initial_centroids)
    print('Closest centroids for the first 3 examples:', idx[0:3])
    print('(the closest centroids should be 1, 3, 2 respectively)')

    #################-2-
    print('Computing centroids means.')

    #  Compute means based on the closest centroids found in the previous part.
    centroids = computeCentroids(X, idx, K)

    print('Centroids computed after initial finding of closest centroids:',
          centroids)
    print('(the centroids should be')
    print('   [ 2.428301 3.157924 ]\n')
    print('   [ 5.813503 2.633656 ]\n')
    print('   [ 7.119387 3.616684 ]\n\n')

    ################-3-
    print('Running K-Means clustering on example dataset.')

    K = 3
    max_iters = 10
    initial_centroids = np.array([[3, 3], [6, 2], [8, 5]])
    [centroids, idx] = runkMeans(X, initial_centroids, max_iters, True)

    print('K-Means Done.')

    ###############-4-
    A = io.imread('bird_small.png')
    A = A / 255.0  # float divisor so pixel values scale to [0, 1]
    img_size = A.shape
    X = np.reshape(A, (img_size[0] * img_size[1], 3))
    K = 16
    max_iters = 10
    initial_centroids = kMeansInitCentroids(X, K)
    [centroids, idx] = runkMeans(X, initial_centroids, max_iters, False)
    io.imshow(A)

    ##############-5-

    print('Applying K-Means to compress an image.')
    #print('centroids:',centroids)
    idx1 = findClosestCentroids(X, centroids)
    #print('idx1:',idx1[0:10])
    #print('idx1 sh:',idx1.shape)

    X_recovered = []  # should end up as a 16384 x 3 array
    for i in range(0, idx1.shape[0]):  # 16384 pixels
        X_recovered.append(centroids[int(idx1[i][0])])

    X_recovered = np.reshape(X_recovered, (img_size[0], img_size[1], 3))

    # Display the original image
    io.imshow(A)  #imagesc(A);
    #io.title("Original")
    io.show()

    # Display compressed image side by side
    #io.title('Compressed, with',K,'colors.')
    io.imshow(X_recovered)  #imagesc(X_recovered)
    io.show()

    ###############-6- beyond the homework: the same job with sklearn's KMeans class
    print('Image compression by K-means with the sklearn KMeans class..')
    kmeans = KMeans(init="random",
                    n_clusters=K,
                    n_init=10,
                    max_iter=max_iters,
                    random_state=42)

    kmeans.fit(X)  # X : pixels of image.

    print('kmeans.inertia_:', kmeans.inertia_)
    print(
        'kmeans.cluster_centers_:',
        kmeans.cluster_centers_)  # same as the centroids variable above
    print('kmeans.n_iter_:', kmeans.n_iter_)
    print('kmeans.labels_[:10]:', kmeans.labels_[:10])
    print('kmeans.labels_:', kmeans.labels_.shape)

    y_kmeans = kmeans.predict(X)  # y_kmeans matches the idx1 variable above

    print('y_kmeans:', y_kmeans.shape)
    print('y_kmeans 0-10:', y_kmeans[0:10])

    X_recovered2 = []
    for i in range(0, y_kmeans.shape[0]):  # 16384
        X_recovered2.append(kmeans.cluster_centers_[int(y_kmeans[i])])

    X_recovered2 = np.reshape(X_recovered2, (img_size[0], img_size[1], 3))
    io.imshow(X_recovered2)
    io.show()
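findClosestCentroids and computeCentroids are defined elsewhere in this project; note the project's version appears to return 1-based, column-shaped indices (see the idx1[i][0] access above). A minimal 0-based NumPy sketch of the two K-means steps they implement, offered as an assumption rather than the project's actual code:

import numpy as np

# Assignment step: index of the nearest centroid for each row of X.
def findClosestCentroids(X, centroids):
    dists = np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=2)
    return np.argmin(dists, axis=1)

# Update step: mean of the points assigned to each of the K centroids.
def computeCentroids(X, idx, K):
    return np.array([X[idx == k].mean(axis=0) for k in range(K)])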
Code example #4
import pickle

import numpy as np
import matplotlib.pyplot as plt
from skimage import io

# filterLatticeTimeSeries, binLatticeOccupancy, and addLatticeBins are
# defined elsewhere in this project.


def main(sc, outputDir, outputIndices, filename, replicates, trajectory,
         speciesToBin, outputFileNum, skipTime, sparse, interactive):

    global globalSpeciesToBin, globalSkipTime, globalSparse, globalReplicates, globalTrajectory

    # Broadcast the global variables.
    globalSpeciesToBin = sc.broadcast(speciesToBin)
    globalReplicates = sc.broadcast(replicates)
    globalTrajectory = sc.broadcast(trajectory)
    globalSkipTime = sc.broadcast(skipTime)
    globalSparse = sc.broadcast(sparse)
    
    # Load the records from the sfile.
    allRecords = sc.newAPIHadoopFile(
        filename,
        "robertslab.hadoop.io.SFileInputFormat",
        "robertslab.hadoop.io.SFileHeader",
        "robertslab.hadoop.io.SFileRecord",
        keyConverter="robertslab.spark.sfile.SFileHeaderToPythonConverter",
        valueConverter="robertslab.spark.sfile.SFileRecordToPythonConverter")

    # Bin the species counts records and sum across all of the bins.
    results = allRecords.filter(filterLatticeTimeSeries).map(
        binLatticeOccupancy).reduceByKey(addLatticeBins).values().collect()
    totalTimePoints = results[0][0]
    bins = results[0][1]
    # bins[:,:,:,:,0] += totalTimePoints
    print("Recovered bins from %d total time points" % (totalTimePoints))
    print(bins.shape)

    # # Get the file:
    # inputFile = "../../ltable/grad_bc/yeast_cell/molar/vol_full/data_rdme_Dkp15/cell_modelII_48reps_gradient_0_c1b_1.0e-6_c2b_2.0e-6_c3b_6.0e-5_c0a_5.0e-4_c4_2.0e-4_c5_2.0e-3_c6_2.0e-6_Dk_5.0e-12_Dkp_5.0e-15_Dr_5.0e-12_Drl_5.0e-15.lm"
    # f = h5py.File(inputFile,'r')
    # print ("Processing %s file."%(inputFile))
    #
    # # Get the membrane sites
    # lattice = f["/Model/Diffusion/LatticeSites"].value
    # MembraneSites = (lattice==1).astype(int)
    # l, m, n = MembraneSites.nonzero()
    # print "Length of membrane sites = " + str(len(l))
    #
    # # Calculating the mean:
    # particle = np.zeros((bins.shape[0],bins.shape[1],bins.shape[2],bins.shape[3],bins.shape[4]))
    # data = np.zeros((bins.shape[0],bins.shape[1],bins.shape[2],bins.shape[3],bins.shape[4]))
    # for p in range(0,bins.shape[4]):
    #     particle[:,:,:,:,p] = p
    #
    # counts = np.zeros((len(l),5))
    # for i in range(0,len(speciesToBin)):
    #     for mem in range(0,len(l)):
    #         data[i,l[mem],m[mem],n[mem],:] = particle[i,l[mem],m[mem],n[mem],:]*bins[i,l[mem],m[mem],n[mem],:]
    #     mean = np.sum(data,axis=4)
    #     for mem in range(0,len(l)):
    #         counts[mem,0] = speciesToBin[i]
    #         counts[mem,1] = l[mem]
    #         counts[mem,2] = m[mem]
    #         counts[mem,3] = n[mem]
    #         counts[mem,4] = float(mean[i,l[mem],m[mem],n[mem]])/float(totalTimePoints)

    # Save the counts into a .mat file in the output directory named according to the output indices.
    # cellio.cellsave(outputDir,counts,outputIndices);
    outputFile = 'traj_%s_%s_%s.p' % (trajectory[0], trajectory[-1], outputFileNum)
    with open(outputDir + outputFile, "wb") as fp:
        pickle.dump(results, fp)
    # scipy.io.savemat(outputDir+outputFile, dict(bins=bins))
    print("Binned species data into %s" % (outputDir))

    # # Save the pdfs into a .mat file in the output directory named according to the output indices.
    # pdfs=np.zeros((len(speciesToBin),),dtype=object)
    # for i in range(0,len(speciesToBin)):
    #     counts = sum(data)
    #     pdf=bins[i,:,:,:,:].astype(float)/float(totalTimePoints)
    #     # /float(np.sum(bins[i,0,0,0,:]))
    #     pdfs[i] = pdf
    # cellio.cellsave(outputDir,pdfs,outputIndices);
    # print("Binned species data into %s"%(outputDir))

    # If interactive, show the pdf.
    if interactive:
        subvolumeCounts = bins.sum(axis=1).sum(axis=1).sum(axis=1)
        for i in range(0, len(speciesToBin)):
            print("Subvolume distribution for species %d" % (speciesToBin[i]))
            plt.figure()
            plt.subplot(1, 1, 1)
            plt.bar(np.arange(0, subvolumeCounts.shape[1]),
                    np.log10(subvolumeCounts[i, :]))
            io.show()
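Reading the pickled trajectory results back later is straightforward; the file name below is purely illustrative, standing in for whatever main() wrote to the output directory.

import pickle

# Hypothetical read-back of a file written by main() above;
# 'out/traj_0_9_1.p' is an illustrative name only.
with open('out/traj_0_9_1.p', 'rb') as fp:
    results = pickle.load(fp)
totalTimePoints, bins = results[0]
print("Loaded bins from %d time points" % totalTimePoints)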
Code example #5
import numpy as np
import matplotlib.pyplot as plt
from skimage import io
from sklearn.cluster import KMeans

# X and kmeans below are assumed to carry over from the earlier step that
# fit KMeans to the examples in ex6data1.mat.

# Plot the points along with the centroid coordinates of each cluster to
# see how the centroid positions affect the clustering.
print("\nCentroid position ")
plt.scatter(X[:, 0], X[:, 1], c=kmeans.labels_, cmap='rainbow')
plt.scatter(kmeans.cluster_centers_[:, 0],
            kmeans.cluster_centers_[:, 1],
            color='black')
plt.title('kmeans_cluster_centers')
plt.show()
print "\nDone with Apply K-Means classifier on ex6data1.mat\n"

print('\nRunning K-Means clustering on pixels from an image.\n\n')
image = io.imread('bird_small.png')
io.imshow(image)
plt.title('original_bird_small_image')
io.show()

rows = image.shape[0]
cols = image.shape[1]
image = image.reshape(image.shape[0] * image.shape[1], 3)

#  K-means with 128 clusters (colors) and at most 10 iterations
kmeans = KMeans(n_clusters=128, n_init=10, max_iter=10)
kmeans.fit(image)

clusters = np.asarray(kmeans.cluster_centers_, dtype=np.uint8)
labels = np.asarray(kmeans.labels_, dtype=np.uint8)
labels = labels.reshape(rows, cols)

# Save the codebook (cluster centers) in NumPy's binary .npy format
np.save('codebook_tiger.npy', clusters)
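To view the quantized image implied by the saved codebook, each pixel's label indexes a row of clusters; NumPy fancy indexing rebuilds the full-color array in one step. A short sketch, assuming clusters and labels are still in scope as computed above:

# Reconstruct the compressed image: every pixel takes its cluster's color.
compressed = clusters[labels]  # (rows, cols, 3) uint8 array
io.imshow(compressed)
plt.title('compressed_bird_small_image')
io.show()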