def create_dic():
    """Rebuild ``all_names.txt`` from the yearly "Popular Baby Names" pages.

    For every year from 2000 through 2012 the matching ``.htm`` file under
    ``FILE_ADDRESSES`` is read with ``read_file``, the name cells and rank
    cells are extracted with regular expressions, and the resulting
    name -> rank dictionary is passed to ``fill_dic`` together with the open
    output file.

    The output file is emptied first so results from earlier runs do not
    accumulate.
    """
    # A name cell starts with a character that is neither a digit nor '*';
    # a rank cell contains digits only.  The capture group yields the cell
    # text without the surrounding <td> tags, so no second regex pass is
    # needed (the previous '[^<td>]' class also swallowed a leading 't'/'d').
    name_cell = re.compile(r'<td>([^\d*]\w*)</td>')
    rank_cell = re.compile(r'<td>(\d*)</td>')

    # Mode 'w' truncates on open.  Opening with 'a' and calling truncate()
    # cuts at the current position, which for an append handle in Python 3 is
    # end-of-file, so the old content was never removed.
    with open(FILE_ADDRESSES + 'all_names.txt', 'w') as all_names:
        for year in range(2000, 2013):
            file_name = (FILE_ADDRESSES + 'Popular Baby Names '
                         + str(year) + '.htm')
            text = read_file(file_name, 'rU')

            names = name_cell.findall(text)
            ranks = rank_cell.findall(text)

            # Each page lists a boy's and a girl's name per rank, so two
            # consecutive names share one rank entry.  A name that occurs
            # twice keeps the rank of its last occurrence.
            names_with_ranks = {
                name: ranks[index // 2] for index, name in enumerate(names)
            }
            fill_dic(names_with_ranks, all_names)
def part_1():
    """Plot the ``takens_1.txt`` data set and its time-delay embeddings.

    Four figures are shown one after another:

    1. second column against first column of the data,
    2. first column against the row index (used as "time"),
    3. first column against itself delayed by ``n_delay`` rows (2-D),
    4. first column against itself delayed by ``n_delay`` and
       ``2 * n_delay`` rows (3-D).
    """
    data = read_file('takens_1.txt')
    n_samples = data.shape[0]
    time = np.arange(0, n_samples, 1)
    x0 = data[:, 0]

    # Plotting the data
    fig, ax = plt.subplots(1, 1)
    ax.plot(x0, data[:, 1], c='dodgerblue', linewidth=0.5)
    ax.set_xlabel('$x$')
    ax.set_ylabel('$y$')
    plt.show()

    # Plotting the first coordinate against the line number in the dataset
    # (the "time")
    fig, ax = plt.subplots(1, 1)
    ax.plot(time, x0, c='dodgerblue', linewidth=0.5)
    ax.set_xlabel('time')
    ax.set_ylabel('$x$')
    plt.show()

    # Plotting the coordinate against its delayed version, 2 dimensional.
    # Both slices have length n_samples - n_delay so they pair x(t) with
    # x(t + n_delay).
    n_delay = 50
    x = x0[:n_samples - n_delay]
    delayed = x0[n_delay:]
    fig, ax = plt.subplots(1, 1)
    ax.plot(x, delayed, c='dodgerblue', linewidth=0.5)
    ax.set_xlabel('$x(t)$')
    # Raw strings: '\D' is not a valid escape sequence in a normal literal.
    ax.set_ylabel(r'$x(t+ \Delta n)$')
    ax.set_title(r'$\Delta n = {}$'.format(n_delay))
    plt.show()

    # Plotting the coordinate against its delayed version, 3 dimensional.
    # All three slices have length n_samples - 2 * n_delay.
    n_delay_2 = 2 * n_delay
    x_2 = x0[:n_samples - n_delay_2]
    delayed_1 = x0[n_delay:n_samples - n_delay]
    delayed_2 = x0[n_delay_2:]
    fig = plt.figure()
    # Figure.gca() no longer accepts keyword arguments in current Matplotlib;
    # add_subplot(..., projection='3d') works on old and new versions alike.
    ax0 = fig.add_subplot(1, 1, 1, projection='3d')
    ax0.plot(x_2, delayed_1, delayed_2, c='dodgerblue', linestyle=':',
             antialiased=True)
    ax0.set_xlabel('$x(t)$')
    ax0.set_ylabel(r'$x(t+ \Delta n)$')
    ax0.set_zlabel(r'$x(t+ 2 \Delta n)$')
    ax0.set_title(r'$\Delta n = {}$'.format(n_delay))
    plt.show()
def count_names(name):
    """Count the occurrences of *name* in ``all_names.txt`` and collect ranks.

    The file is searched, case-insensitively, for ``name`` followed by a tab
    and a run of digits.

    Args:
        name: The name to look up.  It is matched literally; regex
            metacharacters in it have no special meaning.

    Returns:
        A 2-tuple ``(count, ranks)`` where ``count`` is the number of matches
        as a string and ``ranks`` is the list of matched digit strings in
        file order.
    """
    text = read_file(FILE_ADDRESSES + 'all_names.txt', 'rU')

    # re.escape keeps characters such as '.' or '(' in the name from being
    # interpreted as regex syntax.  The look-behind stops the name from
    # matching as the tail of a longer one (e.g. "Ann" inside "Joann").
    pattern = r'(?<!\w)(' + re.escape(name) + r')\t(\d+)'
    matches = re.findall(pattern, text, re.IGNORECASE)

    ranks = [rank for _, rank in matches]
    return (str(len(matches)), ranks)
def main():
    """Run the three experiment parts in sequence, showing each figure.

    Part 1 runs ``diffusion_map_algorithm`` on points from
    ``generate_points``.  Part 2 does the same for a swiss roll, then
    performs PCA via SVD on the centred roll and plots its reconstruction
    from 3 and from 2 principal components before calling ``bonus``.
    Part 3 runs ``diffusion_map_algorithm`` on ``data_DMAP_PCA_vadere.txt``.
    """
    ## Part 1
    N = 1000
    L = 5
    X, tk = generate_points(N)
    plot_generated_points(X)
    tk = np.array(tk)
    S, lambda_l = diffusion_map_algorithm(X, L)
    plot_5_eigenfunctions(tk, S)

    ## Part 2
    N = 1000
    L = 10
    X, t = make_swiss_roll(N, noise=0.0, random_state=None)
    plot_swiss_roll(X, t)
    S, lambda_l = diffusion_map_algorithm(X, L)
    plot_eigenfunctions(S, t)

    # PCA: centre the data, then take the thin SVD.
    X = X - X.mean(axis=0, keepdims=True)
    U, sigma, V = np.linalg.svd(X, 0)
    trace = sigma.sum()  # equals the trace of diag(sigma)
    print("Sigma values of Swiss Roll: ", sigma)

    # Reconstruct with the leading k singular values and plot, first for
    # k = 3 and then k = 2.
    # NOTE(review): "energy" is the share of the (unsquared) singular values,
    # as in the rest of this file; confirm that is the intended definition.
    for k in (3, 2):
        kept = np.zeros_like(sigma)
        kept[:k] = sigma[:k]
        reconstructed = np.dot(U, np.dot(np.diag(kept), V))
        energy = sigma[:k].sum() / trace

        fig = plt.figure()
        # Figure.gca() no longer accepts keyword arguments in current
        # Matplotlib; add_subplot(..., projection='3d') is the portable form.
        ax = fig.add_subplot(1, 1, 1, projection='3d')
        ax.scatter(reconstructed[:, 0], reconstructed[:, 1],
                   reconstructed[:, 2], c=t, cmap="Spectral", s=2)
        ax.set_title(
            "Swiss Roll \n Reconstructed with {} principal components \n "
            "Energy: {:.2f}%".format(k, energy * 100))
        ax.set_xlabel('x')
        ax.set_ylabel('y')
        ax.set_zlabel('z')
        plt.tight_layout()
        plt.show()

    bonus()

    ## Part 3
    X = read_file('data_DMAP_PCA_vadere.txt')
    L = 10
    time = np.arange(X.shape[0])  # row index used as the time axis
    s, lambda_l = diffusion_map_algorithm(X, L)
    plot_eigenfunctions(s, time)

    # plotting lambda values
    plt.plot(lambda_l, 'o')
    plt.show()
    plot_5_eigenfunctions(time, s)
def part_1():
    """Run PCA (via SVD) on the 2-D ``pca_dataset.txt`` and visualise it.

    The data are centred, decomposed with a thin SVD, approximated by the
    first principal component, and the results are printed and plotted:
    a 1-D projection figure, then a figure with the raw data, the centred
    data, the rank-1 reconstruction, both centres and both principal
    directions.

    The data set must have exactly two columns (the column means are
    unpacked into two values).
    """
    # Read data
    X = read_file('pca_dataset.txt')

    # Find center of data set
    mean_d1, mean_d2 = X.mean(0)
    mean = X.mean(axis=0, keepdims=True)
    X_centered = X - mean
    mean_centered_d1, mean_centered_d2 = X_centered.mean(0)

    # Make PCA analysis via SVD
    U, sigma, VT = np.linalg.svd(X_centered, 0)
    V = VT.T
    S = np.diag(sigma)
    trace = S.trace()

    # Rank-1 reconstruction: keep only the first singular triplet.
    X_one_dimension = np.outer(U[:, 0] * sigma[0], VT[0])

    # Mean squared reconstruction error over all entries.  The previous code
    # summed the squared errors and then squared that sum again on output.
    MSE_one = np.mean((X_centered - X_one_dimension) ** 2)
    print("MSE One Dimension: {:.4f}".format(MSE_one))

    # Coordinate of every sample along the first principal direction.
    # Projecting means dotting the centred data with that direction; the
    # earlier U.dot(S).dot(VT[0]) dotted PC-basis scores with a vector that
    # is expressed in data coordinates.
    X_1D = X_centered.dot(VT[0])
    print("X 1D: " + str(X_1D))

    fig = plt.figure(figsize=(6, 3))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_title('1-Dimensional Projection')
    ax.scatter(X_1D, np.zeros(X_1D.shape), label='Projected Data', c="red",
               s=3)
    plt.xlabel("z")
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.show()

    print("X_head: " + str(X_one_dimension))
    print("U: " + str(U))
    print("V: " + str(V))
    print("Sigma: " + str(S))
    print("Trace: " + str(trace))
    # NOTE(review): "energy" is the share of the (unsquared) singular values;
    # confirm that is the intended definition.
    print("Energy of " + str(sigma[0]) + ": " + str(sigma[0] / trace))
    print("Energy of " + str(sigma[1]) + ": " + str(sigma[1] / trace))

    # Plot data set
    fig = plt.figure(figsize=(7, 7))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_title('PCA')
    ax.scatter(X[:, 0], X[:, 1], label='Data', c="mediumseagreen", s=3)
    ax.scatter(X_centered[:, 0], X_centered[:, 1], label='Centered Data',
               c="lightskyblue", s=3)
    ax.scatter(X_one_dimension[:, 0], X_one_dimension[:, 1],
               label='Projected Data', c="red", s=1)
    plt.xlabel("x")
    plt.ylabel("f(x)")
    plt.grid(True)
    plt.legend(loc='upper left')

    # Mark the center of data set
    ax.plot(mean_d1, mean_d2, 'o', markersize=5, color='olivedrab',
            label='Center of data')
    ax.plot(mean_centered_d1, mean_centered_d2, 'o', markersize=5,
            color='darkblue', label='Center of centralized data')

    # Draw the direction of two principal components (columns of V),
    # anchored at the centre of the raw data.
    plt.arrow(mean_d1, mean_d2, V[0, 0], V[1, 0], width=0.01,
              color='darkred', alpha=0.5)
    plt.arrow(mean_d1, mean_d2, V[0, 1], V[1, 1], width=0.01,
              color='darkblue', alpha=0.5)

    # Show eigenvalues of Sigma
    plt.text(V[0, 0] - 0.6, V[1, 0] + 0.1, "{:.4f}".format(sigma[0]),
             fontsize=12, color='darkred')
    plt.text(V[0, 1], V[1, 1] - 0.1, "{:.4f}".format(sigma[1]),
             fontsize=12, color='darkblue')
    plt.show()
def part_3():
    """Plot two pedestrian paths and their PCA reconstructions.

    Reads ``data_DMAP_PCA_vadere.txt``; columns 0/1 are plotted as the first
    pedestrian's path and columns 2/3 as the second's.  The data are then
    decomposed with PCA (thin SVD of the centred data) and the two paths are
    re-plotted from reconstructions using 2, 3, 4 and 5 principal
    components in a 2x2 grid, which is also saved to
    ``part_3_reconstructed_paths.png``.
    """
    # Read data
    X = read_file('data_DMAP_PCA_vadere.txt')

    # Visualize the path of the first two pedestrians in the
    # two-dimensional space.
    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot()
    ax.set_title('Pedestrian Paths')
    ax.plot(X[:, 0], X[:, 1], color='dodgerblue', linewidth=0.5,
            label='First Pedestrian')
    ax.scatter(X[0, 0], X[0, 1], c='lightskyblue', s=5,
               label='Starting Point of the First Pedestrian')
    ax.scatter(X[-1, 0], X[-1, 1], c='darkblue', s=5,
               label='Ending Point of the First Pedestrian')
    ax.plot(X[:, 2], X[:, 3], color='firebrick', linewidth=0.5,
            label='Second Pedestrian')
    ax.scatter(X[0, 2], X[0, 3], c='lightcoral', s=5,
               label='Starting Point of the Second Pedestrian')
    ax.scatter(X[-1, 2], X[-1, 3], c='darkred', s=5,
               label='Ending Point of the Second Pedestrian')
    plt.xlabel("x")
    plt.ylabel("y")
    plt.legend(loc='upper right')
    plt.show()

    # Make PCA analysis via SVD.  PCA operates on centred data (as the other
    # PCA code in this file does); the mean is added back after
    # reconstruction so the paths are drawn in the original coordinates.
    mean = X.mean(axis=0, keepdims=True)
    U, sigma, V = np.linalg.svd(X - mean, 0)
    trace = sigma.sum()  # equals the trace of diag(sigma)

    # One panel per number of retained components; each title template is
    # kept exactly as it was.
    panels = (
        (2, "Reconstructed with 2 principal components \n Energy: {:.2f}%"),
        (3, "Reconstructed with 3 principal components\n Energy: {:.2f}%"),
        (4, "Reconstructed with 4 principal components \n Energy: {:.2f}%"),
        (5, "Reconstructed with 5 principal components \n Energy: {:.2f}%"),
    )

    fig, axes = plt.subplots(2, 2, figsize=(10, 10))
    for panel_ax, (k, title) in zip(axes.flat, panels):
        # Zero every singular value after the first k, then rebuild the data.
        kept = np.zeros_like(sigma)
        kept[:k] = sigma[:k]
        reconstructed = np.dot(U, np.dot(np.diag(kept), V)) + mean
        # NOTE(review): "energy" is the share of the (unsquared) singular
        # values; confirm that is the intended definition.
        energy = sigma[:k].sum() / trace

        panel_ax.plot(reconstructed[:, 0], reconstructed[:, 1],
                      color='dodgerblue', linewidth=0.5,
                      label='First Pedestrian')
        panel_ax.plot(reconstructed[:, 2], reconstructed[:, 3],
                      color='firebrick', linewidth=0.5,
                      label='Second Pedestrian')
        panel_ax.set_title(title.format(energy * 100))
        panel_ax.legend(loc="upper right")

    # Shared axis labels for the whole grid.
    fig.text(0.5, 0.01, 'x', ha='center')
    fig.text(0.01, 0.5, 'y', va='center', rotation='vertical')
    plt.tight_layout()
    plt.savefig('part_3_reconstructed_paths.png')
    plt.show()