def create_dic():
    """Rebuild ``all_names.txt`` from the yearly "Popular Baby Names" pages.

    For every year from 2000 through 2012 the page
    ``FILE_ADDRESSES + 'Popular Baby Names <year>.htm'`` is read through
    ``read_file``. The ``<td>`` cells holding names and the ``<td>`` cells
    holding ranks are extracted, paired into a ``{name: rank}`` dictionary
    (rank kept as the string found in the page) and passed to ``fill_dic``
    together with the open output file.

    The output file is opened in write mode so that the contents left by a
    previous run are really discarded (calling ``truncate()`` on a file
    opened in append mode cuts at the current position and is not a reliable
    way to empty it). The ``with`` block closes the file even when reading
    or parsing a page raises.

    Raises:
        IndexError: if a page yields fewer rank cells than half the number
            of name cells.
    """
    # A cell whose first character is neither a digit nor '*', followed by
    # word characters: a name. The capture group yields the cell content
    # directly, so no second regex pass is needed to strip the tag.
    name_cell = re.compile(r'<td>([^\d*]\w*)</td>')
    # A cell that contains only digits: a rank.
    rank_cell = re.compile(r'<td>(\d*)</td>')

    with open(FILE_ADDRESSES + 'all_names.txt', 'w') as all_names:
        for year in range(2000, 2013):
            file_name = (FILE_ADDRESSES + 'Popular Baby Names ' + str(year) +
                         '.htm')
            text = read_file(file_name, 'rU')

            names = name_cell.findall(text)
            ranks = rank_cell.findall(text)

            # Each page lists both a boy's and a girl's name per rank, so two
            # consecutive names share one rank entry (index // 2). A name
            # that occurs more than once keeps the rank seen last.
            names_with_ranks = {}
            for index, name in enumerate(names):
                names_with_ranks[name] = ranks[index // 2]

            fill_dic(names_with_ranks, all_names)
Beispiel #2
0
def part_1():
    """Plot the ``takens_1.txt`` data set and delay embeddings of its first column.

    Four figures are shown one after another:

    1. column 1 against column 0 of the data,
    2. column 0 against the row index (used as "time"),
    3. column 0 against itself delayed by ``n_delay`` rows (2-D embedding),
    4. column 0 against itself delayed by ``n_delay`` and ``2 * n_delay``
       rows (3-D embedding).

    Assumes ``read_file`` returns a 2-D NumPy array with at least two
    columns -- TODO confirm against its definition.
    """
    data = read_file('takens_1.txt')
    time = np.arange(0, data.shape[0], 1)
    x0 = data[:, 0]

    # Plotting the data
    fig, ax = plt.subplots(1, 1)
    ax.plot(x0, data[:, 1], c='dodgerblue', linewidth=0.5)
    ax.set_xlabel('$x$')
    ax.set_ylabel('$y$')
    plt.show()

    # Plotting the first coordinate against the line number in the dataset
    # (the "time")
    fig, ax = plt.subplots(1, 1)
    ax.plot(time, x0, c='dodgerblue', linewidth=0.5)
    ax.set_xlabel('time')
    ax.set_ylabel('$x$')
    plt.show()

    # Plotting the coordinate against its delayed version, 2 dimensional.
    # Both slices are cut to the same length: x drops the last n_delay
    # samples, delayed drops the first n_delay samples.
    n_delay = 50
    x = x0[:x0.shape[0] - n_delay]
    delayed = x0[n_delay:]
    fig, ax = plt.subplots(1, 1)
    ax.plot(x, delayed, c='dodgerblue', linewidth=0.5)
    ax.set_xlabel('$x(t)$')
    # Raw strings keep the LaTeX backslash without relying on Python passing
    # the unknown escape "\D" through unchanged.
    ax.set_ylabel(r'$x(t+ \Delta n)$')
    ax.set_title(r'$\Delta n = {}$'.format(n_delay))
    plt.show()

    # Plotting the coordinate against its delayed versions, 3 dimensional.
    n_delay_2 = 2 * n_delay
    x_2 = x0[:x0.shape[0] - n_delay_2]
    delayed_1 = x0[n_delay:x0.shape[0] - n_delay]
    delayed_2 = x0[n_delay_2:]
    fig = plt.figure()
    # Figure.gca() no longer accepts keyword arguments in current Matplotlib;
    # add_subplot creates the same single 3-D axes.
    ax0 = fig.add_subplot(111, projection='3d')
    ax0.plot(x_2,
             delayed_1,
             delayed_2,
             c='dodgerblue',
             linestyle=':',
             antialiased=True)
    ax0.set_xlabel('$x(t)$')
    ax0.set_ylabel(r'$x(t+ \Delta n)$')
    ax0.set_zlabel(r'$x(t+ 2 \Delta n)$')
    ax0.set_title(r'$\Delta n = {}$'.format(n_delay))
    plt.show()
def count_names(name):
    """Count the entries for ``name`` in ``all_names.txt`` and collect their ranks.

    The file is read through ``read_file`` and searched, ignoring case, for
    occurrences of ``name`` followed by a tab and a run of digits.

    ``name`` is matched literally (regex metacharacters are escaped) and must
    not be preceded by a word character, so looking up a short name does not
    also count longer names that merely end with it.

    Args:
        name: the name to look up.

    Returns:
        A tuple ``(count, ranks)`` where ``count`` is the number of matches
        as a string and ``ranks`` is the list of matched rank strings in
        file order.
    """
    text = read_file(FILE_ADDRESSES + 'all_names.txt', 'rU')

    pattern = r'(?<!\w)(' + re.escape(name) + r')\t(\d+)'
    matches = re.findall(pattern, text, re.IGNORECASE)

    # Each match is a (matched_name, rank) tuple; only the rank is reported.
    ranks = [rank for _, rank in matches]

    return (str(len(matches)), ranks)
Beispiel #4
0
def main():
    """Run the three experiments of this script in sequence.

    Part 1 generates ``N`` points with ``generate_points``, runs
    ``diffusion_map_algorithm`` on them and plots the result with
    ``plot_5_eigenfunctions``.

    Part 2 does the same for a swiss roll from ``make_swiss_roll``, then
    performs PCA on the centred swiss roll via SVD and shows scatter plots
    of the reconstructions from the first 3 and the first 2 components,
    followed by a call to ``bonus``.

    Part 3 loads ``data_DMAP_PCA_vadere.txt``, runs
    ``diffusion_map_algorithm`` on it and plots the returned values.

    All figures are shown interactively with ``plt.show()``.
    """
    ## Part 1
    N = 1000
    L = 5
    X, tk = generate_points(N)
    plot_generated_points(X)
    tk = np.array(tk)
    S, lambda_l = diffusion_map_algorithm(X, L)
    plot_5_eigenfunctions(tk, S)

    ## Part 2
    N = 1000
    L = 10
    X, t = make_swiss_roll(N, noise=0.0, random_state=None)
    plot_swiss_roll(X, t)
    S, lambda_l = diffusion_map_algorithm(X, L)
    plot_eigenfunctions(S, t)

    # PCA on the centred data. np.linalg.svd returns the right singular
    # vectors already transposed, hence the name VT.
    X = X - X.mean(axis=0, keepdims=True)
    U, sigma, VT = np.linalg.svd(X, 0)
    S = np.diag(sigma)
    trace = S.trace()

    print("Sigma values of Swiss Roll: ", sigma)

    # Reconstruct and plot with 3, then with 2 principal components.
    for n_components in (3, 2):
        # Keep only the leading singular values; the rest stay zero.
        # NOTE(review): "energy" is the share of the singular values
        # themselves (sigma_i / sum(sigma)); explained variance is usually
        # based on sigma_i ** 2 -- confirm which definition is intended.
        energy = 0
        S_k = np.zeros(S.shape)
        for i in range(n_components):
            S_k[i][i] = sigma[i]
            energy += sigma[i] / trace
        reconstructed = np.dot(U, np.dot(S_k, VT))

        fig = plt.figure()
        # Figure.gca() no longer accepts keyword arguments in current
        # Matplotlib; add_subplot creates the same single 3-D axes.
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(reconstructed[:, 0],
                   reconstructed[:, 1],
                   reconstructed[:, 2],
                   c=t,
                   cmap="Spectral",
                   s=2)
        ax.set_title(
            "Swiss Roll \n Reconstructed with {} principal components \n Energy: {:.2f}%"
            .format(n_components, energy * 100))
        ax.set_xlabel('x')
        ax.set_ylabel('y')
        ax.set_zlabel('z')
        plt.tight_layout()
        plt.show()

    bonus()

    ## Part 3
    X = read_file('data_DMAP_PCA_vadere.txt')
    L = 10
    # The row index is used as the time axis.
    time = np.arange(X.shape[0])
    s, lambda_l = diffusion_map_algorithm(X, L)
    plot_eigenfunctions(s, time)

    # plotting lambda values
    plt.plot(lambda_l, 'o')
    plt.show()
    plot_5_eigenfunctions(time, s)
Beispiel #5
0
def part_1():
    """PCA of the two-column data set in ``pca_dataset.txt`` via SVD.

    The data is centred, decomposed with ``np.linalg.svd`` and

    * approximated with the first principal component only (the mean
      squared error of that approximation is printed),
    * projected onto the first principal axis and shown as a 1-D scatter,
    * plotted together with the centred data, the rank-1 approximation, both
      centres and the two principal directions.

    The decomposition and the share of each singular value in their sum are
    printed as well.

    Assumes ``read_file`` returns a 2-D NumPy array with exactly two
    columns (the column means are unpacked into two values) -- TODO confirm.
    """
    # Read data
    X = read_file('pca_dataset.txt')

    # Find center of data set
    mean = X.mean(axis=0, keepdims=True)
    mean_d1, mean_d2 = mean[0]
    X_centered = X - mean
    mean_centered_d1, mean_centered_d2 = X_centered.mean(0)

    # Make PCA analysis via SVD. svd returns V already transposed.
    U, sigma, VT = np.linalg.svd(X_centered, 0)
    V = VT.T
    S = np.diag(sigma)
    trace = S.trace()

    # Rank-1 approximation: keep only the largest singular value.
    S_one_dimension = np.zeros(S.shape)
    S_one_dimension[0][0] = S[0][0]
    X_one_dimension = U.dot(S_one_dimension).dot(VT)
    # Mean (not sum) of the squared residuals, printed without squaring it a
    # second time.
    MSE_one = np.mean((X_centered - X_one_dimension)**2)
    print("MSE One Dimension: {:.4f}".format(MSE_one))

    # Coordinates of the centred data along the first principal axis.
    # VT[0] is that axis expressed in data coordinates, so it has to be
    # applied to the data itself rather than to U.dot(S).
    X_1D = X_centered.dot(VT[0])
    print("X 1D: " + str(X_1D))
    fig = plt.figure(figsize=(6, 3))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_title('1-Dimensional Projection')
    ax.scatter(X_1D,
               np.zeros(X_1D.shape),
               label='Projected Data',
               c="red",
               s=3)
    plt.xlabel("z")
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.show()

    print("X_head: " + str(X_one_dimension))
    print("U: " + str(U))
    print("V: " + str(V))
    print("Sigma: " + str(S))
    print("Trace: " + str(trace))
    # NOTE(review): "energy" here is sigma_i / sum(sigma); explained variance
    # is usually based on sigma_i ** 2 -- confirm the intended definition.
    print("Energy of " + str(sigma[0]) + ": " + str(sigma[0] / trace))
    print("Energy of " + str(sigma[1]) + ": " + str(sigma[1] / trace))

    # Plot data set
    fig = plt.figure(figsize=(7, 7))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_title('PCA')
    ax.scatter(X[:, 0], X[:, 1], label='Data', c="mediumseagreen", s=3)
    ax.scatter(X_centered[:, 0],
               X_centered[:, 1],
               label='Centered Data',
               c="lightskyblue",
               s=3)
    ax.scatter(X_one_dimension[:, 0],
               X_one_dimension[:, 1],
               label='Projected Data',
               c="red",
               s=1)
    plt.xlabel("x")
    plt.ylabel("f(x)")
    plt.grid(True)

    # Mark the center of data set
    ax.plot(mean_d1,
            mean_d2,
            'o',
            markersize=5,
            color='olivedrab',
            label='Center of data')
    ax.plot(mean_centered_d1,
            mean_centered_d2,
            'o',
            markersize=5,
            color='darkblue',
            label='Center of centralized data')

    # The legend is built after every labelled artist exists, otherwise the
    # two centre markers would be missing from it.
    plt.legend(loc='upper left')

    # Draw the direction of two principal components (columns of V),
    # starting at the centre of the original data.
    plt.arrow(mean_d1,
              mean_d2,
              V[0, 0],
              V[1, 0],
              width=0.01,
              color='darkred',
              alpha=0.5)
    plt.arrow(mean_d1,
              mean_d2,
              V[0, 1],
              V[1, 1],
              width=0.01,
              color='darkblue',
              alpha=0.5)

    # Show the singular values next to the principal directions
    plt.text(V[0, 0] - 0.6,
             V[1, 0] + 0.1,
             "{:.4f}".format(sigma[0]),
             fontsize=12,
             color='darkred')
    plt.text(V[0, 1],
             V[1, 1] - 0.1,
             "{:.4f}".format(sigma[1]),
             fontsize=12,
             color='darkblue')
    plt.show()
Beispiel #6
0
def part_3():
    """Plot two pedestrian paths and their PCA reconstructions.

    Reads ``data_DMAP_PCA_vadere.txt``, plots columns 0/1 and 2/3 as the
    paths of the first and second pedestrian, then performs PCA via SVD and
    shows, in a 2x2 grid, the same two paths reconstructed from the first
    2, 3, 4 and 5 principal components. The grid is also saved as
    ``part_3_reconstructed_paths.png``.

    The data is centred before the SVD, as PCA requires and as the other PCA
    routines in this file do; the column means are added back to each
    reconstruction so the paths are drawn in the original coordinates.

    Assumes ``read_file`` returns a 2-D NumPy array with at least five
    columns -- TODO confirm.
    """
    # Read data
    X = read_file('data_DMAP_PCA_vadere.txt')

    # Visualize the path of the first two pedestrians in the two-dimensional
    # space.
    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot()
    ax.set_title('Pedestrian Paths')
    ax.plot(X[:, 0],
            X[:, 1],
            color='dodgerblue',
            linewidth=0.5,
            label='First Pedestrian')
    ax.scatter(X[0, 0],
               X[0, 1],
               c='lightskyblue',
               s=5,
               label='Starting Point of the First Pedestrian')
    ax.scatter(X[-1, 0],
               X[-1, 1],
               c='darkblue',
               s=5,
               label='Ending Point of the First Pedestrian')
    ax.plot(X[:, 2],
            X[:, 3],
            color='firebrick',
            linewidth=0.5,
            label='Second Pedestrian')
    ax.scatter(X[0, 2],
               X[0, 3],
               c='lightcoral',
               s=5,
               label='Starting Point of the Second Pedestrian')
    ax.scatter(X[-1, 2],
               X[-1, 3],
               c='darkred',
               s=5,
               label='Ending Point of the Second Pedestrian')
    plt.xlabel("x")
    plt.ylabel("y")
    plt.legend(loc='upper right')
    plt.show()

    # Make PCA analysis via SVD on the centred data. svd returns the right
    # singular vectors already transposed.
    mean = X.mean(axis=0, keepdims=True)
    X_centered = X - mean
    U, sigma, VT = np.linalg.svd(X_centered, 0)
    S = np.diag(sigma)
    trace = S.trace()

    # One subplot per number of retained components. The title templates are
    # kept exactly as they were, including their individual spacing.
    panels = (
        (2, "Reconstructed with 2 principal components \n Energy: {:.2f}%"),
        (3, "Reconstructed with 3 principal components\n Energy: {:.2f}%"),
        (4, "Reconstructed with 4 principal components \n Energy: {:.2f}%"),
        (5, "Reconstructed with 5 principal components \n Energy: {:.2f}%"),
    )

    fig, axes = plt.subplots(2, 2, figsize=(10, 10))
    for panel_ax, (n_components, title) in zip(axes.flat, panels):
        # Keep only the leading singular values; the rest stay zero.
        # NOTE(review): "energy" is sigma_i / sum(sigma); explained variance
        # is usually based on sigma_i ** 2 -- confirm the intended
        # definition.
        energy = 0
        S_k = np.zeros(S.shape)
        for i in range(n_components):
            S_k[i][i] = sigma[i]
            energy += sigma[i] / trace
        reconstructed = np.dot(U, np.dot(S_k, VT)) + mean

        panel_ax.plot(reconstructed[:, 0],
                      reconstructed[:, 1],
                      color='dodgerblue',
                      linewidth=0.5,
                      label='First Pedestrian')
        panel_ax.plot(reconstructed[:, 2],
                      reconstructed[:, 3],
                      color='firebrick',
                      linewidth=0.5,
                      label='Second Pedestrian')
        panel_ax.set_title(title.format(energy * 100))
        panel_ax.legend(loc="upper right")

    # Shared axis captions for the whole grid.
    fig.text(0.5, 0.01, 'x', ha='center')
    fig.text(0.01, 0.5, 'y', va='center', rotation='vertical')
    plt.tight_layout()
    plt.savefig('part_3_reconstructed_paths.png')
    plt.show()