Beispiel #1
0
def findBadWords():
    '''
    Plots information about the test pairs that were incorrectly guessed.

    Reads the weight matrix from "w_lse_dim299.mtx", projects the fMRI data
    onto 299 principal components with pcaData and calls
    test_suite.word_guesser. For every entry flagged False in its result the
    per-semantic-feature absolute error (square root of the squared
    difference) between the prediction and the correct / incorrect word's
    features is stored and the mean of each is printed next to the two words.

    Three figures are shown afterwards:
      1. per-feature error of the first incorrect guess, labelled with the
         two words of that pair,
      2. a histogram counting, per semantic feature, the incorrect guesses
         whose error against the correct word exceeds the error against the
         incorrect word,
      3. the per-feature sum of (error correct - error incorrect).

    When nothing was guessed incorrectly there is nothing to plot and the
    all-zero arrays are returned right away.

    Relies on the module-level names fmri_train, fmri_test, wordid_test,
    wordfeature_std and dictionary.

    :return: [rmsecorrect, rmseincorrect], both shaped like yright. Row k
        belongs to the k-th incorrect guess; rows past the last incorrect
        guess stay zero.
    '''
    w = io.mmread("w_lse_dim299.mtx")
    [_, xtestPCA] = pcaData(299, fmri_train, fmri_test)
    [yright, ywrong] = test_suite.prepareData(wordid_test, wordfeature_std)
    [guessed_words,
     _] = test_suite.word_guesser(xtestPCA, w, 0, yright, ywrong)
    # Element-wise comparison is intended here (guessed_words is treated as
    # an array), so this must not be rewritten as "not guessed_words".
    indexes_incorrect_guesses = np.where(
        guessed_words == False)[0].tolist()  # noqa: E712
    n, f = yright.shape
    rmsecorrect = np.zeros((n, f))
    rmseincorrect = np.zeros((n, f))
    if not indexes_incorrect_guesses:
        # No misclassified pair: plotting all-zero rows would be misleading.
        return [rmsecorrect, rmseincorrect]

    for row, i in enumerate(indexes_incorrect_guesses):
        ypredict = xtestPCA[i].dot(w.T)
        # Word ids are 1-based, dictionary is indexed from 0.
        correct_word = dictionary[int(wordid_test[i][0] - 1)]
        incorrect_word = dictionary[int(wordid_test[i][1] - 1)]
        # sqrt(square(x)) is simply |x|: a per-feature absolute error.
        rmsecorrect[row] = np.abs(yright[i] - ypredict)
        rmseincorrect[row] = np.abs(ywrong[i] - ypredict)
        print(correct_word,
              rmsecorrect[row].mean(), incorrect_word,
              rmseincorrect[row].mean())

    # Figure 1: label the curves with the words of the pair actually plotted
    # (row 0 is the first incorrect guess) instead of fixed example words.
    first = indexes_incorrect_guesses[0]
    correct_line, = plt.plot(
        rmsecorrect[0], label=dictionary[int(wordid_test[first][0] - 1)])
    incorrect_line, = plt.plot(
        rmseincorrect[0], label=dictionary[int(wordid_test[first][1] - 1)])
    plt.legend(handles=[correct_line, incorrect_line])
    plt.xlabel('semantic feature')
    plt.ylabel('RMSE')
    plt.show()

    # Positive entries are the bad ones: the prediction is further from the
    # correct word than from the incorrect word on that feature.
    difference_rmse = rmsecorrect - rmseincorrect
    indexes_bad_rmse = np.where(difference_rmse > 0)
    # f + 1 bin edges give exactly one bin per feature index 0..f-1; with
    # only f edges the last two features would share the final bin.
    plt.hist(indexes_bad_rmse[1], range(f + 1))
    plt.xlabel('semantic feature')
    plt.ylabel('Counts of Words')
    plt.show()

    differences_summed = np.sum(difference_rmse, axis=0)
    plt.plot(differences_summed)
    plt.show()
    return [rmsecorrect, rmseincorrect]
Beispiel #2
0
def findBadWords():
    '''
    Plots information about the test pairs that were incorrectly guessed.

    Reads the weight matrix from "w_lse_dim299.mtx", projects the fMRI data
    onto 299 principal components with pcaData and calls
    test_suite.word_guesser. For every entry flagged False in its result the
    per-semantic-feature absolute error (square root of the squared
    difference) between the prediction and the correct / incorrect word's
    features is stored and the mean of each is printed next to the two words.

    Three figures are shown afterwards:
      1. per-feature error of the first incorrect guess, labelled with the
         two words of that pair,
      2. a histogram counting, per semantic feature, the incorrect guesses
         whose error against the correct word exceeds the error against the
         incorrect word,
      3. the per-feature sum of (error correct - error incorrect).

    When nothing was guessed incorrectly there is nothing to plot and the
    all-zero arrays are returned right away.

    Relies on the module-level names fmri_train, fmri_test, wordid_test,
    wordfeature_std and dictionary.

    :return: [rmsecorrect, rmseincorrect], both shaped like yright. Row k
        belongs to the k-th incorrect guess; rows past the last incorrect
        guess stay zero.
    '''
    w = io.mmread("w_lse_dim299.mtx")
    [_, xtestPCA] = pcaData(299, fmri_train, fmri_test)
    [yright, ywrong] = test_suite.prepareData(wordid_test, wordfeature_std)
    [guessed_words, _] = test_suite.word_guesser(xtestPCA, w, 0, yright, ywrong)
    # Element-wise comparison is intended here (guessed_words is treated as
    # an array), so this must not be rewritten as "not guessed_words".
    indexes_incorrect_guesses = np.where(
        guessed_words == False)[0].tolist()  # noqa: E712
    n, f = yright.shape
    rmsecorrect = np.zeros((n, f))
    rmseincorrect = np.zeros((n, f))
    if not indexes_incorrect_guesses:
        # No misclassified pair: plotting all-zero rows would be misleading.
        return [rmsecorrect, rmseincorrect]

    for row, i in enumerate(indexes_incorrect_guesses):
        ypredict = xtestPCA[i].dot(w.T)
        # Word ids are 1-based, dictionary is indexed from 0.
        correct_word = dictionary[int(wordid_test[i][0] - 1)]
        incorrect_word = dictionary[int(wordid_test[i][1] - 1)]
        # sqrt(square(x)) is simply |x|: a per-feature absolute error.
        rmsecorrect[row] = np.abs(yright[i] - ypredict)
        rmseincorrect[row] = np.abs(ywrong[i] - ypredict)
        print(correct_word, rmsecorrect[row].mean(),
              incorrect_word, rmseincorrect[row].mean())

    # Figure 1: label the curves with the words of the pair actually plotted
    # (row 0 is the first incorrect guess) instead of fixed example words.
    first = indexes_incorrect_guesses[0]
    correct_line, = plt.plot(
        rmsecorrect[0], label=dictionary[int(wordid_test[first][0] - 1)])
    incorrect_line, = plt.plot(
        rmseincorrect[0], label=dictionary[int(wordid_test[first][1] - 1)])
    plt.legend(handles=[correct_line, incorrect_line])
    plt.xlabel('semantic feature')
    plt.ylabel('RMSE')
    plt.show()

    # Positive entries are the bad ones: the prediction is further from the
    # correct word than from the incorrect word on that feature.
    difference_rmse = rmsecorrect - rmseincorrect
    indexes_bad_rmse = np.where(difference_rmse > 0)
    # f + 1 bin edges give exactly one bin per feature index 0..f-1; with
    # only f edges the last two features would share the final bin.
    plt.hist(indexes_bad_rmse[1], range(f + 1))
    plt.xlabel('semantic feature')
    plt.ylabel('Counts of Words')
    plt.show()

    differences_summed = np.sum(difference_rmse, axis=0)
    plt.plot(differences_summed)
    plt.show()
    return [rmsecorrect, rmseincorrect]
Beispiel #3
0
def nonlinearFeatures(dimensions):
    '''
    Fits least squares on PCA components plus their pairwise products.

    The fMRI training and test data are projected with pcaData using the
    requested number of components. Each design matrix then holds the
    components themselves followed by one column per product of two distinct
    components (i < j). One weight vector is fit per column of ytrain with
    least_squares, the weight matrix is written to "w_lse_nonlinear.mtx" and
    test_suite.main is run on the expanded test matrix.

    Relies on the module-level names fmri_train, fmri_test, ytrain,
    wordid_test and wordfeature_std.

    :param dimensions: amount of principal components to keep
    :return: [accuracy, rmsetrain, rmsetest, rmsetestwrong] where accuracy is
        the value reported by test_suite.main and the other three are the
        rmse_per_semantic_feature results on the training data, on the test
        data against ytest and on the test data against ywrong.
    '''
    # Honour the requested component count and project the held-out scans as
    # the test set; the results are scored against wordid_test further down.
    [xtrainPCA, xtestPCA] = pcaData(dimensions, fmri_train, fmri_test)
    n = xtrainPCA.shape[0]
    dimPCA = xtrainPCA.shape[1]
    # Linear terms plus one product per unordered pair. Integer division
    # keeps the shape an int (np.zeros rejects floats) and the exact pair
    # count avoids trailing all-zero columns.
    num_columns = dimPCA + dimPCA * (dimPCA - 1) // 2
    xNonlinear = np.zeros((n, num_columns))
    xtestNonlinear = np.zeros((xtestPCA.shape[0], num_columns))
    xNonlinear[:, 0:dimPCA] = xtrainPCA
    xtestNonlinear[:, 0:dimPCA] = xtestPCA
    counter = dimPCA
    for i in range(dimPCA):
        for j in range(i + 1, dimPCA):
            xNonlinear[:, counter] = xtrainPCA[:, i] * xtrainPCA[:, j]
            xtestNonlinear[:, counter] = xtestPCA[:, i] * xtestPCA[:, j]
            counter += 1

    num_features = ytrain.shape[1]
    d = xNonlinear.shape[1]
    ntotdata = xNonlinear.shape[0]
    bestw = np.zeros([num_features, d])
    # No intercept is fit here; the bias terms handed on stay at zero.
    bestw0 = np.zeros(num_features)
    for i in range(num_features):
        y = ytrain[:, i].reshape(ntotdata, 1)
        w = least_squares(xNonlinear, y)
        bestw[i, :] = w.reshape(d)

    wfile = "w_lse_nonlinear.mtx"
    io.mmwrite(wfile, bestw)
    [accuracy] = test_suite.main(bestw, bestw0, wordid_test, wordfeature_std,
                                 xtestNonlinear)
    print(accuracy)
    [ytest, ywrong] = test_suite.prepareData(wordid_test, wordfeature_std)
    rmsetest = rmse_per_semantic_feature(xtestNonlinear, ytest, bestw)
    rmsetrain = rmse_per_semantic_feature(xNonlinear, ytrain, bestw)
    rmsetestwrong = rmse_per_semantic_feature(xtestNonlinear, ywrong, bestw)
    return [accuracy, rmsetrain, rmsetest, rmsetestwrong]
Beispiel #4
0
def findw_PCA_LSE(dimensions,
                  train_data,
                  test_data,
                  wordid_test,
                  ytrain,
                  wfile="w_lse_dim299.mtx"):
    '''
    Fits one least-squares weight vector per semantic feature on PCA data.

    The training and test data are projected with pcaData, least_squares is
    solved once per column of ytrain, and the resulting weight matrix is
    written to wfile. The weights are then handed to test_suite.main together
    with the projected test data, and the reported accuracy is printed.

    Relies on the module-level name wordfeature_std.

    :param dimensions: number of pca components
    :param train_data: training data
    :param test_data: test data
    :param wordid_test: ids of the words in the test data set
    :param ytrain: semantic features values for training data
    :param wfile: name of file where w's will be written
    :return: [accuracy, rmsetrain, rmsetest, rmsetestwrong] where accuracy is
        the value reported by test_suite.main and the other three are the
        rmse_per_semantic_feature results on the training data, on the test
        data against ytest and on the test data against ywrong.
    '''
    [xtrainPCA, xtestPCA] = pcaData(dimensions, train_data, test_data)
    num_features = ytrain.shape[1]
    d = xtrainPCA.shape[1]
    ntotdata = xtrainPCA.shape[0]
    bestw = np.zeros([num_features, d])
    # No intercept is fit here; the bias terms handed on stay at zero.
    bestw0 = np.zeros(num_features)
    for i in range(num_features):
        print('looking at feature ', i)
        y = ytrain[:, i].reshape(ntotdata, 1)
        w = least_squares(xtrainPCA, y)
        # Size the row by the actual PCA width so it always matches bestw,
        # even if pcaData returns a different number of columns than asked.
        bestw[i, :] = w.reshape(d)
    io.mmwrite(wfile, bestw)
    [accuracy] = test_suite.main(bestw, bestw0, wordid_test, wordfeature_std,
                                 xtestPCA)
    print(accuracy)
    [ytest, ywrong] = test_suite.prepareData(wordid_test, wordfeature_std)
    rmsetest = rmse_per_semantic_feature(xtestPCA, ytest, bestw)
    rmsetrain = rmse_per_semantic_feature(xtrainPCA, ytrain, bestw)
    rmsetestwrong = rmse_per_semantic_feature(xtestPCA, ywrong, bestw)
    return [accuracy, rmsetrain, rmsetest, rmsetestwrong]
Beispiel #5
0
def nonlinearFeatures(dimensions):
    '''
    Fits least squares on PCA components plus their pairwise products.

    The fMRI training and test data are projected with pcaData using the
    requested number of components. Each design matrix then holds the
    components themselves followed by one column per product of two distinct
    components (i < j). One weight vector is fit per column of ytrain with
    least_squares, the weight matrix is written to "w_lse_nonlinear.mtx" and
    test_suite.main is run on the expanded test matrix.

    Relies on the module-level names fmri_train, fmri_test, ytrain,
    wordid_test and wordfeature_std.

    :param dimensions: amount of principal components to keep
    :return: [accuracy, rmsetrain, rmsetest, rmsetestwrong] where accuracy is
        the value reported by test_suite.main and the other three are the
        rmse_per_semantic_feature results on the training data, on the test
        data against ytest and on the test data against ywrong.
    '''
    # Honour the requested component count and project the held-out scans as
    # the test set; the results are scored against wordid_test further down.
    [xtrainPCA, xtestPCA] = pcaData(dimensions, fmri_train, fmri_test)
    n = xtrainPCA.shape[0]
    dimPCA = xtrainPCA.shape[1]
    # Linear terms plus one product per unordered pair. Integer division
    # keeps the shape an int (np.zeros rejects floats) and the exact pair
    # count avoids trailing all-zero columns.
    num_columns = dimPCA + dimPCA * (dimPCA - 1) // 2
    xNonlinear = np.zeros((n, num_columns))
    xtestNonlinear = np.zeros((xtestPCA.shape[0], num_columns))
    xNonlinear[:, 0:dimPCA] = xtrainPCA
    xtestNonlinear[:, 0:dimPCA] = xtestPCA
    counter = dimPCA
    for i in range(dimPCA):
        for j in range(i + 1, dimPCA):
            xNonlinear[:, counter] = xtrainPCA[:, i] * xtrainPCA[:, j]
            xtestNonlinear[:, counter] = xtestPCA[:, i] * xtestPCA[:, j]
            counter += 1

    num_features = ytrain.shape[1]
    d = xNonlinear.shape[1]
    ntotdata = xNonlinear.shape[0]
    bestw = np.zeros([num_features, d])
    # No intercept is fit here; the bias terms handed on stay at zero.
    bestw0 = np.zeros(num_features)
    for i in range(num_features):
        y = ytrain[:, i].reshape(ntotdata, 1)
        w = least_squares(xNonlinear, y)
        bestw[i, :] = w.reshape(d)

    wfile = "w_lse_nonlinear.mtx"
    io.mmwrite(wfile, bestw)
    [accuracy] = test_suite.main(bestw, bestw0, wordid_test, wordfeature_std, xtestNonlinear)
    print(accuracy)
    [ytest, ywrong] = test_suite.prepareData(wordid_test, wordfeature_std)
    rmsetest = rmse_per_semantic_feature(xtestNonlinear, ytest, bestw)
    rmsetrain = rmse_per_semantic_feature(xNonlinear, ytrain, bestw)
    rmsetestwrong = rmse_per_semantic_feature(xtestNonlinear, ywrong, bestw)
    return [accuracy, rmsetrain, rmsetest, rmsetestwrong]
Beispiel #6
0
def findw_PCA_LSE(dimensions, train_data, test_data, wordid_test, ytrain, wfile="w_lse_dim299.mtx"):
    '''
    Fits one least-squares weight vector per semantic feature on PCA data.

    The training and test data are projected with pcaData, least_squares is
    solved once per column of ytrain, and the resulting weight matrix is
    written to wfile. The weights are then handed to test_suite.main together
    with the projected test data, and the reported accuracy is printed.

    Relies on the module-level name wordfeature_std.

    :param dimensions: number of pca components
    :param train_data: training data
    :param test_data: test data
    :param wordid_test: ids of the words in the test data set
    :param ytrain: semantic features values for training data
    :param wfile: name of file where w's will be written
    :return: [accuracy, rmsetrain, rmsetest, rmsetestwrong] where accuracy is
        the value reported by test_suite.main and the other three are the
        rmse_per_semantic_feature results on the training data, on the test
        data against ytest and on the test data against ywrong.
    '''
    [xtrainPCA, xtestPCA] = pcaData(dimensions, train_data, test_data)
    num_features = ytrain.shape[1]
    d = xtrainPCA.shape[1]
    ntotdata = xtrainPCA.shape[0]
    bestw = np.zeros([num_features, d])
    # No intercept is fit here; the bias terms handed on stay at zero.
    bestw0 = np.zeros(num_features)
    for i in range(num_features):
        print('looking at feature ', i)
        y = ytrain[:, i].reshape(ntotdata, 1)
        w = least_squares(xtrainPCA, y)
        # Size the row by the actual PCA width so it always matches bestw,
        # even if pcaData returns a different number of columns than asked.
        bestw[i, :] = w.reshape(d)
    io.mmwrite(wfile, bestw)
    [accuracy] = test_suite.main(bestw, bestw0, wordid_test, wordfeature_std, xtestPCA)
    print(accuracy)
    [ytest, ywrong] = test_suite.prepareData(wordid_test, wordfeature_std)
    rmsetest = rmse_per_semantic_feature(xtestPCA, ytest, bestw)
    rmsetrain = rmse_per_semantic_feature(xtrainPCA, ytrain, bestw)
    rmsetestwrong = rmse_per_semantic_feature(xtestPCA, ywrong, bestw)
    return [accuracy, rmsetrain, rmsetest, rmsetestwrong]