def _makeTextMatrix(self, inputFile, stopwordFile):
        """Export the vocabulary and the text matrix built by ``TmgSimple``.

        One word per line is written to ``attributFile``; the matrix is
        written to ``dataFile`` with ``np.savetxt`` using integer formatting.

        NOTE(review): ``formattedDatabase``, ``attributFile`` and ``dataFile``
        are not defined inside this method -- presumably module-level names;
        confirm against the rest of the file.

        Args:
            inputFile: Not used by the current implementation.
                NOTE(review): the corpus is read from ``formattedDatabase``
                instead -- confirm whether ``inputFile`` was intended.
            stopwordFile: Passed to ``TmgSimple`` as ``stopwords_filename``.
        """
        # Generate text matrix with TmgSimple from 02450
        textMatrix = TmgSimple(filename=formattedDatabase,
                               stopwords_filename=stopwordFile)

        attributeNames = textMatrix.get_words(sort=True)

        # Write the attribute names, one per line. The context manager
        # guarantees the file is flushed and closed (the previous code
        # referenced ``attFile.close`` without calling it).
        with open(attributFile, 'w') as attFile:
            attFile.writelines(word + '\n' for word in attributeNames)

        # Append the matrix to the data file in 40 slices of 1000.
        # NOTE(review): the slice count and size are hard-coded; the two
        # positional arguments to get_matrix look like start/end bounds --
        # confirm against TmgSimple.
        with open(dataFile, 'w') as datFile:
            for i in range(40):
                chunk = textMatrix.get_matrix(i * 1000, (i + 1) * 1000,
                                              sort=True)
                np.savetxt(datFile, chunk, fmt='%i')
Example #2
0
# exercise 3.1.4
import numpy as np
from tmgsimple import TmgSimple

# Build the text matrix using the TmgSimple helper class; the stop-word
# file and the stem flag are forwarded to it unchanged.
docs_path = '../Data/textDocs.txt'
stopwords_path = '../Data/stopWords.txt'
tm = TmgSimple(filename=docs_path,
               stopwords_filename=stopwords_path,
               stem=True)

# Pull the data matrix and the matching word list out of the helper
# (both requested with sort=True).
X = tm.get_matrix(sort=True)
attributeNames = tm.get_words(sort=True)

# Show the word list first, then the matrix
for item in (attributeNames, X):
    print(item)

print('Ran Exercise 3.1.4')
Example #3
0
# exercise 3.1.5

import numpy as np
import scipy.linalg as linalg
from similarity import similarity
from tmgsimple import TmgSimple

# Generate text matrix with help of simple class TmgSimple
tm = TmgSimple(filename='../Data/textDocs.txt',
               stopwords_filename='../Data/stopWords.txt',
               stem=True)

# Extract variables representing data.
# np.asmatrix replaces the np.mat alias, which was removed in NumPy 2.0;
# the result is still an np.matrix, so `*` below is a matrix product.
X = np.asmatrix(tm.get_matrix(sort=True))
attributeNames = tm.get_words(sort=True)

# Query vector (1 x 18 row matrix). The product q * X.T below requires X
# to have as many columns as q has entries.
q = np.matrix([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0])

# Method 1 ('for' loop - slow)
N = np.shape(X)[0]  # get the number of data objects
sim = np.zeros((N, 1))  # allocate a vector for the similarity
q_unit = q / linalg.norm(q)  # loop-invariant: normalise the query once
for i in range(N):
    x = X[i, :]  # Get the i'th data object (here: document)
    sim[i] = q_unit * x.T / linalg.norm(x)  # Compute cosine similarity

# Method 2 (one line of code with no iterations - faster)
# This overwrites the result of Method 1 with the vectorised computation:
# dot products of q with every row, divided by the product of the norms.
sim = (q * X.T).T / (np.sqrt(np.power(X, 2).sum(axis=1)) *
                     np.sqrt(np.power(q, 2).sum(axis=1)))
# exercise 2.1.4

import numpy as np
from tmgsimple import TmgSimple

# Generate text matrix with help of simple class TmgSimple
tm = TmgSimple(filename='../Data/textDocs.txt',
               stopwords_filename='../Data/stopWords.txt',
               stem=True)

# Extract variables representing data
X = tm.get_matrix(sort=True)
attributeNames = tm.get_words(sort=True)

# Display the result.
# print is called as a function: the statement form is a SyntaxError on
# Python 3, while the single-argument call form works on both 2 and 3.
print(attributeNames)
print(X)

# exercise 2.1.5

import numpy as np
import scipy.linalg as linalg
from similarity import similarity
from tmgsimple import TmgSimple


# Generate text matrix with help of simple class TmgSimple
tm = TmgSimple(filename='../Data/textDocs.txt',
               stopwords_filename='../Data/stopWords.txt',
               stem=True)

# Extract variables representing data.
# np.asmatrix replaces the np.mat alias, which was removed in NumPy 2.0;
# the result is still an np.matrix, so `*` below is a matrix product.
X = np.asmatrix(tm.get_matrix(sort=True))
attributeNames = tm.get_words(sort=True)

# Query vector (1 x 18 row matrix). The product q * X.T below requires X
# to have as many columns as q has entries.
q = np.matrix([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0])


# Method 1 ('for' loop - slow)
N = np.shape(X)[0]  # get the number of data objects
sim = np.zeros((N, 1))  # allocate a vector for the similarity
q_unit = q / linalg.norm(q)  # loop-invariant: normalise the query once
for i in range(N):
    x = X[i, :]  # Get the i'th data object (here: document)
    sim[i] = q_unit * x.T / linalg.norm(x)  # Compute cosine similarity

# Method 2 (one line of code with no iterations - faster)
# Overwrites the Method 1 result with the vectorised computation.
sim = (q * X.T).T / (np.sqrt(np.power(X, 2).sum(axis=1)) *
                     np.sqrt(np.power(q, 2).sum(axis=1)))

# Method 3 (use the "similarity" function)
# Overwrites sim again, this time via the imported helper with the 'cos'
# method selector.
sim = similarity(X, q, 'cos')