def _makeTextMatrix(self, inputFile, stopwordFile):
        """Write the vocabulary and the document-term matrix to text files.

        A ``TmgSimple`` text matrix is built, its sorted word list is written
        one word per line to ``attributFile``, and the matrix is appended in
        40 slices of 1000 to ``dataFile`` as integers.

        Args:
            inputFile: Currently unused by this method.
                NOTE(review): the text source passed to ``TmgSimple`` is
                ``formattedDatabase``, not ``inputFile`` -- confirm intent.
            stopwordFile: Path handed to ``TmgSimple`` as
                ``stopwords_filename``.

        NOTE(review): ``formattedDatabase``, ``attributFile`` and ``dataFile``
        are not defined in this method; they must be provided by the
        enclosing scope.
        """
        # Generate text matrix with TmgSimple from 02450
        textMatrix = TmgSimple(filename=formattedDatabase,
                               stopwords_filename=stopwordFile)

        attributeNames = textMatrix.get_words(sort=True)

        # Write the attribute names, one per line. The context manager
        # guarantees the file is flushed and closed (the previous code
        # referenced ``attFile.close`` without calling it).
        with open(attributFile, 'w') as attFile:
            for word in attributeNames:
                attFile.write(word)
                attFile.write('\n')

        # Dump the matrix slice by slice so the whole matrix never has to be
        # formatted at once; presumably the two indices select a document
        # range -- TODO confirm against TmgSimple.get_matrix.
        with open(dataFile, 'w') as datFile:
            for i in range(40):
                chunk = textMatrix.get_matrix(i*1000, (i+1)*1000, sort=True)
                np.savetxt(datFile, chunk, fmt='%i')
Example #2
0
# exercise 3.1.4
import numpy as np
from tmgsimple import TmgSimple

# Build the text matrix from the documents, applying the stop word list
# and stemming, using the TmgSimple helper class
tm = TmgSimple(
    filename='../Data/textDocs.txt',
    stopwords_filename='../Data/stopWords.txt',
    stem=True,
)

# Data matrix and the matching (sorted) list of words
X = tm.get_matrix(sort=True)
attributeNames = tm.get_words(sort=True)

# Show the word list first, then the matrix
for result in (attributeNames, X):
    print(result)

print('Ran Exercise 3.1.4')
Example #3
0
document contains at least 2 of your key words, i.e. the document-term matrix 
should have approximately 10 columns and each row of the matrix must contain 
at least 2 non-zero entries.
"""

# Key words selected for the exercise (presumably the candidate columns of
# the document-term matrix -- confirm against the exercise text)
bagOfWords = [
    'matrix',
    'Google',
    'ranking',
    'web',
    'webpage',
    'rank',
]

"""
3.1.2
"""
import numpy as np
from tmgsimple import TmgSimple
from similarity import similarity

# Generate text matrix with help of simple class TmgSimple.
# Only the document file is passed here (no stopwords_filename).
tm = TmgSimple(filename='../02450Toolbox_Python/Data/textDocs.txt')

# Extract variables representing data: the matrix and its sorted word list
X = tm.get_matrix(sort=True)
attributeNamesWithOutStop = tm.get_words(sort=True)

# Display the result. print() is called as a function so the script runs on
# Python 3 as well; with a single argument the output is the same on Python 2.
print(attributeNamesWithOutStop)
print(X)

"""
3.1.3
With stopwords
"""
print('Now with stopwords !!!')
# Rebuild the text matrix, this time also passing the stop word file
tm = TmgSimple(
    filename='../02450Toolbox_Python/Data/textDocs.txt',
    stopwords_filename='../02450Toolbox_Python/Data/stopWords.txt',
)
# exercise 2.1.4

import numpy as np
from tmgsimple import TmgSimple

# Generate text matrix with help of simple class TmgSimple,
# passing the stop word file and enabling stemming
tm = TmgSimple(filename='../Data/textDocs.txt',
               stopwords_filename='../Data/stopWords.txt',
               stem=True)

# Extract variables representing data: the matrix and its sorted word list
X = tm.get_matrix(sort=True)
attributeNames = tm.get_words(sort=True)

# Display the result. print() is called as a function so the script runs on
# Python 3 as well; with a single argument the output is the same on Python 2.
print(attributeNames)
print(X)

Example #5
0
# exercise 2.1.2
from tmgsimple import TmgSimple
import tmgsimple
#help(tmgsimple)
# Absolute locations of the document file and the stop word list.
# NOTE(review): these paths are machine-specific; the script only runs
# unchanged where this directory layout exists.
fn = 'C:\\Users\\Bahram\\PycharmProjects\\Machine-Learning-and-Data-Mining\\02450Toolbox_Python\\Data\\textDocs.txt'
stopwords = 'C:\\Users\\Bahram\\PycharmProjects\\Machine-Learning-and-Data-Mining\\02450Toolbox_Python\\Data\\stopWords.txt'

# Build the text matrix with stop words, stemming and min_term_length=5
tm = TmgSimple(filename=fn, stopwords_filename=stopwords, stem=True,
               min_term_length=5)
attributeNames = tm.get_words(sort=True)
x = tm.get_matrix(sort=True)

# Display the result. print() is called as a function so the script runs on
# Python 3 as well; with a single argument the output is the same on Python 2.
print(attributeNames)
print(x)


"""

# Generate text matrix with help of simple class TmgSimple


# Extract variables representing data
X = tm.get_matrix(sort=True)


# Display the result
print attributeNames
print X
"""
# exercise 2.1.5

import numpy as np
import scipy.linalg as linalg
from similarity import similarity
from tmgsimple import TmgSimple


# Generate text matrix with help of simple class TmgSimple
tm = TmgSimple(filename='../Data/textDocs.txt',
               stopwords_filename='../Data/stopWords.txt',
               stem=True)

# Extract variables representing data.
# np.asmatrix is used instead of np.mat: np.mat was only an alias for it and
# was removed in NumPy 2.0. X stays a matrix, so `*` below is a matrix product.
X = np.asmatrix(tm.get_matrix(sort=True))
attributeNames = tm.get_words(sort=True)

# Query vector: 18 entries with ones at positions 7, 13 and 14.
# Its length must equal the number of columns of X for the products below.
q = np.matrix([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0])


# NOTE: the three methods below each overwrite `sim`; only the result of the
# last one is kept.

# Method 1 ('for' loop - slow)
N = np.shape(X)[0]  # get the number of data objects
sim = np.zeros((N, 1))  # allocate a vector for the similarity
for i in range(N):
    x = X[i, :]  # Get the i'th data object (here: document)
    # Cosine similarity: product of the two vectors scaled by their norms
    sim[i] = q/linalg.norm(q) * x.T/linalg.norm(x)

# Method 2 (one line of code with no iterations - faster)
sim = (q*X.T).T / (np.sqrt(np.power(X, 2).sum(axis=1))
                   * np.sqrt(np.power(q, 2).sum(axis=1)))

# Method 3 (use the "similarity" function)
sim = similarity(X, q, 'cos')