def _makeTextMatrix(self, inputFile, stopwordFile):
    """Build a text matrix with TmgSimple and dump it to two text files.

    The sorted vocabulary is written one word per line to ``attributFile``
    and the matrix is written in integer format to ``dataFile``.

    Args:
        inputFile: Currently unused by this method.
            NOTE(review): the matrix is built from ``formattedDatabase``
            rather than from this argument -- confirm that is intended.
        stopwordFile: Path passed to TmgSimple as ``stopwords_filename``.

    ``formattedDatabase``, ``attributFile`` and ``dataFile`` are not defined
    in this method; they are presumably module-level names -- verify against
    the enclosing file.
    """
    # Generate text matrix with TmgSimple from 02450
    textMatrix = TmgSimple(filename=formattedDatabase,
                           stopwords_filename=stopwordFile)
    attributeNames = textMatrix.get_words(sort=True)

    # Write the vocabulary, one word per line. The context manager makes
    # sure the file is actually flushed and closed (the previous
    # ``attFile.close`` without parentheses never called close()).
    with open(attributFile, 'w') as attFile:
        for word in attributeNames:
            attFile.write(word)
            attFile.write('\n')

    # Write the matrix in 40 slices of 1000, appended to the same handle.
    # NOTE(review): 40 * 1000 is hard-coded; presumably it matches the
    # number of documents in the database -- confirm.
    with open(dataFile, 'w') as datFile:
        for i in range(40):
            chunk = textMatrix.get_matrix(i * 1000, (i + 1) * 1000, sort=True)
            np.savetxt(datFile, chunk, fmt='%i')
# exercise 3.1.4
import numpy as np
from tmgsimple import TmgSimple

# Input files: the document collection and the stop-word list.
DOCS_PATH = '../Data/textDocs.txt'
STOPWORDS_PATH = '../Data/stopWords.txt'

# Build the text matrix with stop words supplied and stemming switched on.
tm = TmgSimple(
    filename=DOCS_PATH,
    stopwords_filename=STOPWORDS_PATH,
    stem=True,
)

# Pull out the sorted matrix and the matching sorted word list.
X = tm.get_matrix(sort=True)
attributeNames = tm.get_words(sort=True)

# Show the word list first, then the matrix.
print(attributeNames)
print(X)

print('Ran Exercise 3.1.4')
document at least contains 2 of your key words, i.e. the document-term matrix should have approximately 10 columns and each row of the matrix must at least contain 2 non-zero entries. """ bagOfWords = ['matrix', 'Google', 'ranking', 'web', 'webpage', 'rank'] """ 3.1.2 """ import numpy as np from tmgsimple import TmgSimple from similarity import similarity # Generate text matrix with help of simple class TmgSimple tm = TmgSimple(filename='../02450Toolbox_Python/Data/textDocs.txt', ) # Extract variables representing data X = tm.get_matrix(sort=True) attributeNamesWithOutStop = tm.get_words(sort=True) # Display the result print attributeNamesWithOutStop print X """ 3.1.3 With stopwords """ print('Now with stopwords !!!') tm = TmgSimple(filename='../02450Toolbox_Python/Data/textDocs.txt', stopwords_filename='../02450Toolbox_Python/Data/stopWords.txt')
# exercise 2.1.4
import numpy as np
from tmgsimple import TmgSimple

# Generate text matrix with help of simple class TmgSimple,
# passing a stop-word file and enabling stemming.
tm = TmgSimple(filename='../Data/textDocs.txt',
               stopwords_filename='../Data/stopWords.txt',
               stem=True)

# Extract variables representing data
X = tm.get_matrix(sort=True)
attributeNames = tm.get_words(sort=True)

# Display the result. The call form of print is used because the bare
# `print x` statement is a SyntaxError on Python 3; with a single argument
# the call form prints the same thing on Python 2.
print(attributeNames)
print(X)
# exercise 2.1.2
from tmgsimple import TmgSimple
import tmgsimple

# Uncomment to browse the module documentation interactively:
# help(tmgsimple)

# NOTE(review): these are absolute, machine-specific Windows paths; the
# script only runs where this directory layout exists.
fn = 'C:\\Users\\Bahram\\PycharmProjects\\Machine-Learning-and-Data-Mining\\02450Toolbox_Python\\Data\\textDocs.txt'
stopwords = 'C:\\Users\\Bahram\\PycharmProjects\\Machine-Learning-and-Data-Mining\\02450Toolbox_Python\\Data\\stopWords.txt'

# Generate text matrix with help of simple class TmgSimple, passing a
# stop-word file, stemming enabled and min_term_length=5.
tm = TmgSimple(filename=fn,
               stopwords_filename=stopwords,
               stem=True,
               min_term_length=5)

# Extract variables representing data
attributeNames = tm.get_words(sort=True)
x = tm.get_matrix(sort=True)

# Display the result. The call form of print is used because the bare
# `print x` statement is a SyntaxError on Python 3; with a single argument
# the call form prints the same thing on Python 2.
print(attributeNames)
print(x)
# exercise 2.1.5
import numpy as np
import scipy.linalg as linalg
from similarity import similarity
from tmgsimple import TmgSimple

# Generate text matrix with help of simple class TmgSimple
tm = TmgSimple(filename='../Data/textDocs.txt',
               stopwords_filename='../Data/stopWords.txt',
               stem=True)

# Extract variables representing data.
# np.asmatrix replaces np.mat, which was removed in NumPy 2.0; np.mat was
# an alias of asmatrix, so the resulting object is the same.
X = np.asmatrix(tm.get_matrix(sort=True))
attributeNames = tm.get_words(sort=True)

# Query vector. It has 18 entries, so X must have 18 columns for the
# product q * X.T below to be defined.
q = np.matrix([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0])

# Method 1 ('for' loop - slow)
N = np.shape(X)[0]       # get the number of data objects
sim = np.zeros((N, 1))   # allocate a vector for the similarity
for i in range(N):
    x = X[i, :]          # Get the i'th data object (here: document)
    # Compute cosine similarity: both operands are np.matrix, so `*` is a
    # matrix product of the two normalised vectors (a 1x1 result).
    sim[i] = q / linalg.norm(q) * x.T / linalg.norm(x)

# Method 2 (one line of code with no iterations - faster)
# Numerator: q times every row of X; denominator: row norms of X times
# the norm of q.
sim = (q * X.T).T / (np.sqrt(np.power(X, 2).sum(axis=1))
                     * np.sqrt(np.power(q, 2).sum(axis=1)))

# Method 3 (use the "similarity" function)
sim = similarity(X, q, 'cos')