def _makeTextMatrix(self, inputFile, stopwordFile):
    """Dump the vocabulary and the text matrix of a corpus to text files.

    A ``TmgSimple`` text matrix (02450 toolbox) is built from
    ``formattedDatabase``.  The sorted word list is written to
    ``attributFile``, one word per line, and the matrix is appended to
    ``dataFile`` in integer format.

    Args:
        inputFile: Not used by this method; the corpus is read from
            ``formattedDatabase`` instead.
            NOTE(review): looks like this was meant to be the corpus
            path -- confirm against callers before changing it.
        stopwordFile: Forwarded to ``TmgSimple`` as ``stopwords_filename``.
    """
    # formattedDatabase, attributFile and dataFile are not defined inside
    # this method; they must be provided by an enclosing scope.
    textMatrix = TmgSimple(filename=formattedDatabase,
                           stopwords_filename=stopwordFile)
    attributeNames = textMatrix.get_words(sort=True)

    # The matrix is requested in 40 consecutive slices of 1000
    # (0-1000, 1000-2000, ...), presumably to limit memory use.
    # NOTE(review): assumes 40 * 1000 covers the whole corpus -- confirm.
    chunkCount = 40
    chunkSize = 1000

    # Context managers guarantee both files are flushed and closed, even
    # if writing fails part-way.  The previous code referenced ``.close``
    # without calling it, so neither file was ever closed explicitly.
    with open(attributFile, 'w') as attFile, open(dataFile, 'w') as datFile:
        for word in attributeNames:
            attFile.write(word)
            attFile.write('\n')

        for start in range(0, chunkCount * chunkSize, chunkSize):
            chunk = textMatrix.get_matrix(start, start + chunkSize, sort=True)
            np.savetxt(datFile, chunk, fmt='%i')
# exercise 3.1.4
import numpy as np
from tmgsimple import TmgSimple

# Build the text matrix with the TmgSimple helper class, supplying a
# stop-word file and enabling stemming.
tm = TmgSimple(
    filename='../Data/textDocs.txt',
    stopwords_filename='../Data/stopWords.txt',
    stem=True,
)

# Pull out the data matrix and the matching (sorted) attribute names.
X = tm.get_matrix(sort=True)
attributeNames = tm.get_words(sort=True)

# Show the attribute names first, then the matrix.
for result in (attributeNames, X):
    print(result)

print('Ran Exercise 3.1.4')
import numpy as np
from tmgsimple import TmgSimple
from similarity import similarity

bagOfWords = ['matrix', 'Google', 'ranking', 'web', 'webpage', 'rank']

# 3.1.2 -- text matrix built without a stop-word file.

# Generate text matrix with help of simple class TmgSimple
tm = TmgSimple(filename='../02450Toolbox_Python/Data/textDocs.txt')

# Extract variables representing data
X = tm.get_matrix(sort=True)
attributeNamesWithOutStop = tm.get_words(sort=True)

# Display the result.  print() is called with a single argument so the
# output is the same on Python 2 and Python 3 (the former print
# statements were a SyntaxError on Python 3).
print(attributeNamesWithOutStop)
print(X)

# 3.1.3 -- same documents, this time with a stop-word file supplied.
print('Now with stopwords !!!')
tm = TmgSimple(
    filename='../02450Toolbox_Python/Data/textDocs.txt',
    stopwords_filename='../02450Toolbox_Python/Data/stopWords.txt',
)

# Extract variables representing data (X is overwritten on purpose; the
# attribute names are kept under a separate name).
X = tm.get_matrix(sort=True)
attributeNamesWithStop = tm.get_words(sort=True)
# exercise 2.1.4
import numpy as np
from tmgsimple import TmgSimple

# Generate text matrix with help of simple class TmgSimple, using a
# stop-word file and stemming.
tm = TmgSimple(
    filename='../Data/textDocs.txt',
    stopwords_filename='../Data/stopWords.txt',
    stem=True,
)

# Extract variables representing data
X = tm.get_matrix(sort=True)
attributeNames = tm.get_words(sort=True)

# Display the result.  Single-argument print() calls produce the same
# output on Python 2 and Python 3; the former print statements only
# parsed on Python 2.
print(attributeNames)
print(X)