def displayHistoOfMatrix(fname, bins=120):
    """Show a histogram of the per-column sums of the matrix stored in a file.

    Args:
        fname: Path handed to ``readFile``; ``extractArrays`` is expected to
            turn the result into a ``(titles, words, matrix)`` triple.
        bins: Bin specification forwarded unchanged to ``pl.hist``.
            Defaults to 120, the value that used to be hard-coded, so
            existing one-argument calls behave exactly as before.
    """
    infos = readFile(fname)
    # Only the matrix is plotted; the titles and words are not used here.
    _titles, _words, matrix = extractArrays(infos)
    # axis=0 sums over the first axis, leaving one total per column.
    # NOTE(review): assumes matrix is a 2-D NumPy array -- confirm.
    pl.hist(matrix.sum(axis=0), bins=bins)
    pl.show()
def whatever():
    """Scratch routine that inspects the contents of 'data3/data10.json'.

    Prints the titles, shows a 140-bin histogram of the per-column sums of
    the matrix, then prints the matrix and word shapes with and without a
    mask that keeps only the columns whose sum is greater than 1.

    Returns:
        A ``(words, matrix)`` pair: the words converted to a plain list and
        the matrix as loaded.  The mask is only used for the printed
        shapes; the returned values are NOT filtered.
    """
    # Matrix layout, per the original author's example: each row holds the
    # values for one film (presumably one column per word), e.g.
    #     [[0, 0, 0, 12, 2, 0],   # values for the 1st film
    #      [0, 0, 0, 12, 2, 0],   # values for the 2nd film
    #      [0, 0, 0, 12, 2, 0]]
    source_path = 'data3/data10.json'
    titles, raw_words, matrix = extractArrays(readFile(source_path))
    print(titles)

    # One total per column; reused for both the histogram and the mask.
    column_totals = matrix.sum(axis=0)
    pl.hist(column_totals, bins=140)
    pl.show()

    keep_mask = column_totals > 1
    word_array = np.array(raw_words)  # boolean masking needs an ndarray
    kept_matrix = matrix[:, keep_mask]
    kept_words = word_array[keep_mask]

    print(matrix.shape)
    print(word_array)
    print(kept_matrix.shape)
    print(kept_words.shape)

    # NOTE(review): the two "avant" (French for "before") size prints and
    # the return below read as if they may belong to a filtering helper
    # rather than to this scratch routine -- confirm against the full file.
    print('Taille matrix avant : ' + str(matrix.shape))
    print('Taille words avant : ' + str(word_array.shape))
    return word_array.tolist(), matrix


if __name__ == '__main__':
    # For every dataset size listed below, load the three arrays 'titles',
    # 'words' and 'matrix' from the matching file in the folder 'data3',
    # filter them, and save the result under the same name in 'data4'.
    # Sizes that appear in earlier, now disabled, configurations:
    # 1, 3, 5, 10, 50, 100, 500, 3393.
    dataset = [3]
    for n in dataset:
        fname = 'data3/data' + str(n) + '.json'
        infos = readFile(fname)
        titles, words, matrix = extractArrays(infos)

        # Apply the filter, then convert the matrix to nested lists before
        # handing it to saveToFile.
        words, matrix = removeLonelyWords(words, matrix)
        matrix = matrix.tolist()

        output_fname = 'data4/data' + str(n) + '.json'
        saveToFile(titles, words, matrix, output_fname)
        print('File \'' + output_fname + '\' saved.')
def printSizeOfMatrix(fname):
    """Print the shape of the matrix stored in the file ``fname``.

    The file is read with ``readFile`` and unpacked with ``extractArrays``;
    only the third returned value (the matrix) is used.
    """
    _titles, _words, matrix = extractArrays(readFile(fname))
    print(matrix.shape)