def whatever(fname='data3/data10.json'):
    """Load a documents/words count matrix, plot a histogram of per-word
    counts, and print shapes before/after masking out rare words.

    Parameters:
        fname -- path to the JSON data file (generalized from the
                 previously hard-coded 'data3/data10.json').

    NOTE(review): relies on module-level helpers readFile/extractArrays
    and the plotting alias `pl` (presumably matplotlib.pyplot — confirm).
    """
    # Expected matrix layout (rows = films, columns = words), e.g.:
    #   matrix = np.array([[0, 0, 0, 12, 2, 0],
    #                      [0, 0, 0, 12, 2, 0],
    #                      [0, 0, 0, 12, 2, 0]])
    infos = readFile(fname)
    titles, words, matrix = extractArrays(infos)
    print(titles)

    # Histogram of total occurrences of each word across all documents.
    pl.hist(matrix.sum(axis=0), bins=140)
    pl.show()

    # Mask selecting words that occur more than once overall.
    word_counts = matrix.sum(axis=0)
    word_mask = word_counts > 1
    words = np.array(words)

    # BUGFIX: the original evaluated `matrix[:,word_mask]` and
    # `words[word_mask]` as bare expressions and discarded the results;
    # bind them so the filtered views are actually used below.
    filtered_matrix = matrix[:, word_mask]
    filtered_words = words[word_mask]

    print(matrix.shape)
    print(words)
    print(filtered_matrix.shape)
    print(filtered_words.shape)
def displayHistoOfMatrix(fname):
    """Show a histogram of per-word occurrence totals for the data file
    at *fname* (words summed over all documents, 120 bins)."""
    titles, words, matrix = extractArrays(readFile(fname))
    counts = matrix.sum(axis=0)
    pl.hist(counts, bins=120)
    pl.show()
def main():
    """Train an LSTM language model from a file of text documents.

    Command-line arguments:
        argv[1] -- training documents file (e.g. "./test/documents/testDoc.csv")
        argv[2] -- output path for the pickled word->index vocabulary
                   (e.g. "./test/vocabulary/test_word_to_idx")
        argv[3] -- output directory passed to trainModel
    """
    samplePath = sys.argv[1]
    text_documents = readFile(samplePath)

    # Build the vocabulary once; it is persisted below so that the same
    # word->index mapping can be reused at prediction time.
    word_to_idx = buildVocabDict(text_documents)
    print(word_to_idx)  # debug dump (was printed twice; duplicate removed)

    dictPath = sys.argv[2]
    outDir = sys.argv[3]
    with open(dictPath, 'wb') as outf:
        pickle.dump(word_to_idx, outf)

    # Model hyper-parameters.
    embedSize = 256
    hiddSize = 256
    numLSTM = 2
    batchSize = 16
    vocabSize = len(word_to_idx)

    model = LSTMlanguageModel(embedSize, hiddSize, numLSTM,
                              vocabSize, batchSize)
    print("training model")  # BUGFIX: log message typo was "trainning model"
    model = trainModel(model, text_documents, word_to_idx, batchSize, outDir)
# NOTE(review): the three statements below are the tail of a function whose
# `def` line is not visible in this chunk — presumably removeLonelyWords,
# which the script block below calls. TODO: confirm against the full file.
    # Report the matrix/words sizes before filtering ("Taille ... avant"
    # is French for "size ... before").
    print 'Taille matrix avant : ' + str(matrix.shape)
    print 'Taille words avant : ' + str(words.shape)
    return words.tolist(), matrix


if __name__ == '__main__':
    # Load the 'titles', 'words' and 'matrix' arrays from a JSON file in
    # the 'data3' folder, filter them, and save the result to 'data4'.
    # Alternative dataset sizes used previously:
    #dataset = [1, 3, 5, 10, 50, 100]#, 500, 3393]
    #dataset = [3393]
    dataset = [3]
    for n in dataset:
        fname = 'data3/data' + str(n) + '.json'
        infos = readFile(fname)
        titles, words, matrix = extractArrays(infos)
        # Drop words that occur too rarely to be useful.
        words, matrix = removeLonelyWords(words, matrix)
        # JSON cannot serialize numpy arrays, hence the tolist() conversion.
        matrix = matrix.tolist()
        output_fname = 'data4/data' + str(n) + '.json'
        saveToFile(titles, words, matrix, output_fname)
        print 'File \'' + output_fname + '\' saved.'
def printSizeOfMatrix(fname): infos = readFile(fname) titles, words, matrix = extractArrays(infos) print matrix.shape