def whatever():
    fname = 'data3/data10.json'

        #matrix = np.array([
        #    [0,0,0,12,2,0],  # valeurs pour le 1er film
        #    [0,0,0,12,2,0],  # valeurs pour le 2e film
        #    [0,0,0,12,2,0]
        #    ])
              # valeur pour un mot

    infos = readFile(fname)
    titles, words, matrix = extractArrays(infos)

    print titles

    pl.hist(matrix.sum(axis=0), bins=140)
    pl.show()

    word_counts = matrix.sum(axis=0)
    word_mask = word_counts > 1

    words = np.array(words)
    matrix[:,word_mask]
    words[word_mask]

    print matrix.shape
    print words
    print matrix[:,word_mask].shape
    print words[word_mask].shape
def displayHistoOfMatrix(fname):
    """Plot a histogram of the matrix column sums (total count of each
    word) read from the JSON file *fname*."""
    infos = readFile(fname)
    titles, words, matrix = extractArrays(infos)

    column_totals = matrix.sum(axis=0)
    pl.hist(column_totals, bins=120)
    pl.show()
# Example #3
# 0
def main():
    """Build a vocabulary from the input documents, pickle it, and train
    an LSTM language model.

    Command-line arguments:
        argv[1] -- path to the training documents
                   (e.g. "./test/documents/testDoc.csv")
        argv[2] -- path where the word->index dict is pickled
                   (e.g. "./test/vocabulary/test_word_to_idx")
        argv[3] -- output directory for the trained model
    """
    samplePath = sys.argv[1]
    text_documents = readFile(samplePath)
    word_to_idx = buildVocabDict(text_documents)
    print(word_to_idx)

    dictPath = sys.argv[2]
    outDir = sys.argv[3]

    # Save the dict so that it can be used for predicting.
    with open(dictPath, 'wb') as outf:
        pickle.dump(word_to_idx, outf)

    # Model hyper-parameters.
    embedSize = 256
    hiddSize = 256
    numLSTM = 2
    batchSize = 16
    vocabSize = len(word_to_idx)
    print(word_to_idx)

    model = LSTMlanguageModel(embedSize, hiddSize, numLSTM,
                              vocabSize, batchSize)
    # BUG FIX: log message typo "trainning" -> "training".
    print("training model")
    model = trainModel(model, text_documents, word_to_idx, batchSize, outDir)
    # NOTE(review): the original ended with tab-indented prints of
    # `matrix`/`words` and a `return words.tolist(), matrix` — names never
    # defined in main(), indented inconsistently with the rest of the body.
    # They were clearly spliced in from another function and have been
    # removed; main() intentionally returns None.

if __name__ == '__main__':

    # THIS CODE LOAD 3 ARRAYS FROM A FILE FROM THE FOLDER 'data3'
    # The arrays are 'titles', 'words' and 'matrix'
    # They are filtered and saved in the folder 'data4'

    #dataset = [1, 3, 5, 10, 50, 100]#, 500, 3393]
    #dataset = [3393]
    dataset = [3]

    for n in dataset:
        fname = 'data3/data' + str(n) + '.json'
        infos = readFile(fname)
    	titles, words, matrix = extractArrays(infos)

    	# apply filter
    	words, matrix = removeLonelyWords(words, matrix)

    	matrix = matrix.tolist()
        
        output_fname = 'data4/data' + str(n) + '.json'
        saveToFile(titles, words, matrix, output_fname)

        print 'File \'' + output_fname + '\' saved.' 
        
def printSizeOfMatrix(fname):
    """Print the shape of the matrix loaded from the JSON file *fname*.

    NOTE(review): the definition may continue past the visible end of
    this chunk; only the lines shown here are documented.
    """
    # titles and words are unpacked but unused here.
    infos = readFile(fname)
    titles, words, matrix = extractArrays(infos)

    print matrix.shape