import csv
import json
import ntpath
import os
import time
from os import path

import numpy as np
from scipy.sparse import csr_matrix

# save_sparse_csr, messageParent, and printParent are project helpers defined elsewhere in the repo


def splitDataset(data, name, fileCategory):
    # uses slicing, one of the most useful and least-well-known features of scipy sparse matrices
    # you pass in a list of row indices you want to keep, and it will create a sliced copy that includes only those rows
    # slicing also works on column indices
    # callout to the person who first opened my eyes to them:
    # http://stackoverflow.com/questions/13352280/slicing-sparse-matrices-in-scipy-which-types-work-best
    # note: validationIndices, trainingIndices, and outputDirectory are module-level globals set elsewhere

    # if this "sparse" matrix has only a single row (one value per sample), we have to slice it along its columns instead
    # this is the case for our idColumn, and frequently our y values as well
    if data.shape[0] == 1:
        validation = data[:, validationIndices]
        trainingData = data[:, trainingIndices]
    else:
        validation = data[validationIndices, :]
        trainingData = data[trainingIndices, :]

    # ntpath treats both Windows and POSIX separators as path breaks, so basename works across systems
    name = ntpath.basename(name)
    # remove the file extension
    name = path.splitext(name)[0]

    validationFile = path.join(outputDirectory, name + 'validationData.npz')
    trainingDataFile = path.join(outputDirectory, name + 'trainingData.npz')

    save_sparse_csr(trainingDataFile, trainingData)
    save_sparse_csr(validationFile, validation)

    # send the file names back to the parent process, where we aggregate and save them
    fileNameDict = {
        fileCategory + 'trainingData': trainingDataFile,
        fileCategory + 'validationData': validationFile
    }
    messageParent(fileNameDict, 'splitFileNames')
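# save_sparse_csr is called throughout this section but never defined in it. A minimal sketch of
# the usual implementation (the numpy savez recipe these functions appear to rely on; the load
# counterpart is included for symmetry, and both bodies are assumptions, not the repo's code):
import numpy as np
from scipy.sparse import csr_matrix


def save_sparse_csr(filename, array):
    # persist the three arrays that define a CSR matrix, plus its shape, in one .npz file
    np.savez(filename, data=array.data, indices=array.indices,
             indptr=array.indptr, shape=array.shape)


def load_sparse_csr(filename):
    # rebuild the CSR matrix from the saved component arrays
    loader = np.load(filename)
    return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=loader['shape'])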
def writeDataDense(X, args, headerRow, nn):
    # build the names of the training and testing output files from the datasets' pretty names
    trainingFileName = args['trainingPrettyName'] + '.csv'
    testingFileName = args['testingPrettyName'] + args['trainingPrettyName'] + '.csv'

    if nn:
        trainingFileName = 'nn_' + trainingFileName
        testingFileName = 'nn_' + testingFileName

    # save the file names into variables- we will use them to create the files and in the fileNames hash messaged out to the parent
    X_train = path.join(args['outputFolder'], 'X_train_' + trainingFileName)
    X_test = path.join(args['outputFolder'], 'X_test_' + testingFileName)

    # newline='' keeps the csv module from writing blank rows on Windows
    with open(X_train, 'w+', newline='') as outputFile:
        csvOutputFile = csv.writer(outputFile)
        csvOutputFile.writerow(headerRow)
        # grab only the rows that were part of our training file from the combined X dataset
        csvOutputFile.writerows(X[0:args['trainingLength']])

    with open(X_test, 'w+', newline='') as outputFile:
        csvOutputFile = csv.writer(outputFile)
        csvOutputFile.writerow(headerRow)
        # grab the rest of the rows from our X dataset, which comprise the testing dataset
        csvOutputFile.writerows(X[args['trainingLength']:])

    if nn:
        fileNames = {'X_train_nn': X_train, 'X_test_nn': X_test}
    else:
        fileNames = {'X_train': X_train, 'X_test': X_test}

    messageParent(fileNames, 'fileNames')
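# messageParent is the other helper this section leans on. A minimal sketch, assuming the parent
# is a separate process that spawned this script and reads JSON messages from stdout -- the exact
# wire format here is a guess; only the (payload, messageType) call signature comes from this file:
import json
import sys


def messageParent(messageObj, messageType):
    # tag the payload with its message type and flush immediately so the parent sees it right away
    print(json.dumps({'type': messageType, 'payload': messageObj}))
    sys.stdout.flush()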
def writeDataSparse(X, args, headerRow, nn):
    # build the names of the training and testing output files from the datasets' pretty names
    trainingFileName = args['trainingPrettyName'] + '.npz'
    testingFileName = args['testingPrettyName'] + args['trainingPrettyName'] + '.npz'

    # when nn is not a bool, it holds the y values for the neural network version of this dataset
    if type(nn) is not bool:
        trainingFileName = 'nn_' + trainingFileName
        testingFileName = 'nn_' + testingFileName

        yFileName = 'y_train_' + 'nn_' + args['trainingPrettyName'] + '.npz'
        y_train = path.join(args['outputFolder'], yFileName)

        y = np.array(nn)
        y = [float(i) for i in y]

        # if our values are not already stored as numbers, convert them to numbers
        try:
            ySparse = csr_matrix(y)
            printParent('successfully turned y into a sparse matrix!')
        except (ValueError, TypeError):
            yFloat = [float(i) for i in y[0:args['trainingLength']]]
            ySparse = csr_matrix(yFloat)

        save_sparse_csr(y_train, ySparse)

    # save the file names into variables- we will use them to create the files and in the fileNames hash messaged out to the parent
    X_train = path.join(args['outputFolder'], 'X_train_' + trainingFileName)
    X_test = path.join(args['outputFolder'], 'X_test_' + testingFileName)

    # scipy sparse matrices need a list of indices to slice
    # http://stackoverflow.com/questions/13352280/slicing-sparse-matrices-in-scipy-which-types-work-best
    trainRange = list(range(args['trainingLength']))
    testRange = list(range(args['trainingLength'], args['trainingLength'] + args['testingLength']))

    save_sparse_csr(X_train, X[trainRange, :])
    save_sparse_csr(X_test, X[testRange, :])

    if type(nn) is not bool:
        fileNames = {
            'X_train_nn': X_train,
            'X_test_nn': X_test,
            'y_train_nn': y_train
        }
    else:
        fileNames = {'X_train': X_train, 'X_test': X_test}

    messageParent(fileNames, 'fileNames')
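# a quick standalone demonstration of the index-list slicing used above: passing a list of row
# indices to a CSR matrix returns a new sparse matrix containing just those rows
import numpy as np
from scipy.sparse import csr_matrix

X_demo = csr_matrix(np.arange(12).reshape(4, 3))
first_three_rows = X_demo[[0, 1, 2], :]   # rows 0-2, all columns
last_row = X_demo[[3], :]                 # row 3 only, still a 2-D sparse matrix
print(first_three_rows.shape, last_row.shape)  # (3, 3) (1, 3)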
def writeMetadataDense(y, idColumn, args, headerRow):
    # save the output file names (with full paths) into variables- we will use them to create the files and in the fileNames hash messaged out to the parent
    y_train = path.join(args['outputFolder'], 'y_train_' + args['trainingPrettyName'] + '.csv')
    id_train = path.join(args['outputFolder'], 'id_train_' + args['trainingPrettyName'] + '.csv')
    id_test = path.join(args['outputFolder'], 'id_test_' + args['testingPrettyName'] + args['trainingPrettyName'] + '.csv')

    with open(y_train, 'w+', newline='') as outputFile:
        csvOutputFile = csv.writer(outputFile)
        # write the pretty name for the header row to the output file
        csvOutputFile.writerow([args['outputHeader']])
        # grab only the rows that were part of our training file from the combined dataset
        for rowIdx, row in enumerate(y):
            if rowIdx < args['trainingLength']:
                csvOutputFile.writerow([row])

    with open(id_train, 'w+', newline='') as outputFile:
        csvOutputFile = csv.writer(outputFile)
        # write the pretty name for the header row to the output file
        csvOutputFile.writerow([args['idHeader']])
        # grab only the rows that were part of our training file from the combined dataset
        for rowIdx, row in enumerate(idColumn):
            if rowIdx < args['trainingLength']:
                csvOutputFile.writerow([row])

    with open(id_test, 'w+', newline='') as outputFile:
        csvOutputFile = csv.writer(outputFile)
        # write the pretty name for the header row to the output file
        csvOutputFile.writerow([args['idHeader']])
        # grab only the rows that were part of our testing file from the combined dataset
        for rowIdx, row in enumerate(idColumn):
            if rowIdx >= args['trainingLength']:
                csvOutputFile.writerow([row])

    fileNames = {'y_train': y_train, 'id_train': id_train, 'id_test': id_test}
    messageParent(fileNames, 'fileNames')
def writeMetadata(y, idColumn, args, headerRow, validationSplitColumn, hasCustomValidationSplit):
    # these are the file names (with full file paths) that we will be writing to
    y_train = path.join(args['outputFolder'], 'y_train_' + args['trainingPrettyName'] + '.npz')
    id_train = path.join(args['outputFolder'], 'id_train_' + args['trainingPrettyName'] + '.npz')
    id_test = path.join(args['outputFolder'], 'id_test_' + args['testingPrettyName'] + args['trainingPrettyName'] + '.npz')
    validation_split_column = path.join(args['outputFolder'], 'validation_split_column_' + args['trainingPrettyName'] + '.npz')

    trainingLength = args['trainingLength']

    # convert all our data to np arrays, and break apart based on whether it's in the training data or not
    idTrainData = np.array(idColumn[0:trainingLength])
    idTestData = np.array(idColumn[trainingLength:])
    y = np.array(y)
    validationSplitColumnData = np.array(validationSplitColumn)

    # if our values are not already stored as numbers, convert them to numbers before building the sparse matrices
    try:
        ySparse = csr_matrix(y)
    except (ValueError, TypeError):
        yFloat = [float(i) for i in y[0:trainingLength]]
        ySparse = csr_matrix(yFloat)

    try:
        idTrainSparse = csr_matrix(idTrainData)
    except (ValueError, TypeError):
        idTrainFloat = [float(i) for i in idTrainData]
        idTrainSparse = csr_matrix(idTrainFloat)

    try:
        idTestSparse = csr_matrix(idTestData)
    except (ValueError, TypeError):
        idTestFloat = [float(i) for i in idTestData]
        idTestSparse = csr_matrix(idTestFloat)

    try:
        validationSplitSparse = csr_matrix(validationSplitColumnData)
    except (ValueError, TypeError):
        validationSplitFloat = [float(i) for i in validationSplitColumnData]
        validationSplitSparse = csr_matrix(validationSplitFloat)

    save_sparse_csr(y_train, ySparse)
    save_sparse_csr(id_train, idTrainSparse)
    save_sparse_csr(id_test, idTestSparse)
    save_sparse_csr(validation_split_column, validationSplitSparse)

    fileNames = {
        'y_train': y_train,
        'id_train': id_train,
        'id_test': id_test,
        'idHeader': args['idHeader'],
        'outputHeader': args['outputHeader'],
        'validation_split_column': validation_split_column,
        'hasCustomValidationSplit': hasCustomValidationSplit
    }
    messageParent(fileNames, 'fileNames')
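# the try/except pattern above as one standalone example: csr_matrix refuses non-numeric dtypes,
# so string data has to be converted to floats before the sparse matrix can be built
import numpy as np
from scipy.sparse import csr_matrix

stringValues = np.array(['1.5', '2.0', '3.25'])
try:
    sparseValues = csr_matrix(stringValues)
except (ValueError, TypeError):
    sparseValues = csr_matrix([float(i) for i in stringValues])
print(sparseValues.toarray())  # [[1.5  2.  3.25]]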
# when doing the cross-validated search, we have potentially been holding out a significant portion of the dataset
# once we have found the best hyperparameters, train on the entire dataset
# we have already verified that this is the best set of hyperparameters using cross-validation
if X.shape[0] != X_train.shape[0] or extendedTraining:
    longTrainClassifier.fit(X, y)

    finishLongTrainTime = time.time()
    printParent(classifierName + "'s training on the longer data set took (in minutes):")
    printParent(round((finishLongTrainTime - startLongTrainTime) / 60, 1))

    longTrainClassifierScore = longTrainClassifier.score(X, y)
    printParent(classifierName + "'s score against the larger training data set is:")
    printParent(longTrainClassifierScore)
    messageObj['longTrainScore'] = longTrainClassifierScore

    # save our classifiers from the validationRound to a separate folder
    if globalArgs['validationRound']:
        classifierFolder = path.join(globalArgs['bestClassifiersFolder'], 'ensemblingAlgos', 'best' + classifierName)
    else:
        classifierFolder = path.join(globalArgs['bestClassifiersFolder'], 'best' + classifierName)

    if not os.path.exists(classifierFolder):
        os.makedirs(classifierFolder)

    joblib.dump(longTrainClassifier, path.join(classifierFolder, 'best' + classifierName + '.pkl'))

messageParent(messageObj, 'trainingResults')
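# for context, loading one of the pickled classifiers back is the mirror image of the joblib.dump
# call above; the folder and name values here are hypothetical stand-ins for the ones used there
from os import path
from sklearn.externals import joblib  # plain `import joblib` on modern scikit-learn installs

classifierFolder = 'bestClassifiers/bestGradientBoostingClassifier'  # hypothetical
classifierName = 'GradientBoostingClassifier'                        # hypothetical
clf = joblib.load(path.join(classifierFolder, 'best' + classifierName + '.pkl'))
# the restored object is the fitted estimator, ready to call .predict() or .score()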
def format(X, y, idColumn, args):
    # min_max_scaler is a module-level scaler (a sklearn MinMaxScaler, from the name) defined elsewhere
    X = min_max_scaler.fit_transform(X).tolist()

    brainArr = []
    for rowIndex, row in enumerate(X):
        # brainJS expects a very particular format for each row:
        # an object with input and output properties, each of which is an array
        rowObj = {}

        # we might need to wrap output in a list if the output is a single number, like we have
        rowObj['output'] = []

        # if a y value exists for this rowIndex, grab it!
        # otherwise, it's likely because the user did not pass in a column of y values for the output dataset, so we'll set it equal to the empty string instead
        try:
            # grab the output value from the y dataset saved earlier
            yRow = y[rowIndex]
        except IndexError:
            yRow = ''

        # designed to handle the possibility of multiple output columns
        if isinstance(yRow, list):
            rowObj['output'].extend(yRow)
        else:
            # the output value is expected to be a list, so if y values are not lists, we need to wrap the y value in a list
            rowObj['output'].append(yRow)

        rowObj['id'] = idColumn[rowIndex]

        for idx, val in enumerate(row):
            # python stores floats in their binary representation, which gets written to file in scientific notation
            # that scientific notation is not necessarily json compatible, so we format these values as strings
            # unfortunately, these strings will only have 6 decimal places after the 0. this should not be a major limitation, as it still allows 6 orders of magnitude of differentiation. it would be a rather surprising neural network that could draw meaningful distinctions between values 7 orders of magnitude below the max value and values 6 orders of magnitude below it
            # in our case, they'll both just get saved as 0
            row[idx] = '{:f}'.format(val)

        rowObj['input'] = row
        brainArr.append(rowObj)

    trainingFileName = path.split(args['trainingData'])[-1]
    testingFileName = path.split(args['testingData'])[-1]

    brainJS_train = path.join(args['outputFolder'], 'brainJS_train_' + trainingFileName)
    brainJS_test = path.join(args['outputFolder'], 'brainJS_test_' + testingFileName)

    with open(brainJS_train, 'w+', newline='') as outputFile:
        csvOutputFile = csv.writer(outputFile)
        for rowIndex, row in enumerate(brainArr):
            if rowIndex < args['trainingLength']:
                # csvWriter.writerow expects each row to be a list
                # since our rows are actually dictionaries, we wrap each one in a list so the writer knows it is a single row
                csvOutputFile.writerow([json.dumps(row)])

    with open(brainJS_test, 'w+', newline='') as outputFile:
        csvOutputFile = csv.writer(outputFile)
        for rowIndex, row in enumerate(brainArr):
            if rowIndex >= args['trainingLength']:
                # json.dumps here too, so the testing rows are serialized the same way as the training rows
                csvOutputFile.writerow([json.dumps(row)])

    fileNames = {'brainJS_train': brainJS_train, 'brainJS_test': brainJS_test}
    messageParent(fileNames, 'fileNames')

    return brainArr
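# what the '{:f}'.format(val) comment above means in practice: the fixed-point format keeps
# six decimal places, so anything smaller than 1e-6 collapses to '0.000000'
print('{:f}'.format(0.25))    # 0.250000
print('{:f}'.format(1e-6))    # 0.000001
print('{:f}'.format(1e-7))    # 0.000000 -- indistinguishable from zero after formatting
print('{:f}'.format(2.5e-7))  # 0.000000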