Example #1
def splitDataset(data, name, fileCategory):

    # uses slicing, one of the most useful and least-well-known features of scipy sparse matrices
    # you pass in a list of row indices you want to keep, and it will create a sliced copy that includes only those rows
    # slicing also works on column indices
    # callout to the person who first opened my eyes to them:
    # http://stackoverflow.com/questions/13352280/slicing-sparse-matrices-in-scipy-which-types-work-best

    # if this "sparse" matrix only has a single value for each row, we have to treat it as a column matrix, and slice it accordingly
    # this is the case for our idColumn, and frequently our y values as well.
    if data.shape[0] == 1:
        validation = data[:,validationIndices]
        trainingData = data[:,trainingIndices]

    else:
        validation = data[validationIndices,:]
        trainingData = data[trainingIndices,:]

    # ntpath theoretically works really well across systems
    name = ntpath.basename(name)
    # remove the file extension (this slice assumes a three-character extension such as .csv)
    name = name[0:-4]

    validationFile = path.join(outputDirectory, name + 'validationData.npz')
    trainingDataFile = path.join(outputDirectory, name + 'trainingData.npz')

    save_sparse_csr(trainingDataFile, trainingData)
    save_sparse_csr(validationFile, validation)

    # send the file names back to the parent process, where we aggregate and save them
    fileNameDict = {
        fileCategory + 'trainingData': trainingDataFile,
        fileCategory + 'validationData': validationFile
    }
    messageParent(fileNameDict, 'splitFileNames')
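
For anyone unfamiliar with the slicing idiom the comments above describe, here is a minimal, self-contained sketch of it; the matrix and index lists below are invented for illustration and are not part of this codebase:

from scipy.sparse import csr_matrix
import numpy as np

data = csr_matrix(np.arange(12).reshape(4, 3))  # 4 data rows, 3 columns

trainingIndices = [0, 2]
validationIndices = [1, 3]

trainingData = data[trainingIndices, :]   # sliced copy containing only rows 0 and 2
validation = data[validationIndices, :]   # sliced copy containing only rows 1 and 3
columnSubset = data[:, [0, 2]]            # the same idiom works on column indices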
Example #2
def writeDataDense(X, args, headerRow, nn):

    # grab the name of the training and testing files from the full path to those datasets
    trainingFileName = args['trainingPrettyName'] + '.csv'
    testingFileName = args['testingPrettyName'] + args[
        'trainingPrettyName'] + '.csv'

    if (nn):
        trainingFileName = 'nn_' + trainingFileName
        testingFileName = 'nn_' + testingFileName

    # save the file names into variables; we will use them to create the file and in the fileNames hash messaged out to the parent.
    X_train = path.join(args['outputFolder'], 'X_train_' + trainingFileName)
    X_test = path.join(args['outputFolder'], 'X_test_' + testingFileName)

    with open(X_train, 'w+') as outputFile:
        csvOutputFile = csv.writer(outputFile)
        csvOutputFile.writerow(headerRow)
        # grab only the rows that were part of our training file from the combined X dataset
        csvOutputFile.writerows(X[0:args['trainingLength']])

    with open(X_test, 'w+') as outputFile:
        csvOutputFile = csv.writer(outputFile)
        csvOutputFile.writerow(headerRow)
        # grab the rest of the rows from our X dataset, which comprise the testing dataset
        csvOutputFile.writerows(X[args['trainingLength']:])

    if (nn):
        fileNames = {'X_train_nn': X_train, 'X_test_nn': X_test}
    else:
        fileNames = {'X_train': X_train, 'X_test': X_test}

    messageParent(fileNames, 'fileNames')
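
One caveat worth noting about the csv-writing pattern above: under Python 3, csv.writer works best when the file is opened with newline='' (otherwise extra blank rows can appear on Windows), and plain 'w' mode is enough since the file is never read back here. A small sketch of the same write under that assumption, with made-up values standing in for args:

import csv
from os import path

outputFolder = '/tmp'                   # hypothetical output folder
headerRow = ['feature_a', 'feature_b']  # hypothetical header row
X = [[1, 2], [3, 4], [5, 6]]            # hypothetical combined dataset
trainingLength = 2

X_train = path.join(outputFolder, 'X_train_example.csv')
with open(X_train, 'w', newline='') as outputFile:  # newline='' prevents blank rows on Windows
    csvOutputFile = csv.writer(outputFile)
    csvOutputFile.writerow(headerRow)
    csvOutputFile.writerows(X[0:trainingLength])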
Example #3
def splitDataset(data, name, fileCategory):

    # uses slicing, one of the most useful and least-well-known features of scipy sparse matrices
    # you pass in a list of row indices you want to keep, and it will create a sliced copy that includes only those rows
    # slicing also works on column indices
    # callout to the person who first opened my eyes to them:
    # http://stackoverflow.com/questions/13352280/slicing-sparse-matrices-in-scipy-which-types-work-best

    # if this "sparse" matrix only has a single value for each row, we have to treat it as a column matrix, and slice it accordingly
    # this is the case for our idColumn, and frequently our y values as well.
    if data.shape[0] == 1:
        validation = data[:, validationIndices]
        trainingData = data[:, trainingIndices]

    else:
        validation = data[validationIndices, :]
        trainingData = data[trainingIndices, :]

    # ntpath theoretically works really well across systems
    name = ntpath.basename(name)
    # remove the file extension (this slice assumes a three-character extension such as .csv)
    name = name[0:-4]

    validationFile = path.join(outputDirectory, name + "validationData.npz")
    trainingDataFile = path.join(outputDirectory, name + "trainingData.npz")

    save_sparse_csr(trainingDataFile, trainingData)
    save_sparse_csr(validationFile, validation)

    # send the file names back to the parent process, where we aggregate and save them
    fileNameDict = {fileCategory + "trainingData": trainingDataFile, fileCategory + "validationData": validationFile}
    messageParent(fileNameDict, "splitFileNames")
Example #4
def writeMetadata(y, idColumn, args, headerRow, validationSplitColumn, hasCustomValidationSplit):

    # these are the file names (with full file paths) that we will be writing to
    y_train = path.join(args["outputFolder"], "y_train_" + args["trainingPrettyName"] + ".npz")
    id_train = path.join(args["outputFolder"], "id_train_" + args["trainingPrettyName"] + ".npz")
    id_test = path.join(
        args["outputFolder"], "id_test_" + args["testingPrettyName"] + args["trainingPrettyName"] + ".npz"
    )
    validation_split_column = path.join(
        args["outputFolder"], "validation_split_column_" + args["trainingPrettyName"] + ".npz"
    )

    trainingLength = args["trainingLength"]

    # convert all our data to np arrays, and break apart based on whether it's in the training data or not
    idTrainData = np.array(idColumn[0:trainingLength])
    idTestData = np.array(idColumn[trainingLength:])
    y = np.array(y)
    validationSplitColumnData = np.array(validationSplitColumn)

    # if our values are not already stored as numbers, convert them to numbers
    try:
        ySparse = csr_matrix(y)
    except:
        yInt = [float(i) for i in y[0:trainingLength]]
        ySparse = csr_matrix(yInt)

    try:
        idTrainSparse = csr_matrix(idTrainData)
    except:
        idTrainInt = [float(i) for i in idTrainData]
        idTrainSparse = csr_matrix(idTrainInt)

    try:
        idTestSparse = csr_matrix(idTestData)
    except:
        idTestInt = [float(i) for i in idTestData]
        idTestSparse = csr_matrix(idTestInt)

    try:
        validationSplitSparse = csr_matrix(validationSplitColumnData)
    except:
        validationSplitInt = [float(i) for i in validationSplitColumnData]
        validationSplitSparse = csr_matrix(validationSplitInt)

    save_sparse_csr(y_train, ySparse)
    save_sparse_csr(id_train, idTrainSparse)
    save_sparse_csr(id_test, idTestSparse)
    save_sparse_csr(validation_split_column, validationSplitSparse)

    fileNames = {
        "y_train": y_train,
        "id_train": id_train,
        "id_test": id_test,
        "idHeader": args["idHeader"],
        "outputHeader": args["outputHeader"],
        "validation_split_column": validation_split_column,
        "hasCustomValidationSplit": hasCustomValidationSplit,
    }
    messageParent(fileNames, "fileNames")
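
These functions all rely on a save_sparse_csr helper that is defined elsewhere in the project and not shown in these examples. A common way to write that helper and its loading counterpart, following the widely used .npz idiom (this is an assumption about its shape, not the project's actual definition):

import numpy as np
from scipy.sparse import csr_matrix

def save_sparse_csr(filename, array):
    # persist the three internal CSR arrays plus the shape in one .npz file
    np.savez(filename, data=array.data, indices=array.indices,
             indptr=array.indptr, shape=array.shape)

def load_sparse_csr(filename):
    loader = np.load(filename)
    return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=loader['shape'])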
Example #5
def writeDataDense(X, args, headerRow, nn):

    # grab the name of the training and testing files from the full path to those datasets
    trainingFileName = args["trainingPrettyName"] + ".csv"
    testingFileName = args["testingPrettyName"] + args["trainingPrettyName"] + ".csv"

    if nn:
        trainingFileName = "nn_" + trainingFileName
        testingFileName = "nn_" + testingFileName

    # save the file names into variables; we will use them to create the file and in the fileNames hash messaged out to the parent.
    X_train = path.join(args["outputFolder"], "X_train_" + trainingFileName)
    X_test = path.join(args["outputFolder"], "X_test_" + testingFileName)

    with open(X_train, "w+") as outputFile:
        csvOutputFile = csv.writer(outputFile)
        csvOutputFile.writerow(headerRow)
        # grab only the rows that were part of our training file from the combined X dataset
        csvOutputFile.writerows(X[0 : args["trainingLength"]])

    with open(X_test, "w+") as outputFile:
        csvOutputFile = csv.writer(outputFile)
        csvOutputFile.writerow(headerRow)
        # grab the rest of the rows from our X dataset, which comprise the testing dataset
        csvOutputFile.writerows(X[args["trainingLength"] :])

    if nn:
        fileNames = {"X_train_nn": X_train, "X_test_nn": X_test}
    else:
        fileNames = {"X_train": X_train, "X_test": X_test}

    messageParent(fileNames, "fileNames")
Example #6
def writeDataSparse(X, args, headerRow, nn):

    # grab the name of the training and testing files from the full path to those datasets
    trainingFileName = args['trainingPrettyName'] + '.npz'
    testingFileName = args['testingPrettyName'] + args[
        'trainingPrettyName'] + '.npz'

    if type(nn) is not bool:
        trainingFileName = 'nn_' + trainingFileName
        testingFileName = 'nn_' + testingFileName
        yFileName = 'y_train_' + 'nn_' + args['trainingPrettyName'] + '.npz'
        y_train = path.join(args['outputFolder'], yFileName)

        y = np.array(nn)
        y = [float(i) for i in y]

        # if our values are not already stored as numbers, convert them to numbers
        try:
            ySparse = csr_matrix(y)
            printParent('successfully turned y into a sparse matrix!')
        except:
            yInt = [float(i) for i in y[0:args['trainingLength']]]
            ySparse = csr_matrix(yInt)

        save_sparse_csr(y_train, ySparse)

    # save the file names into variables; we will use them to create the file and in the fileNames hash messaged out to the parent.
    X_train = path.join(args['outputFolder'], 'X_train_' + trainingFileName)
    X_test = path.join(args['outputFolder'], 'X_test_' + testingFileName)

    # scipy sparse matrices need a list of indices to slice
    # http://stackoverflow.com/questions/13352280/slicing-sparse-matrices-in-scipy-which-types-work-best
    trainRange = range(args['trainingLength'])
    testRange = range(args['trainingLength'],
                      args['trainingLength'] + args['testingLength'])

    save_sparse_csr(X_train, X[trainRange, :])
    save_sparse_csr(X_test, X[testRange, :])

    if type(nn) is not bool:
        fileNames = {
            'X_train_nn': X_train,
            'X_test_nn': X_test,
            'y_train_nn': y_train
        }
    else:
        fileNames = {'X_train': X_train, 'X_test': X_test}
    messageParent(fileNames, 'fileNames')
Example #7
def writeMetadataDense(y, idColumn, args, headerRow):
    # grab the name of the training and testing files from the full path to those datasets

    # save the file names into variables; we will use them to create the file and in the fileNames hash messaged out to the parent.
    y_train = path.join(args['outputFolder'],
                        'y_train_' + args['trainingPrettyName'] + '.csv')
    id_train = path.join(args['outputFolder'],
                         'id_train_' + args['trainingPrettyName'] + '.csv')
    id_test = path.join(
        args['outputFolder'], 'id_test_' + args['testingPrettyName'] +
        args['trainingPrettyName'] + '.csv')

    with open(y_train, 'w+') as outputFile:
        csvOutputFile = csv.writer(outputFile)

        # write the pretty name for the header row to the output file
        csvOutputFile.writerow([args['outputHeader']])

        # grab only the rows that were part of our training file from the combined dataset
        for rowIdx, row in enumerate(y):
            if (rowIdx < args['trainingLength']):
                csvOutputFile.writerow([row])

    with open(id_train, 'w+') as outputFile:
        csvOutputFile = csv.writer(outputFile)

        # write the pretty name for the header row to the output file
        csvOutputFile.writerow([args['idHeader']])

        # grab only the rows that were part of our training file from the combined dataset
        for rowIdx, row in enumerate(idColumn):
            if (rowIdx < args['trainingLength']):
                csvOutputFile.writerow([row])

    with open(id_test, 'w+') as outputFile:
        csvOutputFile = csv.writer(outputFile)

        # write the pretty name for the header row to the output file
        csvOutputFile.writerow([args['idHeader']])

        # grab only the rows that were part of our testing file from the combined dataset
        for rowIdx, row in enumerate(idColumn):
            if (rowIdx >= args['trainingLength']):
                csvOutputFile.writerow([row])

    fileNames = {'y_train': y_train, 'id_train': id_train, 'id_test': id_test}
    messageParent(fileNames, 'fileNames')
Example #8
def writeMetadataDense(y, idColumn, args, headerRow):
    # grab the name of the training and testing files from the full path to those datasets

    # save the file names into variables; we will use them to create the file and in the fileNames hash messaged out to the parent.
    y_train = path.join(args["outputFolder"], "y_train_" + args["trainingPrettyName"] + ".csv")
    id_train = path.join(args["outputFolder"], "id_train_" + args["trainingPrettyName"] + ".csv")
    id_test = path.join(
        args["outputFolder"], "id_test_" + args["testingPrettyName"] + args["trainingPrettyName"] + ".csv"
    )

    with open(y_train, "w+") as outputFile:
        csvOutputFile = csv.writer(outputFile)

        # write the pretty name for the header row to the output file
        csvOutputFile.writerow([args["outputHeader"]])

        # grab only the rows that were part of our training file from the combined dataset
        for rowIdx, row in enumerate(y):
            if rowIdx < args["trainingLength"]:
                csvOutputFile.writerow([row])

    with open(id_train, "w+") as outputFile:
        csvOutputFile = csv.writer(outputFile)

        # write the pretty name for the header row to the output file
        csvOutputFile.writerow([args["idHeader"]])

        # grab only the rows that were part of our training file from the combined dataset
        for rowIdx, row in enumerate(idColumn):
            if rowIdx < args["trainingLength"]:
                csvOutputFile.writerow([row])

    with open(id_test, "w+") as outputFile:
        csvOutputFile = csv.writer(outputFile)

        # write the pretty name for the header row to the output file
        csvOutputFile.writerow([args["idHeader"]])

        # grab only the rows that were part of our testing file from the combined dataset
        for rowIdx, row in enumerate(idColumn):
            if rowIdx >= args["trainingLength"]:
                csvOutputFile.writerow([row])

    fileNames = {"y_train": y_train, "id_train": id_train, "id_test": id_test}
    messageParent(fileNames, "fileNames")
Example #9
def writeDataSparse(X, args, headerRow, nn):

    # grab the name of the training and testing files from the full path to those datasets
    trainingFileName = args["trainingPrettyName"] + ".npz"
    testingFileName = args["testingPrettyName"] + args["trainingPrettyName"] + ".npz"

    if type(nn) is not bool:
        trainingFileName = "nn_" + trainingFileName
        testingFileName = "nn_" + testingFileName
        yFileName = "y_train_" + "nn_" + args["trainingPrettyName"] + ".npz"
        y_train = path.join(args["outputFolder"], yFileName)

        y = np.array(nn)
        y = [float(i) for i in y]

        # if our values are not already stored as numbers, convert them to numbers
        try:
            ySparse = csr_matrix(y)
            printParent("successfully turned y into a sparse matrix!")
        except:
            yInt = [float(i) for i in y[0 : args["trainingLength"]]]
            ySparse = csr_matrix(yInt)

        save_sparse_csr(y_train, ySparse)

    # save the file names into variables; we will use them to create the file and in the fileNames hash messaged out to the parent.
    X_train = path.join(args["outputFolder"], "X_train_" + trainingFileName)
    X_test = path.join(args["outputFolder"], "X_test_" + testingFileName)

    # scipy sparse matrices need a list of indices to slice
    # http://stackoverflow.com/questions/13352280/slicing-sparse-matrices-in-scipy-which-types-work-best
    trainRange = range(args["trainingLength"])
    testRange = range(args["trainingLength"], args["trainingLength"] + args["testingLength"])

    save_sparse_csr(X_train, X[trainRange, :])
    save_sparse_csr(X_test, X[testRange, :])

    if type(nn) is not bool:
        fileNames = {"X_train_nn": X_train, "X_test_nn": X_test, "y_train_nn": y_train}
    else:
        fileNames = {"X_train": X_train, "X_test": X_test}
    messageParent(fileNames, "fileNames")
Example #10
def writeMetadata(y, idColumn, args, headerRow, validationSplitColumn,
                  hasCustomValidationSplit):

    # these are the file names (with full file paths) that we will be writing to
    y_train = path.join(args['outputFolder'],
                        'y_train_' + args['trainingPrettyName'] + '.npz')
    id_train = path.join(args['outputFolder'],
                         'id_train_' + args['trainingPrettyName'] + '.npz')
    id_test = path.join(
        args['outputFolder'], 'id_test_' + args['testingPrettyName'] +
        args['trainingPrettyName'] + '.npz')
    validation_split_column = path.join(
        args['outputFolder'],
        'validation_split_column_' + args['trainingPrettyName'] + '.npz')

    trainingLength = args['trainingLength']

    # convert all our data to np arrays, and break apart based on whether it's in the training data or not
    idTrainData = np.array(idColumn[0:trainingLength])
    idTestData = np.array(idColumn[trainingLength:])
    y = np.array(y)
    validationSplitColumnData = np.array(validationSplitColumn)

    # if our values are not already stored as numbers, convert them to numbers
    try:
        ySparse = csr_matrix(y)
    except:
        yInt = [float(i) for i in y[0:trainingLength]]
        ySparse = csr_matrix(yInt)

    try:
        idTrainSparse = csr_matrix(idTrainData)
    except:
        idTrainInt = [float(i) for i in idTrainData]
        idTrainSparse = csr_matrix(idTrainInt)

    try:
        idTestSparse = csr_matrix(idTestData)
    except:
        idTestInt = [float(i) for i in idTestData]
        idTestSparse = csr_matrix(idTestInt)

    try:
        validationSplitSparse = csr_matrix(validationSplitColumnData)
    except:
        validationSplitInt = [float(i) for i in validationSplitColumnData]
        validationSplitSparse = csr_matrix(validationSplitInt)

    save_sparse_csr(y_train, ySparse)
    save_sparse_csr(id_train, idTrainSparse)
    save_sparse_csr(id_test, idTestSparse)
    save_sparse_csr(validation_split_column, validationSplitSparse)

    fileNames = {
        'y_train': y_train,
        'id_train': id_train,
        'id_test': id_test,
        'idHeader': args['idHeader'],
        'outputHeader': args['outputHeader'],
        'validation_split_column': validation_split_column,
        'hasCustomValidationSplit': hasCustomValidationSplit
    }
    messageParent(fileNames, 'fileNames')
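
The try/except pairs above exist because csr_matrix cannot construct a numeric sparse matrix from string data, which is how id and y columns often arrive after being read from a CSV; the fallback converts every value to a float first. A small, self-contained illustration with invented data:

import numpy as np
from scipy.sparse import csr_matrix

idTrainData = np.array(['101', '102', '103'])  # ids read from a CSV arrive as strings
try:
    idTrainSparse = csr_matrix(idTrainData)
except Exception:
    # mirror the fallback used above: convert to numbers, then build the sparse matrix
    idTrainInt = [float(i) for i in idTrainData]
    idTrainSparse = csr_matrix(idTrainInt)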
Example #11
# when doing the cross-validated search, we have potentially been holding out a significant portion of the dataset
# once we have found the best hyperparameters, train on the entire dataset
# we have already verified that this is the best set of hyperparameters using cross-validation
if X.shape[0] != X_train.shape[0] or extendedTraining:
    longTrainClassifier.fit(X, y)

    finishLongTrainTime = time.time()
    printParent(classifierName + "'s training on the longer data set took (in minutes):")
    printParent(round((finishLongTrainTime - startLongTrainTime) / 60, 1))


longTrainClassifierScore = longTrainClassifier.score(X, y)
printParent(classifierName + "'s score against the larger training data set is:")
printParent(longTrainClassifierScore)
messageObj['longTrainScore'] = longTrainClassifierScore


# save our classifiers from the validationRound to a separate folder
if globalArgs['validationRound']:
    classifierFolder = path.join(globalArgs['bestClassifiersFolder'], 'ensemblingAlgos', 'best' + classifierName)
else:
    classifierFolder = path.join(globalArgs['bestClassifiersFolder'], 'best' + classifierName)

if not os.path.exists(classifierFolder):
    os.makedirs(classifierFolder)

joblib.dump(longTrainClassifier, path.join(classifierFolder, 'best' + classifierName + '.pkl'))

messageParent(messageObj, 'trainingResults')
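
For completeness, a model persisted with joblib.dump like this can be restored later with joblib.load. A hedged sketch with a hypothetical folder and classifier name (older scikit-learn versions exposed joblib as sklearn.externals.joblib, newer code imports it directly):

import joblib  # or: from sklearn.externals import joblib on older scikit-learn versions
from os import path

classifierFolder = 'bestClassifiers/bestclRandomForest'  # hypothetical path
classifierName = 'clRandomForest'                        # hypothetical name

longTrainClassifier = joblib.load(path.join(classifierFolder, 'best' + classifierName + '.pkl'))
# predictions = longTrainClassifier.predict(X_new)  # X_new: features shaped like the training data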
Example #12
def format( X, y, idColumn, args ):
    X = min_max_scaler.fit_transform( X ).tolist()

    brainArr = []
    for rowIndex, row in enumerate(X):

        # brainJS expects a very particular format for each row:
            # an object with input and output properties, each of which is an array
        rowObj = {}

        # we might need to wrap the output in a list if it is a single number, as ours is
        rowObj['output'] = []

        # if a y value exists for this rowIndex, grab it!
        # otherwise, it's likely because the user did not pass in a column for y values for the output dataset, so we'll set it equal to the empty string instead
        try:
            # grab the output value from the y dataset saved earlier
            yRow = y[ rowIndex ]
        except:
            yRow = ""

        # designed to handle the possibility of multiple output columns
        if( isinstance( yRow, list )):
            rowObj['output'].extend( yRow )
        else:
            # the output value is expected to be a list, so if y values are not lists, then we need to wrap the y value in a list
            rowObj['output'].append( yRow )

        rowObj[ 'id' ] = idColumn[ rowIndex ]

        for idx, val in enumerate(row):
            # python saves floats in their binary representation, which gets written to file in scientific notation
            # this scientific notation is not necessarily json compatible
            # so we format these values as a string
            # unfortunately, these strings will only have 6 decimal places, but that still allows 6 orders of magnitude of differentiation
            # it would be a rather surprising neural network that could draw meaningful distinctions between values 7 orders of magnitude below the max value and values 6 orders of magnitude below it
            # in our case, both just get saved as 0
            row[idx] = '{:f}'.format(val)
            
        rowObj['input'] = row
        brainArr.append( rowObj )

    trainingFileName = path.split( args['trainingData'] )[ -1 ]
    testingFileName = path.split( args['testingData'] )[ -1 ]

    brainJS_train = path.join( args['outputFolder'], 'brainJS_train_' + trainingFileName )
    brainJS_test = path.join( args['outputFolder'], 'brainJS_test_' + testingFileName )

    with open( brainJS_train, 'w+') as outputFile:
        csvOutputFile = csv.writer(outputFile)
        for rowIndex, row in enumerate(brainArr):
            if( rowIndex < args['trainingLength'] ):
                # csvWriter.writerow expects each row to be a list
                # since our rows are actually just dictionaries, we need to wrap it in a list each time so the writer knows this is a single row
                csvOutputFile.writerow( [ json.dumps( row ) ] )

    with open( brainJS_test, 'w+') as outputFile:
        csvOutputFile = csv.writer(outputFile)
        for rowIndex, row in enumerate(brainArr):
            if( rowIndex >= args['trainingLength'] ):
                # serialize with json.dumps here as well, mirroring the training file above
                csvOutputFile.writerow( [ json.dumps( row ) ] )

    fileNames = {
        'brainJS_train': brainJS_train,
        'brainJS_test': brainJS_test
    }
    messageParent( fileNames, 'fileNames' )

    return brainArr
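
To make the target format concrete, this is roughly what one serialized row from the loop above looks like; the values are invented, and the extra 'id' key comes from this code rather than from brain.js itself:

import json

rowObj = {
    'output': [1.0],
    'id': 17,
    'input': ['0.250000', '0.000000', '1.000000'],
}
print(json.dumps(rowObj))
# {"output": [1.0], "id": 17, "input": ["0.250000", "0.000000", "1.000000"]}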