Example #1
def joinTrainValid(trainData, validData):
    info("Joining train and validation data", ind=6)
    validVal = Series(repeat(0, trainData.shape[0]))
    trainData = trainData.assign(isValid=validVal.values)
    info("Train data has size " + getDim(trainData), ind=6)
    validVal = Series(repeat(1, validData.shape[0]))
    validData = validData.assign(isValid=validVal.values)
    info("Validation data has size " + getDim(validData), ind=6)
    pddf = trainData.append(validData)
    info("Combined data has size " + getDim(pddf), ind=6)
    return pddf
Example #2
def joinTrainTest(trainData, testData):
    info("Joining train and test data", ind=6)
    trainVal = Series(repeat(1, trainData.shape[0]))
    trainData = trainData.assign(isTrain=trainVal.values)
    info("Train data has size " + getDim(trainData), ind=6)
    testVal = Series(repeat(0, testData.shape[0]))
    testData = testData.assign(isTrain=testVal.values)
    info("Test data has size " + getDim(testData), ind=6)
    pddf = trainData.append(testData)
    info("Combined data has size " + getDim(pddf), ind=6)
    return pddf
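
Both helpers above follow the same pattern: add a 0/1 flag column to each frame, then concatenate. Note that DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; a minimal, standalone sketch of the same pattern with pd.concat is below. The function name join_with_flag and the toy frames are illustrative, and the project's info/getDim logging helpers are omitted.

# Flag-and-concatenate sketch using pd.concat instead of DataFrame.append.
import pandas as pd

def join_with_flag(first, second, flag_col="isTrain"):
    # assign returns new frames, so the callers' data is left untouched
    first = first.assign(**{flag_col: 1})
    second = second.assign(**{flag_col: 0})
    return pd.concat([first, second], ignore_index=True)

if __name__ == "__main__":
    train = pd.DataFrame({"x": [1, 2, 3]})
    test = pd.DataFrame({"x": [4, 5]})
    combined = join_with_flag(train, test)
    print(combined)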
Example #3
def getTrainData(config):
    X_trainName, X_testName, X_validName, y_trainName, y_testName, y_validName = getTrainTestNames(
        config)
    if isFile(X_trainName) and isFile(y_trainName):
        info("Loading {0}".format(X_trainName), ind=4)
        X_train = getJoblib(X_trainName)
        info("Found data that is {0}".format(getDim(X_train)), ind=4)

        info("Loading {0}".format(y_trainName), ind=4)
        y_train = getJoblib(y_trainName)
        info("Found data that is {0}".format(getDim(y_train)), ind=4)
        return X_train, y_train
    else:
        error("Train data is not ready")
        return None
Example #4
def testModel(modelname, estimator, X_test, config):  
    info("Testing a {0} estimator".format(modelname), ind=0)
    info("X data is {0}".format(getDim(X_test)), ind=2)
    
    problemType = config['problem']
    results = {"good": True, "label": None, "prob": None, "pred": None}
    
    if isinstance(estimator, dict):
        estimator = estimator['estimator']
        
    
    if estimator is None:
        error("The {0} estimator is NULL".format(modelname))
        results['good'] = False
        return results
    
    
    if isClassification(problemType):
        info("Predicting classification labels/classes for {0}".format(modelname), ind=4)
        try:
            results['label'] = estimator.predict(X_test)
        except Exception:
            results['good'] = False
            error("There is a problem getting labels for {0}".format(modelname), ind=4)
        
        info("Predicting classification probabilities for {0}".format(modelname), ind=4)
        try:
            proba = estimator.predict_proba(X_test)
            results['prob'] = proba[:,1]
        except Exception:
            results['good'] = False
            error("There is a problem getting probabilities for {0}".format(modelname), ind=4)
            

    if isRegression(problemType):
        info("Predicting regression score/output for {0}".format(modelname), ind=4)
        try:
            results['pred'] = estimator.predict(X_test)
        except Exception:
            results['good'] = False
            error("There is a problem getting prediction for {0}".format(modelname), ind=4)


    if results['good']:
        info("Everything looks good for the {0} estimator".format(modelname), ind=4)
    else:        
        info("There is a problem with the {0} estimator".format(modelname), ind=4)


    return results
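
testModel is a thin wrapper around an estimator's predict and predict_proba calls. The sketch below shows those two calls on a fitted scikit-learn classifier for a toy binary problem; the [:, 1] slice is the positive-class probability, which assumes binary classification, and the dataset/estimator choices are illustrative only.

# predict / predict_proba on a fitted scikit-learn classifier.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

estimator = LogisticRegression(max_iter=1000).fit(X_train, y_train)

labels = estimator.predict(X_test)       # hard class labels
proba = estimator.predict_proba(X_test)  # shape (n_samples, 2) for binary problems
positive_prob = proba[:, 1]              # probability of class 1

print(labels[:5], positive_prob[:5])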
Example #5
def readData(config):
    info("Getting data for analysis")

    ## Get name
    name = config['name']

    ## Load the data we need
    if name == "uptake":
        pddf = readUptake(config)
    elif name == "kdd99":
        pddf = readKDD99(config)
    elif name in ["boston", "diabetes", "wine", "digits", "cancer"]:
        pddf = readDataset(config, name)
    elif name == "regression":
        pddf = makeRegression(config)
    elif name == "classification":
        pddf = makeClassification(config)
    else:
        raise ValueError("Name", name, "not recognized in readData()")

    info("Using data that is " + getDim(pddf), ind=0)
    return pddf
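
readData dispatches by name to project-specific readers. For the scikit-learn bundled datasets named in the middle branch, a loader can be sketched as below; read_dataset is a hypothetical stand-in for readDataset, only a subset of the names is covered, and as_frame=True (scikit-learn >= 0.23) is used to get pandas objects directly.

# Name-based dispatch to scikit-learn's bundled datasets (illustrative only).
from sklearn.datasets import load_wine, load_diabetes, load_breast_cancer

def read_dataset(name):
    loaders = {
        "wine": load_wine,
        "diabetes": load_diabetes,
        "cancer": load_breast_cancer,
    }
    if name not in loaders:
        raise ValueError("Name {0} not recognized".format(name))
    bunch = loaders[name](as_frame=True)
    return bunch.frame  # features plus a 'target' column

pddf = read_dataset("wine")
print(pddf.shape)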
Example #6
def getEncodedData(pddata):
    info('Convert Categorical Data To Integer', ind=4)


    ## label encode data
    labelEncoders, results = getLabelEncoders(pddata)


    ## create data frame of categorical features
    encodedCatData = pd.DataFrame({cat_colname: encoded for cat_colname, label_encoder, encoded in results})


    ## drop columns
    info('Dropping original '+getNcols(encodedCatData, asStr=True)+' columns', ind=6)
    dropEncodedColumns(pddata, encodedCatData.columns)
    info('Original data is now '+getDim(pddata), ind=6)


    ## join to original data
    #info('Joining encoded data', ind=6)
    #encodedData = encodedCatData.join(pddata)

    
    return pddata, encodedCatData, labelEncoders
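
getEncodedData relies on getLabelEncoders to produce one encoded array per categorical column. A standalone sketch of that per-column step with sklearn's LabelEncoder is below; selecting categorical columns by dtype and the function name label_encode_categoricals are assumptions, since the project helper is not shown.

# Per-column label encoding of categorical features.
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def label_encode_categoricals(pddata):
    encoders = {}
    encoded = {}
    for colname in pddata.select_dtypes(include=["object", "category"]).columns:
        le = LabelEncoder()
        encoded[colname] = le.fit_transform(pddata[colname].astype(str))
        encoders[colname] = le
    return encoders, pd.DataFrame(encoded, index=pddata.index)

df = pd.DataFrame({"color": ["red", "blue", "red"], "size": [1, 2, 3]})
encoders, encoded_cats = label_encode_categoricals(df)
print(encoded_cats)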
Example #7
def loadTrainTestData(config):
    X_trainName, X_testName, X_validName, y_trainName, y_testName, y_validName = getTrainTestNames(
        config)
    if all([
            isFile(X_trainName),
            isFile(X_testName),
            isFile(X_validName),
            isFile(y_trainName),
            isFile(y_testName),
            isFile(y_validName)
    ]):
        info("Loading saved final train/test datasets.", ind=2)

        info("Loading {0}".format(X_trainName), ind=4)
        X_train = getJoblib(X_trainName)
        info("Found data that is {0}".format(getDim(X_train)), ind=4)
        info("Loading {0}".format(X_testName), ind=4)
        X_test = getJoblib(X_testName)
        info("Found data that is {0}".format(getDim(X_test)), ind=4)
        info("Loading {0}".format(X_validName), ind=4)
        X_valid = getJoblib(X_validName)
        info("Found data that is {0}".format(getDim(X_valid)), ind=4)

        info("Loading {0}".format(y_trainName), ind=4)
        y_train = getJoblib(y_trainName)
        info("Found data that is {0}".format(getDim(y_train)), ind=4)
        info("Loading {0}".format(y_testName), ind=4)
        y_test = getJoblib(y_testName)
        info("Found data that is {0}".format(getDim(y_test)), ind=4)
        info("Loading {0}".format(y_validName), ind=4)
        y_valid = getJoblib(y_validName)
        info("Found data that is {0}".format(getDim(y_valid)), ind=4)

        return X_train, X_test, X_valid, y_train, y_test, y_valid
    else:
        error("Train/test datasets are not ready!")
Example #8
def getTrainTestData(pddf, config):
    info("Creating final train/test datasets.", ind=0)

    ## Config info
    targetConfig = config['target']
    targetcol = targetConfig['colname']
    outputConfig = config['output']
    compress = outputConfig['compress']

    if not isColumn(pddf, targetcol):
        raise ValueError("Target column", targetcol,
                         "is not included in data!")

    ## Determine whether the data arrived already split (separate train/test files)
    isSplit = False
    isValid = False
    if isColumn(pddf, "isTrain"):
        info("Data is already split", ind=2)
        isSplit = True
    elif isColumn(pddf, "isValid"):
        info("Validation data is ready, but train/test data must be created",
             ind=2)
        isValid = True
    else:
        info("Train/test data must be created", ind=2)

    ## Create data if it's split
    if isSplit:
        info("Splitting train data", ind=2)
        X_train = pddf[pddf['isTrain'] == 1].copy()
        y_train = X_train[targetcol]
        X_train.drop(labels=[targetcol, 'isTrain'], axis=1, inplace=True)

        info("Splitting test data", ind=2)
        X_test = pddf[pddf['isTrain'] == 0].copy()
        y_test = X_test[targetcol]
        X_test.drop(labels=[targetcol, 'isTrain'], axis=1, inplace=True)

        X_valid = None
        y_valid = None
    elif isValid:
        info("Splitting validation data", ind=2)
        X_valid = pddf[pddf['isValid'] == 1]
        y_valid = X_valid[targetcol]

        info("Creating train/test data that contains validated data", ind=2)
        X_data = pddf[pddf['isValid'] == 0]
        y = X_data[targetcol]
        X_train, X_test, y_train, y_test = train_test_split(X_data,
                                                            y,
                                                            test_size=0.2)
    else:
        info("Creating train/test data that is not already split or validated",
             ind=2)
        y = pddf[targetcol]
        pddf.drop(labels=[targetcol], axis=1, inplace=True)
        X_train, X_test, y_train, y_test = train_test_split(pddf,
                                                            y,
                                                            test_size=0.2)
        X_valid = None
        y_valid = None

    if isSplit:
        info("Dropping {0} from DataFrame".format(", ".join(
            [targetcol, 'isTrain'])))
        pddf.drop(labels=[targetcol, 'isTrain'], axis=1, inplace=True)
    elif isValid:
        info("Dropping {0} from DataFrame".format(", ".join(
            [targetcol, 'isValid'])))
        pddf.drop(labels=[targetcol, 'isValid'], axis=1, inplace=True)

    X_trainName, X_testName, X_validName, y_trainName, y_testName, y_validName = getTrainTestNames(
        config)

    info("Saving {0} data to {1}".format(getDim(X_train), X_trainName), ind=4)
    saveJoblib(X_trainName, X_train, compress)
    info("Saving {0} data to {1}".format(getDim(X_test), X_testName), ind=4)
    saveJoblib(X_testName, X_test, compress)
    info("Saving {0} data to {1}".format(getDim(X_valid), X_validName), ind=4)
    saveJoblib(X_validName, X_valid, compress)

    info("Saving {0} data to {1}".format(getDim(y_train), y_trainName), ind=4)
    saveJoblib(y_trainName, y_train, compress)
    info("Saving {0} data to {1}".format(getDim(y_test), y_testName), ind=4)
    saveJoblib(y_testName, y_test, compress)
    info("Saving {0} data to {1}".format(getDim(y_valid), y_validName), ind=4)
    saveJoblib(y_validName, y_valid, compress)

    return X_train, X_test, X_valid, y_train, y_test, y_valid
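
The core of getTrainTestData is a train_test_split followed by persisting each piece with the project's saveJoblib helper. Assuming saveJoblib wraps joblib.dump, a minimal split-and-persist sketch looks like this; the toy frame, file names, and compress level are placeholders.

# Split a DataFrame on its target column and persist the pieces with joblib.
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split

pddf = pd.DataFrame({"f1": range(10), "f2": range(10, 20), "target": [0, 1] * 5})

y = pddf["target"]
X = pddf.drop(columns=["target"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

joblib.dump(X_train, "X_train.joblib", compress=3)
joblib.dump(y_train, "y_train.joblib", compress=3)
print(X_train.shape, X_test.shape)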
Example #9
def formatData(trainData, testData, config):
    info('Formatting training data of size ' + getDim(trainData), ind=0)
    info('Formatting testing data of size ' + getDim(testData), ind=0)

    ## Config info
    targetConfig = config['target']
    targetcol = targetConfig['colname']
    positiveTarget = targetConfig['positive']
    targetNAstrategy = targetConfig['NAstrategy']
    featureConfig = config['feature']
    featureNAstrategy = featureConfig['NAstrategy']

    if not isColumn(trainData, targetcol):
        raise ValueError("Target column", targetcol, "is not a valid column.")

    # 1) Get problem type
    targetData = trainData[targetcol]
    if config.get('problem'):
        problemType = config['problem']
    else:
        problemType = getProblemType(targetData)
        config['problem'] = problemType

    # 2) format target based on what we want
    info('Formatting target', ind=1)
    if isClassification(problemType):
        convertToBinaryInt(trainData, targetcol, positiveTarget)
        if isColumn(testData, targetcol):
            convertToBinaryInt(testData, targetcol, positiveTarget)
    if isRegression(problemType):
        info('Not formatting target since it is regression', ind=1)

    # 3) replace NA
    info('Replace NA in data', ind=1)
    info('Feature NA strategy is {0}'.format(featureNAstrategy), ind=2)
    replaceTargetNA(trainData, targetcol, targetNAstrategy)
    replaceFeatureNA(trainData, targetcol, featureNAstrategy)
    if isColumn(testData, targetcol):
        replaceTargetNA(testData, targetcol, targetNAstrategy)
    replaceFeatureNA(testData, targetcol, featureNAstrategy)

    # 4) drop columns we don't need
    dropData(trainData, config)
    dropData(testData, config)

    # 5) format remaining data to numeric
    info('Formatting features to numeric', ind=1)
    convertCategoricalToNumeric(trainData, targetcol)
    convertCategoricalToNumeric(testData, targetcol)
    info('Post formatting the training data is now ' + getDim(trainData),
         ind=2)
    info('Post formatting the testing data is now ' + getDim(testData), ind=2)

    #pddata.drop([colname], axis = 1, inplace = True)
    #pddata = pddata.join(expData)

    # 6) replace low variance
    info('Remove low variance features in data', ind=1)

    info('Finished formatting data', ind=0)

    return trainData, testData
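
For classification problems, formatData converts the target to a binary integer via convertToBinaryInt. That helper is not shown; a hedged sketch of the usual conversion (map the positive label to 1, everything else to 0) is below, with convert_to_binary_int, the column, and the labels as illustrative names only.

# Binary target conversion: positive label -> 1, everything else -> 0.
import pandas as pd

def convert_to_binary_int(pddata, targetcol, positive_target):
    pddata[targetcol] = (pddata[targetcol] == positive_target).astype(int)

df = pd.DataFrame({"churn": ["yes", "no", "yes", "no"]})
convert_to_binary_int(df, "churn", "yes")
print(df["churn"].tolist())  # [1, 0, 1, 0]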
Example #10
def formatData(pddf, config):
    info('Formatting data of size ' + getDim(pddf), ind=0)

    ## Config info
    targetConfig = config['target']
    targetcol = targetConfig['colname']
    positiveTarget = targetConfig['positive']
    targetNAstrategy = targetConfig['NAstrategy']
    featureConfig = config['feature']
    featureNAstrategy = featureConfig['NAstrategy']

    if not isColumn(pddf, targetcol):
        raise ValueError("Target column", targetcol, "is not a valid column.")

    # 1) Get problem type
    targetData = pddf[targetcol]
    if config.get('problem'):
        problemType = config['problem']
    else:
        problemType = getProblemType(targetData)
        config['problem'] = problemType

    # 2) format target based on what we want
    info('Formatting target', ind=2)
    if isClassification(problemType):
        convertToBinaryInt(pddf, targetcol, positiveTarget)
    if isRegression(problemType):
        info('Not formatting target since it is regression', ind=1)

    # 3) replace NA
    info('Replace NA in data', ind=2)
    replaceTargetNA(pddf, targetcol, targetNAstrategy)
    replaceFeatureNA(pddf, targetcol, featureNAstrategy)

    # 4) remove low variance data
    info('Remove low variance in data', ind=2)

    # 5) drop columns we don't need
    info('Analyze data for possible drops', ind=2)
    analyzeColumns(pddf, config)
    dropData(pddf, config)
    info('Post column drops the data is now ' + getDim(pddf), ind=2)

    # 6) label and one-hot encode data
    info('Label encode training data to numeric', ind=2)
    pddf, encodedCatData, labelEncoders = getEncodedData(pddf)
    info('Hot encode training data to sparse data frame', ind=1)
    encodedData = getHotEncodedData(encodedCatData, labelEncoders)
    info('Join training data together', ind=2)
    pddf = pddf.join(encodedData)
    info('Post formatting the data is now ' + getDim(pddf), ind=2)

    # 7) replace low variance
    info('Remove low variance features in data', ind=2)
    if isClassification(problemType):
        info('Low variance removal for classification is still TODO', ind=4)
    if isRegression(problemType):
        info('Not removing any features since it is regression', ind=1)

    # 8) replace NA (if any remain)
    info('Replace NA (if any) in data', ind=2)
    replaceTargetNA(pddf, targetcol, targetNAstrategy)
    replaceFeatureNA(pddf, targetcol, featureNAstrategy)
    if sum(pddf.isnull().any()) > 0:
        error("There are still NA entries in the dataset!", ind=4)

    info('Finished formatting data. Data is now ' + getDim(pddf), ind=2)

    return pddf
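
Step 6 of this example label-encodes the categorical columns and then one-hot encodes them via getHotEncodedData, which is not shown. A compact alternative that achieves the same end result is pd.get_dummies, sketched below on a toy frame; the column names are illustrative and the project may well need the LabelEncoder objects for other reasons.

# One-hot encode categorical columns directly with pandas.
import pandas as pd

df = pd.DataFrame({"color": ["red", "blue", "red"], "value": [1.0, 2.0, 3.0]})
encoded = pd.get_dummies(df, columns=["color"], prefix="color")
print(encoded.columns.tolist())  # ['value', 'color_blue', 'color_red']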
Example #11
def trainModel(modelname, X_train, y_train, config):    
    info("Training a {0} estimator".format(modelname), ind=0)
    info("X data is {0}".format(getDim(X_train)), ind=2)
    info("y data is {0}".format(getDim(y_train)), ind=2)
    
    problemType = config['problem']
    info("This is a {0} problem".format(problemType), ind=2)
    
    modelData = getModelData(config, modelname)
    tuneForParams = True
    refitModel = False
    goodModel = True
    if modelData is not None:
        if modelData.get('tune') is False:
            tuneForParams = False
        if modelData.get('fit') is True:
            tuneForParams = False
        if modelData.get('cv') is True:
            tuneForParams = False
        if modelData.get('refit') is True:
            refitModel = True
        if modelData.get('error') is True:
            goodModel = False
    else:
        info("No model parameters were given. Using default {0} estimator".format(modelname), ind=4)
        tuneForParams = False

    if goodModel is False:
        error("Model {0} is no good and will not run it.".format(modelname))
        return None
    

    #################################################################
    # Get Model
    #################################################################
    retval = getModel(config, modelname)


    #################################################################
    # Tune Parameters
    #################################################################
    estimator = retval['estimator']
    params    = retval['params']
    

    if tuneForParams:
        tuneResults = tuneModel(modelname, estimator, params, X_train, y_train, config)
        estimator   = tuneResults['estimator']
        params      = tuneResults['params']
        
        if refitModel:
            try:
                estimator.set_params(probability=True)
                info("Set probability to True for model refit", ind=4)
            except Exception:
                info("Could not set probability to True for model refit", ind=4)
            info("Re-fitting for {0} model parameters with probability".format(modelname), ind=4)
            estimator = estimator.fit(X_train, y_train)
            info("Finished re-fitting {0} model parameters with probability".format(modelname), ind=4)
    else:
        if estimator is not None:
            info("Fitting for {0} model parameters".format(modelname), ind=2)
            estimator = estimator.fit(X_train, y_train)
            info("Finished fitting {0} model parameters".format(modelname), ind=4)
        else:
            error("No model with name {0} was trained".format(modelname))


    return estimator
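
trainModel defers hyperparameter search to tuneModel, which is not shown here. A common way to implement that step is scikit-learn's GridSearchCV, sketched below as an assumption rather than the project's actual configuration; the estimator, parameter grid, and cv setting are illustrative.

# Grid-search tuning sketch: fit a parameter grid and keep the refit estimator.
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

X_train, y_train = make_classification(n_samples=200, n_features=10, random_state=0)

params = {"C": [0.1, 1.0, 10.0], "kernel": ["linear", "rbf"]}
search = GridSearchCV(SVC(probability=True), params, cv=3)
search.fit(X_train, y_train)

estimator = search.best_estimator_   # refit on the full training data
print(search.best_params_)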
Example #12
def readCSV(filename):
    info("Reading data [" + filename + "]")
    pddata = read_csv(filename, low_memory=False)

    info("Read data with size " + getDim(pddata))
    return pddata