Example #1
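    # Excerpt of an NNnolearn-style initializer (Example #6 shows the full class).
    # It assumes nolearn.lasagne's NeuralNet, Lasagne's InputLayer / DenseLayer /
    # DropoutLayer, lasagne.nonlinearities.softmax and lasagne.updates.nesterov_momentum
    # are already imported.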
    def __init__(
        self,
        num_classes=None,
        num_features=None,
        dense0_num_units=10,
        dropout_p=0.1,
        dense1_num_units=10,
        update_learning_rate=0.1,
        update_momentum=0.1,
        eval_size=0.1,
        max_epochs=10,
        verbose=5,
    ):

        self.logger = Logger(self.packageName).getLogger()
        self.logger.debug("Starting...")

        layers0 = [
            ("input", InputLayer),
            ("dense0", DenseLayer),
            ("dropout1", DropoutLayer),
            ("dense1", DenseLayer),
            ("output", DenseLayer),
        ]

        self.classifier = NeuralNet(
            layers=layers0,
            input_shape=(None, num_features),
            dense0_num_units=dense0_num_units,
            dropout1_p=dropout_p,
            dense1_num_units=dense1_num_units,
            output_num_units=num_classes,
            output_nonlinearity=softmax,
            update=nesterov_momentum,
            update_learning_rate=update_learning_rate,
            update_momentum=update_momentum,
            eval_size=eval_size,
            max_epochs=max_epochs,
            verbose=verbose,
        )

        self.num_classes = num_classes
        self.num_features = num_features
        self.dense0_num_units = dense0_num_units
        self.dropout_p = dropout_p
        self.dense1_num_units = dense1_num_units
        self.update_learning_rate = update_learning_rate
        self.update_momentum = update_momentum
        self.eval_size = eval_size
        self.max_epochs = max_epochs
        self.verbose = verbose

        return
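A minimal usage sketch for the initializer above, assuming the nolearn 0.5-era stack (nolearn, Lasagne, Theano) is installed and that the surrounding NNnolearn class is importable from classifier.NNnolearn (the path registered in the ClassifierList examples); the data and parameter values are illustrative only:

import numpy as np
from sklearn.datasets import make_classification

from classifier.NNnolearn import NNnolearn  # import path assumed from this codebase

# Lasagne expects float32 features and int32 class labels
X, y = make_classification(n_samples=200, n_features=20, random_state=0)
X = X.astype(np.float32)
y = y.astype(np.int32)

clf = NNnolearn(num_classes=2, num_features=20, max_epochs=5, verbose=0)
clf.fit(X, y)
probabilities = clf.predict_proba(X)  # shape (200, 2): one column per class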
Example #2
    def __init__(self):
        self.logger = Logger(self.packageName).getLogger()

        self.classifiers = []
        self.addClassifier('xgb', 'XGBoost Classifier', 'classifier.XGB', 'XGB')
        self.addClassifier('nnn', 'NOLEARN Lasagne neural network', 'classifier.NNnolearn', 'NNnolearn')
        self.addClassifier('nns', 'SCIKIT neuralnetwork', 'classifier.NNscikit', 'NNscikit')
        self.addClassifier('for', 'SCIKIT Random Forest Classifier', 'sklearn.ensemble', 'RandomForestClassifier')
        self.addClassifier('ext', 'SCIKIT Extra Trees Classifier', 'sklearn.ensemble', 'ExtraTreesClassifier')
        self.addClassifier('svc', 'SCIKIT SVC', 'sklearn.svm', 'SVC')
        self.addClassifier('nsv', 'SCIKIT NU SVC', 'sklearn.svm', 'NuSVC')
        self.addClassifier('knn', 'SCIKIT Nearest Neighbour Classifier', 'sklearn.neighbors', 'KNeighborsClassifier')
        self.addClassifier('dtr', 'SCIKIT Decision Tree', 'sklearn.tree', 'DecisionTreeClassifier')
        self.addClassifier('log', 'SCIKIT Logistic Regression', 'sklearn.linear_model', 'LogisticRegression')
        self.addClassifier('pct', 'SCIKIT Perceptron', 'sklearn.linear_model', 'Perceptron')
        self.addClassifier('sgd', 'SCIKIT SGD Classifier', 'sklearn.linear_model', 'SGDClassifier')

        return
Example #3
    def __init__(self,
                num_classes=None,
                num_features=None,
                learning_rate=0.01,
                learning_rule='sgd',
                learning_momentum=0.9,
                dropout_rate=None,
                weight_decay=None,
                random_state=0,
                n_iter=10):
        
        self.logger = Logger(self.packageName).getLogger()
        self.logger.debug('Starting...')

        self.num_classes = num_classes
        self.num_features = num_features
        self.learning_rule = learning_rule
        self.learning_momentum = learning_momentum
        self.dropout_rate = dropout_rate
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.random_state = random_state
        self.n_iter = n_iter
        self.hidden_units = int(round((self.num_features + self.num_classes) / 3))  # Layer units must be an int

        #                        Layer('Tanh', units=self.num_features),
        #                        Layer('Maxout', units=self.num_features, pieces=2),
        self.classifier = Classifier(
                            layers=[
                                Layer('Maxout', units=self.num_features, pieces=2),
                                Layer('Sigmoid', units=self.hidden_units),
                                Layer('Softmax', units=self.num_classes)
                            ],
                            learning_rule=self.learning_rule,
                            learning_rate=self.learning_rate,
                            learning_momentum=self.learning_momentum,
                            dropout_rate=self.dropout_rate,
                            weight_decay=self.weight_decay,
                            random_state=self.random_state,
                            n_iter=self.n_iter)

        return
Example #4
class ClassifierScikitNN:

    packageName = 'com.brodagroup.machinelearning.ClassifierScikitNN'
    
    logger = None
    hidden_units = None
    classifier = None

    # Initializer
    def __init__(self,
                num_classes=None,
                num_features=None,
                learning_rate=0.01,
                learning_rule='sgd',
                learning_momentum=0.9,
                dropout_rate=None,
                weight_decay=None,
                random_state=0,
                n_iter=10):
        
        self.logger = Logger(self.packageName).getLogger()
        self.logger.debug('Starting...')

        self.num_classes = num_classes
        self.num_features = num_features
        self.learning_rule = learning_rule
        self.learning_momentum = learning_momentum
        self.dropout_rate = dropout_rate
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.random_state = random_state
        self.n_iter = n_iter
        self.hidden_units = int(round((self.num_features + self.num_classes) / 3))  # Layer units must be an int

        #                        Layer('Tanh', units=self.num_features),
        #                        Layer('Maxout', units=self.num_features, pieces=2),
        self.classifier = Classifier(
                            layers=[
                                Layer('Maxout', units=self.num_features, pieces=2),
                                Layer('Sigmoid', units=self.hidden_units),
                                Layer('Softmax', units=self.num_classes)
                            ],
                            learning_rule=self.learning_rule,
                            learning_rate=self.learning_rate,
                            learning_momentum=self.learning_momentum,
                            dropout_rate=self.dropout_rate,
                            weight_decay=self.weight_decay,
                            random_state=self.random_state,
                            n_iter=self.n_iter)

        return
        
    def __str__(self):
        x = self.packageName + '('
        x = x + '\n\t num_classes={0}, num_features={1}'.format(self.num_classes, self.num_features)
        x = x + '\n\t learning_rule={0}, learning_rate={1}'.format(self.learning_rule, self.learning_rate)
        x = x + '\n\t learning_momentum={0}, dropout_rate={1}'.format(self.learning_momentum, self.dropout_rate)
        x = x + '\n\t hidden_units={0}, weight_decay={1}'.format(self.hidden_units, self.weight_decay)
        x = x + '\n\t random_state={0}, n_iter={1}'.format(self.random_state, self.n_iter)
        x = x + '\n)'
        return(x)
        
    def fit(self, X, y):
        self.classifier.fit(X,y)
        
    def predict(self, X):
        y_pred = self.classifier.predict(X)
        return(y_pred)
        
    def predict_proba(self, X):
        y_pred = self.classifier.predict_proba(X)
        return(y_pred)
        
    def get_params(self, deep=True):
        return {
            "num_classes": self.num_classes,
            "num_features": self.num_features,
            "learning_rule": self.learning_rule,
            "learning_rate": self.learning_rate,
            "learning_momentum": self.learning_momentum,
            "dropout_rate": self.dropout_rate,
            "weight_decay": self.weight_decay,
            "random_state": self.random_state,
            "n_iter": self.n_iter
            }
    
    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
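        # NOTE: this only records the new values as attributes; self.classifier
        # is not rebuilt, so varying these parameters (e.g. in a grid search)
        # does not change the underlying network until it is reconstructed.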
        return self        
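A short usage sketch for the wrapper above, assuming scikit-neuralnetwork (sknn) is installed so that Classifier and Layer resolve to sknn.mlp.Classifier and sknn.mlp.Layer; the data and parameters are illustrative:

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=300, n_features=12, random_state=0)

clf = ClassifierScikitNN(num_classes=2, num_features=12, learning_rate=0.01, n_iter=5)
print(clf)               # __str__ reports the configured hyper-parameters
clf.fit(X, y)
y_pred = clf.predict(X)
print(clf.get_params())  # sklearn-style parameter dictionary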
Example #5
class ClassifierList:

    packageName = 'com.brodagroup.machinelearning.classifierlist'

    logger = None

    # Initializer
    def __init__(self):
        self.logger = Logger(self.packageName).getLogger()

        self.classifiers = []
        self.addClassifier('xgb', 'XGBoost Classifier', 'classifier.XGB', 'XGB')
        self.addClassifier('nnn', 'NOLEARN Lasagne neural network', 'classifier.NNnolearn', 'NNnolearn')
        self.addClassifier('nns', 'SCIKIT neuralnetwork', 'classifier.NNscikit', 'NNscikit')
        self.addClassifier('for', 'SCIKIT Random Forest Classifier', 'sklearn.ensemble', 'RandomForestClassifier')
        self.addClassifier('ext', 'SCIKIT Extra Trees Classifier', 'sklearn.ensemble', 'ExtraTreesClassifier')
        self.addClassifier('svc', 'SCIKIT SVC', 'sklearn.svm', 'SVC')
        self.addClassifier('nsv', 'SCIKIT NU SVC', 'sklearn.svm', 'NuSVC')
        self.addClassifier('knn', 'SCIKIT Nearest Neighbour Classifier', 'sklearn.neighbors', 'KNeighborsClassifier')
        self.addClassifier('dtr', 'SCIKIT Decision Tree', 'sklearn.tree', 'DecisionTreeClassifier')
        self.addClassifier('log', 'SCIKIT Logistic Regression', 'sklearn.linear_model', 'LogisticRegression')
        self.addClassifier('pct', 'SCIKIT Perceptron', 'sklearn.linear_model', 'Perceptron')
        self.addClassifier('sgd', 'SCIKIT SGD Classifier', 'sklearn.linear_model', 'SGDClassifier')

        return

    def loadClass(self, moduleName, className, parameters):
        self.logger.debug('Loading module: {0}, class: {1}'.format(moduleName, className))
        self.logger.debug('Using load parameters: {0}'.format(parameters))

        try:
            module_ = import_module(moduleName)
            try:
                class_ = getattr(module_, className)
                instance = class_(**parameters)
            except AttributeError:
                raise RuntimeError('Class does not exist: {0}'.format(className))
        except ImportError:
            raise RuntimeError('Module does not exist: {0}'.format(moduleName))
        return instance

    def addClassifier(self, code, name, moduleName, className):
        x = [code, name, moduleName, className]
        self.classifiers.append(x)
        return

    def load(self, code, parameters):

        item = next((x for x in self.classifiers if x[0] == code), None)

        if item is None:
            raise RuntimeError('Classifier code not found: {0}'.format(code))

        code = item[0]
        name = item[1]
        moduleName = item[2]
        className = item[3]

        classifier = self.loadClass(moduleName, className, parameters)

        return classifier

    def list(self):
        return [x[0] for x in self.classifiers]
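A hedged sketch of how the registry above is meant to be used: list the registered codes, then instantiate a classifier by its code, with the parameter dictionary forwarded as keyword arguments to the underlying class (here sklearn's LogisticRegression):

from sklearn.datasets import make_classification

X_train, y_train = make_classification(n_samples=100, n_features=5, random_state=0)

classifiers = ClassifierList()
print(classifiers.list())                  # ['xgb', 'nnn', 'nns', 'for', 'ext', ...]

clf = classifiers.load('log', {'C': 1.0})  # sklearn.linear_model.LogisticRegression(C=1.0)
clf.fit(X_train, y_train)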
Example #6
class NNnolearn:

    packageName = "com.brodagroup.machinelearning.classifer.NNnolearn"

    logger = None
    classifier = None

    # Initializer
    def __init__(
        self,
        num_classes=None,
        num_features=None,
        dense0_num_units=10,
        dropout_p=0.1,
        dense1_num_units=10,
        update_learning_rate=0.1,
        update_momentum=0.1,
        eval_size=0.1,
        max_epochs=10,
        verbose=5,
    ):

        self.logger = Logger(self.packageName).getLogger()
        self.logger.debug("Starting...")

        layers0 = [
            ("input", InputLayer),
            ("dense0", DenseLayer),
            ("dropout1", DropoutLayer),
            ("dense1", DenseLayer),
            ("output", DenseLayer),
        ]

        self.classifier = NeuralNet(
            layers=layers0,
            input_shape=(None, num_features),
            dense0_num_units=dense0_num_units,
            dropout1_p=dropout_p,
            dense1_num_units=dense1_num_units,
            output_num_units=num_classes,
            output_nonlinearity=softmax,
            update=nesterov_momentum,
            update_learning_rate=update_learning_rate,
            update_momentum=update_momentum,
            eval_size=eval_size,
            max_epochs=max_epochs,
            verbose=verbose,
        )

        self.num_classes = num_classes
        self.num_features = num_features
        self.dense0_num_units = dense0_num_units
        self.dropout_p = dropout_p
        self.dense1_num_units = dense1_num_units
        self.update_learning_rate = update_learning_rate
        self.update_momentum = update_momentum
        self.eval_size = eval_size
        self.max_epochs = max_epochs
        self.verbose = verbose

        return

    def __str__(self):
        return "{0}(num_classes={1}, num_features={2})".format(
            self.packageName, self.num_classes, self.num_features
        )

    def fit(self, X, y):
        self.classifier.fit(X, y)

    def predict(self, X):
        y_pred = self.classifier.predict(X)
        return y_pred

    def predict_proba(self, X):
        y_pred = self.classifier.predict_proba(X)
        return y_pred

    def get_params(self, deep=True):
        return {
            "num_classes": self.num_classes,
            "num_features": self.num_features,
            "dense0_num_units": self.dense0_num_units,
            "dropout_p": self.dropout_p,
            "dense1_num_units": self.dense1_num_units,
            "update_learning_rate": self.update_learning_rate,
            "eval_size": self.eval_size,
            "max_epochs": self.max_epochs,
            "verbose": self.verbose,
        }

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self
Example #7
    def __init__(self):
        self.logger = Logger(self.packageName).getLogger()
        self.parameters = []
        return
Example #8
class Data:

    packageName = "com.brodagroup.machinelearning.common.Data"

    logger = None

    # Initializer
    def __init__(self):
        self.logger = Logger(self.packageName).getLogger()
        self.parameters = []
        return

    def configure(self):
        return 0

    def load(self, pathCSV):
        # dataframe = pd.read_csv(pathCSV, quotechar='"', skipinitialspace=True)
        dataframe = pd.read_csv(pathCSV)
        return dataframe

    def loadDataFrame(self, pathPKL):
        dataframe = pd.read_pickle(pathPKL)
        return dataframe

    def saveDataFrame(self, dataframe, pathPKL):
        dataframe.to_pickle(pathPKL)
        return

    def segment(self, features, target, totalPct, testingPct, randomState):

        # Use only a subset of the rows to keep run-times low while testing hypotheses
        numRows, numFeatures = features.shape
        rowsUsed = int(numRows * totalPct)

        xfeatures = features[0:rowsUsed]
        xtarget = target[0:rowsUsed]

        # Note:  features represents "X" and target represents "y"
        X_train, X_test, y_train, y_test = train_test_split(
            xfeatures, xtarget, test_size=testingPct, random_state=randomState
        )

        return (X_train, X_test, y_train, y_test)

    def shuffle(self, dataset):
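        # np.random.shuffle permutes the rows of a NumPy array in place and
        # returns None, which is why nothing is returned here.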
        np.random.shuffle(dataset)
        return

    def sample(self, dataset, count):
        sample = np.random.choice(dataset.index.values, count)
        return sample

    def normalize(self, df, type="std"):
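        # Scales every column: 'minmax' maps values into [0, 1]; any other type
        # standardizes to zero mean and unit variance. A new DataFrame (with a
        # fresh index) is returned; the caller's df is left unmodified.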

        self.logger.info("Normalizing data, type: {0}".format(type))

        cols = df.columns.values

        inArray = df[cols].values
        outArray = None

        if type == "minmax":
            # minmax_scale = preprocessing.MinMaxScaler().fit(df[cols])
            minmax_scale = preprocessing.MinMaxScaler().fit(inArray)
            outArray = minmax_scale.transform(inArray)
        else:
            # std_scale = preprocessing.StandardScaler().fit(df[cols])
            std_scale = preprocessing.StandardScaler().fit(inArray)
            outArray = std_scale.transform(inArray)

        df = pd.DataFrame(data=outArray, columns=cols)

        return df

    def threshold(self, X, lower, lvalue, upper, uvalue):
        X[X <= lower] = lvalue
        X[X >= upper] = uvalue
        return X

    def join(self, leftDF, rightDF, onKeys):
        result = pd.merge(leftDF, rightDF, on=onKeys)
        return result

    def categorize(self, df, field):
        self.logger.debug("Categorizing field: {0}, type: {1}".format(field, df[field].dtype))

        # Create and fill new columns for the categorized field
        if df[field].dtype == "object":

            values = list(pd.Series(df[field].values.ravel()).unique())

            self.logger.debug("Field: {0}, has value count: {1}".format(field, len(values)))

            if len(values) > 2:

                for value in values:
                    # Create the new field name based upon original name and values
                    # Note -- take into account missing values
                    if pd.isnull(value):
                        xfield = field + "-" + "Missing"
                    else:
                        # Strip commas
                        xvalue = value.replace(",", "")
                        xfield = field + "-" + xvalue

                    # Create and fill in the new columns with values
                    df.loc[:, xfield] = -1
                    self.logger.debug("Creating field: {0}, type: {1}".format(xfield, df[xfield].dtype))
                    # df[xfield] = df[field].apply(lambda x: 1 if x == value else 0)
                    df.loc[:, xfield] = df.loc[:, field].apply(lambda x: 1 if x == value else 0)

                # Remove the original field
                self.logger.debug("Dropping field: {0}, type: {1}".format(field, df[field].dtype))
                df = df.drop(field, axis=1)

        return df

    def sync(self, dfA, dfB):

        self.logger.debug("Synchronizing...")

        listA = list(dfA.columns.values)
        self.logger.debug("DataFrame A, columns: {0}".format(listA))

        listB = list(dfB.columns.values)
        self.logger.debug("DataFrame B, columns: {0}".format(listB))

        setA = set(listA)
        setB = set(listB)

        columnsNotInB = setA.difference(setB)
        self.logger.debug("Columns in A but not in B: {0}".format(columnsNotInB))

        for column in columnsNotInB:
            dfB[column] = 0

        columnsNotInA = setB.difference(setA)
        self.logger.debug("Columns in B but not in A: {0}".format(columnsNotInA))

        for column in columnsNotInA:
            dfA[column] = 0

        return (dfA, dfB)

    def prune(self, df, keep=None, remove=None):
        if keep:
            self.logger.info("Pruning, keeping fields: {0}".format(keep))
            df = df[keep]

        if remove:
            self.logger.info("Pruning, removing fields: {0}".format(remove))
            df = df.drop(remove, axis=1)

        return df

    def encodeList(self, columns, dfA, dfB):

        if columns:
            for column in columns:
                dfA, dfB = self.encode(column, dfA, dfB)

        return (dfA, dfB)

    def encode(self, column, dfA, dfB):

        # All input dataframes must be encoded with one encoder fitted on their
        # combined values; encoding each dataframe independently would assign
        # codes based only on the values present in that dataframe. That is
        # harmless when the value sets are identical, but breaks when they
        # differ: if dfA has values 'Y','N' in a column while dfB has 'maybe',
        # 'sometimes','Y','N' and 'almost always', 'Y' may be encoded as '0'
        # in dfA but '2' in dfB.

        lbl = preprocessing.LabelEncoder()

        self.logger.debug("Encoding field: {0}".format(column))

        valuesA = list(dfA[column].values)
        valuesB = list(dfB[column].values)
        values = valuesA + valuesB
        lbl.fit(values)

        # self.logger.debug('Encoding field: {0}, classes: {1}'.format(column, lbl.classes_))
        # xto = lbl.transform(values)
        # xfrom = lbl.inverse_transform(xto)
        # self.logger.debug('Encoding field {0}, FROM: {1}'.format(column, xfrom))
        # self.logger.debug('Encoding field {0}, TO:   {1}'.format(column, xto))

        dfA[column] = lbl.transform(valuesA)
        dfB[column] = lbl.transform(valuesB)

        return (dfA, dfB)
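A small worked example of the caveat documented in encode(); the values are illustrative, but they show why one LabelEncoder must be fitted on the combined values of both dataframes:

import pandas as pd
from sklearn import preprocessing

dfA = pd.DataFrame({'answer': ['Y', 'N', 'Y']})
dfB = pd.DataFrame({'answer': ['Maybe', 'Y', 'N']})

# Encoded independently the codes disagree: 'Y' -> 1 in dfA but 'Y' -> 2 in dfB,
# because dfB also contains 'Maybe'. Fitting one encoder on the combined values
# keeps the mapping consistent across both dataframes:
lbl = preprocessing.LabelEncoder()
lbl.fit(list(dfA['answer'].values) + list(dfB['answer'].values))
print(lbl.transform(dfA['answer']))   # [2 1 2]
print(lbl.transform(dfB['answer']))   # [0 2 1] -- 'Y' is 2 in both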
Example #9
    def __init__(self):
        self.logger = Logger(self.packageName).getLogger()
        return
Example #10
class Runner:

    packageName = 'com.brodagroup.machinelearning.common.Runner'

    logger = None
    rpt = None
    gridsearchrpt = None
    featurerpt = None
    scoringrpt = None
    preprocessor = None

    # features: dataframe used for fit / learning
    features = None

    # test: dataframe used for prediction
    test = None

    # target:  dataframe (single column) of actual/correct values (for scoring)
    target = None

    # expected: dataframe (single column) of actual values (to verify that the algorithm works)
    expected = None
    hasExpected = False

    # y_pred: array of predictions (integer)
    y_pred = None

    # yy_pred: array of prediction probabilities (float)
    yy_pred = None

    # Initializer
    def __init__(self):
        self.logger = Logger(self.packageName).getLogger()
        return

    def dumpConfiguration(self):
        pretty = json.dumps(self.configuration, sort_keys=True, indent=4)
        return(pretty)

    def configure(self, jsonstr=None, file=None, url=None, overrides=None):
        if file:
            self.logger.info('Using configuration file: {0}'.format(file))
            with open(file, encoding='utf-8') as configurationFile:
                configuration = json.loads(configurationFile.read())
        elif url:
            configuration = json.loads(urllib.request.urlopen(url).read())
        elif jsonstr:
            configuration = json.loads(jsonstr)
        else:
            raise RuntimeError('Configuration not provided (json|file|url)')

        self.configuration = configuration
        self.logger.info('Using configuration: {0}'.format(self.dumpConfiguration()))

        self.override(overrides=overrides)

        classifierCode = self.configuration['classifier']
        parameters = self.configuration['parameters']

        classifierList = ClassifierList()
        classifier = classifierList.load(classifierCode, parameters)
        self.classifier = classifier

        return

    def modifyConfiguration(self, dictionary, name, value, depth):
        depth = depth + 1

        parts = name.split('.')
        name = parts[0]

        if type(dictionary[name]) is dict:
            xdict = dictionary[name]
            if depth > 3:
                raise RuntimeError('Error -- too many levels in configuration')

            # Keep the remainder of the dotted path for the deeper levels
            xname = '.'.join(parts[1:])
            self.modifyConfiguration(xdict, xname, value, depth)
        else:
            dictionary[name] = value
            self.logger.info('Setting name: {0} to value: {1}'.format(name, value))

        return(name)

    def override(self, overrides=None):

        if overrides:
            self.logger.info('Overriding parameters: {0}'.format(overrides))
            for nvp in overrides:
                x = nvp.split(':', 1)

                name = x[0]
                value = x[1]
                self.modifyConfiguration(self.configuration, name, value, 0)

            self.logger.info('Using new configuration: {0}'.format(self.dumpConfiguration()))

        return

    def setPreprocessor(self, c):
        # Named setPreprocessor so the setter does not collide with the
        # self.preprocessor attribute it assigns.
        self.logger.info('Setting preprocessor')
        self.preprocessor = c
        return

    def load(self):
        self.logger.info('Loading data')

        data = Data()

        trainCSV = self.configuration['trainCSV']
        testCSV = self.configuration['testCSV']

        featuresPKL = self.configuration['featuresPKL']
        targetPKL = self.configuration['targetPKL']
        testPKL = self.configuration['testPKL']

        expectedCSV = None
        expectedPKL = None
        try:
            expectedCSV = self.configuration['expectedCSV']
            expectedPKL = self.configuration['expectedPKL']
        except KeyError:
            pass

        # If the dataframe (pickled) file exists, then load it
        # Otherwise, load the CSV, preprocess it, and then save it as a
        # PKL file which will reduce load times
        tmpFeatures = None
        tmpTarget = None
        tmpTest = None

        if( os.path.exists(featuresPKL) ):
            self.logger.info('Loading train PKL: {0}'.format(featuresPKL))
            tmpFeatures = data.loadDataFrame(featuresPKL)
            self.logger.info('Loading target PKL: {0}'.format(targetPKL))
            tmpTarget = data.loadDataFrame(targetPKL)
            self.logger.info('Loading test PKL: {0}'.format(testPKL))
            tmpTest = data.loadDataFrame(testPKL)

        else:
            self.logger.info('Loading train CSV: {0}'.format(trainCSV))
            rawtrain = data.load(trainCSV)
            self.logger.info('Loading test CSV: {0}'.format(testCSV))
            rawtest = data.load(testCSV)

            # Preprocess the data
            tmpFeatures, tmpTarget, tmpTest = self.preprocessor.execute(rawtrain, rawtest)

            # Save the dataframe (lower load times)
            self.logger.info('Saving features PKL: {0}'.format(featuresPKL))
            data.saveDataFrame(tmpFeatures, featuresPKL)
            self.logger.info('Saving target PKL: {0}'.format(targetPKL))
            data.saveDataFrame(tmpTarget, targetPKL)
            self.logger.info('Saving test PKL: {0}'.format(testPKL))
            data.saveDataFrame(tmpTest, testPKL)

        if( expectedPKL and os.path.exists(expectedPKL) ):
            self.logger.info('Loading expected PKL: {0}'.format(expectedPKL))
            tmpExpected = data.loadDataFrame(expectedPKL)
            self.hasExpected = True

        elif( expectedCSV and os.path.exists(expectedCSV) ):
            self.logger.info('Loading expected CSV: {0}'.format(expectedCSV))
            tmpExpected = data.load(expectedCSV)
            self.logger.info('Saving expected PKL: {0}'.format(expectedPKL))
            data.saveDataFrame(tmpExpected, expectedPKL)
            self.hasExpected = True

        self.features = tmpFeatures
        self.target = tmpTarget
        self.test = tmpTest

        if self.hasExpected:
            self.expected = tmpExpected

        return

    # Segment the TRAINING set into a smaller
    # cross validation set of data
    def segment(self):

        self.logger.info('Segmenting...')

        data = Data()

        totalpct = float(self.configuration['totalpct'])
        testpct = float(self.configuration['testpct'])
        randomstate = int(self.configuration['randomstate'])
        X_train, X_test, y_train, y_test = data.segment(self.features, self.target, totalpct, testpct, randomstate)

        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test

        return

    def fit(self):

        npXTrain = np.array(self.X_train).astype(np.float32)
        npyTrain = np.array(self.y_train).astype(np.int32)

        self.classifier.fit(npXTrain, npyTrain)
        return

    def crossvalidate(self):
        npXTrain = np.array(self.X_train).astype(np.float32)
        npyTrain = np.array(self.y_train).astype(np.int32)

        rptDF = self.classifier.crossvalidate(npXTrain, npyTrain)
        self.crossvalidationDF = rptDF
        pd.set_option('display.max_rows', 10000)
        self.logger.info('Cross Validation Report\n{0}'.format(rptDF))
        return

    def gridsearch(self, use=True, score='roc_auc'):

        self.logger.info('Executing grid search...')

        parameters = self.configuration['gridsearch']

        x = GridSearchCV(self.classifier, parameters, cv=6, scoring=score, verbose=10, n_jobs=6)
        #x = GridSearchCV(self.classifier, parameters, cv=5, scoring=score, verbose=10)

        npXTrain = np.array(self.X_train).astype(np.float32)
        npyTrain = np.array(self.y_train).astype(np.int32)
        x.fit(npXTrain, npyTrain)

        rpt = 'Grid Search Analysis \t\t' + str(dt.datetime.now())
        rpt = rpt + '\n\nParameters {0}'.format(parameters)
        rpt = rpt + '\n\nBest parameters set found:'
        rpt = rpt + '\n\t' + '{0}'.format(x.best_estimator_)
        rpt = rpt + '\n\nGrid Search Scores (using {0}):'.format(score)
        rpt = rpt + '\nSCORE\t\tSTDDEV(+/-)\tPARAMETERS:'
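        # NOTE: grid_scores_ is the old GridSearchCV API (deprecated in
        # scikit-learn 0.18, removed in 0.20); newer releases expose cv_results_.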
        for params, mean_score, scores in x.grid_scores_:
            rpt = rpt + '\n' + '{0:0.7f}\t{1:0.7f}'.format(mean_score, scores.std() / 2)
            for key in params:
                value = params[key]
                rpt = rpt + '\t\t{0}\t\t{1}'.format(key, value)

        if use:
            self.classifier = x.best_estimator_

        self.gridsearchrpt = rpt

        return(rpt)

    def importance(self):

        self.logger.info('Creating feature importance report...')

        rpt = None

        rpt = 'Feature Importance \t\t' + str(dt.datetime.now())
        if self.classifier is None:
            return(rpt)

        if hasattr(self.classifier, 'importance'):
            df = self.classifier.importance(self.X_train.columns.values)
            rpt = rpt + '\n\n{0}'.format(df)

        if hasattr(self.classifier, 'feature_importances_'):

            fi = pd.DataFrame(self.classifier.feature_importances_)
            columns = pd.DataFrame(self.X_train.columns.values)
            result = pd.concat([columns, fi], axis=1)
            result.columns = ['Feature', 'Importance']
            ranked = result.sort_values(['Importance', 'Feature'], ascending=[False, True])
            rpt = rpt + '\n{0}'.format(ranked)

        #pd.set_option('display.max_rows', len(ranked))
        #pd.reset_option('display.max_rows')
        self.featurerpt = rpt

        return(rpt)

    def score(self):

        self.logger.info('Scoring...')

        npXTest = np.array(self.X_test).astype(np.float32)

        y_pred = self.classifier.predict(npXTest)
        yy_pred = self.classifier.predict_proba(npXTest)[:,1]

        print('\n***')
        print(self.features.shape)
        print(self.test.shape)
        print('***\n')

        reportName = 'Cross Verification Data Report \t\t' + str(dt.datetime.now())

        scorer = Scorer()
        y_test = self.y_test
        rpt = scorer.score(
                y_test,
                y_pred,
                yy_pred,
                classifier=self.classifier,
                title=reportName,
                configuration=self.configuration )

        self.y_pred = y_pred
        self.yy_pred = yy_pred
        self.scoringrpt = rpt

        return(rpt)

    def inspect(self, name):
        x = getattr(self, name)
        return(x)

    def inquire(self, name):
        x = hasattr(self, name)
        return(x)

    def inject(self, name, value):
        setattr(self, name, value)
        return

    def report(self):
        self.logger.info('Executing full report')

        rpt = '\nFull Report\n'

        if self.featurerpt:
            rpt = rpt + '\n\n{0}'.format(self.featurerpt)
        if self.scoringrpt:
            rpt = rpt + '\n\n{0}'.format(self.scoringrpt)
        if self.gridsearchrpt:
            rpt = rpt + '\n\n{0}'.format(self.gridsearchrpt)

        self.rpt = rpt

        return(rpt)

    def predict(self):
        self.logger.info('Predicting...')

        submissionSample = self.configuration['submissionSample']
        submissionDir = self.configuration['submissionDir']

        timestamp = dt.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')

        submissionVFile = submissionDir + '/' + 'submission-values-' + timestamp + '.csv'
        submissionPFile = submissionDir + '/' + 'submission-probabilities-' + timestamp + '.csv'
        submissionLog = submissionDir + '/' + 'submission-' + timestamp + '.txt'

        npTest = np.array(self.test).astype(np.float32)

        y_pred = self.classifier.predict(npTest)
        yy_pred = self.classifier.predict_proba(npTest)[:,1]

        predictionrpt = None
        if self.hasExpected:

            self.logger.debug('Target is available... Scoring target')
            # The second column contains the actual values
            y_test = self.expected.iloc[:,1]

            scorer = Scorer()
            reportName = '\nTarget Data Prediction Report \t\t' + timestamp
            predictionrpt = scorer.score(
                    y_test,
                    y_pred,
                    yy_pred,
                    classifier=self.classifier,
                    title=reportName,
                    configuration=self.configuration )
            print(predictionrpt)

        sample = pd.read_csv(submissionSample)
        sample.QuoteConversion_Flag = y_pred
        sample.to_csv(submissionVFile, index=False)

        probabilities = pd.read_csv(submissionSample)
        probabilities.QuoteConversion_Flag = yy_pred
        probabilities.to_csv(submissionPFile, index=False)

        mfeatures, nfeatures = self.features.shape
        mtest, ntest = self.test.shape
        mxtrain, nxtrain = self.X_train.shape
        mxtest, nxtest = self.X_test.shape

        self.logger.debug('Saving submission information')
        with open(submissionLog, 'a') as f:
            f.write('Submission Report \t\t\t Generated at: {0}'.format(timestamp))
            f.write('\n\nData Statistics:')
            f.write('\n\tFeature data: \trows: {0}, columns: {1}'.format(mfeatures, nfeatures))
            f.write('\n\tTest data: \t\trows: {0}, columns: {1}'.format(mtest, ntest))
            f.write('\n\nCross Validation Statistics:')
            f.write('\n\tTraining data: \trows: {0}, columns: {1}'.format(mxtrain, nxtrain))
            f.write('\n\tTest data: \t\trows: {0}, columns: {1}'.format(mxtest, nxtest))
            f.write('\n\nValues file:\t\t{0}'.format(submissionVFile))
            f.write('\nProbabilities file:\t{0}'.format(submissionPFile))
            f.write('\n')
            f.write('{0}'.format(self.report()))

            if predictionrpt:
                f.write('\n\n{0}'.format(predictionrpt))

        return(submissionLog, submissionVFile, submissionPFile, self.classifier)
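An end-to-end sketch of how Runner appears meant to be driven, inferred from the methods above; MyPreprocessor (any object exposing execute(rawtrain, rawtest)) and the configuration file name are hypothetical placeholders:

runner = Runner()
runner.setPreprocessor(MyPreprocessor())     # hypothetical: must expose execute(rawtrain, rawtest)
runner.configure(file='configuration.json')  # selects the classifier + parameters via ClassifierList
runner.load()                                # CSV -> preprocess -> cached PKL dataframes
runner.segment()                             # train/test split per totalpct / testpct / randomstate
runner.fit()
print(runner.score())                        # cross-verification scoring report
log, values, probabilities, clf = runner.predict()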