# Imports -- sknn is the scikit-neuralnetwork package; the Logger module
# path is project-specific and assumed here
from sknn.mlp import Classifier, Layer

from common.Logger import Logger


class ClassifierScikitNN():

    packageName = 'com.brodagroup.machinelearning.ClassifierScikitNN'
    logger = None
    hidden_units = None
    classifier = None

    # Initializer
    def __init__(self, num_classes=None, num_features=None,
                 learning_rate=0.01, learning_rule='sgd',
                 learning_momentum=0.9, dropout_rate=None,
                 weight_decay=None, random_state=0, n_iter=10):
        self.logger = Logger(self.packageName).getLogger()
        self.logger.debug('Starting...')

        self.num_classes = num_classes
        self.num_features = num_features
        self.learning_rule = learning_rule
        self.learning_momentum = learning_momentum
        self.dropout_rate = dropout_rate
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.random_state = random_state
        self.n_iter = n_iter

        # Rule of thumb: size the hidden layer from the feature and class
        # counts (int() because Layer units must be an integer)
        self.hidden_units = int(round((self.num_features + self.num_classes) / 3))

        # Alternative first layers:
        #   Layer('Tanh', units=self.num_features),
        #   Layer('Maxout', units=self.num_features, pieces=2),
        self.classifier = Classifier(
            layers=[
                Layer('Maxout', units=self.num_features, pieces=2),
                Layer('Sigmoid', units=self.hidden_units),
                Layer('Softmax', units=self.num_classes)
            ],
            learning_rule=self.learning_rule,
            learning_rate=self.learning_rate,
            learning_momentum=self.learning_momentum,
            dropout_rate=self.dropout_rate,
            weight_decay=self.weight_decay,
            random_state=self.random_state,
            n_iter=self.n_iter)
        return

    def __str__(self):
        x = self.packageName + '('
        x = x + '\n\t num_classes={0}, num_features={1}'.format(self.num_classes, self.num_features)
        x = x + '\n\t learning_rule={0}, learning_rate={1}'.format(self.learning_rule, self.learning_rate)
        x = x + '\n\t learning_momentum={0}, dropout_rate={1}'.format(self.learning_momentum, self.dropout_rate)
        x = x + '\n\t hidden_units={0}, weight_decay={1}'.format(self.hidden_units, self.weight_decay)
        x = x + '\n\t random_state={0}, n_iter={1}'.format(self.random_state, self.n_iter)
        x = x + ')'
        return(x)

    def fit(self, X, y):
        self.classifier.fit(X, y)

    def predict(self, X):
        y_pred = self.classifier.predict(X)
        return(y_pred)

    def predict_proba(self, X):
        y_pred = self.classifier.predict_proba(X)
        return(y_pred)

    def get_params(self, deep=True):
        return {
            "num_classes": self.num_classes,
            "num_features": self.num_features,
            "learning_rule": self.learning_rule,
            "learning_rate": self.learning_rate,
            "learning_momentum": self.learning_momentum,
            "dropout_rate": self.dropout_rate,
            "weight_decay": self.weight_decay,
            "random_state": self.random_state,
            "n_iter": self.n_iter
        }

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self
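# Usage sketch (hypothetical data; assumes scikit-neuralnetwork is
# installed). Because the wrapper exposes fit/predict/predict_proba plus
# get_params/set_params, it can be driven like any scikit-learn estimator.
import numpy as np

if __name__ == '__main__':
    X = np.random.rand(100, 8).astype(np.float32)       # 100 samples, 8 features
    y = np.random.randint(0, 2, 100).astype(np.int32)   # binary target

    clf = ClassifierScikitNN(num_classes=2, num_features=8, n_iter=5)
    clf.fit(X, y)
    print(clf)                       # parameter summary via __str__
    print(clf.predict(X[:5]))        # predicted class labels
    print(clf.predict_proba(X[:5]))  # predicted class probabilities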
# Imports (the Logger module path is project-specific and assumed here)
from importlib import import_module

import numpy as np

from common.Logger import Logger


class ClassifierList():

    packageName = 'com.brodagroup.machinelearning.classifierlist'
    logger = None

    # Initializer
    def __init__(self):
        self.logger = Logger(self.packageName).getLogger()
        self.classifiers = []
        self.addClassifier('xgb', 'XGBoost Classifier', 'classifier.XGB', 'XGB')
        self.addClassifier('nnn', 'NOLEARN Lasagne neural network', 'classifier.NNnolearn', 'NNnolearn')
        self.addClassifier('nns', 'SCIKIT neural network', 'classifier.NNscikit', 'NNscikit')
        self.addClassifier('for', 'SCIKIT Random Forest Classifier', 'sklearn.ensemble', 'RandomForestClassifier')
        self.addClassifier('ext', 'SCIKIT Extra Trees Classifier', 'sklearn.ensemble', 'ExtraTreesClassifier')
        self.addClassifier('svc', 'SCIKIT SVC', 'sklearn.svm', 'SVC')
        self.addClassifier('nsv', 'SCIKIT NU SVC', 'sklearn.svm', 'NuSVC')
        self.addClassifier('knn', 'SCIKIT Nearest Neighbour Classifier', 'sklearn.neighbors', 'KNeighborsClassifier')
        self.addClassifier('dtr', 'SCIKIT Decision Tree', 'sklearn.tree', 'DecisionTreeClassifier')
        self.addClassifier('log', 'SCIKIT Logistic Regression', 'sklearn.linear_model', 'LogisticRegression')
        self.addClassifier('pct', 'SCIKIT Perceptron', 'sklearn.linear_model', 'Perceptron')
        self.addClassifier('sgd', 'SCIKIT SGD Classifier', 'sklearn.linear_model', 'SGDClassifier')
        return

    def loadClass(self, moduleName, className, parameters):
        self.logger.debug('Loading module: {0}, class: {1}'.format(moduleName, className))
        self.logger.debug('Using load parameters: {0}'.format(parameters))
        try:
            module_ = import_module(moduleName)
            try:
                class_ = getattr(module_, className)
                instance = class_(**parameters)
            except AttributeError:
                raise RuntimeError('Class does not exist: {0}'.format(className))
        except ImportError:
            raise RuntimeError('Module does not exist: {0}'.format(moduleName))
        return instance

    def addClassifier(self, code, name, moduleName, className):
        x = [code, name, moduleName, className]
        self.classifiers.append(x)
        return

    def load(self, code, parameters):
        item = next((x for x in self.classifiers if x[0] == code), None)
        if item is None:
            raise RuntimeError('Classifier code not found: {0}'.format(code))
        code = item[0]
        name = item[1]
        moduleName = item[2]
        className = item[3]
        classifier = self.loadClass(moduleName, className, parameters)
        return classifier

    def list(self):
        x = np.array(self.classifiers)
        a = x[:, 0]
        return(a.tolist())
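# Usage sketch: the registry resolves a short code to a (module, class) pair
# and instantiates it reflectively via import_module/getattr. The parameter
# dict below is hypothetical; any constructor kwargs of the target class work.
if __name__ == '__main__':
    classifierList = ClassifierList()
    print(classifierList.list())   # ['xgb', 'nnn', 'nns', 'for', ...]

    # Load a scikit-learn random forest by its registry code
    classifier = classifierList.load('for', {'n_estimators': 100})
    print(classifier)              # a ready-to-use RandomForestClassifier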
# Imports -- nolearn wraps Lasagne/Theano; the Logger module path is
# project-specific and assumed here
from lasagne.layers import InputLayer, DenseLayer, DropoutLayer
from lasagne.nonlinearities import softmax
from lasagne.updates import nesterov_momentum
from nolearn.lasagne import NeuralNet

from common.Logger import Logger


class NNnolearn:

    packageName = "com.brodagroup.machinelearning.classifier.NNnolearn"
    logger = None
    classifier = None

    # Initializer
    def __init__(
        self,
        num_classes=None,
        num_features=None,
        dense0_num_units=10,
        dropout_p=0.1,
        dense1_num_units=10,
        update_learning_rate=0.1,
        update_momentum=0.1,
        eval_size=0.1,
        max_epochs=10,
        verbose=5,
    ):
        self.logger = Logger(self.packageName).getLogger()
        self.logger.debug("Starting...")

        # Network topology: input -> dense -> dropout -> dense -> softmax output
        layers0 = [
            ("input", InputLayer),
            ("dense0", DenseLayer),
            ("dropout1", DropoutLayer),
            ("dense1", DenseLayer),
            ("output", DenseLayer),
        ]

        self.classifier = NeuralNet(
            layers=layers0,
            input_shape=(None, num_features),
            dense0_num_units=dense0_num_units,
            dropout1_p=dropout_p,
            dense1_num_units=dense1_num_units,
            output_num_units=num_classes,
            output_nonlinearity=softmax,
            update=nesterov_momentum,
            update_learning_rate=update_learning_rate,
            update_momentum=update_momentum,
            eval_size=eval_size,
            max_epochs=max_epochs,
            verbose=verbose,
        )

        self.num_classes = num_classes
        self.num_features = num_features
        self.dense0_num_units = dense0_num_units
        self.dropout_p = dropout_p
        self.dense1_num_units = dense1_num_units
        self.update_learning_rate = update_learning_rate
        self.update_momentum = update_momentum
        self.eval_size = eval_size
        self.max_epochs = max_epochs
        self.verbose = verbose
        return

    def __str__(self):
        # Build a parameter summary (the original returned an undefined name)
        x = self.packageName + '('
        x = x + '\n\t num_classes={0}, num_features={1}'.format(self.num_classes, self.num_features)
        x = x + '\n\t dense0_num_units={0}, dropout_p={1}, dense1_num_units={2}'.format(
            self.dense0_num_units, self.dropout_p, self.dense1_num_units)
        x = x + '\n\t update_learning_rate={0}, update_momentum={1}'.format(
            self.update_learning_rate, self.update_momentum)
        x = x + '\n\t eval_size={0}, max_epochs={1}, verbose={2}'.format(
            self.eval_size, self.max_epochs, self.verbose)
        x = x + ')'
        return x

    def fit(self, X, y):
        self.classifier.fit(X, y)

    def predict(self, X):
        y_pred = self.classifier.predict(X)
        return y_pred

    def predict_proba(self, X):
        y_pred = self.classifier.predict_proba(X)
        return y_pred

    def get_params(self, deep=True):
        return {
            "num_classes": self.num_classes,
            "num_features": self.num_features,
            "dense0_num_units": self.dense0_num_units,
            "dropout_p": self.dropout_p,
            "dense1_num_units": self.dense1_num_units,
            "update_learning_rate": self.update_learning_rate,
            "update_momentum": self.update_momentum,  # was missing; needed for set_params round-trips
            "eval_size": self.eval_size,
            "max_epochs": self.max_epochs,
            "verbose": self.verbose,
        }

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self
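# Usage sketch (hypothetical data; assumes nolearn and Lasagne are installed,
# which pins this class to the Theano-era stack). NeuralNet expects float32
# features and int32 labels, which is why the Runner casts with astype
# before calling fit.
import numpy as np

if __name__ == '__main__':
    X = np.random.rand(200, 20).astype(np.float32)
    y = np.random.randint(0, 3, 200).astype(np.int32)

    nn = NNnolearn(num_classes=3, num_features=20, max_epochs=5)
    nn.fit(X, y)
    print(nn.predict_proba(X[:10]))  # per-class probabilities for 10 rows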
# Imports (the Logger module path is project-specific and assumed here)
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split  # sklearn.model_selection in newer releases

from common.Logger import Logger


class Data:

    packageName = "com.brodagroup.machinelearning.common.Data"
    logger = None

    # Initializer
    def __init__(self):
        self.logger = Logger(self.packageName).getLogger()
        self.parameters = []
        return

    def configure(self):
        return 0

    def load(self, pathCSV):
        # dataframe = pd.read_csv(pathCSV, quotechar='"', skipinitialspace=True)
        dataframe = pd.read_csv(pathCSV)
        return dataframe

    def loadDataFrame(self, pathPKL):
        dataframe = pd.read_pickle(pathPKL)
        return dataframe

    def saveDataFrame(self, dataframe, pathPKL):
        dataframe.to_pickle(pathPKL)
        return

    def segment(self, features, target, totalPct, testingPct, randomState):
        # Use a small subset of the data to lower run-time while testing hypotheses
        numRows, numFeatures = features.shape
        rowsUsed = int(numRows * totalPct)
        xfeatures = features[0:rowsUsed]
        xtarget = target[0:rowsUsed]

        # Note: features represents "X" and target represents "y"
        X_train, X_test, y_train, y_test = train_test_split(
            xfeatures, xtarget, test_size=testingPct, random_state=randomState
        )
        return (X_train, X_test, y_train, y_test)

    def shuffle(self, dataset):
        np.random.shuffle(dataset)
        return

    def sample(self, dataset, count):
        sample = np.random.choice(dataset.index.values, count)
        return sample

    def normalize(self, df, type="std"):
        self.logger.info("Normalizing data, type: {0}".format(type))
        cols = df.columns.values
        inArray = df[cols].values
        outArray = None
        if type == "minmax":
            minmax_scale = preprocessing.MinMaxScaler().fit(inArray)
            outArray = minmax_scale.transform(inArray)
        else:
            std_scale = preprocessing.StandardScaler().fit(inArray)
            outArray = std_scale.transform(inArray)
        df = pd.DataFrame(data=outArray, columns=cols)
        return df

    def threshold(self, X, lower, lvalue, upper, uvalue):
        X[X <= lower] = lvalue
        X[X >= upper] = uvalue
        return X

    def join(self, leftDF, rightDF, onKeys):
        result = pd.merge(leftDF, rightDF, on=onKeys)
        return result

    def categorize(self, df, field):
        self.logger.debug("Categorizing field: {0}, type: {1}".format(field, df[field].dtype))

        # Create and fill new columns for the categorized field
        if df[field].dtype == "object":
            values = list(pd.Series(df[field].values.ravel()).unique())
            self.logger.debug("Field: {0}, has value count: {1}".format(field, len(values)))
            if len(values) > 2:
                for value in values:
                    # Create the new field name based upon the original name
                    # and value. Note -- take missing values into account
                    if pd.isnull(value):
                        xfield = field + "-" + "Missing"
                    else:
                        # Strip commas
                        xvalue = value.replace(",", "")
                        xfield = field + "-" + xvalue

                    # Create and fill in the new column with indicator values
                    df.loc[:, xfield] = -1
                    self.logger.debug("Creating field: {0}, type: {1}".format(xfield, df[xfield].dtype))
                    # df[xfield] = df[field].apply(lambda x: 1 if x == value else 0)
                    df.loc[:, xfield] = df.loc[:, field].apply(lambda x: 1 if x == value else 0)

                # Remove the original field
                self.logger.debug("Dropping field: {0}, type: {1}".format(field, df[field].dtype))
                df = df.drop(field, axis=1)
        return df

    def sync(self, dfA, dfB):
        self.logger.debug("Synchronizing...")
        listA = list(dfA.columns.values)
        self.logger.debug("DataFrame A, columns: {0}".format(listA))
        listB = list(dfB.columns.values)
        self.logger.debug("DataFrame B, columns: {0}".format(listB))

        setA = set(listA)
        setB = set(listB)
        columnsNotInB = setA.difference(setB)
        self.logger.debug("Columns in A but not in B: {0}".format(columnsNotInB))
        for column in columnsNotInB:
            dfB[column] = 0

        columnsNotInA = setB.difference(setA)
        self.logger.debug("Columns in B but not in A: {0}".format(columnsNotInA))
        for column in columnsNotInA:
            dfA[column] = 0
        return (dfA, dfB)

    def prune(self, df, keep=None, remove=None):
        if keep:
            self.logger.info("Pruning, keeping fields: {0}".format(keep))
            df = df[keep]
        if remove:
            self.logger.info("Pruning, removing fields: {0}".format(remove))
            df = df.drop(remove, axis=1)
        return df

    def encodeList(self, columns, dfA, dfB):
        if columns:
            for column in columns:
                dfA, dfB = self.encode(column, dfA, dfB)
        return (dfA, dfB)

    def encode(self, column, dfA, dfB):
        # All input dataframes must be encoded in a consistent fashion and
        # hence cannot be encoded independently, or else each is encoded
        # based upon the values present in that data set alone. That matters
        # whenever the values in the dataframes differ: for example, if
        # dataframes A (dfA) and B (dfB) both have categorical values in the
        # same column, but dfA has values 'Y','N' while dfB has values
        # 'maybe','sometimes','Y','N' and 'almost always', then they would be
        # encoded differently: 'Y' might be encoded as '0' in dfA but '2' in dfB
        lbl = preprocessing.LabelEncoder()
        self.logger.debug("Encoding field: {0}".format(column))
        valuesA = list(dfA[column].values)
        valuesB = list(dfB[column].values)
        values = valuesA + valuesB
        lbl.fit(values)
        # self.logger.debug('Encoding field: {0}, classes: {1}'.format(column, lbl.classes_))
        # xto = lbl.transform(values)
        # xfrom = lbl.inverse_transform(xto)
        # self.logger.debug('Encoding field {0}, FROM: {1}'.format(column, xfrom))
        # self.logger.debug('Encoding field {0}, TO: {1}'.format(column, xto))
        dfA[column] = lbl.transform(valuesA)
        dfB[column] = lbl.transform(valuesB)
        return (dfA, dfB)
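# Usage sketch illustrating why encode() fits one LabelEncoder over both
# frames: with a shared fit, 'Y' maps to the same integer in dfA and dfB even
# though dfB has extra categories. The column name 'answer' is hypothetical.
import pandas as pd

if __name__ == '__main__':
    data = Data()
    dfA = pd.DataFrame({'answer': ['Y', 'N', 'Y']})
    dfB = pd.DataFrame({'answer': ['maybe', 'Y', 'N', 'sometimes']})
    dfA, dfB = data.encode('answer', dfA, dfB)
    # 'Y' now carries the same code in both frames; encoding each frame
    # independently could have mapped 'Y' to different integers
    print(dfA['answer'].tolist(), dfB['answer'].tolist())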
# Imports -- GridSearchCV is taken from the grid_search module that this
# code's grid_scores_ usage implies (sklearn.model_selection in newer
# releases); the project-local module paths below are assumed
import datetime as dt
import json
import os
import urllib.request

import numpy as np
import pandas as pd
from sklearn.grid_search import GridSearchCV

from common.Logger import Logger
from common.Data import Data
from common.Scorer import Scorer
from classifier.ClassifierList import ClassifierList


class Runner():

    packageName = 'com.brodagroup.machinelearning.common.Runner'
    logger = None
    rpt = None
    gridsearchrpt = None
    featurerpt = None
    scoringrpt = None
    preprocessor = None

    # features: dataframe used for fit / learning
    features = None
    # test: dataframe used for prediction
    test = None
    # target: dataframe (single column) of actual/correct values (for scoring)
    target = None
    # expected: dataframe (single column) of actual values (to verify that the algorithm works)
    expected = None
    hasExpected = False
    # y_pred: array of predictions (integer)
    y_pred = None
    # yy_pred: array of prediction probabilities (float)
    yy_pred = None

    # Initializer
    def __init__(self):
        self.logger = Logger(self.packageName).getLogger()
        return

    def dumpConfiguration(self):
        pretty = json.dumps(self.configuration, sort_keys=True, indent=4)
        return(pretty)

    def configure(self, jsonstr=None, file=None, url=None, overrides=None):
        if file:
            self.logger.info('Using configuration file: {0}'.format(file))
            with open(file, encoding='utf-8') as configurationFile:
                configuration = json.loads(configurationFile.read())
        elif url:
            configuration = json.loads(urllib.request.urlopen(url).read())
        elif jsonstr:
            configuration = json.loads(jsonstr)
        else:
            raise RuntimeError('Configuration not provided (json|file|url)')
        self.configuration = configuration
        self.logger.info('Using configuration: {0}'.format(self.dumpConfiguration()))
        self.override(overrides=overrides)

        classifierCode = self.configuration['classifier']
        parameters = self.configuration['parameters']
        classifierList = ClassifierList()
        classifier = classifierList.load(classifierCode, parameters)
        self.classifier = classifier
        return

    def modifyConfiguration(self, dictionary, name, value, iter):
        iter = iter + 1
        parts = name.split('.')
        name = parts[0]
        if type(dictionary[name]) is dict:
            if iter > 3:
                raise RuntimeError('Error -- too many levels in configuration')
            # Descend into the nested dictionary, keeping any remaining
            # dotted path components
            xdict = dictionary[name]
            xname = '.'.join(parts[1:])
            self.modifyConfiguration(xdict, xname, value, iter)
        else:
            dictionary[name] = value
            self.logger.info('Setting name: {0} to value: {1}'.format(name, value))
        return(name)

    def override(self, overrides=None):
        if overrides:
            self.logger.info('Overriding parameters: {0}'.format(overrides))
            for nvp in overrides:
                x = nvp.split(':')
                name = x[0]
                value = x[1]
                self.modifyConfiguration(self.configuration, name, value, 0)
            self.logger.info('Using new configuration: {0}'.format(self.dumpConfiguration()))
        return

    # Named setPreprocessor so the method does not clash with the
    # "preprocessor" attribute it assigns
    def setPreprocessor(self, c):
        self.logger.info('Setting preprocessor')
        self.preprocessor = c
        return

    def load(self):
        self.logger.info('Loading data')
        data = Data()
        trainCSV = self.configuration['trainCSV']
        testCSV = self.configuration['testCSV']
        featuresPKL = self.configuration['featuresPKL']
        targetPKL = self.configuration['targetPKL']
        testPKL = self.configuration['testPKL']
        expectedCSV = None
        expectedPKL = None
        try:
            expectedCSV = self.configuration['expectedCSV']
            expectedPKL = self.configuration['expectedPKL']
        except KeyError:
            pass

        # If the pickled dataframe files exist, then load them. Otherwise,
        # load the CSVs, preprocess them, and save them as PKL files, which
        # reduces load times on subsequent runs
        tmpFeatures = None
        tmpTarget = None
        tmpTest = None
        if os.path.exists(featuresPKL):
            self.logger.info('Loading train PKL: {0}'.format(featuresPKL))
            tmpFeatures = data.loadDataFrame(featuresPKL)
            self.logger.info('Loading target PKL: {0}'.format(targetPKL))
            tmpTarget = data.loadDataFrame(targetPKL)
            self.logger.info('Loading test PKL: {0}'.format(testPKL))
            tmpTest = data.loadDataFrame(testPKL)
        else:
            self.logger.info('Loading train CSV: {0}'.format(trainCSV))
            rawtrain = data.load(trainCSV)
            self.logger.info('Loading test CSV: {0}'.format(testCSV))
            rawtest = data.load(testCSV)

            # Preprocess the data
            tmpFeatures, tmpTarget, tmpTest = self.preprocessor.execute(rawtrain, rawtest)

            # Save the dataframes (lower load times)
            self.logger.info('Saving features PKL: {0}'.format(featuresPKL))
            data.saveDataFrame(tmpFeatures, featuresPKL)
            self.logger.info('Saving target PKL: {0}'.format(targetPKL))
            data.saveDataFrame(tmpTarget, targetPKL)
            self.logger.info('Saving test PKL: {0}'.format(testPKL))
            data.saveDataFrame(tmpTest, testPKL)

        if expectedPKL and os.path.exists(expectedPKL):
            self.logger.info('Loading expected PKL: {0}'.format(expectedPKL))
            tmpExpected = data.loadDataFrame(expectedPKL)
            self.hasExpected = True
        elif expectedCSV and os.path.exists(expectedCSV):
            self.logger.info('Loading expected CSV: {0}'.format(expectedCSV))
            tmpExpected = data.load(expectedCSV)
            self.logger.info('Saving expected PKL: {0}'.format(expectedPKL))
            data.saveDataFrame(tmpExpected, expectedPKL)
            self.hasExpected = True

        self.features = tmpFeatures
        self.target = tmpTarget
        self.test = tmpTest
        if self.hasExpected:
            self.expected = tmpExpected
        return

    # Segment the TRAINING set into a smaller cross validation set of data
    def segment(self):
        self.logger.info('Segmenting...')
        data = Data()
        totalpct = float(self.configuration['totalpct'])
        testpct = float(self.configuration['testpct'])
        randomstate = int(self.configuration['randomstate'])
        X_train, X_test, y_train, y_test = data.segment(
            self.features, self.target, totalpct, testpct, randomstate)
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        return

    def fit(self):
        npXTrain = np.array(self.X_train).astype(np.float32)
        npyTrain = np.array(self.y_train).astype(np.int32)
        self.classifier.fit(npXTrain, npyTrain)
        return

    def crossvalidate(self):
        npXTrain = np.array(self.X_train).astype(np.float32)
        npyTrain = np.array(self.y_train).astype(np.int32)
        rptDF = self.classifier.crossvalidate(npXTrain, npyTrain)
        self.crossvalidationDF = rptDF
        pd.set_option('display.max_rows', 10000)
        self.logger.info('Cross Validation Report\n{0}'.format(rptDF))
        return

    def gridsearch(self, use=True, score='roc_auc'):
        self.logger.info('Executing grid search...')
        parameters = self.configuration['gridsearch']
        x = GridSearchCV(self.classifier, parameters, cv=6, scoring=score, verbose=10, n_jobs=6)
        # x = GridSearchCV(self.classifier, parameters, cv=5, scoring=score, verbose=10)
        npXTrain = np.array(self.X_train).astype(np.float32)
        npyTrain = np.array(self.y_train).astype(np.int32)
        x.fit(npXTrain, npyTrain)

        rpt = 'Grid Search Analysis \t\t' + str(dt.datetime.now())
        rpt = rpt + '\n\nParameters {0}'.format(parameters)
        rpt = rpt + '\n\nBest parameters set found:'
        rpt = rpt + '\n\t' + '{0}'.format(x.best_estimator_)
        rpt = rpt + '\n\nGrid Search Scores (using {0}):'.format(score)
        rpt = rpt + '\nSCORE\t\tSTDDEV(+/-)\tPARAMETERS:'
        # Note: grid_scores_ was removed in newer scikit-learn (use cv_results_ there)
        for params, mean_score, scores in x.grid_scores_:
            rpt = rpt + '\n' + '{0:0.7f}\t{1:0.7f}'.format(mean_score, scores.std() / 2)
            for key in params:
                value = params[key]
                rpt = rpt + '\t\t{0}\t\t{1}'.format(key, value)

        if use:
            self.classifier = x.best_estimator_
        self.gridsearchrpt = rpt
        return(rpt)

    def importance(self):
        self.logger.info('Creating feature importance report...')
        rpt = 'Feature Importance \t\t' + str(dt.datetime.now())
        if self.classifier is None:
            return(rpt)

        if hasattr(self.classifier, 'importance'):
            df = self.classifier.importance(self.X_train.columns.values)
            rpt = rpt + '\n\n{0}'.format(df)

        if hasattr(self.classifier, 'feature_importances_'):
            fi = pd.DataFrame(self.classifier.feature_importances_)
            columns = pd.DataFrame(self.X_train.columns.values)
            result = pd.concat([columns, fi], axis=1)
            result.columns = ['Feature', 'Importance']
            # DataFrame.sort was removed in newer pandas; sort_values replaces it
            ranked = result.sort_values(['Importance', 'Feature'], ascending=[False, True])
            rpt = rpt + '\n{0}'.format(ranked)
            # pd.set_option('display.max_rows', len(ranked))
            # pd.reset_option('display.max_rows')

        self.featurerpt = rpt
        return(rpt)

    def score(self):
        self.logger.info('Scoring...')
        npXTest = np.array(self.X_test).astype(np.float32)
        y_pred = self.classifier.predict(npXTest)
        yy_pred = self.classifier.predict_proba(npXTest)[:, 1]
        self.logger.debug('Features shape: {0}, test shape: {1}'.format(
            self.features.shape, self.test.shape))

        reportName = 'Cross Verification Data Report \t\t' + str(dt.datetime.now())
        scorer = Scorer()
        y_test = self.y_test
        rpt = scorer.score(
            y_test, y_pred, yy_pred,
            classifier=self.classifier,
            title=reportName,
            configuration=self.configuration
        )
        self.y_pred = y_pred
        self.yy_pred = yy_pred
        self.scoringrpt = rpt
        return(rpt)

    def inspect(self, name):
        x = getattr(self, name)
        return(x)

    def inquire(self, name):
        x = hasattr(self, name)
        return(x)

    def inject(self, name, value):
        setattr(self, name, value)
        return

    def report(self):
        self.logger.info('Executing full report')
        rpt = '\nFull Report\n'
        if self.featurerpt:
            rpt = rpt + '\n\n{0}'.format(self.featurerpt)
        if self.scoringrpt:
            rpt = rpt + '\n\n{0}'.format(self.scoringrpt)
        if self.gridsearchrpt:
            rpt = rpt + '\n\n{0}'.format(self.gridsearchrpt)
        self.rpt = rpt
        return(rpt)

    def predict(self):
        self.logger.info('Predicting...')
        submissionSample = self.configuration['submissionSample']
        submissionDir = self.configuration['submissionDir']
        timestamp = dt.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
        submissionVFile = submissionDir + '/' + 'submission-values-' + timestamp + '.csv'
        submissionPFile = submissionDir + '/' + 'submission-probabilities-' + timestamp + '.csv'
        submissionLog = submissionDir + '/' + 'submission-' + timestamp + '.txt'

        npTest = np.array(self.test).astype(np.float32)
        y_pred = self.classifier.predict(npTest)
        yy_pred = self.classifier.predict_proba(npTest)[:, 1]

        predictionrpt = None
        if self.hasExpected:
            self.logger.debug('Target is available... Scoring target')
            # The second column contains the actual values
            y_test = self.expected.iloc[:, 1]
            scorer = Scorer()
            reportName = '\nTarget Data Prediction Report \t\t' + timestamp
            predictionrpt = scorer.score(
                y_test, y_pred, yy_pred,
                classifier=self.classifier,
                title=reportName,
                configuration=self.configuration
            )
            print(predictionrpt)

        sample = pd.read_csv(submissionSample)
        sample.QuoteConversion_Flag = y_pred
        sample.to_csv(submissionVFile, index=False)

        probabilities = pd.read_csv(submissionSample)
        probabilities.QuoteConversion_Flag = yy_pred
        probabilities.to_csv(submissionPFile, index=False)

        mfeatures, nfeatures = self.features.shape
        mtest, ntest = self.test.shape
        mxtrain, nxtrain = self.X_train.shape
        mxtest, nxtest = self.X_test.shape

        self.logger.debug('Saving submission information')
        with open(submissionLog, 'a') as f:
            f.write('Submission Report \t\t\t Generated at: {0}'.format(timestamp))
            f.write('\n\nData Statistics:')
            f.write('\n\tFeature data: \trows: {0}, columns: {1}'.format(mfeatures, nfeatures))
            f.write('\n\tTest data: \t\trows: {0}, columns: {1}'.format(mtest, ntest))
            f.write('\n\nCross Validation Statistics:')
            f.write('\n\tTraining data: \trows: {0}, columns: {1}'.format(mxtrain, nxtrain))
            f.write('\n\tTest data: \t\trows: {0}, columns: {1}'.format(mxtest, nxtest))
            f.write('\n\nValues file:\t\t{0}'.format(submissionVFile))
            f.write('\nProbabilities file:\t{0}'.format(submissionPFile))
            f.write('\n')
            f.write('{0}'.format(self.report()))
            if predictionrpt:
                f.write('\n\n{0}'.format(predictionrpt))

        return(submissionLog, submissionVFile, submissionPFile, self.classifier)
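# End-to-end usage sketch. The configuration keys mirror those read by load()
# and segment(); the file name and preprocessor class here are hypothetical,
# and the preprocessor must expose execute(rawtrain, rawtest) returning
# (features, target, test) as load() expects.
if __name__ == '__main__':
    runner = Runner()
    runner.configure(file='configuration.json',      # hypothetical path
                     overrides=['classifier:xgb'])   # name:value override pairs
    runner.setPreprocessor(MyPreprocessor())         # hypothetical implementation
    runner.load()       # CSV -> preprocess -> cached PKL dataframes
    runner.segment()    # carve out the cross validation split
    runner.fit()        # train on X_train / y_train
    print(runner.score())   # cross validation scoring report
    runner.predict()    # write submission files and the full report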