# Imports required by this module. Logger, Scorer, and ClassifierList are
# project-local helpers, so the import paths shown for them are assumptions.
import json
import os
import urllib.request
import datetime as dt

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV, train_test_split

from Logger import Logger
from Scorer import Scorer
from ClassifierList import ClassifierList


class Data:
    packageName = "com.brodagroup.machinelearning.common.Data"
    logger = None

    # Initializer
    def __init__(self):
        self.logger = Logger(self.packageName).getLogger()
        self.parameters = []
        return

    def configure(self):
        return 0

    def load(self, pathCSV):
        # dataframe = pd.read_csv(pathCSV, quotechar='"', skipinitialspace=True)
        dataframe = pd.read_csv(pathCSV)
        return dataframe

    def loadDataFrame(self, pathPKL):
        dataframe = pd.read_pickle(pathPKL)
        return dataframe

    def saveDataFrame(self, dataframe, pathPKL):
        dataframe.to_pickle(pathPKL)
        return

    def segment(self, features, target, totalPct, testingPct, randomState):
        # Use a small subset of the data to test a hypothesis at lower run-time
        numRows, numFeatures = features.shape
        rowsUsed = int(numRows * totalPct)
        xfeatures = features[0:rowsUsed]
        xtarget = target[0:rowsUsed]

        # Note: features represents "X" and target represents "y"
        X_train, X_test, y_train, y_test = train_test_split(
            xfeatures, xtarget, test_size=testingPct, random_state=randomState
        )
        return (X_train, X_test, y_train, y_test)

    def shuffle(self, dataset):
        # Shuffles in place; np.random.shuffle returns None
        np.random.shuffle(dataset)
        return

    def sample(self, dataset, count):
        sample = np.random.choice(dataset.index.values, count)
        return sample

    def normalize(self, df, type="std"):
        self.logger.info("Normalizing data, type: {0}".format(type))
        cols = df.columns.values
        inArray = df[cols].values
        outArray = None
        if type == "minmax":
            minmax_scale = preprocessing.MinMaxScaler().fit(inArray)
            outArray = minmax_scale.transform(inArray)
        else:
            std_scale = preprocessing.StandardScaler().fit(inArray)
            outArray = std_scale.transform(inArray)
        df = pd.DataFrame(data=outArray, columns=cols)
        return df

    def threshold(self, X, lower, lvalue, upper, uvalue):
        X[X <= lower] = lvalue
        X[X >= upper] = uvalue
        return X

    def join(self, leftDF, rightDF, onKeys):
        result = pd.merge(leftDF, rightDF, on=onKeys)
        return result

    def categorize(self, df, field):
        self.logger.debug("Categorizing field: {0}, type: {1}".format(field, df[field].dtype))

        # Create and fill new columns for the categorized field
        if df[field].dtype == "object":
            values = list(pd.Series(df[field].values.ravel()).unique())
            self.logger.debug("Field: {0}, has value count: {1}".format(field, len(values)))
            if len(values) > 2:
                for value in values:
                    # Create the new field name based upon the original name and value
                    # Note -- take into account missing values
                    if pd.isnull(value):
                        xfield = field + "-" + "Missing"
                    else:
                        # Strip commas
                        xvalue = value.replace(",", "")
                        xfield = field + "-" + xvalue

                    # Create and fill in the new column with indicator values
                    df.loc[:, xfield] = -1
                    self.logger.debug("Creating field: {0}, type: {1}".format(xfield, df[xfield].dtype))
                    df.loc[:, xfield] = df.loc[:, field].apply(lambda x: 1 if x == value else 0)

                # Remove the original field
                self.logger.debug("Dropping field: {0}, type: {1}".format(field, df[field].dtype))
                df = df.drop(field, axis=1)
        return df

    def sync(self, dfA, dfB):
        self.logger.debug("Synchronizing...")
        listA = list(dfA.columns.values)
        self.logger.debug("DataFrame A, columns: {0}".format(listA))
        listB = list(dfB.columns.values)
        self.logger.debug("DataFrame B, columns: {0}".format(listB))

        setA = set(listA)
        setB = set(listB)

        columnsNotInB = setA.difference(setB)
        self.logger.debug("Columns in A but not in B: {0}".format(columnsNotInB))
        for column in columnsNotInB:
            dfB[column] = 0

        columnsNotInA = setB.difference(setA)
        self.logger.debug("Columns in B but not in A: {0}".format(columnsNotInA))
        for column in columnsNotInA:
            dfA[column] = 0
        return (dfA, dfB)

    def prune(self, df, keep=None, remove=None):
        if keep:
            self.logger.info("Pruning, keeping fields: {0}".format(keep))
            df = df[keep]
        if remove:
            self.logger.info("Pruning, removing fields: {0}".format(remove))
            df = df.drop(remove, axis=1)
        return df

    def encodeList(self, columns, dfA, dfB):
        if columns:
            for column in columns:
                dfA, dfB = self.encode(column, dfA, dfB)
        return (dfA, dfB)

    def encode(self, column, dfA, dfB):
        # Note that all input dataframes must be encoded in the same fashion,
        # so this can not be done per dataframe independently: each frame would
        # then be encoded based only upon the values present in it. That is
        # harmless unless the value sets differ. For example, if dataframes A
        # (dfA) and B (dfB) both have categorical values in the same column,
        # but dfA has values 'Y','N' while dfB has 'maybe','sometimes','Y','N',
        # and 'almost always', then they will be encoded differently: 'Y' may
        # be encoded as '0' in dfA but '2' in dfB. Fitting the encoder on the
        # union of both frames' values avoids this.
        lbl = preprocessing.LabelEncoder()
        self.logger.debug("Encoding field: {0}".format(column))
        valuesA = list(dfA[column].values)
        valuesB = list(dfB[column].values)
        values = valuesA + valuesB
        lbl.fit(values)
        dfA[column] = lbl.transform(valuesA)
        dfB[column] = lbl.transform(valuesB)
        return (dfA, dfB)
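
# A minimal sketch (not part of the original source) illustrating the joint
# encoding behaviour documented in Data.encode: fitting the LabelEncoder on
# the union of both frames' values keeps the integer codes consistent across
# frames. The column name 'flag' and the sample values are hypothetical.
#
#     data = Data()
#     dfA = pd.DataFrame({'flag': ['Y', 'N', 'Y']})
#     dfB = pd.DataFrame({'flag': ['maybe', 'Y', 'N', 'sometimes']})
#     dfA, dfB = data.encode('flag', dfA, dfB)
#     # 'Y' now maps to the same integer in both dfA and dfB
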
class Runner:
    packageName = 'com.brodagroup.machinelearning.common.Runner'
    logger = None
    rpt = None
    gridsearchrpt = None
    featurerpt = None
    scoringrpt = None
    preprocessor = None

    # features: dataframe used for fit / learning
    features = None
    # test: dataframe used for prediction
    test = None
    # target: dataframe (single column) of actual/correct values (for scoring)
    target = None
    # expected: dataframe (single column) of actual values (for verification that the algo works)
    expected = None
    hasExpected = False
    # y_pred: array of predictions (integer)
    y_pred = None
    # yy_pred: array of prediction probabilities (float)
    yy_pred = None

    # Initializer
    def __init__(self):
        self.logger = Logger(self.packageName).getLogger()
        return

    def dumpConfiguration(self):
        pretty = json.dumps(self.configuration, sort_keys=True, indent=4)
        return(pretty)

    def configure(self, jsonstr=None, file=None, url=None, overrides=None):
        if file:
            self.logger.info('Using configuration file: {0}'.format(file))
            with open(file, encoding='utf-8') as configurationFile:
                configuration = json.loads(configurationFile.read())
        elif url:
            configuration = json.loads(urllib.request.urlopen(url).read())
        elif jsonstr:
            configuration = json.loads(jsonstr)
        else:
            raise RuntimeError('Configuration not provided (json|file|url)')
        self.configuration = configuration
        self.logger.info('Using configuration: {0}'.format(self.dumpConfiguration()))
        self.override(overrides=overrides)

        classifierCode = self.configuration['classifier']
        parameters = self.configuration['parameters']
        classifierList = ClassifierList()
        classifier = classifierList.load(classifierCode, parameters)
        self.classifier = classifier
        return

    def modifyConfiguration(self, dictionary, name, value, iter):
        iter = iter + 1
        parts = name.split('.')
        name = parts[0]
        if type(dictionary[name]) is dict:
            # Recurse into the nested dictionary, consuming one dotted
            # segment of the name per level
            xdict = dictionary[name]
            if iter > 3:
                raise RuntimeError('Error -- too many levels in configuration')
            xname = '.'.join(parts[1:])
            self.modifyConfiguration(xdict, xname, value, iter)
        else:
            dictionary[name] = value
            self.logger.info('Setting name: {0} to value: {1}'.format(name, value))
        return(name)

    def override(self, overrides=None):
        if overrides:
            self.logger.info('Overriding parameters: {0}'.format(overrides))
            for nvp in overrides:
                # Each override is a 'name:value' pair
                name, value = nvp.split(':', 1)
                self.modifyConfiguration(self.configuration, name, value, 0)
            self.logger.info('Using new configuration: {0}'.format(self.dumpConfiguration()))
        return

    def preprocessor(self, c):
        # Note: this assignment shadows the method on the instance; after the
        # first call, self.preprocessor refers to the preprocessor object
        self.logger.info('Setting preprocessor')
        self.preprocessor = c
        return

    def load(self):
        self.logger.info('Loading data')
        data = Data()
        trainCSV = self.configuration['trainCSV']
        testCSV = self.configuration['testCSV']
        featuresPKL = self.configuration['featuresPKL']
        targetPKL = self.configuration['targetPKL']
        testPKL = self.configuration['testPKL']
        expectedCSV = None
        expectedPKL = None
        try:
            expectedCSV = self.configuration['expectedCSV']
            expectedPKL = self.configuration['expectedPKL']
        except KeyError:
            pass

        # If the dataframe (pickled) file exists, then load it.
        # Otherwise, load the CSV, preprocess it, and then save it as a
        # PKL file, which will reduce load times.
        tmpFeatures = None
        tmpTarget = None
        tmpTest = None
        if os.path.exists(featuresPKL):
            self.logger.info('Loading train PKL: {0}'.format(featuresPKL))
            tmpFeatures = data.loadDataFrame(featuresPKL)
            self.logger.info('Loading target PKL: {0}'.format(targetPKL))
            tmpTarget = data.loadDataFrame(targetPKL)
            self.logger.info('Loading test PKL: {0}'.format(testPKL))
            tmpTest = data.loadDataFrame(testPKL)
        else:
            self.logger.info('Loading train CSV: {0}'.format(trainCSV))
            rawtrain = data.load(trainCSV)
            self.logger.info('Loading test CSV: {0}'.format(testCSV))
            rawtest = data.load(testCSV)

            # Preprocess the data
            tmpFeatures, tmpTarget, tmpTest = self.preprocessor.execute(rawtrain, rawtest)

            # Save the dataframes (lower load times)
            self.logger.info('Saving features PKL: {0}'.format(featuresPKL))
            data.saveDataFrame(tmpFeatures, featuresPKL)
            self.logger.info('Saving target PKL: {0}'.format(targetPKL))
            data.saveDataFrame(tmpTarget, targetPKL)
            self.logger.info('Saving test PKL: {0}'.format(testPKL))
            data.saveDataFrame(tmpTest, testPKL)

        tmpExpected = None
        if expectedPKL and os.path.exists(expectedPKL):
            self.logger.info('Loading expected PKL: {0}'.format(expectedPKL))
            tmpExpected = data.loadDataFrame(expectedPKL)
            self.hasExpected = True
        elif expectedCSV and os.path.exists(expectedCSV):
            self.logger.info('Loading expected CSV: {0}'.format(expectedCSV))
            tmpExpected = data.load(expectedCSV)
            self.logger.info('Saving expected PKL: {0}'.format(expectedPKL))
            data.saveDataFrame(tmpExpected, expectedPKL)
            self.hasExpected = True

        self.features = tmpFeatures
        self.target = tmpTarget
        self.test = tmpTest
        if self.hasExpected:
            self.expected = tmpExpected
        return

    # Segment the TRAINING set into a smaller cross-validation set of data
    def segment(self):
        self.logger.info('Segmenting...')
        data = Data()
        totalpct = float(self.configuration['totalpct'])
        testpct = float(self.configuration['testpct'])
        randomstate = int(self.configuration['randomstate'])
        X_train, X_test, y_train, y_test = data.segment(
            self.features, self.target, totalpct, testpct, randomstate)
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        return

    def fit(self):
        npXTrain = np.array(self.X_train).astype(np.float32)
        npyTrain = np.array(self.y_train).astype(np.int32)
        self.classifier.fit(npXTrain, npyTrain)
        return

    def crossvalidate(self):
        npXTrain = np.array(self.X_train).astype(np.float32)
        npyTrain = np.array(self.y_train).astype(np.int32)
        rptDF = self.classifier.crossvalidate(npXTrain, npyTrain)
        self.crossvalidationDF = rptDF
        pd.set_option('display.max_rows', 10000)
        self.logger.info('Cross Validation Report\n{0}'.format(rptDF))
        return

    def gridsearch(self, use=True, score='roc_auc'):
        self.logger.info('Executing grid search...')
        parameters = self.configuration['gridsearch']
        x = GridSearchCV(self.classifier, parameters, cv=6, scoring=score, verbose=10, n_jobs=6)
        # x = GridSearchCV(self.classifier, parameters, cv=5, scoring=score, verbose=10)
        npXTrain = np.array(self.X_train).astype(np.float32)
        npyTrain = np.array(self.y_train).astype(np.int32)
        x.fit(npXTrain, npyTrain)

        rpt = 'Grid Search Analysis \t\t' + str(dt.datetime.now())
        rpt = rpt + '\n\nParameters {0}'.format(parameters)
        rpt = rpt + '\n\nBest parameters set found:'
        rpt = rpt + '\n\t' + '{0}'.format(x.best_estimator_)
        rpt = rpt + '\n\nGrid Search Scores (using {0}):'.format(score)
        rpt = rpt + '\nSCORE\t\tSTDDEV(+/-)\tPARAMETERS:'
        # Note: older scikit-learn exposed this via x.grid_scores_; current
        # versions provide the same information through x.cv_results_
        results = x.cv_results_
        for mean_score, std_score, params in zip(
                results['mean_test_score'], results['std_test_score'], results['params']):
            rpt = rpt + '\n' + '{0:0.7f}\t{1:0.7f}'.format(mean_score, std_score / 2)
            for key in params:
                value = params[key]
                rpt = rpt + '\t\t{0}\t\t{1}'.format(key, value)
        if use:
            self.classifier = x.best_estimator_
        self.gridsearchrpt = rpt
        return(rpt)

    def importance(self):
        self.logger.info('Creating feature importance report...')
        rpt = 'Feature Importance \t\t' + str(dt.datetime.now())
        if self.classifier is None:
            return(rpt)

        if hasattr(self.classifier, 'importance'):
            df = self.classifier.importance(self.X_train.columns.values)
            rpt = rpt + '\n\n{0}'.format(df)

        if hasattr(self.classifier, 'feature_importances_'):
            fi = pd.DataFrame(self.classifier.feature_importances_)
            columns = pd.DataFrame(self.X_train.columns.values)
            result = pd.concat([columns, fi], axis=1)
            result.columns = ['Feature', 'Importance']
            # DataFrame.sort() was removed from pandas; sort_values() is the replacement
            ranked = result.sort_values(by=['Importance', 'Feature'], ascending=[False, True])
            rpt = rpt + '\n{0}'.format(ranked)
            # pd.set_option('display.max_rows', len(ranked))
            # pd.reset_option('display.max_rows')

        self.featurerpt = rpt
        return(rpt)

    def score(self):
        self.logger.info('Scoring...')
        npXTest = np.array(self.X_test).astype(np.float32)
        y_pred = self.classifier.predict(npXTest)
        yy_pred = self.classifier.predict_proba(npXTest)[:, 1]
        self.logger.debug('Features shape: {0}, test shape: {1}'.format(
            self.features.shape, self.test.shape))

        reportName = 'Cross Verification Data Report \t\t' + str(dt.datetime.now())
        scorer = Scorer()
        y_test = self.y_test
        rpt = scorer.score(
            y_test, y_pred, yy_pred,
            classifier=self.classifier,
            title=reportName,
            configuration=self.configuration
        )
        self.y_pred = y_pred
        self.yy_pred = yy_pred
        self.scoringrpt = rpt
        return(rpt)

    def inspect(self, name):
        return(getattr(self, name))

    def inquire(self, name):
        return(hasattr(self, name))

    def inject(self, name, value):
        return(setattr(self, name, value))

    def report(self):
        self.logger.info('Executing full report')
        rpt = '\nFull Report\n'
        if self.featurerpt:
            rpt = rpt + '\n\n{0}'.format(self.featurerpt)
        if self.scoringrpt:
            rpt = rpt + '\n\n{0}'.format(self.scoringrpt)
        if self.gridsearchrpt:
            rpt = rpt + '\n\n{0}'.format(self.gridsearchrpt)
        self.rpt = rpt
        return(rpt)

    def predict(self):
        self.logger.info('Predicting...')
        submissionSample = self.configuration['submissionSample']
        submissionDir = self.configuration['submissionDir']
        timestamp = dt.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
        submissionVFile = submissionDir + '/' + 'submission-values-' + timestamp + '.csv'
        submissionPFile = submissionDir + '/' + 'submission-probabilities-' + timestamp + '.csv'
        submissionLog = submissionDir + '/' + 'submission-' + timestamp + '.txt'

        npTest = np.array(self.test).astype(np.float32)
        y_pred = self.classifier.predict(npTest)
        yy_pred = self.classifier.predict_proba(npTest)[:, 1]

        predictionrpt = None
        if self.hasExpected:
            self.logger.debug('Target is available... Scoring target')
            # The second column contains the actual values
            y_test = self.expected.iloc[:, 1]
            scorer = Scorer()
            reportName = '\nTarget Data Prediction Report \t\t' + timestamp
            predictionrpt = scorer.score(
                y_test, y_pred, yy_pred,
                classifier=self.classifier,
                title=reportName,
                configuration=self.configuration
            )
            print(predictionrpt)

        sample = pd.read_csv(submissionSample)
        sample.QuoteConversion_Flag = y_pred
        sample.to_csv(submissionVFile, index=False)

        probabilities = pd.read_csv(submissionSample)
        probabilities.QuoteConversion_Flag = yy_pred
        probabilities.to_csv(submissionPFile, index=False)

        mfeatures, nfeatures = self.features.shape
        mtest, ntest = self.test.shape
        mxtrain, nxtrain = self.X_train.shape
        mxtest, nxtest = self.X_test.shape

        self.logger.debug('Saving submission information')
        with open(submissionLog, 'a') as f:
            f.write('Submission Report \t\t\t Generated at: {0}'.format(timestamp))
            f.write('\n\nData Statistics:')
            f.write('\n\tFeature data: \trows: {0}, columns: {1}'.format(mfeatures, nfeatures))
            f.write('\n\tTest data: \t\trows: {0}, columns: {1}'.format(mtest, ntest))
            f.write('\n\nCross Validation Statistics:')
            f.write('\n\tTraining data: \trows: {0}, columns: {1}'.format(mxtrain, nxtrain))
            f.write('\n\tTest data: \t\trows: {0}, columns: {1}'.format(mxtest, nxtest))
            f.write('\n\nValues file:\t\t{0}'.format(submissionVFile))
            f.write('\nProbabilities file:\t{0}'.format(submissionPFile))
            f.write('\n')
            f.write('{0}'.format(self.report()))
            if predictionrpt:
                f.write('\n\n{0}'.format(predictionrpt))
        return(submissionLog, submissionVFile, submissionPFile, self.classifier)
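
# A minimal driver sketch (not part of the original source) showing the
# intended call sequence for Runner. The configuration file name and the
# preprocessor object are assumptions; the preprocessor must expose
# execute(rawtrain, rawtest) returning (features, target, test) dataframes,
# as consumed in Runner.load().
#
#     runner = Runner()
#     runner.configure(file='configuration.json')
#     runner.preprocessor(MyPreprocessor())  # instance attribute now holds the object
#     runner.load()
#     runner.segment()
#     runner.fit()
#     runner.score()
#     print(runner.report())
#     runner.predict()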