def run(self, _training, _model, _batchSize, _resultFile):
    """Evaluate a model on growing prefixes of a shuffled training set.

    The training CSV is loaded, shuffled via ``randomize(1000)`` and
    stripped of its indices. For every full batch, a subset containing
    the first ``(i + 1) * _batchSize`` rows is written to
    ``self.resultFolder`` and evaluated with ``Experiment.regression``
    using 10 folds. The header/data pairs returned by the experiments are
    accumulated in a ``ResultMatrix`` that is saved to ``_resultFile``.

    Parameters:
        _training: argument handed to ``CSV(...)`` to load the training data.
        _model: model instance passed to ``Experiment.regression``.
        _batchSize: number of rows added to the subset per step.
        _resultFile: destination passed to ``ResultMatrix.save``.
    """
    samples = CSV(_training)
    samples.randomize(1000)
    samples.removeIndices()

    results = ResultMatrix()

    # Only complete batches are evaluated; a trailing partial batch is ignored.
    batchCount = int(len(samples.data) / _batchSize)
    for batchIndex in range(batchCount):
        # The subset always starts at row 0 and grows by one batch per step.
        partial = CSV()
        partial.header = samples.header
        partial.data = samples.data[:(batchIndex + 1) * _batchSize]

        subsetPath = self.resultFolder + "subset_" + str(batchIndex) + ".csv"
        partial.save(subsetPath)

        experiment = Experiment(subsetPath)
        header, data = experiment.regression([_model], 10)
        results.add(header, data)

    results.save(_resultFile)
def exportWeights(self, _features, _file):
    """Write the stored weights as comma-separated rows to a CSV file.

    One row is produced per entry of ``self.weights``. Each row contains
    the weight of every name in ``_features`` (0 when the entry has no
    weight for that feature). If ``self.classes`` is non-empty, the header
    is prefixed with ``class0``/``class1`` and each row is prefixed with
    the matching ``self.classes`` entry.

    Parameters:
        _features: list of feature names defining the column order.
        _file: destination passed to ``CSV.save``.
    """
    table = CSV()
    hasClasses = len(self.classes) > 0

    table.header = ['class0', 'class1'] + _features if hasClasses else _features

    for index in range(len(self.weights)):
        entry = self.weights[index]
        # Features without a stored weight are exported as 0.
        cells = [
            str(entry[feature] if feature in entry else 0)
            for feature in _features
        ]
        if hasClasses:
            # self.classes[index] is concatenated with a list of strings
            # and joined below.
            cells = self.classes[index] + cells
        table.data.append(','.join(cells))

    table.save(_file)
from models.randomforest.RandomForest import RandomForest
from experiment.Experiment import Experiment
from code.CodeGenerator import CodeGenerator
from data.CSV import CSV
from data.ResultMatrix import ResultMatrix
import numpy as np
import matplotlib.pyplot as plt
from plot.PlotTool import PlotTool
from plot.ResultVisualizer import ResultVisualizer

# Training data set and random forest configuration (10 trees, depth 5).
training = "../examples/mnoA.csv"
model = RandomForest()
model.config.trees = 10
model.config.depth = 5

# Run a 10-fold cross validation; results are stored under the experiment
# id "example_rf_mdi".
e = Experiment(training, "example_rf_mdi")
e.regression([model], 10)

# Load features_0.csv from the experiment folder (presumably the per-feature
# importance values -- confirm), normalize each row, sort by mean and store
# the result as rf_features.csv.
featureTable = CSV(e.path("features_0.csv"))
M = featureTable.toMatrix()
M.normalizeRows()
M.sortByMean()
M.save(e.path("rf_features.csv"))

# Plot the saved matrix as a bar chart; the PNG is named after the
# experiment id.
visualizer = ResultVisualizer()
visualizer.barchart(
    e.path("rf_features.csv"),
    xlabel="Feature",
    ylabel="Relative Feature Importance",
    savePNG=e.path(e.id + ".png")
)
# NOTE(review): `training`, `model` and `csv` are used below but are not
# defined in this part of the script; they are expected to be set up
# earlier (csv presumably holds the loaded training data) -- confirm.

# Baseline run on the unmodified training data: 10-fold cross validation,
# then copy cv_0.csv to subset_0.csv so it can be plotted next to the
# reduced feature sets. "None" labels the run with no feature removed.
e = Experiment(training, "example_feature_reduction")
e.regression([model], 10)
baseline = CSV(e.path("cv_0.csv"))
baseline.save(e.path("subset_0.csv"))
xTicks = ["None"]

# Obtain a feature ranking from features_0.csv.
M = CSV(e.path("features_0.csv")).toMatrix()
M.normalizeRows()
M.sortByMean()

# Repeatedly drop the last entry of the ranking header from the training
# data and retrain until a single feature is left.
# Assumes sortByMean() leaves the least important feature last -- confirm.
subset = e.path("subset.csv")
for i in range(len(M.header) - 1):
    key = M.header[-1]
    M.header = M.header[:-1]

    csv.removeColumnWithKey(key)
    csv.save(subset)

    # Retrain on the reduced data and keep its cv_0.csv as subset_<i+1>.csv.
    e = Experiment(subset, "example_feature_reduction")
    e.regression([model], 10)
    reduced = CSV(e.path("cv_0.csv"))
    reduced.save(e.path("subset_" + str(i + 1) + ".csv"))
    xTicks.append(key)

# One box plot of the "r2" column per stored subset file, labelled with the
# feature removed in that step.
files = [
    e.path("subset_" + str(i) + ".csv")
    for i in range(len(xTicks))
]
visualizer = ResultVisualizer()
visualizer.boxplots(
    files,
    "r2",
    xTicks,
    xlabel='Sequentially Removed Features',
    ylabel='R2',
    savePNG=e.path("example_feature_reduction.png")
)