def readUptake(config):
    """Return the uptake data, loading the cached copy when one exists.

    When both the cached joblib file and the drop-list file are present in
    the data directory, the cached object is loaded and returned.  Otherwise
    the training and validation CSV files are read, the 'market' column is
    removed from the validation data, both are joined, and the result is
    saved to the joblib file together with a new drop list.

    Args:
        config: Configuration mapping.  Reads config['output'] ('compress',
            'name'), config['input'] ('train', 'valid') and
            config['feature']['dropList'].

    Returns:
        The cached or freshly joined data.
    """
    info("Getting uptake data.")
    dataDirectory = getDataDir(config)

    outputSection = config['output']
    compressLevel = outputSection['compress']
    cachePath = setFile(dataDirectory, outputSection['name'])

    inputSection = config['input']
    trainPath = setFile(dataDirectory, inputSection['train'])
    validPath = setFile(dataDirectory, inputSection['valid'])

    featureSection = config['feature']
    dropListPath = setFile(dataDirectory, featureSection['dropList'])

    # Fast path: both the cache and the drop list are already on disk
    if isFile(cachePath) and isFile(dropListPath):
        info("Loading previously create data frames")
        return getJoblib(cachePath)

    trainFrame = readCSV(trainPath)
    validFrame = readCSV(validPath)

    # Remove 'market' from validation data
    validFrame.drop(labels='market', axis=1, inplace=True)
    joined = joinTrainValid(trainFrame, validFrame)

    info("Saving training and validation data")
    saveJoblib(cachePath, joined, compressLevel)
    info("Wrote training and validation data to " + cachePath)

    info("Saving feature data")
    writeDropList(dropListPath, joined, dlData=None)
    return joined
def dropData(pddf, config):
    """Drop, in place, every column of pddf flagged in the drop-list file.

    The drop list is a fixed-width file located at
    <basepath>/data/<name>/<dropList>.  Its first line holds the
    comma-separated column widths; the remaining lines are parsed with
    read_fwf.  Rows whose 'Drop?' value equals 1 name the features to remove.

    Args:
        pddf: Data frame to modify in place.
        config: Configuration mapping.  Reads config['basepath'],
            config['name'] and config['feature']['dropList'].

    Returns:
        None.  If the drop-list file does not exist nothing is changed.
    """
    info("Dropping columns", ind=4)
    basepath = config['basepath']
    name = config['name']
    dropListFile = config['feature']['dropList']
    dname = setSubDir(basepath, ['data', name])
    dlFile = setFile(dname, dropListFile)
    if not isFile(dlFile):
        info("There is no drop file. Not doing anything.", ind=4)
        return

    # Read the width header inside a context manager so the handle is closed
    with open(dlFile) as dlHandle:
        header = dlHandle.readline()
    widths = [int(x) for x in header.replace("\n", "").split(',')]

    dlData = read_fwf(dlFile, widths=widths, skiprows=1)
    drops = dlData['Feature'][dlData['Drop?'] == 1]

    info("Dropping " + getNrows(drops, asStr=True) + " columns", ind=6)
    info("Data has " + getNrows(pddf, asStr=True) + " rows and "
         + getNcols(pddf, asStr=True) + " cols", ind=6)
    pddf.drop(labels=drops.values, axis=1, inplace=True)
    info("Data now has " + getNrows(pddf, asStr=True) + " rows and "
         + getNcols(pddf, asStr=True) + " cols", ind=6)
def analyzeColumns(pddf, config):
    """Flag columns to drop in the drop-list file and rewrite that file.

    The drop list at <basepath>/data/<name>/<dropList> is read, its 'Drop?'
    column is updated, the flagged features are printed, and the file is
    rewritten through writeDropList.  A feature is flagged when:

    * its 'Card' value is at least 200, or
    * its 'nNA' value is at least 25% of the number of rows in pddf.

    The target column is never flagged.

    Args:
        pddf: Data frame; only its row count is used.
        config: Configuration mapping.  Reads config['target']['colname'],
            config['basepath'], config['name'] and
            config['feature']['dropList'].

    Returns:
        None.  If the drop-list file does not exist nothing is done.
    """
    info("Analyzing " + getNcols(pddf, asStr=True)
         + " columns to possible drops.", ind=2)
    targetConfig = config['target']
    targetcol = targetConfig['colname']

    basepath = config['basepath']
    name = config['name']
    dropListFile = config['feature']['dropList']
    dname = setSubDir(basepath, ['data', name])
    dlFile = setFile(dname, dropListFile)
    if not isFile(dlFile):
        info("There is no drop file. Not doing anything.", ind=4)
        return

    # Read the width header inside a context manager so the handle is closed
    with open(dlFile) as dlHandle:
        header = dlHandle.readline()
    widths = [int(x) for x in header.replace("\n", "").split(',')]
    dlData = read_fwf(dlFile, widths=widths, skiprows=1)

    # NOTE(review): 'Drop?' values already present in the file are not
    # restored after the rules below run, so a manual 0 can be overwritten
    # with 1.  The original code recorded these overrides but never applied
    # them.

    # Unset flags start at 0.  Assign the result back rather than calling
    # fillna(inplace=True) on a column selection, which is not guaranteed to
    # modify the parent frame.
    dlData['Drop?'] = dlData['Drop?'].fillna(0)

    # Drop anything with high cardinality (>= 200); unparseable values
    # count as 0
    dlData['Card'] = to_numeric(dlData['Card'], errors='coerce').fillna(0)
    dlData.loc[dlData['Card'] >= 200, 'Drop?'] = 1

    # Drop anything with at least 25% missing data
    maxNA = getNrows(pddf) * 0.25
    dlData.loc[dlData['nNA'] >= maxNA, 'Drop?'] = 1

    # Lastly, make sure we don't drop the target
    dlData.loc[dlData['Feature'] == targetcol, "Drop?"] = 0

    # Show features to drop
    drops = dlData['Feature'][dlData['Drop?'] == 1]
    print(drops)

    # Rewrite drop list
    writeDropList(dlFile, pddf=None, dlData=dlData)
def getModelFileName(config, modelname):
    """Build the file name used to store a model.

    The name has the form "<prefix>-<modelname>.p" and is placed in the
    directory returned by getModelsDir(config).  The prefix is "regressor"
    for regression problems and "classifier" for classification problems;
    for any other problem type it stays None and is formatted as such.

    Args:
        config: Configuration mapping; reads config['problem'].
        modelname: Name of the model.

    Returns:
        Whatever setFile returns for the models directory and file name.
    """
    problem = config['problem']
    if isRegression(problem):
        kind = "regressor"
    elif isClassification(problem):
        kind = "classifier"
    else:
        kind = None

    modelsDir = getModelsDir(config)
    fileName = "{0}-{1}.p".format(kind, modelname)
    return setFile(modelsDir, fileName)
def readKDD99(config):
    """Return the KDD '99 data, loading the cached copy when one exists.

    When both the cached joblib file and the drop-list file exist in the
    data directory, the cached object is loaded.  Otherwise the data set is
    fetched with datasets.fetch_kddcup99(), the target is appended as the
    last column, columns that look numeric are converted with to_numeric,
    column names are read from "names.dat" in the data directory, and the
    frame plus a new drop list are saved.

    Args:
        config: Configuration mapping.  Reads config['output'] ('compress',
            'name') and config['feature']['dropList'].

    Returns:
        The cached or freshly built data frame.
    """
    info("Getting KDD '99 data.")
    datadir = getDataDir(config)
    outputConfig = config['output']
    compress = outputConfig['compress']
    dataName = setFile(datadir, outputConfig['name'])
    featureConfig = config['feature']
    dlFile = setFile(datadir, featureConfig['dropList'])

    if isFile(dataName) and isFile(dlFile):
        info("Loading previously create data frames")
        pddf = getJoblib(dataName)
    else:
        info("Downloading KDD '99 data", ind=2)
        raw = datasets.fetch_kddcup99()
        X = raw['data']
        y = raw['target']
        # Turn the target into a column so it can be appended to X
        y = y.reshape((y.shape[0], 1))
        pddf = DataFrame(append(arr=X, values=y, axis=1))

        # Probe the first 1000 rows: a column whose mean can be computed is
        # treated as numeric and converted over the whole frame.
        sample = pddf.head(n=1000)
        for column in sample.columns:
            try:
                sample[column].mean()
                pddf[column] = to_numeric(pddf[column], errors="coerce")
            except Exception:
                # Best effort: leave the column untouched.  Exception (not a
                # bare except) so KeyboardInterrupt/SystemExit still propagate.
                continue

        # names.dat: the first line is skipped; every following line is
        # "<column name>:<rest>"
        colFile = setFile(datadir, "names.dat")
        with open(colFile) as colHandle:
            colnames = colHandle.readlines()
        columns = [x.split(":")[0] for x in colnames[1:]]
        columns.append("TARGET")
        pddf.columns = columns

        info("Saving data to {0}".format(dataName))
        saveJoblib(jlfile=dataName, jldata=pddf, compress=compress)

        info("Saving feature data to {0}".format(dlFile))
        writeDropList(dlFile, pddf, dlData=None)

    return pddf
def generateRouteFeatures(data):
    """Create route features for every driver and save them with joblib.

    For each driver ID in data, createRoutes is called with that driver's
    data and the ID.  Progress is reported through inter() on every fifth
    driver.  The resulting mapping is written, compressed, to
    "driverPaths.p".

    Args:
        data: Mapping from driver ID to that driver's data.
    """
    timer = start()
    routesByDriver = {}
    for position, driverID in enumerate(data.keys()):
        tripsOfDriver = data[driverID]
        print("Process driver {0}".format(driverID))
        routesByDriver[driverID] = createRoutes(tripsOfDriver, driverID)
        if not position % 5:
            inter(timer, position, len(data))
    end(timer)

    outputPath = setFile("/Users/tgadfort/Documents/pymva/axa",
                         "driverPaths.p")
    saveJoblib(outputPath, routesByDriver, compress=True)
def readTrips():
    """Read the trips of every driver directory and save them with joblib.

    Driver IDs are the base names of the directories returned by findDirs
    for the Kaggle data directory.  Each driver's trips are read with
    readDriverTrips and the resulting mapping is written, compressed, to
    "driverData.p".
    """
    driverDirs = findDirs(
        "/Users/tgadfort/Documents/pymva/axa/Axa-Insurance-Telematics-Kaggle")
    driverIDs = [getBasename(path) for path in driverDirs]

    tripsByDriver = {}
    for driverID in driverIDs:
        print("Reading trips from driver {0}".format(driverID))
        tripsByDriver[driverID] = readDriverTrips(driverID)

    outputPath = setFile("/Users/tgadfort/Documents/pymva/axa",
                         "driverData.p")
    saveJoblib(outputPath, tripsByDriver, compress=True)
def getMNIST():
    """Download the four MNIST archives and cache their extracted contents.

    Each archive is downloaded into the "mnist" sub-directory of the data
    directory unless it is already there.  It is then passed to
    extract_images and the result is saved with joblib under the same name
    with ".gz" replaced by ".p", unless that file already exists.
    """
    baseUrl = "http://yann.lecun.com/exdb/mnist/"
    datadir = "/Users/tgadfort/Documents/pymva/data"
    outdir = setDir(datadir, "mnist")
    archives = [
        "train-images-idx3-ubyte.gz",
        "train-labels-idx1-ubyte.gz",
        "t10k-images-idx3-ubyte.gz",
        "t10k-labels-idx1-ubyte.gz",
    ]

    for archive in archives:
        url = baseUrl + archive
        savename = setFile(outdir, archive)
        if not isFile(savename):
            urlretrieve(url, savename)
            statinfo = stat(savename)
            print('Succesfully downloaded', savename, statinfo.st_size, 'bytes.')

        cachedName = archive.replace(".gz", ".p")
        npfile = setFile(outdir, cachedName)
        if isFile(npfile):
            continue
        # NOTE(review): extract_images is applied to the label archives as
        # well -- confirm it accepts them or whether a label extractor is
        # needed.
        data = extract_images(savename)
        saveJoblib(npfile, data)
def getTrainTestNames(config):
    """Return the file names of the train/test/validation splits.

    Args:
        config: Configuration mapping passed to getDataDir.

    Returns:
        A 6-tuple of names built with setFile in the data directory, in the
        order X_train, X_test, X_valid, y_train, y_test, y_valid.
    """
    dataDirectory = getDataDir(config)
    fileNames = (
        "X_train.p", "X_test.p", "X_valid.p",
        "y_train.p", "y_test.p", "y_valid.p",
    )
    return tuple(setFile(dataDirectory, fileName) for fileName in fileNames)
def generateDriverModels(data):
    """Build a DriverModel per driver and save the aggregate matrices.

    For each driver ID in data, a DriverModel is constructed and its agg_mat
    attribute, passed through nan_to_num, is stored.  Progress is reported
    through inter() on every fifth driver.  The resulting mapping is
    written, compressed, to "driverModels.p".

    Args:
        data: Mapping from driver ID to that driver's data.
    """
    timer = start()
    matricesByDriver = {}
    for position, driverID in enumerate(data.keys()):
        tripsOfDriver = data[driverID]
        print("Process driver {0}".format(driverID))
        model = DriverModel(driverID, tripsOfDriver)
        aggregate = model.agg_mat
        # The headers are read, as before, but are not stored anywhere
        aggregateHeaders = model.agg_headers
        matricesByDriver[driverID] = nan_to_num(aggregate)
        if not position % 5:
            inter(timer, position, len(data))
    end(timer)

    outputPath = setFile("/Users/tgadfort/Documents/pymva/axa",
                         "driverModels.p")
    saveJoblib(outputPath, matricesByDriver, compress=True)
def plotConfusionMatrix(perfs, config, outdir, ext, pp=None):
    """Plot the row-normalized confusion matrix of the 'xgboost' model.

    The matrix is read from perfs['xgboost']['Confusion']['matrix']; if that
    entry is not available the function returns without plotting.  Each row
    is divided by its sum before plotting, and every cell is annotated with
    its value.

    Args:
        perfs: Mapping from model name to its performance results.
        config: Configuration mapping passed to getTargetNames for the
            class labels.
        outdir: Directory for the plot when pp is None.
        ext: File extension for the plot when pp is None.
        pp: Optional multipage PDF object; when given, the figure is saved
            to it instead of to an individual file, as the other plot
            functions in this module do.

    Returns:
        None.
    """
    title = 'Confusion Matrix'
    normalize = True  # fixed here; not exposed as a parameter
    cmap = plt.cm.Blues

    try:
        cm = perfs['xgboost']['Confusion']['matrix']
    except (LookupError, TypeError):
        # No xgboost confusion matrix available: nothing to plot
        return
    classes = getTargetNames(config)

    if normalize:
        # Divide each row by its total so cells show per-class fractions
        cm = cm.astype('float') / cm.sum(axis=1)[:, newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

    if pp is not None:
        info("Saving {0} plot to multipage pdf".format(title), ind=4)
        pp.savefig()
    else:
        plotname = setFile(outdir, ".".join([title, ext]))
        info("Saving {0} plot to {1}".format(title, plotname), ind=4)
        plt.savefig(plotname)
    plt.close()
def plotResults(perfs, y_truth, config):
    """Create all performance plots for the evaluated models.

    Models whose performance entry is empty are removed from perfs (the
    mapping is modified in place) and are not plotted.  Classification and
    regression plots are produced according to config['problem'].  When
    config['performance']['multipage'] is true and the extension is 'pdf',
    a PdfPages object for "results.pdf" is passed to the plot functions;
    otherwise they receive None and save individual files.

    Args:
        perfs: Mapping from model name to its performance results.
        y_truth: Not used by this function.
        config: Configuration mapping.  Reads config['performance'] ('ext',
            'multipage') and config['problem'].

    Returns:
        None.
    """
    info("Making Performance Plots", ind=0)
    outdir = getPlotsDir(config)
    performanceConfig = config['performance']
    ext = performanceConfig['ext']
    isPdf = ext == 'pdf'
    isMultipage = performanceConfig['multipage']
    useMultipage = isMultipage and isPdf

    if useMultipage:
        pdfname = setFile(outdir, 'results.pdf')
        info("Saving all performance plots to {0}".format(pdfname), ind=2)
        pp = PdfPages(pdfname)
    else:
        info("Saving all performance plots individually as {0}".format(ext),
             ind=2)
        pp = None

    try:
        badModels = [x for x in perfs.keys() if len(perfs[x]) == 0]
        for modelname in badModels:
            info("Not plotting {0}".format(modelname))
            del perfs[modelname]

        if isClassification(config['problem']):
            plotKappa(perfs, outdir, ext, pp)
            plotPrecision(perfs, outdir, ext, pp)
            plotRecall(perfs, outdir, ext, pp)
            plotLogLoss(perfs, outdir, ext, pp)
            plotAccuracy(perfs, outdir, ext, pp)
            plotPrecisionRecall(perfs, outdir, ext, pp)
            plotROC(perfs, outdir, ext, pp)
            plotConfusionMatrix(perfs, config, outdir, ext, pp)

        if isRegression(config['problem']):
            plotMAE(perfs, outdir, ext, pp)
            plotMSE(perfs, outdir, ext, pp)
            plotExplainedVariance(perfs, outdir, ext, pp)
            plotR2(perfs, outdir, ext, pp)
            plotResiduals(perfs, outdir, ext, pp)

        if useMultipage and plt.get_fignums():
            # PdfPages.savefig() with no argument needs an open figure;
            # calling it after every plot function has closed its figure
            # would raise, so only save when something is still open.
            pp.savefig()
    finally:
        if useMultipage:
            # Always finalize the PDF, even if a plot function failed
            info("Closing multipage pdf", ind=2)
            pp.close()
def plotResidualsAndPrediction(perfs, y_test, outdir, ext, pp=None):
    """Plot, for every model, its residuals against the true values.

    One seaborn residplot (with a lowess curve) is produced per model and
    saved under the title "<modelname> Residuals And Prediction".  The
    figure is closed after each save so that one model's plot is not drawn
    on top of the previous one.

    Args:
        perfs: Mapping from model name to its performance results; reads
            perfs[modelname]['Residuals'].
        y_test: True values.  Its name attribute is set to "Truth".
        outdir: Directory for the plots when pp is None.
        ext: File extension for the plots when pp is None.
        pp: Optional multipage PDF object; when given, each figure is saved
            to it instead of to an individual file.

    Returns:
        None.
    """
    sns.set(style="whitegrid")
    x = y_test
    x.name = "Truth"  # used by seaborn as the axis label

    for modelname in perfs.keys():
        y = perfs[modelname]['Residuals']
        title = "{0} Residuals And Prediction".format(modelname)
        y.name = "Residuals"

        # Plot the residuals after fitting a linear model.  Keywords are
        # used because recent seaborn releases no longer accept x and y
        # positionally.
        ax = sns.residplot(x=x, y=y, lowess=True, color="b")
        ax.set_title(title)

        if pp is not None:
            info("Saving {0} plot to multipage pdf".format(title), ind=4)
            pp.savefig()
        else:
            plotname = setFile(outdir, ".".join([title, ext]))
            info("Saving {0} plot to {1}".format(title, plotname), ind=4)
            plt.savefig(plotname)
        # Start the next model on a fresh figure
        plt.close()
def generateTripFeatures(data):
    """Compute trip features for every driver and save them with joblib.

    For each driver, tripFeatures is called on the values of every trip and
    the results are stacked row-wise, passed through nan_to_num, and stored
    under the driver ID.  Progress is reported through inter() on every
    fifth driver.  The resulting mapping is written, compressed, to
    "driverTripFeatures.p".

    Args:
        data: Mapping from driver ID to an iterable of trips; each trip
            must expose a values attribute.
    """
    t0 = start()
    features = {}
    for i, driverID in enumerate(data.keys()):
        driverData = data[driverID]
        print("Process driver {0}".format(driverID))

        # Collect all per-trip results and stack once, instead of
        # re-copying the growing array with vstack on every trip.  None
        # results are skipped (the original loop also skipped leading ones).
        tripResults = [tripFeatures(trip.values) for trip in driverData]
        tripResults = [res for res in tripResults if res is not None]

        if not tripResults:
            results = None
        elif len(tripResults) == 1:
            # A single result is kept as returned, not promoted by vstack
            results = tripResults[0]
        else:
            results = vstack(tripResults)

        results = nan_to_num(results)
        features[driverID] = results
        if i % 5 == 0:
            inter(t0, i, len(data))
    end(t0)

    savefile = setFile("/Users/tgadfort/Documents/pymva/axa",
                       "driverTripFeatures.p")
    saveJoblib(savefile, features, compress=True)
def plotBar(perfs, value, title, outdir, ext, pp=None):
    """Draw a bar plot of one performance value across all models.

    Args:
        perfs: Mapping from model name to its performance results; reads
            perfs[modelname][value].
        value: Key of the performance value to plot; also used as the file
            name stem when pp is None.
        title: Plot title.
        outdir: Directory for the plot when pp is None.
        ext: File extension for the plot when pp is None.
        pp: Optional multipage PDF object; when given, the figure is saved
            to it instead of to an individual file.

    Returns:
        None.
    """
    sns.set_style("whitegrid")
    modelnames = perfs.keys()
    heights = [perfs[model][value] for model in modelnames]

    axes = sns.barplot(x=modelnames, y=heights)
    axes.set_title(title)
    for tickLabel in axes.get_xticklabels():
        tickLabel.set_rotation(45)

    if pp is None:
        plotname = setFile(outdir, ".".join([value, ext]))
        info("Saving {0} plot to {1}".format(title, plotname), ind=4)
        plt.savefig(plotname)
    else:
        info("Saving {0} plot to multipage pdf".format(title), ind=4)
        pp.savefig()
    plt.close()
def plotROC(perfs, outdir, ext, pp=None):
    """Plot the ROC curves of all models on a single figure.

    Each curve is labelled "<modelname> (<AUC>)" and coloured from the
    current seaborn palette; colours repeat if there are more models than
    palette entries.  A dashed diagonal is drawn for reference.

    Args:
        perfs: Mapping from model name to its performance results; reads
            'AUC' and 'ROC' ('tpr', 'fpr') for each model.
        outdir: Directory for the plot when pp is None.
        ext: File extension for the plot when pp is None.
        pp: Optional multipage PDF object; when given, the figure is saved
            to it instead of to an individual file.

    Returns:
        None.
    """
    info("Plotting ROC Curves for {0} Classifiers".format(len(perfs)))
    modelnames = perfs.keys()
    plt.figure()
    current_palette = sns.color_palette()
    nColors = len(current_palette)

    for i, modelname in enumerate(modelnames):
        perfdata = perfs[modelname]
        auc = perfdata['AUC']
        tpr = perfdata['ROC']['tpr']
        fpr = perfdata['ROC']['fpr']
        # Wrap around the palette so extra models do not raise IndexError
        plt.plot(fpr, tpr,
                 label='{0} ({1:0.2f})'.format(modelname, auc),
                 color=current_palette[i % nColors],
                 linestyle='-', linewidth=3)

    title = "Receiver Operating Characteristic"
    value = "ROC"
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # NOTE(review): the figure title is set to an empty string; `title` is
    # only used in the log messages -- confirm this is intended.
    plt.title('')
    plt.legend(loc="lower right")

    if pp is not None:
        info("Saving {0} plot to multipage pdf".format(title), ind=4)
        pp.savefig()
    else:
        plotname = setFile(outdir, ".".join([value, ext]))
        info("Saving {0} plot to {1}".format(title, plotname), ind=4)
        plt.savefig(plotname)
    plt.close()
def plotResiduals(perfs, outdir, ext, pp=None):
    """Plot the residual distributions of all models on a single figure.

    Before plotting, each model's residuals are capped to their 1st and
    99th percentiles.

    Args:
        perfs: Mapping from model name to its performance results; reads
            perfs[modelname]['Residuals'], whose name attribute is set to
            "Residuals".
        outdir: Directory for the plot when pp is None.
        ext: File extension for the plot when pp is None.
        pp: Optional multipage PDF object; when given, the figure is saved
            to it instead of to an individual file, as the other plot
            functions in this module do.

    Returns:
        None.  Nothing is plotted when perfs is empty.
    """
    sns.set(style="whitegrid")
    modelnames = perfs.keys()

    ax = None
    for modelname in modelnames:
        y = perfs[modelname]['Residuals']
        miny = np.percentile(y, 1)
        maxy = np.percentile(y, 99)
        # Cap a copy so the stored residuals themselves are not changed
        capy = np.copy(y)
        capy[capy < miny] = miny
        capy[capy > maxy] = maxy
        y.name = "Residuals"
        ax = sns.distplot(capy, rug=False, label=modelname)

    if ax is None:
        # No models: there is no axis to title or save
        return

    title = "Residuals"
    ax.set_title(title)
    ax.legend()
    value = "Residuals"

    if pp is not None:
        info("Saving {0} plot to multipage pdf".format(title), ind=4)
        pp.savefig()
    else:
        plotname = setFile(outdir, ".".join([value, ext]))
        info("Saving {0} plot to {1}".format(title, plotname), ind=4)
        plt.savefig(plotname)
    plt.close()
def loadConfig(basedir="/Users/tgadfort/Documents/pymva"):
    """Load "config.yaml" from basedir.

    Args:
        basedir: Directory containing "config.yaml".  Defaults to the
            location that was previously hard-coded, so existing callers
            are unaffected.

    Returns:
        Whatever get() returns for the configuration file.
    """
    configname = setFile(basedir, "config.yaml")
    info("Importing [{0}]".format(configname), ind=0)
    config = get(configname)
    return config
def getTrips():
    """Load the driver trip data saved in "driverData.p".

    Returns:
        Whatever getJoblib returns for that file.
    """
    tripsPath = setFile("/Users/tgadfort/Documents/pymva/axa", "driverData.p")
    return getJoblib(tripsPath)