def readUptake(config):
    """Load the uptake dataset, building and caching it on first use.

    Reads train/valid CSVs, drops the 'market' column from the validation
    set, joins the two frames, then caches the result (and the feature
    drop list) under the configured data directory. Subsequent calls load
    the cached joblib file directly.

    Args:
        config: project configuration dict with 'output', 'input' and
            'feature' sections.

    Returns:
        The combined train/valid DataFrame.
    """
    info("Getting uptake data.")
    datadir = getDataDir(config)

    outputConfig = config['output']
    compress = outputConfig['compress']
    dataName = setFile(datadir, outputConfig['name'])

    inputConfig = config['input']
    trainName = setFile(datadir, inputConfig['train'])
    validName = setFile(datadir, inputConfig['valid'])

    featureConfig = config['feature']
    dlFile = setFile(datadir, featureConfig['dropList'])

    # Fast path: both the cached frame and the drop list already exist.
    if isFile(dataName) and isFile(dlFile):
        info("Loading previously create data frames")
        return getJoblib(dataName)

    trainData = readCSV(trainName)
    validData = readCSV(validName)
    ## Remove 'market' from validation data
    validData.drop(labels='market', axis=1, inplace=True)
    pddf = joinTrainValid(trainData, validData)

    info("Saving training and validation data")
    saveJoblib(dataName, pddf, compress)
    info("Wrote training and validation data to " + dataName)

    info("Saving feature data")
    writeDropList(dlFile, pddf, dlData=None)

    return pddf
def generateRouteFeatures(data, outdir="/Users/tgadfort/Documents/pymva/axa"):
    """Compute per-driver route features and persist them as a joblib file.

    Args:
        data: dict mapping driverID -> driver trip data.
        outdir: directory where "driverPaths.p" is written (defaults to the
            previously hard-coded location for backward compatibility).

    Returns:
        dict mapping driverID -> createRoutes(...) result (previously the
        result was only written to disk and discarded).
    """
    t0 = start()
    features = {}
    # items() avoids the keys()-then-index double lookup of the original.
    for i, (driverID, driverData) in enumerate(data.items()):
        print("Process driver {0}".format(driverID))
        features[driverID] = createRoutes(driverData, driverID)
        if i % 5 == 0:
            inter(t0, i, len(data))  # periodic progress report
    end(t0)

    savefile = setFile(outdir, "driverPaths.p")
    saveJoblib(savefile, features, compress=True)
    return features
def readTrips(basedir="/Users/tgadfort/Documents/pymva/axa"):
    """Read every driver's trips from the Kaggle telematics directory.

    Args:
        basedir: project data directory (defaults to the previously
            hard-coded location for backward compatibility).

    Returns:
        dict mapping driverID -> readDriverTrips(driverID) result
        (previously built, saved, and then discarded).
    """
    tripdir = setFile(basedir, "Axa-Insurance-Telematics-Kaggle")
    drivers = [getBasename(d) for d in findDirs(tripdir)]

    data = {}
    for driverID in drivers:
        print("Reading trips from driver {0}".format(driverID))
        data[driverID] = readDriverTrips(driverID)

    savefile = setFile(basedir, "driverData.p")
    saveJoblib(savefile, data, compress=True)
    return data
def saveTrainedModel(config, modelResults):
    """Persist a trained model's results dict to its joblib file.

    For TPOT models the fitted pipeline is first exported as a standalone
    Python script and removed from the results (the estimator itself is
    dropped before pickling), then the remaining results are saved like
    any other model.

    Args:
        config: project configuration used to resolve the model file name.
        modelResults: dict with at least 'name' (and 'estimator' for tpot);
            mutated in place for tpot (the 'estimator' key is deleted).
    """
    modelname = modelResults['name']
    modelFileName = getModelFileName(config, modelname)

    if modelname == "tpot":
        tpotObj = modelResults['estimator']
        # BUGFIX: str.replace(".p", ".py") rewrites the FIRST ".p" anywhere
        # in the path (e.g. a ".pymva" directory component); derive the
        # export name from the suffix only.
        if modelFileName.endswith(".p"):
            tpotFileName = modelFileName[:-2] + ".py"
        else:
            tpotFileName = modelFileName + ".py"
        tpotObj.export(tpotFileName)
        del modelResults['estimator']

    # Both branches of the original performed this identical save; hoisted.
    saveJoblib(jlfile=modelFileName, jldata=modelResults, compress=True)
def generateDriverModels(data, outdir="/Users/tgadfort/Documents/pymva/axa"):
    """Build aggregate DriverModel feature matrices per driver and save them.

    Args:
        data: dict mapping driverID -> driver trip data.
        outdir: directory where "driverModels.p" is written (defaults to
            the previously hard-coded location for backward compatibility).

    Returns:
        dict mapping driverID -> NaN-scrubbed aggregate matrix (previously
        only written to disk and discarded).
    """
    t0 = start()
    features = {}
    for i, (driverID, driverData) in enumerate(data.items()):
        print("Process driver {0}".format(driverID))
        dm = DriverModel(driverID, driverData)
        # agg_headers was fetched but never used in the original; dropped.
        features[driverID] = nan_to_num(dm.agg_mat)
        if i % 5 == 0:
            inter(t0, i, len(data))  # periodic progress report
    end(t0)

    savefile = setFile(outdir, "driverModels.p")
    saveJoblib(savefile, features, compress=True)
    return features
def readKDD99(config):
    """Load the KDD '99 dataset, downloading and caching it on first use.

    On a cache miss, fetches the data via sklearn, appends the target as
    the last column, coerces numeric-looking columns, names the columns
    from "names.dat", then caches the frame and the feature drop list.

    Args:
        config: project configuration dict with 'output' and 'feature'
            sections.

    Returns:
        DataFrame whose final column is "TARGET".
    """
    info("Getting KDD '99 data.")
    datadir = getDataDir(config)
    outputConfig = config['output']
    compress = outputConfig['compress']
    dataName = setFile(datadir, outputConfig['name'])
    featureConfig = config['feature']
    dlFile = setFile(datadir, featureConfig['dropList'])

    if isFile(dataName) and isFile(dlFile):
        info("Loading previously create data frames")
        pddf = getJoblib(dataName)
    else:
        info("Downloading KDD '99 data", ind=2)
        tmp = datasets.fetch_kddcup99()
        X = tmp['data']
        y = tmp['target']
        y = y.reshape((y.shape[0], 1))
        pddf = DataFrame(append(arr=X, values=y, axis=1))

        # Probe a small sample to detect numeric columns, then coerce the
        # full column. Narrowed from a bare except: that hid real errors.
        tmp = pddf.head(n=1000)
        for column in tmp.columns:
            try:
                tmp[column].mean()
            except (TypeError, ValueError):
                continue  # non-numeric column; leave as-is
            else:
                pddf[column] = to_numeric(pddf[column], errors="coerce")

        colFile = setFile(datadir, "names.dat")
        with open(colFile) as fh:  # was an unclosed file handle
            colnames = fh.readlines()
        # First line lists the target labels (unused here); the remaining
        # lines are "name: type" feature declarations.
        columns = [x.split(":")[0] for x in colnames[1:]]
        columns.append("TARGET")
        pddf.columns = columns

        info("Saving data to {0}".format(dataName))
        saveJoblib(jlfile=dataName, jldata=pddf, compress=compress)
        info("Saving feature data to {0}".format(dlFile))
        writeDropList(dlFile, pddf, dlData=None)

    return pddf
def generateTripFeatures(data, outdir="/Users/tgadfort/Documents/pymva/axa"):
    """Compute per-trip feature matrices for every driver and save them.

    Args:
        data: dict mapping driverID -> iterable of trips (each trip exposes
            a ``.values`` array, e.g. a DataFrame).
        outdir: directory where "driverTripFeatures.p" is written (defaults
            to the previously hard-coded location).

    Returns:
        dict mapping driverID -> NaN-scrubbed stacked feature matrix, or
        None for a driver with no trips (previously this crashed inside
        nan_to_num). Results were previously only written to disk.
    """
    t0 = start()
    features = {}
    for i, (driverID, driverData) in enumerate(data.items()):
        print("Process driver {0}".format(driverID))
        # Collect all rows and stack once: repeated vstack in a loop
        # reallocates the whole array each iteration (quadratic).
        rows = [tripFeatures(trip.values) for trip in driverData]
        features[driverID] = nan_to_num(vstack(rows)) if rows else None
        if i % 5 == 0:
            inter(t0, i, len(data))  # periodic progress report
    end(t0)

    savefile = setFile(outdir, "driverTripFeatures.p")
    saveJoblib(savefile, features, compress=True)
    return features
def getMNIST(datadir="/Users/tgadfort/Documents/pymva/data"):
    """Download the four MNIST archives and cache each as a joblib pickle.

    Each .gz archive is fetched only if absent, then extracted (once) into
    a matching ".p" joblib file in the mnist/ subdirectory.

    Args:
        datadir: base data directory (defaults to the previously
            hard-coded location for backward compatibility).
    """
    outdir = setDir(datadir, "mnist")
    names = [
        "train-images-idx3-ubyte.gz", "train-labels-idx1-ubyte.gz",
        "t10k-images-idx3-ubyte.gz", "t10k-labels-idx1-ubyte.gz"
    ]
    for name in names:
        url = "http://yann.lecun.com/exdb/mnist/" + name
        savename = setFile(outdir, name)
        if not isFile(savename):
            urlretrieve(url, savename)
            statinfo = stat(savename)
            # typo fixed: "Succesfully" -> "Successfully"
            print('Successfully downloaded', savename, statinfo.st_size,
                  'bytes.')
        # Derive the pickle name without mutating the loop variable.
        npfile = setFile(outdir, name.replace(".gz", ".p"))
        if not isFile(npfile):
            data = extract_images(savename)
            saveJoblib(npfile, data)
def getTrainTestData(pddf, config):
    """Create, save, and return the final train/test(/validation) datasets.

    Three layouts are supported, detected from marker columns:
      * "isTrain" present  -> data arrives pre-split into train/test rows;
      * "isValid" present  -> validation rows are marked, the rest is
        randomly split 80/20 into train/test;
      * neither            -> the whole frame is randomly split 80/20.

    In every case the target column (and the marker column) is removed
    from the feature matrices. All six pieces are saved via joblib.

    Args:
        pddf: full DataFrame; MUTATED (target/marker columns are dropped).
        config: project configuration with 'target' and 'output' sections.

    Returns:
        (X_train, X_test, X_valid, y_train, y_test, y_valid); the valid
        pair is None when no "isValid" column exists.

    Raises:
        ValueError: if the configured target column is missing.
    """
    info("Creating final train/test datasets.", ind=0)

    ## Config info
    targetConfig = config['target']
    targetcol = targetConfig['colname']
    outputConfig = config['output']
    compress = outputConfig['compress']

    if not isColumn(pddf, targetcol):
        raise ValueError("Target column", targetcol,
                         "is not included in data!")

    ## Determine if the data showed up split (seperate train/test files)
    isSplit = False
    isValid = False
    if isColumn(pddf, "isTrain"):
        info("Data is already split", ind=2)
        isSplit = True
    elif isColumn(pddf, "isValid"):
        info("Validation data is ready, but train/test data must be created",
             ind=2)
        isValid = True
    else:
        info("Train/test data must be created", ind=2)

    ## Create data if it's split
    if isSplit:
        dropCols = [targetcol, 'isTrain']
        info("Splitting train data", ind=2)
        # .copy() so the inplace drop below acts on an owned frame, not a
        # view of pddf (avoids chained-assignment surprises).
        X_train = pddf[pddf['isTrain'] == 1].copy()
        y_train = X_train[targetcol]
        X_train.drop(labels=dropCols, axis=1, inplace=True)

        info("Splitting test data", ind=2)
        X_test = pddf[pddf['isTrain'] == 0].copy()
        y_test = X_test[targetcol]
        X_test.drop(labels=dropCols, axis=1, inplace=True)

        X_valid = None
        y_valid = None
    elif isValid:
        dropCols = [targetcol, 'isValid']
        info("Splitting validation data", ind=2)
        X_valid = pddf[pddf['isValid'] == 1].copy()
        y_valid = X_valid[targetcol]
        # BUGFIX: the original never dropped the target/isValid columns
        # from X_valid, X_train, or X_test in this branch (the later
        # pddf.drop(inplace=True) does not propagate to these slices),
        # leaking the label into the feature matrices.
        X_valid.drop(labels=dropCols, axis=1, inplace=True)

        info("Creating train/test data that contains validated data", ind=2)
        X_data = pddf[pddf['isValid'] == 0].copy()
        y = X_data[targetcol]
        X_data.drop(labels=dropCols, axis=1, inplace=True)
        X_train, X_test, y_train, y_test = train_test_split(X_data, y,
                                                            test_size=0.2)
    else:
        info("Creating train/test data that is not already split or validated",
             ind=2)
        y = pddf[targetcol]
        pddf.drop(labels=[targetcol], axis=1, inplace=True)
        X_train, X_test, y_train, y_test = train_test_split(pddf, y,
                                                            test_size=0.2)
        X_valid = None
        y_valid = None

    # Also strip the consumed columns from the caller's frame.
    if isSplit:
        info("Dropping {0} from DataFrame".format(", ".join(
            [targetcol, 'isTrain'])))
        pddf.drop(labels=[targetcol, 'isTrain'], axis=1, inplace=True)
    elif isValid:
        info("Dropping {0} from DataFrame".format(", ".join(
            [targetcol, 'isValid'])))
        pddf.drop(labels=[targetcol, 'isValid'], axis=1, inplace=True)

    X_trainName, X_testName, X_validName, y_trainName, y_testName, \
        y_validName = getTrainTestNames(config)

    info("Saving {0} data to {1}".format(getDim(X_train), X_trainName), ind=4)
    saveJoblib(X_trainName, X_train, compress)
    info("Saving {0} data to {1}".format(getDim(X_test), X_testName), ind=4)
    saveJoblib(X_testName, X_test, compress)
    info("Saving {0} data to {1}".format(getDim(X_valid), X_validName), ind=4)
    saveJoblib(X_validName, X_valid, compress)
    info("Saving {0} data to {1}".format(getDim(y_train), y_trainName), ind=4)
    saveJoblib(y_trainName, y_train, compress)
    info("Saving {0} data to {1}".format(getDim(y_test), y_testName), ind=4)
    saveJoblib(y_testName, y_test, compress)
    info("Saving {0} data to {1}".format(getDim(y_valid), y_validName), ind=4)
    saveJoblib(y_validName, y_valid, compress)

    return X_train, X_test, X_valid, y_train, y_test, y_valid