def prepDataTest(data, fields, fillna=True, fillVal=None, dump=False, dumpPrefix='set'):
    """Select the requested fields from test data, optionally filling and dumping.

    Args:
        data: Dataset indexable by ``fields`` (``data[fields]`` is evaluated).
        fields: Key or keys used to index ``data``.
        fillna: When true, the selection is passed through
            ``analysis.fillnaDict`` together with ``fillVal``.
        fillVal: Fill values forwarded to ``analysis.fillnaDict``. ``None``
            (the default) means a new empty dict is used for this call.
        dump: When true, the result is written with ``np.savetxt`` to
            ``"<dumpPrefix>_testActual.csv"`` using a comma delimiter.
        dumpPrefix: Prefix of the dump file name.

    Returns:
        The selected data, after the optional fill step.
    """
    # NOTE(review): a second definition of prepDataTest with the same logic
    # appears later in this file and replaces this one when the module loads.
    if fillVal is None:
        # Fresh dict per call: a literal ``{}`` default would be one shared
        # object handed to analysis.fillnaDict on every call.
        fillVal = {}

    data = data[fields]
    if fillna:
        data = analysis.fillnaDict(data, fillVal)
    if dump:
        np.savetxt("%s_testActual.csv" % (dumpPrefix), data, delimiter=",")
    return data
def prepDataTrain(data, label='Label', fields=None, split=True, splitPercent=10,
                  shuffle=False, fillna=True, typeSub='mean', dump=True,
                  dumpPrefix='set'):
    """Prepare training data: optional shuffle, hold-out split, NaN fill and dump.

    Args:
        data: Sliceable dataset. When ``fields`` is empty, ``data.dtype.names``
            is read to obtain the field names.
        label: Name of the label field; it is removed from the returned
            feature names.
        fields: Field names to work with. ``None`` or an empty sequence means
            "use ``data.dtype.names``".
        split: When true, the trailing ``splitPercent`` percent of the rows
            is held out as the test set.
        splitPercent: Percentage of rows to hold out (the row count is
            truncated with ``int``).
        shuffle: When true, ``np.random.shuffle`` reorders ``data`` in place,
            so the caller's object is reordered as well.
        fillna: When true, ``analysis.fillna`` is applied to the training part
            and the fill values it returns are applied to the test part via
            ``analysis.fillnaDict``.
        typeSub: Substitution type forwarded to ``analysis.fillna``.
        dump: When true, the training (and, if split, test) data are written
            with ``np.savetxt`` to ``"<dumpPrefix>_train.csv"`` and
            ``"<dumpPrefix>_test.csv"``.
        dumpPrefix: Prefix of the dump file names.

    Returns:
        A tuple ``(data, test, features, fillVal)`` where ``test`` is ``[]``
        when ``split`` is false, ``features`` is ``fields`` without ``label``
        (a NumPy array produced by ``np.delete`` if the label was present,
        otherwise ``fields`` itself), and ``fillVal`` is ``{}`` when
        ``fillna`` is false.
    """
    # NOTE(review): a second definition of prepDataTrain with the same logic
    # appears later in this file and replaces this one when the module loads.
    if fields is None or len(fields) == 0:
        fields = data.dtype.names

    # Single-argument print(...) behaves the same as the Python 2 print
    # statement and is also valid Python 3.
    if shuffle:
        np.random.shuffle(data)
        print('Shuffled')

    test = []
    if split:
        # Compute the boundary once, before ``data`` is rebound to the head.
        held_out = int(len(data) * (splitPercent / 100.0))
        boundary = len(data) - held_out
        test = data[boundary:]
        data = data[:boundary]
        print('Split')

    fillVal = {}
    if fillna:
        data, fillVal = analysis.fillna(data, typeSub, fields, -1)
        if split:
            # Reuse the training fill values so both parts are filled alike.
            test = analysis.fillnaDict(test, fillVal)
        print('Filled NaN')

    if dump:
        np.savetxt("%s_train.csv" % (dumpPrefix), data, delimiter=",")
        if split:
            np.savetxt("%s_test.csv" % (dumpPrefix), test, delimiter=",")
        print('Data dumped')

    # Drop every occurrence of the label in one call; deleting one index at a
    # time from ``fields`` would keep all but the last match.
    label_positions = [pos for pos, name in enumerate(fields) if name == label]
    if label_positions:
        features = np.delete(fields, label_positions, 0)
    else:
        features = fields

    return data, test, features, fillVal
def prepDataTest(data, fields, fillna=True, fillVal=None, dump=False, dumpPrefix='set'):
    """Restrict test data to ``fields``, then optionally fill NaNs and dump to CSV.

    Args:
        data: Dataset indexable by ``fields`` (``data[fields]`` is evaluated).
        fields: Key or keys used to index ``data``.
        fillna: When true, the selection is passed through
            ``analysis.fillnaDict`` together with ``fillVal``.
        fillVal: Fill values forwarded to ``analysis.fillnaDict``. ``None``
            (the default) means a new empty dict is used for this call.
        dump: When true, the result is written with ``np.savetxt`` to
            ``"<dumpPrefix>_testActual.csv"`` using a comma delimiter.
        dumpPrefix: Prefix of the dump file name.

    Returns:
        The selected data, after the optional fill step.
    """
    # NOTE(review): an earlier definition of prepDataTest with the same logic
    # exists in this file; this later one is the binding that survives import.
    if fillVal is None:
        # Avoid a shared mutable default: each call gets its own empty dict
        # to hand to analysis.fillnaDict.
        fillVal = {}

    data = data[fields]
    if fillna:
        data = analysis.fillnaDict(data, fillVal)
    if dump:
        np.savetxt("%s_testActual.csv" % (dumpPrefix), data, delimiter=",")
    return data
def prepDataTrain(data, label='Label', fields=None, split=True, splitPercent=10,
                  shuffle=False, fillna=True, typeSub='mean', dump=True,
                  dumpPrefix='set'):
    """Shuffle, split, NaN-fill and dump a training set, returning feature names.

    Args:
        data: Sliceable dataset. When ``fields`` is empty, ``data.dtype.names``
            is read to obtain the field names.
        label: Name of the label field; it is removed from the returned
            feature names.
        fields: Field names to work with. ``None`` or an empty sequence means
            "use ``data.dtype.names``".
        split: When true, the trailing ``splitPercent`` percent of the rows
            is held out as the test set.
        splitPercent: Percentage of rows to hold out (the row count is
            truncated with ``int``).
        shuffle: When true, ``np.random.shuffle`` reorders ``data`` in place,
            so the caller's object is reordered as well.
        fillna: When true, ``analysis.fillna`` is applied to the training part
            and the fill values it returns are applied to the test part via
            ``analysis.fillnaDict``.
        typeSub: Substitution type forwarded to ``analysis.fillna``.
        dump: When true, the training (and, if split, test) data are written
            with ``np.savetxt`` to ``"<dumpPrefix>_train.csv"`` and
            ``"<dumpPrefix>_test.csv"``.
        dumpPrefix: Prefix of the dump file names.

    Returns:
        A tuple ``(data, test, features, fillVal)`` where ``test`` is ``[]``
        when ``split`` is false, ``features`` is ``fields`` without ``label``
        (a NumPy array produced by ``np.delete`` if the label was present,
        otherwise ``fields`` itself), and ``fillVal`` is ``{}`` when
        ``fillna`` is false.
    """
    # NOTE(review): an earlier definition of prepDataTrain with the same logic
    # exists in this file; this later one is the binding that survives import.
    if fields is None or len(fields) == 0:
        fields = data.dtype.names

    # print(...) with one argument matches the Python 2 print statement's
    # output and keeps the function importable on Python 3.
    if shuffle:
        np.random.shuffle(data)
        print('Shuffled')

    test = []
    if split:
        # One boundary for both slices, taken before ``data`` is rebound.
        n_test = int(len(data) * (splitPercent / 100.0))
        cut = len(data) - n_test
        test = data[cut:]
        data = data[:cut]
        print('Split')

    fillVal = {}
    if fillna:
        data, fillVal = analysis.fillna(data, typeSub, fields, -1)
        if split:
            # The test part is filled with the values derived from training.
            test = analysis.fillnaDict(test, fillVal)
        print('Filled NaN')

    if dump:
        np.savetxt("%s_train.csv" % (dumpPrefix), data, delimiter=",")
        if split:
            np.savetxt("%s_test.csv" % (dumpPrefix), test, delimiter=",")
        print('Data dumped')

    # Remove all label occurrences together; a per-match delete against the
    # untouched ``fields`` would leave earlier duplicates in place.
    label_positions = [pos for pos, name in enumerate(fields) if name == label]
    if label_positions:
        features = np.delete(fields, label_positions, 0)
    else:
        features = fields

    return data, test, features, fillVal