def readTestingData(testDataFname, fieldMaps):
    """
    Read the test data file and build the feature matrix and sample weights.

    Numeric columns are passed through as returned by getCol; text columns
    are replaced by the integer codes found in fieldMaps.

    @param testDataFname: name of the testing data file
    @param fieldMaps: mapping keyed by text column name ('sex', 'name',
        'ticket', 'cabin', 'embarked'); each entry maps a raw column value
        to its code (presumably the fieldMaps returned by readTrainingData)
    @return: (DatasetPair with just X, sampleWeights)
    """
    fieldNames = ['pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
                  'fare', 'cabin', 'embarked', 'weight']
    nameMap = dict((name, idx) for idx, name in enumerate(fieldNames))

    # Builtin int/float instead of np.int/np.float: the NumPy aliases were
    # plain synonyms of the builtins and no longer exist in NumPy >= 1.24.
    dataTypes = np.array([int, '|S82', '|S82', float, int, int,
                          '|S82', float, '|S82', '|S82', int])

    # Materialise the rows as a real list: on Python 3 map() is a one-shot
    # iterator, but the rows are handed to getCol several times below.
    rows = csv2dict(testDataFname, hasHeader=True, dataTypes=dataTypes,
                    colIndices=None, defaultNumValue=float('nan'))
    data = [list(row) for row in rows]

    # sample weights
    sampleWeights = np.array(getCol(data, [nameMap['weight']]))

    outputFieldNames = []
    all_x = []

    # attach numerical fields first
    for name in ['pclass', 'age', 'sibsp', 'parch', 'fare']:
        outputFieldNames.append(name)
        all_x.append(getCol(data, [nameMap[name]]))

    # attach text fields, translated to their integer codes
    for name in ['sex', 'name', 'ticket', 'cabin', 'embarked']:
        outputFieldNames.append(name)
        fieldMap = fieldMaps[name]
        all_x.append([fieldMap[v] for v in getCol(data, [nameMap[name]])])

    # Transpose the list of columns into rows; list() is required because
    # np.array() on a Python 3 zip object yields a 0-d object array.
    all_x = np.array(list(zip(*all_x)))

    return DatasetPair(all_x, fieldNames=outputFieldNames), sampleWeights
def compareResultsToTrueResults(testResults, trueResFname):
    """
    Compare results to the true results stored in a file.

    @param testResults: vector of test results
    @param trueResFname: name of the file containing a column of true results
    @return: the evaluation score, as computed by evaluate()
    """
    # Builtin int instead of np.int: the NumPy alias was a plain synonym of
    # the builtin and no longer exists in NumPy >= 1.24.
    trueTable = csv2dict(trueResFname, hasHeader=True, dataTypes=[int])
    y_true = getCol(trueTable, 0)
    return evaluate(testResults, y_true)
def readTrainingData(trainDataFname, testDataFname):
    """
    Read the training data and build the feature matrix, labels, text-value
    mappings and sample weights.

    @param trainDataFname: name of the training data file
    @param testDataFname: name of the testing data file. used here only to
        get the string->index mapping of the text columns
    @return: allDataPair, fieldMaps, sampleWeights
    """
    fieldNames = ['survived', 'pclass', 'name', 'sex', 'age', 'sibsp', 'parch',
                  'ticket', 'fare', 'cabin', 'embarked', 'weight']
    nameMap = dict((name, idx) for idx, name in enumerate(fieldNames))

    # Builtin int/float instead of np.int/np.float: the NumPy aliases were
    # plain synonyms of the builtins and no longer exist in NumPy >= 1.24.
    dataTypes = np.array([int, int, '|S82', '|S82', float, int, int,
                          '|S82', float, '|S82', '|S82', int])

    # ------ read original data ---------
    # Materialise the rows as a real list: on Python 3 map() is a one-shot
    # iterator with no len(), but the rows are reused and counted below.
    rows = csv2dict(trainDataFname, hasHeader=True, dataTypes=dataTypes,
                    colIndices=None, defaultNumValue=float('nan'))
    data = [list(row) for row in rows]

    all_y = np.array(getCol(data, [nameMap['survived']]))

    # sample weights
    sampleWeights = np.array(getCol(data, [nameMap['weight']]))

    outputFieldNames = []
    all_x = []

    # attach numerical fields first
    for name in ['pclass', 'age', 'sibsp', 'parch', 'fare']:
        outputFieldNames.append(name)
        all_x.append(getCol(data, [nameMap[name]]))

    # attach text fields
    testData = csv2dict(testDataFname, hasHeader=True)
    numTrainingDataPts = len(data)
    fieldMaps = {}
    for name in ['sex', 'name', 'ticket', 'cabin', 'embarked']:
        outputFieldNames.append(name)
        trainIdx = nameMap[name]
        # The test column is taken at index - 1; presumably because the test
        # file has no leading 'survived' column (compare the field list used
        # by readTestingData).
        testValues = list(testData[:, trainIdx - 1])
        # Integerize train and test values together so the mapping covers
        # values that occur only in the test file.
        col, fieldMap = integerizeList(getCol(data, [trainIdx]) + testValues)
        # Only the leading training part of the coded column goes into X.
        all_x.append(col[:numTrainingDataPts])
        fieldMaps[name] = fieldMap

    # Transpose the list of columns into rows; list() is required because
    # np.array() on a Python 3 zip object yields a 0-d object array.
    allDataPair = DatasetPair(np.array(list(zip(*all_x))), all_y,
                              outputFieldNames)

    return allDataPair, fieldMaps, sampleWeights