def createTree(X, y, minSampleSplit=2, maxDepth=None, depth=0):
    """Recursively build a decision tree.

    Parameters
    ----------
    X : array-like
        Feature matrix (one sample per row).
    y : array-like
        Target labels for the samples in X.
    minSampleSplit : int
        Minimum number of samples required to attempt a further split.
    maxDepth : int or float or None
        Maximum recursion depth; None means unbounded.
    depth : int
        Current depth (internal recursion counter; callers leave the default).

    Returns
    -------
    dict or leaf label
        A nested dict {columnIndex: [threshold, lowerSubtree, upperSubtree]},
        or a leaf class label (whatever selectClass(y) returns) when
        splitting stops.
    """
    X = np.array(X)
    y = np.array(y)
    depth += 1

    # Keep growing only while all three conditions hold:
    #  - depth limit not exceeded (use `is None`, not `== None`),
    #  - enough samples to split,
    #  - node is not already pure (entropy != 0).
    withinDepth = maxDepth is None or (
        isinstance(maxDepth, (int, float)) and depth <= maxDepth
    )
    if not (withinDepth and len(y) >= minSampleSplit and computeEntropy(y) != 0):
        return selectClass(y)  # leaf: majority/representative class

    splits = computeSplits(X)
    # optimumSplit returns a single-entry dict {column: threshold}.
    columnIndex, threshold = next(iter(optimumSplit(X, y, splits).items()))

    lowerX, upperX, lowerY, upperY = splitData(X, y, columnIndex, threshold)
    lowerNode = createTree(lowerX, lowerY, minSampleSplit=minSampleSplit,
                           depth=depth, maxDepth=maxDepth)
    upperNode = createTree(upperX, upperY, minSampleSplit=minSampleSplit,
                           depth=depth, maxDepth=maxDepth)

    return {columnIndex: [threshold, lowerNode, upperNode]}
def optimumSplit(X, y, splits):
    """Find the (column, threshold) pair minimising post-split branch entropy.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Target labels.
    splits : dict
        Mapping of column index -> iterable of candidate thresholds
        (as produced by computeSplits).

    Returns
    -------
    dict
        A single-entry dict {optimumColumn: optimumThreshold}. If `splits`
        is empty, returns {None: None} (original behavior preserved).
    """
    X = np.array(X)
    y = np.array(y)

    # BUG FIX: was a magic cap of 10000 — any split whose branch entropy
    # exceeded it would be silently rejected. Use +inf so the first
    # candidate always wins the initial comparison.
    minimumEntropy = float('inf')
    optimumColumn = None
    optimumThreshold = None

    for columnIndex, thresholdList in splits.items():
        for threshold in thresholdList:
            # Only the label partitions are needed to score the split.
            _, _, y1, y2 = splitData(X, y, columnIndex, threshold)
            # Compute once (the original evaluated this twice per candidate).
            branchEntropy = computeBranchEntropy(y1, y2)
            if branchEntropy < minimumEntropy:
                minimumEntropy = branchEntropy
                optimumColumn = columnIndex
                optimumThreshold = threshold

    return {optimumColumn: optimumThreshold}
data = loadmat('Data/Data.mat')

# X is a matrix containing Training Data (one flattened image per row)
# y is a matrix containing Training Labels
X = data['X']
y = data['y']

print('Displaying 100 Random Images')
rand_indices = np.random.permutation(range(X.shape[0]))
sel = X[rand_indices[0:100], :]
displayData(sel)

print('Seperating Data into Test and Training Sets')
print('\n')
# create Test and Train examples
# NOTE(review): this splitData takes 2 args and returns 5 values — a
# different signature from the 4-arg splitData used by the tree code;
# confirm the intended helper is in scope here.
X_test, X_train, y_train, Y_test, Y = splitData(X, y)

print('One Hot Encoding Labels')
print('\n')
encoder = OneHotEncoder(sparse=False, categories='auto')
y_onehot = encoder.fit_transform(y)
# BUG FIX: was encoder.fit_transform(y_train), which REFITS the encoder on
# the training subset. If y_train happens to miss a class, the one-hot
# column layout would differ from y_onehot's. Reuse the encoder fitted on
# the full label set.
y_train = encoder.transform(y_train)

print('Setting up Neural Network')
print('\n')
# initial setup
input_size = 400      # 20x20-pixel images flattened — TODO confirm
hidden_size = 25      # hidden-layer units
num_labels = 10       # output classes
learning_rate = .9
# NOTE(review): the next two assignments read `data[i]` via a bare index
# `i`, so they appear to be the tail of a per-example loop whose header is
# outside this chunk — confirm against the full file before relying on
# this reconstruction.
reviews[i] = ' '.join(data[i][0:-1])
labels[i] = int(data[i][-1])

##### convert labels with 0 to -1 ###########
for i in range(len(labels)):
    if labels[i] == 0:
        labels[i] = -1
labels = np.asarray(labels)

path_to_weight_matrix = 'path to weight matrix'
fname = 'weightmatrix.h5'
weight_matrix_df = pd.read_hdf(fname)
# BUG FIX: DataFrame.as_matrix() was deprecated in pandas 0.23 and removed
# in 1.0; to_numpy() (available since 0.24) is the supported equivalent.
weight_matrix = weight_matrix_df.to_numpy()

splits = 10
trainb_ilst, trainy_ilst, testb_ilst, testy_ilst = splitData(weight_matrix, labels, splits)

split_idx = 0
#### determine train and test data #####
train_mat = np.array(trainb_ilst[split_idx])
train_labels = np.array(trainy_ilst[split_idx])
test_mat = np.array(testb_ilst[split_idx])
test_labels = np.array(testy_ilst[split_idx])

num_features = 80

#### set training parameters ##########
# load guess matrix either with LSA or word2vec
fname = 'lsaguessvectors.h5'
word_guess_df = pd.read_hdf(fname)
word_guess = word_guess_df.to_numpy()