import numpy as np


def ForwardSelect(data, k, trace=False):
    '''
    Step-wise forward selection method

    Start with an empty set of features. Iteratively add the one feature,
    out of the features not yet chosen, which improves the Silhouette
    coefficient the most. The algorithm has converged when adding any
    feature does not improve the coefficient, or no features remain
    unchosen.
    '''
    selected = np.zeros(0, int)      # idx of selected features, start w/ empty
    baseCoeff = -1 - 1e-9            # -1 is the worst possible performance
    dM = pairwiseDist(data)          # pre-calc distance matrix for memoization
    converged, nRound = False, 1
    while not converged:             # loop until convergence
        bestFeat, bestCoeff, means, labels = SelectBestFeature(
            data, selected, k, dM)
        if bestCoeff <= baseCoeff:   # new feature doesn't improve performance
            converged = True
        else:                        # new feature improves performance
            selected = np.hstack([selected, bestFeat])  # add feature to selection
            baseCoeff = bestCoeff    # set new coeff as baseline performance
            outs = (means, labels)   # save output vars
            if len(selected) == data.shape[1]:
                converged = True     # converged once all features selected
        if trace:                    # print iteration info if requested
            tmplate = "[%02d] Best coeff=%f, set:%s"
            print(tmplate % (nRound, bestCoeff, str(selected)))
        nRound += 1
    return (selected, ) + outs  # selected features, means, cluster labels
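
# pairwiseDist, SelectBestFeature, and the other helpers referenced in this
# file are defined elsewhere in the project. Judging only from the call
# sites (one or two 2-D arrays in, a rows-of-first by rows-of-second matrix
# out), a minimal Euclidean sketch of pairwiseDist could look like the
# function below; the project's actual helper may differ, e.g. in metric.

def pairwiseDistSketch(X, Y=None):
    # Euclidean distance matrix between the rows of X and the rows of Y;
    # Y defaults to X, giving all within-set distances.
    Y = X if Y is None else Y
    diff = X[:, None, :] - Y[None, :, :]     # shape (len(X), len(Y), nDims)
    return np.sqrt((diff ** 2).sum(axis=2))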
def geneticAlgoSelect(data, k, prm, trace=False):
    '''
    Main function of genetic-algorithm feature selection.

    Generate a population of feature sets by randomly drawing 0s and 1s
    with the probability specified in the input parameters. For this
    population, evaluate the fitness with the help of a memo: by storing
    computed results in a dictionary, subsequent individuals with the same
    set of features can be skipped and their results retrieved from the
    dict. The algorithm is considered to have converged when the Silhouette
    coefficient has not improved by at least the minimum required amount
    (specified in the input parameters) for a given number of generations.
    Every generation applies the usual selection, crossover, and mutation
    operators to the population.
    '''
    pop = np.random.rand(prm['popSize'], data.shape[1]) < prm['onProb']
    pop = minOneFeature(pop)         # at least 1 feature must be selected
    memo = dict()                    # dict of results for memoization
    dMat = pairwiseDist(data)        # pre-calc distance matrix for memoization
    baseFit = 0                      # worst possible fitness score
    converged, gen, stagnGens = False, 1, 0  # initialize loop vars
    while not converged:             # loop until GA has converged
        fit, memo = evalFitness(data, k, pop, memo, dMat)  # evaluate fitness
        bestIdx = np.argmax(fit)     # keep track of best individual
        bestFit, bestIndv = fit[bestIdx], pop[bestIdx]  # best fit and features
        if (bestFit - baseFit < prm['minImprove']) and stagnGens > prm['stagnLim']:
            converged = True
            out = baseFit - 1, np.where(baseIndv)[0]  # silhouette coeff, feature list
        else:                        # not converged: selection + crossover + mutation
            if bestFit - baseFit < prm['minImprove']:
                stagnGens += 1       # another generation without improvement
            else:
                baseFit, baseIndv = bestFit, bestIndv  # record long-run best
                stagnGens = 0        # reset counter on improvement
            parentInd = selectParents(fit, pop.shape[0])  # select parents
            pop = crossOver(pop, parentInd)  # cross-over to get next gen
            pop = mutate(pop, prm['mutateProb'])  # mutate
        if trace:
            print('Generation %d: best fitness = %.10f' % (gen, baseFit))
            print('\tBest set: %s' % str(np.where(baseIndv)[0]))
        gen += 1
    return out
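
# selectParents, crossOver, mutate, minOneFeature, and evalFitness live
# elsewhere in the project. As a rough, non-authoritative illustration of
# what geneticAlgoSelect expects from the three GA operators -- assuming
# fitness-proportional selection, single-point crossover, and independent
# bit-flip mutation, which the actual operators may well implement
# differently:

def selectParentsSketch(fit, nParents):
    # Roulette-wheel selection: sample parent indices with probability
    # proportional to fitness.
    p = fit / fit.sum()
    return np.random.choice(len(fit), size=nParents, p=p)


def crossOverSketch(pop, parentInd):
    # Single-point crossover: splice consecutive pairs of the selected
    # parents at a random locus to produce the next generation.
    parents = pop[parentInd]
    nextGen = parents.copy()
    for i in range(0, len(parents) - 1, 2):
        cut = np.random.randint(1, pop.shape[1])   # crossover point
        nextGen[i, cut:] = parents[i + 1, cut:]
        nextGen[i + 1, cut:] = parents[i, cut:]
    return nextGen


def mutateSketch(pop, mutateProb):
    # Bit-flip mutation: each gene flips independently with prob mutateProb.
    # (A guard such as the module's minOneFeature would be needed to
    # re-enable a feature in any row that ends up all-False.)
    flip = np.random.rand(*pop.shape) < mutateProb
    return pop ^ flip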
def updateMeans(data, means):
    '''
    Calculate and update centroids for the K-means algorithm.

    The function has two parts:
    1) Assign each point to the mean to which it has the shortest distance
    2) Calculate the new means as the centroids of all points in each group

    Returns the new means and the assignment of data points to these means.
    '''
    tmpDist = pairwiseDist(means, data)  # dist between means and all data pts
    minClus = tmpDist.argmin(axis=0)     # group where distance is smallest
    newMeans = np.zeros([len(means), data.shape[1]])  # new mean points
    for n, x in enumerate(means):        # loop over all clusters
        tmp = np.vstack((data[minClus == n, ], x))  # concat data pts and centroid
        newMeans[n] = tmp.mean(axis=0)   # new mean = centroid of all pts
    return newMeans, minClus
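
# updateMeans performs one Lloyd iteration; the outer K-means driver is
# defined elsewhere. A minimal sketch, assuming random-sample initialization
# and assignment stability as the stopping rule (the name kMeansSketch and
# the maxIter cap are this sketch's own, not the project's):

def kMeansSketch(data, k, maxIter=100):
    means = data[np.random.choice(len(data), k, replace=False)]  # seed means
    labels = np.full(len(data), -1)
    for _ in range(maxIter):
        means, newLabels = updateMeans(data, means)  # one Lloyd iteration
        if np.array_equal(newLabels, labels):        # assignments stable
            break
        labels = newLabels
    return means, labels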
def KNN(trainX, trainY, testX, K, categorical):
    '''
    K-nearest Neighbors classifier.

    Arguments are: the training data, training labels, test data, the K
    hyperparameter, and a boolean indicating whether the labels are
    categorical.

    The function first calculates all pairwise distances between the test
    data points and the training data points. It then finds the K closest
    training points for each test point. The labels of these K points are
    then either:
    1) averaged, if the labels are not categorical (regression), or
    2) put to a plurality vote, if they are categorical (classification)
    '''
    dists = pairwiseDist(testX, trainX)  # all pairwise dist of two datasets
    knnIdx, _ = kMinValIdx(dists, K)     # idx of K closest pts in training set
    knnLabels = trainY[knnIdx]           # labels of these closest data points
    if not categorical:                  # regression: calculate mean
        testY = knnLabels.mean(axis=1)   # mean of K closest label values
    else:                                # classification: most common label
        testY = np.array([mostCommonElem(lab) for lab in knnLabels])
    return testY                         # return predicted labels
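
# kMinValIdx and mostCommonElem are helpers defined elsewhere. From their
# two call sites (KNN above and consistentSubset below) they appear to
# return the indices/values of the K smallest entries per row, and the
# plurality element of an array. A plausible sketch, not the project's
# actual code:

from collections import Counter


def kMinValIdxSketch(dists, K):
    # Indices and values of the K smallest entries along the last axis;
    # argpartition avoids a full sort.
    idx = np.argpartition(dists, K - 1, axis=-1)[..., :K]
    return idx, np.take_along_axis(dists, idx, axis=-1)


def mostCommonElemSketch(arr):
    # Plurality vote: the most frequent label among the K neighbors.
    return Counter(arr.tolist()).most_common(1)[0][0]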
def consistentSubset(trainX, trainY, K=1):
    '''
    Use Hart's algorithm to find a consistent subset.

    Arguments are: training data, training labels, and K. K defaults to one
    as per the original Hart's algorithm. The algorithm randomly picks one
    point as the initial subset Z, then repeatedly sweeps the remaining
    samples in random order, adding to Z any sample that its nearest
    neighbor in Z misclassifies. It converges when a full sweep adds
    nothing.
    '''
    dists = pairwiseDist(trainX)         # all pairwise dist within training set
    idx = np.arange(trainX.shape[0])     # construct index of data rows
    Z, idx = pickAndRemove(idx)          # randomly pick 1st pt of the subset
    converged = False
    while not converged:
        converged = True                 # stop unless a misclassification
        np.random.shuffle(idx)           # visit remaining samples randomly
        for x in idx:                    # loop over all samples
            nnIdx = kMinValIdx(dists[x, Z], 1)[0]  # position of NN within Z
            nnLabel = trainY[Z[nnIdx]].flatten()   # label of NN of x in Z
            if nnLabel != trainY[x]:     # if misclassified
                Z = np.hstack([Z, x])    # add to consistent subset
                converged = False        # continue training
        idx = np.setdiff1d(idx, Z)       # remove subset members from samples
    return Z, idx
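
# pickAndRemove is defined elsewhere; from its single call site it pops one
# random element from an index array. A plausible sketch, followed by a
# hypothetical end-to-end usage on synthetic data (the two-blob example is
# purely illustrative):

def pickAndRemoveSketch(idx):
    i = np.random.randint(len(idx))               # random position
    return np.array([idx[i]]), np.delete(idx, i)  # (picked, remaining)


if __name__ == '__main__':
    # Condense a two-blob training set with Hart's algorithm, then classify
    # test points with 1-NN restricted to the consistent subset.
    trainX = np.vstack([np.random.randn(50, 2), np.random.randn(50, 2) + 4])
    trainY = np.repeat([0, 1], 50)
    testX = np.vstack([np.random.randn(10, 2), np.random.randn(10, 2) + 4])
    Z, _ = consistentSubset(trainX, trainY)
    predY = KNN(trainX[Z], trainY[Z], testX, 1, True)
    print('subset size: %d / %d' % (len(Z), len(trainX)))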