def crossValidate(S):
    accuracies = {0: {}, 1: {}}  # 0 = BT | 1 = RF
    num = [10, 20, 40, 50]
    totTime = 2 * len(num) * 10
    it = 0
    for s in range(0, 2):
        for t in num:
            accuracies[s][t] = []
    for s in range(0, 2):
        for t in num:
            for idx in range(1, 10):
                # print(idx)
                # test_set = S[idx]
                SC = list(S)
                SC.pop(idx)
                SC = pd.concat(SC, axis=0)
                if s == 0:
                    models = trees.bootstrap(SC, t, 8)
                    testResult = round(trees.testBagging(
                        SC, models, "BT CV_Numtrees"), 2)
                    accuracies[0][t].append(testResult)
                if s == 1:
                    models = trees.bootstrapRandom(SC, t, 8)
                    testResult = round(trees.testBagging(
                        SC, models, "RF CV_Numtrees"), 2)
                    accuracies[1][t].append(testResult)
                it += 1
                sys.stdout.flush()
                progressBar("####### CV_Numtrees Total ########", it, totTime)
    return accuracies
def NMI(clusters, data):
    """Normalised mutual information, computed here as I(C;G) / (H(C) + H(G))."""
    classLabels = data[1].unique()
    IG = 0
    for i, c in enumerate(classLabels):
        for _, value in clusters.items():
            labelled = pd.DataFrame(
                {'x': value[:, 0], 'y': value[:, 1], 'label': value[:, 2]})
            total = len(data)
            vc = labelled['label'].value_counts().get(float(c), -1)
            pcg = (vc / total) if vc != -1 else 0
            pc = data[1].value_counts()[c] / total
            pg = len(labelled) / total
            if pcg == 0:
                IG += 0
            else:
                IG += (pcg * np.log(pcg / (pc * pg)))
        progressBar("NMI", i, len(classLabels))
    HC = 0
    for c in classLabels:
        pc = data[1].value_counts()[c] / len(data)
        HC += pc * np.log(pc)
    HC = -HC
    HG = 0
    for _, value in clusters.items():
        labelled = pd.DataFrame(
            {'x': value[:, 0], 'y': value[:, 1], 'label': value[:, 2]})
        pg = len(labelled) / len(data)
        HG += pg * np.log(pg)
    HG = -HG
    nmi = IG / (HC + HG)
    return nmi
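# Illustrative sketch only: a tiny synthetic call to NMI, assuming the shapes
# used above (data is a header-less DataFrame with class labels in column 1
# and coordinates in columns 2-3; each cluster value is an (n, 3) array of
# [x, y, label]). The example data below is made up for demonstration.
def _nmi_demo():
    data = pd.DataFrame({0: range(4), 1: [0.0, 0.0, 1.0, 1.0],
                         2: [0.1, 0.2, 5.1, 5.2], 3: [0.1, 0.3, 5.0, 5.3]})
    clusters = {1: data[data[1] == 0.0].iloc[:, [2, 3, 1]].values,
                2: data[data[1] == 1.0].iloc[:, [2, 3, 1]].values}
    # Perfectly aligned clusters give 0.5, the maximum under the
    # I(C;G) / (H(C) + H(G)) normalisation used above.
    print(round(NMI(clusters, data), 3))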
def bootstrapRandom(trainingData, m, depthLimit):
    """Build m random-feature trees, each on a bootstrap sample of the training data."""
    models = []
    for i in range(m):
        current = trainingData.sample(frac=1, replace=True)
        attributes = set(list(current.drop("decision", axis=1)))
        models.append(buildRandomTree(current, attributes, 1, depthLimit))
        progressBar("Bootstrap RF", i, m)
    return models
def svmGradientDescent(X, y, weights, eta, Lambda, iterations, tol):
    for step in range(iterations):
        dw = svm_gradient(weights, X, y, Lambda)
        old = copy.copy(weights)
        weights = weights + (eta * dw)
        progressBar("SVM Training", step, iterations)
        if (dist(weights, old) < tol):
            break
        sys.stdout.flush()
    print("\n")
    return weights
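# Illustrative only: `svm_gradient` and `dist` are defined elsewhere in this
# project. As a minimal sketch, an L2-regularised hinge-loss update direction
# with a sign convention matching the `weights + eta * dw` step above (i.e.
# dw already points downhill) could look like the following; y is assumed to
# be encoded as {-1, +1} and X to have one sample per row.
def _svm_gradient_sketch(weights, X, y, Lambda):
    margins = y * (X @ weights)
    mask = (margins < 1).astype(float)         # samples violating the margin
    hinge_part = (mask * y) @ X / X.shape[0]   # average hinge subgradient term
    return hinge_part - Lambda * weights       # minus the regulariser's gradient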
def preprocess(data):
    r = np.arange(6501, len(data))
    data.drop(r, inplace=True)
    headers1 = ['race', 'race_o', 'field']
    headers3 = ['gender']
    headers4 = ['gaming', 'reading']
    preference_scores_of_participant = [
        'attractive_important', 'sincere_important', 'intelligence_important',
        'funny_important', 'ambition_important', 'shared_interests_important'
    ]
    preference_scores_of_partner = [
        'pref_o_attractive', 'pref_o_sincere', 'pref_o_intelligence',
        'pref_o_funny', 'pref_o_ambitious', 'pref_o_shared_interests'
    ]
    preference_scores = preference_scores_of_participant + preference_scores_of_partner
    preference_totals = {}
    for header in preference_scores:
        preference_totals[header] = 0
    for header in headers1:
        data.drop(header, axis=1, inplace=True)
    for header in preference_scores:
        preference_totals[header] = data[header].sum()
    for i, row in data.iterrows():
        for header in headers4:
            current = row[header]
            if int(current) > 10:
                data.at[i, header] = 10
        for header in headers3:
            data.at[i, header] = 0 if row[header] == 'female' else 1
        for header in preference_scores:
            current = row[header]
            if preference_totals[header] != 0:
                data.at[i, header] = current / preference_totals[header]
        if (i % 100 == 0):
            progressBar("Preprocessing", i, 6500)
            sys.stdout.flush()
    # Label Encoding
    labels = [0, 1]
    discrete_labels = ['gender', 'samerace', 'decision']
    cont_labels = (list(data))
    for label in discrete_labels:
        cont_labels.remove(label)
    for label in cont_labels:
        # print(label)
        data[label] = pd.cut(data[label], bins=2,
                             labels=labels, include_lowest=True)
    return data
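# Hypothetical usage sketch (the CSV file name is an assumption, not confirmed
# by this module): read the raw data, then preprocess and bin it as above.
def _preprocess_demo():
    raw = pd.read_csv('dating-full.csv')
    binned = preprocess(raw)
    print(binned.shape)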
def step1(dataset, seed, name, visualize):
    """Sweep the cluster counts in K and record WC-SSD and silhouette coefficient."""
    wc_stats = {}
    sc_stats = {}
    K = [2, 4, 8, 16, 32]
    for i, k in enumerate(K):
        clusters, centroids = kmeans.kmeans(dataset, k, seed, False)
        no_labels = {i: clusters[i][:, :2] for i in clusters}
        wcssd = kmeans.WC_SSD(no_labels, centroids)
        sc = kmeans.SC(no_labels)
        wc_stats[k] = wcssd
        sc_stats[k] = sc
        progressBar("Analysis Step 1", i, len(K))
    if visualize:
        visualize_step1(wc_stats, "({}) WCSSD".format(name))
        visualize_step1(sc_stats, "({}) Silhouette Coefficient".format(name))
    return wc_stats, sc_stats
def kmeans(data, k, seed, visualize=False):
    np.random.seed(seed)
    points = data.iloc[:, [2, 3]].values
    labels = data.iloc[:, 1].values
    maxIter = 50
    N = points.shape[0]  # number of training samples
    numFeatures = points.shape[1]  # x and y coords
    centroids = np.array([]).reshape(numFeatures, 0)
    clusters = {}
    for i in np.random.randint(0, N, size=k):
        centroids = np.c_[centroids, points[i]]
    for step in range(maxIter):
        d = np.array([]).reshape(N, 0)
        for i in range(k):
            # squared Euclidean distance (squaring preserves the argmin)
            dist = np.sum((points - centroids[:, i])**2, axis=1)
            d = np.c_[d, dist]
        C = np.argmin(d, axis=1) + 1
        temp = {}
        for i in range(k):
            temp[i + 1] = np.array([]).reshape(3, 0)
        for i in range(N):
            temp[C[i]] = np.c_[temp[C[i]], np.append(points[i], labels[i])]
        for i in range(k):
            temp[i + 1] = temp[i + 1].T
        for i in range(k):
            centroids[:, i] = np.mean(temp[i + 1][:, :2], axis=0)
        clusters = temp
        progressBar("K-Means Clustering", step, maxIter)
        sys.stdout.flush()
    if visualize:
        color = ['red', 'blue', 'green', 'cyan', 'magenta',
                 '#cc6600', '#ff66cc', '#4d2600', '#cccc00', '#66ff66']
        labels = ['cluster1', 'cluster2', 'cluster3', 'cluster4', 'cluster5',
                  'cluster6', 'cluster7', 'cluster8', 'cluster9', 'cluster10']
        for i in range(k):
            plt.scatter(clusters[i + 1][:, 0], clusters[i + 1][:, 1], c=color[i],
                        label=int(stats.mode(clusters[i + 1][:, 2])[0][0]))
        plt.scatter(centroids[0, :], centroids[1, :],
                    s=300, c='yellow', label='Centroids')
        plt.xlabel('X-Coordinate')
        plt.ylabel('Y-Coordinate')
        plt.legend()
        plt.show()
    return clusters, centroids
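# Hypothetical usage sketch: assumes a DataFrame whose columns are
# [image_id, class_label, x, y], e.g. an embedding CSV read with header=None
# (the file name below is an assumption, not confirmed here).
def _kmeans_demo():
    data = pd.read_csv('digits-embedding.csv', header=None)
    clusters, centroids = kmeans(data, k=8, seed=0, visualize=False)
    # clusters maps cluster id (1..k) to an (n_i, 3) array of [x, y, label];
    # centroids is a (2, k) array of cluster centres.
    print({c: clusters[c].shape for c in clusters}, centroids.shape)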
def SC(clusters):
    S = []
    for key, value in clusters.items():
        # print("SC Value", value)
        A = pdist(value)
        if A.size == 0:
            A = 0
        A = np.unique(A)
        A = np.mean(A)
        others = {i: clusters[i] for i in clusters if i != key}
        B = [cdist(value, others[other]) for other in others]
        B = [np.mean(i) for i in B]
        B = np.mean(B)
        s = (B - A) / np.maximum(A, B)
        S.append(s)
        progressBar("SC", key, len(clusters))
        sys.stdout.flush()
    sc = np.mean(S)
    return sc
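# Illustrative sketch only: SC expects a dict mapping cluster id to an
# (n_i, 2) array of x/y points (labels stripped), as built in step1 above.
# The toy blobs below are made up for demonstration.
def _sc_demo():
    rng = np.random.default_rng(0)
    toy = {1: rng.normal(0, 1, (20, 2)), 2: rng.normal(5, 1, (20, 2))}
    print(round(SC(toy), 3))  # well-separated blobs give a high coefficient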
def test(data, probs, headers, name):
    correct = 0
    total = 0
    for index, row in data.iterrows():
        current = row
        actual = current['decision']
        prediction = makePrediction(probs, current, headers)
        total += 1
        if prediction == actual:
            correct += 1
        if (index % 100 == 0):
            progressBar(name, index, len(data))
            sys.stdout.flush()
    sys.stdout.write("\n")
    accuracy = round(100 * float(correct) / float(total), 2)
    print(name + " Accuracy: " + str(accuracy))
    return accuracy
def testTree(data, model, name):
    total = 0
    correct = 0
    i = 0
    for index, row in data.iterrows():
        current = row
        actual = current['decision']
        prediction = makePrediction(model, current)
        total += 1
        if prediction == actual:
            correct += 1
        if (i % 100 == 0):
            progressBar(name, i, len(data))
        i += 1
        sys.stdout.flush()
    sys.stdout.write("\n")
    accuracy = round(100 * float(correct) / float(total), 2)
    print(name + " Accuracy: " + str(accuracy))
    return accuracy
def crossValidate(S):
    accuracies = {0: {}, 1: {}, 2: {}}  # 0 = DT | 1 = BT | 2 = RF
    t_frac = [0.05, 0.075, 0.1, 0.15, 0.2]
    totTime = 3 * len(t_frac) * 10
    it = 0
    for s in range(0, 3):
        for frac in t_frac:
            accuracies[s][frac] = []
    for s in range(0, 3):
        for frac in t_frac:
            for idx in range(1, 10):
                # print(idx)
                # test_set = S[idx]
                SC = list(S)
                SC.pop(idx)
                SC = pd.concat(SC, axis=0)
                train_set = SC.sample(frac=frac, random_state=32)
                if s == 0:
                    attributes = set(list(train_set.drop("decision", axis=1)))
                    model = trees.buildTree(train_set, attributes, 1, 8)
                    testResult = round(
                        trees.testTree(train_set, model, "DT CV_Frac"), 2)
                    accuracies[0][frac].append(testResult)
                if s == 1:
                    m = 30
                    models = trees.bootstrap(train_set, m, 8)
                    testResult = round(
                        trees.testBagging(train_set, models, "BT CV_Frac"), 2)
                    accuracies[1][frac].append(testResult)
                if s == 2:
                    m = 30
                    models = trees.bootstrapRandom(train_set, m, 8)
                    testResult = round(
                        trees.testBagging(train_set, models, "RF CV_Frac"), 2)
                    accuracies[2][frac].append(testResult)
                it += 1
                sys.stdout.flush()
                progressBar("####### CV_Frac Total ########", it, totTime)
    return accuracies
def crossValidate(S):
    accuracies = {0: {}, 1: {}, 2: {}}  # 0 = DT | 1 = BT | 2 = RF
    d = [3, 5, 7, 9]
    totTime = 3 * len(d) * 10
    it = 0
    for s in range(0, 3):
        for depth in d:
            accuracies[s][depth] = []
    for s in range(0, 3):
        for depth in d:
            for idx in range(1, 10):
                # print(idx)
                # test_set = S[idx]
                SC = list(S)
                SC.pop(idx)
                SC = pd.concat(SC, axis=0)
                if s == 0:
                    attributes = set(list(SC.drop("decision", axis=1)))
                    model = trees.buildTree(SC, attributes, 1, depth)
                    testResult = round(
                        trees.testTree(SC, model, "DT CV_Depth"), 2)
                    accuracies[0][depth].append(testResult)
                if s == 1:
                    m = 30
                    models = trees.bootstrap(SC, m, depth)
                    testResult = round(
                        trees.testBagging(SC, models, "BT CV_Depth"), 2)
                    accuracies[1][depth].append(testResult)
                if s == 2:
                    m = 30
                    models = trees.bootstrapRandom(SC, m, depth)
                    testResult = round(
                        trees.testBagging(SC, models, "RF CV_Depth"), 2)
                    accuracies[2][depth].append(testResult)
                it += 1
                sys.stdout.flush()
                progressBar("####### CV_Depth Total ########", it, totTime)
    return accuracies
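# Hypothetical usage sketch: the cross-validation loops above expect S to be a
# list of ten DataFrame folds. This particular fold construction is an
# assumption for illustration, not taken from this project.
def _cv_demo(data):
    folds = np.array_split(data.sample(frac=1, random_state=32), 10)
    return crossValidate(folds)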
def preprocess(data):
    r = np.arange(6501, len(data))
    data.drop(r, inplace=True)
    headers1 = ['race', 'race_o', 'field']
    headers2 = ['field']
    headers3 = ['gender', 'race', 'race_o', 'field']
    headers4 = ['gaming', 'reading']
    preference_scores_of_participant = [
        'attractive_important', 'sincere_important', 'intelligence_important',
        'funny_important', 'ambition_important', 'shared_interests_important']
    preference_scores_of_partner = [
        'pref_o_attractive', 'pref_o_sincere', 'pref_o_intelligence',
        'pref_o_funny', 'pref_o_ambitious', 'pref_o_shared_interests']
    preference_scores = preference_scores_of_participant + preference_scores_of_partner
    preference_totals = {}
    preference_counts = {}
    # encodingLengths = {}
    for score in preference_scores:
        preference_totals[score] = 0
        preference_counts[score] = 0
    # encoding_values = {}
    for i, row in data.iterrows():
        for header in headers4:
            current = row[header]
            if int(current) > 10:
                data.at[i, header] = 10
        for header in headers1:
            current = row[header]
            if current.startswith("'") and current.endswith("'") and len(current) > 1:
                data.at[i, header] = current.strip("\'")
        for header in headers2:
            current = data.at[i, header]
            if current[0].isupper():
                current = current.lower()
                data.at[i, header] = current
        for header in preference_scores:
            current = row[header]
            preference_totals[header] += float(current)
            preference_counts[header] += 1
        for header in preference_scores:
            current = row[header]
            if preference_totals[header] != 0:
                data.at[i, header] = float(current) / preference_totals[header]
        if (i % 100 == 0):
            progressBar("Preprocessing", i, 6500)
            sys.stdout.flush()
    gender_l = sorted(list(data['gender'].unique()))
    (gender_len, gender_index) = (len(gender_l), gender_l.index('female'))
    gender_vector = getEncoding(gender_len, gender_index)
    race_l = sorted(list(data['race'].unique()))
    (race_len, race_index) = (len(race_l), race_l.index('Black/African American'))
    race_vector = getEncoding(race_len, race_index)
    raceo_l = sorted(list(data['race_o'].unique()))
    (raceo_len, raceo_index) = (len(raceo_l), raceo_l.index('Other'))
    raceo_vector = getEncoding(raceo_len, raceo_index)
    field_l = sorted(list(data['field'].unique()))
    (field_len, field_index) = (len(field_l), field_l.index('economics'))
    field_vector = getEncoding(field_len, field_index)
    for header in headers3:
        dropped = header + "_" + sorted(data[header].unique())[-1]
        data = pd.concat([data, pd.get_dummies(
            data[header], prefix=header)], axis=1)
        data.drop(header, axis=1, inplace=True)
        data.drop(dropped, axis=1, inplace=True)
    print("\n")
    print("Mapped vector for female in column gender: " + str(gender_vector))
    print("Mapped vector for Black/African American in column race: " + str(race_vector))
    print("Mapped vector for Other in column race_o: " + str(raceo_vector))
    print("Mapped vector for economics in column field: " + str(field_vector))
    return data