def crossValidate(S):
    """Cross-validate bagged trees and random forests over the ensemble size.

    For every ensemble size in ``[10, 20, 40, 50]`` and every fold of ``S``,
    an ensemble is trained on the concatenation of all *other* folds (third
    trainer argument fixed at 8) and scored with ``trees.testBagging`` on the
    fold that was held out.

    Args:
        S: Sequence of DataFrames, one per fold. At least two folds are
            required so that a non-empty training set remains.

    Returns:
        dict: ``{0: {t: [...]}, 1: {t: [...]}}`` where key 0 holds the
        bagged-tree (BT) scores and key 1 the random-forest (RF) scores.
        Each list contains one score (rounded to 2 decimals) per fold for
        ensemble size ``t``.
    """
    num = [10, 20, 40, 50]
    # Model family id -> (training function, progress/report label).
    learners = {
        0: (trees.bootstrap, "BT CV_Numtrees"),
        1: (trees.bootstrapRandom, "RF CV_Numtrees"),
    }
    accuracies = {s: {t: [] for t in num} for s in learners}

    folds = list(S)
    # The progress total is derived from the real fold count so the bar
    # actually reaches its end.
    totTime = len(learners) * len(num) * len(folds)
    it = 0
    for s, (train, label) in learners.items():
        for t in num:
            # Every fold, including fold 0, takes a turn as the test set.
            for idx in range(len(folds)):
                test_set = folds[idx]
                train_set = pd.concat(folds[:idx] + folds[idx + 1:], axis=0)
                models = train(train_set, t, 8)
                # Score on the held-out fold; scoring on the training data
                # would report an optimistic, non-cross-validated accuracy.
                testResult = round(
                    trees.testBagging(test_set, models, label), 2)
                accuracies[s][t].append(testResult)
                it += 1
                sys.stdout.flush()
                progressBar("####### CV_Numtrees Total ########", it, totTime)
    return accuracies
def NMI(clusters, data):
    """Compute normalised mutual information between clusters and classes.

    The score is ``I(C; G) / (H(C) + H(G))`` where ``C`` is the class label
    read from column ``1`` of ``data`` and ``G`` is the cluster membership.
    Natural logarithms are used throughout.

    Args:
        clusters: Mapping of cluster id to a 2-D array whose column 2 holds
            the class label of each member (columns 0 and 1 are not used
            here).
        data: DataFrame whose column labelled ``1`` holds the class label of
            every sample; ``len(data)`` is used as the sample count.

    Returns:
        The normalised mutual information. When both entropies are zero
        (one class and one non-empty cluster) the ratio is undefined and the
        division is left to produce NumPy's nan/inf.
    """
    total = len(data)
    classLabels = data[1].unique()
    class_counts = data[1].value_counts()

    # Summarise every cluster once: (p(g), label histogram). Empty clusters
    # carry no probability mass and are skipped, otherwise 0 * log(0) would
    # turn the cluster entropy (and the final score) into nan.
    groups = []
    for value in clusters.values():
        size = len(value)
        if size == 0:
            continue
        groups.append((size / total, pd.Series(value[:, 2]).value_counts()))

    # Mutual information I(C; G).
    IG = 0
    for i, c in enumerate(classLabels):
        pc = class_counts[c] / total
        for pg, label_counts in groups:
            vc = label_counts.get(float(c), 0)
            if vc == 0:
                # Class absent from this cluster: contributes nothing.
                continue
            pcg = vc / total
            IG += pcg * np.log(pcg / (pc * pg))

        progressBar("NMI", i, len(classLabels))

    # Class entropy H(C).
    HC = 0
    for c in classLabels:
        pc = class_counts[c] / total
        HC += pc * np.log(pc)
    HC = -HC

    # Cluster entropy H(G) over the non-empty clusters.
    HG = 0
    for pg, _ in groups:
        HG += pg * np.log(pg)
    HG = -HG

    nmi = IG / (HC + HG)
    return nmi
def bootstrapRandom(trainingData, m, depthLimit):
    """Train ``m`` random trees, each on a bootstrap resample of the data.

    Args:
        trainingData: DataFrame containing a ``"decision"`` column; every
            other column is offered to the tree builder as an attribute.
        m: Number of trees to build.
        depthLimit: Passed through to ``buildRandomTree`` as its last
            argument.

    Returns:
        list: The ``m`` models returned by ``buildRandomTree``, in build
        order.
    """
    forest = []
    for tree_index in range(m):
        # Same size as the input, drawn with replacement.
        resampled = trainingData.sample(frac=1, replace=True)
        # A fresh attribute set per tree, built from the non-target columns.
        feature_frame = resampled.drop("decision", axis=1)
        feature_names = set(feature_frame.columns)
        tree = buildRandomTree(resampled, feature_names, 1, depthLimit)
        forest.append(tree)
        progressBar("Bootstrap RF", tree_index, m)
    return forest
Beispiel #4
0
def svmGradientDescent(X, y, weights, eta, Lambda, iterations, tol):
    """Iteratively update SVM weights until they stop moving.

    Each step adds ``eta * svm_gradient(weights, X, y, Lambda)`` to the
    weights (the sign convention is therefore whatever ``svm_gradient``
    returns) and stops early once ``dist(new, old) < tol``.

    Args:
        X: Feature data, forwarded to ``svm_gradient``.
        y: Targets, forwarded to ``svm_gradient``.
        weights: Initial weights.
        eta: Step size multiplying the gradient term.
        Lambda: Regularisation argument forwarded to ``svm_gradient``.
        iterations: Maximum number of update steps.
        tol: Early-stop threshold on the distance between successive
            weight values.

    Returns:
        The weights after the last performed update.
    """
    for iteration in range(iterations):
        gradient = svm_gradient(weights, X, y, Lambda)
        previous = copy.copy(weights)
        weights = weights + (eta * gradient)
        progressBar("SVM Training", iteration, iterations)
        movement = dist(weights, previous)
        if movement < tol:
            # Converged: the update no longer changes the weights enough.
            break
    sys.stdout.flush()
    print("\n")
    return weights
def preprocess(data):
    """Clean the dating data in place and binarise its continuous columns.

    Steps, in order:
      1. Drop the rows labelled ``6501 .. len(data) - 1``.
      2. Drop the ``race``, ``race_o`` and ``field`` columns.
      3. Row by row: cap ``gaming``/``reading`` at 10, encode ``gender`` as
         0 for ``'female'`` and 1 otherwise, and divide every preference
         score by its column total (when that total is non-zero).
      4. Cut every column except ``gender``, ``samerace`` and ``decision``
         into two equal-width bins labelled 0 and 1.

    Args:
        data: DataFrame with the columns named above; it is modified in
            place.

    Returns:
        The same DataFrame object after preprocessing.
    """
    data.drop(np.arange(6501, len(data)), inplace=True)

    dropped_columns = ['race', 'race_o', 'field']
    capped_columns = ['gaming', 'reading']
    participant_prefs = [
        'attractive_important', 'sincere_important', 'intelligence_important',
        'funny_important', 'ambition_important', 'shared_interests_important'
    ]
    partner_prefs = [
        'pref_o_attractive', 'pref_o_sincere', 'pref_o_intelligence',
        'pref_o_funny', 'pref_o_ambitious', 'pref_o_shared_interests'
    ]
    preference_scores = participant_prefs + partner_prefs

    for column in dropped_columns:
        data.drop(column, axis=1, inplace=True)

    # Column totals are taken before any cell is rewritten below.
    preference_totals = {
        header: data[header].sum() for header in preference_scores
    }

    for i, row in data.iterrows():
        # `row` is the snapshot handed out by iterrows; writes go to `data`.
        for header in capped_columns:
            if int(row[header]) > 10:
                data.at[i, header] = 10

        if row['gender'] == 'female':
            gender_code = 0
        else:
            gender_code = 1
        data.at[i, 'gender'] = gender_code

        for header in preference_scores:
            column_total = preference_totals[header]
            if column_total != 0:
                data.at[i, header] = row[header] / column_total

        if i % 100 == 0:
            progressBar("Preprocessing", i, 6500)
    sys.stdout.flush()

    # Label encoding: everything that is not already discrete becomes 0/1.
    bin_codes = [0, 1]
    cont_labels = list(data)
    for discrete in ('gender', 'samerace', 'decision'):
        cont_labels.remove(discrete)
    for label in cont_labels:
        data[label] = pd.cut(
            data[label], bins=2, labels=bin_codes, include_lowest=True)
    return data
Beispiel #6
0
def step1(dataset, seed, name, visualize):
    """Run k-means for several k and collect WC-SSD and SC per k.

    Args:
        dataset: Data forwarded to ``kmeans.kmeans``.
        seed: Random seed forwarded to ``kmeans.kmeans``.
        name: Dataset name embedded in the plot titles.
        visualize: When truthy, both statistics are plotted with
            ``visualize_step1``.

    Returns:
        tuple: ``(wc_stats, sc_stats)``, two dicts keyed by
        ``k in [2, 4, 8, 16, 32]``.
    """
    K = [2, 4, 8, 16, 32]
    wc_stats, sc_stats = {}, {}
    for position, k in enumerate(K):
        clusters, centroids = kmeans.kmeans(dataset, k, seed, False)
        # Keep only the first two columns (the label column is dropped).
        coords_only = {
            cluster_id: members[:, :2]
            for cluster_id, members in clusters.items()
        }
        wcssd = kmeans.WC_SSD(coords_only, centroids)
        silhouette = kmeans.SC(coords_only)
        wc_stats[k] = wcssd
        sc_stats[k] = silhouette
        progressBar("Analysis Step 1", position, len(K))

    if visualize:
        visualize_step1(wc_stats, "({}) WCSSD".format(name))
        visualize_step1(sc_stats, "({}) Silhouette Coefficient".format(name))
    return wc_stats, sc_stats
def kmeans(data, k, seed, visualize=False):
    """Cluster 2-D points with k-means (50 fixed iterations).

    The coordinates are read from the columns at positions 2 and 3 of
    ``data`` and the class label from the column at position 1.

    Args:
        data: DataFrame laid out as described above.
        k: Number of clusters.
        seed: Seed for ``np.random``; the initial centroids are ``k`` points
            drawn with ``np.random.randint`` (duplicates are possible).
        visualize: When true, scatter-plot the clusters and centroids.

    Returns:
        tuple: ``(clusters, centroids)`` where ``clusters`` maps the ids
        ``1..k`` to float arrays of shape ``(n_members, 3)`` holding
        ``x, y, label`` rows in input order, and ``centroids`` is a
        ``(2, k)`` array with one column per cluster.
    """
    np.random.seed(seed)
    points = data.iloc[:, [2, 3]].values
    labels = data.iloc[:, 1].values
    maxIter = 50
    N = points.shape[0]  # number of training samples

    # One (x, y, label) row per sample; clusters are sliced out of this with
    # boolean masks instead of being grown one column at a time.
    rows = np.column_stack((points, labels)).astype(float)

    # Initial centroids: k randomly chosen samples, one per column.
    centroids = points[np.random.randint(0, N, size=k)].T.astype(float)
    clusters = {}

    for step in range(maxIter):
        # Squared Euclidean distance of every point to every centroid.
        d = np.column_stack([
            np.sum((points - centroids[:, j]) ** 2, axis=1)
            for j in range(k)
        ])
        C = np.argmin(d, axis=1) + 1

        temp = {}
        for cluster_id in range(1, k + 1):
            members = rows[C == cluster_id]
            temp[cluster_id] = members
            # An empty cluster keeps its previous centroid; averaging zero
            # points would yield nan and poison every later distance.
            if len(members):
                centroids[:, cluster_id - 1] = np.mean(members[:, :2], axis=0)
        clusters = temp
        progressBar("K-Means Clustering", step, maxIter)
    sys.stdout.flush()

    if visualize:
        color = ['red', 'blue', 'green', 'cyan', 'magenta',
                 '#cc6600', '#ff66cc', '#4d2600', '#cccc00', '#66ff66']
        for cluster_id in range(1, k + 1):
            members = clusters[cluster_id]
            if len(members) == 0:
                continue
            # Majority label of the cluster (smallest value wins ties).
            values, counts = np.unique(members[:, 2], return_counts=True)
            majority = values[np.argmax(counts)]
            plt.scatter(members[:, 0], members[:, 1],
                        # Colours repeat when k exceeds the palette size.
                        c=color[(cluster_id - 1) % len(color)],
                        label=int(majority))
        plt.scatter(centroids[0, :], centroids[1, :],
                    s=300, c='yellow', label='Centroids')
        plt.xlabel('X-Coordinate')
        plt.ylabel('Y-Coordinate')
        plt.legend()
        plt.show()
    return clusters, centroids
def SC(clusters):
    """Return a cluster-level silhouette-style score averaged over clusters.

    For each cluster, ``A`` is the mean of the *distinct* pairwise distances
    between its members (0 when there are fewer than two members) and ``B``
    is the mean, over all other clusters, of the mean member-to-member
    distance to that cluster. The cluster's score is
    ``(B - A) / max(A, B)``.

    NOTE(review): this is computed per cluster rather than per point and
    averages over the other clusters instead of taking the nearest one, so
    it differs from the textbook silhouette coefficient.

    Args:
        clusters: Mapping of cluster id to a 2-D array of member points.

    Returns:
        The mean of the per-cluster scores.
    """
    scores = []
    for key, members in clusters.items():
        intra = pdist(members)
        if intra.size == 0:
            intra = 0
        A = np.mean(np.unique(intra))

        inter_means = [
            np.mean(cdist(members, other))
            for other_key, other in clusters.items()
            if other_key != key
        ]
        B = np.mean(inter_means)

        scores.append((B - A) / np.maximum(A, B))
        progressBar("SC", key, len(clusters))
    sys.stdout.flush()
    return np.mean(scores)
Beispiel #9
0
def test(data, probs, headers, name):
    """Score ``makePrediction`` against the ``decision`` column of ``data``.

    Args:
        data: DataFrame with a ``decision`` column holding the true value.
        probs: Model forwarded to ``makePrediction``.
        headers: Forwarded to ``makePrediction``.
        name: Label used for the progress bar and the printed summary.

    Returns:
        Accuracy as a percentage rounded to 2 decimals. Raises
        ``ZeroDivisionError`` when ``data`` has no rows.
    """
    correct = 0
    total = 0
    for index, row in data.iterrows():
        actual = row['decision']
        prediction = makePrediction(probs, row, headers)
        total += 1
        if prediction == actual:
            correct += 1
        # The progress bar is refreshed on index labels divisible by 100.
        if index % 100 == 0:
            progressBar(name, index, len(data))
    sys.stdout.flush()
    sys.stdout.write("\n")
    accuracy = round(100 * float(correct) / float(total), 2)
    print(name + " Accuracy: " + str(accuracy))
    return accuracy
def testTree(data, model, name):
    """Score a single model against the ``decision`` column of ``data``.

    Args:
        data: DataFrame with a ``decision`` column holding the true value.
        model: Model forwarded to ``makePrediction``.
        name: Label used for the progress bar and the printed summary.

    Returns:
        Accuracy as a percentage rounded to 2 decimals. Raises
        ``ZeroDivisionError`` when ``data`` has no rows.
    """
    correct = 0
    total = 0
    for position, (_, row) in enumerate(data.iterrows()):
        actual = row['decision']
        prediction = makePrediction(model, row)
        total += 1
        if prediction == actual:
            correct += 1
        # Progress is reported by row position, every 100 rows.
        if position % 100 == 0:
            progressBar(name, position, len(data))
    sys.stdout.flush()
    sys.stdout.write("\n")
    accuracy = round(100 * float(correct) / float(total), 2)
    print(name + " Accuracy: " + str(accuracy))
    return accuracy
Beispiel #11
0
def crossValidate(S):
    """Cross-validate DT, BT and RF over the training-set fraction.

    For every fraction in ``[0.05, 0.075, 0.1, 0.15, 0.2]`` and every fold
    of ``S``, the remaining folds are concatenated, sub-sampled to that
    fraction (``random_state=32``), used to train a model, and the model is
    scored on the fold that was held out.

    Args:
        S: Sequence of DataFrames, one per fold, each with a ``"decision"``
            column. At least two folds are required.

    Returns:
        dict: ``{0: {...}, 1: {...}, 2: {...}}`` keyed by model family
        (0 = decision tree, 1 = bagged trees, 2 = random forest); each inner
        dict maps a fraction to the list of per-fold scores rounded to
        2 decimals.
    """
    t_frac = [0.05, 0.075, 0.1, 0.15, 0.2]
    m = 30  # ensemble size used for both BT and RF
    accuracies = {s: {frac: [] for frac in t_frac} for s in range(3)}

    folds = list(S)
    # Derived from the real fold count so the progress bar can complete.
    totTime = 3 * len(t_frac) * len(folds)
    it = 0
    for s in range(3):
        for frac in t_frac:
            # Every fold, including fold 0, takes a turn as the test set.
            for idx in range(len(folds)):
                test_set = folds[idx]
                remaining = pd.concat(folds[:idx] + folds[idx + 1:], axis=0)
                train_set = remaining.sample(frac=frac, random_state=32)
                # Models are scored on the held-out fold, never on the data
                # they were fitted to.
                if s == 0:
                    attributes = set(list(train_set.drop("decision", axis=1)))
                    model = trees.buildTree(train_set, attributes, 1, 8)
                    score = trees.testTree(test_set, model, "DT CV_Frac")
                elif s == 1:
                    models = trees.bootstrap(train_set, m, 8)
                    score = trees.testBagging(test_set, models, "BT CV_Frac")
                else:
                    models = trees.bootstrapRandom(train_set, m, 8)
                    score = trees.testBagging(test_set, models, "RF CV_Frac")
                accuracies[s][frac].append(round(score, 2))
                it += 1
                sys.stdout.flush()
                progressBar("####### CV_Frac Total ########", it, totTime)
    return accuracies
def crossValidate(S):
    """Cross-validate DT, BT and RF over the tree depth limit.

    For every depth in ``[3, 5, 7, 9]`` and every fold of ``S``, a model is
    trained on the concatenation of all other folds and scored on the fold
    that was held out.

    Args:
        S: Sequence of DataFrames, one per fold, each with a ``"decision"``
            column. At least two folds are required.

    Returns:
        dict: ``{0: {...}, 1: {...}, 2: {...}}`` keyed by model family
        (0 = decision tree, 1 = bagged trees, 2 = random forest); each inner
        dict maps a depth to the list of per-fold scores rounded to
        2 decimals.
    """
    d = [3, 5, 7, 9]
    m = 30  # ensemble size used for both BT and RF
    accuracies = {s: {depth: [] for depth in d} for s in range(3)}

    folds = list(S)
    # Derived from the real fold count so the progress bar can complete.
    totTime = 3 * len(d) * len(folds)
    it = 0
    for s in range(3):
        for depth in d:
            # Every fold, including fold 0, takes a turn as the test set.
            for idx in range(len(folds)):
                test_set = folds[idx]
                train_set = pd.concat(folds[:idx] + folds[idx + 1:], axis=0)
                # Models are scored on the held-out fold, never on the data
                # they were fitted to.
                if s == 0:
                    attributes = set(list(train_set.drop("decision", axis=1)))
                    model = trees.buildTree(train_set, attributes, 1, depth)
                    score = trees.testTree(test_set, model, "DT CV_Depth")
                elif s == 1:
                    models = trees.bootstrap(train_set, m, depth)
                    score = trees.testBagging(test_set, models, "BT CV_Depth")
                else:
                    models = trees.bootstrapRandom(train_set, m, depth)
                    score = trees.testBagging(test_set, models, "RF CV_Depth")
                accuracies[s][depth].append(round(score, 2))
                it += 1
                sys.stdout.flush()
                progressBar("####### CV_Depth Total ########", it, totTime)
    return accuracies
def preprocess(data):
    """Clean the dating data and one-hot encode its categorical columns.

    Steps, in order:
      1. Drop the rows labelled ``6501 .. len(data) - 1`` (in place).
      2. Row by row: cap ``gaming``/``reading`` at 10, strip surrounding
         single quotes from ``race``/``race_o``/``field``, and lowercase
         ``field`` values that start with an upper-case letter.
      3. Divide every preference-score column by its column total (columns
         whose total is 0 are left untouched).
      4. Print the one-hot vectors of four reference categories, obtained
         from ``getEncoding(n_categories, index)`` (defined elsewhere).
      5. Replace ``gender``, ``race``, ``race_o`` and ``field`` by dummy
         columns, dropping the alphabetically last category of each as the
         reference level.

    Args:
        data: DataFrame with the columns named above. Steps 1-3 modify it
            in place.

    Returns:
        A new DataFrame with the categorical columns expanded into dummies.

    Raises:
        ValueError: If one of the reference categories (``'female'``,
            ``'Black/African American'``, ``'Other'``, ``'economics'``) is
            absent from its column.
    """
    # drop() returns a new frame unless inplace=True is given; without it
    # the surplus rows would silently stay in the data.
    data.drop(np.arange(6501, len(data)), inplace=True)

    headers1 = ['race', 'race_o', 'field']
    headers2 = ['field']
    headers3 = ['gender', 'race', 'race_o', 'field']
    headers4 = ['gaming', 'reading']
    preference_scores_of_participant = [
        'attractive_important', 'sincere_important', 'intelligence_important',
        'funny_important', 'ambition_important', 'shared_interests_important'
    ]
    preference_scores_of_partner = [
        'pref_o_attractive', 'pref_o_sincere', 'pref_o_intelligence',
        'pref_o_funny', 'pref_o_ambitious', 'pref_o_shared_interests'
    ]
    preference_scores = preference_scores_of_participant + preference_scores_of_partner

    # `.at` is the label-based scalar accessor; the old `.ix` indexer no
    # longer exists in current pandas.
    for i, row in data.iterrows():
        for header in headers4:
            if int(row[header]) > 10:
                data.at[i, header] = 10
        for header in headers1:
            current = row[header]
            if current.startswith("'") and current.endswith("'") and len(current) > 1:
                data.at[i, header] = current.strip("'")
        for header in headers2:
            # Read back from the frame so the quote-stripped value is used.
            current = data.at[i, header]
            if current[0].isupper():
                data.at[i, header] = current.lower()
        if i % 100 == 0:
            progressBar("Preprocessing", i, 6500)
    sys.stdout.flush()

    # Normalise by the total of the whole column. Dividing inside the row
    # loop by a running total would make the result depend on row order
    # (the first row would always become 1.0).
    for header in preference_scores:
        column = data[header].astype(float)
        column_total = column.sum()
        if column_total != 0:
            data[header] = column / column_total

    gender_l = sorted(list(data['gender'].unique()))
    (gender_len, gender_index) = (len(gender_l), gender_l.index('female'))
    gender_vector = getEncoding(gender_len, gender_index)

    race_l = sorted(list(data['race'].unique()))
    (race_len, race_index) = (len(race_l), race_l.index('Black/African American'))
    race_vector = getEncoding(race_len, race_index)

    raceo_l = sorted(list(data['race_o'].unique()))
    (raceo_len, raceo_index) = (len(raceo_l), raceo_l.index('Other'))
    raceo_vector = getEncoding(raceo_len, raceo_index)

    field_l = sorted(list(data['field'].unique()))
    (field_len, field_index) = (len(field_l), field_l.index('economics'))
    field_vector = getEncoding(field_len, field_index)

    for header in headers3:
        # The alphabetically last category becomes the implicit reference
        # level, so its dummy column is removed.
        dropped = header+"_"+sorted(data[header].unique())[-1]
        data = pd.concat([data, pd.get_dummies(
            data[header], prefix=header)], axis=1)
        data.drop(header, axis=1, inplace=True)
        data.drop(dropped, axis=1, inplace=True)

    print("\n")
    print("Mapped vector for female in column gender: "+str(gender_vector))
    print("Mapped vector for Black/African American in column race: "+str(race_vector))
    print("Mapped vector for Other in column race_o: "+str(raceo_vector))
    print("Mapped vector for economics in column field: "+str(field_vector))
    return data