Example #1
0
def apply_kmeans(do_pca, x_train, y_train, x_test, y_test, kmeans_max_iter,
                 kmeans_max_k):
    """Fit one k-means model per k and plot SSE and purity curves.

    A fresh ``KMeans(k, kmeans_max_iter)`` is fitted on ``x_train`` for every
    k in ``range(1, kmeans_max_k)``. The value returned by ``fit`` is treated
    as a sequence of SSE values (it is plotted against 'iter' and its minimum
    is plotted against 'k'). ``x_test`` and ``y_test`` are accepted but not
    used. ``do_pca`` only selects the numeric suffix of the plot file names.
    """
    print('kmeans\n')
    sse_curves = []
    lowest_sse_per_k = []
    purity_per_k = []

    for n_clusters in range(1, kmeans_max_k):
        model = KMeans(n_clusters, kmeans_max_iter)
        curve = model.fit(x_train)
        sse_curves.append(curve)
        purity_per_k.append(model.get_purity(x_train, y_train))
        lowest_sse_per_k.append(min(curve))

    plot_y_vs_x_list(
        sse_curves,
        x_label='iter',
        y_label='sse',
        save_path='plot_sse_vs_k_subplots_%d' % do_pca,
    )
    plot_y_vs_x(
        lowest_sse_per_k,
        x_label='k',
        y_label='sse',
        save_path='plot_sse_vs_k_%d' % do_pca,
    )
    plot_y_vs_x(
        purity_per_k,
        x_label='k',
        y_label='purities',
        save_path='plot_purity_vs_k_%d' % do_pca,
    )
Example #2
0
def apply_kmeans3(do_pca, x_train, y_train, kmeans_max_iter, kmeans_max_k):
    """Plot the mean training purity over five k-means runs for k = 1..10.

    The k range (1..10) and the number of runs (5) are fixed here;
    ``kmeans_max_k`` is accepted but not used. The five purities of each k,
    the list of per-k means, and the best mean are printed along the way.
    """
    print('kmeans\n')
    sse_curves = []
    lowest_sses = []
    mean_purities = []

    for n_clusters in range(1, 11):
        print('k:', n_clusters)
        purities = []
        for _ in range(5):
            model = KMeans(n_clusters, kmeans_max_iter)
            curve = model.fit(x_train)
            sse_curves.append(curve)
            purities.append(model.get_purity(x_train, y_train))
            lowest_sses.append(min(curve))
        print(purities)
        mean_purities.append(sum(purities) / len(purities))

    print(mean_purities)
    print('max purity', max(mean_purities))
    plot_y_vs_x(
        mean_purities,
        x_label='k',
        y_label='purities',
        save_path='plot_purity_vs_k_%d' % do_pca,
    )
Example #3
0
def apply_kmeans1(do_pca, x_train, y_train, kmeans_max_iter, kmeans_max_k):
    """Average the SSE curve of five k-means runs with k = 6 and plot it.

    The curve length is taken from the first run; every run is indexed up to
    that length. ``y_train`` is only used for the (unplotted) purity values
    and ``kmeans_max_k`` is not used.
    """
    print('kmeans\n')
    sse_curves = []
    lowest_sses = []
    purities = []

    for _ in range(5):
        model = KMeans(6, kmeans_max_iter)
        curve = model.fit(x_train)
        sse_curves.append(curve)
        purities.append(model.get_purity(x_train, y_train))
        lowest_sses.append(min(curve))

    # Element-wise mean across the five runs, one value per iteration index.
    mean_curve = []
    for step in range(len(sse_curves[0])):
        total = 0
        for curve in sse_curves:
            total += curve[step]
        mean_curve.append(total / 5)

    # plot_y_vs_x_list is handed a list of curves, so wrap the single curve.
    result = [mean_curve]
    print(result)

    plot_y_vs_x_list(
        result,
        x_label='iter',
        y_label='sse',
        save_path='plot_sse_vs_k_subplots_%d' % do_pca,
    )
Example #4
0
def apply_kmeans(do_pca, x_train, y_train, kmeans_max_iter, kmeans_max_k):
    """Fit k-means for each k in ``range(1, kmeans_max_k)`` with progress output.

    Before each fit a progress line with the elapsed wall-clock time is
    printed. The value returned by ``fit`` is treated as a sequence of SSE
    values; all of them, their per-k minimum, and the per-k training purity
    are plotted to files whose names end in ``do_pca`` formatted with ``%d``.
    """
    print('kmeans\n')
    sse_curves = []
    lowest_sse_per_k = []
    purity_per_k = []

    started_at = time.time()
    for n_clusters in range(1, kmeans_max_k):
        elapsed = time.time() - started_at
        print("On step k =", n_clusters, "of", kmeans_max_k,
              "\telapsed time: %.2f" % elapsed, "s")
        model = KMeans(n_clusters, kmeans_max_iter)
        curve = model.fit(x_train)
        sse_curves.append(curve)
        purity_per_k.append(model.get_purity(x_train, y_train))
        lowest_sse_per_k.append(min(curve))

    plot_y_vs_x_list(
        sse_curves,
        x_label='iter',
        y_label='sse',
        save_path='plot_sse_vs_k_subplots_%d' % do_pca,
    )
    plot_y_vs_x(
        lowest_sse_per_k,
        x_label='k',
        y_label='sse',
        save_path='plot_sse_vs_k_%d' % do_pca,
    )
    plot_y_vs_x(
        purity_per_k,
        x_label='k',
        y_label='purities',
        save_path='plot_purity_vs_k_%d' % do_pca,
    )
def apply_kmeans(do_pca, x_train, y_train, kmeans_max_iter, kmeans_max_k):
    """Run k-means (k = 6) five times and plot individual and mean SSE curves.

    The value returned by ``KMeans.fit`` is treated as a sequence of SSE
    values, one per iteration. All five curves are plotted as subplots, and
    their element-wise mean is plotted as a single curve.

    ``y_train`` and ``kmeans_max_k`` are accepted for signature compatibility
    but are not used.
    """
    print('kmeans\n')
    n_runs = 5
    n_clusters = 6
    train_sses_vs_iter = []
    sse_totals = None

    for _ in range(n_runs):
        kmeans = KMeans(n_clusters, kmeans_max_iter)
        sse_vs_iter = kmeans.fit(x_train)
        train_sses_vs_iter.append(sse_vs_iter)
        if sse_totals is None:
            sse_totals = [0] * len(sse_vs_iter)
        # zip stops at the shorter sequence, so a run that reports a different
        # number of iterations truncates the total instead of raising.
        sse_totals = [
            total + sse for total, sse in zip(sse_totals, sse_vs_iter)
        ]

    # The running totals must be divided by the number of runs; previously
    # the raw sum over the five runs was plotted as if it were the average.
    avg_list = [total / n_runs for total in sse_totals]

    plot_y_vs_x_list(train_sses_vs_iter,
                     x_label='iter',
                     y_label='sse',
                     save_path='plot_sse_vs_k_subplots_%d' % do_pca)
    plot_y_vs_x(avg_list,
                x_label='iterations',
                y_label='sse',
                save_path='plot_sse_vs_iter_%d' % do_pca)
Example #6
0
def main(dataset_fn, output_fn, clusters_no):
    """Cluster the locations in a CSV file and save the result.

    Each row's 'LAT' and 'LON' columns are converted to floats and wrapped in
    a ``Point``. The points are handed to ``KMeans`` together with
    ``clusters_no``; when ``fit`` returns -1 a message is printed and nothing
    is saved, otherwise ``model.save(output_fn)`` is called.
    """
    frame = pd.read_csv(dataset_fn)
    locations = [
        Point(float(record['LAT']), float(record['LON']))
        for _, record in frame.iterrows()
    ]

    model = KMeans(locations, clusters_no)
    status = model.fit(True)
    if status == -1:
        print("No of points are less than cluster number!")
        return

    model.save(output_fn)
Example #7
0
def apply_kmeans(do_pca, x_train, y_train, kmeans_max_iter, kmeans_max_k):
    """Plot SSE and purity against k, averaging five k-means runs per k.

    For every k in ``range(1, kmeans_max_k)`` five fresh models are fitted to
    reduce the effect of the random start. The value returned by ``fit`` is
    treated as a sequence of per-iteration SSE values; these sequences are
    averaged element-wise, and the training purity is averaged as well.
    """
    print('kmeans\n')
    n_runs = 5
    train_sses_vs_iter = []
    train_sses_vs_k = []
    train_purities_vs_k = []

    for k in range(1, kmeans_max_k):
        sse_totals = None
        purity_total = 0.

        for _ in range(n_runs):
            kmeans = KMeans(k, kmeans_max_iter)
            sse = kmeans.fit(x_train)
            # Identity test: `== None` on an array-like result compares
            # element-wise and cannot be used as a condition.
            if sse_totals is None:
                # Copy so the accumulator never aliases fit()'s return value.
                sse_totals = list(sse)
            else:
                for j, value in enumerate(sse):
                    sse_totals[j] += value

            purity_total += kmeans.get_purity(x_train, y_train)

        avg_sses = [total / n_runs for total in sse_totals]

        train_sses_vs_iter.append(avg_sses)
        train_purities_vs_k.append(purity_total / n_runs)
        train_sses_vs_k.append(min(avg_sses))

    plot_y_vs_x_list(train_sses_vs_iter,
                     x_label='iter',
                     y_label='sse',
                     save_path='plot_sse_vs_k_subplots_%d' % do_pca)
    plot_y_vs_x(train_sses_vs_k,
                x_label='k',
                y_label='sse',
                save_path='plot_sse_vs_k_%d' % do_pca)
    plot_y_vs_x(train_purities_vs_k,
                x_label='k',
                y_label='purities',
                save_path='plot_purity_vs_k_%d' % do_pca)
Example #8
0
def user__upvotes_cast__to__average_post_length__to__profile_views(
        k, users, posts):
    """
    Cluster users on three features:
        - upvotes the user has cast
        - the user's profile views
        - the mean length of the 'Body' of the user's posts

    :param k: Number of clusters
    :param users: RDD with XML file with users
    :param posts: RDD with XML file with posts
    :return: result of ``KMeans(k).fit`` on the joined feature RDD
    """
    # (user_id, body_length), one record per post
    post_lengths = (
        posts
        .map(lambda row: xml_parser.extract_attributes(
            row, ['OwnerUserId', 'Body']))
        .filter(lambda attrs: helpers.is_valid_tuple(attrs, 2))
        .map(lambda attrs: (int(attrs[0]), len(attrs[1])))
    )

    # (user_id, mean_body_length): fold into (total_length, post_count),
    # then divide.
    mean_post_length = (
        post_lengths
        .aggregateByKey(
            (0, 0),
            lambda acc, length: (acc[0] + length, acc[1] + 1),
            lambda left, right: (left[0] + right[0], left[1] + right[1]))
        .mapValues(lambda acc: acc[0] / acc[1])
    )

    # (user_id, (upvotes_cast, profile_views))
    votes_and_views = (
        users
        .map(lambda row: xml_parser.extract_attributes(
            row, ['Id', 'UpVotes', 'Views'], int))
        .filter(lambda attrs: helpers.is_valid_tuple(attrs, 3))
        .map(lambda attrs: (attrs[0], (attrs[1], attrs[2])))
    )

    # (upvotes_cast, profile_views, mean_body_length)
    features = votes_and_views.join(mean_post_length).map(
        lambda pair: (pair[1][0][0], pair[1][0][1], pair[1][1]))

    return KMeans(k).fit(features)
 def test_calculate_centroid(self):
     """The centroid of (1, 2, 3) and (3, 4, 5) is expected to be (2, 3, 4)."""
     sample = [(1, 2, 3), (3, 4, 5)]
     self.assertEqual(KMeans.calculate_centroid(sample), (2, 3, 4))
Example #10
0
def user__reputation__to__own_questions_answered(k, user_lines, post_lines):
    """
    Cluster users on reputation and the number of their own questions they
    answered themselves.

    :param k: Number of clusters
    :param user_lines: RDD with the lines of the users XML file
    :param post_lines: RDD with the lines of the posts XML file
    :return: result of ``KMeans(k).fit`` on (reputation, count) pairs
    """
    # (user_id, reputation)
    reputations = (
        user_lines
        .map(lambda row: xml_parser.extract_attributes(
            row, ['Id', 'Reputation'], int))
        .filter(lambda attrs: helpers.is_valid_tuple(attrs, 2))
    )

    # (question_id, asker_id) for posts with PostTypeId == 1
    asked = (
        post_lines
        .map(lambda row: xml_parser.extract_attributes(
            row, ['Id', 'OwnerUserId', 'PostTypeId'], int))
        .filter(lambda attrs: helpers.is_valid_tuple(attrs, 3)
                and attrs[2] == 1)
        .map(lambda attrs: (attrs[0], attrs[1]))
    )

    # (question_id, answerer_id) for posts with PostTypeId == 2
    answered = (
        post_lines
        .map(lambda row: xml_parser.extract_attributes(
            row, ['ParentId', 'OwnerUserId', 'PostTypeId'], int))
        .filter(lambda attrs: helpers.is_valid_tuple(attrs, 3)
                and attrs[2] == 2)
        .map(lambda attrs: (attrs[0], attrs[1]))
    )

    # A pair present in both RDDs means asker and answerer are the same user.
    # (user_id, n_questions_self_answered)
    self_answered = (
        asked.intersection(answered)
        .map(lambda pair: (pair[1], 1))
        .reduceByKey(add)
    )

    # (reputation, n_questions_self_answered)
    features = reputations.join(self_answered).map(
        lambda joined: (joined[1][0], joined[1][1]))

    return KMeans(k).fit(features)
Example #11
0
def user_rep_to_answers_and_questions(k, user_lines, posts_lines):
    """
    Cluster users on reputation, answer count and question count.

    Only users that appear in all three intermediate RDDs (reputation,
    answers, questions) survive the inner joins.

    :param k: Number of clusters
    :param user_lines: RDD with the lines of the users XML file
    :param posts_lines: RDD with the lines of the posts XML file
    :return: result of ``KMeans(k).fit`` on the feature RDD
    """
    # (user_id, reputation)
    reputation = (
        user_lines
        .map(lambda row: xml_parser.extract_attributes(
            row, ['Id', 'Reputation'], int))
        .filter(lambda attrs: helpers.is_valid_tuple(attrs, 2))
    )

    # (user_id, post_type)
    posts = (
        posts_lines
        .map(lambda row: xml_parser.extract_attributes(
            row, ['OwnerUserId', 'PostTypeId'], int))
        .filter(lambda attrs: helpers.is_valid_tuple(attrs, 2))
    )

    def posts_per_user(post_type):
        # (user_id, number of posts with the given PostTypeId)
        return (
            posts
            .filter(lambda pair: pair[1] == post_type)
            .map(lambda pair: (pair[0], 1))
            .reduceByKey(add)
        )

    questions = posts_per_user(1)
    answers = posts_per_user(2)

    # (user_id, (user_rep, (answer_count, question_count)))
    joined = reputation.join(answers.join(questions))

    # (user_rep, answer_count, question_count)
    features = joined.map(
        lambda rec: (rec[1][0], rec[1][1][0], rec[1][1][1]))

    return KMeans(k).fit(features)
 def test_calculate_average_distance(self):
     """The average distance from (1, 1) to (4, 1) and (5, 4) is expected to be 4."""
     origin = (1, 1)
     average = KMeans.calculate_average_distance(origin, [(4, 1), (5, 4)])
     self.assertEqual(average, 4)
Example #13
0
def user_questions_answered(k, posts_lines):
    """
    Cluster users on a single feature: how many posts with PostTypeId == 2
    they own.

    :param k: Number of clusters
    :param posts_lines: RDD with the lines of the posts XML file
    :return: result of ``KMeans(k).fit`` on 1-tuples ``(count,)``
    """
    # (user_id, post_type), restricted to post type 2
    typed_posts = (
        posts_lines
        .map(lambda row: xml_parser.extract_attributes(
            row, ['OwnerUserId', 'PostTypeId'], int))
        .filter(lambda attrs: helpers.is_valid_tuple(attrs, 2)
                and attrs[1] == 2)
    )

    # Count per user, then drop the user id and keep the count as a 1-tuple.
    counts = (
        typed_posts
        .map(lambda attrs: (attrs[0], 1))
        .reduceByKey(add)
        .map(lambda pair: (pair[1],))
    )

    return KMeans(k).fit(counts)
Example #14
0
def main():
    """Demo: cluster three random 2-D blobs with k-means and plot the result.

    Ninety integer points are drawn, thirty each from the squares
    [0, 20]^2, [40, 60]^2 and [80, 100]^2. The model is fitted, three probe
    points are classified, and the data is shown as a scatter plot with the
    centroids drawn as black stars.
    """
    k = 3
    X = [[random.randint(low, high), random.randint(low, high)]
         for low, high in ((0, 20), (40, 60), (80, 100))
         for _ in range(30)]

    print(f"Cluster points:{X}")

    kmeans = KMeans(n_cluster=k, tol=3e-4)
    centroids = kmeans.fit(X)
    prediction = kmeans.predict([[0.0, 0.0], [50.0, 40.0], [100.0, 100.0]])

    print(f"KMeans centroids: {centroids}")
    print(f"KMeans predict for [0,0],[50,40],[100,100]]: {prediction}")

    # Previously every point was drawn once per colour, so all points ended
    # up in the last colour. Colour each point by its predicted cluster.
    # NOTE(review): assumes predict() returns one cluster index in range(k)
    # per input point -- confirm against the KMeans implementation.
    labels = kmeans.predict(X)
    colors = ['r', 'g', 'b']
    for cluster, color in enumerate(colors[:k]):
        members = [
            point for point, label in zip(X, labels) if label == cluster
        ]
        plt.scatter([p[0] for p in members], [p[1] for p in members],
                    s=7, c=color)
    plt.scatter([c[0] for c in centroids], [c[1] for c in centroids],
                marker='*', s=200, c='black')
    plt.show()
def apply_kmeans_3(do_pca, x_train, y_train, kmeans_max_iter, kmeans_max_k):
    """Plot the mean training purity over five k-means runs for k = 1..10.

    The k range (1..10) and the number of runs per k (5) are fixed here;
    ``kmeans_max_k`` is accepted for signature compatibility but not used.
    """
    n_runs = 5
    averg_list = []

    for k in range(1, 11):
        # Start a fresh list for every k. Previously one list was shared
        # across all k, so each plotted value was the cumulative mean over
        # every k seen so far rather than the mean for this k.
        purities = []
        for _ in range(n_runs):
            kmeans = KMeans(k, kmeans_max_iter)
            kmeans.fit(x_train)
            purities.append(kmeans.get_purity(x_train, y_train))
        averg_list.append(sum(purities) / len(purities))

    # plot the average purity
    plot_y_vs_x(averg_list,
                x_label='k',
                y_label='purities',
                save_path='plot_purity_vs_k_%d' % do_pca)
def apply_kmeans_2(do_pca, x_train, y_train, kmeans_max_iter, kmeans_max_k):
    """Plot the mean best SSE over five k-means runs for k = 1..10.

    The value returned by ``KMeans.fit`` is treated as a sequence of SSE
    values and its minimum is taken as the SSE of that run. The k range
    (1..10) and the number of runs per k (5) are fixed here; ``y_train`` and
    ``kmeans_max_k`` are accepted for signature compatibility but not used.
    """
    print('kmeans\n')
    n_runs = 5
    avg_me = []

    for k in range(1, 11):
        # Start a fresh list for every k. Previously one list was shared
        # across all k, so each plotted value was the cumulative mean over
        # every k seen so far rather than the mean for this k.
        best_sses = []
        for _ in range(n_runs):
            kmeans = KMeans(k, kmeans_max_iter)
            sse_vs_iter = kmeans.fit(x_train)
            best_sses.append(min(sse_vs_iter))
        avg_me.append(sum(best_sses) / len(best_sses))

    plot_y_vs_x(avg_me,
                x_label='k',
                y_label='sse',
                save_path='plot_sse_vs_k_%d' % do_pca)
Example #17
0
def generate_distribution_plot(clusters, output_path):
    """Plot, per cluster, a normal PDF fitted to the point-to-centroid distances.

    :param clusters: mapping of centroid -> iterable of points in that cluster
    :param output_path: prefix the file name 'distribution.png' is appended to
        (plain string concatenation, so include a trailing separator)
    """
    pyplot.clf()

    # .items() exists on both Python 2 and Python 3 mappings; .iteritems()
    # is Python 2 only and raises AttributeError on a Python 3 dict.
    for centroid, points in clusters.items():
        distances = sorted(
            [KMeans.calculate_distance(centroid, p) for p in points])

        # Density of a normal distribution with the sample mean and standard
        # deviation, evaluated at the sorted distances.
        pdf = stats.norm.pdf(distances, np.mean(distances), np.std(distances))

        pl.plot(distances, pdf)

    pl.savefig(output_path + 'distribution.png')
Example #18
0
def apply_kmeans_avg(x_train, y_train, kmeans_max_iter, k, iterations=5):
    """Average the SSE curve, lowest SSE and purity of repeated k-means runs.

    ``iterations`` fresh ``KMeans(k, kmeans_max_iter)`` models are fitted on
    ``x_train``. The value returned by ``fit`` is converted to a NumPy array
    and summed element-wise across runs; the accumulator is sized from the
    first run.

    :return: tuple ``(mean_sse_curve_as_list, mean_lowest_sse, mean_purity)``
    """
    sse_curve_total = None
    lowest_sse_total = 0
    purity_total = 0
    print("")

    for run in range(1, iterations + 1):
        print("On step ", run, "of", iterations, "for k =", k)
        model = KMeans(k, kmeans_max_iter)
        curve = np.array(model.fit(x_train))

        # Size the accumulator lazily, once the curve length is known.
        if sse_curve_total is None:
            sse_curve_total = np.zeros(len(curve))
        sse_curve_total += curve

        purity_total += model.get_purity(x_train, y_train)
        lowest_sse_total += curve.min()

    mean_curve = (sse_curve_total / iterations).tolist()
    return mean_curve, lowest_sse_total / iterations, purity_total / iterations
    def setUp(self):
        """Seed ``random``, start a local SparkContext and build the fixture.

        The fixture is an RDD of twelve 3-D points (three groups of four)
        plus a ``KMeans`` instance configured for three clusters.
        """
        random.seed(1)
        self.sc = SparkContext(master='local')

        first_group = [(1, 1, 0), (1, 2, 2), (2, 2, 2), (2, 1, 0)]
        second_group = [(8, 0, 0), (8, 1, 2), (9, 1, 0), (9, 2, 2)]
        third_group = [(1, 1, 7), (1, 2, 9), (2, 2, 9), (2, 1, 7)]
        self.points = self.sc.parallelize(
            first_group + second_group + third_group)

        self.k = 3
        self.kmeans = KMeans(self.k)
Example #20
0
def dbscan_test():
    """Estimate the cluster count with DBSCAN, then fit and draw k-means.

    750 blob samples around three centres are generated and standardised.
    DBSCAN's labels give the number of clusters (the noise label -1 is not
    counted), and that number is passed to this project's ``KMeans`` before
    calling its ``draw`` method.
    """
    # make_blobs is imported from sklearn.datasets directly: the old
    # sklearn.datasets.samples_generator module path no longer exists in
    # current scikit-learn releases.
    from sklearn.datasets import make_blobs
    from sklearn.preprocessing import StandardScaler
    from sklearn.cluster import DBSCAN

    centers = [[1, 1], [-1, -1], [1, -1]]
    X, _ = make_blobs(n_samples=750,
                      centers=centers,
                      cluster_std=0.4,
                      random_state=0)
    X = StandardScaler().fit_transform(X)

    labels = DBSCAN(eps=0.3, min_samples=10).fit(X).labels_
    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    kmeans = KMeans(n_clusters=n_clusters_).fit(X)
    kmeans.draw(n_clusters_, X, kmeans)
def apply_kmeans(do_pca, x_train, y_train, kmeans_max_iter, kmeans_max_k):
    """Plot SSE and purity against k, each averaged over five k-means runs.

    For every k in ``range(1, kmeans_max_k)`` five fresh models are fitted.
    The value returned by ``fit`` is treated as a sequence of SSE values and
    its minimum is taken as the SSE of that run. The per-k averages are
    printed and plotted.
    """
    n_runs = 5
    train_sses_vs_iter = []
    train_sses_vs_k = []
    train_purities_vs_k = []

    for k in range(1, kmeans_max_k):
        # Reset the accumulators for every k. Previously they lived outside
        # the loop, so the average of the previous k leaked into the next.
        sses_sum = 0
        purities_sum = 0
        for _ in range(n_runs):
            kmeans = KMeans(k, kmeans_max_iter)
            sse_vs_iter = kmeans.fit(x_train)
            sses_sum += min(sse_vs_iter)
            purities_sum += kmeans.get_purity(x_train, y_train)
        print(k)
        train_sses_vs_k.append(sses_sum / n_runs)
        train_purities_vs_k.append(purities_sum / n_runs)
    print(train_sses_vs_k)
    print(train_purities_vs_k)

    # NOTE(review): train_sses_vs_iter is never filled, so this call receives
    # an empty list, exactly as before -- confirm whether the per-iteration
    # curves were meant to be recorded here.
    plot_y_vs_x_list(train_sses_vs_iter,
                     x_label='iter',
                     y_label='sse',
                     save_path='plot_sse_vs_k_subplots_%d' % do_pca)
    plot_y_vs_x(train_sses_vs_k,
                x_label='k',
                y_label='sse',
                save_path='plot_sse_vs_k_%d' % do_pca)
    plot_y_vs_x(train_purities_vs_k,
                x_label='k',
                y_label='purities',
                save_path='plot_purity_vs_k_%d' % do_pca)
Example #22
0
def user__reputation__to__upvotes_cast(k, user_lines):
    """
    Cluster users on two features read from each user line:
        - 'Reputation'
        - 'UpVotes'

    :param k: Number of clusters
    :param user_lines: PythonRDD containing the lines in the users XML file
    :return: result of ``KMeans(k).fit`` on (reputation, upvotes) tuples
    """
    parsed = user_lines.map(
        lambda row: xml_parser.extract_attributes(
            row, ['Reputation', 'UpVotes'], int))
    # Keep only rows for which both attributes were extracted.
    features = parsed.filter(lambda attrs: helpers.is_valid_tuple(attrs, 2))

    return KMeans(k).fit(features)
 def _fit(self, X):
     """Initialise mu/cov/coef from k-means, then iterate until they settle.

     ``self.mu`` starts at the k-means centres, every component gets a copy
     of the covariance of ``X`` and equal mixing coefficients. The loop
     alternates ``self._expectation`` and ``self._maximization`` and stops
     once the flattened parameters satisfy ``np.allclose`` against the
     previous round. There is no iteration cap.
     """
     def flat_params():
         # All parameters as one 1-D vector, used for the convergence test.
         return np.hstack(
             (self.mu.ravel(), self.cov.ravel(), self.coef.ravel()))

     shared_cov = np.cov(X.T)
     kmeans = KMeans(self.n_components)
     kmeans.fit(X)
     self.mu = kmeans.centers
     self.cov = np.array([shared_cov for _ in range(self.n_components)])
     self.coef = np.ones(self.n_components) / self.n_components

     previous = flat_params()
     while True:
         stats = self._expectation(X)
         self._maximization(X, stats)
         current = flat_params()
         if np.allclose(previous, current):
             break
         previous = current
Example #24
0
def length__aboutme__to__user_rep(k, user_lines):
    """
    Cluster users on two features:
        - 'Reputation' converted to int
        - length of the 'AboutMe' attribute

    :param k: Number of clusters
    :param user_lines: RDD with XML file with users
    :return: result of ``KMeans(k).fit`` on (reputation, about_me_length)
    """
    raw_attrs = (
        user_lines
        .map(lambda row: xml_parser.extract_attributes(
            row, ['Reputation', 'AboutMe']))
        .filter(lambda attrs: helpers.is_valid_tuple(attrs, 2))
    )

    # (reputation, about_me_length)
    features = raw_attrs.map(lambda attrs: (int(attrs[0]), len(attrs[1])))

    return KMeans(k).fit(features)
Example #25
0
def user__reputation__to__distinct_post_tags(k, user_lines, post_lines):
    """
    Cluster users on reputation and the number of distinct tags used across
    their posts.

    :param k: Number of clusters
    :param user_lines: RDD with the lines of the users XML file
    :param post_lines: RDD with the lines of the posts XML file
    :return: result of ``KMeans(k).fit`` on (reputation, n_distinct_tags)
    """
    # (user_id, reputation)
    reputations = (
        user_lines
        .map(lambda row: xml_parser.extract_attributes(
            row, ['Id', 'Reputation'], int))
        .filter(lambda attrs: helpers.is_valid_tuple(attrs, 2))
    )

    # (user_id, [tag, ...]): strip every '>', drop the first character and
    # split on '<'.
    tags_per_post = (
        post_lines
        .map(lambda row: xml_parser.extract_attributes(
            row, ['OwnerUserId', 'Tags']))
        .filter(lambda attrs: helpers.is_valid_tuple(attrs, 2))
        .map(lambda attrs: (int(attrs[0]),
                            attrs[1].replace(">", "")[1:].split("<")))
    )

    # (user_id, number_distinct_tags)
    distinct_tag_counts = (
        tags_per_post
        .flatMapValues(lambda tags: tags)
        .distinct()
        .map(lambda pair: (pair[0], 1))
        .reduceByKey(add)
    )

    # (reputation, number_distinct_tags)
    features = reputations.join(distinct_tag_counts).map(
        lambda joined: (joined[1][0], joined[1][1]))

    return KMeans(k).fit(features)
Example #26
0
def user__badges__to__signup__to__answers_and_questions(
        k, user_lines, badges_lines, posts_lines):
    """
    Cluster users on question count, answer count, badge count and signup
    timestamp.

    Only users present in all four intermediate RDDs survive the inner joins.

    :param k: Number of clusters
    :param user_lines: RDD with the lines of the users XML file
    :param badges_lines: RDD with the lines of the badges XML file
    :param posts_lines: RDD with the lines of the posts XML file
    :return: result of ``KMeans(k).fit`` on the feature RDD
    """
    # (user_id, signup_timestamp)
    signups = (
        user_lines
        .map(lambda row: xml_parser.extract_attributes(
            row, ['Id', 'CreationDate']))
        .filter(lambda attrs: helpers.is_valid_tuple(attrs, 2))
        .map(lambda attrs: (int(attrs[0]),
                            helpers.datetime_to_timestamp(attrs[1])))
    )

    # (user_id, number_badges)
    badge_counts = (
        badges_lines
        .map(lambda row: xml_parser.extract_attributes(row, ['UserId'], int))
        .filter(lambda attrs: helpers.is_valid_tuple(attrs, 1))
        .map(lambda attrs: (attrs[0], 1))
        .reduceByKey(add)
    )

    # (user_id, post_type)
    posts = (
        posts_lines
        .map(lambda row: xml_parser.extract_attributes(
            row, ['OwnerUserId', 'PostTypeId'], int))
        .filter(lambda attrs: helpers.is_valid_tuple(attrs, 2))
    )

    def posts_per_user(post_type):
        # (user_id, number of posts with the given PostTypeId)
        return (
            posts
            .filter(lambda pair: pair[1] == post_type)
            .map(lambda pair: (pair[0], 1))
            .reduceByKey(add)
        )

    answer_counts = posts_per_user(2)
    question_counts = posts_per_user(1)

    def to_features(record):
        # record: (user_id, (((signup, n_badges), n_answers), n_questions))
        ((signup, n_badges), n_answers), n_questions = record[1]
        return (n_questions, n_answers, n_badges, signup)

    # (n_questions, n_answers, n_badges, signup)
    features = (
        signups
        .join(badge_counts)
        .join(answer_counts)
        .join(question_counts)
        .map(to_features)
    )

    return KMeans(k).fit(features)
Example #27
0
def main(dataset_fn, output_fn, clusters_no, w):
    """Cluster the rows of a CSV file with weights and save/show the result.

    Each row becomes a ``Node`` built from the float values of its 'X', 'Y'
    and 'PreChange' columns together with its 'ID'. ``w`` is converted to a
    NumPy array and passed to ``KMeans`` as the third argument. When ``fit``
    returns -1 a message is printed; otherwise the result is saved to
    ``output_fn`` and ``model.showresult(True)`` is called.
    """
    frame = pd.read_csv(dataset_fn)
    nodes = [
        Node([float(record['X']),
              float(record['Y']),
              float(record['PreChange'])], record['ID'])
        for _, record in frame.iterrows()
    ]

    weights = np.array(w)
    model = KMeans(nodes, clusters_no, weights)
    if model.fit(True) == -1:
        print("No of points are less than cluster number!")
        return

    model.save(output_fn)
    model.showresult(True)
Example #28
0
def user__membership_time__to__closed_questions(k, users, posts, post_history):
    """
    Cluster users on two features:
        - the timestamp of the user's 'CreationDate'
        - the number of post-history rows with PostHistoryTypeId 10 or 12
          recorded against posts the user owns

    :param k: Number of clusters
    :param users: RDD with XML file with users
    :param posts: RDD with XML file with posts
    :param post_history: RDD with XML file with post history
    :return: result of ``KMeans(k).fit`` on (timestamp, count) pairs
    """
    # (user_id, timestamp)
    signup_times = (
        users
        .map(lambda row: xml_parser.extract_attributes(
            row, ['Id', 'CreationDate']))
        .filter(lambda attrs: helpers.is_valid_tuple(attrs, 2))
        .map(lambda attrs: (int(attrs[0]),
                            helpers.datetime_to_timestamp(attrs[1])))
    )

    # (post_id, author_user_id)
    post_authors = (
        posts
        .map(lambda row: xml_parser.extract_attributes(
            row, ['Id', 'OwnerUserId'], int))
        .filter(lambda attrs: helpers.is_valid_tuple(attrs, 2))
    )

    # (post_id, number_of_matching_history_rows)
    votes_per_post = (
        post_history
        .map(lambda row: xml_parser.extract_attributes(
            row, ['PostId', 'PostHistoryTypeId'], int))
        .filter(lambda attrs: helpers.is_valid_tuple(attrs, 2)
                and attrs[1] in [10, 12])
        .map(lambda attrs: (attrs[0], 1))
        .reduceByKey(add)
    )

    # (user_id, number_of_matching_history_rows): drop the post id after the
    # join and sum per author.
    votes_per_user = (
        post_authors.join(votes_per_post)
        .map(lambda joined: joined[1])
        .reduceByKey(add)
    )

    # (timestamp, number_of_matching_history_rows)
    features = signup_times.join(votes_per_user).map(
        lambda joined: joined[1])

    return KMeans(k).fit(features)
Example #29
0
def user__signup__to__distinct_post_tags(k, user_lines, post_lines):
    """
    Cluster users on signup timestamp and the number of distinct tags used
    across their posts.

    :param k: Number of clusters
    :param user_lines: RDD with the lines of the users XML file
    :param post_lines: RDD with the lines of the posts XML file
    :return: result of ``KMeans(k).fit`` on (signup_timestamp, n_distinct_tags)
    """
    # (user_id, signup_as_timestamp)
    signups = (
        user_lines
        .map(lambda row: xml_parser.extract_attributes(
            row, ['Id', 'CreationDate']))
        .filter(lambda attrs: helpers.is_valid_tuple(attrs, 2))
        .map(lambda attrs: (int(attrs[0]),
                            helpers.datetime_to_timestamp(attrs[1])))
    )

    # (user_id, [tag, ...]): strip every '>', drop the first character and
    # split on '<'.
    tags_per_post = (
        post_lines
        .map(lambda row: xml_parser.extract_attributes(
            row, ['OwnerUserId', 'Tags']))
        .filter(lambda attrs: helpers.is_valid_tuple(attrs, 2))
        .map(lambda attrs: (int(attrs[0]),
                            attrs[1].replace(">", "")[1:].split("<")))
    )

    # (user_id, number_distinct_tags)
    distinct_tag_counts = (
        tags_per_post
        .flatMapValues(lambda tags: tags)
        .distinct()
        .map(lambda pair: (pair[0], 1))
        .reduceByKey(add)
    )

    # (signup_as_timestamp, number_distinct_tags)
    features = signups.join(distinct_tag_counts).map(
        lambda joined: (joined[1][0], joined[1][1]))

    return KMeans(k).fit(features)
Example #30
0
def post__edits__average__to__user_rep(k, user_lines, post_history_lines):
    """
    Cluster users on two features:
        - the user's reputation
        - the number of post-history rows with PostHistoryTypeId 5 that
          carry the user's id in 'UserId'

    :param k: Number of clusters
    :param user_lines: RDD with XML file with users
    :param post_history_lines: RDD with XML file with post history
    :return: result of ``KMeans(k).fit`` on (reputation, edit_count) pairs
    """
    # (user_id, reputation)
    reputations = (
        user_lines
        .map(lambda row: xml_parser.extract_attributes(
            row, ['Id', 'Reputation'], int))
        .filter(lambda attrs: helpers.is_valid_tuple(attrs, 2))
    )

    # (user_id, edit_count) -- a count per user, despite the function name.
    edit_counts = (
        post_history_lines
        .map(lambda row: xml_parser.extract_attributes(
            row, ['UserId', 'PostHistoryTypeId'], int))
        .filter(lambda attrs: helpers.is_valid_tuple(attrs, 2)
                and attrs[1] == 5)
        .map(lambda attrs: (attrs[0], 1))
        .reduceByKey(add)
    )

    # (reputation, edit_count)
    features = reputations.join(edit_counts).map(lambda joined: joined[1])

    return KMeans(k).fit(features)
Example #31
0
# Check which class was predominant; samples belonging to other classes are
# in the wrong group. Run the experiments with the Euclidean distance.
# Generate plots of the groups formed by k-means and by hierarchical
# clustering. Comment on the results. Remember not to use the class attribute
# when clustering the data.
import matplotlib.pyplot as plt

from clustering import KMeans
from clustering import Hierarchical
import utils


# Cluster the spiral data set with k-means and with agglomerative clustering
# for k = 2 and k = 3, print the accuracy against the true labels and save
# one scatter plot per run.
# print is called with a single parenthesised argument so the script behaves
# the same on Python 2 and also runs on Python 3.
print('Base Spiral\n')
x, y = utils.abrir_dados_spiral('./bases/spiral.csv')

print('Kmeans')
kmeans = KMeans(x)
for k in [2, 3]:
    kmeans.train(k)
    print('k = {:d}. Acuracia = {:1.3f} '.format(
        k, 100 * kmeans.calculate_accuracy(y)))
    fig = kmeans.scatter_plot()
    plt.savefig('./bases/l03/spiral_kmeans_k_{:d}'.format(k))
    # Clear the figure so the next plot starts from an empty canvas.
    plt.clf()

print('Hierarchical')
hier = Hierarchical(x)
for k in [2, 3]:
    hier.agnes(k)
    print('k = {:d}. Acuracia = {:1.3f} '.format(
        k, 100 * hier.calculate_accuracy(y)))
    fig = hier.scatter_plot()
    plt.savefig('./bases/l03/spiral_hier_k_{:d}'.format(k))
    plt.clf()