def apply_kmeans(do_pca, x_train, y_train, x_test, y_test, kmeans_max_iter, kmeans_max_k):
    """Sweep k from 1 to kmeans_max_k-1, fit k-means once per k, and plot results.

    For each k the full SSE-vs-iteration history, the best (minimum) SSE,
    and the training purity are recorded and written out as plots.
    x_test / y_test are accepted for interface compatibility but unused here.
    """
    print('kmeans\n')
    sse_histories = []
    best_sse_per_k = []
    purity_per_k = []
    for n_clusters in range(1, kmeans_max_k):
        model = KMeans(n_clusters, kmeans_max_iter)
        sse_history = model.fit(x_train)
        sse_histories.append(sse_history)
        purity_per_k.append(model.get_purity(x_train, y_train))
        best_sse_per_k.append(min(sse_history))
    plot_y_vs_x_list(sse_histories, x_label='iter', y_label='sse',
                     save_path='plot_sse_vs_k_subplots_%d' % do_pca)
    plot_y_vs_x(best_sse_per_k, x_label='k', y_label='sse',
                save_path='plot_sse_vs_k_%d' % do_pca)
    plot_y_vs_x(purity_per_k, x_label='k', y_label='purities',
                save_path='plot_purity_vs_k_%d' % do_pca)
def apply_kmeans3(do_pca, x_train, y_train, kmeans_max_iter, kmeans_max_k):
    """For k = 1..10, run k-means five times each and plot the mean purity per k."""
    print('kmeans\n')
    all_sse_histories = []
    best_sses = []
    purity_means = []
    for n_clusters in range(1, 11):
        print('k:', n_clusters)
        run_purities = []
        # Repeat to smooth out the effect of random initialisation.
        for _ in range(5):
            model = KMeans(n_clusters, kmeans_max_iter)
            sse_history = model.fit(x_train)
            all_sse_histories.append(sse_history)
            run_purities.append(model.get_purity(x_train, y_train))
            best_sses.append(min(sse_history))
        print(run_purities)
        purity_means.append(sum(run_purities) / len(run_purities))
    print(purity_means)
    print('max purity', max(purity_means))
    plot_y_vs_x(purity_means, x_label='k', y_label='purities',
                save_path='plot_purity_vs_k_%d' % do_pca)
def apply_kmeans1(do_pca, x_train, y_train, kmeans_max_iter, kmeans_max_k):
    """Run k-means 5 times with k=6 and plot the per-iteration SSE averaged over runs.

    FIX: the original shadowed the builtin ``sum`` with a local accumulator and
    averaged columns with hand-written index loops; this uses ``zip`` and the
    real ``sum`` builtin (and no longer crashes if runs produce histories of
    slightly different lengths — ``zip`` truncates to the shortest).
    """
    print('kmeans\n')
    train_sses_vs_iter = []
    train_sses_vs_k = []
    train_purities_vs_k = []
    n_runs = 5
    for _ in range(n_runs):
        kmeans = KMeans(6, kmeans_max_iter)
        sse_vs_iter = kmeans.fit(x_train)
        train_sses_vs_iter.append(sse_vs_iter)
        train_purities_vs_k.append(kmeans.get_purity(x_train, y_train))
        train_sses_vs_k.append(min(sse_vs_iter))
    # Column-wise average of SSE across the runs.
    result = [sum(column) / n_runs for column in zip(*train_sses_vs_iter)]
    # plot_y_vs_x_list expects a list of curves.
    result = [result]
    print(result)
    plot_y_vs_x_list(result, x_label='iter', y_label='sse',
                     save_path='plot_sse_vs_k_subplots_%d' % do_pca)
def apply_kmeans(do_pca, x_train, y_train, kmeans_max_iter, kmeans_max_k):
    """Sweep k from 1 to kmeans_max_k-1 with progress/elapsed-time logging and plot results."""
    print('kmeans\n')
    sse_histories = []
    best_sse_per_k = []
    purity_per_k = []
    start = time.time()
    for k in range(1, kmeans_max_k):
        elapsed = time.time() - start
        print("On step k =", k, "of", kmeans_max_k,
              "\telapsed time: %.2f" % elapsed, "s")
        model = KMeans(k, kmeans_max_iter)
        history = model.fit(x_train)
        sse_histories.append(history)
        purity_per_k.append(model.get_purity(x_train, y_train))
        best_sse_per_k.append(min(history))
    plot_y_vs_x_list(sse_histories, x_label='iter', y_label='sse',
                     save_path='plot_sse_vs_k_subplots_%d' % do_pca)
    plot_y_vs_x(best_sse_per_k, x_label='k', y_label='sse',
                save_path='plot_sse_vs_k_%d' % do_pca)
    plot_y_vs_x(purity_per_k, x_label='k', y_label='purities',
                save_path='plot_purity_vs_k_%d' % do_pca)
def apply_kmeans(do_pca, x_train, y_train, kmeans_max_iter, kmeans_max_k):
    """Run k-means 5 times with k=6 and plot the SSE-vs-iteration curve averaged over runs.

    BUG FIX: the original accumulated the per-iteration SSE totals into
    ``avg_list`` but never divided by the number of runs, so the plotted
    "average" curve was actually a 5-run sum. It also indexed run histories
    by the first run's length, which breaks on ragged histories; ``zip``
    truncates to the shortest instead.
    """
    print('kmeans\n')
    train_sses_vs_iter = []
    train_sses_vs_k = []
    train_purities_vs_k = []
    n_runs = 5
    # SSE histories for 5 different runs of k-means.
    for _ in range(n_runs):
        kmeans = KMeans(6, kmeans_max_iter)
        sse_vs_iter = kmeans.fit(x_train)
        train_sses_vs_iter.append(sse_vs_iter)
        train_purities_vs_k.append(kmeans.get_purity(x_train, y_train))
        train_sses_vs_k.append(min(sse_vs_iter))
    # Column-wise mean across runs (previously a sum — see docstring).
    avg_list = [sum(column) / n_runs for column in zip(*train_sses_vs_iter)]
    plot_y_vs_x_list(train_sses_vs_iter, x_label='iter', y_label='sse',
                     save_path='plot_sse_vs_k_subplots_%d' % do_pca)
    plot_y_vs_x(avg_list, x_label='iterations', y_label='sse',
                save_path='plot_sse_vs_iter_%d' % do_pca)
def main(dataset_fn, output_fn, clusters_no):
    """Read lat/lon rows from a CSV, k-means-cluster them, and save the result."""
    df = pd.read_csv(dataset_fn)
    # Each CSV row becomes a Point(latitude, longitude) tuple-like object.
    geo_locs = [Point(float(row['LAT']), float(row['LON']))
                for _, row in df.iterrows()]
    # Run k-means clustering; fit returns -1 when there are fewer points than clusters.
    model = KMeans(geo_locs, clusters_no)
    if model.fit(True) == -1:
        print("No of points are less than cluster number!")
    else:
        # Clustering results: a list of lists, one inner list per cluster.
        model.save(output_fn)
def apply_kmeans(do_pca, x_train, y_train, kmeans_max_iter, kmeans_max_k):
    """Sweep k, averaging SSE curves and purity over 5 random restarts per k, then plot.

    FIX: ``sses == None`` replaced with the idiomatic ``sses is None``
    (comparing a list to None with ``==`` is fragile), and the first run's
    history is copied instead of aliased so the list returned by ``fit`` is
    never mutated in place.
    """
    print('kmeans\n')
    train_sses_vs_iter = []
    train_sses_vs_k = []
    train_purities_vs_k = []
    n_restarts = 5
    for k in range(1, kmeans_max_k):
        sses = None
        avg_purity = 0.
        # Do several restarts to reduce the effect of random initialisation.
        for _ in range(n_restarts):
            kmeans = KMeans(k, kmeans_max_iter)
            sse = kmeans.fit(x_train)
            if sses is None:
                sses = list(sse)  # copy, don't alias the fit result
            else:
                for j in range(len(sse)):
                    sses[j] += sse[j]
            avg_purity += kmeans.get_purity(x_train, y_train)
        avg_purity /= float(n_restarts)
        sses = [total / float(n_restarts) for total in sses]
        train_sses_vs_iter.append(sses)
        train_purities_vs_k.append(avg_purity)
        train_sses_vs_k.append(min(sses))
    plot_y_vs_x_list(train_sses_vs_iter, x_label='iter', y_label='sse',
                     save_path='plot_sse_vs_k_subplots_%d' % do_pca)
    plot_y_vs_x(train_sses_vs_k, x_label='k', y_label='sse',
                save_path='plot_sse_vs_k_%d' % do_pca)
    plot_y_vs_x(train_purities_vs_k, x_label='k', y_label='purities',
                save_path='plot_purity_vs_k_%d' % do_pca)
def user__upvotes_cast__to__average_post_length__to__profile_views( k, users, posts): """ Classifies users based on the following: - amount of upvotes a user has cast - user's average post length (question and answer) - user's profile views :param k: Number of clusters :param users: RDD with XML file with users :param posts: RDD with XML file with posts :return: RDD of clustered data """ # (user_id, average_post_length) user_avg_post_length = posts\ .map(lambda line: xml_parser.extract_attributes(line, ['OwnerUserId', 'Body']))\ .filter(lambda a: helpers.is_valid_tuple(a, 2))\ .map(lambda data: (int(data[0]), len(data[1])))\ .aggregateByKey((0, 0), lambda a, b: (a[0] + b, a[1] + 1), lambda a, b: (a[0] + b[0], a[1] + b[1]))\ .mapValues(lambda value: value[0] / value[1]) # (id, (upvotes_cast, profile_views)) user_data = users\ .map(lambda line: xml_parser.extract_attributes(line, ['Id', 'UpVotes', 'Views'], int))\ .filter(lambda a: helpers.is_valid_tuple(a, 3))\ .map(lambda data: (data[0], (data[1], data[2]))) # (upvotes_cast, views, average_post_length) joined_data = user_data.join(user_avg_post_length)\ .map(lambda data: (data[1][0][0], data[1][0][1], data[1][1])) return KMeans(k).fit(joined_data)
def test_calculate_centroid(self):
    """The centroid of two 3-D points is their component-wise mean."""
    sample_points = [(1, 2, 3), (3, 4, 5)]
    result = KMeans.calculate_centroid(sample_points)
    self.assertEqual(result, (2, 3, 4))
def user__reputation__to__own_questions_answered(k, user_lines, post_lines):
    """
    Classifies users based on the following:
    - user reputation
    - number of their own questions they answered themselves

    :param k: Number of clusters
    :param user_lines: RDD with lines from the users XML file
    :param post_lines: RDD with lines from the posts XML file
    :return: RDD of clustered data
    """
    # (user_id, rep)
    user_id_reputation = user_lines \
        .map(lambda line: xml_parser.extract_attributes(line, ['Id', 'Reputation'], int)) \
        .filter(lambda a: helpers.is_valid_tuple(a, 2))

    # (question_id, asker_user_id) — PostTypeId == 1 marks questions
    user_id_questions = post_lines \
        .map(lambda line: xml_parser.extract_attributes(line, ['Id', 'OwnerUserId', 'PostTypeId'], int)) \
        .filter(lambda a: helpers.is_valid_tuple(a, 3) and a[2] == 1) \
        .map(lambda a: (a[0], a[1]))

    # (parent_question_id, answerer_user_id) — PostTypeId == 2 marks answers
    user_id_answers = post_lines \
        .map(lambda line: xml_parser.extract_attributes(line, ['ParentId', 'OwnerUserId', 'PostTypeId'], int)) \
        .filter(lambda a: helpers.is_valid_tuple(a, 3) and a[2] == 2) \
        .map(lambda a: (a[0], a[1]))

    # Pairs present in both sets are questions whose asker also answered them;
    # re-key by user id and count per user.
    # (user_id, n_questions_self_answered)
    user_id_own_answers = user_id_questions.intersection(user_id_answers) \
        .map(lambda a: (a[1], 1)) \
        .reduceByKey(add)

    # (rep, n_questions_self_answered)
    result = user_id_reputation.join(user_id_own_answers).map(
        lambda a: (a[1][0], a[1][1]))
    return KMeans(k).fit(result)
def user_rep_to_answers_and_questions(k, user_lines, posts_lines):
    """
    Classifies users based on the following:
    - user reputation
    - number of answers posted
    - number of questions posted

    :param k: Number of clusters
    :param user_lines: RDD with lines from the users XML file
    :param posts_lines: RDD with lines from the posts XML file
    :return: RDD of clustered data
    """
    # (user_id, reputation)
    reputation = user_lines \
        .map(lambda line: xml_parser.extract_attributes(line, ['Id', 'Reputation'], int)) \
        .filter(lambda a: helpers.is_valid_tuple(a, 2))
    # (owner_user_id, post_type) — shared base for the two counts below
    posts = posts_lines \
        .map(lambda line: xml_parser.extract_attributes(line, ['OwnerUserId', 'PostTypeId'], int)) \
        .filter(lambda a: helpers.is_valid_tuple(a, 2))
    # (user_id, question_count) — PostTypeId == 1 marks questions
    questions = posts \
        .filter(lambda a: a[1] == 1) \
        .map(lambda a: (a[0], 1)) \
        .reduceByKey(add)
    # (user_id, answer_count) — PostTypeId == 2 marks answers
    answers = posts \
        .filter(lambda a: a[1] == 2) \
        .map(lambda a: (a[0], 1)) \
        .reduceByKey(add)
    # (user_id, (answer_count, question_count))
    ratio = answers.join(questions)
    # (user_id, (user_rep, (answer_count, question_count)))
    result = reputation.join(ratio)
    # Flatten the nested join value into (user_rep, answer_count, question_count).
    flat_result = result.map(lambda a: (a[1][0], a[1][1][0], a[1][1][1]))
    return KMeans(k).fit(flat_result)
def test_calculate_average_distance(self):
    """Distances from (1, 1) to the two points are 3 and 5, so the mean is 4."""
    origin = (1, 1)
    sample = [(4, 1), (5, 4)]
    result = KMeans.calculate_average_distance(origin, sample)
    self.assertEqual(result, 4)
def user_questions_answered(k, posts_lines):
    """Cluster users by how many answers they have posted.

    :param k: Number of clusters
    :param posts_lines: RDD with lines from the posts XML file
    :return: RDD of clustered data
    """
    parsed = posts_lines.map(
        lambda line: xml_parser.extract_attributes(line, ['OwnerUserId', 'PostTypeId'], int))
    # Keep only valid answer rows (PostTypeId == 2).
    answers = parsed.filter(lambda a: helpers.is_valid_tuple(a, 2) and a[1] == 2)
    # Count answers per user, then keep just the count as a 1-tuple feature.
    counts = answers.map(lambda a: (a[0], 1)).reduceByKey(add).map(lambda a: (a[1],))
    return KMeans(k).fit(counts)
def main():
    """Generate three well-separated random blobs, fit KMeans, and plot the clusters.

    BUG FIX: the original scatter loop plotted EVERY point in color ``colors[i]``
    on each pass, so all points ended up in the last color. Points are now
    colored by their predicted cluster label.
    """
    k = 3
    X = [[random.randint(0, 20), random.randint(0, 20)] for i in range(30)] \
        + [[random.randint(40, 60), random.randint(40, 60)] for i in range(30)] \
        + [[random.randint(80, 100), random.randint(80, 100)] for i in range(30)]
    print(f"Cluster points:{X}")
    kmeans = KMeans(n_cluster=k, tol=3e-4)
    centroids = kmeans.fit(X)
    prediction = kmeans.predict([[0.0, 0.0], [50.0, 40.0], [100.0, 100.0]])
    print(f"KMeans centroids: {centroids}")
    print(f"KMeans predict for [0,0],[50,40],[100,100]]: {prediction}")
    colors = ['r', 'g', 'b']
    # Color each training point by the cluster it was assigned to.
    labels = kmeans.predict(X)
    for i in range(k):
        cluster = [p for p, label in zip(X, labels) if label == i]
        plt.scatter([p[0] for p in cluster], [p[1] for p in cluster],
                    s=7, c=colors[i])
    plt.scatter([c[0] for c in centroids], [c[1] for c in centroids],
                marker='*', s=200, c='black')
    plt.show()
def apply_kmeans_3(do_pca, x_train, y_train, kmeans_max_iter, kmeans_max_k):
    """For k = 1..10, run k-means 5 times each and plot the average purity per k.

    BUG FIX: the purity list was never reset between values of k, so each
    plotted point was a running average over ALL runs so far rather than the
    average of that k's five runs. Purities are now collected per k.
    """
    train_sses_vs_iter = []
    train_sses_vs_k = []
    averg_list = []
    n_runs = 5
    for k in range(1, 11):
        run_purities = []
        for _ in range(n_runs):
            kmeans = KMeans(k, kmeans_max_iter)
            sse_vs_iter = kmeans.fit(x_train)
            train_sses_vs_iter.append(sse_vs_iter)
            run_purities.append(kmeans.get_purity(x_train, y_train))
            train_sses_vs_k.append(min(sse_vs_iter))
        # Average purity for THIS k only (see docstring).
        averg_list.append(sum(run_purities) / len(run_purities))
    # plot the average purity
    plot_y_vs_x(averg_list, x_label='k', y_label='purities',
                save_path='plot_purity_vs_k_%d' % do_pca)
def apply_kmeans_2(do_pca, x_train, y_train, kmeans_max_iter, kmeans_max_k):
    """For k = 1..10, run k-means 5 times each and plot the average best SSE per k.

    BUG FIX: the best-SSE list was never reset between values of k, so each
    plotted point was a running average over ALL runs so far rather than the
    average of that k's five runs. Best SSEs are now collected per k.
    """
    print('kmeans\n')
    train_sses_vs_iter = []
    train_purities_vs_k = []
    avg_me = []
    n_runs = 5
    for k in range(1, 11):
        run_best_sses = []
        for _ in range(n_runs):
            kmeans = KMeans(k, kmeans_max_iter)
            sse_vs_iter = kmeans.fit(x_train)
            train_sses_vs_iter.append(sse_vs_iter)
            train_purities_vs_k.append(kmeans.get_purity(x_train, y_train))
            run_best_sses.append(min(sse_vs_iter))
        # Average best SSE for THIS k only (see docstring).
        avg_me.append(sum(run_best_sses) / len(run_best_sses))
    plot_y_vs_x(avg_me, x_label='k', y_label='sse',
                save_path='plot_sse_vs_k_%d' % do_pca)
def generate_distribution_plot(clusters, output_path):
    """Plot a fitted normal PDF of point-to-centroid distances for each cluster.

    :param clusters: dict mapping centroid -> list of points in that cluster
    :param output_path: directory/prefix the PNG is written under

    BUG FIX: ``dict.iteritems()`` does not exist on Python 3; ``items()``
    behaves the same here and works on both Python 2 and 3.
    """
    pyplot.clf()
    for centroid, points in clusters.items():
        # Sorted distances so the PDF is plotted along a monotone x axis.
        distances = sorted(
            KMeans.calculate_distance(centroid, p) for p in points)
        pdf = stats.norm.pdf(distances, np.mean(distances), np.std(distances))
        pl.plot(distances, pdf)
    pl.savefig(output_path + 'distribution.png')
def apply_kmeans_avg(x_train, y_train, kmeans_max_iter, k, iterations=5):
    """Average SSE curve, best SSE, and purity over several k-means restarts.

    :return: (mean SSE-per-iteration list, mean best SSE, mean purity)
    """
    sse_totals = None
    best_sse_total = 0
    purity_total = 0
    print("")
    for step in range(iterations):
        print("On step ", step + 1, "of", iterations, "for k =", k)
        model = KMeans(k, kmeans_max_iter)
        run_sses = np.array(model.fit(x_train))
        # Lazily size the accumulator from the first run's history length.
        if sse_totals is None:
            sse_totals = np.zeros(len(run_sses))
        sse_totals += run_sses
        purity_total += model.get_purity(x_train, y_train)
        best_sse_total += run_sses.min()
    return ((sse_totals / iterations).tolist(),
            best_sse_total / iterations,
            purity_total / iterations)
def setUp(self):
    """Seed RNG, start a local SparkContext, and build a 12-point / k=3 fixture."""
    random.seed(1)
    self.sc = SparkContext(master='local')
    # Three visually separable groups of four 3-D points each.
    raw_points = [
        (1, 1, 0), (1, 2, 2), (2, 2, 2), (2, 1, 0),
        (8, 0, 0), (8, 1, 2), (9, 1, 0), (9, 2, 2),
        (1, 1, 7), (1, 2, 9), (2, 2, 9), (2, 1, 7),
    ]
    self.points = self.sc.parallelize(raw_points)
    self.k = 3
    self.kmeans = KMeans(self.k)
def dbscan_test():
    """Estimate the number of clusters with DBSCAN, then fit and draw KMeans with it.

    FIX: ``sklearn.datasets.samples_generator`` was a private module removed in
    scikit-learn 0.24; ``make_blobs`` is importable directly from
    ``sklearn.datasets`` on all supported versions.
    """
    from sklearn.datasets import make_blobs
    from sklearn.preprocessing import StandardScaler
    from sklearn.cluster import DBSCAN

    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(n_samples=750, centers=centers,
                                cluster_std=0.4, random_state=0)
    X = StandardScaler().fit_transform(X)

    db = DBSCAN(eps=0.3, min_samples=10).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    # Number of clusters in labels, ignoring noise (-1) if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    kmeans = KMeans(n_clusters=n_clusters_).fit(X)
    kmeans.draw(n_clusters_, X, kmeans)
def apply_kmeans(do_pca, x_train, y_train, kmeans_max_iter, kmeans_max_k):
    """For each k, average best SSE and purity over 5 restarts, then plot vs k.

    BUG FIX: ``sses_sum`` and ``purities_sum`` were initialised once OUTSIDE
    the k loop, so every k after the first folded the previous k's
    (already-divided) totals into its own average. They are now reset per k.
    """
    train_sses_vs_iter = []
    train_sses_vs_k = []
    train_purities_vs_k = []
    n_restarts = 5
    for k in range(1, kmeans_max_k):
        sses_sum = 0
        purities_sum = 0
        for _ in range(n_restarts):
            kmeans = KMeans(k, kmeans_max_iter)
            sse_vs_iter = kmeans.fit(x_train)
            sses_sum += min(sse_vs_iter)
            purities_sum += kmeans.get_purity(x_train, y_train)
            print(k)
        sses_sum /= n_restarts
        purities_sum /= n_restarts
        train_sses_vs_k.append(sses_sum)
        train_purities_vs_k.append(purities_sum)
    print(train_sses_vs_k)
    print(train_purities_vs_k)
    plot_y_vs_x_list(train_sses_vs_iter, x_label='iter', y_label='sse',
                     save_path='plot_sse_vs_k_subplots_%d' % do_pca)
    plot_y_vs_x(train_sses_vs_k, x_label='k', y_label='sse',
                save_path='plot_sse_vs_k_%d' % do_pca)
    plot_y_vs_x(train_purities_vs_k, x_label='k', y_label='purities',
                save_path='plot_purity_vs_k_%d' % do_pca)
def user__reputation__to__upvotes_cast(k, user_lines):
    """Cluster users by (reputation, upvotes cast).

    :param k: Number of clusters
    :param user_lines: PythonRDD containing the lines in the users XML file
    :return: RDD of clustered data
    """
    parsed = user_lines.map(
        lambda line: xml_parser.extract_attributes(line, ['Reputation', 'UpVotes'], int))
    # Drop lines where both attributes could not be extracted.
    valid = parsed.filter(lambda a: helpers.is_valid_tuple(a, 2))
    return KMeans(k).fit(valid)
def _fit(self, X):
    """Fit mixture parameters to X with EM, iterating until the parameter
    vector stops changing (per np.allclose).

    Initialisation: component means come from a k-means run on X; every
    component starts with the empirical covariance of the whole data set
    and a uniform mixing coefficient.

    :param X: data matrix, samples along axis 0 (np.cov is taken of X.T)
    """
    # Shared initial covariance: empirical covariance of the full data set.
    cov = np.cov(X.T)
    # Seed the component means with k-means cluster centres.
    kmeans = KMeans(self.n_components)
    kmeans.fit(X)
    self.mu = kmeans.centers
    self.cov = np.array([cov for _ in range(self.n_components)])
    self.coef = np.ones(self.n_components) / self.n_components
    # Flatten all parameters into one vector so convergence is a single
    # allclose comparison.
    params = np.hstack(
        (self.mu.ravel(), self.cov.ravel(), self.coef.ravel())
    )
    while True:
        # E-step: sufficient statistics; M-step: update mu/cov/coef in place.
        stats = self._expectation(X)
        self._maximization(X, stats)
        new_params = np.hstack(
            (self.mu.ravel(), self.cov.ravel(), self.coef.ravel())
        )
        if np.allclose(params, new_params):
            break
        else:
            params = new_params
def length__aboutme__to__user_rep(k, user_lines):
    """Cluster users by (reputation, length of their AboutMe text).

    :param k: Number of clusters
    :param user_lines: RDD with XML file with users
    :return: RDD of clustered data
    """
    parsed = user_lines.map(
        lambda line: xml_parser.extract_attributes(line, ['Reputation', 'AboutMe']))
    valid = parsed.filter(lambda a: helpers.is_valid_tuple(a, 2))
    # (reputation_as_int, character_length_of_about_me)
    rep_to_length = valid.map(lambda a: (int(a[0]), len(a[1])))
    return KMeans(k).fit(rep_to_length)
def user__reputation__to__distinct_post_tags(k, user_lines, post_lines):
    """
    Classifies users based on the following:
    - user reputation
    - number of distinct tags across the user's posts

    :param k: Number of clusters
    :param user_lines: RDD with lines from the users XML file
    :param post_lines: RDD with lines from the posts XML file
    :return: RDD of clustered data
    """
    # (user_id, reputation)
    rep = user_lines \
        .map(lambda line: xml_parser.extract_attributes(line, ['Id', 'Reputation'], int)) \
        .filter(lambda a: helpers.is_valid_tuple(a, 2))

    # (user_id, number_distinct_tags)
    # Tags arrive as "<tag1><tag2>...": strip all '>' and the leading '<',
    # then split on '<' to get one tag per element; distinct() dedupes
    # (user, tag) pairs before counting tags per user.
    user_id_tags = post_lines \
        .map(lambda line: xml_parser.extract_attributes(line, ['OwnerUserId', 'Tags'])) \
        .filter(lambda a: helpers.is_valid_tuple(a, 2)) \
        .map(lambda a: (int(a[0]), a[1].replace(">", "")[1:])) \
        .map(lambda a: (a[0], a[1].split("<"))) \
        .flatMapValues(lambda a: a) \
        .distinct() \
        .map(lambda a: (a[0], 1)) \
        .reduceByKey(add)

    # (rep, number_distinct_tags)
    result = rep.join(user_id_tags).map(lambda a: (a[1][0], a[1][1]))
    return KMeans(k).fit(result)
def user__badges__to__signup__to__answers_and_questions(
        k, user_lines, badges_lines, posts_lines):
    """
    Classifies users based on the following:
    - signup date (as a timestamp)
    - number of badges earned
    - number of answers posted
    - number of questions posted

    :param k: Number of clusters
    :param user_lines: RDD with lines from the users XML file
    :param badges_lines: RDD with lines from the badges XML file
    :param posts_lines: RDD with lines from the posts XML file
    :return: RDD of clustered data
    """
    # (user_id, signup)
    user_id_signup = user_lines \
        .map(lambda line: xml_parser.extract_attributes(line, ['Id', 'CreationDate'])) \
        .filter(lambda a: helpers.is_valid_tuple(a, 2)) \
        .map(lambda a: (int(a[0]), helpers.datetime_to_timestamp(a[1])))

    # (user_id, number_badges)
    user_id_badges = badges_lines \
        .map(lambda line: xml_parser.extract_attributes(line, ['UserId'], int)) \
        .filter(lambda a: helpers.is_valid_tuple(a, 1)) \
        .map(lambda a: (a[0], 1)) \
        .reduceByKey(add)

    # (user_id, post_type) — shared base for the two counts below
    posts = posts_lines \
        .map(lambda line: xml_parser.extract_attributes(line, ['OwnerUserId', 'PostTypeId'], int)) \
        .filter(lambda a: helpers.is_valid_tuple(a, 2))

    # (user_id, n_answers) — PostTypeId == 2 marks answers
    user_id_answers = posts \
        .filter(lambda a: a[1] == 2) \
        .map(lambda a: (a[0], 1)) \
        .reduceByKey(add)

    # (user_id, n_asked) — PostTypeId == 1 marks questions
    user_id_questions = posts \
        .filter(lambda a: a[1] == 1) \
        .map(lambda a: (a[0], 1)) \
        .reduceByKey(add)

    # Successive joins nest the value as (((signup, n_badges), n_answers), n_questions);
    # the final map unpacks that into (n_questions, n_answers, n_badges, signup).
    result = user_id_signup.join(user_id_badges)
    result = result.join(user_id_answers)
    result = result.join(user_id_questions)
    result = result.map(lambda a: (a[1][1], a[1][0][1], a[1][0][0][1],
                                   a[1][0][0][0]))
    return KMeans(k).fit(result)
def main(dataset_fn, output_fn, clusters_no, w):
    """Cluster weighted 3-D nodes read from a CSV file, save and display the result."""
    df = pd.read_csv(dataset_fn)
    # Each CSV row becomes a Node([X, Y, PreChange], ID).
    geo_locs = []
    for _, row in df.iterrows():
        coords = [float(row['X']), float(row['Y']), float(row['PreChange'])]
        geo_locs.append(Node(coords, row['ID']))
    # Run weighted k-means clustering; fit returns -1 when there are fewer
    # points than clusters.
    model = KMeans(geo_locs, clusters_no, np.array(w))
    if model.fit(True) == -1:
        print("No of points are less than cluster number!")
    else:
        # Clustering results: a list of lists, one inner list per cluster.
        model.save(output_fn)
        model.showresult(True)
def user__membership_time__to__closed_questions(k, users, posts, post_history): """ Classifies users based on the following: - amount of time a user has been a member - number of close or delete votes any of their posts have received :param k: Number of clusters :param users: RDD with XML file with users :param posts: RDD with XML file with posts :param post_history: RDD with XML file with post history :return: RDD of clustered data """ # (user_id, timestamp) user_data = users\ .map(lambda line: xml_parser.extract_attributes(line, ['Id', 'CreationDate']))\ .filter(lambda a: helpers.is_valid_tuple(a, 2))\ .map(lambda data: (int(data[0]), helpers.datetime_to_timestamp(data[1])))\ # (post_id, author_user_id) post_data = posts\ .map(lambda line: xml_parser.extract_attributes(line, ['Id', 'OwnerUserId'], int))\ .filter(lambda a: helpers.is_valid_tuple(a, 2)) # (post_id, number_of_close_and_delete_votes) post_history_data = post_history\ .map(lambda line: xml_parser.extract_attributes(line, ['PostId', 'PostHistoryTypeId'], int))\ .filter(lambda a: helpers.is_valid_tuple(a, 2) and a[1] in [10, 12])\ .map(lambda a: (a[0], 1))\ .reduceByKey(add) # (user_id, number_of_close_and_delete_votes) user_delete_and_close_count = post_data.join(post_history_data)\ .map(lambda a: a[1])\ .reduceByKey(add) # (timestamp, number_of_close_and_delete_votes) data = user_data.join(user_delete_and_close_count).map(lambda a: a[1]) return KMeans(k).fit(data)
def user__signup__to__distinct_post_tags(k, user_lines, post_lines):
    """
    Classifies users based on the following:
    - signup date (as a timestamp)
    - number of distinct tags across the user's posts

    :param k: Number of clusters
    :param user_lines: RDD with lines from the users XML file
    :param post_lines: RDD with lines from the posts XML file
    :return: RDD of clustered data
    """
    # (user_id, signup_as_timestamp)
    creation_date = user_lines \
        .map(lambda line: xml_parser.extract_attributes(line, ['Id', 'CreationDate'])) \
        .filter(lambda a: helpers.is_valid_tuple(a, 2)) \
        .map(lambda a: (int(a[0]), helpers.datetime_to_timestamp(a[1])))

    # (user_id, number_distinct_tags)
    # Tags arrive as "<tag1><tag2>...": strip all '>' and the leading '<',
    # split on '<' to get one tag per element; distinct() dedupes
    # (user, tag) pairs before counting tags per user.
    user_id_tags = post_lines \
        .map(lambda line: xml_parser.extract_attributes(line, ['OwnerUserId', 'Tags'])) \
        .filter(lambda a: helpers.is_valid_tuple(a, 2)) \
        .map(lambda a: (int(a[0]), a[1].replace(">", "")[1:])) \
        .map(lambda a: (a[0], a[1].split("<"))) \
        .flatMapValues(lambda a: a) \
        .distinct() \
        .map(lambda a: (a[0], 1)) \
        .reduceByKey(add)

    # (signup_as_timestamp, number_distinct_tags)
    result = creation_date.join(user_id_tags).map(lambda a: (a[1][0], a[1][1]))
    return KMeans(k).fit(result)
def post__edits__average__to__user_rep(k, user_lines, post_history_lines):
    """
    Classifies users based on the following:
    - number of times a user's post has been edited
    - user's reputation

    :param k: Number of clusters
    :param user_lines: RDD with XML file with users
    :param post_history_lines: RDD with XML file with post history
    :return: RDD of clustered data
    """
    # (user_id, reputation)
    reputations = user_lines \
        .map(lambda line: xml_parser.extract_attributes(line, ['Id', 'Reputation'], int)) \
        .filter(lambda a: helpers.is_valid_tuple(a, 2))

    # (user_id, edit_count) — PostHistoryTypeId == 5 marks body-edit events.
    # NOTE(review): despite the function name, this is a total COUNT of edit
    # events keyed by the history rows' UserId, not an average — confirm intent.
    post_edits_average = post_history_lines \
        .map(lambda line: xml_parser.extract_attributes(line, ['UserId', 'PostHistoryTypeId'], int)) \
        .filter(lambda a: helpers.is_valid_tuple(a, 2) and a[1] == 5) \
        .map(lambda a: (a[0], 1)) \
        .reduceByKey(add)

    # Join on user id, keep (reputation, edit_count) pairs as feature vectors.
    result = reputations.join(post_edits_average).map(lambda value: value[1])
    return KMeans(k).fit(result)
# verifique qual foi a classe predominante, amostras pertencentes a outras # classes estao no grupo errado. Faca os experimentos com a distancia # Euclidiana. Gere graficos com os grupos formados pelo kmeans e clusterizacao # hierarquica. Comente os resultados. Lembre-se de nao usar o atributo da classe # para agrupar os dados. import matplotlib.pyplot as plt from clustering import KMeans from clustering import Hierarchical import utils print 'Base Spiral\n' x, y = utils.abrir_dados_spiral('./bases/spiral.csv') print 'Kmeans' kmeans = KMeans(x) for k in [2, 3]: kmeans.train(k) print 'k = {:d}. Acuracia = {:1.3f} '.format(k, 100 * kmeans.calculate_accuracy(y)) fig = kmeans.scatter_plot() plt.savefig('./bases/l03/spiral_kmeans_k_{:d}'.format(k)) plt.clf() print 'Hierarchical' hier = Hierarchical(x) for k in [2, 3]: hier.agnes(k) print 'k = {:d}. Acuracia = {:1.3f} '.format(k, 100 * hier.calculate_accuracy(y)) fig = hier.scatter_plot() plt.savefig('./bases/l03/spiral_hier_k_{:d}'.format(k)) plt.clf()