import ast
import gc
import json
import math

from pyspark import SparkContext, rdd
from pyspark.rdd import RDD
from pyspark.mllib.linalg.distributed import RowMatrix

# Helpers referenced below (open_result, path, save1, save2, support_threshold_total,
# generate_combination, generate_combination_with_filter, hash_pair,
# qualified_as_candidate_pair, pca) are assumed to be defined elsewhere in the project.


def writeFile(rdd: RDD):
    """Merge the (key, value) pairs of this RDD into the saved result file."""
    num = rdd.count()
    if num > 0:
        result_dic = open_result()
        print(result_dic)
        rdd_c = rdd.collect()
        lst = ast.literal_eval(str(rdd_c))
        for item in lst:
            key = item[0]
            value = item[1]
            clean_key = str(key).replace("'", '')  # strip quotes before merging
            if clean_key in result_dic:
                result_dic[clean_key] = result_dic[clean_key] + value
            else:
                result_dic[clean_key] = value
        with open(path, 'w', encoding='utf-8') as result:
            # decode the \uXXXX escapes produced by json.dumps so that
            # non-ASCII keys are written out literally
            result.write(json.dumps(result_dic).encode('gb18030').decode('unicode_escape'))
def writeFile(rdd: RDD, f_type):
    """Merge the (key, value) pairs of this RDD into the result file selected by f_type."""
    num = rdd.count()
    global save1
    global save2
    if num > 0:
        result_dic = open_result(f_type)
        rdd_c = rdd.collect()
        lst = ast.literal_eval(str(rdd_c))
        for item in lst:
            key = item[0]
            value = item[1]
            clean_key = str(key).replace("'", '')  # strip quotes before merging
            if clean_key in result_dic:
                result_dic[clean_key] = result_dic[clean_key] + value
            else:
                result_dic[clean_key] = value
        # cache the latest result in the module-level variables
        if f_type == 0:
            save1 = result_dic
        if f_type == 1:
            save2 = result_dic
        with open(path[f_type], 'w', encoding='utf-8') as result:
            # decode the \uXXXX escapes produced by json.dumps so that
            # non-ASCII keys are written out literally
            result.write(json.dumps(result_dic).encode('gb18030').decode('unicode_escape'))
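
# --- Illustrative usage sketch, not part of the original code: it assumes writeFile
# is meant to be called from a DStream's foreachRDD on (word, count) pairs; the
# StreamingContext, socket source, host/port and batch interval are hypothetical.
def example_stream_to_writeFile(spark_context: SparkContext):
    from pyspark.streaming import StreamingContext

    ssc = StreamingContext(spark_context, batchDuration=5)
    counts = ssc.socketTextStream("localhost", 9999). \
        flatMap(lambda line: line.split()). \
        map(lambda word: (word, 1)). \
        reduceByKey(lambda x, y: x + y)
    counts.foreachRDD(lambda batch_rdd: writeFile(batch_rdd, 0))  # f_type=0 -> path[0] / save1
    ssc.start()
    ssc.awaitTermination()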
def pcy_for_rdd(baskets: RDD, support_threshold_total=support_threshold_total) -> list:
    def check_all_subsets_frequent(itemset: list, frequent_itemsets_dict: dict) -> bool:
        '''
        For example, given a triple ['2', '1', '8'], check whether all of its subsets
        ['2', '1'], ['2', '8'] and ['1', '8'] are frequent itemsets.
        :param itemset: candidate itemset
        :param frequent_itemsets_dict: keys are the frequent itemsets found so far
        :return: True if every (size - 1)-subset is frequent
        '''
        itemset_size = len(itemset)
        for i in range(itemset_size):
            subset = itemset.copy()
            subset.pop(i)
            try:
                _ = frequent_itemsets_dict[tuple(subset)]  # no need to sort the subset: baskets are already sorted
            except KeyError:
                return False
        return True

    num_baskets = baskets.count()

    # first pass: count singletons and keep the frequent ones
    singleton_counts = baskets.\
        flatMap(lambda basket: [(item, 1) for item in basket]).\
        reduceByKey(lambda x, y: x + y).\
        filter(lambda pair: pair[1] >= support_threshold_total)
    frequent_itemsets_dict = dict(singleton_counts.collect())
    frequent_itemsets_list = [sorted(list(frequent_itemsets_dict.keys()))]
    del singleton_counts
    gc.collect()

    # A PCY-style bucket/bitmap pass over pairs was also tried; kept here commented out:
    # all_pairs = baskets.flatMap(lambda basket: generate_combination(basket, 2)).persist()  # persist, since both passes would use it
    # bucket_counts = all_pairs.map(lambda pair: (hash_pair(pair), 1)).reduceByKey(lambda x, y: x + y).collect()  # first pass
    # bitmap = dict(bucket_counts)
    # for key, value in bitmap.items():
    #     bitmap[key] = 1 if value >= support_threshold_total else 0

    current_itemset_size = 2
    while True:
        # pairs could be special-cased with the bitmap above:
        # if current_itemset_size == 2:
        #     frequent_itemsets = all_pairs.\
        #         filter(lambda _: qualified_as_candidate_pair(_, frequent_itemsets_dict, bitmap)).\
        #         map(lambda pair: (tuple(pair), 1)).\
        #         reduceByKey(lambda x, y: x + y).\
        #         filter(lambda pair: pair[1] >= support_threshold_total).persist()
        #     del all_pairs
        #     gc.collect()
        # else:
        # double filter: candidates are built only from frequent subsets, then checked against the support threshold
        frequent_itemsets = baskets.\
            flatMap(lambda basket: generate_combination_with_filter(basket, frequent_itemsets_dict, current_itemset_size)).\
            map(lambda itemset: (tuple(itemset), 1)).\
            reduceByKey(lambda x, y: x + y).\
            filter(lambda pair: pair[1] >= support_threshold_total).persist()
        current_size_frequent_itemsets = sorted(frequent_itemsets.keys().collect())
        if current_size_frequent_itemsets == []:
            break
        frequent_itemsets_list.append(current_size_frequent_itemsets)
        frequent_itemsets_dict.update(dict.fromkeys(current_size_frequent_itemsets))
        current_itemset_size += 1
        del frequent_itemsets  # perhaps the persisted RDD should be unpersisted before the del? not sure
        del current_size_frequent_itemsets
        gc.collect()

    gc.collect()
    return frequent_itemsets_list
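
# --- Illustrative usage sketch, not part of the original code. It builds a tiny
# baskets RDD (each basket sorted, as pcy_for_rdd expects) and passes a small
# hypothetical support threshold; the project helpers it relies on must be available.
def example_pcy_for_rdd(spark_context: SparkContext):
    baskets = spark_context.parallelize([
        ['1', '2', '8'],
        ['1', '2'],
        ['2', '8'],
        ['1', '2', '8'],
    ]).map(sorted)
    frequent = pcy_for_rdd(baskets, support_threshold_total=2)
    # frequent is a list of lists: frequent[0] holds frequent singletons,
    # frequent[1] frequent pairs, and so on.
    print(frequent)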
def son(baskets: RDD, support_threshold_total=support_threshold_total) -> list:
    def pcy_for_list(partition: list, support_threshold_total=support_threshold_total) -> dict:
        """Run PCY on one partition; returning the dict lets mapPartitions iterate over its keys."""
        num_baskets_chunk = len(partition)
        # scale the global support threshold down to this chunk's share of the baskets
        support_threshold = math.ceil(support_threshold_total * num_baskets_chunk / num_baskets)

        # first pass: count singletons, and hash every pair into a bucket
        singleton_counts = {}
        bucket_counts = {}
        for basket in partition:
            for item in basket:
                singleton_counts[item] = singleton_counts.get(item, 0) + 1
            pairs = generate_combination(basket, size=2)
            for pair in pairs:
                key = hash_pair(pair)
                bucket_counts[key] = bucket_counts.get(key, 0) + 1
        # turn the bucket counts into a bitmap: 1 for frequent buckets, 0 otherwise
        for key, value in bucket_counts.items():
            if value >= support_threshold:
                bucket_counts[key] = 1
            else:
                bucket_counts[key] = 0
        frequent_itemsets = {}
        for key, value in singleton_counts.items():
            if value >= support_threshold:
                frequent_itemsets[key] = None  # store all frequent singletons
        del singleton_counts
        gc.collect()

        # second pass: count only the pairs that qualify as PCY candidates
        itemset_counts = {}
        for basket in partition:
            pairs = generate_combination(basket, size=2)
            for pair in pairs:
                if qualified_as_candidate_pair(pair, frequent_itemsets, bitmap=bucket_counts):
                    key = tuple(pair)
                    itemset_counts[key] = itemset_counts.get(key, 0) + 1
        for key, value in itemset_counts.items():
            if value >= support_threshold:
                frequent_itemsets[key] = None  # store all frequent pairs
        del itemset_counts
        gc.collect()

        # more passes for larger-size itemsets, until no new frequent itemsets appear
        size = 3
        num_frequent_itemsets = len(frequent_itemsets)
        while True:
            itemset_counts = {}
            for basket in partition:
                itemsets = generate_combination_with_filter(basket, frequent_itemsets, size)
                for itemset in itemsets:
                    key = tuple(itemset)
                    itemset_counts[key] = itemset_counts.get(key, 0) + 1
            for key, value in itemset_counts.items():
                if value >= support_threshold:
                    frequent_itemsets[key] = None  # store all frequent itemsets of this size
            del itemset_counts
            gc.collect()
            current_num_frequent_itemsets = len(frequent_itemsets)
            if current_num_frequent_itemsets == num_frequent_itemsets:  # no more new frequent itemsets
                break
            num_frequent_itemsets = current_num_frequent_itemsets
            size += 1
        return frequent_itemsets

    # First stage: run PCY on each partition to collect candidate itemsets
    num_baskets = baskets.count()
    candidate_itemsets = dict.fromkeys(
        baskets.mapPartitions(lambda _: pcy_for_list(list(_), support_threshold_total)).distinct().collect(), 0)

    # Second stage: count the candidates over the whole data set
    def qualified_as_candidate_itemset(itemset):
        try:
            _ = candidate_itemsets[itemset]
            return True
        except KeyError:
            return False

    singleton_counts = baskets.\
        flatMap(lambda basket: basket).\
        filter(lambda item: qualified_as_candidate_itemset(item)).\
        map(lambda _: (_, 1)).\
        reduceByKey(lambda x, y: x + y).\
        filter(lambda pair: pair[1] >= support_threshold_total).keys().collect()
    frequent_itemsets = [sorted(singleton_counts)]
    del singleton_counts
    gc.collect()

    size = 2
    while True:
        frequent_itemsets_for_particular_size = baskets.\
            flatMap(lambda _: generate_combination_with_filter(_, candidate_itemsets, size)).\
            filter(lambda _: qualified_as_candidate_itemset(tuple(_))).\
            map(lambda _: (tuple(_), 1)).\
            reduceByKey(lambda x, y: x + y).\
            filter(lambda pair: pair[1] >= support_threshold_total).keys().collect()
        if frequent_itemsets_for_particular_size == []:
            break
        else:
            frequent_itemsets.append(sorted(frequent_itemsets_for_particular_size))
            size += 1
        del frequent_itemsets_for_particular_size
        gc.collect()
    return frequent_itemsets
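
# --- Illustrative usage sketch, not part of the original code. SON should report the
# same frequent itemsets as pcy_for_rdd on the same input; the toy baskets and the
# support threshold of 2 are hypothetical.
def example_son(spark_context: SparkContext):
    baskets = spark_context.parallelize([
        ['1', '2', '8'],
        ['1', '2'],
        ['2', '8'],
        ['1', '2', '8'],
    ], numSlices=2)  # two partitions -> two local PCY runs in the first stage
    frequent = son(baskets, support_threshold_total=2)
    print(frequent)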
def kmeans(data: RDD, num_clusters: int, min_cluster_id=0, max_iterations=20) -> RDD:
    """
    :param data: pair RDD, key is point_id, value is the feature vector
    :param num_clusters: number of clusters
    :param min_cluster_id: cluster ids are min_cluster_id, min_cluster_id + 1, min_cluster_id + 2, ...
    :param max_iterations: maximum number of iterations
    :return: clustering result, pair RDD, key is point_id, value is (cluster_id, feature vector)
    """
    # ** some helper functions
    def minimum_distance_to_centroids(point: list, centroids: list) -> float:
        """Squared Euclidean distance from a point to its nearest centroid."""
        min_distance = min([
            sum([(i - j) ** 2 for i, j in zip(point, centroid)])
            for centroid in centroids
        ])
        return min_distance

    def assign_point_to_cluster(point: list, centroids_: dict) -> int:
        """Given a data point, find the nearest centroid to it."""
        distances = [(sum([(i - j) ** 2 for i, j in zip(point, centroid)]), cid)
                     for cid, centroid in centroids_.items()]
        min_distance, cluster_index = min(distances)
        return cluster_index

    # ** initialize the first k centroids: repeatedly pick the sampled point
    #    that is furthest from the centroids selected so far
    total_samples = data.count()
    print("initializing centroids")
    print("total samples:", total_samples)
    sample_fraction = 0.3
    if total_samples * sample_fraction < num_clusters:
        if total_samples < num_clusters:
            centroids = data.values().collect()
        else:
            centroids = data.values().take(num_clusters)
    else:
        sample_data = data.sample(withReplacement=False, fraction=sample_fraction, seed=10)
        centroids = [sample_data.first()[1]]  # add the first centroid to the collection of centroids
        for _ in range(num_clusters - 1):  # find the next k - 1 centroids
            distances = sample_data.\
                mapValues(lambda values: minimum_distance_to_centroids(values, centroids)).persist()
            furthest_point = sample_data.lookup(distances.max(lambda pair: pair[1])[0])[0]
            centroids.append(furthest_point)  # furthest from the already selected centroids
    centroids = [(min_cluster_id + i, centroid) for i, centroid in enumerate(centroids)]  # assign cluster ids
    centroids = dict(centroids)

    # ** perform clustering
    num_iterations = 0
    previous_cluster_sizes = dict()
    while num_iterations < max_iterations:
        print("current iteration:", num_iterations)
        print("assign points to clusters")
        cluster_result: RDD = data.mapValues(
            lambda values: (assign_point_to_cluster(values, centroids), values)).persist()
        cluster_sizes = cluster_result.map(lambda pair: (pair[1][0], 1)).reduceByKey(lambda x, y: x + y)
        cluster_sizes = dict(cluster_sizes.collect())
        if num_iterations > 0:  # after the first iteration
            num_reassignments = [
                (cluster_sizes[cid] - previous_cluster_sizes[cid]) for cid in cluster_sizes.keys()
            ]
            print("num_reassignments:", num_reassignments)
            if max(num_reassignments) - min(num_reassignments) < 3:  # the process has converged
                break
        previous_cluster_sizes = cluster_sizes.copy()

        print("update centroids:")
        # recompute each centroid as the per-dimension mean of the points assigned to it
        new_centroids = cluster_result. \
            values(). \
            flatMapValues(lambda features: [(i, feature) for i, feature in enumerate(features)]). \
            map(lambda pair: ((pair[0], pair[1][0]), (pair[1][1], 1))). \
            reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1])). \
            mapValues(lambda values: values[0] / values[1]). \
            map(lambda pair: (pair[0][0], (pair[0][1], pair[1]))). \
            groupByKey(). \
            mapValues(sorted). \
            mapValues(lambda values: [value for feature_index, value in values]). \
            persist()
        new_centroids = dict(new_centroids.collect())
        centroids = new_centroids.copy()
        num_iterations += 1

    print("Converged. Total iterations:", num_iterations)
    print("cluster size",
          cluster_result.values().mapValues(lambda values: 1).groupByKey().mapValues(len).collect())
    return cluster_result
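
# --- Illustrative usage sketch, not part of the original code. kmeans expects a pair
# RDD of (point_id, feature_vector); the toy 2-D points below are hypothetical.
def example_kmeans(spark_context: SparkContext):
    points = spark_context.parallelize([
        (0, [0.0, 0.0]), (1, [0.5, 0.1]), (2, [9.0, 9.2]), (3, [9.5, 8.8]),
    ])
    clustered = kmeans(points, num_clusters=2, min_cluster_id=0, max_iterations=10)
    # Each element is (point_id, (cluster_id, feature_vector)).
    print(clustered.collect())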
def test_pca(spark_context: SparkContext, mat: RowMatrix, rows: rdd.RDD):
    pc: RowMatrix = pca(spark_context, mat, rows.count())
    assert pc is not None
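
# --- Illustrative sketch only, not the project's actual pca implementation. It
# assumes pca wraps RowMatrix.computePrincipalComponents and projects the rows onto
# the top-k components; spark_context is unused and kept only to mirror the
# signature exercised by test_pca above.
def pca_sketch(spark_context: SparkContext, mat: RowMatrix, k: int) -> RowMatrix:
    principal_components = mat.computePrincipalComponents(k)  # (num_cols x k) local DenseMatrix
    return mat.multiply(principal_components)                 # rows projected onto the components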