import gc
import json
import math

from pyspark import RDD, SparkContext
from pyspark.mllib.linalg.distributed import RowMatrix

# `path`, `support_threshold_total`, `open_result`, `generate_combination`,
# `generate_combination_with_filter`, `hash_pair` and `qualified_as_candidate_pair`
# are assumed to be defined elsewhere in the original modules.


def writeFile(rdd: RDD):
    num = rdd.count()
    if num > 0:
        result_dic = open_result()
        print(result_dic)
        # collect() already returns a Python list of (key, value) tuples,
        # so no string round-trip through ast.literal_eval is needed
        for key, value in rdd.collect():
            clean_key = str(key).replace("'", '')
            if clean_key in result_dic:
                result_dic[clean_key] += value
            else:
                result_dic[clean_key] = value
        with open(path, 'w', encoding='utf-8') as result:
            # the encode/decode round-trip un-escapes the \uXXXX sequences produced
            # by json.dumps, keeping non-ASCII text readable in the output file
            result.write(json.dumps(result_dic).encode('gb18030').decode('unicode_escape'))


def writeFile(rdd: RDD, f_type):
    # variant that maintains two result files; f_type selects which one to update
    num = rdd.count()
    global save1
    global save2
    if num > 0:
        result_dic = open_result(f_type)
        # collect() already returns a Python list of (key, value) tuples
        for key, value in rdd.collect():
            clean_key = str(key).replace("'", '')
            if clean_key in result_dic:
                result_dic[clean_key] += value
            else:
                result_dic[clean_key] = value
        # keep an in-memory copy of the latest result for each file type
        if f_type == 0:
            save1 = result_dic
        elif f_type == 1:
            save2 = result_dic
        with open(path[f_type], 'w', encoding='utf-8') as result:
            result.write(json.dumps(result_dic).encode('gb18030').decode('unicode_escape'))
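

# Usage sketch (an assumption, not part of the original source): the two writeFile
# variants above read like foreachRDD callbacks of a Spark Streaming word-count job.
# The socket source, host/port and tokenisation below are illustrative only;
# `ssc` is assumed to be a pyspark.streaming.StreamingContext.
def _example_streaming_usage(ssc):
    counts = ssc.socketTextStream("localhost", 9999). \
        flatMap(lambda line: line.split()). \
        map(lambda word: (word, 1)). \
        reduceByKey(lambda x, y: x + y)
    # merge each batch's counts into result file 0 (the two-argument variant)
    counts.foreachRDD(lambda batch_rdd: writeFile(batch_rdd, 0))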
    def pcy_for_rdd(baskets: RDD, support_threshold_total=support_threshold_total) -> list:

        def check_all_subsets_frequent(itemset: list, frequent_itemsets_dict: dict) -> bool:
            '''
            Check that every (k-1)-subset of a k-itemset is already frequent.
            For example, given the triple ['2', '1', '8'], check that all of its pairs
            ['2', '1'], ['2', '8'] and ['1', '8'] are frequent.
            :param itemset: candidate itemset, as a sorted list of items
            :param frequent_itemsets_dict: dict whose keys are the frequent itemsets found so far
            :return: True if every subset obtained by removing one item is frequent
            '''
            itemset_size = len(itemset)
            for i in range(itemset_size):
                subset = itemset.copy()
                subset.pop(i)
                # no need to sort the subset here: each basket is already sorted
                if tuple(subset) not in frequent_itemsets_dict:
                    return False
            return True

        num_baskets = baskets.count()
        singleton_counts = baskets.\
            flatMap(lambda basket: [(item, 1) for item in basket]).\
            reduceByKey(lambda x, y: x + y).\
            filter(lambda pair: pair[1] >= support_threshold_total)
        # frequent_singletons_dict = dict(singleton_counts.collect()).keys()
        frequent_itemsets_dict = dict(singleton_counts.collect())
        # print("frequent_itemsets_dict", frequent_itemsets_dict)
        frequent_itemsets_list = [sorted(list(frequent_itemsets_dict.keys()))]
        del singleton_counts
        gc.collect()

        # all_pairs = baskets.flatMap(lambda basket: generate_combination(basket, 2)).persist()  # used in both the first and second passes, so why not persist it
        #
        # bucket_counts = all_pairs.map(lambda pair:(hash_pair(pair), 1)).reduceByKey(lambda x,y: x+y).collect()  # first pass
        # bitmap = dict(bucket_counts)
        # for key, value in bitmap.items():
        #     if value >= support_threshold_total:
        #         bitmap[key] = 1
        #     else:
        #         bitmap[key] = 0

        current_itemset_size = 2
        while True:
            # print("current_itemset_size", current_itemset_size)
            # if current_itemset_size == 2: # pairs are special
            #     frequent_itemsets = all_pairs.\
            #         filter(lambda _: qualified_as_candidate_pair(_, frequent_itemsets_dict, bitmap)).\
            #         map(lambda pair: (tuple(pair), 1)).\
            #         reduceByKey(lambda x, y: x + y).\
            #         filter(lambda pair: pair[1] >= support_threshold_total).persist()
            #     del all_pairs
            #     gc.collect()
            # else:  # double filter
            frequent_itemsets = baskets.flatMap(lambda basket: generate_combination_with_filter(basket, frequent_itemsets_dict, current_itemset_size)). \
                map(lambda itemset: (tuple(itemset), 1)).\
                reduceByKey(lambda x,y: x+y).\
                filter(lambda pair: pair[1] >= support_threshold_total).persist()
            # if frequent_itemsets.count() == 0:
            #     break
            current_size_frequent_itemsets = sorted(frequent_itemsets.keys().collect())
            if current_size_frequent_itemsets == []:
                break

            frequent_itemsets_list.append(current_size_frequent_itemsets)
            frequent_itemsets_dict.update(dict.fromkeys(current_size_frequent_itemsets))
            # frequent_itemsets_dict.update(dict(frequent_itemsets.collect()))
            current_itemset_size += 1
            frequent_itemsets.unpersist()  # release the persisted RDD before dropping the reference
            del frequent_itemsets
            del current_size_frequent_itemsets
            gc.collect()

        gc.collect()
        return frequent_itemsets_list
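
    # Usage sketch (an assumption, not from the original source): pcy_for_rdd expects an
    # RDD of baskets, each basket a sorted list of item ids, and returns one list of
    # frequent itemsets per itemset size. The input path and csv layout are illustrative.
    def example_run_pcy(sc: SparkContext, input_path: str) -> list:
        baskets = sc.textFile(input_path). \
            map(lambda line: sorted(set(line.strip().split(',')))). \
            persist()
        return pcy_for_rdd(baskets, support_threshold_total=100)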
    def son(baskets: RDD, support_threshold_total=support_threshold_total) -> list:

        def pcy_for_list(partition: list, support_threshold_total=support_threshold_total) -> dict:
            # partition = baskets
            num_baskets_chunk = len(partition)
            support_threshold = math.ceil(support_threshold_total * num_baskets_chunk / num_baskets)

            # first pass
            singleton_counts = {}
            bucket_counts = {}
            for basket in partition:
                for item in basket:
                    singleton_counts[item] = singleton_counts.get(item, 0) + 1

                pairs = generate_combination(basket, size=2)
                for pair in pairs:
                    key = hash_pair(pair)
                    bucket_counts[key] = bucket_counts.get(key, 0) + 1

            for key, value in bucket_counts.items():
                if value >= support_threshold:
                    bucket_counts[key] = 1
                else:
                    bucket_counts[key] = 0

            frequent_itemsets = {}
            for key, value in singleton_counts.items():
                if value >= support_threshold:
                    frequent_itemsets[key] = None  # store all frequent singletons
            # print("singleton_counts", singleton_counts)
            # print("frequent singletons", frequent_itemsets)
            del singleton_counts
            gc.collect()

            # second pass
            itemset_counts = {}
            for basket in partition:
                pairs = generate_combination(basket, size=2)
                for pair in pairs:
                    if qualified_as_candidate_pair(pair, frequent_itemsets, bitmap=bucket_counts):
                        key = tuple(pair)
                        itemset_counts[key] = itemset_counts.get(key, 0) + 1

            for key, value in itemset_counts.items():
                if value >= support_threshold:
                    frequent_itemsets[key] = None  # store all frequent pairs
            # print("pair counts", itemset_counts)
            del itemset_counts
            gc.collect()

            # more passes for larger-size itemsets
            size = 3
            num_frequent_itemsets = len(frequent_itemsets)
            while True:
                itemset_counts = {}
                for basket in partition:
                    itemsets = generate_combination_with_filter(basket, frequent_itemsets, size)
                    for itemset in itemsets:
                        key = tuple(itemset)
                        itemset_counts[key] = itemset_counts.get(key, 0) + 1

                for key, value in itemset_counts.items():
                    if value >= support_threshold:
                        frequent_itemsets[key] = None  # store all frequent itemsets of the current size
                del itemset_counts
                gc.collect()

                current_num_frequent_itemsets = len(frequent_itemsets)
                # print("frequent_itemsets", frequent_itemsets)
                if current_num_frequent_itemsets == num_frequent_itemsets:  # no more new frequent itemsets
                    # print("break")
                    break

                num_frequent_itemsets = current_num_frequent_itemsets
                size += 1

            # print("frequent_itemsets", frequent_itemsets)
            return frequent_itemsets

        # First stage
        num_baskets = baskets.count()
        candidate_itemsets = dict.fromkeys(
            baskets.mapPartitions(lambda _: pcy_for_list(list(_), support_threshold_total)).distinct().collect(),
            0)
        # print("candidate_itemsets", candidate_itemsets)

        # Second stage
        def qualified_as_candidate_itemset(itemset):
            # an itemset survives the second stage only if some chunk proposed it as a candidate
            return itemset in candidate_itemsets

        singleton_counts = baskets.\
            flatMap(lambda basket: basket).\
            filter(lambda item: qualified_as_candidate_itemset(item)).\
            map(lambda _: (_, 1)).\
            reduceByKey(lambda x,y: x+y).\
            filter(lambda pair: pair[1] >= support_threshold_total).keys().collect()
        frequent_itemsets = [sorted(singleton_counts)]
        del singleton_counts
        gc.collect()

        size = 2
        while True:
            frequent_itemsets_for_particular_size = baskets.\
                flatMap(lambda _: generate_combination_with_filter(_, candidate_itemsets, size)).\
                filter(lambda _: qualified_as_candidate_itemset(tuple(_))).\
                map(lambda _: (tuple(_), 1)).\
                reduceByKey(lambda x,y: x+y).\
                filter(lambda pair: pair[1] >= support_threshold_total).keys().collect()
            if frequent_itemsets_for_particular_size == []:
                break
            else:
                frequent_itemsets.append(sorted(frequent_itemsets_for_particular_size))
                size += 1

            del frequent_itemsets_for_particular_size
            gc.collect()

        return frequent_itemsets
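
    # Usage sketch (an assumption, not from the original source): son() takes the same
    # basket RDD as pcy_for_rdd and returns the same structure, a list of lists grouped
    # by itemset size; flattening it yields every frequent itemset found.
    def example_flatten_son_result(baskets: RDD) -> list:
        itemsets_by_size = son(baskets, support_threshold_total=100)
        return [itemset for size_group in itemsets_by_size for itemset in size_group]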
    def kmeans(data: RDD,
               num_clusters: int,
               min_cluster_id=0,
               max_iterations=20) -> RDD:
        """
        :param data: pair RDD, key is point_id, value is feature vector
        :param num_clusters: number of clusters
        :param min_cluster_id: The id of each cluster is min_cluster_id, min_cluster_id + 1, min_cluster_id + 2 ...
        :param max_iterations: maximum number of iterations
        :return: clustering result, pair RDD, key is point_id, value is (cluster id, feature vector)
        """

        # ** some helper functions:
        def minimum_distance_to_centroids(point: list,
                                          centroids: list) -> float:
            "Squared Euclidean distance from the point to its nearest centroid"
            min_distance = min([
                sum([(i - j)**2 for i, j in zip(point, centroid)])
                for centroid in centroids
            ])
            return min_distance

        def assign_point_to_cluster(point: list, centroids_: dict) -> int:
            "Given a data point, find the nearest centroid to it"
            distances = [(sum([(i - j)**2
                               for i, j in zip(point, centroid)]), cid)
                         for cid, centroid in centroids_.items()]
            min_distance, cluster_index = min(distances)
            return cluster_index

        # ** initialize the first k centroids
        total_samples = data.count()
        print("initializing centroids")
        print("total samples:", total_samples)
        sample_fraction = 0.3
        if total_samples * sample_fraction < num_clusters:
            if total_samples < num_clusters:
                centroids = data.values().collect()
            else:
                centroids = data.values().take(num_clusters)
        else:
            sample_data = data.sample(withReplacement=False,
                                      fraction=sample_fraction,
                                      seed=10)
            centroids = [sample_data.first()[1]]  # add the first centroid to the collection of centroids
            for _ in range(num_clusters - 1):  # find the next k-1 centroids
                distances = sample_data.\
                    mapValues(lambda values: minimum_distance_to_centroids(values, centroids)).persist()
                # pick the point whose minimum distance to the already chosen centroids is largest
                furthest_point_id = distances.max(lambda pair: pair[1])[0]
                furthest_point = sample_data.lookup(furthest_point_id)[0]
                centroids.append(furthest_point)  # furthest from already selected points

        centroids = [(min_cluster_id + i, centroid)
                     for i, centroid in enumerate(centroids)]  # assign index
        centroids = dict(centroids)

        # ** perform clustering
        num_iterations = 0
        previous_cluster_sizes = dict()
        while num_iterations < max_iterations:
            print("current iteration:", num_iterations)
            print("assign points to clusters")
            cluster_result: RDD = data.mapValues(lambda values: (
                assign_point_to_cluster(values, centroids), values)).persist()

            cluster_sizes = cluster_result.map(
                lambda pair: (pair[1][0], 1)).reduceByKey(lambda x, y: x + y)
            cluster_sizes = dict(cluster_sizes.collect())
            if num_iterations > 0:  # after first iteration
                num_reassignments = [
                    cluster_sizes[cid] - previous_cluster_sizes[cid]
                    for cid in cluster_sizes.keys()
                ]
                print("num_reassignments:", num_reassignments)
                # cluster sizes have essentially stopped changing: treat the process as converged
                if max(num_reassignments) - min(num_reassignments) < 3:
                    break

            previous_cluster_sizes = cluster_sizes.copy()
            print("update centroids_:")
            new_centroids = cluster_result. \
                values(). \
                flatMapValues(lambda features: [(i, feature) for i, feature in enumerate(features)]). \
                map(lambda pair: ((pair[0], pair[1][0]), (pair[1][1], 1))). \
                reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1])). \
                mapValues(lambda values: values[0] / values[1]). \
                map(lambda pair: (pair[0][0], (pair[0][1], pair[1]))). \
                groupByKey(). \
                mapValues(sorted). \
                mapValues(lambda values: [value for feature_index, value in values]). \
                persist()
            new_centroids = dict(new_centroids.collect())
            centroids = new_centroids.copy()
            num_iterations += 1

        print("Converged. Total iterations:", num_iterations)
        print(
            "cluster size",
            cluster_result.values().mapValues(
                lambda values: 1).groupByKey().mapValues(len).collect())
        return cluster_result
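
    # Usage sketch (an assumption, not from the original source): kmeans expects a pair
    # RDD of (point_id, feature_vector); the toy points and k=2 below are illustrative.
    def example_run_kmeans(sc: SparkContext) -> list:
        points = sc.parallelize([
            (0, [0.0, 0.0]), (1, [0.1, 0.2]),
            (2, [9.0, 9.0]), (3, [9.1, 8.8]),
        ])
        clusters = kmeans(points, num_clusters=2)
        # each element is (point_id, (cluster_id, feature_vector))
        return clusters.collect()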
def test_pca(spark_context: SparkContext, mat: RowMatrix, rows: RDD):
    pc: RowMatrix = pca(spark_context, mat, rows.count())

    assert pc is not None
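

# A minimal sketch (an assumption, not the project's actual helper) of the pca()
# function exercised by test_pca: compute the top-k principal components of the
# distributed row matrix and project its rows onto them.
def pca(spark_context: SparkContext, mat: RowMatrix, k: int) -> RowMatrix:
    # spark_context is kept only to match the call signature used in test_pca
    components = mat.computePrincipalComponents(k)  # local DenseMatrix with k columns
    return mat.multiply(components)  # projected rows, as a new RowMatrix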