Example #1
    def test_dictionaries(self):

        payload = dict(first="a", second="b")
        res = cwo(post_with_dict_param(payload))
        self.assertDictEqual(res, payload, "TEST FAILED: dict payload/return")

        inout = dict(first="a", second="b")
        update_inout_dict(inout)
        inout = cwo(inout)
        self.assertIn("third", inout, "TEST FAILED: inout dict")
Example #2
    def test_with_regular_tasks(self):

        inout = dict(first="a", second="b")
        update_inout_dict(inout)
        regular_task(inout)
        inout = cwo(inout)
        self.assertIn("third", inout, "TEST FAILED: http --> task")
        self.assertIn("greetings_from", inout, "TEST FAILED: http --> task")
Example #3
    def test_get_methods(self):
        dummy()
        print("GET: dummy works.")
        message = "holala"
        length = int(cwo(get_length(message)))
        self.assertEqual(length, len(message), "TEST FAILED: GET get_length")

        mes = cwo(return_message(message))
        self.assertEqual(mes, message, "TEST FAILED: GET return_message")

        mes = cwo(get_nested_produces(message))
        self.assertEqual(str(mes), message, "TEST FAILED: GET nested_produces")

        mes, length = cwo(multi_return(message))
        self.assertEqual(str(mes), message, "TEST FAILED: GET multi return 0")
        self.assertEqual(int(length), len(message),
                         "TEST FAILED: GET multi return 1")
Example #4
    def reduce(self, f, initial=marker, arity=-1):
        """
        Reduce the whole data set with a tree of reduction tasks.
        :param f: a reduce function which takes two elements as input and
                  returns a single result that is fed back into the
                  reduction.
        :param initial: initial value that the first element is reduced
                        with.
        :param arity: number of partial results reduced per step; defaults
                      to the number of partitions.
        :return: the reduced result (inside a DDS if necessary).

        >>> DDS().load(range(10), 5).reduce(lambda b, a: b + a, 100)
        145
        """
        def local_reducer(partition):
            """
            Reduce a partition locally and return it as a partition
            containing a single element.
            :param partition: iterable of elements to reduce.
            :return: a one-element list with the reduced value, or an
                     empty list for an empty partition.
            """
            iterator = iter(partition)
            try:
                init = next(iterator)
            except StopIteration:
                return []
            import functools
            return [functools.reduce(f, iterator, init)]

        local_results = self.map_partitions(local_reducer)\
            .collect(future_objects=True)

        local_results = deque(local_results)

        # If initial value is set, add it to the list as well
        if initial is not marker:
            local_results.append([initial])

        arity = arity if arity > 0 else len(self.partitions)
        branch = list()

        while local_results:
            while local_results and len(branch) < arity:
                temp = local_results.popleft()
                branch.append(temp)

            if len(branch) == 1:
                branch = cwo(branch[0])
                break

            temp = reduce_multiple(f, branch)
            local_results.append(temp)
            branch = []

        return branch[0]
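The while-loop above implements an arity-bounded reduction tree: up to arity partial results are popped from the deque, merged by a single reduce_multiple task, and the merged result is appended back until one value remains. A synchronous, pure-Python sketch of the same scheme (reduce_multiple and cwo replaced by local stand-ins):

    from collections import deque
    from functools import reduce as _reduce

    def tree_reduce(f, values, arity=2):
        """Reduce values by repeatedly merging groups of at most arity."""
        results = deque([v] for v in values)  # one-element "partitions"
        branch = []
        while results:
            branch = []
            while results and len(branch) < arity:
                branch.append(results.popleft())
            if len(branch) == 1:
                break
            # Stand-in for the reduce_multiple task above.
            results.append([_reduce(f, (part[0] for part in branch))])
        return branch[0][0]

    assert tree_reduce(lambda a, b: a + b, range(10), arity=3) == 45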
Example #5
    def test_post_methods(self):
        mes = cwo(dummy_post())
        self.assertEqual(mes, "post_works", "TEST FAILED: POST dummy")

        payload = "something"
        mes = cwo(post_with_param(payload))
        self.assertEqual(str(mes), payload,
                         "TEST FAILED: POST param in payload")

        inner_param = "hello"
        res = cwo(post_with_inner_param(inner_param))
        self.assertEqual(res.get("first", None), inner_param,
                         "TEST FAILED: POST inner param")

        fayl, content = "payload_file", "payload_content"
        with open(fayl, 'w') as nm:
            nm.write(content)

        ret = cwo(post_with_file_param(fayl))
        self.assertEqual(str(ret), content,
                         "TEST FAILED: POST file as payload")
Example #6
    def test_serialization(self):

        payload = "something"
        ret = post_with_param(payload)

        res = cwo(post_with_inner_param(ret))
        self.assertEqual(res.get("first", ""), payload,
                         "TEST FAILED: POST inner param")

        inout = post_with_inner_param(ret)
        update_inout_dict(inout)
        regular_task(inout)
        inout = cwo(inout)
        self.assertIn("third", inout, "TEST FAILED: json serialization")
        self.assertIn("greetings_from", inout,
                      "TEST FAILED: json serialization")

        length = get_length("holalaa")
        inout = post_with_inner_param(length)
        regular_task(inout)
        inout = cwo(inout)
        self.assertIn("greetings_from", inout,
                      "TEST FAILED: json serialization")
Example #7
    def count_by_value(self, arity=2, as_dict=True, as_fo=False):
        """
        Count the occurrences of each element in this data set.
        :param arity: number of partial dictionaries merged per reduction
                      step.
        :param as_dict: return the result as a dict.
        :param as_fo: return the result as a future object (only honored
                      when as_dict is True).
        :return: a dict of (element, count) pairs, or a DDS of them when
                 as_dict is False.

        >>> first = DDS().load([0, 1, 2], 2)
        >>> second = DDS().load([2, 3, 4], 3)
        >>> first.union(second).count_by_value(as_dict=True)
        {0: 1, 1: 1, 2: 2, 3: 1, 4: 1}
        """
        def count_partition(iterator):
            counts = defaultdict(int)
            for obj in iterator:
                counts[obj] += 1
            return counts

        # Count locally and create dictionary partitions
        local_results = self.map_partitions(count_partition) \
            .collect(future_objects=True)

        # Create a deque from partitions and start reduce
        future_objects = deque(local_results)

        branch = list()
        while future_objects:
            branch = []
            while future_objects and len(branch) < arity:
                temp = future_objects.popleft()
                branch.append(temp)

            if len(branch) == 1:
                break

            first, branch = branch[0], branch[1:]
            reduce_dicts(first, branch)
            future_objects.append(first)

        if as_dict:
            if as_fo:
                return branch[0]
            branch[0] = cwo(branch[0])
            return dict(branch[0])

        length = self.num_of_partitions()
        new_partitions = list()
        for i in range(length):
            new_partitions.append(task_dict_to_list(branch[0], length, i))

        return DDS().load(new_partitions, -1)
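reduce_dicts above is presumably a task that folds several partial count dictionaries into the first one, in place. A local, synchronous equivalent of that merge step, matching the doctest values:

    def merge_count_dicts(first, others):
        """Merge the counts of every dict in others into first, in place."""
        for partial in others:
            for key, count in partial.items():
                first[key] = first.get(key, 0) + count

    first = {0: 1, 1: 1, 2: 1}
    merge_count_dicts(first, [{2: 1, 3: 1, 4: 1}])
    assert first == {0: 1, 1: 1, 2: 2, 3: 1, 4: 1}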
Example #8
    def collect(self, keep_partitions=False, future_objects=False):
        """
        Return all elements from all partitions. Elements can be grouped
        by partition by setting keep_partitions to True.
        :param keep_partitions: group the returned elements by partition.
        :param future_objects: return unsynchronized future objects
                               instead of the actual data.
        :return: a list of elements, or a list of partitions.

        >>> dds = DDS().load(range(10), 2)
        >>> dds.collect(True)
        [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
        >>> DDS().load(range(10), 2).collect()
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
        """
        processed = list()
        if self.func:
            if self.paac:
                for col in self.partitions:
                    processed.append(map_partition(self.func, None, col))
            else:
                for _p in self.partitions:
                    processed.append(map_partition(self.func, _p))
            # Reset the function!
            self.func = None
        else:
            for _p in self.partitions:
                if isinstance(_p, IPartitionGenerator):
                    processed.append(_p.retrieve_data())
                else:
                    processed.append(_p)

        # Future objects cannot be extended for now...
        if future_objects:
            return processed

        processed = cwo(processed)

        ret = list()
        if not keep_partitions:
            for _pp in processed:
                ret.extend(_pp)
        else:
            for _pp in processed:
                ret.append(list(_pp))
        return ret
Example #9
    def test_dir_in(self):
        """
        Test DIRECTORY_IN
        """
        cur_path = "{}{}".format(os.getcwd(), os.sep)
        dir_in = "{}{}".format(cur_path, "some_dir_in")
        os.mkdir(dir_in)

        content = "this is some text to test directory_in".split(" ")

        for i, word in enumerate(content):
            temp = "{}{}{}".format(dir_in, os.sep, str(i))
            with open(temp, 'w') as f:
                f.write(word)

        res = self.dir_in_task(dir_in)
        cwod(dir_in)
        res = cwo(res)
        for word in content:
            self.assertIn(word, res, "missing word: {}".format(word))

        shutil.rmtree(dir_in)
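cwod here presumably aliases compss_wait_on_directory, which synchronizes the directory contents rather than a return value. A hypothetical task consistent with this test, assuming PyCOMPSs DIRECTORY_IN semantics:

    from pycompss.api.task import task
    from pycompss.api.parameter import DIRECTORY_IN

    # Hypothetical task: reads every file in the input directory and
    # returns the concatenated contents, so the test can check that each
    # written word is present in the result.
    @task(returns=str, dir_path=DIRECTORY_IN)
    def dir_in_task(dir_path):
        import os
        words = []
        for file_name in sorted(os.listdir(dir_path)):
            with open(os.path.join(dir_path, file_name)) as f:
                words.append(f.read())
        return " ".join(words)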
Example #10
    def take(self, num):
        """
        Retrieve the first num elements of the DDS.
        :param num: number of elements to be retrieved.
        :return: a list of at most num elements.
        """
        items = []
        partitions = self.collect(future_objects=True)
        taken = 0

        for part in partitions:
            _p = iter(cwo(part))
            while taken < num:
                try:
                    items.append(next(_p))
                    taken += 1
                except StopIteration:
                    break
            if taken >= num:
                break

        return items[:num]
Example #11
    def sort_by_key(self,
                    ascending=True,
                    num_of_parts=None,
                    key_func=lambda x: x):
        """

        :type key_func:
        :param num_of_parts:
        :param ascending:
        :return:
        """

        if num_of_parts is None:
            num_of_parts = len(self.partitions)

        # Collect everything to take samples
        col_parts = self.collect(future_objects=True)
        samples = list()
        for _part in col_parts:
            samples.append(task_collect_samples(_part, 20, key_func))

        samples = sorted(list(itertools.chain.from_iterable(cwo(samples))))

        bounds = [
            samples[int(len(samples) * (i + 1) / num_of_parts)]
            for i in range(0, num_of_parts - 1)
        ]

        def range_partitioner(key):
            p = bisect.bisect_left(bounds, key_func(key))
            if ascending:
                return p
            else:
                return num_of_parts - 1 - p

        def sort_partition(iterator):
            """
            Sort a partition locally, chunk by chunk, then merge the
            sorted chunks.
            :param iterator: elements of the partition.
            :return: an iterator over the sorted elements.
            """
            chunk_size = 500
            iterator = iter(iterator)
            chunks = list()
            while True:
                chunk = list(itertools.islice(iterator, chunk_size))
                chunk.sort(key=lambda kv: key_func(kv[0]),
                           reverse=not ascending)
                chunks.append(chunk)
                if len(chunk) < chunk_size:
                    # Input exhausted: the last chunk is short (or empty),
                    # so stop chunking and merge the sorted chunks below.
                    break

            return heapq3.merge(chunks,
                                key=lambda kv: key_func(kv[0]),
                                reverse=not ascending)

        partitioned = DDS().load(col_parts, -1).partition_by(range_partitioner)
        return partitioned.map_partitions(sort_partition)
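sort_by_key follows the classic sample-sort pattern: sample every partition, derive num_of_parts - 1 boundary keys, route each element to its target partition with bisect, then sort each partition locally and merge. The boundary logic in isolation, as a runnable sketch:

    import bisect
    import random

    def make_range_partitioner(samples, num_of_parts, key_func=lambda x: x):
        """Build a range partitioner from samples, as the code above does."""
        samples = sorted(samples)
        bounds = [samples[int(len(samples) * (i + 1) / num_of_parts)]
                  for i in range(num_of_parts - 1)]
        return lambda key: bisect.bisect_left(bounds, key_func(key))

    random.seed(0)
    data = [random.randint(0, 99) for _ in range(100)]
    partitioner = make_range_partitioner(random.sample(data, 20), 4)
    buckets = [[] for _ in range(4)]
    for x in data:
        buckets[partitioner(x)].append(x)
    # Each bucket only holds keys no larger than the next bucket's keys.
    assert all(max(buckets[i], default=0) <= min(buckets[i + 1], default=99)
               for i in range(3))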
Example #12
    def test_workflow(self):
        """
        Test multiple tasks with directory in, out, and inout params.
        """
        cur_path = "{}{}".format(os.getcwd(), os.sep)
        dir_t = "{}{}".format(cur_path, "some_dir_t")
        os.mkdir(dir_t)

        # len(phase_0[i]) == i
        res_phase_0 = []
        for i in range(0, 5, 1):
            res_phase_0.append(self.dir_inout_task_i(dir_t, i))

        # len(phase_1[i]) == 5
        res_phase_1 = []
        for i in range(0, 5, 1):
            res_phase_1.append(self.dir_in_task_i(dir_t))

        # len(phase_2[i]) == i + 5
        res_phase_2 = []
        for i in range(5, 10, 1):
            res_phase_2.append(self.dir_inout_task_i(dir_t, i))

        # len(phase_3[i]) == 10
        res_phase_3 = []
        for i in range(0, 5, 1):
            res_phase_3.append(self.dir_in_task_i(dir_t))

        # dir out should contain only the last file
        for i in range(0, 15, 1):
            self.dir_out_task_i(dir_t, i)

        res_phase_0 = cwo(res_phase_0)
        res_phase_1 = cwo(res_phase_1)
        res_phase_2 = cwo(res_phase_2)
        res_phase_3 = cwo(res_phase_3)
        cwod(dir_t)

        for i, res in enumerate(res_phase_0):
            self.assertEqual(
                len(res), i, "error in task #{} of phase 0: {} != {}".format(
                    i, len(res), i))

        for i, res in enumerate(res_phase_1):
            self.assertEqual(
                len(res), 5,
                "error in task #{} of phase 1: {} != 5".format(i, len(res)))

        for i, res in enumerate(res_phase_2):
            self.assertEqual(
                len(res), i + 5,
                "error in task #{} of phase 2: {} != {}".format(
                    i, len(res), i + 5))

        for i, res in enumerate(res_phase_3):
            self.assertEqual(
                len(res), 10,
                "error in task #{} of phase 3: {} != 10".format(i, len(res)))

        self.assertEqual(
            1, len(os.listdir(dir_t)),
            "directory should contain exactly 1 file, found: {}".format(
                len(os.listdir(dir_t))))

        shutil.rmtree(dir_t)
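The final assertion relies on the test's own comment that an out directory keeps only the last file: each dir_out_task_i call replaces the directory's previous contents. A hypothetical task consistent with that, assuming PyCOMPSs DIRECTORY_OUT semantics:

    from pycompss.api.task import task
    from pycompss.api.parameter import DIRECTORY_OUT

    # Hypothetical task: writes a single file named after i into an output
    # directory. With DIRECTORY_OUT, earlier contents are not transferred
    # in, which is why only the last task's file remains at the end.
    @task(dir_path=DIRECTORY_OUT)
    def dir_out_task(dir_path, i):
        import os
        os.makedirs(dir_path, exist_ok=True)
        with open(os.path.join(dir_path, str(i)), "w") as f:
            f.write("content of task {}".format(i))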
Example #13
def wordcount_k_means(dim=742):
    """
    Cluster the text files under sys.argv[1] with k-means, using per-file
    word-count vectors over the shared vocabulary as features.
    :param dim: number of dimensions (i.e. the vocabulary size); kept as a
                parameter to allow unit tests with different data sets.
    """
    import numpy as np

    f_path = sys.argv[1]

    start_time = time.time()

    vocab = DDS().load_files_from_dir(f_path, num_of_parts=4)\
        .flat_map(lambda x: x[1].split()) \
        .map(lambda x: ''.join(e for e in x if e.isalnum())) \
        .count_by_value(arity=2, as_dict=True, as_fo=True)

    total = len(os.listdir(f_path))
    max_iter = 2
    frags = 4
    epsilon = 1e-10
    size = total / frags
    k = 4
    # The number of dimensions corresponds to: dim = len(vocabulary)
    # dim = 742  # added as parameter to allow unittests with different dataset

    # To access file names by the indexes returned from the clusters.
    # load_files_from_list will also sort them alphabetically.
    indexes = [os.path.join(f_path, f) for f in sorted(os.listdir(f_path))]

    # step 2
    # wc_per_file = DDS().load_files_from_dir(files_path, num_of_parts=frags)\
    #     .map(__count_locally__, vocabulary)\
    #     .map(__gen_array__)\

    wc_per_file = list()

    for fn in sorted(os.listdir(f_path)):
        wc_per_file.append(task_count_locally(os.path.join(f_path, fn), vocab))

    mu = [np.random.randint(1, 3, dim) for _ in range(frags)]

    old_mu = []
    clusters = []
    n = 0

    while n < max_iter and not has_converged(mu, old_mu, epsilon):
        old_mu = mu
        clusters = [
            cluster_points_partial([wc_per_file[f]], mu, int(f * size))
            for f in range(frags)
        ]
        partial_result = [
            partial_sum([wc_per_file[f]], clusters[f], int(f * size))
            for f in range(frags)
        ]
        mu = merge_reduce(reduce_centers, partial_result)
        mu = cwo(mu)
        mu = [mu[c][1] / mu[c][0] for c in mu]
        while len(mu) < k:
            # Add a new random center if one of the centers has no points.
            mu.append(np.random.randint(1, 3, dim))
        n += 1

    clusters_with_frag = cwo(clusters)

    from collections import defaultdict
    cluster_sets = defaultdict(list)

    for _d in clusters_with_frag:
        for _k in _d:
            cluster_sets[_k] += [indexes[i] for i in _d[_k]]

    # step 4 and 5 combined
    sims_per_file = {}

    for k in cluster_sets:
        clus = cluster_sets[k]
        for fayl in clus:
            sims_per_file[fayl] = get_similar_files(fayl, clus)

    sims_per_file = cwo(sims_per_file)

    for k in list(sims_per_file.keys())[:10]:
        print(k, "-----------sims --------->", sims_per_file[k][:5])

    print("-----------------------------")
    print("Kmeans Timed {} (s)".format(time.time() - start_time))
    print("Iterations: ", n)