Example #1
def test_loader_functions():

    # Iterator loader
    dds = DDS().load(range(10)).collect()
    assert len(dds) == 10

    # Single file
    content = "Holala World!"
    desc, filename = tempfile.mkstemp()
    os.close(desc)  # close the descriptor from mkstemp; reopen by name below
    with open(filename, "wb") as tmp_f:
        tmp_f.write(bytearray(content, "utf8"))
    dds = DDS().load_file(filename).collect()
    os.remove(filename)
    assert dds[0] == content

    # Multiple files
    _dir = os.path.join(tempfile.gettempdir(), "multi")
    if not os.path.exists(_dir):
        os.mkdir(_dir)

    for _i in range(10):
        desc, file_path = tempfile.mkstemp(dir=_dir)
        os.close(desc)  # mkstemp already returns an absolute path
        with open(file_path, "wb") as tmp_f:
            tmp_f.write(bytearray(str(_i), "utf8"))

    dds = DDS().load_files_from_dir(_dir).collect()
    shutil.rmtree(_dir)

    dds = sorted(dds, key=lambda x: x[1])
    for _i in range(10):
        assert int(dds[_i][1]) == _i
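
All of these snippets assume the PyCOMPSs DDS API plus the standard-library modules they call; a plausible import preamble (exact module paths may vary across COMPSs versions):

import os
import random
import shutil
import sys
import tempfile
import time
from collections import defaultdict

from pycompss.dds import DDS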
Example #2
def test_word_count():
    # Word Count
    vocabulary = ["Holala", "World", "COMPSs", "Lorem", "Ipsum", "_filter_"]

    _dir = os.path.join(tempfile.gettempdir(), "wc")
    if not os.path.exists(_dir):
        os.mkdir(_dir)
    for _i in range(5):
        desc, file_path = tempfile.mkstemp(dir=_dir, suffix=".txt")
        os.close(desc)  # mkstemp already returns an absolute path
        with open(file_path, "w") as tmp_f:
            for word in vocabulary:
                tmp_f.write(word + " ")

    dds = DDS().load_files_from_dir(_dir) \
        .flat_map(lambda x: x[1].split())\
        .filter(lambda x: "_" not in x)\
        .count_by_value(as_dict=True)

    shutil.rmtree(_dir)

    for key, value in dds.items():
        assert "_" not in key
        assert value == 5
Example #3
def inverted_indexing():
    """
    Build an inverted index: map every word to the list of files that
    contain it.
    """
    path = sys.argv[1]
    start_time = time.time()
    result = DDS().load_files_from_dir(path).map_and_flatten(_invert_files)\
        .reduce_by_key(lambda a, b: a + b).collect()
    print(result[-1:])
    print("Elapsed Time {} (s)".format(time.time() - start_time))
Example #4
def test_terasort():
    """
    Check that sort_by_key() returns the keys in strictly increasing order.
    """
    dataset = [gen_fragment() for _ in range(10)]
    dds = DDS().load(dataset, -1).sort_by_key().collect()
    prev = 0

    for key, _value in dds:
        assert key > prev
        prev = key
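
gen_fragment is not defined in this snippet. A hypothetical stand-in, assuming each fragment is a list of random (key, value) pairs and that load(dataset, -1) accepts the list of fragments as pre-built partitions:

import random

def gen_fragment(pairs_per_fragment=50):
    # Hypothetical: keys drawn from (0, 1) are strictly positive, which
    # matches the assert key > prev check starting from prev = 0.
    return [(random.random(), random.randint(0, 1000))
            for _ in range(pairs_per_fragment)]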
Example #5
def inverted_indexing():
    """
    TODO: Missing documentation
    """
    path = sys.argv[1]
    start_time = time.time()
    result = DDS().load_files_from_dir(path).flat_map(_invert_files)\
        .reduce_by_key(lambda a, b: a + b).collect()
    print(result[-1:])
    print("Elapsed Time {} (s)".format(time.time() - start_time))
Example #6
def word_count():
    """
    Count word occurrences across all files in a directory.
    """
    path_file = sys.argv[1]
    start = time.time()

    results = DDS().load_files_from_dir(path_file) \
        .map_and_flatten(lambda x: x[1].split()) \
        .count_by_value(arity=4, as_dict=True)

    print("Results: " + str(results))
    print("Elapsed Time: ", time.time() - start)
Example #7
def pi_estimation():
    """
    Example is taken from: https://spark.apache.org/examples.html
    """
    print("Estimating Pi by 'throwing darts' algorithm.")
    tries = 100000
    print("Number of tries: {}".format(tries))

    count = DDS().load(range(0, tries), 10) \
        .filter(inside).count()
    print("Pi is roughly %f" % (4.0 * count / tries))
Example #8
def transitive_closure(partitions=None):
    """
    TODO: Missing documentation
    """
    if not partitions:
        partitions = int(sys.argv[2]) if len(sys.argv) > 2 else 2
    # Alternative: load the edge list from a text file:
    # path = sys.argv[1]
    # od = DDS().load_text_file(path, partitions) \
    #     .map(lambda line: (int(line.split(",")[0]), int(line.split(",")[1])))\
    #     .collect(future_objects=True)
    edges = _generate_graph()
    od = DDS().load(edges, partitions).collect(future_objects=True)

    # Because join() joins on keys, the edges are stored in reversed order.
    edges = DDS().load(od, -1).map(lambda x_y: (x_y[1], x_y[0]))

    next_count = DDS().load(od, -1).count()

    while True:
        old_count = next_count
        # Perform the join, obtaining an RDD of (y, (z, x)) pairs,
        # then project the result to obtain the new (x, z) paths.
        new_edges = DDS().load(od, -1).join(edges)\
            .map(lambda __a_b: (__a_b[1][1], __a_b[1][0]))
        od = DDS().load(od, -1).union(new_edges).distinct()\
            .collect(future_objects=True)

        next_count = DDS().load(od, -1).count()

        if next_count == old_count:
            break

    print("TC has %i edges" % next_count)
Example #9
def word_count():
    """
    TODO: Missing documentation
    """
    path_file = sys.argv[1]
    start = time.time()

    results = DDS().load_files_from_dir(path_file) \
        .flat_map(lambda x: x[1].split()) \
        .map(lambda x: ''.join(e for e in x if e.isalnum())) \
        .count_by_value(as_dict=True)

    print("Results: " + str(results))
    print("Elapsed Time: ", time.time() - start)
Example #10
def terasort():
    """
    """

    dir_path = sys.argv[1]
    dest_path = sys.argv[2]
    # partitions = sys.argv[2] if len(sys.argv) > 2 else -1

    start_time = time.time()

    dds = DDS().load_files_from_dir(dir_path)\
        .map_and_flatten(files_to_pairs)\
        .sort_by_key().save_as_text_file(dest_path)

    # compss_barrier()
    # test = DDS().load_pickle_files(dest_path).map(lambda x: x).collect()
    # print(test[-1:])

    print("Elapsed Time {} (s)".format(time.time() - start_time))
Example #11
def terasort():
    """
    TODO: Missing documentation
    """

    dir_path = sys.argv[1]
    dest_path = sys.argv[2]
    # Alternative: take the partition count from the command line:
    # partitions = sys.argv[2] if len(sys.argv) > 2 else -1

    start_time = time.time()

    dds = DDS().load_files_from_dir(dir_path) \
        .flat_map(files_to_pairs) \
        .sort_by_key().save_as_text_file(dest_path)

    # Optional check of the saved output, kept for reference:
    # compss_barrier()
    # test = DDS().load_pickle_files(dest_path).map(lambda x: x).collect()
    # print(test[-1:])

    print("Result: " + str(dds))
    print("Elapsed Time {} (s)".format(time.time() - start_time))
Example #12
def inverted_indexing():
    def _invert_files(pair):
        res = dict()
        for word in pair[1].split():
            res[word] = [pair[0]]
        return res.items()

    vocabulary = ["Holala", "World", "COMPSs", "Lorem", "Ipsum"]
    files = list()
    pairs = defaultdict(list)

    _dir = os.path.join(tempfile.gettempdir(), "ii")
    if not os.path.exists(_dir):
        os.mkdir(_dir)

    # Create files
    for _i in range(len(vocabulary) // 2):
        desc, filename = tempfile.mkstemp(dir=_dir, suffix=".txt")
        os.close(desc)  # mkstemp already returns an absolute path
        files.append(filename)

    for word in vocabulary:
        _files = random.sample(files, 2)
        for file_path in _files:
            with open(file_path, "a") as tmp_f:
                tmp_f.write(word + " ")
            pairs[word].append(file_path)

    result = DDS().load_files_from_dir(_dir).flat_map(_invert_files)\
        .reduce_by_key(lambda a, b: a + b).collect()

    shutil.rmtree(_dir)

    for word, file_list in result:  # avoid shadowing the outer `files` list
        assert set(pairs[word]).issubset(set(file_list))
Example #13
def wordcount_k_means(dim=742):
    """
    TODO: Missing documentation
    """
    import numpy as np

    f_path = sys.argv[1]

    start_time = time.time()

    vocab = DDS().load_files_from_dir(f_path, num_of_parts=4)\
        .flat_map(lambda x: x[1].split()) \
        .map(lambda x: ''.join(e for e in x if e.isalnum())) \
        .count_by_value(arity=2, as_dict=True, as_fo=True)

    total = len(os.listdir(f_path))
    max_iter = 2
    frags = 4
    epsilon = 1e-10
    size = total / frags
    k = 4
    # The number of dimensions corresponds to: dim = len(vocabulary)
    # dim = 742  # added as parameter to allow unittests with different dataset

    # To access file names by the indexes returned from the clusters;
    # load_files_from_list will also sort them alphabetically.
    indexes = [os.path.join(f_path, f) for f in sorted(os.listdir(f_path))]

    # step 2
    # wc_per_file = DDS().load_files_from_dir(files_path, num_of_parts=frags)\
    #     .map(__count_locally__, vocabulary)\
    #     .map(__gen_array__)\

    wc_per_file = list()

    for fn in sorted(os.listdir(f_path)):
        wc_per_file.append(task_count_locally(os.path.join(f_path, fn), vocab))

    mu = [np.random.randint(1, 3, dim) for _ in range(frags)]

    old_mu = []
    clusters = []
    n = 0

    while n < max_iter and not has_converged(mu, old_mu, epsilon):
        old_mu = mu
        clusters = [
            cluster_points_partial([wc_per_file[f]], mu, int(f * size))
            for f in range(frags)
        ]
        partial_result = [
            partial_sum([wc_per_file[f]], clusters[f], int(f * size))
            for f in range(frags)
        ]
        mu = merge_reduce(reduce_centers, partial_result)
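        # cwo is presumably compss_wait_on from pycompss.api.api, used to
        # synchronize the reduced centers before updating them locally.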
        mu = cwo(mu)
        mu = [mu[c][1] / mu[c][0] for c in mu]
        while len(mu) < k:
            # Add a new random center if one of the centers has no points.
            mu.append(np.random.randint(1, 3, dim))
        n += 1

    clusters_with_frag = cwo(clusters)

    from collections import defaultdict
    cluster_sets = defaultdict(list)

    for _d in clusters_with_frag:
        for _k in _d:
            cluster_sets[_k] += [indexes[i] for i in _d[_k]]

    # step 4 and 5 combined
    sims_per_file = {}

    for c_key in cluster_sets:  # avoid shadowing k, the number of centers
        clus = cluster_sets[c_key]
        for file_name in clus:
            sims_per_file[file_name] = get_similar_files(file_name, clus)

    sims_per_file = cwo(sims_per_file)

    for file_name in list(sims_per_file.keys())[:10]:
        print(file_name, "-----------sims --------->", sims_per_file[file_name][:5])

    print("-----------------------------")
    print("Kmeans Timed {} (s)".format(time.time() - start_time))
    print("Iterations: ", n)