Example #1
def count_items(input, output):
    """Sum all the items in the input data set"""
    # Intermediate file name
    inter = output + '_inter'

    # Run the task with specified mapper and reducer methods
    prince.run(count_mapper, count_reducer, input, inter, inputformat='text', outputformat='text', files=__file__)
    prince.run(sum_mapper, count_reducer, inter + '/part*', output, inputformat='text', outputformat='text', files=__file__)

    # Read the first line of the output file and return the total count
    file = prince.dfs.read(output + '/part*', first=1)
    return int(file.split()[1])
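The excerpt assumes that count_mapper, sum_mapper and count_reducer are defined in the same file, which is why it ships itself to the workers with files=__file__. A minimal sketch of what they could look like, modeled on the wc_reducer signature shown in Example #8; the bodies and field layout are assumptions, not the library's actual code:

def count_mapper(key, value):
    """Assumed mapper: emit a count of 1 for every item on the line"""
    for item in value.split():
        yield item, 1

def sum_mapper(key, value):
    """Assumed mapper: fold every per-item count onto a single key, so
    the second job reduces the whole data set down to one 'total' line"""
    item, count = value.split()
    yield 'total', int(count)

def count_reducer(key, values):
    """Sum the counts received for one key; shared by both jobs"""
    try:
        yield key, sum(int(v) for v in values)
    except ValueError:
        pass  # discard non-numerical values

Reusing count_reducer in both jobs works because each job only ever needs to sum integer counts; the second field of the single final line is then the global total that count_items returns.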
Example #2
def count_items(input, output):
    """Sum all the items in the input data set"""
    # Intermediate file name
    inter = output + '_inter'

    # Run the task with specified mapper and reducer methods
    prince.run(count_mapper,
               count_reducer,
               input,
               inter,
               inputformat='text',
               outputformat='text',
               files=__file__)
    prince.run(sum_mapper,
               count_reducer,
               inter + '/part*',
               output,
               inputformat='text',
               outputformat='text',
               files=__file__)

    # Read the first line of the output file and return the total count
    file = prince.dfs.read(output + '/part*', first=1)
    return int(file.split()[1])
Example #3
    part     = '/part-00000'
    options  = {'damping': damping, 'precision': precision, 'nb_nodes': len(graph)}

    # Create the initial values
    pagerank_current = pagerank % iteration_start
    if iteration_start == 1:
        pagerank_values = [(n, make_value(pr_init, pr_init, n_adjacent)) for n, n_adjacent in graph.items()]
        prince.dfs.write(pagerank_current + part, pagerank_values)
        iteration_start += 1

    stop = False
    iteration = iteration_start
    while not stop and iteration < iteration_max:
        # Update file names
        pagerank_previous = pagerank_current
        pagerank_current  = pagerank % iteration
        term_current      = term % iteration

        # Compute the new PageRank values
        prince.run(pagerank_mapper, pagerank_reducer, pagerank_previous + suffix, pagerank_current,
                   [], options, 'text', 'text')

        # Termination: check if all PageRank values are stable
        prince.run(term_mapper, term_reducer, pagerank_current + suffix, term_current,
                   [], options, 'text', 'text')
        term_value = prince.dfs.read(term_current + suffix)
        stop = int(term_value.split()[1])

        # Get ready for the next iteration
        iteration += 1
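This fragment leaves make_value and the four mapper/reducer methods undefined, and it does not show how the options dictionary reaches them on the workers. The sketch below is one way they could look, with module-level constants standing in for options['damping'], options['precision'] and options['nb_nodes']; the record layout and every body here are assumptions, not the actual implementation:

DAMPING   = 0.85    # stand-in for options['damping']
PRECISION = 1e-6    # stand-in for options['precision']
NB_NODES  = 1000    # stand-in for options['nb_nodes']

def make_value(pr, pr_previous, adjacent):
    """Assumed record layout: current rank, previous rank, adjacency list"""
    return '%f %f %s' % (pr, pr_previous, ' '.join(str(n) for n in adjacent))

def pagerank_mapper(key, value):
    fields = value.split()
    pr, adjacent = float(fields[0]), fields[2:]
    # Re-emit the node structure so the reducer can rebuild the record
    yield key, 'S %f %s' % (pr, ' '.join(adjacent))
    # Spread the node's current rank evenly over its outgoing links
    for node in adjacent:
        yield node, 'P %.12f' % (pr / len(adjacent))

def pagerank_reducer(key, values):
    incoming, pr_previous, adjacent = 0.0, 0.0, []
    for v in values:
        fields = v.split()
        if fields[0] == 'P':
            incoming += float(fields[1])
        else:
            pr_previous, adjacent = float(fields[1]), fields[2:]
    pr = (1.0 - DAMPING) / NB_NODES + DAMPING * incoming
    yield key, make_value(pr, pr_previous, adjacent)

def term_mapper(key, value):
    pr, pr_previous = [float(f) for f in value.split()[:2]]
    # 1 if this node's rank has converged, 0 otherwise
    yield 'stable', 1 if abs(pr - pr_previous) < PRECISION else 0

def term_reducer(key, values):
    # The single output line reads "stable 1" only when every node has
    # converged, which is what the driver tests with term_value.split()[1]
    yield key, min(int(v) for v in values)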
Example #4
import sys
import prince

# Methods from wordcount are now in the local name space
from wordcount import * 


def display_usage():
    print 'usage: %s input output' % sys.argv[0]
    print '  input: input file on the DFS'
    print '  output: output file on the DFS'


if __name__ == "__main__":
    # Always call prince.init() at the beginning of the program
    prince.init()

    if len(sys.argv) != 3:
        display_usage()
        sys.exit(0)

    input  = sys.argv[1]
    output = sys.argv[2]

    # Run the task with the mapper and reducer methods from the wordcount.py file
    # Note that the file wordcount.py is added to the 'files' argument
    prince.run(wc_mapper, wc_reducer, input, output, inputformat='text', outputformat='text', files='wordcount.py')

    # Read the output file and print it 
    file = prince.dfs.read(output + '/part*')
    print file
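Running this requires a wordcount.py module next to the script. A minimal sketch of its contents, reusing the wc_reducer that appears verbatim in Example #8; the wc_mapper body is an assumption:

# wordcount.py -- shipped to the workers through the 'files' argument

def wc_mapper(key, value):
    """Assumed mapper: 'key' is the line number, 'value' the line content"""
    for word in value.split():
        yield word, 1

def wc_reducer(key, values):
    """Reducer method with 'key' a string and 'values' a generator of strings"""
    try:
        yield key, sum(int(v) for v in values)
    except ValueError:
        pass  # discard non-numerical values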
Example #5
if __name__ == "__main__":
    # Always call prince.init() at the beginning of the program
    prince.init()

    if len(sys.argv) != 3:
        display_usage()
        sys.exit(0)

    input  = sys.argv[1]
    output = sys.argv[2] + '%04d'
    sorted = sys.argv[2] + '_sorted'
    suffix = '/part*'

    # Create the initial buckets from the data
    prince.run(init_mapper, init_reducer, input, output % 0, inputformat='text', outputformat='text')

    stop = False
    iteration = 1
    while not stop:
        # Merge current buckets
        previous = output % (iteration - 1)
        current  = output % iteration
        prince.run(merge_mapper, merge_reducer, previous + suffix, current, inputformat='text', outputformat='text')
 
        # Check if sort is done
        state = prince.dfs.read(current + suffix, last=1) 
        if int(state.split()[0]) == 0:
            stop = True
        iteration += 1
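The init_* and merge_* methods are not part of this excerpt, so the logic has to be inferred from the driver: each pass presumably merges the sorted buckets of the previous pass pairwise, and the first field of the last output line counts what is left to merge, so a leading 0 means a single sorted bucket remains. A single-machine toy analogue of that loop (the distributed mapper/reducer pair itself is not shown here and is an assumption):

import heapq

def merge_pass(buckets):
    """Toy analogue of one merge_mapper/merge_reducer pass: merge
    adjacent sorted buckets pairwise, roughly halving their number"""
    return [list(heapq.merge(*buckets[i:i + 2]))
            for i in range(0, len(buckets), 2)]

buckets = [[3], [1], [4], [1], [5]]   # init pass: one bucket per item
while len(buckets) > 1:               # analogue of the stop flag:
    buckets = merge_pass(buckets)     # stop once nothing is left to merge
print buckets[0]                      # prints [1, 1, 3, 4, 5]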
Example #6
import sys
import prince

# Methods from wordcount are now in the local name space
from wordcount import *


def display_usage():
    print 'usage: %s input output' % sys.argv[0]
    print '  input: input file on the DFS'
    print '  output: output file on the DFS'


if __name__ == "__main__":
    # Always call prince.init() at the beginning of the program
    prince.init()

    if len(sys.argv) != 3:
        display_usage()
        sys.exit(0)

    input = sys.argv[1]
    output = sys.argv[2]

    # Run the task with the mapper and reducer methods from the wordcount.py file
    # Note that the file wordcount.py is added to the 'files' argument
    prince.run(wc_mapper,
               wc_reducer,
               input,
               output,
               inputformat='text',
               outputformat='text',
               files='wordcount.py')

    # Read the output file and print it
    file = prince.dfs.read(output + '/part*')
    print file
Example #7
    # Create the initial values
    pagerank_current = pagerank % iteration_start
    if iteration_start == 1:
        pagerank_values = [(n, make_value(pr_init, pr_init, n_adjacent))
                           for n, n_adjacent in graph.items()]
        prince.dfs.write(pagerank_current + part, pagerank_values)
        iteration_start += 1

    stop = False
    iteration = iteration_start
    while not stop and iteration < iteration_max:
        # Update file names
        pagerank_previous = pagerank_current
        pagerank_current = pagerank % iteration
        term_current = term % iteration

        # Compute the new PageRank values
        prince.run(pagerank_mapper, pagerank_reducer,
                   pagerank_previous + suffix, pagerank_current, [], options,
                   'text', 'text')

        # Termination: check if all PageRank values are stable
        prince.run(term_mapper, term_reducer, pagerank_current + suffix,
                   term_current, [], options, 'text', 'text')
        term_value = prince.dfs.read(term_current + suffix)
        stop = int(term_value.split()[1])

        # Get ready for the next iteration
        iteration += 1
Example #8
import sys
import prince


def wc_reducer(key, values):
    """Reducer method with 'key' a string and 'values' a generator of strings"""
    try:
        yield key, sum(int(v) for v in values)
    except ValueError:
        pass  # discard non-numerical values


def display_usage():
    print 'usage: ./%s input output' % sys.argv[0]
    print '  input: input file on the DFS'
    print '  output: output file on the DFS'


if __name__ == "__main__":
    # Always call prince.init() at the beginning of the program
    prince.init()

    if len(sys.argv) != 3:
        display_usage()
        sys.exit(0)

    input  = sys.argv[1]
    output = sys.argv[2]

    # Run the task with specified mapper and reducer methods
    prince.run(wc_mapper, wc_reducer, input, output, inputformat='text', outputformat='text')

    # Read the output file and print it 
    file = prince.dfs.read(output + '/part*')
    print file
Example #9
    options  = {'graph': filename_graph, 'source': source_node}

    # Create the initial frontier with the tuple (source, 0)
    frontier_current = frontier % iteration_start
    if iteration_start == 1:
        prince.dfs.write(frontier_current + part, (source_node, '%d %d' % (sys.maxint, 0)))
        iteration_start += 1

    stop = False
    iteration = iteration_start
    while not stop and iteration < iteration_max:
        # Update file names
        frontier_previous = frontier_current
        frontier_current  = frontier % iteration
        term_current      = term % iteration

        # Compute the new frontier
        prince.run(frontier_mapper, frontier_reducer, frontier_previous + suffix, frontier_current,
                   filename_graph, options, 'text', 'text')
        print prince.dfs.read(frontier_current + suffix)

        # Termination: check if all distances are stable
        prince.run(term_mapper, term_reducer, frontier_current + suffix, term_current,
                   filename_graph, options, 'text', 'text')
        term_value = prince.dfs.read(term_current + suffix)
        print term_value
        stop = int(term_value.split()[1])

        # Get ready for the next iteration
        iteration += 1
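The frontier and termination methods are defined elsewhere; the initial record written for the source, '%d %d' % (sys.maxint, 0), suggests that each line stores a previous and a current distance per node. Below is a sketch under that assumption, with a module-level adjacency dictionary standing in for whatever the real mappers load from the graph file shipped via filename_graph; all of it is a guess at the technique, not the actual code:

import sys

adjacency = {}   # assumed to be loaded from options['graph'] on each worker

def frontier_mapper(key, value):
    """Assumed record: 'value' holds "previous_distance current_distance" """
    distance = int(value.split()[1])
    yield key, 'D %d' % distance          # keep the node's own distance
    if distance < sys.maxint:             # relax every outgoing edge
        for node in adjacency.get(key, []):
            yield node, 'C %d' % (distance + 1)

def frontier_reducer(key, values):
    previous, candidates = sys.maxint, []
    for v in values:
        tag, distance = v.split()
        if tag == 'D':
            previous = int(distance)
        else:
            candidates.append(int(distance))
    # Store old and new distance so the termination job can compare them
    yield key, '%d %d' % (previous, min([previous] + candidates))

def term_mapper(key, value):
    previous, current = [int(f) for f in value.split()[:2]]
    yield 'stable', 1 if previous == current else 0

def term_reducer(key, values):
    # A single "stable 1" output line stops the driver loop
    yield key, min(int(v) for v in values)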
Example #10
    # Create the initial frontier with the tuple (source, 0)
    frontier_current = frontier % iteration_start
    if iteration_start == 1:
        prince.dfs.write(frontier_current + part,
                         (source_node, '%d %d' % (sys.maxint, 0)))
        iteration_start += 1

    stop = False
    iteration = iteration_start
    while not stop and iteration < iteration_max:
        # Update file names
        frontier_previous = frontier_current
        frontier_current = frontier % iteration
        term_current = term % iteration

        # Compute the new frontier
        prince.run(frontier_mapper, frontier_reducer,
                   frontier_previous + suffix, frontier_current,
                   filename_graph, options, 'text', 'text')
        print prince.dfs.read(frontier_current + suffix)

        # Termination: check if all distances are stable
        prince.run(term_mapper, term_reducer, frontier_current + suffix,
                   term_current, filename_graph, options, 'text', 'text')
        term_value = prince.dfs.read(term_current + suffix)
        print term_value
        stop = int(term_value.split()[1])

        # Get ready for the next iteration
        iteration += 1