Example #1
def do_job(pages_num, index, processed, to_do, another_to_do, letter, robot):
    # Pull URLs off the to_do queue: index pages whose name starts with one of
    # the given letters, and route every other URL to another_to_do.
    mapper = MapReduce()
    while pages_num >= 0:
        try:
            url = to_do.get_nowait()
        except queue.Empty:
            break
        else:
            if UrlProcessor.get_name(url)[0] in letter:
                if url in processed or not robot.is_allowed(url):
                    continue

                processed.append(url)

                parsed_page = UrlProcessor.get_parsed_page(url)
                paper_link = UrlProcessor.check_arxiv(url)
                if not paper_link:
                    word_counts = mapper([url])
                    word_counts.sort(key=operator.itemgetter(1))
                    word_counts.reverse()
                    index[url] = word_counts
                else:
                    save_structured()

                pages_num -= 1

                for url_new in UrlProcessor.get_links(url, parsed_page):
                    to_do.put(url_new)
            else:
                another_to_do.put(url)
    return True
Example #2
def main():
    description = ('%s\n%s' % (__author__, __description__))
    epilog = ('%s\n%s' % (__credits__, __copyright__))
    parser = argparse.ArgumentParser(description=description, epilog=epilog)

    parser.add_argument('-i',
                        '--input',
                        dest='input',
                        help='Text file in input',
                        type=str,
                        required=True)

    parser.add_argument('-t',
                        '--top',
                        dest='top',
                        help='Maximum number of words to print out',
                        type=int,
                        default=50)

    options = parser.parse_args()
    num_words = options.top

    data = read_data(options.input)

    mapreduce = MapReduce(mapper, reducer)
    word_counts = mapreduce(data)
    word_counts.sort(key=operator.itemgetter(1))
    word_counts.reverse()

    print('\nTOP %d WORDS BY FREQUENCY\n' % (num_words))
    show = word_counts[:num_words]
    longest = max(len(word) for word, count in show)
    for word, count in show:
        print('%-*s: %5s' % (longest + 1, word, count))
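Example #2 assumes mapper, reducer, and read_data helpers that are not shown. A minimal word-count sketch of what they might look like; the names, signatures, and behavior here are assumptions for illustration, not taken from the original project:

import string


def read_data(path):
    # Yield the input file one line at a time (assumed helper).
    with open(path, 'r') as fh:
        for line in fh:
            yield line


def mapper(line):
    # Emit a (word, 1) pair for every word on the line.
    cleaned = line.lower().translate(str.maketrans('', '', string.punctuation))
    return [(word, 1) for word in cleaned.split()]


def reducer(item):
    # Sum the partial counts gathered for a single word.
    word, counts = item
    return (word, sum(counts))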
Example #3
def main():
    """
        Contributors:
            - Scot Matson
    """
    # TODO: This needs to be dynamically generated. The data gets destroyed
    #       during runtime - GitHub does not store empty directories.
    #       
    #       Going parallel, we may want to use multiple directories to help
    #       separate data sets, i.e., web1, web2, web3....... etc.
    data_directory = './web_pages/'

    # This is the biggest bottleneck in the application
    # Parallelizing this piece would give a major efficiency boost
    Crawler.crawl_web('http://www.sjsu.edu', 10)

    # Build a list of the files we have
    filepaths = list()
    for filename in os.listdir(data_directory):
        filepaths.append(data_directory + filename)

    map_data = list()
    for filepath in filepaths:
        with open(filepath, 'r') as fh:
            map_data.append(MapReduce.map(filepath, fh))
        # Remove files after parsing
        os.remove(filepath)

    # Shuffle/group the mapped key-value pairs by key
    intermediate_data = MapReduce.unshuffle(map_data)

    # Reduce condenses each key's grouped values into a single result
    reduced_data = MapReduce.reduce(intermediate_data)

    # Output results in descending order by count
    with open('out/results.csv', 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        for tag in sorted(reduced_data, key=reduced_data.get, reverse=True):
            print(tag + ': ' + str(reduced_data[tag]))
            writer.writerow([tag, reduced_data[tag]])
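Example #3 calls MapReduce.map, MapReduce.unshuffle, and MapReduce.reduce without showing them. A rough sketch of what the grouping ("unshuffle") and reduce steps might do for word counting; the signatures and behavior here are assumptions for illustration only:

from collections import defaultdict


def unshuffle(map_data):
    # Group every (key, value) pair emitted by the map step by its key.
    grouped = defaultdict(list)
    for pairs in map_data:
        for key, value in pairs:
            grouped[key].append(value)
    return grouped


def reduce(intermediate_data):
    # Collapse each key's list of values into a single count.
    return {key: sum(values) for key, values in intermediate_data.items()}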
Example #4
def __init__(self, input_dir, output_dir, n_mappers, n_reducers):
    MapReduce.__init__(self, input_dir, output_dir, n_mappers, n_reducers)
Example #5
		name = split_values[0]
		if name in ('Shubham', 'Dhruv', 'Shantam'):
			emitter((name, int(split_values[1])))
		
	# Define custom reducer function
	def udf_reducer(key, values, emitter):
		result = 0
		for v in values:
			result += v
		emitter(key, result/len(values))

	
	# Execute MapReduce job
	m, r, f, kill_idx = getConf("data/config_test_score.txt")
	f = "data/"+f
	mapred = MapReduce(m, r, f, udf_mapper, udf_reducer, kill_idx)

	# Sequential test verification	
	print("Verifying MapReduce results:")
	# Read MapReduce results
	output_arr = mapred.read_output()
	averages_mapred = {}
	for line in output_arr:
		name, average = line.rsplit(':', 1)
		averages_mapred[name] = float(average)
	# Compute sequential results
	averages_seq = {'Shubham': 0, 'Dhruv': 0, 'Shantam': 0}
	score_count = {'Shubham': 0, 'Dhruv': 0, 'Shantam': 0}
	with open('data/test_scores.txt', 'r') as reader:
		input_data = reader.readlines()
	for idx, line in enumerate(input_data):
Example #6
#!/usr/bin/env python
# -*- coding: utf-8 -*-
## File name: main.py

import sys
sys.path.append('common')
sys.path.append('ub_model/mr')
from mapreduce import MapReduce
from mr_fac import MrFac
from productline_fac import ProductLineFac

if __name__ == '__main__':
  productline = sys.argv[1]
  mrname = sys.argv[2]
  
  mrfac = MrFac()
  mr = mrfac.getmr(mrname, productline)

  mrrunner = MapReduce()
  mrrunner.execute(mr)
Example #7
    def udf_mapper2(key, value, emitter):
        split_values = value.split(':')
        emitter(
            (0, (split_values[0], float(split_values[1])))
        )  # Same key so that we can handle all the data in one udf_reducer call

    # Reducer 2: Sort the remaining values and persist the top K
    def udf_reducer2(key, values, emitter):
        values = sorted(values, key=lambda x: x[1])
        for v in values[:K]:
            emitter(v[0])

    # Execute MapReduce job
    m, r, f, kill_idx = getConf("data/config_knn.txt")
    f = "data/" + f
    mapred = MapReduce(m, r, f, udf_mapper, udf_reducer, kill_idx)
    output_path_from_previous_job = os.path.abspath(mapred.OUT_DIR)
    mapred2 = MapReduce(m, 1, output_path_from_previous_job, udf_mapper2,
                        udf_reducer2, -1)

    # Sequential test verification
    print("Verifying MapReduce results:")
    # Read MapReduce results
    output_arr = mapred2.read_output()
    knn_mapred = set()
    for line in output_arr:
        knn_mapred.add(line)
    # Compute sequential results
    knn_seq = set()
    with open('data/knn-dataset.txt', 'r') as reader:
        input_data = reader.readlines()
Example #8
def __init__(self, input_dir, output_dir, n_mappers, n_reducers):
    MapReduce.__init__(self, input_dir, output_dir, n_mappers, n_reducers)
Example #9
    with bz2.BZ2File(filename) as file_:
        for i, line in enumerate(file_):
            text = ujson.loads(line)['body']
            doc = nlp(text)
            out = transform_doc(doc)
            output.append(out)
        print('Done processing file: {}'.format(filename))

    return output


def write_func(items):
    with io.open('/home/newscred/Workspace/extractor/train/output.txt',
                 'a+',
                 encoding='utf8') as f:
        for x in items:
            try:
                f.write(x)
            except Exception:
                # Skip items that cannot be written
                pass
    return


if __name__ == '__main__':
    input_files = glob.glob('train/*.bz2')

    mapper = MapReduce(transform_file, write_func)
    word_counts = mapper(input_files)

    print('Done processing!')
Example #10
def main():
    if len(sys.argv) not in [2, 3]:
        print("Usage: master.py <input-url> [chunk-size]")
        sys.exit(1)

    input_url = sys.argv[1]
    rds_host = os.environ["RDS_HOST"]
    rds_port = int(os.environ["RDS_PORT"])
    bucket_url = os.environ["AWS_S3_BUCKET"]
    master_id = os.environ["MASTER_ID"]
    app_name = os.environ["APP_NAME"]
    chunk_size = 25_000_000
    if len(sys.argv) == 3:
        chunk_size = int(sys.argv[2])
    if chunk_size <= 0:
        raise RuntimeError(
            f"Chunk size must be a positive number: got {chunk_size}")

    kube = authenticate_kubernetes()

    bucket_name = s3helper.get_bucket_from_s3_url(bucket_url)
    mr = MapReduce(master_id, kube, RANGES, MAPPER_IMAGE, REDUCER_IMAGE,
                   app_name)
    work_done = False
    state = 0

    # Computing chunks is slow: do it in the background while spinning up
    # mappers, so their output can start being reduced in the meantime.
    def spawn_mappers():
        for chunk in s3helper.get_chunks(input_url, chunk_size):
            chunk_computer_output.put(chunk)
        chunk_computer_output.close()

    chunk_computer = multiprocessing.Process(target=spawn_mappers)
    chunk_computer_output = multiprocessing.Queue()
    print("Starting to compute chunks")
    chunk_computer.start()

    # Event loop: update state and look for reduction opportunities.
    # Terminates once a pass does no work, no jobs are running, and the
    # chunk computer has finished with its queue drained.
    while (work_done or mr.is_active() or chunk_computer.is_alive()
           or not chunk_computer_output.empty()):
        work_done = False
        mr.update_state()
        print(
            f"State {state} - Mappers: [{mr.mappers}]    Reducers: [{mr.reducers}]"
        )
        state += 1

        try:
            while True:
                c1, c2 = chunk_computer_output.get(block=False)
                mr.start_mapper(input_url, bucket_url, str(c1), str(c2),
                                ",".join(RANGES))
        except multiprocessing.queues.Empty:
            pass

        # Reduce mapper output before reducing other reducers' output.
        # The reasoning behind this explicit order is that mapper results are
        # not as far along the reduction process, so they need more time to be
        # processed. The loop stops when there are not enough completed mappers
        # to start a new reducer AND the completed mappers are not the last
        # ones remaining.
        while len(mr.mappers.completed) >= NUM_MAPPERS_TO_REDUCERS or (len(
                mr.mappers.running) == 0 and len(mr.mappers.completed) > 0):
            to_reduce, remaining = take_at_most_n(mr.mappers.completed,
                                                  NUM_MAPPERS_TO_REDUCERS)
            mr.mappers.completed = remaining
            for tag in RANGES:
                mr.start_reducer(
                    tag,
                    ",".join(
                        get_s3_url(bucket_url, mapper.metadata.name, tag)
                        for mapper in to_reduce),
                    bucket_url,
                )
                work_done = True

        # Reduce multiple reducers when they are compatible
        # Termination condition is slightly different because the final completed
        # reducer does not need to be reduced.
        for tag in RANGES:
            while len(mr.reducers[tag].completed
                      ) >= NUM_REDUCERS_TO_REDUCERS or (
                          len(mr.reducers[tag].running) == 0
                          and len(mr.reducers[tag].completed) > 1):
                to_reduce, remaining = take_at_most_n(
                    mr.reducers[tag].completed, NUM_REDUCERS_TO_REDUCERS)
                mr.reducers[tag].completed = remaining
                mr.start_reducer(
                    tag,
                    ",".join(
                        get_s3_url(bucket_url, reducer.metadata.name, "")
                        for reducer in to_reduce),
                    bucket_url,
                )
                work_done = True
        time.sleep(EVENT_LOOP_UPDATE_INTERVAL)

    print("Processing reducer outputs")
    # Collect the reducer outputs into a single dictionary
    output = {"word": [], "letter": []}
    for tag in RANGES:
        if len(mr.reducers[tag].completed) < 1:
            continue  # It's valid for the input to contain no letters in a range
        elif len(mr.reducers[tag].completed) > 1:
            raise RuntimeError(
                f"Expected exactly one reducer for {tag}: got {mr.reducers[tag]}"
            )
        final_reducer_id = mr.reducers[tag].completed[0].metadata.name
        reducer_output = json.loads(
            s3helper.download_file(bucket_name, final_reducer_id).decode())
        output["word"].extend(reducer_output["word"].items())
        output["letter"].extend(reducer_output["letter"].items())

    # Sort outputs: decreasing by frequency, increasing by word
    for r in output:
        output[r].sort(key=lambda x: x[0])
        output[r].sort(key=lambda x: x[1], reverse=True)

    print("Writing results to database")
    write_to_db(rds_host, rds_port, output)
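Example #10 relies on a take_at_most_n helper that is not shown. One plausible implementation, assuming it simply splits a list into the first n items and the remainder (an assumption for illustration, not the project's actual code):

def take_at_most_n(items, n):
    # Return (up to the first n items, the untouched remainder).
    return items[:n], items[n:]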
Example #11
import glob

import numpy as np

from mapreduce import MapReduce

input_images = glob.glob("data/LC08*.tif")

output_image = "data/output.tif"


def map_func(image):
    # NDVI = (NIR - RED) / (NIR + RED), scaled by 1000 and cast to int32.
    nir = image[0:1, :, :]
    red = image[2:3, :, :]
    ndvi = (nir - red) / (nir + red + 0.000001)
    ndvi = np.array(ndvi * 1000).astype(np.int32)
    return ndvi


def reduce_func(images):
    # Pixel-wise maximum across all mapped NDVI rasters.
    return np.max(images, axis=0)


mapreduce = MapReduce(input_images=input_images,
                      output_image=output_image,
                      map_func=map_func,
                      reduce_func=reduce_func)

mapreduce.run()
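For reference, the map/reduce pair in Example #11 amounts to computing an NDVI raster per image and then taking a pixel-wise maximum across all of them; a sequential sketch, assuming the rasters are already loaded as NumPy arrays shaped (bands, rows, cols):

def sequential_max_ndvi(images):
    # Apply map_func to every image, then reduce with a pixel-wise maximum.
    return reduce_func([map_func(image) for image in images])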
Example #12
def __init__(self, np=None):
    MapReduce.__init__(self, backend=backends.ThreadBackend)
Example #13
def __init__(self, np=None):
    MapReduce.__init__(self, backend=backends.ProcessBackend)