def start_cluster(self):
    if self._cluster_started:
        print("Cluster already started")
        return

    self._s3_bucket_url = (
        f"s3://{self._config.s3_bucket_prefix}.{self._generate_random_string()}"
    )
    _, _, vpc_id = self._get_or_create_rds_instance()
    security_group_id = rds.get_custom_security_group_id(
        self._config.region, vpc_id)

    # Create resources
    print(f"Creating s3 bucket {self._s3_bucket_url} in {self._config.region}")
    self._run_aws(["s3", "mb", self._s3_bucket_url, "--region",
                   self._config.region]).check_returncode()

    print("Uploading cluster settings to bucket")
    s3helper.upload_file(
        s3helper.get_bucket_from_s3_url(self._s3_bucket_url),
        "config.json",
        self._config.json_show().encode(),
    )

    print(f"Creating cluster: {self._config.cluster_name} in "
          f"{self._config.kubernetes_zones}, with additional security group "
          f"{security_group_id}")
    self._run_kops([
        "create", "cluster", self._config.cluster_name,
        "--zones", self._config.kubernetes_zones,
        "--authorization", "AlwaysAllow",
        "--master-count", str(self._config.master_count),
        "--master-size", self._config.master_type,
        "--master-security-groups", str(security_group_id),
        "--node-size", self._config.slave_type,
        "--node-count", str(self._config.slave_count),
        "--node-security-groups", str(security_group_id),
        "--vpc", vpc_id,
        "--yes",
    ]).check_returncode()

    # Run resources
    print("Running cluster")
    self._run_kops(["update", "cluster", self._config.cluster_name,
                    "--yes"]).check_returncode()

    self._cluster_started = True
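# _run_aws and _run_kops are helpers defined elsewhere in this class; since
# their return values are checked with check_returncode(), they presumably
# wrap subprocess.run. The sketches below are hypothetical, assuming kops
# reads its state store from the standard KOPS_STATE_STORE environment
# variable (pointed at self._s3_bucket_url); the real helpers may differ.

import os
import subprocess


def _run_aws(self, args):
    # Invoke the AWS CLI; return the CompletedProcess so callers can use
    # check_returncode().
    return subprocess.run(["aws"] + args)


def _run_kops(self, args):
    # Run kops against this cluster's S3 state store.
    env = dict(os.environ, KOPS_STATE_STORE=self._s3_bucket_url)
    return subprocess.run(["kops"] + args, env=env)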
def main():
    if len(sys.argv) != 6:
        raise RuntimeError(
            "Usage: mapper <input file url> <output bucket url> <chunk start byte> <chunk end byte> <ranges>\n"
            'where ranges is a comma-separated list of ranges, e.g. "a-d,e-g,h-w,x-z", '
            "and the start/end bytes are inclusive/exclusive respectively.")

    src_bucket, src_filename = s3helper.get_bucket_and_file(sys.argv[1])
    dst_bucket = s3helper.get_bucket_from_s3_url(sys.argv[2])
    dst_directory = os.environ["JOB_ID"]
    chunk_range = (int(sys.argv[3]), int(sys.argv[4]))
    ranges = sys.argv[5].split(",")

    file_contents = s3helper.download_chunk(src_bucket, src_filename,
                                            chunk_range)

    output = {"word": [], "letter": []}
    for token in re_split.split(file_contents):
        mapper(token, output)

    for r in ranges:
        start, end = r.split("-")
        lrange = {chr(c) for c in range(ord(start), ord(end) + 1)}
        data = json.dumps(
            {
                "word":
                list(flatFilter(lambda x: x[0][0] in lrange, output["word"])),
                "letter":
                list(flatFilter(lambda x: x[0] in lrange, output["letter"])),
            },
            separators=(",", ":"),  # Remove whitespace
        )
        s3helper.upload_file(dst_bucket, f"{dst_directory}/{r}", data.encode())
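# The mapper and flatFilter helpers used above are defined elsewhere in the
# repository. The sketches below are purely illustrative guesses, consistent
# only with how their results are consumed (pairs whose key is a word or a
# single letter); they are not the actual implementations.


def mapper(token, output):
    # Hypothetical: emit one (word, 1) pair per token and one (letter, 1)
    # pair per alphabetic character.
    token = token.lower()
    if token:
        output["word"].append((token, 1))
    for ch in token:
        if ch.isalpha():
            output["letter"].append((ch, 1))


def flatFilter(predicate, items):
    # Hypothetical: a lazy filter; the caller only relies on it yielding the
    # (key, count) pairs that match the predicate.
    return (item for item in items if predicate(item))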
def set_existing_cluster(self):
    self._s3_bucket_url = input(
        "Enter state store url (e.g. s3://kubernetes.group8 or kubernetes.group8): "
    )
    if not self._s3_bucket_url.startswith("s3://"):
        self._s3_bucket_url = "s3://" + self._s3_bucket_url

    print("Downloading cluster config")
    contents = s3helper.download_file(
        s3helper.get_bucket_from_s3_url(self._s3_bucket_url), "config.json")
    self._config.json_load(contents.decode())

    self._cluster_started = True
def main():
    if len(sys.argv) != 3:
        raise RuntimeError(
            "Usage: reducer <input file urls> <output bucket url>\n"
            "where the input file urls are a comma-separated list of s3:// urls "
            "and the output url is an s3:// url.")

    input_urls = sys.argv[1].split(",")
    out_bucket = s3helper.get_bucket_from_s3_url(sys.argv[2])
    out_file = os.environ["JOB_ID"]

    inputs = [get_json(url) for url in input_urls]
    outputs = merge(inputs)

    json_output = json.dumps(outputs)
    s3helper.upload_file(bucket=out_bucket,
                         filename=out_file,
                         data_bytes=json_output.encode())
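# get_json and merge are defined elsewhere. Illustrative sketches only,
# assuming mapper output stores (token, count) pairs in lists while reducer
# output stores them in dicts (as suggested by reducer_output["word"].items()
# in master.py), and that json and s3helper are already imported here.

from collections import Counter


def get_json(url):
    # Hypothetical: download an s3:// url and parse it as JSON.
    bucket, filename = s3helper.get_bucket_and_file(url)
    return json.loads(s3helper.download_file(bucket, filename).decode())


def merge(inputs):
    # Hypothetical: sum the counts for each token across all inputs,
    # accepting both the list-of-pairs and dict shapes.
    merged = {"word": Counter(), "letter": Counter()}
    for data in inputs:
        for key in merged:
            pairs = data[key].items() if isinstance(data[key],
                                                    dict) else data[key]
            for token, count in pairs:
                merged[key][token] += count
    return {key: dict(counts) for key, counts in merged.items()}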
def run_spark_app(self):
    if not self._cluster_started:
        print("Cluster not started")
        return

    rds_host, rds_port, _ = self._get_or_create_rds_instance()

    input_url = input("Enter url to the input file: ")

    # Validate and convert to an S3 link
    input_url = s3helper.convert_url_to_s3(input_url)
    bucket_name = s3helper.get_bucket_from_s3_url(input_url)
    try:
        file_region = s3helper.get_bucket_region(bucket_name)
    except RuntimeError:
        print("Access denied when getting the region of the S3 bucket "
              f"{bucket_name}.")
        file_region = input(
            "Enter the region of the bucket (or blank for eu-west-2): ")
        if len(file_region) == 0:
            file_region = "eu-west-2"

    print("Resetting spark database tables")
    db.initialise_instance(
        host=rds_host,
        port=rds_port,
        db_name=RDS_DB_NAME,
        username=RDS_USERNAME,
        password=RDS_PASSWORD,
        table_suffix="spark",
    )

    print("Starting spark job")
    print("For large inputs, ignore warnings about WatchConnectionManager: "
          "they're heartbeat timeouts.")

    # Run the spark job
    env = self._setup_env(rds_host, rds_port)
    start_s = time.monotonic()
    subprocess.check_call(spark.spark_command(input_url, file_region, env))
    end_s = time.monotonic()
    print(f"Took {end_s - start_s}s")
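# s3helper.get_bucket_region (whose RuntimeError is caught above) is not
# shown here. A plausible sketch, assuming it is a thin wrapper around
# boto3's get_bucket_location; the actual helper may differ.

import boto3
from botocore.exceptions import ClientError


def get_bucket_region(bucket_name):
    # Ask S3 where the bucket lives, converting ClientError (e.g. access
    # denied) into the RuntimeError the caller expects. Buckets in us-east-1
    # report a null LocationConstraint.
    try:
        response = boto3.client("s3").get_bucket_location(Bucket=bucket_name)
    except ClientError as e:
        raise RuntimeError(
            f"Could not get the region of bucket {bucket_name}") from e
    return response["LocationConstraint"] or "us-east-1"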
def main():
    if len(sys.argv) not in [2, 3]:
        print("Usage: master.py <input-url> [chunk-size]")
        sys.exit(1)

    input_url = sys.argv[1]
    rds_host = os.environ["RDS_HOST"]
    rds_port = int(os.environ["RDS_PORT"])
    bucket_url = os.environ["AWS_S3_BUCKET"]
    master_id = os.environ["MASTER_ID"]
    app_name = os.environ["APP_NAME"]

    chunk_size = 25_000_000
    if len(sys.argv) == 3:
        chunk_size = int(sys.argv[2])
        if chunk_size <= 0:
            raise RuntimeError(
                f"Chunk size must be a positive number: got {chunk_size}")

    kube = authenticate_kubernetes()
    bucket_name = s3helper.get_bucket_from_s3_url(bucket_url)
    mr = MapReduce(master_id, kube, RANGES, MAPPER_IMAGE, REDUCER_IMAGE,
                   app_name)

    work_done = False
    state = 0

    # Computing chunk boundaries is slow: do it in the background while
    # spinning up mappers, so mapper output can start being reduced sooner.
    def spawn_mappers():
        for chunk in s3helper.get_chunks(input_url, chunk_size):
            chunk_computer_output.put(chunk)
        chunk_computer_output.close()

    chunk_computer = multiprocessing.Process(target=spawn_mappers)
    chunk_computer_output = multiprocessing.Queue()

    print("Starting to compute chunks")
    chunk_computer.start()

    # Event loop: update state and look for possible reductions.
    # Terminates when no work was done in the previous iteration, no jobs are
    # running, and all chunks have been consumed.
    while (work_done or mr.is_active() or chunk_computer.is_alive()
           or not chunk_computer_output.empty()):
        work_done = False
        mr.update_state()
        print(f"State {state} - Mappers: [{mr.mappers}] "
              f"Reducers: [{mr.reducers}]")
        state += 1

        try:
            while True:
                c1, c2 = chunk_computer_output.get(block=False)
                mr.start_mapper(input_url, bucket_url, str(c1), str(c2),
                                ",".join(RANGES))
        except multiprocessing.queues.Empty:
            pass

        # Reduce mapper output before other reducer output: mapper output is
        # earlier in the reduction process, so it needs more time to be
        # processed. Stop when there are not enough completed mappers to
        # start a new reducer with AND the completed mappers are not the
        # last few.
        while len(mr.mappers.completed) >= NUM_MAPPERS_TO_REDUCERS or (
                len(mr.mappers.running) == 0
                and len(mr.mappers.completed) > 0):
            to_reduce, remaining = take_at_most_n(mr.mappers.completed,
                                                  NUM_MAPPERS_TO_REDUCERS)
            mr.mappers.completed = remaining
            for tag in RANGES:
                mr.start_reducer(
                    tag,
                    ",".join(
                        get_s3_url(bucket_url, mapper.metadata.name, tag)
                        for mapper in to_reduce),
                    bucket_url,
                )
            work_done = True

        # Reduce multiple reducers over the same range when they are
        # compatible. The termination condition differs slightly because the
        # final completed reducer for a range does not need reducing again.
        for tag in RANGES:
            while (len(mr.reducers[tag].completed) >= NUM_REDUCERS_TO_REDUCERS
                   or (len(mr.reducers[tag].running) == 0
                       and len(mr.reducers[tag].completed) > 1)):
                to_reduce, remaining = take_at_most_n(
                    mr.reducers[tag].completed, NUM_REDUCERS_TO_REDUCERS)
                mr.reducers[tag].completed = remaining
                mr.start_reducer(
                    tag,
                    ",".join(
                        get_s3_url(bucket_url, reducer.metadata.name, "")
                        for reducer in to_reduce),
                    bucket_url,
                )
                work_done = True

        time.sleep(EVENT_LOOP_UPDATE_INTERVAL)

    print("Processing reducer outputs")

    # Collect the reducer outputs into a single dictionary
    output = {"word": [], "letter": []}
    for tag in RANGES:
        if len(mr.reducers[tag].completed) < 1:
            # It's valid for the input to contain no letters in a range
            continue
        elif len(mr.reducers[tag].completed) > 1:
            raise RuntimeError(
                f"Expected exactly one reducer for {tag}: got {mr.reducers[tag]}"
            )

        final_reducer_id = mr.reducers[tag].completed[0].metadata.name
        reducer_output = json.loads(
            s3helper.download_file(bucket_name, final_reducer_id).decode())
        output["word"].extend(reducer_output["word"].items())
        output["letter"].extend(reducer_output["letter"].items())

    # Sort outputs: decreasing by frequency, increasing by word
    for r in output:
        output[r].sort(key=lambda x: x[0])
        output[r].sort(key=lambda x: x[1], reverse=True)

    print("Writing results to database")
    write_to_db(rds_host, rds_port, output)
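# take_at_most_n and get_s3_url are defined elsewhere in master.py. Minimal
# hypothetical sketches, consistent with the mapper writing its output to
# <bucket>/<JOB_ID>/<range tag> and the reducer to <bucket>/<JOB_ID>
# (assuming JOB_ID is set to the pod name):


def take_at_most_n(items, n):
    # Split a list into (the first n items, the rest).
    return items[:n], items[n:]


def get_s3_url(bucket_url, job_name, tag):
    # Mapper output lives under a per-job directory keyed by range tag;
    # reducer output is a single object (tag passed as "").
    url = f"{bucket_url.rstrip('/')}/{job_name}"
    return f"{url}/{tag}" if tag else url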