Example #1
    def start_cluster(self):
        if self._cluster_started:
            print("Cluster already started")
            return

        self._s3_bucket_url = (
            f"s3://{self._config.s3_bucket_prefix}.{self._generate_random_string()}"
        )

        _, _, vpc_id = self._get_or_create_rds_instance()
        security_group_id = rds.get_custom_security_group_id(
            self._config.region, vpc_id)

        # Create resources
        print(
            f"Creating s3 bucket {self._s3_bucket_url} in {self._config.region}"
        )
        self._run_aws(
            ["s3", "mb", self._s3_bucket_url, "--region",
             self._config.region]).check_returncode()
        print("Uploading cluster settings to bucket")
        s3helper.upload_file(
            s3helper.get_bucket_from_s3_url(self._s3_bucket_url),
            "config.json",
            self._config.json_show().encode(),
        )
        print(
            f"Creating cluster: {self._config.cluster_name} in {self._config.kubernetes_zones}, with additional security group {security_group_id}"
        )
        self._run_kops([
            "create",
            "cluster",
            self._config.cluster_name,
            "--zones",
            self._config.kubernetes_zones,
            "--authorization",
            "AlwaysAllow",
            "--master-count",
            str(self._config.master_count),
            "--master-size",
            self._config.master_type,
            "--master-security-groups",
            str(security_group_id),
            "--node-size",
            self._config.slave_type,
            "--node-count",
            str(self._config.slave_count),
            "--node-security-groups",
            str(security_group_id),
            "--vpc",
            vpc_id,
            "--yes",
        ]).check_returncode()

        # Run resources
        print("Running cluster")
        self._run_kops(
            ["update", "cluster", self._config.cluster_name,
             "--yes"]).check_returncode()
        self._cluster_started = True
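start_cluster relies on two helpers that are not part of this excerpt, self._run_aws and self._run_kops. Judging by the .check_returncode() calls they are thin wrappers around subprocess.run; the sketch below shows that idea as free functions. The names, the signatures and the KOPS_STATE_STORE detail are assumptions, not the project's actual code.

import os
import subprocess

def run_aws(args):
    # Hypothetical stand-in for self._run_aws(): invoke the AWS CLI and return
    # the CompletedProcess so the caller can .check_returncode().
    return subprocess.run(["aws"] + list(args))

def run_kops(args, state_store_url):
    # Hypothetical stand-in for self._run_kops(): kops reads its state store
    # from KOPS_STATE_STORE, so pass the cluster's S3 bucket URL that way.
    env = dict(os.environ, KOPS_STATE_STORE=state_store_url)
    return subprocess.run(["kops"] + list(args), env=env)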
Example #2
def main():
    if len(sys.argv) != 6:
        raise RuntimeError(
            "Usage: mapper <input file url> <output bucket url> <chunk start byte> <chunk end byte> <ranges>\n"
            +
            'where ranges is a comma-separated list of ranges, eg. "a-d,e-g,h-w,x-z" and the start/end bytes are inclusive/exclusive respectively.'
        )
    src_bucket, src_filename = s3helper.get_bucket_and_file(sys.argv[1])
    dst_bucket = s3helper.get_bucket_from_s3_url(sys.argv[2])
    dst_directory = os.environ["JOB_ID"]
    chunk_range = (int(sys.argv[3]), int(sys.argv[4]))
    ranges = sys.argv[5].split(",")

    file_contents = s3helper.download_chunk(src_bucket, src_filename,
                                            chunk_range)
    output = {"word": [], "letter": []}
    for token in re_split.split(file_contents):
        mapper(token, output)

    for r in ranges:
        start, end = r.split("-")
        lrange = {chr(c) for c in range(ord(start), ord(end) + 1)}
        data = json.dumps(
            {
                "word":
                list(flatFilter(lambda x: x[0][0] in lrange, output["word"])),
                "letter":
                list(flatFilter(lambda x: x[0] in lrange, output["letter"])),
            },
            separators=[",", ":"],  # Remove whitespace
        )
        s3helper.upload_file(dst_bucket, f"{dst_directory}/{r}", data.encode())
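This mapper calls re_split, mapper and flatFilter, none of which are defined in the excerpt. One plausible reading, inferred only from how they are used above (and assuming download_chunk returns decoded text), is:

import re

# Assumed tokeniser: split the chunk on any run of non-letter characters.
re_split = re.compile(r"[^a-zA-Z]+")

def mapper(token, output):
    # Assumed map step: emit one (key, count) pair for the word and one for
    # its first letter.
    if token:
        token = token.lower()
        output["word"].append((token, 1))
        output["letter"].append((token[0], 1))

def flatFilter(predicate, pairs):
    # Assumed helper: behaves like the built-in filter over (key, count) pairs.
    return (pair for pair in pairs if predicate(pair))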
Example #3
    def set_existing_cluster(self):
        self._s3_bucket_url = input(
            "Enter state store url (eg. s3://kubernetes.group8 or kubernetes.group8): "
        )
        if not self._s3_bucket_url.startswith("s3://"):
            self._s3_bucket_url = "s3://" + self._s3_bucket_url
        print("Downloading cluster config")
        contents = s3helper.download_file(
            s3helper.get_bucket_from_s3_url(self._s3_bucket_url),
            "config.json")
        self._config.json_load(contents.decode())
        self._cluster_started = True
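The s3helper module used throughout these examples is not reproduced here. Its URL-parsing helpers presumably look something like the following, assuming bucket URLs of the form s3://bucket/key; the real module may differ.

def get_bucket_from_s3_url(url):
    # "s3://my-bucket/some/key" -> "my-bucket" (assumed behaviour)
    return url[len("s3://"):].split("/", 1)[0]

def get_bucket_and_file(url):
    # "s3://my-bucket/some/key" -> ("my-bucket", "some/key") (assumed behaviour)
    bucket, _, key = url[len("s3://"):].partition("/")
    return bucket, key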
Example #4
def main():
    if len(sys.argv) != 3:
        raise RuntimeError(
            "Usage: reducer <input file urls> <output bucket url>\n" +
            "where the input file urls are a comma separated list of s3:// urls and the output url is a s3:// url."
        )
    input_urls = sys.argv[1].split(",")
    out_bucket = s3helper.get_bucket_from_s3_url(sys.argv[2])
    out_file = os.environ["JOB_ID"]

    inputs = [get_json(url) for url in input_urls]
    outputs = merge(inputs)
    json_output = json.dumps(outputs)

    s3helper.upload_file(bucket=out_bucket,
                         filename=out_file,
                         data_bytes=json_output.encode())
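get_json and merge are also left out of the excerpt. Because Example #6 later calls .items() on a reducer's output, the reducer presumably produces key-to-count mappings; the sketch below is a guess under that assumption (collections.Counter and the pair/mapping handling are mine), reusing the project-local s3helper module from the other examples.

import json
from collections import Counter

import s3helper  # the project-local module used throughout these examples

def _pairs(section):
    # Mapper output is assumed to be a list of (key, count) pairs, while a
    # previous reducer's output is a key -> count mapping; accept either.
    return section.items() if isinstance(section, dict) else section

def get_json(url):
    # Download one mapper/reducer output file and parse it.
    bucket, filename = s3helper.get_bucket_and_file(url)
    return json.loads(s3helper.download_file(bucket, filename).decode())

def merge(inputs):
    # Sum the per-key counts across every input.
    words, letters = Counter(), Counter()
    for data in inputs:
        for key, count in _pairs(data["word"]):
            words[key] += count
        for key, count in _pairs(data["letter"]):
            letters[key] += count
    return {"word": words, "letter": letters}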
Example #5
    def run_spark_app(self):
        if not self._cluster_started:
            print("Cluster not started")
            return

        rds_host, rds_port, _ = self._get_or_create_rds_instance()

        input_url = input("Enter url to the input file: ")
        # Validate and convert to an S3 link
        input_url = s3helper.convert_url_to_s3(input_url)

        bucket_name = s3helper.get_bucket_from_s3_url(input_url)
        try:
            file_region = s3helper.get_bucket_region(bucket_name)
        except RuntimeError:
            print(
                f"Access denied when getting the region of the S3 bucket {bucket_name}."
            )
            file_region = input(
                "Enter the region of the bucket (or blank for eu-west-2): ")
            if len(file_region) == 0:
                file_region = "eu-west-2"

        print("Resetting spark database tables")
        db.initialise_instance(
            host=rds_host,
            port=rds_port,
            db_name=RDS_DB_NAME,
            username=RDS_USERNAME,
            password=RDS_PASSWORD,
            table_suffix="spark",
        )
        print("Starting spark job")
        print(
            "For large inputs, ignore warnings about WatchConnectionManager: they're heartbeat timeouts."
        )
        # Run the spark job
        env = self._setup_env(rds_host, rds_port)

        start_s = time.monotonic()
        subprocess.check_call(spark.spark_command(input_url, file_region, env))
        end_s = time.monotonic()
        print(f"Took {end_s - start_s}s")
Example #6
def main():
    if len(sys.argv) not in [2, 3]:
        print("Usage: master.py <input-url> [chunk-size]")
        sys.exit(1)

    input_url = sys.argv[1]
    rds_host = os.environ["RDS_HOST"]
    rds_port = int(os.environ["RDS_PORT"])
    bucket_url = os.environ["AWS_S3_BUCKET"]
    master_id = os.environ["MASTER_ID"]
    app_name = os.environ["APP_NAME"]
    chunk_size = 25_000_000
    if len(sys.argv) == 3:
        chunk_size = int(sys.argv[2])
    if chunk_size <= 0:
        raise RuntimeError(
            f"Chunk size must be a positive number: got {chunk_size}")

    kube = authenticate_kubernetes()

    bucket_name = s3helper.get_bucket_from_s3_url(bucket_url)
    mr = MapReduce(master_id, kube, RANGES, MAPPER_IMAGE, REDUCER_IMAGE,
                   app_name)
    work_done = False
    state = 0

    # Computing chunk boundaries is slow, so do it in a background process
    # while mappers spin up; that lets reduction of mapper output start sooner.
    chunk_computer_output = multiprocessing.Queue()

    def compute_chunks():
        for chunk in s3helper.get_chunks(input_url, chunk_size):
            chunk_computer_output.put(chunk)
        chunk_computer_output.close()

    chunk_computer = multiprocessing.Process(target=compute_chunks)
    print("Starting to compute chunks")
    chunk_computer.start()

    # Event loop: refresh job state and look for reduction opportunities.
    # It terminates once an iteration does no work, no mapper/reducer jobs
    # are active, and no more chunks are being (or waiting to be) produced.
    while (work_done or mr.is_active() or chunk_computer.is_alive()
           or not chunk_computer_output.empty()):
        work_done = False
        mr.update_state()
        print(
            f"State {state} - Mappers: [{mr.mappers}]    Reducers: [{mr.reducers}]"
        )
        state += 1

        try:
            while True:
                c1, c2 = chunk_computer_output.get(block=False)
                mr.start_mapper(input_url, bucket_url, str(c1), str(c2),
                                ",".join(RANGES))
        except multiprocessing.queues.Empty:
            pass

        # Reduce mapper output before reducer output: mapper results are the
        # least far along the reduction pipeline, so they need the most
        # further processing. The loop stops once there are not enough
        # completed mappers to start a new reducer AND the completed mappers
        # are not the final stragglers.
        while (len(mr.mappers.completed) >= NUM_MAPPERS_TO_REDUCERS
               or (len(mr.mappers.running) == 0
                   and len(mr.mappers.completed) > 0)):
            to_reduce, remaining = take_at_most_n(mr.mappers.completed,
                                                  NUM_MAPPERS_TO_REDUCERS)
            mr.mappers.completed = remaining
            for tag in RANGES:
                mr.start_reducer(
                    tag,
                    ",".join(
                        get_s3_url(bucket_url, mapper.metadata.name, tag)
                        for mapper in to_reduce),
                    bucket_url,
                )
                work_done = True

        # Merge completed reducers that share a tag. The termination condition
        # is slightly different here because the final completed reducer for a
        # tag does not need to be reduced again.
        for tag in RANGES:
            while (len(mr.reducers[tag].completed) >= NUM_REDUCERS_TO_REDUCERS
                   or (len(mr.reducers[tag].running) == 0
                       and len(mr.reducers[tag].completed) > 1)):
                to_reduce, remaining = take_at_most_n(
                    mr.reducers[tag].completed, NUM_REDUCERS_TO_REDUCERS)
                mr.reducers[tag].completed = remaining
                mr.start_reducer(
                    tag,
                    ",".join(
                        get_s3_url(bucket_url, reducer.metadata.name, "")
                        for reducer in to_reduce),
                    bucket_url,
                )
                work_done = True
        time.sleep(EVENT_LOOP_UPDATE_INTERVAL)

    print("Processing reducer outputs")
    # Collect the reducer outputs into a single dictionary
    output = {"word": [], "letter": []}
    for tag in RANGES:
        if len(mr.reducers[tag].completed) < 1:
            continue  # It's valid for the input to contain no letters in a range
        elif len(mr.reducers[tag].completed) > 1:
            raise RuntimeError(
                f"Expected exactly one reducer for {tag}: got {mr.reducers[tag]}"
            )
        final_reducer_id = mr.reducers[tag].completed[0].metadata.name
        reducer_output = json.loads(
            s3helper.download_file(bucket_name, final_reducer_id).decode())
        output["word"].extend(reducer_output["word"].items())
        output["letter"].extend(reducer_output["letter"].items())

    # Sort outputs: decreasing by frequency, increasing by word
    for r in output:
        output[r].sort(key=lambda x: x[0])
        output[r].sort(key=lambda x: x[1], reverse=True)

    print("Writing results to database")
    write_to_db(rds_host, rds_port, output)
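Finally, take_at_most_n and get_s3_url are small helpers whose behaviour can be inferred from their call sites; the bodies below are assumptions, consistent with the mapper in Example #2 writing to <job name>/<range tag> and the reducer in Example #4 writing to its bare job name.

def take_at_most_n(items, n):
    # Split a list into (the first n items, whatever is left over); used above
    # to peel off a batch of completed jobs for the next reducer.
    return items[:n], items[n:]

def get_s3_url(bucket_url, job_name, tag):
    # Build the S3 URL a finished job wrote its output to. An empty tag means
    # the job (a reducer) wrote directly to its own name.
    return f"{bucket_url}/{job_name}/{tag}" if tag else f"{bucket_url}/{job_name}"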