Example 1
    def from_parameters(params: Parameters) -> KeyValueSink[str, bytes]:
        """
        Create a key-value sink writing to a zip file.

        Right now, this uses all the defaults for `KeyValueSink.zip_bytes_sink`. In the
        future, we might examine other parameters to allow greater customization.
        """
        return KeyValueSink.zip_bytes_sink(params.creatable_file("path"))
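For context, here is a minimal sketch of how such a parameter-driven sink might be used, assuming the vistautils module layout (vistautils.parameters, vistautils.key_value); the "output.zip" path and the key/value pair are purely illustrative:

from vistautils.key_value import KeyValueSink
from vistautils.parameters import Parameters

# Hypothetical parameters; "path" is the only key the factory above reads.
params = Parameters.from_mapping({"path": "output.zip"})

# Write a single key/value pair into the zip-backed sink.
with KeyValueSink.zip_bytes_sink(params.creatable_file("path")) as sink:
    sink.put("example-key", b"example value")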
Example 2
def main(params: Parameters):
    conda_script_generator = CondaJobScriptGenerator.from_parameters(params)
    entry_point = params.string("entry_point")
    work_dir = params.optional_creatable_directory(
        "working_directory") or Path(os.getcwd())
    stdout_file = params.optional_string("log_file") or work_dir / "___stdout.log"
    shell_script = conda_script_generator.generate_shell_script(
        entry_point_name=entry_point,
        param_file=params.existing_file("job_param_file"),
        working_directory=work_dir,
        stdout_file=stdout_file,
    )

    params.creatable_file("conda_script_path").write_text(  # type: ignore
        shell_script, encoding="utf-8")

    if params.boolean("echo_template", default=False):
        print(shell_script)
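As a rough illustration, the parameters this script reads directly could be assembled as below; every value is hypothetical, "job_param_file" must name a file that already exists, and CondaJobScriptGenerator.from_parameters will expect additional keys of its own that are omitted here:

from vistautils.parameters import Parameters

params = Parameters.from_mapping(
    {
        "entry_point": "my_package.my_module.main",  # hypothetical module path
        "working_directory": "/tmp/conda_job_demo",
        "log_file": "/tmp/conda_job_demo/stdout.log",
        "job_param_file": "/tmp/conda_job_demo/job.params",
        "conda_script_path": "/tmp/conda_job_demo/run_job.sh",
        "echo_template": True,
    }
)
main(params)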
Example 3
def main(params: Parameters):
    input_file_path = params.existing_file("input_file")
    output_file_path = params.creatable_file("output_file")
    logging.info("Reading from input file: %s",
                 str(input_file_path.absolute()))
    with input_file_path.open() as input_file:
        nums = [int(x.strip()) for x in input_file if x.strip() != ""]

    nums.sort()

    output_file_path.write_text(
        "\n".join(immutableset(str(x) for x in nums))
    )
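Scripts in this style are typically launched from a parameter file; a minimal entry point, assuming vistautils exposes parameters_only_entry_point (as its example scripts typically do), might look like:

from vistautils.parameters_only_entrypoint import parameters_only_entry_point

if __name__ == "__main__":
    # Loads the parameter file named on the command line and passes it to main.
    parameters_only_entry_point(main)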
Example 4
def main(params: Parameters):
    input_file_path = params.existing_file("input_file")
    output_file_path = params.creatable_file("output_file")
    x = params.integer("x")
    logging.info("Reading from input file: %s",
                 str(input_file_path.absolute()))
    with input_file_path.open() as input_file:
        with output_file_path.open("w") as output_file:
            for num in input_file:
                output_file.write(f"{int(num)*x}\n")

    logging.info("Writing to output file: %s", str(input_file_path.absolute()))

    # Pause so that we can examine the job on the SAGA cluster
    time.sleep(30)
Example 5
def main(params: Parameters):
    with byte_key_value_source_from_params(params) as input_source:
        keys = list(input_source.keys())
        num_to_sample = min(params.positive_integer(_NUM_TO_SAMPLE_PARAM),
                            len(keys))
        random.shuffle(
            keys,
            random=random.Random(params.integer(_RANDOM_SEED_PARAM,
                                                default=0)).random,
        )
        keys_to_keep = keys[:num_to_sample]
        output_zip_path = params.creatable_file("output_zip_path")
        logging.info("Downsampling %s files to %s", num_to_sample,
                     output_zip_path)
        with KeyValueSink.zip_bytes_sink(output_zip_path) as out:
            for key in keys_to_keep:
                out.put(key, input_source[key])
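The downsampled zip can then be read back with the source-side counterpart of the sink used above; a minimal sketch, assuming vistautils also provides KeyValueSource.zip_bytes_source and with "downsampled.zip" standing in for the output_zip_path written by the script:

from pathlib import Path

from vistautils.key_value import KeyValueSource

# Illustrative check only: re-open the freshly written zip and count its entries.
with KeyValueSource.zip_bytes_source(Path("downsampled.zip")) as check:
    print(f"Downsampled store holds {len(list(check.keys()))} entries")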
Example 6

def main(params: Parameters):
    graph_def_file = params.existing_file("graph_def_file")
    checkpoint_glob = params.string("checkpoint_glob")
    vocab_file = params.existing_file("vocab_file")
    sentences_file = params.existing_file("sentences_file")
    output_file = params.creatable_file("output_file")
    do_profiling = params.optional_boolean_with_default("profile", False)

    with tensorflow.contrib.tfprof.ProfileContext(
            os.getcwd(),
            trace_steps=range(2, 10),
            dump_steps=range(1, 10, 2),
            enabled=do_profiling
        ):
        lm = LM1B.load(graph_def_file=graph_def_file,
                       checkpoint_file=checkpoint_glob,
                       vocab=vocab_file)

        start_time = None
        num_tokens_processed = 0

        with open(sentences_file, 'r', newline='') as inp:
            csv_input = csv.reader(inp, delimiter='\t')
            with open(output_file, 'w', newline='') as out:
                csv_output = csv.writer(out, delimiter='\t')
                for line in csv_input:
                    tokens = line[0].split(' ')
                    output_row = list(line)
                    output_row.insert(0, lm.log_probability_of_sentence(tokens))
                    csv_output.writerow(output_row)
                    # we delay till after the first sentence to avoid counting startup time
                    if num_tokens_processed == 0:
                        start_time = time.time()
                    num_tokens_processed += len(tokens)

    elapsed_time = time.time() - start_time
    print(f"Processed {num_tokens_processed - 1} sentences in {elapsed_time} "
          f"seconds, {num_tokens_processed / elapsed_time} tokens per second. First sentence not "
          f"included in time calculation.")
Example 7

def example_workflow(params: Parameters):  # pragma: no cover
    """
    An example script to generate a container workflow for submission to Pegasus.
    """
    tmp_path = params.creatable_directory("example_root_dir")
    docker_tar = params.creatable_file("docker_tar")
    docker_build_dir = params.existing_directory("docker_build_dir")
    docker_image_name = params.string(
        "docker_image_name", default="pegasus_wrapper_container_demo"
    )
    docker_image_tag = params.string("docker_image_tag", default="0.2")
    mongo_db_tar = params.string(
        "mongo_db_tar", default="/nas/gaia/shared/cluster/docker/mongo-4.4.tar"
    )
    mongo_db_data = "/scratch/dockermount/pegasus_wrapper_tmp/data"
    mongo_db_config = "/scratch/dockermount/pegasus_wrapper_tmp/config"

    # Generating parameters for initializing a workflow
    # We recommend exposing the workflow directory, site, and partition as parameters
    # in a research workflow
    workflow_params = Parameters.from_mapping(
        {
            "workflow_name": "Test",
            "workflow_created": "Testing",
            "workflow_log_dir": str(tmp_path / "log"),
            "workflow_directory": str(tmp_path / "working"),
            "site": "saga",
            "namespace": "test",
            "home_dir": str(tmp_path),
            "partition": "scavenge",
        }
    )

    saga31_request = SlurmResourceRequest.from_parameters(
        Parameters.from_mapping({"run_on_single_node": "saga31", "partition": "gaia"})
    )

    workflow_params = workflow_params.unify(params)

    # Our source input for the sample jobs
    input_file = tmp_path / "raw_nums.txt"
    add_y_output_file_nas = tmp_path / "nums_y.txt"
    sorted_output_file_nas = tmp_path / "sorted.txt"

    random = Random()
    random.seed(0)
    nums = [int(random.random() * 100) for _ in range(0, 25)]

    # Base Job Locator
    job_locator = Locator(("jobs",))
    docker_python_root = Path("/home/app/")

    job_profile = PegasusProfile(
        namespace="pegasus", key="transfer.bypass.input.staging", value="True"
    )

    # Write a list of numbers out to be able to run the workflow
    with input_file.open("w") as mult_file:
        mult_file.writelines(f"{num}\n" for num in nums)

    initialize_vista_pegasus_wrapper(workflow_params)

    build_container = run_bash(
        job_locator / "build_docker",
        command=[
            "mkdir -p /scratch/dockermount/pegasus_wrapper_tmp",
            f"cd {docker_build_dir}",
            f"docker build . -t {docker_image_name}:{docker_image_tag}",
            f"docker save -o /scratch/dockermount/pegasus_wrapper_tmp/{docker_tar.name} {docker_image_name}:{docker_image_tag}",
            f"cp /scratch/dockermount/pegasus_wrapper_tmp/{docker_tar.name} {docker_tar.absolute()}",
            f"chmod go+r {docker_tar.absolute()}",
            f"docker load --input {mongo_db_tar}",
            f"mkdir -p {monogo_db_data}",
            f"mkdir -p {mongo_db_config}",
        ],
        depends_on=[],
        resource_request=saga31_request,
    )

    python36 = add_container(
        f"{docker_image_name}:{docker_image_tag}",
        "docker",
        str(docker_tar.absolute()),
        image_site="saga",
        bypass_staging=True,
    )

    mongo4_4 = add_container(
        "mongo:4.4", "docker", mongo_db_tar, image_site="saga", bypass_staging=True
    )

    start_mongo = start_docker_as_service(
        mongo4_4,
        depends_on=[build_container],
        mounts=[f"{monogo_db_data}:/data/db", f"{mongo_db_config}/etc/custom"],
        docker_args=f"-p 27017:27017",
        resource_request=saga31_request,
    )

    add_y_job = run_python_on_args(
        job_locator / "add",
        docker_python_root / "add_y.py",
        set_args=f"{input_file} {add_y_output_file_nas} --y 10",
        depends_on=[build_container],
        job_profiles=[job_profile],
        resource_request=saga31_request,
        container=python36,
        input_file_paths=[input_file],
        output_file_paths=[add_y_output_file_nas],
    )

    sort_job = run_python_on_parameters(
        job_locator / "sort",
        sort_nums_in_file,
        {"input_file": add_y_output_file_nas, "output_file": sorted_output_file_nas},
        depends_on=[add_y_job],
        container=python36,
        job_profiles=[job_profile],
        resource_request=saga31_request,
        input_file_paths=add_y_output_file_nas,
        output_file_paths=sorted_output_file_nas,
    )

    _ = stop_docker_as_service(
        mongo4_4, depends_on=[start_mongo, sort_job], resource_request=saga31_request
    )

    # Generate the Pegasus DAX file & a Submit Script
    write_workflow_description(tmp_path)
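For reference, the external parameters this demo reads directly could be supplied as below; each path is illustrative only, docker_build_dir must already exist (and contain the Dockerfile to build), and the remaining keys fall back to the defaults above:

from vistautils.parameters import Parameters

example_workflow(
    Parameters.from_mapping(
        {
            "example_root_dir": "/tmp/pegasus_wrapper_demo",
            "docker_tar": "/tmp/pegasus_wrapper_demo/demo_container.tar",
            "docker_build_dir": "/tmp/pegasus_wrapper_demo/docker",
        }
    )
)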