Example 1
    def test_interpolation(self):
        context = Parameters.from_mapping(
            yaml.safe_load(self.WRITING_REFERENCE))
        loader = YAMLParametersLoader()
        self.assertEqual(
            loader._interpolate(
                Parameters.from_mapping(
                    yaml.safe_load(self.MULTIPLE_INTERPOLATION_REFERENCE)),
                context,
            )._data,
            immutabledict([
                ("pear", "raspberry"),
                ("banana", "raspberry"),
                ("apple", "raspberry"),
                ("the_ultimate_fruit", "raspberry"),
            ]),
        )
        self.assertEqual(
            loader._interpolate(
                Parameters.from_mapping(
                    yaml.safe_load(
                        self.MULTIPLE_INTERPOLATION_REFERENCE_NEEDING_CONTEXT)
                ),
                context,
            )._data,
            immutabledict([
                ("pear", "raspberry/world"),
                ("banana", "raspberry/world"),
                ("apple", "raspberry/world"),
                ("the_ultimate_fruit", "raspberry/world"),
                # the actual pair ("hello", "world") should not be present
            ]),
        )
        self.assertEqual(
            loader._interpolate(
                Parameters.from_mapping(
                    yaml.safe_load(self.NESTED_INTERPOLATION)),
                context,
            ).as_nested_dicts(),
            {
                "key": 2,
                "key2": "fooo",
                "key3": {
                    "lalala": "fooo",
                    "meep": 2,
                    "list": [1, 2, 3]
                },
            },
        )

        with self.assertRaisesRegex(
                ParameterInterpolationError,
                r"These interpolated parameters form at least one graph cycle that must be fixed: "
                r"\('b', 'c'\)",
        ):
            loader._interpolate(
                Parameters.from_mapping(
                    yaml.safe_load('a: "%b%"\nb: "%c%"\nc: "%b%"')),
                context,
            )
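
For reference, a minimal sketch of the %...% interpolation these assertions exercise, using the same private _interpolate hook; the fruit parameter names here are illustrative only:

context = Parameters.from_mapping(yaml.safe_load('fruit: "raspberry"'))
interpolated = YAMLParametersLoader()._interpolate(
    Parameters.from_mapping(yaml.safe_load('pear: "%fruit%"')),
    context,
)
assert interpolated.string("pear") == "raspberry"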
Example 2
def main(params: Parameters):
    curriculum_repository_path = params.creatable_directory(
        CURRICULUM_REPOSITORY_PATH_PARAMETER
    )
    language_mode = params.enum(
        LANGUAGE_MODE_PARAMETER, LanguageMode, default=LanguageMode.ENGLISH
    )

    train_curriculum, test_curriculum = curriculum_from_params(
        params, language_mode=language_mode
    )
    strict_curriculum = ExperimentCurriculum(
        evaluate_curriculum(train_curriculum), evaluate_curriculum(test_curriculum)
    )
    write_experiment_curriculum(
        curriculum_repository_path,
        params,
        language_mode,
        strict_curriculum,
        ignored_parameters=immutableset(
            IGNORED_PARAMETERS.union(
                {CURRICULUM_REPOSITORY_PATH_PARAMETER, LANGUAGE_MODE_PARAMETER}
            )
        ),
    )
Example 3
def main(params: Parameters):
    # create_cas_from_apf(TEST_APF_PATH, TEST_SGM_PATH, OUTPUT_DIR_PATH)
    corpus_paths = params.arbitrary_list("corpus_paths")
    output_xmi_dir_path = params.creatable_directory("cached_xmi_path")
    type_system_path = params.existing_file("type_system_path")
    cas_xmi_template_path = params.existing_file("cas_xmi_template_path")

    # Load Typesystem
    with type_system_path.open('rb') as file:
        typesystem = load_typesystem(file)

    # Load xmi_template
    with cas_xmi_template_path.open('rb') as cas_xmi_file:
        cas_template = load_cas_from_xmi(cas_xmi_file, typesystem=typesystem)

    for ace_corpus_path in corpus_paths:
        print("Processing apf files from: " + ace_corpus_path)
        start_time = time.perf_counter()
        for filename in os.listdir(ace_corpus_path):
            if filename.endswith(".apf.xml"):
                print("Processing " + filename)
                # os.path.join tolerates corpus paths given without a
                # trailing separator, which plain concatenation does not
                create_cas_from_apf(
                    apf_filename=filename,
                    apf_path=os.path.join(ace_corpus_path, filename),
                    source_sgm_path=os.path.join(
                        ace_corpus_path, filename.replace(".apf.xml", ".sgm")
                    ),
                    output_dir_path=output_xmi_dir_path,
                    typesystem=typesystem,
                    cas_template=cas_template,
                )
        elapsed_time = time.perf_counter() - start_time
        print(f"Processing Completed. Time elapsed: {elapsed_time:0.4f} seconds")
Example 4
def add_saga_cluster_to_sites(
    sites_catalog: SiteCatalog,
    params: Parameters = Parameters.empty()) -> None:
    home = params.string("home_dir", default=str(Path.home().absolute()))
    data_configuration = params.string("data_configuration",
                                       default="sharedfs")

    shared_scratch_dir = params.string(
        "saga_shared_scratch", default=f"{home}/workflows/shared-scratch")

    saga = Site("saga", arch=Arch.X86_64, os_type=OS.LINUX)
    saga.add_directories(
        Directory(Directory.SHARED_SCRATCH,
                  shared_scratch_dir).add_file_servers(
                      FileServer("file://" + shared_scratch_dir,
                                 Operation.ALL)))

    saga.add_env(key="PEGASUS_HOME",
                 value="/nas/gaia/shared/cluster/pegasus5/pegasus-5.0.0")

    # Profiles
    saga.add_pegasus_profile(style="glite",
                             auxillary_local=True,
                             data_configuration=data_configuration)
    saga.add_condor_profile(grid_resource="batch slurm")

    sites_catalog.add_sites(saga)
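
A hypothetical usage sketch: registering the saga site on a fresh catalog while overriding the shared-scratch location (the Pegasus.api import path and the scratch path are assumptions, not taken from this snippet):

from Pegasus.api import SiteCatalog

catalog = SiteCatalog()
add_saga_cluster_to_sites(
    catalog,
    Parameters.from_mapping({"saga_shared_scratch": "/scratch/my-workflows"}),
)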
Example 5
def _build_curriculum_path(
    repository: Path,
    parameters: Parameters,
    language_mode: LanguageMode,
    *,
    ignored_parameters: AbstractSet[str] = IGNORED_PARAMETERS,
) -> Path:
    curriculum_file_path: Path = repository / LANGUAGE_MODE_TO_NAME[
        language_mode]
    unignored = immutableset(parameter
                             for parameter, _ in parameters.namespaced_items()
                             if parameter not in ignored_parameters)
    if not unignored.issubset(_PARAMETER_ORDER):
        unrecognized_parameters = unignored.difference(_PARAMETER_ORDER)
        raise RuntimeError(
            f"No defined order for parameters: {unrecognized_parameters}")

    if "curriculum" not in unignored:
        raise RuntimeError(
            "Expected curriculum name, but none present in parameters.")

    for parameter in _PARAMETER_ORDER:
        value = parameters.get_optional(parameter, object)
        curriculum_file_path = curriculum_file_path / f"{value}_{parameter}"

    return curriculum_file_path / _EXPERIMENT_CURRICULUM_FILE_NAME
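
To make the path construction concrete, a worked example under assumed values (the real LANGUAGE_MODE_TO_NAME, _PARAMETER_ORDER, and file-name constants are not shown in this snippet):

# Hypothetical illustration only: if LANGUAGE_MODE_TO_NAME maps
# LanguageMode.ENGLISH to "english" and _PARAMETER_ORDER == ("curriculum",),
# then parameters {"curriculum": "m6"} would yield
#     repository / "english" / "m6_curriculum" / _EXPERIMENT_CURRICULUM_FILE_NAME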
Example 6
 def from_parameters(params: Parameters) -> Optional["SpackConfiguration"]:
     if SpackConfiguration.SPACK_ENVIRONMENT_PARAM in params:
         if SpackConfiguration.SPACK_PACKAGES_PARAM in params:
             raise RuntimeError(
                 f"{SpackConfiguration.SPACK_ENVIRONMENT_PARAM} "
                 f"and {SpackConfiguration.SPACK_PACKAGES_PARAM} are mutually exclusive"
             )
         return SpackConfiguration(
             spack_root=params.existing_directory(
                 SpackConfiguration.SPACK_ROOT_PARAM),
             spack_environment=params.string(
                 SpackConfiguration.SPACK_ENVIRONMENT_PARAM),
         )
     elif SpackConfiguration.SPACK_PACKAGES_PARAM in params:
         if SpackConfiguration.SPACK_ENVIRONMENT_PARAM in params:
             raise RuntimeError(
                 f"{SpackConfiguration.SPACK_ENVIRONMENT_PARAM} "
                 f"and {SpackConfiguration.SPACK_PACKAGES_PARAM} are mutually exclusive"
             )
         return SpackConfiguration(
             spack_root=params.existing_directory(
                 SpackConfiguration.SPACK_ROOT_PARAM),
             spack_packages=[
                 SpackPackage.parse(package_specifier)
                 for package_specifier in params.arbitrary_list(
                     SpackConfiguration.SPACK_PACKAGES_PARAM)
             ],
         )
     else:
         return None
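
A sketch of driving the environment branch, keyed by the class's own parameter-name constants; the values are placeholders, and existing_directory will insist that the Spack root actually exists:

config = SpackConfiguration.from_parameters(
    Parameters.from_mapping({
        SpackConfiguration.SPACK_ROOT_PARAM: "/opt/spack",  # placeholder path
        SpackConfiguration.SPACK_ENVIRONMENT_PARAM: "my-env",
    })
)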
Example 7
 def test_double_context_fail(self):
     # cannot specify both deprecated context argument and new included_context argument
     with self.assertRaises(ParameterError):
         YAMLParametersLoader().load(
             f='foo: "foo"',
             context=Parameters.empty(),
             included_context=Parameters.empty(),
         )
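
For contrast, the non-deprecated form of the same call passes only included_context:

params = YAMLParametersLoader().load(
    f='foo: "foo"', included_context=Parameters.empty()
)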
Example 8
 def from_parameters(params: Parameters) -> Optional["CondaConfiguration"]:
     if CondaConfiguration.CONDA_ENVIRONMENT_PARAM in params:
         return CondaConfiguration(
             conda_base_path=params.existing_directory("conda_base_path"),
             conda_environment=params.string(
                 CondaConfiguration.CONDA_ENVIRONMENT_PARAM),
         )
     else:
         return None
Example 9
 def from_parameters(params: Parameters) -> ResourceRequest:
     return SlurmResourceRequest(
         partition=params.string("partition"),
         num_cpus=params.optional_positive_integer("num_cpus"),
         num_gpus=params.optional_integer("num_gpus"),
         memory=MemoryAmount.parse(params.string("memory"))
         if "memory" in params
         else None,
         job_time_in_minutes=params.optional_integer("job_time_in_minutes"),
     )
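
A hypothetical parameter mapping this factory accepts; only "partition" is required here, and the "4G" memory string assumes a format that MemoryAmount.parse understands:

request = SlurmResourceRequest.from_parameters(
    Parameters.from_mapping({"partition": "scavenge", "memory": "4G"})
)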
Example 10
    def from_parameters(params: Parameters) -> "WorkflowBuilder":
        wb = WorkflowBuilder(
            name=params.string("workflow_name", default="Workflow"),
            created_by=params.string("workflow_created",
                                     default="Default Constructor"),
            workflow_directory=params.creatable_directory(
                "workflow_directory"),
            default_site=params.string("site"),
            conda_script_generator=CondaJobScriptGenerator.from_parameters(
                params),
            docker_script_generator=DockerJobScriptGenerator.from_parameters(
                params),
            namespace=params.string("namespace"),
            default_resource_request=ResourceRequest.from_parameters(params),
            data_configuration=params.string("data_configuration",
                                             default="sharedfs"),
            experiment_name=params.string("experiment_name", default=""),
        )

        if params.boolean("include_nas", default=True):
            add_local_nas_to_sites(
                wb._sites_catalog,
                params  # pylint: disable=protected-access
            )
        if params.boolean("include_saga", default=True):
            add_saga_cluster_to_sites(
                wb._sites_catalog,
                params  # pylint: disable=protected-access
            )
            configure_saga_properities(
                wb._properties,
                params  # pylint: disable=protected-access
            )

        return wb
Example 11
def main(params: Parameters):
    params.assert_exactly_one_present(
        [_NUM_SLICES_PARAM, _EXPLICIT_SPLIT_PARAM])

    with byte_key_value_linear_source_from_params(params) as input_source:
        if _NUM_SLICES_PARAM in params:
            _split_into_even_slices(input_source, params)
        elif _EXPLICIT_SPLIT_PARAM in params:
            _explicit_split(input_source, params)
        else:
            raise RuntimeError("No known split parameter specified.")
Example 12
def main(params: Parameters):
    input_file_path = params.existing_file("input_file")
    output_file_path = params.creatable_file("output_file")
    logging.info("Reading from input file: %s",
                 str(input_file_path.absolute()))
    with input_file_path.open() as input_file:
        nums = [int(x.strip()) for x in input_file if x.strip() != ""]

    nums.sort()

    output_file_path.write_text(
        "\n".join(immutableset(str(x) for x in nums))
    )
Example 13
 def test_namespace_prefix(self):
     assert Parameters.from_mapping(
         {"hello": {"world": {"foo": "bar"}}}
     ).namespace("hello").namespace("world").namespace_prefix == ("hello", "world")
     assert Parameters.empty(namespace_prefix=("foo",)).namespace_prefix == ("foo",)
     # test it works even for empty parameters
     assert (
         Parameters.empty().namespace_or_empty("foo").namespace_or_empty("bar").namespace_prefix
         == ("foo", "bar")
     )
Example 14
def main(params: Parameters):
    input_file_path = params.existing_file("input_file")
    output_file_path = params.creatable_file("output_file")
    x = params.integer("x")
    logging.info("Reading from input file: %s",
                 str(input_file_path.absolute()))
    with input_file_path.open() as input_file:
        with output_file_path.open("w") as output_file:
            for num in input_file:
                output_file.write(f"{int(num)*x}\n")

    logging.info("Writing to output file: %s", str(input_file_path.absolute()))

    # Pause so that we can examine the job on the SAGA cluster
    time.sleep(30)
Example 15
 def from_parameters(params: Parameters) -> "SlurmPythonRunner":
     return SlurmPythonRunner(
         conda_config=CondaConfiguration.from_parameters(params),
         spack_config=SpackConfiguration.from_parameters(params),
         log_base_directory=params.creatable_directory(
             "log_directory").absolute(),
     )
Example 16
    def test_optional_existing_directory(self):
        test_dir = Path(tempfile.mkdtemp()).absolute()
        existing_dir_path = test_dir / "existing_directory"
        existing_dir_path.mkdir(parents=True, exist_ok=True)
        non_existing_dir_path = test_dir / "non_existent_directory"
        a_file = test_dir / "a_file"
        a_file.touch()
        params = Parameters.from_mapping({
            "directory_which_exists": str(existing_dir_path.absolute()),
            "directory_which_does_not_exist": non_existing_dir_path,
            "a_file": a_file,
        })

        # noinspection PyTypeChecker
        self.assertEqual(
            os.path.realpath(existing_dir_path),
            os.path.realpath(
                params.optional_existing_directory("directory_which_exists")),
        )
        self.assertEqual(None,
                         params.optional_existing_directory("missing_param"))
        with self.assertRaises(ParameterError):
            params.optional_existing_directory(
                "directory_which_does_not_exist")
        with self.assertRaises(ParameterError):
            params.optional_existing_directory("a_file")

        shutil.rmtree(test_dir)
Example 17
    def test_optional_creatable_empty_directory(self):
        test_dir = Path(tempfile.mkdtemp()).absolute()
        existing_dir_path = test_dir / "existing_directory"
        existing_dir_path.mkdir(parents=True, exist_ok=True)
        non_existing_dir_path = test_dir / "non_existent_directory"
        a_file = existing_dir_path / "a_file"
        a_file.touch()
        params = Parameters.from_mapping({
            "directory_which_exists": str(existing_dir_path.absolute()),
            "directory_which_does_not_exist": str(non_existing_dir_path.absolute()),
            "a_file": a_file,
        })

        self.assertEqual(
            None, params.optional_creatable_empty_directory("missing_param"))
        self.assertEqual(
            os.path.realpath(non_existing_dir_path),
            os.path.realpath(
                params.optional_creatable_empty_directory(
                    "directory_which_does_not_exist")),
        )
        with self.assertRaises(ParameterError):
            params.optional_creatable_empty_directory("a_file")
        with self.assertRaises(ParameterError):
            params.optional_creatable_empty_directory("directory_which_exists")
        self.assertEqual(
            os.path.realpath(existing_dir_path),
            os.path.realpath(
                params.optional_creatable_empty_directory(
                    "directory_which_exists", delete=True)),
        )
Example 18
    def test_optional_creatable_file(self):
        test_dir = Path(tempfile.mkdtemp()).absolute()
        existing_dir_path = test_dir / "existing_directory"
        existing_dir_path.mkdir(parents=True, exist_ok=True)
        non_existing_dir_path = test_dir / "non_existent_directory"
        a_file = existing_dir_path / "a_file"
        a_file.touch()
        non_existing_file = test_dir / "b_file"
        params = Parameters.from_mapping({
            "directory_which_exists": str(existing_dir_path.absolute()),
            "directory_which_does_not_exist": str(non_existing_dir_path.absolute()),
            "a_file": str(a_file.absolute()),
            "non_existing_file": str(non_existing_file.absolute()),
        })

        self.assertEqual(None, params.optional_creatable_file("missing_param"))
        self.assertEqual(
            os.path.realpath(non_existing_file),
            os.path.realpath(
                params.optional_creatable_file("non_existing_file")),
        )
Example 19
    def test_optionals_when_present(self):
        params = Parameters.from_mapping({
            "list": [1, 2, 3, ["a", "b", "c"]],
            "boolean": True,
            "float": 0.5,
            "integer": 42,
            "negative_int": -5,
            "namespace": {
                "fred": "meep"
            },
            "string": "foo",
        })

        assert params.optional_arbitrary_list("list") == [
            1, 2, 3, ["a", "b", "c"]
        ]
        assert params.optional_boolean("boolean")
        assert params.optional_floating_point("float") == 0.5
        assert params.optional_integer("integer") == 42
        assert params.optional_positive_integer("integer") == 42
        with self.assertRaises(ParameterError):
            params.optional_positive_integer("negative_int")
        assert params.optional_namespace("namespace").as_nested_dicts() == {
            "fred": "meep"
        }
        assert params.optional_string("string") == "foo"
Example 20
def split_key_value_store(
        input_store: KeyValueStore,
        *,
        num_parts: int,
        random_seed: Optional[int] = None) -> Tuple[KeyValueStore, ...]:
    """
    Splits *input_store* into *num_parts* pieces of nearly equal size.

    Some of the resulting key-value stores may be empty.
    """
    if num_parts <= 0:
        raise RuntimeError("Number of parts must be positive")

    split_locator = input_store.locator / "split"
    split_output_dir = directory_for(split_locator)
    param_args = {
        "input": input_store.input_parameters(),
        "num_slices": num_parts,
        "output_dir": split_output_dir,
    }
    if random_seed is not None:  # an explicit seed of 0 should still take effect
        param_args["random_seed"] = random_seed
    split_job = run_python_on_parameters(
        split_locator,
        split_entry_point,
        Parameters.from_mapping(param_args),
        depends_on=input_store,
    )
    return tuple(
        ZipKeyValueStore(
            path=split_output_dir / f"{slice_index}.zip",
            depends_on=split_job,
            locator=split_locator / str(slice_index),
        ) for slice_index in range(num_parts))
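
A sketch of consuming the returned stores inside an initialized workflow, chaining into the downsample helper defined in the next example:

slices = split_key_value_store(input_store, num_parts=3, random_seed=0)
downsampled_slices = [downsample(s, limit=100) for s in slices]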
Example 21
def downsample(input_store: KeyValueStore,
               *,
               limit: int,
               output_locator: Optional[Locator] = None) -> KeyValueStore:
    """
    Convenience function to run `vistautils.scripts.downsample_key_value_store` as a Pegasus job.
    """
    if not output_locator:
        output_locator = input_store.locator / f"downsampled-{limit}"
    output_zip_path = directory_for(output_locator) / "downsampled.zip"
    downsample_job = run_python_on_parameters(
        output_locator,
        downsample_key_value_store,
        Parameters.from_mapping({
            "input": input_store.input_parameters(),
            "output_zip_path": output_zip_path,
            "num_to_sample": limit,
            "random_seed": 0,
        }),
        depends_on=input_store,
    )
    return ZipKeyValueStore(
        path=output_zip_path,
        locator=output_locator,
        depends_on=[input_store.depends_on, downsample_job],
    )
Example 22
def test_simple_dax(tmp_path):
    params = Parameters.from_mapping({
        "workflow_name": "Test",
        "workflow_created": "Testing",
        "workflow_log_dir": str(tmp_path / "log"),
        "workflow_directory": str(tmp_path / "working"),
        "site": "saga",
        "namespace": "test",
        "partition": "scavenge",
    })
    workflow_builder = WorkflowBuilder.from_parameters(params)
    assert workflow_builder.name == "Test"
    assert workflow_builder.created_by == "Testing"
    assert (workflow_builder._workflow_directory  # pylint:disable=protected-access
            == tmp_path / "working")
    assert workflow_builder._namespace == "test"  # pylint:disable=protected-access
    assert workflow_builder._default_site == "saga"  # pylint:disable=protected-access
    assert workflow_builder.default_resource_request  # pylint:disable=protected-access
    assert workflow_builder._job_graph is not None  # pylint:disable=protected-access
Example 23
def byte_key_value_sink_from_params(
    params: Parameters,
    *,
    output_namespace: str = "output",
    eval_context: Optional[Dict] = None,
) -> KeyValueSink[str, bytes]:
    """
    Get a binary key-value sink based on parameters.

    This should be passed a parameter namespace.  If the "type" field is present, it should
    be the name of a class or method.  If a class, its static `from_parameters` method will be
    called with these parameters and should return a `KeyValueSink[str, bytes]`. If a callable,
    it will be called with these parameters (and should also return a `KeyValueSink[str, bytes]`).

    The type 'zip' is a shortcut for a key-value zip file.  'directory' is a shortcut for
    writing the output files to the specified directory.

    If additional imports are needed to resolve 'type', they can be specified as a Python list in
    the `import` field.

    If no type is specified, a 'directory' sink will be created.
    """
    # to be sure the default special values can be evaluated, we want to include this module
    # itself in the evaluation context. We combine it with eval_context, giving priority to
    # the context specified by the user
    effective_context = dict(globals())
    effective_context.update(eval_context or {})
    return params.object_from_parameters(  # type: ignore
        output_namespace,
        KeyValueSink,
        special_factories=_BYTE_KEY_VALUE_SINK_SPECIAL_VALUES,
        default_factory=_DirectoryBytesKeyValueSink,
        context=effective_context,
        factory_namespace_param_name="type",
    )
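
A sketch of the 'zip' shortcut described in the docstring; per Example 27 below, the zip sink reads its target file from a "path" parameter:

sink_params = Parameters.from_mapping(
    {"output": {"type": "zip", "path": "out.zip"}}
)
with byte_key_value_sink_from_params(sink_params) as sink:
    sink.put("key", b"value")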
Example 24
def test_dax_with_categories(tmp_path):
    workflow_params = Parameters.from_mapping({
        "workflow_name": "Test",
        "workflow_created": "Testing",
        "workflow_log_dir": str(tmp_path / "log"),
        "workflow_directory": str(tmp_path / "working"),
        "site": "saga",
        "namespace": "test",
        "partition": "gaia",
        "home_dir": str(tmp_path),
    })
    initialize_vista_pegasus_wrapper(workflow_params)

    multiply_job_name = Locator(_parse_parts("jobs/multiply"))
    multiply_output_file = tmp_path / "multiplied_nums.txt"
    multiply_input_file = tmp_path / "raw_nums.txt"
    multiply_params = Parameters.from_mapping({
        "input_file": multiply_input_file,
        "output_file": multiply_output_file,
        "x": 4
    })
    multiply_job_category = "arithmetic"

    run_python_on_parameters(
        multiply_job_name,
        multiply_by_x_main,
        multiply_params,
        depends_on=[],
        category=multiply_job_category,
    )

    # Check that the multiply job has the appropriate category set in the DAX file
    dax_file = write_workflow_description()
    assert dax_file.exists()

    assert _job_in_dax_has_category(dax_file, multiply_job_name,
                                    multiply_job_category)
    assert not _job_in_dax_has_category(dax_file, multiply_job_name,
                                        "an-arbitrary-category")
Example 25
    def create_logger(params: Parameters) -> "LearningProgressHtmlLogger":
        output_dir = params.creatable_directory("experiment_group_dir")
        experiment_name = params.string("experiment")
        include_links_to_images = params.optional_boolean("include_image_links")
        num_pretty_descriptions = params.positive_integer(
            "num_pretty_descriptions", default=3
        )
        sort_by_length = params.boolean(
            "sort_learner_descriptions_by_length", default=False
        )

        logging_dir = output_dir / experiment_name
        logging_dir.mkdir(parents=True, exist_ok=True)
        output_html_path = str(logging_dir / "index.html")

        if include_links_to_images is None:
            include_links_to_images = False

        logging.info("Experiment will be logged to %s", output_html_path)

        with open(output_html_path, "w") as outfile:
            html_dumper = CurriculumToHtmlDumper()

            outfile.write(f"<head>\n\t<style>{CSS}\n\t</style>\n</head>")
            outfile.write(f"\n<body>\n\t<h1>{experiment_name}</h1>")
            # A JavaScript function to allow toggling perception information
            outfile.write(
                """
                <script>
                function myFunction(id) {
                  var x = document.getElementById(id);
                  if (x.style.display === "none") {
                    x.style.display = "block";
                  } else {
                    x.style.display = "none";
                  }
                }
                </script>
                """
            )
        return LearningProgressHtmlLogger(
            outfile_dir=output_html_path,
            html_dumper=html_dumper,
            include_links_to_images=include_links_to_images,
            num_pretty_descriptions=num_pretty_descriptions,
            sort_by_length=sort_by_length,
        )
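
A hypothetical minimal parameter mapping for create_logger; only the group directory and experiment name lack defaults, and the directory value here is a placeholder:

logger = LearningProgressHtmlLogger.create_logger(
    Parameters.from_mapping({
        "experiment_group_dir": "experiments",  # placeholder directory
        "experiment": "my-experiment",
    })
)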
Example 26
def log_experiment_entry_point(params: Parameters) -> None:
    experiment_name = params.string("experiment")
    debug_log_dir = params.optional_creatable_directory("debug_log_directory")

    graph_logger: Optional[HypothesisLogger]
    if debug_log_dir:
        logging.info("Debug graphs will be written to %s", debug_log_dir)
        graph_logger = HypothesisLogger(debug_log_dir,
                                        enable_graph_rendering=True)
    else:
        graph_logger = None

    logger = LearningProgressHtmlLogger.create_logger(params)

    language_mode = params.enum("language_mode",
                                LanguageMode,
                                default=LanguageMode.ENGLISH)

    (training_instance_groups,
     test_instance_groups) = curriculum_from_params(params, language_mode)

    execute_experiment(
        Experiment(
            name=experiment_name,
            training_stages=training_instance_groups,
            learner_factory=learner_factory_from_params(
                params, graph_logger, language_mode),
            pre_example_training_observers=[
                logger.pre_observer(),
                CandidateAccuracyObserver("pre-acc-observer"),
            ],
            post_example_training_observers=[logger.post_observer()],
            test_instance_groups=test_instance_groups,
            test_observers=[logger.test_observer()],
            sequence_chooser=RandomChooser.for_seed(0),
        ),
        log_path=params.optional_creatable_directory("hypothesis_log_dir"),
        log_hypotheses_every_n_examples=params.integer(
            "log_hypothesis_every_n_steps", default=250),
        log_learner_state=params.boolean("log_learner_state", default=True),
        learner_logging_path=params.optional_creatable_directory(
            "experiment_group_dir"),
        starting_point=params.integer("starting_point", default=-1),
        point_to_log=params.integer("point_to_log", default=0),
        load_learner_state=params.optional_existing_file("learner_state_path"),
    )
Example 27
    def from_parameters(params: Parameters) -> KeyValueSink[str, bytes]:
        """
        Create a key-value sink writing to a zip file.

        Right now, these uses all the defaults for `KeyValueSink.zip_bytes_sink`. In the
        future, we might examine other parameters to allow greater customization.
        """
        return KeyValueSink.zip_bytes_sink(params.creatable_file("path"))
Example 28
def main(params: Parameters):
    with byte_key_value_source_from_params(params) as input_source:
        keys = list(input_source.keys())
        num_to_sample = min(params.positive_integer(_NUM_TO_SAMPLE_PARAM),
                            len(keys))
        # random.shuffle's ``random`` argument was removed in Python 3.11,
        # so seed a Random instance and shuffle with it instead
        rng = random.Random(params.integer(_RANDOM_SEED_PARAM, default=0))
        rng.shuffle(keys)
        keys_to_keep = keys[:num_to_sample]
        output_zip_path = params.creatable_file("output_zip_path")
        logging.info("Downsampling %s files to %s", num_to_sample,
                     output_zip_path)
        with KeyValueSink.zip_bytes_sink(output_zip_path) as out:
            for key in keys_to_keep:
                out.put(key, input_source[key])
Example 29
def test_not_clearing_ckpts(monkeypatch, tmp_path):

    workflow_params = Parameters.from_mapping({
        "workflow_name": "Test",
        "workflow_created": "Testing",
        "workflow_log_dir": str(tmp_path / "log"),
        "workflow_directory": str(tmp_path / "working"),
        "site": "saga",
        "namespace": "test",
        "partition": "scavenge",
        "home_dir": str(tmp_path),
    })

    initialize_vista_pegasus_wrapper(workflow_params)

    multiply_job_name = Locator(_parse_parts("jobs/multiply"))
    multiply_output_file = tmp_path / "multiplied_nums.txt"
    multiply_input_file = tmp_path / "raw_nums.txt"
    multiply_params = Parameters.from_mapping({
        "input_file": multiply_input_file,
        "output_file": multiply_output_file,
        "x": 4
    })

    multiply_dir = directory_for(multiply_job_name)

    checkpointed_multiply_file = multiply_dir / "___ckpt"
    checkpointed_multiply_file.touch()
    multiply_output_file.touch()

    run_python_on_parameters(multiply_job_name,
                             multiply_by_x_main,
                             multiply_params,
                             depends_on=[])
    monkeypatch.setattr("builtins.input", lambda _: "n")
    write_workflow_description()
    assert checkpointed_multiply_file.exists()
Example 30
def build_relation_learner_factory(
        params: Parameters, beam_size: int,
        language_mode: LanguageMode) -> Optional[TemplateLearner]:
    learner_type = params.string("learner_type",
                                 valid_options=["subset", "pursuit", "none"],
                                 default="subset")
    ontology, _, _ = ONTOLOGY_STR_TO_ONTOLOGY[params.string(
        "ontology",
        valid_options=ONTOLOGY_STR_TO_ONTOLOGY.keys(),
        default="phase2")]

    if learner_type == "subset":
        return SubsetRelationLearnerNew(ontology=ontology,
                                        beam_size=beam_size,
                                        language_mode=language_mode)
    elif learner_type == "pursuit":
        rng = random.Random()
        rng.seed(params.integer("random_seed", default=0))
        return PursuitRelationLearnerNew(
            learning_factor=params.floating_point("learning_factor"),
            graph_match_confirmation_threshold=params.floating_point(
                "graph_match_confirmation_threshold"),
            lexicon_entry_threshold=params.floating_point(
                "lexicon_entry_threshold"),
            rng=rng,
            smoothing_parameter=params.floating_point("smoothing_parameter"),
            ontology=ontology,
            language_mode=language_mode,
        )
    elif learner_type == "none":
        # We don't want to include this learner type.
        return None
    else:
        raise RuntimeError(f"Invalid relation learner type: {learner_type}")
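
A hypothetical parameter mapping selecting the pursuit branch; the float values are placeholders, not recommended settings:

relation_learner = build_relation_learner_factory(
    Parameters.from_mapping({
        "learner_type": "pursuit",
        "learning_factor": 0.02,
        "graph_match_confirmation_threshold": 0.9,
        "lexicon_entry_threshold": 0.7,
        "smoothing_parameter": 0.001,
    }),
    beam_size=10,
    language_mode=LanguageMode.ENGLISH,
)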