Example #1
0
def test_GetPreprocessorFunction_absolute_path_not_found(
        tempdir: pathlib.Path):
    """Test that a name missing from a module file raises UserError.

    Empty contents are written to the module file before the lookup, so the
    requested "NotFound" name is not defined in it.
    """
    module_file = tempdir / "foo.py"
    fs.Write(module_file, b"")
    missing_name = f"{module_file}:NotFound"
    with test.Raises(errors.UserError):
        preprocessors.GetPreprocessorFunction(missing_name)
Example #2
0
def AssertConfigIsValid(config: corpus_pb2.Corpus) -> corpus_pb2.Corpus:
    """Assert that config proto is valid.

    Args:
        config: A Corpus proto.

    Returns:
        The Corpus proto that was passed in.

    Raises:
        UserError: If a pbutil assertion fails with ProtoValueError, or if
            the greedy multichar atomizer is set but its token list is empty
            or contains an empty string.
    """
    try:
        pbutil.AssertFieldIsSet(config, 'contentfiles')
        pbutil.AssertFieldIsSet(config, 'atomizer')
        pbutil.AssertFieldIsSet(config, 'contentfile_separator')
        # Check that the preprocessor pipeline resolves to preprocessor
        # functions. Only the lookup matters; the results are discarded.
        for preprocessor_name in config.preprocessor:
            preprocessors.GetPreprocessorFunction(preprocessor_name)

        if config.HasField('greedy_multichar_atomizer'):
            tokens = config.greedy_multichar_atomizer.tokens
            if not tokens:
                raise errors.UserError(
                    'GreedyMulticharAtomizer.tokens is empty')
            if not all(tokens):
                raise errors.UserError(
                    'Empty string found in GreedyMulticharAtomizer.tokens is empty'
                )

        return config
    except pbutil.ProtoValueError as e:
        # Chain explicitly so the proto error is recorded as the cause.
        raise errors.UserError(e) from e
Example #3
0
def test_GetPreprocessFunction_undecorated_preprocessor():
  """Test that looking up an undecorated preprocessor raises UserError.

  The error message is expected to mention the @clgen_preprocessor decorator.
  """
  undecorated_name = ('deeplearning.clgen.preprocessors.preprocessors_test'
                      ':MockUndecoratedPreprocessor')
  with pytest.raises(errors.UserError) as raised:
    preprocessors.GetPreprocessorFunction(undecorated_name)
  message = str(raised.value)
  assert '@clgen_preprocessor' in message
def test_GetPreprocessFunction_missing_function():
  """Test that a function name absent from the module raises UserError.

  The error message is expected to contain "not found".
  """
  missing_name = "deeplearning.clgen.preprocessors.preprocessors_test:Foo"
  with test.Raises(errors.UserError) as raised:
    preprocessors.GetPreprocessorFunction(missing_name)
  message = str(raised.value)
  assert "not found" in message
def test_GetPreprocessorFunction_absolute_path_with_dep(tempdir: pathlib.Path):
  """Test loading module from file which has a dependency.

  Two files are written into tempdir: a library module, and a preprocessor
  module that pulls the library in with a relative import. Looking up the
  preprocessor by file path is expected to raise UserError.
  """
  lib_module = tempdir / "lib_module.py"
  fs.Write(
    lib_module,
    """
def PreprocessImplementation(src):
  return src.replace('b', 'c')
""".encode(
      "utf-8"
    ),
  )

  # Use a distinct file name: writing to "lib_module.py" again would
  # overwrite the dependency created above.
  path = tempdir / "preprocessor.py"
  fs.Write(
    path,
    """
from . import lib_module
def Preprocess(src):
  return lib_module.PreprocessImplementation(src)
""".encode(
      "utf-8"
    ),
  )

  with test.Raises(errors.UserError):
    preprocessors.GetPreprocessorFunction(f"{path}:Preprocess")
Example #6
0
def test_GetPreprocessorFunction_absolute_path(tempdir: pathlib.Path):
  """Test that a preprocessor can be looked up by the path to its file.

  The returned function is called to check it behaves like the source that
  was written to the file.
  """
  module_source = """
def Preprocess(src: str) -> str:
  return src.replace('a', 'b')
"""
  module_file = tempdir / 'preprocessor.py'
  fs.Write(module_file, module_source.encode('utf-8'))

  preprocess = preprocessors.GetPreprocessorFunction(
      f'{module_file}:Preprocess')
  assert preprocess('abc') == 'bbc'
Example #7
0
def test_GetPreprocessorFunction_absolute_path(tempdir: pathlib.Path):
    """Test that a preprocessor can be looked up by the path to its file.

    The returned function is called to check it behaves like the source that
    was written to the file.
    """
    module_source = """
def Preprocess(src: str) -> str:
  return src.replace('a', 'b')
"""
    module_file = tempdir / "preprocessor.py"
    fs.Write(module_file, module_source.encode("utf-8"))

    preprocess = preprocessors.GetPreprocessorFunction(
        f"{module_file}:Preprocess")
    assert preprocess("abc") == "bbc"
Example #8
0
def Preprocess(contentfiles: pathlib.Path, outdir: pathlib.Path,
               preprocessor_names):
    """Preprocess the files in a directory and write the results to outdir.

    Files whose name already exists in outdir are skipped. Each remaining
    file is handed to preprocessed.PreprocessorWorker through a process
    pool, and the text of every successfully preprocessed file is written
    to outdir under the worker-reported input_relpath.

    Args:
        contentfiles: Directory whose entries are the inputs.
        outdir: Directory that receives the preprocessed text. It is
            iterated up front, so it must already exist.
        preprocessor_names: Names of the preprocessors to run; each is
            resolved with preprocessors.GetPreprocessorFunction before any
            work starts.
    """
    # Error early if preprocessors are bad.
    for preprocessor_name in preprocessor_names:
        preprocessors.GetPreprocessorFunction(preprocessor_name)

    # This is basically the same code as:
    # deeplearning.clgen.corpuses.preprocessed.PreprocessedContentFiles:Import()
    # Only it's writing the results of preprocessing to files rather than to a
    # database. Consider refactoring.
    relpaths = {f.name for f in contentfiles.iterdir()}
    done = {f.name for f in outdir.iterdir()}
    todo = relpaths - done
    app.Log(
        1,
        "Preprocessing %s of %s content files",
        humanize.Commas(len(todo)),
        humanize.Commas(len(relpaths)),
    )
    jobs = [
        internal_pb2.PreprocessorWorker(
            contentfile_root=str(contentfiles),
            relpath=t,
            preprocessors=preprocessor_names,
        ) for t in todo
    ]
    bar = progressbar.ProgressBar(max_value=len(jobs))
    succeeded_count = 0
    # The context manager tears the pool down once every result has been
    # consumed, or as soon as an error escapes the loop.
    with multiprocessing.Pool() as pool:
        wall_time_start = time.time()
        workers = pool.imap_unordered(preprocessed.PreprocessorWorker, jobs)
        for preprocessed_cf in bar(workers):
            # Record the time elapsed since the previous result arrived.
            wall_time_end = time.time()
            preprocessed_cf.wall_time_ms = int(
                (wall_time_end - wall_time_start) * 1000)
            wall_time_start = wall_time_end
            if preprocessed_cf.preprocessing_succeeded:
                succeeded_count += 1
                with open(outdir / preprocessed_cf.input_relpath, "w") as f:
                    f.write(preprocessed_cf.text)

    app.Log(
        1,
        "Successfully preprocessed %s of %s files (%.2f %%)",
        humanize.Commas(succeeded_count),
        humanize.Commas(len(todo)),
        # max() guards the empty case; the denominator is otherwise the
        # number of files attempted.
        (succeeded_count / max(len(todo), 1)) * 100,
    )
Example #9
0
def AssertConfigIsValid(config: typing.Union[corpus_pb2.Corpus, corpus_pb2.PreTrainCorpus]
                       ) -> typing.Union[corpus_pb2.Corpus, corpus_pb2.PreTrainCorpus]:
  """Assert that config proto is valid.

  Args:
    config: A Corpus or PreTrainCorpus proto.

  Returns:
    The proto that was passed in.

  Raises:
    pbutil.ProtoValueError: Propagated unchanged if raised by one of the
      pbutil assertions.
    ValueError: If a config that is not a corpus_pb2.Corpus has the
      "tokenizer" field set.
  """
  # Early-exit to support corpuses derived from databases of pre-encoded
  # content files.
  # TODO(github.com/ChrisCummins/clgen/issues/130): Refactor after splitting
  # Corpus class.
  if config.HasField("pre_encoded_corpus_url"):
    return config

  pbutil.AssertFieldIsSet(config, "contentfiles")
  if isinstance(config, corpus_pb2.Corpus):
    pbutil.AssertFieldIsSet(config, "tokenizer")
    pbutil.AssertFieldIsSet(config.tokenizer, "token_type")
    pbutil.AssertFieldConstraint(
      config.tokenizer,
      "token_type",
      lambda x: x in ("character", "word", "ast"),
      "tokenizer is either character or word based.",
    )
    if config.tokenizer.token_type == "word":
      # Word tokenizers need a token list that points at an existing file
      # once the configured path has been expanded.
      pbutil.AssertFieldConstraint(
        config.tokenizer,
        "token_list",
        lambda x: os.path.isfile(str(ExpandConfigPath(x, path_prefix=FLAGS.clgen_local_path_prefix))),
        "Invalid token_list file",
      )
  elif config.HasField("tokenizer"):
    raise ValueError("Pre-train corpus cannot have a distinct tokenizer.")
  pbutil.AssertFieldIsSet(config, "contentfile_separator")
  # Check that the preprocessor pipeline resolves to preprocessor functions.
  # Only the lookup matters; the results are discarded.
  for preprocessor_name in config.preprocessor:
    preprocessors.GetPreprocessorFunction(preprocessor_name)

  return config
Example #10
0
def AssertConfigIsValid(config: corpus_pb2.Corpus) -> corpus_pb2.Corpus:
    """Assert that config proto is valid.

    Args:
        config: A Corpus proto.

    Returns:
        The Corpus proto that was passed in.

    Raises:
        UserError: If a pbutil assertion fails with ProtoValueError, or if
            the greedy multichar atomizer is set but its token list is empty
            or contains an empty string.
    """
    try:
        # Early-exit to support corpuses derived from databases of pre-encoded
        # content files.
        # TODO(github.com/ChrisCummins/clgen/issues/130): Refactor after splitting
        # Corpus class.
        if config.HasField("pre_encoded_corpus_url"):
            return config

        pbutil.AssertFieldIsSet(config, "contentfiles")
        pbutil.AssertFieldIsSet(config, "atomizer")
        pbutil.AssertFieldIsSet(config, "contentfile_separator")
        # Check that the preprocessor pipeline resolves to preprocessor
        # functions. Only the lookup matters; the results are discarded.
        for preprocessor_name in config.preprocessor:
            preprocessors.GetPreprocessorFunction(preprocessor_name)

        if config.HasField("greedy_multichar_atomizer"):
            tokens = config.greedy_multichar_atomizer.tokens
            if not tokens:
                raise errors.UserError(
                    "GreedyMulticharAtomizer.tokens is empty")
            if not all(tokens):
                raise errors.UserError(
                    "Empty string found in GreedyMulticharAtomizer.tokens is empty"
                )

        return config
    except pbutil.ProtoValueError as e:
        # Chain explicitly so the proto error is recorded as the cause.
        raise errors.UserError(e) from e
Example #11
0
def test_GetPreprocessFunction_mock_preprocessor():
  """Test that looking up the mock preprocessor returns MockPreprocessor."""
  mock_name = (
      'deeplearning.clgen.preprocessors.preprocessors_test:MockPreprocessor')
  resolved = preprocessors.GetPreprocessorFunction(mock_name)
  assert resolved == MockPreprocessor
Example #12
0
def test_GetPreprocessFunction_missing_module():
  """Test that a module which cannot be found raises UserError.

  The error message is expected to contain 'not found'.
  """
  bogus_name = 'not.a.real.module:Foo'
  with pytest.raises(errors.UserError) as raised:
    preprocessors.GetPreprocessorFunction(bogus_name)
  message = str(raised.value)
  assert 'not found' in message
Example #13
0
def test_GetPreprocessFunction_empty_string():
  """Test that an empty preprocessor name raises UserError.

  The error message is expected to contain 'Invalid preprocessor name'.
  """
  empty_name = ''
  with pytest.raises(errors.UserError) as raised:
    preprocessors.GetPreprocessorFunction(empty_name)
  message = str(raised.value)
  assert 'Invalid preprocessor name' in message
Example #14
0
def test_GetPreprocessorFunction_absolute_function_not_found(
    tempdir: pathlib.Path):
  """Test that a path to a file this test never writes raises UserError."""
  unwritten_name = f'{tempdir}/foo.py:Preprocess'
  with pytest.raises(errors.UserError):
    preprocessors.GetPreprocessorFunction(unwritten_name)
Example #15
0
def test_GetPreprocessorFunction_absolute_path_not_found(tempdir: pathlib.Path):
  """Test that a name missing from a module file raises UserError.

  Empty contents are written to the module file before the lookup, so the
  requested 'NotFound' name is not defined in it.
  """
  module_file = tempdir / 'foo.py'
  fs.Write(module_file, b"")
  missing_name = f'{module_file}:NotFound'
  with pytest.raises(errors.UserError):
    preprocessors.GetPreprocessorFunction(missing_name)
Example #16
0
def test_GetPreprocessFunction_mock_preprocessor():
    """Test that the mock preprocessor resolves to a function of that name."""
    mock_name = (
        "deeplearning.clgen.preprocessors.preprocessors_test:MockPreprocessor")
    resolved = preprocessors.GetPreprocessorFunction(mock_name)
    assert resolved.__name__ == "MockPreprocessor"