Example #1
def test_ToFile_path_is_directory(suffix):
  """Test that IsADirectoryError raised if path is a directory."""
  with tempfile.TemporaryDirectory(suffix=suffix) as d:
    proto = test_protos_pb2.TestMessage(string='abc', number=1)
    with pytest.raises(IsADirectoryError) as e_info:
      pbutil.ToFile(proto, pathlib.Path(d))
    assert str(e_info.value).endswith(f"Is a directory: '{d}'")
Example #2
def CreatePackageArchiveSidecar(archive_path: pathlib.Path,
                                manifest: dpack_pb2.DataPackage,
                                sidecar_path: pathlib.Path) -> None:
    """Create a sidecar manifest to accompany an archive.

  Args:
    archive_path: The path of the archive tarball.
    manifest: A DataPackage manifest instance.
    sidecar_path: The path of the sidecar to create.

  Raises:
    OSError: If sidecar_path already exists, or archive_path does not.
  """
    if sidecar_path.exists():
        raise OSError(f'Refusing to overwrite {sidecar_path}.')
    if not archive_path.is_file():
        raise OSError(f'Archive {archive_path} does not exist.')

    sidecar = dpack_pb2.DataPackage()
    sidecar.CopyFrom(manifest)
    # Clear the file attributes. Only the file names and comments are stored
    # in the sidecar.
    for f in sidecar.file:
        if not f.comment:
            f.ClearField("comment")
        f.ClearField("size_in_bytes")
        f.ClearField("checksum_hash")
        f.ClearField("checksum")
    sidecar.checksum_hash = dpack_pb2.SHA256
    sidecar.checksum = crypto.sha256_file(archive_path)
    pbutil.ToFile(sidecar, sidecar_path)
    logging.info('Wrote %s', sidecar_path.absolute())
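A minimal usage sketch for CreatePackageArchiveSidecar(); the file names here are illustrative, and the sketch assumes an archive and a readable MANIFEST.pbtxt already exist:

archive = pathlib.Path('package.tar.bz2')
manifest = pbutil.FromFile(pathlib.Path('MANIFEST.pbtxt'), dpack_pb2.DataPackage())
CreatePackageArchiveSidecar(archive, manifest,
                            pathlib.Path('package.tar.bz2.sidecar.pbtxt'))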
Example #3
def _CreateTestRepo(root_dir: pathlib.Path, owner: str, name: str) -> None:
    """Create an empty repo for testing indexers."""
    owner_name = f'{owner}_{name}'
    (root_dir / owner_name / '.git').mkdir(parents=True)
    (root_dir / owner_name / 'src').mkdir(parents=True)
    pbutil.ToFile(scrape_repos_pb2.GitHubRepoMetadata(owner=owner, name=name),
                  root_dir / f'{owner_name}.pbtxt')
Example #4
def test_ToFile_message_missing_required_fields(suffix):
  """Test that EncodeError is raised if required field is not set."""
  with tempfile.NamedTemporaryFile(prefix='labm8_proto_',
                                   suffix=suffix) as f:
    proto = test_protos_pb2.TestMessage(number=1)
    with pytest.raises(pbutil.EncodeError):
      pbutil.ToFile(proto, pathlib.Path(f.name))
Example #5
def test_ProtoBackedMixin_FromProtoFile(suffix: str, tempdir: pathlib.Path):
  """Test FromProtoFile constructor for proto backed class."""
  proto_path = tempdir / f'proto{suffix}'
  pbutil.ToFile(test_protos_pb2.TestMessage(string="Hello, world!", number=42),
                proto_path)

  instance = TestMessage.FromProtoFile(proto_path)
  assert instance.string == "Hello, world!"
  assert instance.number == 42
Example #6
def test_FromFile_required_fields_not_set_uninitialized_okay(suffix):
  """Test that DecodeError not raised if required fields not set."""
  with tempfile.NamedTemporaryFile(prefix='labm8_proto_',
                                   suffix=suffix) as f:
    proto_in = test_protos_pb2.AnotherTestMessage(number=1)
    pbutil.ToFile(proto_in, pathlib.Path(f.name))
    pbutil.FromFile(pathlib.Path(f.name), test_protos_pb2.TestMessage(),
                    uninitialized_okay=True)
Example #7
def test_FromFile_required_fields_not_set(suffix):
  """Test that DecodeError raised if required fields not set."""
  with tempfile.NamedTemporaryFile(prefix='labm8_proto_',
                                   suffix=suffix) as f:
    pbutil.ToFile(test_protos_pb2.AnotherTestMessage(number=1),
                  pathlib.Path(f.name))
    with pytest.raises(pbutil.DecodeError) as e_info:
      pbutil.FromFile(pathlib.Path(f.name), test_protos_pb2.TestMessage())
    assert f"Required fields not set: '{f.name}'" == str(e_info.value)
Example #8
def ExportOpenCLResults(cursor, start_id, proto_dir):
    """Export crash-discriminator training examples as proto files, in batches."""
    batch_size = 1000
    result_id = start_id
    while True:
        logging.info('Exporting batch of %s results',
                     humanize.intcomma(batch_size))
        cursor.execute(
            """
SELECT
  results.id,
  assertions.assertion,
  results.outcome,
  programs.src
FROM results
LEFT JOIN testbeds ON results.testbed_id = testbeds.id
LEFT JOIN platforms ON testbeds.platform_id = platforms.id
LEFT JOIN testcases ON results.testcase_id = testcases.id
LEFT JOIN programs ON testcases.program_id = programs.id
LEFT JOIN stderrs ON results.stderr_id = stderrs.id
LEFT JOIN assertions ON stderrs.assertion_id = assertions.id
WHERE results.id >= %s
AND programs.generator = 1
AND testbeds.id = (
  SELECT testbeds.id
    FROM testbeds
    LEFT JOIN platforms ON testbeds.platform_id=platforms.id
  WHERE platform = 'clang'
  AND driver = '3.6.2'
)
ORDER BY results.id
LIMIT %s
""", (result_id, batch_size))
        i = 0
        for row in cursor:
            i += 1
            (
                result_id,
                assertion_text,
                outcome_num,
                program_src,
            ) = row

            outcome = fish_pb2.CompilerCrashDiscriminatorTrainingExample.Outcome.Name(
                outcome_num).lower()
            proto = fish_pb2.CompilerCrashDiscriminatorTrainingExample(
                src=program_src,
                outcome=outcome_num,
                raised_assertion=bool(assertion_text),
                assertion_name=(GetClangAssertionStub(assertion_text)
                                if assertion_text else ''))
            pbutil.ToFile(proto,
                          proto_dir / outcome / (str(result_id) + '.pbtxt'))

        # If we received fewer results than the requested batch size, then we
        # have run out of data.
        if i < batch_size:
            return
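The query above pages through the results table in id order. Because the predicate is 'results.id >= %s' and result_id is updated to the last id seen, every batch after the first re-reads the final row of the previous batch; that is harmless here, since the proto file is simply rewritten. A minimal sketch of the same keyset-pagination loop with an exclusive lower bound (the table and column names are illustrative):

def ExportInBatches(cursor, start_id: int, batch_size: int = 1000) -> None:
    """Sketch: page through rows by strictly increasing id."""
    last_id = start_id - 1
    while True:
        cursor.execute(
            'SELECT id, payload FROM results WHERE id > %s ORDER BY id LIMIT %s',
            (last_id, batch_size))
        rows = cursor.fetchall()
        for row_id, payload in rows:
            last_id = row_id
            # ... build a proto from `payload` and write it with pbutil.ToFile().
        if len(rows) < batch_size:
            return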
Example #9
 def _create_lock():
     lockfile = lockfile_pb2.LockFile(
         owner_process_id=os.getpid() if pid is None else pid,
         owner_process_argv=' '.join(sys.argv),
         date_acquired_utc_epoch_ms=labdate.MillisecondsTimestamp(
             labdate.GetUtcMillisecondsNow()),
         owner_hostname=system.HOSTNAME,
         owner_user=system.USERNAME)
     pbutil.ToFile(lockfile, self.path, assume_filename='LOCK.pbtxt')
Example #10
 def EpochEndCallback(self, epoch: int, loss: float):
     now = labdate.MillisecondsTimestamp()
     epoch_time_ms = now - self.last_epoch_begin_timestamp
     telemetry = telemetry_pb2.ModelEpochTelemetry(
         timestamp_utc_epoch_ms=now,
         epoch_num=epoch,
         epoch_wall_time_ms=epoch_time_ms,
         loss=loss,
     )
     pbutil.ToFile(telemetry,
                   self.logdir / f'epoch_{epoch:03d}_telemetry.pbtxt')
Example #11
def test_ImportFromLanguage_Java_repo(tempdir: pathlib.Path):
    """An end-to-end test of a Java importer."""
    (tempdir / 'src').mkdir()
    (tempdir / 'src' / 'Owner_Name' / '.git').mkdir(parents=True)
    (tempdir / 'src' / 'Owner_Name' / 'src').mkdir(parents=True)

    # A repo will only be imported if there is a repo meta file.
    pbutil.ToFile(
        scrape_repos_pb2.GitHubRepoMetadata(owner='Owner', name='Name'),
        tempdir / 'src' / 'Owner_Name.pbtxt')

    # Create some files in our test repo.
    with open(tempdir / 'src' / 'Owner_Name' / 'src' / 'A.java', 'w') as f:
        f.write("""
public class A {
  public static void helloWorld() {
    System.out.println("Hello, world!");
  }
}
""")
    with open(tempdir / 'src' / 'Owner_Name' / 'src' / 'B.java', 'w') as f:
        f.write("""
public class B {
  private static int foo() {return 5;}
}
""")
    with open(tempdir / 'src' / 'Owner_Name' / 'README.txt', 'w') as f:
        f.write('Hello, world!')

    language = scrape_repos_pb2.LanguageToClone(
        language='foolang',
        query=[],
        destination_directory=str(tempdir / 'src'),
        importer=[
            scrape_repos_pb2.ContentFilesImporterConfig(
                source_code_pattern='.*\\.java',
                preprocessor=[
                    "datasets.github.scrape_repos.preprocessors."
                    "extractors:JavaMethods"
                ]),
        ])
    indexer.ImportFromLanguage(language, multiprocessing.Pool(1))

    test_repo = github_repo.GitHubRepo(tempdir / 'src' / 'Owner_Name.pbtxt')
    assert (test_repo.index_dir / 'DONE.txt').is_file()
    assert len(list(test_repo.index_dir.iterdir())) == 3
    contentfiles = list(test_repo.ContentFiles())
    assert len(contentfiles) == 2
    assert set([cf.text for cf in contentfiles]) == {
        ('public static void helloWorld(){\n'
         '  System.out.println("Hello, world!");\n}\n'),
        'private static int foo(){\n  return 5;\n}\n',
    }
Example #12
def test_ToFile_FromFile_equivalence(suffix):
  """Test that ToFile() and FromFile() are symmetrical."""
  with tempfile.TemporaryDirectory(prefix='labm8_proto_') as d:
    path = pathlib.Path(d) / f'proto{suffix}'
    proto_in = test_protos_pb2.TestMessage(string='abc', number=1)
    pbutil.ToFile(proto_in, path)
    assert path.is_file()
    proto_out = test_protos_pb2.TestMessage()
    pbutil.FromFile(path, proto_out)
    assert proto_out.string == 'abc'
    assert proto_out.number == 1
    assert proto_in == proto_out
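The suffix parametrization matters because pbutil appears to pick the encoding from the file suffix: text format for '.pbtxt' and, by assumption here, binary otherwise. A sketch of the same round trip over both suffixes, relying on FromFile() returning the filled-in message:

with tempfile.TemporaryDirectory(prefix='labm8_proto_') as d:
  msg = test_protos_pb2.TestMessage(string='abc', number=1)
  for name in ('proto.pbtxt', 'proto.pb'):  # text vs. (assumed) binary encoding
    path = pathlib.Path(d) / name
    pbutil.ToFile(msg, path)
    assert pbutil.FromFile(path, test_protos_pb2.TestMessage()) == msg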
Example #13
def InitManifest(package_dir: pathlib.Path,
                 contents: typing.List[pathlib.Path], update: bool) -> None:
    """Write the MANIFEST.pbtxt file for a package."""
    manifest = CreatePackageManifest(package_dir, contents)
    manifest_path = package_dir / 'MANIFEST.pbtxt'
    if update and pbutil.ProtoIsReadable(manifest_path,
                                         dpack_pb2.DataPackage()):
        old = pbutil.FromFile(manifest_path, dpack_pb2.DataPackage())
        MergeManifests(manifest, old)
    elif manifest_path.is_file():
        raise OSError('Refusing to overwrite MANIFEST.pbtxt file.')
    pbutil.ToFile(manifest, manifest_path)
    logging.info('Wrote %s', manifest_path.absolute())
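A usage sketch for InitManifest(); the package path is illustrative, and it is an assumption here that contents holds package-relative file paths:

package_dir = pathlib.Path('my_package')
contents = [p.relative_to(package_dir)
            for p in sorted(package_dir.rglob('*')) if p.is_file()]
InitManifest(package_dir, contents, update=True)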
Example #14
def test_ProtoBackedMixin_FromFile(tempdir: pathlib.Path):
    """Test FromProto constructor for proto backed tables."""
    base = declarative.declarative_base()

    class TestMessage(AbstractTestMessage, base):
        pass

    pbutil.ToFile(
        test_protos_pb2.TestMessage(string="Hello, world!", number=42),
        tempdir / 'proto.pb')

    row = TestMessage(**TestMessage.FromFile(tempdir / 'proto.pb'))
    assert row.string == "Hello, world!"
    assert row.number == 42
Example #15
def TestingLoop(min_interesting_results: int,
                max_testing_time_seconds: int,
                batch_size: int,
                generator: base_generator.GeneratorServiceBase,
                dut_harness: base_harness.HarnessBase,
                gs_harness: base_harness.HarnessBase,
                filters: difftests.FiltersBase,
                interesting_results_dir: pathlib.Path,
                start_time: float = None) -> None:
    """The main fuzzing loop.

  Args:
    min_interesting_results: The minimum number of interesting results to find.
    max_testing_time_seconds: The maximum time allowed to find interesting
      results.
    batch_size: The number of testcases to generate and execute in each batch.
    generator: A testcase generator.
    dut_harness: The device under test.
    gs_harness: The device to compare outputs against.
    filters: A filters instance for testcases.
    interesting_results_dir: The directory to write interesting results to.
    start_time: The starting time, as returned by time.time(). If not provided,
      the starting time will be the moment that this function is called. Set
      this value if you would like to include initialization overhead in the
      calculated testing time.
  """
    start_time = start_time or time.time()
    interesting_results_dir.mkdir(parents=True, exist_ok=True)
    num_interesting_results = 0
    batch_num = 0
    while (num_interesting_results < min_interesting_results
           and time.time() < start_time + max_testing_time_seconds):
        batch_num += 1
        logging.info('Starting generate / test / eval batch %d ...', batch_num)
        interesting_results = RunBatch(generator, dut_harness, gs_harness,
                                       filters, batch_size)
        num_interesting_results += len(interesting_results)
        for result in interesting_results:
            pbutil.ToFile(
                result, interesting_results_dir /
                (str(labdate.MillisecondsTimestamp()) + '.pbtxt'))

    logging.info(
        'Stopping after %.2f seconds and %s batches (%.0fms / testcase).\n'
        'Found %s interesting results.',
        time.time() - start_time, humanize.intcomma(batch_num),
        (((time.time() - start_time) / max(batch_num * batch_size, 1)) * 1000),
        num_interesting_results)
    logging.flush()
Example #16
    def MakeRepositoryMetas(self,
                            repos: typing.List[Repository.Repository]) -> None:
        """Make meta files for a list of repositories.

    Args:
      repos: A list of GitHub Repository instances.
    """
        logging.debug('Scraping %s repositories',
                      humanize.intcomma(len(repos)))
        for repo in repos:
            self.i += 1
            concat_name = '_'.join([repo.owner.login, repo.name])
            clone_dir = self.destination_directory / concat_name
            meta_path = pathlib.Path(str(clone_dir) + '.pbtxt')
            if not pbutil.ProtoIsReadable(
                    meta_path, scrape_repos_pb2.GitHubRepoMetadata()):
                meta = GetRepositoryMetadata(repo)
                logging.debug('%s', meta)
                pbutil.ToFile(meta, meta_path)
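The ProtoIsReadable() guard is what makes the scrape restartable: a meta file is only written when a readable one is not already on disk. The same write-once idiom works for any proto; a minimal sketch:

def WriteProtoOnce(proto, path: pathlib.Path) -> None:
    """Sketch: skip the write if a readable proto of the same type exists."""
    if not pbutil.ProtoIsReadable(path, type(proto)()):
        pbutil.ToFile(proto, path)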
Example #17
def SampleModel(instance: clgen.Instance) -> None:
    """Take --output_corpus_size samples from model."""
    logging.info('Training and sampling the CLgen model ...')
    target_samples = FLAGS.output_corpus_size
    sample_dir = instance.model.SamplerCache(instance.sampler)
    sample_dir.mkdir(exist_ok=True)
    num_samples = len(list(sample_dir.iterdir()))
    logging.info('Need to generate %d samples in %s',
                 max(target_samples - num_samples, 0), sample_dir)
    if num_samples < target_samples:
        sample_lock = lockfile.LockFile(sample_dir / 'LOCK')
        with sample_lock.acquire(replace_stale=True, block=True):
            num_samples = len(list(sample_dir.iterdir()))
            while num_samples < target_samples:
                samples = instance.model.SampleFast(
                    instance.sampler, target_samples - num_samples)
                for sample in samples:
                    sample_id = crypto.sha256_str(sample.text)
                    pbutil.ToFile(sample, sample_dir / f'{sample_id}.pbtxt')
                num_samples = len(list(sample_dir.iterdir()))
Example #18
def IndexContentFiles(job: scrape_repos_pb2.ImportWorker) -> None:
    """Index content files."""
    relpath = job.abspath[len(str(job.clone_dir)) + 1:]
    try:
        texts = preprocessors.Preprocess(pathlib.Path(job.clone_dir), relpath,
                                         job.all_files_relpaths,
                                         job.preprocessors)
        for i, text in enumerate(texts):
            sha256 = hashlib.sha256(text.encode('utf-8'))
            proto = scrape_repos_pb2.ContentFile(
                clone_from_url=job.clone_from_url,
                relpath=relpath,
                artifact_index=i,
                sha256=sha256.digest(),
                charcount=len(text),
                linecount=len(text.split('\n')),
                text=text)
            path = pathlib.Path(job.index_dir) / (
                binascii.hexlify(proto.sha256).decode('utf-8') + '.pbtxt')
            pbutil.ToFile(proto, path)
    except UnicodeDecodeError:
        logging.warning('Failed to decode %s', relpath)
Example #19
def GenerateTestcases(generator_config: generator_pb2.ClgenGenerator,
                      output_directory: pathlib.Path,
                      num_testcases: int) -> None:
    logging.info('Writing output to %s', output_directory)
    (output_directory / 'generated_kernels').mkdir(parents=True, exist_ok=True)
    (output_directory / 'generated_testcases').mkdir(parents=True,
                                                     exist_ok=True)

    logging.info('Preparing test case generator.')
    generator = clgen.ClgenGenerator(generator_config)

    # Generate testcases.
    logging.info('Generating %d testcases ...', num_testcases)
    req = generator_pb2.GenerateTestcasesRequest()
    req.num_testcases = num_testcases
    res = generator.GenerateTestcases(req, None)

    for testcase in res.testcases:
        # Write kernel to file.
        kernel = testcase.inputs['src']
        kernel_id = crypto.md5_str(kernel)
        with open(output_directory / 'generated_kernels' / f'{kernel_id}.cl',
                  'w') as f:
            f.write(kernel)

        # Write testcase to file.
        testcase_id = crypto.md5_str(str(testcase))
        pbutil.ToFile(
            testcase,
            output_directory / 'generated_testcases' / f'{testcase_id}.pbtxt')

    logging.info('%d testcases written to %s', num_testcases,
                 output_directory / 'generated_testcases')
    generation_times = [
        testcase.profiling_events[0].duration_ms for testcase in res.testcases
    ]
    logging.info('Average time to generate testcase: %.2f ms',
                 sum(generation_times) / len(generation_times))
Example #20
def DifftestTestcase(s: db.session_t, t: testcase.Testcase,
                     outdir: pathlib.Path) -> None:
    """Difftest a testcase."""
    results = list(s.query(result.Result).filter(result.Result.testcase == t))
    for r in results:
        r.output_class = GetResultOutputClass(r)
    majority = GetMajorityOutput(results)

    def OutputPath(result_class: str) -> pathlib.Path:
        """Return the output path for the current result (closes over r)."""
        try:
            if r.testbed.opts['opencl_opt'] == 'enabled':
                opt = '+'
            elif r.testbed.opts['opencl_opt'] == 'disabled':
                opt = '-'
            else:
                raise KeyError
        except KeyError:
            raise LookupError(str(r.testbed))
        testbeds = sorted(x[0] for x in s.query(testbed.Testbed.name))
        result_dir = (outdir / result_class /
                      str(testbeds.index(r.testbed.name)) / opt)
        result_dir.mkdir(parents=True, exist_ok=True)
        return result_dir / (str(r.id) + '.pbtxt')

    for r in results:
        if r.output_class == 'Build crash':
            pbutil.ToFile(r.ToProto(), OutputPath('bc'))
        elif r.output_class == 'Build timeout':
            pbutil.ToFile(r.ToProto(), OutputPath('bto'))
        elif (majority.majority_outcome == 'Pass'
              and r.output_class == 'Build failure'):
            pbutil.ToFile(r.ToProto(), OutputPath('abf'))
        elif (majority.majority_outcome == 'Pass'
              and r.output_class == 'Runtime crash'):
            pbutil.ToFile(r.ToProto(), OutputPath('arc'))
        elif (r.outputs['stdout'] != majority.majority_stdout
              and majority.majority_outcome == 'Pass'
              and majority.stdout_majority_size >= math.ceil(
                  2 * majority.outcome_majority_size / 3)):
            pbutil.ToFile(r.ToProto(), OutputPath('awo'))
        else:
            pbutil.ToFile(r.ToProto(), OutputPath('pass'))
Example #21
 def _WriteMetafile(self) -> None:
     pbutil.ToFile(self.meta,
                   pathlib.Path(self.cache.keypath('META.pbtxt')))
Example #22
      lockfile = lockfile_pb2.LockFile(
        owner_process_id=os.getpid() if pid is None else pid,
        owner_process_argv=" ".join(sys.argv),
        date_acquired_utc_epoch_ms=labdate.MillisecondsTimestamp(
          labdate.GetUtcMillisecondsNow(),
        ),
        owner_hostname=system.HOSTNAME,
        owner_user=system.USERNAME,
      )
      pbutil.ToFile(lockfile, self.path, assume_filename="LOCK.pbtxt")
Example #23
 def OnSample(self, sample: model_pb2.Sample) -> bool:
     """Sample receive callback. Returns True if sampling should continue."""
     sample_id = crypto.sha256_str(sample.text)
     sample_path = self.cache_path / f'{sample_id}.pbtxt'
     pbutil.ToFile(sample, sample_path)
     return True
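Because the callback's return value controls whether sampling continues, capping the number of samples is a small change; a sketch of such a variant (the max_samples and num_samples attributes are assumptions, not part of the original class):

 def OnSample(self, sample: model_pb2.Sample) -> bool:
     """Sketch: persist every sample, stop once max_samples have been written."""
     sample_id = crypto.sha256_str(sample.text)
     pbutil.ToFile(sample, self.cache_path / f'{sample_id}.pbtxt')
     self.num_samples = getattr(self, 'num_samples', 0) + 1
     return self.num_samples < self.max_samples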
Example #24
def main(argv):
    """Main entry point."""
    if len(argv) > 1:
        raise app.UsageError("Unknown arguments: '{}'.".format(' '.join(
            argv[1:])))

    if not FLAGS.export_path:
        raise app.UsageError('--export_path must be a directory')
    export_path = pathlib.Path(FLAGS.export_path)
    if export_path.is_file():
        raise app.UsageError('--export_path must be a directory')
    export_path.mkdir(parents=True, exist_ok=True)

    if not FLAGS.dataset_root:
        raise app.UsageError('--dataset_root must be a directory')
    dataset_root = pathlib.Path(FLAGS.dataset_root)
    if dataset_root.is_file():
        raise app.UsageError('--dataset_root must be a directory')
    dataset_root.mkdir(parents=True, exist_ok=True)

    ratios = DatasetRatios(FLAGS.training_ratio, FLAGS.validation_ratio,
                           FLAGS.testing_ratio)
    assert sum(ratios) <= 1

    # Load protos.
    positive_protos = LoadPositiveProtos(export_path,
                                         FLAGS.positive_class_outcomes,
                                         FLAGS.max_src_len, FLAGS.max_protos,
                                         FLAGS.assertions_only)
    positive_protos, negative_protos = LoadNegativeProtos(
        export_path, positive_protos, FLAGS.negative_class_outcomes,
        FLAGS.max_src_len, FLAGS.balance_class_lengths,
        FLAGS.balance_class_counts)

    positive_sizes = DatasetSizes(
        int(len(positive_protos) * FLAGS.training_ratio),
        int(len(positive_protos) * FLAGS.validation_ratio),
        int(len(positive_protos) * FLAGS.testing_ratio),
    )
    negative_sizes = DatasetSizes(
        int(len(negative_protos) * FLAGS.training_ratio),
        int(len(negative_protos) * FLAGS.validation_ratio),
        int(len(negative_protos) * FLAGS.testing_ratio),
    )

    # Create output directories.
    (dataset_root / 'training').mkdir(exist_ok=True, parents=True)
    (dataset_root / 'validation').mkdir(exist_ok=True, parents=True)
    (dataset_root / 'testing').mkdir(exist_ok=True, parents=True)

    logging.info('Shuffling protos with seed %d', FLAGS.seed)
    random.seed(FLAGS.seed)
    random.shuffle(positive_protos)
    random.shuffle(negative_protos)

    for i, proto in enumerate(positive_protos[:positive_sizes[0]]):
        pbutil.ToFile(proto,
                      (dataset_root / 'training' / f'positive-{i:04d}.pbtxt'))
    for i, proto in enumerate(negative_protos[:negative_sizes[0]]):
        pbutil.ToFile(proto,
                      (dataset_root / 'training' / f'negative-{i:04d}.pbtxt'))
    logging.info('Wrote %s training examples',
                 humanize.intcomma(positive_sizes[0] + negative_sizes[0]))
    positive_protos = positive_protos[positive_sizes[0]:]
    negative_protos = negative_protos[negative_sizes[0]:]

    for i, proto in enumerate(positive_protos[:positive_sizes[1]]):
        pbutil.ToFile(
            proto, (dataset_root / 'validation' / f'positive-{i:04d}.pbtxt'))
    for i, proto in enumerate(negative_protos[:negative_sizes[1]]):
        pbutil.ToFile(
            proto, (dataset_root / 'validation' / f'negative-{i:04d}.pbtxt'))
    logging.info('Wrote %s validation examples',
                 humanize.intcomma(positive_sizes[1] + negative_sizes[1]))
    positive_protos = positive_protos[positive_sizes[1]:]
    negative_protos = negative_protos[negative_sizes[1]:]

    for i, proto in enumerate(positive_protos[:positive_sizes[2]]):
        pbutil.ToFile(proto,
                      (dataset_root / 'testing' / f'positive-{i:04d}.pbtxt'))
    for i, proto in enumerate(negative_protos[:negative_sizes[2]]):
        pbutil.ToFile(proto,
                      (dataset_root / 'testing' / f'negative-{i:04d}.pbtxt'))
    logging.info('Wrote %s testing examples',
                 humanize.intcomma(positive_sizes[2] + negative_sizes[2]))
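Each of the three blocks above repeats the same pattern per class: compute split sizes against the full list length, slice off one split, then drop the consumed prefix. A condensed sketch of that partitioning (the helper name is illustrative):

def SplitByRatio(protos: list, ratios) -> list:
    """Sketch: partition a shuffled list into consecutive ratio-sized slices."""
    splits, offset = [], 0
    for ratio in ratios:
        size = int(len(protos) * ratio)
        splits.append(protos[offset:offset + size])
        offset += size
    return splits  # e.g. [training, validation, testing]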
Example #25
def test_ToFile_parent_directory_does_not_exist(suffix):
  """Test that FileNotFoundError raised if parent directory doesn't exist."""
  with tempfile.TemporaryDirectory() as d:
    proto = test_protos_pb2.TestMessage(string='abc', number=1)
    with pytest.raises(FileNotFoundError):
      pbutil.ToFile(proto, pathlib.Path(d) / 'notadir' / f'proto{suffix}')
Example #26
    def Sample(self,
               sampler: samplers.Sampler,
               min_num_samples: int,
               seed: int = None) -> typing.List[model_pb2.Sample]:
        """Sample a model.

    If the model is not already trained, calling Sample() first trains the
    model. Thus a call to Sample() is equivalent to calling Train() then
    Sample().

    Args:
      sampler: The sampler to sample using.
      min_num_samples: The minimum number of samples to return. Note that the
        true number of samples returned may be higher than this value, as
        sampling occurs in batches. The model will continue producing samples
        until it reaches the lowest multiple of the sampler batch size that is
        greater than or equal to this value. E.g. if min_num_samples is 7 and
        the sampler batch size is 10, 10 samples will be returned.
      seed: A numeric value to seed the RNG with. If not present, the RNG is
        seeded randomly.

    Returns:
      A list of Sample protos.

    Raises:
      UnableToAcquireLockError: If the model is locked (i.e. there is another
        process currently modifying the model).
      InvalidStartText: If the sampler start text cannot be encoded.
      InvalidSymtokTokens: If the sampler symmetrical depth tokens cannot be
        encoded.
    """
        self.Train()

        sample_count = 1
        self.SamplerCache(sampler).mkdir(exist_ok=True)
        with logutil.TeeLogsToFile(f'sampler_{sampler.hash}',
                                   self.cache.path / 'logs'):
            logging.info("Sampling: '%s'", sampler.start_text)
            if min_num_samples < 0:
                logging.warning(
                    'Entering an infinite sample loop, this process will never end!'
                )
            sample_start_time = labdate.MillisecondsTimestamp()

            atomizer = self.corpus.atomizer
            sampler.Specialize(atomizer)
            batch_size = self.backend.InitSampling(sampler, seed)

            samples = []
            sample_dir = self.SamplerCache(sampler)

            # Per-sample batch outer loop. Continues until we have as many samples
            # as we want.
            while True:
                samples_in_progress = [
                    sampler.tokenized_start_text.copy()
                    for _ in range(batch_size)
                ]
                done = np.zeros(batch_size, dtype=bool)
                start_time = labdate.MillisecondsTimestamp()
                wall_time_start = start_time

                self.backend.InitSampleBatch(sampler, batch_size)

                # Sampling loop. Continues until all samples in the batch are done.
                while True:
                    indices = self.backend.SampleNextIndices(
                        sampler, batch_size)

                    # Iterate over all samples in batch to determine whether they're
                    # done.
                    for i in range(batch_size):
                        if done[i]:
                            continue

                        token = atomizer.decoder[indices[i]]
                        samples_in_progress[i].append(token)
                        if sampler.SampleIsComplete(samples_in_progress[i]):
                            end_time = labdate.MillisecondsTimestamp()
                            done[i] = 1
                            sample = model_pb2.Sample(
                                text=''.join(samples_in_progress[i]),
                                sample_start_epoch_ms_utc=start_time,
                                sample_time_ms=end_time - start_time,
                                wall_time_ms=end_time - wall_time_start,
                                num_tokens=len(samples_in_progress[i]))
                            print(f'=== BEGIN CLGEN SAMPLE {sample_count} '
                                  f'===\n\n{sample.text}\n')
                            sample_count += 1
                            sample_id = crypto.sha256_str(sample.text)
                            sample_path = sample_dir / f'{sample_id}.pbtxt'
                            pbutil.ToFile(sample, sample_path)
                            if min_num_samples > 0:
                                samples.append(sample)
                            wall_time_start = labdate.MillisecondsTimestamp()

                    # Complete the batch.
                    if done.all():
                        break

                # Complete sampling. Note that sample_count starts at 1.
                if sample_count > min_num_samples:
                    now = labdate.MillisecondsTimestamp()
                    logging.info(
                        'Produced %s samples at a rate of %s ms / sample.',
                        humanize.intcomma(len(samples)),
                        humanize.intcomma(
                            int((now - sample_start_time) /
                                max(len(samples), 1))))
                    break

        return samples
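The rounding rule in the docstring reduces to one line of arithmetic; a sketch, not code from the model itself:

import math

def ExpectedSampleCount(min_num_samples: int, batch_size: int) -> int:
    """Sketch: the lowest multiple of batch_size that is >= min_num_samples."""
    return batch_size * math.ceil(min_num_samples / batch_size)

assert ExpectedSampleCount(7, 10) == 10  # the docstring's own example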
Example #27
def dummy_lockfile_path(
        dummy_lockfile_proto: lockfile_pb2.LockFile) -> pathlib.Path:
    """Yield a path to a lockfile proto."""
    with tempfile.TemporaryDirectory() as d:
        pbutil.ToFile(dummy_lockfile_proto, pathlib.Path(d) / 'LOCK.pbtxt')
        yield pathlib.Path(d) / 'LOCK.pbtxt'
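A sketch of a test consuming the fixture above; the test body is illustrative:

def test_dummy_lockfile_path_is_readable(dummy_lockfile_path: pathlib.Path):
    """Sketch: the fixture yields a path to a readable LockFile proto."""
    assert pbutil.ProtoIsReadable(dummy_lockfile_path, lockfile_pb2.LockFile())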
Example #28
def abc_instance_file(abc_instance_config) -> str:
    """A test fixture that returns a path to an Instance config file."""
    with tempfile.NamedTemporaryFile() as f:
        pbutil.ToFile(abc_instance_config, pathlib.Path(f.name))
        yield f.name
Example #29
def main(argv):
    """Main entry point."""
    if len(argv) > 1:
        raise app.UsageError("Unknown arguments: '{}'.".format(' '.join(
            argv[1:])))

    model_dir = pathlib.Path(FLAGS.reachability_model_dir)
    model_dir.mkdir(parents=True, exist_ok=True)
    (model_dir / 'logs').mkdir(exist_ok=True)
    (model_dir / 'checkpoints').mkdir(exist_ok=True)

    logging.info('Generating graphs dataset ...')
    data = MakeReachabilityDataset(FLAGS.reachability_num_training_graphs +
                                   FLAGS.reachability_num_testing_graphs)
    training_data = reachability_pb2.ReachabilityDataset()
    training_data.entry.extend(
        data.entry[:FLAGS.reachability_num_training_graphs])
    pbutil.ToFile(training_data, model_dir / 'training_data.pbtxt')
    testing_data = reachability_pb2.ReachabilityDataset()
    testing_data.entry.extend(
        data.entry[FLAGS.reachability_num_training_graphs:])
    pbutil.ToFile(testing_data, model_dir / 'testing_data.pbtxt')

    logging.info('Number of training examples: %s.',
                 humanize.intcomma(len(training_data.entry)))
    logging.info('Number of testing examples: %s.',
                 humanize.intcomma(len(testing_data.entry)))

    n = FLAGS.reachability_num_nodes
    sequence_length = GetSequenceLength(FLAGS.reachability_num_nodes)
    logging.info('Using sequence length %s.',
                 humanize.intcomma(sequence_length))
    seqs = [ControlFlowGraphToSequence(entry.graph) for entry in data.entry]
    text = '\n'.join(seqs)
    logging.info('Deriving atomizer from %s chars.',
                 humanize.intcomma(len(text)))
    atomizer = atomizers.AsciiCharacterAtomizer.FromText(text)
    logging.info('Vocabulary size: %s.',
                 humanize.intcomma(len(atomizer.vocab)))
    with open(model_dir / 'atomizer.pkl', 'wb') as f:
        pickle.dump(atomizer, f)
    logging.info('Pickled atomizer to %s.', model_dir / 'atomizer.pkl')

    x, y = ProtosToModelData(training_data, sequence_length, atomizer)
    logging.info('Training data: x %s, y[%s] %s', x.shape, len(y), y[0].shape)

    test_x, test_y = ProtosToModelData(testing_data, sequence_length, atomizer)
    logging.info('Testing data: x %s, y[%s] %s', test_x.shape, len(test_y),
                 test_y[0].shape)

    num_uniq_seqs = len(set(seqs))
    logging.info('Unique sequences: %s of %s (%.2f %%)',
                 humanize.intcomma(num_uniq_seqs),
                 humanize.intcomma(len(seqs)),
                 (num_uniq_seqs / len(seqs)) * 100)
    num_uniq_labels = len(
        set([''.join(str(x) for x in e.reachable) for e in data.entry]))
    logging.info('Unique labels: %s of %s (%.2f %%)',
                 humanize.intcomma(num_uniq_labels),
                 humanize.intcomma(len(seqs)),
                 (num_uniq_labels / len(seqs)) * 100)

    np.random.seed(FLAGS.reachability_model_seed)
    random.seed(FLAGS.reachability_model_seed)
    logging.info('Building Keras model ...')
    model = BuildKerasModel(sequence_length=sequence_length,
                            num_classes=n,
                            lstm_size=FLAGS.lstm_size,
                            num_layers=FLAGS.num_layers,
                            dnn_size=FLAGS.dnn_size,
                            atomizer=atomizer)

    model_json = model.to_json()
    with open(model_dir / 'model.json', 'w') as f:
        f.write(model_json)
    logging.info('Wrote model to %s', model_dir / 'model.json')

    logging.info('Training model ...')

    def OnEpochEnd(epoch, logs):
        """End-of-epoch model evaluate."""
        del logs
        logging.info('Evaluating model at epoch %d', epoch)
        # row is [overall loss, per-class losses..., per-class accuracies...].
        row = model.evaluate(test_x,
                             test_y,
                             batch_size=FLAGS.batch_size,
                             verbose=0)
        overall_loss, losses, accuracies = row[0], row[1:1 + n], row[n + 1:]
        logging.info('Accuracy (excluding first class): %.2f %%',
                     (sum(accuracies[1:]) / len(accuracies[1:])) * 100)

    logger = telemetry.TrainingLogger(logdir=model_dir / 'logs')
    model.fit(
        x,
        y,
        epochs=FLAGS.num_epochs,
        batch_size=FLAGS.batch_size,
        verbose=True,
        shuffle=True,
        callbacks=[
            keras.callbacks.ModelCheckpoint(str(model_dir / 'checkpoints') +
                                            '/weights_{epoch:03d}.hdf5',
                                            verbose=1,
                                            mode="min",
                                            save_best_only=False),
            keras.callbacks.LambdaCallback(on_epoch_end=OnEpochEnd),
            logger.KerasCallback(keras),
        ])

    for i in range(5):
        outs = FlattenModelOutputs(model.predict(np.array([x[i]])))
        logging.info('outs:    %s', outs)
        logging.info('clamped: %s', np.rint(outs).astype(np.int32))
        logging.info('true:    %s', FlattenModelData(y, i))
        logging.info('')
    logging.info('done')
Example #30
def clsmith_result(dummy_result: deepsmith_pb2.Result) -> pathlib.Path:
    """A test fixture which returns a dummy CLSmith result."""
    dummy_result.testcase.harness.name = 'cl_launcher'
    with tempfile.TemporaryDirectory(prefix='phd_') as d:
        pbutil.ToFile(dummy_result, pathlib.Path(d) / 'result.pbtxt')
        yield pathlib.Path(d) / 'result.pbtxt'