Example #1
def GetBuildInfo() -> config_pb2.BuildInfo:
    """Return the build state."""
    if not _BUILD_INFO.is_file():
        raise OSError("No build_info.pbtxt. Are there runfiles?")
    return pbutil.FromFile(_BUILD_INFO,
                           config_pb2.BuildInfo(),
                           uninitialized_okay=False)
Example #2
def main(argv) -> None:
  """Main entry point."""
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')

  clone_list_path = pathlib.Path(FLAGS.clone_list or "")
  if not clone_list_path.is_file():
    raise app.UsageError('--clone_list is not a file.')
  clone_list = pbutil.FromFile(clone_list_path,
                               scrape_repos_pb2.LanguageCloneList())

  meta_files = []
  for language in clone_list.language:
    directory = pathlib.Path(language.destination_directory)
    if directory.is_dir():
      meta_files += [pathlib.Path(directory / f) for f in directory.iterdir() if
                     IsRepoMetaFile(f)]
  random.shuffle(meta_files)
  worker = AsyncWorker(meta_files)
  logging.info('Cloning %s repos from GitHub ...',
               humanize.intcomma(worker.max))
  bar = progressbar.ProgressBar(max_value=worker.max, redirect_stderr=True)
  worker.start()
  while worker.is_alive():
    bar.update(worker.i)
    worker.join(.5)
  bar.update(worker.i)
Example #3
def PostprocessSampleCorpus(instance: clgen.Instance):
    """Create a corpus from the model samples and pre-process."""
    sample_dir = instance.model.SamplerCache(instance.sampler)

    # Read the sample protos and write them to a directory of content files.
    contentfiles_dir = pathlib.Path(str(sample_dir) + '.contentfiles')
    contentfiles_dir.mkdir(exist_ok=True)
    logging.info('Writing output contentfiles to %s', contentfiles_dir)
    if len(list(contentfiles_dir.iterdir())) != len(list(
            sample_dir.iterdir())):
        for proto_path in sample_dir.iterdir():
            sample = pbutil.FromFile(proto_path, model_pb2.Sample())
            with open(contentfiles_dir / proto_path.name, 'w') as f:
                f.write(sample.text)

    logging.info('Creating output corpus')
    output_corpus_config = corpus_pb2.Corpus()
    output_corpus_config.CopyFrom(instance.model.corpus.config)
    output_corpus_config.local_directory = str(contentfiles_dir)
    # We derive the programming language name from the input corpus directory.
    # This depends on corpuses being in directories named after their language,
    # e.g. ~/corpuses/opencl, or ~/corpuses/java (see the sketch below this
    # example).
    preprocessed_dir = pathlib.Path(
        instance.model.corpus.preprocessed.url[len('sqlite:///'):]).parent
    language = (preprocessed_dir / 'contentfiles').resolve().name
    output_corpus_config.preprocessor[:] = POSTPROCESSORS[language]
    output_corpus = corpuses.Corpus(output_corpus_config)
    try:
        output_corpus.Create()
    except errors.EmptyCorpusException:
        pass
    return output_corpus
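A minimal, self-contained sketch of the language-name derivation commented on above. It assumes a hypothetical layout in which the corpus directory is named after its language (e.g. corpuses/opencl) and the corpus cache next to the preprocessed database contains a 'contentfiles' symlink pointing back at that corpus directory; all paths and names here are illustrative, not taken from the example's codebase.

import pathlib
import tempfile

# Hypothetical layout: a corpus directory named after its language, and a
# cache directory whose 'contentfiles' entry is a symlink back to it.
with tempfile.TemporaryDirectory() as d:
    root = pathlib.Path(d)
    corpus_dir = root / 'corpuses' / 'opencl'
    corpus_dir.mkdir(parents=True)
    cache_dir = root / 'cache' / 'corpus'
    cache_dir.mkdir(parents=True)
    (cache_dir / 'contentfiles').symlink_to(corpus_dir)

    # Same derivation as in PostprocessSampleCorpus(): strip the 'sqlite:///'
    # prefix, take the database file's parent directory, and resolve the
    # 'contentfiles' symlink to recover the language name.
    url = f'sqlite:///{cache_dir}/preprocessed.db'
    preprocessed_dir = pathlib.Path(url[len('sqlite:///'):]).parent
    language = (preprocessed_dir / 'contentfiles').resolve().name
    assert language == 'opencl'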
Example #4
def ConfigFromFlags() -> clgen_pb2.Instance:
  config_path = pathlib.Path(FLAGS.config)
  if not config_path.is_file():
    raise app.UsageError(f"CLgen --config file not found: '{config_path}'")
  config = pbutil.FromFile(config_path, clgen_pb2.Instance())
  os.environ['PWD'] = str(config_path.parent)
  return config
Example #5
def _ReadTestDataStoreFiles() -> datastore_pb2.DataStoreTestSet:
    """Read the config protos for testing.

    The datastore names are derived from the file names.

    Returns:
      A DataStoreTestSet instance.

    Raises:
      AssertionError: In case of error reading datastore configs.
    """
    paths = list(
        pathlib.Path('deeplearning/deepsmith/tests/data/datastores').iterdir())
    assert paths
    names = [p.stem for p in paths]
    protos = [
        pbutil.FromFile(path, datastore_pb2.DataStore()) for path in paths
    ]
    datastore_set = datastore_pb2.DataStoreTestSet()
    for name, proto in zip(names, protos):
        # There's no graceful error handling here, but it's important that we don't
        # run tests on a datastore unless it's specifically marked as testonly.
        assert proto.testonly
        dst_proto = datastore_set.values[name]
        dst_proto.MergeFrom(proto)
    assert len(datastore_set.values) == len(protos) == len(names) == len(paths)
    return datastore_set
Example #6
def CloneFromMetafile(metafile: pathlib.Path) -> None:
  meta = pbutil.FromFile(metafile, scrape_repos_pb2.GitHubRepoMetadata())
  if not (meta.owner and meta.name):
    logging.error('Metafile missing owner and name fields %s', metafile)
    return
  clone_dir = metafile.parent / f'{meta.owner}_{meta.name}'
  logging.debug('%s', meta)
  if (clone_dir / '.git').is_dir():
    return

  # Remove anything left over from a previous attempt.
  subprocess.check_call(['rm', '-rf', str(clone_dir)])

  cmd = ['timeout', f'{FLAGS.repository_clone_timeout_minutes}m',
         '/usr/bin/git', 'clone', meta.clone_from_url, str(clone_dir)]
  logging.debug('$ %s', ' '.join(cmd))

  # Try to checkout the repository and submodules.
  p = subprocess.Popen(cmd + ['--recursive'], stdout=subprocess.PIPE,
                       stderr=subprocess.PIPE, universal_newlines=True)
  _, stderr = p.communicate()
  if p.returncode and 'submodule' in stderr:
    # Remove anything left over from a previous attempt.
    subprocess.check_call(['rm', '-rf', str(clone_dir)])
    # Try again, but this time without cloning submodules.
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                         universal_newlines=True)
    _, stderr = p.communicate()

  if p.returncode:
    # Give up.
    logging.warning('\nClone failed %s:\n%s', meta.clone_from_url, stderr)
    # Remove anything left over.
    subprocess.check_call(['rm', '-rf', str(clone_dir)])
Example #7
def main(argv) -> None:
  """Main entry point."""
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')

  tiers = pbutil.FromFile(pathlib.Path(FLAGS.data_tiers),
                          data_tiers_pb2.DataTiers())
  for tier in tiers.directory:
    logging.info('Processing %s', tier.path)
    _SetDirectorySize(tier)

  if FLAGS.summary:
    # Print the size per directory.
    df = pd.DataFrame([
      {
        'Path': d.path,
        'Tier': d.tier,
        'Size': humanize.naturalsize(d.size_bytes),
        'Size (bytes)': d.size_bytes
      } for d in tiers.directory if d.size_bytes
    ])
    df = df.sort_values(['Tier', 'Size (bytes)'], ascending=[True, False])
    print(df[['Path', 'Tier', 'Size']].to_string(index=False))

    # Print the total size per tier.
    df2 = df.groupby('Tier').sum()
    df2['Size'] = [humanize.naturalsize(d['Size (bytes)'])
                   for _, d in df2.iterrows()]
    df2 = df2.reset_index()
    df2 = df2.sort_values('Tier')
    print()
    print("Totals:")
    print(df2[['Tier', 'Size']].to_string(index=False))
  else:
    print(tiers)
Example #8
def main(argv):
  """Main entry point."""
  if len(argv) > 1:
    raise app.UsageError("Unknown arguments '{}'".format(', '.join(argv[1:])))

  clone_list_path = pathlib.Path(FLAGS.clone_list or '')
  if not clone_list_path.is_file():
    raise app.UsageError('--clone_list is not a file.')
  clone_list = pbutil.FromFile(clone_list_path,
                               scrape_repos_pb2.LanguageCloneList())

  if not FLAGS.export_path:
    raise app.UsageError('--export_path not set.')
  export_path = pathlib.Path(FLAGS.export_path)
  export_path.mkdir(parents=True, exist_ok=True)

  # To export from contentfiles database.
  # for language in clone_list.language:
  #   d = pathlib.Path(language.destination_directory)
  #   d = d.parent / (str(d.name) + '.db')
  #   db = contentfiles.ContentFiles(d)
  #   with db.Session() as session:
  #     (export_path / language.language).mkdir(exist_ok=True)
  #     ExportDatabase(session, export_path / language.language)

  # To export from index directory.
  for language in clone_list.language:
    index_path = pathlib.Path(language.destination_directory + '.index')
    if index_path.is_dir():
      (export_path / language.language).mkdir(exist_ok=True)
      ExportIndex(index_path, export_path / language.language)
Example #9
def main(argv):
    """Main entry point."""
    if len(argv) > 1:
        raise app.UsageError('Unrecognized arguments')

    # Parse flags and instantiate testing objects.
    if not FLAGS.interesting_results_dir:
        raise app.UsageError('--interesting_results_dir must be set')
    interesting_results_dir = pathlib.Path(FLAGS.interesting_results_dir)
    if (interesting_results_dir.exists()
            and not interesting_results_dir.is_dir()):
        raise app.UsageError('--interesting_results_dir must be a directory')
    logging.info('Recording interesting results in %s.',
                 interesting_results_dir)

    for path in interesting_results_dir.iterdir():
        result = pbutil.FromFile(path, deepsmith_pb2.Result())
        print(f'=== BEGIN INTERESTING RESULT {path.stem} ===')
        print('Outcome:', deepsmith_pb2.Result.Outcome.Name(result.outcome))
        print()
        print('OpenCL kernel')
        print('-------------')
        print(fmt.Indent(2, result.testcase.inputs['src']))
        print()
        print('Stderr')
        print('------')
        print(fmt.Indent(2, result.outputs['stderr']))
        print()
Example #10
def main(argv):
    """Main entry point."""
    if len(argv) > 1:
        raise app.UsageError("Unknown arguments: '{}'.".format(' '.join(
            argv[1:])))

    start_time = time.time()
    instances = [
        clgen.Instance(p) for p in pbutil.FromFile(
            pathlib.Path(FLAGS.instances), clgen_pb2.Instances()).instance
    ]
    random.shuffle(instances)
    candidate_instances = collections.deque(instances)
    logging.info('Loaded %d instances in %s ms', len(candidate_instances),
                 humanize.intcomma(int((time.time() - start_time) * 1000)))

    while candidate_instances:
        instance = candidate_instances.popleft()
        with instance.Session():
            if IsEligible(instance):
                logging.info('Found an eligible candidate to work on')
                SampleModel(instance)
                PostprocessSampleCorpus(instance)
            else:
                logging.info('Candidate is ineligible')
                candidate_instances.append(instance)
                time.sleep(1)

    logging.info('Done.')
Example #11
 def ContentFiles(self) -> typing.Iterable[scrape_repos_pb2.ContentFile]:
     """Return an iterator over all contentfiles in the repo."""
     if self.IsIndexed():
         return (pbutil.FromFile(f, scrape_repos_pb2.ContentFile())
                 for f in self.index_dir.iterdir() if f.name != 'DONE.txt')
     else:
         return []
Example #12
def LoadPositiveNegativeProtos(path: pathlib.Path) -> PositiveNegativeDataset:
    """Load positive and negative training protos from a directory."""
    positive_protos = [
        pbutil.FromFile(p,
                        fish_pb2.CompilerCrashDiscriminatorTrainingExample())
        for p in path.iterdir() if p.name.startswith('positive-')
    ]
    logging.info('Loaded %s positive protos',
                 humanize.intcomma(len(positive_protos)))
    negative_protos = [
        pbutil.FromFile(p,
                        fish_pb2.CompilerCrashDiscriminatorTrainingExample())
        for p in path.iterdir() if p.name.startswith('negative-')
    ]
    logging.info('Loaded %s negative protos',
                 humanize.intcomma(len(negative_protos)))
    return PositiveNegativeDataset(positive_protos, negative_protos)
Example #13
 def EpochTelemetry(self) -> typing.List[telemetry_pb2.ModelEpochTelemetry]:
     """Return the epoch telemetry files."""
     return [
         pbutil.FromFile(self.logdir / p,
                         telemetry_pb2.ModelEpochTelemetry())
         for p in sorted(self.logdir.iterdir())
         if re.match(r'epoch_\d\d+_telemetry\.pbtxt', str(p.name))
     ]
Example #14
def test_main_stop_after_train(abc_instance_file):
    """Test that --stop_after train trains the model."""
    app.FLAGS.unparse_flags()
    app.FLAGS(
        ['argv[0]', '--config', abc_instance_file, '--stop_after', 'train'])
    clgen.main([])
    instance = clgen.Instance(
        pbutil.FromFile(pathlib.Path(abc_instance_file), clgen_pb2.Instance()))
    assert instance.model.is_trained
Example #15
def test_main_stop_after_corpus(abc_instance_file):
    """Test that --stop_after corpus prevents model training."""
    app.FLAGS.unparse_flags()
    app.FLAGS(
        ['argv[0]', '--config', abc_instance_file, '--stop_after', 'corpus'])
    clgen.main([])
    instance = clgen.Instance(
        pbutil.FromFile(pathlib.Path(abc_instance_file), clgen_pb2.Instance()))
    assert not instance.model.is_trained
Example #16
def test_FromFile_required_fields_not_set_uninitialized_okay(suffix):
  """Test that DecodeError not raised if required fields not set."""
  with tempfile.NamedTemporaryFile(prefix='labm8_proto_',
                                   suffix=suffix) as f:
    proto_in = test_protos_pb2.AnotherTestMessage(number=1)
    pbutil.ToFile(proto_in, pathlib.Path(f.name))
    pbutil.FromFile(pathlib.Path(f.name), test_protos_pb2.TestMessage(),
                    uninitialized_okay=True)
Example #17
def test_FromFile_required_fields_not_set(suffix):
  """Test that DecodeError raised if required fields not set."""
  with tempfile.NamedTemporaryFile(prefix='labm8_proto_',
                                   suffix=suffix) as f:
    pbutil.ToFile(test_protos_pb2.AnotherTestMessage(number=1),
                  pathlib.Path(f.name))
    with pytest.raises(pbutil.DecodeError) as e_info:
      pbutil.FromFile(pathlib.Path(f.name), test_protos_pb2.TestMessage())
    assert f"Required fields not set: '{f.name}'" == str(e_info.value)
Example #18
def GeneratorFromFlag(config_class,
                      generator_class) -> base_generator.GeneratorServiceBase:
    """Instantiate a generator from the --generator_config flag."""
    if not pbutil.ProtoIsReadable(FLAGS.generator_config, config_class()):
        raise app.UsageError(
            f'--generator_config is not a {config_class.__name__} proto')
    config = pbutil.FromFile(pathlib.Path(FLAGS.generator_config),
                             config_class())
    return generator_class(config)
Example #19
  def ProtoFromFile(cls, path: pathlib.Path) -> deepsmith_pb2.Testcase:
    """Instantiate a protocol buffer testcase from file.

    Args:
      path: Path to the testcase proto file.

    Returns:
      Testcase message instance.
    """
    return pbutil.FromFile(path, deepsmith_pb2.Testcase())
Example #20
def CreateInstanceProtoFromFlags() -> clgen_pb2.Instance:
  if FLAGS.clgen_instance:
    return pbutil.FromFile(
        pathlib.Path(FLAGS.clgen_instance), clgen_pb2.Instance())
  else:
    return clgen_pb2.Instance(
        working_dir=FLAGS.clgen_working_dir,
        model=CreateModelProtoFromFlags(),
        sampler=CreateSamplerProtoFromFlags(),
    )
Example #21
    def ProtoFromFile(cls, path: pathlib.Path) -> deepsmith_pb2.Result:
        """Instantiate a protocol buffer result from file.

        Args:
          path: Path to the result proto file.

        Returns:
          Result message instance.
        """
        return pbutil.FromFile(path, deepsmith_pb2.Result())
Example #22
def ServiceConfigFromFlag(
        flag_name: str,
        service_config: pbutil.ProtocolBuffer) -> pbutil.ProtocolBuffer:
    if not getattr(FLAGS, flag_name):
        raise app.UsageError(f'--{flag_name} not set.')
    config_path = pathlib.Path(getattr(FLAGS, flag_name))
    if not config_path.is_file():
        cls_name = type(service_config).__name__
        raise app.UsageError(f"{cls_name} file not found: '{config_path}'.")

    return pbutil.FromFile(config_path, service_config)
Example #23
def DoFlagsAction():
    """Do the action requested by the command line flags."""
    if not FLAGS.config:
        raise app.UsageError("Missing required argument: '--config'")
    config_path = pathlib.Path(FLAGS.config)
    if not config_path.is_file():
        raise app.UsageError(f"File not found: '{config_path}'")
    config = pbutil.FromFile(config_path, clgen_pb2.Instance())
    os.environ['PWD'] = str(config_path.parent)

    if FLAGS.clgen_profiling:
        prof.enable()

    instance = Instance(config)
    with instance.Session():
        if FLAGS.print_cache_path == 'corpus':
            print(instance.model.corpus.cache.path)
            return
        elif FLAGS.print_cache_path == 'model':
            print(instance.model.cache.path)
            return
        elif FLAGS.print_cache_path == 'sampler':
            print(instance.model.SamplerCache(instance.sampler))
            return
        elif FLAGS.print_cache_path:
            raise app.UsageError(
                f"Invalid --print_cache_path argument: '{FLAGS.print_cache_path}'"
            )

        if FLAGS.print_preprocessed:
            print(instance.model.corpus.GetTextCorpus(shuffle=False))
            return

        # The default action is to sample the model.
        if FLAGS.stop_after == 'corpus':
            instance.model.corpus.Create()
        elif FLAGS.stop_after == 'train':
            instance.model.Train()
            logging.info('Model: %s', instance.model.cache.path)
        elif FLAGS.stop_after:
            raise app.UsageError(
                f"Invalid --stop_after argument: '{FLAGS.stop_after}'")
        elif FLAGS.export_model:
            instance.model.Train()
            export_dir = pathlib.Path(FLAGS.export_model)
            for path in instance.model.InferenceManifest():
                relpath = pathlib.Path(
                    os.path.relpath(path, instance.model.cache.path))
                (export_dir / relpath.parent).mkdir(parents=True,
                                                    exist_ok=True)
                shutil.copyfile(path, export_dir / relpath)
                print(export_dir / relpath)
        else:
            instance.model.Sample(instance.sampler, FLAGS.min_samples)
Example #24
    def FromFile(cls, path: pathlib.Path) -> 'DataStore':
        """Instantiate a DataStore from a config file.

        Args:
          path: Path to the datastore config proto file.

        Returns:
          A DataStore instance.
        """
        config = pbutil.FromFile(path, datastore_pb2.DataStore())
        return DataStore(config)
Example #25
def PackDataPackage(package_dir: pathlib.Path) -> None:
    """Create an archive and sidecar of a package."""
    manifest = pbutil.FromFile(package_dir / 'MANIFEST.pbtxt',
                               dpack_pb2.DataPackage())
    PackageManifestIsValid(package_dir, manifest)
    archive_path = (package_dir /
                    f'../{package_dir.name}.dpack.tar.bz2').resolve()
    sidecar_path = (package_dir /
                    f'../{package_dir.name}.dpack.pbtxt').resolve()
    CreatePackageArchive(package_dir, manifest, archive_path)
    CreatePackageArchiveSidecar(archive_path, manifest, sidecar_path)
Example #26
def main(argv: typing.List[str]):
    """Main entry point."""
    if len(argv) > 1:
        raise app.UsageError("Unknown arguments: '{}'.".format(' '.join(
            argv[1:])))

    path = pathlib.Path(FLAGS.delayed_reward_experiment_path)
    data = pbutil.FromFile(path, random_opt_pb2.DelayedRewardExperiment())
    # graph = DelayedRewardExperimentToGraph(data)
    # print(graph.ToDot())
    dot = DelayedRewardExperimentToDot(data)
    print(dot.source)
Example #27
def test_ToFile_FromFile_equivalence(suffix):
  """Test that ToFile() and FromFile() are symmetrical."""
  with tempfile.TemporaryDirectory(prefix='labm8_proto_') as d:
    path = pathlib.Path(d) / f'proto{suffix}'
    proto_in = test_protos_pb2.TestMessage(string='abc', number=1)
    pbutil.ToFile(proto_in, path)
    assert path.is_file()
    proto_out = test_protos_pb2.TestMessage()
    pbutil.FromFile(path, proto_out)
    assert proto_out.string == 'abc'
    assert proto_out.number == 1
    assert proto_in == proto_out
Example #28
def GetProtos(export_path: pathlib.Path, outcomes: typing.List[str],
              max_src_len: int) -> typing.List[TrainingProto]:
    paths = sorted(
        labtypes.flatten(
            [list((export_path / outcome).iterdir()) for outcome in outcomes]))
    protos = []
    for path in paths:
        proto = pbutil.FromFile(path, TrainingProto())
        if len(proto.src) > max_src_len:
            continue
        protos.append(proto)
    return protos
Example #29
def test_config_is_valid():
    """Test that config proto is valid."""
    with tempfile.TemporaryDirectory() as d:
        config = pbutil.FromFile(
            bazelutil.DataPath(
                'phd/deeplearning/clgen/tests/data/c99/config.pbtxt'),
            clgen_pb2.Instance())
        # Change the working directory and corpus path to our bazel run dir.
        config.working_dir = d
        config.model.corpus.local_directory = str(
            bazelutil.DataPath('phd/deeplearning/clgen/tests/data/c99/src/'))
        clgen.Instance(config)
Example #30
def VerifyManifest(package_dir: pathlib.Path) -> bool:
    """Verify that the MANIFEST.pbtext file matches the contents."""
    if not (package_dir / 'MANIFEST.pbtxt').is_file():
        logging.info('%s/MANIFEST.pbtxt missing, nothing to do.', package_dir)
        return False
    manifest = pbutil.FromFile(package_dir / 'MANIFEST.pbtxt',
                               dpack_pb2.DataPackage())
    if not PackageManifestIsValid(package_dir, manifest):
        logging.error('Package %s contains errors.', package_dir)
        return False
    logging.info('%s verified. No changes to files in the manifest.',
                 package_dir)
    return True