Example #1
    def __init__(
        self,
        client: bigquery.Client,
        dataset_id: typing.Optional[str] = None,
        extensions: typing.Optional[typing.List[str]] = None,
    ):
        """Generic Dataset class constructor. Not to be used directly."""
        self.client = client
        self.dataset, self.tables = self._setupDataset(
            "{}.clgen_{}_github".format(self.client.project, dataset_id
                                        or "generic"))
        self.queryConfig = lambda qt, qr=[], dr=False: bigquery.QueryJobConfig(
            destination=self.tables[qt],
            write_disposition='WRITE_TRUNCATE',
            query_parameters=qr,
            dry_run=dr,
        )

        self.extensions = extensions
        self.query_file_id = ""
        if self.extensions is not None:
            self.query_file_id = " OR ".join([
                "substr(file.path, {}, {}) = '{}'".format(
                    -len(ext), 1 + len(ext), ext) for ext in self.extensions
            ])
        self.file_count = None
        l.logger().info("{} dataset initialized.".format(self.language))
        return
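The queryConfig lambda above attaches a destination table and an optional dry-run flag to each query. For reference, here is a minimal standalone sketch of BigQuery's dry-run cost estimation; the credentials, project, and query text are assumptions, not part of the original class:

from google.cloud import bigquery

# Assumes GCP credentials and a default project are configured.
client = bigquery.Client()
config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
job = client.query(
    "SELECT COUNT(*) FROM `bigquery-public-data.github_repos.files`",
    job_config=config,
)
# A dry run bills nothing and reports the scan size up front.
print("Estimated bytes processed:", job.total_bytes_processed)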
Example #2
    def _initTensors(self):
        if self.current_step > 0:
            if self.jsonfile.exists():
                with open(self.jsonfile, 'r') as js:
                    loaded_tensors = json.load(js)
                    if loaded_tensors[-1]['step'] > self.current_step:
                        # If previous sessions have written beyond current step, overwrite them.
                        back_index = -2
                        while loaded_tensors[back_index][
                                'step'] > self.current_step:
                            back_index -= 1
                        self.tensors = loaded_tensors[:back_index + 1]
                    else:
                        self.tensors = loaded_tensors

                    for ch in self.tensors:
                        for k, v in ch.items():
                            if k == 'step':
                                continue
                            if k not in self.plot_tensors:
                                self.plot_tensors[k] = {
                                    'value': [],
                                    'step': []
                                }
                            self.plot_tensors[k]['value'].append(v)
                            self.plot_tensors[k]['step'].append(ch['step'])
            else:
                l.logger().error(
                    "Training json log-file not found. Will keep track from this point on."
                )
        return
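The backtracking loop above walks the loaded list from the end until it finds the last entry at or before current_step. Assuming entries are sorted by 'step', the same truncation can be written as a filter; this is an illustrative equivalent with a hypothetical helper name, not the project's code:

import json
import pathlib
import typing

def truncate_log(jsonfile: pathlib.Path, current_step: int) -> typing.List[dict]:
    """Keep only log entries written at or before current_step."""
    with open(jsonfile, 'r') as js:
        entries = json.load(js)
    # Equivalent to the while-loop above when entries are sorted by 'step'.
    return [e for e in entries if e['step'] <= current_step]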
Example #3
    def filecount_query(self) -> typing.Tuple[int, int]:
        """
    Queries the file count of files intended to query.
    Returns file count in int.
    """
        query = """
    SELECT COUNT(*)
    FROM `bigquery-public-data.github_repos.files` as file
    {}
    """.format("" if not self.query_file_id else "WHERE " + self.query_file_id)

        dry_run_job = self.client.query(query,
                                        job_config=self.queryConfig(
                                            'main_files', dr=True))
        l.logger().warn("This query is going to consume {}".format(
            humanize.naturalsize(dry_run_job.total_bytes_processed)))
        l.logger().info(query)
        if FLAGS.bq_wait_permission:
            l.logger().warn("Hit any button to continue...")
            try:
                input()
            except KeyboardInterrupt:
                return (0, 0)

        l.logger().info("Running file count query...")

        try:
            job = self.client.query(query)
            for f in job:
                self.file_count = (f[0], 0)
                return self.file_count
        except google.api_core.exceptions.Forbidden as e:
            l.logger().error(e)
            exit()
Example #4
    def remove_identical_files(self) -> None:

        l.logger().info("Removing duplicate files from mined corpus...")
        if not os.path.isfile(str(self.cache_path / "record.json")):
            # Nothing to deduplicate if the mining record is missing.
            return
        with open(self.cache_path / "record.json", 'r') as f:
            data = json.load(f)
            repos = data[0]
            length = data[1]['total_files']

        cache_map = {}
        for i in range(length):
            with open(self.cache_path / "{}.cl".format(i), 'r') as f:
                cf = f.read()
                cf_hash = crypto.sha256_str(cf)
                if cf_hash not in cache_map:
                    cache_map[cf_hash] = cf

        new_path = self.cache_path / "distinct_corpus"
        new_path.mkdir(exist_ok=True, parents=True)
        for k, v in cache_map.items():
            with open(new_path / "{}.cl".format(k), 'w') as f:
                f.write(v)

        with open(new_path / "record.json", 'w') as f:
            data[1]['total_files'] = len(cache_map)
            json.dump(data, f, indent=2)
        return
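crypto.sha256_str is project-internal; a plain-hashlib sketch of the same hash-keyed deduplication, with a hypothetical helper name:

import hashlib
import typing

def dedup_by_content(texts: typing.Iterable[str]) -> typing.List[str]:
    """Keep one representative per distinct sha256 content hash."""
    unique = {}
    for text in texts:
        digest = hashlib.sha256(text.encode('utf-8')).hexdigest()
        unique.setdefault(digest, text)  # first occurrence wins
    return list(unique.values())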
Example #5
def getOpenCLPlatforms() -> None:
    """
  Identify compatible OpenCL platforms for current system.
  """
    global CL_PLATFORMS
    CL_PLATFORMS = {
        'CPU': None,
        'GPU': None,
    }
    try:
        cmd = subprocess.Popen(
            "{} --clinfo".format(CLDRIVE).split(),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
        )
        stdout, stderr = cmd.communicate()
        if stderr:
            raise ValueError(stderr)
    except Exception as e:
        # cmd and stdout may be unbound if Popen itself failed; log the
        # command string and bail out instead of crashing below.
        l.logger().error("{} --clinfo".format(CLDRIVE))
        l.logger().error(e)
        return
    lines = stdout.split('\n')
    for line in lines:
        if line and line[:3] == "GPU" and not CL_PLATFORMS['GPU']:
            CL_PLATFORMS['GPU'] = line
        elif line and line[:3] == "CPU" and not CL_PLATFORMS['CPU']:
            CL_PLATFORMS['CPU'] = line
    return
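For a one-shot command like --clinfo, subprocess.run is the simpler modern alternative to Popen plus communicate; a hedged stand-in for the call above, with the binary path left as a parameter:

import subprocess

def run_clinfo(binary: str) -> str:
    """Run '<binary> --clinfo' and return stdout, raising if anything hits stderr."""
    result = subprocess.run(
        [binary, "--clinfo"],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0 or result.stderr:
        raise ValueError(result.stderr)
    return result.stdout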
Example #6
  def initOrGetQueue(self) -> np.ndarray:
    """
    If feed queue is not initialized, initialize it by getting new datapoint.
    Otherwise, don't do anything as feed_queue is already loaded from checkpoint.
    Adds datapoint to InputFeed table of database.

    Returns:
      Starting input feed of sampling.
    """
    if not self.feed_queue:
      try:
        cf = next(self.loader).squeeze(0)
      except StopIteration:
        self.loader = iter(self.dataloader)
        cf = next(self.loader).squeeze(0)
      cf = [int(x) for x in cf]
      self.feed_queue.append(
        ActiveSampleFeed(
          input_feed     = cf,
          input_features = extractor.ExtractFeatures(self.tokenizer.ArrayToCode(cf), [self.feat_sampler.feature_space])[self.feat_sampler.feature_space],
          input_score    = math.inf,
          gen_id         = 0,
        )
      )
      if environment.WORLD_RANK == 0:
        self.addToDB(
          active_feed_database.ActiveInput.FromArgs(
            tokenizer      = self.tokenizer, id = self.active_db.input_count,
            input_feed     = cf, input_features = self.feed_queue[-1].input_features,
          )
        )
    l.logger().info("Feed queue input scores: {}".format(', '.join([str(round(c.input_score, 3)) for c in self.feed_queue])))
    return self.feed_queue[0].input_feed
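The StopIteration handler above rebuilds the iterator whenever the dataloader runs dry. A generic sketch of that endless-batches pattern, assuming the dataloader can be re-iterated:

def infinite_batches(dataloader):
    """Yield batches forever, restarting the dataloader when it is exhausted."""
    while True:
        for batch in dataloader:
            yield batch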
Example #7
    def argmax(self, t):
        """Sample argmax from a tensor."""
        if self.use_categorical:
            try:
                ct = torch.distributions.relaxed_categorical.RelaxedOneHotCategorical(
                    temperature=self.temperature
                    if self.temperature is not None else 1.0,
                    logits=t,
                    validate_args=False
                    if "1.9." in torch.__version__ else None,
                ).sample()
            except ValueError as e:
                dump_cf = ""
                dump_types = ""
                p = pathlib.Path("./dump_argmax_error.log").absolute()
                if not p.exists():
                    l.logger().error(t.shape)
                    l.logger().error(p)
                    for d0 in t:
                        for d1 in d0:
                            dump_cf += str(d1) + ", "
                            if isinstance(d1, torch.Tensor):
                                dump_types += str(d1.type()) + ", "
                            else:
                                dump_types += str(type(d1)) + ", "
                    with open(p, 'w') as outf:
                        outf.write(
                            str(t.shape) + "\n\n\n" + dump_cf + "\n\n\n" +
                            dump_types)
                raise e
        else:
            # No relaxed categorical sampling: argmax over the raw logits.
            ct = t

        return torch.argmax(ct, dim=-1)
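RelaxedOneHotCategorical is PyTorch's Gumbel-softmax distribution: it draws a differentiable, near-one-hot sample from logits, which argmax then hardens into token ids. A minimal standalone demo of that sampling path:

import torch

logits = torch.randn(4, 10)  # batch of 4 over a 10-token vocabulary
dist = torch.distributions.relaxed_categorical.RelaxedOneHotCategorical(
    temperature=torch.tensor(1.0),
    logits=logits,
)
soft_sample = dist.sample()                    # each row is a soft one-hot summing to 1
token_ids = torch.argmax(soft_sample, dim=-1)  # hard token choice, shape (4,)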
Example #8
    def __init__(self, config: active_learning_pb2.ActiveLearner):
        """Instantiate a model.

    Args:
      config: An ActiveLearner message.

    Raises:
      TypeError: If the config argument is not an ActiveLearner proto.
      UserError: In case of an invalid config.
    """
        # Error early, so that a cache isn't created.
        if not isinstance(config, active_learning_pb2.ActiveLearner):
            t = type(config).__name__
            raise TypeError(
                f"Config must be an ActiveLearner proto. Received: '{t}'")

        self.config = active_learning_pb2.ActiveLearner()
        # Validate config options.
        self.config.CopyFrom(AssertConfigIsValid(config))

        distrib.lock()
        self.cache = cache.mkcache("active_model")
        distrib.unlock()

        self.downstream_task = downstream_tasks.DownstreamTask.FromTask(
            self.config.downstream_task, self.config.training_corpus)

        if environment.WORLD_RANK == 0:
            ## Store current commit
            commit.saveCommit(self.cache.path)
        self.backend = active_committee.QueryByCommittee(
            self.config, self.cache, self.downstream_task)
        l.logger().info("Initialized {} in {}".format(self.backend,
                                                      self.cache.path))
        return
Example #9
  def Specialize(self, tokenizer: tokenizers.TokenizerBase) -> None:
    """Specialize a sampler a vocabulary.

    This enables the sampler to set state specialized to a specific encoding
    vocabulary. This is guaranteed to be called before SampleIsComplete(), and
    ensures that the vocabulary used for all sample arguments to
    SampleIsComplete() is from this vocabulary.

    Args:
      tokenizer: A tokenizer to specialize to.

    Raises:
      InvalidStartText: If the start_text cannot be encoded using the
        vocabulary.
      UserError: In case the sampler cannot be specialized to this vocabulary.
    """
    try:
      self.encoded_start_text = tokenizer.TokenizeString(self.start_text)
      self.tokenized_start_text = tokenizer.AtomizeString(self.start_text)
    except ValueError:
      raise ValueError(
        "Sampler start text cannot be encoded using the corpus vocabulary: "
        f"'{self.start_text}'"
      )

    if len(self.encoded_start_text) > self.sequence_length:
      raise ValueError(
        "Encoded sampler start text must be less than sampler sequence "
        f"length. Sampler sequence length={self.sequence_length}, encoded "
        f"start text length={len(self.encoded_start_text)}"
      )
    l.logger().info("Sampling: '{}'\n".format(self.start_text))
    for terminator in self.terminators:
      terminator.Specialize(tokenizer)
Example #10
    def Train(self, **kwargs) -> "Model":
        """Train the model.

    Returns:
      The model instance.

    Raises:
      UnableToAcquireLockError: If the model is locked (i.e. there is another
        process currently modifying the model).
    """
        self.Create()

        self.backend.Train(self.corpus, **kwargs)
        telemetry_logs = self.backend.telemetry.EpochTelemetry()

        l.logger().info("Trained model for {} {} in {} ms. "
                        "Training loss: {}.".format(
                            telemetry_logs[-1].epoch_num,
                            "steps" if isinstance(self.backend, tf_bert.tfBert)
                            or isinstance(self.backend, torch_bert.torchBert)
                            else "epochs",
                            humanize.intcomma(
                                sum(t.epoch_wall_time_ms
                                    for t in telemetry_logs)),
                            telemetry_logs[-1].loss,
                        ))
        return self
Example #11
    def repository_query(
        self
    ) -> typing.Tuple[typing.Optional[bigquery.table.RowIterator], None]:
        """
    Queries the name/branch of repositories that contain files with the
    requested specifications (e.g. OpenCL files).

    Returns a tuple whose first element iterates over the query results,
    or (None, None) if the query is aborted.
    """
        query = """
    SELECT DISTINCT file.repo_name, file.ref
    FROM `bigquery-public-data.github_repos.files` as file
    {}
    """.format("" if not self.query_file_id else "WHERE " + self.query_file_id)

        dry_run_job = self.client.query(query,
                                        job_config=self.queryConfig(
                                            'repositories', dr=True))
        l.logger().warn("This query is going to consume {}".format(
            humanize.naturalsize(dry_run_job.total_bytes_processed)))
        l.logger().info(query)
        if FLAGS.bq_wait_permission:
            l.logger().warn("Hit any button to continue...")
            try:
                input()
            except KeyboardInterrupt:
                return (None, None)

        l.logger().info("Retrieving repository list of specs...")

        try:
            rows = self.client.query(
                query, job_config=self.queryConfig('repositories')).result()
        except google.api_core.exceptions.Forbidden as e:
            l.logger().error(e)
            exit()
        return (rows, None)
Example #12
def FromText(config, contentfile_separator: str, corpus_txt: str):
    mask_tokens = False if config.mask_tokens is None else config.mask_tokens

    if config.token_type == "character":
        if config.token_list:
            l.logger().warning(
                "token list in character-based tokenization is going to be ignored."
            )
        return AsciiCharacterTokenizer.FromText(corpus_txt, mask_tokens)
    elif config.token_type == "word":
        with open(config.token_list, 'r') as f:
            token_set = json.load(f)
            token_set = set(token_set['opencl']['tokens'])
        wpc_tok = False if config.wordpiece_tokenization is None else config.wordpiece_tokenization
        return WordTokenizer.FromText(corpus_txt, token_set, mask_tokens,
                                      wpc_tok)
    elif config.token_type == "ast":
        if config.token_list:
            with open(config.token_list, 'r') as f:
                token_set = json.load(f)
                token_set = set(token_set['opencl']['tokens'])
        else:
            token_set = set()
        return ASTokenizer.FromText(corpus_txt, token_set,
                                    contentfile_separator, mask_tokens)
    else:
        raise NotImplementedError
Example #13
def get_data_features(
        db,
        tokenizer,
        size_limit=None
) -> typing.List[typing.Dict[str, typing.Dict[str, float]]]:
    """
  Get or set feature with data list of tuples.
  """
    datapoints = []
    db_feats = db.get_data_features(tokenizer, size_limit)
    for inp in tqdm.tqdm(db_feats, total=len(db_feats), desc="Fetch data"):
        feats = workers.ContentFeat(inp)
        if len(inp) == 2:
            src, _ = inp
            include = ""
        else:
            src, include, _ = inp
        try:
            datapoints.append({
                "GreweFeatures": feats["GreweFeatures"],
                "AutophaseFeatures": feats["AutophaseFeatures"],
                "InstCountFeatures": feats["InstCountFeatures"],
            })
        except KeyError as e:
            l.logger().warn(e)
    return datapoints
Example #14
 def __init__(
     self,
     workspace: pathlib.Path,
     feature_space: str,
     target: str,
     git_corpus: typing.Optional[corpuses.Corpus] = None,
 ):
     self.target = target
     self.benchmarks = []
     if self.target != "grid_walk":
         self.path = pathlib.Path(targets[target]).resolve()
     self.workspace = workspace
     self.feature_space = feature_space
     self.reduced_git_corpus = [
         (cf, feats[self.feature_space])
         for cf, feats in git_corpus.getFeaturesContents(
             sequence_length=768)
         if self.feature_space in feats and feats[self.feature_space]
     ]
     self.loadCheckpoint()
     try:
         self.target_benchmark = self.benchmarks.pop(0)
         l.logger().info("Target benchmark: {}\nTarget fetures: {}".format(
             self.target_benchmark.name, self.target_benchmark.features))
     except IndexError:
         self.target_benchmark = None
     return
Example #15
    def __init__(self, *args, **kwargs):

        super(QueryByCommittee, self).__init__(*args, **kwargs)

        from deeplearning.clgen.util import pytorch
        if not pytorch.initialized:
            pytorch.initPytorch()

        self.pytorch = pytorch
        self.torch = pytorch.torch
        self.torch_tpu_available = pytorch.torch_tpu_available

        self.torch.manual_seed(self.config.committee.random_seed)
        self.torch.cuda.manual_seed_all(self.config.committee.random_seed)

        self.ckpt_path = self.cache.path / "checkpoints"
        self.sample_path = self.cache.path / "samples"
        self.logfile_path = self.cache.path / "logs"

        self.validation_results_file = "val_results.txt"
        self.validation_results_path = self.logfile_path / self.validation_results_file

        self.committee = []

        self.is_validated = False
        self.trained = False
        l.logger().info("Active Committee config initialized in {}".format(
            self.cache.path))
        return
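Seeding both the CPU generator and every CUDA device, as the constructor does above, is the standard recipe for reproducible PyTorch runs. In isolation:

import torch

SEED = 12345
torch.manual_seed(SEED)           # CPU generators
torch.cuda.manual_seed_all(SEED)  # all visible GPUs; a no-op without CUDA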
Example #16
    def __init__(self, *args, **kwargs):

        super(tfBert, self).__init__(*args, **kwargs)

        from deeplearning.clgen.util import tf
        tf.initTensorflow()

        self.tf = tf.tf
        self.bertAttrs = None
        self.bert_config = None

        self.train = None
        self.sample = None
        self.predict_generator = None
        self.sampler = None

        self.train_batch_size = None
        self.eval_batch_size = None
        self.learning_rate = None
        self.num_train_steps = None
        self.num_warmup_steps = None

        self.ckpt_path = self._ConfigCheckpointParams()
        self.logfile_path = self.cache.path / "logs"
        self.sample_path = self.cache.path / "samples"
        self.telemetry = telemetry.TrainingLogger(self.logfile_path)

        self.is_validated = False
        l.logger().info("BERT Model config initialized in {}".format(
            self.cache.path))
        return
Example #17
def yield_cl_kernels(
        path: pathlib.Path
) -> typing.List[typing.Tuple[pathlib.Path, str, str]]:
    """
  Fetch all cl files from base path and atomize, preprocess
  kernels to single instances.

  Original benchmarks extracted from suites, go through a series of pre-processors:
  1. Include statements are removed.
  2. Code is preprocessed with shim (macro expansion).
  3. Double underscores are removed.
  4. void kernel -> kernel void
  5. Translation units are split to tuples of (kernel, utility/global space)
  """
    contentfiles = iter_cl_files(path)
    kernels = []
    pool = multiprocessing.Pool()
    for kernel_batch in tqdm.tqdm(pool.map(preprocessor_worker, contentfiles),
                                  total=len(contentfiles),
                                  desc="Yield {} benchmarks".format(
                                      path.stem)):
        kernels += kernel_batch
    l.logger().info("Pre-processed {} OpenCL benchmarks".format(len(kernels)))
    pool.close()
    return kernels
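One caveat in the loop above: pool.map blocks until every worker has finished, so the progress bar jumps straight to 100%. imap streams results and keeps the bar live; a self-contained sketch with a toy worker standing in for preprocessor_worker:

import multiprocessing

import tqdm

def _toy_worker(x: int) -> list:
    """Stands in for preprocessor_worker: returns one batch per input."""
    return [x * x]

if __name__ == "__main__":
    items = list(range(100))
    kernels = []
    with multiprocessing.Pool() as pool:
        for batch in tqdm.tqdm(pool.imap(_toy_worker, items),
                               total=len(items), desc="Process"):
            kernels += batch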
Example #18
    def _ConfigCheckpointParams(self):
        if FLAGS.select_checkpoint_step >= 0:

            ckpt_current = self.cache.path / "checkpoints"
            if not (ckpt_current / "model.ckpt-{}.index".format(
                    FLAGS.select_checkpoint_step)).exists():
                raise FileNotFoundError(
                    ckpt_current /
                    "model.ckpt-{}.index".format(FLAGS.select_checkpoint_step))

            workspace_rel_path = self.cache.path.relative_to(
                pathlib.Path(os.environ.get("CLGEN_CACHE")).parent)
            ckpt_path = pathlib.Path("/tmp" / workspace_rel_path /
                                     "checkpoints")
            ckpt_path.mkdir(exist_ok=True, parents=True)

            shutil.copy2(ckpt_current / "checkpoint", ckpt_path)
            shutil.copy2(ckpt_current / "graph.pbtxt", ckpt_path)
            for ckpt_file in glob.glob(
                    str(ckpt_current / "model.ckpt-{}.*".format(
                        FLAGS.select_checkpoint_step))):
                shutil.copy2(ckpt_file, ckpt_path)
            l.logger().warn(
                "Explicit checkpoint selected. Explicit checkpoints can only be used for validation or sampling."
            )
        elif FLAGS.select_checkpoint_step == -1:
            ckpt_path = self.cache.path / "checkpoints"
        else:
            raise ValueError(
                "Invalid value {} for --select_checkpoint_step".format(
                    FLAGS.select_checkpoint_step))
        l.logger().info("Configured model checkpoints in {}".format(ckpt_path))
        return ckpt_path
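The glob-and-copy step above needs nothing beyond the standard library; a sketch with hypothetical paths and a fixed step number:

import glob
import pathlib
import shutil

src = pathlib.Path("/tmp/ckpts")           # hypothetical source checkpoint dir
dst = pathlib.Path("/tmp/ckpts_selected")  # hypothetical scratch copy
dst.mkdir(parents=True, exist_ok=True)
for ckpt_file in glob.glob(str(src / "model.ckpt-100.*")):
    shutil.copy2(ckpt_file, dst)  # copy2 preserves metadata such as mtimes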
Example #19
def to_unique_samples(db: SamplesDatabase, out_db: SamplesDatabase) -> None:
    """
  Read input database, pass through deterministic re-writer and keep only unique samples.
  """
    pool = multiprocessing.Pool()
    inp_data = [x for x in db.get_data]
    visited = set()
    data = []
    try:
        for sha, sample in tqdm.tqdm(pool.imap_unordered(
                ContentHash_worker, inp_data),
                                     total=len(inp_data),
                                     desc="Unique-fy samples database"):
            if sha not in visited:
                visited.add(sha)
                data.append(sample)
    except Exception as e:
        l.logger().error(e)
        pool.terminate()
        raise e
    pool.close()
    with out_db.Session() as s:
        idx = 0
        for dp in tqdm.tqdm(data, total=len(data), desc="Adding to DB"):
            new_dp = get_sample(dp)
            new_dp.id = idx
            idx += 1
            s.add(new_dp)
        s.commit()
    return
Example #20
  def save_pretrained(self, save_directory):
    """
    Save a model and its configuration file to a directory, so that it can be
    re-loaded using the :func:`~transformers.PreTrainedModel.from_pretrained` class method.

    Arguments:
      save_directory (:obj:`str`):
        Directory to which to save. Will be created if it doesn't exist.
    """
    if os.path.isfile(save_directory):
      l.logger().error("Provided path ({}) should be a directory, not a file".format(save_directory))
      return
    os.makedirs(save_directory, exist_ok=True)

    # Only save the model itself if we are using distributed training
    model_to_save = self.module if hasattr(self, "module") else self

    # Attach architecture to the config
    model_to_save.config.architectures = [model_to_save.__class__.__name__]

    # If we save using the predefined names, we can load using `from_pretrained`
    output_model_file = os.path.join(save_directory, WEIGHTS_NAME)

    if getattr(self.config, "xla_device", False):

      if pytorch.xla_model.is_master_ordinal():
        # Save configuration file
        model_to_save.config.save_pretrained(save_directory)
      # pytorch.xla_model.save takes care of saving only from master
      pytorch.xla_model.save(model_to_save.state_dict(), output_model_file)
    else:
      model_to_save.config.save_pretrained(save_directory)
      torch.save(model_to_save.state_dict(), output_model_file)

    l.logger().info("Model weights saved in {}".format(output_model_file))
Example #21
def to_unique_samples(db: EncodedContentFiles, out_db: EncodedContentFiles, tokenizer) -> None:
  """
  Read input database, pass through deterministic re-writer and keep only unique samples.
  """
  pool     = multiprocessing.Pool()
  visited  = set()
  data     = []
  f = functools.partial(ContentHash_worker, tokenizer = tokenizer)

  with db.Session() as s:
    inp_data = [x for x in s.query(EncodedContentFile).all()]

  try:
    for sha, cfile in tqdm.tqdm(pool.imap_unordered(f, inp_data), total = len(inp_data), desc = "Unique-fy encoded database"):
      if sha not in visited:
        visited.add(sha)
        data.append(cfile)
  except Exception as e:
    l.logger().error(e)
    pool.terminate()
    raise e
  pool.close()
  with out_db.Session() as s:
    idx = 0
    for dp in tqdm.tqdm(data, total = len(data), desc = "Adding to DB"):
      new_dp = EncodedContentFile.FromEncodedContentFile(dp, idx = idx)
      idx += 1
      s.add(new_dp)
    s.commit()
  return
Example #22
def GetDatabase() -> DashboardDatabase:
    db: DashboardDatabase = DashboardDatabase(
        url="sqlite:///{}/dashboard.db".format(
            os.path.abspath(FLAGS.workspace_dir)),
        must_exist=False)
    l.logger().info("Created dashboard database {}".format(db.url))
    return db
Example #23
 def InitSampling(self,
                  sampler : samplers.Sampler,
                  seed    : typing.Optional[int] = None,
                  corpus = None,
                  ) -> None:
   """This is called only once. Performs basic initialization of sampling"""
   sample_batch_size = sampler.batch_size
   data_generator = torchLMDataGenerator.SampleMaskLMBatchGenerator(
                      self.config.training, sampler, self.tokenizer, seed, sample_batch_size,
                      self.config.architecture.max_position_embeddings, self.cache.path, corpus,
                      self.feature_encoder,
                      self.feature_tokenizer,
                      self.feature_sequence_length,
                    )
   self._ConfigSampleParams(data_generator, sampler)
   ckpt_step = self.loadCheckpoint(self.sample)
   if self.pytorch.num_gpus > 0:
     self.torch.cuda.empty_cache()
   if ckpt_step >= 0:
     l.logger().info("Loaded checkpoint step {}".format(ckpt_step))
   self.step_inputs   = None
   self.loader        = None
   self.pred_iterator = None
   l.logger().info("Initialized model samples in {}".format(self.sample_path / self.sampler.hash))
   return
Example #24
 def IsDone(self, session: sqlutil.Session):
   if session.query(Meta).filter(Meta.key == "done").first():
     return True
   elif FLAGS.override_preprocessing:
     l.logger().warn("Overriding incomplete pre-processed DB.")
     return True
   else:
     return False
Example #25
 def __init__(self, path: pathlib.Path, name: str, extension: str):
     super(zipStorage, self).__init__(path, name, extension)
     self.cached_content = []
     self.flush_counter = 20000
     self.file_count = 0
     self.repos = self.loadRepos
     self.data_file = ""
     l.logger().info("Set up ZIP storage in {}".format(self.cache_path))
Example #26
    def _ConfigTrainParams(self, data_generator: tfLMDataGenerator) -> None:
        """
    Model parameter initialization for training and validation.
    """
        if self.bert_config is None:
            self._ConfigModelParams()

        self.train_batch_size = self.config.training.batch_size
        self.eval_batch_size = self.config.training.batch_size
        self.learning_rate = self.config.training.adam_optimizer.initial_learning_rate_micros / 1e6
        self.num_warmup_steps = self.config.training.num_warmup_steps

        self.steps_per_epoch = data_generator.steps_per_epoch
        self.num_epochs = data_generator.num_epochs
        self.num_train_steps = self.steps_per_epoch * self.num_epochs
        self.max_eval_steps = FLAGS.max_eval_steps

        self.validation_results_file = "val_results.txt"
        self.validation_results_path = os.path.join(
            str(self.logfile_path), self.validation_results_file)

        tpu_cluster_resolver = None
        if FLAGS.use_tpu and FLAGS.tpu_name:
            tpu_cluster_resolver = self.tf.distribute.cluster_resolver.TPUClusterResolver(
                FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

        train_distribute = self.tf.distribute.MirroredStrategy(
            num_gpus=gpu.numGPUs()
        ) if FLAGS.use_tpu and FLAGS.mirror_gpus else None

        is_per_host = self.tf.compat.v1.estimator.tpu.InputPipelineConfig.PER_HOST_V2
        run_config = self.tf.compat.v1.estimator.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            master=FLAGS.master,
            model_dir=str(self.ckpt_path),
            save_checkpoints_steps=self.steps_per_epoch,
            save_summary_steps=self.steps_per_epoch,
            keep_checkpoint_max=0,
            log_step_count_steps=self.steps_per_epoch,
            train_distribute=train_distribute,
            tpu_config=self.tf.compat.v1.estimator.tpu.TPUConfig(
                iterations_per_loop=self.steps_per_epoch,
                num_shards=FLAGS.num_tpu_cores,
                per_host_input_for_training=is_per_host))
        model_fn = self._model_fn_builder(bert_config=self.bert_config)
        # If TPU is not available, this will fall back to normal Estimator on CPU
        # or GPU.
        self.train = tfBert.BertEstimator(
            self.tf.compat.v1.estimator.tpu.TPUEstimator(
                use_tpu=FLAGS.use_tpu,
                model_fn=model_fn,
                config=run_config,
                params=None,
                train_batch_size=self.train_batch_size,
                eval_batch_size=self.eval_batch_size,
            ), data_generator)
        l.logger().info(self.GetShortSummary())
        return
Example #27
def ContentHash_worker(sample: Sample) -> typing.Optional[typing.Tuple[str, Sample]]:
    """
  Return new sample along with content hash of code.
  """
    try:
        return opencl.ContentHash(sample.text), sample
    except Exception as e:
        l.logger().warn(e)
        return None
Example #28
  def __init__(self, *args, **kwargs):

    super(torchBert, self).__init__(*args, **kwargs)
    
    from deeplearning.clgen.util import pytorch
    if not pytorch.initialized:
      pytorch.initPytorch()

    if self.config.architecture.HasField("feature_encoder") and self.config.architecture.feature_encoder:
      self.feature_encoder   = True
      self.feature_tokenizer = tokenizers.FeatureTokenizer.FromArgs(
        self.config.architecture.feature_singular_token_thr,
        self.config.architecture.feature_max_value_token,
        self.config.architecture.feature_token_range
      )
      self.feature_sequence_length = self.config.architecture.feature_sequence_length
    else:
      self.feature_encoder         = False
      self.feature_tokenizer       = None
      self.feature_sequence_length = None

    self.pytorch             = pytorch
    self.torch               = pytorch.torch
    self.torch_tpu_available = pytorch.torch_tpu_available

    self.torch.manual_seed(self.config.training.random_seed)
    self.torch.cuda.manual_seed_all(self.config.training.random_seed)

    self.bertAttrs         = {}
    self.featureAttrs      = {}
    self.bert_config       = None

    self.train             = None
    self.sample            = None
    self.predict_generator = None
    self.sampler           = None

    self.train_batch_size  = None
    self.eval_batch_size   = None
    self.learning_rate     = None
    self.num_train_steps   = None

    self.ckpt_path         = self.cache.path / "checkpoints"
    self.sample_path       = self.cache.path / "samples"

    self.logfile_path      = self.cache.path / "logs"
    if self.config.HasField("pre_train_corpus"):
      self.pre_logfile_path = self.logfile_path / "pre_train"

    self.telemetry         = telemetry.TrainingLogger(self.logfile_path)
    if self.config.HasField("pre_train_corpus"):
      self.pre_telemetry   = telemetry.TrainingLogger(self.logfile_path / "pre_train")

    self.is_validated      = False
    self.trained           = False
    l.logger().info("BERT Model config initialized in {}".format(self.cache.path))
    return
Example #29
  def after_run(self, run_context, run_values):
    """
      Requested tensors are evaluated and their values are available
    """
    super(writeValidationDB, self).after_run(run_context, run_values)

    batch_size = run_values.results[self.input_ids].shape[0]

    masked_lm_predictions = np.reshape(
      run_values.results[self.masked_lm_predictions],
      (batch_size, int(len(run_values.results[self.masked_lm_predictions]) / batch_size))
    )
    next_sentence_predictions = np.reshape(
      run_values.results[self.next_sentence_predictions],
      (batch_size, int(len(run_values.results[self.next_sentence_predictions]) / batch_size))
    )

    assert run_values.results[self.original_input].shape[0]       == batch_size
    assert run_values.results[self.input_ids].shape[0]            == batch_size
    assert run_values.results[self.input_mask].shape[0]           == batch_size
    assert run_values.results[self.masked_lm_positions].shape[0]  == batch_size
    assert run_values.results[self.masked_lm_ids].shape[0]        == batch_size
    assert run_values.results[self.masked_lm_weights].shape[0]    == batch_size
    assert run_values.results[self.masked_lm_lengths].shape[0]    == batch_size
    assert run_values.results[self.next_sentence_labels].shape[0] == batch_size
    assert masked_lm_predictions.shape[0]                         == batch_size
    assert next_sentence_predictions.shape[0]                     == batch_size

    with self.val_db.Session(commit = True) as session:
      for b in range(batch_size):
        val_trace = validation_database.BERTValFile(
          **validation_database.BERTValFile.FromArgs(
            tokenizer                 = self.tokenizer,
            id                        = self.val_id,
            train_step                = run_values.results[self.global_step],
            seen_in_training          = run_values.results[self.seen_in_training][b],
            original_input            = run_values.results[self.original_input][b],
            input_ids                 = run_values.results[self.input_ids][b],
            input_mask                = run_values.results[self.input_mask][b],
            masked_lm_positions       = run_values.results[self.masked_lm_positions][b],
            masked_lm_ids             = run_values.results[self.masked_lm_ids][b],
            masked_lm_weights         = run_values.results[self.masked_lm_weights][b],
            masked_lm_lengths         = run_values.results[self.masked_lm_lengths][b],
            next_sentence_labels      = run_values.results[self.next_sentence_labels][b],
            masked_lm_predictions     = masked_lm_predictions[b],
            next_sentence_predictions = next_sentence_predictions[b],
          )
        )
        try:
          exists = session.query(validation_database.BERTValFile.sha256).filter_by(sha256 = val_trace.sha256).scalar() is not None
        except sqlalchemy.orm.exc.MultipleResultsFound as e:
          l.logger().error("Selected sha256 has been already found more than once.")
          raise e
        if not exists:
          session.add(val_trace)
          self.val_id += 1
    return
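The np.reshape calls above fold a flat per-batch prediction vector back into (batch, positions-per-sequence); in isolation:

import numpy as np

batch_size = 4
flat = np.arange(4 * 8)  # e.g. 8 masked positions per sequence, flattened
per_sequence = np.reshape(flat, (batch_size, len(flat) // batch_size))
assert per_sequence.shape == (4, 8)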
Example #30
def ContentHash_worker(contentfile: EncodedContentFile, tokenizer) -> typing.Optional[typing.Tuple[str, EncodedContentFile]]:
  """
  Return new contentfile along with content hash of code.
  """
  try:
    return opencl.ContentHash(tokenizer.ArrayToCode(contentfile.indices_array, with_formatting = False)), contentfile
  except Exception as e:
    l.logger().warn(e)
    return None