Example #1
def test_TeeLogsToFile_contextmanager(capsys):
    """Test that contextmanager temporarily also logs to file."""
    with tempfile.TemporaryDirectory() as d:
        FLAGS.logtostderr = True
        logging.info('This is not going in a file')
        with logutil.TeeLogsToFile('test', d):
            logging.info('Hello, file!')
        logging.info('This is not going in a file')
        # Test file contents.
        with open(pathlib.Path(d) / 'test.INFO') as f:
            lines = f.read().rstrip().split('\n')
            assert len(lines) == 1
            assert lines[0].endswith('Hello, file!')
        out, err = capsys.readouterr()
        assert not out
        # Test stderr contents.
        lines = err.rstrip().split('\n')
        assert len(lines) == 3
        assert lines[0].endswith('This is not going in a file')
        assert lines[1].endswith('Hello, file!')
        assert lines[2].endswith('This is not going in a file')
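Example #1 above exercises the whole contract of TeeLogsToFile as a context manager: inside the with block, log records still go to stderr and are additionally appended to <prefix>.INFO in the given directory; outside the block, only stderr logging remains. A minimal usage sketch, assuming the same logging and logutil imports as the test (the prefix and directory names below are illustrative, not from the source):

import tempfile

with tempfile.TemporaryDirectory() as log_dir:
    logging.info('This goes to stderr only.')
    with logutil.TeeLogsToFile('myapp', log_dir):
        # Also appended to <log_dir>/myapp.INFO.
        logging.info('This goes to stderr and to the file.')
    logging.info('Back to stderr only.')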
Example #2
    def Sample(self,
               sampler: samplers.Sampler,
               min_num_samples: int,
               seed: int = None) -> typing.List[model_pb2.Sample]:
        """Sample a model.

    If the model is not already trained, calling Sample() first trains the
    model. Thus a call to Sample() is equivalent to calling Train() then
    Sample().

    Args:
      sampler: The sampler to sample using.
      min_num_samples: The minimum number of samples to return. Note that the
        true number of samples returned may be higher than this value, as
        sampling occurs in batches. The model will continue producing samples
        until the lowest multiple of the sampler batch size property that is
        greater than or equal to this value. E.g. if min_num_samples is 7 and
        the Sampler batch size is 10, 10 samples will be returned.
      seed: A numeric value to seed the RNG with. If not present, the RNG is
        seeded randomly.

    Returns:
      A list of Sample protos.

    Raises:
      UnableToAcquireLockError: If the model is locked (i.e. there is another
        process currently modifying the model).
      InvalidStartText: If the sampler start text cannot be encoded.
      InvalidSymtokTokens: If the sampler symmetrical depth tokens cannot be
        encoded.
    """
        self.Train()
        sample_count = 1  # For logging purposes only.
        self.SamplerCache(sampler).mkdir(exist_ok=True)
        with logutil.TeeLogsToFile(f'sampler_{sampler.hash}',
                                   self.cache.path / 'logs'):
            logging.info("Sampling: '%s'", sampler.start_text)
            if min_num_samples < 0:
                logging.warning(
                    'Entering an infinite sample loop, this process will never end!'
                )
            sample_start_time = labdate.MillisecondsTimestamp()

            atomizer = self.corpus.atomizer
            sampler.Specialize(atomizer)
            batch_size = self.backend.InitSampling(sampler, seed)
            print("Sampling Batch Size :" + str(batch_size))
            samples = []
            sample_dir = self.SamplerCache(sampler)

            # Per-sample batch outer loop. Continues until we have as many samples
            # as we want.
            while True:
                samples_in_progress = [
                    sampler.tokenized_start_text.copy()
                    for _ in range(batch_size)
                ]
                done = np.zeros(batch_size, dtype=bool)
                start_time = labdate.MillisecondsTimestamp()
                wall_time_start = start_time

                self.backend.InitSampleBatch(sampler, batch_size)

                # Sampling loop. Continues until all samples in the batch are done.
                while True:
                    indices = self.backend.SampleNextIndices(
                        sampler, batch_size)
                    # Iterate over all samples in batch to determine whether they're
                    # done.
                    for i in range(batch_size):
                        if done[i]:
                            continue

                        token = atomizer.decoder[indices[i]]
                        samples_in_progress[i].append(token)
                        if sampler.SampleIsComplete(samples_in_progress[i]):
                            end_time = labdate.MillisecondsTimestamp()
                            done[i] = 1
                            sample = model_pb2.Sample(
                                text=''.join(samples_in_progress[i]),
                                sample_start_epoch_ms_utc=start_time,
                                sample_time_ms=end_time - start_time,
                                wall_time_ms=end_time - wall_time_start,
                                num_tokens=len(samples_in_progress[i]))
                            print(f'=== BEGIN CLGEN SAMPLE {sample_count} '
                                  f'===\n\n{sample.text}\n')
                            # Write the sample text to the sampler cache
                            # (previously written as a .txt / Sample proto
                            # via pbutil.ToFile()).
                            sample_path = sample_dir / f'Sample{sample_count}.mdl'
                            with open(sample_path, 'w') as samplefile:
                                samplefile.write(sample.text)
                            sample_count += 1
                            if min_num_samples > 0:
                                samples.append(sample)
                            wall_time_start = labdate.MillisecondsTimestamp()

                    # Complete the batch.
                    if done.all():
                        break

                # Complete sampling. Note that sample_count starts at 1, and a
                # negative min_num_samples means an infinite sampling loop.
                if 0 <= min_num_samples < sample_count:
                    now = labdate.MillisecondsTimestamp()
                    logging.info(
                        'Produced %s samples at a rate of %s ms / sample.',
                        humanize.intcomma(len(samples)),
                        humanize.intcomma(
                            int((now - sample_start_time) /
                                max(len(samples), 1))))
                    break

        return samples
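The min_num_samples rule in the docstring ("the lowest multiple of the sampler batch size ...") follows from the loop structure above: samples are produced one full batch at a time and the stopping check only runs after a batch completes. A small standalone sketch of that arithmetic, not part of the class (the helper name expected_sample_count is hypothetical):

def expected_sample_count(min_num_samples: int, batch_size: int) -> int:
    """Round min_num_samples up to the next whole multiple of batch_size."""
    if min_num_samples < 0:
        raise ValueError('Infinite sampling does not return a sample list.')
    # Ceiling division, then scale back up to whole batches.
    return -(-min_num_samples // batch_size) * batch_size

# The docstring's example: 7 samples requested, batch size 10 -> 10 returned.
assert expected_sample_count(7, 10) == 10
assert expected_sample_count(20, 10) == 20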
Example #3
    def SampleFast(self,
                   sampler: samplers.Sampler,
                   min_num_samples: int,
                   seed: int = None) -> typing.List[model_pb2.Sample]:
        """Sample a model.

    Same as Sample(), but without printing or caching samples. Because samples
    are not cached, infinite sampling loops are not supported, since we must
    return the sample protos at some point.

    Args:
      sampler: The sampler to sample using.
      min_num_samples: The minimum number of samples to return. Note that the
        true number of samples returned may be higher than this value, as
        sampling occurs in batches. The model will continue producing samples
        until the lowest multiple of the sampler batch size property that is
        greater than or equal to this value. E.g. if min_num_samples is 7 and
        the Sampler batch size is 10, 10 samples will be returned.
      seed: A numeric value to seed the RNG with. If not present, the RNG is
        seeded randomly.

    Returns:
      A list of Sample protos.

    Raises:
      UnableToAcquireLockError: If the model is locked (i.e. there is another
        process currently modifying the model).
      InvalidStartText: If the sampler start text cannot be encoded.
      InvalidSymtokTokens: If the sampler symmetrical depth tokens cannot be
        encoded.
    """
        self.Train()

        sample_count = 1
        with logutil.TeeLogsToFile(f'sampler_{sampler.hash}',
                                   self.cache.path / 'logs'):
            logging.info("Sampling: '%s'", sampler.start_text)
            sample_start_time = labdate.MillisecondsTimestamp()
            atomizer = self.corpus.atomizer
            sampler.Specialize(atomizer)
            batch_size = self.backend.InitSampling(sampler, seed)
            samples = []

            # Per-sample batch outer loop. Continues until we have as many samples
            # as we want.
            while True:
                samples_in_progress = [
                    sampler.tokenized_start_text.copy()
                    for _ in range(batch_size)
                ]
                done = np.zeros(batch_size, dtype=bool)
                start_time = labdate.MillisecondsTimestamp()
                wall_time_start = start_time

                self.backend.InitSampleBatch(sampler, batch_size)

                # Sampling loop. Continues until all samples in the batch are done.
                while True:
                    indices = self.backend.SampleNextIndices(
                        sampler, batch_size)
                    print("Done :" + str(done))
                    # Iterate over all samples in batch to determine whether they're
                    # done.
                    for i in range(batch_size):
                        if done[i]:
                            continue

                        token = atomizer.decoder[indices[i]]
                        samples_in_progress[i].append(token)
                        if sampler.SampleIsComplete(samples_in_progress[i]):
                            end_time = labdate.MillisecondsTimestamp()
                            done[i] = 1
                            sample = model_pb2.Sample(
                                text=''.join(samples_in_progress[i]),
                                sample_start_epoch_ms_utc=start_time,
                                sample_time_ms=end_time - start_time,
                                wall_time_ms=end_time - wall_time_start,
                                num_tokens=len(samples_in_progress[i]))
                            sample_count += 1
                            samples.append(sample)
                            wall_time_start = labdate.MillisecondsTimestamp()

                    # Complete the batch.
                    if done.all():
                        break

                # Complete sampling. Note that sample_count starts at 1.
                if sample_count > min_num_samples:
                    now = labdate.MillisecondsTimestamp()
                    logging.info(
                        'Produced %s samples at a rate of %s ms / sample.',
                        humanize.intcomma(len(samples)),
                        humanize.intcomma(
                            int((now - sample_start_time) /
                                max(len(samples), 1))))
                    break

        return samples
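Sample() and SampleFast() share a signature; the difference is that Sample() echoes and writes every sample to the sampler cache, while SampleFast() only accumulates the Sample protos in memory. Hypothetical call sites, assuming a model and sampler constructed elsewhere (the variable names are illustrative):

# Cached to disk and echoed while sampling:
samples = model.Sample(sampler, min_num_samples=10, seed=0)

# In-memory only; infinite sampling (negative min_num_samples) is not
# supported here because the protos must eventually be returned:
samples = model.SampleFast(sampler, min_num_samples=10, seed=0)

for s in samples:
    print(s.num_tokens, s.sample_time_ms)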
Example #4
    def Train(self, corpus) -> 'keras.models.Sequential':
        """Locked training.

    If there are cached epoch checkpoints, the one closest to the target number
    of epochs will be loaded, and the model will be trained for only the
    remaining number of epochs, if any. This means that calling this function
    twice will only actually train the model the first time, and all subsequent
    calls will be no-ops.

    This method must only be called when the model is locked.

    Args:
      corpus: The corpus to train on.

    Returns:
      The trained Keras model.
    """
        model = builders.BuildKerasModel(self.config, self.atomizer.vocab_size)
        with open(self.cache.keypath('model.yaml'), 'w') as f:
            f.write(model.to_yaml())
        model.compile(loss='categorical_crossentropy',
                      optimizer=builders.BuildOptimizer(self.config))

        # Print a model summary.
        buf = io.StringIO()
        model.summary(print_fn=lambda x: buf.write(x + '\n'))
        logging.info('Model summary:\n%s', buf.getvalue())

        # TODO(cec): Add an atomizer.CreateVocabularyFile() method, with frequency
        # counts for a given corpus.
        def Escape(token: str) -> str:
            """Make a token visible and printable."""
            if token == '\t':
                return '\\t'
            elif token == '\n':
                return '\\n'
            elif not token.strip():
                return f"'{token}'"
            else:
                return token

        if not (self.cache.path / 'embeddings' / 'metadata.tsv').is_file():
            with open(self.cache.path / 'embeddings' / 'metadata.tsv',
                      'w') as f:
                for _, token in sorted(self.atomizer.decoder.items(),
                                       key=lambda x: x[0]):
                    f.write(Escape(token) + '\n')

        target_num_epochs = self.config.training.num_epochs
        starting_epoch = 0

        epoch_checkpoints = self.epoch_checkpoints
        if len(epoch_checkpoints) >= target_num_epochs:
            # We have already trained a model to at least this number of epochs,
            # so simply load the weights from that epoch and call it a day.
            logging.info('Loading weights from %s',
                         epoch_checkpoints[target_num_epochs - 1])
            model.load_weights(epoch_checkpoints[target_num_epochs - 1])
            return model

        # Now entering the point at which training is inevitable.
        with logutil.TeeLogsToFile('train', self.cache.path / 'logs'):
            # Deferred importing of Keras so that we don't have to activate the
            # TensorFlow backend every time we import this module.
            import keras

            if epoch_checkpoints:
                # We have already trained a model at least part of the way to our target
                # number of epochs, so load the most recent one.
                starting_epoch = len(epoch_checkpoints)
                logging.info('Resuming training from epoch %d.',
                             starting_epoch)
                model.load_weights(epoch_checkpoints[-1])

            callbacks = [
                keras.callbacks.ModelCheckpoint(str(
                    self.cache.path / 'checkpoints' / '{epoch:03d}.hdf5'),
                                                verbose=1,
                                                mode="min",
                                                save_best_only=False),
                keras.callbacks.TensorBoard(
                    str(self.cache.path / 'embeddings'),
                    write_graph=True,
                    embeddings_freq=1,
                    embeddings_metadata={
                        'embedding_1':
                        str(self.cache.path / 'embeddings' / 'metadata.tsv'),
                    }),
                telemetry.TrainingLogger(self.cache.path /
                                         'logs').KerasCallback(keras),
            ]

            generator = data_generators.AutoGenerator(corpus,
                                                      self.config.training)
            steps_per_epoch = (corpus.encoded.token_count -
                               1) // (self.config.training.batch_size *
                                      self.config.training.sequence_length)
            logging.info(
                'Step counts: %s per epoch, %s left to do, %s total',
                humanize.intcomma(steps_per_epoch),
                humanize.intcomma(
                    (target_num_epochs - starting_epoch) * steps_per_epoch),
                humanize.intcomma(target_num_epochs * steps_per_epoch))
            model.fit_generator(generator,
                                steps_per_epoch=steps_per_epoch,
                                callbacks=callbacks,
                                initial_epoch=starting_epoch,
                                epochs=target_num_epochs)
        return model
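The steps_per_epoch expression near the end of Train() is, roughly, the number of non-overlapping batch_size x sequence_length training windows that fit in the encoded corpus; the "- 1" presumably accounts for the one-token shift between inputs and next-token targets. Worked through with made-up numbers (the corpus size below is illustrative, not taken from the source):

token_count = 1_000_001  # hypothetical value of corpus.encoded.token_count
batch_size = 64
sequence_length = 50

steps_per_epoch = (token_count - 1) // (batch_size * sequence_length)
assert steps_per_epoch == 312  # 1,000,000 // 3,200 == 312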