Example No. 1
    def validate_train_opts(cls, opt):
        # Reject deprecated or mutually incompatible training options up front.
        if opt.epochs:
            raise AssertionError(
                "-epochs is deprecated, please use -train_steps.")
        if opt.truncated_decoder > 0 and opt.accum_count > 1:
            raise AssertionError("BPTT is not compatible with -accum > 1")
        if opt.gpuid:
            raise AssertionError(
                "gpuid is deprecated, see world_size and gpu_ranks")
        if torch.cuda.is_available() and not opt.gpu_ranks:
            logger.info("WARNING: You have a CUDA device, "
                        "should run with -gpu_ranks")
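For context, a hedged sketch of the options object this validator expects. The field names come from the checks above; the concrete values, and the use of an argparse Namespace as a stand-in for the parsed options, are illustrative only.

from argparse import Namespace

opt = Namespace(
    epochs=0,             # must be falsy: -epochs is deprecated in favour of -train_steps
    truncated_decoder=0,  # BPTT disabled, so accum_count > 1 is allowed
    accum_count=2,
    gpuid=None,           # deprecated: use world_size / gpu_ranks instead
    gpu_ranks=[0],
)
# With these values every check above passes; setting e.g. epochs=10 would
# trigger the first AssertionError.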
Example No. 2
    def segment_and_write(self):
        def _segment_and_write(sents, filename):
            # Encode each sentence into subword pieces and write one
            # space-joined, segmented sentence per line.
            with open(filename, "w") as f:
                for sent in sents:
                    pieces = self.sentpiece_processor.EncodeAsPieces(sent)
                    f.write(" ".join(pieces) + "\n")

        output_dir = self.data_output_fmt.format("segmented")
        os.makedirs(output_dir, exist_ok=True)
        for mode, files in self.files.items():
            for lang, data in files.items():
                file_path = f"{mode}.{lang}.bpe"
                logger.info(f"Writing segmented file to {file_path}.")
                _segment_and_write(data, os.path.join(output_dir, file_path))
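A hedged sketch of what the self.sentpiece_processor call above does, written against the sentencepiece package directly; the model path and the example pieces are placeholders and depend on the trained subword model.

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load("subword.model")                      # placeholder path to a trained model
pieces = sp.EncodeAsPieces("Good morning .")  # e.g. ['▁Good', '▁morning', '▁.'] (model-dependent)
print(" ".join(pieces))                       # the space-joined form written per line above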
Example No. 3
import re
import logging
from pathlib import Path

from tqdm import tqdm

logger = logging.getLogger(__name__)  # stands in for the module-level logger


def read_file(filename, mode):
    """Read sentences from `filename`, filtering markup according to `mode`."""
    logger.info(f"Reading file from {filename}.")
    lines = Path(filename).read_text().splitlines()
    if mode == "train":
        # Keep plain-text lines only; drop markup lines (those starting with "<").
        texts = [line for line in tqdm(lines) if not line.startswith("<")]
    else:
        # Keep only "<seq ...>" lines and strip the surrounding tags.
        texts = [
            re.sub("<[^>]+>", "", line)
            for line in tqdm(lines)
            if line.startswith("<seq")
        ]
    return texts
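In the non-"train" branch, the regular expression strips any markup tags and keeps only the enclosed text. A minimal, self-contained illustration (the sample line is made up, and matches the "<seq" filter above):

import re

line = '<seq id="1">Guten Tag !</seq>'
print(re.sub("<[^>]+>", "", line))   # -> Guten Tag !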
Example No. 4
    def __init__(
        self,
        num_layers,
        num_heads,
        d_model,
        d_ff,
        encoder_vocab_size,
        decoder_vocab_size,
        dropout_rate=0.1,
    ):
        super().__init__(name="transformer")
        logger.info("Building Encoder")
        self.encoder = TransformerEncoder(num_layers, num_heads, d_model, d_ff,
                                          encoder_vocab_size, dropout_rate)
        logger.info("Building Decoder")
        self.decoder = TransformerDecoder(num_layers, num_heads, d_model, d_ff,
                                          decoder_vocab_size, dropout_rate)
        # Final linear projection from d_model to decoder-vocabulary logits.
        self.final_layer = tf.keras.layers.Dense(decoder_vocab_size)
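The final layer projects decoder outputs of width d_model onto one logit per target-vocabulary entry. A small sketch of that projection in isolation; the vocabulary size, d_model and tensor shapes are illustrative.

import tensorflow as tf

final_layer = tf.keras.layers.Dense(32000)   # decoder_vocab_size = 32000 (illustrative)
decoder_output = tf.zeros((2, 10, 512))      # (batch, target_len, d_model)
logits = final_layer(decoder_output)         # shape (2, 10, 32000)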
Example No. 5
    def write_preprocessed_file(self):
        def _write_file(sents, filename):
            with open(filename, "w") as f:
                f.write("\n".join(sents))

        output_dir = self.data_output_fmt.format("preprocessed")
        os.makedirs(output_dir, exist_ok=True)
        for mode, files in self.files.items():
            for lang, data in files.items():
                file_path = f"{mode}.{lang}"
                logger.info(f"Writing preprocessed file to {file_path}.")
                _write_file(data, os.path.join(output_dir, file_path))

        # Concatenated English + German sentences (one sentence per line), plus a
        # tab-separated parallel file pairing each English sentence with its
        # German counterpart.
        en_train = self.files["train"]["en"]
        de_train = self.files["train"]["de"]
        all_train_data_bpe = en_train + de_train
        all_train_data = [(en_sent + "\t" + de_sent)
                          for en_sent, de_sent in zip(en_train, de_train)]

        _write_file(all_train_data_bpe, os.path.join(output_dir, "train.bpe"))
        _write_file(all_train_data, os.path.join(output_dir, "train"))
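The tab-joined list pairs each English sentence with its German counterpart, one pair per line. A self-contained illustration with made-up sentences:

en_train = ["Hello .", "How are you ?"]
de_train = ["Hallo .", "Wie geht es dir ?"]
all_train_data = [(en_sent + "\t" + de_sent)
                  for en_sent, de_sent in zip(en_train, de_train)]
# -> ['Hello .\tHallo .', 'How are you ?\tWie geht es dir ?']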
Example No. 6
    def output(self, step, num_steps, learning_rate, start):
        """Write out statistics to stdout.

        Args:
           step (int): current step
           num_steps (int): total number of steps
           learning_rate (float): current learning rate
           start (float): start time of the step
        """
        t = self.elapsed_time()
        step_fmt = "%2d" % step
        if num_steps > 0:
            step_fmt = "%s/%5d" % (step_fmt, num_steps)
        logger.info(
            ("Step %s; acc: %6.2f; ppl: %5.2f; xent: %4.2f; " +
             "lr: %7.5f; %3.0f/%3.0f tok/s; %6.0f sec")
            % (step_fmt,
               self.accuracy(),
               self.ppl(),
               self.xent(),
               learning_rate,
               self.n_src_words / (t + 1e-5),  # source tok/s; epsilon avoids division by zero
               self.n_words / (t + 1e-5),      # target tok/s
               time.time() - start))
        sys.stdout.flush()
Example No. 7
    def log(self, *args, **kwargs):
        # Thin wrapper that forwards log calls to the module-level logger.
        logger.info(*args, **kwargs)