Example #1
import json

# `Translator` is provided by the surrounding project; its import is not part of this snippet.


def main2(input_file, output_file):
    """Translate every word in the input file and keep only the first translation."""
    with open(input_file) as f_in:
        data = json.load(f_in)
    tmp = []
    tr = Translator()
    for elem in data['data']:
        trans = tr.translate(elem)
        if not trans:  # word could not be translated
            continue
        key = list(trans.keys())[0]  # first translation group
        val = trans[key][0]          # first candidate within that group
        tmp.append({'word': elem, 'translations': {key: val}})
    data['data'] = tmp
    with open(output_file, 'w+') as f_out:
        f_out.write(json.dumps(data))
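
A minimal sketch of the file shapes main2() works with, assuming Translator.translate(word) returns a dict of translation groups such as {'noun': [...]}; the sample words, file names, and the commented output are illustrative only.

import json

# Hypothetical input: a flat list of words to translate.
sample_input = {'data': ['cat', 'dog']}
with open('words.json', 'w') as f:
    json.dump(sample_input, f)

# main2('words.json', 'translated.json') would then rewrite 'data' as entries like
# {'word': 'cat', 'translations': {'noun': 'first-candidate'}}.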
Example #2
    def post(self):
        """Handle a translation request whose JSON body is {"word": "<english word>"}."""
        try:
            data = json.loads(self.request.body)
        except json.JSONDecodeError:
            logging.warning(f"get incorrect body: {self.request.body}")
            self.write(json.dumps({'error': 'incorrect-format'}))
            return
        if 'word' not in data:
            logging.warning(f"get incorrect body: {self.request.body}")
            self.write(json.dumps({'error': 'incorrect-format'}))
            return
        word = data['word'].lower().strip()
        if not check_english_word(word):
            logging.warning(f"get incorrect word: {word}")
            self.write(json.dumps({'error': 'incorrect-format'}))
            return

        try:
            translation = Translator().translate(word)
        except RuntimeError:
            logging.error("translation error")
            self.write(json.dumps({'error': 'translation-error'}))
            return

        logging.debug(f"got translation {translation} for word {data['word']}")

        self.write(
            json.dumps({
                'result': 'ok',
                'data': {
                    'translations': translation
                }
            }))
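
Example #2 reads like a Tornado-style request handler. A minimal client-side sketch, assuming the handler is mounted at /translate on localhost:8888 (route and port are assumptions, not taken from the snippet):

import json
import urllib.request

request = urllib.request.Request(
    'http://localhost:8888/translate',   # assumed route and port
    data=json.dumps({'word': 'example'}).encode('utf-8'),
    headers={'Content-Type': 'application/json'},
)
with urllib.request.urlopen(request) as response:
    print(json.load(response))   # {'result': 'ok', 'data': {'translations': {...}}} on success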
Example #3
import json

# `Translator` is provided by the surrounding project; its import is not part of this snippet.


def main4(input_file, output_file):
    """Drop multi-word entries, lower-case the rest, and keep a single translation."""
    with open(input_file) as f_in:
        data = json.load(f_in)
    tmp = []
    tr = Translator()  # instantiated but not used in this variant
    for elem in data['data']:
        if ' ' in elem['word']:  # skip multi-word entries
            continue
        key = list(elem['translations'].keys())[0]   # first translation group
        val = elem['translations'][key][0]           # first candidate within that group
        tmp.append({
            'word': elem['word'].lower(),
            'translations': {
                key: [val]
            }
        })
    data['data'] = tmp
    with open(output_file, 'w+') as f_out:
        f_out.write(json.dumps(data))
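
A minimal sketch of the shapes main4() expects: entries that already carry list-valued translations, of which only single-word entries survive. The sample entries and file names are illustrative only.

import json

sample = {'data': [
    {'word': 'New York', 'translations': {'noun': ['Nueva York']}},   # skipped: contains a space
    {'word': 'Cat', 'translations': {'noun': ['gato', 'gata']}},
]}
with open('in.json', 'w') as f:
    json.dump(sample, f)

# main4('in.json', 'out.json') keeps only the single-word entry, lower-cases it,
# and keeps just the first candidate:
# {'data': [{'word': 'cat', 'translations': {'noun': ['gato']}}]}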
Example #4
import os
import sys

# Pin the process to a single GPU before any CUDA-aware module is imported.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Make the parent and grandparent directories importable before the project imports below.
parentdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
grandfatherdir = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.insert(0, parentdir)
sys.path.insert(0, grandfatherdir)

from utils.query_util import tokenize
from utils.translator import Translator

import dmn.char.dmn_data_utils as dmn_data_utils
from dmn.char.dmn_plus import Config, DMN_PLUS

translator = Translator()

EPOCH = 5


def prepare_data(args, config):
    # train, valid, word_embedding, word2vec, updated_embedding, max_q_len, max_input_len, max_sen_len, \
    #     num_supporting_facts, vocab_size, candidate_size, candid2idx, \
    #     idx2candid, w2idx, idx2w = dmn_data_utils.load_data(
    #         config, split_sentences=True)
    train_data, val_data, test_data, metadata = dmn_data_utils.load_data(
        config, split_sentences=True)
    # metadata = dict()
    data = dict()
    data['train'] = train_data
    data['valid'] = val_data
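
The two environment variables at the top must be set before the CUDA runtime is initialized, which is why they precede the project imports. A minimal standalone sketch of the same pattern; the TensorFlow import is illustrative, any CUDA-aware framework works:

import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # order devices by PCI bus ID
os.environ["CUDA_VISIBLE_DEVICES"] = "0"         # expose only GPU 0 to this process

import tensorflow as tf   # imported only after the variables are set
print(tf.config.list_physical_devices('GPU'))    # should report a single GPU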
Example #5
    def train(self):
        """ Main training method for the Trainer class """

        print("Starting training for {} epoch(s)".format(self.max_num_epochs -
                                                         self.epoch))

        # Without a warmup, boosting can trigger on the very first epoch, before
        # train_epoch() has produced any hard instances, so start with an empty list.
        if not self.params.boost_warmup:
            hard_training_instances = []

        for epoch in range(self.max_num_epochs):
            self.epoch = epoch
            print("Epoch {}/{}".format(epoch + 1, self.max_num_epochs))

            # train the model on the train set
            epoch_start_time = time.time()

            # Start from the original train_iter; if boost==True, it is replaced
            # below by a new iterator over the boosted dataset before train_epoch()
            data_iterator = self.train_iter

            # If boost==True and epochs are past warmup, perform boosting
            if self.params.boost and epoch + 1 > self.params.boost_warmup:
                print("Boosting....")

                # make `Example` objects for all hard training instances
                example_objs = self.create_example_objs(
                    hard_training_instances)

                # Add the new hard training instances to the original training data
                # thereby `boosting` the dataset with hard training examples
                existing_data = self.train_iter.data()
                existing_data.extend(example_objs)

                # Create new Dataset and iterator on the boosted data
                data_iterator = self.create_boosted_dataset(existing_data)

            train_loss_avg, hard_training_instances = self.train_epoch(
                data_iterator)

            # write epoch statistics to Tensorboard
            self.summary_writer.add_scalar("train/avg_loss_per_epoch",
                                           train_loss_avg, self.epoch)
            self.summary_writer.add_scalar("train/avg_perplexity_epoch",
                                           math.exp(train_loss_avg),
                                           self.epoch)

            epoch_end_time = time.time()
            epoch_mins, epoch_secs = self.epoch_time(epoch_start_time,
                                                     epoch_end_time)
            print(
                f'Epoch: {epoch+1:02} | Avg Train Loss: {train_loss_avg} | Perplexity: {math.exp(train_loss_avg)} | Time: {epoch_mins}m {epoch_secs}s'
            )

            # validate the model on the dev set
            val_start_time = time.time()
            val_loss_avg = self.validate()
            val_end_time = time.time()
            val_mins, val_secs = self.epoch_time(val_start_time, val_end_time)

            # write validation statistics to Tensorboard
            self.summary_writer.add_scalar("val/loss", val_loss_avg,
                                           self.epoch)
            self.summary_writer.add_scalar("val/perplexity",
                                           math.exp(val_loss_avg), self.epoch)

            # TODO: write translations to Tensorboard
            # every `decode_every_num_epochs` epochs, write out translations using Greedy Decoding
            # to Tensorboard
            if (self.epoch + 1) % self.decode_every_num_epochs == 0:
                print("Performing Greedy Decoding...")
                num_translations = 5
                dev_iter = copy.copy(self.dev_iter)
                decoder = Translator(
                    model=self.model,
                    dev_iter=list(dev_iter)[:num_translations],
                    params=self.params,
                    device=self.params.device)
                translations = decoder.greedy_decode(max_len=100)
                translations = [
                    " ".join(translation) for translation in translations
                ]
                for translation in translations:
                    self.summary_writer.add_text("transformer/translation",
                                                 translation, self.epoch)

            print(
                f'Avg Val Loss: {val_loss_avg} | Val Perplexity: {math.exp(val_loss_avg)} | Time: {val_mins}m {val_secs}s'
            )
            print('\n')

            # step the LR scheduler so the learning rate decays when validation loss hasn't improved
            if self.scheduler is not None:
                self.scheduler.step(val_loss_avg)

            is_best = val_loss_avg < self.best_val_loss

            optim_dict = self.optimizer._optimizer.state_dict() if isinstance(
                self.optimizer,
                ScheduledOptimizer) else self.optimizer.state_dict()

            # save checkpoint
            self.save_checkpoint(
                {
                    "epoch": epoch + 1,
                    "state_dict": self.model.state_dict(),
                    "optim_dict": optim_dict
                },
                is_best=is_best,
                checkpoint=self.params.model_dir + "/checkpoints/")

            if is_best:
                print("- Found new lowest loss!")
                self.best_val_loss = val_loss_avg
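
A minimal, framework-agnostic sketch of the boosting loop used above: after a warmup period, instances the model found hard in the previous epoch are appended to the next epoch's training data. The names below are illustrative, not the Trainer's actual attributes.

def boosted_epochs(train_data, max_epochs, warmup, train_epoch):
    """Yield (epoch, avg_loss), growing the data with hard instances after warmup."""
    hard_instances = []
    for epoch in range(max_epochs):
        data = list(train_data)                 # fresh copy of the base training data
        if epoch + 1 > warmup and hard_instances:
            data.extend(hard_instances)         # "boost" with last epoch's hard examples
        avg_loss, hard_instances = train_epoch(data)
        yield epoch, avg_loss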
Example #6
                         replace_unk=False,
                         phrase_table='',
                         verbose=True,
                         dump_beam='',
                         n_best=1,
                         batch_type='sents',
                         gpu=0)

fields, model, model_opt = load_test_model(opt, args)
scorer = GNMTGlobalScorer.from_opt(opt)
out_file = codecs.open(opt.output, 'w+', 'utf-8')
translator = Translator.from_opt(model,
                                 fields,
                                 opt,
                                 model_opt,
                                 args,
                                 global_scorer=scorer,
                                 out_file=out_file,
                                 report_align=opt.report_align,
                                 report_score=False,
                                 logger=None)

res = []
n = 1
with open(args.input_file, 'r') as f:
    lines = f.readlines()
    lines = [line.strip() for line in lines]
    # translate() returns (scores, predictions); keep the best hypothesis for each line
    translated = translator.translate(lines, batch_size=args.batch_size)
    for i in range(len(translated[1])):
        res.append(translated[1][i][0])

if args.output_file:
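
The snippet breaks off at this check. A minimal sketch of a plausible completion, assumed rather than taken from the original, would write the collected best hypotheses to args.output_file, one per line:

# Hypothetical completion, not part of the original example.
if args.output_file:
    with open(args.output_file, 'w', encoding='utf-8') as f_out:
        f_out.write('\n'.join(res) + '\n')
else:
    for line in res:
        print(line)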
Example #7
def main(params, greedy, beam_size, test):
    """
    The main function for decoding a trained MT model
    Arguments:
        params: parameters related to the `model` that is being decoded
        greedy: whether or not to do greedy decoding
        beam_size: size of beam if doing beam search
    """
    print("Loading dataset...")
    _, dev_iter, test_iterator, DE, EN = load_dataset(params.data_path,
                                                      params.train_batch_size,
                                                      params.dev_batch_size)
    de_size, en_size = len(DE.vocab), len(EN.vocab)
    print("[DE Vocab Size: ]: {}, [EN Vocab Size]: {}".format(
        de_size, en_size))

    params.src_vocab_size = de_size
    params.tgt_vocab_size = en_size
    params.sos_index = EN.vocab.stoi["<s>"]
    params.pad_token = EN.vocab.stoi["<pad>"]
    params.eos_index = EN.vocab.stoi["</s>"]
    params.itos = EN.vocab.itos

    device = torch.device('cuda' if params.cuda else 'cpu')
    params.device = device

    # make the Seq2Seq model
    model = make_seq2seq_model(params)

    # load the saved model for evaluation
    if params.average > 1:
        print("Averaging the last {} checkpoints".format(params.average))
        checkpoint = {}
        checkpoint["state_dict"] = average_checkpoints(params.model_dir,
                                                       params.average)
        model = Trainer.load_checkpoint(model, checkpoint)
    else:
        model_path = os.path.join(params.model_dir + "checkpoints/",
                                  params.model_file)
        print("Restoring parameters from {}".format(model_path))
        model = Trainer.load_checkpoint(model, model_path)

    # evaluate on the test set
    if test:
        print("Doing Beam Search on the Test Set")
        test_decoder = Translator(model, test_iterator, params, device)
        test_beam_search_outputs = test_decoder.beam_decode(
            beam_width=beam_size)
        test_decoder.output_decoded_translations(
            test_beam_search_outputs,
            "beam_search_outputs_size_test={}.en".format(beam_size))
        return

    # instantiate a Translator object to translate the SRC language to the TRG language using Greedy/Beam Decoding
    decoder = Translator(model, dev_iter, params, device)

    if greedy:
        print("Doing Greedy Decoding...")
        greedy_outputs = decoder.greedy_decode(max_len=100)
        decoder.output_decoded_translations(greedy_outputs,
                                            "greedy_outputs.en")

        print("Evaluating BLEU Score on Greedy Tranlsation...")
        subprocess.call([
            './utils/eval.sh', params.model_dir + "outputs/greedy_outputs.en"
        ])

    if beam_size:
        print("Doing Beam Search...")
        beam_search_outputs = decoder.beam_decode(beam_width=beam_size)
        decoder.output_decoded_translations(
            beam_search_outputs,
            "beam_search_outputs_size={}.en".format(beam_size))

        print("Evaluating BLEU Score on Beam Search Translation")
        subprocess.call([
            './utils/eval.sh', params.model_dir +
            "outputs/beam_search_outputs_size={}.en".format(beam_size)
        ])
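
A minimal sketch of invoking main() with a hand-built params object. The attribute names mirror those read by main() and the snippet above; the values are placeholders, not the project's defaults.

from types import SimpleNamespace

params = SimpleNamespace(
    data_path='data/',                  # corpora location expected by load_dataset()
    train_batch_size=64,
    dev_batch_size=1,
    model_dir='experiments/transformer/',
    model_file='best.pth.tar',
    average=1,                          # >1 averages the last N checkpoints instead
    cuda=False,
)
main(params, greedy=True, beam_size=0, test=False)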