    def test_starting_out(self):
        translator = Translator((100, 100))
        self.assertEqual(translator.global_origin_vector, Vector2D([50, 50]))
        self.assertEqual(translator.spacing, 10)

        translator = Translator((400, 400))
        self.assertEqual(translator.spacing, 40)
    def test_starting_out(self):
        translator = Translator((100, 100))

        v1 = Vector2D([1, 1])
        translation1 = translator.translate(v1)
        self.assertEqual(translation1, Vector2D([60, 40]))

        v2 = Vector2D([-20, 0])
        translation2 = translator.translate(v2)
        self.assertEqual(translation2, Vector2D([-150, 50]))
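
The Translator and Vector2D classes exercised by these tests are not shown on this page. Below is a minimal, hypothetical sketch that is consistent with the assertions here and in the later coordinate-translation examples (a grid-to-pixel mapping with the y axis flipped); it is not the project's actual implementation.

class Vector2D:
    def __init__(self, coords):
        self.x, self.y = coords

    def __eq__(self, other):
        return self.x == other.x and self.y == other.y

    def __repr__(self):
        return "Vector2D([{}, {}])".format(self.x, self.y)


class Translator:
    # Hypothetical: maps grid coordinates to screen pixels.
    def __init__(self, screen_size):
        width, height = screen_size
        self.screen_size = screen_size
        self.global_origin_vector = Vector2D([width / 2, height / 2])
        self.local_origin_vector = self.global_origin_vector
        self.base_spacing = width / 10  # pixels per grid unit at zoom 1
        self.spacing = self.base_spacing

    def translate(self, v):
        o = self.local_origin_vector
        return Vector2D([o.x + v.x * self.spacing, o.y - v.y * self.spacing])

    def update(self, offset, zoom):
        # Rescale the grid and pan the local origin by `offset` grid units.
        self.spacing = self.base_spacing * zoom
        g = self.global_origin_vector
        self.local_origin_vector = Vector2D(
            [g.x + offset.x * self.spacing, g.y - offset.y * self.spacing])

    def is_in_range(self, v):
        # A point is visible if its pixel position falls inside the screen.
        p = self.translate(v)
        width, height = self.screen_size
        return 0 <= p.x <= width and 0 <= p.y <= height
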
Example No. 3
 def train(self, model: Seq2Seq, discriminator: Discriminator,
           src_file_names: List[str], tgt_file_names: List[str],
           unsupervised_big_epochs: int, print_every: int, save_every: int,
           num_words_in_batch: int, max_length: int, teacher_forcing: bool,
           save_file: str="model", n_unsupervised_batches: int=None,
           enable_unsupervised_backtranslation: bool=False):
     if self.main_optimizer is None or self.discriminator_optimizer is None:
         logger.info("Initializing optimizers...")
         self.main_optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), 
                                          lr=self.main_lr, betas=self.main_betas)
         self.discriminator_optimizer = optim.RMSprop(discriminator.parameters(), lr=self.discriminator_lr)
     for big_epoch in range(unsupervised_big_epochs):
         src_batch_gen = BatchGenerator(src_file_names, num_words_in_batch, max_len=max_length,
                                        vocabulary=self.vocabulary, language="src",
                                        max_batch_count=n_unsupervised_batches)
         tgt_batch_gen = BatchGenerator(tgt_file_names, num_words_in_batch, max_len=max_length,
                                        vocabulary=self.vocabulary, language="tgt",
                                        max_batch_count=n_unsupervised_batches)
         logger.debug("Src batch:" + str(next(iter(src_batch_gen))))
         logger.debug("Tgt batch:" + str(next(iter(tgt_batch_gen))))
         timer = time.time()
         main_loss_total = 0
         discriminator_loss_total = 0
         epoch = 0
         for src_batch, tgt_batch in zip(src_batch_gen, tgt_batch_gen):
             model.train()
             discriminator_loss, losses = self.train_batch(model, discriminator, src_batch,
                                                           tgt_batch, teacher_forcing)
             main_loss = sum(losses)
             main_loss_total += main_loss
             discriminator_loss_total += discriminator_loss
             if epoch % save_every == 0 and epoch != 0:
                 save_model(model, discriminator, self.main_optimizer,
                            self.discriminator_optimizer, save_file + ".pt")
             if epoch % print_every == 0 and epoch != 0:
                 main_loss_avg = main_loss_total / print_every
                 discriminator_loss_avg = discriminator_loss_total / print_every
                 main_loss_total = 0
                 discriminator_loss_total = 0
                 diff = time.time() - timer
                 timer = time.time()
                 translator = Translator(model, self.vocabulary, self.use_cuda)
                 logger.debug("Auto: " + translator.translate_sentence("you can prepare your meals here .",
                                                                       "src", "src"))
                 logger.debug("Translated: " + translator.translate_sentence("you can prepare your meals here .",
                                                                             "src", "tgt"))
                 logger.info('%s big epoch, %s epoch, %s sec, %.4f main loss, '
                              '%.4f discriminator loss, current losses: %s' %
                              (big_epoch, epoch, diff, main_loss_avg, discriminator_loss_avg, losses))
             epoch += 1
         save_model(model, discriminator, self.main_optimizer,
                    self.discriminator_optimizer, save_file + ".pt")
         if enable_unsupervised_backtranslation:
             self.current_translation_model = Translator(model, self.vocabulary, self.use_cuda)
             model = copy.deepcopy(model)
Example No. 4
 def __init__(self, fileName):
     self.prog = None
     self.iProgLine = 0
     self.symCommand = ""
     self.binCommand = ""
     self.nCommand = 0
     self.isComment = False
     self.symCommandType = -1
     self.translator = Translator()
     self.symTable = SymTable()
     self.newFile = fileName + ".hack"
     return None
    def test_starting_out(self):
        translator = Translator((100, 100))
        v1 = Vector2D([1, 1])
        self.assertTrue(translator.is_in_range(v1))

        translator.update(Vector2D([10, 0]), 1)
        self.assertFalse(translator.is_in_range(v1))

        translator.update(Vector2D([0, 0]), 1)

        v2 = Vector2D([0, 5])
        self.assertTrue(translator.is_in_range(v2))
        v2.y += 1
        self.assertFalse(translator.is_in_range(v2))
Example No. 6
def main():
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
    use_cuda = torch.cuda.is_available()
    logging.info("Use CUDA: " + str(use_cuda))

    _, _, vocabulary = collect_vocabularies(
        src_vocabulary_path=opt.src_vocabulary,
        tgt_vocabulary_path=opt.tgt_vocabulary,
        all_vocabulary_path=opt.all_vocabulary,
        reset=False)
    if opt.src_to_tgt_dict is not None and opt.tgt_to_src_dict is not None:
        translator = WordByWordModel(opt.src_to_tgt_dict, opt.tgt_to_src_dict,
                                     vocabulary, opt.max_length)
    else:
        model, _, _, _ = load_model(opt.model, use_cuda)
        translator = Translator(model, vocabulary, use_cuda)
    input_filename = opt.input
    output_filename = opt.output
    lang = opt.lang
    tgt_lang = "src" if lang == "tgt" else "tgt"
    logging.info("Writing output...")
    with open(input_filename, "r", encoding="utf-8") as r, \
            open(output_filename, "w", encoding="utf-8") as w:
        for line in r:
            translated = translator.translate_sentence(line, lang, tgt_lang)
            logging.debug(translated)
            w.write(translated + "\n")
Example No. 7
def init_zero_supervised(vocabulary, save_file, use_cuda):
    model, discriminator = build_model(
        max_length=opt.max_length,
        output_size=vocabulary.size(),
        rnn_size=opt.rnn_size,
        encoder_n_layers=opt.layers,
        decoder_n_layers=opt.layers,
        dropout=opt.dropout,
        use_cuda=use_cuda,
        enable_embedding_training=bool(opt.sv_embedding_training),
        discriminator_hidden_size=opt.discriminator_hidden_size,
        bidirectional=bool(opt.bidirectional),
        use_attention=bool(opt.attention))
    if opt.src_embeddings is not None:
        load_embeddings(model,
                        src_embeddings_filename=opt.src_embeddings,
                        tgt_embeddings_filename=opt.tgt_embeddings,
                        vocabulary=vocabulary)
    model = model.cuda() if use_cuda else model
    discriminator = discriminator.cuda() if use_cuda else discriminator
    print_summary(model)

    trainer = Trainer(
        vocabulary,
        max_length=opt.max_length,
        use_cuda=use_cuda,
        discriminator_lr=opt.discriminator_lr,
        main_lr=opt.sv_learning_rate,
        main_betas=(opt.adam_beta1, 0.999),
    )

    if opt.sv_load_from:
        model, discriminator, main_optimizer, discriminator_optimizer = load_model(
            opt.sv_load_from, use_cuda)
        trainer.main_optimizer = main_optimizer
        trainer.discriminator_optimizer = discriminator_optimizer
    else:
        pair_file_names = [
            (opt.train_src_bi, opt.train_tgt_bi),
        ]
        trainer.train_supervised(model,
                                 discriminator,
                                 pair_file_names,
                                 vocabulary,
                                 num_words_in_batch=opt.sv_num_words_in_batch,
                                 max_length=opt.max_length,
                                 save_file=save_file,
                                 big_epochs=opt.supervised_epochs,
                                 print_every=opt.print_every,
                                 save_every=opt.save_every,
                                 max_batch_count=opt.n_supervised_batches)
    for param in model.parameters():
        param.requires_grad = False
    return Translator(model, vocabulary, use_cuda)
    def run(self, interface, **kwargs):
        book = ChoseBook().run(interface, **kwargs)

        if not book.are_all_words_processed():
            interface.display_info(
                "The book is not fully processed. Please firstly mark known words. "
            )
            return

        translator = Translator()

        unknown_words_cnt = len(book.unknown_words)
        all_flashcards = defaultdict(list)
        for idx, word in enumerate(book.unknown_words):
            translation_units = translator.get_translation(word.stored_word)
            prompt = self.get_translation_choice_prompt(
                idx, unknown_words_cnt, translation_units)

            multiple_input_processor = MultipleInputProcessor(
                IntInRangeInputProcessor(valid_range=(0,
                                                      len(translation_units))))

            choices = interface.get_input(
                prompt, input_processor=multiple_input_processor)

            for choice in choices:
                chosen_translation_unit = translation_units[choice]
                all_flashcards[", ".join(
                    chosen_translation_unit.words)].append(", ".join(
                        chosen_translation_unit.meanings))

            word.mark_if_known(True)

        all_flashcards_in_final_format = [
            f"{key}={'/'.join(val)}" for key, val in all_flashcards.items()
        ]
        book.flashcards.extend(all_flashcards_in_final_format)

        db = Database()
        db.store_book(book)
        interface.display_info("FINISHED MAKING FLASHCARDS")
    def test_starting_out(self):
        translator = Translator((100, 100))
        translator.update(Vector2D([0, 0]), 0.5)
        self.assertEqual(translator.spacing, 5)
        self.assertEqual(translator.local_origin_vector,
                         translator.global_origin_vector)

        translator.update(Vector2D([5, 5]), 0.5)
        self.assertEqual(translator.spacing, 5)
        self.assertEqual(translator.local_origin_vector, Vector2D([75, 25]))

        translator.update(Vector2D([5, 5]), 2)
        self.assertEqual(translator.spacing, 20)
        self.assertEqual(translator.local_origin_vector, Vector2D([150, -50]))
Example No. 10
 def train_supervised(self, model, discriminator, pair_file_names, vocabulary: Vocabulary, *, num_words_in_batch, 
                      big_epochs, max_length, max_batch_count=None, save_every=100, print_every=100,
                      save_file="model"):
     if self.main_optimizer is None:
         logger.info("Initializing optimizers...")
         self.main_optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), 
                                          lr=self.main_lr, betas=self.main_betas)
         self.discriminator_optimizer = optim.RMSprop(discriminator.parameters(), lr=self.discriminator_lr)
     for big_epoch in range(big_epochs):
         batch_gen = BilingualBatchGenerator(pair_file_names, max_length, num_words_in_batch, vocabulary,
                                             languages=["src", "tgt"], max_batch_count=max_batch_count)
         timer = time.time()
         loss_total = 0
         epoch = 0
         model.train()
         for src_batch, tgt_batch in batch_gen:
             logger.debug("Src batch: " + str(src_batch))
             logger.debug("Tgt batch: " + str(tgt_batch))
             loss = self.train_supervised_batch(model, src_batch, tgt_batch)
             Batch.print_pair(src_batch, tgt_batch, self.vocabulary, "src-tgt")
             logger.debug("Loss: " + str(loss))
             loss_total += loss
             if epoch % save_every == 0 and epoch != 0:
                 save_model(model, discriminator, self.main_optimizer, self.discriminator_optimizer,
                            save_file + "_supervised.pt")
             if epoch % print_every == 0 and epoch != 0:
                 print_loss_avg = loss_total / print_every
                 loss_total = 0
                 diff = time.time() - timer
                 timer = time.time()
                 translator = Translator(model, self.vocabulary, self.use_cuda)
                 logger.debug("Translated: "+ translator.translate_sentence("you can prepare your meals here .", "src", "tgt"))
                 logger.info('%s big epoch, %s epoch, %s sec, %.4f main loss' %
                             (big_epoch, epoch, diff, print_loss_avg))
             epoch += 1
         save_model(model, discriminator, self.main_optimizer, self.discriminator_optimizer,
                    save_file + "_supervised.pt")
    def test_starting_out(self):
        translator = Translator((100, 100))
        self.assertEqual(translator.global_origin_vector, Vector2D([50, 50]))

        translator.update(Vector2D([10, 0]), 1)
        self.assertEqual(translator.spacing, 10)
        self.assertEqual(translator.local_origin_vector, Vector2D([150, 50]))

        translator.update(Vector2D([5, 5]), 1)
        self.assertEqual(translator.spacing, 10)
        self.assertEqual(translator.local_origin_vector, Vector2D([100, 0]))
Example No. 12
def main():
    logging.basicConfig(level=logging.DEBUG)
    
    logger = logging.getLogger("unmt")
    logger.propagate = False
    fh = logging.FileHandler(opt.log_file)
    fh.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    logger.addHandler(fh)
    logger.addHandler(ch)
    
    use_cuda = torch.cuda.is_available()
    logger.info("Use CUDA: " + str(use_cuda))
  
    _, _, vocabulary = collect_vocabularies(
            src_vocabulary_path=opt.src_vocabulary,
            tgt_vocabulary_path=opt.tgt_vocabulary,
            all_vocabulary_path=opt.all_vocabulary,
            src_file_names=(opt.train_src_mono, ),
            tgt_file_names=(opt.train_tgt_mono, ),
            src_max_words=opt.src_vocab_size,
            tgt_max_words=opt.tgt_vocab_size,
            reset=bool(opt.reset_vocabularies))

    if opt.src_to_tgt_dict is not None and opt.tgt_to_src_dict is not None:
        zero_model = WordByWordModel(opt.src_to_tgt_dict, opt.tgt_to_src_dict, vocabulary, opt.max_length)
    elif opt.bootstrapped_model is not None:
        model, discriminator, _, _ = load_model(opt.bootstrapped_model, use_cuda)
        for param in model.parameters():
            param.requires_grad = False
        zero_model = Translator(model, vocabulary, use_cuda)
    elif opt.train_src_bi is not None and opt.train_tgt_bi is not None:
        zero_model = init_zero_supervised(vocabulary, opt.save_model, use_cuda)
    else:
        assert False, "Zero model was not initialized"
    
    trainer = Trainer(vocabulary,
                      max_length=opt.max_length,
                      use_cuda=use_cuda,
                      discriminator_lr=opt.discriminator_lr,
                      main_lr=opt.learning_rate,
                      main_betas=(opt.adam_beta1, 0.999),)
    trainer.current_translation_model = zero_model

    model, discriminator = build_model(
        max_length=opt.max_length,
        output_size=vocabulary.size(),
        rnn_size=opt.rnn_size,
        encoder_n_layers=opt.layers,
        decoder_n_layers=opt.layers,
        dropout=opt.dropout,
        use_cuda=use_cuda,
        enable_embedding_training=bool(opt.usv_embedding_training),
        discriminator_hidden_size=opt.discriminator_hidden_size,
        bidirectional=bool(opt.bidirectional),
        use_attention=bool(opt.attention)
    )
    if opt.src_embeddings is not None:
        load_embeddings(model,
                        src_embeddings_filename=opt.src_embeddings,
                        tgt_embeddings_filename=opt.tgt_embeddings,
                        vocabulary=vocabulary)
    model = model.cuda() if use_cuda else model
    print_summary(model)
    print_summary(discriminator)
    discriminator = discriminator.cuda() if use_cuda else discriminator

    if opt.usv_load_from:
        model, discriminator, main_optimizer, discriminator_optimizer = load_model(opt.usv_load_from, use_cuda)
        trainer.main_optimizer = main_optimizer
        trainer.discriminator_optimizer = discriminator_optimizer

    trainer.train(model, discriminator,
                  src_file_names=[opt.train_src_mono, ],
                  tgt_file_names=[opt.train_tgt_mono, ],
                  unsupervised_big_epochs=opt.unsupervised_epochs,
                  num_words_in_batch=opt.usv_num_words_in_batch,
                  print_every=opt.print_every,
                  save_every=opt.save_every,
                  save_file=opt.save_model,
                  n_unsupervised_batches=opt.n_unsupervised_batches,
                  enable_unsupervised_backtranslation=opt.enable_unsupervised_backtranslation,
                  teacher_forcing=bool(opt.teacher_forcing),
                  max_length=opt.max_length)
Example No. 13
 def __init__(self):
     self.cal = Calculator()
     self.data_maps = DataMaps()
     self.translator = Translator()
Example No. 14
class Parser:
    def __init__(self, fileName):
        self.prog = None
        self.iProgLine = 0
        self.symCommand = ""
        self.binCommand = ""
        self.nCommand = 0
        self.isComment = False
        self.symCommandType = -1
        self.translator = Translator()
        self.symTable = SymTable()
        self.newFile = fileName + ".hack"
        return None

    def addData(self, data):
        self.prog = data
        return None

    def advance(self):
        # strip leading/trailing whitespace, including "\n" characters
        currentLine = self.prog[self.iProgLine].strip()
        self.iProgLine += 1
        # do not parse empty lines and comments
        if ('//' in currentLine):
            i = currentLine.find('//')
            currentLine = currentLine[:i].strip()
        if ('/*' in currentLine and '*/' in currentLine):
            i = currentLine.find('/*')
            j = currentLine.find('*/')
            currentLine = (currentLine[:i] + currentLine[j+2:]).strip()
        if ('/*' in currentLine):
            self.isComment = True
            return False
        if ('*/' in currentLine):
            self.isComment = False
            return False
        if (self.isComment):
            return False
        if (not currentLine):
            return False
        if (currentLine.startswith("(")):
            return False
        self.symCommand = currentLine
        # select correct type of instruction
        if (currentLine.startswith("@")):
            self.symCommandType = cType.A_COMMAND
        else:
            self.symCommandType = cType.C_COMMAND
        self.nCommand += 1
        return True

    def symbol(self):
        value = ""
        if (self.symCommandType == cType.A_COMMAND):
            value = self.symCommand.strip("@")
        if (not value.isdigit()):
            if (self.symTable.contains(value)):
                value = self.symTable.getAddress(value)
            else:
                value = self.symTable.addEntry(value)
        return '{0:016b}'.format(int(value))

    def instructions(self):
        if (self.symCommandType != cType.C_COMMAND):
            return ("", "", "")
        i = self.symCommand.find('=')
        j = self.symCommand.find(';')
        if (i == -1 and j == -1):
            dest = 'null'
            comp = self.symCommand
            jump = 'null'
            return (dest, comp, jump)
        if (i == -1):
            dest = 'null'
            comp = self.symCommand[:j]
            jump = self.symCommand[j+1:]
            return (dest, comp, jump)
        if (j == -1):
            dest = self.symCommand[:i]
            comp = self.symCommand[i+1:]
            jump = 'null'
            return (dest, comp, jump)
        dest = self.symCommand[:i]
        comp = self.symCommand[i+1:j]
        jump = self.symCommand[j+1:]
        return (dest, comp, jump)

    def commandToBinary(self):
        if (self.symCommandType == cType.C_COMMAND):
            (dest, comp, jump) = self.instructions()
            cCommandBinary = self.translator.translate(dest, comp, jump)
            return '111' + cCommandBinary
        else:
            return self.symbol()

    def parse(self):
        self.symTable.addData(self.prog)
        self.symTable.findLabels()
        # use a context manager so the .hack file is closed even on errors
        with open(self.newFile, 'w') as hackFile:
            while (self.iProgLine < len(self.prog)):
                if (self.advance()):
                    binCommand = self.commandToBinary()
                    hackFile.write(binCommand)
                    hackFile.write("\n")
        return
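
A hypothetical driver for this Parser (not part of the original example), assuming the Hack .asm source is read into a list of lines and the output name is derived from the input path:

def assemble(asm_path):
    # Hypothetical helper: read <name>.asm and write <name>.hack via Parser.
    with open(asm_path) as asm_file:
        lines = asm_file.readlines()
    parser = Parser(asm_path.rsplit('.', 1)[0])  # Parser appends ".hack" itself
    parser.addData(lines)
    parser.parse()
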
Example No. 15
from typing import List

from src.translator import Translator

DEFAULT_BF_SOURCES = 'etc/hello_world.bf'


def read_sources(filename: str) -> List[str]:
    """read char by char a source file

    :param filename: path to the source file

    :return: its content as a list of char
    """
    bf_sources: List[str] = []

    with open(filename, 'r') as source:
        for line in source:
            bf_sources.extend(line)

    return bf_sources


if __name__ == '__main__':
    sources = read_sources(DEFAULT_BF_SOURCES)
    sources = Translator.sanitize(sources)
    c_sources = Translator.bf_to_c(sources)
    print(''.join(c_sources))
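
The src.translator module used above is not shown on this page. The following is a minimal, hypothetical sketch of what sanitize and bf_to_c could look like, based on the standard Brainfuck-to-C token mapping; the actual project code may differ.

from typing import List

BF_TOKENS = set('><+-.,[]')

# Standard Brainfuck -> C statement mapping over a byte tape addressed by ptr.
BF_TO_C = {
    '>': '++ptr;',
    '<': '--ptr;',
    '+': '++*ptr;',
    '-': '--*ptr;',
    '.': 'putchar(*ptr);',
    ',': '*ptr = getchar();',
    '[': 'while (*ptr) {',
    ']': '}',
}


class Translator:
    @staticmethod
    def sanitize(chars: List[str]) -> List[str]:
        # Keep only the eight Brainfuck instructions; everything else is a comment.
        return [c for c in chars if c in BF_TOKENS]

    @staticmethod
    def bf_to_c(chars: List[str]) -> List[str]:
        lines = ['#include <stdio.h>\n',
                 'int main(void) {\n',
                 '    static unsigned char tape[30000];\n',
                 '    unsigned char *ptr = tape;\n']
        lines += ['    ' + BF_TO_C[c] + '\n' for c in chars]
        lines.append('    return 0;\n}\n')
        return lines
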
Example No. 16
def runf1(conn, args):
    # evaluation dataset
    # English context so that the answer is in English
    data = MLQADataset(args.dataset, 'en', args.langQuestion)

    # initialize searcher
    init(conn, 'wiki', args)

    # initialise reader
    print("Reader")
    reader = Reader(model="models/distilbert-base-uncased-distilled-squad/",
                    tokenizer="models/distilbert-uncased-my-tok")

    # initialise translator
    print("Translator")
    languages = {args.langQuestion, args.langSearch, 'en'}
    translator = Translator(languages)
    print("Translating between: {}".format(str(languages)))
    counters = {'f1': [], 'tally': 0, 'score': []}

    for doc in data.get():
        questionSearch = translator(doc['question'], args.langQuestion,
                                    args.langSearch)
        #print("questionSearch ", questionSearch.encode('utf-8'))
        search(conn, questionSearch, args.langSearch)

        if args.langSearch == 'en':
            questionRead = questionSearch
        else:
            questionRead = translator(doc['question'], args.langQuestion, 'en')
        #print("questionRead ", questionRead.encode('utf-8'))
        # recv = {'search':[{'id':qid, 'docs':[{'context':'...', 'title':'...', 'score':score}]}]
        bestScore = 0
        recv = recvall(conn)
        for n, docSearch in enumerate(recv['search'][0]['docs']):
            # reader answer question given contexts
            #print("n: ", n)
            #print("contextSearch ", docSearch['context'].encode('utf-8'))
            contextRead = translator(docSearch['context'], args.langSearch,
                                     'en')
            #print("contextRead ", contextRead.encode('utf-8'))
            _, answerRead, score = reader(questionRead, contextRead)
            if score >= bestScore:
                bestScore = score
                bestAnswer = answerRead
                bestContext = contextRead

        #print("goldAnswer: ",doc['answer'].encode('utf-8'))
        #print("Answer:     ",bestAnswer.encode('utf-8'))
        counters['f1'].append(f1_drqa(bestAnswer, doc['answer']))
        counters['tally'] += 1
        counters['score'].append(bestScore)
        # test
        if args.stop != 0 and counters['tally'] >= args.stop:
            print("Stoping at: ", counters['tally'])
            break
        #if i > 1:
        #    break

    f1 = np.array(counters['f1'])
    exact_match = f1[f1 == 1.0].sum() / f1.size
    print("Exact match: {}".format(exact_match))
    print("F1 mean: {}".format(f1.mean()))
    print("Mean score: {}".format(sum(counters['score']) / counters['tally']))
    print("Total: {}".format(counters['tally']))
    if args.save_as:
        print("Writing to: ", args.save_as)
        with open(args.save_as, "w") as fp:
            json.dump(counters, fp)

    close(conn, args.stop_server)

    return f1.mean()
Example No. 17
def main_train():
    # Build argument parser
    parser = argparse.ArgumentParser(description='Train a table to text model')

    # Training corpus
    corpora_group = parser.add_argument_group('training corpora',
                                              'Corpora related arguments; specify either unaligned or'
                                              ' aligned training corpora')
    # "Languages (type,path)"
    corpora_group.add_argument('--src_corpus_params', type=str,
                               default='table, ./data/processed_data/train/train.box',
                               help='the source unaligned corpus (type,path). Type = text/table')
    corpora_group.add_argument('--trg_corpus_params', type=str,
                               default='text, ./data/processed_data/train/train.article',
                               help='the target unaligned corpus (type,path). Type = text/table')
    corpora_group.add_argument('--src_para_corpus_params', type=str, default='',
                               help='the source corpus of parallel data(type,path). Type = text/table')
    corpora_group.add_argument('--trg_para_corpus_params', type=str, default='',
                               help='the target corpus of parallel data(type,path). Type = text/table')
    # Maybe add src/target type (i.e. text/table)
    corpora_group.add_argument('--corpus_mode', type=str, default='mono',
                               help='training mode: "mono" (unsupervised) / "para" (supervised)')

    corpora_group.add_argument('--max_sentence_length', type=int, default=50,
                               help='the maximum sentence length for training (defaults to 50)')
    corpora_group.add_argument('--cache', type=int, default=100000,
                               help='the cache size (in sentences) for corpus reading (defaults to 100000)')

    # Embeddings/vocabulary
    embedding_group = parser.add_argument_group('embeddings',
                                                'Embedding related arguments; either give pre-trained embeddings,'
                                                ' or a vocabulary and embedding dimensionality to'
                                                ' randomly initialize them')
    embedding_group.add_argument('--metadata_path', type=str, default='', required=True,
                                 help='Path for bin file created in pre-processing phase, '
                                      'containing BPEmb related metadata.')

    # Architecture
    architecture_group = parser.add_argument_group('architecture', 'Architecture related arguments')
    architecture_group.add_argument('--layers', type=int, default=2,
                                    help='the number of encoder/decoder layers (defaults to 2)')
    architecture_group.add_argument('--hidden', type=int, default=600,
                                    help='the number of dimensions for the hidden layer (defaults to 600)')
    architecture_group.add_argument('--dis_hidden', type=int, default=150,
                                    help='Number of dimensions for the discriminator hidden layers')
    architecture_group.add_argument('--n_dis_layers', type=int, default=2,
                                    help='Number of discriminator layers')
    architecture_group.add_argument('--disable_bidirectional', action='store_true',
                                    help='use a single direction encoder')
    architecture_group.add_argument('--disable_backtranslation', action='store_true', help='disable backtranslation')
    architecture_group.add_argument('--disable_field_loss', action='store_true', help='disable field loss')
    architecture_group.add_argument('--disable_discriminator', action='store_true', help='disable discriminator')
    architecture_group.add_argument('--shared_enc', action='store_true', help='share enc for both directions')
    architecture_group.add_argument('--shared_dec', action='store_true', help='share dec for both directions')

    # Denoising
    denoising_group = parser.add_argument_group('denoising', 'Denoising related arguments')
    denoising_group.add_argument('--denoising_mode', type=int, default=1, help='0/1/2 = disabled/old/new')
    denoising_group.add_argument('--word_shuffle', type=int, default=3,
                                 help='shuffle words (only relevant in new mode)')
    denoising_group.add_argument('--word_dropout', type=float, default=0.1,
                                 help='randomly remove words (only relevant in new mode)')
    denoising_group.add_argument('--word_blank', type=float, default=0.2,
                                 help='randomly blank out words (only relevant in new mode)')

    # Optimization
    optimization_group = parser.add_argument_group('optimization', 'Optimization related arguments')
    optimization_group.add_argument('--batch', type=int, default=50, help='the batch size (defaults to 50)')
    optimization_group.add_argument('--learning_rate', type=float, default=0.0002,
                                    help='the global learning rate (defaults to 0.0002)')
    optimization_group.add_argument('--dropout', metavar='PROB', type=float, default=0.3,
                                    help='dropout probability for the encoder/decoder (defaults to 0.3)')
    optimization_group.add_argument('--param_init', metavar='RANGE', type=float, default=0.1,
                                    help='uniform initialization in the specified range (defaults to 0.1,  0 for module specific default initialization)')
    optimization_group.add_argument('--iterations', type=int, default=300000,
                                    help='the number of training iterations (defaults to 300000)')

    # Model saving
    saving_group = parser.add_argument_group('model saving', 'Arguments for saving the trained model')
    saving_group.add_argument('--save', metavar='PREFIX', help='save models with the given prefix')
    saving_group.add_argument('--save_interval', type=int, default=0, help='save intermediate models at this interval')

    # Logging/validation
    logging_group = parser.add_argument_group('logging', 'Logging and validation arguments')
    logging_group.add_argument('--log_interval', type=int, default=100, help='log at this interval (defaults to 100)')
    logging_group.add_argument('--dbg_print_interval', type=int, default=1000,
                               help='print debug output at this interval (defaults to 1000)')
    logging_group.add_argument('--src_valid_corpus', type=str, default='')
    logging_group.add_argument('--trg_valid_corpus', type=str, default='')
    logging_group.add_argument('--print_level', type=str, default='info', help='logging level [debug | info]')

    # Other
    misc_group = parser.add_argument_group('misc', 'Misc. arguments')
    misc_group.add_argument('--encoding', default='utf-8',
                            help='the character encoding for input/output (defaults to utf-8)')
    misc_group.add_argument('--cuda', type=str, default='cpu', help='device for training. default value: "cpu"')
    misc_group.add_argument('--bleu_device', type=str, default='',
                            help='device for calculating BLEU scores in case a validation dataset is given')

    # Parse arguments
    args = parser.parse_args()

    logger = logging.getLogger()
    if args.print_level == 'debug':
        logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
    elif args.print_level == 'info':
        logging.basicConfig(stream=sys.stderr, level=logging.INFO)
    elif args.print_level == 'warning':
        logging.basicConfig(stream=sys.stderr, level=logging.WARNING)
    else:
        logging.basicConfig(stream=sys.stderr, level=logging.CRITICAL)

    # Validate arguments
    if args.src_corpus_params is None or args.trg_corpus_params is None:
        print("Must supply corpus")
        sys.exit(-1)

    args.src_corpus_params = args.src_corpus_params.split(',')
    args.trg_corpus_params = args.trg_corpus_params.split(',')
    assert len(args.src_corpus_params) == 2
    assert len(args.trg_corpus_params) == 2

    src_type, src_corpus_path = args.src_corpus_params
    trg_type, trg_corpus_path = args.trg_corpus_params

    src_type = src_type.strip()
    src_corpus_path = src_corpus_path.strip()
    trg_type = trg_type.strip()
    trg_corpus_path = trg_corpus_path.strip()

    assert src_type != trg_type
    assert (src_type in ['table', 'text']) and (trg_type in ['table', 'text'])

    corpus_size = get_num_lines(src_corpus_path + '.content')

    # Select device
    if torch.cuda.is_available():
        device = torch.device(args.cuda)
    else:
        device = torch.device('cpu')

    if args.bleu_device == '':
        args.bleu_device = device

    current_time = str(datetime.datetime.now().timestamp())
    run_dir = 'run_' + current_time + '/'
    train_log_dir = 'logs/train/' + run_dir + args.save
    valid_log_dir = 'logs/valid/' + run_dir + args.save

    train_writer = SummaryWriter(train_log_dir)
    valid_writer = SummaryWriter(valid_log_dir)

    # Create optimizer lists
    src2src_optimizers = []
    trg2trg_optimizers = []
    src2trg_optimizers = []
    trg2src_optimizers = []

    # Method to create a module optimizer and add it to the given lists
    def add_optimizer(module, directions=()):
        if args.param_init != 0.0:
            for param in module.parameters():
                param.data.uniform_(-args.param_init, args.param_init)
        optimizer = torch.optim.Adam(module.parameters(), lr=args.learning_rate)
        for direction in directions:
            direction.append(optimizer)
        return optimizer

    assert os.path.isfile(args.metadata_path)

    metadata = torch.load(args.metadata_path)
    bpemb_en = metadata.init_bpe_module()
    word_dict: BpeWordDict = torch.load(metadata.word_dict_path)
    field_dict: LabelDict = torch.load(metadata.field_dict_path)

    args.hidden = bpemb_en.dim + bpemb_en.dim // 2
    if not args.disable_bidirectional:
        args.hidden *= 2

    # Load embedding and/or vocab
    # word_dict = BpeWordDict.get(vocab=bpemb_en.words)
    w_sos_id = {'text': word_dict.bos_index, 'table': word_dict.sot_index}

    word_embeddings = nn.Embedding(len(word_dict), bpemb_en.dim, padding_idx=word_dict.pad_index)
    nn.init.normal_(word_embeddings.weight, 0, 0.1)
    nn.init.constant_(word_embeddings.weight[word_dict.pad_index], 0)
    with torch.no_grad():
        word_embeddings.weight[:bpemb_en.vs, :] = torch.from_numpy(bpemb_en.vectors)
    word_embedding_size = word_embeddings.weight.data.size()[1]
    word_embeddings = word_embeddings.to(device)
    word_embeddings.weight.requires_grad = False
    logger.debug('w_embeddings is running on cuda: %d', next(word_embeddings.parameters()).is_cuda)

    # field_dict: LabelDict = torch.load('./data/processed_data/train/field.dict')
    field_embeddings = nn.Embedding(len(field_dict), bpemb_en.dim // 2, padding_idx=field_dict.pad_index)
    nn.init.normal_(field_embeddings.weight, 0, 0.1)
    nn.init.constant_(field_embeddings.weight[field_dict.pad_index], 0)
    field_embedding_size = field_embeddings.weight.data.size()[1]
    field_embeddings = field_embeddings.to(device)
    field_embeddings.weight.requires_grad = True
    logger.debug('f_embeddings is running on cuda: %d', next(field_embeddings.parameters()).is_cuda)

    src_encoder_word_embeddings = word_embeddings
    trg_encoder_word_embeddings = word_embeddings
    src_encoder_field_embeddings = field_embeddings
    trg_encoder_field_embeddings = field_embeddings

    src_decoder_word_embeddings = word_embeddings
    trg_decoder_word_embeddings = word_embeddings
    src_decoder_field_embeddings = field_embeddings
    trg_decoder_field_embeddings = field_embeddings

    src_generator = LinearGenerator(args.hidden, len(word_dict), len(field_dict)).to(device)

    if args.shared_dec:
        trg_generator = src_generator
        add_optimizer(src_generator, (src2src_optimizers, trg2src_optimizers, trg2trg_optimizers, src2trg_optimizers))
    else:
        trg_generator = LinearGenerator(args.hidden, len(word_dict), len(field_dict)).to(device)
        add_optimizer(src_generator, (src2src_optimizers, trg2src_optimizers))
        add_optimizer(trg_generator, (trg2trg_optimizers, src2trg_optimizers))

    logger.debug('src generator is running on cuda: %d', next(src_generator.parameters()).is_cuda)
    logger.debug('trg generator is running on cuda: %d', next(trg_generator.parameters()).is_cuda)

    # Build encoder
    src_enc = RNNEncoder(word_embedding_size=word_embedding_size, field_embedding_size=field_embedding_size,
                         hidden_size=args.hidden, bidirectional=not args.disable_bidirectional,
                         layers=args.layers, dropout=args.dropout).to(device)

    if args.shared_enc:
        trg_enc = src_enc
        add_optimizer(src_enc, (src2src_optimizers, src2trg_optimizers, trg2trg_optimizers, trg2src_optimizers))
    else:
        trg_enc = RNNEncoder(word_embedding_size=word_embedding_size, field_embedding_size=field_embedding_size,
                             hidden_size=args.hidden, bidirectional=not args.disable_bidirectional,
                             layers=args.layers, dropout=args.dropout).to(device)
        add_optimizer(src_enc, (src2src_optimizers, src2trg_optimizers))
        add_optimizer(trg_enc, (trg2trg_optimizers, trg2src_optimizers))

    logger.debug('encoder model is running on cuda: %d', next(src_enc.parameters()).is_cuda)

    # Build decoders
    src_dec = RNNAttentionDecoder(word_embedding_size=word_embedding_size,
                                  field_embedding_size=field_embedding_size, hidden_size=args.hidden,
                                  layers=args.layers, dropout=args.dropout, input_feeding=False).to(device)

    if args.shared_dec:
        trg_dec = src_dec
        add_optimizer(src_dec, (src2src_optimizers, trg2src_optimizers, trg2trg_optimizers, src2trg_optimizers))
    else:
        trg_dec = RNNAttentionDecoder(word_embedding_size=word_embedding_size,
                                      field_embedding_size=field_embedding_size, hidden_size=args.hidden,
                                      layers=args.layers, dropout=args.dropout, input_feeding=False).to(device)
        add_optimizer(src_dec, (src2src_optimizers, trg2src_optimizers))
        add_optimizer(trg_dec, (trg2trg_optimizers, src2trg_optimizers))

    logger.debug('decoder model is running on cuda: %d', next(src_dec.parameters()).is_cuda)
    logger.debug('attention model is running on cuda: %d', next(src_dec.attention.parameters()).is_cuda)

    discriminator = None

    if (args.corpus_mode == 'mono') and not args.disable_discriminator:
        discriminator = Discriminator(args.hidden, args.dis_hidden, args.n_dis_layers, args.dropout)
        discriminator = discriminator.to(device)

    # Build translators
    src2src_translator = Translator("src2src",
                                    encoder_word_embeddings=src_encoder_word_embeddings,
                                    decoder_word_embeddings=src_decoder_word_embeddings,
                                    encoder_field_embeddings=src_encoder_field_embeddings,
                                    decoder_field_embeddings=src_decoder_field_embeddings,
                                    generator=src_generator,
                                    src_word_dict=word_dict, trg_word_dict=word_dict,
                                    src_field_dict=field_dict, trg_field_dict=field_dict,
                                    src_type=src_type, trg_type=src_type, w_sos_id=w_sos_id[src_type],
                                    bpemb_en=bpemb_en, encoder=src_enc, decoder=src_dec, discriminator=discriminator,
                                    denoising=args.denoising_mode, device=device,
                                    max_word_shuffle_distance=args.word_shuffle,
                                    word_dropout_prob=args.word_dropout,
                                    word_blanking_prob=args.word_blank)
    src2trg_translator = Translator("src2trg",
                                    encoder_word_embeddings=src_encoder_word_embeddings,
                                    decoder_word_embeddings=trg_decoder_word_embeddings,
                                    encoder_field_embeddings=src_encoder_field_embeddings,
                                    decoder_field_embeddings=trg_decoder_field_embeddings,
                                    generator=trg_generator,
                                    src_word_dict=word_dict, trg_word_dict=word_dict,
                                    src_field_dict=field_dict, trg_field_dict=field_dict,
                                    src_type=src_type, trg_type=trg_type, w_sos_id=w_sos_id[trg_type],
                                    bpemb_en=bpemb_en, encoder=src_enc, decoder=trg_dec, discriminator=discriminator,
                                    denoising=0, device=device,
                                    max_word_shuffle_distance=args.word_shuffle,
                                    word_dropout_prob=args.word_dropout,
                                    word_blanking_prob=args.word_blank)
    trg2trg_translator = Translator("trg2trg",
                                    encoder_word_embeddings=trg_encoder_word_embeddings,
                                    decoder_word_embeddings=trg_decoder_word_embeddings,
                                    encoder_field_embeddings=trg_encoder_field_embeddings,
                                    decoder_field_embeddings=trg_decoder_field_embeddings,
                                    generator=trg_generator,
                                    src_word_dict=word_dict, trg_word_dict=word_dict,
                                    src_field_dict=field_dict, trg_field_dict=field_dict,
                                    src_type=trg_type, trg_type=trg_type, w_sos_id=w_sos_id[trg_type],
                                    bpemb_en=bpemb_en, encoder=trg_enc, decoder=trg_dec, discriminator=discriminator,
                                    denoising=args.denoising_mode, device=device,
                                    max_word_shuffle_distance=args.word_shuffle,
                                    word_dropout_prob=args.word_dropout,
                                    word_blanking_prob=args.word_blank)
    trg2src_translator = Translator("trg2src",
                                    encoder_word_embeddings=trg_encoder_word_embeddings,
                                    decoder_word_embeddings=src_decoder_word_embeddings,
                                    encoder_field_embeddings=trg_encoder_field_embeddings,
                                    decoder_field_embeddings=src_decoder_field_embeddings,
                                    generator=src_generator,
                                    src_word_dict=word_dict, trg_word_dict=word_dict,
                                    src_field_dict=field_dict, trg_field_dict=field_dict,
                                    src_type=trg_type, trg_type=src_type, w_sos_id=w_sos_id[src_type],
                                    bpemb_en=bpemb_en, encoder=trg_enc, decoder=src_dec, discriminator=discriminator,
                                    denoising=0, device=device,
                                    max_word_shuffle_distance=args.word_shuffle,
                                    word_dropout_prob=args.word_dropout,
                                    word_blanking_prob=args.word_blank)

    # Build trainers
    trainers = []
    iters_per_epoch = int(np.ceil(corpus_size / args.batch))
    print("CORPUS_SIZE = %d | BATCH_SIZE = %d | ITERS_PER_EPOCH = %d" % (corpus_size, args.batch, iters_per_epoch))

    if args.corpus_mode == 'mono':
        f_content = open(src_corpus_path + '.content', encoding=args.encoding, errors='surrogateescape')
        f_labels = open(src_corpus_path + '.labels', encoding=args.encoding, errors='surrogateescape')
        src_corpus_path = data.CorpusReader(f_content, f_labels, max_sentence_length=args.max_sentence_length,
                                            cache_size=args.cache)
        f_content = open(trg_corpus_path + '.content', encoding=args.encoding, errors='surrogateescape')
        f_labels = open(trg_corpus_path + '.labels', encoding=args.encoding, errors='surrogateescape')
        trg_corpus_path = data.CorpusReader(f_content, f_labels, max_sentence_length=args.max_sentence_length,
                                            cache_size=args.cache)

        if not args.disable_discriminator:
            disc_trainer = DiscTrainer(device, src_corpus_path, trg_corpus_path, src_enc, trg_enc, src_encoder_word_embeddings,
                                       src_encoder_field_embeddings, word_dict, field_dict, discriminator,
                                       args.learning_rate, batch_size=args.batch)
            trainers.append(disc_trainer)

        src2src_trainer = Trainer(translator=src2src_translator, optimizers=src2src_optimizers, corpus=src_corpus_path,
                                  batch_size=args.batch, iters_per_epoch=iters_per_epoch)
        trainers.append(src2src_trainer)
        if not args.disable_backtranslation:
            trgback2src_trainer = Trainer(translator=trg2src_translator, optimizers=trg2src_optimizers,
                                          corpus=data.BacktranslatorCorpusReader(corpus=src_corpus_path,
                                                                                 translator=src2trg_translator),
                                          batch_size=args.batch, iters_per_epoch=iters_per_epoch)
            trainers.append(trgback2src_trainer)

        trg2trg_trainer = Trainer(translator=trg2trg_translator, optimizers=trg2trg_optimizers, corpus=trg_corpus_path,
                                  batch_size=args.batch, iters_per_epoch=iters_per_epoch)
        trainers.append(trg2trg_trainer)
        if not args.disable_backtranslation:
            srcback2trg_trainer = Trainer(translator=src2trg_translator, optimizers=src2trg_optimizers,
                                          corpus=data.BacktranslatorCorpusReader(corpus=trg_corpus_path,
                                                                                 translator=trg2src_translator),
                                          batch_size=args.batch, iters_per_epoch=iters_per_epoch)
            trainers.append(srcback2trg_trainer)
    elif args.corpus_mode == 'para':
        fsrc_content = open(src_corpus_path + '.content', encoding=args.encoding, errors='surrogateescape')
        fsrc_labels = open(src_corpus_path + '.labels', encoding=args.encoding, errors='surrogateescape')
        ftrg_content = open(trg_corpus_path + '.content', encoding=args.encoding, errors='surrogateescape')
        ftrg_labels = open(trg_corpus_path + '.labels', encoding=args.encoding, errors='surrogateescape')
        corpus = data.CorpusReader(fsrc_content, fsrc_labels, trg_word_file=ftrg_content, trg_field_file=ftrg_labels,
                                   max_sentence_length=args.max_sentence_length,
                                   cache_size=args.cache)
        src2trg_trainer = Trainer(translator=src2trg_translator, optimizers=src2trg_optimizers, corpus=corpus,
                                  batch_size=args.batch, iters_per_epoch=iters_per_epoch)
        trainers.append(src2trg_trainer)

    # Build validators
    if args.src_valid_corpus != '' and args.trg_valid_corpus != '':
        with ExitStack() as stack:
            src_content_vfile = stack.enter_context(open(args.src_valid_corpus + '.content', encoding=args.encoding,
                                                         errors='surrogateescape'))
            src_labels_vfile = stack.enter_context(open(args.src_valid_corpus + '.labels', encoding=args.encoding,
                                                        errors='surrogateescape'))
            trg_content_vfile = stack.enter_context(open(args.trg_valid_corpus + '.content', encoding=args.encoding,
                                                         errors='surrogateescape'))
            trg_labels_vfile = stack.enter_context(open(args.trg_valid_corpus + '.labels', encoding=args.encoding,
                                                        errors='surrogateescape'))

            src_content = src_content_vfile.readlines()
            src_labels = src_labels_vfile.readlines()
            trg_content = trg_content_vfile.readlines()
            trg_labels = trg_labels_vfile.readlines()
            assert len(src_content) == len(trg_content) == len(src_labels) == len(trg_labels), \
                "Validation sizes do not match {} {} {} {}".format(len(src_content), len(trg_content), len(src_labels),
                len(trg_labels))

            src_content = [list(map(int, line.strip().split())) for line in src_content]
            src_labels = [list(map(int, line.strip().split())) for line in src_labels]
            trg_content = [list(map(int, line.strip().split())) for line in trg_content]
            trg_labels = [list(map(int, line.strip().split())) for line in trg_labels]

            cache = []
            for src_sent, src_label, trg_sent, trg_label in zip(src_content, src_labels, trg_content, trg_labels):
                if 0 < len(src_sent) <= args.max_sentence_length and 0 < len(trg_sent) <= args.max_sentence_length:
                    cache.append((src_sent, src_label, trg_sent, trg_label))

            src_content, src_labels, trg_content, trg_labels = zip(*cache)

            src2trg_validator = Validator(src2trg_translator, src_content, trg_content, src_labels, trg_labels)

            if args.corpus_mode == 'mono':
                src2src_validator = Validator(src2src_translator, src_content, src_content, src_labels, src_labels)

                trg2src_validator = Validator(trg2src_translator, trg_content, src_content, trg_labels, src_labels)

                trg2trg_validator = Validator(trg2trg_translator, trg_content, trg_content, trg_labels, trg_labels)

            del src_content
            del src_labels
            del trg_content
            del trg_labels
    else:
        src2src_validator = None
        src2trg_validator = None
        trg2src_validator = None
        trg2trg_validator = None

    # Build loggers
    loggers = []
    semi_loggers = []

    if args.corpus_mode == 'mono':
        if not args.disable_backtranslation:
            loggers.append(Logger('Source to target (backtranslation)', srcback2trg_trainer, src2trg_validator,
                                  None, args.encoding, short_name='src2trg_bt', train_writer=train_writer,
                                  valid_writer=valid_writer))
            loggers.append(Logger('Target to source (backtranslation)', trgback2src_trainer, trg2src_validator,
                                  None, args.encoding, short_name='trg2src_bt', train_writer=train_writer,
                                  valid_writer=valid_writer))

        loggers.append(Logger('Source to source', src2src_trainer, src2src_validator, None, args.encoding,
                              short_name='src2src', train_writer=train_writer, valid_writer=valid_writer))
        loggers.append(Logger('Target to target', trg2trg_trainer, trg2trg_validator, None, args.encoding,
                              short_name='trg2trg', train_writer=train_writer, valid_writer=valid_writer))
    elif args.corpus_mode == 'para':
        loggers.append(Logger('Source to target', src2trg_trainer, src2trg_validator, None, args.encoding,
                              short_name='src2trg_para', train_writer=train_writer, valid_writer=valid_writer))

    # Method to save models
    def save_models(name):
        # torch.save(src2src_translator, '{0}.{1}.src2src.pth'.format(args.save, name))
        # torch.save(trg2trg_translator, '{0}.{1}.trg2trg.pth'.format(args.save, name))
        torch.save(src2trg_translator, '{0}.{1}.src2trg.pth'.format(args.save, name))
        if args.corpus_mode == 'mono':
            torch.save(trg2src_translator, '{0}.{1}.trg2src.pth'.format(args.save, name))

    ref_string_path = args.trg_valid_corpus + '.str.content'

    if not os.path.isfile(ref_string_path):
        print("Creating ref file... [%s]" % (ref_string_path))

        with ExitStack() as stack:

            fref_content = stack.enter_context(
                open(args.trg_valid_corpus + '.content', encoding=args.encoding, errors='surrogateescape'))
            fref_str_content = stack.enter_context(
                open(ref_string_path, mode='w', encoding=args.encoding, errors='surrogateescape'))

            for line in fref_content:
                ref_ids = [int(idstr) for idstr in line.strip().split()]
                ref_str = bpemb_en.decode_ids(ref_ids)
                fref_str_content.write(ref_str + '\n')

        print("Ref file created!")

    # Training
    for curr_iter in range(1, args.iterations + 1):
        print_dbg = (0 != args.dbg_print_interval) and (curr_iter % args.dbg_print_interval == 0)

        for trainer in trainers:
            trainer.step(print_dbg=print_dbg, include_field_loss=not args.disable_field_loss)

        if args.save is not None and args.save_interval > 0 and curr_iter % args.save_interval == 0:
            save_models('it{0}'.format(curr_iter))

        if curr_iter % args.log_interval == 0:
            print()
            print('[{0}] TRAIN-STEP {1} x {2}'.format(args.save, curr_iter, args.batch))
            for logger in loggers:
                logger.log(curr_iter)

        if curr_iter % iters_per_epoch == 0:
            save_models('it{0}'.format(curr_iter))
            print()
            print('[{0}] VALID-STEP {1}'.format(args.save, curr_iter))
            for logger in loggers:
                if logger.validator is not None:
                    logger.validate(curr_iter)

            model = '{0}.{1}.src2trg.pth'.format(args.save, 'it{0}'.format(curr_iter))

            bleu_thread = threading.Thread(target=calc_bleu,
                                           args=(model, args.save, args.src_valid_corpus, args.trg_valid_corpus + '.str.result',
                                                 ref_string_path, bpemb_en, curr_iter, args.bleu_device, valid_writer))
            bleu_thread.start()
            if args.cuda == args.bleu_device or args.bleu_device == 'cpu':
                bleu_thread.join()

    save_models('final')
    train_writer.close()
    valid_writer.close()
Example No. 18
def main():
    parser = argparse.ArgumentParser(description="translate.py")

    parser.add_argument("--eval_splits", type=str, nargs="+", default=["val", ],
                        choices=["val", "test"], help="evaluate on val/test set, yc2 only has val")
    parser.add_argument("--res_dir", required=True, help="path to dir containing model .pt file")
    parser.add_argument("--batch_size", type=int, default=100, help="batch size")

    # beam search configs
    parser.add_argument("--use_beam", action="store_true", help="use beam search, otherwise greedy search")
    parser.add_argument("--beam_size", type=int, default=2, help="beam size")
    parser.add_argument("--n_best", type=int, default=1, help="stop searching when get n_best from beam search")
    parser.add_argument("--min_sen_len", type=int, default=5, help="minimum length of the decoded sentences")
    parser.add_argument("--max_sen_len", type=int, default=30, help="maximum length of the decoded sentences")
    parser.add_argument("--block_ngram_repeat", type=int, default=0, help="block repetition of ngrams during decoding.")
    parser.add_argument("--length_penalty_name", default="none",
                        choices=["none", "wu", "avg"], help="length penalty to use.")
    parser.add_argument("--length_penalty_alpha", type=float, default=0.,
                        help="Google NMT length penalty parameter (higher = longer generation)")
    parser.add_argument("--eval_tool_dir", type=str, default="./densevid_eval")

    parser.add_argument("--no_cuda", action="store_true")
    parser.add_argument("--seed", default=2019, type=int)
    parser.add_argument("--debug", action="store_true")

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # random seed
    random.seed(opt.seed)
    np.random.seed(opt.seed)
    torch.manual_seed(opt.seed)

    checkpoint = torch.load(os.path.join(opt.res_dir, "model.chkpt"))

    # add some of the train configs
    train_opt = checkpoint["opt"]  # EDict(load_json(os.path.join(opt.res_dir, "model.cfg.json")))
    for k in train_opt.__dict__:
        if k not in opt.__dict__:
            setattr(opt, k, getattr(train_opt, k))
    print("train_opt", train_opt)

    decoding_strategy = "beam{}_lp_{}_la_{}".format(
        opt.beam_size, opt.length_penalty_name, opt.length_penalty_alpha) if opt.use_beam else "greedy"
    save_json(vars(opt),
              os.path.join(opt.res_dir, "{}_eval_cfg.json".format(decoding_strategy)),
              save_pretty=True)

    if opt.dset_name == "anet":
        reference_files_map = {
            "val": [os.path.join(opt.data_dir, e) for e in
                    ["anet_entities_val_1_para.json", "anet_entities_val_2_para.json"]],
            "test": [os.path.join(opt.data_dir, e) for e in
                     ["anet_entities_test_1_para.json", "anet_entities_test_2_para.json"]]}
    else:  # yc2
        reference_files_map = {"val": [os.path.join(opt.data_dir, "yc2_val_anet_format_para.json")]}
    for eval_mode in opt.eval_splits:
        print("Start evaluating {}".format(eval_mode))
        # add 10 to max_n_sen so the inference stage uses all the segments
        eval_data_loader = get_data_loader(opt, eval_mode=eval_mode)
        eval_references = reference_files_map[eval_mode]

        # setup model
        translator = Translator(opt, checkpoint)

        pred_file = os.path.join(opt.res_dir, "{}_pred_{}.json".format(decoding_strategy, eval_mode))
        pred_file = os.path.abspath(pred_file)
        if not os.path.exists(pred_file):
            json_res = run_translate(eval_data_loader, translator, opt=opt)
            save_json(json_res, pred_file, save_pretty=True)
        else:
            print("Using existing prediction file at {}".format(pred_file))

        # COCO language evaluation
        lang_file = pred_file.replace(".json", "_lang.json")
        eval_command = ["python", "para-evaluate.py", "-s", pred_file, "-o", lang_file,
                        "-v", "-r"] + eval_references
        subprocess.call(eval_command, cwd=opt.eval_tool_dir)

        # basic stats
        stat_filepath = pred_file.replace(".json", "_stat.json")
        eval_stat_cmd = ["python", "get_caption_stat.py", "-s", pred_file, "-r", eval_references[0],
                         "-o", stat_filepath, "-v"]
        subprocess.call(eval_stat_cmd, cwd=opt.eval_tool_dir)

        # repetition evaluation
        rep_filepath = pred_file.replace(".json", "_rep.json")
        eval_rep_cmd = ["python", "evaluateRepetition.py", "-s", pred_file,
                        "-r", eval_references[0], "-o", rep_filepath]
        subprocess.call(eval_rep_cmd, cwd=opt.eval_tool_dir)

        metric_filepaths = [lang_file, stat_filepath, rep_filepath]
        all_metrics = merge_dicts([load_json(e) for e in metric_filepaths])
        all_metrics_filepath = pred_file.replace(".json", "_all_metrics.json")
        save_json(all_metrics, all_metrics_filepath, save_pretty=True)

        print("pred_file {} lang_file {}".format(pred_file, lang_file))
        print("[Info] Finished {}.".format(eval_mode))
Ejemplo n.º 19
0
from fastapi import FastAPI, HTTPException
from src.models import Answer
from src.qa_model import QAModel
from src.translator import Translator


app = FastAPI()

biobert_path = 'BioBertFolder/biobert_v1.0_pubmed_pmc/'
bert_fnn_weights = 'assets/models/bertffn_crossentropy/bertffn'
embedding_file = 'assets/Float16EmbeddingsExpanded5-27-19.pkl'
qa_model = QAModel(biobert_path, bert_fnn_weights, embedding_file)

translator = Translator(creds_path='gct_creds.json')


@app.get('/api/v1/ask', response_model=Answer)
async def ask(question: str, lang: str):
    if lang == 'uk':
        # translate the Ukrainian question to English before querying the QA model
        question = translator.translate(question, target="en")
        # keep only the top-ranked answer
        orig_result = qa_model.predict(question)[0]
        trans_result = translator.translate(orig_result)
        return {"original_answer": orig_result, "translated_answer": trans_result}
    elif lang == "en":
        result = qa_model.predict(question)[0]
        return {"original_answer": result, "translated_answer": result}
    else:
        raise HTTPException(400, "Only uk(Ukrainian) and en(English) languages are supported!")
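A client-side usage sketch for the endpoint above, assuming the app is served locally on port 8000 and that the requests package is available in the client environment.

import requests  # assumed to be installed on the client side

resp = requests.get(
    "http://localhost:8000/api/v1/ask",
    params={"question": "What are the symptoms of diabetes?", "lang": "en"},
)
resp.raise_for_status()
answer = resp.json()
print(answer["original_answer"])
print(answer["translated_answer"])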
Ejemplo n.º 20
0
def eval_language_metrics(checkpoint,
                          eval_data_loader,
                          opt,
                          model=None,
                          eval_mode="val"):
    """eval_mode can only be set to `val` here, as setting to `test` is cheating
    0, run inference
    1, Get METEOR, BLEU1-4, CIDEr scores
    2, Get vocab size, sentence length
    """
    translator = Translator(opt, checkpoint, model=model)
    json_res = run_translate(eval_data_loader, translator, opt=opt)
    res_filepath = os.path.abspath(
        opt.save_model + "_tmp_greedy_pred_{}.json".format(eval_mode))
    save_json(json_res, res_filepath, save_pretty=True)

    if opt.dset_name == "anet":
        reference_files_map = {
            "val": [
                os.path.join(opt.data_dir, e) for e in [
                    "anet_entities_val_1_para.json",
                    "anet_entities_val_2_para.json"
                ]
            ],
            "test": [
                os.path.join(opt.data_dir, e) for e in [
                    "anet_entities_test_1_para.json",
                    "anet_entities_test_2_para.json"
                ]
            ]
        }
    else:  # yc2
        reference_files_map = {
            "val":
            [os.path.join(opt.data_dir, "yc2_val_anet_format_para.json")]
        }

    # COCO language evaluation
    eval_references = reference_files_map[eval_mode]
    lang_filepath = res_filepath.replace(".json", "_lang.json")
    eval_cmd = [
        "python", "para-evaluate.py", "-s", res_filepath, "-o", lang_filepath,
        "-v", "-r"
    ] + eval_references
    subprocess.call(eval_cmd, cwd=opt.eval_tool_dir)

    # basic stats
    stat_filepath = res_filepath.replace(".json", "_stat.json")
    eval_stat_cmd = [
        "python", "get_caption_stat.py", "-s", res_filepath, "-r",
        eval_references[0], "-o", stat_filepath, "-v"
    ]
    subprocess.call(eval_stat_cmd, cwd=opt.eval_tool_dir)

    # repetition evaluation
    rep_filepath = res_filepath.replace(".json", "_rep.json")
    eval_rep_cmd = [
        "python", "evaluateRepetition.py", "-s", res_filepath, "-r",
        eval_references[0], "-o", rep_filepath
    ]
    subprocess.call(eval_rep_cmd, cwd=opt.eval_tool_dir)

    # save results
    logger.info("Finished eval {}.".format(eval_mode))
    metric_filepaths = [lang_filepath, stat_filepath, rep_filepath]
    all_metrics = merge_dicts([load_json(e) for e in metric_filepaths])

    all_metrics_filepath = res_filepath.replace(".json", "_all_metrics.json")
    save_json(all_metrics, all_metrics_filepath, save_pretty=True)
    return all_metrics, [res_filepath, all_metrics_filepath]
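A minimal calling sketch for eval_language_metrics; the checkpoint path and the get_data_loader helper are assumptions based on the evaluation script shown earlier.

# Hypothetical driver; the exact opt fields depend on the actual training run.
checkpoint = torch.load(opt.save_model + ".chkpt")          # assumed checkpoint location
val_loader = get_data_loader(opt, eval_mode="val")          # assumed loader helper
all_metrics, (pred_path, metrics_path) = eval_language_metrics(
    checkpoint, val_loader, opt, eval_mode="val")
print("metrics written to", metrics_path)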
Ejemplo n.º 21
0
 def get_json_from_translator(self, xml, prolog=None):
     document_tree = DocumentTree(xml, prolog)
     translator = Translator(document_tree, loads('{}'))
     json = translator.get_json()
     return json
Ejemplo n.º 22
0
from src.lexer import Lexer
from src.parser import Parser
import sys
import json

from src.translator import Translator

if len(sys.argv) == 3:
    # read the document to translate and the JSON config, closing each file promptly
    with open(sys.argv[1], 'r') as source_file:
        content = source_file.read()

    with open(sys.argv[2], 'r') as config_file:
        config = json.loads(config_file.read())

    lexer = Lexer(content)
    parser = Parser(lexer)
    document_tree = parser.get_document_tree()

    translator = Translator(document_tree, config)
    json_output = translator.get_json()

    with open("output.json", "w") as output_file:
        output_file.write(json_output)

else:
    print("Usage: python main.py <file_to_translate> <config_file>")