def train_model(self):
        """Train the encoder-decoder translation model.

        Builds source/target vocabularies, constructs the model, then runs
        ``self.epoch`` training epochs over sorted parallel minibatches,
        saving a checkpoint after every epoch.
        """
        trace('making vocaburaries ...')
        src_vocab = Vocabulary.new(gens.word_list(self.source), self.vocab)
        trg_vocab = Vocabulary.new(gens.word_list(self.target), self.vocab)

        trace('making model ...')
        model = self.new(src_vocab, trg_vocab, self.embed, self.hidden, self.parameter_dict)

        # Sample index displayed by print_out on the first batch of each epoch.
        # NOTE(review): randint(0, self.minibatch) is inclusive of minibatch —
        # other variants in this file use ``self.minibatch - 1``; confirm
        # print_out tolerates an index equal to the batch size.
        random_number = random.randint(0, self.minibatch)
        for i_epoch in range(self.epoch):
            trace('epoch %d/%d: ' % (i_epoch + 1, self.epoch))
            trained = 0
            gen1 = gens.word_list(self.source)
            gen2 = gens.word_list(self.target)
            gen3 = gens.batch(gens.sorted_parallel(gen1, gen2, 100 * self.minibatch), self.minibatch)
            model.init_optimizer()

            for src_batch, trg_batch in gen3:
                src_batch = fill_batch(src_batch)
                trg_batch = fill_batch(trg_batch)
                K = len(src_batch)
                hyp_batch = model.train(src_batch, trg_batch)

                # Only report on the first batch of each epoch.
                if trained == 0:
                    self.print_out(random_number, i_epoch, trained, src_batch, trg_batch, hyp_batch)

                trained += K

            trace('saving model ...')
            # BUG FIX: the original used ``self.epoch + 1`` (the total epoch
            # count), producing the same filename every epoch and overwriting
            # earlier checkpoints; use the current epoch index instead.
            model.save("ChainerMachineTranslation" + '.%03d' % (i_epoch + 1))

        trace('finished.')
Exemple #2
0
def train(args):
  """Train the shift-reduce parser on a treebank.

  Reads trees from ``args.source``, builds word/phrase/semiterminal
  vocabularies, converts the data to ids, and trains for ``args.epoch``
  epochs with AdaGrad + gradient clipping, saving a checkpoint per epoch.
  """
  trace('loading corpus ...')
  with open(args.source) as fp:
    trees = [make_tree(l) for l in fp]

  trace('extracting leaf nodes ...')
  word_lists = [extract_words(t) for t in trees]

  trace('extracting gold operations ...')
  op_lists = [make_operations(t) for t in trees]

  trace('making vocabulary ...')
  word_vocab = Vocabulary.new(word_lists, args.vocab)
  phrase_set = set()
  semi_set = set()
  for tree in trees:
    phrase_set |= set(extract_phrase_labels(tree))
    semi_set |= set(extract_semi_labels(tree))
  phrase_vocab = Vocabulary.new([list(phrase_set)], len(phrase_set), add_special_tokens=False)
  semi_vocab = Vocabulary.new([list(semi_set)], len(semi_set), add_special_tokens=False)

  trace('converting data ...')
  word_lists = [convert_word_list(x, word_vocab) for x in word_lists]
  op_lists = [convert_op_list(x, phrase_vocab, semi_vocab) for x in op_lists]

  trace('start training ...')
  parser = Parser(
      args.vocab, args.embed, args.queue, args.stack,
      len(phrase_set), len(semi_set),
  )
  if USE_GPU:
    parser.to_gpu()
  opt = optimizers.AdaGrad(lr = 0.005)
  opt.setup(parser)
  opt.add_hook(optimizer.GradientClipping(5))

  for epoch in range(args.epoch):
    n = 0

    for samples in batch(zip(word_lists, op_lists), args.minibatch):
      parser.zerograds()
      loss = my_zeros((), np.float32)

      # Accumulate the loss over the whole minibatch before one update.
      for word_list, op_list in zip(*samples):
        trace('epoch %3d, sample %6d:' % (epoch + 1, n + 1))
        loss += parser.forward(word_list, op_list, 0)
        n += 1

      loss.backward()
      opt.update()

    trace('saving model ...')
    # BUG FIX: '%03.d' specifies precision 0, which disables zero-padding
    # and pads with spaces (e.g. 'model.  1'); '%03d' yields the intended
    # zero-padded suffix (e.g. 'model.001').
    prefix = args.model + '.%03d' % (epoch + 1)
    word_vocab.save(prefix + '.words')
    phrase_vocab.save(prefix + '.phrases')
    semi_vocab.save(prefix + '.semiterminals')
    parser.save_spec(prefix + '.spec')
    serializers.save_hdf5(prefix + '.weights', parser)

  trace('finished.')
 def load(filename):
     # Factory that rebuilds an AttentionalTranslationModel from ``filename``.
     # NOTE(review): no ``self``/``cls`` parameter — presumably intended as a
     # @staticmethod on the class; confirm how callers invoke it.
     self = AttentionalTranslationModel()
     with ModelFile(filename) as fp:
         # Read order: vocabularies, scalar hyperparameters, then weights.
         self.__src_vocab = Vocabulary.load(fp.get_file_pointer())
         self.__trg_vocab = Vocabulary.load(fp.get_file_pointer())
         self.__n_embed = int(fp.read())
         self.__n_hidden = int(fp.read())
         self.__make_model()
         # Weight reads must mirror the write order of the matching save
         # routine (not visible in this file) — do not reorder.
         wrapper.begin_model_access(self.__model)
         fp.read_embed(self.__model.w_xi)
         fp.read_linear(self.__model.w_ia)
         fp.read_linear(self.__model.w_aa)
         fp.read_linear(self.__model.w_ib)
         fp.read_linear(self.__model.w_bb)
         fp.read_linear(self.__model.w_aw)
         fp.read_linear(self.__model.w_bw)
         fp.read_linear(self.__model.w_pw)
         fp.read_linear(self.__model.w_we)
         fp.read_linear(self.__model.w_ap)
         fp.read_linear(self.__model.w_bp)
         fp.read_embed(self.__model.w_yp)
         fp.read_linear(self.__model.w_pp)
         fp.read_linear(self.__model.w_cp)
         fp.read_linear(self.__model.w_dp)
         fp.read_linear(self.__model.w_py)
         wrapper.end_model_access(self.__model)
     return self
Exemple #4
0
def train(args):
  """Train the shift-reduce parser on a treebank.

  Reads trees from ``args.source``, builds word/phrase/semiterminal
  vocabularies, converts the data to ids, and trains for ``args.epoch``
  epochs with AdaGrad + gradient clipping, saving a checkpoint per epoch.
  """
  trace('loading corpus ...')
  with open(args.source) as fp:
    trees = [make_tree(l) for l in fp]

  trace('extracting leaf nodes ...')
  word_lists = [extract_words(t) for t in trees]

  trace('extracting gold operations ...')
  op_lists = [make_operations(t) for t in trees]

  trace('making vocabulary ...')
  word_vocab = Vocabulary.new(word_lists, args.vocab)
  phrase_set = set()
  semi_set = set()
  for tree in trees:
    phrase_set |= set(extract_phrase_labels(tree))
    semi_set |= set(extract_semi_labels(tree))
  phrase_vocab = Vocabulary.new([list(phrase_set)], len(phrase_set), add_special_tokens=False)
  semi_vocab = Vocabulary.new([list(semi_set)], len(semi_set), add_special_tokens=False)

  trace('converting data ...')
  word_lists = [convert_word_list(x, word_vocab) for x in word_lists]
  op_lists = [convert_op_list(x, phrase_vocab, semi_vocab) for x in op_lists]

  trace('start training ...')
  parser = Parser(
      args.vocab, args.embed, args.queue, args.stack,
      len(phrase_set), len(semi_set),
  )
  if USE_GPU:
    parser.to_gpu()
  opt = optimizers.AdaGrad(lr = 0.005)
  opt.setup(parser)
  opt.add_hook(optimizer.GradientClipping(5))

  for epoch in range(args.epoch):
    n = 0

    for samples in batch(zip(word_lists, op_lists), args.minibatch):
      parser.zerograds()
      loss = my_zeros((), np.float32)

      # Accumulate the loss over the whole minibatch before one update.
      for word_list, op_list in zip(*samples):
        trace('epoch %3d, sample %6d:' % (epoch + 1, n + 1))
        loss += parser.forward(word_list, op_list, 0)
        n += 1

      loss.backward()
      opt.update()

    trace('saving model ...')
    # BUG FIX: '%03.d' specifies precision 0, which disables zero-padding
    # and pads with spaces (e.g. 'model.  1'); '%03d' yields the intended
    # zero-padded suffix (e.g. 'model.001').
    prefix = args.model + '.%03d' % (epoch + 1)
    word_vocab.save(prefix + '.words')
    phrase_vocab.save(prefix + '.phrases')
    semi_vocab.save(prefix + '.semiterminals')
    parser.save_spec(prefix + '.spec')
    serializers.save_hdf5(prefix + '.weights', parser)

  trace('finished.')
Exemple #5
0
def test(args):
    """Load a trained parser and print one parse tree per input line."""
    trace('loading model ...')
    prefix = args.model
    word_vocab = Vocabulary.load(prefix + '.words')
    phrase_vocab = Vocabulary.load(prefix + '.phrases')
    semiterminal_vocab = Vocabulary.load(prefix + '.semiterminals')
    parser = Parser.load_spec(prefix + '.spec')
    if args.use_gpu:
        parser.to_gpu()
    serializers.load_hdf5(prefix + '.weights', parser)

    embed_cache = {}
    parser.reset()

    trace('generating parse trees ...')
    with open(args.source) as fp:
        for line in fp:
            word_list = to_vram_words(
                convert_word_list(line.split(), word_vocab))
            # Run the parser, map label ids back to strings, then rebuild
            # the final tree shape.
            ops = parser.forward(word_list, None, args.unary_limit,
                                 embed_cache)
            labeled = restore_labels(ops, phrase_vocab, semiterminal_vocab)
            tree = combine_xbar(labeled)
            print('( ' + tree_to_string(tree) + ' )')

    trace('finished.')
Exemple #6
0
def train_model(args):
    """Train an encoder-decoder MT model, saving a checkpoint per epoch."""

    def show(words):
        # Render a token list for tracing, masking '</s>' as '*'.
        return ' '.join('*' if w == '</s>' else w for w in words)

    trace('making vocabularies ...')
    src_vocab = Vocabulary.new(gens.word_list(args.source), args.vocab)
    trg_vocab = Vocabulary.new(gens.word_list(args.target), args.vocab)

    trace('making model ...')
    model = EncoderDecoderModel.new(src_vocab, trg_vocab, args.embed, args.hidden)

    for epoch in range(args.epoch):
        trace('epoch %d/%d: ' % (epoch + 1, args.epoch))
        trained = 0
        src_gen = gens.word_list(args.source)
        trg_gen = gens.word_list(args.target)
        batches = gens.batch(
            gens.sorted_parallel(src_gen, trg_gen, 100 * args.minibatch),
            args.minibatch)
        model.init_optimizer()

        for src_batch, trg_batch in batches:
            src_batch = fill_batch(src_batch)
            trg_batch = fill_batch(trg_batch)
            batch_size = len(src_batch)
            hyp_batch = model.train(src_batch, trg_batch)

            for k in range(batch_size):
                trace('epoch %3d/%3d, sample %8d' % (epoch + 1, args.epoch, trained + k + 1))
                trace('  src = ' + show(src_batch[k]))
                trace('  trg = ' + show(trg_batch[k]))
                trace('  hyp = ' + show(hyp_batch[k]))

            trained += batch_size

        trace('saving model ...')
        model.save(args.model + '.%03d' % (epoch + 1))

    trace('finished.')
    def test(self):
        """Translate ``self.source`` with a saved model, echoing each pair
        to stdout and writing hypotheses to ``self.target``."""
        trace('loading model ...')
        src_vocab = Vocabulary.load(self.model + '.srcvocab')
        trg_vocab = Vocabulary.load(self.model + '.trgvocab')
        encdec = EncoderDecoder.load_spec(self.model + '.spec')
        serializers.load_hdf5(self.model + '.weights', encdec)

        trace('generating translation ...')
        generated = 0

        with open(self.target, 'w') as fp:
            batches = gens.batch(gens.word_list(self.source), self.minibatch)
            for src_batch in batches:
                src_batch = fill_batch(src_batch)
                K = len(src_batch)

                trace('sample %8d - %8d ...' % (generated + 1, generated + K))
                hyp_batch = self.forward(src_batch, None, src_vocab,
                                         trg_vocab, encdec, False,
                                         self.generation_limit)

                for idx, hyp in enumerate(hyp_batch):
                    # Append a sentinel so index() below always succeeds,
                    # then truncate at the first end-of-sentence marker.
                    hyp.append('</s>')
                    hyp = hyp[:hyp.index('</s>')]
                    print("src : " + "".join(src_batch[idx]).replace("</s>", ""))
                    print('hyp : ' + ''.join(hyp))
                    print(' '.join(hyp), file=fp)

                generated += K

        trace('finished.')
Exemple #8
0
def test(args):
  # Parse every line of args.source with a trained parser and print trees.
  trace('loading model ...')
  word_vocab = Vocabulary.load(args.model + '.words')
  phrase_vocab = Vocabulary.load(args.model + '.phrases')
  semiterminal_vocab = Vocabulary.load(args.model + '.semiterminals')
  parser = Parser.load_spec(args.model + '.spec')
  if args.use_gpu:
    parser.to_gpu()
  serializers.load_hdf5(args.model + '.weights', parser)

  # Shared dict passed to parser.forward — presumably caches embeddings
  # across sentences; confirm against Parser.forward.
  embed_cache = {}
  parser.reset()

  trace('generating parse trees ...')
  with open(args.source) as fp:
    for l in fp:
      word_list = to_vram_words(convert_word_list(l.split(), word_vocab))
      # forward() emits parser operations; restore_labels maps label ids
      # back to strings; combine_xbar rebuilds the final tree shape.
      tree = combine_xbar(
          restore_labels(
              parser.forward(word_list, None, args.unary_limit, embed_cache),
              phrase_vocab,
              semiterminal_vocab))
      print('( ' + tree_to_string(tree) + ' )')

  trace('finished.')
    def test(self):
        """Translate ``self.source`` with a saved encoder-decoder model.

        Echoes each source/hypothesis pair to stdout and writes the
        space-joined hypotheses to ``self.target``.
        """
        trace('loading model ...')
        src_vocab = Vocabulary.load(self.model + '.srcvocab')
        trg_vocab = Vocabulary.load(self.model + '.trgvocab')
        encdec = EncoderDecoder.load_spec(self.model + '.spec')
        serializers.load_hdf5(self.model + '.weights', encdec)

        trace('generating translation ...')
        generated = 0

        with open(self.target, 'w') as fp:
            for src_batch in gens.batch(gens.word_list(self.source),
                                        self.minibatch):
                src_batch = fill_batch(src_batch)
                K = len(src_batch)

                trace('sample %8d - %8d ...' % (generated + 1, generated + K))
                hyp_batch = self.forward(src_batch, None, src_vocab, trg_vocab,
                                         encdec, False, self.generation_limit)

                # ``source_cuont`` is a (misspelled) running index into
                # src_batch, kept in lockstep with hyp_batch.
                source_cuont = 0
                for hyp in hyp_batch:
                    # Ensure a sentinel exists, then cut at the first '</s>'.
                    hyp.append('</s>')
                    hyp = hyp[:hyp.index('</s>')]
                    print("src : " +
                          "".join(src_batch[source_cuont]).replace("</s>", ""))
                    print('hyp : ' + ''.join(hyp))
                    print(' '.join(hyp), file=fp)
                    source_cuont = source_cuont + 1

                generated += K

        trace('finished.')
def train_model(args):
    """Train an encoder-decoder MT model, saving a checkpoint per epoch."""
    trace('making vocaburaries ...')
    src_vocab = Vocabulary.new(gens.word_list(args.source), args.vocab)
    trg_vocab = Vocabulary.new(gens.word_list(args.target), args.vocab)

    trace('making model ...')
    model = EncoderDecoderModel.new(src_vocab, trg_vocab, args.embed, args.hidden)

    for epoch in range(args.epoch):
        trace('epoch %d/%d: ' % (epoch + 1, args.epoch))
        trained = 0
        source_words = gens.word_list(args.source)
        target_words = gens.word_list(args.target)
        paired = gens.sorted_parallel(source_words, target_words,
                                      100 * args.minibatch)
        model.init_optimizer()

        for src_batch, trg_batch in gens.batch(paired, args.minibatch):
            src_batch = fill_batch(src_batch)
            trg_batch = fill_batch(trg_batch)
            hyp_batch = model.train(src_batch, trg_batch)

            for k in range(len(src_batch)):
                trace('epoch %3d/%3d, sample %8d' % (epoch + 1, args.epoch, trained + k + 1))
                # Trace source, reference and hypothesis, masking '</s>'.
                for tag, sent in (('src', src_batch[k]),
                                  ('trg', trg_batch[k]),
                                  ('hyp', hyp_batch[k])):
                    trace('  ' + tag + ' = ' +
                          ' '.join(x if x != '</s>' else '*' for x in sent))

            trained += len(src_batch)

        trace('saving model ...')
        model.save(args.model + '.%03d' % (epoch + 1))

    trace('finished.')
Exemple #11
0
 def load(filename):
     # Rebuilds an AttentionalTranslationModel from a serialized model file.
     # NOTE(review): declared without ``self``/``cls`` — presumably meant to
     # carry a @staticmethod decorator on the enclosing class; verify.
     self = AttentionalTranslationModel()
     with ModelFile(filename) as fp:
         # Vocabularies first, then scalar hyperparameters.
         self.__src_vocab = Vocabulary.load(fp.get_file_pointer())
         self.__trg_vocab = Vocabulary.load(fp.get_file_pointer())
         self.__n_embed = int(fp.read())
         self.__n_hidden = int(fp.read())
         self.__make_model()
         # The read sequence below must match the corresponding save
         # routine's write order (not shown in this file) — do not reorder.
         wrapper.begin_model_access(self.__model)
         fp.read_embed(self.__model.w_xi)
         fp.read_linear(self.__model.w_ia)
         fp.read_linear(self.__model.w_aa)
         fp.read_linear(self.__model.w_ib)
         fp.read_linear(self.__model.w_bb)
         fp.read_linear(self.__model.w_aw)
         fp.read_linear(self.__model.w_bw)
         fp.read_linear(self.__model.w_pw)
         fp.read_linear(self.__model.w_we)
         fp.read_linear(self.__model.w_ap)
         fp.read_linear(self.__model.w_bp)
         fp.read_embed(self.__model.w_yp)
         fp.read_linear(self.__model.w_pp)
         fp.read_linear(self.__model.w_cp)
         fp.read_linear(self.__model.w_dp)
         fp.read_linear(self.__model.w_py)
         wrapper.end_model_access(self.__model)
     return self
Exemple #12
0
def test(args):
  """Generate translations of ``args.source`` into ``args.target``."""
  trace('loading model ...')
  src_vocab = Vocabulary.load(args.model + '.srcvocab')
  trg_vocab = Vocabulary.load(args.model + '.trgvocab')
  attmt = AttentionMT.load_spec(args.model + '.spec')
  if args.use_gpu:
    attmt.to_gpu()
  serializers.load_hdf5(args.model + '.weights', attmt)

  trace('generating translation ...')
  generated = 0

  with open(args.target, 'w') as fp:
    for src_batch in gens.batch(gens.word_list(args.source), args.minibatch):
      padded = fill_batch(src_batch)
      count = len(padded)

      trace('sample %8d - %8d ...' % (generated + 1, generated + count))
      hyp_batch = forward(padded, None, src_vocab, trg_vocab, attmt, False,
                          args.generation_limit)

      for hyp in hyp_batch:
        # Append a sentinel so index() always finds one, then cut there.
        hyp.append('</s>')
        print(' '.join(hyp[:hyp.index('</s>')]), file=fp)

      generated += count

  trace('finished.')
 def __predict_sentence(self, src_batch):
     # Load a fresh model + vocabularies and decode one batch; returns the
     # hypothesis batch produced by dialogue.forward.
     dialogue = EncoderDecoderModelForwardSlack(self.parameter)
     src_vocab = Vocabulary.load(self.model_name + '.srcvocab')
     trg_vocab = Vocabulary.load(self.model_name + '.trgvocab')
     model = EncoderDecoder.load_spec(self.model_name + '.spec')
     # NOTE(review): weights come from ``dialogue.model`` while vocab/spec
     # use ``self.model_name`` — confirm both prefixes point at the same
     # trained model.
     serializers.load_hdf5(dialogue.model + '.weights', model)
     hyp_batch = dialogue.forward(src_batch, None, src_vocab, trg_vocab, model, False, self.generation_limit)
     return hyp_batch
def train(args):
    """Train the AttentionMT model, saving a checkpoint per epoch.

    Builds source/target vocabularies, then runs ``args.epoch`` epochs of
    AdaGrad (with gradient clipping) over sorted parallel minibatches.
    """
    trace('making vocabularies ...')
    src_vocab = Vocabulary.new(gens.word_list(args.source), args.vocab)
    trg_vocab = Vocabulary.new(gens.word_list(args.target), args.vocab)

    trace('making model ...')
    attmt = AttentionMT(args.vocab, args.embed, args.hidden)
    if args.use_gpu:
        attmt.to_gpu()

    for epoch in range(args.epoch):
        trace('epoch %d/%d: ' % (epoch + 1, args.epoch))
        trained = 0
        gen1 = gens.word_list(args.source)
        gen2 = gens.word_list(args.target)
        gen3 = gens.batch(
            gens.sorted_parallel(gen1, gen2, 100 * args.minibatch),
            args.minibatch)
        # A fresh optimizer is created each epoch, as in the original code.
        opt = optimizers.AdaGrad(lr=0.01)
        opt.setup(attmt)
        opt.add_hook(optimizer.GradientClipping(5))

        for src_batch, trg_batch in gen3:
            src_batch = fill_batch(src_batch)
            trg_batch = fill_batch(trg_batch)
            K = len(src_batch)
            hyp_batch, loss = forward(src_batch, trg_batch, src_vocab,
                                      trg_vocab, attmt, True, 0)
            loss.backward()
            opt.update()

            for k in range(K):
                trace('epoch %3d/%3d, sample %8d' %
                      (epoch + 1, args.epoch, trained + k + 1))
                trace(
                    '  src = ' +
                    ' '.join([x if x != '</s>' else '*'
                              for x in src_batch[k]]))
                trace(
                    '  trg = ' +
                    ' '.join([x if x != '</s>' else '*'
                              for x in trg_batch[k]]))
                trace(
                    '  hyp = ' +
                    ' '.join([x if x != '</s>' else '*'
                              for x in hyp_batch[k]]))

            trained += K

        trace('saving model ...')
        # BUG FIX: '%03.d' (precision 0) suppresses zero-padding and pads
        # with spaces; '%03d' yields the intended '001'-style suffix.
        prefix = args.model + '.%03d' % (epoch + 1)
        src_vocab.save(prefix + '.srcvocab')
        trg_vocab.save(prefix + '.trgvocab')
        attmt.save_spec(prefix + '.spec')
        serializers.save_hdf5(prefix + '.weights', attmt)

    trace('finished.')
Exemple #15
0
 def __predict_sentence(self, src_batch):
     # Load a fresh model + vocabularies and decode one batch; returns the
     # hypothesis batch produced by dialogue.forward.
     dialogue = EncoderDecoderModelForwardSlack(self.parameter)
     src_vocab = Vocabulary.load(self.model_name + '.srcvocab')
     trg_vocab = Vocabulary.load(self.model_name + '.trgvocab')
     model = EncoderDecoder.load_spec(self.model_name + '.spec')
     # NOTE(review): weights come from ``dialogue.model`` while vocab/spec
     # use ``self.model_name`` — confirm both prefixes match.
     serializers.load_hdf5(dialogue.model + '.weights', model)
     hyp_batch = dialogue.forward(src_batch, None, src_vocab, trg_vocab,
                                  model, False, self.generation_limit)
     return hyp_batch
    def train(self):
        """
        Train the attention dialogue model.

        If ``self.word2vecFlag`` is set, word2vec weights are copied into
        the embedding and decoder layers before training starts.
        Optimization uses AdaGrad with gradient clipping.
        """
        trace('making vocabularies ...')
        src_vocab = Vocabulary.new(gens.word_list(self.source), self.vocab)
        trg_vocab = Vocabulary.new(gens.word_list(self.target), self.vocab)

        trace('making model ...')
        self.attention_dialogue = AttentionDialogue(self.vocab, self.embed,
                                                    self.hidden, self.XP)
        if self.word2vecFlag:
            self.copy_model(self.word2vec, self.attention_dialogue.emb)
            self.copy_model(self.word2vec,
                            self.attention_dialogue.dec,
                            dec_flag=True)

        for epoch in range(self.epoch):
            trace('epoch %d/%d: ' % (epoch + 1, self.epoch))
            trained = 0
            gen1 = gens.word_list(self.source)
            gen2 = gens.word_list(self.target)
            gen3 = gens.batch(
                gens.sorted_parallel(gen1, gen2, 100 * self.minibatch),
                self.minibatch)
            # Fresh optimizer per epoch.
            opt = optimizers.AdaGrad(lr=0.01)
            opt.setup(self.attention_dialogue)
            opt.add_hook(optimizer.GradientClipping(5))

            # Index of the sample print_out displays for this epoch.
            random_number = random.randint(0, self.minibatch - 1)
            for src_batch, trg_batch in gen3:
                src_batch = fill_batch(src_batch)
                trg_batch = fill_batch(trg_batch)
                K = len(src_batch)
                hyp_batch, loss = self.forward_implement(
                    src_batch, trg_batch, src_vocab, trg_vocab,
                    self.attention_dialogue, True, 0)
                loss.backward()
                opt.update()

                self.print_out(random_number, epoch, trained, src_batch,
                               trg_batch, hyp_batch)

                trained += K

        # NOTE(review): saving happens once, after ALL epochs (this sits
        # outside the epoch loop) — other variants in this file save per
        # epoch; confirm this is intended.
        trace('saving model ...')
        prefix = self.model
        model_path = APP_ROOT + "/model/" + prefix
        src_vocab.save(model_path + '.srcvocab')
        trg_vocab.save(model_path + '.trgvocab')
        self.attention_dialogue.save_spec(model_path + '.spec')
        serializers.save_hdf5(model_path + '.weights', self.attention_dialogue)

        trace('finished.')
Exemple #17
0
def train(args):
    """Train the BiEncDecLSTM tagger, saving a checkpoint per epoch.

    Vocabulary and tag dictionaries are cached under ./model/ and reused
    across runs when present.
    """
    if os.path.exists("./model/vocab.bin"):
        src_vocab = Vocabulary.load("./model/vocab.bin")
    else:
        src_vocab = Vocabulary.new(gens.word_list(args.source), args.n_vocab)
        src_vocab.save('./model/vocab.bin')
    if os.path.exists("./model/tag.bin"):
        trg_tag = Vocabulary.load("./model/tag.bin")
    else:
        trg_tag = Vocabulary.new(gens.word_list(args.target), args.n_tag)
        trg_tag.save('./model/tag.bin')
    # BUG FIX: the original printed the bound method object
    # (``src_vocab.__len__``) instead of its value; call len() instead.
    print("vocab_len:{}".format(len(src_vocab)))
    print("tag_len:{}".format(len(trg_tag)))
    encdec = BiEncDecLSTM(args.n_vocab, args.layer, args.embed, args.hidden,
                          args.n_tag)
    # Renamed from ``optimizer`` to avoid shadowing the chainer ``optimizer``
    # module used elsewhere in this file (and for consistency with the other
    # training functions here).
    opt = optimizers.Adam()
    opt.setup(encdec)

    for e_i in range(args.epoch):
        tt_list = [[src_vocab.stoi(char) for char in char_arr]
                   for char_arr in gens.word_list(args.source_tr)]
        tag_list = [
            trg_tag.stoi(tag[0]) for tag in gens.word_list(args.target_tr)
        ]
        print("{}:{}".format(len(tt_list), len(tag_list)))
        assert len(tt_list) == len(tag_list)
        # Shuffle source sequences and their tags in unison for this epoch.
        ind_arr = list(range(len(tt_list)))
        random.shuffle(ind_arr)
        tt_now = (tt_list[ri] for ri in ind_arr)
        tag_now = (tag_list[ri] for ri in ind_arr)
        tt_gen = gens.batch(tt_now, args.batchsize)
        tag_gen = gens.batch(tag_now, args.batchsize)

        for tt, tag in zip(tt_gen, tag_gen):
            y_ws = encdec(tt)

            # Trace one example: teacher tokens, gold tag, and prediction.
            teac_arr = [src_vocab.itos(t) for t in tt[0]]
            pred_arr = [trg_tag.itos(y_each.data.argmax(0)) for y_each in y_ws]
            print("teach:{}:{}:{}".format(teac_arr, trg_tag.itos(tag[0]),
                                          pred_arr[0]))
            tag = xp.array(tag, dtype=xp.int32)
            loss = F.softmax_cross_entropy(y_ws, tag)

            encdec.cleargrads()
            loss.backward()
            opt.update()

        serializers.save_npz('./model/attn_tag_model_{}.npz'.format(e_i),
                             encdec)
Exemple #18
0
  def __init__(self, args):
    # Load a trained encoder-decoder: vocabularies, spec, then weights.
    trace('loading model ...')
    self.args = args
    self.src_vocab = Vocabulary.load(args.model + '.srcvocab')
    self.trg_vocab = Vocabulary.load(args.model + '.trgvocab')
    self.encdec = EncoderDecoder.load_spec(args.model + '.spec')
    if args.use_gpu:
      self.encdec.to_gpu()
    serializers.load_hdf5(args.model + '.weights', self.encdec)

    trace('generating translation ...')
 def __predict_sentence(self, src_batch):
     """
     Predict a response for one source batch.

     :param src_batch: the source sentence batch to decode
     :return: the hypothesis batch produced by the attention dialogue model
     """
     dialogue = EncoderDecoderModelAttention(self.parameter)
     src_vocab = Vocabulary.load(self.model_name + '.srcvocab')
     trg_vocab = Vocabulary.load(self.model_name + '.trgvocab')
     model = AttentionDialogue.load_spec(self.model_name + '.spec', self.XP)
     serializers.load_hdf5(self.model_name + '.weights', model)
     hyp_batch = dialogue.forward_implement(src_batch, None, src_vocab, trg_vocab, model, False, self.generation_limit)
     return hyp_batch
Exemple #20
0
 def setCateg(self, args):
     """Load or build the category vocabulary and cache it on the instance.

     Returns the Vocabulary and also stores it as ``self.categ_vocab``.
     """
     categ_name = "./{}/categ_{}.bin".format(args.dataname, args.dataname)
     if os.path.exists(categ_name):
         categ_vocab = Vocabulary.load(categ_name)
     else:
         # Count distinct category tokens. (The original abused a nested
         # list comprehension for this side effect; use an explicit loop.)
         set_cat = set()
         for word_arr in gens.word_list(args.category):
             set_cat.update(word_arr)
         n_categ = len(set_cat) + 3  # + room for special tokens
         print("n_categ:{}".format(n_categ))
         categ_vocab = Vocabulary.new(gens.word_list(args.category), n_categ)
         categ_vocab.save(categ_name)
     self.categ_vocab = categ_vocab
     return categ_vocab
Exemple #21
0
 def setVocab(self, args):
     """Load or build the source vocabulary and cache it on the instance.

     Returns the Vocabulary and also stores it as ``self.vocab``.
     """
     vocab_name = "./{}/vocab_{}.bin".format(args.dataname, args.dataname)
     if os.path.exists(vocab_name):
         src_vocab = Vocabulary.load(vocab_name)
     else:
         # Count distinct source tokens. (The original abused a nested
         # list comprehension for this side effect; use an explicit loop.)
         set_vocab = set()
         for word_arr in gens.word_list(args.source):
             set_vocab.update(word_arr)
         n_vocab = len(set_vocab) + 3  # + room for special tokens
         print("n_vocab:{}".format(n_vocab))
         print("arg_vocab:{}".format(args.n_vocab))
         # The vocabulary is built with args.n_vocab, not the counted
         # n_vocab above, matching the original behavior.
         src_vocab = Vocabulary.new(gens.word_list(args.source), args.n_vocab)
         src_vocab.save(vocab_name)
     self.vocab = src_vocab
     return src_vocab
    def train(self):
        """
        Train the attention dialogue model.

        If ``self.word2vecFlag`` is set, word2vec weights are copied into
        the embedding and decoder layers before training starts.
        Optimization uses AdaGrad with gradient clipping.
        """
        trace("making vocabularies ...")
        src_vocab = Vocabulary.new(gens.word_list(self.source), self.vocab)
        trg_vocab = Vocabulary.new(gens.word_list(self.target), self.vocab)

        trace("making model ...")
        self.attention_dialogue = AttentionDialogue(self.vocab, self.embed, self.hidden, self.XP)
        if self.word2vecFlag:
            self.copy_model(self.word2vec, self.attention_dialogue.emb)
            self.copy_model(self.word2vec, self.attention_dialogue.dec, dec_flag=True)

        for epoch in range(self.epoch):
            trace("epoch %d/%d: " % (epoch + 1, self.epoch))
            trained = 0
            gen1 = gens.word_list(self.source)
            gen2 = gens.word_list(self.target)
            gen3 = gens.batch(gens.sorted_parallel(gen1, gen2, 100 * self.minibatch), self.minibatch)
            # Fresh optimizer per epoch.
            opt = optimizers.AdaGrad(lr=0.01)
            opt.setup(self.attention_dialogue)
            opt.add_hook(optimizer.GradientClipping(5))

            # Index of the sample print_out displays for this epoch.
            random_number = random.randint(0, self.minibatch - 1)
            for src_batch, trg_batch in gen3:
                src_batch = fill_batch(src_batch)
                trg_batch = fill_batch(trg_batch)
                K = len(src_batch)
                hyp_batch, loss = self.forward_implement(
                    src_batch, trg_batch, src_vocab, trg_vocab, self.attention_dialogue, True, 0
                )
                loss.backward()
                opt.update()

                self.print_out(random_number, epoch, trained, src_batch, trg_batch, hyp_batch)

                trained += K

        # NOTE(review): saving happens once, after ALL epochs (this sits
        # outside the epoch loop) — confirm per-epoch checkpoints are not
        # expected here.
        trace("saving model ...")
        prefix = self.model
        model_path = APP_ROOT + "/model/" + prefix
        src_vocab.save(model_path + ".srcvocab")
        trg_vocab.save(model_path + ".trgvocab")
        self.attention_dialogue.save_spec(model_path + ".spec")
        serializers.save_hdf5(model_path + ".weights", self.attention_dialogue)

        trace("finished.")
Exemple #23
0
    def train(self):
        """Train the encoder-decoder model.

        Optionally initializes encoder/decoder weights from a word2vec
        model (``self.word2vecFlag``); trains with AdaGrad + gradient
        clipping and saves the model once after all epochs.
        """
        trace('making vocabularies ...')
        src_vocab = Vocabulary.new(gens.word_list(self.source), self.vocab)
        trg_vocab = Vocabulary.new(gens.word_list(self.target), self.vocab)

        trace('making model ...')
        encdec = EncoderDecoder(self.vocab, self.embed, self.hidden)
        if self.word2vecFlag:
            self.copy_model(self.word2vec, encdec.enc)
            self.copy_model(self.word2vec, encdec.dec, dec_flag=True)

        for epoch in range(self.epoch):
            trace('epoch %d/%d: ' % (epoch + 1, self.epoch))
            trained = 0
            gen1 = gens.word_list(self.source)
            gen2 = gens.word_list(self.target)
            gen3 = gens.batch(
                gens.sorted_parallel(gen1, gen2, 100 * self.minibatch),
                self.minibatch)
            # Fresh optimizer per epoch.
            opt = optimizers.AdaGrad(lr=0.01)
            opt.setup(encdec)
            opt.add_hook(optimizer.GradientClipping(5))

            # Index of the sample print_out displays for this epoch.
            random_number = random.randint(0, self.minibatch - 1)
            for src_batch, trg_batch in gen3:
                src_batch = fill_batch(src_batch)
                trg_batch = fill_batch(trg_batch)
                K = len(src_batch)
                # If you use the ipython note book you hace to use the forward function
                # hyp_batch, loss = self.forward(src_batch, trg_batch, src_vocab, trg_vocab, encdec, True, 0)
                hyp_batch, loss = self.forward_implement(
                    src_batch, trg_batch, src_vocab, trg_vocab, encdec, True,
                    0)
                loss.backward()
                opt.update()

                self.print_out(random_number, epoch, trained, src_batch,
                               trg_batch, hyp_batch)

                trained += K

        # NOTE(review): saving happens once, after ALL epochs (outside the
        # epoch loop) — confirm per-epoch checkpoints are not expected.
        trace('saving model ...')
        prefix = self.model
        src_vocab.save(prefix + '.srcvocab')
        trg_vocab.save(prefix + '.trgvocab')
        encdec.save_spec(prefix + '.spec')
        serializers.save_hdf5(prefix + '.weights', encdec)

        trace('finished.')
 def __predict_sentence(self, src_batch):
     """
     Predict a response for one source batch.

     :param src_batch: the source sentence batch to decode
     :return: the hypothesis batch produced by the attention dialogue model
     """
     dialogue = EncoderDecoderModelAttention(self.parameter)
     src_vocab = Vocabulary.load(self.model_name + '.srcvocab')
     trg_vocab = Vocabulary.load(self.model_name + '.trgvocab')
     model = AttentionDialogue.load_spec(self.model_name + '.spec', self.XP)
     serializers.load_hdf5(self.model_name + '.weights', model)
     hyp_batch = dialogue.forward_implement(src_batch, None, src_vocab,
                                            trg_vocab, model, False,
                                            self.generation_limit)
     return hyp_batch
    def train(self):
        """Train the encoder-decoder model.

        Optionally initializes encoder/decoder weights from a word2vec
        model; trains with AdaGrad + gradient clipping and saves the model
        once after all epochs.
        """
        trace('making vocabularies ...')
        src_vocab = Vocabulary.new(gens.word_list(self.source), self.vocab)
        trg_vocab = Vocabulary.new(gens.word_list(self.target), self.vocab)

        trace('making model ...')
        encdec = EncoderDecoder(self.vocab, self.embed, self.hidden)
        if self.word2vecFlag:
            self.copy_model(self.word2vec, encdec.enc)
            self.copy_model(self.word2vec, encdec.dec, dec_flag=True)
        else:
            # NOTE(review): when word2vecFlag is false the freshly built
            # encdec above is discarded in favor of self.encdec — confirm
            # this is intended and that self.encdec is initialized.
            encdec = self.encdec

        for epoch in range(self.epoch):
            trace('epoch %d/%d: ' % (epoch + 1, self.epoch))
            trained = 0
            gen1 = gens.word_list(self.source)
            gen2 = gens.word_list(self.target)
            gen3 = gens.batch(gens.sorted_parallel(gen1, gen2, 100 * self.minibatch), self.minibatch)
            opt = optimizers.AdaGrad(lr = 0.01)
            opt.setup(encdec)
            opt.add_hook(optimizer.GradientClipping(5))

            # Index of the sample print_out displays for this epoch.
            random_number = random.randint(0, self.minibatch - 1)
            for src_batch, trg_batch in gen3:
                src_batch = fill_batch(src_batch)
                trg_batch = fill_batch(trg_batch)
                K = len(src_batch)
                hyp_batch, loss = self.forward(src_batch, trg_batch, src_vocab, trg_vocab, encdec, True, 0)
                loss.backward()
                opt.update()

                # Only the first batch of each epoch is reported.
                if trained == 0:
                    self.print_out(random_number, epoch, trained, src_batch, trg_batch, hyp_batch)

                trained += K

        # NOTE(review): saving happens once, after ALL epochs (outside the
        # epoch loop) — confirm per-epoch checkpoints are not expected.
        trace('saving model ...')
        prefix = self.model
        src_vocab.save(prefix + '.srcvocab')
        trg_vocab.save(prefix + '.trgvocab')
        encdec.save_spec(prefix + '.spec')
        serializers.save_hdf5(prefix + '.weights', encdec)

        trace('finished.')
 def load(self, filename):
     # Restore vocabularies, hyperparameters and weights from ``filename``;
     # returns self for chaining.
     with ModelFile(filename) as fp:
         self.src_vocab = Vocabulary.load(fp.get_file_pointer())
         self.trg_vocab = Vocabulary.load(fp.get_file_pointer())
         self.n_embed = int(fp.read())
         self.n_hidden = int(fp.read())
         self.make_model()
         # Weight reads must mirror the write order of the matching save
         # routine (not visible in this file) — do not reorder.
         wrapper.begin_model_access(self.model)
         fp.read_embed(self.model.weight_xi)
         fp.read_linear(self.model.weight_ip)
         fp.read_linear(self.model.weight_pp)
         fp.read_linear(self.model.weight_pq)
         fp.read_linear(self.model.weight_qj)
         fp.read_linear(self.model.weight_jy)
         fp.read_embed(self.model.weight_yq)
         fp.read_linear(self.model.weight_qq)
         wrapper.end_model_access(self.model)
     return self
 def load(self, filename):
     # Restore vocabularies, hyperparameters and weights from ``filename``;
     # returns self for chaining.
     with ModelFile(filename) as fp:
         self.src_vocab = Vocabulary.load(fp.get_file_pointer())
         self.trg_vocab = Vocabulary.load(fp.get_file_pointer())
         self.n_embed = int(fp.read())
         self.n_hidden = int(fp.read())
         self.make_model()
         # Weight reads must mirror the write order of the matching save
         # routine (not visible in this file) — do not reorder.
         wrapper.begin_model_access(self.model)
         fp.read_embed(self.model.weight_xi)
         fp.read_linear(self.model.weight_ip)
         fp.read_linear(self.model.weight_pp)
         fp.read_linear(self.model.weight_pq)
         fp.read_linear(self.model.weight_qj)
         fp.read_linear(self.model.weight_jy)
         fp.read_embed(self.model.weight_yq)
         fp.read_linear(self.model.weight_qq)
         wrapper.end_model_access(self.model)
     return self
def train(args):
  """Train an attentional MT model (Chainer).

  Builds vocabularies from the source/target corpora, then for each epoch
  re-reads the corpora in sorted minibatches, runs forward/backward, and
  saves vocabularies, spec and weights under a per-epoch prefix.

  Args:
    args: parsed CLI namespace with source, target, vocab, embed, hidden,
      use_gpu, epoch, minibatch, model attributes.
  """
  trace('making vocabularies ...')
  src_vocab = Vocabulary.new(gens.word_list(args.source), args.vocab)
  trg_vocab = Vocabulary.new(gens.word_list(args.target), args.vocab)

  trace('making model ...')
  attmt = AttentionMT(args.vocab, args.embed, args.hidden)
  if args.use_gpu:
    attmt.to_gpu()

  for epoch in range(args.epoch):
    trace('epoch %d/%d: ' % (epoch + 1, args.epoch))
    trained = 0
    gen1 = gens.word_list(args.source)
    gen2 = gens.word_list(args.target)
    gen3 = gens.batch(gens.sorted_parallel(gen1, gen2, 100 * args.minibatch), args.minibatch)
    # A fresh optimizer per epoch (resets AdaGrad state) -- kept as-is.
    opt = optimizers.AdaGrad(lr = 0.01)
    opt.setup(attmt)
    opt.add_hook(optimizer.GradientClipping(5))

    for src_batch, trg_batch in gen3:
      src_batch = fill_batch(src_batch)
      trg_batch = fill_batch(trg_batch)
      K = len(src_batch)
      # BUG FIX: gradients must be cleared before each backward pass;
      # without this they accumulate across minibatches.
      attmt.zerograds()
      hyp_batch, loss = forward(src_batch, trg_batch, src_vocab, trg_vocab, attmt, True, 0)
      loss.backward()
      opt.update()

      for k in range(K):
        trace('epoch %3d/%3d, sample %8d' % (epoch + 1, args.epoch, trained + k + 1))
        trace('  src = ' + ' '.join([x if x != '</s>' else '*' for x in src_batch[k]]))
        trace('  trg = ' + ' '.join([x if x != '</s>' else '*' for x in trg_batch[k]]))
        trace('  hyp = ' + ' '.join([x if x != '</s>' else '*' for x in hyp_batch[k]]))

      trained += K

    trace('saving model ...')
    # BUG FIX: '%03.d' (precision 0) pads with SPACES, producing filenames
    # like 'model.  1.spec'; '%03d' gives the intended zero-padded epoch.
    prefix = args.model + '.%03d' % (epoch + 1)
    src_vocab.save(prefix + '.srcvocab')
    trg_vocab.save(prefix + '.trgvocab')
    attmt.save_spec(prefix + '.spec')
    serializers.save_hdf5(prefix + '.weights', attmt)

  trace('finished.')
Exemple #29
0
def test(args, epoch):
    """Evaluate a saved BiEncDecLSTM tagger for the given *epoch*.

    Loads the epoch's .npz weights plus the word/tag vocabularies, runs the
    attention model over the test corpus in batches, writes one CSV row per
    sample (utterance, gold tag, predicted tag, confidence, most-attended
    word), and prints/records correct vs. wrong counts.
    """
    model_name = "./model/attn_tag_model_{}.npz".format(epoch)
    encdec = BiEncDecLSTM(args.n_vocab, args.layer, args.embed, args.hidden,
                          args.n_tag)
    serializers.load_npz(model_name, encdec)
    src_vocab = Vocabulary.load("./model/vocab.bin")
    trg_tag = Vocabulary.load("./model/tag.bin")
    # Lazily map characters / tags to ids; only the first field of each
    # target line is used as the tag.
    tt_now = ([src_vocab.stoi(char) for char in char_arr]
              for char_arr in gens.word_list(args.source_te))
    tag_now = (trg_tag.stoi(tag[0]) for tag in gens.word_list(args.target_te))
    tt_gen = gens.batch(tt_now, args.batchsize)
    tag_gen = gens.batch(tag_now, args.batchsize)
    correct_num = 0
    wrong_num = 0
    fw = codecs.open("./output/result_attn_tw{}.csv".format(epoch),
                     "w",
                     encoding="utf-8")
    # BUG FIX: the header contained a stray garbled '›' character
    # before the last column name.
    # NOTE(review): header is comma-separated but the data rows below are
    # written with ':' separators -- confirm which format consumers expect.
    fw.write("台詞,教師キャラ,予測キャラ,予測値,単語\n")
    for tt, tag in zip(tt_gen, tag_gen):
        y, att_w = encdec.callAndAtt(tt)
        # Per-sample confidence = max softmax probability over tags.
        max_y = [
            max(
                F.softmax(F.reshape(y_each.data,
                                    (1, len(y_each.data)))).data[0])
            for y_each in y
        ]
        y = [y_each.data.argmax(0) for y_each in y]
        for tt_e, y_e, tag_e, max_y_e, att_w_e in zip(tt, y, tag, max_y,
                                                      att_w):
            # Renamed 'id' -> 'wid' to avoid shadowing the builtin.
            txt = ",".join([src_vocab.itos(wid) for wid in tt_e])
            tag_e = trg_tag.itos(tag_e)
            y_e = trg_tag.itos(y_e)
            # Word with the highest attention weight for this sample.
            att_ind = att_w_e.data.argmax()
            most_word = src_vocab.itos(tt_e[att_ind])
            fw.write("{}:{}:{}:{}:{}\n".format(txt, tag_e, y_e, max_y_e,
                                               most_word))
        correct_num += len([1 for y_e, tag_e in zip(y, tag) if y_e == tag_e])
        wrong_num += len([1 for y_e, tag_e in zip(y, tag) if y_e != tag_e])
    print("epoch:{}".format(epoch))
    print(" correct:{}".format(correct_num))
    print(" wrong:{}".format(wrong_num))
    # FIX: added the missing ':' for consistency with the 'wrong:' line.
    fw.write("correct:{}\n".format(correct_num))
    fw.write("wrong:{}\n".format(wrong_num))
    fw.close()
 def load(filename):
     """Build a fresh EncoderDecoderModel and populate it from *filename*.

     NOTE(review): no ``self`` parameter and a locally constructed
     instance -- presumably decorated with @staticmethod above this
     view; confirm in the enclosing class.
     """
     self = EncoderDecoderModel()
     # Reads are strictly order-dependent; they mirror the save routine.
     with ModelFile(filename) as fp:
         self.__src_vocab = Vocabulary.load(fp.get_file_pointer())
         self.__trg_vocab = Vocabulary.load(fp.get_file_pointer())
         self.__n_embed = int(fp.read())
         self.__n_hidden = int(fp.read())
         self.__make_model()  # rebuild layers before loading weights
         wrapper.begin_model_access(self.__model)
         fp.read_embed(self.__model.w_xi)
         fp.read_linear(self.__model.w_ip)
         fp.read_linear(self.__model.w_pp)
         fp.read_linear(self.__model.w_pq)
         fp.read_linear(self.__model.w_qj)
         fp.read_linear(self.__model.w_jy)
         fp.read_embed(self.__model.w_yq)
         fp.read_linear(self.__model.w_qq)
         wrapper.end_model_access(self.__model)
     return self
Exemple #31
0
 def load(filename):
     """Build a fresh EncoderDecoderModel and populate it from *filename*.

     Duplicate of the loader above (scraped copy).
     NOTE(review): no ``self`` parameter -- presumably a @staticmethod
     whose decorator sits above this view; confirm in the class.
     """
     self = EncoderDecoderModel()
     # Reads are strictly order-dependent; they mirror the save routine.
     with ModelFile(filename) as fp:
         self.__src_vocab = Vocabulary.load(fp.get_file_pointer())
         self.__trg_vocab = Vocabulary.load(fp.get_file_pointer())
         self.__n_embed = int(fp.read())
         self.__n_hidden = int(fp.read())
         self.__make_model()  # rebuild layers before loading weights
         wrapper.begin_model_access(self.__model)
         fp.read_embed(self.__model.w_xi)
         fp.read_linear(self.__model.w_ip)
         fp.read_linear(self.__model.w_pp)
         fp.read_linear(self.__model.w_pq)
         fp.read_linear(self.__model.w_qj)
         fp.read_linear(self.__model.w_jy)
         fp.read_embed(self.__model.w_yq)
         fp.read_linear(self.__model.w_qq)
         wrapper.end_model_access(self.__model)
     return self
    def test(self):
        """Generate replies for the test corpus with a trained model.

        Loads vocabularies, spec and weights from ``model/<self.model>``,
        decodes every minibatch of ``self.source``, prints each src/hyp
        pair, and writes hypotheses to ``self.test_target``.
        """
        trace('loading model ...')
        prefix = self.model
        model_path = APP_ROOT + "/model/" + prefix
        src_vocab = Vocabulary.load(model_path + '.srcvocab')
        trg_vocab = Vocabulary.load(model_path + '.trgvocab')
        self.attention_dialogue = AttentionDialogue.load_spec(
            model_path + '.spec', self.XP)
        serializers.load_hdf5(model_path + '.weights', self.attention_dialogue)

        trace('generating translation ...')
        generated = 0

        with open(self.test_target, 'w') as fp:
            for src_batch in gens.batch(gens.word_list(self.source),
                                        self.minibatch):
                src_batch = fill_batch(src_batch)
                K = len(src_batch)

                trace('sample %8d - %8d ...' % (generated + 1, generated + K))
                hyp_batch = self.forward_implement(src_batch, None, src_vocab,
                                                   trg_vocab,
                                                   self.attention_dialogue,
                                                   False,
                                                   self.generation_limit)

                # FIX: replaced the misspelled manual counter
                # ('source_cuont') with enumerate.
                for source_count, hyp in enumerate(hyp_batch):
                    # Guarantee a terminator, then truncate at the first one.
                    hyp.append('</s>')
                    hyp = hyp[:hyp.index('</s>')]
                    print("src : " +
                          "".join(src_batch[source_count]).replace("</s>", ""))
                    print('hyp : ' + ''.join(hyp))
                    print(' '.join(hyp), file=fp)

                generated += K

        trace('finished.')
Exemple #33
0
def test(args):
    """Parse each sentence of ``args.source`` with a saved parser.

    Loads the word/phrase/semiterminal vocabularies plus spec and weights
    from the ``args.model`` prefix, then prints one bracketed parse tree
    per input line.
    """
    trace('loading model ...')
    prefix = args.model
    word_vocab = Vocabulary.load(prefix + '.words')
    phrase_vocab = Vocabulary.load(prefix + '.phrases')
    semi_vocab = Vocabulary.load(prefix + '.semiterminals')
    parser = Parser.load_spec(prefix + '.spec')
    if USE_GPU:
        parser.to_gpu()
    serializers.load_hdf5(prefix + '.weights', parser)

    trace('generating parse trees ...')
    with open(args.source) as fp:
        for line in fp:
            word_list = convert_word_list(line.split(), word_vocab)
            ops = parser.forward(word_list, None, args.unary_limit)
            tree = restore_labels(ops, phrase_vocab, semi_vocab)
            print('( ' + tree_to_string(tree) + ' )')

    trace('finished.')
def test(args):
    """Decode a single question (``args.target``) with a saved model.

    Loads vocabularies, spec and weights from the ``args.model`` prefix,
    wraps the tokenized question as a one-element batch, and prints the
    question followed by the generated answer.
    """
    trace('loading model ...')
    src_vocab = Vocabulary.load(args.model + '.srcvocab')
    trg_vocab = Vocabulary.load(args.model + '.trgvocab')
    encdec = EncoderDecoder.load_spec(args.model + '.spec')
    if args.use_gpu:
        encdec.to_gpu()
    serializers.load_hdf5(args.model + '.weights', encdec)

    trace('generating translation ...')
    generated = 0

    # Build a one-sentence batch, e.g.
    # [['私は', '太郎', 'です', '(´', 'ー', '`*)', 'ウンウン', '</s>']]
    temp = gens.to_words(args.target)
    src_batch = []
    src_batch.append(temp)
    src_batch = fill_batch(src_batch)
    print("src_batch:", src_batch)
    K = len(src_batch)

    trace('sample %8d - %8d ...' % (generated + 1, generated + K))
    print("question:")
    for srp in src_batch:
        # Guarantee a terminator, then cut everything from the first one.
        srp.append('</s>')
        srp = srp[:srp.index('</s>')]
        print(''.join(srp))

    hyp_batch = forward(src_batch, None, src_vocab, trg_vocab, encdec, False,
                        args.generation_limit)
    # BUG FIX: output label was misspelled 'answser:'.
    print("answer:")
    for hyp in hyp_batch:
        hyp.append('</s>')
        hyp = hyp[:hyp.index('</s>')]
        print(''.join(hyp))
    print("----------------")

    generated += K

    trace('finished.')
 def load(filename):
     """Build a SegmentationModel and populate it from *filename*.

     NOTE(review): no ``self`` parameter -- presumably a @staticmethod
     whose decorator sits above this view; confirm in the class.
     """
     self = SegmentationModel()
     # Reads are strictly order-dependent; they mirror the save routine.
     with ModelFile(filename) as fp:
         self.__vocab = Vocabulary.load(fp.get_file_pointer())
         self.__n_context = int(fp.read())
         self.__n_hidden = int(fp.read())
         self.__make_model()  # rebuild layers before loading weights
         wrapper.begin_model_access(self.__model)
         fp.read_embed(self.__model.w_xh)
         fp.read_linear(self.__model.w_hy)
         wrapper.end_model_access(self.__model)
     return self
Exemple #36
0
 def load(filename):
     """Build a TransSegmentationModel and populate it from *filename*.

     NOTE(review): no ``self`` parameter -- presumably a @staticmethod
     whose decorator sits above this view; confirm in the class.
     """
     self = TransSegmentationModel()
     # Reads are strictly order-dependent; they mirror the save routine.
     with ModelFile(filename) as fp:
         self.__vocab = Vocabulary.load(fp.get_file_pointer())
         self.__n_context = int(fp.read())
         self.__n_hidden = int(fp.read())
         self.__make_model()  # rebuild layers before loading weights
         wrapper.begin_model_access(self.__model)
         fp.read_embed(self.__model.w_xh)
         fp.read_linear(self.__model.w_hy)
         wrapper.end_model_access(self.__model)
     return self
Exemple #37
0
def test(args):
  """Print one bracketed parse tree per line of ``args.source``.

  Restores vocabularies, spec and weights saved under the ``args.model``
  prefix before decoding.
  """
  trace('loading model ...')
  word_vocab = Vocabulary.load(args.model + '.words')
  phrase_vocab = Vocabulary.load(args.model + '.phrases')
  semi_vocab = Vocabulary.load(args.model + '.semiterminals')
  parser = Parser.load_spec(args.model + '.spec')
  if USE_GPU:
    parser.to_gpu()
  serializers.load_hdf5(args.model + '.weights', parser)

  trace('generating parse trees ...')
  with open(args.source) as fp:
    for sentence in fp:
      ids = convert_word_list(sentence.split(), word_vocab)
      op_sequence = parser.forward(ids, None, args.unary_limit)
      tree = restore_labels(op_sequence, phrase_vocab, semi_vocab)
      print('( %s )' % tree_to_string(tree))

  trace('finished.')
Exemple #38
0
def train_model(args):
    """Train a TransSegmentationModel on ``args.corpus``.

    One pass per epoch over the corpus (one sentence per line); tracks the
    accumulated loss, saves a checkpoint per epoch, and reports per-epoch
    and total wall-clock time.
    """
    train_begin = time.time()
    # FIX: corrected 'vocaburaries' typo in the log message.
    trace('making vocabularies ...')
    vocab = Vocabulary.new(gens.letter_list(args.corpus), args.vocab)

    trace('begin training ...')
    model = TransSegmentationModel.new(vocab, args.context, args.hidden,
                                       args.labels, args.eta)

    for epoch in range(args.epoch):
        epoch_beg = time.time()
        trace('START epoch %d/%d: ' % (epoch + 1, args.epoch))
        trained = 0
        total_loss = 0

        model.init_optimizer()

        with open(args.corpus) as fp:
            for text in fp:
                word_list = text.split()
                if not word_list:
                    continue  # skip blank lines

                text = ' '.join(word_list)
                letters = ''.join(word_list)
                labels, accum_loss_f = model.train(text)
                total_loss += accum_loss_f
                trained += 1
                # hyp is only consumed by the debug traces below.
                hyp = make_hyp(letters, labels)
                """for 1sentence output
                trace("accum_loss : %lf"% (accum_loss_f))
                trace('epoch %d/%d: ' % (epoch + 1, args.epoch))
                trace('trained %d: '% trained)
                trace(text)
                trace(hyp)
                """
                """
                if trained % 100 == 0:
                    trace('  %8d' % trained)
                """
        trace('FINISHED epoch %d/%d: ' % (epoch + 1, args.epoch))
        trace('total_loss : %lf' % total_loss)
        trace('saving model ...')
        model.save(args.model + '.%03d' % (epoch + 1))
        epoch_time = time.time() - epoch_beg
        trace('elapsed_time/1epoch : %lf' % epoch_time)

    trace('finished.')
    elapsed_time = time.time() - train_begin
    trace('train_time : %lf' % elapsed_time)
    trace('')
Exemple #39
0
def train_model(args):
    """Train a TransSegmentationModel on ``args.corpus``.

    Near-duplicate of the trainer above (scraped copy): one pass per epoch
    over the corpus, per-epoch checkpointing and timing.
    """
    train_begin = time.time()
    # FIX: corrected 'vocaburaries' typo in the log message.
    trace('making vocabularies ...')
    vocab = Vocabulary.new(gens.letter_list(args.corpus), args.vocab)

    trace('begin training ...')
    model = TransSegmentationModel.new(vocab, args.context, args.hidden,
                                       args.labels, args.eta)

    for epoch in range(args.epoch):
        epoch_beg = time.time()
        trace('START epoch %d/%d: ' % (epoch + 1, args.epoch))
        trained = 0
        total_loss = 0

        model.init_optimizer()

        with open(args.corpus) as fp:
            for text in fp:
                word_list = text.split()
                if not word_list:
                    continue  # skip blank lines

                text = ' '.join(word_list)
                letters = ''.join(word_list)
                labels, accum_loss_f = model.train(text)
                total_loss += accum_loss_f
                trained += 1
                # hyp is only consumed by the debug traces below.
                hyp = make_hyp(letters, labels)

                """for 1sentence output
                trace("accum_loss : %lf"% (accum_loss_f))
                trace('epoch %d/%d: ' % (epoch + 1, args.epoch))
                trace('trained %d: '% trained)
                trace(text)
                trace(hyp)
                """
                """
                if trained % 100 == 0:
                    trace('  %8d' % trained)
                """
        trace('FINISHED epoch %d/%d: ' % (epoch + 1, args.epoch))
        trace('total_loss : %lf' % total_loss)
        trace('saving model ...')
        model.save(args.model + '.%03d' % (epoch + 1))
        epoch_time = time.time() - epoch_beg
        trace('elapsed_time/1epoch : %lf' % epoch_time)

    trace('finished.')
    elapsed_time = time.time() - train_begin
    trace('train_time : %lf' % elapsed_time)
    trace('')
    def test(self):
        """Generate replies for the test corpus with a trained model.

        Double-quote-style duplicate of the other ``test`` method: loads
        vocabularies, spec and weights from ``model/<self.model>``, decodes
        every minibatch of ``self.source``, prints src/hyp pairs and writes
        hypotheses to ``self.test_target``.

        You have to prepare the trained model first.
        """
        trace("loading model ...")
        prefix = self.model
        model_path = APP_ROOT + "/model/" + prefix
        src_vocab = Vocabulary.load(model_path + ".srcvocab")
        trg_vocab = Vocabulary.load(model_path + ".trgvocab")
        self.attention_dialogue = AttentionDialogue.load_spec(model_path + ".spec", self.XP)
        serializers.load_hdf5(model_path + ".weights", self.attention_dialogue)

        trace("generating translation ...")
        generated = 0

        with open(self.test_target, "w") as fp:
            for src_batch in gens.batch(gens.word_list(self.source), self.minibatch):
                src_batch = fill_batch(src_batch)
                K = len(src_batch)

                trace("sample %8d - %8d ..." % (generated + 1, generated + K))
                hyp_batch = self.forward_implement(
                    src_batch, None, src_vocab, trg_vocab, self.attention_dialogue, False, self.generation_limit
                )

                # FIX: replaced the misspelled manual counter
                # ('source_cuont') with enumerate.
                for source_count, hyp in enumerate(hyp_batch):
                    # Guarantee a terminator, then truncate at the first one.
                    hyp.append("</s>")
                    hyp = hyp[: hyp.index("</s>")]
                    print("src : " + "".join(src_batch[source_count]).replace("</s>", ""))
                    print("hyp : " + "".join(hyp))
                    print(" ".join(hyp), file=fp)

                generated += K

        trace("finished.")
    def train_model(self):
        """Train an encoder-decoder MT model over ``self.epoch`` epochs.

        Rebuilds the minibatch generators each epoch, prints one sample
        batch per epoch, and saves a numbered checkpoint after each epoch.
        """
        # FIX: corrected 'vocaburaries' typo in the log message.
        trace('making vocabularies ...')
        src_vocab = Vocabulary.new(gens.word_list(self.source), self.vocab)
        trg_vocab = Vocabulary.new(gens.word_list(self.target), self.vocab)

        trace('making model ...')
        model = self.new(src_vocab, trg_vocab, self.embed, self.hidden,
                         self.parameter_dict)

        # Index of the sample printed once per epoch.
        random_number = random.randint(0, self.minibatch)
        for i_epoch in range(self.epoch):
            trace('epoch %d/%d: ' % (i_epoch + 1, self.epoch))
            trained = 0
            gen1 = gens.word_list(self.source)
            gen2 = gens.word_list(self.target)
            gen3 = gens.batch(
                gens.sorted_parallel(gen1, gen2, 100 * self.minibatch),
                self.minibatch)
            model.init_optimizer()

            for src_batch, trg_batch in gen3:
                src_batch = fill_batch(src_batch)
                trg_batch = fill_batch(trg_batch)
                K = len(src_batch)
                hyp_batch = model.train(src_batch, trg_batch)

                if trained == 0:
                    self.print_out(random_number, i_epoch, trained, src_batch,
                                   trg_batch, hyp_batch)

                trained += K

            trace('saving model ...')
            # BUG FIX: previously saved with (self.epoch + 1), a constant,
            # so every epoch overwrote the same (wrongly numbered) file;
            # use the current epoch index instead.
            model.save("ChainerMachineTranslation" + '.%03d' %
                       (i_epoch + 1))

        trace('finished.')
 def load(filename):
     """Build an RNNSegmentationModel and populate it from *filename*.

     NOTE(review): no ``self`` parameter -- presumably a @staticmethod
     whose decorator sits above this view; confirm in the class.
     """
     self = RNNSegmentationModel()
     # Reads are strictly order-dependent; they mirror the save routine.
     with ModelFile(filename) as fp:
         self.__vocab = Vocabulary.load(fp.get_file_pointer())
         self.__n_embed = int(fp.read())
         self.__n_hidden = int(fp.read())
         self.__make_model()  # rebuild layers before loading weights
         wrapper.begin_model_access(self.__model)
         fp.read_embed(self.__model.w_xe)
         fp.read_linear(self.__model.w_ea)
         fp.read_linear(self.__model.w_aa)
         fp.read_linear(self.__model.w_eb)
         fp.read_linear(self.__model.w_bb)
         fp.read_linear(self.__model.w_ay1)
         fp.read_linear(self.__model.w_by1)
         fp.read_linear(self.__model.w_ay2)
         fp.read_linear(self.__model.w_by2)
         wrapper.end_model_access(self.__model)
     return self
def train_model(args):
    """Train a SegmentationModel on ``args.corpus``.

    One pass per epoch over the corpus (one sentence per line), tracing
    every training sample, and saving a numbered checkpoint per epoch.
    """
    trace("making vocabularies ...")
    vocab = Vocabulary.new(gens.letter_list(args.corpus), args.vocab)

    trace("start training ...")
    model = SegmentationModel.new(vocab, args.context, args.hidden)

    for epoch in range(args.epoch):
        trace("epoch %d/%d: " % (epoch + 1, args.epoch))
        trained = 0

        model.init_optimizer()

        with open(args.corpus) as fp:
            for text in fp:
                word_list = text.split()
                if not word_list:
                    continue  # skip blank lines

                text = " ".join(word_list)
                letters = "".join(word_list)
                scores = model.train(text)
                trained += 1
                hyp = make_hyp(letters, scores)

                trace(trained)
                trace(text)
                trace(hyp)
                trace(" ".join("%+.1f" % x for x in scores))

                if trained % 100 == 0:
                    trace("  %8d" % trained)

        # FIX: corrected 'saveing' typo in the log message.
        trace("saving model ...")
        model.save(args.model + ".%03d" % (epoch + 1))

    trace("finished.")
def train_model(args):
    """Train an RNNSegmentationModel on ``args.corpus``.

    Single-quote-style duplicate of the trainer above: one pass per epoch,
    per-sample tracing, and a numbered checkpoint per epoch.
    """
    trace('making vocabularies ...')
    vocab = Vocabulary.new(gens.letter_list(args.corpus), args.vocab)

    trace('start training ...')
    model = RNNSegmentationModel.new(vocab, args.embed, args.hidden)

    for epoch in range(args.epoch):
        trace('epoch %d/%d: ' % (epoch + 1, args.epoch))
        trained = 0

        model.init_optimizer()

        with open(args.corpus) as fp:
            for text in fp:
                word_list = text.split()
                if not word_list:
                    continue  # skip blank lines

                text = ' '.join(word_list)
                letters = ''.join(word_list)
                scores = model.train(text)
                trained += 1
                hyp = make_hyp(letters, scores)

                trace(trained)
                trace(text)
                trace(hyp)
                trace(' '.join('%+.1f' % x for x in scores))

                if trained % 100 == 0:
                    trace('  %8d' % trained)

        # FIX: corrected 'saveing' typo in the log message.
        trace('saving model ...')
        model.save(args.model + '.%03d' % (epoch + 1))

    trace('finished.')
Exemple #45
0
        assert (display.shape == (1, V))
        output_str = str(k) + ": "
        for v in xrange(self._V):
            if display[:, v]:
                output_str += vocab[v] + "\t"
        print(output_str)


"""
run HDP on a synthetic corpus.
"""
if __name__ == '__main__':
    from util.vocabulary import Vocabulary, parse_raw_text

    # Corpus location; only the first 50 documents are kept to keep the
    # Gibbs-sampling demo fast.
    bdir = "../../data/nips12/"
    #data = import_monolingual_data(bdir + "doc.dat")
    data = parse_raw_text(bdir)
    data = data[:50]
    voca = Vocabulary(exclude_stopwords=True)
    corpus = [voca.doc2bow(doc) for doc in data]
    #corpus = numpy.array([numpy.random.poisson(1, k) for k in numpy.random.poisson(50, 20)])
    # Re-key the bag-of-words list as {doc index: bow} before sampling.
    corpus = dict((i, j) for i, j in enumerate(corpus))

    gs = UncollapsedGibbsSampling()
    gs._initialize(corpus)

    # Run 100 Gibbs sweeps over the corpus.
    gs.sample(100)

    # Report the inferred topic count and topic-word count matrix.
    print(gs._K)
    print(gs._n_kv)
Exemple #46
0
def train(args):
    """Train a shift-reduce constituency parser from a treebank.

    Reads trees from ``args.source``, derives the word / phrase-label /
    semiterminal vocabularies and gold operation sequences, then trains
    with SGD (gradient clipping + weight decay), saving vocabularies,
    spec and weights per epoch and decaying the learning rate by 0.92.
    """
    trace('loading corpus ...')
    with open(args.source) as fp:
        trees = [make_tree(l) for l in fp]

    trace('extracting leaf nodes ...')
    word_lists = [extract_words(t) for t in trees]
    # Vocabulary is built over lowercased words; the original casing is
    # kept in word_lists for conversion below.
    lower_lists = [[w.lower() for w in words] for words in word_lists]

    trace('extracting gold operations ...')
    op_lists = [make_operations(t) for t in trees]

    trace('making vocabulary ...')
    word_vocab = Vocabulary.new(lower_lists, args.vocab)
    phrase_set = set()
    semiterminal_set = set()
    for tree in trees:
        phrase_set |= set(extract_phrase_labels(tree))
        semiterminal_set |= set(extract_semiterminals(tree))
    phrase_vocab = Vocabulary.new([list(phrase_set)],
                                  len(phrase_set),
                                  add_special_tokens=False)
    semiterminal_vocab = Vocabulary.new([list(semiterminal_set)],
                                        len(semiterminal_set),
                                        add_special_tokens=False)

    trace('converting data ...')
    word_lists = [convert_word_list(x, word_vocab) for x in word_lists]
    op_lists = [
        convert_op_list(x, phrase_vocab, semiterminal_vocab) for x in op_lists
    ]

    trace('start training ...')
    parser = Parser(
        args.vocab,
        args.embed,
        args.char_embed,
        args.queue,
        args.stack,
        args.srstate,
        len(phrase_set),
        len(semiterminal_set),
    )
    if args.use_gpu:
        parser.to_gpu()
    opt = optimizers.SGD(lr=0.1)
    opt.setup(parser)
    opt.add_hook(optimizer.GradientClipping(10))
    opt.add_hook(optimizer.WeightDecay(0.0001))

    batch_set = list(zip(word_lists, op_lists))

    for epoch in range(args.epoch):
        n = 0
        random.shuffle(batch_set)

        for samples in batch(batch_set, args.minibatch):
            parser.zerograds()
            loss = XP.fzeros(())

            # NOTE(review): samples holds (word_list, op_list) pairs, so
            # zip(*samples) transposes the minibatch; confirm against
            # batch()'s semantics that the unpacking below pairs each
            # word_list with its own op_list.
            for word_list, op_list in zip(*samples):
                trace('epoch %3d, sample %6d:' % (epoch + 1, n + 1))
                loss += parser.forward_train(word_list, op_list)
                n += 1

            loss.backward()
            opt.update()

        trace('saving model ...')
        # BUG FIX: '%03.d' (precision 0) pads with SPACES, producing names
        # like 'model.  1.spec'; '%03d' gives the intended zero padding.
        prefix = args.model + '.%03d' % (epoch + 1)
        word_vocab.save(prefix + '.words')
        phrase_vocab.save(prefix + '.phrases')
        semiterminal_vocab.save(prefix + '.semiterminals')
        parser.save_spec(prefix + '.spec')
        serializers.save_hdf5(prefix + '.weights', parser)

        # Exponential learning-rate decay per epoch.
        opt.lr *= 0.92

    trace('finished.')
Exemple #47
0
def train():
    """End-to-end training entry point for the FVQA CMGCN model.

    Parses CLI arguments, builds dataloaders / model / optimizer and two
    LR schedulers (linear warmup then cosine), optionally resumes from a
    checkpoint, trains for the configured number of epochs, and (with
    ``--validate``) evaluates on the test split after every epoch.
    """
    parser = argparse.ArgumentParser()
    # Experiment configuration file.
    parser.add_argument(
        "--config-yml",
        default="exp_fvqa/exp2.yml",
        help=
        "Path to a config file listing reader, model and solver parameters.")

    parser.add_argument("--cpu-workers",
                        type=int,
                        default=8,
                        help="Number of CPU workers for dataloader.")

    parser.add_argument(
        "--save-dirpath",
        default="fvqa/exp_data/checkpoints",
        help=
        "Path of directory to create checkpoint directory and save checkpoints."
    )

    parser.add_argument(
        "--load-pthpath",
        default="",
        help="To continue training, path to .pth file of saved checkpoint.")

    parser.add_argument("--gpus", default="", help="gpus")
    parser.add_argument(
        "--overfit",
        action="store_true",
        help="Whether to validate on val split after every epoch.")

    parser.add_argument(
        "--validate",
        action="store_true",
        help="Whether to validate on val split after every epoch.")

    args = parser.parse_args()

    # Fix seeds for reproducibility.
    torch.manual_seed(10)
    torch.cuda.manual_seed(10)
    cudnn.benchmark = True
    cudnn.deterministic = True

    # BUG FIX: yaml.load() without an explicit Loader is unsafe on
    # untrusted input, and the file handle was never closed; safe_load
    # inside a context manager fixes both.
    with open(args.config_yml) as config_file:
        config = yaml.safe_load(config_file)

    device = torch.device("cuda:0") if args.gpus != "cpu" else torch.device(
        "cpu")

    # Print config and args.
    print(yaml.dump(config, default_flow_style=False))
    for arg in vars(args):
        print("{:<20}: {}".format(arg, getattr(args, arg)))

    print('Loading TrainDataset...')
    train_dataset = FvqaTrainDataset(config, overfit=args.overfit)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=config['solver']['batch_size'],
                                  num_workers=args.cpu_workers,
                                  shuffle=True,
                                  collate_fn=collate_fn)

    if args.validate:
        print('Loading TestDataset...')
        val_dataset = FvqaTestDataset(config, overfit=args.overfit)
        val_dataloader = DataLoader(val_dataset,
                                    batch_size=config['solver']['batch_size'],
                                    num_workers=args.cpu_workers,
                                    shuffle=True,
                                    collate_fn=collate_fn)

    print('Loading glove...')
    que_vocab = Vocabulary(config['dataset']['word2id_path'])
    glove = np.load(config['dataset']['glove_vec_path'])
    glove = torch.Tensor(glove)

    print('Building Model...')
    model = CMGCNnet(config,
                     que_vocabulary=que_vocab,
                     glove=glove,
                     device=device)

    if torch.cuda.device_count() > 1 and args.gpus != "cpu":
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = nn.DataParallel(model)

    model = model.to(device)
    print(model)

    # Number of optimizer steps per epoch.
    iterations = len(train_dataset) // config["solver"]["batch_size"] + 1

    def lr_lambda_fun(current_iteration: int) -> float:
        """Linear warmup, then step decay at the configured LR milestones."""
        current_epoch = float(current_iteration) / iterations
        if current_epoch <= config["solver"]["warmup_epochs"]:
            alpha = current_epoch / float(config["solver"]["warmup_epochs"])
            return config["solver"]["warmup_factor"] * (1. - alpha) + alpha
        else:
            idx = bisect(config["solver"]["lr_milestones"], current_epoch)
            return pow(config["solver"]["lr_gamma"], idx)

    optimizer = optim.Adamax(model.parameters(),
                             lr=config["solver"]["initial_lr"])
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda_fun)
    # Total iterations covered by the post-warmup cosine schedule.
    T = iterations * (config["solver"]["num_epochs"] -
                      config["solver"]["warmup_epochs"] + 1)
    scheduler2 = lr_scheduler.CosineAnnealingLR(
        optimizer, int(T), eta_min=config["solver"]["eta_min"], last_epoch=-1)

    summary_writer = SummaryWriter(log_dir=args.save_dirpath)
    checkpoint_manager = CheckpointManager(model,
                                           optimizer,
                                           args.save_dirpath,
                                           config=config)

    if args.load_pthpath == "":
        start_epoch = 0
    else:
        # Checkpoints are named ..._<epoch>.pth; recover the epoch number.
        start_epoch = int(args.load_pthpath.split("_")[-1][:-4])

        model_state_dict, optimizer_state_dict = load_checkpoint(
            args.load_pthpath)
        if isinstance(model, nn.DataParallel):
            model.module.load_state_dict(model_state_dict)
        else:
            model.load_state_dict(model_state_dict)
        optimizer.load_state_dict(optimizer_state_dict)
        print("Loading resume model from {}...".format(args.load_pthpath))

    global_iteration_step = start_epoch * iterations

    for epoch in range(start_epoch, config['solver']['num_epochs']):

        print(f"\nTraining for epoch {epoch}:")

        train_answers = []
        train_preds = []

        for batch_idx, batch in enumerate(tqdm(train_dataloader)):
            optimizer.zero_grad()
            fact_batch_graph = model(batch)
            batch_loss = cal_batch_loss(fact_batch_graph,
                                        batch,
                                        device,
                                        neg_weight=0.1,
                                        pos_weight=0.9)

            batch_loss.backward()
            optimizer.step()

            # Collect per-graph node scores for the epoch accuracy.
            # BUG FIX: the inner enumerate previously reused `i`, shadowing
            # the outer batch index.
            fact_graphs = dgl.unbatch(fact_batch_graph)
            for graph_idx, fact_graph in enumerate(fact_graphs):
                train_pred = fact_graph.ndata['h'].squeeze()  # (num_nodes,1)
                train_preds.append(train_pred)  # [(num_nodes,)]
                train_answers.append(batch['facts_answer_id_list'][graph_idx])

            summary_writer.add_scalar('train/loss', batch_loss,
                                      global_iteration_step)
            summary_writer.add_scalar("train/lr",
                                      optimizer.param_groups[0]["lr"],
                                      global_iteration_step)
            summary_writer.add_text('train/loss', str(batch_loss.item()),
                                    global_iteration_step)
            summary_writer.add_text('train/lr',
                                    str(optimizer.param_groups[0]["lr"]),
                                    global_iteration_step)

            # Warmup scheduler during warmup, cosine annealing afterwards.
            if global_iteration_step <= iterations * config["solver"][
                "warmup_epochs"]:
                scheduler.step(global_iteration_step)
            else:
                # NOTE(review): this expression goes negative once training
                # passes warmup (warmup_iters + 1 - step); it looks like it
                # was meant to be `step - warmup_iters + 1`. Kept as-is
                # because changing it alters the LR trajectory -- confirm.
                global_iteration_step_in_2 = iterations * config["solver"][
                    "warmup_epochs"] + 1 - global_iteration_step
                scheduler2.step(int(global_iteration_step_in_2))

            global_iteration_step = global_iteration_step + 1
            torch.cuda.empty_cache()

        checkpoint_manager.step()
        train_acc_1, train_acc_3 = cal_acc(
            train_answers, train_preds)
        print(
            "trainacc@1={:.2%} & trainacc@3={:.2%} "
                .format(train_acc_1, train_acc_3))
        summary_writer.add_scalars(
            'train/acc', {
                'acc@1': train_acc_1,
                'acc@3': train_acc_3

            }, epoch)

        if args.validate:
            model.eval()
            answers = []  # [batch_answers,...]
            preds = []  # [batch_preds,...]
            print(f"\nValidation after epoch {epoch}:")
            for batch_idx, batch in enumerate(tqdm(val_dataloader)):
                with torch.no_grad():
                    fact_batch_graph = model(batch)
                batch_loss = cal_batch_loss(fact_batch_graph,
                                            batch,
                                            device,
                                            neg_weight=0.1,
                                            pos_weight=0.9)

                summary_writer.add_scalar('test/loss', batch_loss, epoch)
                fact_graphs = dgl.unbatch(fact_batch_graph)
                for graph_idx, fact_graph in enumerate(fact_graphs):
                    pred = fact_graph.ndata['h'].squeeze()  # (num_nodes,1)
                    preds.append(pred)  # [(num_nodes,)]
                    answers.append(batch['facts_answer_id_list'][graph_idx])

            acc_1, acc_3 = cal_acc(answers, preds)
            print("acc@1={:.2%} & acc@3={:.2%} ".
                  format(acc_1, acc_3))
            summary_writer.add_scalars('test/acc', {
                'acc@1': acc_1,
                'acc@3': acc_3
            }, epoch)

            model.train()
            torch.cuda.empty_cache()
    print('Train finished !!!')
    summary_writer.close()
Exemple #48
0
from util.vocabulary import Vocabulary
from util import generators as gens
from util.controller import Controller
from util.wrapper import wrapper
from util.const import *

if __name__ == '__main__':
    args = parse_args()

    trace('initializing ...')
    # FIX: the original did `wrapper = wrapper(args.gpu_id)`, shadowing the
    # imported `wrapper` class with its own instance; bind to a new name so
    # the class stays usable and the intent is clear.
    gpu_wrapper = wrapper(args.gpu_id)
    gpu_wrapper.init()

    trace('loading vocab ...')
    # Vocabulary paths come from util.const, not from CLI arguments.
    src_vocab = Vocabulary.load(VOCAB_SRC)
    trg_vocab = Vocabulary.load(VOCAB_TRG)

    controller = Controller(args.folder_name)

    # Dispatch on the requested mode; any other value falls through silently.
    if args.mode == 'train':
        controller.train_model(BasicEncoderDecoderModel, src_vocab, trg_vocab,
                               args)
    elif args.mode == 'dev':
        controller.dev_model(BasicEncoderDecoderModel, src_vocab, trg_vocab,
                             args)
    elif args.mode == 'test':
        controller.test_model(BasicEncoderDecoderModel, src_vocab, trg_vocab,
                              args)
Exemple #49
0
def train(args):
  """Train the shift-reduce parser on a treebank.

  Loads trees from args.source, extracts leaf words and gold operations,
  builds word/phrase/semiterminal vocabularies, then trains with SGD
  (gradient clipping + weight decay), decaying the learning rate and
  saving a snapshot after every epoch.
  """
  trace('loading corpus ...')
  with open(args.source) as fp:
    trees = [make_tree(l) for l in fp]

  trace('extracting leaf nodes ...')
  word_lists = [extract_words(t) for t in trees]
  # The word vocabulary is built over lowercased tokens; the original-case
  # word_lists are still used for the actual training data below.
  lower_lists = [[w.lower() for w in words] for words in word_lists]

  trace('extracting gold operations ...')
  op_lists = [make_operations(t) for t in trees]

  trace('making vocabulary ...')
  word_vocab = Vocabulary.new(lower_lists, args.vocab)
  phrase_set = set()
  semiterminal_set = set()
  for tree in trees:
    phrase_set |= set(extract_phrase_labels(tree))
    semiterminal_set |= set(extract_semiterminals(tree))
  # Label vocabularies are closed sets: no <unk>/<s> style special tokens.
  phrase_vocab = Vocabulary.new([list(phrase_set)], len(phrase_set), add_special_tokens=False)
  semiterminal_vocab = Vocabulary.new([list(semiterminal_set)], len(semiterminal_set), add_special_tokens=False)

  trace('converting data ...')
  word_lists = [to_vram_words(convert_word_list(x, word_vocab)) for x in word_lists]
  op_lists = [to_vram_ops(convert_op_list(x, phrase_vocab, semiterminal_vocab)) for x in op_lists]

  trace('start training ...')
  parser = Parser(
    args.vocab, args.embed, args.char_embed, args.queue,
    args.stack, args.srstate, len(phrase_set), len(semiterminal_set),
  )
  if args.use_gpu:
    parser.to_gpu()
  opt = optimizers.SGD(lr = 0.1)
  opt.setup(parser)
  opt.add_hook(optimizer.GradientClipping(10))
  opt.add_hook(optimizer.WeightDecay(0.0001))

  batch_set = list(zip(word_lists, op_lists))

  for epoch in range(args.epoch):
    n = 0
    random.shuffle(batch_set)

    # `batch` yields transposed minibatches (a words-tuple and an ops-tuple),
    # so zip(*samples) re-pairs each sentence with its operation sequence.
    for samples in batch(batch_set, args.minibatch):
      parser.zerograds()
      loss = XP.fzeros(())
      # Share word-embedding lookups within the minibatch.
      embed_cache = {}

      for word_list, op_list in zip(*samples):
        trace('epoch %3d, sample %6d:' % (epoch + 1, n + 1))
        loss += parser.forward(word_list, op_list, 0, embed_cache)
        n += 1

      loss.backward()
      opt.update()

    trace('saving model ...')
    # BUG FIX: '%03.d' sets a precision, which makes the 0 flag ignored and
    # produced space-padded names like 'model.  1'; '%03d' gives 'model.001'.
    prefix = args.model + '.%03d' % (epoch + 1)
    word_vocab.save(prefix + '.words')
    phrase_vocab.save(prefix + '.phrases')
    semiterminal_vocab.save(prefix + '.semiterminals')
    parser.save_spec(prefix + '.spec')
    serializers.save_hdf5(prefix + '.weights', parser)

    # Exponential learning-rate decay per epoch.
    opt.lr *= 0.92

  trace('finished.')
def train(args):
    """Train (or resume training of) an encoder-decoder translation model.

    Builds source/target vocabularies and a fresh EncoderDecoder, optionally
    replacing them with a saved model when args.load_model is set, then runs
    AdaGrad over sorted parallel minibatches, tracing each sample and saving
    snapshots every args.model_save_timing epochs.
    """
    trace('making vocabularies ...')
    src_vocab = Vocabulary.new(gens.input_word_list(), args.vocab)
    trg_vocab = Vocabulary.new(gens.output_word_list(), args.vocab)
    trace('making model ...')
    encdec = EncoderDecoder(args.vocab, args.embed, args.hidden)

    # Resuming replaces the freshly built vocabularies and network above.
    if args.load_model != "":
        print("model load  %s ... " % (args.load_model))
        src_vocab = Vocabulary.load(args.load_model + '.srcvocab')
        trg_vocab = Vocabulary.load(args.load_model + '.trgvocab')
        encdec = EncoderDecoder.load_spec(args.load_model + '.spec')
        serializers.load_hdf5(args.load_model + '.weights', encdec)

    if args.use_gpu:
        encdec.to_gpu()

    for epoch in range(args.epoch):
        trace('epoch %d/%d: ' % (epoch + 1, args.epoch))
        trained = 0
        gen1 = gens.input_word_list()
        gen2 = gens.output_word_list()
        # Sort within a 100*minibatch window so batched sentences have
        # similar lengths, then group into minibatches.
        gen3 = gens.batch(
            gens.sorted_parallel(gen1, gen2, 100 * args.minibatch),
            args.minibatch)
        # NOTE(review): the optimizer is rebuilt every epoch, which resets
        # AdaGrad's accumulated statistics — confirm this is intentional.
        opt = optimizers.AdaGrad(lr=0.01)
        opt.setup(encdec)
        opt.add_hook(optimizer.GradientClipping(5))

        for src_batch, trg_batch in gen3:
            src_batch = fill_batch(src_batch)
            trg_batch = fill_batch(trg_batch)
            K = len(src_batch)
            hyp_batch, loss = forward(src_batch, trg_batch, src_vocab,
                                      trg_vocab, encdec, True, 0)
            loss.backward()
            opt.update()

            # Trace every sample, rendering the '</s>' terminator as '*'.
            for k in range(K):
                trace('epoch %3d/%3d, sample %8d' %
                      (epoch + 1, args.epoch, trained + k + 1))
                trace(
                    '  src = ' +
                    ' '.join([x if x != '</s>' else '*'
                              for x in src_batch[k]]))
                trace(
                    '  trg = ' +
                    ' '.join([x if x != '</s>' else '*'
                              for x in trg_batch[k]]))
                trace(
                    '  hyp = ' +
                    ' '.join([x if x != '</s>' else '*'
                              for x in hyp_batch[k]]))

            trained += K

        if epoch % args.model_save_timing == 0:

            trace('saving model ...')
            # BUG FIX: '%03.d' sets a precision, which makes the 0 flag
            # ignored and produced space-padded names like 'model.  1';
            # '%03d' gives the intended 'model.001'.
            prefix = args.model + '.%03d' % (epoch + 1)
            src_vocab.save(prefix + '.srcvocab')
            trg_vocab.save(prefix + '.trgvocab')
            encdec.save_spec(prefix + '.spec')
            serializers.save_hdf5(prefix + '.weights', encdec)

    trace('finished.')