def __init__(self, work_dir, rawdata_dir, rawvocabsize, max_seq_length):
     json_path = work_dir + '/compressed'
     if os.path.exists(json_path):
         # load data from json
         print('loading saved json data from %s' % json_path)
         with open(json_path, 'r') as fin:
             gdict = json.load(fin)
             for name, val in gdict.items():
                 setattr(self, name, val)
         # setup encoder from vocabulary file
         vocabFile = work_dir + '/vocabulary.txt'
         if os.path.exists(vocabFile):
             print("Loading supplied vocabluary file: %s" % vocabFile)
             encoder = text_encoder.SubwordTextEncoder(filename=vocabFile)
             print("Total vocab size is: %d" % encoder.vocab_size)
         else:
             print(
                 "No supplied vocabulary file found. Building a new vocabulary from the training data ..."
             )
             token_counts = tokenizer.corpus_token_counts(
                 work_dir + '/*.Corpus', 2000000, split_on_newlines=True)
             encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
                 rawvocabsize, token_counts, 2, 1000)
             encoder.store_to_file(vocabFile)
             print("New vocabulary constructed.")
         self.encoder = encoder
         self.max_seq_length = int(self.max_seq_length)
         self.vocab_size = encoder.vocab_size
         print('-')
         print('Vocab size:', self.vocab_size, 'unique words')
         print('-')
         print('Max allowed sequence length:', self.max_seq_length)
         print('-')
     else:
         print('generating data from data path: %s' % rawdata_dir)
         encoder, trainCorpus, evalCorpus, encodedFullTargetSpace, tgtIdNameMap = data_utils.prepare_raw_data(
             rawdata_dir, work_dir, rawvocabsize, max_seq_length)
         self.encoder = encoder
         self.rawTrainPosCorpus = trainCorpus
         self.rawEvalCorpus = evalCorpus
         self.max_seq_length = max_seq_length
         self.encodedFullTargetSpace = encodedFullTargetSpace
         self.tgtIdNameMap = tgtIdNameMap
         self.vocab_size = encoder.vocab_size
         self.fullSetTargetIds = list(encodedFullTargetSpace.keys())
         self.rawnegSetLen = len(self.fullSetTargetIds)
         print('-')
         print('Vocab size:', self.vocab_size, 'unique words')
         print('-')
         print('Max allowed sequence length:', self.max_seq_length)
         print('-')
         gdict = {}
         for name, attr in self.__dict__.items():
             if not name.startswith("__") and name != 'encoder':
                 if not callable(attr) and not isinstance(attr, staticmethod):
                     gdict[name] = attr
         with open(json_path, 'w') as fout:
             json.dump(gdict, fout)
         print('Processed data dumped')
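A minimal usage sketch for the constructor above. The class name SSEDataSet and the directory arguments are illustrative assumptions (the original snippet does not show the enclosing class); on a first run the constructor generates data from rawdata_dir and dumps the compressed JSON into work_dir, and on later runs it reloads that cache together with vocabulary.txt.

# Hypothetical usage; the class name and paths below are assumptions for illustration.
data = SSEDataSet(work_dir='models-classification',
                  rawdata_dir='rawdata-classification',
                  rawvocabsize=32000,
                  max_seq_length=50)
print('vocab size:', data.vocab_size, 'max sequence length:', data.max_seq_length)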
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode',
                        choices={'train', 'chat'},
                        default='train',
                        help="mode. if not specified, it's in the train mode")
    args = parser.parse_args()

    if not os.path.isdir(config.PROCESSED_PATH):
        data_utils.prepare_raw_data()
        data_utils.process_data()
    print('Data ready!')
    data_utils.make_dir(config.CPT_PATH)

    if args.mode == 'train':
        train()
    elif args.mode == 'chat':
        chat()
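For completeness, a hedged sketch of the usual entry-point guard and command-line usage; it assumes this example lives in a script such as chatbot.py and that config.py defines PROCESSED_PATH and CPT_PATH.

# Standard entry-point guard (assumed; not shown in the original snippet).
if __name__ == '__main__':
    main()

# Typical invocations (illustrative):
#   python chatbot.py --mode train
#   python chatbot.py --mode chat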
def train():
  # Prepare data.
  print("Preparing Train & Eval data in %s" % FLAGS.data_dir)

  for d in FLAGS.data_dir, FLAGS.model_dir:
    if not os.path.exists(d):
      os.makedirs(d)

  encoder, train_corpus, eval_corpus, encodedTgtSpace, tgtIdNameMap = data_utils.prepare_raw_data(
      FLAGS.data_dir, FLAGS.model_dir, FLAGS.vocab_size, FLAGS.neg_samples, FLAGS.max_seq_length )

  epoc_steps = int(math.floor( len(train_corpus) /  FLAGS.batch_size ) )

  print( "Training Data: %d total positive samples, each epoch need %d steps" % (len(train_corpus), epoc_steps ) )

  cfg = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
  with tf.Session(config=cfg) as sess:
    model = create_model( sess, len(encodedTgtSpace), encoder.vocab_size,  False )
    #setup tensorboard logging
    sw =  tf.summary.FileWriter( logdir=FLAGS.model_dir,  graph=sess.graph, flush_secs=120)
    summary_op = model.add_summaries()
    # This is the training loop.
    step_time, loss, train_acc = 0.0, 0.0, 0.0
    current_step = 0
    previous_accuracies = []
    fullSetTargetIds = list(encodedTgtSpace.keys())
    fullSetLen = len(fullSetTargetIds)
    negIdx = random.randint(0, fullSetLen - 1)
    for epoch in range( FLAGS.max_epoc ):
      epoc_start_Time = time.time()
      random.shuffle(train_corpus)
      for batchId in range( math.floor(epoc_steps * 0.95) ): # drop the last 5% of batches each epoch as a simple regularizer
        start_time = time.time()
        source_inputs, src_lens, tgt_inputs, tgt_lens, labels, negIdx = \
          buildMixedTrainBatch( train_corpus[batchId*FLAGS.batch_size:(batchId+1)*FLAGS.batch_size], encodedTgtSpace,fullSetTargetIds, fullSetLen,FLAGS.neg_samples, negIdx)
        model.set_forward_only(False)
        d = model.get_train_feed_dict(source_inputs, tgt_inputs, labels, src_lens, tgt_lens)
        ops = [model.train, summary_op, model.loss, model.train_acc ]
        _, summary, step_loss, step_train_acc = sess.run(ops, feed_dict=d)
        step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
        loss += step_loss / FLAGS.steps_per_checkpoint
        train_acc += step_train_acc / FLAGS.steps_per_checkpoint
        current_step += 1

        # Once in a while, we save checkpoint, print statistics, and run evals.
        if current_step % FLAGS.steps_per_checkpoint == 0:
          print ("global epoc: %.3f, global step %d, learning rate %.4f step-time:%.2f loss:%.4f train_binary_acc:%.4f " %
                 ( float(model.global_step.eval())/ float(epoc_steps), model.global_step.eval(), model.learning_rate.eval(),
                             step_time, step_loss, train_acc ))
          checkpoint_path = os.path.join(FLAGS.model_dir, "SSE-LSTM.ckpt")
          acc_sum = tf.Summary(value=[tf.Summary.Value(tag="train_binary_acc", simple_value=train_acc)])
          sw.add_summary(acc_sum, current_step)

          # #########debugging##########
          # model.set_forward_only(True)
          # sse_index.createIndexFile(model, encoder, os.path.join(FLAGS.model_dir, FLAGS.rawfilename),
          #                           FLAGS.max_seq_length, os.path.join(FLAGS.model_dir, FLAGS.encodedIndexFile), sess,
          #                           batchsize=1000)
          # evaluator = sse_evaluator.Evaluator(model, eval_corpus, os.path.join(FLAGS.model_dir, FLAGS.encodedIndexFile),
          #                                     sess)
          # acc1, acc3, acc10 = evaluator.eval()
          # print("epoc# %.3f, task specific evaluation: top 1/3/10 accuracies: %f / %f / %f " % (float(model.global_step.eval())/ float(epoc_steps), acc1, acc3, acc10))
          # ###end of debugging########

          # Decrease learning rate if no improvement was seen over the last 2 checkpoints.
          if len(previous_accuracies) > 3 and train_acc < min(previous_accuracies[-2:]):
            sess.run(model.learning_rate_decay_op)
          previous_accuracies.append(train_acc)
          # save currently best-ever model
          if train_acc == max(previous_accuracies):
            print("Better Accuracy %.4f found. Saving current best model ..." % train_acc )
            model.save(sess, checkpoint_path + "-BestEver")
          else:
            print("Best Accuracy is: %.4f, while current round is: %.4f" % (max(previous_accuracies), train_acc) )
            print("skip saving model ...")
          # If more than 10 epochs have passed with no accuracy improvement over the last 5 checkpoints, stop training,
          # report the best accuracy, and save the final model.
          if epoch > 10 and train_acc < min(previous_accuracies[-5:]):
            p = model.save(sess, checkpoint_path + "-final")
            print("After around %d Epocs no further improvement, Training finished, wrote checkpoint to %s." % (epoch, p) )
            break

          # reset current checkpoint step statistics
          step_time, loss, train_acc = 0.0, 0.0, 0.0


      epoc_train_time = time.time() - epoc_start_Time
      print('\n\n\nepoch# %d  took %f hours' % ( epoch , epoc_train_time / (60.0 * 60) ) )

      # run task-specific evaluation after each epoch
      if (FLAGS.task_type not in ['ranking', 'crosslingual']) or ( (epoch+1) % 20 == 0 ):
        model.set_forward_only(True)
        sse_index.createIndexFile( model, encoder, os.path.join(FLAGS.model_dir, FLAGS.rawfilename), FLAGS.max_seq_length, os.path.join(FLAGS.model_dir, FLAGS.encodedIndexFile), sess, batchsize=1000 )
        evaluator = sse_evaluator.Evaluator(model, eval_corpus, os.path.join(FLAGS.model_dir, FLAGS.encodedIndexFile) , sess)
        acc1, acc3, acc10 = evaluator.eval()
        print("epoc#%d, task specific evaluation: top 1/3/10 accuracies: %f / %f / %f \n\n\n" % (epoch, acc1, acc3, acc10) )
      # Save checkpoint at end of each epoch
      checkpoint_path = os.path.join(FLAGS.model_dir, "SSE-LSTM.ckpt")
      model.save(sess, checkpoint_path + '-epoch-%d'%epoch)
      if len(previous_accuracies) > 0:
        print('So far best ever model training binary accuracy is: %.4f ' % max(previous_accuracies) )
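buildMixedTrainBatch is called in the loop above but not defined in this example. The sketch below is an assumption about its shape: the 1.0/0.0 label convention and the PAD_ID-based sequence lengths follow the in-line batch construction in the last train() variant further down, while the round-robin use of negIdx is inferred from how it is threaded through the call; the real helper may differ.

def buildMixedTrainBatch(posPairs, encodedTgtSpace, fullSetTargetIds,
                         fullSetLen, neg_samples, negIdx):
  # Sketch only: mix each positive (source, targetId) pair with neg_samples
  # negative targets taken round-robin from the full target space. Sequence
  # lengths are recovered from the position of the first PAD_ID token.
  source_inputs, src_lens, tgt_inputs, tgt_lens, labels = [], [], [], [], []
  for source_input, tgtId in posPairs:
    src_len = source_input.index(text_encoder.PAD_ID) + 1
    # positive pair
    source_inputs.append(source_input)
    src_lens.append(src_len)
    tgt_inputs.append(encodedTgtSpace[tgtId])
    tgt_lens.append(encodedTgtSpace[tgtId].index(text_encoder.PAD_ID) + 1)
    labels.append(1.0)
    # negative pairs
    for _ in range(neg_samples):
      negIdx = (negIdx + 1) % fullSetLen
      if fullSetTargetIds[negIdx] == tgtId:      # skip an accidental positive
        negIdx = (negIdx + 1) % fullSetLen
      negTgt = fullSetTargetIds[negIdx]
      source_inputs.append(source_input)
      src_lens.append(src_len)
      tgt_inputs.append(encodedTgtSpace[negTgt])
      tgt_lens.append(encodedTgtSpace[negTgt].index(text_encoder.PAD_ID) + 1)
      labels.append(0.0)
  return source_inputs, src_lens, tgt_inputs, tgt_lens, labels, negIdx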
def train():
    # Prepare data.
    print("Preparing Train & Eval data in %s" % FLAGS.data_dir)

    for d in FLAGS.data_dir, FLAGS.model_dir:
        if not os.path.exists(d):
            os.makedirs(d)

    encoded_train_pair_path, encoded_eval_pair_path, encodedFullTargetSpace_path, _, _ = data_utils.prepare_raw_data(
        FLAGS.data_dir, FLAGS.model_dir, FLAGS.src_vocab_size,
        FLAGS.tgt_vocab_size)

    #load full set targetSeqID data
    tgtID_EncodingMap, tgtID_FullLableMap, fullLabel_tgtID_Map, target_inputs, target_lens = load_encodedTargetSpace(
        encodedFullTargetSpace_path)

    #load full set train data
    print("Reading development and training data ...")
    train_set, epoc_steps = read_train_data(encoded_train_pair_path,
                                            tgtID_EncodingMap)
    print("Training Data: %d total samples, each epoch need %d steps" %
          (len(train_set), epoc_steps))

    #load eval data
    eval_src_seqs, eval_src_lens, eval_tgtIDs = get_eval_set(
        encoded_eval_pair_path)

    cfg = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    with tf.device('/' + FLAGS.device), tf.Session(config=cfg) as sess:
        # Create SSE model and build tensorflow training graph.
        print("Creating %d layers of %d units." %
              (FLAGS.num_layers, FLAGS.embedding_size))
        model = create_model(sess, len(tgtID_FullLableMap), False)

        #setup evaluation graph
        evaluator = sse_evaluator.Evaluator(model, eval_src_seqs,
                                            eval_src_lens, eval_tgtIDs,
                                            target_inputs, target_lens,
                                            tgtID_FullLableMap, sess)

        #setup tensorboard logging (tf.train.SummaryWriter is the pre-TF-1.0 API; later TF versions use tf.summary.FileWriter)
        sw = tf.train.SummaryWriter(FLAGS.model_dir,
                                    sess.graph,
                                    flush_secs=120)
        summary_op = model.add_summaries()

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_accuracies = []
        for epoch in range(FLAGS.max_epoc):
            epoc_start_Time = time.time()
            random.shuffle(train_set)
            for batchId in range(epoc_steps -
                                 int(2.5 * FLAGS.steps_per_checkpoint)
                                 ):  # skip the last few checkpoints' worth of batches each epoch
                start_time = time.time()
                source_inputs, labels, src_lens = [], [], []
                for idx in range(FLAGS.batch_size):
                    source_input, src_len, tgtID = train_set[batchId *
                                                             FLAGS.batch_size +
                                                             idx]
                    source_inputs.append(source_input)
                    labels.append(tgtID_FullLableMap[tgtID])
                    src_lens.append(src_len)

                d = model.get_train_feed_dict(source_inputs, target_inputs,
                                              labels, src_lens, target_lens)
                ops = [model.train, summary_op, model.loss]
                _, summary, step_loss = sess.run(ops, feed_dict=d)
                step_time += (time.time() -
                              start_time) / FLAGS.steps_per_checkpoint
                loss += step_loss / FLAGS.steps_per_checkpoint
                current_step += 1

                # Once in a while, we save checkpoint, print statistics, and run evals.
                if current_step % FLAGS.steps_per_checkpoint == 0:
                    print(
                        "global epoc: %.3f, global step %d, learning rate %.4f step-time:%.2f loss:%.4f "
                        % (float(model.global_step.eval()) / float(epoc_steps),
                           model.global_step.eval(),
                           model.learning_rate.eval(), step_time, step_loss))
                    # Save checkpoint and zero timer and loss.
                    checkpoint_path = os.path.join(FLAGS.model_dir,
                                                   "SSE-LSTM.ckpt")
                    model.save(sess,
                               checkpoint_path,
                               global_step=model.global_step)
                    step_time, loss = 0.0, 0.0
                    # Run evals on development set and print their accuracy number.
                    t = time.time()
                    acc1, acc3, acc10 = evaluator.eval()
                    acc_sum = tf.Summary(value=[
                        tf.Summary.Value(tag="acc1", simple_value=acc1),
                        tf.Summary.Value(tag="acc3", simple_value=acc3),
                        tf.Summary.Value(tag="acc10", simple_value=acc10)
                    ])
                    sw.add_summary(acc_sum, current_step)
                    print(
                        "Step %d, top 1/3/10 accuracies: %f / %f / %f, (eval took %f seconds) "
                        % (current_step, acc1, acc3, acc10, time.time() - t))

                    sys.stdout.flush()
                    # Decrease learning rate if no improvement was seen over last 3 times.
                    if len(previous_accuracies) > 2 and acc1 < min(
                            previous_accuracies[-3:]):
                        sess.run(model.learning_rate_decay_op)
                    previous_accuracies.append(acc1)
                    # save currently best-ever model
                    if acc1 == max(previous_accuracies):
                        model.save(sess, checkpoint_path + "-BestEver")
                    # If more than 2 epochs have passed with no accuracy improvement over the last 3 checkpoints, stop training,
                    # report the best accuracy, and save the final model.
                    if epoch > 2 and acc1 < min(previous_accuracies[-3:]):
                        p = model.save(sess, checkpoint_path + "-final")
                        print(
                            "After around %d Epocs no further improvement, Training finished, wrote checkpoint to %s."
                            % (epoch, p))
                        print(
                            "Best ever top1 accuracy: %.2f , Final top 1 / 3 / 10 accuracies: %.2f / %.2f / %.2f"
                            % (max(previous_accuracies), acc1, acc3, acc10))
                        break
            # report epoch statistics
            epoc_train_time = time.time() - epoc_start_Time
            print('epoch# %d  took %f hours' % (epoch, epoc_train_time /
                                                (60.0 * 60)))
            # Save checkpoint at end of each epoch
            checkpoint_path = os.path.join(FLAGS.model_dir, "SSE-LSTM.ckpt")
            model.save(sess, checkpoint_path + '-epoch-%d' % epoch)
            if len(previous_accuracies) > 0:
                print('So far best ever model top1 accuracy is: %.4f ' %
                      max(previous_accuracies))
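load_encodedTargetSpace, read_train_data, and get_eval_set above come from the surrounding project and are not shown here. As a rough illustration of the first of them, the sketch below assumes each line of the target-space file is a target ID, a tab, and a space-separated token-id sequence already padded with PAD_ID; the actual data_utils loader and file format may differ.

def load_encodedTargetSpace(encodedFullTargetSpace_path):
    # Sketch only, under the assumed file format described above.
    tgtID_EncodingMap = {}    # tgtID -> padded token-id sequence
    tgtID_LabelMap = {}       # tgtID -> dense label index 0..N-1
    label_tgtID_Map = {}      # dense label index -> tgtID
    target_inputs, target_lens = [], []
    with open(encodedFullTargetSpace_path, 'r') as fin:
        for label, line in enumerate(fin):
            tgtID, ids = line.rstrip('\n').split('\t')
            seq = [int(tok) for tok in ids.split()]
            tgtID_EncodingMap[tgtID] = seq
            tgtID_LabelMap[tgtID] = label
            label_tgtID_Map[label] = tgtID
            target_inputs.append(seq)
            target_lens.append(seq.index(text_encoder.PAD_ID) + 1)
    return (tgtID_EncodingMap, tgtID_LabelMap, label_tgtID_Map,
            target_inputs, target_lens)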
def train():
    # Prepare data.
    print("Preparing Train & Eval data in %s" % FLAGS.data_dir)

    for d in FLAGS.data_dir, FLAGS.model_dir:
        if not os.path.exists(d):
            os.makedirs(d)

    encoder, train_corpus, dev_corpus, encodedTgtSpace, tgtIdNameMap = data_utils.prepare_raw_data(
        FLAGS.data_dir, FLAGS.model_dir, FLAGS.vocab_size, FLAGS.task_type,
        FLAGS.max_seq_length)

    epoc_steps = int(math.floor(len(train_corpus) / FLAGS.batch_size))

    print(
        "Training Data: %d total samples (pos + neg), each epoch needs %d steps"
        % (len(train_corpus), epoc_steps))

    cfg = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)

    with tf.Session(config=cfg) as sess:
        # Create SSE model and build tensorflow training graph.
        print("Creating %d layers of %d units." %
              (FLAGS.num_layers, FLAGS.embedding_size))
        model = create_model(sess, len(encodedTgtSpace), encoder.vocab_size,
                             False)

        #setup evaluation graph
        evaluator = sse_evaluator.Evaluator(model, dev_corpus, encodedTgtSpace,
                                            sess)

        #setup tensorboard logging
        sw = tf.summary.FileWriter(logdir=FLAGS.model_dir,
                                   graph=sess.graph,
                                   flush_secs=120)
        summary_op = model.add_summaries()

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_accuracies = []
        fullSetTargetIds = set(encodedTgtSpace.keys())
        for epoch in range(FLAGS.max_epoc):
            epoc_start_Time = time.time()
            random.shuffle(train_corpus)
            for batchId in range(epoc_steps -
                                 int(2.5 * FLAGS.steps_per_checkpoint)
                                 ):  # skip the last few checkpoints' worth of batches each epoch
                start_time = time.time()
                source_inputs, src_lens, tgt_inputs, tgt_lens, labels  = [], [], [], [], []
                for idx in range(FLAGS.batch_size):
                    source_input, tgtId = train_corpus[batchId *
                                                       FLAGS.batch_size + idx]
                    #add positive pair
                    source_inputs.append(source_input)
                    src_lens.append(
                        source_input.index(text_encoder.PAD_ID) + 1)
                    tgt_inputs.append(encodedTgtSpace[tgtId])
                    tgt_lens.append(
                        encodedTgtSpace[tgtId].index(text_encoder.PAD_ID) + 1)
                    labels.append(1.0)
                    #add negative pair
                    negTgt = random.sample(fullSetTargetIds - set([tgtId]),
                                           1)[0]
                    source_inputs.append(source_input)
                    src_lens.append(
                        source_input.index(text_encoder.PAD_ID) + 1)
                    tgt_inputs.append(encodedTgtSpace[negTgt])
                    tgt_lens.append(
                        encodedTgtSpace[negTgt].index(text_encoder.PAD_ID) + 1)
                    labels.append(0.0)

                d = model.get_train_feed_dict(source_inputs, tgt_inputs,
                                              labels, src_lens, tgt_lens)
                ops = [model.train, summary_op, model.loss]
                _, summary, step_loss = sess.run(ops, feed_dict=d)
                step_time += (time.time() -
                              start_time) / FLAGS.steps_per_checkpoint
                loss += step_loss / FLAGS.steps_per_checkpoint
                current_step += 1

                # Once in a while, we save checkpoint, print statistics, and run evals.
                if current_step % FLAGS.steps_per_checkpoint == 0:
                    print(
                        "global epoc: %.3f, global step %d, learning rate %.4f step-time:%.2f loss:%.4f "
                        % (float(model.global_step.eval()) / float(epoc_steps),
                           model.global_step.eval(),
                           model.learning_rate.eval(), step_time, step_loss))
                    # Save checkpoint and zero timer and loss.
                    checkpoint_path = os.path.join(FLAGS.model_dir,
                                                   "SSE-LSTM.ckpt")
                    # model.save(sess, checkpoint_path, global_step=model.global_step)  #only save better models
                    step_time, loss = 0.0, 0.0
                    # Run evals on development set and print their accuracy number.
                    t = time.time()
                    acc1, acc3, acc10 = evaluator.eval()
                    acc_sum = tf.Summary(value=[
                        tf.Summary.Value(tag="acc1", simple_value=acc1),
                        tf.Summary.Value(tag="acc3", simple_value=acc3),
                        tf.Summary.Value(tag="acc10", simple_value=acc10)
                    ])
                    sw.add_summary(acc_sum, current_step)
                    print(
                        "Step %d, top 1/3/10 accuracies: %f / %f / %f, (eval took %f seconds) "
                        % (current_step, acc1, acc3, acc10, time.time() - t))
                    sys.stdout.flush()
                    # Decrease learning rate if no improvement was seen over last 3 times.
                    if len(previous_accuracies) > 2 and acc1 < min(
                            previous_accuracies[-3:]):
                        sess.run(model.learning_rate_decay_op)
                    previous_accuracies.append(acc1)
                    # save currently best-ever model
                    if acc1 == max(previous_accuracies):
                        print(
                            "Better Accuracy %f found. Saving current best model ..."
                            % acc1)
                        model.save(sess, checkpoint_path + "-BestEver")
                    else:
                        print(
                            "Best Accuracy is: %f, while current round is: %f"
                            % (max(previous_accuracies), acc1))
                        print("skip saving model ...")
                    # If more than 2 epochs have passed with no accuracy improvement over the last 3 checkpoints, stop training,
                    # report the best accuracy, and save the final model.
                    if epoch > 2 and acc1 < min(previous_accuracies[-3:]):
                        p = model.save(sess, checkpoint_path + "-final")
                        print(
                            "After around %d Epocs no further improvement, Training finished, wrote checkpoint to %s."
                            % (epoch, p))
                        print(
                            "Best ever top1 accuracy: %.2f , Final top 1 / 3 / 10 accuracies: %.2f / %.2f / %.2f"
                            % (max(previous_accuracies), acc1, acc3, acc10))
                        break
            # report epoch statistics
            epoc_train_time = time.time() - epoc_start_Time
            print('epoch# %d  took %f hours' % (epoch, epoc_train_time /
                                                (60.0 * 60)))
            # Save checkpoint at end of each epoch
            checkpoint_path = os.path.join(FLAGS.model_dir, "SSE-LSTM.ckpt")
            model.save(sess, checkpoint_path + '-epoch-%d' % epoch)
            if len(previous_accuracies) > 0:
                print('So far best ever model top1 accuracy is: %.4f ' %
                      max(previous_accuracies))
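sse_evaluator.Evaluator is likewise external to these examples; its eval() call returns top-1/3/10 accuracies over the development set. A simplified, NumPy-only sketch of that kind of computation, given a precomputed source-versus-target similarity matrix, might look like the following; the real evaluator scores inside the TensorFlow graph, so treat this purely as an illustration.

import numpy as np

def topk_accuracies(scores, true_labels, ks=(1, 3, 10)):
    # scores: [num_eval_samples, num_targets] similarity matrix.
    # true_labels: index of the correct target for each eval sample.
    # Returns, for each k, the fraction of samples whose true target ranks in the top k.
    ranked = np.argsort(-scores, axis=1)   # targets sorted by descending score per sample
    accs = []
    for k in ks:
        hits = sum(1 for i, lbl in enumerate(true_labels) if lbl in ranked[i, :k])
        accs.append(hits / float(len(true_labels)))
    return accs   # e.g. [acc1, acc3, acc10]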