def percrank_train(args):
    opts, files = getopt(args, 'c:d:s:j:w:e:r:')
    candgen_model = None
    train_size = 1.0
    parallel = False
    jobs_number = 0
    work_dir = None
    experiment_id = None

    for opt, arg in opts:
        if opt == '-d':
            set_debug_stream(file_stream(arg, mode='w'))
        elif opt == '-s':
            train_size = float(arg)
        elif opt == '-c':
            candgen_model = arg
        elif opt == '-j':
            parallel = True
            jobs_number = int(arg)
        elif opt == '-w':
            work_dir = arg
        elif opt == '-e':
            experiment_id = arg
        elif opt == '-r' and arg:
            rnd.seed(arg)

    if len(files) != 4:
        sys.exit(__doc__)
    fname_rank_config, fname_train_das, fname_train_ttrees, fname_rank_model = files
    log_info('Training perceptron ranker...')

    rank_config = Config(fname_rank_config)
    if candgen_model:
        rank_config['candgen_model'] = candgen_model
    if rank_config.get('nn'):
        from tgen.rank_nn import SimpleNNRanker, EmbNNRanker
        if rank_config['nn'] in ['emb', 'emb_trees', 'emb_prev']:
            ranker_class = EmbNNRanker
        else:
            ranker_class = SimpleNNRanker
    else:
        ranker_class = PerceptronRanker

    log_info('Using %s for ranking' % ranker_class.__name__)
    if not parallel:
        ranker = ranker_class(rank_config)
    else:
        rank_config['jobs_number'] = jobs_number
        if work_dir is None:
            work_dir, _ = os.path.split(fname_rank_config)
        ranker = ParallelRanker(rank_config, work_dir, experiment_id, ranker_class)

    ranker.train(fname_train_das, fname_train_ttrees, data_portion=train_size)

    # avoid the "maximum recursion depth exceeded" error
    sys.setrecursionlimit(100000)
    ranker.save_to_file(fname_rank_model)
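
# A minimal usage sketch for percrank_train (illustration only, not part of the
# original module): the flags mirror the getopt string above, and all file
# names are hypothetical placeholders.
def _percrank_train_example():
    percrank_train(['-s', '0.8',             # use 80% of the training data
                    '-j', '4',               # train with 4 parallel jobs
                    '-r', 'seed1',           # seed the random generator
                    'rank_config.yaml',      # ranker configuration
                    'train-das.txt',         # training DAs
                    'train-ttrees.yaml.gz',  # training trees
                    'rank_model.pickle'])    # output model path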
def seq2seq_train(args):

    ap = ArgumentParser(prog=' '.join(sys.argv[0:2]))

    ap.add_argument('-s', '--train-size', type=float,
                    help='Portion of the training data to use (default: 1.0)', default=1.0)
    ap.add_argument('-d', '--debug-logfile', type=str, help='Debug output file name')
    ap.add_argument('-j', '--jobs', type=int, help='Number of parallel jobs to use')
    ap.add_argument('-w', '--work-dir', type=str, help='Main working directory for parallel jobs')
    ap.add_argument('-e', '--experiment-id', type=str,
                    help='Experiment ID for parallel jobs (used as job name prefix)')
    ap.add_argument('-r', '--random-seed', type=str, help='Initial random seed (used as string).')
    ap.add_argument('-c', '--context-file', type=str,
                    help='Input ttree/text file with context utterances')
    ap.add_argument('-v', '--valid-data', type=str,
                    help='Validation data paths (2-3 comma-separated files: '
                         'DAs, trees/sentences, contexts)')
    ap.add_argument('-l', '--lexic-data', type=str,
                    help='Lexicalization data paths (1-2 comma-separated files: '
                         'surface forms, training lexic. instructions)')
    ap.add_argument('-t', '--tb-summary-dir', '--tensorboard-summary-dir', '--tensorboard',
                    type=str,
                    help='Directory where Tensorboard summaries are saved during training')

    ap.add_argument('seq2seq_config_file', type=str, help='Seq2Seq generator configuration file')
    ap.add_argument('da_train_file', type=str, help='Input training DAs')
    ap.add_argument('tree_train_file', type=str, help='Input training trees/sentences')
    ap.add_argument('seq2seq_model_file', type=str,
                    help='File name where to save the trained Seq2Seq generator model')

    args = ap.parse_args(args)

    if args.debug_logfile:
        set_debug_stream(file_stream(args.debug_logfile, mode='w'))
    if args.random_seed:
        rnd.seed(args.random_seed)

    log_info('Training sequence-to-sequence generator...')

    config = Config(args.seq2seq_config_file)
    if args.tb_summary_dir:  # override Tensorboard setting
        config['tb_summary_dir'] = args.tb_summary_dir

    if args.jobs:  # parallelize when training
        config['jobs_number'] = args.jobs
        if not args.work_dir:
            work_dir, _ = os.path.split(args.seq2seq_config_file)
        generator = ParallelSeq2SeqTraining(config, args.work_dir or work_dir, args.experiment_id)
    else:  # just a single training instance
        generator = Seq2SeqGen(config)

    generator.train(args.da_train_file, args.tree_train_file,
                    data_portion=args.train_size, context_file=args.context_file,
                    validation_files=args.valid_data, lexic_files=args.lexic_data)

    # avoid the "maximum recursion depth exceeded" error
    sys.setrecursionlimit(100000)
    generator.save_to_file(args.seq2seq_model_file)
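
# A minimal invocation sketch for seq2seq_train (illustration only; the flags
# match the parser above, and all paths are hypothetical). Note that -v takes
# the comma-separated validation files as a single string.
def _seq2seq_train_example():
    seq2seq_train(['-s', '0.5',                                   # train on half the data
                   '-v', 'valid-das.txt,valid-trees.yaml.gz',     # validation DAs + trees
                   '-t', 'runs/tb-summaries',                     # Tensorboard summary dir
                   'seq2seq_config.yaml',                         # generator configuration
                   'train-das.txt',                               # training DAs
                   'train-trees.yaml.gz',                         # training trees
                   'seq2seq_model.pickle'])                       # output model path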
def exposed_train(self, rnd_seed, das_file, ttree_file, data_portion, context_file, validation_files):
    """Run the whole training."""
    rnd.seed(rnd_seed)
    log_info('Random seed: %f' % rnd_seed)
    tstart = time.time()
    log_info('Starting training...')
    self.seq2seq.train(das_file, ttree_file, data_portion, context_file, validation_files)
    log_info('Training finished -- time taken: %f secs.' % (time.time() - tstart))
    top_cost = self.seq2seq.top_k_costs[0]
    log_info('Best cost: %f' % top_cost)
    return top_cost
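
# The "exposed_" prefix follows the rpyc convention, under which a connected
# client reaches this method as conn.root.train(...). A head-side sketch,
# assuming (hypothetically) that a worker service listens on a known host/port:
def _call_exposed_train_example():
    import rpyc
    conn = rpyc.connect('worker-host', 18861)
    best_cost = conn.root.train(1234.0,                  # random seed
                                'train-das.txt',         # training DAs
                                'train-ttrees.yaml.gz',  # training trees
                                1.0,                     # data portion to use
                                None,                    # no context file
                                None)                    # no validation files
    return best_cost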
def exposed_training_pass(self, w, pass_no, rnd_seed, data_offset, data_len):
    """(Worker) Run one pass over a part of the training data.

    @param w: initial perceptron weights (pickled)
    @param pass_no: pass number (for logging purposes)
    @param rnd_seed: random generator seed for shuffling training examples
    @param data_offset: training data portion start
    @param data_len: training data portion size
    @return: updated perceptron weights after passing the selected data portion (pickled)
    """
    log_info('Training pass %d with data portion %d + %d' % (pass_no, data_offset, data_len))
    # use the local ranker instance
    ranker = self.ranker_inst
    # import current feature weights
    tstart = time.time()
    ranker.set_weights(pickle.loads(w))
    log_info('Weights loading: %f secs.' % (time.time() - tstart))
    # save the rest of the training data to temporary variables, set just the
    # required portion for computation
    all_train_das = ranker.train_das
    ranker.train_das = ranker.train_das[data_offset:data_offset + data_len]
    all_train_trees = ranker.train_trees
    ranker.train_trees = ranker.train_trees[data_offset:data_offset + data_len]
    all_train_feats = ranker.train_feats
    ranker.train_feats = ranker.train_feats[data_offset:data_offset + data_len]
    all_train_sents = ranker.train_sents
    ranker.train_sents = ranker.train_sents[data_offset:data_offset + data_len]
    all_train_order = ranker.train_order
    ranker.train_order = list(range(len(ranker.train_trees)))  # a list, so it can be shuffled in place
    if ranker.randomize:
        rnd.seed(rnd_seed)
        rnd.shuffle(ranker.train_order)
    # do the actual computation (update w)
    ranker._training_pass(pass_no)
    # return the rest of the training data to member variables
    ranker.train_das = all_train_das
    ranker.train_trees = all_train_trees
    ranker.train_feats = all_train_feats
    ranker.train_sents = all_train_sents
    ranker.train_order = all_train_order
    # return the result of the computation
    log_info('Training pass %d / %d / %d done.' % (pass_no, data_offset, data_len))
    tstart = time.time()
    dump = pickle.dumps((ranker.get_weights(), ranker.get_diagnostics()), pickle.HIGHEST_PROTOCOL)
    log_info('Weights saving: %f secs.' % (time.time() - tstart))
    return dump
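
# Head-side counterpart sketch: pickle the current weights, dispatch one pass
# over a slice of the data, and unpickle the returned weights and diagnostics.
# head_ranker and worker_conn are hypothetical names for the head's ranker
# instance and an rpyc connection to a worker running the service above
# (rpyc strips the "exposed_" prefix, hence root.training_pass).
def _dispatch_training_pass_example(head_ranker, worker_conn, pass_no, rnd_seed,
                                    data_offset, data_len):
    w_dump = pickle.dumps(head_ranker.get_weights(), pickle.HIGHEST_PROTOCOL)
    result = worker_conn.root.training_pass(w_dump, pass_no, rnd_seed,
                                            data_offset, data_len)
    weights, diagnostics = pickle.loads(result)
    head_ranker.set_weights(weights)
    return diagnostics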
def seq2seq_train(args):

    ap = ArgumentParser()

    ap.add_argument('-s', '--train-size', type=float,
                    help='Portion of the training data to use (default: 1.0)', default=1.0)
    ap.add_argument('-d', '--debug-logfile', type=str, help='Debug output file name')
    ap.add_argument('-j', '--jobs', type=int, help='Number of parallel jobs to use')
    ap.add_argument('-w', '--work-dir', type=str, help='Main working directory for parallel jobs')
    ap.add_argument('-e', '--experiment-id', type=str,
                    help='Experiment ID for parallel jobs (used as job name prefix)')
    ap.add_argument('-r', '--random-seed', type=str, help='Initial random seed (used as string).')
    ap.add_argument('-c', '--context-file', type=str,
                    help='Input ttree/text file with context utterances')
    ap.add_argument('-v', '--valid-data', type=str,
                    help='Validation data paths (2-3 comma-separated files: '
                         'DAs, trees/sentences, contexts)')

    ap.add_argument('seq2seq_config_file', type=str, help='Seq2Seq generator configuration file')
    ap.add_argument('da_train_file', type=str, help='Input training DAs')
    ap.add_argument('tree_train_file', type=str, help='Input training trees/sentences')
    ap.add_argument('seq2seq_model_file', type=str,
                    help='File name where to save the trained Seq2Seq generator model')

    args = ap.parse_args(args)

    if args.debug_logfile:
        set_debug_stream(file_stream(args.debug_logfile, mode='w'))
    if args.random_seed:
        rnd.seed(args.random_seed)

    log_info('Training sequence-to-sequence generator...')

    config = Config(args.seq2seq_config_file)
    if args.jobs:  # parallelize when training
        config['jobs_number'] = args.jobs
        if not args.work_dir:
            work_dir, _ = os.path.split(args.seq2seq_config_file)
        generator = ParallelSeq2SeqTraining(config, args.work_dir or work_dir, args.experiment_id)
    else:  # just a single training instance
        generator = Seq2SeqGen(config)

    generator.train(args.da_train_file, args.tree_train_file,
                    data_portion=args.train_size, context_file=args.context_file,
                    validation_files=args.valid_data)

    # avoid the "maximum recursion depth exceeded" error
    sys.setrecursionlimit(100000)
    generator.save_to_file(args.seq2seq_model_file)
def train(args):
    if args.random_seed:  # set random seed if needed
        rnd.seed(args.random_seed)

    log_info("Loading configuration from %s..." % args.config_file)
    with codecs.open(args.config_file, 'r', 'UTF-8') as fh:
        cfg = yaml.load(fh, Loader=yaml.FullLoader)  # PyYAML >= 6 requires an explicit Loader

    log_info("Initializing...")
    rp = RatingPredictor(cfg)
    if args.tensorboard_dir_id is not None:
        tb_dir, run_id = args.tensorboard_dir_id.split(':', 1)
        rp.set_tensorboard_logging(tb_dir, run_id)

    log_info("Training...")
    rp.train(args.train_data,
             valid_data_file=args.valid_data,
             data_portion=args.training_portion,
             model_fname=args.model_file)

    log_info("Saving model to %s..." % args.model_file)
    rp.save_to_file(args.model_file)
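
# Sketch of the tensorboard_dir_id convention used above (hypothetical values):
# the argument packs "directory:run_id" into one string, split on the first
# colon only, so the run ID itself may contain further colons.
def _tensorboard_dir_id_example():
    tb_dir, run_id = 'runs/tb:experiment-01'.split(':', 1)
    assert (tb_dir, run_id) == ('runs/tb', 'experiment-01')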