def main():
    import onlinePreprocess
    onlinePreprocess.lower = opt.lower_input
    onlinePreprocess.seq_length = opt.max_sent_length
    onlinePreprocess.shuffle = 1 if opt.process_shuffle else 0
    from onlinePreprocess import prepare_data_online
    dataset = prepare_data_online(opt.train_src, opt.src_vocab, opt.train_bio, opt.bio_vocab,
                                  opt.train_feats, opt.feat_vocab, opt.train_tgt, opt.tgt_vocab)

    trainData = s2s.Dataset(dataset['train']['src'], dataset['train']['bio'], dataset['train']['feats'],
                            dataset['train']['tgt'], dataset['train']['switch'], dataset['train']['c_tgt'],
                            opt.batch_size, opt.gpus)

    dicts = dataset['dicts']
    logger.info(' * vocabulary size. source = %d; target = %d' % (dicts['src'].size(), dicts['tgt'].size()))
    logger.info(' * number of training sentences. %d' % len(dataset['train']['src']))
    logger.info(' * maximum batch size. %d' % opt.batch_size)

    logger.info('Building model...')

    encoder = s2s.Models.Encoder(opt, dicts['src'])
    decoder = s2s.Models.Decoder(opt, dicts['tgt'])
    decIniter = s2s.Models.DecInit(opt)

    generator = nn.Sequential(
        nn.Linear(opt.dec_rnn_size // opt.maxout_pool_size, dicts['tgt'].size()),
        # TODO: fix here
        # nn.LogSoftmax(dim=1)
        nn.Softmax(dim=1))

    model = s2s.Models.NMTModel(encoder, decoder, decIniter)
    model.generator = generator
    translator = s2s.Translator(opt, model, dataset)

    if len(opt.gpus) >= 1:
        model.cuda()
        generator.cuda()
    else:
        model.cpu()
        generator.cpu()

    # if len(opt.gpus) > 1:
    #     model = nn.DataParallel(model, device_ids=opt.gpus, dim=1)
    #     generator = nn.DataParallel(generator, device_ids=opt.gpus, dim=0)

    for pr_name, p in model.named_parameters():
        logger.info(pr_name)
        # p.data.uniform_(-opt.param_init, opt.param_init)
        if p.dim() == 1:
            # p.data.zero_()
            p.data.normal_(0, math.sqrt(6 / (1 + p.size(0))))
        else:
            nn.init.xavier_normal_(p, math.sqrt(3))

    encoder.load_pretrained_vectors(opt)
    decoder.load_pretrained_vectors(opt)

    optim = s2s.Optim(
        opt.optim, opt.learning_rate,
        max_grad_norm=opt.max_grad_norm,
        max_weight_value=opt.max_weight_value,
        lr_decay=opt.learning_rate_decay,
        start_decay_at=opt.start_decay_at,
        decay_bad_count=opt.halve_lr_bad_count
    )
    optim.set_parameters(model.parameters())

    validData = None
    if opt.dev_input_src and opt.dev_ref:
        validData = load_dev_data(translator, opt.dev_input_src, opt.dev_bio, opt.dev_feats, opt.dev_ref)
    trainModel(model, translator, trainData, validData, dataset, optim)
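
# Note: the per-parameter init loop above (fan-scaled normal for 1-D parameters,
# Xavier-normal with gain sqrt(3) for weight matrices) recurs in every training
# script in this section. A minimal standalone sketch of the same scheme;
# `init_parameters` is an illustrative name, not part of the repo.
import math

import torch.nn as nn


def init_parameters(model: nn.Module) -> None:
    for name, p in model.named_parameters():
        if p.dim() == 1:
            # Biases / 1-D vectors: zero-mean normal, std shrinking with fan size.
            p.data.normal_(0, math.sqrt(6 / (1 + p.size(0))))
        else:
            # Weight matrices: Xavier-normal initialization with gain sqrt(3).
            nn.init.xavier_normal_(p, gain=math.sqrt(3))
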
def main():
    import onlinePreprocess
    onlinePreprocess.seq_length = opt.max_sent_length
    onlinePreprocess.MAX_LDA_WORDS = opt.max_lda_words
    onlinePreprocess.shuffle = 1 if opt.process_shuffle else 0
    from onlinePreprocess import prepare_data_online
    dataset = prepare_data_online(opt.train_src, opt.src_vocab, opt.train_tgt, opt.tgt_vocab,
                                  opt.train_lda, opt.lda_vocab)

    dict_checkpoint = opt.train_from if opt.train_from else opt.train_from_state_dict
    if dict_checkpoint:
        logger.info('Loading dicts from checkpoint at %s' % dict_checkpoint)
        checkpoint = torch.load(dict_checkpoint)
        dataset['dicts'] = checkpoint['dicts']

    trainData = s2s.Dataset(dataset['train']['src'], dataset['train']['eq_mask'], dataset['train']['lda'],
                            dataset['train']['tgt'], opt.batch_size, opt.gpus)
    # validData = s2s.Dataset(dataset['valid']['src'], dataset['valid']['bio'], dataset['valid']['tgt'],
    #                         None, None, opt.batch_size, opt.gpus, volatile=True)

    dicts = dataset['dicts']
    logger.info(' * vocabulary size. source = %d; target = %d' % (dicts['src'].size(), dicts['tgt'].size()))
    logger.info(' * number of training sentences. %d' % len(dataset['train']['src']))
    logger.info(' * maximum batch size. %d' % opt.batch_size)

    logger.info('Building model...')

    encoder = s2s.Models.Encoder(opt, dicts['src'])
    topic_encoder = s2s.Models.TopicEncoder(opt, dicts['lda'])
    decoder = s2s.Models.MPGDecoder(opt, dicts['tgt'])
    decIniter = s2s.Models.DecInit(opt)

    generator = nn.Sequential(
        nn.Linear(opt.dec_rnn_size // opt.maxout_pool_size, dicts['tgt'].size()),
        # TODO: fix here
        nn.LogSoftmax(dim=1))

    model = s2s.Models.NMTModel(encoder, topic_encoder, decoder, decIniter)
    model.generator = generator
    translator = s2s.Translator(opt, model, dataset)

    if opt.train_from:
        logger.info('Loading model from checkpoint at %s' % opt.train_from)
        chk_model = checkpoint['model']
        generator_state_dict = chk_model.generator.state_dict()
        model_state_dict = {k: v for k, v in chk_model.state_dict().items() if 'generator' not in k}
        model.load_state_dict(model_state_dict)
        generator.load_state_dict(generator_state_dict)
        opt.start_epoch = checkpoint['epoch'] + 1

    if opt.train_from_state_dict:
        logger.info('Loading model from checkpoint at %s' % opt.train_from_state_dict)
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
        opt.start_epoch = checkpoint['epoch'] + 1

    if len(opt.gpus) >= 1:
        model.cuda()
        generator.cuda()
    else:
        model.cpu()
        generator.cpu()

    # if len(opt.gpus) > 1:
    #     model = nn.DataParallel(model, device_ids=opt.gpus, dim=1)
    #     generator = nn.DataParallel(generator, device_ids=opt.gpus, dim=0)

    if not opt.train_from_state_dict and not opt.train_from:
        for pr_name, p in model.named_parameters():
            logger.info(pr_name)
            # p.data.uniform_(-opt.param_init, opt.param_init)
            if p.dim() == 1:
                # p.data.zero_()
                p.data.normal_(0, math.sqrt(6 / (1 + p.size(0))))
            else:
                nn.init.xavier_normal_(p, math.sqrt(3))

        encoder.load_pretrained_vectors(opt)
        decoder.load_pretrained_vectors(opt)

        optim = s2s.Optim(
            opt.optim, opt.learning_rate,
            max_grad_norm=opt.max_grad_norm,
            max_weight_value=opt.max_weight_value,
            lr_decay=opt.learning_rate_decay,
            start_decay_at=opt.start_decay_at,
            decay_bad_count=opt.halve_lr_bad_count
        )
    else:
        logger.info('Loading optimizer from checkpoint:')
        optim = checkpoint['optim']
        logger.info(optim)

    optim.set_parameters(model.parameters())

    if opt.train_from or opt.train_from_state_dict:
        optim.optimizer.load_state_dict(checkpoint['optim'].optimizer.state_dict())

    validData = None
    if opt.dev_input_src and opt.dev_ref:
        validData = load_dev_data(translator, opt.dev_input_src, opt.dev_input_lda, opt.dev_ref)
    testData = None  # default when no test set is configured
    if opt.test_input_src and opt.test_ref:
        testData = load_dev_data(translator, opt.test_input_src, opt.test_input_lda, opt.test_ref)
    trainModel(model, translator, trainData, validData, testData, dataset, optim)
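
# The resume logic above implies a checkpoint dict with keys 'model', 'generator',
# 'dicts', 'epoch', and 'optim'. A sketch of the matching save side under that
# assumption (the real save call lives in trainModel, which is not shown here;
# `save_checkpoint` is an illustrative name). Note the two load paths expect
# different formats: -train_from stores the whole module under 'model', while
# -train_from_state_dict stores state dicts, as sketched here.
import torch


def save_checkpoint(path, model, generator, dicts, epoch, optim):
    torch.save({
        'model': model.state_dict(),        # format expected by -train_from_state_dict
        'generator': generator.state_dict(),
        'dicts': dicts,
        'epoch': epoch,
        'optim': optim,
    }, path)
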
def main():
    if not opt.online_process_data:
        raise Exception(
            'This code does not use a preprocessed .pt pickle file. It has some issues with big files.')
        # dataset = torch.load(opt.data)
    else:
        import onlinePreprocess
        onlinePreprocess.seq_length = opt.max_sent_length
        onlinePreprocess.max_doc_len = opt.max_doc_len
        onlinePreprocess.shuffle = 1 if opt.process_shuffle else 0
        onlinePreprocess.norm_lambda = opt.norm_lambda
        from onlinePreprocess import prepare_data_online
        dataset = prepare_data_online(opt.train_src, opt.src_vocab, opt.train_tgt, opt.tgt_vocab,
                                      opt.train_oracle, opt.train_src_rouge)

    trainData = neusum.Dataset(dataset['train']['src'], dataset['train']['src_raw'],
                               dataset['train']['tgt'],
                               dataset['train']['oracle'], dataset['train']['src_rouge'],
                               opt.batch_size, opt.max_doc_len, opt.gpus)

    dicts = dataset['dicts']
    logger.info(' * vocabulary size. source = %d; target = %d' % (dicts['src'].size(), dicts['tgt'].size()))
    logger.info(' * number of training sentences. %d' % len(dataset['train']['src']))
    logger.info(' * maximum batch size. %d' % opt.batch_size)

    logger.info('Building model...')

    sent_encoder = neusum.Models.Encoder(opt, dicts['src'])
    doc_encoder = neusum.Models.DocumentEncoder(opt)
    pointer = neusum.Models.Pointer(opt, dicts['tgt'])
    if opt.dec_init == "simple":
        decIniter = neusum.Models.DecInit(opt)
    elif opt.dec_init == "att":
        decIniter = neusum.Models.DecInitAtt(opt)
    else:
        raise ValueError('Unknown decoder init method: {0}'.format(opt.dec_init))

    model = neusum.Models.NMTModel(sent_encoder, doc_encoder, pointer, decIniter, rouge_calculator)
    summarizer = neusum.Summarizer(opt, model, dataset)

    if len(opt.gpus) >= 1:
        model.cuda()
    else:
        model.cpu()

    if opt.freeze_word_vecs_enc:
        logger.warning('Not updating encoder word embedding.')

    for pr_name, p in model.named_parameters():
        logger.info(pr_name)
        # p.data.uniform_(-opt.param_init, opt.param_init)
        if p.dim() == 1:
            # p.data.zero_()
            p.data.normal_(0, math.sqrt(6 / (1 + p.size(0))))
        else:
            xavier_normal(p, math.sqrt(3))
            # xavier_uniform(p)

    sent_encoder.load_pretrained_vectors(opt, logger)

    optim = neusum.Optim(
        opt.optim, opt.learning_rate,
        max_grad_norm=opt.max_grad_norm,
        max_weight_value=opt.max_weight_value,
        lr_decay=opt.learning_rate_decay,
        start_decay_at=opt.start_decay_at,
        decay_bad_count=opt.halve_lr_bad_count
    )
    optim.set_parameters(model.parameters())

    validData = None
    if opt.dev_input_src and opt.dev_ref:
        validData = load_dev_data(summarizer, opt.dev_input_src, opt.dev_ref)
    trainModel(model, summarizer, trainData, validData, dataset, optim)
def main():
    import onlinePreprocess
    onlinePreprocess.lower = opt.lower_input
    onlinePreprocess.seq_length = opt.max_sent_length
    onlinePreprocess.shuffle = 1 if opt.process_shuffle else 0
    from onlinePreprocess import prepare_data_online
    # Input files, with one example line each:
    # opt.train_src (source sequence file):
    #   'it is a replica of the grotto at lourdes , france where the virgin mary reputedly appeared to saint bernadette soubirous in 1858 .'
    # opt.src_vocab (source vocab file, 'word index frequency probability'):
    #   'the 4 256272 0.06749202214022335'
    # opt.train_bio (answer position tags): 'O O O O O O O O O O O O O O O O O O B I I O O O'
    # opt.bio_vocab (answer position vocab): 'O 4 2525015 0.8958601572376024'
    # opt.train_feats (POS tag / NER / case features, three separate embeddings): 'PERSON/UPCASE/NN ...'
    # opt.feat_vocab (answer feature vocab)
    # opt.train_tgt (target question file): 'to whom did the virgin mary allegedly appear in 1858 in lourdes france ?'
    # opt.tgt_vocab (target vocab file): the same file as opt.src_vocab!
    dataset = prepare_data_online(opt.train_src, opt.src_vocab, opt.train_bio, opt.bio_vocab,
                                  opt.train_feats, opt.feat_vocab, opt.train_tgt, opt.tgt_vocab,
                                  opt.train_guide_src, opt.guide_src_vocab)

    trainData = s2s.Dataset(dataset['train']['src'], dataset['train']['bio'], dataset['train']['feats'],
                            dataset['train']['tgt'], dataset['train']['switch'], dataset['train']['c_tgt'],
                            opt.batch_size, opt.gpus, dataset['train']['guide_src'])

    dicts = dataset['dicts']
    logger.info(' * vocabulary size. source = %d; target = %d' % (dicts['src'].size(), dicts['tgt'].size()))
    logger.info(' * number of training sentences. %d' % len(dataset['train']['src']))
    logger.info(' * maximum batch size. %d' % opt.batch_size)

    logger.info('Building model...')

    encoder = s2s.Models.Encoder(opt, dicts['src'], dicts['guide_src'])
    decoder = s2s.Models.Decoder(opt, dicts['tgt'])
    decIniter = s2s.Models.DecInit(opt)

    # The generator maps the decoder output to a vocab-size vector, then applies softmax.
    generator = nn.Sequential(
        nn.Linear(opt.dec_rnn_size // opt.maxout_pool_size, dicts['tgt'].size()),
        nn.Softmax(dim=1))
    classifier = nn.Sequential(
        nn.Linear(opt.dec_rnn_size + 300, dicts['guide_src'].size()),
        nn.Softmax(dim=1))
    nlu_generator = nn.Sequential(
        nn.Linear(opt.dec_rnn_size * 2, dicts['guide_src'].size()),
        nn.Softmax(dim=1))

    model = s2s.Models.NMTModel(encoder, decoder, decIniter)
    model.generator = generator
    model.classifier = classifier
    model.nlu_generator = nlu_generator
    translator = s2s.Translator(opt, model, dataset)

    if len(opt.gpus) >= 1:
        model.cuda()
        generator.cuda()
        classifier.cuda()
        nlu_generator.cuda()
    else:
        model.cpu()
        generator.cpu()
        classifier.cpu()
        nlu_generator.cpu()

    # if len(opt.gpus) > 1:
    #     model = nn.DataParallel(model, device_ids=opt.gpus, dim=1)
    #     generator = nn.DataParallel(generator, device_ids=opt.gpus, dim=0)

    for pr_name, p in model.named_parameters():
        logger.info(pr_name)
        # p.data.uniform_(-opt.param_init, opt.param_init)
        if p.dim() == 1:
            # p.data.zero_()
            p.data.normal_(0, math.sqrt(6 / (1 + p.size(0))))
        else:
            nn.init.xavier_normal_(p, math.sqrt(3))

    encoder.load_pretrained_vectors(opt)
    decoder.load_pretrained_vectors(opt)

    optim = s2s.Optim(
        opt.optim, opt.learning_rate,
        max_grad_norm=opt.max_grad_norm,
        max_weight_value=opt.max_weight_value,
        lr_decay=opt.learning_rate_decay,
        start_decay_at=opt.start_decay_at,
        decay_bad_count=opt.halve_lr_bad_count
    )
    optim.set_parameters(model.parameters())

    validData = None
    if opt.dev_input_src and opt.dev_ref:
        validData = load_dev_data(translator, opt.dev_input_src, opt.dev_bio, opt.dev_feats,
                                  opt.dev_ref, opt.dev_guide_src)
    testData = None
    if opt.test_input_src and opt.test_ref:
        testData = load_dev_data(translator, opt.test_input_src, opt.test_bio, opt.test_feats,
                                 opt.test_ref, opt.test_guide_src)
    trainModel(model, translator, trainData, validData, testData, dataset, optim)
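
# The vocab files documented above are whitespace-separated lines of
# 'word index frequency probability'. A sketch of a reader for that format,
# assuming the layout shown in the comments; `load_vocab` is an illustrative
# name (the repo builds its dictionaries inside prepare_data_online).
def load_vocab(path):
    word2idx = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if len(fields) >= 2:
                # fields: word, index, frequency, probability; only the first two are needed here.
                word2idx[fields[0]] = int(fields[1])
    return word2idx
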
                         requires_grad=False)
        return scores, doc_sent_mask


if __name__ == "__main__":
    # test the modules
    import onlinePreprocess
    onlinePreprocess.seq_length = 80
    onlinePreprocess.max_doc_len = 500
    onlinePreprocess.shuffle = 0
    onlinePreprocess.norm_lambda = 20
    from onlinePreprocess import prepare_data_online
    dataset = prepare_data_online("../../data/train/future/train.txt.src", None, None, None,
                                  "../../data/train/future/train.txt.oracle", None,
                                  "../../data/train/future/train.txt.section", 10, 500, '')
    trainData = neusum.Dataset(
        dataset['train']['src'], dataset['train']['src_raw'],
        dataset['train']['tgt'],
        dataset['train']['oracle'], dataset['train']['src_rouge'],
        dataset['train']['src_section'], dataset['train']['src_section_raw'],
        4, 500, None)
    model = LogLinear()
    # model.set_rules(1.0, 1.0, ['future'], 1.0)
    from loglinear.Config import Keyword, PossibleSection
    # use the average embedding of the section titles
    # model.set_rules(1.0, 1.0, Keyword['future'], 1.0, -1.0, [], True, '../glove/glove.6B.50d.txt')
if __name__ == "__main__":
    # Import here so these functions use the logger configured for testing.
    from train import evalModel, load_dev_data

    if opt.online_process_data:
        logger.info('Online preprocessing data (to get the vocabulary dictionary).')
        onlinePreprocess.seq_length = opt.max_sent_length
        onlinePreprocess.max_doc_len = opt.max_doc_len
        onlinePreprocess.shuffle = 1 if opt.process_shuffle else 0
        onlinePreprocess.norm_lambda = opt.norm_lambda
        from onlinePreprocess import prepare_data_online
        dataset = prepare_data_online(opt.train_src, opt.src_vocab, opt.train_tgt, opt.tgt_vocab,
                                      opt.train_oracle, opt.train_src_rouge, opt.train_src_section,
                                      opt.drop_too_short, opt.drop_too_long)
    else:
        logger.info('Using preprocessed data stored in the checkpoint.')
        dataset = {}  # passed to the summarizer, which only needs the 'dicts' part

    logger.info('Loading checkpoint...')
    if opt.specific_epoch > 0:
        model_selected = os.path.join(opt.save_path, 'model_epoch_%s.pt' % opt.specific_epoch)
        logger.info('Loading from the specific epoch checkpoint "%s"' % model_selected)
    else:
        # Find the latest model to load
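
# A sketch of what the elided "find the latest model" branch might do, assuming
# checkpoints follow the model_epoch_<N>.pt naming used above; `latest_checkpoint`
# is an illustrative name, not part of the repo.
import glob
import os
import re


def latest_checkpoint(save_path):
    best_path, best_epoch = None, -1
    for path in glob.glob(os.path.join(save_path, 'model_epoch_*.pt')):
        m = re.search(r'model_epoch_(\d+)\.pt$', path)
        if m and int(m.group(1)) > best_epoch:
            best_path, best_epoch = path, int(m.group(1))
    return best_path
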