def train():
    conf = configuration.Config()
    tokenizer = tokenization.FullTokenizer(
        vocab_file=conf.file_config.vocab_file)
    model = models.TransformerEncoder(conf)
    model = model.to(device)

    if args.train:
        train_dataset = datasets.OnlineShopping(
            mode='train',
            config=conf,
            tokenizer=tokenizer,
            auto_padding=conf.train_config.auto_padding)
        logging.info("***** Running training *****")
        logging.info(" Num examples = %d", len(train_dataset))
        logging.info(" Total training steps: {}".format(
            train_dataset.num_steps))
        train_dataloader = DataLoader(
            train_dataset,
            batch_size=conf.train_config.train_batch_size,
            shuffle=True,
            collate_fn=collate_fn)
        run(config=conf,
            dataloader=train_dataloader,
            model=model,
            mode='train',
            total_steps=train_dataset.num_steps)

    if args.dev:
        dev_dataset = datasets.OnlineShopping(
            mode='dev',
            config=conf,
            tokenizer=tokenizer,
            auto_padding=conf.train_config.auto_padding)
        logging.info("***** Running validation *****")
        logging.info(" Num examples = %d", len(dev_dataset))
        logging.info(" Total validation steps: {}".format(
            dev_dataset.num_steps))
        dev_dataloader = DataLoader(
            dev_dataset,
            batch_size=conf.train_config.train_batch_size,
            collate_fn=collate_fn)
        run(config=conf,
            dataloader=dev_dataloader,
            model=model,
            mode='eval')
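The dataloaders above pass a collate_fn that is not defined in this snippet. Below is a minimal sketch of what such a function could look like, assuming each dataset item is a (token_ids, label) pair and that index 0 is the padding id; the project's real collate_fn may differ.

import torch

def collate_fn(batch):
    """Pad variable-length token id sequences in a batch and stack labels.

    Sketch only: assumes items are (token_ids, label) and pad id 0.
    """
    token_ids, labels = zip(*batch)
    max_len = max(len(ids) for ids in token_ids)
    padded = torch.zeros(len(batch), max_len, dtype=torch.long)  # 0 = assumed pad id
    for i, ids in enumerate(token_ids):
        padded[i, :len(ids)] = torch.as_tensor(ids, dtype=torch.long)
    # In prediction mode labels may be absent; return None so callers can skip them.
    label_tensor = (torch.as_tensor(labels, dtype=torch.long)
                    if labels[0] is not None else None)
    return padded, label_tensor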
def predict(texts):
    conf = configuration.Config()
    tokenizer = tokenization.FullTokenizer(
        vocab_file=conf.file_config.vocab_file)
    model = models.TransformerEncoder(conf)
    model = model.to(device)

    model_path = os.path.join(conf.train_config.model_dir,
                              conf.train_config.model_name)
    if os.path.exists(model_path):
        logging.info(' *** Loading model ***')
        model.load_state_dict(torch.load(model_path))
    else:
        logging.info(' *** No model available. *** ')
        return

    predict_dataset = datasets.OnlineShopping(mode='single_predict',
                                              config=conf,
                                              tokenizer=tokenizer,
                                              auto_padding=True,
                                              texts=texts)
    predict_dataloader = DataLoader(predict_dataset,
                                    batch_size=len(predict_dataset),
                                    collate_fn=collate_fn)
    data = next(iter(predict_dataloader))
    text_ids, _ = [t.to(device) if t is not None else t for t in data]
    logits = model(text_ids)
    probs, predictions = get_predictions(logits)
    labels = [
        predict_dataset.convert_label_id_to_value(prediction.item())
        for prediction in predictions
    ]
    return dict(
        zip(texts, [{
            'result': label,
            'probability': prob.item()
        } for label, prob in zip(labels, probs)]))
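A quick usage sketch for predict(). The input strings are made-up examples (OnlineShopping suggests Chinese review text), and the exact label strings returned depend on the dataset's label vocabulary.

if __name__ == '__main__':
    # Hypothetical inputs; predict() returns {text: {'result': label, 'probability': prob}}.
    results = predict(["这个商品质量很好", "物流太慢了"])
    if results is not None:  # None is returned when no saved model is found
        for text, info in results.items():
            print(f"{text} -> {info['result']} ({info['probability']:.3f})")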
def __init__(self,
             config,
             use_attention=True,
             encoder=None,
             decoder=None,
             src_padding_idx=0,
             tgt_padding_idx=0,
             label_smoothing=0,
             tgt_vocab=None):
    """
    Initialization of variables and functions
    :param config: configuration
    :param use_attention: use attention or not, consistent with seq2seq
    :param encoder: encoder
    :param decoder: decoder
    :param src_padding_idx: source padding index
    :param tgt_padding_idx: target padding index
    :param label_smoothing: ratio for label smoothing
    :param tgt_vocab: target vocabulary
    """
    super(tensor2tensor, self).__init__()
    self.config = config

    # pretrained encoder or not
    if encoder is not None:
        self.encoder = encoder
    else:
        self.encoder = models.TransformerEncoder(
            config, padding_idx=src_padding_idx)
    tgt_embedding = self.encoder.embedding if config.shared_vocab else None

    # pretrained decoder or not
    if decoder is not None:
        self.decoder = decoder
    else:
        self.decoder = models.TransformerDecoder(
            config, tgt_embedding=tgt_embedding, padding_idx=tgt_padding_idx)

    # log softmax should specify the dimension explicitly
    self.log_softmax = nn.LogSoftmax(dim=-1)
    self.use_cuda = config.use_cuda
    self.label_smoothing = label_smoothing
    if self.label_smoothing > 0:
        self.criterion = LabelSmoothingLoss(label_smoothing,
                                            config.tgt_vocab_size,
                                            ignore_index=tgt_padding_idx)
    else:
        self.criterion = nn.CrossEntropyLoss(ignore_index=utils.PAD)
    if config.use_cuda:
        self.criterion.cuda()
    self.compute_score = nn.Linear(config.hidden_size, config.tgt_vocab_size)

    # Use rl or not. Should specify a reward provider. Not available yet in this framework.
    # if config.rl:
    #     self.bleu_scorer = bleu.Scorer(pad=0, eos=3, unk=1)
    #     self.reward_provider = CTRRewardProvider(
    #         config.ctr_reward_provider_path)
    #     self.tgt_vocab = tgt_vocab
    self.padding_idx = tgt_padding_idx
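LabelSmoothingLoss is referenced above but not defined in this snippet. The following is a self-contained sketch of a standard label-smoothing criterion with the same constructor arguments (smoothing ratio, target vocabulary size, ignore_index); the project's own class may differ in detail.

import torch
import torch.nn as nn
import torch.nn.functional as F

class LabelSmoothingLoss(nn.Module):
    """KL divergence against a smoothed one-hot target distribution (sketch)."""

    def __init__(self, label_smoothing, tgt_vocab_size, ignore_index=-100):
        super().__init__()
        assert 0.0 < label_smoothing <= 1.0
        self.ignore_index = ignore_index
        # Spread the smoothing mass over all tokens except the target and padding.
        self.smoothing_value = label_smoothing / (tgt_vocab_size - 2)
        self.confidence = 1.0 - label_smoothing

    def forward(self, output, target):
        # output: (batch, vocab) raw scores; target: (batch,) gold token ids
        log_probs = F.log_softmax(output, dim=-1)
        model_prob = torch.full_like(log_probs, self.smoothing_value)
        model_prob.scatter_(1, target.unsqueeze(1), self.confidence)
        # Zero out rows whose target is the padding index so they contribute no loss.
        model_prob.masked_fill_((target == self.ignore_index).unsqueeze(1), 0.0)
        return F.kl_div(log_probs, model_prob, reduction='sum')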
    desc_encoder = models.CNNEncoder(len(DESC.vocab), args.emb_dim,
                                     args.filter_size, args.n_layers,
                                     args.dropout, device)
    code_pooler = models.EmbeddingPooler(args.emb_dim, args.pool_mode)
    desc_pooler = models.EmbeddingPooler(args.emb_dim, args.pool_mode)
elif args.model == 'transformer':
    code_pad_idx = CODE.vocab.stoi[CODE.pad_token]
    desc_pad_idx = DESC.vocab.stoi[DESC.pad_token]
    code_encoder = models.TransformerEncoder(len(CODE.vocab), args.emb_dim,
                                             args.hid_dim, args.n_layers,
                                             args.n_heads, args.dropout,
                                             code_pad_idx, device)
    desc_encoder = models.TransformerEncoder(len(DESC.vocab), args.emb_dim,
                                             args.hid_dim, args.n_layers,
                                             args.n_heads, args.dropout,
                                             desc_pad_idx, device)
    code_pooler = models.EmbeddingPooler(args.emb_dim, args.pool_mode)
    desc_pooler = models.EmbeddingPooler(args.emb_dim, args.pool_mode)
else:
    raise ValueError(f'Model {args.model} not valid!')

if args.model == 'transformer':
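models.EmbeddingPooler is used above but not defined in this snippet. Below is a rough sketch of a pooler that collapses a sequence of token embeddings into a single vector, supporting two plausible pool_mode values; the project's actual module may accept different modes or tensor layouts.

import torch
import torch.nn as nn

class EmbeddingPooler(nn.Module):
    """Pool (seq_len, batch, emb_dim) token embeddings into (batch, emb_dim) (sketch)."""

    def __init__(self, emb_dim, pool_mode='max'):
        super().__init__()
        assert pool_mode in ('max', 'mean')  # assumed set of modes
        self.emb_dim = emb_dim
        self.pool_mode = pool_mode

    def forward(self, embeddings, mask=None):
        # embeddings: (seq_len, batch, emb_dim); mask: (batch, seq_len), 1 for real tokens
        if self.pool_mode == 'max':
            if mask is not None:
                # Exclude padded positions from the max.
                embeddings = embeddings.masked_fill(
                    mask.permute(1, 0).unsqueeze(-1) == 0, float('-inf'))
            return embeddings.max(dim=0).values
        if mask is None:
            return embeddings.mean(dim=0)
        mask = mask.permute(1, 0).unsqueeze(-1).float()
        return (embeddings * mask).sum(dim=0) / mask.sum(dim=0).clamp(min=1.0)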
valid_data = utils.batchify(valid_data, args.batch_size)
test_data = utils.batchify(test_data, args.batch_size)

print('train/valid/test shape',
      [x.shape for x in [train_data, valid_data, test_data]])

# data is [length, batch size]

device = torch.device('cuda')

if args.model == 'transformer':
    pad_idx = vocab[PAD_TOKEN]
    model = models.TransformerEncoder(len(vocab), args.emb_dim, args.hid_dim,
                                      args.n_layers, args.n_heads,
                                      args.dropout, pad_idx, device)
else:
    raise ValueError

if args.model == 'transformer':
    model.apply(utils.initialize_transformer)
else:
    raise ValueError

language_model = models.LanguageModel(model, args.emb_dim, len(vocab))

print(f'Language model parameters: {utils.count_parameters(language_model):,}')
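utils.batchify is not shown here. A common implementation (as in the PyTorch word-level language-model example) trims the flat token stream to a multiple of the batch size and reshapes it to [length, batch size], which matches the shape comment above; the sketch below is an assumption about what the helper does.

import torch

def batchify(data, batch_size):
    """Reshape a 1-D tensor of token ids into [length, batch_size] columns (sketch)."""
    n_batches = data.size(0) // batch_size
    data = data[:n_batches * batch_size]                # drop the ragged tail
    return data.view(batch_size, -1).t().contiguous()   # -> [length, batch size]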
def __init__(self,
             config,
             use_attention=True,
             encoder=None,
             decoder=None,
             src_padding_idx=0,
             tgt_padding_idx=0,
             label_smoothing=0,
             tgt_vocab=None):  # tgt_vocab: whether the target vocab used for generated descriptions is fixed
    """
    Initialization of variables and functions
    :param config: configuration
    :param use_attention: use attention or not, consistent with seq2seq
        (the decoder's attention over the encoder)
    :param encoder: encoder
    :param decoder: decoder
    :param src_padding_idx: source padding index
    :param tgt_padding_idx: target padding index
    :param label_smoothing: ratio for label smoothing
    :param tgt_vocab: target vocabulary
    """
    super(tensor2tensor, self).__init__()  # call the parent constructor
    self.config = config

    # pretrained encoder or not
    if encoder is not None:
        self.encoder = encoder  # pretrained
    else:
        self.encoder = models.TransformerEncoder(
            config, padding_idx=src_padding_idx)
    if self.config.knowledge:
        # HACK: we use tgt_vocab for knowledge instead of src_vocab,
        # i.e. the knowledge-graph tokens come from the description (target) vocabulary
        src_vocab_size = config.src_vocab_size
        config.src_vocab_size = config.tgt_vocab_size
        self.knowledge_encoder = models.TransformerEncoder(
            config, padding_idx=src_padding_idx)
        config.src_vocab_size = src_vocab_size
    tgt_embedding = self.encoder.embedding if config.shared_vocab else None

    # pretrained decoder or not; this decides which embedding the decoder uses
    if decoder is not None:
        self.decoder = decoder
    else:
        self.decoder = models.TransformerDecoder(
            config, tgt_embedding=tgt_embedding, padding_idx=tgt_padding_idx)

    # log softmax should specify the dimension explicitly
    self.log_softmax = nn.LogSoftmax(dim=-1)
    self.use_cuda = config.use_cuda
    self.label_smoothing = label_smoothing
    if self.label_smoothing > 0:
        # apply label smoothing
        self.criterion = LabelSmoothingLoss(label_smoothing,
                                            config.tgt_vocab_size,
                                            ignore_index=tgt_padding_idx)
    else:
        self.criterion = nn.CrossEntropyLoss(ignore_index=utils.PAD)
    if config.use_cuda:
        self.criterion.cuda()
    # fully connected projection to the target vocabulary, applied before the softmax
    self.compute_score = nn.Linear(config.hidden_size, config.tgt_vocab_size)

    # Reinforcement learning is not used for now.
    # Use rl or not. Should specify a reward provider. Not available yet in this framework.
    # if config.rl:
    #     self.bleu_scorer = bleu.Scorer(pad=0, eos=3, unk=1)
    #     self.reward_provider = CTRRewardProvider(
    #         config.ctr_reward_provider_path)
    #     self.tgt_vocab = tgt_vocab
    self.padding_idx = tgt_padding_idx  # padding index on the output side