def do_infer_sent(args):
    if not os.path.exists(args.name + '.token'):
        logging.error('missing {} file (run preprocess mode)'.format(args.name + '.token'))
        sys.exit()
    if not os.path.exists(args.name + '.vocab'):
        logging.error('missing {} file (run preprocess mode)'.format(args.name + '.vocab'))
        sys.exit()
    if len(glob.glob(args.name + '.model.?????????.pth')) == 0:
        logging.error('no model available: {}'.format(args.name + '.model.?????????.pth'))
        sys.exit()

    token = OpenNMTTokenizer(args.name + '.token')
    vocab = Vocab()
    vocab.read(args.name + '.vocab')
    args.embedding_size, args.pooling = read_params(args)
    model = Word2Vec(len(vocab), args.embedding_size, args.pooling, vocab.idx_unk)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=(args.beta1, args.beta2), eps=args.eps)
    n_steps, model, optimizer = load_model_optim(args.name, args.embedding_size, vocab, model, optimizer)
    if args.cuda:
        model.cuda()

    dataset = Dataset(args, token, vocab, 'infer_sent', skip_subsampling=True)
    with torch.no_grad():
        model.eval()
        for batch in dataset:
            snts = model.SentEmbed(batch[0], batch[1], 'iEmb').cpu().detach().numpy().tolist()
            for i in range(len(snts)):
                sentence = ["{:.6f}".format(w) for w in snts[i]]
                print('{}\t{}'.format(batch[2][i] + 1, ' '.join(sentence)))
def do_infer_word(args):
    if not os.path.exists(args.name + '.token'):
        logging.error('missing {} file (run preprocess mode)'.format(args.name + '.token'))
        sys.exit()
    if not os.path.exists(args.name + '.vocab'):
        logging.error('missing {} file (run preprocess mode)'.format(args.name + '.vocab'))
        sys.exit()
    if len(glob.glob(args.name + '.model.?????????.pth')) == 0:
        logging.error('no model available: {}'.format(args.name + '.model.?????????.pth'))
        sys.exit()

    token = OpenNMTTokenizer(args.name + '.token')
    vocab = Vocab()
    vocab.read(args.name + '.vocab')
    args.embedding_size, args.pooling = read_params(args)
    model = Word2Vec(len(vocab), args.embedding_size, args.pooling, vocab.idx_unk)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=(args.beta1, args.beta2), eps=args.eps)
    n_steps, model, optimizer = load_model_optim(args.name, args.embedding_size, vocab, model, optimizer)
    if args.cuda:
        model.cuda()

    if args.sim == 'cos':
        distance = nn.CosineSimilarity(dim=1, eps=1e-6)
    elif args.sim == 'pairwise':
        distance = nn.PairwiseDistance(eps=1e-6)
    else:
        logging.error('bad -sim option {}'.format(args.sim))
        sys.exit()

    dataset = Dataset(args, token, vocab, 'infer_word', skip_subsampling=True)
    with torch.no_grad():
        model.eval()
        voc_i = [i for i in range(0, len(vocab))]
        voc_e = model.Embed(voc_i, 'iEmb')
        for batch in dataset:
            # batch[0]: batch_wrd, batch[1]: batch_isnt, batch[2]: batch_iwrd
            wrd_i = batch[0]
            wrd_e = model.Embed(wrd_i, 'iEmb')
            for i in range(len(wrd_i)):  # words whose closest neighbours we look for
                ind_snt = batch[1][i]
                ind_wrd = batch[2][i]
                wrd = vocab[wrd_i[i]]
                out = []
                out.append("{}:{}:{}".format(ind_snt, ind_wrd, wrd))
                dist_wrd_voc = distance(wrd_e[i].unsqueeze(0), voc_e)
                mininds = torch.argsort(dist_wrd_voc, dim=0, descending=True)
                for k in range(1, len(mininds)):
                    ind = mininds[k].item()
                    if wrd_i[i] != ind:  # skip the query word itself (original compared the batch position i with the vocab index)
                        dis = dist_wrd_voc[ind].item()
                        wrd = vocab[ind]
                        out.append("{:.6f}:{}".format(dis, wrd))
                        if len(out) - 1 == args.k:
                            break
                print('\t'.join(out))
def __init__(self, **vocab_kwargs):
    self.vocab = {
        'rel': vocab_kwargs.get('rel', Vocab()),
        'ner': vocab_kwargs.get('ner', Vocab(unk='O')),
        'dep': vocab_kwargs.get('dep', Vocab()),
        'pos': vocab_kwargs.get('pos', Vocab(unk='.')),
        'word': vocab_kwargs.get('word', Vocab(unk='UNKNOWN')),
    }
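# A hypothetical, minimal sketch of the Vocab interface implied by the constructor above
# (an optional `unk` keyword plus token/id lookup). Attribute and method names here
# (idx2token, token2idx, idx_unk, add) are illustrative assumptions, not the original
# implementation used by these snippets.
class Vocab:
    def __init__(self, unk='<unk>'):
        self.idx2token = [unk]          # index 0 reserved for the unknown token
        self.token2idx = {unk: 0}
        self.idx_unk = 0

    def add(self, token):
        # register a token and return its index
        if token not in self.token2idx:
            self.token2idx[token] = len(self.idx2token)
            self.idx2token.append(token)
        return self.token2idx[token]

    def __getitem__(self, key):
        # int -> token, str -> index (unknown index as fallback)
        if isinstance(key, int):
            return self.idx2token[key]
        return self.token2idx.get(key, self.idx_unk)

    def __len__(self):
        return len(self.idx2token)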
def load_data(model_type, pd):
    multi_sense, n_sense = set_sense_paras(model_type, pd)
    x_vocab = Vocab(pd['x_vocab_file'], multi_sense, n_sense)
    y_vocab = Vocab(pd['y_vocab_file'], False, 1)
    train_data = RelationData(pd['train_data_file'], multi_sense, n_sense)
    test_data = RelationData(pd['test_data_file'], multi_sense, n_sense)
    train_data.gen_multinomial_dist(y_vocab.size())
    return train_data, test_data, x_vocab, y_vocab
def do_train(args):
    if not os.path.exists(args.name + '.token'):
        logging.error('missing {} file (run preprocess mode)'.format(args.name + '.token'))
        sys.exit()
    if not os.path.exists(args.name + '.vocab'):
        logging.error('missing {} file (run preprocess mode)'.format(args.name + '.vocab'))
        sys.exit()

    token = OpenNMTTokenizer(args.name + '.token')
    vocab = Vocab()
    vocab.read(args.name + '.vocab')
    if os.path.exists(args.name + '.param'):
        args.embedding_size, args.pooling = read_params(args)
    else:
        write_params(args)

    model = Word2Vec(len(vocab), args.embedding_size, args.pooling, vocab.idx_unk)
    if args.cuda:
        model.cuda()

    # optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=(args.beta1, args.beta2), eps=args.eps)
    # optimizer = torch.optim.SGD(model.parameters(), lr=0.05)
    optimizer = torch.optim.AdamW(model.parameters(), lr=args.learning_rate, betas=(args.beta1, args.beta2),
                                  eps=args.eps, weight_decay=0.01, amsgrad=False)
    n_steps, model, optimizer = load_model_optim(args.name, args.embedding_size, vocab, model, optimizer)
    dataset = Dataset(args, token, vocab, args.method)

    n_epochs = 0
    losses = []
    while True:
        n_epochs += 1
        for batch in dataset:
            model.train()
            if args.method == 'skipgram':
                loss = model.forward_skipgram(batch)
            elif args.method == 'cbow':
                loss = model.forward_cbow(batch)
            elif args.method == 'sbow':
                loss = model.forward_sbow(batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            n_steps += 1
            losses.append(loss.data.cpu().detach().numpy())
            if n_steps % args.report_every_n_steps == 0:
                accum_loss = np.mean(losses)
                logging.info('{} n_epoch={} n_steps={} Loss={:.6f}'.format(args.method, n_epochs, n_steps, accum_loss))
                losses = []
            if n_steps % args.save_every_n_steps == 0:
                save_model_optim(args.name, model, optimizer, n_steps, args.keep_last_n)
        if n_epochs >= args.max_epochs:
            logging.info('Stop (max epochs reached)')
            break
    save_model_optim(args.name, model, optimizer, n_steps, args.keep_last_n)
def inference(self):
    self.dropout = 0.0
    self.seq_size = 0
    if not self.epoch:
        for e in range(999, 0, -1):
            if os.path.exists(self.mdir + "/epoch{}.index".format(e)):
                self.epoch = e
                break
        if not self.epoch:
            sys.stderr.write("error: cannot find epoch in mdir '{}'\n{}".format(self.mdir, self.usage))
            sys.exit(1)
    check_dataset(self.tst)

    if self.output == '-':
        self.output = sys.stdout
    else:
        self.output = open(self.output, "wb")

    if not os.path.exists('{}/epoch{}.index'.format(self.mdir, self.epoch)):
        sys.stderr.write('error: -epoch file {}/epoch{}.index cannot be found\n'.format(self.mdir, self.epoch))
        sys.exit(1)
    if not os.path.exists(self.mdir + '/topology'):
        sys.stderr.write('error: topology file: {} cannot be found\n'.format(self.mdir + '/topology'))
        sys.exit(1)

    src_voc = 'vocab_src'
    tgt_voc = 'vocab_tgt'
    if os.path.exists(self.mdir + '/tokenization_src.json'):
        with open(self.mdir + '/tokenization_src.json') as jsonfile:
            self.tok_src = json.load(jsonfile)
        src_voc = self.tok_src["vocabulary"]
    else:
        self.tok_src = None
    if not os.path.exists(self.mdir + '/' + src_voc):
        sys.stderr.write('error: vocab src file: {} cannot be found\n'.format(self.mdir + '/' + src_voc))
        sys.exit(1)
    if os.path.exists(self.mdir + '/tokenization_tgt.json'):
        with open(self.mdir + '/tokenization_tgt.json') as jsonfile:
            self.tok_tgt = json.load(jsonfile)
        tgt_voc = self.tok_tgt["vocabulary"]
    else:
        self.tok_tgt = None

    argv = []
    with open(self.mdir + "/topology", 'r') as f:
        for line in f:
            opt, val = line.split()
            argv.append('-' + opt)
            argv.append(val)
    # overrides options passed on the command line
    self.parse(argv)
    # read vocabularies
    self.voc_src = Vocab(self.mdir + "/" + src_voc)
    self.voc_tgt = Vocab(self.mdir + "/" + tgt_voc)
    return
def __init__(self, args):
    # get the dir with the pre-trained model
    load_dir = os.path.join(args.experiment_dir, args.old_model_dir)

    # initialize and load vocab
    self.vocab = Vocab()
    vocab_filename = os.path.join(load_dir, "vocab.json")
    self.vocab.load_from_dict(vocab_filename)

    # load configuration
    with open(os.path.join(load_dir, "config.json"), "r") as f:
        config = json.load(f)
    args.response_len = config["response_len"]
    args.history_len = config["history_len"]

    # initialize an empty dataset, used to get input features
    self.dataset = DialogueDataset(None,
                                   history_len=config["history_len"],
                                   response_len=config["response_len"],
                                   vocab=self.vocab,
                                   update_vocab=False)

    # set device
    self.device = torch.device(args.device)

    # initialize model
    model = Transformer(config["vocab_size"],
                        config["vocab_size"],
                        config["history_len"],
                        config["response_len"],
                        d_word_vec=config["embedding_dim"],
                        d_model=config["model_dim"],
                        d_inner=config["inner_dim"],
                        n_layers=config["num_layers"],
                        n_head=config["num_heads"],
                        d_k=config["dim_k"],
                        d_v=config["dim_v"],
                        dropout=config["dropout"],
                        pretrained_embeddings=None).to(self.device)

    # load checkpoint
    checkpoint = torch.load(os.path.join(load_dir, args.old_model_name), map_location=self.device)
    model.load_state_dict(checkpoint['model'])

    # create chatbot
    self.chatbot = Chatbot(args, model)
    self.args = args
def write_error_results(self, opt):
    model_a_name, model_b_name = opt['cmp_model_type_list']
    model_a_file = self.instance_analysis_path + model_a_name + '.txt'
    model_b_file = self.instance_analysis_path + model_b_name + '.txt'
    # .iloc / .to_numpy replace the .ix / .as_matrix calls removed in recent pandas
    model_a_indicator = pd.read_table(model_a_file, header=None).iloc[:, 0]
    model_b_indicator = pd.read_table(model_b_file, header=None).iloc[:, 0]
    test_instances = pd.read_table(self.test_data_file, header=None).to_numpy()
    vocab = Vocab(self.x_vocab_file, n_sense=1, id_offset=0)

    # where model b makes correct predictions but model a does not
    instances = self.select_better_instances(test_instances, model_a_indicator, model_b_indicator)
    output_file = self.instance_analysis_path + model_a_name + '-0-' + model_b_name + '-1-.txt'
    self.write_model_instances(instances, vocab, output_file)

    # where model a makes correct predictions but model b does not
    instances = self.select_better_instances(test_instances, model_b_indicator, model_a_indicator)
    output_file = self.instance_analysis_path + model_a_name + '-1-' + model_b_name + '-0-.txt'
    self.write_model_instances(instances, vocab, output_file)

    # where both model a and b make correct predictions
    instances = self.select_equal_instances(test_instances, model_a_indicator, model_b_indicator, 1)
    output_file = self.instance_analysis_path + model_a_name + '-1-' + model_b_name + '-1-.txt'
    self.write_model_instances(instances, vocab, output_file)

    # where both model a and b make wrong predictions
    instances = self.select_equal_instances(test_instances, model_a_indicator, model_b_indicator, 0)
    output_file = self.instance_analysis_path + model_a_name + '-0-' + model_b_name + '-0-.txt'
    self.write_model_instances(instances, vocab, output_file)
def inference(self):
    self.dropout = 0.0
    self.seq_size = 0
    if not self.epoch:
        sys.stderr.write("error: missing -epoch option\n{}".format(self.usage))
        sys.exit()
    if not os.path.exists(self.tst):
        sys.stderr.write('error: -tst file {} cannot be found\n'.format(self.tst))
        sys.exit()
    if not os.path.exists(self.mdir + '/epoch' + self.epoch + '.index'):
        sys.stderr.write('error: -epoch file {} cannot be found\n'.format(self.mdir + '/epoch' + self.epoch + '.index'))
        sys.exit()
    if not os.path.exists(self.mdir + '/topology'):
        sys.stderr.write('error: topology file: {} cannot be found\n'.format(self.mdir + '/topology'))
        sys.exit()
    if not os.path.exists(self.mdir + '/vocab_src'):
        sys.stderr.write('error: vocab_src file: {} cannot be found\n'.format(self.mdir + '/vocab_src'))
        sys.exit()
    if not os.path.exists(self.mdir + '/vocab_tgt'):
        sys.stderr.write('error: vocab_tgt file: {} cannot be found\n'.format(self.mdir + '/vocab_tgt'))
        sys.exit()

    argv = []
    with open(self.mdir + "/topology", 'r') as f:
        for line in f:
            opt, val = line.split()
            argv.append('-' + opt)
            argv.append(val)
    self.parse(argv)  # this overrides options passed on the command line

    # read vocabularies
    self.voc_src = Vocab(self.mdir + "/vocab_src")
    self.voc_tgt = Vocab(self.mdir + "/vocab_tgt")
    return
def sampling_decode(self, vocab: Vocab, example: LMExample, begin_symbol: int = 2, end_symbol: int = 5,
                    initial_hidden: Optional[HiddenState] = None, warm_up: Optional[int] = None,
                    max_length: int = 200, greedy: bool = False, topk: Optional[int] = None,
                    print_info: bool = True, color_outputs: bool = False, **_kwargs) \
        -> SampledOutput:
    tensor = functools.partial(sample_utils.tensor, device=self.device)
    sample = functools.partial(sample_utils.sample, greedy=greedy, topk=topk)

    self.eval()
    self.init_hidden(1, None)
    if warm_up is None:
        inputs = [begin_symbol]
        hidden = initial_hidden
        total_log_prob = 0.0
    else:
        inputs = list(vocab.numericalize(example.sentence[:warm_up]))
        total_log_prob, hidden = self.forward(tensor(inputs[:-1]), target=tensor(inputs[1:]))
        total_log_prob = -torch.sum(total_log_prob).item() * (len(inputs) - 1)

    while len(inputs) < max_length and inputs[-1] != end_symbol:
        # Full copy of the forward pass, including dropouts; they won't be applied because of .eval().
        # Run the LSTM over the last word
        word_log_probs, new_hidden = self.forward(tensor(inputs[-1]), hidden)
        word_id, word_log_prob = sample(word_log_probs)
        inputs.append(word_id)
        hidden = new_hidden
        total_log_prob += word_log_prob

    sample_loss = -total_log_prob / (len(inputs) - 1)
    if print_info:
        print(f"Sample loss: {sample_loss:.3f}, PPL: {math.exp(sample_loss):.3f}")

    # Format the output
    words = [vocab.i2w[token] for token in inputs]
    if color_outputs and warm_up is not None:
        words[:warm_up] = [Logging.color('yellow', w) for w in words[:warm_up]]

    output = SampledOutput(sentence=words, sample_loss=sample_loss, complete_copies=0, incomplete_copies=0)
    return output
def __init__(self, model_path):
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.eval_data_path, self.vocab, mode='eval',
                           batch_size=config.batch_size, single_pass=True)
    time.sleep(15)

    model_name = os.path.basename(model_path)
    eval_dir = os.path.join(config.log_root, 'eval_%s' % (model_name))
    if not os.path.exists(eval_dir):
        os.mkdir(eval_dir)
    self.summary_writer = tf.summary.FileWriter(eval_dir)

    self.model = Model(model_path, is_eval=True)
def __init__(self):
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(self.vocab, config.train_data_path, config.batch_size,
                           single_pass=False, mode='train')
    time.sleep(10)

    train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)

    self.model_dir = os.path.join(train_dir, 'models')
    if not os.path.exists(self.model_dir):
        os.mkdir(self.model_dir)

    self.summary_writer = tf.summary.FileWriter(train_dir)
def main(train_data_path, test_data_path, answer_data_path, vocab_data_path, embedding_data_path,
         batch_size, learning_rate, hidden_size, margin, epoch, save_path, pretrained_path, use_cuda):
    # load qa data
    answer_data = AnswerData(answer_data_path)
    train_data = QaData(train_data_path)
    test_data = QaData(test_data_path)

    # load pretrained embedding
    pretrained_embedding = gensim.models.KeyedVectors.load_word2vec_format(embedding_data_path, binary=True)
    vocab = Vocab(vocab_data_path, answer_data.lexicon + train_data.lexicon)
    pretrained_weights = np.zeros((len(vocab) + 1, 300))  # TODO magic number
    for wid, surf in vocab.wid2surf.items():
        if surf in pretrained_embedding.vocab:
            pretrained_weights[wid] = pretrained_embedding.wv[surf]

    # create dataset / data loader
    train_dataset = InsuranceQaDataset(train_data, answer_data, vocab)
    train_data_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=batch_size,
                                                    collate_fn=train_dataset.collate)
    test_dataset = InsuranceQaDataset(test_data, answer_data, vocab)

    # train model
    if pretrained_path is not None:
        model = torch.load(pretrained_path)['model']
    else:
        model = SentenceEncoder(pretrained_weights, hidden_size)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
    criterion = QaLoss(margin=margin)
    if use_cuda:
        model = model.cuda()
    train(model, train_data_loader, test_dataset, optimizer, criterion, epoch, use_cuda)

    # save model
    torch.save({'model': model, 'vocab': vocab}, save_path)
def __init__(
    self,
    train_folder: str,
    test_folder: str,
    alpha: float = 0.01,
    beta: float = 1.0,
    predicted_poses: int = 20,
    previous_poses: int = 10,
    stride: int = None,
    batch_size: int = 50,
    with_context: bool = False,
    embedding: str = None,
    text_folder: str = None,
    *args,
    **kwargs
):
    super().__init__()
    self.save_hyperparameters()
    self.encoder = Encoder(26, 150, 2, with_context)
    self.decoder = Decoder(45, 150, 300, max_gen=predicted_poses)
    self.predicted_poses = predicted_poses
    self.previous_poses = previous_poses
    self.loss = MSELoss()
    self.train_folder = train_folder
    self.test_folder = test_folder
    self.alpha = alpha
    self.beta = beta
    self.stride = predicted_poses if stride is None else stride
    self.batch_size = batch_size
    self.with_context = with_context
    if embedding is not None:
        self.vocab = Vocab(embedding)
        self.word_embedder = nn.Embedding(len(self.vocab.token_to_idx), len(self.vocab.weights[0]),
                                          _weight=torch.FloatTensor(self.vocab.weights))
        self.word_encoder = nn.GRU(len(self.vocab.weights[0]), 100, bidirectional=True)
    else:
        self.vocab = None
    self.text_folder = text_folder
def do_preprocess(args):
    if args.tok_conf is None:
        opts = {}
        opts['mode'] = 'space'
        with open(args.name + '.token', 'w') as yamlfile:
            _ = yaml.dump(opts, yamlfile)
    else:
        with open(args.tok_conf) as yamlfile:
            opts = yaml.load(yamlfile, Loader=yaml.FullLoader)
        # cp bpe args.name+'.bpe'
        # replace in opts the bpe path
        with open(args.name + '.token', 'w') as yamlfile:  # yaml.dump needs a stream, not a file name
            yaml.dump(opts, yamlfile)
    logging.info('built tokenizer config file')
    token = OpenNMTTokenizer(args.name + '.token')
    vocab = Vocab()
    vocab.build(args.data, token, min_freq=args.voc_minf, max_size=args.voc_maxs)
    vocab.dump(args.name + '.vocab')
    logging.info('built vocab')
    mask = np.triu(np.ones(shape), k=1).astype('uint8')
    mask = torch.from_numpy(mask) == 0
    return mask


### for test ###
if __name__ == '__main__':
    parser = argparse.ArgumentParser(prog=sys.argv[0], usage='python3 {}'.format(sys.argv[0]), add_help=True)
    parser.add_argument('-v', '--vocab', help='vocab file', required=True)
    parser.add_argument('-i', '--input', help='input file', required=True)
    parser.add_argument('-t', '--target', help='target file', required=True)
    args = parser.parse_args()

    vocab = Vocab(args.vocab)
    dataset = AspecDataset(args.input, args.target, vocab)
    loader = DataLoader(dataset, collate_fn=MiniBatchProcess(), shuffle=False, batch_size=50)
    for i, batch in enumerate(loader):
        pass  # loop body kept empty; uncomment the lines below to inspect batches
        # test1
        # if i >= 2: break
        # print('--- batch_idx={} ---'.format(i))
        # print('id: {}'.format(batch[0]))
        # print('input: {}\n{}'.format(batch[1].shape, batch[1]))
        # print('input mask: {}\n{}'.format(batch[2].shape, batch[2]))
        # print('target: {}\n{}'.format(batch[3].shape, batch[3]))
        # print('target mask: {}\n{}'.format(batch[4].shape, batch[4]))
class ModelOperator:
    def __init__(self, args):

        # set up output directory
        self.output_dir = os.path.join(args.experiment_dir, args.run_name)
        if not os.path.exists(args.experiment_dir):
            os.mkdir(args.experiment_dir)
        if not os.path.exists(self.output_dir):
            os.mkdir(self.output_dir)
        if not os.path.exists(os.path.join(args.experiment_dir, "runs/")):
            os.mkdir(os.path.join(args.experiment_dir, "runs/"))

        # initialize tensorboard writer
        self.runs_dir = os.path.join(args.experiment_dir, "runs/", args.run_name)
        self.writer = SummaryWriter(self.runs_dir)

        # initialize global steps
        self.train_gs = 0
        self.val_gs = 0

        # initialize model config
        self.config = ModelConfig(args)

        # check if there is a model to load
        if args.old_model_dir is not None:
            self.use_old_model = True
            self.load_dir = args.old_model_dir
            self.config.load_from_file(os.path.join(self.load_dir, "config.json"))

            # create vocab
            self.vocab = Vocab()
            self.vocab.load_from_dict(os.path.join(self.load_dir, "vocab.json"))
            self.update_vocab = False
            self.config.min_count = 1
        else:
            self.use_old_model = False
            self.vocab = None
            self.update_vocab = True

        # create data sets
        self.dataset_filename = args.dataset_filename

        # train
        self.train_dataset = DialogueDataset(
            os.path.join(self.dataset_filename, "train_data.json"),
            self.config.sentence_len,
            self.vocab,
            self.update_vocab)
        self.data_loader_train = torch.utils.data.DataLoader(
            self.train_dataset, self.config.train_batch_size, shuffle=True)
        self.config.train_len = len(self.train_dataset)
        self.vocab = self.train_dataset.vocab

        # eval
        self.val_dataset = DialogueDataset(
            os.path.join(self.dataset_filename, "val_data.json"),
            self.config.sentence_len,
            self.vocab,
            self.update_vocab)
        self.data_loader_val = torch.utils.data.DataLoader(
            self.val_dataset, self.config.val_batch_size, shuffle=True)
        self.config.val_len = len(self.val_dataset)

        # update and save vocab
        self.vocab = self.val_dataset.vocab
        self.train_dataset.vocab = self.vocab
        if (self.config.min_count > 1):
            self.config.old_vocab_size = len(self.vocab)
            self.vocab.prune_vocab(self.config.min_count)
        self.vocab.save_to_dict(os.path.join(self.output_dir, "vocab.json"))
        self.vocab_size = len(self.vocab)
        self.config.vocab_size = self.vocab_size

        # load embeddings only when a directory is given (the original check was inverted)
        if self.config.pretrained_embeddings_dir is not None:
            pretrained_embeddings = get_pretrained_embeddings(
                self.config.pretrained_embeddings_dir, self.vocab)
        else:
            pretrained_embeddings = None

        # print and save the config file
        self.config.print_config(self.writer)
        self.config.save_config(os.path.join(self.output_dir, "config.json"))

        # set device
        self.device = torch.device('cuda')

        # create model
        self.model = Transformer(
            self.config.vocab_size,
            self.config.label_len,
            self.config.sentence_len,
            d_word_vec=self.config.embedding_dim,
            d_model=self.config.model_dim,
            d_inner=self.config.inner_dim,
            n_layers=self.config.num_layers,
            n_head=self.config.num_heads,
            d_k=self.config.dim_k,
            d_v=self.config.dim_v,
            dropout=self.config.dropout,
            pretrained_embeddings=pretrained_embeddings
        ).to(self.device)

        # create optimizer
        self.optimizer = torch.optim.Adam(
            filter(lambda x: x.requires_grad, self.model.parameters()),
            betas=(0.9, 0.98), eps=1e-09)

        # load old model and optimizer if there is one
        if self.use_old_model:
            self.model, self.optimizer = load_checkpoint(
                os.path.join(self.load_dir, "model.bin"),
                self.model, self.optimizer, self.device)

        # create a scheduled optimizer object
        self.optimizer = ScheduledOptim(
            self.optimizer, self.config.model_dim, self.config.warmup_steps)
        # self.optimizer.optimizer.to(torch.device('cpu'))

    def train(self, num_epochs):
        metrics = {"best_epoch": 0, "highest_f1": 0}

        # output an example
        self.output_example(0)

        for epoch in range(num_epochs):
            # self.writer.add_graph(self.model)
            # self.writer.add_embedding(
            #     self.model.encoder.src_word_emb.weight, global_step=epoch)
            epoch_metrics = dict()

            # train
            epoch_metrics["train"] = self.execute_phase(epoch, "train")
            # save metrics
            metrics["epoch_{}".format(epoch)] = epoch_metrics
            with open(os.path.join(self.output_dir, "metrics.json"), "w") as f:
                json.dump(metrics, f, indent=4)

            # validate
            epoch_metrics["val"] = self.execute_phase(epoch, "val")
            # save metrics
            metrics["epoch_{}".format(epoch)] = epoch_metrics
            with open(os.path.join(self.output_dir, "metrics.json"), "w") as f:
                json.dump(metrics, f, indent=4)

            # save checkpoint
            # TODO: fix this
            if epoch_metrics["val"]["avg_results"]["F1"] > metrics["highest_f1"]:
                # if epoch_metrics["train"]["loss"] < metrics["lowest_loss"]:
                # if epoch % 100 == 0:
                self.save_checkpoint(os.path.join(self.output_dir, "model.bin"))
                metrics["highest_f1"] = epoch_metrics["val"]["avg_results"]["F1"]  # was metrics["lowest_f1"], which left highest_f1 never updated
                metrics["best_epoch"] = epoch
                test_results = self.get_test_predictions(
                    os.path.join(self.dataset_filename, "test_data.json"),
                    os.path.join(self.output_dir, "predictions{}.json".format(epoch)))

            # record metrics to tensorboard
            self.writer.add_scalar("training loss total", epoch_metrics["train"]["loss"], global_step=epoch)
            self.writer.add_scalar("val loss total", epoch_metrics["val"]["loss"], global_step=epoch)
            self.writer.add_scalar("training time", epoch_metrics["train"]["time_taken"], global_step=epoch)
            self.writer.add_scalar("val time", epoch_metrics["val"]["time_taken"], global_step=epoch)
            self.writer.add_scalars("train_results", epoch_metrics["train"]["avg_results"], global_step=epoch)
            self.writer.add_scalars("val_results", epoch_metrics["val"]["avg_results"], global_step=epoch)

            # output an example
            self.output_example(epoch + 1)

        self.writer.close()

    def execute_phase(self, epoch, phase):
        if phase == "train":
            self.model.train()
            dataloader = self.data_loader_train
            batch_size = self.config.train_batch_size
            train = True
        else:
            self.model.eval()
            dataloader = self.data_loader_val
            batch_size = self.config.val_batch_size
            train = False

        start = time.clock()  # note: time.clock() was removed in Python 3.8; time.perf_counter() is the replacement
        phase_metrics = dict()
        epoch_loss = list()
        epoch_metrics = list()
        results = {"accuracy": list(), "precision": list(), "recall": list(), "F1": list()}
        average_epoch_loss = None
        for i, batch in enumerate(tqdm(dataloader, mininterval=2, desc=phase, leave=False)):
            # prepare data
            src_seq, src_pos, src_seg, tgt = map(lambda x: x.to(self.device), batch[:4])
            ids = batch[4]
            start_end_idx = batch[5]

            # forward
            if train:
                self.optimizer.zero_grad()
            pred = self.model(src_seq, src_pos, src_seg, tgt)

            loss = F.cross_entropy(self.prepare_pred(pred).view(-1, 2), tgt.view(-1))
            average_loss = float(loss)
            epoch_loss.append(average_loss)
            average_epoch_loss = np.mean(epoch_loss)

            if train:
                self.writer.add_scalar("train_loss", average_loss,
                                       global_step=i + epoch * self.config.train_batch_size)
                # backward
                loss.backward()
                # update parameters
                self.optimizer.step_and_update_lr()

            output = torch.argmax(self.prepare_pred(pred), 3)
            get_results(tgt.view(-1).cpu(), output.view(-1).cpu(), results)

        phase_metrics["avg_results"] = {key: np.mean(value) for key, value in results.items()}
        phase_metrics["loss"] = average_epoch_loss
        phase_metrics["time_taken"] = time.clock() - start

        string = ' {} loss: {:.3f} '.format(phase, average_epoch_loss)
        print(string, end='\n')
        return phase_metrics

    def get_test_predictions(self, test_filename, save_filename):
        test_dataset = DialogueDataset(
            test_filename,
            self.config.sentence_len,
            self.vocab,
            False)
        test_data_loader = torch.utils.data.DataLoader(
            test_dataset, self.config.val_batch_size, shuffle=True)

        with open(test_filename, 'r') as f:
            data = json.load(f)

        start = time.clock()
        phase_metrics = dict()
        epoch_loss = list()
        epoch_metrics = list()
        results = {"accuracy": list(), "precision": list(), "recall": list(), "F1": list()}
        average_epoch_loss = None
        for i, batch in enumerate(tqdm(test_data_loader, mininterval=2, desc='test', leave=False)):
            # prepare data
            src_seq, src_pos, src_seg, tgt = map(lambda x: x.to(self.device), batch[:4])
            ids = batch[4]
            start_end_idx = batch[5]

            # forward
            pred = self.model(src_seq, src_pos, src_seg, tgt)
            loss = F.cross_entropy(self.prepare_pred(pred).view(-1, 2), tgt.view(-1))
            average_loss = float(loss)
            epoch_loss.append(average_loss)
            average_epoch_loss = np.mean(epoch_loss)

            output = torch.argmax(self.prepare_pred(pred), 3)
            record_predictions(output, data, ids, start_end_idx)
            get_results(tgt.view(-1).cpu(), output.view(-1).cpu(), results)

        phase_metrics["avg_results"] = {key: np.mean(value) for key, value in results.items()}
        phase_metrics["loss"] = average_epoch_loss
        phase_metrics["time_taken"] = time.clock() - start
        string = ' {} loss: {:.3f} '.format('test', average_epoch_loss)
        print(string, end='\n')

        data["results"] = phase_metrics
        with open(save_filename, 'w') as f:
            json.dump(data, f)
        return phase_metrics

    def save_checkpoint(self, filename):
        state = {
            'model': self.model.state_dict(),
            'optimizer': self.optimizer.optimizer.state_dict()
        }
        torch.save(state, filename)

    def output_example(self, epoch):
        random_index = random.randint(0, len(self.val_dataset) - 1)  # randint is inclusive on both ends
        example = self.val_dataset[random_index]

        # prepare data
        src_seq, src_pos, src_seg, tgt_seq = map(
            lambda x: torch.from_numpy(x).to(self.device).unsqueeze(0), example[:4])

        # take out first token from target for some reason
        gold = tgt_seq[:, 1:]

        # forward
        pred = self.model(src_seq, src_pos, src_seg, tgt_seq)
        output = self.prepare_pred(pred).squeeze(0)

        words = src_seq.tolist()[0]
        target_strings = labels_2_mention_str(tgt_seq.squeeze(0))
        output_strings = labels_2_mention_str(torch.argmax(output, dim=2))

        # get history text
        string = "word: output - target\n"
        for word, t, o in zip(words, target_strings, output_strings):
            token = self.vocab.id2token[word]
            if token != "<blank>":
                string += "[{}: {} - {}], \n".format(token, o, t)

        # print
        print("\n------------------------\n")
        print(string)
        print("\n------------------------\n")

        # add result to tensorboard
        self.writer.add_text("example_output", string, global_step=epoch)
        self.writer.add_histogram("example_vocab_ranking", pred, global_step=epoch)
        self.writer.add_histogram("example_vocab_choice", output, global_step=epoch)

    def prepare_pred(self, pred):
        temp = pred
        pred = pred.view(-1)
        size = pred.size()
        nullclass = torch.ones(size, dtype=pred.dtype, device=self.device)
        nullclass -= pred
        pred = torch.stack((nullclass, pred), 1).view(
            -1, self.config.sentence_len, self.config.label_len, 2)
        return pred
class BeamSearch(object):
    def __init__(self, model_file_path):
        model_name = os.path.basename(model_file_path)
        self._test_dir = os.path.join(config.log_root, 'decode_%s' % (model_name))
        self._rouge_ref_dir = os.path.join(self._test_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._test_dir, 'rouge_dec')
        for p in [self._test_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.decode_data_path, self.vocab, mode='decode',
                               batch_size=config.beam_size, single_pass=True)
        time.sleep(15)

        self.model = Model(model_file_path, is_eval=True)

    def sort_beams(self, beams):
        return sorted(beams, key=lambda h: h.avg_log_prob, reverse=True)

    def beam_search(self, batch):
        # single example repeated across the batch
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t, coverage = \
            get_input_from_batch(batch, use_cuda)
        enc_out, enc_fea, enc_h = self.model.encoder(enc_batch, enc_lens)
        s_t = self.model.reduce_state(enc_h)

        dec_h, dec_c = s_t  # b x hidden_dim
        dec_h = dec_h.squeeze()
        dec_c = dec_c.squeeze()

        # decoder batch preparation: it has beam_size examples, initially everything is repeated
        beams = [Beam(tokens=[self.vocab.word2id(config.BOS_TOKEN)],
                      log_probs=[0.0],
                      state=(dec_h[0], dec_c[0]),
                      context=c_t[0],
                      coverage=(coverage[0] if config.is_coverage else None))
                 for _ in range(config.beam_size)]

        steps = 0
        results = []
        while steps < config.max_dec_steps and len(results) < config.beam_size:
            latest_tokens = [h.latest_token for h in beams]
            latest_tokens = [t if t < self.vocab.size() else self.vocab.word2id(config.UNK_TOKEN)
                             for t in latest_tokens]
            y_t = Variable(torch.LongTensor(latest_tokens))
            if use_cuda:
                y_t = y_t.cuda()
            all_state_h = [h.state[0] for h in beams]
            all_state_c = [h.state[1] for h in beams]
            all_context = [h.context for h in beams]

            s_t = (torch.stack(all_state_h, 0).unsqueeze(0), torch.stack(all_state_c, 0).unsqueeze(0))
            c_t = torch.stack(all_context, 0)

            coverage_t = None
            if config.is_coverage:
                all_coverage = [h.coverage for h in beams]
                coverage_t = torch.stack(all_coverage, 0)

            final_dist, s_t, c_t, attn_dist, p_gen, coverage_t = self.model.decoder(
                y_t, s_t, enc_out, enc_fea, enc_padding_mask, c_t,
                extra_zeros, enc_batch_extend_vocab, coverage_t, steps)
            log_probs = torch.log(final_dist)
            topk_log_probs, topk_ids = torch.topk(log_probs, config.beam_size * 2)

            dec_h, dec_c = s_t
            dec_h = dec_h.squeeze()
            dec_c = dec_c.squeeze()

            all_beams = []
            # On the first step, we only had one original hypothesis (the initial hypothesis).
            # On subsequent steps, all original hypotheses are distinct.
            num_orig_beams = 1 if steps == 0 else len(beams)
            for i in range(num_orig_beams):
                h = beams[i]
                state_i = (dec_h[i], dec_c[i])
                context_i = c_t[i]
                coverage_i = (coverage[i] if config.is_coverage else None)

                for j in range(config.beam_size * 2):  # for each of the top 2*beam_size hyps:
                    new_beam = h.extend(token=topk_ids[i, j].item(),
                                        log_prob=topk_log_probs[i, j].item(),
                                        state=state_i,
                                        context=context_i,
                                        coverage=coverage_i)
                    all_beams.append(new_beam)

            beams = []
            for h in self.sort_beams(all_beams):
                if h.latest_token == self.vocab.word2id(config.EOS_TOKEN):
                    if steps >= config.min_dec_steps:
                        results.append(h)
                else:
                    beams.append(h)
                if len(beams) == config.beam_size or len(results) == config.beam_size:
                    break

            steps += 1

        if len(results) == 0:
            results = beams

        beams_sorted = self.sort_beams(results)
        return beams_sorted[0]

    def run(self):
        counter = 0
        start = time.time()
        batch = self.batcher.next_batch()
        while batch is not None:
            # Run beam search to get the best Hypothesis
            best_summary = self.beam_search(batch)

            # Extract the output ids from the hypothesis and convert back to words
            output_ids = [int(t) for t in best_summary.tokens[1:]]
            decoded_words = utils.outputids2words(
                output_ids, self.vocab,
                (batch.art_oovs[0] if config.pointer_gen else None))

            # Remove the [STOP] token from decoded_words, if necessary
            try:
                fst_stop_idx = decoded_words.index(dataset.EOS_TOKEN)
                decoded_words = decoded_words[:fst_stop_idx]
            except ValueError:
                decoded_words = decoded_words

            original_abstract_sents = batch.original_abstracts_sents[0]
            write_for_rouge(original_abstract_sents, decoded_words, counter,
                            self._rouge_ref_dir, self._rouge_dec_dir)
            counter += 1
            if counter % 1000 == 0:
                print('%d example in %d sec' % (counter, time.time() - start))
                start = time.time()
            batch = self.batcher.next_batch()

        print("Decoder has finished reading dataset for single_pass.")
        print("Now starting ROUGE eval...")
        results_dict = rouge_eval(self._rouge_ref_dir, self._rouge_dec_dir)
        rouge_log(results_dict, self._test_dir)
model_dir = os.path.join(data_root_folder, 'models')
mkdir_if_missing(model_dir)

if mode == MODE_MRT:
    model_name = cur_cfg.name + '_mrt'
elif mode == MODE_OBJ:
    model_name = cur_cfg.name + '_obj'
else:
    model_name = cur_cfg.name

model_path = os.path.join(model_dir, model_name + '.state')
print('Model path:', model_path)

jieba_base_v = Vocab(os.path.join(data_root_folder, 'vocab', 'title_summ.vocab.pkl'),
                     os.path.join(data_root_folder, 'vocab', 'title_summ.emb.pkl'))
jieba_sgns_v = Vocab(os.path.join(data_root_folder, 'vocab', 'title_summ.vocab.pkl'),
                     os.path.join(data_root_folder, 'vocab', 'title_summ.emb.pkl'))
jieba_flag_v = Vocab(os.path.join(data_root_folder, 'vocab', 'title_summ.vocab.pkl'),
                     os.path.join(data_root_folder, 'vocab', 'title_summ.emb.pkl'))
# jieba_sgns_v = Vocab(os.path.join(data_root_folder, 'vocab', 'useless.vocab.pkl'),
#                      os.path.join(data_root_folder, 'vocab', 'useless.emb.pkl'))
# jieba_flag_v = Vocab(os.path.join(data_root_folder, 'vocab', 'useless.vocab.pkl'),
#                      os.path.join(data_root_folder, 'vocab', 'useless.emb.pkl'))

trainset_roots = [os.path.join(data_root_folder, 'val.txt')]
import os
import pickle
from models import VariationalModels
import re


def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)


if __name__ == '__main__':
    config = get_config(mode='test')

    print('Loading Vocabulary...')
    vocab = Vocab()
    vocab.load(config.word2id_path, config.id2word_path)
    print(f'Vocabulary size: {vocab.vocab_size}')

    config.vocab_size = vocab.vocab_size

    data_loader = get_loader(
        sentences=load_pickle(config.sentences_path),
        conversation_length=load_pickle(config.conversation_length_path),
        sentence_length=load_pickle(config.sentence_length_path),
        vocab=vocab,
        batch_size=config.batch_size,
        shuffle=False)

    if config.model in VariationalModels:
        solver = VariationalSolver(config, None, data_loader, vocab=vocab, is_train=False)
if mode == MODE_MRT:
    model_name = cur_cfg.name + '_mrt'
elif mode == MODE_OBJ:
    model_name = cur_cfg.name + '_obj'
else:
    model_name = cur_cfg.name

if switch:
    model_name += '_switch'
if use_data1:
    model_name += '_full_data'

model_path = os.path.join(model_dir, model_name + '.state')
print('Model path:', model_path)

jieba_base_v = Vocab('./data/embed/base_token_vocab_jieba.pkl',
                     './data/embed/base_token_embed_jieba.pkl')
jieba_sgns_v = Vocab('./data/embed/train_sgns_vocab_jieba.pkl',
                     './data/embed/train_sgns_embed_jieba.pkl')
jieba_flag_v = Vocab('./data/embed/base_flag_vocab_jieba.pkl',
                     './data/embed/base_flag_embed_jieba.pkl')

if switch:
    pyltp_base_v = Vocab('./data/embed/base_token_vocab_pyltp.pkl',
                         './data/embed/base_token_embed_pyltp.pkl')
    pyltp_sgns_v = Vocab('./data/embed/train_sgns_vocab_pyltp.pkl',
                         './data/embed/train_sgns_embed_pyltp.pkl')
    pyltp_flag_v = Vocab('./data/embed/base_flag_vocab_pyltp.pkl',
                         './data/embed/base_flag_embed_pyltp.pkl')
    transform = MaiIndexTransform(jieba_base_v, jieba_sgns_v, jieba_flag_v,
                                  pyltp_base_v, pyltp_sgns_v, pyltp_flag_v)
val_file = os.path.join(data_root_folder, 'preprocessed', 'dev-%s.preprocessed.json' % version)
model_dir = os.path.join(data_root_folder, 'models', version)
mkdir_if_missing(model_dir)

if mode == MODE_MRT:
    model_name = cur_cfg.name + '_mrt'
elif mode == MODE_OBJ:
    model_name = cur_cfg.name + '_obj'
else:
    model_name = cur_cfg.name

model_path = os.path.join(model_dir, model_name + '.state')
print('Model path:', model_path)

jieba_base_v = Vocab(os.path.join(data_root_folder, 'vocab', 'squad-%s.vocab.pkl' % version),
                     os.path.join(data_root_folder, 'vocab', 'squad-%s.emb.pkl' % version))
jieba_sgns_v = Vocab(os.path.join(data_root_folder, 'vocab', 'useless.vocab.pkl'),
                     os.path.join(data_root_folder, 'vocab', 'useless.emb.pkl'))
jieba_flag_v = Vocab(os.path.join(data_root_folder, 'vocab', 'useless.vocab.pkl'),
                     os.path.join(data_root_folder, 'vocab', 'useless.emb.pkl'))

trainset_roots = [
    os.path.join(data_root_folder, 'val.txt')
]

embed_lists = {
    'jieba': [jieba_base_v.embeddings, jieba_sgns_v.embeddings, jieba_flag_v.embeddings],
    'pyltp': []
}
dev_src, dev_trg, dev_num, dev_src_max_len, dev_trg_max_len = process(dev_src_path, dev_trg_path)
test_src, test_trg, test_num, test_src_max_len, test_trg_max_len = process(test_src_path, test_trg_path)

log.write('train_num', train_num)
log.write('train_src_max_len', train_src_max_len)
log.write('train_trg_max_len', train_trg_max_len)
log.write('dev_num', dev_num)
log.write('dev_src_max_len', dev_src_max_len)
log.write('dev_trg_max_len', dev_trg_max_len)
log.write('test_num', test_num)
log.write('test_src_max_len', test_src_max_len)
log.write('test_trg_max_len', test_trg_max_len)

vocab = Vocab()
for i in range(train_num):
    vocab.add_list(train_src[i])
    vocab.add_list(train_trg[i])
for i in range(dev_num):
    vocab.add_list(dev_src[i])
    vocab.add_list(dev_trg[i])
for i in range(test_num):
    vocab.add_list(test_src[i])
    vocab.add_list(test_trg[i])

word2index, index2word = vocab.get_vocab(min_freq=4)
total_words = len(word2index)
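# A hypothetical sketch of a frequency-counting Vocab matching the add_list / get_vocab
# usage above: tokens are counted per sentence and then thresholded by min_freq. The
# special tokens and their ordering are illustrative assumptions, not the project's
# actual implementation.
from collections import Counter

class Vocab:
    def __init__(self, specials=('<pad>', '<unk>', '<s>', '</s>')):
        self.counter = Counter()
        self.specials = list(specials)    # assumed special tokens

    def add_list(self, tokens):
        # count every token of one tokenized sentence
        self.counter.update(tokens)

    def get_vocab(self, min_freq=1):
        # keep tokens seen at least min_freq times, most frequent first
        kept = [w for w, c in self.counter.most_common() if c >= min_freq]
        index2word = self.specials + kept
        word2index = {w: i for i, w in enumerate(index2word)}
        return word2index, index2word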
def train():
    parser = argparse.ArgumentParser()

    parser.add_argument("-c", "--dataset", required=True, type=str, help="dataset")
    # parser.add_argument("-c", "--train_dataset", required=True,
    #                     type=str, help="train dataset for train bert")
    # parser.add_argument("-t", "--test_dataset", type=str,
    #                     default=None, help="test set for evaluate train set")
    # parser.add_argument("-v", "--vocab_path", required=True,
    #                     type=str, help="built vocab model path with bert-vocab")
    parser.add_argument("-o", "--output_path", required=True, type=str, help="ex)output/bert.model")

    parser.add_argument("-hs", "--hidden", type=int, default=256, help="hidden size of transformer model")
    parser.add_argument("-l", "--layers", type=int, default=8, help="number of layers")
    parser.add_argument("-a", "--attn_heads", type=int, default=8, help="number of attention heads")
    parser.add_argument("-s", "--seq_len", type=int, default=64, help="maximum sequence len")

    parser.add_argument("-b", "--batch_size", type=int, default=64, help="number of batch_size")
    parser.add_argument("-e", "--epochs", type=int, default=10, help="number of epochs")
    parser.add_argument("-w", "--num_workers", type=int, default=5, help="dataloader worker size")
    parser.add_argument("--duplicate", type=int, default=5, help="dataloader worker size")

    parser.add_argument("--with_cuda", type=bool, default=True, help="training with CUDA: true, or false")
    parser.add_argument("--log_freq", type=int, default=10, help="printing loss every n iter: setting n")
    parser.add_argument("--corpus_lines", type=int, default=None, help="total number of lines in corpus")
    parser.add_argument("--cuda_devices", type=int, nargs='+', default=None, help="CUDA device ids")
    parser.add_argument("--on_memory", type=bool, default=True, help="Loading on memory: true or false")

    parser.add_argument("--lr", type=float, default=1e-3, help="learning rate of adam")
    parser.add_argument("--adam_weight_decay", type=float, default=0.01, help="weight_decay of adam")
    parser.add_argument("--adam_beta1", type=float, default=0.9, help="adam first beta value")
    parser.add_argument("--adam_beta2", type=float, default=0.999, help="adam second beta value")
    parser.add_argument("--dropout", type=float, default=0.2, help="dropout value")

    args = parser.parse_args()

    print("Load Data", args.dataset)
    data_reader = DataReader(args.dataset, seq_len=args.seq_len)
    neg_data_reader = DataReader(args.dataset, graphs=data_reader.graphs, shuffle=True,
                                 duplicate=args.duplicate, seq_len=args.seq_len)

    # print("Loading Vocab", args.vocab_path)
    print("Loading Vocab")
    vocab = Vocab(data_reader.graphs)
    # vocab = WordVocab.load_vocab(args.vocab_path)
    print("Vocab Size: ", len(vocab))

    print("Shuffle Data")
    # TODO

    print("Loading Train Dataset", args.dataset)
    train_dataset = CustomBERTDataset(
        data_reader.graphs[:int(len(data_reader) * 0.8)], vocab,
        seq_len=args.seq_len, on_memory=args.on_memory, n_neg=args.duplicate)
    # pdb.set_trace()
    neg_train_dataset = CustomBERTDataset(
        neg_data_reader.graphs[:args.duplicate * len(train_dataset)], vocab,
        seq_len=args.seq_len, on_memory=args.on_memory, n_neg=args.duplicate)
    # pdb.set_trace()
    assert len(neg_train_dataset) == args.duplicate * len(train_dataset)

    # print("Loading Test Dataset", args.test_dataset)
    print("Loading Dev Dataset", args.dataset)
    test_dataset = CustomBERTDataset(
        data_reader.graphs[int(len(data_reader) * 0.8):], vocab,
        seq_len=args.seq_len, on_memory=args.on_memory, n_neg=args.duplicate)
    neg_test_dataset = CustomBERTDataset(
        neg_data_reader.graphs[-args.duplicate * len(test_dataset):], vocab,
        seq_len=args.seq_len, on_memory=args.on_memory, n_neg=args.duplicate)
    assert len(neg_test_dataset) == args.duplicate * len(test_dataset)
    # if args.test_dataset is not None else None
    # pdb.set_trace()

    print("Creating Dataloader")
    train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size,
                                   num_workers=args.num_workers, collate_fn=my_collate)
    neg_train_data_loader = DataLoader(neg_train_dataset, batch_size=args.batch_size * args.duplicate,
                                       num_workers=args.num_workers, collate_fn=my_collate)
    test_data_loader = DataLoader(test_dataset, batch_size=args.batch_size,
                                  num_workers=args.num_workers, collate_fn=my_collate)
    neg_test_data_loader = DataLoader(neg_test_dataset, batch_size=args.batch_size * args.duplicate,
                                      num_workers=args.num_workers, collate_fn=my_collate)
    # if test_dataset is not None else None
    # assert False

    print("Building BERT model")
    bert = BERT(len(vocab), hidden=args.hidden, n_layers=args.layers,
                attn_heads=args.attn_heads, dropout=args.dropout)

    print("Creating BERT Trainer")
    # trainer = BERTTrainer(bert, len(vocab), train_dataloader=train_data_loader, test_dataloader=test_data_loader,
    #                       lr=args.lr, betas=(args.adam_beta1, args.adam_beta2), weight_decay=args.adam_weight_decay,
    #                       with_cuda=args.with_cuda, cuda_devices=args.cuda_devices, log_freq=args.log_freq,
    #                       pad_index=vocab.pad_index)
    trainer = BERTTrainer(bert, len(vocab), train_dataloader=train_data_loader, test_dataloader=test_data_loader,
                          lr=args.lr, betas=(args.adam_beta1, args.adam_beta2),
                          weight_decay=args.adam_weight_decay, with_cuda=args.with_cuda,
                          cuda_devices=args.cuda_devices, log_freq=args.log_freq, pad_index=vocab.pad_index)
    # raise NotImplementedError

    print("Training Start")
    best_loss = None
    for epoch in range(args.epochs):
        # test_loss = trainer.test(epoch)
        train_loss = trainer.train(epoch)
        torch.cuda.empty_cache()
        # if test_data_loader is not None:
        test_loss = trainer.test(epoch)
        if best_loss is None or test_loss < best_loss:
            best_loss = test_loss
            trainer.save(epoch, args.output_path)
        torch.cuda.empty_cache()
hidden_size = 128
char_embedding_dim = 128    # character-level embedding dim
word_embedding_dim = 50     # word-level embedding dim
max_sent_length = 35
max_word_length = 16
kernel_n = 3                # convolution kernel size
padding = 2                 # padding size
lr = 3e-3
weight_decay = 1e-3         # weight decay
gradient_clipping = 5       # gradient clipping
output_per_batchs = 1
test_per_batchs = 5
test_batchs = 1
ITORS = 100

# load vocabularies
vocab = Vocab(vocab_path)
char_vocab = CharVocab(char_vocab_path)
entity_vocab = EntityVocab(entity_vocab_path)

# create datasets
train_data_set = DataSet(path=train_data_path, vocab=vocab, entity_vocab=entity_vocab,
                         entity_padding_len=max_sent_length)
test_data_set = DataSet(path=test_data_path, vocab=vocab, entity_vocab=entity_vocab,
                        entity_padding_len=max_sent_length)

# create data loaders
train_data_loader = DataLoader(train_data_set, shuffle=True, batch_size=BATCH_SIZE)
test_data_loader = DataLoader(test_data_set, shuffle=True, batch_size=BATCH_SIZE)

# whether to use the GPU
batch_size = opt.batch_size

# read dataset
if os.path.exists('dataset.pickle'):
    with open('dataset.pickle', 'rb') as f:
        train_iter, dev_iter, test_iter, vocab = pickle.load(f)
else:
    root_dir = opt.data
    segments = ['train', 'dev', 'test']
    token_files = [os.path.join(root_dir, seg, '%s.toks' % tok)
                   for tok in ['a', 'b']
                   for seg in segments]

    vocab = Vocab(filepaths=token_files, embedpath=opt.word_embed)

    train_iter, dev_iter, test_iter = [SICKDataIter(os.path.join(root_dir, segment), vocab, num_classes)
                                       for segment in segments]

    with open('dataset.pickle', 'wb') as f:
        pickle.dump([train_iter, dev_iter, test_iter, vocab], f)

logging.info('==> SICK vocabulary size : %d ' % vocab.size)
logging.info('==> Size of train data   : %d ' % len(train_iter))
logging.info('==> Size of dev data     : %d ' % len(dev_iter))
logging.info('==> Size of test data    : %d ' % len(test_iter))

# get network
net = SimilarityTreeLSTM(sim_hidden_size, rnn_hidden_size, vocab.size,
def main(): print("Hello!") voca = Vocab(args.vocab_fname) model = Model(args, voca) batcher = Batcher(voca, args) with tf.Session(config=GPU_config()) as sess: model.build_graph() if args.mode == 'train': sess.run(tf.global_variables_initializer()) if not os.path.exists(args.train_logdir): os.makedirs(args.train_logdir) if not os.path.exists(args.valid_logdir): os.makedirs(args.valid_logdir) train_writer, valid_writer = tf.summary.FileWriter( args.train_logdir, sess.graph), tf.summary.FileWriter(args.valid_logdir, sess.graph) t = trange(args.max_step, leave=True) for i in t: sample, label = batcher.next_data() _, loss, step, summaries = model.run_train_step(sample, sess) t.set_description('Train loss: {}'.format(round(loss, 3))) train_writer.add_summary(summaries, step) if step % 5e3 == 0: model.saver.save(sess, args.model_path, step) if step % 5 == 0: valid_sample, valid_label = batcher.next_data( is_valid=True) loss, step, summaries = model.run_eval_step( valid_sample, sess) valid_writer.add_summary(summaries, step) t.set_description('Valid loss: {}'.format(round(loss, 3))) if step % 100 == 0: near_ids, near_words = model.get_nearest_words( sess, args.near_K) pprint(near_words) score = coherence_score(args.test_bin_fname, voca, near_ids) summary = tf.Summary() summary.value.add(tag='coherence_score_{}k'.format( args.near_K), simple_value=score) valid_writer.add_summary(summary, step) else: load_ckpt(args.model_path, sess, model.saver) near_words_dict = {i: [] for i in range(args.aspect_num)} for k in range(5, 50, 5): near_ids, near_words = model.get_nearest_words(sess, k) score = coherence_score(args.test_bin_fname, voca, near_ids) print(k, score) for asp_idx in near_words: for word in near_words[asp_idx]: if word not in near_words_dict[asp_idx]: near_words_dict[asp_idx].append(word) with open(args.nearword_fname, 'w') as f: for idx in range(len(list(near_words_dict.keys()))): print(near_words_dict[idx]) f.write(str(idx) + ' ') f.write(' '.join(near_words_dict[idx][:5])) f.write('\n')
def learn(self):
    check_dataset(self.trn)
    if self.dev is not None:
        check_dataset(self.dev)

    ###
    # continuation
    ###
    if os.path.exists(self.mdir) and os.path.exists(self.mdir + '/topology'):
        src_voc = 'vocab_src'
        tgt_voc = 'vocab_tgt'
        if os.path.exists(self.mdir + '/tokenization_src.json'):
            with open(self.mdir + '/tokenization_src.json') as jsonfile:
                self.tok_src = json.load(jsonfile)
            src_voc = self.tok_src["vocabulary"]
        else:
            self.tok_src = None  # was: self.src_tok = None (inconsistent with inference())
        if not os.path.exists(self.mdir + '/' + src_voc):
            sys.stderr.write('error: vocab src file: {} cannot be found\n'.format(self.mdir + '/' + src_voc))
            sys.exit(1)
        if os.path.exists(self.mdir + '/tokenization_tgt.json'):
            with open(self.mdir + '/tokenization_tgt.json') as jsonfile:
                self.tok_tgt = json.load(jsonfile)
            tgt_voc = self.tok_tgt["vocabulary"]
        else:
            self.tok_tgt = None  # was: self.tgt_tok = None
        if not os.path.exists(self.mdir + '/' + tgt_voc):
            sys.stderr.write('error: vocab tgt file: {} cannot be found\n'.format(self.mdir + '/' + tgt_voc))
            sys.exit(1)
        if not os.path.exists(self.mdir + '/checkpoint'):
            sys.stderr.write('error: checkpoint file: {} cannot be found\ndelete dir {} ???\n'
                             .format(self.mdir + '/checkpoint', self.mdir))
            sys.exit(1)

        argv = []
        with open(self.mdir + "/topology", 'r') as f:
            for line in f:
                opt, val = line.split()
                argv.append('-' + opt)
                argv.append(val)
        # overrides options passed on the command line
        self.parse(argv)
        # read vocabularies
        self.voc_src = Vocab(self.mdir + "/" + src_voc)
        self.voc_tgt = Vocab(self.mdir + "/" + tgt_voc)
        # update last epoch
        for e in range(999, 0, -1):
            if os.path.exists(self.mdir + "/epoch{}.index".format(e)):
                self.last_epoch = e
                break
        print("learning continuation: last epoch is {}".format(self.last_epoch))

    ###
    # learning from scratch
    ###
    else:
        # read file or config/vocab_src if file is not set
        if self.src_tok:
            if not os.path.exists(self.src_tok):
                sys.stderr.write('error: cannot find -src_tok file: {}\n'.format(self.src_tok))
                sys.exit(1)
            with open(self.src_tok) as jsonfile:
                self.tok_src = json.load(jsonfile)
            if not self.src_voc:
                self.src_voc = self.tok_src["vocabulary"]
        else:
            self.tok_src = None
        self.voc_src = Vocab(self.src_voc)

        if self.tgt_tok:
            if not os.path.exists(self.tgt_tok):
                sys.stderr.write('error: cannot find -tgt_tok file: {}\n'.format(self.tgt_tok))
                sys.exit(1)
            with open(self.tgt_tok) as jsonfile:
                self.tok_tgt = json.load(jsonfile)
            if not self.tgt_voc:
                self.tgt_voc = self.tok_tgt["vocabulary"]
        else:
            self.tok_tgt = None
        self.voc_tgt = Vocab(self.tgt_voc)

        self.src_voc_size = self.voc_src.length
        self.tgt_voc_size = self.voc_tgt.length
        if not os.path.exists(self.mdir):
            os.makedirs(self.mdir)

        # copy vocabularies
        if self.src_tok:
            copyfile(self.src_voc, self.mdir + "/" + self.tok_src["vocabulary"])
            copyfile(self.src_tok, self.mdir + "/tokenization_src.json")
        else:
            copyfile(self.src_voc, self.mdir + "/vocab_src")
        if self.tgt_tok:
            copyfile(self.tgt_voc, self.mdir + "/" + self.tok_tgt["vocabulary"])
            copyfile(self.tgt_tok, self.mdir + "/tokenization_tgt.json")
        else:
            copyfile(self.tgt_voc, self.mdir + "/vocab_tgt")

        # read embeddings
        # read file or use emb_src.length if file is not set
        self.emb_src = Embeddings(self.src_emb, self.voc_src, self.src_emb_size)
        self.src_emb_size = self.emb_src.dim
        # read file or use emb_tgt.length if file is not set
        self.emb_tgt = Embeddings(self.tgt_emb, self.voc_tgt, self.tgt_emb_size)
        self.tgt_emb_size = self.emb_tgt.dim

        # write topology file
        with open(self.mdir + "/topology", 'w') as f:
            for opt, val in vars(self).items():
                if opt.startswith("src") or opt.startswith("tgt") or \
                        opt == "aggr" or opt == "mode":
                    f.write("{} {}\n".format(opt, val))
        print("learning from scratch")
    return