def test_electra_without_magic():
    generator = ReformerLM(num_tokens=20000, dim=512, depth=1, max_seq_len=1024)
    discriminator = ReformerLM(num_tokens=20000, dim=512, depth=2, max_seq_len=1024, return_embeddings=True)

    generator.token_emb = discriminator.token_emb
    generator.pos_emb = discriminator.pos_emb

    discriminator_with_adapter = nn.Sequential(discriminator, nn.Linear(512, 1), nn.Sigmoid())

    trainer = Electra(generator,
                      discriminator_with_adapter,
                      num_tokens=20000,
                      pad_token_id=1,
                      mask_ignore_token_ids=[2, 3])

    data = torch.randint(0, 20000, (1, 1024))
    results = trainer(data)
    results.loss.backward()
def test_electra():
    generator = ReformerLM(num_tokens=20000, dim=512, depth=1, max_seq_len=1024)
    discriminator = ReformerLM(num_tokens=20000, dim=512, depth=2, max_seq_len=1024)

    generator.token_emb = discriminator.token_emb
    generator.pos_emb = discriminator.pos_emb

    trainer = Electra(generator,
                      discriminator,
                      num_tokens=20000,
                      discr_dim=512,
                      discr_layer='reformer',
                      pad_token_id=1,
                      mask_ignore_token_ids=[2, 3])

    data = torch.randint(0, 20000, (1, 1024))
    results = trainer(data)
    results.loss.backward()
class ReformerQA(nn.Module):
    def __init__(self, config, pretrained_model_path=None):
        super(ReformerQA, self).__init__()
        self.reformer_lm = ReformerLM(
            num_tokens=config['num_tokens'],
            dim=config['dim'],
            depth=config['depth'],
            max_seq_len=config['max_seq_len'],
            heads=config['heads'],
            causal=config['casual'],
            return_embeddings=config['return_embeddings'])
        self.qa_outputs = nn.Linear(config['dim'], config['num_label'])
        if pretrained_model_path:
            self._load_weights(pretrained_model_path)

    def _load_weights(self, pretrained_model_path):
        state_dict = copy.deepcopy(torch.load(pretrained_model_path))
        # drop the language-model head; keep only the encoder weights
        state_dict = OrderedDict(
            (k, v) for k, v in state_dict.items() if 'to_logits' not in k)
        self.reformer_lm.load_state_dict(state_dict)

    def forward(self, input_ids=None, start_positions=None, end_positions=None):
        sequence_output = self.reformer_lm(input_ids)
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        outputs = (start_logits, end_logits,)
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds an extra dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs; ignore these terms
            ignored_index = start_logits.size(1)
            start_positions.clamp_(0, ignored_index)
            end_positions.clamp_(0, ignored_index)

            loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2
            outputs = (total_loss,) + outputs
        return outputs
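# A minimal usage sketch for ReformerQA (assumed, not from the original source;
# the config values below are illustrative and torch / the class above are taken
# to be in scope). forward() returns (loss, start_logits, end_logits) when
# start/end positions are supplied.
config = {
    'num_tokens': 20000, 'dim': 512, 'depth': 6, 'max_seq_len': 512,
    'heads': 8, 'casual': False, 'return_embeddings': True, 'num_label': 2,
}
qa_model = ReformerQA(config)
input_ids = torch.randint(0, 20000, (2, 512))
start_positions = torch.randint(0, 512, (2,))
end_positions = torch.randint(0, 512, (2,))
loss, start_logits, end_logits = qa_model(input_ids,
                                          start_positions=start_positions,
                                          end_positions=end_positions)
loss.backward()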
def __init__(self, config):
    super().__init__()
    args = ReformerConfig(vocab_size=config.vocab_size,
                          hidden_size=config.hidden_dim,
                          intermediate_size=config.hidden_dim * 4,
                          num_attention_heads=config.num_heads,
                          attention_probs_dropout_prob=config.dropout_prob,
                          hidden_dropout_prob=config.dropout_prob,
                          max_position_embeddings=config.max_token_len,
                          max_length=config.max_token_len,
                          num_hidden_layers=config.num_layers)
    self.config = config
    self.pad_token_id = config.pad_token_id

    num_tokens = args.vocab_size
    dim = args.hidden_size
    depth = args.num_hidden_layers
    max_seq_len = args.max_length
    heads = args.num_attention_heads
    lsh_dropout = args.attention_probs_dropout_prob
    ff_dropout = args.hidden_dropout_prob
    post_attn_dropout = args.attention_probs_dropout_prob
    layer_dropout = args.hidden_dropout_prob
    attn_chunks = args.attn_chunks
    num_mem_kv = args.num_mem_kv
    full_attn_thres = args.full_attn_thres
    reverse_thres = args.reverse_thres
    use_full_attn = args.use_full_attn
    n_hashes = args.n_hashes

    self.reformer = ReformerLM(
        num_tokens=num_tokens,            # vocab_size
        dim=dim,
        depth=depth,
        max_seq_len=max_seq_len,
        heads=heads,
        lsh_dropout=lsh_dropout,
        ff_dropout=ff_dropout,
        post_attn_dropout=post_attn_dropout,
        layer_dropout=layer_dropout,      # layer dropout from the 'Reducing Transformer Depth on Demand' paper
        attn_chunks=attn_chunks,          # process LSH attention in chunks; the only way for memory to fit when scaling to 16k tokens
        num_mem_kv=num_mem_kv,            # persistent learned memory key values, from the all-attention paper
        full_attn_thres=full_attn_thres,  # use full attention if context length is less than the set value -- worth experimenting with
        reverse_thres=reverse_thres,      # turn off reversibility (2x speed) for sequence lengths shorter than or equal to this value -- worth experimenting with
        use_full_attn=use_full_attn,
        n_hashes=n_hashes,
        return_embeddings=True)
    self.hidden2tag = nn.Linear(config.hidden_dim, config.tag_size)
    self.softmax = nn.LogSoftmax(dim=-1)
    self.crit = nn.NLLLoss()
def __init__(self, num_tokens, dim, depth, max_seq_len, heads, causal=True):
    super().__init__()
    self.reformer = ReformerLM(
        num_tokens=num_tokens,
        dim=dim,
        depth=depth,
        heads=heads,
        max_seq_len=max_seq_len,
        causal=causal,            # enable auto-regressive training
        return_embeddings=True    # return the reformer embeddings instead of logits
    )
    self.lm_head = nn.Linear(dim, num_tokens, bias=False)
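# A minimal sketch of a matching forward pass (assumed; the original forward is not
# shown): run the Reformer, then project the embeddings back to vocabulary logits,
# optionally returning a cross-entropy loss when labels are supplied.
def forward(self, input_ids, labels=None):
    hidden = self.reformer(input_ids)              # (batch, seq_len, dim)
    logits = self.lm_head(hidden)                  # (batch, seq_len, num_tokens)
    if labels is not None:
        loss = nn.CrossEntropyLoss()(logits.view(-1, logits.size(-1)), labels.view(-1))
        return loss, logits
    return logits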
def main():
    torch.manual_seed(9)

    # Config
    config = ModelConfig(
        config_path='../config/mlm/mlm-pretrain-small.json').get_config()

    # Tokenizer
    tokenizer = BertTokenizer(vocab_file=config.vocab_path, do_lower_case=False)

    # Dataset
    # dataset = NamuWikiDataset(tokenizer, max_len, path=mini_data_path)
    dataset = DatasetForMLM(tokenizer, config.max_seq_len, path=config.data_path)

    # Model
    model = ReformerLM(
        num_tokens=tokenizer.vocab_size,
        dim=config.dim,
        depth=config.depth,
        heads=config.n_head,
        max_seq_len=config.max_seq_len,
        causal=False  # no auto-regressive masking for MLM pretraining
    )

    trainer = ReformerTrainer(dataset,
                              model,
                              tokenizer,
                              model_name=config.model_name,
                              checkpoint_path=config.checkpoint_path,
                              max_len=config.max_seq_len,
                              train_batch_size=config.batch_size,
                              eval_batch_size=config.batch_size)

    train_dataloader, eval_dataloader = trainer.build_dataloaders(
        train_test_split=0.1)

    trainer.train(
        epochs=config.epochs,
        train_dataloader=train_dataloader,
        eval_dataloader=eval_dataloader,
        log_steps=config.log_steps,
        ckpt_steps=config.ckpt_steps,
        gradient_accumulation_steps=config.gradient_accumulation_steps)
def __init__(self, x_dim, y_dim, num_tokens, max_seq_len, dim, depth, heads,
             hidden_size=None, min_std=0.01, nan_value=-1, **kwargs):
    # hidden_size, min_std and nan_value were referenced but never defined in the
    # original snippet; they are exposed here as arguments (the defaults are assumptions).
    super().__init__()
    self._min_std = min_std
    self.nan_value = nan_value
    hidden_size = hidden_size if hidden_size is not None else dim

    enc_x_dim = x_dim + y_dim
    self.enc_emb = nn.Linear(enc_x_dim, hidden_size)
    encoder_norm = nn.LayerNorm(hidden_size)
    self.encoder = ReformerLM(num_tokens=num_tokens,
                              max_seq_len=max_seq_len,
                              dim=dim,
                              depth=depth,
                              heads=heads,
                              **kwargs)
    self.mean = nn.Linear(hidden_size, y_dim)
    self.std = nn.Linear(hidden_size, y_dim)
def __init__(self, num_tokens, dim, depth, max_seq_len, heads, num_labels=2, causal=False):
    super().__init__()
    self.reformer = ReformerLM(
        num_tokens=num_tokens,
        dim=dim,
        depth=depth,
        heads=heads,
        max_seq_len=max_seq_len,
        causal=causal,            # whether to train auto-regressively
        return_embeddings=True    # return the reformer embeddings instead of logits
    )
    self.mrc_head = ReformerMRCHead(dim, num_labels)
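# A minimal sketch of a matching forward pass (assumed; the original forward and
# ReformerMRCHead internals are not shown): run the Reformer and let the MRC head
# map the embeddings to its outputs (e.g. start/end logits for span prediction).
def forward(self, input_ids):
    hidden = self.reformer(input_ids)   # (batch, seq_len, dim)
    return self.mrc_head(hidden)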
def __init__(self, model_args={}):
    super().__init__()
    self.model_dim = model_args.get('model_dim', 300)
    self.max_encoder_len = model_args.get('max_encoder_len', 1604)
    self.max_decoder_len = model_args.get('max_decoder_len', 66)
    self.vocab_size = model_args.get('vocab_size', 4000)

    self.encoder = CRNN()
    self.decoder = ReformerLM(
        num_tokens=self.vocab_size,
        dim=self.model_dim,
        depth=2,
        heads=1,
        bucket_size=233,
        ff_dropout=0.2,
        causal=True,
        max_seq_len=self.max_decoder_len
    )

    if model_args.get('decoder_embedding', None) is not None:
        self.decoder.token_emb = nn.Embedding.from_pretrained(
            model_args['decoder_embedding'], freeze=False)
    else:
        self.decoder.token_emb = nn.Embedding(
            num_embeddings=self.vocab_size, embedding_dim=300, padding_idx=0)
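# A minimal sketch of how the two modules above might be wired together (assumed;
# the original forward pass is not shown). It presumes CRNN() returns a
# (batch, enc_len, model_dim) feature sequence, which the ReformerLM decoder can
# attend to through its `keys` argument, as in the context example elsewhere in
# this collection.
def forward(self, images, tgt_tokens):
    enc_out = self.encoder(images)                    # (batch, enc_len, model_dim)
    logits = self.decoder(tgt_tokens, keys=enc_out)   # (batch, dec_len, vocab_size)
    return logits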
def gen(text):
    model = ReformerLM(num_tokens=13137,
                       dim=128,
                       depth=12,
                       max_seq_len=4096,
                       lsh_dropout=0.1,
                       causal=True,
                       full_attn_thres=128)
    model = TrainingWrapper(model, ignore_index=0, pad_value=0).cpu()

    output_dir = "model"
    model_cpu_path = os.path.join(output_dir, 'model_cpu.pt')
    model.load_state_dict(torch.load(model_cpu_path))

    initial = auto_encode(text)
    # print(initial)
    sample = model.generate(
        initial, 10, temperature=1., filter_thres=0.9,
        eos_token=1)  # assume end token is 1, or omit and it will sample up to the requested length
    # print(sample)
    # print(sample.shape)  # (1, <=10) token ids
    text = tokenizer.convert_ids_to_tokens(sample.tolist()[0])
    print(text)
        self.writer.add_scalar('Perplexity', perplexity, eval_steps)
        self.writer.close()
        logging.info(
            f'{datetime.now()} | Step: {step} | Eval Loss: {eval_loss} | Perplexity: {perplexity}'
        )
        return None


if __name__ == '__main__':
    dataset = WikiDataset(path='./data/enwiki')
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    tokenizer.max_len = 128
    model = ReformerLM(num_tokens=tokenizer.vocab_size,
                       dim=512,
                       depth=6,
                       heads=8,
                       max_seq_len=tokenizer.max_len,
                       causal=True)
    parameters = filter(lambda p: p.requires_grad, model.parameters())

    parser = argparse.ArgumentParser(description='Reformer')
    # data
    # cuda
    parser.add_argument('--with_cuda',
                        default=False,
                        action='store_true',
                        dest='with_cuda',
                        help='use CPU in case there\'s no GPU support')
    parser.add_argument('--use_ema',
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='cuda', type=str, required=False,
                        help='which GPUs to use')
    # parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False,
    #                     help='model config file')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small_terry_ai.txt', type=str, required=False,
                        help='vocabulary file')
    parser.add_argument('--raw_data_path', default='data/train.json', type=str, required=False,
                        help='raw training corpus')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str, required=False,
                        help='where to store the tokenized corpus')
    parser.add_argument('--raw', action='store_true', help='tokenize the raw data first')
    parser.add_argument('--epochs', default=5, type=int, required=False, help='number of training epochs')
    parser.add_argument('--batch_size', default=2, type=int, required=False, help='training batch size')
    parser.add_argument('--lr', default=1e-8, type=float, required=False, help='learning rate')
    parser.add_argument('--warmup_steps', default=2000, type=int, required=False, help='number of warm-up steps')
    parser.add_argument('--log_step', default=1, type=int, required=False, help='report the loss every this many steps')
    parser.add_argument('--stride', default=500, type=int, required=False, help='stride of the sliding window')
    parser.add_argument('--dim', default=1024, type=int, required=False,
                        help='length of a single training sample (the data window taken at each stride)')
    parser.add_argument('--gradient_accumulation', default=5, type=int, required=False, help='gradient accumulation steps')
    parser.add_argument('--fp16', action='store_true', help='mixed precision')
    parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--num_pieces', default=10, type=int, required=False,
                        help='number of pieces to split the training corpus into')
    parser.add_argument('--min_length', default=64, type=int, required=False,
                        help='minimum article length to include')
    parser.add_argument('--output_dir', default='model/', type=str, required=False, help='model output path')
    parser.add_argument('--pretrained_model', default='', type=str, required=False,
                        help='path of the model to resume training from')
    # parser.add_argument('--writer_dir', default='tensorboard_summary/', type=str, required=False, help='Tensorboard path')
    parser.add_argument('--segment', action='store_true', help='segment Chinese text by words')
    parser.add_argument('--bpe_token', action='store_true', help='subword')
    # parser.add_argument('--dim', default=1024, type=int, required=False, help='dim')
    parser.add_argument('--depth', default=12, type=int, required=False, help='depth')
    parser.add_argument('--full_attn_thres', default=1024, type=int, required=False, help='full_attn_thres')
    parser.add_argument('--max_seq_len', default=4096, type=int, required=False, help='max_seq_len')
    # parser.add_argument('--encoder_json', default="tokenizations/encoder.json", type=str, help="encoder.json")
    # parser.add_argument('--vocab_bpe', default="tokenizations/vocab.bpe", type=str, help="vocab.bpe")
    args = parser.parse_args()

    full_tokenizer = tokenizer_plus(args.tokenizer_path)

    config_file = os.path.join(args.output_dir, 'config.json')
    Config = tkitJson.Config(config_file)
    new_conf = {
        'num_tokens': full_tokenizer.vocab_size,
        'dim': args.dim,  # same as the window length
        'depth': args.depth,
        'max_seq_len': args.max_seq_len,
        'lsh_dropout': 0.1,
        'causal': True,
        'full_attn_thres': args.full_attn_thres,
        'stride': args.stride,  # sliding-window stride
    }
    print("new_conf:", new_conf)
    Config.save(new_conf)
    # copy the vocabulary file
    shutil.copy(args.tokenizer_path, os.path.join(args.output_dir, 'vocab.txt'))

    print('args:\n' + args.__repr__())

    # if args.segment:
    #     from tokenizations import tokenization_bert_word_level as tokenization_bert
    # else:
    #     from tokenizations import tokenization_bert
    os.environ["CUDA_VISIBLE_DEVICES"] = '0,1,2,3'  # choose which GPUs the program may use
    # model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(args.model_config)
    # print('config:\n' + model_config.to_json_string())
    # dim = model_config.dim
    # if args.bpe_token:
    #     full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    # else:
    #     full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    # full_tokenizer = BertTokenizer.from_pretrained(args.tokenizer_path)
    # full_tokenizer.max_len = dim
    # if args.device == ''
    device = 'cuda' if torch.cuda.is_available() else 'cpu'  # or force CPU here
    device = args.device
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to build the dataset from scratch
    pretrained_model = args.pretrained_model
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    dim = args.dim
    if stride >= dim:
        stride = dim // 2 - 2  # keep the stride an integer; it is used for slicing below
    gradient_accumulation = args.gradient_accumulation
    # fp16 = args.fp16  # do not enable on GPUs without half-precision support
    # fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    # tb_writer = SummaryWriter(log_dir=args.writer_dir)

    # paths of the previously saved model
    model_path = os.path.join(pretrained_model, 'model.pt')
    optimizer_path = os.path.join(pretrained_model, 'optimizer.pt')
    scheduler_path = os.path.join(pretrained_model, 'scheduler.pt')
    # output paths
    output_model_path = os.path.join(output_dir, 'model.pt')
    output_optimizer_path = os.path.join(output_dir, 'optimizer.pt')
    output_scheduler_path = os.path.join(output_dir, 'scheduler.pt')
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if raw:
        print('building files')
        build_files(data_path=raw_data_path,
                    tokenized_data_path=tokenized_data_path,
                    num_pieces=num_pieces,
                    full_tokenizer=full_tokenizer,
                    min_length=min_length)
        print('files built')

    model = ReformerLM(
        num_tokens=full_tokenizer.vocab_size,
        dim=dim,  # window length
        depth=args.depth,
        max_seq_len=args.max_seq_len,
        lsh_dropout=0.1,
        causal=True,
        full_attn_thres=args.full_attn_thres
    )
    # 0 is used for padding and no loss is calculated on it
    if device == 'cuda':
        model = TrainingWrapper(model, ignore_index=0, pad_value=0).to('cuda')
    else:
        model = TrainingWrapper(model, ignore_index=0, pad_value=0)
    if os.path.isfile(model_path):
        # if so, load it
        model.load_state_dict(torch.load(model_path))
    else:
        pass
    model.train()

    weight_decay = 0.0
    # learning_rate = 5e-5
    adam_epsilon = 1e-8
    # warmup_steps = 0
    max_grad_norm = 1.0
    max_steps = -1
    # gradient_accumulation_steps = 10
    logging_steps = 1000
    save_steps = 10000

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': weight_decay
        },
        {
            'params': [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }
    ]

    full_len = 0
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation)
    print('total steps = {}'.format(total_steps))
    # total_steps = len(x_train_text) / gradient_accumulation_steps * num_train_epochs
    # t_total = 3 / 1 * 3
    # optimizer = AdamW(model.parameters(), lr=lr, correct_bias=True)
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=lr, eps=adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer=optimizer,
                                                num_warmup_steps=warmup_steps,
                                                num_training_steps=total_steps)
    # check if a saved optimizer/scheduler exists
    if os.path.isfile(optimizer_path) and os.path.isfile(scheduler_path):
        # if so, load them
        optimizer.load_state_dict(torch.load(optimizer_path))
        scheduler.load_state_dict(torch.load(scheduler_path))
    print("optimizer", optimizer)
    loss_fn = nn.CrossEntropyLoss()

    print('starting training')
    overall_step = 0
    running_loss = 0
    gradient_accumulation_run = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        # model.zero_grad()  # reset gradients
        # for piece_num, i in tqdm(enumerate(x)):
        for piece_num, i in enumerate(x):
            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            # print(len(tokens))

            # cut the data with a sliding window
            start_point = 0
            samples = []
            while start_point < len(tokens) - dim:
                samples.append(tokens[start_point: start_point + dim])
                # print(start_point, start_point + dim)
                start_point += stride
            if start_point < len(tokens):
                samples.append(tokens[len(tokens) - dim:])
            # shuffle the samples to reduce overfitting
            random.shuffle(samples)

            for step in range(len(samples) // batch_size):  # drop last
                # prepare data
                batch = samples[step * batch_size: (step + 1) * batch_size]
                # batch_labels = []
                batch_inputs = []
                for ids in batch:
                    # int_ids_for_labels = [int(x) for x in ids]
                    int_ids_for_inputs = [int(x) for x in ids]
                    # batch_labels.append(int_ids_for_labels)
                    batch_inputs.append(int_ids_for_inputs)
                if device == 'cuda':
                    batch_inputs = torch.tensor(batch_inputs).long().to("cuda")
                    # batch_labels = torch.tensor(batch_labels).long().to("cuda")
                else:
                    batch_inputs = torch.tensor(batch_inputs).long()
                    # batch_labels = torch.tensor(batch_labels).long()

                loss = model(batch_inputs, return_loss=True)
                loss = loss / gradient_accumulation
                loss.backward()
                # print(loss.sum())
                if ((gradient_accumulation_run + 1) % gradient_accumulation) == 0:
                    # step the optimizer and scheduler, then reset the gradients
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                    end = datetime.now()
                    print("epoch:", epoch + 1,
                          " piece_num:", piece_num, '/', num_pieces,
                          " step:", overall_step + 1, '/', total_steps,
                          " progress:", (overall_step + 1) / total_steps,
                          " loss:", loss.item(), 'Time', end - now)
                overall_step += 1
                gradient_accumulation_run = gradient_accumulation_run + 1
            # scheduler.step()
            # model.zero_grad()
        # end = datetime.now()
        # print("one piece:", end - now, " s")
        torch.save(model.state_dict(), output_model_path)
        torch.save(optimizer.state_dict(), output_optimizer_path)
        torch.save(scheduler.state_dict(), output_scheduler_path)
    model_cpu_path = os.path.join(output_dir, 'model_cpu.pt')
    torch.save(model.cpu().state_dict(), model_cpu_path)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer.max_len = 40960

model = ReformerLM(
    num_tokens=tokenizer.vocab_size,
    dim=768,
    depth=12,
    max_seq_len=tokenizer.max_len,
    heads=48,
    lsh_dropout=0.1,
    causal=False,              # auto-regressive or not
    bucket_size=64,            # average size of qk per bucket, 64 was recommended in the paper
    n_hashes=4,                # 4 is permissible per author, 8 is the best but slower
    ff_chunks=200,             # number of chunks for the feedforward layer, make higher if there are memory issues
    weight_tie=False,          # tie parameters of each layer for no extra memory per additional depth
    attn_chunks=8,             # process lsh attention in chunks, the only way for memory to fit when scaling to 16k tokens
    num_mem_kv=128,            # persistent learned memory key values, from the all-attention paper
    twin_attention=False,      # both branches of the reversible network will be attention
    use_full_attn=False,       # use full self attention, for comparison
    full_attn_thres=128,       # use full attention if context length is less than the set value
    use_scale_norm=True,       # use scale norm from the 'Transformers without tears' paper
    axial_position_emb=True,
    axial_position_shape=(640, 64),
    axial_position_dims=(384, 384))
model.train()
model.to(devices)
    return str(chr(max(32, token)))


def decode_tokens(tokens):
    return ''.join(list(map(decode_token, tokens)))


# instantiate model
model = ReformerLM(
    dim=512,
    depth=6,
    max_seq_len=SEQ_LEN,
    num_tokens=256,
    heads=8,
    bucket_size=64,
    n_hashes=4,
    ff_chunks=10,
    lsh_dropout=0.1,
    weight_tie=True,
    causal=True,
    use_full_attn=False  # set this to true for comparison with full attention
)
model = TrainingWrapper(model)
model.cuda()

# prepare enwik8 data
with gzip.open('./data/enwik8.gz') as file:
    X = np.fromstring(file.read(int(95e6)), dtype=np.uint8)
    trX, vaX = np.split(X, [int(90e6)])
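# A minimal sketch of one training step on the data prepared above (assumed; the
# rest of the original training loop is not shown here). TrainingWrapper accepts
# return_loss=True, as used by the other training scripts in this collection;
# sample_batch is a hypothetical helper that randomly crops byte sequences.
import random


def sample_batch(data, batch_size=4, seq_len=SEQ_LEN):
    # randomly crop `batch_size` byte sequences of length seq_len
    starts = [random.randint(0, len(data) - seq_len - 1) for _ in range(batch_size)]
    batch = np.stack([data[s:s + seq_len] for s in starts])
    return torch.from_numpy(batch).long().cuda()


x = sample_batch(trX)
loss = model(x, return_loss=True)
loss.backward()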
    # print(tokenizer)
    config = AlbertConfig.from_pretrained(path)
    model = AlbertModel.from_pretrained(path, config=config)
    return model, tokenizer


# load the ALBERT model
path = "model/albert_tiny/"
albert_model, full_tokenizer = load_albert(path)
# outputs = albert_model(batch_inputs)

model = ReformerLM(num_tokens=20000,
                   dim=1024,
                   depth=12,
                   max_seq_len=4096,
                   lsh_dropout=0.1,
                   causal=True,
                   full_attn_thres=1024)

# 0 is used for padding and no loss is calculated on it
model = TrainingWrapper(model, ignore_index=0, pad_value=0)

# the wrapper can handle evenly packed sequences
x_train = randint(0, 20000, (3, 357))

# or if you have a list of uneven sequences, it will be padded for you
x_train = [
    randint(0, 20000, (120,)),
    randint(0, 20000, (253,)),
    randint(0, 20000, (846,))
# pretrained_weights = 'cache/vocab_small_terry_ai.txt'
device = 'cpu'
output_dir = 'model'
pretrained_weights = os.path.join(output_dir, 'vocab.txt')
config_file = os.path.join(output_dir, 'config.json')
Config = tkitJson.Config(config_file)
conf = Config.read()

# tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
tokenizer = tokenizer_plus(pretrained_weights)

model = ReformerLM(num_tokens=conf['num_tokens'],
                   dim=conf['dim'],
                   depth=conf['depth'],
                   max_seq_len=conf['max_seq_len'],
                   lsh_dropout=conf['lsh_dropout'],
                   causal=conf['causal'],
                   full_attn_thres=conf['full_attn_thres'])

model_path = os.path.join(output_dir, 'model.pt')
if device == 'cuda':
    model = TrainingWrapper(model, ignore_index=0, pad_value=0).cuda()
    if os.path.isfile(model_path):
        # if so, load it; note that load_state_dict returns a key-matching report,
        # not the model, so it must not be chained with .cuda()
        model.load_state_dict(torch.load(model_path))
else:
    model = TrainingWrapper(model, ignore_index=0, pad_value=0).cpu()
    # print(model)
def main():
    torch.manual_seed(9)

    # 1. Config
    train_config, gen_config, disc_config = ElectraConfig(
        config_path='../config/electra/electra-train.json').get_config()

    # 2. Tokenizer
    tokenizer = BertTokenizer(vocab_file=train_config.vocab_path, do_lower_case=False)

    # 3. Dataset
    dataset = ElectraDataset(tokenizer, train_config.max_len, data_path=train_config.data_path)

    # 4. Electra Model
    # 4.1 instantiate the generator and discriminator,
    #     making sure that the generator is roughly a quarter to a half of the size of the discriminator
    generator = ReformerLM(
        num_tokens=tokenizer.vocab_size,
        emb_dim=gen_config.emb_dim,
        dim=gen_config.emb_dim,         # smaller hidden dimension
        heads=gen_config.heads,         # less heads
        ff_mult=gen_config.ff_mult,     # smaller feed forward intermediate dimension
        dim_head=gen_config.dim_head,
        depth=gen_config.depth,
        max_seq_len=train_config.max_len)

    discriminator = ReformerLM(
        num_tokens=tokenizer.vocab_size,
        emb_dim=disc_config.emb_dim,
        dim=disc_config.dim,
        dim_head=disc_config.dim_head,
        heads=disc_config.heads,
        depth=disc_config.depth,
        ff_mult=disc_config.ff_mult,
        max_seq_len=train_config.max_len,
        return_embeddings=True,
    )

    # 4.2 weight tie the token and positional embeddings of generator and discriminator
    generator.token_emb = discriminator.token_emb
    generator.pos_emb = discriminator.pos_emb
    # weight tie any other embeddings if available, e.g. token type embeddings

    # 4.3 instantiate electra
    discriminator_with_adapter = nn.Sequential(discriminator, nn.Linear(disc_config.dim, 1))
    model = Electra(
        generator,
        discriminator_with_adapter,
        mask_token_id=tokenizer.mask_token_id,           # the token id reserved for masking
        pad_token_id=tokenizer.pad_token_id,             # the token id for padding
        mask_prob=0.15,                                  # masking probability for masked language modeling
        mask_ignore_token_ids=tokenizer.all_special_ids  # ids of tokens to ignore when masking, e.g. (cls, sep)
    )

    trainer = ElectraTrainer(dataset,
                             model,
                             tokenizer,
                             train_config.max_len,
                             checkpoint_path=train_config.checkpoint_path,
                             model_name=train_config.model_name,
                             train_batch_size=train_config.batch_size,
                             eval_batch_size=train_config.batch_size)
    train_dataloader, eval_dataloader = trainer.build_dataloaders(train_test_split=0.1)

    model = trainer.train(
        epochs=train_config.epochs,
        train_dataloader=train_dataloader,
        eval_dataloader=eval_dataloader,
        log_steps=train_config.log_steps,
        ckpt_steps=train_config.ckpt_steps,
        gradient_accumulation_steps=train_config.gradient_accumulation_steps)
import re
import os
from tqdm import tqdm, tqdm_notebook
from glob import glob
import json

tokenizer = BertTokenizer.from_pretrained(
    '/home/huanghaiping/Research/Data/cased_L-12_H-768_A-12/')
tokenizer.max_len = 3072

model = ReformerLM(
    num_tokens=tokenizer.vocab_size,
    dim=100,
    depth=2,
    heads=2,
    max_seq_len=tokenizer.max_len,
    causal=True
)

test = 'Hello, my dog is cute'
tok = tokenizer.encode(test, max_length=tokenizer.max_len, add_special_tokens=True)
print("tok: ", tok)

tokens = []
for ii in range(32):
    tokens.append(tok)
# tok = np.array(tokens)
tok = torch.tensor(tok, dtype=torch.long)
from reformer_pytorch import ReformerLM
from reformer_pytorch.generative_tools import TrainingWrapper
import torch
from transformers import *
import os

pretrained_weights = 'cache/vocab_small_terry_ai.txt'
device = 'cpu'
output_dir = 'model'
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)

model = ReformerLM(
    num_tokens=13137,
    dim=1024,
    depth=12,
    max_seq_len=4096,
    lsh_dropout=0.1,
    causal=True,
    full_attn_thres=1024
)

model_path = os.path.join(output_dir, 'model.pt')
if device == 'cuda':
    model = TrainingWrapper(model, ignore_index=0, pad_value=0).cuda()
    if os.path.isfile(model_path):
        # if so, load it; load_state_dict returns a key-matching report,
        # so it must not be chained with .cuda()
        model.load_state_dict(torch.load(model_path))
else:
import torch
from reformer_pytorch import ReformerLM

CONTEXT_LEN = 512
SEQ_LEN = 8192

model = ReformerLM(num_tokens=20000,
                   dim=1024,
                   depth=1,
                   max_seq_len=SEQ_LEN,
                   ff_chunks=8,
                   causal=True)

c = torch.randn(1, CONTEXT_LEN, 1024)
x = torch.randint(0, 20000, (1, SEQ_LEN)).long()

i_mask = torch.ones(1, SEQ_LEN).bool()
c_mask = torch.ones(1, CONTEXT_LEN).bool()

y = model(x, keys=c, input_mask=i_mask, context_mask=c_mask)
# masking done correctly in LSH attention
def test_encdec_v1(input_lang, target_lang, dim, bucket_size, depth, heads, n_hashes, vir_seq_len, ff_chunks, attn_chunks, mol_seq_len, cmd_args, train_dataset, test_dataset, output_folder, train_batch_size, epochs, validate_every, save_every, checkpoint_id, deepspeed_optimizer, use_full_attn, gradient_accumulation_steps, filter_thres): results = { 'generated_seq': [], 'generated_mol': [], 'target_mol': [], 'input_genome': [] } encoder = ReformerLM( num_tokens=input_lang.n_words, dim=dim, bucket_size=bucket_size, depth=depth, heads=heads, n_hashes=n_hashes, max_seq_len=vir_seq_len, ff_chunks=ff_chunks, attn_chunks=attn_chunks, weight_tie=True, weight_tie_embedding=True, axial_position_emb=True, axial_position_shape=compute_axial_position_shape(vir_seq_len), axial_position_dims=(dim // 2, dim // 2), return_embeddings=True, use_full_attn=use_full_attn).to(device) decoder = ReformerLM( num_tokens=target_lang.n_words, dim=dim, bucket_size=bucket_size, depth=depth, heads=heads, n_hashes=n_hashes, ff_chunks=ff_chunks, attn_chunks=attn_chunks, max_seq_len=mol_seq_len, axial_position_emb=True, axial_position_shape=compute_axial_position_shape(mol_seq_len), axial_position_dims=(dim // 2, dim // 2), weight_tie=True, weight_tie_embedding=True, causal=True, use_full_attn=use_full_attn).to(device) SAVE_DIR = os.sep.join([output_folder, 'saved_model']) if checkpoint_id: enc_ckp_max = checkpoint_id dec_ckp_max = checkpoint_id else: try: enc_ckp_max = np.max([ int(ckp) for ckp in os.listdir(os.sep.join([SAVE_DIR, 'encoder'])) ]) except Exception as e: print('Exception:', e) enc_ckp_max = 0 try: dec_ckp_max = np.max([ int(ckp) for ckp in os.listdir(os.sep.join([SAVE_DIR, 'decoder'])) ]) except: dec_ckp_max = 0 encoder = TrainingWrapper(encoder, ignore_index=PAD_IDX, pad_value=PAD_IDX).to(device) decoder = TrainingWrapper(decoder, ignore_index=PAD_IDX, pad_value=PAD_IDX).to(device) ''' encoder_params = filter(lambda p: p.requires_grad, encoder.parameters()) decoder_params = filter(lambda p: p.requires_grad, decoder.parameters()) if deepspeed_optimizer == False: print('No DeepSpeed optimizer found. Using RangerLars.') encoder_optimizer = RangerLars(encoder.parameters()) decoder_optimizer = RangerLars(decoder.parameters()) encoder_engine, encoder_optimizer, trainloader, _ = deepspeed.initialize( args=cmd_args, model=encoder, optimizer=encoder_optimizer, model_parameters=encoder_params, training_data=train_dataset, dist_init_required=True ) decoder_engine, decoder_optimizer, testloader, _ = deepspeed.initialize( args=cmd_args, model=decoder, optimizer=decoder_optimizer, model_parameters=decoder_params, training_data=test_dataset, dist_init_required=False ) else: print('Found optimizer in the DeepSpeed configurations. 
Using it.') encoder_engine, encoder_optimizer, trainloader, _ = deepspeed.initialize(args=cmd_args, model=encoder, model_parameters=encoder_params, training_data=train_dataset, dist_init_required=True) decoder_engine, decoder_optimizer, testloader, _ = deepspeed.initialize(args=cmd_args, model=decoder, model_parameters=decoder_params, training_data=test_dataset, dist_init_required=False) _, encoder_client_sd = encoder_engine.load_checkpoint(os.sep.join([SAVE_DIR,'encoder']), enc_ckp_max) _, decoder_client_sd = decoder_engine.load_checkpoint(os.sep.join([SAVE_DIR,'decoder']), dec_ckp_max) gpus_mini_batch = (train_batch_size// gradient_accumulation_steps) // torch.cuda.device_count() print('gpus_mini_batch:', gpus_mini_batch, 'with gradient_accumulation_steps:', gradient_accumulation_steps) for pair in tqdm(testloader): encoder_engine.eval() decoder_engine.eval() encoder.eval() decoder.eval() with torch.no_grad(): ts_src = pair[0] ts_trg = pair[1] input_genome = [[input_lang.index2word[gen_idx.item()] for gen_idx in smpl] for smpl in pair[0]] target_mol = [[target_lang.index2word[mol_idx.item()] for mol_idx in smpl] for smpl in pair[1]] ts_src = ts_src.to(encoder_engine.local_rank) #ts_src.to(device) # ts_trg = ts_trg.to(decoder_engine.local_rank) #ts_trg.to(device) # print('ts_src.shape', ts_src.shape) print('ts_src.shape', ts_trg.shape) enc_keys = encoder(ts_src) #encoder_engine(ts_src) yi = torch.tensor([[SOS_token] for _ in range(gpus_mini_batch)]).long().to(decoder_engine.local_rank) #to(device) # #sample = decoder_engine.generate(yi, mol_seq_len, filter_logits_fn=top_p, filter_thres=0.95, keys=enc_keys, eos_token = EOS_token) sample = decoder.generate(yi, mol_seq_len, filter_logits_fn=top_p, filter_thres=0.95, keys=enc_keys, eos_token = EOS_token) actual_mol = [] for mol_seq in sample.cpu().numpy(): for mol_idx in mol_seq: actual_mol.append(target_lang.index2word[mol_idx]) print('Generated Seq:', sample) print('Generated Mol:', actual_mol) print('Real Mol:', target_mol[:target_mol.index(target_lang.index2word[EOS_token])]) results['generated_seq'].append(sample) results['generated_mol'].append(actual_mol) results['target_mol'].append(target_mol) results['input_genome'].append(input_genome) print('Saving Test Results..') pickle.dump(results, open(os.sep.join([output_folder,'test_results.pkl']), 'wb')) ''' encoder_checkpoint = os.sep.join([ output_folder, 'saved_model', 'encoder', enc_ckp_max, 'mp_rank_00_model_states.pt' ]) decoder_checkpoint = os.sep.join([ output_folder, 'saved_model', 'decoder', dec_ckp_max, 'mp_rank_00_model_states.pt' ]) encoder.load_state_dict( torch.load(encoder_checkpoint, map_location=torch.device(device))['module']) decoder.load_state_dict( torch.load(decoder_checkpoint, map_location=torch.device(device))['module']) real_batch_size = train_batch_size // gradient_accumulation_steps test_loader = DataLoader(dataset=test_dataset, batch_size=real_batch_size, shuffle=True) if torch.cuda.device_count() > 1: print("Let's use", torch.cuda.device_count(), "GPUs!") # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] 
on 3 GPUs encoder = nn.DataParallel(encoder) decoder = nn.DataParallel(decoder) encoder.to(device) decoder.to(device) for pair in tqdm(test_loader): encoder.eval() decoder.eval() with torch.no_grad(): ts_src = torch.tensor(np.array([pair[0].numpy()])).to(device) ts_trg = torch.tensor(np.array([pair[1].numpy()])).to(device) input_genome = [ input_lang.index2word[gen_idx.item()] for gen_idx in pair[0] ] target_mol = [ target_lang.index2word[mol_idx.item()] for mol_idx in pair[1] ] enc_keys = encoder(ts_src) yi = torch.tensor([[SOS_token]]).long().to(device) sample = decoder.generate(yi, mol_seq_len, filter_logits_fn=top_p, filter_thres=filter_thres, keys=enc_keys, eos_token=EOS_token) actual_mol = [] for mol_seq in sample.cpu().numpy(): for mol_idx in mol_seq: actual_mol.append(target_lang.index2word[mol_idx]) print('Generated Seq:', sample) print('Generated Mol:', actual_mol) print( 'Real Mol:', target_mol[:target_mol.index(target_lang. index2word[EOS_token])]) results['generated_seq'].append(sample) results['generated_mol'].append(actual_mol) results['target_mol'].append(target_mol) results['input_genome'].append(input_genome) print('Saving Test Results..') pickle.dump(results, open(os.sep.join([output_folder, 'test_results.pkl']), 'wb')) '''
import torch
from torch import nn
from reformer_pytorch import ReformerLM
from electra_pytorch import Electra

# (1) instantiate the generator and discriminator,
#     making sure that the generator is roughly a quarter to a half of the size of the discriminator
generator = ReformerLM(
    num_tokens=20000,
    emb_dim=128,
    dim=256,        # smaller hidden dimension
    heads=4,        # less heads
    ff_mult=2,      # smaller feed forward intermediate dimension
    dim_head=64,
    depth=12,
    max_seq_len=1024)

discriminator = ReformerLM(num_tokens=20000,
                           emb_dim=128,
                           dim=1024,
                           dim_head=64,
                           heads=16,
                           depth=12,
                           ff_mult=4,
                           max_seq_len=1024)

# (2) weight tie the token and positional embeddings of generator and discriminator
generator.token_emb = discriminator.token_emb
generator.pos_emb = discriminator.pos_emb
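# (3) a minimal sketch of the remaining wiring (assumed here, mirroring the Electra
#     usage shown elsewhere in this collection): wrap the discriminator with an
#     adapter that maps hidden states to a single logit, instantiate Electra, and
#     run one training step on random token ids. The token ids passed to
#     mask_token_id / pad_token_id / mask_ignore_token_ids are placeholder assumptions.
discriminator_with_adapter = nn.Sequential(discriminator, nn.Linear(1024, 1))

electra = Electra(
    generator,
    discriminator_with_adapter,
    mask_token_id=4,               # assumed id reserved for [MASK]
    pad_token_id=0,                # assumed id used for padding
    mask_prob=0.15,                # masking probability for masked language modeling
    mask_ignore_token_ids=[1, 2]   # assumed special token ids to never mask
)

data = torch.randint(0, 20000, (1, 1024))
results = electra(data)
results.loss.backward()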
def main(): cmd_args = add_argument() path_to_file_tr = cmd_args.path_to_file_tr path_to_file_ts = cmd_args.path_to_file_ts min_len_mol = cmd_args.min_len_mol max_len_mol = cmd_args.max_len_mol num_examples_tr = cmd_args.num_examples_tr num_examples_ts = cmd_args.num_examples_ts train_batch_size = json.load(open(cmd_args.ds_conf))['train_batch_size'] gradient_accumulation_steps = json.load(open( cmd_args.ds_conf))['gradient_accumulation_steps'] deepspeed_optimizer = True if json.load(open(cmd_args.ds_conf)).get( 'optimizer', None) is not None else False epochs = cmd_args.epochs emb_dim = cmd_args.emb_dim dim = cmd_args.dim bucket_size = cmd_args.bucket_size depth = cmd_args.depth heads = cmd_args.heads n_hashes = cmd_args.n_hashes ff_chunks = cmd_args.ff_chunks attn_chunks = cmd_args.attn_chunks validate_every = cmd_args.validate_every save_every = cmd_args.save_every output_folder = cmd_args.output_folder use_full_attn = cmd_args.use_full_attn mrpc_test = cmd_args.mrpc_test use_deepspeed = cmd_args.use_deepspeed os.makedirs(output_folder, exist_ok=True) pickle.dump(cmd_args, open(os.sep.join([output_folder, 'training_conf.pkl']), 'wb')) MIN_LENGTH_MOL = min_len_mol MAX_LENGTH_MOL = max_len_mol # 2048 NUM_EXAMPLES_TR = num_examples_tr # 1024 NUM_EXAMPLES_TS = num_examples_ts # 1024 N_EPOCHS = epochs # 10 VALIDATE_EVERY = validate_every SAVE_EVERY = save_every MOL_SEQ_LEN = MAX_LENGTH_MOL # output_lang.max_len if (output_lang.max_len % 2) == 0 else output_lang.max_len + 1 # ?? saved_mol_lang = os.sep.join([output_folder, 'mol_lang.pkl']) MAX_LENGTH_MOL = cmd_args.max_len_mol saved_target_lang = os.sep.join([output_folder, 'mol_lang.pkl']) if mrpc_test: mol_lang, tr_samples, ts_samples = readMRPC( molecule_file_tr=path_to_file_tr, molecule_file_ts=path_to_file_ts, saved_molecule_lang=saved_target_lang, num_examples_tr=NUM_EXAMPLES_TR, num_examples_ts=NUM_EXAMPLES_TS, min_len_molecule=MIN_LENGTH_MOL, max_len_molecule=MAX_LENGTH_MOL, shuffle=True) else: mol_lang, tr_samples, ts_samples = readMolecules( molecule_file_tr=path_to_file_tr, molecule_file_ts=path_to_file_ts, saved_molecule_lang=saved_target_lang, num_examples_tr=NUM_EXAMPLES_TR, num_examples_ts=NUM_EXAMPLES_TS, min_len_molecule=MIN_LENGTH_MOL, max_len_molecule=MAX_LENGTH_MOL, shuffle=True) pickle.dump(mol_lang, open(saved_mol_lang, 'wb')) train_dataset = MolecularSimilarityDataset( tr_samples, mol_lang, train_batch_size if device == 'cuda' else 1) test_dataset = MolecularSimilarityDataset( ts_samples, mol_lang, train_batch_size if device == 'cuda' else 1) MAX_SEQ_LEN = MOL_SEQ_LEN * 2 print('Axial Embedding shape:', compute_axial_position_shape(MAX_SEQ_LEN)) model = ReformerLM( num_tokens=mol_lang.n_words, dim=dim, bucket_size=bucket_size, depth=depth, heads=heads, n_hashes=n_hashes, max_seq_len=MAX_SEQ_LEN, ff_chunks=ff_chunks, attn_chunks=attn_chunks, weight_tie=True, weight_tie_embedding=True, axial_position_emb=True, axial_position_shape=compute_axial_position_shape(MAX_SEQ_LEN), axial_position_dims=(dim // 2, dim // 2), return_embeddings=True, use_full_attn=use_full_attn).to(device) linear_regressor = Linear(512, 2).to(device) model = TrainingWrapper(model, ignore_index=PAD_IDX, pad_value=PAD_IDX).to(device) model_params = filter(lambda p: p.requires_grad, model.parameters()) linear_params = filter(lambda p: p.requires_grad, linear_regressor.parameters()) SAVE_DIR = os.sep.join([output_folder, 'saved_model']) os.makedirs(SAVE_DIR, exist_ok=True) try: model_ckp_max = np.max( [int(ckp) for ckp in os.listdir(os.sep.join([SAVE_DIR, 
'model']))]) except: model_ckp_max = 0 gpus_mini_batch = (train_batch_size // gradient_accumulation_steps ) // torch.cuda.device_count() print('gpus_mini_batch:', gpus_mini_batch, 'with gradient_accumulation_steps:', gradient_accumulation_steps) log_file = open(os.sep.join([output_folder, 'training_log.log']), 'a') log_file.write( "\n\n\n{}\tStarting new training from chekpoint: EncoderDecoder-{}\n". format(datetime.datetime.now(), model_ckp_max)) log_file.flush() if use_deepspeed: if deepspeed_optimizer == False: print('No DeepSpeed optimizer found. Using RangerLars.') model_optimizer = RangerLars(model.parameters()) linear_optimizer = RangerLars(linear_regressor.parameters()) model_engine, model_optimizer, trainloader, _ = deepspeed.initialize( args=cmd_args, model=model, optimizer=model_optimizer, model_parameters=model_params, training_data=train_dataset) linear_engine, linear_optimizer, _, _ = deepspeed.initialize( args=cmd_args, model=linear_regressor, optimizer=linear_optimizer, model_parameters=linear_params) else: print('Found optimizer in the DeepSpeed configurations. Using it.') model_engine, model_optimizer, trainloader, _ = deepspeed.initialize( args=cmd_args, model=model, model_parameters=model_params, training_data=train_dataset) linear_engine, linear_optimizer, _, _ = deepspeed.initialize( args=cmd_args, model=linear_regressor, model_parameters=linear_params) _, model_client_sd = model_engine.load_checkpoint( os.sep.join([SAVE_DIR, 'model']), model_ckp_max) testloader = model_engine.deepspeed_io(test_dataset) ######TO DO for eph in range(epochs): print('Starting Epoch: {}'.format(eph)) for i, pair in enumerate(tqdm(trainloader)): tr_step = ((eph * len(trainloader)) + i) + 1 src = pair[0] trg = pair[1] pickle.dump(src, open('src.pkl', 'wb')) pickle.dump(trg, open('trg.pkl', 'wb')) model_engine.train() linear_engine.train() #enc_dec.train() src = src.to(model_engine.local_rank) trg = trg.to(linear_engine.local_rank) print("Sample:", src) print("Target:", trg) print("Target Shape:", trg.shape) print("len Samples:", len(src)) ## Need to learn how to use masks correctly enc_input_mask = torch.tensor( [[1 if idx != PAD_IDX else 0 for idx in smpl] for smpl in src]).bool().to(model_engine.local_rank) # context_mask = torch.tensor([[1 for idx in smpl if idx != PAD_IDX] for smpl in trg]).bool().to(device) ################# enc_keys = model_engine( src, return_loss=False, input_mask=enc_input_mask ) #enc_input_mask)#, context_mask=context_mask) #loss = enc_dec(src, trg, return_loss = True, enc_input_mask = None)#enc_input_mask)#, context_mask=context_mask) print('enc_keys shape', enc_keys.shape) #enc_keys_cls = enc_keys[:,0:1,:].to(linear_engine.local_rank)#torch.tensor([s[0] for s in enc_keys]).to(linear_engine.local_rank) #print('enc_keys_cls shape', enc_keys_cls.shape) preds = torch.softmax(linear_engine(enc_keys), dim=1).to(linear_engine.local_rank) print('preds shape', preds.shape) #preds = np.array([r[0] for r in results]) #print('Pred:', preds.shape) loss = F.cross_entropy(preds, trg).to(linear_engine.local_rank) loss.backward() model_engine.step() linear_engine.step() print('Training Loss:', loss.item()) if tr_step % validate_every == 0: val_loss = [] for pair in tqdm( testloader ): #Can't use the testloader or I will mess up with the model assignment and it won't learn during training, need to use normal validation instead of parallel one model_engine.eval() linear_engine.eval() with torch.no_grad(): ts_src = pair[0] ts_trg = pair[1] pickle.dump(ts_src, open('ts_src.pkl', 
'wb')) pickle.dump(ts_trg, open('ts_trg.pkl', 'wb')) ts_src = ts_src.to(model_engine.local_rank) ts_trg = ts_trg.to(linear_engine.local_rank) #ts_src = torch.tensor(np.array([pair[0].numpy()])).to(device) #ts_trg = torch.tensor(np.array([pair[1].numpy()])).to(device) ## Need to learn how to use masks correctly ts_enc_input_mask = torch.tensor([ [1 if idx != PAD_IDX else 0 for idx in smpl] for smpl in ts_src ]).bool().to(model_engine.local_rank) #ts_context_mask = torch.tensor([[1 for idx in smpl if idx != PAD_IDX] for smpl in ts_trg]).bool().to(device) # loss = model_engine( # ts_src, # ts_trg, # return_loss=True, # enc_input_mask=ts_enc_input_mask # ) #ts_enc_input_mask)#, context_mask=ts_context_mask) # #loss = enc_dec(ts_src, ts_trg, return_loss = True, enc_input_mask = None) ts_enc_keys = model_engine( ts_src, return_loss=False, input_mask=ts_enc_input_mask) ts_pred = torch.softmax( linear_engine(ts_enc_keys), dim=1).to(linear_engine.local_rank) loss = F.cross_entropy(ts_pred, ts_trg).to( linear_engine.local_rank) val_loss.append(loss.item()) print( f'\tValidation Loss: AVG: {np.mean(val_loss)}, MEDIAN: {np.median(val_loss)}, STD: {np.std(val_loss)} ' ) log_file.write( 'Step: {}\tTraining Loss:{}\t Validation LOSS: AVG: {}| MEDIAN: {}| STD: {}\n' .format(i, loss.item(), np.mean(val_loss), np.median(val_loss), np.std(val_loss))) else: log_file.write('Step: {}\tTraining Loss:{}\n'.format( i, loss.item())) log_file.flush() if tr_step % save_every == 0: print('\tSaving Checkpoint') model_ckpt_id = str(model_ckp_max + tr_step + 1) model_engine.save_checkpoint( os.sep.join([SAVE_DIR, 'model']), model_ckpt_id) log_file.close() print('\tSaving Final Checkpoint') model_ckpt_id = str(model_ckp_max + tr_step + 1) model_engine.save_checkpoint(os.sep.join([SAVE_DIR, 'model']), model_ckpt_id) else: #model_optimizer = torch.optim.Adam(model.parameters()) # RangerLars(model.parameters()) #linear_optimizer = torch.optim.Adam(linear_regressor.parameters()) # RangerLars(linear_regressor.parameters()) model_optimizer = torch.optim.Adam( list(model.parameters()) + list(linear_regressor.parameters()) ) #RangerLars(list(model.parameters())+list(linear_regressor.parameters())) # PATH = os.sep.join( [SAVE_DIR, 'model', str(model_ckp_max), 'sts_model.pt']) if os.path.exists(PATH): print('********** Found Checkpoint. 
Loading:', PATH) checkpoint = torch.load(PATH) model.load_state_dict(checkpoint['model_state_dict']) linear_regressor.load_state_dict(checkpoint['linear_state_dict']) model_optimizer.load_state_dict(checkpoint['optimizer_state_dict']) trainloader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=False) testloader = DataLoader(test_dataset, batch_size=train_batch_size, shuffle=False) ######TO DO train_loss_list = [] for eph in range(epochs): print('Starting Epoch: {}'.format(eph)) for i, pair in enumerate(tqdm(trainloader)): tr_step = ((eph * len(trainloader)) + i) + 1 src = pair[0] trg = pair[1] pickle.dump(src, open('src.pkl', 'wb')) pickle.dump(trg, open('trg.pkl', 'wb')) model.train() linear_regressor.train() #enc_dec.train() src = src.to(device) trg = trg.to(device) #print("Sample:", src) #print("Target:", trg) #print("Target Shape:", trg.shape) #print("len Samples:", len(src)) ## Need to learn how to use masks correctly enc_input_mask = torch.tensor( [[1 if idx != PAD_IDX else 0 for idx in smpl] for smpl in src]).bool().to(device) # context_mask = torch.tensor([[1 for idx in smpl if idx != PAD_IDX] for smpl in trg]).bool().to(device) ################# enc_keys = model( src, return_loss=False, input_mask=enc_input_mask ) #enc_input_mask)#, context_mask=context_mask) #loss = enc_dec(src, trg, return_loss = True, enc_input_mask = None)#enc_input_mask)#, context_mask=context_mask) #print('enc_keys shape', enc_keys.shape) enc_keys_cls = enc_keys[:, 0, :].to( device ) #torch.tensor([s[0] for s in enc_keys]).to(linear_engine.local_rank) #print('enc_keys_cls shape', enc_keys_cls.shape) preds = torch.softmax(linear_regressor(enc_keys_cls), dim=1).to(device) #print('preds shape', preds.shape) #preds = np.array([r[0] for r in results]) #print('Pred:', preds.shape) loss = F.cross_entropy(preds, trg).to(device) loss.backward() model_optimizer.step() #linear_optimizer.step() train_loss_list.append(loss.item()) #print('Training Loss:', loss.item()) if tr_step % validate_every == 0: val_loss = [] ACC_list = [] MCC_list = [] for pair in tqdm( testloader ): #Can't use the testloader or I will mess up with the model assignment and it won't learn during training, need to use normal validation instead of parallel one model.eval() linear_regressor.eval() with torch.no_grad(): ts_src = pair[0] ts_trg = pair[1] pickle.dump(ts_src, open('ts_src.pkl', 'wb')) pickle.dump(ts_trg, open('ts_trg.pkl', 'wb')) ts_src = ts_src.to(device) ts_trg = ts_trg.to(device) #ts_src = torch.tensor(np.array([pair[0].numpy()])).to(device) #ts_trg = torch.tensor(np.array([pair[1].numpy()])).to(device) ## Need to learn how to use masks correctly ts_enc_input_mask = torch.tensor( [[1 if idx != PAD_IDX else 0 for idx in smpl] for smpl in ts_src]).bool().to(device) #ts_context_mask = torch.tensor([[1 for idx in smpl if idx != PAD_IDX] for smpl in ts_trg]).bool().to(device) # loss = model_engine( # ts_src, # ts_trg, # return_loss=True, # enc_input_mask=ts_enc_input_mask # ) #ts_enc_input_mask)#, context_mask=ts_context_mask) # #loss = enc_dec(ts_src, ts_trg, return_loss = True, enc_input_mask = None) ts_enc_keys = model(ts_src, return_loss=False, input_mask=ts_enc_input_mask) ts_enc_keys_cls = ts_enc_keys[:, 0, :].to(device) ts_pred = torch.softmax( linear_regressor(ts_enc_keys_cls), dim=1).to(device) loss = F.cross_entropy(ts_pred, ts_trg).to(device) ACC, MCC = compute_simple_metrics(ts_pred, ts_trg) ACC_list.append(ACC) MCC_list.append(MCC) val_loss.append(loss.item()) print( f'\Train Loss: LAST: {train_loss_list[-1]}, 
AVG: {np.mean(train_loss_list)}, MEDIAN: {np.median(train_loss_list)}, STD: {np.std(train_loss_list)} ' ) print( f'\tValidation Loss: AVG: {np.mean(val_loss)}, MEDIAN: {np.median(val_loss)}, STD: {np.std(val_loss)} ' ) print( f'\tValidation ACC: AVG: {np.mean(ACC_list)}, MEDIAN: {np.median(ACC_list)}, STD: {np.std(ACC_list)} ' ) print( f'\tValidation MCC: AVG: {np.mean(MCC_list)}, MEDIAN: {np.median(MCC_list)}, STD: {np.std(MCC_list)} ' ) log_file.write( 'Step: {}\tTraining Loss:{}\t Validation LOSS: AVG: {}| MEDIAN: {}| STD: {}\n' .format(i, loss.item(), np.mean(val_loss), np.median(val_loss), np.std(val_loss))) else: log_file.write('Step: {}\tTraining Loss:{}\n'.format( i, loss.item())) log_file.flush() if tr_step % save_every == 0: print('\tSaving Checkpoint') model_ckpt_id = str(model_ckp_max + tr_step + 1) #model_engine.save_checkpoint(os.sep.join([SAVE_DIR, 'model']), # model_ckpt_id) PATH = os.sep.join([ SAVE_DIR, 'model', str(model_ckpt_id), 'sts_model.pt' ]) os.makedirs(os.sep.join(PATH.split(os.sep)[:-1]), exist_ok=True) torch.save( { 'step': tr_step, 'model_state_dict': model.state_dict(), 'linear_state_dict': linear_regressor.state_dict(), 'optimizer_state_dict': model_optimizer.state_dict(), }, PATH) log_file.close() print('\tSaving Final Checkpoint') model_ckpt_id = str(model_ckp_max + tr_step + 1) #model_engine.save_checkpoint(os.sep.join([SAVE_DIR, 'model']), # model_ckpt_id) PATH = os.sep.join( [SAVE_DIR, 'model', str(model_ckpt_id), 'sts_model.pt']) os.makedirs(os.sep.join(PATH.split(os.sep)[:-1]), exist_ok=True) torch.save( { 'step': tr_step, 'model_state_dict': model.state_dict(), 'linear_state_dict': linear_regressor.state_dict(), 'optimizer_state_dict': model_optimizer.state_dict(), }, PATH)
        all_labels
    )
    return dataset


max_seq_len = 2048
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
tokenizer.max_len = max_seq_len

model = ReformerLM(
    dim=512,
    depth=6,
    max_seq_len=max_seq_len,
    num_tokens=tokenizer.vocab_size,
    heads=8,
    bucket_size=64,
    n_hashes=4,
    ff_chunks=10,
    lsh_dropout=0.1,
    weight_tie=True,
    causal=True
).cuda()

# training on glue tasks
for key in processors.keys():
    task_name = key.lower()
    processor = processors[task_name]()
    output_mode = output_modes[task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)
def train_encdec_v1( input_lang, target_lang, dim, bucket_size, depth, heads, n_hashes, vir_seq_len, ff_chunks, attn_chunks, mol_seq_len, cmd_args, train_dataset, test_dataset, output_folder, train_batch_size, epochs, validate_every, save_every, deepspeed_optimizer, use_full_attn, gradient_accumulation_steps ): #zero_optimization, #unused for now. Use this flag to create IF statement for Zero Compatibility if needed print('Axial Embedding shape:', compute_axial_position_shape(vir_seq_len)) encoder = ReformerLM( num_tokens=input_lang.n_words, dim=dim, bucket_size=bucket_size, depth=depth, heads=heads, n_hashes=n_hashes, max_seq_len=vir_seq_len, ff_chunks=ff_chunks, attn_chunks=attn_chunks, weight_tie=True, weight_tie_embedding=True, axial_position_emb=True, axial_position_shape=compute_axial_position_shape(vir_seq_len), axial_position_dims=(dim // 2, dim // 2), return_embeddings=True, use_full_attn=use_full_attn).to(device) decoder = ReformerLM( num_tokens=target_lang.n_words, dim=dim, bucket_size=bucket_size, depth=depth, heads=heads, n_hashes=n_hashes, ff_chunks=ff_chunks, attn_chunks=attn_chunks, max_seq_len=mol_seq_len, axial_position_emb=True, axial_position_shape=compute_axial_position_shape(mol_seq_len), axial_position_dims=(dim // 2, dim // 2), weight_tie=True, weight_tie_embedding=True, causal=True, use_full_attn=use_full_attn).to(device) encoder = TrainingWrapper(encoder, ignore_index=PAD_IDX, pad_value=PAD_IDX).to(device) decoder = TrainingWrapper(decoder, ignore_index=PAD_IDX, pad_value=PAD_IDX).to(device) encoder_params = filter(lambda p: p.requires_grad, encoder.parameters()) decoder_params = filter(lambda p: p.requires_grad, decoder.parameters()) if deepspeed_optimizer == False: print('No DeepSpeed optimizer found. Using RangerLars.') encoder_optimizer = RangerLars(encoder.parameters()) decoder_optimizer = RangerLars(decoder.parameters()) encoder_engine, encoder_optimizer, trainloader, _ = deepspeed.initialize( args=cmd_args, model=encoder, optimizer=encoder_optimizer, model_parameters=encoder_params, training_data=train_dataset, dist_init_required=True) decoder_engine, decoder_optimizer, testloader, _ = deepspeed.initialize( args=cmd_args, model=decoder, optimizer=decoder_optimizer, model_parameters=decoder_params, training_data=test_dataset, dist_init_required=False) else: print('Found optimizer in the DeepSpeed configurations. 
Using it.') encoder_engine, encoder_optimizer, trainloader, _ = deepspeed.initialize( args=cmd_args, model=encoder, model_parameters=encoder_params, training_data=train_dataset, dist_init_required=True) decoder_engine, decoder_optimizer, testloader, _ = deepspeed.initialize( args=cmd_args, model=decoder, model_parameters=decoder_params, training_data=test_dataset, dist_init_required=False) SAVE_DIR = os.sep.join([output_folder, 'saved_model']) os.makedirs(SAVE_DIR, exist_ok=True) try: enc_ckp_max = np.max([ int(ckp) for ckp in os.listdir(os.sep.join([SAVE_DIR, 'encoder'])) ]) except Exception as e: print('Exception:', e) enc_ckp_max = 0 try: dec_ckp_max = np.max([ int(ckp) for ckp in os.listdir(os.sep.join([SAVE_DIR, 'decoder'])) ]) except: dec_ckp_max = 0 _, encoder_client_sd = encoder_engine.load_checkpoint( os.sep.join([SAVE_DIR, 'encoder']), enc_ckp_max) _, decoder_client_sd = decoder_engine.load_checkpoint( os.sep.join([SAVE_DIR, 'decoder']), dec_ckp_max) gpus_mini_batch = (train_batch_size // gradient_accumulation_steps ) // torch.cuda.device_count() print('gpus_mini_batch:', gpus_mini_batch, 'with gradient_accumulation_steps:', gradient_accumulation_steps) log_file = open(os.sep.join([output_folder, 'training_log.log']), 'a') log_file.write( "\n\n\n{}\tStarting new training from chekpoint: Encoder-{} | Decoder-{}\n" .format(datetime.datetime.now(), enc_ckp_max, dec_ckp_max)) log_file.flush() for eph in range(epochs): print('Starting Epoch: {}'.format(eph)) for i, pair in enumerate(tqdm(trainloader)): tr_step = ((eph * len(trainloader)) + i) + 1 src = pair[0] trg = pair[1] encoder_engine.train() decoder_engine.train() src = src.to(encoder_engine.local_rank) trg = trg.to(decoder_engine.local_rank) enc_keys = encoder_engine(src) loss = decoder_engine(trg, keys=enc_keys, return_loss=True) loss.backward() decoder_engine.step() encoder_engine.step() print('Training Loss:', loss.item()) if tr_step % validate_every == 0: val_loss = [] for pair in tqdm(testloader): encoder_engine.eval() decoder_engine.eval() with torch.no_grad(): ts_src = pair[0] ts_trg = pair[1] ts_src = ts_src.to(encoder_engine.local_rank) ts_trg = ts_trg.to(decoder_engine.local_rank) enc_keys = encoder_engine(ts_src) loss = decoder_engine(ts_trg, keys=enc_keys, return_loss=True) val_loss.append(loss.item()) print( f'\tValidation Loss: AVG: {np.mean(val_loss)}, MEDIAN: {np.median(val_loss)}, STD: {np.std(val_loss)} ' ) log_file.write( 'Step: {}\tTraining Loss:{}\t Validation LOSS: AVG: {}| MEDIAN: {}| STD: {}\n' .format(i, loss.item(), np.mean(val_loss), np.median(val_loss), np.std(val_loss))) else: log_file.write('Step: {}\tTraining Loss:{}\n'.format( i, loss.item())) log_file.flush() if tr_step % save_every == 0: print('\tSaving Checkpoint') enc_ckpt_id = str(enc_ckp_max + tr_step + 1) dec_ckpt_id = str(dec_ckp_max + tr_step + 1) encoder_engine.save_checkpoint( os.sep.join([SAVE_DIR, 'encoder']), enc_ckpt_id) decoder_engine.save_checkpoint( os.sep.join([SAVE_DIR, 'decoder']), dec_ckpt_id) log_file.close() print('\tSaving Final Checkpoint') enc_ckpt_id = str(enc_ckp_max + tr_step + 1) dec_ckpt_id = str(dec_ckp_max + tr_step + 1) encoder_engine.save_checkpoint(os.sep.join([SAVE_DIR, 'encoder']), enc_ckpt_id) decoder_engine.save_checkpoint(os.sep.join([SAVE_DIR, 'decoder']), dec_ckpt_id)
def main(input, output):
    max_seq_length = 512
    doc_stride = 64
    max_query_length = 64
    batch_size = 16
    n_best_size = 20
    max_answer_length = 30
    seed = 42
    fp16 = False

    # device = torch.device("cpu")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {} n_gpu: {}".format(device, n_gpu))

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

    # 1. Config
    train_config, gen_config, disc_config = ElectraConfig(config_path=CONFIG_PATH).get_config()

    # 2. Tokenizer
    tokenizer = BertTokenizer(vocab_file=train_config.vocab_path, do_lower_case=False)

    # 3. Generator
    generator = ReformerLM(
        num_tokens=tokenizer.vocab_size,
        emb_dim=gen_config.emb_dim,
        dim=gen_config.emb_dim,         # smaller hidden dimension
        heads=gen_config.heads,         # less heads
        ff_mult=gen_config.ff_mult,     # smaller feed forward intermediate dimension
        dim_head=gen_config.dim_head,
        depth=gen_config.depth,
        max_seq_len=train_config.max_len
    )

    # 4. Discriminator
    discriminator = ReformerLM(
        num_tokens=tokenizer.vocab_size,
        emb_dim=disc_config.emb_dim,
        dim=disc_config.dim,
        dim_head=disc_config.dim_head,
        heads=disc_config.heads,
        depth=disc_config.depth,
        ff_mult=disc_config.ff_mult,
        max_seq_len=train_config.max_len,
        return_embeddings=True,
    )

    # 4.2 weight tie the token and positional embeddings of generator and discriminator
    generator.token_emb = discriminator.token_emb
    generator.pos_emb = discriminator.pos_emb
    # weight tie any other embeddings if available, e.g. token type embeddings

    # 4.3 instantiate electra
    discriminator_with_adapter = nn.Sequential(discriminator, nn.Linear(disc_config.dim, 1))
    electra = Electra(
        generator,
        discriminator_with_adapter,
        mask_token_id=tokenizer.mask_token_id,           # the token id reserved for masking
        pad_token_id=tokenizer.pad_token_id,             # the token id for padding
        mask_prob=0.15,                                  # masking probability for masked language modeling
        mask_ignore_token_ids=tokenizer.all_special_ids  # ids of tokens to ignore when masking, e.g. (cls, sep)
    )
    # electra.load_state_dict(torch.load(train_config.checkpoint_path, map_location=device), strict=False)
    electra_discriminator = electra.discriminator[0]

    model = DiscriminatorMRCModel(discriminator=electra_discriminator, dim=disc_config.dim)

    eval_examples = read_squad_examples(input_file=input, is_training=False,
                                        version_2_with_negative=False)
    eval_features = convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=False)

    if fp16 is True:
        model.half()
    model.load_state_dict(torch.load(CHK_PATH, map_location=device))
    model.to(device)

    logger.info("***** Running evaluation *****")
    logger.info("  Num orig examples = %d", len(eval_examples))
    logger.info("  Num split examples = %d", len(eval_features))
    logger.info("  Batch size = %d", batch_size)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)

    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=batch_size)

    model.eval()
    all_results = []
    logger.info("Start evaluating")
    for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating"):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        with torch.no_grad():
            batch_start_logits, batch_end_logits = model(input_ids)
        for i, example_index in enumerate(example_indices):
            start_logits = batch_start_logits[i].detach().cpu().tolist()
            end_logits = batch_end_logits[i].detach().cpu().tolist()
            eval_feature = eval_features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            all_results.append(RawResult(unique_id=unique_id,
                                         start_logits=start_logits,
                                         end_logits=end_logits))

    output_nbest_file = os.path.join("nbest_predictions.json")
    write_predictions(eval_examples, eval_features, all_results,
                      n_best_size, max_answer_length, False,
                      output, output_nbest_file, None, False, False, 0.0)
        self.writer.add_scalar('Perplexity', perplexity, eval_steps)
        self.writer.close()
        logging.info(
            f'{datetime.now()} | Step: {step} | Eval Loss: {eval_loss} | Perplexity: {perplexity}'
        )
        return None


if __name__ == '__main__':
    dataset = WikiDataset(path='D:/data/enwiki')
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    tokenizer.max_len = 128
    model = ReformerLM(num_tokens=tokenizer.vocab_size,
                       dim=512,
                       depth=6,
                       heads=8,
                       max_seq_len=tokenizer.max_len,
                       causal=True)
    trainer = ReformerTrainer(dataset,
                              model,
                              tokenizer,
                              train_batch_size=32,
                              eval_batch_size=32)
    train_dataloader, eval_dataloader = trainer.build_dataloaders(train_test_split=0.90)
    model = trainer.train(epochs=3,
                          train_dataloader=train_dataloader,
                          eval_dataloader=eval_dataloader,
                          log_steps=10,
                          ckpt_steps=100,
                          ckpt_dir='./ckpts',
BATCH_SIZE = 4
GRADIENT_ACCUMULATE_EVERY = 4
LEARNING_RATE = 1e-4
VALIDATE_EVERY = 100
SEQ_LEN = 1024

# instantiate model
model = ReformerLM(
    emb = 512,
    depth = 6,
    max_seq_len = SEQ_LEN,
    num_tokens = 256,
    heads = 8,
    bucket_size = 64,
    n_hashes = 8,
    ff_chunks = 10,
    lsh_dropout = 0.1,
    weight_tie = True,
    causal = True
)
model.cuda()

# prepare enwik8 data
with gzip.open('./data/enwik8.gz') as file:
    X = np.fromstring(file.read(int(95e6)), dtype=np.uint8)
    trX, vaX = np.split(X, [int(90e6)])
    data_train, data_val = torch.from_numpy(trX), torch.from_numpy(vaX)