def train_skipgram(corpus_dir, extn, learning_rate, embedding_size, num_negsample,
                   epochs, batch_size, output_dir, valid_size):
    '''
    :param corpus_dir: folder containing WL kernel relabeled files. All the files in this folder will be relabeled
        according to the WL relabeling strategy and the format of each line in these files shall be:
        <target> <context 1> <context 2>....
    :param extn: extension of the WL relabeled files
    :param learning_rate: learning rate for the skipgram model (will involve a linear decay)
    :param embedding_size: number of dimensions to be used for learning subgraph representations
    :param num_negsample: number of negative samples to be used by the skipgram model
    :param epochs: number of iterations the dataset is traversed by the skipgram model
    :param batch_size: size of each batch for the skipgram model
    :param output_dir: the folder where the embedding file will be stored
    :param valid_size: number of subgraphs to be chosen at random to validate the goodness of the subgraph
        representation learning process in every epoch
    :return: name of the file that contains the subgraph embeddings (in the word2vec format proposed by
        Mikolov et al. (2013))
    '''
    op_fname = '_'.join([os.path.basename(corpus_dir), 'dims', str(embedding_size),
                         'epochs', str(epochs), 'embeddings.txt'])
    op_fname = os.path.join(output_dir, op_fname)
    if os.path.isfile(op_fname):
        logging.info('The embedding file: {} is already present, hence NOT training the skipgram model '
                     'for subgraph vectors'.format(op_fname))
        return op_fname

    logging.info("Initializing SKIPGRAM...")
    corpus = Corpus(corpus_dir, extn=extn, max_files=0)  # just load 'max_files' files from this folder
    corpus.scan_and_load_corpus()
    valid_examples = np.concatenate(
        (np.random.choice(corpus.high_freq_word_ids, valid_size, replace=False),
         np.random.choice(corpus.low_freq_word_ids, valid_size, replace=False)))

    model_skipgram = skipgram(
        doc_size=corpus._vocabsize,         # for the doc2vec skipgram model, the doc size is the same as the vocab size
        vocabulary_size=corpus._vocabsize,  # size of the input and output layers
        learning_rate=learning_rate,        # will decay over time
        embedding_size=embedding_size,      # hidden layer neurons
        num_negsample=num_negsample,
        num_steps=epochs,                   # number of times the training set will be iterated through
        corpus=corpus,                      # dataset of (target, context) tuples
        valid_dataset=valid_examples,       # validation set (a small subset) of (target, context) tuples
    )

    final_embeddings, final_weights = model_skipgram.train(
        corpus=corpus,
        batch_size=batch_size,
        valid_dataset=valid_examples,
    )

    logging.info('Write the matrix to a word2vec format file')
    save_embeddings(corpus, final_embeddings, embedding_size, op_fname)
    logging.info('Completed writing the final embeddings, please check file: {} for the same'.format(op_fname))
    return op_fname
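# A minimal usage sketch of train_skipgram. The folder names, extension and
# hyperparameter values below are assumptions for illustration, not taken from
# the original code.
if __name__ == '__main__':
    embeddings_fname = train_skipgram(
        corpus_dir='kernels/wlk_h2',  # folder of WL-relabeled files (assumed path)
        extn='g2v3',                  # extension of the relabeled files (assumed)
        learning_rate=0.5,
        embedding_size=128,
        num_negsample=10,
        epochs=3,
        batch_size=256,
        output_dir='embeddings',
        valid_size=10,
    )
    print('subgraph embeddings written to', embeddings_fname)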
def get_corpus(datadir, dataset):
    fn = os.path.join(datadir, 'cache.pt')
    if os.path.exists(fn):
        print('Load cached dataset...')
        corpus = torch.load(fn)
    else:
        corpus = Corpus(datadir, dataset)
        torch.save(corpus, fn)  # assumed continuation: cache the freshly built corpus for later runs
    return corpus
def setup(seq_len, corpus_name, model_name):
    global seq_length, corpus_path, sample_path, corpus, vocab_size, model
    seq_length = seq_len
    corpus_path = './data/' + corpus_name
    sample_path = './sample/sample.txt'
    corpus = Corpus()
    corpus.get_data(corpus_path, batch_size)
    vocab_size = len(corpus.dictionary)
    model = RNNLM(vocab_size, embed_size, hidden_size, num_layers)
    model = model.cuda()
    model.load_state_dict(
        torch.load('./model/' + model_name,
                   map_location=lambda storage, loc: storage))
def build_dataset(train_bs, test_bs):
    path = '../../data/ptb'
    train_path = os.path.join(path, 'train.txt')
    val_path = os.path.join(path, 'valid.txt')
    test_path = os.path.join(path, 'test.txt')
    corpus = Corpus([train_path, val_path, test_path], train_bs, train_bs, test_bs)
    print('Data is loaded.')
    return corpus
def predict(args, is_eval=False):
    args.to_resume_model = True
    if is_eval:
        input_file_name = "dev.csv"
    else:
        input_file_name = "test.csv"
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    if args.model_name in MODEL_MAP:
        Config, Model, Tokenizer, Transform = MODEL_MAP[args.model_name]
        config = Config.from_pretrained(args.pretrained_model_path,
                                        num_labels=args.num_labels)
        config = add_args_to_config(args, config)
        tokenizer = Tokenizer.from_pretrained(args.pretrained_model_path,
                                              do_lower_case=args.do_lower_case)
        transform = Transform(tokenizer, args)
        model = load_model(Model, args, config)
        model = model.to(device)
        if args.n_gpus > 1:
            model = nn.DataParallel(model)
        pub_data = Corpus(args, input_file_name, transform)
        pub_sampler = SequentialSampler(pub_data)
        pub_loader = DataLoader(pub_data,
                                batch_size=args.eval_batch_size,
                                sampler=pub_sampler)
        logits, _, _ = do_inference(model, pub_loader, device)
        df = pd.read_csv(os.path.join(args.data_dir, input_file_name))
        inference_label = logits.argmax(axis=1)
        df['label_pre'] = inference_label
        if is_eval:
            df['label_0'] = logits[:, 0]
            df['label_1'] = logits[:, 1]
            df[['id', 'label', 'label_pre', 'label_0', 'label_1']].to_csv(
                os.path.join(args.out_dir, "dev_sub.csv"), index=False)
        else:
            df['label_0'] = logits[:, 0]
            df['label_1'] = logits[:, 1]
            filename = time.ctime().replace(' ', '-')
            label_filename = "label-" + filename
            filename = filename.replace(':', '-') + ".csv"
            label_filename = label_filename.replace(':', '-') + ".csv"
            df[['id', 'label_0', 'label_1']].to_csv(
                os.path.join(args.out_dir, filename), index=False)
            # df[['id', 'label_pre']].to_csv(os.path.join(args.out_dir, label_filename), index=False)
            with open(os.path.join(args.out_dir, label_filename), 'w', encoding='utf-8') as out:
                for i in range(df.shape[0]):
                    out.write("{}\t{}\n".format(df['id'][i], df['label_pre'][i]))
def train(args):
    start = time.time()
    print('Train with hid=%d layers=%d drop=%.3lf seq_len=%d lr=%.5lf, seed=%s' %
          (args.hidden_size, args.num_layers, args.dropout, args.seq_length,
           args.lr, args.seed))
    continuous_no_update_epochs = 0
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    _ = torch.empty(size=[0], device=device)
    train_data, eval_data, test_data, vocab_size = Corpus().get_data(
        args.data_dir, args.batch_size)
    model = RNNLM(vocab_size=vocab_size,
                  embed_size=args.hidden_size,
                  hidden_size=args.hidden_size,
                  num_layers=args.num_layers,
                  device=device,
                  dropout=args.dropout,
                  batch_size=args.batch_size).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = get_optimizer(args.optimizer, model)
    best_val_loss = None

    for nth_epoch in range(1, args.epoch + 1):
        train_epoch(nth_epoch, model, train_data, criterion, optimizer, args)
        eval_loss = evaluate(model,
                             data=eval_data,
                             criterion=criterion,
                             seq_len=args.seq_length,
                             epoch=nth_epoch)
        if not best_val_loss or eval_loss < best_val_loss:
            print(' >>> Save model %.3lf -> %.3lf' %
                  ((np.exp(best_val_loss) if best_val_loss else 0.0), np.exp(eval_loss)),
                  flush=True)
            best_val_loss = eval_loss
            continuous_no_update_epochs = 0
            model.save(get_model_path(args))
        else:
            continuous_no_update_epochs += 1
        print('', flush=True)
        if continuous_no_update_epochs == args.continuous_no_update_epochs_threshold:
            break

    print('Test result is %s' % (np.exp(
        evaluate(RNNLM.load(get_model_path(args)),
                 data=test_data,
                 criterion=criterion,
                 seq_len=args.seq_length,
                 test=True))))
    print('Finished in %.3lf minutes\n' % ((time.time() - start) / 60))
def predict(args, Model, tokenizer, config, transform, is_eval=False):
    args.to_resume_model = True
    if is_eval:
        input_file_name = "dev.csv"
    else:
        input_file_name = "test.csv"
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    # if args.model_name in MODEL_MAP:
    #     Config, Model, Tokenizer, Transform = MODEL_MAP[args.model_name]
    #     config = Config.from_pretrained(args.pretrained_model_path, num_labels=args.num_labels)
    #     config = add_args_to_config(args, config)
    #     tokenizer = Tokenizer.from_pretrained(args.pretrained_model_path, do_lower_case=args.do_lower_case)
    #     transform = Transform(tokenizer, args)
    model = load_model(Model, args, config)
    model = model.to(device)
    if args.n_gpus > 1:
        model = nn.DataParallel(model)
    pub_data = Corpus(args, input_file_name, transform)
    pub_sampler = SequentialSampler(pub_data)
    pub_loader = DataLoader(pub_data,
                            batch_size=args.eval_batch_size,
                            sampler=pub_sampler)
    logits, _, _ = do_inference(model, pub_loader, device)
    df = pd.read_csv(os.path.join(args.data_dir, input_file_name))
    inference_label = logits.argmax(axis=1)
    df['label_pre'] = inference_label
    if is_eval:
        df['label_0'] = logits[:, 0]
        df['label_1'] = logits[:, 1]
        df[['id', 'label', 'label_pre', 'label_0', 'label_1',
            'question1', 'question2']].to_csv(
                os.path.join(args.out_dir, "dev_sub.csv"), index=False)
    else:
        df['label_0'] = logits[:, 0]
        df['label_1'] = logits[:, 1]
        filename = time.ctime().replace(' ', '-')
        label_filename = "label-" + filename
        filename = filename.replace(':', '-') + ".csv"
        label_filename = label_filename.replace(':', '-') + ".csv"
        df[['id', 'label_0', 'label_1']].to_csv(
            os.path.join(args.out_dir, filename), index=False)
        df[['id', 'label_pre']].to_csv(os.path.join(args.out_dir, label_filename),
                                       index=False, header=False, sep='\t')
import torch
from torch.autograd import Variable
from torch import nn, optim

from data_utils import Corpus

seq_length = 30
train_file = 'train.txt'
valid_file = 'valid.txt'
test_file = 'test.txt'

train_corpus = Corpus()
valid_corpus = Corpus()
test_corpus = Corpus()

train_id = train_corpus.get_data(train_file)
valid_id = valid_corpus.get_data(valid_file)
test_id = test_corpus.get_data(test_file)

vocab_size = len(train_corpus.dic)
num_batches = train_id.size(1) // seq_length


class languagemodel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, num_layers):
        super(languagemodel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
def train(args, Model, tokenizer, config, transform):
    """
    :param args: training arguments
    :param Model: class of the model
    :param tokenizer: word tokenizer
    :param config: bert config instance
    :param transform: data transform instance
    :return:
    """
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    best_f1 = 0
    logger.info("the current config is :\n {}".format(str(vars(args))))
    set_seed(args)
    # if args.model_name in MODEL_MAP:
    #     Config, Model, Tokenizer, Transform = MODEL_MAP[args.model_name]
    #     config = BertConfig.from_pretrained(args.pretrained_model_path, num_labels=args.num_labels)
    #     config = add_args_to_config(args, config)  # add customized args
    #     tokenizer = BertTokenizer.from_pretrained(args.pretrained_model_path, do_lower_case=args.do_lower_case)
    model = load_model(Model, args, config)
    model = model.to(device)
    if args.n_gpus > 1:
        model = nn.DataParallel(model)

    train_data = Corpus(args, "train.csv", transform)
    dev_data = Corpus(args, 'dev.csv', transform)
    dev_sampler = SequentialSampler(dev_data)
    dev_loader = DataLoader(dev_data,
                            batch_size=args.eval_batch_size,
                            sampler=dev_sampler)
    # Run prediction for full data
    eval_sampler = SequentialSampler(dev_data)
    dev_loader = DataLoader(dev_data,
                            sampler=eval_sampler,
                            batch_size=args.eval_batch_size)
    train_sampler = RandomSampler(train_data)
    test_sampler = SubsetRandomSampler(
        np.random.randint(low=0, high=(len(train_data)), size=len(dev_data)))
    train_loader = DataLoader(train_data,
                              batch_size=args.batch_size,
                              sampler=train_sampler,
                              drop_last=True)
    test_loader = DataLoader(train_data,
                             batch_size=args.eval_batch_size,
                             sampler=test_sampler)

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_data))
    logger.info("  Batch size = %d", args.batch_size)
    logger.info("  Num steps = %d", args.epochs)
    logger.info("  Early Stopping dev_loss = %f", args.dev_loss)

    bar = tqdm(total=len(train_loader) * args.epochs)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in model.named_parameters()
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': args.weight_decay
    }, {
        'params': [p for n, p in model.named_parameters()
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=0, t_total=len(bar))
    steps = 0
    total_train_loss = 0
    set_seed(args)

    for _ in range(args.epochs):
        for step, data_batch in enumerate(train_loader):
            bar.update(1)
            model.train()
            for k, v in data_batch.items():
                data_batch[k] = v.to(device)
            loss = model(batch=data_batch, feed_labels=True)
            if args.n_gpus > 1:
                loss = loss.mean()
            optimizer.zero_grad()  # clear previous grad
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           args.max_grad_norm)  # grad norm; according to the GLUE scripts it is useful
            optimizer.step()
            scheduler.step()
            # while len(model.points) > 100:
            #     model.points.pop(0)

            # setting bar
            steps += 1
            if steps > args.optimize_steps:
                print("early stopping in {} steps".format(args.optimize_steps))
                break
            total_train_loss += loss.item()
            bar.set_description("training loss {}".format(loss.item()))
            if (steps) % args.eval_steps == 0:
                logits, loss, dev_labels = do_inference(model, dev_loader, device)
                test_logits, test_loss, test_labels = do_inference(model, test_loader, device)
                inference_labels = logits.argmax(axis=1)
                test_inference_labels = test_logits.argmax(axis=1)
                f1 = f1_score(y_true=dev_labels, y_pred=inference_labels, average='macro')
                c1_f1, c2_f1 = f1_score(y_true=dev_labels, y_pred=inference_labels, average=None)
                test_f1 = f1_score(y_true=test_labels, y_pred=test_inference_labels, average='macro')
                acc = accuracy_score(dev_labels, inference_labels)
                logger.info("=========eval report =========")
                logger.info("step : %s ", str(steps))
                logger.info("average_train loss: %s" % (str(total_train_loss / steps)))
                logger.info("subset train loss: %s" % (str(test_loss)))
                logger.info("subset train f1 score: %s", str(test_f1))
                logger.info("eval loss: %s", str(loss))
                logger.info("eval acc: %s", str(acc))
                logger.info("eval f1 score: %s", str(f1))
                logger.info("eval label 0 f1 score: %s", str(c1_f1))
                logger.info("eval label 1 f1 score: %s", str(c2_f1))
                output_eval_file = os.path.join(args.out_dir, "eval_records.txt")
                with open(output_eval_file, "a") as writer:
                    if steps == args.eval_steps:
                        writer.write("\n%s\n" % (args.memo))
                    writer.write("=========eval report =========\n")
                    writer.write("step : %s \n" % (str(steps)))
                    writer.write("average_train loss: %s\n" % (str(total_train_loss / steps)))
                    writer.write("subset train loss: %s\n" % (str(test_loss)))
                    writer.write("subset f1 score: %s\n" % (str(test_f1)))
                    writer.write("eval loss: %s\n" % (str(loss)))
                    writer.write("eval f1 score: %s\n" % (str(f1)))
                    writer.write("eval label 0 f1 score: %s\n" % str(c1_f1))
                    writer.write("eval label 1 f1 score: %s\n" % str(c2_f1))
                    writer.write('\n')
                if f1 > best_f1:
                    logger.info("we get a best dev f1 %s saving model....", str(f1))
                    output_path = os.path.join(args.out_dir, "pytorch_model.bin")
                    if hasattr(model, 'module'):
                        logger.info("model has module")
                    model_to_save = model.module if hasattr(model, 'module') else model
                    torch.save(model_to_save.state_dict(), output_path)
                    logger.info("model saved")
                    best_f1 = f1
                    save_config(args)
                    logger.info("args saved")

    # load the final model
    args.to_resume_model = True
    model = load_model(Model, args, config)
    model = model.to(device)
    if args.n_gpus > 1:
        model = nn.DataParallel(model)
    dev_logits, loss, dev_labels = do_inference(model, dev_loader, device)  # do the inference for the dev set
    pub_data = Corpus(args, 'test.csv', transform)
    pub_sampler = SequentialSampler(pub_data)
    pub_loader = DataLoader(pub_data,
                            batch_size=args.eval_batch_size,
                            sampler=pub_sampler)
    logits, loss, dev_labels = do_inference(model, dev_loader, device)
    test_logits, _, _ = do_inference(model, pub_loader, device)
    return dev_logits, dev_labels, test_logits
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters
embed_size = 128
hidden_size = 1024
num_layers = 1
num_epochs = 5
num_samples = 1000  # number of words to be sampled
batch_size = 20
seq_length = 30
learning_rate = 0.002

# Load "Penn Treebank" dataset
corpus = Corpus()
ids = corpus.get_data('data/short.txt', batch_size)  # label each word with word ids
# print(ids.shape)
vocab_size = len(corpus.dictionary)
num_batches = ids.size(1) // seq_length


# RNN based language model
class RNNLM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(RNNLM, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers,
                            batch_first=True)  # assumed continuation: follows the sibling RNNLM snippets
        self.linear = nn.Linear(hidden_size, vocab_size)
def forward(self, x, h):
    x = self.embedding(x)
    x, hi = self.lstm(x, h)
    b, s, h = x.size()
    x = x.contiguous().view(b * s, h)
    x = self.linear(x)
    return x, hi


seq_length = 30
train_file = 'train.txt'
val_file = 'val.txt'
test_file = 'test.txt'

train_corpus = Corpus()
val_corpus = Corpus()
test_corpus = Corpus()

train_id = train_corpus.get_data(train_file)
val_id = train_corpus.get_data(val_file)
test_id = train_corpus.get_data(test_file)

vocab_size = len(train_corpus.dic)
num_batches = train_id.size(1) // seq_length

model = language_model(vocab_size, 128, 1024, 1)
# if torch.cuda.device_count() > 1:
#     model = nn.DataParallel(model)
if torch.cuda.is_available():
    model = model.cuda()  # assumed continuation: move the model to GPU
from data_utils import Corpus
from config import args
import os
import pickle
from model import MOS
from sklearn.utils import shuffle

if __name__ == '__main__':
    if args.nhidlast < 0:
        args.nhidlast = args.emsize
    if args.dropoutl < 0:
        args.dropoutl = args.dropouth
    if args.small_batch_size < 0:
        args.small_batch_size = args.batch_size

    data = Corpus(args.data)
    vocab_size = len(data.dictionary)
    train_data = data.train
    val_data = data.valid
    test_data = data.test

    model = MOS(vocab_size)
    model.train(train_data, val_data)
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters
embed_size = 128
hidden_size = 1024
num_layers = 1
num_epochs = 5
num_samples = 1000  # number of words to be sampled
batch_size = 20
seq_length = 30
learning_rate = 0.002

# Load "Penn Treebank" dataset
corpus = Corpus()
ids = corpus.get_data('data/train.txt', batch_size)
vocab_size = len(corpus.dictionary)
num_batches = ids.size(1) // seq_length


# RNN based language model
class RNNLM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(RNNLM, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, h):
        # Embed word ids to vectors
        # (assumed body, following the forward() of the sibling RNNLM snippets)
        x = self.embed(x)
        # Forward propagate LSTM
        out, (h, c) = self.lstm(x, h)
        # Reshape output to (batch_size * seq_length, hidden_size)
        out = out.reshape(out.size(0) * out.size(1), out.size(2))
        # Decode hidden states of all time steps
        out = self.linear(out)
        return out, (h, c)
from data_utils import Dictionary, Corpus

# Hyper Parameters
embed_size = 128
hidden_size = 1024
num_layers = 1
num_epochs = 5
num_samples = 1000  # number of words to be sampled
batch_size = 20
seq_length = 30
learning_rate = 0.002

# Load Penn Treebank Dataset
train_path = './data/train.txt'
sample_path = './sample.txt'
corpus = Corpus()
ids = corpus.get_data(train_path, batch_size)
vocab_size = len(corpus.dictionary)
num_batches = ids.size(1) // seq_length


# RNN Based Language Model
class RNNLM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(RNNLM, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
print(device, '\n')

# Hyper-parameters
embed_size = 220
hidden_size = 220
num_layers = 2
num_epochs = 40
num_samples = 5  # number of words to be sampled
batch_size = 20
seq_length = 30
dropout = 0.3
learning_rate = 0.005
dropout = 0.5
learning_rate = 0.01

# Load "Penn Treebank" dataset
corpus = Corpus()
ids = corpus.get_data('data/train.txt', batch_size)  # divide to batch size
valid_d = corpus.get_data('data/valid.txt', batch_size)
test_d = corpus.get_data('data/test.txt', batch_size)
vocab_size = len(corpus.dictionary)
num_batches = ids.size(1) // seq_length
best_val_loss = None

model = RNNLM(vocab_size, embed_size, hidden_size, num_layers, dropout).to(device)

# Calculate the number of trainable parameters in the model
model_parameters = filter(lambda p: p.requires_grad, model.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])
print('Number of trainable parameters: ', params)

# Loss and optimizer
from data_utils import Dictionary, Corpus

# Hyper Parameters
embed_size = 128
hidden_size = 1024
num_layers = 1
num_epochs = 5
num_samples = 1000  # number of words to be sampled
batch_size = 20
seq_length = 30
learning_rate = 0.002

# Load Penn Treebank Dataset
train_path = './data/train.txt'
sample_path = './sample.txt'
corpus = Corpus()
ids = corpus.get_data(train_path, batch_size)
vocab_size = len(corpus.dictionary)
num_batches = ids.size(1) // seq_length


# RNN Based Language Model
class RNNLM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(RNNLM, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.init_weights()

    def init_weights(self):
        self.embed.weight.data.uniform_(-0.1, 0.1)
import sys

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters
embed_size = 128
hidden_size = 1024
num_layers = 1
num_epochs = 5
num_samples = 10000  # number of words to be sampled
batch_size = 20
seq_length = 30
learning_rate = 0.002

# Load "Penn Treebank" dataset
corpus = Corpus()
ids = corpus.get_data('data/shakespeare.txt', batch_size)
vocab_size = len(corpus.dictionary)
num_batches = ids.size(1) // seq_length

infer_mode = False
model_path = "model.ckpt"
if len(sys.argv) > 1 and sys.argv[1] == 'infer':
    infer_mode = True
if infer_mode:
    print("Inference mode..")
hidden_size = 200
num_layers = 2
num_epochs = 20
num_samples = 1000  # number of words to be sampled
batch_size = 20
seq_length = 20
learning_rate = 0.002
num_steps = 20
unrolling_size = 5

dtype = torch.cuda.LongTensor
ftype = torch.cuda.FloatTensor

# Load Penn Treebank Dataset
train_path = './data/ptb.train.txt'
test_path = './data/ptb.test.txt'
corpus = Corpus('./data/ptb.train.txt')
raw_data = corpus.get_data(train_path, batch_size)
vocab_size = len(corpus.dictionary)
data_len = len(raw_data)
n_seq = (data_len - 1) // num_steps
raw_data_x = raw_data[0:n_seq * num_steps].view(n_seq, num_steps)
raw_data_y = raw_data[1:n_seq * num_steps + 1].view(n_seq, num_steps)

logger = Logger('./logs')


def convert(data, unroll, num_steps):
    datalist = torch.split(data, 1, dim=1)
    x0 = torch.cat(datalist[:unroll], dim=1)
    x1 = torch.cat(datalist[unroll:], dim=1)
    dataconvert = torch.cat((x1, x0), dim=1)
    return dataconvert
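# convert() rotates the time dimension left by `unroll` steps. A small
# self-contained check on CPU tensors (illustration only, not part of the original):
import torch

example = torch.arange(6).view(1, 6)            # tensor([[0, 1, 2, 3, 4, 5]])
rotated = convert(example, unroll=2, num_steps=6)
print(rotated)                                  # tensor([[2, 3, 4, 5, 0, 1]])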
def train(args):
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    best_f1 = 0
    logger.info("the current config is :\n {}".format(str(vars(args))))
    set_seed(args)
    if args.model_name in MODEL_MAP:
        Config, Model, Tokenizer, Transform = MODEL_MAP[args.model_name]
        config = BertConfig.from_pretrained(args.pretrained_model_path,
                                            num_labels=args.num_labels)
        config = add_args_to_config(args, config)  # add customized args
        tokenizer = BertTokenizer.from_pretrained(
            args.pretrained_model_path, do_lower_case=args.do_lower_case)
        model = load_model(LinearBertModel, args, config)
        model = model.to(device)
        if args.n_gpus > 1:
            model = nn.DataParallel(model)

        # adversarial training
        pgd = PGD(model)

        transform = base_transform(tokenizer, args)
        train_data = Corpus(args, "train.csv", transform)
        dev_data = Corpus(args, 'dev.csv', transform)
        dev_sampler = SequentialSampler(dev_data)
        dev_loader = DataLoader(dev_data,
                                batch_size=args.eval_batch_size,
                                sampler=dev_sampler)
        # Run prediction for full data
        eval_sampler = SequentialSampler(dev_data)
        dev_loader = DataLoader(dev_data,
                                sampler=eval_sampler,
                                batch_size=args.eval_batch_size)
        train_sampler = RandomSampler(train_data)
        test_sampler = SubsetRandomSampler(
            np.random.randint(low=0, high=(len(train_data)), size=len(dev_data)))
        train_loader = DataLoader(train_data,
                                  batch_size=args.batch_size,
                                  sampler=train_sampler,
                                  drop_last=True)
        test_loader = DataLoader(train_data,
                                 batch_size=args.eval_batch_size,
                                 sampler=test_sampler)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_data))
        logger.info("  Batch size = %d", args.batch_size)
        logger.info("  Num steps = %d", args.epochs)
        logger.info("  Early Stopping dev_loss = %f", args.dev_loss)

        bar = tqdm(total=len(train_loader) * args.epochs)
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': args.weight_decay
        }, {
            'params': [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=0, t_total=len(bar))
        steps = 0
        total_train_loss = 0
        set_seed(args)
        model.zero_grad()

        for _ in range(args.epochs):
            for step, data_batch in enumerate(train_loader):
                bar.update(1)
                model.train()
                for k, v in data_batch.items():
                    data_batch[k] = v.to(device)
                loss = model(batch=data_batch, feed_labels=True)
                if args.n_gpus > 1:
                    loss = loss.mean()
                loss.backward()

                # adversarial training (PGD)
                pgd.backup_grad()
                for t in range(2):
                    pgd.attack(is_first_attack=(t == 0))
                    if t != 2 - 1:
                        model.zero_grad()
                    else:
                        pgd.restore_grad()
                    loss_adv = model(batch=data_batch, feed_labels=True)
                    if args.n_gpus > 1:
                        loss_adv = loss_adv.mean()
                    loss_adv.backward()  # backpropagate, accumulating the adversarial gradients on top of the normal gradients
                pgd.restore()  # restore the embedding parameters
                # end of adversarial training

                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()
                model.zero_grad()  # delete the grad in the graph

                # setting bar
                steps += 1
                total_train_loss += loss.item()
                bar.set_description("training loss {}".format(loss.item()))
                if (steps) % args.eval_steps == 0:
                    logits, loss, dev_labels = do_inference(model, dev_loader, device)
                    test_logits, test_loss, test_labels = do_inference(model, test_loader, device)
                    inference_labels = logits.argmax(axis=1)
                    test_inference_labels = test_logits.argmax(axis=1)
                    f1 = f1_score(y_true=dev_labels, y_pred=inference_labels)
                    test_f1 = f1_score(y_true=test_labels, y_pred=test_inference_labels)
                    acc = accuracy_score(dev_labels, inference_labels)
                    logger.info("=========eval report =========")
                    logger.info("step : %s ", str(steps))
                    logger.info("average_train loss: %s" % (str(total_train_loss / steps)))
                    logger.info("subset train loss: %s" % (str(test_loss)))
                    logger.info("subset train f1 score: %s", str(test_f1))
                    logger.info("eval loss: %s", str(loss))
                    logger.info("eval f1 score: %s", str(f1))
                    logger.info("eval acc: %s", str(acc))
                    output_eval_file = os.path.join(args.out_dir, "eval_records.txt")
                    with open(output_eval_file, "a") as writer:
                        if steps == args.eval_steps:
                            writer.write("\n%s\n" % (args.memo))
                        writer.write("=========eval report =========\n")
                        writer.write("step : %s \n" % (str(steps)))
                        writer.write("average_train loss: %s\n" % (str(total_train_loss / steps)))
                        writer.write("subset train loss: %s\n" % (str(test_loss)))
                        writer.write("subset f1 score: %s\n" % (str(test_f1)))
                        writer.write("eval loss: %s\n" % (str(loss)))
                        writer.write("eval f1 score: %s\n" % (str(f1)))
                        writer.write('\n')
                    if f1 > best_f1:
                        logger.info("we get a best dev f1 %s saving model....", str(f1))
                        output_path = os.path.join(args.out_dir, "pytorch_model.bin")
                        if hasattr(model, 'module'):
                            logger.info("model has module")
                        model_to_save = model.module if hasattr(model, 'module') else model
                        torch.save(model_to_save.state_dict(), output_path)
                        logger.info("model saved")
                        best_f1 = f1
                        save_config(args)
                        logger.info("args saved")

        # load the final model
        args.to_resume_model = True
        model = load_model(Model, args, config)
        model = model.to(device)
        if args.n_gpus > 1:
            model = nn.DataParallel(model)
        dev_logits, loss, dev_labels = do_inference(model, dev_loader, device)  # do the inference for the dev set
        pub_data = Corpus(args, 'test.csv', transform)
        pub_sampler = SequentialSampler(pub_data)
        pub_loader = DataLoader(pub_data,
                                batch_size=args.eval_batch_size,
                                sampler=pub_sampler)
        # logits, loss, dev_labels = do_inference(model, dev_loader, device)
        test_logits, _, _ = do_inference(model, pub_loader, device)
        return dev_logits, dev_labels, test_logits
    else:
        logger.info("the model %s is not registered", args.model_name)
        return
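# The PGD helper used in the adversarial-training snippets above is project-specific
# and not shown here. Below is a minimal sketch of the interface its calls imply
# (backup_grad/restore_grad around the adversarial passes, attack/restore perturbing
# the embedding weights). The parameter-name filter 'emb' and the epsilon/alpha values
# are assumptions for illustration, not taken from the original code.
import torch


class PGD:
    def __init__(self, model, emb_name='emb', epsilon=1.0, alpha=0.3):
        self.model = model
        self.emb_name = emb_name  # assumed: perturb parameters whose name contains this substring
        self.epsilon = epsilon
        self.alpha = alpha
        self.emb_backup = {}
        self.grad_backup = {}

    def attack(self, is_first_attack=False):
        # take one projected gradient step on the embedding weights
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.emb_name in name and param.grad is not None:
                if is_first_attack:
                    self.emb_backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    param.data.add_(self.alpha * param.grad / norm)
                    param.data = self.project(name, param.data)

    def project(self, name, data):
        # keep the perturbation inside an epsilon-ball around the clean weights
        r = data - self.emb_backup[name]
        if torch.norm(r) > self.epsilon:
            r = self.epsilon * r / torch.norm(r)
        return self.emb_backup[name] + r

    def restore(self):
        # restore the clean embedding weights after the adversarial passes
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.emb_name in name and name in self.emb_backup:
                param.data = self.emb_backup[name]
        self.emb_backup = {}

    def backup_grad(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None:
                self.grad_backup[name] = param.grad.clone()

    def restore_grad(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None:
                param.grad = self.grad_backup[name]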
def train(args):
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    set_seed(args)
    best_f1 = 0
    logger.info("the current config is :\n {}".format(str(vars(args))))
    if args.model_name in MODEL_MAP:
        Config, Model, Tokenizer, Transform = MODEL_MAP[args.model_name]
        config = Config.from_pretrained(args.pretrained_model_path,
                                        num_labels=args.num_labels)
        config = add_args_to_config(args, config)  # add customized args
        tokenizer = Tokenizer.from_pretrained(args.pretrained_model_path,
                                              do_lower_case=args.do_lower_case)
        model = load_model(Model, args, config)
        model = model.to(device)
        if args.n_gpus > 1:
            model = nn.DataParallel(model)

        # adversarial training
        pgd = PGD(model)

        transform = Transform(tokenizer, args)
        train_data = Corpus(args, "train.csv", transform)
        # get a weighted sample with the weights [0.9, 0.2, 0.5]
        # weight = [0.9, 0.2, 0.5]
        # weight_sequence = []
        # for i in range(len(train_data)):
        #     data = train_data[i]
        #     label = data.get('label').item()
        #     weight_sequence.append(weight[label])  # add the weight of this label
        dev_data = Corpus(args, 'dev.csv', transform)
        dev_sampler = SequentialSampler(dev_data)
        dev_loader = DataLoader(dev_data,
                                batch_size=args.eval_batch_size,
                                sampler=dev_sampler)
        # Run prediction for full data
        eval_sampler = SequentialSampler(dev_data)
        dev_loader = DataLoader(dev_data,
                                sampler=eval_sampler,
                                batch_size=args.eval_batch_size)
        train_sampler = RandomSampler(train_data)
        # weight_sampler = WeightedRandomSampler(weights=weight_sequence, num_samples=args.epochs * len(train_data), replacement=True)
        test_sampler = SubsetRandomSampler(
            np.random.randint(low=0, high=(len(train_data)), size=len(dev_data)))
        train_loader = DataLoader(train_data,
                                  batch_size=args.batch_size,
                                  sampler=train_sampler,
                                  drop_last=True)
        test_loader = DataLoader(train_data,
                                 batch_size=args.eval_batch_size,
                                 sampler=test_sampler)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_data))
        logger.info("  Batch size = %d", args.batch_size)
        logger.info("  Num steps = %d", args.epochs)
        logger.info("  Early Stopping dev_loss = %f", args.dev_loss)

        bar = tqdm(range(len(train_loader) * args.epochs),
                   total=len(train_loader) * args.epochs)
        train_loader = cycle(train_loader)

        # get optimizer
        param_optimizer = list(model.named_parameters())
        # hack to remove the pooler, which is not used
        # and thus produces a None grad that breaks apex
        param_optimizer = [n for n in param_optimizer]
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }, {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': args.weight_decay
        }]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=0, t_total=len(bar))
        steps = 0
        # dev_labels = dev_data.get_feature("label")
        # dev_labels = [i.item() for i in dev_labels]  # get gold labels
        total_train_loss = 0

        for step in bar:
            model.train()
            data_batch = next(train_loader)
            for k, v in data_batch.items():
                data_batch[k] = v.to(device)
            loss = model(batch=data_batch, feed_labels=True)
            if args.n_gpus > 1:
                loss = loss.mean()
            loss.backward()

            # adversarial training (PGD)
            pgd.backup_grad()
            for t in range(1):
                pgd.attack(is_first_attack=(t == 0))
                if t != 1 - 1:
                    model.zero_grad()
                else:
                    pgd.restore_grad()
                loss_adv = model(batch=data_batch, feed_labels=True)
                if args.n_gpus > 1:
                    loss_adv = loss_adv.mean()
                loss_adv.backward()  # backpropagate, accumulating the adversarial gradients on top of the normal gradients
            pgd.restore()  # restore the embedding parameters
            # end of adversarial training

            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()

            # setting bar
            steps += 1
            total_train_loss += loss.item()
            bar.set_description("training loss {}".format(loss.item()))
            if (steps) % args.eval_steps == 0:
                logits, loss, dev_labels = do_inference(model, dev_loader, device)
                test_logits, test_loss, test_labels = do_inference(model, test_loader, device)
                inference_labels = logits.argmax(axis=1)
                test_inference_labels = test_logits.argmax(axis=1)
                f1 = f1_score(dev_labels, inference_labels,
                              labels=[0, 1, 2], average="macro")
                test_f1 = f1_score(test_labels, test_inference_labels,
                                   labels=[0, 1, 2], average="macro")
                # acc = accuracy_score(dev_labels, inference_labels)
                logger.info("=========eval report =========")
                logger.info("step : %s ", str(steps))
                logger.info("average_train loss: %s" % (str(total_train_loss / steps)))
                logger.info("subset train loss: %s" % (str(test_loss)))
                logger.info("subset train f1 score: %s", str(test_f1))
                logger.info("eval loss: %s", str(loss))
                logger.info("eval f1 score: %s", str(f1))
                output_eval_file = os.path.join(args.out_dir, "eval_records.txt")
                with open(output_eval_file, "a") as writer:
                    if steps == args.eval_steps:
                        writer.write("\n%s\n" % (args.memo))
                    writer.write("=========eval report =========\n")
                    writer.write("step : %s \n" % (str(steps)))
                    writer.write("average_train loss: %s\n" % (str(total_train_loss / steps)))
                    writer.write("subset train loss: %s\n" % (str(test_loss)))
                    writer.write("subset f1 score: %s\n" % (str(test_f1)))
                    writer.write("eval loss: %s\n" % (str(loss)))
                    writer.write("eval f1 score: %s\n" % (str(f1)))
                    writer.write('\n')
                if f1 > best_f1:
                    logger.info("we get a best dev f1 %s saving model....", str(f1))
                    output_path = os.path.join(args.out_dir, "pytorch_model.bin")
                    if hasattr(model, 'module'):
                        logger.info("model has module")
                    model_to_save = model.module if hasattr(model, 'module') else model
                    torch.save(model_to_save.state_dict(), output_path)
                    logger.info("model saved")
                    best_f1 = f1
                    save_config(args)
                    logger.info("args saved")

        # load the final model
        args.to_resume_model = True
        model = load_model(Model, args, config)
        model = model.to(device)
        if args.n_gpus > 1:
            model = nn.DataParallel(model)
        dev_logits, loss, dev_labels = do_inference(model, dev_loader, device)  # do the inference for the dev set
        pub_data = Corpus(args, 'test.csv', transform)
        pub_sampler = SequentialSampler(pub_data)
        pub_loader = DataLoader(pub_data,
                                batch_size=args.eval_batch_size,
                                sampler=pub_sampler)
        # logits, loss, dev_labels = do_inference(model, dev_loader, device)
        test_logits, _, _ = do_inference(model, pub_loader, device)
        return dev_logits, dev_labels, test_logits
    else:
        logger.info("the model %s is not registered", args.model_name)
        return
num_layers = 1
# num_layers = 2
num_epochs = 10
num_samples = 10000  # number of words to be sampled
batch_size = 20
seq_length = 30
learning_rate = 0.002

# Load "WikiText-2" dataset
corpus = Corpus()
ids = corpus.get_data('data/wikitext-2-v1.train.tokens', batch_size)
vocab_size = len(corpus.dictionary)
num_batches = ids.size(1) // seq_length


# RNN based language model
class RNNLM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(RNNLM, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
path_data = os.path.join(path_this, 'data', 'meditations.mb.txt')

from data_utils import Corpus

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

embed_size = 128
hidden_size = 1024
num_layers = 1
num_epoch = 20
batch_size = 20
max_seq = 30
learning_rate = 0.002
max_vocab = None

corpus = Corpus()
tensor = corpus.fit(path_data, limit=max_vocab)
vocab_size = len(corpus.vocab.stoi)
tot_batch = tensor.shape[1] // max_seq
# not yet?
# tensor = tensor[:, :max_seq * tot_batch]  # remove spilled


# RNN based language model
class RNNLM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(RNNLM, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size,
                            num_layers, batch_first=True)  # assumed continuation: follows the sibling RNNLM snippets
import numpy as np
from torch.nn.utils import clip_grad_norm_
from data_utils import Dictionary, Corpus

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

embed_size = 128
hidden_size = 1024
num_layers = 1
num_epochs = 5
num_samples = 1000
batch_size = 20
seq_length = 30
learning_rate = 0.002

corpus = Corpus()
ids = corpus.get_data('data/train.txt', batch_size)
vocab_size = len(corpus.dictionary)
num_batches = ids.size(1) // seq_length


class RNNLM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(RNNLM, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
from torch.nn.utils import clip_grad_norm_
from data_utils import Dictionary, Corpus

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

embed_size = 128
hidden_size = 1024
num_layers = 1
num_epochs = 0
num_samples = 1000
batch_size = 20
seq_length = 30
learning_rate = 0.002

corpus = Corpus()
ids = corpus.get_data('data/train.txt', batch_size)  # (batch_size, word_ids from many sentences)
vocab_size = len(corpus.dictionary)
num_batches = ids.size(1) // seq_length


class RNNLM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(RNNLM, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
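# Several of the RNNLM snippets above stop at the model definition. A minimal sketch
# of the training step that typically follows (truncated BPTT: the hidden state is
# detached between chunks and gradients are clipped). The forward() return shape,
# the detach() helper and the optimizer choice are assumptions, not taken from the
# originals.
import torch
from torch import nn


def detach(states):
    # cut the gradient history so backprop only spans one seq_length chunk
    return tuple(state.detach() for state in states)


model = RNNLM(vocab_size, embed_size, hidden_size, num_layers).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    # start each epoch with a fresh zero hidden state
    states = (torch.zeros(num_layers, batch_size, hidden_size).to(device),
              torch.zeros(num_layers, batch_size, hidden_size).to(device))
    for i in range(0, ids.size(1) - seq_length, seq_length):
        inputs = ids[:, i:i + seq_length].to(device)
        targets = ids[:, (i + 1):(i + 1) + seq_length].to(device)

        states = detach(states)
        outputs, states = model(inputs, states)  # assumed forward: returns (logits of shape (batch*seq, vocab), new states)
        loss = criterion(outputs, targets.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()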