def run():
    dfx = pd.read_csv(config.TRAINING_FILE)
    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42)

    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=2)

    device = torch.device("cuda")
    model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    # Weight decay applies to all parameters except biases and LayerNorm terms.
    optimizer_parameters = [
        {
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.001
        },
        {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    results = []  # per-epoch validation Jaccard scores
    for epoch in range(config.EPOCHS):
        train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        jaccard = eval_fn(valid_data_loader, model, device)
        results.append(jaccard)
        print(f"Jaccard Score = {jaccard}")

    torch.save(model.state_dict(), config.MODEL_PATH)
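# eval_fn above is expected to return the competition's word-level Jaccard
# score. For reference, a minimal sketch of that metric (this jaccard() helper
# is illustrative and is not defined in the snippet above):
def jaccard(str1, str2):
    # Word-level Jaccard similarity: |A ∩ B| / |A ∪ B| over lowercased tokens.
    a = set(str(str1).lower().split())
    b = set(str(str2).lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))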
def run():
    seed_everything(config.SEED)
    df_train = pd.read_csv(config.TRAINING_FILE).dropna().reset_index(drop=True)

    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    device = torch.device("cuda")
    model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.001
        },
        {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    # Full-data variant: there is no validation split, so train for a fixed
    # number of epochs and save the final weights.
    for epoch in range(EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        if epoch + 1 == MAX_EPOCHS:
            torch.save(model.state_dict(), 'model_full.bin')
            break
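# seed_everything() is called but not defined in this snippet. A common
# implementation (an assumption here, not the original helper) seeds every RNG
# the training loop touches:
import os
import random

import numpy as np
import torch

def seed_everything(seed):
    # Seed Python, NumPy, and PyTorch (CPU and all GPUs) for reproducibility.
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False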
def train(fold, epochs, training_file, tokenizer, max_len, train_batch_size,
          valid_batch_size, roberta_path, lr, patience, num_warmup_steps):
    dfx = pd.read_csv(training_file)
    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    # Training set
    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values,
        tokenizer=tokenizer,
        max_len=max_len
    )
    # Validation set
    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values,
        tokenizer=tokenizer,
        max_len=max_len
    )

    train_sampler, valid_sampler = None, None
    if args.shuffle:
        train_sampler = RandomSampler(train_dataset)
        valid_sampler = SequentialSampler(valid_dataset)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=train_batch_size,
        num_workers=4,
        sampler=train_sampler
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=valid_batch_size,
        num_workers=2,
        sampler=valid_sampler
    )

    device = torch.device("cuda")
    model_config = transformers.RobertaConfig.from_pretrained(roberta_path)
    model_config.output_hidden_states = True
    model = TweetModel(roberta_path=roberta_path, conf=model_config)
    model.to(device)

    num_train_steps = int(len(df_train) / train_batch_size * epochs)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.003},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_parameters, lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_train_steps
    )

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # Multi-GPU training (must come after the apex fp16 initialization).
    if args.parallel:
        model = torch.nn.DataParallel(model)

    es = utils.EarlyStopping(patience=patience, mode="max")
    print("Training is Starting for fold", fold)
    for epoch in range(epochs):
        train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        jaccard = eval_fn(valid_data_loader, model, device)
        print("Jaccard Score = ", jaccard)
        experiment.log_metric("jaccard", jaccard)
        es(jaccard, model, model_path=f"{save_path}/model_{fold}.bin")
        if es.early_stop:
            print("Early stopping")
            break

    # Release GPU memory before the next fold runs.
    del model, optimizer, scheduler, df_train, df_valid, \
        train_dataset, valid_dataset, train_data_loader, valid_data_loader
    import gc
    gc.collect()
    torch.cuda.empty_cache()
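# utils.EarlyStopping is used as es(score, model, model_path=...) with
# mode="max" and exposes an early_stop flag. A minimal sketch consistent with
# that interface (the real utils module may differ):
import torch

class EarlyStopping:
    def __init__(self, patience=2, mode="max"):
        self.patience = patience
        self.mode = mode
        self.best_score = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, score, model, model_path):
        # With mode="max" a larger score (e.g. Jaccard) is better.
        metric = score if self.mode == "max" else -score
        if self.best_score is None or metric > self.best_score:
            self.best_score = metric
            torch.save(model.state_dict(), model_path)  # checkpoint on improvement
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True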
def run(fold):
    dfx = pd.read_csv(config.TRAINING_FILE)
    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=2)

    device = torch.device("cuda")
    model_config = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.001
        },
        {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    es = utils.EarlyStopping(patience=2, mode="max")
    print(f"Training is Starting for fold={fold}")
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        jaccard = engine.eval_fn(valid_data_loader, model, device)
        # print(f"Jaccard Score = {jaccard}")
        es(jaccard, model, model_path=f"model_{fold}.bin")
        if es.early_stop:
            print("Early stopping")
            break
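# run() trains one fold. A typical driver invokes it once per value of the
# kfold column (five folds is an assumption; the real count depends on how
# TRAINING_FILE was split):
if __name__ == "__main__":
    for fold_ in range(5):
        run(fold_)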
def main():
    dfx = pd.read_csv(config.TRAINING_FILE)
    df_train, df_valid = train_test_split(dfx, test_size=0.2, random_state=42)

    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values,
    )
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values,
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=2)

    device = torch.device("cuda")
    model_config = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.001,
        },
        {
            "params": [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    for _ in range(config.EPOCHS):
        train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        jaccard = eval_fn(valid_data_loader, model, device)
        print(f"Jaccard Score = {jaccard}")

    # Saves the entire module object, not just the weights.
    torch.save(model, "model.pth")
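# Because torch.save(model, ...) pickles the whole module, loading it back
# requires the TweetModel class to be importable at load time. A sketch of the
# matching load (weights_only=False is needed on recent PyTorch releases, which
# default to weights-only loading):
model = torch.load("model.pth", map_location="cpu", weights_only=False)
model.eval()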
def run(fold):
    dfx = pd.read_csv(config.TRAINING_FILE)

    # Set train/validation split for this fold.
    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values
    )
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4
    )

    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=2
    )

    device = torch.device("cuda")
    model_config = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    # Define two sets of parameters: those with weight decay, and those without.
    optimizer_parameters = [
        {
            "params": [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.001,
        },
        {
            "params": [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)

    # Create a scheduler to set the learning rate at each training step:
    # "Create a schedule with a learning rate that decreases linearly after
    # linearly increasing during a warmup period."
    # (https://pytorch.org/docs/stable/optim.html)
    # Since num_warmup_steps=0, the learning rate starts at 3e-5 and then
    # decreases linearly at each training step.
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )

    es = utils.EarlyStopping(patience=2, mode="max")
    print(f"Training is Starting for fold={fold}")
    logger.info("{} - {}".format("Training is Starting for fold", fold))
    # model = nn.DataParallel(model)

    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        jaccard = engine.eval_fn(valid_data_loader, model, device)
        print(f"Jaccard Score = {jaccard}")
        logger.info("EPOCHS {} - Jaccard Score - {}".format(epoch, jaccard))
        es(jaccard, model, model_path=f"../models/nmodel_{fold}.bin")
        if es.early_stop:
            print("Early stopping")
            break
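# As the comment in run() explains, with num_warmup_steps=0 the learning rate
# starts at the configured 3e-5 and decays linearly to 0 over num_train_steps.
# A self-contained sketch showing the schedule (toy parameter and step count;
# not part of the training code above):
import torch
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

opt = AdamW([torch.nn.Parameter(torch.zeros(1))], lr=3e-5)
sched = get_linear_schedule_with_warmup(opt, num_warmup_steps=0, num_training_steps=100)
for step in range(100):
    opt.step()
    sched.step()
    if step % 25 == 0:
        print(step, sched.get_last_lr())  # falls linearly from 3e-5 toward 0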
num_epochs = 5
batch_size = 16
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

train_df = pd.read_csv('data/train.csv')
train_df['text'] = train_df['text'].astype(str)
train_df['selected_text'] = train_df['selected_text'].astype(str)

for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df.sentiment), start=1):
    print(f'Fold: {fold}')
    model = TweetModel(MODEL_PATH)
    num_train_steps = int(len(train_idx) / batch_size * num_epochs)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.001
        },
        {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]
def run():
    dfx = pd.read_csv(config.TRAINING_FILE).dropna().reset_index(drop=True)
    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values)
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values)
    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device("cuda")
    conf = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
    model = TweetModel(conf)
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.001
        },
        {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]
    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    model = nn.DataParallel(model)

    best_jaccard = 0
    for epoch in range(config.EPOCHS):
        train_fn(train_data_loader, model, optimizer, device, scheduler)
        jaccard = eval_fn(valid_data_loader, model, device)
        print(f"Jaccard Score = {jaccard}")
        # Keep only the checkpoint with the best validation score.
        if jaccard > best_jaccard:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_jaccard = jaccard
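# Because the model is wrapped in nn.DataParallel before saving, every key in
# the saved state_dict carries a "module." prefix. Loading into an unwrapped
# TweetModel later means stripping that prefix (a sketch, reusing conf and
# config.MODEL_PATH from above):
state = torch.load(config.MODEL_PATH, map_location="cpu")
state = {k.replace("module.", "", 1): v for k, v in state.items()}
model = TweetModel(conf)
model.load_state_dict(state)
model.eval()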
def train(fold, epochs, training_file, tokenizer, max_len, train_batch_size,
          valid_batch_size, roberta_path, lr, patience, num_warmup_steps):
    dfx = pd.read_csv(training_file)
    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    train_sampler = None
    val_sampler = None

    # Training set
    # 3) Use DistributedSampler
    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values,
        tokenizer=tokenizer,
        max_len=max_len
    )
    # Validation set
    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values,
        tokenizer=tokenizer,
        max_len=max_len
    )
    if distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
        val_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=train_batch_size,
        shuffle=(train_sampler is None),
        num_workers=4,
        sampler=train_sampler
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=valid_batch_size,
        shuffle=False,
        num_workers=2,
        sampler=val_sampler
    )

    device = torch.device("cuda")
    model_config = transformers.RobertaConfig.from_pretrained(roberta_path)
    model_config.output_hidden_states = True
    model = TweetModel(roberta_path=roberta_path, conf=model_config)
    model.to(device)

    num_device = max(1, torch.cuda.device_count())
    if num_device > 1:
        print("Let's use", num_device, "GPUs!")
        # 5) Wrap the model
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            find_unused_parameters=True
        )

    num_train_steps = int(len(df_train) / train_batch_size * epochs)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    # Scale the learning rate linearly with the number of devices.
    optimizer = AdamW(optimizer_parameters, lr=lr * num_device)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_train_steps
    )

    es = utils.EarlyStopping(patience=patience, mode="max")
    print("Training is Starting for fold", fold)
    for epoch in range(epochs):
        if distributed:
            # Reshuffle shards so each epoch sees a different ordering per rank.
            train_sampler.set_epoch(epoch)
        train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        jaccard = eval_fn(valid_data_loader, model, device)
        # if distributed:
        #     jaccard_reduce = reduce_tensor(jaccard)
        #     print("jaccard_reduce:", jaccard_reduce)
        if not distributed or torch.distributed.get_rank() == 0:
            # Only rank 0 logs and writes checkpoints.
            print("Jaccard Score = ", jaccard)
            es(jaccard, model, model_path=f"./bin/model_{fold}.bin")
            if es.early_stop:
                print("Early stopping")
                break

    # Release GPU memory before the next fold runs.
    del model, optimizer, scheduler, df_train, df_valid, \
        train_dataset, valid_dataset, train_data_loader, valid_data_loader
    import gc
    gc.collect()
    torch.cuda.empty_cache()
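# The free variables distributed and local_rank above are assumed to come from
# the launcher. A typical setup (a sketch; the original argument parsing is not
# shown) initializes the process group before train() is called:
import os
import torch

local_rank = int(os.environ.get("LOCAL_RANK", 0))  # set by torchrun
distributed = int(os.environ.get("WORLD_SIZE", 1)) > 1
if distributed:
    torch.cuda.set_device(local_rank)
    torch.distributed.init_process_group(backend="nccl")
# Launch with e.g.:  torchrun --nproc_per_node=4 train.py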