def run(): df = pd.read_csv(config.TRAINING_FILE).fillna("none") df.sentiment = df.sentiment.apply(lambda x: 1 if x == "positive" else 0) df_train, df_valid = model_selection.train_test_split( df, test_size=0.1, random_state=42, stratify=df.sentiment.values) # Same ratio of +ve and -ve index df_train = df_train.reset_index(drop=True) df_valid = df_valid.reset_index(drop=True) train_dataset = dataset.BERTDataset(review=df_train.review.values, target=df_train.sentiment.values) train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4, ) valid_dataset = dataset.BERTDataset(review=df_valid.review.values, target=df_valid.sentiment.values) valid_data_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=4, ) model = BERTBaseUncased() trainer = Trainer(gpus=1) trainer.fit(model, train_dataloader=train_data_loader, val_dataloaders=[valid_data_loader])
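# These run() variants all assume a dataset.BERTDataset that tokenizes one review per item and
# returns the tensors the engine expects. A minimal sketch under that assumption (config.TOKENIZER
# and config.MAX_LEN are illustrative names, not necessarily the exact ones each script uses):
import torch
import config  # project-level config module assumed by these scripts

class BERTDataset:
    def __init__(self, review, target):
        self.review = review
        self.target = target
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_LEN

    def __len__(self):
        return len(self.review)

    def __getitem__(self, item):
        review = " ".join(str(self.review[item]).split())
        inputs = self.tokenizer.encode_plus(
            review,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )
        return {
            "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            "targets": torch.tensor(self.target[item], dtype=torch.float),
        }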
def run(): dfx = pd.read_csv(config.TRAINING_FILE).fillna("none") dfx.sentiment = dfx.sentiment.apply(lambda x: 1 if x == "positive" else 0) df_train, df_valid = model_selection.train_test_split( dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values) df_train = df_train.reset_index(drop=True) df_valid = df_valid.reset_index(drop=True) train_dataset = dataset.BERTDataset(review=df_train.review.values, target=df_train.sentiment.values) valid_dataset = dataset.BERTDataset(review=df_valid.review.values, target=df_valid.sentiment.values) train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4) valid_data_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1) device = torch.device("cuda") model = BERTBaseUncased() model.to(device) param_optimizer = list(model.named_parameters()) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] optimizer_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] num_train_steps = int( len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS) optimizer = AdamW(optimizer_parameters, lr=3e-5) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=num_train_steps) best_accuracy = 0 for epoch in range(config.EPOCHS): engine.train_fn(train_data_loader, model, optimizer, scheduler, device) outputs, targets = engine.eval_fn(valid_data_loader, model, device) outputs = np.array(outputs) >= 0.5 accuracy = metrics.accuracy_score(targets, outputs) print(f"Accuracy Score = {accuracy}") if accuracy > best_accuracy: torch.save(model.state_dict(), config.MODEL_PATH) best_accuracy = accuracy
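# Most variants below call engine.train_fn / engine.eval_fn without showing them. A minimal sketch of
# what such functions typically look like (argument order differs between snippets, so treat the
# signatures as illustrative rather than the exact engine.py behind each script):
import torch
import torch.nn as nn

def loss_fn(outputs, targets):
    # binary classification on a single logit
    return nn.BCEWithLogitsLoss()(outputs, targets.view(-1, 1))

def train_fn(data_loader, model, optimizer, device, scheduler):
    model.train()
    for d in data_loader:
        ids = d["ids"].to(device, dtype=torch.long)
        mask = d["mask"].to(device, dtype=torch.long)
        token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
        targets = d["targets"].to(device, dtype=torch.float)
        optimizer.zero_grad()
        outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        scheduler.step()

def eval_fn(data_loader, model, device):
    model.eval()
    fin_outputs, fin_targets = [], []
    with torch.no_grad():
        for d in data_loader:
            ids = d["ids"].to(device, dtype=torch.long)
            mask = d["mask"].to(device, dtype=torch.long)
            token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
            targets = d["targets"].to(device, dtype=torch.float)
            outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets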
def run(): df1 = pd.read_csv("../input/jigsaw-toxic-comment-train.csv", usecols=["comment_text", "toxic"]) df2 = pd.read_csv("../input/jigsaw-unintended-bias-train.csv", usecols=["comment_text", "toxic"]) df_train = pd.concat([df1, df2], axis=0).reset_index(drop=True) df_valid = pd.read_csv("../input/validation.csv") train_dataset = dataset.BERTDataset( comment_text=df_train.comment_text.values, target=df_train.toxic.values) train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4) valid_dataset = dataset.BERTDataset( comment_text=df_valid.comment_text.values, target=df_valid.toxic.values) valid_data_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1) device = torch.device(config.DEVICE) model = BERTBaseUncased() model.to(device) param_optimizer = list(model.named_parameters()) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] optimizer_parameters = [{ "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], "weight_decay": 0.001 }, { "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0 }] num_train_steps = int( len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS) optimizer = AdamW(optimizer_parameters, lr=3e-5) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=num_train_steps) best_accuracy = 0 for epoch in range(config.EPOCHS): engine.train_fn(train_data_loader, model, optimizer, device, scheduler) outputs, targets = engine.eval_fn(valid_data_loader, model, device) targets = np.array(targets) >= 0.5 accuracy = metrics.roc_auc_score(targets, outputs) print(f"AUC Score = {accuracy}") if accuracy > best_accuracy: torch.save(model.state_dict(), config.MODEL_PATH) best_accuracy = accuracy
def run(): df1 = pd.read_csv(config.TRAINING_FILE, usecols=["comment_text","toxic"]) train_dataset = dataset.BERTDataset( review=df1.comment_text.values, target=df1.toxic.values ) train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4 ) df2=pd.read_csv("../input/validation.csv", usecols=["comment_text","toxic"]) valid_dataset = dataset.BERTDataset( review=df2.comment_text.values, target=df2.toxic.values ) valid_data_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1 ) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = BERTBaseUncased() model.to(device) param_optimizer = list(model.named_parameters()) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] optimizer_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}, ] num_train_steps = int(len(df1) / config.TRAIN_BATCH_SIZE * config.EPOCHS) optimizer = AdamW(optimizer_parameters, lr=3e-5) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=num_train_steps ) model = nn.DataParallel(model) best_accuracy = 0 for epoch in range(config.EPOCHS): engine.train_fn(train_data_loader, model, optimizer, device, scheduler) outputs, targets = engine.eval_fn(valid_data_loader, model, device) outputs = np.array(outputs) >= 0.5 accuracy = metrics.accuracy_score(targets, outputs) print(f"Accuracy Score = {accuracy}") if accuracy > best_accuracy: torch.save(model.state_dict(), config.MODEL_PATH) best_accuracy = accuracy
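# Note on the nn.DataParallel wrapping above: once the model is wrapped, state_dict() keys gain a
# "module." prefix and will not load into a bare BERTBaseUncased() in the prediction scripts further
# down. A small sketch of a safer save (an assumption about intent, not the original repo's code):
import torch
import torch.nn as nn

def save_checkpoint(model, path):
    # unwrap DataParallel so the checkpoint loads into a single-GPU/CPU model
    state_dict = model.module.state_dict() if isinstance(model, nn.DataParallel) else model.state_dict()
    torch.save(state_dict, path)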
def run(): print('1.Loading data...') dfx = pd.read_csv(config.TRAINING_FILE).fillna("none") # only train 2000 entries dfx = dfx[:2000] dfx.sentiment = dfx.sentiment.apply(lambda x: 1 if x == "positive" else 0) df_train, df_valid = model_selection.train_test_split( dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values) df_train = df_train.reset_index(drop=True) df_valid = df_valid.reset_index(drop=True) print('Creating dataset...') train_dataset = dataset.BERTDataset(review=df_train.review.values, target=df_train.sentiment.values) valid_dataset = dataset.BERTDataset(review=df_valid.review.values, target=df_valid.sentiment.values) print('Creating dataloader...') train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4) valid_data_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1) print('Building Bert Model...') model = BERTBaseUncased() print("Creating BERT Trainer...") trainer = BERTTrainer(model=model, train_dataloader=train_data_loader, test_dataloader=valid_data_loader, lr=config.LR, with_cuda=config.USE_CUDA) # model = nn.DataParallel(model) print('Training Start...') best_accuracy = 0 for epoch in range(config.EPOCHS): train_acc, train_loss = trainer.train_fn(epoch, len(df_train)) print(f'Train loss: {train_loss} Train accuracy: {train_acc:.4%}') outputs, targets = trainer.eval_fn() outputs = np.array(outputs) >= 0.5 accuracy = metrics.accuracy_score(targets, outputs) print(f"Accuracy Score = {accuracy:.2%}") if accuracy > best_accuracy: torch.save(model.state_dict(), config.MODEL_PATH) best_accuracy = accuracy
def run(): # Read in CSV df = pd.read_csv(config.TRAINING_FILE) print('Read In Complete!') # Split into Validation df_train, df_val = train_test_split(df, test_size=0.1, stratify=df.sentiment.values, random_state=config.RANDOM_SEED) df_train = df_train.reset_index(drop=True) df_val = df_val.reset_index(drop=True) print(df_train.shape, df_val.shape) print('Validation Split Complete!') # Create Dataset required for BERT Model train_dataset = dataset.BERTDataset(df_train.content.values, df_train.sentiment.values) val_dataset = dataset.BERTDataset(df_val.content.values, df_val.sentiment.values) train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4) val_data_loader = torch.utils.data.DataLoader(val_dataset, batch_size=config.VAL_BATCH_SIZE, num_workers=1) print('Dataset for Model Complete!') # Define Model and Hyperparameters device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") model = BERTBaseCased() model.to(device) num_training_steps = len(train_data_loader) * config.EPOCHS optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False) scheduler = get_linear_schedule_with_warmup(optimizer, num_training_steps=num_training_steps, num_warmup_steps=0) # Train the Model, Print Accuracy, Save the Model n_train_exp = len(df_train) n_val_exp = len(df_val) history = defaultdict(list) best_accuracy = 0 for epoch in range(config.EPOCHS): print(f'\n{"#" * 10} Epoch: {epoch+1}/{config.EPOCHS} {"#" * 10}\n') train_acc, train_loss = engine.train_fn(train_data_loader, model, optimizer, device, scheduler, n_train_exp) val_acc, val_loss = engine.eval_fn(val_data_loader, model, device, n_val_exp) print(f'\nTrain Loss: {train_loss:.4f} Acc: {train_acc:.4f} \nVal Loss: {val_loss:.4f} Val Acc: {val_acc:.4f}') history['train_acc'].append(train_acc) history['train_loss'].append(train_loss) history['val_acc'].append(val_acc) history['val_loss'].append(val_loss) if val_acc > best_accuracy: #!rm -rf /content/model* torch.save(model.state_dict(), config.MODEL_PATH) # f'model/model_{val_acc:0.2f}.bin') best_accuracy = val_acc
def sentence_prediction(sentence): sentence = preprocess(sentence) model_path = config.MODEL_PATH test_dataset = dataset.BERTDataset( review=[sentence], target=[0] ) test_data_loader = torch.utils.data.DataLoader( test_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=3 ) device = config.device model = BERTBaseUncased() model.load_state_dict(torch.load( model_path, map_location=torch.device(device))) model.to(device) outputs, _ = engine.predict_fn(test_data_loader, model, device) print(outputs) return outputs[0]
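# A quick usage sketch for sentence_prediction (purely illustrative; assumes the preprocess()
# helper and a trained checkpoint at config.MODEL_PATH exist):
if __name__ == "__main__":
    probability = sentence_prediction("This movie was surprisingly good")
    print(f"positive probability: {probability}")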
def generate_predictions(df): df.reset_index(drop=True, inplace=True) predict_dataset = dataset.BERTDataset(review=df.PROCESSED_TEXT.values) predict_data_loader = torch.utils.data.DataLoader( predict_dataset, batch_size=config.PREDICT_BATCH_SIZE, num_workers=config.NUM_WORKERS, ) test_preds = np.zeros(df.shape[0]) with torch.no_grad(): for bi, d in enumerate(predict_data_loader): ids = d["ids"] token_type_ids = d["token_type_ids"] mask = d["mask"] ids = ids.to(DEVICE, dtype=torch.long) token_type_ids = token_type_ids.to(DEVICE, dtype=torch.long) mask = mask.to(DEVICE, dtype=torch.long) preds = MODEL(ids=ids, mask=mask, token_type_ids=token_type_ids) test_preds[ bi * config.PREDICT_BATCH_SIZE : (bi + 1) * config.PREDICT_BATCH_SIZE ] = (preds[:, 0].detach().cpu().squeeze().numpy()) output = torch.sigmoid(torch.tensor(test_preds)).numpy().ravel() return output
def process_dataset(df, batch_size, num_workers): df = df.reset_index(drop=True) this_dataset = dataset.BERTDataset( review=df.sentence.values, target=df.ENCODE_CAT.values ) data_loader = torch.utils.data.DataLoader( this_dataset, batch_size=batch_size, num_workers=num_workers) return data_loader
def create_data_loader(df, tokenizer, max_len, batch_size): ds = dataset.BERTDataset( reviews=df.content.to_numpy(), targets=df.category.to_numpy(), tokenizer=tokenizer, max_len=max_len ) return DataLoader( ds, batch_size=batch_size, num_workers=4 )
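# Example usage of create_data_loader (the tokenizer name, max_len and batch size are illustrative
# assumptions; df_train / df_val are splits like the ones produced in the functions above):
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
train_data_loader = create_data_loader(df_train, tokenizer, max_len=160, batch_size=16)
val_data_loader = create_data_loader(df_val, tokenizer, max_len=160, batch_size=16)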
def main(_): input_file = config.EVAL_PROC output = 'predictions.csv' model_path = config.MODEL_PATH if FLAGS.input: input_file = FLAGS.input if FLAGS.output: output = FLAGS.output if FLAGS.model_path: model_path = FLAGS.model_path df_test = pd.read_fwf(input_file) logger.info(f"Bert Model: {config.BERT_PATH}") logger.info( f"Current date and time :{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} " ) logger.info(f"Test file: {input_file}") logger.info(f"Test size : {len(df_test)}") trg = [0] * len(df_test) test_dataset = dataset.BERTDataset(text=df_test.values, target=trg) test_data_loader = torch.utils.data.DataLoader( test_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=3) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') model = BERTBaseUncased(config.DROPOUT) model.load_state_dict( torch.load(model_path, map_location=torch.device(device))) model.to(device) outputs, extracted_features = engine.predict_fn( test_data_loader, model, device, extract_features=FLAGS.features) df_test["predicted"] = outputs # save file df_test.to_csv(output, header=None, index=False)
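# The FLAGS used in main() are assumed to come from abseil-py; a sketch of the flag definitions and
# entry point this script would need (flag names inferred from the code, defaults illustrative):
from absl import app, flags

FLAGS = flags.FLAGS
flags.DEFINE_string("input", None, "path to the test file")
flags.DEFINE_string("output", None, "where to write the predictions CSV")
flags.DEFINE_string("model_path", None, "trained model checkpoint to load")
flags.DEFINE_boolean("features", False, "also extract hidden-state features")

if __name__ == "__main__":
    app.run(main)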
def run(opt_level="O2", keep_batchnorm_fp32=True, batch_size=5, nb_epochs=10, data_path="../inputs/IMDB_Dataset.csv", model_path="./"): df = pd.read_csv(data_path).fillna("none")[0:100] df.sentiment = df.sentiment.apply(lambda x: 1 if x == "positive" else 0) df_train, df_valid = model_selection.train_test_split( df, test_size=0.1, random_state=42, stratify=df.sentiment.values) df_train = df_train.reset_index(drop=True) df_valid = df_valid.reset_index(drop=True) # Creating the datasets train_dataset = dataset.BERTDataset(review=df_train.review.values, target=df_train.sentiment.values) valid_dataset = dataset.BERTDataset(review=df_valid.review.values, target=df_valid.sentiment.values) # Creating the dataloaders train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size, num_workers=10, drop_last=True) valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size, num_workers=10, drop_last=True) # Defining the model and sending to the device device = torch.device("cuda") model = BERTBaseUncased() model.to(device) parameters = list(model.named_parameters()) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] # We don't want any decay for them optimizer_parameters = [{ "params": [p for n, p in parameters if not any(nd in n for nd in no_decay)], "weight_decay": 0.001 }, { "params": [p for n, p in parameters if any(nd in n for nd in no_decay)], "weight_decay": 0.0 }] num_train_steps = int(len(df_train) * nb_epochs / batch_size) # Defining the optimizer and the scheduler optimizer = AdamW(optimizer_parameters, lr=3e-5) # Initialize the pytorch model and the optimizer to allow automatic mixed-precision training model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level, keep_batchnorm_fp32=keep_batchnorm_fp32, loss_scale="dynamic") scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, # No warmup num_training_steps=num_train_steps) # Train the model engine.global_trainer(train_dataloader, valid_dataloader, model, optimizer, scheduler, device, nb_epochs, model_path)
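# With apex.amp initialized above, the backward pass inside engine.global_trainer presumably goes
# through amp.scale_loss. A sketch of one training step under that assumption (batch keys mirror the
# other snippets; this is illustrative, not the repo's exact engine code):
import torch
import torch.nn as nn
from apex import amp

def amp_training_step(d, model, optimizer, scheduler, device):
    ids = d["ids"].to(device, dtype=torch.long)
    mask = d["mask"].to(device, dtype=torch.long)
    token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
    targets = d["targets"].to(device, dtype=torch.float)
    optimizer.zero_grad()
    outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
    loss = nn.BCEWithLogitsLoss()(outputs, targets.view(-1, 1))
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()  # scaled backward pass for mixed precision
    optimizer.step()
    scheduler.step()
    return loss.item()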
def run(): dfx = pd.read_csv(config.TRAINING_FILE).fillna("none") dfx.sentiment = dfx.sentiment.apply( # can use label encoding lambda x: 1 if x == "positive" else 0 # can use map fn ) df_train, df_valid = model_selection.train_test_split( dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment. values # when split both train and val have same positive to negative sample ratio ) df_train = df_train.reset_index(drop=True) # 0 to length of df_train df_valid = df_valid.reset_index(drop=True) # 0 to length of df_valid train_dataset = dataset.BERTDataset(review=df_train.review.values, target=df_train.sentiment.values) train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4) valid_dataset = dataset.BERTDataset(review=df_valid.review.values, target=df_valid.sentiment.values) valid_data_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1) device = torch.device("cuda") # using cuda model = BERTBaseUncased() # calling from model.py param_optimizer = list( model.named_parameters()) # specify parameters to train no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] optimizer_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] """ These parameters are adjustable, we should take a look at different layers and the decay we want, how much learning rate etc.""" num_train_steps = int( len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS) optimizer = AdamW(optimizer_parameters, lr=3e-5) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=num_train_steps) # model = nn.DataParallel(model) # converting to multi gpu model best_accuracy = 0 for epoch in range(config.EPOCHS): engine.train_fn(train_data_loader, model, optimizer, device, scheduler) outputs, target = engine.eval_fn(valid_data_loader, model, device) outputs = np.array(outputs) >= 0.5 accuracy = metrics.accuracy_score(target, outputs) print(f"Accuracy score = {accuracy}") if accuracy > best_accuracy: torch.save( model.state_dict(), config.MODEL_PATH) # saving the model only if it improves best_accuracy = accuracy
def run(): print("---------- Starting Data Reading -------") df1 = pd.read_csv("../input/jigsaw-toxic-comment-train.csv", usecols=["comment_text", "toxic"]) df2 = pd.read_csv("../input/jigsaw-unintended-bias-train.csv", usecols=["comment_text", "toxic"]) df_train = pd.concat([df1, df2], axis=0).reset_index(drop=True) df_valid = pd.read_csv("../input/validation.csv") print("---- Data Read Successfully --- ") # # dfx = pd.read_csv(config.TRAINING_FILE).fillna("none") # # dfx["sentiment"] = dfx["sentiment"].apply( # # lambda x : 1 if x == "positive" else 0 # # ) # # df_train, df_valid = model_selection.train_test_split( # # dfx, # # test_size=0.1, # # random_state=42, # # stratify=dfx["sentiment"].values # # ) # df_train = df_train.reset_index(drop=True) # df_valid = df_valid.reset_index(drop=True) train_dataset = dataset.BERTDataset( comment_text=df_train["comment_text"].values, target=df_train["toxic"].values) train_dataloader = torch.utils.data.DataLoader( train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4, ) valid_dataset = dataset.BERTDataset( comment_text=df_valid["comment_text"].values, target=df_valid["toxic"].values) valid_dataloader = torch.utils.data.DataLoader( valid_dataset, batch_size=config.VALIDATION_BATCH_SIZE, num_workers=1, ) print("---- DataLoaders Created Successfully --- ") device = torch.device("cuda") model = BERTBasedUncased() model.to(device) param_optimizer = list(model.named_parameters()) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] optimizer_parameters = [ { "params": [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], "weight_decay": 0.001, }, { "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0, }, ] num_train_steps = int( len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS) optimizer = AdamW(optimizer_parameters, lr=3e-5) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=num_train_steps) best_accuracy = 0 for epoch in range(config.EPOCHS): engine.train_fn(train_dataloader, model, optimizer, scheduler, device) outputs, targets = engine.eval_fn(valid_dataloader, model, device) targets = np.array(targets) >= 0.5 accuracy = metrics.roc_auc_score(targets, outputs) print(f"AUC Score {accuracy}") if accuracy > best_accuracy: torch.save(model.state_dict(), config.MODEL_PATH) best_accuracy = accuracy
def train(): # this function trains the model # read the training file and fill NaN values with "none" df = pd.read_csv(config.TRAINING_FILE).fillna("none") # map positive to 1 and negative to 0 df.sentiment = df.sentiment.apply(lambda x: 1 if x == "positive" else 0) # split data into single training and validation fold df_train, df_valid = model_selection.train_test_split( df, test_size=0.1, random_state=42, stratify=df.sentiment.values) # reset index df_train = df_train.reset_index(drop=True) df_valid = df_valid.reset_index(drop=True) # initialize BERTDataset from dataset.py # for training dataset train_dataset = dataset.BERTDataset(review=df_train.review.values, target=df_train.sentiment.values) # create training dataloader train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4) # initialize BERTDataset from dataset.py # for validation dataset valid_dataset = dataset.BERTDataset(review=df_valid.review.values, target=df_valid.sentiment.values) # create validation dataloader valid_data_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1) # initialize the cuda device # use cpu if you don't have a GPU device = torch.device("cuda") # load model and send it to the device model = BERTBasedUncased() model.to(device) # create parameters we want to optimize # we generally don't use any decay for bias # and weight layers param_optimizer = list(model.named_parameters()) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] optimizer_parameters = [ { "params": [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], "weight_decay": 0.001, }, { "params": [ p for n, p in param_optimizer if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] # calculate the number of training steps # this is used by scheduler num_train_steps = int( len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS) # AdamW optimizer # AdamW is the most widely used optimizer # for transformer based networks optimizer = AdamW(optimizer_parameters, lr=3e-5) # fetch a scheduler # you can also try using reduce lr on plateau scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=num_train_steps) # if you have multiple GPUs # wrap the model in DataParallel # to use all of them model = nn.DataParallel(model) # start training the epochs best_accuracy = 0 for epoch in range(config.EPOCHS): # train model engine.train_fn(train_data_loader, model, optimizer, device, scheduler) # validate the model outputs, targets = engine.eval_fn(valid_data_loader, model, device) # convert outputs to numpy array outputs = np.array(outputs) >= 0.5 # calculate the accuracy accuracy = metrics.accuracy_score(targets, outputs) # print the accuracy print(f"Accuracy Score = {accuracy}") # save the model only if the accuracy is better than best_accuracy (initialized to 0) if accuracy > best_accuracy: torch.save(model.state_dict(), config.MODEL_PATH) best_accuracy = accuracy
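# The BERTBaseUncased / BERTBasedUncased model class used throughout these scripts is assumed to be
# a thin wrapper around a pretrained BERT with dropout and a single-logit head. A minimal sketch
# (config.BERT_PATH is the pretrained-weights path these scripts already reference):
import torch.nn as nn
import transformers
import config  # project-level config module assumed by these scripts

class BERTBaseUncased(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = transformers.BertModel.from_pretrained(config.BERT_PATH)
        self.bert_drop = nn.Dropout(0.3)
        self.out = nn.Linear(768, 1)

    def forward(self, ids, mask, token_type_ids):
        # pooled_output is the [CLS] representation after BERT's pooler
        _, pooled_output = self.bert(
            ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False
        )
        return self.out(self.bert_drop(pooled_output))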
def run(): df_train = pd.read_csv(config.TRAINING_FILE).fillna("none") df_train.label = df_train.label.apply( lambda x: 1 if x == "unscrambled" else 0) df_valid = pd.read_csv(config.VALID_FILE).fillna("none") df_valid.label = df_valid.label.apply( lambda x: 1 if x == "unscrambled" else 0) # df_train, df_valid = model_selection.train_test_split( # dfx, test_size=0.1, random_state=42, stratify=dfx.label.values # ) # df_train = df_train.reset_index(drop=True) # df_valid = df_valid.reset_index(drop=True) train_dataset = dataset.BERTDataset( text=df_train.text.values, target=df_train.label.values ) train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4 ) valid_dataset = dataset.BERTDataset( text=df_valid.text.values, target=df_valid.label.values ) valid_data_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1 ) device = torch.device("cuda") # model = DistilBERTBaseUncased() configuration = transformers.DistilBertConfig() # Initializing a model from the configuration # model = transformers.DistilBertModel(configuration) model = transformers.DistilBertForSequenceClassification.from_pretrained( 'distilbert-base-uncased') model.classifier = nn.Linear(768, 1) print(model) # exit(0) model.to(device) param_optimizer = list(model.named_parameters()) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] optimizer_parameters = [ { "params": [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], "weight_decay": 0.001, }, { "params": [ p for n, p in param_optimizer if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] num_train_steps = int( len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS) optimizer = AdamW(optimizer_parameters, lr=3e-5) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=num_train_steps ) best_f1 = 0 es_patience = 3 es = 0 for epoch in range(config.EPOCHS): engine.train_fn( train_data_loader, model, optimizer, device, scheduler) outputs, targets = engine.eval_fn(valid_data_loader, model, device) # valid_loss = nn.BCEWithLogitsLoss()(outputs, targets.view(-1, 1)) outputs = np.array(outputs) >= 0.5 accuracy = metrics.accuracy_score(targets, outputs) f1_score = metrics.f1_score(targets, outputs) print(f"Accuracy Score = {accuracy} F1 Score = {f1_score}") if f1_score > best_f1: print( f'Saving model, F1 score improved from {best_f1} to {f1_score}') torch.save(model.state_dict(), config.MODEL_PATH) best_f1 = f1_score es = 0 else: es += 1 print(f'Early Stop Counter {es} of {es_patience}') if es >= es_patience: print('Early stopping!') break
def run(): ''' Entire training loop - Create DataLoaders - Define Training Configuration - Launch Training Loop ''' # Num of available TPU cores if config.TPUs: n_TPUs = xm.xrt_world_size() DEVICE = xm.xla_device() else: DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' device = torch.device(DEVICE) # Read Data # df1 = pd.read_csv('data/jigsaw-toxic-comment-train.csv', usecols=['comment_text', 'toxic']) # df2 = pd.read_csv('data/jigsaw-unintended-bias-train.csv', usecols=['comment_text', 'toxic'], engine='python') # don't know why it was breaking with default C parser # df_train = df1 # pd.concat([df1,df2], axis=0).reset_index(drop=True) # df_valid = pd.read_csv('data/validation.csv') # Subsample df_train = pd.read_csv('data/jigsaw-toxic-comment-train-small.csv', usecols=['comment_text', 'toxic']) df_valid = pd.read_csv('data/validation-small.csv', usecols=['comment_text', 'toxic']) # Preprocess train_dataset = dataset.BERTDataset( comment=df_train.comment_text.values, target=df_train.toxic.values ) valid_dataset = dataset.BERTDataset( comment=df_valid.comment_text.values, target=df_valid.toxic.values ) drop_last=False train_sampler, valid_sampler = None, None if config.TPUs: drop_last=True train_sampler = DistributedSampler( train_dataset, num_replicas=n_TPUs, rank=xm.get_ordinal(), shuffle=True ) valid_sampler = DistributedSampler( valid_dataset, num_replicas=n_TPUs, rank=xm.get_ordinal(), shuffle=True ) # Create Data Loaders train_data_loader = torch.utils.data.DataLoader( dataset=train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4, drop_last=drop_last, sampler=train_sampler ) valid_data_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1, drop_last=drop_last, sampler=valid_sampler ) # Machine Configuration if config.MODEL == 'bert': model = BERTBaseUncased() elif config.MODEL == 'distil-bert': model = DistilBERTBaseUncased() else: print('Model chosen in config not valid') exit() model.to(device) # Optimizer Configuration param_optimizer = list(model.named_parameters()) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] optimizer_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}, ] lr = config.LR num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS) # TODO: why do the LR increases because of a distributed training ? if config.TPUs: num_train_steps /= n_TPUs lr *= n_TPUs optimizer = AdamW(optimizer_parameters, lr=lr) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=num_train_steps ) if not config.TPUs: if N_GPU > 1: model = nn.DataParallel(model) # Training loop best_score = 0 for epoch in range(config.EPOCHS): if config.TPUs: train_loader = pl.ParallelLoader(train_data_loader, [device]) valid_loader = pl.ParallelLoader(valid_data_loader, [device]) train_fn(train_loader.per_device_loader(device), model, optimizer, device, scheduler) outputs, targets = eval_fn(valid_loader.per_device_loader(device), model, device) else: train_fn(train_data_loader, model, optimizer, device, scheduler) outputs, targets = eval_fn(valid_data_loader, model, device) targets = np.array(targets) >= 0.5 # TODO: why ? 
auc_score = metrics.roc_auc_score(targets, outputs) # Save if best print(f"AUC Score = {auc_score}") if auc_score > best_score: if not config.TPUs: torch.save(model.state_dict(), config.MODEL_PATH) else: xm.save(model.state_dict(), config.MODEL_PATH) best_score = auc_score
def run(): dfx = pd.read_csv(configr.TRAINING_FILE).fillna('none') dfx.sentiment = dfx.sentiment.map({"positive": 1, "negative": 0}) df_train, df_valid = model_selection.train_test_split( dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values) df_train = df_train.reset_index(drop=True) df_valid = df_valid.reset_index(drop=True) train_dataset = dataset.BERTDataset(review=df_train.review.values, target=df_train.sentiment.values) train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=configr.TRAIN_BATCH_SIZE, num_workers=1) valid_dataset = dataset.BERTDataset(review=df_valid.review.values, target=df_valid.sentiment.values) valid_data_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=configr.VALID_BATCH_SIZE, num_workers=1) device = torch.device('cpu') model = BERTBaseUncased() model.to(device) param_optimizer = list(model.named_parameters()) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] optimizer_parameters = [ { "params": [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], "weight_decay": 0.001, }, { "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0, }, ] num_train_steps = int( len(df_train) / configr.TRAIN_BATCH_SIZE * configr.EPOCHS) optimizer = AdamW(optimizer_parameters, lr=3e-5) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=num_train_steps) best_accuracy = 0 for epoch in range(configr.EPOCHS): print("here") engine.train_fn(train_data_loader, model, optimizer, configr.ACCUMULATION, device) outputs, targets = engine.eval_fn(valid_data_loader, model, device) outputs = np.array(outputs) >= 0.5 accuracy = metrics.accuracy_score(targets, outputs) print("accuracy_score = {accuracy}".format(accuracy=accuracy)) if (accuracy > best_accuracy): torch.save(model.state_dict(), configr.MODEL_PATH) best_accuracy = accuracy
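# This variant passes configr.ACCUMULATION into train_fn, which suggests gradient accumulation for
# CPU training. A sketch of what such a train_fn could look like (illustrative, not the repo's exact
# implementation):
import torch
import torch.nn as nn

def train_fn(data_loader, model, optimizer, accumulation_steps, device):
    model.train()
    optimizer.zero_grad()
    for step, d in enumerate(data_loader):
        ids = d["ids"].to(device, dtype=torch.long)
        mask = d["mask"].to(device, dtype=torch.long)
        token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
        targets = d["targets"].to(device, dtype=torch.float)
        outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
        loss = nn.BCEWithLogitsLoss()(outputs, targets.view(-1, 1))
        # spread one "virtual" batch over several small batches
        (loss / accumulation_steps).backward()
        if (step + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()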
def run(): dfx = pd.read_csv(config.TRAINING_FILE) print("Shape of datframe:",dfx.shape) df_train, df_valid = model_selection.train_test_split( dfx, test_size=0.1, random_state=42, stratify=dfx.label.values ) df_train = df_train.reset_index(drop=True) df_valid = df_valid.reset_index(drop=True) print("Shape of train datframe:",df_train.shape) print("Shape of validation dataframe:",df_valid.shape) train_dataset = dataset.BERTDataset( sent=df_train.sentences.values, target=df_train.label.values ) train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=8 ) valid_dataset = dataset.BERTDataset( sent=df_valid.sentences.values, target=df_valid.label.values ) valid_data_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=2 ) device = torch.device(config.DEVICE) model = BERT_CLASSIFIER() if config.RETRAIN: DEVICE = 'cuda' model.load_state_dict(torch.load(config.RETRAIN_MODEL_LOC)) model.to(device) param_optimizer = list(model.named_parameters()) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] optimizer_parameters = [ { "params": [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], "weight_decay": 0.1, }, { "params": [ p for n, p in param_optimizer if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS) optimizer = AdamW(optimizer_parameters, lr=config.LEARNING_RATE) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=num_train_steps ) best_accuracy = 0 best_eval_loss = np.inf for epoch in range(config.EPOCHS): epoch_train_loss = engine.train_fn(train_data_loader, model, optimizer, device, scheduler) outputs, targets, epoch_eval_loss = engine.eval_fn(valid_data_loader, model, device) outputs = np.array(outputs) >= config.ACC_CUTOFF accuracy = metrics.accuracy_score(targets, outputs) print("Train loss = ", epoch_train_loss) print("Validation Loss = ", epoch_eval_loss) print("Accuracy Score =", accuracy) if config.TRAINING_MODE == 'ba': best_eval_loss = np.inf if accuracy > best_accuracy and epoch_eval_loss < best_eval_loss: print("Saving Model state") torch.save(model.state_dict(), config.MODEL_PATH) best_accuracy = accuracy best_eval_loss = epoch_eval_loss else: print("Saving model in dump folder") torch.save(model.state_dict(), config.MODEL_PATH_2 + f"{epoch}.bin")
def run(): #df = preprocess() #df = pd.read_csv(config.PROCESSED_FILE) df = pd.read_csv('data/processed_train_data.csv') #print(df.columns) df_train, df_valid = model_selection.train_test_split( df, test_size=0.3, random_state=32, stratify=df.offensive.values) df_train = df_train.reset_index(drop=True) df_valid = df_valid.reset_index(drop=True) train_dataset = dataset.BERTDataset(review=df_train.text.values, target=df_train.offensive.values) train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4) valid_dataset = dataset.BERTDataset(review=df_valid.text.values, target=df_valid.offensive.values) valid_data_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1) model = BERTBaseUncased() model.to(config.DEVICE) param_optimizer = list(model.named_parameters()) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] optimizer_parameters = [ { 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.001 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }, ] num_train_steps = int( len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS) optimizer = AdamW(optimizer_parameters, lr=3e-5) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=num_train_steps) model = nn.DataParallel(model) best_accuracy = 0 for epoch in range(5): engine.train_fn(train_data_loader, model, optimizer, config.DEVICE, scheduler) outputs, targets = engine.eval_fn(valid_data_loader, model, config.DEVICE) outputs = np.array(outputs) >= 0.5 accuracy = metrics.accuracy_score(targets, outputs) print(f"Accuracy Score = {accuracy}") if accuracy > best_accuracy: torch.save(model.state_dict(), config.MODEL_PATH) best_accuracy = accuracy
def run(): df1 = pd.read_csv("../input/jigsaw-multilingual-toxic-comment-train.csv", usecols=['comment_text', 'toxic']) df2 = pd.read_csv("../input/jigsaw-unintended-bias-train.csv", usecols=['comment_text', 'toxic']) #combine df1 and df2 into one big dataframe df_train = pd.concat([df1, df2], axis=0).reset_index(drop=True) #validation dataframe has been given by kaggle df_valid = pd.read_csv("../input/validation.csv") train_dataset = dataset.BERTDataset( comment_text=df_train.comment_text.values, target=df_train.toxic.values) #-------------------------------------- #use a distributed sampler when running on tpu train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal(), shuffle=True) #---------------------------------------- train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4, sampler=train_sampler, #torch_xla on tpu crashes if the last batch has a different size, so use drop_last drop_last=True) valid_dataset = dataset.BERTDataset( comment_text=df_valid.comment_text.values, target=df_valid.toxic.values) #-------------------------------------- #use a distributed sampler when running on tpu valid_sampler = torch.utils.data.distributed.DistributedSampler( valid_dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal(), shuffle=True) #---------------------------------------------- valid_data_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1, sampler=valid_sampler, #no need of drop_last here ) device = xm.xla_device() #xla_device means tpu model = BERTBaseUncased() model.to(device) #ParallelLoader moves the batches, but the model itself still has to be moved to the tpu device #specify what parameters you want to train param_optimizer = list(model.named_parameters()) #we don't want any decay for these layer names such as bias and other following things no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] optimizer_parameters = [ { #don't decay weight for above no_decay list else decay "params": [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], "weight_decay": 0.001, }, { "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0, }, ] num_train_steps = int( len(df_train) / config.TRAIN_BATCH_SIZE / xm.xrt_world_size() * config.EPOCHS) lr = 3e-5 * xm.xrt_world_size() #experiment with lr optimizer = AdamW(optimizer_parameters, lr=lr) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=num_train_steps) best_accuracy = 0 for epoch in range(config.EPOCHS): #parallel loader for tpus para_loader = pl.ParallelLoader(train_data_loader, [device]) engine.train_fn(para_loader.per_device_loader(device), model, optimizer, device, scheduler) parallel_loader = pl.ParallelLoader(valid_data_loader, [device]) outputs, targets = engine.eval_fn( parallel_loader.per_device_loader(device), model, device) #threshold the target instead of output targets = np.array(targets) >= 0.5 accuracy = metrics.accuracy_score(targets, outputs) print(f"Accuracy Score = {accuracy}") if accuracy > best_accuracy: #instead of torch.save use xm.save xm.save(model.state_dict(), config.MODEL_PATH) best_accuracy = accuracy
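# The TPU variant above is normally launched once per core through torch_xla's multiprocessing
# wrapper rather than by calling run() directly. A sketch of the usual entry point (assumes the
# standard torch_xla imports; the flags dict is illustrative):
import torch
import torch_xla.distributed.xla_multiprocessing as xmp

def _mp_fn(rank, flags):
    torch.set_default_tensor_type("torch.FloatTensor")
    run()

if __name__ == "__main__":
    FLAGS = {}
    xmp.spawn(_mp_fn, args=(FLAGS,), nprocs=8, start_method="fork")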
def train(): df = pd.read_csv(config.TRAINING_FILE).fillna("none") df['sentiment'] = df['sentiment'].map({"positive": 1, "negative": 0}) df_train, df_valid = train_test_split(df, test_size=0.1, random_state=42, stratify=df.sentiment.values) # reset index of both splits df_train = df_train.reset_index(drop=True) df_valid = df_valid.reset_index(drop=True) train_dataset = dataset.BERTDataset(review=df_train.review.values, target=df_train.sentiment.values) train_dataloader = torch.utils.data.DataLoader( train_dataset, batch_size=config.TRAIN_BATCH_SIZE, shuffle=False, num_workers=4, ) valid_dataset = dataset.BERTDataset(review=df_valid.review.values, target=df_valid.sentiment.values) valid_dataloader = torch.utils.data.DataLoader( valid_dataset, batch_size=config.VALID_BATCH_SIZE, shuffle=False, num_workers=4, ) device = torch.device("cuda") model = BERTBaseUncased() model.to(device) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5) scheduler = get_linear_schedule_with_warmup( optimizer=optimizer, num_warmup_steps=0, num_training_steps=int(len(df_train) / config.TRAIN_BATCH_SIZE) * config.EPOCHS) best_accuracy = 0 for epoch in range(config.EPOCHS): engine.train_fn(train_dataloader, model, optimizer, device, scheduler) outputs, targets = engine.eval_fn(valid_dataloader, model, device) outputs = np.array(outputs) >= 0.5 accuracy = metrics.accuracy_score(outputs, targets) print(f"Accuracy: {accuracy:.3f}") if accuracy > best_accuracy: best_accuracy = accuracy torch.save(model.state_dict(), config.MODEL_PATH)
def run(): dfx = pd.read_csv(config.TRAINING_FILE).fillna("none") #convert positive to 1 and negative to 0 dfx.sentiment = dfx.sentiment.apply(lambda x: 1 if x == "positive" else 0) #stratify split so that class can be balanced for both train and validation ==>> it means number of positive class will be equal to negative class for train ===>>same for validation dataset also df_train, df_valid = model_selection.train_test_split( dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values) df_train = df_train.reset_index(drop=True) df_valid = df_valid.reset_index(drop=True) train_dataset = dataset.BERTDataset(review=df_train.review.values, target=df_train.sentiment.values) train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4) valid_dataset = dataset.BERTDataset(review=df_valid.review.values, target=df_valid.sentiment.values) valid_data_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1) device = torch.device("cuda") model = BERTBaseUncased() model.to(device) #specify what parameters you want to train param_optimizer = list(model.named_parameters()) #we don't want any deacy for these layer names such as bias and othr following things no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] optimizer_parameters = [ { #don't decay weight for above no_decay list else decay "params": [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], "weight_decay": 0.001, }, { "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0, }, ] num_train_steps = int( len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS) #experiment with lr optimizer = AdamW(optimizer_parameters, lr=3e-5) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=num_train_steps) #convert model to multi-gpu model --->> no need to do this if you have not multiple gpus model = nn.DataParallel( model) # use @amp.autocast() in model.py if DataParallel() is enabled scaler = amp.GradScaler( ) #from torch.cuda import amp #this is required if using autoatic mixed precision #and pass scaler to train_fun best_accuracy = 0 for epoch in range(config.EPOCHS): engine.train_fn(train_data_loader, model, optimizer, device, scheduler, scaler) outputs, targets = engine.eval_fn(valid_data_loader, model, device) outputs = np.array(outputs) >= 0.5 accuracy = metrics.accuracy_score(targets, outputs) print(f"Accuracy Score = {accuracy}") if accuracy > best_accuracy: torch.save(model.state_dict(), config.MODEL_PATH) best_accuracy = accuracy
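# The GradScaler created above implies a train_fn built around torch.cuda.amp. A sketch of one
# training step under that assumption (mirrors the batch keys used elsewhere; illustrative only):
import torch
import torch.nn as nn
from torch.cuda import amp

def amp_train_step(d, model, optimizer, scheduler, device, scaler):
    ids = d["ids"].to(device, dtype=torch.long)
    mask = d["mask"].to(device, dtype=torch.long)
    token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
    targets = d["targets"].to(device, dtype=torch.float)
    optimizer.zero_grad()
    with amp.autocast():  # run the forward pass in mixed precision
        outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
        loss = nn.BCEWithLogitsLoss()(outputs, targets.view(-1, 1))
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
    scheduler.step()
    return loss.item()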
def main(_): test_file = config.EVAL_PROC model_path = config.MODEL_PATH if FLAGS.test_file: test_file = FLAGS.test_file if FLAGS.model_path: model_path = FLAGS.model_path df_test = pd.read_csv(test_file).fillna("none") logger.info(f"Bert Model: {config.BERT_PATH}") logger.info( f"Current date and time :{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} " ) logger.info(f"Test file: {test_file}") logger.info(f"Test size : {len(df_test):.4f}") test_dataset = dataset.BERTDataset(review=df_test.text.values, target=df_test.label.values) test_data_loader = torch.utils.data.DataLoader( test_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=3) device = config.device model = BERTBaseUncased() model.load_state_dict( torch.load(model_path, map_location=torch.device(device))) model.to(device) outputs, extracted_features = engine.predict_fn( test_data_loader, model, device, extract_features=FLAGS.features) df_test["predicted"] = outputs # save file df_test.to_csv(model_path.split("/")[-2] + '.csv', header=None, index=False) if FLAGS.features: pca = PCA(n_components=50, random_state=7) X1 = pca.fit_transform(extracted_features) tsne = TSNE(n_components=2, perplexity=10, random_state=6, learning_rate=1000, n_iter=1500) X1 = tsne.fit_transform(X1) # if row == 0: print("Shape after t-SNE: ", X1.shape) X = pd.DataFrame(np.concatenate([X1], axis=1), columns=["x1", "y1"]) X = X.astype({"x1": float, "y1": float}) # Plot for layer -1 plt.figure(figsize=(20, 15)) p1 = sns.scatterplot(x=X["x1"], y=X["y1"], palette="coolwarm") # p1.set_title("development-"+str(row+1)+", layer -1") x_texts = [] for output, value in zip(outputs, df_test.label.values): if output == value: x_texts.append("@" + label_decoder(output)[0] + label_decoder(output)) else: x_texts.append( label_decoder(value) + "-" + label_decoder(output)) X["texts"] = x_texts # X["texts"] = ["@G" + label_decoder(output) if output == value else "@R-" + label_decoder(value) + "-" + label_decoder(output) # for output, value in zip(outputs, df_test.label.values)] # df_test.label.astype(str) #([str(output)+"-" + str(value)] for output, value in zip(outputs, df_test.label.values)) # Label each datapoint with the word it corresponds to for line in X.index: text = X.loc[line, "texts"] + "-" + str(line) if "@U" in text: p1.text(X.loc[line, "x1"] + 0.2, X.loc[line, "y1"], text[2:], horizontalalignment='left', size='medium', color='blue', weight='semibold') elif "@P" in text: p1.text(X.loc[line, "x1"] + 0.2, X.loc[line, "y1"], text[2:], horizontalalignment='left', size='medium', color='green', weight='semibold') elif "@N" in text: p1.text(X.loc[line, "x1"] + 0.2, X.loc[line, "y1"], text[2:], horizontalalignment='left', size='medium', color='red', weight='semibold') else: p1.text(X.loc[line, "x1"] + 0.2, X.loc[line, "y1"], text, horizontalalignment='left', size='medium', color='black', weight='semibold') plt.show() plt.savefig(model_path.split("/")[-2] + '-figure.svg', format="svg")
def main(_): LEARNING_RATE = config.LEARNING_RATE DROPOUT = config.DROPOUT if FLAGS.lr: LEARNING_RATE = FLAGS.lr if FLAGS.dropout: DROPOUT = FLAGS.dropout train_file = config.TRAIN_PROC df_train = pd.read_csv(train_file).fillna("none") valid_file = config.DEVEL_PROC df_valid = pd.read_csv(valid_file).fillna("none") test_file = config.EVAL_PROC df_test = pd.read_csv(test_file).fillna("none") logger.info(f"Bert Model: {config.BERT_PATH}") logger.info(f"Current date and time :{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ") logger.info(f"Train file: {train_file}") logger.info(f"Valid file: {valid_file}") logger.info(f"Test file: {test_file}") logger.info(f"Train size : {len(df_train):.4f}") logger.info(f"Valid size : {len(df_valid):.4f}") logger.info(f"Test size : {len(df_test):.4f}") train_dataset = dataset.BERTDataset( review=df_train.text.values, target=df_train.label.values ) train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4, shuffle=True ) valid_dataset = dataset.BERTDataset( review=df_valid.text.values, target=df_valid.label.values ) valid_data_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1 ) test_dataset = dataset.BERTDataset( review=df_test.text.values, target=df_test.label.values ) test_data_loader = torch.utils.data.DataLoader( test_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1 ) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #torch.device("cuda") model = BERTBaseUncased(DROPOUT) model.to(device) param_optimizer = list(model.named_parameters()) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] optimizer_parameters = [ {'params': [p for n, p in param_optimizer if not any( nd in n for nd in no_decay)], 'weight_decay': 0.001}, {'params': [p for n, p in param_optimizer if any( nd in n for nd in no_decay)], 'weight_decay': 0.0}, ] num_train_steps = int( len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS) optimizer = AdamW(optimizer_parameters, lr=LEARNING_RATE) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=num_train_steps ) # model = nn.DataParallel(model) best_accuracy = 0 for epoch in range(config.EPOCHS): logger.info(f"Epoch = {epoch}") train_loss, train_acc = engine.train_fn( train_data_loader, model, optimizer, device, scheduler) for tag, parm in model.named_parameters(): if parm.grad is not None: writer.add_histogram(tag, parm.grad.data.cpu().numpy(), epoch) outputs, targets, val_loss, val_acc = engine.eval_fn( valid_data_loader, model, device) val_mcc = metrics.matthews_corrcoef(outputs, targets) logger.info(f"val_MCC_Score = {val_mcc:.4f}") outputs, targets, test_loss, test_acc = engine.eval_fn( test_data_loader, model, device) test_mcc = metrics.matthews_corrcoef(outputs, targets) logger.info(f"test_MCC_Score = {test_mcc:.4f}") logger.info( f"train_loss={train_loss:.4f}, val_loss={val_loss:.4f}, test_loss={test_loss:.4f}") writer.add_scalar('loss/train', train_loss, epoch) # data grouping by `slash` writer.add_scalar('loss/val', val_loss, epoch) # data grouping by `slash` writer.add_scalar('loss/test', test_loss, epoch) # data grouping by `slash` logger.info( f"train_acc={train_acc:.4f}, val_acc={val_acc:.4f}, test_acc={test_acc:.4f}") writer.add_scalar('acc/train', train_acc, epoch) # data grouping by `slash` writer.add_scalar('acc/val', val_acc, epoch) # data grouping by `slash` writer.add_scalar('acc/test', test_acc, epoch) # data grouping by `slash` 
logger.info(f"val_mcc={val_mcc:.4f}, test_mcc={test_mcc:.4f}") writer.add_scalar('mcc/val', val_mcc, epoch) # data grouping by `slash` writer.add_scalar('mcc/test', test_mcc, epoch) # data grouping by `slash` accuracy = metrics.accuracy_score(targets, outputs) logger.info(f"Accuracy Score = {accuracy:.4f}") if accuracy < 0.4: logger.info(f"Something is very wrong! Accuracy is only {accuracy:.4f} Stopping...") break if accuracy > best_accuracy: logger.info(f"Saving model with Accuracy Score = {accuracy:.4f}") torch.save(model.state_dict(), config.MODEL_PATH[:-4] + "." + str(round(accuracy*100, 2)) + ".bin") best_accuracy = accuracy es = 0 else: es += 1 logger.info(f"Not improved for {es} of 5 epochs. Best so far - {best_accuracy:.4f}") if es > 4: logger.info(f"Early stopping with best accuracy: {best_accuracy:.4f} and accuracy for this epoch: {accuracy:.4f} ...") break
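# The logger and writer objects used by the logging-heavy variants above are assumed to be created
# at module level, roughly like this (paths and names are illustrative):
import logging
from torch.utils.tensorboard import SummaryWriter

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
writer = SummaryWriter(log_dir="runs/bert-experiments")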
def run(dataset_index): datasets = [ "gold.prep-auto.full.prep.{0}.csv", "gold.prep-auto.no-emoticons.prep.{0}.csv", "gold.prep-auto.prep.{0}.csv", "gold.prep-english.prep.{0}.csv", "gold.prep-peisenieks.prep.{0}.csv", "gold.prep.{0}.csv" ] # dataset_index = 5 #0-5 train_file = config.DATASET_LOCATION + datasets[dataset_index].format( "train") df_train = pd.read_csv(train_file).fillna("none") df_train.label = df_train.label.apply(label_encoder) valid_file = config.DATASET_LOCATION + datasets[dataset_index].format( "dev" ) #"gold.prep-auto.full.prep.dev.csv" #gold.prep-auto.no-emoticons.prep.dev.csv" #gold.prep-auto.prep.dev.csv" #"gold.prep-english.prep.dev.csv" #"gold.prep-peisenieks.prep.dev.csv" #"gold.prep.dev.csv" df_valid = pd.read_csv(valid_file).fillna("none") df_valid.label = df_valid.label.apply(label_encoder) test_file = config.DATASET_LOCATION + "eval.prep.test.csv" df_test = pd.read_csv(test_file).fillna("none") df_test.label = df_test.label.apply(label_encoder) logger.info(f"Bert Model: {config.BERT_PATH}") logger.info( f"Current date and time :{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} " ) logger.info(f"Train file: {train_file}") logger.info(f"Valid file: {valid_file}") logger.info(f"Test file: {test_file}") logger.info(f"Train size : {len(df_train):.4f}") logger.info(f"Valid size : {len(df_valid):.4f}") logger.info(f"Test size : {len(df_test):.4f}") train_dataset = dataset.BERTDataset(review=df_train.text.values, target=df_train.label.values) train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4, shuffle=True) valid_dataset = dataset.BERTDataset(review=df_valid.text.values, target=df_valid.label.values) valid_data_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1) test_dataset = dataset.BERTDataset(review=df_test.text.values, target=df_test.label.values) test_data_loader = torch.utils.data.DataLoader( test_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1) device = torch.device( 'cuda' if torch.cuda.is_available() else 'cpu') #torch.device("cuda") model = BERTBaseUncased() model.to(device) param_optimizer = list(model.named_parameters()) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] optimizer_parameters = [ { 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.001 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }, ] num_train_steps = int( len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS) optimizer = AdamW(optimizer_parameters, lr=3e-5) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=num_train_steps) # model = nn.DataParallel(model) best_accuracy = 0 for epoch in range(config.EPOCHS): logger.info(f"epoch={epoch}") train_loss, train_acc = engine.train_fn(train_data_loader, model, optimizer, device, scheduler) for tag, parm in model.named_parameters(): if parm.grad is not None: writer.add_histogram(tag, parm.grad.data.cpu().numpy(), epoch) outputs, targets, val_loss, val_acc = engine.eval_fn( valid_data_loader, model, device) val_mcc = metrics.matthews_corrcoef(outputs, targets) logger.info(f"val_MCC_Score = {val_mcc:.3f}") outputs, targets, test_loss, test_acc = engine.eval_fn( test_data_loader, model, device) test_mcc = metrics.matthews_corrcoef(outputs, targets) logger.info(f"test_MCC_Score = {test_mcc:.3f}") logger.info( f"train_loss={train_loss:.4f}, 
val_loss={val_loss:.4f}, test_loss={test_loss:.4f}" ) writer.add_scalar('loss/train', train_loss, epoch) # data grouping by `slash` writer.add_scalar('loss/val', val_loss, epoch) # data grouping by `slash` writer.add_scalar('loss/test', test_loss, epoch) # data grouping by `slash` logger.info( f"train_acc={train_acc:.3f}, val_acc={val_acc:.3f}, test_acc={test_acc:.3f}" ) writer.add_scalar('acc/train', train_acc, epoch) # data grouping by `slash` writer.add_scalar('acc/val', val_acc, epoch) # data grouping by `slash` writer.add_scalar('acc/test', test_acc, epoch) # data grouping by `slash` logger.info(f"val_mcc={val_mcc:.3f}, test_mcc={test_mcc:.3f}") writer.add_scalar('mcc/val', val_mcc, epoch) # data grouping by `slash` writer.add_scalar('mcc/test', test_mcc, epoch) # data grouping by `slash` accuracy = metrics.accuracy_score(targets, outputs) logger.info(f"Accuracy Score = {accuracy:.3f}") if accuracy > best_accuracy: print(f"Saving model with Accuracy Score = {accuracy:.3f}") torch.save(model.state_dict(), config.MODEL_PATH) best_accuracy = accuracy
def run(): df_train = preprocess('./review-sentence_train_clean.csv') df_valid = preprocess('./review-sentence_dev_clean.csv') df_train = df_train.reset_index(drop=True) df_valid = df_valid.reset_index(drop=True) train_dataset = dataset.BERTDataset(review=df_train.sentence.values, target=df_train.ENCODE_CAT.values) train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4) valid_dataset = dataset.BERTDataset(review=df_valid.sentence.values, target=df_valid.ENCODE_CAT.values) valid_data_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1) device = torch.device(config.DEVICE) model = BERTBaseUncased() model.to(device) param_optimizer = list(model.named_parameters()) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] optimizer_parameters = [ { "params": [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], "weight_decay": 0.001, }, { "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0, }, ] num_train_steps = int( len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS) optimizer = AdamW(optimizer_parameters, lr=3e-5) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=num_train_steps) best_accuracy = 0 for epoch in range(config.EPOCHS): engine.train_fn(train_data_loader, model, optimizer, device, scheduler, epoch) outputs, targets = engine.eval_fn(valid_data_loader, model, device, epoch) accuracy = metrics.accuracy_score(outputs, targets) print(f"Validation Accuracy = {accuracy}") if accuracy > best_accuracy: torch.save(model.state_dict(), config.MODEL_PATH) best_accuracy = accuracy print("Best val accuracy till now {}".format(best_accuracy))
def run(): dfx = pd.read_csv( config.TRAINING_FILE).fillna("none").reset_index(drop=True) # df_test = pd.read_csv(config.TESTING_FILE).fillna("none").reset_index(drop=True) df_train, df_test = model_selection.train_test_split( dfx, test_size=0.1, random_state=42, stratify=dfx.label.values) df_train = df_train.reset_index(drop=True) df_test = df_test.reset_index(drop=True) train_dataset = dataset.BERTDataset(text=df_train.title.values, label=df_train.label.values) train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4) test_dataset = dataset.BERTDataset(text=df_test.title.values, label=df_test.label.values) test_data_loader = torch.utils.data.DataLoader( test_dataset, batch_size=config.TEST_BATCH_SIZE, num_workers=1) device = torch.device("cpu") model = BERTBaseUncased() model.to(device) param_optimizer = list(model.named_parameters()) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] optimizer_parameters = [ { "params": [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], "weight_decay": 0.001, }, { "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0, }, ] num_train_steps = int( len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS) optimizer = AdamW(optimizer_parameters, lr=3e-5) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=num_train_steps) model = nn.DataParallel(model) best_accuracy = 0 for epoch in range(config.EPOCHS): engine.train_fn(train_data_loader, model, optimizer, device, scheduler) outputs, labels = engine.eval_fn(test_data_loader, model, device) outputs = np.array(outputs) >= 0.5 accuracy = metrics.accuracy_score(labels, outputs) print(f"Accuracy Score = {accuracy}") if accuracy > best_accuracy: torch.save(model.state_dict(), config.MODEL_PATH) best_accuracy = accuracy