def run():
    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True)
    train_df = pd.read_csv(TRAINING_FILE)
    train_df['text'] = train_df['text'].apply(lambda x: clean_text(x))
    train_df['selected_text'] = train_df['selected_text'].apply(
        lambda x: clean_text(x))
    for fold, (train_idx, val_idx) in enumerate(
            skf.split(train_df, train_df.sentiment), start=1):
        print(f'Fold: {fold}')
        if SELECTED_MODEL == 'LSTM':
            model = models.TweetLSTMModel()
        elif SELECTED_MODEL == 'RoBERTa':
            model = models.TweetRoBERTaModel()
        optimizer = torch.optim.AdamW(model.parameters(), lr=LR,
                                      betas=(0.9, 0.999))
        criterion = engine.loss_fn
        dataloaders_dict = get_train_val_loaders(train_df, train_idx, val_idx,
                                                 BATCH_SIZE)
        engine.train_fn(
            model, SELECTED_MODEL, dataloaders_dict, criterion, optimizer,
            NUM_EPOCHS,
            '../config/roberta-pths/' + f'{SELECTED_MODEL}_fold{fold}.pth')

def run():
    dfx = pd.read_csv(config.TRAINING_FILE).fillna("none")
    dfx.sentiment = dfx.sentiment.apply(lambda x: 1 if x == "positive" else 0)
    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values)
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)
    train_dataset = dataset.BERTDataset(review=df_train.review.values,
                                        target=df_train.sentiment.values)
    # Build the validation set from df_valid, not df_train.
    valid_dataset = dataset.BERTDataset(review=df_valid.review.values,
                                        target=df_valid.sentiment.values)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)
    device = torch.device("cuda")
    model = BERTBaseUncased()
    model.to(device)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.001
    }, {
        # Bias and LayerNorm parameters should receive no weight decay.
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)
    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, scheduler,
                        device)
        outputs, targets = engine.eval_fn(valid_data_loader, model, device)
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Accuracy Score = {accuracy}")
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy

def run():
    utils.seed_everything(seed=config.SEED)
    train_dataset = dataset.Lyft2ndLevelDataset(
        config.PRED_PATHS + [config.MODE_16_PATH])
    data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.BATCH_SIZE, num_workers=4,
        shuffle=True)
    device = torch.device('cuda')
    model = models.SetTransformer(**config.MODEL_PARAMS)
    model = model.to(device)
    optimizer = torch.optim.Adam(
        model.parameters(), lr=config.LEARNING_RATE,
        weight_decay=config.WEIGHT_DECAY)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer=optimizer, pct_start=config.PCT_START,
        div_factor=config.DIV_FACTOR, max_lr=config.LEARNING_RATE,
        epochs=config.EPOCHS, steps_per_epoch=len(data_loader))
    for epoch in range(config.EPOCHS):
        engine.train_fn(data_loader, model, optimizer, device,
                        scheduler=scheduler)
    torch.save(model.state_dict(), config.MODEL_PATH + 'transformer.bin')

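# OneCycleLR above is built with steps_per_epoch=len(data_loader), so it must
# be stepped once per batch, which is presumably why the scheduler is passed
# into engine.train_fn. A minimal sketch of that per-batch loop; the batch
# structure and loss are assumptions, not the project's actual engine code:
import torch

def train_fn_sketch(data_loader, model, optimizer, device, scheduler=None):
    model.train()
    for inputs, targets in data_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        loss = torch.nn.functional.mse_loss(model(inputs), targets)
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()  # advance the one-cycle schedule every batch
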
def run():
    df = pd.read_csv(config.TRAIN_PATH)
    kfold = KFold(n_splits=5, random_state=config.SEED, shuffle=True)
    fold_losses = []
    for i, (train_idx, val_idx) in enumerate(kfold.split(df)):
        print("-------------------------------------------------------")
        print(f"Training fold {i}")
        print("-------------------------------------------------------")
        train = df.iloc[train_idx]
        validation = df.iloc[val_idx]
        train_dataset = PicDataset(train)
        train_data_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=config.BATCH_SIZE)
        val_dataset = PicDataset(validation)
        val_data_loader = torch.utils.data.DataLoader(
            val_dataset, batch_size=config.BATCH_SIZE)
        device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        model = CNN()
        model.to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=config.LR)
        loss = 0
        for _ in range(config.EPOCHS):
            engine.train_fn(train_data_loader, model, optimizer, device)
            loss = engine.eval_fn(val_data_loader, model, device)
        print(f"Loss on fold {i} is {loss}")
        fold_losses.append(loss)
        torch.save(model.state_dict(), f'./models/model_{i}.bin')
    print(f"Average loss on cross validation is {sum(fold_losses) / 5}")

def run():
    dfx = pd.read_csv(config.TRAINING_FILE)
    dfx.sentiment = dfx.sentiment.apply(lambda x: 1 if x == 'positive' else 0)
    df_train, df_val = train_test_split(dfx, test_size=0.1, random_state=42,
                                        stratify=dfx.sentiment.values)
    train_dataset = dataset.BertDataset(df_train.review.values,
                                        df_train.sentiment.values)
    val_dataset = dataset.BertDataset(df_val.review.values,
                                      df_val.sentiment.values)
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=1)
    val_dataloader = torch.utils.data.DataLoader(
        val_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=4)
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    model = BertBaseUncased()
    model = model.to(device)
    params = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_params = [{
        'params': [p for n, p in params
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.001
    }, {
        'params': [p for n, p in params if any(nd in n for nd in no_decay)],
        'weight_decay': 0
    }]
    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_params, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)
    best_accuracy = 0
    accumulation_steps = config.ACCUMULATION
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_dataloader, model, optimizer, scheduler, device,
                        accumulation_steps)
        preds, actuals = engine.eval_fn(val_dataloader, model, device)
        preds = np.array(preds) >= 0.5
        accuracy = metrics.accuracy_score(actuals, preds)
        print("Accuracy Score: %0.2f" % accuracy)
        if accuracy > best_accuracy:
            print("Best Accuracy reached, saving model...")
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy

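# This run() hands accumulation_steps to engine.train_fn, whose body isn't
# shown. A minimal sketch of gradient accumulation under that assumption
# (the batch structure and loss function here are hypothetical):
import torch

def train_fn_accum_sketch(data_loader, model, optimizer, scheduler, device,
                          accumulation_steps):
    model.train()
    optimizer.zero_grad()
    for step, (inputs, targets) in enumerate(data_loader):
        inputs, targets = inputs.to(device), targets.to(device)
        loss = torch.nn.functional.binary_cross_entropy_with_logits(
            model(inputs), targets.float())
        # Scale so the accumulated gradient matches one large-batch step.
        (loss / accumulation_steps).backward()
        if (step + 1) % accumulation_steps == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
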
def run(): df1 = pd.read_csv("../input/jigsaw-toxic-comment-train.csv", usecols=["comment_text", "toxic"]) df2 = pd.read_csv("../input/jigsaw-unintended-bias-train.csv", usecols=["comment_text", "toxic"]) df_train = pd.concat([df1, df2], axis=0).reset_index(drop=True) df_valid = pd.read_csv("../input/validation.csv") train_dataset = dataset.BERTDataset( comment_text=df_train.comment_text.values, target=df_train.toxic.values) train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4) valid_dataset = dataset.BERTDataset( comment_text=df_valid.comment_text.values, target=df_valid.toxic.values) valid_data_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1) device = torch.device(config.DEVICE) model = BERTBaseUncased() model.to(device) param_optimizer = list(model.named_parameters()) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] optimizer_parameters = [{ "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], "weight_decay": 0.001 }, { "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0 }] num_train_steps = int( len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS) optimizer = AdamW(optimizer_parameters, lr=3e-5) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=num_train_steps) best_accuracy = 0 for epoch in range(config.EPOCHS): engine.train_fn(train_data_loader, model, optimizer, device, scheduler) outputs, targets = engine.eval_fn(valid_data_loader, model, device) targets = np.array(targets) >= 0.5 accuracy = metrics.roc_auc_score(targets, outputs) print(f"AUC Score = {accuracy}") if accuracy > best_accuracy: torch.save(model.state_dict(), config.MODEL_PATH) best_accuracy = accuracy
def run():
    df1 = pd.read_csv(config.TRAINING_FILE, usecols=["comment_text", "toxic"])
    train_dataset = dataset.BERTDataset(
        review=df1.comment_text.values, target=df1.toxic.values)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)
    df2 = pd.read_csv("../input/validation.csv",
                      usecols=["comment_text", "toxic"])
    valid_dataset = dataset.BERTDataset(
        review=df2.comment_text.values, target=df2.toxic.values)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BERTBaseUncased()
    model.to(device)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    num_train_steps = int(len(df1) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)
    model = nn.DataParallel(model)
    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device,
                        scheduler)
        outputs, targets = engine.eval_fn(valid_data_loader, model, device)
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Accuracy Score = {accuracy}")
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy

def run(fold, model_name):
    writer = SummaryWriter(log_dir=f'{SAVE_PATH}/',
                           filename_suffix=f'{model_name}-fold{fold}')
    dfx = pd.read_csv(config.TRAINING_FILE)
    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)
    print(df_train.shape)
    print(df_valid.shape)
    train_dataset = dataset.TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)
    valid_dataset = dataset.TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=2)
    device = (torch.device("cuda") if torch.cuda.is_available()
              else torch.device("cpu"))
    print(f'training on {device}')
    model_config = transformers.RobertaConfig.from_pretrained(
        config.ROBERTA_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)
    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(params.optimizer_params(model), lr=5e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1",
                                      verbosity=0)
    es = utils.EarlyStopping(patience=5, mode="max")
    for epoch in range(config.EPOCHS):
        # A positional argument cannot follow a keyword argument, so the
        # writer is passed by keyword as well.
        engine.train_fn(train_data_loader, model, optimizer, device,
                        scheduler=scheduler, writer=writer)
        jaccard = engine.eval_fn(valid_data_loader, model, device, writer)
        print(f"Epoch={epoch}, Jaccard={jaccard}")
        es(jaccard, model, model_path=f"{SAVE_PATH}/{model_name}-f{fold}.pt")
        if es.early_stop:
            print("Early stopping")
            break

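# Once amp.initialize has wrapped the model and optimizer (opt_level "O1"
# above), engine.train_fn must route backprop through apex's amp.scale_loss.
# A minimal sketch of that backward step, assuming a computed `loss` and the
# same `amp` import the snippet uses (not the project's actual engine code):
def amp_backward_step(loss, optimizer):
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()  # backprop on the loss scaled for fp16 safety
    optimizer.step()
    optimizer.zero_grad()
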
def run_train():
    data_dir = config.DATA_DIR
    ner_processor = NerProcessor()
    train_examples = ner_processor.get_train_examples(data_dir)
    label_list = ner_processor.get_labels()
    tokenizer = transformers.BertTokenizer.from_pretrained(
        config.BERT_TOKENIZER_PATH)
    train_features = convert_examples_to_features(train_examples, label_list,
                                                  config.MAX_SEQ_LEN,
                                                  tokenizer)
    input_ids = torch.tensor([f["input_ids"] for f in train_features],
                             dtype=torch.long)
    attention_mask = torch.tensor(
        [f["attention_mask"] for f in train_features], dtype=torch.long)
    token_type_ids = torch.tensor(
        [f["token_type_ids"] for f in train_features], dtype=torch.long)
    label_ids = torch.tensor([f["label_ids"] for f in train_features])
    # One-hot encode the labels and convert to float in one step.
    label_ids = F.one_hot(label_ids).float()
    train_dataset = TensorDataset(input_ids, attention_mask, token_type_ids,
                                  label_ids)
    sampler = SequentialSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=sampler,
                                  batch_size=config.TRAIN_BATCH_SIZE)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BertNER(config.BERT_MODEL_PATH, len(label_list) + 1)
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=config.LEARNING_RATE)
    num_training_steps = len(
        train_dataset) // config.TRAIN_BATCH_SIZE * config.TRAIN_EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
    for epoch in range(config.TRAIN_EPOCHS):
        train_fn(model, device, train_dataloader, optimizer, scheduler)
        model_to_save = model.module if hasattr(model, "module") else model
        model_save_path = os.path.join(f"{config.BERT_OUTPUT}/{epoch+1}",
                                       WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), model_save_path)
        tokenizer.save_vocabulary(f"{config.BERT_OUTPUT}/{epoch+1}/vocab.txt")
    model_to_save = model.module if hasattr(model, "module") else model
    model_save_path = os.path.join(config.BERT_OUTPUT, WEIGHTS_NAME)
    torch.save(model_to_save.state_dict(), model_save_path)
    tokenizer.save_vocabulary(f"{config.BERT_OUTPUT}/vocab.txt")

def run():
    seed_everything(config.SEED)
    df_train = pd.read_csv(
        config.TRAINING_FILE).dropna().reset_index(drop=True)
    train_dataset = TweetDataset(tweet=df_train.text.values,
                                 sentiment=df_train.sentiment.values,
                                 selected_text=df_train.selected_text.values)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)
    device = torch.device("cuda")
    model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)
    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)
    # No validation loop here, so the unused EarlyStopping object was dropped;
    # training simply stops and saves at MAX_EPOCHS.
    for epoch in range(EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device,
                        scheduler=scheduler)
        if epoch + 1 == MAX_EPOCHS:
            torch.save(model.state_dict(), 'model_full.bin')
            break

def run_train():
    data_dir = config.DATA_DIR
    kgp = KGProcessor()
    rela_list = kgp.get_all_relations()
    examples = kgp.get_train_examples(data_dir)
    tokenizer = transformers.BertTokenizer.from_pretrained(
        config.BERT_TOKENIZER_PATH)
    features = kgp.convert_examples_to_features(examples, config.MAX_SEQ_LEN,
                                                tokenizer)
    input_ids = torch.tensor([f["input_ids"] for f in features],
                             dtype=torch.long)
    attention_mask = torch.tensor([f["attention_mask"] for f in features],
                                  dtype=torch.long)
    token_type_ids = torch.tensor([f["token_type_ids"] for f in features],
                                  dtype=torch.long)
    labels = torch.tensor([f["label"] for f in features])
    # One-hot encode the labels and convert to float in one step.
    labels = F.one_hot(labels).float()
    dataset = TensorDataset(input_ids, attention_mask, token_type_ids, labels)
    sampler = SequentialSampler(dataset)
    data_loader = DataLoader(dataset, sampler=sampler,
                             batch_size=config.TRAIN_BATCH_SIZE)
    # The scheduler expects an integer number of training steps.
    num_training_steps = int(
        len(input_ids) / config.TRAIN_BATCH_SIZE * config.TRAIN_EPOCHS)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BertKG(config.BERT_MODEL_PATH, len(rela_list))
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=config.LEARNING_RATE)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
    for epoch in range(config.TRAIN_EPOCHS):
        print(f"\n---------------------------epoch: {epoch+1}"
              "---------------------------")
        train_fn(model, device, data_loader, optimizer, scheduler)
        model_to_save = model.module if hasattr(model, "module") else model
        output_path = os.path.join(f"{config.BERT_OUTPUT_PATH}/{epoch+1}",
                                   WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_path)
        tokenizer.save_vocabulary(
            f"{config.BERT_OUTPUT_PATH}/{epoch+1}/vocab.txt")
    model_to_save = model.module if hasattr(model, "module") else model
    output_path = os.path.join(f"{config.BERT_OUTPUT_PATH}", WEIGHTS_NAME)
    torch.save(model_to_save.state_dict(), output_path)
    tokenizer.save_vocabulary(f"{config.BERT_OUTPUT_PATH}/vocab.txt")

def _training(Model):
    features, targets = engine.get_features(Model, train=True)
    X_train, X_test, y_train, y_test = utils.train_test_split(
        features, targets, test_size=0.3)
    classifier = engine.train_fn(X_train, y_train)
    utils.save_model(classifier, config.MODEL_PATH)
    predictions = engine.eval_fn(classifier, X_test)
    accuracy = utils.accuracy_score(predictions, y_test)
    print("Accuracy Score:", accuracy)

def run_training(): image_files = glob.glob(os.path.join(config.DATA_DIR, "*.png"))[:10] target_orig = [x.split('/')[-1][:-4] for x in image_files] targets = [[c for c in x] for x in target_orig] targets_flat = [c for clist in targets for c in clist] lbl_enc = preprocessing.LabelEncoder() lbl_enc.fit(targets_flat) targets_enc = [lbl_enc.transform(x) for x in targets] # didn't get this targets_enc = np.array(targets_enc) + 1 # print(targets_enc) # print(len(lbl_enc.classes_)) train_imgs, test_imgs, train_targets, test_targets, train_orig_targets, test_orig_targets = model_selection.train_test_split( image_files, targets_enc, target_orig, test_size=0.1, random_state=42) train_dataset = dataset.ClassificationDataset(image_paths=train_imgs, targets=train_targets, resize=(config.IMAGE_HEIGHT, config.IMAGE_WIDTH)) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=config.BATCH_SIZE, num_workers=config.NUM_WORKERS, shuffle=True) test_dataset = dataset.ClassificationDataset(image_paths=test_imgs, targets=test_targets, resize=(config.IMAGE_HEIGHT, config.IMAGE_WIDTH)) test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=config.BATCH_SIZE, num_workers=config.NUM_WORKERS, shuffle=False) model = CaptchModel(num_chars=len(lbl_enc.classes_)) model.to(config.DEVICE) optimizer = torch.optim.Adam(model.parameters(), lr=3e-4) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.8, patience=5, verbose=True) for epoch in range(config.EPOCHS): train_loss = engine.train_fn(model, train_loader, optimizer) valid_preds, valid_loss = engine.eval_fn(model, test_loader) valid_cap_preds = [] for vp in valid_preds: current_preds = decode_predictions(vp, lbl_enc) valid_cap_preds.extend(current_preds) pprint(list(zip(test_orig_targets, valid_cap_preds))) print( f"Epoch = {epoch}, TrainLoss = {train_loss}, ValidLoss = {valid_loss}" )
def run():
    sent_data = dataset.SentimentDataset()
    sent_data.load_data()
    sent_data.clean_data()
    sent_data.vocab_dict()
    sent_data.encode_text()
    sent_data.encode_label()
    sent_data.remove_outliers()
    sent_data.pad_features(config.SEQ_LENGTH)
    features = sent_data.features
    encoded_labels = sent_data.encoded_labels
    train_x, train_y, val_x, val_y, test_x, test_y = data_split(
        features, encoded_labels)
    # Create Tensor datasets.
    train_data = TensorDataset(torch.from_numpy(train_x),
                               torch.from_numpy(train_y))
    valid_data = TensorDataset(torch.from_numpy(val_x),
                               torch.from_numpy(val_y))
    test_data = TensorDataset(torch.from_numpy(test_x),
                              torch.from_numpy(test_y))
    # Make sure to shuffle the training data.
    train_loader = DataLoader(train_data, shuffle=True,
                              batch_size=config.BATCH_SIZE)
    valid_loader = DataLoader(valid_data, shuffle=True,
                              batch_size=config.BATCH_SIZE)
    test_loader = DataLoader(test_data, shuffle=True,
                             batch_size=config.BATCH_SIZE)
    vocab_size = len(sent_data.vocab_to_int) + 1
    output_size = config.OUTPUT_SIZE
    if config.MODEL_ARCH == 'LSTM':
        net = model.SentimentLSTM(vocab_size, output_size,
                                  config.EMBEDDING_DIM, config.HIDDEN_DIM,
                                  config.N_LAYERS)
    elif config.MODEL_ARCH == 'CNN':
        net = model.SentimentCNN(vocab_size, config.EMBEDDING_DIM,
                                 output_size)
    elif config.MODEL_ARCH == 'LSTM+CNN':
        net = model.SentimentCNNLSTM(vocab_size, config.EMBEDDING_DIM,
                                     output_size)
    print(net)
    lr = 0.001
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    net.to(device=config.DEVICE)
    net.train()
    net = engine.train_fn(train_loader, valid_loader, net, optimizer,
                          criterion, config.DEVICE)
    engine.test_fn(test_loader, net, criterion, config.DEVICE)
    print("Testing a few instances")
    engine.predict(net, sent_data, "I love this movie")
    engine.predict(net, sent_data, "This movie is not good")
    engine.predict(net, sent_data, "The worst movie I have seen; acting was "
                   "terrible and I want my money back")
    engine.predict(net, sent_data, "I enjoy this movie")
    engine.predict(net, sent_data, "this movie is pathetic")

def run():
    data = pd.read_csv(config.TRAINING_FILE).dropna()
    # Renamed from `eval` to avoid shadowing the built-in eval().
    df_train, df_eval = train_test_split(data, random_state=1, test_size=0.2)
    model = get_model()
    model = engine.train_fn(model, df_train)
    score = engine.eval_fn(model, df_eval)
    print("EVAL Score : ", score)
    model.save_model(config.MODEL_PATH)

def train():
    all_losses = []
    for epoch in range(config.N_EPOCHS):
        print(f'Epoch: {epoch}/{config.N_EPOCHS}')
        train_loss, batch_loss = engine.train_fn()
        all_losses.extend(batch_loss)
        print(f'Train loss: {train_loss:.5f}\n')
    with open('results_2', 'wb') as fp:
        pickle.dump(all_losses, fp)

def run():
    # Read in CSV
    df = pd.read_csv(config.TRAINING_FILE)
    print('Read In Complete!')
    # Split into train and validation sets
    df_train, df_val = train_test_split(df, test_size=0.1,
                                        stratify=df.sentiment.values,
                                        random_state=config.RANDOM_SEED)
    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    print(df_train.shape, df_val.shape)
    print('Validation Split Complete!')
    # Create Dataset required for BERT Model
    train_dataset = dataset.BERTDataset(df_train.content.values,
                                        df_train.sentiment.values)
    val_dataset = dataset.BERTDataset(df_val.content.values,
                                      df_val.sentiment.values)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)
    val_data_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=config.VAL_BATCH_SIZE, num_workers=1)
    print('Dataset for Model Complete!')
    # Define Model and Hyperparameters
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = BERTBaseCased()
    model.to(device)
    num_training_steps = len(train_data_loader) * config.EPOCHS
    optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_training_steps=num_training_steps, num_warmup_steps=0)
    # Train the Model, Print Accuracy, Save the Best Model
    n_train_exp = len(df_train)
    n_val_exp = len(df_val)
    history = defaultdict(list)
    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        print(f'\n{"#" * 10} Epoch: {epoch+1}/{config.EPOCHS} {"#" * 10}\n')
        train_acc, train_loss = engine.train_fn(train_data_loader, model,
                                                optimizer, device, scheduler,
                                                n_train_exp)
        val_acc, val_loss = engine.eval_fn(val_data_loader, model, device,
                                           n_val_exp)
        print(f'\nTrain Loss: {train_loss:.4f} Acc: {train_acc:.4f}'
              f'\nVal Loss: {val_loss:.4f} Val Acc: {val_acc:.4f}')
        history['train_acc'].append(train_acc)
        history['train_loss'].append(train_loss)
        history['val_acc'].append(val_acc)
        history['val_loss'].append(val_loss)
        if val_acc > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = val_acc

def run_training():
    image_files = glob.glob(
        os.path.abspath(os.path.join(config.DATA_DIR, "*.png")))
    labels = [list(x.split("/")[-1].split(".")[0]) for x in image_files]
    labels_flat = [c for x in labels for c in x]
    label_enc = preprocessing.LabelEncoder()
    label_enc.fit(labels_flat)
    tar_enc = np.array([label_enc.transform(x) for x in labels]) + 1
    (train_X, test_X, train_y, test_y, train_target,
     test_target) = model_selection.train_test_split(image_files, tar_enc,
                                                     labels)
    train_dataset = dataset.DataSet(train_X, train_y,
                                    resize=(config.IMG_HEIGHT,
                                            config.IMG_WIDTH))
    test_dataset = dataset.DataSet(test_X, test_y,
                                   resize=(config.IMG_HEIGHT,
                                           config.IMG_WIDTH))
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.BATCH_SIZE,
        num_workers=config.NUM_WORKERS, shuffle=True)
    test_dataloader = torch.utils.data.DataLoader(
        test_dataset, batch_size=config.BATCH_SIZE,
        num_workers=config.NUM_WORKERS)
    cm = CaptchaModel(num_chars=len(label_enc.classes_))
    cm.to(config.DEVICE)
    optimizer = torch.optim.Adam(cm.parameters(), lr=3e-5)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, factor=0.8, patience=10, verbose=True)
    for epoch in range(config.EPOCHS):
        train_loss = engine.train_fn(cm, train_dataloader, optimizer)

def run_training():
    transform = transforms.Compose([
        transforms.Resize(size=(32, 32)),
        transforms.ToTensor(),
        transforms.Normalize((0.45820624, 0.43722707, 0.39191988),
                             (0.23130463, 0.22692703, 0.22379072)),
    ])
    label_encoder = preprocessing.LabelEncoder()
    image_paths = glob.glob(os.path.join(config.DATA_DIR, "**/*.*"),
                            recursive=True)
    targets = [x.split("/")[-2] for x in image_paths]
    label_encoded = np.array(label_encoder.fit_transform(targets))
    (train_images, test_images, train_labels,
     test_labels) = model_selection.train_test_split(
         image_paths, label_encoded, test_size=0.2, random_state=0)
    train_dataset = dataset.ClassificationDataset(train_images, train_labels,
                                                  transform)
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.BATCH_SIZE, shuffle=True)
    test_dataset = dataset.ClassificationDataset(test_images, test_labels,
                                                 transform)
    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=config.BATCH_SIZE, shuffle=False)
    model = SmallNet(num_classes=3)
    model.to(config.DEVICE)
    opt = torch.optim.Adam(model.parameters(), lr=3e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        opt, factor=0.8, patience=5, verbose=True)
    for epoch in range(config.EPOCHS):
        train_loss = engine.train_fn(model, train_loader, opt)
        val_accuracy, val_loss = engine.eval_fn(model, test_loader)
        print(f"Epoch={epoch}, Train Loss={train_loss}, "
              f"Test Loss={val_loss}, Accuracy={val_accuracy}")
        scheduler.step(val_loss)
    print("Saved model...")
    torch.save(model.state_dict(), "./models/weights_latest.pt")

def run_training(): image_files = glob.glob(os.path.join(config.DATA_DIR,"*.png")) # path to the dataset targets_orig = [x.split('/')[-1][:-4] for x in image_files] targets = [[c for c in x] for x in targets_orig] targets_flat = [c for clist in targets for c in clist] lbl_enc = preprocessing.LabelEncoder() lbl_enc.fit(targets_flat) targets_enc = [lbl_enc.transform(x) for x in targets] targets_enc = np.array(targets_enc) + 1 # print(targets) # print(target_enc) # print(len(lbl_enc.classes_)) # # print(targets_orig) # for i, item in enumerate(lbl_enc.classes_): # print(item, '-->', i) train_imgs, test_imgs, train_targets, test_targets, train_orig_targets, test_orig_targets= model_selection.train_test_split(image_files, targets_enc, targets_orig, test_size = 0.1, random_state= 42) train_dataset = dataset.ClassificationDataset(image_paths = train_imgs, targets = train_targets, resize = (config.IMAGE_HEIGHT, config.IMAGE_WIDTH)) print(train_dataset[0]) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size = config.BATCH_SIZE, num_workers = config.NUM_WORKERS, shuffle = True ) test_dataset = dataset.ClassificationDataset(image_paths = test_imgs, targets = test_targets, resize = (config.IMAGE_HEIGHT, config.IMAGE_WIDTH)) test_loader = torch.utils.data.DataLoader( test_dataset, batch_size = config.BATCH_SIZE, num_workers = config.NUM_WORKERS, shuffle = False ) model = CaptchaModel(num_chars = len(lbl_enc.classes_)) model.to(config.DEVICE) optimizer = torch.optim.Adam(model.parameters(), lr = 3e-4) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor =0.8, patience= 5, verbose= True) for epoch in range(config.EPOCHS): train_loss = engine.train_fn(model, train_loader, optimizer) valid_pred, valid_loss = engine.eval_fn(model, train_loader)
def run_training():
    df = pd.read_csv(TRAIN_CSV)
    labelencoder = LabelEncoder()
    df['label_group'] = labelencoder.fit_transform(df['label_group'])
    trainset = ShopeeDataset(
        df, DATA_DIR, transform=get_train_transforms(img_size=CFG.img_size))
    trainloader = torch.utils.data.DataLoader(
        trainset,
        batch_size=CFG.batch_size,
        num_workers=CFG.num_workers,
        shuffle=True)
    scaler = GradScaler()
    model = ShopeeModel()
    model.to(CFG.device)
    criterion = nn.CrossEntropyLoss().to(CFG.device)
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=CFG.scheduler_params['lr_start'])
    scheduler = ShopeeScheduler(optimizer, **CFG.scheduler_params)
    for epoch in range(CFG.epochs):
        avg_loss_train = engine.train_fn(model, trainloader, optimizer,
                                         scheduler, epoch, CFG.device,
                                         criterion, scaler)
        # Save the weights and a full checkpoint after every epoch.
        torch.save(
            model.state_dict(),
            MODEL_PATH + 'arcface_512x512_{}.pt'.format(CFG.model_name))
        torch.save(
            {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict()
            },
            MODEL_PATH +
            'arcface_512x512_{}_checkpoints.pt'.format(CFG.model_name))

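# engine.train_fn above receives a GradScaler, so the training step presumably
# runs under mixed precision. A minimal sketch of one AMP step; the model call
# and criterion signature are assumptions, not the project's engine code:
import torch

def amp_train_step(model, images, labels, optimizer, criterion, scaler,
                   device):
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():   # forward pass in mixed fp16/fp32
        logits = model(images.to(device))
        loss = criterion(logits, labels.to(device))
    scaler.scale(loss).backward()     # scale loss to avoid fp16 underflow
    scaler.step(optimizer)            # unscale gradients, then optimizer.step()
    scaler.update()                   # adjust the scale factor for next step
    return loss.item()
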
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {
        "params": [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.001,
    },
    {
        "params": [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
num_train_steps = int(
    len(train_sentences) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
optimizer = AdamW(optimizer_parameters, lr=3e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)
best_loss = np.inf
for epoch in range(config.EPOCHS):
    train_loss = engine.train_fn(train_data_loader, model, optimizer, device,
                                 scheduler)
    test_loss = engine.eval_fn(valid_data_loader, model, device)
    print(f"Train Loss = {train_loss} Valid Loss = {test_loss}")
    if test_loss < best_loss:
        torch.save(model.state_dict(), config.MODEL_PATH)
        best_loss = test_loss

def run():
    df_train = preprocess('./review-sentence_train_clean.csv')
    df_valid = preprocess('./review-sentence_dev_clean.csv')
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)
    train_dataset = dataset.BERTDataset(review=df_train.sentence.values,
                                        target=df_train.ENCODE_CAT.values)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)
    valid_dataset = dataset.BERTDataset(review=df_valid.sentence.values,
                                        target=df_valid.ENCODE_CAT.values)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)
    device = torch.device(config.DEVICE)
    model = BERTBaseUncased()
    model.to(device)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.001,
        },
        {
            "params": [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)
    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device,
                        scheduler, epoch)
        outputs, targets = engine.eval_fn(valid_data_loader, model, device,
                                          epoch)
        # sklearn expects (y_true, y_pred); accuracy is symmetric, but keep
        # the conventional argument order.
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Validation Accuracy = {accuracy}")
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
            print("Best val accuracy till now {}".format(best_accuracy))

def run():
    device = (torch.device("cuda") if torch.cuda.is_available()
              else torch.device("cpu"))
    train_df = pd.read_csv(config.TRAIN_CSV_PATH)
    valid_df = pd.read_csv(config.VALIDATION_CSV_PATH)
    train_dataset = dataset.detection_dataset(
        train_df,
        target=config.TARGET_COL,
        train=True,
        transforms=T.Compose([T.ToTensor()]),
    )
    valid_dataset = dataset.detection_dataset(
        valid_df,
        target=config.TARGET_COL,
        train=True,
        transforms=T.Compose([T.ToTensor()]),
    )
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        shuffle=False,
        collate_fn=utils.collate_fn,
    )
    valid_dataloader = DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        shuffle=False,
        collate_fn=utils.collate_fn,
    )
    print("Data Loaders created")
    detector = model.create_model(config.NUM_CLASSES,
                                  backbone=config.BACKBONE)
    params = [p for p in detector.parameters() if p.requires_grad]
    optimizer = optim.Adam(params, lr=config.LEARNING_RATE)
    detector.to(device)
    print("Model loaded to device")
    print("---------------- Training Started --------------")
    for epoch in range(config.EPOCHS):
        loss_value = engine.train_fn(train_dataloader, detector, optimizer,
                                     device)
        print("epoch = {}, Training_loss = {}".format(epoch, loss_value))
        # Set the detection threshold as per your needs.
        results = engine.eval_fn(
            valid_dataloader,
            detector,
            device,
            detection_threshold=config.DETECTION_THRESHOLD,
        )
        pprint(results)
        # For now, only the latest model is saved; an evaluation metric for
        # selecting the best model has not been built yet.
        torch.save(detector.state_dict(), config.MODEL_SAVE_PATH)
    print("-" * 25)
    print("Model Trained and Saved to Disk")

detector = model.get_model(num_class)
params = [p for p in detector.parameters() if p.requires_grad]
optimizer = optim.Adam(params)
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.9)
# Default to epoch 0 so the training loop works when no checkpoint exists.
start_epoch = 0
if os.path.exists('model.pth'):
    (detector, start_epoch, optimizer,
     lr_scheduler) = utils.get_checkpoint_state("model.pth", detector,
                                                optimizer, lr_scheduler)
detector.to(device)
utils.writelog(file=log_file,
               log_info='=' * 10 + 'finished setting up model' + '=' * 10 +
               ', ' + str(time.time() - since))
min_loss = float('inf')
for epoch in range(start_epoch, start_epoch + 5):
    loss_value = engine.train_fn(train_set_load, detector, optimizer, device,
                                 epoch, lr_scheduler)
    print("epoch = {}, Training_loss = {}".format(epoch, loss_value))
    utils.writelog(file=log_file,
                   log_info="epoch = {}, Training_loss = {}".format(
                       epoch, loss_value))
    # Save a checkpoint whenever the training loss improves.
    if loss_value < min_loss:
        min_loss = loss_value
        utils.save_checkpoint_state("model.pth", epoch, detector, optimizer,
                                    lr_scheduler)
        utils.writelog(
            file=log_file,
            log_info=">>>>>>>>>>>>epoch = {}, save model<<<<<<<<<<<".format(
                epoch))
    print("-" * 25)

def run():
    sentences, pos, tag, enc_pos, enc_tag = utils.process_data(
        config.DATA_FILE)
    meta_data = {"enc_pos": enc_pos, "enc_tag": enc_tag}
    joblib.dump(meta_data, "meta.bin")
    num_pos = len(list(enc_pos.classes_))
    num_tag = len(list(enc_tag.classes_))
    (train_sentences, test_sentences, train_pos, test_pos, train_tag,
     test_tag) = model_selection.train_test_split(sentences, pos, tag,
                                                  random_state=42,
                                                  test_size=0.1)
    train_dataset = dataset.EntityDataset(texts=train_sentences,
                                          pos=train_pos, tags=train_tag)
    test_dataset = dataset.EntityDataset(texts=test_sentences, pos=test_pos,
                                         tags=test_tag)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)
    test_data_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)
    device = torch.device("cuda")
    model = EntityModel(num_tag=num_tag, num_pos=num_pos)
    model.to(device)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_param = [
        {
            "params": [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.001,
        },
        {
            "params": [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    num_train_steps = int(
        len(train_sentences) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_param, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)
    best_loss = np.inf
    for epoch in range(config.EPOCHS):
        train_loss = engine.train_fn(train_data_loader, model, optimizer,
                                     device, scheduler)
        test_loss = engine.eval_fn(test_data_loader, model, device)
        print(f"Train Loss = {train_loss} Valid Loss = {test_loss}")
        if test_loss < best_loss:
            torch.save(model.state_dict(), config.MODEL_SAVE_PATH)
            best_loss = test_loss

def run(fold):
    dfx = pd.read_csv(config.TRAINING_FILE)
    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)
    train_dataset = dataset.TweetDataset(
        tweets=df_train.text.values,
        sentiments=df_train.sentiment.values,
        selected_texts=df_train.selected_text.values)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4,
        shuffle=True)
    valid_dataset = dataset.TweetDataset(
        tweets=df_valid.text.values,
        sentiments=df_valid.sentiment.values,
        selected_texts=df_valid.selected_text.values)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=4,
        shuffle=False)
    device = torch.device('cuda')
    model_config = transformers.RobertaConfig.from_pretrained(
        config.MODEL_CONFIG)
    model_config.output_hidden_states = True
    model = models.TweetModel(conf=model_config)
    model = model.to(device)
    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': config.WEIGHT_DECAY
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    base_opt = transformers.AdamW(optimizer_parameters,
                                  lr=config.LEARNING_RATE)
    optimizer = torchcontrib.optim.SWA(
        base_opt, swa_start=int(num_train_steps * config.SWA_RATIO),
        swa_freq=config.SWA_FREQ, swa_lr=None)
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=int(num_train_steps * config.WARMUP_RATIO),
        num_training_steps=num_train_steps)
    print(f'Training is starting for fold={fold}')
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device,
                        scheduler=scheduler)
        jaccard = engine.eval_fn(valid_data_loader, model, device)
    if config.USE_SWA:
        optimizer.swap_swa_sgd()
    torch.save(model.state_dict(),
               f'{config.MODEL_SAVE_PATH}/model_{fold}.bin')
    return jaccard

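# engine.eval_fn above returns a Jaccard score, but its body isn't shown. For
# reference, a sketch of the word-level Jaccard similarity commonly used to
# score a predicted span against the ground-truth selected text (an assumption
# about this project's metric, not its actual code):
def jaccard_sketch(str1, str2):
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    # Intersection over union of the two word sets.
    return float(len(c)) / (len(a) + len(b) - len(c))
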
def run():
    seed = 1234
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    train, valid, test, SRC, TRG = dataset.create_dataset()
    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train, valid, test),
        sort_key=lambda x: len(x.source),
        batch_size=config.BATCH_SIZE,
        device=config.device)
    INPUT_DIM = len(SRC.vocab)
    OUTPUT_DIM = len(TRG.vocab)
    ENC_EMB_DIM = config.ENCODER_EMBEDDING_DIMENSION
    DEC_EMB_DIM = config.DECODER_EMBEDDING_DIMENSION
    HID_DIM = config.LSTM_HIDDEN_DIMENSION
    N_LAYERS = config.LSTM_LAYERS
    ENC_DROPOUT = config.ENCODER_DROPOUT
    DEC_DROPOUT = config.DECODER_DROPOUT
    attn = model.Attention(HID_DIM, HID_DIM)
    enc = model.Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, HID_DIM, ENC_DROPOUT)
    dec = model.Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, HID_DIM,
                        DEC_DROPOUT, attn)
    model_rnn = model.Seq2Seq(enc, dec, config.device).to(config.device)
    optimizer = optim.Adam(model_rnn.parameters(), lr=config.LEARNING_RATE)
    TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
    criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)
    if args.action == 'train':
        model_rnn.apply(utils.init_weights)
        best_valid_loss = float('inf')
        for epoch in range(config.N_EPOCHS):
            start_time = time.time()
            train_loss = engine.train_fn(model_rnn, train_iterator, optimizer,
                                         criterion, config.CLIP)
            valid_loss = engine.evaluate_fn(model_rnn, valid_iterator,
                                            criterion)
            end_time = time.time()
            epoch_mins, epoch_secs = utils.epoch_time(start_time, end_time)
            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save(model_rnn.state_dict(), config.MODEL_SAVE_FILE)
            with open(config.RESULTS_SAVE_FILE, 'a') as f:
                print(f'Epoch: {epoch+1:02} | '
                      f'Time: {epoch_mins}m {epoch_secs}s', file=f)
                print(f'\tTrain Loss: {train_loss:.3f} | '
                      f'Train PPL: {math.exp(train_loss):7.3f}', file=f)
                print(f'\t Val. Loss: {valid_loss:.3f} | '
                      f'Val. PPL: {math.exp(valid_loss):7.3f}', file=f)
    elif args.action == 'test':
        model_rnn.load_state_dict(torch.load(config.TEST_MODEL))
        loss, target, output = engine.test_fn(model_rnn, test_iterator,
                                              criterion, SRC, TRG)
        bl = bleu_score(output, target, max_n=1, weights=[1])
        met = 0
        for z in range(len(output)):
            out = ' '.join(output[z][y]
                           for y in range(min(10, len(output[z]))))
            tar = ' '.join(y for y in target[z])
            met = met + metric_utils.compute_metric(out, 1.0, tar)
        with open(config.TEST_RESULTS_FILE, 'w') as f:
            print(f'Test bleu :, {bl*100}, Test PPL: {math.exp(loss):7.3f}',
                  'Metric:', met / len(output), file=f)
    elif args.action == 'save_vocab':
        print('Source Vocab Length', len(SRC.vocab))
        print('Target vocab length', len(TRG.vocab))
        s1 = '\n'.join(k for k in SRC.vocab.itos)
        s2 = '\n'.join(k for k in TRG.vocab.itos)
        with open('NL_vocabulary.txt', 'w') as f:
            f.write(s1)
        with open('Bash_vocabulary.txt', 'w') as f:
            f.write(s2)

def run():
    print('Loading Files...')
    dfx = pd.read_csv(config.TRAINING_FILE).dropna().reset_index(drop=True)
    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42)
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)
    print('Files loaded')
    train_dataset = dataset.TweetDataset(
        tweet=df_train.text.values,
        selected_text=df_train.selected_text.values)
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, shuffle=False)
    valid_dataset = dataset.TweetDataset(
        tweet=df_valid.text.values,
        selected_text=df_valid.selected_text.values)
    valid_dataloader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, shuffle=False)
    device = torch.device('cuda')
    print('Running on ', device)
    model = BertBaseUncased().to(device)
    param_optimizer = list(model.named_parameters())
    # Note the capitalization: 'LayerNorm.weight', otherwise the LayerNorm
    # weights would silently receive weight decay.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_params = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.003},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    num_training_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_params, lr=2e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0,
        num_training_steps=num_training_steps)
    best_jaccard = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_dataloader, model, optimizer, device, scheduler)
        jaccard = engine.eval_fn(valid_dataloader, model, device)
        print(f'Epochs {epoch+1}...', f'Jaccard {jaccard}')
        if jaccard > best_jaccard:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_jaccard = jaccard
        print('Memory Used: ',
              torch.cuda.memory_allocated() / 1000000000, 'GB')
        torch.cuda.empty_cache()

def run_training(): image_files = glob.glob(os.path.join(config.DATA_DIR, "*.png")) targets_orig = [x.split("\\")[-1][:-4] for x in image_files] targets = [[y for y in x] for x in targets_orig] targets_flat = [c for clist in targets for c in clist] label_enc = preprocessing.LabelEncoder() label_enc.fit(targets_flat) targets_enc = [label_enc.transform(x) for x in targets] targets_enc = np.array(targets_enc) + 1 # print(targets_enc) # print(label_enc.classes_) ( train_imgs, test_imgs, train_targets, test_targets, train_orig_targets, test_orig_targets, ) = model_selection.train_test_split(image_files, targets_enc, targets_orig, test_size=0.1, random_state=42) train_dataset = dataset.ClassificationDataset( image_paths=train_imgs, targets=train_targets, resize=(config.IMAGE_HEIGHT, config.IMAGE_WIDTH), ) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=config.BATCH_SIZE, num_workers=config.NUM_WORKERS, shuffle=True, pin_memory=True) test_dataset = dataset.ClassificationDataset( image_paths=test_imgs, targets=test_targets, resize=(config.IMAGE_HEIGHT, config.IMAGE_WIDTH), ) test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=config.BATCH_SIZE, num_workers=config.NUM_WORKERS, shuffle=False, pin_memory=True) model = CaptchaModel(num_chars=len(label_enc.classes_)).cuda() model.to(torch.device(config.DEVICE)) optimizer = torch.optim.Adam(model.parameters(), lr=3e-4) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.8, patience=5, verbose=True) for epoch in range(config.EPOCHS): train_loss = engine.train_fn(model, train_loader, optimizer) valid_preds, valid_loss = engine.eval_fn(model, test_loader) valid_cap_preds = [] for vp in valid_preds: current_preds = decode_predictions(vp, label_enc) valid_cap_preds.extend(current_preds) pprint.pprint(list(zip(test_orig_targets, valid_cap_preds))[:10]) test_dup_rem = [remove_duplicates(c) for c in test_orig_targets] accuracy = metrics.accuracy_score(test_dup_rem, valid_cap_preds) print( f"EPOCH: {epoch}.train_loss:{train_loss},valid_loss:{valid_loss}, Accuracy={accuracy}" ) scheduler.step(valid_loss)