Example 1
def run():
    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True)
    train_df = pd.read_csv(TRAINING_FILE)
    train_df['text'] = train_df['text'].apply(lambda x: clean_text(x))
    train_df['selected_text'] = train_df['selected_text'].apply(
        lambda x: clean_text(x))

    for fold, (train_idx,
               val_idx) in enumerate(skf.split(train_df, train_df.sentiment),
                                     start=1):
        print(f'Fold: {fold}')
        if SELECTED_MODEL == 'LSTM':
            model = models.TweetLSTMModel()
        elif SELECTED_MODEL == 'RoBERTa':
            model = models.TweetRoBERTaModel()
        optimizer = torch.optim.AdamW(model.parameters(),
                                      lr=LR,
                                      betas=(0.9, 0.999))
        criterion = engine.loss_fn
        dataloaders_dict = get_train_val_loaders(train_df, train_idx, val_idx,
                                                 BATCH_SIZE)

        engine.train_fn(
            model, SELECTED_MODEL, dataloaders_dict, criterion, optimizer,
            NUM_EPOCHS,
            '../config/roberta-pths/' + f'{SELECTED_MODEL}_fold{fold}.pth')
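The clean_text helper applied above is not shown. A minimal sketch of the kind of tweet cleaning such a helper might perform (the exact rules are an assumption, not the original implementation):

import re

def clean_text(text):
    # Hypothetical cleaner: lowercase, drop URLs, collapse whitespace.
    text = str(text).lower()
    text = re.sub(r"https?://\S+", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()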
Example 2
def run():
    dfx = pd.read_csv(config.TRAINING_FILE).fillna("none")
    dfx.sentiment = dfx.sentiment.apply(lambda x: 1 if x == "positive" else 0)

    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values)
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.BERTDataset(review=df_train.review.values,
                                        target=df_train.sentiment.values)

    valid_dataset = dataset.BERTDataset(review=df_valid.review.values,
                                        target=df_valid.sentiment.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device("cuda")

    model = BERTBaseUncased()

    model.to(device)

    param_optimizer = list(model.named_parameters())

    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    optimizer_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.001
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, scheduler, device)
        outputs, targets = engine.eval_fn(valid_data_loader, model, device)
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Accuracy Score = {accuracy}")

        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
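The actual loops live in engine.train_fn and engine.eval_fn, which are not shown here. A minimal sketch of what they might look like for this binary-sentiment setup, matching the call sites above (the batch keys "ids", "mask", "token_type_ids", "targets" and the model signature are assumptions):

import torch
import torch.nn as nn

def loss_fn(outputs, targets):
    # BCE on raw logits; targets are 0/1 floats
    return nn.BCEWithLogitsLoss()(outputs, targets.view(-1, 1))

def train_fn(data_loader, model, optimizer, scheduler, device):
    model.train()
    for batch in data_loader:
        ids = batch["ids"].to(device)
        mask = batch["mask"].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        targets = batch["targets"].to(device, dtype=torch.float)

        optimizer.zero_grad()
        outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        scheduler.step()

def eval_fn(data_loader, model, device):
    model.eval()
    fin_outputs, fin_targets = [], []
    with torch.no_grad():
        for batch in data_loader:
            ids = batch["ids"].to(device)
            mask = batch["mask"].to(device)
            token_type_ids = batch["token_type_ids"].to(device)
            targets = batch["targets"].to(device, dtype=torch.float)
            outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
            fin_outputs.extend(torch.sigmoid(outputs).cpu().numpy().tolist())
            fin_targets.extend(targets.cpu().numpy().tolist())
    return fin_outputs, fin_targets

In this sketch eval_fn returns sigmoid probabilities, which is why the caller thresholds them with >= 0.5 before computing accuracy.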
Example 3
def run():
    utils.seed_everything(seed=config.SEED)

    train_dataset = dataset.Lyft2ndLevelDataset(
        config.PRED_PATHS + [config.MODE_16_PATH])
    data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.BATCH_SIZE,
        num_workers=4,
        shuffle=True)
    
    device = torch.device('cuda')
    model = models.SetTransformer(**config.MODEL_PARAMS)
    model = model.to(device)

    optimizer = torch.optim.Adam(
        model.parameters(), lr=config.LEARNING_RATE,
        weight_decay=config.WEIGHT_DECAY)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer=optimizer, pct_start=config.PCT_START,
        div_factor=config.DIV_FACTOR, max_lr=config.LEARNING_RATE,
        epochs=config.EPOCHS, steps_per_epoch=len(data_loader))

    for epoch in range(config.EPOCHS):
        engine.train_fn(data_loader, model, optimizer,
                        device, scheduler=scheduler)
    
    torch.save(model.state_dict(), config.MODEL_PATH + 'transformer.bin') 
Example 4
def run():
    df = pd.read_csv(config.TRAIN_PATH)
    kfold = KFold(n_splits=5, random_state=config.SEED, shuffle=True)
    fold_losses = []

    for i, (train_idx, val_idx) in enumerate(kfold.split(df)):
        print("-------------------------------------------------------")
        print(f"Training fold {i}")
        print("-------------------------------------------------------")
        train = df.iloc[train_idx]
        validation = df.iloc[val_idx]
        train_dataset = PicDataset(train)
        train_data_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=config.BATCH_SIZE)

        val_dataset = PicDataset(validation)
        val_data_loader = torch.utils.data.DataLoader(
            val_dataset, batch_size=config.BATCH_SIZE)

        device = 'cuda:0' if torch.cuda.is_available() else "cpu"
        model = CNN()
        model.to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=config.LR)
        loss = 0

        for _ in range(config.EPOCHS):
            engine.train_fn(train_data_loader, model, optimizer, device)
            loss = engine.eval_fn(val_data_loader, model, device)

        print(f"Loss on fold {i} is {loss}")
        fold_losses.append(loss)
        torch.save(model.state_dict(), f'./models/model_{i}.bin')

    print(f"Average loss on cross validation is {sum(fold_losses) / 5}")
Example 5
def run():
    dfx = pd.read_csv(config.TRAINING_FILE)
    dfx.sentiment = dfx.sentiment.apply(lambda x: 1 if x == 'positive' else 0)
    df_train, df_val = train_test_split(dfx,
                                        test_size=0.1,
                                        random_state=42,
                                        stratify=dfx.sentiment.values)

    train_dataset = dataset.BertDataset(df_train.review.values,
                                        df_train.sentiment.values)

    val_dataset = dataset.BertDataset(df_val.review.values,
                                      df_val.sentiment.values)

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=1)

    val_dataloader = torch.utils.data.DataLoader(
        val_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=4)

    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    model = BertBaseUncased()
    model = model.to(device)

    params = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

    optimizer_params = [{
        'params':
        [p for n, p in params if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.001
    }, {
        'params': [p for n, p in params if any(nd in n for nd in no_decay)],
        'weight_decay':
        0
    }]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)

    optimizer = AdamW(optimizer_params, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    best_accuracy = 0
    accumulation_steps = config.ACCUMULATION
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_dataloader, model, optimizer, scheduler, device,
                        accumulation_steps)
        preds, actuals = engine.eval_fn(val_dataloader, model, device)
        preds = np.array(preds) >= 0.5
        accuracy = metrics.accuracy_score(actuals, preds)
        print("Accuracy Score: %0.2f" % (accuracy))

        if accuracy > best_accuracy:
            print("Best Accuracy reached, saving model...")
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
Example 6
def run():
    df1 = pd.read_csv("../input/jigsaw-toxic-comment-train.csv",
                      usecols=["comment_text", "toxic"])
    df2 = pd.read_csv("../input/jigsaw-unintended-bias-train.csv",
                      usecols=["comment_text", "toxic"])

    df_train = pd.concat([df1, df2], axis=0).reset_index(drop=True)
    df_valid = pd.read_csv("../input/validation.csv")

    train_dataset = dataset.BERTDataset(
        comment_text=df_train.comment_text.values,
        target=df_train.toxic.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_dataset = dataset.BERTDataset(
        comment_text=df_valid.comment_text.values,
        target=df_valid.toxic.values)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device(config.DEVICE)
    model = BERTBaseUncased()
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [{
        "params":
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        "weight_decay":
        0.001
    }, {
        "params":
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        "weight_decay":
        0.0
    }]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        outputs, targets = engine.eval_fn(valid_data_loader, model, device)
        targets = np.array(targets) >= 0.5
        accuracy = metrics.roc_auc_score(targets, outputs)
        print(f"AUC Score = {accuracy}")
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
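The no_decay parameter grouping above is repeated almost verbatim across the BERT examples on this page. A hypothetical helper that factors the pattern out (the name is not from any of these repositories):

def get_grouped_parameters(model, weight_decay=0.001):
    # Exclude biases and LayerNorm parameters from weight decay.
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    named = list(model.named_parameters())
    return [
        {"params": [p for n, p in named if not any(nd in n for nd in no_decay)],
         "weight_decay": weight_decay},
        {"params": [p for n, p in named if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0},
    ]

With it, the inline lists reduce to optimizer = AdamW(get_grouped_parameters(model), lr=3e-5).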
Example 7
def run():
    df1 = pd.read_csv(config.TRAINING_FILE, usecols=["comment_text","toxic"])
    
    train_dataset = dataset.BERTDataset(
        review=df1.comment_text.values,
        target=df1.toxic.values
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4
    )
    df2=pd.read_csv("../input/validation.csv", usecols=["comment_text","toxic"])
    valid_dataset = dataset.BERTDataset(
        review=df2.comment_text.values,
        target=df2.toxic.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BERTBaseUncased()
    model.to(device)
    
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]

    num_train_steps = int(len(df1) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )

    model = nn.DataParallel(model)

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        outputs, targets = engine.eval_fn(valid_data_loader, model, device)
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Accuracy Score = {accuracy}")
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
Example 8
def run(fold, model_name):
    writer = SummaryWriter(log_dir=f'{SAVE_PATH}/', filename_suffix=f'{model_name}-fold{fold}')
    dfx = pd.read_csv(config.TRAINING_FILE)

    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)
    print(df_train.shape)
    print(df_valid.shape)
    train_dataset = dataset.TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4
    )

    valid_dataset = dataset.TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=2
    )

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device('cpu')
    print(f'training on {device}')
    model_config = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)

    optimizer = AdamW(params.optimizer_params(model), lr=5e-5)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)

    es = utils.EarlyStopping(patience=5, mode="max")
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler, writer=writer)
        jaccard = engine.eval_fn(valid_data_loader, model, device, writer)
        print(f"Jaccard Score = {jaccard}")
        print(f"Epoch={epoch}, Jaccard={jaccard}")
        es(jaccard, model, model_path=f"{SAVE_PATH}/{model_name}-f{fold}.pt")
        if es.early_stop:
            print("Early stopping")
            break
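utils.EarlyStopping is used here (and again in Example 10) but not defined. A minimal sketch with the same call pattern, offered as an assumption about its behaviour rather than the actual utility:

import torch

class EarlyStopping:
    def __init__(self, patience=5, mode="max"):
        self.patience = patience
        self.mode = mode
        self.counter = 0
        self.best_score = None
        self.early_stop = False

    def __call__(self, score, model, model_path):
        # For mode="min", compare on the negated score so "bigger is better" everywhere.
        score = score if self.mode == "max" else -score
        if self.best_score is None or score > self.best_score:
            self.best_score = score
            self.counter = 0
            torch.save(model.state_dict(), model_path)  # checkpoint on improvement
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True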
Example 9
def run_train():
    data_dir = config.DATA_DIR
    nerProcessor = NerProcessor()
    train_example = nerProcessor.get_train_examples(data_dir)
    label_list = nerProcessor.get_labels()
    tokenizer = transformers.BertTokenizer.from_pretrained(
        config.BERT_TOKENIZER_PATH)
    train_features = convert_examples_to_features(train_example, label_list,
                                                  config.MAX_SEQ_LEN,
                                                  tokenizer)

    # input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
    # attention_mask = torch.tensor([f.attention_mask for f in train_features], dtype=torch.long)
    # token_type_ids = torch.tensor([f.token_type_ids for f in train_features], dtype=torch.long)
    # label_ids = torch.tensor([f.label_ids for f in train_features], dtype=torch.long)

    input_ids = torch.tensor([f["input_ids"] for f in train_features],
                             dtype=torch.long)
    attention_mask = torch.tensor(
        [f["attention_mask"] for f in train_features], dtype=torch.long)
    token_type_ids = torch.tensor(
        [f["token_type_ids"] for f in train_features], dtype=torch.long)
    label_ids = torch.tensor([f["label_ids"] for f in train_features])
    label_ids = F.one_hot(label_ids)
    label_ids = torch.tensor(label_ids.numpy(), dtype=torch.float)

    train_dataset = TensorDataset(input_ids, attention_mask, token_type_ids,
                                  label_ids)
    sampler = SequentialSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=sampler,
                                  batch_size=config.TRAIN_BATCH_SIZE)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BertNER(config.BERT_MODEL_PATH, len(label_list) + 1)
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=config.LEARNING_RATE)
    num_training_step = len(
        train_dataset) // config.TRAIN_BATCH_SIZE * config.TRAIN_EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_training_step)

    for epoch in range(config.TRAIN_EPOCHS):
        train_fn(model, device, train_dataloader, optimizer, scheduler)

        model_to_save = model.module if hasattr(model, "module") else model
        model_save_path = os.path.join(f"{config.BERT_OUTPUT}/{epoch+1}",
                                       WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), model_save_path)
        tokenizer.save_vocabulary(f"{config.BERT_OUTPUT}/{epoch+1}/vocab.txt")

    model_to_save = model.module if hasattr(model, "module") else model
    model_save_path = os.path.join(config.BERT_OUTPUT, WEIGHTS_NAME)
    torch.save(model_to_save.state_dict(), model_save_path)
    tokenizer.save_vocabulary(f"{config.BERT_OUTPUT}/vocab.txt")
Example 10
def run():
    seed_everything(config.SEED)
    df_train = pd.read_csv(
        config.TRAINING_FILE).dropna().reset_index(drop=True)

    train_dataset = TweetDataset(tweet=df_train.text.values,
                                 sentiment=df_train.sentiment.values,
                                 selected_text=df_train.selected_text.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    device = torch.device("cuda")
    model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.001
        },
        {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        },
    ]
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    es = utils.EarlyStopping(patience=2, mode="max")

    for epoch in range(EPOCHS):
        engine.train_fn(train_data_loader,
                        model,
                        optimizer,
                        device,
                        scheduler=scheduler)
        if epoch + 1 == MAX_EPOCHS:
            torch.save(model.state_dict(), 'model_full.bin')
            break
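seed_everything (also utils.seed_everything in Example 3) is a typical reproducibility helper; a common implementation, included here as an assumption:

import os
import random
import numpy as np
import torch

def seed_everything(seed=42):
    # Seed Python, NumPy and PyTorch (CPU and CUDA) for reproducible runs.
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True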
Example 11
def run_train():
    data_dir = config.DATA_DIR
    kgp = KGProcessor()
    rela_list = kgp.get_all_relations()
    examples = kgp.get_train_examples(data_dir)
    tokenizer = transformers.BertTokenizer.from_pretrained(
        config.BERT_TOKENIZER_PATH)
    features = kgp.convert_examples_to_features(examples, config.MAX_SEQ_LEN,
                                                tokenizer)

    input_ids = torch.tensor([f["input_ids"] for f in features],
                             dtype=torch.long)
    attention_mask = torch.tensor([f["attention_mask"] for f in features],
                                  dtype=torch.long)
    token_type_ids = torch.tensor([f["token_type_ids"] for f in features],
                                  dtype=torch.long)
    labels = torch.tensor([f["label"] for f in features])
    labels = F.one_hot(labels)
    labels = torch.tensor(labels.numpy(), dtype=float)

    dataset = TensorDataset(input_ids, attention_mask, token_type_ids, labels)
    sampler = SequentialSampler(dataset)
    data_loader = DataLoader(dataset,
                             sampler=sampler,
                             batch_size=config.TRAIN_BATCH_SIZE)

    num_training_steps = int(
        len(input_ids) / config.TRAIN_BATCH_SIZE * config.TRAIN_EPOCHS)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BertKG(config.BERT_MODEL_PATH, len(rela_list))
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=config.LEARNING_RATE)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

    for epoch in range(config.TRAIN_EPOCHS):
        print(
            f"\n---------------------------epoch: {epoch+1}---------------------------"
        )
        train_fn(model, device, data_loader, optimizer, scheduler)

        model_to_save = model.module if hasattr(model, "module") else model
        output_path = os.path.join(f"{config.BERT_OUTPUT_PATH}/{epoch+1}",
                                   WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_path)
        tokenizer.save_vocabulary(
            f"{config.BERT_OUTPUT_PATH}/{epoch+1}/vocab.txt")

    model_to_save = model.module if hasattr(model, "module") else model
    output_path = os.path.join(f"{config.BERT_OUTPUT_PATH}", WEIGHTS_NAME)
    torch.save(model_to_save.state_dict(), output_path)
    tokenizer.save_vocabulary(f"{config.BERT_OUTPUT_PATH}/vocab.txt")
Example 12
def _training(Model):
    features, targets = engine.get_features(Model, train=True)
    X_train, X_test, y_train, y_test = utils.train_test_split(features,
                                                              targets,
                                                              test_size=0.3)
    classifier = engine.train_fn(X_train, y_train)
    utils.save_model(classifier, config.MODEL_PATH)
    predictions = engine.eval_fn(classifier, X_test)
    accuracy = utils.accuracy_score(predictions, y_test)
    print("Accuracy Score:", accuracy)
Example 13
def run_training():
    image_files = glob.glob(os.path.join(config.DATA_DIR, "*.png"))[:10]
    target_orig = [x.split('/')[-1][:-4] for x in image_files]
    targets = [[c for c in x] for x in target_orig]
    targets_flat = [c for clist in targets for c in clist]

    lbl_enc = preprocessing.LabelEncoder()
    lbl_enc.fit(targets_flat)
    targets_enc = [lbl_enc.transform(x) for x in targets]
    # shift labels by 1 so that 0 is free to serve as the CTC blank/unknown index
    targets_enc = np.array(targets_enc) + 1
    # print(targets_enc)
    # print(len(lbl_enc.classes_))

    train_imgs, test_imgs, train_targets, test_targets, train_orig_targets, test_orig_targets = model_selection.train_test_split(
        image_files, targets_enc, target_orig, test_size=0.1, random_state=42)

    train_dataset = dataset.ClassificationDataset(image_paths=train_imgs,
                                                  targets=train_targets,
                                                  resize=(config.IMAGE_HEIGHT,
                                                          config.IMAGE_WIDTH))
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=config.BATCH_SIZE,
                                               num_workers=config.NUM_WORKERS,
                                               shuffle=True)

    test_dataset = dataset.ClassificationDataset(image_paths=test_imgs,
                                                 targets=test_targets,
                                                 resize=(config.IMAGE_HEIGHT,
                                                         config.IMAGE_WIDTH))
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=config.BATCH_SIZE,
                                              num_workers=config.NUM_WORKERS,
                                              shuffle=False)

    model = CaptchModel(num_chars=len(lbl_enc.classes_))
    model.to(config.DEVICE)

    optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           factor=0.8,
                                                           patience=5,
                                                           verbose=True)

    for epoch in range(config.EPOCHS):
        train_loss = engine.train_fn(model, train_loader, optimizer)
        valid_preds, valid_loss = engine.eval_fn(model, test_loader)
        valid_cap_preds = []
        for vp in valid_preds:
            current_preds = decode_predictions(vp, lbl_enc)
            valid_cap_preds.extend(current_preds)
        pprint(list(zip(test_orig_targets, valid_cap_preds)))
        print(
            f"Epoch = {epoch}, TrainLoss = {train_loss}, ValidLoss = {valid_loss}"
        )
Example 14
def run():

    sent_data = dataset.SentimentDataset()
    sent_data.load_data()
    sent_data.clean_data()
    sent_data.vocab_dict()
    sent_data.encode_text()
    sent_data.encode_label()
    sent_data.remove_outliers()
    sent_data.pad_features(config.SEQ_LENGTH)

    features = sent_data.features
    encoded_labels = sent_data.encoded_labels

    train_x, train_y, val_x, val_y, test_x, test_y = data_split(features, encoded_labels)

    # create Tensor datasets
    train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
    valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
    test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

    # make sure to shuffle your training data
    train_loader = DataLoader(train_data, shuffle=True, batch_size=config.BATCH_SIZE)
    valid_loader = DataLoader(valid_data, shuffle=True, batch_size=config.BATCH_SIZE)
    test_loader = DataLoader(test_data, shuffle=True, batch_size=config.BATCH_SIZE)

    vocab_size = len(sent_data.vocab_to_int)+1
    output_size = config.OUTPUT_SIZE

    if config.MODEL_ARCH =='LSTM':
        net = model.SentimentLSTM(vocab_size, output_size, config.EMBEDDING_DIM, config.HIDDEN_DIM, config.N_LAYERS)
    elif config.MODEL_ARCH =='CNN':
        net = model.SentimentCNN(vocab_size,config.EMBEDDING_DIM,output_size)
    elif config.MODEL_ARCH =='LSTM+CNN':
        net = model.SentimentCNNLSTM(vocab_size, config.EMBEDDING_DIM, output_size)



    print(net)
    lr=0.001
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    net.to(device=config.DEVICE)
    net.train()

    net = engine.train_fn(train_loader, valid_loader, net, optimizer, criterion, config.DEVICE)
    engine.test_fn(test_loader, net, criterion, config.DEVICE)

    print(" Testing few insances ")
    engine.predict(net, sent_data ," I Love this movie")
    engine.predict(net, sent_data, " This movie is not good")
    engine.predict(net, sent_data, "The worst movie I have seen; acting was terrible and I want my money back")
    engine.predict(net, sent_data, " I enjoy this movie")
    engine.predict(net, sent_data, " this movie is pathetic")
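sent_data.pad_features(config.SEQ_LENGTH) above forces every encoded review to a fixed length before the tensors are built. A standalone sketch of that kind of left-padding (the method itself is not shown, so this is an assumption about what it does):

import numpy as np

def pad_features(encoded_reviews, seq_length):
    # Left-pad each list of token ids with zeros, or truncate, to seq_length.
    features = np.zeros((len(encoded_reviews), seq_length), dtype=int)
    for i, row in enumerate(encoded_reviews):
        row = row[:seq_length]
        features[i, seq_length - len(row):] = row
    return features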
Example 15
def run():
    data = pd.read_csv(config.TRAINING_FILE).dropna()
    train, valid = train_test_split(data, random_state=1, test_size=0.2)

    model = get_model()
    model = engine.train_fn(model, train)
    score = engine.eval_fn(model, valid)

    print("EVAL Score : ", score)

    model.save_model(config.MODEL_PATH)
Example 16
def train():
    all_losses = []
    for epoch in range(config.N_EPOCHS):
        print(f'Epoch: {epoch}/{config.N_EPOCHS}')
        train_loss, batch_loss = engine.train_fn()
        all_losses.extend(batch_loss)

        print(f'Train loss: {train_loss:.5f}\n')

    with open('results_2', 'wb') as fp:
        pickle.dump(all_losses, fp)
Example 17
def run():
  # Read in CSV
  df = pd.read_csv(config.TRAINING_FILE)
  print('Read In Complete!')

  # Split into Validation
  df_train, df_val = train_test_split(df, test_size=0.1, stratify=df.sentiment.values, random_state=config.RANDOM_SEED)
  df_train = df_train.reset_index(drop=True)
  df_val = df_val.reset_index(drop=True)
  print(df_train.shape, df_val.shape)
  print('Validation Split Complete!')

  # Create Dataset required for BERT Model
  train_dataset = dataset.BERTDataset(df_train.content.values, df_train.sentiment.values)
  val_dataset = dataset.BERTDataset(df_val.content.values, df_val.sentiment.values)

  train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)
  val_data_loader = torch.utils.data.DataLoader(val_dataset, batch_size=config.VAL_BATCH_SIZE, num_workers=1)
  print('Dataset for Model Complete!')

  # Define Model and Hyperparameters
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
  model = BERTBaseCased()
  model.to(device)

  num_training_steps = len(train_data_loader) * config.EPOCHS
  optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
  scheduler = get_linear_schedule_with_warmup(optimizer, num_training_steps=num_training_steps, num_warmup_steps=0)

  # Train the Model, Print Accuracy, Save Model
  n_train_exp = len(df_train)
  n_val_exp = len(df_val)

  history = defaultdict(list)
  best_accuracy = 0

  for epoch in range(config.EPOCHS):
    print(f'\n{"#" * 10} Epoch: {epoch+1}/{config.EPOCHS} {"#" * 10}\n')
    train_acc, train_loss = engine.train_fn(train_data_loader, model, optimizer, device, scheduler, n_train_exp)    
    val_acc, val_loss = engine.eval_fn(val_data_loader, model, device, n_val_exp)

    print(f'\nTrain Loss: {train_loss:.4f}        Acc: {train_acc:.4f} \nVal   Loss: {val_loss:.4f}    Val Acc: {val_acc:.4f}')

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    if val_acc > best_accuracy:
        #!rm -rf /content/model*
        torch.save(model.state_dict(), config.MODEL_PATH)  # f'model/model_{val_acc:0.2f}.bin')
        best_accuracy = val_acc
Example 18
def run_training():
    image_files = glob.glob(
        os.path.abspath(os.path.join(config.DATA_DIR, "*.png")))
    labels = [list(x.split("/")[-1].split(".")[0]) for x in image_files]
    labels_flat = [c for x in labels for c in x]

    label_enc = preprocessing.LabelEncoder()
    label_enc.fit(labels_flat)
    tar_enc = np.array([label_enc.transform(x) for x in labels]) + 1
    train_X, test_X, train_y, test_y, train_target, test_target = model_selection.train_test_split(
        image_files, tar_enc, labels)

    train_dataset = dataset.DataSet(train_X,
                                    train_y,
                                    resize=(config.IMG_HEIGHT,
                                            config.IMG_WIDTH))

    test_dataset = dataset.DataSet(test_X,
                                   test_y,
                                   resize=(config.IMG_HEIGHT,
                                           config.IMG_WIDTH))

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.BATCH_SIZE,
        num_workers=config.NUM_WORKERS,
        shuffle=True)

    test_dataloader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=config.BATCH_SIZE,
        num_workers=config.NUM_WORKERS)

    cm = CaptchaModel(num_chars=len(label_enc.classes_))
    cm.to(config.DEVICE)

    optimizer = torch.optim.Adam(cm.parameters(), lr=3e-5)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           factor=0.8,
                                                           patience=10,
                                                           verbose=True)

    for epoch in range(config.EPOCHS):

        train_loss = engine.train_fn(cm, train_dataloader, optimizer)
Example 19
def run_training():
    transform = transforms.Compose([
            transforms.Resize(size=(32,32)),
            transforms.ToTensor(),
            transforms.Normalize((0.45820624,0.43722707,0.39191988),(0.23130463,0.22692703,0.22379072))
    ])

    label_encoder = preprocessing.LabelEncoder()

    image_paths = glob.glob(os.path.join(config.DATA_DIR,"**/*.*"),recursive=True)
    targets = [x.split("/")[-2] for x in image_paths]
    label_encoded = np.array(label_encoder.fit_transform(targets))
    
    (train_images,test_images,train_labels,test_labels) = model_selection.train_test_split(image_paths,label_encoded,test_size=0.2,random_state=0)
    # print(len(train_images))
    # print(len(train_labels))

    train_dataset = dataset.ClassificationDataset(train_images,train_labels,transform)
    train_loader = torch.utils.data.DataLoader(train_dataset,batch_size=config.BATCH_SIZE,shuffle=True)
    
    test_dataset = dataset.ClassificationDataset(test_images,test_labels,transform)
    test_loader = torch.utils.data.DataLoader(test_dataset,batch_size=config.BATCH_SIZE,shuffle=False)

    model = SmallNet(num_classes=3)
    model.to(config.DEVICE)

    opt = torch.optim.Adam(model.parameters(),lr=3e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        opt, factor=0.8, patience=5, verbose=True
    )

    for epoch in range(config.EPOCHS):
        
        train_loss = engine.train_fn(model,train_loader,opt)
        val_accuracy,val_loss = engine.eval_fn(model,test_loader)

        print(
            f"Epoch={epoch}, Train Loss={train_loss}, Test Loss={val_loss} Accuracy={val_accuracy}"
        )
        
        scheduler.step(val_loss)

    print("Saved model...")
    torch.save(model.state_dict(),"./models/weights_latest.pt")    
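The Normalize values hard-coded in the transform above are per-channel mean and std of the training images. A sketch of how such statistics could be computed over a DataLoader (the loader is assumed to yield (image, label) batches of equally sized RGB tensors):

import torch

def channel_stats(loader):
    # Accumulate per-channel mean and E[x^2] over all images, then derive std.
    total, mean, sq_mean = 0, torch.zeros(3), torch.zeros(3)
    for images, _ in loader:
        b = images.size(0)
        flat = images.reshape(b, 3, -1)
        mean += flat.mean(dim=2).sum(dim=0)
        sq_mean += (flat ** 2).mean(dim=2).sum(dim=0)
        total += b
    mean /= total
    std = (sq_mean / total - mean ** 2).sqrt()
    return mean, std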
Example 20
def run_training():
    image_files = glob.glob(os.path.join(config.DATA_DIR,"*.png"))
    # path to the dataset
    targets_orig = [x.split('/')[-1][:-4] for x in image_files]
    targets = [[c for c in x] for x in targets_orig]
    targets_flat = [c for clist in targets for c in clist]
    lbl_enc = preprocessing.LabelEncoder()
    lbl_enc.fit(targets_flat)
    targets_enc = [lbl_enc.transform(x) for x in targets]
    targets_enc = np.array(targets_enc) + 1
    # print(targets)
    # print(target_enc)
    # print(len(lbl_enc.classes_))
    # # print(targets_orig)
    # for i, item in enumerate(lbl_enc.classes_):
    #     print(item, '-->', i)

    train_imgs, test_imgs, train_targets, test_targets, train_orig_targets, test_orig_targets= model_selection.train_test_split(image_files, targets_enc, targets_orig, test_size = 0.1, random_state= 42)

    train_dataset = dataset.ClassificationDataset(image_paths = train_imgs, targets =  train_targets, resize = (config.IMAGE_HEIGHT, config.IMAGE_WIDTH))
    print(train_dataset[0])
    train_loader = torch.utils.data.DataLoader(
                train_dataset,
                batch_size = config.BATCH_SIZE,
                num_workers = config.NUM_WORKERS,
                shuffle = True
        )


    test_dataset = dataset.ClassificationDataset(image_paths = test_imgs, targets =  test_targets, resize = (config.IMAGE_HEIGHT, config.IMAGE_WIDTH))
    test_loader = torch.utils.data.DataLoader(
                test_dataset,
                batch_size = config.BATCH_SIZE,
                num_workers = config.NUM_WORKERS,
                shuffle = False
        )
    model = CaptchaModel(num_chars = len(lbl_enc.classes_))
    model.to(config.DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr = 3e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor =0.8, patience= 5, verbose= True)
    for epoch in range(config.EPOCHS):
        train_loss = engine.train_fn(model, train_loader, optimizer)
        valid_pred, valid_loss = engine.eval_fn(model, test_loader)
Example 21
def run_training():

    df = pd.read_csv(TRAIN_CSV)

    labelencoder = LabelEncoder()
    df['label_group'] = labelencoder.fit_transform(df['label_group'])

    trainset = ShopeeDataset(
        df, DATA_DIR, transform=get_train_transforms(img_size=CFG.img_size))

    trainloader = torch.utils.data.DataLoader(
        trainset,
        batch_size=CFG.batch_size,
        num_workers=CFG.num_workers,
        #pin_memory = True,
        shuffle=True,
        #drop_last = True
    )
    scaler = GradScaler()
    model = ShopeeModel()
    model.to(CFG.device)
    #model=torch.nn.DataParallel(model)
    criterion = nn.CrossEntropyLoss().to(CFG.device)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=CFG.scheduler_params['lr_start'])
    scheduler = ShopeeScheduler(optimizer, **CFG.scheduler_params)

    for epoch in range(CFG.epochs):
        avg_loss_train = engine.train_fn(model, trainloader, optimizer,
                                         scheduler, epoch, CFG.device,
                                         criterion, scaler)
        torch.save(model.state_dict(),
                   MODEL_PATH + 'arcface_512x512_{}.pt'.format(CFG.model_name))
        torch.save(
            {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict()
            }, MODEL_PATH +
            'arcface_512x512_{}_checkpoints.pt'.format(CFG.model_name))
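A GradScaler is created above but the mixed-precision step itself is hidden inside engine.train_fn. A sketch of what one AMP training step usually looks like in this setup (the batch structure and the label-conditioned forward of the ArcFace head are assumptions):

import torch
from torch.cuda.amp import autocast

def train_one_batch(model, images, labels, criterion, optimizer, scaler, device):
    images, labels = images.to(device), labels.to(device)
    optimizer.zero_grad()
    with autocast():                      # forward pass in mixed precision
        logits = model(images, labels)    # ArcFace-style heads take labels in forward
        loss = criterion(logits, labels)
    scaler.scale(loss).backward()         # scale the loss to avoid fp16 underflow
    scaler.step(optimizer)
    scaler.update()
    return loss.item()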
Example 22
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    num_train_steps = int(len(train_sentences) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )

    best_loss = np.inf
    for epoch in range(config.EPOCHS):
        train_loss = engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        test_loss = engine.eval_fn(valid_data_loader, model, device)
        print(f"Train Loss = {train_loss} Valid Loss = {test_loss}")
        if test_loss < best_loss:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_loss = test_loss
Example 23
def run():

    df_train = preprocess('./review-sentence_train_clean.csv')
    df_valid = preprocess('./review-sentence_dev_clean.csv')

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.BERTDataset(review=df_train.sentence.values,
                                        target=df_train.ENCODE_CAT.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_dataset = dataset.BERTDataset(review=df_valid.sentence.values,
                                        target=df_valid.ENCODE_CAT.values)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device(config.DEVICE)
    model = BERTBaseUncased()
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.001,
        },
        {
            "params":
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay":
            0.0,
        },
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler,
                        epoch)
        outputs, targets = engine.eval_fn(valid_data_loader, model, device,
                                          epoch)
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Validation Accuracy  = {accuracy}")
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
            print("Best val accuracy till now {}".format(best_accuracy))
Example 24
def run():
    device = torch.device(
        "cuda") if torch.cuda.is_available() else torch.device("cpu")

    train_df = pd.read_csv(config.TRAIN_CSV_PATH)
    valid_df = pd.read_csv(config.VALIDATION_CSV_PATH)

    train_dataset = dataset.detection_dataset(
        train_df,
        target=config.TARGET_COL,
        train=True,
        transforms=T.Compose([T.ToTensor()]),
    )

    valid_dataset = dataset.detection_dataset(
        valid_df,
        target=config.TARGET_COL,
        train=True,
        transforms=T.Compose([T.ToTensor()]),
    )

    # print(train_dataset)

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        shuffle=False,
        collate_fn=utils.collate_fn,
    )

    valid_dataloader = DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        shuffle=False,
        collate_fn=utils.collate_fn,
    )

    print("Data Loaders created")

    detector = model.create_model(config.NUM_CLASSES, backbone=config.BACKBONE)

    params = [p for p in detector.parameters() if p.requires_grad]
    optimizer = optim.Adam(params, lr=config.LEARNING_RATE)
    # lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.5)
    detector.to(device)

    print("Model loaded to device")

    print("---------------- Training Started --------------")

    for epoch in range(config.EPOCHS):
        loss_value = engine.train_fn(train_dataloader, detector, optimizer,
                                     device)
        print("epoch = {}, Training_loss = {}".format(epoch, loss_value))
        # Set the threshold as per needs
        results = engine.eval_fn(
            valid_dataloader,
            detector,
            device,
            detection_threshold=config.DETECTION_THRESHOLD,
        )
        # Pretty printing the results
        pprint(results)

    # For now just saving one model; evaluation metrics for selecting the best model haven't been built yet.

    # torch.save({
    #         'epoch': epoch,
    #         'model_state_dict': detector.state_dict(),
    #         'optimizer_state_dict': optimizer.state_dict(),
    #         'loss': loss_value,
    #         }, config.MODEL_SAVE_PATH)

    torch.save(detector.state_dict(), config.MODEL_SAVE_PATH)
    print("-" * 25)
    print("Model Trained and Saved to Disk")
Example 25
    detector = model.get_model(num_class)
    params = [p for p in detector.parameters() if p.requires_grad]
    optimizer = optim.Adam(params)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.9)

    if os.path.exists('model.pth'):
        detector, start_epoch, optimizer, lr_scheduler = utils.get_checkpoint_state(
            "model.pth", detector, optimizer, lr_scheduler)
    detector.to(device)
    utils.writelog(file=log_file,
                   log_info='=' * 10 + 'finished to set model' + '=' * 10 +
                   ',   ' + str(time.time() - since))

    min_loss = 1000000000000
    for epoch in range(start_epoch, start_epoch + 5):
        loss_value = engine.train_fn(train_set_load, detector, optimizer,
                                     device, epoch, lr_scheduler)
        print("epoch = {}, Training_loss = {}".format(epoch, loss_value))
        utils.writelog(file=log_file,
                       log_info="epoch = {}, Training_loss = {}".format(
                           epoch, loss_value))
        # Set the threshold as per needs
        if loss_value < min_loss:
            min_loss = loss_value
            utils.save_checkpoint_state("model.pth", epoch, detector,
                                        optimizer, lr_scheduler)
            utils.writelog(
                file=log_file,
                log_info=">>>>>>>>>>>>epoch = {}, save model<<<<<<<<<<<".
                format(epoch))

    print("-" * 25)
def run():
    sentences, pos, tag, enc_pos, enc_tag = utils.process_data(config.DATA_FILE)

    meta_data = {
        "enc_pos": enc_pos,
        "enc_tag": enc_tag
    }

    joblib.dump(meta_data, "meta.bin")

    num_pos = len(list(enc_pos.classes_))
    num_tag = len(list(enc_tag.classes_))

    (
        train_sentences,
        test_sentences,
        train_pos,
        test_pos,
        train_tag,
        test_tag
    ) = model_selection.train_test_split(sentences, pos, tag, random_state=42, test_size=0.1)

    train_dataset = dataset.EntityDataset(
        texts = train_sentences, pos=train_pos, tags=train_tag
    )

    test_dataset  = dataset.EntityDataset(
        texts = test_sentences, pos=test_pos, tags=test_tag
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size = config.TRAIN_BATCH_SIZE, num_workers=4
    )

    test_data_loader  = torch.utils.data.DataLoader(
        test_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1
    )

    device = torch.device("cuda")

    model = EntityModel(num_tag=num_tag, num_pos=num_pos)
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay        = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_param = [
        {
            "params" : [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params" : [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    num_train_steps = int(len(train_sentences) / config.TRAIN_BATCH_SIZE * config.EPOCHS )
    optimizer = AdamW(optimizer_param, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )

    best_loss = np.inf

    for epoch in range(config.EPOCHS):
        train_loss = engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        test_loss  = engine.eval_fn(test_data_loader, model, device)
        print(f"Train Loss = {train_loss} Valod Loss = {test_loss}")
        if test_loss < best_loss:
            torch.save(model.state_dict(), config.MODEL_SAVE_PATH)
            best_loss = test_loss
Example 27
def run(fold):
    dfx = pd.read_csv(config.TRAINING_FILE)

    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    train_dataset = dataset.TweetDataset(
        tweets=df_train.text.values,
        sentiments=df_train.sentiment.values,
        selected_texts=df_train.selected_text.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4,
        shuffle=True)

    valid_dataset = dataset.TweetDataset(
        tweets=df_valid.text.values,
        sentiments=df_valid.sentiment.values,
        selected_texts=df_valid.selected_text.values)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=4,
        shuffle=False)

    device = torch.device('cuda')
    model_config = transformers.RobertaConfig.from_pretrained(
        config.MODEL_CONFIG)
    model_config.output_hidden_states = True
    model = models.TweetModel(conf=model_config)
    model = model.to(device)

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        config.WEIGHT_DECAY
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    base_opt = transformers.AdamW(optimizer_parameters,
                                  lr=config.LEARNING_RATE)
    optimizer = torchcontrib.optim.SWA(base_opt,
                                       swa_start=int(num_train_steps *
                                                     config.SWA_RATIO),
                                       swa_freq=config.SWA_FREQ,
                                       swa_lr=None)
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=int(num_train_steps * config.WARMUP_RATIO),
        num_training_steps=num_train_steps)

    print(f'Training is starting for fold={fold}')

    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader,
                        model,
                        optimizer,
                        device,
                        scheduler=scheduler)
        jaccard = engine.eval_fn(valid_data_loader, model, device)

    if config.USE_SWA:
        optimizer.swap_swa_sgd()

    torch.save(model.state_dict(),
               f'{config.MODEL_SAVE_PATH}/model_{fold}.bin')

    return jaccard
Example 28
def run():
    Seed = 1234
    random.seed(Seed)
    np.random.seed(Seed)
    torch.manual_seed(Seed)
    torch.cuda.manual_seed(Seed)
    torch.backends.cudnn.deterministic = True
    train, valid, test, SRC, TRG = dataset.create_dataset()
    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train, valid, test),
        sort_key=lambda x: len(x.source),
        batch_size=config.BATCH_SIZE,
        device=config.device)

    INPUT_DIM = len(SRC.vocab)
    OUTPUT_DIM = len(TRG.vocab)

    ENC_EMB_DIM = config.ENCODER_EMBEDDING_DIMENSION
    DEC_EMB_DIM = config.DECODER_EMBEDDING_DIMENSION
    HID_DIM = config.LSTM_HIDDEN_DIMENSION
    N_LAYERS = config.LSTM_LAYERS
    ENC_DROPOUT = config.ENCODER_DROPOUT
    DEC_DROPOUT = config.DECODER_DROPOUT

    attn = model.Attention(HID_DIM, HID_DIM)
    enc = model.Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, HID_DIM, ENC_DROPOUT)
    dec = model.Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, HID_DIM, DEC_DROPOUT,
                        attn)

    model_rnn = model.Seq2Seq(enc, dec, config.device).to(config.device)

    optimizer = optim.Adam(model_rnn.parameters(), lr=config.LEARNING_RATE)

    TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

    criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

    if (args.action == 'train'):
        model_rnn.apply(utils.init_weights)

        best_valid_loss = float('inf')

        for epoch in range(config.N_EPOCHS):
            start_time = time.time()

            train_loss = engine.train_fn(model_rnn, train_iterator, optimizer,
                                         criterion, config.CLIP)
            valid_loss = engine.evaluate_fn(model_rnn, valid_iterator,
                                            criterion)

            end_time = time.time()

            epoch_mins, epoch_secs = utils.epoch_time(start_time, end_time)

            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save(model_rnn.state_dict(), config.MODEL_SAVE_FILE)

            with open(config.RESULTS_SAVE_FILE, 'a') as f:
                print(
                    f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s',
                    file=f)
                print(
                    f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}',
                    file=f)
                print(
                    f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}',
                    file=f)

    elif (args.action == 'test'):
        model_rnn.load_state_dict(torch.load(config.TEST_MODEL))
        loss, target, output = engine.test_fn(model_rnn, test_iterator,
                                              criterion, SRC, TRG)
        bl = bleu_score(output, target, max_n=1, weights=[1])
        met = 0

        for z in range(len(output)):
            out = ' '.join(output[z][y]
                           for y in range(min(10, len(output[z]))))
            tar = ' '.join(y for y in target[z])

            met = met + metric_utils.compute_metric(out, 1.0, tar)

        with open(config.TEST_RESULTS_FILE, 'w') as f:
            print(f'Test bleu :, {bl*100}, Test PPL: {math.exp(loss):7.3f}',
                  'Metric:',
                  met / len(output),
                  file=f)

    elif (args.action == 'save_vocab'):
        print('Source Vocab Length', len(SRC.vocab))
        print('Target vocab length', len(TRG.vocab))
        s1 = '\n'.join(k for k in SRC.vocab.itos)
        s2 = '\n'.join(k for k in TRG.vocab.itos)
        with open('NL_vocabulary.txt', 'w') as f:
            f.write(s1)
        with open('Bash_vocabulary.txt', 'w') as f:
            f.write(s2)
Example 29
def run():
    print('Loading Files...')
    
    dfx = pd.read_csv(config.TRAINING_FILE).dropna().reset_index(drop = True)
    
    #dfx = dfx.sample(100)
    df_train,df_valid = model_selection.train_test_split(
            dfx,
            test_size = 0.1,
            random_state = 42,            
            )
    df_train = df_train.reset_index(drop = True)
    df_valid = df_valid.reset_index(drop = True)
    
    print('Files loaded')
    
    train_dataset = dataset.TweetDataset( 
            tweet=df_train.text.values,
            selected_text = df_train.selected_text.values
            )
    
    train_dataloader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=config.TRAIN_BATCH_SIZE,
            shuffle = False
            )
    
    
    valid_dataset = dataset.TweetDataset( 
            tweet=df_valid.text.values,
            selected_text = df_valid.selected_text.values
            )
    valid_dataloader = torch.utils.data.DataLoader(
            valid_dataset,
            batch_size=config.VALID_BATCH_SIZE,
            shuffle = False
            )
    
    
    device = torch.device('cuda')
    print('Running on ',device)
    model = BertBaseUncased().to(device)
    
    
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    
    optimizer_params = [
            {'params':[p for n,p in param_optimizer if not any(nd in n for nd in no_decay)],'weight_decay':0.003},
            {'params':[p for n,p in param_optimizer if  any(nd in n for nd in no_decay)],'weight_decay':0.00}
            ]
    
    
    num_training_steps = int(len(df_train)/config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_params, lr = 2e-5)
    scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps = 0,
            num_training_steps = num_training_steps)
    
    
    best_jaccard = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_dataloader,model,optimizer,device,scheduler)
        jaccard = engine.eval_fn(valid_dataloader,model,device)
        
        print(f'Epochs {epoch+1}...',
              f'Jaccard {jaccard}')
        
        if jaccard > best_jaccard:
            torch.save(model.state_dict(),config.MODEL_PATH)
            best_jaccard = jaccard
        
        print('Memory Used: ',torch.cuda.memory_allocated()/1000000000,'GB')         
        torch.cuda.empty_cache()
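The jaccard value returned by engine.eval_fn in these tweet-extraction examples is the word-level Jaccard similarity between the predicted and true selected_text; the evaluation loop is not shown, but the metric itself is typically implemented as:

def jaccard(str1, str2):
    # Word-level Jaccard similarity between two strings.
    a = set(str(str1).lower().split())
    b = set(str(str2).lower().split())
    c = a & b
    return float(len(c)) / (len(a) + len(b) - len(c))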
Example 30
def run_training():
    image_files = glob.glob(os.path.join(config.DATA_DIR, "*.png"))

    targets_orig = [x.split("\\")[-1][:-4] for x in image_files]

    targets = [[y for y in x] for x in targets_orig]

    targets_flat = [c for clist in targets for c in clist]

    label_enc = preprocessing.LabelEncoder()
    label_enc.fit(targets_flat)
    targets_enc = [label_enc.transform(x) for x in targets]
    targets_enc = np.array(targets_enc) + 1
    # print(targets_enc)
    # print(label_enc.classes_)

    (
        train_imgs,
        test_imgs,
        train_targets,
        test_targets,
        train_orig_targets,
        test_orig_targets,
    ) = model_selection.train_test_split(image_files,
                                         targets_enc,
                                         targets_orig,
                                         test_size=0.1,
                                         random_state=42)

    train_dataset = dataset.ClassificationDataset(
        image_paths=train_imgs,
        targets=train_targets,
        resize=(config.IMAGE_HEIGHT, config.IMAGE_WIDTH),
    )

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=config.BATCH_SIZE,
                                               num_workers=config.NUM_WORKERS,
                                               shuffle=True,
                                               pin_memory=True)

    test_dataset = dataset.ClassificationDataset(
        image_paths=test_imgs,
        targets=test_targets,
        resize=(config.IMAGE_HEIGHT, config.IMAGE_WIDTH),
    )

    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=config.BATCH_SIZE,
                                              num_workers=config.NUM_WORKERS,
                                              shuffle=False,
                                              pin_memory=True)

    model = CaptchaModel(num_chars=len(label_enc.classes_))
    model.to(torch.device(config.DEVICE))

    optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           factor=0.8,
                                                           patience=5,
                                                           verbose=True)

    for epoch in range(config.EPOCHS):
        train_loss = engine.train_fn(model, train_loader, optimizer)
        valid_preds, valid_loss = engine.eval_fn(model, test_loader)
        valid_cap_preds = []
        for vp in valid_preds:
            current_preds = decode_predictions(vp, label_enc)
            valid_cap_preds.extend(current_preds)
        pprint.pprint(list(zip(test_orig_targets, valid_cap_preds))[:10])
        test_dup_rem = [remove_duplicates(c) for c in test_orig_targets]
        accuracy = metrics.accuracy_score(test_dup_rem, valid_cap_preds)
        print(
            f"EPOCH: {epoch}.train_loss:{train_loss},valid_loss:{valid_loss}, Accuracy={accuracy}"
        )
        scheduler.step(valid_loss)
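decode_predictions and remove_duplicates are used in the captcha examples but never shown. A sketch of greedy CTC decoding that is consistent with the + 1 label shift above (index 0 reserved for the blank token); the real helpers may differ:

import torch

def remove_duplicates(s):
    # Collapse consecutive repeated characters, e.g. 'aabcc' -> 'abc'.
    out = []
    for ch in s:
        if not out or ch != out[-1]:
            out.append(ch)
    return "".join(out)

def decode_predictions(preds, encoder):
    # preds is assumed to be (timesteps, batch, classes) logits from the model.
    preds = preds.permute(1, 0, 2)             # -> (batch, timesteps, classes)
    preds = torch.softmax(preds, dim=2)
    preds = torch.argmax(preds, dim=2).cpu().numpy()
    decoded = []
    for sample in preds:
        chars = []
        for k in sample:
            k = int(k) - 1                     # undo the +1 shift; -1 is the blank
            chars.append("§" if k == -1 else str(encoder.inverse_transform([k])[0]))
        # collapse repeats first (CTC), then drop the blank marker
        decoded.append(remove_duplicates("".join(chars)).replace("§", ""))
    return decoded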