Example #1
def main():
    seed_everything(0)
    # Load Data  #####################################
    # From csv
    since = time.time()
    print('Data Loading...')
    # From Original  #################
    # data_dir = '../data/input'
    # df = load_data(nrows=None, merge=True, data_dir=data_dir)

    # From Pickle  ###################
    # with open('../data/input/data.pkl', 'rb') as f:
    #     df = pickle.load(f)

    # Preprocessing
    # df = prep_dict[args.preprocessing](df)
    # df = reduce_mem_usage(df)

    # From Feather  #################
    target_features = [
        'Snap', 'SellPrice', 'Lag', 'Lag_RollMean_28',
        'TimeFeatures', 'Lag_SellPrice', 'Lag_SellPrice_diff', 'Ids', 'Event'
    ]

    target_path = [f'../features/{name}.ftr' for name in target_features]
    df = load_from_feather(target_path)

    # Model Training  #####################################
    lgbm = LGBMModel_group(df, **config)
    res = lgbm.train()

    # WRMSSE  ##################################################
    print('Reading files...')
    calendar = pd.read_csv('../data/input/calendar.csv')
    sell_prices = pd.read_csv('../data/input/sell_prices.csv')
    sales_train_validation = pd.read_csv('../data/input/sales_train_validation.csv')
    train_fold_df = sales_train_validation.iloc[:, :-28]
    valid_fold_df = sales_train_validation.iloc[:, -28:]
    del sales_train_validation

    wrmsse = lgbm.get_wrmsse(train_fold_df, valid_fold_df, calendar, sell_prices)
    print(f'WRMSSE: {wrmsse:.3f}')
    del calendar, sell_prices, train_fold_df, valid_fold_df
    gc.collect()

    # Evaluate  #####################################
    sub_name = f"{config['exp_name']}_wrmsse_{wrmsse:.3f}.csv"
    res.to_csv(f'../data/output/{sub_name}', index=False)
    del df
    gc.collect()

    # Feature Importance  #####################################
    lgbm.visualize_feature_importance()

    # Time Counting  ##################################################
    elapsed_time = time.time() - since
    s = datetime.timedelta(seconds=elapsed_time)
    print(f'Total time: {str(s)}')
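Every example on this page calls a `seed_everything` helper imported from the project's own utilities. A minimal sketch of what such a helper typically does, assuming the common Python/NumPy/PyTorch recipe rather than any of the exact implementations used above:

import os
import random

import numpy as np
import torch


def seed_everything(seed: int = 0) -> None:
    # Seed every common source of randomness so runs are reproducible.
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False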
Example #2
def main():
    parser = get_argparse()
    parser.add_argument("--fine_tunning_model",
                        type=str,
                        required=True,
                        help="fine_tuning model path")
    args = parser.parse_args()
    print(
        json.dumps(vars(args),
                   sort_keys=True,
                   indent=4,
                   separators=(', ', ': '),
                   ensure_ascii=False))
    init_logger(log_file="./log/{}.log".format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    seed_everything(args.seed)

    # save path
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    # device
    args.device = torch.device(
        "cuda:0" if torch.cuda.is_available() else "cpu")

    # tokenizer
    tokenizer = BertTokenizerFast.from_pretrained(args.model_name_or_path)

    # Dataset & Dataloader
    test_dataset = MrcDataset(args,
                              json_path="./data/test1.json",
                              tokenizer=tokenizer)

    test_iter = DataLoader(test_dataset,
                           shuffle=False,
                           batch_size=args.per_gpu_eval_batch_size,
                           collate_fn=collate_fn,
                           num_workers=24)

    logger.info("The nums of the test_dataset examples is {}".format(
        len(test_dataset.examples)))
    logger.info("The nums of the test_dataset features is {}".format(
        len(test_dataset)))

    # model
    model = MRC_model(args.model_name_or_path)
    model.to(args.device)
    model.load_state_dict(torch.load(args.fine_tunning_model))

    # predict test
    model.eval()
    evaluate(args, test_iter, model, prefix="test")
Example #3
    def train(self, data, seed):

        train_data = data['trainloader']
        valid_data = data['validloader']
        tgt_vocab = data['tgt_vocab']
        label2id = data['label2id']
        seed_everything(seed)
        if self.config.classifier == 'BertSGM' or self.config.classifier == 'SGM':
            with codecs.open(self.config.sgm.label_dict_file, 'r', 'utf-8') as f:
                label_dict = json.load(f)

        # ***************************************************************
        best = 0
        for epoch in range(self.start_epoch, self.start_epoch+self.epochs):
            print(f"Epoch {epoch}/{self.epochs}")

            if self.config.classifier == 'BertCNN' or self.config.classifier == 'BertRCNN':
                train_log = self.train_epoch(train_data)
                valid_log = self.valid_epoch(valid_data)
                logs = dict(train_log, **valid_log)
                show_info = f'\nEpoch: {epoch} - ' + "-".join([f' {key}: {value:.4f} ' for key, value in logs.items()])
                print(show_info)


            if self.config.classifier == 'BertSGM' or self.config.classifier == 'SGM':
                self.train_bertsgm_epoch(train_data,epoch)
                logs = self.valid_bertsgm_epoch(valid_data,tgt_vocab,label_dict)
                print(logs)

            if self.config.classifier=='BertSeq2Set':
                self.train_seq2set_epoch(train_data,epoch)
                logs = self.valid_seq2set_epoch(valid_data,tgt_vocab,label2id)
                print(logs)

            # save the model with the best f1 score
            if logs['valid_f1'] > best:
                best = logs['valid_f1']
                torch.save(self.model, self.config.model_save_path + str(self.config.classifier)+'_bestmodel.pth')
            print('Epoch:%d  best f1:%s' % (epoch, str(best)))

            # early_stopping
            if self.early_stopping:
                self.early_stopping.epoch_step(
                    epoch=epoch, current=logs[self.early_stopping.monitor])
                if self.early_stopping.stop_training:
                    break
Example #4
    def train(self):
        self.logger.info("     rate  step  epoch  |   loss  val_loss  |  time")
        self.logger.info("-" * 68)

        min_loss = np.inf

        start_time = timer()
        for epoch in range(self.start_epoch, self.num_epochs):
            seed_everything(epoch * 1000 + epoch)

            train_log = self._train_epoch(start_time)
            valid_log = self._valid_epoch()
            logs = dict(train_log, **valid_log)

            rate = self.optimizer.get_lr()
            now_epoch = (self.global_step * self.batch_size /
                         len(self.train_loader.dataset))

            asterisk = " "
            if logs["val_loss"] < min_loss:
                min_loss = logs["val_loss"]
                asterisk = "*"

            self.logger.info(f"{rate[0]:.7f} "
                             f"{self.global_step / 1000:5.2f} "
                             f"{now_epoch:6.2f}  | "
                             f'{logs["loss"]:.4f}    '
                             f'{logs["val_loss"]:.4f} {asterisk}| '
                             f'{time_to_str((timer() - start_time), "sec")}  '
                             f"{torch.cuda.memory_allocated() // 1024 ** 2}")

            valid_probs = logs["val_probs"]
            correct = evaluate(valid_probs)
            self.logger.info(
                f"min: {np.min(valid_probs):.4f} "
                f"max: {np.max(valid_probs):.4f} "
                f"avg: {np.average(valid_probs):.4f} "
                f"acc: {correct}, {float(correct / len(valid_probs)):.4f}")

            if self.model_checkpoint:
                state = self._save_info(epoch, val_loss=logs["val_loss"])
                self.model_checkpoint.step(state=state)
Example #5
def run(args, log):

    df = pd.read_csv(args.df_path)
    df_train = df[df['Fold']!=args.fold]
    df_valid = df[df['Fold']==args.fold]
    dfs = {}
    dfs['train'] = df_train
    dfs['val'] = df_valid
    
    model = get_model(args).cuda()
    
    if args.mode == 'segmentation':
        # train the encoder/decoder and freeze the classification head
        for param in model.model.encoder.parameters():
            param.requires_grad = True
        for param in model.model.decoder.parameters():
            param.requires_grad = True
        for param in model.model.classification_head.parameters():
            param.requires_grad = False

    elif args.mode == 'classification':
        # freeze the encoder/decoder and train only the classification head
        for param in model.model.encoder.parameters():
            param.requires_grad = False
        for param in model.model.decoder.parameters():
            param.requires_grad = False
        for param in model.model.classification_head.parameters():
            param.requires_grad = True

    criterion = get_loss(args)
    optimizer = get_optimizer(args, model)
    
    if args.initial_ckpt is not None:
        last_epoch, step = checkpoint.load_checkpoint(args, model, checkpoint=args.initial_ckpt)
        log.write(f'Resume training from {args.initial_ckpt} @ {last_epoch}\n')
    else:
        last_epoch, step = -1, -1
    
    dataloaders = {mode:get_dataloader(args.data_dir, dfs[mode], mode, args.pretrain, args.batch_size) for mode in ['train', 'val']}   
    seed_everything(seed=123)
    clr = CLR(optimizer, len(dataloaders['train']))

    train(args, model, dataloaders['train'], criterion, optimizer, clr)
Example #6
    def train(self, train_data, valid_data, seed):
        seed_everything(seed)
        print("model summary info: ")
        for step, (input_ids, input_mask, segment_ids,
                   label_ids) in enumerate(train_data):
            input_ids = input_ids.to(self.device)
            input_mask = input_mask.to(self.device)
            segment_ids = segment_ids.to(self.device)
            summary(self.model,
                    *(input_ids, input_mask, segment_ids),
                    show_input=True)
            break

        # ***************************************************************
        for epoch in range(self.start_epoch, self.start_epoch + self.epochs):
            self.logger.info(f"Epoch {epoch}/{self.epochs}")
            train_log = self.train_epoch(train_data)
            valid_log = self.valid_epoch(valid_data)

            logs = dict(train_log, **valid_log)
            show_info = f'\nEpoch: {epoch} - ' + "-".join(
                [f' {key}: {value:.4f} ' for key, value in logs.items()])
            self.logger.info(show_info)

            # save
            if self.training_monitor:
                self.training_monitor.epoch_step(logs)

            # save model
            if self.model_checkpoint:
                state = self.save_info(epoch, best=logs['valid_loss'])
                self.model_checkpoint.bert_epoch_step(
                    current=logs[self.model_checkpoint.monitor], state=state)

            # early_stopping
            if self.early_stopping:
                self.early_stopping.epoch_step(
                    epoch=epoch, current=logs[self.early_stopping.monitor])
                if self.early_stopping.stop_training:
                    break
Example #7
def submit(args, log):
    df = pd.read_csv(args.df_path)
    df['Image'] = df.Image_Label.map(lambda v: v[:v.find('_')])
    print(df.head())

    model = get_model(args).cuda()
    last_epoch, step = checkpoint.load_checkpoint(args,
                                                  model,
                                                  checkpoint=args.initial_ckpt)
    log.write(f'Loaded checkpoint from {args.initial_ckpt} @ {last_epoch}\n')

    dataloader = get_dataloader(args.data_dir, df, 'test', args.pretrain,
                                args.batch_size)
    seed_everything()

    # inference
    test_ids, mask_predictions = inference_submit(model, dataloader,
                                                  args.tta_augment)

    assert len(test_ids) == mask_predictions.shape[0]

    ids = []
    rles = []
    for i, image_id in tqdm.tqdm(enumerate(test_ids), total=len(test_ids)):
        predictions = mask_predictions[i]
        for cls_idx in range(4):
            prediction = predictions[cls_idx, :, :]
            H, W = prediction.shape
            assert H == 350 and W == 525
            rle_encoded = mask2rle(prediction)
            assert np.all(rle2mask(H, W, rle_encoded) == prediction)
            ids.append(f'{image_id}_{LABEL_LIST[cls_idx]}')
            rles.append(rle_encoded)

    df_submission = pd.DataFrame({'Image_Label': ids, 'EncodedPixels': rles})
    df_submission.to_csv(args.sub_name, index=False)
    print(df_submission.head())
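Example #7 relies on `mask2rle`/`rle2mask` helpers that are not shown and asserts that they round-trip exactly. A minimal sketch of the run-length encoding conventionally used for these submissions, as an assumption rather than the project's actual helpers:

import numpy as np

def mask2rle(mask):
    # Encode a binary mask as a run-length string (column-major pixel order, 1-indexed starts).
    pixels = np.concatenate([[0], mask.T.flatten(), [0]])
    runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
    runs[1::2] -= runs[::2]
    return ' '.join(str(x) for x in runs)

def rle2mask(height, width, rle):
    # Decode a run-length string back into a (height, width) binary mask.
    mask = np.zeros(height * width, dtype=np.uint8)
    if rle:
        values = list(map(int, rle.split()))
        for start, length in zip(values[0::2], values[1::2]):
            mask[start - 1:start - 1 + length] = 1
    return mask.reshape((width, height)).T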
Example #8
def train(args, logger=None):
    from utils.utils import create_loaders, seed_everything, CIFAR_NORMALIZATION
    import utils.config as cf
    import os
    import torch.backends.cudnn as cudnn
    import time

    seed_everything(args.seed)

    normalize = None
    if args.normalize == "meanstd":
        from torchvision import transforms
        normalize = transforms.Normalize(cf.mean["cifar10"], cf.std["cifar10"])
    elif args.normalize == "default":
        normalize = CIFAR_NORMALIZATION

    # Hyper Parameter settings
    use_cuda = torch.cuda.is_available()
    best_acc = 0
    start_epoch, num_epochs = cf.start_epoch, cf.num_epochs

    # Data Upload
    trainloader, testloader = create_loaders(args, augment=not args.no_augment, normalize=normalize)

    # Model
    print('\n[Phase 2] : Model setup')
    net = Wide_ResNet(**vars(args))
    file_name = os.path.join(args.output, "%s/%s/model_%i.pt" % (args.dataset, "wide_resnet", args.seed))
    net.apply(conv_init)

    if use_cuda:
        net.cuda()
        net = torch.nn.DataParallel(net, device_ids=range(torch.cuda.device_count()))
        cudnn.benchmark = True

    criterion = nn.CrossEntropyLoss()

    if args.optimizer == "adam":
        from torch.optim import Adam
        optimizer = Adam(net.parameters(), lr=args.lr)
    elif args.optimizer == "sgd":
        from torch.optim import SGD
        optimizer = None
    elif args.optimizer == "sls":
        from utils.sls import Sls
        n_batches_per_epoch = len(trainloader)
        print(n_batches_per_epoch)
        optimizer = Sls(net.parameters(), n_batches_per_epoch=n_batches_per_epoch)
    else:
        raise ValueError("Only supports adam or sgd for optimizer.")

    # Training
    def train(epoch, optimizer=None):
        net.train()
        net.training = True
        train_loss = 0
        correct = 0
        total = 0
        if args.optimizer == "sgd":
            optimizer = SGD(net.parameters(), lr=cf.learning_rate(args.lr, epoch), momentum=0.9, weight_decay=5e-4)

        print('\n=> Training Epoch #%d, LR=%.4f' %(epoch, cf.learning_rate(args.lr, epoch)))
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            if use_cuda:
                inputs, targets = inputs.cuda(), targets.cuda() # GPU settings
            optimizer.zero_grad()
            inputs, targets = Variable(inputs), Variable(targets)
            outputs = net(inputs)               # Forward Propagation
            loss = criterion(outputs, targets)  # Loss

            if args.optimizer == "sls":
                def closure():
                    output = net(inputs)
                    loss = criterion(output, targets)
                    return loss
                optimizer.step(closure)
            else:
                loss.backward()
                optimizer.step()

            train_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += targets.size(0)
            correct += predicted.eq(targets.data).cpu().sum()

            sys.stdout.write('\r')
            sys.stdout.write('| Epoch [%3d/%3d] Iter[%3d/%3d]\t\tLoss: %.4f Acc@1: %.3f%%'
                    %(epoch, num_epochs, batch_idx+1,
                        len(trainloader), loss.item(), 100.*correct/total))
            sys.stdout.flush()

            if logger is not None:
                logger.write(dict(train_accuracy=100. * correct / total, loss=loss.item()), epoch)

    def test(epoch, best_acc=0):
        net.eval()
        net.training = False
        test_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for batch_idx, (inputs, targets) in enumerate(testloader):
                if use_cuda:
                    inputs, targets = inputs.cuda(), targets.cuda()
                inputs, targets = Variable(inputs), Variable(targets)
                outputs = net(inputs)
                loss = criterion(outputs, targets)

                test_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                total += targets.size(0)
                correct += predicted.eq(targets.data).cpu().sum()

            # Save checkpoint when best model
            acc = 100.*correct/total
            if logger is None:
                print("\n| Validation Epoch #%d\t\t\tLoss: %.4f Acc@1: %.2f%%" %(epoch, loss.item(), acc))
            else:
                logger.write(dict(test_loss=loss.item(), test_accuracy=acc), epoch)
            
            if acc > best_acc:
                print('| Saving Best model...\t\t\tTop1 = %.2f%%' %(acc))
                state = {
                        'net':net.module if use_cuda else net,
                        'acc':acc,
                        'epoch':epoch,
                }
                dirname = os.path.dirname(file_name)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
                torch.save(net.state_dict(), file_name)
                best_acc = acc
        return best_acc

    print('\n[Phase 3] : Training model')
    print('| Training Epochs = ' + str(num_epochs))
    print('| Initial Learning Rate = ' + str(args.lr))

    elapsed_time = 0
    for epoch in range(start_epoch, start_epoch+num_epochs):
        start_time = time.time()

        train(epoch, optimizer)
        best_acc = test(epoch, best_acc)

        epoch_time = time.time() - start_time
        elapsed_time += epoch_time
        print('| Elapsed time : %d:%02d:%02d'  %(cf.get_hms(elapsed_time)))

    print('\n[Phase 4] : Testing model')
    print('* Test results : Acc@1 = %.2f%%' %(best_acc))
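Example #8 delegates its schedule and timing helpers to a `utils.config` module (`cf`). A plausible sketch of `cf.learning_rate` and `cf.get_hms`, assuming the usual step-decay recipe for CIFAR Wide-ResNet training; the project's actual config may use different milestones:

def learning_rate(init, epoch):
    # Step decay (assumed milestones): scale the initial LR by 0.2 at epochs 60, 120, 160.
    if epoch > 160:
        return init * 0.2 ** 3
    elif epoch > 120:
        return init * 0.2 ** 2
    elif epoch > 60:
        return init * 0.2
    return init

def get_hms(seconds):
    # Split a duration in seconds into (hours, minutes, seconds).
    m, s = divmod(int(seconds), 60)
    h, m = divmod(m, 60)
    return h, m, s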
Example #9
import glob, pickle, time, datetime, argparse, gc
import pandas as pd
from sklearn.model_selection import KFold, TimeSeriesSplit

from utils.utils import load_data, load_from_feather, reduce_mem_usage, seed_everything
from model.Model import LGBMModel_group

seed_everything(0)

# Parser  ################################################################
parser = argparse.ArgumentParser()
parser.add_argument('-exp', '--expname')
parser.add_argument('-obj', '--objective', default='regression', choices=['regression', 'poisson', 'tweedie'])
parser.add_argument('-lr', '--learningrate', type=float, default=0.01)
parser.add_argument('-subs', '--subsample', type=float, default=1.0)
parser.add_argument('-featfrac', '--featurefraction', type=float, default=1.0)
parser.add_argument('-cv', '--crossval', default='kfold', choices=['kfold', 'time', 'none'])
parser.add_argument('-nsplit', '--nsplit', type=int, default=4)
parser.add_argument('-num', '--num_boost_round', type=int, default=1000)
parser.add_argument('-early', '--early_stopping_rounds', type=int, default=10)
parser.add_argument('-drate', '--data_rate', type=float, default=0.1)
parser.add_argument('-grp', '--group', default='store', choices=['store', 'cat', 'state'])
parser.add_argument('-prep', '--preprocess', action='store_true')
parser.add_argument('-post', '--postprocess', action='store_true')
args = parser.parse_args()


# Parameter  #############################################################
params = {
    'boosting_type': 'gbdt',
    'objective': args.objective,
Example #10
def main(cfg: DictConfig):
    print('Nishika Second-hand Apartment Price Training')
    cur_dir = hydra.utils.get_original_cwd()
    os.chdir(cur_dir)
    data_dir = './input'

    seed_everything(cfg.data.seed)

    experiment = Experiment(api_key=cfg.exp.api_key,
                            project_name=cfg.exp.project_name,
                            auto_output_logging='simple',
                            auto_metric_logging=False)

    experiment.log_parameters(dict(cfg.data))

    # Config  ####################################################################################
    del_tar_col = ['取引時点']
    id_col = 'ID'
    tar_col = '取引価格(総額)_log'
    g_col = 'year'
    criterion = MAE
    cv = KFold(n_splits=cfg.data.n_splits,
               shuffle=True,
               random_state=cfg.data.seed)
    # cv = GroupKFold(n_splits=5)

    # Load Data  ####################################################################################
    if cfg.exp.use_pickle:
        # load from pickle
        df = unpickle('./input/data.pkl')

    else:
        df = load_data(data_dir,
                       sampling=cfg.data.sampling,
                       seed=cfg.data.seed,
                       id_col=id_col,
                       target_col=tar_col)
        # Preprocessing
        print('Preprocessing')
        df = preprocessing(df, cfg)

        # save as pickle
        to_pickle('./input/data.pkl', df)
        try:
            experiment.log_asset(file_data='./input/data.pkl',
                                 file_name='data.pkl')
        except Exception:
            pass

    features = [c for c in df.columns if c not in del_tar_col]

    # Model  ####################################################################################
    model = None
    if cfg.exp.model == 'lgb':
        model = LGBMModel(dict(cfg.lgb))
    elif cfg.exp.model == 'cat':
        model = CatBoostModel(dict(cfg.cat))

    # Train & Predict  ##############################################################################
    trainer = Trainer(model, id_col, tar_col, g_col, features, cv, criterion,
                      experiment)
    trainer.fit(df)
    trainer.predict(df)
    trainer.get_feature_importance()
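Example #10 caches the preprocessed frame with `to_pickle`/`unpickle` helpers that are not shown. A minimal sketch matching the call signatures above, as an assumption rather than the project's actual utilities:

import pickle

def to_pickle(path, obj):
    # Serialize obj to path with pickle.
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def unpickle(path):
    # Load a pickled object back from path.
    with open(path, 'rb') as f:
        return pickle.load(f)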
Example #11
ARCH = "bert"
SEED = 2323
FOLD_ID = 2

TEST_PATH = "/input/input.txt"
OUTPUT_PATH = "/output/output.txt"
# TEST_PATH = "datasets/input.txt"  #
# TEST_PATH = "datasets/SCM_5k.json"
TEST_PATH = f"datasets/bigfolds/fold{FOLD_ID}_valid.txt"  #
OUTPUT_PATH = "output/output.txt"  #
LOG_DIR = "output/logs"

MAX_SEQ_LENGTH = 445
BATCH_SIZE = 16

seed_everything(SEED)
logger = init_logger(log_name=ARCH, log_dir=LOG_DIR)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

print("---------- Bert Eval ... ----------")
start_time = timer()

# bert_config.json, pytorch_model.bin, vocab.txt in ckpts
BERT_MODEL_PATH = "output/ckpts6920"
BERT_VOCAB_PATH = "output/ckpts6920/vocab.txt"

test_dataset = CAILDataset(
    data_path=TEST_PATH,
    max_seq_len=MAX_SEQ_LENGTH,
    vocab_path=BERT_VOCAB_PATH,
    seed=SEED,
Example #12
from utils.utils import seed_everything
seed_everything()
import warnings
warnings.filterwarnings("ignore")
import torch
import torch.nn as nn
import torchvision
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from albumentations.pytorch import ToTensor
from catalyst.data.sampler import BalanceClassSampler
from torch.utils.data.sampler import SequentialSampler
import torch.nn.functional as F

from data_loader.alaska import Alaska
from data_loader.generator import Alaska2Dataset
from model.network import Net, AttentionNet
from utils.metrics import alaska_weighted_auc
from utils.data_augmentation import get_transforms
from data_loader.dataset_retriever import DatasetRetriever
from trainer.fitter import Fitter

import json



config_json = "./config/baseline.json"
with open(config_json) as f:
    config = json.load(f)
Example #13
def main():
    seed_everything(3)
    print("done")
Example #14
def main():
    logger = logger_factory(log_name=config['model']['arch'],
                            log_dir=config['output']['log_dir'])
    logger.info(f"seed is {config['train']['seed']}")
    n_gpu = torch.cuda.device_count()
    logger.info(f"Cuda device count:{n_gpu}")
    device = f"cuda: {config['train']['n_gpu'][0] if len(config['train']['n_gpu']) else 'cpu'}"
    seed_everything(seed=config['train']['seed'], device=device)
    logger.info('starting to load data from disk')
    torch.cuda.empty_cache()

    model_state_dict = None

    processor = MultiLabelTextProcessor(config['data']['data_path'])

    label_list, num_labels = load_labels(processor)
    logger.info(f"Labels loaded. Count: {num_labels}")
    print(label_list)

    tokenizer = BertTokenizer.from_pretrained(
        config['bert']['path'], do_lower_case=config['train']['do_lower_case'])

    train_examples = None
    num_train_steps = None
    if config['train']['do_train']:
        train_examples = processor.get_train_examples(
            config['data']['data_path'],
            logger=logger,
            size=config['train']['train_size'])
        num_train_steps = int(
            len(train_examples) / config['train']['train_batch_size'] /
            config['train']['gradient_accumulation_steps'] *
            config['train']['num_train_epochs'])

    logger.info(f"Training examples:{len(train_examples)}")
    logger.info(f"Training steps:{num_train_steps}")

    model = get_model(model_state_dict, num_labels)

    logger.info(f"fp16: {config['train']['fp16']}")
    if config['train']['fp16']:
        model.half()

    model.to(device)

    logger.info(f"Model loaded: {config['bert']['path']}")

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    t_total = num_train_steps

    if config['train']['fp16']:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=config['train']['learning_rate'],
                              bias_correction=False,
                              max_grad_norm=1.0)
        if config['train']['loss_scale'] == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(
                optimizer, static_loss_scale=config['train']['loss_scale'])

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=config['train']['learning_rate'],
                             warmup=config['train']['warmup_proportion'],
                             t_total=t_total)

    scheduler = CyclicLR(optimizer,
                         base_lr=2e-5,
                         max_lr=5e-5,
                         step_size=2500,
                         last_batch_iteration=0)

    eval_examples = processor.get_dev_examples(
        config['data']['data_path'],
        filename='training.csv',
        size=config['train']['val_size'])
    logger.info(f"Evaluation data loaded. Len: {len(eval_examples)}")
    train_features = convert_examples_to_features(
        train_examples, label_list, config['train']['max_seq_length'],
        tokenizer, logger)
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", config['train']['train_batch_size'])
    logger.info("  Num steps = %d", num_train_steps)
    all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_ids for f in train_features],
                                 dtype=torch.float)
    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                               all_label_ids)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(
        train_data,
        sampler=train_sampler,
        batch_size=config['train']['train_batch_size'])

    # Freeze BERT layers for 1 epoch
    # model.module.freeze_bert_encoder()
    # fit(1)
    model.unfreeze_bert_encoder()

    fit(model, device, n_gpu, optimizer, train_dataloader, logger, t_total,
        eval_examples, label_list, num_labels, tokenizer)

    # Save a trained model
    model_to_save = model.module if hasattr(
        model, 'module') else model  # Only save the model it-self
    output_model_file = os.path.join(config['bert']['cache'],
                                     "finetuned_pytorch_model.bin")
    torch.save(model_to_save.state_dict(), output_model_file)
    logger.info(f"Model saved! Location: {output_model_file}")

    if False:  # flip to True to reload the fine-tuned model and run eval/predict
        # Load a trained model that you have fine-tuned
        model_state_dict = torch.load(output_model_file)
        model = BertForMultiLabelSequenceClassification.from_pretrained(
            config['bert']['path'],
            num_labels=num_labels,
            state_dict=model_state_dict)
        model.to(device)

        eval(model, device, logger, eval_examples, label_list, num_labels,
             config['train']['max_seq_length'], tokenizer)

        result = predict(model, device, config['data']['data_path'], logger,
                         label_list, tokenizer)
        print(result.shape)
        result.to_csv(config['data']['data_path'] / 'prediction.csv',
                      index=None)
Example #15
def main():

    parser = ArgumentParser()
    parser.add_argument("--pretrain", default="bert", type=str)
    parser.add_argument("--do_data", action="store_true")
    parser.add_argument("--do_train", action="store_true")
    parser.add_argument("--do_test", action="store_true")
    parser.add_argument("--save_best", action="store_true")
    parser.add_argument("--do_lower_case", action='store_true')
    parser.add_argument("--data_name", default="law", type=str)
    parser.add_argument("--train_data_num", default=0, type=int)
    parser.add_argument("--test_data_num", default=0, type=int)
    parser.add_argument("--epochs", default=5, type=int)
    parser.add_argument("--resume_path", default="", type=str)
    parser.add_argument("--mode", default="min", type=str)
    parser.add_argument("--monitor", default="valid_loss", type=str)
    parser.add_argument("--valid_size", default=0.2, type=float)
    parser.add_argument("--local_rank", type=int, default=-1)
    parser.add_argument("--sorted",
                        default=1,
                        type=int,
                        help="1 : True  0:False")
    parser.add_argument("--n_gpu",
                        type=str,
                        default="0",
                        help='"0,1,.." or "0" or "" ')
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
    parser.add_argument("--train_batch_size", default=8, type=int)
    parser.add_argument("--eval_batch_size", default=8, type=int)
    parser.add_argument("--train_max_seq_len", default=256, type=int)
    parser.add_argument("--eval_max_seq_len", default=256, type=int)
    parser.add_argument("--loss_scale", type=float, default=0)
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
    )
    parser.add_argument("--weight_decay", default=0.01, type=float)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float)
    parser.add_argument("--grad_clip", default=1.0, type=float)
    parser.add_argument("--learning_rate", default=2e-5, type=float)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--fp16", action="store_true")
    parser.add_argument("--fp16_opt_level", type=str, default="O1")

    args = parser.parse_args()

    try:
        pipeline = piop.read_yml("pipeline.yml")
        pl = AttrDict(pipeline["pipeline"])
        config["preprocessor"] = pl.preprocessor
        config["pretrain"] = pl.pretrain
        config["postprocessor"] = pl.postprocessor
        config["classifier"] = pl.classifier
    except Exception as e:
        raise PipelineReadError from e

    config["checkpoint_dir"] = config["checkpoint_dir"] / config["classifier"]
    config["checkpoint_dir"].mkdir(exist_ok=True)

    torch.save(args, config["checkpoint_dir"] / "training_args.bin")
    seed_everything(args.seed)
    init_logger(log_file=config["log_dir"] /
                "{}.log".format(config["classifier"]))

    logger.info("Training/evaluation parameters %s", args)

    if args.do_data:
        from dataio.task_data import TaskData
        data = TaskData(args.train_data_num)
        labels, sents = data.read_data(
            raw_data_path=config["raw_data_path"],
            data_dir=config["data_dir"],
            preprocessor=Preprocessor(config["preprocessor"])(
                stopwords_path=config["stopwords_path"],
                userdict_path=config["userdict_path"]),
            is_train=True)
        data.train_val_split(X=sents,
                             y=labels,
                             valid_size=args.valid_size,
                             data_dir=config["data_dir"],
                             data_name=args.data_name)
        if config["pretrain"] == "Nopretrain":
            data.build_vocab(config["nopretrain_vocab_path"],
                             sents,
                             min_count=5)

    if args.do_train:
        train(args)

    if args.do_test:
        test(args)
Example #16
from utils.utils import seed_everything
from utils.prep_utils import mag_normalize
import numpy as np
import random
import torch
# from SpecAug.sparse_image_warp_pytorch import sparse_image_warp

seed_everything(42)

def mix_db(x,y,db):
    E_x = np.mean(x**2)
    E_y = np.mean(y**2)
    
    a = E_x/(E_y*(10**(db/10)))
    lam = 1/(1+a)
    
    return lam*x + (1-lam)*y
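
# Illustrative usage sketch of mix_db (an added example, not part of the original snippet):
# it blends two signals with a weight lam computed from their mean energies and the db argument.
# The arrays below are hypothetical stand-ins for real audio.
rng = np.random.default_rng(0)
clean = rng.standard_normal(16000)
noise = rng.standard_normal(16000)
mixed = mix_db(clean, noise, db=5)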

# def time_warp(spec, W=5):
#     num_rows = spec.shape[1] ##F
#     spec_len = spec.shape[2] ##T

#     y = num_rows // 2
#     horizontal_line_at_ctr = spec[0][y]
#     # assert len(horizontal_line_at_ctr) == spec_len

#     point_to_warp = horizontal_line_at_ctr[random.randrange(W, spec_len-W)]
#     # assert isinstance(point_to_warp, torch.Tensor)

#     # Uniform distribution from (0,W) with chance to be up to W negative
#     dist_to_warp = random.randrange(-W, W)
Example #17
    parser.add_argument('--debug', default=False, action='store_true',
                        help='Debug')
    parser.add_argument('--opus', default=False, action='store_true',
                        help='Change AMINER File Path for Opus')
    parser.add_argument('--debug_name', type=str, default="one_maml_graph",
                        help='where to save/load')
    parser.add_argument('--namestr', type=str, default='Meta-Graph', \
            help='additional info in output filename to describe experiments')
    parser.add_argument('--study_uid', type=str, default='')
    parser.add_argument('--gating', type=str, default=None, choices=[None, 'signature', 'weights', 'signature_cond', 'weights_cond'])
    parser.add_argument('--layer_norm', default=False, action='store_true',
                        help='use layer norm')
    args = parser.parse_args()

    ''' Fix Random Seed '''
    seed_everything(args.seed)
    # Check if settings file
    if os.path.isfile("settings.json"):
        with open('settings.json') as f:
            data = json.load(f)
        args.comet_apikey = data["apikey"]
        args.comet_username = data["username"]
        args.wandb_apikey = data["wandbapikey"]

    args.dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if args.dataset=='PPI':
        project_name = 'meta-graph-ppi'
    elif args.dataset=='REDDIT-MULTI-12K':
        project_name = "meta-graph-reddit"
    elif args.dataset=='FIRSTMM_DB':
        project_name = "meta-graph-firstmmdb"
Example #18
def main():
    args = get_argparse().parse_args()
    print(
        json.dumps(vars(args),
                   sort_keys=True,
                   indent=4,
                   separators=(', ', ': '),
                   ensure_ascii=False))
    init_logger(log_file="./log/{}.log".format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    seed_everything(args.seed)

    # create the output directory
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    # device
    args.device = torch.device(
        "cuda:0" if torch.cuda.is_available() else "cpu")

    # tokenizer
    tokenizer = BertTokenizerFast.from_pretrained(args.model_name_or_path)

    # Dataset & Dataloader
    train_dataset = MrcDataset(args,
                               json_path="./data/train.json",
                               tokenizer=tokenizer)
    eval_dataset = MrcDataset(args,
                              json_path="./data/dev.json",
                              tokenizer=tokenizer)
    # eval_dataset, test_dataset = random_split(eval_dataset,
    #                                           [round(0.5 * len(eval_dataset)),
    #                                            len(eval_dataset) - round(0.5 * len(eval_dataset))],
    #                                           generator=torch.Generator().manual_seed(42))
    train_iter = DataLoader(train_dataset,
                            shuffle=True,
                            batch_size=args.per_gpu_train_batch_size,
                            collate_fn=collate_fn,
                            num_workers=10)
    eval_iter = DataLoader(eval_dataset,
                           shuffle=False,
                           batch_size=args.per_gpu_eval_batch_size,
                           collate_fn=collate_fn,
                           num_workers=10)
    # test_iter = DataLoader(test_dataset,
    #                        shuffle=False,
    #                        batch_size=args.per_gpu_eval_batch_size,
    #                        collate_fn=collate_fn,
    #                        num_workers=10)
    logger.info("The nums of the train_dataset examples is {}".format(
        len(train_dataset.examples)))
    logger.info("The nums of the train_dataset features is {}".format(
        len(train_dataset)))
    logger.info("The nums of the eval_dataset examples is {}".format(
        len(eval_dataset.examples)))
    logger.info("The nums of the eval_dataset features is {}".format(
        len(eval_dataset)))

    # model
    model = MRC_model(args.model_name_or_path)
    model.to(args.device)

    # training
    best_f1 = 0
    early_stop = 0
    for epoch, _ in enumerate(range(int(args.num_train_epochs))):
        model.train()
        train(args, train_iter, model)
        # evaluate on the dev set after each epoch
        eval_f1, eval_EM = evaluate(args, eval_iter, model, prefix="eval")
        logger.info("The F1-score is {}, The EM-score is {}".format(
            eval_f1, eval_EM))
        if eval_f1 > best_f1:
            early_stop = 0
            best_f1 = eval_f1
            logger.info(
                "the best eval f1 is {:.4f}, saving model !!".format(best_f1))
            best_model = copy.deepcopy(
                model.module if hasattr(model, "module") else model)
            torch.save(best_model.state_dict(),
                       os.path.join(args.output_dir, "best_model.pkl"))
        else:
            early_stop += 1
            if early_stop == args.early_stop:
                logger.info("Early stop in {} epoch!".format(epoch))
                break