Example #1
    def __init__(self, settings):
        super(Trainer, self).__init__()
        self.settings = settings
        self.phase = settings.cmd
        self.batch_size = settings.batch_size
        self.data_dir = settings.data_dir
        self.list_dir = settings.list_dir
        self.checkpoint = settings.resume
        self.load_checkpoint = (len(self.checkpoint) > 0)
        self.num_epochs = settings.num_epochs
        self.lr = float(settings.lr)
        self.save = settings.save_on or settings.out_dir
        self.from_pause = self.settings.continu
        self.path_ctrl = settings.global_path
        self.path = self.path_ctrl.get_path

        log_dir = '' if settings.log_off else self.path_ctrl.get_dir('log')
        self.logger = Logger(scrn=True, log_dir=log_dir, phase=self.phase)

        for k, v in sorted(settings.__dict__.items()):
            self.logger.show("{}: {}".format(k, v))

        self.start_epoch = 0
        self._init_max_acc = 0.0

        self.model = None
        self.criterion = None
Example #2
def set_gpc_and_logger(args):
    gpc = OutPathGetter(root=os.path.join(args.exp_dir, args.tag),
                        suffix=args.suffix)

    log_dir = '' if args.log_off else gpc.get_dir('log')
    logger = Logger(scrn=True, log_dir=log_dir, phase=args.cmd)

    register('GPC', gpc)
    register('LOGGER', logger)

    return gpc, logger
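A minimal call-site sketch for this helper, assuming argparse flags matching the attributes it reads (exp_dir, tag, suffix, log_off, cmd); the flag names and defaults below are illustrative, and OutPathGetter, Logger and register are assumed importable as in the snippet.

import argparse

# Hypothetical CLI wiring (flag names are assumptions, not from the source).
parser = argparse.ArgumentParser()
parser.add_argument('--exp-dir', default='./exp')
parser.add_argument('--tag', default='baseline')
parser.add_argument('--suffix', default='')
parser.add_argument('--log-off', action='store_true')
parser.add_argument('--cmd', default='train')
args = parser.parse_args()

gpc, logger = set_gpc_and_logger(args)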
Example #3
    def __init__(self,
                 data_dir,
                 ckp_path,
                 save_lr=False,
                 list_dir='',
                 out_dir='./',
                 log_dir=''):
        super(Predictor, self).__init__()
        self.data_dir = data_dir
        self.list_dir = list_dir
        self.out_dir = out_dir
        self.checkpoint = ckp_path
        self.save_lr = save_lr

        self.logger = Logger(scrn=True, log_dir=log_dir, phase='test')

        self.model = None
Example #4
    def __init__(self,
                 model=None,
                 mode='folder',
                 save_dir=None,
                 scrn=True,
                 log_dir=None,
                 cuda_off=False):

        self.save_dir = save_dir
        self.output = None

        if not cuda_off and torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')

        assert model is not None, "The model must be assigned"
        self.model = self._model_init(model)

        if mode not in Predictor.modes:
            raise NotImplementedError

        self.logger = Logger(scrn=scrn, log_dir=log_dir, phase='predict')

        if mode == 'dataloader':
            self._predict = partial(self._predict_dataloader,
                                    dataloader=None,
                                    save_dir=save_dir)
        elif mode == 'folder':
            # self.suffix = ['.jpg', '.png', '.bmp', '.gif', '.npy']  # supported image formats
            self._predict = partial(self._predict_folder, save_dir=save_dir)
        elif mode == 'list':
            self._predict = partial(self._predict_list, save_dir=save_dir)
        elif mode == 'file':
            self._predict = partial(self._predict_file, save_dir=save_dir)
        elif mode == 'data':
            self._predict = partial(self._predict_data, save_dir=save_dir)
        else:
            raise NotImplementedError
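The constructor above binds a mode-specific handler once with functools.partial, so every later call goes through a single self._predict entry point. A toy sketch of the same pattern, under hedged assumptions (Dispatcher and its methods are illustrative, not from the source):

from functools import partial

class Dispatcher:
    """Toy model of the mode-to-handler binding used above."""
    modes = ('shout', 'plain')

    def __init__(self, mode='plain'):
        if mode not in Dispatcher.modes:
            raise NotImplementedError(mode)
        # Freeze the mode-specific arguments once; callers then invoke
        # self.handle(text) without re-checking the mode on every call.
        if mode == 'shout':
            self.handle = partial(self._emit, transform=str.upper)
        else:
            self.handle = partial(self._emit, transform=str)

    def _emit(self, text, transform):
        return transform(text)

print(Dispatcher('shout').handle('hi'))  # -> 'HI'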
Example #5
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
# @Time    : 2019/9/9 9:55
# @Author  : peng.wang
# @Email   : [email protected]
# @FileName: model.py
# @ProjectName :Facility_Location_FangTai
"""

from gurobipy import *
from utils.misc import Logger
import pandas as pd
# define the log file
log = Logger(log_path='../log').logger

# define the facility location problem

YEAR_DAY = 365


class FacilityLocation(object):
    """
    This class consists of the attributes needed for problem construction,
    some utility functions for post-processing, and one key function for
    building the detailed model.
    """
    def __init__(self, data, config):
        """

        :param data: data class that provides all the data used
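Examples #5, #6, #10 and #13 all use the Logger(log_path=...).logger idiom: the wrapper configures logging and exposes a plain logging.Logger. A minimal sketch of such a wrapper, assuming that behavior (the real utils.misc.Logger may differ):

import logging
import os

class FileLogger:
    """Hypothetical stand-in for utils.misc.Logger as used in these examples."""
    def __init__(self, log_path='log'):
        os.makedirs(log_path, exist_ok=True)
        self.logger = logging.getLogger('app')
        self.logger.setLevel(logging.INFO)
        if not self.logger.handlers:  # avoid duplicate handlers on re-creation
            handler = logging.FileHandler(os.path.join(log_path, 'app.log'))
            handler.setFormatter(
                logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
            self.logger.addHandler(handler)

# Usage mirroring Example #5:
# log = FileLogger(log_path='../log').logger
# log.info('model build started')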
Example #6
import os
import warnings

import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
from sklearn.svm import SVR
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import GridSearchCV
#
from utils.misc import Logger
from utils.util import generate_cutoffs
from utils.misc import save_model_to_file, mean_abs_percentage_error, xgb_mape

log = Logger(log_path=os.path.join(os.path.dirname(os.path.dirname(__file__)),
                                   'log')).logger
warnings.filterwarnings("ignore")

xgb_installed = False
lgt_installed = False
# Detect each optional dependency independently, so a missing xgboost
# does not also disable lightgbm.
try:
    import xgboost as xgb
    xgb_installed = True
except ImportError:
    pass
try:
    import lightgbm as lgt
    lgt_installed = True
except ImportError:
    pass


class TrainModel(BaseEstimator):
Example #7
# Note: this excerpt assumes root_dir, postfix, BATCH_SIZE, MAX_EPOCH,
# DATA_LIST_PATH and Logger are defined earlier in the original file.
import argparse
import math
import os
import sys

RESTORE_FROM = "/data/AutoPheno/green/200527/PatchNet/snapshots-fb/LEAF_UNET_B0064_S010700.pth"
SAVE_PRED_EVERY = 1000
SNAPSHOT_DIR = root_dir + 'PatchNet/snapshots'+postfix
IMGSHOT_DIR = root_dir + 'PatchNet/imgshots'+postfix
WEIGHT_DECAY = 0.0005
NUM_EXAMPLES_PER_EPOCH = 13862
NUM_STEPS_PER_EPOCH = math.ceil(NUM_EXAMPLES_PER_EPOCH / float(BATCH_SIZE))
MAX_ITER = max(NUM_EXAMPLES_PER_EPOCH * MAX_EPOCH + 1,
               NUM_STEPS_PER_EPOCH * BATCH_SIZE * MAX_EPOCH + 1)
if not os.path.exists(SNAPSHOT_DIR):
    os.makedirs(SNAPSHOT_DIR)
if not os.path.exists(IMGSHOT_DIR):
    os.makedirs(IMGSHOT_DIR)

LOG_PATH = SNAPSHOT_DIR + "/B"+format(BATCH_SIZE, "04d")+"E"+format(MAX_EPOCH, "04d")+".log"
sys.stdout = Logger(LOG_PATH, sys.stdout)
print(DATA_LIST_PATH)
print("num of epoch:", MAX_EPOCH)
print("RESTORE_FROM:", RESTORE_FROM)
print(NUM_EXAMPLES_PER_EPOCH)


def get_arguments():
    """Parse all the arguments provided from the CLI.

    Returns:
      The parsed arguments as an argparse.Namespace.
    """
    parser = argparse.ArgumentParser(description="UNet Network")
    parser.add_argument("--set-start", default=False)
    parser.add_argument("--start-step", default=0, type=int)
Example #8
def run(try_num, config):
    output_dir = f'./dae-out-{try_num}'

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    args = get_args()

    train_features = pd.read_csv('../input/lish-moa/train_features.csv')
    test_features = pd.read_csv('../input/lish-moa/test_features.csv')

    if args.debug:
        train_features = train_features.loc[:500]
        config.update(dict(n_epochs=3, n_folds=2))

    all_features = pd.concat([train_features,
                              test_features]).reset_index(drop=True)
    g_features_columns = [
        col for col in all_features.columns if col.startswith('g-')
    ]
    c_features_columns = [
        col for col in all_features.columns if col.startswith('c-')
    ]
    feature_columns = g_features_columns + c_features_columns
    n_features = len(feature_columns)

    kfold = MultilabelStratifiedKFold(n_splits=config.n_folds,
                                      random_state=42,
                                      shuffle=True)
    logger = Logger()

    for fold_index, (train_idx, valid_idx) in enumerate(
            kfold.split(all_features.values, all_features.values)):
        print('Fold: ', fold_index + 1, flush=True)

        x_train = all_features.loc[train_idx]
        x_valid = all_features.loc[valid_idx]

        model = new_autoencoder(config.model_kind,
                                n_features=n_features).to(DEVICE)
        criterion = nn.MSELoss()
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=config.learning_rate,
                                     weight_decay=1e-5)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                               mode='min',
                                                               factor=0.1,
                                                               patience=3,
                                                               eps=1e-4,
                                                               verbose=True)
        early_stopping = EarlyStopping(patience=10)
        best_score = np.inf

        for epoch in range(config.n_epochs):
            dataset = DaeDataset(x_train,
                                 feature_columns,
                                 noise_ratio=config.noise_ratio)
            dataloader = DataLoader(dataset,
                                    batch_size=config.batch_size,
                                    shuffle=True)

            train_loss = loop_train(model, criterion, dataloader, optimizer)

            dataset = DaeDataset(x_valid,
                                 feature_columns,
                                 noise_ratio=config.noise_ratio)
            dataloader = DataLoader(dataset,
                                    batch_size=config.valid_batch_size,
                                    shuffle=False)
            valid_loss, _ = loop_valid(model, criterion, dataloader)

            scheduler.step(valid_loss)

            logger.update({
                'fold': fold_index,
                'epoch': epoch + 1,
                'train_loss': train_loss,
                'val_loss': valid_loss
            })
            print(
                f'epoch {epoch + 1}/{config.n_epochs}  -  train_loss: {train_loss:.5f}  -  '
                + f'valid_loss: {valid_loss:.5f}',
                flush=True)

            if valid_loss < best_score:
                best_score = valid_loss
                torch.save(model.state_dict(),
                           f'./{output_dir}/dae_fold_weight_{fold_index}.pt')

            if early_stopping.should_stop(valid_loss):
                print('Early stopping', flush=True)
                break

    logger.save(f'./{output_dir}/dae_log.csv')
    oof_preds = []

    for fold_index in range(config.n_folds):
        model = new_autoencoder(config.model_kind,
                                n_features=n_features).to(DEVICE)
        model.load_state_dict(
            torch.load(f'./{output_dir}/dae_fold_weight_{fold_index}.pt'))
        model.eval()

        dataset = DaeDataset(all_features,
                             feature_columns,
                             noise_ratio=config.noise_ratio)
        dataloader = DataLoader(dataset,
                                batch_size=config.valid_batch_size,
                                shuffle=False)

        loss, preds = loop_valid(model, nn.MSELoss(), dataloader)

        logger.update({'fold': fold_index, 'val_loss': loss})
        print('Evaluation   fold: {}  -  valid_loss: {:.5f}'.format(
            fold_index, loss),
              flush=True)

        oof_preds.append(preds)

    print('Overall Evaluation Score: {:.5f}'.format(
        mean_squared_error(all_features.loc[:, feature_columns].values,
                           np.mean(oof_preds, axis=0))),
          flush=True)

    # for i, preds in enumerate(oof_preds):
    #     create_pred_feature_df(preds, all_features).to_csv(f'./{output_dir}/dae_features_{i}.csv', index=False)
    create_pred_feature_df(np.mean(oof_preds, axis=0), all_features).to_csv(
        f'./{output_dir}/dae_features_mean.csv', index=False)
Example #9
References
----------

[1] Narula, S. & Weistroffer,
    H. A flexible method for nonlinear multicriteria decision-making problems Systems,
    Man and Cybernetics, IEEE Transactions on, 1989 , 19 , 883-887.

'''
import os
import sys


example_path=os.path.dirname(os.path.realpath(__file__))
sys.path.append(os.path.join(example_path,".."))
from utils.misc import Logger 
sys.stdout = Logger(os.path.splitext(os.path.basename(__file__))[0])


from utils import tui

from method.NAUTILUS import NAUTILUSv1, ENAUTILUS

from optimization.OptimizationMethod import PointSearch
from problem.Problem import PreGeneratedProblem


if __name__ == '__main__':
    # SciPy breaks box constraints
    method = ENAUTILUS(PreGeneratedProblem(filename=os.path.join(example_path,"AuxiliaryServices.csv")), PointSearch)
    zh = tui.iter_enautilus(method)
    ci = method.current_iter
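Examples #7 and #9 instead assign a Logger to sys.stdout, which implies a tee-style object exposing write() and flush(). A minimal sketch consistent with that usage (assumed behavior; the real utils.misc.Logger may differ):

import sys

class TeeLogger:
    """Hypothetical stdout tee: mirrors writes to a file and the original stream."""
    def __init__(self, log_path, stream=None):
        self.stream = stream if stream is not None else sys.stdout
        self.log = open(log_path, 'a')

    def write(self, message):
        self.stream.write(message)
        self.log.write(message)

    def flush(self):
        self.stream.flush()
        self.log.flush()

# Usage mirroring Example #7:
# sys.stdout = TeeLogger(LOG_PATH, sys.stdout)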
Example #10
# @Author  : peng.wang
# @Email   : [email protected]
# @FileName: util.py
# @ProjectName :Facility_Location_FangTai
"""

import pandas as pd
import os
import xlwings as xw
from utils.misc import Logger
from pyecharts import options as opts
from pyecharts.charts import Geo
from pyecharts.globals import ChartType, SymbolType
# define the log file
RAW_DATA_PATH = os.path.dirname(os.path.dirname(__file__))
log = Logger(log_path=os.path.join(RAW_DATA_PATH, 'log')).logger


class DataHandler(object):
    """

    """
    def __init__(self, file, config):
        """
        :param file: data file name
        :param config: configuration object
        """
        # path for reading the raw data
        self._PATH = os.path.join(RAW_DATA_PATH, 'data', file)
        self._config = config
Example #11
def run(try_num, config):
    args = get_args()

    print('args', args, flush=True)
    print('config:', config.to_dict(), flush=True)

    set_seed(config.rand_seed)

    pretrained_model = "tf_efficientnet_b3_ns"
    model_dir = f'deepinsight-{try_num}'

    if not os.path.exists(model_dir):
        os.mkdir(model_dir)

    train_features = pd.read_csv("../input/lish-moa/train_features.csv")
    train_targets = pd.read_csv("../input/lish-moa/train_targets_scored.csv")
    test_features = pd.read_csv("../input/lish-moa/test_features.csv")

    if config.dae_path:
        dae_features = pd.read_csv(config.dae_path)

    if args.debug:
        train_features = train_features.iloc[:500]
        train_targets = train_targets.iloc[:500]
        if config.dae_path:
            dae_features = pd.concat([dae_features.iloc[:500], dae_features.iloc[-3982:]]).reset_index(drop=True)

        config.update(dict(
            kfolds=3,
            n_epoch=3
        ))

    train_features = train_features.sort_values(by=["sig_id"], axis=0, inplace=False).reset_index(drop=True)
    train_targets = train_targets.sort_values(by=["sig_id"], axis=0, inplace=False).reset_index(drop=True)

    cat_features_columns = ["cp_dose", 'cp_time']
    num_feature_columns = [c for c in train_features.columns
                           if c != "sig_id" and c not in cat_features_columns + ['cp_type']]
    all_features_columns = cat_features_columns + num_feature_columns
    target_columns = [c for c in train_targets.columns if c != "sig_id"]
    g_feature_columns = [c for c in num_feature_columns if c.startswith("g-")]
    c_feature_columns = [c for c in num_feature_columns if c.startswith("c-")]

    if config.dae_path:
        if config.dae_strategy == 'replace':
            train_features, test_features = assign_dae_features(
                train_features, test_features, dae_features, len(num_feature_columns))
        else:
            train_features, test_features, dae_feature_columns = merge_dae_features(
                train_features, test_features, dae_features, len(g_feature_columns), len(c_feature_columns))
            all_features_columns += dae_feature_columns

    train_targets = train_targets.loc[train_features['cp_type'] == 'trt_cp'].reset_index(drop=True)
    train_features = train_features.loc[train_features['cp_type'] == 'trt_cp'].reset_index(drop=True)

    if config.normalizer == 'rank':
        train_features, test_features = normalize(train_features, test_features, num_feature_columns)

    for df in [train_features, test_features]:
        df['cp_type'] = df['cp_type'].map({'ctl_vehicle': 0, 'trt_cp': 1})
        df['cp_dose'] = df['cp_dose'].map({'D1': 0, 'D2': 1})
        df['cp_time'] = df['cp_time'].map({24: 0, 48: 0.5, 72: 1})

    if config.variance_target_type == 1:
        pickle_path = f'{model_dir}/variance_reduction.pkl'

        # Copy to avoid mutating num_feature_columns via the in-place += below.
        variance_target_features = list(num_feature_columns)
        if config.dae_path and config.dae_strategy != 'replace':
            variance_target_features += dae_feature_columns

        if not os.path.exists(pickle_path):
            vt = variance_reduction_fit(train_features, variance_target_features, config.variance_threshold)
            save_pickle(vt, pickle_path)

        vt = load_pickle(pickle_path)
        train_features = variance_reduction_transform(vt, train_features, variance_target_features)
        test_features = variance_reduction_transform(vt, test_features, variance_target_features)
        print('(variance_reduction) Number of features after applying:', len(train_features.columns), flush=True)
        all_features_columns = list(train_features.columns[1:])

    skf = MultilabelStratifiedKFold(n_splits=config.kfolds, shuffle=True, random_state=config.rand_seed)
    y_labels = np.sum(train_targets.drop("sig_id", axis=1), axis=0).index.tolist()
    logger = Logger()

    for fold_index, (train_index, val_index) in enumerate(skf.split(train_features, train_targets[y_labels])):
        if args.only_pred:
            print('Skip training', flush=True)
            break

        print(f'Fold: {fold_index}', train_index.shape, val_index.shape, flush=True)

        X_train = train_features.loc[train_index, all_features_columns].copy().values
        y_train = train_targets.iloc[train_index, 1:].copy().values
        X_valid = train_features.loc[val_index, all_features_columns].copy().values
        y_valid = train_targets.iloc[val_index, 1:].copy().values

        if config.normalizer == 'log':
            scaler = LogScaler()
            if config.norm_apply_all:
                scaler.fit(X_train)
                X_train = scaler.transform(X_train)
                X_valid = scaler.transform(X_valid)
            else:
                target_features = [i for i, c in enumerate(all_features_columns) if c in num_feature_columns]
                non_target_features = [i for i, c in enumerate(all_features_columns) if c not in num_feature_columns]

                scaler.fit(X_train[:, target_features])
                X_train_tr = scaler.transform(X_train[:, target_features])
                X_valid_tr = scaler.transform(X_valid[:, target_features])
                X_train = np.concatenate([X_train[:, non_target_features], X_train_tr], axis=1)
                X_valid = np.concatenate([X_valid[:, non_target_features], X_valid_tr], axis=1)
            save_pickle(scaler, f'{model_dir}/scaler-{fold_index}.pkl')

        transformer = DeepInsightTransformer(
            feature_extractor=config.extractor,
            pixels=config.resolution,
            perplexity=config.perplexity,
            random_state=config.rand_seed,
            n_jobs=-1
        ).fit(X_train)

        save_pickle(transformer, f'{model_dir}/transformer-{fold_index}.pkl')

        model = MoAEfficientNet(
            pretrained_model_name=pretrained_model,
            fc_size=config.fc_size,
            drop_rate=config.drop_rate,
            drop_connect_rate=config.drop_connect_rate,
            weight_init='goog',
        ).to(DEVICE)

        if config.smoothing is not None:
            if config.weighted_loss_weights is not None:
                indices = get_minority_target_index(train_targets, threshold=config.weighted_loss_threshold)
                indices = [int(i not in indices) for i, c in enumerate(target_columns)]
                train_loss_function = SmoothBCEwLogits(
                    smoothing=config.smoothing,
                    weight=config.weighted_loss_weights,
                    weight_targets=indices,
                    n_labels=len(target_columns))
            else:
                train_loss_function = SmoothBCEwLogits(smoothing=config.smoothing)
        else:
            train_loss_function = bce_loss

        eval_loss_function = bce_loss

        optimizer = optim.Adam(model.parameters(), weight_decay=config.weight_decay, lr=config.learning_rate)

        if config.scheduler_type == 'ca':
            scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=config.t_max, eta_min=0, last_epoch=-1)
        elif config.scheduler_type == 'ms':
            scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=config.ms_scheduler_milestones, gamma=0.1)
        else:
            scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                optimizer, mode='min', factor=0.1, patience=config.rp_patience, eps=1e-4, verbose=True)

        early_stopping = EarlyStopping(patience=7)
        best_score = np.inf
        start_time = time.time()

        for epoch in range(config.n_epoch):

            if config.swap_enable:
                dataset = MoAImageSwapDataset(
                    X_train,
                    y_train,
                    transformer,
                    image_size=config.image_size,
                    swap_prob=config.swap_prob,
                    swap_portion=config.swap_portion)
            else:
                dataset = MoAImageDataset(X_train, y_train, transformer, image_size=config.image_size)

            dataloader = DataLoader(
                dataset,
                batch_size=config.batch_size,
                shuffle=True,
                num_workers=8,
                pin_memory=True,
                drop_last=False)
            loss = loop_train(model, train_loss_function, dataloader, optimizer)

            if config.scheduler_type == 'rp':
                scheduler.step(loss)
            else:
                scheduler.step()
                for param_group in optimizer.param_groups:
                    print('current learning rate:', param_group['lr'])

            del dataset, dataloader

            dataset = MoAImageDataset(X_valid, y_valid, transformer, image_size=config.image_size)
            dataloader = DataLoader(
                dataset,
                batch_size=config.infer_batch_size,
                shuffle=False,
                num_workers=8,
                pin_memory=True,
                drop_last=False)
            valid_loss, valid_preds = loop_valid(model, eval_loss_function, dataloader)

            del dataset, dataloader

            logger.update({'fold': fold_index, 'epoch': epoch + 1, 'train_loss': loss, 'val_loss': valid_loss})
            print(f'epoch {epoch + 1}/{config.n_epoch}  -  train_loss: {loss:.5f}  -  ' +
                  f'valid_loss: {valid_loss:.5f}  -  elapsed: {time_format(time.time() - start_time)}', flush=True)

            if valid_loss < best_score:
                best_score = valid_loss
                torch.save(model.state_dict(), f'./{model_dir}/deepinsight-{fold_index}.pt')

            if early_stopping.should_stop(valid_loss):
                print('Early stopping', flush=True)
                break

        print(f'Done -> Fold {fold_index}/{config.kfolds}  -  best_valid_loss: {best_score:.5f}  -  ' +
              f'elapsed: {time_format(time.time() - start_time)}', flush=True)

        torch.cuda.empty_cache()
        gc.collect()

        if args.return_first_fold:
            logger.save(f'{model_dir}/log.csv')
            return

    test_preds = np.zeros((test_features.shape[0], len(target_columns)))
    start_time = time.time()
    print('Start inference', flush=True)

    oof_preds = np.zeros((len(train_features), len(target_columns)))
    eval_loss_function = bce_loss

    for fold_index, (train_index, val_index) in enumerate(skf.split(train_features, train_targets[y_labels])):
        print(f'Inference Fold: {fold_index}', train_index.shape, val_index.shape, flush=True)
        X_valid = train_features.loc[val_index, all_features_columns].copy().values
        y_valid = train_targets.iloc[val_index, 1:].copy().values
        X_test = test_features[all_features_columns].values

        if config.normalizer == 'log':
            scaler = load_pickle(f'{model_dir}/scaler-{fold_index}.pkl')
            X_valid = scaler.transform(X_valid)
            X_test = scaler.transform(X_test)

        transformer = load_pickle(f'{model_dir}/transformer-{fold_index}.pkl')
        model = MoAEfficientNet(
            pretrained_model_name=pretrained_model,
            fc_size=config.fc_size,
            drop_rate=config.drop_rate,
            drop_connect_rate=config.drop_connect_rate,
            weight_init='goog',
        ).to(DEVICE)
        model.load_state_dict(torch.load(f'./{model_dir}/deepinsight-{fold_index}.pt'))

        dataset = MoAImageDataset(X_valid, y_valid, transformer, image_size=config.image_size)
        dataloader = DataLoader(
            dataset,
            batch_size=config.infer_batch_size,
            shuffle=False,
            num_workers=8,
            pin_memory=True,
            drop_last=False)
        valid_loss, valid_preds = loop_valid(model, eval_loss_function, dataloader)
        print(f'Fold {fold_index}/{config.kfolds}  -  fold_valid_loss: {valid_loss:.5f}', flush=True)
        logger.update({'fold': fold_index, 'val_loss': valid_loss})

        oof_preds[val_index, :] = valid_preds

        dataset = TestDataset(X_test, None, transformer, image_size=config.image_size)
        dataloader = DataLoader(
            dataset,
            batch_size=config.infer_batch_size,
            shuffle=False,
            num_workers=8,
            pin_memory=True,
            drop_last=False)

        preds = loop_preds(model, dataloader)
        test_preds += preds / config.kfolds

    oof_preds_df = train_targets.copy()
    oof_preds_df.loc[:, target_columns] = oof_preds.clip(0, 1)
    oof_preds_df.to_csv(f'{model_dir}/oof_preds.csv', index=False)
    oof_loss = mean_log_loss(train_targets.loc[:, target_columns].values, oof_preds)

    print(f'OOF Validation Loss: {oof_loss:.6f}', flush=True)
    print(f'Done inference  Elapsed {time_format(time.time() - start_time)}', flush=True)
    logger.update({'fold': 'oof', 'val_loss': oof_loss})
    logger.save(f'{model_dir}/log.csv')

    submission = pd.DataFrame(data=test_features['sig_id'].values, columns=['sig_id'])
    submission = submission.reindex(columns=['sig_id'] + target_columns)
    submission.loc[:, target_columns] = test_preds.clip(0, 1)
    submission.loc[test_features['cp_type'] == 0, submission.columns[1:]] = 0
    submission.to_csv(f'{model_dir}/submission.csv', index=False)
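The mean_log_loss helper called above is not shown on this page. A plausible sketch, assuming it averages the element-wise binary cross-entropy over all (sample, target) cells (Example #12 also passes an n_targets argument, omitted here):

import numpy as np

def mean_log_loss(y_true, y_pred, eps=1e-15):
    # Sketch only; the original implementation is not shown on this page.
    # Clip predictions away from 0 and 1 to keep log() finite.
    y_pred = np.clip(y_pred, eps, 1 - eps)
    bce = -(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
    return float(np.mean(bce))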
Example #12
def run(try_num, config):
    logger = Logger()
    args = get_args()

    print('config:', config.to_dict(), flush=True)
    print('args:', args, flush=True)
    os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

    model_dir = f'blending-01-nn-{try_num}'

    if not os.path.exists(model_dir):
        os.mkdir(model_dir)

    train_features = pd.read_csv('../input/lish-moa/train_features.csv')
    train_targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
    dae_features = pd.read_csv(config.dae_path)
    test_features = pd.read_csv('../input/lish-moa/test_features.csv')

    if args.debug:
        train_features = train_features[:500]
        train_targets = train_targets[:500]
        dae_features = pd.concat(
            [dae_features.iloc[:500],
             dae_features.iloc[-3982:]]).reset_index(drop=True)

        config.update(
            dict(
                n_folds=3,
                seeds=[222],
                n_epochs=3,
                batch_size=128,
            ))

    target_columns = [col for col in train_targets.columns if col != 'sig_id']
    n_targets = len(target_columns)

    train_features, train_targets, test_features = preprocess(
        config, model_dir, train_features, train_targets, test_features,
        dae_features)
    features_columns = [
        col for col in train_features.columns if col not in [
            'sig_id', 'cp_type', 'cp_time', 'cp_dose', 'cp_type_ctl_vehicle',
            'cp_type_trt_cp'
        ]
    ]

    metric_loss_function = nn.BCELoss()

    if config.weighted_loss_strategy == 1:
        indices = get_minority_target_index(
            train_targets, threshold=config.weighted_loss_threshold)
        indices = [int(i not in indices) for i, c in enumerate(target_columns)]
        smooth_loss_function = SmoothBCELoss(
            smoothing=config.smoothing,
            weight=config.weighted_loss_weights,
            weight_targets=indices,
            n_labels=n_targets)
    else:
        smooth_loss_function = SmoothBCELoss(smoothing=config.smoothing)

    kfold = MultilabelStratifiedKFold(n_splits=config.n_folds,
                                      random_state=42,
                                      shuffle=True)

    for seed_index, seed in enumerate(config.seeds):
        if args.only_pred:
            print('Skip training', flush=True)
            break

        print(f'Train seed {seed}', flush=True)
        set_seed(seed)

        for fold_index, (train_indices, val_indices) in enumerate(
                kfold.split(train_targets[target_columns].values,
                            train_targets[target_columns].values)):
            print(f'Train fold {fold_index + 1}', flush=True)

            x_train = train_features.loc[train_indices, features_columns]
            y_train = train_targets.loc[train_indices, target_columns]
            x_val = train_features.loc[val_indices, features_columns]
            y_val = train_targets.loc[val_indices, target_columns]

            model = new_model(config.model_kind,
                              len(features_columns)).to(DEVICE)
            checkpoint_path = f'{model_dir}/repeat-{seed}_Fold-{fold_index + 1}.pt'
            optimizer = optim.Adam(model.parameters(),
                                   weight_decay=config.weight_decay,
                                   lr=config.learning_rate)
            scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                             mode='min',
                                                             factor=0.1,
                                                             patience=3,
                                                             eps=1e-4,
                                                             verbose=True)

            best_loss = np.inf

            for epoch in range(config.n_epochs):
                dataset = MoaDataset(x_train.values, y_train.values)
                dataloader = DataLoader(dataset,
                                        batch_size=config.batch_size,
                                        shuffle=True,
                                        drop_last=True)

                train_loss = loop_train(model,
                                        dataloader,
                                        optimizer,
                                        loss_functions=(
                                            smooth_loss_function,
                                            metric_loss_function,
                                        ))

                dataset = MoaDataset(x_val.values, y_val.values)
                dataloader = DataLoader(dataset,
                                        batch_size=config.val_batch_size,
                                        shuffle=False)
                valid_loss, _ = loop_valid(model, dataloader,
                                           metric_loss_function)

                print(
                    'Epoch {}/{}   -   loss: {:5.5f}   -   val_loss: {:5.5f}'.
                    format(epoch + 1, config.n_epochs, train_loss, valid_loss),
                    flush=True)

                logger.update({
                    'epoch': epoch + 1,
                    'loss': train_loss,
                    'val_loss': valid_loss
                })

                scheduler.step(valid_loss)

                if valid_loss < best_loss:
                    best_loss = valid_loss
                    torch.save(model.state_dict(), checkpoint_path)

    oof_preds = np.zeros((len(train_features), len(config.seeds), n_targets))
    test_preds = np.zeros((len(test_features), n_targets))

    for seed_index in range(len(config.seeds)):
        seed = config.seeds[seed_index]

        print(f'Inference for seed {seed}', flush=True)

        _test_preds_in_seed = np.zeros((len(test_features), n_targets))

        for fold_index, (_, valid_indices) in enumerate(
                kfold.split(train_targets[target_columns].values,
                            train_targets[target_columns].values)):
            x_val = train_features.loc[valid_indices, features_columns]
            y_val = train_targets.loc[valid_indices, target_columns]

            checkpoint_path = f'{model_dir}/repeat-{seed}_Fold-{fold_index + 1}.pt'
            model = new_model(config.model_kind,
                              len(features_columns)).to(DEVICE)
            model.load_state_dict(torch.load(checkpoint_path))

            dataset = MoaDataset(x_val.values, y_val.values)
            dataloader = DataLoader(dataset,
                                    batch_size=config.val_batch_size,
                                    shuffle=False)
            preds = loop_pred(model, dataloader)

            oof_preds[valid_indices, seed_index, :] = preds

            dataset = MoaDataset(test_features[features_columns].values, None)
            dataloader = DataLoader(dataset,
                                    batch_size=config.val_batch_size,
                                    shuffle=False)
            preds = loop_pred(model, dataloader)

            _test_preds_in_seed += preds / config.n_folds

        score = mean_log_loss(train_targets.loc[:, target_columns].values,
                              oof_preds[:, seed_index, :],
                              n_targets=n_targets)
        test_preds += _test_preds_in_seed / len(config.seeds)

        print(f'Score for this seed {score:5.5f}', flush=True)
        logger.update({'val_loss': score})

    # Evaluate validation score
    oof_preds = np.mean(oof_preds, axis=1)
    score = mean_log_loss(train_targets.loc[:, target_columns].values,
                          oof_preds,
                          n_targets=n_targets)
    print(f'Overall score is {score:5.5f}', flush=True)

    # Save validation prediction
    oof_pred_df = train_targets.copy()
    oof_pred_df.iloc[:, 1:] = oof_preds
    oof_pred_df.to_csv(f'{model_dir}/oof_pred.csv', index=False)

    # Save log
    logger.update({'val_loss': score})
    logger.save(f'{model_dir}/log.csv')

    # Save Test Prediction
    test_features = pd.read_csv('../input/lish-moa/test_features.csv')
    submission = create_submission(test_features, ['sig_id'] + target_columns)
    submission[target_columns] = test_preds
    submission.loc[test_features['cp_type'] == 'ctl_vehicle',
                   target_columns] = 0
    submission.to_csv(f'{model_dir}/submission.csv', index=False)
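Examples #8 and #11 also depend on an EarlyStopping helper with a should_stop(loss) method whose implementation is not shown here. A minimal sketch consistent with that usage (assumed behavior):

class EarlyStopping:
    """Hypothetical reconstruction: stop after `patience` consecutive
    epochs without improvement. The original code is not shown on this page."""
    def __init__(self, patience=10, min_delta=0.0):
        self.patience = patience
        self.min_delta = min_delta
        self.best = float('inf')
        self.count = 0

    def should_stop(self, loss):
        if loss < self.best - self.min_delta:
            self.best = loss
            self.count = 0
        else:
            self.count += 1
        return self.count >= self.patience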
Example #13
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
# @Time    : 2019/10/8 22:34
# @Author  : peng.wang
# @Email   : [email protected]
# @FileName: util.py
# @ProjectName :sh-demand-forecast-alg
"""
import pandas as pd
import numpy as np
import requests
import json
import os
from utils.misc import Logger
log = Logger(log_path='log').logger


class DataLoader(object):
    """
    Data loader. Combines a dataset and a sampler, and provides an iterable over
    the given dataset.
    """
    def __init__(self,
                 data,
                 train_len,
                 pred_len,
                 feature_names,
                 target_name,
                 append_train=False):
        self.data = data