Code example #1
File: chm_annot.py Project: smsinks/chmannot
def gen_featfilt(tuned=False, glb_filtnames=[]):
    tuned = tuned or opts.best
    common_cfg = cfgr('chm_annot', 'common')
    pr = io.param_reader(
        os.path.join(PAR_DIR, 'etc',
                     '%s.yaml' % common_cfg.setdefault('mdl_cfg', 'mdlcfg')))
    filt_names = []
    for filt_name, filt in [
            #		('Var Cut', VarianceThreshold()),
            #		('Chi2 Pval on FPR', SelectFpr(chi2, alpha=0.05)),
            #		('ANOVA-F Pval on FPR', SelectFpr(f_classif, alpha=0.05)),
            #		('Chi2 Top K Perc', SelectPercentile(chi2, percentile=30)),
            #		('ANOVA-F Top K Perc', SelectPercentile(f_classif, percentile=30)),
            #		('Chi2 Top K', SelectKBest(chi2, k=1000)),
            #		('ANOVA-F Top K', SelectKBest(f_classif, k=1000)),
            #		('LinearSVC', LinearSVC(loss='squared_hinge', dual=False, **pr('Classifier', 'LinearSVC') if tuned else {})),
            #		('Logistic Regression', SelectFromModel(LogisticRegression(dual=False, **pr('Feature Selection', 'Logistic Regression') if tuned else {}))),
            #		('Lasso', SelectFromModel(LassoCV(cv=6), threshold=0.16)),
            #		('Lasso-LARS', SelectFromModel(LassoLarsCV(cv=6))),
            #		('Lasso-LARS-IC', SelectFromModel(LassoLarsIC(criterion='aic'), threshold=0.16)),
            #		('Randomized Lasso', SelectFromModel(RandomizedLasso(random_state=0))),
            #		('Extra Trees Regressor', SelectFromModel(ExtraTreesRegressor(100))),
            # ('U102-GSS502', ftslct.MSelectKBest(ftslct.gen_ftslct_func(ftslct.utopk, filtfunc=ftslct.gss_coef, fn=100), k=500)),
            # ('GSS502', ftslct.MSelectKBest(ftslct.gss_coef, k=500)),
            #		('Combined Model', FeatureUnion([('Var Cut', VarianceThreshold()), ('Chi2 Top K', SelectKBest(chi2, k=1000))])),
        ('No Feature Filtering', None)
    ]:
        yield filt_name, filt
        filt_names.append(filt_name)
    if (len(glb_filtnames) < len(filt_names)):
        del glb_filtnames[:]
        glb_filtnames.extend(filt_names)
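
gen_featfilt lazily yields (name, filter) candidate pairs and, once the list is exhausted, mirrors the collected names into the caller-supplied glb_filtnames list in place. Below is a minimal standalone sketch of that yield-and-sync pattern; the names make_steps and global_names are hypothetical stand-ins for the real candidates.

def make_steps(global_names=[]):
    names = []
    for name, step in [('No Feature Filtering', None)]:
        yield name, step  # hand each candidate to the caller lazily
        names.append(name)
    # refresh the shared list in place so existing references see the update
    if len(global_names) < len(names):
        del global_names[:]
        global_names.extend(names)

shared = []
for name, step in make_steps(shared):
    print(name, step)
print(shared)  # -> ['No Feature Filtering']
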
Code example #2
File: chm_annot.py Project: smsinks/chmannot
def gen_cb_models(tuned=False, glb_filtnames=[], glb_clfnames=[]):
    tuned = tuned or opts.best
    common_cfg = cfgr('chm_annot', 'common')
    pr = io.param_reader(
        os.path.join(PAR_DIR, 'etc',
                     '%s.yaml' % common_cfg.setdefault('mdl_cfg', 'mdlcfg')))
    #	filtref_func = ftslct.filtref(os.path.join(spdr.DATA_PATH, 'X.npz'), os.path.join(spdr.DATA_PATH, 'union_filt_X.npz'))
    for mdl_name, mdl in [
            # ('RandomForest', Pipeline([('clf', build_model(RandomForestClassifier, 'Classifier', 'Random Forest', tuned=tuned, pr=pr, mltl=opts.mltl, n_jobs=1 if opts.mltl else opts.np, random_state=0))])),
        ('UDT-RF',
         Pipeline([('featfilt',
                    ftslct.MSelectKBest(ftslct.utopk,
                                        filtfunc=ftslct.decision_tree,
                                        k=500,
                                        fn=100)),
                   ('clf',
                    build_model(RandomForestClassifier,
                                'Classifier',
                                'Random Forest',
                                tuned=tuned,
                                pr=pr,
                                mltl=opts.mltl,
                                n_jobs=1 if opts.mltl else opts.np,
                                random_state=0))])),
            # ('RandomForest', Pipeline([('featfilt', SelectFromModel(DecisionTreeClassifier(criterion='entropy', class_weight='balanced', random_state=0))), ('clf', build_model(RandomForestClassifier, 'Classifier', 'Random Forest', tuned=tuned, pr=pr, mltl=opts.mltl, n_jobs=1 if opts.mltl else opts.np, random_state=0))])),
            # ('RbfSVM102-2', Pipeline([('clf', build_model(SVC, 'Classifier', 'RBF SVM 102-2', tuned=tuned, pr=pr, mltl=opts.mltl, probability=True))])),
            # ('RbfSVM103-2', Pipeline([('clf', build_model(SVC, 'Classifier', 'RBF SVM 103-2', tuned=tuned, pr=pr, mltl=opts.mltl, probability=True))])),
            # ('RbfSVM102-3', Pipeline([('clf', build_model(SVC, 'Classifier', 'RBF SVM 102-3', tuned=tuned, pr=pr, mltl=opts.mltl, probability=True))])),
            # ('RbfSVM103-3', Pipeline([('clf', build_model(SVC, 'Classifier', 'RBF SVM 103-3', tuned=tuned, pr=pr, mltl=opts.mltl, probability=True))])),
            # ('DF-RbfSVM', Pipeline([('featfilt', ftslct.MSelectOverValue(ftslct.filtref(os.path.join(spdr.DATA_PATH, 'X.npz'), os.path.join(spdr.DATA_PATH, 'union_filt_X.npz'), os.path.join(spdr.DATA_PATH, 'orig_X.npz')))), ('clf', build_model(SVC, 'Classifier', 'RBF SVM', tuned=tuned, pr=pr, mltl=opts.mltl, probability=True))])),
        ('RbfSVM',
         Pipeline([('clf',
                    build_model(SVC,
                                'Classifier',
                                'RBF SVM',
                                tuned=tuned,
                                pr=pr,
                                mltl=opts.mltl,
                                probability=True))])),
            # ('L1-LinSVC', Pipeline([('clf', build_model(LinearSVC, 'Classifier', 'LinearSVC', tuned=tuned, pr=pr, mltl=opts.mltl, loss='squared_hinge', dual=False))])),
            # ('Perceptron', Pipeline([('clf', build_model(Perceptron, 'Classifier', 'Perceptron', tuned=tuned, pr=pr, mltl=opts.mltl, n_jobs=1 if opts.mltl else opts.np))])),
            # ('MNB', Pipeline([('clf', build_model(MultinomialNB, 'Classifier', 'MultinomialNB', tuned=tuned, pr=pr, mltl=opts.mltl))])),
            #		('5NN', Pipeline([('clf', build_model(KNeighborsClassifier, 'Classifier', 'kNN', tuned=tuned, pr=pr, mltl=opts.mltl, n_neighbors=5, n_jobs=1 if opts.mltl else opts.np))])),
            # ('MEM', Pipeline([('clf', build_model(LogisticRegression, 'Classifier', 'Logistic Regression', tuned=tuned, pr=pr, mltl=opts.mltl, dual=False))])),
            # ('LinearSVC with L2 penalty [Ft Filt] & Perceptron [CLF]', Pipeline([('featfilt', SelectFromModel(build_model(LinearSVC, 'Feature Selection', 'LinearSVC', tuned=tuned, pr=pr, mltl=opts.mltl, loss='squared_hinge', dual=False, penalty='l2'))), ('clf', build_model(Perceptron, 'Classifier', 'Perceptron', tuned=tuned, pr=pr, n_jobs=opts.np))])),
            # ('ExtraTrees', Pipeline([('clf', build_model(ExtraTreesClassifier, 'Classifier', 'Extra Trees', tuned=tuned, pr=pr, mltl=opts.mltl, n_jobs=opts.np))])),
            #		('Random Forest', Pipeline([('clf', build_model(RandomForestClassifier, 'Classifier', 'Random Forest', tuned=tuned, pr=pr, n_jobs=opts.np, random_state=0))]))
    ]:
        yield mdl_name, mdl
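
The UDT-RF entry above chains a feature filter and a classifier into a scikit-learn Pipeline. Here is a minimal sklearn-only sketch of the same shape, with SelectKBest standing in for the project's ftslct.MSelectKBest and fixed hyper-parameters standing in for what build_model would read from the mdlcfg YAML.

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline

X, y = make_classification(n_samples=200, n_features=50, random_state=0)
pipe = Pipeline([
    ('featfilt', SelectKBest(f_classif, k=10)),       # filter features first
    ('clf', RandomForestClassifier(random_state=0)),  # then classify
])
pipe.fit(X, y)
print(pipe.score(X, y))
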
Code example #3
File: chm_annot.py Project: smsinks/chmannot
def gen_clfs(tuned=False, glb_clfnames=[]):
    tuned = tuned or opts.best
    common_cfg = cfgr('chm_annot', 'common')
    pr = io.param_reader(
        os.path.join(PAR_DIR, 'etc',
                     '%s.yaml' % common_cfg.setdefault('mdl_cfg', 'mdlcfg')))
    clf_names = []
    for clf_name, clf in [
            #		('RidgeClassifier', RidgeClassifier(tol=1e-2, solver='lsqr')),
            #		('Perceptron', build_model(Perceptron, 'Classifier', 'Perceptron', tuned=tuned, pr=pr, mltl=opts.mltl, n_jobs=1 if opts.mltl else opts.np)),
            #		('Passive-Aggressive', PassiveAggressiveClassifier(n_iter=50, n_jobs=1 if opts.mltl else opts.np)),
            #		('kNN', KNeighborsClassifier(n_neighbors=100, n_jobs=1 if opts.mltl else opts.np)),
            #		('NearestCentroid', NearestCentroid()),
            #		('BernoulliNB', BernoulliNB()),
            #		('MultinomialNB', MultinomialNB()),
            #		('ExtraTrees', build_model(ExtraTreesClassifier, 'Classifier', 'Extra Trees', tuned=tuned, pr=pr, mltl=opts.mltl, n_jobs=opts.np)),
        ('RandomForest',
         build_model(RandomForestClassifier,
                     'Classifier',
                     'Random Forest',
                     tuned=tuned,
                     pr=pr,
                     mltl=opts.mltl,
                     n_jobs=1 if opts.mltl else opts.np,
                     random_state=0)),
            #		('RandomForest', Pipeline([('clf', build_model(RandomForestClassifier, 'Classifier', 'Random Forest', tuned=tuned, pr=pr, n_jobs=opts.np, random_state=0))])),
            #		('BaggingkNN', BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5, n_jobs=1 if opts.mltl else opts.np, random_state=0)),
            #		('BaggingLinearSVC', build_model(BaggingClassifier, 'Classifier', 'Bagging LinearSVC', tuned=tuned, pr=pr, mltl=opts.mltl, base_estimator=build_model(LinearSVC, 'Classifier', 'LinearSVC', tuned=tuned, pr=pr, mltl=opts.mltl, loss='squared_hinge', dual=False), n_jobs=1 if opts.mltl else opts.np, random_state=0)(LinearSVC(), max_samples=0.5, max_features=0.5)),
            #		('LinSVM', build_model(LinearSVC, 'Classifier', 'LinearSVC', tuned=tuned, pr=pr, mltl=opts.mltl, loss='squared_hinge', dual=False)),
        ('RbfSVM',
         build_model(SVC,
                     'Classifier',
                     'RBF SVM',
                     tuned=tuned,
                     pr=pr,
                     mltl=opts.mltl))
    ]:
        yield clf_name, clf
        clf_names.append(clf_name)
    if (len(glb_clfnames) < len(clf_names)):
        del glb_clfnames[:]
        glb_clfnames.extend(clf_names)
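
gen_featfilt and gen_clfs are naturally combined by pairing every filter with every classifier; the driver below is a hypothetical sketch of that composition, not code from the project.

from itertools import product

from sklearn.pipeline import Pipeline

def combine(featfilts, clfs):
    # pair every (name, filter) with every (name, classifier) into a Pipeline
    for (f_name, filt), (c_name, clf) in product(featfilts, clfs):
        steps = ([('featfilt', filt)] if filt is not None else []) + [('clf', clf)]
        yield '%s + %s' % (f_name, c_name), Pipeline(steps)
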
Code example #4
File: func.py Project: cskyan/bionlpsota
def gen_clf(config, lm_model, lm_config, use_gpu=False, distrb=False, dev_id=None, **kwargs):
    mdl_name, constraints = config.model, config.cnstrnts.split(',') if hasattr(config, 'cnstrnts') and config.cnstrnts else []
    lm_mdl_name = mdl_name.split('_')[0]
    kwargs.update(dict(config=config, lm_model=lm_model, lm_config=lm_config))
    common_cfg = config.common_cfg if hasattr(config, 'common_cfg') else {}
    wsdir = config.wsdir if hasattr(config, 'wsdir') and os.path.isdir(config.wsdir) else '.'
    pr = io.param_reader(os.path.join(wsdir, 'etc', '%s.yaml' % common_cfg.setdefault('mdl_cfg', 'mdlcfg')))
    params = pr('LM', config.lm_params) if lm_mdl_name != 'none' else {}
    for pname in ['pretrained_mdl_path', 'pretrained_vocab_path']:
        if pname in params: del params[pname]

    lvar = locals()
    for x in constraints:
        cnstrnt_cls, cnstrnt_params = copy.deepcopy(C.CNSTRNTS_MAP[x])
        constraint_params = pr('Constraint', C.CNSTRNT_PARAMS_MAP[x])
        cnstrnt_params.update(dict([((k, p), constraint_params[p]) for k, p in cnstrnt_params.keys() if p in constraint_params]))
        cnstrnt_params.update(dict([((k, p), kwargs[p]) for k, p in cnstrnt_params.keys() if p in kwargs]))
        cnstrnt_params.update(dict([((k, p), lvar[p]) for k, p in cnstrnt_params.keys() if p in lvar]))
        kwargs.setdefault('constraints', []).append((cnstrnt_cls, dict([(k, v) for (k, p), v in cnstrnt_params.items()])))

    clf = config.clf[config.encoder](**kwargs) if hasattr(config, 'embed_type') and config.embed_type else config.clf(**kwargs)
    if use_gpu: clf = _handle_model(clf, dev_id=dev_id, distrb=distrb)
    return clf
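
The constraint loop above resolves each constructor argument from three sources in turn: the YAML-derived parameters, the keyword arguments, and the function's locals, with later sources overriding earlier ones. A standalone sketch of that resolution follows; all keys and values are hypothetical.

import copy

def resolve(template, yaml_params, kwargs, lvar):
    # keys are (constructor_kwarg, lookup_name) pairs, as in the CNSTRNTS_MAP entries
    params = copy.deepcopy(template)
    for source in (yaml_params, kwargs, lvar):
        params.update({(k, p): source[p] for k, p in params if p in source})
    return {k: v for (k, p), v in params.items()}  # strip the lookup names

template = {('dim', 'hidden_dim'): None, ('device', 'dev_id'): None}
print(resolve(template, {'hidden_dim': 128}, {}, {'dev_id': 0}))
# -> {'dim': 128, 'device': 0}
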
Code example #5
File: func.py Project: cskyan/bionlpsota
def gen_mdl(config, use_gpu=False, distrb=False, dev_id=None):
    mdl_name, pretrained = config.model, True if type(config.pretrained) is str and config.pretrained.lower() == 'true' else config.pretrained
    if mdl_name == 'none': return None, None
    wsdir = config.wsdir if hasattr(config, 'wsdir') and os.path.isdir(config.wsdir) else '.'
    common_cfg = config.common_cfg if hasattr(config, 'common_cfg') else {}
    pr = io.param_reader(os.path.join(wsdir, 'etc', '%s.yaml' % common_cfg.setdefault('mdl_cfg', 'mdlcfg')))
    params = pr('LM', config.lm_params)
    lm_config = config.lm_config(**params)
    if distrb: import horovod.torch as hvd
    if (type(pretrained) is str):
        if (not distrb or distrb and hvd.rank() == 0): logging.info('Using pretrained model from `%s`' % pretrained)
        checkpoint = torch.load(pretrained, map_location='cpu')
        model = checkpoint['model']
        model.load_state_dict(checkpoint['state_dict'])
    elif (pretrained):
        if (not distrb or distrb and hvd.rank() == 0): logging.info('Using pretrained model...')
        mdl_name = mdl_name.split('_')[0]
        model = config.lm_model.from_pretrained(params['pretrained_mdl_path'] if 'pretrained_mdl_path' in params else config.lm_mdl_name)
    else:
        if (not distrb or distrb and hvd.rank() == 0): logging.info('Using untrained model...')
        try:
            for pname in ['pretrained_mdl_path', 'pretrained_vocab_path']:
                if pname in params: del params[pname]
            if (mdl_name == 'elmo'):
                pos_params = [lm_config[k] for k in ['options_file','weight_file', 'num_output_representations']]
                kw_params = dict([(k, lm_config[k]) for k in lm_config.keys() if k not in ['options_file','weight_file', 'num_output_representations', 'elmoedim']])
                logging.info('ELMo model parameters: %s %s' % (pos_params, kw_params))
                model = config.lm_model(*pos_params, **kw_params)
            else:
                model = config.lm_model(lm_config)
        except Exception as e:
            logging.warning(e)
            logging.warning('Cannot find the pretrained model file, using online model instead.')
            model = config.lm_model.from_pretrained(config.lm_mdl_name)
    if (use_gpu): model = model.to('cuda')
    return model, lm_config
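
When config.pretrained is a path, gen_mdl expects a checkpoint that stores both the pickled model object and a separate state_dict. A sketch of that loading convention, with a hypothetical file name:

import torch

def load_pretrained(path='model.pth'):
    checkpoint = torch.load(path, map_location='cpu')  # always load to CPU first
    model = checkpoint['model']                        # the pickled module object
    model.load_state_dict(checkpoint['state_dict'])    # then refresh its weights
    return model
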
Code example #6
File: validate.py Project: cskyan/bionlpsota
def classify(dev_id=None):
    # Prepare model related meta data
    mdl_name = args.model.lower().replace(' ', '_')
    common_cfg = cfgr('validate', 'common')
    pr = io.param_reader(os.path.join(PAR_DIR, 'etc', '%s.yaml' % common_cfg.setdefault('mdl_cfg', 'mdlcfg')))
    config_kwargs = dict([(k, v) for k, v in args.__dict__.items() if not k.startswith('_') and k not in set(['dataset', 'model', 'template']) and v is not None and not callable(v)])
    config = Configurable(args.task, mdl_name, common_cfg=common_cfg, wsdir=PAR_DIR, **config_kwargs)
    params = pr('LM', config.lm_params) if mdl_name != 'none' else {}
    use_gpu = dev_id is not None
    tokenizer = config.tknzr.from_pretrained(params['pretrained_vocab_path'] if 'pretrained_vocab_path' in params else config.lm_mdl_name) if config.tknzr else {}
    _adjust_encoder(tokenizer, config)

    # Prepare task related meta data.
    task_path, task_type, task_dstype, task_cols, task_trsfm, task_extparms = config.input if config.input and os.path.isdir(os.path.join(DATA_PATH, config.input)) else config.task_path, config.task_type, config.task_ds, config.task_col, config.task_trsfm, config.task_ext_params
    ds_kwargs = config.ds_kwargs

    # Prepare data
    if (not config.distrb or config.distrb and hvd.rank() == 0): logging.info('Dataset path: %s' % os.path.join(DATA_PATH, task_path))
    train_ds = task_dstype(os.path.join(DATA_PATH, task_path, 'train.%s' % config.fmt), tokenizer, config, **ds_kwargs)
    # Calculate the class weights if needed
    lb_trsfm = [x['get_lb'] for x in task_trsfm[1] if 'get_lb' in x]
    if (not config.weight_class or task_type == 'sentsim'):
        class_count = None
    elif len(lb_trsfm) > 0:
        lb_df = train_ds.df[task_cols['y']].apply(lb_trsfm[0])
        class_count = np.array([[1 if lb in y else 0 for lb in train_ds.binlb.keys()] for y in lb_df]).sum(axis=0)
    else:
        lb_df = train_ds.df[task_cols['y']]
        binlb = task_extparms['binlb'] if 'binlb' in task_extparms and type(task_extparms['binlb']) is not str else train_ds.binlb
        class_count = lb_df.value_counts()[binlb.keys()].values
    if (class_count is None):
        class_weights = None
        sampler = None
    else:
        class_weights = torch.Tensor(1.0 / class_count)
        class_weights /= class_weights.sum()
        sampler = None # WeightedRandomSampler does not work in new version
        # sampler = WeightedRandomSampler(weights=class_weights, num_samples=config.bsize, replacement=True)
        if not config.distrb and type(dev_id) is list: class_weights = class_weights.repeat(len(dev_id))

    # Partition dataset among workers using DistributedSampler
    if config.distrb: sampler = torch.utils.data.distributed.DistributedSampler(train_ds, num_replicas=hvd.size(), rank=hvd.rank())

    train_loader = DataLoader(train_ds, batch_size=config.bsize, shuffle=sampler is None and config.droplast, sampler=sampler, num_workers=config.np, drop_last=config.droplast)

    # Classifier
    if (not config.distrb or config.distrb and hvd.rank() == 0):
        logging.info('Language model input fields: %s' % config.input_keys)
        logging.info('Classifier hyper-parameters: %s' % config.clf_ext_params)
        logging.info('Classifier task-related parameters: %s' % task_extparms['mdlaware'])
    if (config.resume):
        # Load model
        clf, prv_optimizer, resume, chckpnt = load_model(config.resume)
        if config.refresh:
            logging.info('Refreshing and saving the model with newest code...')
            try:
                if (not config.distrb or config.distrb and hvd.rank() == 0):
                    save_model(clf, prv_optimizer, '%s_%s.pth' % (config.task, config.model))
            except Exception as e:
                logging.warning(e)
        # Update parameters
        clf.update_params(task_params=task_extparms['mdlaware'], **config.clf_ext_params)
        if (use_gpu): clf = _handle_model(clf, dev_id=dev_id, distrb=config.distrb)
        # Construct optimizer
        optmzr_cls = config.optmzr if config.optmzr else (torch.optim.Adam, {}, None)
        optimizer = optmzr_cls[0](clf.parameters(), lr=config.lr, weight_decay=config.wdecay, **optmzr_cls[1]) if config.optim == 'adam' else torch.optim.SGD(clf.parameters(), lr=config.lr, momentum=0.9)
        if prv_optimizer: optimizer.load_state_dict(prv_optimizer.state_dict())
        training_steps = int(len(train_ds) / config.bsize) if hasattr(train_ds, '__len__') else config.trainsteps
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(config.wrmprop*training_steps), num_training_steps=training_steps) if not config.noschdlr and len(optmzr_cls) > 2 and optmzr_cls[2] and optmzr_cls[2] == 'linwarm' else None
        if (not config.distrb or config.distrb and hvd.rank() == 0): logging.info((optimizer, scheduler))
    else:
        # Build model
        lm_model, lm_config = gen_mdl(config, use_gpu=use_gpu, distrb=config.distrb, dev_id=dev_id)
        clf = gen_clf(config, lm_model, lm_config, num_lbs=len(train_ds.binlb) if train_ds.binlb else 1, mlt_trnsfmr=True if task_type in ['entlmnt', 'sentsim'] and task_extparms['mdlaware'].setdefault('sentsim_func', None) is not None else False, task_params=task_extparms['mdlaware'], binlb=train_ds.binlb, binlbr=train_ds.binlbr, use_gpu=use_gpu, distrb=config.distrb, dev_id=dev_id, **config.clf_ext_params)
        optmzr_cls = config.optmzr if config.optmzr else (torch.optim.Adam, {}, None)
        optimizer = optmzr_cls[0](clf.parameters(), lr=config.lr, weight_decay=config.wdecay, **optmzr_cls[1]) if config.optim == 'adam' else torch.optim.SGD(clf.parameters(), lr=config.lr, momentum=0.9)
        training_steps = int(len(train_ds) / config.bsize) if hasattr(train_ds, '__len__') else config.trainsteps
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=config.wrmprop, num_training_steps=training_steps) if not config.noschdlr and len(optmzr_cls) > 2 and optmzr_cls[2] and optmzr_cls[2] == 'linwarm' else None
        if (not config.distrb or config.distrb and hvd.rank() == 0): logging.info((optimizer, scheduler))

    config.execute_all_callback()
    if config.verbose:
        logging.debug(config.__dict__)
        torch.autograd.set_detect_anomaly(True)
    if config.configfmt == 'yaml':
        config.to_yaml()
    else:
        config.to_json()

    if config.distrb:
        # Add Horovod Distributed Optimizer
        optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=clf.named_parameters())
        # Broadcast parameters from rank 0 to all other processes.
        hvd.broadcast_parameters(clf.state_dict(), root_rank=0)

    # Training
    train(clf, optimizer, train_loader, config, scheduler, weights=class_weights, lmcoef=config.lmcoef, clipmaxn=config.clipmaxn, epochs=config.epochs, earlystop=config.earlystop, earlystop_delta=config.es_delta, earlystop_patience=config.es_patience, use_gpu=use_gpu, devq=dev_id, distrb=config.distrb, resume=resume if config.resume else {})

    if config.distrb:
        if hvd.rank() == 0:
            clf = _handle_model(clf, dev_id=dev_id, distrb=False)
        else:
            return

    if config.noeval: return
    dev_ds = task_dstype(os.path.join(DATA_PATH, task_path, 'dev.%s' % config.fmt), tokenizer, config, binlb=task_extparms['binlb'] if 'binlb' in task_extparms and type(task_extparms['binlb']) is not str else train_ds.binlb, **ds_kwargs)
    dev_loader = DataLoader(dev_ds, batch_size=config.bsize, shuffle=False, num_workers=config.np)
    test_ds = task_dstype(os.path.join(DATA_PATH, task_path, 'test.%s' % config.fmt), tokenizer, config, binlb=task_extparms['binlb'] if 'binlb' in task_extparms and type(task_extparms['binlb']) is not str else train_ds.binlb, **ds_kwargs)
    test_loader = DataLoader(test_ds, batch_size=config.bsize, shuffle=False, num_workers=config.np)
    logging.debug(('binlb', train_ds.binlb, dev_ds.binlb, test_ds.binlb))

    # Evaluation
    eval(clf, dev_loader, config, ds_name='dev', use_gpu=use_gpu, devq=dev_id, distrb=config.distrb, ignored_label=task_extparms.setdefault('ignored_label', None))
    if config.traindev: train(clf, optimizer, dev_loader, config, scheduler=scheduler, weights=class_weights, lmcoef=config.lmcoef, clipmaxn=config.clipmaxn, epochs=config.epochs, earlystop=config.earlystop, earlystop_delta=config.es_delta, earlystop_patience=config.es_patience, use_gpu=use_gpu, devq=dev_id, distrb=config.distrb)
    eval(clf, test_loader, config, ds_name='test', use_gpu=use_gpu, devq=dev_id, distrb=config.distrb, ignored_label=task_extparms.setdefault('ignored_label', None))
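
The class-weight block in classify() derives inverse-frequency weights and normalizes them to sum to one, so rare labels contribute proportionally more to the loss. A standalone numeric sketch with hypothetical counts:

import numpy as np
import torch

class_count = np.array([90, 9, 1], dtype=float)  # hypothetical label counts
class_weights = torch.Tensor(1.0 / class_count)  # rarer class -> larger weight
class_weights /= class_weights.sum()             # normalize to sum to 1
print(class_weights)  # tensor([0.0099, 0.0990, 0.8911])
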
Code example #7
File: validate.py Project: cskyan/bionlpsota
def multi_clf(dev_id=None):
    '''Train multiple classifiers and use them to predict multiple set of labels'''
    import inflect
    from bionlp.util import fs
    iflteng = inflect.engine()

    logging.info('### Multi Classifier Head Mode ###')
    # Prepare model related meta data
    mdl_name = args.model.lower().replace(' ', '_')
    common_cfg = cfgr('validate', 'common')
    pr = io.param_reader(os.path.join(PAR_DIR, 'etc', '%s.yaml' % common_cfg.setdefault('mdl_cfg', 'mdlcfg')))
    config_kwargs = dict([(k, v) for k, v in args.__dict__.items() if not k.startswith('_') and k not in set(['dataset', 'model', 'template']) and v is not None and not callable(v)])
    config = Configurable(args.task, mdl_name, common_cfg=common_cfg, wsdir=PAR_DIR, **config_kwargs)
    params = pr('LM', config.lm_params) if mdl_name != 'none' else {}
    use_gpu = dev_id is not None
    tokenizer = config.tknzr.from_pretrained(params['pretrained_vocab_path'] if 'pretrained_vocab_path' in params else config.lm_mdl_name) if config.tknzr else None
    task_type = config.task_type
    _adjust_encoder(tokenizer, config)
    special_tknids_args = dict(zip(special_tkns[0], special_tknids))
    task_trsfm_kwargs = dict(list(zip(special_tkns[0], special_tknids))+[('model',args.model), ('sentsim_func', args.sentsim_func), ('seqlen',args.maxlen)])
    # Prepare task related meta data.
    task_path, task_dstype, task_cols, task_trsfm, task_extparms = args.input if args.input and os.path.isdir(os.path.join(DATA_PATH, args.input)) else config.task_path, config.task_ds, config.task_col, config.task_trsfm, config.task_ext_params
    trsfms = (task_trsfm[0] if len(task_trsfm) > 0 else [])
    # trsfms_kwargs = ([] if args.model in LM_EMBED_MDL_MAP else ([{'seqlen':args.maxlen, 'xpad_val':task_extparms.setdefault('xpad_val', 0), 'ypad_val':task_extparms.setdefault('ypad_val', None)}] if TASK_TYPE_MAP[args.task]=='nmt' else [{'seqlen':args.maxlen, 'trimlbs':task_extparms.setdefault('trimlbs', False), 'special_tkns':special_tknids_args}, task_trsfm_kwargs, {'seqlen':args.maxlen, 'xpad_val':task_extparms.setdefault('xpad_val', 0), 'ypad_val':task_extparms.setdefault('ypad_val', None)}])) + (task_trsfm[1] if len(task_trsfm) >= 2 else [{}] * len(task_trsfm[0]))
    trsfms_kwargs = ([] if hasattr(config, 'embed_type') and config.embed_type else ([{'seqlen':args.maxlen, 'xpad_val':task_extparms.setdefault('xpad_val', 0), 'ypad_val':task_extparms.setdefault('ypad_val', None)}] if config.task_type=='nmt' else [{'seqlen':args.maxlen, 'trimlbs':task_extparms.setdefault('trimlbs', False), 'required_special_tkns':['start_tknids', 'clf_tknids', 'delim_tknids'] if task_type in ['entlmnt', 'sentsim'] and (task_extparms.setdefault('sentsim_func', None) is None or not mdl_name.startswith('bert')) else ['start_tknids', 'clf_tknids'], 'special_tkns':special_tknids_args}, task_trsfm_kwargs, {'seqlen':args.maxlen, 'xpad_val':task_extparms.setdefault('xpad_val', 0), 'ypad_val':task_extparms.setdefault('ypad_val', None)}])) + (task_trsfm[1] if len(task_trsfm) >= 2 else [{}] * len(task_trsfm[0]))
    ds_kwargs = {'sampw':args.sample_weights, 'sampfrac':args.sampfrac}
    if task_type == 'nmt':
        ds_kwargs.update({'lb_coding':task_extparms.setdefault('lb_coding', 'IOB')})
    elif task_type == 'entlmnt':
        ds_kwargs.update(dict((k, task_extparms[k]) for k in ['origlb', 'lbtxt', 'neglbs', 'reflb'] if k in task_extparms))
    elif task_type == 'sentsim':
        ds_kwargs.update({'ynormfunc':task_extparms.setdefault('ynormfunc', None)})
    global_all_binlb = {}

    ext_params = dict([(k, getattr(args, k)) if hasattr(args, k) else (k, v) for k, v in config.clf_ext_params.items()])
    if hasattr(config, 'embed_type') and config.embed_type: ext_params['embed_type'] = config.embed_type
    task_params = dict([(k, getattr(args, k)) if hasattr(args, k) and getattr(args, k) is not None else (k, v) for k, v in task_extparms.setdefault('mdlcfg', {}).items()])
    logging.info('Classifier hyper-parameters: %s' % ext_params)
    logging.info('Classifier task-related parameters: %s' % task_params)
    orig_epochs = mltclf_epochs = args.epochs
    elapsed_mltclf_epochs, args.epochs = 0, 1
    if (args.resume):
        # Load model
        clf, prv_optimizer, resume, chckpnt = load_model(args.resume)
        if args.refresh:
            logging.info('Refreshing and saving the model with newest code...')
            try:
                save_model(clf, prv_optimizer, '%s_%s.pth' % (args.task, args.model))
            except Exception as e:
                logging.warning(e)
        elapsed_mltclf_epochs, all_binlb = chckpnt.setdefault('mltclf_epochs', 0), clf.binlb
        # Update parameters
        clf.update_params(task_params=task_params, **ext_params)
        if (use_gpu): clf = _handle_model(clf, dev_id=dev_id, distrb=args.distrb)
        # optmzr_cls = OPTMZR_MAP.setdefault(args.model.split('_')[0], (torch.optim.Adam, {}, None))
        optmzr_cls = config.optmzr if config.optmzr else (torch.optim.Adam, {}, None)
        optimizer = optmzr_cls[0](clf.parameters(), lr=args.lr, weight_decay=args.wdecay, **optmzr_cls[1]) if args.optim == 'adam' else torch.optim.SGD(clf.parameters(), lr=args.lr, momentum=0.9)
        if prv_optimizer: optimizer.load_state_dict(prv_optimizer.state_dict())
        training_steps = int(len(train_ds) / args.bsize) if 'train_ds' in locals() and hasattr(train_ds, '__len__') else args.trainsteps  # train_ds is only built inside the epoch loop below, so fall back to args.trainsteps here
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.wrmprop, num_training_steps=training_steps) if not args.noschdlr and len(optmzr_cls) > 2 and optmzr_cls[2] and optmzr_cls[2] == 'linwarm' else None
        logging.info((optimizer, scheduler))
    else:
        # Build model
        lm_model = gen_mdl(mdl_name, config, pretrained=True if type(args.pretrained) is str and args.pretrained.lower() == 'true' else args.pretrained, use_gpu=use_gpu, distrb=args.distrb, dev_id=dev_id) if mdl_name != 'none' else None
        clf = gen_clf(args.model, config, args.encoder, lm_model=lm_model, mlt_trnsfmr=True if task_type in ['entlmnt', 'sentsim'] and task_params.setdefault('sentsim_func', None) is not None else False, task_params=task_params, use_gpu=use_gpu, distrb=args.distrb, dev_id=dev_id, **ext_params)
        # optmzr_cls = OPTMZR_MAP.setdefault(args.model.split('_')[0], (torch.optim.Adam, {}, None))
        optmzr_cls = config.optmzr if config.optmzr else (torch.optim.Adam, {}, None)
        optimizer = optmzr_cls[0](clf.parameters(), lr=args.lr, weight_decay=args.wdecay, **optmzr_cls[1]) if args.optim == 'adam' else torch.optim.SGD(clf.parameters(), lr=args.lr, momentum=0.9)
        training_steps = int(len(train_ds) / args.bsize) if 'train_ds' in locals() and hasattr(train_ds, '__len__') else args.trainsteps  # train_ds is only built inside the epoch loop below, so fall back to args.trainsteps here
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.wrmprop, num_training_steps=training_steps) if not args.noschdlr and len(optmzr_cls) > 2 and optmzr_cls[2] and optmzr_cls[2] == 'linwarm' else None
        logging.info((optimizer, scheduler))

    # Prepare data
    logging.info('Dataset path: %s' % os.path.join(DATA_PATH, task_path))
    num_clfs = min([len(fs.listf(os.path.join(DATA_PATH, task_path), pattern=r'%s_\d.csv' % x)) for x in ['train', 'dev', 'test']])
    for epoch in range(elapsed_mltclf_epochs, mltclf_epochs):
        logging.info('Global %i epoch(s)...' % epoch)
        clf.reset_global_binlb()
        all_binlb = {}
        for i in range(num_clfs):
            logging.info('Training on the %s sub-dataset...' % iflteng.ordinal(i+1))
            train_ds = task_dstype(os.path.join(DATA_PATH, task_path, 'train_%i.%s' % (i, args.fmt)), task_cols['X'], task_cols['y'], config.encode_func, tokenizer, config, sep='\t', index_col=task_cols['index'], binlb=task_extparms['binlb'] if 'binlb' in task_extparms else None, transforms=trsfms, transforms_kwargs=trsfms_kwargs, mltl=task_extparms.setdefault('mltl', False), **ds_kwargs)
            new_lbs = [k for k in train_ds.binlb.keys() if k not in all_binlb]
            all_binlb.update(dict([(k, v) for k, v in zip(new_lbs, range(len(all_binlb), len(all_binlb)+len(new_lbs)))]))
            if mdl_name.startswith('bert'): train_ds = MaskedLMIterDataset(train_ds) if isinstance(train_ds, BaseIterDataset) else MaskedLMDataset(train_ds)
            lb_trsfm = [x['get_lb'] for x in task_trsfm[1] if 'get_lb' in x]
            if (not args.weight_class or task_type == 'sentsim'):
                class_count = None
            elif len(lb_trsfm) > 0:
                lb_df = train_ds.df[task_cols['y']].apply(lb_trsfm[0])
                class_count = np.array([[1 if lb in y else 0 for lb in train_ds.binlb.keys()] for y in lb_df]).sum(axis=0)
            else:
                lb_df = train_ds.df[task_cols['y']]
                binlb = task_extparms['binlb'] if 'binlb' in task_extparms and type(task_extparms['binlb']) is not str else train_ds.binlb
                class_count = lb_df.value_counts()[binlb.keys()].values
            if (class_count is None):
                class_weights = None
                sampler = None
            else:
                class_weights = torch.Tensor(1.0 / class_count)
                class_weights /= class_weights.sum()
                class_weights *= (args.clswfac[min(len(args.clswfac)-1, i)] if type(args.clswfac) is list else args.clswfac)
                sampler = WeightedRandomSampler(weights=class_weights, num_samples=args.bsize, replacement=True)
                if type(dev_id) is list: class_weights = class_weights.repeat(len(dev_id))
            train_loader = DataLoader(train_ds, batch_size=args.bsize, shuffle=False, sampler=None, num_workers=args.np, drop_last=args.droplast)  # the weighted sampler built above is not passed in (see the WeightedRandomSampler note in classify())

            dev_ds = task_dstype(os.path.join(DATA_PATH, task_path, 'dev_%i.%s' % (i, args.fmt)), task_cols['X'], task_cols['y'], config.encode_func, tokenizer, config, sep='\t', index_col=task_cols['index'], binlb=task_extparms['binlb'] if 'binlb' in task_extparms and type(task_extparms['binlb']) is not str else all_binlb, transforms=trsfms, transforms_kwargs=trsfms_kwargs, mltl=task_extparms.setdefault('mltl', False), **ds_kwargs)
            if mdl_name.startswith('bert'): dev_ds = MaskedLMIterDataset(dev_ds) if isinstance(dev_ds, BaseIterDataset) else MaskedLMDataset(dev_ds)
            dev_loader = DataLoader(dev_ds, batch_size=args.bsize, shuffle=False, num_workers=args.np)
            test_ds = task_dstype(os.path.join(DATA_PATH, task_path, 'test_%i.%s' % (i, args.fmt)), task_cols['X'], task_cols['y'], config.encode_func, tokenizer, config, sep='\t', index_col=task_cols['index'], binlb=task_extparms['binlb'] if 'binlb' in task_extparms and type(task_extparms['binlb']) is not str else all_binlb, transforms=trsfms, transforms_kwargs=trsfms_kwargs, mltl=task_extparms.setdefault('mltl', False), **ds_kwargs)
            if mdl_name.startswith('bert'): test_ds = MaskedLMIterDataset(test_ds) if isinstance(test_ds, BaseIterDataset) else MaskedLMDataset(test_ds)
            test_loader = DataLoader(test_ds, batch_size=args.bsize, shuffle=False, num_workers=args.np)
            logging.debug(('binlb', train_ds.binlb, dev_ds.binlb, test_ds.binlb))

            # Adjust the model
            clf.get_linear(binlb=train_ds.binlb, idx=i)

            # Training on splitted datasets
            train(clf, optimizer, train_loader, config, special_tknids_args, scheduler=scheduler, pad_val=(task_extparms.setdefault('xpad_val', 0), train_ds.binlb[task_extparms.setdefault('ypad_val', 0)]) if task_type=='nmt' else task_extparms.setdefault('xpad_val', 0), weights=class_weights, lmcoef=args.lmcoef, clipmaxn=args.clipmaxn, epochs=args.epochs, earlystop=args.earlystop, earlystop_delta=args.es_delta, earlystop_patience=args.es_patience, task_type=task_type, task_name=args.task, mdl_name=args.model, use_gpu=use_gpu, devq=dev_id, resume=resume if args.resume else {}, chckpnt_kwargs=dict(mltclf_epochs=epoch))

            # Adjust the model
            clf_trnsfmr = MultiClfTransformer(clf)
            clf_trnsfmr.merge_linear(num_linear=i+1)
            clf.linear = _handle_model(clf.linear, dev_id=dev_id, distrb=args.distrb)

            # Evaluating on the accumulated dev and test sets
            eval(clf, dev_loader, config, dev_ds.binlbr, special_tknids_args, pad_val=(task_extparms.setdefault('xpad_val', 0), train_ds.binlb[task_extparms.setdefault('ypad_val', 0)]) if task_type=='nmt' else task_extparms.setdefault('xpad_val', 0), task_type=task_type, task_name=args.task, ds_name='dev', mdl_name=args.model, use_gpu=use_gpu, ignored_label=task_extparms.setdefault('ignored_label', None))
            eval(clf, test_loader, config, test_ds.binlbr, special_tknids_args, pad_val=(task_extparms.setdefault('xpad_val', 0), train_ds.binlb[task_extparms.setdefault('ypad_val', 0)]) if task_type=='nmt' else task_extparms.setdefault('xpad_val', 0), task_type=task_type, task_name=args.task, ds_name='test', mdl_name=args.model, use_gpu=use_gpu, ignored_label=task_extparms.setdefault('ignored_label', None))
        global_all_binlb.update(all_binlb)
        # clf.binlb = all_binlb
        # clf.binlbr = dict([(v, k) for k, v in all_binlb.items()])
    else:  # for-else: runs once the epoch loop above completes without a break
        if orig_epochs > 0:
            try:
                save_model(clf, optimizer, '%s_%s.pth' % (args.task, args.model), devq=dev_id, distrb=args.distrb)
            except Exception as e:
                logging.warning(e)
    args.epochs = orig_epochs

    if args.noeval: return
    dev_ds = task_dstype(os.path.join(DATA_PATH, task_path, 'dev.%s' % args.fmt), task_cols['X'], task_cols['y'], config.encode_func, tokenizer, config, sep='\t', index_col=task_cols['index'], binlb=task_extparms['binlb'] if 'binlb' in task_extparms and type(task_extparms['binlb']) is not str else all_binlb, transforms=trsfms, transforms_kwargs=trsfms_kwargs, mltl=task_extparms.setdefault('mltl', False), **ds_kwargs)
    if mdl_name.startswith('bert'): dev_ds = MaskedLMIterDataset(dev_ds) if isinstance(dev_ds, BaseIterDataset) else MaskedLMDataset(dev_ds)
    dev_loader = DataLoader(dev_ds, batch_size=args.bsize, shuffle=False, num_workers=args.np)
    test_ds = task_dstype(os.path.join(DATA_PATH, task_path, 'test.%s' % args.fmt), task_cols['X'], task_cols['y'], config.encode_func, tokenizer, config, sep='\t', index_col=task_cols['index'], binlb=task_extparms['binlb'] if 'binlb' in task_extparms and type(task_extparms['binlb']) is not str else all_binlb, transforms=trsfms, transforms_kwargs=trsfms_kwargs, mltl=task_extparms.setdefault('mltl', False), **ds_kwargs)
    if mdl_name.startswith('bert'): test_ds = MaskedLMIterDataset(test_ds) if isinstance(test_ds, BaseIterDataset) else MaskedLMDataset(test_ds)
    test_loader = DataLoader(test_ds, batch_size=args.bsize, shuffle=False, num_workers=args.np)

    # Evaluation
    eval(clf, dev_loader, config, dev_ds.binlbr, special_tknids_args, pad_val=(task_extparms.setdefault('xpad_val', 0), train_ds.binlb[task_extparms.setdefault('ypad_val', 0)]) if task_type=='nmt' else task_extparms.setdefault('xpad_val', 0), task_type=task_type, task_name=args.task, ds_name='dev', mdl_name=args.model, use_gpu=use_gpu, ignored_label=task_extparms.setdefault('ignored_label', None))
    if args.traindev: train(clf, optimizer, dev_loader, config, special_tknids_args, scheduler=scheduler, pad_val=(task_extparms.setdefault('xpad_val', 0), train_ds.binlb[task_extparms.setdefault('ypad_val', 0)]) if task_type=='nmt' else task_extparms.setdefault('xpad_val', 0), weights=class_weights, lmcoef=args.lmcoef, clipmaxn=args.clipmaxn, epochs=orig_epochs, earlystop=args.earlystop, earlystop_delta=args.es_delta, earlystop_patience=args.es_patience, task_type=task_type, task_name=args.task, mdl_name=args.model, use_gpu=use_gpu, devq=dev_id)
    eval(clf, test_loader, config, test_ds.binlbr, special_tknids_args, pad_val=(task_extparms.setdefault('xpad_val', 0), train_ds.binlb[task_extparms.setdefault('ypad_val', 0)]) if task_type=='nmt' else task_extparms.setdefault('xpad_val', 0), task_type=task_type, task_name=args.task, ds_name='test', mdl_name=args.model, use_gpu=use_gpu, ignored_label=task_extparms.setdefault('ignored_label', None))
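
Each round of multi_clf grows a global label map: labels unseen in earlier sub-datasets are appended with the next free indices, so indices assigned in earlier rounds stay stable. A standalone sketch of that merge logic:

def merge_binlb(all_binlb, new_binlb):
    # append only the unseen labels, numbering them after the existing ones
    new_lbs = [k for k in new_binlb if k not in all_binlb]
    all_binlb.update({k: i for i, k in enumerate(new_lbs, start=len(all_binlb))})

all_binlb = {}
merge_binlb(all_binlb, {'A': 0, 'B': 1})
merge_binlb(all_binlb, {'B': 0, 'C': 1})
print(all_binlb)  # -> {'A': 0, 'B': 1, 'C': 2}
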
Code example #8
File: chm_annot.py Project: smsinks/chmannot
def gen_mdl_params(rdtune=False):
    common_cfg = cfgr('chm_annot', 'common')
    pr = io.param_reader(
        os.path.join(PAR_DIR, 'etc',
                     '%s.yaml' % common_cfg.setdefault('mdl_cfg', 'mdlcfg')))
    if (rdtune):
        for mdl_name, mdl, params in [
                # ('Logistic Regression', LogisticRegression(dual=False), {
                # 'param_dist':dict(
                # penalty=['l1', 'l2'],
                # C=np.logspace(-5, 5, 11),
                # tol=np.logspace(-6, 3, 10)),
                # 'n_iter':30
                # }),
                # ('LinearSVC', LinearSVC(dual=False), {
                # 'param_dist':dict(
                # penalty=['l1', 'l2'],
                # C=np.logspace(-5, 5, 11),
                # tol=np.logspace(-6, 3, 10)),
                # 'n_iter':30
                # }),
                # ('Perceptron', Perceptron(), {
                # 'param_dist':dict(
                # alpha=np.logspace(-6, 3, 10),
                # n_iter=stats.randint(3, 20)),
                # 'n_iter':30
                # }),
                # ('MultinomialNB', MultinomialNB(), {
                # 'param_dist':dict(
                # alpha=np.logspace(-6, 3, 10),
                # fit_prior=[True, False]),
                # 'n_iter':30
                # }),
                # ('SVM', SVC(), {
                # 'param_dist':dict(
                # kernel=['linear', 'rbf', 'poly'],
                # C=np.logspace(-5, 5, 11),
                # gamma=np.logspace(-6, 3, 10)),
                # 'n_iter':30
                # }),
                # ('Extra Trees', ExtraTreesClassifier(random_state=0), {
                # 'param_dist':dict(
                # n_estimators=[50, 100] + range(200, 1001, 200),
                # max_features=np.linspace(0.5, 1, 6).tolist()+['sqrt', 'log2'],
                # min_samples_leaf=[1]+range(10, 101, 10),
                # class_weight=['balanced', None]),
                # 'n_iter':30
                # }),
            ('Random Forest', RandomForestClassifier(random_state=0), {
                'param_dist':
                dict(n_estimators=[50, 100] + list(range(200, 1001, 200)),
                     max_features=np.linspace(0.5, 1, 6).tolist() +
                     ['sqrt', 'log2'],
                     max_depth=[None] + list(range(10, 101, 10)),
                     min_samples_leaf=[1] + list(range(10, 101, 10)),
                     class_weight=['balanced', None]),
                'n_iter': 30
            }),
                # ('Bagging LinearSVC', BaggingClassifier(base_estimator=build_model(LinearSVC, 'Classifier', 'LinearSVC', tuned=opts.best, pr=pr, mltl=opts.mltl, loss='squared_hinge', dual=False), random_state=0), {
                # 'param_dist':dict(
                # n_estimators=[50, 100] + range(200, 1001, 200),
                # max_samples=np.linspace(0.5, 1, 6),
                # max_features=np.linspace(0.5, 1, 6),
                # bootstrap=[True, False],
                # bootstrap_features=[True, False]),
                # 'n_iter':30
                # }),
                # ('AdaBoost LinearSVC', AdaBoostClassifier(base_estimator=build_model(SVC, 'Classifier', 'SVM', tuned=opts.best, pr=pr, mltl=opts.mltl), algorithm='SAMME', random_state=0), {
                # 'param_dist':dict(
                # n_estimators=[50, 100] + range(200, 1001, 200),
                # learning_rate=np.linspace(0.5, 1, 6)),
                # 'n_iter':30
                # }),
                # ('GB LinearSVC', GradientBoostingClassifier(random_state=0), {
                # 'param_dist':dict(
                # n_estimators=[50, 100] + range(200, 1001, 200),
                # subsample = np.linspace(0.5, 1, 6),
                # max_features=np.linspace(0.5, 1, 6).tolist()+['sqrt', 'log2'],
                # min_samples_leaf=[1]+range(10, 101, 10),
                # learning_rate=np.linspace(0.5, 1, 6),
                # loss=['deviance', 'exponential']),
                # 'n_iter':30
                # }),
                # ('UGSS & RF', Pipeline([('featfilt', ftslct.MSelectKBest(ftslct.utopk, filtfunc=ftslct.gss_coef, fn=4000)), ('clf', RandomForestClassifier())]), {
                # 'param_dist':dict(
                # featfilt__k=np.logspace(np.log2(250), np.log2(32000), 8, base=2).astype('int')),
                # 'n_iter':8
                # }),
        ]:
            yield mdl_name, mdl, params
    else:
        for mdl_name, mdl, params in [
                # ('Logistic Regression', LogisticRegression(dual=False), {
                # 'param_grid':dict(
                # penalty=['l1', 'l2'],
                # C=np.logspace(-5, 5, 11),
                # tol=np.logspace(-6, 3, 10))
                # }),
                # ('LinearSVC', LinearSVC(dual=False), {
                # 'param_grid':dict(
                # penalty=['l1', 'l2'],
                # C=np.logspace(-5, 5, 11),
                # tol=np.logspace(-6, 3, 10))
                # }),
                # ('Perceptron', Perceptron(), {
                # 'param_grid':dict(
                # alpha =np.logspace(-5, 5, 11),
                # n_iter=range(3, 20, 3))
                # }),
                # ('MultinomialNB', MultinomialNB(), {
                # 'param_grid':dict(
                # alpha=np.logspace(-6, 3, 10),
                # fit_prior=[True, False])
                # }),
                # ('SVM', SVC(), {
                # 'param_grid':dict(
                # kernel=['linear', 'rbf', 'poly'],
                # C=np.logspace(-5, 5, 11),
                # gamma=np.logspace(-6, 3, 10))
                # }),
                # ('Extra Trees', ExtraTreesClassifier(random_state=0), {
                # 'param_grid':dict(
                # n_estimators=[50, 100] + range(200, 1001, 200),
                # max_features=np.linspace(0.5, 1, 6).tolist()+['sqrt', 'log2'],
                # min_samples_leaf=[1]+range(10, 101, 10),
                # class_weight=['balanced', None])
                # }),
            ('Random Forest', RandomForestClassifier(random_state=0), {
                'param_grid':
                dict(n_estimators=[50, 100] + list(range(200, 1001, 200)),
                     max_features=np.linspace(0.5, 1, 6).tolist() +
                     ['sqrt', 'log2'],
                     max_depth=[None] + list(range(10, 101, 10)),
                     min_samples_leaf=[1] + list(range(10, 101, 10)),
                     class_weight=['balanced', None])
            }),
                # ('Bagging LinearSVC', BaggingClassifier(base_estimator=build_model(LinearSVC, 'Classifier', 'LinearSVC', tuned=opts.best, pr=pr, mltl=opts.mltl, loss='squared_hinge', dual=False), random_state=0), {
                # 'param_grid':dict(
                # n_estimators=[50, 100] + range(200, 1001, 200),
                # max_samples=np.linspace(0.5, 1, 6),
                # max_features=np.linspace(0.5, 1, 6),
                # bootstrap=[True, False],
                # bootstrap_features=[True, False])
                # }),
                # ('AdaBoost LinearSVC', AdaBoostClassifier(base_estimator=build_model(SVC, 'Classifier', 'SVM', tuned=opts.best, pr=pr, mltl=opts.mltl), algorithm='SAMME', random_state=0), {
                # 'param_grid':dict(
                # n_estimators=[50, 100] + range(200, 1001, 200),
                # learning_rate=np.linspace(0.5, 1, 6))
                # }),
                # ('GB LinearSVC', GradientBoostingClassifier(random_state=0), {
                # 'param_grid':dict(
                # n_estimators=[50, 100] + range(200, 1001, 200),
                # subsample = np.linspace(0.5, 1, 6),
                # max_features=np.linspace(0.5, 1, 6).tolist()+['sqrt', 'log2'],
                # min_samples_leaf=[1]+range(10, 101, 10),
                # learning_rate = np.linspace(0.5, 1, 6),
                # loss=['deviance', 'exponential'])
                # }),
                # ('UDT & RF', Pipeline([('featfilt', ftslct.MSelectKBest(ftslct.utopk, filtfunc=ftslct.decision_tree, fn=4000)), ('clf', RandomForestClassifier())]), {
                # 'param_grid':dict(
                # featfilt__k=np.logspace(np.log2(250), np.log2(32000), 8, base=2).astype('int'))
                # }),
                # ('DT & RF', Pipeline([('featfilt', ftslct.MSelectKBest(ftslct.decision_tree)), ('clf', RandomForestClassifier())]), {
                # 'param_grid':dict(
                # featfilt__k=np.logspace(np.log2(250), np.log2(32000), 8, base=2).astype('int'))
                # }),
                # ('UNGL & RF', Pipeline([('featfilt', ftslct.MSelectKBest(ftslct.utopk, filtfunc=ftslct.ngl_coef, fn=4000)), ('clf', RandomForestClassifier())]), {
                # 'param_grid':dict(
                # featfilt__k=np.logspace(np.log2(250), np.log2(32000), 8, base=2).astype('int'))
                # }),
                # ('NGL & RF', Pipeline([('featfilt', ftslct.MSelectKBest(ftslct.ngl_coef)), ('clf', RandomForestClassifier())]), {
                # 'param_grid':dict(
                # featfilt__k=np.logspace(np.log2(250), np.log2(32000), 8, base=2).astype('int'))
                # }),
                # ('UGSS & RF', Pipeline([('featfilt', ftslct.MSelectKBest(ftslct.utopk, filtfunc=ftslct.gss_coef, fn=4000)), ('clf', RandomForestClassifier())]), {
                # 'param_grid':dict(
                # featfilt__k=np.logspace(np.log2(250), np.log2(32000), 8, base=2).astype('int'))
                # }),
                # ('GSS & RF', Pipeline([('featfilt', ftslct.MSelectKBest(ftslct.gss_coef)), ('clf', RandomForestClassifier())]), {
                # 'param_grid':dict(
                # featfilt__k=np.logspace(np.log2(250), np.log2(32000), 8, base=2).astype('int'))
                # })
        ]:
            yield mdl_name, mdl, params
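
The (name, model, params) triples yielded above are presumably consumed by a search driver: 'param_dist' plus 'n_iter' feed RandomizedSearchCV, while 'param_grid' feeds GridSearchCV. A hypothetical sketch of such a driver on toy data:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

X, y = make_classification(n_samples=100, n_features=20, random_state=0)
candidates = [('Random Forest', RandomForestClassifier(random_state=0),
               {'param_dist': {'n_estimators': [50, 100, 200]}, 'n_iter': 2})]
for name, mdl, params in candidates:
    if 'param_dist' in params:
        search = RandomizedSearchCV(mdl, params['param_dist'], n_iter=params['n_iter'], cv=3)
    else:
        search = GridSearchCV(mdl, params['param_grid'], cv=3)
    search.fit(X, y)
    print(name, search.best_params_)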