Example #1
0
def checker(predictions,
            y_true,
            probabilities,
            scorer,
            verbose=False,
            plainout=False,
            tier=None):
    """
    Check our model prediction and dump logloss value and score metrics.

    :param predictions: iterable of predicted values
    :param y_true: iterable of real values, paired with predictions
    :param probabilities: passed through to rates() for rate-based scorers
    :param scorer: comma separated list of scorer names; either one of the
        rate names (tp, tn, fp, fn, tpr, tnr, fpr, fnr, auc, case-insensitive)
        or a key of metrics.SCORERS
    :param verbose: if set, print every prediction pair and the mean logloss
    :param plainout: if set, print all scores as a single comma separated line
        instead of one "Score metric" line per scorer
    :param tier: optional label printed with the predictions and used as a
        prefix of the plain output line

    Exits the process with status 1 if no scorer is provided.
    """
    if verbose:
        loss = 0
        tot = 0
        if tier:
            print("Predictions for tier %s       :" % tier)
        for pval, yval in zip(predictions, y_true):
            print("predict value %s, real value %s" % (pval, yval))
            loss += logloss(pval, yval)
            tot += 1
        if tot:
            print("Final Logloss          :", loss / tot)
        else:
            # nothing to average; avoid ZeroDivisionError on empty input
            print("Final Logloss          :", "n/a (no predictions)")
    # sklearn metrics for regression
    if not scorer:
        print("ERROR: no scorer provided, please see --help for their list")
        sys.exit(1)
    rate_names = ('tp', 'tn', 'fp', 'fn', 'tpr', 'tnr', 'fpr', 'fnr', 'auc')
    # rates are computed at most once and kept apart from sklearn metric
    # values, otherwise a metric result would be mistaken for the rates dict
    rate_res = None
    values = []
    for scr in scorer.split(','):
        key = scr.lower()
        if key in rate_names:
            if rate_res is None:
                rate_res = rates(y_true, predictions, probabilities)
            if plainout:
                values.append(str(rate_res[key]))
            else:
                print("Score metric (%s): %s" % (scr.upper(), rate_res[key]))
            continue
        # recover the name of the underlying metric function from the
        # scorer representation, e.g. make_scorer(accuracy_score)
        scr_str = repr(metrics.SCORERS[scr]).replace('make_scorer(',
                                                     '').replace(')', '')
        method = scr_str.split(',')[0]
        res = getattr(metrics, method)(y_true, predictions)
        if plainout:
            values.append(str(res))
        else:
            print("Score metric (%s): %s" % (method, res))
    if plainout:
        plain = ','.join(values)
        if tier:
            print("%s,%s" % (tier, plain))
        else:
            print(plain)
Example #2
0
def checker(predictions, y_true, probabilities, scorer, verbose=False, plainout=False, tier=None):
    """
    Check our model prediction and dump logloss value and score metrics.

    The scorer argument is a comma separated list of names. Rate names
    (tp, tn, fp, fn, tpr, tnr, fpr, fnr, auc; case-insensitive) are served
    from a single rates() call, any other name is looked up in
    metrics.SCORERS. With plainout the scores are printed as one comma
    separated line (prefixed by tier if given), otherwise one line per score.
    In verbose mode every prediction pair and the mean logloss are printed.
    Exits the process with status 1 if no scorer is provided.
    """
    if  verbose:
        if  tier:
            print("Predictions for tier %s       :" % tier)
        loss = 0
        tot = 0
        for pval, yval in zip(predictions, y_true):
            print("predict value %s, real value %s" % (pval, yval))
            loss += logloss(pval, yval)
            tot += 1
        # guard against empty predictions, loss/tot would raise otherwise
        print("Final Logloss          :", loss/tot if tot else "n/a (no predictions)")
    # sklearn metrics for regression
    if  not scorer:
        print("ERROR: no scorer provided, please see --help for their list")
        sys.exit(1)
    slist = ['tp', 'tn', 'fp', 'fn', 'tpr', 'tnr', 'fpr', 'fnr', 'auc']
    # cache of rates() output; kept in its own variable since metric values
    # computed below must never be indexed as if they were the rates dict
    cached_rates = None
    scores = []
    for scr in scorer.split(','):
        if  scr.lower() in slist:
            if  cached_rates is None:
                cached_rates = rates(y_true, predictions, probabilities)
            value = cached_rates[scr.lower()]
            label = scr.upper()
        else:
            # metric function name is extracted from the scorer representation
            scr_str = repr(metrics.SCORERS[scr]).replace('make_scorer(', '').replace(')', '')
            label = scr_str.split(',')[0]
            value = getattr(metrics, label)(y_true, predictions)
        if  plainout:
            scores.append(str(value))
        else:
            print("Score metric (%s): %s" % (label, value))
    if  plainout:
        if  tier:
            print("%s,%s" % (tier, ','.join(scores)))
        else:
            print(','.join(scores))
Example #3
0
def model(train_file, newdata_file, idcol, tcol, learner, lparams=None,
        drops=None, split=0.3, scorer=None, scaler=None, ofile=None,
        idx=0, limit=-1,  gsearch=None, crossval=None, seed=123,
        verbose=False, timeout=None, proba=False):
    """
    Build and run ML algorithm for given train/test dataframe
    and classifier name. The learners are defined externally
    in DCAF.ml.clf module.

    :param train_file: train data file, read via read_data
    :param newdata_file: data to predict: a file name, a comma separated
        list of files, a glob pattern or a directory
    :param idcol: id column name, always added to the list of drops
    :param tcol: target column name, removed from the train/new data
    :param learner: key of the classifier in learners()
    :param lparams: classifier attributes, either a dict or a JSON string
    :param drops: list (or comma separated string) of columns passed
        to read_data as drops
    :param split: validation fraction; an int is treated as percentage,
        a string is converted to float; a false value disables splitting
    :param scorer: comma separated list of scorers to print in split mode
    :param scaler: name of a class from preprocessing put in front of the
        classifier in a Pipeline; also passed to read_data
    :param ofile: output file name for predictions (CSV)
    :param idx: passed to read_data for the train file
    :param limit: passed to read_data for the train file
    :param gsearch: if set, run param_search and exit the process
    :param crossval: if set, run crossvalidation and exit the process
    :param seed: random seed used for the classifier, random module and split
    :param verbose: verbosity level, values above 1 print more details
    :param timeout: name of CSV file where model running time is accumulated
    :param proba: if set, predict probability of class 1 instead of the class;
        not supported in split mode
    """
    string_types = _string_types()
    clf = learners()[learner]
    if  proba and not callable(getattr(clf, 'predict_proba', None)):
        raise Exception("ERROR: model %s does not provide method 'predict_proba'" % learner)
    if  lparams:
        if  isinstance(lparams, string_types):
            lparams = json.loads(lparams)
        elif not isinstance(lparams, dict):
            raise Exception('Invalid data type for lparams="%s", type: %s' % (lparams, type(lparams)))
        for key, val in lparams.items():
            setattr(clf, key, val)
    setattr(clf, "random_state", seed)
    random.seed(seed)
    if  scaler:
        clf = Pipeline([('scaler',getattr(preprocessing, scaler)()), ('clf', clf)])
    print(clf)
    if  split:
        if  isinstance(split, int):
            split = split/100.
        elif isinstance(split, string_types):
            split = float(split)
        print("Split level: train %s%%, validation %s%%" % (round((1-split)*100), round(split*100)))
    if  verbose:
        print("idx/limit", idx, limit)

    # read data and normalize it
    if  drops:
        if  isinstance(drops, string_types):
            drops = drops.split(',')
        else:
            # work on a copy, the caller's list must not be modified
            drops = list(drops)
        if  idcol not in drops:
            drops.append(idcol)
    else:
        drops = [idcol]
    xdf = read_data(train_file, drops, idx, limit, scaler)

    # get target variable and exclude choice from train data
    target = xdf[tcol]
    xdf = xdf.drop(tcol, axis=1)
    if  verbose:
        print("Train file", train_file)
        print("Columns:", ','.join(xdf.columns))
        print("train shapes:", xdf.shape, target.shape)
        if  verbose>1:
            print("Target:", tcol, target)

    # split our train data
    if  split:
        x_train, x_rest, y_train, y_rest = \
                train_test_split(xdf, target, test_size=split, random_state=seed)
        if  verbose:
            print("train shapes after splitting:", x_train.shape, y_train.shape)
    else:
        x_train = xdf
        y_train = target
        x_rest = None
        y_rest = None
    if  gsearch:
        param_search(clf, x_train, y_train, x_rest, y_rest, gsearch)
        sys.exit(0)
    if  crossval:
        crossvalidation(clf, xdf, target)
        sys.exit(0)

    time0 = time.time()
    fit = clf.fit(x_train, y_train)
    rtime = time.time()-time0
    if  verbose:
        print("Train elapsed time", rtime)
    if  split:
        if  proba:
            print("ERROR in model.py: probabilities not supported in split mode")
            sys.exit(1)
        time0 = time.time()
        predictions = fit.predict(x_rest)
        rtime += time.time()-time0
        # feature ranking is best-effort: not every learner (e.g. a Pipeline)
        # exposes feature_importances_
        try:
            importances = clf.feature_importances_
            if  importances.any():
                print("Feature ranking:")
                columns = xdf.columns
                indices = np.argsort(importances)[::-1]
                num = min(9, len(columns))
                for f in range(num):
                    print("%d. importance %f, feature %s" % (f + 1, importances[indices[f]], columns[indices[f]]))
        except Exception as exc:
            if  verbose:
                print("Feature ranking is not available:", exc)
        if  scorer:
            slist = ['tp', 'tn', 'fp', 'fn', 'tpr', 'tnr', 'fpr', 'fnr']
            for scr in scorer.split(','):
                if  scr.lower() in slist:
                    res = rates(y_rest, predictions)
                    print("Score metric (%s): %s" % (scr.upper(), res[scr.lower()]))
                    continue
                # metric function name is extracted from scorer representation
                scr_str = repr(metrics.SCORERS[scr]).replace('make_scorer(', '').replace(')', '')
                method = scr_str.split(',')[0]
                res = getattr(metrics, method)(y_rest, predictions)
                print("Score metric (%s): %s" % (method, res))
        if  verbose:
            loss = 0
            tot = 0
            for pval, yval in zip(predictions, y_rest):
                if  verbose>1:
                    print("predict value %s, real value %s" % (pval, yval))
                loss += logloss(pval, yval)
                tot += 1
            if  tot:
                print("Final Logloss", loss/tot)
    else:
        print("Since there is no train/validation splitting, no prediction metrics will be shown")

    # new data file for which we want to predict
    if  newdata_file:
        nfiles = _find_newdata_files(newdata_file)
        if  not nfiles:
            print("WARNING: no files to predict in %s" % newdata_file)
            return
        for ni, nfile in enumerate(nfiles): # iterate on files to predict
            if  len(nfiles) > 1:
                outfname = '%s_%s_%s' % (learner, ofile, ni)
                print("You provided file list, the output file name %s will be replaced with %s_%s_%s" % (ofile, learner, ofile, ni))
            else:
                outfname = ofile
            tdf = read_data(nfile, drops, scaler=scaler)
            if  tcol in tdf.columns:
                tdf = tdf.drop(tcol, axis=1)
            if  verbose:
                print("New data file", nfile)
                print("Columns:", ','.join(tdf.columns))
                print("test shapes:", tdf.shape)
            datasets = [int(i) for i in list(tdf.get('dataset', []))]
            if  datasets:
                dbs_h = get_dbs_header(tdf, nfile)
                dbses = [int(i) for i in list(tdf[dbs_h])]
            if  verbose:
                print(tdf)
            time0 = time.time()
            if  proba:
                # keep only the probability column of class 1
                pos_idx = list(fit.classes_).index(1)
                predictions = np.asarray(fit.predict_proba(tdf))[:, pos_idx]
            else:
                predictions = fit.predict(tdf)
            rtime += time.time()-time0
            if  datasets:
                out = pd.DataFrame({'dataset':datasets, dbs_h: dbses, 'prediction':predictions})
            else:
                out = pd.DataFrame({'prediction':predictions})
            if  outfname:
                out.to_csv(outfname, header=True, index=False)
        # rtime is cumulative over all files, so it is recorded only once;
        # recording it per file would count earlier time repeatedly
        if  timeout: # output running time
            _record_running_time(timeout, learner, rtime)


def _string_types():
    "Return the base string type of the running interpreter (Python 2 or 3)"
    try:
        return basestring
    except NameError:
        return str


def _find_newdata_files(newdata_file):
    """
    Resolve newdata_file into a list of files to predict. It can be an
    existing file, a comma separated list, a glob pattern or a directory.
    Exits the process with status 1 if none of these cases applies.
    """
    if  os.path.isfile(newdata_file):
        return [newdata_file]
    if  newdata_file.find(',') != -1:
        return newdata_file.split(',')
    if  newdata_file.find('*') != -1:
        return glob.glob(newdata_file)
    if  os.path.isdir(newdata_file):
        nfiles = []
        # collect files for every supported extension, skipping duplicates
        for ext in ['.csv.gz', '.csv', 'csv.bz2']:
            for fname in findfiles(newdata_file, ext):
                if  fname not in nfiles:
                    nfiles.append(fname)
        return nfiles
    print("ERROR: no files found for --newdata=%s" % newdata_file)
    sys.exit(1)


def _record_running_time(timeout, learner, rtime):
    """
    Add rtime (seconds) to the learner entry of the timeout CSV file with
    model,running_time_s columns; entries of an existing file are preserved.
    """
    data = {}
    if  os.path.isfile(timeout): # append if file exists
        with open(timeout, 'r') as istream:
            header_seen = False
            for line in istream:
                fields = line.strip(" \r\n").split(',')
                if  not header_seen:
                    header_seen = True
                    if  fields[:2] != ['model', 'running_time_s']:
                        print("Error writing model running time to %s: unrecognized output file found." % timeout)
                    continue
                if  len(fields) < 2: # skip blank or malformed lines
                    continue
                data[fields[0]] = float(fields[1])
    data[learner] = data.get(learner, 0) + rtime
    with open(timeout, 'w') as ostream:
        ostream.write("model,running_time_s\n")
        for key in sorted(data):
            ostream.write("%s,%.3f\n" % (key, data[key]))