import os
import sys
import glob
import json
import time
import random

import numpy as np
import pandas as pd

from sklearn import metrics, preprocessing
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Module-local helpers used below (learners, read_data, logloss, rates,
# param_search, crossvalidation, findfiles, get_dbs_header) are defined
# elsewhere in this package; the learners mapping lives in DCAF.ml.clf.

def checker(predictions, y_true, probabilities, scorer, verbose=False, plainout=False, tier=None):
    "Check our model predictions and dump logloss and scorer values"
    if verbose:
        loss = 0
        tot = 0
        if tier:
            print("Predictions for tier %s:" % tier)
        for pval, yval in zip(predictions, y_true):
            print("predict value %s, real value %s" % (pval, yval))
            loss += logloss(pval, yval)
            tot += 1
        print("Final Logloss:", loss / tot)
    plain = ""
    if not scorer:
        print("ERROR: no scorer provided, please see --help for their list")
        sys.exit(1)
    # rate-style metrics are served by the module-local rates() helper;
    # anything else is resolved through sklearn metrics
    slist = ['tp', 'tn', 'fp', 'fn', 'tpr', 'tnr', 'fpr', 'fnr', 'auc']
    rdict = None  # cache rates() output so it is computed once for all rate metrics
    for scr in scorer.split(','):
        if scr.lower() in slist:
            if rdict is None:
                rdict = rates(y_true, predictions, probabilities)
            if plainout:
                plain += str(rdict[scr.lower()]) + ','
            else:
                print("Score metric (%s): %s" % (scr.upper(), rdict[scr.lower()]))
            continue
        # extract the underlying metric function name from the scorer repr,
        # e.g. repr(make_scorer(accuracy_score)) -> 'accuracy_score'; note
        # that metrics.SCORERS was removed in scikit-learn 1.3
        scr_str = repr(metrics.SCORERS[scr]).replace('make_scorer(', '').replace(')', '')
        method = scr_str.split(',')[0]
        res = getattr(metrics, method)(y_true, predictions)
        if plainout:
            plain += str(res) + ','
        else:
            print("Score metric (%s): %s" % (method, res))
    if plainout:
        # emit one CSV row, stripping the trailing comma
        if tier:
            print("%s,%s" % (tier, plain[:-1]))
        else:
            print(plain[:-1])
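# --- illustrative sketch, not part of the original module ----------------
# checker() above accumulates a per-sample logloss() that is defined
# elsewhere in this package. The helper below is a hedged sketch of the
# standard binary log loss it presumably computes; the name _logloss_sketch
# and the clipping epsilon are assumptions made for illustration only.
def _logloss_sketch(pred, act, eps=1e-15):
    "Standard binary log loss for a single prediction/label pair (sketch)"
    import math  # local import keeps the sketch self-contained
    pred = min(max(pred, eps), 1 - eps)  # clip to avoid log(0)
    return -(act * math.log(pred) + (1 - act) * math.log(1 - pred))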
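# --- illustrative usage, not part of the original module -----------------
# A minimal sketch of how checker() might be called on finished predictions;
# the values and the scorer list below are made up for illustration
# (rate-style scorers such as tp/fp/auc go through the local rates() helper,
# anything else is looked up in sklearn metrics):
#
#     predictions = [0.9, 0.2, 0.8, 0.4]
#     y_true = [1, 0, 1, 1]
#     checker(predictions, y_true, predictions, scorer='tp,fp,auc',
#             plainout=True, tier='tier1')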
def model(train_file, newdata_file, idcol, tcol, learner, lparams=None,
          drops=None, split=0.3, scorer=None, scaler=None, ofile=None,
          idx=0, limit=-1, gsearch=None, crossval=None, seed=123,
          verbose=False, timeout=None, proba=False):
    """
    Build and run ML algorithm for given train/test dataframe
    and classifier name. The learners are defined externally
    in DCAF.ml.clf module.
    """
    clf = learners()[learner]
    if proba and not (hasattr(clf, 'predict_proba') and callable(getattr(clf, 'predict_proba'))):
        raise Exception("ERROR: model %s does not provide method 'predict_proba'" % learner)
    if lparams:
        if isinstance(lparams, str):
            lparams = json.loads(lparams)
        elif isinstance(lparams, dict):
            pass
        else:
            raise Exception('Invalid data type for lparams="%s", type: %s' % (lparams, type(lparams)))
        # apply user-supplied hyper-parameters to the learner
        for key, val in lparams.items():
            setattr(clf, key, val)
    setattr(clf, "random_state", seed)
    random.seed(seed)
    if scaler:
        clf = Pipeline([('scaler', getattr(preprocessing, scaler)()), ('clf', clf)])
    print(clf)
    if split:
        # normalize split into a fraction
        if isinstance(split, int):
            split = split / 100.
        elif isinstance(split, float):
            pass
        elif isinstance(split, str):
            split = float(split)
        print("Split level: train %s%%, validation %s%%" % (round((1 - split) * 100), round(split * 100)))
    if verbose:
        print("idx/limit", idx, limit)

    # read data and normalize it
    if drops:
        if isinstance(drops, str):
            drops = drops.split(',')
        if idcol not in drops:
            drops += [idcol]
    else:
        drops = [idcol]
    xdf = read_data(train_file, drops, idx, limit, scaler)

    # get target variable and exclude it from train data
    target = xdf[tcol]
    xdf = xdf.drop(tcol, axis=1)
    if verbose:
        print("Train file", train_file)
        print("Columns:", ','.join(xdf.columns))
        print("train shapes:", xdf.shape, target.shape)
        if verbose > 1:
            print("Target:", tcol, target)

    # split our train data
    if split:
        x_train, x_rest, y_train, y_rest = \
                train_test_split(xdf, target, test_size=split, random_state=seed)
        if verbose:
            print("train shapes after splitting:", x_train.shape, y_train.shape)
    else:
        x_train = xdf
        y_train = target
        x_rest = None
        y_rest = None
    if gsearch:
        param_search(clf, x_train, y_train, x_rest, y_rest, gsearch)
        sys.exit(0)
    if crossval:
        crossvalidation(clf, xdf, target)
        sys.exit(0)
    time0 = time.time()
    fit = clf.fit(x_train, y_train)
    rtime = time.time() - time0
    if verbose:
        print("Train elapsed time", rtime)
    if split:
        if proba:
            print("ERROR in model.py: probabilities not supported in split mode")
            sys.exit(1)
        time0 = time.time()
        predictions = fit.predict(x_rest)
        rtime += time.time() - time0
        try:
            importances = clf.feature_importances_
            if importances.any():
                print("Feature ranking:")
                columns = xdf.columns
                indices = np.argsort(importances)[::-1]
                num = 9 if len(columns) > 9 else len(columns)
                for f in range(num):
                    print("%d. importance %f, feature %s" % (f + 1, importances[indices[f]], columns[indices[f]]))
        except AttributeError:
            pass  # not every learner (or a scaler Pipeline) exposes feature_importances_
        if scorer:
            for scr in scorer.split(','):
                slist = ['tp', 'tn', 'fp', 'fn', 'tpr', 'tnr', 'fpr', 'fnr']
                if scr.lower() in slist:
                    res = rates(y_rest, predictions)
                    print("Score metric (%s): %s" % (scr.upper(), res[scr.lower()]))
                    continue
                scr_str = repr(metrics.SCORERS[scr]).replace('make_scorer(', '').replace(')', '')
                method = scr_str.split(',')[0]
                res = getattr(metrics, method)(y_rest, predictions)
                print("Score metric (%s): %s" % (method, res))
        if verbose:
            loss = 0
            tot = 0
            for pval, yval in zip(predictions, y_rest):
                if verbose > 1:
                    print("predict value %s, real value %s" % (pval, yval))
                loss += logloss(pval, yval)
                tot += 1
            print("Final Logloss", loss / tot)
    else:
        print("Since there is no train/validation splitting, no prediction metrics will be shown")

    # new data file(s) for which we want to predict
    if newdata_file:
        nfiles = []
        if os.path.isfile(newdata_file):
            nfiles = [newdata_file]
        elif newdata_file.find(',') != -1:
            nfiles = newdata_file.split(',')
        elif newdata_file.find('*') != -1:
            nfiles = glob.glob(newdata_file)
        elif os.path.isdir(newdata_file):
            for ext in ['.csv.gz', '.csv', '.csv.bz2']:
                nfiles += [f for f in findfiles(newdata_file, ext)]
        else:
            print("ERROR: no files found for --newdata=%s" % newdata_file)
            sys.exit(1)
        if not len(nfiles):
            print("WARNING: no files to predict in %s" % newdata_file)
            return
        outfname = None
        for ni, nfile in enumerate(nfiles):  # iterate over files to predict
            if len(nfiles) > 1:
                outfname = '%s_%s_%s' % (learner, ofile, ni)
                print("You provided a file list, the output file name %s will be replaced with %s" % (ofile, outfname))
            else:
                outfname = ofile
            tdf = read_data(nfile, drops, scaler=scaler)
            if tcol in tdf.columns:
                tdf = tdf.drop(tcol, axis=1)
            if verbose:
                print("New data file", nfile)
                print("Columns:", ','.join(tdf.columns))
                print("test shapes:", tdf.shape)
            datasets = [int(i) for i in list(tdf.get('dataset', []))]
            if datasets:
                dbs_h = get_dbs_header(tdf, nfile)
                dbses = [int(i) for i in list(tdf[dbs_h])]
            if verbose:
                print(tdf)
            time0 = time.time()
            if proba:
                # probability of the positive class (label 1)
                predictions = np.asarray(fit.predict_proba(tdf))[:, list(fit.classes_).index(1)]
            else:
                predictions = fit.predict(tdf)
            rtime += time.time() - time0
            if datasets:
                out = pd.DataFrame({'dataset': datasets, dbs_h: dbses, 'prediction': predictions})
            else:
                out = pd.DataFrame({'prediction': predictions})
            if outfname:
                out.to_csv(outfname, header=True, index=False)
    if timeout:  # output running time
        data = {}
        if os.path.isfile(timeout):  # merge with existing timing file
            headers = []
            for line in open(timeout, 'r'):
                line = line.strip(" \r\n").split(',')
                if not headers:
                    headers = line
                    if line[0] != 'model' or line[1] != 'running_time_s':
                        print("Error writing model running time to %s: unrecognized output file found." % timeout)
                        break  # do not parse the rest of an unrecognized file
                else:
                    data[line[0]] = float(line[1])
        if learner in data:
            data[learner] += rtime
        else:
            data[learner] = rtime
        fstream = open(timeout, 'w')
        fstream.write("model,running_time_s\n")
        for key in sorted(data.keys()):
            fstream.write("%s,%.3f\n" % (key, data[key]))
        fstream.close()
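# --- illustrative usage, not part of the original module -----------------
# A hedged sketch of how model() might be driven; the file names, column
# names and learner key below are hypothetical and must match the learners()
# mapping defined in DCAF.ml.clf as well as the actual CSV layout.
if __name__ == '__main__':
    model(train_file='train.csv.gz',         # hypothetical training data
          newdata_file='new.csv.gz',         # hypothetical data to predict
          idcol='id', tcol='target',         # hypothetical id/target columns
          learner='RandomForestClassifier',  # hypothetical key in learners()
          scorer='accuracy_score',
          ofile='predictions.csv',
          timeout='running_time.csv',
          verbose=True)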