def main():
    "Entry point: parse options, then run either model or model_iter"
    optmgr = OptionParser(learners().keys(), SCORERS.keys())
    opts, _ = optmgr.options()

    # a learner help request only prints the learner and its doc string
    if opts.learner_help:
        obj = learners()[opts.learner_help]
        print(obj)
        print(obj.__doc__)
        sys.exit(0)

    # fall back to "<learner>.predictions" when no output file is given
    ofile = opts.predict or "%s.predictions" % opts.learner

    iterate = False
    if opts.train.find(',') >= 0:
        # comma-separated list of train files
        train_files = opts.train.split(',')
        iterate = True
    elif os.path.isdir(opts.train):
        # directory name: use the first extension which yields any file
        iterate = True
        for ext in ('.csv.gz', '.csv'):
            train_files = list(files(opts.train, ext))
            if train_files:
                break

    # keyword arguments accepted by both model runners
    shared = dict(newdata_file=opts.newdata,
                  idcol=opts.idcol,
                  tcol=opts.target,
                  learner=opts.learner,
                  lparams=opts.lparams,
                  drops=opts.drops,
                  split=opts.split,
                  scaler=opts.scaler,
                  ofile=ofile,
                  seed=opts.seed,
                  verbose=opts.verbose)

    if iterate:
        model_iter(train_file_list=train_files, **shared)
        return

    model(train_file=opts.train,
          scorer=opts.scorer,
          idx=opts.idx,
          limit=opts.limit,
          gsearch=opts.gsearch,
          crossval=opts.cv,
          timeout=opts.timeout,
          proba=opts.proba,
          **shared)
def model_iter(train_file_list, newdata_file, idcol, tcol,
               learner, lparams=None, drops=None, split=0.1,
               scaler=None, ofile=None, seed=123, verbose=False):
    """
    Incrementally train a learner over a list of train files (one
    partial_fit call per file) and optionally predict on new data.
    The learners are defined externally in DCAF.ml.clf module.

    :param train_file_list: iterable of train file names, each is read
        via read_data
    :param newdata_file: file to predict on; no prediction if empty
    :param idcol: id column name, it is always added to dropped columns
    :param tcol: target column name
    :param learner: learner name, only SGDClassifier and SGDRegressor
        are accepted
    :param lparams: learner attributes as dict or JSON string
    :param drops: columns to drop, a list or a comma separated string
    :param split: validation part of every train file, a fraction
        (float or string) or a percentage (int); a false value disables
        the splitting
    :param scaler: name of a class from preprocessing which is put in
        front of the learner in a Pipeline
    :param ofile: output CSV file for predictions
    :param seed: seed for random module, learner random_state and splitting
    :param verbose: print extra information, including validation score
    :raises Exception: for unsupported learner, invalid lparams type, or
        a prediction request when no train file was processed
    """
    if learner not in ['SGDClassifier', 'SGDRegressor']:
        raise Exception("Unsupported learner %s" % learner)
    clf = learners()[learner]
    setattr(clf, "random_state", seed)
    random.seed(seed)
    if lparams:
        if isinstance(lparams, str):
            lparams = json.loads(lparams)
        elif not isinstance(lparams, dict):
            raise Exception('Invalid data type for lparams="%s", type: %s' \
                    % (lparams, type(lparams)))
        for key, val in lparams.items():
            setattr(clf, key, val)
    if scaler:
        # NOTE(review): the loop below calls partial_fit on this object,
        # confirm that the Pipeline in use provides it
        clf = Pipeline([('scaler', getattr(preprocessing, scaler)()),
                        ('clf', clf)])
    print("clf:", clf)

    # honor the split value (it used to be a flag only, with a hard-coded
    # 0.1 test size), normalized the same way model() does it
    if split:
        if isinstance(split, int):
            split = split/100.
        elif isinstance(split, basestring):
            split = float(split)

    if drops:
        if isinstance(drops, basestring):
            drops = drops.split(',')
        if idcol not in drops:
            # build a new list, the caller's one must stay untouched
            drops = list(drops) + [idcol]
    else:
        drops = [idcol]

    fit = None
    for train_file in train_file_list:
        print("Train file", train_file)
        # read data and normalize it
        xdf = read_data(train_file, drops, scaler=scaler)

        # get target variable and exclude choice from train data
        target = xdf[tcol]
        xdf = xdf.drop(tcol, axis=1)
        if verbose:
            print("Columns:", ','.join(xdf.columns))
            print("Target:", target)

        if split:
            x_train, x_rest, y_train, y_rest = \
                    train_test_split(xdf, target, test_size=split,
                                     random_state=seed)
        else:
            x_train, y_train = xdf, target
            x_rest = y_rest = None

        time0 = time.time()
        # NOTE(review): some classifiers expect the list of classes on the
        # first partial_fit call, confirm for the learners used here
        fit = clf.partial_fit(x_train, y_train)
        if verbose:
            print("Train elapsed time", time.time()-time0)
            if split:
                print("### SCORE", clf.score(x_rest, y_rest))

    # new data for which we want to predict
    if newdata_file:
        if fit is None:
            raise Exception("No train file was processed, "
                            "unable to predict on %s" % newdata_file)
        tdf = read_data(newdata_file, drops, scaler=scaler)
        if tcol in tdf.columns:
            tdf = tdf.drop(tcol, axis=1)
        datasets = [int(i) for i in tdf['dataset']]
        dbs_h = get_dbs_header(tdf, newdata_file)
        dbses = [int(i) for i in tdf[dbs_h]]
        # fall back to predict() for estimators without predict_proba
        # (e.g. regressors) instead of failing with AttributeError
        # NOTE(review): predict_proba output usually has a column per class,
        # confirm it fits into a single 'prediction' column
        if hasattr(fit, 'predict_proba'):
            predictions = fit.predict_proba(tdf)
        else:
            predictions = fit.predict(tdf)
        data = {'dataset': datasets, dbs_h: dbses, 'prediction': predictions}
        out = pd.DataFrame(data=data)
        if ofile:
            out.to_csv(ofile, header=True, index=False)
def model(train_file, newdata_file, idcol, tcol, learner, lparams=None,
          drops=None, split=0.3, scorer=None,
          scaler=None, ofile=None, idx=0, limit=-1, gsearch=None,
          crossval=None, verbose=False):
    """
    Build and run ML algorithm for given train/test dataframe
    and classifier name. The learners are defined externally
    in DCAF.ml.clf module.

    :param train_file: train file name, read via read_data
    :param newdata_file: file to predict on; no prediction if empty
    :param idcol: id column name, it is always added to dropped columns
    :param tcol: target column name
    :param learner: learner name, a key of learners() dict
    :param lparams: learner attributes as dict or JSON string
    :param drops: columns to drop, a list or a comma separated string
    :param split: validation part of train data, a fraction (float or
        string) or a percentage (int); a false value disables splitting
    :param scorer: comma separated names of metrics.SCORERS entries
    :param scaler: name of a class from preprocessing used to scale data
    :param ofile: output CSV file for predictions
    :param idx: passed to read_data together with limit
    :param limit: passed to read_data together with idx
    :param gsearch: if set, run param_search and exit
    :param crossval: if set, run crossvalidation and exit
    :param verbose: verbosity level, values above 1 print more details
    :raises Exception: for invalid lparams type
    """
    clf = learners()[learner]
    if lparams:
        if isinstance(lparams, str):
            lparams = json.loads(lparams)
        elif not isinstance(lparams, dict):
            raise Exception('Invalid data type for lparams="%s", type: %s' \
                    % (lparams, type(lparams)))
        for key, val in lparams.items():
            setattr(clf, key, val)
    setattr(clf, "random_state", 123)

    print(clf)

    if split:
        if isinstance(split, int):
            split = split/100.
        elif isinstance(split, basestring):
            split = float(split)
        print("Split level: train %s%%, validation %s%%" \
                % (round((1-split)*100), round(split*100)))
    if verbose:
        print("idx/limit", idx, limit)

    # read data and normalize it
    if drops:
        if isinstance(drops, basestring):
            drops = drops.split(',')
        if idcol not in drops:
            # build a new list, the caller's one must stay untouched
            drops = list(drops) + [idcol]
    else:
        drops = [idcol]
    xdf = read_data(train_file, drops, idx, limit, scaler)

    # get target variable and exclude choice from train data
    target = xdf[tcol]
    xdf = xdf.drop(tcol, axis=1)
    if verbose:
        print("Train file", train_file)
        print("Columns:", ','.join(xdf.columns))
        print("train shapes:", xdf.shape, target.shape)
        if verbose > 1:
            print("Target:", tcol, target)

    # split our train data
    if split:
        x_train, x_rest, y_train, y_rest = \
                train_test_split(xdf, target, test_size=split)
        if verbose:
            print("train shapes after splitting:",
                  x_train.shape, y_train.shape)
    else:
        x_train = xdf
        y_train = target
        x_rest = None
        y_rest = None
    if gsearch:
        param_search(clf, x_train, y_train, x_rest, y_rest, gsearch)
        sys.exit(0)
    if crossval:
        crossvalidation(clf, xdf, target)
        sys.exit(0)

    # fit the scaler once on train data; the very same fitted object is
    # applied below to validation and new data, otherwise the learner
    # would see differently scaled (or unscaled) inputs at predict time
    scaler_obj = None
    if scaler:
        scaler_obj = getattr(preprocessing, scaler)()
        x_train = scaler_obj.fit_transform(x_train)

    time0 = time.time()
    fit = clf.fit(x_train, y_train)
    if verbose:
        print("Train elapsed time", time.time()-time0)

    if split:
        if scaler_obj is not None:
            x_rest = scaler_obj.transform(x_rest)
        predictions = fit.predict(x_rest)
        # feature ranking is a best-effort report, not every learner
        # provides feature_importances_
        try:
            importances = clf.feature_importances_
            if importances.any():
                print("Feature ranking:")
                columns = xdf.columns
                indices = np.argsort(importances)[::-1]
                num = min(9, len(columns))
                for rank, pos in enumerate(indices[:num], 1):
                    print("%d. importance %f, feature %s" \
                            % (rank, importances[pos], columns[pos]))
        except Exception as exc:
            if verbose:
                print("Feature ranking is not available: %s" % exc)
        if scorer:
            for scr in scorer.split(','):
                # extract metric function name from the scorer representation
                scr_str = repr(metrics.SCORERS[scr])\
                        .replace('make_scorer(', '').replace(')', '')
                method = scr_str.split(',')[0]
                res = getattr(metrics, method)(y_rest, predictions)
                print("Score metric (%s): %s" % (method, res))
        if verbose:
            loss = 0
            tot = 0
            for pval, yval in zip(predictions, y_rest):
                if verbose > 1:
                    print("predict value %s, real value %s" % (pval, yval))
                loss += logloss(pval, yval)
                tot += 1
            if tot:  # nothing to average for an empty validation set
                print("Final Logloss", loss/tot)
    else:
        print("Since there is no train/validation splitting, "
              "no prediction metrics will be shown")

    # new data file for which we want to predict
    if newdata_file:
        tdf = read_data(newdata_file, drops, scaler=scaler)
        if tcol in tdf.columns:
            tdf = tdf.drop(tcol, axis=1)
        if verbose:
            print("New data file", newdata_file)
            print("Columns:", ','.join(tdf.columns))
            print("test shapes:", tdf.shape)
        # collect id columns before scaling replaces the dataframe
        datasets = [int(i) for i in tdf['dataset']]
        dbses = [int(i) for i in tdf['dbs']]
        if scaler_obj is not None:
            tdf = scaler_obj.transform(tdf)
        predictions = fit.predict(tdf)
        data = {'dataset': datasets, 'dbs': dbses, 'prediction': predictions}
        out = pd.DataFrame(data=data)
        if ofile:
            out.to_csv(ofile, header=True, index=False)
def model(train_file, newdata_file, idcol, tcol, learner, lparams=None,
          drops=None, split=0.3, scorer=None,
          scaler=None, ofile=None, idx=0, limit=-1, gsearch=None,
          crossval=None, seed=123, verbose=False, timeout=None, proba=False):
    """
    Build and run ML algorithm for given train/test dataframe
    and classifier name. The learners are defined externally
    in DCAF.ml.clf module.

    :param train_file: train file name, read via read_data
    :param newdata_file: what to predict on: a file, a comma separated
        list of files, a pattern with '*' or a directory
    :param idcol: id column name, it is always added to dropped columns
    :param tcol: target column name
    :param learner: learner name, a key of learners() dict
    :param lparams: learner attributes as dict or JSON string
    :param drops: columns to drop, a list or a comma separated string
    :param split: validation part of train data, a fraction (float or
        string) or a percentage (int); a false value disables splitting
    :param scorer: comma separated metric names, either one of
        tp/tn/fp/fn/tpr/tnr/fpr/fnr or a metrics.SCORERS entry
    :param scaler: name of a class from preprocessing which is put in
        front of the learner in a Pipeline
    :param ofile: output CSV file for predictions
    :param idx: passed to read_data together with limit
    :param limit: passed to read_data together with idx
    :param gsearch: if set, run param_search and exit
    :param crossval: if set, run crossvalidation and exit
    :param seed: seed for random module, learner random_state and splitting
    :param verbose: verbosity level, values above 1 print more details
    :param timeout: CSV file where the running time is accumulated per learner
    :param proba: write probability of class 1 instead of predicted value
    :raises Exception: for invalid lparams type or when proba is requested
        for a learner without predict_proba
    """
    clf = learners()[learner]
    if proba and not (hasattr(clf, 'predict_proba') \
            and callable(getattr(clf, 'predict_proba'))):
        raise Exception("ERROR: model %s does not provide method 'predict_proba'" % learner)
    if lparams:
        if isinstance(lparams, str):
            lparams = json.loads(lparams)
        elif not isinstance(lparams, dict):
            raise Exception('Invalid data type for lparams="%s", type: %s' \
                    % (lparams, type(lparams)))
        for key, val in lparams.items():
            setattr(clf, key, val)
    setattr(clf, "random_state", seed)
    random.seed(seed)

    if scaler:
        clf = Pipeline([('scaler', getattr(preprocessing, scaler)()),
                        ('clf', clf)])
    print(clf)

    if split:
        if isinstance(split, int):
            split = split/100.
        elif isinstance(split, basestring):
            split = float(split)
        print("Split level: train %s%%, validation %s%%" \
                % (round((1-split)*100), round(split*100)))
    if verbose:
        print("idx/limit", idx, limit)

    # read data and normalize it
    if drops:
        if isinstance(drops, basestring):
            drops = drops.split(',')
        if idcol not in drops:
            # build a new list, the caller's one must stay untouched
            drops = list(drops) + [idcol]
    else:
        drops = [idcol]
    xdf = read_data(train_file, drops, idx, limit, scaler)

    # get target variable and exclude choice from train data
    target = xdf[tcol]
    xdf = xdf.drop(tcol, axis=1)
    if verbose:
        print("Train file", train_file)
        print("Columns:", ','.join(xdf.columns))
        print("train shapes:", xdf.shape, target.shape)
        if verbose > 1:
            print("Target:", tcol, target)

    # split our train data
    if split:
        x_train, x_rest, y_train, y_rest = \
                train_test_split(xdf, target, test_size=split,
                                 random_state=seed)
        if verbose:
            print("train shapes after splitting:",
                  x_train.shape, y_train.shape)
    else:
        x_train = xdf
        y_train = target
        x_rest = None
        y_rest = None
    if gsearch:
        param_search(clf, x_train, y_train, x_rest, y_rest, gsearch)
        sys.exit(0)
    if crossval:
        crossvalidation(clf, xdf, target)
        sys.exit(0)

    time0 = time.time()
    fit = clf.fit(x_train, y_train)
    rtime = time.time()-time0  # accumulated train + predict time
    if verbose:
        print("Train elapsed time", rtime)

    if split:
        if proba:
            print("ERROR in model.py: probabilities not supported in split mode")
            sys.exit(1)
        time0 = time.time()
        predictions = fit.predict(x_rest)
        rtime += time.time()-time0
        # feature ranking is a best-effort report, not every learner
        # provides feature_importances_
        try:
            # with a scaler the learner is the 'clf' step of the Pipeline
            estimator = clf.named_steps['clf'] if scaler else clf
            importances = estimator.feature_importances_
            if importances.any():
                print("Feature ranking:")
                columns = xdf.columns
                indices = np.argsort(importances)[::-1]
                num = min(9, len(columns))
                for rank, pos in enumerate(indices[:num], 1):
                    print("%d. importance %f, feature %s" \
                            % (rank, importances[pos], columns[pos]))
        except Exception as exc:
            if verbose:
                print("Feature ranking is not available: %s" % exc)
        if scorer:
            # metrics which are taken from rates() rather than from sklearn
            slist = ['tp', 'tn', 'fp', 'fn', 'tpr', 'tnr', 'fpr', 'fnr']
            for scr in scorer.split(','):
                if scr.lower() in slist:
                    res = rates(y_rest, predictions)
                    print("Score metric (%s): %s" \
                            % (scr.upper(), res[scr.lower()]))
                    continue
                # extract metric function name from the scorer representation
                scr_str = repr(metrics.SCORERS[scr])\
                        .replace('make_scorer(', '').replace(')', '')
                method = scr_str.split(',')[0]
                res = getattr(metrics, method)(y_rest, predictions)
                print("Score metric (%s): %s" % (method, res))
        if verbose:
            loss = 0
            tot = 0
            for pval, yval in zip(predictions, y_rest):
                if verbose > 1:
                    print("predict value %s, real value %s" % (pval, yval))
                loss += logloss(pval, yval)
                tot += 1
            if tot:  # nothing to average for an empty validation set
                print("Final Logloss", loss/tot)
    else:
        print("Since there is no train/validation splitting, "
              "no prediction metrics will be shown")

    # new data file for which we want to predict
    if newdata_file:
        nfiles = []
        if os.path.isfile(newdata_file):
            nfiles = [newdata_file]
        elif ',' in newdata_file:
            nfiles = newdata_file.split(',')
        elif '*' in newdata_file:
            nfiles = glob.glob(newdata_file)
        elif os.path.isdir(newdata_file):
            # take files of the first extension which yields any; the lookup
            # used an undefined name and every pass of the loop used to
            # discard the files found for previous extensions
            for ext in ['.csv.gz', '.csv', 'csv.bz2']:
                nfiles = list(findfiles(newdata_file, ext))
                if nfiles:
                    break
        else:
            print("ERROR: no files found for --newdata=%s" % newdata_file)
            sys.exit(1)
        if not nfiles:
            print("WARNING: no files to predict in %s" % newdata_file)
            return
        for ni, nfile in enumerate(nfiles):  # iterate on files to predict
            if len(nfiles) > 1:
                outfname = '%s_%s_%s' % (learner, ofile, ni)
                print("You provided file list, the output file name %s will be replaced with %s_%s_%s" % (ofile, learner, ofile, ni))
            else:
                outfname = ofile
            tdf = read_data(nfile, drops, scaler=scaler)
            if tcol in tdf.columns:
                tdf = tdf.drop(tcol, axis=1)
            if verbose:
                print("New data file", nfile)
                print("Columns:", ','.join(tdf.columns))
                print("test shapes:", tdf.shape)
            # id columns are written to the output only if data has them
            datasets = [int(i) for i in tdf.get('dataset', [])]
            if datasets:
                dbs_h = get_dbs_header(tdf, nfile)
                dbses = [int(i) for i in tdf[dbs_h]]
            if verbose:
                print(tdf)
            time0 = time.time()
            if proba:
                # probability column which corresponds to class label 1
                cls_idx = list(fit.classes_).index(1)
                predictions = np.asarray(fit.predict_proba(tdf))[:, cls_idx]
            else:
                predictions = fit.predict(tdf)
            rtime += time.time()-time0
            if datasets:
                out = pd.DataFrame({'dataset': datasets, dbs_h: dbses,
                                    'prediction': predictions})
            else:
                out = pd.DataFrame({'prediction': predictions})
            if outfname:
                out.to_csv(outfname, header=True, index=False)

    if timeout:  # output running time
        data = {}
        if os.path.isfile(timeout):  # append if file exists
            with open(timeout, 'r') as istream:
                header_seen = False
                for line in istream:
                    row = line.strip(" \r\n").split(',')
                    if not header_seen:
                        header_seen = True
                        # NOTE(review): the file is still rewritten below
                        # after this error, confirm it is intended
                        if row[:2] != ['model', 'running_time_s']:
                            print("Error writing model running time to %s: unrecognized output file found." % timeout)
                        continue
                    if len(row) < 2:  # blank or incomplete line
                        continue
                    data[row[0]] = float(row[1])
        data[learner] = data.get(learner, 0) + rtime
        with open(timeout, 'w') as fstream:
            fstream.write("model,running_time_s\n")
            for key in sorted(data):
                fstream.write("%s,%.3f\n" % (key, data[key]))
def model(train_file, newdata_file, idcol, tcol, learner, lparams=None,
          drops=None, split=0.3, scorer=None,
          scaler=None, ofile=None, idx=0, limit=-1, gsearch=None,
          crossval=None, verbose=False):
    """
    Build and run ML algorithm for given train/test dataframe
    and classifier name. The learners are defined externally
    in DCAF.ml.clf module.

    This variant trains on the whole train set: the split argument is
    overridden with 0 and the scorer argument is not used. Before the
    training it prints feature rankings based on p-values of univariate
    tests (ANOVA F test and Chi Squared test).

    :param train_file: train file name, read via read_data
    :param newdata_file: file to predict on; no prediction if empty
    :param idcol: id column name, it is always added to dropped columns
    :param tcol: target column name
    :param learner: learner name, a key of learners() dict
    :param lparams: learner attributes as dict or JSON string
    :param drops: columns to drop, a list or a comma separated string
    :param split: ignored, see above
    :param scorer: not used, see above
    :param scaler: name of a class from preprocessing used to scale data
    :param ofile: output CSV file for predictions
    :param idx: passed to read_data together with limit
    :param limit: passed to read_data together with idx
    :param gsearch: if set, run param_search and exit
    :param crossval: if set, run crossvalidation and exit
    :param verbose: verbosity level, values above 1 print more details
    :raises Exception: for invalid lparams type
    """
    # change by Ting to use the whole training set for training,
    # not for validation; the split branches below stay inactive
    split = 0
    clf = learners()[learner]
    if lparams:
        if isinstance(lparams, str):
            lparams = json.loads(lparams)
        elif not isinstance(lparams, dict):
            raise Exception('Invalid data type for lparams="%s", type: %s' \
                    % (lparams, type(lparams)))
        for key, val in lparams.items():
            setattr(clf, key, val)
    setattr(clf, "random_state", 123)

    # all messages go through single-argument print calls, their output
    # is the same with print statement and with print function
    print(clf)

    if split:
        if isinstance(split, int):
            split = split/100.
        elif isinstance(split, basestring):
            split = float(split)
        print("Split level: train %s%%, validation %s%%" \
                % (round((1-split)*100), round(split*100)))
    if verbose:
        print("idx/limit %s %s" % (idx, limit))

    # read data and normalize it
    if drops:
        if isinstance(drops, basestring):
            drops = drops.split(',')
        if idcol not in drops:
            # build a new list, the caller's one must stay untouched
            drops = list(drops) + [idcol]
    else:
        drops = [idcol]
    xdf = read_data(train_file, drops, idx, limit, scaler)

    # get target variable and exclude choice from train data
    target = xdf[tcol]
    xdf = xdf.drop(tcol, axis=1)
    if verbose:
        print("Train file %s" % (train_file,))
        print("Columns: %s" % ','.join(xdf.columns))
        print("train shapes: %s %s" % (xdf.shape, target.shape))
        if verbose > 1:
            print("Target: %s %s" % (tcol, target))

    # split our train data
    if split:
        # change by Ting, for controlling random seed
        x_train, x_rest, y_train, y_rest = \
                train_test_split(xdf, target, test_size=split,
                                 random_state=1234)
        if verbose:
            print("train shapes after splitting: %s %s" \
                    % (x_train.shape, y_train.shape))
    else:
        x_train = xdf
        y_train = target
        x_rest = None
        y_rest = None
    if gsearch:
        param_search(clf, x_train, y_train, x_rest, y_rest, gsearch)
        sys.exit(0)
    if crossval:
        crossvalidation(clf, xdf, target)
        sys.exit(0)

    # add by Ting to do feature selection and measure feature importance:
    # univariate tests, features are ranked by ascending p-value
    from sklearn.feature_selection import SelectPercentile, chi2, f_classif
    columns = xdf.columns
    # percentile values are kept from the original code, only p-values of
    # the fitted selectors are used here
    tests = [("ANOVA F test", f_classif, 100),
             ("Chi Squared test", chi2, 10)]
    for label, score_func, percentile in tests:
        selector = SelectPercentile(score_func, percentile=percentile)
        selector.fit(x_train, y_train)
        pvs = selector.pvalues_
        print("\n Feature ranking by %s:" % label)
        for rank, pos in enumerate(np.argsort(pvs), 1):
            print("%d. feature selection test p-value %f, feature %s" \
                    % (rank, pvs[pos], columns[pos]))

    # preprocessing of "scaler" type: fit the scaler once on train data;
    # the very same fitted object is applied below to new data, otherwise
    # the learner would predict on differently scaled inputs
    scaler_obj = None
    if scaler:
        scaler_obj = getattr(preprocessing, scaler)()
        x_train = scaler_obj.fit_transform(x_train)

    time0 = time.time()
    fit = clf.fit(x_train, y_train)
    if verbose:
        print("Train elapsed time %s" % (time.time()-time0))

    # validation scoring was disabled by Ting together with the splitting,
    # the feature ranking part of it follows the prediction on new data

    # predict on new data set, by the learned classifier
    if newdata_file:
        tdf = read_data(newdata_file, drops, scaler=scaler)
        if tcol in tdf.columns:
            tdf = tdf.drop(tcol, axis=1)
        if verbose:
            print("New data file %s" % (newdata_file,))
            print("Columns: %s" % ','.join(tdf.columns))
            print("test shapes: %s" % (tdf.shape,))
        # collect id columns before scaling replaces the dataframe
        datasets = [int(i) for i in tdf['dataset']]
        dbses = [int(i) for i in tdf['dbs']]
        if scaler_obj is not None:
            tdf = scaler_obj.transform(tdf)
        predictions = fit.predict(tdf)
        data = {'dataset': datasets, 'dbs': dbses, 'prediction': predictions}
        out = pd.DataFrame(data=data)
        if ofile:
            out.to_csv(ofile, header=True, index=False)

        # NOTE(review): the ranking is assumed to belong to the new data
        # branch, confirm the intended nesting
        # best-effort report, not every learner has feature_importances_
        try:
            importances = clf.feature_importances_
            if importances.any():
                print("\n Feature ranking by random forest classifier:")
                columns = xdf.columns
                indices = np.argsort(importances)[::-1]
                # change by Ting, to output all features' importances
                num = len(columns)
                for rank, pos in enumerate(indices[:num], 1):
                    print("%d. importance %f, feature %s" \
                            % (rank, importances[pos], columns[pos]))
        except Exception as exc:
            if verbose:
                print("Feature ranking is not available: %s" % exc)