def fetch(self, api, params=None):
    "Fetch data for given api"
    url = '%s/%s/?%s' % (self.url, api, urllib.urlencode(params, doseq=True))
    docid = genkey("url=%s params=%s" % (url, params))
    res = self.storage.fetch_one('cache', {'_id': docid})
    if res and 'data' in res:
        if self.verbose:
            print("%s::fetch url=%s, params=%s, docid=%s" \
                    % (self.name, url, params, docid))
        data = res['data']
    else:
        if self.verbose:
            print("%s::fetch url=%s, params=%s" % (self.name, url, params))
        # NOTE: popularity DB has two different access points, one within
        # the CERN network and one outside. The former does not require
        # authentication, while the latter passes through CERN SSO.
        # The following block reflects this; in the future, when the
        # popularity DB moves into the cmsweb domain, it will no longer
        # be needed.
        if self.url.find('cms-popularity-prod') != -1 or \
           self.url.find('cmsweb') != -1:
            data = getdata(url, ckey=self.ckey, cert=self.cert,
                           debug=self.verbose)
        else:
            data = sso_getdata(url, ckey=self.ckey, cert=self.cert,
                               debug=self.verbose)
        self.storage.insert('cache', {'_id': docid, 'data': data,
                                      'url': url, 'params': params})
    data = json.loads(data)
    for row in data['DATA']:
        yield row
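A minimal consumption sketch for the cached fetch generator above; the instance name and parameter values are assumptions for illustration, not taken from the source:

# svc is assumed to be an instance of the service class defining this
# fetch(); the api name and time-window parameters mirror the popularity-DB
# calls used elsewhere in this file.
for row in svc.fetch('DSStatInTimeWindow',
                     {'tstart': '2014-01-01', 'tstop': '2014-01-07'}):
    print(row)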
def fetch(self, api, params=None):
    "Fetch data for given api"
    url = '%s/%s' % (self.url, api)
    if self.verbose:
        print(url, params)
    data = json.loads(super(DashboardService, self).fetch(url, params))
    if data and 'jobs' in data:
        for row in data['jobs']:
            yield row
def dataset_dbsinst(self, dataset):
    "Find DBS instance of given dataset"
    url = '%s/datasets' % self.url
    params = {'dataset': dataset}
    for dbsinst in self.all_dbs:
        dbs_url = url.replace('prod/global', dbsinst)
        data = json.loads(super(DBSService, self).fetch(dbs_url, params))
        if len(data) and 'dataset' in data[0] and \
           data[0]['dataset'] == dataset:
            return dbsinst
def dataset_release_versions(self, dataset, dbsinst=None):
    "Return dataset release versions"
    url = '%s/releaseversions' % self.url
    params = {'dataset': dataset}
    # guard against dbsinst=None, which would break url.replace below
    dbs_url = url.replace('prod/global', dbsinst) if dbsinst else url
    data = json.loads(super(DBSService, self).fetch(dbs_url, params))
    if not len(data) or 'release_version' not in data[0]:
        # fall back to scanning all known DBS instances
        for dbsinst in self.all_dbs:
            dbs_url = url.replace('prod/global', dbsinst)
            data = json.loads(super(DBSService, self).fetch(dbs_url, params))
            if len(data) and 'release_version' in data[0]:
                break
    if data and isinstance(data, list) and len(data) > 0:
        if 'release_version' in data[0]:
            for ver in set(data[0]['release_version']):
                yield ver
        else:
            yield "N/A"
    else:
        yield "N/A"
def data_tiers(self):
    "Return list of known data-tiers"
    if self.tiers:
        return list(self.tiers)
    url = '%s/datatiers' % self.url
    params = {}
    for dbsinst in self.all_dbs:
        dbs_url = url.replace('prod/global', dbsinst)
        data = json.loads(super(DBSService, self).fetch(dbs_url, params))
        for tdict in data:
            self.tiers.add(tdict['data_tier_name'])
    return list(self.tiers)
def popdb_datasets(tstart, tstop, url):
    "Fetch data from popDB for given time frame and print out datasets"
    api = 'DSStatInTimeWindow'
    ckey, cert = get_key_cert()
    params = {'tstart': tstart, 'tstop': tstop}
    url = '%s/%s/?%s' % (url, api, urllib.urlencode(params, doseq=True))
    data = getdata(url, ckey=ckey, cert=cert, debug=0)
    data = json.loads(data)
    headers = []
    for row in data['DATA']:
        if not headers:
            headers = row.keys()
            print(','.join(headers))
        out = [str(row[k]) for k in headers]
        print(','.join(out))
def fetch(self, api, params=None):
    "Fetch data for given api"
    if api == 'replicas':
        url = '%s/blockReplicas' % self.url
    else:
        url = '%s/%s' % (self.url, api)
    try:
        data = json.loads(super(PhedexService, self).fetch(url, params))
        rid = 0
        for row in data['phedex']['block']:
            for repl in row['replica']:
                node = repl['node']
                yield node
                rid += 1
    except Exception as exc:
        print(str(exc))
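A hedged usage sketch for the PhEDEx fetch above; the instance and block name are illustrative assumptions:

# phedex is assumed to be a PhedexService instance; the block name is
# made up. The 'replicas' api routes to the blockReplicas endpoint.
params = {'block': '/Primary/Processed-v1/AOD#abc-123'}
for node in phedex.fetch('replicas', params):
    print(node)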
def fetch(self, api, params=None):
    "Fetch data for given api"
    url = '%s/%s/?%s' % (self.url, api, urllib.urlencode(params, doseq=True))
    docid = genkey("url=%s params=%s" % (url, params))
    res = self.storage.fetch_one('cache', {'_id': docid})
    if res and 'data' in res:
        if self.verbose:
            print("%s::fetch url=%s, params=%s, docid=%s" \
                    % (self.name, url, params, docid))
        data = res['data']
    else:
        if self.verbose:
            print("%s::fetch url=%s, params=%s" % (self.name, url, params))
        data = getdata(url, ckey=self.ckey, cert=self.cert,
                       debug=self.verbose)
        self.storage.insert('cache', {'_id': docid, 'data': data,
                                      'url': url, 'params': params})
    data = json.loads(data)
    for row in data['DATA']:
        yield row
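genkey is imported from elsewhere in the package; a hedged sketch of what such a docid helper might look like, assuming an md5 digest of the query string (the real signature may differ):

import hashlib

def genkey_sketch(query):
    "Return md5 hex digest of the given query string (illustrative stand-in)"
    keyhash = hashlib.md5()
    keyhash.update(query)
    return keyhash.hexdigest()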
def popdb_datasets(tstart, tstop, url):
    "Fetch data from popDB for given time frame and print out datasets"
    api = 'DSStatInTimeWindow'
    ckey, cert = get_key_cert()
    params = {'tstart': tstart, 'tstop': tstop}
    url = '%s/%s?%s' % (url, api, urllib.urlencode(params, doseq=True))
    # NOTE: popularity DB has two different access points, one within
    # the CERN network and one outside. The former does not require
    # authentication, while the latter passes through CERN SSO.
    # The following block reflects this; in the future, when the
    # popularity DB moves into the cmsweb domain, it will no longer
    # be needed.
    if url.find('cms-popularity-prod.cern.ch') != -1:
        data = getdata(url, ckey=ckey, cert=cert, debug=0)
    else:
        data = sso_getdata(url, ckey=ckey, cert=cert, debug=0)
    data = json.loads(data)
    headers = []
    for row in data['DATA']:
        if not headers:
            headers = row.keys()
            print(','.join(headers))
        out = [str(row[k]) for k in headers]
        print(','.join(out))
def fetch(self, api, params=None, dbsinst='prod/global', cache=True):
    "Fetch data for given api"
    if dbsinst:
        dbs_url = self.url.replace('prod/global', dbsinst)
        inst = {'dbs_instance': self.all_dbs.index(dbsinst)}
    if api == 'releases':
        url = '%s/releaseversions' % dbs_url
    else:
        url = '%s/%s' % (dbs_url, api)
    data = json.loads(super(DBSService, self).fetch(url, params, cache))
    if api == 'releases':
        data = data[0]['release_version']
    for row in data:
        if api == 'datasets':
            try:
                row['rid'] = row['dataset_id']
            except KeyError:
                print("Unable to process dataset row", row)
                if 'dataset' in row:
                    h = hashlib.md5()
                    h.update(row['dataset'])
                    row['rid'] = int(h.hexdigest()[:10], 16)
                    print("Generated new dataset_id", row['dataset'],
                          h.hexdigest(), row['rid'])
            except:
                print("Unable to process dataset row", row)
                raise
            row.update(inst)
            yield row
        elif api == 'releases':
            rid = genkey(row, truncate=5)
            rec = {'release': row, 'rid': rid}
            yield rec
        elif api == 'filesummaries':
            yield row
        else:
            yield row
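The md5-based rid fallback in the KeyError branch above can be illustrated with a small, self-contained sketch; the dataset path is hypothetical:

# Derive a stable integer id from a dataset path when dataset_id is
# missing, exactly as the fallback above does (hypothetical path).
import hashlib
h = hashlib.md5()
h.update('/Primary/Processed-v1/AOD')
rid = int(h.hexdigest()[:10], 16)  # first 10 hex digits as an integer
print(rid)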
def sitedb_parser(source):
    """SiteDB parser"""
    if isinstance(source, str) or isinstance(source, unicode):
        data = json.loads(source)
    elif isinstance(source, InstanceType) or isinstance(source, file):
        # got data descriptor
        try:
            data = json.load(source)
        except Exception as exc:
            print_exc(exc)
            source.close()
            raise
        source.close()
    else:
        data = source
    if not isinstance(data, dict):
        raise Exception('Wrong data type, %s' % type(data))
    if 'desc' in data:
        columns = data['desc']['columns']
        for row in data['result']:
            yield rowdict(columns, row)
    else:
        for row in data['result']:
            yield row
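rowdict is referenced above but defined elsewhere in the package; a minimal sketch of what it presumably does, under the assumption that it pairs SiteDB column names with row values:

def rowdict(columns, row):
    "Construct a dict from parallel lists of column names and values (sketch)"
    return dict(zip(columns, row))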
def model(train_file, newdata_file, idcol, tcol, learner, lparams=None,
          drops=None, split=0.3, scorer=None, scaler=None, ofile=None,
          idx=0, limit=-1, gsearch=None, crossval=None, verbose=False):
    """
    Build and run ML algorithm for given train/test dataframe
    and classifier name. The learners are defined externally
    in DCAF.ml.clf module.
    """
    # change by Ting: use the whole training set for training,
    # not for validation
    split = 0
    clf = learners()[learner]
    if lparams:
        if isinstance(lparams, str):
            lparams = json.loads(lparams)
        elif isinstance(lparams, dict):
            pass
        else:
            raise Exception('Invalid data type for lparams="%s", type: %s' \
                    % (lparams, type(lparams)))
        for key, val in lparams.items():
            setattr(clf, key, val)
    setattr(clf, "random_state", 123)
    print(clf)
    if split:
        if isinstance(split, int):
            split = split/100.
        elif isinstance(split, float):
            pass
        elif isinstance(split, basestring):
            split = float(split)
        print("Split level: train %s%%, validation %s%%" \
                % (round((1-split)*100), round(split*100)))
    if verbose:
        print("idx/limit", idx, limit)

    # read data and normalize it
    if drops:
        if isinstance(drops, basestring):
            drops = drops.split(',')
        if idcol not in drops:
            drops += [idcol]
    else:
        drops = [idcol]
    xdf = read_data(train_file, drops, idx, limit, scaler)

    # get target variable and exclude choice from train data
    target = xdf[tcol]
    xdf = xdf.drop(tcol, axis=1)
    if verbose:
        print("Train file", train_file)
        print("Columns:", ','.join(xdf.columns))
        print("train shapes:", xdf.shape, target.shape)
        if verbose > 1:
            print("Target:", tcol, target)

    # split our train data
    if split:
        # change by Ting: fixed random_state for controlling random seed
        x_train, x_rest, y_train, y_rest = \
                train_test_split(xdf, target, test_size=split,
                                 random_state=1234)
        if verbose:
            print("train shapes after splitting:",
                  x_train.shape, y_train.shape)
    else:
        x_train = xdf
        y_train = target
        x_rest = None
        y_rest = None
    if gsearch:
        param_search(clf, x_train, y_train, x_rest, y_rest, gsearch)
        sys.exit(0)
    if crossval:
        crossvalidation(clf, xdf, target)
        sys.exit(0)

    ###########################################################################
    # added by Ting to do feature selection and measure feature importance.
    # Univariate feature selection with F-test for feature scoring.
    # We use the default selection function: the 10% most significant features
    from sklearn.feature_selection import SelectKBest, chi2, \
        SelectPercentile, f_classif
    selector = SelectPercentile(f_classif, percentile=100) # by F test
    selector.fit(x_train, y_train)
    pvs = selector.pvalues_
    # output scores of features
    columns = xdf.columns
    indices = np.argsort(pvs)
    num = len(columns)
    print("\n Feature ranking by ANOVA F test:")
    for f in range(num):
        print("%d. feature selection test p-value %f, feature %s" \
                % (f + 1, pvs[indices[f]], columns[indices[f]]))
    selector = SelectPercentile(chi2, percentile=10) # by chi square test
    selector.fit(x_train, y_train)
    pvs = selector.pvalues_
    # output scores of features
    columns = xdf.columns
    indices = np.argsort(pvs)
    num = len(columns)
    print("\n Feature ranking by Chi Squared test:")
    for f in range(num):
        print("%d. feature selection test p-value %f, feature %s" \
                % (f + 1, pvs[indices[f]], columns[indices[f]]))
    ###########################################################################

    # preprocessing of "scaler" type
    # scaler = None # added by Ting, to ignore the standardization (TODO)
    if scaler:
        x_train = getattr(preprocessing, scaler)().fit_transform(x_train)
    time0 = time.time()
    fit = clf.fit(x_train, y_train)
    if verbose:
        print("Train elapsed time", time.time()-time0)

    # commented out by Ting, moved to the new test dataset: the original
    # train/validation prediction block (feature ranking, scorer metrics,
    # and logloss) matches the live version in the other model() variants
    # in this file and is skipped here, since split is forced to 0 above.

    # predict on new data set, by the learned classifier
    if newdata_file:
        tdf = read_data(newdata_file, drops, scaler=scaler)
        if tcol in tdf.columns:
            tdf = tdf.drop(tcol, axis=1)
        if verbose:
            print("New data file", newdata_file)
            print("Columns:", ','.join(tdf.columns))
            print("test shapes:", tdf.shape)
        datasets = [int(i) for i in list(tdf['dataset'])]
        dbses = [int(i) for i in list(tdf['dbs'])]
        if scaler:
            tdf = getattr(preprocessing, scaler)().fit_transform(tdf)
        predictions = fit.predict(tdf)
        data = {'dataset': datasets, 'dbs': dbses, 'prediction': predictions}
        out = pd.DataFrame(data=data)
        if ofile:
            out.to_csv(ofile, header=True, index=False)
        try:
            importances = clf.feature_importances_
            if importances.any():
                print("\n Feature ranking by random forest classifier:")
                columns = xdf.columns
                indices = np.argsort(importances)[::-1]
                # change by Ting: output all features' importances
                num = len(columns)
                for f in range(num):
                    print("%d. importance %f, feature %s" \
                            % (f + 1, importances[indices[f]],
                               columns[indices[f]]))
        except:
            pass
def fetch(self, api, params=None):
    "Fetch data for given api"
    url = '%s/%s' % (self.url, api)
    data = json.loads(super(mcmService, self).fetch(url, params))
    yield data['results']
def model(train_file, newdata_file, idcol, tcol, learner, lparams=None,
          drops=None, split=0.3, scorer=None, scaler=None, ofile=None,
          idx=0, limit=-1, gsearch=None, crossval=None, verbose=False):
    """
    Build and run ML algorithm for given train/test dataframe
    and classifier name. The learners are defined externally
    in DCAF.ml.clf module.
    """
    clf = learners()[learner]
    if lparams:
        if isinstance(lparams, str):
            lparams = json.loads(lparams)
        elif isinstance(lparams, dict):
            pass
        else:
            raise Exception('Invalid data type for lparams="%s", type: %s' \
                    % (lparams, type(lparams)))
        for key, val in lparams.items():
            setattr(clf, key, val)
    setattr(clf, "random_state", 123)
    print(clf)
    if split:
        if isinstance(split, int):
            split = split/100.
        elif isinstance(split, float):
            pass
        elif isinstance(split, basestring):
            split = float(split)
        print("Split level: train %s%%, validation %s%%" \
                % (round((1-split)*100), round(split*100)))
    if verbose:
        print("idx/limit", idx, limit)

    # read data and normalize it
    if drops:
        if isinstance(drops, basestring):
            drops = drops.split(',')
        if idcol not in drops:
            drops += [idcol]
    else:
        drops = [idcol]
    xdf = read_data(train_file, drops, idx, limit, scaler)

    # get target variable and exclude choice from train data
    target = xdf[tcol]
    xdf = xdf.drop(tcol, axis=1)
    if verbose:
        print("Train file", train_file)
        print("Columns:", ','.join(xdf.columns))
        print("train shapes:", xdf.shape, target.shape)
        if verbose > 1:
            print("Target:", tcol, target)

    # split our train data
    if split:
        x_train, x_rest, y_train, y_rest = \
                train_test_split(xdf, target, test_size=split)
        if verbose:
            print("train shapes after splitting:",
                  x_train.shape, y_train.shape)
    else:
        x_train = xdf
        y_train = target
        x_rest = None
        y_rest = None
    if gsearch:
        param_search(clf, x_train, y_train, x_rest, y_rest, gsearch)
        sys.exit(0)
    if crossval:
        crossvalidation(clf, xdf, target)
        sys.exit(0)
    if scaler:
        x_train = getattr(preprocessing, scaler)().fit_transform(x_train)
    time0 = time.time()
    fit = clf.fit(x_train, y_train)
    if verbose:
        print("Train elapsed time", time.time()-time0)
    if split:
        predictions = fit.predict(x_rest)
        try:
            importances = clf.feature_importances_
            if importances.any():
                print("Feature ranking:")
                columns = xdf.columns
                indices = np.argsort(importances)[::-1]
                num = 9 if len(columns) > 9 else len(columns)
                for f in range(num):
                    print("%d. importance %f, feature %s" \
                            % (f + 1, importances[indices[f]],
                               columns[indices[f]]))
        except:
            pass
        if scorer:
            for scr in scorer.split(','):
                scr_str = repr(metrics.SCORERS[scr])\
                        .replace('make_scorer(', '').replace(')', '')
                method = scr_str.split(',')[0]
                res = getattr(metrics, method)(y_rest, predictions)
                print("Score metric (%s): %s" % (method, res))
        if verbose:
            loss = 0
            tot = 0
            for pval, yval in zip(predictions, y_rest):
                if verbose > 1:
                    print("predict value %s, real value %s" % (pval, yval))
                loss += logloss(pval, yval)
                tot += 1
            print("Final Logloss", loss/tot)
    else:
        print("Since there is no train/validation splitting, "
              "no prediction metrics will be shown")

    # new data file for which we want to predict
    if newdata_file:
        tdf = read_data(newdata_file, drops, scaler=scaler)
        if tcol in tdf.columns:
            tdf = tdf.drop(tcol, axis=1)
        if verbose:
            print("New data file", newdata_file)
            print("Columns:", ','.join(tdf.columns))
            print("test shapes:", tdf.shape)
        datasets = [int(i) for i in list(tdf['dataset'])]
        dbses = [int(i) for i in list(tdf['dbs'])]
        if scaler:
            tdf = getattr(preprocessing, scaler)().fit_transform(tdf)
        predictions = fit.predict(tdf)
        data = {'dataset': datasets, 'dbs': dbses, 'prediction': predictions}
        out = pd.DataFrame(data=data)
        if ofile:
            out.to_csv(ofile, header=True, index=False)
def model_iter(train_file_list, newdata_file, idcol, tcol, learner,
               lparams=None, drops=None, split=0.1, scaler=None,
               ofile=None, seed=123, verbose=False):
    """
    Build and run ML algorithm for given train/test dataframe
    and classifier name. The learners are defined externally
    in DCAF.ml.clf module.
    """
    if learner not in ['SGDClassifier', 'SGDRegressor']:
        raise Exception("Unsupported learner %s" % learner)
    clf = learners()[learner]
    setattr(clf, "random_state", seed)
    random.seed(seed)
    if lparams:
        if isinstance(lparams, str):
            lparams = json.loads(lparams)
        elif isinstance(lparams, dict):
            pass
        else:
            raise Exception('Invalid data type for lparams="%s", type: %s' \
                    % (lparams, type(lparams)))
        for key, val in lparams.items():
            setattr(clf, key, val)
    if scaler:
        clf = Pipeline([('scaler', getattr(preprocessing, scaler)()),
                        ('clf', clf)])
    print("clf:", clf)
    if drops:
        if isinstance(drops, basestring):
            drops = drops.split(',')
        if idcol not in drops:
            drops += [idcol]
    else:
        drops = [idcol]
    fit = None
    for train_file in train_file_list:
        print("Train file", train_file)
        # read data and normalize it
        xdf = read_data(train_file, drops, scaler=scaler)
        # get target variable and exclude choice from train data
        target = xdf[tcol]
        xdf = xdf.drop(tcol, axis=1)
        if verbose:
            print("Columns:", ','.join(xdf.columns))
            print("Target:", target)
        if split:
            x_train, x_rest, y_train, y_rest = \
                    train_test_split(xdf, target, test_size=0.1,
                                     random_state=seed)
            time0 = time.time()
            fit = clf.partial_fit(x_train, y_train)
            if verbose:
                print("Train elapsed time", time.time()-time0)
            print("### SCORE", clf.score(x_rest, y_rest))
        else:
            x_train = xdf
            y_train = target
            time0 = time.time()
            fit = clf.partial_fit(x_train, y_train)
            if verbose:
                print("Train elapsed time", time.time()-time0)

    # new data for which we want to predict
    if newdata_file:
        tdf = read_data(newdata_file, drops, scaler=scaler)
        if tcol in tdf.columns:
            tdf = tdf.drop(tcol, axis=1)
        datasets = [int(i) for i in list(tdf['dataset'])]
        dbs_h = get_dbs_header(tdf, newdata_file)
        dbses = [int(i) for i in list(tdf[dbs_h])]
        predictions = fit.predict_proba(tdf)
        data = {'dataset': datasets, dbs_h: dbses, 'prediction': predictions}
        out = pd.DataFrame(data=data)
        if ofile:
            out.to_csv(ofile, header=True, index=False)
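A hedged invocation sketch for model_iter above; since it relies on partial_fit, only the SGD learners are accepted. File names and parameter values are illustrative assumptions:

# Hypothetical call: incrementally train an SGD classifier over two
# dataframe files, then write probability predictions for a new file.
# 'loss': 'log' is needed so the learner provides predict_proba.
model_iter(['dataframe-2014-01.csv.gz', 'dataframe-2014-02.csv.gz'],
           'dataframe-new.csv.gz', idcol='id', tcol='target',
           learner='SGDClassifier', lparams='{"loss": "log"}',
           ofile='predictions.csv')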
def model(train_file, newdata_file, idcol, tcol, learner, lparams=None,
          drops=None, split=0.3, scorer=None, scaler=None, ofile=None,
          idx=0, limit=-1, gsearch=None, crossval=None, seed=123,
          verbose=False, timeout=None, proba=False):
    """
    Build and run ML algorithm for given train/test dataframe
    and classifier name. The learners are defined externally
    in DCAF.ml.clf module.
    """
    clf = learners()[learner]
    if proba and not (hasattr(clf, 'predict_proba') and \
            callable(getattr(clf, 'predict_proba'))):
        raise Exception("ERROR: model %s does not provide method 'predict_proba'" % learner)
    if lparams:
        if isinstance(lparams, str):
            lparams = json.loads(lparams)
        elif isinstance(lparams, dict):
            pass
        else:
            raise Exception('Invalid data type for lparams="%s", type: %s' \
                    % (lparams, type(lparams)))
        for key, val in lparams.items():
            setattr(clf, key, val)
    setattr(clf, "random_state", seed)
    random.seed(seed)
    if scaler:
        clf = Pipeline([('scaler', getattr(preprocessing, scaler)()),
                        ('clf', clf)])
    print(clf)
    if split:
        if isinstance(split, int):
            split = split/100.
        elif isinstance(split, float):
            pass
        elif isinstance(split, basestring):
            split = float(split)
        print("Split level: train %s%%, validation %s%%" \
                % (round((1-split)*100), round(split*100)))
    if verbose:
        print("idx/limit", idx, limit)

    # read data and normalize it
    if drops:
        if isinstance(drops, basestring):
            drops = drops.split(',')
        if idcol not in drops:
            drops += [idcol]
    else:
        drops = [idcol]
    xdf = read_data(train_file, drops, idx, limit, scaler)

    # get target variable and exclude choice from train data
    target = xdf[tcol]
    xdf = xdf.drop(tcol, axis=1)
    if verbose:
        print("Train file", train_file)
        print("Columns:", ','.join(xdf.columns))
        print("train shapes:", xdf.shape, target.shape)
        if verbose > 1:
            print("Target:", tcol, target)

    # split our train data
    if split:
        x_train, x_rest, y_train, y_rest = \
                train_test_split(xdf, target, test_size=split,
                                 random_state=seed)
        if verbose:
            print("train shapes after splitting:",
                  x_train.shape, y_train.shape)
    else:
        x_train = xdf
        y_train = target
        x_rest = None
        y_rest = None
    if gsearch:
        param_search(clf, x_train, y_train, x_rest, y_rest, gsearch)
        sys.exit(0)
    if crossval:
        crossvalidation(clf, xdf, target)
        sys.exit(0)
    time0 = time.time()
    fit = clf.fit(x_train, y_train)
    rtime = time.time()-time0
    if verbose:
        print("Train elapsed time", rtime)
    if split:
        if proba:
            print("ERROR in model.py: probabilities not supported in split mode")
            sys.exit(1)
        time0 = time.time()
        predictions = fit.predict(x_rest)
        rtime += time.time()-time0
        try:
            importances = clf.feature_importances_
            if importances.any():
                print("Feature ranking:")
                columns = xdf.columns
                indices = np.argsort(importances)[::-1]
                num = 9 if len(columns) > 9 else len(columns)
                for f in range(num):
                    print("%d. importance %f, feature %s" \
                            % (f + 1, importances[indices[f]],
                               columns[indices[f]]))
        except:
            pass
        if scorer:
            for scr in scorer.split(','):
                slist = ['tp', 'tn', 'fp', 'fn', 'tpr', 'tnr', 'fpr', 'fnr']
                if scr.lower() in slist:
                    res = rates(y_rest, predictions)
                    print("Score metric (%s): %s" \
                            % (scr.upper(), res[scr.lower()]))
                    continue
                scr_str = repr(metrics.SCORERS[scr])\
                        .replace('make_scorer(', '').replace(')', '')
                method = scr_str.split(',')[0]
                res = getattr(metrics, method)(y_rest, predictions)
                print("Score metric (%s): %s" % (method, res))
        if verbose:
            loss = 0
            tot = 0
            for pval, yval in zip(predictions, y_rest):
                if verbose > 1:
                    print("predict value %s, real value %s" % (pval, yval))
                loss += logloss(pval, yval)
                tot += 1
            print("Final Logloss", loss/tot)
    else:
        print("Since there is no train/validation splitting, "
              "no prediction metrics will be shown")

    # new data file(s) for which we want to predict
    if newdata_file:
        nfiles = []
        if os.path.isfile(newdata_file):
            nfiles = [newdata_file]
        else:
            if newdata_file.find(',') != -1:
                nfiles = newdata_file.split(',')
            elif newdata_file.find('*') != -1:
                nfiles = glob.glob(newdata_file)
            elif os.path.isdir(newdata_file):
                for ext in ['.csv.gz', '.csv', '.csv.bz2']:
                    nfiles += [f for f in findfiles(newdata_file, ext)]
            else:
                print("ERROR: no files found for --newdata=%s" % newdata_file)
                sys.exit(1)
        if not len(nfiles):
            print("WARNING: no files to predict in %s" % newdata_file)
            return
        outfname = None
        for ni, nfile in enumerate(nfiles): # iterate on files to predict
            if len(nfiles) > 1:
                outfname = '%s_%s_%s' % (learner, ofile, ni)
                print("You provided file list, the output file name %s will be replaced with %s_%s_%s" \
                        % (ofile, learner, ofile, ni))
            else:
                outfname = ofile
            tdf = read_data(nfile, drops, scaler=scaler)
            if tcol in tdf.columns:
                tdf = tdf.drop(tcol, axis=1)
            if verbose:
                print("New data file", nfile)
                print("Columns:", ','.join(tdf.columns))
                print("test shapes:", tdf.shape)
            datasets = [int(i) for i in list(tdf.get('dataset', []))]
            if datasets:
                dbs_h = get_dbs_header(tdf, nfile)
                dbses = [int(i) for i in list(tdf[dbs_h])]
            if verbose:
                print(tdf)
            time0 = time.time()
            predictions = fit.predict(tdf) if not proba else \
                    np.asarray(fit.predict_proba(tdf))[:, list(fit.classes_).index(1)]
            rtime += time.time()-time0
            if datasets:
                out = pd.DataFrame({'dataset': datasets, dbs_h: dbses,
                                    'prediction': predictions})
            else:
                out = pd.DataFrame({'prediction': predictions})
            if outfname:
                out.to_csv(outfname, header=True, index=False)

    if timeout: # output running time
        data = {}
        if os.path.isfile(timeout): # append if file exists
            headers = []
            for line in open(timeout, 'r'):
                line = line.strip(" \r\n").split(',')
                if not headers:
                    headers = line
                    if line[0] != 'model' or line[1] != 'running_time_s':
                        print("Error writing model running time to %s: "
                              "unrecognized output file found." % timeout)
                        continue
                else:
                    data[line[0]] = float(line[1])
        if learner in data:
            data[learner] += rtime
        else:
            data[learner] = rtime
        fstream = open(timeout, 'w')
        fstream.write("model,running_time_s\n")
        for key in sorted(data.keys()):
            fstream.write("%s,%.3f\n" % (key, data[key]))
        fstream.close()
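For reference, a sketch of the running-time bookkeeping file that the timeout block above reads and rewrites; the model names and timings are made-up illustrative values:

# Illustrative contents of the --timeout CSV after two accumulated runs:
# model,running_time_s
# RandomForestClassifier,42.137
# SGDClassifier,1.208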