Exemple #1
0
 def fetch(self, api, params=None):
     """
     Yield job records for the given dashboard API.

     The URL <self.url>/<api> is fetched through the parent class; rows are
     produced only when the decoded JSON reply contains a 'jobs' entry.
     """
     reply = super(DashboardService, self).fetch('%s/%s' % (self.url, api), params)
     payload = json.loads(reply)
     if  not payload or 'jobs' not in payload:
         return
     for job in payload['jobs']:
         yield job
Exemple #2
0
 def fetch(self, api, params=None):
     """
     Yield popularity DB records for the given API, using the local cache.

     :param api: API name appended to self.url
     :param params: optional dict of query parameters, None means no parameters
     :return: generator over the rows of the 'DATA' section of the JSON reply

     The raw reply is looked up in the 'cache' collection of self.storage
     under a key derived from the URL and parameters; on a miss it is
     downloaded and stored there.
     """
     # urllib.urlencode rejects None, so map the default onto an empty
     # parameter set before building the URL
     if  params is None:
         params = {}
     url = '%s/%s/?%s' % (self.url, api, urllib.urlencode(params, doseq=True))
     docid = genkey("url=%s params=%s" % (url, params))
     res = self.storage.fetch_one('cache', {'_id':docid})
     if  res and 'data' in res:
         if  self.verbose:
             print("%s::fetch url=%s, params=%s, docid=%s" \
                     % (self.name, url, params, docid))
         data = res['data']
     else:
         if  self.verbose:
             print("%s::fetch url=%s, params=%s" % (self.name, url, params))
         # NOTE: popularity DB has two different access points, one
         # within CERN network and one outside. The former does not require
         # authentication, while the latter passes through CERN SSO.
         # The following block reflects this; in the future, when popularity DB
         # moves into the cmsweb domain, we'll no longer need it
         if  self.url.find('cms-popularity-prod') != -1 or \
                 self.url.find('cmsweb') != -1:
             data = getdata(url, ckey=self.ckey, cert=self.cert, debug=self.verbose)
         else:
             data = sso_getdata(url, ckey=self.ckey, cert=self.cert, debug=self.verbose)
         self.storage.insert('cache', {'_id':docid, 'data': data, 'url': url, 'params': params})
     data = json.loads(data)
     for row in data['DATA']:
         yield row
Exemple #3
0
 def fetch(self, api, params=None):
     """
     Yield job records for the given dashboard API.

     In verbose mode the request URL and parameters are printed before the
     call is delegated to the parent class. Nothing is yielded when the
     decoded reply is empty or has no 'jobs' entry.
     """
     target = '%s/%s' % (self.url, api)
     if self.verbose:
         print(target, params)
     decoded = json.loads(super(DashboardService, self).fetch(target, params))
     jobs = decoded['jobs'] if decoded and 'jobs' in decoded else []
     for entry in jobs:
         yield entry
Exemple #4
0
 def dataset_dbsinst(self, dataset):
     """
     Return the first DBS instance of self.all_dbs whose datasets API
     reports the given dataset. Falls through (returning None) when no
     instance matches.
     """
     base = '%s/datasets' % self.url
     query = {'dataset': dataset}
     for dbsinst in self.all_dbs:
         reply = super(DBSService, self).fetch(base.replace('prod/global', dbsinst), query)
         found = json.loads(reply)
         if  not len(found):
             continue
         head = found[0]
         if  'dataset' in head and head['dataset'] == dataset:
             return dbsinst
Exemple #5
0
 def dataset_dbsinst(self, dataset):
     """
     Look up which DBS instance knows the given dataset.

     Every instance listed in self.all_dbs is queried in order and the
     first one whose leading record carries the same dataset name is
     returned; without a match the method ends without a return value.
     """
     template = '%s/datasets' % self.url
     params = {'dataset': dataset}
     for dbsinst in self.all_dbs:
         dbs_url = template.replace('prod/global', dbsinst)
         records = json.loads(super(DBSService, self).fetch(dbs_url, params))
         matched = len(records) and 'dataset' in records[0] \
                 and records[0]['dataset'] == dataset
         if matched:
             return dbsinst
Exemple #6
0
 def dataset_release_versions(self, dataset, dbsinst=None):
     """
     Yield the release versions of the given dataset.

     :param dataset: dataset name sent to the releaseversions API
     :param dbsinst: DBS instance to try first; None (the default) keeps
         the instance already present in self.url
     :return: generator of distinct release versions, or the single
         string "N/A" when no usable record was found

     If the first query has no 'release_version' record, every instance of
     self.all_dbs is queried until one provides it.
     """
     url = '%s/releaseversions' % self.url
     params = {'dataset':dataset}
     # str.replace raises TypeError for None, so the default dbsinst must
     # leave the URL untouched instead of being substituted
     if  dbsinst:
         dbs_url = url.replace('prod/global', dbsinst)
     else:
         dbs_url = url
     data = json.loads(super(DBSService, self).fetch(dbs_url, params))
     if  not len(data) or not 'release_version' in data[0]:
         # distinct loop name, the parameter is no longer shadowed
         for inst in self.all_dbs:
             dbs_url = url.replace('prod/global', inst)
             data = json.loads(super(DBSService, self).fetch(dbs_url, params))
             if  len(data) and 'release_version' in data[0]:
                 break
     if  data and isinstance(data, list) and 'release_version' in data[0]:
         for ver in set(data[0]['release_version']):
             yield ver
     else:
         yield "N/A"
Exemple #7
0
 def data_tiers(self):
     """
     Return the collection of known data-tier names.

     self.tiers acts as a cache: it is filled from the datatiers API of
     every instance in self.all_dbs only while it is still empty.
     """
     if  not self.tiers:
         base = '%s/datatiers' % self.url
         params = {}
         for dbsinst in self.all_dbs:
             reply = super(DBSService, self).fetch(
                 base.replace('prod/global', dbsinst), params)
             for tdict in json.loads(reply):
                 self.tiers.add(tdict['data_tier_name'])
     return self.tiers
Exemple #8
0
 def data_tiers(self):
     """
     Return list of known data-tiers.

     self.tiers acts as a cache: it is filled from the datatiers API of
     every instance in self.all_dbs only while it is still empty.

     :return: a new list with the cached data-tier names; the cached and
         the freshly fetched path now both return a list (the cached path
         used to hand out the internal set itself)
     """
     if not self.tiers:
         url = '%s/datatiers' % self.url
         params = {}
         for dbsinst in self.all_dbs:
             dbs_url = url.replace('prod/global', dbsinst)
             data = json.loads(super(DBSService, self).fetch(dbs_url, params))
             for tdict in data:
                 self.tiers.add(tdict['data_tier_name'])
     return list(self.tiers)
Exemple #9
0
 def dataset_release_versions(self, dataset, dbsinst=None):
     """
     Yield the release versions of the given dataset.

     :param dataset: dataset name sent to the releaseversions API
     :param dbsinst: DBS instance to try first; None (the default) keeps
         the instance already present in self.url
     :return: generator of distinct release versions, or the single
         string "N/A" when no usable record was found

     If the first query has no 'release_version' record, every instance of
     self.all_dbs is queried until one provides it.
     """
     url = '%s/releaseversions' % self.url
     params = {'dataset': dataset}
     # str.replace raises TypeError for None, so the default dbsinst must
     # leave the URL untouched instead of being substituted
     dbs_url = url.replace('prod/global', dbsinst) if dbsinst else url
     data = json.loads(super(DBSService, self).fetch(dbs_url, params))
     if not len(data) or not 'release_version' in data[0]:
         # distinct loop name, the parameter is no longer shadowed
         for inst in self.all_dbs:
             dbs_url = url.replace('prod/global', inst)
             data = json.loads(
                 super(DBSService, self).fetch(dbs_url, params))
             if len(data) and 'release_version' in data[0]:
                 break
     if data and isinstance(data, list) and 'release_version' in data[0]:
         for ver in set(data[0]['release_version']):
             yield ver
     else:
         yield "N/A"
def popdb_datasets(tstart, tstop, url):
    """
    Print popDB dataset statistics for the given time window as CSV.

    The DSStatInTimeWindow API is queried with the tstart/tstop values; the
    keys of the first record are printed as the header line, followed by
    one comma-separated line per record.
    """
    ckey, cert = get_key_cert()
    query = urllib.urlencode({'tstart': tstart, 'tstop': tstop}, doseq=True)
    reply = getdata('%s/%s/?%s' % (url, 'DSStatInTimeWindow', query),
                    ckey=ckey, cert=cert, debug=0)
    columns = []
    for record in json.loads(reply)['DATA']:
        if not columns:
            columns = record.keys()
            print(','.join(columns))
        print(','.join([str(record[name]) for name in columns]))
def popdb_datasets(tstart, tstop, url):
    """
    Fetch DSStatInTimeWindow records from popDB and print them as CSV.

    The header line is taken from the keys of the first record; each
    following line holds the values of one record in the same key order.
    """
    endpoint = "DSStatInTimeWindow"
    ckey, cert = get_key_cert()
    window = {"tstart": tstart, "tstop": tstop}
    encoded = urllib.urlencode(window, doseq=True)
    full_url = "%s/%s/?%s" % (url, endpoint, encoded)
    payload = json.loads(getdata(full_url, ckey=ckey, cert=cert, debug=0))
    names = []
    for entry in payload["DATA"]:
        if not names:
            names = entry.keys()
            print(",".join(names))
        values = []
        for key in names:
            values.append(str(entry[key]))
        print(",".join(values))
Exemple #12
0
 def fetch(self, api, params=None):
     """
     Yield the node name of every block replica reported by PhEDEx.

     The 'replicas' api is an alias for the blockReplicas API. Any error
     raised while fetching or walking the reply is printed and ends the
     iteration instead of propagating.
     """
     endpoint = 'blockReplicas' if api == 'replicas' else api
     url = '%s/%s' % (self.url, endpoint)
     try:
         payload = json.loads(super(PhedexService, self).fetch(url, params))
         for block in payload['phedex']['block']:
             for replica in block['replica']:
                 yield replica['node']
     except Exception as exc:
         print(str(exc))
Exemple #13
0
 def fetch(self, api, params=None):
     """
     Fetch PhEDEx data and yield replica node names.

     Requests for 'replicas' go to the blockReplicas API, anything else to
     the API of the same name. Failures are reported on stdout and the
     generator simply stops.
     """
     if api != 'replicas':
         url = '%s/%s' % (self.url, api)
     else:
         url = '%s/blockReplicas' % self.url
     try:
         reply = super(PhedexService, self).fetch(url, params)
         blocks = json.loads(reply)['phedex']['block']
         for blk in blocks:
             for repl in blk['replica']:
                 yield repl['node']
     except Exception as exc:
         print(str(exc))
Exemple #14
0
 def fetch(self, api, params=None):
     """
     Yield records of the given API, using the local cache.

     :param api: API name appended to self.url
     :param params: optional dict of query parameters, None means no parameters
     :return: generator over the rows of the 'DATA' section of the JSON reply

     The raw reply is looked up in the 'cache' collection of self.storage
     under a key derived from the URL and parameters; on a miss it is
     downloaded with getdata and stored there.
     """
     # urllib.urlencode rejects None, so map the default onto an empty
     # parameter set before building the URL
     if  params is None:
         params = {}
     url = '%s/%s/?%s' % (self.url, api, urllib.urlencode(params, doseq=True))
     docid = genkey("url=%s params=%s" % (url, params))
     res = self.storage.fetch_one('cache', {'_id':docid})
     if  res and 'data' in res:
         if  self.verbose:
             print("%s::fetch url=%s, params=%s, docid=%s" \
                     % (self.name, url, params, docid))
         data = res['data']
     else:
         if  self.verbose:
             print("%s::fetch url=%s, params=%s" % (self.name, url, params))
         data = getdata(url, ckey=self.ckey, cert=self.cert, debug=self.verbose)
         self.storage.insert('cache', {'_id':docid, 'data': data, 'url': url, 'params': params})
     data = json.loads(data)
     for row in data['DATA']:
         yield row
def popdb_datasets(tstart, tstop, url):
    """
    Print popDB dataset statistics for the given time window as CSV.

    The keys of the first DSStatInTimeWindow record form the header line;
    every record is then printed as one comma-separated line.
    """
    ckey, cert = get_key_cert()
    query = urllib.urlencode({'tstart':tstart, 'tstop':tstop}, doseq=True)
    url = '%s/%s?%s' % (url, 'DSStatInTimeWindow', query)
    # NOTE: popularity DB has two access points, one inside the CERN network
    # and one outside. The former needs no authentication, the latter goes
    # through CERN SSO. Once popularity DB moves into the cmsweb domain this
    # distinction is no longer needed.
    if  'cms-popularity-prod.cern.ch' in url:
        reader = getdata
    else:
        reader = sso_getdata
    payload = json.loads(reader(url, ckey=ckey, cert=cert, debug=0))
    columns = []
    for record in payload['DATA']:
        if  not columns:
            columns = record.keys()
            print(','.join(columns))
        print(','.join([str(record[name]) for name in columns]))
Exemple #16
0
 def fetch(self, api, params=None, dbsinst='prod/global', cache=True):
     """
     Yield records of the given DBS API for one DBS instance.

     :param api: API name; 'releases' is mapped onto the releaseversions API
     :param params: query parameters handed to the parent class fetch
     :param dbsinst: DBS instance substituted for 'prod/global' in self.url;
         a falsy value selects 'prod/global'
     :param cache: passed through to the parent class fetch
     :return: generator of rows; 'datasets' rows gain the 'rid' and
         'dbs_instance' keys, 'releases' entries are wrapped into
         {'release': ..., 'rid': ...} dicts, other rows pass through as is
     """
     # a falsy dbsinst used to skip the URL setup and crash further down,
     # fall back to the default instance instead
     if not dbsinst:
         dbsinst = 'prod/global'
     dbs_url = self.url.replace('prod/global', dbsinst)
     inst = {'dbs_instance': self.all_dbs.index(dbsinst)}
     if api == 'releases':
         url = '%s/releaseversions' % dbs_url
     else:
         url = '%s/%s' % (dbs_url, api)
     data = json.loads(super(DBSService, self).fetch(url, params, cache))
     if api == 'releases':
         data = data[0]['release_version']
     for row in data:
         if api == 'datasets':
             try:
                 row['rid'] = row['dataset_id']
             except KeyError:
                 print("Unable to process dataset row", row)
                 if 'dataset' in row:
                     # no dataset_id: derive a numeric id from the name;
                     # md5 needs bytes, so text names are encoded first
                     name = row['dataset']
                     if not isinstance(name, bytes):
                         name = name.encode('utf-8')
                     h = hashlib.md5(name)
                     row['rid'] = int(h.hexdigest()[:10], 16)
                     print("Generated new dataset_id", row['dataset'],
                           h.hexdigest(), row['rid'])
             except Exception:
                 print("Unable to process dataset row", row)
                 raise
             row.update(inst)
             yield row
         elif api == 'releases':
             yield {'release': row, 'rid': genkey(row, truncate=5)}
         else:
             yield row
Exemple #17
0
def sitedb_parser(source):
    """
    SiteDB parser, yields the rows of a SiteDB reply.

    :param source: JSON text, an open data descriptor (which gets closed
        here) or an already decoded object
    :return: generator of rows; when the reply has a 'desc' section each
        row is combined with its column names via rowdict, otherwise the
        entries of 'result' are yielded unchanged
    :raises Exception: when the decoded data is not a dict
    """
    if  isinstance(source, str) or isinstance(source, unicode):
        data = json.loads(source)
    elif isinstance(source, InstanceType) or isinstance(source, file):
        # got data descriptor; finally guarantees it is closed exactly once,
        # even if reporting the error fails
        try:
            data = json.load(source)
        except Exception as exc:
            print_exc(exc)
            raise
        finally:
            source.close()
    else:
        data = source
    if  not isinstance(data, dict):
        raise Exception('Wrong data type, %s' % type(data))
    if  'desc' in data:
        columns = data['desc']['columns']
        for row in data['result']:
            yield rowdict(columns, row)
    else:
        for row in data['result']:
            yield row
Exemple #18
0
 def fetch(self, api, params=None, dbsinst='prod/global', cache=True):
     """
     Yield records of the given DBS API for one DBS instance.

     :param api: API name; 'releases' is mapped onto the releaseversions API
     :param params: query parameters handed to the parent class fetch
     :param dbsinst: DBS instance substituted for 'prod/global' in self.url;
         a falsy value selects 'prod/global'
     :param cache: passed through to the parent class fetch
     :return: generator of rows; 'datasets' rows gain the 'rid' and
         'dbs_instance' keys, 'releases' entries are wrapped into
         {'release': ..., 'rid': ...} dicts, other rows pass through as is
     """
     # a falsy dbsinst used to skip the URL setup and crash further down,
     # fall back to the default instance instead
     if  not dbsinst:
         dbsinst = 'prod/global'
     dbs_url = self.url.replace('prod/global', dbsinst)
     inst = {'dbs_instance':self.all_dbs.index(dbsinst)}
     if  api == 'releases':
         url = '%s/releaseversions' % dbs_url
     else:
         url = '%s/%s' % (dbs_url, api)
     data = json.loads(super(DBSService, self).fetch(url, params, cache))
     if  api == 'releases':
         data = data[0]['release_version']
     for row in data:
         if  api == 'datasets':
             try:
                 row['rid'] = row['dataset_id']
             except KeyError:
                 print("Unable to process dataset row", row)
                 if  'dataset' in row:
                     # no dataset_id: derive a numeric id from the name;
                     # md5 needs bytes, so text names are encoded first
                     name = row['dataset']
                     if  not isinstance(name, bytes):
                         name = name.encode('utf-8')
                     h = hashlib.md5(name)
                     row['rid'] = int(h.hexdigest()[:10], 16)
                     print("Generated new dataset_id", row['dataset'], h.hexdigest(), row['rid'])
             except Exception:
                 print("Unable to process dataset row", row)
                 raise
             row.update(inst)
             yield row
         elif api == 'releases':
             rid = genkey(row, truncate=5)
             yield {'release':row, 'rid':rid}
         else:
             yield row
Exemple #19
0
def model(train_file, newdata_file, idcol, tcol, learner, lparams=None,
        drops=None, split=0.3, scorer=None,
        scaler=None, ofile=None, idx=0, limit=-1, gsearch=None, crossval=None, verbose=False):
    """
    Build and run ML algorithm for given train/test dataframe
    and classifier name. The learners are defined externally
    in DCAF.ml.clf module.

    :param train_file: training data file, read via read_data
    :param newdata_file: optional file with new data to predict on
    :param idcol: id column, always added to the dropped columns
    :param tcol: target column, removed from the features
    :param learner: key of the classifier in learners()
    :param lparams: classifier attributes as dict or JSON string
    :param drops: columns to drop, list or comma-separated string
    :param split: ignored, it is overwritten with 0 on the first line
    :param scorer: not used by the active code of this variant
    :param scaler: name of a class in the preprocessing module
    :param ofile: CSV file for the predictions on newdata_file
    :param idx: passed to read_data for the training file
    :param limit: passed to read_data for the training file
    :param gsearch: if set, run param_search and exit the process
    :param crossval: if set, run crossvalidation and exit the process
    :param verbose: print progress; values above 1 also print the target
    """
    # the caller's split is discarded on purpose: the whole training set is
    # used for training and none of it for validation
    split = 0

    clf = learners()[learner]
    if  lparams:
        if  isinstance(lparams, str):
            lparams = json.loads(lparams)
        elif isinstance(lparams, dict):
            pass
        else:
            raise Exception('Invalid data type for lparams="%s", type: %s' % (lparams, type(lparams)))
        for key, val in lparams.items():
            setattr(clf, key, val)
    # fixed seed, set after lparams so it overrides any random_state given there
    setattr(clf, "random_state", 123)

    print clf
    # NOTE: split is 0 at this point, so this branch is currently dead
    if  split:
        if  isinstance(split, int):
            split = split/100.
        elif isinstance(split, float):
            pass
        elif isinstance(split, basestring):
            split = float(split)
        print "Split level: train %s%%, validation %s%%" % (round((1-split)*100), round(split*100))
    if  verbose:
        print "idx/limit", idx, limit

    # read data and normalize it
    if  drops:
        if  isinstance(drops, basestring):
            drops = drops.split(',')
        if  idcol not in drops:
            drops += [idcol]
    else:
        drops = [idcol]
    xdf = read_data(train_file, drops, idx, limit, scaler)

    # get target variable and exclude choice from train data
    target = xdf[tcol]
    xdf = xdf.drop(tcol, axis=1)
    if  verbose:
        print "Train file", train_file
        print "Columns:", ','.join(xdf.columns)
        print "train shapes:", xdf.shape, target.shape
        if  verbose>1:
            print "Target:", tcol, target

    # split our train data
    # NOTE: with split forced to 0 only the else branch runs
    if  split:
        # fixed random_state keeps the train/validation partition reproducible
        x_train, x_rest, y_train, y_rest = train_test_split(xdf, target, test_size=split, random_state=1234)
        if  verbose:
            print "train shapes after splitting:", x_train.shape, y_train.shape
    else:
        x_train = xdf
        y_train = target
        x_rest = None
        y_rest = None
    # both helper modes terminate the process once they are done
    if  gsearch:
        param_search(clf, x_train, y_train, x_rest, y_rest, gsearch)
        sys.exit(0)
    if  crossval:
        crossvalidation(clf, xdf, target)
        sys.exit(0)

    ###############################################################################
    # Univariate feature scoring: the p-value of every feature is printed,
    # first for the ANOVA F-test, then for the chi-squared test.
    # NOTE(review): only pvalues_ is read and transform() is never called, so
    # the percentile arguments presumably have no effect on the output - confirm
    from sklearn.feature_selection import SelectKBest, chi2, SelectPercentile, f_classif

    selector = SelectPercentile(f_classif, percentile=100) # by F test
    selector.fit(x_train, y_train)
    pvs = selector.pvalues_
    # output p-values of features in ascending order
    columns = xdf.columns
    indices = np.argsort(pvs)
    num = len(columns)
    print("\n Feature ranking by ANOVA F test:")
    for f in range(num):
        print("%d. feature selection test p-value %f, feature %s" % (f + 1, pvs[indices[f]], columns[indices[f]]))


    selector = SelectPercentile(chi2, percentile=10) # by chi square test
    selector.fit(x_train, y_train)
    pvs = selector.pvalues_
    # output p-values of features in ascending order
    columns = xdf.columns
    indices = np.argsort(pvs)
    num = len(columns)
    print("\n Feature ranking by Chi Squared test:")
    for f in range(num):
        print("%d. feature selection test p-value %f, feature %s" % (f + 1, pvs[indices[f]], columns[indices[f]]))


    ###############################################################################

    # preprocessing of "scaler" type: a fresh scaler is fitted on the training data
    # TODO: an attempt to skip the standardization here (scaler = None) did not work
    if  scaler:
        x_train = getattr(preprocessing, scaler)().fit_transform(x_train)

    time0 = time.time()
    fit = clf.fit(x_train, y_train)
    if  verbose:
        print "Train elapsed time", time.time()-time0

    # NOTE: the validation-split scoring that used to follow here (feature
    # importances, scorer metrics, log-loss) is disabled because split is
    # forced to 0; feature importances are reported after the prediction on
    # the new data instead

    # predict on new data set, by the learned classifier
    if  newdata_file:
        tdf = read_data(newdata_file, drops, scaler=scaler)
        if  tcol in tdf.columns:
            tdf = tdf.drop(tcol, axis=1)
        if  verbose:
            print "New data file", newdata_file
            print "Columns:", ','.join(tdf.columns)
            print "test shapes:", tdf.shape
        # keep the identifier columns before tdf is replaced by the scaled data
        datasets = [int(i) for i in list(tdf['dataset'])]
        dbses = [int(i) for i in list(tdf['dbs'])]
        if  scaler:
            # NOTE(review): this fits a new scaler on the new data instead of
            # reusing the one fitted on the training data - confirm intent
            tdf = getattr(preprocessing, scaler)().fit_transform(tdf)
        predictions = fit.predict(tdf)
        data = {'dataset':datasets, 'dbs': dbses, 'prediction':predictions}
        out = pd.DataFrame(data=data)
        if  ofile:
            out.to_csv(ofile, header=True, index=False)

        # best effort: any error while reporting importances is silently
        # ignored (e.g. a learner without feature_importances_)
        try:
            importances = clf.feature_importances_
            if  importances.any():
                print "\n Feature ranking by random forest classifier:"
                columns = xdf.columns
                indices = np.argsort(importances)[::-1]
                # all features are listed, most important first
                num = len(columns)
                for f in range(num):
                    print("%d. importance %f, feature %s" % (f + 1, importances[indices[f]], columns[indices[f]]))
        except:
            pass
Exemple #20
0
 def fetch(self, api, params=None):
     """
     Yield the 'results' section of the McM reply for the given API.

     The section is yielded as one object, not row by row.
     """
     reply = super(mcmService, self).fetch('%s/%s' % (self.url, api), params)
     yield json.loads(reply)['results']
Exemple #21
0
 def fetch(self, api, params=None):
     """
     Fetch <self.url>/<api> through the parent class and yield, as a
     single item, the 'results' entry of the decoded JSON reply.
     """
     target = '%s/%s' % (self.url, api)
     raw = super(mcmService, self).fetch(target, params)
     decoded = json.loads(raw)
     results = decoded['results']
     yield results
Exemple #22
0
def model(train_file, newdata_file, idcol, tcol, learner, lparams=None,
        drops=None, split=0.3, scorer=None,
        scaler=None, ofile=None, idx=0, limit=-1, gsearch=None, crossval=None, verbose=False):
    """
    Build and run ML algorithm for given train/test dataframe
    and classifier name. The learners are defined externally
    in DCAF.ml.clf module.

    :param train_file: training data file, read via read_data
    :param newdata_file: optional file with new data to predict on
    :param idcol: id column, always added to the dropped columns
    :param tcol: target column, removed from the features
    :param learner: key of the classifier in learners()
    :param lparams: classifier attributes as dict or JSON string
    :param drops: columns to drop, list or comma-separated string
    :param split: validation fraction; an int is read as a percentage and a
        string is converted to float; a falsy value disables validation
    :param scorer: comma-separated names looked up in metrics.SCORERS
    :param scaler: name of a class in the preprocessing module; it is fitted
        on the training part only and then applied to validation and new data
    :param ofile: CSV file for the predictions on newdata_file
    :param idx: passed to read_data for the training file
    :param limit: passed to read_data for the training file
    :param gsearch: if set, run param_search and exit the process
    :param crossval: if set, run crossvalidation and exit the process
    :param verbose: print progress; values above 1 print per-row details
    """
    clf = learners()[learner]
    if  lparams:
        if  isinstance(lparams, str):
            lparams = json.loads(lparams)
        elif not isinstance(lparams, dict):
            raise Exception('Invalid data type for lparams="%s", type: %s' % (lparams, type(lparams)))
        for key, val in lparams.items():
            setattr(clf, key, val)
    # fixed seed, set after lparams so it overrides any random_state given there
    setattr(clf, "random_state", 123)
    print(clf)
    if  split:
        if  isinstance(split, int):
            split = split/100.
        elif isinstance(split, float):
            pass
        elif isinstance(split, basestring):
            split = float(split)
        print("Split level: train %s%%, validation %s%%" % (round((1-split)*100), round(split*100)))
    if  verbose:
        print("idx/limit", idx, limit)

    # read data and normalize it
    if  drops:
        if  isinstance(drops, basestring):
            drops = drops.split(',')
        if  idcol not in drops:
            drops += [idcol]
    else:
        drops = [idcol]
    xdf = read_data(train_file, drops, idx, limit, scaler)

    # get target variable and exclude choice from train data
    target = xdf[tcol]
    xdf = xdf.drop(tcol, axis=1)
    if  verbose:
        print("Train file", train_file)
        print("Columns:", ','.join(xdf.columns))
        print("train shapes:", xdf.shape, target.shape)
        if  verbose>1:
            print("Target:", tcol, target)

    # split our train data
    if  split:
        x_train, x_rest, y_train, y_rest = \
                train_test_split(xdf, target, test_size=split)
        if  verbose:
            print("train shapes after splitting:", x_train.shape, y_train.shape)
    else:
        x_train = xdf
        y_train = target
        x_rest = None
        y_rest = None
    # both helper modes terminate the process once they are done
    if  gsearch:
        param_search(clf, x_train, y_train, x_rest, y_rest, gsearch)
        sys.exit(0)
    if  crossval:
        crossvalidation(clf, xdf, target)
        sys.exit(0)

    # Fit the scaler on the training part only and keep it, so validation
    # and new data go through exactly the same transformation. Previously
    # the validation part was predicted unscaled and the new data was scaled
    # with a scaler re-fitted on the new data itself.
    scaler_obj = None
    if  scaler:
        scaler_obj = getattr(preprocessing, scaler)()
        x_train = scaler_obj.fit_transform(x_train)
        if  x_rest is not None:
            x_rest = scaler_obj.transform(x_rest)
    time0 = time.time()
    fit = clf.fit(x_train, y_train)
    if  verbose:
        print("Train elapsed time", time.time()-time0)
    if  split:
        predictions = fit.predict(x_rest)
        # best effort: not every learner exposes feature_importances_
        try:
            importances = clf.feature_importances_
            if  importances.any():
                print("Feature ranking:")
                columns = xdf.columns
                indices = np.argsort(importances)[::-1]
                # report at most the nine most important features
                num = min(9, len(columns))
                for f in range(num):
                    print("%d. importance %f, feature %s" % (f + 1, importances[indices[f]], columns[indices[f]]))
        except Exception as exc:
            # keep the run alive, but do not hide interrupts or exit requests
            if  verbose:
                print("No feature ranking available:", str(exc))
        if  scorer:
            for scr in scorer.split(','):
                # derive the metrics function name from the scorer representation
                scr_str = repr(metrics.SCORERS[scr]).replace('make_scorer(', '').replace(')', '')
                method = scr_str.split(',')[0]
                res = getattr(metrics, method)(y_rest, predictions)
                print("Score metric (%s): %s" % (method, res))
        if  verbose:
            loss = 0
            tot = 0
            for pval, yval in zip(predictions, y_rest):
                if  verbose>1:
                    print("predict value %s, real value %s" % (pval, yval))
                loss += logloss(pval, yval)
                tot += 1
            print("Final Logloss", loss/tot)
    else:
        print("Since there is no train/validation splitting, no prediction metrics will be shown")

    # new data file for which we want to predict
    if  newdata_file:
        tdf = read_data(newdata_file, drops, scaler=scaler)
        if  tcol in tdf.columns:
            tdf = tdf.drop(tcol, axis=1)
        if  verbose:
            print("New data file", newdata_file)
            print("Columns:", ','.join(tdf.columns))
            print("test shapes:", tdf.shape)
        # keep the identifier columns before tdf is replaced by the scaled data
        datasets = [int(i) for i in list(tdf['dataset'])]
        dbses = [int(i) for i in list(tdf['dbs'])]
        if  scaler_obj is not None:
            tdf = scaler_obj.transform(tdf)
        predictions = fit.predict(tdf)
        data = {'dataset':datasets, 'dbs': dbses, 'prediction':predictions}
        out = pd.DataFrame(data=data)
        if  ofile:
            out.to_csv(ofile, header=True, index=False)
Exemple #23
0
def model_iter(train_file_list, newdata_file, idcol, tcol,
    learner, lparams=None, drops=None, split=0.1, scaler=None, ofile=None, seed=123, verbose=False):
    """
    Build and run ML algorithm incrementally over a list of train files
    for given classifier name. The learners are defined externally
    in DCAF.ml.clf module.

    :param train_file_list: training files, fed one by one to partial_fit
    :param newdata_file: optional file with new data to predict on
    :param idcol: id column, always added to the dropped columns
    :param tcol: target column, removed from the features
    :param learner: 'SGDClassifier' or 'SGDRegressor'
    :param lparams: classifier attributes as dict or JSON string
    :param drops: columns to drop, list or comma-separated string
    :param split: fraction of every train file held out for scoring; a
        falsy value trains on the whole file and prints no score
    :param scaler: name of a class in the preprocessing module, put in
        front of the learner in a Pipeline
    :param ofile: CSV file for the predictions on newdata_file
    :param seed: seed for the learner, the random module and the splitting
    :param verbose: print columns, target and timing information
    :raises Exception: for an unsupported learner or invalid lparams type
    """
    if  learner not in ['SGDClassifier', 'SGDRegressor']:
        raise Exception("Unsupported learner %s" % learner)
    clf = learners()[learner]
    setattr(clf, "random_state", seed)
    random.seed(seed)
    if  lparams:
        if  isinstance(lparams, str):
            lparams = json.loads(lparams)
        elif not isinstance(lparams, dict):
            raise Exception('Invalid data type for lparams="%s", type: %s' % (lparams, type(lparams)))
        for key, val in lparams.items():
            setattr(clf, key, val)
    if  scaler:
        clf = Pipeline([('scaler',getattr(preprocessing, scaler)()), ('clf', clf)])
    print("clf:", clf)

    if  drops:
        if  isinstance(drops, basestring):
            drops = drops.split(',')
        if  idcol not in drops:
            drops += [idcol]
    else:
        drops = [idcol]
    fit = None
    for train_file in train_file_list:
        print("Train file", train_file)
        # read data and normalize it
        xdf = read_data(train_file, drops, scaler=scaler)

        # get target variable and exclude choice from train data
        target = xdf[tcol]
        xdf = xdf.drop(tcol, axis=1)
        if  verbose:
            print("Columns:", ','.join(xdf.columns))
            print("Target:", target)

        if  split:
            # honour the requested fraction; it used to be hard-coded to
            # 0.1, which silently ignored any other split value
            x_train, x_rest, y_train, y_rest = \
                    train_test_split(xdf, target, test_size=split, random_state=seed)
        else:
            x_train, y_train = xdf, target
            x_rest = y_rest = None
        time0 = time.time()
        fit = clf.partial_fit(x_train, y_train)
        if  verbose:
            print("Train elapsed time", time.time()-time0)
        if  split:
            print("### SCORE", clf.score(x_rest, y_rest))

    # new data for which we want to predict
    if  newdata_file:
        tdf = read_data(newdata_file, drops, scaler=scaler)
        if  tcol in tdf.columns:
            tdf = tdf.drop(tcol, axis=1)
        datasets = [int(i) for i in list(tdf['dataset'])]
        dbs_h = get_dbs_header(tdf, newdata_file)
        dbses = [int(i) for i in list(tdf[dbs_h])]
        # NOTE(review): fit stays None when train_file_list is empty, which
        # makes this call fail - confirm callers always pass train files
        predictions = fit.predict_proba(tdf)
        data = {'dataset':datasets, dbs_h: dbses, 'prediction':predictions}
        out = pd.DataFrame(data=data)
        if  ofile:
            out.to_csv(ofile, header=True, index=False)
Exemple #24
0
def _model_newdata_files(newdata_file):
    """
    Resolve the newdata_file specification into a list of files to predict.

    The specification is tried, in this order, as: an existing file,
    a comma-separated list of names, a glob pattern (contains '*'),
    an existing directory which is searched via findfiles.
    Return the list of files (possibly empty) or None when the
    specification matches none of these forms.
    """
    if os.path.isfile(newdata_file):
        return [newdata_file]
    if newdata_file.find(',') != -1:
        return newdata_file.split(',')
    if newdata_file.find('*') != -1:
        return glob.glob(newdata_file)
    if os.path.isdir(newdata_file):
        nfiles = []
        # collect matches for every extension (not only the last one) and
        # keep each file once even if it is reported for several extensions
        for ext in ['.csv.gz', '.csv', 'csv.bz2']:
            for fname in findfiles(newdata_file, ext):
                if fname not in nfiles:
                    nfiles.append(fname)
        return nfiles
    return None

def _model_update_timing(timeout, learner, rtime):
    """
    Add rtime (seconds) to the learner entry of the timeout CSV file.

    The file has a "model,running_time_s" header followed by one row per
    model. Existing rows are read, the learner row is incremented (or
    created) and the whole file is rewritten sorted by model name.
    """
    data = {}
    if os.path.isfile(timeout): # append if file exists
        headers = []
        with open(timeout, 'r') as istream:
            for line in istream:
                row = line.strip(" \r\n").split(',')
                if not headers:
                    headers = row
                    # slice comparison also copes with a one-column header
                    if row[:2] != ['model', 'running_time_s']:
                        print("Error writing model running time to %s: unrecognized output file found." % timeout)
                    continue
                if len(row) < 2: # blank or truncated line, nothing to read
                    continue
                data[row[0]] = float(row[1])
    data[learner] = data.get(learner, 0) + rtime
    with open(timeout, 'w') as ostream:
        ostream.write("model,running_time_s\n")
        for key in sorted(data):
            ostream.write("%s,%.3f\n" % (key, data[key]))

def model(train_file, newdata_file, idcol, tcol, learner, lparams=None,
        drops=None, split=0.3, scorer=None, scaler=None, ofile=None,
        idx=0, limit=-1,  gsearch=None, crossval=None, seed=123,
        verbose=False, timeout=None, proba=False):
    """
    Build and run ML algorithm for given train/test dataframe
    and classifier name. The learners are defined externally
    in DCAF.ml.clf module.

    Parameters:
    - train_file: input passed to read_data to get the train dataframe
    - newdata_file: data to predict; an existing file, a comma-separated
      list of files, a glob pattern or a directory
    - idcol: id column, it is always added to the dropped columns
    - tcol: name of the target column
    - learner: key of the learner in the learners() dictionary
    - lparams: dict (or its JSON string) of attributes to set on the learner
    - drops: list or comma-separated string of columns passed to read_data
      as columns to drop; the caller's list is not modified
    - split: validation fraction; an int is treated as a percentage and
      a string is converted to float; a false value disables splitting
    - scorer: comma-separated scorer names; tp, tn, fp, fn, tpr, tnr, fpr,
      fnr are taken from rates(), other names from metrics.SCORERS
    - scaler: name of a class in the preprocessing module, when given
      the learner is wrapped into a scaler+learner Pipeline
    - ofile: name of the output CSV file with predictions; when several
      files are predicted the name becomes <learner>_<ofile>_<index>
    - idx, limit: passed to read_data when reading the train file
    - gsearch: when set, run param_search with it and exit
    - crossval: when set, run crossvalidation and exit
    - seed: random seed for the learner, random module and data split
    - verbose: verbosity level (values above 1 print more details)
    - timeout: name of a CSV file where the accumulated train/predict
      running time of this learner is stored
    - proba: predict probability of class 1 via predict_proba instead of
      calling predict; not supported together with split

    The function exits via sys.exit when gsearch or crossval is used,
    when proba is combined with split, and when newdata_file can not be
    interpreted. It raises Exception for a learner without predict_proba
    in proba mode and for lparams of unsupported type.
    """
    clf = learners()[learner]
    if proba and not (hasattr(clf, 'predict_proba') and callable(getattr(clf, 'predict_proba'))):
        raise Exception("ERROR: model %s does not provide method 'predict_proba'" % learner)
    if lparams:
        if isinstance(lparams, str):
            lparams = json.loads(lparams)
        elif isinstance(lparams, dict):
            pass
        else:
            raise Exception('Invalid data type for lparams="%s", type: %s' % (lparams, type(lparams)))
        for key, val in lparams.items():
            setattr(clf, key, val)
    setattr(clf, "random_state", seed)
    random.seed(seed)
    if scaler:
        clf = Pipeline([('scaler', getattr(preprocessing, scaler)()), ('clf', clf)])
    print(clf)
    if split:
        if isinstance(split, int): # integer means percentage
            split = split/100.
        elif isinstance(split, float):
            pass
        elif isinstance(split, basestring):
            split = float(split)
        print("Split level: train %s%%, validation %s%%" % (round((1-split)*100), round(split*100)))
    if verbose:
        print("idx/limit", idx, limit)

    # read data and normalize it
    if drops:
        if isinstance(drops, basestring):
            drops = drops.split(',')
        else:
            drops = list(drops) # work on a copy, do not touch caller's list
        if idcol not in drops:
            drops.append(idcol)
    else:
        drops = [idcol]
    xdf = read_data(train_file, drops, idx, limit, scaler)

    # get target variable and exclude choice from train data
    target = xdf[tcol]
    xdf = xdf.drop(tcol, axis=1)
    if verbose:
        print("Train file", train_file)
        print("Columns:", ','.join(xdf.columns))
        print("train shapes:", xdf.shape, target.shape)
        if verbose > 1:
            print("Target:", tcol, target)

    # split our train data
    if split:
        x_train, x_rest, y_train, y_rest = \
                train_test_split(xdf, target, test_size=split, random_state=seed)
        if verbose:
            print("train shapes after splitting:", x_train.shape, y_train.shape)
    else:
        x_train = xdf
        y_train = target
        x_rest = None
        y_rest = None
    if gsearch:
        param_search(clf, x_train, y_train, x_rest, y_rest, gsearch)
        sys.exit(0)
    if crossval:
        crossvalidation(clf, xdf, target)
        sys.exit(0)
    if split and proba:
        # fail before the (possibly long) training instead of after it
        print("ERROR in model.py: probabilities not supported in split mode")
        sys.exit(1)

    time0 = time.time()
    fit = clf.fit(x_train, y_train)
    rtime = time.time()-time0
    if verbose:
        print("Train elapsed time", time.time()-time0)
    if split:
        time0 = time.time()
        predictions = fit.predict(x_rest)
        rtime += time.time()-time0
        # feature ranking is best effort: not every learner (e.g. Pipeline)
        # provides feature_importances_
        try:
            importances = clf.feature_importances_
            if importances.any():
                print("Feature ranking:")
                columns = xdf.columns
                indices = np.argsort(importances)[::-1]
                num = min(9, len(columns))
                for rank, pos in enumerate(indices[:num]):
                    print("%d. importance %f, feature %s" % (rank + 1, importances[pos], columns[pos]))
        except Exception:
            pass
        if scorer:
            slist = ['tp', 'tn', 'fp', 'fn', 'tpr', 'tnr', 'fpr', 'fnr']
            for scr in scorer.split(','):
                if scr.lower() in slist:
                    res = rates(y_rest, predictions)
                    print("Score metric (%s): %s" % (scr.upper(), res[scr.lower()]))
                    continue
                # extract metric function name from the scorer representation
                scr_str = repr(metrics.SCORERS[scr]).replace('make_scorer(', '').replace(')', '')
                method = scr_str.split(',')[0]
                res = getattr(metrics, method)(y_rest, predictions)
                print("Score metric (%s): %s" % (method, res))
        if verbose:
            loss = 0
            tot = 0
            for pval, yval in zip(predictions, y_rest):
                if verbose > 1:
                    print("predict value %s, real value %s" % (pval, yval))
                loss += logloss(pval, yval)
                tot += 1
            if tot: # nothing to average without validation rows
                print("Final Logloss", loss/tot)
    else:
        print("Since there is no train/validation splitting, no prediction metrics will be shown")

    # new data file for which we want to predict
    if newdata_file:
        nfiles = _model_newdata_files(newdata_file)
        if nfiles is None:
            print("ERROR: no files found for --newdata=%s" % newdata_file)
            sys.exit(1)
        if not nfiles:
            print("WARNING: no files to predict in %s" % newdata_file)
            return
        for ni, nfile in enumerate(nfiles): # iterate on files to predict
            if len(nfiles) > 1:
                outfname = '%s_%s_%s' % (learner, ofile, ni)
                print("You provided file list, the output file name %s will be replaced with %s" % (ofile, outfname))
            else:
                outfname = ofile
            tdf = read_data(nfile, drops, scaler=scaler)
            if tcol in tdf.columns:
                tdf = tdf.drop(tcol, axis=1)
            if verbose:
                print("New data file", nfile)
                print("Columns:", ','.join(tdf.columns))
                print("test shapes:", tdf.shape)
            datasets = [int(i) for i in list(tdf.get('dataset', []))]
            if datasets:
                dbs_h = get_dbs_header(tdf, nfile)
                dbses = [int(i) for i in list(tdf[dbs_h])]
            if verbose:
                print(tdf)
            time0 = time.time()
            if proba:
                # keep only the probability of class 1
                cidx = list(fit.classes_).index(1)
                predictions = np.asarray(fit.predict_proba(tdf))[:, cidx]
            else:
                predictions = fit.predict(tdf)
            rtime += time.time()-time0
            if datasets:
                out = pd.DataFrame({'dataset':datasets, dbs_h: dbses, 'prediction':predictions})
            else:
                out = pd.DataFrame({'prediction':predictions})
            if outfname:
                out.to_csv(outfname, header=True, index=False)
        # rtime is cumulative over all files, therefore it is stored once
        # after the loop, otherwise it would be counted for every file
        if timeout: # output running time
            _model_update_timing(timeout, learner, rtime)