Example #1
def add_file(prediction, create, value, *args, **kwargs):
    train_featureset = prediction.model.featureset
    fset_data, data = featurize.load_featureset(train_featureset.file_uri)
    if 'class' in prediction.dataset.name or 'regr' in prediction.dataset.name:
        labels = data['labels']
    else:
        labels = []
    model_data = joblib.load(prediction.model.file_uri)
    # If the stored model is a fitted GridSearchCV, predict with its
    # best estimator.
    if hasattr(model_data, 'best_estimator_'):
        model_data = model_data.best_estimator_
    preds = model_data.predict(fset_data)
    pred_probs = (pd.DataFrame(model_data.predict_proba(fset_data),
                               index=fset_data.index.astype(str),
                               columns=model_data.classes_)
                  if hasattr(model_data, 'predict_proba') else [])
    all_classes = model_data.classes_ if hasattr(model_data, 'classes_') else []
    pred_path = pjoin(TMP_DIR, '{}.npz'.format(str(uuid.uuid4())))
    featurize.save_featureset(fset_data, pred_path, labels=labels,
                              preds=preds, pred_probs=pred_probs)
    prediction.file_uri = pred_path
    DBSession().commit()
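The save/load round trip above is the core pattern: whatever keyword arguments are passed to featurize.save_featureset come back in the data dict returned by featurize.load_featureset. A minimal, self-contained sketch of that round trip (the feature values and class names are made up, and the two-level (feature, channel) columns mirror cesium's feature-set convention):

import os
import tempfile
import uuid

import numpy as np
import pandas as pd
from cesium import featurize

# Hypothetical three-object feature set with cesium-style
# (feature, channel) MultiIndex columns.
columns = pd.MultiIndex.from_tuples([('amplitude', 0), ('median', 0)])
fset_data = pd.DataFrame(np.random.random((3, 2)),
                         index=['a', 'b', 'c'], columns=columns)
preds = np.array(['class1', 'class2', 'class1'])
pred_probs = pd.DataFrame(np.random.random((3, 2)),
                          index=fset_data.index,
                          columns=['class1', 'class2'])

pred_path = os.path.join(tempfile.gettempdir(),
                         '{}.npz'.format(uuid.uuid4()))
featurize.save_featureset(fset_data, pred_path,
                          preds=preds, pred_probs=pred_probs)
fset_loaded, data = featurize.load_featureset(pred_path)
print(data['preds'])               # array of predicted labels
print(data['pred_probs'].columns)  # class names round-trip as well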
Example #2
def compute_or_read_features():
    featurefile = f'{datadir}/plasticc_featuretable.npz'

    def worker(tsobj):
        global features_to_use
        thisfeats = featurize.featurize_single_ts(
            tsobj, features_to_use=features_to_use, raise_exceptions=False)
        return thisfeats

    if os.path.exists(featurefile):
        featuretable, _ = featurize.load_featureset(featurefile)
    else:
        light_curves = read_lightcurves(datadir)
        features_list = []
        with tqdm(total=nobjects, desc="Computing Features") as pbar:
            with multiprocessing.Pool() as pool:
                results = pool.imap(worker, list(light_curves.values()))
                for res in results:
                    features_list.append(res)
                    pbar.update()

        featuretable = featurize.assemble_featureset(
            features_list=features_list, time_series=light_curves.values())
        # Save the computed feature set to a file
        featurize.save_featureset(fset=featuretable, path=featurefile)

    # Flatten cesium's MultiIndex feature table into an Astropy Table,
    # which is easier to feed to scikit-learn
    old_names = featuretable.columns.values
    new_names = ['{}_{}'.format(x, pbmap.get(y, 'meta')) for x, y in old_names]
    cols = [featuretable[col] for col in old_names]
    allfeats = Table(cols, names=new_names)
    del featuretable

    return allfeats
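For context, featurize_single_ts operates on one cesium.time_series.TimeSeries at a time, and assemble_featureset stitches the per-object results into a single feature table. A minimal serial sketch, using a synthetic light curve and the feature list reduced to 'amplitude':

import numpy as np
from cesium import featurize
from cesium.time_series import TimeSeries

# Synthetic single-channel light curve: times, measurements, errors.
t = np.sort(np.random.uniform(0, 100, 200))
m = np.random.normal(20.0, 1.0, 200)
e = np.full(200, 0.1)

ts = TimeSeries(t=t, m=m, e=e, name='obj1')
feats = featurize.featurize_single_ts(ts, features_to_use=['amplitude'],
                                      raise_exceptions=False)
fset = featurize.assemble_featureset(features_list=[feats],
                                     time_series=[ts])
print(fset)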
Example #3
def add_file(model, create, value, *args, **kwargs):
    model_params = {
        "RandomForestClassifier": {
            "bootstrap": True,
            "criterion": "gini",
            "oob_score": False,
            "max_features": "auto",
            "n_estimators": 10,
            "random_state": 0
        },
        "RandomForestRegressor": {
            "bootstrap": True,
            "criterion": "mse",
            "oob_score": False,
            "max_features": "auto",
            "n_estimators": 10
        },
        "LinearSGDClassifier": {
            "loss": "hinge"
        },
        "LinearRegressor": {
            "fit_intercept": True
        }
    }
    fset_data, data = featurize.load_featureset(model.featureset.file_uri)
    model_data = MODELS_TYPE_DICT[model.type](**model_params[model.type])
    model_data.fit(fset_data, data['labels'])
    model.file_uri = pjoin('/tmp/', '{}.pkl'.format(str(uuid.uuid4())))
    joblib.dump(model_data, model.file_uri)
    DBSession().commit()
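MODELS_TYPE_DICT is defined elsewhere in the application; given the parameter names above ('gini', 'hinge', 'fit_intercept'), a plausible stand-in mapping model-type strings to scikit-learn estimator classes would be:

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, SGDClassifier

# Assumed stand-in for the application's MODELS_TYPE_DICT; the real
# mapping may contain additional model types.
MODELS_TYPE_DICT = {
    "RandomForestClassifier": RandomForestClassifier,
    "RandomForestRegressor": RandomForestRegressor,
    "LinearSGDClassifier": SGDClassifier,
    "LinearRegressor": LinearRegression,
}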
Example #4
    def get(self, prediction_id=None, action=None):
        if action == 'download':
            pred_path = Prediction.get_if_owned_by(prediction_id,
                                                   self.current_user).file_uri
            fset, data = featurize.load_featureset(pred_path)
            result = pd.DataFrame(({'label': data['labels']}
                                   if len(data['labels']) > 0 else None),
                                  index=fset.index)
            if len(data.get('pred_probs', [])) > 0:
                result = pd.concat((result, data['pred_probs']), axis=1)
            else:
                result['prediction'] = data['preds']
            result.index.name = 'ts_name'
            self.set_header("Content-Type", 'text/csv; charset="utf-8"')
            self.set_header("Content-Disposition", "attachment; "
                            "filename=cesium_prediction_results.csv")
            self.write(result.to_csv(index=True))
        else:
            if prediction_id is None:
                predictions = [prediction
                               for project in self.current_user.projects
                               for prediction in project.predictions]
                prediction_info = [p.display_info() for p in predictions]
            else:
                prediction = Prediction.get_if_owned_by(prediction_id,
                                                        self.current_user)
                prediction_info = prediction.display_info()

            return self.success(prediction_info)
Example #5
def test_roundtrip_featureset(tmpdir):
    fset_path = os.path.join(str(tmpdir), 'test.npz')
    for n_channels in [1, 3]:
        for labels in [['class1', 'class2'], []]:
            fset, labels = sample_featureset(3, n_channels, ['amplitude'],
                                             labels, names=['a', 'b', 'c'],
                                             meta_features=['meta1'])

            pred_probs = pd.DataFrame(np.random.random((len(fset), 2)),
                                      index=fset.index.values,
                                      columns=['class1', 'class2'])

            featurize.save_featureset(fset, fset_path, labels=labels,
                                      pred_probs=pred_probs)
            fset_loaded, data_loaded = featurize.load_featureset(fset_path)
            npt.assert_allclose(fset.values, fset_loaded.values)
            npt.assert_array_equal(fset.index, fset_loaded.index)
            npt.assert_array_equal(fset.columns, fset_loaded.columns)
            assert isinstance(fset_loaded, pd.DataFrame)
            npt.assert_array_equal(labels, data_loaded['labels'])
            npt.assert_allclose(pred_probs, data_loaded['pred_probs'])
            npt.assert_array_equal(pred_probs.columns,
                                   data_loaded['pred_probs'].columns)
Example #6
    def get(self, prediction_id=None, action=None):
        if action == 'download':
            prediction = Prediction.get_if_owned_by(prediction_id, self.current_user)
            pred_path = prediction.file_uri
            fset, data = featurize.load_featureset(pred_path)
            result = pd.DataFrame(({'label': data['labels']}
                                   if len(data['labels']) > 0 else None),
                                  index=fset.index)
            if len(data.get('pred_probs', [])) > 0:
                result = pd.concat((result, data['pred_probs']), axis=1)
            else:
                result['prediction'] = data['preds']
            result.index.name = 'ts_name'
            self.set_header("Content-Type", 'text/csv; charset="utf-8"')
            self.set_header(
                "Content-Disposition", "attachment; "
                f"filename=cesium_prediction_results_{prediction.project.name}"
                f"_{prediction.dataset.name}"
                f"_{prediction.model.name}_{prediction.finished}.csv")
            self.write(result.to_csv(index=True))
        else:
            if prediction_id is None:
                predictions = [prediction
                               for project in self.current_user.projects
                               for prediction in project.predictions]
                prediction_info = [p.display_info() for p in predictions]
            else:
                prediction = Prediction.get_if_owned_by(prediction_id,
                                                        self.current_user)
                prediction_info = prediction.display_info()

            return self.success(prediction_info)
Example #7
    def display_info(self):
        info = self.to_dict()
        info['model_type'] = self.model.type
        info['dataset_name'] = self.dataset.name
        info['model_name'] = self.model.name
        info['featureset_name'] = self.model.featureset.name
        if self.task_id is None:
            fset, data = featurize.load_featureset(self.file_uri)
            info['isProbabilistic'] = (len(data['pred_probs']) > 0)
            info['results'] = Prediction.format_pred_data(fset, data)
        return info
Example #8
def _build_model_compute_statistics(fset_path, model_type, model_params,
                                    params_to_optimize, model_path):
    '''Build model and return summary statistics.

    Parameters
    ----------
    fset_path : str
        Path to feature set .npz file.
    model_type : str
        Type of model to be built, e.g. 'RandomForestClassifier'.
    model_params : dict
        Dictionary with hyperparameter values to be used in model building.
        Keys are parameter names, values are the associated parameter values.
        These hyperparameters will be passed to the model constructor as-is
        (for hyperparameter optimization, see `params_to_optimize`).
    params_to_optimize : dict or list of dict
        During hyperparameter optimization, various model parameters
        are adjusted to give an optimal fit. This dictionary gives the
        different values that should be explored for each parameter. E.g.,
        `{'alpha': [1, 2], 'beta': [4, 5, 6]}` would fit models on all
        6 combinations of alpha and beta and compare the resulting models'
        goodness-of-fit. If None, only those hyperparameters specified in
        `model_params` will be used (passed to the model constructor as-is).
    model_path : str
        Path indicating where serialized model will be saved.

    Returns
    -------
    score : float
        The model's training score.
    best_params : dict
        Dictionary of best hyperparameter values (keys are parameter names,
        values are the corresponding best values) determined by `scikit-learn`'s
        `GridSearchCV`. If no hyperparameter optimization is performed (i.e.
        `params_to_optimize` is None or an empty dict), this will be an
        empty dict.
    '''
    fset, data = featurize.load_featureset(fset_path)
    if len(data['labels']) != len(fset):
        raise ValueError("Cannot build model for unlabeled feature set.")
    n_jobs = (model_params.pop('n_jobs') if 'n_jobs' in model_params
              and params_to_optimize else -1)
    model = MODELS_TYPE_DICT[model_type](**model_params)
    if params_to_optimize:
        model = GridSearchCV(model, params_to_optimize,
                             n_jobs=n_jobs)
    model.fit(fset, data['labels'])
    score = model.score(fset, data['labels'])
    best_params = model.best_params_ if params_to_optimize else {}
    joblib.dump(model, model_path)

    return score, best_params
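A hypothetical call, with placeholder paths, that fixes random_state and grid-searches over n_estimators:

# Placeholder paths; any parameters named in params_to_optimize are
# explored by GridSearchCV, the rest are passed to the constructor as-is.
score, best_params = _build_model_compute_statistics(
    fset_path='train_featureset.npz',
    model_type='RandomForestClassifier',
    model_params={'random_state': 0},
    params_to_optimize={'n_estimators': [10, 50, 100]},
    model_path='model.pkl')
print(score)        # training score of the best model
print(best_params)  # maps each optimized name to its best value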
Example #9
def feature_scatterplot(fset_path, features_to_plot):
    """Create scatter plot of feature set.

    Parameters
    ----------
    fset_path : str
        Path to feature set to be plotted.
    features_to_plot : list of str
        List of feature names to be plotted.

    Returns
    -------
    (str, str)
        Returns (docs_json, render_items) json for the desired plot.
    """
    fset, data = featurize.load_featureset(fset_path)
    fset = fset[features_to_plot]
    colors = cycle(palette[5])
    plots = np.array([[figure(width=300, height=200)
                       for j in range(len(features_to_plot))]
                      for i in range(len(features_to_plot))])

    for (j, i), p in np.ndenumerate(plots):
        if j == i == 0:
            p.title.text = "Scatterplot matrix"
        p.circle(fset.values[:, i], fset.values[:, j], color=next(colors))
        p.xaxis.minor_tick_line_color = None
        p.yaxis.minor_tick_line_color = None
        p.ygrid[0].ticker.desired_num_ticks = 2
        p.xgrid[0].ticker.desired_num_ticks = 4
        p.outline_line_color = None
        p.axis.visible = None

    # Note: these gridplot keyword arguments target an older bokeh API.
    plot = gridplot(plots.tolist(), ncol=len(features_to_plot),
                    mergetools=True, responsive=True, title="Test")

    # Convert plot to json objects necessary for rendering with bokeh on the
    # frontend
    render_items = [{'docid': plot._id, 'elementid': make_id()}]

    doc = Document()
    doc.add_root(plot)
    docs_json_inner = doc.to_json()
    docs_json = {render_items[0]['docid']: docs_json_inner}

    docs_json = serialize_json(docs_json)
    render_items = serialize_json(render_items)

    return docs_json, render_items
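Hypothetical usage, with a placeholder feature-set path and feature names; the returned JSON strings are what the frontend needs to render the bokeh grid:

# Both arguments are placeholders: the .npz file must exist and the
# names must be columns of the stored feature set.
docs_json, render_items = feature_scatterplot(
    'featureset.npz', ['amplitude', 'median'])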
Example #10
    def get(self, featureset_id=None, action=None):
        if action == 'download':
            featureset = Featureset.get_if_owned_by(featureset_id,
                                                    self.current_user)
            fset_path = featureset.file_uri
            fset, data = featurize.load_featureset(fset_path)
            if 'labels' in data:
                fset['labels'] = data['labels']
            self.set_header("Content-Type", 'text/csv; charset="utf-8"')
            self.set_header(
                "Content-Disposition", "attachment; "
                f"filename=cesium_featureset_{featureset.project.name}"
                f"_{featureset.name}_{featureset.finished}.csv")
            self.write(fset.to_csv(index=True))
        else:
            if featureset_id is not None:
                featureset_info = Featureset.get_if_owned_by(featureset_id,
                                                             self.current_user)
            else:
                featureset_info = [f for p in self.current_user.projects
                                   for f in p.featuresets]
                featureset_info.sort(key=lambda f: f.created_at, reverse=True)

            self.success(featureset_info)
Example #11
def featurize_data(pbmap, pbnames, lcdata, metadata, nobjects, featurefile):
    """
    ***Feature extractor  for PLaSTiCC***

    Extracts features from data by some Cesium library functions.
    Builds a timeseries dictionary and for each time series extracts
    features. Features described in file: feature_sets.
    
    Created on Mon Apr 29 19:30:52 2019
    
    @author: luisarribas
    
    """
    print("")
    print("EXTRACTING FEATURES")
    print("===================")
    print("")
    print("Building Timeseries....wait")
    print("===========================")
    #**********************BUILD TIME SERIES**********************************
    tsdict = OrderedDict()
    for i in range(nobjects):
        row = metadata[i]
        thisid = row['object_id']
        target = row['target']

        meta = {'zBand': row['zBand'],
                'z': row['hostgal_photoz'],
                'zerr': row['hostgal_photoz_err'],
                'mag': row['magnitude'],
                'u-b': row['u-b'],
                'b-v': row['b-v']}

        ind = (lcdata['object_id'] == thisid)
        thislc = lcdata[ind]

        pbind = [(thislc['passband'] == pb) for pb in pbmap]
        t = [thislc['mjd'][mask].data for mask in pbind]
        m = [thislc['flux'][mask].data for mask in pbind]
        e = [thislc['flux_err'][mask].data for mask in pbind]

        tsdict[thisid] = TimeSeries(t=t, m=m, e=e,
                                    label=target, name=thisid,
                                    meta_features=meta,
                                    channel_names=pbnames)

    print("")
    print("OK!")
    print(" ")

    #***********************FEATURE EXTRACTION WITH CESIUM********************
    warnings.simplefilter('ignore')
    if os.path.exists(featurefile):
        print("")
        print("Loading features from file....wait")
        print("==================================")
        featuretable, _ = featurize.load_featureset(featurefile)
        print("")
        print("OK!")
        print(" ")
    else:
        features_list = []
        print("")
        print("Computing features....wait")
        print("==========================")

        with schwimmbad.MultiPool() as pool:
            results = pool.imap(worker, list(tsdict.values()))
            for res in results:
                features_list.append(res)

        featuretable = featurize.assemble_featureset(
            features_list=features_list, time_series=tsdict.values())
        featurize.impute_featureset(fset=featuretable,
                                    strategy='constant',
                                    value=0,
                                    max_value=18446744073709551000,
                                    inplace=True)
        featurize.save_featureset(fset=featuretable, path=featurefile)
        print("")
        print("OK!")
        print(" ")

    #*******Build Pandas dataframe output*************************************
    old_names = featuretable.columns.values
    new_names = ['{}_{}'.format(x, pbmap.get(y, 'meta')) for x, y in old_names]
    cols = [featuretable[col] for col in old_names]
    allfeats = Table(cols, names=new_names, masked=False)
    allfeats['target'] = metadata['target']
    allfeats = allfeats.to_pandas()
    # Replace NaNs with zeros; np.nan_to_num returns a plain ndarray,
    # so rebuild the Table with 'target' appended to the column names.
    allfeats = np.nan_to_num(allfeats)
    new_names.append('target')
    allfeats = Table(allfeats, names=new_names, masked=False)
    allfeats = allfeats.to_pandas()
    print("")
    print("Extracted features = ", len(allfeats.columns))
    print("==========================")
    print("")
    print("Nan Values detected = ", sum(len(allfeats) - allfeats.count()))
    print("==========================")

    return allfeats
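The returned frame bundles the features with the 'target' column, so it can feed straight into scikit-learn. A minimal sketch (the train/test split and classifier choice are assumptions, not part of the pipeline above):

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# `allfeats` as returned by featurize_data: features plus 'target'.
X = allfeats.drop(columns='target')
y = allfeats['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)
print("Held-out accuracy:", clf.score(X_test, y_test))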